diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index adf28df7..c41e59e5 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -1,12 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# # Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. # You may obtain a copy of the License at +# # http://www.apache.org/licenses/LICENSE-2.0 # -# See the License for the specific language governing permissions and limitations under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
-name: CodeQL Analysis +name: "CodeQL" on: push: @@ -17,12 +24,9 @@ on: - cron: '28 22 * * 1' jobs: - codeql-analysis: - name: CodeQL Analysis + analyze: + name: Analyze runs-on: ubuntu-22.04-64core - container: - image: nvidia/cuda:12.2.0-devel-ubuntu22.04 - options: --user root timeout-minutes: 360 permissions: actions: write @@ -35,73 +39,89 @@ jobs: language: [ 'c-cpp', 'javascript-typescript', 'python' ] steps: - - name: Set up Environment - run: | - apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nvidia-open \ - git git-lfs gcc-11 g++-11 ninja-build build-essential ccache libgtest-dev libgmock-dev \ - shellcheck curl doxygen python3 python3-pip python3-dev python3-distutils \ - texlive-latex-extra ghostscript graphviz rsync \ - && curl -L https://cmake.org/files/v3.20/cmake-3.20.1-linux-x86_64.tar.gz --output /tmp/cmake-3.20.1.tar.gz \ - && tar -xzf /tmp/cmake-3.20.1.tar.gz -C /tmp/ && cp -r /tmp/cmake-3.20.1-linux-x86_64/bin/ /usr/local/ \ - && cp -r /tmp/cmake-3.20.1-linux-x86_64/share/ /usr/local/ && cp -r /tmp/cmake-3.20.1-linux-x86_64/doc/ /usr/local/ \ - && rm -rf /tmp/cmake-3.20.1* - - - name: Checkout Repository - uses: actions/checkout@v4 - with: - lfs: true - submodules: recursive - - - name: Install Python Dependencies (C/C++) - if: matrix.language == 'c-cpp' - run: | - apt-get update -y && apt-get install -y --no-install-recommends \ - python3 python3-pip python3-dev python3-distutils doxygen \ - && rm -rf /var/lib/apt/lists/* \ - && python3 -m pip install sphinx-rtd-theme sphinx breathe recommonmark graphviz \ - && python3 -m pip install numpy==2.0.1 patchelf==0.17.2.1 \ - && python3 -m pip install cuda-python==12.2.0 \ - && python3 -m pip install -U sphinx - - - name: Initialize CodeQL - uses: github/codeql-action/init@v3 - with: - languages: ${{ matrix.language }} - queries: +security-and-quality - - - name: Autobuild (Non C/C++) - if: matrix.language != 'c-cpp' - uses: 
github/codeql-action/autobuild@v3 - - - name: Build CMake Project (C/C++) - if: matrix.language == 'c-cpp' - run: | - echo "Running CMake project build script" - ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF -DBUILD_PYTHON=ON" - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v3 - with: - category: "/language:${{ matrix.language }}" - - - name: Build and Clean Documentation (C/C++, Push Event) - if: matrix.language == 'c-cpp' && github.event_name == 'push' - run: | - ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF -DBUILD_DOCS=ON -DBUILD_PYTHON=ON -DPYTHON_VERSIONS=3.10" - find build/docs/sphinx -name '*.doctree' -delete - find build/docs/sphinx -name '*.map' -delete - find build/docs/sphinx -name '*.pickle' -delete - find build/docs/sphinx -name '*.inv' -delete - find build/docs/sphinx -name '*.gz' -delete - - - name: Create .nojekyll File (C/C++, Push Event) - if: matrix.language == 'c-cpp' && github.event_name == 'push' - run: touch build/docs/sphinx/.nojekyll - - - name: Deploy to GitHub Pages (C/C++, Push Event) - if: matrix.language == 'c-cpp' && github.event_name == 'push' - uses: JamesIves/github-pages-deploy-action@v4 - with: - folder: build/docs/sphinx - branch: gh-pages - clean: true + - name: Checkout repository + uses: actions/checkout@v4 + with: + lfs: true + submodules: 'recursive' + + - if: matrix.language == 'c-cpp' + name: Setup environment + run: | + sudo apt update -y && sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y && \ + sudo apt update -y && sudo apt install -y --no-install-recommends \ + git git-lfs gcc-11 g++-11 ninja-build ccache libgtest-dev libgmock-dev \ + shellcheck curl doxygen python3 python3-pip python3-dev python3-distutils \ + texlive-latex-extra ghostscript graphviz \ + && curl -L https://cmake.org/files/v3.20/cmake-3.20.1-linux-x86_64.tar.gz --output /tmp/cmake-3.20.1.tar.gz \ + && tar -xzf /tmp/cmake-3.20.1.tar.gz -C /tmp/ && sudo cp -r 
/tmp/cmake-3.20.1-linux-x86_64/bin/ /usr/local/ \ + && sudo cp -r /tmp/cmake-3.20.1-linux-x86_64/share/ /usr/local/ && sudo cp -r /tmp/cmake-3.20.1-linux-x86_64/doc/ /usr/local/ \ + && rm -rf /tmp/cmake-3.20.1* + + - if: matrix.language == 'c-cpp' + name: Install Python Dependencies + run: | + sudo apt update -y && sudo apt install -y --no-install-recommends \ + python3 python3-pip python3-dev python3-distutils doxygen && sudo rm -rf /var/lib/apt/lists/* \ + && python3 -m pip install sphinx-rtd-theme sphinx breathe recommonmark graphviz \ + && python3 -m pip install numpy==2.0.1 patchelf==0.17.2.1 + + - if: matrix.language == 'c-cpp' + name: Install CUDA Toolkit + uses: Jimver/cuda-toolkit@v0.2.16 + id: cuda-toolkit + with: + cuda: '12.2.0' + linux-local-args: '["--toolkit"]' + + - if: matrix.language == 'c-cpp' + name: Verify CUDA installation + run: | + echo "Installed CUDA version is: ${{ steps.cuda-toolkit.outputs.cuda }}" + echo "CUDA install location: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}" + sudo ln -s ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/lib64/libcudart.so \ + /usr/lib/x86_64-linux-gnu/libcuda.so + nvcc -V + + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + queries: +security-and-quality + + - if: matrix.language != 'c-cpp' + name: Autobuild + uses: github/codeql-action/autobuild@v3 + + - if: matrix.language == 'c-cpp' + name: Build CMake project + run: | + echo "Running CMake project build script" + ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF -DBUILD_PYTHON=ON" $* + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" + + - if: matrix.language == 'c-cpp' && github.event_name == 'push' + name: Build Docs and Clean up Sphinx Build Directory + run: | + ./ci/build.sh debug build "-DBUILD_SAMPLES=OFF -DBUILD_TESTS=OFF -DBUILD_DOCS=ON -DBUILD_PYTHON=ON -DPYTHON_VERSIONS=3.10" $* + find 
build/docs/sphinx -name '*.doctree' -delete + find build/docs/sphinx -name '*.map' -delete + find build/docs/sphinx -name '*.pickle' -delete + find build/docs/sphinx -name '*.inv' -delete + find build/docs/sphinx -name '*.gz' -delete + + - if: matrix.language == 'c-cpp' && github.event_name == 'push' + name: Create .nojekyll file + run: touch build/docs/sphinx/.nojekyll + + - if: matrix.language == 'c-cpp' && github.event_name == 'push' + name: Deploy to GitHub Pages + uses: JamesIves/github-pages-deploy-action@v4 + with: + folder: build/docs/sphinx + branch: gh-pages + clean: true diff --git a/CMakeLists.txt b/CMakeLists.txt index 13378bf1..4c2f48cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ endif() project(cvcuda LANGUAGES C CXX - VERSION 0.11.0 + VERSION 0.12.0 DESCRIPTION "CUDA-accelerated Computer Vision algorithms" ) diff --git a/README.md b/README.md index 07f84fec..ddb93741 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ [![License](https://img.shields.io/badge/License-Apache_2.0-yellogreen.svg)](https://opensource.org/licenses/Apache-2.0) -![Version](https://img.shields.io/badge/Version-v0.11.0--beta-blue) +![Version](https://img.shields.io/badge/Version-v0.12.0--beta-blue) ![Platform](https://img.shields.io/badge/Platform-linux--64_%7C_win--64_wsl2%7C_aarch64-gray) @@ -61,27 +61,6 @@ To get a local copy up and running follow these steps. - Only one CUDA version (CUDA 11.x or CUDA 12.x) of CV-CUDA packages (Debian packages, tarballs, Python Wheels) can be installed at a time. Please uninstall all packages from a given CUDA version before installing packages from a different version. - Documentation built on Ubuntu 20.04 needs an up-to-date version of sphinx (`pip install --upgrade sphinx`) as well as explicitly parsing the system's default python version ` ./ci/build_docs path/to/build -DPYTHON_VERSIONS=""`. 
- The Resize and RandomResizedCrop operators incorrectly interpolate pixel values near the boundary of an image or tensor when using cubic interpolation. This will be fixed in an upcoming release. -- The CvtColor operator incorrectly computes the data location of the second chromaticity channel for conversions that involve YUV(420) semi-planar formats. This issue persists through the current release and we intend to address this bug in CV-CUDA v0.12. We do not recommend using these formats.​ - - Known affected formats:​ - - NVCV_COLOR_YUV2RGB_I420​ - - NVCV_COLOR_RGB2YUV_I420​ - - NVCV_COLOR_YUV2BGR_I420​ - - NVCV_COLOR_BGR2YUV_I420​ - - NVCV_COLOR_YUV2RGBA_I420​ - - NVCV_COLOR_RGBA2YUV_I420​ - - NVCV_COLOR_YUV2BGRA_I420​ - - NVCV_COLOR_BGRA2YUV_I420​ - - NVCV_COLOR_RGB2YUV_I420​ - - NVCV_COLOR_YUV2RGB_YV12​ - - NVCV_COLOR_RGB2YUV_YV12​ - - NVCV_COLOR_YUV2BGR_YV12​ - - NVCV_COLOR_BGR2YUV_YV12​ - - NVCV_COLOR_YUV2RGBA_YV12​ - - NVCV_COLOR_RGBA2YUV_YV12​ - - NVCV_COLOR_YUV2BGRA_YV12​ - - NVCV_COLOR_BGRA2YUV_YV12​ - - NVCV_COLOR_RGB2YUV_YV12​ - - NVCV_COLOR_YUV2GRAY_420​ ### Installation diff --git a/bench/BenchCvtColor.cpp b/bench/BenchCvtColor.cpp index 7d6c5c39..de110a61 100644 --- a/bench/BenchCvtColor.cpp +++ b/bench/BenchCvtColor.cpp @@ -21,63 +21,163 @@ #include -template -inline void CvtColor(nvbench::state &state, nvbench::type_list) +#include +#include +#include + +using ConvCodeToFormat = std::tuple; +using CodeMap = std::map; + +inline static ConvCodeToFormat str2Frmt(const std::string &str) +{ + // clang-format off + static const CodeMap codeMap { + { "RGB2BGR", {NVCV_COLOR_RGB2BGR, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGR8 }}, + { "RGB2RGBA", {NVCV_COLOR_RGB2RGBA, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBA8}}, + { "RGBA2RGB", {NVCV_COLOR_RGBA2RGB, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_RGB8 }}, + { "RGB2GRAY", {NVCV_COLOR_RGB2GRAY, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_Y8 }}, + { "GRAY2RGB", {NVCV_COLOR_GRAY2RGB, NVCV_IMAGE_FORMAT_Y8, 
NVCV_IMAGE_FORMAT_RGB8 }}, + { "RGB2HSV", {NVCV_COLOR_RGB2HSV, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8 }}, + { "HSV2RGB", {NVCV_COLOR_HSV2RGB, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_RGB8 }}, + { "RGB2YUV", {NVCV_COLOR_RGB2YUV, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_YUV8 }}, + { "YUV2RGB", {NVCV_COLOR_YUV2RGB, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_RGB8 }}, + {"RGB2YUV_NV12", {NVCV_COLOR_RGB2YUV_NV12, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV12 }}, + {"YUV2RGB_NV12", {NVCV_COLOR_YUV2RGB_NV12, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_RGB8 }}, + }; + // clang-format on + + if (auto it = codeMap.find(str); it != codeMap.end()) + { + return it->second; + } + else + { + throw std::invalid_argument("Unrecognized color code"); + } +} + +template +inline float bytesPerPixel(NVCVImageFormat imgFormat) +{ +#define BPP_CASE(frmt, bytes) \ + case frmt: \ + return bytes * sizeof(BT) + + switch (imgFormat) + { + BPP_CASE(NVCV_IMAGE_FORMAT_RGB8, 3); + BPP_CASE(NVCV_IMAGE_FORMAT_BGR8, 3); + BPP_CASE(NVCV_IMAGE_FORMAT_HSV8, 3); + BPP_CASE(NVCV_IMAGE_FORMAT_RGBA8, 4); + BPP_CASE(NVCV_IMAGE_FORMAT_YUV8, 3); + BPP_CASE(NVCV_IMAGE_FORMAT_NV12, 1.5f); + BPP_CASE(NVCV_IMAGE_FORMAT_Y8, 1); + default: + throw std::invalid_argument("Unrecognized format"); + } +#undef BPP_CASE +} + +// Adapted from src/util/TensorDataUtils.hpp +inline static nvcv::Tensor CreateTensor(int numImages, int imgWidth, int imgHeight, const nvcv::ImageFormat &imgFormat) +{ + if (imgFormat == NVCV_IMAGE_FORMAT_NV12 || imgFormat == NVCV_IMAGE_FORMAT_NV12_ER + || imgFormat == NVCV_IMAGE_FORMAT_NV21 || imgFormat == NVCV_IMAGE_FORMAT_NV21_ER) + { + if (imgHeight % 2 != 0 || imgWidth % 2 != 0) + { + throw std::invalid_argument("Invalid height or width: 420 formats require even dimensions"); + } + + int height420 = (imgHeight * 3) / 2; + + return nvcv::Tensor(numImages, {imgWidth, height420}, nvcv::ImageFormat(NVCV_IMAGE_FORMAT_Y8)); + } + else + { + return nvcv::Tensor(numImages, {imgWidth, imgHeight}, imgFormat); + } +} + +template +inline void 
CvtColor(nvbench::state &state, nvbench::type_list) try { long3 shape = benchutils::GetShape<3>(state.get_string("shape")); long varShape = state.get_int64("varShape"); - using BT = typename nvcv::cuda::BaseType; + ConvCodeToFormat formats = str2Frmt(state.get_string("code")); - int ch = nvcv::cuda::NumElements; + NVCVColorConversionCode code = std::get<0>(formats); + nvcv::ImageFormat inFormat{std::get<1>(formats)}; + nvcv::ImageFormat outFormat{std::get<2>(formats)}; - NVCVColorConversionCode code = ch == 3 ? NVCV_COLOR_BGR2RGB : NVCV_COLOR_BGRA2RGBA; - - state.add_global_memory_reads(shape.x * shape.y * shape.z * sizeof(T)); - state.add_global_memory_writes(shape.x * shape.y * shape.z * sizeof(T)); + state.add_global_memory_reads(shape.x * shape.y * shape.z * bytesPerPixel(inFormat)); + state.add_global_memory_writes(shape.x * shape.y * shape.z * bytesPerPixel(outFormat)); cvcuda::CvtColor op; - // clang-format off - if (varShape < 0) // negative var shape means use Tensor { - nvcv::Tensor src({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); - nvcv::Tensor dst({{shape.x, shape.y, shape.z, ch}, "NHWC"}, benchutils::GetDataType()); + nvcv::Tensor src = CreateTensor(shape.x, shape.z, shape.y, inFormat); + nvcv::Tensor dst = CreateTensor(shape.x, shape.z, shape.y, outFormat); benchutils::FillTensor(src, benchutils::RandomValues()); - state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &code](nvbench::launch &launch) - { - op(launch.get_stream(), src, dst, code); - }); + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &code](nvbench::launch &launch) { op(launch.get_stream(), src, dst, code); }); } else // zero and positive var shape means use ImageBatchVarShape { - nvcv::ImageBatchVarShape src(shape.x); - nvcv::ImageBatchVarShape dst(shape.x); + if (inFormat.chromaSubsampling() != nvcv::ChromaSubsampling::CSS_444 + || outFormat.chromaSubsampling() != nvcv::ChromaSubsampling::CSS_444) + { + state.skip("Skipping formats that have 
subsampled planes for the varshape benchmark"); + } + + std::vector imgSrc; + std::vector imgDst; + nvcv::ImageBatchVarShape src(shape.x); + nvcv::ImageBatchVarShape dst(shape.x); + std::vector> srcVec(shape.x); - benchutils::FillImageBatch(src, long2{shape.z, shape.y}, long2{varShape, varShape}, - benchutils::RandomValues()); - dst.pushBack(src.begin(), src.end()); + auto randomValuesU8 = benchutils::RandomValues(); - state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &code](nvbench::launch &launch) + for (int i = 0; i < shape.x; i++) { - op(launch.get_stream(), src, dst, code); - }); + imgSrc.emplace_back(nvcv::Size2D{(int)shape.z, (int)shape.y}, inFormat); + imgDst.emplace_back(nvcv::Size2D{(int)shape.z, (int)shape.y}, outFormat); + + int srcRowStride = imgSrc[i].size().w * inFormat.planePixelStrideBytes(0); + int srcBufSize = imgSrc[i].size().h * srcRowStride; + srcVec[i].resize(srcBufSize); + for (int idx = 0; idx < srcBufSize; idx++) + { + srcVec[i][idx] = randomValuesU8(); + } + + auto imgData = imgSrc[i].exportData(); + CUDA_CHECK_ERROR(cudaMemcpy2D(imgData->plane(0).basePtr, imgData->plane(0).rowStride, srcVec[i].data(), + srcRowStride, srcRowStride, imgSrc[i].size().h, cudaMemcpyHostToDevice)); + } + src.pushBack(imgSrc.begin(), imgSrc.end()); + dst.pushBack(imgDst.begin(), imgDst.end()); + + state.exec(nvbench::exec_tag::sync, + [&op, &src, &dst, &code](nvbench::launch &launch) { op(launch.get_stream(), src, dst, code); }); } } + catch (const std::exception &err) { state.skip(err.what()); } -// clang-format on - -using CvtColorTypes = nvbench::type_list; +using BaseTypes = nvbench::type_list; -NVBENCH_BENCH_TYPES(CvtColor, NVBENCH_TYPE_AXES(CvtColorTypes)) - .set_type_axes_names({"InOutDataType"}) - .add_string_axis("shape", {"1x1080x1920"}) +NVBENCH_BENCH_TYPES(CvtColor, NVBENCH_TYPE_AXES(BaseTypes)) + .set_type_axes_names({"BaseType"}) + .add_string_axis("shape", {"1x1080x1920", "64x720x1280"}) + .add_string_axis("code", {"RGB2BGR", "RGB2RGBA", 
"RGBA2RGB", "RGB2GRAY", "GRAY2RGB", "RGB2HSV", "HSV2RGB", + "RGB2YUV", "YUV2RGB", "RGB2YUV_NV12", "YUV2RGB_NV12"}) .add_int64_axis("varShape", {-1, 0}); diff --git a/cmake/ConfigCUDA.cmake b/cmake/ConfigCUDA.cmake index 56bf632f..319a157e 100644 --- a/cmake/ConfigCUDA.cmake +++ b/cmake/ConfigCUDA.cmake @@ -32,6 +32,9 @@ set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD}) # Compress kernels to generate smaller executables set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=--compress-all") +# Enable device lambdas +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda") + if(NOT USE_CMAKE_CUDA_ARCHITECTURES) set(CMAKE_CUDA_ARCHITECTURES "$ENV{CUDAARCHS}") diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py index 15672135..9ab82fd6 100644 --- a/docs/sphinx/conf.py +++ b/docs/sphinx/conf.py @@ -43,10 +43,6 @@ lib_path = os.getenv("SPHINX_PYTHON_SRC", default=".") sys.path.insert(0, os.path.abspath(lib_path)) -# -- Module mocking ---------------------------------------------------------- - -autodoc_mock_imports = ['nvcv', 'cvcuda'] - # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index 66f03f3f..69d555b1 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -123,7 +123,9 @@ Copyright :maxdepth: 1 :hidden: + v0.12.0-beta v0.11.0-beta + v0.10.1-beta v0.10.0-beta v0.9.0-beta v0.8.0-beta diff --git a/docs/sphinx/relnotes/v0.10.1-beta.rst b/docs/sphinx/relnotes/v0.10.1-beta.rst index 2e03b5b6..a03c4166 100644 --- a/docs/sphinx/relnotes/v0.10.1-beta.rst +++ b/docs/sphinx/relnotes/v0.10.1-beta.rst @@ -1,44 +1,44 @@ -.. - # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - # SPDX-License-Identifier: Apache-2.0 - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. 
- # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - -.. _v0.10.1-beta: - -v0.10.1-beta -============ - -Release Highlights ------------------- - -CV-CUDA v0.10.1 reverts the OpCvtColor performance improvements introduced in v0.10.0 due to discovered bugs. -These optimizations will be reintroduced, with consolidated testing, in a future release. - -License -------- - -CV-CUDA is licensed under the `Apache 2.0 `_ license. - -Resources ---------- - -1. `CV-CUDA GitHub `_ -2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_ -3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_ -4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_ - -Acknowledgements ----------------- - -CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. +.. + # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. 
+ +.. _v0.10.1-beta: + +v0.10.1-beta +============ + +Release Highlights +------------------ + +CV-CUDA v0.10.1 reverts the OpCvtColor performance improvements introduced in v0.10.0 due to discovered bugs. +These optimizations will be reintroduced, with consolidated testing, in a future release. + +License +------- + +CV-CUDA is licensed under the `Apache 2.0 `_ license. + +Resources +--------- + +1. `CV-CUDA GitHub `_ +2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_ +3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_ +4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_ + +Acknowledgements +---------------- + +CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. diff --git a/docs/sphinx/relnotes/v0.11.0-beta.rst b/docs/sphinx/relnotes/v0.11.0-beta.rst index 9f50e975..2957e40e 100644 --- a/docs/sphinx/relnotes/v0.11.0-beta.rst +++ b/docs/sphinx/relnotes/v0.11.0-beta.rst @@ -38,28 +38,9 @@ Compatibility and Known Limitations * **Pre-existing limitations**: - * The CvtColor operator incorrectly computes the data location of the second chromaticity channel for conversions that involve YUV(420) semi-planar formats. This issue persists through the current release and we intend to address this bug in CV-CUDA v0.12. 
We do not recommend using these formats.​ - - * Known affected formats:​ - * NVCV_COLOR_YUV2RGB_I420​ - * NVCV_COLOR_RGB2YUV_I420​ - * NVCV_COLOR_YUV2BGR_I420​ - * NVCV_COLOR_BGR2YUV_I420​ - * NVCV_COLOR_YUV2RGBA_I420​ - * NVCV_COLOR_RGBA2YUV_I420​ - * NVCV_COLOR_YUV2BGRA_I420​ - * NVCV_COLOR_BGRA2YUV_I420​ - * NVCV_COLOR_RGB2YUV_I420​ - * NVCV_COLOR_YUV2RGB_YV12​ - * NVCV_COLOR_RGB2YUV_YV12​ - * NVCV_COLOR_YUV2BGR_YV12​ - * NVCV_COLOR_BGR2YUV_YV12​ - * NVCV_COLOR_YUV2RGBA_YV12​ - * NVCV_COLOR_RGBA2YUV_YV12​ - * NVCV_COLOR_YUV2BGRA_YV12​ - * NVCV_COLOR_BGRA2YUV_YV12​ - * NVCV_COLOR_RGB2YUV_YV12​ - * NVCV_COLOR_YUV2GRAY_420​ + * We note a bug in the YUV(420) color conversion API (NVCV_COLOR_RGB2YUV_I420) which incorrectly computes the U and V plane index​ + + * This persists through this release and we intend to address this bug in CV-CUDA v0.12.0​ For the full list, see main README on `CV-CUDA GitHub `_. diff --git a/docs/sphinx/relnotes/v0.12.0-beta.rst b/docs/sphinx/relnotes/v0.12.0-beta.rst new file mode 100644 index 00000000..2fe84ca5 --- /dev/null +++ b/docs/sphinx/relnotes/v0.12.0-beta.rst @@ -0,0 +1,60 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +.. 
_v0.12.0-beta: + +v0.12.0-beta +============ + +Release Highlights +------------------ + +CV-CUDA v0.12.0 includes critical bug fixes alongside the following changes:​ + +* **New Features**:​ + + * Increased functional test coverage of color conversions. ​ + * Reintroduced from 24.07: Improved performance of color conversion operators (e.g., 2x faster RGB2YUV). + +* **Bug Fixes**:​ + + * Fixed bug in YUV(420) conversions: The CvtColor operator incorrectly computed the data location of the second chromaticity channel for conversions.​ + * Fixed bug in YUV(422) conversions: The CvtColor operator incorrectly interpreted the interleaved YUV(422) data layout as a three-channel tensor.​ + * Prevent CV_16F alpha addition: some color conversions in the CvtColor operator allowed for the addition of an alpha channel to the destination tensor, which is undefined for the CV_16F data type. + + +Compatibility and Known Limitations +----------------------------------- + +For the full list, see main README on `CV-CUDA GitHub `_. + +License +------- + +CV-CUDA is licensed under the `Apache 2.0 `_ license. + +Resources +--------- + +1. `CV-CUDA GitHub `_ +2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_ +3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_ +4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_ + +Acknowledgements +---------------- + +CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. 
diff --git a/samples/scripts/run_samples.sh b/samples/scripts/run_samples.sh index e207fd30..f7f17b5b 100755 --- a/samples/scripts/run_samples.sh +++ b/samples/scripts/run_samples.sh @@ -136,4 +136,3 @@ python3 $SAMPLES_DIR/label/python/main.py -o "$LABEL_RUN_DIR" # Run it with batch size 1 on a single image LABEL_RUN_DIR=$(create_output_dir "$LABEL_OUT_DIR") python3 $SAMPLES_DIR/label/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$LABEL_RUN_DIR" - diff --git a/src/cvcuda/include/cvcuda/cuda_tools/math/LinAlg.hpp b/src/cvcuda/include/cvcuda/cuda_tools/math/LinAlg.hpp index 2a7d8bd3..c413a468 100644 --- a/src/cvcuda/include/cvcuda/cuda_tools/math/LinAlg.hpp +++ b/src/cvcuda/include/cvcuda/cuda_tools/math/LinAlg.hpp @@ -220,7 +220,11 @@ class Vector } // On-purpose public data to allow POD-class direct initialization. +#ifdef __CUDA_ARCH__ T m_data[N]; +#else + T m_data[N] = {}; +#endif }; /** diff --git a/src/cvcuda/priv/legacy/cvt_color.cu b/src/cvcuda/priv/legacy/cvt_color.cu index 332ff865..d202ac54 100644 --- a/src/cvcuda/priv/legacy/cvt_color.cu +++ b/src/cvcuda/priv/legacy/cvt_color.cu @@ -27,9 +27,13 @@ #include -static constexpr float B2YF = 0.114f; -static constexpr float G2YF = 0.587f; -static constexpr float R2YF = 0.299f; +// NOTE: Below are the "standard" (NTSC and ITU Rec.601) RGB to luma conversion +// coefficients. More accurate coefficients, given as comments on the right, are +// found at http://www.brucelindbloom.com/index.html?WorkingSpaceInfo.html and +// https://www.imagemagick.org/include/api/pixel.php. 
+static constexpr float R2YF = 0.299f; // 0.298839 +static constexpr float G2YF = 0.587f; // 0.586811 +static constexpr float B2YF = 0.114f; // 0.114350 static constexpr int gray_shift = 15; static constexpr int yuv_shift = 14; @@ -43,8 +47,8 @@ static constexpr int B2Y = 1868; // == B2YF*16384 static constexpr int R2VI = 14369; // == R2VF*16384 static constexpr int B2UI = 8061; // == B2UF*16384 -static constexpr float B2UF = 0.492f; -static constexpr float R2VF = 0.877f; +static constexpr float B2UF = 0.492f; // 0.492111: U = (B - Y) * B2UF + 0.5 +static constexpr float R2VF = 0.877f; // 0.877283: V = (R - Y) * R2VF + 0.5 static constexpr int U2BI = 33292; static constexpr int U2GI = -6472; @@ -77,354 +81,541 @@ static constexpr int ITUR_BT_601_CBV = -74448; #define BLOCK 32 +#define DEVICE_INLINE __device__ __forceinline__ +#define GLOBAL_BOUNDS __global__ __launch_bounds__(Policy::BlockSize) + +template> +constexpr BT Alpha = std::is_floating_point_v ? 1 : nvcv::cuda::TypeTraits::max; + namespace nvcv::legacy::cuda_op { -template -__global__ void rgb_to_bgr_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int sch, int dch, int bidx) +template +using TensorWrap3D = nvcv::cuda::Tensor3DWrap; + +template +using TensorWrap4D = nvcv::cuda::Tensor4DWrap; + +template +struct CvtKernelPolicy { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; + static_assert(BlockWidth_ % 32 == 0); + static constexpr int BlockWidth = BlockWidth_; + static constexpr int BlockHeight = BlockHeight_; + static constexpr int BlockSize = BlockWidth * BlockHeight; + static constexpr int RowsPerThread = RowsPerThread_; + static constexpr int TileWidth = BlockWidth; + static constexpr int TileHeight = BlockHeight * RowsPerThread; + static constexpr int ThreadRowStride = BlockHeight; +}; + +template +DEVICE_INLINE void color_conversion_common(LoadOpT load_op, ConvOpT conv_op, 
StoreOpT store_op, int2 size) +{ + const int x = blockIdx.x * Policy::TileWidth + threadIdx.x; + const int y0 = blockIdx.y * Policy::TileHeight + threadIdx.y; const int batch_idx = get_batch_idx(); + if (x >= size.x) + { + return; + } - T b = *src.ptr(batch_idx, dst_y, dst_x, bidx); - T g = *src.ptr(batch_idx, dst_y, dst_x, 1); - T r = *src.ptr(batch_idx, dst_y, dst_x, bidx ^ 2); - - *dst.ptr(batch_idx, dst_y, dst_x, 0) = b; - *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; - *dst.ptr(batch_idx, dst_y, dst_x, 2) = r; + // Branchless efficient path for inner blocks. + if (y0 + Policy::TileHeight <= size.y) + { + EltT r_in[Policy::RowsPerThread][N_IN]; + EltT r_out[Policy::RowsPerThread][N_OUT]; - if (dch == 4) +#pragma unroll + for (int i = 0; i < Policy::RowsPerThread; i++) + { + const int y = y0 + Policy::ThreadRowStride * i; + load_op(r_in[i], batch_idx, x, y); + } +#pragma unroll + for (int i = 0; i < Policy::RowsPerThread; i++) conv_op(r_in[i], r_out[i]); +#pragma unroll + for (int i = 0; i < Policy::RowsPerThread; i++) + { + const int y = y0 + Policy::ThreadRowStride * i; + store_op(r_out[i], batch_idx, x, y); + } + } + else { - T al = sch == 4 ? 
*src.ptr(batch_idx, dst_y, dst_x, 3) : cuda::TypeTraits::max; - *dst.ptr(batch_idx, dst_y, dst_x, 3) = al; + int y = y0; + for (int i = 0; i < Policy::RowsPerThread && y < size.y; i++) + { + EltT r_in[N_IN]; + EltT r_out[N_OUT]; + + load_op(r_in, batch_idx, x, y); + conv_op(r_in, r_out); + store_op(r_out, batch_idx, x, y); + + y += Policy::ThreadRowStride; + } } } -template -__global__ void gray_to_bgr_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int dch) +template +DEVICE_INLINE void load3_nhwc(const TensorWrap3D &src, EltT &C0, EltT &C1, EltT &C2, int batch_idx, + int x, int y) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); + SrcT vec = *src.ptr(batch_idx, y, x); + C0 = vec.x; + C1 = vec.y; + C2 = vec.z; +} - T g = *src.ptr(batch_idx, dst_y, dst_x, 0); +template +DEVICE_INLINE void store3_nhwc(const TensorWrap3D &dst, EltT C0, EltT C1, EltT C2, int batch_idx, int x, + int y) +{ + DstT vec; + vec.x = C0; + vec.y = C1; + vec.z = C2; + *dst.ptr(batch_idx, y, x) = vec; +} - *dst.ptr(batch_idx, dst_y, dst_x, 0) = g; - *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; - *dst.ptr(batch_idx, dst_y, dst_x, 2) = g; - if (dch == 4) +template +DEVICE_INLINE void load_bgra_nhwc(const TensorWrap3D &src, EltT &B, EltT &G, EltT &R, EltT &A, + int batch_idx, int x, int y, int bidx) +{ + SrcT vec = *src.ptr(batch_idx, y, x); + B = bidx == 0 ? vec.x : vec.z; + G = vec.y; + R = bidx == 0 ? 
vec.z : vec.x; + if constexpr (nvcv::cuda::NumComponents == 4) { - *dst.ptr(batch_idx, dst_y, dst_x, 3) = g; + A = vec.w; + } + else + { + A = Alpha; } } -template -__global__ void bgr_to_gray_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx) +template +DEVICE_INLINE void store_bgra_nhwc(const TensorWrap3D &dst, EltT B, EltT G, EltT R, EltT A, + int batch_idx, int x, int y, int bidx) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - int b = *src.ptr(batch_idx, dst_y, dst_x, bidx); - int g = *src.ptr(batch_idx, dst_y, dst_x, 1); - int r = *src.ptr(batch_idx, dst_y, dst_x, bidx ^ 2); + DstT vec; + vec.x = bidx == 0 ? B : R; + vec.y = G; + vec.z = bidx == 0 ? R : B; + if constexpr (nvcv::cuda::NumComponents == 4) + { + vec.w = A; + } + *dst.ptr(batch_idx, y, x) = vec; +} - T gray = (T)CV_DESCALE(b * BY15 + g * GY15 + r * RY15, gray_shift); - *dst.ptr(batch_idx, dst_y, dst_x, 0) = gray; +template +GLOBAL_BOUNDS void rgb_to_bgr_nhwc(const TensorWrap3D src, const TensorWrap3D dst, + int2 dstSize, int bidx) +{ + using EltT = nvcv::cuda::BaseType; + color_conversion_common( + [&src, bidx] __device__(EltT(&r_in)[4], int batch_idx, int x, int y) + { load_bgra_nhwc(src, r_in[0], r_in[1], r_in[2], r_in[3], batch_idx, x, y, bidx); }, + [] __device__(const EltT(&r_in)[4], EltT(&r_out)[4]) + { +#pragma unroll + for (int i = 0; i < 4; i++) r_out[i] = r_in[i]; + }, + [&dst] __device__(const EltT(&r_out)[4], int batch_idx, int x, int y) + { store_bgra_nhwc(dst, r_out[0], r_out[1], r_out[2], r_out[3], batch_idx, x, y, 0); }, + dstSize); } -template -__global__ void bgr_to_gray_float_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx) +template +GLOBAL_BOUNDS void gray_to_bgr_nhwc(const TensorWrap3D src, const TensorWrap3D dst, + int2 dstSize) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int 
dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - T b = *src.ptr(batch_idx, dst_y, dst_x, bidx); - T g = *src.ptr(batch_idx, dst_y, dst_x, 1); - T r = *src.ptr(batch_idx, dst_y, dst_x, bidx ^ 2); + using EltT = nvcv::cuda::BaseType; + color_conversion_common( + [&src] __device__(EltT(&r_gray)[1], int batch_idx, int x, int y) { r_gray[0] = *src.ptr(batch_idx, y, x); }, + [] __device__(const EltT(&r_gray)[1], EltT(&r_BGRA)[4]) + { +#pragma unroll + for (int i = 0; i < 4; i++) r_BGRA[i] = r_gray[0]; + }, + [&dst] __device__(const EltT(&r_BGRA)[4], int batch_idx, int x, int y) + { store_bgra_nhwc(dst, r_BGRA[0], r_BGRA[1], r_BGRA[2], r_BGRA[3], batch_idx, x, y, 0); }, + dstSize); +} - T gray = (T)(b * B2YF + g * G2YF + r * R2YF); - *dst.ptr(batch_idx, dst_y, dst_x, 0) = gray; +template +GLOBAL_BOUNDS void bgr_to_gray_nhwc(const TensorWrap3D src, const TensorWrap3D dst, + int2 dstSize, int bidx) +{ + using EltT = nvcv::cuda::BaseType; + color_conversion_common( + [&src, bidx] __device__(EltT(&r_BGR)[3], int batch_idx, int x, int y) + { + EltT A; + load_bgra_nhwc(src, r_BGR[0], r_BGR[1], r_BGR[2], A, batch_idx, x, y, bidx); + }, + [] __device__(const EltT(&r_BGR)[3], EltT(&r_gray)[1]) + { + if constexpr (std::is_integral_v) + r_gray[0] + = (EltT)CV_DESCALE((int)r_BGR[0] * BY15 + (int)r_BGR[1] * GY15 + (int)r_BGR[2] * RY15, gray_shift); + else + r_gray[0] = (EltT)(r_BGR[0] * B2YF + r_BGR[1] * G2YF + r_BGR[2] * R2YF); + }, + [&dst] __device__(const EltT(&r_gray)[1], int batch_idx, int x, int y) + { *dst.ptr(batch_idx, y, x) = r_gray[0]; }, + dstSize); } -template -__global__ void bgr_to_yuv_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx) +template +DEVICE_INLINE void bgr_to_yuv_int(T B_, T G_, T R_, T &Y_, T &Cb_, T &Cr_) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || 
dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - int B = *src.ptr(batch_idx, dst_y, dst_x, bidx); - int G = *src.ptr(batch_idx, dst_y, dst_x, 1); - int R = *src.ptr(batch_idx, dst_y, dst_x, bidx ^ 2); - - int C0 = R2Y, C1 = G2Y, C2 = B2Y, C3 = R2VI, C4 = B2UI; - int delta = ((T)(cuda::TypeTraits::max / 2 + 1)) * (1 << yuv_shift); - int Y = CV_DESCALE(R * C0 + G * C1 + B * C2, yuv_shift); - int Cr = CV_DESCALE((R - Y) * C3 + delta, yuv_shift); - int Cb = CV_DESCALE((B - Y) * C4 + delta, yuv_shift); - - *dst.ptr(batch_idx, dst_y, dst_x, 0) = cuda::SaturateCast(Y); - *dst.ptr(batch_idx, dst_y, dst_x, 1) = cuda::SaturateCast(Cb); - *dst.ptr(batch_idx, dst_y, dst_x, 2) = cuda::SaturateCast(Cr); + constexpr int C0 = R2Y, C1 = G2Y, C2 = B2Y, C3 = R2VI, C4 = B2UI; + constexpr int delta = ((T)(cuda::TypeTraits::max / 2 + 1)) << yuv_shift; + + const int B = B_, G = G_, R = R_; + + const int Y = CV_DESCALE(R * C0 + G * C1 + B * C2, yuv_shift); + const int Cr = CV_DESCALE((R - Y) * C3 + delta, yuv_shift); + const int Cb = CV_DESCALE((B - Y) * C4 + delta, yuv_shift); + + Y_ = cuda::SaturateCast(Y); + Cb_ = cuda::SaturateCast(Cb); + Cr_ = cuda::SaturateCast(Cr); } -template -__global__ void bgr_to_yuv_float_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx) +DEVICE_INLINE void bgr_to_yuv_float(float B, float G, float R, float &Y, float &Cb, float &Cr) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - T B = *src.ptr(batch_idx, dst_y, dst_x, bidx); - T G = *src.ptr(batch_idx, dst_y, dst_x, 1); - T R = *src.ptr(batch_idx, dst_y, dst_x, bidx ^ 2); - - T C0 = R2YF, C1 = G2YF, C2 = B2YF, C3 = R2VF, C4 = B2UF; - T delta = 0.5f; - T Y = R * C0 + G * C1 + B * C2; - T Cr = (R - Y) * C3 + delta; - T Cb = (B - Y) * C4 + delta; - *dst.ptr(batch_idx, dst_y, dst_x, 0) = Y; - *dst.ptr(batch_idx, dst_y, 
dst_x, 1) = Cb; - *dst.ptr(batch_idx, dst_y, dst_x, 2) = Cr; + constexpr float C0 = R2YF, C1 = G2YF, C2 = B2YF, C3 = R2VF, C4 = B2UF; + constexpr float delta = 0.5f; + + Y = R * C0 + G * C1 + B * C2; + Cr = (R - Y) * C3 + delta; + Cb = (B - Y) * C4 + delta; } -template -__global__ void yuv_to_bgr_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx) +template +GLOBAL_BOUNDS void bgr_to_yuv_nhwc(const TensorWrap3D src, const TensorWrap3D dst, + int2 dstSize, int bidx) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - T Y = *src.ptr(batch_idx, dst_y, dst_x, 0); - T Cb = *src.ptr(batch_idx, dst_y, dst_x, 1); - T Cr = *src.ptr(batch_idx, dst_y, dst_x, 2); - - int C0 = V2RI, C1 = V2GI, C2 = U2GI, C3 = U2BI; - int delta = ((T)(cuda::TypeTraits::max / 2 + 1)); - int b = Y + CV_DESCALE((Cb - delta) * C3, yuv_shift); - int g = Y + CV_DESCALE((Cb - delta) * C2 + (Cr - delta) * C1, yuv_shift); - int r = Y + CV_DESCALE((Cr - delta) * C0, yuv_shift); - - *dst.ptr(batch_idx, dst_y, dst_x, bidx) = cuda::SaturateCast(b); - *dst.ptr(batch_idx, dst_y, dst_x, 1) = cuda::SaturateCast(g); - *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = cuda::SaturateCast(r); + using EltT = nvcv::cuda::BaseType; + color_conversion_common( + [&src, bidx] __device__(EltT(&r_BGR)[3], int batch_idx, int x, int y) + { + EltT A; + load_bgra_nhwc(src, r_BGR[0], r_BGR[1], r_BGR[2], A, batch_idx, x, y, bidx); + }, + [] __device__(const EltT(&r_BGR)[3], EltT(&r_YCbCr)[3]) + { + if constexpr (std::is_integral_v) + bgr_to_yuv_int(r_BGR[0], r_BGR[1], r_BGR[2], r_YCbCr[0], r_YCbCr[1], r_YCbCr[2]); + else + bgr_to_yuv_float(r_BGR[0], r_BGR[1], r_BGR[2], r_YCbCr[0], r_YCbCr[1], r_YCbCr[2]); + }, + [&dst] __device__(const EltT(&r_YCbCr)[3], int batch_idx, int x, int y) + { store3_nhwc(dst, r_YCbCr[0], r_YCbCr[1], r_YCbCr[2], batch_idx, x, y); }, + 
dstSize); } -template -__global__ void yuv_to_bgr_float_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx) +template +DEVICE_INLINE void yuv_to_bgr_int(T Y_, T Cb_, T Cr_, T &B_, T &G_, T &R_) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - T Y = *src.ptr(batch_idx, dst_y, dst_x, 0); - T Cb = *src.ptr(batch_idx, dst_y, dst_x, 1); - T Cr = *src.ptr(batch_idx, dst_y, dst_x, 2); + constexpr int C0 = V2RI, C1 = V2GI, C2 = U2GI, C3 = U2BI; + constexpr int delta = ((T)(cuda::TypeTraits::max / 2 + 1)); - T C0 = V2RF, C1 = V2GF, C2 = U2GF, C3 = U2BF; - T delta = 0.5f; - T b = Y + (Cb - delta) * C3; - T g = Y + (Cb - delta) * C2 + (Cr - delta) * C1; - T r = Y + (Cr - delta) * C0; + const int Y = Y_, Cb = Cb_, Cr = Cr_; + const int B = Y + CV_DESCALE((Cb - delta) * C3, yuv_shift); + const int G = Y + CV_DESCALE((Cb - delta) * C2 + (Cr - delta) * C1, yuv_shift); + const int R = Y + CV_DESCALE((Cr - delta) * C0, yuv_shift); - *dst.ptr(batch_idx, dst_y, dst_x, bidx) = b; - *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; - *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = r; + B_ = cuda::SaturateCast(B); + G_ = cuda::SaturateCast(G); + R_ = cuda::SaturateCast(R); } -template -__global__ void bgr_to_hsv_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx, bool isFullRange) +DEVICE_INLINE void yuv_to_bgr_flt(float Y, float Cb, float Cr, float &B, float &G, float &R) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); + constexpr float C0 = V2RF, C1 = V2GF, C2 = U2GF, C3 = U2BF; + constexpr float delta = 0.5f; - int b = *src.ptr(batch_idx, dst_y, dst_x, bidx); - int g = *src.ptr(batch_idx, dst_y, dst_x, 1); - int r = *src.ptr(batch_idx, dst_y, dst_x, bidx ^ 
2); - int hrange = isFullRange ? 256 : 180; - int hr = hrange; - const int hsv_shift = 12; - int h, s, v = b; - int vmin = b; - int vr, vg; - - v = cuda::max(v, g); - v = cuda::max(v, r); - vmin = cuda::min(vmin, g); - vmin = cuda::min(vmin, r); - - unsigned char diff = cuda::SaturateCast(v - vmin); - vr = v == r ? -1 : 0; - vg = v == g ? -1 : 0; - - int hdiv_table = diff == 0 ? 0 : cuda::SaturateCast((hrange << hsv_shift) / (6. * diff)); - int sdiv_table = v == 0 ? 0 : cuda::SaturateCast((255 << hsv_shift) / (1. * v)); - s = (diff * sdiv_table + (1 << (hsv_shift - 1))) >> hsv_shift; - h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); - h = (h * hdiv_table + (1 << (hsv_shift - 1))) >> hsv_shift; - h += h < 0 ? hr : 0; - - *dst.ptr(batch_idx, dst_y, dst_x, 0) = cuda::SaturateCast(h); - *dst.ptr(batch_idx, dst_y, dst_x, 1) = (unsigned char)s; - *dst.ptr(batch_idx, dst_y, dst_x, 2) = (unsigned char)v; + B = Y + (Cb - delta) * C3; + G = Y + (Cb - delta) * C2 + (Cr - delta) * C1; + R = Y + (Cr - delta) * C0; } -template -__global__ void bgr_to_hsv_float_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx) +template +GLOBAL_BOUNDS void yuv_to_bgr_nhwc(const TensorWrap3D src, const TensorWrap3D dst, + int2 dstSize, int bidx) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); + using EltT = nvcv::cuda::BaseType; + color_conversion_common( + [&src] __device__(EltT(&r_YCbCr)[3], int batch_idx, int x, int y) + { load3_nhwc(src, r_YCbCr[0], r_YCbCr[1], r_YCbCr[2], batch_idx, x, y); }, + [] __device__(const EltT(&r_YCbCr)[3], EltT(&r_BGR)[3]) + { + if constexpr (std::is_integral_v) + yuv_to_bgr_int(r_YCbCr[0], r_YCbCr[1], r_YCbCr[2], r_BGR[0], r_BGR[1], r_BGR[2]); + else + yuv_to_bgr_flt(r_YCbCr[0], r_YCbCr[1], r_YCbCr[2], r_BGR[0], r_BGR[1], r_BGR[2]); + }, + [&dst, bidx] 
__device__(const EltT(&r_BGR)[3], int batch_idx, int x, int y) + { store_bgra_nhwc(dst, r_BGR[0], r_BGR[1], r_BGR[2], Alpha, batch_idx, x, y, bidx); }, + dstSize); +} - float b = *src.ptr(batch_idx, dst_y, dst_x, bidx); - float g = *src.ptr(batch_idx, dst_y, dst_x, 1); - float r = *src.ptr(batch_idx, dst_y, dst_x, bidx ^ 2); - float h, s, v; - float hrange = 360.0; - float hscale = hrange * (1.f / 360.f); - - float vmin, diff; - - v = vmin = r; - if (v < g) - v = g; - if (v < b) - v = b; - if (vmin > g) - vmin = g; - if (vmin > b) - vmin = b; - - diff = v - vmin; - s = diff / (float)(fabs(v) + FLT_EPSILON); - diff = (float)(60. / (diff + FLT_EPSILON)); - if (v == r) - h = (g - b) * diff; - else if (v == g) - h = (b - r) * diff + 120.f; - else - h = (r - g) * diff + 240.f; +DEVICE_INLINE void bgr_to_hsv_uchar(uchar b8, uchar g8, uchar r8, uchar &h8, uchar &s8, uchar &v8, bool isFullRange) +{ + const int hrange = isFullRange ? 256 : 180; + const int hsv_shift = 12; - if (h < 0) - h += 360.f; + const int b = (int)b8; + const int g = (int)g8; + const int r = (int)r8; - *dst.ptr(batch_idx, dst_y, dst_x, 0) = h * hscale; - *dst.ptr(batch_idx, dst_y, dst_x, 1) = s; - *dst.ptr(batch_idx, dst_y, dst_x, 2) = v; + const int vmin = cuda::min(b, cuda::min(g, r)); + const int v = cuda::max(b, cuda::max(g, r)); + + const int diff = v - vmin; + const int vr = v == r ? -1 : 0; + const int vg = v == g ? -1 : 0; + + const int hdiv_table = diff == 0 ? 0 : cuda::SaturateCast((hrange << hsv_shift) / (6.f * diff)); + const int sdiv_table = v == 0 ? 0 : cuda::SaturateCast((255 << hsv_shift) / (float)v); + + const int s = (diff * sdiv_table + (1 << (hsv_shift - 1))) >> hsv_shift; + int h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); + + h = (h * hdiv_table + (1 << (hsv_shift - 1))) >> hsv_shift; + h += h < 0 ? 
hrange : 0; + + h8 = cuda::SaturateCast(h); + s8 = (uint8_t)s; + v8 = (uint8_t)v; +} + +DEVICE_INLINE void bgr_to_hsv_float(float b, float g, float r, float &h, float &s, float &v) +{ + float vmin = cuda::min(r, cuda::min(g, b)); + v = cuda::max(r, cuda::max(g, b)); + float diff = v - vmin; + s = diff / (fabs(v) + FLT_EPSILON); + diff = 60.f / (diff + FLT_EPSILON); + + // clang-format off + if (v == r) h = (g - b) * diff; + else if (v == g) h = (b - r) * diff + 120.f; + else h = (r - g) * diff + 240.f; + + if (h < 0.f) h += 360.f; + // clang-format on +} + +template +GLOBAL_BOUNDS void bgr_to_hsv_nhwc(const TensorWrap3D src, const TensorWrap3D dst, + int2 dstSize, int bidx, bool isFullRange) +{ + using EltT = nvcv::cuda::BaseType; + color_conversion_common( + [&src, bidx] __device__(EltT(&r_BGR)[3], int batch_idx, int x, int y) + { + EltT A; + load_bgra_nhwc(src, r_BGR[0], r_BGR[1], r_BGR[2], A, batch_idx, x, y, bidx); + }, + [isFullRange] __device__(const EltT(&r_BGR)[3], EltT(&r_HSV)[3]) + { + if constexpr (std::is_integral_v) + bgr_to_hsv_uchar(r_BGR[0], r_BGR[1], r_BGR[2], r_HSV[0], r_HSV[1], r_HSV[2], isFullRange); + else + bgr_to_hsv_float(r_BGR[0], r_BGR[1], r_BGR[2], r_HSV[0], r_HSV[1], r_HSV[2]); + }, + [&dst] __device__(const EltT(&r_HSV)[3], int batch_idx, int x, int y) + { store3_nhwc(dst, r_HSV[0], r_HSV[1], r_HSV[2], batch_idx, x, y); }, + dstSize); } -__device__ inline void HSV2RGB_native(float h, float s, float v, float &b, float &g, float &r, const float hscale) +template +DEVICE_INLINE T select4_reg(const T (&tab)[4], int idx) +{ + // Random access in a register array of size 4, with 6 instructions. + // The compiler was generating 10 instructions for tab[idx]. + T out; + out = idx == 1 ? tab[1] : tab[0]; + out = idx == 2 ? tab[2] : out; + out = idx == 3 ? 
tab[3] : out; + return out; +} + +DEVICE_INLINE void hsv_to_bgr_float(float h, float s, float v, float &b, float &g, float &r) { if (s == 0) b = g = r = v; else { - static const int sector_data[][3] = { - {1, 3, 0}, - {1, 0, 2}, - {3, 0, 1}, - {0, 2, 1}, - {0, 1, 3}, - {2, 1, 0} - }; - float tab[4]; - int sector; - h *= hscale; - h = fmod(h, 6.f); - sector = (int)floor(h); - h -= sector; - if ((unsigned)sector >= 6u) - { - sector = 0; - h = 0.f; - } + h += 6 * (h < 0); + int idx = static_cast(h); // Sector index. + h -= idx; // Fractional part of h. + idx = (idx % 6) << 2; // Shift index for sector LUT. + + // clang-format off + const float tab[4] {v, + v * (1 - s), + v * (1 - s * h), + v * (1 - s * (1 - h))}; + // clang-format on - tab[0] = v; - tab[1] = v * (1.f - s); - tab[2] = v * (1.f - s * h); - tab[3] = v * (1.f - s * (1.f - h)); + constexpr int32_t idx_lutb = 0x00200311; + constexpr int32_t idx_lutg = 0x00112003; + constexpr int32_t idx_lutr = 0x00031120; - b = tab[sector_data[sector][0]]; - g = tab[sector_data[sector][1]]; - r = tab[sector_data[sector][2]]; + b = select4_reg(tab, (idx_lutb >> idx) & 0xf); + g = select4_reg(tab, (idx_lutg >> idx) & 0xf); + r = select4_reg(tab, (idx_lutr >> idx) & 0xf); } } -template -__global__ void hsv_to_bgr_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx, int dcn, bool isFullRange) +template +GLOBAL_BOUNDS void hsv_to_bgr_nhwc(const TensorWrap3D src, const TensorWrap3D dst, + int2 dstSize, int bidx, bool isFullRange) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); + using EltT = nvcv::cuda::BaseType; + color_conversion_common( + [&src] __device__(EltT(&r_HSV)[3], int batch_idx, int x, int y) + { load3_nhwc(src, r_HSV[0], r_HSV[1], r_HSV[2], batch_idx, x, y); }, + [isFullRange] __device__(const EltT(&r_HSV)[3], EltT(&r_BGR)[3]) + { + if constexpr 
(std::is_same_v) + { + const float scaleH = isFullRange ? (6.0f / 256.0f) : (6.0f / 180.0f); + constexpr float scaleSV = 1.0f / 255.0f; + + float Bf, Gf, Rf; + + hsv_to_bgr_float((float)r_HSV[0] * scaleH, r_HSV[1] * scaleSV, r_HSV[2] * scaleSV, Bf, Gf, Rf); + + r_BGR[0] = cuda::SaturateCast(Bf * 255.0f); + r_BGR[1] = cuda::SaturateCast(Gf * 255.0f); + r_BGR[2] = cuda::SaturateCast(Rf * 255.0f); + } + else + { + constexpr float scaleH = 6.0f / 360.0f; + + hsv_to_bgr_float(r_HSV[0] * scaleH, r_HSV[1], r_HSV[2], r_BGR[0], r_BGR[1], r_BGR[2]); + } + }, + [&dst, bidx] __device__(const EltT(&r_BGR)[3], int batch_idx, int x, int y) + { store_bgra_nhwc(dst, r_BGR[0], r_BGR[1], r_BGR[2], Alpha, batch_idx, x, y, bidx); }, + dstSize); +} + +template +DEVICE_INLINE void load_yuv420(const nvcv::cuda::Tensor4DWrap &src, EltT &Y, EltT &U, EltT &V, + int2 size, int batch_idx, int x, int y, int uidx) +{ + if constexpr (IsSemiPlanar) + { + // U and V are subsampled at half the full resolution (in both x and y), combined (i.e., interleaved), and + // arranged as full rows after the full resolution Y data. Example memory layout for 4 x 4 image (NV12): + // Y_00 Y_01 Y_02 Y_03 + // Y_10 Y_11 Y_12 Y_13 + // Y_20 Y_21 Y_22 Y_23 + // Y_30 Y_31 Y_32 Y_33 + // U_00 V_00 U_02 V_02 + // U_20 V_20 U_22 V_22 + // Each U and V value corresponds to a 2x2 block of Y values--e.g. U_00 and V_00 correspond to Y_00, Y_01, Y_10, + // and Y_11. Each full U-V row represents 2 rows of Y values. Some layouts (e.g., NV21) swap the location + // of the U and V values in each U-V pair (indicated by the uidx parameter). + + const int uv_y = size.y + y / 2; // The interleaved U-V semi-plane is 1/2 the height of the Y data. + const int uv_x = (x & ~1); // Convert x to even # (set lowest bit to 0). + + Y = *src.ptr(batch_idx, y, x); // Y (luma) is at full resolution. 
+ U = *src.ptr(batch_idx, uv_y, uv_x + uidx); // Some formats swap the U and V elements (as indicated + V = *src.ptr(batch_idx, uv_y, uv_x + (uidx ^ 1)); // by the uidx parameter). + } + else + { + // U and V are subsampled at half the full resolution (in both x and y) and arranged as non-interleaved planes + // (i.e., planar format). Each subsampled U and V "plane" is arranged as full rows after the full resolution Y + // data--so two consecutive subsampled U or V rows are combined into one row spanning the same width as the Y + // plane. Example memory layout for 4 x 4 image (e.g. I420): + // Y_00 Y_01 Y_02 Y_03 + // Y_10 Y_11 Y_12 Y_13 + // Y_20 Y_21 Y_22 Y_23 + // Y_30 Y_31 Y_32 Y_33 + // U_00 U_02 U_20 U_22 + // V_00 V_02 V_20 V_22 + // Each U and V value corresponds to a 2x2 block of Y values--e.g. U_00 and V_00 correspond to Y_00, Y_01, Y_10, + // and Y_11. Each full U and V row represents 4 rows of Y values. Some layouts (e.g., YV12) swap the location + // of the U and V planes (indicated by the uidx parameter). + + const int by = size.y + y / 4; // Base row coordinate for U and V: subsampled plane is 1/4 the height. + const int h4 = size.y / 4; // Height (# of rows) of each subsampled U and V plane. + + // Compute x position that combines two subsampled rows into one. + const int uv_x = (x / 2) + ((size.x / 2) & -((y / 2) & 1)); // Second half of row for odd y coordinates. + + Y = *src.ptr(batch_idx, y, x); // Y (luma) is at full resolution. + U = *src.ptr(batch_idx, by + h4 * uidx, uv_x); // Some formats swap the U and V "planes" (as indicated + V = *src.ptr(batch_idx, by + h4 * (uidx ^ 1), uv_x); // by the uidx parameter). 
+ } +} - float h = *src.ptr(batch_idx, dst_y, dst_x, 0); - float s = *src.ptr(batch_idx, dst_y, dst_x, 1) * (1.0f / 255.0f); - float v = *src.ptr(batch_idx, dst_y, dst_x, 2) * (1.0f / 255.0f); +template +DEVICE_INLINE void store_yuv420(const TensorWrap4D &dst, EltT Y, EltT U, EltT V, int2 size, + int batch_idx, int x, int y, int uidx) +{ + if constexpr (IsSemiPlanar) + { + // See YUV420 semi-planar layout commments in load_yuv420 above. + *dst.ptr(batch_idx, y, x) = Y; // Y (luma) is at full resolution. + if (y % 2 == 0 && x % 2 == 0) + { + const int uv_y = size.y + y / 2; // The interleaved U-V semi-plane is 1/2 the height of the Y data. + const int uv_x = (x & ~1); // Convert x to even # (set lowest bit to 0). - float hrange = isFullRange ? 255 : 180; - unsigned char alpha = cuda::TypeTraits::max; - float hs = 6.f / hrange; + *dst.ptr(batch_idx, uv_y, uv_x + uidx) = U; // Some formats swap the U and V elements (as indicated + *dst.ptr(batch_idx, uv_y, uv_x + (uidx ^ 1)) = V; // by the uidx parameter). + } + } + else + { + // See YUV420 planar layout commments in load_yuv420 above. + *dst.ptr(batch_idx, y, x, 0) = Y; // Y (luma) is at full resolution. + if (y % 2 == 0 && x % 2 == 0) + { + const int by = size.y + y / 4; // Base row coordinate for U and V: subsampled plane is 1/4 the height. + const int h4 = size.y / 4; // Height (# of rows) of each subsampled U and V plane. - float b, g, r; - HSV2RGB_native(h, s, v, b, g, r, hs); + // Compute x position that combines two subsampled rows into one. + const int uv_x = (x / 2) + ((size.x / 2) & -((y / 2) & 1)); // Second half of row for odd y coordinates. 
- *dst.ptr(batch_idx, dst_y, dst_x, bidx) = cuda::SaturateCast(b * 255.0f); - *dst.ptr(batch_idx, dst_y, dst_x, 1) = cuda::SaturateCast(g * 255.0f); - *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = cuda::SaturateCast(r * 255.0f); - if (dcn == 4) - *dst.ptr(batch_idx, dst_y, dst_x, 3) = alpha; + *dst.ptr(batch_idx, by + h4 * uidx, uv_x) = U; // Some formats swap the U and V "planes" (as indicated + *dst.ptr(batch_idx, by + h4 * (uidx ^ 1), uv_x) = V; // by the uidx parameter). + } + } } -template -__global__ void hsv_to_bgr_float_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int bidx, int dcn) +DEVICE_INLINE void bgr_to_yuv42xxp(const uchar &b, const uchar &g, const uchar &r, uchar &Y, uchar &U, uchar &V) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - - float h = *src.ptr(batch_idx, dst_y, dst_x, 0); - float s = *src.ptr(batch_idx, dst_y, dst_x, 1); - float v = *src.ptr(batch_idx, dst_y, dst_x, 2); + const int shifted16 = (16 << ITUR_BT_601_SHIFT); + const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); + int yy = ITUR_BT_601_CRY * r + ITUR_BT_601_CGY * g + ITUR_BT_601_CBY * b + halfShift + shifted16; - float hrange = 360.0; - float alpha = 1.f; - float hs = 6.f / hrange; + Y = cuda::SaturateCast(yy >> ITUR_BT_601_SHIFT); - float b, g, r; - HSV2RGB_native(h, s, v, b, g, r, hs); + const int shifted128 = (128 << ITUR_BT_601_SHIFT); + int uu = ITUR_BT_601_CRU * r + ITUR_BT_601_CGU * g + ITUR_BT_601_CBU * b + halfShift + shifted128; + int vv = ITUR_BT_601_CBU * r + ITUR_BT_601_CGV * g + ITUR_BT_601_CBV * b + halfShift + shifted128; - *dst.ptr(batch_idx, dst_y, dst_x, bidx) = b; - *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; - *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = r; - if (dcn == 4) - *dst.ptr(batch_idx, dst_y, dst_x, 3) = alpha; + U = cuda::SaturateCast(uu >> ITUR_BT_601_SHIFT); + V = 
cuda::SaturateCast(vv >> ITUR_BT_601_SHIFT); } -__device__ __forceinline__ void yuv42xxp_to_bgr_kernel(const int &Y, const int &U, const int &V, uchar &r, uchar &g, - uchar &b) +DEVICE_INLINE void yuv42xxp_to_bgr(const int &Y, const int &U, const int &V, uchar &b, uchar &g, uchar &r) { //R = 1.164(Y - 16) + 1.596(V - 128) //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) @@ -446,181 +637,139 @@ __device__ __forceinline__ void yuv42xxp_to_bgr_kernel(const int &Y, const int & b = cuda::SaturateCast(CV_DESCALE((yy + C4 * uu), yuv4xx_shift)); } -__device__ __forceinline__ void bgr_to_yuv42xxp_kernel(const uchar &r, const uchar &g, const uchar &b, uchar &Y, - uchar &U, uchar &V) +template +GLOBAL_BOUNDS void bgr_to_yuv420_char_nhwc(const TensorWrap3D src, + const TensorWrap4D dst, int2 size, int bidx, int uidx) { - const int shifted16 = (16 << ITUR_BT_601_SHIFT); - const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); - int yy = ITUR_BT_601_CRY * r + ITUR_BT_601_CGY * g + ITUR_BT_601_CBY * b + halfShift + shifted16; - - Y = cuda::SaturateCast(yy >> ITUR_BT_601_SHIFT); - - const int shifted128 = (128 << ITUR_BT_601_SHIFT); - int uu = ITUR_BT_601_CRU * r + ITUR_BT_601_CGU * g + ITUR_BT_601_CBU * b + halfShift + shifted128; - int vv = ITUR_BT_601_CBU * r + ITUR_BT_601_CGV * g + ITUR_BT_601_CBV * b + halfShift + shifted128; - - U = cuda::SaturateCast(uu >> ITUR_BT_601_SHIFT); - V = cuda::SaturateCast(vv >> ITUR_BT_601_SHIFT); + static_assert(std::is_same_v, EltT>); + color_conversion_common( + [&src, bidx] __device__(EltT(&r_BGR)[3], int batch_idx, int x, int y) + { + EltT A; + load_bgra_nhwc(src, r_BGR[0], r_BGR[1], r_BGR[2], A, batch_idx, x, y, bidx); + }, + [] __device__(const EltT(&r_BGR)[3], EltT(&r_YUV)[3]) + { bgr_to_yuv42xxp(r_BGR[0], r_BGR[1], r_BGR[2], r_YUV[0], r_YUV[1], r_YUV[2]); }, + [&dst, uidx, size] __device__(const EltT(&r_YUV)[3], int batch_idx, int x, int y) + { store_yuv420(dst, r_YUV[0], r_YUV[1], r_YUV[2], size, batch_idx, x, y, uidx); }, + size); 
} -template -__global__ void bgr_to_yuv420p_char_nhwc(SrcWrapper src, DstWrapper dst, int2 srcSize, int scn, int bidx, int uidx) +template +GLOBAL_BOUNDS void yuv420_to_bgr_char_nhwc(const TensorWrap4D src, + const TensorWrap3D dst, int2 size, int bidx, int uidx) { - int src_x = blockIdx.x * blockDim.x + threadIdx.x; - int src_y = blockIdx.y * blockDim.y + threadIdx.y; - if (src_x >= srcSize.x || src_y >= srcSize.y) - return; - const int batch_idx = get_batch_idx(); - int plane_y_step = srcSize.y * srcSize.x; - int plane_uv_step = plane_y_step / 4; - int uv_x = (src_y % 4 < 2) ? src_x / 2 : (src_x / 2 + srcSize.x / 2); - - uchar b = static_cast(*src.ptr(batch_idx, src_y, src_x, bidx)); - uchar g = static_cast(*src.ptr(batch_idx, src_y, src_x, 1)); - uchar r = static_cast(*src.ptr(batch_idx, src_y, src_x, bidx ^ 2)); - // Ignore gray channel if input is RGBA - - uchar Y{0}, U{0}, V{0}; - bgr_to_yuv42xxp_kernel(r, g, b, Y, U, V); - - *dst.ptr(batch_idx, src_y, src_x, 0) = Y; - if (src_y % 2 == 0 && src_x % 2 == 0) - { - *dst.ptr(batch_idx, srcSize.y + src_y / 4, uv_x + plane_uv_step * uidx) = U; - *dst.ptr(batch_idx, srcSize.y + src_y / 4, uv_x + plane_uv_step * (1 - uidx)) = V; - } + static_assert(std::is_same_v, EltT>); + color_conversion_common( + [&src, uidx, size] __device__(EltT(&r_YUV)[3], int batch_idx, int x, int y) + { load_yuv420(src, r_YUV[0], r_YUV[1], r_YUV[2], size, batch_idx, x, y, uidx); }, + [] __device__(const EltT(&r_YUV)[3], EltT(&r_BGR)[3]) + { + yuv42xxp_to_bgr(static_cast(r_YUV[0]), static_cast(r_YUV[1]), static_cast(r_YUV[2]), + r_BGR[0], r_BGR[1], r_BGR[2]); + }, + [&dst, bidx] __device__(const EltT(&r_BGR)[3], int batch_idx, int x, int y) + { store_bgra_nhwc(dst, r_BGR[0], r_BGR[1], r_BGR[2], Alpha, batch_idx, x, y, bidx); }, + size); } +// YUV 422 interleaved formats (e.g., YUYV, YVYU, and UYVY) group 2 pixels into groups of 4 elements. Each group of two +// pixels has two distinct luma (Y) values, one for each pixel. 
The chromaticity values (U and V) are subsampled by a +// factor of two so that there is only one U and one V value for each group of 2 pixels. Example memory layout for +// 4 x 4 image (UYVY format): +// U_00 Y_00 V_00 Y_01 U_02 Y_02 V_02 Y_03 +// U_10 Y_10 V_10 Y_11 U_12 Y_12 V_12 Y_13 +// U_20 Y_20 V_20 Y_21 U_22 Y_22 V_22 Y_23 +// U_30 Y_30 V_30 Y_31 U_32 Y_32 V_32 Y_33 +// Each U and V value corresponds to two Y values--e.g. U_00 and V_00 correspond to Y_00 and Y_10 while U_12 and V_12 +// correspond to Y_12 and Y_13. Thus, a given Y value, Y_rc = Y(r,c) (where r is the row, or y coordinate, and c is the +// column, or x coordinate), corresponds to U(r,c') and V(r,c') where c' is the even column coordinate <= c -- that is, +// c' = 2 * floor(c/2) = (c & ~1). Some layouts swap the positions of the chromaticity and luma values (e.g., YUYV) +// (indicated by the yidx parameter) and / or swap the the positions of the U and V chromaticity valus (e.g., YVYU) +// (indicated by the uidx parameter). +// The data layout is treated as a single channel tensor, so each group of 4 values corresponds to two pixels. As such, +// the tensor width is twice the actual pixel width. Thus, it's easiest to process 4 consecutive values (2 pixels) per +// thread. template -__global__ void bgr_to_yuv420sp_char_nhwc(SrcWrapper src, DstWrapper dst, int2 srcSize, int scn, int bidx, int uidx) +__global__ void yuv422_to_bgr_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int dcn, int bidx, int yidx, + int uidx) { - int src_x = blockIdx.x * blockDim.x + threadIdx.x; - int src_y = blockIdx.y * blockDim.y + threadIdx.y; - if (src_x >= srcSize.x || src_y >= srcSize.y) + using T = typename DstWrapper::ValueType; + + int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + if (dst_y >= dstSize.y) return; - const int batch_idx = get_batch_idx(); - int uv_x = (src_x % 2 == 0) ? 
src_x : (src_x - 1); - uchar b = static_cast(*src.ptr(batch_idx, src_y, src_x, bidx)); - uchar g = static_cast(*src.ptr(batch_idx, src_y, src_x, 1)); - uchar r = static_cast(*src.ptr(batch_idx, src_y, src_x, bidx ^ 2)); - // Ignore gray channel if input is RGBA + int dst_x = 2 * (blockIdx.x * blockDim.x + threadIdx.x); // Process 2 destination pixels/thread. + if (dst_x >= dstSize.x) + return; - uchar Y{0}, U{0}, V{0}; - bgr_to_yuv42xxp_kernel(r, g, b, Y, U, V); + const int batch_idx = get_batch_idx(); - *dst.ptr(batch_idx, src_y, src_x, 0) = Y; - if (src_y % 2 == 0 && src_x % 2 == 0) - { - *dst.ptr(batch_idx, srcSize.y + src_y / 2, uv_x + uidx) = U; - *dst.ptr(batch_idx, srcSize.y + src_y / 2, uv_x + (1 - uidx)) = V; - } -} + const int src_x = 2 * dst_x; // Process 4 source elements/thread (i.e., 2 destination pixels). + const int uv_x = (src_x & ~3); // Compute "even" x coordinate for U and V (set lowest two bits to 0). -template -__global__ void yuv420sp_to_bgr_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int dcn, int bidx, int uidx) -{ - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - int uv_x = (dst_x % 2 == 0) ? 
dst_x : (dst_x - 1); + const T Y0 = *src.ptr(batch_idx, dst_y, src_x + yidx); + const T Y1 = *src.ptr(batch_idx, dst_y, src_x + yidx + 2); + const T U = *src.ptr(batch_idx, dst_y, uv_x + (yidx ^ 1) + uidx); + const T V = *src.ptr(batch_idx, dst_y, uv_x + (yidx ^ 1) + (uidx ^ 2)); - T Y = *src.ptr(batch_idx, dst_y, dst_x, 0); - T U = *src.ptr(batch_idx, dstSize.y + dst_y / 2, uv_x + uidx); - T V = *src.ptr(batch_idx, dstSize.y + dst_y / 2, uv_x + 1 - uidx); + T r{0}, g{0}, b{0}; - uchar r{0}, g{0}, b{0}, a{0xff}; - yuv42xxp_to_bgr_kernel(int(Y), int(U), int(V), r, g, b); + yuv42xxp_to_bgr(int(Y0), int(U), int(V), b, g, r); *dst.ptr(batch_idx, dst_y, dst_x, bidx) = b; *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = r; if (dcn == 4) - { - *dst.ptr(batch_idx, dst_y, dst_x, 3) = a; - } -} + *dst.ptr(batch_idx, dst_y, dst_x, 3) = Alpha; -template -__global__ void yuv420p_to_bgr_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int dcn, int bidx, int uidx) -{ - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - - const int batch_idx = get_batch_idx(); - int plane_y_step = dstSize.y * dstSize.x; - int plane_uv_step = plane_y_step / 4; - int uv_x = (dst_y % 4 < 2) ? dst_x / 2 : (dst_x / 2 + dstSize.x / 2); - - T Y = *src.ptr(batch_idx, dst_y, dst_x, 0); - T U = *src.ptr(batch_idx, dstSize.y + dst_y / 4, uv_x + plane_uv_step * uidx); - T V = *src.ptr(batch_idx, dstSize.y + dst_y / 4, uv_x + plane_uv_step * (1 - uidx)); - - uchar r{0}, g{0}, b{0}, a{0xff}; - yuv42xxp_to_bgr_kernel(int(Y), int(U), int(V), r, g, b); + dst_x++; // Move to next output pixel. 
+ yuv42xxp_to_bgr(int(Y1), int(U), int(V), b, g, r); *dst.ptr(batch_idx, dst_y, dst_x, bidx) = b; *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = r; if (dcn == 4) - { - *dst.ptr(batch_idx, dst_y, dst_x, 3) = a; - } + *dst.ptr(batch_idx, dst_y, dst_x, 3) = Alpha; } template -__global__ void yuv422_to_bgr_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int dcn, int bidx, int yidx, - int uidx) +__global__ void yuv422_to_gray_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int yidx) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) + if (dst_y >= dstSize.y) return; - const int batch_idx = get_batch_idx(); - int uv_x = (dst_x % 2 == 0) ? dst_x : dst_x - 1; - T Y = *src.ptr(batch_idx, dst_y, dst_x, yidx); - T U = *src.ptr(batch_idx, dst_y, uv_x, (1 - yidx) + uidx); - T V = *src.ptr(batch_idx, dst_y, uv_x, (1 - yidx) + uidx ^ 2); + int dst_x = 2 * (blockIdx.x * blockDim.x + threadIdx.x); // Process 2 destination pixels/thread. + if (dst_x >= dstSize.x) + return; - uchar r{0}, g{0}, b{0}, a{0xff}; - yuv42xxp_to_bgr_kernel(int(Y), int(U), int(V), r, g, b); + const int batch_idx = get_batch_idx(); - *dst.ptr(batch_idx, dst_y, dst_x, bidx) = b; - *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; - *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = r; - if (dcn == 4) - { - *dst.ptr(batch_idx, dst_y, dst_x, 3) = a; - } -} + const int src_x = 2 * dst_x; // Process 4 source elements/thread. 
-template -__global__ void yuv420_to_gray_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize) -{ - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - T Y = *src.ptr(batch_idx, dst_y, dst_x, 0); - *dst.ptr(batch_idx, dst_y, dst_x, 0) = Y; + *dst.ptr(batch_idx, dst_y, dst_x++) = *src.ptr(batch_idx, dst_y, src_x + yidx); + *dst.ptr(batch_idx, dst_y, dst_x) = *src.ptr(batch_idx, dst_y, src_x + yidx + 2); } -template -__global__ void yuv422_to_gray_char_nhwc(SrcWrapper src, DstWrapper dst, int2 dstSize, int yidx) +template +inline ErrorCode Launch_BGR_to_RGB(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + NVCVColorConversionCode code, cuda_op::DataShape shape, int bidx, + cudaStream_t stream) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; - if (dst_x >= dstSize.x || dst_y >= dstSize.y) - return; - const int batch_idx = get_batch_idx(); - T Y = *src.ptr(batch_idx, dst_y, dst_x, yidx); - *dst.ptr(batch_idx, dst_y, dst_x, 0) = Y; + using Policy = CvtKernelPolicy<32, 4, 4>; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(shape.W, Policy::TileWidth), divUp(shape.H, Policy::TileHeight), shape.N); + int2 dstSize{shape.W, shape.H}; + + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + rgb_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); + checkKernelErrors(); + + return ErrorCode::SUCCESS; } inline ErrorCode BGR_to_RGB(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, @@ -628,10 +777,7 @@ inline ErrorCode BGR_to_RGB(const TensorDataStridedCuda &inData, const TensorDat { int sch = (code == NVCV_COLOR_BGRA2BGR || code == NVCV_COLOR_RGBA2BGR || code == NVCV_COLOR_BGRA2RGBA) ? 
4 : 3; int dch = (code == NVCV_COLOR_BGR2BGRA || code == NVCV_COLOR_BGR2RGBA || code == NVCV_COLOR_BGRA2RGBA) ? 4 : 3; - int bidx = (code == NVCV_COLOR_BGR2RGB || code == NVCV_COLOR_RGBA2BGR || code == NVCV_COLOR_BGRA2RGBA - || code == NVCV_COLOR_BGR2RGBA) - ? 2 - : 0; + int bidx = (code != NVCV_COLOR_BGRA2BGR && code != NVCV_COLOR_BGR2BGRA) ? 2 : 0; auto inAccess = TensorDataAccessStridedImagePlanar::Create(inData); NVCV_ASSERT(inAccess); @@ -647,72 +793,82 @@ inline ErrorCode BGR_to_RGB(const TensorDataStridedCuda &inData, const TensorDat if (inputShape.C != sch) { - LOG_ERROR("Invalid input channel number " << inputShape.C << " expecting: " << sch); + LOG_ERROR("Invalid input channel number " << inputShape.C << " -- expecting " << sch); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (outputShape.C != dch) + { + LOG_ERROR("Invalid output channel number " << outputShape.C << " -- expecting " << dch); return ErrorCode::INVALID_DATA_SHAPE; } if (outDataType != inDataType) { - LOG_ERROR("Unsupported input/output DataType " << inDataType << "/" << outDataType); + LOG_ERROR("Mismatched input / output DataTypes " << inDataType << " / " << outDataType); return ErrorCode::INVALID_DATA_TYPE; } - if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N - || outputShape.C != dch) + if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N) { - LOG_ERROR("Invalid output shape " << outputShape); + LOG_ERROR("Shape mismatch -- output tensor shape " << outputShape << " doesn't match input tensor shape " + << inputShape); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (outDataType == kCV_16F && sch < 4 && dch == 4) + { + LOG_ERROR("Adding alpha to the output is not supported for " << outDataType); return ErrorCode::INVALID_DATA_SHAPE; } - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); +#define 
CVCUDA_BGR2RGB_IF(SCH, DCH, SRC_T, DST_T) \ + if (sch == SCH && dch == DCH) \ + return Launch_BGR_to_RGB(inData, outData, code, inputShape, bidx, stream) - int2 dstSize{outputShape.W, outputShape.H}; +#define CVCUDA_BGR2RGB_CASE(T3, T4) \ + CVCUDA_BGR2RGB_IF(3, 3, T3, T3); \ + else CVCUDA_BGR2RGB_IF(3, 4, T3, T4); \ + else CVCUDA_BGR2RGB_IF(4, 3, T4, T3); \ + else CVCUDA_BGR2RGB_IF(4, 4, T4, T4); \ + else return ErrorCode::INVALID_DATA_SHAPE switch (inDataType) { case kCV_8U: case kCV_8S: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - rgb_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, sch, dch, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2RGB_CASE(uchar3, uchar4); + case kCV_16F: // Not properly handled when adding alpha to the destination. case kCV_16U: - case kCV_16F: case kCV_16S: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - rgb_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, sch, dch, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2RGB_CASE(ushort3, ushort4); case kCV_32S: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - rgb_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, sch, dch, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2RGB_CASE(int3, int4); case kCV_32F: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - rgb_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, sch, dch, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2RGB_CASE(float3, float4); case kCV_64F: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - rgb_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, sch, dch, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2RGB_CASE(double3, double4); + default: + LOG_ERROR("Unsupported DataType " << inDataType); + return 
ErrorCode::INVALID_DATA_TYPE; } +#undef CVCUDA_BGR2RGB_CASE +#undef CVCUDA_BGR2RGB_IF + return ErrorCode::SUCCESS; +} + +template +inline ErrorCode Launch_GRAY_to_BGR(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + cuda_op::DataShape shape, cudaStream_t stream) +{ + using Policy = CvtKernelPolicy<32, 4, 8>; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(shape.W, Policy::TileWidth), divUp(shape.H, Policy::TileHeight), shape.N); + + int2 dstSize{shape.W, shape.H}; + + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + gray_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize); + checkKernelErrors(); + return ErrorCode::SUCCESS; } @@ -735,72 +891,80 @@ inline ErrorCode GRAY_to_BGR(const TensorDataStridedCuda &inData, const TensorDa if (inputShape.C != 1) { - LOG_ERROR("Invalid input channel number " << inputShape.C); + LOG_ERROR("Invalid input channel number " << inputShape.C << " -- expecting 1"); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (outputShape.C != dch) + { + LOG_ERROR("Invalid output channel number " << outputShape.C << " -- expecting " << dch); return ErrorCode::INVALID_DATA_SHAPE; } if (outDataType != inDataType) { - LOG_ERROR("Unsupported input/output DataType " << inDataType << "/" << outDataType); + LOG_ERROR("Mismatched input / output DataTypes " << inDataType << " / " << outDataType); return ErrorCode::INVALID_DATA_TYPE; } - if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N - || outputShape.C != dch) + if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N) { - LOG_ERROR("Invalid output shape " << outputShape); + LOG_ERROR("Shape mismatch -- output tensor shape " << outputShape << " doesn't match input tensor shape " + << inputShape); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (outDataType == kCV_16F && dch == 4) + { + LOG_ERROR("Adding 
alpha to the output is not supported for " << outDataType); return ErrorCode::INVALID_DATA_SHAPE; } - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); +#define CVCUDA_GRAY2BGR_IF(DCH, SRC_T, DST_T) \ + if (dch == DCH) \ + return Launch_GRAY_to_BGR(inData, outData, inputShape, stream) - int2 dstSize{outputShape.W, outputShape.H}; +#define CVCUDA_GRAY2BGR_CASE(T, T3, T4) \ + CVCUDA_GRAY2BGR_IF(3, T, T3); \ + else CVCUDA_GRAY2BGR_IF(4, T, T4); \ + else return ErrorCode::INVALID_DATA_SHAPE switch (inDataType) { case kCV_8U: case kCV_8S: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - gray_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, dch); - checkKernelErrors(); - } - break; + CVCUDA_GRAY2BGR_CASE(uchar, uchar3, uchar4); + case kCV_16F: // Not properly handled when adding alpha to the destination. case kCV_16U: - case kCV_16F: case kCV_16S: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - gray_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, dch); - checkKernelErrors(); - } - break; + CVCUDA_GRAY2BGR_CASE(ushort, ushort3, ushort4); case kCV_32S: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - gray_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, dch); - checkKernelErrors(); - } - break; + CVCUDA_GRAY2BGR_CASE(int, int3, int4); case kCV_32F: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - gray_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, dch); - checkKernelErrors(); - } - break; + CVCUDA_GRAY2BGR_CASE(float, float3, float4); case kCV_64F: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - gray_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, dch); - checkKernelErrors(); - } - break; + 
CVCUDA_GRAY2BGR_CASE(double, double3, double4); + default: + LOG_ERROR("Unsupported DataType " << inDataType); + return ErrorCode::INVALID_DATA_TYPE; } +#undef CVCUDA_GRAY2BGR_CASE +#undef CVCUDA_GRAY2BGR_IF + return ErrorCode::SUCCESS; +} + +template +inline ErrorCode Launch_BGR_to_GRAY(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + cuda_op::DataShape shape, int bidx, cudaStream_t stream) +{ + using Policy = CvtKernelPolicy<32, 4, 4>; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(shape.W, Policy::TileWidth), divUp(shape.H, Policy::TileHeight), shape.N); + + int2 dstSize{shape.W, shape.H}; + + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + bgr_to_gray_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); + checkKernelErrors(); + return ErrorCode::SUCCESS; } @@ -824,56 +988,68 @@ inline ErrorCode BGR_to_GRAY(const TensorDataStridedCuda &inData, const TensorDa if (inputShape.C != sch) { - LOG_ERROR("Invalid input channel number " << inputShape.C << " expecting: " << sch); + LOG_ERROR("Invalid input channel number " << inputShape.C << " -- expecting " << sch); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (outputShape.C != 1) + { + LOG_ERROR("Invalid output channel number " << outputShape.C << " -- expecting 1"); return ErrorCode::INVALID_DATA_SHAPE; } if (outDataType != inDataType) { - LOG_ERROR("Unsupported input/output DataType " << inDataType << "/" << outDataType); + LOG_ERROR("Mismatched input / output DataTypes " << inDataType << " / " << outDataType); return ErrorCode::INVALID_DATA_TYPE; } - if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N - || outputShape.C != 1) + if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N) { - LOG_ERROR("Invalid output shape " << outputShape); + LOG_ERROR("Shape mismatch -- output tensor shape " << outputShape << " 
doesn't match input tensor shape " + << inputShape); return ErrorCode::INVALID_DATA_SHAPE; } - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); +#define CVCUDA_BGR2GRAY_IF(SCH, SRC_T, DST_T) \ + if (sch == SCH) \ + return Launch_BGR_to_GRAY(inData, outData, inputShape, bidx, stream) - int2 dstSize{outputShape.W, outputShape.H}; +#define CVCUDA_BGR2GRAY_CASE(T, T3, T4) \ + CVCUDA_BGR2GRAY_IF(3, T3, T); \ + else CVCUDA_BGR2GRAY_IF(4, T4, T); \ + else return ErrorCode::INVALID_DATA_SHAPE switch (inDataType) { case kCV_8U: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - bgr_to_gray_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2GRAY_CASE(uchar, uchar3, uchar4); case kCV_16U: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - bgr_to_gray_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2GRAY_CASE(ushort, ushort3, ushort4); case kCV_32F: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - bgr_to_gray_float_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2GRAY_CASE(float, float3, float4); default: LOG_ERROR("Unsupported DataType " << inDataType); return ErrorCode::INVALID_DATA_TYPE; } +#undef CVCUDA_BGR2GRAY_CASE +#undef CVCUDA_BGR2GRAY_IF + return ErrorCode::SUCCESS; +} + +template +inline ErrorCode Launch_BGR_to_YUV(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + cuda_op::DataShape shape, int bidx, cudaStream_t stream) +{ + using Policy = CvtKernelPolicy<32, 4, 4>; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(shape.W, Policy::TileWidth), divUp(shape.H, Policy::TileHeight), shape.N); + + int2 
dstSize{shape.W, shape.H}; + + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + bgr_to_yuv_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); + checkKernelErrors(); + return ErrorCode::SUCCESS; } @@ -910,41 +1086,39 @@ inline ErrorCode BGR_to_YUV(const TensorDataStridedCuda &inData, const TensorDat return ErrorCode::INVALID_DATA_SHAPE; } - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); - - int2 dstSize{outputShape.W, outputShape.H}; - +#define CVCUDA_BGR2YUV_CASE(T3) return Launch_BGR_to_YUV(inData, outData, inputShape, bidx, stream) switch (inDataType) { case kCV_8U: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - bgr_to_yuv_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2YUV_CASE(uchar3); case kCV_16U: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - bgr_to_yuv_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2YUV_CASE(ushort3); case kCV_32F: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - bgr_to_yuv_float_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_BGR2YUV_CASE(float3); default: LOG_ERROR("Unsupported DataType " << inDataType); return ErrorCode::INVALID_DATA_TYPE; } +#undef CVCUDA_BGR2YUV_CASE + return ErrorCode::SUCCESS; +} + +template +inline ErrorCode Launch_YUV_to_BGR(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + cuda_op::DataShape shape, int bidx, cudaStream_t stream) +{ + using Policy = CvtKernelPolicy<32, 4, 4>; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(shape.W, Policy::TileWidth), divUp(shape.H, Policy::TileHeight), 
shape.N); + + int2 dstSize{shape.W, shape.H}; + + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + yuv_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); + checkKernelErrors(); + return ErrorCode::SUCCESS; } @@ -981,41 +1155,49 @@ inline ErrorCode YUV_to_BGR(const TensorDataStridedCuda &inData, const TensorDat return ErrorCode::INVALID_DATA_SHAPE; } - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); - - int2 dstSize{outputShape.W, outputShape.H}; - +#define CVCUDA_YUV2BGR_CASE(T3) return Launch_YUV_to_BGR(inData, outData, inputShape, bidx, stream) switch (inDataType) { case kCV_8U: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - yuv_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_YUV2BGR_CASE(uchar3); case kCV_16U: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - yuv_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_YUV2BGR_CASE(ushort3); case kCV_32F: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - yuv_to_bgr_float_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; + CVCUDA_YUV2BGR_CASE(float3); default: LOG_ERROR("Unsupported DataType " << inDataType); return ErrorCode::INVALID_DATA_TYPE; } +#undef CVCUDA_YUV2BGR_CASE + return ErrorCode::SUCCESS; +} + +template +inline ErrorCode Launch_BGR_to_HSV(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + cuda_op::DataShape shape, int bidx, bool isFullRange, bool strides_64b, + cudaStream_t stream) +{ + using Policy = CvtKernelPolicy<32, 4, 4>; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(shape.W, 
Policy::TileWidth), divUp(shape.H, Policy::TileHeight), shape.N); + + int2 dstSize{shape.W, shape.H}; + + if (strides_64b) + { + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + bgr_to_hsv_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, isFullRange); + } + else + { + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + bgr_to_hsv_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, isFullRange); + } + checkKernelErrors(); + return ErrorCode::SUCCESS; } @@ -1053,33 +1235,53 @@ inline ErrorCode BGR_to_HSV(const TensorDataStridedCuda &inData, const TensorDat return ErrorCode::INVALID_DATA_SHAPE; } - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); + const bool strides_64b = std::max(inAccess->sampleStride() * inAccess->numSamples(), + outAccess->sampleStride() * outAccess->numSamples()) + > nvcv::cuda::TypeTraits::max; + +#define CVCUDA_BGR2HSV_CASE(T3) \ + return Launch_BGR_to_HSV(inData, outData, inputShape, bidx, isFullRange, strides_64b, stream) + + switch (inDataType) + { + case kCV_8U: + CVCUDA_BGR2HSV_CASE(uchar3); + case kCV_32F: + CVCUDA_BGR2HSV_CASE(float3); + default: + LOG_ERROR("Unsupported DataType " << inDataType); + return ErrorCode::INVALID_DATA_TYPE; + } +#undef CVCUDA_BGR2HSV_CASE + return ErrorCode::SUCCESS; +} + +template +inline ErrorCode Launch_HSV_to_BGR(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + cuda_op::DataShape shape, int bidx, bool isFullRange, bool strides_64b, + cudaStream_t stream) +{ + using Policy = CvtKernelPolicy<32, 4, 4>; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(shape.W, Policy::TileWidth), divUp(shape.H, Policy::TileHeight), shape.N); - int2 dstSize{outputShape.W, outputShape.H}; + int2 dstSize{shape.W, shape.H}; - switch (inDataType) - { - case kCV_8U: + if (strides_64b) { - auto 
srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - bgr_to_hsv_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, isFullRange); - checkKernelErrors(); + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + hsv_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, isFullRange); } - break; - case kCV_32F: + else { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - bgr_to_hsv_float_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx); - checkKernelErrors(); - } - break; - default: - LOG_ERROR("Unsupported DataType " << inDataType); - return ErrorCode::INVALID_DATA_TYPE; + auto srcWrap = cuda::CreateTensorWrapNHW(inData); + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + hsv_to_bgr_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, isFullRange); } + checkKernelErrors(); + return ErrorCode::SUCCESS; } @@ -1122,34 +1324,63 @@ inline ErrorCode HSV_to_BGR(const TensorDataStridedCuda &inData, const TensorDat return ErrorCode::INVALID_DATA_SHAPE; } - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); + const int dcn = outputShape.C; + const bool strides_64b = std::max(inAccess->sampleStride() * inAccess->numSamples(), + outAccess->sampleStride() * outAccess->numSamples()) + > nvcv::cuda::TypeTraits::max; - int2 dstSize{outputShape.W, outputShape.H}; - int dcn = outputShape.C; +#define CVCUDA_HSV2BGR_CASE(T3, T4) \ + if (dcn == 3) \ + return Launch_HSV_to_BGR(inData, outData, inputShape, bidx, isFullRange, strides_64b, stream); \ + else \ + return Launch_HSV_to_BGR(inData, outData, inputShape, bidx, isFullRange, strides_64b, stream) switch (inDataType) { case kCV_8U: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - hsv_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, dcn, isFullRange); - 
checkKernelErrors(); - } - break; + CVCUDA_HSV2BGR_CASE(uchar3, uchar4); case kCV_32F: - { - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); - hsv_to_bgr_float_nhwc<<>>(srcWrap, dstWrap, dstSize, bidx, dcn); - checkKernelErrors(); - } - break; + CVCUDA_HSV2BGR_CASE(float3, float4); default: LOG_ERROR("Unsupported DataType " << inDataType); return ErrorCode::INVALID_DATA_TYPE; } +#undef CVCUDA_HSV2BGR_CASE + return ErrorCode::SUCCESS; +} + +template +inline ErrorCode Launch_YUV420xp_to_BGR(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + cuda_op::DataShape shape, int bidx, int uidx, bool strides_64b, + cudaStream_t stream) +{ + using Policy = CvtKernelPolicy<32, 4, 4>; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(shape.W, Policy::TileWidth), divUp(shape.H, Policy::TileHeight), shape.N); + + int2 dstSize{shape.W, shape.H}; + + if (strides_64b) + { + // YUV420 input: 4D tensor with scalar type. + auto srcWrap = cuda::CreateTensorWrapNHWC(inData); + // BGR output: 3D tensor with vector type. + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + yuv420_to_bgr_char_nhwc + <<>>(srcWrap, dstWrap, dstSize, bidx, uidx); + } + else + { + // YUV420 input: 4D tensor with scalar type. + auto srcWrap = cuda::CreateTensorWrapNHWC(inData); + // BGR output: 3D tensor with vector type. + auto dstWrap = cuda::CreateTensorWrapNHW(outData); + yuv420_to_bgr_char_nhwc + <<>>(srcWrap, dstWrap, dstSize, bidx, uidx); + } + checkKernelErrors(); + return ErrorCode::SUCCESS; } @@ -1170,6 +1401,13 @@ inline ErrorCode YUV420xp_to_BGR(const TensorDataStridedCuda &inData, const Tens ? 
0 : 1; + // clang-format off + bool p420 = (code == NVCV_COLOR_YUV2BGR_YV12 || code == NVCV_COLOR_YUV2BGRA_YV12 || + code == NVCV_COLOR_YUV2RGB_YV12 || code == NVCV_COLOR_YUV2RGBA_YV12 || + code == NVCV_COLOR_YUV2BGR_IYUV || code == NVCV_COLOR_YUV2BGRA_IYUV || + code == NVCV_COLOR_YUV2RGB_IYUV || code == NVCV_COLOR_YUV2RGBA_IYUV); + // clang-format on + auto inAccess = TensorDataAccessStridedImagePlanar::Create(inData); NVCV_ASSERT(inAccess); @@ -1182,7 +1420,7 @@ inline ErrorCode YUV420xp_to_BGR(const TensorDataStridedCuda &inData, const Tens cuda_op::DataType outDataType = helpers::GetLegacyDataType(outData.dtype()); cuda_op::DataShape outputShape = helpers::GetLegacyDataShape(outAccess->infoShape()); - if (outputShape.C != 3 && outputShape.C != 4) + if ((code != NVCV_COLOR_YUV2GRAY_420 || outputShape.C != 1) && outputShape.C != 3 && outputShape.C != 4) { LOG_ERROR("Invalid output channel number " << outputShape.C); return ErrorCode::INVALID_DATA_SHAPE; @@ -1211,32 +1449,36 @@ inline ErrorCode YUV420xp_to_BGR(const TensorDataStridedCuda &inData, const Tens LOG_ERROR("Invalid output shape " << outputShape); return ErrorCode::INVALID_DATA_SHAPE; } + if (p420 && rgb_height % 4 != 0) // YUV 420 planar formats need 4 rows of Y for every full row of U or V. 
+ { + LOG_ERROR( + "Invalid input shape: to convert from YUV 420 planar formats, the output " + "tensor height must be a multiple of 4; height = " + << rgb_height); + return ErrorCode::INVALID_DATA_SHAPE; + } - dim3 blockSize(BLOCK, BLOCK / 1, 1); - dim3 gridSize(divUp(rgb_width, blockSize.x), divUp(rgb_height, blockSize.y), inputShape.N); - - int2 dstSize{outputShape.W, outputShape.H}; - int dcn = outputShape.C; - - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); + const int dcn = outputShape.C; + const bool strides_64b = std::max(inAccess->sampleStride() * inAccess->numSamples(), + outAccess->sampleStride() * outAccess->numSamples()) + > nvcv::cuda::TypeTraits::max; switch (code) { case NVCV_COLOR_YUV2GRAY_420: { - /* Method 1 */ - // yuv420_to_gray_char_nhwc<<>>(srcWrap, dstWrap, dstSize); - // checkKernelErrors(); + int dpitch = static_cast(outAccess->rowStride()); + int spitch = static_cast(inAccess->rowStride()); - /* Method 2 (Better performance, but only works with fixed input shapes) */ - int dpitch = static_cast(outAccess->sampleStride()); - int spitch = static_cast(inAccess->sampleStride()); - int cpy_width = static_cast(outAccess->sampleStride()); - int cpy_height = inputShape.N; + for (int i = 0; i < inputShape.N; i++) + { + const void *srcPtr = inData.basePtr() + (size_t)i * inAccess->sampleStride(); + + void *dstPtr = outData.basePtr() + (size_t)i * outAccess->sampleStride(); - checkCudaErrors(cudaMemcpy2DAsync(outData.basePtr(), dpitch, inData.basePtr(), spitch, cpy_width, cpy_height, - cudaMemcpyDeviceToDevice, stream)); + checkCudaErrors(cudaMemcpy2DAsync(dstPtr, dpitch, srcPtr, spitch, rgb_width, rgb_height, + cudaMemcpyDeviceToDevice, stream)); + } } break; case NVCV_COLOR_YUV2BGR_NV12: @@ -1247,11 +1489,12 @@ inline ErrorCode YUV420xp_to_BGR(const TensorDataStridedCuda &inData, const Tens case NVCV_COLOR_YUV2RGB_NV21: case NVCV_COLOR_YUV2RGBA_NV12: case NVCV_COLOR_YUV2RGBA_NV21: - { - 
yuv420sp_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, dstSize, dcn, bidx, uidx); - checkKernelErrors(); - } - break; + if (dcn == 3) + return Launch_YUV420xp_to_BGR(inData, outData, outputShape, bidx, uidx, strides_64b, + stream); + else + return Launch_YUV420xp_to_BGR(inData, outData, outputShape, bidx, uidx, strides_64b, + stream); case NVCV_COLOR_YUV2BGR_YV12: case NVCV_COLOR_YUV2BGR_IYUV: case NVCV_COLOR_YUV2BGRA_YV12: @@ -1260,11 +1503,12 @@ inline ErrorCode YUV420xp_to_BGR(const TensorDataStridedCuda &inData, const Tens case NVCV_COLOR_YUV2RGB_IYUV: case NVCV_COLOR_YUV2RGBA_YV12: case NVCV_COLOR_YUV2RGBA_IYUV: - { - yuv420p_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, dstSize, dcn, bidx, uidx); - checkKernelErrors(); - } - break; + if (dcn == 3) + return Launch_YUV420xp_to_BGR(inData, outData, outputShape, bidx, uidx, strides_64b, + stream); + else + return Launch_YUV420xp_to_BGR(inData, outData, outputShape, bidx, uidx, strides_64b, + stream); default: LOG_ERROR("Unsupported conversion code " << code); return ErrorCode::INVALID_PARAMETER; @@ -1272,28 +1516,61 @@ inline ErrorCode YUV420xp_to_BGR(const TensorDataStridedCuda &inData, const Tens return ErrorCode::SUCCESS; } -inline ErrorCode YUV422_to_BGR(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - NVCVColorConversionCode code, cudaStream_t stream) +template +inline ErrorCode Launch_BGR_to_YUV420xp(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + DataShape inputShape, int bidx, int uidx, bool strides_64b, cudaStream_t stream) +{ + using Policy = CvtKernelPolicy<32, 4, 4>; + + int2 srcSize{inputShape.W, inputShape.H}; + + dim3 blockSize(Policy::BlockWidth, Policy::BlockHeight); + dim3 gridSize(divUp(inputShape.W, Policy::TileWidth), divUp(inputShape.H, Policy::TileHeight), inputShape.N); + + if (strides_64b) + { + auto srcWrap = cuda::CreateTensorWrapNHW(inData); // RGB input: 3D tensor with vector type. 
+ auto dstWrap = cuda::CreateTensorWrapNHWC(outData); // YUV420 output: 4D scalar tensor. + + bgr_to_yuv420_char_nhwc + <<>>(srcWrap, dstWrap, srcSize, bidx, uidx); + } + else + { + auto srcWrap = cuda::CreateTensorWrapNHW(inData); // RGB input: 3D tensor with vector type. + auto dstWrap = cuda::CreateTensorWrapNHWC(outData); // YUV420 output: 4D scalar tensor. + + bgr_to_yuv420_char_nhwc + <<>>(srcWrap, dstWrap, srcSize, bidx, uidx); + } + checkKernelErrors(); + + return ErrorCode::SUCCESS; +} + +inline ErrorCode BGR_to_YUV420xp(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + NVCVColorConversionCode code, cudaStream_t stream) { int bidx - = (code == NVCV_COLOR_YUV2BGR_YUY2 || code == NVCV_COLOR_YUV2BGRA_YUY2 || code == NVCV_COLOR_YUV2BGR_YVYU - || code == NVCV_COLOR_YUV2BGRA_YVYU || code == NVCV_COLOR_YUV2BGR_UYVY || code == NVCV_COLOR_YUV2BGRA_UYVY) + = (code == NVCV_COLOR_BGR2YUV_NV12 || code == NVCV_COLOR_BGRA2YUV_NV12 || code == NVCV_COLOR_BGR2YUV_NV21 + || code == NVCV_COLOR_BGRA2YUV_NV21 || code == NVCV_COLOR_BGR2YUV_YV12 || code == NVCV_COLOR_BGRA2YUV_YV12 + || code == NVCV_COLOR_BGR2YUV_IYUV || code == NVCV_COLOR_BGRA2YUV_IYUV) ? 0 : 2; - int yidx - = (code == NVCV_COLOR_YUV2BGR_YUY2 || code == NVCV_COLOR_YUV2BGRA_YUY2 || code == NVCV_COLOR_YUV2RGB_YUY2 - || code == NVCV_COLOR_YUV2RGBA_YUY2 || code == NVCV_COLOR_YUV2BGR_YVYU || code == NVCV_COLOR_YUV2BGRA_YVYU - || code == NVCV_COLOR_YUV2RGB_YVYU || code == NVCV_COLOR_YUV2RGBA_YVYU || code == NVCV_COLOR_YUV2GRAY_YUY2) + int uidx + = (code == NVCV_COLOR_BGR2YUV_NV12 || code == NVCV_COLOR_BGRA2YUV_NV12 || code == NVCV_COLOR_RGB2YUV_NV12 + || code == NVCV_COLOR_RGBA2YUV_NV12 || code == NVCV_COLOR_BGR2YUV_IYUV || code == NVCV_COLOR_BGRA2YUV_IYUV + || code == NVCV_COLOR_RGB2YUV_IYUV || code == NVCV_COLOR_RGBA2YUV_IYUV) ? 
0 : 1; - int uidx - = (code == NVCV_COLOR_YUV2BGR_YUY2 || code == NVCV_COLOR_YUV2BGRA_YUY2 || code == NVCV_COLOR_YUV2RGB_YUY2 - || code == NVCV_COLOR_YUV2RGBA_YUY2 || code == NVCV_COLOR_YUV2BGR_UYVY || code == NVCV_COLOR_YUV2BGRA_UYVY - || code == NVCV_COLOR_YUV2RGB_UYVY || code == NVCV_COLOR_YUV2RGBA_UYVY) - ? 0 - : 2; + // clang-format off + bool p420 = (code == NVCV_COLOR_BGR2YUV_YV12 || code == NVCV_COLOR_BGRA2YUV_YV12 || + code == NVCV_COLOR_RGB2YUV_YV12 || code == NVCV_COLOR_RGBA2YUV_YV12 || + code == NVCV_COLOR_BGR2YUV_IYUV || code == NVCV_COLOR_BGRA2YUV_IYUV || + code == NVCV_COLOR_RGB2YUV_IYUV || code == NVCV_COLOR_RGBA2YUV_IYUV); + // clang-format on auto inAccess = TensorDataAccessStridedImagePlanar::Create(inData); NVCV_ASSERT(inAccess); @@ -1307,14 +1584,22 @@ inline ErrorCode YUV422_to_BGR(const TensorDataStridedCuda &inData, const Tensor cuda_op::DataType outDataType = helpers::GetLegacyDataType(outData.dtype()); cuda_op::DataShape outputShape = helpers::GetLegacyDataShape(outAccess->infoShape()); - if (outputShape.C != 3 && outputShape.C != 4) + if (inputShape.C != 3 && inputShape.C != 4) { - LOG_ERROR("Invalid output channel number " << outputShape.C); + LOG_ERROR("Invalid input channel number " << inputShape.C); return ErrorCode::INVALID_DATA_SHAPE; } - if (inputShape.C != 2) + if (inputShape.H % 2 != 0 || inputShape.W % 2 != 0) { - LOG_ERROR("Invalid input channel number " << inputShape.C); + LOG_ERROR("Invalid input shape " << inputShape); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (p420 && inputShape.H % 4 != 0) // YUV 420 planar formats need 4 rows of Y for every full row of U or V. 
+ { + LOG_ERROR( + "Invalid input shape: to convert to YUV 420 planar formats, the input " + "tensor height must be a multiple of 4; height = " + << inputShape.H); return ErrorCode::INVALID_DATA_SHAPE; } if (inDataType != kCV_8U || outDataType != kCV_8U) @@ -1322,47 +1607,50 @@ inline ErrorCode YUV422_to_BGR(const TensorDataStridedCuda &inData, const Tensor LOG_ERROR("Unsupported input/output DataType " << inDataType << "/" << outDataType); return ErrorCode::INVALID_DATA_TYPE; } - if (outputShape.H != inputShape.H || outputShape.W != inputShape.W || outputShape.N != inputShape.N) + + int yuv420_width = inputShape.W; + int yuv420_height = inputShape.H / 2 * 3; + + if (outputShape.H != yuv420_height || outputShape.W != yuv420_width || outputShape.N != inputShape.N) { LOG_ERROR("Invalid output shape " << outputShape); return ErrorCode::INVALID_DATA_SHAPE; } - dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); - - int2 dstSize{outputShape.W, outputShape.H}; - int dcn = outputShape.C; - - auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - auto dstWrap = cuda::CreateTensorWrapNHWC(outData); + const bool strides_64b = std::max(inAccess->sampleStride() * inAccess->numSamples(), + outAccess->sampleStride() * outAccess->numSamples()) + > nvcv::cuda::TypeTraits::max; switch (code) { - case NVCV_COLOR_YUV2GRAY_YUY2: - case NVCV_COLOR_YUV2GRAY_UYVY: - { - yuv422_to_gray_char_nhwc<<>>(srcWrap, dstWrap, dstSize, yidx); - checkKernelErrors(); - } - break; - case NVCV_COLOR_YUV2BGR_YUY2: - case NVCV_COLOR_YUV2BGR_YVYU: - case NVCV_COLOR_YUV2BGRA_YUY2: - case NVCV_COLOR_YUV2BGRA_YVYU: - case NVCV_COLOR_YUV2RGB_YUY2: - case NVCV_COLOR_YUV2RGB_YVYU: - case NVCV_COLOR_YUV2RGBA_YUY2: - case NVCV_COLOR_YUV2RGBA_YVYU: - case NVCV_COLOR_YUV2RGB_UYVY: - case NVCV_COLOR_YUV2BGR_UYVY: - case NVCV_COLOR_YUV2RGBA_UYVY: - case NVCV_COLOR_YUV2BGRA_UYVY: - { - yuv422_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, 
dstSize, dcn, bidx, yidx, uidx); - checkKernelErrors(); - } - break; + case NVCV_COLOR_BGR2YUV_NV12: + case NVCV_COLOR_BGR2YUV_NV21: + case NVCV_COLOR_BGRA2YUV_NV12: + case NVCV_COLOR_BGRA2YUV_NV21: + case NVCV_COLOR_RGB2YUV_NV12: + case NVCV_COLOR_RGB2YUV_NV21: + case NVCV_COLOR_RGBA2YUV_NV12: + case NVCV_COLOR_RGBA2YUV_NV21: + if (inputShape.C == 3) + return Launch_BGR_to_YUV420xp(inData, outData, inputShape, bidx, uidx, strides_64b, + stream); + else + return Launch_BGR_to_YUV420xp(inData, outData, inputShape, bidx, uidx, strides_64b, + stream); + case NVCV_COLOR_BGR2YUV_YV12: + case NVCV_COLOR_BGR2YUV_IYUV: + case NVCV_COLOR_BGRA2YUV_YV12: + case NVCV_COLOR_BGRA2YUV_IYUV: + case NVCV_COLOR_RGB2YUV_YV12: + case NVCV_COLOR_RGB2YUV_IYUV: + case NVCV_COLOR_RGBA2YUV_YV12: + case NVCV_COLOR_RGBA2YUV_IYUV: + if (inputShape.C == 3) + return Launch_BGR_to_YUV420xp(inData, outData, inputShape, bidx, uidx, strides_64b, + stream); + else + return Launch_BGR_to_YUV420xp(inData, outData, inputShape, bidx, uidx, strides_64b, + stream); default: LOG_ERROR("Unsupported conversion code " << code); return ErrorCode::INVALID_PARAMETER; @@ -1370,53 +1658,29 @@ inline ErrorCode YUV422_to_BGR(const TensorDataStridedCuda &inData, const Tensor return ErrorCode::SUCCESS; } -template -inline static void bgr_to_yuv420p_launcher(SrcWrapper srcWrap, DstWrapper dstWrap, DataShape inputShape, int bidx, - int uidx, cudaStream_t stream) -{ - int2 srcSize{inputShape.W, inputShape.H}; - // method 1 - dim3 blockSize(BLOCK, BLOCK / 1, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); - bgr_to_yuv420p_char_nhwc<<>>(srcWrap, dstWrap, srcSize, inputShape.C, bidx, uidx); - checkKernelErrors(); - - // method 2 (TODO) - // NPP -} - -template -inline static void bgr_to_yuv420sp_launcher(SrcWrapper srcWrap, DstWrapper dstWrap, DataShape inputShape, int bidx, - int uidx, cudaStream_t stream) -{ - int2 srcSize{inputShape.W, inputShape.H}; - // method 1 - 
dim3 blockSize(BLOCK, BLOCK / 1, 1); - dim3 gridSize(divUp(inputShape.W, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); - bgr_to_yuv420sp_char_nhwc<<>>(srcWrap, dstWrap, srcSize, inputShape.C, bidx, uidx); - checkKernelErrors(); - - // method 2 (TODO) - // NPP -} - -inline ErrorCode BGR_to_YUV420xp(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, - NVCVColorConversionCode code, cudaStream_t stream) +inline ErrorCode YUV422_to_BGR(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, + NVCVColorConversionCode code, cudaStream_t stream) { int bidx - = (code == NVCV_COLOR_BGR2YUV_NV12 || code == NVCV_COLOR_BGRA2YUV_NV12 || code == NVCV_COLOR_BGR2YUV_NV21 - || code == NVCV_COLOR_BGRA2YUV_NV21 || code == NVCV_COLOR_BGR2YUV_YV12 || code == NVCV_COLOR_BGRA2YUV_YV12 - || code == NVCV_COLOR_BGR2YUV_IYUV || code == NVCV_COLOR_BGRA2YUV_IYUV) + = (code == NVCV_COLOR_YUV2BGR_YUY2 || code == NVCV_COLOR_YUV2BGRA_YUY2 || code == NVCV_COLOR_YUV2BGR_YVYU + || code == NVCV_COLOR_YUV2BGRA_YVYU || code == NVCV_COLOR_YUV2BGR_UYVY || code == NVCV_COLOR_YUV2BGRA_UYVY) ? 0 : 2; - int uidx - = (code == NVCV_COLOR_BGR2YUV_NV12 || code == NVCV_COLOR_BGRA2YUV_NV12 || code == NVCV_COLOR_RGB2YUV_NV12 - || code == NVCV_COLOR_RGBA2YUV_NV12 || code == NVCV_COLOR_BGR2YUV_IYUV || code == NVCV_COLOR_BGRA2YUV_IYUV - || code == NVCV_COLOR_RGB2YUV_IYUV || code == NVCV_COLOR_RGBA2YUV_IYUV) + int yidx + = (code == NVCV_COLOR_YUV2BGR_YUY2 || code == NVCV_COLOR_YUV2BGRA_YUY2 || code == NVCV_COLOR_YUV2RGB_YUY2 + || code == NVCV_COLOR_YUV2RGBA_YUY2 || code == NVCV_COLOR_YUV2BGR_YVYU || code == NVCV_COLOR_YUV2BGRA_YVYU + || code == NVCV_COLOR_YUV2RGB_YVYU || code == NVCV_COLOR_YUV2RGBA_YVYU || code == NVCV_COLOR_YUV2GRAY_YUY2) ? 
0 : 1; + int uidx + = (code == NVCV_COLOR_YUV2BGR_YUY2 || code == NVCV_COLOR_YUV2BGRA_YUY2 || code == NVCV_COLOR_YUV2RGB_YUY2 + || code == NVCV_COLOR_YUV2RGBA_YUY2 || code == NVCV_COLOR_YUV2BGR_UYVY || code == NVCV_COLOR_YUV2BGRA_UYVY + || code == NVCV_COLOR_YUV2RGB_UYVY || code == NVCV_COLOR_YUV2RGBA_UYVY) + ? 0 + : 2; + auto inAccess = TensorDataAccessStridedImagePlanar::Create(inData); NVCV_ASSERT(inAccess); @@ -1429,61 +1693,68 @@ inline ErrorCode BGR_to_YUV420xp(const TensorDataStridedCuda &inData, const Tens cuda_op::DataType outDataType = helpers::GetLegacyDataType(outData.dtype()); cuda_op::DataShape outputShape = helpers::GetLegacyDataShape(outAccess->infoShape()); - if (inputShape.C != 3 && inputShape.C != 4) + if (inputShape.W % 4 != 0) { - LOG_ERROR("Invalid input channel number " << inputShape.C); + LOG_ERROR("Invalid input shape " << inputShape << " -- width must be a multiple of 4"); return ErrorCode::INVALID_DATA_SHAPE; } - if (inputShape.H % 2 != 0 || inputShape.W % 2 != 0) + + if ((code != NVCV_COLOR_YUV2GRAY_UYVY && code != NVCV_COLOR_YUV2GRAY_YUY2 || outputShape.C != 1) + && outputShape.C != 3 && outputShape.C != 4) { - LOG_ERROR("Invalid input shape " << inputShape); + LOG_ERROR("Invalid output channel number " + << outputShape.C + << " -- RGB output must have 3 or 4 channels and grayscale output must have 1 channel"); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (inputShape.C != 1) + { + LOG_ERROR("Invalid input channel number " << inputShape.C << " -- input must have 1 channel"); return ErrorCode::INVALID_DATA_SHAPE; } if (inDataType != kCV_8U || outDataType != kCV_8U) { - LOG_ERROR("Unsupported input/output DataType " << inDataType << "/" << outDataType); + LOG_ERROR("Unsupported input / output DataType " << inDataType << " / " << outDataType); return ErrorCode::INVALID_DATA_TYPE; } - - int yuv420_width = inputShape.W; - int yuv420_height = inputShape.H / 2 * 3; - - if (outputShape.H != yuv420_height || outputShape.W != yuv420_width || 
outputShape.N != inputShape.N) + if (outputShape.H != inputShape.H || 2 * outputShape.W != inputShape.W || outputShape.N != inputShape.N) { LOG_ERROR("Invalid output shape " << outputShape); return ErrorCode::INVALID_DATA_SHAPE; } - // BGR input + dim3 blockSize(BLOCK, BLOCK / 4, 1); + dim3 gridSize(divUp(inputShape.W / 4, blockSize.x), divUp(inputShape.H, blockSize.y), inputShape.N); + + int2 dstSize{outputShape.W, outputShape.H}; + int dcn = outputShape.C; + auto srcWrap = cuda::CreateTensorWrapNHWC(inData); - // YUV420 output auto dstWrap = cuda::CreateTensorWrapNHWC(outData); switch (code) { - case NVCV_COLOR_BGR2YUV_NV12: - case NVCV_COLOR_BGR2YUV_NV21: - case NVCV_COLOR_BGRA2YUV_NV12: - case NVCV_COLOR_BGRA2YUV_NV21: - case NVCV_COLOR_RGB2YUV_NV12: - case NVCV_COLOR_RGB2YUV_NV21: - case NVCV_COLOR_RGBA2YUV_NV12: - case NVCV_COLOR_RGBA2YUV_NV21: + case NVCV_COLOR_YUV2GRAY_YUY2: + case NVCV_COLOR_YUV2GRAY_UYVY: { - bgr_to_yuv420sp_launcher(srcWrap, dstWrap, inputShape, bidx, uidx, stream); + yuv422_to_gray_char_nhwc<<>>(srcWrap, dstWrap, dstSize, yidx); checkKernelErrors(); } break; - case NVCV_COLOR_BGR2YUV_YV12: - case NVCV_COLOR_BGR2YUV_IYUV: - case NVCV_COLOR_BGRA2YUV_YV12: - case NVCV_COLOR_BGRA2YUV_IYUV: - case NVCV_COLOR_RGB2YUV_YV12: - case NVCV_COLOR_RGB2YUV_IYUV: - case NVCV_COLOR_RGBA2YUV_YV12: - case NVCV_COLOR_RGBA2YUV_IYUV: + case NVCV_COLOR_YUV2BGR_YUY2: + case NVCV_COLOR_YUV2BGR_YVYU: + case NVCV_COLOR_YUV2BGRA_YUY2: + case NVCV_COLOR_YUV2BGRA_YVYU: + case NVCV_COLOR_YUV2RGB_YUY2: + case NVCV_COLOR_YUV2RGB_YVYU: + case NVCV_COLOR_YUV2RGBA_YUY2: + case NVCV_COLOR_YUV2RGBA_YVYU: + case NVCV_COLOR_YUV2RGB_UYVY: + case NVCV_COLOR_YUV2BGR_UYVY: + case NVCV_COLOR_YUV2RGBA_UYVY: + case NVCV_COLOR_YUV2BGRA_UYVY: { - bgr_to_yuv420p_launcher(srcWrap, dstWrap, inputShape, bidx, uidx, stream); + yuv422_to_bgr_char_nhwc<<>>(srcWrap, dstWrap, dstSize, dcn, bidx, yidx, uidx); checkKernelErrors(); } break; diff --git 
a/src/cvcuda/priv/legacy/cvt_color_var_shape.cu b/src/cvcuda/priv/legacy/cvt_color_var_shape.cu index d40fa847..a12c50d3 100644 --- a/src/cvcuda/priv/legacy/cvt_color_var_shape.cu +++ b/src/cvcuda/priv/legacy/cvt_color_var_shape.cu @@ -338,9 +338,9 @@ __global__ void bgr_to_hsv_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, cu vmin = min(vmin, g); vmin = min(vmin, r); - unsigned char diff = cuda::SaturateCast(v - vmin); - vr = v == r ? -1 : 0; - vg = v == g ? -1 : 0; + uint8_t diff = cuda::SaturateCast(v - vmin); + vr = v == r ? -1 : 0; + vg = v == g ? -1 : 0; int hdiv_table = diff == 0 ? 0 : cuda::SaturateCast((hrange << hsv_shift) / (6. * diff)); int sdiv_table = v == 0 ? 0 : cuda::SaturateCast((255 << hsv_shift) / (1. * v)); @@ -349,9 +349,9 @@ __global__ void bgr_to_hsv_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, cu h = (h * hdiv_table + (1 << (hsv_shift - 1))) >> hsv_shift; h += h < 0 ? hr : 0; - *dst.ptr(batch_idx, dst_y, dst_x, 0) = cuda::SaturateCast(h); - *dst.ptr(batch_idx, dst_y, dst_x, 1) = (unsigned char)s; - *dst.ptr(batch_idx, dst_y, dst_x, 2) = (unsigned char)v; + *dst.ptr(batch_idx, dst_y, dst_x, 0) = cuda::SaturateCast(h); + *dst.ptr(batch_idx, dst_y, dst_x, 1) = (uint8_t)s; + *dst.ptr(batch_idx, dst_y, dst_x, 2) = (uint8_t)v; } template @@ -401,8 +401,7 @@ __global__ void bgr_to_hsv_float_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, c *dst.ptr(batch_idx, dst_y, dst_x, 2) = v; } -inline __device__ void HSV2RGB_native_var_shape(float h, float s, float v, float &b, float &g, float &r, - const float hscale) +inline __device__ void HSV2RGB_native_var_shape(float h, float s, float v, float &b, float &g, float &r) { if (s == 0) b = g = r = v; @@ -416,26 +415,22 @@ inline __device__ void HSV2RGB_native_var_shape(float h, float s, float v, float {0, 1, 3}, {2, 1, 0} }; - float tab[4]; - int sector; - h *= hscale; - h = fmod(h, 6.f); - sector = (int)floor(h); - h -= sector; - if ((unsigned)sector >= 6u) - { - sector = 0; - h = 0.f; - } - tab[0] = v; - 
tab[1] = v * (1.f - s); - tab[2] = v * (1.f - s * h); - tab[3] = v * (1.f - s * (1.f - h)); + h += 6 * (h < 0); // Add 6 if h < 0. + int idx = static_cast(h); // Sector index. + h -= idx; // Fractional part of h. + idx %= 6; // Make sure index is in valid range. + + // clang-format off + const float tab[4] {v, + v * (1 - s), + v * (1 - s * h), + v * (1 - s * (1 - h))}; + // clang-format on - b = tab[sector_data[sector][0]]; - g = tab[sector_data[sector][1]]; - r = tab[sector_data[sector][2]]; + b = tab[sector_data[idx][0]]; + g = tab[sector_data[idx][1]]; + r = tab[sector_data[idx][2]]; } } @@ -449,16 +444,16 @@ __global__ void hsv_to_bgr_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, cu if (dst_x >= dst.width(batch_idx) || dst_y >= dst.height(batch_idx)) return; - float h = *src.ptr(batch_idx, dst_y, dst_x, 0); - float s = *src.ptr(batch_idx, dst_y, dst_x, 1) * (1.0f / 255.0f); - float v = *src.ptr(batch_idx, dst_y, dst_x, 2) * (1.0f / 255.0f); + const float scaleH = 6.f / (isFullRange ? 256 : 180); + constexpr float scaleSV = 1.0f / 255.0f; + constexpr T alpha = cuda::TypeTraits::max; - float hrange = isFullRange ? 
255 : 180; - unsigned char alpha = cuda::TypeTraits::max; - float hs = 6.f / hrange; + float h = *src.ptr(batch_idx, dst_y, dst_x, 0) * scaleH; + float s = *src.ptr(batch_idx, dst_y, dst_x, 1) * scaleSV; + float v = *src.ptr(batch_idx, dst_y, dst_x, 2) * scaleSV; float b, g, r; - HSV2RGB_native_var_shape(h, s, v, b, g, r, hs); + HSV2RGB_native_var_shape(h, s, v, b, g, r); *dst.ptr(batch_idx, dst_y, dst_x, bidx) = cuda::SaturateCast(b * 255.0f); *dst.ptr(batch_idx, dst_y, dst_x, 1) = cuda::SaturateCast(g * 255.0f); @@ -477,16 +472,15 @@ __global__ void hsv_to_bgr_float_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, c if (dst_x >= dst.width(batch_idx) || dst_y >= dst.height(batch_idx)) return; - float h = *src.ptr(batch_idx, dst_y, dst_x, 0); + constexpr float scaleH = 6.0f / 360.0f; + constexpr float alpha = 1.0f; + + float h = *src.ptr(batch_idx, dst_y, dst_x, 0) * scaleH; float s = *src.ptr(batch_idx, dst_y, dst_x, 1); float v = *src.ptr(batch_idx, dst_y, dst_x, 2); - float hrange = 360.0; - float alpha = 1.f; - float hs = 6.f / hrange; - float b, g, r; - HSV2RGB_native_var_shape(h, s, v, b, g, r, hs); + HSV2RGB_native_var_shape(h, s, v, b, g, r); *dst.ptr(batch_idx, dst_y, dst_x, bidx) = b; *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; @@ -550,21 +544,34 @@ __global__ void bgr_to_yuv420sp_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC sr assert(checkShapeFromYUV420(dst.height(batch_idx), dst.width(batch_idx), code)); - int uv_x = (src_x % 2 == 0) ? src_x : (src_x - 1); - uchar b = static_cast(*src.ptr(batch_idx, src_y, src_x, bidx)); uchar g = static_cast(*src.ptr(batch_idx, src_y, src_x, 1)); uchar r = static_cast(*src.ptr(batch_idx, src_y, src_x, bidx ^ 2)); - // Ignore gray channel if input is RGBA + // Ignore alpha channel if input is RGBA. 
uchar Y{0}, U{0}, V{0}; bgr_to_yuv42xxp_kernel(r, g, b, Y, U, V); - *dst.ptr(batch_idx, src_y, src_x, 0) = Y; + // U and V are subsampled at half the full resolution (both in x and y), combined (i.e., interleaved), and arranged + // as full rows after the full resolution Y data. Example memory layout for 4 x 4 image (NV12): + // Y_00 Y_01 Y_02 Y_03 + // Y_10 Y_11 Y_12 Y_13 + // Y_20 Y_21 Y_22 Y_23 + // Y_30 Y_31 Y_32 Y_33 + // U_00 V_00 U_02 V_02 + // U_20 V_20 U_22 V_22 + // Each U and V value corresponds to a 2x2 block of Y values--e.g. U_00 and V_00 correspond to Y_00, Y_01, Y_10, + // and Y_11. Each full U-V row represents 2 rows of Y values. Some layouts (e.g., NV21) swap the location + // of the U and V values in each U-V pair. + + *dst.ptr(batch_idx, src_y, src_x) = Y; if (src_y % 2 == 0 && src_x % 2 == 0) { - *dst.ptr(batch_idx, src_rows + src_y / 2, uv_x + uidx) = U; - *dst.ptr(batch_idx, src_rows + src_y / 2, uv_x + (1 - uidx)) = V; + const int uv_y = src_rows + src_y / 2; // The interleaved U-V semi-plane is 1/2 the height of the Y data. + const int uv_x = (src_x & ~1); // Convert x to even # (set lowest bit to 0). + + *dst.ptr(batch_idx, uv_y, uv_x + uidx) = U; // Some formats swap the U and V elements (as indicated + *dst.ptr(batch_idx, uv_y, uv_x + (uidx ^ 1)) = V; // by the uidx parameter). } } @@ -583,23 +590,39 @@ __global__ void bgr_to_yuv420p_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src assert(checkShapeFromYUV420(dst.height(batch_idx), dst.width(batch_idx), code)); - int plane_y_step = src_rows * src_cols; - int plane_uv_step = plane_y_step / 4; - int uv_x = (src_y % 4 < 2) ? src_x / 2 : (src_x / 2 + src_cols / 2); - uchar b = static_cast(*src.ptr(batch_idx, src_y, src_x, bidx)); uchar g = static_cast(*src.ptr(batch_idx, src_y, src_x, 1)); uchar r = static_cast(*src.ptr(batch_idx, src_y, src_x, bidx ^ 2)); - // Ignore gray channel if input is RGBA + // Ignore alpha channel if input is RGBA. 
uchar Y{0}, U{0}, V{0}; bgr_to_yuv42xxp_kernel(r, g, b, Y, U, V); - *dst.ptr(batch_idx, src_y, src_x, 0) = Y; + // U and V are sampled at half the full resolution (in both x and y) and arranged as non-interleaved planes + // (i.e., planar format). Each subsampled U and V "plane" is arranged as full rows after the full resolution Y + // data--so two consecutive subsampled U or V rows are combined into one row spanning the same width as the Y + // plane. Example memory layout for 4 x 4 image (e.g. I420): + // Y_00 Y_01 Y_02 Y_03 + // Y_10 Y_11 Y_12 Y_13 + // Y_20 Y_21 Y_22 Y_23 + // Y_30 Y_31 Y_32 Y_33 + // U_00 U_02 U_20 U_22 + // V_00 V_02 V_20 V_22 + // Each U and V value corresponds to a 2x2 block of Y values--e.g. U_00 and V_00 correspond to Y_00, Y_01, Y_10, + // and Y_11. Each full U and V row represents 4 rows of Y values. Some layouts (e.g., YV12) swap the location + // of the U and V planes. + + *dst.ptr(batch_idx, src_y, src_x) = Y; if (src_y % 2 == 0 && src_x % 2 == 0) { - *dst.ptr(batch_idx, src_rows + src_y / 4, uv_x + plane_uv_step * uidx) = U; - *dst.ptr(batch_idx, src_rows + src_y / 4, uv_x + plane_uv_step * (1 - uidx)) = V; + const int by = src_rows + src_y / 4; // Base row index for U and V: subsampled plane is 1/4 the height. + const int h4 = src_rows / 4; // Height (# of rows) of each subsampled U and V plane. + + // Compute x position that combines two subsampled rows into one. + const int uv_x = (src_x / 2) + ((src_rows / 2) & -((src_y / 2) & 1)); + + *dst.ptr(batch_idx, by + h4 * uidx, uv_x) = U; // Some formats swap the U and V "planes" (as indicated + *dst.ptr(batch_idx, by + h4 * (uidx ^ 1), uv_x) = V; // by the uidx parameter). } } @@ -618,11 +641,13 @@ __global__ void yuv420sp_to_bgr_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC sr assert(checkShapeFromYUV420(src.height(batch_idx), src.width(batch_idx), code)); - int uv_x = (dst_x % 2 == 0) ? dst_x : (dst_x - 1); + // See layout commments in bgr_to_yuv420sp_char_nhwc. 
+ const int uv_y = dst_rows + dst_y / 2; // The interleaved U-V semi-plane is 1/2 the height of the Y data. + const int uv_x = (dst_x & ~1); // Convert x to even # (set lowest bit to 0). T Y = *src.ptr(batch_idx, dst_y, dst_x); - T U = *src.ptr(batch_idx, dst_rows + dst_y / 2, uv_x + uidx); - T V = *src.ptr(batch_idx, dst_rows + dst_y / 2, uv_x + 1 - uidx); + T U = *src.ptr(batch_idx, uv_y, uv_x + uidx); // Some formats swap the U and V elements (as indicated + T V = *src.ptr(batch_idx, uv_y, uv_x + (uidx ^ 1)); // by the uidx parameter). uchar r{0}, g{0}, b{0}, a{0xff}; yuv42xxp_to_bgr_kernel(int(Y), int(U), int(V), r, g, b); @@ -651,13 +676,16 @@ __global__ void yuv420p_to_bgr_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src assert(checkShapeFromYUV420(src.height(batch_idx), src.width(batch_idx), code)); - int plane_y_step = dst_rows * dst_cols; - int plane_uv_step = plane_y_step / 4; - int uv_x = (dst_y % 4 < 2) ? dst_x / 2 : (dst_x / 2 + dst_cols / 2); + // See layout commments in bgr_to_yuv420p_char_nhwc. + const int by = dst_rows + dst_y / 4; // Base row index for U and V: subsampled plane is 1/4 the height. + const int h4 = dst_rows / 4; // Height (# of rows) of each subsampled U and V plane. + + // Compute x position that combines two subsampled rows into one. + const int uv_x = (dst_x / 2) + ((dst_cols / 2) & -((dst_y / 2) & 1)); T Y = *src.ptr(batch_idx, dst_y, dst_x); - T U = *src.ptr(batch_idx, dst_rows + dst_y / 4, uv_x + plane_uv_step * uidx); - T V = *src.ptr(batch_idx, dst_rows + dst_y / 4, uv_x + plane_uv_step * (1 - uidx)); + T U = *src.ptr(batch_idx, by + h4 * uidx, uv_x); // Some formats swap the U and V "planes" (as indicated + T V = *src.ptr(batch_idx, by + h4 * (uidx ^ 1), uv_x); // by the uidx parameter). 
uchar r{0}, g{0}, b{0}, a{0xff}; yuv42xxp_to_bgr_kernel(int(Y), int(U), int(V), r, g, b); @@ -671,33 +699,63 @@ __global__ void yuv420p_to_bgr_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src } } +// YUV 422 interleaved formats (e.g., YUYV, YVYU, and UYVY) group 2 pixels into groups of 4 elements. Each group of two +// pixels has two distinct luma (Y) values, one for each pixel. The chromaticity values (U and V) are subsampled by a +// factor of two so that there is only one U and one V value for each group of 2 pixels. Example memory layout for +// 4 x 4 image (UYVY format): +// U_00 Y_00 V_00 Y_01 U_02 Y_02 V_02 Y_03 +// U_10 Y_10 V_10 Y_11 U_12 Y_12 V_12 Y_13 +// U_20 Y_20 V_20 Y_21 U_22 Y_22 V_22 Y_23 +// U_30 Y_30 V_30 Y_31 U_32 Y_32 V_32 Y_33 +// Each U and V value corresponds to two Y values--e.g. U_00 and V_00 correspond to Y_00 and Y_10 while U_12 and V_12 +// correspond to Y_12 and Y_13. Thus, a given Y value, Y_rc = Y(r,c) (where r is the row, or y coordinate, and c is the +// column, or x coordinate), corresponds to U(r,c') and V(r,c') where c' is the even column coordinate <= c -- that is, +// c' = 2 * floor(c/2) = (c & ~1). Some layouts swap the positions of the chromaticity and luma values (e.g., YUYV) +// (indicated by the yidx parameter) and / or swap the the positions of the U and V chromaticity valus (e.g., YVYU) +// (indicated by the uidx parameter). +// The data layout is treated as a single channel tensor, so each group of 4 values corresponds to two pixels. As such, +// the tensor width is twice the actual pixel width. Thus, it's easiest to process 4 consecutive values (2 pixels) per +// thread. 
template -__global__ void yuv422_to_bgr_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, - cuda::ImageBatchVarShapeWrapNHWC dst, int bidx, int yidx, int uidx) +__global__ void yuv422_to_bgr_char_nhwc(cuda::ImageBatchVarShapeWrap src, cuda::ImageBatchVarShapeWrapNHWC dst, + int dcn, int bidx, int yidx, int uidx) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; const int batch_idx = get_batch_idx(); - int dst_cols = dst.width(batch_idx); - int dst_rows = dst.height(batch_idx); - if (dst_x >= dst_cols || dst_y >= dst_rows) + + int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + if (dst_y >= dst.height(batch_idx)) + return; + + int dst_x = blockIdx.x * blockDim.x + threadIdx.x; + if (dst_x >= dst.width(batch_idx)) return; - int uv_x = (dst_x % 2 == 0) ? dst_x : dst_x - 1; - T Y = *src.ptr(batch_idx, dst_y, dst_x, yidx); - T U = *src.ptr(batch_idx, dst_y, uv_x, (1 - yidx) + uidx); - T V = *src.ptr(batch_idx, dst_y, uv_x, (1 - yidx) + uidx ^ 2); + int src_x = 2 * dst_x; // Process 4 source elements/thread (i.e., 2 destination pixels). + int uv_x = (src_x & ~3); // Compute "even" x coordinate for U and V (set lowest two bits to 0). + + T Y0 = *src.ptr(batch_idx, dst_y, src_x + yidx); + T Y1 = *src.ptr(batch_idx, dst_y, src_x + yidx + 2); + T U = *src.ptr(batch_idx, dst_y, uv_x + (yidx ^ 1) + uidx); + T V = *src.ptr(batch_idx, dst_y, uv_x + (yidx ^ 1) + (uidx ^ 2)); uchar r{0}, g{0}, b{0}, a{0xff}; - yuv42xxp_to_bgr_kernel(int(Y), int(U), int(V), r, g, b); + + yuv42xxp_to_bgr_kernel(int(Y0), int(U), int(V), r, g, b); *dst.ptr(batch_idx, dst_y, dst_x, bidx) = b; *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = r; - if (dst.numChannels() == 4) - { + if (dcn == 4) + *dst.ptr(batch_idx, dst_y, dst_x, 3) = a; + + dst_x++; // Move to next output pixel. 
+ yuv42xxp_to_bgr_kernel(int(Y1), int(U), int(V), r, g, b); + + *dst.ptr(batch_idx, dst_y, dst_x, bidx) = b; + *dst.ptr(batch_idx, dst_y, dst_x, 1) = g; + *dst.ptr(batch_idx, dst_y, dst_x, bidx ^ 2) = r; + if (dcn == 4) *dst.ptr(batch_idx, dst_y, dst_x, 3) = a; - } } template @@ -716,17 +774,25 @@ __global__ void yuv420_to_gray_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src *dst.ptr(batch_idx, dst_y, dst_x, 0) = Y; } +// See layout comment before yuv422_to_bgr_char_nhwc. template -__global__ void yuv422_to_gray_char_nhwc(cuda::ImageBatchVarShapeWrapNHWC src, - cuda::ImageBatchVarShapeWrapNHWC dst, int yidx) +__global__ void yuv422_to_gray_char_nhwc(cuda::ImageBatchVarShapeWrap src, cuda::ImageBatchVarShapeWrapNHWC dst, + int yidx) { - int dst_x = blockIdx.x * blockDim.x + threadIdx.x; - int dst_y = blockIdx.y * blockDim.y + threadIdx.y; const int batch_idx = get_batch_idx(); - if (dst_x >= dst.width(batch_idx) || dst_y >= dst.height(batch_idx)) + + int dst_y = blockIdx.y * blockDim.y + threadIdx.y; + if (dst_y >= dst.height(batch_idx)) return; - T Y = *src.ptr(batch_idx, dst_y, dst_x, yidx); - *dst.ptr(batch_idx, dst_y, dst_x, 0) = Y; + + int dst_x = blockIdx.x * blockDim.x + threadIdx.x; + if (dst_x >= dst.width(batch_idx)) + return; + + int src_x = 2 * dst_x; // Process 4 source elements/thread (i.e., 2 destination pixels). 
+ + *dst.ptr(batch_idx, dst_y, dst_x++) = *src.ptr(batch_idx, dst_y, src_x + yidx); + *dst.ptr(batch_idx, dst_y, dst_x) = *src.ptr(batch_idx, dst_y, src_x + yidx + 2); } inline ErrorCode BGR_to_RGB(const ImageBatchVarShapeDataStridedCuda &inData, @@ -776,6 +842,12 @@ inline ErrorCode BGR_to_RGB(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_SHAPE; } + if (out_data_type == kCV_16F && sch < 4 && dch == 4) + { + LOG_ERROR("Adding alpha to the output is not supported for " << out_data_type); + return ErrorCode::INVALID_DATA_SHAPE; + } + int max_width = inData.maxSize().w; int max_height = inData.maxSize().h; int batch_size = inData.numImages(); @@ -794,8 +866,8 @@ inline ErrorCode BGR_to_RGB(const ImageBatchVarShapeDataStridedCuda &inData, checkKernelErrors(); } break; + case kCV_16F: // Not properly handled when adding alpha to the destination. case kCV_16U: - case kCV_16F: case kCV_16S: { cuda::ImageBatchVarShapeWrapNHWC src_ptr(inData, sch); @@ -874,6 +946,12 @@ inline ErrorCode GRAY_to_BGR(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_SHAPE; } + if (out_data_type == kCV_16F && dch == 4) + { + LOG_ERROR("Adding alpha to the output is not supported for " << out_data_type); + return ErrorCode::INVALID_DATA_SHAPE; + } + int max_width = inData.maxSize().w; int max_height = inData.maxSize().h; int batch_size = inData.numImages(); @@ -892,8 +970,8 @@ inline ErrorCode GRAY_to_BGR(const ImageBatchVarShapeDataStridedCuda &inData, checkKernelErrors(); } break; + case kCV_16F: // Not properly handled when adding alpha to the destination. 
case kCV_16U: - case kCV_16F: case kCV_16S: { cuda::ImageBatchVarShapeWrapNHWC src_ptr(inData, channels); @@ -1464,7 +1542,7 @@ inline ErrorCode YUV422_to_BGR(const ImageBatchVarShapeDataStridedCuda &inData, int channels = inData.uniqueFormat().numChannels(); DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); - if (channels != 2) + if (channels != 3) { LOG_ERROR("Invalid input channel number " << channels); return ErrorCode::INVALID_DATA_SHAPE; @@ -1483,28 +1561,52 @@ inline ErrorCode YUV422_to_BGR(const ImageBatchVarShapeDataStridedCuda &inData, int dcn = outData.uniqueFormat().numChannels(); - if (dcn != 3 && dcn != 4) + if ((code != NVCV_COLOR_YUV2GRAY_UYVY && code != NVCV_COLOR_YUV2GRAY_YUY2 || dcn != 1) && dcn != 3 && dcn != 4) { LOG_ERROR("Invalid output channel number " << dcn); return ErrorCode::INVALID_DATA_SHAPE; } + auto inList = inData.imageList(); + + for (int i = 0; i < inData.numImages(); i++) + { + if (inList[i].numPlanes != 1) + { + LOG_ERROR("Input batch images must all be a single plane of data"); + return ErrorCode::INVALID_DATA_SHAPE; + } + + NVCVImagePlaneStrided plane = inList[i].planes[0]; + + if (plane.width % 2 != 0) + { + LOG_ERROR("Input batch images must all have a width that is a multiple of 2"); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (plane.rowStride < plane.width * 2) + { + LOG_ERROR("Insufficient input batch image stride"); + return ErrorCode::INVALID_DATA_SHAPE; + } + } + int max_width = inData.maxSize().w; int max_height = inData.maxSize().h; int batch_size = inData.numImages(); dim3 blockSize(BLOCK, BLOCK / 4, 1); - dim3 gridSize(divUp(max_width, blockSize.x), divUp(max_height, blockSize.y), batch_size); + dim3 gridSize(divUp(max_width / 2, blockSize.x), divUp(max_height, blockSize.y), batch_size); - cuda::ImageBatchVarShapeWrapNHWC src_ptr(inData, channels); - cuda::ImageBatchVarShapeWrapNHWC dst_ptr(outData, dcn); + cuda::ImageBatchVarShapeWrap src_ptr(inData); + cuda::ImageBatchVarShapeWrapNHWC 
dst_ptr(outData, dcn); switch (code) { case NVCV_COLOR_YUV2GRAY_YUY2: case NVCV_COLOR_YUV2GRAY_UYVY: { - yuv422_to_gray_char_nhwc<<>>(src_ptr, dst_ptr, yidx); + yuv422_to_gray_char_nhwc<<>>(src_ptr, dst_ptr, yidx); checkKernelErrors(); } break; @@ -1521,7 +1623,7 @@ inline ErrorCode YUV422_to_BGR(const ImageBatchVarShapeDataStridedCuda &inData, case NVCV_COLOR_YUV2RGBA_UYVY: case NVCV_COLOR_YUV2BGRA_UYVY: { - yuv422_to_bgr_char_nhwc<<>>(src_ptr, dst_ptr, bidx, yidx, uidx); + yuv422_to_bgr_char_nhwc<<>>(src_ptr, dst_ptr, dcn, bidx, yidx, uidx); checkKernelErrors(); } break; diff --git a/src/cvcuda/priv/legacy/morphology_var_shape.cu b/src/cvcuda/priv/legacy/morphology_var_shape.cu index 9edfa907..9f751f39 100644 --- a/src/cvcuda/priv/legacy/morphology_var_shape.cu +++ b/src/cvcuda/priv/legacy/morphology_var_shape.cu @@ -209,12 +209,6 @@ ErrorCode MorphologyVarShape::infer(const nvcv::ImageBatchVarShape &inBatch, con return ErrorCode::INVALID_DATA_FORMAT; } - if (input_format != output_format) - { - LOG_ERROR("Invalid DataFormat between input (" << input_format << ") and output (" << output_format << ")"); - return ErrorCode::INVALID_DATA_FORMAT; - } - DataFormat format = input_format; if (!(format == kNHWC || format == kHWC)) { diff --git a/src/nvcv/CMakeLists.txt b/src/nvcv/CMakeLists.txt index f5f544ff..2f2ae1ea 100644 --- a/src/nvcv/CMakeLists.txt +++ b/src/nvcv/CMakeLists.txt @@ -17,7 +17,7 @@ cmake_minimum_required(VERSION 3.20.1) project(nvcv LANGUAGES C CXX - VERSION 0.11.0 + VERSION 0.12.0 DESCRIPTION "NVCV is NVIDIA Computer Vision library" ) diff --git a/tests/common/TensorDataUtils.cpp b/tests/common/TensorDataUtils.cpp index a5486a13..243fb69e 100644 --- a/tests/common/TensorDataUtils.cpp +++ b/tests/common/TensorDataUtils.cpp @@ -184,14 +184,17 @@ nvcv::Tensor CreateTensor(int numImages, int imgWidth, int imgHeight, const nvcv if (imgFormat == NVCV_IMAGE_FORMAT_NV12 || imgFormat == NVCV_IMAGE_FORMAT_NV12_ER || imgFormat == NVCV_IMAGE_FORMAT_NV21 || 
imgFormat == NVCV_IMAGE_FORMAT_NV21_ER) { - int height420 - = (imgHeight * 3) / 2; // tensor size is 3/2 times the image size to accommodate the 1/2 chroma planes - // width must be even and height must be multiple of 3 (original height must be even and multiple of 2) - if (height420 % 3 != 0 || imgWidth % 2 != 0) + // Width and height must be a multiple of 2 (i.e., even). + if (imgHeight % 2 != 0 || imgWidth % 2 != 0) { - throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Invalid height"); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invalid height or width: height and width need to be a " + "multiple of 2 for planar and semi-planar YUV420 formats."); } + // Tensor height is 3/2 times the image height to accommodate the half-height chroma planes. + int height420 = (imgHeight * 3) / 2; + if (numImages == 1) { return nvcv::Tensor( @@ -209,6 +212,25 @@ nvcv::Tensor CreateTensor(int numImages, int imgWidth, int imgHeight, const nvcv return nvcv::Tensor(numImages, {imgWidth, height420}, nvcv::ImageFormat(NVCV_IMAGE_FORMAT_Y8)); } } + else if (imgFormat == NVCV_IMAGE_FORMAT_UYVY || imgFormat == NVCV_IMAGE_FORMAT_UYVY_ER + || imgFormat == NVCV_IMAGE_FORMAT_YUYV || imgFormat == NVCV_IMAGE_FORMAT_YUYV_ER) + { + if (imgWidth % 2 != 0) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Invalid width: width needs to be a multiple of 2 for interleaved YUV422 formats."); + } + + int wdth422 = 2 * imgWidth; // Tensor width is 2x the image width to accomodate two chromaticity values for + // each group of two luma values (UYVY or YUYV). + // clang-format off + nvcv::DataType type = imgFormat.planeDataType(0).channelType(0); + nvcv::TensorShape shape = numImages > 1 ? nvcv::TensorShape({numImages, imgHeight, wdth422, 1}, "NHWC") + : nvcv::TensorShape( {imgHeight, wdth422, 1}, "HWC"); + + return nvcv::Tensor(shape, type); + // clang-format on + } if (numImages == 1) { int numChannels = imgFormat.numPlanes() == 1 ? 
imgFormat.planeNumChannels(0) : imgFormat.numPlanes(); diff --git a/tests/cvcuda/system/CMakeLists.txt b/tests/cvcuda/system/CMakeLists.txt index fcf9a455..c89260a3 100644 --- a/tests/cvcuda/system/CMakeLists.txt +++ b/tests/cvcuda/system/CMakeLists.txt @@ -53,7 +53,9 @@ add_executable(cvcuda_test_system TestOpCropFlipNormalizeReformat.cpp FlipUtils.cpp ConvUtils.cpp + CvtColorUtils.cpp ResizeUtils.cpp + TestUtils.cpp TestOpNonMaximumSuppression.cpp TestOpReformat.cpp TestOpResize.cpp diff --git a/tests/cvcuda/system/CvtColorUtils.cpp b/tests/cvcuda/system/CvtColorUtils.cpp new file mode 100644 index 00000000..b3fb91a2 --- /dev/null +++ b/tests/cvcuda/system/CvtColorUtils.cpp @@ -0,0 +1,1074 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "CvtColorUtils.hpp" + +#include +#include + +#include // For std::floor +#include // For std::memcpy + +namespace cuda = nvcv::cuda; + +template +using Vector3 = cuda::math::Vector; + +template +using Matrix3x3 = cuda::math::Matrix; + +using Vec3f = Vector3; +using Vec3d = Vector3; + +using Mat3f = Matrix3x3; +using Mat3d = Matrix3x3; + +using std::vector; + +// Accurate coefficients for converting RGB to ITU Rec.601 luma. 
+// Found at +// http://www.brucelindbloom.com/index.html?WorkingSpaceInfo.html +// and +// https://www.imagemagick.org/include/api/pixel.php. +// NOTE: These coefficients are more accurate than the standard [0.299, 0.587, 0.144] values used elsewhere and may +// results in slightly different floating point or large integer (e.g., uint16 or uint32) pixel values. +// static constexpr double Red2Y = 0.298839; // Y = Red2Y * R +// static constexpr double Grn2Y = 0.586811; // + Grn2Y * G +// static constexpr double Blu2Y = 0.114350; // + Blu2Y * B +static constexpr double Red2Y = 0.299; // Y = Red2Y * R +static constexpr double Grn2Y = 0.587; // + Grn2Y * G +static constexpr double Blu2Y = 0.114; // + Blu2Y * B + +// Coefficients to convert non-linear RGB to PAL (analog color TV standard) chromaticity (U and V) components. +// NOTE: Both PAL and NTSC use the ITU Rec.601 RGB coefficients to compute Y. +// static constexpr double Blu2U_PAL = 0.492111; // U = Blu2U_PAL * (B - Y) + 0.5 +// static constexpr double Red2V_PAL = 0.877283; // V = Red2V_PAL * (R - Y) + 0.5 +static constexpr double Blu2U_PAL = 0.492; // U = Blu2U_PAL * (B - Y) + 0.5 +static constexpr double Red2V_PAL = 0.877; // V = Red2V_PAL * (R - Y) + 0.5 + +// Coefficients to convert non-linear RGB to ITU Rec.601 chromaticity (Cb and Cr) components. +static constexpr double Blu2Cb_601 = 0.56455710; // 1.0 / 1.7713 Cb/U +static constexpr double Red2Cr_601 = 0.71310298; // 1.0 / 1.402322 Cr/V + +// clang-format off + +// Coefficients to convert chromaticity (U and V) components to RGB . +// static constexpr double U2Blu = 2.03211; +// static constexpr double U2Grn = -0.39465; +// static constexpr double V2Grn = -0.58060; +// static constexpr double V2Red = 1.13983; +static constexpr double U2Blu = 2.032; +static constexpr double U2Grn = -0.395; +static constexpr double V2Grn = -0.581; +static constexpr double V2Red = 1.140; + +// Coefficients to convert RGB to ITU Rec.601 YCbCr. 
+static constexpr double R2Y_NV12 = 0.255785; +static constexpr double G2Y_NV12 = 0.502160; +static constexpr double B2Y_NV12 = 0.097523; + +static constexpr double R2U_NV12 = -0.147644; +static constexpr double G2U_NV12 = -0.289856; +static constexpr double B2U_NV12 = 0.4375; + +static constexpr double R2V_NV12 = 0.4375; +static constexpr double G2V_NV12 = -0.366352; +static constexpr double B2V_NV12 = -0.071148; + +// Coefficients to convert RGB to ITU Rec.601 YCbCr. +static constexpr double Y2R_NV12 = 1.16895; +static constexpr double U2R_NV12 = 0.0; +static constexpr double V2R_NV12 = 1.60229; + +static constexpr double Y2G_NV12 = 1.16895; +static constexpr double U2G_NV12 = -0.3933; +static constexpr double V2G_NV12 = -0.81616; + +static constexpr double Y2B_NV12 = 1.16895; +static constexpr double U2B_NV12 = 2.02514; +static constexpr double V2B_NV12 = 0.0; + +// Coefficients to add or subtract from YCbCr (abbreviated YUV)components to convert between RGB and ITU Rec.601 YCbCr. +static constexpr double Add2Y_NV12 = 16.0; +static constexpr double Add2U_NV12 = 128.0; +static constexpr double Add2V_NV12 = 128.0; + +// clang-format on + +template> +constexpr BT Alpha = std::is_floating_point_v ? 1 : cuda::TypeTraits::max; + +//-==================================================================================================================-// +// Set AlphaOnly to true to add/remove alpha channel to RGB/BGR image (without switching between RGB and BGR). +template +static void convertRGBtoBGR(T *dst, const T *src, size_t numPixels, bool srcRGBA, bool dstRGBA) +{ + const uint incr = 3 + srcRGBA; + + for (size_t i = 0; i < numPixels; i++, src += incr) + { + // clang-format off + if constexpr (AlphaOnly) { *dst++ = src[0]; *dst++ = src[1]; *dst++ = src[2]; } + else { *dst++ = src[2]; *dst++ = src[1]; *dst++ = src[0]; } + if (dstRGBA) *dst++ = srcRGBA ? 
src[3] : Alpha; + // clang-format on + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertRGBtoBGR(vector &dst, const vector &src, size_t numPixels, bool srcRGBA, bool dstRGBA) +{ + convertRGBtoBGR(dst.data(), src.data(), numPixels, srcRGBA, dstRGBA); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_RGBtoBGR(T) template void convertRGBtoBGR(vector &, const vector &, size_t, bool, bool) + +MAKE_RGBtoBGR(uint8_t); +MAKE_RGBtoBGR(uint16_t); +MAKE_RGBtoBGR(int32_t); +MAKE_RGBtoBGR(float); +MAKE_RGBtoBGR(double); + +#undef MAKE_RGBtoBGR + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void changeAlpha(vector &dst, const vector &src, size_t numPixels, bool srcRGBA, bool dstRGBA) +{ + convertRGBtoBGR(dst.data(), src.data(), numPixels, srcRGBA, dstRGBA); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_CHANGE_ALPHA(T) template void changeAlpha(vector &, const vector &, size_t, bool, bool) + +MAKE_CHANGE_ALPHA(uint8_t); +MAKE_CHANGE_ALPHA(uint16_t); +MAKE_CHANGE_ALPHA(int32_t); +MAKE_CHANGE_ALPHA(float); +MAKE_CHANGE_ALPHA(double); + +#undef MAKE_CHANGE_ALPHA + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertRGBtoGray(T *dst, const T *src, size_t numPixels, bool rgba, bool bgr) +{ + const int incr = 3 + rgba; + + for (size_t i = 0; i < numPixels; i++, dst++, src += incr) 
+ { + // clang-format off + if (bgr) *dst = static_cast(Blu2Y * src[0] + Grn2Y * src[1] + Red2Y * src[2]); + else *dst = static_cast(Red2Y * src[0] + Grn2Y * src[1] + Blu2Y * src[2]); + // clang-format on + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertRGBtoGray(vector &dst, const vector &src, size_t numPixels, bool rgba, bool bgr) +{ + convertRGBtoGray(dst.data(), src.data(), numPixels, rgba, bgr); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_RGBtoGray(T) template void convertRGBtoGray(vector &, const vector &, size_t, bool, bool) + +MAKE_RGBtoGray(uint8_t); +MAKE_RGBtoGray(uint16_t); +MAKE_RGBtoGray(int32_t); +MAKE_RGBtoGray(float); +MAKE_RGBtoGray(double); + +#undef MAKE_RGBtoGray + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertGrayToRGB(T *dst, const T *src, size_t numPixels, bool rgba) +{ + for (size_t i = 0; i < numPixels; i++) + { + T val = *src++; + + // clang-format off + *dst++ = val; *dst++ = val; *dst++ = val; + if (rgba) *dst++ = Alpha; + // clang-format on + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertGrayToRGB(vector &dst, const vector &src, size_t numPixels, bool rgba) +{ + convertGrayToRGB(dst.data(), src.data(), numPixels, rgba); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_GrayToRGB(T) template void convertGrayToRGB(vector &, const vector &, size_t, bool) + +MAKE_GrayToRGB(uint8_t); +MAKE_GrayToRGB(uint16_t); 
+MAKE_GrayToRGB(int32_t); +MAKE_GrayToRGB(float); +MAKE_GrayToRGB(double); + +#undef MAKE_GrayToRGB + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertRGBtoHSV(T *dst, const T *src, size_t numPixels, bool rgba, bool bgr) +{ + // Set the hue range (e.g., 0-360 for float types) and scale factor (to convert the final value to output hue value). + constexpr double range = (sizeof(T) > 1) ? 360.0 : (FullRange ? 256.0 : 180.0); + constexpr double scale = range / 360.0; + constexpr double norm = std::is_floating_point_v ? 1 : cuda::TypeTraits::max; + constexpr double round = std::is_floating_point_v ? 0 : 0.5; + + for (size_t i = 0; i < numPixels; i++) + { + double R = static_cast(*src++) / norm; + double G = static_cast(*src++) / norm; + double B = static_cast(*src++) / norm; + + // clang-format off + if (bgr) std::swap(R, B); + if (rgba) src++; + // clang-format on + + double Vmin = std::min(R, std::min(G, B)); + double V = std::max(R, std::max(G, B)); + + double diff = static_cast(V - Vmin); + + double S = static_cast(V) > DBL_EPSILON ? diff / V : 0.0; + double H = 0.0; + + if (diff > DBL_EPSILON) + { + // clang-format off + diff = 60.0 / diff; + if (V == R) H = (G - B) * diff; + else if (V == G) H = (B - R) * diff + 120.0; + else H = (R - G) * diff + 240.0; + // clang-format on + } + H *= scale; + S *= norm; + V *= norm; + + // Make sure hue falls within the proper range: the value 'range' (e.g., 360) should not appear since it's equivalent to 0. + H += round; + // clang-format off + if (H >= range) H -= range; // For the case when T is uint8_t and FullRange is false, H can be > 180. 
+ else if (H < 0.0) H += range; + // clang-format on + H -= round; + + *dst++ = static_cast(H + round); + *dst++ = static_cast(S + round); + *dst++ = static_cast(V + round); + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertRGBtoHSV(vector &dst, const vector &src, size_t numPixels, bool rgba, bool bgr) +{ + convertRGBtoHSV(dst.data(), src.data(), numPixels, rgba, bgr); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +// Restricted range hue (FullRange = false): values between [0-180). Applies only to uint8_t, but still need to +// instantiate all the types. +#define MAKE_RGBtoHSV(T) template void convertRGBtoHSV(vector &, const vector &, size_t, bool, bool) + +MAKE_RGBtoHSV(uint8_t); +MAKE_RGBtoHSV(uint16_t); +MAKE_RGBtoHSV(int32_t); +MAKE_RGBtoHSV(float); +MAKE_RGBtoHSV(double); + +#undef MAKE_RGBtoHSV + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +// Full range hue (FullRange = false): values between [0-256). Applies only to uint8_t, but still need to +// instantiate all the types. 
+#define MAKE_RGBtoHSV(T) template void convertRGBtoHSV(vector &, const vector &, size_t, bool, bool) + +MAKE_RGBtoHSV(uint8_t); +MAKE_RGBtoHSV(uint16_t); +MAKE_RGBtoHSV(int32_t); +MAKE_RGBtoHSV(float); +MAKE_RGBtoHSV(double); + +#undef MAKE_RGBtoHSV + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +/* To convert HSV to RGB: + 1) Ensure that (H,S,V) range is (360.0, 1.0, 1.0) + 2) H' = H / 60 + 3) C = V * S + 4) I = (int)H + 5) h = H' - I // Fractional part of H' + 6) X = C * (1 - fabs(fmod(H', 2.0) - 1.0)) + = C * (1 - fabs(H' - (I & ~1) - 1.0)) + = C * ((I & 1) ? 1 - h : h) + 5) m = V - C + = V - V * S + = V * (1 - S) + 7) p = X + m // When I is even: (I & 1) == 0 (I = 0, 2, or 4) + = C * h + V - C + = V * S * h + V * (1 - S) + = V * (S * h + 1 - S) + = V * (1 - S + S * h) + = V * (1 - S * (1 - h)) + 8) q = X + m // When I is odd: (I & 1) == 1 (I = 1, 3, or 5) + = C * (1 - h) + V - C + = V * S * (1 - h) + V * (1 - S) + = V * (S - S * h + 1 - S) + = V * (1 - S * h) + 9) Cases: // Note: C + m = C + V - C = V + I == 0: R = C + m = V + G = X + m = p // Even case + B = m + + I == 1: R = X + m = q // Odd case + G = C + m = V + B = m + + I == 2: R = m + G = C + m = V + B = X + m = p // Even case + + I == 3: R = m + G = X + m = q // Odd case + B = C + m = V + + I == 4: R = X + m = p // Even case + G = m + B = C + m = V + + I == 5: R = C + m = V + G = m + B = X + m = q // Odd case +*/ +template +void convertHSVtoRGB(T *dst, const T *src, size_t numPixels, bool rgba, bool bgr) +{ + constexpr double range = (sizeof(T) > 1) ? 360.0 : (FullRange ? 256.0 : 180.0); + constexpr double scale = 6.0 / range; + constexpr double norm = std::is_floating_point_v ? 1 : cuda::TypeTraits::max; + constexpr double round = std::is_floating_point_v ? 
0 : 0.5; + + constexpr uint mapR[6] = {0, 2, 1, 1, 3, 0}; + constexpr uint mapG[6] = {3, 0, 0, 2, 1, 1}; + constexpr uint mapB[6] = {1, 1, 3, 0, 0, 2}; + + for (size_t i = 0; i < numPixels; i++) + { + double H = *src++ * scale; // 0 <= H < 6 + double S = *src++ / norm; // 0 <= S <= 1 + double V = *src++ / norm; // 0 <= V <= 1 + + int idx = static_cast(std::floor(H)); + + H -= idx; + + // clang-format off + idx %= 6; + if (idx < 0) idx += 6; + + double val[] = {V, + V * (1 - S), + V * (1 - S * H), + V * (1 - S * (1 - H))}; + + uint r = mapR[idx]; + uint g = mapG[idx]; + uint b = mapB[idx]; + + if (bgr) std::swap(r, b); + *dst++ = static_cast(val[r] * norm + round); + *dst++ = static_cast(val[g] * norm + round); + *dst++ = static_cast(val[b] * norm + round); + if (rgba) *dst++ = Alpha; + // clang-format on + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertHSVtoRGB(vector &dst, const vector &src, size_t numPixels, bool rgba, bool bgr) +{ + convertHSVtoRGB(dst.data(), src.data(), numPixels, rgba, bgr); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +// Restricted range hue (FullRange = false): values between [0-180). Applies only to uint8_t, but still need to +// instantiate all the types. +#define MAKE_HSVtoRGB(T) template void convertHSVtoRGB(vector &, const vector &, size_t, bool, bool) + +MAKE_HSVtoRGB(uint8_t); +MAKE_HSVtoRGB(uint16_t); +MAKE_HSVtoRGB(int32_t); +MAKE_HSVtoRGB(float); +MAKE_HSVtoRGB(double); + +#undef MAKE_HSVtoRGB + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +// Full range hue (FullRange = false): values between [0-256). Applies only to uint8_t, but still need to +// instantiate all the types. 
+#define MAKE_HSVtoRGB(T) template void convertHSVtoRGB(vector &, const vector &, size_t, bool, bool) + +MAKE_HSVtoRGB(uint8_t); +MAKE_HSVtoRGB(uint16_t); +MAKE_HSVtoRGB(int32_t); +MAKE_HSVtoRGB(float); +MAKE_HSVtoRGB(double); + +#undef MAKE_HSVtoRGB + +//-==================================================================================================================-// +template +void convertRGBtoYUV_PAL(T *dst, const T *src, size_t numPixels, bool rgba, bool bgr) +{ + constexpr T max = std::is_floating_point_v ? 1 : cuda::TypeTraits::max; + constexpr T delta = max / 2 + (std::is_floating_point_v ? 0 : 1); + + for (size_t i = 0; i < numPixels; i++) + { + T red = *src++; + T grn = *src++; + T blu = *src++; + + // clang-format off + if (bgr) std::swap(red, blu); + if (rgba) src++; + // clang-format on + + double Y = Red2Y * red + Grn2Y * grn + Blu2Y * blu; + + *dst++ = cuda::SaturateCast(Y); + *dst++ = cuda::SaturateCast(Blu2U_PAL * (blu - Y) + delta); + *dst++ = cuda::SaturateCast(Red2V_PAL * (red - Y) + delta); + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertRGBtoYUV_PAL(vector &dst, const vector &src, size_t numPixels, bool rgba, bool bgr) +{ + convertRGBtoYUV_PAL(dst.data(), src.data(), numPixels, rgba, bgr); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_RGBtoYUV(T) template void convertRGBtoYUV_PAL(vector &, const vector &, size_t, bool, bool) + +MAKE_RGBtoYUV(uint8_t); +MAKE_RGBtoYUV(uint16_t); +MAKE_RGBtoYUV(int32_t); +MAKE_RGBtoYUV(float); +MAKE_RGBtoYUV(double); + +#undef MAKE_RGBtoYUV + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void 
convertYUVtoRGB_PAL(T *dst, const T *src, size_t numPixels, bool rgba, bool bgr) +{ + constexpr T max = std::is_floating_point_v ? 1 : cuda::TypeTraits::max; + constexpr T delta = max / 2 + (std::is_floating_point_v ? 0 : 1); + + for (size_t i = 0; i < numPixels; i++) + { + double Y = *src++; + double U = *src++; + double V = *src++; + + U -= delta; + V -= delta; + + double red = Y + V * V2Red; + double grn = Y + U * U2Grn + V * V2Grn; + double blu = Y + U * U2Blu; + + // clang-format off + if (bgr) std::swap(red, blu); + *dst++ = cuda::SaturateCast(red); + *dst++ = cuda::SaturateCast(grn); + *dst++ = cuda::SaturateCast(blu); + if (rgba) *dst++ = Alpha; + // clang-format on + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertYUVtoRGB_PAL(vector &dst, const vector &src, size_t numPixels, bool rgba, bool bgr) +{ + convertYUVtoRGB_PAL(dst.data(), src.data(), numPixels, rgba, bgr); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_YUVtoRGB(T) template void convertYUVtoRGB_PAL(vector &, const vector &, size_t, bool, bool) + +MAKE_YUVtoRGB(uint8_t); +MAKE_YUVtoRGB(uint16_t); +MAKE_YUVtoRGB(int32_t); +MAKE_YUVtoRGB(float); +MAKE_YUVtoRGB(double); + +#undef MAKE_YUVtoRGB + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertRGBtoYUV_420(T *dst, const T *src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, bool yvu) +{ + // Ensure both width and height are multiples of 2 since we're processing 2x2 blocks. 
+ assert(wdth % 2 == 0 && hght % 2 == 0); + + const size_t imgPixels = (size_t)hght * (size_t)wdth; + const size_t incrPix = 3 + rgba; + const size_t incrSrc = imgPixels * incrPix; + const size_t incrDst = imgPixels * 3 / 2; + + for (uint n = 0; n < numImgs; n++, src += incrSrc, dst += incrDst) + { + T *y = dst; + T *u = y + imgPixels; + T *v = u + imgPixels / 4; + + const T *rgb = src; + + // clang-format off + if (yvu) std::swap(u, v); + // clang-format on + + for (uint h = 0; h < hght; h++) + { + for (uint w = 0; w < wdth; w++, rgb += incrPix) + { + T R = rgb[0]; + T G = rgb[1]; + T B = rgb[2]; + + // Convert all RGB values to Y values and store them. + // clang-format off + if (bgr) std::swap(R, B); + *y++ = cuda::SaturateCast(R2Y_NV12 * R + G2Y_NV12 * G + B2Y_NV12 * B + Add2Y_NV12); + // clang-format on + + // Convert only even pixels (in width and height) to U and V values and store them. + if ((w & 1) == 0 && (h & 1) == 0) + { + double U = R2U_NV12 * R + G2U_NV12 * G + B2U_NV12 * B + Add2U_NV12; + double V = R2V_NV12 * R + G2V_NV12 * G + B2V_NV12 * B + Add2V_NV12; + + *u++ = cuda::SaturateCast(U); + *v++ = cuda::SaturateCast(V); + } + } + } + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertRGBtoYUV_420(vector &dst, const vector &src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, + bool yvu) +{ + // Ensure input data has sets of 3 or 4 (RGB/BGA with or w/o alpha) values for the given width and height and batch size. + assert(src.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * (size_t)(3 + rgba)); + + // YUV 420 needs 3 elements for each two RGB pixels. 
+ assert(dst.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * 3 / 2); + + convertRGBtoYUV_420(dst.data(), src.data(), wdth, hght, numImgs, rgba, bgr, yvu); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_RGBtoYUV(T) \ + template void convertRGBtoYUV_420(vector &, const vector &, uint, uint, uint, bool, bool, bool) + +MAKE_RGBtoYUV(uint8_t); +MAKE_RGBtoYUV(uint16_t); +MAKE_RGBtoYUV(int32_t); +MAKE_RGBtoYUV(float); +MAKE_RGBtoYUV(double); + +#undef MAKE_RGBtoYUV + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertYUVtoRGB_420(T *dst, const T *src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, bool yvu) +{ + // Ensure both width and height are multiples of 2 since we're processing 2x2 blocks. + assert(wdth % 2 == 0 && hght % 2 == 0); + + const size_t imgPixels = (size_t)hght * (size_t)wdth; + const size_t incrSrc = imgPixels * 3 / 2; + const size_t incrDst = imgPixels * (3 + rgba); + + for (uint n = 0; n < numImgs; n++, src += incrSrc, dst += incrDst) + { + T *rgb = dst; + + const T *y = src; + + for (uint h = 0; h < hght; h++) + { + // clang-format off + // NOTE: when computing subsampled row index, h needs to be integer divided by 4 before multiplying by width. + const T *u = src + imgPixels + (h / 4) * wdth + ((h / 2) & 1) * (wdth / 2); + const T *v = u + imgPixels / 4; + + if (yvu) std::swap(u, v); + // clang-format on + + for (uint w = 0; w < wdth; w++) + { + double Y = *y++; + double U = *u; + double V = *v; + + // Convert all YUV (ITU Rec.601) values to RGB values and store them. 
+ Y -= Add2Y_NV12; + U -= Add2U_NV12; + V -= Add2V_NV12; + + // clang-format off + if (Y < 0.0) Y = 0.0; + T R = cuda::SaturateCast(Y2R_NV12 * Y + U2R_NV12 * U + V2R_NV12 * V); + T G = cuda::SaturateCast(Y2G_NV12 * Y + U2G_NV12 * U + V2G_NV12 * V); + T B = cuda::SaturateCast(Y2B_NV12 * Y + U2B_NV12 * U + V2B_NV12 * V); + if (bgr) std::swap(R, B); + *rgb++ = R; + *rgb++ = G; + *rgb++ = B; + if (rgba) *rgb++ = Alpha; + // clang-format on + + u += (w & 1); + v += (w & 1); + } + } + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertYUVtoRGB_420(vector &dst, const vector &src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, + bool yvu) +{ + // Ensure output data has sets of 3 or 4 (RGB/BGA with or w/o alpha) values for the given width and height and batch size. + assert(dst.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * (size_t)(3 + rgba)); + + // YUV 420 needs 3 elements for each two RGB pixels. 
+ assert(src.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * 3 / 2); + + convertYUVtoRGB_420(dst.data(), src.data(), wdth, hght, numImgs, rgba, bgr, yvu); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_NV12toRGB(T) \ + template void convertYUVtoRGB_420(vector &, const vector &, uint, uint, uint, bool, bool, bool) + +MAKE_NV12toRGB(uint8_t); +MAKE_NV12toRGB(uint16_t); +MAKE_NV12toRGB(int32_t); +MAKE_NV12toRGB(float); +MAKE_NV12toRGB(double); + +#undef MAKE_NV12toRGB + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertYUVtoGray_420(T *dst, const T *src, uint wdth, uint hght, uint numImgs) +{ + // Ensure both width and height are multiples of 2. + assert(wdth % 2 == 0 && hght % 2 == 0); + + const size_t imgPixels = (size_t)hght * (size_t)wdth; + const size_t incrSrc = imgPixels * 3 / 2; + + for (uint n = 0; n < numImgs; n++, src += incrSrc, dst += imgPixels) + { + std::memcpy(dst, src, imgPixels * sizeof(T)); // Copy Y plane of each image to destination tensor. + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertYUVtoGray_420(vector &dst, const vector &src, uint wdth, uint hght, uint numImgs) +{ + // Ensure output data has sets of 3 or 4 (RGB/BGA with or w/o alpha) values for the given width and height and batch size. + assert(dst.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth); + + // YUV 420 needs 3 elements for each two RGB pixels. 
+ assert(src.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * 3 / 2); + + convertYUVtoGray_420(dst.data(), src.data(), wdth, hght, numImgs); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_YUVtoGray(T) template void convertYUVtoGray_420(vector &, const vector &, uint, uint, uint) + +MAKE_YUVtoGray(uint8_t); +MAKE_YUVtoGray(uint16_t); +MAKE_YUVtoGray(int32_t); +MAKE_YUVtoGray(float); +MAKE_YUVtoGray(double); + +#undef MAKE_YUVtoGray + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertRGBtoNV12(T *dst, const T *src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, bool yvu) +{ + // Ensure both width and height are multiples of 2 since we're processing 2x2 blocks. + assert(wdth % 2 == 0 && hght % 2 == 0); + + const size_t imgPixels = (size_t)hght * (size_t)wdth; + const size_t incrPix = 3 + rgba; + const size_t incrSrc = imgPixels * incrPix; + const size_t incrDst = imgPixels * 3 / 2; + + for (uint n = 0; n < numImgs; n++, src += incrSrc, dst += incrDst) + { + T *y = dst; + T *uv = dst + imgPixels; + + const T *rgb = src; + + for (uint h = 0; h < hght; h++) + { + for (uint w = 0; w < wdth; w++, rgb += incrPix) + { + T R = rgb[0]; + T G = rgb[1]; + T B = rgb[2]; + + // Convert all RGB values to Y values and store them. + // clang-format off + if (bgr) std::swap(R, B); + *y++ = cuda::SaturateCast(R2Y_NV12 * R + G2Y_NV12 * G + B2Y_NV12 * B + Add2Y_NV12); + // clang-format on + + // Convert only even pixels (in width and height) to U and V values and store them. 
+ if ((w & 1) == 0 && (h & 1) == 0) + { + double U = R2U_NV12 * R + G2U_NV12 * G + B2U_NV12 * B + Add2U_NV12; + double V = R2V_NV12 * R + G2V_NV12 * G + B2V_NV12 * B + Add2V_NV12; + + // clang-format off + if (yvu) std::swap(U, V); + // clang-format on + *uv++ = cuda::SaturateCast(U); + *uv++ = cuda::SaturateCast(V); + } + } + } + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertRGBtoNV12(vector &dst, const vector &src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, + bool yvu) +{ + // Ensure input data has sets of 3 or 4 (RGB/BGA with or w/o alpha) values for the given width and height and batch size. + assert(src.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * (size_t)(3 + rgba)); + + // YUV NV12 needs 3 elements for each two RGB pixels. + assert(dst.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * 3 / 2); + + convertRGBtoNV12(dst.data(), src.data(), wdth, hght, numImgs, rgba, bgr, yvu); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_RGBtoNV12(T) \ + template void convertRGBtoNV12(vector &, const vector &, uint, uint, uint, bool, bool, bool) + +MAKE_RGBtoNV12(uint8_t); +MAKE_RGBtoNV12(uint16_t); +MAKE_RGBtoNV12(int32_t); +MAKE_RGBtoNV12(float); +MAKE_RGBtoNV12(double); + +#undef MAKE_RGBtoNV12 + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertNV12toRGB(T *dst, const T *src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, bool yvu) +{ + // Ensure both width and height are multiples of 2 since we're processing 2x2 blocks. 
+ assert(wdth % 2 == 0 && hght % 2 == 0); + + const size_t imgPixels = (size_t)hght * (size_t)wdth; + const size_t incrSrc = imgPixels * 3 / 2; + const size_t incrDst = imgPixels * (3 + rgba); + + for (uint n = 0; n < numImgs; n++, src += incrSrc, dst += incrDst) + { + T *rgb = dst; + + const T *y = src; + + for (uint h = 0; h < hght; h++) + { + // NOTE: when computing uv row index, h needs to be integer divided by 2 before multiplying by width. + const T *uv = src + imgPixels + (h >> 1) * wdth; + + for (uint w = 0; w < wdth; w++) + { + double Y = *y++; + double U = uv[0]; + double V = uv[1]; + + // clang-format off + if (yvu) std::swap(U, V); + + // Convert all YUV (ITU Rec.601) values to RGB values and store them. + Y -= Add2Y_NV12; + U -= Add2U_NV12; + V -= Add2V_NV12; + if (Y < 0.0) Y = 0.0; + + T R = cuda::SaturateCast(Y2R_NV12 * Y + U2R_NV12 * U + V2R_NV12 * V); + T G = cuda::SaturateCast(Y2G_NV12 * Y + U2G_NV12 * U + V2G_NV12 * V); + T B = cuda::SaturateCast(Y2B_NV12 * Y + U2B_NV12 * U + V2B_NV12 * V); + + if (bgr) std::swap(R, B); + *rgb++ = R; + *rgb++ = G; + *rgb++ = B; + if (rgba) *rgb++ = Alpha; + + if (w & 1) uv += 2; + // clang-format on + } + } + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertNV12toRGB(vector &dst, const vector &src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, + bool yvu) +{ + // Ensure output data has sets of 3 or 4 (RGB/BGA with or w/o alpha) values for the given width and height and batch size. + assert(dst.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * (size_t)(3 + rgba)); + + // YUV NV12 needs 3 elements for each two RGB pixels. 
+ assert(src.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * 3 / 2); + + convertNV12toRGB(dst.data(), src.data(), wdth, hght, numImgs, rgba, bgr, yvu); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_NV12toRGB(T) \ + template void convertNV12toRGB(vector &, const vector &, uint, uint, uint, bool, bool, bool) + +MAKE_NV12toRGB(uint8_t); +MAKE_NV12toRGB(uint16_t); +MAKE_NV12toRGB(int32_t); +MAKE_NV12toRGB(float); +MAKE_NV12toRGB(double); + +#undef MAKE_NV12toRGB + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertYUVtoRGB_422(T *dst, const T *src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, bool yvu) +{ + // Ensure width is a multiple of 2. + assert(wdth % 2 == 0); + + constexpr uint idx0 = (LumaFirst ? 0 : 1); // First luma value index. + constexpr uint idx1 = idx0 + 2; // Second luma value index. + constexpr uint idxU = (LumaFirst ? 1 : 0); // U chroma value index. + constexpr uint idxV = idxU + 2; // V chroma value index. + + const size_t imgPixels = (size_t)hght * (size_t)wdth; + const size_t incrSrc = imgPixels * 2; + const size_t incrDst = imgPixels * (3 + rgba); + + for (uint n = 0; n < numImgs; n++, src += incrSrc, dst += incrDst) + { + T *rgb = dst; + + const T *img = src; + + for (uint h = 0; h < hght; h++) + { + for (uint w = 0; w < wdth; w += 2, img += 4) + { + T R, G, B; + + // clang-format off + double U = img[idxU], + V = img[idxV], + Y0 = img[idx0], + Y1 = img[idx1]; + + if (yvu) std::swap(U, V); + + // Convert all YUV (ITU Rec.601) values to RGB values and store them. 
+ Y0 -= Add2Y_NV12; + Y1 -= Add2Y_NV12; + U -= Add2U_NV12; + V -= Add2V_NV12; + + if (Y0 < 0.0) Y0 = 0.0; + if (Y1 < 0.0) Y1 = 0.0; + // clang-format on + + double Y_0 = Y2R_NV12 * Y0; // NOTE: Y2R_NV12 == Y2G_NV12 == Y2B_NV12. + double Y_1 = Y2R_NV12 * Y1; + double UV_r = U2R_NV12 * U + V2R_NV12 * V; + double UV_g = U2G_NV12 * U + V2G_NV12 * V; + double UV_b = U2B_NV12 * U + V2B_NV12 * V; + + R = cuda::SaturateCast(Y_0 + UV_r); + G = cuda::SaturateCast(Y_0 + UV_g); + B = cuda::SaturateCast(Y_0 + UV_b); + + // clang-format off + if (bgr) std::swap(R, B); + *rgb++ = R; *rgb++ = G; *rgb++ = B; + if (rgba) *rgb++ = Alpha; + // clang-format on + + R = cuda::SaturateCast(Y_1 + UV_r); + G = cuda::SaturateCast(Y_1 + UV_g); + B = cuda::SaturateCast(Y_1 + UV_b); + + // clang-format off + if (bgr) std::swap(R, B); + *rgb++ = R; *rgb++ = G; *rgb++ = B; + if (rgba) *rgb++ = Alpha; + // clang-format on + } + } + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertYUVtoRGB_422(vector &dst, const vector &src, uint wdth, uint hght, uint numImgs, bool rgba, bool bgr, + bool yvu) +{ + // Ensure output data has sets of 3 or 4 (RGB/BGA w/ or w/o alpha) values for the given width, height, & batch size. + assert(dst.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * (size_t)(3 + rgba)); + assert(src.size() == (size_t)numImgs * (size_t)hght * (size_t)wdth * 2); // 4 values for each two RGB pixels. 
+ + convertYUVtoRGB_422(dst.data(), src.data(), wdth, hght, numImgs, rgba, bgr, yvu); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_422toRGB(T) \ + template void convertYUVtoRGB_422(vector &, const vector &, uint, uint, uint, bool, bool, bool) + +MAKE_422toRGB(uint8_t); +MAKE_422toRGB(uint16_t); +MAKE_422toRGB(int32_t); +MAKE_422toRGB(float); +MAKE_422toRGB(double); + +#undef MAKE_422toRGB +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_422toRGB(T) \ + template void convertYUVtoRGB_422(vector &, const vector &, uint, uint, uint, bool, bool, bool) + +MAKE_422toRGB(uint8_t); +MAKE_422toRGB(uint16_t); +MAKE_422toRGB(int32_t); +MAKE_422toRGB(float); +MAKE_422toRGB(double); + +#undef MAKE_422toRGB + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void convertYUVtoGray_422(T *dst, const T *src, size_t numPixels) +{ + src += (1 - LumaFirst); // Increment to first Y value if luma not first. + + for (size_t i = 0; i < numPixels; i++, src += 2) *dst++ = *src; +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +void convertYUVtoGray_422(vector &dst, const vector &src, size_t numPixels) +{ + assert(dst.size() == numPixels); + assert(src.size() == numPixels * 2); // YUV 422 needs 4 values for each two RGB pixels. 
+ + convertYUVtoGray_422(dst.data(), src.data(), numPixels); +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_422toGray(T) template void convertYUVtoGray_422(vector &, const vector &, size_t) + +MAKE_422toGray(uint8_t); +MAKE_422toGray(uint16_t); +MAKE_422toGray(int32_t); +MAKE_422toGray(float); +MAKE_422toGray(double); + +#undef MAKE_422toGray +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_422toGray(T) template void convertYUVtoGray_422(vector &, const vector &, size_t) + +MAKE_422toGray(uint8_t); +MAKE_422toGray(uint16_t); +MAKE_422toGray(int32_t); +MAKE_422toGray(float); +MAKE_422toGray(double); + +#undef MAKE_422toGray +//--------------------------------------------------------------------------------------------------------------------// diff --git a/tests/cvcuda/system/CvtColorUtils.hpp b/tests/cvcuda/system/CvtColorUtils.hpp new file mode 100644 index 00000000..197adf86 --- /dev/null +++ b/tests/cvcuda/system/CvtColorUtils.hpp @@ -0,0 +1,78 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_TEST_COMMON_CVT_COLOR_UTILS_HPP +#define NVCV_TEST_COMMON_CVT_COLOR_UTILS_HPP + +#include + +#include + +// clang-format off + + +template +void changeAlpha(std::vector &dst, const std::vector &src, size_t numPixels, bool srcRGBA, bool dstRGBA); + +template +void convertRGBtoBGR(std::vector &dst, const std::vector &src, size_t numPixels, bool srcRGBA, bool dstRGBA); + +template +void convertRGBtoGray(std::vector &dst, const std::vector &src, size_t numPixels, bool rgba, bool bgr); + +template +void convertGrayToRGB(std::vector &dst, const std::vector &src, size_t numPixels, bool rgba); + +template +void convertRGBtoHSV(std::vector &dst, const std::vector &src, size_t numPixels, bool rgba, bool bgr); + +template +void convertHSVtoRGB(std::vector &dst, const std::vector &src, size_t numPixels, bool rgba, bool bgr); + +template +void convertRGBtoYUV_PAL(std::vector &dst, const std::vector &src, size_t numPixels, bool rgba, bool bgr); + +template +void convertYUVtoRGB_PAL(std::vector &dst, const std::vector &src, size_t numPixels, bool rgba, bool bgr); + +template +void convertRGBtoYUV_420(std::vector &dst, const std::vector &src, uint wdth, uint hght, uint numImgs, + bool rgba, bool bgr, bool yvu); + +template +void convertYUVtoRGB_420(std::vector &dst, const std::vector &src, uint wdth, uint hght, uint numImgs, + bool rgba, bool bgr, bool yvu); + +template +void convertYUVtoGray_420(std::vector &dst, const std::vector &src, uint wdth, uint hght, uint numImgs); + +template +void convertRGBtoNV12(std::vector &dst, const std::vector &src, uint wdth, uint hght, uint num, + bool rgba, bool bgr, bool yvu); + +template +void convertNV12toRGB(std::vector &dst, const std::vector &src, uint wdth, uint hght, uint num, + bool rgba, bool bgr, bool yvu); + +template +void convertYUVtoRGB_422(std::vector &dst, const std::vector &src, uint wdth, uint hght, uint numImgs, + bool rgba, bool bgr, bool yvu); + +template +void convertYUVtoGray_422(std::vector &dst, 
const std::vector &src, size_t numPixels); + +#endif // NVCV_TEST_COMMON_CVT_COLOR_UTILS_HPP diff --git a/tests/cvcuda/system/TestOpCvtColor.cpp b/tests/cvcuda/system/TestOpCvtColor.cpp index 31e5c682..67055b89 100644 --- a/tests/cvcuda/system/TestOpCvtColor.cpp +++ b/tests/cvcuda/system/TestOpCvtColor.cpp @@ -15,50 +15,23 @@ * limitations under the License. */ -#include "ConvUtils.hpp" +#include "CvtColorUtils.hpp" #include "Definitions.hpp" +#include "TestUtils.hpp" #include #include #include -#include #include #include #include #include -#include - namespace test = nvcv::test; +namespace util = nvcv::util; namespace cuda = nvcv::cuda; -#define VEC_EXPECT_NEAR(vec1, vec2, delta, dtype) \ - ASSERT_EQ(vec1.size(), vec2.size()); \ - for (std::size_t idx = 0; idx < vec1.size() / sizeof(dtype); ++idx) \ - { \ - EXPECT_NEAR(reinterpret_cast(vec1.data())[idx], reinterpret_cast(vec2.data())[idx], delta) \ - << "At index " << idx; \ - } - -template -void myGenerate(T *src, std::size_t size, std::default_random_engine &randEng) -{ - std::uniform_int_distribution rand(0u, 255u); - for (std::size_t idx = 0; idx < size; ++idx) - { - src[idx] = rand(randEng); - } -} - -template<> -void myGenerate(float *src, std::size_t size, std::default_random_engine &randEng) -{ - std::uniform_real_distribution rand(0.f, 1.f); - for (std::size_t idx = 0; idx < size; ++idx) - { - src[idx] = rand(randEng); - } -} +using std::vector; #define NVCV_IMAGE_FORMAT_RGBS8 NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, SIGNED, XYZ1, ASSOCIATED, X8_Y8_Z8) #define NVCV_IMAGE_FORMAT_BGRS8 NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, SIGNED, ZYX1, ASSOCIATED, X8_Y8_Z8) @@ -101,71 +74,739 @@ void myGenerate(float *src, std::size_t size, std::default_random_engine &randEn // clang-format off +//--------------------------------------------------------------------------------------------------------------------// +template +static void verifyOutput(nvcv::Tensor srcTensor, nvcv::ImageFormat srcFrmt, + 
nvcv::Tensor dstTensor, nvcv::ImageFormat dstFrmt, + NVCVColorConversionCode code, int wdth, int hght, int imgs, double maxDiff) +{ + auto srcData = srcTensor.exportData(); + auto dstData = dstTensor.exportData(); + ASSERT_TRUE(srcData); + ASSERT_TRUE(dstData); + + auto srcAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*srcData); + auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*dstData); + ASSERT_TRUE(srcAccess); + ASSERT_TRUE(dstAccess); + + int srcChannels = srcAccess->numChannels(); + int dstChannels = dstAccess->numChannels(); + + ASSERT_LE(srcChannels, 4); + ASSERT_LE(dstChannels, 4); + + int srcWdth = wdth, + srcHght = hght; + int dstWdth = wdth, + dstHght = hght; + + if (srcFrmt == NVCV_IMAGE_FORMAT_UYVY || srcFrmt == NVCV_IMAGE_FORMAT_UYVY_ER || + srcFrmt == NVCV_IMAGE_FORMAT_YUYV || srcFrmt == NVCV_IMAGE_FORMAT_YUYV_ER) + srcWdth = srcWdth << 1; + if (srcFrmt == NVCV_IMAGE_FORMAT_NV12 || srcFrmt == NVCV_IMAGE_FORMAT_NV12_ER || + srcFrmt == NVCV_IMAGE_FORMAT_NV21 || srcFrmt == NVCV_IMAGE_FORMAT_NV21_ER) + srcHght = (srcHght * 3) >> 1; + ASSERT_EQ(srcWdth, srcAccess->numCols()); + ASSERT_EQ(srcHght, srcAccess->numRows()); + + if (dstFrmt == NVCV_IMAGE_FORMAT_UYVY || dstFrmt == NVCV_IMAGE_FORMAT_UYVY_ER || + dstFrmt == NVCV_IMAGE_FORMAT_YUYV || dstFrmt == NVCV_IMAGE_FORMAT_YUYV_ER) + dstWdth = dstWdth << 1; + if (dstFrmt == NVCV_IMAGE_FORMAT_NV12 || dstFrmt == NVCV_IMAGE_FORMAT_NV12_ER || + dstFrmt == NVCV_IMAGE_FORMAT_NV21 || dstFrmt == NVCV_IMAGE_FORMAT_NV21_ER) + dstHght = (dstHght * 3) >> 1; + ASSERT_EQ(dstWdth, dstAccess->numCols()); + ASSERT_EQ(dstHght, dstAccess->numRows()); + + int srcRowElems = srcChannels * srcWdth; + int dstRowElems = dstChannels * dstWdth; + + size_t numPixels = (size_t)imgs * (size_t)wdth * (size_t)hght; + size_t srcElems = (size_t)imgs * (size_t)srcWdth * (size_t)srcHght * (size_t)srcChannels; + size_t dstElems = (size_t)imgs * (size_t)dstWdth * (size_t)dstHght * (size_t)dstChannels; + + size_t 
srcPitchCPU = srcRowElems * sizeof(T); + size_t dstPitchCPU = dstRowElems * sizeof(T); + + nvcv::Swizzle srcSwizzle = srcFrmt.swizzle(); + nvcv::Swizzle dstSwizzle = dstFrmt.swizzle(); + + vector srcVec(srcElems); + vector refVec(dstElems); + + bool srcBGR = (srcSwizzle == nvcv::Swizzle::S_ZYXW || + srcSwizzle == nvcv::Swizzle::S_ZYX1 || + srcSwizzle == nvcv::Swizzle::S_ZYX0); + bool dstBGR = (dstSwizzle == nvcv::Swizzle::S_ZYXW || + dstSwizzle == nvcv::Swizzle::S_ZYX1 || + dstSwizzle == nvcv::Swizzle::S_ZYX0); + bool srcRGBA = (srcChannels == 4), + dstRGBA = (dstChannels == 4); + bool success = true; + + RandEng randEng(0); + + constexpr size_t minCntAllRGB = 128 * 256 * 256; // Minimum # of pixels to call generateAllRGB. + constexpr size_t minCntAllHSV = 90 * 256 * 256; // Minimum # of pixels to call generateAllHSV. + constexpr double minMultHSV = -0.5; // Set hue range multiplier to be outside normal range + constexpr double maxMultHSV = 1.5; // to test robustness to wrapped hue values. + + // Populate source tensor. + if (srcChannels > 2) + { + if (code == NVCV_COLOR_HSV2BGR || code == NVCV_COLOR_HSV2BGR_FULL || + code == NVCV_COLOR_HSV2RGB || code == NVCV_COLOR_HSV2RGB_FULL) + { + bool full = (code == NVCV_COLOR_HSV2BGR_FULL || code == NVCV_COLOR_HSV2RGB_FULL); + + if (numPixels >= minCntAllHSV) + { + if (full) generateAllHSV(srcVec, srcWdth, srcHght, imgs); + else generateAllHSV(srcVec, srcWdth, srcHght, imgs); + } + else + { + if (full) generateRandHSV(srcVec, randEng, minMultHSV, maxMultHSV); + else generateRandHSV(srcVec, randEng, minMultHSV, maxMultHSV); + } + } + else + { + if (numPixels >= minCntAllRGB) + generateAllRGB(srcVec, srcWdth, srcHght, imgs, srcRGBA, srcBGR); + else + generateRandTestRGB(srcVec, randEng, srcRGBA, srcBGR); + } + } + else + generateRandVec(srcVec, randEng); + + // Copy source from image vector to device tensor. 
+ ASSERT_EQ(cudaSuccess, cudaMemcpy2D(srcData->basePtr(), srcAccess->rowStride(), srcVec.data(), srcPitchCPU, + srcPitchCPU, (size_t)imgs * (size_t)srcHght, cudaMemcpyHostToDevice)); + + switch (code) + { + // Add/remove alpha channel to RGB/BGR image. + case NVCV_COLOR_BGR2BGRA : // NVCV_COLOR_BGR2BGRA = 0 (NVCV_COLOR_RGB2RGBA) + case NVCV_COLOR_BGRA2BGR : // NVCV_COLOR_BGRA2BGR = 1 (NVCV_COLOR_RGBA2RGB) + changeAlpha(refVec, srcVec, numPixels, srcRGBA, dstRGBA); + break; + + // Convert between RGB and BGR (with or without alpha channel). + case NVCV_COLOR_BGR2RGBA : // NVCV_COLOR_BGR2RGBA = 2 (NVCV_COLOR_RGB2BGRA) + case NVCV_COLOR_RGBA2BGR : // NVCV_COLOR_RGBA2BGR = 3 (NVCV_COLOR_BGRA2RGB) + case NVCV_COLOR_BGR2RGB : // NVCV_COLOR_BGR2RGB = 4 (NVCV_COLOR_BGR2RGB) + case NVCV_COLOR_BGRA2RGBA : // NVCV_COLOR_BGRA2RGBA = 5 (NVCV_COLOR_RGBA2BGRA) + convertRGBtoBGR(refVec, srcVec, numPixels, srcRGBA, dstRGBA); + break; + + // Convert from RGB/BGR to grayscale. + case NVCV_COLOR_BGR2GRAY : // NVCV_COLOR_BGR2GRAY = 6 + case NVCV_COLOR_RGB2GRAY : // NVCV_COLOR_RGB2GRAY = 7 + case NVCV_COLOR_BGRA2GRAY : // NVCV_COLOR_BGRA2GRAY = 10 + case NVCV_COLOR_RGBA2GRAY : // NVCV_COLOR_RGBA2GRAY = 11 + convertRGBtoGray(refVec, srcVec, numPixels, srcRGBA, srcBGR); + break; + + // Convert from grayscale to RGB/BGR. + case NVCV_COLOR_GRAY2BGR : // NVCV_COLOR_GRAY2BGR = 8 (NVCV_COLOR_GRAY2RGB) + case NVCV_COLOR_GRAY2BGRA : // NVCV_COLOR_GRAY2BGRA = 9 (NVCV_COLOR_GRAY2RGBA) + convertGrayToRGB(refVec, srcVec, numPixels, dstRGBA); + break; + + // Convert between RGB/BGR and BGR565 (16-bit images) --> Conversion codes 12-19 not implemented. + // Convert between grayscale and BGR565 (16-bit images) --> Conversion codes 20-21 not implemented. + // Convert between RGB/BGR and BGR555 (16-bit images) --> Conversion codes 22-29 not implemented. + // Convert between grayscale and BGR555 (16-bit images) --> Conversion codes 30-31 not implemented. 
+ // Convert between RGB/BGR and CIE XYZ --> Conversion codes 32-35 not implemented. + // Convert between RGB/BGR and YCrCb (aka YCC) --> Conversion codes 36-39 not implemented. + + // Convert from RGB/BGR to HSV (hue, saturation, value). + case NVCV_COLOR_BGR2HSV : // NVCV_COLOR_BGR2HSV = 40 + case NVCV_COLOR_RGB2HSV : // NVCV_COLOR_RGB2HSV = 41 + convertRGBtoHSV(refVec, srcVec, numPixels, srcRGBA, srcBGR); + break; + + // Conversion codes 42 and 43 not specified. + // Convert from RGB/BGR to CIE Lab --> Conversion codes 44-45 not implemented. + // Bayer demosaicing to RGB/BGR --> Conversion codes 46-49 not implemented. + // Convert from RGB/BGR to CIE Luv --> Conversion codes 50-51 not implemented. + // Convert from RGB/BGR to HLS (hue, lightness, saturation) --> Conversion codes 52-53 not implemented. + + // Convert from HSV (hue, saturation, value) to RGB/BGR. + case NVCV_COLOR_HSV2BGR : // NVCV_COLOR_HSV2BGR = 54 + case NVCV_COLOR_HSV2RGB : // NVCV_COLOR_HSV2RGB = 55 + convertHSVtoRGB(refVec, srcVec, numPixels, dstRGBA, dstBGR); + break; + + // Convert to RGB/BGR from CIE Lab --> Conversion codes 56-57 not implemented. + // Convert to RGB/BGR from CIE Luv --> Conversion codes 58-59 not implemented. + // Convert to RGB/BGR from HLS (hue, lightness, saturation) --> Conversion codes 60-61 not implemented. + // VNG (Variable Number of Gradients) demosaicing to RGB/BGR --> Conversion codes 62-65 not implemented. + + // Convert from RGB/BGR to full-range HSV (hue, saturation, value). + case NVCV_COLOR_BGR2HSV_FULL : // NVCV_COLOR_BGR2HSV_FULL = 66 + case NVCV_COLOR_RGB2HSV_FULL : // NVCV_COLOR_RGB2HSV_FULL = 67 + convertRGBtoHSV(refVec, srcVec, numPixels, srcRGBA, srcBGR); + break; + + // Convert from RGB/BGR to full-range HLS (hue, lightness, saturation) --> Conversion codes 68-69 not implemented. + + // Convert from full-range HSV (hue, saturation, value) to RGB/BGR. 
+ case NVCV_COLOR_HSV2BGR_FULL : // NVCV_COLOR_HSV2BGR_FULL = 70 + case NVCV_COLOR_HSV2RGB_FULL : // NVCV_COLOR_HSV2RGB_FULL = 71 + convertHSVtoRGB(refVec, srcVec, numPixels, dstRGBA, dstBGR); + break; + + // Convert from full-range HLS (hue, lightness, saturation) to RGB/BGR --> Conversion codes 72-73 not implemented. + // Convert from LRGB/LBGR (luminance, red, green, blue) to CIE Lab --> Conversion codes 74-75 not implemented. + // Convert from LRGB/LBGR (luminance, red, green, blue) to CIE Luv --> Conversion codes 76-77 not implemented. + // Convert to LRGB/LBGR (luminance, red, green, blue) from CIE Lab --> Conversion codes 78-79 not implemented. + // Convert to LRGB/LBGR (luminance, red, green, blue) from CIE Luv --> Conversion codes 80-81 not implemented. + + // Convert from RGB/BGR to YUV. + case NVCV_COLOR_BGR2YUV : // NVCV_COLOR_BGR2YUV = 82 + case NVCV_COLOR_RGB2YUV : // NVCV_COLOR_RGB2YUV = 83 + convertRGBtoYUV_PAL(refVec, srcVec, numPixels, srcRGBA, srcBGR); + break; + + // Convert from YUV to RGB/BGR. + case NVCV_COLOR_YUV2BGR : // NVCV_COLOR_YUV2BGR = 84 + case NVCV_COLOR_YUV2RGB : // NVCV_COLOR_YUV2RGB = 85 + convertYUVtoRGB_PAL(refVec, srcVec, numPixels, dstRGBA, dstBGR); + break; + + // Bayer demosaicing to grayscale --> Conversion codes 86-89 not implemented. + + // Convert from YUV 4:2:0 family to RGB/BGR. 
+ case NVCV_COLOR_YUV2RGB_NV12 : // NVCV_COLOR_YUV2RGB_NV12 = 90 + case NVCV_COLOR_YUV2BGR_NV12 : // NVCV_COLOR_YUV2BGR_NV12 = 91 + case NVCV_COLOR_YUV2RGBA_NV12: // NVCV_COLOR_YUV2RGBA_NV12 = 94 + case NVCV_COLOR_YUV2BGRA_NV12: // NVCV_COLOR_YUV2BGRA_NV12 = 95 + convertNV12toRGB(refVec, srcVec, wdth, hght, imgs, dstRGBA, dstBGR, false); + break; + + case NVCV_COLOR_YUV2RGB_NV21 : // NVCV_COLOR_YUV2RGB_NV21 = 92 (NVCV_COLOR_YUV420sp2RGB) + case NVCV_COLOR_YUV2BGR_NV21 : // NVCV_COLOR_YUV2BGR_NV21 = 93 (NVCV_COLOR_YUV420sp2BGR) + case NVCV_COLOR_YUV2RGBA_NV21: // NVCV_COLOR_YUV2RGBA_NV21 = 96 (NVCV_COLOR_YUV420sp2RGBA) + case NVCV_COLOR_YUV2BGRA_NV21: // NVCV_COLOR_YUV2BGRA_NV21 = 97 (NVCV_COLOR_YUV420sp2BGRA) + convertNV12toRGB(refVec, srcVec, wdth, hght, imgs, dstRGBA, dstBGR, true); + break; + + case NVCV_COLOR_YUV2RGB_YV12 : // NVCV_COLOR_YUV2RGB_YV12 = 98 (NVCV_COLOR_YUV420p2RGB) + case NVCV_COLOR_YUV2BGR_YV12 : // NVCV_COLOR_YUV2BGR_YV12 = 99 (NVCV_COLOR_YUV420p2BGR) + case NVCV_COLOR_YUV2RGBA_YV12: // NVCV_COLOR_YUV2RGBA_YV12 = 102 (NVCV_COLOR_YUV420p2RGBA) + case NVCV_COLOR_YUV2BGRA_YV12: // NVCV_COLOR_YUV2BGRA_YV12 = 103 (NVCV_COLOR_YUV420p2BGRA) + convertYUVtoRGB_420(refVec, srcVec, wdth, hght, imgs, dstRGBA, dstBGR, true); + break; + + case NVCV_COLOR_YUV2RGB_IYUV : // NVCV_COLOR_YUV2RGB_IYUV = 100 (NVCV_COLOR_YUV2RGB_I420) + case NVCV_COLOR_YUV2BGR_IYUV : // NVCV_COLOR_YUV2BGR_IYUV = 101 (NVCV_COLOR_YUV2BGR_I420) + case NVCV_COLOR_YUV2RGBA_IYUV: // NVCV_COLOR_YUV2RGBA_IYUV = 104 (NVCV_COLOR_YUV2RGBA_I420) + case NVCV_COLOR_YUV2BGRA_IYUV: // NVCV_COLOR_YUV2BGRA_IYUV = 105 (NVCV_COLOR_YUV2BGRA_I420) + convertYUVtoRGB_420(refVec, srcVec, wdth, hght, imgs, dstRGBA, dstBGR, false); + break; + + // Convert from YUV 4:2:0 family to grayscale. 
+ case NVCV_COLOR_YUV2GRAY_420 : // NVCV_COLOR_YUV2GRAY_420 = 106 (NVCV_COLOR_YUV2GRAY_NV21, NVCV_COLOR_YUV2GRAY_NV12, + // NVCV_COLOR_YUV2GRAY_YV12, NVCV_COLOR_YUV2GRAY_IYUV, + // NVCV_COLOR_YUV2GRAY_I420, NVCV_COLOR_YUV420sp2GRAY, + // NVCV_COLOR_YUV420p2GRAY) + convertYUVtoGray_420(refVec, srcVec, wdth, hght, imgs); + break; + + // Convert from YUV 4:2:2 family to RGB/BGR. + case NVCV_COLOR_YUV2RGB_UYVY : // NVCV_COLOR_YUV2RGB_UYVY = 107 ( NVCV_COLOR_YUV2RGB_Y422, NVCV_COLOR_YUV2RGB_UYNV) + case NVCV_COLOR_YUV2BGR_UYVY : // NVCV_COLOR_YUV2BGR_UYVY = 108 ( NVCV_COLOR_YUV2RGB_Y422, NVCV_COLOR_YUV2RGB_UYNV) + // Conversion codes 109 (NVCV_COLOR_YUV2RGB_VYUY) and 110 (NVCV_COLOR_YUV2BGR_VYUY) not available. + case NVCV_COLOR_YUV2RGBA_UYVY: // NVCV_COLOR_YUV2RGBA_UYVY = 111 ( NVCV_COLOR_YUV2RGBA_Y422, NVCV_COLOR_YUV2RGBA_UYNV) + case NVCV_COLOR_YUV2BGRA_UYVY: // NVCV_COLOR_YUV2BGRA_UYVY = 112 ( NVCV_COLOR_YUV2BGRA_Y422, NVCV_COLOR_YUV2BGRA_UYNV) + convertYUVtoRGB_422(refVec, srcVec, wdth, hght, imgs, dstRGBA, dstBGR, false); + break; + + // Conversion codes 113 (NVCV_COLOR_YUV2RGBA_VYUY) and 114 (NVCV_COLOR_YUV2BGRA_VYUY) not available. 
+ case NVCV_COLOR_YUV2RGB_YUY2 : // NVCV_COLOR_YUV2RGB_YUY2 = 115 (NVCV_COLOR_YUV2RGB_YUYV, NVCV_COLOR_YUV2RGB_YUNV) + case NVCV_COLOR_YUV2BGR_YUY2 : // NVCV_COLOR_YUV2BGR_YUY2 = 116 (NVCV_COLOR_YUV2BGR_YUYV, NVCV_COLOR_YUV2BGR_YUNV) + case NVCV_COLOR_YUV2RGBA_YUY2: // NVCV_COLOR_YUV2RGBA_YUY2 = 119 (NVCV_COLOR_YUV2RGBA_YUYV, NVCV_COLOR_YUV2RGBA_YUNV) + case NVCV_COLOR_YUV2BGRA_YUY2: // NVCV_COLOR_YUV2BGRA_YUY2 = 120 (NVCV_COLOR_YUV2BGRA_YUYV, NVCV_COLOR_YUV2BGRA_YUNV) + convertYUVtoRGB_422(refVec, srcVec, wdth, hght, imgs, dstRGBA, dstBGR, false); + break; + + case NVCV_COLOR_YUV2RGB_YVYU : // NVCV_COLOR_YUV2RGB_YVYU = 117 + case NVCV_COLOR_YUV2BGR_YVYU : // NVCV_COLOR_YUV2BGR_YVYU = 118 + case NVCV_COLOR_YUV2RGBA_YVYU: // NVCV_COLOR_YUV2RGBA_YVYU = 121 + case NVCV_COLOR_YUV2BGRA_YVYU: // NVCV_COLOR_YUV2BGRA_YVYU = 122 + convertYUVtoRGB_422(refVec, srcVec, wdth, hght, imgs, dstRGBA, dstBGR, true); + break; + + // Convert from YUV 4:2:2 family to grayscale. + case NVCV_COLOR_YUV2GRAY_UYVY: // NVCV_COLOR_YUV2GRAY_UYVY = 123 (NVCV_COLOR_YUV2GRAY_Y422, NVCV_COLOR_YUV2GRAY_UYNV) + convertYUVtoGray_422(refVec, srcVec, numPixels); + break; + + case NVCV_COLOR_YUV2GRAY_YUY2: // NVCV_COLOR_YUV2GRAY_YUY2 = 124 (NVCV_COLOR_YUV2GRAY_YVYU, NVCV_COLOR_YUV2GRAY_YUYV, + // NVCV_COLOR_YUV2GRAY_YUNV) + convertYUVtoGray_422(refVec, srcVec, numPixels); + break; + + // RGB/BGA alpha premultiplication --> Conversion codes 125-126 not implemented. + + // Convert from RGB/BGR to YUV 4:2:0 family. 
+ case NVCV_COLOR_RGB2YUV_I420 : // NVCV_COLOR_RGB2YUV_I420 = 127 (NVCV_COLOR_RGB2YUV_IYUV) + case NVCV_COLOR_BGR2YUV_I420 : // NVCV_COLOR_BGR2YUV_I420 = 128 (NVCV_COLOR_BGR2YUV_IYUV) + case NVCV_COLOR_RGBA2YUV_I420: // NVCV_COLOR_RGBA2YUV_I420 = 129 (NVCV_COLOR_RGBA2YUV_IYUV) + case NVCV_COLOR_BGRA2YUV_I420: // NVCV_COLOR_BGRA2YUV_I420 = 130 (NVCV_COLOR_BGRA2YUV_IYUV) + convertRGBtoYUV_420(refVec, srcVec, wdth, hght, imgs, srcRGBA, srcBGR, false); + break; + + case NVCV_COLOR_RGB2YUV_YV12 : // NVCV_COLOR_RGB2YUV_YV12 = 131 + case NVCV_COLOR_BGR2YUV_YV12 : // NVCV_COLOR_BGR2YUV_YV12 = 132 + case NVCV_COLOR_RGBA2YUV_YV12: // NVCV_COLOR_RGBA2YUV_YV12 = 133 + case NVCV_COLOR_BGRA2YUV_YV12: // NVCV_COLOR_BGRA2YUV_YV12 = 134 + convertRGBtoYUV_420(refVec, srcVec, wdth, hght, imgs, srcRGBA, srcBGR, true); + break; + + // Edge-aware demosaicing to RGB/BGR --> Conversion codes 135-138 not implemented. + // OpenCV COLORCVT_MAX --> Conversion code 139 not implemented. + + // Convert RGB/BGR to YUV 4:2:0 family (two plane YUV; not in OpenCV). + case NVCV_COLOR_RGB2YUV_NV12 : // NVCV_COLOR_RGB2YUV_NV12 = 140 + case NVCV_COLOR_BGR2YUV_NV12 : // NVCV_COLOR_BGR2YUV_NV12 = 141 + case NVCV_COLOR_RGBA2YUV_NV12: // NVCV_COLOR_RGBA2YUV_NV12 = 144 + case NVCV_COLOR_BGRA2YUV_NV12: // NVCV_COLOR_BGRA2YUV_NV12 = 145 + convertRGBtoNV12(refVec, srcVec, wdth, hght, imgs, srcRGBA, srcBGR, false); + break; + + case NVCV_COLOR_RGB2YUV_NV21 : // NVCV_COLOR_RGB2YUV_NV21 = 142 (NVCV_COLOR_RGB2YUV420sp) + case NVCV_COLOR_BGR2YUV_NV21 : // NVCV_COLOR_BGR2YUV_NV21 = 143 (NVCV_COLOR_BGR2YUV420sp) + case NVCV_COLOR_RGBA2YUV_NV21: // NVCV_COLOR_RGBA2YUV_NV21 = 146 (NVCV_COLOR_RGBA2YUV420sp) + case NVCV_COLOR_BGRA2YUV_NV21: // NVCV_COLOR_BGRA2YUV_NV21 = 147 (NVCV_COLOR_BGRA2YUV420sp) + convertRGBtoNV12(refVec, srcVec, wdth, hght, imgs, srcRGBA, srcBGR, true); + break; + + default: + std::cerr << "**** ERROR: Color conversion not implemented for conversion code " << code << ". 
****\n\n"; + success = false; + } + + if (success) + { + // Run color conversion operator. + cudaStream_t stream; + + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + cvcuda::CvtColor convertColor; + + EXPECT_NO_THROW(convertColor(stream, srcTensor, dstTensor, code)); + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); + + // Copy destination tensor back to host. + vector dstVec(dstElems); + + ASSERT_EQ(cudaSuccess, cudaMemcpy2D(dstVec.data(), dstPitchCPU, dstData->basePtr(), dstAccess->rowStride(), + dstPitchCPU, (size_t)imgs * (size_t)dstHght, cudaMemcpyDeviceToHost)); + + constexpr uint maxErrCnt = 16; + + // Compare "gold" reference to computed output. + if (dstFrmt == NVCV_IMAGE_FORMAT_HSV8 || dstFrmt == NVCV_IMAGE_FORMAT_HSVf32) + { + const bool full = (code == NVCV_COLOR_BGR2HSV_FULL || code == NVCV_COLOR_RGB2HSV_FULL); + const double range = (sizeof(T) > 1) ? 360.0 : (full ? 256.0 : 180.0); + + EXPECT_NEAR_HSV_VEC_CNT(refVec, dstVec, range, maxDiff, maxErrCnt, success); + } + else + EXPECT_NEAR_VEC_CNT(refVec, dstVec, maxDiff, maxErrCnt, success); + } + else + { + GTEST_SKIP() << "Waived: this test hasn't been implemented."; + } +} + +//--------------------------------------------------------------------------------------------------------------------// + +#define ERR2_3 (2.0 / 1024.0) // 0.0009765625 --> approximates 2e-3 but can be exactly represented in floating point. +#define ERR1_3 (1.0 / 1024.0) // 0.0009765625 --> approximates 1e-3 but can be exactly represented in floating point. +#define ERR1_4 (1.0 / 8192.0) // 0.0001220703125 --> approximates 1e-4 but can be exactly represented in floating point. 
+ NVCV_TEST_SUITE_P(OpCvtColor, +test::ValueList +{ + // W, H, N, Input Format, Output Format, Convert Code, maxDiff + { 177, 113, 1, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2BGRA, 0.0}, + { 113, 176, 2, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_BGRA2BGR, 0.0}, + { 335, 432, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_RGB2RGBA, 0.0}, + { 431, 336, 2, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_BGR2RGBA, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_RGB2BGRA, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_RGBA2BGRA, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_BGRA2RGBA, 0.0}, + + { 177, 113, 1, NVCV_IMAGE_FORMAT_BGRS8, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_BGR2BGRA, 0.0}, + { 113, 176, 2, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_IMAGE_FORMAT_BGRS8, NVCV_COLOR_BGRA2BGR, 0.0}, + { 335, 432, 2, NVCV_IMAGE_FORMAT_RGBS8, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_COLOR_RGB2RGBA, 0.0}, + { 431, 336, 2, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_IMAGE_FORMAT_RGBS8, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS8, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_COLOR_BGR2RGBA, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_IMAGE_FORMAT_BGRS8, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS8, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_RGB2BGRA, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_IMAGE_FORMAT_RGBS8, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_RGBA2BGRA, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_COLOR_BGRA2RGBA, 0.0}, + + 
{ 177, 113, 1, NVCV_IMAGE_FORMAT_BGR16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_BGR2BGRA, 0.0}, + { 113, 176, 2, NVCV_IMAGE_FORMAT_BGRA16, NVCV_IMAGE_FORMAT_BGR16, NVCV_COLOR_BGRA2BGR, 0.0}, + { 335, 432, 2, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_RGBA16, NVCV_COLOR_RGB2RGBA, 0.0}, + { 431, 336, 2, NVCV_IMAGE_FORMAT_RGBA16, NVCV_IMAGE_FORMAT_RGB16, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGR16, NVCV_IMAGE_FORMAT_RGBA16, NVCV_COLOR_BGR2RGBA, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_RGBA16, NVCV_IMAGE_FORMAT_BGR16, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_RGB2BGRA, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_BGRA16, NVCV_IMAGE_FORMAT_RGB16, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBA16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_RGBA2BGRA, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_BGRA16, NVCV_IMAGE_FORMAT_RGBA16, NVCV_COLOR_BGRA2RGBA, 0.0}, + + { 177, 113, 1, NVCV_IMAGE_FORMAT_BGRS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_BGR2BGRA, 0.0}, + { 113, 176, 2, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_IMAGE_FORMAT_BGRS16, NVCV_COLOR_BGRA2BGR, 0.0}, + { 335, 432, 2, NVCV_IMAGE_FORMAT_RGBS16, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_COLOR_RGB2RGBA, 0.0}, + { 431, 336, 2, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_IMAGE_FORMAT_RGBS16, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS16, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_COLOR_BGR2RGBA, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_IMAGE_FORMAT_BGRS16, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_RGB2BGRA, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_IMAGE_FORMAT_RGBS16, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_RGBA2BGRA, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_COLOR_BGRA2RGBA, 0.0}, + + { 177, 113, 1, NVCV_IMAGE_FORMAT_BGRS32, NVCV_IMAGE_FORMAT_BGRAS32, 
NVCV_COLOR_BGR2BGRA, 0.0}, + { 113, 176, 2, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_IMAGE_FORMAT_BGRS32, NVCV_COLOR_BGRA2BGR, 0.0}, + { 335, 432, 2, NVCV_IMAGE_FORMAT_RGBS32, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_COLOR_RGB2RGBA, 0.0}, + { 431, 336, 2, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_IMAGE_FORMAT_RGBS32, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS32, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_COLOR_BGR2RGBA, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_IMAGE_FORMAT_BGRS32, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS32, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_COLOR_RGB2BGRA, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_IMAGE_FORMAT_RGBS32, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_COLOR_RGBA2BGRA, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_COLOR_BGRA2RGBA, 0.0}, + + // Conversions that add alpha to output tensor are not allowed for f16 type. + { 113, 176, 2, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_IMAGE_FORMAT_BGRf16, NVCV_COLOR_BGRA2BGR, 0.0}, + { 431, 336, 2, NVCV_IMAGE_FORMAT_RGBAf16, NVCV_IMAGE_FORMAT_RGBf16, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_RGBAf16, NVCV_IMAGE_FORMAT_BGRf16, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_IMAGE_FORMAT_RGBf16, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAf16, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_COLOR_RGBA2BGRA, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_IMAGE_FORMAT_RGBAf16, NVCV_COLOR_BGRA2RGBA, 0.0}, + + { 129, 61, 4, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_BGR2BGRA, 0.0}, + { 129, 61, 4, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_BGRA2BGR, 0.0}, + { 63, 31, 3, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_COLOR_RGB2RGBA, 0.0}, + { 63, 31, 3, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_RGBA2RGB, 0.0}, + { 42, 111, 2, NVCV_IMAGE_FORMAT_BGRf32, 
NVCV_IMAGE_FORMAT_RGBAf32, NVCV_COLOR_BGR2RGBA, 0.0}, + { 42, 111, 2, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_RGBA2BGR, 0.0}, + { 21, 72, 2, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_RGB2BGRA, 0.0}, + { 21, 72, 2, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_BGRA2RGB, 0.0}, + { 23, 31, 3, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_RGBA2BGRA, 0.0}, + { 23, 31, 3, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_COLOR_BGRA2RGBA, 0.0}, + + { 177, 113, 1, NVCV_IMAGE_FORMAT_BGRf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_BGR2BGRA, 0.0}, + { 113, 176, 2, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_IMAGE_FORMAT_BGRf64, NVCV_COLOR_BGRA2BGR, 0.0}, + { 335, 432, 2, NVCV_IMAGE_FORMAT_RGBf64, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_COLOR_RGB2RGBA, 0.0}, + { 431, 336, 2, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_IMAGE_FORMAT_RGBf64, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRf64, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_COLOR_BGR2RGBA, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_IMAGE_FORMAT_BGRf64, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_RGB2BGRA, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_IMAGE_FORMAT_RGBf64, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_RGBA2BGRA, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_COLOR_BGRA2RGBA, 0.0}, + + { 23, 21, 63, NVCV_IMAGE_FORMAT_Y8_ER, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_GRAY2BGR, 0.0}, + { 21, 22, 63, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_Y8_ER, NVCV_COLOR_BGR2GRAY, 1.0}, + { 401, 202, 5, NVCV_IMAGE_FORMAT_Y8_ER, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_GRAY2RGB, 0.0}, + { 201, 402, 5, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_Y8_ER, NVCV_COLOR_RGB2GRAY, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_Y8_ER, NVCV_COLOR_RGB2GRAY, 1.0}, + + { 32, 21, 4, 
NVCV_IMAGE_FORMAT_Y16, NVCV_IMAGE_FORMAT_BGR16, NVCV_COLOR_GRAY2BGR, 0.0}, + { 32, 21, 4, NVCV_IMAGE_FORMAT_BGR16, NVCV_IMAGE_FORMAT_Y16, NVCV_COLOR_BGR2GRAY, 2.0}, + { 54, 66, 5, NVCV_IMAGE_FORMAT_Y16, NVCV_IMAGE_FORMAT_RGB16, NVCV_COLOR_GRAY2RGB, 0.0}, + { 54, 66, 5, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_Y16, NVCV_COLOR_RGB2GRAY, 2.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_Y16, NVCV_COLOR_RGB2GRAY, 2.0}, + + { 64, 21, 3, NVCV_IMAGE_FORMAT_Yf32, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_GRAY2BGR, ERR1_4}, + { 64, 21, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_Yf32, NVCV_COLOR_BGR2GRAY, ERR1_4}, + { 121, 66, 5, NVCV_IMAGE_FORMAT_Yf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_GRAY2RGB, ERR1_4}, + { 121, 66, 5, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_Yf32, NVCV_COLOR_RGB2GRAY, ERR1_4}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_Yf32, NVCV_COLOR_RGB2GRAY, ERR1_4}, + + // Codes 9 to 39 are not implemented + { 55, 257, 4, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV, 1.0}, + { 55, 257, 4, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_HSV2BGR, 1.0}, + { 366, 14, 5, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_RGB2HSV, 1.0}, + { 366, 14, 5, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_HSV2RGB, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_RGB2HSV, 1.0}, + {2880,4096, 1, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_HSV2RGB, 1.0}, + + // Hue computation differs slightly because CUDA kernel adds FLT_EPSILON to denominator for 'diff' division. 
+ { 55, 257, 4, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_HSVf32, NVCV_COLOR_BGR2HSV, ERR2_3}, + { 33, 525, 3, NVCV_IMAGE_FORMAT_HSVf32, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_HSV2BGR, ERR1_4}, + { 365, 14, 5, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_HSVf32, NVCV_COLOR_RGB2HSV, ERR2_3}, + { 367, 223, 2, NVCV_IMAGE_FORMAT_HSVf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_HSV2RGB, ERR1_4}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_HSVf32, NVCV_COLOR_RGB2HSV, ERR2_3}, + {5760,4096, 1, NVCV_IMAGE_FORMAT_HSVf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_RGB2HSV, ERR2_3}, + + // // Codes 42 to 53 and 56 to 65 and 68 to 69 are not implemented + { 112, 157, 4, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV_FULL, 1.0}, + { 112, 157, 4, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_HSV2BGR_FULL, 1.0}, + { 333, 13, 3, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_RGB2HSV_FULL, 1.0}, + { 333, 13, 3, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_HSV2RGB_FULL, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_RGB2HSV_FULL, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_RGB2HSV_FULL, 1.0}, + + // Codes 72 to 81 are not implemented + { 133, 22, 2, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR, 1.0}, + { 133, 22, 2, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_YUV8, NVCV_COLOR_BGR2YUV, 1.0}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB, 1.0}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_YUV8, NVCV_COLOR_RGB2YUV, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_YUV8, NVCV_COLOR_RGB2YUV, 1.0}, + + { 133, 21, 3, NVCV_IMAGE_FORMAT_YUV16, NVCV_IMAGE_FORMAT_BGR16, NVCV_COLOR_YUV2BGR, 1.0}, + { 133, 21, 3, NVCV_IMAGE_FORMAT_BGR16, NVCV_IMAGE_FORMAT_YUV16, NVCV_COLOR_BGR2YUV, 2.0}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_YUV16, NVCV_IMAGE_FORMAT_RGB16, NVCV_COLOR_YUV2RGB, 
1.0}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_YUV16, NVCV_COLOR_RGB2YUV, 2.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_YUV16, NVCV_COLOR_RGB2YUV, 2.0}, + + { 133, 21, 3, NVCV_IMAGE_FORMAT_YUVf32, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_YUV2BGR, ERR1_4}, + { 133, 21, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_YUVf32, NVCV_COLOR_BGR2YUV, ERR1_4}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_YUVf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_YUV2RGB, ERR1_4}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_YUVf32, NVCV_COLOR_RGB2YUV, ERR1_4}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_YUVf32, NVCV_COLOR_RGB2YUV, ERR1_4}, + // Codes 86 to 89 are not implemented + // Codes 90 to 147 dealing with subsampled planes (NV12, etc. formats) are postponed (see comment below) + // Codes 109, 110, 113, 114 dealing with VYUY format are not implemented + // Codes 125, 126 dealing alpha premultiplication are not implemented + // Codes 135 to 139 dealing edge-aware demosaicing are not implemented + + { 120, 20, 2, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB_I420, 2.0}, + { 120, 20, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_RGB2YUV_I420, 1.0}, + { 100, 40, 3, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR_I420, 2.0}, + { 100, 40, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_BGR2YUV_I420, 1.0}, + { 80, 120, 4, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_YUV2RGBA_I420, 2.0}, + { 80, 120, 4, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_RGBA2YUV_I420, 1.0}, + { 60, 60, 5, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGRA_I420, 2.0}, + { 60, 60, 5, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_BGRA2YUV_I420, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_RGB2YUV_I420, 1.0}, + + { 140, 80, 6, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_RGB8, 
NVCV_COLOR_YUV2RGB_YV12, 2.0}, + { 140, 80, 6, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_RGB2YUV_YV12, 1.0}, + { 160, 60, 5, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR_YV12, 2.0}, + { 160, 60, 5, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_BGR2YUV_YV12, 1.0}, + { 60, 100, 4, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_YUV2RGBA_YV12, 2.0}, + { 60, 100, 4, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_RGBA2YUV_YV12, 1.0}, + { 80, 80, 3, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGRA_YV12, 2.0}, + { 80, 80, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_BGRA2YUV_YV12, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_RGB2YUV_YV12, 1.0}, + + // NV12, ... makes varShape raise an error: + // "NVCV_ERROR_NOT_IMPLEMENTED: Batch image format must not have subsampled planes, but it is: X" + { 120, 20, 2, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB_NV12, 2.0}, + { 120, 20, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_RGB2YUV_NV12, 1.0}, + { 100, 40, 3, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR_NV12, 2.0}, + { 100, 40, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_BGR2YUV_NV12, 1.0}, + { 80, 120, 4, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_YUV2RGBA_NV12, 2.0}, + { 80, 120, 4, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_RGBA2YUV_NV12, 1.0}, + { 60, 60, 5, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGRA_NV12, 2.0}, + { 60, 60, 5, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_BGRA2YUV_NV12, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV12, NVCV_COLOR_RGB2YUV_NV12, 1.0}, + + { 140, 80, 6, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB_NV21, 2.0}, + { 140, 80, 6, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_RGB2YUV_NV21, 1.0}, 
+ { 160, 60, 5, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR_NV21, 2.0}, + { 160, 60, 5, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_BGR2YUV_NV21, 1.0}, + { 60, 100, 4, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_YUV2RGBA_NV21, 2.0}, + { 60, 100, 4, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_RGBA2YUV_NV21, 1.0}, + { 80, 80, 3, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGRA_NV21, 2.0}, + { 80, 80, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_BGRA2YUV_NV21, 1.0}, + {4096,4096, 1, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_NV21, NVCV_COLOR_RGB2YUV_NV21, 1.0}, + + { 80, 120, 2, NVCV_IMAGE_FORMAT_NV12, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_YUV2GRAY_420, 0.0}, + { 100, 40, 3, NVCV_IMAGE_FORMAT_NV21, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_YUV2GRAY_420, 0.0}, + + { 120, 20, 2, NVCV_IMAGE_FORMAT_UYVY, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB_UYVY, 2.0}, + { 120, 20, 2, NVCV_IMAGE_FORMAT_UYVY, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR_UYVY, 2.0}, + { 100, 40, 3, NVCV_IMAGE_FORMAT_UYVY, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_YUV2RGBA_UYVY, 2.0}, + { 100, 40, 3, NVCV_IMAGE_FORMAT_UYVY, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGRA_UYVY, 2.0}, + + { 80, 120, 4, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB_YUY2, 2.0}, + { 80, 120, 4, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR_YUY2, 2.0}, + { 60, 60, 5, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB_YVYU, 2.0}, + { 60, 60, 5, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR_YVYU, 2.0}, + { 80, 120, 4, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_YUV2RGBA_YUY2, 2.0}, + { 80, 120, 4, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGRA_YUY2, 2.0}, + { 60, 60, 5, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_YUV2RGBA_YVYU, 2.0}, + { 60, 60, 5, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_BGRA8, 
NVCV_COLOR_YUV2BGRA_YVYU, 2.0}, + + { 80, 120, 2, NVCV_IMAGE_FORMAT_UYVY, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_YUV2GRAY_UYVY, 0.0}, + { 100, 40, 3, NVCV_IMAGE_FORMAT_YUYV, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_YUV2GRAY_YUY2, 0.0}, + + // Code 148 is not implemented +}); + +// clang-format on + +//--------------------------------------------------------------------------------------------------------------------// +TEST_P(OpCvtColor, correct_output) +{ + int wdth = GetParamValue<0>(); + int hght = GetParamValue<1>(); + int imgs = GetParamValue<2>(); + + nvcv::ImageFormat srcFrmt{GetParamValue<3>()}; + nvcv::ImageFormat dstFrmt{GetParamValue<4>()}; + + NVCVColorConversionCode code{GetParamValue<5>()}; + + double maxDiff{GetParamValue<6>()}; + + // Create input and output tensors. + nvcv::Tensor srcTensor = util::CreateTensor(imgs, wdth, hght, srcFrmt); + nvcv::Tensor dstTensor = util::CreateTensor(imgs, wdth, hght, dstFrmt); + + NVCVDataType dataType; + ASSERT_EQ(nvcvImageFormatGetPlaneDataType(srcFrmt, 0, &dataType), NVCV_SUCCESS); + + switch (dataType) + { + case NVCV_DATA_TYPE_U8: + case NVCV_DATA_TYPE_2U8: + case NVCV_DATA_TYPE_3U8: + case NVCV_DATA_TYPE_4U8: + case NVCV_DATA_TYPE_S8: + case NVCV_DATA_TYPE_2S8: + case NVCV_DATA_TYPE_3S8: + case NVCV_DATA_TYPE_4S8: + verifyOutput(srcTensor, srcFrmt, dstTensor, dstFrmt, code, wdth, hght, imgs, maxDiff); + break; + + case NVCV_DATA_TYPE_U16: + case NVCV_DATA_TYPE_2U16: + case NVCV_DATA_TYPE_3U16: + case NVCV_DATA_TYPE_4U16: + case NVCV_DATA_TYPE_S16: + case NVCV_DATA_TYPE_2S16: + case NVCV_DATA_TYPE_3S16: + case NVCV_DATA_TYPE_4S16: + case NVCV_DATA_TYPE_F16: // Data type float16 is only allowed in conversions that treat it as 16-bit integer + case NVCV_DATA_TYPE_2F16: // (e.g., RGB2BGR or Gray2RGB). 
+ case NVCV_DATA_TYPE_3F16: + case NVCV_DATA_TYPE_4F16: + verifyOutput(srcTensor, srcFrmt, dstTensor, dstFrmt, code, wdth, hght, imgs, maxDiff); + break; + + case NVCV_DATA_TYPE_S32: + case NVCV_DATA_TYPE_2S32: + case NVCV_DATA_TYPE_3S32: + case NVCV_DATA_TYPE_4S32: + verifyOutput(srcTensor, srcFrmt, dstTensor, dstFrmt, code, wdth, hght, imgs, maxDiff); + break; + + case NVCV_DATA_TYPE_F32: + case NVCV_DATA_TYPE_2F32: + case NVCV_DATA_TYPE_3F32: + case NVCV_DATA_TYPE_4F32: + verifyOutput(srcTensor, srcFrmt, dstTensor, dstFrmt, code, wdth, hght, imgs, maxDiff); + break; + + case NVCV_DATA_TYPE_F64: + case NVCV_DATA_TYPE_2F64: + case NVCV_DATA_TYPE_3F64: + case NVCV_DATA_TYPE_4F64: + verifyOutput(srcTensor, srcFrmt, dstTensor, dstFrmt, code, wdth, hght, imgs, maxDiff); + break; + default: + FAIL() << "Unsupported tensor data type."; + break; + } +} + +//--------------------------------------------------------------------------------------------------------------------// + +#define VEC_EXPECT_NEAR(vec1, vec2, delta, dtype) \ + ASSERT_EQ(vec1.size(), vec2.size()); \ + for (std::size_t idx = 0; idx < vec1.size() / sizeof(dtype); ++idx) \ + { \ + EXPECT_NEAR(reinterpret_cast(vec1.data())[idx], reinterpret_cast(vec2.data())[idx], delta) \ + << "At index " << idx; \ + } + +// clang-format off + +NVCV_TEST_SUITE_P(OpCvtColor_circular, test::ValueList { - // W, H, N, inputFormat, outputFormat, in2outCode, out2inCode, maxDiff - { 176, 113, 1, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, - { 336, 432, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, - { 77, 212, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, - { 33, 55, 4, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, - { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_RGBA2BGRA, 
NVCV_COLOR_BGRA2RGBA, 0.0}, - { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRS8, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, - { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBS8, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, - { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS8, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, - { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS8, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, - { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, - { 176, 113, 1, NVCV_IMAGE_FORMAT_BGR16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, - { 336, 432, 2, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_RGBA16, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, - { 77, 212, 3, NVCV_IMAGE_FORMAT_BGR16, NVCV_IMAGE_FORMAT_RGBA16, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, - { 33, 55, 4, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, - { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBA16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, - { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, - { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBS16, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, - { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS16, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, - { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, - { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, - { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRf16, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, - { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBf16, NVCV_IMAGE_FORMAT_RGBAf16, NVCV_COLOR_RGB2RGBA, 
NVCV_COLOR_RGBA2RGB, 0.0}, - { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRf16, NVCV_IMAGE_FORMAT_RGBAf16, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, - { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBf16, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, - { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAf16, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, - { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRS32, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, - { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBS32, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, - { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS32, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, - { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS32, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, - { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, - { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, - { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBf64, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, - { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRf64, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, - { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, - { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, - { 23, 21, 63, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_GRAY2BGR, NVCV_COLOR_BGR2GRAY, 0.0}, - { 402, 202, 5, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_GRAY2RGB, NVCV_COLOR_RGB2GRAY, 0.0}, - { 32, 21, 4, NVCV_IMAGE_FORMAT_Y16, NVCV_IMAGE_FORMAT_BGR16, NVCV_COLOR_GRAY2BGR, NVCV_COLOR_BGR2GRAY, 0.0}, - { 54, 66, 5, NVCV_IMAGE_FORMAT_Y16, NVCV_IMAGE_FORMAT_RGB16, NVCV_COLOR_GRAY2RGB, NVCV_COLOR_RGB2GRAY, 
0.0}, - { 64, 21, 3, NVCV_IMAGE_FORMAT_Yf32, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_GRAY2BGR, NVCV_COLOR_BGR2GRAY, 1E-4}, - { 121, 66, 5, NVCV_IMAGE_FORMAT_Yf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_GRAY2RGB, NVCV_COLOR_RGB2GRAY, 1E-4}, - { 129, 61, 4, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, - { 63, 31, 3, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, - { 42, 111, 2, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, - { 21, 72, 2, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, - { 23, 31, 3, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, + // W, H, N, Input Format, Output Format, Convert Code (-->), Convert Code (<--), maxDiff + { 176, 113, 1, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, + { 336, 432, 2, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_RGBA8, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBA8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, + { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRS8, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, + { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBS8, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS8, NVCV_IMAGE_FORMAT_RGBAS8, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS8, NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS8, 
NVCV_IMAGE_FORMAT_BGRAS8, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, + { 176, 113, 1, NVCV_IMAGE_FORMAT_BGR16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, + { 336, 432, 2, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_RGBA16, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGR16, NVCV_IMAGE_FORMAT_RGBA16, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGB16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBA16, NVCV_IMAGE_FORMAT_BGRA16, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, + { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, + { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBS16, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS16, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS16, NVCV_IMAGE_FORMAT_BGRAS16, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRf16, NVCV_IMAGE_FORMAT_RGBf16, NVCV_COLOR_BGR2RGB, NVCV_COLOR_RGB2BGR, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAf16, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, + { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRS32, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, + { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBS32, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRS32, NVCV_IMAGE_FORMAT_RGBAS32, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBS32, NVCV_IMAGE_FORMAT_BGRAS32, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAS32, 
NVCV_IMAGE_FORMAT_BGRAS32, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, + { 176, 113, 1, NVCV_IMAGE_FORMAT_BGRf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, + { 336, 432, 2, NVCV_IMAGE_FORMAT_RGBf64, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, + { 77, 212, 3, NVCV_IMAGE_FORMAT_BGRf64, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, + { 33, 55, 4, NVCV_IMAGE_FORMAT_RGBf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, + { 123, 321, 5, NVCV_IMAGE_FORMAT_RGBAf64, NVCV_IMAGE_FORMAT_BGRAf64, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, + { 23, 21, 63, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_GRAY2BGR, NVCV_COLOR_BGR2GRAY, 0.0}, + { 402, 202, 5, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_GRAY2RGB, NVCV_COLOR_RGB2GRAY, 0.0}, + { 32, 21, 4, NVCV_IMAGE_FORMAT_Y16, NVCV_IMAGE_FORMAT_BGR16, NVCV_COLOR_GRAY2BGR, NVCV_COLOR_BGR2GRAY, 0.0}, + { 54, 66, 5, NVCV_IMAGE_FORMAT_Y16, NVCV_IMAGE_FORMAT_RGB16, NVCV_COLOR_GRAY2RGB, NVCV_COLOR_RGB2GRAY, 0.0}, + { 64, 21, 3, NVCV_IMAGE_FORMAT_Yf32, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_GRAY2BGR, NVCV_COLOR_BGR2GRAY, 1E-4}, + { 121, 66, 5, NVCV_IMAGE_FORMAT_Yf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_GRAY2RGB, NVCV_COLOR_RGB2GRAY, 1E-4}, + { 129, 61, 4, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_BGR2BGRA, NVCV_COLOR_BGRA2BGR, 0.0}, + { 63, 31, 3, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_COLOR_RGB2RGBA, NVCV_COLOR_RGBA2RGB, 0.0}, + { 42, 111, 2, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_COLOR_BGR2RGBA, NVCV_COLOR_RGBA2BGR, 0.0}, + { 21, 72, 2, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_RGB2BGRA, NVCV_COLOR_BGRA2RGB, 0.0}, + { 23, 31, 3, NVCV_IMAGE_FORMAT_RGBAf32, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_RGBA2BGRA, NVCV_COLOR_BGRA2RGBA, 0.0}, // Codes 9 to 39 are not implemented - { 55, 257, 4, 
NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV, NVCV_COLOR_HSV2BGR, 5.0}, - { 366, 14, 5, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_RGB2HSV, NVCV_COLOR_HSV2RGB, 5.0}, - { 55, 257, 4, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_HSVf32, NVCV_COLOR_BGR2HSV, NVCV_COLOR_HSV2BGR, 1E-2}, - { 366, 14, 5, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_HSVf32, NVCV_COLOR_RGB2HSV, NVCV_COLOR_HSV2RGB, 1E-2}, + { 55, 257, 4, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV, NVCV_COLOR_HSV2BGR, 5.0}, + { 366, 14, 5, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_RGB2HSV, NVCV_COLOR_HSV2RGB, 5.0}, + { 55, 257, 4, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_HSVf32, NVCV_COLOR_BGR2HSV, NVCV_COLOR_HSV2BGR, 1E-2}, + { 366, 14, 5, NVCV_IMAGE_FORMAT_RGBf32, NVCV_IMAGE_FORMAT_HSVf32, NVCV_COLOR_RGB2HSV, NVCV_COLOR_HSV2RGB, 1E-2}, // Codes 42 to 53 and 56 to 65 and 68 to 69 are not implemented - { 112, 157, 4, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV_FULL, NVCV_COLOR_HSV2BGR_FULL, 8.0}, - { 333, 13, 3, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_RGB2HSV_FULL, NVCV_COLOR_HSV2RGB_FULL, 8.0}, + { 112, 157, 4, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV_FULL, NVCV_COLOR_HSV2BGR_FULL, 8.0}, + { 333, 13, 3, NVCV_IMAGE_FORMAT_RGB8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_RGB2HSV_FULL, NVCV_COLOR_HSV2RGB_FULL, 8.0}, // Codes 72 to 81 are not implemented - { 133, 22, 2, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR, NVCV_COLOR_BGR2YUV, 128.0}, - { 123, 21, 3, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 128.0}, - { 133, 21, 3, NVCV_IMAGE_FORMAT_YUV16, NVCV_IMAGE_FORMAT_BGR16, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 32768.0}, - { 123, 21, 3, NVCV_IMAGE_FORMAT_YUV16, NVCV_IMAGE_FORMAT_RGB16, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 32768.0}, - { 133, 21, 3, NVCV_IMAGE_FORMAT_YUVf32, NVCV_IMAGE_FORMAT_BGRf32, 
NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 1E-2}, - { 123, 21, 3, NVCV_IMAGE_FORMAT_YUVf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 1E-2}, + { 133, 22, 2, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR, NVCV_COLOR_BGR2YUV, 128.0}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_RGB8, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 128.0}, + { 133, 21, 3, NVCV_IMAGE_FORMAT_YUV16, NVCV_IMAGE_FORMAT_BGR16, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 32768.0}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_YUV16, NVCV_IMAGE_FORMAT_RGB16, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 32768.0}, + { 133, 21, 3, NVCV_IMAGE_FORMAT_YUVf32, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 1E-2}, + { 123, 21, 3, NVCV_IMAGE_FORMAT_YUVf32, NVCV_IMAGE_FORMAT_RGBf32, NVCV_COLOR_YUV2RGB, NVCV_COLOR_RGB2YUV, 1E-2}, // Codes 86 to 89 are not implemented // Codes 90 to 147 dealing with subsampled planes (NV12, etc. formats) are postponed (see comment below) // Codes 109, 110, 113, 114 dealing with VYUY format are not implemented @@ -174,22 +815,24 @@ test::ValueList(); - int height = GetParamValue<1>(); - int batches = GetParamValue<2>(); - - nvcv::ImageFormat srcFormat{GetParamValue<3>()}; - nvcv::ImageFormat dstFormat{GetParamValue<4>()}; - - NVCVDataType nvcvDataType; - ASSERT_EQ(NVCV_SUCCESS, nvcvImageFormatGetPlaneDataType(srcFormat, 0, &nvcvDataType)); - - NVCVColorConversionCode src2dstCode{GetParamValue<5>()}; - NVCVColorConversionCode dst2srcCode{GetParamValue<6>()}; - - double maxDiff{GetParamValue<7>()}; - - nvcv::Tensor srcTensor = nvcv::util::CreateTensor(batches, width, height, srcFormat); - nvcv::Tensor dstTensor = nvcv::util::CreateTensor(batches, width, height, dstFormat); - - auto srcData = srcTensor.exportData(); - auto dstData = dstTensor.exportData(); - - ASSERT_NE(srcData, nullptr); - ASSERT_NE(dstData, nullptr); - - auto srcAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*srcData); - ASSERT_TRUE(srcAccess); - - 
auto dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*dstData); - ASSERT_TRUE(dstAccess); - - long srcSampleStride = srcAccess->sampleStride(); - - if (srcData->rank() == 3) - { - srcSampleStride = srcAccess->numRows() * srcAccess->rowStride(); - } - - long srcBufSize = srcSampleStride * srcAccess->numSamples(); - - std::vector srcVec(srcBufSize); - std::default_random_engine randEng(0); - switch (nvcvDataType) - { - case NVCV_DATA_TYPE_F32: - case NVCV_DATA_TYPE_2F32: - case NVCV_DATA_TYPE_3F32: - case NVCV_DATA_TYPE_4F32: - myGenerate(reinterpret_cast(srcVec.data()), srcVec.size() / sizeof(float), randEng); - break; - default: - myGenerate(reinterpret_cast(srcVec.data()), srcVec.size(), randEng); - break; - } - - // copy random input to device - ASSERT_EQ(cudaSuccess, cudaMemcpy(srcData->basePtr(), srcVec.data(), srcBufSize, cudaMemcpyHostToDevice)); - - cudaStream_t stream; - ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - - // run operator - cvcuda::CvtColor cvtColorOp; - - EXPECT_NO_THROW(cvtColorOp(stream, srcTensor, dstTensor, src2dstCode)); - - EXPECT_NO_THROW(cvtColorOp(stream, dstTensor, srcTensor, dst2srcCode)); - - ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); - ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); - - std::vector testVec(srcBufSize); - - // copy output back to host - ASSERT_EQ(cudaSuccess, cudaMemcpy(testVec.data(), srcData->basePtr(), srcBufSize, cudaMemcpyDeviceToHost)); - - switch (nvcvDataType) - { - case NVCV_DATA_TYPE_F32: - case NVCV_DATA_TYPE_2F32: - case NVCV_DATA_TYPE_3F32: - case NVCV_DATA_TYPE_4F32: - VEC_EXPECT_NEAR(testVec, srcVec, maxDiff, float); - break; - default: - VEC_EXPECT_NEAR(testVec, srcVec, maxDiff, uint8_t); - break; - } -} - -TEST_P(OpCvtColor, varshape_correct_output) +TEST_P(OpCvtColor_circular, varshape_correct_output) { cudaStream_t stream; ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); @@ -325,12 +875,14 @@ TEST_P(OpCvtColor, varshape_correct_output) nvcv::ImageFormat 
srcFormat{GetParamValue<3>()}; nvcv::ImageFormat dstFormat{GetParamValue<4>()}; - // Waive the formats that have subsampled planes - if (srcFormat.chromaSubsampling() != nvcv::ChromaSubsampling::CSS_444 - || dstFormat.chromaSubsampling() != nvcv::ChromaSubsampling::CSS_444) + // clang-format off + // Waive the formats that have subsampled planes. + if (srcFormat.chromaSubsampling() != nvcv::ChromaSubsampling::CSS_444 || + dstFormat.chromaSubsampling() != nvcv::ChromaSubsampling::CSS_444) { GTEST_SKIP() << "Waived the formats that have subsampled planes for OpCvtColor varshape test"; } + // clang-format on NVCVDataType nvcvDataType; ASSERT_EQ(NVCV_SUCCESS, nvcvImageFormatGetPlaneDataType(srcFormat, 0, &nvcvDataType)); @@ -366,10 +918,10 @@ TEST_P(OpCvtColor, varshape_correct_output) case NVCV_DATA_TYPE_2F32: case NVCV_DATA_TYPE_3F32: case NVCV_DATA_TYPE_4F32: - myGenerate(reinterpret_cast(srcVec[i].data()), srcVec[i].size() / sizeof(float), rng); + generateRandVec(reinterpret_cast(srcVec[i].data()), srcVec[i].size() / sizeof(float), rng); break; default: - myGenerate(reinterpret_cast(srcVec[i].data()), srcVec[i].size(), rng); + generateRandVec(reinterpret_cast(srcVec[i].data()), srcVec[i].size(), rng); break; } @@ -447,28 +999,29 @@ TEST(OpCvtColor_negative, create_with_null_handle) NVCV_TEST_SUITE_P(OpCvtColor_negative, test::ValueList { - // W, H, N, inputFormat, outputFormat, in2outCode - { 8, 8, 3, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2BGRA}, // invalid input channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_BGR2BGRA}, // mismatch data type - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_BGR2BGRA}, // invalid output channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_GRAY2BGR}, // invalid input channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_GRAY2BGR}, // mismatch data type - { 8, 8, 3, NVCV_IMAGE_FORMAT_Y8, 
NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_GRAY2BGR}, // invalid output channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_BGR2GRAY}, // invalid input channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_BGR2GRAY}, // mismatch data type - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2GRAY}, // invalid output channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_YUV8, NVCV_COLOR_BGR2YUV,}, // invalid input channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_YUV8, NVCV_COLOR_BGR2YUV}, // mismatch data type - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2YUV}, // invalid output channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR,}, // invalid input channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_YUV2BGR}, // mismatch data type - { 8, 8, 3, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGR}, // invalid output channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV}, // invalid input channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV}, // mismatch data type - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2HSV}, // invalid output channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_HSV2BGR}, // invalid input channel - { 8, 8, 3, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_HSV2BGR}, // mismatch data type - { 8, 8, 3, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_HSV2BGR}, // invalid output channel + // W, H, N, Input Format, Output Format, Conversion Code + { 8, 8, 3, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2BGRA}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_BGR2BGRA}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, 
NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_BGR2BGRA}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_GRAY2BGR}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_GRAY2BGR}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_GRAY2BGR}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_BGR2GRAY}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_BGR2GRAY}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf16, NVCV_IMAGE_FORMAT_BGRAf16, NVCV_COLOR_BGR2BGRA}, // f16 type not allowed to add alpha + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2GRAY}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_YUV8, NVCV_COLOR_BGR2YUV}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_YUV8, NVCV_COLOR_BGR2YUV}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2YUV}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_YUV2BGR}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGR}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2HSV}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_HSV2BGR}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_HSV2BGR}, 
// mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_HSV2BGR}, // invalid output channel }); // clang-format on @@ -484,8 +1037,8 @@ TEST_P(OpCvtColor_negative, invalid_input) NVCVColorConversionCode src2dstCode{GetParamValue<5>()}; - nvcv::Tensor srcTensor = nvcv::util::CreateTensor(batches, width, height, srcFormat); - nvcv::Tensor dstTensor = nvcv::util::CreateTensor(batches, width, height, dstFormat); + nvcv::Tensor srcTensor = util::CreateTensor(batches, width, height, srcFormat); + nvcv::Tensor dstTensor = util::CreateTensor(batches, width, height, dstFormat); // run operator cvcuda::CvtColor cvtColorOp; diff --git a/tests/cvcuda/system/TestOpMorphology.cpp b/tests/cvcuda/system/TestOpMorphology.cpp index 8219b87b..d1aecd8e 100644 --- a/tests/cvcuda/system/TestOpMorphology.cpp +++ b/tests/cvcuda/system/TestOpMorphology.cpp @@ -428,6 +428,7 @@ NVCV_TEST_SUITE_P(OpMorphology, test::ValueListbasePtr(), outBufSize, cudaMemcpyDeviceToHost)); // generate gold result + if (maskSize.w == -1 || maskSize.h == -1) + { + maskSize.w = 3; + maskSize.h = 3; + } int2 kernelAnchor{maskSize.w / 2, maskSize.h / 2}; hostMorph(goldVec, outStrides, inVec, inStrides, shape, format, maskSize, kernelAnchor, iteration, borderMode, morphType); @@ -595,8 +616,23 @@ NVCV_TEST_SUITE_P(OpMorphologyVarShape, test::ValueList 1 && null workspace + std::vector testSet1{NVCV_DILATE, NVCV_ERODE}; + for (auto morphType : testSet1) + { + EXPECT_THROW(morphOp(nullptr, inTensor, outTensor, nvcv::NullOpt, morphType, maskSize, anchor, 2, borderMode), + nvcv::Exception); + } + + // testSet2: NVCV_CLOSE and NVCV_OPEN && null workspace + std::vector testSet2{NVCV_CLOSE, NVCV_OPEN}; + for (auto morphType : testSet2) + { + EXPECT_THROW(morphOp(nullptr, inTensor, outTensor, nvcv::NullOpt, morphType, maskSize, anchor, 1, borderMode), + nvcv::Exception); + } + + // testSet3: invalid data type + { + nvcv::Tensor inTensorInvalid + = nvcv::util::CreateTensor(1, 24, 24, 
nvcv::ImageFormat{NVCV_IMAGE_FORMAT_RGBAf16}); + nvcv::Tensor outTensorInvalid + = nvcv::util::CreateTensor(1, 24, 24, nvcv::ImageFormat{NVCV_IMAGE_FORMAT_RGBAf16}); + EXPECT_THROW(morphOp(nullptr, inTensorInvalid, outTensorInvalid, nvcv::NullOpt, NVCV_ERODE, maskSize, anchor, 0, + borderMode), + nvcv::Exception); + } + + // testSet4: input format is not equal to output format + { + nvcv::Tensor outTensorInvalid = nvcv::util::CreateTensor(2, 24, 24, format); + EXPECT_THROW( + morphOp(nullptr, inTensor, outTensorInvalid, nvcv::NullOpt, NVCV_ERODE, maskSize, anchor, 0, borderMode), + nvcv::Exception); + } +} + +TEST(OpMorphology_Negative, operator_varshape) +{ + NVCVBorderType borderMode = NVCV_BORDER_CONSTANT; + nvcv::ImageFormat format{NVCV_IMAGE_FORMAT_U8}; + const int batches = 2; + + std::vector imgSrc; + nvcv::ImageBatchVarShape batchSrc(batches); + for (int i = 0; i < batches; ++i) + { + imgSrc.emplace_back(nvcv::Size2D{24, 24}, format); + } + batchSrc.pushBack(imgSrc.begin(), imgSrc.end()); + + std::vector imgDst; + std::vector imgWorkspace; + nvcv::ImageBatchVarShape batchDst(batches); + nvcv::ImageBatchVarShape batchWorkspace(batches); + for (int i = 0; i < batches; ++i) + { + imgDst.emplace_back(imgSrc[i].size(), imgSrc[i].format()); + imgWorkspace.emplace_back(imgSrc[i].size(), imgSrc[i].format()); + } + batchDst.pushBack(imgDst.begin(), imgDst.end()); + batchWorkspace.pushBack(imgWorkspace.begin(), imgWorkspace.end()); + + // Create kernel mask size tensor + nvcv::Tensor maskTensor({{batches}, "N"}, nvcv::TYPE_2S32); + { + auto dev = maskTensor.exportData(); + ASSERT_NE(dev, nullptr); + + std::vector vec(batches, int2{1, 1}); + + ASSERT_EQ(cudaSuccess, + cudaMemcpy(dev->basePtr(), vec.data(), vec.size() * sizeof(int2), cudaMemcpyHostToDevice)); + } + + // Create Anchor tensor + nvcv::Tensor anchorTensor({{batches}, "N"}, nvcv::TYPE_2S32); + { + auto dev = anchorTensor.exportData(); + ASSERT_NE(dev, nullptr); + + std::vector vec(batches, int2{0, 0}); + + 
ASSERT_EQ(cudaSuccess, + cudaMemcpy(dev->basePtr(), vec.data(), vec.size() * sizeof(int2), cudaMemcpyHostToDevice)); + } + + cvcuda::Morphology morphOp; + + // testSet0: iteration < 0 + EXPECT_THROW( + morphOp(nullptr, batchSrc, batchDst, batchWorkspace, NVCV_ERODE, maskTensor, anchorTensor, -1, borderMode), + nvcv::Exception); + + // testSet1: NVCV_DILATE and NVCV_ERODE && iteration > 1 && null workspace + std::vector testSet1{NVCV_DILATE, NVCV_ERODE}; + for (auto morphType : testSet1) + { + EXPECT_THROW( + morphOp(nullptr, batchSrc, batchDst, nvcv::NullOpt, morphType, maskTensor, anchorTensor, 2, borderMode), + nvcv::Exception); + } + + // testSet2: NVCV_CLOSE and NVCV_OPEN && null workspace + std::vector testSet2{NVCV_CLOSE, NVCV_OPEN}; + for (auto morphType : testSet2) + { + EXPECT_THROW( + morphOp(nullptr, batchSrc, batchDst, nvcv::NullOpt, morphType, maskTensor, anchorTensor, 1, borderMode), + nvcv::Exception); + } + + // testSet3: invalid data type + { + nvcv::ImageFormat formatInvalid{NVCV_IMAGE_FORMAT_RGBAf16}; + std::vector imgSrcInvalid; + nvcv::ImageBatchVarShape batchSrcInvalid(batches); + for (int i = 0; i < batches; ++i) + { + imgSrcInvalid.emplace_back(nvcv::Size2D{24, 24}, formatInvalid); + } + batchSrcInvalid.pushBack(imgSrcInvalid.begin(), imgSrcInvalid.end()); + + std::vector imgDstInvalid; + std::vector imgWorkspaceInvalid; + nvcv::ImageBatchVarShape batchDstInvalid(batches); + nvcv::ImageBatchVarShape batchWorkspaceInvalid(batches); + for (int i = 0; i < batches; ++i) + { + imgDstInvalid.emplace_back(imgSrcInvalid[i].size(), imgSrcInvalid[i].format()); + imgWorkspaceInvalid.emplace_back(imgSrcInvalid[i].size(), imgSrcInvalid[i].format()); + } + batchDstInvalid.pushBack(imgDstInvalid.begin(), imgDstInvalid.end()); + batchWorkspaceInvalid.pushBack(imgWorkspaceInvalid.begin(), imgWorkspaceInvalid.end()); + + EXPECT_THROW(morphOp(nullptr, batchSrcInvalid, batchDstInvalid, batchWorkspaceInvalid, NVCV_ERODE, maskTensor, + anchorTensor, 1, 
borderMode), + nvcv::Exception); + } + + // testSet4: input format is not equal to output format + { + std::vector imgDstInvalid; + std::vector imgWorkspaceInvalid; + nvcv::ImageBatchVarShape batchDstInvalid(1); + nvcv::ImageBatchVarShape batchWorkspaceInvalid(1); + imgDstInvalid.emplace_back(imgSrc[0].size(), imgSrc[0].format()); + imgWorkspaceInvalid.emplace_back(imgSrc[0].size(), imgSrc[0].format()); + batchDstInvalid.pushBack(imgDstInvalid.begin(), imgDstInvalid.end()); + batchWorkspaceInvalid.pushBack(imgWorkspaceInvalid.begin(), imgWorkspaceInvalid.end()); + + EXPECT_THROW(morphOp(nullptr, batchSrc, batchDstInvalid, batchWorkspaceInvalid, NVCV_ERODE, maskTensor, + anchorTensor, 1, borderMode), + nvcv::Exception); + } +} diff --git a/tests/cvcuda/system/TestUtils.cpp b/tests/cvcuda/system/TestUtils.cpp new file mode 100644 index 00000000..07199806 --- /dev/null +++ b/tests/cvcuda/system/TestUtils.cpp @@ -0,0 +1,332 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TestUtils.hpp" + +#include "Definitions.hpp" + +#include + +namespace cuda = nvcv::cuda; + +using std::vector; + +//-==================================================================================================================-// +// Generate an random image image vector. 
+template +void generateRandVec(T *dst, size_t size, RandEng &eng) +{ + RandInt rand(0, cuda::TypeTraits::max); + + // clang-format off + for (size_t i = 0; i < size; i++) dst[i] = rand(eng); + // clang-format on +} + +template<> +void generateRandVec(float *dst, size_t size, RandEng &eng) +{ + RandFlt rand(0.0f, 1.0f); + + // clang-format off + for (size_t i = 0; i < size; i++) dst[i] = rand(eng); + // clang-format on +} + +template<> +void generateRandVec(double *dst, size_t size, RandEng &eng) +{ + RandFlt rand(0.0, 1.0); + + // clang-format off + for (size_t i = 0; i < size; i++) dst[i] = rand(eng); + // clang-format on +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_RAND_VEC(T) template void generateRandVec(T *, size_t, RandEng &) + +MAKE_RAND_VEC(uint8_t); +MAKE_RAND_VEC(uint16_t); +MAKE_RAND_VEC(int32_t); +MAKE_RAND_VEC(float); +MAKE_RAND_VEC(double); + +#undef MAKE_RAND_VEC + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +template +void generateRandTestRGB(T *dst, size_t size, RandEng &eng, bool rgba, bool bga) +{ + constexpr T max = std::is_floating_point_v ? 
1 : cuda::TypeTraits::max; + constexpr T val[3] = {0, max / 2, max}; + + const size_t minSize = 3 * 3 * 3 * (3 + rgba); + + generateRandVec(dst, size, eng); + + if (size > minSize) + { + size_t idx = 0; + + for (uint r = 0; r < 3; r++) + { + const T red = val[r]; + + for (uint g = 0; g < 3; g++) + { + const T grn = val[g]; + + for (uint b = 0; b < 3; b++) + { + const T blu = val[b]; + + // clang-format off + if (bga) { dst[idx++] = blu; dst[idx++] = grn; dst[idx++] = red; } + else { dst[idx++] = red; dst[idx++] = grn; dst[idx++] = blu; } + if (rgba) dst[idx++] = max; + // clang-format on + } + } + } + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_RAND_RGB_TEST(T) template void generateRandTestRGB(T *, size_t, RandEng &, bool, bool) + +MAKE_RAND_RGB_TEST(uint8_t); +MAKE_RAND_RGB_TEST(uint16_t); +MAKE_RAND_RGB_TEST(int32_t); +MAKE_RAND_RGB_TEST(float); +MAKE_RAND_RGB_TEST(double); + +#undef MAKE_RAND_RGB_TEST + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +// Generate an image potentially containing all 16,777,216 RGB8 colors, assuming the image / tensor dimensions are +// sufficiently large (http://www.brucelindbloom.com/downloads/RGB16Million.png); otherwise the image is cropped to the +// provided sizes. Generates consecutive 256 x 256 image blocks where, within each block, red varies from 0 to 255 +// horizontally and green varies from 0 to 255 vertically. Blue increments from 0 to 255 in consecutive blocks; partial +// blocks (i.e., those that may be cropped to specified dimensions) still increment the blue value. All values are then +// rescaled to fit the data type--e.g., floating point types are rescaled to be between 0 and 1. 
+// To get all 16,777,216 8-bit RGB colors, generate a single image (i.e., tensor batch = 1) of size 1 x 4096 x 4096, +// or generate a tensor of 4 x 2048 x 2048, 16 x 1024 x 1024, 64 x 512 x 512, or 256 x 256 x 256. +// Note: generates interleaved (non-planar) data. +template +void generateAllRGB(T *dst, uint wdth, uint hght, uint num, bool rgba, bool bga) +{ + constexpr T max = std::is_floating_point_v ? 1 : cuda::TypeTraits::max; + constexpr double round = std::is_floating_point_v ? 0 : 0.5; + constexpr double scale = (double)max / 255.0; + + const size_t incrH = wdth * (3 + rgba); + const size_t incrN = hght * incrH; + + uint addB = 0; + + for (uint i = 0; i < num; i++) + { + T *img = dst + i * incrN; + + for (uint y = 0; y < hght; y++) + { + T *row = img + y * incrH; + + uint8_t grn = static_cast(y & 255); + + for (uint x = 0; x < wdth; x++) + { + uint8_t red = static_cast(x & 255); + uint8_t blu = static_cast(((x >> 8) + addB) & 255); + + // clang-format off + if (bga) std::swap(red, blu); + *row++ = static_cast(red * scale + round); + *row++ = static_cast(grn * scale + round); + *row++ = static_cast(blu * scale + round); + if (rgba) *row++ = max; + // clang-format on + } + // clang-format off + if (grn == 255) addB += ((wdth + 255) >> 8); + // clang-format on + } + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define MAKE_ALL_RGB_TEST(T) template void generateAllRGB(T *, uint, uint, uint, bool rgba, bool bga) + +MAKE_ALL_RGB_TEST(uint8_t); +MAKE_ALL_RGB_TEST(uint16_t); +MAKE_ALL_RGB_TEST(int32_t); +MAKE_ALL_RGB_TEST(float); +MAKE_ALL_RGB_TEST(double); + +#undef MAKE_ALL_RGB_TEST + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +// Generate a random HSV (Hue-Saturation-Value) 
image where the Hue range can be specified and the Saturation and Value +// ranges are scaled according to the data type. Since Hue is circular, it can be useful to generate Hue values outside +// the standard range (e.g., min to test if a function that processes HSV images properly accounts for wrap-around Hue values. +// Note: generates interleaved (non-planar) data. +template +void generateRandHSV(T *dst, size_t size, RandEng &eng, double minHueMult, double maxHueMult) +{ + ASSERT_EQ(size % 3, 0); + + constexpr T max = std::is_floating_point_v ? 1 : cuda::TypeTraits::max; + constexpr uint range = (sizeof(T) > 1) ? 360 : (FullRange ? 256 : 180); + constexpr double scale = (double)range / 360.0; + constexpr double round = std::is_floating_point_v ? 0 : 0.5; + + // clang-format off + if (minHueMult > 1.0) minHueMult = 0.0; + if (maxHueMult < 0.0) maxHueMult = 1.0; + // clang-format on + + double minHue = minHueMult * range; + double maxHue = maxHueMult * range; + + RandFlt randHue(minHue, maxHue); + RandFlt randSV(0.0, 1.0); + + for (size_t i = 0; i < size; i += 3) + { + // clang-format off + *dst++ = static_cast(randHue(eng) * scale + round); + *dst++ = static_cast(randSV (eng) * max + round); + *dst++ = static_cast(randSV (eng) * max + round); + // clang-format on + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +// Restricted range hue (FullRange = false): values between [0-180). Applies only to uint8_t, but still need to +// instantiate all the types. 
+#define MAKE_RAND_HSV_TEST(T) template void generateRandHSV(T *, size_t, RandEng &, double, double) + +MAKE_RAND_HSV_TEST(uint8_t); +MAKE_RAND_HSV_TEST(uint16_t); +MAKE_RAND_HSV_TEST(int32_t); +MAKE_RAND_HSV_TEST(float); +MAKE_RAND_HSV_TEST(double); + +#undef MAKE_RAND_HSV_TEST + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +// Full range hue (FullRange = false): values between [0-256). Applies only to uint8_t, but still need to +// instantiate all the types. +#define MAKE_RAND_HSV_TEST(T) template void generateRandHSV(T *, size_t, RandEng &, double, double) + +MAKE_RAND_HSV_TEST(uint8_t); +MAKE_RAND_HSV_TEST(uint16_t); +MAKE_RAND_HSV_TEST(int32_t); +MAKE_RAND_HSV_TEST(float); +MAKE_RAND_HSV_TEST(double); + +#undef MAKE_RAND_HSV_TEST + +//--------------------------------------------------------------------------------------------------------------------// + +//-==================================================================================================================-// +// Generate an HSV (Hue-Saturation-Value) image containing blocks of size H_range x 256 where H_range is either: +// * 360 (for size(T) > 1), +// * 255 (for size(T) == 1 and FullRange == true), or +// * 180 (for size(T) == 1 and FullRange == false). +// Within each block, H (Hue) varies from 0 to H_range-1 horizontally and S (Saturation) varies from 0 to 255 vertically. +// V (Value) increments from 0 to 255 in consecutive blocks. The values for S and V are normalized (i.e., rescaled) +// according to the data type. +// To get all available HSV values, generate a single image (i.e., tensor batch = 1) of size 1 x (16*H_range) x 4096, +// or a tensor of 4 x (8*H_range) x 2048, 16 x (4*H_range) x 1024, 64 x (2*H_range) x 512, or 256 x H_range x 256. +// Note: generates interleaved (non-planar) data. +template +void generateAllHSV(T *dst, uint wdth, uint hght, uint num) +{ + constexpr T max = std::is_floating_point_v ? 
1 : cuda::TypeTraits::max; + constexpr uint range = (sizeof(T) > 1) ? 360 : (FullRange ? 256 : 180); + constexpr double scale = (double)range / 360.0; + constexpr double norm = (double)max / 255.0; + constexpr double round = std::is_floating_point_v ? 0 : 0.5; + + constexpr uint stepV = 1; // Step size for V (value) from one block to the next. 17 is prime, so 256 % (17 * m) will + // always be unique for 0 <= m < 256. + const size_t incrH = wdth * 3; + const size_t incrN = hght * incrH; + + uint addV = 0; + + for (uint i = 0; i < num; i++) + { + T *img = dst + i * incrN; + + for (uint y = 0; y < hght; y++) + { + T *row = img + y * incrH; + + uint8_t S = static_cast(y & 255); + + // clang-format off + for (uint x = 0; x < wdth; x++) + { + uint8_t H = static_cast(x % range); + uint8_t V = static_cast((((uint)(x / range) + addV) * stepV) & 255); + + *row++ = static_cast(H * scale + round); + *row++ = static_cast(S * norm + round); + *row++ = static_cast(V * norm + round); + } + if (S == 255) addV += ((wdth + range - 1) / range); + // clang-format on + } + } +} + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +// Restricted range hue (FullRange = false): values between [0-180). Applies only to uint8_t, but still need to +// instantiate all the types. +#define MAKE_ALL_HSV_TEST(T) template void generateAllHSV(T *, uint, uint, uint) + +MAKE_ALL_HSV_TEST(uint8_t); +MAKE_ALL_HSV_TEST(uint16_t); +MAKE_ALL_HSV_TEST(int32_t); +MAKE_ALL_HSV_TEST(float); +MAKE_ALL_HSV_TEST(double); + +#undef MAKE_ALL_HSV_TEST + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +// Full range hue (FullRange = false): values between [0-256). Applies only to uint8_t, but still need to +// instantiate all the types. 
+#define MAKE_ALL_HSV_TEST(T) template void generateAllHSV(T *, uint, uint, uint) + +MAKE_ALL_HSV_TEST(uint8_t); +MAKE_ALL_HSV_TEST(uint16_t); +MAKE_ALL_HSV_TEST(int32_t); +MAKE_ALL_HSV_TEST(float); +MAKE_ALL_HSV_TEST(double); + +#undef MAKE_ALL_HSV_TEST + +//--------------------------------------------------------------------------------------------------------------------// diff --git a/tests/cvcuda/system/TestUtils.hpp b/tests/cvcuda/system/TestUtils.hpp new file mode 100644 index 00000000..6915e1db --- /dev/null +++ b/tests/cvcuda/system/TestUtils.hpp @@ -0,0 +1,187 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef NVCV_TEST_COMMON_UTILS_HPP +#define NVCV_TEST_COMMON_UTILS_HPP + +#include +#include + +using RandEng = std::default_random_engine; + +template +using RandInt = std::uniform_int_distribution; + +template +using RandFlt = std::uniform_real_distribution; + +//--------------------------------------------------------------------------------------------------------------------// +template +void generateRandVec(T *dst, size_t size, RandEng &eng); + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +inline void generateRandVec(std::vector &dst, RandEng &eng) +{ + generateRandVec(dst.data(), dst.size(), eng); +} + +//--------------------------------------------------------------------------------------------------------------------// +template +void generateRandTestRGB(T *dst, size_t size, RandEng &eng, bool rgba = false, bool bga = false); + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +inline void generateRandTestRGB(std::vector &dst, RandEng &eng, bool rgba = false, bool bga = false) +{ + generateRandTestRGB(dst.data(), dst.size(), eng, rgba, bga); +} + +//--------------------------------------------------------------------------------------------------------------------// +template +void generateAllRGB(T *dst, uint wdth, uint hght, uint num, bool rgba = false, bool bga = false); + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +inline void generateAllRGB(std::vector &dst, uint wdth, uint hght, uint num, bool rgba = false, bool bga = false) +{ + ASSERT_GE(dst.size(), (size_t)num * (size_t)hght * (size_t)wdth * (size_t)(3 + rgba)); + generateAllRGB(dst.data(), wdth, hght, num, rgba, bga); +} + 
+//--------------------------------------------------------------------------------------------------------------------// +template +void generateRandHSV(T *dst, size_t size, RandEng &eng, double minHueMult = 0.0, double maxHueMult = 1.0); + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +inline void generateRandHSV(std::vector &dst, RandEng &eng, double minHueMult = 0.0, double maxHueMult = 1.0) +{ + generateRandHSV(dst.data(), dst.size(), eng, minHueMult, maxHueMult); +} + +//--------------------------------------------------------------------------------------------------------------------// +template +void generateAllHSV(T *dst, uint wdth, uint hght, uint num); + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +template +inline void generateAllHSV(std::vector &dst, uint wdth, uint hght, uint num) +{ + ASSERT_EQ(dst.size() % 3, 0); + ASSERT_GE(dst.size(), (size_t)num * (size_t)hght * (size_t)wdth * (size_t)3); + generateAllHSV(dst.data(), wdth, hght, num); +} + +//--------------------------------------------------------------------------------------------------------------------// + +// NOTE: the "do {" ... "} while (false)" statements in the macros below add scope context to multi-statement macro +// expansions so they can be nested inside non-scoped statements (e.g., "if", "for", etc. statements that don't +// have braces) and still be treated like a single statement that can be terminated with a semicolon (";"). +// For example, the "do-while" construct allows for: +// +// if () +// EXPECT_NEAR_VEC_CNT(vec1, vec2, maxDiff, maxCnt, passes); +// else +// std::cout << "Test condition not satisfied.\n"; +// +// without the problems that would otherwise occur from multi-statement macro expansion. 
+//--------------------------------------------------------------------------------------------------------------------// +#define EXPECT_NEAR_ARR_CNT(data1, data2, size, maxDiff, maxCnt, passes) \ + do \ + { \ + uint cnt = 0; \ + for (size_t i = 0; i < size && cnt < maxCnt; i++) \ + { \ + EXPECT_NEAR(data1[i], data2[i], maxDiff) << "At index " << i << " (error count = " << ++cnt << ")"; \ + } \ + passes = (cnt == 0); \ + } \ + while (false) + +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define EXPECT_NEAR_VEC_CNT(vec1, vec2, maxDiff, maxCnt, passes) \ + do \ + { \ + ASSERT_EQ(vec1.size(), vec2.size()); \ + EXPECT_NEAR_ARR_CNT(vec1.data(), vec2.data(), vec1.size(), maxDiff, maxCnt, passes); \ + } \ + while (false) + +//--------------------------------------------------------------------------------------------------------------------// + +//--------------------------------------------------------------------------------------------------------------------// +// clang-format off +#define EXPECT_NEAR_HSV_ARR_CNT(data1, data2, size, range, maxDiff, maxCnt, passes) \ + do \ + { \ + ASSERT_EQ(size % 3, 0); \ + uint cnt = 0; \ + double half = range * 0.5; \ + for (size_t i = 0; i < size && cnt < maxCnt; i += 3) \ + { \ + double val1 = static_cast(data1[i]); \ + double val2 = static_cast(data2[i]); \ + if (val2 >= val1 && val2 - val1 > half) \ + EXPECT_NEAR(data1[i] + range, data2[i], maxDiff) << "At index " << i \ + << " (error count = " << ++cnt << ")"; \ + else if (val1 - val2 > half) \ + EXPECT_NEAR(data1[i], data2[i] + range, maxDiff) << "At index " << i \ + << " (error count = " << ++cnt << ")"; \ + else \ + EXPECT_NEAR(data1[i], data2[i], maxDiff) << "At index " << i << " (error count = " << ++cnt << ")"; \ + EXPECT_NEAR(data1[i+1], data2[i+1], maxDiff) << "At index " << i+1 << " (error count = " << ++cnt << ")"; \ + EXPECT_NEAR(data1[i+2], data2[i+2], maxDiff) << "At index " << i+2 << " 
(error count = " << ++cnt << ")"; \ + } \ + passes = (cnt == 0); \ + } \ + while (false) + +// clang-format on +//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // +#define EXPECT_NEAR_HSV_VEC_CNT(vec1, vec2, range, maxDiff, maxCnt, passes) \ + do \ + { \ + ASSERT_EQ(vec1.size(), vec2.size()); \ + EXPECT_NEAR_HSV_ARR_CNT(vec1.data(), vec2.data(), vec1.size(), range, maxDiff, maxCnt, passes); \ + } \ + while (false) + +//--------------------------------------------------------------------------------------------------------------------// + +/* +FYI: gtest expands the following macro statement: + + EXPECT_NEAR(data1[i], data2[i], maxDiff) << "At index " << i << " (error count = " << ++cnt << ")"; + +to: + + switch (0) + case 0: + default: + if (const ::testing::AssertionResult gtest_ar + = ::testing::internal::DoubleNearPredFormat("refVec.data()[i]", "dstVec.data()[i]", "maxDiff", + refVec.data()[i] , dstVec.data()[i] , maxDiff)) ; + else + ::testing::internal::AssertHelper(::testing::TestPartResult::kNonFatalFailure, + __FILE__, __LINE__, gtest_ar.failure_message()) + = ::testing::Message() << "At index " << i << " (error count = " << ++cnt << ")"; + +The switch statement is to disambiguate the else clause if the macro is expanded in a nested if without braces. +*/ + +#endif // NVCV_TEST_COMMON_UTILS_HPP