From 5ec0b30018755f24e3c47be1ee6a33fb4eb8b19e Mon Sep 17 00:00:00 2001
From: Megamouse <studienricky89@googlemail.com>
Date: Fri, 19 Jul 2024 22:45:41 +0200
Subject: [PATCH] Fix compilation

---
 .../build/include/opencv2/calib3d.hpp         |  4131 +++
 .../build/include/opencv2/calib3d/calib3d.hpp |    48 +
 .../build/include/opencv2/calib3d/calib3d_c.h |   150 +
 .../opencv410/build/include/opencv2/core.hpp  |  3421 +++
 .../build/include/opencv2/core/affine.hpp     |   678 +
 .../build/include/opencv2/core/async.hpp      |   101 +
 .../build/include/opencv2/core/base.hpp       |   664 +
 .../include/opencv2/core/bindings_utils.hpp   |   357 +
 .../build/include/opencv2/core/bufferpool.hpp |    40 +
 .../build/include/opencv2/core/check.hpp      |   170 +
 .../build/include/opencv2/core/core.hpp       |    48 +
 .../build/include/opencv2/core/core_c.h       |  3128 ++
 .../build/include/opencv2/core/cuda.hpp       |  1337 +
 .../build/include/opencv2/core/cuda.inl.hpp   |   763 +
 .../build/include/opencv2/core/cuda/block.hpp |   211 +
 .../opencv2/core/cuda/border_interpolate.hpp  |   722 +
 .../build/include/opencv2/core/cuda/color.hpp |   309 +
 .../include/opencv2/core/cuda/common.hpp      |   131 +
 .../opencv2/core/cuda/datamov_utils.hpp       |   113 +
 .../opencv2/core/cuda/detail/color_detail.hpp |  2018 ++
 .../opencv2/core/cuda/detail/reduce.hpp       |   394 +
 .../core/cuda/detail/reduce_key_val.hpp       |   567 +
 .../core/cuda/detail/transform_detail.hpp     |   392 +
 .../core/cuda/detail/type_traits_detail.hpp   |   191 +
 .../core/cuda/detail/vec_distance_detail.hpp  |   121 +
 .../opencv2/core/cuda/dynamic_smem.hpp        |    88 +
 .../include/opencv2/core/cuda/emulation.hpp   |   269 +
 .../include/opencv2/core/cuda/filters.hpp     |   293 +
 .../include/opencv2/core/cuda/funcattrib.hpp  |    79 +
 .../include/opencv2/core/cuda/functional.hpp  |   805 +
 .../include/opencv2/core/cuda/limits.hpp      |   128 +
 .../include/opencv2/core/cuda/reduce.hpp      |   230 +
 .../opencv2/core/cuda/saturate_cast.hpp       |   292 +
 .../build/include/opencv2/core/cuda/scan.hpp  |   258 +
 .../opencv2/core/cuda/simd_functions.hpp      |   869 +
 .../include/opencv2/core/cuda/transform.hpp   |    75 +
 .../include/opencv2/core/cuda/type_traits.hpp |    90 +
 .../include/opencv2/core/cuda/utility.hpp     |   230 +
 .../opencv2/core/cuda/vec_distance.hpp        |   232 +
 .../include/opencv2/core/cuda/vec_math.hpp    |   923 +
 .../include/opencv2/core/cuda/vec_traits.hpp  |   288 +
 .../build/include/opencv2/core/cuda/warp.hpp  |   139 +
 .../include/opencv2/core/cuda/warp_reduce.hpp |    76 +
 .../opencv2/core/cuda/warp_shuffle.hpp        |   162 +
 .../opencv2/core/cuda_stream_accessor.hpp     |    86 +
 .../build/include/opencv2/core/cuda_types.hpp |   144 +
 .../include/opencv2/core/cv_cpu_dispatch.h    |   386 +
 .../include/opencv2/core/cv_cpu_helper.h      |   613 +
 .../build/include/opencv2/core/cvdef.h        |   932 +
 .../build/include/opencv2/core/cvstd.hpp      |   189 +
 .../build/include/opencv2/core/cvstd.inl.hpp  |   197 +
 .../include/opencv2/core/cvstd_wrapper.hpp    |   154 +
 .../opencv2/core/detail/async_promise.hpp     |    69 +
 .../core/detail/dispatch_helper.impl.hpp      |    49 +
 .../opencv2/core/detail/exception_ptr.hpp     |    21 +
 .../build/include/opencv2/core/directx.hpp    |   184 +
 .../include/opencv2/core/dualquaternion.hpp   |   979 +
 .../opencv2/core/dualquaternion.inl.hpp       |   487 +
 .../build/include/opencv2/core/eigen.hpp      |   403 +
 .../build/include/opencv2/core/fast_math.hpp  |   433 +
 .../build/include/opencv2/core/hal/hal.hpp    |   260 +
 .../include/opencv2/core/hal/interface.h      |   190 +
 .../build/include/opencv2/core/hal/intrin.hpp |  1256 +
 .../include/opencv2/core/hal/intrin_avx.hpp   |  3177 ++
 .../opencv2/core/hal/intrin_avx512.hpp        |  3090 ++
 .../include/opencv2/core/hal/intrin_cpp.hpp   |  3317 +++
 .../opencv2/core/hal/intrin_forward.hpp       |   191 +
 .../include/opencv2/core/hal/intrin_lasx.hpp  |  3024 ++
 .../include/opencv2/core/hal/intrin_lsx.hpp   |  2538 ++
 .../include/opencv2/core/hal/intrin_msa.hpp   |  1887 ++
 .../include/opencv2/core/hal/intrin_neon.hpp  |  2655 ++
 .../include/opencv2/core/hal/intrin_rvv.hpp   |  3345 +++
 .../opencv2/core/hal/intrin_rvv071.hpp        |  2899 ++
 .../hal/intrin_rvv_010_compat_non-policy.hpp  | 24395 ++++++++++++++++
 ...n_rvv_010_compat_overloaded-non-policy.hpp |   768 +
 .../core/hal/intrin_rvv_011_compat.hpp        |    33 +
 .../core/hal/intrin_rvv_compat_overloaded.hpp |   213 +
 .../opencv2/core/hal/intrin_rvv_scalable.hpp  |  2182 ++
 .../include/opencv2/core/hal/intrin_sse.hpp   |  3468 +++
 .../opencv2/core/hal/intrin_sse_em.hpp        |   180 +
 .../include/opencv2/core/hal/intrin_vsx.hpp   |  1608 +
 .../include/opencv2/core/hal/intrin_wasm.hpp  |  2783 ++
 .../include/opencv2/core/hal/msa_macros.h     |  1558 +
 .../opencv2/core/hal/simd_utils.impl.hpp      |   186 +
 .../build/include/opencv2/core/mat.hpp        |  3797 +++
 .../build/include/opencv2/core/mat.inl.hpp    |  3422 +++
 .../build/include/opencv2/core/matx.hpp       |   544 +
 .../build/include/opencv2/core/matx.inl.hpp   |  1115 +
 .../build/include/opencv2/core/neon_utils.hpp |   128 +
 .../build/include/opencv2/core/ocl.hpp        |   923 +
 .../include/opencv2/core/ocl_genbase.hpp      |    69 +
 .../include/opencv2/core/opencl/ocl_defs.hpp  |    82 +
 .../opencv2/core/opencl/opencl_info.hpp       |   213 +
 .../opencv2/core/opencl/opencl_svm.hpp        |    81 +
 .../runtime/autogenerated/opencl_clblas.hpp   |   602 +
 .../runtime/autogenerated/opencl_clfft.hpp    |   146 +
 .../runtime/autogenerated/opencl_core.hpp     |   371 +
 .../autogenerated/opencl_core_wrappers.hpp    |   272 +
 .../runtime/autogenerated/opencl_gl.hpp       |    62 +
 .../autogenerated/opencl_gl_wrappers.hpp      |    42 +
 .../core/opencl/runtime/opencl_clblas.hpp     |    53 +
 .../core/opencl/runtime/opencl_clfft.hpp      |    53 +
 .../core/opencl/runtime/opencl_core.hpp       |    84 +
 .../opencl/runtime/opencl_core_wrappers.hpp   |    47 +
 .../opencv2/core/opencl/runtime/opencl_gl.hpp |    53 +
 .../opencl/runtime/opencl_gl_wrappers.hpp     |    47 +
 .../core/opencl/runtime/opencl_svm_20.hpp     |    48 +
 .../opencl/runtime/opencl_svm_definitions.hpp |    42 +
 .../runtime/opencl_svm_hsa_extension.hpp      |   166 +
 .../build/include/opencv2/core/opengl.hpp     |   733 +
 .../build/include/opencv2/core/operations.hpp |   612 +
 .../build/include/opencv2/core/optim.hpp      |   307 +
 .../build/include/opencv2/core/ovx.hpp        |    28 +
 .../parallel/backend/parallel_for.openmp.hpp  |    72 +
 .../parallel/backend/parallel_for.tbb.hpp     |   153 +
 .../core/parallel/parallel_backend.hpp        |    90 +
 .../include/opencv2/core/persistence.hpp      |  1310 +
 .../build/include/opencv2/core/quaternion.hpp |  1696 ++
 .../include/opencv2/core/quaternion.inl.hpp   |  1063 +
 .../build/include/opencv2/core/saturate.hpp   |   180 +
 .../include/opencv2/core/simd_intrinsics.hpp  |    87 +
 .../build/include/opencv2/core/softfloat.hpp  |   514 +
 .../build/include/opencv2/core/sse_utils.hpp  |   652 +
 .../build/include/opencv2/core/traits.hpp     |   417 +
 .../build/include/opencv2/core/types.hpp      |  2463 ++
 .../build/include/opencv2/core/types_c.h      |  2110 ++
 .../build/include/opencv2/core/utility.hpp    |  1231 +
 .../opencv2/core/utils/allocator_stats.hpp    |    29 +
 .../core/utils/allocator_stats.impl.hpp       |   106 +
 .../include/opencv2/core/utils/filesystem.hpp |    82 +
 .../opencv2/core/utils/fp_control_utils.hpp   |    69 +
 .../opencv2/core/utils/instrumentation.hpp    |   125 +
 .../opencv2/core/utils/logger.defines.hpp     |    42 +
 .../include/opencv2/core/utils/logger.hpp     |   218 +
 .../include/opencv2/core/utils/logtag.hpp     |    28 +
 .../build/include/opencv2/core/utils/tls.hpp  |   235 +
 .../include/opencv2/core/utils/trace.hpp      |   252 +
 .../build/include/opencv2/core/va_intel.hpp   |    75 +
 .../build/include/opencv2/core/version.hpp    |    26 +
 .../build/include/opencv2/core/vsx_utils.hpp  |  1047 +
 .../build/include/opencv2/cvconfig.h          |   152 +
 .../opencv410/build/include/opencv2/dnn.hpp   |    78 +
 .../build/include/opencv2/dnn/all_layers.hpp  |  1196 +
 .../build/include/opencv2/dnn/dict.hpp        |   160 +
 .../build/include/opencv2/dnn/dnn.hpp         |  1946 ++
 .../build/include/opencv2/dnn/dnn.inl.hpp     |   412 +
 .../include/opencv2/dnn/layer.details.hpp     |    78 +
 .../build/include/opencv2/dnn/layer.hpp       |    88 +
 .../build/include/opencv2/dnn/shape_utils.hpp |   290 +
 .../include/opencv2/dnn/utils/debug_utils.hpp |    24 +
 .../opencv2/dnn/utils/inference_engine.hpp    |    82 +
 .../build/include/opencv2/dnn/version.hpp     |    21 +
 .../build/include/opencv2/features2d.hpp      |  1602 +
 .../include/opencv2/features2d/features2d.hpp |    48 +
 .../opencv2/features2d/hal/interface.h        |    33 +
 .../opencv410/build/include/opencv2/flann.hpp |   629 +
 .../build/include/opencv2/flann/all_indices.h |   162 +
 .../build/include/opencv2/flann/allocator.h   |   196 +
 .../build/include/opencv2/flann/any.h         |   355 +
 .../include/opencv2/flann/autotuned_index.h   |   594 +
 .../include/opencv2/flann/composite_index.h   |   196 +
 .../build/include/opencv2/flann/config.h      |    42 +
 .../build/include/opencv2/flann/defines.h     |   169 +
 .../build/include/opencv2/flann/dist.h        |  1292 +
 .../build/include/opencv2/flann/dummy.h       |    16 +
 .../include/opencv2/flann/dynamic_bitset.h    |   160 +
 .../build/include/opencv2/flann/flann.hpp     |    48 +
 .../include/opencv2/flann/flann_base.hpp      |   312 +
 .../build/include/opencv2/flann/general.h     |    65 +
 .../include/opencv2/flann/ground_truth.h      |    98 +
 .../build/include/opencv2/flann/hdf5.h        |   235 +
 .../build/include/opencv2/flann/heap.h        |   244 +
 .../flann/hierarchical_clustering_index.h     |   846 +
 .../include/opencv2/flann/index_testing.h     |   319 +
 .../include/opencv2/flann/kdtree_index.h      |   636 +
 .../opencv2/flann/kdtree_single_index.h       |   645 +
 .../include/opencv2/flann/kmeans_index.h      |  1819 ++
 .../include/opencv2/flann/linear_index.h      |   135 +
 .../build/include/opencv2/flann/logger.h      |   138 +
 .../build/include/opencv2/flann/lsh_index.h   |   403 +
 .../build/include/opencv2/flann/lsh_table.h   |   522 +
 .../build/include/opencv2/flann/matrix.h      |   121 +
 .../build/include/opencv2/flann/miniflann.hpp |   185 +
 .../build/include/opencv2/flann/nn_index.h    |   180 +
 .../include/opencv2/flann/object_factory.h    |    95 +
 .../build/include/opencv2/flann/params.h      |   126 +
 .../build/include/opencv2/flann/random.h      |   156 +
 .../build/include/opencv2/flann/result_set.h  |   548 +
 .../build/include/opencv2/flann/sampling.h    |    84 +
 .../build/include/opencv2/flann/saving.h      |   191 +
 .../include/opencv2/flann/simplex_downhill.h  |   190 +
 .../build/include/opencv2/flann/timer.h       |    99 +
 .../opencv410/build/include/opencv2/gapi.hpp  |    42 +
 .../build/include/opencv2/gapi/core.hpp       |  1911 ++
 .../build/include/opencv2/gapi/cpu/core.hpp   |    27 +
 .../include/opencv2/gapi/cpu/gcpukernel.hpp   |   542 +
 .../include/opencv2/gapi/cpu/imgproc.hpp      |    27 +
 .../build/include/opencv2/gapi/cpu/ot.hpp     |    29 +
 .../build/include/opencv2/gapi/cpu/stereo.hpp |    48 +
 .../build/include/opencv2/gapi/cpu/video.hpp  |    25 +
 .../build/include/opencv2/gapi/fluid/core.hpp |    20 +
 .../opencv2/gapi/fluid/gfluidbuffer.hpp       |   154 +
 .../opencv2/gapi/fluid/gfluidkernel.hpp       |   442 +
 .../include/opencv2/gapi/fluid/imgproc.hpp    |    20 +
 .../build/include/opencv2/gapi/garg.hpp       |   311 +
 .../build/include/opencv2/gapi/garray.hpp     |   440 +
 .../include/opencv2/gapi/gasync_context.hpp   |    63 +
 .../build/include/opencv2/gapi/gcall.hpp      |    78 +
 .../build/include/opencv2/gapi/gcommon.hpp    |   309 +
 .../build/include/opencv2/gapi/gcompiled.hpp  |   232 +
 .../include/opencv2/gapi/gcompiled_async.hpp  |    73 +
 .../include/opencv2/gapi/gcompoundkernel.hpp  |   139 +
 .../include/opencv2/gapi/gcomputation.hpp     |   581 +
 .../opencv2/gapi/gcomputation_async.hpp       |    69 +
 .../build/include/opencv2/gapi/gframe.hpp     |   113 +
 .../build/include/opencv2/gapi/gkernel.hpp    |   757 +
 .../build/include/opencv2/gapi/gmat.hpp       |   292 +
 .../build/include/opencv2/gapi/gmetaarg.hpp   |    80 +
 .../build/include/opencv2/gapi/gopaque.hpp    |   369 +
 .../build/include/opencv2/gapi/gproto.hpp     |   159 +
 .../build/include/opencv2/gapi/gpu/core.hpp   |    27 +
 .../include/opencv2/gapi/gpu/ggpukernel.hpp   |    18 +
 .../include/opencv2/gapi/gpu/imgproc.hpp      |    28 +
 .../build/include/opencv2/gapi/gscalar.hpp    |   140 +
 .../build/include/opencv2/gapi/gstreaming.hpp |   430 +
 .../build/include/opencv2/gapi/gtransform.hpp |   103 +
 .../include/opencv2/gapi/gtype_traits.hpp     |   242 +
 .../build/include/opencv2/gapi/gtyped.hpp     |   246 +
 .../build/include/opencv2/gapi/imgproc.hpp    |  1769 ++
 .../build/include/opencv2/gapi/infer.hpp      |   717 +
 .../opencv2/gapi/infer/bindings_ie.hpp        |    70 +
 .../opencv2/gapi/infer/bindings_onnx.hpp      |    68 +
 .../opencv2/gapi/infer/bindings_ov.hpp        |   128 +
 .../build/include/opencv2/gapi/infer/ie.hpp   |   711 +
 .../build/include/opencv2/gapi/infer/onnx.hpp |   722 +
 .../build/include/opencv2/gapi/infer/ov.hpp   |   709 +
 .../include/opencv2/gapi/infer/parsers.hpp    |   138 +
 .../build/include/opencv2/gapi/media.hpp      |   258 +
 .../build/include/opencv2/gapi/oak/infer.hpp  |    66 +
 .../build/include/opencv2/gapi/oak/oak.hpp    |   158 +
 .../build/include/opencv2/gapi/ocl/core.hpp   |    27 +
 .../include/opencv2/gapi/ocl/goclkernel.hpp   |   260 +
 .../include/opencv2/gapi/ocl/imgproc.hpp      |    27 +
 .../include/opencv2/gapi/opencv_includes.hpp  |    42 +
 .../build/include/opencv2/gapi/operators.hpp  |    70 +
 .../build/include/opencv2/gapi/ot.hpp         |   194 +
 .../build/include/opencv2/gapi/own/assert.hpp |    60 +
 .../include/opencv2/gapi/own/convert.hpp      |    55 +
 .../build/include/opencv2/gapi/own/cvdefs.hpp |   166 +
 .../include/opencv2/gapi/own/exports.hpp      |    42 +
 .../build/include/opencv2/gapi/own/mat.hpp    |   354 +
 .../include/opencv2/gapi/own/saturate.hpp     |    83 +
 .../build/include/opencv2/gapi/own/scalar.hpp |    47 +
 .../build/include/opencv2/gapi/own/types.hpp  |   162 +
 .../include/opencv2/gapi/plaidml/core.hpp     |    20 +
 .../opencv2/gapi/plaidml/gplaidmlkernel.hpp   |   140 +
 .../include/opencv2/gapi/plaidml/plaidml.hpp  |    53 +
 .../include/opencv2/gapi/python/python.hpp    |    71 +
 .../build/include/opencv2/gapi/render.hpp     |    14 +
 .../include/opencv2/gapi/render/render.hpp    |   196 +
 .../opencv2/gapi/render/render_types.hpp      |   359 +
 .../build/include/opencv2/gapi/rmat.hpp       |   160 +
 .../build/include/opencv2/gapi/s11n.hpp       |   513 +
 .../build/include/opencv2/gapi/s11n/base.hpp  |    80 +
 .../build/include/opencv2/gapi/stereo.hpp     |    85 +
 .../include/opencv2/gapi/streaming/cap.hpp    |   149 +
 .../include/opencv2/gapi/streaming/desync.hpp |    86 +
 .../include/opencv2/gapi/streaming/format.hpp |    94 +
 .../streaming/gstreamer/gstreamerpipeline.hpp |    59 +
 .../streaming/gstreamer/gstreamersource.hpp   |    97 +
 .../include/opencv2/gapi/streaming/meta.hpp   |    80 +
 .../gapi/streaming/onevpl/accel_types.hpp     |    76 +
 .../gapi/streaming/onevpl/cfg_params.hpp      |   209 +
 .../onevpl/data_provider_interface.hpp        |   105 +
 .../opencv2/gapi/streaming/onevpl/default.hpp |    29 +
 .../onevpl/device_selector_interface.hpp      |    61 +
 .../opencv2/gapi/streaming/onevpl/source.hpp  |    94 +
 .../opencv2/gapi/streaming/queue_source.hpp   |    67 +
 .../include/opencv2/gapi/streaming/source.hpp |    67 +
 .../include/opencv2/gapi/streaming/sync.hpp   |    30 +
 .../build/include/opencv2/gapi/util/any.hpp   |   190 +
 .../opencv2/gapi/util/compiler_hints.hpp      |    19 +
 .../opencv2/gapi/util/copy_through_move.hpp   |    34 +
 .../include/opencv2/gapi/util/optional.hpp    |   178 +
 .../build/include/opencv2/gapi/util/throw.hpp |    36 +
 .../include/opencv2/gapi/util/type_traits.hpp |    31 +
 .../build/include/opencv2/gapi/util/util.hpp  |   190 +
 .../include/opencv2/gapi/util/variant.hpp     |   667 +
 .../build/include/opencv2/gapi/video.hpp      |   364 +
 .../build/include/opencv2/highgui.hpp         |   826 +
 .../build/include/opencv2/highgui/highgui.hpp |    48 +
 .../build/include/opencv2/highgui/highgui_c.h |   251 +
 .../build/include/opencv2/imgcodecs.hpp       |   475 +
 .../include/opencv2/imgcodecs/imgcodecs.hpp   |    48 +
 .../include/opencv2/imgcodecs/imgcodecs_c.h   |     1 +
 .../build/include/opencv2/imgcodecs/ios.h     |    59 +
 .../opencv2/imgcodecs/legacy/constants_c.h    |    54 +
 .../build/include/opencv2/imgcodecs/macosx.h  |    20 +
 .../build/include/opencv2/imgproc.hpp         |  5070 ++++
 .../include/opencv2/imgproc/bindings.hpp      |    34 +
 .../opencv2/imgproc/detail/gcgraph.hpp        |   395 +
 .../include/opencv2/imgproc/detail/legacy.hpp |    38 +
 .../build/include/opencv2/imgproc/hal/hal.hpp |   251 +
 .../include/opencv2/imgproc/hal/interface.h   |    46 +
 .../build/include/opencv2/imgproc/imgproc.hpp |    48 +
 .../build/include/opencv2/imgproc/imgproc_c.h |  1185 +
 .../include/opencv2/imgproc/segmentation.hpp  |   141 +
 .../build/include/opencv2/imgproc/types_c.h   |   660 +
 .../opencv410/build/include/opencv2/ml.hpp    |  1956 ++
 .../opencv410/build/include/opencv2/ml/ml.hpp |    48 +
 .../build/include/opencv2/ml/ml.inl.hpp       |    60 +
 .../build/include/opencv2/objdetect.hpp       |   873 +
 .../include/opencv2/objdetect/aruco_board.hpp |   199 +
 .../opencv2/objdetect/aruco_detector.hpp      |   400 +
 .../opencv2/objdetect/aruco_dictionary.hpp    |   155 +
 .../include/opencv2/objdetect/barcode.hpp     |   111 +
 .../opencv2/objdetect/charuco_detector.hpp    |   157 +
 .../objdetect/detection_based_tracker.hpp     |   222 +
 .../build/include/opencv2/objdetect/face.hpp  |   163 +
 .../objdetect/graphical_code_detector.hpp     |    85 +
 .../include/opencv2/objdetect/objdetect.hpp   |    48 +
 .../build/include/opencv2/opencv.hpp          |    95 +
 .../build/include/opencv2/opencv_modules.hpp  |    30 +
 .../opencv410/build/include/opencv2/photo.hpp |   857 +
 .../build/include/opencv2/photo/cuda.hpp      |   157 +
 .../opencv2/photo/legacy/constants_c.h        |    14 +
 .../build/include/opencv2/photo/photo.hpp     |    48 +
 .../build/include/opencv2/stitching.hpp       |   365 +
 .../opencv2/stitching/detail/autocalib.hpp    |    86 +
 .../opencv2/stitching/detail/blenders.hpp     |   184 +
 .../opencv2/stitching/detail/camera.hpp       |    78 +
 .../stitching/detail/exposure_compensate.hpp  |   245 +
 .../opencv2/stitching/detail/matchers.hpp     |   267 +
 .../stitching/detail/motion_estimators.hpp    |   373 +
 .../opencv2/stitching/detail/seam_finders.hpp |   291 +
 .../opencv2/stitching/detail/timelapsers.hpp  |    91 +
 .../include/opencv2/stitching/detail/util.hpp |   121 +
 .../opencv2/stitching/detail/util_inl.hpp     |   131 +
 .../opencv2/stitching/detail/warpers.hpp      |   706 +
 .../opencv2/stitching/detail/warpers_inl.hpp  |   782 +
 .../include/opencv2/stitching/warpers.hpp     |   277 +
 .../opencv410/build/include/opencv2/video.hpp |    58 +
 .../include/opencv2/video/background_segm.hpp |   317 +
 .../opencv2/video/detail/tracking.detail.hpp  |   406 +
 .../opencv2/video/legacy/constants_c.h        |    16 +
 .../build/include/opencv2/video/tracking.hpp  |   943 +
 .../build/include/opencv2/video/video.hpp     |    48 +
 .../build/include/opencv2/videoio.hpp         |  1182 +
 .../build/include/opencv2/videoio/cap_ios.h   |   150 +
 .../opencv2/videoio/legacy/constants_c.h      |   434 +
 .../include/opencv2/videoio/registry.hpp      |    72 +
 .../build/include/opencv2/videoio/videoio.hpp |    48 +
 .../build/include/opencv2/videoio/videoio_c.h |   153 +
 .../opencv410/build/include/opencv2/world.hpp |    58 +
 rpcs3/CMakeLists.txt                          |     3 +
 rpcs3/Emu/Cell/Modules/cellGem.cpp            |    10 +-
 rpcs3/Input/ps_move_tracker.cpp               |     8 +-
 rpcs3/rpcs3qt/CMakeLists.txt                  |     2 +
 rpcs3/rpcs3qt/pad_settings_dialog.cpp         |     1 +
 359 files changed, 186850 insertions(+), 11 deletions(-)
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/calib3d.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/calib3d/calib3d.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/calib3d/calib3d_c.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/affine.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/async.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/base.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/bindings_utils.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/bufferpool.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/check.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/core.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/core_c.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda.inl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/block.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/border_interpolate.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/color.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/common.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/datamov_utils.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/color_detail.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/reduce.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/reduce_key_val.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/transform_detail.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/type_traits_detail.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/vec_distance_detail.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/dynamic_smem.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/emulation.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/filters.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/funcattrib.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/functional.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/limits.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/reduce.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/saturate_cast.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/scan.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/simd_functions.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/transform.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/type_traits.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/utility.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/vec_distance.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/vec_math.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/vec_traits.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/warp.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/warp_reduce.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/warp_shuffle.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda_stream_accessor.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cuda_types.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cv_cpu_dispatch.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cv_cpu_helper.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cvdef.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cvstd.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cvstd.inl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/cvstd_wrapper.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/detail/async_promise.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/detail/dispatch_helper.impl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/detail/exception_ptr.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/directx.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/dualquaternion.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/dualquaternion.inl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/eigen.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/fast_math.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/hal.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/interface.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_avx.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_avx512.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_cpp.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_forward.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_lasx.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_lsx.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_msa.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_neon.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv071.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_010_compat_non-policy.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_010_compat_overloaded-non-policy.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_011_compat.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_scalable.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_sse.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_sse_em.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_vsx.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_wasm.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/msa_macros.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/hal/simd_utils.impl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/mat.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/mat.inl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/matx.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/matx.inl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/neon_utils.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/ocl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/ocl_genbase.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/ocl_defs.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/opencl_info.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/opencl_svm.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_clblas.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_clfft.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_core.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_core_wrappers.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl_wrappers.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_clblas.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_clfft.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_core.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_core_wrappers.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_gl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_gl_wrappers.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_svm_20.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_svm_definitions.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/opengl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/operations.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/optim.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/ovx.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/parallel/backend/parallel_for.openmp.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/parallel/parallel_backend.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/persistence.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/quaternion.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/quaternion.inl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/saturate.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/simd_intrinsics.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/softfloat.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/sse_utils.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/traits.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/types.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/types_c.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/utility.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/utils/allocator_stats.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/utils/allocator_stats.impl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/utils/filesystem.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/utils/fp_control_utils.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/utils/instrumentation.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/utils/logger.defines.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/utils/logger.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/utils/logtag.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/utils/tls.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/utils/trace.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/va_intel.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/version.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/core/vsx_utils.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/cvconfig.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/dnn.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/dnn/all_layers.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/dnn/dict.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/dnn/dnn.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/dnn/dnn.inl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/dnn/layer.details.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/dnn/layer.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/dnn/shape_utils.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/dnn/utils/debug_utils.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/dnn/utils/inference_engine.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/dnn/version.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/features2d.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/features2d/features2d.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/features2d/hal/interface.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/all_indices.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/allocator.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/any.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/autotuned_index.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/composite_index.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/config.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/defines.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/dist.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/dummy.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/dynamic_bitset.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/flann.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/flann_base.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/general.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/ground_truth.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/hdf5.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/heap.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/hierarchical_clustering_index.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/index_testing.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/kdtree_index.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/kdtree_single_index.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/kmeans_index.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/linear_index.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/logger.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/lsh_index.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/lsh_table.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/matrix.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/miniflann.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/nn_index.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/object_factory.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/params.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/random.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/result_set.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/sampling.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/saving.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/simplex_downhill.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/flann/timer.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/core.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/core.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/gcpukernel.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/imgproc.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/ot.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/stereo.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/video.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/fluid/core.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/fluid/gfluidbuffer.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/fluid/gfluidkernel.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/fluid/imgproc.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/garg.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/garray.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gasync_context.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcall.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcommon.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcompiled.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcompiled_async.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcompoundkernel.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcomputation.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcomputation_async.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gframe.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gkernel.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gmat.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gmetaarg.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gopaque.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gproto.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gpu/core.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gpu/ggpukernel.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gpu/imgproc.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gscalar.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gstreaming.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gtransform.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gtype_traits.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/gtyped.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/imgproc.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/bindings_ie.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/bindings_onnx.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/bindings_ov.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/ie.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/onnx.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/ov.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/parsers.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/media.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/oak/infer.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/oak/oak.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/ocl/core.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/ocl/goclkernel.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/ocl/imgproc.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/opencv_includes.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/operators.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/ot.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/assert.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/convert.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/cvdefs.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/exports.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/mat.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/saturate.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/scalar.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/types.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/plaidml/core.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/plaidml/gplaidmlkernel.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/plaidml/plaidml.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/python/python.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/render.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/render/render.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/render/render_types.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/rmat.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/s11n.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/s11n/base.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/stereo.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/cap.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/desync.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/format.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/gstreamer/gstreamerpipeline.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/gstreamer/gstreamersource.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/meta.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/accel_types.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/cfg_params.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/data_provider_interface.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/default.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/device_selector_interface.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/source.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/queue_source.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/source.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/sync.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/any.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/compiler_hints.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/copy_through_move.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/optional.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/throw.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/type_traits.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/util.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/variant.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/gapi/video.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/highgui.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/highgui/highgui.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/highgui/highgui_c.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/imgcodecs.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/imgcodecs_c.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/ios.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/legacy/constants_c.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/macosx.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/imgproc.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/imgproc/bindings.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/imgproc/detail/gcgraph.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/imgproc/detail/legacy.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/imgproc/hal/hal.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/imgproc/hal/interface.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/imgproc/imgproc.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/imgproc/imgproc_c.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/imgproc/segmentation.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/imgproc/types_c.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/ml.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/ml/ml.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/ml/ml.inl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/objdetect.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/objdetect/aruco_board.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/objdetect/aruco_detector.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/objdetect/aruco_dictionary.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/objdetect/barcode.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/objdetect/charuco_detector.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/objdetect/detection_based_tracker.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/objdetect/face.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/objdetect/graphical_code_detector.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/objdetect/objdetect.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/opencv.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/opencv_modules.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/photo.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/photo/cuda.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/photo/legacy/constants_c.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/photo/photo.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/stitching.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/autocalib.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/blenders.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/camera.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/exposure_compensate.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/matchers.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/motion_estimators.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/seam_finders.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/timelapsers.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/util.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/util_inl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/warpers.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/warpers_inl.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/stitching/warpers.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/video.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/video/background_segm.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/video/detail/tracking.detail.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/video/legacy/constants_c.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/video/tracking.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/video/video.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/videoio.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/videoio/cap_ios.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/videoio/legacy/constants_c.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/videoio/registry.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/videoio/videoio.hpp
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/videoio/videoio_c.h
 create mode 100644 3rdparty/opencv/opencv410/build/include/opencv2/world.hpp

diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/calib3d.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/calib3d.hpp
new file mode 100644
index 000000000000..0280e05e2184
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/calib3d.hpp
@@ -0,0 +1,4131 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CALIB3D_HPP
+#define OPENCV_CALIB3D_HPP
+
+#include "opencv2/core.hpp"
+#include "opencv2/core/types.hpp"
+#include "opencv2/features2d.hpp"
+#include "opencv2/core/affine.hpp"
+#include "opencv2/core/utils/logger.hpp"
+
+/**
+  @defgroup calib3d Camera Calibration and 3D Reconstruction
+
+The functions in this section use a so-called pinhole camera model. The view of a scene
+is obtained by projecting a scene's 3D point \f$P_w\f$ into the image plane using a perspective
+transformation which forms the corresponding pixel \f$p\f$. Both \f$P_w\f$ and \f$p\f$ are
+represented in homogeneous coordinates, i.e. as 3D and 2D homogeneous vectors, respectively. You will
+find a brief introduction to projective geometry, homogeneous vectors and homogeneous
+transformations at the end of this section's introduction. For more succinct notation, we often drop
+the 'homogeneous' and say vector instead of homogeneous vector.
+
+The distortion-free projective transformation given by a pinhole camera model is shown below.
+
+\f[s \; p = A \begin{bmatrix} R|t \end{bmatrix} P_w,\f]
+
+where \f$P_w\f$ is a 3D point expressed with respect to the world coordinate system,
+\f$p\f$ is a 2D pixel in the image plane, \f$A\f$ is the camera intrinsic matrix,
+\f$R\f$ and \f$t\f$ are the rotation and translation that describe the change of coordinates from
+world to camera coordinate systems (or camera frame) and \f$s\f$ is the projective transformation's
+arbitrary scaling and not part of the camera model.
+
+The camera intrinsic matrix \f$A\f$ (notation used as in @cite Zhang2000 and also generally notated
+as \f$K\f$) projects 3D points given in the camera coordinate system to 2D pixel coordinates, i.e.
+
+\f[p = A P_c.\f]
+
+The camera intrinsic matrix \f$A\f$ is composed of the focal lengths \f$f_x\f$ and \f$f_y\f$, which are
+expressed in pixel units, and the principal point \f$(c_x, c_y)\f$, that is usually close to the
+image center:
+
+\f[A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1},\f]
+
+and thus
+
+\f[s \vecthree{u}{v}{1} = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1} \vecthree{X_c}{Y_c}{Z_c}.\f]
+
+The matrix of intrinsic parameters does not depend on the scene viewed. So, once estimated, it can
+be re-used as long as the focal length is fixed (in case of a zoom lens). Thus, if an image from the
+camera is scaled by a factor, all of these parameters need to be scaled (multiplied/divided,
+respectively) by the same factor.
+
+The joint rotation-translation matrix \f$[R|t]\f$ is the matrix product of a projective
+transformation and a homogeneous transformation. The 3-by-4 projective transformation maps 3D points
+represented in camera coordinates to 2D points in the image plane and represented in normalized
+camera coordinates \f$x' = X_c / Z_c\f$ and \f$y' = Y_c / Z_c\f$:
+
+\f[Z_c \begin{bmatrix}
+x' \\
+y' \\
+1
+\end{bmatrix} = \begin{bmatrix}
+1 & 0 & 0 & 0 \\
+0 & 1 & 0 & 0 \\
+0 & 0 & 1 & 0
+\end{bmatrix}
+\begin{bmatrix}
+X_c \\
+Y_c \\
+Z_c \\
+1
+\end{bmatrix}.\f]
+
+The homogeneous transformation is encoded by the extrinsic parameters \f$R\f$ and \f$t\f$ and
+represents the change of basis from world coordinate system \f$w\f$ to the camera coordinate system
+\f$c\f$. Thus, given the representation of the point \f$P\f$ in world coordinates, \f$P_w\f$, we
+obtain \f$P\f$'s representation in the camera coordinate system, \f$P_c\f$, by
+
+\f[P_c = \begin{bmatrix}
+R & t \\
+0 & 1
+\end{bmatrix} P_w,\f]
+
+This homogeneous transformation is composed out of \f$R\f$, a 3-by-3 rotation matrix, and \f$t\f$, a
+3-by-1 translation vector:
+
+\f[\begin{bmatrix}
+R & t \\
+0 & 1
+\end{bmatrix} = \begin{bmatrix}
+r_{11} & r_{12} & r_{13} & t_x \\
+r_{21} & r_{22} & r_{23} & t_y \\
+r_{31} & r_{32} & r_{33} & t_z \\
+0 & 0 & 0 & 1
+\end{bmatrix},
+\f]
+
+and therefore
+
+\f[\begin{bmatrix}
+X_c \\
+Y_c \\
+Z_c \\
+1
+\end{bmatrix} = \begin{bmatrix}
+r_{11} & r_{12} & r_{13} & t_x \\
+r_{21} & r_{22} & r_{23} & t_y \\
+r_{31} & r_{32} & r_{33} & t_z \\
+0 & 0 & 0 & 1
+\end{bmatrix}
+\begin{bmatrix}
+X_w \\
+Y_w \\
+Z_w \\
+1
+\end{bmatrix}.\f]
+
+Combining the projective transformation and the homogeneous transformation, we obtain the projective
+transformation that maps 3D points in world coordinates into 2D points in the image plane and in
+normalized camera coordinates:
+
+\f[Z_c \begin{bmatrix}
+x' \\
+y' \\
+1
+\end{bmatrix} = \begin{bmatrix} R|t \end{bmatrix} \begin{bmatrix}
+X_w \\
+Y_w \\
+Z_w \\
+1
+\end{bmatrix} = \begin{bmatrix}
+r_{11} & r_{12} & r_{13} & t_x \\
+r_{21} & r_{22} & r_{23} & t_y \\
+r_{31} & r_{32} & r_{33} & t_z
+\end{bmatrix}
+\begin{bmatrix}
+X_w \\
+Y_w \\
+Z_w \\
+1
+\end{bmatrix},\f]
+
+with \f$x' = X_c / Z_c\f$ and \f$y' = Y_c / Z_c\f$. Putting the equations for intrinsics and extrinsics together, we can write out
+\f$s \; p = A \begin{bmatrix} R|t \end{bmatrix} P_w\f$ as
+
+\f[s \vecthree{u}{v}{1} = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}
+\begin{bmatrix}
+r_{11} & r_{12} & r_{13} & t_x \\
+r_{21} & r_{22} & r_{23} & t_y \\
+r_{31} & r_{32} & r_{33} & t_z
+\end{bmatrix}
+\begin{bmatrix}
+X_w \\
+Y_w \\
+Z_w \\
+1
+\end{bmatrix}.\f]
+
+If \f$Z_c \ne 0\f$, the transformation above is equivalent to the following,
+
+\f[\begin{bmatrix}
+u \\
+v
+\end{bmatrix} = \begin{bmatrix}
+f_x X_c/Z_c + c_x \\
+f_y Y_c/Z_c + c_y
+\end{bmatrix}\f]
+
+with
+
+\f[\vecthree{X_c}{Y_c}{Z_c} = \begin{bmatrix}
+R|t
+\end{bmatrix} \begin{bmatrix}
+X_w \\
+Y_w \\
+Z_w \\
+1
+\end{bmatrix}.\f]
+
+The following figure illustrates the pinhole camera model.
+
+![Pinhole camera model](pics/pinhole_camera_model.png)
+
+Real lenses usually have some distortion, mostly radial distortion, and slight tangential distortion.
+So, the above model is extended as:
+
+\f[\begin{bmatrix}
+u \\
+v
+\end{bmatrix} = \begin{bmatrix}
+f_x x'' + c_x \\
+f_y y'' + c_y
+\end{bmatrix}\f]
+
+where
+
+\f[\begin{bmatrix}
+x'' \\
+y''
+\end{bmatrix} = \begin{bmatrix}
+x' \frac{1 + k_1 r^2 + k_2 r^4 + k_3 r^6}{1 + k_4 r^2 + k_5 r^4 + k_6 r^6} + 2 p_1 x' y' + p_2(r^2 + 2 x'^2) + s_1 r^2 + s_2 r^4 \\
+y' \frac{1 + k_1 r^2 + k_2 r^4 + k_3 r^6}{1 + k_4 r^2 + k_5 r^4 + k_6 r^6} + p_1 (r^2 + 2 y'^2) + 2 p_2 x' y' + s_3 r^2 + s_4 r^4 \\
+\end{bmatrix}\f]
+
+with
+
+\f[r^2 = x'^2 + y'^2\f]
+
+and
+
+\f[\begin{bmatrix}
+x'\\
+y'
+\end{bmatrix} = \begin{bmatrix}
+X_c/Z_c \\
+Y_c/Z_c
+\end{bmatrix},\f]
+
+if \f$Z_c \ne 0\f$.
+
+The distortion parameters are the radial coefficients \f$k_1\f$, \f$k_2\f$, \f$k_3\f$, \f$k_4\f$, \f$k_5\f$, and \f$k_6\f$,
+the tangential distortion coefficients \f$p_1\f$ and \f$p_2\f$, and the thin prism distortion coefficients \f$s_1\f$, \f$s_2\f$,
+\f$s_3\f$, and \f$s_4\f$. Higher-order coefficients are not considered in OpenCV.
+
+The next figures show two common types of radial distortion: barrel distortion
+(\f$ 1 + k_1 r^2 + k_2 r^4 + k_3 r^6 \f$ monotonically decreasing)
+and pincushion distortion (\f$ 1 + k_1 r^2 + k_2 r^4 + k_3 r^6 \f$ monotonically increasing).
+Radial distortion is always monotonic for real lenses,
+and if the estimator produces a non-monotonic result,
+this should be considered a calibration failure.
+More generally, radial distortion must be monotonic and the distortion function must be bijective.
+A failed estimation result may look deceptively good near the image center
+but will work poorly in e.g. AR/SFM applications.
+The optimization method used in OpenCV camera calibration does not include these constraints as
+the framework does not support the required integer programming and polynomial inequalities.
+See [issue #15992](https://github.com/opencv/opencv/issues/15992) for additional information.
+
+![](pics/distortion_examples.png)
+![](pics/distortion_examples2.png)
+
+In some cases, the image sensor may be tilted in order to focus an oblique plane in front of the
+camera (Scheimpflug principle). This can be useful for particle image velocimetry (PIV) or
+triangulation with a laser fan. The tilt causes a perspective distortion of \f$x''\f$ and
+\f$y''\f$. This distortion can be modeled in the following way, see e.g. @cite Louhichi07.
+
+\f[\begin{bmatrix}
+u \\
+v
+\end{bmatrix} = \begin{bmatrix}
+f_x x''' + c_x \\
+f_y y''' + c_y
+\end{bmatrix},\f]
+
+where
+
+\f[s\vecthree{x'''}{y'''}{1} =
+\vecthreethree{R_{33}(\tau_x, \tau_y)}{0}{-R_{13}(\tau_x, \tau_y)}
+{0}{R_{33}(\tau_x, \tau_y)}{-R_{23}(\tau_x, \tau_y)}
+{0}{0}{1} R(\tau_x, \tau_y) \vecthree{x''}{y''}{1}\f]
+
+and the matrix \f$R(\tau_x, \tau_y)\f$ is defined by two rotations with angular parameter
+\f$\tau_x\f$ and \f$\tau_y\f$, respectively,
+
+\f[
+R(\tau_x, \tau_y) =
+\vecthreethree{\cos(\tau_y)}{0}{-\sin(\tau_y)}{0}{1}{0}{\sin(\tau_y)}{0}{\cos(\tau_y)}
+\vecthreethree{1}{0}{0}{0}{\cos(\tau_x)}{\sin(\tau_x)}{0}{-\sin(\tau_x)}{\cos(\tau_x)} =
+\vecthreethree{\cos(\tau_y)}{\sin(\tau_y)\sin(\tau_x)}{-\sin(\tau_y)\cos(\tau_x)}
+{0}{\cos(\tau_x)}{\sin(\tau_x)}
+{\sin(\tau_y)}{-\cos(\tau_y)\sin(\tau_x)}{\cos(\tau_y)\cos(\tau_x)}.
+\f]
+
+In the functions below the coefficients are passed or returned as
+
+\f[(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f]
+
+vector. That is, if the vector contains four elements, it means that \f$k_3=0\f$ . The distortion
+coefficients do not depend on the scene viewed. Thus, they also belong to the intrinsic camera
+parameters. And they remain the same regardless of the captured image resolution. If, for example, a
+camera has been calibrated on images of 320 x 240 resolution, absolutely the same distortion
+coefficients can be used for 640 x 480 images from the same camera while \f$f_x\f$, \f$f_y\f$,
+\f$c_x\f$, and \f$c_y\f$ need to be scaled appropriately.
+
+The functions below use the above model to do the following:
+
+-   Project 3D points to the image plane given intrinsic and extrinsic parameters.
+-   Compute extrinsic parameters given intrinsic parameters, a few 3D points, and their
+projections.
+-   Estimate intrinsic and extrinsic camera parameters from several views of a known calibration
+pattern (every view is described by several 3D-2D point correspondences).
+-   Estimate the relative position and orientation of the stereo camera "heads" and compute the
+*rectification* transformation that makes the camera optical axes parallel.
+
+<B> Homogeneous Coordinates </B><br>
+Homogeneous coordinates are a system of coordinates used in projective geometry. Their use
+allows points at infinity to be represented by finite coordinates and simplifies formulas compared
+to their Cartesian counterparts, e.g. they have the advantage that affine transformations can be
+expressed as linear homogeneous transformations.
+
+One obtains the homogeneous vector \f$P_h\f$ by appending a 1 to an n-dimensional Cartesian
+vector \f$P\f$, e.g. for a 3D Cartesian vector the mapping \f$P \rightarrow P_h\f$ is:
+
+\f[\begin{bmatrix}
+X \\
+Y \\
+Z
+\end{bmatrix} \rightarrow \begin{bmatrix}
+X \\
+Y \\
+Z \\
+1
+\end{bmatrix}.\f]
+
+For the inverse mapping \f$P_h \rightarrow P\f$, one divides all elements of the homogeneous vector
+by its last element, e.g. for a 3D homogeneous vector one gets its 2D cartesian counterpart by:
+
+\f[\begin{bmatrix}
+X \\
+Y \\
+W
+\end{bmatrix} \rightarrow \begin{bmatrix}
+X / W \\
+Y / W
+\end{bmatrix},\f]
+
+if \f$W \ne 0\f$.
+
+Due to this mapping, all multiples \f$k P_h\f$, for \f$k \ne 0\f$, of a homogeneous point represent
+the same point \f$P_h\f$. An intuitive understanding of this property is that under a projective
+transformation, all multiples of \f$P_h\f$ are mapped to the same point. This is the physical
+observation one makes for pinhole cameras, as all points along a ray through the camera's pinhole are
+projected to the same image point, e.g. all points along the red ray in the image of the pinhole
+camera model above would be mapped to the same image coordinate. This property is also the source
+for the scale ambiguity s in the equation of the pinhole camera model.
+
+As mentioned, by using homogeneous coordinates we can express any change of basis parameterized by
+\f$R\f$ and \f$t\f$ as a linear transformation, e.g. the change of basis from coordinate system
+0 to coordinate system 1 becomes:
+
+\f[P_1 = R P_0 + t \rightarrow P_{h_1} = \begin{bmatrix}
+R & t \\
+0 & 1
+\end{bmatrix} P_{h_0}.\f]
+
+@note
+    -   Many functions in this module take a camera intrinsic matrix as an input parameter. Although all
+        functions assume the same structure of this parameter, they may name it differently. The
+        parameter's description, however, will be clear in that a camera intrinsic matrix with the structure
+        shown above is required.
+    -   A calibration sample for 3 cameras in a horizontal position can be found at
+        opencv_source_code/samples/cpp/3calibration.cpp
+    -   A calibration sample based on a sequence of images can be found at
+        opencv_source_code/samples/cpp/calibration.cpp
+    -   A calibration sample in order to do 3D reconstruction can be found at
+        opencv_source_code/samples/cpp/build3dmodel.cpp
+    -   A calibration example on stereo calibration can be found at
+        opencv_source_code/samples/cpp/stereo_calib.cpp
+    -   A calibration example on stereo matching can be found at
+        opencv_source_code/samples/cpp/stereo_match.cpp
+    -   (Python) A camera calibration sample can be found at
+        opencv_source_code/samples/python/calibrate.py
+
+  @{
+    @defgroup calib3d_fisheye Fisheye camera model
+
+    Definitions: Let P be a point in 3D of coordinates X in the world reference frame (stored in the
+    matrix X). The coordinate vector of P in the camera reference frame is:
+
+    \f[Xc = R X + T\f]
+
+    where R is the rotation matrix corresponding to the rotation vector om: R = rodrigues(om); call x, y
+    and z the 3 coordinates of Xc:
+
+    \f[x = Xc_1 \\ y = Xc_2 \\ z = Xc_3\f]
+
+    The pinhole projection coordinates of P are [a; b], where
+
+    \f[a = x / z \ and \ b = y / z \\ r^2 = a^2 + b^2 \\ \theta = atan(r)\f]
+
+    Fisheye distortion:
+
+    \f[\theta_d = \theta (1 + k_1 \theta^2 + k_2 \theta^4 + k_3 \theta^6 + k_4 \theta^8)\f]
+
+    The distorted point coordinates are [x'; y'] where
+
+    \f[x' = (\theta_d / r) a \\ y' = (\theta_d / r) b \f]
+
+    Finally, conversion into pixel coordinates: the final pixel coordinates vector is [u; v], where:
+
+    \f[u = f_x (x' + \alpha y') + c_x \\
+    v = f_y y' + c_y\f]
+
+    Summary:
+    Generic camera model @cite Kannala2006 with perspective projection and without distortion correction
+
+  @}
+ */
+
+namespace cv
+{
+
+//! @addtogroup calib3d
+//! @{
+
+//! Type of the robust estimation algorithm (LMEDS, RANSAC and RHO are the `method` values documented for findHomography)
+enum { LMEDS  = 4,  //!< least-median of squares algorithm
+       RANSAC = 8,  //!< RANSAC algorithm
+       RHO    = 16, //!< RHO algorithm (PROSAC-based robust method)
+       USAC_DEFAULT  = 32, //!< USAC algorithm, default settings
+       USAC_PARALLEL = 33, //!< USAC, parallel version
+       USAC_FM_8PTS = 34,  //!< USAC, fundamental matrix 8 points
+       USAC_FAST = 35,     //!< USAC, fast settings
+       USAC_ACCURATE = 36, //!< USAC, accurate settings
+       USAC_PROSAC = 37,   //!< USAC, sorted points, runs PROSAC
+       USAC_MAGSAC = 38    //!< USAC, runs MAGSAC++
+     };
+
+enum SolvePnPMethod {
+    SOLVEPNP_ITERATIVE   = 0, //!< Pose refinement using non-linear Levenberg-Marquardt minimization scheme @cite Madsen04 @cite Eade13 \n
+                              //!< Initial solution for non-planar "objectPoints" needs at least 6 points and uses the DLT algorithm. \n
+                              //!< Initial solution for planar "objectPoints" needs at least 4 points and uses pose from homography decomposition.
+    SOLVEPNP_EPNP        = 1, //!< EPnP: Efficient Perspective-n-Point Camera Pose Estimation @cite lepetit2009epnp
+    SOLVEPNP_P3P         = 2, //!< Complete Solution Classification for the Perspective-Three-Point Problem @cite gao2003complete
+    SOLVEPNP_DLS         = 3, //!< **Broken implementation. Using this flag will fall back to EPnP.** \n
+                              //!< A Direct Least-Squares (DLS) Method for PnP @cite hesch2011direct
+    SOLVEPNP_UPNP        = 4, //!< **Broken implementation. Using this flag will fall back to EPnP.** \n
+                              //!< Exhaustive Linearization for Robust Camera Pose and Focal Length Estimation @cite penate2013exhaustive
+    SOLVEPNP_AP3P        = 5, //!< An Efficient Algebraic Solution to the Perspective-Three-Point Problem @cite Ke17
+    SOLVEPNP_IPPE        = 6, //!< Infinitesimal Plane-Based Pose Estimation @cite Collins14 \n
+                              //!< Object points must be coplanar.
+    SOLVEPNP_IPPE_SQUARE = 7, //!< Infinitesimal Plane-Based Pose Estimation @cite Collins14 \n
+                              //!< This is a special case suitable for marker pose estimation.\n
+                              //!< 4 coplanar object points must be defined in the following order:
+                              //!<   - point 0: [-squareLength / 2,  squareLength / 2, 0]
+                              //!<   - point 1: [ squareLength / 2,  squareLength / 2, 0]
+                              //!<   - point 2: [ squareLength / 2, -squareLength / 2, 0]
+                              //!<   - point 3: [-squareLength / 2, -squareLength / 2, 0]
+    SOLVEPNP_SQPNP       = 8, //!< SQPnP: A Consistently Fast and Globally Optimal Solution to the Perspective-n-Point Problem @cite Terzakis2020SQPnP
+#ifndef CV_DOXYGEN
+    SOLVEPNP_MAX_COUNT        //!< Used for count; hidden from the generated documentation by the CV_DOXYGEN guard
+#endif
+};
+
+enum { CALIB_CB_ADAPTIVE_THRESH = 1, // every enumerator in this enum is a distinct power of two (a single bit)
+       CALIB_CB_NORMALIZE_IMAGE = 2,
+       CALIB_CB_FILTER_QUADS    = 4,
+       CALIB_CB_FAST_CHECK      = 8,
+       CALIB_CB_EXHAUSTIVE      = 16,
+       CALIB_CB_ACCURACY        = 32,
+       CALIB_CB_LARGER          = 64,
+       CALIB_CB_MARKER          = 128,
+       CALIB_CB_PLAIN           = 256
+     }; // NOTE(review): presumably OR-combinable flags for the chessboard-corner detection functions - confirm
+
+enum { CALIB_CB_SYMMETRIC_GRID  = 1, // NOTE: 1/2/4 reuse the numeric values of CALIB_CB_ADAPTIVE_THRESH/NORMALIZE_IMAGE/FILTER_QUADS,
+       CALIB_CB_ASYMMETRIC_GRID = 2, // so the two flag sets must not be mixed in one flags argument;
+       CALIB_CB_CLUSTERING      = 4  // NOTE(review): presumably these are consumed by the circles-grid finder only - confirm
+     };
+
+enum { CALIB_NINTRINSIC          = 18, // not a single-bit value, unlike the flags below; NOTE(review): presumably the number of intrinsic parameters (fx, fy, cx, cy + 14 distortion coefficients) - confirm
+       CALIB_USE_INTRINSIC_GUESS = 0x00001,
+       CALIB_FIX_ASPECT_RATIO    = 0x00002,
+       CALIB_FIX_PRINCIPAL_POINT = 0x00004,
+       CALIB_ZERO_TANGENT_DIST   = 0x00008,
+       CALIB_FIX_FOCAL_LENGTH    = 0x00010,
+       CALIB_FIX_K1              = 0x00020,
+       CALIB_FIX_K2              = 0x00040,
+       CALIB_FIX_K3              = 0x00080,
+       CALIB_FIX_K4              = 0x00800, // bits 0x00100-0x00400 are used by the stereo flags further down
+       CALIB_FIX_K5              = 0x01000,
+       CALIB_FIX_K6              = 0x02000,
+       CALIB_RATIONAL_MODEL      = 0x04000,
+       CALIB_THIN_PRISM_MODEL    = 0x08000,
+       CALIB_FIX_S1_S2_S3_S4     = 0x10000,
+       CALIB_TILTED_MODEL        = 0x40000, // bit 0x20000 is CALIB_USE_LU (1 << 17), declared further down
+       CALIB_FIX_TAUX_TAUY       = 0x80000,
+       CALIB_USE_QR              = 0x100000, //!< use QR instead of SVD decomposition for solving. Faster but potentially less precise
+       CALIB_FIX_TANGENT_DIST    = 0x200000,
+       // only for stereo
+       CALIB_FIX_INTRINSIC       = 0x00100,
+       CALIB_SAME_FOCAL_LENGTH   = 0x00200,
+       // for stereo rectification
+       CALIB_ZERO_DISPARITY      = 0x00400,
+       CALIB_USE_LU              = (1 << 17), //!< use LU instead of SVD decomposition for solving. Much faster but potentially less precise
+       CALIB_USE_EXTRINSIC_GUESS = (1 << 22)  //!< for stereoCalibrate
+     };
+
+//! The algorithm for finding the fundamental matrix (FM_LMEDS and FM_RANSAC have the same numeric values as LMEDS and RANSAC)
+enum { FM_7POINT = 1, //!< 7-point algorithm
+       FM_8POINT = 2, //!< 8-point algorithm
+       FM_LMEDS  = 4, //!< least-median algorithm. 7-point algorithm is used.
+       FM_RANSAC = 8  //!< RANSAC algorithm. It needs at least 15 points. 7-point algorithm is used.
+     };
+
+enum HandEyeCalibrationMethod // hand-eye calibration method selector; each enumerator comment names the publication it cites
+{
+    CALIB_HAND_EYE_TSAI         = 0, //!< A New Technique for Fully Autonomous and Efficient 3D Robotics Hand/Eye Calibration @cite Tsai89
+    CALIB_HAND_EYE_PARK         = 1, //!< Robot Sensor Calibration: Solving AX = XB on the Euclidean Group @cite Park94
+    CALIB_HAND_EYE_HORAUD       = 2, //!< Hand-eye Calibration @cite Horaud95
+    CALIB_HAND_EYE_ANDREFF      = 3, //!< On-line Hand-Eye Calibration @cite Andreff99
+    CALIB_HAND_EYE_DANIILIDIS   = 4  //!< Hand-Eye Calibration Using Dual Quaternions @cite Daniilidis98
+};
+
+enum RobotWorldHandEyeCalibrationMethod // robot-world/hand-eye calibration method selector; each enumerator comment names the publication it cites
+{
+    CALIB_ROBOT_WORLD_HAND_EYE_SHAH = 0, //!< Solving the robot-world/hand-eye calibration problem using the Kronecker product @cite Shah2013SolvingTR
+    CALIB_ROBOT_WORLD_HAND_EYE_LI   = 1  //!< Simultaneous robot-world and hand-eye calibration using dual-quaternions and Kronecker product @cite Li2010SimultaneousRA
+};
+
+enum SamplingMethod { SAMPLING_UNIFORM=0, SAMPLING_PROGRESSIVE_NAPSAC=1, SAMPLING_NAPSAC=2,
+        SAMPLING_PROSAC=3 }; // type of UsacParams::sampler
+enum LocalOptimMethod {LOCAL_OPTIM_NULL=0, LOCAL_OPTIM_INNER_LO=1, LOCAL_OPTIM_INNER_AND_ITER_LO=2,
+        LOCAL_OPTIM_GC=3, LOCAL_OPTIM_SIGMA=4}; // type of UsacParams::loMethod
+enum ScoreMethod {SCORE_METHOD_RANSAC=0, SCORE_METHOD_MSAC=1, SCORE_METHOD_MAGSAC=2, SCORE_METHOD_LMEDS=3}; // type of UsacParams::score
+enum NeighborSearchMethod { NEIGH_FLANN_KNN=0, NEIGH_GRID=1, NEIGH_FLANN_RADIUS=2 }; // type of UsacParams::neighborsSearch
+enum PolishingMethod { NONE_POLISHER=0, LSQ_POLISHER=1, MAGSAC=2, COV_POLISHER=3 }; // type of UsacParams::final_polisher
+
+struct CV_EXPORTS_W_SIMPLE UsacParams
+{ // in alphabetical order (except the final_polisher* fields, which are appended at the end)
+    CV_WRAP UsacParams(); // defined out of line; the fields below have no in-class initializers in this header
+    CV_PROP_RW double confidence;
+    CV_PROP_RW bool isParallel;
+    CV_PROP_RW int loIterations;
+    CV_PROP_RW LocalOptimMethod loMethod;
+    CV_PROP_RW int loSampleSize;
+    CV_PROP_RW int maxIterations;
+    CV_PROP_RW NeighborSearchMethod neighborsSearch;
+    CV_PROP_RW int randomGeneratorState;
+    CV_PROP_RW SamplingMethod sampler;
+    CV_PROP_RW ScoreMethod score;
+    CV_PROP_RW double threshold;
+    CV_PROP_RW PolishingMethod final_polisher;
+    CV_PROP_RW int final_polisher_iterations;
+};
+
+/** @brief Converts a rotation matrix to a rotation vector or vice versa.
+
+@param src Input rotation vector (3x1 or 1x3) or rotation matrix (3x3).
+@param dst Output rotation matrix (3x3) or rotation vector (3x1 or 1x3), respectively.
+@param jacobian Optional output Jacobian matrix, 3x9 or 9x3, which is a matrix of partial
+derivatives of the output array components with respect to the input array components.
+
+\f[\begin{array}{l} \theta \leftarrow norm(r) \\ r  \leftarrow r/ \theta \\ R =  \cos(\theta) I + (1- \cos{\theta} ) r r^T +  \sin(\theta) \vecthreethree{0}{-r_z}{r_y}{r_z}{0}{-r_x}{-r_y}{r_x}{0} \end{array}\f]
+
+The inverse transformation can also be done easily, since
+
+\f[\sin ( \theta ) \vecthreethree{0}{-r_z}{r_y}{r_z}{0}{-r_x}{-r_y}{r_x}{0} = \frac{R - R^T}{2}\f]
+
+A rotation vector is a convenient and most compact representation of a rotation matrix (since any
+rotation matrix has just 3 degrees of freedom). The representation is used in the global 3D geometry
+optimization procedures like @ref calibrateCamera, @ref stereoCalibrate, or @ref solvePnP .
+
+@note More information about the computation of the derivative of a 3D rotation matrix with respect to its exponential coordinate
+can be found in:
+    - A Compact Formula for the Derivative of a 3-D Rotation in Exponential Coordinates, Guillermo Gallego, Anthony J. Yezzi @cite Gallego2014ACF
+
+@note Useful information on SE(3) and Lie Groups can be found in:
+    - A tutorial on SE(3) transformation parameterizations and on-manifold optimization, Jose-Luis Blanco @cite blanco2010tutorial
+    - Lie Groups for 2D and 3D Transformation, Ethan Eade @cite Eade17
+    - A micro Lie theory for state estimation in robotics, Joan Solà, Jérémie Deray, Dinesh Atchuthan @cite Sol2018AML
+ */
+CV_EXPORTS_W void Rodrigues( InputArray src, OutputArray dst, OutputArray jacobian = noArray() );
+
+
+
+/** Levenberg-Marquardt solver. Starting with the specified vector of parameters it
+    optimizes the target vector criteria "err"
+    (finds local minima of each target vector component absolute value).
+
+    When needed, it calls the user-provided callback (see LMSolver::Callback).
+*/
+class CV_EXPORTS LMSolver : public Algorithm
+{
+public:
+    class CV_EXPORTS Callback
+    {
+    public:
+        virtual ~Callback() {}
+        /**
+         Computes error and Jacobian for the specified vector of parameters.
+
+         @param param the current vector of parameters
+         @param err output vector of errors: err_i = actual_f_i - ideal_f_i
+         @param J output Jacobian: J_ij = d(ideal_f_i)/d(param_j)
+
+         When J=noArray(), it means that it does not need to be computed.
+         Dimensionality of error vector and param vector can be different.
+         The callback should explicitly allocate (with "create" method) each output array
+         (unless it's noArray()).
+        */
+        virtual bool compute(InputArray param, OutputArray err, OutputArray J) const = 0; // NOTE(review): meaning of the bool result is not documented here - presumably success/failure; confirm
+    };
+
+    /**
+       Runs Levenberg-Marquardt algorithm using the passed vector of parameters as the start point.
+       The final vector of parameters (whether the algorithm converged or not) is stored at the same
+       vector. The method returns the number of iterations used. If it's equal to the previously specified
+       maxIters, it is likely that the algorithm did not converge.
+
+       @param param initial/final vector of parameters.
+
+       Note that the dimensionality of parameter space is defined by the size of param vector,
+       and the dimensionality of optimized criteria is defined by the size of err vector
+       computed by the callback.
+    */
+    virtual int run(InputOutputArray param) const = 0;
+
+    /**
+       Sets the maximum number of iterations
+       @param maxIters the number of iterations
+    */
+    virtual void setMaxIters(int maxIters) = 0;
+    /**
+       Retrieves the current maximum number of iterations
+    */
+    virtual int getMaxIters() const = 0;
+
+    /**
+       Creates Levenberg-Marquardt solver
+
+       @param cb callback
+       @param maxIters maximum number of iterations that can be further
+         modified using setMaxIters() method.
+    */
+    static Ptr<LMSolver> create(const Ptr<LMSolver::Callback>& cb, int maxIters);
+    static Ptr<LMSolver> create(const Ptr<LMSolver::Callback>& cb, int maxIters, double eps); // NOTE(review): eps is undocumented in this header - presumably a convergence tolerance; confirm
+};
+
+
+
+/** @example samples/cpp/tutorial_code/features2D/Homography/pose_from_homography.cpp
+An example program about pose estimation from coplanar points
+
+Check @ref tutorial_homography "the corresponding tutorial" for more details
+*/
+
+/** @brief Finds a perspective transformation between two planes.
+
+@param srcPoints Coordinates of the points in the original plane, a matrix of the type CV_32FC2
+or vector\<Point2f\> .
+@param dstPoints Coordinates of the points in the target plane, a matrix of the type CV_32FC2 or
+a vector\<Point2f\> .
+@param method Method used to compute a homography matrix. The following methods are possible:
+-   **0** - a regular method using all the points, i.e., the least squares method
+-   @ref RANSAC - RANSAC-based robust method
+-   @ref LMEDS - Least-Median robust method
+-   @ref RHO - PROSAC-based robust method
+@param ransacReprojThreshold Maximum allowed reprojection error to treat a point pair as an inlier
+(used in the RANSAC and RHO methods only). That is, if
+\f[\| \texttt{dstPoints} _i -  \texttt{convertPointsHomogeneous} ( \texttt{H} \cdot \texttt{srcPoints} _i) \|_2  >  \texttt{ransacReprojThreshold}\f]
+then the point \f$i\f$ is considered as an outlier. If srcPoints and dstPoints are measured in pixels,
+it usually makes sense to set this parameter somewhere in the range of 1 to 10.
+@param mask Optional output mask set by a robust method ( RANSAC or LMeDS ). Note that the input
+mask values are ignored.
+@param maxIters The maximum number of RANSAC iterations.
+@param confidence Confidence level, between 0 and 1.
+
+The function finds and returns the perspective transformation \f$H\f$ between the source and the
+destination planes:
+
+\f[s_i  \vecthree{x'_i}{y'_i}{1} \sim H  \vecthree{x_i}{y_i}{1}\f]
+
+so that the back-projection error
+
+\f[\sum _i \left ( x'_i- \frac{h_{11} x_i + h_{12} y_i + h_{13}}{h_{31} x_i + h_{32} y_i + h_{33}} \right )^2+ \left ( y'_i- \frac{h_{21} x_i + h_{22} y_i + h_{23}}{h_{31} x_i + h_{32} y_i + h_{33}} \right )^2\f]
+
+is minimized. If the parameter method is set to the default value 0, the function uses all the point
+pairs to compute an initial homography estimate with a simple least-squares scheme.
+
+However, if not all of the point pairs ( \f$srcPoints_i\f$, \f$dstPoints_i\f$ ) fit the rigid perspective
+transformation (that is, there are some outliers), this initial estimate will be poor. In this case,
+you can use one of the three robust methods. The methods RANSAC, LMeDS and RHO try many different
+random subsets of the corresponding point pairs (of four pairs each, collinear pairs are discarded), estimate the homography matrix
+using this subset and a simple least-squares algorithm, and then compute the quality/goodness of the
+computed homography (which is the number of inliers for RANSAC or the least median re-projection error for
+LMeDS). The best subset is then used to produce the initial estimate of the homography matrix and
+the mask of inliers/outliers.
+
+Regardless of the method, robust or not, the computed homography matrix is refined further (using
+inliers only in case of a robust method) with the Levenberg-Marquardt method to reduce the
+re-projection error even more.
+
+The methods RANSAC and RHO can handle practically any ratio of outliers but need a threshold to
+distinguish inliers from outliers. The method LMeDS does not need any threshold but it works
+correctly only when there are more than 50% of inliers. Finally, if there are no outliers and the
+noise is rather small, use the default method (method=0).
+
+The function is used to find initial intrinsic and extrinsic matrices. Homography matrix is
+determined up to a scale. If \f$h_{33}\f$ is non-zero, the matrix is normalized so that \f$h_{33}=1\f$.
+@note Whenever an \f$H\f$ matrix cannot be estimated, an empty one will be returned.
+
+@sa
+getAffineTransform, estimateAffine2D, estimateAffinePartial2D, getPerspectiveTransform, warpPerspective,
+perspectiveTransform
+ */
+CV_EXPORTS_W Mat findHomography( InputArray srcPoints, InputArray dstPoints,
+                                 int method = 0, double ransacReprojThreshold = 3,
+                                 OutputArray mask=noArray(), const int maxIters = 2000,
+                                 const double confidence = 0.995);
+
+/** @overload */
+CV_EXPORTS Mat findHomography( InputArray srcPoints, InputArray dstPoints,
+                               OutputArray mask, int method = 0, double ransacReprojThreshold = 3 );
+
+// Overload that takes its estimation settings from a UsacParams object instead of individual arguments.
+CV_EXPORTS_W Mat findHomography(InputArray srcPoints, InputArray dstPoints, OutputArray mask,
+                   const UsacParams &params);
+
+/** @brief Computes an RQ decomposition of 3x3 matrices.
+
+@param src 3x3 input matrix.
+@param mtxR Output 3x3 upper-triangular matrix.
+@param mtxQ Output 3x3 orthogonal matrix.
+@param Qx Optional output 3x3 rotation matrix around x-axis.
+@param Qy Optional output 3x3 rotation matrix around y-axis.
+@param Qz Optional output 3x3 rotation matrix around z-axis.
+
+The function computes an RQ decomposition using the given rotations. This function is used in
+#decomposeProjectionMatrix to decompose the left 3x3 submatrix of a projection matrix into a camera
+and a rotation matrix.
+
+It optionally returns three rotation matrices, one for each axis, and the three Euler angles in
+degrees (as the return value) that could be used in OpenGL. Note, there is always more than one
+sequence of rotations about the three principal axes that results in the same orientation of an
+object, e.g. see @cite Slabaugh . The returned three rotation matrices and corresponding three Euler
+angles are only one of the possible solutions.
+ */
+CV_EXPORTS_W Vec3d RQDecomp3x3( InputArray src, OutputArray mtxR, OutputArray mtxQ,
+                                OutputArray Qx = noArray(),
+                                OutputArray Qy = noArray(),
+                                OutputArray Qz = noArray());
+
+/** @brief Decomposes a projection matrix into a rotation matrix and a camera intrinsic matrix.
+
+@param projMatrix 3x4 input projection matrix P.
+@param cameraMatrix Output 3x3 camera intrinsic matrix \f$\cameramatrix{A}\f$.
+@param rotMatrix Output 3x3 external rotation matrix R.
+@param transVect Output 4x1 translation vector T.
+@param rotMatrixX Optional 3x3 rotation matrix around x-axis.
+@param rotMatrixY Optional 3x3 rotation matrix around y-axis.
+@param rotMatrixZ Optional 3x3 rotation matrix around z-axis.
+@param eulerAngles Optional three-element vector containing three Euler angles of rotation in
+degrees.
+
+The function computes a decomposition of a projection matrix into a calibration and a rotation
+matrix and the position of a camera.
+
+It optionally returns three rotation matrices, one for each axis, and three Euler angles that could
+be used in OpenGL. Note, there is always more than one sequence of rotations about the three
+principal axes that results in the same orientation of an object, e.g. see @cite Slabaugh . Returned
+three rotation matrices and corresponding three Euler angles are only one of the possible solutions.
+
+The function is based on #RQDecomp3x3 .
+ */
+CV_EXPORTS_W void decomposeProjectionMatrix( InputArray projMatrix, OutputArray cameraMatrix,
+                                             OutputArray rotMatrix, OutputArray transVect, // transVect is documented as 4x1, not 3x1; NOTE(review): presumably homogeneous - confirm
+                                             OutputArray rotMatrixX = noArray(),
+                                             OutputArray rotMatrixY = noArray(),
+                                             OutputArray rotMatrixZ = noArray(),
+                                             OutputArray eulerAngles =noArray() );
+
+/** @brief Computes partial derivatives of the matrix product for each multiplied matrix.
+
+@param A First multiplied matrix.
+@param B Second multiplied matrix.
+@param dABdA First output derivative matrix d(A\*B)/dA of size
+\f$\texttt{A.rows*B.cols} \times \texttt{A.rows*A.cols}\f$ .
+@param dABdB Second output derivative matrix d(A\*B)/dB of size
+\f$\texttt{A.rows*B.cols} \times \texttt{B.rows*B.cols}\f$ .
+
+The function computes partial derivatives of the elements of the matrix product \f$A*B\f$ with regard to
+the elements of each of the two input matrices. The function is used to compute the Jacobian
+matrices in #stereoCalibrate but can also be used in any other similar optimization function.
+ */
+CV_EXPORTS_W void matMulDeriv( InputArray A, InputArray B, OutputArray dABdA, OutputArray dABdB );
+
+/** @brief Combines two rotation-and-shift transformations.
+
+@param rvec1 First rotation vector.
+@param tvec1 First translation vector.
+@param rvec2 Second rotation vector.
+@param tvec2 Second translation vector.
+@param rvec3 Output rotation vector of the superposition.
+@param tvec3 Output translation vector of the superposition.
+@param dr3dr1 Optional output derivative of rvec3 with regard to rvec1
+@param dr3dt1 Optional output derivative of rvec3 with regard to tvec1
+@param dr3dr2 Optional output derivative of rvec3 with regard to rvec2
+@param dr3dt2 Optional output derivative of rvec3 with regard to tvec2
+@param dt3dr1 Optional output derivative of tvec3 with regard to rvec1
+@param dt3dt1 Optional output derivative of tvec3 with regard to tvec1
+@param dt3dr2 Optional output derivative of tvec3 with regard to rvec2
+@param dt3dt2 Optional output derivative of tvec3 with regard to tvec2
+
+The function computes:
+
+\f[\begin{array}{l} \texttt{rvec3} =  \mathrm{rodrigues} ^{-1} \left ( \mathrm{rodrigues} ( \texttt{rvec2} )  \cdot \mathrm{rodrigues} ( \texttt{rvec1} ) \right )  \\ \texttt{tvec3} =  \mathrm{rodrigues} ( \texttt{rvec2} )  \cdot \texttt{tvec1} +  \texttt{tvec2} \end{array} ,\f]
+
+where \f$\mathrm{rodrigues}\f$ denotes a rotation vector to a rotation matrix transformation, and
+\f$\mathrm{rodrigues}^{-1}\f$ denotes the inverse transformation. See #Rodrigues for details.
+
+Also, the function can compute the derivatives of the output vectors with regard to the input
+vectors (see #matMulDeriv ). The function is used inside #stereoCalibrate but can also be used in
+your own code where Levenberg-Marquardt or another gradient-based solver is used to optimize a
+function that contains a matrix multiplication.
+ */
+CV_EXPORTS_W void composeRT( InputArray rvec1, InputArray tvec1,
+                             InputArray rvec2, InputArray tvec2,
+                             OutputArray rvec3, OutputArray tvec3,
+                             OutputArray dr3dr1 = noArray(), OutputArray dr3dt1 = noArray(),
+                             OutputArray dr3dr2 = noArray(), OutputArray dr3dt2 = noArray(),
+                             OutputArray dt3dr1 = noArray(), OutputArray dt3dt1 = noArray(),
+                             OutputArray dt3dr2 = noArray(), OutputArray dt3dt2 = noArray() );
+
+/** @brief Projects 3D points to an image plane.
+
+@param objectPoints Array of object points expressed wrt. the world coordinate frame. A 3xN/Nx3
+1-channel or 1xN/Nx1 3-channel (or vector\<Point3f\> ), where N is the number of points in the view.
+@param rvec The rotation vector (@ref Rodrigues) that, together with tvec, performs a change of
+basis from world to camera coordinate system, see @ref calibrateCamera for details.
+@param tvec The translation vector, see parameter description above.
+@param cameraMatrix Camera intrinsic matrix \f$\cameramatrix{A}\f$ .
+@param distCoeffs Input vector of distortion coefficients
+\f$\distcoeffs\f$ . If the vector is empty, the zero distortion coefficients are assumed.
+@param imagePoints Output array of image points, 1xN/Nx1 2-channel, or
+vector\<Point2f\> .
+@param jacobian Optional output 2Nx(10+\<numDistCoeffs\>) jacobian matrix of derivatives of image
+points with respect to components of the rotation vector, translation vector, focal lengths,
+coordinates of the principal point and the distortion coefficients. In the old interface different
+components of the jacobian are returned via different output parameters.
+@param aspectRatio Optional "fixed aspect ratio" parameter. If the parameter is not 0, the
+function assumes that the aspect ratio (\f$f_x / f_y\f$) is fixed and correspondingly adjusts the
+jacobian matrix.
+
+The function computes the 2D projections of 3D points to the image plane, given intrinsic and
+extrinsic camera parameters. Optionally, the function computes Jacobians - matrices of partial
+derivatives of image points coordinates (as functions of all the input parameters) with respect to
+the particular parameters, intrinsic and/or extrinsic. The Jacobians are used during the global
+optimization in @ref calibrateCamera, @ref solvePnP, and @ref stereoCalibrate. The function itself
+can also be used to compute a re-projection error, given the current intrinsic and extrinsic
+parameters.
+
+@note By setting rvec = tvec = \f$[0, 0, 0]\f$, or by setting cameraMatrix to a 3x3 identity matrix,
+or by passing zero distortion coefficients, one can get various useful partial cases of the
+function. This means, one can compute the distorted coordinates for a sparse set of points or apply
+a perspective transformation (and also compute the derivatives) in the ideal zero-distortion setup.
+ */
+CV_EXPORTS_W void projectPoints( InputArray objectPoints,
+                                 InputArray rvec, InputArray tvec,
+                                 InputArray cameraMatrix, InputArray distCoeffs,
+                                 OutputArray imagePoints,
+                                 OutputArray jacobian = noArray(),
+                                 double aspectRatio = 0 );
+
+/** @example samples/cpp/tutorial_code/features2D/Homography/homography_from_camera_displacement.cpp
+An example program about homography from the camera displacement
+
+Check @ref tutorial_homography "the corresponding tutorial" for more details
+*/
+
+/** @brief Finds an object pose from 3D-2D point correspondences.
+
+@see @ref calib3d_solvePnP
+
+This function returns the rotation and the translation vectors that transform a 3D point expressed in the object
+coordinate frame to the camera coordinate frame, using different methods:
+- P3P methods (@ref SOLVEPNP_P3P, @ref SOLVEPNP_AP3P): need 4 input points to return a unique solution.
+- @ref SOLVEPNP_IPPE Input points must be >= 4 and object points must be coplanar.
+- @ref SOLVEPNP_IPPE_SQUARE Special case suitable for marker pose estimation.
+Number of input points must be 4. Object points must be defined in the following order:
+  - point 0: [-squareLength / 2,  squareLength / 2, 0]
+  - point 1: [ squareLength / 2,  squareLength / 2, 0]
+  - point 2: [ squareLength / 2, -squareLength / 2, 0]
+  - point 3: [-squareLength / 2, -squareLength / 2, 0]
+- for all the other flags, number of input points must be >= 4 and object points can be in any configuration.
+
+@param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or
+1xN/Nx1 3-channel, where N is the number of points. vector\<Point3d\> can be also passed here.
+@param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel,
+where N is the number of points. vector\<Point2d\> can be also passed here.
+@param cameraMatrix Input camera intrinsic matrix \f$\cameramatrix{A}\f$ .
+@param distCoeffs Input vector of distortion coefficients
+\f$\distcoeffs\f$. If the vector is NULL/empty, the zero distortion coefficients are
+assumed.
+@param rvec Output rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
+the model coordinate system to the camera coordinate system.
+@param tvec Output translation vector.
+@param useExtrinsicGuess Parameter used for #SOLVEPNP_ITERATIVE. If true (1), the function uses
+the provided rvec and tvec values as initial approximations of the rotation and translation
+vectors, respectively, and further optimizes them.
+@param flags Method for solving a PnP problem: see @ref calib3d_solvePnP_flags
+
+More information about Perspective-n-Points is described in @ref calib3d_solvePnP
+
+@note
+   -   An example of how to use solvePnP for planar augmented reality can be found at
+        opencv_source_code/samples/python/plane_ar.py
+   -   If you are using Python:
+        - Numpy array slices won't work as input because solvePnP requires contiguous
+        arrays (enforced by the assertion using cv::Mat::checkVector() around line 55 of
+        modules/calib3d/src/solvepnp.cpp version 2.4.9)
+        - The P3P algorithm requires image points to be in an array of shape (N,1,2) due
+        to its calling of #undistortPoints (around line 75 of modules/calib3d/src/solvepnp.cpp version 2.4.9)
+        which requires 2-channel information.
+        - Thus, given some data D = np.array(...) where D.shape = (N,M), in order to use a subset of
+        it as, e.g., imagePoints, one must effectively copy it into a new array: imagePoints =
+        np.ascontiguousarray(D[:,:2]).reshape((N,1,2))
+   -   The methods @ref SOLVEPNP_DLS and @ref SOLVEPNP_UPNP cannot be used as the current implementations are
+       unstable and sometimes give completely wrong results. If you pass one of these two
+       flags, @ref SOLVEPNP_EPNP method will be used instead.
+   -   The minimum number of points is 4 in the general case. In the case of @ref SOLVEPNP_P3P and @ref SOLVEPNP_AP3P
+       methods, it is required to use exactly 4 points (the first 3 points are used to estimate all the solutions
+       of the P3P problem, the last one is used to retain the best solution that minimizes the reprojection error).
+   -   With @ref SOLVEPNP_ITERATIVE method and `useExtrinsicGuess=true`, the minimum number of points is 3 (3 points
+       are sufficient to compute a pose but there are up to 4 solutions). The initial solution should be close to the
+       global solution to converge.
+   -   With @ref SOLVEPNP_IPPE input points must be >= 4 and object points must be coplanar.
+   -   With @ref SOLVEPNP_IPPE_SQUARE this is a special case suitable for marker pose estimation.
+       Number of input points must be 4. Object points must be defined in the following order:
+         - point 0: [-squareLength / 2,  squareLength / 2, 0]
+         - point 1: [ squareLength / 2,  squareLength / 2, 0]
+         - point 2: [ squareLength / 2, -squareLength / 2, 0]
+         - point 3: [-squareLength / 2, -squareLength / 2, 0]
+   -   With @ref SOLVEPNP_SQPNP input points must be >= 3
+ */
+CV_EXPORTS_W bool solvePnP( InputArray objectPoints, InputArray imagePoints,
+                            InputArray cameraMatrix, InputArray distCoeffs,
+                            OutputArray rvec, OutputArray tvec,
+                            bool useExtrinsicGuess = false, int flags = SOLVEPNP_ITERATIVE );
+
+/** @brief Finds an object pose from 3D-2D point correspondences using the RANSAC scheme.
+
+@see @ref calib3d_solvePnP
+
+@param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or
+1xN/Nx1 3-channel, where N is the number of points. vector\<Point3d\> can be also passed here.
+@param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel,
+where N is the number of points. vector\<Point2d\> can be also passed here.
+@param cameraMatrix Input camera intrinsic matrix \f$\cameramatrix{A}\f$ .
+@param distCoeffs Input vector of distortion coefficients
+\f$\distcoeffs\f$. If the vector is NULL/empty, the zero distortion coefficients are
+assumed.
+@param rvec Output rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
+the model coordinate system to the camera coordinate system.
+@param tvec Output translation vector.
+@param useExtrinsicGuess Parameter used for @ref SOLVEPNP_ITERATIVE. If true (1), the function uses
+the provided rvec and tvec values as initial approximations of the rotation and translation
+vectors, respectively, and further optimizes them.
+@param iterationsCount Number of iterations.
+@param reprojectionError Inlier threshold value used by the RANSAC procedure. The parameter value
+is the maximum allowed distance between the observed and computed point projections to consider it
+an inlier.
+@param confidence The probability that the algorithm produces a useful result.
+@param inliers Output vector that contains indices of inliers in objectPoints and imagePoints.
+@param flags Method for solving a PnP problem (see @ref solvePnP ).
+
+The function estimates an object pose given a set of object points, their corresponding image
+projections, as well as the camera intrinsic matrix and the distortion coefficients. This function finds such
+a pose that minimizes reprojection error, that is, the sum of squared distances between the observed
+projections imagePoints and the projected (using @ref projectPoints ) objectPoints. The use of RANSAC
+makes the function resistant to outliers.
+
+@note
+   -   An example of how to use solvePnPRansac for object detection can be found at
+        opencv_source_code/samples/cpp/tutorial_code/calib3d/real_time_pose_estimation/
+   -   The default method used to estimate the camera pose for the Minimal Sample Sets step
+       is #SOLVEPNP_EPNP. Exceptions are:
+         - if you choose #SOLVEPNP_P3P or #SOLVEPNP_AP3P, these methods will be used.
+         - if the number of input points is equal to 4, #SOLVEPNP_P3P is used.
+   -   The method used to estimate the camera pose using all the inliers is defined by the
+       flags parameters unless it is equal to #SOLVEPNP_P3P or #SOLVEPNP_AP3P. In this case,
+       the method #SOLVEPNP_EPNP will be used instead.
+ */
+CV_EXPORTS_W bool solvePnPRansac( InputArray objectPoints, InputArray imagePoints,
+                                  InputArray cameraMatrix, InputArray distCoeffs,
+                                  OutputArray rvec, OutputArray tvec,
+                                  bool useExtrinsicGuess = false, int iterationsCount = 100,
+                                  float reprojectionError = 8.0, double confidence = 0.99,
+                                  OutputArray inliers = noArray(), int flags = SOLVEPNP_ITERATIVE );
+
+
+/*
+Overload of solvePnPRansac configured through UsacParams: finds the rotation and translation vectors.
+If cameraMatrix is given, P3P is run; otherwise linear P6P is run and cameraMatrix is output as well.
+*/
+CV_EXPORTS_W bool solvePnPRansac( InputArray objectPoints, InputArray imagePoints,
+                     InputOutputArray cameraMatrix, InputArray distCoeffs,
+                     OutputArray rvec, OutputArray tvec, OutputArray inliers,
+                     const UsacParams &params=UsacParams());
+
+/** @brief Finds an object pose from 3 3D-2D point correspondences.
+
+@see @ref calib3d_solvePnP
+
+@param objectPoints Array of object points in the object coordinate space, 3x3 1-channel or
+1x3/3x1 3-channel. vector\<Point3f\> can be also passed here.
+@param imagePoints Array of corresponding image points, 3x2 1-channel or 1x3/3x1 2-channel.
+ vector\<Point2f\> can be also passed here.
+@param cameraMatrix Input camera intrinsic matrix \f$\cameramatrix{A}\f$ .
+@param distCoeffs Input vector of distortion coefficients
+\f$\distcoeffs\f$. If the vector is NULL/empty, the zero distortion coefficients are
+assumed.
+@param rvecs Output rotation vectors (see @ref Rodrigues ) that, together with tvecs, bring points from
+the model coordinate system to the camera coordinate system. A P3P problem has up to 4 solutions.
+@param tvecs Output translation vectors.
+@param flags Method for solving a P3P problem:
+-   @ref SOLVEPNP_P3P Method is based on the paper of X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang
+"Complete Solution Classification for the Perspective-Three-Point Problem" (@cite gao2003complete).
+-   @ref SOLVEPNP_AP3P Method is based on the paper of T. Ke and S. Roumeliotis.
+"An Efficient Algebraic Solution to the Perspective-Three-Point Problem" (@cite Ke17).
+
+The function estimates the object pose given 3 object points, their corresponding image
+projections, as well as the camera intrinsic matrix and the distortion coefficients.
+
+@note
+The solutions are sorted by reprojection errors (lowest to highest).
+ */
+CV_EXPORTS_W int solveP3P( InputArray objectPoints, InputArray imagePoints,
+                           InputArray cameraMatrix, InputArray distCoeffs,
+                           OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs,
+                           int flags );
+
+/** @brief Refine a pose (the translation and the rotation that transform a 3D point expressed in the object coordinate frame
+to the camera coordinate frame) from a 3D-2D point correspondences and starting from an initial solution.
+
+@see @ref calib3d_solvePnP
+
+@param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or 1xN/Nx1 3-channel,
+where N is the number of points. vector\<Point3d\> can also be passed here.
+@param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel,
+where N is the number of points. vector\<Point2d\> can also be passed here.
+@param cameraMatrix Input camera intrinsic matrix \f$\cameramatrix{A}\f$ .
+@param distCoeffs Input vector of distortion coefficients
+\f$\distcoeffs\f$. If the vector is NULL/empty, the zero distortion coefficients are
+assumed.
+@param rvec Input/Output rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
+the model coordinate system to the camera coordinate system. Input values are used as an initial solution.
+@param tvec Input/Output translation vector. Input values are used as an initial solution.
+@param criteria Criteria when to stop the Levenberg-Marquardt iterative algorithm.
+
+The function refines the object pose given at least 3 object points, their corresponding image
+projections, an initial solution for the rotation and translation vector,
+as well as the camera intrinsic matrix and the distortion coefficients.
+The function minimizes the projection error with respect to the rotation and the translation vectors, according
+to a Levenberg-Marquardt iterative minimization @cite Madsen04 @cite Eade13 process.
+ */
+CV_EXPORTS_W void solvePnPRefineLM( InputArray objectPoints, InputArray imagePoints,
+                                    InputArray cameraMatrix, InputArray distCoeffs,
+                                    InputOutputArray rvec, InputOutputArray tvec,
+                                    TermCriteria criteria = TermCriteria(TermCriteria::EPS + TermCriteria::COUNT, 20, FLT_EPSILON));
+
+/** @brief Refine a pose (the translation and the rotation that transform a 3D point expressed in the object coordinate frame
+to the camera coordinate frame) from a 3D-2D point correspondences and starting from an initial solution.
+
+@see @ref calib3d_solvePnP
+
+@param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or 1xN/Nx1 3-channel,
+where N is the number of points. vector\<Point3d\> can also be passed here.
+@param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel,
+where N is the number of points. vector\<Point2d\> can also be passed here.
+@param cameraMatrix Input camera intrinsic matrix \f$\cameramatrix{A}\f$ .
+@param distCoeffs Input vector of distortion coefficients
+\f$\distcoeffs\f$. If the vector is NULL/empty, the zero distortion coefficients are
+assumed.
+@param rvec Input/Output rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
+the model coordinate system to the camera coordinate system. Input values are used as an initial solution.
+@param tvec Input/Output translation vector. Input values are used as an initial solution.
+@param criteria Criteria when to stop the iterative virtual visual servoing (VVS) algorithm.
+@param VVSlambda Gain for the virtual visual servoing control law, equivalent to the \f$\alpha\f$
+gain in the Damped Gauss-Newton formulation.
+
+The function refines the object pose given at least 3 object points, their corresponding image
+projections, an initial solution for the rotation and translation vector,
+as well as the camera intrinsic matrix and the distortion coefficients.
+The function minimizes the projection error with respect to the rotation and the translation vectors, using a
+virtual visual servoing (VVS) @cite Chaumette06 @cite Marchand16 scheme.
+ */
+CV_EXPORTS_W void solvePnPRefineVVS( InputArray objectPoints, InputArray imagePoints,
+                                     InputArray cameraMatrix, InputArray distCoeffs,
+                                     InputOutputArray rvec, InputOutputArray tvec,
+                                     TermCriteria criteria = TermCriteria(TermCriteria::EPS + TermCriteria::COUNT, 20, FLT_EPSILON),
+                                     double VVSlambda = 1);
+
+/** @brief Finds an object pose from 3D-2D point correspondences.
+
+@see @ref calib3d_solvePnP
+
+This function returns a list of all the possible solutions (a solution is a \<rotation vector, translation vector\>
+couple), depending on the number of input points and the chosen method:
+- P3P methods (@ref SOLVEPNP_P3P, @ref SOLVEPNP_AP3P): 3 or 4 input points. Number of returned solutions can be between 0 and 4 with 3 input points.
+- @ref SOLVEPNP_IPPE Input points must be >= 4 and object points must be coplanar. Returns 2 solutions.
+- @ref SOLVEPNP_IPPE_SQUARE Special case suitable for marker pose estimation.
+Number of input points must be 4 and 2 solutions are returned. Object points must be defined in the following order:
+  - point 0: [-squareLength / 2,  squareLength / 2, 0]
+  - point 1: [ squareLength / 2,  squareLength / 2, 0]
+  - point 2: [ squareLength / 2, -squareLength / 2, 0]
+  - point 3: [-squareLength / 2, -squareLength / 2, 0]
+- for all the other flags, number of input points must be >= 4 and object points can be in any configuration.
+Only 1 solution is returned.
+
+@param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or
+1xN/Nx1 3-channel, where N is the number of points. vector\<Point3d\> can be also passed here.
+@param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel,
+where N is the number of points. vector\<Point2d\> can be also passed here.
+@param cameraMatrix Input camera intrinsic matrix \f$\cameramatrix{A}\f$ .
+@param distCoeffs Input vector of distortion coefficients
+\f$\distcoeffs\f$. If the vector is NULL/empty, the zero distortion coefficients are
+assumed.
+@param rvecs Vector of output rotation vectors (see @ref Rodrigues ) that, together with tvecs, bring points from
+the model coordinate system to the camera coordinate system.
+@param tvecs Vector of output translation vectors.
+@param useExtrinsicGuess Parameter used for #SOLVEPNP_ITERATIVE. If true (1), the function uses
+the provided rvec and tvec values as initial approximations of the rotation and translation
+vectors, respectively, and further optimizes them.
+@param flags Method for solving a PnP problem: see @ref calib3d_solvePnP_flags
+@param rvec Rotation vector used to initialize an iterative PnP refinement algorithm, when flag is @ref SOLVEPNP_ITERATIVE
+and useExtrinsicGuess is set to true.
+@param tvec Translation vector used to initialize an iterative PnP refinement algorithm, when flag is @ref SOLVEPNP_ITERATIVE
+and useExtrinsicGuess is set to true.
+@param reprojectionError Optional vector of reprojection error, that is the RMS error
+(\f$ \text{RMSE} = \sqrt{\frac{\sum_{i}^{N} \left ( \hat{y_i} - y_i \right )^2}{N}} \f$) between the input image points
+and the 3D object points projected with the estimated pose.
+
+More information is described in @ref calib3d_solvePnP
+
+@note
+   -   An example of how to use solvePnP for planar augmented reality can be found at
+        opencv_source_code/samples/python/plane_ar.py
+   -   If you are using Python:
+        - Numpy array slices won't work as input because solvePnP requires contiguous
+        arrays (enforced by the assertion using cv::Mat::checkVector() around line 55 of
+        modules/calib3d/src/solvepnp.cpp version 2.4.9)
+        - The P3P algorithm requires image points to be in an array of shape (N,1,2) due
+        to its calling of #undistortPoints (around line 75 of modules/calib3d/src/solvepnp.cpp version 2.4.9)
+        which requires 2-channel information.
+        - Thus, given some data D = np.array(...) where D.shape = (N,M), in order to use a subset of
+        it as, e.g., imagePoints, one must effectively copy it into a new array: imagePoints =
+        np.ascontiguousarray(D[:,:2]).reshape((N,1,2))
+   -   The methods @ref SOLVEPNP_DLS and @ref SOLVEPNP_UPNP cannot be used as the current implementations are
+       unstable and sometimes give completely wrong results. If you pass one of these two
+       flags, @ref SOLVEPNP_EPNP method will be used instead.
+   -   The minimum number of points is 4 in the general case. In the case of @ref SOLVEPNP_P3P and @ref SOLVEPNP_AP3P
+       methods, it is required to use exactly 4 points (the first 3 points are used to estimate all the solutions
+       of the P3P problem, the last one is used to retain the best solution that minimizes the reprojection error).
+   -   With @ref SOLVEPNP_ITERATIVE method and `useExtrinsicGuess=true`, the minimum number of points is 3 (3 points
+       are sufficient to compute a pose but there are up to 4 solutions). The initial solution should be close to the
+       global solution to converge.
+   -   With @ref SOLVEPNP_IPPE input points must be >= 4 and object points must be coplanar.
+   -   With @ref SOLVEPNP_IPPE_SQUARE this is a special case suitable for marker pose estimation.
+       Number of input points must be 4. Object points must be defined in the following order:
+         - point 0: [-squareLength / 2,  squareLength / 2, 0]
+         - point 1: [ squareLength / 2,  squareLength / 2, 0]
+         - point 2: [ squareLength / 2, -squareLength / 2, 0]
+         - point 3: [-squareLength / 2, -squareLength / 2, 0]
+ */
+CV_EXPORTS_W int solvePnPGeneric( InputArray objectPoints, InputArray imagePoints,
+                                  InputArray cameraMatrix, InputArray distCoeffs,
+                                  OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs,
+                                  bool useExtrinsicGuess = false, SolvePnPMethod flags = SOLVEPNP_ITERATIVE,
+                                  InputArray rvec = noArray(), InputArray tvec = noArray(),
+                                  OutputArray reprojectionError = noArray() );
+
+/** @brief Finds an initial camera intrinsic matrix from 3D-2D point correspondences.
+
+@param objectPoints Vector of vectors of the calibration pattern points in the calibration pattern
+coordinate space. In the old interface all the per-view vectors are concatenated. See
+#calibrateCamera for details.
+@param imagePoints Vector of vectors of the projections of the calibration pattern points. In the
+old interface all the per-view vectors are concatenated.
+@param imageSize Image size in pixels used to initialize the principal point.
+@param aspectRatio If it is zero or negative, both \f$f_x\f$ and \f$f_y\f$ are estimated independently.
+Otherwise, \f$f_x = f_y \cdot \texttt{aspectRatio}\f$ .
+
+The function estimates and returns an initial camera intrinsic matrix for the camera calibration process.
+Currently, the function only supports planar calibration patterns, which are patterns where each
+object point has z-coordinate = 0.
+ */
+CV_EXPORTS_W Mat initCameraMatrix2D( InputArrayOfArrays objectPoints,
+                                     InputArrayOfArrays imagePoints,
+                                     Size imageSize, double aspectRatio = 1.0 );
+
+/** @brief Finds the positions of internal corners of the chessboard.
+
+@param image Source chessboard view. It must be an 8-bit grayscale or color image.
+@param patternSize Number of inner corners per a chessboard row and column
+( patternSize = cv::Size(points_per_row,points_per_column) = cv::Size(columns,rows) ).
+@param corners Output array of detected corners.
+@param flags Various operation flags that can be zero or a combination of the following values:
+-   @ref CALIB_CB_ADAPTIVE_THRESH Use adaptive thresholding to convert the image to black
+and white, rather than a fixed threshold level (computed from the average image brightness).
+-   @ref CALIB_CB_NORMALIZE_IMAGE Normalize the image gamma with #equalizeHist before
+applying fixed or adaptive thresholding.
+-   @ref CALIB_CB_FILTER_QUADS Use additional criteria (like contour area, perimeter,
+square-like shape) to filter out false quads extracted at the contour retrieval stage.
+-   @ref CALIB_CB_FAST_CHECK Run a fast check on the image that looks for chessboard corners,
+and shortcut the call if none is found. This can drastically speed up the call in the
+degenerate condition when no chessboard is observed.
+-   @ref CALIB_CB_PLAIN All other flags are ignored. The input image is taken as is.
+No image processing is done to improve finding the checkerboard. This has the effect of speeding up the
+execution of the function but could lead to not recognizing the checkerboard if the image
+is not previously binarized in the appropriate manner.
+
+The function attempts to determine whether the input image is a view of the chessboard pattern and
+locate the internal chessboard corners. The function returns a non-zero value if all of the corners
+are found and they are placed in a certain order (row by row, left to right in every row).
+Otherwise, if the function fails to find all the corners or reorder them, it returns 0. For example,
+a regular chessboard has 8 x 8 squares and 7 x 7 internal corners, that is, points where the black
+squares touch each other. The detected coordinates are approximate, and to determine their positions
+more accurately, the function calls #cornerSubPix. You also may use the function #cornerSubPix with
+different parameters if returned coordinates are not accurate enough.
+
+Sample usage of detecting and drawing chessboard corners:
+@code
+    Size patternsize(8,6); //interior number of corners
+    Mat gray = ....; //source image
+    vector<Point2f> corners; //this will be filled by the detected corners
+
+    //CALIB_CB_FAST_CHECK saves a lot of time on images
+    //that do not contain any chessboard corners
+    bool patternfound = findChessboardCorners(gray, patternsize, corners,
+            CALIB_CB_ADAPTIVE_THRESH + CALIB_CB_NORMALIZE_IMAGE
+            + CALIB_CB_FAST_CHECK);
+
+    if(patternfound)
+      cornerSubPix(gray, corners, Size(11, 11), Size(-1, -1),
+        TermCriteria(TermCriteria::EPS + TermCriteria::MAX_ITER, 30, 0.1));
+
+    drawChessboardCorners(img, patternsize, Mat(corners), patternfound);
+@endcode
+@note The function requires white space (like a square-thick border, the wider the better) around
+the board to make the detection more robust in various environments. Otherwise, if there is no
+border and the background is dark, the outer black squares cannot be segmented properly and so the
+square grouping and ordering algorithm fails.
+
+Use gen_pattern.py (@ref tutorial_camera_calibration_pattern) to create checkerboard.
+ */
+CV_EXPORTS_W bool findChessboardCorners( InputArray image, Size patternSize, OutputArray corners,
+                                         int flags = CALIB_CB_ADAPTIVE_THRESH + CALIB_CB_NORMALIZE_IMAGE );
+
+/*
+   Checks whether the image contains a chessboard of the specified size or not.
+   Returns true if it does.
+*/
+CV_EXPORTS_W bool checkChessboard(InputArray img, Size size);
+
+/** @brief Finds the positions of internal corners of the chessboard using a sector based approach.
+
+@param image Source chessboard view. It must be an 8-bit grayscale or color image.
+@param patternSize Number of inner corners per a chessboard row and column
+( patternSize = cv::Size(points_per_row,points_per_column) = cv::Size(columns,rows) ).
+@param corners Output array of detected corners.
+@param flags Various operation flags that can be zero or a combination of the following values:
+-   @ref CALIB_CB_NORMALIZE_IMAGE Normalize the image gamma with equalizeHist before detection.
+-   @ref CALIB_CB_EXHAUSTIVE Run an exhaustive search to improve detection rate.
+-   @ref CALIB_CB_ACCURACY Up sample input image to improve sub-pixel accuracy due to aliasing effects.
+-   @ref CALIB_CB_LARGER The detected pattern is allowed to be larger than patternSize (see description).
+-   @ref CALIB_CB_MARKER The detected pattern must have a marker (see description).
+This should be used if an accurate camera calibration is required.
+@param meta Optional output array of detected corners (CV_8UC1 and size = cv::Size(columns,rows)).
+Each entry stands for one corner of the pattern and can have one of the following values:
+-   0 = no meta data attached
+-   1 = left-top corner of a black cell
+-   2 = left-top corner of a white cell
+-   3 = left-top corner of a black cell with a white marker dot
+-   4 = left-top corner of a white cell with a black marker dot (pattern origin in case of markers otherwise first corner)
+
+The function is analogous to #findChessboardCorners but uses a localized radon
+transformation approximated by box filters being more robust to all sorts of
+noise, faster on larger images and is able to directly return the sub-pixel
+position of the internal chessboard corners. The method is based on the paper
+@cite duda2018 "Accurate Detection and Localization of Checkerboard Corners for
+Calibration" demonstrating that the returned sub-pixel positions are more
+accurate than the ones returned by cornerSubPix allowing a precise camera
+calibration for demanding applications.
+
+In case the flags @ref CALIB_CB_LARGER or @ref CALIB_CB_MARKER are given,
+the result can be recovered from the optional meta array. Both flags are
+helpful to use calibration patterns exceeding the field of view of the camera.
+These oversized patterns allow more accurate calibrations as corners can be
+utilized, which are as close as possible to the image borders.  For a
+consistent coordinate system across all images, the optional marker (see image
+below) can be used to move the origin of the board to the location where the
+black circle is located.
+
+@note The function requires a white border with roughly the same width as one
+of the checkerboard fields around the whole board to improve the detection in
+various environments. In addition, because of the localized radon
+transformation it is beneficial to use round corners for the field corners
+which are located on the outside of the board. The following figure illustrates
+a sample checkerboard optimized for the detection. However, any other checkerboard
+can be used as well.
+
+Use gen_pattern.py (@ref tutorial_camera_calibration_pattern) to create checkerboard.
+![Checkerboard](pics/checkerboard_radon.png)
+ */
+CV_EXPORTS_AS(findChessboardCornersSBWithMeta)
+bool findChessboardCornersSB(InputArray image,Size patternSize, OutputArray corners,
+                             int flags,OutputArray meta);
+/** @overload */
+CV_EXPORTS_W inline
+bool findChessboardCornersSB(InputArray image, Size patternSize, OutputArray corners,
+                             int flags = 0)
+{
+    return findChessboardCornersSB(image, patternSize, corners, flags, noArray()); // forwards with noArray() as meta: no meta output requested
+}
+
+/** @brief Estimates the sharpness of a detected chessboard.
+
+Image sharpness, as well as brightness, are critical parameters for accurate
+camera calibration. For accessing these parameters for filtering out
+problematic calibration images, this method calculates edge profiles by traveling from
+black to white chessboard cell centers. Based on this, the number of pixels is
+calculated required to transit from black to white. This width of the
+transition area is a good indication of how sharp the chessboard is imaged
+and should be below ~3.0 pixels.
+
+@param image Gray image used to find chessboard corners
+@param patternSize Size of a found chessboard pattern
+@param corners Corners found by #findChessboardCornersSB
+@param rise_distance Rise distance 0.8 means 10% ... 90% of the final signal strength
+@param vertical By default edge responses for horizontal lines are calculated
+@param sharpness Optional output array with a sharpness value for calculated edge responses (see description)
+
+The optional sharpness array is of type CV_32FC1 and has for each calculated
+profile one row with the following five entries:
+* 0 = x coordinate of the underlying edge in the image
+* 1 = y coordinate of the underlying edge in the image
+* 2 = width of the transition area (sharpness)
+* 3 = signal strength in the black cell (min brightness)
+* 4 = signal strength in the white cell (max brightness)
+
+@return Scalar(average sharpness, average min brightness, average max brightness,0)
+*/
+CV_EXPORTS_W Scalar estimateChessboardSharpness(InputArray image, Size patternSize, InputArray corners,
+                                                float rise_distance=0.8F,bool vertical=false,
+                                                OutputArray sharpness=noArray());
+
+
+//! Finds subpixel-accurate positions of the chessboard corners; corners is both input and output (InputOutputArray).
+CV_EXPORTS_W bool find4QuadCornerSubpix( InputArray img, InputOutputArray corners, Size region_size );
+
+/** @brief Renders the detected chessboard corners.
+
+@param image Destination image. It must be an 8-bit color image.
+@param patternSize Number of inner corners per chessboard row and column
+(patternSize = cv::Size(points_per_row,points_per_column)).
+@param corners Array of detected corners, the output of #findChessboardCorners.
+@param patternWasFound Parameter indicating whether the complete board was found or not. The
+return value of #findChessboardCorners should be passed here.
+
+The function draws individual chessboard corners detected either as red circles if the board was not
+found, or as colored corners connected with lines if the board was found.
+ */
+CV_EXPORTS_W void drawChessboardCorners( InputOutputArray image, Size patternSize,
+                                         InputArray corners, bool patternWasFound );
+
+/** @brief Draw axes of the world/object coordinate system from pose estimation. @sa solvePnP
+
+@param image Input/output image. It must have 1 or 3 channels. The number of channels is not altered.
+@param cameraMatrix Input 3x3 floating-point matrix of camera intrinsic parameters.
+\f$\cameramatrix{A}\f$
+@param distCoeffs Input vector of distortion coefficients
+\f$\distcoeffs\f$. If the vector is empty, the zero distortion coefficients are assumed.
+@param rvec Rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
+the model coordinate system to the camera coordinate system.
+@param tvec Translation vector.
+@param length Length of the painted axes in the same unit as tvec (usually in meters).
+@param thickness Line thickness of the painted axes.
+
+This function draws the axes of the world/object coordinate system w.r.t. the camera frame.
+OX is drawn in red, OY in green and OZ in blue.
+ */
+CV_EXPORTS_W void drawFrameAxes(InputOutputArray image, InputArray cameraMatrix, InputArray distCoeffs,
+                                InputArray rvec, InputArray tvec, float length, int thickness=3);
+
+struct CV_EXPORTS_W_SIMPLE CirclesGridFinderParameters
+{
+    CV_WRAP CirclesGridFinderParameters();
+    CV_PROP_RW cv::Size2f densityNeighborhoodSize;
+    CV_PROP_RW float minDensity;
+    CV_PROP_RW int kmeansAttempts;
+    CV_PROP_RW int minDistanceToAddKeypoint;
+    CV_PROP_RW int keypointScale;
+    CV_PROP_RW float minGraphConfidence;
+    CV_PROP_RW float vertexGain;
+    CV_PROP_RW float vertexPenalty;
+    CV_PROP_RW float existingVertexGain;
+    CV_PROP_RW float edgeGain;
+    CV_PROP_RW float edgePenalty;
+    CV_PROP_RW float convexHullFactor;
+    CV_PROP_RW float minRNGEdgeSwitchDist;
+
+    enum GridType
+    {
+      SYMMETRIC_GRID, ASYMMETRIC_GRID
+    };
+    GridType gridType; //!< Selected grid type (SYMMETRIC_GRID or ASYMMETRIC_GRID).
+
+    CV_PROP_RW float squareSize; //!< Distance between two adjacent points. Used by CALIB_CB_CLUSTERING.
+    CV_PROP_RW float maxRectifiedDistance; //!< Max deviation from prediction. Used by CALIB_CB_CLUSTERING.
+};
+
+#ifndef DISABLE_OPENCV_3_COMPATIBILITY
+typedef CirclesGridFinderParameters CirclesGridFinderParameters2;
+#endif
+
+/** @brief Finds centers in the grid of circles.
+
+@param image grid view of input circles; it must be an 8-bit grayscale or color image.
+@param patternSize number of circles per row and column
+( patternSize = Size(points_per_row, points_per_column) ).
+@param centers output array of detected centers.
+@param flags various operation flags that can be one of the following values:
+-   @ref CALIB_CB_SYMMETRIC_GRID uses symmetric pattern of circles.
+-   @ref CALIB_CB_ASYMMETRIC_GRID uses asymmetric pattern of circles.
+-   @ref CALIB_CB_CLUSTERING uses a special algorithm for grid detection. It is more robust to
+perspective distortions but much more sensitive to background clutter.
+@param blobDetector feature detector that finds blobs like dark circles on light background.
+                    If `blobDetector` is NULL then `image` represents a Point2f array of candidates.
+@param parameters struct for finding circles in a grid pattern.
+
+The function attempts to determine whether the input image contains a grid of circles. If it does, the
+function locates centers of the circles. The function returns a non-zero value if all of the centers
+have been found and they have been placed in a certain order (row by row, left to right in every
+row). Otherwise, if the function fails to find all the centers or reorder them, it returns 0.
+
+Sample usage of detecting and drawing the centers of circles:
+@code
+    Size patternsize(7,7); //number of centers
+    Mat gray = ...; //source image
+    vector<Point2f> centers; //this will be filled by the detected centers
+
+    bool patternfound = findCirclesGrid(gray, patternsize, centers);
+
+    drawChessboardCorners(img, patternsize, Mat(centers), patternfound);
+@endcode
+@note The function requires white space (like a square-thick border, the wider the better) around
+the board to make the detection more robust in various environments.
+ */
+CV_EXPORTS_W bool findCirclesGrid( InputArray image, Size patternSize,
+                                   OutputArray centers, int flags,
+                                   const Ptr<FeatureDetector> &blobDetector,
+                                   const CirclesGridFinderParameters& parameters);
+
+/** @overload */
+CV_EXPORTS_W bool findCirclesGrid( InputArray image, Size patternSize,
+                                   OutputArray centers, int flags = CALIB_CB_SYMMETRIC_GRID,
+                                   const Ptr<FeatureDetector> &blobDetector = SimpleBlobDetector::create());
+
+/** @brief Finds the camera intrinsic and extrinsic parameters from several views of a calibration
+pattern.
+
+@param objectPoints In the new interface it is a vector of vectors of calibration pattern points in
+the calibration pattern coordinate space (e.g. std::vector<std::vector<cv::Vec3f>>). The outer
+vector contains as many elements as the number of pattern views. If the same calibration pattern
+is shown in each view and it is fully visible, all the vectors will be the same. Although, it is
+possible to use partially occluded patterns or even different patterns in different views. Then,
+the vectors will be different. Although the points are 3D, they all lie in the calibration pattern's
+XY coordinate plane (thus 0 in the Z-coordinate), if the used calibration pattern is a planar rig.
+In the old interface all the vectors of object points from different views are concatenated
+together.
+@param imagePoints In the new interface it is a vector of vectors of the projections of calibration
+pattern points (e.g. std::vector<std::vector<cv::Vec2f>>). imagePoints.size() and
+objectPoints.size(), and imagePoints[i].size() and objectPoints[i].size() for each i, must be equal,
+respectively. In the old interface all the vectors of image points from different views are
+concatenated together.
+@param imageSize Size of the image used only to initialize the camera intrinsic matrix.
+@param cameraMatrix Input/output 3x3 floating-point camera intrinsic matrix
+\f$\cameramatrix{A}\f$ . If @ref CALIB_USE_INTRINSIC_GUESS
+and/or @ref CALIB_FIX_ASPECT_RATIO, @ref CALIB_FIX_PRINCIPAL_POINT or @ref CALIB_FIX_FOCAL_LENGTH
+are specified, some or all of fx, fy, cx, cy must be initialized before calling the function.
+@param distCoeffs Input/output vector of distortion coefficients
+\f$\distcoeffs\f$.
+@param rvecs Output vector of rotation vectors (@ref Rodrigues ) estimated for each pattern view
+(e.g. std::vector<cv::Mat>). That is, each i-th rotation vector together with the corresponding
+i-th translation vector (see the next output parameter description) brings the calibration pattern
+from the object coordinate space (in which object points are specified) to the camera coordinate
+space. In more technical terms, the tuple of the i-th rotation and translation vector performs
+a change of basis from object coordinate space to camera coordinate space. Due to its duality, this
+tuple is equivalent to the position of the calibration pattern with respect to the camera coordinate
+space.
+@param tvecs Output vector of translation vectors estimated for each pattern view, see parameter
+description above.
+@param stdDeviationsIntrinsics Output vector of standard deviations estimated for intrinsic
+parameters. Order of deviations values:
+\f$(f_x, f_y, c_x, c_y, k_1, k_2, p_1, p_2, k_3, k_4, k_5, k_6 , s_1, s_2, s_3,
+ s_4, \tau_x, \tau_y)\f$ If one of the parameters is not estimated, its deviation is equal to zero.
+@param stdDeviationsExtrinsics Output vector of standard deviations estimated for extrinsic
+parameters. Order of deviations values: \f$(R_0, T_0, \dotsc , R_{M - 1}, T_{M - 1})\f$ where M is
+the number of pattern views. \f$R_i, T_i\f$ are concatenated 1x3 vectors.
+ @param perViewErrors Output vector of the RMS re-projection error estimated for each pattern view.
+@param flags Different flags that may be zero or a combination of the following values:
+-   @ref CALIB_USE_INTRINSIC_GUESS cameraMatrix contains valid initial values of
+fx, fy, cx, cy that are optimized further. Otherwise, (cx, cy) is initially set to the image
+center ( imageSize is used), and focal distances are computed in a least-squares fashion.
+Note, that if intrinsic parameters are known, there is no need to use this function just to
+estimate extrinsic parameters. Use @ref solvePnP instead.
+-   @ref CALIB_FIX_PRINCIPAL_POINT The principal point is not changed during the global
+optimization. It stays at the center or at a different location specified when
+ @ref CALIB_USE_INTRINSIC_GUESS is set too.
+-   @ref CALIB_FIX_ASPECT_RATIO The functions consider only fy as a free parameter. The
+ratio fx/fy stays the same as in the input cameraMatrix . When
+ @ref CALIB_USE_INTRINSIC_GUESS is not set, the actual input values of fx and fy are
+ignored, only their ratio is computed and used further.
+-   @ref CALIB_ZERO_TANGENT_DIST Tangential distortion coefficients \f$(p_1, p_2)\f$ are set
+to zeros and stay zero.
+-   @ref CALIB_FIX_FOCAL_LENGTH The focal length is not changed during the global optimization if
+ @ref CALIB_USE_INTRINSIC_GUESS is set.
+-   @ref CALIB_FIX_K1,..., @ref CALIB_FIX_K6 The corresponding radial distortion
+coefficient is not changed during the optimization. If @ref CALIB_USE_INTRINSIC_GUESS is
+set, the coefficient from the supplied distCoeffs matrix is used. Otherwise, it is set to 0.
+-   @ref CALIB_RATIONAL_MODEL Coefficients k4, k5, and k6 are enabled. To provide the
+backward compatibility, this extra flag should be explicitly specified to make the
+calibration function use the rational model and return 8 coefficients or more.
+-   @ref CALIB_THIN_PRISM_MODEL Coefficients s1, s2, s3 and s4 are enabled. To provide the
+backward compatibility, this extra flag should be explicitly specified to make the
+calibration function use the thin prism model and return 12 coefficients or more.
+-   @ref CALIB_FIX_S1_S2_S3_S4 The thin prism distortion coefficients are not changed during
+the optimization. If @ref CALIB_USE_INTRINSIC_GUESS is set, the coefficient from the
+supplied distCoeffs matrix is used. Otherwise, it is set to 0.
+-   @ref CALIB_TILTED_MODEL Coefficients tauX and tauY are enabled. To provide the
+backward compatibility, this extra flag should be explicitly specified to make the
+calibration function use the tilted sensor model and return 14 coefficients.
+-   @ref CALIB_FIX_TAUX_TAUY The coefficients of the tilted sensor model are not changed during
+the optimization. If @ref CALIB_USE_INTRINSIC_GUESS is set, the coefficient from the
+supplied distCoeffs matrix is used. Otherwise, it is set to 0.
+@param criteria Termination criteria for the iterative optimization algorithm.
+
+@return the overall RMS re-projection error.
+
+The function estimates the intrinsic camera parameters and extrinsic parameters for each of the
+views. The algorithm is based on @cite Zhang2000 and @cite BouguetMCT . The coordinates of 3D object
+points and their corresponding 2D projections in each view must be specified. That may be achieved
+by using an object with known geometry and easily detectable feature points. Such an object is
+called a calibration rig or calibration pattern, and OpenCV has built-in support for a chessboard as
+a calibration rig (see @ref findChessboardCorners). Currently, initialization of intrinsic
+parameters (when @ref CALIB_USE_INTRINSIC_GUESS is not set) is only implemented for planar calibration
+patterns (where Z-coordinates of the object points must be all zeros). 3D calibration rigs can also
+be used as long as initial cameraMatrix is provided.
+
+The algorithm performs the following steps:
+
+-   Compute the initial intrinsic parameters (the option only available for planar calibration
+    patterns) or read them from the input parameters. The distortion coefficients are all set to
+    zeros initially unless some of CALIB_FIX_K? are specified.
+
+-   Estimate the initial camera pose as if the intrinsic parameters have been already known. This is
+    done using @ref solvePnP .
+
+-   Run the global Levenberg-Marquardt optimization algorithm to minimize the reprojection error,
+    that is, the total sum of squared distances between the observed feature points imagePoints and
+    the projected (using the current estimates for camera parameters and the poses) object points
+    objectPoints. See @ref projectPoints for details.
+
+@note
+    If you use a non-square (i.e. non-N-by-N) grid and @ref findChessboardCorners for calibration,
+    and @ref calibrateCamera returns bad values (zero distortion coefficients, \f$c_x\f$ and
+    \f$c_y\f$ very far from the image center, and/or large differences between \f$f_x\f$ and
+    \f$f_y\f$ (ratios of 10:1 or more)), then you are probably using patternSize=cvSize(rows,cols)
+    instead of using patternSize=cvSize(cols,rows) in @ref findChessboardCorners.
+
+@note
+    The function may throw exceptions if an unsupported combination of parameters is provided or
+    the system is underconstrained.
+
+@sa
+   calibrateCameraRO, findChessboardCorners, solvePnP, initCameraMatrix2D, stereoCalibrate,
+   undistort
+ */
+CV_EXPORTS_AS(calibrateCameraExtended) double calibrateCamera( InputArrayOfArrays objectPoints,
+                                     InputArrayOfArrays imagePoints, Size imageSize,
+                                     InputOutputArray cameraMatrix, InputOutputArray distCoeffs,
+                                     OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs,
+                                     OutputArray stdDeviationsIntrinsics,
+                                     OutputArray stdDeviationsExtrinsics,
+                                     OutputArray perViewErrors,
+                                     int flags = 0, TermCriteria criteria = TermCriteria(
+                                        TermCriteria::COUNT + TermCriteria::EPS, 30, DBL_EPSILON) );
+
+/** @overload */
+CV_EXPORTS_W double calibrateCamera( InputArrayOfArrays objectPoints,
+                                     InputArrayOfArrays imagePoints, Size imageSize,
+                                     InputOutputArray cameraMatrix, InputOutputArray distCoeffs,
+                                     OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs,
+                                     int flags = 0, TermCriteria criteria = TermCriteria(
+                                        TermCriteria::COUNT + TermCriteria::EPS, 30, DBL_EPSILON) );
+
+/** @brief Finds the camera intrinsic and extrinsic parameters from several views of a calibration pattern.
+
+This function is an extension of #calibrateCamera with the method of releasing object which was
+proposed in @cite strobl2011iccv. In many common cases with inaccurate, unmeasured, roughly planar
+targets (calibration plates), this method can dramatically improve the precision of the estimated
+camera parameters. Both the object-releasing method and standard method are supported by this
+function. Use the parameter **iFixedPoint** for method selection. In the internal implementation,
+#calibrateCamera is a wrapper for this function.
+
+@param objectPoints Vector of vectors of calibration pattern points in the calibration pattern
+coordinate space. See #calibrateCamera for details. If the method of releasing object is to be used,
+the identical calibration board must be used in each view and it must be fully visible, and all
+objectPoints[i] must be the same and all points should be roughly close to a plane. **The calibration
+target has to be rigid, or at least static if the camera (rather than the calibration target) is
+shifted for grabbing images.**
+@param imagePoints Vector of vectors of the projections of calibration pattern points. See
+#calibrateCamera for details.
+@param imageSize Size of the image used only to initialize the intrinsic camera matrix.
+@param iFixedPoint The index of the 3D object point in objectPoints[0] to be fixed. It also acts as
+a switch for calibration method selection. If the object-releasing method is to be used, pass in the
+parameter in the range of [1, objectPoints[0].size()-2], otherwise a value out of this range will
+make standard calibration method selected. Usually the top-right corner point of the calibration
+board grid is recommended to be fixed when the object-releasing method is being utilized. According to
+\cite strobl2011iccv, two other points are also fixed. In this implementation, objectPoints[0].front
+and objectPoints[0].back.z are used. With object-releasing method, accurate rvecs, tvecs and
+newObjPoints are only possible if coordinates of these three fixed points are accurate enough.
+@param cameraMatrix Output 3x3 floating-point camera matrix. See #calibrateCamera for details.
+@param distCoeffs Output vector of distortion coefficients. See #calibrateCamera for details.
+@param rvecs Output vector of rotation vectors estimated for each pattern view. See #calibrateCamera
+for details.
+@param tvecs Output vector of translation vectors estimated for each pattern view.
+@param newObjPoints The updated output vector of calibration pattern points. The coordinates might
+be scaled based on three fixed points. The returned coordinates are accurate only if the above
+mentioned three fixed points are accurate. If not needed, noArray() can be passed in. This parameter
+is ignored with standard calibration method.
+@param stdDeviationsIntrinsics Output vector of standard deviations estimated for intrinsic parameters.
+See #calibrateCamera for details.
+@param stdDeviationsExtrinsics Output vector of standard deviations estimated for extrinsic parameters.
+See #calibrateCamera for details.
+@param stdDeviationsObjPoints Output vector of standard deviations estimated for refined coordinates
+of calibration pattern points. It has the same size and order as objectPoints[0] vector. This
+parameter is ignored with standard calibration method.
+ @param perViewErrors Output vector of the RMS re-projection error estimated for each pattern view.
+@param flags Different flags that may be zero or a combination of some predefined values. See
+#calibrateCamera for details. If the method of releasing object is used, the calibration time may
+be much longer. CALIB_USE_QR or CALIB_USE_LU could be used for faster calibration, with potentially
+less precise and less stable results in some rare cases.
+@param criteria Termination criteria for the iterative optimization algorithm.
+
+@return the overall RMS re-projection error.
+
+The function estimates the intrinsic camera parameters and extrinsic parameters for each of the
+views. The algorithm is based on @cite Zhang2000, @cite BouguetMCT and @cite strobl2011iccv. See
+#calibrateCamera for other detailed explanations.
+@sa
+   calibrateCamera, findChessboardCorners, solvePnP, initCameraMatrix2D, stereoCalibrate, undistort
+ */
+CV_EXPORTS_AS(calibrateCameraROExtended) double calibrateCameraRO( InputArrayOfArrays objectPoints,
+                                     InputArrayOfArrays imagePoints, Size imageSize, int iFixedPoint,
+                                     InputOutputArray cameraMatrix, InputOutputArray distCoeffs,
+                                     OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs,
+                                     OutputArray newObjPoints,
+                                     OutputArray stdDeviationsIntrinsics,
+                                     OutputArray stdDeviationsExtrinsics,
+                                     OutputArray stdDeviationsObjPoints,
+                                     OutputArray perViewErrors,
+                                     int flags = 0, TermCriteria criteria = TermCriteria(
+                                        TermCriteria::COUNT + TermCriteria::EPS, 30, DBL_EPSILON) );
+
+/** @overload */
+CV_EXPORTS_W double calibrateCameraRO( InputArrayOfArrays objectPoints,
+                                     InputArrayOfArrays imagePoints, Size imageSize, int iFixedPoint,
+                                     InputOutputArray cameraMatrix, InputOutputArray distCoeffs,
+                                     OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs,
+                                     OutputArray newObjPoints,
+                                     int flags = 0, TermCriteria criteria = TermCriteria(
+                                        TermCriteria::COUNT + TermCriteria::EPS, 30, DBL_EPSILON) );
+
+/** @brief Computes useful camera characteristics from the camera intrinsic matrix.
+
+@param cameraMatrix Input camera intrinsic matrix that can be estimated by #calibrateCamera or
+#stereoCalibrate .
+@param imageSize Input image size in pixels.
+@param apertureWidth Physical width in mm of the sensor.
+@param apertureHeight Physical height in mm of the sensor.
+@param fovx Output field of view in degrees along the horizontal sensor axis.
+@param fovy Output field of view in degrees along the vertical sensor axis.
+@param focalLength Focal length of the lens in mm.
+@param principalPoint Principal point in mm.
+@param aspectRatio \f$f_y/f_x\f$
+
+The function computes various useful camera characteristics from the previously estimated camera
+matrix.
+
+@note
+   Do keep in mind that the unit 'mm' stands for whatever unit of measure one chooses for
+    the chessboard pitch (it can thus be any value).
+ */
+CV_EXPORTS_W void calibrationMatrixValues( InputArray cameraMatrix, Size imageSize,
+                                           double apertureWidth, double apertureHeight,
+                                           CV_OUT double& fovx, CV_OUT double& fovy,
+                                           CV_OUT double& focalLength, CV_OUT Point2d& principalPoint,
+                                           CV_OUT double& aspectRatio );
+
+/** @brief Calibrates a stereo camera setup. This function finds the intrinsic parameters
+for each of the two cameras and the extrinsic parameters between the two cameras.
+
+@param objectPoints Vector of vectors of the calibration pattern points. The same structure as
+in @ref calibrateCamera. For each pattern view, both cameras need to see the same object
+points. Therefore, objectPoints.size(), imagePoints1.size(), and imagePoints2.size() need to be
+equal as well as objectPoints[i].size(), imagePoints1[i].size(), and imagePoints2[i].size() need to
+be equal for each i.
+@param imagePoints1 Vector of vectors of the projections of the calibration pattern points,
+observed by the first camera. The same structure as in @ref calibrateCamera.
+@param imagePoints2 Vector of vectors of the projections of the calibration pattern points,
+observed by the second camera. The same structure as in @ref calibrateCamera.
+@param cameraMatrix1 Input/output camera intrinsic matrix for the first camera, the same as in
+@ref calibrateCamera. Furthermore, for the stereo case, additional flags may be used, see below.
+@param distCoeffs1 Input/output vector of distortion coefficients, the same as in
+@ref calibrateCamera.
+@param cameraMatrix2 Input/output camera intrinsic matrix for the second camera. See description for
+cameraMatrix1.
+@param distCoeffs2 Input/output lens distortion coefficients for the second camera. See
+description for distCoeffs1.
+@param imageSize Size of the image used only to initialize the camera intrinsic matrices.
+@param R Output rotation matrix. Together with the translation vector T, this matrix brings
+points given in the first camera's coordinate system to points in the second camera's
+coordinate system. In more technical terms, the tuple of R and T performs a change of basis
+from the first camera's coordinate system to the second camera's coordinate system. Due to its
+duality, this tuple is equivalent to the position of the first camera with respect to the
+second camera coordinate system.
+@param T Output translation vector, see description above.
+@param E Output essential matrix.
+@param F Output fundamental matrix.
+@param rvecs Output vector of rotation vectors ( @ref Rodrigues ) estimated for each pattern view in the
+coordinate system of the first camera of the stereo pair (e.g. std::vector<cv::Mat>). More in detail, each
+i-th rotation vector together with the corresponding i-th translation vector (see the next output parameter
+description) brings the calibration pattern from the object coordinate space (in which object points are
+specified) to the camera coordinate space of the first camera of the stereo pair. In more technical terms,
+the tuple of the i-th rotation and translation vector performs a change of basis from object coordinate space
+to camera coordinate space of the first camera of the stereo pair.
+@param tvecs Output vector of translation vectors estimated for each pattern view, see parameter description
+of previous output parameter ( rvecs ).
+@param perViewErrors Output vector of the RMS re-projection error estimated for each pattern view.
+@param flags Different flags that may be zero or a combination of the following values:
+-   @ref CALIB_FIX_INTRINSIC Fix cameraMatrix? and distCoeffs? so that only R, T, E, and F
+matrices are estimated.
+-   @ref CALIB_USE_INTRINSIC_GUESS Optimize some or all of the intrinsic parameters
+according to the specified flags. Initial values are provided by the user.
+-   @ref CALIB_USE_EXTRINSIC_GUESS R and T contain valid initial values that are optimized further.
+Otherwise R and T are initialized to the median value of the pattern views (each dimension separately).
+-   @ref CALIB_FIX_PRINCIPAL_POINT Fix the principal points during the optimization.
+-   @ref CALIB_FIX_FOCAL_LENGTH Fix \f$f^{(j)}_x\f$ and \f$f^{(j)}_y\f$ .
+-   @ref CALIB_FIX_ASPECT_RATIO Optimize \f$f^{(j)}_y\f$ . Fix the ratio \f$f^{(j)}_x/f^{(j)}_y\f$
+.
+-   @ref CALIB_SAME_FOCAL_LENGTH Enforce \f$f^{(0)}_x=f^{(1)}_x\f$ and \f$f^{(0)}_y=f^{(1)}_y\f$ .
+-   @ref CALIB_ZERO_TANGENT_DIST Set tangential distortion coefficients for each camera to
+zeros and keep them fixed.
+-   @ref CALIB_FIX_K1,..., @ref CALIB_FIX_K6 Do not change the corresponding radial
+distortion coefficient during the optimization. If @ref CALIB_USE_INTRINSIC_GUESS is set,
+the coefficient from the supplied distCoeffs matrix is used. Otherwise, it is set to 0.
+-   @ref CALIB_RATIONAL_MODEL Enable coefficients k4, k5, and k6. To provide the backward
+compatibility, this extra flag should be explicitly specified to make the calibration
+function use the rational model and return 8 coefficients. If the flag is not set, the
+function computes and returns only 5 distortion coefficients.
+-   @ref CALIB_THIN_PRISM_MODEL Coefficients s1, s2, s3 and s4 are enabled. To provide the
+backward compatibility, this extra flag should be explicitly specified to make the
+calibration function use the thin prism model and return 12 coefficients. If the flag is not
+set, the function computes and returns only 5 distortion coefficients.
+-   @ref CALIB_FIX_S1_S2_S3_S4 The thin prism distortion coefficients are not changed during
+the optimization. If @ref CALIB_USE_INTRINSIC_GUESS is set, the coefficient from the
+supplied distCoeffs matrix is used. Otherwise, it is set to 0.
+-   @ref CALIB_TILTED_MODEL Coefficients tauX and tauY are enabled. To provide the
+backward compatibility, this extra flag should be explicitly specified to make the
+calibration function use the tilted sensor model and return 14 coefficients. If the flag is not
+set, the function computes and returns only 5 distortion coefficients.
+-   @ref CALIB_FIX_TAUX_TAUY The coefficients of the tilted sensor model are not changed during
+the optimization. If @ref CALIB_USE_INTRINSIC_GUESS is set, the coefficient from the
+supplied distCoeffs matrix is used. Otherwise, it is set to 0.
+@param criteria Termination criteria for the iterative optimization algorithm.
+
+The function estimates the transformation between two cameras making a stereo pair. If one computes
+the poses of an object relative to the first camera and to the second camera,
+( \f$R_1\f$,\f$T_1\f$ ) and (\f$R_2\f$,\f$T_2\f$), respectively, for a stereo camera where the
+relative position and orientation between the two cameras are fixed, then those poses definitely
+relate to each other. This means, if the relative position and orientation (\f$R\f$,\f$T\f$) of the
+two cameras is known, it is possible to compute (\f$R_2\f$,\f$T_2\f$) when (\f$R_1\f$,\f$T_1\f$) is
+given. This is what the described function does. It computes (\f$R\f$,\f$T\f$) such that:
+
+\f[R_2=R R_1\f]
+\f[T_2=R T_1 + T.\f]
+
+Therefore, one can compute the coordinate representation of a 3D point for the second camera's
+coordinate system when given the point's coordinate representation in the first camera's coordinate
+system:
+
+\f[\begin{bmatrix}
+X_2 \\
+Y_2 \\
+Z_2 \\
+1
+\end{bmatrix} = \begin{bmatrix}
+R & T \\
+0 & 1
+\end{bmatrix} \begin{bmatrix}
+X_1 \\
+Y_1 \\
+Z_1 \\
+1
+\end{bmatrix}.\f]
+
+
+Optionally, it computes the essential matrix E:
+
+\f[E= \vecthreethree{0}{-T_2}{T_1}{T_2}{0}{-T_0}{-T_1}{T_0}{0} R\f]
+
+where \f$T_i\f$ are components of the translation vector \f$T\f$ : \f$T=[T_0, T_1, T_2]^T\f$ .
+And the function can also compute the fundamental matrix F:
+
+\f[F = cameraMatrix2^{-T}\cdot E \cdot cameraMatrix1^{-1}\f]
+
+Besides the stereo-related information, the function can also perform a full calibration of each of
+the two cameras. However, due to the high dimensionality of the parameter space and noise in the
+input data, the function can diverge from the correct solution. If the intrinsic parameters can be
+estimated with high accuracy for each of the cameras individually (for example, using
+#calibrateCamera ), you are recommended to do so and then pass @ref CALIB_FIX_INTRINSIC flag to the
+function along with the computed intrinsic parameters. Otherwise, if all the parameters are
+estimated at once, it makes sense to restrict some parameters, for example, pass
+ @ref CALIB_SAME_FOCAL_LENGTH and @ref CALIB_ZERO_TANGENT_DIST flags, which is usually a
+reasonable assumption.
+
+Similarly to #calibrateCamera, the function minimizes the total re-projection error for all the
+points in all the available views from both cameras. The function returns the final value of the
+re-projection error.
+ */
+CV_EXPORTS_AS(stereoCalibrateExtended) double stereoCalibrate( InputArrayOfArrays objectPoints,
+                                     InputArrayOfArrays imagePoints1, InputArrayOfArrays imagePoints2,
+                                     InputOutputArray cameraMatrix1, InputOutputArray distCoeffs1,
+                                     InputOutputArray cameraMatrix2, InputOutputArray distCoeffs2,
+                                     Size imageSize, InputOutputArray R, InputOutputArray T, OutputArray E, OutputArray F,
+                                     OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs, OutputArray perViewErrors, int flags = CALIB_FIX_INTRINSIC,
+                                     TermCriteria criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 1e-6) );
+
+/// @overload
+CV_EXPORTS_W double stereoCalibrate( InputArrayOfArrays objectPoints,
+                                     InputArrayOfArrays imagePoints1, InputArrayOfArrays imagePoints2,
+                                     InputOutputArray cameraMatrix1, InputOutputArray distCoeffs1,
+                                     InputOutputArray cameraMatrix2, InputOutputArray distCoeffs2,
+                                     Size imageSize, OutputArray R,OutputArray T, OutputArray E, OutputArray F,
+                                     int flags = CALIB_FIX_INTRINSIC,
+                                     TermCriteria criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 1e-6) );
+
+/// @overload
+CV_EXPORTS_W double stereoCalibrate( InputArrayOfArrays objectPoints,
+                                     InputArrayOfArrays imagePoints1, InputArrayOfArrays imagePoints2,
+                                     InputOutputArray cameraMatrix1, InputOutputArray distCoeffs1,
+                                     InputOutputArray cameraMatrix2, InputOutputArray distCoeffs2,
+                                     Size imageSize, InputOutputArray R, InputOutputArray T, OutputArray E, OutputArray F,
+                                     OutputArray perViewErrors, int flags = CALIB_FIX_INTRINSIC,
+                                     TermCriteria criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 1e-6) );
+
+/** @brief Computes rectification transforms for each head of a calibrated stereo camera.
+
+@param cameraMatrix1 First camera intrinsic matrix.
+@param distCoeffs1 First camera distortion parameters.
+@param cameraMatrix2 Second camera intrinsic matrix.
+@param distCoeffs2 Second camera distortion parameters.
+@param imageSize Size of the image used for stereo calibration.
+@param R Rotation matrix from the coordinate system of the first camera to the second camera,
+see @ref stereoCalibrate.
+@param T Translation vector from the coordinate system of the first camera to the second camera,
+see @ref stereoCalibrate.
+@param R1 Output 3x3 rectification transform (rotation matrix) for the first camera. This matrix
+brings points given in the unrectified first camera's coordinate system to points in the rectified
+first camera's coordinate system. In more technical terms, it performs a change of basis from the
+unrectified first camera's coordinate system to the rectified first camera's coordinate system.
+@param R2 Output 3x3 rectification transform (rotation matrix) for the second camera. This matrix
+brings points given in the unrectified second camera's coordinate system to points in the rectified
+second camera's coordinate system. In more technical terms, it performs a change of basis from the
+unrectified second camera's coordinate system to the rectified second camera's coordinate system.
+@param P1 Output 3x4 projection matrix in the new (rectified) coordinate systems for the first
+camera, i.e. it projects points given in the rectified first camera coordinate system into the
+rectified first camera's image.
+@param P2 Output 3x4 projection matrix in the new (rectified) coordinate systems for the second
+camera, i.e. it projects points given in the rectified first camera coordinate system into the
+rectified second camera's image.
+@param Q Output \f$4 \times 4\f$ disparity-to-depth mapping matrix (see @ref reprojectImageTo3D).
+@param flags Operation flags that may be zero or @ref CALIB_ZERO_DISPARITY . If the flag is set,
+the function makes the principal points of each camera have the same pixel coordinates in the
+rectified views. And if the flag is not set, the function may still shift the images in the
+horizontal or vertical direction (depending on the orientation of epipolar lines) to maximize the
+useful image area.
+@param alpha Free scaling parameter. If it is -1 or absent, the function performs the default
+scaling. Otherwise, the parameter should be between 0 and 1. alpha=0 means that the rectified
+images are zoomed and shifted so that only valid pixels are visible (no black areas after
+rectification). alpha=1 means that the rectified image is decimated and shifted so that all the
+pixels from the original images from the cameras are retained in the rectified images (no source
+image pixels are lost). Any intermediate value yields an intermediate result between
+those two extreme cases.
+@param newImageSize New image resolution after rectification. The same size should be passed to
+#initUndistortRectifyMap (see the stereo_calib.cpp sample in OpenCV samples directory). When (0,0)
+is passed (default), it is set to the original imageSize . Setting it to a larger value can help you
+preserve details in the original image, especially when there is a big radial distortion.
+@param validPixROI1 Optional output rectangles inside the rectified images where all the pixels
+are valid. If alpha=0 , the ROIs cover the whole images. Otherwise, they are likely to be smaller
+(see the picture below).
+@param validPixROI2 Optional output rectangles inside the rectified images where all the pixels
+are valid. If alpha=0 , the ROIs cover the whole images. Otherwise, they are likely to be smaller
+(see the picture below).
+
+The function computes the rotation matrices for each camera that (virtually) make both camera image
+planes the same plane. Consequently, this makes all the epipolar lines parallel and thus simplifies
+the dense stereo correspondence problem. The function takes the matrices computed by #stereoCalibrate
+as input. As output, it provides two rotation matrices and also two projection matrices in the new
+coordinates. The function distinguishes the following two cases:
+
+-   **Horizontal stereo**: the first and the second camera views are shifted relative to each other
+    mainly along the x-axis (with possible small vertical shift). In the rectified images, the
+    corresponding epipolar lines in the left and right cameras are horizontal and have the same
+    y-coordinate. P1, P2 and Q look like:
+
+    \f[\texttt{P1} = \begin{bmatrix}
+                        f & 0 & cx_1 & 0 \\
+                        0 & f & cy & 0 \\
+                        0 & 0 & 1 & 0
+                     \end{bmatrix}\f]
+
+    \f[\texttt{P2} = \begin{bmatrix}
+                        f & 0 & cx_2 & T_x \cdot f \\
+                        0 & f & cy & 0 \\
+                        0 & 0 & 1 & 0
+                     \end{bmatrix} ,\f]
+
+    \f[\texttt{Q} = \begin{bmatrix}
+                        1 & 0 & 0 & -cx_1 \\
+                        0 & 1 & 0 & -cy \\
+                        0 & 0 & 0 & f \\
+                        0 & 0 & -\frac{1}{T_x} & \frac{cx_1 - cx_2}{T_x}
+                    \end{bmatrix} \f]
+
+    where \f$T_x\f$ is a horizontal shift between the cameras and \f$cx_1=cx_2\f$ if
+    @ref CALIB_ZERO_DISPARITY is set.
+
+-   **Vertical stereo**: the first and the second camera views are shifted relative to each other
+    mainly in the vertical direction (and probably a bit in the horizontal direction too). The epipolar
+    lines in the rectified images are vertical and have the same x-coordinate. P1, P2 and Q look like:
+
+    \f[\texttt{P1} = \begin{bmatrix}
+                        f & 0 & cx & 0 \\
+                        0 & f & cy_1 & 0 \\
+                        0 & 0 & 1 & 0
+                     \end{bmatrix}\f]
+
+    \f[\texttt{P2} = \begin{bmatrix}
+                        f & 0 & cx & 0 \\
+                        0 & f & cy_2 & T_y \cdot f \\
+                        0 & 0 & 1 & 0
+                     \end{bmatrix},\f]
+
+    \f[\texttt{Q} = \begin{bmatrix}
+                        1 & 0 & 0 & -cx \\
+                        0 & 1 & 0 & -cy_1 \\
+                        0 & 0 & 0 & f \\
+                        0 & 0 & -\frac{1}{T_y} & \frac{cy_1 - cy_2}{T_y}
+                    \end{bmatrix} \f]
+
+    where \f$T_y\f$ is a vertical shift between the cameras and \f$cy_1=cy_2\f$ if
+    @ref CALIB_ZERO_DISPARITY is set.
+
+As you can see, the first three columns of P1 and P2 will effectively be the new "rectified" camera
+matrices. The matrices, together with R1 and R2 , can then be passed to #initUndistortRectifyMap to
+initialize the rectification map for each camera.
+
+See below the screenshot from the stereo_calib.cpp sample. Some red horizontal lines pass through
+the corresponding image regions. This means that the images are well rectified, which is what most
+stereo correspondence algorithms rely on. The green rectangles are validPixROI1 and validPixROI2 .
+You can see that their interiors are all valid pixels.
+
+![image](pics/stereo_undistort.jpg)
+ */
+CV_EXPORTS_W void stereoRectify( InputArray cameraMatrix1, InputArray distCoeffs1,
+                                 InputArray cameraMatrix2, InputArray distCoeffs2,
+                                 Size imageSize, InputArray R, InputArray T,
+                                 OutputArray R1, OutputArray R2,
+                                 OutputArray P1, OutputArray P2,
+                                 OutputArray Q, int flags = CALIB_ZERO_DISPARITY,
+                                 double alpha = -1, Size newImageSize = Size(),
+                                 CV_OUT Rect* validPixROI1 = 0, CV_OUT Rect* validPixROI2 = 0 );
+
+/** @brief Computes a rectification transform for an uncalibrated stereo camera.
+
+@param points1 Array of feature points in the first image.
+@param points2 The corresponding points in the second image. The same formats as in
+#findFundamentalMat are supported.
+@param F Input fundamental matrix. It can be computed from the same set of point pairs using
+#findFundamentalMat .
+@param imgSize Size of the image.
+@param H1 Output rectification homography matrix for the first image.
+@param H2 Output rectification homography matrix for the second image.
+@param threshold Optional threshold used to filter out the outliers. If the parameter is greater
+than zero, all the point pairs that do not comply with the epipolar geometry (that is, the points
+for which \f$|\texttt{points2[i]}^T \cdot \texttt{F} \cdot \texttt{points1[i]}|>\texttt{threshold}\f$ )
+are rejected prior to computing the homographies. Otherwise, all the points are considered inliers.
+
+The function computes the rectification transformations without knowing the intrinsic parameters of
+the cameras or their relative position in space, which explains the suffix "uncalibrated". Another
+related difference from #stereoRectify is that the function outputs not the rectification
+transformations in the object (3D) space, but the planar perspective transformations encoded by the
+homography matrices H1 and H2 . The function implements the algorithm @cite Hartley99 .
+
+@note
+    While the algorithm does not need to know the intrinsic parameters of the cameras, it heavily
+    depends on the epipolar geometry. Therefore, if the camera lenses have a significant distortion,
+    it would be better to correct it before computing the fundamental matrix and calling this
+    function. For example, distortion coefficients can be estimated for each head of the stereo
+    camera separately by using #calibrateCamera . Then, the images can be corrected using
+    #undistort , or just the point coordinates can be corrected with #undistortPoints .
+ */
+CV_EXPORTS_W bool stereoRectifyUncalibrated( InputArray points1, InputArray points2,
+                                             InputArray F, Size imgSize,
+                                             OutputArray H1, OutputArray H2,
+                                             double threshold = 5 );
+
+//! Computes the rectification transformations for a 3-head camera whose heads all lie on the same line.
+CV_EXPORTS_W float rectify3Collinear( InputArray cameraMatrix1, InputArray distCoeffs1,
+                                      InputArray cameraMatrix2, InputArray distCoeffs2,
+                                      InputArray cameraMatrix3, InputArray distCoeffs3,
+                                      InputArrayOfArrays imgpt1, InputArrayOfArrays imgpt3,
+                                      Size imageSize, InputArray R12, InputArray T12,
+                                      InputArray R13, InputArray T13,
+                                      OutputArray R1, OutputArray R2, OutputArray R3,
+                                      OutputArray P1, OutputArray P2, OutputArray P3,
+                                      OutputArray Q, double alpha, Size newImgSize,
+                                      CV_OUT Rect* roi1, CV_OUT Rect* roi2, int flags );
+
+/** @brief Returns the new camera intrinsic matrix based on the free scaling parameter.
+
+@param cameraMatrix Input camera intrinsic matrix.
+@param distCoeffs Input vector of distortion coefficients
+\f$\distcoeffs\f$. If the vector is NULL/empty, zero distortion coefficients are
+assumed.
+@param imageSize Original image size.
+@param alpha Free scaling parameter between 0 (when all the pixels in the undistorted image are
+valid) and 1 (when all the source image pixels are retained in the undistorted image). See
+#stereoRectify for details.
+@param newImgSize Image size after rectification. By default, it is set to imageSize .
+@param validPixROI Optional output rectangle that outlines the all-good-pixels region in the
+undistorted image. See the validPixROI1 and validPixROI2 descriptions in #stereoRectify .
+@param centerPrincipalPoint Optional flag that indicates whether in the new camera intrinsic matrix the
+principal point should be at the image center or not. By default, the principal point is chosen to
+best fit a subset of the source image (determined by alpha) to the corrected image.
+@return The new camera intrinsic matrix.
+
+The function computes and returns the optimal new camera intrinsic matrix based on the free scaling parameter.
+By varying this parameter, you may retrieve only sensible pixels (alpha=0), keep all the original
+image pixels if there is valuable information in the corners (alpha=1), or get something in between.
+When alpha\>0 , the undistorted result is likely to have some black pixels corresponding to
+"virtual" pixels outside of the captured distorted image. The original camera intrinsic matrix, distortion
+coefficients, the computed new camera intrinsic matrix, and newImgSize should be passed to
+#initUndistortRectifyMap to produce the maps for #remap .
+ */
+CV_EXPORTS_W Mat getOptimalNewCameraMatrix( InputArray cameraMatrix, InputArray distCoeffs,
+                                            Size imageSize, double alpha, Size newImgSize = Size(),
+                                            CV_OUT Rect* validPixROI = 0,
+                                            bool centerPrincipalPoint = false);
+
+/** @brief Computes Hand-Eye calibration: \f$_{}^{g}\textrm{T}_c\f$
+
+@param[in] R_gripper2base Rotation part extracted from the homogeneous matrix that transforms a point
+expressed in the gripper frame to the robot base frame (\f$_{}^{b}\textrm{T}_g\f$).
+This is a vector (`vector<Mat>`) that contains the rotation, `(3x3)` rotation matrices or `(3x1)` rotation vectors,
+for all the transformations from gripper frame to robot base frame.
+@param[in] t_gripper2base Translation part extracted from the homogeneous matrix that transforms a point
+expressed in the gripper frame to the robot base frame (\f$_{}^{b}\textrm{T}_g\f$).
+This is a vector (`vector<Mat>`) that contains the `(3x1)` translation vectors for all the transformations
+from gripper frame to robot base frame.
+@param[in] R_target2cam Rotation part extracted from the homogeneous matrix that transforms a point
+expressed in the target frame to the camera frame (\f$_{}^{c}\textrm{T}_t\f$).
+This is a vector (`vector<Mat>`) that contains the rotation, `(3x3)` rotation matrices or `(3x1)` rotation vectors,
+for all the transformations from calibration target frame to camera frame.
+@param[in] t_target2cam Translation part extracted from the homogeneous matrix that transforms a point
+expressed in the target frame to the camera frame (\f$_{}^{c}\textrm{T}_t\f$).
+This is a vector (`vector<Mat>`) that contains the `(3x1)` translation vectors for all the transformations
+from calibration target frame to camera frame.
+@param[out] R_cam2gripper Estimated `(3x3)` rotation part extracted from the homogeneous matrix that transforms a point
+expressed in the camera frame to the gripper frame (\f$_{}^{g}\textrm{T}_c\f$).
+@param[out] t_cam2gripper Estimated `(3x1)` translation part extracted from the homogeneous matrix that transforms a point
+expressed in the camera frame to the gripper frame (\f$_{}^{g}\textrm{T}_c\f$).
+@param[in] method One of the implemented Hand-Eye calibration methods, see cv::HandEyeCalibrationMethod
+
+The function performs the Hand-Eye calibration using various methods. One approach consists in estimating the
+rotation then the translation (separable solutions) and the following methods are implemented:
+  - R. Tsai, R. Lenz A New Technique for Fully Autonomous and Efficient 3D Robotics Hand/Eye Calibration \cite Tsai89
+  - F. Park, B. Martin Robot Sensor Calibration: Solving AX = XB on the Euclidean Group \cite Park94
+  - R. Horaud, F. Dornaika Hand-Eye Calibration \cite Horaud95
+
+Another approach consists in estimating simultaneously the rotation and the translation (simultaneous solutions),
+with the following implemented methods:
+  - N. Andreff, R. Horaud, B. Espiau On-line Hand-Eye Calibration \cite Andreff99
+  - K. Daniilidis Hand-Eye Calibration Using Dual Quaternions \cite Daniilidis98
+
+The following picture describes the Hand-Eye calibration problem where the transformation between a camera ("eye")
+mounted on a robot gripper ("hand") has to be estimated. This configuration is called eye-in-hand.
+
+The eye-to-hand configuration consists in a static camera observing a calibration pattern mounted on the robot
+end-effector. The transformation from the camera to the robot base frame can then be estimated by inputting
+the suitable transformations to the function, see below.
+
+![](pics/hand-eye_figure.png)
+
+The calibration procedure is the following:
+  - a static calibration pattern is used to estimate the transformation between the target frame
+  and the camera frame
+  - the robot gripper is moved in order to acquire several poses
+  - for each pose, the homogeneous transformation between the gripper frame and the robot base frame is recorded using for
+  instance the robot kinematics
+\f[
+    \begin{bmatrix}
+    X_b\\
+    Y_b\\
+    Z_b\\
+    1
+    \end{bmatrix}
+    =
+    \begin{bmatrix}
+    _{}^{b}\textrm{R}_g & _{}^{b}\textrm{t}_g \\
+    0_{1 \times 3} & 1
+    \end{bmatrix}
+    \begin{bmatrix}
+    X_g\\
+    Y_g\\
+    Z_g\\
+    1
+    \end{bmatrix}
+\f]
+  - for each pose, the homogeneous transformation between the calibration target frame and the camera frame is recorded using
+  for instance a pose estimation method (PnP) from 2D-3D point correspondences
+\f[
+    \begin{bmatrix}
+    X_c\\
+    Y_c\\
+    Z_c\\
+    1
+    \end{bmatrix}
+    =
+    \begin{bmatrix}
+    _{}^{c}\textrm{R}_t & _{}^{c}\textrm{t}_t \\
+    0_{1 \times 3} & 1
+    \end{bmatrix}
+    \begin{bmatrix}
+    X_t\\
+    Y_t\\
+    Z_t\\
+    1
+    \end{bmatrix}
+\f]
+
+The Hand-Eye calibration procedure returns the following homogeneous transformation
+\f[
+    \begin{bmatrix}
+    X_g\\
+    Y_g\\
+    Z_g\\
+    1
+    \end{bmatrix}
+    =
+    \begin{bmatrix}
+    _{}^{g}\textrm{R}_c & _{}^{g}\textrm{t}_c \\
+    0_{1 \times 3} & 1
+    \end{bmatrix}
+    \begin{bmatrix}
+    X_c\\
+    Y_c\\
+    Z_c\\
+    1
+    \end{bmatrix}
+\f]
+
+This problem is also known as solving the \f$\mathbf{A}\mathbf{X}=\mathbf{X}\mathbf{B}\f$ equation:
+  - for an eye-in-hand configuration
+\f[
+    \begin{align*}
+    ^{b}{\textrm{T}_g}^{(1)} \hspace{0.2em} ^{g}\textrm{T}_c \hspace{0.2em} ^{c}{\textrm{T}_t}^{(1)} &=
+    \hspace{0.1em} ^{b}{\textrm{T}_g}^{(2)} \hspace{0.2em} ^{g}\textrm{T}_c \hspace{0.2em} ^{c}{\textrm{T}_t}^{(2)} \\
+
+    (^{b}{\textrm{T}_g}^{(2)})^{-1} \hspace{0.2em} ^{b}{\textrm{T}_g}^{(1)} \hspace{0.2em} ^{g}\textrm{T}_c &=
+    \hspace{0.1em} ^{g}\textrm{T}_c \hspace{0.2em} ^{c}{\textrm{T}_t}^{(2)} (^{c}{\textrm{T}_t}^{(1)})^{-1} \\
+
+    \textrm{A}_i \textrm{X} &= \textrm{X} \textrm{B}_i \\
+    \end{align*}
+\f]
+
+  - for an eye-to-hand configuration
+\f[
+    \begin{align*}
+    ^{g}{\textrm{T}_b}^{(1)} \hspace{0.2em} ^{b}\textrm{T}_c \hspace{0.2em} ^{c}{\textrm{T}_t}^{(1)} &=
+    \hspace{0.1em} ^{g}{\textrm{T}_b}^{(2)} \hspace{0.2em} ^{b}\textrm{T}_c \hspace{0.2em} ^{c}{\textrm{T}_t}^{(2)} \\
+
+    (^{g}{\textrm{T}_b}^{(2)})^{-1} \hspace{0.2em} ^{g}{\textrm{T}_b}^{(1)} \hspace{0.2em} ^{b}\textrm{T}_c &=
+    \hspace{0.1em} ^{b}\textrm{T}_c \hspace{0.2em} ^{c}{\textrm{T}_t}^{(2)} (^{c}{\textrm{T}_t}^{(1)})^{-1} \\
+
+    \textrm{A}_i \textrm{X} &= \textrm{X} \textrm{B}_i \\
+    \end{align*}
+\f]
+
+\note
+Additional information can be found on this [website](http://campar.in.tum.de/Chair/HandEyeCalibration).
+\note
+A minimum of 2 motions with non-parallel rotation axes are necessary to determine the hand-eye transformation.
+So at least 3 different poses are required, but it is strongly recommended to use many more poses.
+
+ */
+CV_EXPORTS_W void calibrateHandEye( InputArrayOfArrays R_gripper2base, InputArrayOfArrays t_gripper2base,
+                                    InputArrayOfArrays R_target2cam, InputArrayOfArrays t_target2cam,
+                                    OutputArray R_cam2gripper, OutputArray t_cam2gripper,
+                                    HandEyeCalibrationMethod method=CALIB_HAND_EYE_TSAI );
+
+/** @brief Computes Robot-World/Hand-Eye calibration: \f$_{}^{w}\textrm{T}_b\f$ and \f$_{}^{c}\textrm{T}_g\f$
+
+@param[in] R_world2cam Rotation part extracted from the homogeneous matrix that transforms a point
+expressed in the world frame to the camera frame (\f$_{}^{c}\textrm{T}_w\f$).
+This is a vector (`vector<Mat>`) that contains the rotation, `(3x3)` rotation matrices or `(3x1)` rotation vectors,
+for all the transformations from world frame to the camera frame.
+@param[in] t_world2cam Translation part extracted from the homogeneous matrix that transforms a point
+expressed in the world frame to the camera frame (\f$_{}^{c}\textrm{T}_w\f$).
+This is a vector (`vector<Mat>`) that contains the `(3x1)` translation vectors for all the transformations
+from world frame to the camera frame.
+@param[in] R_base2gripper Rotation part extracted from the homogeneous matrix that transforms a point
+expressed in the robot base frame to the gripper frame (\f$_{}^{g}\textrm{T}_b\f$).
+This is a vector (`vector<Mat>`) that contains the rotation, `(3x3)` rotation matrices or `(3x1)` rotation vectors,
+for all the transformations from robot base frame to the gripper frame.
+@param[in] t_base2gripper Translation part extracted from the homogeneous matrix that transforms a point
+expressed in the robot base frame to the gripper frame (\f$_{}^{g}\textrm{T}_b\f$).
+This is a vector (`vector<Mat>`) that contains the `(3x1)` translation vectors for all the transformations
+from robot base frame to the gripper frame.
+@param[out] R_base2world Estimated `(3x3)` rotation part extracted from the homogeneous matrix that transforms a point
+expressed in the robot base frame to the world frame (\f$_{}^{w}\textrm{T}_b\f$).
+@param[out] t_base2world Estimated `(3x1)` translation part extracted from the homogeneous matrix that transforms a point
+expressed in the robot base frame to the world frame (\f$_{}^{w}\textrm{T}_b\f$).
+@param[out] R_gripper2cam Estimated `(3x3)` rotation part extracted from the homogeneous matrix that transforms a point
+expressed in the gripper frame to the camera frame (\f$_{}^{c}\textrm{T}_g\f$).
+@param[out] t_gripper2cam Estimated `(3x1)` translation part extracted from the homogeneous matrix that transforms a point
+expressed in the gripper frame to the camera frame (\f$_{}^{c}\textrm{T}_g\f$).
+@param[in] method One of the implemented Robot-World/Hand-Eye calibration methods, see cv::RobotWorldHandEyeCalibrationMethod
+
+The function performs the Robot-World/Hand-Eye calibration using various methods. One approach consists in estimating the
+rotation then the translation (separable solutions):
+  - M. Shah, Solving the robot-world/hand-eye calibration problem using the kronecker product \cite Shah2013SolvingTR
+
+Another approach consists in estimating simultaneously the rotation and the translation (simultaneous solutions),
+with the following implemented method:
+  - A. Li, L. Wang, and D. Wu, Simultaneous robot-world and hand-eye calibration using dual-quaternions and kronecker product \cite Li2010SimultaneousRA
+
+The following picture describes the Robot-World/Hand-Eye calibration problem where the transformations between a robot and a world frame
+and between a robot gripper ("hand") and a camera ("eye") mounted at the robot end-effector have to be estimated.
+
+![](pics/robot-world_hand-eye_figure.png)
+
+The calibration procedure is the following:
+  - a static calibration pattern is used to estimate the transformation between the target frame
+  and the camera frame
+  - the robot gripper is moved in order to acquire several poses
+  - for each pose, the homogeneous transformation between the gripper frame and the robot base frame is recorded using for
+  instance the robot kinematics
+\f[
+    \begin{bmatrix}
+    X_g\\
+    Y_g\\
+    Z_g\\
+    1
+    \end{bmatrix}
+    =
+    \begin{bmatrix}
+    _{}^{g}\textrm{R}_b & _{}^{g}\textrm{t}_b \\
+    0_{1 \times 3} & 1
+    \end{bmatrix}
+    \begin{bmatrix}
+    X_b\\
+    Y_b\\
+    Z_b\\
+    1
+    \end{bmatrix}
+\f]
+  - for each pose, the homogeneous transformation between the calibration target frame (the world frame) and the camera frame is recorded using
+  for instance a pose estimation method (PnP) from 2D-3D point correspondences
+\f[
+    \begin{bmatrix}
+    X_c\\
+    Y_c\\
+    Z_c\\
+    1
+    \end{bmatrix}
+    =
+    \begin{bmatrix}
+    _{}^{c}\textrm{R}_w & _{}^{c}\textrm{t}_w \\
+    0_{1 \times 3} & 1
+    \end{bmatrix}
+    \begin{bmatrix}
+    X_w\\
+    Y_w\\
+    Z_w\\
+    1
+    \end{bmatrix}
+\f]
+
+The Robot-World/Hand-Eye calibration procedure returns the following homogeneous transformations
+\f[
+    \begin{bmatrix}
+    X_w\\
+    Y_w\\
+    Z_w\\
+    1
+    \end{bmatrix}
+    =
+    \begin{bmatrix}
+    _{}^{w}\textrm{R}_b & _{}^{w}\textrm{t}_b \\
+    0_{1 \times 3} & 1
+    \end{bmatrix}
+    \begin{bmatrix}
+    X_b\\
+    Y_b\\
+    Z_b\\
+    1
+    \end{bmatrix}
+\f]
+\f[
+    \begin{bmatrix}
+    X_c\\
+    Y_c\\
+    Z_c\\
+    1
+    \end{bmatrix}
+    =
+    \begin{bmatrix}
+    _{}^{c}\textrm{R}_g & _{}^{c}\textrm{t}_g \\
+    0_{1 \times 3} & 1
+    \end{bmatrix}
+    \begin{bmatrix}
+    X_g\\
+    Y_g\\
+    Z_g\\
+    1
+    \end{bmatrix}
+\f]
+
+This problem is also known as solving the \f$\mathbf{A}\mathbf{X}=\mathbf{Z}\mathbf{B}\f$ equation, with:
+  - \f$\mathbf{A} \Leftrightarrow \hspace{0.1em} _{}^{c}\textrm{T}_w\f$
+  - \f$\mathbf{X} \Leftrightarrow \hspace{0.1em} _{}^{w}\textrm{T}_b\f$
+  - \f$\mathbf{Z} \Leftrightarrow \hspace{0.1em} _{}^{c}\textrm{T}_g\f$
+  - \f$\mathbf{B} \Leftrightarrow \hspace{0.1em} _{}^{g}\textrm{T}_b\f$
+
+\note
+At least 3 measurements are required (input vectors size must be greater than or equal to 3).
+
+ */
+CV_EXPORTS_W void calibrateRobotWorldHandEye( InputArrayOfArrays R_world2cam, InputArrayOfArrays t_world2cam,
+                                              InputArrayOfArrays R_base2gripper, InputArrayOfArrays t_base2gripper,
+                                              OutputArray R_base2world, OutputArray t_base2world,
+                                              OutputArray R_gripper2cam, OutputArray t_gripper2cam,
+                                              RobotWorldHandEyeCalibrationMethod method=CALIB_ROBOT_WORLD_HAND_EYE_SHAH );
+
+/** @brief Converts points from Euclidean to homogeneous space.
+
+@param src Input vector of N-dimensional points.
+@param dst Output vector of N+1-dimensional points.
+
+The function converts points from Euclidean to homogeneous space by appending 1's to the tuple of
+point coordinates. That is, each point (x1, x2, ..., xn) is converted to (x1, x2, ..., xn, 1).
+ */
+CV_EXPORTS_W void convertPointsToHomogeneous( InputArray src, OutputArray dst );
+
+/** @brief Converts points from homogeneous to Euclidean space.
+
+@param src Input vector of N-dimensional points.
+@param dst Output vector of N-1-dimensional points.
+
+The function converts points from homogeneous to Euclidean space using perspective projection. That is,
+each point (x1, x2, ..., x(n-1), xn) is converted to (x1/xn, x2/xn, ..., x(n-1)/xn). When xn=0, the
+output point coordinates will be (0,0,0,...).
+ */
+CV_EXPORTS_W void convertPointsFromHomogeneous( InputArray src, OutputArray dst );
+
+/** @brief Converts points to/from homogeneous coordinates.
+
+@param src Input array or vector of 2D, 3D, or 4D points.
+@param dst Output vector of 2D, 3D, or 4D points.
+
+The function converts 2D or 3D points from/to homogeneous coordinates by calling either
+#convertPointsToHomogeneous or #convertPointsFromHomogeneous.
+
+@note The function is obsolete. Use one of the previous two functions instead.
+ */
+CV_EXPORTS void convertPointsHomogeneous( InputArray src, OutputArray dst );
+
+/** @brief Calculates a fundamental matrix from the corresponding points in two images.
+
+@param points1 Array of N points from the first image. The point coordinates should be
+floating-point (single or double precision).
+@param points2 Array of the second image points of the same size and format as points1 .
+@param method Method for computing a fundamental matrix.
+-   @ref FM_7POINT for a 7-point algorithm. \f$N = 7\f$
+-   @ref FM_8POINT for an 8-point algorithm. \f$N \ge 8\f$
+-   @ref FM_RANSAC for the RANSAC algorithm. \f$N \ge 8\f$
+-   @ref FM_LMEDS for the LMedS algorithm. \f$N \ge 8\f$
+@param ransacReprojThreshold Parameter used only for RANSAC. It is the maximum distance from a point to an epipolar
+line in pixels, beyond which the point is considered an outlier and is not used for computing the
+final fundamental matrix. It can be set to something like 1-3, depending on the accuracy of the
+point localization, image resolution, and the image noise.
+@param confidence Parameter used for the RANSAC and LMedS methods only. It specifies a desirable level
+of confidence (probability) that the estimated matrix is correct.
+@param[out] mask optional output mask
+@param maxIters The maximum number of robust method iterations.
+
+The epipolar geometry is described by the following equation:
+
+\f[[p_2; 1]^T F [p_1; 1] = 0\f]
+
+where \f$F\f$ is a fundamental matrix, \f$p_1\f$ and \f$p_2\f$ are corresponding points in the first and the
+second images, respectively.
+
+The function calculates the fundamental matrix using one of four methods listed above and returns
+the found fundamental matrix. Normally just one matrix is found. But in case of the 7-point
+algorithm, the function may return up to 3 solutions ( \f$9 \times 3\f$ matrix that stores all 3
+matrices sequentially).
+
+The calculated fundamental matrix may be passed further to #computeCorrespondEpilines that finds the
+epipolar lines corresponding to the specified points. It can also be passed to
+#stereoRectifyUncalibrated to compute the rectification transformation.
+@code
+    // Example. Estimation of fundamental matrix using the RANSAC algorithm
+    int point_count = 100;
+    vector<Point2f> points1(point_count);
+    vector<Point2f> points2(point_count);
+
+    // initialize the points here ...
+    for( int i = 0; i < point_count; i++ )
+    {
+        points1[i] = ...;
+        points2[i] = ...;
+    }
+
+    Mat fundamental_matrix =
+     findFundamentalMat(points1, points2, FM_RANSAC, 3, 0.99);
+@endcode
+ */
+CV_EXPORTS_W Mat findFundamentalMat( InputArray points1, InputArray points2,
+                                     int method, double ransacReprojThreshold, double confidence,
+                                     int maxIters, OutputArray mask = noArray() );
+
+/** @overload */
+CV_EXPORTS_W Mat findFundamentalMat( InputArray points1, InputArray points2,
+                                     int method = FM_RANSAC,
+                                     double ransacReprojThreshold = 3., double confidence = 0.99,
+                                     OutputArray mask = noArray() );
+
+/** @overload */
+CV_EXPORTS Mat findFundamentalMat( InputArray points1, InputArray points2,
+                                   OutputArray mask, int method = FM_RANSAC,
+                                   double ransacReprojThreshold = 3., double confidence = 0.99 );
+
+/** @overload */
+CV_EXPORTS_W Mat findFundamentalMat( InputArray points1, InputArray points2,
+                        OutputArray mask, const UsacParams &params);
+
+/** @brief Calculates an essential matrix from the corresponding points in two images.
+
+@param points1 Array of N (N \>= 5) 2D points from the first image. The point coordinates should
+be floating-point (single or double precision).
+@param points2 Array of the second image points of the same size and format as points1.
+@param cameraMatrix Camera intrinsic matrix \f$\cameramatrix{A}\f$ .
+Note that this function assumes that points1 and points2 are feature points from cameras with the
+same camera intrinsic matrix. If this assumption does not hold for your use case, use another
+function overload or #undistortPoints with `P = cv::noArray()` for both cameras to transform image
+points to normalized image coordinates, which are valid for the identity camera intrinsic matrix.
+When passing these coordinates, pass the identity matrix for this parameter.
+@param method Method for computing an essential matrix.
+-   @ref RANSAC for the RANSAC algorithm.
+-   @ref LMEDS for the LMedS algorithm.
+@param prob Parameter used for the RANSAC or LMedS methods only. It specifies a desirable level of
+confidence (probability) that the estimated matrix is correct.
+@param threshold Parameter used for RANSAC. It is the maximum distance from a point to an epipolar
+line in pixels, beyond which the point is considered an outlier and is not used for computing the
+final fundamental matrix. It can be set to something like 1-3, depending on the accuracy of the
+point localization, image resolution, and the image noise.
+@param mask Output array of N elements, every element of which is set to 0 for outliers and to 1
+for the other points. The array is computed only in the RANSAC and LMedS methods.
+@param maxIters The maximum number of robust method iterations.
+
+This function estimates essential matrix based on the five-point algorithm solver in @cite Nister03 .
+@cite SteweniusCFS is also a related work. The epipolar geometry is described by the following equation:
+
+\f[[p_2; 1]^T K^{-T} E K^{-1} [p_1; 1] = 0\f]
+
+where \f$E\f$ is an essential matrix, \f$p_1\f$ and \f$p_2\f$ are corresponding points in the first and the
+second images, respectively. The result of this function may be passed further to
+#decomposeEssentialMat or #recoverPose to recover the relative pose between cameras.
+ */
+CV_EXPORTS_W
+Mat findEssentialMat(
+    InputArray points1, InputArray points2,
+    InputArray cameraMatrix, int method = RANSAC,
+    double prob = 0.999, double threshold = 1.0,
+    int maxIters = 1000, OutputArray mask = noArray()
+);
+
+/** @overload */
+CV_EXPORTS
+Mat findEssentialMat(
+    InputArray points1, InputArray points2,
+    InputArray cameraMatrix, int method,
+    double prob, double threshold,
+    OutputArray mask
+);  // TODO remove from OpenCV 5.0
+
+/** @overload
+@param points1 Array of N (N \>= 5) 2D points from the first image. The point coordinates should
+be floating-point (single or double precision).
+@param points2 Array of the second image points of the same size and format as points1 .
+@param focal focal length of the camera. Note that this function assumes that points1 and points2
+are feature points from cameras with same focal length and principal point.
+@param pp principal point of the camera.
+@param method Method for computing an essential matrix.
+-   @ref RANSAC for the RANSAC algorithm.
+-   @ref LMEDS for the LMedS algorithm.
+@param threshold Parameter used for RANSAC. It is the maximum distance from a point to an epipolar
+line in pixels, beyond which the point is considered an outlier and is not used for computing the
+final fundamental matrix. It can be set to something like 1-3, depending on the accuracy of the
+point localization, image resolution, and the image noise.
+@param prob Parameter used for the RANSAC or LMedS methods only. It specifies a desirable level of
+confidence (probability) that the estimated matrix is correct.
+@param mask Output array of N elements, every element of which is set to 0 for outliers and to 1
+for the other points. The array is computed only in the RANSAC and LMedS methods.
+@param maxIters The maximum number of robust method iterations.
+
+This function differs from the one above in that it computes the camera intrinsic matrix from focal length and
+principal point:
+
+\f[A =
+\begin{bmatrix}
+f & 0 & x_{pp}  \\
+0 & f & y_{pp}  \\
+0 & 0 & 1
+\end{bmatrix}\f]
+ */
+CV_EXPORTS_W
+Mat findEssentialMat(
+    InputArray points1, InputArray points2,
+    double focal = 1.0, Point2d pp = Point2d(0, 0),
+    int method = RANSAC, double prob = 0.999,
+    double threshold = 1.0, int maxIters = 1000,
+    OutputArray mask = noArray()
+);
+
+/** @overload */
+CV_EXPORTS
+Mat findEssentialMat(
+    InputArray points1, InputArray points2,
+    double focal, Point2d pp,
+    int method, double prob,
+    double threshold, OutputArray mask
+);  // TODO remove from OpenCV 5.0
+
+/** @brief Calculates an essential matrix from the corresponding points in two images from potentially two different cameras.
+
+@param points1 Array of N (N \>= 5) 2D points from the first image. The point coordinates should
+be floating-point (single or double precision).
+@param points2 Array of the second image points of the same size and format as points1.
+@param cameraMatrix1 Camera matrix for the first camera \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
+@param cameraMatrix2 Camera matrix for the second camera \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
+@param distCoeffs1 Input vector of distortion coefficients for the first camera
+\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6[, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$
+of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are assumed.
+@param distCoeffs2 Input vector of distortion coefficients for the second camera
+\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6[, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$
+of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are assumed.
+@param method Method for computing an essential matrix.
+-   @ref RANSAC for the RANSAC algorithm.
+-   @ref LMEDS for the LMedS algorithm.
+@param prob Parameter used for the RANSAC or LMedS methods only. It specifies a desirable level of
+confidence (probability) that the estimated matrix is correct.
+@param threshold Parameter used for RANSAC. It is the maximum distance from a point to an epipolar
+line in pixels, beyond which the point is considered an outlier and is not used for computing the
+final fundamental matrix. It can be set to something like 1-3, depending on the accuracy of the
+point localization, image resolution, and the image noise.
+@param mask Output array of N elements, every element of which is set to 0 for outliers and to 1
+for the other points. The array is computed only in the RANSAC and LMedS methods.
+
+This function estimates essential matrix based on the five-point algorithm solver in @cite Nister03 .
+@cite SteweniusCFS is also a related work. The epipolar geometry is described by the following equation:
+
+\f[[p_2; 1]^T K^{-T} E K^{-1} [p_1; 1] = 0\f]
+
+where \f$E\f$ is an essential matrix, \f$p_1\f$ and \f$p_2\f$ are corresponding points in the first and the
+second images, respectively. The result of this function may be passed further to
+#decomposeEssentialMat or  #recoverPose to recover the relative pose between cameras.
+ */
+CV_EXPORTS_W Mat findEssentialMat( InputArray points1, InputArray points2,
+                                 InputArray cameraMatrix1, InputArray distCoeffs1,
+                                 InputArray cameraMatrix2, InputArray distCoeffs2,
+                                 int method = RANSAC,
+                                 double prob = 0.999, double threshold = 1.0,
+                                 OutputArray mask = noArray() );
+
+/** @brief Overload configured via UsacParams; unlike the overload above, both camera matrices precede both distortion-coefficient arguments. */
+CV_EXPORTS_W Mat findEssentialMat( InputArray points1, InputArray points2,
+                      InputArray cameraMatrix1, InputArray cameraMatrix2,
+                      InputArray dist_coeff1, InputArray dist_coeff2, OutputArray mask,
+                      const UsacParams &params);
+
+/** @brief Decompose an essential matrix to possible rotations and translation.
+
+@param E The input essential matrix.
+@param R1 One possible rotation matrix.
+@param R2 Another possible rotation matrix.
+@param t One possible translation.
+
+This function decomposes the essential matrix E using svd decomposition @cite HartleyZ00. In
+general, four possible poses exist for the decomposition of E. They are \f$[R_1, t]\f$,
+\f$[R_1, -t]\f$, \f$[R_2, t]\f$, \f$[R_2, -t]\f$.
+
+If E gives the epipolar constraint \f$[p_2; 1]^T A^{-T} E A^{-1} [p_1; 1] = 0\f$ between the image
+points \f$p_1\f$ in the first image and \f$p_2\f$ in second image, then any of the tuples
+\f$[R_1, t]\f$, \f$[R_1, -t]\f$, \f$[R_2, t]\f$, \f$[R_2, -t]\f$ is a change of basis from the first
+camera's coordinate system to the second camera's coordinate system. However, by decomposing E, one
+can only get the direction of the translation. For this reason, the translation t is returned with
+unit length.
+ */
+CV_EXPORTS_W void decomposeEssentialMat( InputArray E, OutputArray R1, OutputArray R2, OutputArray t );
+
+/** @brief Recovers the relative camera rotation and the translation from corresponding points in two images from two different cameras, using cheirality check. Returns the number of
+inliers that pass the check.
+
+@param points1 Array of N 2D points from the first image. The point coordinates should be
+floating-point (single or double precision).
+@param points2 Array of the second image points of the same size and format as points1 .
+@param cameraMatrix1 Input camera matrix for the first camera, the same as in
+@ref calibrateCamera. Furthermore, for the stereo case, additional flags may be used, see below.
+@param distCoeffs1 Input vector of distortion coefficients for the first camera, the same as in
+@ref calibrateCamera.
+@param cameraMatrix2 Input camera matrix for the second camera, the same as in
+@ref calibrateCamera. Furthermore, for the stereo case, additional flags may be used, see below.
+@param distCoeffs2 Input vector of distortion coefficients for the second camera, the same as in
+@ref calibrateCamera.
+@param E The output essential matrix.
+@param R Output rotation matrix. Together with the translation vector, this matrix makes up a tuple
+that performs a change of basis from the first camera's coordinate system to the second camera's
+coordinate system. Note that, in general, t can not be used for this tuple, see the parameter
+described below.
+@param t Output translation vector. This vector is obtained by @ref decomposeEssentialMat and
+therefore is only known up to scale, i.e. t is the direction of the translation vector and has unit
+length.
+@param method Method for computing an essential matrix.
+-   @ref RANSAC for the RANSAC algorithm.
+-   @ref LMEDS for the LMedS algorithm.
+@param prob Parameter used for the RANSAC or LMedS methods only. It specifies a desirable level of
+confidence (probability) that the estimated matrix is correct.
+@param threshold Parameter used for RANSAC. It is the maximum distance from a point to an epipolar
+line in pixels, beyond which the point is considered an outlier and is not used for computing the
+final fundamental matrix. It can be set to something like 1-3, depending on the accuracy of the
+point localization, image resolution, and the image noise.
+@param mask Input/output mask for inliers in points1 and points2. If it is not empty, then it marks
+inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to
+recover pose. The output mask contains only the inliers which pass the cheirality check.
+
+This function decomposes an essential matrix using @ref decomposeEssentialMat and then verifies
+possible pose hypotheses by doing cheirality check. The cheirality check means that the
+triangulated 3D points should have positive depth. Some details can be found in @cite Nister03.
+
+This function can be used to process the output E and mask from @ref findEssentialMat. In this
+scenario, points1 and points2 are the same input for #findEssentialMat :
+@code
+    // Example. Estimation of fundamental matrix using the RANSAC algorithm
+    int point_count = 100;
+    vector<Point2f> points1(point_count);
+    vector<Point2f> points2(point_count);
+
+    // initialize the points here ...
+    for( int i = 0; i < point_count; i++ )
+    {
+        points1[i] = ...;
+        points2[i] = ...;
+    }
+
+    // Input: camera calibration of both cameras, for example using intrinsic chessboard calibration.
+    Mat cameraMatrix1, distCoeffs1, cameraMatrix2, distCoeffs2;
+
+    // Output: Essential matrix, relative rotation and relative translation.
+    Mat E, R, t, mask;
+
+    recoverPose(points1, points2, cameraMatrix1, distCoeffs1, cameraMatrix2, distCoeffs2, E, R, t, RANSAC, 0.999, 1.0, mask);
+@endcode
+ */
+CV_EXPORTS_W int recoverPose( InputArray points1, InputArray points2,
+                            InputArray cameraMatrix1, InputArray distCoeffs1,
+                            InputArray cameraMatrix2, InputArray distCoeffs2,
+                            OutputArray E, OutputArray R, OutputArray t,
+                            int method = cv::RANSAC, double prob = 0.999, double threshold = 1.0,
+                            InputOutputArray mask = noArray());
+
+/** @brief Recovers the relative camera rotation and the translation from an estimated essential
+matrix and the corresponding points in two images, using chirality check. Returns the number of
+inliers that pass the check.
+
+@param E The input essential matrix.
+@param points1 Array of N 2D points from the first image. The point coordinates should be
+floating-point (single or double precision).
+@param points2 Array of the second image points of the same size and format as points1 .
+@param cameraMatrix Camera intrinsic matrix \f$\cameramatrix{A}\f$ .
+Note that this function assumes that points1 and points2 are feature points from cameras with the
+same camera intrinsic matrix.
+@param R Output rotation matrix. Together with the translation vector, this matrix makes up a tuple
+that performs a change of basis from the first camera's coordinate system to the second camera's
+coordinate system. Note that, in general, t can not be used for this tuple, see the parameter
+described below.
+@param t Output translation vector. This vector is obtained by @ref decomposeEssentialMat and
+therefore is only known up to scale, i.e. t is the direction of the translation vector and has unit
+length.
+@param mask Input/output mask for inliers in points1 and points2. If it is not empty, then it marks
+inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to
+recover pose. In the output mask only inliers which pass the chirality check.
+
+This function decomposes an essential matrix using @ref decomposeEssentialMat and then verifies
+possible pose hypotheses by doing chirality check. The chirality check means that the
+triangulated 3D points should have positive depth. Some details can be found in @cite Nister03.
+
+This function can be used to process the output E and mask from @ref findEssentialMat. In this
+scenario, points1 and points2 are the same input for #findEssentialMat :
+@code
+    // Example. Estimation of fundamental matrix using the RANSAC algorithm
+    int point_count = 100;
+    vector<Point2f> points1(point_count);
+    vector<Point2f> points2(point_count);
+
+    // initialize the points here ...
+    for( int i = 0; i < point_count; i++ )
+    {
+        points1[i] = ...;
+        points2[i] = ...;
+    }
+
+    // camera matrix with both focal lengths = 1, and principal point = (0, 0)
+    Mat cameraMatrix = Mat::eye(3, 3, CV_64F);
+
+    Mat E, R, t, mask;
+
+    E = findEssentialMat(points1, points2, cameraMatrix, RANSAC, 0.999, 1.0, mask);
+    recoverPose(E, points1, points2, cameraMatrix, R, t, mask);
+@endcode
+ */
+CV_EXPORTS_W int recoverPose( InputArray E, InputArray points1, InputArray points2,
+                            InputArray cameraMatrix, OutputArray R, OutputArray t,
+                            InputOutputArray mask = noArray() );
+
+/** @overload
+@param E The input essential matrix.
+@param points1 Array of N 2D points from the first image. The point coordinates should be
+floating-point (single or double precision).
+@param points2 Array of the second image points of the same size and format as points1 .
+@param R Output rotation matrix. Together with the translation vector, this matrix makes up a tuple
+that performs a change of basis from the first camera's coordinate system to the second camera's
+coordinate system. Note that, in general, t can not be used for this tuple, see the parameter
+description below.
+@param t Output translation vector. This vector is obtained by @ref decomposeEssentialMat and
+therefore is only known up to scale, i.e. t is the direction of the translation vector and has unit
+length.
+@param focal Focal length of the camera. Note that this function assumes that points1 and points2
+are feature points from cameras with same focal length and principal point.
+@param pp principal point of the camera.
+@param mask Input/output mask for inliers in points1 and points2. If it is not empty, then it marks
+inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to
+recover pose. In the output mask only inliers which pass the chirality check.
+
+This function differs from the one above in that it computes the camera intrinsic matrix from focal length and
+principal point:
+
+\f[A =
+\begin{bmatrix}
+f & 0 & x_{pp}  \\
+0 & f & y_{pp}  \\
+0 & 0 & 1
+\end{bmatrix}\f]
+ */
+CV_EXPORTS_W int recoverPose( InputArray E, InputArray points1, InputArray points2,
+                            OutputArray R, OutputArray t,
+                            double focal = 1.0, Point2d pp = Point2d(0, 0),
+                            InputOutputArray mask = noArray() );
+
+/** @overload
+@param E The input essential matrix.
+@param points1 Array of N 2D points from the first image. The point coordinates should be
+floating-point (single or double precision).
+@param points2 Array of the second image points of the same size and format as points1.
+@param cameraMatrix Camera intrinsic matrix \f$\cameramatrix{A}\f$ .
+Note that this function assumes that points1 and points2 are feature points from cameras with the
+same camera intrinsic matrix.
+@param R Output rotation matrix. Together with the translation vector, this matrix makes up a tuple
+that performs a change of basis from the first camera's coordinate system to the second camera's
+coordinate system. Note that, in general, t can not be used for this tuple, see the parameter
+description below.
+@param t Output translation vector. This vector is obtained by @ref decomposeEssentialMat and
+therefore is only known up to scale, i.e. t is the direction of the translation vector and has unit
+length.
+@param distanceThresh threshold distance which is used to filter out far away points (i.e. infinite
+points).
+@param mask Input/output mask for inliers in points1 and points2. If it is not empty, then it marks
+inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to
+recover pose. In the output mask only inliers which pass the chirality check.
+@param triangulatedPoints 3D points which were reconstructed by triangulation.
+
+This function differs from the one above in that it outputs the triangulated 3D points that are used for
+the chirality check.
+ */
+CV_EXPORTS_W int recoverPose( InputArray E, InputArray points1, InputArray points2,
+                            InputArray cameraMatrix, OutputArray R, OutputArray t, double distanceThresh, InputOutputArray mask = noArray(),
+                            OutputArray triangulatedPoints = noArray());
+
+/** @brief For points in an image of a stereo pair, computes the corresponding epilines in the other image.
+
+@param points Input points. \f$N \times 1\f$ or \f$1 \times N\f$ matrix of type CV_32FC2 or
+vector\<Point2f\> .
+@param whichImage Index of the image (1 or 2) that contains the points .
+@param F Fundamental matrix that can be estimated using #findFundamentalMat or #stereoCalibrate .
+@param lines Output vector of the epipolar lines corresponding to the points in the other image.
+Each line \f$ax + by + c=0\f$ is encoded by 3 numbers \f$(a, b, c)\f$ .
+
+For every point in one of the two images of a stereo pair, the function finds the equation of the
+corresponding epipolar line in the other image.
+
+From the fundamental matrix definition (see #findFundamentalMat ), line \f$l^{(2)}_i\f$ in the second
+image for the point \f$p^{(1)}_i\f$ in the first image (when whichImage=1 ) is computed as:
+
+\f[l^{(2)}_i = F p^{(1)}_i\f]
+
+And vice versa, when whichImage=2, \f$l^{(1)}_i\f$ is computed from \f$p^{(2)}_i\f$ as:
+
+\f[l^{(1)}_i = F^T p^{(2)}_i\f]
+
+Line coefficients are defined up to a scale. They are normalized so that \f$a_i^2+b_i^2=1\f$ .
+ */
+CV_EXPORTS_W void computeCorrespondEpilines( InputArray points, int whichImage,
+                                             InputArray F, OutputArray lines );
+
+/** @brief This function reconstructs 3-dimensional points (in homogeneous coordinates) by using
+their observations with a stereo camera.
+
+@param projMatr1 3x4 projection matrix of the first camera, i.e. this matrix projects 3D points
+given in the world's coordinate system into the first image.
+@param projMatr2 3x4 projection matrix of the second camera, i.e. this matrix projects 3D points
+given in the world's coordinate system into the second image.
+@param projPoints1 2xN array of feature points in the first image. In the case of the c++ version,
+it can be also a vector of feature points or two-channel matrix of size 1xN or Nx1.
+@param projPoints2 2xN array of corresponding points in the second image. In the case of the c++
+version, it can be also a vector of feature points or two-channel matrix of size 1xN or Nx1.
+@param points4D 4xN array of reconstructed points in homogeneous coordinates. These points are
+returned in the world's coordinate system.
+
+@note
+   Keep in mind that all input data should be of float type in order for this function to work.
+
+@note
+   If the projection matrices from @ref stereoRectify are used, then the returned points are
+   represented in the first camera's rectified coordinate system.
+
+@sa
+   reprojectImageTo3D
+ */
+CV_EXPORTS_W void triangulatePoints( InputArray projMatr1, InputArray projMatr2,
+                                     InputArray projPoints1, InputArray projPoints2,
+                                     OutputArray points4D );
+
+/** @brief Refines coordinates of corresponding points.
+
+@param F 3x3 fundamental matrix.
+@param points1 1xN array containing the first set of points.
+@param points2 1xN array containing the second set of points.
+@param newPoints1 The optimized points1.
+@param newPoints2 The optimized points2.
+
+The function implements the Optimal Triangulation Method (see Multiple View Geometry @cite HartleyZ00 for details).
+For each given point correspondence points1[i] \<-\> points2[i], and a fundamental matrix F, it
+computes the corrected correspondences newPoints1[i] \<-\> newPoints2[i] that minimize the geometric
+error \f$d(points1[i], newPoints1[i])^2 + d(points2[i],newPoints2[i])^2\f$ (where \f$d(a,b)\f$ is the
+geometric distance between points \f$a\f$ and \f$b\f$ ) subject to the epipolar constraint
+\f$newPoints2^T \cdot F \cdot newPoints1 = 0\f$ .
+ */
+CV_EXPORTS_W void correctMatches( InputArray F, InputArray points1, InputArray points2,
+                                  OutputArray newPoints1, OutputArray newPoints2 );
+
+/** @brief Filters off small noise blobs (speckles) in the disparity map
+
+@param img The input 16-bit signed disparity image
+@param newVal The disparity value used to paint-off the speckles
+@param maxSpeckleSize The maximum speckle size to consider it a speckle. Larger blobs are not
+affected by the algorithm
+@param maxDiff Maximum difference between neighbor disparity pixels to put them into the same
+blob. Note that since StereoBM, StereoSGBM and maybe other algorithms return a fixed-point
+disparity map, where disparity values are multiplied by 16, this scale factor should be taken into
+account when specifying this parameter value.
+@param buf The optional temporary buffer to avoid memory allocation within the function.
+ */
+CV_EXPORTS_W void filterSpeckles( InputOutputArray img, double newVal,
+                                  int maxSpeckleSize, double maxDiff,
+                                  InputOutputArray buf = noArray() );
+
+//! Computes the valid disparity ROI from the valid ROIs of the rectified images (as returned by #stereoRectify).
+CV_EXPORTS_W Rect getValidDisparityROI( Rect roi1, Rect roi2,
+                                        int minDisparity, int numberOfDisparities,
+                                        int blockSize );
+
+//! Validates the disparity using the left-right check. The matrix "cost" should be computed by the stereo correspondence algorithm.
+CV_EXPORTS_W void validateDisparity( InputOutputArray disparity, InputArray cost,
+                                     int minDisparity, int numberOfDisparities,
+                                     int disp12MaxDisp = 1 );
+
+/** @brief Reprojects a disparity image to 3D space.
+
+@param disparity Input single-channel 8-bit unsigned, 16-bit signed, 32-bit signed or 32-bit
+floating-point disparity image. The values of 8-bit / 16-bit signed formats are assumed to have no
+fractional bits. If the disparity is 16-bit signed format, as computed by @ref StereoBM or
+@ref StereoSGBM and maybe other algorithms, it should be divided by 16 (and scaled to float) before
+being used here.
+@param _3dImage Output 3-channel floating-point image of the same size as disparity. Each element of
+_3dImage(x,y) contains 3D coordinates of the point (x,y) computed from the disparity map. If one
+uses Q obtained by @ref stereoRectify, then the returned points are represented in the first
+camera's rectified coordinate system.
+@param Q \f$4 \times 4\f$ perspective transformation matrix that can be obtained with
+@ref stereoRectify.
+@param handleMissingValues Indicates, whether the function should handle missing values (i.e.
+points where the disparity was not computed). If handleMissingValues=true, then pixels with the
+minimal disparity that corresponds to the outliers (see StereoMatcher::compute ) are transformed
+to 3D points with a very large Z value (currently set to 10000).
+@param ddepth The optional output array depth. If it is -1, the output image will have CV_32F
+depth. ddepth can also be set to CV_16S, CV_32S or CV_32F.
+
+The function transforms a single-channel disparity map to a 3-channel image representing a 3D
+surface. That is, for each pixel (x,y) and the corresponding disparity d=disparity(x,y) , it
+computes:
+
+\f[\begin{bmatrix}
+X \\
+Y \\
+Z \\
+W
+\end{bmatrix} = Q \begin{bmatrix}
+x \\
+y \\
+\texttt{disparity} (x,y) \\
+1
+\end{bmatrix}.\f]
+
+@sa
+   To reproject a sparse set of points {(x,y,d),...} to 3D space, use perspectiveTransform.
+ */
+CV_EXPORTS_W void reprojectImageTo3D( InputArray disparity,
+                                      OutputArray _3dImage, InputArray Q,
+                                      bool handleMissingValues = false,
+                                      int ddepth = -1 );
+
+/** @brief Calculates the Sampson Distance between two points.
+
+The function cv::sampsonDistance calculates and returns the first order approximation of the geometric error as:
+\f[
+sd( \texttt{pt1} , \texttt{pt2} )=
+\frac{(\texttt{pt2}^t \cdot \texttt{F} \cdot \texttt{pt1})^2}
+{((\texttt{F} \cdot \texttt{pt1})(0))^2 +
+((\texttt{F} \cdot \texttt{pt1})(1))^2 +
+((\texttt{F}^t \cdot \texttt{pt2})(0))^2 +
+((\texttt{F}^t \cdot \texttt{pt2})(1))^2}
+\f]
+The fundamental matrix may be calculated using the #findFundamentalMat function. See @cite HartleyZ00 11.4.3 for details.
+@param pt1 first homogeneous 2d point
+@param pt2 second homogeneous 2d point
+@param F fundamental matrix
+@return The computed Sampson distance.
+*/
+CV_EXPORTS_W double sampsonDistance(InputArray pt1, InputArray pt2, InputArray F);
+
+/** @brief Computes an optimal affine transformation between two 3D point sets.
+
+It computes
+\f[
+\begin{bmatrix}
+x\\
+y\\
+z\\
+\end{bmatrix}
+=
+\begin{bmatrix}
+a_{11} & a_{12} & a_{13}\\
+a_{21} & a_{22} & a_{23}\\
+a_{31} & a_{32} & a_{33}\\
+\end{bmatrix}
+\begin{bmatrix}
+X\\
+Y\\
+Z\\
+\end{bmatrix}
++
+\begin{bmatrix}
+b_1\\
+b_2\\
+b_3\\
+\end{bmatrix}
+\f]
+
+@param src First input 3D point set containing \f$(X,Y,Z)\f$.
+@param dst Second input 3D point set containing \f$(x,y,z)\f$.
+@param out Output 3D affine transformation matrix \f$3 \times 4\f$ of the form
+\f[
+\begin{bmatrix}
+a_{11} & a_{12} & a_{13} & b_1\\
+a_{21} & a_{22} & a_{23} & b_2\\
+a_{31} & a_{32} & a_{33} & b_3\\
+\end{bmatrix}
+\f]
+@param inliers Output vector indicating which points are inliers (1-inlier, 0-outlier).
+@param ransacThreshold Maximum reprojection error in the RANSAC algorithm to consider a point as
+an inlier.
+@param confidence Confidence level, between 0 and 1, for the estimated transformation. Anything
+between 0.95 and 0.99 is usually good enough. Values too close to 1 can slow down the estimation
+significantly. Values lower than 0.8-0.9 can result in an incorrectly estimated transformation.
+
+The function estimates an optimal 3D affine transformation between two 3D point sets using the
+RANSAC algorithm.
+ */
+CV_EXPORTS_W  int estimateAffine3D(InputArray src, InputArray dst,
+                                   OutputArray out, OutputArray inliers,
+                                   double ransacThreshold = 3, double confidence = 0.99);
+
+/** @brief Computes an optimal affine transformation between two 3D point sets.
+
+It computes \f$R,s,t\f$ minimizing \f$\sum_{i} \| dst_i - (s \cdot R \cdot src_i + t) \|^2 \f$
+where \f$R\f$ is a 3x3 rotation matrix, \f$t\f$ is a 3x1 translation vector and \f$s\f$ is a
+scalar size value. This is an implementation of the algorithm by Umeyama \cite umeyama1991least .
+The estimated affine transform has a homogeneous scale which is a subclass of affine
+transformations with 7 degrees of freedom. The paired point sets need to comprise at least 3
+points each.
+
+@param src First input 3D point set.
+@param dst Second input 3D point set.
+@param scale If null is passed, the scale parameter s will be assumed to be 1.0.
+Else the pointed-to variable will be set to the optimal scale.
+@param force_rotation If true, the returned rotation will never be a reflection.
+This might be unwanted, e.g. when optimizing a transform between a right- and a
+left-handed coordinate system.
+@return 3D affine transformation matrix \f$3 \times 4\f$ of the form
+\f[T =
+\begin{bmatrix}
+R & t\\
+\end{bmatrix}
+\f]
+
+ */
+CV_EXPORTS_W   cv::Mat estimateAffine3D(InputArray src, InputArray dst,
+                                        CV_OUT double* scale = nullptr, bool force_rotation = true);
+
+/** @brief Computes an optimal translation between two 3D point sets.
+ *
+ * It computes
+ * \f[
+ * \begin{bmatrix}
+ * x\\
+ * y\\
+ * z\\
+ * \end{bmatrix}
+ * =
+ * \begin{bmatrix}
+ * X\\
+ * Y\\
+ * Z\\
+ * \end{bmatrix}
+ * +
+ * \begin{bmatrix}
+ * b_1\\
+ * b_2\\
+ * b_3\\
+ * \end{bmatrix}
+ * \f]
+ *
+ * @param src First input 3D point set containing \f$(X,Y,Z)\f$.
+ * @param dst Second input 3D point set containing \f$(x,y,z)\f$.
+ * @param out Output 3D translation vector \f$3 \times 1\f$ of the form
+ * \f[
+ * \begin{bmatrix}
+ * b_1 \\
+ * b_2 \\
+ * b_3 \\
+ * \end{bmatrix}
+ * \f]
+ * @param inliers Output vector indicating which points are inliers (1-inlier, 0-outlier).
+ * @param ransacThreshold Maximum reprojection error in the RANSAC algorithm to consider a point as
+ * an inlier.
+ * @param confidence Confidence level, between 0 and 1, for the estimated transformation. Anything
+ * between 0.95 and 0.99 is usually good enough. Values too close to 1 can slow down the estimation
+ * significantly. Values lower than 0.8-0.9 can result in an incorrectly estimated transformation.
+ *
+ * The function estimates an optimal 3D translation between two 3D point sets using the
+ * RANSAC algorithm.
+ *  */
+CV_EXPORTS_W  int estimateTranslation3D(InputArray src, InputArray dst,
+                                        OutputArray out, OutputArray inliers,
+                                        double ransacThreshold = 3, double confidence = 0.99);
+
+/** @brief Computes an optimal affine transformation between two 2D point sets.
+
+It computes
+\f[
+\begin{bmatrix}
+x\\
+y\\
+\end{bmatrix}
+=
+\begin{bmatrix}
+a_{11} & a_{12}\\
+a_{21} & a_{22}\\
+\end{bmatrix}
+\begin{bmatrix}
+X\\
+Y\\
+\end{bmatrix}
++
+\begin{bmatrix}
+b_1\\
+b_2\\
+\end{bmatrix}
+\f]
+
+@param from First input 2D point set containing \f$(X,Y)\f$.
+@param to Second input 2D point set containing \f$(x,y)\f$.
+@param inliers Output vector indicating which points are inliers (1-inlier, 0-outlier).
+@param method Robust method used to compute transformation. The following methods are possible:
+-   @ref RANSAC - RANSAC-based robust method
+-   @ref LMEDS - Least-Median robust method
+RANSAC is the default method.
+@param ransacReprojThreshold Maximum reprojection error in the RANSAC algorithm to consider
+a point as an inlier. Applies only to RANSAC.
+@param maxIters The maximum number of robust method iterations.
+@param confidence Confidence level, between 0 and 1, for the estimated transformation. Anything
+between 0.95 and 0.99 is usually good enough. Values too close to 1 can slow down the estimation
+significantly. Values lower than 0.8-0.9 can result in an incorrectly estimated transformation.
+@param refineIters Maximum number of iterations of refining algorithm (Levenberg-Marquardt).
+Passing 0 will disable refining, so the output matrix will be output of robust method.
+
+@return Output 2D affine transformation matrix \f$2 \times 3\f$ or empty matrix if transformation
+could not be estimated. The returned matrix has the following form:
+\f[
+\begin{bmatrix}
+a_{11} & a_{12} & b_1\\
+a_{21} & a_{22} & b_2\\
+\end{bmatrix}
+\f]
+
+The function estimates an optimal 2D affine transformation between two 2D point sets using the
+selected robust algorithm.
+
+The computed transformation is then refined further (using only inliers) with the
+Levenberg-Marquardt method to reduce the re-projection error even more.
+
+@note
+The RANSAC method can handle practically any ratio of outliers but needs a threshold to
+distinguish inliers from outliers. The method LMeDS does not need any threshold but it works
+correctly only when there are more than 50% of inliers.
+
+@sa estimateAffinePartial2D, getAffineTransform
+*/
+CV_EXPORTS_W cv::Mat estimateAffine2D(InputArray from, InputArray to, OutputArray inliers = noArray(),
+                                  int method = RANSAC, double ransacReprojThreshold = 3,
+                                  size_t maxIters = 2000, double confidence = 0.99,
+                                  size_t refineIters = 10);
+
+/** @overload Variant that takes its settings from a UsacParams structure instead of individual arguments. */
+CV_EXPORTS_W cv::Mat estimateAffine2D(InputArray pts1, InputArray pts2, OutputArray inliers,
+                     const UsacParams &params);
+
+/** @brief Computes an optimal limited affine transformation with 4 degrees of freedom between
+two 2D point sets.
+
+@param from First input 2D point set.
+@param to Second input 2D point set.
+@param inliers Output vector indicating which points are inliers.
+@param method Robust method used to compute transformation. The following methods are possible:
+-   @ref RANSAC - RANSAC-based robust method
+-   @ref LMEDS - Least-Median robust method
+RANSAC is the default method.
+@param ransacReprojThreshold Maximum reprojection error in the RANSAC algorithm to consider
+a point as an inlier. Applies only to RANSAC.
+@param maxIters The maximum number of robust method iterations.
+@param confidence Confidence level, between 0 and 1, for the estimated transformation. Anything
+between 0.95 and 0.99 is usually good enough. Values too close to 1 can slow down the estimation
+significantly. Values lower than 0.8-0.9 can result in an incorrectly estimated transformation.
+@param refineIters Maximum number of iterations of refining algorithm (Levenberg-Marquardt).
+Passing 0 will disable refining, so the output matrix will be output of robust method.
+
+@return Output 2D affine transformation (4 degrees of freedom) matrix \f$2 \times 3\f$ or
+empty matrix if transformation could not be estimated.
+
+The function estimates an optimal 2D affine transformation with 4 degrees of freedom limited to
+combinations of translation, rotation, and uniform scaling. Uses the selected algorithm for robust
+estimation.
+
+The computed transformation is then refined further (using only inliers) with the
+Levenberg-Marquardt method to reduce the re-projection error even more.
+
+Estimated transformation matrix is:
+\f[ \begin{bmatrix} \cos(\theta) \cdot s & -\sin(\theta) \cdot s & t_x \\
+                \sin(\theta) \cdot s & \cos(\theta) \cdot s & t_y
+\end{bmatrix} \f]
+Where \f$ \theta \f$ is the rotation angle, \f$ s \f$ the scaling factor and \f$ t_x, t_y \f$ are
+translations in \f$ x, y \f$ axes respectively.
+
+@note
+The RANSAC method can handle practically any ratio of outliers but needs a threshold to
+distinguish inliers from outliers. The method LMeDS does not need any threshold but it works
+correctly only when there are more than 50% of inliers.
+
+@sa estimateAffine2D, getAffineTransform
+*/
+CV_EXPORTS_W cv::Mat estimateAffinePartial2D(InputArray from, InputArray to, OutputArray inliers = noArray(),
+                                  int method = RANSAC, double ransacReprojThreshold = 3,
+                                  size_t maxIters = 2000, double confidence = 0.99,
+                                  size_t refineIters = 10);
+
+/** @example samples/cpp/tutorial_code/features2D/Homography/decompose_homography.cpp
+An example program with homography decomposition.
+
+Check @ref tutorial_homography "the corresponding tutorial" for more details.
+*/
+
+/** @brief Decompose a homography matrix to rotation(s), translation(s) and plane normal(s).
+
+@param H The input homography matrix between two images.
+@param K The input camera intrinsic matrix.
+@param rotations Array of rotation matrices.
+@param translations Array of translation matrices.
+@param normals Array of plane normal matrices.
+
+This function extracts relative camera motion between two views of a planar object and returns up to
+four mathematical solution tuples of rotation, translation, and plane normal. The decomposition of
+the homography matrix H is described in detail in @cite Malis2007.
+
+If the homography H, induced by the plane, gives the constraint
+\f[s_i \vecthree{x'_i}{y'_i}{1} \sim H \vecthree{x_i}{y_i}{1}\f] on the source image points
+\f$p_i\f$ and the destination image points \f$p'_i\f$, then the tuple of rotations[k] and
+translations[k] is a change of basis from the source camera's coordinate system to the destination
+camera's coordinate system. However, by decomposing H, one can only get the translation normalized
+by the (typically unknown) depth of the scene, i.e. its direction but with normalized length.
+
+If point correspondences are available, at least two solutions may further be invalidated, by
+applying positive depth constraint, i.e. all points must be in front of the camera.
+ */
+CV_EXPORTS_W int decomposeHomographyMat(InputArray H,
+                                        InputArray K,
+                                        OutputArrayOfArrays rotations,
+                                        OutputArrayOfArrays translations,
+                                        OutputArrayOfArrays normals);
+
+/** @brief Filters homography decompositions based on additional information.
+
+@param rotations Vector of rotation matrices.
+@param normals Vector of plane normal matrices.
+@param beforePoints Vector of (rectified) visible reference points before the homography is applied
+@param afterPoints Vector of (rectified) visible reference points after the homography is applied
+@param possibleSolutions Vector of int indices representing the viable solution set after filtering
+@param pointsMask optional Mat/Vector of 8u type representing the mask for the inliers as given by the #findHomography function
+
+This function is intended to filter the output of the #decomposeHomographyMat based on additional
+information as described in @cite Malis2007 . The summary of the method: the #decomposeHomographyMat function
+returns 2 unique solutions and their "opposites" for a total of 4 solutions. If we have access to the
+sets of points visible in the camera frame before and after the homography transformation is applied,
+we can determine which are the true potential solutions and which are the opposites by verifying which
+homographies are consistent with all visible reference points being in front of the camera. The inputs
+are left unchanged; the filtered solution set is returned as indices into the existing one.
+
+*/
+CV_EXPORTS_W void filterHomographyDecompByVisibleRefpoints(InputArrayOfArrays rotations,
+                                                           InputArrayOfArrays normals,
+                                                           InputArray beforePoints,
+                                                           InputArray afterPoints,
+                                                           OutputArray possibleSolutions,
+                                                           InputArray pointsMask = noArray());
+
+/** @brief The base class for stereo correspondence algorithms. It declares compute() and pure
+virtual getter/setter pairs for parameters shared by matchers such as StereoBM and StereoSGBM. */
+class CV_EXPORTS_W StereoMatcher : public Algorithm
+{
+public:
+    enum { DISP_SHIFT = 4,                 // presumably the number of fractional bits of fixed-point disparities (cf. compute()) - TODO confirm
+           DISP_SCALE = (1 << DISP_SHIFT)  // 2^DISP_SHIFT, i.e. 16
+         };
+
+    /** @brief Computes disparity map for the specified stereo pair
+
+    @param left Left 8-bit single-channel image.
+    @param right Right image of the same size and the same type as the left one.
+    @param disparity Output disparity map. It has the same size as the input images. Some algorithms,
+    like StereoBM or StereoSGBM compute 16-bit fixed-point disparity map (where each disparity value
+    has 4 fractional bits), whereas other algorithms output 32-bit floating-point disparity map.
+     */
+    CV_WRAP virtual void compute( InputArray left, InputArray right,
+                                  OutputArray disparity ) = 0;
+
+    CV_WRAP virtual int getMinDisparity() const = 0;
+    CV_WRAP virtual void setMinDisparity(int minDisparity) = 0;
+
+    CV_WRAP virtual int getNumDisparities() const = 0;
+    CV_WRAP virtual void setNumDisparities(int numDisparities) = 0;
+
+    CV_WRAP virtual int getBlockSize() const = 0;
+    CV_WRAP virtual void setBlockSize(int blockSize) = 0;
+
+    CV_WRAP virtual int getSpeckleWindowSize() const = 0;
+    CV_WRAP virtual void setSpeckleWindowSize(int speckleWindowSize) = 0;
+
+    CV_WRAP virtual int getSpeckleRange() const = 0;
+    CV_WRAP virtual void setSpeckleRange(int speckleRange) = 0;
+
+    CV_WRAP virtual int getDisp12MaxDiff() const = 0;
+    CV_WRAP virtual void setDisp12MaxDiff(int disp12MaxDiff) = 0;
+};
+
+
+/** @brief Class for computing stereo correspondence using the block matching algorithm, introduced and
+contributed to OpenCV by K. Konolige.
+ */
+class CV_EXPORTS_W StereoBM : public StereoMatcher
+{
+public:
+    enum { PREFILTER_NORMALIZED_RESPONSE = 0, // pre-filter type constants; presumably the values taken by setPreFilterType() - TODO confirm
+           PREFILTER_XSOBEL              = 1
+         };
+
+    CV_WRAP virtual int getPreFilterType() const = 0;
+    CV_WRAP virtual void setPreFilterType(int preFilterType) = 0;
+
+    CV_WRAP virtual int getPreFilterSize() const = 0;
+    CV_WRAP virtual void setPreFilterSize(int preFilterSize) = 0;
+
+    CV_WRAP virtual int getPreFilterCap() const = 0;
+    CV_WRAP virtual void setPreFilterCap(int preFilterCap) = 0;
+
+    CV_WRAP virtual int getTextureThreshold() const = 0;
+    CV_WRAP virtual void setTextureThreshold(int textureThreshold) = 0;
+
+    CV_WRAP virtual int getUniquenessRatio() const = 0;
+    CV_WRAP virtual void setUniquenessRatio(int uniquenessRatio) = 0;
+
+    CV_WRAP virtual int getSmallerBlockSize() const = 0;
+    CV_WRAP virtual void setSmallerBlockSize(int blockSize) = 0;
+
+    CV_WRAP virtual Rect getROI1() const = 0;
+    CV_WRAP virtual void setROI1(Rect roi1) = 0;
+
+    CV_WRAP virtual Rect getROI2() const = 0;
+    CV_WRAP virtual void setROI2(Rect roi2) = 0;
+
+    /** @brief Creates StereoBM object
+
+    @param numDisparities the disparity search range. For each pixel algorithm will find the best
+    disparity from 0 (default minimum disparity) to numDisparities. The search range can then be
+    shifted by changing the minimum disparity.
+    @param blockSize the linear size of the blocks compared by the algorithm. The size should be odd
+    (as the block is centered at the current pixel). Larger block size implies smoother, though less
+    accurate disparity map. Smaller block size gives more detailed disparity map, but there is higher
+    chance for algorithm to find a wrong correspondence.
+
+    The function creates a StereoBM object. You can then call StereoBM::compute() to compute disparity for
+    a specific stereo pair.
+     */
+    CV_WRAP static Ptr<StereoBM> create(int numDisparities = 0, int blockSize = 21);
+};
+
+/** @brief The class implements the modified H. Hirschmuller algorithm @cite HH08 that differs from the original
+one as follows:
+
+-   By default, the algorithm is single-pass, which means that you consider only 5 directions
+instead of 8. Set mode=StereoSGBM::MODE_HH in StereoSGBM::create to run the full variant of the
+algorithm but beware that it may consume a lot of memory.
+-   The algorithm matches blocks, not individual pixels. Though, setting blockSize=1 reduces the
+blocks to single pixels.
+-   Mutual information cost function is not implemented. Instead, a simpler Birchfield-Tomasi
+sub-pixel metric from @cite BT98 is used. Though, the color images are supported as well.
+-   Some pre- and post- processing steps from K. Konolige algorithm StereoBM are included, for
+example: pre-filtering (StereoBM::PREFILTER_XSOBEL type) and post-filtering (uniqueness
+check, quadratic interpolation and speckle filtering).
+
+@note
+   -   (Python) An example illustrating the use of the StereoSGBM matching algorithm can be found
+        at opencv_source_code/samples/python/stereo_match.py
+ */
+class CV_EXPORTS_W StereoSGBM : public StereoMatcher
+{
+public:
+    enum
+    {
+        MODE_SGBM = 0, // default value of the mode argument of create()
+        MODE_HH   = 1, // full-scale two-pass variant (see the mode parameter of create())
+        MODE_SGBM_3WAY = 2,
+        MODE_HH4  = 3
+    };
+
+    CV_WRAP virtual int getPreFilterCap() const = 0;
+    CV_WRAP virtual void setPreFilterCap(int preFilterCap) = 0;
+
+    CV_WRAP virtual int getUniquenessRatio() const = 0;
+    CV_WRAP virtual void setUniquenessRatio(int uniquenessRatio) = 0;
+
+    CV_WRAP virtual int getP1() const = 0;
+    CV_WRAP virtual void setP1(int P1) = 0;
+
+    CV_WRAP virtual int getP2() const = 0;
+    CV_WRAP virtual void setP2(int P2) = 0;
+
+    CV_WRAP virtual int getMode() const = 0;
+    CV_WRAP virtual void setMode(int mode) = 0;
+
+    /** @brief Creates StereoSGBM object
+
+    @param minDisparity Minimum possible disparity value. Normally, it is zero but sometimes
+    rectification algorithms can shift images, so this parameter needs to be adjusted accordingly.
+    @param numDisparities Maximum disparity minus minimum disparity. The value is always greater than
+    zero. In the current implementation, this parameter must be divisible by 16.
+    @param blockSize Matched block size. It must be an odd number \>=1 . Normally, it should be
+    somewhere in the 3..11 range.
+    @param P1 The first parameter controlling the disparity smoothness. See below.
+    @param P2 The second parameter controlling the disparity smoothness. The larger the values are,
+    the smoother the disparity is. P1 is the penalty on the disparity change by plus or minus 1
+    between neighbor pixels. P2 is the penalty on the disparity change by more than 1 between neighbor
+    pixels. The algorithm requires P2 \> P1 . See stereo_match.cpp sample where some reasonably good
+    P1 and P2 values are shown (like 8\*number_of_image_channels\*blockSize\*blockSize and
+    32\*number_of_image_channels\*blockSize\*blockSize , respectively).
+    @param disp12MaxDiff Maximum allowed difference (in integer pixel units) in the left-right
+    disparity check. Set it to a non-positive value to disable the check.
+    @param preFilterCap Truncation value for the prefiltered image pixels. The algorithm first
+    computes x-derivative at each pixel and clips its value by [-preFilterCap, preFilterCap] interval.
+    The result values are passed to the Birchfield-Tomasi pixel cost function.
+    @param uniquenessRatio Margin in percentage by which the best (minimum) computed cost function
+    value should "win" the second best value to consider the found match correct. Normally, a value
+    within the 5-15 range is good enough.
+    @param speckleWindowSize Maximum size of smooth disparity regions to consider their noise speckles
+    and invalidate. Set it to 0 to disable speckle filtering. Otherwise, set it somewhere in the
+    50-200 range.
+    @param speckleRange Maximum disparity variation within each connected component. If you do speckle
+    filtering, set the parameter to a positive value, it will be implicitly multiplied by 16.
+    Normally, 1 or 2 is good enough.
+    @param mode Set it to StereoSGBM::MODE_HH to run the full-scale two-pass dynamic programming
+    algorithm. It will consume O(W\*H\*numDisparities) bytes, which is large for 640x480 stereo and
+    huge for HD-size pictures. By default, it is set to StereoSGBM::MODE_SGBM .
+
+    All parameters have default values, so the object can be created without arguments and then
+    configured through the setters (e.g. setNumDisparities()). Alternatively, every parameter can
+    be set to a custom value directly in this call.
+     */
+    CV_WRAP static Ptr<StereoSGBM> create(int minDisparity = 0, int numDisparities = 16, int blockSize = 3,
+                                          int P1 = 0, int P2 = 0, int disp12MaxDiff = 0,
+                                          int preFilterCap = 0, int uniquenessRatio = 0,
+                                          int speckleWindowSize = 0, int speckleRange = 0,
+                                          int mode = StereoSGBM::MODE_SGBM);
+};
+
+
+//! projection types for the projType argument of initWideAngleProjMap()
+enum UndistortTypes
+{
+    PROJ_SPHERICAL_ORTHO  = 0,
+    PROJ_SPHERICAL_EQRECT = 1 // default projType of initWideAngleProjMap()
+};
+
+/** @brief Transforms an image to compensate for lens distortion.
+
+The function transforms an image to compensate radial and tangential lens distortion.
+
+The function is simply a combination of #initUndistortRectifyMap (with unity R ) and #remap
+(with bilinear interpolation). See the former function for details of the transformation being
+performed.
+
+Those pixels in the destination image for which there are no corresponding pixels in the source
+image are filled with zeros (black color).
+
+A particular subset of the source image that will be visible in the corrected image can be regulated
+by newCameraMatrix. You can use #getOptimalNewCameraMatrix to compute the appropriate
+newCameraMatrix depending on your requirements.
+
+The camera matrix and the distortion parameters can be determined using #calibrateCamera. If
+the resolution of images is different from the resolution used at the calibration stage, \f$f_x,
+f_y, c_x\f$ and \f$c_y\f$ need to be scaled accordingly, while the distortion coefficients remain
+the same.
+
+@param src Input (distorted) image.
+@param dst Output (corrected) image that has the same size and type as src .
+@param cameraMatrix Input camera matrix \f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
+@param distCoeffs Input vector of distortion coefficients
+\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6[, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$
+of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are assumed.
+@param newCameraMatrix Camera matrix of the distorted image. By default, it is the same as
+cameraMatrix but you may additionally scale and shift the result by using a different matrix.
+ */
+CV_EXPORTS_W void undistort( InputArray src, OutputArray dst,
+                             InputArray cameraMatrix,
+                             InputArray distCoeffs,
+                             InputArray newCameraMatrix = noArray() );
+
+/** @brief Computes the undistortion and rectification transformation map.
+
+The function computes the joint undistortion and rectification transformation and represents the
+result in the form of maps for #remap. The undistorted image looks like original, as if it is
+captured with a camera using the camera matrix =newCameraMatrix and zero distortion. In case of a
+monocular camera, newCameraMatrix is usually equal to cameraMatrix, or it can be computed by
+#getOptimalNewCameraMatrix for a better control over scaling. In case of a stereo camera,
+newCameraMatrix is normally set to P1 or P2 computed by #stereoRectify .
+
+Also, this new camera is oriented differently in the coordinate space, according to R. That, for
+example, helps to align two heads of a stereo camera so that the epipolar lines on both images
+become horizontal and have the same y- coordinate (in case of a horizontally aligned stereo camera).
+
+The function actually builds the maps for the inverse mapping algorithm that is used by #remap. That
+is, for each pixel \f$(u, v)\f$ in the destination (corrected and rectified) image, the function
+computes the corresponding coordinates in the source image (that is, in the original image from
+camera). The following process is applied:
+\f[
+\begin{array}{l}
+x  \leftarrow (u - {c'}_x)/{f'}_x  \\
+y  \leftarrow (v - {c'}_y)/{f'}_y  \\
+{[X\,Y\,W]} ^T  \leftarrow R^{-1}*[x \, y \, 1]^T  \\
+x'  \leftarrow X/W  \\
+y'  \leftarrow Y/W  \\
+r^2  \leftarrow x'^2 + y'^2 \\
+x''  \leftarrow x' \frac{1 + k_1 r^2 + k_2 r^4 + k_3 r^6}{1 + k_4 r^2 + k_5 r^4 + k_6 r^6}
++ 2p_1 x' y' + p_2(r^2 + 2 x'^2)  + s_1 r^2 + s_2 r^4\\
+y''  \leftarrow y' \frac{1 + k_1 r^2 + k_2 r^4 + k_3 r^6}{1 + k_4 r^2 + k_5 r^4 + k_6 r^6}
++ p_1 (r^2 + 2 y'^2) + 2 p_2 x' y' + s_3 r^2 + s_4 r^4 \\
+s\vecthree{x'''}{y'''}{1} =
+\vecthreethree{R_{33}(\tau_x, \tau_y)}{0}{-R_{13}(\tau_x, \tau_y)}
+{0}{R_{33}(\tau_x, \tau_y)}{-R_{23}(\tau_x, \tau_y)}
+{0}{0}{1} R(\tau_x, \tau_y) \vecthree{x''}{y''}{1}\\
+map_x(u,v)  \leftarrow x''' f_x + c_x  \\
+map_y(u,v)  \leftarrow y''' f_y + c_y
+\end{array}
+\f]
+where \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6[, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$
+are the distortion coefficients.
+
+In case of a stereo camera, this function is called twice: once for each camera head, after
+#stereoRectify, which in its turn is called after #stereoCalibrate. But if the stereo camera
+was not calibrated, it is still possible to compute the rectification transformations directly from
+the fundamental matrix using #stereoRectifyUncalibrated. For each camera, the function computes
+homography H as the rectification transformation in a pixel domain, not a rotation matrix R in 3D
+space. R can be computed from H as
+\f[\texttt{R} = \texttt{cameraMatrix} ^{-1} \cdot \texttt{H} \cdot \texttt{cameraMatrix}\f]
+where cameraMatrix can be chosen arbitrarily.
+
+@param cameraMatrix Input camera matrix \f$A=\vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
+@param distCoeffs Input vector of distortion coefficients
+\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6[, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$
+of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are assumed.
+@param R Optional rectification transformation in the object space (3x3 matrix). R1 or R2 ,
+computed by #stereoRectify can be passed here. If the matrix is empty, the identity transformation
+is assumed. In #initUndistortRectifyMap R assumed to be an identity matrix.
+@param newCameraMatrix New camera matrix \f$A'=\vecthreethree{f_x'}{0}{c_x'}{0}{f_y'}{c_y'}{0}{0}{1}\f$.
+@param size Undistorted image size.
+@param m1type Type of the first output map that can be CV_32FC1, CV_32FC2 or CV_16SC2, see #convertMaps
+@param map1 The first output map.
+@param map2 The second output map.
+ */
+CV_EXPORTS_W
+void initUndistortRectifyMap(InputArray cameraMatrix, InputArray distCoeffs,
+                             InputArray R, InputArray newCameraMatrix,
+                             Size size, int m1type, OutputArray map1, OutputArray map2);
+
+/** @brief Computes the projection and inverse-rectification transformation map. In essence, this is the inverse of
+#initUndistortRectifyMap to accommodate stereo-rectification of projectors ('inverse-cameras') in projector-camera pairs.
+
+The function computes the joint projection and inverse rectification transformation and represents the
+result in the form of maps for #remap. The projected image looks like a distorted version of the original which,
+once projected by a projector, should visually match the original. In case of a monocular camera, newCameraMatrix
+is usually equal to cameraMatrix, or it can be computed by
+#getOptimalNewCameraMatrix for a better control over scaling. In case of a projector-camera pair,
+newCameraMatrix is normally set to P1 or P2 computed by #stereoRectify .
+
+The projector is oriented differently in the coordinate space, according to R. In case of projector-camera pairs,
+this helps align the projector (in the same manner as #initUndistortRectifyMap for the camera) to create a stereo-rectified pair. This
+allows epipolar lines on both images to become horizontal and have the same y-coordinate (in case of a horizontally aligned projector-camera pair).
+
+The function builds the maps for the inverse mapping algorithm that is used by #remap. That
+is, for each pixel \f$(u, v)\f$ in the destination (projected and inverse-rectified) image, the function
+computes the corresponding coordinates in the source image (that is, in the original digital image). The following process is applied:
+
+\f[
+\begin{array}{l}
+\text{newCameraMatrix}\\
+x  \leftarrow (u - {c'}_x)/{f'}_x  \\
+y  \leftarrow (v - {c'}_y)/{f'}_y  \\
+
+\\\text{Undistortion}
+\\\scriptsize{\textit{though equation shown is for radial undistortion, function implements cv::undistortPoints()}}\\
+r^2  \leftarrow x^2 + y^2 \\
+\theta \leftarrow \frac{1 + k_1 r^2 + k_2 r^4 + k_3 r^6}{1 + k_4 r^2 + k_5 r^4 + k_6 r^6}\\
+x' \leftarrow \frac{x}{\theta} \\
+y'  \leftarrow \frac{y}{\theta} \\
+
+\\\text{Rectification}\\
+{[X\,Y\,W]} ^T  \leftarrow R*[x' \, y' \, 1]^T  \\
+x''  \leftarrow X/W  \\
+y''  \leftarrow Y/W  \\
+
+\\\text{cameraMatrix}\\
+map_x(u,v)  \leftarrow x'' f_x + c_x  \\
+map_y(u,v)  \leftarrow y'' f_y + c_y
+\end{array}
+\f]
+where \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6[, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$
+are the distortion coefficients vector distCoeffs.
+
+In case of a stereo-rectified projector-camera pair, this function is called for the projector while #initUndistortRectifyMap is called for the camera head.
+This is done after #stereoRectify, which in turn is called after #stereoCalibrate. If the projector-camera pair
+is not calibrated, it is still possible to compute the rectification transformations directly from
+the fundamental matrix using #stereoRectifyUncalibrated. For the projector and camera, the function computes
+homography H as the rectification transformation in a pixel domain, not a rotation matrix R in 3D
+space. R can be computed from H as
+\f[\texttt{R} = \texttt{cameraMatrix} ^{-1} \cdot \texttt{H} \cdot \texttt{cameraMatrix}\f]
+where cameraMatrix can be chosen arbitrarily.
+
+@param cameraMatrix Input camera matrix \f$A=\vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
+@param distCoeffs Input vector of distortion coefficients
+\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6[, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$
+of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are assumed.
+@param R Optional rectification transformation in the object space (3x3 matrix). R1 or R2,
+computed by #stereoRectify can be passed here. If the matrix is empty, the identity transformation
+is assumed.
+@param newCameraMatrix New camera matrix \f$A'=\vecthreethree{f_x'}{0}{c_x'}{0}{f_y'}{c_y'}{0}{0}{1}\f$.
+@param size Distorted image size.
+@param m1type Type of the first output map. Can be CV_32FC1, CV_32FC2 or CV_16SC2, see #convertMaps
+@param map1 The first output map for #remap.
+@param map2 The second output map for #remap.
+ */
+CV_EXPORTS_W
+void initInverseRectificationMap( InputArray cameraMatrix, InputArray distCoeffs,
+                           InputArray R, InputArray newCameraMatrix,
+                           const Size& size, int m1type, OutputArray map1, OutputArray map2 );
+
+//! initializes maps for #remap for wide-angle; projType is one of #UndistortTypes (PROJ_SPHERICAL_EQRECT by default)
+CV_EXPORTS
+float initWideAngleProjMap(InputArray cameraMatrix, InputArray distCoeffs,
+                           Size imageSize, int destImageWidth,
+                           int m1type, OutputArray map1, OutputArray map2,
+                           enum UndistortTypes projType = PROJ_SPHERICAL_EQRECT, double alpha = 0);
+static inline // convenience overload: takes projType as a plain int and forwards to the enum overload declared above
+float initWideAngleProjMap(InputArray cameraMatrix, InputArray distCoeffs,
+                           Size imageSize, int destImageWidth,
+                           int m1type, OutputArray map1, OutputArray map2,
+                           int projType, double alpha = 0)
+{
+    return initWideAngleProjMap(cameraMatrix, distCoeffs, imageSize, destImageWidth,
+                                m1type, map1, map2, (UndistortTypes)projType, alpha); // the cast selects the enum overload rather than this one
+}
+
+/** @brief Returns the default new camera matrix.
+
+The function returns the camera matrix that is either an exact copy of the input cameraMatrix (when
+centerPrincipalPoint=false ), or the modified one (when centerPrincipalPoint=true).
+
+In the latter case, the new camera matrix will be:
+
+\f[\begin{bmatrix} f_x && 0 && ( \texttt{imgSize.width} -1)*0.5  \\ 0 && f_y && ( \texttt{imgSize.height} -1)*0.5  \\ 0 && 0 && 1 \end{bmatrix} ,\f]
+
+where \f$f_x\f$ and \f$f_y\f$ are \f$(0,0)\f$ and \f$(1,1)\f$ elements of cameraMatrix, respectively.
+
+By default, the undistortion functions in OpenCV (see #initUndistortRectifyMap, #undistort) do not
+move the principal point. However, when you work with stereo, it is important to move the principal
+points in both views to the same y-coordinate (which is required by most of stereo correspondence
+algorithms), and maybe to the same x-coordinate too. So, you can form the new camera matrix for
+each view where the principal points are located at the center.
+
+@param cameraMatrix Input camera matrix.
+@param imgsize Camera view image size in pixels.
+@param centerPrincipalPoint Location of the principal point in the new camera matrix. The
+parameter indicates whether this location should be at the image center or not.
+ */
+CV_EXPORTS_W
+Mat getDefaultNewCameraMatrix(InputArray cameraMatrix, Size imgsize = Size(),
+                              bool centerPrincipalPoint = false);
+
+/** @brief Computes the ideal point coordinates from the observed point coordinates.
+
+The function is similar to #undistort and #initUndistortRectifyMap but it operates on a
+sparse set of points instead of a raster image. Also the function performs a reverse transformation
+to  #projectPoints. In case of a 3D object, it does not reconstruct its 3D coordinates, but for a
+planar object, it does, up to a translation vector, if the proper R is specified.
+
+For each observed point coordinate \f$(u, v)\f$ the function computes:
+\f[
+\begin{array}{l}
+x^{"}  \leftarrow (u - c_x)/f_x  \\
+y^{"}  \leftarrow (v - c_y)/f_y  \\
+(x',y') = undistort(x^{"},y^{"}, \texttt{distCoeffs}) \\
+{[X\,Y\,W]} ^T  \leftarrow R*[x' \, y' \, 1]^T  \\
+x  \leftarrow X/W  \\
+y  \leftarrow Y/W  \\
+\text{only performed if P is specified:} \\
+u'  \leftarrow x {f'}_x + {c'}_x  \\
+v'  \leftarrow y {f'}_y + {c'}_y
+\end{array}
+\f]
+
+where *undistort* is an approximate iterative algorithm that estimates the normalized original
+point coordinates out of the normalized distorted point coordinates ("normalized" means that the
+coordinates do not depend on the camera matrix).
+
+The function can be used for either a stereo camera head or a monocular camera (when R is empty).
+@param src Observed point coordinates, 2xN/Nx2 1-channel or 1xN/Nx1 2-channel (CV_32FC2 or CV_64FC2) (or
+vector\<Point2f\> ).
+@param dst Output ideal point coordinates (1xN/Nx1 2-channel or vector\<Point2f\> ) after undistortion and reverse perspective
+transformation. If matrix P is identity or omitted, dst will contain normalized point coordinates.
+@param cameraMatrix Camera matrix \f$\vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
+@param distCoeffs Input vector of distortion coefficients
+\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6[, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$
+of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are assumed.
+@param R Rectification transformation in the object space (3x3 matrix). R1 or R2 computed by
+#stereoRectify can be passed here. If the matrix is empty, the identity transformation is used.
+@param P New camera matrix (3x3) or new projection matrix (3x4) \f$\begin{bmatrix} {f'}_x & 0 & {c'}_x & t_x \\ 0 & {f'}_y & {c'}_y & t_y \\ 0 & 0 & 1 & t_z \end{bmatrix}\f$. P1 or P2 computed by
+#stereoRectify can be passed here. If the matrix is empty, the identity new camera matrix is used.
+ */
+CV_EXPORTS_W
+void undistortPoints(InputArray src, OutputArray dst,
+                     InputArray cameraMatrix, InputArray distCoeffs,
+                     InputArray R = noArray(), InputArray P = noArray());
+/** @overload
+    @note Default version of #undistortPoints does 5 iterations to compute undistorted points.
+ */
+CV_EXPORTS_AS(undistortPointsIter)
+void undistortPoints(InputArray src, OutputArray dst,
+                     InputArray cameraMatrix, InputArray distCoeffs,
+                     InputArray R, InputArray P, TermCriteria criteria);
+
+/**
+ * @brief Compute undistorted image points position
+ *
+ * @param src Observed points position, 2xN/Nx2 1-channel or 1xN/Nx1 2-channel (CV_32FC2 or
+CV_64FC2) (or vector\<Point2f\> ).
+ * @param dst Output undistorted points position (1xN/Nx1 2-channel or vector\<Point2f\> ).
+ * @param cameraMatrix Camera matrix \f$\vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
+ * @param distCoeffs Distortion coefficients
+ */
+CV_EXPORTS_W
+void undistortImagePoints(InputArray src, OutputArray dst, InputArray cameraMatrix,
+                          InputArray distCoeffs,
+                          TermCriteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5,
+                                                      0.01));
+
+//! @} calib3d
+
+/** @brief The methods in this namespace use a so-called fisheye camera model.
+  @ingroup calib3d_fisheye
+*/
+namespace fisheye
+{
+//! @addtogroup calib3d_fisheye
+//! @{
+
+    enum{
+        CALIB_USE_INTRINSIC_GUESS   = 1 << 0,  //!< K holds valid initial fx, fy, cx, cy that are optimized further
+        CALIB_RECOMPUTE_EXTRINSIC   = 1 << 1,  //!< extrinsic is recomputed after each iteration of intrinsic optimization
+        CALIB_CHECK_COND            = 1 << 2,  //!< check validity of the condition number
+        CALIB_FIX_SKEW              = 1 << 3,  //!< skew coefficient (alpha) is set to zero and stays zero
+        CALIB_FIX_K1                = 1 << 4,  //!< distortion coefficient k1 is set to zero and stays zero
+        CALIB_FIX_K2                = 1 << 5,  //!< distortion coefficient k2 is set to zero and stays zero
+        CALIB_FIX_K3                = 1 << 6,  //!< distortion coefficient k3 is set to zero and stays zero
+        CALIB_FIX_K4                = 1 << 7,  //!< distortion coefficient k4 is set to zero and stays zero
+        CALIB_FIX_INTRINSIC         = 1 << 8,  //!< stereoCalibrate: fix K1, K2 and D1, D2 so that only R, T are estimated
+        CALIB_FIX_PRINCIPAL_POINT   = 1 << 9,  //!< principal point is not changed during the global optimization
+        CALIB_ZERO_DISPARITY        = 1 << 10, //!< stereoRectify: principal points of both cameras get the same pixel coordinates
+        CALIB_FIX_FOCAL_LENGTH      = 1 << 11  //!< focal length is not changed during the global optimization
+    };
+
+    /** @brief Projects points using fisheye model
+
+    @param objectPoints Array of object points, 1xN/Nx1 3-channel (or vector\<Point3f\> ), where N is
+    the number of points in the view.
+    @param imagePoints Output array of image points, 2xN/Nx2 1-channel or 1xN/Nx1 2-channel, or
+    vector\<Point2f\>.
+    @param affine
+    @param K Camera intrinsic matrix \f$\cameramatrix{K}\f$.
+    @param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$.
+    @param alpha The skew coefficient.
+    @param jacobian Optional output 2Nx15 jacobian matrix of derivatives of image points with respect
+    to components of the focal lengths, coordinates of the principal point, distortion coefficients,
+    rotation vector, translation vector, and the skew. In the old interface different components of
+    the jacobian are returned via different output parameters.
+
+    The function computes projections of 3D points to the image plane given intrinsic and extrinsic
+    camera parameters. Optionally, the function computes Jacobians - matrices of partial derivatives of
+    image points coordinates (as functions of all the input parameters) with respect to the particular
+    parameters, intrinsic and/or extrinsic.
+     */
+    CV_EXPORTS void projectPoints(InputArray objectPoints, OutputArray imagePoints, const Affine3d& affine,
+        InputArray K, InputArray D, double alpha = 0, OutputArray jacobian = noArray());
+
+    /** @overload */
+    CV_EXPORTS_W void projectPoints(InputArray objectPoints, OutputArray imagePoints, InputArray rvec, InputArray tvec,
+        InputArray K, InputArray D, double alpha = 0, OutputArray jacobian = noArray());
+
+    /** @brief Distorts 2D points using fisheye model.
+
+    @param undistorted Array of object points, 1xN/Nx1 2-channel (or vector\<Point2f\> ), where N is
+    the number of points in the view.
+    @param K Camera intrinsic matrix \f$\cameramatrix{K}\f$.
+    @param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$.
+    @param alpha The skew coefficient.
+    @param distorted Output array of image points, 1xN/Nx1 2-channel, or vector\<Point2f\> .
+
+    Note that the function assumes the camera intrinsic matrix of the undistorted points to be identity.
+    This means if you want to distort image points you have to multiply them with \f$K^{-1}\f$.
+     */
+    CV_EXPORTS_W void distortPoints(InputArray undistorted, OutputArray distorted, InputArray K, InputArray D, double alpha = 0);
+
+    /** @brief Undistorts 2D points using fisheye model
+
+    @param distorted Array of object points, 1xN/Nx1 2-channel (or vector\<Point2f\> ), where N is the
+    number of points in the view.
+    @param K Camera intrinsic matrix \f$\cameramatrix{K}\f$.
+    @param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$.
+    @param R Rectification transformation in the object space: 3x3 1-channel, or vector: 3x1/1x3
+    1-channel or 1x1 3-channel
+    @param P New camera intrinsic matrix (3x3) or new projection matrix (3x4)
+    @param criteria Termination criteria
+    @param undistorted Output array of image points, 1xN/Nx1 2-channel, or vector\<Point2f\> .
+     */
+    CV_EXPORTS_W void undistortPoints(InputArray distorted, OutputArray undistorted,
+        InputArray K, InputArray D, InputArray R = noArray(), InputArray P  = noArray(),
+                TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 10, 1e-8));
+
+    /** @brief Computes undistortion and rectification maps for image transform by #remap. If D is empty zero
+    distortion is used, if R or P is empty identity matrices are used.
+
+    @param K Camera intrinsic matrix \f$\cameramatrix{K}\f$.
+    @param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$.
+    @param R Rectification transformation in the object space: 3x3 1-channel, or vector: 3x1/1x3
+    1-channel or 1x1 3-channel
+    @param P New camera intrinsic matrix (3x3) or new projection matrix (3x4)
+    @param size Undistorted image size.
+    @param m1type Type of the first output map that can be CV_32FC1 or CV_16SC2 . See #convertMaps
+    for details.
+    @param map1 The first output map.
+    @param map2 The second output map.
+     */
+    CV_EXPORTS_W void initUndistortRectifyMap(InputArray K, InputArray D, InputArray R, InputArray P,
+        const cv::Size& size, int m1type, OutputArray map1, OutputArray map2);
+
+    /** @brief Transforms an image to compensate for fisheye lens distortion.
+
+    @param distorted image with fisheye lens distortion.
+    @param undistorted Output image with compensated fisheye lens distortion.
+    @param K Camera intrinsic matrix \f$\cameramatrix{K}\f$.
+    @param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$.
+    @param Knew Camera intrinsic matrix of the distorted image. By default, it is the identity matrix but you
+    may additionally scale and shift the result by using a different matrix.
+    @param new_size the new size
+
+    The function transforms an image to compensate radial and tangential lens distortion.
+
+    The function is simply a combination of #fisheye::initUndistortRectifyMap (with unity R ) and #remap
+    (with bilinear interpolation). See the former function for details of the transformation being
+    performed.
+
+    See below the results of undistortImage.
+        -   a\) result of undistort of perspective camera model (all possible coefficients (k_1, k_2, k_3,
+            k_4, k_5, k_6) of distortion were optimized under calibration)
+        -   b\) result of #fisheye::undistortImage of fisheye camera model (all possible coefficients (k_1, k_2,
+            k_3, k_4) of fisheye distortion were optimized under calibration)
+        -   c\) original image was captured with fisheye lens
+
+    Pictures a) and b) are almost the same. But if we consider points of image located far from the center
+    of image, we can notice that on image a) these points are distorted.
+
+    ![image](pics/fisheye_undistorted.jpg)
+     */
+    CV_EXPORTS_W void undistortImage(InputArray distorted, OutputArray undistorted,
+        InputArray K, InputArray D, InputArray Knew = cv::noArray(), const Size& new_size = Size());
+
+    /** @brief Estimates new camera intrinsic matrix for undistortion or rectification.
+
+    @param K Camera intrinsic matrix \f$\cameramatrix{K}\f$.
+    @param image_size Size of the image
+    @param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$.
+    @param R Rectification transformation in the object space: 3x3 1-channel, or vector: 3x1/1x3
+    1-channel or 1x1 3-channel
+    @param P New camera intrinsic matrix (3x3) or new projection matrix (3x4)
+    @param balance Sets the new focal length in range between the min focal length and the max focal
+    length. Balance is in range of [0, 1].
+    @param new_size the new size
+    @param fov_scale Divisor for new focal length.
+     */
+    CV_EXPORTS_W void estimateNewCameraMatrixForUndistortRectify(InputArray K, InputArray D, const Size &image_size, InputArray R,
+        OutputArray P, double balance = 0.0, const Size& new_size = Size(), double fov_scale = 1.0);
+
+    /** @brief Performs camera calibration
+
+    @param objectPoints vector of vectors of calibration pattern points in the calibration pattern
+    coordinate space.
+    @param imagePoints vector of vectors of the projections of calibration pattern points.
+    imagePoints.size() must be equal to objectPoints.size(), and imagePoints[i].size() must be equal to
+    objectPoints[i].size() for each i.
+    @param image_size Size of the image used only to initialize the camera intrinsic matrix.
+    @param K Output 3x3 floating-point camera intrinsic matrix
+    \f$\cameramatrix{A}\f$ . If
+    @ref fisheye::CALIB_USE_INTRINSIC_GUESS is specified, some or all of fx, fy, cx, cy must be
+    initialized before calling the function.
+    @param D Output vector of distortion coefficients \f$\distcoeffsfisheye\f$.
+    @param rvecs Output vector of rotation vectors (see Rodrigues ) estimated for each pattern view.
+    That is, each k-th rotation vector together with the corresponding k-th translation vector (see
+    the next output parameter description) brings the calibration pattern from the model coordinate
+    space (in which object points are specified) to the world coordinate space, that is, a real
+    position of the calibration pattern in the k-th pattern view (k=0.. *M* -1).
+    @param tvecs Output vector of translation vectors estimated for each pattern view.
+    @param flags Different flags that may be zero or a combination of the following values:
+    -    @ref fisheye::CALIB_USE_INTRINSIC_GUESS  cameraMatrix contains valid initial values of
+    fx, fy, cx, cy that are optimized further. Otherwise, (cx, cy) is initially set to the image
+    center ( imageSize is used), and focal distances are computed in a least-squares fashion.
+    -    @ref fisheye::CALIB_RECOMPUTE_EXTRINSIC  Extrinsic will be recomputed after each iteration
+    of intrinsic optimization.
+    -    @ref fisheye::CALIB_CHECK_COND  The functions will check validity of condition number.
+    -    @ref fisheye::CALIB_FIX_SKEW  Skew coefficient (alpha) is set to zero and stays zero.
+    -    @ref fisheye::CALIB_FIX_K1,..., @ref fisheye::CALIB_FIX_K4 Selected distortion coefficients
+    are set to zeros and stay zero.
+    -    @ref fisheye::CALIB_FIX_PRINCIPAL_POINT  The principal point is not changed during the global
+optimization. It stays at the center or at a different location specified when @ref fisheye::CALIB_USE_INTRINSIC_GUESS is set too.
+    -    @ref fisheye::CALIB_FIX_FOCAL_LENGTH The focal length is not changed during the global
+optimization. It is the \f$max(width,height)/\pi\f$ or the provided \f$f_x\f$, \f$f_y\f$ when @ref fisheye::CALIB_USE_INTRINSIC_GUESS is set too.
+    @param criteria Termination criteria for the iterative optimization algorithm.
+     */
+    CV_EXPORTS_W double calibrate(InputArrayOfArrays objectPoints, InputArrayOfArrays imagePoints, const Size& image_size,
+        InputOutputArray K, InputOutputArray D, OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs, int flags = 0,
+            TermCriteria criteria = TermCriteria(TermCriteria::COUNT + TermCriteria::EPS, 100, DBL_EPSILON));
+
+    /** @brief Stereo rectification for fisheye camera model
+
+    @param K1 First camera intrinsic matrix.
+    @param D1 First camera distortion parameters.
+    @param K2 Second camera intrinsic matrix.
+    @param D2 Second camera distortion parameters.
+    @param imageSize Size of the image used for stereo calibration.
+    @param R Rotation matrix between the coordinate systems of the first and the second
+    cameras.
+    @param tvec Translation vector between coordinate systems of the cameras.
+    @param R1 Output 3x3 rectification transform (rotation matrix) for the first camera.
+    @param R2 Output 3x3 rectification transform (rotation matrix) for the second camera.
+    @param P1 Output 3x4 projection matrix in the new (rectified) coordinate systems for the first
+    camera.
+    @param P2 Output 3x4 projection matrix in the new (rectified) coordinate systems for the second
+    camera.
+    @param Q Output \f$4 \times 4\f$ disparity-to-depth mapping matrix (see #reprojectImageTo3D ).
+    @param flags Operation flags that may be zero or @ref fisheye::CALIB_ZERO_DISPARITY . If the flag is set,
+    the function makes the principal points of each camera have the same pixel coordinates in the
+    rectified views. And if the flag is not set, the function may still shift the images in the
+    horizontal or vertical direction (depending on the orientation of epipolar lines) to maximize the
+    useful image area.
+    @param newImageSize New image resolution after rectification. The same size should be passed to
+    #initUndistortRectifyMap (see the stereo_calib.cpp sample in OpenCV samples directory). When (0,0)
+    is passed (default), it is set to the original imageSize . Setting it to larger value can help you
+    preserve details in the original image, especially when there is a big radial distortion.
+    @param balance Sets the new focal length in range between the min focal length and the max focal
+    length. Balance is in range of [0, 1].
+    @param fov_scale Divisor for new focal length.
+     */
+    CV_EXPORTS_W void stereoRectify(InputArray K1, InputArray D1, InputArray K2, InputArray D2, const Size &imageSize, InputArray R, InputArray tvec,
+        OutputArray R1, OutputArray R2, OutputArray P1, OutputArray P2, OutputArray Q, int flags, const Size &newImageSize = Size(),
+        double balance = 0.0, double fov_scale = 1.0);
+
+    /** @brief Performs stereo calibration
+
+    @param objectPoints Vector of vectors of the calibration pattern points.
+    @param imagePoints1 Vector of vectors of the projections of the calibration pattern points,
+    observed by the first camera.
+    @param imagePoints2 Vector of vectors of the projections of the calibration pattern points,
+    observed by the second camera.
+    @param K1 Input/output first camera intrinsic matrix:
+    \f$\vecthreethree{f_x^{(j)}}{0}{c_x^{(j)}}{0}{f_y^{(j)}}{c_y^{(j)}}{0}{0}{1}\f$ , \f$j = 0,\, 1\f$ . If
+    any of @ref fisheye::CALIB_USE_INTRINSIC_GUESS , @ref fisheye::CALIB_FIX_INTRINSIC are specified,
+    some or all of the matrix components must be initialized.
+    @param D1 Input/output vector of distortion coefficients \f$\distcoeffsfisheye\f$ of 4 elements.
+    @param K2 Input/output second camera intrinsic matrix. The parameter is similar to K1 .
+    @param D2 Input/output lens distortion coefficients for the second camera. The parameter is
+    similar to D1 .
+    @param imageSize Size of the image used only to initialize camera intrinsic matrix.
+    @param R Output rotation matrix between the 1st and the 2nd camera coordinate systems.
+    @param T Output translation vector between the coordinate systems of the cameras.
+    @param rvecs Output vector of rotation vectors ( @ref Rodrigues ) estimated for each pattern view in the
+    coordinate system of the first camera of the stereo pair (e.g. std::vector<cv::Mat>). More in detail, each
+    i-th rotation vector together with the corresponding i-th translation vector (see the next output parameter
+    description) brings the calibration pattern from the object coordinate space (in which object points are
+    specified) to the camera coordinate space of the first camera of the stereo pair. In more technical terms,
+    the tuple of the i-th rotation and translation vector performs a change of basis from object coordinate space
+    to camera coordinate space of the first camera of the stereo pair.
+    @param tvecs Output vector of translation vectors estimated for each pattern view, see parameter description
+    of previous output parameter ( rvecs ).
+    @param flags Different flags that may be zero or a combination of the following values:
+    -    @ref fisheye::CALIB_FIX_INTRINSIC  Fix K1, K2 and D1, D2 so that only R, T matrices
+    are estimated.
+    -    @ref fisheye::CALIB_USE_INTRINSIC_GUESS  K1, K2 contain valid initial values of
+    fx, fy, cx, cy that are optimized further. Otherwise, (cx, cy) is initially set to the image
+    center (imageSize is used), and focal distances are computed in a least-squares fashion.
+    -    @ref fisheye::CALIB_RECOMPUTE_EXTRINSIC  Extrinsic will be recomputed after each iteration
+    of intrinsic optimization.
+    -    @ref fisheye::CALIB_CHECK_COND  The functions will check validity of condition number.
+    -    @ref fisheye::CALIB_FIX_SKEW  Skew coefficient (alpha) is set to zero and stays zero.
+    -   @ref fisheye::CALIB_FIX_K1,..., @ref fisheye::CALIB_FIX_K4 Selected distortion coefficients are set to zeros and stay
+    zero.
+    @param criteria Termination criteria for the iterative optimization algorithm.
+     */
+    CV_EXPORTS_W double stereoCalibrate(InputArrayOfArrays objectPoints, InputArrayOfArrays imagePoints1, InputArrayOfArrays imagePoints2,
+                                  InputOutputArray K1, InputOutputArray D1, InputOutputArray K2, InputOutputArray D2, Size imageSize,
+                                  OutputArray R, OutputArray T, OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs, int flags = fisheye::CALIB_FIX_INTRINSIC,
+                                  TermCriteria criteria = TermCriteria(TermCriteria::COUNT + TermCriteria::EPS, 100, DBL_EPSILON));
+
+    /// @overload
+    CV_EXPORTS_W double stereoCalibrate(InputArrayOfArrays objectPoints, InputArrayOfArrays imagePoints1, InputArrayOfArrays imagePoints2,
+                                  InputOutputArray K1, InputOutputArray D1, InputOutputArray K2, InputOutputArray D2, Size imageSize,
+                                  OutputArray R, OutputArray T, int flags = fisheye::CALIB_FIX_INTRINSIC,
+                                  TermCriteria criteria = TermCriteria(TermCriteria::COUNT + TermCriteria::EPS, 100, DBL_EPSILON));
+
+    /**
+    @brief Finds an object pose from 3D-2D point correspondences for fisheye camera model.
+
+    @param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or
+    1xN/Nx1 3-channel, where N is the number of points. vector\<Point3d\> can be also passed here.
+    @param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel,
+    where N is the number of points. vector\<Point2d\> can be also passed here.
+    @param cameraMatrix Input camera intrinsic matrix \f$\cameramatrix{A}\f$ .
+    @param distCoeffs Input vector of distortion coefficients (4x1/1x4).
+    @param rvec Output rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
+    the model coordinate system to the camera coordinate system.
+    @param tvec Output translation vector.
+    @param useExtrinsicGuess Parameter used for #SOLVEPNP_ITERATIVE. If true (1), the function uses
+    the provided rvec and tvec values as initial approximations of the rotation and translation
+    vectors, respectively, and further optimizes them.
+    @param flags Method for solving a PnP problem: see @ref calib3d_solvePnP_flags
+    This function returns the rotation and the translation vectors that transform a 3D point expressed in the object
+    coordinate frame to the camera coordinate frame, using different methods:
+    - P3P methods (@ref SOLVEPNP_P3P, @ref SOLVEPNP_AP3P): need 4 input points to return a unique solution.
+    - @ref SOLVEPNP_IPPE Input points must be >= 4 and object points must be coplanar.
+    - @ref SOLVEPNP_IPPE_SQUARE Special case suitable for marker pose estimation.
+    Number of input points must be 4. Object points must be defined in the following order:
+    - point 0: [-squareLength / 2,  squareLength / 2, 0]
+    - point 1: [ squareLength / 2,  squareLength / 2, 0]
+    - point 2: [ squareLength / 2, -squareLength / 2, 0]
+    - point 3: [-squareLength / 2, -squareLength / 2, 0]
+    - for all the other flags, number of input points must be >= 4 and object points can be in any configuration.
+    @param criteria Termination criteria for internal undistortPoints call.
+    The function internally undistorts points with @ref undistortPoints and calls @ref cv::solvePnP,
+    thus the inputs are very similar. See @ref cv::solvePnP and the Perspective-n-Point description in
+    @ref calib3d_solvePnP for more information.
+    */
+    CV_EXPORTS_W bool solvePnP( InputArray objectPoints, InputArray imagePoints,
+                                InputArray cameraMatrix, InputArray distCoeffs,
+                                OutputArray rvec, OutputArray tvec,
+                                bool useExtrinsicGuess = false, int flags = SOLVEPNP_ITERATIVE,
+                                TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 10, 1e-8)
+                              );
+
+//! @} calib3d_fisheye
+} // end namespace fisheye
+
+} //end namespace cv
+
+#if 0 //def __cplusplus
+//////////////////////////////////////////////////////////////////////////////////////////
+class CV_EXPORTS CvLevMarq  // compiled out by the enclosing #if 0; presumably a Levenberg-Marquardt solver (name-based guess, TODO confirm)
+{
+public:
+    CvLevMarq();
+    CvLevMarq( int nparams, int nerrs, CvTermCriteria criteria=
+              cvTermCriteria(CV_TERMCRIT_EPS+CV_TERMCRIT_ITER,30,DBL_EPSILON),
+              bool completeSymmFlag=false );
+    ~CvLevMarq();
+    void init( int nparams, int nerrs, CvTermCriteria criteria=  // takes the same parameters and defaults as the constructor above
+              cvTermCriteria(CV_TERMCRIT_EPS+CV_TERMCRIT_ITER,30,DBL_EPSILON),
+              bool completeSymmFlag=false );
+    bool update( const CvMat*& param, CvMat*& J, CvMat*& err );
+    bool updateAlt( const CvMat*& param, CvMat*& JtJ, CvMat*& JtErr, double*& errNorm );
+
+    void clear();
+    void step();
+    enum { DONE=0, STARTED=1, CALC_J=2, CHECK_ERR=3 };  // presumably the values held by `state` below - TODO confirm
+
+    cv::Ptr<CvMat> mask;
+    cv::Ptr<CvMat> prevParam;
+    cv::Ptr<CvMat> param;
+    cv::Ptr<CvMat> J;
+    cv::Ptr<CvMat> err;
+    cv::Ptr<CvMat> JtJ;
+    cv::Ptr<CvMat> JtJN;
+    cv::Ptr<CvMat> JtErr;
+    cv::Ptr<CvMat> JtJV;
+    cv::Ptr<CvMat> JtJW;
+    double prevErrNorm, errNorm;
+    int lambdaLg10;  // NOTE(review): name suggests log10 of the damping factor - confirm against the implementation
+    CvTermCriteria criteria;
+    int state;
+    int iters;
+    bool completeSymmFlag;
+    int solveMethod;
+};
+#endif
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/calib3d/calib3d.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/calib3d/calib3d.hpp
new file mode 100644
index 000000000000..b3da45edd59c
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/calib3d/calib3d.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/calib3d.hpp"
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/calib3d/calib3d_c.h b/3rdparty/opencv/opencv410/build/include/opencv2/calib3d/calib3d_c.h
new file mode 100644
index 000000000000..e2af07b2e2ba
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/calib3d/calib3d_c.h
@@ -0,0 +1,150 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CALIB3D_C_H
+#define OPENCV_CALIB3D_C_H
+
+#include "opencv2/core/types_c.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Calculates fundamental matrix given a set of corresponding points */
+#define CV_FM_7POINT 1
+#define CV_FM_8POINT 2
+
+#define CV_LMEDS 4
+#define CV_RANSAC 8
+
+#define CV_FM_LMEDS_ONLY  CV_LMEDS
+#define CV_FM_RANSAC_ONLY CV_RANSAC
+#define CV_FM_LMEDS CV_LMEDS
+#define CV_FM_RANSAC CV_RANSAC
+
+enum
+{
+    CV_ITERATIVE = 0,
+    CV_EPNP = 1, // F.Moreno-Noguer, V.Lepetit and P.Fua "EPnP: Efficient Perspective-n-Point Camera Pose Estimation"
+    CV_P3P = 2, // X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang; "Complete Solution Classification for the Perspective-Three-Point Problem"
+    CV_DLS = 3 // Joel A. Hesch and Stergios I. Roumeliotis. "A Direct Least-Squares (DLS) Method for PnP"
+};
+
+#define CV_CALIB_CB_ADAPTIVE_THRESH  1
+#define CV_CALIB_CB_NORMALIZE_IMAGE  2
+#define CV_CALIB_CB_FILTER_QUADS     4
+#define CV_CALIB_CB_FAST_CHECK       8
+
+#define CV_CALIB_USE_INTRINSIC_GUESS  1
+#define CV_CALIB_FIX_ASPECT_RATIO     2
+#define CV_CALIB_FIX_PRINCIPAL_POINT  4
+#define CV_CALIB_ZERO_TANGENT_DIST    8
+#define CV_CALIB_FIX_FOCAL_LENGTH 16
+#define CV_CALIB_FIX_K1  32
+#define CV_CALIB_FIX_K2  64
+#define CV_CALIB_FIX_K3  128
+#define CV_CALIB_FIX_K4  2048
+#define CV_CALIB_FIX_K5  4096
+#define CV_CALIB_FIX_K6  8192
+#define CV_CALIB_RATIONAL_MODEL 16384
+#define CV_CALIB_THIN_PRISM_MODEL 32768
+#define CV_CALIB_FIX_S1_S2_S3_S4  65536
+#define CV_CALIB_TILTED_MODEL  262144
+#define CV_CALIB_FIX_TAUX_TAUY  524288
+#define CV_CALIB_FIX_TANGENT_DIST 2097152
+
+#define CV_CALIB_NINTRINSIC 18
+
+#define CV_CALIB_FIX_INTRINSIC  256
+#define CV_CALIB_SAME_FOCAL_LENGTH 512
+
+#define CV_CALIB_ZERO_DISPARITY 1024
+
+/* stereo correspondence parameters and functions */
+#define CV_STEREO_BM_NORMALIZED_RESPONSE  0
+#define CV_STEREO_BM_XSOBEL               1
+
+#ifdef __cplusplus
+} // extern "C"
+
+//////////////////////////////////////////////////////////////////////////////////////////
+class CV_EXPORTS CvLevMarq // NOTE(review): presumably an iterative Levenberg-Marquardt solver (inferred from the name only) - confirm
+{
+public:
+    CvLevMarq();
+    CvLevMarq( int nparams, int nerrs, CvTermCriteria criteria= // same parameters and defaults as init() below
+              cvTermCriteria(CV_TERMCRIT_EPS+CV_TERMCRIT_ITER,30,DBL_EPSILON),
+              bool completeSymmFlag=false );
+    ~CvLevMarq();
+    void init( int nparams, int nerrs, CvTermCriteria criteria= // same parameters and defaults as the 4-argument constructor above
+              cvTermCriteria(CV_TERMCRIT_EPS+CV_TERMCRIT_ITER,30,DBL_EPSILON),
+              bool completeSymmFlag=false );
+    bool update( const CvMat*& param, CvMat*& J, CvMat*& err ); // arguments are references to pointers, so the call can change where the caller's pointers point
+    bool updateAlt( const CvMat*& param, CvMat*& JtJ, CvMat*& JtErr, double*& errNorm ); // variant of update() taking JtJ/JtErr/errNorm instead of J/err
+
+    void clear();
+    void step();
+    enum { DONE=0, STARTED=1, CALC_J=2, CHECK_ERR=3 }; // presumably the possible values of 'state' below - TODO confirm
+
+    cv::Ptr<CvMat> mask;
+    cv::Ptr<CvMat> prevParam;
+    cv::Ptr<CvMat> param;
+    cv::Ptr<CvMat> J;
+    cv::Ptr<CvMat> err;
+    cv::Ptr<CvMat> JtJ;
+    cv::Ptr<CvMat> JtJN;
+    cv::Ptr<CvMat> JtErr;
+    cv::Ptr<CvMat> JtJV;
+    cv::Ptr<CvMat> JtJW;
+    double prevErrNorm, errNorm;
+    int lambdaLg10; // NOTE(review): looks like log10 of a damping factor lambda (from the name) - confirm
+    CvTermCriteria criteria;
+    int state;
+    int iters;
+    bool completeSymmFlag;
+    int solveMethod;
+};
+
+#endif
+
+#endif /* OPENCV_CALIB3D_C_H */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core.hpp
new file mode 100644
index 000000000000..b58a3a6ccbbe
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core.hpp
@@ -0,0 +1,3421 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2015, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2015, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_HPP
+#define OPENCV_CORE_HPP
+
+#ifndef __cplusplus
+#  error core.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/base.hpp"
+#include "opencv2/core/cvstd.hpp"
+#include "opencv2/core/traits.hpp"
+#include "opencv2/core/matx.hpp"
+#include "opencv2/core/types.hpp"
+#include "opencv2/core/mat.hpp"
+#include "opencv2/core/persistence.hpp"
+
+/**
+@defgroup core Core functionality
+@{
+    @defgroup core_basic Basic structures
+    @defgroup core_array Operations on arrays
+    @defgroup core_async Asynchronous API
+    @defgroup core_xml XML/YAML Persistence
+    @defgroup core_cluster Clustering
+    @defgroup core_utils Utility and system functions and macros
+    @{
+        @defgroup core_logging Logging facilities
+        @defgroup core_utils_sse SSE utilities
+        @defgroup core_utils_neon NEON utilities
+        @defgroup core_utils_vsx VSX utilities
+        @defgroup core_utils_softfloat Softfloat support
+        @defgroup core_utils_samples Utility functions for OpenCV samples
+    @}
+    @defgroup core_opengl OpenGL interoperability
+    @defgroup core_ipp Intel IPP Asynchronous C/C++ Converters
+    @defgroup core_optim Optimization Algorithms
+    @defgroup core_directx DirectX interoperability
+    @defgroup core_eigen Eigen support
+    @defgroup core_opencl OpenCL support
+    @defgroup core_va_intel Intel VA-API/OpenCL (CL-VA) interoperability
+    @defgroup core_hal Hardware Acceleration Layer
+    @{
+        @defgroup core_hal_functions Functions
+        @defgroup core_hal_interface Interface
+        @defgroup core_hal_intrin Universal intrinsics
+        @{
+            @defgroup core_hal_intrin_impl Private implementation helpers
+        @}
+        @defgroup core_lowlevel_api Low-level API for external libraries / plugins
+    @}
+    @defgroup core_parallel Parallel Processing
+    @{
+        @defgroup core_parallel_backend Parallel backends API
+    @}
+@}
+ */
+
+namespace cv {
+
+//! @addtogroup core_utils
+//! @{
+
+/*! @brief Class passed to an error.
+
+This class encapsulates all or almost all necessary
+information about the error that happened in the program. The exception is
+usually constructed and thrown implicitly via CV_Error and CV_Error_ macros.
+@see error
+ */
+class CV_EXPORTS Exception : public std::exception
+{
+public:
+    /*!
+     Default constructor
+     */
+    Exception();
+    /*!
+     Full constructor. Normally the constructor is not called explicitly.
+     Instead, the macros CV_Error(), CV_Error_() and CV_Assert() are used.
+    */
+    Exception(int _code, const String& _err, const String& _func, const String& _file, int _line);
+    virtual ~Exception() throw();
+
+    /*!
+     \return the error description and the context as a text string.
+    */
+    virtual const char *what() const throw() CV_OVERRIDE;
+    void formatMessage();
+
+    String msg; ///< the formatted error message
+
+    int code; ///< error code @see CVStatus
+    String err; ///< error description
+    String func; ///< function name. Available only when the compiler supports getting it
+    String file; ///< source file name where the error has occurred
+    int line; ///< line number in the source file where the error has occurred
+};
+
+/*! @brief Signals an error and raises the exception.
+
+By default the function prints information about the error to stderr,
+then it either stops if cv::setBreakOnError() had been called before or raises the exception.
+It is possible to alternate error processing by using #redirectError().
+@param exc the exception to be raised.
+@deprecated drop this version
+ */
+CV_EXPORTS CV_NORETURN void error(const Exception& exc);
+
+enum SortFlags { SORT_EVERY_ROW    = 0, //!< each matrix row is sorted independently
+                 SORT_EVERY_COLUMN = 1, //!< each matrix column is sorted
+                                        //!< independently; this flag and the previous one are
+                                        //!< mutually exclusive.
+                 SORT_ASCENDING    = 0, //!< each matrix row is sorted in the ascending
+                                        //!< order.
+                 SORT_DESCENDING   = 16 //!< each matrix row is sorted in the
+                                        //!< descending order; this flag and the previous one are also
+                                        //!< mutually exclusive.
+               };
+
+//! @} core_utils
+
+//! @addtogroup core
+//! @{
+
+//! Covariation flags
+enum CovarFlags {
+    /** The output covariance matrix is calculated as:
+       \f[\texttt{scale}   \cdot  [  \texttt{vects}  [0]-  \texttt{mean}  , \texttt{vects}  [1]-  \texttt{mean}  ,...]^T  \cdot  [ \texttt{vects}  [0]- \texttt{mean}  , \texttt{vects}  [1]- \texttt{mean}  ,...],\f]
+       The covariance matrix will be nsamples x nsamples. Such an unusual covariance matrix is used
+       for fast PCA of a set of very large vectors (see, for example, the EigenFaces technique for
+       face recognition). Eigenvalues of this "scrambled" matrix match the eigenvalues of the true
+       covariance matrix. The "true" eigenvectors can be easily calculated from the eigenvectors of
+       the "scrambled" covariance matrix. */
+    COVAR_SCRAMBLED = 0,
+    /**The output covariance matrix is calculated as:
+        \f[\texttt{scale}   \cdot  [  \texttt{vects}  [0]-  \texttt{mean}  , \texttt{vects}  [1]-  \texttt{mean}  ,...]  \cdot  [ \texttt{vects}  [0]- \texttt{mean}  , \texttt{vects}  [1]- \texttt{mean}  ,...]^T,\f]
+        covar will be a square matrix of the same size as the total number of elements in each input
+        vector. One and only one of #COVAR_SCRAMBLED and #COVAR_NORMAL must be specified.*/
+    COVAR_NORMAL    = 1,
+    /** If the flag is specified, the function does not calculate mean from
+        the input vectors but, instead, uses the passed mean vector. This is useful if mean has been
+        pre-calculated or known in advance, or if the covariance matrix is calculated by parts. In
+        this case, mean is not a mean vector of the input sub-set of vectors but rather the mean
+        vector of the whole set.*/
+    COVAR_USE_AVG   = 2,
+    /** If the flag is specified, the covariance matrix is scaled. In the
+        "normal" mode, scale is 1./nsamples . In the "scrambled" mode, scale is the reciprocal of the
+        total number of elements in each input vector. By default (if the flag is not specified), the
+        covariance matrix is not scaled ( scale=1 ).*/
+    COVAR_SCALE     = 4,
+    /** If the flag is
+        specified, all the input vectors are stored as rows of the samples matrix. mean should be a
+        single-row vector in this case.*/
+    COVAR_ROWS      = 8,
+    /** If the flag is
+        specified, all the input vectors are stored as columns of the samples matrix. mean should be a
+        single-column vector in this case.*/
+    COVAR_COLS      = 16
+};
+
+//! @addtogroup core_cluster
+//!  @{
+
+//! k-Means flags
+enum KmeansFlags {
+    /** Select random initial centers in each attempt.*/
+    KMEANS_RANDOM_CENTERS     = 0,
+    /** Use kmeans++ center initialization by Arthur and Vassilvitskii [Arthur2007].*/
+    KMEANS_PP_CENTERS         = 2,
+    /** During the first (and possibly the only) attempt, use the
+        user-supplied labels instead of computing them from the initial centers. For the second and
+        further attempts, use the random or semi-random centers. Use one of KMEANS_\*_CENTERS flag
+        to specify the exact method.*/
+    KMEANS_USE_INITIAL_LABELS = 1
+};
+
+//! @} core_cluster
+
+//! @addtogroup core_array
+//! @{
+
+enum ReduceTypes { REDUCE_SUM = 0, //!< the output is the sum of all rows/columns of the matrix.
+                   REDUCE_AVG = 1, //!< the output is the mean vector of all rows/columns of the matrix.
+                   REDUCE_MAX = 2, //!< the output is the maximum (column/row-wise) of all rows/columns of the matrix.
+                   REDUCE_MIN = 3,  //!< the output is the minimum (column/row-wise) of all rows/columns of the matrix.
+                   REDUCE_SUM2 = 4  //!< the output is the sum of all squared rows/columns of the matrix.
+                 };
+
+//! @} core_array
+
+/** @brief Swaps two matrices
+*/
+CV_EXPORTS void swap(Mat& a, Mat& b);
+/** @overload */
+CV_EXPORTS void swap( UMat& a, UMat& b );
+
+//! @} core
+
+//! @addtogroup core_array
+//! @{
+
+/** @brief Computes the source location of an extrapolated pixel.
+
+The function computes and returns the coordinate of a donor pixel corresponding to the specified
+extrapolated pixel when using the specified extrapolation border mode. For example, if you use
+cv::BORDER_WRAP mode in the horizontal direction, cv::BORDER_REFLECT_101 in the vertical direction and
+want to compute value of the "virtual" pixel Point(-5, 100) in a floating-point image img, it
+looks like:
+@code{.cpp}
+    float val = img.at<float>(borderInterpolate(100, img.rows, cv::BORDER_REFLECT_101),
+                              borderInterpolate(-5, img.cols, cv::BORDER_WRAP));
+@endcode
+Normally, the function is not called directly. It is used inside filtering functions and also in
+copyMakeBorder.
+@param p 0-based coordinate of the extrapolated pixel along one of the axes, likely \<0 or \>= len
+@param len Length of the array along the corresponding axis.
+@param borderType Border type, one of the #BorderTypes, except for #BORDER_TRANSPARENT and
+#BORDER_ISOLATED. When borderType==#BORDER_CONSTANT, the function always returns -1, regardless
+of p and len.
+
+@sa copyMakeBorder
+*/
+CV_EXPORTS_W int borderInterpolate(int p, int len, int borderType);
+
+/** @example samples/cpp/tutorial_code/ImgTrans/copyMakeBorder_demo.cpp
+An example using copyMakeBorder function.
+Check @ref tutorial_copyMakeBorder "the corresponding tutorial" for more details
+*/
+
+/** @brief Forms a border around an image.
+
+The function copies the source image into the middle of the destination image. The areas to the
+left, to the right, above and below the copied source image will be filled with extrapolated
+pixels. This is not what filtering functions based on it do (they extrapolate pixels on the fly), but
+what other more complex functions, including your own, may do to simplify image boundary handling.
+
+The function supports the mode when src is already in the middle of dst . In this case, the
+function does not copy src itself but simply constructs the border, for example:
+
+@code{.cpp}
+    // let border be the same in all directions
+    int border=2;
+    // constructs a larger image to fit both the image and the border
+    Mat gray_buf(rgb.rows + border*2, rgb.cols + border*2, rgb.depth());
+    // select the middle part of it w/o copying data
+    Mat gray(gray_buf, Rect(border, border, rgb.cols, rgb.rows));
+    // convert image from RGB to grayscale
+    cvtColor(rgb, gray, COLOR_RGB2GRAY);
+    // form a border in-place
+    copyMakeBorder(gray, gray_buf, border, border,
+                   border, border, BORDER_REPLICATE);
+    // now do some custom filtering ...
+    ...
+@endcode
+@note When the source image is a part (ROI) of a bigger image, the function will try to use the
+pixels outside of the ROI to form a border. To disable this feature and always do extrapolation, as
+if src was not a ROI, use borderType | #BORDER_ISOLATED.
+
+@param src Source image.
+@param dst Destination image of the same type as src and the size Size(src.cols+left+right,
+src.rows+top+bottom) .
+@param top the top pixels
+@param bottom the bottom pixels
+@param left the left pixels
+@param right Parameter specifying how many pixels in each direction from the source image rectangle
+to extrapolate. For example, top=1, bottom=1, left=1, right=1 mean that 1 pixel-wide border needs
+to be built.
+@param borderType Border type. See borderInterpolate for details.
+@param value Border value if borderType==BORDER_CONSTANT .
+
+@sa  borderInterpolate
+*/
+CV_EXPORTS_W void copyMakeBorder(InputArray src, OutputArray dst,
+                                 int top, int bottom, int left, int right,
+                                 int borderType, const Scalar& value = Scalar() );
+
+/** @brief Calculates the per-element sum of two arrays or an array and a scalar.
+
+The function add calculates:
+- Sum of two arrays when both input arrays have the same size and the same number of channels:
+\f[\texttt{dst}(I) =  \texttt{saturate} ( \texttt{src1}(I) +  \texttt{src2}(I)) \quad \texttt{if mask}(I) \ne0\f]
+- Sum of an array and a scalar when src2 is constructed from Scalar or has the same number of
+elements as `src1.channels()`:
+\f[\texttt{dst}(I) =  \texttt{saturate} ( \texttt{src1}(I) +  \texttt{src2} ) \quad \texttt{if mask}(I) \ne0\f]
+- Sum of a scalar and an array when src1 is constructed from Scalar or has the same number of
+elements as `src2.channels()`:
+\f[\texttt{dst}(I) =  \texttt{saturate} ( \texttt{src1} +  \texttt{src2}(I) ) \quad \texttt{if mask}(I) \ne0\f]
+where `I` is a multi-dimensional index of array elements. In case of multi-channel arrays, each
+channel is processed independently.
+
+The first function in the list above can be replaced with matrix expressions:
+@code{.cpp}
+    dst = src1 + src2;
+    dst += src1; // equivalent to add(dst, src1, dst);
+@endcode
+The input arrays and the output array can all have the same or different depths. For example, you
+can add a 16-bit unsigned array to an 8-bit signed array and store the sum as a 32-bit
+floating-point array. Depth of the output array is determined by the dtype parameter. In the second
+and third cases above, as well as in the first case, when src1.depth() == src2.depth(), dtype can
+be set to the default -1. In this case, the output array will have the same depth as the input
+array, be it src1, src2 or both.
+@note Saturation is not applied when the output array has the depth CV_32S. You may even get
+result of an incorrect sign in the case of overflow.
+@note (Python) Be careful about the difference in behaviour between src1/src2 being a single number and being a tuple/array.
+`add(src,X)` means `add(src,(X,X,X,X))`.
+`add(src,(X,))` means `add(src,(X,0,0,0))`.
+@param src1 first input array or a scalar.
+@param src2 second input array or a scalar.
+@param dst output array that has the same size and number of channels as the input array(s); the
+depth is defined by dtype or src1/src2.
+@param mask optional operation mask - 8-bit single channel array, that specifies elements of the
+output array to be changed.
+@param dtype optional depth of the output array (see the discussion above).
+@sa subtract, addWeighted, scaleAdd, Mat::convertTo
+*/
+CV_EXPORTS_W void add(InputArray src1, InputArray src2, OutputArray dst,
+                      InputArray mask = noArray(), int dtype = -1);
+
+/** @brief Calculates the per-element difference between two arrays or array and a scalar.
+
+The function subtract calculates:
+- Difference between two arrays, when both input arrays have the same size and the same number of
+channels:
+    \f[\texttt{dst}(I) =  \texttt{saturate} ( \texttt{src1}(I) -  \texttt{src2}(I)) \quad \texttt{if mask}(I) \ne0\f]
+- Difference between an array and a scalar, when src2 is constructed from Scalar or has the same
+number of elements as `src1.channels()`:
+    \f[\texttt{dst}(I) =  \texttt{saturate} ( \texttt{src1}(I) -  \texttt{src2} ) \quad \texttt{if mask}(I) \ne0\f]
+- Difference between a scalar and an array, when src1 is constructed from Scalar or has the same
+number of elements as `src2.channels()`:
+    \f[\texttt{dst}(I) =  \texttt{saturate} ( \texttt{src1} -  \texttt{src2}(I) ) \quad \texttt{if mask}(I) \ne0\f]
+- The reverse difference between a scalar and an array in the case of `SubRS`:
+    \f[\texttt{dst}(I) =  \texttt{saturate} ( \texttt{src2} -  \texttt{src1}(I) ) \quad \texttt{if mask}(I) \ne0\f]
+where I is a multi-dimensional index of array elements. In case of multi-channel arrays, each
+channel is processed independently.
+
+The first function in the list above can be replaced with matrix expressions:
+@code{.cpp}
+    dst = src1 - src2;
+    dst -= src1; // equivalent to subtract(dst, src1, dst);
+@endcode
+The input arrays and the output array can all have the same or different depths. For example, you
+can subtract two 8-bit unsigned arrays and store the difference in a 16-bit signed array. Depth of
+the output array is determined by dtype parameter. In the second and third cases above, as well as
+in the first case, when src1.depth() == src2.depth(), dtype can be set to the default -1. In this
+case the output array will have the same depth as the input array, be it src1, src2 or both.
+@note Saturation is not applied when the output array has the depth CV_32S. You may even get
+result of an incorrect sign in the case of overflow.
+@note (Python) Be careful about the difference in behaviour between src1/src2 being a single number and being a tuple/array.
+`subtract(src,X)` means `subtract(src,(X,X,X,X))`.
+`subtract(src,(X,))` means `subtract(src,(X,0,0,0))`.
+@param src1 first input array or a scalar.
+@param src2 second input array or a scalar.
+@param dst output array of the same size and the same number of channels as the input array.
+@param mask optional operation mask; this is an 8-bit single channel array that specifies elements
+of the output array to be changed.
+@param dtype optional depth of the output array
+@sa  add, addWeighted, scaleAdd, Mat::convertTo
+  */
+CV_EXPORTS_W void subtract(InputArray src1, InputArray src2, OutputArray dst,
+                           InputArray mask = noArray(), int dtype = -1);
+
+
+/** @brief Calculates the per-element scaled product of two arrays.
+
+The function multiply calculates the per-element product of two arrays:
+
+\f[\texttt{dst} (I)= \texttt{saturate} ( \texttt{scale} \cdot \texttt{src1} (I)  \cdot \texttt{src2} (I))\f]
+
+There is also a @ref MatrixExpressions -friendly variant of the first function. See Mat::mul .
+
+For a not-per-element matrix product, see gemm .
+
+@note Saturation is not applied when the output array has the depth
+CV_32S. You may even get result of an incorrect sign in the case of
+overflow.
+@note (Python) Be careful about the difference in behaviour between src1/src2 being a single number and being a tuple/array.
+`multiply(src,X)` means `multiply(src,(X,X,X,X))`.
+`multiply(src,(X,))` means `multiply(src,(X,0,0,0))`.
+@param src1 first input array.
+@param src2 second input array of the same size and the same type as src1.
+@param dst output array of the same size and type as src1.
+@param scale optional scale factor.
+@param dtype optional depth of the output array
+@sa add, subtract, divide, scaleAdd, addWeighted, accumulate, accumulateProduct, accumulateSquare,
+Mat::convertTo
+*/
+CV_EXPORTS_W void multiply(InputArray src1, InputArray src2,
+                           OutputArray dst, double scale = 1, int dtype = -1);
+
+/** @brief Performs per-element division of two arrays or a scalar by an array.
+
+The function cv::divide divides one array by another:
+\f[\texttt{dst(I) = saturate(src1(I)*scale/src2(I))}\f]
+or a scalar by an array when there is no src1 :
+\f[\texttt{dst(I) = saturate(scale/src2(I))}\f]
+
+Different channels of multi-channel arrays are processed independently.
+
+For integer types when src2(I) is zero, dst(I) will also be zero.
+
+@note In case of floating point data there is no special defined behavior for zero src2(I) values.
+Regular floating-point division is used.
+Expect correct IEEE-754 behaviour for floating-point data (with NaN, Inf result values).
+
+@note Saturation is not applied when the output array has the depth CV_32S. You may even get
+result of an incorrect sign in the case of overflow.
+@note (Python) Be careful about the difference in behaviour between src1/src2 being a single number and being a tuple/array.
+`divide(src,X)` means `divide(src,(X,X,X,X))`.
+`divide(src,(X,))` means `divide(src,(X,0,0,0))`.
+@param src1 first input array.
+@param src2 second input array of the same size and type as src1.
+@param scale scalar factor.
+@param dst output array of the same size and type as src2.
+@param dtype optional depth of the output array; if -1, dst will have depth src2.depth(), but in
+case of an array-by-array division, you can only pass -1 when src1.depth()==src2.depth().
+@sa  multiply, add, subtract
+*/
+CV_EXPORTS_W void divide(InputArray src1, InputArray src2, OutputArray dst,
+                         double scale = 1, int dtype = -1);
+
+/** @overload */
+CV_EXPORTS_W void divide(double scale, InputArray src2,
+                         OutputArray dst, int dtype = -1);
+
+/** @brief Calculates the sum of a scaled array and another array.
+
+The function scaleAdd is one of the classical primitive linear algebra operations, known as DAXPY
+or SAXPY in [BLAS](http://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms). It calculates
+the sum of a scaled array and another array:
+\f[\texttt{dst} (I)= \texttt{scale} \cdot \texttt{src1} (I) +  \texttt{src2} (I)\f]
+The function can also be emulated with a matrix expression, for example:
+@code{.cpp}
+    Mat A(3, 3, CV_64F);
+    ...
+    A.row(0) = A.row(1)*2 + A.row(2);
+@endcode
+@param src1 first input array.
+@param alpha scale factor for the first array.
+@param src2 second input array of the same size and type as src1.
+@param dst output array of the same size and type as src1.
+@sa add, addWeighted, subtract, Mat::dot, Mat::convertTo
+*/
+CV_EXPORTS_W void scaleAdd(InputArray src1, double alpha, InputArray src2, OutputArray dst);
+
+/** @example samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp
+Check @ref tutorial_trackbar "the corresponding tutorial" for more details
+*/
+
+/** @brief Calculates the weighted sum of two arrays.
+
+The function addWeighted calculates the weighted sum of two arrays as follows:
+\f[\texttt{dst} (I)= \texttt{saturate} ( \texttt{src1} (I)* \texttt{alpha} +  \texttt{src2} (I)* \texttt{beta} +  \texttt{gamma} )\f]
+where I is a multi-dimensional index of array elements. In case of multi-channel arrays, each
+channel is processed independently.
+The function can be replaced with a matrix expression:
+@code{.cpp}
+    dst = src1*alpha + src2*beta + gamma;
+@endcode
+@note Saturation is not applied when the output array has the depth CV_32S. You may even get
+result of an incorrect sign in the case of overflow.
+@param src1 first input array.
+@param alpha weight of the first array elements.
+@param src2 second input array of the same size and channel number as src1.
+@param beta weight of the second array elements.
+@param gamma scalar added to each sum.
+@param dst output array that has the same size and number of channels as the input arrays.
+@param dtype optional depth of the output array; when both input arrays have the same depth, dtype
+can be set to -1, which will be equivalent to src1.depth().
+@sa  add, subtract, scaleAdd, Mat::convertTo
+*/
+CV_EXPORTS_W void addWeighted(InputArray src1, double alpha, InputArray src2,
+                              double beta, double gamma, OutputArray dst, int dtype = -1);
+
+/** @brief Scales, calculates absolute values, and converts the result to 8-bit.
+
+On each element of the input array, the function convertScaleAbs
+performs three operations sequentially: scaling, taking an absolute
+value, conversion to an unsigned 8-bit type:
+\f[\texttt{dst} (I)= \texttt{saturate\_cast<uchar>} (| \texttt{src} (I)* \texttt{alpha} +  \texttt{beta} |)\f]
+In case of multi-channel arrays, the function processes each channel
+independently. When the output is not 8-bit, the operation can be
+emulated by calling the Mat::convertTo method (or by using matrix
+expressions) and then by calculating an absolute value of the result.
+For example:
+@code{.cpp}
+    Mat_<float> A(30,30);
+    randu(A, Scalar(-100), Scalar(100));
+    Mat_<float> B = A*5 + 3;
+    B = abs(B);
+    // Mat_<float> B = abs(A*5+3) will also do the job,
+    // but it will allocate a temporary matrix
+@endcode
+@param src input array.
+@param dst output array.
+@param alpha optional scale factor.
+@param beta optional delta added to the scaled values.
+@sa  Mat::convertTo, cv::abs(const Mat&)
+*/
+CV_EXPORTS_W void convertScaleAbs(InputArray src, OutputArray dst,
+                                  double alpha = 1, double beta = 0);
+
+/** @brief Converts an array to half precision floating number.
+
+This function converts FP32 (single precision floating point) from/to FP16 (half precision floating point). CV_16S format is used to represent FP16 data.
+There are two use modes (src -> dst): CV_32F -> CV_16S and CV_16S -> CV_32F. The input array has to have type of CV_32F or
+CV_16S to represent the bit depth. If the input array is neither of them, the function will raise an error.
+The format of half precision floating point is defined in IEEE 754-2008.
+
+@param src input array.
+@param dst output array.
+
+@deprecated Use Mat::convertTo with CV_16F instead.
+*/
+CV_EXPORTS_W void convertFp16(InputArray src, OutputArray dst);
+
+/** @brief Performs a look-up table transform of an array.
+
+The function LUT fills the output array with values from the look-up table. Indices of the entries
+are taken from the input array. That is, the function processes each element of src as follows:
+\f[\texttt{dst} (I)  \leftarrow \texttt{lut(src(I) + d)}\f]
+where
+\f[d =  \fork{0}{if \(\texttt{src}\) has depth \(\texttt{CV_8U}\)}{128}{if \(\texttt{src}\) has depth \(\texttt{CV_8S}\)}\f]
+@param src input array of 8-bit elements.
+@param lut look-up table of 256 elements; in case of multi-channel input array, the table should
+either have a single channel (in this case the same table is used for all channels) or the same
+number of channels as in the input array.
+@param dst output array of the same size and number of channels as src, and the same depth as lut.
+@sa  convertScaleAbs, Mat::convertTo
+*/
+CV_EXPORTS_W void LUT(InputArray src, InputArray lut, OutputArray dst);
+
+/** @brief Calculates the sum of array elements.
+
+The function cv::sum calculates and returns the sum of array elements,
+independently for each channel.
+@param src input array that must have from 1 to 4 channels.
+@sa  countNonZero, mean, meanStdDev, norm, minMaxLoc, reduce
+*/
+CV_EXPORTS_AS(sumElems) Scalar sum(InputArray src);
+
+/** @brief Checks for the presence of at least one non-zero array element.
+
+The function returns whether there are non-zero elements in src
+
+The function does not work with multi-channel arrays. If you need to check non-zero array
+elements across all the channels, use Mat::reshape first to reinterpret the array as
+single-channel. Or you may extract the particular channel using either extractImageCOI, or
+mixChannels, or split.
+
+@note
+- If the location of non-zero array elements is important, @ref findNonZero is helpful.
+- If the count of non-zero array elements is important, @ref countNonZero is helpful.
+@param src single-channel array.
+@sa  mean, meanStdDev, norm, minMaxLoc, calcCovarMatrix
+@sa  findNonZero, countNonZero
+*/
+CV_EXPORTS_W bool hasNonZero( InputArray src );
+
+/** @brief Counts non-zero array elements.
+
+The function returns the number of non-zero elements in src :
+\f[\sum _{I: \; \texttt{src} (I) \ne0 } 1\f]
+
+The function does not work with multi-channel arrays. If you need to count non-zero array
+elements across all the channels, use Mat::reshape first to reinterpret the array as
+single-channel. Or you may extract the particular channel using either extractImageCOI, or
+mixChannels, or split.
+
+@note
+- If only whether there are non-zero elements is important, @ref hasNonZero is helpful.
+- If the location of non-zero array elements is important, @ref findNonZero is helpful.
+@param src single-channel array.
+@sa  mean, meanStdDev, norm, minMaxLoc, calcCovarMatrix
+@sa  findNonZero, hasNonZero
+*/
+CV_EXPORTS_W int countNonZero( InputArray src );
+
+/** @brief Returns the list of locations of non-zero pixels
+
+Given a binary matrix (likely returned from an operation such
+as threshold(), compare(), >, ==, etc.), return all of
+the non-zero indices as a cv::Mat or std::vector<cv::Point> (x,y).
+For example:
+@code{.cpp}
+    cv::Mat binaryImage; // input, binary image
+    cv::Mat locations;   // output, locations of non-zero pixels
+    cv::findNonZero(binaryImage, locations);
+
+    // access pixel coordinates
+    Point pnt = locations.at<Point>(i);
+@endcode
+or
+@code{.cpp}
+    cv::Mat binaryImage; // input, binary image
+    vector<Point> locations;   // output, locations of non-zero pixels
+    cv::findNonZero(binaryImage, locations);
+
+    // access pixel coordinates
+    Point pnt = locations[i];
+@endcode
+
+The function does not work with multi-channel arrays. If you need to find non-zero
+elements across all the channels, use Mat::reshape first to reinterpret the array as
+single-channel. Or you may extract the particular channel using either extractImageCOI, or
+mixChannels, or split.
+
+@note
+- If only count of non-zero array elements is important, @ref countNonZero is helpful.
+- If only whether there are non-zero elements is important, @ref hasNonZero is helpful.
+@param src single-channel array
+@param idx the output array, type of cv::Mat or std::vector<Point>, corresponding to non-zero indices in the input
+@sa  countNonZero, hasNonZero
+*/
+CV_EXPORTS_W void findNonZero( InputArray src, OutputArray idx );
+
+/** @brief Calculates an average (mean) of array elements.
+
+The function cv::mean calculates the mean value M of array elements,
+independently for each channel, and return it:
+\f[\begin{array}{l} N =  \sum _{I: \; \texttt{mask} (I) \ne 0} 1 \\ M_c =  \left ( \sum _{I: \; \texttt{mask} (I) \ne 0}{ \texttt{mtx} (I)_c} \right )/N \end{array}\f]
+When all the mask elements are 0's, the function returns Scalar::all(0)
+@param src input array that should have from 1 to 4 channels so that the result can be stored in
+Scalar_ .
+@param mask optional operation mask.
+@sa  countNonZero, meanStdDev, norm, minMaxLoc
+*/
+CV_EXPORTS_W Scalar mean(InputArray src, InputArray mask = noArray());
+
+/** @brief Calculates a mean and standard deviation of array elements.
+
+The function cv::meanStdDev calculates the mean and the standard deviation
+of array elements independently for each channel and returns them via the
+output parameters:
+\f[\begin{array}{l} N =  \sum _{I, \texttt{mask} (I)  \ne 0} 1 \\ \texttt{mean} _c =  \frac{\sum_{ I: \; \texttt{mask}(I) \ne 0} \texttt{src} (I)_c}{N} \\ \texttt{stddev} _c =  \sqrt{\frac{\sum_{ I: \; \texttt{mask}(I) \ne 0} \left ( \texttt{src} (I)_c -  \texttt{mean} _c \right )^2}{N}} \end{array}\f]
+When all the mask elements are 0's, the function returns
+mean=stddev=Scalar::all(0).
+@note The calculated standard deviation is only the diagonal of the
+complete normalized covariance matrix. If the full matrix is needed, you
+can reshape the multi-channel array M x N to the single-channel array
+M\*N x mtx.channels() (only possible when the matrix is continuous) and
+then pass the matrix to calcCovarMatrix .
+@param src input array that should have from 1 to 4 channels so that the results can be stored in
+Scalar_ 's.
+@param mean output parameter: calculated mean value.
+@param stddev output parameter: calculated standard deviation.
+@param mask optional operation mask.
+@sa  countNonZero, mean, norm, minMaxLoc, calcCovarMatrix
+*/
+CV_EXPORTS_W void meanStdDev(InputArray src, OutputArray mean, OutputArray stddev,
+                             InputArray mask=noArray());
+
+/** @brief Calculates the absolute norm of an array.
+
+This version of #norm calculates the absolute norm of src1. The type of norm to calculate is specified using #NormTypes.
+
+As example for one array consider the function \f$r(x)= \begin{pmatrix} x \\ 1-x \end{pmatrix}, x \in [-1;1]\f$.
+The \f$ L_{1}, L_{2} \f$ and \f$ L_{\infty} \f$ norm for the sample value \f$r(-1) = \begin{pmatrix} -1 \\ 2 \end{pmatrix}\f$
+is calculated as follows
+\f{align*}
+    \| r(-1) \|_{L_1} &= |-1| + |2| = 3 \\
+    \| r(-1) \|_{L_2} &= \sqrt{(-1)^{2} + (2)^{2}} = \sqrt{5} \\
+    \| r(-1) \|_{L_\infty} &= \max(|-1|,|2|) = 2
+\f}
+and for \f$r(0.5) = \begin{pmatrix} 0.5 \\ 0.5 \end{pmatrix}\f$ the calculation is
+\f{align*}
+    \| r(0.5) \|_{L_1} &= |0.5| + |0.5| = 1 \\
+    \| r(0.5) \|_{L_2} &= \sqrt{(0.5)^{2} + (0.5)^{2}} = \sqrt{0.5} \\
+    \| r(0.5) \|_{L_\infty} &= \max(|0.5|,|0.5|) = 0.5.
+\f}
+The following graphic shows all values for the three norm functions \f$\| r(x) \|_{L_1}, \| r(x) \|_{L_2}\f$ and \f$\| r(x) \|_{L_\infty}\f$.
+It is notable that the \f$ L_{1} \f$ norm forms the upper and the \f$ L_{\infty} \f$ norm forms the lower border for the example function \f$ r(x) \f$.
+![Graphs for the different norm functions from the above example](pics/NormTypes_OneArray_1-2-INF.png)
+
+When the mask parameter is specified and it is not empty, the norm is
+calculated only over the region specified by the mask.
+
+If normType is not specified, #NORM_L2 is used.
+
+Multi-channel input arrays are treated as single-channel arrays, that is,
+the results for all channels are combined.
+
+Hamming norms can only be calculated with CV_8U depth arrays.
+
+@param src1 first input array.
+@param normType type of the norm (see #NormTypes).
+@param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type.
+*/
+CV_EXPORTS_W double norm(InputArray src1, int normType = NORM_L2, InputArray mask = noArray());
+
+/** @brief Calculates an absolute difference norm or a relative difference norm.
+
+This version of cv::norm calculates the absolute difference norm
+or the relative difference norm of arrays src1 and src2.
+The type of norm to calculate is specified using #NormTypes.
+
+@param src1 first input array.
+@param src2 second input array of the same size and the same type as src1.
+@param normType type of the norm (see #NormTypes).
+@param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type.
+*/
+CV_EXPORTS_W double norm(InputArray src1, InputArray src2,
+                         int normType = NORM_L2, InputArray mask = noArray());
+/** @overload
+@param src first input array.
+@param normType type of the norm (see #NormTypes).
+*/
+CV_EXPORTS double norm( const SparseMat& src, int normType );
+
+/** @brief Computes the Peak Signal-to-Noise Ratio (PSNR) image quality metric.
+
+This function calculates the Peak Signal-to-Noise Ratio (PSNR) image quality metric in decibels (dB),
+between two input arrays src1 and src2. The arrays must have the same type.
+
+The PSNR is calculated as follows:
+
+\f[
+\texttt{PSNR} = 10 \cdot \log_{10}{\left( \frac{R^2}{MSE} \right) }
+\f]
+
+where R is the maximum integer value of depth (e.g. 255 in the case of CV_8U data)
+and MSE is the mean squared error between the two arrays.
+
+@param src1 first input array.
+@param src2 second input array of the same size as src1.
+@param R the maximum pixel value (255 by default)
+
+  */
+CV_EXPORTS_W double PSNR(InputArray src1, InputArray src2, double R=255.);
+
+/** @brief naive nearest neighbor finder
+
+see http://en.wikipedia.org/wiki/Nearest_neighbor_search
+@todo document
+  */
+CV_EXPORTS_W void batchDistance(InputArray src1, InputArray src2,
+                                OutputArray dist, int dtype, OutputArray nidx,
+                                int normType = NORM_L2, int K = 0,
+                                InputArray mask = noArray(), int update = 0,
+                                bool crosscheck = false);
+
+/** @brief Normalizes the norm or value range of an array.
+
+The function cv::normalize scales and shifts the input array elements so that
+\f[\| \texttt{dst} \| _{L_p}= \texttt{alpha}\f]
+(where p=Inf, 1 or 2) when normType=NORM_INF, NORM_L1, or NORM_L2, respectively; or so that
+\f[\min _I  \texttt{dst} (I)= \texttt{alpha} , \, \, \max _I  \texttt{dst} (I)= \texttt{beta}\f]
+
+when normType=NORM_MINMAX (for dense arrays only). The optional mask specifies a sub-array to be
+normalized. This means that the norm or min-n-max are calculated over the sub-array, and then this
+sub-array is modified to be normalized. If you want to only use the mask to calculate the norm or
+min-max but modify the whole array, you can use norm and Mat::convertTo.
+
+In case of sparse matrices, only the non-zero values are analyzed and transformed. Because of this,
+the range transformation for sparse matrices is not allowed since it can shift the zero level.
+
+Possible usage with some positive example data:
+@code{.cpp}
+    vector<double> positiveData = { 2.0, 8.0, 10.0 };
+    vector<double> normalizedData_l1, normalizedData_l2, normalizedData_inf, normalizedData_minmax;
+
+    // Norm to probability (total count)
+    // sum(numbers) = 20.0
+    // 2.0      0.1     (2.0/20.0)
+    // 8.0      0.4     (8.0/20.0)
+    // 10.0     0.5     (10.0/20.0)
+    normalize(positiveData, normalizedData_l1, 1.0, 0.0, NORM_L1);
+
+    // Norm to unit vector: ||positiveData|| = 1.0
+    // 2.0      0.15
+    // 8.0      0.62
+    // 10.0     0.77
+    normalize(positiveData, normalizedData_l2, 1.0, 0.0, NORM_L2);
+
+    // Norm to max element
+    // 2.0      0.2     (2.0/10.0)
+    // 8.0      0.8     (8.0/10.0)
+    // 10.0     1.0     (10.0/10.0)
+    normalize(positiveData, normalizedData_inf, 1.0, 0.0, NORM_INF);
+
+    // Norm to range [0.0;1.0]
+    // 2.0      0.0     (shift to left border)
+    // 8.0      0.75    (6.0/8.0)
+    // 10.0     1.0     (shift to right border)
+    normalize(positiveData, normalizedData_minmax, 1.0, 0.0, NORM_MINMAX);
+@endcode
+
+@param src input array.
+@param dst output array of the same size as src .
+@param alpha norm value to normalize to or the lower range boundary in case of the range
+normalization.
+@param beta upper range boundary in case of the range normalization; it is not used for the norm
+normalization.
+@param norm_type normalization type (see cv::NormTypes).
+@param dtype when negative, the output array has the same type as src; otherwise, it has the same
+number of channels as src and the depth =CV_MAT_DEPTH(dtype).
+@param mask optional operation mask.
+@sa norm, Mat::convertTo, SparseMat::convertTo
+*/
+CV_EXPORTS_W void normalize( InputArray src, InputOutputArray dst, double alpha = 1, double beta = 0,
+                             int norm_type = NORM_L2, int dtype = -1, InputArray mask = noArray());
+
+/** @overload
+@param src input array.
+@param dst output array of the same size as src .
+@param alpha norm value to normalize to or the lower range boundary in case of the range
+normalization.
+@param normType normalization type (see cv::NormTypes).
+*/
+CV_EXPORTS void normalize( const SparseMat& src, SparseMat& dst, double alpha, int normType );
+
+/** @brief Finds the global minimum and maximum in an array.
+
+The function cv::minMaxLoc finds the minimum and maximum element values and their positions. The
+extremums are searched across the whole array or, if mask is not an empty array, in the specified
+array region.
+
+The function does not work with multi-channel arrays. If you need to find minimum or maximum
+elements across all the channels, use Mat::reshape first to reinterpret the array as
+single-channel. Or you may extract the particular channel using either extractImageCOI, or
+mixChannels, or split.
+@param src input single-channel array.
+@param minVal pointer to the returned minimum value; NULL is used if not required.
+@param maxVal pointer to the returned maximum value; NULL is used if not required.
+@param minLoc pointer to the returned minimum location (in 2D case); NULL is used if not required.
+@param maxLoc pointer to the returned maximum location (in 2D case); NULL is used if not required.
+@param mask optional mask used to select a sub-array.
+@sa max, min, reduceArgMin, reduceArgMax, compare, inRange, extractImageCOI, mixChannels, split, Mat::reshape
+*/
+CV_EXPORTS_W void minMaxLoc(InputArray src, CV_OUT double* minVal,
+                            CV_OUT double* maxVal = 0, CV_OUT Point* minLoc = 0,
+                            CV_OUT Point* maxLoc = 0, InputArray mask = noArray());
+
+/**
+ * @brief Finds indices of min elements along provided axis
+ *
+ * @note
+ *      - If input or output array is not continuous, this function will create an internal copy.
+ *      - NaN handling is left unspecified, see patchNaNs().
+ *      - The returned index is always in bounds of input matrix.
+ *
+ * @param src input single-channel array.
+ * @param dst output array of type CV_32SC1 with the same dimensionality as src,
+ * except for axis being reduced - it should be set to 1.
+ * @param axis axis to reduce along.
+ * @param lastIndex whether to get the index of first or last occurrence of min.
+ * @sa reduceArgMax, minMaxLoc, min, max, compare, reduce
+ */
+CV_EXPORTS_W void reduceArgMin(InputArray src, OutputArray dst, int axis, bool lastIndex = false);
+
+/**
+ * @brief Finds indices of max elements along provided axis
+ *
+ * @note
+ *      - If input or output array is not continuous, this function will create an internal copy.
+ *      - NaN handling is left unspecified, see patchNaNs().
+ *      - The returned index is always in bounds of input matrix.
+ *
+ * @param src input single-channel array.
+ * @param dst output array of type CV_32SC1 with the same dimensionality as src,
+ * except for axis being reduced - it should be set to 1.
+ * @param axis axis to reduce along.
+ * @param lastIndex whether to get the index of first or last occurrence of max.
+ * @sa reduceArgMin, minMaxLoc, min, max, compare, reduce
+ */
+CV_EXPORTS_W void reduceArgMax(InputArray src, OutputArray dst, int axis, bool lastIndex = false);
+
+/** @brief Finds the global minimum and maximum in an array
+
+The function cv::minMaxIdx finds the minimum and maximum element values and their positions. The
+extremums are searched across the whole array or, if mask is not an empty array, in the specified
+array region. In case of a sparse matrix, the minimum is found among non-zero elements
+only. Multi-channel input is supported only without a mask and without extremum indices (minIdx and maxIdx should be nullptr).
+@note When minIdx is not NULL, it must have at least 2 elements (as well as maxIdx), even if src is
+a single-row or single-column matrix. In OpenCV (following MATLAB) each array has at least 2
+dimensions, i.e. single-column matrix is Mx1 matrix (and therefore minIdx/maxIdx will be
+(i1,0)/(i2,0)) and single-row matrix is 1xN matrix (and therefore minIdx/maxIdx will be
+(0,j1)/(0,j2)).
+@param src input single-channel array.
+@param minVal pointer to the returned minimum value; NULL is used if not required.
+@param maxVal pointer to the returned maximum value; NULL is used if not required.
+@param minIdx pointer to the returned minimum location (in nD case); NULL is used if not required;
+Otherwise, it must point to an array of src.dims elements, the coordinates of the minimum element
+in each dimension are stored there sequentially.
+@param maxIdx pointer to the returned maximum location (in nD case). NULL is used if not required.
+@param mask specified array region
+*/
+CV_EXPORTS void minMaxIdx(InputArray src, double* minVal, double* maxVal = 0,
+                          int* minIdx = 0, int* maxIdx = 0, InputArray mask = noArray());
+
+/** @overload
+@param a input single-channel array.
+@param minVal pointer to the returned minimum value; NULL is used if not required.
+@param maxVal pointer to the returned maximum value; NULL is used if not required.
+@param minIdx pointer to the returned minimum location (in nD case); NULL is used if not required;
+Otherwise, it must point to an array of a.dims() elements, the coordinates of the minimum element
+in each dimension are stored there sequentially.
+@param maxIdx pointer to the returned maximum location (in nD case). NULL is used if not required.
+*/
+CV_EXPORTS void minMaxLoc(const SparseMat& a, double* minVal,
+                          double* maxVal, int* minIdx = 0, int* maxIdx = 0);
+
+/** @brief Reduces a matrix to a vector.
+
+The function #reduce reduces the matrix to a vector by treating the matrix rows/columns as a set of
+1D vectors and performing the specified operation on the vectors until a single row/column is
+obtained. For example, the function can be used to compute horizontal and vertical projections of a
+raster image. In case of #REDUCE_MAX and #REDUCE_MIN, the output image should have the same type as the source one.
+In case of #REDUCE_SUM, #REDUCE_SUM2 and #REDUCE_AVG, the output may have a larger element bit-depth to preserve accuracy.
+Multi-channel arrays are also supported in these reduction modes.
+
+The following code demonstrates its usage for a single channel matrix.
+@snippet snippets/core_reduce.cpp example
+
+And the following code demonstrates its usage for a two-channel matrix.
+@snippet snippets/core_reduce.cpp example2
+
+@param src input 2D matrix.
+@param dst output vector. Its size and type is defined by dim and dtype parameters.
+@param dim dimension index along which the matrix is reduced. 0 means that the matrix is reduced to
+a single row. 1 means that the matrix is reduced to a single column.
+@param rtype reduction operation that could be one of #ReduceTypes
+@param dtype when negative, the output vector will have the same type as the input matrix,
+otherwise, its type will be CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()).
+@sa repeat, reduceArgMin, reduceArgMax
+*/
+CV_EXPORTS_W void reduce(InputArray src, OutputArray dst, int dim, int rtype, int dtype = -1);
+
+/** @brief Creates one multi-channel array out of several single-channel ones.
+
+The function cv::merge merges several arrays to make a single multi-channel array. That is, each
+element of the output array will be a concatenation of the elements of the input arrays, where
+elements of i-th input array are treated as mv[i].channels()-element vectors.
+
+The function cv::split does the reverse operation. If you need to shuffle channels in some other
+advanced way, use cv::mixChannels.
+
+The following example shows how to merge 3 single channel matrices into a single 3-channel matrix.
+@snippet snippets/core_merge.cpp example
+
+@param mv input array of matrices to be merged; all the matrices in mv must have the same
+size and the same depth.
+@param count number of input matrices when mv is a plain C array; it must be greater than zero.
+@param dst output array of the same size and the same depth as mv[0]; The number of channels will
+be the total number of channels in the count input matrices.
+@sa  mixChannels, split, Mat::reshape
+*/
+CV_EXPORTS void merge(const Mat* mv, size_t count, OutputArray dst);
+
+/** @overload
+@param mv input vector of matrices to be merged; all the matrices in mv must have the same
+size and the same depth.
+@param dst output array of the same size and the same depth as mv[0]; The number of channels will
+be the total number of channels in the matrix array.
+  */
+CV_EXPORTS_W void merge(InputArrayOfArrays mv, OutputArray dst);
+
+/** @brief Divides a multi-channel array into several single-channel arrays.
+
+The function cv::split splits a multi-channel array into separate single-channel arrays:
+\f[\texttt{mv} [c](I) =  \texttt{src} (I)_c\f]
+If you need to extract a single channel or do some other sophisticated channel permutation, use
+mixChannels.
+
+The following example demonstrates how to split a 3-channel matrix into 3 single channel matrices.
+@snippet snippets/core_split.cpp example
+
+@param src input multi-channel array.
+@param mvbegin output array; the number of arrays must match src.channels(); the arrays themselves are
+reallocated, if needed.
+@sa merge, mixChannels, cvtColor
+*/
+CV_EXPORTS void split(const Mat& src, Mat* mvbegin);
+
+/** @overload
+@param m input multi-channel array.
+@param mv output vector of arrays; the arrays themselves are reallocated, if needed.
+*/
+CV_EXPORTS_W void split(InputArray m, OutputArrayOfArrays mv);
+
+/** @brief Copies specified channels from input arrays to the specified channels of
+output arrays.
+
+The function cv::mixChannels provides an advanced mechanism for shuffling image channels.
+
+cv::split, cv::merge, cv::extractChannel, cv::insertChannel and some forms of cv::cvtColor are partial cases of cv::mixChannels.
+
+In the example below, the code splits a 4-channel BGRA image into a 3-channel BGR (with B and R
+channels swapped) and a separate alpha-channel image:
+@code{.cpp}
+    Mat bgra( 100, 100, CV_8UC4, Scalar(255,0,0,255) );
+    Mat bgr( bgra.rows, bgra.cols, CV_8UC3 );
+    Mat alpha( bgra.rows, bgra.cols, CV_8UC1 );
+
+    // forming an array of matrices is a quite efficient operation,
+    // because the matrix data is not copied, only the headers
+    Mat out[] = { bgr, alpha };
+    // bgra[0] -> bgr[2], bgra[1] -> bgr[1],
+    // bgra[2] -> bgr[0], bgra[3] -> alpha[0]
+    int from_to[] = { 0,2, 1,1, 2,0, 3,3 };
+    mixChannels( &bgra, 1, out, 2, from_to, 4 );
+@endcode
+@note Unlike many other new-style C++ functions in OpenCV (see the introduction section and
+Mat::create ), cv::mixChannels requires the output arrays to be pre-allocated before calling the
+function.
+@param src input array or vector of matrices; all of the matrices must have the same size and the
+same depth.
+@param nsrcs number of matrices in `src`.
+@param dst output array or vector of matrices; all the matrices **must be allocated**; their size and
+depth must be the same as in `src[0]`.
+@param ndsts number of matrices in `dst`.
+@param fromTo array of index pairs specifying which channels are copied and where; fromTo[k\*2] is
+a 0-based index of the input channel in src, fromTo[k\*2+1] is an index of the output channel in
+dst; the continuous channel numbering is used: the first input image channels are indexed from 0 to
+src[0].channels()-1, the second input image channels are indexed from src[0].channels() to
+src[0].channels() + src[1].channels()-1, and so on, the same scheme is used for the output image
+channels; as a special case, when fromTo[k\*2] is negative, the corresponding output channel is
+filled with zero .
+@param npairs number of index pairs in `fromTo`.
+@sa split, merge, extractChannel, insertChannel, cvtColor
+*/
+CV_EXPORTS void mixChannels(const Mat* src, size_t nsrcs, Mat* dst, size_t ndsts,
+                            const int* fromTo, size_t npairs);
+
+/** @overload
+@param src input array or vector of matrices; all of the matrices must have the same size and the
+same depth.
+@param dst output array or vector of matrices; all the matrices **must be allocated**; their size and
+depth must be the same as in src[0].
+@param fromTo array of index pairs specifying which channels are copied and where; fromTo[k\*2] is
+a 0-based index of the input channel in src, fromTo[k\*2+1] is an index of the output channel in
+dst; the continuous channel numbering is used: the first input image channels are indexed from 0 to
+src[0].channels()-1, the second input image channels are indexed from src[0].channels() to
+src[0].channels() + src[1].channels()-1, and so on, the same scheme is used for the output image
+channels; as a special case, when fromTo[k\*2] is negative, the corresponding output channel is
+filled with zero .
+@param npairs number of index pairs in fromTo.
+*/
+CV_EXPORTS void mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
+                            const int* fromTo, size_t npairs);
+
+/** @overload
+@param src input array or vector of matrices; all of the matrices must have the same size and the
+same depth.
+@param dst output array or vector of matrices; all the matrices **must be allocated**; their size and
+depth must be the same as in src[0].
+@param fromTo array of index pairs specifying which channels are copied and where; fromTo[k\*2] is
+a 0-based index of the input channel in src, fromTo[k\*2+1] is an index of the output channel in
+dst; the continuous channel numbering is used: the first input image channels are indexed from 0 to
+src[0].channels()-1, the second input image channels are indexed from src[0].channels() to
+src[0].channels() + src[1].channels()-1, and so on, the same scheme is used for the output image
+channels; as a special case, when fromTo[k\*2] is negative, the corresponding output channel is
+filled with zero .
+*/
+CV_EXPORTS_W void mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
+                              const std::vector<int>& fromTo);
+
+/** @brief Extracts a single channel from src (coi is 0-based index)
+@param src input array
+@param dst output array
+@param coi index of channel to extract
+@sa mixChannels, split
+*/
+CV_EXPORTS_W void extractChannel(InputArray src, OutputArray dst, int coi);
+
+/** @brief Inserts a single channel to dst (coi is 0-based index)
+@param src input array
+@param dst output array
+@param coi index of channel for insertion
+@sa mixChannels, merge
+*/
+CV_EXPORTS_W void insertChannel(InputArray src, InputOutputArray dst, int coi);
+
+/** @brief Flips a 2D array around vertical, horizontal, or both axes.
+
+The function cv::flip flips the array in one of three different ways (row
+and column indices are 0-based):
+\f[\texttt{dst} _{ij} =
+\left\{
+\begin{array}{l l}
+\texttt{src} _{\texttt{src.rows}-i-1,j} & if\;  \texttt{flipCode} = 0 \\
+\texttt{src} _{i, \texttt{src.cols} -j-1} & if\;  \texttt{flipCode} > 0 \\
+\texttt{src} _{ \texttt{src.rows} -i-1, \texttt{src.cols} -j-1} & if\; \texttt{flipCode} < 0 \\
+\end{array}
+\right.\f]
+The example scenarios of using the function are the following:
+*   Vertical flipping of the image (flipCode == 0) to switch between
+    top-left and bottom-left image origin. This is a typical operation
+    in video processing on Microsoft Windows\* OS.
+*   Horizontal flipping of the image with the subsequent horizontal
+    shift and absolute difference calculation to check for a
+    vertical-axis symmetry (flipCode \> 0).
+*   Simultaneous horizontal and vertical flipping of the image with
+    the subsequent shift and absolute difference calculation to check
+    for a central symmetry (flipCode \< 0).
+*   Reversing the order of point arrays (flipCode \> 0 or
+    flipCode == 0).
+@param src input array.
+@param dst output array of the same size and type as src.
+@param flipCode a flag to specify how to flip the array; 0 means
+flipping around the x-axis and positive value (for example, 1) means
+flipping around y-axis. Negative value (for example, -1) means flipping
+around both axes.
+@sa transpose, repeat, completeSymm
+*/
+CV_EXPORTS_W void flip(InputArray src, OutputArray dst, int flipCode);
+
+/** @brief Flips an n-dimensional array along the given axis
+ *  @param src input array
+ *  @param dst output array that has the same shape as src
+ *  @param axis axis to flip along. 0 <= axis < src.dims.
+ */
+CV_EXPORTS_W void flipND(InputArray src, OutputArray dst, int axis);
+
+/** @brief Broadcast the given Mat to the given shape.
+ * @param src input array
+ * @param shape target shape. Should be a list of CV_32S numbers. Note that negative values are not supported.
+ * @param dst output array that has the given shape
+ */
+CV_EXPORTS_W void broadcast(InputArray src, InputArray shape, OutputArray dst);
+
+enum RotateFlags {
+    ROTATE_90_CLOCKWISE = 0, //!< Rotate 90 degrees clockwise
+    ROTATE_180 = 1, //!< Rotate 180 degrees
+    ROTATE_90_COUNTERCLOCKWISE = 2, //!< Rotate 90 degrees counterclockwise (equivalent to 270 degrees clockwise)
+};
+/** @brief Rotates a 2D array in multiples of 90 degrees.
+The function cv::rotate rotates the array in one of three different ways:
+*   Rotate by 90 degrees clockwise (rotateCode = ROTATE_90_CLOCKWISE).
+*   Rotate by 180 degrees clockwise (rotateCode = ROTATE_180).
+*   Rotate by 270 degrees clockwise (rotateCode = ROTATE_90_COUNTERCLOCKWISE).
+@param src input array.
+@param dst output array of the same type as src.  The size is the same with ROTATE_180,
+and the rows and cols are switched for ROTATE_90_CLOCKWISE and ROTATE_90_COUNTERCLOCKWISE.
+@param rotateCode an enum to specify how to rotate the array; see the enum #RotateFlags
+@sa transpose, repeat, completeSymm, flip, RotateFlags
+*/
+CV_EXPORTS_W void rotate(InputArray src, OutputArray dst, int rotateCode);
+
+/** @brief Fills the output array with repeated copies of the input array.
+
+The function cv::repeat duplicates the input array one or more times along each of the two axes:
+\f[\texttt{dst} _{ij}= \texttt{src} _{i\mod src.rows, \; j\mod src.cols }\f]
+The second variant of the function is more convenient to use with @ref MatrixExpressions.
+@param src input array to replicate.
+@param ny Flag to specify how many times the `src` is repeated along the
+vertical axis.
+@param nx Flag to specify how many times the `src` is repeated along the
+horizontal axis.
+@param dst output array of the same type as `src`.
+@sa cv::reduce
+*/
+CV_EXPORTS_W void repeat(InputArray src, int ny, int nx, OutputArray dst);
+
+/** @overload
+@param src input array to replicate.
+@param ny Flag to specify how many times the `src` is repeated along the
+vertical axis.
+@param nx Flag to specify how many times the `src` is repeated along the
+horizontal axis.
+  */
+CV_EXPORTS Mat repeat(const Mat& src, int ny, int nx);
+
+/** @brief Applies horizontal concatenation to given matrices.
+
+The function horizontally concatenates two or more cv::Mat matrices (with the same number of rows).
+@code{.cpp}
+    cv::Mat matArray[] = { cv::Mat(4, 1, CV_8UC1, cv::Scalar(1)),
+                           cv::Mat(4, 1, CV_8UC1, cv::Scalar(2)),
+                           cv::Mat(4, 1, CV_8UC1, cv::Scalar(3)),};
+
+    cv::Mat out;
+    cv::hconcat( matArray, 3, out );
+    //out:
+    //[1, 2, 3;
+    // 1, 2, 3;
+    // 1, 2, 3;
+    // 1, 2, 3]
+@endcode
+@param src input array or vector of matrices. all of the matrices must have the same number of rows and the same depth.
+@param nsrc number of matrices in src.
+@param dst output array. It has the same number of rows and depth as the src, and the sum of cols of the src.
+@sa cv::vconcat(const Mat*, size_t, OutputArray), @sa cv::vconcat(InputArrayOfArrays, OutputArray) and @sa cv::vconcat(InputArray, InputArray, OutputArray)
+*/
+CV_EXPORTS void hconcat(const Mat* src, size_t nsrc, OutputArray dst);
+/** @overload
+ @code{.cpp}
+    cv::Mat_<float> A = (cv::Mat_<float>(3, 2) << 1, 4,
+                                                  2, 5,
+                                                  3, 6);
+    cv::Mat_<float> B = (cv::Mat_<float>(3, 2) << 7, 10,
+                                                  8, 11,
+                                                  9, 12);
+
+    cv::Mat C;
+    cv::hconcat(A, B, C);
+    //C:
+    //[1, 4, 7, 10;
+    // 2, 5, 8, 11;
+    // 3, 6, 9, 12]
+ @endcode
+ @param src1 first input array to be considered for horizontal concatenation.
+ @param src2 second input array to be considered for horizontal concatenation.
+ @param dst output array. It has the same number of rows and depth as the src1 and src2, and the sum of cols of the src1 and src2.
+ */
+CV_EXPORTS void hconcat(InputArray src1, InputArray src2, OutputArray dst);
+/** @overload
+ @code{.cpp}
+    std::vector<cv::Mat> matrices = { cv::Mat(4, 1, CV_8UC1, cv::Scalar(1)),
+                                      cv::Mat(4, 1, CV_8UC1, cv::Scalar(2)),
+                                      cv::Mat(4, 1, CV_8UC1, cv::Scalar(3)),};
+
+    cv::Mat out;
+    cv::hconcat( matrices, out );
+    //out:
+    //[1, 2, 3;
+    // 1, 2, 3;
+    // 1, 2, 3;
+    // 1, 2, 3]
+ @endcode
+ @param src input array or vector of matrices. all of the matrices must have the same number of rows and the same depth.
+ @param dst output array. It has the same number of rows and depth as the src,
+ and the sum of cols of the src.
+ */
+CV_EXPORTS_W void hconcat(InputArrayOfArrays src, OutputArray dst);
+
+/** @brief Applies vertical concatenation to given matrices.
+
+The function vertically concatenates two or more cv::Mat matrices (with the same number of cols).
+@code{.cpp}
+    cv::Mat matArray[] = { cv::Mat(1, 4, CV_8UC1, cv::Scalar(1)),
+                           cv::Mat(1, 4, CV_8UC1, cv::Scalar(2)),
+                           cv::Mat(1, 4, CV_8UC1, cv::Scalar(3)),};
+
+    cv::Mat out;
+    cv::vconcat( matArray, 3, out );
+    //out:
+    //[1,   1,   1,   1;
+    // 2,   2,   2,   2;
+    // 3,   3,   3,   3]
+@endcode
+@param src input array or vector of matrices. all of the matrices must have the same number of cols and the same depth.
+@param nsrc number of matrices in src.
+@param dst output array. It has the same number of cols and depth as the src, and the sum of rows of the src.
+@sa cv::hconcat(const Mat*, size_t, OutputArray), @sa cv::hconcat(InputArrayOfArrays, OutputArray) and @sa cv::hconcat(InputArray, InputArray, OutputArray)
+*/
+CV_EXPORTS void vconcat(const Mat* src, size_t nsrc, OutputArray dst);
+/** @overload
+ @code{.cpp}
+    cv::Mat_<float> A = (cv::Mat_<float>(3, 2) << 1, 7,
+                                                  2, 8,
+                                                  3, 9);
+    cv::Mat_<float> B = (cv::Mat_<float>(3, 2) << 4, 10,
+                                                  5, 11,
+                                                  6, 12);
+
+    cv::Mat C;
+    cv::vconcat(A, B, C);
+    //C:
+    //[1, 7;
+    // 2, 8;
+    // 3, 9;
+    // 4, 10;
+    // 5, 11;
+    // 6, 12]
+ @endcode
+ @param src1 first input array to be considered for vertical concatenation.
+ @param src2 second input array to be considered for vertical concatenation.
+ @param dst output array. It has the same number of cols and depth as the src1 and src2, and the sum of rows of the src1 and src2.
+ */
+CV_EXPORTS void vconcat(InputArray src1, InputArray src2, OutputArray dst);
+/** @overload
+ @code{.cpp}
+    std::vector<cv::Mat> matrices = { cv::Mat(1, 4, CV_8UC1, cv::Scalar(1)),
+                                      cv::Mat(1, 4, CV_8UC1, cv::Scalar(2)),
+                                      cv::Mat(1, 4, CV_8UC1, cv::Scalar(3)),};
+
+    cv::Mat out;
+    cv::vconcat( matrices, out );
+    //out:
+    //[1,   1,   1,   1;
+    // 2,   2,   2,   2;
+    // 3,   3,   3,   3]
+ @endcode
+ @param src input array or vector of matrices. all of the matrices must have the same number of cols and the same depth.
+ @param dst output array. It has the same number of cols and depth as the src,
+ and the sum of rows of the src.
+ */
+CV_EXPORTS_W void vconcat(InputArrayOfArrays src, OutputArray dst);
+
+/** @brief computes bitwise conjunction of the two arrays (dst = src1 & src2)
+Calculates the per-element bit-wise conjunction of two arrays or an
+array and a scalar.
+
+The function cv::bitwise_and calculates the per-element bit-wise logical conjunction for:
+*   Two arrays when src1 and src2 have the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  \wedge \texttt{src2} (I) \quad \texttt{if mask} (I) \ne0\f]
+*   An array and a scalar when src2 is constructed from Scalar or has
+    the same number of elements as `src1.channels()`:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  \wedge \texttt{src2} \quad \texttt{if mask} (I) \ne0\f]
+*   A scalar and an array when src1 is constructed from Scalar or has
+    the same number of elements as `src2.channels()`:
+    \f[\texttt{dst} (I) =  \texttt{src1}  \wedge \texttt{src2} (I) \quad \texttt{if mask} (I) \ne0\f]
+In case of floating-point arrays, their machine-specific bit
+representations (usually IEEE754-compliant) are used for the operation.
+In case of multi-channel arrays, each channel is processed
+independently. In the second and third cases above, the scalar is first
+converted to the array type.
+@param src1 first input array or a scalar.
+@param src2 second input array or a scalar.
+@param dst output array that has the same size and type as the input
+arrays.
+@param mask optional operation mask, 8-bit single channel array, that
+specifies elements of the output array to be changed.
+*/
+CV_EXPORTS_W void bitwise_and(InputArray src1, InputArray src2,
+                              OutputArray dst, InputArray mask = noArray());
+
+/** @brief Calculates the per-element bit-wise disjunction of two arrays or an
+array and a scalar.
+
+The function cv::bitwise_or calculates the per-element bit-wise logical disjunction for:
+*   Two arrays when src1 and src2 have the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  \vee \texttt{src2} (I) \quad \texttt{if mask} (I) \ne0\f]
+*   An array and a scalar when src2 is constructed from Scalar or has
+    the same number of elements as `src1.channels()`:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  \vee \texttt{src2} \quad \texttt{if mask} (I) \ne0\f]
+*   A scalar and an array when src1 is constructed from Scalar or has
+    the same number of elements as `src2.channels()`:
+    \f[\texttt{dst} (I) =  \texttt{src1}  \vee \texttt{src2} (I) \quad \texttt{if mask} (I) \ne0\f]
+In case of floating-point arrays, their machine-specific bit
+representations (usually IEEE754-compliant) are used for the operation.
+In case of multi-channel arrays, each channel is processed
+independently. In the second and third cases above, the scalar is first
+converted to the array type.
+@param src1 first input array or a scalar.
+@param src2 second input array or a scalar.
+@param dst output array that has the same size and type as the input
+arrays.
+@param mask optional operation mask, 8-bit single channel array, that
+specifies elements of the output array to be changed.
+*/
+CV_EXPORTS_W void bitwise_or(InputArray src1, InputArray src2,
+                             OutputArray dst, InputArray mask = noArray());
+
+/** @brief Calculates the per-element bit-wise "exclusive or" operation on two
+arrays or an array and a scalar.
+
+The function cv::bitwise_xor calculates the per-element bit-wise logical "exclusive-or"
+operation for:
+*   Two arrays when src1 and src2 have the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  \oplus \texttt{src2} (I) \quad \texttt{if mask} (I) \ne0\f]
+*   An array and a scalar when src2 is constructed from Scalar or has
+    the same number of elements as `src1.channels()`:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  \oplus \texttt{src2} \quad \texttt{if mask} (I) \ne0\f]
+*   A scalar and an array when src1 is constructed from Scalar or has
+    the same number of elements as `src2.channels()`:
+    \f[\texttt{dst} (I) =  \texttt{src1}  \oplus \texttt{src2} (I) \quad \texttt{if mask} (I) \ne0\f]
+In case of floating-point arrays, their machine-specific bit
+representations (usually IEEE754-compliant) are used for the operation.
+In case of multi-channel arrays, each channel is processed
+independently. In the 2nd and 3rd cases above, the scalar is first
+converted to the array type.
+@param src1 first input array or a scalar.
+@param src2 second input array or a scalar.
+@param dst output array that has the same size and type as the input
+arrays.
+@param mask optional operation mask, 8-bit single channel array, that
+specifies elements of the output array to be changed.
+*/
+CV_EXPORTS_W void bitwise_xor(InputArray src1, InputArray src2,
+                              OutputArray dst, InputArray mask = noArray());
+
+/** @brief  Inverts every bit of an array.
+
+The function cv::bitwise_not calculates per-element bit-wise inversion of the input
+array:
+\f[\texttt{dst} (I) =  \neg \texttt{src} (I)\f]
+In case of a floating-point input array, its machine-specific bit
+representation (usually IEEE754-compliant) is used for the operation. In
+case of multi-channel arrays, each channel is processed independently.
+@param src input array.
+@param dst output array that has the same size and type as the input
+array.
+@param mask optional operation mask, 8-bit single channel array, that
+specifies elements of the output array to be changed.
+*/
+CV_EXPORTS_W void bitwise_not(InputArray src, OutputArray dst,
+                              InputArray mask = noArray());
+
+/** @brief Calculates the per-element absolute difference between two arrays or between an array and a scalar.
+
+The function cv::absdiff calculates:
+*   Absolute difference between two arrays when they have the same
+    size and type:
+    \f[\texttt{dst}(I) =  \texttt{saturate} (| \texttt{src1}(I) -  \texttt{src2}(I)|)\f]
+*   Absolute difference between an array and a scalar when the second
+    array is constructed from Scalar or has as many elements as the
+    number of channels in `src1`:
+    \f[\texttt{dst}(I) =  \texttt{saturate} (| \texttt{src1}(I) -  \texttt{src2} |)\f]
+*   Absolute difference between a scalar and an array when the first
+    array is constructed from Scalar or has as many elements as the
+    number of channels in `src2`:
+    \f[\texttt{dst}(I) =  \texttt{saturate} (| \texttt{src1} -  \texttt{src2}(I) |)\f]
+    where I is a multi-dimensional index of array elements. In case of
+    multi-channel arrays, each channel is processed independently.
+@note Saturation is not applied when the arrays have the depth CV_32S.
+You may even get a negative value in the case of overflow.
+@note (Python) Be careful about the difference in behaviour between src1/src2 being a single number and being a tuple/array.
+`absdiff(src,X)` means `absdiff(src,(X,X,X,X))`.
+`absdiff(src,(X,))` means `absdiff(src,(X,0,0,0))`.
+@param src1 first input array or a scalar.
+@param src2 second input array or a scalar.
+@param dst output array that has the same size and type as input arrays.
+@sa cv::abs(const Mat&)
+*/
+CV_EXPORTS_W void absdiff(InputArray src1, InputArray src2, OutputArray dst);
+
+/** @brief  This is an overloaded member function, provided for convenience (python)
+Copies the matrix to another one.
+When the operation mask is specified and dst has to be reallocated (see Mat::create), the newly allocated matrix is initialized with all zeros before copying the data.
+@param src source matrix.
+@param dst Destination matrix. If it does not have a proper size or type before the operation, it is
+reallocated.
+@param mask Operation mask of the same size as src. Its non-zero elements indicate which matrix
+elements need to be copied. The mask has to be of type CV_8U and can have 1 or multiple channels.
+*/
+
+void CV_EXPORTS_W copyTo(InputArray src, OutputArray dst, InputArray mask);
+/** @brief  Checks if array elements lie between the elements of two other arrays.
+
+The function checks the range as follows:
+-   For every element of a single-channel input array:
+    \f[\texttt{dst} (I)= \texttt{lowerb} (I)_0  \leq \texttt{src} (I)_0 \leq  \texttt{upperb} (I)_0\f]
+-   For two-channel arrays:
+    \f[\texttt{dst} (I)= \texttt{lowerb} (I)_0  \leq \texttt{src} (I)_0 \leq  \texttt{upperb} (I)_0  \land \texttt{lowerb} (I)_1  \leq \texttt{src} (I)_1 \leq  \texttt{upperb} (I)_1\f]
+-   and so forth.
+
+That is, dst (I) is set to 255 (all 1-bits) if src (I) is within the
+specified 1D, 2D, 3D, ... box and 0 otherwise.
+
+When the lower and/or upper boundary parameters are scalars, the indexes
+(I) at lowerb and upperb in the above formulas should be omitted.
+@param src first input array.
+@param lowerb inclusive lower boundary array or a scalar.
+@param upperb inclusive upper boundary array or a scalar.
+@param dst output array of the same size as src and CV_8U type.
+*/
+CV_EXPORTS_W void inRange(InputArray src, InputArray lowerb,
+                          InputArray upperb, OutputArray dst);
+
+/** @brief Performs the per-element comparison of two arrays or an array and scalar value.
+
+The function compares:
+*   Elements of two arrays when src1 and src2 have the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  \,\texttt{cmpop}\, \texttt{src2} (I)\f]
+*   Elements of src1 with a scalar src2 when src2 is constructed from
+    Scalar or has a single element:
+    \f[\texttt{dst} (I) =  \texttt{src1}(I) \,\texttt{cmpop}\,  \texttt{src2}\f]
+*   src1 with elements of src2 when src1 is constructed from Scalar or
+    has a single element:
+    \f[\texttt{dst} (I) =  \texttt{src1}  \,\texttt{cmpop}\, \texttt{src2} (I)\f]
+When the comparison result is true, the corresponding element of output
+array is set to 255. The comparison operations can be replaced with the
+equivalent matrix expressions:
+@code{.cpp}
+    Mat dst1 = src1 >= src2;
+    Mat dst2 = src1 < 8;
+    ...
+@endcode
+@param src1 first input array or a scalar; when it is an array, it must have a single channel.
+@param src2 second input array or a scalar; when it is an array, it must have a single channel.
+@param dst output array of type CV_8U that has the same size and the same number of channels as
+    the input arrays.
+@param cmpop a flag, that specifies correspondence between the arrays (cv::CmpTypes)
+@sa checkRange, min, max, threshold
+*/
+CV_EXPORTS_W void compare(InputArray src1, InputArray src2, OutputArray dst, int cmpop);
+
+/** @brief Calculates per-element minimum of two arrays or an array and a scalar.
+
+The function cv::min calculates the per-element minimum of two arrays:
+\f[\texttt{dst} (I)= \min ( \texttt{src1} (I), \texttt{src2} (I))\f]
+or array and a scalar:
+\f[\texttt{dst} (I)= \min ( \texttt{src1} (I), \texttt{value} )\f]
+@param src1 first input array.
+@param src2 second input array of the same size and type as src1.
+@param dst output array of the same size and type as src1.
+@sa max, compare, inRange, minMaxLoc
+*/
+CV_EXPORTS_W void min(InputArray src1, InputArray src2, OutputArray dst);
+/** @overload
+needed to avoid conflicts with const _Tp& std::min(const _Tp&, const _Tp&, _Compare)
+*/
+CV_EXPORTS void min(const Mat& src1, const Mat& src2, Mat& dst);
+/** @overload
+needed to avoid conflicts with const _Tp& std::min(const _Tp&, const _Tp&, _Compare)
+*/
+CV_EXPORTS void min(const UMat& src1, const UMat& src2, UMat& dst);
+
+/** @brief Calculates per-element maximum of two arrays or an array and a scalar.
+
+The function cv::max calculates the per-element maximum of two arrays:
+\f[\texttt{dst} (I)= \max ( \texttt{src1} (I), \texttt{src2} (I))\f]
+or array and a scalar:
+\f[\texttt{dst} (I)= \max ( \texttt{src1} (I), \texttt{value} )\f]
+@param src1 first input array.
+@param src2 second input array of the same size and type as src1 .
+@param dst output array of the same size and type as src1.
+@sa  min, compare, inRange, minMaxLoc, @ref MatrixExpressions
+*/
+CV_EXPORTS_W void max(InputArray src1, InputArray src2, OutputArray dst);
+/** @overload
+needed to avoid conflicts with const _Tp& std::max(const _Tp&, const _Tp&, _Compare)
+*/
+CV_EXPORTS void max(const Mat& src1, const Mat& src2, Mat& dst);
+/** @overload
+needed to avoid conflicts with const _Tp& std::max(const _Tp&, const _Tp&, _Compare)
+*/
+CV_EXPORTS void max(const UMat& src1, const UMat& src2, UMat& dst);
+
+/** @brief Calculates a square root of array elements.
+
+The function cv::sqrt calculates a square root of each input array element.
+In case of multi-channel arrays, each channel is processed
+independently. The accuracy is approximately the same as of the built-in
+std::sqrt .
+@param src input floating-point array.
+@param dst output array of the same size and type as src.
+*/
+CV_EXPORTS_W void sqrt(InputArray src, OutputArray dst);
+
+/** @brief Raises every array element to a power.
+
+The function cv::pow raises every element of the input array to power :
+\f[\texttt{dst} (I) =  \fork{\texttt{src}(I)^{power}}{if \(\texttt{power}\) is integer}{|\texttt{src}(I)|^{power}}{otherwise}\f]
+
+So, for a non-integer power exponent, the absolute values of input array
+elements are used. However, it is possible to get true values for
+negative values using some extra operations. In the example below,
+computing the 5th root of array src shows:
+@code{.cpp}
+    Mat mask = src < 0;
+    pow(src, 1./5, dst);
+    subtract(Scalar::all(0), dst, dst, mask);
+@endcode
+For some values of power, such as integer values, 0.5 and -0.5,
+specialized faster algorithms are used.
+
+Special values (NaN, Inf) are not handled.
+@param src input array.
+@param power exponent of power.
+@param dst output array of the same size and type as src.
+@sa sqrt, exp, log, cartToPolar, polarToCart
+*/
+CV_EXPORTS_W void pow(InputArray src, double power, OutputArray dst);
+
+/** @brief Calculates the exponent of every array element.
+
+The function cv::exp calculates the exponent of every element of the input
+array:
+\f[\texttt{dst} (I) = e^{ src(I) }\f]
+
+The maximum relative error is about 7e-6 for single-precision input and
+less than 1e-10 for double-precision input. Currently, the function
+converts denormalized values to zeros on output. Special values (NaN,
+Inf) are not handled.
+@param src input array.
+@param dst output array of the same size and type as src.
+@sa log, cartToPolar, polarToCart, phase, pow, sqrt, magnitude
+*/
+CV_EXPORTS_W void exp(InputArray src, OutputArray dst);
+
+/** @brief Calculates the natural logarithm of every array element.
+
+The function cv::log calculates the natural logarithm of every element of the input array:
+\f[\texttt{dst} (I) =  \log (\texttt{src}(I)) \f]
+
+Output on zero, negative and special (NaN, Inf) values is undefined.
+
+@param src input array.
+@param dst output array of the same size and type as src .
+@sa exp, cartToPolar, polarToCart, phase, pow, sqrt, magnitude
+*/
+CV_EXPORTS_W void log(InputArray src, OutputArray dst);
+
+/** @brief Calculates x and y coordinates of 2D vectors from their magnitude and angle.
+
+The function cv::polarToCart calculates the Cartesian coordinates of each 2D
+vector represented by the corresponding elements of magnitude and angle:
+\f[\begin{array}{l} \texttt{x} (I) =  \texttt{magnitude} (I) \cos ( \texttt{angle} (I)) \\ \texttt{y} (I) =  \texttt{magnitude} (I) \sin ( \texttt{angle} (I)) \\ \end{array}\f]
+
+The relative accuracy of the estimated coordinates is about 1e-6.
+@param magnitude input floating-point array of magnitudes of 2D vectors;
+it can be an empty matrix (=Mat()), in this case, the function assumes
+that all the magnitudes are =1; if it is not empty, it must have the
+same size and type as angle.
+@param angle input floating-point array of angles of 2D vectors.
+@param x output array of x-coordinates of 2D vectors; it has the same
+size and type as angle.
+@param y output array of y-coordinates of 2D vectors; it has the same
+size and type as angle.
+@param angleInDegrees when true, the input angles are measured in
+degrees, otherwise, they are measured in radians.
+@sa cartToPolar, magnitude, phase, exp, log, pow, sqrt
+*/
+CV_EXPORTS_W void polarToCart(InputArray magnitude, InputArray angle,
+                              OutputArray x, OutputArray y, bool angleInDegrees = false);
+
+/** @brief Calculates the magnitude and angle of 2D vectors.
+
+The function cv::cartToPolar calculates either the magnitude, angle, or both
+for every 2D vector (x(I),y(I)):
+\f[\begin{array}{l} \texttt{magnitude} (I)= \sqrt{\texttt{x}(I)^2+\texttt{y}(I)^2} , \\ \texttt{angle} (I)= \texttt{atan2} ( \texttt{y} (I), \texttt{x} (I))[ \cdot180 / \pi ] \end{array}\f]
+
+The angles are calculated with accuracy about 0.3 degrees. For the point
+(0,0), the angle is set to 0.
+@param x array of x-coordinates; this must be a single-precision or
+double-precision floating-point array.
+@param y array of y-coordinates, that must have the same size and same type as x.
+@param magnitude output array of magnitudes of the same size and type as x.
+@param angle output array of angles that has the same size and type as
+x; the angles are measured in radians (from 0 to 2\*Pi) or in degrees (0 to 360 degrees).
+@param angleInDegrees a flag, indicating whether the angles are measured
+in radians (which is by default), or in degrees.
+@sa Sobel, Scharr
+*/
+CV_EXPORTS_W void cartToPolar(InputArray x, InputArray y,
+                              OutputArray magnitude, OutputArray angle,
+                              bool angleInDegrees = false);
+
+/** @brief Calculates the rotation angle of 2D vectors.
+
+The function cv::phase calculates the rotation angle of each 2D vector that
+is formed from the corresponding elements of x and y :
+\f[\texttt{angle} (I) =  \texttt{atan2} ( \texttt{y} (I), \texttt{x} (I))\f]
+
+The angle estimation accuracy is about 0.3 degrees. When x(I)=y(I)=0 ,
+the corresponding angle(I) is set to 0.
+@param x input floating-point array of x-coordinates of 2D vectors.
+@param y input array of y-coordinates of 2D vectors; it must have the
+same size and the same type as x.
+@param angle output array of vector angles; it has the same size and
+same type as x .
+@param angleInDegrees when true, the function calculates the angle in
+degrees, otherwise, they are measured in radians.
+*/
+CV_EXPORTS_W void phase(InputArray x, InputArray y, OutputArray angle,
+                        bool angleInDegrees = false);
+
+/** @brief Calculates the magnitude of 2D vectors.
+
+The function cv::magnitude calculates the magnitude of 2D vectors formed
+from the corresponding elements of x and y arrays:
+\f[\texttt{dst} (I) =  \sqrt{\texttt{x}(I)^2 + \texttt{y}(I)^2}\f]
+@param x floating-point array of x-coordinates of the vectors.
+@param y floating-point array of y-coordinates of the vectors; it must
+have the same size as x.
+@param magnitude output array of the same size and type as x.
+@sa cartToPolar, polarToCart, phase, sqrt
+*/
+CV_EXPORTS_W void magnitude(InputArray x, InputArray y, OutputArray magnitude);
+
+/** @brief Checks every element of an input array for invalid values.
+
+The function cv::checkRange checks that every array element is neither NaN nor infinite. When minVal \>
+-DBL_MAX and maxVal \< DBL_MAX, the function also checks that each value is between minVal and
+maxVal. In case of multi-channel arrays, each channel is processed independently. If some values
+are out of range, position of the first outlier is stored in pos (when pos != NULL). Then, the
+function either returns false (when quiet=true) or throws an exception.
+@param a input array.
+@param quiet a flag, indicating whether the functions quietly return false when the array elements
+are out of range or they throw an exception.
+@param pos optional output parameter, when not NULL, must be a pointer to array of src.dims
+elements.
+@param minVal inclusive lower boundary of valid values range.
+@param maxVal exclusive upper boundary of valid values range.
+*/
+CV_EXPORTS_W bool checkRange(InputArray a, bool quiet = true, CV_OUT Point* pos = 0,
+                            double minVal = -DBL_MAX, double maxVal = DBL_MAX);
+
+/** @brief Replaces NaNs with the given number.
+@param a input/output matrix (CV_32F type).
+@param val value that replaces the NaNs.
+*/
+CV_EXPORTS_W void patchNaNs(InputOutputArray a, double val = 0);
+
+/** @brief Performs generalized matrix multiplication.
+
+The function cv::gemm performs generalized matrix multiplication similar to the
+gemm functions in BLAS level 3. For example,
+`gemm(src1, src2, alpha, src3, beta, dst, GEMM_1_T + GEMM_3_T)`
+corresponds to
+\f[\texttt{dst} =  \texttt{alpha} \cdot \texttt{src1} ^T  \cdot \texttt{src2} +  \texttt{beta} \cdot \texttt{src3} ^T\f]
+
+In case of complex (two-channel) data, the function performs a complex
+matrix multiplication.
+
+The function can be replaced with a matrix expression. For example, the
+above call can be replaced with:
+@code{.cpp}
+    dst = alpha*src1.t()*src2 + beta*src3.t();
+@endcode
+@param src1 first multiplied input matrix that could be real(CV_32FC1,
+CV_64FC1) or complex(CV_32FC2, CV_64FC2).
+@param src2 second multiplied input matrix of the same type as src1.
+@param alpha weight of the matrix product.
+@param src3 third optional delta matrix added to the matrix product; it
+should have the same type as src1 and src2.
+@param beta weight of src3.
+@param dst output matrix; it has the proper size and the same type as
+input matrices.
+@param flags operation flags (cv::GemmFlags)
+@sa mulTransposed, transform
+*/
+CV_EXPORTS_W void gemm(InputArray src1, InputArray src2, double alpha,
+                       InputArray src3, double beta, OutputArray dst, int flags = 0);
+
+/** @brief Calculates the product of a matrix and its transposition.
+
+The function cv::mulTransposed calculates the product of src and its
+transposition:
+\f[\texttt{dst} = \texttt{scale} ( \texttt{src} - \texttt{delta} )^T ( \texttt{src} - \texttt{delta} )\f]
+if aTa=true, and
+\f[\texttt{dst} = \texttt{scale} ( \texttt{src} - \texttt{delta} ) ( \texttt{src} - \texttt{delta} )^T\f]
+otherwise. The function is used to calculate the covariance matrix. With
+zero delta, it can be used as a faster substitute for general matrix
+product A\*B when B=A'
+@param src input single-channel matrix. Note that unlike gemm, the
+function can multiply not only floating-point matrices.
+@param dst output square matrix.
+@param aTa Flag specifying the multiplication ordering. See the
+description above.
+@param delta Optional delta matrix subtracted from src before the
+multiplication. When the matrix is empty ( delta=noArray() ), it is
+assumed to be zero, that is, nothing is subtracted. If it has the same
+size as src, it is simply subtracted. Otherwise, it is "repeated" (see
+repeat ) to cover the full src and then subtracted. Type of the delta
+matrix, when it is not empty, must be the same as the type of created
+output matrix. See the dtype parameter description below.
+@param scale Optional scale factor for the matrix product.
+@param dtype Optional type of the output matrix. When it is negative,
+the output matrix will have the same type as src . Otherwise, it will be
+type=CV_MAT_DEPTH(dtype) that should be either CV_32F or CV_64F .
+@sa calcCovarMatrix, gemm, repeat, reduce
+*/
+CV_EXPORTS_W void mulTransposed( InputArray src, OutputArray dst, bool aTa,
+                                 InputArray delta = noArray(),
+                                 double scale = 1, int dtype = -1 );
+
+/** @brief Transposes a matrix.
+
+The function cv::transpose transposes the matrix src :
+\f[\texttt{dst} (i,j) =  \texttt{src} (j,i)\f]
+@note No complex conjugation is done in case of a complex matrix. It
+should be done separately if needed.
+@param src input array.
+@param dst output array of the same type as src.
+*/
+CV_EXPORTS_W void transpose(InputArray src, OutputArray dst);
+
+/** @brief Transpose for n-dimensional matrices.
+ *
+ * @note Input should be continuous single-channel matrix.
+ * @param src input array.
+ * @param order a permutation of [0,1,..,N-1] where N is the number of axes of src.
+ * The i'th axis of dst will correspond to the axis numbered order[i] of the input.
+ * @param dst output array of the same type as src.
+ */
+CV_EXPORTS_W void transposeND(InputArray src, const std::vector<int>& order, OutputArray dst);
+
+/** @brief Performs the matrix transformation of every array element.
+
+The function cv::transform performs the matrix transformation of every
+element of the array src and stores the results in dst :
+\f[\texttt{dst} (I) =  \texttt{m} \cdot \texttt{src} (I)\f]
+(when m.cols=src.channels() ), or
+\f[\texttt{dst} (I) =  \texttt{m} \cdot [ \texttt{src} (I); 1]\f]
+(when m.cols=src.channels()+1 )
+
+Every element of the N -channel array src is interpreted as N -element
+vector that is transformed using the M x N or M x (N+1) matrix m to
+M-element vector - the corresponding element of the output array dst .
+
+The function may be used for geometrical transformation of
+N -dimensional points, arbitrary linear color space transformation (such
+as various kinds of RGB to YUV transforms), shuffling the image
+channels, and so forth.
+@param src input array that must have as many channels (1 to 4) as
+m.cols or m.cols-1.
+@param dst output array of the same size and depth as src; it has as
+many channels as m.rows.
+@param m floating-point transformation matrix of size M x N or M x (N+1), for example 2x2 or 2x3.
+@sa perspectiveTransform, getAffineTransform, estimateAffine2D, warpAffine, warpPerspective
+*/
+CV_EXPORTS_W void transform(InputArray src, OutputArray dst, InputArray m );
+
+/** @brief Performs the perspective matrix transformation of vectors.
+
+The function cv::perspectiveTransform transforms every element of src by
+treating it as a 2D or 3D vector, in the following way:
+\f[(x, y, z)  \rightarrow (x'/w, y'/w, z'/w)\f]
+where
+\f[(x', y', z', w') =  \texttt{m} \cdot \begin{bmatrix} x & y & z & 1  \end{bmatrix}\f]
+and
+\f[w =  \fork{w'}{if \(w' \ne 0\)}{\infty}{otherwise}\f]
+
+Here a 3D vector transformation is shown. In case of a 2D vector
+transformation, the z component is omitted.
+
+@note The function transforms a sparse set of 2D or 3D vectors. If you
+want to transform an image using perspective transformation, use
+warpPerspective . If you have an inverse problem, that is, you want to
+compute the most probable perspective transformation out of several
+pairs of corresponding points, you can use getPerspectiveTransform or
+findHomography .
+@param src input two-channel or three-channel floating-point array; each
+element is a 2D/3D vector to be transformed.
+@param dst output array of the same size and type as src.
+@param m 3x3 or 4x4 floating-point transformation matrix.
+@sa  transform, warpPerspective, getPerspectiveTransform, findHomography
+*/
+CV_EXPORTS_W void perspectiveTransform(InputArray src, OutputArray dst, InputArray m );
+
+/** @brief Copies the lower or the upper half of a square matrix to the other half.
+
+The function cv::completeSymm copies the lower or the upper half of a square matrix to
+the other half. The matrix diagonal remains unchanged:
+ - \f$\texttt{m}_{ij}=\texttt{m}_{ji}\f$ for \f$i > j\f$ if
+    lowerToUpper=false
+ - \f$\texttt{m}_{ij}=\texttt{m}_{ji}\f$ for \f$i < j\f$ if
+    lowerToUpper=true
+
+@param m input-output floating-point square matrix.
+@param lowerToUpper operation flag; if true, the lower half is copied to
+the upper half. Otherwise, the upper half is copied to the lower half.
+@sa flip, transpose
+*/
+CV_EXPORTS_W void completeSymm(InputOutputArray m, bool lowerToUpper = false);
+
+/** @brief Initializes a scaled identity matrix.
+
+The function cv::setIdentity initializes a scaled identity matrix:
+\f[\texttt{mtx} (i,j)= \fork{\texttt{s}}{ if \(i=j\)}{0}{otherwise}\f]
+
+The function can also be emulated using the matrix initializers and the
+matrix expressions:
+@code
+    Mat A = Mat::eye(4, 3, CV_32F)*5;
+    // A will be set to [[5, 0, 0], [0, 5, 0], [0, 0, 5], [0, 0, 0]]
+@endcode
+@param mtx matrix to initialize (not necessarily square).
+@param s value to assign to diagonal elements.
+@sa Mat::zeros, Mat::ones, Mat::setTo, Mat::operator=
+*/
+CV_EXPORTS_W void setIdentity(InputOutputArray mtx, const Scalar& s = Scalar(1));
+
+/** @brief Returns the determinant of a square floating-point matrix.
+
+The function cv::determinant calculates and returns the determinant of the
+specified matrix. For small matrices ( mtx.cols=mtx.rows\<=3 ), the
+direct method is used. For larger matrices, the function uses LU
+factorization with partial pivoting.
+
+For symmetric positive-definite matrices, it is also possible to use
+eigen decomposition to calculate the determinant.
+@param mtx input matrix that must have CV_32FC1 or CV_64FC1 type and
+square size.
+@sa trace, invert, solve, eigen, @ref MatrixExpressions
+*/
+CV_EXPORTS_W double determinant(InputArray mtx);
+
+/** @brief Returns the trace of a matrix.
+
+The function cv::trace returns the sum of the diagonal elements of the
+matrix mtx .
+\f[\mathrm{tr} ( \texttt{mtx} ) =  \sum _i  \texttt{mtx} (i,i)\f]
+@param mtx input matrix.
+*/
+CV_EXPORTS_W Scalar trace(InputArray mtx);
+
+/** @brief Finds the inverse or pseudo-inverse of a matrix.
+
+The function cv::invert inverts the matrix src and stores the result in dst
+. When the matrix src is singular or non-square, the function calculates
+the pseudo-inverse matrix (the dst matrix) so that norm(src\*dst - I) is
+minimal, where I is an identity matrix.
+
+In case of the #DECOMP_LU method, the function returns non-zero value if
+the inverse has been successfully calculated and 0 if src is singular.
+
+In case of the #DECOMP_SVD method, the function returns the inverse
+condition number of src (the ratio of the smallest singular value to the
+largest singular value) and 0 if src is singular. The SVD method
+calculates a pseudo-inverse matrix if src is singular.
+
+Similarly to #DECOMP_LU, the method #DECOMP_CHOLESKY works only with
+non-singular square matrices that should also be symmetric and
+positive definite. In this case, the function stores the inverted
+matrix in dst and returns non-zero. Otherwise, it returns 0.
+
+@param src input floating-point M x N matrix.
+@param dst output matrix of N x M size and the same type as src.
+@param flags inversion method (cv::DecompTypes)
+@sa solve, SVD
+*/
+CV_EXPORTS_W double invert(InputArray src, OutputArray dst, int flags = DECOMP_LU);
+
+/** @brief Solves one or more linear systems or least-squares problems.
+
+The function cv::solve solves a linear system or least-squares problem (the
+latter is possible with SVD or QR methods, or by specifying the flag
+#DECOMP_NORMAL ):
+\f[\texttt{dst} =  \arg \min _X \| \texttt{src1} \cdot \texttt{X} -  \texttt{src2} \|\f]
+
+If #DECOMP_LU or #DECOMP_CHOLESKY method is used, the function returns 1
+if src1 (or \f$\texttt{src1}^T\texttt{src1}\f$ ) is non-singular. Otherwise,
+it returns 0. In the latter case, dst is not valid. Other methods find a
+pseudo-solution in case of a singular left-hand side part.
+
+@note If you want to find a unity-norm solution of an under-defined
+singular system \f$\texttt{src1}\cdot\texttt{dst}=0\f$ , the function solve
+will not do the work. Use SVD::solveZ instead.
+
+@param src1 input matrix on the left-hand side of the system.
+@param src2 input matrix on the right-hand side of the system.
+@param dst output solution.
+@param flags solution (matrix inversion) method (#DecompTypes)
+@sa invert, SVD, eigen
+*/
+CV_EXPORTS_W bool solve(InputArray src1, InputArray src2,
+                        OutputArray dst, int flags = DECOMP_LU);
+
+/** @brief Sorts each row or each column of a matrix.
+
+The function cv::sort sorts each matrix row or each matrix column in
+ascending or descending order. So you should pass two operation flags to
+get desired behaviour. If you want to sort matrix rows or columns
+lexicographically, you can use STL std::sort generic function with the
+proper comparison predicate.
+
+@param src input single-channel array.
+@param dst output array of the same size and type as src.
+@param flags operation flags, a combination of #SortFlags
+@sa sortIdx, randShuffle
+*/
+CV_EXPORTS_W void sort(InputArray src, OutputArray dst, int flags);
+
+/** @brief Sorts each row or each column of a matrix.
+
+The function cv::sortIdx sorts each matrix row or each matrix column in the
+ascending or descending order. So you should pass two operation flags to
+get desired behaviour. Instead of reordering the elements themselves, it
+stores the indices of sorted elements in the output array. For example:
+@code
+    Mat A = Mat::eye(3,3,CV_32F), B;
+    sortIdx(A, B, SORT_EVERY_ROW + SORT_ASCENDING);
+    // B will probably contain
+    // (because of equal elements in A some permutations are possible):
+    // [[1, 2, 0], [0, 2, 1], [0, 1, 2]]
+@endcode
+@param src input single-channel array.
+@param dst output integer array of the same size as src.
+@param flags operation flags that could be a combination of cv::SortFlags
+@sa sort, randShuffle
+*/
+CV_EXPORTS_W void sortIdx(InputArray src, OutputArray dst, int flags);
+
+/** @brief Finds the real roots of a cubic equation.
+
+The function solveCubic finds the real roots of a cubic equation:
+-   if coeffs is a 4-element vector:
+\f[\texttt{coeffs} [0] x^3 +  \texttt{coeffs} [1] x^2 +  \texttt{coeffs} [2] x +  \texttt{coeffs} [3] = 0\f]
+-   if coeffs is a 3-element vector:
+\f[x^3 +  \texttt{coeffs} [0] x^2 +  \texttt{coeffs} [1] x +  \texttt{coeffs} [2] = 0\f]
+
+The roots are stored in the roots array.
+@param coeffs equation coefficients, an array of 3 or 4 elements.
+@param roots output array of real roots that has 1 or 3 elements.
+@return number of real roots found; a cubic equation can have up to 3 of them.
+*/
+CV_EXPORTS_W int solveCubic(InputArray coeffs, OutputArray roots);
+
+/** @brief Finds the real or complex roots of a polynomial equation.
+
+The function cv::solvePoly finds real and complex roots of a polynomial equation:
+\f[\texttt{coeffs} [n] x^{n} +  \texttt{coeffs} [n-1] x^{n-1} + ... +  \texttt{coeffs} [1] x +  \texttt{coeffs} [0] = 0\f]
+@param coeffs array of polynomial coefficients.
+@param roots output (complex) array of roots.
+@param maxIters maximum number of iterations the algorithm does.
+*/
+CV_EXPORTS_W double solvePoly(InputArray coeffs, OutputArray roots, int maxIters = 300);
+
+/** @brief Calculates eigenvalues and eigenvectors of a symmetric matrix.
+
+The function cv::eigen calculates just eigenvalues, or eigenvalues and eigenvectors of the symmetric
+matrix src:
+@code
+    src*eigenvectors.row(i).t() = eigenvalues.at<srcType>(i)*eigenvectors.row(i).t()
+@endcode
+
+@note Use cv::eigenNonSymmetric for calculation of real eigenvalues and eigenvectors of non-symmetric matrix.
+
+@param src input matrix that must have CV_32FC1 or CV_64FC1 type, square size and be symmetrical
+(src ^T^ == src).
+@param eigenvalues output vector of eigenvalues of the same type as src; the eigenvalues are stored
+in the descending order.
+@param eigenvectors output matrix of eigenvectors; it has the same size and type as src; the
+eigenvectors are stored as subsequent matrix rows, in the same order as the corresponding
+eigenvalues.
+@sa eigenNonSymmetric, completeSymm, PCA
+*/
+CV_EXPORTS_W bool eigen(InputArray src, OutputArray eigenvalues,
+                        OutputArray eigenvectors = noArray());
+
+/** @brief Calculates eigenvalues and eigenvectors of a non-symmetric matrix (real eigenvalues only).
+
+@note Assumes real eigenvalues.
+
+The function calculates eigenvalues and eigenvectors (optional) of the square matrix src:
+@code
+    src*eigenvectors.row(i).t() = eigenvalues.at<srcType>(i)*eigenvectors.row(i).t()
+@endcode
+
+@param src input matrix (CV_32FC1 or CV_64FC1 type).
+@param eigenvalues output vector of eigenvalues (type is the same type as src).
+@param eigenvectors output matrix of eigenvectors (type is the same type as src). The eigenvectors are stored as subsequent matrix rows, in the same order as the corresponding eigenvalues.
+@sa eigen
+*/
+CV_EXPORTS_W void eigenNonSymmetric(InputArray src, OutputArray eigenvalues,
+                                    OutputArray eigenvectors);
+
+/** @brief Calculates the covariance matrix of a set of vectors.
+
+The function cv::calcCovarMatrix calculates the covariance matrix and, optionally, the mean vector of
+the set of input vectors.
+@param samples samples stored as separate matrices
+@param nsamples number of samples
+@param covar output covariance matrix of the type ctype and square size.
+@param mean input or output (depending on the flags) array as the average value of the input vectors.
+@param flags operation flags as a combination of #CovarFlags
+@param ctype type of the matrix; it equals 'CV_64F' by default.
+@sa PCA, mulTransposed, Mahalanobis
+@todo InputArrayOfArrays
+*/
+CV_EXPORTS void calcCovarMatrix( const Mat* samples, int nsamples, Mat& covar, Mat& mean,
+                                 int flags, int ctype = CV_64F);
+
+/** @overload
+@note use #COVAR_ROWS or #COVAR_COLS flag
+@param samples samples stored as rows/columns of a single matrix.
+@param covar output covariance matrix of the type ctype and square size.
+@param mean input or output (depending on the flags) array as the average value of the input vectors.
+@param flags operation flags as a combination of #CovarFlags
+@param ctype type of the matrix; it equals 'CV_64F' by default.
+*/
+CV_EXPORTS_W void calcCovarMatrix( InputArray samples, OutputArray covar,
+                                   InputOutputArray mean, int flags, int ctype = CV_64F);
+
+/** wrap PCA::operator() */
+CV_EXPORTS_W void PCACompute(InputArray data, InputOutputArray mean,
+                             OutputArray eigenvectors, int maxComponents = 0);
+
+/** wrap PCA::operator() and add eigenvalues output parameter */
+CV_EXPORTS_AS(PCACompute2) void PCACompute(InputArray data, InputOutputArray mean,
+                                           OutputArray eigenvectors, OutputArray eigenvalues,
+                                           int maxComponents = 0);
+
+/** wrap PCA::operator() */
+CV_EXPORTS_W void PCACompute(InputArray data, InputOutputArray mean,
+                             OutputArray eigenvectors, double retainedVariance);
+
+/** wrap PCA::operator() and add eigenvalues output parameter */
+CV_EXPORTS_AS(PCACompute2) void PCACompute(InputArray data, InputOutputArray mean,
+                                           OutputArray eigenvectors, OutputArray eigenvalues,
+                                           double retainedVariance);
+
+/** wrap PCA::project */
+CV_EXPORTS_W void PCAProject(InputArray data, InputArray mean,
+                             InputArray eigenvectors, OutputArray result);
+
+/** wrap PCA::backProject */
+CV_EXPORTS_W void PCABackProject(InputArray data, InputArray mean,
+                                 InputArray eigenvectors, OutputArray result);
+
+/** wrap SVD::compute */
+CV_EXPORTS_W void SVDecomp( InputArray src, OutputArray w, OutputArray u, OutputArray vt, int flags = 0 );
+
+/** wrap SVD::backSubst */
+CV_EXPORTS_W void SVBackSubst( InputArray w, InputArray u, InputArray vt,
+                               InputArray rhs, OutputArray dst );
+
+/** @brief Calculates the Mahalanobis distance between two vectors.
+
+The function cv::Mahalanobis calculates and returns the weighted distance between two vectors:
+\f[d( \texttt{vec1} , \texttt{vec2} )= \sqrt{\sum_{i,j}{\texttt{icovar(i,j)}\cdot(\texttt{vec1(i)}-\texttt{vec2(i)})\cdot(\texttt{vec1(j)}-\texttt{vec2(j)})} }\f]
+The covariance matrix may be calculated using the #calcCovarMatrix function and then inverted using
+the invert function (preferably using the #DECOMP_SVD method, as the most accurate).
+@param v1 first 1D input vector.
+@param v2 second 1D input vector.
+@param icovar inverse covariance matrix.
+*/
+CV_EXPORTS_W double Mahalanobis(InputArray v1, InputArray v2, InputArray icovar);
+
+/** @brief Performs a forward or inverse Discrete Fourier transform of a 1D or 2D floating-point array.
+
+The function cv::dft performs one of the following:
+-   Forward the Fourier transform of a 1D vector of N elements:
+    \f[Y = F^{(N)}  \cdot X,\f]
+    where \f$F^{(N)}_{jk}=\exp(-2\pi i j k/N)\f$ and \f$i=\sqrt{-1}\f$
+-   Inverse the Fourier transform of a 1D vector of N elements:
+    \f[\begin{array}{l} X'=  \left (F^{(N)} \right )^{-1}  \cdot Y =  \left (F^{(N)} \right )^*  \cdot Y  \\ X = (1/N)  \cdot X', \end{array}\f]
+    where \f$F^*=\left(\textrm{Re}(F^{(N)})-i\,\textrm{Im}(F^{(N)})\right)^T\f$
+-   Forward the 2D Fourier transform of a M x N matrix:
+    \f[Y = F^{(M)}  \cdot X  \cdot F^{(N)}\f]
+-   Inverse the 2D Fourier transform of a M x N matrix:
+    \f[\begin{array}{l} X'=  \left (F^{(M)} \right )^*  \cdot Y  \cdot \left (F^{(N)} \right )^* \\ X =  \frac{1}{M \cdot N} \cdot X' \end{array}\f]
+
+In case of real (single-channel) data, the output spectrum of the forward Fourier transform or input
+spectrum of the inverse Fourier transform can be represented in a packed format called *CCS*
+(complex-conjugate-symmetrical). It was borrowed from IPL (Intel\* Image Processing Library). Here
+is how 2D *CCS* spectrum looks:
+\f[\begin{bmatrix} Re Y_{0,0} & Re Y_{0,1} & Im Y_{0,1} & Re Y_{0,2} & Im Y_{0,2} &  \cdots & Re Y_{0,N/2-1} & Im Y_{0,N/2-1} & Re Y_{0,N/2}  \\ Re Y_{1,0} & Re Y_{1,1} & Im Y_{1,1} & Re Y_{1,2} & Im Y_{1,2} &  \cdots & Re Y_{1,N/2-1} & Im Y_{1,N/2-1} & Re Y_{1,N/2}  \\ Im Y_{1,0} & Re Y_{2,1} & Im Y_{2,1} & Re Y_{2,2} & Im Y_{2,2} &  \cdots & Re Y_{2,N/2-1} & Im Y_{2,N/2-1} & Im Y_{1,N/2}  \\ \hdotsfor{9} \\ Re Y_{M/2-1,0} &  Re Y_{M-3,1}  & Im Y_{M-3,1} &  \hdotsfor{3} & Re Y_{M-3,N/2-1} & Im Y_{M-3,N/2-1}& Re Y_{M/2-1,N/2}  \\ Im Y_{M/2-1,0} &  Re Y_{M-2,1}  & Im Y_{M-2,1} &  \hdotsfor{3} & Re Y_{M-2,N/2-1} & Im Y_{M-2,N/2-1}& Im Y_{M/2-1,N/2}  \\ Re Y_{M/2,0}  &  Re Y_{M-1,1} &  Im Y_{M-1,1} &  \hdotsfor{3} & Re Y_{M-1,N/2-1} & Im Y_{M-1,N/2-1}& Re Y_{M/2,N/2} \end{bmatrix}\f]
+
+In case of 1D transform of a real vector, the output looks like the first row of the matrix above.
+
+So, the function chooses an operation mode depending on the flags and size of the input array:
+-   If #DFT_ROWS is set or the input array has a single row or single column, the function
+    performs a 1D forward or inverse transform (of each row of the matrix when #DFT_ROWS is set).
+    Otherwise, it performs a 2D transform.
+-   If the input array is real and #DFT_INVERSE is not set, the function performs a forward 1D or
+    2D transform:
+    -   When #DFT_COMPLEX_OUTPUT is set, the output is a complex matrix of the same size as
+        input.
+    -   When #DFT_COMPLEX_OUTPUT is not set, the output is a real matrix of the same size as
+        input. In case of 2D transform, it uses the packed format as shown above. In case of a
+        single 1D transform, it looks like the first row of the matrix above. In case of
+        multiple 1D transforms (when using the #DFT_ROWS flag), each row of the output matrix
+        looks like the first row of the matrix above.
+-   If the input array is complex and either #DFT_INVERSE or #DFT_REAL_OUTPUT are not set, the
+    output is a complex array of the same size as input. The function performs a forward or
+    inverse 1D or 2D transform of the whole input array or each row of the input array
+    independently, depending on the flags #DFT_INVERSE and #DFT_ROWS.
+-   When #DFT_INVERSE is set and the input array is real, or it is complex but #DFT_REAL_OUTPUT
+    is set, the output is a real array of the same size as input. The function performs a 1D or 2D
+    inverse transformation of the whole input array or each individual row, depending on the flags
+    #DFT_INVERSE and #DFT_ROWS.
+
+If #DFT_SCALE is set, the scaling is done after the transformation.
+
+Unlike dct, the function supports arrays of arbitrary size. But only those arrays are processed
+efficiently, whose sizes can be factorized in a product of small prime numbers (2, 3, and 5 in the
+current implementation). Such an efficient DFT size can be calculated using the getOptimalDFTSize
+method.
+
+The sample below illustrates how to calculate a DFT-based convolution of two 2D real arrays:
+@code
+    void convolveDFT(InputArray A, InputArray B, OutputArray C)
+    {
+        // reallocate the output array if needed
+        C.create(abs(A.rows - B.rows)+1, abs(A.cols - B.cols)+1, A.type());
+        Size dftSize;
+        // calculate the size of DFT transform
+        dftSize.width = getOptimalDFTSize(A.cols + B.cols - 1);
+        dftSize.height = getOptimalDFTSize(A.rows + B.rows - 1);
+
+        // allocate temporary buffers and initialize them with 0's
+        Mat tempA(dftSize, A.type(), Scalar::all(0));
+        Mat tempB(dftSize, B.type(), Scalar::all(0));
+
+        // copy A and B to the top-left corners of tempA and tempB, respectively
+        Mat roiA(tempA, Rect(0,0,A.cols,A.rows));
+        A.copyTo(roiA);
+        Mat roiB(tempB, Rect(0,0,B.cols,B.rows));
+        B.copyTo(roiB);
+
+        // now transform the padded A & B in-place;
+        // use "nonzeroRows" hint for faster processing
+        dft(tempA, tempA, 0, A.rows);
+        dft(tempB, tempB, 0, B.rows);
+
+        // multiply the spectrums;
+        // the function handles packed spectrum representations well
+        mulSpectrums(tempA, tempB, tempA);
+
+        // transform the product back from the frequency domain.
+        // Even though all the result rows will be non-zero,
+        // you need only the first C.rows of them, and thus you
+        // pass nonzeroRows == C.rows
+        dft(tempA, tempA, DFT_INVERSE + DFT_SCALE, C.rows);
+
+        // now copy the result back to C.
+        tempA(Rect(0, 0, C.cols, C.rows)).copyTo(C);
+
+        // all the temporary buffers will be deallocated automatically
+    }
+@endcode
+To optimize this sample, consider the following approaches:
+-   Since nonzeroRows != 0 is passed to the forward transform calls and since A and B are copied to
+    the top-left corners of tempA and tempB, respectively, it is not necessary to clear the whole
+    tempA and tempB. It is only necessary to clear the tempA.cols - A.cols ( tempB.cols - B.cols)
+    rightmost columns of the matrices.
+-   This DFT-based convolution does not have to be applied to the whole big arrays, especially if B
+    is significantly smaller than A or vice versa. Instead, you can calculate convolution by parts.
+    To do this, you need to split the output array C into multiple tiles. For each tile, estimate
+    which parts of A and B are required to calculate convolution in this tile. If the tiles in C are
+    too small, the speed will decrease a lot because of repeated work. In the ultimate case, when
+    each tile in C is a single pixel, the algorithm becomes equivalent to the naive convolution
+    algorithm. If the tiles are too big, the temporary arrays tempA and tempB become too big and
+    there is also a slowdown because of bad cache locality. So, there is an optimal tile size
+    somewhere in the middle.
+-   If different tiles in C can be calculated in parallel and, thus, the convolution is done by
+    parts, the loop can be threaded.
+
+All of the above improvements have been implemented in #matchTemplate and #filter2D . Therefore, by
+using them, you can get the performance even better than with the above theoretically optimal
+implementation. Though, those two functions actually calculate cross-correlation, not convolution,
+so you need to "flip" the second convolution operand B vertically and horizontally using flip .
+@note
+-   An example using the discrete fourier transform can be found at
+    opencv_source_code/samples/cpp/dft.cpp
+-   (Python) An example using the dft functionality to perform Wiener deconvolution can be found
+    at opencv_source/samples/python/deconvolution.py
+-   (Python) An example rearranging the quadrants of a Fourier image can be found at
+    opencv_source/samples/python/dft.py
+@param src input array that could be real or complex.
+@param dst output array whose size and type depends on the flags .
+@param flags transformation flags, representing a combination of the #DftFlags
+@param nonzeroRows when the parameter is not zero, the function assumes that only the first
+nonzeroRows rows of the input array (#DFT_INVERSE is not set) or only the first nonzeroRows of the
+output array (#DFT_INVERSE is set) contain non-zeros, thus, the function can handle the rest of the
+rows more efficiently and save some time; this technique is very useful for calculating array
+cross-correlation or convolution using DFT.
+@sa dct, getOptimalDFTSize, mulSpectrums, filter2D, matchTemplate, flip, cartToPolar,
+magnitude, phase
+*/
+CV_EXPORTS_W void dft(InputArray src, OutputArray dst, int flags = 0, int nonzeroRows = 0);
+
+/** @brief Calculates the inverse Discrete Fourier Transform of a 1D or 2D array.
+
+idft(src, dst, flags) is equivalent to dft(src, dst, flags | #DFT_INVERSE) .
+@note Neither dft nor idft scales the result by default. So, you should pass #DFT_SCALE to one of
+dft or idft explicitly to make these transforms mutually inverse.
+@sa dft, dct, idct, mulSpectrums, getOptimalDFTSize
+@param src input floating-point real or complex array.
+@param dst output array whose size and type depend on the flags.
+@param flags operation flags (see dft and #DftFlags).
+@param nonzeroRows number of dst rows to process; the rest of the rows have undefined content (see
+the convolution sample in the dft description).
+*/
+CV_EXPORTS_W void idft(InputArray src, OutputArray dst, int flags = 0, int nonzeroRows = 0);
+
+/** @brief Performs a forward or inverse discrete Cosine transform of 1D or 2D array.
+
+The function cv::dct performs a forward or inverse discrete Cosine transform (DCT) of a 1D or 2D
+floating-point array:
+-   Forward Cosine transform of a 1D vector of N elements:
+    \f[Y = C^{(N)}  \cdot X\f]
+    where
+    \f[C^{(N)}_{jk}= \sqrt{\alpha_j/N} \cos \left ( \frac{\pi(2k+1)j}{2N} \right )\f]
+    and
+    \f$\alpha_0=1\f$, \f$\alpha_j=2\f$ for *j \> 0*.
+-   Inverse Cosine transform of a 1D vector of N elements:
+    \f[X =  \left (C^{(N)} \right )^{-1}  \cdot Y =  \left (C^{(N)} \right )^T  \cdot Y\f]
+    (since \f$C^{(N)}\f$ is an orthogonal matrix, \f$C^{(N)} \cdot \left(C^{(N)}\right)^T = I\f$ )
+-   Forward 2D Cosine transform of M x N matrix:
+    \f[Y = C^{(M)}  \cdot X  \cdot \left (C^{(N)} \right )^T\f]
+-   Inverse 2D Cosine transform of M x N matrix:
+    \f[X =  \left (C^{(M)} \right )^T  \cdot Y  \cdot C^{(N)}\f]
+
+The function chooses the mode of operation by looking at the flags and size of the input array:
+-   If (flags & #DCT_INVERSE) == 0, the function does a forward 1D or 2D transform. Otherwise, it
+    is an inverse 1D or 2D transform.
+-   If (flags & #DCT_ROWS) != 0, the function performs a 1D transform of each row.
+-   If the array is a single column or a single row, the function performs a 1D transform.
+-   If none of the above is true, the function performs a 2D transform.
+
+@note Currently dct supports even-size arrays (2, 4, 6 ...). For data analysis and approximation, you
+can pad the array when necessary.
+Also, the function performance depends very much, and not monotonically, on the array size (see
+getOptimalDFTSize ). In the current implementation DCT of a vector of size N is calculated via DFT
+of a vector of size N/2 . Thus, the optimal DCT size N1 \>= N can be calculated as:
+@code
+    size_t getOptimalDCTSize(size_t N) { return 2*getOptimalDFTSize((N+1)/2); }
+    N1 = getOptimalDCTSize(N);
+@endcode
+@param src input floating-point array.
+@param dst output array of the same size and type as src .
+@param flags transformation flags as a combination of cv::DftFlags (DCT_*)
+@sa dft, getOptimalDFTSize, idct
+*/
+CV_EXPORTS_W void dct(InputArray src, OutputArray dst, int flags = 0);
+
+/** @brief Calculates the inverse Discrete Cosine Transform of a 1D or 2D array.
+
+idct(src, dst, flags) is equivalent to dct(src, dst, flags | DCT_INVERSE).
+@param src input floating-point single-channel array.
+@param dst output array of the same size and type as src.
+@param flags operation flags.
+@sa  dct, dft, idft, getOptimalDFTSize
+*/
+CV_EXPORTS_W void idct(InputArray src, OutputArray dst, int flags = 0);
+
+/** @brief Performs the per-element multiplication of two Fourier spectrums.
+
+The function cv::mulSpectrums performs the per-element multiplication of the two CCS-packed or complex
+matrices that are results of a real or complex Fourier transform.
+
+The function, together with dft and idft, may be used to calculate convolution (pass conjB=false )
+or correlation (pass conjB=true ) of two arrays rapidly. When the arrays are complex, they are
+simply multiplied (per element) with an optional conjugation of the second-array elements. When the
+arrays are real, they are assumed to be CCS-packed (see dft for details).
+@param a first input array.
+@param b second input array of the same size and type as a .
+@param c output array of the same size and type as a .
+@param flags operation flags; currently, the only supported flag is cv::DFT_ROWS, which indicates that
+each row of a and b is an independent 1D Fourier spectrum. If you do not want to use this flag, then simply pass `0` as the value.
+@param conjB optional flag that conjugates the second input array before the multiplication (true)
+or not (false).
+*/
+CV_EXPORTS_W void mulSpectrums(InputArray a, InputArray b, OutputArray c,
+                               int flags, bool conjB = false);
+
+/** @brief Returns the optimal DFT size for a given vector size.
+
+DFT performance is not a monotonic function of a vector size. Therefore, when you calculate
+convolution of two arrays or perform the spectral analysis of an array, it usually makes sense to
+pad the input data with zeros to get a bit larger array that can be transformed much faster than the
+original one. Arrays whose size is a power-of-two (2, 4, 8, 16, 32, ...) are the fastest to process.
+Though, the arrays whose size is a product of 2's, 3's, and 5's (for example, 300 = 5\*5\*3\*2\*2)
+are also processed quite efficiently.
+
+The function cv::getOptimalDFTSize returns the minimum number N that is greater than or equal to vecsize
+so that the DFT of a vector of size N can be processed efficiently. In the current implementation N
+= 2 ^p^ \* 3 ^q^ \* 5 ^r^ for some integer p, q, r.
+
+The function returns a negative number if vecsize is too large (very close to INT_MAX ).
+
+While the function cannot be used directly to estimate the optimal vector size for DCT transform
+(since the current DCT implementation supports only even-size vectors), it can be easily processed
+as getOptimalDFTSize((vecsize+1)/2)\*2.
+@param vecsize vector size.
+@sa dft, dct, idft, idct, mulSpectrums
+*/
+CV_EXPORTS_W int getOptimalDFTSize(int vecsize);
+
+/** @brief Returns the default random number generator.
+
+The function cv::theRNG returns the default random number generator. For each thread, there is a
+separate random number generator, so you can use the function safely in multi-thread environments.
+If you just need to get a single random number using this generator or initialize an array, you can
+use randu or randn instead. But if you are going to generate many random numbers inside a loop, it
+is much faster to use this function to retrieve the generator and then use RNG::operator _Tp() .
+@sa RNG, randu, randn
+*/
+CV_EXPORTS RNG& theRNG();
+
+/** @brief Sets state of default random number generator.
+
+The function cv::setRNGSeed sets state of default random number generator to custom value.
+@param seed new state for default random number generator
+@sa RNG, randu, randn
+*/
+CV_EXPORTS_W void setRNGSeed(int seed);
+
+/** @brief Generates a single uniformly-distributed random number or an array of random numbers.
+
+Non-template variant of the function fills the matrix dst with uniformly-distributed
+random numbers from the specified range:
+\f[\texttt{low} _c  \leq \texttt{dst} (I)_c <  \texttt{high} _c\f]
+@param dst output array of random numbers; the array must be pre-allocated.
+@param low inclusive lower boundary of the generated random numbers.
+@param high exclusive upper boundary of the generated random numbers.
+@sa RNG, randn, theRNG
+*/
+CV_EXPORTS_W void randu(InputOutputArray dst, InputArray low, InputArray high);
+
+/** @brief Fills the array with normally distributed random numbers.
+
+The function cv::randn fills the matrix dst with normally distributed random numbers with the specified
+mean vector and standard deviation matrix. The generated random numbers are clipped to fit the
+value range of the output array data type.
+@param dst output array of random numbers; the array must be pre-allocated and have 1 to 4 channels.
+@param mean mean value (expectation) of the generated random numbers.
+@param stddev standard deviation of the generated random numbers; it can be either a vector (in
+which case a diagonal standard deviation matrix is assumed) or a square matrix.
+@sa RNG, randu
+*/
+CV_EXPORTS_W void randn(InputOutputArray dst, InputArray mean, InputArray stddev);
+
+/** @brief Shuffles the array elements randomly.
+
+The function cv::randShuffle shuffles the specified 1D array by randomly choosing pairs of elements and
+swapping them. The number of such swap operations will be dst.rows\*dst.cols\*iterFactor .
+@param dst input/output numerical 1D array.
+@param iterFactor scale factor that determines the number of random swap operations (see the
+description above).
+@param rng optional random number generator used for shuffling; if it is null, theRNG() is used
+instead.
+@sa RNG, sort
+*/
+CV_EXPORTS_W void randShuffle(InputOutputArray dst, double iterFactor = 1., RNG* rng = 0);
+
+/** @brief Principal Component Analysis
+
+The class is used to calculate a special basis for a set of vectors. The
+basis will consist of eigenvectors of the covariance matrix calculated
+from the input set of vectors. The class %PCA can also transform
+vectors to/from the new coordinate space defined by the basis. Usually,
+in this new coordinate system, each vector from the original set (and
+any linear combination of such vectors) can be quite accurately
+approximated by taking its first few components, corresponding to the
+eigenvectors of the largest eigenvalues of the covariance matrix.
+Geometrically it means that you calculate a projection of the vector to
+a subspace formed by a few eigenvectors corresponding to the dominant
+eigenvalues of the covariance matrix. And usually such a projection is
+very close to the original vector. So, you can represent the original
+vector from a high-dimensional space with a much shorter vector
+consisting of the projected vector's coordinates in the subspace. Such a
+transformation is also known as Karhunen-Loeve Transform, or KLT.
+See http://en.wikipedia.org/wiki/Principal_component_analysis
+
+The sample below is the function that takes two matrices. The first
+matrix stores a set of vectors (a row per vector) that is used to
+calculate PCA. The second matrix stores another "test" set of vectors
+(a row per vector). First, these vectors are compressed with PCA, then
+reconstructed back, and then the reconstruction error norm is computed
+and printed for each vector. :
+
+@code{.cpp}
+using namespace cv;
+
+PCA compressPCA(const Mat& pcaset, int maxComponents,
+                const Mat& testset, Mat& compressed)
+{
+    PCA pca(pcaset, // pass the data
+            Mat(), // we do not have a pre-computed mean vector,
+                   // so let the PCA engine compute it
+            PCA::DATA_AS_ROW, // indicate that the vectors
+                                // are stored as matrix rows
+                                // (use PCA::DATA_AS_COL if the vectors are
+                                // the matrix columns)
+            maxComponents // specify, how many principal components to retain
+            );
+    // if there is no test data, just return the computed basis, ready-to-use
+    if( !testset.data )
+        return pca;
+    CV_Assert( testset.cols == pcaset.cols );
+
+    compressed.create(testset.rows, maxComponents, testset.type());
+
+    Mat reconstructed;
+    for( int i = 0; i < testset.rows; i++ )
+    {
+        Mat vec = testset.row(i), coeffs = compressed.row(i), reconstructed;
+        // compress the vector, the result will be stored
+        // in the i-th row of the output matrix
+        pca.project(vec, coeffs);
+        // and then reconstruct it
+        pca.backProject(coeffs, reconstructed);
+        // and measure the error
+        printf("%d. diff = %g\n", i, norm(vec, reconstructed, NORM_L2));
+    }
+    return pca;
+}
+@endcode
+@sa calcCovarMatrix, mulTransposed, SVD, dft, dct
+*/
+class CV_EXPORTS PCA
+{
+public:
+    enum Flags { DATA_AS_ROW = 0, //!< indicates that the input samples are stored as matrix rows
+                 DATA_AS_COL = 1, //!< indicates that the input samples are stored as matrix columns
+                 USE_AVG     = 2  //!< NOTE(review): not documented in this header; confirm the meaning against the implementation
+               };
+
+    /** @brief default constructor
+
+    The default constructor initializes an empty %PCA structure. The other
+    constructors initialize the structure and call PCA::operator()().
+    */
+    PCA();
+
+    /** @overload
+    @param data input samples stored as matrix rows or matrix columns.
+    @param mean optional mean value; if the matrix is empty (@c noArray()),
+    the mean is computed from the data.
+    @param flags operation flags; currently the parameter is only used to
+    specify the data layout (PCA::Flags)
+    @param maxComponents maximum number of components that %PCA should
+    retain; by default, all the components are retained.
+    */
+    PCA(InputArray data, InputArray mean, int flags, int maxComponents = 0);
+
+    /** @overload
+    @param data input samples stored as matrix rows or matrix columns.
+    @param mean optional mean value; if the matrix is empty (noArray()),
+    the mean is computed from the data.
+    @param flags operation flags; currently the parameter is only used to
+    specify the data layout (PCA::Flags)
+    @param retainedVariance Percentage of variance that PCA should retain.
+    Using this parameter will let the PCA decide how many components to
+    retain but it will always keep at least 2.
+    */
+    PCA(InputArray data, InputArray mean, int flags, double retainedVariance);
+
+    /** @brief performs %PCA
+
+    The operator performs %PCA of the supplied dataset. It is safe to reuse
+    the same PCA structure for multiple datasets. That is, if the structure
+    has been previously used with another dataset, the existing internal
+    data is reclaimed and the new @ref eigenvalues, @ref eigenvectors and @ref
+    mean are allocated and computed.
+
+    The computed @ref eigenvalues are sorted from the largest to the smallest and
+    the corresponding @ref eigenvectors are stored as eigenvectors rows.
+
+    @param data input samples stored as the matrix rows or as the matrix
+    columns.
+    @param mean optional mean value; if the matrix is empty (noArray()),
+    the mean is computed from the data.
+    @param flags operation flags; currently the parameter is only used to
+    specify the data layout. (Flags)
+    @param maxComponents maximum number of components that PCA should
+    retain; by default, all the components are retained.
+    */
+    PCA& operator()(InputArray data, InputArray mean, int flags, int maxComponents = 0);
+
+    /** @overload
+    @param data input samples stored as the matrix rows or as the matrix
+    columns.
+    @param mean optional mean value; if the matrix is empty (noArray()),
+    the mean is computed from the data.
+    @param flags operation flags; currently the parameter is only used to
+    specify the data layout. (PCA::Flags)
+    @param retainedVariance Percentage of variance that %PCA should retain.
+    Using this parameter will let the %PCA decide how many components to
+    retain but it will always keep at least 2.
+     */
+    PCA& operator()(InputArray data, InputArray mean, int flags, double retainedVariance);
+
+    /** @brief Projects vector(s) to the principal component subspace.
+
+    The methods project one or more vectors to the principal component
+    subspace, where each vector projection is represented by coefficients in
+    the principal component basis. The first form of the method returns the
+    matrix that the second form writes to the result. So the first form can
+    be used as a part of expression while the second form can be more
+    efficient in a processing loop.
+    @param vec input vector(s); must have the same dimensionality and the
+    same layout as the input data used at %PCA phase, that is, if
+    PCA::DATA_AS_ROW is specified, then `vec.cols==data.cols`
+    (vector dimensionality) and `vec.rows` is the number of vectors to
+    project, and the same is true for the PCA::DATA_AS_COL case.
+    */
+    Mat project(InputArray vec) const;
+
+    /** @overload
+    @param vec input vector(s); must have the same dimensionality and the
+    same layout as the input data used at PCA phase, that is, if
+    PCA::DATA_AS_ROW is specified, then `vec.cols==data.cols`
+    (vector dimensionality) and `vec.rows` is the number of vectors to
+    project, and the same is true for the PCA::DATA_AS_COL case.
+    @param result output vectors; in case of PCA::DATA_AS_COL, the
+    output matrix has as many columns as the number of input vectors, this
+    means that `result.cols==vec.cols` and the number of rows match the
+    number of principal components (for example, `maxComponents` parameter
+    passed to the constructor).
+     */
+    void project(InputArray vec, OutputArray result) const;
+
+    /** @brief Reconstructs vectors from their PC projections.
+
+    The methods are inverse operations to PCA::project. They take PC
+    coordinates of projected vectors and reconstruct the original vectors.
+    Unless all the principal components have been retained, the
+    reconstructed vectors are different from the originals. But typically,
+    the difference is small if the number of components is large enough (but
+    still much smaller than the original vector dimensionality). As a
+    result, PCA is used.
+    @param vec coordinates of the vectors in the principal component
+    subspace, the layout and size are the same as of PCA::project output
+    vectors.
+     */
+    Mat backProject(InputArray vec) const;
+
+    /** @overload
+    @param vec coordinates of the vectors in the principal component
+    subspace, the layout and size are the same as of PCA::project output
+    vectors.
+    @param result reconstructed vectors; the layout and size are the same as
+    of PCA::project input vectors.
+     */
+    void backProject(InputArray vec, OutputArray result) const;
+
+    /** @brief write PCA objects
+
+    Writes @ref eigenvalues, @ref eigenvectors and @ref mean to the specified FileStorage
+     */
+    void write(FileStorage& fs) const;
+
+    /** @brief load PCA objects
+
+    Loads @ref eigenvalues, @ref eigenvectors and @ref mean from the specified FileNode
+     */
+    void read(const FileNode& fn);
+
+    Mat eigenvectors; //!< eigenvectors of the covariance matrix
+    Mat eigenvalues; //!< eigenvalues of the covariance matrix
+    Mat mean; //!< mean value subtracted before the projection and added after the back projection
+};
+
+/** @example samples/cpp/pca.cpp
+An example using %PCA for dimensionality reduction while maintaining an amount of variance
+*/
+
+/** @example samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp
+Check @ref tutorial_introduction_to_pca "the corresponding tutorial" for more details
+*/
+
+/**
+@brief Linear Discriminant Analysis
+@todo document this class
+*/
+class CV_EXPORTS LDA
+{
+public:
+    /** @brief constructor
+    Initializes an LDA with num_components (default 0).
+    */
+    explicit LDA(int num_components = 0);
+
+    /** Initializes and performs a Discriminant Analysis with Fisher's
+     Optimization Criterion on given data in src and corresponding labels
+     in labels. If num_components is 0 (or less), the number of components
+     is determined automatically from the given data during computation.
+    */
+    LDA(InputArrayOfArrays src, InputArray labels, int num_components = 0);
+
+    /** Serializes this object to a given filename.
+      */
+    void save(const String& filename) const;
+
+    /** Deserializes this object from a given filename.
+      */
+    void load(const String& filename);
+
+    /** Serializes this object to a given cv::FileStorage.
+      */
+    void save(FileStorage& fs) const;
+
+    /** Deserializes this object from a given cv::FileStorage.
+      */
+    void load(const FileStorage& node);
+
+    /** destructor
+      */
+    ~LDA();
+
+    /** Compute the discriminants for data in src (row aligned) and labels.
+      */
+    void compute(InputArrayOfArrays src, InputArray labels);
+
+    /** Projects samples into the LDA subspace.
+        src may be one or more row aligned samples.
+      */
+    Mat project(InputArray src);
+
+    /** Reconstructs projections from the LDA subspace.
+        src may be one or more row aligned projections.
+      */
+    Mat reconstruct(InputArray src);
+
+    /** Returns the eigenvectors of this LDA.
+      */
+    Mat eigenvectors() const { return _eigenvectors; }
+
+    /** Returns the eigenvalues of this LDA.
+      */
+    Mat eigenvalues() const { return _eigenvalues; }
+
+    static Mat subspaceProject(InputArray W, InputArray mean, InputArray src);
+    static Mat subspaceReconstruct(InputArray W, InputArray mean, InputArray src);
+
+protected:
+    int _num_components;
+    Mat _eigenvectors;
+    Mat _eigenvalues;
+    void lda(InputArrayOfArrays src, InputArray labels);
+};
+
+/** @brief Singular Value Decomposition
+
+Class for computing Singular Value Decomposition of a floating-point
+matrix. The Singular Value Decomposition is used to solve least-square
+problems, under-determined linear systems, invert matrices, compute
+condition numbers, and so on.
+
+If you want to compute a condition number of a matrix or an absolute value of
+its determinant, you do not need `u` and `vt`. You can pass
+flags=SVD::NO_UV|... . Another flag SVD::FULL_UV indicates that full-size u
+and vt must be computed, which is not necessary most of the time.
+
+@sa invert, solve, eigen, determinant
+*/
+class CV_EXPORTS SVD
+{
+public:
+    enum Flags {
+        /** allow the algorithm to modify the decomposed matrix; it can save space and speed up
+            processing. Currently ignored. */
+        MODIFY_A = 1,
+        /** indicates that only a vector of singular values `w` is to be processed, while u and vt
+            will be set to empty matrices */
+        NO_UV    = 2,
+        /** when the matrix is not square, by default the algorithm produces u and vt matrices of
+            sufficiently large size for the further A reconstruction; if, however, FULL_UV flag is
+            specified, u and vt will be full-size square orthogonal matrices.*/
+        FULL_UV  = 4
+    };
+
+    /** @brief the default constructor
+
+    initializes an empty SVD structure
+      */
+    SVD();
+
+    /** @overload
+    initializes an empty SVD structure and then calls SVD::operator()
+    @param src decomposed matrix. The depth has to be CV_32F or CV_64F.
+    @param flags operation flags (SVD::Flags)
+      */
+    SVD( InputArray src, int flags = 0 );
+
+    /** @brief the operator that performs SVD. The previously allocated u, w and vt are released.
+
+    The operator performs the singular value decomposition of the supplied
+    matrix. The `u`, `vt`, and the vector of singular values `w` are stored in
+    the structure. The same SVD structure can be reused many times with
+    different matrices. Each time, if needed, the previous `u`, `vt`, and `w`
+    are reclaimed and the new matrices are created, which is all handled by
+    Mat::create.
+    @param src decomposed matrix. The depth has to be CV_32F or CV_64F.
+    @param flags operation flags (SVD::Flags)
+      */
+    SVD& operator ()( InputArray src, int flags = 0 );
+
+    /** @brief decomposes matrix and stores the results to user-provided matrices
+
+    The methods/functions perform SVD of matrix. Unlike SVD::SVD constructor
+    and SVD::operator(), they store the results to the user-provided
+    matrices:
+
+    @code{.cpp}
+    Mat A, w, u, vt;
+    SVD::compute(A, w, u, vt);
+    @endcode
+
+    @param src decomposed matrix. The depth has to be CV_32F or CV_64F.
+    @param w calculated singular values
+    @param u calculated left singular vectors
+    @param vt transposed matrix of right singular vectors
+    @param flags operation flags - see SVD::Flags.
+      */
+    static void compute( InputArray src, OutputArray w,
+                         OutputArray u, OutputArray vt, int flags = 0 );
+
+    /** @overload
+    computes singular values of a matrix
+    @param src decomposed matrix. The depth has to be CV_32F or CV_64F.
+    @param w calculated singular values
+    @param flags operation flags - see SVD::Flags.
+      */
+    static void compute( InputArray src, OutputArray w, int flags = 0 );
+
+    /** @brief performs back substitution
+      */
+    static void backSubst( InputArray w, InputArray u,
+                           InputArray vt, InputArray rhs,
+                           OutputArray dst );
+
+    /** @brief solves an under-determined singular linear system
+
+    The method finds a unit-length solution x of a singular linear system
+    A\*x = 0. Depending on the rank of A, there can be no solutions, a
+    single solution or an infinite number of solutions. In general, the
+    algorithm solves the following problem:
+    \f[dst =  \arg \min _{x:  \| x \| =1}  \| src  \cdot x  \|\f]
+    @param src left-hand-side matrix.
+    @param dst found solution.
+      */
+    static void solveZ( InputArray src, OutputArray dst );
+
+    /** @brief performs a singular value back substitution.
+
+    The method calculates a back substitution for the specified right-hand
+    side:
+
+    \f[\texttt{x} =  \texttt{vt} ^T  \cdot diag( \texttt{w} )^{-1}  \cdot \texttt{u} ^T  \cdot \texttt{rhs} \sim \texttt{A} ^{-1}  \cdot \texttt{rhs}\f]
+
+    Using this technique you can either get a very accurate solution of the
+    convenient linear system, or the best (in the least-squares terms)
+    pseudo-solution of an overdetermined linear system.
+
+    @param rhs right-hand side of a linear system (u\*w\*v')\*dst = rhs to
+    be solved, where A has been previously decomposed.
+
+    @param dst found solution of the system.
+
+    @note Explicit SVD with the further back substitution only makes sense
+    if you need to solve many linear systems with the same left-hand side
+    (for example, src ). If all you need is to solve a single system
+    (possibly with multiple rhs immediately available), simply call solve
+    and pass #DECOMP_SVD there. It does absolutely the same thing.
+      */
+    void backSubst( InputArray rhs, OutputArray dst ) const;
+
+    /** @todo document */
+    template<typename _Tp, int m, int n, int nm> static
+    void compute( const Matx<_Tp, m, n>& a, Matx<_Tp, nm, 1>& w, Matx<_Tp, m, nm>& u, Matx<_Tp, n, nm>& vt );
+
+    /** @todo document */
+    template<typename _Tp, int m, int n, int nm> static
+    void compute( const Matx<_Tp, m, n>& a, Matx<_Tp, nm, 1>& w );
+
+    /** @todo document */
+    template<typename _Tp, int m, int n, int nm, int nb> static
+    void backSubst( const Matx<_Tp, nm, 1>& w, const Matx<_Tp, m, nm>& u, const Matx<_Tp, n, nm>& vt, const Matx<_Tp, m, nb>& rhs, Matx<_Tp, n, nb>& dst );
+
+    Mat u, w, vt;
+};
+
+/** @brief Random Number Generator
+
+Random number generator. It encapsulates the state (currently, a 64-bit
+integer) and has methods to return scalar random values and to fill
+arrays with random values. Currently it supports uniform and Gaussian
+(normal) distributions. The generator uses Multiply-With-Carry
+algorithm, introduced by G. Marsaglia (
+<http://en.wikipedia.org/wiki/Multiply-with-carry> ).
+Gaussian-distribution random numbers are generated using the Ziggurat
+algorithm ( <http://en.wikipedia.org/wiki/Ziggurat_algorithm> ),
+introduced by G. Marsaglia and W. W. Tsang.
+*/
+class CV_EXPORTS RNG
+{
+public:
+    enum { UNIFORM = 0,
+           NORMAL  = 1
+         };
+
+    /** @brief constructor
+
+    These are the RNG constructors. The first form sets the state to some
+    pre-defined value, equal to 2\*\*32-1 in the current implementation. The
+    second form sets the state to the specified value. If you pass state=0,
+    the constructor uses the above default value instead to avoid the
+    singular random number sequence, consisting of all zeros.
+    */
+    RNG();
+    /** @overload
+    @param state 64-bit value used to initialize the RNG.
+    */
+    RNG(uint64 state);
+    /** The method updates the state using the MWC algorithm and returns the
+    next 32-bit random number.*/
+    unsigned next();
+
+    /** Each of the methods updates the state using the MWC algorithm and
+    returns the next random number of the specified type. In case of integer
+    types, the returned number is from the available value range for the
+    specified type. In case of floating-point types, the returned value is
+    from [0,1) range.
+    */
+    operator uchar();
+    /** @overload */
+    operator schar();
+    /** @overload */
+    operator ushort();
+    /** @overload */
+    operator short();
+    /** @overload */
+    operator unsigned();
+    /** @overload */
+    operator int();
+    /** @overload */
+    operator float();
+    /** @overload */
+    operator double();
+
+    /** @brief returns a random integer sampled uniformly from [0, N).
+
+    The methods transform the state using the MWC algorithm and return the
+    next random number. The first form is equivalent to RNG::next . The
+    second form returns the random number modulo N, which means that the
+    result is in the range [0, N) .
+    */
+    unsigned operator ()();
+    /** @overload
+    @param N upper non-inclusive boundary of the returned random number.
+    */
+    unsigned operator ()(unsigned N);
+
+    /** @brief returns uniformly distributed integer random number from [a,b) range
+
+    The methods transform the state using the MWC algorithm and return the
+    next uniformly-distributed random number of the specified type, deduced
+    from the input parameter type, from the range [a, b) . There is a nuance
+    illustrated by the following sample:
+
+    @code{.cpp}
+    RNG rng;
+
+    // always produces 0
+    double a = rng.uniform(0, 1);
+
+    // produces double from [0, 1)
+    double a1 = rng.uniform((double)0, (double)1);
+
+    // produces float from [0, 1)
+    float b = rng.uniform(0.f, 1.f);
+
+    // produces double from [0, 1)
+    double c = rng.uniform(0., 1.);
+
+    // may cause compiler error because of ambiguity:
+    //  RNG::uniform(0, (int)0.999999)? or RNG::uniform((double)0, 0.99999)?
+    double d = rng.uniform(0, 0.999999);
+    @endcode
+
+    The compiler does not take into account the type of the variable to
+    which you assign the result of RNG::uniform . The only thing that
+    matters to the compiler is the type of a and b parameters. So, if you
+    want a floating-point random number, but the range boundaries are
+    integer numbers, either put dots in the end, if they are constants, or
+    use explicit type cast operators, as in the a1 initialization above.
+    @param a lower inclusive boundary of the returned random number.
+    @param b upper non-inclusive boundary of the returned random number.
+    */
+    int uniform(int a, int b);
+    /** @overload */
+    float uniform(float a, float b);
+    /** @overload */
+    double uniform(double a, double b);
+
+    /** @brief Fills arrays with random numbers.
+
+    @param mat 2D or N-dimensional matrix; currently matrices with more than
+    4 channels are not supported by the methods, use Mat::reshape as a
+    possible workaround.
+    @param distType distribution type, RNG::UNIFORM or RNG::NORMAL.
+    @param a first distribution parameter; in case of the uniform
+    distribution, this is an inclusive lower boundary, in case of the normal
+    distribution, this is a mean value.
+    @param b second distribution parameter; in case of the uniform
+    distribution, this is a non-inclusive upper boundary, in case of the
+    normal distribution, this is a standard deviation (diagonal of the
+    standard deviation matrix or the full standard deviation matrix).
+    @param saturateRange pre-saturation flag; for uniform distribution only;
+    if true, the method will first convert a and b to the acceptable value
+    range (according to the mat datatype) and then will generate uniformly
+    distributed random numbers within the range [saturate(a), saturate(b)),
+    if saturateRange=false, the method will generate uniformly distributed
+    random numbers in the original range [a, b) and then will saturate them,
+    it means, for example, that
+    <tt>theRNG().fill(mat_8u, RNG::UNIFORM, -DBL_MAX, DBL_MAX)</tt> will likely
+    produce an array mostly filled with 0's and 255's, since the range (0, 255)
+    is significantly smaller than [-DBL_MAX, DBL_MAX).
+
+    Each of the methods fills the matrix with the random values from the
+    specified distribution. As the new numbers are generated, the RNG state
+    is updated accordingly. In case of multiple-channel images, every
+    channel is filled independently, which means that RNG cannot generate
+    samples from the multi-dimensional Gaussian distribution with
+    non-diagonal covariance matrix directly. To do that, the method
+    generates samples from multi-dimensional standard Gaussian distribution
+    with zero mean and identity covariance matrix, and then transforms them
+    using transform to get samples from the specified Gaussian distribution.
+    */
+    void fill( InputOutputArray mat, int distType, InputArray a, InputArray b, bool saturateRange = false );
+
+    /** @brief Returns the next random number sampled from the Gaussian distribution
+    @param sigma standard deviation of the distribution.
+
+    The method transforms the state using the MWC algorithm and returns the
+    next random number from the Gaussian distribution N(0,sigma) . That is,
+    the mean value of the returned random numbers is zero and the standard
+    deviation is the specified sigma .
+    */
+    double gaussian(double sigma);
+
+    uint64 state; //!< current generator state; the constructors set it and the generating methods update it
+
+    bool operator ==(const RNG& other) const;
+};
+
+/** @brief Mersenne Twister random number generator
+
+Inspired by http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/CODES/mt19937ar.c
+@todo document
+*/
+class CV_EXPORTS RNG_MT19937
+{
+public:
+    RNG_MT19937();
+    RNG_MT19937(unsigned s);
+    void seed(unsigned s);
+
+    unsigned next();
+
+    operator int();
+    operator unsigned();
+    operator float();
+    operator double();
+
+    unsigned operator ()(unsigned N);
+    unsigned operator ()();
+
+    /** @brief returns uniformly distributed integer random number from [a,b) range */
+    int uniform(int a, int b);
+    /** @brief returns uniformly distributed floating-point random number from [a,b) range */
+    float uniform(float a, float b);
+    /** @brief returns uniformly distributed double-precision floating-point random number from [a,b) range */
+    double uniform(double a, double b);
+
+private:
+    enum PeriodParameters {N = 624, M = 397}; //!< N is the length of the state array; NOTE(review): M is presumably the MT19937 middle-word offset - confirm
+    unsigned state[N];
+    int mti; //!< NOTE(review): presumably the current position within state - confirm against the implementation
+};
+
+//! @} core_array
+
+//! @addtogroup core_cluster
+//!  @{
+
+/** @example samples/cpp/kmeans.cpp
+An example on K-means clustering
+*/
+
+/** @brief Finds centers of clusters and groups input samples around the clusters.
+
+The function kmeans implements a k-means algorithm that finds the centers of K clusters
+and groups the input samples around the clusters. As an output, \f$\texttt{bestLabels}_i\f$ contains a
+0-based cluster index for the sample stored in the \f$i^{th}\f$ row of the samples matrix.
+
+@note
+-   (Python) An example on K-means clustering can be found at
+    opencv_source_code/samples/python/kmeans.py
+@param data Data for clustering. An array of N-Dimensional points with float coordinates is needed.
+Examples of this array can be:
+-   Mat points(count, 2, CV_32F);
+-   Mat points(count, 1, CV_32FC2);
+-   Mat points(1, count, CV_32FC2);
+-   std::vector\<cv::Point2f\> points(sampleCount);
+@param K Number of clusters to split the set by.
+@param bestLabels Input/output integer array that stores the cluster indices for every sample.
+@param criteria The algorithm termination criteria, that is, the maximum number of iterations and/or
+the desired accuracy. The accuracy is specified as criteria.epsilon. As soon as each of the cluster
+centers moves by less than criteria.epsilon on some iteration, the algorithm stops.
+@param attempts Number of times the algorithm is executed using different
+initial labellings. The algorithm returns the labels that yield the best compactness (see the last
+function parameter).
+@param flags Flag that can take values of cv::KmeansFlags
+@param centers Output matrix of the cluster centers, one row per each cluster center.
+@return The function returns the compactness measure that is computed as
+\f[\sum _i  \| \texttt{samples} _i -  \texttt{centers} _{ \texttt{labels} _i} \| ^2\f]
+after every attempt. The best (minimum) value is chosen and the corresponding labels and the
+compactness value are returned by the function. Basically, you can use only the core of the
+function, set the number of attempts to 1, initialize labels each time using a custom algorithm,
+pass them with the ( flags = #KMEANS_USE_INITIAL_LABELS ) flag, and then choose the best
+(most-compact) clustering.
+*/
+CV_EXPORTS_W double kmeans( InputArray data, int K, InputOutputArray bestLabels,
+                            TermCriteria criteria, int attempts,
+                            int flags, OutputArray centers = noArray() );
+
+//! @} core_cluster
+
+//! @addtogroup core_basic
+//! @{
+
+/////////////////////////////// Formatted output of cv::Mat ///////////////////////////
+
+/** @brief Abstract source of text chunks, as returned by Formatter::format. @todo document */
+class CV_EXPORTS Formatted
+{
+public:
+    virtual const char* next() = 0; //!< next text chunk; the String operator<< in this header stops at a null return
+    virtual void reset() = 0; //!< called by the String operator<< in this header before the first next()
+    virtual ~Formatted();
+};
+
+/** @brief Abstract interface whose format() turns a Mat into a Formatted object. @todo document */
+class CV_EXPORTS Formatter
+{
+public:
+    enum FormatType {
+           FMT_DEFAULT = 0,
+           FMT_MATLAB  = 1,
+           FMT_CSV     = 2,
+           FMT_PYTHON  = 3,
+           FMT_NUMPY   = 4,
+           FMT_C       = 5
+         };
+
+    virtual ~Formatter();
+
+    virtual Ptr<Formatted> format(const Mat& mtx) const = 0; //!< produces the Formatted object for mtx
+
+    virtual void set16fPrecision(int p = 4) = 0;
+    virtual void set32fPrecision(int p = 8) = 0;
+    virtual void set64fPrecision(int p = 16) = 0;
+    virtual void setMultiline(bool ml = true) = 0;
+
+    static Ptr<Formatter> get(Formatter::FormatType fmt = FMT_DEFAULT); //!< returns a formatter for fmt (FMT_DEFAULT when omitted)
+
+};
+
+static inline
+String& operator << (String& out, Ptr<Formatted> fmtd)
+{
+    fmtd->reset();
+    for(const char* str = fmtd->next(); str; str = fmtd->next()) // append every chunk until next() returns null
+        out += cv::String(str);
+    return out;
+}
+
+static inline
+String& operator << (String& out, const Mat& mtx)
+{
+    return out << Formatter::get()->format(mtx); // get() with no argument selects FMT_DEFAULT; the result is streamed via the Ptr<Formatted> overload
+}
+
+//////////////////////////////////////// Algorithm ////////////////////////////////////
+
+class CV_EXPORTS Algorithm;
+
+template<typename _Tp, typename _EnumTp = void> struct ParamType {}; // empty primary template; NOTE(review): specializations presumably live elsewhere - confirm
+
+
+/** @brief This is a base class for all more or less complex algorithms in OpenCV
+
+especially for classes of algorithms, for which there can be multiple implementations. The examples
+are stereo correspondence (for which there are algorithms like block matching, semi-global block
+matching, graph-cut etc.), background subtraction (which can be done using mixture-of-gaussians
+models, codebook-based algorithm etc.), optical flow (block matching, Lucas-Kanade, Horn-Schunck
+etc.).
+
+Here is example of SimpleBlobDetector use in your application via Algorithm interface:
+@snippet snippets/core_various.cpp Algorithm
+*/
+class CV_EXPORTS_W Algorithm
+{
+public:
+    Algorithm();
+    virtual ~Algorithm();
+
+    /** @brief Clears the algorithm state (the base implementation does nothing)
+    */
+    CV_WRAP virtual void clear() {}
+
+    /** @brief Stores algorithm parameters in a file storage (the base implementation ignores @p fs)
+    */
+    CV_WRAP virtual void write(FileStorage& fs) const { CV_UNUSED(fs); }
+
+    /**
+    * @overload
+    */
+    CV_WRAP void write(FileStorage& fs, const String& name) const;
+#if CV_VERSION_MAJOR < 5
+    /** @deprecated */
+    void write(const Ptr<FileStorage>& fs, const String& name = String()) const;
+#endif
+
+    /** @brief Reads algorithm parameters from a file storage (the base implementation ignores @p fn)
+    */
+    CV_WRAP virtual void read(const FileNode& fn) { CV_UNUSED(fn); }
+
+    /** @brief Returns true if the Algorithm is empty (e.g. in the very beginning or after unsuccessful read)
+    */
+    CV_WRAP virtual bool empty() const { return false; }
+
+    /** @brief Reads algorithm from the file node
+
+    This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
+    @code
+    cv::FileStorage fsRead("example.xml", FileStorage::READ);
+    Ptr<SVM> svm = Algorithm::read<SVM>(fsRead.root());
+    @endcode
+    In order to make this method work, the derived class must override Algorithm::read(const
+    FileNode& fn) and also have a static create() method without parameters
+    (or with all the optional parameters)
+    */
+    template<typename _Tp> static Ptr<_Tp> read(const FileNode& fn)
+    {
+        Ptr<_Tp> obj = _Tp::create();
+        obj->read(fn);
+        return !obj->empty() ? obj : Ptr<_Tp>();
+    }
+
+    /** @brief Loads algorithm from the file
+
+    @param filename Name of the file to read.
+    @param objname The optional name of the node to read (if empty, the first top-level node will be used)
+
+    This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
+    @code
+    Ptr<SVM> svm = Algorithm::load<SVM>("my_svm_model.xml");
+    @endcode
+    In order to make this method work, the derived class must override Algorithm::read(const
+    FileNode& fn).
+    */
+    template<typename _Tp> static Ptr<_Tp> load(const String& filename, const String& objname=String())
+    {
+        FileStorage fs(filename, FileStorage::READ);
+        CV_Assert(fs.isOpened());
+        FileNode fn = objname.empty() ? fs.getFirstTopLevelNode() : fs[objname];
+        if (fn.empty()) return Ptr<_Tp>();
+        Ptr<_Tp> obj = _Tp::create();
+        obj->read(fn);
+        return !obj->empty() ? obj : Ptr<_Tp>();
+    }
+
+    /** @brief Loads algorithm from a String
+    @param strModel The string variable containing the model you want to load.
+    @param objname The optional name of the node to read (if empty, the first top-level node will be used)
+
+    This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
+    @code
+    Ptr<SVM> svm = Algorithm::loadFromString<SVM>(myStringModel);
+    @endcode
+    */
+    template<typename _Tp> static Ptr<_Tp> loadFromString(const String& strModel, const String& objname=String())
+    {
+        FileStorage fs(strModel, FileStorage::READ + FileStorage::MEMORY);
+        FileNode fn = objname.empty() ? fs.getFirstTopLevelNode() : fs[objname];
+        if (fn.empty()) return Ptr<_Tp>(); // same guard as load(): an empty node yields an empty Ptr
+        Ptr<_Tp> obj = _Tp::create();
+        obj->read(fn);
+        return !obj->empty() ? obj : Ptr<_Tp>();
+    }
+
+    /** Saves the algorithm to a file.
+    In order to make this method work, the derived class must implement Algorithm::write(FileStorage& fs). */
+    CV_WRAP virtual void save(const String& filename) const;
+
+    /** Returns the algorithm string identifier.
+    This string is used as top level xml/yml node tag when the object is saved to a file or string. */
+    CV_WRAP virtual String getDefaultName() const;
+
+protected:
+    void writeFormat(FileStorage& fs) const;
+};
+//! Type tags used as ParamType<...>::type by the specializations below (the value 10 is not assigned).
+enum struct Param {
+    INT=0, BOOLEAN=1, REAL=2, STRING=3, MAT=4, MAT_VECTOR=5, ALGORITHM=6, FLOAT=7,
+    UNSIGNED_INT=8, UINT64=9, UCHAR=11, SCALAR=12
+};
+
+
+//! Explicit specializations: the const_param_type / member_type aliases and the Param tag of each type.
+template<> struct ParamType<bool>
+{
+    using const_param_type = bool;
+    using member_type = bool;
+
+    static const Param type = Param::BOOLEAN;
+};
+
+template<> struct ParamType<int>
+{
+    using const_param_type = int;
+    using member_type = int;
+
+    static const Param type = Param::INT;
+};
+
+template<> struct ParamType<double>
+{
+    using const_param_type = double;
+    using member_type = double;
+
+    static const Param type = Param::REAL;
+};
+
+template<> struct ParamType<String>
+{
+    using const_param_type = const String&;
+    using member_type = String;
+
+    static const Param type = Param::STRING;
+};
+
+template<> struct ParamType<Mat>
+{
+    using const_param_type = const Mat&;
+    using member_type = Mat;
+
+    static const Param type = Param::MAT;
+};
+
+template<> struct ParamType<std::vector<Mat> >
+{
+    using const_param_type = const std::vector<Mat>&;
+    using member_type = std::vector<Mat>;
+
+    static const Param type = Param::MAT_VECTOR;
+};
+
+template<> struct ParamType<Algorithm>
+{
+    using const_param_type = const Ptr<Algorithm>&;
+    using member_type = Ptr<Algorithm>;
+
+    static const Param type = Param::ALGORITHM;
+};
+
+template<> struct ParamType<float>
+{
+    using const_param_type = float;
+    using member_type = float;
+
+    static const Param type = Param::FLOAT;
+};
+
+template<> struct ParamType<unsigned>
+{
+    using const_param_type = unsigned;
+    using member_type = unsigned;
+
+    static const Param type = Param::UNSIGNED_INT;
+};
+
+template<> struct ParamType<uint64>
+{
+    using const_param_type = uint64;
+    using member_type = uint64;
+
+    static const Param type = Param::UINT64;
+};
+
+template<> struct ParamType<uchar>
+{
+    using const_param_type = uchar;
+    using member_type = uchar;
+
+    static const Param type = Param::UCHAR;
+};
+
+template<> struct ParamType<Scalar>
+{
+    using const_param_type = const Scalar&;
+    using member_type = Scalar;
+
+    static const Param type = Param::SCALAR;
+};
+
+template<typename _Tp>
+struct ParamType<_Tp, typename std::enable_if< std::is_enum<_Tp>::value >::type>
+{
+    using const_param_type = typename std::underlying_type<_Tp>::type;
+    using member_type = const_param_type;
+    // Every enum type is tagged Param::INT, whatever its underlying type is.
+    static const Param type = Param::INT;
+};
+
+//! @} core_basic
+
+} //namespace cv
+
+#include "opencv2/core/operations.hpp"
+#include "opencv2/core/cvstd.inl.hpp"
+#include "opencv2/core/utility.hpp"
+#include "opencv2/core/optim.hpp"
+#include "opencv2/core/ovx.hpp"
+
+#endif /*OPENCV_CORE_HPP*/
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/affine.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/affine.hpp
new file mode 100644
index 000000000000..1806382e99ae
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/affine.hpp
@@ -0,0 +1,678 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_AFFINE3_HPP
+#define OPENCV_CORE_AFFINE3_HPP
+
+#ifdef __cplusplus
+
+#include <opencv2/core.hpp>
+
+namespace cv
+{
+
+//! @addtogroup core
+//! @{
+
+    /** @brief Affine transform
+     *
+     * It represents a 4x4 homogeneous transformation matrix \f$T\f$
+     *
+     *  \f[T =
+     *  \begin{bmatrix}
+     *  R & t\\
+     *  0 & 1\\
+     *  \end{bmatrix}
+     *  \f]
+     *
+     *  where \f$R\f$ is a 3x3 rotation matrix and \f$t\f$ is a 3x1 translation vector.
+     *
+     *  You can specify \f$R\f$ either by a 3x3 rotation matrix or by a 3x1 rotation vector,
+     *  which is converted to a 3x3 rotation matrix by the Rodrigues formula.
+     *
+     *  To construct a matrix \f$T\f$ representing first rotation around the axis \f$r\f$ with rotation
+     *  angle \f$|r|\f$ in radian (right hand rule) and then translation by the vector \f$t\f$, you can use
+     *
+     *  @code
+     *  cv::Vec3f r, t;
+     *  cv::Affine3f T(r, t);
+     *  @endcode
+     *
+     *  If you already have the rotation matrix \f$R\f$, then you can use
+     *
+     *  @code
+     *  cv::Matx33f R;
+     *  cv::Affine3f T(R, t);
+     *  @endcode
+     *
+     *  To extract the rotation matrix \f$R\f$ from \f$T\f$, use
+     *
+     *  @code
+     *  cv::Matx33f R = T.rotation();
+     *  @endcode
+     *
+     *  To extract the translation vector \f$t\f$ from \f$T\f$, use
+     *
+     *  @code
+     *  cv::Vec3f t = T.translation();
+     *  @endcode
+     *
+     *  To extract the rotation vector \f$r\f$ from \f$T\f$, use
+     *
+     *  @code
+     *  cv::Vec3f r = T.rvec();
+     *  @endcode
+     *
+     *  Note that the mapping from rotation vectors to rotation matrices
+     *  is many to one, so the returned rotation vector is not necessarily the one
+     *  you used before to set the matrix.
+     *
+     *  If you have two transformations \f$T = T_1 * T_2\f$, use
+     *
+     *  @code
+     *  cv::Affine3f T, T1, T2;
+     *  T = T2.concatenate(T1);
+     *  @endcode
+     *
+     *  To get the inverse transform of \f$T\f$, use
+     *
+     *  @code
+     *  cv::Affine3f T, T_inv;
+     *  T_inv = T.inv();
+     *  @endcode
+     *
+     */
+    template<typename T>
+    class Affine3
+    {
+    public:
+        typedef T float_type;
+        typedef Matx<float_type, 3, 3> Mat3;
+        typedef Matx<float_type, 4, 4> Mat4;
+        typedef Vec<float_type, 3> Vec3;
+
+        //! Default constructor. It represents a 4x4 identity matrix.
+        Affine3();
+
+        //! Constructs from a 4x4 augmented affine matrix (stored as given).
+        Affine3(const Mat4& affine);
+
+        /**
+         *  The resulting 4x4 matrix is
+         *
+         *  \f[
+         *  \begin{bmatrix}
+         *  R & t\\
+         *  0 & 1\\
+         *  \end{bmatrix}
+         *  \f]
+         *
+         * @param R 3x3 rotation matrix.
+         * @param t 3x1 translation vector.
+         */
+        Affine3(const Mat3& R, const Vec3& t = Vec3::all(0));
+
+        /**
+         * Rodrigues vector.
+         *
+         * The last row of the current matrix is set to [0,0,0,1].
+         *
+         * @param rvec 3x1 rotation vector. Its direction indicates the rotation axis and its length
+         *             indicates the rotation angle in radian (using right hand rule).
+         * @param t 3x1 translation vector.
+         */
+        Affine3(const Vec3& rvec, const Vec3& t = Vec3::all(0));
+
+        /**
+         * Combines all constructors above. Supports 4x4, 3x4, 3x3, 1x3, 3x1 sizes of data matrix.
+         *
+         * The last row of the current matrix is set to [0,0,0,1] when data is not 4x4.
+         *
+         * @param data 1-channel matrix.
+         *             When it is 4x4, it is copied to the current matrix and t is not used.
+         *             When it is 3x4, it is copied to the upper part 3x4 of the current matrix and t is not used.
+         *             When it is 3x3, it is copied to the upper left 3x3 part of the current matrix.
+         *             When it is 3x1 or 1x3, it is treated as a rotation vector and the Rodrigues formula is used
+         *                             to compute a 3x3 rotation matrix.
+         * @param t 3x1 translation vector. It is used only when data is neither 4x4 nor 3x4.
+         */
+        explicit Affine3(const Mat& data, const Vec3& t = Vec3::all(0));
+
+        //! From a 16-element array (the pointer is passed to the Mat4 constructor)
+        explicit Affine3(const float_type* vals);
+
+        //! Create a 4x4 identity transform
+        static Affine3 Identity();
+
+        /**
+         * Rotation matrix.
+         *
+         * Copy the rotation matrix to the upper left 3x3 part of the current matrix.
+         * The remaining elements of the current matrix are not changed.
+         *
+         * @param R 3x3 rotation matrix.
+         *
+         */
+        void rotation(const Mat3& R);
+
+        /**
+         * Rodrigues vector.
+         *
+         * It sets the upper left 3x3 part of the matrix. The remaining part is unaffected.
+         *
+         * @param rvec 3x1 rotation vector. The direction indicates the rotation axis and
+         *             its length indicates the rotation angle in radian (using the right thumb convention).
+         */
+        void rotation(const Vec3& rvec);
+
+        /**
+         * Combines rotation methods above. Supports 3x3, 1x3, 3x1 sizes of data matrix.
+         *
+         * It sets the upper left 3x3 part of the matrix. The remaining part is unaffected.
+         *
+         * @param data 1-channel matrix.
+         *             When it is a 3x3 matrix, it sets the upper left 3x3 part of the current matrix.
+         *             When it is a 1x3 or 3x1 matrix, it is used as a rotation vector. The Rodrigues formula
+         *             is used to compute the rotation matrix and sets the upper left 3x3 part of the current matrix.
+         */
+        void rotation(const Mat& data);
+
+        /**
+         * Copy the 3x3 matrix L to the upper left part of the current matrix
+         *
+         * It sets the upper left 3x3 part of the matrix. The remaining part is unaffected.
+         *
+         * @param L 3x3 matrix.
+         */
+        void linear(const Mat3& L);
+
+        /**
+         * Copy t to the first three elements of the last column of the current matrix
+         *
+         * It sets the upper right 3x1 part of the matrix. The remaining part is unaffected.
+         *
+         * @param t 3x1 translation vector.
+         */
+        void translation(const Vec3& t);
+
+        //! @return the upper left 3x3 part (implemented as a call to linear())
+        Mat3 rotation() const;
+
+        //! @return the upper left 3x3 part
+        Mat3 linear() const;
+
+        //! @return the upper right 3x1 part
+        Vec3 translation() const;
+
+        //! Rodrigues vector.
+        //! @return a vector representing the upper left 3x3 rotation matrix of the current matrix.
+        //! @warning  Since the mapping between rotation vectors and rotation matrices is many to one,
+        //!           this function returns only one rotation vector that represents the current rotation matrix,
+        //!           which is not necessarily the same one set by `rotation(const Vec3& rvec)`.
+        Vec3 rvec() const;
+
+        //! @return the inverse of the current matrix, computed as matrix.inv(method).
+        Affine3 inv(int method = cv::DECOMP_SVD) const;
+
+        //! a.rotate(R) is equivalent to Affine(R, 0) * a;
+        Affine3 rotate(const Mat3& R) const;
+
+        //! a.rotate(rvec) is equivalent to Affine(rvec, 0) * a;
+        Affine3 rotate(const Vec3& rvec) const;
+
+        //! a.translate(t) is equivalent to Affine(E, t) * a, where E is an identity matrix
+        Affine3 translate(const Vec3& t) const;
+
+        //! a.concatenate(affine) is equivalent to affine * a;
+        Affine3 concatenate(const Affine3& affine) const;
+
+        template <typename Y> operator Affine3<Y>() const;  //!< conversion to another element type Y
+
+        template <typename Y> Affine3<Y> cast() const;  //!< explicit form of the conversion above
+
+        Mat4 matrix;  //!< the 4x4 transform; its elements are accessed as matrix.val[0..15]
+
+#if defined EIGEN_WORLD_VERSION && defined EIGEN_GEOMETRY_MODULE_H  // Eigen interop, only if these macros are already defined
+        Affine3(const Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)>& affine);
+        Affine3(const Eigen::Transform<T, 3, Eigen::Affine>& affine);
+        operator Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)>() const;
+        operator Eigen::Transform<T, 3, Eigen::Affine>() const;
+#endif
+    };
+
+    template<typename T> static
+    Affine3<T> operator*(const Affine3<T>& affine1, const Affine3<T>& affine2);
+
+    //! V is a 3-element vector with member fields x, y and z
+    template<typename T, typename V> static
+    V operator*(const Affine3<T>& affine, const V& vector);
+
+    typedef Affine3<float> Affine3f;
+    typedef Affine3<double> Affine3d;
+
+    static Vec3f operator*(const Affine3f& affine, const Vec3f& vector);
+    static Vec3d operator*(const Affine3d& affine, const Vec3d& vector);
+    //! DataType description of Affine3: channel_type is the element type and channels is 16.
+    template<typename _Tp> class DataType< Affine3<_Tp> >
+    {
+    public:
+        typedef Affine3<_Tp>                               value_type;
+        typedef Affine3<typename DataType<_Tp>::work_type> work_type;
+        typedef _Tp                                        channel_type;
+
+        enum { generic_type = 0,
+               channels     = 16,
+               fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+               ,depth        = DataType<channel_type>::depth
+               ,type         = CV_MAKETYPE(depth, channels)
+#endif
+             };
+
+        typedef Vec<channel_type, channels> vec_type;
+    };
+    //! traits::Depth / traits::Type for Affine3: the depth of the element type, with 16 channels.
+    namespace traits {
+    template<typename _Tp>
+    struct Depth< Affine3<_Tp> > { enum { value = Depth<_Tp>::value }; };
+    template<typename _Tp>
+    struct Type< Affine3<_Tp> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, 16) }; };
+    } // namespace
+
+//! @} core
+
+}
+
+//! @cond IGNORED
+
+///////////////////////////////////////////////////////////////////////////////////
+// Implementation
+
+// Default state: the 4x4 identity matrix.
+template<typename T> inline cv::Affine3<T>::Affine3()
+    : matrix(Mat4::eye())
+{}
+
+// Wraps an already assembled 4x4 matrix; it is stored as given.
+template<typename T> inline cv::Affine3<T>::Affine3(const Mat4& affine)
+    : matrix(affine)
+{}
+
+// Builds [R t; 0 0 0 1] from a 3x3 block and a translation vector.
+template<typename T> inline cv::Affine3<T>::Affine3(const Mat3& R, const Vec3& t)
+{
+    matrix.val[12] = matrix.val[13] = matrix.val[14] = 0;  // bottom row ...
+    matrix.val[15] = 1;                                    // ... becomes [0 0 0 1]
+    rotation(R);      // fills the upper-left 3x3 part
+    translation(t);   // fills the upper-right 3x1 part
+}
+
+// Builds the transform from a rotation (Rodrigues) vector and a translation vector.
+template<typename T> inline cv::Affine3<T>::Affine3(const Vec3& _rvec, const Vec3& t)
+{
+    matrix.val[12] = matrix.val[13] = matrix.val[14] = 0;  // bottom row ...
+    matrix.val[15] = 1;                                    // ... becomes [0 0 0 1]
+    rotation(_rvec);  // rotation vector -> upper-left 3x3 part
+    translation(t);   // fills the upper-right 3x1 part
+}
+
+// Dispatches on the shape of a 1-channel matrix: 4x4, 3x4, or whatever rotation(const Mat&) accepts.
+template<typename T> inline cv::Affine3<T>::Affine3(const cv::Mat& data, const Vec3& t)
+{
+    CV_Assert(data.type() == cv::traits::Type<T>::value);
+    CV_Assert(data.channels() == 1);
+
+    const bool fourCols = (data.cols == 4);
+    if (fourCols && data.rows == 4)
+    {
+        data.copyTo(matrix);  // complete matrix supplied: copied as is, t is not used
+        return;
+    }
+    if (fourCols && data.rows == 3)
+    {
+        rotation(data(Rect(0, 0, 3, 3)));     // left 3x3 block
+        translation(data(Rect(3, 0, 1, 3)));  // last column, t is not used
+    }
+    else
+    {
+        rotation(data);
+        translation(t);
+    }
+    matrix.val[12] = matrix.val[13] = matrix.val[14] = 0;
+    matrix.val[15] = 1;
+}
+
+// Forwards the raw value array to the Mat4 constructor.
+template<typename T> inline cv::Affine3<T>::Affine3(const float_type* vals)
+    : matrix(vals) {}
+
+// Returns a transform whose matrix is the 4x4 identity.
+template<typename T> inline cv::Affine3<T> cv::Affine3<T>::Identity()
+{
+    return Affine3<T>(Mat4::eye());
+}
+
+// Setter alias of linear(): writes R into the upper-left 3x3 block.
+template<typename T> inline void cv::Affine3<T>::rotation(const Mat3& R)
+{
+    linear(R);
+}
+
+// Rodrigues formula: turns a rotation vector into the upper-left 3x3 rotation block.
+template<typename T> inline void cv::Affine3<T>::rotation(const Vec3& _rvec)
+{
+    const double theta = norm(_rvec);  // rotation angle = length of the vector
+
+    if (theta < DBL_EPSILON)
+    {
+        rotation(Mat3::eye());  // angle below DBL_EPSILON: identity rotation
+        return;
+    }
+
+    const double c = std::cos(theta);
+    const double s = std::sin(theta);
+    const double c1 = 1. - c;
+    const double itheta = (theta != 0) ? 1./theta : 0.;
+
+    Point3_<T> r = _rvec*itheta;  // rotation axis scaled by 1/theta
+
+    Mat3 rrt( r.x*r.x, r.x*r.y, r.x*r.z, r.x*r.y, r.y*r.y, r.y*r.z, r.x*r.z, r.y*r.z, r.z*r.z );
+    Mat3 r_x( 0, -r.z, r.y, r.z, 0, -r.x, -r.y, r.x, 0 );
+
+    // R = cos(theta)*I + (1 - cos(theta))*r*rT + sin(theta)*[r_x]
+    // where [r_x] is [0 -rz ry; rz 0 -rx; -ry rx 0]
+    Mat3 R = c*Mat3::eye() + c1*rrt + s*r_x;
+    rotation(R);
+}
+
+// Sets the rotation block from a 1-channel Mat: 3x3 is used as is, 1x3 / 3x1 is a rotation vector.
+template<typename T> inline void cv::Affine3<T>::rotation(const cv::Mat& data)
+{
+    CV_Assert(data.type() == cv::traits::Type<T>::value);
+    CV_Assert(data.channels() == 1);
+
+    const bool isVector = (data.cols == 3 && data.rows == 1) || (data.cols == 1 && data.rows == 3);
+    if (data.cols == 3 && data.rows == 3)
+    {
+        Mat3 block;
+        data.copyTo(block);
+        rotation(block);
+    }
+    else if (isVector)
+    {
+        Vec3 axisAngle;
+        data.reshape(1, 3).copyTo(axisAngle);  // both layouts are read as 3 rows
+        rotation(axisAngle);
+    }
+    else
+        CV_Error(Error::StsError, "Input matrix can only be 3x3, 1x3 or 3x1");
+}
+
+// Copies L into the upper-left 3x3 block; the row stride is 4 in matrix and 3 in L.
+template<typename T> inline void cv::Affine3<T>::linear(const Mat3& L)
+{
+    for (int row = 0; row < 3; ++row)
+        for (int col = 0; col < 3; ++col)
+            matrix.val[4*row + col] = L.val[3*row + col];
+}
+
+// Writes t into the first three rows of the last column (elements 3, 7 and 11).
+template<typename T> inline void cv::Affine3<T>::translation(const Vec3& t)
+{
+    for (int row = 0; row < 3; ++row)
+        matrix.val[4*row + 3] = t[row]; }
+
+// Getter alias of linear(): the upper-left 3x3 block.
+template<typename T> inline typename cv::Affine3<T>::Mat3 cv::Affine3<T>::rotation() const
+{
+    return linear();
+}
+
+// Extracts the upper-left 3x3 block of the 4x4 matrix.
+template<typename T> inline typename cv::Affine3<T>::Mat3 cv::Affine3<T>::linear() const
+{
+    Mat3 block;
+    for (int row = 0; row < 3; ++row)
+        for (int col = 0; col < 3; ++col)
+            block.val[3*row + col] = matrix.val[4*row + col];
+    return block;
+}
+
+// Reads the translation back from the last column (elements 3, 7 and 11).
+template<typename T> inline typename cv::Affine3<T>::Vec3 cv::Affine3<T>::translation() const
+{
+    const Mat4& m = matrix;
+    return Vec3(m.val[3], m.val[7], m.val[11]); }
+
+template<typename T> inline
+typename cv::Affine3<T>::Vec3 cv::Affine3<T>::rvec() const
+{   // Converts the upper-left 3x3 block back into a rotation vector; all arithmetic is done in double.
+    cv::Vec3d w;
+    cv::Matx33d u, vt, R = rotation();
+    cv::SVD::compute(R, w, u, vt, cv::SVD::FULL_UV + cv::SVD::MODIFY_A);
+    R = u * vt;  // keep only U*Vt of the decomposition; the singular values in w are not used
+    // Differences of the mirrored off-diagonal entries of R.
+    double rx = R.val[7] - R.val[5];
+    double ry = R.val[2] - R.val[6];
+    double rz = R.val[3] - R.val[1];
+    // s: half the length of (rx, ry, rz); c: (trace(R) - 1) / 2, clamped to [-1, 1] before acos.
+    double s = std::sqrt((rx*rx + ry*ry + rz*rz)*0.25);
+    double c = (R.val[0] + R.val[4] + R.val[8] - 1) * 0.5;
+    c = c > 1.0 ? 1.0 : c < -1.0 ? -1.0 : c;
+    double theta = std::acos(c);
+    // Small s: (rx, ry, rz) is too short to be rescaled, so the two cases below are handled separately.
+    if( s < 1e-5 )
+    {
+        if( c > 0 )
+            rx = ry = rz = 0;  // result is the zero vector
+        else
+        {
+            double t;  // component magnitudes come from the diagonal, signs from R.val[1] and R.val[2]
+            t = (R.val[0] + 1) * 0.5;
+            rx = std::sqrt(std::max(t, 0.0));
+            t = (R.val[4] + 1) * 0.5;
+            ry = std::sqrt(std::max(t, 0.0)) * (R.val[1] < 0 ? -1.0 : 1.0);
+            t = (R.val[8] + 1) * 0.5;
+            rz = std::sqrt(std::max(t, 0.0)) * (R.val[2] < 0 ? -1.0 : 1.0);
+            // When rx is the smallest component, the sign of rz is checked against R.val[5].
+            if( fabs(rx) < fabs(ry) && fabs(rx) < fabs(rz) && (R.val[5] > 0) != (ry*rz > 0) )
+                rz = -rz;
+            theta /= std::sqrt(rx*rx + ry*ry + rz*rz);  // theta now holds angle / current length
+            rx *= theta;
+            ry *= theta;
+            rz *= theta;
+        }
+    }
+    else
+    {
+        double vth = 1/(2*s);  // (rx, ry, rz) has length 2*s ...
+        vth *= theta;          // ... so this factor rescales it to length theta
+        rx *= vth; ry *= vth; rz *= vth;
+    }
+    // The Vec3d below is converted to Vec3 by the return statement.
+    return cv::Vec3d(rx, ry, rz);
+}
+
+template<typename T> inline cv::Affine3<T> cv::Affine3<T>::inv(int method) const
+{
+    const Mat4 inverse = matrix.inv(method);  // the whole 4x4 matrix is inverted
+    return inverse;                           // converted through Affine3(const Mat4&)
+}
+
+// Left-multiplies by a pure rotation: the result is [R*L  R*t; 0 0 0 1].
+template<typename T> inline cv::Affine3<T> cv::Affine3<T>::rotate(const Mat3& R) const
+{
+    const Mat3 Lc = linear();
+    const Vec3 tc = translation();
+
+    Mat4 result;
+    for(int row = 0; row < 3; ++row)
+    {
+        for(int col = 0; col < 3; ++col)
+        {
+            float_type acc = 0;
+            for(int k = 0; k < 3; ++k)
+                acc += R(row, k) * Lc(k, col);
+            result(row, col) = acc;
+        }
+        result(row, 3) = R.row(row).dot(tc.t());  // component of the rotated translation
+    }
+
+    result.val[12] = result.val[13] = result.val[14] = 0;
+    result.val[15] = 1;
+    return result;
+}
+
+// Converts the rotation vector to a matrix in the class's own precision T, then delegates to
+// rotate(const Mat3&). A float-only Affine3f temporary here would round Affine3d input to float.
+template<typename T> inline cv::Affine3<T> cv::Affine3<T>::rotate(const Vec3& _rvec) const
+{
+    return rotate(Affine3<T>(_rvec).rotation()); }
+
+// Returns a copy whose translation column (elements 3, 7 and 11) is shifted by t.
+// All other elements are copied unchanged.
+template<typename T> inline cv::Affine3<T> cv::Affine3<T>::translate(const Vec3& t) const
+{
+    Mat4 shifted = matrix;
+    for (int row = 0; row < 3; ++row)
+        shifted.val[4*row + 3] += t[row];
+    return shifted;
+}
+
+// a.concatenate(b): rotate a by b's 3x3 block, then shift the result by b's translation.
+template<typename T> inline cv::Affine3<T> cv::Affine3<T>::concatenate(const Affine3<T>& affine) const
+{
+    const Affine3<T> rotated = rotate(affine.rotation());
+    return rotated.translate(affine.translation()); }
+
+// Conversion to another element type: a new Affine3<Y> is built from this matrix.
+template<typename T> template <typename Y> inline cv::Affine3<T>::operator Affine3<Y>() const
+{
+    return Affine3<Y>(matrix);
+}
+
+// Named form of the conversion operator: builds an Affine3<Y> from this matrix.
+template<typename T> template <typename Y> inline cv::Affine3<Y> cv::Affine3<T>::cast() const
+{
+    return Affine3<Y>(matrix);
+}
+
+// affine1 * affine2, expressed through concatenate() (a.concatenate(b) is b * a).
+template<typename T> inline cv::Affine3<T> cv::operator*(const cv::Affine3<T>& affine1, const cv::Affine3<T>& affine2)
+{
+    return affine2.concatenate(affine1);
+}
+
+// Applies the transform to any 3-component value that exposes x, y and z members.
+template<typename T, typename V> inline
+V cv::operator*(const cv::Affine3<T>& affine, const V& v)
+{
+    const typename Affine3<T>::Mat4& mat = affine.matrix;
+    const T* const a = mat.val;  // rows of the 4x4 matrix start at indices 0, 4 and 8
+    V out;
+    out.x = a[0] * v.x + a[1] * v.y + a[ 2] * v.z + a[ 3];
+    out.y = a[4] * v.x + a[5] * v.y + a[ 6] * v.z + a[ 7];
+    out.z = a[8] * v.x + a[9] * v.y + a[10] * v.z + a[11];
+    return out; }
+
+// Affine3f * Vec3f: multiplies by the 3x3 block and adds the translation column.
+static inline
+cv::Vec3f cv::operator*(const cv::Affine3f& affine, const cv::Vec3f& v)
+{
+    const cv::Matx44f& m = affine.matrix;
+    cv::Vec3f r;
+    for (int row = 0; row < 3; ++row)
+        r.val[row] = m.val[4*row] * v[0] + m.val[4*row + 1] * v[1] + m.val[4*row + 2] * v[2] + m.val[4*row + 3];
+    return r;
+}
+
+// Affine3d * Vec3d: multiplies by the 3x3 block and adds the translation column.
+static inline
+cv::Vec3d cv::operator*(const cv::Affine3d& affine, const cv::Vec3d& v)
+{
+    const cv::Matx44d& m = affine.matrix;
+    cv::Vec3d r;
+    for (int row = 0; row < 3; ++row)
+        r.val[row] = m.val[4*row] * v[0] + m.val[4*row + 1] * v[1] + m.val[4*row + 2] * v[2] + m.val[4*row + 3];
+    return r;
+}
+
+
+
+#if defined EIGEN_WORLD_VERSION && defined EIGEN_GEOMETRY_MODULE_H
+
+template<typename T> inline  // from a row-major Eigen transform
+cv::Affine3<T>::Affine3(const Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)>& affine)
+{
+    cv::Mat(4, 4, cv::traits::Type<T>::value, affine.matrix().data()).copyTo(matrix);  // 4x4 Mat header over Eigen's data, copied into matrix
+}
+
+template<typename T> inline  // from an Eigen transform with the default storage option
+cv::Affine3<T>::Affine3(const Eigen::Transform<T, 3, Eigen::Affine>& affine)
+{
+    Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)> a = affine;  // converted to row-major storage first
+    cv::Mat(4, 4, cv::traits::Type<T>::value, a.matrix().data()).copyTo(matrix);
+}
+
+template<typename T> inline  // to a row-major Eigen transform
+cv::Affine3<T>::operator Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)>() const
+{
+    Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)> r;
+    cv::Mat hdr(4, 4, cv::traits::Type<T>::value, r.matrix().data());  // Mat header over r's storage
+    cv::Mat(matrix, false).copyTo(hdr);  // the copy therefore lands in r
+    return r;
+}
+
+template<typename T> inline  // to an Eigen transform with the default storage option
+cv::Affine3<T>::operator Eigen::Transform<T, 3, Eigen::Affine>() const
+{
+    return this->operator Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)>();  // reuses the row-major conversion
+}
+
+#endif /* defined EIGEN_WORLD_VERSION && defined EIGEN_GEOMETRY_MODULE_H */
+
+//! @endcond
+
+#endif /* __cplusplus */
+
+#endif /* OPENCV_CORE_AFFINE3_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/async.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/async.hpp
new file mode 100644
index 000000000000..98868a130b23
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/async.hpp
@@ -0,0 +1,101 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_ASYNC_HPP
+#define OPENCV_CORE_ASYNC_HPP
+
+#include <opencv2/core/mat.hpp>
+
+//#include <future>
+#include <chrono>
+
+namespace cv {
+
+/** @addtogroup core_async
+
+@{
+*/
+
+
+/** @brief Returns result of asynchronous operations
+
+Object has attached asynchronous state.
+Assignment operator doesn't clone asynchronous state (it is shared between all instances).
+
+Result can be fetched via get() method only once.
+
+*/
+class CV_EXPORTS_W AsyncArray
+{
+public:
+    ~AsyncArray() CV_NOEXCEPT;
+    CV_WRAP AsyncArray() CV_NOEXCEPT;
+    AsyncArray(const AsyncArray& o) CV_NOEXCEPT;
+    AsyncArray& operator=(const AsyncArray& o) CV_NOEXCEPT;
+    CV_WRAP void release() CV_NOEXCEPT;
+
+    /** Fetch the result.
+    @param[out] dst destination array
+
+    Waits for result until container has valid result.
+    Throws exception if exception was stored as a result.
+
+    Throws exception on invalid container state.
+
+    @note Result or stored exception can be fetched only once.
+    */
+    CV_WRAP void get(OutputArray dst) const;
+
+    /** Retrieving the result with timeout
+    @param[out] dst destination array
+    @param[in] timeoutNs timeout in nanoseconds, -1 for infinite wait
+
+    @returns true if result is ready, false if the timeout has expired
+
+    @note Result or stored exception can be fetched only once.
+    */
+    bool get(OutputArray dst, int64 timeoutNs) const;
+
+    CV_WRAP inline
+    bool get(OutputArray dst, double timeoutNs) const { return get(dst, (int64)timeoutNs); }
+
+    bool wait_for(int64 timeoutNs) const;
+
+    CV_WRAP inline
+    bool wait_for(double timeoutNs) const { return wait_for((int64)timeoutNs); }
+
+    CV_WRAP bool valid() const CV_NOEXCEPT;
+
+    // Move operations: construction takes o's pointer and nulls it, assignment swaps the two pointers.
+    inline AsyncArray(AsyncArray&& o) CV_NOEXCEPT : p(o.p) { o.p = nullptr; }
+    inline AsyncArray& operator=(AsyncArray&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; }
+
+    // std::chrono overloads: any duration is converted (truncating) to whole nanoseconds and forwarded.
+    template<typename _Rep, typename _Period>
+    inline bool get(OutputArray dst, const std::chrono::duration<_Rep, _Period>& timeout) const
+    {
+        return get(dst, (int64)(std::chrono::duration_cast<std::chrono::nanoseconds>(timeout).count()));
+    }
+    template<typename _Rep, typename _Period>
+    inline bool wait_for(const std::chrono::duration<_Rep, _Period>& timeout) const
+    {
+        return wait_for((int64)(std::chrono::duration_cast<std::chrono::nanoseconds>(timeout).count()));
+    }
+
+#if 0
+    std::future<Mat> getFutureMat() const;
+    std::future<UMat> getFutureUMat() const;
+#endif
+
+    // PImpl
+    struct Impl; friend struct Impl;
+    inline void* _getImpl() const CV_NOEXCEPT { return p; }
+protected:
+    Impl* p;
+};
+
+
+//! @}
+} // namespace
+#endif // OPENCV_CORE_ASYNC_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/base.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/base.hpp
new file mode 100644
index 000000000000..cc4cc0ddd25c
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/base.hpp
@@ -0,0 +1,664 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_BASE_HPP
+#define OPENCV_CORE_BASE_HPP
+
+#ifndef __cplusplus
+#  error base.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/opencv_modules.hpp"
+
+#include <climits>
+#include <algorithm>
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/cvstd.hpp"
+
+namespace cv
+{
+
+//! @addtogroup core_utils
+//! @{
+
+namespace Error {
+//! error codes (the `_code` argument of cv::error()); StsOk is 0, every other code is negative
+enum Code {
+    StsOk=                       0,  //!< everything is ok
+    StsBackTrace=               -1,  //!< pseudo error for back trace
+    StsError=                   -2,  //!< unknown/unspecified error
+    StsInternal=                -3,  //!< internal error (bad state)
+    StsNoMem=                   -4,  //!< insufficient memory
+    StsBadArg=                  -5,  //!< function arg/param is bad
+    StsBadFunc=                 -6,  //!< unsupported function
+    StsNoConv=                  -7,  //!< iteration didn't converge
+    StsAutoTrace=               -8,  //!< tracing
+    HeaderIsNull=               -9,  //!< image header is NULL
+    BadImageSize=              -10,  //!< image size is invalid
+    BadOffset=                 -11,  //!< offset is invalid
+    BadDataPtr=                -12,  //!<
+    BadStep=                   -13,  //!< image step is wrong, this may happen for a non-continuous matrix.
+    BadModelOrChSeq=           -14,  //!<
+    BadNumChannels=            -15,  //!< bad number of channels, for example, some functions accept only single channel matrices.
+    BadNumChannel1U=           -16,  //!<
+    BadDepth=                  -17,  //!< input image depth is not supported by the function
+    BadAlphaChannel=           -18,  //!<
+    BadOrder=                  -19,  //!< number of dimensions is out of range
+    BadOrigin=                 -20,  //!< incorrect input origin
+    BadAlign=                  -21,  //!< incorrect input alignment
+    BadCallBack=               -22,  //!<
+    BadTileSize=               -23,  //!<
+    BadCOI=                    -24,  //!< input COI is not supported
+    BadROISize=                -25,  //!< incorrect input roi
+    MaskIsTiled=               -26,  //!<
+    StsNullPtr=                -27,  //!< null pointer
+    StsVecLengthErr=           -28,  //!< incorrect vector length
+    StsFilterStructContentErr= -29,  //!< incorrect filter structure content
+    StsKernelStructContentErr= -30,  //!< incorrect transform kernel content
+    StsFilterOffsetErr=        -31,  //!< incorrect filter offset value
+    StsBadSize=                -201, //!< the input/output structure size is incorrect (numbering jumps from -31 to -201 here)
+    StsDivByZero=              -202, //!< division by zero
+    StsInplaceNotSupported=    -203, //!< in-place operation is not supported
+    StsObjectNotFound=         -204, //!< request can't be completed
+    StsUnmatchedFormats=       -205, //!< formats of input/output arrays differ
+    StsBadFlag=                -206, //!< flag is wrong or not supported
+    StsBadPoint=               -207, //!< bad CvPoint
+    StsBadMask=                -208, //!< bad format of mask (neither 8uC1 nor 8sC1)
+    StsUnmatchedSizes=         -209, //!< sizes of input/output structures do not match
+    StsUnsupportedFormat=      -210, //!< the data format/type is not supported by the function
+    StsOutOfRange=             -211, //!< some of parameters are out of range
+    StsParseError=             -212, //!< invalid syntax/structure of the parsed file
+    StsNotImplemented=         -213, //!< the requested function/feature is not implemented
+    StsBadMemBlock=            -214, //!< an allocated block has been corrupted
+    StsAssert=                 -215, //!< assertion failed
+    GpuNotSupported=           -216, //!< no CUDA support
+    GpuApiCallError=           -217, //!< GPU API call error
+    OpenGlNotSupported=        -218, //!< no OpenGL support
+    OpenGlApiCallError=        -219, //!< OpenGL API call error
+    OpenCLApiCallError=        -220, //!< OpenCL API call error
+    OpenCLDoubleNotSupported=  -221, //!< undocumented upstream; presumably double precision is unavailable in OpenCL - TODO confirm
+    OpenCLInitError=           -222, //!< OpenCL initialization error
+    OpenCLNoAMDBlasFft=        -223  //!< undocumented upstream; presumably the AMD BLAS/FFT OpenCL backend is missing - TODO confirm
+};
+} // namespace Error
+
+//! @} core_utils
+
+//! @addtogroup core_array
+//! @{
+
+//! matrix decomposition types; values 0..4 each select one method, DECOMP_NORMAL (16) is an additional flag
+enum DecompTypes {
+    /** Gaussian elimination with the optimal pivot element chosen. */
+    DECOMP_LU       = 0,
+    /** singular value decomposition (SVD) method; the system can be over-defined and/or the matrix
+    src1 can be singular */
+    DECOMP_SVD      = 1,
+    /** eigenvalue decomposition; the matrix src1 must be symmetric */
+    DECOMP_EIG      = 2,
+    /** Cholesky \f$LL^T\f$ factorization; the matrix src1 must be symmetric and
+    positive-definite */
+    DECOMP_CHOLESKY = 3,
+    /** QR factorization; the system can be over-defined and/or the matrix src1 can be singular */
+    DECOMP_QR       = 4,
+    /** while all the previous flags are mutually exclusive, this flag can be used together with
+    any of the previous; it means that the normal equations
+    \f$\texttt{src1}^T\cdot\texttt{src1}\cdot\texttt{dst}=\texttt{src1}^T\texttt{src2}\f$ are
+    solved instead of the original system
+    \f$\texttt{src1}\cdot\texttt{dst}=\texttt{src2}\f$ */
+    DECOMP_NORMAL   = 16
+};
+
+/** norm types
+
+src1 and src2 denote input arrays; NORM_TYPE_MASK separates the norm type from the NORM_RELATIVE / NORM_MINMAX flags.
+*/
+
+enum NormTypes {
+                /**
+                \f[
+                norm =  \forkthree
+                {\|\texttt{src1}\|_{L_{\infty}} =  \max _I | \texttt{src1} (I)|}{if  \(\texttt{normType} = \texttt{NORM_INF}\) }
+                {\|\texttt{src1}-\texttt{src2}\|_{L_{\infty}} =  \max _I | \texttt{src1} (I) -  \texttt{src2} (I)|}{if  \(\texttt{normType} = \texttt{NORM_INF}\) }
+                {\frac{\|\texttt{src1}-\texttt{src2}\|_{L_{\infty}}    }{\|\texttt{src2}\|_{L_{\infty}} }}{if  \(\texttt{normType} = \texttt{NORM_RELATIVE | NORM_INF}\) }
+                \f]
+                */
+                NORM_INF       = 1,
+                /**
+                \f[
+                norm =  \forkthree
+                {\| \texttt{src1} \| _{L_1} =  \sum _I | \texttt{src1} (I)|}{if  \(\texttt{normType} = \texttt{NORM_L1}\)}
+                { \| \texttt{src1} - \texttt{src2} \| _{L_1} =  \sum _I | \texttt{src1} (I) -  \texttt{src2} (I)|}{if  \(\texttt{normType} = \texttt{NORM_L1}\) }
+                { \frac{\|\texttt{src1}-\texttt{src2}\|_{L_1} }{\|\texttt{src2}\|_{L_1}} }{if  \(\texttt{normType} = \texttt{NORM_RELATIVE | NORM_L1}\) }
+                \f]*/
+                 NORM_L1        = 2,
+                 /**
+                 \f[
+                 norm =  \forkthree
+                 { \| \texttt{src1} \| _{L_2} =  \sqrt{\sum_I \texttt{src1}(I)^2} }{if  \(\texttt{normType} = \texttt{NORM_L2}\) }
+                 { \| \texttt{src1} - \texttt{src2} \| _{L_2} =  \sqrt{\sum_I (\texttt{src1}(I) - \texttt{src2}(I))^2} }{if  \(\texttt{normType} = \texttt{NORM_L2}\) }
+                 { \frac{\|\texttt{src1}-\texttt{src2}\|_{L_2} }{\|\texttt{src2}\|_{L_2}} }{if  \(\texttt{normType} = \texttt{NORM_RELATIVE | NORM_L2}\) }
+                 \f]
+                 */
+                 NORM_L2        = 4,
+                 /**
+                 \f[
+                 norm =  \forkthree
+                 { \| \texttt{src1} \| _{L_2} ^{2} = \sum_I \texttt{src1}(I)^2} {if  \(\texttt{normType} = \texttt{NORM_L2SQR}\)}
+                 { \| \texttt{src1} - \texttt{src2} \| _{L_2} ^{2} =  \sum_I (\texttt{src1}(I) - \texttt{src2}(I))^2 }{if  \(\texttt{normType} = \texttt{NORM_L2SQR}\) }
+                 { \left(\frac{\|\texttt{src1}-\texttt{src2}\|_{L_2} }{\|\texttt{src2}\|_{L_2}}\right)^2 }{if  \(\texttt{normType} = \texttt{NORM_RELATIVE | NORM_L2SQR}\) }
+                 \f]
+                 */
+                 NORM_L2SQR     = 5,
+                 /**
+                 In the case of one input array, calculates the Hamming distance of the array from zero.
+                 In the case of two input arrays, calculates the Hamming distance between the arrays.
+                 */
+                 NORM_HAMMING   = 6,
+                 /**
+                 Similar to NORM_HAMMING, but in the calculation, each two bits of the input sequence will
+                 be added and treated as a single bit to be used in the same calculation as NORM_HAMMING.
+                 */
+                 NORM_HAMMING2  = 7,
+                 NORM_TYPE_MASK = 7, //!< bit-mask which can be used to separate norm type from norm flags (same value as NORM_HAMMING2)
+                 NORM_RELATIVE  = 8, //!< flag; combined with a norm type as in `NORM_RELATIVE | NORM_L2` (see the formulas above)
+                 NORM_MINMAX    = 32 //!< flag
+               };
+
+//! comparison types: the relation between src1 and src2 that is tested
+enum CmpTypes { CMP_EQ = 0, //!< src1 is equal to src2.
+                CMP_GT = 1, //!< src1 is greater than src2.
+                CMP_GE = 2, //!< src1 is greater than or equal to src2.
+                CMP_LT = 3, //!< src1 is less than src2.
+                CMP_LE = 4, //!< src1 is less than or equal to src2.
+                CMP_NE = 5  //!< src1 is not equal to src2.
+              };
+
+//! generalized matrix multiplication flags; each uses a distinct bit (1, 2, 4)
+enum GemmFlags { GEMM_1_T = 1, //!< transposes src1
+                 GEMM_2_T = 2, //!< transposes src2
+                 GEMM_3_T = 4 //!< transposes src3
+               };
+
+enum DftFlags { // flags for the DFT/DCT transforms; the two DCT_* names are aliases of DFT_* values
+    /** performs an inverse 1D or 2D transform instead of the default forward
+        transform. */
+    DFT_INVERSE        = 1,
+    /** scales the result: divide it by the number of array elements. Normally, it is
+        combined with DFT_INVERSE. */
+    DFT_SCALE          = 2,
+    /** performs a forward or inverse transform of every individual row of the input
+        matrix; this flag enables you to transform multiple vectors simultaneously and can be used to
+        decrease the overhead (which is sometimes several times larger than the processing itself) to
+        perform 3D and higher-dimensional transformations and so forth.*/
+    DFT_ROWS           = 4,
+    /** performs a forward transformation of 1D or 2D real array; the result,
+        though being a complex array, has complex-conjugate symmetry (*CCS*, see the function
+        description below for details), and such an array can be packed into a real array of the same
+        size as input, which is the fastest option and which is what the function does by default;
+        however, you may wish to get a full complex array (for simpler spectrum analysis, and so on) -
+        pass the flag to enable the function to produce a full-size complex output array. */
+    DFT_COMPLEX_OUTPUT = 16,
+    /** performs an inverse transformation of a 1D or 2D complex array; the
+        result is normally a complex array of the same size, however, if the input array has
+        conjugate-complex symmetry (for example, it is a result of forward transformation with
+        DFT_COMPLEX_OUTPUT flag), the output is a real array; while the function itself does not
+        check whether the input is symmetrical or not, you can pass the flag and then the function
+        will assume the symmetry and produce the real output array (note that when the input is packed
+        into a real array and inverse transformation is executed, the function treats the input as a
+        packed complex-conjugate symmetrical array, and the output will also be a real array). */
+    DFT_REAL_OUTPUT    = 32,
+    /** specifies that the input is complex. If this flag is set, the input must have 2 channels.
+        On the other hand, for backwards compatibility reasons, if input has 2 channels, input is
+        already considered complex. */
+    DFT_COMPLEX_INPUT  = 64,
+    /** performs an inverse 1D or 2D transform instead of the default forward transform. */
+    DCT_INVERSE        = DFT_INVERSE,
+    /** performs a forward or inverse transform of every individual row of the input
+        matrix. This flag enables you to transform multiple vectors simultaneously and can be used to
+        decrease the overhead (which is sometimes several times larger than the processing itself) to
+        perform 3D and higher-dimensional transforms and so forth.*/
+    DCT_ROWS           = DFT_ROWS
+};
+
+//! Various border types; in the examples below the image boundaries are denoted with `|`
+//! @see borderInterpolate, copyMakeBorder
+enum BorderTypes {
+    BORDER_CONSTANT    = 0, //!< `iiiiii|abcdefgh|iiiiiii`  with some specified `i`
+    BORDER_REPLICATE   = 1, //!< `aaaaaa|abcdefgh|hhhhhhh`
+    BORDER_REFLECT     = 2, //!< `fedcba|abcdefgh|hgfedcb`
+    BORDER_WRAP        = 3, //!< `cdefgh|abcdefgh|abcdefg`
+    BORDER_REFLECT_101 = 4, //!< `gfedcb|abcdefgh|gfedcba`
+    BORDER_TRANSPARENT = 5, //!< `uvwxyz|abcdefgh|ijklmno` - Treats outliers as transparent.
+
+    BORDER_REFLECT101  = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
+    BORDER_DEFAULT     = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
+    BORDER_ISOLATED    = 16 //!< Interpolation restricted within the ROI boundaries (value lies outside the 0..5 range used above).
+};
+
+//! @} core_array
+
+//! @addtogroup core_utils
+//! @{
+
+/*! @brief Signals an error and raises the exception.
+
+By default the function prints information about the error to stderr,
+then it either stops if setBreakOnError() had been called before or raises the exception.
+It is possible to alter error processing by using redirectError(). The function is declared CV_NORETURN.
+@param _code - error code (Error::Code)
+@param _err - error description
+@param _func - function name. Available only when the compiler supports getting it
+@param _file - source file name where the error has occurred
+@param _line - line number in the source file where the error has occurred
+@see CV_Error, CV_Error_, CV_Assert, CV_DbgAssert
+ */
+CV_EXPORTS CV_NORETURN void error(int _code, const String& _err, const char* _func, const char* _file, int _line);
+
+#ifdef CV_STATIC_ANALYSIS
+
+// In practice, some macros are not processed correctly (noreturn is not detected).
+// We need to use simplified definitions for them; each one ends in a plain abort().
+#define CV_Error(code, msg) do { (void)(code); (void)(msg); abort(); } while (0)
+#define CV_Error_(code, args) do { (void)(code); (void)(cv::format args); abort(); } while (0)
+#define CV_Assert( expr ) do { if (!(expr)) abort(); } while (0)
+
+#else // CV_STATIC_ANALYSIS
+
+/** @brief Call the error handler.
+
+Currently, the error handler prints the error code and the error message to the standard
+error stream `stderr`. In the Debug configuration, it then provokes memory access violation, so that
+the execution stack and all the parameters can be analyzed by the debugger. In the Release
+configuration, the exception is thrown.
+
+@param code one of Error::Code
+@param msg error message
+*/
+#define CV_Error( code, msg ) cv::error( code, msg, CV_Func, __FILE__, __LINE__ )
+
+/**  @brief Call the error handler.
+
+This macro can be used to construct an error message on the fly to include some dynamic information,
+for example:
+@code
+    // note the extra parentheses around the formatted text message
+    CV_Error_(Error::StsOutOfRange,
+    ("the value at (%d, %d)=%g is out of range", badPt.x, badPt.y, badValue));
+@endcode
+@param code one of Error::Code
+@param args printf-like formatted error message in parentheses (it is pasted directly after cv::format)
+*/
+#define CV_Error_( code, args ) cv::error( code, cv::format args, CV_Func, __FILE__, __LINE__ )
+
+/** @brief Checks a condition at runtime and throws exception if it fails
+
+The macros CV_Assert (and CV_DbgAssert(expr)) evaluate the specified expression. If it is 0, the macros
+raise an error (see cv::error) with code cv::Error::StsAssert and the expression text as the message.
+CV_Assert checks the condition in both Debug and Release configurations while CV_DbgAssert is only retained in Debug.
+*/
+#define CV_Assert( expr ) do { if(!!(expr)) ; else cv::error( cv::Error::StsAssert, #expr, CV_Func, __FILE__, __LINE__ ); } while(0)
+
+#endif // CV_STATIC_ANALYSIS
+
+//! @cond IGNORED
+#if !defined(__OPENCV_BUILD)  // TODO: backward compatibility only; these aliases are skipped when __OPENCV_BUILD is defined
+#ifndef CV_ErrorNoReturn
+#define CV_ErrorNoReturn CV_Error
+#endif
+#ifndef CV_ErrorNoReturn_
+#define CV_ErrorNoReturn_ CV_Error_
+#endif
+#endif
+
+#define CV_Assert_1 CV_Assert // CV_Assert_k: one CV_Assert per argument, for k arguments (helpers for CV_Assert_N)
+#define CV_Assert_2( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_1( __VA_ARGS__ ))
+#define CV_Assert_3( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_2( __VA_ARGS__ ))
+#define CV_Assert_4( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_3( __VA_ARGS__ ))
+#define CV_Assert_5( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_4( __VA_ARGS__ ))
+#define CV_Assert_6( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_5( __VA_ARGS__ ))
+#define CV_Assert_7( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_6( __VA_ARGS__ ))
+#define CV_Assert_8( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_7( __VA_ARGS__ ))
+#define CV_Assert_9( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_8( __VA_ARGS__ ))
+#define CV_Assert_10( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_9( __VA_ARGS__ ))
+
+#define CV_Assert_N(...) do { __CV_EXPAND(__CV_CAT(CV_Assert_, __CV_VA_NUM_ARGS(__VA_ARGS__)) (__VA_ARGS__)); } while(0) // picks CV_Assert_<count>; the do/while turns the expanded statement list into one statement
+
+//! @endcond
+
+#if defined _DEBUG || defined CV_STATIC_ANALYSIS
+#  define CV_DbgAssert(expr) CV_Assert(expr)
+#else
+/** expands to nothing in this branch, so expr is not evaluated; it is CV_Assert(expr) when _DEBUG or CV_STATIC_ANALYSIS is defined */
+#  define CV_DbgAssert(expr)
+#endif
+
+/*
+ * Hamming distance functor - counts the bit differences between two byte strings - useful for the Brief descriptor.
+ * The result is the bit count of A XOR'ed with B.
+ */
+struct CV_EXPORTS Hamming
+{
+    static const NormTypes normType = NORM_HAMMING;
+    typedef unsigned char ValueType;
+    typedef int ResultType;
+
+    /** this will count the bits in a ^ b; only declared here, the definition is not in this header
+     */
+    ResultType operator()( const unsigned char* a, const unsigned char* b, int size ) const;
+};
+
+typedef Hamming HammingLUT; // HammingLUT is just another name for Hamming
+
+/////////////////////////////////// inline norms ////////////////////////////////////
+
+template<typename _Tp> inline _Tp cv_abs(_Tp x) { return std::abs(x); } // generic case: defers to std::abs, result type equals the argument type
+inline int cv_abs(uchar x) { return x; } // value returned as-is, converted to int
+inline int cv_abs(schar x) { return std::abs(x); } // result is int rather than schar
+inline int cv_abs(ushort x) { return x; } // value returned as-is, converted to int
+inline int cv_abs(short x) { return std::abs(x); } // result is int rather than short
+
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normL2Sqr(const _Tp* a, int n) // sum of a[k]*a[k] for k in [0, n), accumulated as _AccTp
+{
+    _AccTp total = 0;
+    int k = 0;
+#if CV_ENABLE_UNROLLED
+    for( ; k <= n - 4; k += 4 ) // optional main loop, four elements per step
+    {
+        _AccTp e0 = a[k], e1 = a[k+1], e2 = a[k+2], e3 = a[k+3];
+        total += e0*e0 + e1*e1 + e2*e2 + e3*e3;
+    }
+#endif
+    for( ; k < n; k++ ) // remaining (or all) elements, one per step
+    {
+        _AccTp e = a[k];
+        total += e*e;
+    }
+    return total;
+}
+
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normL1(const _Tp* a, int n) // sum of cv_abs(a[k]) for k in [0, n), accumulated as _AccTp
+{
+    _AccTp total = 0;
+    int k = 0;
+#if CV_ENABLE_UNROLLED
+    for( ; k <= n - 4; k += 4 ) // optional main loop, four elements per step
+    {
+        total += (_AccTp)cv_abs(a[k]) + (_AccTp)cv_abs(a[k+1]) +
+            (_AccTp)cv_abs(a[k+2]) + (_AccTp)cv_abs(a[k+3]);
+    }
+#endif
+    for( ; k < n; k++ ) // remaining (or all) elements, one per step
+        total += cv_abs(a[k]);
+    return total;
+}
+
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normInf(const _Tp* a, int n) // largest cv_abs(a[k]) for k in [0, n); stays 0 when the loop does not run
+{
+    _AccTp largest = 0;
+    for( int k = 0; k < n; k++ )
+        largest = std::max(largest, (_AccTp)cv_abs(a[k]));
+    return largest;
+}
+
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n) // sum of (a[k]-b[k])^2 for k in [0, n), accumulated as _AccTp
+{
+    _AccTp total = 0;
+    int k = 0;
+#if CV_ENABLE_UNROLLED
+    for( ; k <= n - 4; k += 4 ) // optional main loop, four element pairs per step
+    {
+        _AccTp d0 = _AccTp(a[k] - b[k]), d1 = _AccTp(a[k+1] - b[k+1]), d2 = _AccTp(a[k+2] - b[k+2]), d3 = _AccTp(a[k+3] - b[k+3]);
+        total += d0*d0 + d1*d1 + d2*d2 + d3*d3;
+    }
+#endif
+    for( ; k < n; k++ ) // remaining (or all) pairs, one per step
+    {
+        _AccTp d = _AccTp(a[k] - b[k]);
+        total += d*d;
+    }
+    return total;
+}
+
+static inline float normL2Sqr(const float* a, const float* b, int n) // float overload: sum of squared differences over [0, n)
+{
+    float total = 0.f;
+    for( int k = 0; k < n; k++ )
+    {
+        float diff = a[k] - b[k];
+        total += diff*diff;
+    }
+    return total;
+}
+
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normL1(const _Tp* a, const _Tp* b, int n) // sum of |a[k]-b[k]| for k in [0, n), accumulated as _AccTp
+{
+    _AccTp total = 0;
+    int k = 0;
+#if CV_ENABLE_UNROLLED
+    for( ; k <= n - 4; k += 4 ) // optional main loop, four element pairs per step
+    {
+        _AccTp d0 = _AccTp(a[k] - b[k]), d1 = _AccTp(a[k+1] - b[k+1]), d2 = _AccTp(a[k+2] - b[k+2]), d3 = _AccTp(a[k+3] - b[k+3]);
+        total += std::abs(d0) + std::abs(d1) + std::abs(d2) + std::abs(d3);
+    }
+#endif
+    for( ; k < n; k++ ) // remaining (or all) pairs, one per step
+    {
+        _AccTp d = _AccTp(a[k] - b[k]);
+        total += std::abs(d);
+    }
+    return total;
+}
+
+inline float normL1(const float* a, const float* b, int n) // float overload: sum of |a[k]-b[k]| over [0, n)
+{
+    float total = 0.f;
+    for( int k = 0; k < n; k++ )
+    {
+        total += std::abs(a[k] - b[k]);
+    }
+    return total;
+}
+
+inline int normL1(const uchar* a, const uchar* b, int n) // uchar overload: sum of |a[k]-b[k]| over [0, n), returned as int
+{
+    int total = 0;
+    for( int k = 0; k < n; k++ )
+    {
+        total += std::abs(a[k] - b[k]);
+    }
+    return total;
+}
+
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normInf(const _Tp* a, const _Tp* b, int n) // largest |a[k]-b[k]| for k in [0, n); stays 0 when the loop does not run
+{
+    _AccTp largest = 0;
+    for( int k = 0; k < n; k++ )
+    {
+        _AccTp diff = a[k] - b[k];
+        largest = std::max(largest, std::abs(diff));
+    }
+    return largest;
+}
+
+/** @brief Computes the cube root of an argument.
+
+ The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly.
+ NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for
+ single-precision data.
+ @param val The value whose cube root is computed.
+ */
+CV_EXPORTS_W float cubeRoot(float val);
+
+/** @overload
+
+Double-precision variant: the result comes straight from `std::cbrt(double)`.
+*/
+static inline double cubeRoot(double val)
+{
+    const double root = std::cbrt(val);
+    return root;
+}
+
+/** @brief Calculates the angle of a 2D vector in degrees.
+
+ The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
+ in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
+ @param y y-coordinate of the vector (note: y is the first argument).
+ @param x x-coordinate of the vector.
+ */
+CV_EXPORTS_W float fastAtan2(float y, float x);
+
+/** proxy for hal::LU (float overload) */
+CV_EXPORTS int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+/** proxy for hal::LU (double overload) */
+CV_EXPORTS int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+/** proxy for hal::Cholesky (float overload) */
+CV_EXPORTS bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+/** proxy for hal::Cholesky (double overload) */
+CV_EXPORTS bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+
+////////////////// forward declarations for important OpenCV types //////////////////
+
+//! @cond IGNORED
+
+template<typename _Tp, int cn> class Vec;
+template<typename _Tp, int m, int n> class Matx;
+
+template<typename _Tp> class Complex;
+template<typename _Tp> class Point_;
+template<typename _Tp> class Point3_;
+template<typename _Tp> class Size_;
+template<typename _Tp> class Rect_;
+template<typename _Tp> class Scalar_;
+
+class CV_EXPORTS RotatedRect;
+class CV_EXPORTS Range;
+class CV_EXPORTS TermCriteria;
+class CV_EXPORTS KeyPoint;
+class CV_EXPORTS DMatch;
+class CV_EXPORTS RNG;
+
+class CV_EXPORTS Mat;
+class CV_EXPORTS MatExpr;
+
+class CV_EXPORTS UMat;
+
+class CV_EXPORTS SparseMat;
+typedef Mat MatND;
+
+template<typename _Tp> class Mat_;
+template<typename _Tp> class SparseMat_;
+
+class CV_EXPORTS MatConstIterator;
+class CV_EXPORTS SparseMatIterator;
+class CV_EXPORTS SparseMatConstIterator;
+template<typename _Tp> class MatIterator_;
+template<typename _Tp> class MatConstIterator_;
+template<typename _Tp> class SparseMatIterator_;
+template<typename _Tp> class SparseMatConstIterator_;
+
+namespace ogl
+{
+    class CV_EXPORTS Buffer;
+    class CV_EXPORTS Texture2D;
+    class CV_EXPORTS Arrays;
+}
+
+namespace cuda
+{
+    class CV_EXPORTS GpuMat;
+    class CV_EXPORTS HostMem;
+    class CV_EXPORTS Stream;
+    class CV_EXPORTS Event;
+}
+
+namespace cudev
+{
+    template <typename _Tp> class GpuMat_;
+}
+
+namespace ipp
+{
+CV_EXPORTS   unsigned long long getIppFeatures();
+CV_EXPORTS   void setIppStatus(int status, const char * const funcname = NULL, const char * const filename = NULL,
+                             int line = 0);
+CV_EXPORTS   int getIppStatus();
+CV_EXPORTS   String getIppErrorLocation();
+CV_EXPORTS_W bool   useIPP();
+CV_EXPORTS_W void   setUseIPP(bool flag);
+CV_EXPORTS_W String getIppVersion();
+
+// IPP Not-Exact mode. This function may force use of IPP when both IPP and OpenCV provide proper results
+// but have internal accuracy differences which have too much direct or indirect impact on accuracy tests.
+CV_EXPORTS_W bool useIPP_NotExact();
+CV_EXPORTS_W void setUseIPP_NotExact(bool flag);
+#ifndef DISABLE_OPENCV_3_COMPATIBILITY // short legacy names that forward to the *_NotExact functions
+static inline bool useIPP_NE() { return useIPP_NotExact(); }
+static inline void setUseIPP_NE(bool flag) { setUseIPP_NotExact(flag); }
+#endif
+
+} // namespace ipp
+
+//! @endcond
+
+//! @} core_utils
+
+
+
+
+} // cv
+
+#include "opencv2/core/neon_utils.hpp"
+#include "opencv2/core/vsx_utils.hpp"
+#include "opencv2/core/check.hpp"
+
+#endif //OPENCV_CORE_BASE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/bindings_utils.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/bindings_utils.hpp
new file mode 100644
index 000000000000..9c8f9e0f2bce
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/bindings_utils.hpp
@@ -0,0 +1,357 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_BINDINGS_UTILS_HPP
+#define OPENCV_CORE_BINDINGS_UTILS_HPP
+
+#include <opencv2/core/async.hpp>
+#include <opencv2/core/detail/async_promise.hpp>
+#include <opencv2/core/utils/logger.hpp>
+
+#include <stdexcept>
+
+namespace cv { namespace utils {
+//! @addtogroup core_utils
+//! @{
+
+CV_EXPORTS_W String dumpInputArray(InputArray argument);
+
+CV_EXPORTS_W String dumpInputArrayOfArrays(InputArrayOfArrays argument);
+
+CV_EXPORTS_W String dumpInputOutputArray(InputOutputArray argument);
+
+CV_EXPORTS_W String dumpInputOutputArrayOfArrays(InputOutputArrayOfArrays argument);
+
+/** Returns "Bool: True" or "Bool: False" depending on @p argument. */
+CV_WRAP static inline String dumpBool(bool argument)
+{
+    return String(argument ? "Bool: True" : "Bool: False");
+}
+
+/** Formats @p argument with the "Int: %d" pattern. */
+CV_WRAP static inline String dumpInt(int argument)
+{
+    return cv::format("Int: %d", argument);
+}
+
+/** Returns "Int64: " followed by @p argument, streamed through std::ostringstream. */
+CV_WRAP static inline String dumpInt64(int64 argument)
+{
+    std::ostringstream text;
+    text << "Int64: " << argument;
+    return text.str();
+}
+
+/** Returns "size_t: " followed by @p argument, streamed through std::ostringstream. */
+CV_WRAP static inline String dumpSizeT(size_t argument)
+{
+    std::ostringstream text;
+    text << "size_t: " << argument;
+    return text.str();
+}
+
+/** Formats @p argument with the "Float: %.2f" pattern (two decimals). */
+CV_WRAP static inline String dumpFloat(float argument)
+{
+    return cv::format("Float: %.2f", argument);
+}
+
+/** Formats @p argument with the "Double: %.2f" pattern (two decimals). */
+CV_WRAP static inline String dumpDouble(double argument)
+{
+    return cv::format("Double: %.2f", argument);
+}
+
+/** Formats @p argument with the "String: %s" pattern. NOTE(review): the pointer is forwarded without a null check. */
+CV_WRAP static inline String dumpCString(const char* argument)
+{
+    return cv::format("String: %s", argument);
+}
+
+/** Formats the characters of @p argument (via c_str()) with the "String: %s" pattern. */
+CV_WRAP static inline String dumpString(const String& argument)
+{
+    return cv::format("String: %s", argument.c_str());
+}
+
+/** Formats the x, y, width and height fields of @p argument as text. */
+CV_WRAP static inline String dumpRect(const Rect& argument)
+{
+    const Rect& r = argument;
+    return format("rect: (x=%d, y=%d, w=%d, h=%d)", r.x, r.y, r.width, r.height);
+}
+
+/** Formats the type, maxCount and epsilon fields of @p argument as "term_criteria: (type=..., max_count=..., epsilon=...)". */
+CV_WRAP static inline String dumpTermCriteria(const TermCriteria& argument)
+{
+    // The pattern ends with ')' so the parentheses are balanced, as in dumpRect and dumpRotatedRect.
+    return format("term_criteria: (type=%d, max_count=%d, epsilon=%lf)", argument.type, argument.maxCount, argument.epsilon);
+}
+
+/** Formats the center, size and angle fields of @p argument as text. */
+CV_WRAP static inline String dumpRotatedRect(const RotatedRect& argument)
+{
+    const RotatedRect& rr = argument;
+    return format("rotated_rect: (c_x=%f, c_y=%f, w=%f, h=%f, a=%f)",
+                  rr.center.x, rr.center.y, rr.size.width, rr.size.height, rr.angle);
+}
+
+/**
+ * Describes a Range as text: "range: all" when it compares equal to Range::all(),
+ * otherwise its start and end values.
+ */
+CV_WRAP static inline String dumpRange(const Range& argument)
+{
+    const bool covers_all = (argument == Range::all());
+    if (!covers_all)
+        return format("range: (s=%d, e=%d)", argument.start, argument.end);
+
+    return "range: all";
+}
+
+CV_EXPORTS_W String dumpVectorOfInt(const std::vector<int>& vec);
+
+CV_EXPORTS_W String dumpVectorOfDouble(const std::vector<double>& vec);
+
+CV_EXPORTS_W String dumpVectorOfRect(const std::vector<Rect>& vec);
+
+
+//! @cond IGNORED
+
+/** Overload taking an int plus an optional Point (default Point(42, 24)); reports both in the returned text. */
+CV_WRAP static inline String testOverloadResolution(int value, const Point& point = Point(42, 24))
+{
+    const Point& pt = point;
+    return format("overload (int=%d, point=(x=%d, y=%d))", value, pt.x, pt.y);
+}
+
+/** Overload taking a Rect; reports its x, y, width and height in the returned text. */
+CV_WRAP static inline String testOverloadResolution(const Rect& rect)
+{
+    const Rect& r = rect;
+    return format("overload (rect=(x=%d, y=%d, w=%d, h=%d))", r.x, r.y, r.width, r.height);
+}
+
+/** Returns RotatedRect(Point2f(x, y), Size2f(w, h), angle) built from the five scalar arguments. */
+CV_WRAP static inline RotatedRect testRotatedRect(float x, float y, float w, float h, float angle)
+{
+    return RotatedRect(Point2f(x, y), Size2f(w, h), angle);
+}
+
+/** Returns ten RotatedRects; entry k uses Point2f(x + k, y + 2k), Size2f(w, h) and angle + 10k. */
+CV_WRAP static inline std::vector<RotatedRect> testRotatedRectVector(float x, float y, float w, float h, float angle)
+{
+    std::vector<RotatedRect> rects;
+    for (int k = 0; k < 10; ++k)
+        rects.push_back(RotatedRect(Point2f(x + k, y + 2 * k), Size2f(w, h), angle + 10 * k));
+    return rects;
+}
+
+/** Identity helper: hands @p argument back unchanged. */
+CV_WRAP static inline int testOverwriteNativeMethod(int argument)
+{
+    return argument;
+}
+
+/** Formats the three integers as "arg=..., lambda=..., from=..."; the defaults are lambda = 2 and from = 3. */
+CV_WRAP static inline String testReservedKeywordConversion(int positional_argument, int lambda = 2, int from = 3)
+{
+    return format("arg=%d, lambda=%d, from=%d", positional_argument, lambda, from);
+}
+
+CV_WRAP static inline
+void generateVectorOfRect(size_t len, CV_OUT std::vector<Rect>& vec)
+{   // Resizes `vec` to `len` entries and fills them with random values; len == 0 leaves it empty.
+    vec.resize(len);
+    if (len > 0)
+    {
+        RNG rng(12345);  // hard-coded seed
+        Mat tmp(static_cast<int>(len), 1, CV_32SC4);  // len rows x 1 column; NOTE(review): len is narrowed to int
+        rng.fill(tmp, RNG::UNIFORM, 10, 20);  // uniform distribution with bounds 10 and 20
+        tmp.copyTo(vec);  // copy the generated values into the caller's vector
+    }
+}
+
+CV_WRAP static inline
+void generateVectorOfInt(size_t len, CV_OUT std::vector<int>& vec)
+{   // Resizes `vec` to `len` entries and fills them with random values; len == 0 leaves it empty.
+    vec.resize(len);
+    if (len > 0)
+    {
+        RNG rng(554433);  // hard-coded seed
+        Mat tmp(static_cast<int>(len), 1, CV_32SC1);  // len rows x 1 column; NOTE(review): len is narrowed to int
+        rng.fill(tmp, RNG::UNIFORM, -10, 10);  // uniform distribution with bounds -10 and 10
+        tmp.copyTo(vec);  // copy the generated values into the caller's vector
+    }
+}
+
+CV_WRAP static inline
+void generateVectorOfMat(size_t len, int rows, int cols, int dtype, CV_OUT std::vector<Mat>& vec)
+{   // Resizes `vec` to `len` matrices, each created as rows x cols of type `dtype` and randomly filled.
+    vec.resize(len);
+    if (len > 0)
+    {
+        RNG rng(65431);  // hard-coded seed; one generator is shared by all matrices below
+        for (size_t i = 0; i < len; ++i)
+        {
+            vec[i].create(rows, cols, dtype);
+            rng.fill(vec[i], RNG::UNIFORM, 0, 10);  // uniform distribution with bounds 0 and 10
+        }
+    }
+}
+
+CV_WRAP static inline
+void testRaiseGeneralException()
+{   // Always throws a std::runtime_error (i.e. not a cv::Exception) with a fixed message.
+    throw std::runtime_error("exception text");
+}
+
+CV_WRAP static inline
+AsyncArray testAsyncArray(InputArray argument)
+{   // Sets `argument` as the value of a local promise and returns the promise's array result.
+    AsyncPromise p;
+    p.setValue(argument);
+    return p.getArrayResult();
+}
+
+CV_WRAP static inline
+AsyncArray testAsyncException()
+{   // Stores the cv::Exception caught below in a local promise and returns the promise's array result.
+    AsyncPromise p;
+    try
+    {
+        CV_Error(Error::StsOk, "Test: Generated async error");  // raise an error on purpose
+    }
+    catch (const cv::Exception& e)
+    {
+        p.setException(e);  // hand the caught exception over to the promise
+    }
+    return p.getArrayResult();
+}
+
+CV_WRAP static inline
+String dumpVec2i(const cv::Vec2i value = cv::Vec2i(42, 24)) {  // taken by value; defaults to (42, 24)
+    return format("Vec2i(%d, %d)", value[0], value[1]);
+}
+
+struct CV_EXPORTS_W_SIMPLE ClassWithKeywordProperties {  // `lambda` and `except` are Python keywords
+    CV_PROP_RW int lambda;
+    CV_PROP int except;  // declared with CV_PROP, whereas `lambda` uses CV_PROP_RW
+
+    CV_WRAP explicit ClassWithKeywordProperties(int lambda_arg = 24, int except_arg = 42)
+    {
+        lambda = lambda_arg;
+        except = except_arg;
+    }
+};
+
+struct CV_EXPORTS_W_PARAMS FunctionParams
+{   // Parameter bundle read by copyMatAndDumpNamedArguments(); defaults are given in-class.
+    CV_PROP_RW int lambda = -1;
+    CV_PROP_RW float sigma = 0.0f;
+
+    FunctionParams& setLambda(int value) CV_NOEXCEPT
+    {
+        lambda = value;
+        return *this;  // returning *this allows chained setter calls
+    }
+
+    FunctionParams& setSigma(float value) CV_NOEXCEPT
+    {
+        sigma = value;
+        return *this;  // returning *this allows chained setter calls
+    }
+};
+
+CV_WRAP static inline String
+copyMatAndDumpNamedArguments(InputArray src, OutputArray dst,
+                             const FunctionParams& params = FunctionParams())
+{   // Copies src into dst, then reports the two fields of `params` as text.
+    src.copyTo(dst);
+    return format("lambda=%d, sigma=%.1f", params.lambda,
+                  params.sigma);
+}
+
+namespace nested {
+CV_WRAP static inline bool testEchoBooleanFunction(bool flag) {
+    return flag;  // echoes the input unchanged
+}
+
+class CV_EXPORTS_W CV_WRAP_AS(ExportClassName) OriginalClassName  // presumably exposed to bindings as ExportClassName
+{
+public:
+    struct CV_EXPORTS_W_SIMPLE Params  // value bundle consumed by the constructor and by create()
+    {
+        CV_PROP_RW int int_value;
+        CV_PROP_RW float float_value;
+
+        CV_WRAP explicit Params(int int_param = 123, float float_param = 3.5f)  // defaults: 123 and 3.5f
+        {
+            int_value = int_param;
+            float_value = float_param;
+        }
+    };
+
+    explicit OriginalClassName(const OriginalClassName::Params& params = OriginalClassName::Params())
+    {
+        params_ = params;  // keep a private copy of the parameters
+    }
+
+    CV_WRAP int getIntParam() const
+    {
+        return params_.int_value;
+    }
+
+    CV_WRAP float getFloatParam() const
+    {
+        return params_.float_value;
+    }
+
+    CV_WRAP static std::string originalName()
+    {
+        return "OriginalClassName";  // the C++ class name, not the CV_WRAP_AS name
+    }
+
+    CV_WRAP static Ptr<OriginalClassName>
+    create(const OriginalClassName::Params& params = OriginalClassName::Params())
+    {
+        return makePtr<OriginalClassName>(params);  // factory: forwards params to the constructor
+    }
+
+private:
+    OriginalClassName::Params params_;
+};
+
+typedef OriginalClassName::Params OriginalClassName_Params;  // namespace-level alias for the nested Params type
+} // namespace nested
+
+//! @endcond IGNORED
+
+namespace fs {
+    CV_EXPORTS_W cv::String getCacheDirectoryForDownloads();
+} // namespace fs
+
+//! @}  // core_utils
+}  // namespace cv::utils
+
+//! @cond IGNORED
+
+CV_WRAP static inline
+int setLogLevel(int level)
+{   // Casts `level` to LogLevel and forwards it; the result of the enum overload is returned as int.
+    // NB: Binding generators don't work with enums properly yet, so we define a separate overload here
+    return cv::utils::logging::setLogLevel((cv::utils::logging::LogLevel)level);
+}
+
+CV_WRAP static inline
+int getLogLevel()
+{   // int-returning wrapper around cv::utils::logging::getLogLevel().
+    return cv::utils::logging::getLogLevel();
+}
+
+//! @endcond IGNORED
+
+} // namespaces cv /  utils
+
+#endif // OPENCV_CORE_BINDINGS_UTILS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/bufferpool.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/bufferpool.hpp
new file mode 100644
index 000000000000..4698e5da167d
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/bufferpool.hpp
@@ -0,0 +1,40 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+
+#ifndef OPENCV_CORE_BUFFER_POOL_HPP
+#define OPENCV_CORE_BUFFER_POOL_HPP
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4265)
+#endif
+
+namespace cv
+{
+
+//! @addtogroup core
+//! @{
+
+class BufferPoolController  // abstract interface: every public member is pure virtual
+{
+protected:
+    ~BufferPoolController() { }  // protected and non-virtual: outside code cannot destroy an object through this interface
+public:
+    virtual size_t getReservedSize() const = 0;
+    virtual size_t getMaxReservedSize() const = 0;
+    virtual void setMaxReservedSize(size_t size) = 0;
+    virtual void freeAllReservedBuffers() = 0;
+};
+
+//! @}
+
+}
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif // OPENCV_CORE_BUFFER_POOL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/check.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/check.hpp
new file mode 100644
index 000000000000..c9ce97b6ae25
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/check.hpp
@@ -0,0 +1,170 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_CHECK_HPP
+#define OPENCV_CORE_CHECK_HPP
+
+#include <opencv2/core/base.hpp>
+
+namespace cv {
+
+/** Returns string of cv::Mat depth value: CV_8U -> "CV_8U" or "<invalid depth>" */
+CV_EXPORTS const char* depthToString(int depth);
+
+/** Returns string of cv::Mat type value: CV_8UC3 -> "CV_8UC3" or "<invalid type>" */
+CV_EXPORTS String typeToString(int type);
+
+
+//! @cond IGNORED
+namespace detail {
+
+/** Returns string of cv::Mat depth value: CV_8U -> "CV_8U" or NULL */
+CV_EXPORTS const char* depthToString_(int depth);
+
+/** Returns string of cv::Mat type value: CV_8UC3 -> "CV_8UC3" or cv::String() */
+CV_EXPORTS cv::String typeToString_(int type);
+
+enum TestOp {  // kind of comparison recorded in CheckContext::testOp
+  TEST_CUSTOM = 0,  // used by CV__CHECK_CUSTOM_TEST
+  TEST_EQ = 1,  // TEST_EQ..TEST_GT are selected by CV__CHECK via cv::detail::TEST_ ## op
+  TEST_NE = 2,
+  TEST_LE = 3,
+  TEST_LT = 4,
+  TEST_GE = 5,
+  TEST_GT = 6,
+  CV__LAST_TEST_OP  // one past the last operation value
+};
+
+struct CheckContext {  // description of one check site; member order matches the initializer in CV__DEFINE_CHECK_CONTEXT
+    const char* func;  // CV__CHECK_FUNCTION
+    const char* file;  // CV__CHECK_FILENAME
+    int line;  // __LINE__ at the check site
+    enum TestOp testOp;
+    const char* message;  // message literal passed to the check macro
+    const char* p1_str;  // stringified first operand (or the checked value)
+    const char* p2_str;  // stringified second operand (or the custom test expression)
+};
+
+#ifndef CV__CHECK_FILENAME
+# define CV__CHECK_FILENAME __FILE__
+#endif
+
+#ifndef CV__CHECK_FUNCTION
+# if defined _MSC_VER
+#   define CV__CHECK_FUNCTION __FUNCSIG__
+# elif defined __GNUC__
+#   define CV__CHECK_FUNCTION __PRETTY_FUNCTION__
+# else
+#   define CV__CHECK_FUNCTION "<unknown>"
+# endif
+#endif
+
+#define CV__CHECK_LOCATION_VARNAME(id) CVAUX_CONCAT(CVAUX_CONCAT(__cv_check_, id), __LINE__)
+#define CV__DEFINE_CHECK_CONTEXT(id, message, testOp, p1_str, p2_str) \
+    static const cv::detail::CheckContext CV__CHECK_LOCATION_VARNAME(id) = \
+            { CV__CHECK_FUNCTION, CV__CHECK_FILENAME, __LINE__, testOp, "" message, "" p1_str, "" p2_str }
+
+CV_EXPORTS void CV_NORETURN check_failed_auto(const bool v1, const bool v2, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const int v1, const int v2, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const size_t v1, const size_t v2, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const float v1, const float v2, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const double v1, const double v2, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const Size_<int> v1, const Size_<int> v2, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v1, const int v2, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v1, const int v2, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v1, const int v2, const CheckContext& ctx);
+
+CV_EXPORTS void CV_NORETURN check_failed_true(const bool v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_false(const bool v, const CheckContext& ctx);
+
+CV_EXPORTS void CV_NORETURN check_failed_auto(const int v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const size_t v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const float v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const double v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const Size_<int> v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const std::string& v1, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v, const CheckContext& ctx);
+
+
+#define CV__TEST_EQ(v1, v2) ((v1) == (v2))
+#define CV__TEST_NE(v1, v2) ((v1) != (v2))
+#define CV__TEST_LE(v1, v2) ((v1) <= (v2))
+#define CV__TEST_LT(v1, v2) ((v1) < (v2))
+#define CV__TEST_GE(v1, v2) ((v1) >= (v2))
+#define CV__TEST_GT(v1, v2) ((v1) > (v2))
+
+#define CV__CHECK(id, op, type, v1, v2, v1_str, v2_str, msg_str) do { \
+    if(CV__TEST_##op((v1), (v2))) ; else { \
+        CV__DEFINE_CHECK_CONTEXT(id, msg_str, cv::detail::TEST_ ## op, v1_str, v2_str); \
+        cv::detail::check_failed_ ## type((v1), (v2), CV__CHECK_LOCATION_VARNAME(id)); \
+    } \
+} while (0)
+
+#define CV__CHECK_CUSTOM_TEST(id, type, v, test_expr, v_str, test_expr_str, msg_str) do { \
+    if(!!(test_expr)) ; else { \
+        CV__DEFINE_CHECK_CONTEXT(id, msg_str, cv::detail::TEST_CUSTOM, v_str, test_expr_str); \
+        cv::detail::check_failed_ ## type((v), CV__CHECK_LOCATION_VARNAME(id)); \
+    } \
+} while (0)
+
+} // namespace
+//! @endcond
+
+
+/// Supported value types (see check_failed_auto overloads): bool, int, size_t, float, double, Size_<int>
+#define CV_CheckEQ(v1, v2, msg)  CV__CHECK(_, EQ, auto, v1, v2, #v1, #v2, msg)
+#define CV_CheckNE(v1, v2, msg)  CV__CHECK(_, NE, auto, v1, v2, #v1, #v2, msg)
+#define CV_CheckLE(v1, v2, msg)  CV__CHECK(_, LE, auto, v1, v2, #v1, #v2, msg)
+#define CV_CheckLT(v1, v2, msg)  CV__CHECK(_, LT, auto, v1, v2, #v1, #v2, msg)
+#define CV_CheckGE(v1, v2, msg)  CV__CHECK(_, GE, auto, v1, v2, #v1, #v2, msg)
+#define CV_CheckGT(v1, v2, msg)  CV__CHECK(_, GT, auto, v1, v2, #v1, #v2, msg)
+
+/// Check with additional "decoding" of type values in error message
+#define CV_CheckTypeEQ(t1, t2, msg)  CV__CHECK(_, EQ, MatType, t1, t2, #t1, #t2, msg)
+/// Check with additional "decoding" of depth values in error message
+#define CV_CheckDepthEQ(d1, d2, msg)  CV__CHECK(_, EQ, MatDepth, d1, d2, #d1, #d2, msg)
+
+#define CV_CheckChannelsEQ(c1, c2, msg)  CV__CHECK(_, EQ, MatChannels, c1, c2, #c1, #c2, msg)
+
+/// Example: type == CV_8UC1 || type == CV_8UC3
+#define CV_CheckType(t, test_expr, msg)  CV__CHECK_CUSTOM_TEST(_, MatType, t, (test_expr), #t, #test_expr, msg)
+
+/// Example: depth == CV_32F || depth == CV_64F
+#define CV_CheckDepth(t, test_expr, msg)  CV__CHECK_CUSTOM_TEST(_, MatDepth, t, (test_expr), #t, #test_expr, msg)
+
+/// Example: v == A || v == B
+#define CV_Check(v, test_expr, msg)  CV__CHECK_CUSTOM_TEST(_, auto, v, (test_expr), #v, #test_expr, msg)
+
+/// Example: v == true
+#define CV_CheckTrue(v, msg)  CV__CHECK_CUSTOM_TEST(_, true, v, v, #v, "", msg)
+
+/// Example: v == false
+#define CV_CheckFalse(v, msg)  CV__CHECK_CUSTOM_TEST(_, false, v, (!(v)), #v, "", msg)
+
+/// Some complex conditions: CV_Check(src2, src2.empty() || (src2.type() == src1.type() && src2.size() == src1.size()), "src2 should have same size/type as src1")
+// TODO define pretty-printers
+
+#ifndef NDEBUG
+#define CV_DbgCheck(v, test_expr, msg)  CV__CHECK_CUSTOM_TEST(_, auto, v, (test_expr), #v, #test_expr, msg)
+#define CV_DbgCheckEQ(v1, v2, msg)  CV__CHECK(_, EQ, auto, v1, v2, #v1, #v2, msg)
+#define CV_DbgCheckNE(v1, v2, msg)  CV__CHECK(_, NE, auto, v1, v2, #v1, #v2, msg)
+#define CV_DbgCheckLE(v1, v2, msg)  CV__CHECK(_, LE, auto, v1, v2, #v1, #v2, msg)
+#define CV_DbgCheckLT(v1, v2, msg)  CV__CHECK(_, LT, auto, v1, v2, #v1, #v2, msg)
+#define CV_DbgCheckGE(v1, v2, msg)  CV__CHECK(_, GE, auto, v1, v2, #v1, #v2, msg)
+#define CV_DbgCheckGT(v1, v2, msg)  CV__CHECK(_, GT, auto, v1, v2, #v1, #v2, msg)
+#else
+#define CV_DbgCheck(v, test_expr, msg)  do { } while (0)
+#define CV_DbgCheckEQ(v1, v2, msg)  do { } while (0)
+#define CV_DbgCheckNE(v1, v2, msg)  do { } while (0)
+#define CV_DbgCheckLE(v1, v2, msg)  do { } while (0)
+#define CV_DbgCheckLT(v1, v2, msg)  do { } while (0)
+#define CV_DbgCheckGE(v1, v2, msg)  do { } while (0)
+#define CV_DbgCheckGT(v1, v2, msg)  do { } while (0)
+#endif
+
+} // namespace
+
+#endif // OPENCV_CORE_CHECK_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/core.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/core.hpp
new file mode 100644
index 000000000000..438918359be4
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/core.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/core.hpp"
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/core_c.h b/3rdparty/opencv/opencv410/build/include/opencv2/core/core_c.h
new file mode 100644
index 000000000000..7b686b86f315
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/core_c.h
@@ -0,0 +1,3128 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#ifndef OPENCV_CORE_C_H
+#define OPENCV_CORE_C_H
+
+#include "opencv2/core/types_c.h"
+
+#ifdef __cplusplus
+/* disable MSVC warning C4190 / clang-cl -Wreturn-type-c-linkage:
+       'function' has C-linkage specified, but returns UDT 'typename'
+       which is incompatible with C
+
+   It is OK to disable it because we only extend few plain structures with
+   C++ constructors for simpler interoperability with C++ API of the library
+*/
+#  if defined(__clang__)
+     // handle clang on Linux and clang-cl (i. e. clang on Windows) first
+#    pragma GCC diagnostic ignored "-Wreturn-type-c-linkage"
+#  elif defined(_MSC_VER)
+     // then handle MSVC
+#    pragma warning(disable:4190)
+#  endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup core_c
+    @{
+*/
+
+/****************************************************************************************\
+*          Array allocation, deallocation, initialization and access to elements         *
+\****************************************************************************************/
+
+/** `malloc` wrapper.
+   If there is not enough memory, the function
+   (as well as other OpenCV functions that call cvAlloc)
+   raises an error. */
+CVAPI(void*)  cvAlloc( size_t size );
+
+/** `free` wrapper.
+   Here and further all the memory releasing functions
+   (that all call cvFree) take a double pointer in order
+   to clear the pointer to the data after releasing it.
+   Passing pointer to NULL pointer is Ok: nothing happens in this case
+*/
+CVAPI(void)   cvFree_( void* ptr );
+#define cvFree(ptr) (cvFree_(*(ptr)), *(ptr)=0)
+
+/** @brief Creates an image header but does not allocate the image data.
+
+@param size Image width and height
+@param depth Image depth (see cvCreateImage )
+@param channels Number of channels (see cvCreateImage )
+ */
+CVAPI(IplImage*)  cvCreateImageHeader( CvSize size, int depth, int channels );
+
+/** @brief Initializes an image header that was previously allocated.
+
+The returned IplImage\* points to the initialized header.
+@param image Image header to initialize
+@param size Image width and height
+@param depth Image depth (see cvCreateImage )
+@param channels Number of channels (see cvCreateImage )
+@param origin Top-left IPL_ORIGIN_TL or bottom-left IPL_ORIGIN_BL
+@param align Alignment for image rows, typically 4 or 8 bytes
+ */
+CVAPI(IplImage*) cvInitImageHeader( IplImage* image, CvSize size, int depth,
+                                   int channels, int origin CV_DEFAULT(0),
+                                   int align CV_DEFAULT(4));
+
+/** @brief Creates an image header and allocates the image data.
+
+This function call is equivalent to the following code:
+@code
+    header = cvCreateImageHeader(size, depth, channels);
+    cvCreateData(header);
+@endcode
+@param size Image width and height
+@param depth Bit depth of image elements. See IplImage for valid depths.
+@param channels Number of channels per pixel. See IplImage for details. This function only creates
+images with interleaved channels.
+ */
+CVAPI(IplImage*)  cvCreateImage( CvSize size, int depth, int channels );
+
+/** @brief Deallocates an image header.
+
+This call is an analogue of :
+@code
+    if(image )
+    {
+        iplDeallocate(*image, IPL_IMAGE_HEADER | IPL_IMAGE_ROI);
+        *image = 0;
+    }
+@endcode
+but it does not use IPL functions by default (see the CV_TURN_ON_IPL_COMPATIBILITY macro).
+@param image Double pointer to the image header
+ */
+CVAPI(void)  cvReleaseImageHeader( IplImage** image );
+
+/** @brief Deallocates the image header and the image data.
+
+This call is a shortened form of :
+@code
+    if(*image )
+    {
+        cvReleaseData(*image);
+        cvReleaseImageHeader(image);
+    }
+@endcode
+@param image Double pointer to the image header
+*/
+CVAPI(void)  cvReleaseImage( IplImage** image );
+
+/** Creates a copy of IPL image (widthStep may differ) */
+CVAPI(IplImage*) cvCloneImage( const IplImage* image );
+
+/** @brief Sets the channel of interest in an IplImage.
+
+If the ROI is set to NULL and the coi is *not* 0, the ROI is allocated. Most OpenCV functions do
+*not* support the COI setting, so to process an individual image/matrix channel one may copy (via
+cvCopy or cvSplit) the channel to a separate image/matrix, process it and then copy the result
+back (via cvCopy or cvMerge) if needed.
+@param image A pointer to the image header
+@param coi The channel of interest. 0 - all channels are selected, 1 - first channel is selected,
+etc. Note that the channel indices become 1-based.
+ */
+CVAPI(void)  cvSetImageCOI( IplImage* image, int coi );
+
+/** @brief Returns the index of the channel of interest.
+
+Returns the channel of interest of an IplImage. Returned values correspond to the coi in
+cvSetImageCOI.
+@param image A pointer to the image header
+ */
+CVAPI(int)  cvGetImageCOI( const IplImage* image );
+
+/** @brief Sets an image Region Of Interest (ROI) for a given rectangle.
+
+If the original image ROI was NULL and the rect is not the whole image, the ROI structure is
+allocated.
+
+Most OpenCV functions support the use of ROI and treat the image rectangle as a separate image. For
+example, all of the pixel coordinates are counted from the top-left (or bottom-left) corner of the
+ROI, not the original image.
+@param image A pointer to the image header
+@param rect The ROI rectangle
+ */
+CVAPI(void)  cvSetImageROI( IplImage* image, CvRect rect );
+
+/** @brief Resets the image ROI to include the entire image and releases the ROI structure.
+
+This produces a similar result to the following, but in addition it releases the ROI structure. :
+@code
+    cvSetImageROI(image, cvRect(0, 0, image->width, image->height ));
+    cvSetImageCOI(image, 0);
+@endcode
+@param image A pointer to the image header
+ */
+CVAPI(void)  cvResetImageROI( IplImage* image );
+
+/** @brief Returns the image ROI.
+
+If there is no ROI set, cvRect(0,0,image-\>width,image-\>height) is returned.
+@param image A pointer to the image header
+ */
+CVAPI(CvRect) cvGetImageROI( const IplImage* image );
+
+/** @brief Creates a matrix header but does not allocate the matrix data.
+
+The function allocates a new matrix header and returns a pointer to it. The matrix data can then be
+allocated using cvCreateData or set explicitly to user-allocated data via cvSetData.
+@param rows Number of rows in the matrix
+@param cols Number of columns in the matrix
+@param type Type of the matrix elements, see cvCreateMat
+ */
+CVAPI(CvMat*)  cvCreateMatHeader( int rows, int cols, int type );
+
+#define CV_AUTOSTEP  0x7fffffff
+
+/** @brief Initializes a pre-allocated matrix header.
+
+This function is often used to process raw data with OpenCV matrix functions. For example, the
+following code computes the matrix product of two matrices, stored as ordinary arrays:
+@code
+    double a[] = { 1, 2, 3, 4,
+                   5, 6, 7, 8,
+                   9, 10, 11, 12 };
+
+    double b[] = { 1, 5, 9,
+                   2, 6, 10,
+                   3, 7, 11,
+                   4, 8, 12 };
+
+    double c[9];
+    CvMat Ma, Mb, Mc ;
+
+    cvInitMatHeader(&Ma, 3, 4, CV_64FC1, a);
+    cvInitMatHeader(&Mb, 4, 3, CV_64FC1, b);
+    cvInitMatHeader(&Mc, 3, 3, CV_64FC1, c);
+
+    cvMatMulAdd(&Ma, &Mb, 0, &Mc);
+    // the c array now contains the product of a (3x4) and b (4x3)
+@endcode
+@param mat A pointer to the matrix header to be initialized
+@param rows Number of rows in the matrix
+@param cols Number of columns in the matrix
+@param type Type of the matrix elements, see cvCreateMat .
+@param data Optional: data pointer assigned to the matrix header
+@param step Optional: full row width in bytes of the assigned data. By default, the minimal
+possible step is used which assumes there are no gaps between subsequent rows of the matrix.
+ */
+CVAPI(CvMat*) cvInitMatHeader( CvMat* mat, int rows, int cols,
+                              int type, void* data CV_DEFAULT(NULL),
+                              int step CV_DEFAULT(CV_AUTOSTEP) );
+
+/** @brief Creates a matrix header and allocates the matrix data.
+
+The function call is equivalent to the following code:
+@code
+    CvMat* mat = cvCreateMatHeader(rows, cols, type);
+    cvCreateData(mat);
+@endcode
+@param rows Number of rows in the matrix
+@param cols Number of columns in the matrix
+@param type The type of the matrix elements in the form
+CV_\<bit depth\>\<S|U|F\>C\<number of channels\> , where S=signed, U=unsigned, F=float. For
+example, CV_8UC1 means the elements are 8-bit unsigned and there is 1 channel, and
+CV_32SC2 means the elements are 32-bit signed and there are 2 channels.
+ */
+CVAPI(CvMat*)  cvCreateMat( int rows, int cols, int type );
+
+/** @brief Deallocates a matrix.
+
+The function decrements the matrix data reference counter and deallocates matrix header. If the data
+reference counter is 0, it also deallocates the data. :
+@code
+    if(*mat )
+        cvDecRefData(*mat);
+    cvFree((void**)mat);
+@endcode
+@param mat Double pointer to the matrix
+ */
+CVAPI(void)  cvReleaseMat( CvMat** mat );
+
+/** @brief Decrements an array data reference counter.
+
+The function decrements the data reference counter in a CvMat or CvMatND if the reference
+counter pointer is not NULL.
+If the counter reaches zero, the data is deallocated. In the current
+implementation the reference counter is not NULL only if the data was allocated using the
+cvCreateData function. The counter will be NULL in other cases such as: external data was assigned
+to the header using cvSetData, header is part of a larger matrix or image, or the header was
+converted from an image or n-dimensional matrix header.
+@param arr Pointer to an array header
+ */
+CV_INLINE  void  cvDecRefData( CvArr* arr )
+{   /* Does nothing when arr is neither a CvMat nor a CvMatND header. */
+    if( CV_IS_MAT( arr ))
+    {
+        CvMat* mat = (CvMat*)arr;
+        mat->data.ptr = NULL;  /* the header is detached from the data in every case */
+        if( mat->refcount != NULL && --*mat->refcount == 0 )
+            cvFree( &mat->refcount );  /* counter reached zero: free through the refcount pointer */
+        mat->refcount = NULL;
+    }
+    else if( CV_IS_MATND( arr ))
+    {   /* same steps for the n-dimensional header */
+        CvMatND* mat = (CvMatND*)arr;
+        mat->data.ptr = NULL;
+        if( mat->refcount != NULL && --*mat->refcount == 0 )
+            cvFree( &mat->refcount );
+        mat->refcount = NULL;
+    }
+}
+
+/** @brief Increments array data reference counter.
+
+The function increments CvMat or CvMatND data reference counter and returns the new counter value if
+the reference counter pointer is not NULL, otherwise it returns zero.
+@param arr Array header
+ */
+CV_INLINE  int  cvIncRefData( CvArr* arr )
+{
+    int refcount = 0;  /* returned as-is when there is no counter or arr is not CvMat/CvMatND */
+    if( CV_IS_MAT( arr ))
+    {
+        CvMat* mat = (CvMat*)arr;
+        if( mat->refcount != NULL )
+            refcount = ++*mat->refcount;  /* pre-increment: the new counter value is reported */
+    }
+    else if( CV_IS_MATND( arr ))
+    {   /* same steps for the n-dimensional header */
+        CvMatND* mat = (CvMatND*)arr;
+        if( mat->refcount != NULL )
+            refcount = ++*mat->refcount;
+    }
+    return refcount;
+}
+
+
+/** Creates an exact copy of the input matrix (except, possibly, for the step value) */
+CVAPI(CvMat*) cvCloneMat( const CvMat* mat );
+
+
+/** @brief Returns matrix header corresponding to the rectangular sub-array of input image or matrix.
+
+The function returns the header corresponding to a specified rectangle of the input array.
+In other words, it allows the user to treat a rectangular part of the input array as a
+stand-alone array. ROI is taken into account by the function so the sub-array of ROI is
+actually extracted.
+@param arr Input array
+@param submat Pointer to the resultant sub-array header
+@param rect Zero-based coordinates of the rectangle of interest
+ */
+CVAPI(CvMat*) cvGetSubRect( const CvArr* arr, CvMat* submat, CvRect rect );
+#define cvGetSubArr cvGetSubRect
+
+/** @brief Returns array row or row span.
+
+The function returns the header, corresponding to a specified row/row span of the input array.
+cvGetRow(arr, submat, row) is a shortcut for cvGetRows(arr, submat, row, row+1).
+@param arr Input array
+@param submat Pointer to the resulting sub-array header
+@param start_row Zero-based index of the starting row (inclusive) of the span
+@param end_row Zero-based index of the ending row (exclusive) of the span
+@param delta_row Index step in the row span. That is, the function extracts every delta_row -th
+row from start_row and up to (but not including) end_row .
+ */
+CVAPI(CvMat*) cvGetRows( const CvArr* arr, CvMat* submat,
+                        int start_row, int end_row,
+                        int delta_row CV_DEFAULT(1));
+
+/** @overload
+@param arr Input array
+@param submat Pointer to the resulting sub-array header
+@param row Zero-based index of the selected row
+*/
+CV_INLINE  CvMat*  cvGetRow( const CvArr* arr, CvMat* submat, int row )
+{
+    return cvGetRows( arr, submat, row, row + 1, 1 );  /* span [row, row + 1) with delta_row = 1 */
+}
+
+
+/** @brief Returns one or more array columns.
+
+The function returns the header, corresponding to a specified column span of the input array.
+That is, no data is copied. Therefore, any modifications of the submatrix will affect the
+original array.
+If you need to copy the columns, use cvCloneMat. cvGetCol(arr, submat, col) is a shortcut for
+cvGetCols(arr, submat, col, col+1).
+@param arr Input array
+@param submat Pointer to the resulting sub-array header
+@param start_col Zero-based index of the starting column (inclusive) of the span
+@param end_col Zero-based index of the ending column (exclusive) of the span
+ */
+CVAPI(CvMat*) cvGetCols( const CvArr* arr, CvMat* submat,
+                        int start_col, int end_col );
+
+/** @overload
+@param arr Input array
+@param submat Pointer to the resulting sub-array header
+@param col Zero-based index of the selected column */
+CV_INLINE  CvMat*  cvGetCol( const CvArr* arr, CvMat* submat, int col )
+{
+    const int end_col = col + 1; /* cvGetCols takes an exclusive end index */
+    return cvGetCols( arr, submat, col, end_col );
+}
+
+/** @brief Returns one of array diagonals.
+
+The function returns the header, corresponding to a specified diagonal of the input array.
+@param arr Input array
+@param submat Pointer to the resulting sub-array header
+@param diag Index of the array diagonal. Zero value corresponds to the main diagonal, -1
+corresponds to the diagonal above the main, 1 corresponds to the diagonal below the main, and so
+forth.
+ */
+CVAPI(CvMat*) cvGetDiag( const CvArr* arr, CvMat* submat,
+                            int diag CV_DEFAULT(0));
+
+/** low-level scalar <-> raw data conversion functions */
+CVAPI(void) cvScalarToRawData( const CvScalar* scalar, void* data, int type,
+                              int extend_to_12 CV_DEFAULT(0) );
+
+CVAPI(void) cvRawDataToScalar( const void* data, int type, CvScalar* scalar );
+
+/** @brief Creates a new matrix header but does not allocate the matrix data.
+
+The function allocates a header for a multi-dimensional dense array. The array data can further be
+allocated using cvCreateData or set explicitly to user-allocated data via cvSetData.
+@param dims Number of array dimensions
+@param sizes Array of dimension sizes
+@param type Type of array elements, see cvCreateMat
+ */
+CVAPI(CvMatND*)  cvCreateMatNDHeader( int dims, const int* sizes, int type );
+
+/** @brief Creates the header and allocates the data for a multi-dimensional dense array.
+
+This function call is equivalent to the following code:
+@code
+    CvMatND* mat = cvCreateMatNDHeader(dims, sizes, type);
+    cvCreateData(mat);
+@endcode
+@param dims Number of array dimensions. This must not exceed CV_MAX_DIM (32 by default, but can be
+changed at build time).
+@param sizes Array of dimension sizes.
+@param type Type of array elements, see cvCreateMat .
+ */
+CVAPI(CvMatND*)  cvCreateMatND( int dims, const int* sizes, int type );
+
+/** @brief Initializes a pre-allocated multi-dimensional array header.
+
+@param mat A pointer to the array header to be initialized
+@param dims The number of array dimensions
+@param sizes An array of dimension sizes
+@param type Type of array elements, see cvCreateMat
+@param data Optional data pointer assigned to the matrix header
+ */
+CVAPI(CvMatND*)  cvInitMatNDHeader( CvMatND* mat, int dims, const int* sizes,
+                                    int type, void* data CV_DEFAULT(NULL) );
+
+/** @brief Deallocates a multi-dimensional array.
+
+The function decrements the array data reference counter and releases the array header. If the
+reference counter reaches 0, it also deallocates the data:
+@code
+    if(*mat )
+        cvDecRefData(*mat);
+    cvFree((void**)mat);
+@endcode
+@param mat Double pointer to the array */
+CV_INLINE  void  cvReleaseMatND( CvMatND** mat )
+{
+    CvMat** header = (CvMat**)mat; /* the release is delegated to cvReleaseMat */
+    cvReleaseMat( header );
+}
+
+/** Creates a copy of CvMatND (except, maybe, for the steps) */
+CVAPI(CvMatND*) cvCloneMatND( const CvMatND* mat );
+
+/** @brief Creates sparse array.
+
+The function allocates a multi-dimensional sparse array. Initially the array contains no elements,
+that is, PtrND and other related functions will return 0 for every index.
+@param dims Number of array dimensions. In contrast to the dense matrix, the number of dimensions is
+practically unlimited (up to \f$2^{16}\f$ ).
+@param sizes Array of dimension sizes
+@param type Type of array elements. The same as for CvMat
+ */
+CVAPI(CvSparseMat*)  cvCreateSparseMat( int dims, const int* sizes, int type );
+
+/** @brief Deallocates sparse array.
+
+The function releases the sparse array and clears the array pointer upon exit.
+@param mat Double pointer to the array
+ */
+CVAPI(void)  cvReleaseSparseMat( CvSparseMat** mat );
+
+/** Creates a copy of CvSparseMat (except, maybe, for zero items) */
+CVAPI(CvSparseMat*) cvCloneSparseMat( const CvSparseMat* mat );
+
+/** @brief Initializes sparse array elements iterator.
+
+The function initializes iterator of sparse array elements and returns pointer to the first element,
+or NULL if the array is empty.
+@param mat Input array
+@param mat_iterator Initialized iterator
+ */
+CVAPI(CvSparseNode*) cvInitSparseMatIterator( const CvSparseMat* mat,
+                                              CvSparseMatIterator* mat_iterator );
+
+/** @brief Returns the next sparse matrix element
+
+The function moves the iterator to the next sparse matrix element and returns a pointer to it. In
+the current version the elements are returned in no particular order, because they are stored in a
+hash table. The sample below demonstrates how to iterate through the sparse matrix:
+@code
+    // print all the non-zero sparse matrix elements and compute their sum
+    double sum = 0;
+    int i, dims = cvGetDims(sparsemat);
+    CvSparseMatIterator it;
+    CvSparseNode* node = cvInitSparseMatIterator(sparsemat, &it);
+
+    for(; node != 0; node = cvGetNextSparseNode(&it))
+    {
+        int* idx = CV_NODE_IDX(sparsemat, node);
+        float val = *(float*)CV_NODE_VAL(sparsemat, node);
+        printf("M");
+        for(i = 0; i < dims; i++ )
+            printf("[%d]", idx[i]);
+        printf("=%g\n", val);
+
+        sum += val;
+    }
+
+    printf("\nTotal sum = %g\n", sum);
+@endcode
+@param mat_iterator Sparse array iterator
+ */
+CV_INLINE CvSparseNode* cvGetNextSparseNode( CvSparseMatIterator* mat_iterator )
+{
+    CvSparseNode* candidate = mat_iterator->node->next;
+    int bucket;
+
+    /* The current hash bucket still has a chained node: step to it. */
+    if( candidate )
+        return mat_iterator->node = candidate;
+    /* Otherwise look for the next non-empty bucket of the hash table. */
+    for( bucket = ++mat_iterator->curidx; bucket < mat_iterator->mat->hashsize; bucket++ )
+    {
+        candidate = (CvSparseNode*)mat_iterator->mat->hashtable[bucket];
+        if( !candidate )
+            continue;
+        mat_iterator->curidx = bucket;
+        return mat_iterator->node = candidate;
+    }
+    return NULL; /* no nodes left; mat_iterator->node is left unchanged */
+}
+
+
+#define CV_MAX_ARR 10
+
+/** Matrix iterator state: used for n-ary operations on dense arrays */
+typedef struct CvNArrayIterator
+{
+    int count; /**< number of arrays */
+    int dims; /**< number of dimensions to iterate */
+    CvSize size; /**< maximal common linear size: { width = size, height = 1 } */
+    uchar* ptr[CV_MAX_ARR]; /**< pointers to the array slices (room for CV_MAX_ARR arrays) */
+    int stack[CV_MAX_DIM]; /**< for internal use */
+    CvMatND* hdr[CV_MAX_ARR]; /**< pointers to the headers of the
+                                 matrices that are processed */
+}
+CvNArrayIterator;
+
+#define CV_NO_DEPTH_CHECK     1
+#define CV_NO_CN_CHECK        2
+#define CV_NO_SIZE_CHECK      4
+
+/** initializes iterator that traverses through several arrays simultaneously
+   (the function together with cvNextNArraySlice is used for
+    N-ary element-wise operations) */
+CVAPI(int) cvInitNArrayIterator( int count, CvArr** arrs,
+                                 const CvArr* mask, CvMatND* stubs,
+                                 CvNArrayIterator* array_iterator,
+                                 int flags CV_DEFAULT(0) );
+
+/** returns zero value if iteration is finished, non-zero (slice length) otherwise */
+CVAPI(int) cvNextNArraySlice( CvNArrayIterator* array_iterator );
+
+
+/** @brief Returns type of array elements.
+
+The function returns type of the array elements. In the case of IplImage the type is converted to
+CvMat-like representation. For example, if the image has been created as:
+@code
+    IplImage* img = cvCreateImage(cvSize(640, 480), IPL_DEPTH_8U, 3);
+@endcode
+The code cvGetElemType(img) will return CV_8UC3.
+@param arr Input array
+ */
+CVAPI(int) cvGetElemType( const CvArr* arr );
+
+/** @brief Return number of array dimensions
+
+The function returns the array dimensionality and the array of dimension sizes. In the case of
+IplImage or CvMat it always returns 2 regardless of number of image/matrix rows. For example, the
+following code calculates total number of array elements:
+@code
+    int sizes[CV_MAX_DIM];
+    int i, total = 1;
+    int dims = cvGetDims(arr, sizes);
+    for(i = 0; i < dims; i++ )
+        total *= sizes[i];
+@endcode
+@param arr Input array
+@param sizes Optional output vector of the array dimension sizes. For 2d arrays the number of rows
+(height) goes first, number of columns (width) next.
+ */
+CVAPI(int) cvGetDims( const CvArr* arr, int* sizes CV_DEFAULT(NULL) );
+
+
+/** @brief Returns array size along the specified dimension.
+
+@param arr Input array
+@param index Zero-based dimension index (for matrices 0 means number of rows, 1 means number of
+columns; for images 0 means height, 1 means width)
+ */
+CVAPI(int) cvGetDimSize( const CvArr* arr, int index );
+
+
+/** @brief Return pointer to a particular array element.
+
+The functions return a pointer to a specific array element. The number of array dimensions should
+match the number of indices passed to the function, except for the cvPtr1D function, which can be
+used for sequential access to 1D, 2D or nD dense arrays.
+
+The functions can be used for sparse arrays as well - if the requested node does not exist they
+create it and set it to zero.
+
+All these as well as other functions accessing array elements ( cvGetND , cvGetRealND , cvSet ,
+cvSetND , cvSetRealND ) raise an error if the element index is out of range.
+@param arr Input array
+@param idx0 The first zero-based component of the element index
+@param type Optional output parameter: type of matrix elements
+ */
+CVAPI(uchar*) cvPtr1D( const CvArr* arr, int idx0, int* type CV_DEFAULT(NULL));
+/** @overload */
+CVAPI(uchar*) cvPtr2D( const CvArr* arr, int idx0, int idx1, int* type CV_DEFAULT(NULL) );
+/** @overload */
+CVAPI(uchar*) cvPtr3D( const CvArr* arr, int idx0, int idx1, int idx2,
+                      int* type CV_DEFAULT(NULL));
+/** @overload
+@param arr Input array
+@param idx Array of the element indices
+@param type Optional output parameter: type of matrix elements
+@param create_node Optional input parameter for sparse matrices. Non-zero value of the parameter
+means that the requested element is created if it does not exist already.
+@param precalc_hashval Optional input parameter for sparse matrices. If the pointer is not NULL,
+the function does not recalculate the node hash value, but takes it from the specified location.
+It is useful for speeding up pair-wise operations (TODO: provide an example)
+*/
+CVAPI(uchar*) cvPtrND( const CvArr* arr, const int* idx, int* type CV_DEFAULT(NULL),
+                      int create_node CV_DEFAULT(1),
+                      unsigned* precalc_hashval CV_DEFAULT(NULL));
+
+/** @brief Return a specific array element.
+
+The functions return a specific array element. In the case of a sparse array the functions return 0
+if the requested node does not exist (no new node is created by the functions).
+@param arr Input array
+@param idx0 The first zero-based component of the element index
+ */
+CVAPI(CvScalar) cvGet1D( const CvArr* arr, int idx0 );
+/** @overload */
+CVAPI(CvScalar) cvGet2D( const CvArr* arr, int idx0, int idx1 );
+/** @overload */
+CVAPI(CvScalar) cvGet3D( const CvArr* arr, int idx0, int idx1, int idx2 );
+/** @overload
+@param arr Input array
+@param idx Array of the element indices
+*/
+CVAPI(CvScalar) cvGetND( const CvArr* arr, const int* idx );
+
+/** @brief Return a specific element of single-channel 1D, 2D, 3D or nD array.
+
+Returns a specific element of a single-channel array. If the array has multiple channels, a runtime
+error is raised. Note that Get?D functions can be used safely for both single-channel and
+multiple-channel arrays though they are a bit slower.
+
+In the case of a sparse array the functions return 0 if the requested node does not exist (no new
+node is created by the functions).
+@param arr Input array. Must have a single channel.
+@param idx0 The first zero-based component of the element index
+ */
+CVAPI(double) cvGetReal1D( const CvArr* arr, int idx0 );
+/** @overload */
+CVAPI(double) cvGetReal2D( const CvArr* arr, int idx0, int idx1 );
+/** @overload */
+CVAPI(double) cvGetReal3D( const CvArr* arr, int idx0, int idx1, int idx2 );
+/** @overload
+@param arr Input array. Must have a single channel.
+@param idx Array of the element indices
+*/
+CVAPI(double) cvGetRealND( const CvArr* arr, const int* idx );
+
+/** @brief Change the particular array element.
+
+The functions assign the new value to a particular array element. In the case of a sparse array the
+functions create the node if it does not exist yet.
+@param arr Input array
+@param idx0 The first zero-based component of the element index
+@param value The assigned value
+ */
+CVAPI(void) cvSet1D( CvArr* arr, int idx0, CvScalar value );
+/** @overload */
+CVAPI(void) cvSet2D( CvArr* arr, int idx0, int idx1, CvScalar value );
+/** @overload */
+CVAPI(void) cvSet3D( CvArr* arr, int idx0, int idx1, int idx2, CvScalar value );
+/** @overload
+@param arr Input array
+@param idx Array of the element indices
+@param value The assigned value
+*/
+CVAPI(void) cvSetND( CvArr* arr, const int* idx, CvScalar value );
+
+/** @brief Change a specific array element.
+
+The functions assign a new value to a specific element of a single-channel array. If the array has
+multiple channels, a runtime error is raised. Note that the Set\*D functions can be used safely for
+both single-channel and multiple-channel arrays, though they are a bit slower.
+
+In the case of a sparse array the functions create the node if it does not yet exist.
+@param arr Input array
+@param idx0 The first zero-based component of the element index
+@param value The assigned value
+ */
+CVAPI(void) cvSetReal1D( CvArr* arr, int idx0, double value );
+/** @overload */
+CVAPI(void) cvSetReal2D( CvArr* arr, int idx0, int idx1, double value );
+/** @overload */
+CVAPI(void) cvSetReal3D( CvArr* arr, int idx0,
+                        int idx1, int idx2, double value );
+/** @overload
+@param arr Input array
+@param idx Array of the element indices
+@param value The assigned value
+*/
+CVAPI(void) cvSetRealND( CvArr* arr, const int* idx, double value );
+
+/** clears element of ND dense array,
+   in case of sparse arrays it deletes the specified node */
+CVAPI(void) cvClearND( CvArr* arr, const int* idx );
+
+/** @brief Returns matrix header for arbitrary array.
+
+The function returns a matrix header for the input array that can be a matrix - CvMat, an image -
+IplImage, or a multi-dimensional dense array - CvMatND (the third option is allowed only if
+allowND != 0) . In the case of matrix the function simply returns the input pointer. In the case of
+IplImage\* or CvMatND it initializes the header structure with parameters of the current image ROI
+and returns &header. Because COI is not supported by CvMat, it is returned separately.
+
+The function provides an easy way to handle both types of arrays - IplImage and CvMat using the same
+code. Input array must have non-zero data pointer, otherwise the function will report an error.
+
+@note If the input array is IplImage with planar data layout and COI set, the function returns the
+pointer to the selected plane and COI == 0. This feature allows user to process IplImage structures
+with planar data layout, even though OpenCV does not support such images.
+@param arr Input array
+@param header Pointer to CvMat structure used as a temporary buffer
+@param coi Optional output parameter for storing COI
+@param allowND If non-zero, the function accepts multi-dimensional dense arrays (CvMatND\*) and
+returns 2D matrix (if CvMatND has two dimensions) or 1D matrix (when CvMatND has 1 dimension or
+more than 2 dimensions). The CvMatND array must be continuous.
+@sa cvGetImage, cvarrToMat.
+ */
+CVAPI(CvMat*) cvGetMat( const CvArr* arr, CvMat* header,
+                       int* coi CV_DEFAULT(NULL),
+                       int allowND CV_DEFAULT(0));
+
+/** @brief Returns image header for arbitrary array.
+
+The function returns the image header for the input array that can be a matrix (CvMat) or image
+(IplImage). In the case of an image the function simply returns the input pointer. In the case of
+CvMat it initializes an image_header structure with the parameters of the input matrix. Note that
+if we transform IplImage to CvMat using cvGetMat and then transform CvMat back to IplImage using
+this function, we will get different headers if the ROI is set in the original image.
+@param arr Input array
+@param image_header Pointer to IplImage structure used as a temporary buffer
+ */
+CVAPI(IplImage*) cvGetImage( const CvArr* arr, IplImage* image_header );
+
+
+/** @brief Changes the shape of a multi-dimensional array without copying the data.
+
+The function is an advanced version of cvReshape that can work with multi-dimensional arrays as
+well (though it can work with ordinary images and matrices) and change the number of dimensions.
+
+Below are the two samples from the cvReshape description rewritten using cvReshapeMatND:
+@code
+    IplImage* color_img = cvCreateImage(cvSize(320,240), IPL_DEPTH_8U, 3);
+    IplImage gray_img_hdr, *gray_img;
+    gray_img = (IplImage*)cvReshapeMatND(color_img, sizeof(gray_img_hdr), &gray_img_hdr, 1, 0, 0);
+    ...
+    int size[] = { 2, 2, 2 };
+    CvMatND* mat = cvCreateMatND(3, size, CV_32F);
+    CvMat row_header, *row;
+    row = (CvMat*)cvReshapeMatND(mat, sizeof(row_header), &row_header, 0, 1, 0);
+@endcode
+In C, the header file for this function includes a convenient macro cvReshapeND that does away with
+the sizeof_header parameter. So, the lines containing the call to cvReshapeMatND in the examples
+may be replaced as follows:
+@code
+    gray_img = (IplImage*)cvReshapeND(color_img, &gray_img_hdr, 1, 0, 0);
+    ...
+    row = (CvMat*)cvReshapeND(mat, &row_header, 0, 1, 0);
+@endcode
+@param arr Input array
+@param sizeof_header Size of output header to distinguish between IplImage, CvMat and CvMatND
+output headers
+@param header Output header to be filled
+@param new_cn New number of channels. new_cn = 0 means that the number of channels remains
+unchanged.
+@param new_dims New number of dimensions. new_dims = 0 means that the number of dimensions
+remains the same.
+@param new_sizes Array of new dimension sizes. Only new_dims-1 values are used, because the
+total number of elements must remain the same. Thus, if new_dims = 1, new_sizes array is not
+used.
+ */
+CVAPI(CvArr*) cvReshapeMatND( const CvArr* arr,
+                             int sizeof_header, CvArr* header,
+                             int new_cn, int new_dims, int* new_sizes );
+
+#define cvReshapeND( arr, header, new_cn, new_dims, new_sizes )   \
+      cvReshapeMatND( (arr), sizeof(*(header)), (header),         \
+                      (new_cn), (new_dims), (new_sizes))
+
+/** @brief Changes shape of matrix/image without copying data.
+
+The function initializes the CvMat header so that it points to the same data as the original array
+but has a different shape - different number of channels, different number of rows, or both.
+
+The following example code creates one image buffer and two image headers, the first is for a
+320x240x3 image and the second is for a 960x240x1 image:
+@code
+    IplImage* color_img = cvCreateImage(cvSize(320,240), IPL_DEPTH_8U, 3);
+    CvMat gray_mat_hdr;
+    IplImage gray_img_hdr, *gray_img;
+    cvReshape(color_img, &gray_mat_hdr, 1);
+    gray_img = cvGetImage(&gray_mat_hdr, &gray_img_hdr);
+@endcode
+And the next example converts a 3x3 matrix to a single 1x9 vector:
+@code
+    CvMat* mat = cvCreateMat(3, 3, CV_32F);
+    CvMat row_header, *row;
+    row = cvReshape(mat, &row_header, 0, 1);
+@endcode
+@param arr Input array
+@param header Output header to be filled
+@param new_cn New number of channels. 'new_cn = 0' means that the number of channels remains
+unchanged.
+@param new_rows New number of rows. 'new_rows = 0' means that the number of rows remains
+unchanged unless it needs to be changed according to new_cn value.
+*/
+CVAPI(CvMat*) cvReshape( const CvArr* arr, CvMat* header,
+                        int new_cn, int new_rows CV_DEFAULT(0) );
+
+/** Repeats source 2d array several times in both horizontal and
+   vertical direction to fill destination array */
+CVAPI(void) cvRepeat( const CvArr* src, CvArr* dst );
+
+/** @brief Allocates array data
+
+The function allocates image, matrix or multi-dimensional dense array data. Note that in the case of
+matrix types OpenCV allocation functions are used. In the case of IplImage they are used unless
+CV_TURN_ON_IPL_COMPATIBILITY() has been called before. In the latter case IPL functions are used
+to allocate the data.
+@param arr Array header
+ */
+CVAPI(void)  cvCreateData( CvArr* arr );
+
+/** @brief Releases array data.
+
+The function releases the array data. In the case of CvMat or CvMatND it simply calls
+cvDecRefData(), that is the function can not deallocate external data. See also the note to
+cvCreateData .
+@param arr Array header
+ */
+CVAPI(void)  cvReleaseData( CvArr* arr );
+
+/** @brief Assigns user data to the array header.
+
+The function assigns user data to the array header. Header should be initialized before using
+cvCreateMatHeader, cvCreateImageHeader, cvCreateMatNDHeader, cvInitMatHeader,
+cvInitImageHeader or cvInitMatNDHeader.
+@param arr Array header
+@param data User data
+@param step Full row length in bytes
+ */
+CVAPI(void)  cvSetData( CvArr* arr, void* data, int step );
+
+/** @brief Retrieves low-level information about the array.
+
+The function fills output variables with low-level information about the array
+data. All output parameters are optional, so some of the pointers may be set to
+NULL. If the array is IplImage with ROI set, the parameters of ROI are
+returned.
+
+The following example shows how to get access to array elements. It computes absolute values of the
+array elements :
+@code
+    float* data;
+    int step;
+    CvSize size;
+
+    cvGetRawData(array, (uchar**)&data, &step, &size);
+    step /= sizeof(data[0]);
+
+    for(int y = 0; y < size.height; y++, data += step )
+        for(int x = 0; x < size.width; x++ )
+            data[x] = (float)fabs(data[x]);
+@endcode
+@param arr Array header
+@param data Output pointer to the whole image origin or ROI origin if ROI is set
+@param step Output full row length in bytes
+@param roi_size Output ROI size
+ */
+CVAPI(void) cvGetRawData( const CvArr* arr, uchar** data,
+                         int* step CV_DEFAULT(NULL),
+                         CvSize* roi_size CV_DEFAULT(NULL));
+
+/** @brief Returns size of matrix or image ROI.
+
+The function returns number of rows (CvSize::height) and number of columns (CvSize::width) of the
+input matrix or image. In the case of image the size of ROI is returned.
+@param arr array header
+ */
+CVAPI(CvSize) cvGetSize( const CvArr* arr );
+
+/** @brief Copies one array to another.
+
+The function copies selected elements from an input array to an output array:
+
+\f[\texttt{dst} (I)= \texttt{src} (I)  \quad \text{if} \quad \texttt{mask} (I)  \ne 0.\f]
+
+If any of the passed arrays is of IplImage type, then its ROI and COI fields are used. Both arrays
+must have the same type, the same number of dimensions, and the same size. The function can also
+copy sparse arrays (mask is not supported in this case).
+@param src The source array
+@param dst The destination array
+@param mask Operation mask, 8-bit single channel array; specifies elements of the destination array
+to be changed
+ */
+CVAPI(void)  cvCopy( const CvArr* src, CvArr* dst,
+                     const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @brief Sets every element of an array to a given value.
+
+The function copies the scalar value to every selected element of the destination array:
+\f[\texttt{arr} (I)= \texttt{value} \quad \text{if} \quad \texttt{mask} (I)  \ne 0\f]
+If array arr is of IplImage type, then its ROI is used, but COI must not be set.
+@param arr The destination array
+@param value Fill value
+@param mask Operation mask, 8-bit single channel array; specifies elements of the destination
+array to be changed
+ */
+CVAPI(void)  cvSet( CvArr* arr, CvScalar value,
+                    const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @brief Clears the array.
+
+The function clears the array. In the case of dense arrays (CvMat, CvMatND or IplImage),
+cvZero(array) is equivalent to cvSet(array,cvScalarAll(0),0). In the case of sparse arrays all the
+elements are removed.
+@param arr Array to be cleared
+ */
+CVAPI(void)  cvSetZero( CvArr* arr );
+#define cvZero  cvSetZero
+
+
+/** Splits a multi-channel array into the set of single-channel arrays or
+   extracts particular [color] plane */
+CVAPI(void)  cvSplit( const CvArr* src, CvArr* dst0, CvArr* dst1,
+                      CvArr* dst2, CvArr* dst3 );
+
+/** Merges a set of single-channel arrays into the single multi-channel array
+   or inserts one particular [color] plane to the array */
+CVAPI(void)  cvMerge( const CvArr* src0, const CvArr* src1,
+                      const CvArr* src2, const CvArr* src3,
+                      CvArr* dst );
+
+/** Copies several channels from input arrays to
+   certain channels of output arrays */
+CVAPI(void)  cvMixChannels( const CvArr** src, int src_count,
+                            CvArr** dst, int dst_count,
+                            const int* from_to, int pair_count );
+
+/** @brief Converts one array to another with optional linear transformation.
+
+The function has several different purposes, and thus has several different names. It copies one
+array to another with optional scaling, which is performed first, and/or optional type conversion,
+performed after:
+
+\f[\texttt{dst} (I) =  \texttt{scale} \texttt{src} (I) + ( \texttt{shift} _0, \texttt{shift} _1,...)\f]
+
+All the channels of multi-channel arrays are processed independently.
+
+The type conversion is done with rounding and saturation, that is if the result of scaling +
+conversion can not be represented exactly by a value of the destination array element type, it is
+set to the nearest representable value on the real axis.
+@param src Source array
+@param dst Destination array
+@param scale Scale factor
+@param shift Value added to the scaled source array elements
+ */
+CVAPI(void)  cvConvertScale( const CvArr* src, CvArr* dst,
+                             double scale CV_DEFAULT(1),
+                             double shift CV_DEFAULT(0) );
+#define cvCvtScale cvConvertScale
+#define cvScale  cvConvertScale
+#define cvConvert( src, dst )  cvConvertScale( (src), (dst), 1, 0 )
+
+
+/** Performs linear transformation on every source array element,
+   stores absolute value of the result:
+   dst(x,y,c) = abs(scale*src(x,y,c)+shift).
+   destination array must have 8u type.
+   In other cases one may use cvConvertScale + cvAbsDiffS */
+CVAPI(void)  cvConvertScaleAbs( const CvArr* src, CvArr* dst,
+                                double scale CV_DEFAULT(1),
+                                double shift CV_DEFAULT(0) );
+#define cvCvtScaleAbs  cvConvertScaleAbs
+
+
+/** checks termination criteria validity and
+   sets eps to default_eps (if it is not set),
+   max_iter to default_max_iters (if it is not set)
+*/
+CVAPI(CvTermCriteria) cvCheckTermCriteria( CvTermCriteria criteria,
+                                           double default_eps,
+                                           int default_max_iters );
+
+/****************************************************************************************\
+*                   Arithmetic, logic and comparison operations                          *
+\****************************************************************************************/
+
+/** dst(mask) = src1(mask) + src2(mask) */
+CVAPI(void)  cvAdd( const CvArr* src1, const CvArr* src2, CvArr* dst,
+                    const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(mask) = src(mask) + value */
+CVAPI(void)  cvAddS( const CvArr* src, CvScalar value, CvArr* dst,
+                     const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(mask) = src1(mask) - src2(mask) */
+CVAPI(void)  cvSub( const CvArr* src1, const CvArr* src2, CvArr* dst,
+                    const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(mask) = src(mask) - value, evaluated as src(mask) + (-value) via cvAddS */
+CV_INLINE  void  cvSubS( const CvArr* src, CvScalar value, CvArr* dst,
+                         const CvArr* mask CV_DEFAULT(NULL))
+{
+    const CvScalar negated = cvScalar( -value.val[0], -value.val[1], -value.val[2], -value.val[3] );
+    cvAddS( src, negated, dst, mask );
+}
+
+/** dst(mask) = value - src(mask) */
+CVAPI(void)  cvSubRS( const CvArr* src, CvScalar value, CvArr* dst,
+                      const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(idx) = src1(idx) * src2(idx) * scale
+   (scaled element-wise multiplication of 2 arrays) */
+CVAPI(void)  cvMul( const CvArr* src1, const CvArr* src2,
+                    CvArr* dst, double scale CV_DEFAULT(1) );
+
+/** element-wise division/inversion with scaling:
+    dst(idx) = src1(idx) * scale / src2(idx)
+    or dst(idx) = scale / src2(idx) if src1 == 0 */
+CVAPI(void)  cvDiv( const CvArr* src1, const CvArr* src2,
+                    CvArr* dst, double scale CV_DEFAULT(1));
+
+/** dst = src1 * scale + src2 */
+CVAPI(void)  cvScaleAdd( const CvArr* src1, CvScalar scale,
+                         const CvArr* src2, CvArr* dst );
+#define cvAXPY( A, real_scalar, B, C ) cvScaleAdd(A, cvRealScalar(real_scalar), B, C)
+
+/** dst = src1 * alpha + src2 * beta + gamma */
+CVAPI(void)  cvAddWeighted( const CvArr* src1, double alpha,
+                            const CvArr* src2, double beta,
+                            double gamma, CvArr* dst );
+
+/** @brief Calculates the dot product of two arrays in Euclidean metrics.
+
+The function calculates and returns the Euclidean dot product of two arrays.
+
+\f[src1  \bullet src2 =  \sum _I ( \texttt{src1} (I)  \texttt{src2} (I))\f]
+
+In the case of multiple channel arrays, the results for all channels are accumulated. In particular,
+cvDotProduct(a,a) where a is a complex vector, will return \f$||\texttt{a}||^2\f$. The function can
+process multi-dimensional arrays, row by row, layer by layer, and so on.
+@param src1 The first source array
+@param src2 The second source array
+ */
+CVAPI(double)  cvDotProduct( const CvArr* src1, const CvArr* src2 );
+
+/** dst(idx) = src1(idx) & src2(idx) */
+CVAPI(void) cvAnd( const CvArr* src1, const CvArr* src2,
+                  CvArr* dst, const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(idx) = src(idx) & value */
+CVAPI(void) cvAndS( const CvArr* src, CvScalar value,
+                   CvArr* dst, const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(idx) = src1(idx) | src2(idx) */
+CVAPI(void) cvOr( const CvArr* src1, const CvArr* src2,
+                 CvArr* dst, const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(idx) = src(idx) | value */
+CVAPI(void) cvOrS( const CvArr* src, CvScalar value,
+                  CvArr* dst, const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(idx) = src1(idx) ^ src2(idx) */
+CVAPI(void) cvXor( const CvArr* src1, const CvArr* src2,
+                  CvArr* dst, const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(idx) = src(idx) ^ value */
+CVAPI(void) cvXorS( const CvArr* src, CvScalar value,
+                   CvArr* dst, const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(idx) = ~src(idx) */
+CVAPI(void) cvNot( const CvArr* src, CvArr* dst );
+
+/** dst(idx) = lower(idx) <= src(idx) < upper(idx) */
+CVAPI(void) cvInRange( const CvArr* src, const CvArr* lower,
+                      const CvArr* upper, CvArr* dst );
+
+/** dst(idx) = lower <= src(idx) < upper */
+CVAPI(void) cvInRangeS( const CvArr* src, CvScalar lower,
+                       CvScalar upper, CvArr* dst );
+
+#define CV_CMP_EQ   0
+#define CV_CMP_GT   1
+#define CV_CMP_GE   2
+#define CV_CMP_LT   3
+#define CV_CMP_LE   4
+#define CV_CMP_NE   5
+
+/** The comparison operations support single-channel arrays only.
+   Destination image should be 8uC1 or 8sC1 */
+
+/** dst(idx) = src1(idx) _cmp_op_ src2(idx) */
+CVAPI(void) cvCmp( const CvArr* src1, const CvArr* src2, CvArr* dst, int cmp_op );
+
+/** dst(idx) = src(idx) _cmp_op_ value */
+CVAPI(void) cvCmpS( const CvArr* src, double value, CvArr* dst, int cmp_op );
+
+/** dst(idx) = min(src1(idx),src2(idx)) */
+CVAPI(void) cvMin( const CvArr* src1, const CvArr* src2, CvArr* dst );
+
+/** dst(idx) = max(src1(idx),src2(idx)) */
+CVAPI(void) cvMax( const CvArr* src1, const CvArr* src2, CvArr* dst );
+
+/** dst(idx) = min(src(idx),value) */
+CVAPI(void) cvMinS( const CvArr* src, double value, CvArr* dst );
+
+/** dst(idx) = max(src(idx),value) */
+CVAPI(void) cvMaxS( const CvArr* src, double value, CvArr* dst );
+
+/** dst(x,y,c) = abs(src1(x,y,c) - src2(x,y,c)) */
+CVAPI(void) cvAbsDiff( const CvArr* src1, const CvArr* src2, CvArr* dst );
+
+/** dst(x,y,c) = abs(src(x,y,c) - value(c)) */
+CVAPI(void) cvAbsDiffS( const CvArr* src, CvArr* dst, CvScalar value );
+#define cvAbs( src, dst ) cvAbsDiffS( (src), (dst), cvScalarAll(0))
+
+/****************************************************************************************\
+*                                Math operations                                         *
+\****************************************************************************************/
+
+/** Does cartesian->polar coordinates conversion.
+   Either of output components (magnitude or angle) is optional */
+CVAPI(void)  cvCartToPolar( const CvArr* x, const CvArr* y,
+                            CvArr* magnitude, CvArr* angle CV_DEFAULT(NULL),
+                            int angle_in_degrees CV_DEFAULT(0));
+
+/** Does polar->cartesian coordinates conversion.
+   Either of output components (magnitude or angle) is optional.
+   If magnitude is missing it is assumed to be all 1's */
+CVAPI(void)  cvPolarToCart( const CvArr* magnitude, const CvArr* angle,
+                            CvArr* x, CvArr* y,
+                            int angle_in_degrees CV_DEFAULT(0));
+
+/** Does powering: dst(idx) = src(idx)^power */
+CVAPI(void)  cvPow( const CvArr* src, CvArr* dst, double power );
+
+/** Does exponentiation: dst(idx) = exp(src(idx)).
+   Overflow is not handled yet. Underflow is handled.
+   Maximal relative error is ~7e-6 for single-precision input */
+CVAPI(void)  cvExp( const CvArr* src, CvArr* dst );
+
+/** Calculates natural logarithms: dst(idx) = log(abs(src(idx))).
+   Logarithm of 0 gives large negative number(~-700)
+   Maximal relative error is ~3e-7 for single-precision output
+*/
+CVAPI(void)  cvLog( const CvArr* src, CvArr* dst );
+
+/** Fast arctangent calculation */
+CVAPI(float) cvFastArctan( float y, float x );
+
+/** Fast cubic root calculation */
+CVAPI(float)  cvCbrt( float value );
+
+#define  CV_CHECK_RANGE    1
+#define  CV_CHECK_QUIET    2
+/** Checks array values for NaNs, Infs or simply for too large numbers
+   (if CV_CHECK_RANGE is set). If CV_CHECK_QUIET is set,
+   no runtime error is raised (function returns zero value in case of "bad" values).
+   Otherwise cvError is called */
+CVAPI(int)  cvCheckArr( const CvArr* arr, int flags CV_DEFAULT(0),
+                        double min_val CV_DEFAULT(0), double max_val CV_DEFAULT(0));
+#define cvCheckArray cvCheckArr
+
+#define CV_RAND_UNI      0
+#define CV_RAND_NORMAL   1
+
+/** @brief Fills an array with random numbers and updates the RNG state.
+
+The function fills the destination array with uniformly or normally distributed random numbers.
+@param rng CvRNG state initialized by cvRNG
+@param arr The destination array
+@param dist_type Distribution type
+> -   **CV_RAND_UNI** uniform distribution
+> -   **CV_RAND_NORMAL** normal or Gaussian distribution
+@param param1 The first parameter of the distribution. In the case of a uniform distribution it is
+the inclusive lower boundary of the random numbers range. In the case of a normal distribution it
+is the mean value of the random numbers.
+@param param2 The second parameter of the distribution. In the case of a uniform distribution it
+is the exclusive upper boundary of the random numbers range. In the case of a normal distribution
+it is the standard deviation of the random numbers.
+@sa randu, randn, RNG::fill.
+ */
+CVAPI(void) cvRandArr( CvRNG* rng, CvArr* arr, int dist_type,
+                      CvScalar param1, CvScalar param2 );
+
+CVAPI(void) cvRandShuffle( CvArr* mat, CvRNG* rng,
+                           double iter_factor CV_DEFAULT(1.));
+
+#define CV_SORT_EVERY_ROW 0
+#define CV_SORT_EVERY_COLUMN 1
+#define CV_SORT_ASCENDING 0
+#define CV_SORT_DESCENDING 16
+
+CVAPI(void) cvSort( const CvArr* src, CvArr* dst CV_DEFAULT(NULL),
+                    CvArr* idxmat CV_DEFAULT(NULL),
+                    int flags CV_DEFAULT(0));
+
+/** Finds real roots of a cubic equation */
+CVAPI(int) cvSolveCubic( const CvMat* coeffs, CvMat* roots );
+
+/** Finds all real and complex roots of a polynomial equation */
+CVAPI(void) cvSolvePoly(const CvMat* coeffs, CvMat *roots2,
+      int maxiter CV_DEFAULT(20), int fig CV_DEFAULT(100));
+
+/****************************************************************************************\
+*                                Matrix operations                                       *
+\****************************************************************************************/
+
+/** @brief Calculates the cross product of two 3D vectors.
+
+The function calculates the cross product of two 3D vectors:
+\f[\texttt{dst} =  \texttt{src1} \times \texttt{src2}\f]
+or:
+\f[\begin{array}{l} \texttt{dst} _1 =  \texttt{src1} _2  \texttt{src2} _3 -  \texttt{src1} _3  \texttt{src2} _2 \\ \texttt{dst} _2 =  \texttt{src1} _3  \texttt{src2} _1 -  \texttt{src1} _1  \texttt{src2} _3 \\ \texttt{dst} _3 =  \texttt{src1} _1  \texttt{src2} _2 -  \texttt{src1} _2  \texttt{src2} _1 \end{array}\f]
+@param src1 The first source vector
+@param src2 The second source vector
+@param dst The destination vector
+ */
+CVAPI(void)  cvCrossProduct( const CvArr* src1, const CvArr* src2, CvArr* dst );
+
+/** Matrix transform: dst = A*B + C, C is optional */
+#define cvMatMulAdd( src1, src2, src3, dst ) cvGEMM( (src1), (src2), 1., (src3), 1., (dst), 0 )
+#define cvMatMul( src1, src2, dst )  cvMatMulAdd( (src1), (src2), NULL, (dst))
+
+#define CV_GEMM_A_T 1
+#define CV_GEMM_B_T 2
+#define CV_GEMM_C_T 4
+/** Extended matrix transform:
+   dst = alpha*op(A)*op(B) + beta*op(C), where op(X) is X or X^T */
+CVAPI(void)  cvGEMM( const CvArr* src1, const CvArr* src2, double alpha,
+                     const CvArr* src3, double beta, CvArr* dst,
+                     int tABC CV_DEFAULT(0));
+#define cvMatMulAddEx cvGEMM
+
+/** Transforms each element of source array and stores
+   resultant vectors in destination array */
+CVAPI(void)  cvTransform( const CvArr* src, CvArr* dst,
+                          const CvMat* transmat,
+                          const CvMat* shiftvec CV_DEFAULT(NULL));
+#define cvMatMulAddS cvTransform
+
+/** Does perspective transform on every element of input array */
+CVAPI(void)  cvPerspectiveTransform( const CvArr* src, CvArr* dst,
+                                     const CvMat* mat );
+
+/** Calculates (A-delta)*(A-delta)^T (order=0) or (A-delta)^T*(A-delta) (order=1) */
+CVAPI(void) cvMulTransposed( const CvArr* src, CvArr* dst, int order,
+                             const CvArr* delta CV_DEFAULT(NULL),
+                             double scale CV_DEFAULT(1.) );
+
+/** Transposes matrix. Square matrices can be transposed in-place */
+CVAPI(void)  cvTranspose( const CvArr* src, CvArr* dst );
+#define cvT cvTranspose
+
+/** Completes the symmetric matrix from the lower (LtoR=0) or from the upper (LtoR!=0) part */
+CVAPI(void)  cvCompleteSymm( CvMat* matrix, int LtoR CV_DEFAULT(0) );
+
+/** Mirror array data around horizontal (flip=0),
+   vertical (flip=1) or both(flip=-1) axes:
+   cvFlip(src) flips images vertically and sequences horizontally (inplace) */
+CVAPI(void)  cvFlip( const CvArr* src, CvArr* dst CV_DEFAULT(NULL),
+                     int flip_mode CV_DEFAULT(0));
+#define cvMirror cvFlip
+
+
+#define CV_SVD_MODIFY_A   1
+#define CV_SVD_U_T        2
+#define CV_SVD_V_T        4
+
+/** Performs Singular Value Decomposition of a matrix */
+CVAPI(void)   cvSVD( CvArr* A, CvArr* W, CvArr* U CV_DEFAULT(NULL),
+                     CvArr* V CV_DEFAULT(NULL), int flags CV_DEFAULT(0));
+
+/** Performs Singular Value Back Substitution (solves A*X = B):
+   flags must be the same as in cvSVD */
+CVAPI(void)   cvSVBkSb( const CvArr* W, const CvArr* U,
+                        const CvArr* V, const CvArr* B,
+                        CvArr* X, int flags );
+
+#define CV_LU  0
+#define CV_SVD 1
+#define CV_SVD_SYM 2
+#define CV_CHOLESKY 3
+#define CV_QR  4
+#define CV_NORMAL 16
+
+/** Inverts matrix */
+CVAPI(double)  cvInvert( const CvArr* src, CvArr* dst,
+                         int method CV_DEFAULT(CV_LU));
+#define cvInv cvInvert
+
+/** Solves linear system (src1)*(dst) = (src2)
+   (returns 0 if src1 is a singular and CV_LU method is used) */
+CVAPI(int)  cvSolve( const CvArr* src1, const CvArr* src2, CvArr* dst,
+                     int method CV_DEFAULT(CV_LU));
+
+/** Calculates determinant of input matrix */
+CVAPI(double) cvDet( const CvArr* mat );
+
+/** Calculates trace of the matrix (sum of elements on the main diagonal) */
+CVAPI(CvScalar) cvTrace( const CvArr* mat );
+
+/** Finds eigen values and vectors of a symmetric matrix */
+CVAPI(void)  cvEigenVV( CvArr* mat, CvArr* evects, CvArr* evals,
+                        double eps CV_DEFAULT(0),
+                        int lowindex CV_DEFAULT(-1),
+                        int highindex CV_DEFAULT(-1));
+
+///* Finds selected eigen values and vectors of a symmetric matrix */
+//CVAPI(void)  cvSelectedEigenVV( CvArr* mat, CvArr* evects, CvArr* evals,
+//                                int lowindex, int highindex );
+
+/** Makes an identity matrix (mat_ij = i == j) */
+CVAPI(void)  cvSetIdentity( CvArr* mat, CvScalar value CV_DEFAULT(cvRealScalar(1)) );
+
+/** Fills matrix with given range of numbers */
+CVAPI(CvArr*)  cvRange( CvArr* mat, double start, double end );
+
+/**   @anchor core_c_CovarFlags
+@name Flags for cvCalcCovarMatrix
+@see cvCalcCovarMatrix
+  @{
+*/
+
+/** flag for cvCalcCovarMatrix, transpose([v1-avg, v2-avg,...]) * [v1-avg,v2-avg,...] */
+#define CV_COVAR_SCRAMBLED 0
+
+/** flag for cvCalcCovarMatrix, [v1-avg, v2-avg,...] * transpose([v1-avg,v2-avg,...]) */
+#define CV_COVAR_NORMAL    1
+
+/** flag for cvCalcCovarMatrix, do not calc average (i.e. mean vector) - use the input vector instead
+   (useful for calculating covariance matrix by parts) */
+#define CV_COVAR_USE_AVG   2
+
+/** flag for cvCalcCovarMatrix, scale the covariance matrix coefficients by number of the vectors */
+#define CV_COVAR_SCALE     4
+
+/** flag for cvCalcCovarMatrix, all the input vectors are stored in a single matrix, as its rows */
+#define CV_COVAR_ROWS      8
+
+/** flag for cvCalcCovarMatrix, all the input vectors are stored in a single matrix, as its columns */
+#define CV_COVAR_COLS     16
+
+/** @} */
+
+/** Calculates covariance matrix for a set of vectors
+@see @ref core_c_CovarFlags "flags"
+*/
+CVAPI(void)  cvCalcCovarMatrix( const CvArr** vects, int count,
+                                CvArr* cov_mat, CvArr* avg, int flags );
+
+#define CV_PCA_DATA_AS_ROW 0
+#define CV_PCA_DATA_AS_COL 1
+#define CV_PCA_USE_AVG 2
+CVAPI(void)  cvCalcPCA( const CvArr* data, CvArr* mean,
+                        CvArr* eigenvals, CvArr* eigenvects, int flags );
+
+CVAPI(void)  cvProjectPCA( const CvArr* data, const CvArr* mean,
+                           const CvArr* eigenvects, CvArr* result );
+
+CVAPI(void)  cvBackProjectPCA( const CvArr* proj, const CvArr* mean,
+                               const CvArr* eigenvects, CvArr* result );
+
+/** Calculates Mahalanobis(weighted) distance */
+CVAPI(double)  cvMahalanobis( const CvArr* vec1, const CvArr* vec2, const CvArr* mat );
+#define cvMahalonobis  cvMahalanobis
+
+/****************************************************************************************\
+*                                    Array Statistics                                    *
+\****************************************************************************************/
+
+/** Finds sum of array elements */
+CVAPI(CvScalar)  cvSum( const CvArr* arr );
+
+/** Calculates number of non-zero pixels */
+CVAPI(int)  cvCountNonZero( const CvArr* arr );
+
+/** Calculates mean value of array elements */
+CVAPI(CvScalar)  cvAvg( const CvArr* arr, const CvArr* mask CV_DEFAULT(NULL) );
+
+/** Calculates mean and standard deviation of pixel values */
+CVAPI(void)  cvAvgSdv( const CvArr* arr, CvScalar* mean, CvScalar* std_dev,
+                       const CvArr* mask CV_DEFAULT(NULL) );
+
+/** Finds global minimum, maximum and their positions */
+CVAPI(void)  cvMinMaxLoc( const CvArr* arr, double* min_val, double* max_val,
+                          CvPoint* min_loc CV_DEFAULT(NULL),
+                          CvPoint* max_loc CV_DEFAULT(NULL),
+                          const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @anchor core_c_NormFlags
+  @name Flags for cvNorm and cvNormalize
+  @{
+*/
+#define CV_C            1
+#define CV_L1           2
+#define CV_L2           4
+#define CV_NORM_MASK    7
+#define CV_RELATIVE     8
+#define CV_DIFF         16
+#define CV_MINMAX       32
+
+#define CV_DIFF_C       (CV_DIFF | CV_C)
+#define CV_DIFF_L1      (CV_DIFF | CV_L1)
+#define CV_DIFF_L2      (CV_DIFF | CV_L2)
+#define CV_RELATIVE_C   (CV_RELATIVE | CV_C)
+#define CV_RELATIVE_L1  (CV_RELATIVE | CV_L1)
+#define CV_RELATIVE_L2  (CV_RELATIVE | CV_L2)
+/** @} */
+
+/** Finds norm, difference norm or relative difference norm for an array (or two arrays)
+@see @ref core_c_NormFlags "flags"
+*/
+CVAPI(double)  cvNorm( const CvArr* arr1, const CvArr* arr2 CV_DEFAULT(NULL),
+                       int norm_type CV_DEFAULT(CV_L2),
+                       const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @see @ref core_c_NormFlags "flags" */
+CVAPI(void)  cvNormalize( const CvArr* src, CvArr* dst,
+                          double a CV_DEFAULT(1.), double b CV_DEFAULT(0.),
+                          int norm_type CV_DEFAULT(CV_L2),
+                          const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @anchor core_c_ReduceFlags
+  @name Flags for cvReduce
+  @{
+*/
+#define CV_REDUCE_SUM 0
+#define CV_REDUCE_AVG 1
+#define CV_REDUCE_MAX 2
+#define CV_REDUCE_MIN 3
+/** @} */
+
+/** @see @ref core_c_ReduceFlags "flags" */
+CVAPI(void)  cvReduce( const CvArr* src, CvArr* dst, int dim CV_DEFAULT(-1),
+                       int op CV_DEFAULT(CV_REDUCE_SUM) );
+
+/****************************************************************************************\
+*                      Discrete Linear Transforms and Related Functions                  *
+\****************************************************************************************/
+
+/** @anchor core_c_DftFlags
+  @name Flags for cvDFT, cvDCT and cvMulSpectrums
+  @{
+  */
+#define CV_DXT_FORWARD  0
+#define CV_DXT_INVERSE  1
+#define CV_DXT_SCALE    2 /**< divide result by size of array */
+#define CV_DXT_INV_SCALE (CV_DXT_INVERSE + CV_DXT_SCALE)
+#define CV_DXT_INVERSE_SCALE CV_DXT_INV_SCALE
+#define CV_DXT_ROWS     4 /**< transform each row individually */
+#define CV_DXT_MUL_CONJ 8 /**< conjugate the second argument of cvMulSpectrums */
+/** @} */
+
+/** Discrete Fourier Transform:
+    complex->complex,
+    real->ccs (forward),
+    ccs->real (inverse)
+@see @ref core_c_DftFlags "flags"
+*/
+CVAPI(void)  cvDFT( const CvArr* src, CvArr* dst, int flags,
+                    int nonzero_rows CV_DEFAULT(0) );
+#define cvFFT cvDFT
+
+/** Multiply results of DFTs: DFT(X)*DFT(Y) or DFT(X)*conj(DFT(Y))
+@see @ref core_c_DftFlags "flags"
+*/
+CVAPI(void)  cvMulSpectrums( const CvArr* src1, const CvArr* src2,
+                             CvArr* dst, int flags );
+
+/** Finds optimal DFT vector size >= size0 */
+CVAPI(int)  cvGetOptimalDFTSize( int size0 );
+
+/** Discrete Cosine Transform
+@see @ref core_c_DftFlags "flags"
+*/
+CVAPI(void)  cvDCT( const CvArr* src, CvArr* dst, int flags );
+
+/****************************************************************************************\
+*                              Dynamic data structures                                   *
+\****************************************************************************************/
+
+/** Calculates length of sequence slice (with support of negative indices). */
+CVAPI(int) cvSliceLength( CvSlice slice, const CvSeq* seq );
+
+
+/** Creates new memory storage.
+   block_size == 0 means that default,
+   somewhat optimal size, is used (currently, it is 64K) */
+CVAPI(CvMemStorage*)  cvCreateMemStorage( int block_size CV_DEFAULT(0));
+
+
+/** Creates a memory storage that will borrow memory blocks from parent storage */
+CVAPI(CvMemStorage*)  cvCreateChildMemStorage( CvMemStorage* parent );
+
+
+/** Releases memory storage. All the children of a parent must be released before
+   the parent. A child storage returns all the blocks to parent when it is released */
+CVAPI(void)  cvReleaseMemStorage( CvMemStorage** storage );
+
+
+/** Clears memory storage. This is the only way(!!!) (besides cvRestoreMemStoragePos)
+   to reuse memory allocated for the storage - cvClearSeq,cvClearSet ...
+   do not free any memory.
+   A child storage returns all the blocks to the parent when it is cleared */
+CVAPI(void)  cvClearMemStorage( CvMemStorage* storage );
+
+/** Remember a storage "free memory" position */
+CVAPI(void)  cvSaveMemStoragePos( const CvMemStorage* storage, CvMemStoragePos* pos );
+
+/** Restore a storage "free memory" position */
+CVAPI(void)  cvRestoreMemStoragePos( CvMemStorage* storage, CvMemStoragePos* pos );
+
+/** Allocates continuous buffer of the specified size in the storage */
+CVAPI(void*) cvMemStorageAlloc( CvMemStorage* storage, size_t size );
+
+/* Allocates string in memory storage (declaration currently disabled) */
+//CVAPI(CvString) cvMemStorageAllocString( CvMemStorage* storage, const char* ptr,
+//                                         int len CV_DEFAULT(-1) );
+
+/** Creates new empty sequence that will reside in the specified storage */
+CVAPI(CvSeq*)  cvCreateSeq( int seq_flags, size_t header_size,
+                            size_t elem_size, CvMemStorage* storage );
+
+/** Changes default size (granularity) of sequence blocks.
+   The default size is ~1Kbyte */
+CVAPI(void)  cvSetSeqBlockSize( CvSeq* seq, int delta_elems );
+
+
+/** Adds new element to the end of sequence. Returns pointer to the element */
+CVAPI(schar*)  cvSeqPush( CvSeq* seq, const void* element CV_DEFAULT(NULL));
+
+
+/** Adds new element to the beginning of sequence. Returns pointer to it */
+CVAPI(schar*)  cvSeqPushFront( CvSeq* seq, const void* element CV_DEFAULT(NULL));
+
+
+/** Removes the last element from sequence and optionally saves it */
+CVAPI(void)  cvSeqPop( CvSeq* seq, void* element CV_DEFAULT(NULL));
+
+
+/** Removes the first element from sequence and optionally saves it */
+CVAPI(void)  cvSeqPopFront( CvSeq* seq, void* element CV_DEFAULT(NULL));
+
+
+#define CV_FRONT 1
+#define CV_BACK 0
+/** Adds several new elements to either end of the sequence (the front if in_front != 0) */
+CVAPI(void)  cvSeqPushMulti( CvSeq* seq, const void* elements,
+                             int count, int in_front CV_DEFAULT(0) );
+
+/** Removes several elements from either end of the sequence and optionally saves them */
+CVAPI(void)  cvSeqPopMulti( CvSeq* seq, void* elements,
+                            int count, int in_front CV_DEFAULT(0) );
+
+/** Inserts a new element in the middle of sequence.
+   cvSeqInsert(seq,0,elem) == cvSeqPushFront(seq,elem) */
+CVAPI(schar*)  cvSeqInsert( CvSeq* seq, int before_index,
+                            const void* element CV_DEFAULT(NULL));
+
+/** Removes specified sequence element */
+CVAPI(void)  cvSeqRemove( CvSeq* seq, int index );
+
+
+/** Removes all the elements from the sequence. The freed memory
+   can be reused later only by the same sequence unless cvClearMemStorage
+   or cvRestoreMemStoragePos is called */
+CVAPI(void)  cvClearSeq( CvSeq* seq );
+
+
+/** Retrieves pointer to specified sequence element.
+   Negative indices are supported and mean counting from the end
+   (e.g -1 means the last sequence element) */
+CVAPI(schar*)  cvGetSeqElem( const CvSeq* seq, int index );
+
+/** Calculates index of the specified sequence element.
+   Returns -1 if element does not belong to the sequence */
+CVAPI(int)  cvSeqElemIdx( const CvSeq* seq, const void* element,
+                         CvSeqBlock** block CV_DEFAULT(NULL) );
+
+/** Initializes sequence writer. The new elements will be added to the end of sequence */
+CVAPI(void)  cvStartAppendToSeq( CvSeq* seq, CvSeqWriter* writer );
+
+
+/** Combination of cvCreateSeq and cvStartAppendToSeq */
+CVAPI(void)  cvStartWriteSeq( int seq_flags, int header_size,
+                              int elem_size, CvMemStorage* storage,
+                              CvSeqWriter* writer );
+
+/** Closes sequence writer, updates sequence header and returns pointer
+   to the resultant sequence
+   (which may be useful if the sequence was created using cvStartWriteSeq)
+*/
+CVAPI(CvSeq*)  cvEndWriteSeq( CvSeqWriter* writer );
+
+
+/** Updates sequence header. May be useful to get access to some of previously
+   written elements via cvGetSeqElem or sequence reader */
+CVAPI(void)   cvFlushSeqWriter( CvSeqWriter* writer );
+
+
+/** Initializes sequence reader.
+   The sequence can be read in forward or backward direction */
+CVAPI(void) cvStartReadSeq( const CvSeq* seq, CvSeqReader* reader,
+                           int reverse CV_DEFAULT(0) );
+
+
+/** Returns current sequence reader position (currently observed sequence element) */
+CVAPI(int)  cvGetSeqReaderPos( CvSeqReader* reader );
+
+
+/** Changes sequence reader position. It may seek to an absolute position or
+   to one relative to the current position */
+CVAPI(void)   cvSetSeqReaderPos( CvSeqReader* reader, int index,
+                                 int is_relative CV_DEFAULT(0));
+
+/** Copies sequence content to a continuous piece of memory */
+CVAPI(void*)  cvCvtSeqToArray( const CvSeq* seq, void* elements,
+                               CvSlice slice CV_DEFAULT(CV_WHOLE_SEQ) );
+
+/** Creates sequence header for array.
+   After that all the operations on sequences that do not alter the content
+   can be applied to the resultant sequence */
+CVAPI(CvSeq*) cvMakeSeqHeaderForArray( int seq_type, int header_size,
+                                       int elem_size, void* elements, int total,
+                                       CvSeq* seq, CvSeqBlock* block );
+
+/** Extracts sequence slice (with or without copying sequence elements) */
+CVAPI(CvSeq*) cvSeqSlice( const CvSeq* seq, CvSlice slice,
+                         CvMemStorage* storage CV_DEFAULT(NULL),
+                         int copy_data CV_DEFAULT(0));
+
+CV_INLINE CvSeq* cvCloneSeq( const CvSeq* seq, CvMemStorage* storage CV_DEFAULT(NULL))
+{
+    return cvSeqSlice( seq, CV_WHOLE_SEQ, storage, 1 ); /* whole-sequence slice with copy_data = 1 */
+}
+
+/** Removes sequence slice */
+CVAPI(void)  cvSeqRemoveSlice( CvSeq* seq, CvSlice slice );
+
+/** Inserts a sequence or array into another sequence */
+CVAPI(void)  cvSeqInsertSlice( CvSeq* seq, int before_index, const CvArr* from_arr );
+
+/** a < b ? -1 : a > b ? 1 : 0 */
+typedef int (CV_CDECL* CvCmpFunc)(const void* a, const void* b, void* userdata );
+
+/** Sorts sequence in-place given element comparison function */
+CVAPI(void) cvSeqSort( CvSeq* seq, CvCmpFunc func, void* userdata CV_DEFAULT(NULL) );
+
+/** Finds element in a [sorted] sequence */
+CVAPI(schar*) cvSeqSearch( CvSeq* seq, const void* elem, CvCmpFunc func,
+                           int is_sorted, int* elem_idx,
+                           void* userdata CV_DEFAULT(NULL) );
+
+/** Reverses order of sequence elements in-place */
+CVAPI(void) cvSeqInvert( CvSeq* seq );
+
+/** Splits sequence into one or more equivalence classes using the specified criteria */
+CVAPI(int)  cvSeqPartition( const CvSeq* seq, CvMemStorage* storage,
+                            CvSeq** labels, CvCmpFunc is_equal, void* userdata );
+
+/************ Internal sequence functions ************/
+CVAPI(void)  cvChangeSeqBlock( void* reader, int direction );
+CVAPI(void)  cvCreateSeqBlock( CvSeqWriter* writer );
+
+
+/** Creates a new set */
+CVAPI(CvSet*)  cvCreateSet( int set_flags, int header_size,
+                            int elem_size, CvMemStorage* storage );
+
+/** Adds new element to the set; returns its index and, optionally, a pointer to it via inserted_elem */
+CVAPI(int)  cvSetAdd( CvSet* set_header, CvSetElem* elem CV_DEFAULT(NULL),
+                      CvSetElem** inserted_elem CV_DEFAULT(NULL) );
+
+/** Fast variant of cvSetAdd: takes the head of the free list when there is one */
+CV_INLINE  CvSetElem* cvSetNew( CvSet* set_header )
+{
+    CvSetElem* node = set_header->free_elems;
+    if( !node )
+    {
+        cvSetAdd( set_header, NULL, &node );   /* empty free list: fall back to cvSetAdd */
+        return node;
+    }
+    set_header->free_elems = node->next_free;  /* unlink the head of the free list */
+    node->flags = node->flags & CV_SET_ELEM_IDX_MASK;
+    set_header->active_count++;
+    return node;
+}
+
+/** Removes a set element given its pointer by pushing it onto the free list */
+CV_INLINE  void cvSetRemoveByPtr( CvSet* set_header, void* elem )
+{
+    CvSetElem* const _elem = (CvSetElem*)elem;
+    assert( _elem->flags >= 0 /*&& (elem->flags & CV_SET_ELEM_IDX_MASK) < set_header->total*/ );
+    _elem->flags = CV_SET_ELEM_FREE_FLAG | (_elem->flags & CV_SET_ELEM_IDX_MASK);
+    _elem->next_free = set_header->free_elems;   /* old free-list head becomes the successor */
+    set_header->free_elems = _elem;
+    set_header->active_count--;
+}
+
+/** Removes element from the set by its index  */
+CVAPI(void)   cvSetRemove( CvSet* set_header, int index );
+
+/** Returns the set element at idx, or NULL if the element doesn't belong to the set */
+CV_INLINE CvSetElem* cvGetSetElem( const CvSet* set_header, int idx )
+{
+    CvSetElem* slot = (CvSetElem*)(void *)cvGetSeqElem( (CvSeq*)set_header, idx );
+    if( !slot ) return 0;
+    return CV_IS_SET_ELEM( slot ) ? slot : 0;
+}
+
+/** Removes all the elements from the set */
+CVAPI(void)  cvClearSet( CvSet* set_header );
+
+/** Creates new graph */
+CVAPI(CvGraph*)  cvCreateGraph( int graph_flags, int header_size,
+                                int vtx_size, int edge_size,
+                                CvMemStorage* storage );
+
+/** Adds new vertex to the graph */
+CVAPI(int)  cvGraphAddVtx( CvGraph* graph, const CvGraphVtx* vtx CV_DEFAULT(NULL),
+                           CvGraphVtx** inserted_vtx CV_DEFAULT(NULL) );
+
+
+/** Removes vertex from the graph together with all incident edges */
+CVAPI(int)  cvGraphRemoveVtx( CvGraph* graph, int index );
+CVAPI(int)  cvGraphRemoveVtxByPtr( CvGraph* graph, CvGraphVtx* vtx );
+
+
+/** Link two vertices specified by indices or pointers if they
+   are not connected or return pointer to already existing edge
+   connecting the vertices.
+   Functions return 1 if a new edge was created, 0 otherwise */
+CVAPI(int)  cvGraphAddEdge( CvGraph* graph,
+                            int start_idx, int end_idx,
+                            const CvGraphEdge* edge CV_DEFAULT(NULL),
+                            CvGraphEdge** inserted_edge CV_DEFAULT(NULL) );
+
+CVAPI(int)  cvGraphAddEdgeByPtr( CvGraph* graph,
+                               CvGraphVtx* start_vtx, CvGraphVtx* end_vtx,
+                               const CvGraphEdge* edge CV_DEFAULT(NULL),
+                               CvGraphEdge** inserted_edge CV_DEFAULT(NULL) );
+
+/** Remove edge connecting two vertices */
+CVAPI(void)  cvGraphRemoveEdge( CvGraph* graph, int start_idx, int end_idx );
+CVAPI(void)  cvGraphRemoveEdgeByPtr( CvGraph* graph, CvGraphVtx* start_vtx,
+                                     CvGraphVtx* end_vtx );
+
+/** Find edge connecting two vertices */
+CVAPI(CvGraphEdge*)  cvFindGraphEdge( const CvGraph* graph, int start_idx, int end_idx );
+CVAPI(CvGraphEdge*)  cvFindGraphEdgeByPtr( const CvGraph* graph,
+                                           const CvGraphVtx* start_vtx,
+                                           const CvGraphVtx* end_vtx );
+#define cvGraphFindEdge cvFindGraphEdge
+#define cvGraphFindEdgeByPtr cvFindGraphEdgeByPtr
+
+/** Remove all vertices and edges from the graph */
+CVAPI(void)  cvClearGraph( CvGraph* graph );
+
+
+/** Count number of edges incident to the vertex */
+CVAPI(int)  cvGraphVtxDegree( const CvGraph* graph, int vtx_idx );
+CVAPI(int)  cvGraphVtxDegreeByPtr( const CvGraph* graph, const CvGraphVtx* vtx );
+
+
+/** Retrieves graph vertex by given index */
+#define cvGetGraphVtx( graph, idx ) (CvGraphVtx*)cvGetSetElem((CvSet*)(graph), (idx))
+
+/** Retrieves index of a graph vertex given its pointer */
+#define cvGraphVtxIdx( graph, vtx ) ((vtx)->flags & CV_SET_ELEM_IDX_MASK)
+
+/** Retrieves index of a graph edge given its pointer */
+#define cvGraphEdgeIdx( graph, edge ) ((edge)->flags & CV_SET_ELEM_IDX_MASK)
+
+#define cvGraphGetVtxCount( graph ) ((graph)->active_count)
+#define cvGraphGetEdgeCount( graph ) ((graph)->edges->active_count)
+
+#define  CV_GRAPH_VERTEX        1
+#define  CV_GRAPH_TREE_EDGE     2
+#define  CV_GRAPH_BACK_EDGE     4
+#define  CV_GRAPH_FORWARD_EDGE  8
+#define  CV_GRAPH_CROSS_EDGE    16
+#define  CV_GRAPH_ANY_EDGE      30
+#define  CV_GRAPH_NEW_TREE      32
+#define  CV_GRAPH_BACKTRACKING  64
+#define  CV_GRAPH_OVER          -1
+
+#define  CV_GRAPH_ALL_ITEMS    -1
+
+/** flags for graph vertices and edges */
+#define  CV_GRAPH_ITEM_VISITED_FLAG  (1 << 30)
+#define  CV_IS_GRAPH_VERTEX_VISITED(vtx) \
+    (((CvGraphVtx*)(vtx))->flags & CV_GRAPH_ITEM_VISITED_FLAG)
+#define  CV_IS_GRAPH_EDGE_VISITED(edge) \
+    (((CvGraphEdge*)(edge))->flags & CV_GRAPH_ITEM_VISITED_FLAG)
+#define  CV_GRAPH_SEARCH_TREE_NODE_FLAG   (1 << 29)
+#define  CV_GRAPH_FORWARD_EDGE_FLAG       (1 << 28)
+
+typedef struct CvGraphScanner /* scan state: made by cvCreateGraphScanner, passed to cvNextGraphItem */
+{
+    CvGraphVtx* vtx;       /**< current graph vertex (or current edge origin) */
+    CvGraphVtx* dst;       /**< current graph edge destination vertex */
+    CvGraphEdge* edge;     /**< current edge */
+
+    CvGraph* graph;        /**< the graph */
+    CvSeq*   stack;        /**< the graph vertex stack */
+    int      index;        /**< the lower bound of certainly visited vertices */
+    int      mask;         /**< event mask */
+}
+CvGraphScanner;
+
+/** Creates new graph scanner. */
+CVAPI(CvGraphScanner*)  cvCreateGraphScanner( CvGraph* graph,
+                                             CvGraphVtx* vtx CV_DEFAULT(NULL),
+                                             int mask CV_DEFAULT(CV_GRAPH_ALL_ITEMS));
+
+/** Releases graph scanner. */
+CVAPI(void) cvReleaseGraphScanner( CvGraphScanner** scanner );
+
+/** Get next graph element */
+CVAPI(int)  cvNextGraphItem( CvGraphScanner* scanner );
+
+/** Creates a copy of graph */
+CVAPI(CvGraph*) cvCloneGraph( const CvGraph* graph, CvMemStorage* storage );
+
+
+/** Does look-up transformation. Elements of the source array
+   (that should be 8uC1 or 8sC1) are used as indexes into the 256-element table lut */
+CVAPI(void) cvLUT( const CvArr* src, CvArr* dst, const CvArr* lut );
+
+
+/******************* Iteration through the sequence tree *****************/
+typedef struct CvTreeNodeIterator /* state object used with cvInitTreeNodeIterator(), cvNextTreeNode() and cvPrevTreeNode() */
+{
+    const void* node;   /* NOTE(review): presumably the node the iterator currently points at - confirm */
+    int level;          /* NOTE(review): presumably the depth of `node` within the tree - confirm */
+    int max_level;      /* NOTE(review): presumably the max_level passed to cvInitTreeNodeIterator() - confirm */
+}
+CvTreeNodeIterator;
+
+CVAPI(void) cvInitTreeNodeIterator( CvTreeNodeIterator* tree_iterator,
+                                   const void* first, int max_level );
+CVAPI(void*) cvNextTreeNode( CvTreeNodeIterator* tree_iterator );
+CVAPI(void*) cvPrevTreeNode( CvTreeNodeIterator* tree_iterator );
+
+/** Inserts sequence into tree with specified "parent" sequence.
+   If parent is equal to frame (e.g. the most external contour),
+   then added contour will have null pointer to parent. */
+CVAPI(void) cvInsertNodeIntoTree( void* node, void* parent, void* frame );
+
+/** Removes contour from tree (together with the contour children). */
+CVAPI(void) cvRemoveNodeFromTree( void* node, void* frame );
+
+/** Gathers pointers to all the sequences,
+   accessible from the `first`, to the single sequence */
+CVAPI(CvSeq*) cvTreeToNodeSeq( const void* first, int header_size,
+                              CvMemStorage* storage );
+
+/** The function implements the K-means algorithm for clustering an array of sample
+   vectors in a specified number of classes */
+#define CV_KMEANS_USE_INITIAL_LABELS    1
+CVAPI(int) cvKMeans2( const CvArr* samples, int cluster_count, CvArr* labels,
+                      CvTermCriteria termcrit, int attempts CV_DEFAULT(1),
+                      CvRNG* rng CV_DEFAULT(0), int flags CV_DEFAULT(0),
+                      CvArr* _centers CV_DEFAULT(0), double* compactness CV_DEFAULT(0) );
+
+/****************************************************************************************\
+*                                    System functions                                    *
+\****************************************************************************************/
+
+/** Loads optimized functions from IPP, MKL etc. or switches back to pure C code */
+CVAPI(int)  cvUseOptimized( int on_off );
+
+typedef IplImage* (CV_STDCALL* Cv_iplCreateImageHeader)
+                            (int,int,int,char*,char*,int,int,int,int,int,
+                            IplROI*,IplImage*,void*,IplTileInfo*);
+typedef void (CV_STDCALL* Cv_iplAllocateImageData)(IplImage*,int,int);
+typedef void (CV_STDCALL* Cv_iplDeallocate)(IplImage*,int);
+typedef IplROI* (CV_STDCALL* Cv_iplCreateROI)(int,int,int,int,int);
+typedef IplImage* (CV_STDCALL* Cv_iplCloneImage)(const IplImage*);
+
+/** @brief Makes OpenCV use IPL functions for allocating IplImage and IplROI structures.
+
+Normally, the function is not called directly. Instead, a simple macro
+CV_TURN_ON_IPL_COMPATIBILITY() is used that calls cvSetIPLAllocators and passes there pointers
+to IPL allocation functions. :
+@code
+    ...
+    CV_TURN_ON_IPL_COMPATIBILITY()
+    ...
+@endcode
+@param create_header pointer to a function, creating IPL image header.
+@param allocate_data pointer to a function, allocating IPL image data.
+@param deallocate pointer to a function, deallocating IPL image.
+@param create_roi pointer to a function, creating IPL image ROI (i.e. Region of Interest).
+@param clone_image pointer to a function, cloning an IPL image.
+ */
+CVAPI(void) cvSetIPLAllocators( Cv_iplCreateImageHeader create_header,
+                               Cv_iplAllocateImageData allocate_data,
+                               Cv_iplDeallocate deallocate,
+                               Cv_iplCreateROI create_roi,
+                               Cv_iplCloneImage clone_image );
+
+#define CV_TURN_ON_IPL_COMPATIBILITY()                                  \
+    cvSetIPLAllocators( iplCreateImageHeader, iplAllocateImage,         \
+                        iplDeallocate, iplCreateROI, iplCloneImage )
+
+/****************************************************************************************\
+*                                    Data Persistence                                    *
+\****************************************************************************************/
+
+#if 0
+/********************************** High-level functions ********************************/
+
+/** @brief Opens file storage for reading or writing data.
+
+The function opens file storage for reading or writing data. In the latter case, a new file is
+created or an existing file is rewritten. The type of the read or written file is determined by the
+filename extension: .xml for XML, .yml or .yaml for YAML and .json for JSON.
+
+At the same time, it also supports adding parameters like "example.xml?base64".
+
+The function returns a pointer to the CvFileStorage structure.
+If the file cannot be opened then the function returns NULL.
+@param filename Name of the file associated with the storage
+@param memstorage Memory storage used for temporary data and for
+:   storing dynamic structures, such as CvSeq or CvGraph . If it is NULL, a temporary memory
+    storage is created and used.
+@param flags Can be one of the following:
+> -   **CV_STORAGE_READ** the storage is open for reading
+> -   **CV_STORAGE_WRITE** the storage is open for writing
+      (use **CV_STORAGE_WRITE | CV_STORAGE_WRITE_BASE64** to write rawdata in Base64)
+@param encoding
+ */
+CVAPI(CvFileStorage*)  cvOpenFileStorage( const char* filename, CvMemStorage* memstorage,
+                                          int flags, const char* encoding CV_DEFAULT(NULL) );
+
+/** @brief Releases file storage.
+
+The function closes the file associated with the storage and releases all the temporary structures.
+It must be called after all I/O operations with the storage are finished.
+@param fs Double pointer to the released file storage
+ */
+CVAPI(void) cvReleaseFileStorage( CvFileStorage** fs );
+
+/** returns attribute value or 0 (NULL) if there is no such attribute */
+CVAPI(const char*) cvAttrValue( const CvAttrList* attr, const char* attr_name );
+
+/** @brief Starts writing a new structure.
+
+The function starts writing a compound structure (collection) that can be a sequence or a map. After
+all the structure fields, which can be scalars or structures, are written, cvEndWriteStruct should
+be called. The function can be used to group some objects or to implement the write function for
+some user object (see CvTypeInfo).
+@param fs File storage
+@param name Name of the written structure. The structure can be accessed by this name when the
+storage is read.
+@param struct_flags A combination of the following values:
+-   **CV_NODE_SEQ** the written structure is a sequence (see discussion of CvFileStorage ),
+    that is, its elements do not have a name.
+-   **CV_NODE_MAP** the written structure is a map (see discussion of CvFileStorage ), that
+    is, all its elements have names.
+One and only one of the two above flags must be specified
+-   **CV_NODE_FLOW** the optional flag that makes sense only for YAML streams. It means that
+     the structure is written as a flow (not as a block), which is more compact. It is
+     recommended to use this flag for structures or arrays whose elements are all scalars.
+@param type_name Optional parameter - the object type name. In
+    case of XML it is written as a type_id attribute of the structure opening tag. In the case of
+    YAML it is written after a colon following the structure name (see the example in
+    CvFileStorage description). In case of JSON it is written as a name/value pair.
+    Mainly it is used with user objects. When the storage is read, the
+    encoded type name is used to determine the object type (see CvTypeInfo and cvFindType ).
+@param attributes This parameter is not used in the current implementation
+ */
+CVAPI(void) cvStartWriteStruct( CvFileStorage* fs, const char* name,
+                                int struct_flags, const char* type_name CV_DEFAULT(NULL),
+                                CvAttrList attributes CV_DEFAULT(cvAttrList()));
+
+/** @brief Finishes writing to a file node collection.
+@param fs File storage
+@sa cvStartWriteStruct.
+ */
+CVAPI(void) cvEndWriteStruct( CvFileStorage* fs );
+
+/** @brief Writes an integer value.
+
+The function writes a single integer value (with or without a name) to the file storage.
+@param fs File storage
+@param name Name of the written value. Should be NULL if and only if the parent structure is a
+sequence.
+@param value The written value
+ */
+CVAPI(void) cvWriteInt( CvFileStorage* fs, const char* name, int value );
+
+/** @brief Writes a floating-point value.
+
+The function writes a single floating-point value (with or without a name) to file storage. Special
+values are encoded as follows: NaN (Not A Number) as .NaN, infinity as +.Inf or -.Inf.
+
+The following example shows how to use the low-level writing functions to store custom structures,
+such as termination criteria, without registering a new type. :
+@code
+    void write_termcriteria( CvFileStorage* fs, const char* struct_name,
+                             CvTermCriteria* termcrit )
+    {
+        cvStartWriteStruct( fs, struct_name, CV_NODE_MAP, NULL, cvAttrList(0,0));
+        cvWriteComment( fs, "termination criteria", 1 ); // just a description
+        if( termcrit->type & CV_TERMCRIT_ITER )
+            cvWriteInt( fs, "max_iterations", termcrit->max_iter );
+        if( termcrit->type & CV_TERMCRIT_EPS )
+            cvWriteReal( fs, "accuracy", termcrit->epsilon );
+        cvEndWriteStruct( fs );
+    }
+@endcode
+@param fs File storage
+@param name Name of the written value. Should be NULL if and only if the parent structure is a
+sequence.
+@param value The written value
+*/
+CVAPI(void) cvWriteReal( CvFileStorage* fs, const char* name, double value );
+
+/** @brief Writes a text string.
+
+The function writes a text string to file storage.
+@param fs File storage
+@param name Name of the written string . Should be NULL if and only if the parent structure is a
+sequence.
+@param str The written text string
+@param quote If non-zero, the written string is put in quotes, regardless of whether they are
+required. Otherwise, if the flag is zero, quotes are used only when they are required (e.g. when
+the string starts with a digit or contains spaces).
+ */
+CVAPI(void) cvWriteString( CvFileStorage* fs, const char* name,
+                           const char* str, int quote CV_DEFAULT(0) );
+
+/** @brief Writes a comment.
+
+The function writes a comment into file storage. The comments are skipped when the storage is read.
+@param fs File storage
+@param comment The written comment, single-line or multi-line
+@param eol_comment If non-zero, the function tries to put the comment at the end of current line.
+If the flag is zero, if the comment is multi-line, or if it does not fit at the end of the current
+line, the comment starts a new line.
+ */
+CVAPI(void) cvWriteComment( CvFileStorage* fs, const char* comment,
+                            int eol_comment );
+
+/** @brief Writes an object to file storage.
+
+The function writes an object to file storage. First, the appropriate type info is found using
+cvTypeOf. Then, the write method associated with the type info is called.
+
+Attributes are used to customize the writing procedure. The standard types support the following
+attributes (all the dt attributes have the same format as in cvWriteRawData):
+
+-# CvSeq
+    -   **header_dt** description of user fields of the sequence header that follow CvSeq, or
+        CvChain (if the sequence is a Freeman chain) or CvContour (if the sequence is a contour or
+        point sequence)
+    -   **dt** description of the sequence elements.
+    -   **recursive** if the attribute is present and is not equal to "0" or "false", the whole
+        tree of sequences (contours) is stored.
+-# CvGraph
+    -   **header_dt** description of user fields of the graph header that follows CvGraph;
+    -   **vertex_dt** description of user fields of graph vertices
+    -   **edge_dt** description of user fields of graph edges (note that the edge weight is
+        always written, so there is no need to specify it explicitly)
+
+Below is the code that creates the YAML file shown in the CvFileStorage description:
+@code
+    #include "cxcore.h"
+
+    int main( int argc, char** argv )
+    {
+        CvMat* mat = cvCreateMat( 3, 3, CV_32F );
+        CvFileStorage* fs = cvOpenFileStorage( "example.yml", 0, CV_STORAGE_WRITE );
+
+        cvSetIdentity( mat );
+        cvWrite( fs, "A", mat, cvAttrList(0,0) );
+
+        cvReleaseFileStorage( &fs );
+        cvReleaseMat( &mat );
+        return 0;
+    }
+@endcode
+@param fs File storage
+@param name Name of the written object. Should be NULL if and only if the parent structure is a
+sequence.
+@param ptr Pointer to the object
+@param attributes The attributes of the object. They are specific for each particular type (see
+the discussion above).
+ */
+CVAPI(void) cvWrite( CvFileStorage* fs, const char* name, const void* ptr,
+                         CvAttrList attributes CV_DEFAULT(cvAttrList()));
+
+/** @brief Starts the next stream.
+
+The function finishes the currently written stream and starts the next stream. In the case of XML
+the file with multiple streams looks like this:
+@code{.xml}
+    <opencv_storage>
+    <!-- stream #1 data -->
+    </opencv_storage>
+    <opencv_storage>
+    <!-- stream #2 data -->
+    </opencv_storage>
+    ...
+@endcode
+The YAML file will look like this:
+@code{.yaml}
+    %YAML 1.0
+    # stream #1 data
+    ...
+    ---
+    # stream #2 data
+@endcode
+This is useful for concatenating files or for resuming the writing process.
+@param fs File storage
+ */
+CVAPI(void) cvStartNextStream( CvFileStorage* fs );
+
+/** @brief Writes multiple numbers.
+
+The function writes an array, whose elements consist of single or multiple numbers. The function
+call can be replaced with a loop containing a few cvWriteInt and cvWriteReal calls, but a single
+call is more efficient. Note that because none of the elements have a name, they should be written
+to a sequence rather than a map.
+@param fs File storage
+@param src Pointer to the written array
+@param len Number of the array elements to write
+@param dt Specification of each array element, see @ref format_spec "format specification"
+ */
+CVAPI(void) cvWriteRawData( CvFileStorage* fs, const void* src,
+                                int len, const char* dt );
+
+/** @brief Writes multiple numbers in Base64.
+
+If either CV_STORAGE_WRITE_BASE64 or cv::FileStorage::WRITE_BASE64 is used,
+this function will be the same as cvWriteRawData. If neither, the main
+difference is that it outputs a sequence in Base64 encoding rather than
+in plain text.
+
+This function can only be used to write a sequence with a type "binary".
+
+@param fs File storage
+@param src Pointer to the written array
+@param len Number of the array elements to write
+@param dt Specification of each array element, see @ref format_spec "format specification"
+*/
+CVAPI(void) cvWriteRawDataBase64( CvFileStorage* fs, const void* src,
+                                 int len, const char* dt );
+
+/** @brief Returns a unique pointer for a given name.
+
+The function returns a unique pointer for each particular file node name. This pointer can be then
+passed to the cvGetFileNode function that is faster than cvGetFileNodeByName because it compares
+text strings by comparing pointers rather than the strings' content.
+
+Consider the following example where an array of points is encoded as a sequence of 2-entry maps:
+@code
+    points:
+      - { x: 10, y: 10 }
+      - { x: 20, y: 20 }
+      - { x: 30, y: 30 }
+      # ...
+@endcode
+Then, it is possible to get hashed "x" and "y" pointers to speed up decoding of the points. :
+@code
+    #include "cxcore.h"
+
+    int main( int argc, char** argv )
+    {
+        CvFileStorage* fs = cvOpenFileStorage( "points.yml", 0, CV_STORAGE_READ );
+        CvStringHashNode* x_key = cvGetHashedKey( fs, "x", -1, 1 );
+        CvStringHashNode* y_key = cvGetHashedKey( fs, "y", -1, 1 );
+        CvFileNode* points = cvGetFileNodeByName( fs, 0, "points" );
+
+        if( CV_NODE_IS_SEQ(points->tag) )
+        {
+            CvSeq* seq = points->data.seq;
+            int i, total = seq->total;
+            CvSeqReader reader;
+            cvStartReadSeq( seq, &reader, 0 );
+            for( i = 0; i < total; i++ )
+            {
+                CvFileNode* pt = (CvFileNode*)reader.ptr;
+    #if 1 // faster variant
+                CvFileNode* xnode = cvGetFileNode( fs, pt, x_key, 0 );
+                CvFileNode* ynode = cvGetFileNode( fs, pt, y_key, 0 );
+                assert( xnode && CV_NODE_IS_INT(xnode->tag) &&
+                        ynode && CV_NODE_IS_INT(ynode->tag));
+                int x = xnode->data.i; // or x = cvReadInt( xnode, 0 );
+                int y = ynode->data.i; // or y = cvReadInt( ynode, 0 );
+    #elif 1 // slower variant; does not use x_key & y_key
+                CvFileNode* xnode = cvGetFileNodeByName( fs, pt, "x" );
+                CvFileNode* ynode = cvGetFileNodeByName( fs, pt, "y" );
+                assert( xnode && CV_NODE_IS_INT(xnode->tag) &&
+                        ynode && CV_NODE_IS_INT(ynode->tag));
+                int x = xnode->data.i; // or x = cvReadInt( xnode, 0 );
+                int y = ynode->data.i; // or y = cvReadInt( ynode, 0 );
+    #else // the slowest yet the easiest to use variant
+                int x = cvReadIntByName( fs, pt, "x", 0 );
+                int y = cvReadIntByName( fs, pt, "y", 0 );
+    #endif
+                CV_NEXT_SEQ_ELEM( seq->elem_size, reader );
+                printf("point %d: (%d, %d)\n", i, x, y );
+            }
+        }
+        cvReleaseFileStorage( &fs );
+        return 0;
+    }
+@endcode
+Please note that whatever method of accessing a map you are using, it is still much slower than
+using plain sequences; for example, in the above example, it is more efficient to encode the points
+as pairs of integers in a single numeric sequence.
+@param fs File storage
+@param name Literal node name
+@param len Length of the name (if it is known a priori), or -1 if it needs to be calculated
+@param create_missing Flag that specifies, whether an absent key should be added into the hash table
+*/
+CVAPI(CvStringHashNode*) cvGetHashedKey( CvFileStorage* fs, const char* name,
+                                        int len CV_DEFAULT(-1),
+                                        int create_missing CV_DEFAULT(0));
+
+/** @brief Retrieves one of the top-level nodes of the file storage.
+
+The function returns one of the top-level file nodes. The top-level nodes do not have a name, they
+correspond to the streams that are stored one after another in the file storage. If the index is out
+of range, the function returns a NULL pointer, so all the top-level nodes can be iterated by
+subsequent calls to the function with stream_index=0,1,..., until the NULL pointer is returned.
+This function can be used as a base for recursive traversal of the file storage.
+@param fs File storage
+@param stream_index Zero-based index of the stream. See cvStartNextStream . In most cases,
+there is only one stream in the file; however, there can be several.
+ */
+CVAPI(CvFileNode*) cvGetRootFileNode( const CvFileStorage* fs,
+                                     int stream_index CV_DEFAULT(0) );
+
+/** @brief Finds a node in a map or file storage.
+
+The function finds a file node. It is a faster version of cvGetFileNodeByName (see
+cvGetHashedKey discussion). Also, the function can insert a new node, if it is not in the map yet.
+@param fs File storage
+@param map The parent map. If it is NULL, the function searches a top-level node. If both map and
+key are NULLs, the function returns the root file node - a map that contains top-level nodes.
+@param key Unique pointer to the node name, retrieved with cvGetHashedKey
+@param create_missing Flag that specifies whether an absent node should be added to the map
+ */
+CVAPI(CvFileNode*) cvGetFileNode( CvFileStorage* fs, CvFileNode* map,
+                                 const CvStringHashNode* key,
+                                 int create_missing CV_DEFAULT(0) );
+
+/** @brief Finds a node in a map or file storage.
+
+The function finds a file node by name. The node is searched either in map or, if the pointer is
+NULL, among the top-level file storage nodes. Using this function for maps and cvGetSeqElem (or
+sequence reader) for sequences, it is possible to navigate through the file storage. To speed up
+multiple queries for a certain key (e.g., in the case of an array of structures) one may use a
+combination of cvGetHashedKey and cvGetFileNode.
+@param fs File storage
+@param map The parent map. If it is NULL, the function searches in all the top-level nodes
+(streams), starting with the first one.
+@param name The file node name
+ */
+CVAPI(CvFileNode*) cvGetFileNodeByName( const CvFileStorage* fs,
+                                       const CvFileNode* map,
+                                       const char* name );
+
+/** @brief Retrieves an integer value from a file node.
+
+The function returns an integer that is represented by the file node. If the file node is NULL, the
+default_value is returned (thus, it is convenient to call the function right after cvGetFileNode
+without checking for a NULL pointer). If the file node has type CV_NODE_INT, then node-\>data.i is
+returned. If the file node has type CV_NODE_REAL, then node-\>data.f is converted to an integer
+and returned. Otherwise 0x7fffffff is returned.
+@param node File node
+@param default_value The value that is returned if node is NULL
+ */
+CV_INLINE int cvReadInt( const CvFileNode* node, int default_value CV_DEFAULT(0) )
+{
+    if( !node ) return default_value;                     /* no node: fall back to the default */
+    if( CV_NODE_IS_INT(node->tag) ) return node->data.i;  /* integer node: value as stored */
+    return CV_NODE_IS_REAL(node->tag) ? cvRound(node->data.f) : 0x7fffffff; /* real: rounded; else 0x7fffffff */
+}
+
+/** @brief Finds a file node and returns its value.
+
+The function is a simple superposition of cvGetFileNodeByName and cvReadInt.
+@param fs File storage
+@param map The parent map. If it is NULL, the function searches a top-level node.
+@param name The node name
+@param default_value The value that is returned if the file node is not found
+ */
+CV_INLINE int cvReadIntByName( const CvFileStorage* fs, const CvFileNode* map,
+                         const char* name, int default_value CV_DEFAULT(0) )
+{   /* cvReadInt() yields default_value when it is handed a NULL node */
+    return cvReadInt( cvGetFileNodeByName( fs, map, name ), default_value );
+}
+
+/** @brief Retrieves a floating-point value from a file node.
+
+The function returns a floating-point value that is represented by the file node. If the file node
+is NULL, the default_value is returned (thus, it is convenient to call the function right after
+cvGetFileNode without checking for a NULL pointer). If the file node has type CV_NODE_REAL ,
+then node-\>data.f is returned. If the file node has type CV_NODE_INT , then node-\>data.i
+is converted to floating-point and returned. Otherwise the result is not determined.
+@param node File node
+@param default_value The value that is returned if node is NULL
+ */
+CV_INLINE double cvReadReal( const CvFileNode* node, double default_value CV_DEFAULT(0.) )
+{
+    if( !node ) return default_value;                             /* no node: fall back to the default */
+    if( CV_NODE_IS_INT(node->tag) ) return (double)node->data.i;  /* integer node: widened to double */
+    return CV_NODE_IS_REAL(node->tag) ? node->data.f : 1e300;     /* real as stored; else 1e300 */
+}
+
+/** @brief Finds a file node and returns its value.
+
+The function is a simple superposition of cvGetFileNodeByName and cvReadReal .
+@param fs File storage
+@param map The parent map. If it is NULL, the function searches a top-level node.
+@param name The node name
+@param default_value The value that is returned if the file node is not found
+ */
+CV_INLINE double cvReadRealByName( const CvFileStorage* fs, const CvFileNode* map,
+                        const char* name, double default_value CV_DEFAULT(0.) )
+{   /* cvReadReal() yields default_value when it is handed a NULL node */
+    return cvReadReal( cvGetFileNodeByName( fs, map, name ), default_value );
+}
+
+/** @brief Retrieves a text string from a file node.
+
+The function returns a text string that is represented by the file node. If the file node is NULL,
+the default_value is returned (thus, it is convenient to call the function right after
+cvGetFileNode without checking for a NULL pointer). If the file node has type CV_NODE_STR , then
+node-\>data.str.ptr is returned. Otherwise the result is not determined.
+@param node File node
+@param default_value The value that is returned if node is NULL
+ */
+CV_INLINE const char* cvReadString( const CvFileNode* node, const char* default_value CV_DEFAULT(NULL) )
+{
+    if( !node ) return default_value;   /* no node: fall back to the default */
+    return CV_NODE_IS_STRING(node->tag) ? node->data.str.ptr : 0;   /* NULL for non-string nodes */
+}
+
+/** @brief Finds a file node by its name and returns its value.
+
+The function is a simple superposition of cvGetFileNodeByName and cvReadString .
+@param fs File storage
+@param map The parent map. If it is NULL, the function searches a top-level node.
+@param name The node name
+@param default_value The value that is returned if the file node is not found
+ */
+CV_INLINE const char* cvReadStringByName( const CvFileStorage* fs, const CvFileNode* map,
+                        const char* name, const char* default_value CV_DEFAULT(NULL) )
+{   /* cvReadString() yields default_value when it is handed a NULL node */
+    return cvReadString( cvGetFileNodeByName( fs, map, name ), default_value );
+}
+
+
+/** @brief Decodes an object and returns a pointer to it.
+
+The function decodes a user object (creates an object in a native representation from the file
+storage subtree) and returns it. The object to be decoded must be an instance of a registered type
+that supports the read method (see CvTypeInfo). The type of the object is determined by the type
+name that is encoded in the file. If the object is a dynamic structure, it is created either in
+memory storage and passed to cvOpenFileStorage or, if a NULL pointer was passed, in temporary
+memory storage, which is released when cvReleaseFileStorage is called. Otherwise, if the object is
+not a dynamic structure, it is created in a heap and should be released with a specialized function
+or by using the generic cvRelease.
+@param fs File storage
+@param node The root object node
+@param attributes Unused parameter
+ */
+CVAPI(void*) cvRead( CvFileStorage* fs, CvFileNode* node,
+                        CvAttrList* attributes CV_DEFAULT(NULL));
+
+/** @brief Finds an object by name and decodes it.
+
+The function is a simple superposition of cvGetFileNodeByName and cvRead.
+@param fs File storage
+@param map The parent map. If it is NULL, the function searches a top-level node.
+@param name The node name
+@param attributes Unused parameter
+ */
+CV_INLINE void* cvReadByName( CvFileStorage* fs, const CvFileNode* map,
+                              const char* name, CvAttrList* attributes CV_DEFAULT(NULL) )
+{   /* looks the node up by name in map and passes the result straight to cvRead() */
+    return cvRead( fs, cvGetFileNodeByName( fs, map, name ), attributes );
+}
+
+
+/** @brief Initializes the file node sequence reader.
+
+The function initializes the sequence reader to read data from a file node. The initialized reader
+can be then passed to cvReadRawDataSlice.
+@param fs File storage
+@param src The file node (a sequence) to read numbers from
+@param reader Pointer to the sequence reader
+ */
+CVAPI(void) cvStartReadRawData( const CvFileStorage* fs, const CvFileNode* src,
+                               CvSeqReader* reader );
+
+/** @brief Reads elements from a file node sequence through a sequence reader.
+
+The function reads one or more elements from the file node, representing a sequence, to a
+user-specified array. The total number of read sequence elements is a product of total and the
+number of components in each array element. For example, if dt=2if, the function will read total\*3
+sequence elements. As with any sequence, some parts of the file node sequence can be skipped or read
+repeatedly by repositioning the reader using cvSetSeqReaderPos.
+@param fs File storage
+@param reader The sequence reader. Initialize it with cvStartReadRawData .
+@param count The number of elements to read
+@param dst Pointer to the destination array
+@param dt Specification of each array element. It has the same format as in cvWriteRawData .
+ */
+CVAPI(void) cvReadRawDataSlice( const CvFileStorage* fs, CvSeqReader* reader,
+                               int count, void* dst, const char* dt );
+
+/** @brief Reads multiple numbers.
+
+The function reads elements from a file node that represents a sequence of scalars.
+@param fs File storage
+@param src The file node (a sequence) to read numbers from
+@param dst Pointer to the destination array
+@param dt Specification of each array element. It has the same format as in cvWriteRawData .
+ */
+CVAPI(void) cvReadRawData( const CvFileStorage* fs, const CvFileNode* src,
+                          void* dst, const char* dt );
+
+/** @brief Writes a file node to another file storage.
+
+The function writes a copy of a file node to file storage. Possible applications of the function are
+merging several file storages into one and conversion between XML, YAML and JSON formats.
+@param fs Destination file storage
+@param new_node_name New name of the file node in the destination file storage. To keep the
+existing name, use cvGetFileNodeName
+@param node The written node
+@param embed If the written node is a collection and this parameter is not zero, no extra level of
+hierarchy is created. Instead, all the elements of node are written into the currently written
+structure. Of course, map elements can only be embedded into another map, and sequence elements
+can only be embedded into another sequence.
+ */
+CVAPI(void) cvWriteFileNode( CvFileStorage* fs, const char* new_node_name,
+                            const CvFileNode* node, int embed );
+
+/** @brief Returns the name of a file node.
+
+The function returns the name of a file node or NULL, if the file node does not have a name or if
+node is NULL.
+@param node File node
+ */
+CVAPI(const char*) cvGetFileNodeName( const CvFileNode* node );
+
+/*********************************** Adding own types ***********************************/
+
+/** @brief Registers a new type.
+
+The function registers a new type, which is described by info . The function creates a copy of the
+structure, so the user should delete it after calling the function.
+@param info Type info structure
+ */
+CVAPI(void) cvRegisterType( const CvTypeInfo* info );
+
+/** @brief Unregisters the type.
+
+The function unregisters a type with a specified name. If the name is unknown, it is possible to
+locate the type info by an instance of the type using cvTypeOf or by iterating the type list,
+starting from cvFirstType, and then calling cvUnregisterType(info-\>typeName).
+@param type_name Name of the type to unregister
+ */
+CVAPI(void) cvUnregisterType( const char* type_name );
+
+/** @brief Returns the beginning of a type list.
+
+The function returns the first type in the list of registered types. Navigation through the list can
+be done via the prev and next fields of the CvTypeInfo structure.
+ */
+CVAPI(CvTypeInfo*) cvFirstType(void);
+
+/** @brief Finds a type by its name.
+
+The function finds a registered type by its name. It returns NULL if there is no type with the
+specified name.
+@param type_name Type name
+ */
+CVAPI(CvTypeInfo*) cvFindType( const char* type_name );
+
+/** @brief Returns the type of an object.
+
+The function finds the type of a given object. It iterates through the list of registered types and
+calls the is_instance function/method for every type info structure with that object until one of
+them returns non-zero or until the whole list has been traversed. In the latter case, the function
+returns NULL.
+@param struct_ptr The object pointer
+ */
+CVAPI(CvTypeInfo*) cvTypeOf( const void* struct_ptr );
+
+#endif
+
+/** @brief Releases an object.
+
+ The function finds the type of a given object and calls release with the double pointer.
+ @param struct_ptr Double pointer to the object
+ */
+CVAPI(void) cvRelease( void** struct_ptr );
+
+/** @brief Makes a clone of an object.
+
+The function finds the type of a given object and calls clone with the passed object. Of course, if
+you know the object type, for example, struct_ptr is CvMat\*, it is faster to call the specific
+function, like cvCloneMat.
+@param struct_ptr The object to clone
+ */
+CVAPI(void*) cvClone( const void* struct_ptr );
+
+/*********************************** Measuring Execution Time ***************************/
+
+/** helper functions for RNG initialization and accurate time measurement:
+   uses internal clock counter on x86 */
+CVAPI(int64)  cvGetTickCount( void );
+CVAPI(double) cvGetTickFrequency( void );
+
+/*********************************** CPU capabilities ***********************************/
+
+CVAPI(int) cvCheckHardwareSupport(int feature);
+
+/*********************************** Multi-Threading ************************************/
+
+/** retrieve/set the number of threads used in OpenMP implementations */
+CVAPI(int)  cvGetNumThreads( void );
+CVAPI(void) cvSetNumThreads( int threads CV_DEFAULT(0) );
+/** get index of the thread being executed */
+CVAPI(int)  cvGetThreadNum( void );
+
+
+/********************************** Error Handling **************************************/
+
+/** Get current OpenCV error status */
+CVAPI(int) cvGetErrStatus( void );
+
+/** Sets error status silently */
+CVAPI(void) cvSetErrStatus( int status );
+
+#define CV_ErrModeLeaf     0   /* Print error and exit program */
+#define CV_ErrModeParent   1   /* Print error and continue */
+#define CV_ErrModeSilent   2   /* Don't print and continue */
+
+/** Retrieves current error processing mode */
+CVAPI(int)  cvGetErrMode( void );
+
+/** Sets error processing mode, returns previously used mode */
+CVAPI(int) cvSetErrMode( int mode );
+
+/** Sets error status and performs some additional actions (displaying message box,
+ writing message to stderr, terminating application etc.)
+ depending on the current error mode */
+CVAPI(void) cvError( int status, const char* func_name,
+                    const char* err_msg, const char* file_name, int line );
+
+/** Retrieves textual description of the error given its code */
+CVAPI(const char*) cvErrorStr( int status );
+
+/** Retrieves detailed information about the last error that occurred */
+CVAPI(int) cvGetErrInfo( const char** errcode_desc, const char** description,
+                        const char** filename, int* line );
+
+/** Maps IPP error codes to the counterparts from OpenCV */
+CVAPI(int) cvErrorFromIppStatus( int ipp_status );
+
+typedef int (CV_CDECL *CvErrorCallback)( int status, const char* func_name,
+                                        const char* err_msg, const char* file_name, int line, void* userdata );
+
+/** Assigns a new error-handling function */
+CVAPI(CvErrorCallback) cvRedirectError( CvErrorCallback error_handler,
+                                       void* userdata CV_DEFAULT(NULL),
+                                       void** prev_userdata CV_DEFAULT(NULL) );
+
+/** Output nothing */
+CVAPI(int) cvNulDevReport( int status, const char* func_name, const char* err_msg,
+                          const char* file_name, int line, void* userdata );
+
+/** Output to console(fprintf(stderr,...)) */
+CVAPI(int) cvStdErrReport( int status, const char* func_name, const char* err_msg,
+                          const char* file_name, int line, void* userdata );
+
+/** Output to MessageBox(WIN32) */
+CVAPI(int) cvGuiBoxReport( int status, const char* func_name, const char* err_msg,
+                          const char* file_name, int line, void* userdata );
+
+#define OPENCV_ERROR(status,func,context)                           \
+cvError((status),(func),(context),__FILE__,__LINE__)
+
+#define OPENCV_ASSERT(expr,func,context)                            \
+{if (! (expr))                                      \
+{OPENCV_ERROR(CV_StsInternal,(func),(context));}}
+
+#define OPENCV_CALL( Func )                                         \
+{                                                                   \
+Func;                                                           \
+}
+
+
+/** CV_FUNCNAME macro defines the cvFuncName constant, which is used by the CV_ERROR macro */
+#ifdef CV_NO_FUNC_NAMES
+#define CV_FUNCNAME( Name )
+#define cvFuncName ""
+#else
+#define CV_FUNCNAME( Name )  \
+static char cvFuncName[] = Name
+#endif
+
+
+/**
+ CV_ERROR macro unconditionally raises error with passed code and message.
+ After raising error, control will be transferred to the exit label.
+ */
+#define CV_ERROR( Code, Msg )                                       \
+{                                                                   \
+    cvError( (Code), cvFuncName, Msg, __FILE__, __LINE__ );        \
+    __CV_EXIT__;                                                   \
+}
+
+/**
+ CV_CHECK macro checks error status after CV (or IPL)
+ function call. If error detected, control will be transferred to the exit
+ label.
+ */
+#define CV_CHECK()                                                  \
+{                                                                   \
+    if( cvGetErrStatus() < 0 )                                      \
+        CV_ERROR( CV_StsBackTrace, "Inner function failed." );      \
+}
+
+
+/**
+ CV_CALL macro calls CV (or IPL) function, checks error status and
+ signals an error if the function failed. Useful in "parent node"
+ error processing mode
+ */
+#define CV_CALL( Func )                                             \
+{                                                                   \
+    Func;                                                           \
+    CV_CHECK();                                                     \
+}
+
+
+/** Runtime assertion macro */
+#define CV_ASSERT( Condition )                                          \
+{                                                                       \
+    if( !(Condition) )                                                  \
+        CV_ERROR( CV_StsInternal, "Assertion: " #Condition " failed" ); \
+}
+
+#define __CV_BEGIN__       {
+#define __CV_END__         goto exit; exit: ; }
+#define __CV_EXIT__        goto exit
+
+/** @} core_c */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#ifdef __cplusplus
+
+#include "opencv2/core/utility.hpp"
+
+namespace cv
+{
+
+//! @addtogroup core_c_glue
+//! @{
+
+/////////////////////////////////////////// glue ///////////////////////////////////////////
+
+//! converts array (CvMat or IplImage) to cv::Mat
+CV_EXPORTS Mat cvarrToMat(const CvArr* arr, bool copyData=false,
+                          bool allowND=true, int coiMode=0,
+                          AutoBuffer<double>* buf=0);
+
+static inline Mat cvarrToMatND(const CvArr* arr, bool copyData=false, int coiMode=0)
+{   // thin forwarder to cvarrToMat() with allowND fixed to true and the default buf
+    return cvarrToMat(arr, copyData, /* allowND = */ true, coiMode);
+}
+
+
+//! extracts Channel of Interest from CvMat or IplImage and makes cv::Mat out of it.
+CV_EXPORTS void extractImageCOI(const CvArr* arr, OutputArray coiimg, int coi=-1);
+//! inserts single-channel cv::Mat into a multi-channel CvMat or IplImage
+CV_EXPORTS void insertImageCOI(InputArray coiimg, CvArr* arr, int coi=-1);
+
+
+
+////// specialized implementations of DefaultDeleter::operator() for classic OpenCV types //////
+
+template<> struct DefaultDeleter<CvMat>{ CV_EXPORTS void operator ()(CvMat* obj) const; };
+template<> struct DefaultDeleter<IplImage>{ CV_EXPORTS void operator ()(IplImage* obj) const; };
+template<> struct DefaultDeleter<CvMatND>{ CV_EXPORTS void operator ()(CvMatND* obj) const; };
+template<> struct DefaultDeleter<CvSparseMat>{ CV_EXPORTS void operator ()(CvSparseMat* obj) const; };
+template<> struct DefaultDeleter<CvMemStorage>{ CV_EXPORTS void operator ()(CvMemStorage* obj) const; };
+
+////////////// convenient wrappers for operating old-style dynamic structures //////////////
+
+template<typename _Tp> class SeqIterator;
+
+typedef Ptr<CvMemStorage> MemStorage;
+
+/*!
+ Template Sequence Class derived from CvSeq
+
+ The class provides more convenient access to sequence elements,
+ STL-style operations and iterators.
+
+ \note The class is targeted for simple data types,
+    i.e. no constructors or destructors
+    are called for the sequence elements.
+*/
+template<typename _Tp> class Seq
+{
+public:
+    typedef SeqIterator<_Tp> iterator;
+    typedef SeqIterator<_Tp> const_iterator;
+
+    //! the default constructor; leaves seq null
+    Seq();
+    //! wraps an existing CvSeq; a null pointer is accepted, otherwise seq->elem_size must equal sizeof(_Tp) (asserted)
+    Seq(const CvSeq* seq);
+    //! creates an empty sequence that resides in the specified storage; headerSize must be at least sizeof(CvSeq) (asserted)
+    Seq(MemStorage& storage, int headerSize = sizeof(CvSeq));
+    //! returns read-write reference to the specified element
+    _Tp& operator [](int idx);
+    //! returns read-only reference to the specified element
+    const _Tp& operator[](int idx) const;
+    //! returns iterator pointing to the beginning of the sequence
+    SeqIterator<_Tp> begin() const;
+    //! returns iterator pointing to the element following the last sequence element
+    SeqIterator<_Tp> end() const;
+    //! returns the number of elements in the sequence (0 when seq is null)
+    size_t size() const;
+    //! returns the type of sequence elements (CV_8UC1 ... CV_64FC(CV_CN_MAX) ...), or 0 when seq is null
+    int type() const;
+    //! returns the depth of sequence elements (CV_8U ... CV_64F), or 0 when seq is null
+    int depth() const;
+    //! returns the number of channels in each sequence element, or 0 when seq is null
+    int channels() const;
+    //! returns the size of each sequence element, or 0 when seq is null
+    size_t elemSize() const;
+    //! returns the index of the specified sequence element, as reported by cvSeqElemIdx
+    size_t index(const _Tp& elem) const;
+    //! appends the specified element to the end of the sequence
+    void push_back(const _Tp& elem);
+    //! prepends the specified element at the front of the sequence
+    void push_front(const _Tp& elem);
+    //! appends zero or more elements to the end of the sequence
+    void push_back(const _Tp* elems, size_t count);
+    //! prepends zero or more elements at the front of the sequence
+    void push_front(const _Tp* elems, size_t count);
+    //! inserts the specified element at the specified position
+    void insert(int idx, const _Tp& elem);
+    //! inserts zero or more elements at the specified position
+    void insert(int idx, const _Tp* elems, size_t count);
+    //! removes the element at the specified position
+    void remove(int idx);
+    //! removes the subsequence described by the slice (r.start, r.end)
+    void remove(const Range& r);
+
+    //! returns reference to the first sequence element
+    _Tp& front();
+    //! returns read-only reference to the first sequence element
+    const _Tp& front() const;
+    //! returns reference to the last sequence element
+    _Tp& back();
+    //! returns read-only reference to the last sequence element
+    const _Tp& back() const;
+    //! returns true iff seq is null or the sequence contains no elements
+    bool empty() const;
+
+    //! removes all the elements from the sequence (does nothing when seq is null)
+    void clear();
+    //! removes the first element from the sequence
+    void pop_front();
+    //! removes the last element from the sequence
+    void pop_back();
+    //! removes zero or more elements from the beginning of the sequence
+    void pop_front(_Tp* elems, size_t count);
+    //! removes zero or more elements from the end of the sequence
+    void pop_back(_Tp* elems, size_t count);
+
+    //! copies the whole sequence or the sequence slice to the specified vector; the vector is resized to fit
+    void copyTo(std::vector<_Tp>& vec, const Range& range=Range::all()) const;
+    //! returns a vector containing all the sequence elements (built via copyTo)
+    operator std::vector<_Tp>() const;
+
+    CvSeq* seq; //!< the wrapped C sequence header; null after default construction
+};
+
+
+/*!
+ STL-style Sequence Iterator inherited from the CvSeqReader structure
+*/
+template<typename _Tp> class SeqIterator : public CvSeqReader
+{
+public:
+    //! the default constructor; zero-fills the whole object
+    SeqIterator();
+    //! the constructor setting the iterator to the beginning or to the end of the sequence
+    SeqIterator(const Seq<_Tp>& seq, bool seekEnd=false);
+    //! positions the iterator within the sequence
+    void seek(size_t pos);
+    //! reports the current iterator position (the value of index)
+    size_t tell() const;
+    //! returns reference to the current sequence element
+    _Tp& operator *();
+    //! returns read-only reference to the current sequence element
+    const _Tp& operator *() const;
+    //! moves iterator to the next sequence element (prefix form)
+    SeqIterator& operator ++();
+    //! postfix form: returns a copy taken before advancing; NOTE: declared const although its definition advances *this
+    SeqIterator operator ++(int) const;
+    //! moves iterator to the previous sequence element (prefix form)
+    SeqIterator& operator --();
+    //! postfix form: returns a copy taken before stepping back; NOTE: declared const although its definition moves *this
+    SeqIterator operator --(int) const;
+
+    //! moves iterator forward by the specified offset (possibly negative)
+    SeqIterator& operator +=(int);
+    //! moves iterator backward by the specified offset (possibly negative)
+    SeqIterator& operator -=(int);
+
+    // this is the index of the current element modulo seq->total*2
+    // (to distinguish between 0 and seq->total, i.e. between the begin and end positions)
+    int index;
+};
+
+
+
+// bridge C++ => C Seq API
+CV_EXPORTS schar*  seqPush( CvSeq* seq, const void* element=0);
+CV_EXPORTS schar*  seqPushFront( CvSeq* seq, const void* element=0);
+CV_EXPORTS void  seqPop( CvSeq* seq, void* element=0);
+CV_EXPORTS void  seqPopFront( CvSeq* seq, void* element=0);
+CV_EXPORTS void  seqPopMulti( CvSeq* seq, void* elements,
+                              int count, int in_front=0 );
+CV_EXPORTS void  seqRemove( CvSeq* seq, int index );
+CV_EXPORTS void  clearSeq( CvSeq* seq );
+CV_EXPORTS schar*  getSeqElem( const CvSeq* seq, int index );
+CV_EXPORTS void  seqRemoveSlice( CvSeq* seq, CvSlice slice );
+CV_EXPORTS void  seqInsertSlice( CvSeq* seq, int before_index, const CvArr* from_arr );
+
+template<typename _Tp> inline Seq<_Tp>::Seq() : seq(NULL) {}
+template<typename _Tp> inline Seq<_Tp>::Seq( const CvSeq* _seq ) : seq(const_cast<CvSeq*>(_seq))
+{
+    CV_Assert(_seq == NULL || _seq->elem_size == sizeof(_Tp)); // a null header is accepted as-is
+}
+
+template<typename _Tp> inline Seq<_Tp>::Seq( MemStorage& storage,
+                                             int headerSize )
+{
+    CV_Assert((int)sizeof(CvSeq) <= headerSize); // the header must be able to hold at least a CvSeq
+    seq = cvCreateSeq(DataType<_Tp>::type, headerSize, sizeof(_Tp), storage);
+}
+
+template<typename _Tp> inline _Tp& Seq<_Tp>::operator [](int idx)
+{ return *reinterpret_cast<_Tp*>(getSeqElem(seq, idx)); }
+
+template<typename _Tp> inline const _Tp& Seq<_Tp>::operator [](int idx) const
+{ return *reinterpret_cast<const _Tp*>(getSeqElem(seq, idx)); }
+
+template<typename _Tp> inline SeqIterator<_Tp> Seq<_Tp>::begin() const
+{ return SeqIterator<_Tp>(*this, false); }
+
+template<typename _Tp> inline SeqIterator<_Tp> Seq<_Tp>::end() const
+{ const bool seekEnd = true; return SeqIterator<_Tp>(*this, seekEnd); }
+
+template<typename _Tp> inline size_t Seq<_Tp>::size() const
+{ if( !seq ) return 0; return seq->total; }    // every accessor in this group reports 0 for a null seq
+
+template<typename _Tp> inline int Seq<_Tp>::type() const
+{ if( !seq ) return 0; return CV_MAT_TYPE(seq->flags); }
+
+template<typename _Tp> inline int Seq<_Tp>::depth() const
+{ if( !seq ) return 0; return CV_MAT_DEPTH(seq->flags); }
+
+template<typename _Tp> inline int Seq<_Tp>::channels() const
+{ if( !seq ) return 0; return CV_MAT_CN(seq->flags); }
+
+template<typename _Tp> inline size_t Seq<_Tp>::elemSize() const
+{ if( !seq ) return 0; return seq->elem_size; }
+
+template<typename _Tp> inline size_t Seq<_Tp>::index(const _Tp& elem) const
+{ return static_cast<size_t>(cvSeqElemIdx(seq, &elem)); }
+
+template<typename _Tp> inline void Seq<_Tp>::push_back(const _Tp& elem)
+{ const _Tp* src = &elem; cvSeqPush(seq, src); }
+
+template<typename _Tp> inline void Seq<_Tp>::push_front(const _Tp& elem)
+{ const _Tp* src = &elem; cvSeqPushFront(seq, src); }
+
+template<typename _Tp> inline void Seq<_Tp>::push_back(const _Tp* elem, size_t count)
+{ cvSeqPushMulti(seq, elem, static_cast<int>(count), 0); }
+
+template<typename _Tp> inline void Seq<_Tp>::push_front(const _Tp* elem, size_t count)
+{ cvSeqPushMulti(seq, elem, static_cast<int>(count), 1); }
+
+template<typename _Tp> inline _Tp& Seq<_Tp>::back()
+{ return *reinterpret_cast<_Tp*>(getSeqElem(seq, -1)); }
+
+template<typename _Tp> inline const _Tp& Seq<_Tp>::back() const
+{ return *reinterpret_cast<const _Tp*>(getSeqElem(seq, -1)); }
+
+template<typename _Tp> inline _Tp& Seq<_Tp>::front()
+{ return *reinterpret_cast<_Tp*>(getSeqElem(seq, 0)); }
+
+template<typename _Tp> inline const _Tp& Seq<_Tp>::front() const
+{ return *reinterpret_cast<const _Tp*>(getSeqElem(seq, 0)); }
+
+template<typename _Tp> inline bool Seq<_Tp>::empty() const
+{ return !(seq && seq->total != 0); }
+
+template<typename _Tp> inline void Seq<_Tp>::clear()
+{ if( !seq ) return; clearSeq(seq); }
+
+template<typename _Tp> inline void Seq<_Tp>::pop_back()
+{ seqPop(seq, 0); }        // 0: the popped element is not copied out
+
+template<typename _Tp> inline void Seq<_Tp>::pop_front()
+{ seqPopFront(seq, 0); }
+
+template<typename _Tp> inline void Seq<_Tp>::pop_back(_Tp* elem, size_t count)
+{ seqPopMulti(seq, elem, static_cast<int>(count), 0); }
+
+template<typename _Tp> inline void Seq<_Tp>::pop_front(_Tp* elem, size_t count)
+{ seqPopMulti(seq, elem, static_cast<int>(count), 1); }
+
+template<typename _Tp> inline void Seq<_Tp>::insert(int idx, const _Tp& elem)
+{ cvSeqInsert(seq, idx, &elem); } // C API call, as push_back() does; the C++ => C bridge list above declares no seqInsert()
+
+template<typename _Tp> inline void Seq<_Tp>::insert(int idx, const _Tp* elems, size_t count)
+{
+    CvMat m = cvMat(1, (int)count, DataType<_Tp>::type, (void*)elems); // cvMat() takes int cols and void* data; a const _Tp* needs the cast
+    seqInsertSlice(seq, idx, &m);   // m is only a temporary 1 x count header over elems, passed as const CvArr*
+}
+
+template<typename _Tp> inline void Seq<_Tp>::remove(int idx)
+{ seqRemove(this->seq, idx); }
+
+template<typename _Tp> inline void Seq<_Tp>::remove(const Range& r)
+{ const CvSlice span = cvSlice(r.start, r.end); seqRemoveSlice(seq, span); }
+
+template<typename _Tp> inline void Seq<_Tp>::copyTo(std::vector<_Tp>& vec, const Range& range) const
+{
+    const bool whole = range == Range::all();   // Range::all() selects every element of the sequence
+    const size_t len = !seq ? 0 : whole ? seq->total : range.end - range.start;
+    vec.resize(len);
+    if( seq && len ) cvCvtSeqToArray(seq, &vec[0], cvSlice(range));
+}
+
+template<typename _Tp> inline Seq<_Tp>::operator std::vector<_Tp>() const
+{
+    std::vector<_Tp> elems;
+    this->copyTo(elems, Range::all());  // whole sequence; leaves the vector empty when seq is null
+    return elems;
+}
+
+template<typename _Tp> inline SeqIterator<_Tp>::SeqIterator()
+{ memset(this, 0, sizeof(SeqIterator<_Tp>)); }  // zero-fills the reader fields and index
+
+template<typename _Tp> inline SeqIterator<_Tp>::SeqIterator(const Seq<_Tp>& _seq, bool seekEnd)
+{
+    cvStartReadSeq(_seq.seq, this);
+    index = !seekEnd ? 0 : _seq.seq->total;     // index == total denotes the end position
+}
+
+template<typename _Tp> inline void SeqIterator<_Tp>::seek(size_t pos)
+{
+    cvSetSeqReaderPos(this, static_cast<int>(pos), 0);
+    index = static_cast<int>(pos);  // mirror the reader position in index
+}
+
+template<typename _Tp> inline size_t SeqIterator<_Tp>::tell() const
+{ return static_cast<size_t>(index); }
+
+template<typename _Tp> inline _Tp& SeqIterator<_Tp>::operator *()
+{ return *reinterpret_cast<_Tp*>(ptr); }
+
+template<typename _Tp> inline const _Tp& SeqIterator<_Tp>::operator *() const
+{ return *reinterpret_cast<const _Tp*>(ptr); }
+
+template<typename _Tp> inline SeqIterator<_Tp>& SeqIterator<_Tp>::operator ++()
+{
+    CV_NEXT_SEQ_ELEM(sizeof(_Tp), *this);
+    ++index;                                    // index is kept in [0, 2*total)
+    if( index >= seq->total*2 ) index = 0;
+    return *this;
+}
+
+template<typename _Tp> inline SeqIterator<_Tp> SeqIterator<_Tp>::operator ++(int) const
+{
+    SeqIterator<_Tp> it = *this;                // snapshot returned to the caller (postfix semantics)
+    ++const_cast<SeqIterator<_Tp>&>(*this);     // *this is const here, so a plain ++*this cannot compile
+    return it;
+}
+
+template<typename _Tp> inline SeqIterator<_Tp>& SeqIterator<_Tp>::operator --()
+{
+    CV_PREV_SEQ_ELEM(sizeof(_Tp), *this);
+    --index;                                    // wrap below zero to the top of [0, 2*total)
+    if( index < 0 ) index = seq->total*2-1;
+    return *this;
+}
+
+template<typename _Tp> inline SeqIterator<_Tp> SeqIterator<_Tp>::operator --(int) const
+{
+    SeqIterator<_Tp> it = *this;                // snapshot returned to the caller (postfix semantics)
+    --const_cast<SeqIterator<_Tp>&>(*this);     // *this is const here, so a plain --*this cannot compile
+    return it;
+}
+
+template<typename _Tp> inline SeqIterator<_Tp>& SeqIterator<_Tp>::operator +=(int delta)
+{
+    cvSetSeqReaderPos(this, delta, 1);
+    const int period = seq->total*2;    // index is kept modulo twice the element count
+    index += delta;
+    if( index < 0 )
+        index += period;
+    if( index >= period )
+        index -= period;
+    return *this;
+}
+
+template<typename _Tp> inline SeqIterator<_Tp>& SeqIterator<_Tp>::operator -=(int delta)
+{
+    return this->operator +=(-delta);   // moving back by delta is moving forward by -delta
+}
+
+template<typename _Tp> inline ptrdiff_t operator - (const SeqIterator<_Tp>& a,
+                                                    const SeqIterator<_Tp>& b)
+{
+    const ptrdiff_t n = a.seq->total;
+    ptrdiff_t delta = a.index - b.index;
+    if( delta < -n || n < delta ) delta += (delta >= 0 ? -n : n);   // fold the difference back by one sequence length
+    return delta;
+}
+
+template<typename _Tp> inline bool operator == (const SeqIterator<_Tp>& a,
+                                                const SeqIterator<_Tp>& b)
+{
+    return !(a.seq != b.seq || a.index != b.index);     // same sequence and same position
+}
+
+template<typename _Tp> inline bool operator != (const SeqIterator<_Tp>& a,
+                                                const SeqIterator<_Tp>& b)
+{
+    return a.seq != b.seq || a.index != b.index;        // differs in sequence or in position
+}
+
+//! @}
+
+} // cv
+
+#endif
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda.hpp
new file mode 100644
index 000000000000..9d210ed7b55b
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda.hpp
@@ -0,0 +1,1337 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_CUDA_HPP
+#define OPENCV_CORE_CUDA_HPP
+
+#ifndef __cplusplus
+#  error cuda.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core.hpp"
+#include "opencv2/core/cuda_types.hpp"
+
+/**
+  @defgroup cuda CUDA-accelerated Computer Vision
+  @{
+    @defgroup cudacore Core part
+    @{
+      @defgroup cudacore_init Initialization and Information
+      @defgroup cudacore_struct Data Structures
+    @}
+  @}
+ */
+
+namespace cv { namespace cuda {
+
+//! @addtogroup cudacore_struct
+//! @{
+
+//===================================================================================
+// GpuMat
+//===================================================================================
+
+/** @brief Base storage class for GPU memory with reference counting.
+
+Its interface matches the Mat interface with the following limitations:
+
+-   no arbitrary dimensions support (only 2D)
+-   no functions that return references to their data (because references on GPU are not valid for
+    CPU)
+-   no expression templates technique support
+
+Beware that the latter limitation may lead to overloaded matrix operators that cause memory
+allocations. The GpuMat class is convertible to cuda::PtrStepSz and cuda::PtrStep so it can be
+passed directly to the kernel.
+
+@note In contrast with Mat, in most cases GpuMat::isContinuous() == false . This means that rows are
+aligned to a size depending on the hardware. Single-row GpuMat is always a continuous matrix.
+
+@note You are not recommended to leave static or global GpuMat variables allocated, that is, to rely
+on its destructor. The destruction order of such variables and CUDA context is undefined. GPU memory
+release function returns error if the CUDA context has been destroyed before.
+
+Some member functions are described as a "Blocking Call" while some are described as a
+"Non-Blocking Call". Blocking functions are synchronous to host. It is guaranteed that the GPU
+operation is finished when the function returns. However, non-blocking functions are asynchronous to
+host. Those functions may return even if the GPU operation is not finished.
+
+Compared to their blocking counterpart, non-blocking functions accept Stream as an additional
+argument. If a non-default stream is passed, the GPU operation may overlap with operations in other
+streams.
+
+@sa Mat
+ */
+class CV_EXPORTS_W GpuMat
+{
+public:
+    class CV_EXPORTS_W Allocator
+    {
+    public:
+        virtual ~Allocator() {}
+
+        // allocator must fill data, step and refcount fields
+        virtual bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize) = 0;
+        virtual void free(GpuMat* mat) = 0;
+    };
+
+    //! default allocator
+    CV_WRAP static GpuMat::Allocator* defaultAllocator();
+    CV_WRAP static void setDefaultAllocator(GpuMat::Allocator* allocator);
+
+    //! default constructor
+    CV_WRAP explicit GpuMat(GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
+
+    //! constructs GpuMat of the specified size and type
+    CV_WRAP GpuMat(int rows, int cols, int type, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
+    CV_WRAP GpuMat(Size size, int type, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
+
+    //! constructs GpuMat and fills it with the specified value _s
+    CV_WRAP GpuMat(int rows, int cols, int type, Scalar s, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
+    CV_WRAP GpuMat(Size size, int type, Scalar s, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
+
+    //! copy constructor
+    CV_WRAP GpuMat(const GpuMat& m);
+
+    //! constructor for GpuMat headers pointing to user-allocated data
+    GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);
+    GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);
+
+    //! creates a GpuMat header for a part of the bigger matrix
+    CV_WRAP GpuMat(const GpuMat& m, Range rowRange, Range colRange);
+    CV_WRAP GpuMat(const GpuMat& m, Rect roi);
+
+    //! builds GpuMat from host memory (Blocking call)
+    CV_WRAP explicit GpuMat(InputArray arr, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
+
+    //! destructor - calls release()
+    ~GpuMat();
+
+    //! assignment operators
+    GpuMat& operator =(const GpuMat& m);
+
+    //! allocates new GpuMat data unless the GpuMat already has specified size and type
+    CV_WRAP void create(int rows, int cols, int type);
+    CV_WRAP void create(Size size, int type);
+
+    //! decreases reference counter, deallocate the data when reference counter reaches 0
+    CV_WRAP void release();
+
+    //! swaps with other smart pointer
+    CV_WRAP void swap(GpuMat& mat);
+
+    /** @brief Performs data upload to GpuMat (Blocking call)
+
+    This function copies data from host memory to device memory. As being a blocking call, it is
+    guaranteed that the copy operation is finished when this function returns.
+    */
+    CV_WRAP void upload(InputArray arr);
+
+    /** @brief Performs data upload to GpuMat (Non-Blocking call)
+
+    This function copies data from host memory to device memory. As being a non-blocking call, this
+    function may return even if the copy operation is not finished.
+
+    The copy operation may be overlapped with operations in other non-default streams if \p stream is
+    not the default stream and \p arr is HostMem allocated with HostMem::PAGE_LOCKED option.
+    */
+    CV_WRAP void upload(InputArray arr, Stream& stream);
+
+    /** @brief Performs data download from GpuMat (Blocking call)
+
+    This function copies data from device memory to host memory. As being a blocking call, it is
+    guaranteed that the copy operation is finished when this function returns.
+    */
+    CV_WRAP void download(OutputArray dst) const;
+
+    /** @brief Performs data download from GpuMat (Non-Blocking call)
+
+    This function copies data from device memory to host memory. As being a non-blocking call, this
+    function may return even if the copy operation is not finished.
+
+    The copy operation may be overlapped with operations in other non-default streams if \p stream is
+    not the default stream and \p dst is HostMem allocated with HostMem::PAGE_LOCKED option.
+    */
+    CV_WRAP void download(OutputArray dst, Stream& stream) const;
+
+    //! returns deep copy of the GpuMat, i.e. the data is copied
+    CV_WRAP GpuMat clone() const;
+
+    //! copies the GpuMat content to device memory (Blocking call)
+    void copyTo(OutputArray dst) const;
+    //! bindings overload which copies the GpuMat content to device memory (Blocking call)
+    CV_WRAP void copyTo(CV_OUT GpuMat& dst) const {
+        copyTo(static_cast<OutputArray>(dst));
+    }
+
+    //! copies the GpuMat content to device memory (Non-Blocking call)
+    void copyTo(OutputArray dst, Stream& stream) const;
+    //! bindings overload which copies the GpuMat content to device memory (Non-Blocking call)
+    CV_WRAP void copyTo(CV_OUT GpuMat& dst, Stream& stream) const {
+        copyTo(static_cast<OutputArray>(dst), stream);
+    }
+
+    //! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Blocking call)
+    void copyTo(OutputArray dst, InputArray mask) const;
+    //! bindings overload which copies those GpuMat elements to "m" that are marked with non-zero mask elements (Blocking call)
+    CV_WRAP void copyTo(CV_OUT GpuMat& dst, GpuMat& mask) const {
+        copyTo(static_cast<OutputArray>(dst), static_cast<InputArray>(mask));
+    }
+
+    //! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Non-Blocking call)
+    void copyTo(OutputArray dst, InputArray mask, Stream& stream) const;
+    //! bindings overload which copies those GpuMat elements to "m" that are marked with non-zero mask elements (Non-Blocking call)
+    CV_WRAP void copyTo(CV_OUT GpuMat& dst, GpuMat& mask, Stream& stream) const {
+        copyTo(static_cast<OutputArray>(dst), static_cast<InputArray>(mask), stream);
+    }
+
+    //! sets some of the GpuMat elements to s (Blocking call)
+    CV_WRAP GpuMat& setTo(Scalar s);
+
+    //! sets some of the GpuMat elements to s (Non-Blocking call)
+    CV_WRAP GpuMat& setTo(Scalar s, Stream& stream);
+
+    //! sets some of the GpuMat elements to s, according to the mask (Blocking call)
+    CV_WRAP GpuMat& setTo(Scalar s, InputArray mask);
+
+    //! sets some of the GpuMat elements to s, according to the mask (Non-Blocking call)
+    CV_WRAP GpuMat& setTo(Scalar s, InputArray mask, Stream& stream);
+
+    //! converts GpuMat to another datatype (Blocking call)
+    void convertTo(OutputArray dst, int rtype) const;
+
+    //! converts GpuMat to another datatype (Non-Blocking call)
+    void convertTo(OutputArray dst, int rtype, Stream& stream) const;
+    //! bindings overload which converts GpuMat to another datatype (Non-Blocking call)
+    CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, Stream& stream) const {
+        convertTo(static_cast<OutputArray>(dst), rtype, stream);
+    }
+
+    //! converts GpuMat to another datatype with scaling (Blocking call); beta defaults to 0.0
+    void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const;
+    //! bindings overload which converts GpuMat to another datatype with scaling (Blocking call); alpha defaults to 1.0 and beta to 0.0
+    CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, double alpha = 1.0, double beta = 0.0) const {
+        convertTo(static_cast<OutputArray>(dst), rtype, alpha, beta);
+    }
+
+    //! converts GpuMat to another datatype with scaling (Non-Blocking call); this overload takes alpha only, no beta
+    void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const;
+
+    //! converts GpuMat to another datatype with scaling by alpha and beta (Non-Blocking call)
+    void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const;
+    //! bindings overload which converts GpuMat to another datatype with scaling (Non-Blocking call)
+    CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, double alpha, double beta, Stream& stream) const {
+        convertTo(static_cast<OutputArray>(dst), rtype, alpha, beta, stream);
+    }
+
+    CV_WRAP void assignTo(GpuMat& m, int type = -1) const; // NOTE(review): undocumented in this header; confirm the meaning of the default type = -1 in the implementation
+
+    //! returns pointer to y-th row
+    uchar* ptr(int y = 0);
+    const uchar* ptr(int y = 0) const;
+
+    //! template versions of ptr(): same row argument, pointer typed as _Tp
+    template<typename _Tp> _Tp* ptr(int y = 0);
+    template<typename _Tp> const _Tp* ptr(int y = 0) const;
+
+    template <typename _Tp> operator PtrStepSz<_Tp>() const; //!< implicit conversion to PtrStepSz<_Tp>
+    template <typename _Tp> operator PtrStep<_Tp>() const; //!< implicit conversion to PtrStep<_Tp>
+
+    //! returns a new GpuMat header for the specified row
+    CV_WRAP GpuMat row(int y) const;
+
+    //! returns a new GpuMat header for the specified column
+    CV_WRAP GpuMat col(int x) const;
+
+    //! returns a new GpuMat header for the specified row span
+    CV_WRAP GpuMat rowRange(int startrow, int endrow) const;
+    CV_WRAP GpuMat rowRange(Range r) const;
+
+    //! returns a new GpuMat header for the specified column span
+    CV_WRAP GpuMat colRange(int startcol, int endcol) const;
+    CV_WRAP GpuMat colRange(Range r) const;
+
+    //! extracts a rectangular sub-GpuMat (this is a generalized form of row, rowRange etc.)
+    GpuMat operator ()(Range rowRange, Range colRange) const;
+    GpuMat operator ()(Rect roi) const;
+
+    //! creates an alternative GpuMat header for the same data, with a different
+    //! number of channels and/or a different number of rows
+    CV_WRAP GpuMat reshape(int cn, int rows = 0) const;
+
+    //! locates GpuMat header within a parent GpuMat
+    CV_WRAP void locateROI(Size& wholeSize, Point& ofs) const;
+
+    //! moves/resizes the current GpuMat ROI inside the parent GpuMat
+    CV_WRAP GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);
+
+    //! returns true iff the GpuMat data is continuous
+    //! (i.e. when there are no gaps between successive rows)
+    CV_WRAP bool isContinuous() const;
+
+    //! returns element size in bytes
+    CV_WRAP size_t elemSize() const;
+
+    //! returns the size of element channel in bytes
+    CV_WRAP size_t elemSize1() const;
+
+    //! returns element type
+    CV_WRAP int type() const;
+
+    //! returns element depth
+    CV_WRAP int depth() const;
+
+    //! returns number of channels
+    CV_WRAP int channels() const;
+
+    //! returns step/elemSize1()
+    CV_WRAP size_t step1() const;
+
+    //! returns GpuMat size : width == number of columns, height == number of rows
+    CV_WRAP Size size() const;
+
+    //! returns true if GpuMat data is NULL
+    CV_WRAP bool empty() const;
+
+    //! returns pointer to cuda memory
+    CV_WRAP void* cudaPtr() const;
+
+    //! internal use method: updates the continuity flag
+    CV_WRAP void updateContinuityFlag();
+
+    /*! includes several bit-fields:
+    - the magic signature
+    - continuity flag
+    - depth
+    - number of channels
+    */
+    int flags;
+
+    //! the number of rows and columns
+    int rows, cols;
+
+    //! a distance between successive rows in bytes; includes the gap if any
+    CV_PROP size_t step;
+
+    //! pointer to the data
+    uchar* data;
+
+    //! pointer to the reference counter;
+    //! when GpuMat points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    //! helper fields used in locateROI and adjustROI
+    uchar* datastart;
+    const uchar* dataend;
+
+    //! allocator
+    Allocator* allocator;
+};
+
+struct CV_EXPORTS_W GpuData // held by GpuMatND through std::shared_ptr<GpuData>
+{
+    explicit GpuData(size_t _size);
+     ~GpuData();
+
+    GpuData(const GpuData&) = delete; // not copy-constructible
+    GpuData& operator=(const GpuData&) = delete; // not copy-assignable
+
+    GpuData(GpuData&&) = delete; // not move-constructible
+    GpuData& operator=(GpuData&&) = delete; // not move-assignable
+
+    uchar* data; // NOTE(review): presumably the device buffer managed by this object; confirm in the implementation
+    size_t size; // NOTE(review): presumably the byte count passed to the constructor; confirm in the implementation
+};
+
+class CV_EXPORTS_W GpuMatND
+{
+public:
+    using SizeArray = std::vector<int>;
+    using StepArray = std::vector<size_t>;
+    using IndexArray = std::vector<int>;
+
+    //! destructor
+    ~GpuMatND();
+
+    //! default constructor
+    GpuMatND();
+
+    /** @overload
+    @param size Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_16FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    */
+    GpuMatND(SizeArray size, int type);
+
+    /** @overload
+    @param size Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_16FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param data Pointer to the user data. Matrix constructors that take data and step parameters do not
+    allocate matrix data. Instead, they just initialize the matrix header that points to the specified
+    data, which means that no data is copied. This operation is very efficient and can be used to
+    process external data using OpenCV functions. The external data is not automatically deallocated, so
+    you should take care of it.
+    @param step Array of _size.size()-1 steps in case of a multi-dimensional array (the last step is always
+    set to the element size). If not specified, the matrix is assumed to be continuous.
+    */
+    GpuMatND(SizeArray size, int type, void* data, StepArray step = StepArray());
+
+    /** @brief Allocates GPU memory.
+    Suppose there is some GPU memory already allocated. In that case, this method may choose to reuse that
+    GPU memory under the specific condition: it must be of the same size and type, not externally allocated,
+    the GPU memory is continuous (i.e., isContinuous() is true), and is not a sub-matrix of another GpuMatND
+    (i.e., isSubmatrix() is false). In other words, this method guarantees that the GPU memory allocated by
+    this method is always continuous and is not a sub-region of another GpuMatND.
+    */
+    void create(SizeArray size, int type);
+
+    void release();
+
+    void swap(GpuMatND& m) noexcept;
+
+    /** @brief Creates a full copy of the array and the underlying data.
+    The method creates a full copy of the array. It mimics the behavior of Mat::clone(), i.e.
+    the original step is not taken into account. So, the array copy is a continuous array
+    occupying total()\*elemSize() bytes.
+    */
+    GpuMatND clone() const;
+
+    /** @overload
+    This overload is non-blocking, so it may return even if the copy operation is not finished.
+    */
+    GpuMatND clone(Stream& stream) const;
+
+    /** @brief Extracts a sub-matrix.
+    The operator makes a new header for the specified sub-array of \*this.
+    The operator is an O(1) operation, that is, no matrix data is copied.
+    @param ranges Array of selected ranges along each dimension.
+    */
+    GpuMatND operator()(const std::vector<Range>& ranges) const;
+
+    /** @brief Creates a GpuMat header for a 2D plane part of an n-dim matrix.
+    @note The returned GpuMat is constructed with the constructor for user-allocated data.
+    That is, it does not perform reference counting.
+    @note This function does not increment this GpuMatND's reference counter.
+    */
+    GpuMat createGpuMatHeader(IndexArray idx, Range rowRange, Range colRange) const;
+
+    /** @overload
+    Creates a GpuMat header if this GpuMatND is effectively 2D.
+    @note The returned GpuMat is constructed with the constructor for user-allocated data.
+    That is, it does not perform reference counting.
+    @note This function does not increment this GpuMatND's reference counter.
+    */
+    GpuMat createGpuMatHeader() const;
+
+    /** @brief Extracts a 2D plane part of an n-dim matrix.
+    It differs from createGpuMatHeader(IndexArray, Range, Range) in that it clones a part of this
+    GpuMatND to the returned GpuMat.
+    @note This operator does not increment this GpuMatND's reference counter.
+    */
+    GpuMat operator()(IndexArray idx, Range rowRange, Range colRange) const;
+
+    /** @brief Extracts a 2D plane part of an n-dim matrix if this GpuMatND is effectively 2D.
+    It differs from createGpuMatHeader() in that it clones a part of this GpuMatND.
+    @note This operator does not increment this GpuMatND's reference counter.
+    */
+    operator GpuMat() const;
+
+    GpuMatND(const GpuMatND&) = default;
+    GpuMatND& operator=(const GpuMatND&) = default;
+
+#if defined(__GNUC__) && __GNUC__ < 5
+    // error: function '...' defaulted on its first declaration with an exception-specification
+    // that differs from the implicit declaration '...'
+
+    GpuMatND(GpuMatND&&) = default;
+    GpuMatND& operator=(GpuMatND&&) = default;
+#else
+    GpuMatND(GpuMatND&&) noexcept = default;
+    GpuMatND& operator=(GpuMatND&&) noexcept = default;
+#endif
+
+    void upload(InputArray src);
+    void upload(InputArray src, Stream& stream);
+    void download(OutputArray dst) const;
+    void download(OutputArray dst, Stream& stream) const;
+
+    //! returns true iff the GpuMatND data is continuous
+    //! (i.e. when there are no gaps between successive rows)
+    bool isContinuous() const;
+
+    //! returns true if the matrix is a sub-matrix of another matrix
+    bool isSubmatrix() const;
+
+    //! returns element size in bytes
+    size_t elemSize() const;
+
+    //! returns the size of element channel in bytes
+    size_t elemSize1() const;
+
+    //! returns true if data is null
+    bool empty() const;
+
+    //! returns true if not empty and points to external (user-allocated) gpu memory
+    bool external() const;
+
+    //! returns pointer to the first byte of the GPU memory
+    uchar* getDevicePtr() const;
+
+    //! returns the total number of array elements
+    size_t total() const;
+
+    //! returns the size of underlying memory in bytes
+    size_t totalMemSize() const;
+
+    //! returns element type
+    int type() const;
+
+private:
+    //! internal use
+    void setFields(SizeArray size, int type, StepArray step = StepArray());
+
+public:
+    /*! includes several bit-fields:
+    - the magic signature
+    - continuity flag
+    - depth
+    - number of channels
+    */
+    int flags;
+
+    //! matrix dimensionality
+    int dims;
+
+    //! shape of this array
+    SizeArray size;
+
+    /*! step values
+    Their semantics are identical to the semantics of step for Mat.
+    */
+    StepArray step;
+
+private:
+    /*! internal use
+    If this GpuMatND holds external memory, this is empty.
+    */
+    std::shared_ptr<GpuData> data_;
+
+    /*! internal use
+    If this GpuMatND manages memory with reference counting, this value is
+    always equal to data_->data. If this GpuMatND holds external memory,
+    data_ is empty and data points to the external memory.
+    */
+    uchar* data;
+
+    /*! internal use
+    If this GpuMatND is a sub-matrix of a larger matrix, this value is the
+    difference of the first byte between the sub-matrix and the whole matrix.
+    */
+    size_t offset;
+};
+
+/** @brief Creates a continuous matrix.
+
+@param rows Row count.
+@param cols Column count.
+@param type Type of the matrix.
+@param arr Destination matrix. This parameter changes only if it has a proper type and area (
+\f$\texttt{rows} \times \texttt{cols}\f$ ).
+
+Matrix is called continuous if its elements are stored continuously, that is, without gaps at the
+end of each row.
+ */
+CV_EXPORTS_W void createContinuous(int rows, int cols, int type, OutputArray arr);
+
+/** @brief Ensures that the size of a matrix is big enough and the matrix has a proper type.
+
+@param rows Minimum desired number of rows.
+@param cols Minimum desired number of columns.
+@param type Desired matrix type.
+@param arr Destination matrix.
+
+The function does not reallocate memory if the matrix has proper attributes already.
+ */
+CV_EXPORTS_W void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr);
+
+/** @brief Bindings overload to create a GpuMat from existing GPU memory.
+@param rows Row count.
+@param cols Column count.
+@param type Type of the matrix.
+@param cudaMemoryAddress Address of the allocated GPU memory on the device. This does not allocate matrix data. Instead, it just initializes the matrix header that points to the specified \a cudaMemoryAddress, which means that no data is copied. This operation is very efficient and can be used to process external data using OpenCV functions. The external data is not automatically deallocated, so you should take care of it.
+@param step Number of bytes each matrix row occupies. The value should include the padding bytes at the end of each row, if any. If the parameter is missing (set to Mat::AUTO_STEP ), no padding is assumed and the actual step is calculated as cols*elemSize(). See GpuMat::elemSize.
+@note Overload for generation of bindings only, not exported or intended for use internally from C++.
+ */
+CV_EXPORTS_W GpuMat inline createGpuMatFromCudaMemory(int rows, int cols, int type, size_t cudaMemoryAddress, size_t step = Mat::AUTO_STEP) {
+    return GpuMat(rows, cols, type, reinterpret_cast<void*>(cudaMemoryAddress), step); // the integer address is reinterpreted as the data pointer handed to the GpuMat constructor
+}
+
+ /** @overload
+@param size 2D array size: Size(cols, rows). In the Size() constructor, the number of rows and the number of columns go in the reverse order.
+@param type Type of the matrix.
+@param cudaMemoryAddress Address of the allocated GPU memory on the device. This does not allocate matrix data. Instead, it just initializes the matrix header that points to the specified \a cudaMemoryAddress, which means that no data is copied. This operation is very efficient and can be used to process external data using OpenCV functions. The external data is not automatically deallocated, so you should take care of it.
+@param step Number of bytes each matrix row occupies. The value should include the padding bytes at the end of each row, if any. If the parameter is missing (set to Mat::AUTO_STEP ), no padding is assumed and the actual step is calculated as cols*elemSize(). See GpuMat::elemSize.
+@note Overload for generation of bindings only, not exported or intended for use internally from C++.
+ */
+CV_EXPORTS_W inline GpuMat createGpuMatFromCudaMemory(Size size, int type, size_t cudaMemoryAddress, size_t step = Mat::AUTO_STEP) {
+    return GpuMat(size, type, reinterpret_cast<void*>(cudaMemoryAddress), step); // the integer address is reinterpreted as the data pointer handed to the GpuMat constructor
+}
+
+/** @brief BufferPool for use with CUDA streams
+
+BufferPool utilizes Stream's allocator to create new buffers for GpuMat's. It is
+only useful when enabled with #setBufferPoolUsage.
+
+@code
+    setBufferPoolUsage(true);
+@endcode
+
+@note #setBufferPoolUsage must be called \em before any Stream declaration.
+
+Users may specify custom allocator for Stream and may implement their own stream based
+functions utilizing the same underlying GPU memory management.
+
+If custom allocator is not specified, BufferPool utilizes StackAllocator by
+default. StackAllocator allocates a chunk of GPU device memory beforehand,
+and when GpuMat is declared later on, it is given the pre-allocated memory.
+This kind of strategy reduces the number of calls for memory allocating APIs
+such as cudaMalloc or cudaMallocPitch.
+
+Below is an example that utilizes BufferPool with StackAllocator:
+
+@code
+    #include <opencv2/opencv.hpp>
+
+    using namespace cv;
+    using namespace cv::cuda;
+
+    int main()
+    {
+        setBufferPoolUsage(true);                               // Tell OpenCV that we are going to utilize BufferPool
+        setBufferPoolConfig(getDevice(), 1024 * 1024 * 64, 2);  // Allocate 64 MB, 2 stacks (default is 10 MB, 5 stacks)
+
+        Stream stream1, stream2;                                // Each stream uses 1 stack
+        BufferPool pool1(stream1), pool2(stream2);
+
+        GpuMat d_src1 = pool1.getBuffer(4096, 4096, CV_8UC1);   // 16MB
+        GpuMat d_dst1 = pool1.getBuffer(4096, 4096, CV_8UC3);   // 48MB, pool1 is now full
+
+        GpuMat d_src2 = pool2.getBuffer(1024, 1024, CV_8UC1);   // 1MB
+        GpuMat d_dst2 = pool2.getBuffer(1024, 1024, CV_8UC3);   // 3MB
+
+        cvtColor(d_src1, d_dst1, cv::COLOR_GRAY2BGR, 0, stream1);
+        cvtColor(d_src2, d_dst2, cv::COLOR_GRAY2BGR, 0, stream2);
+    }
+@endcode
+
+If we allocate another GpuMat on pool1 in the above example, it will be carried out by
+the DefaultAllocator since the stack for pool1 is full.
+
+@code
+    GpuMat d_add1 = pool1.getBuffer(1024, 1024, CV_8UC1);   // Stack for pool1 is full, memory is allocated with DefaultAllocator
+@endcode
+
+If a third stream is declared in the above example, allocating with #getBuffer
+within that stream will also be carried out by the DefaultAllocator because we've run out of
+stacks.
+
+@code
+    Stream stream3;                                         // Only 2 stacks were allocated, we've run out of stacks
+    BufferPool pool3(stream3);
+    GpuMat d_src3 = pool3.getBuffer(1024, 1024, CV_8UC1);   // Memory is allocated with DefaultAllocator
+@endcode
+
+@warning When utilizing StackAllocator, deallocation order is important.
+
+Just like a stack, deallocation must be done in LIFO order. Below is an example of
+erroneous usage that violates LIFO rule. If OpenCV is compiled in Debug mode, this
+sample code will emit CV_Assert error.
+
+@code
+    int main()
+    {
+        setBufferPoolUsage(true);                               // Tell OpenCV that we are going to utilize BufferPool
+        Stream stream;                                          // A default size (10 MB) stack is allocated to this stream
+        BufferPool pool(stream);
+
+        GpuMat mat1 = pool.getBuffer(1024, 1024, CV_8UC1);      // Allocate mat1 (1MB)
+        GpuMat mat2 = pool.getBuffer(1024, 1024, CV_8UC1);      // Allocate mat2 (1MB)
+
+        mat1.release();                                         // erroneous usage : mat2 must be deallocated before mat1
+    }
+@endcode
+
+Since C++ local variables are destroyed in the reverse order of construction,
+the code sample below satisfies the LIFO rule. Local GpuMat's are deallocated
+and the corresponding memory is automatically returned to the pool for later usage.
+
+@code
+    int main()
+    {
+        setBufferPoolUsage(true);                               // Tell OpenCV that we are going to utilize BufferPool
+        setBufferPoolConfig(getDevice(), 1024 * 1024 * 64, 2);  // Allocate 64 MB, 2 stacks (default is 10 MB, 5 stacks)
+
+        Stream stream1, stream2;                                // Each stream uses 1 stack
+        BufferPool pool1(stream1), pool2(stream2);
+
+        for (int i = 0; i < 10; i++)
+        {
+            GpuMat d_src1 = pool1.getBuffer(4096, 4096, CV_8UC1);   // 16MB
+            GpuMat d_dst1 = pool1.getBuffer(4096, 4096, CV_8UC3);   // 48MB, pool1 is now full
+
+            GpuMat d_src2 = pool2.getBuffer(1024, 1024, CV_8UC1);   // 1MB
+            GpuMat d_dst2 = pool2.getBuffer(1024, 1024, CV_8UC3);   // 3MB
+
+            d_src1.setTo(Scalar(i), stream1);
+            d_src2.setTo(Scalar(i), stream2);
+
+            cvtColor(d_src1, d_dst1, cv::COLOR_GRAY2BGR, 0, stream1);
+            cvtColor(d_src2, d_dst2, cv::COLOR_GRAY2BGR, 0, stream2);
+                                                                    // The order of destruction of the local variables is:
+                                                                    //   d_dst2 => d_src2 => d_dst1 => d_src1
+                                                                    // LIFO rule is satisfied, this code runs without error
+        }
+    }
+@endcode
+ */
+class CV_EXPORTS_W BufferPool
+{
+public:
+
+    //! Gets the BufferPool for the given stream.
+    CV_WRAP explicit BufferPool(Stream& stream);
+
+    //! Allocates a new GpuMat of given size and type.
+    CV_WRAP GpuMat getBuffer(int rows, int cols, int type);
+
+// Silences MSVC warning C4702 (unreachable code) around the inline overload below; reported when building with Ninja
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(push)
+#pragma warning(disable: 4702)
+#endif
+    //! Allocates a new GpuMat of given size and type (forwards size.height as rows and size.width as cols).
+    CV_WRAP GpuMat getBuffer(Size size, int type) { return getBuffer(size.height, size.width, type); }
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(pop)
+#endif
+
+    //! Returns the allocator associated with the stream.
+    CV_WRAP Ptr<GpuMat::Allocator> getAllocator() const { return allocator_; }
+
+private:
+    Ptr<GpuMat::Allocator> allocator_;
+};
+
+//! BufferPool management (must be called before Stream creation)
+CV_EXPORTS_W void setBufferPoolUsage(bool on);
+CV_EXPORTS_W void setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount);
+
+//===================================================================================
+// HostMem
+//===================================================================================
+
+/** @brief Class with reference counting wrapping special memory type allocation functions from CUDA.
+
+Its interface is also Mat-like but with additional memory type parameters.
+
+-   **PAGE_LOCKED** sets a page locked memory type used commonly for fast and asynchronous
+    uploading/downloading data from/to GPU.
+-   **SHARED** specifies a zero copy memory allocation that enables mapping the host memory to GPU
+    address space, if supported.
+-   **WRITE_COMBINED** sets the write combined buffer that is not cached by CPU. Such buffers are
+    used to supply GPU with data when GPU only reads it. The advantage is a better CPU cache
+    utilization.
+
+@note Allocation size of such memory types is usually limited. For more details, see *CUDA 2.2
+Pinned Memory APIs* document or *CUDA C Programming Guide*.
+ */
+class CV_EXPORTS_W HostMem
+{
+public:
+    enum AllocType { PAGE_LOCKED = 1, SHARED = 2, WRITE_COMBINED = 4 };
+
+    static MatAllocator* getAllocator(HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);
+
+    CV_WRAP explicit HostMem(HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);
+
+    HostMem(const HostMem& m);
+
+    CV_WRAP HostMem(int rows, int cols, int type, HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);
+    CV_WRAP HostMem(Size size, int type, HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);
+
+    //! creates from host memory, copying the data
+    CV_WRAP explicit HostMem(InputArray arr, HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);
+
+    ~HostMem();
+
+    HostMem& operator =(const HostMem& m);
+
+    //! swaps with another HostMem
+    CV_WRAP void swap(HostMem& b);
+
+    //! returns deep copy of the matrix, i.e. the data is copied
+    CV_WRAP HostMem clone() const;
+
+    //! allocates new matrix data unless the matrix already has specified size and type.
+    CV_WRAP void create(int rows, int cols, int type);
+    void create(Size size, int type);
+
+    //! creates an alternative HostMem header for the same data, with a different
+    //! number of channels and/or a different number of rows
+    CV_WRAP HostMem reshape(int cn, int rows = 0) const;
+
+    //! decrements reference counter and releases memory if needed.
+    void release();
+
+    //! returns matrix header with disabled reference counting for HostMem data.
+    CV_WRAP Mat createMatHeader() const;
+
+    /** @brief Maps CPU memory to GPU address space and creates the cuda::GpuMat header without reference counting
+    for it.
+
+    This can be done only if memory was allocated with the SHARED flag and if it is supported by the
+    hardware. Laptops often share video and CPU memory, so address spaces can be mapped, which
+    eliminates an extra copy.
+     */
+    GpuMat createGpuMatHeader() const;
+
+    // Please see cv::Mat for descriptions
+    CV_WRAP bool isContinuous() const;
+    CV_WRAP size_t elemSize() const;
+    CV_WRAP size_t elemSize1() const;
+    CV_WRAP int type() const;
+    CV_WRAP int depth() const;
+    CV_WRAP int channels() const;
+    CV_WRAP size_t step1() const;
+    CV_WRAP Size size() const;
+    CV_WRAP bool empty() const;
+
+    // Please see cv::Mat for descriptions
+    int flags;
+    int rows, cols;
+    CV_PROP size_t step;
+
+    uchar* data;
+    int* refcount;
+
+    uchar* datastart;
+    const uchar* dataend;
+
+    AllocType alloc_type;
+};
+
+/** @brief Page-locks the memory of matrix and maps it for the device(s).
+
+@param m Input matrix.
+ */
+CV_EXPORTS_W void registerPageLocked(Mat& m);
+
+/** @brief Unmaps the memory of matrix and makes it pageable again.
+
+@param m Input matrix.
+ */
+CV_EXPORTS_W void unregisterPageLocked(Mat& m);
+
+//===================================================================================
+// Stream
+//===================================================================================
+
+/** @brief This class encapsulates a queue of asynchronous calls.
+
+@note Currently, you may face problems if an operation is enqueued twice with different data. Some
+functions use the constant GPU memory, and next call may update the memory before the previous one
+has been finished. But calling different operations asynchronously is safe because each operation
+has its own constant buffer. Memory copy/upload/download/set operations to the buffers you hold are
+also safe.
+
+@note The Stream class is not thread-safe. Please use different Stream objects for different CPU threads.
+
+@code
+void thread1()
+{
+    cv::cuda::Stream stream1;
+    cv::cuda::func1(..., stream1);
+}
+
+void thread2()
+{
+    cv::cuda::Stream stream2;
+    cv::cuda::func2(..., stream2);
+}
+@endcode
+
+@note By default all CUDA routines are launched in Stream::Null() object, if the stream is not specified by user.
+In multi-threading environment the stream objects must be passed explicitly (see previous note).
+ */
+class CV_EXPORTS_W Stream
+{
+    typedef void (Stream::*bool_type)() const; // member-function-pointer type returned by operator bool_type() below (safe-bool idiom)
+    void this_type_does_not_support_comparisons() const {}
+
+public:
+    typedef void (*StreamCallback)(int status, void* userData); //!< callback signature accepted by enqueueHostCallback()
+
+    //! creates a new asynchronous stream
+    CV_WRAP Stream();
+
+    //! creates a new asynchronous stream with custom allocator
+    CV_WRAP Stream(const Ptr<GpuMat::Allocator>& allocator);
+
+    /** @brief creates a new Stream using the cudaFlags argument to determine the behaviors of the stream
+
+    @note The cudaFlags parameter is passed to the underlying api cudaStreamCreateWithFlags() and
+    supports the same parameter values.
+    @code
+        // creates an OpenCV cuda::Stream that manages an asynchronous, non-blocking,
+        // non-default CUDA stream
+        cv::cuda::Stream cvStream(cudaStreamNonBlocking);
+    @endcode
+     */
+    CV_WRAP Stream(const size_t cudaFlags);
+
+    /** @brief Returns true if the current stream queue is finished. Otherwise, it returns false.
+    */
+    CV_WRAP bool queryIfComplete() const;
+
+    /** @brief Blocks the current CPU thread until all operations in the stream are complete.
+    */
+    CV_WRAP void waitForCompletion();
+
+    /** @brief Makes a compute stream wait on an event.
+    */
+    CV_WRAP void waitEvent(const Event& event);
+
+    /** @brief Adds a callback to be called on the host after all currently enqueued items in the stream have
+    completed.
+
+    @note Callbacks must not make any CUDA API calls. Callbacks must not perform any synchronization
+    that may depend on outstanding device work or other callbacks that are not mandated to run earlier.
+    Callbacks without a mandated order (in independent streams) execute in undefined order and may be
+    serialized.
+     */
+    void enqueueHostCallback(StreamCallback callback, void* userData);
+
+    //! returns the Stream object for the default CUDA stream
+    CV_WRAP static Stream& Null();
+
+    //! returns true if stream object is not default (!= 0)
+    operator bool_type() const;
+
+    //! returns a pointer to the CUDA stream
+    CV_WRAP void* cudaPtr() const;
+
+    class Impl;
+
+private:
+    Ptr<Impl> impl_;
+    Stream(const Ptr<Impl>& impl);
+
+    friend struct StreamAccessor;
+    friend class BufferPool;
+    friend class DefaultDeviceInitializer;
+};
+
+
+/** @brief Bindings overload to create a Stream object from the address stored in an existing CUDA Runtime API stream pointer (cudaStream_t).
+@param cudaStreamMemoryAddress Memory address stored in a CUDA Runtime API stream pointer (cudaStream_t). The created Stream object does not perform any allocation or deallocation and simply wraps existing raw CUDA Runtime API stream pointer.
+@note Overload for generation of bindings only, not exported or intended for use internally from C++.
+ */
+CV_EXPORTS_W Stream wrapStream(size_t cudaStreamMemoryAddress);
+
+class CV_EXPORTS_W Event
+{
+public:
+    enum CreateFlags
+    {
+        DEFAULT        = 0x00,  /**< Default event flag */
+        BLOCKING_SYNC  = 0x01,  /**< Event uses blocking synchronization */
+        DISABLE_TIMING = 0x02,  /**< Event will not record timing data */
+        INTERPROCESS   = 0x04   /**< Event is suitable for interprocess use. DisableTiming must be set */
+    };
+
+    CV_WRAP explicit Event(const Event::CreateFlags flags = Event::CreateFlags::DEFAULT); //!< constructs an event; flags default to DEFAULT
+
+    //! records an event (on Stream::Null() when no stream is given)
+    CV_WRAP void record(Stream& stream = Stream::Null());
+
+    //! queries an event's status
+    CV_WRAP bool queryIfComplete() const;
+
+    //! waits for an event to complete
+    CV_WRAP void waitForCompletion();
+
+    //! computes the elapsed time between events
+    CV_WRAP static float elapsedTime(const Event& start, const Event& end);
+
+    class Impl; // forward declaration only; not defined in this header section
+
+private:
+    Ptr<Impl> impl_;
+    Event(const Ptr<Impl>& impl);
+
+    friend struct EventAccessor;
+};
+CV_ENUM_FLAGS(Event::CreateFlags)
+
+//! @} cudacore_struct
+
+//===================================================================================
+// Initialization & Info
+//===================================================================================
+
+//! @addtogroup cudacore_init
+//! @{
+
+/** @brief Returns the number of installed CUDA-enabled devices.
+
+Use this function before any other CUDA functions calls. If OpenCV is compiled without CUDA support,
+this function returns 0. If the CUDA driver is not installed, or is incompatible, this function
+returns -1.
+ */
+CV_EXPORTS_W int getCudaEnabledDeviceCount();
+
+/** @brief Sets a device and initializes it for the current thread.
+
+@param device System index of a CUDA device starting with 0.
+
+If the call of this function is omitted, a default device is initialized at the first CUDA usage.
+ */
+CV_EXPORTS_W void setDevice(int device);
+
+/** @brief Returns the current device index set by cuda::setDevice or initialized by default.
+ */
+CV_EXPORTS_W int getDevice();
+
+/** @brief Explicitly destroys and cleans up all resources associated with the current device in the current
+process.
+
+Any subsequent API call to this device will reinitialize the device.
+ */
+CV_EXPORTS_W void resetDevice();
+
+/** @brief Enumeration providing CUDA computing features. Values are compute capability major * 10 + minor.
+ */
+enum FeatureSet
+{
+    FEATURE_SET_COMPUTE_10 = 10,
+    FEATURE_SET_COMPUTE_11 = 11,
+    FEATURE_SET_COMPUTE_12 = 12,
+    FEATURE_SET_COMPUTE_13 = 13,
+    FEATURE_SET_COMPUTE_20 = 20,
+    FEATURE_SET_COMPUTE_21 = 21,
+    FEATURE_SET_COMPUTE_30 = 30,
+    FEATURE_SET_COMPUTE_32 = 32,
+    FEATURE_SET_COMPUTE_35 = 35,
+    FEATURE_SET_COMPUTE_50 = 50,
+    // Feature names below are aliases of the compute-capability values above.
+    GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,
+    SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
+    NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13,
+    WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30,
+    DYNAMIC_PARALLELISM = FEATURE_SET_COMPUTE_35
+};
+
+//! checks whether current device supports the given feature
+CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
+
+/** @brief Class providing a set of static methods to check which NVIDIA\* card architectures the CUDA module was
+built for.
+
+According to the CUDA C Programming Guide Version 3.2: "PTX code produced for some specific compute
+capability can always be compiled to binary code of greater or equal compute capability".
+ */
+class CV_EXPORTS_W TargetArchs
+{
+public:
+    /** @brief Checks whether the module was built with support for the given feature.
+
+    @param feature_set Features to be checked. See cuda::FeatureSet.
+     */
+    static bool builtWith(FeatureSet feature_set);
+
+    /** @brief Set of methods that check whether the module contains intermediate (PTX) or binary CUDA
+    code for the given architecture(s).
+
+    @param major Major compute capability version.
+    @param minor Minor compute capability version.
+     */
+    CV_WRAP static bool has(int major, int minor);
+    CV_WRAP static bool hasPtx(int major, int minor);
+    CV_WRAP static bool hasBin(int major, int minor);
+
+    CV_WRAP static bool hasEqualOrLessPtx(int major, int minor);
+    CV_WRAP static bool hasEqualOrGreater(int major, int minor);
+    CV_WRAP static bool hasEqualOrGreaterPtx(int major, int minor);
+    CV_WRAP static bool hasEqualOrGreaterBin(int major, int minor);
+};
+
+/** @brief Class providing functionality for querying the specified GPU properties.
+ */
+class CV_EXPORTS_W DeviceInfo
+{
+public:
+    //! creates DeviceInfo object for the current GPU
+    CV_WRAP DeviceInfo();
+
+    /** @brief The constructors.
+
+    @param device_id System index of the CUDA device starting with 0.
+
+    Constructs the DeviceInfo object for the specified device. If the device_id parameter is omitted, it
+    constructs an object for the current device.
+     */
+    CV_WRAP DeviceInfo(int device_id);
+
+    /** @brief Returns system index of the CUDA device starting with 0.
+    */
+    CV_WRAP int deviceID() const;
+
+    //! ASCII string identifying device
+    const char* name() const;
+
+    //! global memory available on device in bytes
+    CV_WRAP size_t totalGlobalMem() const;
+
+    //! shared memory available per block in bytes
+    CV_WRAP size_t sharedMemPerBlock() const;
+
+    //! 32-bit registers available per block
+    CV_WRAP int regsPerBlock() const;
+
+    //! warp size in threads
+    CV_WRAP int warpSize() const;
+
+    //! maximum pitch in bytes allowed by memory copies
+    CV_WRAP size_t memPitch() const;
+
+    //! maximum number of threads per block
+    CV_WRAP int maxThreadsPerBlock() const;
+
+    //! maximum size of each dimension of a block
+    CV_WRAP Vec3i maxThreadsDim() const;
+
+    //! maximum size of each dimension of a grid
+    CV_WRAP Vec3i maxGridSize() const;
+
+    //! clock frequency in kilohertz
+    CV_WRAP int clockRate() const;
+
+    //! constant memory available on device in bytes
+    CV_WRAP size_t totalConstMem() const;
+
+    //! major compute capability
+    CV_WRAP int majorVersion() const;
+
+    //! minor compute capability
+    CV_WRAP int minorVersion() const;
+
+    //! alignment requirement for textures
+    CV_WRAP size_t textureAlignment() const;
+
+    //! pitch alignment requirement for texture references bound to pitched memory
+    CV_WRAP size_t texturePitchAlignment() const;
+
+    //! number of multiprocessors on device
+    CV_WRAP int multiProcessorCount() const;
+
+    //! specifies whether there is a run time limit on kernels
+    CV_WRAP bool kernelExecTimeoutEnabled() const;
+
+    //! device is integrated as opposed to discrete
+    CV_WRAP bool integrated() const;
+
+    //! device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer
+    CV_WRAP bool canMapHostMemory() const;
+
+    enum ComputeMode
+    {
+        ComputeModeDefault,         /**< default compute mode (Multiple threads can use cudaSetDevice with this device) */
+        ComputeModeExclusive,       /**< compute-exclusive-thread mode (Only one thread in one process will be able to use cudaSetDevice with this device) */
+        ComputeModeProhibited,      /**< compute-prohibited mode (No threads can use cudaSetDevice with this device) */
+        ComputeModeExclusiveProcess /**< compute-exclusive-process mode (Many threads in one process will be able to use cudaSetDevice with this device) */
+    };
+
+    //! compute mode
+    CV_WRAP DeviceInfo::ComputeMode computeMode() const;
+
+    //! maximum 1D texture size
+    CV_WRAP int maxTexture1D() const;
+
+    //! maximum 1D mipmapped texture size
+    CV_WRAP int maxTexture1DMipmap() const;
+
+    //! maximum size for 1D textures bound to linear memory
+    CV_WRAP int maxTexture1DLinear() const;
+
+    //! maximum 2D texture dimensions
+    CV_WRAP Vec2i maxTexture2D() const;
+
+    //! maximum 2D mipmapped texture dimensions
+    CV_WRAP Vec2i maxTexture2DMipmap() const;
+
+    //! maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory
+    CV_WRAP Vec3i maxTexture2DLinear() const;
+
+    //! maximum 2D texture dimensions if texture gather operations have to be performed
+    CV_WRAP Vec2i maxTexture2DGather() const;
+
+    //! maximum 3D texture dimensions
+    CV_WRAP Vec3i maxTexture3D() const;
+
+    //! maximum Cubemap texture dimensions
+    CV_WRAP int maxTextureCubemap() const;
+
+    //! maximum 1D layered texture dimensions
+    CV_WRAP Vec2i maxTexture1DLayered() const;
+
+    //! maximum 2D layered texture dimensions
+    CV_WRAP Vec3i maxTexture2DLayered() const;
+
+    //! maximum Cubemap layered texture dimensions
+    CV_WRAP Vec2i maxTextureCubemapLayered() const;
+
+    //! maximum 1D surface size
+    CV_WRAP int maxSurface1D() const;
+
+    //! maximum 2D surface dimensions
+    CV_WRAP Vec2i maxSurface2D() const;
+
+    //! maximum 3D surface dimensions
+    CV_WRAP Vec3i maxSurface3D() const;
+
+    //! maximum 1D layered surface dimensions
+    CV_WRAP Vec2i maxSurface1DLayered() const;
+
+    //! maximum 2D layered surface dimensions
+    CV_WRAP Vec3i maxSurface2DLayered() const;
+
+    //! maximum Cubemap surface dimensions
+    CV_WRAP int maxSurfaceCubemap() const;
+
+    //! maximum Cubemap layered surface dimensions
+    CV_WRAP Vec2i maxSurfaceCubemapLayered() const;
+
+    //! alignment requirements for surfaces
+    CV_WRAP size_t surfaceAlignment() const;
+
+    //! device can possibly execute multiple kernels concurrently
+    CV_WRAP bool concurrentKernels() const;
+
+    //! device has ECC support enabled
+    CV_WRAP bool ECCEnabled() const;
+
+    //! PCI bus ID of the device
+    CV_WRAP int pciBusID() const;
+
+    //! PCI device ID of the device
+    CV_WRAP int pciDeviceID() const;
+
+    //! PCI domain ID of the device
+    CV_WRAP int pciDomainID() const;
+
+    //! true if device is a Tesla device using TCC driver, false otherwise
+    CV_WRAP bool tccDriver() const;
+
+    //! number of asynchronous engines
+    CV_WRAP int asyncEngineCount() const;
+
+    //! device shares a unified address space with the host
+    CV_WRAP bool unifiedAddressing() const;
+
+    //! peak memory clock frequency in kilohertz
+    CV_WRAP int memoryClockRate() const;
+
+    //! global memory bus width in bits
+    CV_WRAP int memoryBusWidth() const;
+
+    //! size of L2 cache in bytes
+    CV_WRAP int l2CacheSize() const;
+
+    //! maximum resident threads per multiprocessor
+    CV_WRAP int maxThreadsPerMultiProcessor() const;
+
+    //! gets free and total device memory
+    CV_WRAP void queryMemory(size_t& totalMemory, size_t& freeMemory) const;
+    CV_WRAP size_t freeMemory() const;
+    CV_WRAP size_t totalMemory() const;
+
+    /** @brief Provides information on CUDA feature support.
+
+    @param feature_set Features to be checked. See cuda::FeatureSet.
+
+    This function returns true if the device has the specified CUDA feature. Otherwise, it returns false
+     */
+    bool supports(FeatureSet feature_set) const;
+
+    /** @brief Checks the CUDA module and device compatibility.
+
+    This function returns true if the CUDA module can be run on the specified device. Otherwise, it
+    returns false.
+     */
+    CV_WRAP bool isCompatible() const;
+
+private:
+    int device_id_;
+};
+
+CV_EXPORTS_W void printCudaDeviceInfo(int device);
+CV_EXPORTS_W void printShortCudaDeviceInfo(int device);
+
+/** @brief Converts an array to half precision floating number.
+
+@param _src input array.
+@param _dst output array.
+@param stream Stream for the asynchronous version.
+@sa convertFp16
+*/
+CV_EXPORTS void convertFp16(InputArray _src, OutputArray _dst, Stream& stream = Stream::Null());
+
+//! @} cudacore_init
+
+}} // namespace cv { namespace cuda {
+
+
+#include "opencv2/core/cuda.inl.hpp"
+
+#endif /* OPENCV_CORE_CUDA_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda.inl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda.inl.hpp
new file mode 100644
index 000000000000..9390b3a529f6
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda.inl.hpp
@@ -0,0 +1,763 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_CUDAINL_HPP
+#define OPENCV_CORE_CUDAINL_HPP
+
+#include "opencv2/core/cuda.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda {
+
+//===================================================================================
+// GpuMat
+//===================================================================================
+
+inline
+GpuMat::GpuMat(Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
+{}
+
+inline
+GpuMat::GpuMat(int rows_, int cols_, int type_, Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
+{
+    if (rows_ > 0 && cols_ > 0)
+        create(rows_, cols_, type_);
+}
+
+inline
+GpuMat::GpuMat(Size size_, int type_, Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
+{
+    if (size_.height > 0 && size_.width > 0)
+        create(size_.height, size_.width, type_);
+}
+
+// WARNING: unreachable code using Ninja
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(push)
+#pragma warning(disable: 4702)
+#endif
+inline
+GpuMat::GpuMat(int rows_, int cols_, int type_, Scalar s_, Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
+{
+    if (rows_ > 0 && cols_ > 0)
+    {
+        create(rows_, cols_, type_);
+        setTo(s_);
+    }
+}
+
+inline
+GpuMat::GpuMat(Size size_, int type_, Scalar s_, Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
+{
+    if (size_.height > 0 && size_.width > 0)
+    {
+        create(size_.height, size_.width, type_);
+        setTo(s_);
+    }
+}
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(pop)
+#endif
+
+inline
+GpuMat::GpuMat(const GpuMat& m)
+    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), allocator(m.allocator)
+{
+    if (refcount)
+        CV_XADD(refcount, 1);
+}
+
+inline
+GpuMat::GpuMat(InputArray arr, Allocator* allocator_) :
+    flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
+{
+    upload(arr);
+}
+
+inline
+GpuMat::~GpuMat()
+{
+    release();
+}
+
+inline
+GpuMat& GpuMat::operator =(const GpuMat& m)
+{
+    // Copy-and-swap: after the swap the local holds this object's old fields
+    // and releases them in its destructor. Self-assignment is a no-op.
+    if (this == &m)
+        return *this;
+    GpuMat previous(m);
+    previous.swap(*this);
+    return *this;
+}
+
+inline
+void GpuMat::create(Size size_, int type_)
+{
+    create(size_.height, size_.width, type_);
+}
+
+inline
+void GpuMat::swap(GpuMat& b)
+{
+    std::swap(flags, b.flags);
+    std::swap(rows, b.rows);
+    std::swap(cols, b.cols);
+    std::swap(step, b.step);
+    std::swap(data, b.data);
+    std::swap(datastart, b.datastart);
+    std::swap(dataend, b.dataend);
+    std::swap(refcount, b.refcount);
+    std::swap(allocator, b.allocator);
+}
+
+// Returns a new GpuMat filled by copyTo() from this matrix.
+inline GpuMat GpuMat::clone() const
+{
+    GpuMat duplicate;
+    copyTo(duplicate);
+    return duplicate;
+}
+
+// WARNING: unreachable code using Ninja
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(push)
+#pragma warning(disable: 4702)
+#endif
+inline
+void GpuMat::copyTo(OutputArray dst, InputArray mask) const
+{
+    copyTo(dst, mask, Stream::Null());
+}
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(pop)
+#endif
+
+inline
+GpuMat& GpuMat::setTo(Scalar s)
+{
+    return setTo(s, Stream::Null());
+}
+
+inline
+GpuMat& GpuMat::setTo(Scalar s, InputArray mask)
+{
+    return setTo(s, mask, Stream::Null());
+}
+
+// WARNING: unreachable code using Ninja
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(push)
+#pragma warning(disable: 4702)
+#endif
+inline
+void GpuMat::convertTo(OutputArray dst, int rtype) const
+{
+    convertTo(dst, rtype, Stream::Null());
+}
+
+inline
+void GpuMat::convertTo(OutputArray dst, int rtype, double alpha, double beta) const
+{
+    convertTo(dst, rtype, alpha, beta, Stream::Null());
+}
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(pop)
+#endif
+
+inline
+void GpuMat::convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const
+{
+    convertTo(dst, rtype, alpha, 0.0, stream);
+}
+
+// A negative _type assigns *this to m unchanged; otherwise m receives the result of convertTo(m, _type).
+inline void GpuMat::assignTo(GpuMat& m, int _type) const
+{
+    if (_type >= 0)
+        convertTo(m, _type);
+    else
+        m = *this;
+}
+
+inline
+uchar* GpuMat::ptr(int y)
+{
+    CV_DbgAssert( (unsigned)y < (unsigned)rows );
+    return data + step * y;
+}
+
+inline
+const uchar* GpuMat::ptr(int y) const
+{
+    CV_DbgAssert( (unsigned)y < (unsigned)rows );
+    return data + step * y;
+}
+
+template<typename _Tp> inline
+_Tp* GpuMat::ptr(int y)
+{
+    return (_Tp*)ptr(y);
+}
+
+template<typename _Tp> inline
+const _Tp* GpuMat::ptr(int y) const
+{
+    return (const _Tp*)ptr(y);
+}
+
+template <class T> inline
+GpuMat::operator PtrStepSz<T>() const
+{
+    return PtrStepSz<T>(rows, cols, (T*)data, step);
+}
+
+template <class T> inline
+GpuMat::operator PtrStep<T>() const
+{
+    return PtrStep<T>((T*)data, step);
+}
+
+inline
+GpuMat GpuMat::row(int y) const
+{
+    return GpuMat(*this, Range(y, y+1), Range::all());
+}
+
+inline
+GpuMat GpuMat::col(int x) const
+{
+    return GpuMat(*this, Range::all(), Range(x, x+1));
+}
+
+inline
+GpuMat GpuMat::rowRange(int startrow, int endrow) const
+{
+    return GpuMat(*this, Range(startrow, endrow), Range::all());
+}
+
+inline
+GpuMat GpuMat::rowRange(Range r) const
+{
+    return GpuMat(*this, r, Range::all());
+}
+
+inline
+GpuMat GpuMat::colRange(int startcol, int endcol) const
+{
+    return GpuMat(*this, Range::all(), Range(startcol, endcol));
+}
+
+inline
+GpuMat GpuMat::colRange(Range r) const
+{
+    return GpuMat(*this, Range::all(), r);
+}
+
+inline
+GpuMat GpuMat::operator ()(Range rowRange_, Range colRange_) const
+{
+    return GpuMat(*this, rowRange_, colRange_);
+}
+
+inline
+GpuMat GpuMat::operator ()(Rect roi) const
+{
+    return GpuMat(*this, roi);
+}
+
+inline
+bool GpuMat::isContinuous() const
+{
+    return (flags & Mat::CONTINUOUS_FLAG) != 0;
+}
+
+inline
+size_t GpuMat::elemSize() const
+{
+    return CV_ELEM_SIZE(flags);
+}
+
+inline
+size_t GpuMat::elemSize1() const
+{
+    return CV_ELEM_SIZE1(flags);
+}
+
+inline
+int GpuMat::type() const
+{
+    return CV_MAT_TYPE(flags);
+}
+
+inline
+int GpuMat::depth() const
+{
+    return CV_MAT_DEPTH(flags);
+}
+
+inline
+int GpuMat::channels() const
+{
+    return CV_MAT_CN(flags);
+}
+
+inline
+size_t GpuMat::step1() const
+{
+    return step / elemSize1();
+}
+
+inline
+Size GpuMat::size() const
+{
+    return Size(cols, rows);
+}
+
+inline
+bool GpuMat::empty() const
+{
+    return data == 0;
+}
+
+inline
+void* GpuMat::cudaPtr() const
+{
+    return data;
+}
+
+static inline
+GpuMat createContinuous(int rows, int cols, int type)
+{
+    GpuMat m;
+    createContinuous(rows, cols, type, m);
+    return m;
+}
+
+static inline
+void createContinuous(Size size, int type, OutputArray arr)
+{
+    createContinuous(size.height, size.width, type, arr);
+}
+
+static inline
+GpuMat createContinuous(Size size, int type)
+{
+    GpuMat m;
+    createContinuous(size, type, m);
+    return m;
+}
+
+static inline
+void ensureSizeIsEnough(Size size, int type, OutputArray arr)
+{
+    ensureSizeIsEnough(size.height, size.width, type, arr);
+}
+
+static inline
+void swap(GpuMat& a, GpuMat& b)
+{
+    a.swap(b);
+}
+
+//===================================================================================
+// GpuMatND
+//===================================================================================
+
+inline
+GpuMatND::GpuMatND() :
+    flags(0), dims(0), data(nullptr), offset(0)
+{
+}
+
+inline
+GpuMatND::GpuMatND(SizeArray _size, int _type) :
+    flags(0), dims(0), data(nullptr), offset(0)
+{
+    create(std::move(_size), _type);
+}
+
+inline
+void GpuMatND::swap(GpuMatND& m) noexcept
+{
+    std::swap(*this, m);
+}
+
+inline
+bool GpuMatND::isContinuous() const
+{
+    return (flags & Mat::CONTINUOUS_FLAG) != 0;
+}
+
+inline
+bool GpuMatND::isSubmatrix() const
+{
+    return (flags & Mat::SUBMATRIX_FLAG) != 0;
+}
+
+inline
+size_t GpuMatND::elemSize() const
+{
+    return CV_ELEM_SIZE(flags);
+}
+
+inline
+size_t GpuMatND::elemSize1() const
+{
+    return CV_ELEM_SIZE1(flags);
+}
+
+inline
+bool GpuMatND::empty() const
+{
+    return data == nullptr;
+}
+
+inline
+bool GpuMatND::external() const
+{
+    return !empty() && data_.use_count() == 0;
+}
+
+inline
+uchar* GpuMatND::getDevicePtr() const
+{
+    return data + offset;
+}
+
+// Product of all dimension sizes; with no dimensions the loop does not run and the result is 1.
+inline size_t GpuMatND::total() const
+{
+    size_t count = 1;
+    for (const auto extent : size)
+        count *= extent;
+    return count;
+}
+
+// Returns size[0] * step[0]; returns 0 instead of indexing when size or step has no entries (e.g. default-constructed).
+inline size_t GpuMatND::totalMemSize() const
+{
+    return (size.empty() || step.empty()) ? 0 : size[0] * step[0];
+}
+
+inline
+int GpuMatND::type() const
+{
+    return CV_MAT_TYPE(flags);
+}
+
+//===================================================================================
+// HostMem
+//===================================================================================
+
+inline
+HostMem::HostMem(AllocType alloc_type_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
+{
+}
+
+inline
+HostMem::HostMem(const HostMem& m)
+    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)
+{
+    if( refcount )
+        CV_XADD(refcount, 1);
+}
+
+inline
+HostMem::HostMem(int rows_, int cols_, int type_, AllocType alloc_type_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
+{
+    if (rows_ > 0 && cols_ > 0)
+        create(rows_, cols_, type_);
+}
+
+inline
+HostMem::HostMem(Size size_, int type_, AllocType alloc_type_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
+{
+    if (size_.height > 0 && size_.width > 0)
+        create(size_.height, size_.width, type_);
+}
+
+inline
+HostMem::HostMem(InputArray arr, AllocType alloc_type_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
+{
+    arr.getMat().copyTo(*this);
+}
+
+inline
+HostMem::~HostMem()
+{
+    release();
+}
+
+inline
+HostMem& HostMem::operator =(const HostMem& m)
+{
+    // Copy-and-swap: after the swap the local holds this object's old fields
+    // and releases them in its destructor. Self-assignment is a no-op.
+    if (this == &m)
+        return *this;
+    HostMem previous(m);
+    previous.swap(*this);
+    return *this;
+}
+
+inline
+void HostMem::swap(HostMem& b)
+{
+    std::swap(flags, b.flags);
+    std::swap(rows, b.rows);
+    std::swap(cols, b.cols);
+    std::swap(step, b.step);
+    std::swap(data, b.data);
+    std::swap(datastart, b.datastart);
+    std::swap(dataend, b.dataend);
+    std::swap(refcount, b.refcount);
+    std::swap(alloc_type, b.alloc_type);
+}
+
+// Allocates a HostMem of the same size, type and alloc_type, then copies this buffer into it.
+inline HostMem HostMem::clone() const
+{
+    HostMem duplicate(size(), type(), alloc_type);
+    createMatHeader().copyTo(duplicate);
+    return duplicate;
+}
+
+inline
+void HostMem::create(Size size_, int type_)
+{
+    create(size_.height, size_.width, type_);
+}
+
+inline
+Mat HostMem::createMatHeader() const
+{
+    return Mat(size(), type(), data, step);
+}
+
+inline
+bool HostMem::isContinuous() const
+{
+    return (flags & Mat::CONTINUOUS_FLAG) != 0;
+}
+
+inline
+size_t HostMem::elemSize() const
+{
+    return CV_ELEM_SIZE(flags);
+}
+
+inline
+size_t HostMem::elemSize1() const
+{
+    return CV_ELEM_SIZE1(flags);
+}
+
+inline
+int HostMem::type() const
+{
+    return CV_MAT_TYPE(flags);
+}
+
+inline
+int HostMem::depth() const
+{
+    return CV_MAT_DEPTH(flags);
+}
+
+inline
+int HostMem::channels() const
+{
+    return CV_MAT_CN(flags);
+}
+
+inline
+size_t HostMem::step1() const
+{
+    return step / elemSize1();
+}
+
+inline
+Size HostMem::size() const
+{
+    return Size(cols, rows);
+}
+
+inline
+bool HostMem::empty() const
+{
+    return data == 0;
+}
+
+static inline
+void swap(HostMem& a, HostMem& b)
+{
+    a.swap(b);
+}
+
+//===================================================================================
+// Stream
+//===================================================================================
+
+inline
+Stream::Stream(const Ptr<Impl>& impl)
+    : impl_(impl)
+{
+}
+
+//===================================================================================
+// Event
+//===================================================================================
+
+inline
+Event::Event(const Ptr<Impl>& impl)
+    : impl_(impl)
+{
+}
+
+//===================================================================================
+// Initialization & Info
+//===================================================================================
+
+// WARNING: unreachable code using Ninja
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(push)
+#pragma warning(disable: 4702)
+#endif
+inline
+bool TargetArchs::has(int major, int minor)
+{
+    return hasPtx(major, minor) || hasBin(major, minor);
+}
+
+inline
+bool TargetArchs::hasEqualOrGreater(int major, int minor)
+{
+    return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
+}
+
+inline
+DeviceInfo::DeviceInfo()
+    : device_id_(getDevice())  // index returned by cuda::getDevice()
+{
+}
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(pop)
+#endif
+
+inline
+DeviceInfo::DeviceInfo(int device_id)
+{
+    CV_Assert( device_id >= 0 && device_id < getCudaEnabledDeviceCount() );
+    device_id_ = device_id;
+}
+
+// WARNING: unreachable code using Ninja
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(push)
+#pragma warning(disable: 4702)
+#endif
+inline
+int DeviceInfo::deviceID() const
+{
+    return device_id_;
+}
+
+// Runs queryMemory() and returns only the free-memory output.
+inline size_t DeviceInfo::freeMemory() const
+{
+    size_t totalBytes = 0, freeBytes = 0;
+    queryMemory(totalBytes, freeBytes);
+    return freeBytes;
+}
+
+// Runs queryMemory() and returns only the total-memory output.
+inline size_t DeviceInfo::totalMemory() const
+{
+    size_t totalBytes = 0, freeBytes = 0;
+    queryMemory(totalBytes, freeBytes);
+    return totalBytes;
+}
+
+// True when the device's compute capability, packed as major * 10 + minor, is at least feature_set.
+inline bool DeviceInfo::supports(FeatureSet feature_set) const
+{
+    const int capability = majorVersion() * 10 + minorVersion();
+    return feature_set <= capability;
+}
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(pop)
+#endif
+
+
+}} // namespace cv { namespace cuda {
+
+//===================================================================================
+// Mat
+//===================================================================================
+
+namespace cv {
+
+inline
+Mat::Mat(const cuda::GpuMat& m)
+    : flags(0), dims(0), rows(0), cols(0), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows)
+{
+    m.download(*this);
+}
+
+}
+
+//! @endcond
+
+#endif // OPENCV_CORE_CUDAINL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/block.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/block.hpp
new file mode 100644
index 000000000000..c277f0ea9c41
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/block.hpp
@@ -0,0 +1,211 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_DEVICE_BLOCK_HPP
+#define OPENCV_CUDA_DEVICE_BLOCK_HPP
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    struct Block
+    {
+        static __device__ __forceinline__ unsigned int id()
+        {
+            return blockIdx.x;
+        }
+
+        static __device__ __forceinline__ unsigned int stride()
+        {
+            return blockDim.x * blockDim.y * blockDim.z;
+        }
+
+        static __device__ __forceinline__ void sync()
+        {
+            __syncthreads();
+        }
+
+        static __device__ __forceinline__ int flattenedThreadId()
+        {
+            return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
+        }
+
+        template<typename It, typename T>
+        static __device__ __forceinline__ void fill(It beg, It end, const T& value)
+        {
+            // Each thread starts at its flattened index and advances by the block's thread count.
+            const int step = stride();
+            It cur = beg + flattenedThreadId();
+            for (; cur < end; cur += step)
+                *cur = value;
+        }
+
+        template<typename OutIt, typename T>
+        static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
+        {
+            // Thread k writes value + k, value + k + step, ... at positions k, k + step, ...
+            const int step = stride();
+            const int first = flattenedThreadId();
+            value += first;
+            for (OutIt cur = beg + first; cur < end; cur += step, value += step)
+                *cur = value;
+        }
+
+        template<typename InIt, typename OutIt>
+        static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
+        {
+            // Source and destination cursors both start at this thread's flattened index.
+            const int step = stride();
+            InIt  src = beg + flattenedThreadId();
+            OutIt dst = out + (src - beg);
+            for (; src < end; src += step, dst += step)
+                *dst = *src;
+        }
+
+        template<typename InIt, typename OutIt, class UnOp>
+        static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op)
+        {
+            // Applies op to this thread's elements, writing each result at the matching output offset.
+            const int step = stride();
+            InIt  src = beg + flattenedThreadId();
+            OutIt dst = out + (src - beg);
+            for (; src < end; src += step, dst += step)
+                *dst = op(*src);
+        }
+
+        template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
+        static __device__ __forceinline__ void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
+        {
+            // Walks both inputs in step; the loop is bounded by the first range only.
+            const int step = stride();
+            InIt1 lhs = beg1 + flattenedThreadId();
+            InIt2 rhs = beg2 + flattenedThreadId();
+            OutIt dst = out + (lhs - beg1);
+            for (; lhs < end1; lhs += step, rhs += step, dst += step)
+                *dst = op(*lhs, *rhs);
+        }
+
+        template<int CTA_SIZE, typename T, class BinOp>
+        static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
+        {   // Folds `buffer` in place with `op`, halving the active range each step; thread 0 writes its result to buffer[0].
+            int tid = flattenedThreadId();
+            T val =  buffer[tid]; // running partial result of this thread (assumes one buffer slot per thread - TODO confirm)
+
+            if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
+            if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
+            if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
+            if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }
+
+            if (tid < 32) // NOTE(review): no barrier between the steps below - presumably relies on warp-lockstep execution; confirm
+            {
+                if (CTA_SIZE >=   64) { buffer[tid] = val = op(val, buffer[tid +  32]); }
+                if (CTA_SIZE >=   32) { buffer[tid] = val = op(val, buffer[tid +  16]); }
+                if (CTA_SIZE >=   16) { buffer[tid] = val = op(val, buffer[tid +   8]); }
+                if (CTA_SIZE >=    8) { buffer[tid] = val = op(val, buffer[tid +   4]); }
+                if (CTA_SIZE >=    4) { buffer[tid] = val = op(val, buffer[tid +   2]); }
+                if (CTA_SIZE >=    2) { buffer[tid] = val = op(val, buffer[tid +   1]); }
+            }
+        }
+
+        template<int CTA_SIZE, typename T, class BinOp>
+        static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
+        {   // Seeds buffer[tid] with `init`, folds the buffer in place with `op`, and returns buffer[0] after a final barrier.
+            int tid = flattenedThreadId();
+            T val =  buffer[tid] = init; // `buffer` is written here, so its previous contents are not used
+            __syncthreads();
+
+            if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
+            if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
+            if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
+            if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }
+
+            if (tid < 32) // NOTE(review): no barrier between the steps below - presumably relies on warp-lockstep execution; confirm
+            {
+                if (CTA_SIZE >=   64) { buffer[tid] = val = op(val, buffer[tid +  32]); }
+                if (CTA_SIZE >=   32) { buffer[tid] = val = op(val, buffer[tid +  16]); }
+                if (CTA_SIZE >=   16) { buffer[tid] = val = op(val, buffer[tid +   8]); }
+                if (CTA_SIZE >=    8) { buffer[tid] = val = op(val, buffer[tid +   4]); }
+                if (CTA_SIZE >=    4) { buffer[tid] = val = op(val, buffer[tid +   2]); }
+                if (CTA_SIZE >=    2) { buffer[tid] = val = op(val, buffer[tid +   1]); }
+            }
+            __syncthreads(); // barrier before every thread reads the value written by thread 0
+            return buffer[0];
+        }
+
+        template <typename T, class BinOp>
+        static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
+        {   // Folds data[0 .. n) in place with `op` for any n; thread 0 writes the last fold to data[0].
+            int ftid = flattenedThreadId();
+            int sft = stride();
+
+            if (sft < n) // more elements than stride(): first fold elements sft .. n-1 onto data[0 .. sft)
+            {
+                for (unsigned int i = sft + ftid; i < n; i += sft)
+                    data[ftid] = op(data[ftid], data[i]);
+
+                __syncthreads();
+
+                n = sft;
+            }
+
+            while (n > 1)
+            {
+                unsigned int half = n/2;
+
+                if (ftid < half)
+                    data[ftid] = op(data[ftid], data[n - ftid - 1]); // pair element ftid with its mirror from the end
+
+                __syncthreads();
+
+                n = n - half; // ceil(n / 2) elements remain; for odd n the middle element is carried over unchanged
+            }
+        }
+    };
+}}}
+
+//! @endcond
+
+#endif /* OPENCV_CUDA_DEVICE_BLOCK_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/border_interpolate.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/border_interpolate.hpp
new file mode 100644
index 000000000000..874f705baf02
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/border_interpolate.hpp
@@ -0,0 +1,722 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_BORDER_INTERPOLATE_HPP
+#define OPENCV_CUDA_BORDER_INTERPOLATE_HPP
+
+#include "saturate_cast.hpp"
+#include "vec_traits.hpp"
+#include "vec_math.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    //////////////////////////////////////////////////////////////
+    // BrdConstant
+
+    template <typename D> struct BrdRowConstant
+    {   // Row accessor: in-range reads return data[x] converted to D, out-of-range reads return the constant `val`.
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdRowConstant(int width_, const D& val_ = VecTraits<D>::all(0)) : width(width_), val(val_) {}
+
+        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
+        {   // Only the lower bound is checked; x < width is not verified here.
+            return x >= 0 ? saturate_cast<D>(data[x]) : val;
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
+        {   // Only the upper bound is checked; x >= 0 is not verified here.
+            return x < width ? saturate_cast<D>(data[x]) : val;
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
+        {   // Both bounds are checked.
+            return (x >= 0 && x < width) ? saturate_cast<D>(data[x]) : val;
+        }
+
+        int width; // exclusive upper bound for valid x
+        D val;     // value returned for out-of-range x; defaults to VecTraits<D>::all(0)
+    };
+
+    template <typename D> struct BrdColConstant
+    {   // Column accessor: row y is read y * step bytes after `data`; out-of-range rows return the constant `val`.
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdColConstant(int height_, const D& val_ = VecTraits<D>::all(0)) : height(height_), val(val_) {}
+
+        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
+        {   // Only the lower bound is checked; y < height is not verified here.
+            return y >= 0 ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
+        {   // Only the upper bound is checked; y >= 0 is not verified here.
+            return y < height ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
+        {   // Both bounds are checked.
+            return (y >= 0 && y < height) ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;
+        }
+
+        int height; // exclusive upper bound for valid y
+        D val;      // value returned for out-of-range y; defaults to VecTraits<D>::all(0)
+    };
+
+    template <typename D> struct BrdConstant
+    {   // 2D accessor: (y, x) inside [0, height) x [0, width) reads the source, anything else returns the constant `val`.
+        typedef D result_type;
+
+        __host__ __device__ __forceinline__ BrdConstant(int height_, int width_, const D& val_ = VecTraits<D>::all(0)) : height(height_), width(width_), val(val_)
+        {
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
+        {   // Raw-pointer overload: row y starts y * step bytes after `data`.
+            return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(((const T*)((const uchar*)data + y * step))[x]) : val;
+        }
+
+        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
+        {   // Functor overload: the source is read through src(y, x).
+            return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;
+        }
+
+        int height; // exclusive upper bound for valid y
+        int width;  // exclusive upper bound for valid x
+        D val;      // value returned for out-of-range coordinates; defaults to VecTraits<D>::all(0)
+    };
+
+    //////////////////////////////////////////////////////////////
+    // BrdReplicate
+
+    template <typename D> struct BrdRowReplicate
+    {   // Row accessor that clamps the column index, so out-of-range reads return the nearest edge element.
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdRowReplicate(int width) : last_col(width - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdRowReplicate(int width, U) : last_col(width - 1) {} // second argument is ignored
+
+        __device__ __forceinline__ int idx_col_low(int x) const
+        {   // Clamps from below only.
+            return ::max(x, 0);
+        }
+
+        __device__ __forceinline__ int idx_col_high(int x) const
+        {   // Clamps from above only.
+            return ::min(x, last_col);
+        }
+
+        __device__ __forceinline__ int idx_col(int x) const
+        {   // Clamps on both sides.
+            return idx_col_low(idx_col_high(x));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
+        {   // Read with the lower-bound clamp only.
+            return saturate_cast<D>(data[idx_col_low(x)]);
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
+        {   // Read with the upper-bound clamp only.
+            return saturate_cast<D>(data[idx_col_high(x)]);
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
+        {   // Read with both clamps.
+            return saturate_cast<D>(data[idx_col(x)]);
+        }
+
+        int last_col; // width - 1, the largest valid column index
+    };
+
+    template <typename D> struct BrdColReplicate
+    {   // Column accessor that clamps the row index; row r is read r * step bytes after `data`.
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdColReplicate(int height) : last_row(height - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdColReplicate(int height, U) : last_row(height - 1) {} // second argument is ignored
+
+        __device__ __forceinline__ int idx_row_low(int y) const
+        {   // Clamps from below only.
+            return ::max(y, 0);
+        }
+
+        __device__ __forceinline__ int idx_row_high(int y) const
+        {   // Clamps from above only.
+            return ::min(y, last_row);
+        }
+
+        __device__ __forceinline__ int idx_row(int y) const
+        {   // Clamps on both sides.
+            return idx_row_low(idx_row_high(y));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
+        {   // Read with the lower-bound clamp only.
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
+        {   // Read with the upper-bound clamp only.
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step));
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
+        {   // Read with both clamps.
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step));
+        }
+
+        int last_row; // height - 1, the largest valid row index
+    };
+
+    template <typename D> struct BrdReplicate
+    {   // 2D accessor that clamps both coordinates, so out-of-range reads return the nearest edge element.
+        typedef D result_type;
+
+        __host__ __device__ __forceinline__ BrdReplicate(int height, int width) : last_row(height - 1), last_col(width - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdReplicate(int height, int width, U) : last_row(height - 1), last_col(width - 1) {} // third argument is ignored
+
+        __device__ __forceinline__ int idx_row_low(int y) const
+        {   // Clamps the row from below only.
+            return ::max(y, 0);
+        }
+
+        __device__ __forceinline__ int idx_row_high(int y) const
+        {   // Clamps the row from above only.
+            return ::min(y, last_row);
+        }
+
+        __device__ __forceinline__ int idx_row(int y) const
+        {   // Clamps the row on both sides.
+            return idx_row_low(idx_row_high(y));
+        }
+
+        __device__ __forceinline__ int idx_col_low(int x) const
+        {   // Clamps the column from below only.
+            return ::max(x, 0);
+        }
+
+        __device__ __forceinline__ int idx_col_high(int x) const
+        {   // Clamps the column from above only.
+            return ::min(x, last_col);
+        }
+
+        __device__ __forceinline__ int idx_col(int x) const
+        {   // Clamps the column on both sides.
+            return idx_col_low(idx_col_high(x));
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
+        {   // Raw-pointer overload: row r starts r * step bytes after `data`.
+            return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
+        }
+
+        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
+        {   // Functor overload: the source is read through src(row, col) with clamped indices.
+            return saturate_cast<D>(src(idx_row(y), idx_col(x)));
+        }
+
+        int last_row; // height - 1, the largest valid row index
+        int last_col; // width - 1, the largest valid column index
+    };
+
+    //////////////////////////////////////////////////////////////
+    // BrdReflect101
+
+    template <typename D> struct BrdRowReflect101
+    {   // Row accessor that mirrors out-of-range columns about the first/last element without repeating that element.
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdRowReflect101(int width) : last_col(width - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdRowReflect101(int width, U) : last_col(width - 1) {} // second argument is ignored
+
+        __device__ __forceinline__ int idx_col_low(int x) const
+        {   // Mirrors negative x about 0 (-1 -> 1, -2 -> 2), then wraps modulo the width.
+            return ::abs(x) % (last_col + 1);
+        }
+
+        __device__ __forceinline__ int idx_col_high(int x) const
+        {   // Mirrors x past last_col about last_col (last_col + 1 -> last_col - 1), then wraps modulo the width.
+            return ::abs(last_col - ::abs(last_col - x)) % (last_col + 1);
+        }
+
+        __device__ __forceinline__ int idx_col(int x) const
+        {   // Applies the high mapping first, then the low mapping.
+            return idx_col_low(idx_col_high(x));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
+        {   // Read through idx_col_low.
+            return saturate_cast<D>(data[idx_col_low(x)]);
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
+        {   // Read through idx_col_high.
+            return saturate_cast<D>(data[idx_col_high(x)]);
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
+        {   // Read through idx_col.
+            return saturate_cast<D>(data[idx_col(x)]);
+        }
+
+        int last_col; // width - 1, the largest valid column index
+    };
+
+    template <typename D> struct BrdColReflect101
+    {   // Column accessor that mirrors out-of-range rows about the first/last row without repeating that row.
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdColReflect101(int height) : last_row(height - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdColReflect101(int height, U) : last_row(height - 1) {} // second argument is ignored
+
+        __device__ __forceinline__ int idx_row_low(int y) const
+        {   // Mirrors negative y about 0 (-1 -> 1, -2 -> 2), then wraps modulo the height.
+            return ::abs(y) % (last_row + 1);
+        }
+
+        __device__ __forceinline__ int idx_row_high(int y) const
+        {   // Mirrors y past last_row about last_row (last_row + 1 -> last_row - 1), then wraps modulo the height.
+            return ::abs(last_row - ::abs(last_row - y)) % (last_row + 1);
+        }
+
+        __device__ __forceinline__ int idx_row(int y) const
+        {   // Applies the high mapping first, then the low mapping.
+            return idx_row_low(idx_row_high(y));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
+        {   // Row r is read r * step bytes after `data`, as a T (the pointee type), and only then converted to D.
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
+        {   // Same read as at_low, through idx_row_high.
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step));
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
+        {   // Same read as at_low, through idx_row.
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step));
+        }
+
+        int last_row; // height - 1, the largest valid row index
+    };
+
+    template <typename D> struct BrdReflect101
+    {   // 2D accessor that mirrors out-of-range coordinates about the first/last row or column without repeating it.
+        typedef D result_type;
+
+        __host__ __device__ __forceinline__ BrdReflect101(int height, int width) : last_row(height - 1), last_col(width - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdReflect101(int height, int width, U) : last_row(height - 1), last_col(width - 1) {} // third argument is ignored
+
+        __device__ __forceinline__ int idx_row_low(int y) const
+        {   // Mirrors negative y about 0 (-1 -> 1), then wraps modulo the height.
+            return ::abs(y) % (last_row + 1);
+        }
+
+        __device__ __forceinline__ int idx_row_high(int y) const
+        {   // Mirrors y past last_row about last_row (last_row + 1 -> last_row - 1), then wraps modulo the height.
+            return ::abs(last_row - ::abs(last_row - y)) % (last_row + 1);
+        }
+
+        __device__ __forceinline__ int idx_row(int y) const
+        {   // Applies the high mapping first, then the low mapping.
+            return idx_row_low(idx_row_high(y));
+        }
+
+        __device__ __forceinline__ int idx_col_low(int x) const
+        {   // Mirrors negative x about 0 (-1 -> 1), then wraps modulo the width.
+            return ::abs(x) % (last_col + 1);
+        }
+
+        __device__ __forceinline__ int idx_col_high(int x) const
+        {   // Mirrors x past last_col about last_col (last_col + 1 -> last_col - 1), then wraps modulo the width.
+            return ::abs(last_col - ::abs(last_col - x)) % (last_col + 1);
+        }
+
+        __device__ __forceinline__ int idx_col(int x) const
+        {   // Applies the high mapping first, then the low mapping.
+            return idx_col_low(idx_col_high(x));
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
+        {   // Raw-pointer overload: row r starts r * step bytes after `data`.
+            return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
+        }
+
+        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
+        {   // Functor overload: the source is read through src(row, col) with remapped indices.
+            return saturate_cast<D>(src(idx_row(y), idx_col(x)));
+        }
+
+        int last_row; // height - 1, the largest valid row index
+        int last_col; // width - 1, the largest valid column index
+    };
+
+    //////////////////////////////////////////////////////////////
+    // BrdReflect
+
+    template <typename D> struct BrdRowReflect
+    {   // Row accessor that mirrors out-of-range columns with the edge element repeated (-1 -> 0, last_col + 1 -> last_col).
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdRowReflect(int width) : last_col(width - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdRowReflect(int width, U) : last_col(width - 1) {} // second argument is ignored
+
+        __device__ __forceinline__ int idx_col_low(int x) const
+        {   // Negative x maps to -x - 1 (-1 -> 0, -2 -> 1); the result wraps modulo the width.
+            return (::abs(x) - (x < 0)) % (last_col + 1);
+        }
+
+        __device__ __forceinline__ int idx_col_high(int x) const
+        {   // x past last_col is mirrored with the edge repeated (last_col + 1 -> last_col); wraps modulo the width.
+            return ::abs(last_col - ::abs(last_col - x) + (x > last_col)) % (last_col + 1);
+        }
+
+        __device__ __forceinline__ int idx_col(int x) const
+        {   // Mirrors negative x first (without the modulo), then applies the high mapping.
+            return idx_col_high(::abs(x) - (x < 0));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
+        {   // Read through idx_col_low.
+            return saturate_cast<D>(data[idx_col_low(x)]);
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
+        {   // Read through idx_col_high.
+            return saturate_cast<D>(data[idx_col_high(x)]);
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
+        {   // Read through idx_col.
+            return saturate_cast<D>(data[idx_col(x)]);
+        }
+
+        int last_col; // width - 1, the largest valid column index
+    };
+
+    template <typename D> struct BrdColReflect
+    {   // Column accessor that mirrors out-of-range rows with the edge row repeated (-1 -> 0, last_row + 1 -> last_row).
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdColReflect(int height) : last_row(height - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdColReflect(int height, U) : last_row(height - 1) {} // second argument is ignored
+
+        __device__ __forceinline__ int idx_row_low(int y) const
+        {   // Negative y maps to -y - 1 (-1 -> 0, -2 -> 1); the result wraps modulo the height.
+            return (::abs(y) - (y < 0)) % (last_row + 1);
+        }
+
+        __device__ __forceinline__ int idx_row_high(int y) const
+        {   // y past last_row is mirrored with the edge repeated (last_row + 1 -> last_row); wraps modulo the height.
+            return ::abs(last_row - ::abs(last_row - y) + (y > last_row)) % (last_row + 1);
+        }
+
+        __device__ __forceinline__ int idx_row(int y) const
+        {   // Mirrors negative y first (without the modulo), then applies the high mapping.
+            return idx_row_high(::abs(y) - (y < 0));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
+        {   // Row r is read r * step bytes after `data`, as a T (the pointee type), and only then converted to D.
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
+        {   // Same read as at_low, through idx_row_high.
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step));
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
+        {   // Same read as at_low, through idx_row.
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step));
+        }
+
+        int last_row; // height - 1, the largest valid row index
+    };
+
+    template <typename D> struct BrdReflect
+    {   // 2D accessor that mirrors out-of-range coordinates with the edge row/column repeated.
+        typedef D result_type;
+
+        __host__ __device__ __forceinline__ BrdReflect(int height, int width) : last_row(height - 1), last_col(width - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdReflect(int height, int width, U) : last_row(height - 1), last_col(width - 1) {} // third argument is ignored
+
+        __device__ __forceinline__ int idx_row_low(int y) const
+        {   // Negative y maps to -y - 1 (-1 -> 0, -2 -> 1); the result wraps modulo the height.
+            return (::abs(y) - (y < 0)) % (last_row + 1);
+        }
+
+        __device__ __forceinline__ int idx_row_high(int y) const
+        {   // Mirrors y past last_row (last_row + 1 -> last_row); no abs/modulo here, so the result may be negative.
+            return /*::abs*/(last_row - ::abs(last_row - y) + (y > last_row)) /*% (last_row + 1)*/;
+        }
+
+        __device__ __forceinline__ int idx_row(int y) const
+        {   // High mapping first; idx_row_low then supplies the abs and the modulo.
+            return idx_row_low(idx_row_high(y));
+        }
+
+        __device__ __forceinline__ int idx_col_low(int x) const
+        {   // Negative x maps to -x - 1 (-1 -> 0, -2 -> 1); the result wraps modulo the width.
+            return (::abs(x) - (x < 0)) % (last_col + 1);
+        }
+
+        __device__ __forceinline__ int idx_col_high(int x) const
+        {   // Mirrors x past last_col (last_col + 1 -> last_col); no abs/modulo here, so the result may be negative.
+            return (last_col - ::abs(last_col - x) + (x > last_col));
+        }
+
+        __device__ __forceinline__ int idx_col(int x) const
+        {   // High mapping first; idx_col_low then supplies the abs and the modulo.
+            return idx_col_low(idx_col_high(x));
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
+        {   // Raw-pointer overload: row r starts r * step bytes after `data`.
+            return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
+        }
+
+        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
+        {   // Functor overload: the source is read through src(row, col) with remapped indices.
+            return saturate_cast<D>(src(idx_row(y), idx_col(x)));
+        }
+
+        int last_row; // height - 1, the largest valid row index
+        int last_col; // width - 1, the largest valid column index
+    };
+
+    //////////////////////////////////////////////////////////////
+    // BrdWrap
+
+    template <typename D> struct BrdRowWrap
+    {   // Row accessor that wraps out-of-range columns around periodically into [0, width).
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdRowWrap(int width_) : width(width_) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdRowWrap(int width_, U) : width(width_) {} // second argument is ignored
+
+        __device__ __forceinline__ int idx_col_low(int x) const
+        {   // x >= 0 is returned unchanged; negative x is shifted up by a multiple of width into [0, width).
+            return (x >= 0) * x + (x < 0) * (x - ((x - width + 1) / width) * width);
+        }
+
+        __device__ __forceinline__ int idx_col_high(int x) const
+        {   // x < width is returned unchanged; x >= width becomes x % width.
+            return (x < width) * x + (x >= width) * (x % width);
+        }
+
+        __device__ __forceinline__ int idx_col(int x) const
+        {   // Applies the low mapping first, then the high mapping.
+            return idx_col_high(idx_col_low(x));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
+        {   // Read through idx_col_low.
+            return saturate_cast<D>(data[idx_col_low(x)]);
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
+        {   // Read through idx_col_high.
+            return saturate_cast<D>(data[idx_col_high(x)]);
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
+        {   // Read through idx_col.
+            return saturate_cast<D>(data[idx_col(x)]);
+        }
+
+        int width; // period of the wrap; exclusive upper bound for valid x
+    };
+
+    template <typename D> struct BrdColWrap
+    {   // Column accessor that wraps out-of-range rows around periodically into [0, height).
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdColWrap(int height_) : height(height_) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdColWrap(int height_, U) : height(height_) {} // second argument is ignored
+
+        __device__ __forceinline__ int idx_row_low(int y) const
+        {   // y >= 0 is returned unchanged; negative y is shifted up by a multiple of height into [0, height).
+            return (y >= 0) * y + (y < 0) * (y - ((y - height + 1) / height) * height);
+        }
+
+        __device__ __forceinline__ int idx_row_high(int y) const
+        {   // y < height is returned unchanged; y >= height becomes y % height.
+            return (y < height) * y + (y >= height) * (y % height);
+        }
+
+        __device__ __forceinline__ int idx_row(int y) const
+        {   // Applies the low mapping first, then the high mapping.
+            return idx_row_high(idx_row_low(y));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
+        {   // Row r is read r * step bytes after `data`, as a T (the pointee type), and only then converted to D.
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
+        {   // Same read as at_low, through idx_row_high.
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step));
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
+        {   // Same read as at_low, through idx_row.
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step));
+        }
+
+        int height; // period of the wrap; exclusive upper bound for valid y
+    };
+
+    template <typename D> struct BrdWrap
+    {   // 2D accessor that wraps out-of-range coordinates around periodically into [0, height) x [0, width).
+        typedef D result_type;
+
+        __host__ __device__ __forceinline__ BrdWrap(int height_, int width_) :
+            height(height_), width(width_)
+        {
+        }
+        template <typename U>
+        __host__ __device__ __forceinline__ BrdWrap(int height_, int width_, U) : // third argument is ignored
+            height(height_), width(width_)
+        {
+        }
+
+        __device__ __forceinline__ int idx_row_low(int y) const
+        {   // y >= 0 is returned unchanged; negative y is shifted up by a multiple of height into [0, height).
+            return (y >= 0) ? y : (y - ((y - height + 1) / height) * height);
+        }
+
+        __device__ __forceinline__ int idx_row_high(int y) const
+        {   // y < height is returned unchanged; y >= height becomes y % height.
+            return (y < height) ? y : (y % height);
+        }
+
+        __device__ __forceinline__ int idx_row(int y) const
+        {   // Applies the low mapping first, then the high mapping.
+            return idx_row_high(idx_row_low(y));
+        }
+
+        __device__ __forceinline__ int idx_col_low(int x) const
+        {   // x >= 0 is returned unchanged; negative x is shifted up by a multiple of width into [0, width).
+            return (x >= 0) ? x : (x - ((x - width + 1) / width) * width);
+        }
+
+        __device__ __forceinline__ int idx_col_high(int x) const
+        {   // x < width is returned unchanged; x >= width becomes x % width.
+            return (x < width) ? x : (x % width);
+        }
+
+        __device__ __forceinline__ int idx_col(int x) const
+        {   // Applies the low mapping first, then the high mapping.
+            return idx_col_high(idx_col_low(x));
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
+        {   // Raw-pointer overload: row r starts r * step bytes after `data`.
+            return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
+        }
+
+        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
+        {   // Functor overload: the source is read through src(row, col) with wrapped indices.
+            return saturate_cast<D>(src(idx_row(y), idx_col(x)));
+        }
+
+        int height; // row period; exclusive upper bound for valid y
+        int width;  // column period; exclusive upper bound for valid x
+    };
+
+    //////////////////////////////////////////////////////////////
+    // BorderReader
+
+    template <typename Ptr2D, typename B> struct BorderReader
+    {   // Pairs a 2D source `ptr` with a border policy `b`; operator() forwards each read to b.at(y, x, ptr).
+        typedef typename B::result_type elem_type;
+        typedef typename Ptr2D::index_type index_type;
+
+        __host__ __device__ __forceinline__ BorderReader(const Ptr2D& ptr_, const B& b_) : ptr(ptr_), b(b_) {}
+
+        __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const
+        {   // All bounds handling is done by the policy.
+            return b.at(y, x, ptr);
+        }
+
+        Ptr2D ptr; // source, stored by value
+        B b;       // border policy, stored by value
+    };
+
+    // Under win32 there is a bug with templated types that are passed as kernel parameters;
+    // with this specialization everything works fine.
+    template <typename Ptr2D, typename D> struct BorderReader< Ptr2D, BrdConstant<D> >
+    {   // Specialization for BrdConstant<D>: stores the policy's height/width/val as plain members instead of the policy object.
+        typedef typename BrdConstant<D>::result_type elem_type;
+        typedef typename Ptr2D::index_type index_type;
+
+        __host__ __device__ __forceinline__ BorderReader(const Ptr2D& src_, const BrdConstant<D>& b) :
+            src(src_), height(b.height), width(b.width), val(b.val)
+        {
+        }
+
+        __device__ __forceinline__ D operator ()(index_type y, index_type x) const
+        {   // Same bounds test as BrdConstant<D>::at: in-range reads come from src(y, x), everything else returns `val`.
+            return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;
+        }
+
+        Ptr2D src;  // source, stored by value
+        int height; // copied from b.height
+        int width;  // copied from b.width
+        D val;      // copied from b.val
+    };
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_BORDER_INTERPOLATE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/color.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/color.hpp
new file mode 100644
index 000000000000..dcce28021482
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/color.hpp
@@ -0,0 +1,309 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_COLOR_HPP
+#define OPENCV_CUDA_COLOR_HPP
+
+#include "detail/color_detail.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    // Each OPENCV_CUDA_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macro implements
+    // template <typename T> class ColorSpace1_to_ColorSpace2_traits
+    // {
+    //     typedef ... functor_type;
+    //     static __host__ __device__ functor_type create_functor();
+    // };
+
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS // here and below: each generator macro is undefined right after its last use
+
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3)
+    OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)
+
+    #undef OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5)
+    OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)
+
+    #undef OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab, 3, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab, 4, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab4, 3, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab4, 4, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab, 3, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab, 4, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab4, 3, 4, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab4, 4, 4, true, 0)
+
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab, 3, 3, false, 2) // Lab/Luv groups: l-prefixed names pass false as 4th argument, the others pass true (presumably linear vs. sRGB - confirm in detail/color_detail.hpp)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab, 4, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab4, 3, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab4, 4, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab, 3, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab, 4, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab4, 3, 4, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab4, 4, 4, false, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgb, 3, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgb, 4, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgba, 3, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgba, 4, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgr, 3, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgr, 4, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgra, 3, 4, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgra, 4, 4, true, 0)
+
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgb, 3, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgb, 4, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgba, 3, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgba, 4, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgr, 3, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgr, 4, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgra, 3, 4, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgra, 4, 4, false, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv, 3, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv, 4, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv4, 3, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv4, 4, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv, 3, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv, 4, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv4, 3, 4, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv4, 4, 4, true, 0)
+
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv, 3, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv, 4, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv4, 3, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv4, 4, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv, 3, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv, 4, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv4, 3, 4, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv4, 4, 4, false, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgb, 3, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgb, 4, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgba, 3, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgba, 4, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgr, 3, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgr, 4, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgra, 3, 4, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgra, 4, 4, true, 0)
+
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgb, 3, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgb, 4, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgba, 3, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgba, 4, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgr, 3, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgr, 4, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgra, 3, 4, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgra, 4, 4, false, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_COLOR_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/common.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/common.hpp
new file mode 100644
index 000000000000..1e1d5de1b08e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/common.hpp
@@ -0,0 +1,131 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_COMMON_HPP
+#define OPENCV_CUDA_COMMON_HPP
+
+#include <cuda_runtime.h>
+#include "opencv2/core/cuda_types.hpp"
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/base.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+#ifndef CV_PI_F // keep a CV_PI_F that was already defined before this point
+    #ifndef CV_PI
+        #define CV_PI_F 3.14159265f // literal fallback used when CV_PI is not defined
+    #else
+        #define CV_PI_F ((float)CV_PI) // otherwise derive the float constant from CV_PI
+    #endif
+#endif
+
+namespace cv { namespace cuda {
+    static inline void checkCudaError(cudaError_t err, const char* file, const int line, const char* func) // reports err through cv::error unless it equals cudaSuccess
+    {
+        if (cudaSuccess != err) { // success: nothing to do
+            cudaGetLastError(); // reset the last stored error to cudaSuccess
+            cv::error(cv::Error::GpuApiCallError, cudaGetErrorString(err), func, file, line); // message is the CUDA runtime's string for err; func/file/line are forwarded as given by the caller
+        }
+    }
+}}
+
+#ifndef cudaSafeCall // an earlier definition of cudaSafeCall takes precedence
+    #define cudaSafeCall(expr)  cv::cuda::checkCudaError(expr, __FILE__, __LINE__, CV_Func) // passes the result of expr together with the location of the macro use
+#endif
+
+namespace cv { namespace cuda
+{
+    template <typename T> static inline bool isAligned(const T* ptr, size_t size) // true when the numeric value of ptr is a multiple of size
+    {
+        return reinterpret_cast<size_t>(ptr) % size == 0; // size must be non-zero (it is the modulo divisor)
+    }
+
+    static inline bool isAligned(size_t step, size_t size) // true when step is a multiple of size
+    {
+        return step % size == 0; // size must be non-zero here as well
+    }
+}}
+
+namespace cv { namespace cuda
+{
+    namespace device
+    {
+        __host__ __device__ __forceinline__ int divUp(int total, int grain) // ceiling division, intended for non-negative total and positive grain
+        {
+            return (total + grain - 1) / grain;
+        }
+
+#if (CUDART_VERSION >= 12000) // CUDA 12+ runtime: only an always-failing createTextureObjectPitch2D stub exists, bindTexture is not declared
+        template<class T> inline void createTextureObjectPitch2D(cudaTextureObject_t*, PtrStepSz<T>&, const cudaTextureDesc&) {
+            CV_Error(cv::Error::GpuNotSupported, "Function removed in CUDA SDK 12"); }
+#else
+        //TODO: remove from OpenCV 5.x
+        template<class T> inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img) // binds img to the texture reference tex as a 2D texture
+        {
+            cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>(); // channel layout derived from the element type T
+            cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) ); // first argument 0: no offset output is requested
+        }
+
+        template<class T> inline void createTextureObjectPitch2D(cudaTextureObject_t* tex, PtrStepSz<T>& img, const cudaTextureDesc& texDesc) // creates *tex over the memory described by img
+        {
+            cudaResourceDesc resDesc;
+            memset(&resDesc, 0, sizeof(resDesc)); // zero every field before filling in the pitch2D ones
+            resDesc.resType = cudaResourceTypePitch2D;
+            resDesc.res.pitch2D.devPtr = static_cast<void*>(img.ptr());
+            resDesc.res.pitch2D.height = img.rows;
+            resDesc.res.pitch2D.width = img.cols;
+            resDesc.res.pitch2D.pitchInBytes = img.step; // presumably img.step is already expressed in bytes - confirm against PtrStepSz
+            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
+
+            cudaSafeCall( cudaCreateTextureObject(tex, &resDesc, &texDesc, NULL) ); // NULL: no resource view descriptor is supplied
+        }
+#endif
+    }
+}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_COMMON_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/datamov_utils.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/datamov_utils.hpp
new file mode 100644
index 000000000000..6820d0fd64de
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/datamov_utils.hpp
@@ -0,0 +1,113 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_DATAMOV_UTILS_HPP
+#define OPENCV_CUDA_DATAMOV_UTILS_HPP
+
+#include "common.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
+
+        // On Fermi (__CUDA_ARCH__ >= 200) and newer the memory space is detected automatically, so a plain indexed load is used
+        template <typename T> struct ForceGlob
+        {
+            __device__ __forceinline__ static void Load(const T* ptr, int offset, T& val)  { val = ptr[offset];  }
+        };
+
+    #else // !(__CUDA_ARCH__ >= 200): lower architecture values, or __CUDA_ARCH__ not defined at all
+
+        #if defined(_WIN64) || defined(__LP64__)
+            // 64-bit register modifier for inlined asm
+            #define OPENCV_CUDA_ASM_PTR "l"
+        #else
+            // 32-bit register modifier for inlined asm
+            #define OPENCV_CUDA_ASM_PTR "r"
+        #endif
+
+        template<class T> struct ForceGlob; // primary template is only declared; Load exists solely for the types specialized below
+
+        #define OPENCV_CUDA_DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod) \
+            template <> struct ForceGlob<base_type> \
+            { \
+                __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
+                { \
+                    asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : OPENCV_CUDA_ASM_PTR(ptr + offset)); \
+                } \
+            };
+
+        #define OPENCV_CUDA_DEFINE_FORCE_GLOB_B(base_type, ptx_type) \
+            template <> struct ForceGlob<base_type> \
+            { \
+                __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
+                { \
+                    asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : OPENCV_CUDA_ASM_PTR(ptr + offset)); \
+                } \
+            };
+
+            OPENCV_CUDA_DEFINE_FORCE_GLOB_B(uchar,  u8) // _B variant: the output operand is val reinterpreted as uint with an "r" constraint
+            OPENCV_CUDA_DEFINE_FORCE_GLOB_B(schar,  s8)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB_B(char,   b8)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (ushort, u16, h) // arguments: C++ type, ld.global type suffix, asm constraint letter applied to val
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (short,  s16, h)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (uint,   u32, r)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (int,    s32, r)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (float,  f32, f)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (double, f64, d)
+
+        #undef OPENCV_CUDA_DEFINE_FORCE_GLOB
+        #undef OPENCV_CUDA_DEFINE_FORCE_GLOB_B
+        #undef OPENCV_CUDA_ASM_PTR
+
+    #endif // __CUDA_ARCH__ >= 200
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_DATAMOV_UTILS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/color_detail.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/color_detail.hpp
new file mode 100644
index 000000000000..f4b4796571dc
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/color_detail.hpp
@@ -0,0 +1,2018 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_COLOR_DETAIL_HPP
+#define OPENCV_CUDA_COLOR_DETAIL_HPP
+
+#include "../common.hpp"
+#include "../vec_traits.hpp"
+#include "../saturate_cast.hpp"
+#include "../limits.hpp"
+#include "../functional.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    #ifndef CV_DESCALE
+        #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
+    #endif
+
+    namespace color_detail
+    {
+        template<typename T> struct ColorChannel // per-channel range constants for channel type T (generic case)
+        {
+            typedef float worktype_f;
+            static __device__ __forceinline__ T max() { return numeric_limits<T>::max(); } // full-scale value: the largest T
+            static __device__ __forceinline__ T half() { return (T)(max()/2 + 1); } // mid-range value: max()/2 + 1, cast back to T
+        };
+
+        template<> struct ColorChannel<float> // float channels use the range constants 1.0 / 0.5 instead of numeric_limits
+        {
+            typedef float worktype_f;
+            static __device__ __forceinline__ float max() { return 1.f; } // full-scale value
+            static __device__ __forceinline__ float half() { return 0.5f; } // mid-range value
+        };
+
+        template <typename T> static __device__ __forceinline__ void setAlpha(typename TypeVec<T, 3>::vec_type& vec, T val) // 3-channel vector: nothing to store, intentionally empty
+        {
+        }
+
+        template <typename T> static __device__ __forceinline__ void setAlpha(typename TypeVec<T, 4>::vec_type& vec, T val) // 4-channel vector: alpha is the w component
+        {
+            vec.w = val;
+        }
+
+        template <typename T> static __device__ __forceinline__ T getAlpha(const typename TypeVec<T, 3>::vec_type& vec) // 3-channel vector: no stored alpha, report full scale
+        {
+            return ColorChannel<T>::max();
+        }
+
+        template <typename T> static __device__ __forceinline__ T getAlpha(const typename TypeVec<T, 4>::vec_type& vec) // 4-channel vector: alpha is the w component
+        {
+            return vec.w;
+        }
+
+        //constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601
+        constexpr float B2YF = 0.114f;
+        constexpr float G2YF = 0.587f;
+        constexpr float R2YF = 0.299f;
+
+        //to YCbCr
+        constexpr float YCBF = 0.564f; // == 1/2/(1-B2YF)
+        constexpr float YCRF = 0.713f; // == 1/2/(1-R2YF)
+        const     int   YCBI = 9241;  // == YCBF*16384
+        const     int   YCRI = 11682; // == YCRF*16384
+        //to YUV
+        constexpr float B2UF = 0.492f;
+        constexpr float R2VF = 0.877f;
+        const     int   B2UI = 8061;  // == B2UF*16384
+        const     int   R2VI = 14369; // == R2VF*16384
+        //from YUV
+        constexpr float U2BF = 2.032f;
+        constexpr float U2GF = -0.395f;
+        constexpr float V2GF = -0.581f;
+        constexpr float V2RF = 1.140f;
+        const     int   U2BI = 33292; // == U2BF*16384, rounded
+        const     int   U2GI = -6472; // == U2GF*16384, rounded
+        const     int   V2GI = -9519; // == V2GF*16384, rounded
+        const     int   V2RI = 18678; // == V2RF*16384, rounded
+        //from YCrCb
+        constexpr float CB2BF = 1.773f;
+        constexpr float CB2GF = -0.344f;
+        constexpr float CR2GF = -0.714f;
+        constexpr float CR2RF = 1.403f;
+        const     int   CB2BI = 29049;  // == CB2BF*16384, rounded
+        const     int   CB2GI = -5636;  // == CB2GF*16384, rounded
+        const     int   CR2GI = -11698; // == CR2GF*16384, rounded
+        const     int   CR2RI = 22987;  // == CR2RF*16384, rounded
+
+        enum // fixed-point shifts and integer luma weights shared by the converters in this header
+        {
+            yuv_shift  = 14, // fractional bits of the *I integer coefficients above (16384 == 1 << 14)
+            xyz_shift  = 12,
+            gray_shift = 15, // fractional bits of RY15/GY15/BY15 (32768 == 1 << 15)
+            R2Y        = 4899, // == R2YF*16384, rounded
+            G2Y        = 9617, // == G2YF*16384, rounded
+            B2Y        = 1868, // == B2YF*16384, rounded
+            RY15 =  9798, // == R2YF*32768 + 0.5
+            GY15 = 19235, // == G2YF*32768 + 0.5
+            BY15 =  3735, // == B2YF*32768 + 0.5
+            BLOCK_SIZE = 256
+        };
+    }
+
+////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
+
+    namespace color_detail
+    {
+        template <typename T, int scn, int dcn, int bidx> struct RGB2RGB // functor: scn-channel vector in, dcn-channel vector out, end channels selected by bidx
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __device__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type dst;
+
+                dst.x = (&src.x)[bidx]; // source channel number bidx becomes channel 0
+                dst.y = src.y; // middle channel is copied unchanged
+                dst.z = (&src.x)[bidx^2]; // bidx^2 is the opposite end channel when bidx is 0 or 2
+                setAlpha(dst, getAlpha<T>(src)); // forwards alpha; the 3-channel overloads make this a no-op / full-scale default
+
+                return dst;
+            }
+
+            __host__ __device__ __forceinline__ RGB2RGB() {}
+            __host__ __device__ __forceinline__ RGB2RGB(const RGB2RGB&) {}
+        };
+
+        template <> struct RGB2RGB<uchar, 4, 4, 2> : unary_function<uint, uint>
+        {
+            // Works on a pixel packed into one 32-bit word: byte 0 and byte 2
+            // trade places, byte 1 and byte 3 keep their positions.
+            __device__ uint operator()(uint src) const
+            {
+                const uint lane0 = (src >> 16) & 0xffu;   // old byte 2 moves down to byte 0
+                const uint lane1 = src & 0x0000ff00u;     // byte 1 stays
+                const uint lane2 = (src & 0xffu) << 16;   // old byte 0 moves up to byte 2
+                const uint lane3 = src & 0xff000000u;     // byte 3 stays
+
+                return lane0 | lane1 | lane2 | lane3;
+            }
+
+            __host__ __device__ __forceinline__ RGB2RGB() {}
+            __host__ __device__ __forceinline__ RGB2RGB(const RGB2RGB&) {}
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(name, scn, dcn, bidx) /* declares name_traits<T>: functor_type is RGB2RGB<T, scn, dcn, bidx>, create_functor() default-constructs it */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2RGB<T, scn, dcn, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
+
+    namespace color_detail
+    {
+        template <int green_bits, int bidx> struct RGB2RGB5x5Converter;
+        template<int bidx> struct RGB2RGB5x5Converter<6, bidx> // green_bits == 6: packs three 8-bit channels into a 16-bit 5-6-5 word
+        {
+            static __device__ __forceinline__ ushort cvt(const uchar3& src)
+            {
+                return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~3) << 3) | (((&src.x)[bidx^2] & ~7) << 8)); // bits 0-4: channel bidx, bits 5-10: y, bits 11-15: channel bidx^2
+            }
+
+            static __device__ __forceinline__ ushort cvt(uint src) // same packing from a 4x8-bit word; byte 3 is not read
+            {
+                uint b = 0xffu & (src >> (bidx * 8));
+                uint g = 0xffu & (src >> 8);
+                uint r = 0xffu & (src >> ((bidx ^ 2) * 8));
+                return (ushort)((b >> 3) | ((g & ~3) << 3) | ((r & ~7) << 8));
+            }
+        };
+
+        template<int bidx> struct RGB2RGB5x5Converter<5, bidx> // green_bits == 5: packs 8-bit channels into a 16-bit 5-5-5 word, bit 15 carries alpha
+        {
+            static __device__ __forceinline__ ushort cvt(const uchar3& src) // 3-channel input: bit 15 is left clear
+            {
+                return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7)); // bits 0-4: channel bidx, bits 5-9: y, bits 10-14: channel bidx^2
+            }
+
+            static __device__ __forceinline__ ushort cvt(uint src) // 4x8-bit packed input; byte 3 is alpha
+            {
+                uint b = 0xffu & (src >> (bidx * 8));
+                uint g = 0xffu & (src >> 8);
+                uint r = 0xffu & (src >> ((bidx ^ 2) * 8));
+                uint a = 0xffu & (src >> 24);
+                return (ushort)((b >> 3) | ((g & ~7) << 2) | ((r & ~7) << 7) | (a ? 0x8000u : 0u)); // bit 15 set for any non-zero alpha; (a * 0x8000) kept only alpha's lowest bit after the ushort cast
+            }
+        };
+
+        template<int scn, int bidx, int green_bits> struct RGB2RGB5x5;
+
+        template<int bidx, int green_bits> struct RGB2RGB5x5<3, bidx,green_bits> : unary_function<uchar3, ushort> // 3-channel source functor
+        {
+            __device__ __forceinline__ ushort operator()(const uchar3& src) const
+            {
+                return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src); // packing is delegated to the converter chosen by green_bits
+            }
+
+            __host__ __device__ __forceinline__ RGB2RGB5x5() {}
+            __host__ __device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5&) {}
+        };
+
+        template<int bidx, int green_bits> struct RGB2RGB5x5<4, bidx,green_bits> : unary_function<uint, ushort> // 4-channel source functor, pixel passed as one packed uint
+        {
+            __device__ __forceinline__ ushort operator()(uint src) const
+            {
+                return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src); // selects the uint overload of the converter
+            }
+
+            __host__ __device__ __forceinline__ RGB2RGB5x5() {}
+            __host__ __device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5&) {}
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(name, scn, bidx, green_bits) /* declares non-template name_traits: functor_type is RGB2RGB5x5<scn, bidx, green_bits> */ \
+    struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2RGB5x5<scn, bidx, green_bits> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        template <int green_bits, int bidx> struct RGB5x52RGBConverter;
+
+        template <int bidx> struct RGB5x52RGBConverter<5, bidx> // green_bits == 5: unpacks a 16-bit 5-5-5 word (bit 15 = alpha) into 8-bit channels
+        {
+            static __device__ __forceinline__ void cvt(uint src, uchar3& dst) // 3-channel output: bit 15 is ignored
+            {
+                (&dst.x)[bidx] = src << 3; // the store to an 8-bit channel keeps bits 0-4 of src in bits 3-7
+                dst.y = (src >> 2) & ~7;
+                (&dst.x)[bidx ^ 2] = (src >> 7) & ~7;
+            }
+
+            static __device__ __forceinline__ void cvt(uint src, uint& dst) // 4x8-bit packed output; byte 3 is alpha
+            {
+                dst = 0;
+
+                dst |= (0xffu & (src << 3)) << (bidx * 8);
+                dst |= (0xffu & ((src >> 2) & ~7)) << 8;
+                dst |= (0xffu & ((src >> 7) & ~7)) << ((bidx ^ 2) * 8);
+                dst |= ((src & 0x8000) ? 0xffu : 0u) << 24; // alpha byte is 0xff when bit 15 is set; ((src & 0x8000) * 0xffu) << 24 shifted every set bit past bit 31 and always produced 0
+            }
+        };
+
+        template <int bidx> struct RGB5x52RGBConverter<6, bidx> // green_bits == 6: unpacks a 16-bit 5-6-5 word into 8-bit channels
+        {
+            static __device__ __forceinline__ void cvt(uint src, uchar3& dst)
+            {
+                (&dst.x)[bidx] = src << 3; // the store to an 8-bit channel keeps bits 0-4 of src in bits 3-7
+                dst.y = (src >> 3) & ~3; // bits 5-10 of src land in bits 2-7
+                (&dst.x)[bidx ^ 2] = (src >> 8) & ~7; // bits 11-15 of src land in bits 3-7
+            }
+
+            static __device__ __forceinline__ void cvt(uint src, uint& dst) // 4x8-bit packed output
+            {
+                dst = 0xffu << 24; // the 5-6-5 word has no alpha bit, so byte 3 is always 0xff
+
+                dst |= (0xffu & (src << 3)) << (bidx * 8);
+                dst |= (0xffu &((src >> 3) & ~3)) << 8;
+                dst |= (0xffu & ((src >> 8) & ~7)) << ((bidx ^ 2) * 8);
+            }
+        };
+
+        template <int dcn, int bidx, int green_bits> struct RGB5x52RGB;
+
+        template <int bidx, int green_bits> struct RGB5x52RGB<3, bidx, green_bits> : unary_function<ushort, uchar3> // 3-channel destination functor
+        {
+            __device__ __forceinline__ uchar3 operator()(ushort src) const
+            {
+                uchar3 dst;
+                RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst); // unpacking is delegated to the converter chosen by green_bits
+                return dst;
+            }
+            __host__ __device__ __forceinline__ RGB5x52RGB() {}
+            __host__ __device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB&) {}
+
+        };
+
+        template <int bidx, int green_bits> struct RGB5x52RGB<4, bidx, green_bits> : unary_function<ushort, uint> // 4-channel destination functor, pixel returned as one packed uint
+        {
+            __device__ __forceinline__ uint operator()(ushort src) const
+            {
+                uint dst;
+                RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst); // the uint& overload assigns every byte of dst
+                return dst;
+            }
+            __host__ __device__ __forceinline__ RGB5x52RGB() {}
+            __host__ __device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB&) {}
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(name, dcn, bidx, green_bits) /* declares non-template name_traits: functor_type is RGB5x52RGB<dcn, bidx, green_bits> */ \
+    struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB5x52RGB<dcn, bidx, green_bits> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+///////////////////////////////// Grayscale to Color ////////////////////////////////
+
+    namespace color_detail
+    {
+        template <typename T, int dcn> struct Gray2RGB : unary_function<T, typename TypeVec<T, dcn>::vec_type> // functor: one gray value in, dcn-channel vector out
+        {
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(T src) const
+            {
+                typename TypeVec<T, dcn>::vec_type dst;
+
+                dst.z = dst.y = dst.x = src; // the gray value is copied into all three colour channels
+                setAlpha(dst, ColorChannel<T>::max()); // full-scale alpha; the 3-channel setAlpha overload does nothing
+
+                return dst;
+            }
+            __host__ __device__ __forceinline__ Gray2RGB() {}
+            __host__ __device__ __forceinline__ Gray2RGB(const Gray2RGB&) {}
+        };
+
+        template <> struct Gray2RGB<uchar, 4> : unary_function<uchar, uint>
+        {
+            // Packed variant: the gray value is OR-ed into bytes 0, 1 and 2 of
+            // the result word and byte 3 is forced to 0xff.
+            __device__ __forceinline__ uint operator()(uint src) const
+            {
+                const uint opaque = 0xffu << 24;
+                const uint replicated = src | (src << 8) | (src << 16);
+
+                return opaque | replicated;
+            }
+
+            __host__ __device__ __forceinline__ Gray2RGB() {}
+            __host__ __device__ __forceinline__ Gray2RGB(const Gray2RGB&) {}
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS(name, dcn) /* declares name_traits<T>: functor_type is Gray2RGB<T, dcn>, create_functor() default-constructs it */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::Gray2RGB<T, dcn> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        template <int green_bits> struct Gray2RGB5x5Converter;
+        template<> struct Gray2RGB5x5Converter<6> // green_bits == 6: gray value to a 16-bit 5-6-5 word
+        {
+            static __device__ __forceinline__ ushort cvt(uint t)
+            {
+                return (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8)); // same value in all three fields: 5 bits at 0, 6 bits at 5, 5 bits at 11
+            }
+        };
+
+        template<> struct Gray2RGB5x5Converter<5>
+        {
+            static __device__ __forceinline__ ushort cvt(uint t)
+            {
+                const uint q = t >> 3; // discard the three low bits of the gray value
+                return (ushort)(q | (q << 5) | (q << 10)); // same field OR-ed in at bit offsets 0, 5 and 10
+            }
+        };
+
+        template<int green_bits> struct Gray2RGB5x5 : unary_function<uchar, ushort> // functor: gray value in, packed 16-bit pixel out
+        {
+            __device__ __forceinline__ ushort operator()(uint src) const
+            {
+                return Gray2RGB5x5Converter<green_bits>::cvt(src); // packing is delegated to the converter chosen by green_bits
+            }
+
+            __host__ __device__ __forceinline__ Gray2RGB5x5() {}
+            __host__ __device__ __forceinline__ Gray2RGB5x5(const Gray2RGB5x5&) {}
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS(name, green_bits) /* declares non-template name_traits: functor_type is Gray2RGB5x5<green_bits> */ \
+    struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::Gray2RGB5x5<green_bits> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+///////////////////////////////// Color to Grayscale ////////////////////////////////
+
+    namespace color_detail
+    {
+        template <int green_bits> struct RGB5x52GrayConverter;
+        template <> struct RGB5x52GrayConverter<6> // green_bits == 6: 16-bit 5-6-5 word to an 8-bit gray value
+        {
+            static __device__ __forceinline__ uchar cvt(uint t)
+            {
+                return (uchar)CV_DESCALE(((t << 3) & 0xf8) * BY15 + ((t >> 3) & 0xfc) * GY15 + ((t >> 8) & 0xf8) * RY15, gray_shift); // each field is widened to 8 bits, weighted, then rounded down by gray_shift bits
+            }
+        };
+
+        template <> struct RGB5x52GrayConverter<5> // green_bits == 5: 16-bit 5-5-5 word to an 8-bit gray value (bit 15 is masked out)
+        {
+            static __device__ __forceinline__ uchar cvt(uint t)
+            {
+                return (uchar)CV_DESCALE(((t << 3) & 0xf8) * BY15 + ((t >> 2) & 0xf8) * GY15 + ((t >> 7) & 0xf8) * RY15, gray_shift); // same weighting with the 5-bit middle field
+            }
+        };
+
+        template<int green_bits> struct RGB5x52Gray : unary_function<ushort, uchar> // functor: packed 16-bit pixel in, gray value out
+        {
+            __device__ __forceinline__ uchar operator()(uint src) const
+            {
+                return RGB5x52GrayConverter<green_bits>::cvt(src); // conversion is delegated to the converter chosen by green_bits
+            }
+            __host__ __device__ __forceinline__ RGB5x52Gray() {}
+            __host__ __device__ __forceinline__ RGB5x52Gray(const RGB5x52Gray&) {}
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS(name, green_bits) /* declares non-template name_traits: functor_type is RGB5x52Gray<green_bits> */ \
+    struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB5x52Gray<green_bits> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        template <int bidx, typename T> static __device__ __forceinline__ T RGB2GrayConvert(const T* src) // integer path over a channel array
+        {
+            return (T)CV_DESCALE((unsigned)(src[bidx] * BY15 + src[1] * GY15 + src[bidx^2] * RY15), gray_shift); // weighted sum, rounded down by gray_shift bits
+        }
+
+        template <int bidx> static __device__ __forceinline__ uchar RGB2GrayConvert(uint src) // pixel passed as one packed 4x8-bit word; byte 3 is not read
+        {
+            uint b = 0xffu & (src >> (bidx * 8));
+            uint g = 0xffu & (src >> 8);
+            uint r = 0xffu & (src >> ((bidx ^ 2) * 8));
+            return CV_DESCALE((uint)(b * BY15 + g * GY15 + r * RY15), gray_shift);
+        }
+
+        template <int bidx> static __device__ __forceinline__ float RGB2GrayConvert(const float* src) // float path: plain weighted sum, no scaling or rounding
+        {
+            return src[bidx] * B2YF + src[1] * G2YF + src[bidx^2] * R2YF;
+        }
+
+        template <typename T, int scn, int bidx> struct RGB2Gray : unary_function<typename TypeVec<T, scn>::vec_type, T> // functor: scn-channel vector in, gray value out
+        {
+            __device__ __forceinline__ T operator()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                return RGB2GrayConvert<bidx>(&src.x); // the vector is handed over as a pointer to its first channel
+            }
+            __host__ __device__ __forceinline__ RGB2Gray() {}
+            __host__ __device__ __forceinline__ RGB2Gray(const RGB2Gray&) {}
+        };
+
+        template <int bidx> struct RGB2Gray<uchar, 4, bidx> : unary_function<uint, uchar> // 4-channel uchar case takes the pixel as one packed uint
+        {
+            __device__ __forceinline__ uchar operator()(uint src) const
+            {
+                return RGB2GrayConvert<bidx>(src); // selects the packed-uint overload
+            }
+            __host__ __device__ __forceinline__ RGB2Gray() {}
+            __host__ __device__ __forceinline__ RGB2Gray(const RGB2Gray&) {}
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(name, scn, bidx) /* declares name_traits<T>: functor_type is RGB2Gray<T, scn, bidx>, create_functor() default-constructs it */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2Gray<T, scn, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+///////////////////////////////////// RGB <-> YUV //////////////////////////////////////
+
+    namespace color_detail
+    {
+        __constant__ float c_RGB2YUVCoeffs_f[5] = { B2YF, G2YF, R2YF, B2UF, R2VF }; // [0..2]: luma weights, [3..4]: chroma scale factors (float path)
+        __constant__ int   c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, B2UI, R2VI }; // same layout, fixed-point with yuv_shift fractional bits
+
+        template <int bidx, typename T, typename D> static __device__ void RGB2YUVConvert(const T* src, D& dst) // integer (fixed-point) path
+        {
+            const int delta = ColorChannel<T>::half() * (1 << yuv_shift); // mid-range offset, pre-scaled by 2^yuv_shift
+
+            const int Y = CV_DESCALE(src[0] * c_RGB2YUVCoeffs_i[bidx^2] + src[1] * c_RGB2YUVCoeffs_i[1] + src[2] * c_RGB2YUVCoeffs_i[bidx], yuv_shift);
+            const int Cr = CV_DESCALE((src[bidx^2] - Y) * c_RGB2YUVCoeffs_i[3] + delta, yuv_shift); // scaled difference to Y, re-centred on half()
+            const int Cb = CV_DESCALE((src[bidx] - Y) * c_RGB2YUVCoeffs_i[4] + delta, yuv_shift);
+
+            dst.x = saturate_cast<T>(Y); // results are saturated back to T
+            dst.y = saturate_cast<T>(Cr);
+            dst.z = saturate_cast<T>(Cb);
+        }
+
+        template <int bidx, typename D> static __device__ __forceinline__ void RGB2YUVConvert(const float* src, D& dst) // float path: same formulas, no scaling or saturation
+        {
+            dst.x = src[0] * c_RGB2YUVCoeffs_f[bidx^2] + src[1] * c_RGB2YUVCoeffs_f[1] + src[2] * c_RGB2YUVCoeffs_f[bidx];
+            dst.y = (src[bidx^2] - dst.x) * c_RGB2YUVCoeffs_f[3] + ColorChannel<float>::half();
+            dst.z = (src[bidx] - dst.x) * c_RGB2YUVCoeffs_f[4] + ColorChannel<float>::half();
+        }
+
+        template <typename T, int scn, int dcn, int bidx> struct RGB2YUV // functor wrapper around RGB2YUVConvert
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type dst;
+                RGB2YUVConvert<bidx>(&src.x, dst); // writes dst.x, dst.y and dst.z only; setAlpha is not called here
+                return dst;
+            }
+            __host__ __device__ __forceinline__ RGB2YUV() {}
+            __host__ __device__ __forceinline__ RGB2YUV(const RGB2YUV&) {}
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(name, scn, dcn, bidx) /* declares name_traits<T>: functor_type is RGB2YUV<T, scn, dcn, bidx>, create_functor() default-constructs it */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2YUV<T, scn, dcn, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        __constant__ float c_YUV2RGBCoeffs_f[5] = { U2BF, U2GF, V2GF, V2RF }; // declared with 5 slots but 4 initializers; the converters in this section index only [0..3]
+        __constant__ int   c_YUV2RGBCoeffs_i[5] = { U2BI, U2GI, V2GI, V2RI }; // same layout, fixed-point with yuv_shift fractional bits
+
+        template <int bidx, typename T, typename D> static __device__ void YUV2RGBConvert(const T& src, D* dst) // integer (fixed-point) path, writes dst[0..2]
+        {
+            const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[3], yuv_shift);
+
+            const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[2]
+                                             + (src.y - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[1], yuv_shift);
+
+            const int r = src.x + CV_DESCALE((src.y - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[0], yuv_shift);
+
+            dst[bidx] = saturate_cast<D>(b); // results are saturated to D; bidx picks which end channel receives b
+            dst[1] = saturate_cast<D>(g);
+            dst[bidx^2] = saturate_cast<D>(r);
+        }
+
+        template <int bidx> static __device__ uint YUV2RGBConvert(uint src) // packed path: 4x8-bit word in, 4x8-bit word out
+        {
+            const int x = 0xff & (src);
+            const int y = 0xff & (src >> 8);
+            const int z = 0xff & (src >> 16);
+
+            const int b = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[3], yuv_shift);
+
+            const int g = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[2]
+                                         + (y - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[1], yuv_shift);
+
+            const int r = x + CV_DESCALE((y - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[0], yuv_shift);
+
+            uint dst = 0xffu << 24; // byte 3 of the result is always 0xff; byte 3 of src is not read
+
+            dst |= saturate_cast<uchar>(b) << (bidx * 8);
+            dst |= saturate_cast<uchar>(g) << 8;
+            dst |= saturate_cast<uchar>(r) << ((bidx ^ 2) * 8);
+
+            return dst;
+        }
+
+        template <int bidx, typename T> static __device__ __forceinline__ void YUV2RGBConvert(const T& src, float* dst) // float path: same formulas, no scaling or saturation
+        {
+            dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[3];
+
+            dst[1] = src.x + (src.z - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[2]
+                     + (src.y - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[1];
+
+            dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[0];
+        }
+
+        template <typename T, int scn, int dcn, int bidx> struct YUV2RGB // functor wrapper around YUV2RGBConvert
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type dst;
+
+                YUV2RGBConvert<bidx>(src, &dst.x); // fills the three colour channels through a pointer to dst.x
+                setAlpha(dst, ColorChannel<T>::max()); // full-scale alpha; the 3-channel setAlpha overload does nothing
+
+                return dst;
+            }
+            __host__ __device__ __forceinline__ YUV2RGB() {}
+            __host__ __device__ __forceinline__ YUV2RGB(const YUV2RGB&) {}
+        };
+
+        template <int bidx> struct YUV2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint> // 4-to-4 channel uchar case works on packed uints
+        {
+            __device__ __forceinline__ uint operator ()(uint src) const
+            {
+                return YUV2RGBConvert<bidx>(src); // selects the packed-uint overload
+            }
+            __host__ __device__ __forceinline__ YUV2RGB() {}
+            __host__ __device__ __forceinline__ YUV2RGB(const YUV2RGB&) {}
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(name, scn, dcn, bidx) /* declares name_traits<T>: functor_type is YUV2RGB<T, scn, dcn, bidx>, create_functor() default-constructs it */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::YUV2RGB<T, scn, dcn, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
+
+    namespace color_detail
+    {
+        __constant__ float c_RGB2YCrCbCoeffs_f[5] = {R2YF, G2YF, B2YF, YCRF, YCBF}; // [0..2]: luma weights, [3..4]: chroma scale factors (float path)
+        __constant__ int   c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, YCRI, YCBI}; // same layout, fixed-point with yuv_shift fractional bits
+
+        template <int bidx, typename T, typename D> static __device__ void RGB2YCrCbConvert(const T* src, D& dst) // integer (fixed-point) path
+        {
+            const int delta = ColorChannel<T>::half() * (1 << yuv_shift); // mid-range offset, pre-scaled by 2^yuv_shift
+
+            const int Y = CV_DESCALE(src[0] * c_RGB2YCrCbCoeffs_i[bidx^2] + src[1] * c_RGB2YCrCbCoeffs_i[1] + src[2] * c_RGB2YCrCbCoeffs_i[bidx], yuv_shift);
+            const int Cr = CV_DESCALE((src[bidx^2] - Y) * c_RGB2YCrCbCoeffs_i[3] + delta, yuv_shift); // scaled difference to Y, re-centred on half()
+            const int Cb = CV_DESCALE((src[bidx] - Y) * c_RGB2YCrCbCoeffs_i[4] + delta, yuv_shift);
+
+            dst.x = saturate_cast<T>(Y); // results are saturated back to T
+            dst.y = saturate_cast<T>(Cr);
+            dst.z = saturate_cast<T>(Cb);
+        }
+
+        template <int bidx> static __device__ uint RGB2YCrCbConvert(uint src) // packed path: 4x8-bit word in, result in bytes 0..2
+        {
+            const int delta = ColorChannel<uchar>::half() * (1 << yuv_shift);
+
+            const int Y = CV_DESCALE((0xffu & src) * c_RGB2YCrCbCoeffs_i[bidx^2] + (0xffu & (src >> 8)) * c_RGB2YCrCbCoeffs_i[1] + (0xffu & (src >> 16)) * c_RGB2YCrCbCoeffs_i[bidx], yuv_shift);
+            const int Cr = CV_DESCALE(((0xffu & (src >> ((bidx ^ 2) * 8))) - Y) * c_RGB2YCrCbCoeffs_i[3] + delta, yuv_shift); // NOTE(review): the 0xffu mask makes `channel - Y` unsigned arithmetic; appears to rely on wraparound plus delta keeping the sum non-negative - confirm
+            const int Cb = CV_DESCALE(((0xffu & (src >> (bidx * 8))) - Y) * c_RGB2YCrCbCoeffs_i[4] + delta, yuv_shift);
+
+            uint dst = 0; // byte 3 of the result stays 0
+
+            dst |= saturate_cast<uchar>(Y);
+            dst |= saturate_cast<uchar>(Cr) << 8;
+            dst |= saturate_cast<uchar>(Cb) << 16;
+
+            return dst;
+        }
+
+        template <int bidx, typename D> static __device__ __forceinline__ void RGB2YCrCbConvert(const float* src, D& dst) // float path: same formulas, no scaling or saturation
+        {
+            dst.x = src[0] * c_RGB2YCrCbCoeffs_f[bidx^2] + src[1] * c_RGB2YCrCbCoeffs_f[1] + src[2] * c_RGB2YCrCbCoeffs_f[bidx];
+            dst.y = (src[bidx^2] - dst.x) * c_RGB2YCrCbCoeffs_f[3] + ColorChannel<float>::half();
+            dst.z = (src[bidx] - dst.x) * c_RGB2YCrCbCoeffs_f[4] + ColorChannel<float>::half();
+        }
+
+        template <typename T, int scn, int dcn, int bidx> struct RGB2YCrCb // functor wrapper around RGB2YCrCbConvert
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type dst;
+                RGB2YCrCbConvert<bidx>(&src.x, dst); // writes dst.x, dst.y and dst.z only; setAlpha is not called here
+                return dst;
+            }
+            __host__ __device__ __forceinline__ RGB2YCrCb() {}
+            __host__ __device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb&) {}
+        };
+
+        template <int bidx> struct RGB2YCrCb<uchar, 4, 4, bidx> : unary_function<uint, uint> // 4-to-4 channel uchar case works on packed uints
+        {
+            __device__ __forceinline__ uint operator ()(uint src) const
+            {
+                return RGB2YCrCbConvert<bidx>(src); // selects the packed-uint overload
+            }
+
+            __host__ __device__ __forceinline__ RGB2YCrCb() {}
+            __host__ __device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb&) {}
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(name, scn, dcn, bidx) /* declares name_traits<T>: functor_type is RGB2YCrCb<T, scn, dcn, bidx>, create_functor() default-constructs it */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2YCrCb<T, scn, dcn, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        __constant__ float c_YCrCb2RGBCoeffs_f[5] = {CR2RF, CR2GF, CB2GF, CB2BF}; // declared with 5 slots but 4 initializers; the converters in this section index only [0..3]
+        __constant__ int   c_YCrCb2RGBCoeffs_i[5] = {CR2RI, CR2GI, CB2GI, CB2BI}; // same layout, fixed-point with yuv_shift fractional bits
+
+        template <int bidx, typename T, typename D> static __device__ void YCrCb2RGBConvert(const T& src, D* dst)
+        {
+            const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[3], yuv_shift);
+            const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[2] + (src.y - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[1], yuv_shift);
+            const int r = src.x + CV_DESCALE((src.y - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[0], yuv_shift);
+
+            dst[bidx] = saturate_cast<D>(b);
+            dst[1] = saturate_cast<D>(g);
+            dst[bidx^2] = saturate_cast<D>(r);
+        }
+
+        template <int bidx> static __device__ uint YCrCb2RGBConvert(uint src) // packed variant: three 8-bit inputs read from one uint
+        {
+            const int x = 0xff & (src); // bits 0..7
+            const int y = 0xff & (src >> 8); // bits 8..15
+            const int z = 0xff & (src >> 16); // bits 16..23
+
+            const int b = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[3], yuv_shift); // same formulas as the generic integer overload
+            const int g = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[2] + (y - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[1], yuv_shift);
+            const int r = x + CV_DESCALE((y - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[0], yuv_shift);
+
+            uint dst = 0xffu << 24; // top byte of the result is always 0xff
+
+            dst |= saturate_cast<uchar>(b) << (bidx * 8); // blue goes to byte bidx
+            dst |= saturate_cast<uchar>(g) << 8; // green goes to byte 1
+            dst |= saturate_cast<uchar>(r) << ((bidx ^ 2) * 8); // red goes to byte bidx^2
+
+            return dst;
+        }
+
+        template <int bidx, typename T> __device__ __forceinline__ void YCrCb2RGBConvert(const T& src, float* dst) // float path: no descaling and no saturation
+        {
+            dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[3]; // blue
+            dst[1] = src.x + (src.z - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[2] + (src.y - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[1]; // green
+            dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[0]; // red
+        }
+
+        // Per-pixel functor: YCrCb input with scn channels -> RGB/BGR output with dcn channels (blue slot chosen by bidx).
+        template <typename T, int scn, int dcn, int bidx> struct YCrCb2RGB
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ YCrCb2RGB() {}
+            __host__ __device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB&) {}
+
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type rgb;
+                YCrCb2RGBConvert<bidx>(src, &rgb.x); // writes the three colour components
+                setAlpha(rgb, ColorChannel<T>::max()); // alpha handling is delegated to setAlpha
+                return rgb;
+            }
+        };
+
+        // Specialization for 8-bit, 4-channel input and output: the pixel travels as one packed uint.
+        template <int bidx> struct YCrCb2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
+        {
+            __host__ __device__ __forceinline__ YCrCb2RGB() {}
+            __host__ __device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB&) {}
+
+            // Forwards to the packed YCrCb2RGBConvert overload.
+            __device__ __forceinline__ uint operator ()(uint src) const { return YCrCb2RGBConvert<bidx>(src); }
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(name, scn, dcn, bidx) /* defines template struct <name>_traits<T> */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::YCrCb2RGB<T, scn, dcn, bidx> functor_type; /* functor chosen for element type T */ \
+        static __host__ __device__ __forceinline__ functor_type create_functor() /* returns a default-constructed functor */ \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
+
+    namespace color_detail
+    {
+        __constant__ float c_RGB2XYZ_D65f[9] = { 0.412453f, 0.357580f, 0.180423f, 0.212671f, 0.715160f, 0.072169f, 0.019334f, 0.119193f, 0.950227f }; // 3x3 matrix, row by row: entries 0..2 give X, 3..5 give Y, 6..8 give Z from (R, G, B)
+        __constant__ int   c_RGB2XYZ_D65i[9] = { 1689, 1465, 739, 871, 2929, 296, 79, 488, 3892 }; // integer counterpart, applied together with CV_DESCALE(..., xyz_shift)
+
+        template <int bidx, typename T, typename D> static __device__ __forceinline__ void RGB2XYZConvert(const T* src, D& dst) // integer path; src[bidx^2], src[1], src[bidx] are red, green, blue
+        {
+            dst.z = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[6] + src[1] * c_RGB2XYZ_D65i[7] + src[bidx] * c_RGB2XYZ_D65i[8], xyz_shift)); // Z uses matrix entries 6..8
+            dst.x = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[0] + src[1] * c_RGB2XYZ_D65i[1] + src[bidx] * c_RGB2XYZ_D65i[2], xyz_shift)); // X uses matrix entries 0..2
+            dst.y = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[3] + src[1] * c_RGB2XYZ_D65i[4] + src[bidx] * c_RGB2XYZ_D65i[5], xyz_shift)); // Y uses matrix entries 3..5
+        }
+
+        template <int bidx> static __device__ __forceinline__ uint RGB2XYZConvert(uint src) // packed variant: three 8-bit inputs read from one uint
+        {
+            const uint b = 0xffu & (src >> (bidx * 8)); // blue is byte bidx
+            const uint g = 0xffu & (src >> 8); // green is byte 1
+            const uint r = 0xffu & (src >> ((bidx ^ 2) * 8)); // red is byte bidx^2
+
+            const uint x = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[0] + g * c_RGB2XYZ_D65i[1] + b * c_RGB2XYZ_D65i[2], xyz_shift));
+            const uint y = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[3] + g * c_RGB2XYZ_D65i[4] + b * c_RGB2XYZ_D65i[5], xyz_shift));
+            const uint z = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[6] + g * c_RGB2XYZ_D65i[7] + b * c_RGB2XYZ_D65i[8], xyz_shift));
+
+            uint dst = 0; // starts from 0, so the top byte of the result stays 0
+
+            dst |= x; // X in byte 0
+            dst |= y << 8; // Y in byte 1
+            dst |= z << 16; // Z in byte 2
+
+            return dst;
+        }
+
+        template <int bidx, typename D> static __device__ __forceinline__ void RGB2XYZConvert(const float* src, D& dst) // float path: plain matrix product, no descaling or saturation
+        {
+            dst.x = src[bidx^2] * c_RGB2XYZ_D65f[0] + src[1] * c_RGB2XYZ_D65f[1] + src[bidx] * c_RGB2XYZ_D65f[2]; // X
+            dst.y = src[bidx^2] * c_RGB2XYZ_D65f[3] + src[1] * c_RGB2XYZ_D65f[4] + src[bidx] * c_RGB2XYZ_D65f[5]; // Y
+            dst.z = src[bidx^2] * c_RGB2XYZ_D65f[6] + src[1] * c_RGB2XYZ_D65f[7] + src[bidx] * c_RGB2XYZ_D65f[8]; // Z
+        }
+
+        // Per-pixel functor: RGB/BGR input with scn channels -> XYZ output with dcn channels; only x, y, z are written.
+        template <typename T, int scn, int dcn, int bidx> struct RGB2XYZ
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ RGB2XYZ() {}
+            __host__ __device__ __forceinline__ RGB2XYZ(const RGB2XYZ&) {}
+
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type xyz;
+                RGB2XYZConvert<bidx>(&src.x, xyz); // source is handed over as a pointer to its first component
+                return xyz;
+            }
+        };
+
+        // Specialization for 8-bit, 4-channel input and output: the pixel travels as one packed uint.
+        template <int bidx> struct RGB2XYZ<uchar, 4, 4, bidx> : unary_function<uint, uint>
+        {
+            __host__ __device__ __forceinline__ RGB2XYZ() {}
+            __host__ __device__ __forceinline__ RGB2XYZ(const RGB2XYZ&) {}
+
+            // Forwards to the packed RGB2XYZConvert overload.
+            __device__ __forceinline__ uint operator()(uint src) const { return RGB2XYZConvert<bidx>(src); }
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(name, scn, dcn, bidx) /* defines template struct <name>_traits<T> */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2XYZ<T, scn, dcn, bidx> functor_type; /* functor chosen for element type T */ \
+        static __host__ __device__ __forceinline__ functor_type create_functor() /* returns a default-constructed functor */ \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        __constant__ float c_XYZ2sRGB_D65f[9] = { 3.240479f, -1.53715f, -0.498535f, -0.969256f, 1.875991f, 0.041556f, 0.055648f, -0.204043f, 1.057311f }; // 3x3 matrix, row by row: entries 0..2 give R, 3..5 give G, 6..8 give B from (X, Y, Z)
+        __constant__ int   c_XYZ2sRGB_D65i[9] = { 13273, -6296, -2042, -3970, 7684, 170, 228, -836, 4331 }; // integer counterpart, applied together with CV_DESCALE(..., xyz_shift)
+
+        template <int bidx, typename T, typename D> static __device__ __forceinline__ void XYZ2RGBConvert(const T& src, D* dst) // integer path: src.x/.y/.z -> dst[0..2]
+        {
+            dst[bidx^2] = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[0] + src.y * c_XYZ2sRGB_D65i[1] + src.z * c_XYZ2sRGB_D65i[2], xyz_shift)); // red at index bidx^2
+            dst[1]      = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[3] + src.y * c_XYZ2sRGB_D65i[4] + src.z * c_XYZ2sRGB_D65i[5], xyz_shift)); // green at index 1
+            dst[bidx]   = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[6] + src.y * c_XYZ2sRGB_D65i[7] + src.z * c_XYZ2sRGB_D65i[8], xyz_shift)); // blue at index bidx
+        }
+
+        template <int bidx> static __device__ __forceinline__ uint XYZ2RGBConvert(uint src) // packed variant: three 8-bit inputs read from one uint
+        {
+            const int x = 0xff & src; // bits 0..7
+            const int y = 0xff & (src >> 8); // bits 8..15
+            const int z = 0xff & (src >> 16); // bits 16..23
+
+            const uint r = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[0] + y * c_XYZ2sRGB_D65i[1] + z * c_XYZ2sRGB_D65i[2], xyz_shift));
+            const uint g = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[3] + y * c_XYZ2sRGB_D65i[4] + z * c_XYZ2sRGB_D65i[5], xyz_shift));
+            const uint b = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[6] + y * c_XYZ2sRGB_D65i[7] + z * c_XYZ2sRGB_D65i[8], xyz_shift));
+
+            uint dst = 0xffu << 24; // top byte of the result is always 0xff
+
+            dst |= b << (bidx * 8); // blue goes to byte bidx
+            dst |= g << 8; // green goes to byte 1
+            dst |= r << ((bidx ^ 2) * 8); // red goes to byte bidx^2
+
+            return dst;
+        }
+
+        template <int bidx, typename T> static __device__ __forceinline__ void XYZ2RGBConvert(const T& src, float* dst) // float path: plain matrix product, no descaling or saturation
+        {
+            dst[bidx^2] = src.x * c_XYZ2sRGB_D65f[0] + src.y * c_XYZ2sRGB_D65f[1] + src.z * c_XYZ2sRGB_D65f[2]; // red
+            dst[1]      = src.x * c_XYZ2sRGB_D65f[3] + src.y * c_XYZ2sRGB_D65f[4] + src.z * c_XYZ2sRGB_D65f[5]; // green
+            dst[bidx]   = src.x * c_XYZ2sRGB_D65f[6] + src.y * c_XYZ2sRGB_D65f[7] + src.z * c_XYZ2sRGB_D65f[8]; // blue
+        }
+
+        // Per-pixel functor: XYZ input with scn channels -> RGB/BGR output with dcn channels (blue slot chosen by bidx).
+        template <typename T, int scn, int dcn, int bidx> struct XYZ2RGB
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ XYZ2RGB() {}
+            __host__ __device__ __forceinline__ XYZ2RGB(const XYZ2RGB&) {}
+
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type rgb;
+                XYZ2RGBConvert<bidx>(src, &rgb.x); // writes the three colour components
+                setAlpha(rgb, ColorChannel<T>::max()); // alpha handling is delegated to setAlpha
+                return rgb;
+            }
+        };
+
+        // Specialization for 8-bit, 4-channel input and output: the pixel travels as one packed uint.
+        template <int bidx> struct XYZ2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
+        {
+            __host__ __device__ __forceinline__ XYZ2RGB() {}
+            __host__ __device__ __forceinline__ XYZ2RGB(const XYZ2RGB&) {}
+
+            // Forwards to the packed XYZ2RGBConvert overload.
+            __device__ __forceinline__ uint operator()(uint src) const { return XYZ2RGBConvert<bidx>(src); }
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(name, scn, dcn, bidx) /* defines template struct <name>_traits<T> */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::XYZ2RGB<T, scn, dcn, bidx> functor_type; /* functor chosen for element type T */ \
+        static __host__ __device__ __forceinline__ functor_type create_functor() /* returns a default-constructed functor */ \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
+
+    namespace color_detail
+    {
+        __constant__ int c_HsvDivTable   [256] = {0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211, 130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632, 65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412, 43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693, 32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782, 26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223, 21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991, 18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579, 16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711, 14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221, 13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006, 11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995, 10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141, 10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410, 9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777, 8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224, 8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737, 7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304, 7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917, 6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569, 6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254, 6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968, 5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708, 5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468, 5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249, 5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046, 5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858, 4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684, 4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522, 4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370, 4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229, 4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096};
+        __constant__ int c_HsvDivTable180[256] = {0, 122880, 61440, 40960, 30720, 24576, 20480, 17554, 15360, 13653, 12288, 11171, 10240, 9452, 8777, 8192, 7680, 7228, 6827, 6467, 6144, 5851, 5585, 5343, 5120, 4915, 4726, 4551, 4389, 4237, 4096, 3964, 3840, 3724, 3614, 3511, 3413, 3321, 3234, 3151, 3072, 2997, 2926, 2858, 2793, 2731, 2671, 2614, 2560, 2508, 2458, 2409, 2363, 2318, 2276, 2234, 2194, 2156, 2119, 2083, 2048, 2014, 1982, 1950, 1920, 1890, 1862, 1834, 1807, 1781, 1755, 1731, 1707, 1683, 1661, 1638, 1617, 1596, 1575, 1555, 1536, 1517, 1499, 1480, 1463, 1446, 1429, 1412, 1396, 1381, 1365, 1350, 1336, 1321, 1307, 1293, 1280, 1267, 1254, 1241, 1229, 1217, 1205, 1193, 1182, 1170, 1159, 1148, 1138, 1127, 1117, 1107, 1097, 1087, 1078, 1069, 1059, 1050, 1041, 1033, 1024, 1016, 1007, 999, 991, 983, 975, 968, 960, 953, 945, 938, 931, 924, 917, 910, 904, 897, 890, 884, 878, 871, 865, 859, 853, 847, 842, 836, 830, 825, 819, 814, 808, 803, 798, 793, 788, 783, 778, 773, 768, 763, 759, 754, 749, 745, 740, 736, 731, 727, 723, 719, 714, 710, 706, 702, 698, 694, 690, 686, 683, 679, 675, 671, 668, 664, 661, 657, 654, 650, 647, 643, 640, 637, 633, 630, 627, 624, 621, 617, 614, 611, 608, 605, 602, 599, 597, 594, 591, 588, 585, 582, 580, 577, 574, 572, 569, 566, 564, 561, 559, 556, 554, 551, 549, 546, 544, 541, 539, 537, 534, 532, 530, 527, 525, 523, 521, 518, 516, 514, 512, 510, 508, 506, 504, 502, 500, 497, 495, 493, 492, 490, 488, 486, 484, 482};
+        __constant__ int c_HsvDivTable256[256] = {0, 174763, 87381, 58254, 43691, 34953, 29127, 24966, 21845, 19418, 17476, 15888, 14564, 13443, 12483, 11651, 10923, 10280, 9709, 9198, 8738, 8322, 7944, 7598, 7282, 6991, 6722, 6473, 6242, 6026, 5825, 5638, 5461, 5296, 5140, 4993, 4855, 4723, 4599, 4481, 4369, 4263, 4161, 4064, 3972, 3884, 3799, 3718, 3641, 3567, 3495, 3427, 3361, 3297, 3236, 3178, 3121, 3066, 3013, 2962, 2913, 2865, 2819, 2774, 2731, 2689, 2648, 2608, 2570, 2533, 2497, 2461, 2427, 2394, 2362, 2330, 2300, 2270, 2241, 2212, 2185, 2158, 2131, 2106, 2081, 2056, 2032, 2009, 1986, 1964, 1942, 1920, 1900, 1879, 1859, 1840, 1820, 1802, 1783, 1765, 1748, 1730, 1713, 1697, 1680, 1664, 1649, 1633, 1618, 1603, 1589, 1574, 1560, 1547, 1533, 1520, 1507, 1494, 1481, 1469, 1456, 1444, 1432, 1421, 1409, 1398, 1387, 1376, 1365, 1355, 1344, 1334, 1324, 1314, 1304, 1295, 1285, 1276, 1266, 1257, 1248, 1239, 1231, 1222, 1214, 1205, 1197, 1189, 1181, 1173, 1165, 1157, 1150, 1142, 1135, 1128, 1120, 1113, 1106, 1099, 1092, 1085, 1079, 1072, 1066, 1059, 1053, 1046, 1040, 1034, 1028, 1022, 1016, 1010, 1004, 999, 993, 987, 982, 976, 971, 966, 960, 955, 950, 945, 940, 935, 930, 925, 920, 915, 910, 906, 901, 896, 892, 887, 883, 878, 874, 869, 865, 861, 857, 853, 848, 844, 840, 836, 832, 828, 824, 820, 817, 813, 809, 805, 802, 798, 794, 791, 787, 784, 780, 777, 773, 770, 767, 763, 760, 757, 753, 750, 747, 744, 741, 737, 734, 731, 728, 725, 722, 719, 716, 713, 710, 708, 705, 702, 699, 696, 694, 691, 688, 685};
+        // Tables above: rounded reciprocals with entry 0 set to 0; entry i is about 1044480/i, 122880/i and 174763/i respectively. RGB2HSVConvert multiplies by an entry and shifts right by hsv_shift instead of dividing.
+        template <int bidx, int hr, typename D> static __device__ void RGB2HSVConvert(const uchar* src, D& dst) // 8-bit path: integer math with reciprocal tables; hr is the hue range
+        {
+            const int hsv_shift = 12; // right shift applied after each table multiplication
+            const int* hdiv_table = hr == 180 ? c_HsvDivTable180 : c_HsvDivTable256; // any hr other than 180 uses the 256 table
+
+            int b = src[bidx], g = src[1], r = src[bidx^2];
+            int h, s, v = b;
+            int vmin = b, diff;
+            int vr, vg;
+
+            v = ::max(v, g); // v ends up as max(b, g, r)
+            v = ::max(v, r);
+            vmin = ::min(vmin, g); // vmin ends up as min(b, g, r)
+            vmin = ::min(vmin, r);
+
+            diff = v - vmin;
+            vr = (v == r) * -1; // all-ones mask when red is the maximum, otherwise 0
+            vg = (v == g) * -1; // all-ones mask when green is the maximum, otherwise 0
+
+            s = (diff * c_HsvDivTable[v] + (1 << (hsv_shift-1))) >> hsv_shift; // adding half before the shift rounds to nearest
+            h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); // branch-free select: red max, else green max, else blue max
+            h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
+            h += (h < 0) * hr; // wrap a negative hue into the positive range
+
+            dst.x = saturate_cast<uchar>(h);
+            dst.y = (uchar)s;
+            dst.z = (uchar)v;
+        }
+
+        template <int bidx, int hr> static __device__ uint RGB2HSVConvert(uint src) // packed variant of the 8-bit path: one pixel per uint
+        {
+            const int hsv_shift = 12; // right shift applied after each table multiplication
+            const int* hdiv_table = hr == 180 ? c_HsvDivTable180 : c_HsvDivTable256; // any hr other than 180 uses the 256 table
+
+            const int b = 0xff & (src >> (bidx * 8)); // blue is byte bidx
+            const int g = 0xff & (src >> 8); // green is byte 1
+            const int r = 0xff & (src >> ((bidx ^ 2) * 8)); // red is byte bidx^2
+
+            int h, s, v = b;
+            int vmin = b, diff;
+            int vr, vg;
+
+            v = ::max(v, g); // v ends up as max(b, g, r)
+            v = ::max(v, r);
+            vmin = ::min(vmin, g); // vmin ends up as min(b, g, r)
+            vmin = ::min(vmin, r);
+
+            diff = v - vmin;
+            vr = (v == r) * -1; // all-ones mask when red is the maximum, otherwise 0
+            vg = (v == g) * -1; // all-ones mask when green is the maximum, otherwise 0
+
+            s = (diff * c_HsvDivTable[v] + (1 << (hsv_shift-1))) >> hsv_shift; // adding half before the shift rounds to nearest
+            h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); // branch-free select: red max, else green max, else blue max
+            h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
+            h += (h < 0) * hr; // wrap a negative hue into the positive range
+
+            uint dst = 0; // starts from 0, so the top byte of the result stays 0
+
+            dst |= saturate_cast<uchar>(h); // hue in byte 0
+            dst |= (0xffu & s) << 8; // saturation in byte 1
+            dst |= (0xffu & v) << 16; // value in byte 2
+
+            return dst;
+        }
+
+        template <int bidx, int hr, typename D> static __device__ void RGB2HSVConvert(const float* src, D& dst) // float path; hue is computed in degrees and then scaled by hr/360
+        {
+            const float hscale = hr * (1.f / 360.f);
+
+            float b = src[bidx], g = src[1], r = src[bidx^2];
+            float h, s, v;
+
+            float vmin, diff;
+
+            v = vmin = r;
+            v = fmax(v, g); // v ends up as max(r, g, b)
+            v = fmax(v, b);
+            vmin = fmin(vmin, g); // vmin ends up as min(r, g, b)
+            vmin = fmin(vmin, b);
+
+            diff = v - vmin;
+            s = diff / (float)(::fabs(v) + numeric_limits<float>::epsilon()); // epsilon keeps the divisor non-zero when v == 0
+            diff = (float)(60. / (diff + numeric_limits<float>::epsilon())); // diff is reused as 60 / range; 60. is a double literal, so this division is done in double
+
+            h  = (v == r) * (g - b) * diff; // exactly one of these three terms is non-zero
+            h += (v != r && v == g) * ((b - r) * diff + 120.f);
+            h += (v != r && v != g) * ((r - g) * diff + 240.f);
+            h += (h < 0) * 360.f; // wrap a negative hue into the positive range
+
+            dst.x = h * hscale;
+            dst.y = s;
+            dst.z = v;
+        }
+
+        // Per-pixel functor: RGB/BGR input with scn channels -> HSV output with dcn channels; hr is the hue range.
+        template <typename T, int scn, int dcn, int bidx, int hr> struct RGB2HSV
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ RGB2HSV() {}
+            __host__ __device__ __forceinline__ RGB2HSV(const RGB2HSV&) {}
+
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type hsv;
+                RGB2HSVConvert<bidx, hr>(&src.x, hsv); // source is handed over as a pointer to its first component
+                return hsv;
+            }
+        };
+
+        // Specialization for 8-bit, 4-channel input and output: the pixel travels as one packed uint.
+        template <int bidx, int hr> struct RGB2HSV<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
+        {
+            __host__ __device__ __forceinline__ RGB2HSV() {}
+            __host__ __device__ __forceinline__ RGB2HSV(const RGB2HSV&) {}
+
+            // Forwards to the packed RGB2HSVConvert overload.
+            __device__ __forceinline__ uint operator()(uint src) const { return RGB2HSVConvert<bidx, hr>(src); }
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(name, scn, dcn, bidx) /* defines <name>_traits and <name>_full_traits, plus a float specialization of each */ \
+    template <typename T> struct name ## _traits /* generic T: hue range 180 */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2HSV<T, scn, dcn, bidx, 180> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <typename T> struct name ## _full_traits /* generic T: hue range 256 */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2HSV<T, scn, dcn, bidx, 256> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _traits<float> /* float: hue range 360 */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2HSV<float, scn, dcn, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _full_traits<float> /* float: hue range 360 here as well */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2HSV<float, scn, dcn, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        __constant__ int c_HsvSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} }; // one row per hue sector 0..5; columns are the tab[] indices used for b, g, r in HSV2RGBConvert
+
+        template <int bidx, int hr, typename T> static __device__ void HSV2RGBConvert(const T& src, float* dst) // float path; src.x/.y/.z are hue, saturation, value and hr is the hue range
+        {
+            const float hscale = 6.f / hr; // maps the hue range onto six sectors
+
+            float h = src.x, s = src.y, v = src.z; // src is fully read here, before dst is written at the end
+            float b = v, g = v, r = v; // result when s == 0: all three channels equal v
+
+            if (s != 0)
+            {
+                h *= hscale;
+
+                if( h < 0 ) // bring h into [0, 6) by repeatedly adding or subtracting 6
+                    do h += 6; while( h < 0 );
+                else if( h >= 6 )
+                    do h -= 6; while( h >= 6 );
+
+                int sector = __float2int_rd(h); // round-down float-to-int intrinsic
+                h -= sector; // h becomes the fractional position inside the sector
+
+                if ( (unsigned)sector >= 6u ) // keeps the c_HsvSectorData row index within 0..5
+                {
+                    sector = 0;
+                    h = 0.f;
+                }
+
+                float tab[4]; // the four candidate channel values for this pixel
+                tab[0] = v;
+                tab[1] = v * (1.f - s);
+                tab[2] = v * (1.f - s * h);
+                tab[3] = v * (1.f - s * (1.f - h));
+
+                b = tab[c_HsvSectorData[sector][0]]; // the sector row selects which candidate each channel takes
+                g = tab[c_HsvSectorData[sector][1]];
+                r = tab[c_HsvSectorData[sector][2]];
+            }
+
+            dst[bidx] = b; // blue at index bidx
+            dst[1] = g;
+            dst[bidx^2] = r; // red at index bidx^2
+        }
+
+        template <int bidx, int HR, typename T> static __device__ void HSV2RGBConvert(const T& src, uchar* dst) // 8-bit path: rescales to float, reuses the float overload, rescales back
+        {
+            float3 buf;
+
+            buf.x = src.x; // hue is passed through unscaled
+            buf.y = src.y * (1.f / 255.f); // saturation scaled by 1/255
+            buf.z = src.z * (1.f / 255.f); // value scaled by 1/255
+
+            HSV2RGBConvert<bidx, HR>(buf, &buf.x); // buf is both input and output; the float overload reads all inputs before writing
+
+            dst[0] = saturate_cast<uchar>(buf.x * 255.f); // channel order was already applied by the float overload
+            dst[1] = saturate_cast<uchar>(buf.y * 255.f);
+            dst[2] = saturate_cast<uchar>(buf.z * 255.f);
+        }
+
+        template <int bidx, int hr> static __device__ uint HSV2RGBConvert(uint src) // packed variant: one pixel per uint, converted through the float overload
+        {
+            float3 buf;
+
+            buf.x = src & 0xff; // hue from byte 0, unscaled
+            buf.y = ((src >> 8) & 0xff) * (1.f/255.f); // saturation from byte 1, scaled by 1/255
+            buf.z = ((src >> 16) & 0xff) * (1.f/255.f); // value from byte 2, scaled by 1/255
+
+            HSV2RGBConvert<bidx, hr>(buf, &buf.x); // buf is both input and output; the float overload reads all inputs before writing
+
+            uint dst = 0xffu << 24; // top byte of the result is always 0xff
+
+            dst |= saturate_cast<uchar>(buf.x * 255.f); // channel order was already applied by the float overload
+            dst |= saturate_cast<uchar>(buf.y * 255.f) << 8;
+            dst |= saturate_cast<uchar>(buf.z * 255.f) << 16;
+
+            return dst;
+        }
+
+        // Per-pixel functor: HSV input with scn channels -> RGB/BGR output with dcn channels; hr is the hue range.
+        template <typename T, int scn, int dcn, int bidx, int hr> struct HSV2RGB
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ HSV2RGB() {}
+            __host__ __device__ __forceinline__ HSV2RGB(const HSV2RGB&) {}
+
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type rgb;
+                HSV2RGBConvert<bidx, hr>(src, &rgb.x); // writes the three colour components
+                setAlpha(rgb, ColorChannel<T>::max()); // alpha handling is delegated to setAlpha
+                return rgb;
+            }
+        };
+
+        // Specialization for 8-bit, 4-channel input and output: the pixel travels as one packed uint.
+        template <int bidx, int hr> struct HSV2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
+        {
+            __host__ __device__ __forceinline__ HSV2RGB() {}
+            __host__ __device__ __forceinline__ HSV2RGB(const HSV2RGB&) {}
+
+            // Forwards to the packed HSV2RGBConvert overload.
+            __device__ __forceinline__ uint operator()(uint src) const { return HSV2RGBConvert<bidx, hr>(src); }
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(name, scn, dcn, bidx) /* defines <name>_traits and <name>_full_traits, plus a float specialization of each */ \
+    template <typename T> struct name ## _traits /* generic T: hue range 180 */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::HSV2RGB<T, scn, dcn, bidx, 180> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <typename T> struct name ## _full_traits /* generic T: hue range 255. NOTE(review): the RGB2HSV macro uses 256 for its full range - confirm the asymmetry is intended */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::HSV2RGB<T, scn, dcn, bidx, 255> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _traits<float> /* float: hue range 360 */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::HSV2RGB<float, scn, dcn, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _full_traits<float> /* float: hue range 360 here as well */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::HSV2RGB<float, scn, dcn, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+/////////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
+
+    namespace color_detail
+    {
+        template <int bidx, int hr, typename D> static __device__ void RGB2HLSConvert(const float* src, D& dst) // float path; writes hue, lightness, saturation to dst.x/.y/.z
+        {
+            const float hscale = hr * (1.f / 360.f); // hue is computed in degrees and then scaled by hr/360
+
+            float b = src[bidx], g = src[1], r = src[bidx^2]; // src is fully read here, before dst is written at the end
+            float h = 0.f, s = 0.f, l; // h and s stay 0 when the channel range is not above epsilon
+            float vmin, vmax, diff;
+
+            vmax = vmin = r;
+            vmax = fmax(vmax, g); // vmax ends up as max(r, g, b)
+            vmax = fmax(vmax, b);
+            vmin = fmin(vmin, g); // vmin ends up as min(r, g, b)
+            vmin = fmin(vmin, b);
+
+            diff = vmax - vmin;
+            l = (vmax + vmin) * 0.5f;
+
+            if (diff > numeric_limits<float>::epsilon())
+            {
+                s = (l < 0.5f) * diff / (vmax + vmin); // exactly one of the two saturation terms is non-zero
+                s += (l >= 0.5f) * diff / (2.0f - vmax - vmin);
+
+                diff = 60.f / diff; // diff is reused as 60 / range
+
+                h  = (vmax == r) * (g - b) * diff; // exactly one of these three terms is non-zero
+                h += (vmax != r && vmax == g) * ((b - r) * diff + 120.f);
+                h += (vmax != r && vmax != g) * ((r - g) * diff + 240.f);
+                h += (h < 0.f) * 360.f; // wrap a negative hue into the positive range
+            }
+
+            dst.x = h * hscale;
+            dst.y = l;
+            dst.z = s;
+        }
+
+        template <int bidx, int hr, typename D> static __device__ void RGB2HLSConvert(const uchar* src, D& dst) // 8-bit path: rescales to float, reuses the float overload, rescales back
+        {
+            float3 buf;
+
+            buf.x = src[0] * (1.f / 255.f); // channels are copied in memory order; the float overload applies bidx
+            buf.y = src[1] * (1.f / 255.f);
+            buf.z = src[2] * (1.f / 255.f);
+
+            RGB2HLSConvert<bidx, hr>(&buf.x, buf); // buf is both input and output; the float overload reads all inputs before writing
+
+            dst.x = saturate_cast<uchar>(buf.x); // hue is stored without the 255 rescale
+            dst.y = saturate_cast<uchar>(buf.y*255.f); // lightness scaled by 255
+            dst.z = saturate_cast<uchar>(buf.z*255.f); // saturation scaled by 255
+        }
+
+        template <int bidx, int hr> static __device__ uint RGB2HLSConvert(uint src) // packed variant: one pixel per uint, converted through the float overload
+        {
+            float3 buf;
+
+            buf.x = (0xff & src) * (1.f / 255.f); // bytes 0..2 are copied in order; the float overload applies bidx
+            buf.y = (0xff & (src >> 8)) * (1.f / 255.f);
+            buf.z = (0xff & (src >> 16)) * (1.f / 255.f);
+
+            RGB2HLSConvert<bidx, hr>(&buf.x, buf); // buf is both input and output; the float overload reads all inputs before writing
+
+            uint dst = 0xffu << 24; // top byte of the result is always 0xff
+
+            dst |= saturate_cast<uchar>(buf.x); // hue in byte 0, without the 255 rescale
+            dst |= saturate_cast<uchar>(buf.y * 255.f) << 8; // lightness in byte 1
+            dst |= saturate_cast<uchar>(buf.z * 255.f) << 16; // saturation in byte 2
+
+            return dst;
+        }
+
+        // Per-pixel functor: RGB/BGR input with scn channels -> HLS output with dcn channels; hr is the hue range.
+        template <typename T, int scn, int dcn, int bidx, int hr> struct RGB2HLS
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ RGB2HLS() {}
+            __host__ __device__ __forceinline__ RGB2HLS(const RGB2HLS&) {}
+
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type hls;
+                RGB2HLSConvert<bidx, hr>(&src.x, hls); // source is handed over as a pointer to its first component
+                return hls;
+            }
+        };
+
+        // Specialization for 8-bit, 4-channel input and output: the pixel travels as one packed uint.
+        template <int bidx, int hr> struct RGB2HLS<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
+        {
+            __host__ __device__ __forceinline__ RGB2HLS() {}
+            __host__ __device__ __forceinline__ RGB2HLS(const RGB2HLS&) {}
+
+            // Forwards to the packed RGB2HLSConvert overload.
+            __device__ __forceinline__ uint operator()(uint src) const { return RGB2HLSConvert<bidx, hr>(src); }
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(name, scn, dcn, bidx) /* defines <name>_traits and <name>_full_traits, plus a float specialization of each */ \
+    template <typename T> struct name ## _traits /* generic T: hue range 180 */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2HLS<T, scn, dcn, bidx, 180> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <typename T> struct name ## _full_traits /* generic T: hue range 256 */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2HLS<T, scn, dcn, bidx, 256> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _traits<float> /* float: hue range 360 */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2HLS<float, scn, dcn, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _full_traits<float> /* float: hue range 360 here as well */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2HLS<float, scn, dcn, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        __constant__ int c_HlsSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} }; // one row per hue sector 0..5; columns are the tab[] indices used for b, g, r in HLS2RGBConvert
+
+        template <int bidx, int hr, typename T> static __device__ void HLS2RGBConvert(const T& src, float* dst) // float path; src.x/.y/.z are hue, lightness, saturation and hr is the hue range
+        {
+            const float hscale = 6.0f / hr; // maps the hue range onto six sectors
+
+            float h = src.x, l = src.y, s = src.z; // src is fully read here, before dst is written at the end
+            float b = l, g = l, r = l; // result when s == 0: all three channels equal l
+
+            if (s != 0)
+            {
+                float p2  = (l <= 0.5f) * l * (1 + s); // exactly one of the two p2 terms is non-zero
+                      p2 += (l > 0.5f) * (l + s - l * s);
+                float p1 = 2 * l - p2;
+
+                h *= hscale;
+
+                if( h < 0 ) // bring h into [0, 6) by repeatedly adding or subtracting 6
+                    do h += 6; while( h < 0 );
+                else if( h >= 6 )
+                    do h -= 6; while( h >= 6 );
+
+                int sector = __float2int_rd(h); // round-down float-to-int intrinsic
+                h -= sector; // h becomes the fractional position inside the sector
+                // A tiny negative h can round to exactly 6 after "h += 6"; clamp so c_HlsSectorData is never indexed at row 6 (same guard as HSV2RGBConvert).
+                if ( (unsigned)sector >= 6u ) { sector = 0; h = 0.f; }
+
+                float tab[4]; // the four candidate channel values for this pixel
+                tab[0] = p2;
+                tab[1] = p1;
+                tab[2] = p1 + (p2 - p1) * (1 - h);
+                tab[3] = p1 + (p2 - p1) * h;
+
+                b = tab[c_HlsSectorData[sector][0]]; // the sector row selects which candidate each channel takes
+                g = tab[c_HlsSectorData[sector][1]];
+                r = tab[c_HlsSectorData[sector][2]];
+            }
+
+            dst[bidx] = b; // blue at index bidx
+            dst[1] = g;
+            dst[bidx^2] = r; // red at index bidx^2
+        }
+
+        template <int bidx, int hr, typename T> static __device__ void HLS2RGBConvert(const T& src, uchar* dst) // 8-bit path: rescales to float, reuses the float overload, rescales back
+        {
+            float3 buf;
+
+            buf.x = src.x; // hue is passed through unscaled
+            buf.y = src.y * (1.f / 255.f); // lightness scaled by 1/255
+            buf.z = src.z * (1.f / 255.f); // saturation scaled by 1/255
+
+            HLS2RGBConvert<bidx, hr>(buf, &buf.x); // buf is both input and output; the float overload reads all inputs before writing
+
+            dst[0] = saturate_cast<uchar>(buf.x * 255.f); // channel order was already applied by the float overload
+            dst[1] = saturate_cast<uchar>(buf.y * 255.f);
+            dst[2] = saturate_cast<uchar>(buf.z * 255.f);
+        }
+
+        template <int bidx, int hr> static __device__ uint HLS2RGBConvert(uint src) // packed variant: one pixel per uint, converted through the float overload
+        {
+            float3 buf;
+
+            buf.x = 0xff & src; // hue from byte 0, unscaled
+            buf.y = (0xff & (src >> 8)) * (1.f / 255.f); // lightness from byte 1, scaled by 1/255
+            buf.z = (0xff & (src >> 16)) * (1.f / 255.f); // saturation from byte 2, scaled by 1/255
+
+            HLS2RGBConvert<bidx, hr>(buf, &buf.x); // buf is both input and output; the float overload reads all inputs before writing
+
+            uint dst = 0xffu << 24; // top byte of the result is always 0xff
+
+            dst |= saturate_cast<uchar>(buf.x * 255.f); // channel order was already applied by the float overload
+            dst |= saturate_cast<uchar>(buf.y * 255.f) << 8;
+            dst |= saturate_cast<uchar>(buf.z * 255.f) << 16;
+
+            return dst;
+        }
+
+        template <typename T, int scn, int dcn, int bidx, int hr> struct HLS2RGB
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {   // Functor form of HLS2RGBConvert: maps an scn-channel vector of T to a dcn-channel vector of T.
+            __host__ __device__ __forceinline__ HLS2RGB() {}
+            __host__ __device__ __forceinline__ HLS2RGB(const HLS2RGB&) {}
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typedef typename TypeVec<T, dcn>::vec_type out_type;
+                out_type rgb;
+                // The converter writes through a pointer to the first component of the result.
+                HLS2RGBConvert<bidx, hr>(src, &rgb.x);
+                setAlpha(rgb, ColorChannel<T>::max()); // setAlpha is defined elsewhere; it receives the channel maximum
+                return rgb;
+            }
+        };
+
+        template <int bidx, int hr> struct HLS2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
+        {   // 4-channel uchar case: the pixel travels as one packed uint instead of a vector type.
+            __host__ __device__ __forceinline__ HLS2RGB() {}
+            __host__ __device__ __forceinline__ HLS2RGB(const HLS2RGB&) {}
+            __device__ __forceinline__ uint operator()(uint src) const
+            {
+                return HLS2RGBConvert<bidx, hr>(src); // resolves to the packed-uint overload
+            }
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(name, scn, dcn, bidx) /* defines name_traits and name_full_traits, each exposing functor_type and create_functor() */ \
+    template <typename T> struct name ## _traits /* generic T: HLS2RGB with hr = 180 */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::HLS2RGB<T, scn, dcn, bidx, 180> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <typename T> struct name ## _full_traits /* generic T, "full" variant: hr = 255 */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::HLS2RGB<T, scn, dcn, bidx, 255> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _traits<float> /* float specialization: hr = 360 */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::HLS2RGB<float, scn, dcn, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _full_traits<float> /* float "full" specialization: also hr = 360, same functor as name_traits<float> */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::HLS2RGB<float, scn, dcn, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+///////////////////////////////////// RGB <-> Lab /////////////////////////////////////
+
+    namespace color_detail
+    {
+        enum // compile-time constants for the RGB <-> Lab code
+        {
+            LAB_CBRT_TAB_SIZE = 1024,
+            GAMMA_TAB_SIZE = 1024,
+            lab_shift = xyz_shift, // aliases xyz_shift, which is defined outside this chunk
+            gamma_shift = 3, // LabCbrt_b treats its argument as scaled by 255 * (1 << gamma_shift)
+            lab_shift2 = (lab_shift + gamma_shift), // LabCbrt_b's result is scaled by 1 << lab_shift2
+            LAB_CBRT_TAB_SIZE_B = (256 * 3 / 2 * (1 << gamma_shift)) // evaluates to 3072
+        };
+
+        __constant__ ushort c_sRGBGammaTab_b[] = {0,1,1,2,2,3,4,4,5,6,6,7,8,8,9,10,11,11,12,13,14,15,16,17,19,20,21,22,24,25,26,28,29,31,33,34,36,38,40,41,43,45,47,49,51,54,56,58,60,63,65,68,70,73,75,78,81,83,86,89,92,95,98,101,105,108,111,115,118,121,125,129,132,136,140,144,147,151,155,160,164,168,172,176,181,185,190,194,199,204,209,213,218,223,228,233,239,244,249,255,260,265,271,277,282,288,294,300,306,312,318,324,331,337,343,350,356,363,370,376,383,390,397,404,411,418,426,433,440,448,455,463,471,478,486,494,502,510,518,527,535,543,552,560,569,578,586,595,604,613,622,631,641,650,659,669,678,688,698,707,717,727,737,747,757,768,778,788,799,809,820,831,842,852,863,875,886,897,908,920,931,943,954,966,978,990,1002,1014,1026,1038,1050,1063,1075,1088,1101,1113,1126,1139,1152,1165,1178,1192,1205,1218,1232,1245,1259,1273,1287,1301,1315,1329,1343,1357,1372,1386,1401,1415,1430,1445,1460,1475,1490,1505,1521,1536,1551,1567,1583,1598,1614,1630,1646,1662,1678,1695,1711,1728,1744,1761,1778,1794,1811,1828,1846,1863,1880,1897,1915,1933,1950,1968,1986,2004,2022,2040}; // looked up by channel value in RGB2LabConvert_b when srgb is true; last entry 2040 == 255 << 3, the same scale the non-sRGB path gets from "<<= 3"
+
+        __device__ __forceinline__ int LabCbrt_b(int i)
+        {   // Maps i to t = i / (255 << gamma_shift), then returns (1 << lab_shift2) * f(t) converted to int; f is linear below 0.008856, cbrt above.
+            const float t = i * (1.f / (255.f * (1 << gamma_shift)));
+            return static_cast<int>((1 << lab_shift2) * ((t < 0.008856f) ? (t * 7.787f + 0.13793103448275862f) : ::cbrtf(t)));
+        }
+
+        template <bool srgb, int blueIdx, typename T, typename D>
+        __device__ __forceinline__ void RGB2LabConvert_b(const T& src, D& dst)
+        {   // Integer fixed-point RGB -> Lab; stores L, a, b (saturated to uchar) in dst.x, dst.y, dst.z.
+            const int Lscale = (116 * 255 + 50) / 100;
+            const int Lshift = -((16 * 255 * (1 << lab_shift2) + 50) / 100);
+            const int abBias = 128 * (1 << lab_shift2);
+
+            // blueIdx == 0 reads blue from src.x and red from src.z; any other value swaps them.
+            const int rawB = blueIdx == 0 ? src.x : src.z;
+            const int rawG = src.y;
+            const int rawR = blueIdx == 0 ? src.z : src.x;
+
+            // srgb: look each channel up in c_sRGBGammaTab_b; otherwise shift left by 3 instead.
+            const int B = srgb ? c_sRGBGammaTab_b[rawB] : (rawB << 3);
+            const int G = srgb ? c_sRGBGammaTab_b[rawG] : (rawG << 3);
+            const int R = srgb ? c_sRGBGammaTab_b[rawR] : (rawR << 3);
+
+            // Three weighted channel sums (each row of weights adds up to 4096), descaled by lab_shift.
+            const int sumX = CV_DESCALE(B * 778 + G * 1541 + R * 1777, lab_shift);
+            const int sumY = CV_DESCALE(B * 296 + G * 2929 + R * 871, lab_shift);
+            const int sumZ = CV_DESCALE(B * 3575 + G * 448 + R * 73, lab_shift);
+
+            const int fX = LabCbrt_b(sumX);
+            const int fY = LabCbrt_b(sumY);
+            const int fZ = LabCbrt_b(sumZ);
+
+            // L depends on fY alone; a and b are scaled differences shifted by the 128 bias.
+            const int L = CV_DESCALE(Lscale * fY + Lshift, lab_shift2);
+            const int a = CV_DESCALE(500 * (fX - fY) + abBias, lab_shift2);
+            const int b = CV_DESCALE(200 * (fY - fZ) + abBias, lab_shift2);
+
+            dst.x = saturate_cast<uchar>(L);
+            dst.y = saturate_cast<uchar>(a);
+            dst.z = saturate_cast<uchar>(b);
+        }
+
+        __device__ __forceinline__ float splineInterpolate(float x, const float* tab, int n)
+        {   // Evaluates a cubic in Horner form; tab holds 4 coefficients per segment, segment index int(x) clamped to [0, n-1].
+            const int seg = ::min(::max(int(x), 0), n-1);
+            const float t = x - seg;
+            const float* c = tab + seg * 4;
+            return ((c[3] * t + c[2]) * t + c[1]) * t + c[0];
+        }
+
+        __constant__ float c_sRGBGammaTab[] = {0,7.55853e-05,0.,-7.51331e-13,7.55853e-05,7.55853e-05,-2.25399e-12,3.75665e-12,0.000151171,7.55853e-05,9.01597e-12,-6.99932e-12,0.000226756,7.55853e-05,-1.1982e-11,2.41277e-12,0.000302341,7.55853e-05,-4.74369e-12,1.19001e-11,0.000377927,7.55853e-05,3.09568e-11,-2.09095e-11,0.000453512,7.55853e-05,-3.17718e-11,1.35303e-11,0.000529097,7.55853e-05,8.81905e-12,-4.10782e-12,0.000604683,7.55853e-05,-3.50439e-12,2.90097e-12,0.000680268,7.55853e-05,5.19852e-12,-7.49607e-12,0.000755853,7.55853e-05,-1.72897e-11,2.70833e-11,0.000831439,7.55854e-05,6.39602e-11,-4.26295e-11,0.000907024,7.55854e-05,-6.39282e-11,2.70193e-11,0.000982609,7.55853e-05,1.71298e-11,-7.24017e-12,0.00105819,7.55853e-05,-4.59077e-12,1.94137e-12,0.00113378,7.55853e-05,1.23333e-12,-5.25291e-13,0.00120937,7.55853e-05,-3.42545e-13,1.59799e-13,0.00128495,7.55853e-05,1.36852e-13,-1.13904e-13,0.00136054,7.55853e-05,-2.04861e-13,2.95818e-13,0.00143612,7.55853e-05,6.82594e-13,-1.06937e-12,0.00151171,7.55853e-05,-2.52551e-12,3.98166e-12,0.00158729,7.55853e-05,9.41946e-12,-1.48573e-11,0.00166288,7.55853e-05,-3.51523e-11,5.54474e-11,0.00173846,7.55854e-05,1.3119e-10,-9.0517e-11,0.00181405,7.55854e-05,-1.40361e-10,7.37899e-11,0.00188963,7.55853e-05,8.10085e-11,-8.82272e-11,0.00196522,7.55852e-05,-1.83673e-10,1.62704e-10,0.0020408,7.55853e-05,3.04438e-10,-2.13341e-10,0.00211639,7.55853e-05,-3.35586e-10,2.25e-10,0.00219197,7.55853e-05,3.39414e-10,-2.20997e-10,0.00226756,7.55853e-05,-3.23576e-10,1.93326e-10,0.00234315,7.55853e-05,2.564e-10,-8.66446e-11,0.00241873,7.55855e-05,-3.53328e-12,-7.9578e-11,0.00249432,7.55853e-05,-2.42267e-10,1.72126e-10,0.0025699,7.55853e-05,2.74111e-10,-1.43265e-10,0.00264549,7.55854e-05,-1.55683e-10,-6.47292e-11,0.00272107,7.55849e-05,-3.4987e-10,8.67842e-10,0.00279666,7.55868e-05,2.25366e-09,-3.8723e-09,0.00287224,7.55797e-05,-9.36325e-09,1.5087e-08,0.00294783,7.56063e-05,3.58978e-08,-5.69415e-08,0.00302341,7.55072e-05,-1.34927e-07,2.13144e-07,0
.003099,7.58768e-05,5.04507e-07,1.38713e-07,0.00317552,7.7302e-05,9.20646e-07,-1.55186e-07,0.00325359,7.86777e-05,4.55087e-07,4.26813e-08,0.00333276,7.97159e-05,5.83131e-07,-1.06495e-08,0.00341305,8.08502e-05,5.51182e-07,3.87467e-09,0.00349446,8.19642e-05,5.62806e-07,-1.92586e-10,0.00357698,8.30892e-05,5.62228e-07,1.0866e-09,0.00366063,8.4217e-05,5.65488e-07,5.02818e-10,0.00374542,8.53494e-05,5.66997e-07,8.60211e-10,0.00383133,8.6486e-05,5.69577e-07,7.13044e-10,0.00391839,8.76273e-05,5.71716e-07,4.78527e-10,0.00400659,8.87722e-05,5.73152e-07,1.09818e-09,0.00409594,8.99218e-05,5.76447e-07,2.50964e-10,0.00418644,9.10754e-05,5.772e-07,1.15762e-09,0.00427809,9.22333e-05,5.80672e-07,2.40865e-10,0.0043709,9.33954e-05,5.81395e-07,1.13854e-09,0.00446488,9.45616e-05,5.84811e-07,3.27267e-10,0.00456003,9.57322e-05,5.85792e-07,8.1197e-10,0.00465635,9.69062e-05,5.88228e-07,6.15823e-10,0.00475384,9.80845e-05,5.90076e-07,9.15747e-10,0.00485252,9.92674e-05,5.92823e-07,3.778e-10,0.00495238,0.000100454,5.93956e-07,8.32623e-10,0.00505343,0.000101645,5.96454e-07,4.82695e-10,0.00515567,0.000102839,5.97902e-07,9.61904e-10,0.00525911,0.000104038,6.00788e-07,3.26281e-10,0.00536375,0.00010524,6.01767e-07,9.926e-10,0.00546959,0.000106447,6.04745e-07,3.59933e-10,0.00557664,0.000107657,6.05824e-07,8.2728e-10,0.0056849,0.000108871,6.08306e-07,5.21898e-10,0.00579438,0.00011009,6.09872e-07,8.10492e-10,0.00590508,0.000111312,6.12303e-07,4.27046e-10,0.00601701,0.000112538,6.13585e-07,7.40878e-10,0.00613016,0.000113767,6.15807e-07,8.00469e-10,0.00624454,0.000115001,6.18209e-07,2.48178e-10,0.00636016,0.000116238,6.18953e-07,1.00073e-09,0.00647702,0.000117479,6.21955e-07,4.05654e-10,0.00659512,0.000118724,6.23172e-07,6.36192e-10,0.00671447,0.000119973,6.25081e-07,7.74927e-10,0.00683507,0.000121225,6.27406e-07,4.54975e-10,0.00695692,0.000122481,6.28771e-07,6.64841e-10,0.00708003,0.000123741,6.30765e-07,6.10972e-10,0.00720441,0.000125004,6.32598e-07,6.16543e-10,0.00733004,0.000126271,6.34448e-07,6.48204
e-10,0.00745695,0.000127542,6.36392e-07,5.15835e-10,0.00758513,0.000128816,6.3794e-07,5.48103e-10,0.00771458,0.000130094,6.39584e-07,1.01706e-09,0.00784532,0.000131376,6.42635e-07,4.0283e-11,0.00797734,0.000132661,6.42756e-07,6.84471e-10,0.00811064,0.000133949,6.4481e-07,9.47144e-10,0.00824524,0.000135241,6.47651e-07,1.83472e-10,0.00838112,0.000136537,6.48201e-07,1.11296e-09,0.00851831,0.000137837,6.5154e-07,2.13163e-11,0.0086568,0.00013914,6.51604e-07,6.64462e-10,0.00879659,0.000140445,6.53598e-07,1.04613e-09,0.00893769,0.000141756,6.56736e-07,-1.92377e-10,0.0090801,0.000143069,6.56159e-07,1.58601e-09,0.00922383,0.000144386,6.60917e-07,-5.63754e-10,0.00936888,0.000145706,6.59226e-07,1.60033e-09,0.00951524,0.000147029,6.64027e-07,-2.49543e-10,0.00966294,0.000148356,6.63278e-07,1.26043e-09,0.00981196,0.000149687,6.67059e-07,-1.35572e-10,0.00996231,0.00015102,6.66653e-07,1.14458e-09,0.010114,0.000152357,6.70086e-07,2.13864e-10,0.010267,0.000153698,6.70728e-07,7.93856e-10,0.0104214,0.000155042,6.73109e-07,3.36077e-10,0.0105771,0.000156389,6.74118e-07,6.55765e-10,0.0107342,0.000157739,6.76085e-07,7.66211e-10,0.0108926,0.000159094,6.78384e-07,4.66116e-12,0.0110524,0.000160451,6.78398e-07,1.07775e-09,0.0112135,0.000161811,6.81631e-07,3.41023e-10,0.011376,0.000163175,6.82654e-07,3.5205e-10,0.0115398,0.000164541,6.8371e-07,1.04473e-09,0.0117051,0.000165912,6.86844e-07,1.25757e-10,0.0118717,0.000167286,6.87222e-07,3.14818e-10,0.0120396,0.000168661,6.88166e-07,1.40886e-09,0.012209,0.000170042,6.92393e-07,-3.62244e-10,0.0123797,0.000171425,6.91306e-07,9.71397e-10,0.0125518,0.000172811,6.9422e-07,2.02003e-10,0.0127253,0.0001742,6.94826e-07,1.01448e-09,0.0129002,0.000175593,6.97869e-07,3.96653e-10,0.0130765,0.00017699,6.99059e-07,1.92927e-10,0.0132542,0.000178388,6.99638e-07,6.94305e-10,0.0134333,0.00017979,7.01721e-07,7.55108e-10,0.0136138,0.000181195,7.03986e-07,1.05918e-11,0.0137957,0.000182603,7.04018e-07,1.06513e-09,0.013979,0.000184015,7.07214e-07,3.85512e-10,0.0141637,0.0
0018543,7.0837e-07,1.86769e-10,0.0143499,0.000186848,7.0893e-07,7.30116e-10,0.0145374,0.000188268,7.11121e-07,6.17983e-10,0.0147264,0.000189692,7.12975e-07,5.23282e-10,0.0149168,0.000191119,7.14545e-07,8.28398e-11,0.0151087,0.000192549,7.14793e-07,1.0081e-09,0.0153019,0.000193981,7.17817e-07,5.41244e-10,0.0154966,0.000195418,7.19441e-07,-3.7907e-10,0.0156928,0.000196856,7.18304e-07,1.90641e-09,0.0158903,0.000198298,7.24023e-07,-7.27387e-10,0.0160893,0.000199744,7.21841e-07,1.00317e-09,0.0162898,0.000201191,7.24851e-07,4.39949e-10,0.0164917,0.000202642,7.2617e-07,9.6234e-10,0.0166951,0.000204097,7.29057e-07,-5.64019e-10,0.0168999,0.000205554,7.27365e-07,1.29374e-09,0.0171062,0.000207012,7.31247e-07,9.77025e-10,0.017314,0.000208478,7.34178e-07,-1.47651e-09,0.0175232,0.000209942,7.29748e-07,3.06636e-09,0.0177338,0.00021141,7.38947e-07,-1.47573e-09,0.017946,0.000212884,7.3452e-07,9.7386e-10,0.0181596,0.000214356,7.37442e-07,1.30562e-09,0.0183747,0.000215835,7.41358e-07,-6.08376e-10,0.0185913,0.000217315,7.39533e-07,1.12785e-09,0.0188093,0.000218798,7.42917e-07,-1.77711e-10,0.0190289,0.000220283,7.42384e-07,1.44562e-09,0.0192499,0.000221772,7.46721e-07,-1.68825e-11,0.0194724,0.000223266,7.4667e-07,4.84533e-10,0.0196964,0.000224761,7.48124e-07,-5.85298e-11,0.0199219,0.000226257,7.47948e-07,1.61217e-09,0.0201489,0.000227757,7.52785e-07,-8.02136e-10,0.0203775,0.00022926,7.50378e-07,1.59637e-09,0.0206075,0.000230766,7.55167e-07,4.47168e-12,0.020839,0.000232276,7.55181e-07,2.48387e-10,0.021072,0.000233787,7.55926e-07,8.6474e-10,0.0213066,0.000235302,7.5852e-07,1.78299e-11,0.0215426,0.000236819,7.58573e-07,9.26567e-10,0.0217802,0.000238339,7.61353e-07,1.34529e-12,0.0220193,0.000239862,7.61357e-07,9.30659e-10,0.0222599,0.000241387,7.64149e-07,1.34529e-12,0.0225021,0.000242915,7.64153e-07,9.26567e-10,0.0227458,0.000244447,7.66933e-07,1.76215e-11,0.022991,0.00024598,7.66986e-07,8.65536e-10,0.0232377,0.000247517,7.69582e-07,2.45677e-10,0.023486,0.000249057,7.70319e-07,1.44193e-11,
0.0237358,0.000250598,7.70363e-07,1.55918e-09,0.0239872,0.000252143,7.7504e-07,-6.63173e-10,0.0242401,0.000253691,7.73051e-07,1.09357e-09,0.0244946,0.000255241,7.76331e-07,1.41919e-11,0.0247506,0.000256793,7.76374e-07,7.12248e-10,0.0250082,0.000258348,7.78511e-07,8.62049e-10,0.0252673,0.000259908,7.81097e-07,-4.35061e-10,0.025528,0.000261469,7.79792e-07,8.7825e-10,0.0257902,0.000263031,7.82426e-07,6.47181e-10,0.0260541,0.000264598,7.84368e-07,2.58448e-10,0.0263194,0.000266167,7.85143e-07,1.81558e-10,0.0265864,0.000267738,7.85688e-07,8.78041e-10,0.0268549,0.000269312,7.88322e-07,3.15102e-11,0.027125,0.000270889,7.88417e-07,8.58525e-10,0.0273967,0.000272468,7.90992e-07,2.59812e-10,0.02767,0.000274051,7.91772e-07,-3.5224e-11,0.0279448,0.000275634,7.91666e-07,1.74377e-09,0.0282212,0.000277223,7.96897e-07,-1.35196e-09,0.0284992,0.000278813,7.92841e-07,1.80141e-09,0.0287788,0.000280404,7.98246e-07,-2.65629e-10,0.0290601,0.000281999,7.97449e-07,1.12374e-09,0.0293428,0.000283598,8.0082e-07,-5.04106e-10,0.0296272,0.000285198,7.99308e-07,8.92764e-10,0.0299132,0.000286799,8.01986e-07,6.58379e-10,0.0302008,0.000288405,8.03961e-07,1.98971e-10,0.0304901,0.000290014,8.04558e-07,4.08382e-10,0.0307809,0.000291624,8.05783e-07,3.01839e-11,0.0310733,0.000293236,8.05874e-07,1.33343e-09,0.0313673,0.000294851,8.09874e-07,2.2419e-10,0.031663,0.000296472,8.10547e-07,-3.67606e-10,0.0319603,0.000298092,8.09444e-07,1.24624e-09,0.0322592,0.000299714,8.13182e-07,-8.92025e-10,0.0325597,0.000301338,8.10506e-07,2.32183e-09,0.0328619,0.000302966,8.17472e-07,-9.44719e-10,0.0331657,0.000304598,8.14638e-07,1.45703e-09,0.0334711,0.000306232,8.19009e-07,-1.15805e-09,0.0337781,0.000307866,8.15535e-07,3.17507e-09,0.0340868,0.000309507,8.2506e-07,-4.09161e-09,0.0343971,0.000311145,8.12785e-07,5.74079e-09,0.0347091,0.000312788,8.30007e-07,-3.97034e-09,0.0350227,0.000314436,8.18096e-07,2.68985e-09,0.035338,0.00031608,8.26166e-07,6.61676e-10,0.0356549,0.000317734,8.28151e-07,-1.61123e-09,0.0359734,0.000319386,
8.23317e-07,2.05786e-09,0.0362936,0.000321038,8.29491e-07,8.30388e-10,0.0366155,0.0003227,8.31982e-07,-1.65424e-09,0.036939,0.000324359,8.27019e-07,2.06129e-09,0.0372642,0.000326019,8.33203e-07,8.59719e-10,0.0375911,0.000327688,8.35782e-07,-1.77488e-09,0.0379196,0.000329354,8.30458e-07,2.51464e-09,0.0382498,0.000331023,8.38002e-07,-8.33135e-10,0.0385817,0.000332696,8.35502e-07,8.17825e-10,0.0389152,0.00033437,8.37956e-07,1.28718e-09,0.0392504,0.00033605,8.41817e-07,-2.2413e-09,0.0395873,0.000337727,8.35093e-07,3.95265e-09,0.0399258,0.000339409,8.46951e-07,-2.39332e-09,0.0402661,0.000341095,8.39771e-07,1.89533e-09,0.040608,0.000342781,8.45457e-07,-1.46271e-09,0.0409517,0.000344467,8.41069e-07,3.95554e-09,0.041297,0.000346161,8.52936e-07,-3.18369e-09,0.041644,0.000347857,8.43385e-07,1.32873e-09,0.0419927,0.000349548,8.47371e-07,1.59402e-09,0.0423431,0.000351248,8.52153e-07,-2.54336e-10,0.0426952,0.000352951,8.5139e-07,-5.76676e-10,0.043049,0.000354652,8.4966e-07,2.56114e-09,0.0434045,0.000356359,8.57343e-07,-2.21744e-09,0.0437617,0.000358067,8.50691e-07,2.58344e-09,0.0441206,0.000359776,8.58441e-07,-6.65826e-10,0.0444813,0.000361491,8.56444e-07,7.99218e-11,0.0448436,0.000363204,8.56684e-07,3.46063e-10,0.0452077,0.000364919,8.57722e-07,2.26116e-09,0.0455734,0.000366641,8.64505e-07,-1.94005e-09,0.045941,0.000368364,8.58685e-07,1.77384e-09,0.0463102,0.000370087,8.64007e-07,-1.43005e-09,0.0466811,0.000371811,8.59717e-07,3.94634e-09,0.0470538,0.000373542,8.71556e-07,-3.17946e-09,0.0474282,0.000375276,8.62017e-07,1.32104e-09,0.0478043,0.000377003,8.6598e-07,1.62045e-09,0.0481822,0.00037874,8.70842e-07,-3.52297e-10,0.0485618,0.000380481,8.69785e-07,-2.11211e-10,0.0489432,0.00038222,8.69151e-07,1.19716e-09,0.0493263,0.000383962,8.72743e-07,-8.52026e-10,0.0497111,0.000385705,8.70187e-07,2.21092e-09,0.0500977,0.000387452,8.76819e-07,-5.41339e-10,0.050486,0.000389204,8.75195e-07,-4.5361e-11,0.0508761,0.000390954,8.75059e-07,7.22669e-10,0.0512679,0.000392706,8.77227e-07,8.79936e-
10,0.0516615,0.000394463,8.79867e-07,-5.17048e-10,0.0520568,0.000396222,8.78316e-07,1.18833e-09,0.0524539,0.000397982,8.81881e-07,-5.11022e-10,0.0528528,0.000399744,8.80348e-07,8.55683e-10,0.0532534,0.000401507,8.82915e-07,8.13562e-10,0.0536558,0.000403276,8.85356e-07,-3.84603e-10,0.05406,0.000405045,8.84202e-07,7.24962e-10,0.0544659,0.000406816,8.86377e-07,1.20986e-09,0.0548736,0.000408592,8.90006e-07,-1.83896e-09,0.0552831,0.000410367,8.84489e-07,2.42071e-09,0.0556944,0.000412143,8.91751e-07,-3.93413e-10,0.0561074,0.000413925,8.90571e-07,-8.46967e-10,0.0565222,0.000415704,8.8803e-07,3.78122e-09,0.0569388,0.000417491,8.99374e-07,-3.1021e-09,0.0573572,0.000419281,8.90068e-07,1.17658e-09,0.0577774,0.000421064,8.93597e-07,2.12117e-09,0.0581993,0.000422858,8.99961e-07,-2.21068e-09,0.0586231,0.000424651,8.93329e-07,2.9961e-09,0.0590486,0.000426447,9.02317e-07,-2.32311e-09,0.059476,0.000428244,8.95348e-07,2.57122e-09,0.0599051,0.000430043,9.03062e-07,-5.11098e-10,0.0603361,0.000431847,9.01528e-07,-5.27166e-10,0.0607688,0.000433649,8.99947e-07,2.61984e-09,0.0612034,0.000435457,9.07806e-07,-2.50141e-09,0.0616397,0.000437265,9.00302e-07,3.66045e-09,0.0620779,0.000439076,9.11283e-07,-4.68977e-09,0.0625179,0.000440885,8.97214e-07,7.64783e-09,0.0629597,0.000442702,9.20158e-07,-7.27499e-09,0.0634033,0.000444521,8.98333e-07,6.55113e-09,0.0638487,0.000446337,9.17986e-07,-4.02844e-09,0.0642959,0.000448161,9.05901e-07,2.11196e-09,0.064745,0.000449979,9.12236e-07,3.03125e-09,0.0651959,0.000451813,9.2133e-07,-6.78648e-09,0.0656486,0.000453635,9.00971e-07,9.21375e-09,0.0661032,0.000455464,9.28612e-07,-7.71684e-09,0.0665596,0.000457299,9.05462e-07,6.7522e-09,0.0670178,0.00045913,9.25718e-07,-4.3907e-09,0.0674778,0.000460968,9.12546e-07,3.36e-09,0.0679397,0.000462803,9.22626e-07,-1.59876e-09,0.0684034,0.000464644,9.1783e-07,3.0351e-09,0.068869,0.000466488,9.26935e-07,-3.09101e-09,0.0693364,0.000468333,9.17662e-07,1.8785e-09,0.0698057,0.000470174,9.23298e-07,3.02733e-09,0.0702768,0.00047
203,9.3238e-07,-6.53722e-09,0.0707497,0.000473875,9.12768e-07,8.22054e-09,0.0712245,0.000475725,9.37429e-07,-3.99325e-09,0.0717012,0.000477588,9.2545e-07,3.01839e-10,0.0721797,0.00047944,9.26355e-07,2.78597e-09,0.0726601,0.000481301,9.34713e-07,-3.99507e-09,0.0731423,0.000483158,9.22728e-07,5.7435e-09,0.0736264,0.000485021,9.39958e-07,-4.07776e-09,0.0741123,0.000486888,9.27725e-07,3.11695e-09,0.0746002,0.000488753,9.37076e-07,-9.39394e-10,0.0750898,0.000490625,9.34258e-07,6.4055e-10,0.0755814,0.000492495,9.3618e-07,-1.62265e-09,0.0760748,0.000494363,9.31312e-07,5.84995e-09,0.0765701,0.000496243,9.48861e-07,-6.87601e-09,0.0770673,0.00049812,9.28233e-07,6.75296e-09,0.0775664,0.000499997,9.48492e-07,-5.23467e-09,0.0780673,0.000501878,9.32788e-07,6.73523e-09,0.0785701,0.000503764,9.52994e-07,-6.80514e-09,0.0790748,0.000505649,9.32578e-07,5.5842e-09,0.0795814,0.000507531,9.49331e-07,-6.30583e-10,0.0800899,0.000509428,9.47439e-07,-3.0618e-09,0.0806003,0.000511314,9.38254e-07,5.4273e-09,0.0811125,0.000513206,9.54536e-07,-3.74627e-09,0.0816267,0.000515104,9.43297e-07,2.10713e-09,0.0821427,0.000516997,9.49618e-07,2.76839e-09,0.0826607,0.000518905,9.57924e-07,-5.73006e-09,0.0831805,0.000520803,9.40733e-07,5.25072e-09,0.0837023,0.0005227,9.56486e-07,-3.71718e-10,0.084226,0.000524612,9.5537e-07,-3.76404e-09,0.0847515,0.000526512,9.44078e-07,7.97735e-09,0.085279,0.000528424,9.6801e-07,-5.79367e-09,0.0858084,0.000530343,9.50629e-07,2.96268e-10,0.0863397,0.000532245,9.51518e-07,4.6086e-09,0.0868729,0.000534162,9.65344e-07,-3.82947e-09,0.087408,0.000536081,9.53856e-07,3.25861e-09,0.087945,0.000537998,9.63631e-07,-1.7543e-09,0.088484,0.00053992,9.58368e-07,3.75849e-09,0.0890249,0.000541848,9.69644e-07,-5.82891e-09,0.0895677,0.00054377,9.52157e-07,4.65593e-09,0.0901124,0.000545688,9.66125e-07,2.10643e-09,0.0906591,0.000547627,9.72444e-07,-5.63099e-09,0.0912077,0.000549555,9.55551e-07,5.51627e-09,0.0917582,0.000551483,9.721e-07,-1.53292e-09,0.0923106,0.000553422,9.67501e-07,6.15311e-1
0,0.092865,0.000555359,9.69347e-07,-9.28291e-10,0.0934213,0.000557295,9.66562e-07,3.09774e-09,0.0939796,0.000559237,9.75856e-07,-4.01186e-09,0.0945398,0.000561177,9.6382e-07,5.49892e-09,0.095102,0.000563121,9.80317e-07,-3.08258e-09,0.0956661,0.000565073,9.71069e-07,-6.19176e-10,0.0962321,0.000567013,9.69212e-07,5.55932e-09,0.0968001,0.000568968,9.8589e-07,-6.71704e-09,0.09737,0.00057092,9.65738e-07,6.40762e-09,0.0979419,0.00057287,9.84961e-07,-4.0122e-09,0.0985158,0.000574828,9.72925e-07,2.19059e-09,0.0990916,0.000576781,9.79496e-07,2.70048e-09,0.0996693,0.000578748,9.87598e-07,-5.54193e-09,0.100249,0.000580706,9.70972e-07,4.56597e-09,0.100831,0.000582662,9.8467e-07,2.17923e-09,0.101414,0.000584638,9.91208e-07,-5.83232e-09,0.102,0.000586603,9.73711e-07,6.24884e-09,0.102588,0.000588569,9.92457e-07,-4.26178e-09,0.103177,0.000590541,9.79672e-07,3.34781e-09,0.103769,0.00059251,9.89715e-07,-1.67904e-09,0.104362,0.000594485,9.84678e-07,3.36839e-09,0.104958,0.000596464,9.94783e-07,-4.34397e-09,0.105555,0.000598441,9.81751e-07,6.55696e-09,0.106155,0.000600424,1.00142e-06,-6.98272e-09,0.106756,0.000602406,9.80474e-07,6.4728e-09,0.107359,0.000604386,9.99893e-07,-4.00742e-09,0.107965,0.000606374,9.8787e-07,2.10654e-09,0.108572,0.000608356,9.9419e-07,3.0318e-09,0.109181,0.000610353,1.00329e-06,-6.7832e-09,0.109793,0.00061234,9.82936e-07,9.1998e-09,0.110406,0.000614333,1.01054e-06,-7.6642e-09,0.111021,0.000616331,9.87543e-07,6.55579e-09,0.111639,0.000618326,1.00721e-06,-3.65791e-09,0.112258,0.000620329,9.96236e-07,6.25467e-10,0.112879,0.000622324,9.98113e-07,1.15593e-09,0.113503,0.000624323,1.00158e-06,2.20158e-09,0.114128,0.000626333,1.00819e-06,-2.51191e-09,0.114755,0.000628342,1.00065e-06,3.95517e-10,0.115385,0.000630345,1.00184e-06,9.29807e-10,0.116016,0.000632351,1.00463e-06,3.33599e-09,0.116649,0.00063437,1.01463e-06,-6.82329e-09,0.117285,0.000636379,9.94163e-07,9.05595e-09,0.117922,0.000638395,1.02133e-06,-7.04862e-09,0.118562,0.000640416,1.00019e-06,4.23737e-09,0.119203,
0.000642429,1.0129e-06,-2.45033e-09,0.119847,0.000644448,1.00555e-06,5.56395e-09,0.120492,0.000646475,1.02224e-06,-4.9043e-09,0.121139,0.000648505,1.00753e-06,-8.47952e-10,0.121789,0.000650518,1.00498e-06,8.29622e-09,0.122441,0.000652553,1.02987e-06,-9.98538e-09,0.123094,0.000654582,9.99914e-07,9.2936e-09,0.12375,0.00065661,1.02779e-06,-4.83707e-09,0.124407,0.000658651,1.01328e-06,2.60411e-09,0.125067,0.000660685,1.0211e-06,-5.57945e-09,0.125729,0.000662711,1.00436e-06,1.22631e-08,0.126392,0.000664756,1.04115e-06,-1.36704e-08,0.127058,0.000666798,1.00014e-06,1.26161e-08,0.127726,0.000668836,1.03798e-06,-6.99155e-09,0.128396,0.000670891,1.01701e-06,4.48836e-10,0.129068,0.000672926,1.01836e-06,5.19606e-09,0.129742,0.000674978,1.03394e-06,-6.3319e-09,0.130418,0.000677027,1.01495e-06,5.2305e-09,0.131096,0.000679073,1.03064e-06,3.11123e-10,0.131776,0.000681135,1.03157e-06,-6.47511e-09,0.132458,0.000683179,1.01215e-06,1.06882e-08,0.133142,0.000685235,1.04421e-06,-6.47519e-09,0.133829,0.000687304,1.02479e-06,3.11237e-10,0.134517,0.000689355,1.02572e-06,5.23035e-09,0.135207,0.000691422,1.04141e-06,-6.3316e-09,0.1359,0.000693486,1.02242e-06,5.19484e-09,0.136594,0.000695546,1.038e-06,4.53497e-10,0.137291,0.000697623,1.03936e-06,-7.00891e-09,0.137989,0.000699681,1.01834e-06,1.2681e-08,0.13869,0.000701756,1.05638e-06,-1.39128e-08,0.139393,0.000703827,1.01464e-06,1.31679e-08,0.140098,0.000705896,1.05414e-06,-8.95659e-09,0.140805,0.000707977,1.02727e-06,7.75742e-09,0.141514,0.000710055,1.05055e-06,-7.17182e-09,0.142225,0.000712135,1.02903e-06,6.02862e-09,0.142938,0.000714211,1.04712e-06,-2.04163e-09,0.143653,0.000716299,1.04099e-06,2.13792e-09,0.144371,0.000718387,1.04741e-06,-6.51009e-09,0.14509,0.000720462,1.02787e-06,9.00123e-09,0.145812,0.000722545,1.05488e-06,3.07523e-10,0.146535,0.000724656,1.0558e-06,-1.02312e-08,0.147261,0.000726737,1.02511e-06,1.0815e-08,0.147989,0.000728819,1.05755e-06,-3.22681e-09,0.148719,0.000730925,1.04787e-06,2.09244e-09,0.14945,0.000733027,1.05415
e-06,-5.143e-09,0.150185,0.00073512,1.03872e-06,3.57844e-09,0.150921,0.000737208,1.04946e-06,5.73027e-09,0.151659,0.000739324,1.06665e-06,-1.15983e-08,0.152399,0.000741423,1.03185e-06,1.08605e-08,0.153142,0.000743519,1.06443e-06,-2.04106e-09,0.153886,0.000745642,1.05831e-06,-2.69642e-09,0.154633,0.00074775,1.05022e-06,-2.07425e-09,0.155382,0.000749844,1.044e-06,1.09934e-08,0.156133,0.000751965,1.07698e-06,-1.20972e-08,0.156886,0.000754083,1.04069e-06,7.59288e-09,0.157641,0.000756187,1.06347e-06,-3.37305e-09,0.158398,0.000758304,1.05335e-06,5.89921e-09,0.159158,0.000760428,1.07104e-06,-5.32248e-09,0.159919,0.000762554,1.05508e-06,4.8927e-10,0.160683,0.000764666,1.05654e-06,3.36547e-09,0.161448,0.000766789,1.06664e-06,9.50081e-10,0.162216,0.000768925,1.06949e-06,-7.16568e-09,0.162986,0.000771043,1.04799e-06,1.28114e-08,0.163758,0.000773177,1.08643e-06,-1.42774e-08,0.164533,0.000775307,1.0436e-06,1.44956e-08,0.165309,0.000777438,1.08708e-06,-1.39025e-08,0.166087,0.00077957,1.04538e-06,1.13118e-08,0.166868,0.000781695,1.07931e-06,-1.54224e-09,0.167651,0.000783849,1.07468e-06,-5.14312e-09,0.168436,0.000785983,1.05925e-06,7.21381e-09,0.169223,0.000788123,1.0809e-06,-8.81096e-09,0.170012,0.000790259,1.05446e-06,1.31289e-08,0.170803,0.000792407,1.09385e-06,-1.39022e-08,0.171597,0.000794553,1.05214e-06,1.26775e-08,0.172392,0.000796695,1.09018e-06,-7.00557e-09,0.17319,0.000798855,1.06916e-06,4.43796e-10,0.17399,0.000800994,1.07049e-06,5.23031e-09,0.174792,0.000803151,1.08618e-06,-6.46397e-09,0.175596,0.000805304,1.06679e-06,5.72444e-09,0.176403,0.000807455,1.08396e-06,-1.53254e-09,0.177211,0.000809618,1.07937e-06,4.05673e-10,0.178022,0.000811778,1.08058e-06,-9.01916e-11,0.178835,0.000813939,1.08031e-06,-4.49821e-11,0.17965,0.000816099,1.08018e-06,2.70234e-10,0.180467,0.00081826,1.08099e-06,-1.03603e-09,0.181286,0.000820419,1.07788e-06,3.87392e-09,0.182108,0.000822587,1.0895e-06,4.41522e-10,0.182932,0.000824767,1.09083e-06,-5.63997e-09,0.183758,0.000826932,1.07391e-06,7.21707e
-09,0.184586,0.000829101,1.09556e-06,-8.32718e-09,0.185416,0.000831267,1.07058e-06,1.11907e-08,0.186248,0.000833442,1.10415e-06,-6.63336e-09,0.187083,0.00083563,1.08425e-06,4.41484e-10,0.187919,0.0008378,1.08557e-06,4.86754e-09,0.188758,0.000839986,1.10017e-06,-5.01041e-09,0.189599,0.000842171,1.08514e-06,2.72811e-10,0.190443,0.000844342,1.08596e-06,3.91916e-09,0.191288,0.000846526,1.09772e-06,-1.04819e-09,0.192136,0.000848718,1.09457e-06,2.73531e-10,0.192985,0.000850908,1.0954e-06,-4.58916e-11,0.193837,0.000853099,1.09526e-06,-9.01158e-11,0.194692,0.000855289,1.09499e-06,4.06506e-10,0.195548,0.00085748,1.09621e-06,-1.53595e-09,0.196407,0.000859668,1.0916e-06,5.73717e-09,0.197267,0.000861869,1.10881e-06,-6.51164e-09,0.19813,0.000864067,1.08928e-06,5.40831e-09,0.198995,0.000866261,1.1055e-06,-2.20401e-10,0.199863,0.000868472,1.10484e-06,-4.52652e-09,0.200732,0.000870668,1.09126e-06,3.42508e-09,0.201604,0.000872861,1.10153e-06,5.72762e-09,0.202478,0.000875081,1.11872e-06,-1.14344e-08,0.203354,0.000877284,1.08441e-06,1.02076e-08,0.204233,0.000879484,1.11504e-06,4.06355e-10,0.205113,0.000881715,1.11626e-06,-1.18329e-08,0.205996,0.000883912,1.08076e-06,1.71227e-08,0.206881,0.000886125,1.13213e-06,-1.19546e-08,0.207768,0.000888353,1.09626e-06,8.93465e-10,0.208658,0.000890548,1.09894e-06,8.38062e-09,0.209549,0.000892771,1.12408e-06,-4.61353e-09,0.210443,0.000895006,1.11024e-06,-4.82756e-09,0.211339,0.000897212,1.09576e-06,9.02245e-09,0.212238,0.00089943,1.12283e-06,-1.45997e-09,0.213138,0.000901672,1.11845e-06,-3.18255e-09,0.214041,0.000903899,1.1089e-06,-7.11073e-10,0.214946,0.000906115,1.10677e-06,6.02692e-09,0.215853,0.000908346,1.12485e-06,-8.49548e-09,0.216763,0.00091057,1.09936e-06,1.30537e-08,0.217675,0.000912808,1.13852e-06,-1.3917e-08,0.218588,0.000915044,1.09677e-06,1.28121e-08,0.219505,0.000917276,1.13521e-06,-7.5288e-09,0.220423,0.000919523,1.11262e-06,2.40205e-09,0.221344,0.000921756,1.11983e-06,-2.07941e-09,0.222267,0.000923989,1.11359e-06,5.91551e-09,0.22319
2,0.000926234,1.13134e-06,-6.68149e-09,0.224119,0.000928477,1.11129e-06,5.90929e-09,0.225049,0.000930717,1.12902e-06,-2.05436e-09,0.22598,0.000932969,1.12286e-06,2.30807e-09,0.226915,0.000935222,1.12978e-06,-7.17796e-09,0.227851,0.00093746,1.10825e-06,1.15028e-08,0.228789,0.000939711,1.14276e-06,-9.03083e-09,0.22973,0.000941969,1.11566e-06,9.71932e-09,0.230673,0.00094423,1.14482e-06,-1.49452e-08,0.231619,0.000946474,1.09998e-06,2.02591e-08,0.232566,0.000948735,1.16076e-06,-2.13879e-08,0.233516,0.000950993,1.0966e-06,2.05888e-08,0.234468,0.000953247,1.15837e-06,-1.62642e-08,0.235423,0.000955515,1.10957e-06,1.46658e-08,0.236379,0.000957779,1.15357e-06,-1.25966e-08,0.237338,0.000960048,1.11578e-06,5.91793e-09,0.238299,0.000962297,1.13353e-06,3.82602e-09,0.239263,0.000964576,1.14501e-06,-6.3208e-09,0.240229,0.000966847,1.12605e-06,6.55613e-09,0.241197,0.000969119,1.14572e-06,-5.00268e-09,0.242167,0.000971395,1.13071e-06,-1.44659e-09,0.243139,0.000973652,1.12637e-06,1.07891e-08,0.244114,0.000975937,1.15874e-06,-1.19073e-08,0.245091,0.000978219,1.12302e-06,7.03782e-09,0.246071,0.000980486,1.14413e-06,-1.34276e-09,0.247052,0.00098277,1.1401e-06,-1.66669e-09,0.248036,0.000985046,1.1351e-06,8.00935e-09,0.249022,0.00098734,1.15913e-06,-1.54694e-08,0.250011,0.000989612,1.11272e-06,2.4066e-08,0.251002,0.000991909,1.18492e-06,-2.11901e-08,0.251995,0.000994215,1.12135e-06,1.08973e-09,0.25299,0.000996461,1.12462e-06,1.68311e-08,0.253988,0.000998761,1.17511e-06,-8.8094e-09,0.254987,0.00100109,1.14868e-06,-1.13958e-08,0.25599,0.00100335,1.1145e-06,2.45902e-08,0.256994,0.00100565,1.18827e-06,-2.73603e-08,0.258001,0.00100795,1.10618e-06,2.52464e-08,0.25901,0.00101023,1.18192e-06,-1.40207e-08,0.260021,0.00101256,1.13986e-06,1.03387e-09,0.261035,0.00101484,1.14296e-06,9.8853e-09,0.262051,0.00101715,1.17262e-06,-1.07726e-08,0.263069,0.00101947,1.1403e-06,3.40272e-09,0.26409,0.00102176,1.15051e-06,-2.83827e-09,0.265113,0.00102405,1.142e-06,7.95039e-09,0.266138,0.00102636,1.16585e-06,8.390
47e-10,0.267166,0.00102869,1.16836e-06,-1.13066e-08,0.268196,0.00103099,1.13444e-06,1.4585e-08,0.269228,0.00103331,1.1782e-06,-1.72314e-08,0.270262,0.00103561,1.1265e-06,2.45382e-08,0.271299,0.00103794,1.20012e-06,-2.13166e-08,0.272338,0.00104028,1.13617e-06,1.12364e-09,0.273379,0.00104255,1.13954e-06,1.68221e-08,0.274423,0.00104488,1.19001e-06,-8.80736e-09,0.275469,0.00104723,1.16358e-06,-1.13948e-08,0.276518,0.00104953,1.1294e-06,2.45839e-08,0.277568,0.00105186,1.20315e-06,-2.73361e-08,0.278621,0.00105418,1.12114e-06,2.51559e-08,0.279677,0.0010565,1.19661e-06,-1.36832e-08,0.280734,0.00105885,1.15556e-06,-2.25706e-10,0.281794,0.00106116,1.15488e-06,1.45862e-08,0.282857,0.00106352,1.19864e-06,-2.83167e-08,0.283921,0.00106583,1.11369e-06,3.90759e-08,0.284988,0.00106817,1.23092e-06,-3.85801e-08,0.286058,0.00107052,1.11518e-06,2.58375e-08,0.287129,0.00107283,1.19269e-06,-5.16498e-09,0.288203,0.0010752,1.1772e-06,-5.17768e-09,0.28928,0.00107754,1.16167e-06,-3.92671e-09,0.290358,0.00107985,1.14988e-06,2.08846e-08,0.29144,0.00108221,1.21254e-06,-2.00072e-08,0.292523,0.00108458,1.15252e-06,-4.60659e-10,0.293609,0.00108688,1.15114e-06,2.18499e-08,0.294697,0.00108925,1.21669e-06,-2.73343e-08,0.295787,0.0010916,1.13468e-06,2.78826e-08,0.29688,0.00109395,1.21833e-06,-2.45915e-08,0.297975,0.00109632,1.14456e-06,1.08787e-08,0.299073,0.00109864,1.17719e-06,1.08788e-08,0.300172,0.00110102,1.20983e-06,-2.45915e-08,0.301275,0.00110337,1.13605e-06,2.78828e-08,0.302379,0.00110573,1.2197e-06,-2.73348e-08,0.303486,0.00110808,1.1377e-06,2.18518e-08,0.304595,0.00111042,1.20325e-06,-4.67556e-10,0.305707,0.00111283,1.20185e-06,-1.99816e-08,0.306821,0.00111517,1.14191e-06,2.07891e-08,0.307937,0.00111752,1.20427e-06,-3.57026e-09,0.309056,0.00111992,1.19356e-06,-6.50797e-09,0.310177,0.00112228,1.17404e-06,-2.00165e-10,0.3113,0.00112463,1.17344e-06,7.30874e-09,0.312426,0.001127,1.19536e-06,7.67424e-10,0.313554,0.00112939,1.19767e-06,-1.03784e-08,0.314685,0.00113176,1.16653e-06,1.09437e-08,0.315
818,0.00113412,1.19936e-06,-3.59406e-09,0.316953,0.00113651,1.18858e-06,3.43251e-09,0.318091,0.0011389,1.19888e-06,-1.0136e-08,0.319231,0.00114127,1.16847e-06,7.30915e-09,0.320374,0.00114363,1.1904e-06,1.07018e-08,0.321518,0.00114604,1.2225e-06,-2.03137e-08,0.322666,0.00114842,1.16156e-06,1.09484e-08,0.323815,0.00115078,1.19441e-06,6.32224e-09,0.324967,0.00115319,1.21337e-06,-6.43509e-09,0.326122,0.00115559,1.19407e-06,-1.03842e-08,0.327278,0.00115795,1.16291e-06,1.81697e-08,0.328438,0.00116033,1.21742e-06,-2.6901e-09,0.329599,0.00116276,1.20935e-06,-7.40939e-09,0.330763,0.00116515,1.18713e-06,2.52533e-09,0.331929,0.00116754,1.1947e-06,-2.69191e-09,0.333098,0.00116992,1.18663e-06,8.24218e-09,0.334269,0.00117232,1.21135e-06,-4.74377e-10,0.335443,0.00117474,1.20993e-06,-6.34471e-09,0.336619,0.00117714,1.1909e-06,-3.94922e-09,0.337797,0.00117951,1.17905e-06,2.21417e-08,0.338978,0.00118193,1.24547e-06,-2.50128e-08,0.340161,0.00118435,1.17043e-06,1.8305e-08,0.341346,0.00118674,1.22535e-06,-1.84048e-08,0.342534,0.00118914,1.17013e-06,2.55121e-08,0.343725,0.00119156,1.24667e-06,-2.40389e-08,0.344917,0.00119398,1.17455e-06,1.10389e-08,0.346113,0.00119636,1.20767e-06,9.68574e-09,0.34731,0.0011988,1.23673e-06,-1.99797e-08,0.34851,0.00120122,1.17679e-06,1.06284e-08,0.349713,0.0012036,1.20867e-06,7.26868e-09,0.350917,0.00120604,1.23048e-06,-9.90072e-09,0.352125,0.00120847,1.20078e-06,2.53177e-09,0.353334,0.00121088,1.20837e-06,-2.26199e-10,0.354546,0.0012133,1.20769e-06,-1.62705e-09,0.355761,0.00121571,1.20281e-06,6.73435e-09,0.356978,0.00121813,1.22302e-06,4.49207e-09,0.358197,0.00122059,1.23649e-06,-2.47027e-08,0.359419,0.00122299,1.16238e-06,3.47142e-08,0.360643,0.00122542,1.26653e-06,-2.47472e-08,0.36187,0.00122788,1.19229e-06,4.66965e-09,0.363099,0.00123028,1.20629e-06,6.06872e-09,0.36433,0.00123271,1.2245e-06,8.57729e-10,0.365564,0.00123516,1.22707e-06,-9.49952e-09,0.366801,0.00123759,1.19858e-06,7.33792e-09,0.36804,0.00124001,1.22059e-06,9.95025e-09,0.369281,0.00124248,1
.25044e-06,-1.73366e-08,0.370525,0.00124493,1.19843e-06,-2.08464e-10,0.371771,0.00124732,1.1978e-06,1.81704e-08,0.373019,0.00124977,1.25232e-06,-1.28683e-08,0.37427,0.00125224,1.21371e-06,3.50042e-09,0.375524,0.00125468,1.22421e-06,-1.1335e-09,0.37678,0.00125712,1.22081e-06,1.03345e-09,0.378038,0.00125957,1.22391e-06,-3.00023e-09,0.379299,0.00126201,1.21491e-06,1.09676e-08,0.380562,0.00126447,1.24781e-06,-1.10676e-08,0.381828,0.00126693,1.21461e-06,3.50042e-09,0.383096,0.00126937,1.22511e-06,-2.93403e-09,0.384366,0.00127181,1.21631e-06,8.23574e-09,0.385639,0.00127427,1.24102e-06,-2.06607e-10,0.386915,0.00127675,1.2404e-06,-7.40935e-09,0.388193,0.00127921,1.21817e-06,4.1761e-11,0.389473,0.00128165,1.21829e-06,7.24223e-09,0.390756,0.0012841,1.24002e-06,7.91564e-10,0.392042,0.00128659,1.2424e-06,-1.04086e-08,0.393329,0.00128904,1.21117e-06,1.10405e-08,0.39462,0.0012915,1.24429e-06,-3.951e-09,0.395912,0.00129397,1.23244e-06,4.7634e-09,0.397208,0.00129645,1.24673e-06,-1.51025e-08,0.398505,0.0012989,1.20142e-06,2.58443e-08,0.399805,0.00130138,1.27895e-06,-2.86702e-08,0.401108,0.00130385,1.19294e-06,2.92318e-08,0.402413,0.00130632,1.28064e-06,-2.86524e-08,0.403721,0.0013088,1.19468e-06,2.57731e-08,0.405031,0.00131127,1.272e-06,-1.48355e-08,0.406343,0.00131377,1.2275e-06,3.76652e-09,0.407658,0.00131623,1.23879e-06,-2.30784e-10,0.408976,0.00131871,1.2381e-06,-2.84331e-09,0.410296,0.00132118,1.22957e-06,1.16041e-08,0.411618,0.00132367,1.26438e-06,-1.37708e-08,0.412943,0.00132616,1.22307e-06,1.36768e-08,0.41427,0.00132865,1.2641e-06,-1.1134e-08,0.4156,0.00133114,1.2307e-06,1.05714e-09,0.416933,0.00133361,1.23387e-06,6.90538e-09,0.418267,0.00133609,1.25459e-06,1.12372e-09,0.419605,0.00133861,1.25796e-06,-1.14002e-08,0.420945,0.00134109,1.22376e-06,1.46747e-08,0.422287,0.00134358,1.26778e-06,-1.7496e-08,0.423632,0.00134606,1.21529e-06,2.5507e-08,0.424979,0.00134857,1.29182e-06,-2.49272e-08,0.426329,0.00135108,1.21703e-06,1.45972e-08,0.427681,0.00135356,1.26083e-06,-3.65935e-09,0
.429036,0.00135607,1.24985e-06,4.00178e-11,0.430393,0.00135857,1.24997e-06,3.49917e-09,0.431753,0.00136108,1.26047e-06,-1.40366e-08,0.433116,0.00136356,1.21836e-06,2.28448e-08,0.43448,0.00136606,1.28689e-06,-1.77378e-08,0.435848,0.00136858,1.23368e-06,1.83043e-08,0.437218,0.0013711,1.28859e-06,-2.56769e-08,0.43859,0.0013736,1.21156e-06,2.47987e-08,0.439965,0.0013761,1.28595e-06,-1.39133e-08,0.441342,0.00137863,1.24421e-06,1.05202e-09,0.442722,0.00138112,1.24737e-06,9.70507e-09,0.444104,0.00138365,1.27649e-06,-1.00698e-08,0.445489,0.00138617,1.24628e-06,7.72123e-10,0.446877,0.00138867,1.24859e-06,6.98132e-09,0.448267,0.00139118,1.26954e-06,1.10477e-09,0.449659,0.00139373,1.27285e-06,-1.14003e-08,0.451054,0.00139624,1.23865e-06,1.4694e-08,0.452452,0.00139876,1.28273e-06,-1.75734e-08,0.453852,0.00140127,1.23001e-06,2.5797e-08,0.455254,0.00140381,1.3074e-06,-2.60097e-08,0.456659,0.00140635,1.22937e-06,1.86371e-08,0.458067,0.00140886,1.28529e-06,-1.8736e-08,0.459477,0.00141137,1.22908e-06,2.65048e-08,0.46089,0.00141391,1.30859e-06,-2.76784e-08,0.462305,0.00141645,1.22556e-06,2.46043e-08,0.463722,0.00141897,1.29937e-06,-1.11341e-08,0.465143,0.00142154,1.26597e-06,-9.87033e-09,0.466565,0.00142404,1.23636e-06,2.08131e-08,0.467991,0.00142657,1.2988e-06,-1.37773e-08,0.469419,0.00142913,1.25746e-06,4.49378e-09,0.470849,0.00143166,1.27094e-06,-4.19781e-09,0.472282,0.00143419,1.25835e-06,1.22975e-08,0.473717,0.00143674,1.29524e-06,-1.51902e-08,0.475155,0.00143929,1.24967e-06,1.86608e-08,0.476596,0.00144184,1.30566e-06,-2.96506e-08,0.478039,0.00144436,1.2167e-06,4.03368e-08,0.479485,0.00144692,1.33771e-06,-4.22896e-08,0.480933,0.00144947,1.21085e-06,3.94148e-08,0.482384,0.00145201,1.32909e-06,-2.59626e-08,0.483837,0.00145459,1.2512e-06,4.83124e-09,0.485293,0.0014571,1.2657e-06,6.63757e-09,0.486751,0.00145966,1.28561e-06,-1.57911e-09,0.488212,0.00146222,1.28087e-06,-3.21468e-10,0.489676,0.00146478,1.27991e-06,2.86517e-09,0.491142,0.00146735,1.2885e-06,-1.11392e-08,0.49261,0.001469
89,1.25508e-06,1.18893e-08,0.494081,0.00147244,1.29075e-06,-6.61574e-09,0.495555,0.001475,1.27091e-06,1.45736e-08,0.497031,0.00147759,1.31463e-06,-2.18759e-08,0.49851,0.00148015,1.249e-06,1.33252e-08,0.499992,0.00148269,1.28897e-06,-1.62277e-09,0.501476,0.00148526,1.28411e-06,-6.83421e-09,0.502962,0.00148781,1.2636e-06,2.89596e-08,0.504451,0.00149042,1.35048e-06,-4.93997e-08,0.505943,0.00149298,1.20228e-06,4.94299e-08,0.507437,0.00149553,1.35057e-06,-2.91107e-08,0.508934,0.00149814,1.26324e-06,7.40848e-09,0.510434,0.00150069,1.28547e-06,-5.23187e-10,0.511936,0.00150326,1.2839e-06,-5.31585e-09,0.51344,0.00150581,1.26795e-06,2.17866e-08,0.514947,0.00150841,1.33331e-06,-2.22257e-08,0.516457,0.00151101,1.26663e-06,7.51178e-09,0.517969,0.00151357,1.28917e-06,-7.82128e-09,0.519484,0.00151613,1.2657e-06,2.37733e-08,0.521002,0.00151873,1.33702e-06,-2.76674e-08,0.522522,0.00152132,1.25402e-06,2.72917e-08,0.524044,0.00152391,1.3359e-06,-2.18949e-08,0.525569,0.00152652,1.27021e-06,6.83372e-10,0.527097,0.00152906,1.27226e-06,1.91613e-08,0.528628,0.00153166,1.32974e-06,-1.77241e-08,0.53016,0.00153427,1.27657e-06,-7.86963e-09,0.531696,0.0015368,1.25296e-06,4.92027e-08,0.533234,0.00153945,1.40057e-06,-6.9732e-08,0.534775,0.00154204,1.19138e-06,5.09114e-08,0.536318,0.00154458,1.34411e-06,-1.4704e-08,0.537864,0.00154722,1.3e-06,7.9048e-09,0.539413,0.00154984,1.32371e-06,-1.69152e-08,0.540964,0.00155244,1.27297e-06,1.51355e-10,0.542517,0.00155499,1.27342e-06,1.63099e-08,0.544074,0.00155758,1.32235e-06,-5.78647e-09,0.545633,0.00156021,1.30499e-06,6.83599e-09,0.547194,0.00156284,1.3255e-06,-2.15575e-08,0.548758,0.00156543,1.26083e-06,1.97892e-08,0.550325,0.00156801,1.32019e-06,2.00525e-09,0.551894,0.00157065,1.32621e-06,-2.78103e-08,0.553466,0.00157322,1.24278e-06,4.96314e-08,0.555041,0.00157586,1.39167e-06,-5.1506e-08,0.556618,0.00157849,1.23716e-06,3.71835e-08,0.558198,0.00158107,1.34871e-06,-3.76233e-08,0.55978,0.00158366,1.23584e-06,5.37052e-08,0.561365,0.00158629,1.39695e-06,-5.79
884e-08,0.562953,0.00158891,1.22299e-06,5.90392e-08,0.564543,0.00159153,1.4001e-06,-5.89592e-08,0.566136,0.00159416,1.22323e-06,5.7588e-08,0.567731,0.00159678,1.39599e-06,-5.21835e-08,0.569329,0.00159941,1.23944e-06,3.19369e-08,0.57093,0.00160199,1.33525e-06,-1.59594e-08,0.572533,0.00160461,1.28737e-06,3.19006e-08,0.574139,0.00160728,1.38307e-06,-5.20383e-08,0.575748,0.00160989,1.22696e-06,5.70431e-08,0.577359,0.00161251,1.39809e-06,-5.69247e-08,0.578973,0.00161514,1.22731e-06,5.14463e-08,0.580589,0.00161775,1.38165e-06,-2.9651e-08,0.582208,0.00162042,1.2927e-06,7.55339e-09,0.58383,0.00162303,1.31536e-06,-5.62636e-10,0.585455,0.00162566,1.31367e-06,-5.30281e-09,0.587081,0.00162827,1.29776e-06,2.17738e-08,0.588711,0.00163093,1.36309e-06,-2.21875e-08,0.590343,0.00163359,1.29652e-06,7.37164e-09,0.591978,0.00163621,1.31864e-06,-7.29907e-09,0.593616,0.00163882,1.29674e-06,2.18247e-08,0.595256,0.00164148,1.36221e-06,-2.03952e-08,0.596899,0.00164414,1.30103e-06,1.51241e-10,0.598544,0.00164675,1.30148e-06,1.97902e-08,0.600192,0.00164941,1.36085e-06,-1.97074e-08,0.601843,0.00165207,1.30173e-06,-5.65175e-10,0.603496,0.00165467,1.30004e-06,2.1968e-08,0.605152,0.00165734,1.36594e-06,-2.77024e-08,0.606811,0.00165999,1.28283e-06,2.92369e-08,0.608472,0.00166264,1.37054e-06,-2.96407e-08,0.610136,0.00166529,1.28162e-06,2.97215e-08,0.611803,0.00166795,1.37079e-06,-2.96408e-08,0.613472,0.0016706,1.28186e-06,2.92371e-08,0.615144,0.00167325,1.36957e-06,-2.77031e-08,0.616819,0.00167591,1.28647e-06,2.19708e-08,0.618496,0.00167855,1.35238e-06,-5.75407e-10,0.620176,0.00168125,1.35065e-06,-1.9669e-08,0.621858,0.00168389,1.29164e-06,1.96468e-08,0.623544,0.00168653,1.35058e-06,6.86403e-10,0.625232,0.00168924,1.35264e-06,-2.23924e-08,0.626922,0.00169187,1.28547e-06,2.92788e-08,0.628615,0.00169453,1.3733e-06,-3.51181e-08,0.630311,0.00169717,1.26795e-06,5.15889e-08,0.63201,0.00169987,1.42272e-06,-5.2028e-08,0.633711,0.00170255,1.26663e-06,3.73139e-08,0.635415,0.0017052,1.37857e-06,-3.76227e-08,0.
637121,0.00170784,1.2657e-06,5.35722e-08,0.63883,0.00171054,1.42642e-06,-5.74567e-08,0.640542,0.00171322,1.25405e-06,5.70456e-08,0.642257,0.0017159,1.42519e-06,-5.15163e-08,0.643974,0.00171859,1.27064e-06,2.98103e-08,0.645694,0.00172122,1.36007e-06,-8.12016e-09,0.647417,0.00172392,1.33571e-06,2.67039e-09,0.649142,0.0017266,1.34372e-06,-2.56152e-09,0.65087,0.00172928,1.33604e-06,7.57571e-09,0.6526,0.00173197,1.35876e-06,-2.77413e-08,0.654334,0.00173461,1.27554e-06,4.3785e-08,0.65607,0.00173729,1.40689e-06,-2.81896e-08,0.657808,0.00174002,1.32233e-06,9.36893e-09,0.65955,0.00174269,1.35043e-06,-9.28617e-09,0.661294,0.00174536,1.32257e-06,2.77757e-08,0.66304,0.00174809,1.4059e-06,-4.2212e-08,0.66479,0.00175078,1.27926e-06,2.1863e-08,0.666542,0.0017534,1.34485e-06,1.43648e-08,0.668297,0.00175613,1.38795e-06,-1.97177e-08,0.670054,0.00175885,1.3288e-06,4.90115e-09,0.671814,0.00176152,1.3435e-06,1.13232e-10,0.673577,0.00176421,1.34384e-06,-5.3542e-09,0.675343,0.00176688,1.32778e-06,2.13035e-08,0.677111,0.0017696,1.39169e-06,-2.02553e-08,0.678882,0.00177232,1.33092e-06,1.13005e-10,0.680656,0.00177499,1.33126e-06,1.98031e-08,0.682432,0.00177771,1.39067e-06,-1.97211e-08,0.684211,0.00178043,1.33151e-06,-5.2349e-10,0.685993,0.00178309,1.32994e-06,2.18151e-08,0.687777,0.00178582,1.39538e-06,-2.71325e-08,0.689564,0.00178853,1.31398e-06,2.71101e-08,0.691354,0.00179124,1.39531e-06,-2.17035e-08,0.693147,0.00179396,1.3302e-06,9.92865e-11,0.694942,0.00179662,1.3305e-06,2.13063e-08,0.69674,0.00179935,1.39442e-06,-2.57198e-08,0.698541,0.00180206,1.31726e-06,2.19682e-08,0.700344,0.00180476,1.38317e-06,-2.54852e-09,0.70215,0.00180752,1.37552e-06,-1.17741e-08,0.703959,0.00181023,1.3402e-06,-9.95999e-09,0.705771,0.00181288,1.31032e-06,5.16141e-08,0.707585,0.00181566,1.46516e-06,-7.72869e-08,0.709402,0.00181836,1.2333e-06,7.87197e-08,0.711222,0.00182106,1.46946e-06,-5.87781e-08,0.713044,0.00182382,1.29312e-06,3.71834e-08,0.714869,0.00182652,1.40467e-06,-3.03511e-08,0.716697,0.00182924,1.31362
e-06,2.46161e-08,0.718528,0.00183194,1.38747e-06,-8.5087e-09,0.720361,0.00183469,1.36194e-06,9.41892e-09,0.722197,0.00183744,1.3902e-06,-2.91671e-08,0.724036,0.00184014,1.3027e-06,4.76448e-08,0.725878,0.00184288,1.44563e-06,-4.22028e-08,0.727722,0.00184565,1.31902e-06,1.95682e-09,0.729569,0.00184829,1.3249e-06,3.43754e-08,0.731419,0.00185104,1.42802e-06,-2.0249e-08,0.733271,0.00185384,1.36727e-06,-1.29838e-08,0.735126,0.00185654,1.32832e-06,1.25794e-08,0.736984,0.00185923,1.36606e-06,2.22711e-08,0.738845,0.00186203,1.43287e-06,-4.20594e-08,0.740708,0.00186477,1.3067e-06,2.67571e-08,0.742574,0.00186746,1.38697e-06,-5.36424e-09,0.744443,0.00187022,1.37087e-06,-5.30023e-09,0.746315,0.00187295,1.35497e-06,2.65653e-08,0.748189,0.00187574,1.43467e-06,-4.13564e-08,0.750066,0.00187848,1.3106e-06,1.9651e-08,0.751946,0.00188116,1.36955e-06,2.23572e-08,0.753828,0.00188397,1.43663e-06,-4.9475e-08,0.755714,0.00188669,1.2882e-06,5.63335e-08,0.757602,0.00188944,1.4572e-06,-5.66499e-08,0.759493,0.00189218,1.28725e-06,5.10567e-08,0.761386,0.00189491,1.44042e-06,-2.83677e-08,0.763283,0.00189771,1.35532e-06,2.80962e-09,0.765182,0.00190042,1.36375e-06,1.71293e-08,0.767083,0.0019032,1.41513e-06,-1.17221e-08,0.768988,0.001906,1.37997e-06,-2.98453e-08,0.770895,0.00190867,1.29043e-06,7.14987e-08,0.772805,0.00191146,1.50493e-06,-7.73354e-08,0.774718,0.00191424,1.27292e-06,5.90292e-08,0.776634,0.00191697,1.45001e-06,-3.9572e-08,0.778552,0.00191975,1.33129e-06,3.9654e-08,0.780473,0.00192253,1.45026e-06,-5.94395e-08,0.782397,0.00192525,1.27194e-06,7.88945e-08,0.784324,0.00192803,1.50862e-06,-7.73249e-08,0.786253,0.00193082,1.27665e-06,5.15913e-08,0.788185,0.00193352,1.43142e-06,-9.83099e-09,0.79012,0.00193636,1.40193e-06,-1.22672e-08,0.792058,0.00193912,1.36513e-06,-7.05275e-10,0.793999,0.00194185,1.36301e-06,1.50883e-08,0.795942,0.00194462,1.40828e-06,-4.33147e-11,0.797888,0.00194744,1.40815e-06,-1.49151e-08,0.799837,0.00195021,1.3634e-06,9.93244e-11,0.801788,0.00195294,1.3637e-06,1.45179e-08
,0.803743,0.00195571,1.40725e-06,1.43363e-09,0.8057,0.00195853,1.41155e-06,-2.02525e-08,0.80766,0.00196129,1.35079e-06,1.99718e-08,0.809622,0.00196405,1.41071e-06,-3.01649e-11,0.811588,0.00196687,1.41062e-06,-1.9851e-08,0.813556,0.00196964,1.35107e-06,1.98296e-08,0.815527,0.0019724,1.41056e-06,1.37485e-10,0.817501,0.00197522,1.41097e-06,-2.03796e-08,0.819477,0.00197798,1.34983e-06,2.17763e-08,0.821457,0.00198074,1.41516e-06,-7.12085e-09,0.823439,0.00198355,1.3938e-06,6.70707e-09,0.825424,0.00198636,1.41392e-06,-1.97074e-08,0.827412,0.00198913,1.35479e-06,1.25179e-08,0.829402,0.00199188,1.39235e-06,2.92405e-08,0.831396,0.00199475,1.48007e-06,-6.98755e-08,0.833392,0.0019975,1.27044e-06,7.14477e-08,0.835391,0.00200026,1.48479e-06,-3.71014e-08,0.837392,0.00200311,1.37348e-06,1.73533e-08,0.839397,0.00200591,1.42554e-06,-3.23118e-08,0.841404,0.00200867,1.32861e-06,5.2289e-08,0.843414,0.00201148,1.48547e-06,-5.76348e-08,0.845427,0.00201428,1.31257e-06,5.9041e-08,0.847443,0.00201708,1.48969e-06,-5.93197e-08,0.849461,0.00201988,1.31173e-06,5.90289e-08,0.851482,0.00202268,1.48882e-06,-5.75864e-08,0.853507,0.00202549,1.31606e-06,5.21075e-08,0.855533,0.00202828,1.47238e-06,-3.16344e-08,0.857563,0.00203113,1.37748e-06,1.48257e-08,0.859596,0.00203393,1.42196e-06,-2.76684e-08,0.861631,0.00203669,1.33895e-06,3.62433e-08,0.863669,0.00203947,1.44768e-06,1.90463e-09,0.86571,0.00204237,1.45339e-06,-4.38617e-08,0.867754,0.00204515,1.32181e-06,5.43328e-08,0.8698,0.00204796,1.48481e-06,-5.42603e-08,0.87185,0.00205076,1.32203e-06,4.34989e-08,0.873902,0.00205354,1.45252e-06,-5.26029e-10,0.875957,0.00205644,1.45095e-06,-4.13949e-08,0.878015,0.00205922,1.32676e-06,4.68962e-08,0.880075,0.00206201,1.46745e-06,-2.69807e-08,0.882139,0.00206487,1.38651e-06,1.42181e-09,0.884205,0.00206764,1.39077e-06,2.12935e-08,0.886274,0.00207049,1.45465e-06,-2.69912e-08,0.888346,0.00207332,1.37368e-06,2.70664e-08,0.890421,0.00207615,1.45488e-06,-2.16698e-08,0.892498,0.00207899,1.38987e-06,8.14756e-12,0.894579,0.
00208177,1.38989e-06,2.16371e-08,0.896662,0.00208462,1.45481e-06,-2.6952e-08,0.898748,0.00208744,1.37395e-06,2.65663e-08,0.900837,0.00209027,1.45365e-06,-1.97084e-08,0.902928,0.00209312,1.39452e-06,-7.33731e-09,0.905023,0.00209589,1.37251e-06,4.90578e-08,0.90712,0.00209878,1.51968e-06,-6.96845e-08,0.90922,0.00210161,1.31063e-06,5.08664e-08,0.911323,0.00210438,1.46323e-06,-1.45717e-08,0.913429,0.00210727,1.41952e-06,7.42038e-09,0.915538,0.00211013,1.44178e-06,-1.51097e-08,0.917649,0.00211297,1.39645e-06,-6.58618e-09,0.919764,0.00211574,1.37669e-06,4.14545e-08,0.921881,0.00211862,1.50105e-06,-4.00222e-08,0.924001,0.0021215,1.38099e-06,-5.7518e-10,0.926124,0.00212426,1.37926e-06,4.23229e-08,0.92825,0.00212714,1.50623e-06,-4.9507e-08,0.930378,0.00213001,1.35771e-06,3.64958e-08,0.93251,0.00213283,1.4672e-06,-3.68713e-08,0.934644,0.00213566,1.35658e-06,5.13848e-08,0.936781,0.00213852,1.51074e-06,-4.94585e-08,0.938921,0.0021414,1.36236e-06,2.72399e-08,0.941064,0.0021442,1.44408e-06,1.0372e-10,0.943209,0.00214709,1.44439e-06,-2.76547e-08,0.945358,0.0021499,1.36143e-06,5.09106e-08,0.947509,0.00215277,1.51416e-06,-5.67784e-08,0.949663,0.00215563,1.34382e-06,5.69935e-08,0.95182,0.00215849,1.5148e-06,-5.19861e-08,0.95398,0.00216136,1.35885e-06,3.17417e-08,0.956143,0.00216418,1.45407e-06,-1.53758e-08,0.958309,0.00216704,1.40794e-06,2.97615e-08,0.960477,0.00216994,1.49723e-06,-4.40657e-08,0.962649,0.00217281,1.36503e-06,2.72919e-08,0.964823,0.00217562,1.44691e-06,-5.49729e-09,0.967,0.0021785,1.43041e-06,-5.30273e-09,0.96918,0.00218134,1.41451e-06,2.67084e-08,0.971363,0.00218425,1.49463e-06,-4.19265e-08,0.973548,0.00218711,1.36885e-06,2.17881e-08,0.975737,0.00218992,1.43422e-06,1.43789e-08,0.977928,0.00219283,1.47735e-06,-1.96989e-08,0.980122,0.00219572,1.41826e-06,4.81221e-09,0.98232,0.00219857,1.43269e-06,4.50048e-10,0.98452,0.00220144,1.43404e-06,-6.61237e-09,0.986722,0.00220429,1.41421e-06,2.59993e-08,0.988928,0.0022072,1.4922e-06,-3.77803e-08,0.991137,0.00221007,1.37886e-06,5
.9127e-09,0.993348,0.00221284,1.3966e-06,1.33339e-07,0.995563,0.00221604,1.79662e-06,-5.98872e-07,0.99778,0.00222015,0.,0.};
+
+        template <bool srgb, int blueIdx, typename T, typename D>
+        __device__ __forceinline__ void RGB2LabConvert_f(const T& src, D& dst)
+        {
+            // Float RGB/BGR -> Lab for one pixel: reads src.x/.y/.z, stores L, a, b in dst.x/.y/.z.
+            const float cubeRootExp = 1.0f / 3.0f;
+            const float linearOffset = 16.0f / 116.0f;
+            const float threshold = 0.008856f; // values above it take the cube root, the rest the linear form
+
+            // blueIdx == 0: blue is read from .x and red from .z; any other value swaps the two.
+            float red = blueIdx == 0 ? src.z : src.x;
+            float green = src.y;
+            float blue = blueIdx == 0 ? src.x : src.z;
+
+            if (srgb)
+            {
+                // Only for srgb == true: remap every channel through the c_sRGBGammaTab spline lookup.
+                red = splineInterpolate(red * GAMMA_TAB_SIZE, c_sRGBGammaTab, GAMMA_TAB_SIZE);
+                green = splineInterpolate(green * GAMMA_TAB_SIZE, c_sRGBGammaTab, GAMMA_TAB_SIZE);
+                blue = splineInterpolate(blue * GAMMA_TAB_SIZE, c_sRGBGammaTab, GAMMA_TAB_SIZE);
+            }
+
+            const float X = blue * 0.189828f + green * 0.376219f + red * 0.433953f;
+            const float Y = blue * 0.072169f + green * 0.715160f + red * 0.212671f;
+            const float Z = blue * 0.872766f + green * 0.109477f + red * 0.017758f;
+
+            const float fx = X > threshold ? ::powf(X, cubeRootExp) : (7.787f * X + linearOffset);
+            const float fy = Y > threshold ? ::powf(Y, cubeRootExp) : (7.787f * Y + linearOffset);
+            const float fz = Z > threshold ? ::powf(Z, cubeRootExp) : (7.787f * Z + linearOffset);
+
+            dst.x = Y > threshold ? (116.f * fy - 16.f) : (903.3f * Y);
+            dst.y = 500.f * (fx - fy);
+            dst.z = 200.f * (fy - fz);
+        }
+
+        template <typename T, int scn, int dcn, bool srgb, int blueIdx> struct RGB2Lab; // specialized below for uchar and float
+        template <int scn, int dcn, bool srgb, int blueIdx>
+        struct RGB2Lab<uchar, scn, dcn, srgb, blueIdx>
+            : unary_function<typename TypeVec<uchar, scn>::vec_type, typename TypeVec<uchar, dcn>::vec_type>
+        {
+            // Per-pixel functor: hands the uchar pixel to RGB2LabConvert_b and returns the vector it filled in.
+            __device__ __forceinline__ typename TypeVec<uchar, dcn>::vec_type operator ()(const typename TypeVec<uchar, scn>::vec_type& src) const
+            {
+                typedef typename TypeVec<uchar, dcn>::vec_type dst_type;
+                dst_type converted;
+                RGB2LabConvert_b<srgb, blueIdx>(src, converted);
+                return converted;
+            }
+            __host__ __device__ __forceinline__ RGB2Lab() {}
+            __host__ __device__ __forceinline__ RGB2Lab(const RGB2Lab&) {}
+        };
+        template <int scn, int dcn, bool srgb, int blueIdx>
+        struct RGB2Lab<float, scn, dcn, srgb, blueIdx>
+            : unary_function<typename TypeVec<float, scn>::vec_type, typename TypeVec<float, dcn>::vec_type>
+        {
+            // Per-pixel functor: hands the float pixel to RGB2LabConvert_f and returns the vector it filled in.
+            __device__ __forceinline__ typename TypeVec<float, dcn>::vec_type operator ()(const typename TypeVec<float, scn>::vec_type& src) const
+            {
+                typedef typename TypeVec<float, dcn>::vec_type dst_type;
+                dst_type converted;
+                RGB2LabConvert_f<srgb, blueIdx>(src, converted);
+                return converted;
+            }
+            __host__ __device__ __forceinline__ RGB2Lab() {}
+            __host__ __device__ __forceinline__ RGB2Lab(const RGB2Lab&) {}
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(name, scn, dcn, srgb, blueIdx) /* emits struct <name>_traits<T> */ \
+    template <typename T> struct name ## _traits \
+    { /* functor_type: RGB2Lab built from the traits' T and this macro's scn, dcn, srgb, blueIdx arguments */ \
+        typedef ::cv::cuda::device::color_detail::RGB2Lab<T, scn, dcn, srgb, blueIdx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { /* returns a default-constructed functor_type */ \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        __constant__ float c_sRGBInvGammaTab[] = {0,0.0126255,0.,-8.33961e-06,0.0126172,0.0126005,-2.50188e-05,4.1698e-05,0.0252344,0.0126756,0.000100075,-0.000158451,0.0378516,0.0124004,-0.000375277,-0.000207393,0.0496693,0.0110276,-0.000997456,0.00016837,0.0598678,0.00953783,-0.000492346,2.07235e-05,0.068934,0.00861531,-0.000430176,3.62876e-05,0.0771554,0.00786382,-0.000321313,1.87625e-05,0.0847167,0.00727748,-0.000265025,1.53594e-05,0.0917445,0.00679351,-0.000218947,1.10545e-05,0.0983301,0.00638877,-0.000185784,8.66984e-06,0.104542,0.00604322,-0.000159774,6.82996e-06,0.110432,0.00574416,-0.000139284,5.51008e-06,0.116042,0.00548212,-0.000122754,4.52322e-06,0.121406,0.00525018,-0.000109184,3.75557e-06,0.126551,0.00504308,-9.79177e-05,3.17134e-06,0.131499,0.00485676,-8.84037e-05,2.68469e-06,0.13627,0.004688,-8.03496e-05,2.31725e-06,0.14088,0.00453426,-7.33978e-05,2.00868e-06,0.145343,0.00439349,-6.73718e-05,1.74775e-06,0.149671,0.00426399,-6.21286e-05,1.53547e-06,0.153875,0.00414434,-5.75222e-05,1.364e-06,0.157963,0.00403338,-5.34301e-05,1.20416e-06,0.161944,0.00393014,-4.98177e-05,1.09114e-06,0.165825,0.00383377,-4.65443e-05,9.57987e-07,0.169613,0.00374356,-4.36703e-05,8.88359e-07,0.173314,0.00365888,-4.10052e-05,7.7849e-07,0.176933,0.00357921,-3.86697e-05,7.36254e-07,0.180474,0.00350408,-3.6461e-05,6.42534e-07,0.183942,0.00343308,-3.45334e-05,6.12614e-07,0.187342,0.00336586,-3.26955e-05,5.42894e-07,0.190675,0.00330209,-3.10669e-05,5.08967e-07,0.193947,0.00324149,-2.954e-05,4.75977e-07,0.197159,0.00318383,-2.8112e-05,4.18343e-07,0.200315,0.00312887,-2.6857e-05,4.13651e-07,0.203418,0.00307639,-2.5616e-05,3.70847e-07,0.206469,0.00302627,-2.45035e-05,3.3813e-07,0.209471,0.00297828,-2.34891e-05,3.32999e-07,0.212426,0.0029323,-2.24901e-05,2.96826e-07,0.215336,0.00288821,-2.15996e-05,2.82736e-07,0.218203,0.00284586,-2.07514e-05,2.70961e-07,0.221029,0.00280517,-1.99385e-05,2.42744e-07,0.223814,0.00276602,-1.92103e-05,2.33277e-07,0.226561,0.0027283,-1.85105e-05,2.2486e-07
,0.229271,0.00269195,-1.78359e-05,2.08383e-07,0.231945,0.00265691,-1.72108e-05,1.93305e-07,0.234585,0.00262307,-1.66308e-05,1.80687e-07,0.237192,0.00259035,-1.60888e-05,1.86632e-07,0.239766,0.00255873,-1.55289e-05,1.60569e-07,0.24231,0.00252815,-1.50472e-05,1.54566e-07,0.244823,0.00249852,-1.45835e-05,1.59939e-07,0.247307,0.00246983,-1.41037e-05,1.29549e-07,0.249763,0.00244202,-1.3715e-05,1.41429e-07,0.252191,0.00241501,-1.32907e-05,1.39198e-07,0.254593,0.00238885,-1.28731e-05,1.06444e-07,0.256969,0.00236342,-1.25538e-05,1.2048e-07,0.25932,0.00233867,-1.21924e-05,1.26892e-07,0.261647,0.00231467,-1.18117e-05,8.72084e-08,0.26395,0.00229131,-1.15501e-05,1.20323e-07,0.26623,0.00226857,-1.11891e-05,8.71514e-08,0.268487,0.00224645,-1.09276e-05,9.73165e-08,0.270723,0.00222489,-1.06357e-05,8.98259e-08,0.272937,0.00220389,-1.03662e-05,7.98218e-08,0.275131,0.00218339,-1.01267e-05,9.75254e-08,0.277304,0.00216343,-9.83416e-06,6.65195e-08,0.279458,0.00214396,-9.63461e-06,8.34313e-08,0.281592,0.00212494,-9.38431e-06,7.65919e-08,0.283708,0.00210641,-9.15454e-06,5.7236e-08,0.285805,0.00208827,-8.98283e-06,8.18939e-08,0.287885,0.00207055,-8.73715e-06,6.2224e-08,0.289946,0.00205326,-8.55047e-06,5.66388e-08,0.291991,0.00203633,-8.38056e-06,6.88491e-08,0.294019,0.00201978,-8.17401e-06,5.53955e-08,0.296031,0.00200359,-8.00782e-06,6.71971e-08,0.298027,0.00198778,-7.80623e-06,3.34439e-08,0.300007,0.00197227,-7.7059e-06,6.7248e-08,0.301971,0.00195706,-7.50416e-06,5.51915e-08,0.303921,0.00194221,-7.33858e-06,3.98124e-08,0.305856,0.00192766,-7.21915e-06,5.37795e-08,0.307776,0.00191338,-7.05781e-06,4.30919e-08,0.309683,0.00189939,-6.92853e-06,4.20744e-08,0.311575,0.00188566,-6.80231e-06,5.68321e-08,0.313454,0.00187223,-6.63181e-06,2.86195e-08,0.31532,0.00185905,-6.54595e-06,3.73075e-08,0.317172,0.00184607,-6.43403e-06,6.05684e-08,0.319012,0.00183338,-6.25233e-06,1.84426e-08,0.320839,0.00182094,-6.197e-06,4.44757e-08,0.322654,0.00180867,-6.06357e-06,4.20729e-08,0.324456,0.00179667,-5.93735e-06
,2.56511e-08,0.326247,0.00178488,-5.8604e-06,3.41368e-08,0.328026,0.00177326,-5.75799e-06,4.64177e-08,0.329794,0.00176188,-5.61874e-06,1.86107e-08,0.33155,0.0017507,-5.5629e-06,2.81511e-08,0.333295,0.00173966,-5.47845e-06,4.75987e-08,0.335029,0.00172884,-5.33565e-06,1.98726e-08,0.336753,0.00171823,-5.27604e-06,2.19226e-08,0.338466,0.00170775,-5.21027e-06,4.14483e-08,0.340169,0.00169745,-5.08592e-06,2.09017e-08,0.341861,0.00168734,-5.02322e-06,2.39561e-08,0.343543,0.00167737,-4.95135e-06,3.22852e-08,0.345216,0.00166756,-4.85449e-06,2.57173e-08,0.346878,0.00165793,-4.77734e-06,1.38569e-08,0.348532,0.00164841,-4.73577e-06,3.80634e-08,0.350175,0.00163906,-4.62158e-06,1.27043e-08,0.35181,0.00162985,-4.58347e-06,3.03279e-08,0.353435,0.00162078,-4.49249e-06,1.49961e-08,0.355051,0.00161184,-4.4475e-06,2.88977e-08,0.356659,0.00160303,-4.3608e-06,1.84241e-08,0.358257,0.00159436,-4.30553e-06,1.6616e-08,0.359848,0.0015858,-4.25568e-06,3.43218e-08,0.361429,0.00157739,-4.15272e-06,-4.89172e-09,0.363002,0.00156907,-4.16739e-06,4.48498e-08,0.364567,0.00156087,-4.03284e-06,4.30676e-09,0.366124,0.00155282,-4.01992e-06,2.73303e-08,0.367673,0.00154486,-3.93793e-06,5.58036e-09,0.369214,0.001537,-3.92119e-06,3.97554e-08,0.370747,0.00152928,-3.80193e-06,-1.55904e-08,0.372272,0.00152163,-3.8487e-06,5.24081e-08,0.37379,0.00151409,-3.69147e-06,-1.52272e-08,0.375301,0.00150666,-3.73715e-06,3.83028e-08,0.376804,0.0014993,-3.62225e-06,1.10278e-08,0.378299,0.00149209,-3.58916e-06,6.99326e-09,0.379788,0.00148493,-3.56818e-06,2.06038e-08,0.381269,0.00147786,-3.50637e-06,2.98009e-08,0.382744,0.00147093,-3.41697e-06,-2.05978e-08,0.384211,0.00146404,-3.47876e-06,5.25899e-08,0.385672,0.00145724,-3.32099e-06,-1.09471e-08,0.387126,0.00145056,-3.35383e-06,2.10009e-08,0.388573,0.00144392,-3.29083e-06,1.63501e-08,0.390014,0.00143739,-3.24178e-06,3.00641e-09,0.391448,0.00143091,-3.23276e-06,3.12282e-08,0.392875,0.00142454,-3.13908e-06,-8.70932e-09,0.394297,0.00141824,-3.16521e-06,3.34114e-08,0.395712,0.0014
1201,-3.06497e-06,-5.72754e-09,0.397121,0.00140586,-3.08215e-06,1.9301e-08,0.398524,0.00139975,-3.02425e-06,1.7931e-08,0.39992,0.00139376,-2.97046e-06,-1.61822e-09,0.401311,0.00138781,-2.97531e-06,1.83442e-08,0.402696,0.00138192,-2.92028e-06,1.76485e-08,0.404075,0.00137613,-2.86733e-06,4.68617e-10,0.405448,0.00137039,-2.86593e-06,1.02794e-08,0.406816,0.00136469,-2.83509e-06,1.80179e-08,0.408178,0.00135908,-2.78104e-06,7.05594e-09,0.409534,0.00135354,-2.75987e-06,1.33633e-08,0.410885,0.00134806,-2.71978e-06,-9.04568e-10,0.41223,0.00134261,-2.72249e-06,2.0057e-08,0.41357,0.00133723,-2.66232e-06,1.00841e-08,0.414905,0.00133194,-2.63207e-06,-7.88835e-10,0.416234,0.00132667,-2.63444e-06,2.28734e-08,0.417558,0.00132147,-2.56582e-06,-1.29785e-09,0.418877,0.00131633,-2.56971e-06,1.21205e-08,0.420191,0.00131123,-2.53335e-06,1.24202e-08,0.421499,0.0013062,-2.49609e-06,-2.19681e-09,0.422803,0.0013012,-2.50268e-06,2.61696e-08,0.424102,0.00129628,-2.42417e-06,-1.30747e-08,0.425396,0.00129139,-2.46339e-06,2.6129e-08,0.426685,0.00128654,-2.38501e-06,-2.03454e-09,0.427969,0.00128176,-2.39111e-06,1.18115e-08,0.429248,0.00127702,-2.35567e-06,1.43932e-08,0.430523,0.00127235,-2.31249e-06,-9.77965e-09,0.431793,0.00126769,-2.34183e-06,2.47253e-08,0.433058,0.00126308,-2.26766e-06,2.85278e-10,0.434319,0.00125855,-2.2668e-06,3.93614e-09,0.435575,0.00125403,-2.25499e-06,1.37722e-08,0.436827,0.00124956,-2.21368e-06,5.79803e-10,0.438074,0.00124513,-2.21194e-06,1.37112e-08,0.439317,0.00124075,-2.1708e-06,4.17973e-09,0.440556,0.00123642,-2.15826e-06,-6.27703e-10,0.44179,0.0012321,-2.16015e-06,2.81332e-08,0.44302,0.00122787,-2.07575e-06,-2.24985e-08,0.444246,0.00122365,-2.14324e-06,3.20586e-08,0.445467,0.00121946,-2.04707e-06,-1.6329e-08,0.446685,0.00121532,-2.09605e-06,3.32573e-08,0.447898,0.00121122,-1.99628e-06,-2.72927e-08,0.449107,0.00120715,-2.07816e-06,4.6111e-08,0.450312,0.00120313,-1.93983e-06,-3.79416e-08,0.451514,0.00119914,-2.05365e-06,4.60507e-08,0.452711,0.00119517,-1.9155e-06,-2.70
52e-08,0.453904,0.00119126,-1.99666e-06,3.23551e-08,0.455093,0.00118736,-1.89959e-06,-1.29613e-08,0.456279,0.00118352,-1.93848e-06,1.94905e-08,0.45746,0.0011797,-1.88e-06,-5.39588e-09,0.458638,0.00117593,-1.89619e-06,2.09282e-09,0.459812,0.00117214,-1.88991e-06,2.68267e-08,0.460982,0.00116844,-1.80943e-06,-1.99925e-08,0.462149,0.00116476,-1.86941e-06,2.3341e-08,0.463312,0.00116109,-1.79939e-06,-1.37674e-08,0.464471,0.00115745,-1.84069e-06,3.17287e-08,0.465627,0.00115387,-1.7455e-06,-2.37407e-08,0.466779,0.00115031,-1.81673e-06,3.34315e-08,0.467927,0.00114677,-1.71643e-06,-2.05786e-08,0.469073,0.00114328,-1.77817e-06,1.90802e-08,0.470214,0.00113978,-1.72093e-06,3.86247e-09,0.471352,0.00113635,-1.70934e-06,-4.72759e-09,0.472487,0.00113292,-1.72352e-06,1.50478e-08,0.473618,0.00112951,-1.67838e-06,4.14108e-09,0.474746,0.00112617,-1.66595e-06,-1.80986e-09,0.47587,0.00112283,-1.67138e-06,3.09816e-09,0.476991,0.0011195,-1.66209e-06,1.92198e-08,0.478109,0.00111623,-1.60443e-06,-2.03726e-08,0.479224,0.00111296,-1.66555e-06,3.2468e-08,0.480335,0.00110973,-1.56814e-06,-2.00922e-08,0.481443,0.00110653,-1.62842e-06,1.80983e-08,0.482548,0.00110333,-1.57413e-06,7.30362e-09,0.48365,0.0011002,-1.55221e-06,-1.75107e-08,0.484749,0.00109705,-1.60475e-06,3.29373e-08,0.485844,0.00109393,-1.50594e-06,-2.48315e-08,0.486937,0.00109085,-1.58043e-06,3.65865e-08,0.488026,0.0010878,-1.47067e-06,-3.21078e-08,0.489112,0.00108476,-1.56699e-06,3.22397e-08,0.490195,0.00108172,-1.47027e-06,-7.44391e-09,0.491276,0.00107876,-1.49261e-06,-2.46428e-09,0.492353,0.00107577,-1.5e-06,1.73011e-08,0.493427,0.00107282,-1.4481e-06,-7.13552e-09,0.494499,0.0010699,-1.4695e-06,1.1241e-08,0.495567,0.001067,-1.43578e-06,-8.02637e-09,0.496633,0.0010641,-1.45986e-06,2.08645e-08,0.497695,0.00106124,-1.39726e-06,-1.58271e-08,0.498755,0.0010584,-1.44475e-06,1.26415e-08,0.499812,0.00105555,-1.40682e-06,2.48655e-08,0.500866,0.00105281,-1.33222e-06,-5.24988e-08,0.501918,0.00104999,-1.48972e-06,6.59206e-08,0.502966,0.00104721
,-1.29196e-06,-3.237e-08,0.504012,0.00104453,-1.38907e-06,3.95479e-09,0.505055,0.00104176,-1.3772e-06,1.65509e-08,0.506096,0.00103905,-1.32755e-06,-1.05539e-08,0.507133,0.00103637,-1.35921e-06,2.56648e-08,0.508168,0.00103373,-1.28222e-06,-3.25007e-08,0.509201,0.00103106,-1.37972e-06,4.47336e-08,0.51023,0.00102844,-1.24552e-06,-2.72245e-08,0.511258,0.00102587,-1.32719e-06,4.55952e-09,0.512282,0.00102323,-1.31352e-06,8.98645e-09,0.513304,0.00102063,-1.28656e-06,1.90992e-08,0.514323,0.00101811,-1.22926e-06,-2.57786e-08,0.51534,0.00101557,-1.30659e-06,2.44104e-08,0.516355,0.00101303,-1.23336e-06,-1.22581e-08,0.517366,0.00101053,-1.27014e-06,2.4622e-08,0.518376,0.00100806,-1.19627e-06,-2.66253e-08,0.519383,0.00100559,-1.27615e-06,2.22744e-08,0.520387,0.00100311,-1.20932e-06,-2.8679e-09,0.521389,0.00100068,-1.21793e-06,-1.08029e-08,0.522388,0.000998211,-1.25034e-06,4.60795e-08,0.523385,0.000995849,-1.1121e-06,-5.4306e-08,0.52438,0.000993462,-1.27502e-06,5.19354e-08,0.525372,0.000991067,-1.11921e-06,-3.42262e-08,0.526362,0.000988726,-1.22189e-06,2.53646e-08,0.52735,0.000986359,-1.14579e-06,-7.62782e-09,0.528335,0.000984044,-1.16868e-06,5.14668e-09,0.529318,0.000981722,-1.15324e-06,-1.29589e-08,0.530298,0.000979377,-1.19211e-06,4.66888e-08,0.531276,0.000977133,-1.05205e-06,-5.45868e-08,0.532252,0.000974865,-1.21581e-06,5.24495e-08,0.533226,0.000972591,-1.05846e-06,-3.60019e-08,0.534198,0.000970366,-1.16647e-06,3.19537e-08,0.535167,0.000968129,-1.07061e-06,-3.2208e-08,0.536134,0.000965891,-1.16723e-06,3.72738e-08,0.537099,0.000963668,-1.05541e-06,2.32205e-09,0.538061,0.000961564,-1.04844e-06,-4.65618e-08,0.539022,0.000959328,-1.18813e-06,6.47159e-08,0.53998,0.000957146,-9.93979e-07,-3.3488e-08,0.540936,0.000955057,-1.09444e-06,9.63166e-09,0.54189,0.000952897,-1.06555e-06,-5.03871e-09,0.542842,0.000950751,-1.08066e-06,1.05232e-08,0.543792,0.000948621,-1.04909e-06,2.25503e-08,0.544739,0.000946591,-9.81444e-07,-4.11195e-08,0.545685,0.000944504,-1.1048e-06,2.27182e-08,0.546628,0
.000942363,-1.03665e-06,9.85146e-09,0.54757,0.000940319,-1.00709e-06,-2.51938e-09,0.548509,0.000938297,-1.01465e-06,2.25858e-10,0.549446,0.000936269,-1.01397e-06,1.61598e-09,0.550381,0.000934246,-1.00913e-06,-6.68983e-09,0.551315,0.000932207,-1.0292e-06,2.51434e-08,0.552246,0.000930224,-9.53765e-07,-3.42793e-08,0.553175,0.000928214,-1.0566e-06,5.23688e-08,0.554102,0.000926258,-8.99497e-07,-5.59865e-08,0.555028,0.000924291,-1.06746e-06,5.23679e-08,0.555951,0.000922313,-9.10352e-07,-3.42763e-08,0.556872,0.00092039,-1.01318e-06,2.51326e-08,0.557792,0.000918439,-9.37783e-07,-6.64954e-09,0.558709,0.000916543,-9.57732e-07,1.46554e-09,0.559625,0.000914632,-9.53335e-07,7.87281e-10,0.560538,0.000912728,-9.50973e-07,-4.61466e-09,0.56145,0.000910812,-9.64817e-07,1.76713e-08,0.56236,0.000908935,-9.11804e-07,-6.46564e-09,0.563268,0.000907092,-9.312e-07,8.19121e-09,0.564174,0.000905255,-9.06627e-07,-2.62992e-08,0.565078,0.000903362,-9.85524e-07,3.74007e-08,0.565981,0.000901504,-8.73322e-07,-4.0942e-09,0.566882,0.000899745,-8.85605e-07,-2.1024e-08,0.56778,0.00089791,-9.48677e-07,2.85854e-08,0.568677,0.000896099,-8.62921e-07,-3.3713e-08,0.569573,0.000894272,-9.64059e-07,4.6662e-08,0.570466,0.000892484,-8.24073e-07,-3.37258e-08,0.571358,0.000890734,-9.25251e-07,2.86365e-08,0.572247,0.00088897,-8.39341e-07,-2.12155e-08,0.573135,0.000887227,-9.02988e-07,-3.37913e-09,0.574022,0.000885411,-9.13125e-07,3.47319e-08,0.574906,0.000883689,-8.08929e-07,-1.63394e-08,0.575789,0.000882022,-8.57947e-07,-2.8979e-08,0.57667,0.00088022,-9.44885e-07,7.26509e-08,0.57755,0.000878548,-7.26932e-07,-8.28106e-08,0.578427,0.000876845,-9.75364e-07,7.97774e-08,0.579303,0.000875134,-7.36032e-07,-5.74849e-08,0.580178,0.00087349,-9.08486e-07,3.09529e-08,0.58105,0.000871765,-8.15628e-07,-6.72206e-09,0.581921,0.000870114,-8.35794e-07,-4.06451e-09,0.582791,0.00086843,-8.47987e-07,2.29799e-08,0.583658,0.000866803,-7.79048e-07,-2.82503e-08,0.584524,0.00086516,-8.63799e-07,3.04167e-08,0.585388,0.000863524,-7.72548e-07
,-3.38119e-08,0.586251,0.000861877,-8.73984e-07,4.52264e-08,0.587112,0.000860265,-7.38305e-07,-2.78842e-08,0.587972,0.000858705,-8.21958e-07,6.70567e-09,0.58883,0.000857081,-8.01841e-07,1.06161e-09,0.589686,0.000855481,-7.98656e-07,-1.09521e-08,0.590541,0.00085385,-8.31512e-07,4.27468e-08,0.591394,0.000852316,-7.03272e-07,-4.08257e-08,0.592245,0.000850787,-8.25749e-07,1.34677e-09,0.593095,0.000849139,-8.21709e-07,3.54387e-08,0.593944,0.000847602,-7.15393e-07,-2.38924e-08,0.59479,0.0008461,-7.8707e-07,5.26143e-10,0.595636,0.000844527,-7.85491e-07,2.17879e-08,0.596479,0.000843021,-7.20127e-07,-2.80733e-08,0.597322,0.000841497,-8.04347e-07,3.09005e-08,0.598162,0.000839981,-7.11646e-07,-3.5924e-08,0.599002,0.00083845,-8.19418e-07,5.3191e-08,0.599839,0.000836971,-6.59845e-07,-5.76307e-08,0.600676,0.000835478,-8.32737e-07,5.81227e-08,0.60151,0.000833987,-6.58369e-07,-5.56507e-08,0.602344,0.000832503,-8.25321e-07,4.52706e-08,0.603175,0.000830988,-6.89509e-07,-6.22236e-09,0.604006,0.000829591,-7.08176e-07,-2.03811e-08,0.604834,0.000828113,-7.6932e-07,2.8142e-08,0.605662,0.000826659,-6.84894e-07,-3.25822e-08,0.606488,0.000825191,-7.8264e-07,4.25823e-08,0.607312,0.000823754,-6.54893e-07,-1.85376e-08,0.608135,0.000822389,-7.10506e-07,-2.80365e-08,0.608957,0.000820883,-7.94616e-07,7.1079e-08,0.609777,0.000819507,-5.81379e-07,-7.74655e-08,0.610596,0.000818112,-8.13775e-07,5.9969e-08,0.611413,0.000816665,-6.33868e-07,-4.32013e-08,0.612229,0.000815267,-7.63472e-07,5.32313e-08,0.613044,0.0008139,-6.03778e-07,-5.05148e-08,0.613857,0.000812541,-7.55323e-07,2.96187e-08,0.614669,0.000811119,-6.66466e-07,-8.35545e-09,0.615479,0.000809761,-6.91533e-07,3.80301e-09,0.616288,0.00080839,-6.80124e-07,-6.85666e-09,0.617096,0.000807009,-7.00694e-07,2.36237e-08,0.617903,0.000805678,-6.29822e-07,-2.80336e-08,0.618708,0.000804334,-7.13923e-07,2.8906e-08,0.619511,0.000802993,-6.27205e-07,-2.79859e-08,0.620314,0.000801655,-7.11163e-07,2.34329e-08,0.621114,0.000800303,-6.40864e-07,-6.14108e-09,0.6219
14,0.000799003,-6.59287e-07,1.13151e-09,0.622712,0.000797688,-6.55893e-07,1.61507e-09,0.62351,0.000796381,-6.51048e-07,-7.59186e-09,0.624305,0.000795056,-6.73823e-07,2.87524e-08,0.6251,0.000793794,-5.87566e-07,-4.7813e-08,0.625893,0.000792476,-7.31005e-07,4.32901e-08,0.626685,0.000791144,-6.01135e-07,-6.13814e-09,0.627475,0.000789923,-6.19549e-07,-1.87376e-08,0.628264,0.000788628,-6.75762e-07,2.14837e-08,0.629052,0.000787341,-6.11311e-07,-7.59265e-09,0.629839,0.000786095,-6.34089e-07,8.88692e-09,0.630625,0.000784854,-6.07428e-07,-2.7955e-08,0.631409,0.000783555,-6.91293e-07,4.33285e-08,0.632192,0.000782302,-5.61307e-07,-2.61497e-08,0.632973,0.000781101,-6.39757e-07,1.6658e-09,0.633754,0.000779827,-6.34759e-07,1.94866e-08,0.634533,0.000778616,-5.76299e-07,-2.00076e-08,0.635311,0.000777403,-6.36322e-07,9.39091e-10,0.636088,0.000776133,-6.33505e-07,1.62512e-08,0.636863,0.000774915,-5.84751e-07,-6.33937e-09,0.637638,0.000773726,-6.03769e-07,9.10609e-09,0.638411,0.000772546,-5.76451e-07,-3.00849e-08,0.639183,0.000771303,-6.66706e-07,5.1629e-08,0.639953,0.000770125,-5.11819e-07,-5.7222e-08,0.640723,0.000768929,-6.83485e-07,5.80497e-08,0.641491,0.000767736,-5.09336e-07,-5.57674e-08,0.642259,0.000766551,-6.76638e-07,4.58105e-08,0.643024,0.000765335,-5.39206e-07,-8.26541e-09,0.643789,0.000764231,-5.64002e-07,-1.27488e-08,0.644553,0.000763065,-6.02249e-07,-3.44168e-10,0.645315,0.00076186,-6.03281e-07,1.41254e-08,0.646077,0.000760695,-5.60905e-07,3.44727e-09,0.646837,0.000759584,-5.50563e-07,-2.79144e-08,0.647596,0.000758399,-6.34307e-07,4.86057e-08,0.648354,0.000757276,-4.88489e-07,-4.72989e-08,0.64911,0.000756158,-6.30386e-07,2.13807e-08,0.649866,0.000754961,-5.66244e-07,2.13808e-08,0.65062,0.000753893,-5.02102e-07,-4.7299e-08,0.651374,0.000752746,-6.43999e-07,4.86059e-08,0.652126,0.000751604,-4.98181e-07,-2.79154e-08,0.652877,0.000750524,-5.81927e-07,3.45089e-09,0.653627,0.000749371,-5.71575e-07,1.41119e-08,0.654376,0.00074827,-5.29239e-07,-2.93748e-10,0.655123,0.00074721,-
5.3012e-07,-1.29368e-08,0.65587,0.000746111,-5.68931e-07,-7.56355e-09,0.656616,0.000744951,-5.91621e-07,4.3191e-08,0.65736,0.000743897,-4.62048e-07,-4.59911e-08,0.658103,0.000742835,-6.00022e-07,2.15642e-08,0.658846,0.0007417,-5.35329e-07,1.93389e-08,0.659587,0.000740687,-4.77312e-07,-3.93152e-08,0.660327,0.000739615,-5.95258e-07,1.87126e-08,0.661066,0.00073848,-5.3912e-07,2.40695e-08,0.661804,0.000737474,-4.66912e-07,-5.53859e-08,0.662541,0.000736374,-6.33069e-07,7.82648e-08,0.663277,0.000735343,-3.98275e-07,-7.88593e-08,0.664012,0.00073431,-6.34853e-07,5.83585e-08,0.664745,0.000733215,-4.59777e-07,-3.53656e-08,0.665478,0.000732189,-5.65874e-07,2.34994e-08,0.66621,0.000731128,-4.95376e-07,9.72743e-10,0.66694,0.00073014,-4.92458e-07,-2.73903e-08,0.66767,0.000729073,-5.74629e-07,4.89839e-08,0.668398,0.000728071,-4.27677e-07,-4.93359e-08,0.669126,0.000727068,-5.75685e-07,2.91504e-08,0.669853,0.000726004,-4.88234e-07,-7.66109e-09,0.670578,0.000725004,-5.11217e-07,1.49392e-09,0.671303,0.000723986,-5.06735e-07,1.68533e-09,0.672026,0.000722978,-5.01679e-07,-8.23525e-09,0.672749,0.00072195,-5.26385e-07,3.12556e-08,0.67347,0.000720991,-4.32618e-07,-5.71825e-08,0.674191,0.000719954,-6.04166e-07,7.8265e-08,0.67491,0.00071898,-3.69371e-07,-7.70634e-08,0.675628,0.00071801,-6.00561e-07,5.11747e-08,0.676346,0.000716963,-4.47037e-07,-8.42615e-09,0.677062,0.000716044,-4.72315e-07,-1.747e-08,0.677778,0.000715046,-5.24725e-07,1.87015e-08,0.678493,0.000714053,-4.68621e-07,2.26856e-09,0.679206,0.000713123,-4.61815e-07,-2.77758e-08,0.679919,0.000712116,-5.45142e-07,4.92298e-08,0.68063,0.000711173,-3.97453e-07,-4.99339e-08,0.681341,0.000710228,-5.47255e-07,3.12967e-08,0.682051,0.000709228,-4.53365e-07,-1.56481e-08,0.68276,0.000708274,-5.00309e-07,3.12958e-08,0.683467,0.000707367,-4.06422e-07,-4.99303e-08,0.684174,0.000706405,-5.56213e-07,4.9216e-08,0.68488,0.00070544,-4.08565e-07,-2.77245e-08,0.685585,0.00070454,-4.91738e-07,2.07748e-09,0.686289,0.000703562,-4.85506e-07,1.94146e-08,0.686
992,0.00070265,-4.27262e-07,-2.01314e-08,0.687695,0.000701735,-4.87656e-07,1.50616e-09,0.688396,0.000700764,-4.83137e-07,1.41067e-08,0.689096,0.00069984,-4.40817e-07,1.67168e-09,0.689795,0.000698963,-4.35802e-07,-2.07934e-08,0.690494,0.000698029,-4.98182e-07,2.18972e-08,0.691192,0.000697099,-4.32491e-07,-7.19092e-09,0.691888,0.000696212,-4.54064e-07,6.86642e-09,0.692584,0.000695325,-4.33464e-07,-2.02747e-08,0.693279,0.000694397,-4.94288e-07,1.46279e-08,0.693973,0.000693452,-4.50405e-07,2.13678e-08,0.694666,0.000692616,-3.86301e-07,-4.04945e-08,0.695358,0.000691721,-5.07785e-07,2.14009e-08,0.696049,0.00069077,-4.43582e-07,1.44955e-08,0.69674,0.000689926,-4.00096e-07,-1.97783e-08,0.697429,0.000689067,-4.5943e-07,5.01296e-09,0.698118,0.000688163,-4.44392e-07,-2.73521e-10,0.698805,0.000687273,-4.45212e-07,-3.91893e-09,0.699492,0.000686371,-4.56969e-07,1.59493e-08,0.700178,0.000685505,-4.09121e-07,-2.73351e-10,0.700863,0.000684686,-4.09941e-07,-1.4856e-08,0.701548,0.000683822,-4.54509e-07,9.25979e-11,0.702231,0.000682913,-4.54231e-07,1.44855e-08,0.702913,0.000682048,-4.10775e-07,1.56992e-09,0.703595,0.000681231,-4.06065e-07,-2.07652e-08,0.704276,0.000680357,-4.68361e-07,2.18864e-08,0.704956,0.000679486,-4.02701e-07,-7.17595e-09,0.705635,0.000678659,-4.24229e-07,6.81748e-09,0.706313,0.000677831,-4.03777e-07,-2.0094e-08,0.70699,0.000676963,-4.64059e-07,1.39538e-08,0.707667,0.000676077,-4.22197e-07,2.38835e-08,0.708343,0.000675304,-3.50547e-07,-4.98831e-08,0.709018,0.000674453,-5.00196e-07,5.64395e-08,0.709692,0.000673622,-3.30878e-07,-5.66657e-08,0.710365,0.00067279,-5.00875e-07,5.1014e-08,0.711037,0.000671942,-3.47833e-07,-2.81809e-08,0.711709,0.000671161,-4.32376e-07,2.10513e-09,0.712379,0.000670303,-4.2606e-07,1.97604e-08,0.713049,0.00066951,-3.66779e-07,-2.15422e-08,0.713718,0.000668712,-4.31406e-07,6.8038e-09,0.714387,0.000667869,-4.10994e-07,-5.67295e-09,0.715054,0.00066703,-4.28013e-07,1.5888e-08,0.715721,0.000666222,-3.80349e-07,1.72576e-09,0.716387,0.000665467,-3.
75172e-07,-2.27911e-08,0.717052,0.000664648,-4.43545e-07,2.9834e-08,0.717716,0.00066385,-3.54043e-07,-3.69401e-08,0.718379,0.000663031,-4.64864e-07,5.83219e-08,0.719042,0.000662277,-2.89898e-07,-7.71382e-08,0.719704,0.000661465,-5.21313e-07,7.14171e-08,0.720365,0.000660637,-3.07061e-07,-2.97161e-08,0.721025,0.000659934,-3.96209e-07,-1.21575e-08,0.721685,0.000659105,-4.32682e-07,1.87412e-08,0.722343,0.000658296,-3.76458e-07,-3.2029e-09,0.723001,0.000657533,-3.86067e-07,-5.9296e-09,0.723659,0.000656743,-4.03856e-07,2.69213e-08,0.724315,0.000656016,-3.23092e-07,-4.21511e-08,0.724971,0.000655244,-4.49545e-07,2.24737e-08,0.725625,0.000654412,-3.82124e-07,1.18611e-08,0.726279,0.000653683,-3.46541e-07,-1.03132e-08,0.726933,0.000652959,-3.7748e-07,-3.02128e-08,0.727585,0.000652114,-4.68119e-07,7.15597e-08,0.728237,0.000651392,-2.5344e-07,-7.72119e-08,0.728888,0.000650654,-4.85075e-07,5.8474e-08,0.729538,0.000649859,-3.09654e-07,-3.74746e-08,0.730188,0.000649127,-4.22077e-07,3.18197e-08,0.730837,0.000648379,-3.26618e-07,-3.01997e-08,0.731485,0.000647635,-4.17217e-07,2.93747e-08,0.732132,0.000646888,-3.29093e-07,-2.76943e-08,0.732778,0.000646147,-4.12176e-07,2.17979e-08,0.733424,0.000645388,-3.46783e-07,1.07292e-10,0.734069,0.000644695,-3.46461e-07,-2.22271e-08,0.734713,0.000643935,-4.13142e-07,2.91963e-08,0.735357,0.000643197,-3.25553e-07,-3.49536e-08,0.736,0.000642441,-4.30414e-07,5.10133e-08,0.736642,0.000641733,-2.77374e-07,-4.98904e-08,0.737283,0.000641028,-4.27045e-07,2.93392e-08,0.737924,0.000640262,-3.39028e-07,-7.86156e-09,0.738564,0.000639561,-3.62612e-07,2.10703e-09,0.739203,0.000638842,-3.56291e-07,-5.6653e-10,0.739842,0.000638128,-3.57991e-07,1.59086e-10,0.740479,0.000637412,-3.57513e-07,-6.98321e-11,0.741116,0.000636697,-3.57723e-07,1.20214e-10,0.741753,0.000635982,-3.57362e-07,-4.10987e-10,0.742388,0.000635266,-3.58595e-07,1.5237e-09,0.743023,0.000634553,-3.54024e-07,-5.68376e-09,0.743657,0.000633828,-3.71075e-07,2.12113e-08,0.744291,0.00063315,-3.07441e-07,-1.
95569e-08,0.744924,0.000632476,-3.66112e-07,-2.58816e-09,0.745556,0.000631736,-3.73877e-07,2.99096e-08,0.746187,0.000631078,-2.84148e-07,-5.74454e-08,0.746818,0.000630337,-4.56484e-07,8.06629e-08,0.747448,0.000629666,-2.14496e-07,-8.63922e-08,0.748077,0.000628978,-4.73672e-07,8.60918e-08,0.748706,0.000628289,-2.15397e-07,-7.91613e-08,0.749334,0.000627621,-4.5288e-07,5.17393e-08,0.749961,0.00062687,-2.97663e-07,-8.58662e-09,0.750588,0.000626249,-3.23422e-07,-1.73928e-08,0.751214,0.00062555,-3.75601e-07,1.85532e-08,0.751839,0.000624855,-3.19941e-07,2.78479e-09,0.752463,0.000624223,-3.11587e-07,-2.96923e-08,0.753087,0.000623511,-4.00664e-07,5.63799e-08,0.75371,0.000622879,-2.31524e-07,-7.66179e-08,0.754333,0.000622186,-4.61378e-07,7.12778e-08,0.754955,0.000621477,-2.47545e-07,-2.96794e-08,0.755576,0.000620893,-3.36583e-07,-1.21648e-08,0.756196,0.000620183,-3.73077e-07,1.87339e-08,0.756816,0.000619493,-3.16875e-07,-3.16622e-09,0.757435,0.00061885,-3.26374e-07,-6.0691e-09,0.758054,0.000618179,-3.44581e-07,2.74426e-08,0.758672,0.000617572,-2.62254e-07,-4.40968e-08,0.759289,0.000616915,-3.94544e-07,2.97352e-08,0.759906,0.000616215,-3.05338e-07,-1.52393e-08,0.760522,0.000615559,-3.51056e-07,3.12221e-08,0.761137,0.000614951,-2.5739e-07,-5.00443e-08,0.761751,0.000614286,-4.07523e-07,4.9746e-08,0.762365,0.00061362,-2.58285e-07,-2.97303e-08,0.762979,0.000613014,-3.47476e-07,9.57079e-09,0.763591,0.000612348,-3.18764e-07,-8.55287e-09,0.764203,0.000611685,-3.44422e-07,2.46407e-08,0.764815,0.00061107,-2.705e-07,-3.04053e-08,0.765426,0.000610437,-3.61716e-07,3.73759e-08,0.766036,0.000609826,-2.49589e-07,-5.94935e-08,0.766645,0.000609149,-4.28069e-07,8.13889e-08,0.767254,0.000608537,-1.83902e-07,-8.72483e-08,0.767862,0.000607907,-4.45647e-07,8.87901e-08,0.76847,0.000607282,-1.79277e-07,-8.90983e-08,0.769077,0.000606656,-4.46572e-07,8.87892e-08,0.769683,0.000606029,-1.80204e-07,-8.72446e-08,0.770289,0.000605407,-4.41938e-07,8.13752e-08,0.770894,0.000604768,-1.97812e-07,-5.94423e-08,0.
771498,0.000604194,-3.76139e-07,3.71848e-08,0.772102,0.000603553,-2.64585e-07,-2.96922e-08,0.772705,0.000602935,-3.53661e-07,2.19793e-08,0.773308,0.000602293,-2.87723e-07,1.37955e-09,0.77391,0.000601722,-2.83585e-07,-2.74976e-08,0.774512,0.000601072,-3.66077e-07,4.9006e-08,0.775112,0.000600487,-2.19059e-07,-4.93171e-08,0.775712,0.000599901,-3.67011e-07,2.90531e-08,0.776312,0.000599254,-2.79851e-07,-7.29081e-09,0.776911,0.000598673,-3.01724e-07,1.10077e-10,0.777509,0.00059807,-3.01393e-07,6.85053e-09,0.778107,0.000597487,-2.80842e-07,-2.75123e-08,0.778704,0.000596843,-3.63379e-07,4.35939e-08,0.779301,0.000596247,-2.32597e-07,-2.7654e-08,0.779897,0.000595699,-3.15559e-07,7.41741e-09,0.780492,0.00059509,-2.93307e-07,-2.01562e-09,0.781087,0.000594497,-2.99354e-07,6.45059e-10,0.781681,0.000593901,-2.97418e-07,-5.64635e-10,0.782275,0.000593304,-2.99112e-07,1.61347e-09,0.782868,0.000592711,-2.94272e-07,-5.88926e-09,0.78346,0.000592105,-3.1194e-07,2.19436e-08,0.784052,0.000591546,-2.46109e-07,-2.22805e-08,0.784643,0.000590987,-3.1295e-07,7.57368e-09,0.785234,0.000590384,-2.90229e-07,-8.01428e-09,0.785824,0.00058978,-3.14272e-07,2.44834e-08,0.786414,0.000589225,-2.40822e-07,-3.03148e-08,0.787003,0.000588652,-3.31766e-07,3.7171e-08,0.787591,0.0005881,-2.20253e-07,-5.87646e-08,0.788179,0.000587483,-3.96547e-07,7.86782e-08,0.788766,0.000586926,-1.60512e-07,-7.71342e-08,0.789353,0.000586374,-3.91915e-07,5.10444e-08,0.789939,0.000585743,-2.38782e-07,-7.83422e-09,0.790524,0.000585242,-2.62284e-07,-1.97076e-08,0.791109,0.000584658,-3.21407e-07,2.70598e-08,0.791693,0.000584097,-2.40228e-07,-2.89269e-08,0.792277,0.000583529,-3.27008e-07,2.90431e-08,0.792861,0.000582963,-2.39879e-07,-2.76409e-08,0.793443,0.0005824,-3.22802e-07,2.1916e-08,0.794025,0.00058182,-2.57054e-07,-4.18368e-10,0.794607,0.000581305,-2.58309e-07,-2.02425e-08,0.795188,0.000580727,-3.19036e-07,2.17838e-08,0.795768,0.000580155,-2.53685e-07,-7.28814e-09,0.796348,0.000579625,-2.75549e-07,7.36871e-09,0.796928,0.00057909
6,-2.53443e-07,-2.21867e-08,0.797506,0.000578523,-3.20003e-07,2.17736e-08,0.798085,0.000577948,-2.54683e-07,-5.30296e-09,0.798662,0.000577423,-2.70592e-07,-5.61698e-10,0.799239,0.00057688,-2.72277e-07,7.54977e-09,0.799816,0.000576358,-2.49627e-07,-2.96374e-08,0.800392,0.00057577,-3.38539e-07,5.1395e-08,0.800968,0.000575247,-1.84354e-07,-5.67335e-08,0.801543,0.000574708,-3.54555e-07,5.63297e-08,0.802117,0.000574168,-1.85566e-07,-4.93759e-08,0.802691,0.000573649,-3.33693e-07,2.19646e-08,0.803264,0.000573047,-2.678e-07,2.1122e-08,0.803837,0.000572575,-2.04433e-07,-4.68482e-08,0.804409,0.000572026,-3.44978e-07,4.70613e-08,0.804981,0.000571477,-2.03794e-07,-2.21877e-08,0.805552,0.000571003,-2.70357e-07,-1.79153e-08,0.806123,0.000570408,-3.24103e-07,3.42443e-08,0.806693,0.000569863,-2.2137e-07,1.47556e-10,0.807263,0.000569421,-2.20928e-07,-3.48345e-08,0.807832,0.000568874,-3.25431e-07,1.99812e-08,0.808401,0.000568283,-2.65487e-07,1.45143e-08,0.808969,0.000567796,-2.21945e-07,-1.84338e-08,0.809536,0.000567297,-2.77246e-07,-3.83608e-10,0.810103,0.000566741,-2.78397e-07,1.99683e-08,0.81067,0.000566244,-2.18492e-07,-1.98848e-08,0.811236,0.000565747,-2.78146e-07,-3.38976e-11,0.811801,0.000565191,-2.78248e-07,2.00204e-08,0.812366,0.000564695,-2.18187e-07,-2.04429e-08,0.812931,0.000564197,-2.79516e-07,2.1467e-09,0.813495,0.000563644,-2.73076e-07,1.18561e-08,0.814058,0.000563134,-2.37507e-07,1.00334e-08,0.814621,0.000562689,-2.07407e-07,-5.19898e-08,0.815183,0.000562118,-3.63376e-07,7.87163e-08,0.815745,0.000561627,-1.27227e-07,-8.40616e-08,0.816306,0.000561121,-3.79412e-07,7.87163e-08,0.816867,0.000560598,-1.43263e-07,-5.19898e-08,0.817428,0.000560156,-2.99233e-07,1.00335e-08,0.817988,0.000559587,-2.69132e-07,1.18559e-08,0.818547,0.000559085,-2.33564e-07,2.14764e-09,0.819106,0.000558624,-2.27122e-07,-2.04464e-08,0.819664,0.000558108,-2.88461e-07,2.00334e-08,0.820222,0.000557591,-2.28361e-07,-8.24277e-11,0.820779,0.000557135,-2.28608e-07,-1.97037e-08,0.821336,0.000556618,-2.87719
e-07,1.92925e-08,0.821893,0.000556101,-2.29841e-07,2.13831e-09,0.822448,0.000555647,-2.23427e-07,-2.78458e-08,0.823004,0.000555117,-3.06964e-07,4.96402e-08,0.823559,0.000554652,-1.58043e-07,-5.15058e-08,0.824113,0.000554181,-3.12561e-07,3.71737e-08,0.824667,0.000553668,-2.0104e-07,-3.75844e-08,0.82522,0.000553153,-3.13793e-07,5.35592e-08,0.825773,0.000552686,-1.53115e-07,-5.74431e-08,0.826326,0.000552207,-3.25444e-07,5.7004e-08,0.826878,0.000551728,-1.54433e-07,-5.13635e-08,0.827429,0.000551265,-3.08523e-07,2.92406e-08,0.82798,0.000550735,-2.20801e-07,-5.99424e-09,0.828531,0.000550276,-2.38784e-07,-5.26363e-09,0.829081,0.000549782,-2.54575e-07,2.70488e-08,0.82963,0.000549354,-1.73429e-07,-4.33268e-08,0.83018,0.000548878,-3.03409e-07,2.7049e-08,0.830728,0.000548352,-2.22262e-07,-5.26461e-09,0.831276,0.000547892,-2.38056e-07,-5.99057e-09,0.831824,0.000547397,-2.56027e-07,2.92269e-08,0.832371,0.000546973,-1.68347e-07,-5.13125e-08,0.832918,0.000546482,-3.22284e-07,5.68139e-08,0.833464,0.000546008,-1.51843e-07,-5.67336e-08,0.83401,0.000545534,-3.22043e-07,5.09113e-08,0.834555,0.000545043,-1.6931e-07,-2.77022e-08,0.8351,0.000544621,-2.52416e-07,2.92924e-10,0.835644,0.000544117,-2.51537e-07,2.65305e-08,0.836188,0.000543694,-1.71946e-07,-4.68105e-08,0.836732,0.00054321,-3.12377e-07,4.15021e-08,0.837275,0.000542709,-1.87871e-07,1.13355e-11,0.837817,0.000542334,-1.87837e-07,-4.15474e-08,0.838359,0.000541833,-3.12479e-07,4.69691e-08,0.838901,0.000541349,-1.71572e-07,-2.71196e-08,0.839442,0.000540925,-2.52931e-07,1.90462e-09,0.839983,0.000540425,-2.47217e-07,1.95011e-08,0.840523,0.000539989,-1.88713e-07,-2.03045e-08,0.841063,0.00053955,-2.49627e-07,2.11216e-09,0.841602,0.000539057,-2.4329e-07,1.18558e-08,0.842141,0.000538606,-2.07723e-07,1.00691e-08,0.842679,0.000538221,-1.77516e-07,-5.21324e-08,0.843217,0.00053771,-3.33913e-07,7.92513e-08,0.843755,0.00053728,-9.6159e-08,-8.60587e-08,0.844292,0.000536829,-3.54335e-07,8.61696e-08,0.844828,0.000536379,-9.58263e-08,-7.98057e-08,0.
845364,0.000535948,-3.35243e-07,5.42394e-08,0.8459,0.00053544,-1.72525e-07,-1.79426e-08,0.846435,0.000535041,-2.26353e-07,1.75308e-08,0.84697,0.000534641,-1.73761e-07,-5.21806e-08,0.847505,0.000534137,-3.30302e-07,7.19824e-08,0.848038,0.000533692,-1.14355e-07,-5.69349e-08,0.848572,0.000533293,-2.8516e-07,3.65479e-08,0.849105,0.000532832,-1.75516e-07,-2.96519e-08,0.849638,0.000532392,-2.64472e-07,2.2455e-08,0.85017,0.000531931,-1.97107e-07,-5.63451e-10,0.850702,0.000531535,-1.98797e-07,-2.02011e-08,0.851233,0.000531077,-2.59401e-07,2.17634e-08,0.851764,0.000530623,-1.94111e-07,-7.24794e-09,0.852294,0.000530213,-2.15854e-07,7.22832e-09,0.852824,0.000529803,-1.94169e-07,-2.16653e-08,0.853354,0.00052935,-2.59165e-07,1.98283e-08,0.853883,0.000528891,-1.9968e-07,1.95678e-09,0.854412,0.000528497,-1.9381e-07,-2.76554e-08,0.85494,0.000528027,-2.76776e-07,4.90603e-08,0.855468,0.00052762,-1.29596e-07,-4.93764e-08,0.855995,0.000527213,-2.77725e-07,2.92361e-08,0.856522,0.000526745,-1.90016e-07,-7.96341e-09,0.857049,0.000526341,-2.13907e-07,2.61752e-09,0.857575,0.000525922,-2.06054e-07,-2.50665e-09,0.8581,0.000525502,-2.13574e-07,7.40906e-09,0.858626,0.000525097,-1.91347e-07,-2.71296e-08,0.859151,0.000524633,-2.72736e-07,4.15048e-08,0.859675,0.000524212,-1.48221e-07,-1.96802e-08,0.860199,0.000523856,-2.07262e-07,-2.23886e-08,0.860723,0.000523375,-2.74428e-07,4.96299e-08,0.861246,0.000522975,-1.25538e-07,-5.69216e-08,0.861769,0.000522553,-2.96303e-07,5.88473e-08,0.862291,0.000522137,-1.19761e-07,-5.92584e-08,0.862813,0.00052172,-2.97536e-07,5.8977e-08,0.863334,0.000521301,-1.20605e-07,-5.74403e-08,0.863855,0.000520888,-2.92926e-07,5.15751e-08,0.864376,0.000520457,-1.38201e-07,-2.96506e-08,0.864896,0.000520091,-2.27153e-07,7.42277e-09,0.865416,0.000519659,-2.04885e-07,-4.05057e-11,0.865936,0.00051925,-2.05006e-07,-7.26074e-09,0.866455,0.000518818,-2.26788e-07,2.90835e-08,0.866973,0.000518451,-1.39538e-07,-4.94686e-08,0.867492,0.000518024,-2.87944e-07,4.95814e-08,0.868009,0.00051759
7,-1.39199e-07,-2.96479e-08,0.868527,0.000517229,-2.28143e-07,9.40539e-09,0.869044,0.000516801,-1.99927e-07,-7.9737e-09,0.86956,0.000516378,-2.23848e-07,2.24894e-08,0.870077,0.000515997,-1.5638e-07,-2.23793e-08,0.870592,0.000515617,-2.23517e-07,7.42302e-09,0.871108,0.000515193,-2.01248e-07,-7.31283e-09,0.871623,0.000514768,-2.23187e-07,2.18283e-08,0.872137,0.000514387,-1.57702e-07,-2.03959e-08,0.872652,0.000514011,-2.1889e-07,1.50711e-10,0.873165,0.000513573,-2.18437e-07,1.97931e-08,0.873679,0.000513196,-1.59058e-07,-1.97183e-08,0.874192,0.000512819,-2.18213e-07,-5.24324e-10,0.874704,0.000512381,-2.19786e-07,2.18156e-08,0.875217,0.000512007,-1.54339e-07,-2.71336e-08,0.875728,0.000511616,-2.3574e-07,2.71141e-08,0.87624,0.000511226,-1.54398e-07,-2.17182e-08,0.876751,0.000510852,-2.19552e-07,1.54131e-10,0.877262,0.000510414,-2.1909e-07,2.11017e-08,0.877772,0.000510039,-1.55785e-07,-2.49562e-08,0.878282,0.000509652,-2.30654e-07,1.91183e-08,0.878791,0.000509248,-1.73299e-07,8.08751e-09,0.8793,0.000508926,-1.49036e-07,-5.14684e-08,0.879809,0.000508474,-3.03441e-07,7.85766e-08,0.880317,0.000508103,-6.77112e-08,-8.40242e-08,0.880825,0.000507715,-3.19784e-07,7.87063e-08,0.881333,0.000507312,-8.36649e-08,-5.19871e-08,0.88184,0.000506988,-2.39626e-07,1.00327e-08,0.882346,0.000506539,-2.09528e-07,1.18562e-08,0.882853,0.000506156,-1.73959e-07,2.14703e-09,0.883359,0.000505814,-1.67518e-07,-2.04444e-08,0.883864,0.000505418,-2.28851e-07,2.00258e-08,0.88437,0.00050502,-1.68774e-07,-5.42855e-11,0.884874,0.000504682,-1.68937e-07,-1.98087e-08,0.885379,0.000504285,-2.28363e-07,1.96842e-08,0.885883,0.000503887,-1.6931e-07,6.76342e-10,0.886387,0.000503551,-1.67281e-07,-2.23896e-08,0.88689,0.000503149,-2.3445e-07,2.92774e-08,0.887393,0.000502768,-1.46618e-07,-3.51152e-08,0.887896,0.00050237,-2.51963e-07,5.15787e-08,0.888398,0.00050202,-9.72271e-08,-5.19903e-08,0.8889,0.00050167,-2.53198e-07,3.71732e-08,0.889401,0.000501275,-1.41678e-07,-3.70978e-08,0.889902,0.00050088,-2.52972e-07,5.16132e
-08,0.890403,0.000500529,-9.81321e-08,-5.01459e-08,0.890903,0.000500183,-2.4857e-07,2.9761e-08,0.891403,0.000499775,-1.59287e-07,-9.29351e-09,0.891903,0.000499428,-1.87167e-07,7.41301e-09,0.892402,0.000499076,-1.64928e-07,-2.03585e-08,0.892901,0.000498685,-2.26004e-07,1.44165e-08,0.893399,0.000498276,-1.82754e-07,2.22974e-08,0.893898,0.000497978,-1.15862e-07,-4.40013e-08,0.894395,0.000497614,-2.47866e-07,3.44985e-08,0.894893,0.000497222,-1.44371e-07,-3.43882e-08,0.89539,0.00049683,-2.47535e-07,4.34497e-08,0.895886,0.000496465,-1.17186e-07,-2.02012e-08,0.896383,0.00049617,-1.7779e-07,-2.22497e-08,0.896879,0.000495748,-2.44539e-07,4.95952e-08,0.897374,0.000495408,-9.57532e-08,-5.69217e-08,0.89787,0.000495045,-2.66518e-07,5.88823e-08,0.898364,0.000494689,-8.98713e-08,-5.93983e-08,0.898859,0.000494331,-2.68066e-07,5.95017e-08,0.899353,0.000493973,-8.95613e-08,-5.9399e-08,0.899847,0.000493616,-2.67758e-07,5.8885e-08,0.90034,0.000493257,-9.11033e-08,-5.69317e-08,0.900833,0.000492904,-2.61898e-07,4.96326e-08,0.901326,0.000492529,-1.13001e-07,-2.23893e-08,0.901819,0.000492236,-1.80169e-07,-1.968e-08,0.902311,0.000491817,-2.39209e-07,4.15047e-08,0.902802,0.000491463,-1.14694e-07,-2.71296e-08,0.903293,0.000491152,-1.96083e-07,7.409e-09,0.903784,0.000490782,-1.73856e-07,-2.50645e-09,0.904275,0.000490427,-1.81376e-07,2.61679e-09,0.904765,0.000490072,-1.73525e-07,-7.96072e-09,0.905255,0.000489701,-1.97407e-07,2.92261e-08,0.905745,0.000489394,-1.09729e-07,-4.93389e-08,0.906234,0.000489027,-2.57746e-07,4.89204e-08,0.906723,0.000488658,-1.10985e-07,-2.71333e-08,0.907211,0.000488354,-1.92385e-07,8.30861e-12,0.907699,0.00048797,-1.9236e-07,2.71001e-08,0.908187,0.000487666,-1.1106e-07,-4.88041e-08,0.908675,0.000487298,-2.57472e-07,4.89069e-08,0.909162,0.000486929,-1.10751e-07,-2.76143e-08,0.909649,0.000486625,-1.93594e-07,1.9457e-09,0.910135,0.000486244,-1.87757e-07,1.98315e-08,0.910621,0.000485928,-1.28262e-07,-2.16671e-08,0.911107,0.000485606,-1.93264e-07,7.23216e-09,0.911592,0.0004
85241,-1.71567e-07,-7.26152e-09,0.912077,0.000484877,-1.93352e-07,2.18139e-08,0.912562,0.000484555,-1.2791e-07,-2.03895e-08,0.913047,0.000484238,-1.89078e-07,1.39494e-10,0.913531,0.000483861,-1.8866e-07,1.98315e-08,0.914014,0.000483543,-1.29165e-07,-1.98609e-08,0.914498,0.000483225,-1.88748e-07,7.39912e-12,0.914981,0.000482847,-1.88726e-07,1.98313e-08,0.915463,0.000482529,-1.29232e-07,-1.9728e-08,0.915946,0.000482212,-1.88416e-07,-5.24035e-10,0.916428,0.000481833,-1.89988e-07,2.18241e-08,0.916909,0.000481519,-1.24516e-07,-2.71679e-08,0.917391,0.000481188,-2.06019e-07,2.72427e-08,0.917872,0.000480858,-1.24291e-07,-2.21985e-08,0.918353,0.000480543,-1.90886e-07,1.94644e-09,0.918833,0.000480167,-1.85047e-07,1.44127e-08,0.919313,0.00047984,-1.41809e-07,7.39438e-12,0.919793,0.000479556,-1.41787e-07,-1.44423e-08,0.920272,0.000479229,-1.85114e-07,-1.84291e-09,0.920751,0.000478854,-1.90642e-07,2.18139e-08,0.92123,0.000478538,-1.25201e-07,-2.58081e-08,0.921708,0.00047821,-2.02625e-07,2.18139e-08,0.922186,0.00047787,-1.37183e-07,-1.84291e-09,0.922664,0.00047759,-1.42712e-07,-1.44423e-08,0.923141,0.000477262,-1.86039e-07,7.34701e-12,0.923618,0.00047689,-1.86017e-07,1.44129e-08,0.924095,0.000476561,-1.42778e-07,1.94572e-09,0.924572,0.000476281,-1.36941e-07,-2.21958e-08,0.925048,0.000475941,-2.03528e-07,2.72327e-08,0.925523,0.000475615,-1.2183e-07,-2.71304e-08,0.925999,0.00047529,-2.03221e-07,2.16843e-08,0.926474,0.000474949,-1.38168e-07,-2.16005e-12,0.926949,0.000474672,-1.38175e-07,-2.16756e-08,0.927423,0.000474331,-2.03202e-07,2.71001e-08,0.927897,0.000474006,-1.21902e-07,-2.71201e-08,0.928371,0.000473681,-2.03262e-07,2.17757e-08,0.928845,0.00047334,-1.37935e-07,-3.78028e-10,0.929318,0.000473063,-1.39069e-07,-2.02636e-08,0.929791,0.000472724,-1.9986e-07,2.18276e-08,0.930263,0.000472389,-1.34377e-07,-7.44231e-09,0.930736,0.000472098,-1.56704e-07,7.94165e-09,0.931208,0.000471809,-1.32879e-07,-2.43243e-08,0.931679,0.00047147,-2.05851e-07,2.97508e-08,0.932151,0.000471148,-1.16599e
-07,-3.50742e-08,0.932622,0.000470809,-2.21822e-07,5.09414e-08,0.933092,0.000470518,-6.89976e-08,-4.94821e-08,0.933563,0.000470232,-2.17444e-07,2.77775e-08,0.934033,0.00046988,-1.34111e-07,-2.02351e-09,0.934502,0.000469606,-1.40182e-07,-1.96835e-08,0.934972,0.000469267,-1.99232e-07,2.11529e-08,0.935441,0.000468932,-1.35774e-07,-5.32332e-09,0.93591,0.000468644,-1.51743e-07,1.40413e-10,0.936378,0.000468341,-1.51322e-07,4.76166e-09,0.936846,0.000468053,-1.37037e-07,-1.9187e-08,0.937314,0.000467721,-1.94598e-07,1.23819e-08,0.937782,0.000467369,-1.57453e-07,2.92642e-08,0.938249,0.000467142,-6.96601e-08,-6.98342e-08,0.938716,0.000466793,-2.79163e-07,7.12586e-08,0.939183,0.000466449,-6.53869e-08,-3.63863e-08,0.939649,0.000466209,-1.74546e-07,1.46818e-08,0.940115,0.000465904,-1.305e-07,-2.2341e-08,0.940581,0.000465576,-1.97523e-07,1.50774e-08,0.941046,0.000465226,-1.52291e-07,2.16359e-08,0.941511,0.000464986,-8.73832e-08,-4.20162e-08,0.941976,0.000464685,-2.13432e-07,2.72198e-08,0.942441,0.00046434,-1.31773e-07,-7.2581e-09,0.942905,0.000464055,-1.53547e-07,1.81263e-09,0.943369,0.000463753,-1.48109e-07,7.58386e-12,0.943832,0.000463457,-1.48086e-07,-1.84298e-09,0.944296,0.000463155,-1.53615e-07,7.36433e-09,0.944759,0.00046287,-1.31522e-07,-2.76143e-08,0.945221,0.000462524,-2.14365e-07,4.34883e-08,0.945684,0.000462226,-8.39003e-08,-2.71297e-08,0.946146,0.000461977,-1.65289e-07,5.42595e-09,0.946608,0.000461662,-1.49012e-07,5.42593e-09,0.947069,0.000461381,-1.32734e-07,-2.71297e-08,0.94753,0.000461034,-2.14123e-07,4.34881e-08,0.947991,0.000460736,-8.36585e-08,-2.76134e-08,0.948452,0.000460486,-1.66499e-07,7.36083e-09,0.948912,0.000460175,-1.44416e-07,-1.82993e-09,0.949372,0.000459881,-1.49906e-07,-4.11073e-11,0.949832,0.000459581,-1.50029e-07,1.99434e-09,0.950291,0.000459287,-1.44046e-07,-7.93627e-09,0.950751,0.000458975,-1.67855e-07,2.97507e-08,0.951209,0.000458728,-7.86029e-08,-5.1462e-08,0.951668,0.000458417,-2.32989e-07,5.6888e-08,0.952126,0.000458121,-6.2325e-08,-5.68806e-0
8,0.952584,0.000457826,-2.32967e-07,5.14251e-08,0.953042,0.000457514,-7.86914e-08,-2.96107e-08,0.953499,0.000457268,-1.67523e-07,7.41296e-09,0.953956,0.000456955,-1.45285e-07,-4.11262e-11,0.954413,0.000456665,-1.45408e-07,-7.24847e-09,0.95487,0.000456352,-1.67153e-07,2.9035e-08,0.955326,0.000456105,-8.00484e-08,-4.92869e-08,0.955782,0.000455797,-2.27909e-07,4.89032e-08,0.956238,0.000455488,-8.11994e-08,-2.71166e-08,0.956693,0.000455244,-1.62549e-07,-4.13678e-11,0.957148,0.000454919,-1.62673e-07,2.72821e-08,0.957603,0.000454675,-8.0827e-08,-4.94824e-08,0.958057,0.000454365,-2.29274e-07,5.14382e-08,0.958512,0.000454061,-7.49597e-08,-3.7061e-08,0.958965,0.0004538,-1.86143e-07,3.72013e-08,0.959419,0.000453539,-7.45389e-08,-5.21396e-08,0.959873,0.000453234,-2.30958e-07,5.21476e-08,0.960326,0.000452928,-7.45146e-08,-3.72416e-08,0.960778,0.000452667,-1.8624e-07,3.72143e-08,0.961231,0.000452407,-7.45967e-08,-5.20109e-08,0.961683,0.000452101,-2.30629e-07,5.16199e-08,0.962135,0.000451795,-7.57696e-08,-3.52595e-08,0.962587,0.000451538,-1.81548e-07,2.98133e-08,0.963038,0.000451264,-9.2108e-08,-2.43892e-08,0.963489,0.000451007,-1.65276e-07,8.13892e-09,0.96394,0.000450701,-1.40859e-07,-8.16647e-09,0.964391,0.000450394,-1.65358e-07,2.45269e-08,0.964841,0.000450137,-9.17775e-08,-3.03367e-08,0.965291,0.000449863,-1.82787e-07,3.7215e-08,0.965741,0.000449609,-7.11424e-08,-5.89188e-08,0.96619,0.00044929,-2.47899e-07,7.92509e-08,0.966639,0.000449032,-1.01462e-08,-7.92707e-08,0.967088,0.000448773,-2.47958e-07,5.90181e-08,0.967537,0.000448455,-7.0904e-08,-3.75925e-08,0.967985,0.0004482,-1.83681e-07,3.17471e-08,0.968433,0.000447928,-8.84401e-08,-2.97913e-08,0.968881,0.000447662,-1.77814e-07,2.78133e-08,0.969329,0.000447389,-9.4374e-08,-2.18572e-08,0.969776,0.000447135,-1.59946e-07,1.10134e-11,0.970223,0.000446815,-1.59913e-07,2.18132e-08,0.97067,0.000446561,-9.44732e-08,-2.76591e-08,0.971116,0.000446289,-1.7745e-07,2.92185e-08,0.971562,0.000446022,-8.97948e-08,-2.96104e-08,0.972008,0.00044
5753,-1.78626e-07,2.96185e-08,0.972454,0.000445485,-8.97706e-08,-2.92588e-08,0.972899,0.000445218,-1.77547e-07,2.78123e-08,0.973344,0.000444946,-9.41103e-08,-2.23856e-08,0.973789,0.000444691,-1.61267e-07,2.12559e-09,0.974233,0.000444374,-1.5489e-07,1.38833e-08,0.974678,0.000444106,-1.13241e-07,1.94591e-09,0.975122,0.000443886,-1.07403e-07,-2.16669e-08,0.975565,0.000443606,-1.72404e-07,2.5117e-08,0.976009,0.000443336,-9.70526e-08,-1.91963e-08,0.976452,0.000443085,-1.54642e-07,-7.93627e-09,0.976895,0.000442752,-1.7845e-07,5.09414e-08,0.977338,0.000442548,-2.56262e-08,-7.66201e-08,0.97778,0.000442266,-2.55486e-07,7.67249e-08,0.978222,0.000441986,-2.53118e-08,-5.14655e-08,0.978664,0.000441781,-1.79708e-07,9.92773e-09,0.979106,0.000441451,-1.49925e-07,1.17546e-08,0.979547,0.000441186,-1.14661e-07,2.65868e-09,0.979988,0.000440965,-1.06685e-07,-2.23893e-08,0.980429,0.000440684,-1.73853e-07,2.72939e-08,0.980869,0.000440419,-9.19716e-08,-2.71816e-08,0.98131,0.000440153,-1.73516e-07,2.18278e-08,0.98175,0.000439872,-1.08033e-07,-5.24833e-10,0.982189,0.000439654,-1.09607e-07,-1.97284e-08,0.982629,0.000439376,-1.68793e-07,1.98339e-08,0.983068,0.000439097,-1.09291e-07,-2.62901e-12,0.983507,0.000438879,-1.09299e-07,-1.98234e-08,0.983946,0.000438601,-1.68769e-07,1.96916e-08,0.984384,0.000438322,-1.09694e-07,6.6157e-10,0.984823,0.000438105,-1.0771e-07,-2.23379e-08,0.985261,0.000437823,-1.74723e-07,2.90855e-08,0.985698,0.00043756,-8.74669e-08,-3.43992e-08,0.986136,0.000437282,-1.90665e-07,4.89068e-08,0.986573,0.000437048,-4.39442e-08,-4.20188e-08,0.98701,0.000436834,-1.7e-07,-4.11073e-11,0.987446,0.000436494,-1.70124e-07,4.21832e-08,0.987883,0.00043628,-4.35742e-08,-4.94824e-08,0.988319,0.000436044,-1.92021e-07,3.6537e-08,0.988755,0.00043577,-8.24102e-08,-3.70611e-08,0.989191,0.000435494,-1.93593e-07,5.21026e-08,0.989626,0.000435263,-3.72855e-08,-5.21402e-08,0.990061,0.000435032,-1.93706e-07,3.7249e-08,0.990496,0.000434756,-8.19592e-08,-3.72512e-08,0.990931,0.000434481,-1.93713e-07,5
.21511e-08,0.991365,0.00043425,-3.72595e-08,-5.21439e-08,0.991799,0.000434019,-1.93691e-07,3.72152e-08,0.992233,0.000433743,-8.20456e-08,-3.71123e-08,0.992667,0.000433468,-1.93382e-07,5.16292e-08,0.9931,0.000433236,-3.84947e-08,-5.01953e-08,0.993533,0.000433008,-1.89081e-07,2.99427e-08,0.993966,0.00043272,-9.92525e-08,-9.9708e-09,0.994399,0.000432491,-1.29165e-07,9.94051e-09,0.994831,0.000432263,-9.93434e-08,-2.97912e-08,0.995263,0.000431975,-1.88717e-07,4.96198e-08,0.995695,0.000431746,-3.98578e-08,-4.94785e-08,0.996127,0.000431518,-1.88293e-07,2.9085e-08,0.996558,0.000431229,-1.01038e-07,-7.25675e-09,0.996989,0.000431005,-1.22809e-07,-5.79945e-11,0.99742,0.000430759,-1.22983e-07,7.48873e-09,0.997851,0.000430536,-1.00516e-07,-2.98969e-08,0.998281,0.000430245,-1.90207e-07,5.24942e-08,0.998711,0.000430022,-3.27246e-08,-6.08706e-08,0.999141,0.000429774,-2.15336e-07,7.17788e-08,0.999571,0.000429392,0.,0.};
+
+        template <bool srgb, int blueIdx, typename T, typename D>
+        __device__ __forceinline__ void Lab2RGBConvert_f(const T& src, D& dst)
+        { // Float path: reads src.x/.y/.z (presumably L, a, b, going by the function name) and writes B, G, R into dst.
+            const float lThresh = 0.008856f * 903.3f;
+            const float fThresh = 7.787f * 0.008856f + 16.0f / 116.0f;
+            // lThresh picks the branch for src.x just below; fThresh picks the branch for X and Z further down.
+            float Y, fy;
+            // Derive Y and the intermediate fy from src.x: linear form at or below lThresh, cubic form above it.
+            if (src.x <= lThresh)
+            {
+                Y = src.x / 903.3f;
+                fy = 7.787f * Y + 16.0f / 116.0f;
+            }
+            else
+            {
+                fy = (src.x + 16.0f) / 116.0f;
+                Y = fy * fy * fy;
+            }
+            // X and Z start out as offsets from fy: src.y / 500 is added, src.z / 200 is subtracted.
+            float X = src.y / 500.0f + fy;
+            float Z = fy - src.z / 200.0f;
+            // X and Z are then converted in place: (v - 16/116) / 7.787 at or below fThresh, v * v * v above it.
+            if (X <= fThresh)
+                X = (X - 16.0f / 116.0f) / 7.787f;
+            else
+                X = X * X * X;
+
+            if (Z <= fThresh)
+                Z = (Z - 16.0f / 116.0f) / 7.787f;
+            else
+                Z = Z * Z * Z;
+            // Fixed 3x3 linear combination taking (X, Y, Z) to the B, G and R channel values.
+            float B = 0.052891f * X - 0.204043f * Y + 1.151152f * Z;
+            float G = -0.921235f * X + 1.875991f * Y + 0.045244f * Z;
+            float R = 3.079933f * X - 1.537150f * Y - 0.542782f * Z;
+            // With srgb set, each channel is scaled by GAMMA_TAB_SIZE and looked up via splineInterpolate in c_sRGBInvGammaTab.
+            if (srgb)
+            {
+                B = splineInterpolate(B * GAMMA_TAB_SIZE, c_sRGBInvGammaTab, GAMMA_TAB_SIZE);
+                G = splineInterpolate(G * GAMMA_TAB_SIZE, c_sRGBInvGammaTab, GAMMA_TAB_SIZE);
+                R = splineInterpolate(R * GAMMA_TAB_SIZE, c_sRGBInvGammaTab, GAMMA_TAB_SIZE);
+            }
+            // blueIdx == 0 stores B in dst.x and R in dst.z; any other value swaps the two. G always goes to dst.y.
+            dst.x = blueIdx == 0 ? B : R;
+            dst.y = G;
+            dst.z = blueIdx == 0 ? R : B;
+            setAlpha(dst, ColorChannel<float>::max()); // setAlpha is defined elsewhere; presumably a no-op for 3-channel dst - TODO confirm
+        }
+
+        template <bool srgb, int blueIdx, typename T, typename D>
+        __device__ __forceinline__ void Lab2RGBConvert_b(const T& src, D& dst)
+        { // 8-bit path: rescales src into a float3, calls Lab2RGBConvert_f, then converts the result back to uchar.
+            float3 srcf, dstf;
+            // src.x is scaled by 100/255; src.y and src.z are shifted down by 128 before the float conversion.
+            srcf.x = src.x * (100.f / 255.f);
+            srcf.y = src.y - 128;
+            srcf.z = src.z - 128;
+
+            Lab2RGBConvert_f<srgb, blueIdx>(srcf, dstf);
+            // Each float channel is multiplied by 255 and narrowed with saturate_cast<uchar>.
+            dst.x = saturate_cast<uchar>(dstf.x * 255.f);
+            dst.y = saturate_cast<uchar>(dstf.y * 255.f);
+            dst.z = saturate_cast<uchar>(dstf.z * 255.f);
+            setAlpha(dst, ColorChannel<uchar>::max()); // setAlpha is defined elsewhere in the file
+        }
+
+        template <typename T, int scn, int dcn, bool srgb, int blueIdx> struct Lab2RGB; // declaration only; partial specializations for uchar and float follow
+        template <int scn, int dcn, bool srgb, int blueIdx>
+        struct Lab2RGB<uchar, scn, dcn, srgb, blueIdx>
+            : unary_function<typename TypeVec<uchar, scn>::vec_type, typename TypeVec<uchar, dcn>::vec_type>
+        { // uchar functor: maps a TypeVec<uchar, scn> vector to a TypeVec<uchar, dcn> vector.
+            __device__ __forceinline__ typename TypeVec<uchar, dcn>::vec_type operator ()(const typename TypeVec<uchar, scn>::vec_type& src) const
+            {
+                typename TypeVec<uchar, dcn>::vec_type dst;
+                // The conversion itself is delegated to Lab2RGBConvert_b with this functor's srgb and blueIdx.
+                Lab2RGBConvert_b<srgb, blueIdx>(src, dst);
+
+                return dst;
+            }
+            __host__ __device__ __forceinline__ Lab2RGB() {} // empty-bodied default constructor, host and device
+            __host__ __device__ __forceinline__ Lab2RGB(const Lab2RGB&) {} // empty-bodied copy constructor, host and device
+        };
+        template <int scn, int dcn, bool srgb, int blueIdx>
+        struct Lab2RGB<float, scn, dcn, srgb, blueIdx>
+            : unary_function<typename TypeVec<float, scn>::vec_type, typename TypeVec<float, dcn>::vec_type>
+        { // float functor: maps a TypeVec<float, scn> vector to a TypeVec<float, dcn> vector.
+            __device__ __forceinline__ typename TypeVec<float, dcn>::vec_type operator ()(const typename TypeVec<float, scn>::vec_type& src) const
+            {
+                typename TypeVec<float, dcn>::vec_type dst;
+                // The conversion itself is delegated to Lab2RGBConvert_f with this functor's srgb and blueIdx.
+                Lab2RGBConvert_f<srgb, blueIdx>(src, dst);
+
+                return dst;
+            }
+            __host__ __device__ __forceinline__ Lab2RGB() {} // empty-bodied default constructor, host and device
+            __host__ __device__ __forceinline__ Lab2RGB(const Lab2RGB&) {} // empty-bodied copy constructor, host and device
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(name, scn, dcn, srgb, blueIdx) /* expands to a struct template named <name>_traits<T> */ \
+    template <typename T> struct name ## _traits \
+    { /* functor_type binds Lab2RGB to the macro's scn, dcn, srgb and blueIdx arguments */ \
+        typedef ::cv::cuda::device::color_detail::Lab2RGB<T, scn, dcn, srgb, blueIdx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { /* returns a default-constructed functor */ \
+            return functor_type(); \
+        } \
+    };
+
+///////////////////////////////////// RGB <-> Luv /////////////////////////////////////
+
+    namespace color_detail
+    {
+        __constant__ float c_LabCbrtTab[] = {0.137931,0.0114066,0.,1.18859e-07,0.149338,0.011407,3.56578e-07,-5.79396e-07,0.160745,0.0114059,-1.38161e-06,2.16892e-06,0.172151,0.0114097,5.12516e-06,-8.0814e-06,0.183558,0.0113957,-1.9119e-05,3.01567e-05,0.194965,0.0114479,7.13509e-05,-0.000112545,0.206371,0.011253,-0.000266285,-0.000106493,0.217252,0.0104009,-0.000585765,7.32149e-05,0.22714,0.00944906,-0.00036612,1.21917e-05,0.236235,0.0087534,-0.000329545,2.01753e-05,0.244679,0.00815483,-0.000269019,1.24435e-05,0.252577,0.00765412,-0.000231689,1.05618e-05,0.26001,0.00722243,-0.000200003,8.26662e-06,0.267041,0.00684723,-0.000175203,6.76746e-06,0.27372,0.00651712,-0.000154901,5.61192e-06,0.280088,0.00622416,-0.000138065,4.67009e-06,0.286179,0.00596204,-0.000124055,3.99012e-06,0.292021,0.0057259,-0.000112085,3.36032e-06,0.297638,0.00551181,-0.000102004,2.95338e-06,0.30305,0.00531666,-9.31435e-05,2.52875e-06,0.308277,0.00513796,-8.55572e-05,2.22022e-06,0.313331,0.00497351,-7.88966e-05,1.97163e-06,0.318228,0.00482163,-7.29817e-05,1.7248e-06,0.322978,0.00468084,-6.78073e-05,1.55998e-06,0.327593,0.0045499,-6.31274e-05,1.36343e-06,0.332081,0.00442774,-5.90371e-05,1.27136e-06,0.336451,0.00431348,-5.5223e-05,1.09111e-06,0.34071,0.00420631,-5.19496e-05,1.0399e-06,0.344866,0.00410553,-4.88299e-05,9.18347e-07,0.348923,0.00401062,-4.60749e-05,8.29942e-07,0.352889,0.00392096,-4.35851e-05,7.98478e-07,0.356767,0.00383619,-4.11896e-05,6.84917e-07,0.360562,0.00375586,-3.91349e-05,6.63976e-07,0.36428,0.00367959,-3.7143e-05,5.93086e-07,0.367923,0.00360708,-3.53637e-05,5.6976e-07,0.371495,0.00353806,-3.36544e-05,4.95533e-07,0.375,0.00347224,-3.21678e-05,4.87951e-07,0.378441,0.00340937,-3.0704e-05,4.4349e-07,0.38182,0.00334929,-2.93735e-05,4.20297e-07,0.38514,0.0032918,-2.81126e-05,3.7872e-07,0.388404,0.00323671,-2.69764e-05,3.596e-07,0.391614,0.00318384,-2.58976e-05,3.5845e-07,0.394772,0.00313312,-2.48223e-05,2.92765e-07,0.397881,0.00308435,-2.3944e-05,3.18232e-07,0.400942,0.00303742,-2.
29893e-05,2.82046e-07,0.403957,0.00299229,-2.21432e-05,2.52315e-07,0.406927,0.00294876,-2.13862e-05,2.58416e-07,0.409855,0.00290676,-2.0611e-05,2.33939e-07,0.412741,0.00286624,-1.99092e-05,2.36342e-07,0.415587,0.00282713,-1.92001e-05,1.916e-07,0.418396,0.00278931,-1.86253e-05,2.1915e-07,0.421167,0.00275271,-1.79679e-05,1.83498e-07,0.423901,0.00271733,-1.74174e-05,1.79343e-07,0.426602,0.00268303,-1.68794e-05,1.72013e-07,0.429268,0.00264979,-1.63633e-05,1.75686e-07,0.431901,0.00261759,-1.58363e-05,1.3852e-07,0.434503,0.00258633,-1.54207e-05,1.64304e-07,0.437074,0.00255598,-1.49278e-05,1.28136e-07,0.439616,0.00252651,-1.45434e-05,1.57618e-07,0.442128,0.0024979,-1.40705e-05,1.0566e-07,0.444612,0.00247007,-1.37535e-05,1.34998e-07,0.447068,0.00244297,-1.33485e-05,1.29207e-07,0.449498,0.00241666,-1.29609e-05,9.32347e-08,0.451902,0.00239102,-1.26812e-05,1.23703e-07,0.45428,0.00236603,-1.23101e-05,9.74072e-08,0.456634,0.0023417,-1.20179e-05,1.12518e-07,0.458964,0.002318,-1.16803e-05,7.83681e-08,0.46127,0.00229488,-1.14452e-05,1.10452e-07,0.463554,0.00227232,-1.11139e-05,7.58719e-08,0.465815,0.00225032,-1.08863e-05,9.2699e-08,0.468055,0.00222882,-1.06082e-05,8.97738e-08,0.470273,0.00220788,-1.03388e-05,5.4845e-08,0.47247,0.00218736,-1.01743e-05,1.0808e-07,0.474648,0.00216734,-9.85007e-06,4.9277e-08,0.476805,0.00214779,-9.70224e-06,8.22408e-08,0.478943,0.00212863,-9.45551e-06,6.87942e-08,0.481063,0.00210993,-9.24913e-06,5.98144e-08,0.483163,0.00209161,-9.06969e-06,7.93789e-08,0.485246,0.00207371,-8.83155e-06,3.99032e-08,0.487311,0.00205616,-8.71184e-06,8.88325e-08,0.489358,0.002039,-8.44534e-06,2.20004e-08,0.491389,0.00202218,-8.37934e-06,9.13872e-08,0.493403,0.0020057,-8.10518e-06,2.96829e-08,0.495401,0.00198957,-8.01613e-06,5.81028e-08,0.497382,0.00197372,-7.84183e-06,6.5731e-08,0.499348,0.00195823,-7.64463e-06,3.66019e-08,0.501299,0.00194305,-7.53483e-06,2.62811e-08,0.503234,0.00192806,-7.45598e-06,9.66907e-08,0.505155,0.00191344,-7.16591e-06,4.18928e-09,0.507061,0.00189912
,-7.15334e-06,6.53665e-08,0.508953,0.00188501,-6.95724e-06,3.23686e-08,0.510831,0.00187119,-6.86014e-06,4.35774e-08,0.512696,0.0018576,-6.72941e-06,3.17406e-08,0.514547,0.00184424,-6.63418e-06,6.78785e-08,0.516384,0.00183117,-6.43055e-06,-5.23126e-09,0.518209,0.0018183,-6.44624e-06,7.22562e-08,0.520021,0.00180562,-6.22947e-06,1.42292e-08,0.52182,0.0017932,-6.18679e-06,4.9641e-08,0.523607,0.00178098,-6.03786e-06,2.56259e-08,0.525382,0.00176898,-5.96099e-06,2.66696e-08,0.527145,0.00175714,-5.88098e-06,4.65094e-08,0.528897,0.00174552,-5.74145e-06,2.57114e-08,0.530637,0.00173411,-5.66431e-06,2.94588e-08,0.532365,0.00172287,-5.57594e-06,3.52667e-08,0.534082,0.00171182,-5.47014e-06,8.28868e-09,0.535789,0.00170091,-5.44527e-06,5.07871e-08,0.537484,0.00169017,-5.29291e-06,2.69817e-08,0.539169,0.00167967,-5.21197e-06,2.01009e-08,0.540844,0.0016693,-5.15166e-06,1.18237e-08,0.542508,0.00165903,-5.11619e-06,5.18135e-08,0.544162,0.00164896,-4.96075e-06,1.9341e-08,0.545806,0.00163909,-4.90273e-06,-9.96867e-09,0.54744,0.00162926,-4.93263e-06,8.01382e-08,0.549064,0.00161963,-4.69222e-06,-1.25601e-08,0.550679,0.00161021,-4.7299e-06,2.97067e-08,0.552285,0.00160084,-4.64078e-06,1.29426e-08,0.553881,0.0015916,-4.60195e-06,3.77327e-08,0.555468,0.00158251,-4.48875e-06,1.49412e-08,0.557046,0.00157357,-4.44393e-06,2.17118e-08,0.558615,0.00156475,-4.3788e-06,1.74206e-08,0.560176,0.00155605,-4.32653e-06,2.78152e-08,0.561727,0.00154748,-4.24309e-06,-9.47239e-09,0.563271,0.00153896,-4.27151e-06,6.9679e-08,0.564805,0.00153063,-4.06247e-06,-3.08246e-08,0.566332,0.00152241,-4.15494e-06,5.36188e-08,0.56785,0.00151426,-3.99409e-06,-4.83594e-09,0.56936,0.00150626,-4.00859e-06,2.53293e-08,0.570863,0.00149832,-3.93261e-06,2.27286e-08,0.572357,0.00149052,-3.86442e-06,2.96541e-09,0.573844,0.0014828,-3.85552e-06,2.50147e-08,0.575323,0.00147516,-3.78048e-06,1.61842e-08,0.576794,0.00146765,-3.73193e-06,2.94582e-08,0.578258,0.00146028,-3.64355e-06,-1.48076e-08,0.579715,0.00145295,-3.68798e-06,2.97724e-08,0.
581164,0.00144566,-3.59866e-06,1.49272e-08,0.582606,0.00143851,-3.55388e-06,2.97285e-08,0.584041,0.00143149,-3.46469e-06,-1.46323e-08,0.585469,0.00142451,-3.50859e-06,2.88004e-08,0.58689,0.00141758,-3.42219e-06,1.864e-08,0.588304,0.00141079,-3.36627e-06,1.58482e-08,0.589712,0.00140411,-3.31872e-06,-2.24279e-08,0.591112,0.00139741,-3.38601e-06,7.38639e-08,0.592507,0.00139085,-3.16441e-06,-3.46088e-08,0.593894,0.00138442,-3.26824e-06,4.96675e-09,0.595275,0.0013779,-3.25334e-06,7.4346e-08,0.59665,0.00137162,-3.0303e-06,-6.39319e-08,0.598019,0.00136536,-3.2221e-06,6.21725e-08,0.599381,0.00135911,-3.03558e-06,-5.94423e-09,0.600737,0.00135302,-3.05341e-06,2.12091e-08,0.602087,0.00134697,-2.98979e-06,-1.92876e-08,0.603431,0.00134094,-3.04765e-06,5.5941e-08,0.604769,0.00133501,-2.87983e-06,-2.56622e-08,0.606101,0.00132917,-2.95681e-06,4.67078e-08,0.607427,0.0013234,-2.81669e-06,-4.19592e-08,0.608748,0.00131764,-2.94257e-06,6.15243e-08,0.610062,0.00131194,-2.75799e-06,-2.53244e-08,0.611372,0.00130635,-2.83397e-06,3.97739e-08,0.612675,0.0013008,-2.71465e-06,-1.45618e-08,0.613973,0.00129533,-2.75833e-06,1.84733e-08,0.615266,0.00128986,-2.70291e-06,2.73606e-10,0.616553,0.00128446,-2.70209e-06,4.00367e-08,0.617835,0.00127918,-2.58198e-06,-4.12113e-08,0.619111,0.00127389,-2.70561e-06,6.52039e-08,0.620383,0.00126867,-2.51e-06,-4.07901e-08,0.621649,0.00126353,-2.63237e-06,3.83516e-08,0.62291,0.00125838,-2.51732e-06,6.59315e-09,0.624166,0.00125337,-2.49754e-06,-5.11939e-09,0.625416,0.00124836,-2.5129e-06,1.38846e-08,0.626662,0.00124337,-2.47124e-06,9.18514e-09,0.627903,0.00123846,-2.44369e-06,8.97952e-09,0.629139,0.0012336,-2.41675e-06,1.45012e-08,0.63037,0.00122881,-2.37325e-06,-7.37949e-09,0.631597,0.00122404,-2.39538e-06,1.50169e-08,0.632818,0.00121929,-2.35033e-06,6.91648e-09,0.634035,0.00121461,-2.32958e-06,1.69219e-08,0.635248,0.00121,-2.27882e-06,-1.49997e-08,0.636455,0.0012054,-2.32382e-06,4.30769e-08,0.637659,0.00120088,-2.19459e-06,-3.80986e-08,0.638857,0.00119638,-2.30888
e-06,4.97134e-08,0.640051,0.00119191,-2.15974e-06,-4.15463e-08,0.641241,0.00118747,-2.28438e-06,5.68667e-08,0.642426,0.00118307,-2.11378e-06,-7.10641e-09,0.643607,0.00117882,-2.1351e-06,-2.8441e-08,0.644784,0.00117446,-2.22042e-06,6.12658e-08,0.645956,0.00117021,-2.03663e-06,-3.78083e-08,0.647124,0.00116602,-2.15005e-06,3.03627e-08,0.648288,0.00116181,-2.05896e-06,-2.40379e-08,0.649448,0.00115762,-2.13108e-06,6.57887e-08,0.650603,0.00115356,-1.93371e-06,-6.03028e-08,0.651755,0.00114951,-2.11462e-06,5.62134e-08,0.652902,0.00114545,-1.94598e-06,-4.53417e-08,0.654046,0.00114142,-2.082e-06,6.55489e-08,0.655185,0.00113745,-1.88536e-06,-3.80396e-08,0.656321,0.00113357,-1.99948e-06,2.70049e-08,0.657452,0.00112965,-1.91846e-06,-1.03755e-08,0.65858,0.00112578,-1.94959e-06,1.44973e-08,0.659704,0.00112192,-1.9061e-06,1.1991e-08,0.660824,0.00111815,-1.87012e-06,-2.85634e-09,0.66194,0.0011144,-1.87869e-06,-5.65782e-10,0.663053,0.00111064,-1.88039e-06,5.11947e-09,0.664162,0.0011069,-1.86503e-06,3.96924e-08,0.665267,0.00110328,-1.74595e-06,-4.46795e-08,0.666368,0.00109966,-1.87999e-06,1.98161e-08,0.667466,0.00109596,-1.82054e-06,2.502e-08,0.66856,0.00109239,-1.74548e-06,-6.86593e-10,0.669651,0.0010889,-1.74754e-06,-2.22739e-08,0.670738,0.00108534,-1.81437e-06,3.01776e-08,0.671821,0.0010818,-1.72383e-06,2.07732e-08,0.672902,0.00107841,-1.66151e-06,-5.36658e-08,0.673978,0.00107493,-1.82251e-06,7.46802e-08,0.675051,0.00107151,-1.59847e-06,-6.62411e-08,0.676121,0.00106811,-1.79719e-06,7.10748e-08,0.677188,0.00106473,-1.58397e-06,-3.92441e-08,0.678251,0.00106145,-1.7017e-06,2.62973e-08,0.679311,0.00105812,-1.62281e-06,-6.34035e-09,0.680367,0.00105486,-1.64183e-06,-9.36249e-10,0.68142,0.00105157,-1.64464e-06,1.00854e-08,0.68247,0.00104831,-1.61438e-06,2.01995e-08,0.683517,0.00104514,-1.55378e-06,-3.1279e-08,0.68456,0.00104194,-1.64762e-06,4.53114e-08,0.685601,0.00103878,-1.51169e-06,-3.07573e-08,0.686638,0.00103567,-1.60396e-06,1.81133e-08,0.687672,0.00103251,-1.54962e-06,1.79085e-08,0.
688703,0.00102947,-1.49589e-06,-3.01428e-08,0.689731,0.00102639,-1.58632e-06,4.30583e-08,0.690756,0.00102334,-1.45715e-06,-2.28814e-08,0.691778,0.00102036,-1.52579e-06,-1.11373e-08,0.692797,0.00101727,-1.5592e-06,6.74305e-08,0.693812,0.00101436,-1.35691e-06,-7.97709e-08,0.694825,0.0010114,-1.59622e-06,7.28391e-08,0.695835,0.00100843,-1.37771e-06,-3.27715e-08,0.696842,0.00100558,-1.47602e-06,-1.35807e-09,0.697846,0.00100262,-1.48009e-06,3.82037e-08,0.698847,0.000999775,-1.36548e-06,-3.22474e-08,0.699846,0.000996948,-1.46223e-06,3.11809e-08,0.700841,0.000994117,-1.36868e-06,-3.28714e-08,0.701834,0.000991281,-1.4673e-06,4.07001e-08,0.702824,0.000988468,-1.3452e-06,-1.07197e-08,0.703811,0.000985746,-1.37736e-06,2.17866e-09,0.704795,0.000982998,-1.37082e-06,2.00521e-09,0.705777,0.000980262,-1.3648e-06,-1.01996e-08,0.706756,0.000977502,-1.3954e-06,3.87931e-08,0.707732,0.000974827,-1.27902e-06,-2.57632e-08,0.708706,0.000972192,-1.35631e-06,4.65513e-09,0.709676,0.000969493,-1.34235e-06,7.14257e-09,0.710645,0.00096683,-1.32092e-06,2.63791e-08,0.71161,0.000964267,-1.24178e-06,-5.30543e-08,0.712573,0.000961625,-1.40095e-06,6.66289e-08,0.713533,0.000959023,-1.20106e-06,-3.46474e-08,0.714491,0.000956517,-1.305e-06,1.23559e-08,0.715446,0.000953944,-1.26793e-06,-1.47763e-08,0.716399,0.000951364,-1.31226e-06,4.67494e-08,0.717349,0.000948879,-1.17201e-06,-5.3012e-08,0.718297,0.000946376,-1.33105e-06,4.60894e-08,0.719242,0.000943852,-1.19278e-06,-1.21366e-08,0.720185,0.00094143,-1.22919e-06,2.45673e-09,0.721125,0.000938979,-1.22182e-06,2.30966e-09,0.722063,0.000936543,-1.21489e-06,-1.16954e-08,0.722998,0.000934078,-1.24998e-06,4.44718e-08,0.723931,0.000931711,-1.11656e-06,-4.69823e-08,0.724861,0.000929337,-1.25751e-06,2.4248e-08,0.725789,0.000926895,-1.18477e-06,9.5949e-09,0.726715,0.000924554,-1.15598e-06,-3.02286e-09,0.727638,0.000922233,-1.16505e-06,2.49649e-09,0.72856,0.00091991,-1.15756e-06,-6.96321e-09,0.729478,0.000917575,-1.17845e-06,2.53564e-08,0.730395,0.000915294,-1.10238e
-06,-3.48578e-08,0.731309,0.000912984,-1.20695e-06,5.44704e-08,0.732221,0.000910734,-1.04354e-06,-6.38144e-08,0.73313,0.000908455,-1.23499e-06,8.15781e-08,0.734038,0.00090623,-9.90253e-07,-8.3684e-08,0.734943,0.000903999,-1.2413e-06,7.43441e-08,0.735846,0.000901739,-1.01827e-06,-3.48787e-08,0.736746,0.000899598,-1.12291e-06,5.56596e-09,0.737645,0.000897369,-1.10621e-06,1.26148e-08,0.738541,0.000895194,-1.06837e-06,3.57935e-09,0.739435,0.000893068,-1.05763e-06,-2.69322e-08,0.740327,0.000890872,-1.13842e-06,4.45448e-08,0.741217,0.000888729,-1.00479e-06,-3.20376e-08,0.742105,0.000886623,-1.1009e-06,2.40011e-08,0.74299,0.000884493,-1.0289e-06,-4.36209e-09,0.743874,0.000882422,-1.04199e-06,-6.55268e-09,0.744755,0.000880319,-1.06164e-06,3.05728e-08,0.745634,0.000878287,-9.69926e-07,-5.61338e-08,0.746512,0.000876179,-1.13833e-06,7.4753e-08,0.747387,0.000874127,-9.14068e-07,-6.40644e-08,0.74826,0.000872106,-1.10626e-06,6.22955e-08,0.749131,0.000870081,-9.19375e-07,-6.59083e-08,0.75,0.000868044,-1.1171e-06,8.21284e-08,0.750867,0.000866056,-8.70714e-07,-8.37915e-08,0.751732,0.000864064,-1.12209e-06,7.42237e-08,0.752595,0.000862042,-8.99418e-07,-3.42894e-08,0.753456,0.00086014,-1.00229e-06,3.32955e-09,0.754315,0.000858146,-9.92297e-07,2.09712e-08,0.755173,0.000856224,-9.29384e-07,-2.76096e-08,0.756028,0.000854282,-1.01221e-06,2.98627e-08,0.756881,0.000852348,-9.22625e-07,-3.22365e-08,0.757733,0.000850406,-1.01933e-06,3.94786e-08,0.758582,0.000848485,-9.00898e-07,-6.46833e-09,0.75943,0.000846664,-9.20303e-07,-1.36052e-08,0.760275,0.000844783,-9.61119e-07,1.28447e-09,0.761119,0.000842864,-9.57266e-07,8.4674e-09,0.761961,0.000840975,-9.31864e-07,2.44506e-08,0.762801,0.000839185,-8.58512e-07,-4.6665e-08,0.763639,0.000837328,-9.98507e-07,4.30001e-08,0.764476,0.00083546,-8.69507e-07,-6.12609e-09,0.76531,0.000833703,-8.87885e-07,-1.84959e-08,0.766143,0.000831871,-9.43372e-07,2.05052e-08,0.766974,0.000830046,-8.81857e-07,-3.92026e-09,0.767803,0.000828271,-8.93618e-07,-4.82426e-09,0.76
8631,0.000826469,-9.0809e-07,2.32172e-08,0.769456,0.000824722,-8.38439e-07,-2.84401e-08,0.77028,0.00082296,-9.23759e-07,3.09386e-08,0.771102,0.000821205,-8.30943e-07,-3.57099e-08,0.771922,0.000819436,-9.38073e-07,5.22963e-08,0.772741,0.000817717,-7.81184e-07,-5.42658e-08,0.773558,0.000815992,-9.43981e-07,4.55579e-08,0.774373,0.000814241,-8.07308e-07,-8.75656e-09,0.775186,0.0008126,-8.33578e-07,-1.05315e-08,0.775998,0.000810901,-8.65172e-07,-8.72188e-09,0.776808,0.000809145,-8.91338e-07,4.54191e-08,0.777616,0.000807498,-7.5508e-07,-5.37454e-08,0.778423,0.000805827,-9.16317e-07,5.03532e-08,0.779228,0.000804145,-7.65257e-07,-2.84584e-08,0.780031,0.000802529,-8.50632e-07,3.87579e-09,0.780833,0.00080084,-8.39005e-07,1.29552e-08,0.781633,0.0007992,-8.00139e-07,3.90804e-09,0.782432,0.000797612,-7.88415e-07,-2.85874e-08,0.783228,0.000795949,-8.74177e-07,5.0837e-08,0.784023,0.000794353,-7.21666e-07,-5.55513e-08,0.784817,0.000792743,-8.8832e-07,5.21587e-08,0.785609,0.000791123,-7.31844e-07,-3.38744e-08,0.786399,0.000789558,-8.33467e-07,2.37342e-08,0.787188,0.000787962,-7.62264e-07,-1.45775e-09,0.787975,0.000786433,-7.66638e-07,-1.79034e-08,0.788761,0.000784846,-8.20348e-07,1.34665e-08,0.789545,0.000783246,-7.79948e-07,2.3642e-08,0.790327,0.000781757,-7.09022e-07,-4.84297e-08,0.791108,0.000780194,-8.54311e-07,5.08674e-08,0.791888,0.000778638,-7.01709e-07,-3.58303e-08,0.792666,0.000777127,-8.092e-07,3.28493e-08,0.793442,0.000775607,-7.10652e-07,-3.59624e-08,0.794217,0.000774078,-8.1854e-07,5.13959e-08,0.79499,0.000772595,-6.64352e-07,-5.04121e-08,0.795762,0.000771115,-8.15588e-07,3.10431e-08,0.796532,0.000769577,-7.22459e-07,-1.41557e-08,0.797301,0.00076809,-7.64926e-07,2.55795e-08,0.798069,0.000766636,-6.88187e-07,-2.85578e-08,0.798835,0.000765174,-7.73861e-07,2.90472e-08,0.799599,0.000763714,-6.86719e-07,-2.80262e-08,0.800362,0.000762256,-7.70798e-07,2.34531e-08,0.801123,0.000760785,-7.00438e-07,-6.18144e-09,0.801884,0.000759366,-7.18983e-07,1.27263e-09,0.802642,0.000757931,-
7.15165e-07,1.09101e-09,0.803399,0.000756504,-7.11892e-07,-5.63675e-09,0.804155,0.000755064,-7.28802e-07,2.14559e-08,0.80491,0.00075367,-6.64434e-07,-2.05821e-08,0.805663,0.00075228,-7.26181e-07,1.26812e-09,0.806414,0.000750831,-7.22377e-07,1.55097e-08,0.807164,0.000749433,-6.75848e-07,-3.70216e-09,0.807913,0.00074807,-6.86954e-07,-7.0105e-10,0.80866,0.000746694,-6.89057e-07,6.5063e-09,0.809406,0.000745336,-6.69538e-07,-2.53242e-08,0.810151,0.000743921,-7.45511e-07,3.51858e-08,0.810894,0.000742535,-6.39953e-07,3.79034e-09,0.811636,0.000741267,-6.28582e-07,-5.03471e-08,0.812377,0.000739858,-7.79624e-07,7.83886e-08,0.813116,0.000738534,-5.44458e-07,-8.43935e-08,0.813854,0.000737192,-7.97638e-07,8.03714e-08,0.81459,0.000735838,-5.56524e-07,-5.82784e-08,0.815325,0.00073455,-7.31359e-07,3.35329e-08,0.816059,0.000733188,-6.3076e-07,-1.62486e-08,0.816792,0.000731878,-6.79506e-07,3.14614e-08,0.817523,0.000730613,-5.85122e-07,-4.99925e-08,0.818253,0.000729293,-7.35099e-07,4.92994e-08,0.818982,0.000727971,-5.87201e-07,-2.79959e-08,0.819709,0.000726712,-6.71189e-07,3.07959e-09,0.820435,0.000725379,-6.6195e-07,1.56777e-08,0.82116,0.000724102,-6.14917e-07,-6.18564e-09,0.821883,0.000722854,-6.33474e-07,9.06488e-09,0.822606,0.000721614,-6.06279e-07,-3.00739e-08,0.823327,0.000720311,-6.96501e-07,5.16262e-08,0.824046,0.000719073,-5.41623e-07,-5.72214e-08,0.824765,0.000717818,-7.13287e-07,5.80503e-08,0.825482,0.000716566,-5.39136e-07,-5.57703e-08,0.826198,0.00071532,-7.06447e-07,4.58215e-08,0.826912,0.000714045,-5.68983e-07,-8.30636e-09,0.827626,0.000712882,-5.93902e-07,-1.25961e-08,0.828338,0.000711656,-6.3169e-07,-9.13985e-10,0.829049,0.00071039,-6.34432e-07,1.62519e-08,0.829759,0.00070917,-5.85676e-07,-4.48904e-09,0.830468,0.000707985,-5.99143e-07,1.70418e-09,0.831175,0.000706792,-5.9403e-07,-2.32768e-09,0.831881,0.000705597,-6.01014e-07,7.60648e-09,0.832586,0.000704418,-5.78194e-07,-2.80982e-08,0.83329,0.000703177,-6.62489e-07,4.51817e-08,0.833993,0.000701988,-5.26944e-07,-3.3419
2e-08,0.834694,0.000700834,-6.27201e-07,2.88904e-08,0.835394,0.000699666,-5.4053e-07,-2.25378e-08,0.836093,0.000698517,-6.08143e-07,1.65589e-09,0.836791,0.000697306,-6.03176e-07,1.59142e-08,0.837488,0.000696147,-5.55433e-07,-5.70801e-09,0.838184,0.000695019,-5.72557e-07,6.91792e-09,0.838878,0.000693895,-5.51803e-07,-2.19637e-08,0.839571,0.000692725,-6.17694e-07,2.13321e-08,0.840263,0.000691554,-5.53698e-07,-3.75996e-09,0.840954,0.000690435,-5.64978e-07,-6.29219e-09,0.841644,0.000689287,-5.83855e-07,2.89287e-08,0.842333,0.000688206,-4.97068e-07,-4.98181e-08,0.843021,0.000687062,-6.46523e-07,5.11344e-08,0.843707,0.000685922,-4.9312e-07,-3.55102e-08,0.844393,0.00068483,-5.9965e-07,3.13019e-08,0.845077,0.000683724,-5.05745e-07,-3.00925e-08,0.84576,0.000682622,-5.96022e-07,2.94636e-08,0.846442,0.000681519,-5.07631e-07,-2.81572e-08,0.847123,0.000680419,-5.92103e-07,2.35606e-08,0.847803,0.000679306,-5.21421e-07,-6.48045e-09,0.848482,0.000678243,-5.40863e-07,2.36124e-09,0.849159,0.000677169,-5.33779e-07,-2.96461e-09,0.849836,0.000676092,-5.42673e-07,9.49728e-09,0.850512,0.000675035,-5.14181e-07,-3.50245e-08,0.851186,0.000673902,-6.19254e-07,7.09959e-08,0.851859,0.000672876,-4.06267e-07,-7.01453e-08,0.852532,0.000671853,-6.16703e-07,3.07714e-08,0.853203,0.000670712,-5.24388e-07,6.66423e-09,0.853873,0.000669684,-5.04396e-07,2.17629e-09,0.854542,0.000668681,-4.97867e-07,-1.53693e-08,0.855211,0.000667639,-5.43975e-07,-3.03752e-10,0.855878,0.000666551,-5.44886e-07,1.65844e-08,0.856544,0.000665511,-4.95133e-07,-6.42907e-09,0.857209,0.000664501,-5.1442e-07,9.13195e-09,0.857873,0.0006635,-4.87024e-07,-3.00987e-08,0.858536,0.000662435,-5.7732e-07,5.16584e-08,0.859198,0.000661436,-4.22345e-07,-5.73255e-08,0.859859,0.000660419,-5.94322e-07,5.84343e-08,0.860518,0.000659406,-4.19019e-07,-5.72022e-08,0.861177,0.000658396,-5.90626e-07,5.11653e-08,0.861835,0.000657368,-4.3713e-07,-2.82495e-08,0.862492,0.000656409,-5.21878e-07,2.22788e-09,0.863148,0.000655372,-5.15195e-07,1.9338e-08,0.86380
3,0.0006544,-4.5718e-07,-1.99754e-08,0.864457,0.000653425,-5.17107e-07,9.59024e-10,0.86511,0.000652394,-5.1423e-07,1.61393e-08,0.865762,0.000651414,-4.65812e-07,-5.91149e-09,0.866413,0.000650465,-4.83546e-07,7.50665e-09,0.867063,0.00064952,-4.61026e-07,-2.4115e-08,0.867712,0.000648526,-5.33371e-07,2.93486e-08,0.86836,0.000647547,-4.45325e-07,-3.36748e-08,0.869007,0.000646555,-5.4635e-07,4.57461e-08,0.869653,0.0006456,-4.09112e-07,-3.01002e-08,0.870298,0.000644691,-4.99412e-07,1.50501e-08,0.870942,0.000643738,-4.54262e-07,-3.01002e-08,0.871585,0.000642739,-5.44563e-07,4.57461e-08,0.872228,0.000641787,-4.07324e-07,-3.36748e-08,0.872869,0.000640871,-5.08349e-07,2.93486e-08,0.873509,0.000639943,-4.20303e-07,-2.4115e-08,0.874149,0.00063903,-4.92648e-07,7.50655e-09,0.874787,0.000638067,-4.70128e-07,-5.91126e-09,0.875425,0.000637109,-4.87862e-07,1.61385e-08,0.876062,0.000636182,-4.39447e-07,9.61961e-10,0.876697,0.000635306,-4.36561e-07,-1.99863e-08,0.877332,0.000634373,-4.9652e-07,1.93785e-08,0.877966,0.000633438,-4.38384e-07,2.07697e-09,0.878599,0.000632567,-4.32153e-07,-2.76864e-08,0.879231,0.00063162,-5.15212e-07,4.90641e-08,0.879862,0.000630737,-3.6802e-07,-4.93606e-08,0.880493,0.000629852,-5.16102e-07,2.9169e-08,0.881122,0.000628908,-4.28595e-07,-7.71083e-09,0.881751,0.000628027,-4.51727e-07,1.6744e-09,0.882378,0.000627129,-4.46704e-07,1.01317e-09,0.883005,0.000626239,-4.43665e-07,-5.72703e-09,0.883631,0.000625334,-4.60846e-07,2.1895e-08,0.884255,0.000624478,-3.95161e-07,-2.22481e-08,0.88488,0.000623621,-4.61905e-07,7.4928e-09,0.885503,0.00062272,-4.39427e-07,-7.72306e-09,0.886125,0.000621818,-4.62596e-07,2.33995e-08,0.886746,0.000620963,-3.92398e-07,-2.62704e-08,0.887367,0.000620099,-4.71209e-07,2.20775e-08,0.887987,0.000619223,-4.04976e-07,-2.43496e-09,0.888605,0.000618406,-4.12281e-07,-1.23377e-08,0.889223,0.000617544,-4.49294e-07,-7.81876e-09,0.88984,0.000616622,-4.72751e-07,4.36128e-08,0.890457,0.000615807,-3.41912e-07,-4.7423e-08,0.891072,0.000614981,-4.84181e-0
7,2.68698e-08,0.891687,0.000614093,-4.03572e-07,-4.51384e-10,0.8923,0.000613285,-4.04926e-07,-2.50643e-08,0.892913,0.0006124,-4.80119e-07,4.11038e-08,0.893525,0.000611563,-3.56808e-07,-2.01414e-08,0.894136,0.000610789,-4.17232e-07,-2.01426e-08,0.894747,0.000609894,-4.7766e-07,4.11073e-08,0.895356,0.000609062,-3.54338e-07,-2.50773e-08,0.895965,0.000608278,-4.2957e-07,-4.02954e-10,0.896573,0.000607418,-4.30779e-07,2.66891e-08,0.89718,0.000606636,-3.50711e-07,-4.67489e-08,0.897786,0.000605795,-4.90958e-07,4.10972e-08,0.898391,0.000604936,-3.67666e-07,1.56948e-09,0.898996,0.000604205,-3.62958e-07,-4.73751e-08,0.8996,0.000603337,-5.05083e-07,6.87214e-08,0.900202,0.000602533,-2.98919e-07,-4.86966e-08,0.900805,0.000601789,-4.45009e-07,6.85589e-09,0.901406,0.00060092,-4.24441e-07,2.1273e-08,0.902007,0.000600135,-3.60622e-07,-3.23434e-08,0.902606,0.000599317,-4.57652e-07,4.84959e-08,0.903205,0.000598547,-3.12164e-07,-4.24309e-08,0.903803,0.000597795,-4.39457e-07,2.01844e-09,0.904401,0.000596922,-4.33402e-07,3.43571e-08,0.904997,0.000596159,-3.30331e-07,-2.02374e-08,0.905593,0.000595437,-3.91043e-07,-1.30123e-08,0.906188,0.000594616,-4.3008e-07,1.26819e-08,0.906782,0.000593794,-3.92034e-07,2.18894e-08,0.907376,0.000593076,-3.26366e-07,-4.06349e-08,0.907968,0.000592301,-4.4827e-07,2.1441e-08,0.90856,0.000591469,-3.83947e-07,1.44754e-08,0.909151,0.000590744,-3.40521e-07,-1.97379e-08,0.909742,0.000590004,-3.99735e-07,4.87161e-09,0.910331,0.000589219,-3.8512e-07,2.51532e-10,0.91092,0.00058845,-3.84366e-07,-5.87776e-09,0.911508,0.000587663,-4.01999e-07,2.32595e-08,0.912096,0.000586929,-3.3222e-07,-2.75554e-08,0.912682,0.000586182,-4.14887e-07,2.73573e-08,0.913268,0.000585434,-3.32815e-07,-2.22692e-08,0.913853,0.000584702,-3.99622e-07,2.11486e-09,0.914437,0.000583909,-3.93278e-07,1.38098e-08,0.915021,0.000583164,-3.51848e-07,2.25042e-09,0.915604,0.000582467,-3.45097e-07,-2.28115e-08,0.916186,0.000581708,-4.13531e-07,2.93911e-08,0.916767,0.000580969,-3.25358e-07,-3.51481e-08,0.91734
8,0.000580213,-4.30803e-07,5.15967e-08,0.917928,0.000579506,-2.76012e-07,-5.20296e-08,0.918507,0.000578798,-4.32101e-07,3.73124e-08,0.919085,0.000578046,-3.20164e-07,-3.76154e-08,0.919663,0.000577293,-4.3301e-07,5.35447e-08,0.92024,0.000576587,-2.72376e-07,-5.7354e-08,0.920816,0.000575871,-4.44438e-07,5.66621e-08,0.921391,0.000575152,-2.74452e-07,-5.00851e-08,0.921966,0.000574453,-4.24707e-07,2.4469e-08,0.92254,0.000573677,-3.513e-07,1.18138e-08,0.923114,0.000573009,-3.15859e-07,-1.21195e-08,0.923686,0.000572341,-3.52217e-07,-2.29403e-08,0.924258,0.000571568,-4.21038e-07,4.4276e-08,0.924829,0.000570859,-2.8821e-07,-3.49546e-08,0.9254,0.000570178,-3.93074e-07,3.59377e-08,0.92597,0.000569499,-2.85261e-07,-4.91915e-08,0.926539,0.000568781,-4.32835e-07,4.16189e-08,0.927107,0.00056804,-3.07979e-07,1.92523e-09,0.927675,0.00056743,-3.02203e-07,-4.93198e-08,0.928242,0.000566678,-4.50162e-07,7.61447e-08,0.928809,0.000566006,-2.21728e-07,-7.6445e-08,0.929374,0.000565333,-4.51063e-07,5.08216e-08,0.929939,0.000564583,-2.98599e-07,-7.63212e-09,0.930503,0.000563963,-3.21495e-07,-2.02931e-08,0.931067,0.000563259,-3.82374e-07,2.92001e-08,0.93163,0.000562582,-2.94774e-07,-3.69025e-08,0.932192,0.000561882,-4.05482e-07,5.88053e-08,0.932754,0.000561247,-2.29066e-07,-7.91094e-08,0.933315,0.000560552,-4.66394e-07,7.88184e-08,0.933875,0.000559856,-2.29939e-07,-5.73501e-08,0.934434,0.000559224,-4.01989e-07,3.13727e-08,0.934993,0.000558514,-3.07871e-07,-8.53611e-09,0.935551,0.000557873,-3.33479e-07,2.77175e-09,0.936109,0.000557214,-3.25164e-07,-2.55091e-09,0.936666,0.000556556,-3.32817e-07,7.43188e-09,0.937222,0.000555913,-3.10521e-07,-2.71766e-08,0.937778,0.00055521,-3.92051e-07,4.167e-08,0.938333,0.000554551,-2.67041e-07,-2.02941e-08,0.938887,0.000553956,-3.27923e-07,-2.00984e-08,0.93944,0.00055324,-3.88218e-07,4.10828e-08,0.939993,0.000552587,-2.6497e-07,-2.50237e-08,0.940546,0.000551982,-3.40041e-07,-5.92583e-10,0.941097,0.0005513,-3.41819e-07,2.7394e-08,0.941648,0.000550698,-2.59637e-0
7,-4.93788e-08,0.942199,0.000550031,-4.07773e-07,5.09119e-08,0.942748,0.000549368,-2.55038e-07,-3.50595e-08,0.943297,0.000548753,-3.60216e-07,2.97214e-08,0.943846,0.000548122,-2.71052e-07,-2.42215e-08,0.944394,0.000547507,-3.43716e-07,7.55985e-09,0.944941,0.000546842,-3.21037e-07,-6.01796e-09,0.945487,0.000546182,-3.3909e-07,1.65119e-08,0.946033,0.000545553,-2.89555e-07,-4.2498e-10,0.946578,0.000544973,-2.9083e-07,-1.4812e-08,0.947123,0.000544347,-3.35266e-07,6.83068e-11,0.947667,0.000543676,-3.35061e-07,1.45388e-08,0.94821,0.00054305,-2.91444e-07,1.38123e-09,0.948753,0.000542471,-2.87301e-07,-2.00637e-08,0.949295,0.000541836,-3.47492e-07,1.92688e-08,0.949837,0.000541199,-2.89685e-07,2.59298e-09,0.950378,0.000540628,-2.81906e-07,-2.96407e-08,0.950918,0.000539975,-3.70829e-07,5.63652e-08,0.951458,0.000539402,-2.01733e-07,-7.66107e-08,0.951997,0.000538769,-4.31565e-07,7.12638e-08,0.952535,0.00053812,-2.17774e-07,-2.96305e-08,0.953073,0.000537595,-3.06665e-07,-1.23464e-08,0.95361,0.000536945,-3.43704e-07,1.94114e-08,0.954147,0.000536316,-2.8547e-07,-5.69451e-09,0.954683,0.000535728,-3.02554e-07,3.36666e-09,0.955219,0.000535133,-2.92454e-07,-7.77208e-09,0.955753,0.000534525,-3.1577e-07,2.77216e-08,0.956288,0.000533976,-2.32605e-07,-4.35097e-08,0.956821,0.00053338,-3.63134e-07,2.7108e-08,0.957354,0.000532735,-2.8181e-07,-5.31772e-09,0.957887,0.000532156,-2.97764e-07,-5.83718e-09,0.958419,0.000531543,-3.15275e-07,2.86664e-08,0.95895,0.000530998,-2.29276e-07,-4.9224e-08,0.959481,0.000530392,-3.76948e-07,4.90201e-08,0.960011,0.000529785,-2.29887e-07,-2.76471e-08,0.96054,0.000529243,-3.12829e-07,1.96385e-09,0.961069,0.000528623,-3.06937e-07,1.97917e-08,0.961598,0.000528068,-2.47562e-07,-2.15261e-08,0.962125,0.000527508,-3.1214e-07,6.70795e-09,0.962653,0.000526904,-2.92016e-07,-5.30573e-09,0.963179,0.000526304,-3.07934e-07,1.4515e-08,0.963705,0.000525732,-2.64389e-07,6.85048e-09,0.964231,0.000525224,-2.43837e-07,-4.19169e-08,0.964756,0.00052461,-3.69588e-07,4.1608e-08,0.96528
,0.000523996,-2.44764e-07,-5.30598e-09,0.965804,0.000523491,-2.60682e-07,-2.03841e-08,0.966327,0.000522908,-3.21834e-07,2.72378e-08,0.966849,0.000522346,-2.40121e-07,-2.89625e-08,0.967371,0.000521779,-3.27008e-07,2.90075e-08,0.967893,0.000521212,-2.39986e-07,-2.74629e-08,0.968414,0.00052065,-3.22374e-07,2.12396e-08,0.968934,0.000520069,-2.58656e-07,2.10922e-09,0.969454,0.000519558,-2.52328e-07,-2.96765e-08,0.969973,0.000518964,-3.41357e-07,5.6992e-08,0.970492,0.000518452,-1.70382e-07,-7.90821e-08,0.97101,0.000517874,-4.07628e-07,8.05224e-08,0.971528,0.000517301,-1.66061e-07,-6.41937e-08,0.972045,0.000516776,-3.58642e-07,5.70429e-08,0.972561,0.00051623,-1.87513e-07,-4.47686e-08,0.973077,0.00051572,-3.21819e-07,2.82237e-09,0.973593,0.000515085,-3.13352e-07,3.34792e-08,0.974108,0.000514559,-2.12914e-07,-1.75298e-08,0.974622,0.000514081,-2.65503e-07,-2.29648e-08,0.975136,0.000513481,-3.34398e-07,4.97843e-08,0.975649,0.000512961,-1.85045e-07,-5.6963e-08,0.976162,0.00051242,-3.55934e-07,5.88585e-08,0.976674,0.000511885,-1.79359e-07,-5.92616e-08,0.977185,0.000511348,-3.57143e-07,5.89785e-08,0.977696,0.000510811,-1.80208e-07,-5.74433e-08,0.978207,0.000510278,-3.52538e-07,5.15854e-08,0.978717,0.000509728,-1.97781e-07,-2.9689e-08,0.979226,0.000509243,-2.86848e-07,7.56591e-09,0.979735,0.000508692,-2.64151e-07,-5.74649e-10,0.980244,0.000508162,-2.65875e-07,-5.26732e-09,0.980752,0.000507615,-2.81677e-07,2.16439e-08,0.981259,0.000507116,-2.16745e-07,-2.17037e-08,0.981766,0.000506618,-2.81856e-07,5.56636e-09,0.982272,0.000506071,-2.65157e-07,-5.61689e-10,0.982778,0.000505539,-2.66842e-07,-3.31963e-09,0.983283,0.000504995,-2.76801e-07,1.38402e-08,0.983788,0.000504483,-2.3528e-07,7.56339e-09,0.984292,0.000504035,-2.1259e-07,-4.40938e-08,0.984796,0.000503478,-3.44871e-07,4.96026e-08,0.985299,0.000502937,-1.96064e-07,-3.51071e-08,0.985802,0.000502439,-3.01385e-07,3.12212e-08,0.986304,0.00050193,-2.07721e-07,-3.0173e-08,0.986806,0.000501424,-2.9824e-07,2.9866e-08,0.987307,0.000500917,-
2.08642e-07,-2.96865e-08,0.987808,0.000500411,-2.97702e-07,2.92753e-08,0.988308,0.000499903,-2.09876e-07,-2.78101e-08,0.988807,0.0004994,-2.93306e-07,2.23604e-08,0.989307,0.000498881,-2.26225e-07,-2.02681e-09,0.989805,0.000498422,-2.32305e-07,-1.42531e-08,0.990303,0.000497915,-2.75065e-07,-5.65232e-10,0.990801,0.000497363,-2.76761e-07,1.65141e-08,0.991298,0.000496859,-2.27218e-07,-5.88639e-09,0.991795,0.000496387,-2.44878e-07,7.0315e-09,0.992291,0.000495918,-2.23783e-07,-2.22396e-08,0.992787,0.000495404,-2.90502e-07,2.23224e-08,0.993282,0.00049489,-2.23535e-07,-7.44543e-09,0.993776,0.000494421,-2.45871e-07,7.45924e-09,0.994271,0.000493951,-2.23493e-07,-2.23915e-08,0.994764,0.000493437,-2.90668e-07,2.25021e-08,0.995257,0.000492923,-2.23161e-07,-8.01218e-09,0.99575,0.000492453,-2.47198e-07,9.54669e-09,0.996242,0.000491987,-2.18558e-07,-3.01746e-08,0.996734,0.000491459,-3.09082e-07,5.1547e-08,0.997225,0.000490996,-1.54441e-07,-5.68039e-08,0.997716,0.000490517,-3.24853e-07,5.64594e-08,0.998206,0.000490036,-1.55474e-07,-4.98245e-08,0.998696,0.000489576,-3.04948e-07,2.36292e-08,0.999186,0.000489037,-2.3406e-07,1.49121e-08,0.999674,0.000488613,-1.89324e-07,-2.3673e-08,1.00016,0.000488164,-2.60343e-07,2.01754e-08,1.00065,0.000487704,-1.99816e-07,-5.70288e-08,1.00114,0.000487133,-3.70903e-07,8.87303e-08,1.00162,0.000486657,-1.04712e-07,-5.94737e-08,1.00211,0.000486269,-2.83133e-07,2.99553e-08,1.0026,0.000485793,-1.93267e-07,-6.03474e-08,1.00308,0.000485225,-3.74309e-07,9.2225e-08,1.00357,0.000484754,-9.76345e-08,-7.0134e-08,1.00405,0.000484348,-3.08036e-07,6.91016e-08,1.00454,0.000483939,-1.00731e-07,-8.70633e-08,1.00502,0.000483476,-3.61921e-07,4.07328e-08,1.0055,0.000482875,-2.39723e-07,4.33413e-08,1.00599,0.000482525,-1.09699e-07,-9.48886e-08,1.00647,0.000482021,-3.94365e-07,9.77947e-08,1.00695,0.000481526,-1.00981e-07,-5.78713e-08,1.00743,0.00048115,-2.74595e-07,1.44814e-08,1.00791,0.000480645,-2.31151e-07,-5.42665e-11,1.00839,0.000480182,-2.31314e-07,-1.42643e-08,1.0088
7,0.000479677,-2.74106e-07,5.71115e-08,1.00935,0.0004793,-1.02772e-07,-9.49724e-08,1.00983,0.000478809,-3.87689e-07,8.43596e-08,1.01031,0.000478287,-1.3461e-07,-4.04755e-09,1.01079,0.000478006,-1.46753e-07,-6.81694e-08,1.01127,0.000477508,-3.51261e-07,3.83067e-08,1.01174,0.00047692,-2.36341e-07,3.41521e-08,1.01222,0.00047655,-1.33885e-07,-5.57058e-08,1.0127,0.000476115,-3.01002e-07,6.94616e-08,1.01317,0.000475721,-9.26174e-08,-1.02931e-07,1.01365,0.000475227,-4.01412e-07,1.03846e-07,1.01412,0.000474736,-8.98751e-08,-7.40321e-08,1.0146,0.000474334,-3.11971e-07,7.30735e-08,1.01507,0.00047393,-9.27508e-08,-9.90527e-08,1.01554,0.000473447,-3.89909e-07,8.47188e-08,1.01602,0.000472921,-1.35753e-07,-1.40381e-09,1.01649,0.000472645,-1.39964e-07,-7.91035e-08,1.01696,0.000472128,-3.77275e-07,7.93993e-08,1.01744,0.000471612,-1.39077e-07,-7.52607e-11,1.01791,0.000471334,-1.39302e-07,-7.90983e-08,1.01838,0.000470818,-3.76597e-07,7.80499e-08,1.01885,0.000470299,-1.42448e-07,5.31733e-09,1.01932,0.00047003,-1.26496e-07,-9.93193e-08,1.01979,0.000469479,-4.24453e-07,1.53541e-07,1.02026,0.00046909,3.617e-08,-1.57217e-07,1.02073,0.000468691,-4.35482e-07,1.177e-07,1.02119,0.000468173,-8.23808e-08,-7.51659e-08,1.02166,0.000467783,-3.07878e-07,6.37538e-08,1.02213,0.000467358,-1.16617e-07,-6.064e-08,1.0226,0.000466943,-2.98537e-07,5.9597e-08,1.02306,0.000466525,-1.19746e-07,-5.85386e-08,1.02353,0.00046611,-2.95362e-07,5.53482e-08,1.024,0.000465685,-1.29317e-07,-4.36449e-08,1.02446,0.000465296,-2.60252e-07,2.20268e-11,1.02493,0.000464775,-2.60186e-07,4.35568e-08,1.02539,0.000464386,-1.29516e-07,-5.50398e-08,1.02586,0.000463961,-2.94635e-07,5.73932e-08,1.02632,0.000463544,-1.22456e-07,-5.53236e-08,1.02678,0.000463133,-2.88426e-07,4.46921e-08,1.02725,0.000462691,-1.5435e-07,-4.23534e-09,1.02771,0.000462369,-1.67056e-07,-2.77507e-08,1.02817,0.000461952,-2.50308e-07,-3.97101e-09,1.02863,0.000461439,-2.62221e-07,4.36348e-08,1.02909,0.000461046,-1.31317e-07,-5.13589e-08,1.02955,0.000460629,-2.853
94e-07,4.25913e-08,1.03001,0.000460186,-1.5762e-07,2.0285e-10,1.03047,0.000459871,-1.57011e-07,-4.34027e-08,1.03093,0.000459427,-2.87219e-07,5.41987e-08,1.03139,0.000459015,-1.24623e-07,-5.4183e-08,1.03185,0.000458604,-2.87172e-07,4.33239e-08,1.03231,0.000458159,-1.572e-07,9.65817e-11,1.03277,0.000457845,-1.56911e-07,-4.37103e-08,1.03323,0.0004574,-2.88041e-07,5.55351e-08,1.03368,0.000456991,-1.21436e-07,-5.9221e-08,1.03414,0.00045657,-2.99099e-07,6.21394e-08,1.0346,0.000456158,-1.1268e-07,-7.01275e-08,1.03505,0.000455723,-3.23063e-07,9.91614e-08,1.03551,0.000455374,-2.55788e-08,-8.80996e-08,1.03596,0.000455058,-2.89878e-07,1.48184e-08,1.03642,0.000454523,-2.45422e-07,2.88258e-08,1.03687,0.000454119,-1.58945e-07,-1.09125e-08,1.03733,0.000453768,-1.91682e-07,1.48241e-08,1.03778,0.000453429,-1.4721e-07,-4.83838e-08,1.03823,0.00045299,-2.92361e-07,5.95019e-08,1.03869,0.000452584,-1.13856e-07,-7.04146e-08,1.03914,0.000452145,-3.25099e-07,1.02947e-07,1.03959,0.000451803,-1.62583e-08,-1.02955e-07,1.04004,0.000451462,-3.25123e-07,7.04544e-08,1.04049,0.000451023,-1.1376e-07,-5.96534e-08,1.04094,0.000450616,-2.9272e-07,4.89499e-08,1.04139,0.000450178,-1.45871e-07,-1.69369e-08,1.04184,0.000449835,-1.96681e-07,1.87977e-08,1.04229,0.000449498,-1.40288e-07,-5.82539e-08,1.04274,0.000449043,-3.1505e-07,9.50087e-08,1.04319,0.000448698,-3.00238e-08,-8.33623e-08,1.04364,0.000448388,-2.80111e-07,2.20363e-11,1.04409,0.000447828,-2.80045e-07,8.32742e-08,1.04454,0.000447517,-3.02221e-08,-9.47002e-08,1.04498,0.000447173,-3.14323e-07,5.7108e-08,1.04543,0.000446716,-1.42999e-07,-1.45225e-08,1.04588,0.000446386,-1.86566e-07,9.82022e-10,1.04632,0.000446016,-1.8362e-07,1.05944e-08,1.04677,0.00044568,-1.51837e-07,-4.33597e-08,1.04721,0.000445247,-2.81916e-07,4.36352e-08,1.04766,0.000444814,-1.51011e-07,-1.19717e-08,1.0481,0.000444476,-1.86926e-07,4.25158e-09,1.04855,0.000444115,-1.74171e-07,-5.03461e-09,1.04899,0.000443751,-1.89275e-07,1.58868e-08,1.04944,0.00044342,-1.41614e-07,-5.85127e-08,1.
04988,0.000442961,-3.17152e-07,9.89548e-08,1.05032,0.000442624,-2.0288e-08,-9.88878e-08,1.05076,0.000442287,-3.16951e-07,5.81779e-08,1.05121,0.000441827,-1.42418e-07,-1.46144e-08,1.05165,0.000441499,-1.86261e-07,2.79892e-10,1.05209,0.000441127,-1.85421e-07,1.34949e-08,1.05253,0.000440797,-1.44937e-07,-5.42594e-08,1.05297,0.000440344,-3.07715e-07,8.43335e-08,1.05341,0.000439982,-5.47146e-08,-4.46558e-08,1.05385,0.000439738,-1.88682e-07,-2.49193e-08,1.05429,0.000439286,-2.6344e-07,2.5124e-08,1.05473,0.000438835,-1.88068e-07,4.36328e-08,1.05517,0.000438589,-5.71699e-08,-8.04459e-08,1.05561,0.000438234,-2.98508e-07,3.97324e-08,1.05605,0.000437756,-1.79311e-07,4.07258e-08,1.05648,0.000437519,-5.71332e-08,-8.34263e-08,1.05692,0.000437155,-3.07412e-07,5.45608e-08,1.05736,0.000436704,-1.4373e-07,-1.56078e-08,1.05779,0.000436369,-1.90553e-07,7.87043e-09,1.05823,0.000436012,-1.66942e-07,-1.58739e-08,1.05867,0.00043563,-2.14563e-07,5.56251e-08,1.0591,0.000435368,-4.76881e-08,-8.74172e-08,1.05954,0.000435011,-3.0994e-07,5.56251e-08,1.05997,0.000434558,-1.43064e-07,-1.58739e-08,1.06041,0.000434224,-1.90686e-07,7.87042e-09,1.06084,0.000433866,-1.67075e-07,-1.56078e-08,1.06127,0.000433485,-2.13898e-07,5.45609e-08,1.06171,0.000433221,-5.02157e-08,-8.34263e-08,1.06214,0.00043287,-3.00495e-07,4.07258e-08,1.06257,0.000432391,-1.78317e-07,3.97325e-08,1.063,0.000432154,-5.91198e-08,-8.04464e-08,1.06344,0.000431794,-3.00459e-07,4.36347e-08,1.06387,0.000431324,-1.69555e-07,2.5117e-08,1.0643,0.000431061,-9.42041e-08,-2.48934e-08,1.06473,0.000430798,-1.68884e-07,-4.47527e-08,1.06516,0.000430326,-3.03142e-07,8.46951e-08,1.06559,0.000429973,-4.90573e-08,-5.56089e-08,1.06602,0.000429708,-2.15884e-07,1.85314e-08,1.06645,0.000429332,-1.6029e-07,-1.85166e-08,1.06688,0.000428956,-2.1584e-07,5.5535e-08,1.06731,0.000428691,-4.92347e-08,-8.44142e-08,1.06774,0.000428339,-3.02477e-07,4.37032e-08,1.06816,0.000427865,-1.71368e-07,2.88107e-08,1.06859,0.000427609,-8.49356e-08,-3.97367e-08,1.06902,0.0004273
2,-2.04146e-07,1.09267e-08,1.06945,0.000426945,-1.71365e-07,-3.97023e-09,1.06987,0.00042659,-1.83276e-07,4.9542e-09,1.0703,0.000426238,-1.68414e-07,-1.58466e-08,1.07073,0.000425854,-2.15953e-07,5.84321e-08,1.07115,0.000425597,-4.0657e-08,-9.86725e-08,1.07158,0.00042522,-3.36674e-07,9.78392e-08,1.072,0.00042484,-4.31568e-08,-5.42658e-08,1.07243,0.000424591,-2.05954e-07,1.45377e-11,1.07285,0.000424179,-2.0591e-07,5.42076e-08,1.07328,0.00042393,-4.32877e-08,-9.76357e-08,1.0737,0.00042355,-3.36195e-07,9.79165e-08,1.07412,0.000423172,-4.24451e-08,-5.56118e-08,1.07455,0.00042292,-2.09281e-07,5.32143e-09,1.07497,0.000422518,-1.93316e-07,3.43261e-08,1.07539,0.000422234,-9.0338e-08,-2.34165e-08,1.07581,0.000421983,-1.60588e-07,-5.98692e-08,1.07623,0.000421482,-3.40195e-07,1.43684e-07,1.07666,0.000421233,9.08574e-08,-1.5724e-07,1.07708,0.000420943,-3.80862e-07,1.27647e-07,1.0775,0.000420564,2.0791e-09,-1.1493e-07,1.07792,0.000420223,-3.4271e-07,9.36534e-08,1.07834,0.000419819,-6.17499e-08,-2.12653e-08,1.07876,0.000419632,-1.25546e-07,-8.59219e-09,1.07918,0.000419355,-1.51322e-07,-6.35752e-08,1.0796,0.000418861,-3.42048e-07,1.43684e-07,1.08002,0.000418608,8.90034e-08,-1.53532e-07,1.08043,0.000418326,-3.71593e-07,1.12817e-07,1.08085,0.000417921,-3.31414e-08,-5.93184e-08,1.08127,0.000417677,-2.11097e-07,5.24697e-09,1.08169,0.00041727,-1.95356e-07,3.83305e-08,1.0821,0.000416995,-8.03642e-08,-3.93597e-08,1.08252,0.000416716,-1.98443e-07,-1.0094e-10,1.08294,0.000416319,-1.98746e-07,3.97635e-08,1.08335,0.00041604,-7.94557e-08,-3.97437e-08,1.08377,0.000415762,-1.98687e-07,1.94215e-12,1.08419,0.000415365,-1.98681e-07,3.97359e-08,1.0846,0.000415087,-7.94732e-08,-3.97362e-08,1.08502,0.000414809,-1.98682e-07,-4.31063e-13,1.08543,0.000414411,-1.98683e-07,3.97379e-08,1.08584,0.000414133,-7.94694e-08,-3.97418e-08,1.08626,0.000413855,-1.98695e-07,2.00563e-11,1.08667,0.000413458,-1.98635e-07,3.96616e-08,1.08709,0.000413179,-7.965e-08,-3.9457e-08,1.0875,0.000412902,-1.98021e-07,-1.04281e-09,1.
08791,0.000412502,-2.01149e-07,4.36282e-08,1.08832,0.000412231,-7.02648e-08,-5.42608e-08,1.08874,0.000411928,-2.33047e-07,5.42057e-08,1.08915,0.000411624,-7.04301e-08,-4.33527e-08,1.08956,0.000411353,-2.00488e-07,-4.07378e-12,1.08997,0.000410952,-2.005e-07,4.3369e-08,1.09038,0.000410681,-7.03934e-08,-5.42627e-08,1.09079,0.000410378,-2.33182e-07,5.44726e-08,1.0912,0.000410075,-6.97637e-08,-4.44186e-08,1.09161,0.000409802,-2.03019e-07,3.99235e-09,1.09202,0.000409408,-1.91042e-07,2.84491e-08,1.09243,0.000409111,-1.05695e-07,1.42043e-09,1.09284,0.000408904,-1.01434e-07,-3.41308e-08,1.09325,0.000408599,-2.03826e-07,1.58937e-08,1.09366,0.000408239,-1.56145e-07,-2.94438e-08,1.09406,0.000407838,-2.44476e-07,1.01881e-07,1.09447,0.000407655,6.11676e-08,-1.39663e-07,1.09488,0.000407358,-3.57822e-07,9.91432e-08,1.09529,0.00040694,-6.03921e-08,-1.84912e-08,1.09569,0.000406764,-1.15866e-07,-2.51785e-08,1.0961,0.000406457,-1.91401e-07,-4.03115e-12,1.09651,0.000406074,-1.91413e-07,2.51947e-08,1.09691,0.000405767,-1.15829e-07,1.84346e-08,1.09732,0.00040559,-6.05254e-08,-9.89332e-08,1.09772,0.000405172,-3.57325e-07,1.3888e-07,1.09813,0.000404874,5.93136e-08,-9.8957e-08,1.09853,0.000404696,-2.37557e-07,1.853e-08,1.09894,0.000404277,-1.81968e-07,2.48372e-08,1.09934,0.000403987,-1.07456e-07,1.33047e-09,1.09975,0.000403776,-1.03465e-07,-3.01591e-08,1.10015,0.000403479,-1.93942e-07,9.66054e-11,1.10055,0.000403091,-1.93652e-07,2.97727e-08,1.10096,0.000402793,-1.04334e-07,2.19273e-11,1.10136,0.000402585,-1.04268e-07,-2.98604e-08,1.10176,0.000402287,-1.93849e-07,2.10325e-10,1.10216,0.0004019,-1.93218e-07,2.90191e-08,1.10256,0.0004016,-1.06161e-07,2.92264e-09,1.10297,0.000401397,-9.73931e-08,-4.07096e-08,1.10337,0.00040108,-2.19522e-07,4.07067e-08,1.10377,0.000400763,-9.7402e-08,-2.90783e-09,1.10417,0.000400559,-1.06126e-07,-2.90754e-08,1.10457,0.00040026,-1.93352e-07,9.00021e-14,1.10497,0.000399873,-1.93351e-07,2.9075e-08,1.10537,0.000399574,-1.06126e-07,2.90902e-09,1.10577,0.00039937,-9.739
92e-08,-4.07111e-08,1.10617,0.000399053,-2.19533e-07,4.07262e-08,1.10657,0.000398736,-9.73541e-08,-2.98424e-09,1.10697,0.000398533,-1.06307e-07,-2.87892e-08,1.10736,0.000398234,-1.92674e-07,-1.06824e-09,1.10776,0.000397845,-1.95879e-07,3.30622e-08,1.10816,0.000397552,-9.66926e-08,-1.19712e-08,1.10856,0.000397323,-1.32606e-07,1.48225e-08,1.10895,0.000397102,-8.81387e-08,-4.73187e-08,1.10935,0.000396784,-2.30095e-07,5.52429e-08,1.10975,0.00039649,-6.4366e-08,-5.44437e-08,1.11014,0.000396198,-2.27697e-07,4.33226e-08,1.11054,0.000395872,-9.77293e-08,3.62656e-10,1.11094,0.000395678,-9.66414e-08,-4.47732e-08,1.11133,0.00039535,-2.30961e-07,5.95208e-08,1.11173,0.000395067,-5.23985e-08,-7.41008e-08,1.11212,0.00039474,-2.74701e-07,1.17673e-07,1.11252,0.000394543,7.83181e-08,-1.58172e-07,1.11291,0.000394225,-3.96199e-07,1.57389e-07,1.1133,0.000393905,7.59679e-08,-1.13756e-07,1.1137,0.000393716,-2.653e-07,5.92165e-08,1.11409,0.000393363,-8.76507e-08,-3.90074e-09,1.11449,0.000393176,-9.93529e-08,-4.36136e-08,1.11488,0.000392846,-2.30194e-07,5.91457e-08,1.11527,0.000392563,-5.27564e-08,-7.376e-08,1.11566,0.000392237,-2.74037e-07,1.16685e-07,1.11606,0.000392039,7.60189e-08,-1.54562e-07,1.11645,0.000391727,-3.87667e-07,1.43935e-07,1.11684,0.000391384,4.4137e-08,-6.35487e-08,1.11723,0.000391281,-1.46509e-07,-8.94896e-09,1.11762,0.000390961,-1.73356e-07,-1.98647e-08,1.11801,0.000390555,-2.3295e-07,8.8408e-08,1.1184,0.000390354,3.22736e-08,-9.53486e-08,1.11879,0.000390133,-2.53772e-07,5.45677e-08,1.11918,0.000389789,-9.0069e-08,-3.71296e-09,1.11957,0.000389598,-1.01208e-07,-3.97159e-08,1.11996,0.000389276,-2.20355e-07,4.33671e-08,1.12035,0.000388966,-9.02542e-08,-1.45431e-08,1.12074,0.000388741,-1.33883e-07,1.48052e-08,1.12113,0.000388518,-8.94678e-08,-4.46778e-08,1.12152,0.000388205,-2.23501e-07,4.46966e-08,1.12191,0.000387892,-8.94114e-08,-1.48992e-08,1.12229,0.000387669,-1.34109e-07,1.49003e-08,1.12268,0.000387445,-8.94082e-08,-4.47019e-08,1.12307,0.000387132,-2.23514e-07,4.4698e-
08,1.12345,0.000386819,-8.942e-08,-1.48806e-08,1.12384,0.000386596,-1.34062e-07,1.48245e-08,1.12423,0.000386372,-8.95885e-08,-4.44172e-08,1.12461,0.00038606,-2.2284e-07,4.36351e-08,1.125,0.000385745,-9.19348e-08,-1.09139e-08,1.12539,0.000385528,-1.24677e-07,2.05584e-11,1.12577,0.000385279,-1.24615e-07,1.08317e-08,1.12616,0.000385062,-9.21198e-08,-4.33473e-08,1.12654,0.000384748,-2.22162e-07,4.33481e-08,1.12693,0.000384434,-9.21174e-08,-1.08356e-08,1.12731,0.000384217,-1.24624e-07,-5.50907e-12,1.12769,0.000383968,-1.24641e-07,1.08577e-08,1.12808,0.000383751,-9.20679e-08,-4.34252e-08,1.12846,0.000383437,-2.22343e-07,4.36337e-08,1.12884,0.000383123,-9.14422e-08,-1.19005e-08,1.12923,0.000382904,-1.27144e-07,3.96813e-09,1.12961,0.000382662,-1.15239e-07,-3.97207e-09,1.12999,0.000382419,-1.27155e-07,1.19201e-08,1.13038,0.000382201,-9.1395e-08,-4.37085e-08,1.13076,0.000381887,-2.2252e-07,4.37046e-08,1.13114,0.000381573,-9.14068e-08,-1.19005e-08,1.13152,0.000381355,-1.27108e-07,3.89734e-09,1.1319,0.000381112,-1.15416e-07,-3.68887e-09,1.13228,0.00038087,-1.26483e-07,1.08582e-08,1.13266,0.00038065,-9.39083e-08,-3.97438e-08,1.13304,0.000380343,-2.1314e-07,2.89076e-08,1.13342,0.000380003,-1.26417e-07,4.33225e-08,1.1338,0.00037988,3.55072e-09,-8.29883e-08,1.13418,0.000379638,-2.45414e-07,5.0212e-08,1.13456,0.000379298,-9.47781e-08,1.34964e-09,1.13494,0.000379113,-9.07292e-08,-5.56105e-08,1.13532,0.000378764,-2.57561e-07,1.01883e-07,1.1357,0.000378555,4.80889e-08,-1.13504e-07,1.13608,0.000378311,-2.92423e-07,1.13713e-07,1.13646,0.000378067,4.87176e-08,-1.02931e-07,1.13683,0.000377856,-2.60076e-07,5.95923e-08,1.13721,0.000377514,-8.12988e-08,-1.62288e-08,1.13759,0.000377303,-1.29985e-07,5.32278e-09,1.13797,0.000377059,-1.14017e-07,-5.06237e-09,1.13834,0.000376816,-1.29204e-07,1.49267e-08,1.13872,0.000376602,-8.44237e-08,-5.46444e-08,1.1391,0.000376269,-2.48357e-07,8.44417e-08,1.13947,0.000376026,4.96815e-09,-4.47039e-08,1.13985,0.000375902,-1.29143e-07,-2.48355e-08,1.14023,0.000375
569,-2.0365e-07,2.48368e-08,1.1406,0.000375236,-1.2914e-07,4.46977e-08,1.14098,0.000375112,4.95341e-09,-8.44184e-08,1.14135,0.000374869,-2.48302e-07,5.45572e-08,1.14173,0.000374536,-8.463e-08,-1.46013e-08,1.1421,0.000374323,-1.28434e-07,3.8478e-09,1.14247,0.000374077,-1.1689e-07,-7.89941e-10,1.14285,0.000373841,-1.1926e-07,-6.88042e-10,1.14322,0.0003736,-1.21324e-07,3.54213e-09,1.1436,0.000373368,-1.10698e-07,-1.34805e-08,1.14397,0.000373107,-1.51139e-07,5.03798e-08,1.14434,0.000372767,0.,0.};
+        // Converts one float pixel to L, u, v: reads src.x/.y/.z (channel order chosen by blueIdx) and writes dst.x = L, dst.y = u, dst.z = v.
+        template <bool srgb, int blueIdx, typename T, typename D>
+        __device__ __forceinline__ void RGB2LuvConvert_f(const T& src, D& dst)
+        {
+            const float _d = 1.f / (0.950456f + 15 + 1.088754f * 3); // 1 / (Xn + 15 * Yn + 3 * Zn) with Yn = 1
+            const float _un = 13 * (4 * 0.950456f * _d); // value of X * d (see below) at X = Xn, Y = 1, Z = Zn
+            const float _vn = 13 * (9 * _d); // value of (9 * 0.25f) * Y * d (see below) at the same point
+            // Xn = 0.950456 and Zn = 1.088754 are the X and Z row sums of the matrix below, i.e. the XYZ of R = G = B = 1.
+            float B = blueIdx == 0 ? src.x : src.z; // blueIdx == 0: blue comes from .x and red from .z; otherwise swapped
+            float G = src.y;
+            float R = blueIdx == 0 ? src.z : src.x;
+            // With srgb set, each channel first goes through a spline lookup in c_sRGBGammaTab (presumably sRGB -> linear; table not visible here).
+            if (srgb)
+            {
+                B = splineInterpolate(B * GAMMA_TAB_SIZE, c_sRGBGammaTab, GAMMA_TAB_SIZE);
+                G = splineInterpolate(G * GAMMA_TAB_SIZE, c_sRGBGammaTab, GAMMA_TAB_SIZE);
+                R = splineInterpolate(R * GAMMA_TAB_SIZE, c_sRGBGammaTab, GAMMA_TAB_SIZE);
+            }
+            // Linear combination of R, G, B giving X, Y, Z; the Y row coefficients sum to 1.
+            float X = R * 0.412453f + G * 0.357580f + B * 0.180423f;
+            float Y = R * 0.212671f + G * 0.715160f + B * 0.072169f;
+            float Z = R * 0.019334f + G * 0.119193f + B * 0.950227f;
+            // L = 116 * f(Y) - 16, where f is a spline lookup in c_LabCbrtTab (presumably a cube-root table; contents not visible here).
+            float L = splineInterpolate(Y * (LAB_CBRT_TAB_SIZE / 1.5f), c_LabCbrtTab, LAB_CBRT_TAB_SIZE);
+            L = 116.f * L - 16.f;
+            // d = 52 / (X + 15 * Y + 3 * Z); the epsilon clamp keeps the division finite when X = Y = Z = 0.
+            const float d = (4 * 13) / ::fmaxf(X + 15 * Y + 3 * Z, numeric_limits<float>::epsilon());
+            float u = L * (X * d - _un);
+            float v = L * ((9 * 0.25f) * Y * d - _vn);
+            // Output layout: (L, u, v) in (.x, .y, .z).
+            dst.x = L;
+            dst.y = u;
+            dst.z = v;
+        }
+        // 8-bit front end for RGB2LuvConvert_f: scales the input channels by 1/255, converts, then packs L, u, v into uchar.
+        template <bool srgb, int blueIdx, typename T, typename D>
+        __device__ __forceinline__ void RGB2LuvConvert_b(const T& src, D& dst)
+        {
+            const float toUnit = 1.f / 255.f;
+            float3 rgbUnit;
+            rgbUnit.x = src.x * toUnit;
+            rgbUnit.y = src.y * toUnit;
+            rgbUnit.z = src.z * toUnit;
+            float3 luv;
+            RGB2LuvConvert_f<srgb, blueIdx>(rgbUnit, luv);
+            // L is scaled by 2.55; u and v are mapped as (u + 134) * 255 / 354 and (v + 140) * 255 / 262 before saturation.
+            dst.x = saturate_cast<uchar>(luv.x * 2.55f);
+            dst.y = saturate_cast<uchar>(luv.y * 0.72033898305084743f + 96.525423728813564f);
+            dst.z = saturate_cast<uchar>(luv.z * 0.9732824427480916f + 136.259541984732824f);
+        }
+        // RGB2Luv<T, scn, dcn, srgb, blueIdx>: per-pixel functor; this specialization handles uchar pixels through RGB2LuvConvert_b.
+        template <typename T, int scn, int dcn, bool srgb, int blueIdx> struct RGB2Luv;
+        template <int scn, int dcn, bool srgb, int blueIdx>
+        struct RGB2Luv<uchar, scn, dcn, srgb, blueIdx>
+            : unary_function<typename TypeVec<uchar, scn>::vec_type, typename TypeVec<uchar, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ RGB2Luv() {}
+            __host__ __device__ __forceinline__ RGB2Luv(const RGB2Luv&) {}
+            // Takes a scn-channel uchar vector and returns the converted dcn-channel uchar vector.
+            __device__ __forceinline__ typename TypeVec<uchar, dcn>::vec_type operator ()(const typename TypeVec<uchar, scn>::vec_type& src) const
+            {
+                typename TypeVec<uchar, dcn>::vec_type luv;
+                RGB2LuvConvert_b<srgb, blueIdx>(src, luv);
+                // luv was filled in place by the call above.
+                return luv;
+            }
+        };
+        // float specialization of RGB2Luv: converts each pixel with RGB2LuvConvert_f.
+        template <int scn, int dcn, bool srgb, int blueIdx>
+        struct RGB2Luv<float, scn, dcn, srgb, blueIdx>
+            : unary_function<typename TypeVec<float, scn>::vec_type, typename TypeVec<float, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ RGB2Luv() {}
+            __host__ __device__ __forceinline__ RGB2Luv(const RGB2Luv&) {}
+            // Takes a scn-channel float vector and returns the converted dcn-channel float vector.
+            __device__ __forceinline__ typename TypeVec<float, dcn>::vec_type operator ()(const typename TypeVec<float, scn>::vec_type& src) const
+            {
+                typename TypeVec<float, dcn>::vec_type luv;
+                RGB2LuvConvert_f<srgb, blueIdx>(src, luv);
+                return luv;
+            }
+        };
+    }
+// Defines `template <typename T> struct <name>_traits`: functor_type is color_detail::RGB2Luv<T, scn, dcn, srgb, blueIdx>; create_functor() returns a default-constructed one.
+#define OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(name, scn, dcn, srgb, blueIdx) \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2Luv<T, scn, dcn, srgb, blueIdx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        template <bool srgb, int blueIdx, typename T, typename D>
+        __device__ __forceinline__ void Luv2RGBConvert_f(const T& src, D& dst) // reads (L, u, v) from src.x/.y/.z and writes float colour channels to dst
+        {
+            const float _d = 1.f / (0.950456f + 15 + 1.088754f * 3); // same constants as in RGB2LuvConvert_f, but without its factor 13
+            const float _un = 4 * 0.950456f * _d;
+            const float _vn = 9 * _d;
+            // Input layout: .x = L, .y = u, .z = v.
+            float L = src.x;
+            float u = src.y;
+            float v = src.z;
+            // Y = ((L + 16) / 116)^3, i.e. the algebraic inverse of L = 116 * t - 16 followed by cubing t.
+            float Y = (L + 16.f) * (1.f / 116.f);
+            Y = Y * Y * Y;
+            // NOTE(review): L is not guarded against 0 here (RGB2LuvConvert_f clamps its divisor with epsilon); L == 0 divides by zero - confirm inputs.
+            float d = (1.f / 13.f) / L;
+            u = u * d + _un;
+            v = v * d + _vn;
+            // X = 9 * u / (4 * v) * Y and Z = (12 - 3 * u - 20 * v) / (4 * v) * Y, using the rescaled u and v from above.
+            float iv = 1.f / v;
+            float X = 2.25f * u * Y * iv;
+            float Z = (12 - 3 * u - 20 * v) * Y * 0.25f * iv;
+            // Linear combination of X, Y, Z (presumably the inverse of the RGB -> XYZ matrix in RGB2LuvConvert_f - TODO confirm).
+            float B = 0.055648f * X - 0.204043f * Y + 1.057311f * Z;
+            float G = -0.969256f * X + 1.875991f * Y + 0.041556f * Z;
+            float R = 3.240479f * X - 1.537150f * Y - 0.498535f * Z;
+            // With srgb set, each channel goes through a spline lookup in c_sRGBInvGammaTab (presumably linear -> sRGB; table not visible here).
+            if (srgb)
+            {
+                B = splineInterpolate(B * GAMMA_TAB_SIZE, c_sRGBInvGammaTab, GAMMA_TAB_SIZE);
+                G = splineInterpolate(G * GAMMA_TAB_SIZE, c_sRGBInvGammaTab, GAMMA_TAB_SIZE);
+                R = splineInterpolate(R * GAMMA_TAB_SIZE, c_sRGBInvGammaTab, GAMMA_TAB_SIZE);
+            }
+            // blueIdx == 0 stores blue in .x and red in .z; otherwise swapped. setAlpha is then called with ColorChannel<float>::max().
+            dst.x = blueIdx == 0 ? B : R;
+            dst.y = G;
+            dst.z = blueIdx == 0 ? R : B;
+            setAlpha(dst, ColorChannel<float>::max());
+        }
+        // 8-bit front end for Luv2RGBConvert_f: unpacks uchar L, u, v to float, converts, then scales the result by 255.
+        template <bool srgb, int blueIdx, typename T, typename D>
+        __device__ __forceinline__ void Luv2RGBConvert_b(const T& src, D& dst)
+        {
+            // Unpack the 8-bit channels: L * 100 / 255, u * 354 / 255 - 134, v * 262 / 255 - 140.
+            float3 luv;
+            luv.x = src.x * (100.f / 255.f);
+            luv.y = src.y * 1.388235294117647f - 134.f;
+            luv.z = src.z * 1.027450980392157f - 140.f;
+            float3 rgbUnit;
+            Luv2RGBConvert_f<srgb, blueIdx>(luv, rgbUnit);
+            // Scale each converted channel by 255 and saturate to uchar; setAlpha is called last.
+            dst.x = saturate_cast<uchar>(rgbUnit.x * 255.f);
+            dst.y = saturate_cast<uchar>(rgbUnit.y * 255.f);
+            dst.z = saturate_cast<uchar>(rgbUnit.z * 255.f);
+            setAlpha(dst, ColorChannel<uchar>::max());
+        }
+        // Luv2RGB<T, scn, dcn, srgb, blueIdx>: per-pixel functor; this specialization handles uchar pixels through Luv2RGBConvert_b.
+        template <typename T, int scn, int dcn, bool srgb, int blueIdx> struct Luv2RGB;
+        template <int scn, int dcn, bool srgb, int blueIdx>
+        struct Luv2RGB<uchar, scn, dcn, srgb, blueIdx>
+            : unary_function<typename TypeVec<uchar, scn>::vec_type, typename TypeVec<uchar, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ Luv2RGB() {}
+            __host__ __device__ __forceinline__ Luv2RGB(const Luv2RGB&) {}
+            // Takes a scn-channel uchar vector and returns the converted dcn-channel uchar vector.
+            __device__ __forceinline__ typename TypeVec<uchar, dcn>::vec_type operator ()(const typename TypeVec<uchar, scn>::vec_type& src) const
+            {
+                typename TypeVec<uchar, dcn>::vec_type rgb;
+                Luv2RGBConvert_b<srgb, blueIdx>(src, rgb);
+                // rgb was filled in place by the call above.
+                return rgb;
+            }
+        };
+        // float specialization of Luv2RGB: converts each pixel with Luv2RGBConvert_f.
+        template <int scn, int dcn, bool srgb, int blueIdx>
+        struct Luv2RGB<float, scn, dcn, srgb, blueIdx>
+            : unary_function<typename TypeVec<float, scn>::vec_type, typename TypeVec<float, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ Luv2RGB() {}
+            __host__ __device__ __forceinline__ Luv2RGB(const Luv2RGB&) {}
+            // Takes a scn-channel float vector and returns the converted dcn-channel float vector.
+            __device__ __forceinline__ typename TypeVec<float, dcn>::vec_type operator ()(const typename TypeVec<float, scn>::vec_type& src) const
+            {
+                typename TypeVec<float, dcn>::vec_type rgb;
+                Luv2RGBConvert_f<srgb, blueIdx>(src, rgb);
+                return rgb;
+            }
+        };
+    }
+// Defines `template <typename T> struct <name>_traits`: functor_type is color_detail::Luv2RGB<T, scn, dcn, srgb, blueIdx>; create_functor() returns a default-constructed one.
+#define OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(name, scn, dcn, srgb, blueIdx) \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::Luv2RGB<T, scn, dcn, srgb, blueIdx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    #undef CV_DESCALE
+
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_COLOR_DETAIL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/reduce.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/reduce.hpp
new file mode 100644
index 000000000000..05a672c3dc36
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/reduce.hpp
@@ -0,0 +1,394 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_REDUCE_DETAIL_HPP
+#define OPENCV_CUDA_REDUCE_DETAIL_HPP
+
+#include <thrust/tuple.h>
+#include "../warp.hpp"
+#include "../warp_shuffle.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace reduce_detail
+    {
+        template <typename Handle> struct GetType; // primary template is only declared; the specializations below do the work
+        template <typename Elem> struct GetType<Elem*>
+        {
+            typedef Elem type; // pointer -> pointee
+        };
+        template <typename Elem> struct GetType<volatile Elem*>
+        {
+            typedef Elem type; // pointer to volatile -> pointee without the volatile qualifier
+        };
+        template <typename Elem> struct GetType<Elem&>
+        {
+            typedef Elem type; // reference -> referred-to type
+        };
+
+        template <unsigned int I, unsigned int N> // I: tuple slot processed by this step; For<N, N> ends the recursion
+        struct For
+        {
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
+            {
+                thrust::get<I>(smem)[tid] = thrust::get<I>(val); // slot I of val -> element tid of slot I of smem
+
+                For<I + 1, N>::loadToSmem(smem, val, tid);
+            }
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
+            {
+                thrust::get<I>(val) = thrust::get<I>(smem)[tid]; // inverse of loadToSmem for slot I
+
+                For<I + 1, N>::loadFromSmem(smem, val, tid);
+            }
+
+            template <class PointerTuple, class ValTuple, class OpTuple>
+            static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
+            {
+                typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta]; // partner element, delta slots further on
+                thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg); // combined result is kept in val and written to element tid
+
+                For<I + 1, N>::merge(smem, val, tid, delta, op);
+            }
+            template <class ValTuple, class OpTuple>
+            static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
+            {
+                typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width); // partner value comes from shfl_down rather than from smem
+                thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
+
+                For<I + 1, N>::mergeShfl(val, delta, width, op);
+            }
+        };
+        template <unsigned int End>
+        struct For<End, End> // recursion terminator: the slot index reached the tuple size, so every step is a no-op
+        {
+            template <class SmemTuple, class RegTuple>
+            static __device__ void loadToSmem(const SmemTuple&, const RegTuple&, unsigned int)
+            {
+            }
+            template <class SmemTuple, class RegTuple>
+            static __device__ void loadFromSmem(const SmemTuple&, const RegTuple&, unsigned int)
+            {
+            }
+
+            template <class SmemTuple, class RegTuple, class FnTuple>
+            static __device__ void merge(const SmemTuple&, const RegTuple&, unsigned int, unsigned int, const FnTuple&)
+            {
+            }
+            template <class RegTuple, class FnTuple>
+            static __device__ void mergeShfl(const RegTuple&, unsigned int, unsigned int, const FnTuple&)
+            {
+            }
+        };
+
+        template <typename T>
+        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid) // single-value overload: copies val into smem[tid]
+        {
+            smem[tid] = val;
+        }
+        template <typename T>
+        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid) // single-value overload: reads smem[tid] back into val
+        {
+            val = smem[tid];
+        }
+
+        template <typename T, class Op>
+        __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
+        {
+            T reg = smem[tid + delta]; // partner element, delta slots further on
+            smem[tid] = val = op(val, reg); // result goes to both val and smem[tid]
+        }
+
+        template <typename T, class Op>
+        __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
+        {
+            T reg = shfl_down(val, delta, width); // partner value obtained through shfl_down instead of smem
+            val = op(val, reg);
+        }
+
+#if (CUDART_VERSION < 12040) // details: https://github.com/opencv/opencv_contrib/issues/3690
+        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
+        __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                                       const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                                       unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid); // tuple overload: runs the per-slot store for every slot of smem
+        }
+
+        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
+        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                                         const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                                         unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid); // tuple overload: runs the per-slot load for every slot of smem
+        }
+
+        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
+                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
+        __device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                              const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                              unsigned int tid,
+                                              unsigned int delta,
+                                              const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op); // slot I is combined with slot I of op
+        }
+        template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
+                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
+        __device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                                  unsigned int delta,
+                                                  unsigned int width,
+                                                  const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op); // slot count is taken from val because no smem tuple is involved
+        }
+#else
+        template <typename... P, typename... R>
+        __device__ __forceinline__ void loadToSmem(const thrust::tuple<P...>& smem, const thrust::tuple<R...>& val, unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P...> >::value>::loadToSmem(smem, val, tid); // tuple overload: runs the per-slot store for every slot of smem
+        }
+
+        template <typename... P, typename... R>
+        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P...>& smem, const thrust::tuple<R...>& val, unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P...> >::value>::loadFromSmem(smem, val, tid); // tuple overload: runs the per-slot load for every slot of smem
+        }
+
+        template <typename... P, typename... R, class... Op>
+        __device__ __forceinline__ void merge(const thrust::tuple<P...>& smem, const thrust::tuple<R...>& val, unsigned int tid, unsigned int delta, const thrust::tuple<Op...>& op)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P...> >::value>::merge(smem, val, tid, delta, op); // slot I is combined with slot I of op
+        }
+
+        template <typename... R, class... Op>
+        __device__ __forceinline__ void mergeShfl(const thrust::tuple<R...>& val, unsigned int delta, unsigned int width, const thrust::tuple<Op...>& op)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<R...> >::value>::mergeShfl(val, delta, width, op); // slot count is taken from val because no smem tuple is involved
+        }
+#endif
+        template <unsigned int N> struct Generic // reduction through smem; N is compared with the thresholds below to pick the steps
+        {
+            template <typename Pointer, typename Reference, class Op>
+            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+                loadToSmem(smem, val, tid); // every caller stores its value at index tid
+                if (N >= 32)
+                    __syncthreads();
+
+                if (N >= 2048)
+                {
+                    if (tid < 1024) // threads below 1024 fold in the element 1024 slots further on
+                        merge(smem, val, tid, 1024, op);
+
+                    __syncthreads(); // barrier sits outside the tid check, so all callers reach it
+                }
+                if (N >= 1024)
+                {
+                    if (tid < 512)
+                        merge(smem, val, tid, 512, op);
+
+                    __syncthreads();
+                }
+                if (N >= 512)
+                {
+                    if (tid < 256)
+                        merge(smem, val, tid, 256, op);
+
+                    __syncthreads();
+                }
+                if (N >= 256)
+                {
+                    if (tid < 128)
+                        merge(smem, val, tid, 128, op);
+
+                    __syncthreads();
+                }
+                if (N >= 128)
+                {
+                    if (tid < 64)
+                        merge(smem, val, tid, 64, op);
+
+                    __syncthreads();
+                }
+                if (N >= 64) // unlike the steps above, no __syncthreads() follows this one
+                {
+                    if (tid < 32)
+                        merge(smem, val, tid, 32, op);
+                }
+                // NOTE(review): the steps below have no barrier between them and always use deltas 16..1, and each step above covers only one power-of-two span; presumably this assumes lockstep execution of the first 32 threads and a power-of-two N >= 32 - confirm against callers
+                if (tid < 16)
+                {
+                    merge(smem, val, tid, 16, op);
+                    merge(smem, val, tid, 8, op);
+                    merge(smem, val, tid, 4, op);
+                    merge(smem, val, tid, 2, op);
+                    merge(smem, val, tid, 1, op);
+                }
+            }
+        };
+
+        template <unsigned int I, typename Pointer, typename Reference, class Op> // I: current delta, halved on every recursion step until Unroll<0, ...>
+        struct Unroll
+        {
+            static __device__ void loopShfl(Reference val, Op op, unsigned int N)
+            {
+                mergeShfl(val, I, N, op); // delta = I, width = N
+                Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
+            }
+            static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+                merge(smem, val, tid, I, op); // delta = I
+                Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
+            }
+        };
+        template <typename Pointer, typename Reference, class Op>
+        struct Unroll<0, Pointer, Reference, Op> // recursion terminator: delta reached 0, nothing left to merge
+        {
+            static __device__ void loopShfl(Reference, Op, unsigned int)
+            {
+            }
+            static __device__ void loop(Pointer, Reference, unsigned int, Op)
+            {
+            }
+        };
+
+        template <unsigned int N> struct WarpOptimized // selected by Dispatcher for power-of-two N up to 32
+        {
+            template <typename Pointer, typename Reference, class Op>
+            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+            #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 // shuffle path: smem and tid are not used
+                CV_UNUSED(smem);
+                CV_UNUSED(tid);
+
+                Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N); // deltas N/2, N/4, ..., 1 with width N
+            #else // fallback path through smem
+                loadToSmem(smem, val, tid);
+
+                if (tid < N / 2) // only the lower half of the callers runs the merge steps
+                    Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
+            #endif
+            }
+        };
+
+        template <unsigned int N> struct GenericOptimized32 // selected by Dispatcher for power-of-two N above 32 and up to 1024
+        {
+            enum { M = N / 32 }; // number of partial results handled by the second stage
+
+            template <typename Pointer, typename Reference, class Op>
+            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+                const unsigned int laneId = Warp::laneId();
+
+            #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+                Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize); // stage 1: shuffle steps with deltas 16, 8, 4, 2, 1
+
+                if (laneId == 0)
+                    loadToSmem(smem, val, tid / 32); // lane 0 publishes its value at index tid / 32
+            #else
+                loadToSmem(smem, val, tid); // stage 1 without shuffles: same deltas, but through smem
+
+                if (laneId < 16)
+                    Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);
+
+                __syncthreads(); // all stage-1 merges finish before smem[tid / 32] is overwritten
+
+                if (laneId == 0)
+                    loadToSmem(smem, val, tid / 32);
+            #endif
+
+                __syncthreads(); // partial results are in place before anyone reads them back
+
+                loadFromSmem(smem, val, tid);
+
+                if (tid < 32) // stage 2: only the first 32 callers combine the M partial results
+                {
+                #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+                    Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
+                #else
+                    Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
+                #endif
+                }
+            }
+        };
+
+        template <bool Cond, class Then, class Else> struct StaticIf; // compile-time type selector; only the two specializations are defined
+        template <class Then, class Else> struct StaticIf<true, Then, Else>
+        {
+            typedef Then type;
+        };
+        template <class Then, class Else> struct StaticIf<false, Then, Else>
+        {
+            typedef Else type;
+        };
+
+        template <unsigned int N> struct IsPowerOf2
+        {
+            enum { value = (N != 0 && (N & (N - 1)) == 0) }; // 1 when exactly one bit of N is set, 0 otherwise
+        };
+
+        template <unsigned int N> struct Dispatcher // picks the reduction implementation for a compile-time N
+        {
+            typedef typename StaticIf<
+                (N <= 32) && IsPowerOf2<N>::value, // power of two up to 32 -> WarpOptimized
+                WarpOptimized<N>,
+                typename StaticIf<
+                    (N <= 1024) && IsPowerOf2<N>::value, // remaining powers of two up to 1024 -> GenericOptimized32
+                    GenericOptimized32<N>,
+                    Generic<N> // everything else
+                >::type
+            >::type reductor;
+        };
+    }
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_REDUCE_DETAIL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/reduce_key_val.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/reduce_key_val.hpp
new file mode 100644
index 000000000000..4a248c83657e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/reduce_key_val.hpp
@@ -0,0 +1,567 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
+#define OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
+
+#include <thrust/tuple.h>
+#include "../warp.hpp"
+#include "../warp_shuffle.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace reduce_key_val_detail
+    {
+        template <typename Handle> struct GetType; // primary template is only declared; the specializations below do the work
+        template <typename Elem> struct GetType<Elem*>
+        {
+            typedef Elem type; // pointer -> pointee
+        };
+        template <typename Elem> struct GetType<volatile Elem*>
+        {
+            typedef Elem type; // pointer to volatile -> pointee without the volatile qualifier
+        };
+        template <typename Elem> struct GetType<Elem&>
+        {
+            typedef Elem type; // reference -> referred-to type
+        };
+
+        template <unsigned int I, unsigned int N> // I: tuple slot processed by this step; For<N, N> ends the recursion
+        struct For
+        {
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
+            {
+                thrust::get<I>(smem)[tid] = thrust::get<I>(data); // slot I of data -> element tid of slot I of smem
+
+                For<I + 1, N>::loadToSmem(smem, data, tid);
+            }
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
+            {
+                thrust::get<I>(data) = thrust::get<I>(smem)[tid]; // inverse of loadToSmem for slot I
+
+                For<I + 1, N>::loadFromSmem(smem, data, tid);
+            }
+
+            template <class ReferenceTuple>
+            static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
+            {
+                thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width); // overwrite slot I with the shuffled value
+
+                For<I + 1, N>::copyShfl(val, delta, width);
+            }
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
+            {
+                thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta]; // take over the element delta slots further on
+
+                For<I + 1, N>::copy(svals, val, tid, delta);
+            }
+
+            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
+            static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
+            {
+                typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width);
+                typename GetType<typename thrust::tuple_element<I, ValReferenceTuple>::type>::type regVal = shfl_down(thrust::get<I>(val), delta, width);
+                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key))) // both shuffles are issued before this per-lane branch so that every calling lane takes part in them
+                {
+                    thrust::get<I>(key) = reg;
+                    thrust::get<I>(val) = regVal;
+                }
+
+                For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
+            }
+            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
+            static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
+                                         const ValPointerTuple& svals, const ValReferenceTuple& val,
+                                         const CmpTuple& cmp,
+                                         unsigned int tid, unsigned int delta)
+            {
+                typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta]; // candidate key, delta slots further on
+
+                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key))) // candidate wins: take over its key and the value stored next to it
+                {
+                    thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
+                    thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
+                }
+
+                For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
+            }
+        };
+        template <unsigned int End>
+        struct For<End, End> // recursion terminator: the slot index reached the tuple size, so every step is a no-op
+        {
+            template <class SmemTuple, class RefTuple>
+            static __device__ void loadToSmem(const SmemTuple&, const RefTuple&, unsigned int)
+            {
+            }
+            template <class SmemTuple, class RefTuple>
+            static __device__ void loadFromSmem(const SmemTuple&, const RefTuple&, unsigned int)
+            {
+            }
+
+            template <class RefTuple>
+            static __device__ void copyShfl(const RefTuple&, unsigned int, int)
+            {
+            }
+            template <class SmemTuple, class RefTuple>
+            static __device__ void copy(const SmemTuple&, const RefTuple&, unsigned int, unsigned int)
+            {
+            }
+
+            template <class KeyRefTuple, class ValRefTuple, class PredTuple>
+            static __device__ void mergeShfl(const KeyRefTuple&, const ValRefTuple&, const PredTuple&, unsigned int, int)
+            {
+            }
+            template <class KeySmemTuple, class KeyRefTuple, class ValSmemTuple, class ValRefTuple, class PredTuple>
+            static __device__ void merge(const KeySmemTuple&, const KeyRefTuple&,
+                                         const ValSmemTuple&, const ValRefTuple&,
+                                         const PredTuple&,
+                                         unsigned int, unsigned int)
+            {
+            }
+        };
+
+        //////////////////////////////////////////////////////
+        // loadToSmem / loadFromSmem
+
+        template <typename T>
+        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid) // single-value overload: copies data into smem[tid]
+        {
+            smem[tid] = data;
+        }
+        template <typename T>
+        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid) // single-value overload: reads smem[tid] back into data
+        {
+            data = smem[tid];
+        }
+
+#if (CUDART_VERSION < 12040) // details: https://github.com/opencv/opencv_contrib/issues/3690
+        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
+                                                   const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
+                                                   unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid); // tuple overload: runs the per-slot store for every slot of smem
+        }
+        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
+                                                     const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
+                                                     unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid); // tuple overload: runs the per-slot load for every slot of smem
+        }
+#else
+        template <typename... VP, typename... VR>
+        __device__ __forceinline__ void loadToSmem(const thrust::tuple<VP...>& smem, const thrust::tuple<VR...>& data, unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP...> >::value>::loadToSmem(smem, data, tid); // variadic form of the tuple overload above
+        }
+        template <typename... VP, typename... VR>
+        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP...>& smem, const thrust::tuple<VR...>& data, unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP...> >::value>::loadFromSmem(smem, data, tid); // variadic form of the tuple overload above
+        }
+#endif
+
+        template <typename V>
+        __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width) // single-value overload: replaces val with the shuffled value
+        {
+            val = shfl_down(val, delta, width);
+        }
+        template <typename V>
+        __device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta) // single-value overload working on svals
+        {
+            svals[tid] = val = svals[tid + delta]; // element delta slots further on is copied into val and svals[tid]
+        }
+
+        template <typename K, typename V, class Cmp> // keeps (key, val) or replaces both with the shuffled pair, depending on cmp(shuffledKey, key)
+        __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
+        {
+            K reg = shfl_down(key, delta, width);
+            V regVal = shfl_down(val, delta, width); // fetched up front: the branch below is taken per lane, so a shuffle inside it would not be executed by every caller
+            if (cmp(reg, key)) // adopt the shuffled pair only when cmp prefers the shuffled key
+            {
+                key = reg;
+                val = regVal;
+            }
+        }
+        template <typename K, typename V, class Cmp>
+        __device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
+        {
+            K reg = skeys[tid + delta]; // candidate key, delta slots further on
+
+            if (cmp(reg, key)) // candidate wins: take over its key and the value stored next to it
+            {
+                skeys[tid] = key = reg;
+                copyVals(svals, val, tid, delta);
+            }
+        }
+
+#if (CUDART_VERSION < 12040) // details: https://github.com/opencv/opencv_contrib/issues/3690
+        template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                     unsigned int delta,
+                                                     int width)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width); // tuple overload: shuffles every slot of val
+        }
+        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                 unsigned int tid, unsigned int delta)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta); // tuple overload: copies every slot from index tid + delta
+        }
+
+        template <typename K,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+                  class Cmp>
+        __device__ __forceinline__ void mergeShfl(K& key,
+                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                  const Cmp& cmp,
+                                                  unsigned int delta, int width)
+        {
+            K reg = shfl_down(key, delta, width); // single key, tuple of values: the key is shuffled by every caller
+
+            if (cmp(reg, key))
+            {
+                key = reg;
+                copyValsShfl(val, delta, width); // NOTE(review): the value shuffle runs only on lanes where cmp() held - confirm shfl_down copes with lanes that skipped this branch
+            }
+        }
+        template <typename K,
+                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+                  class Cmp>
+        __device__ __forceinline__ void merge(volatile K* skeys, K& key,
+                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                              const Cmp& cmp, unsigned int tid, unsigned int delta)
+        {
+            K reg = skeys[tid + delta]; // candidate key, delta slots further on
+
+            if (cmp(reg, key)) // candidate wins: take over its key and every value slot stored next to it
+            {
+                skeys[tid] = key = reg;
+                copyVals(svals, val, tid, delta);
+            }
+        }
+        template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
+        __device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
+                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                  const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
+                                                  unsigned int delta, int width)
+        { // Tuple-of-keys overload: forwards to For<0, size>::mergeShfl, with size taken from the key tuple type.
+            For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width); // For<> is defined outside this block
+        }
+        template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
+                  typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
+                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
+        __device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
+                                              const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
+                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                              const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
+                                              unsigned int tid, unsigned int delta)
+        { // Tuple-of-keys overload: forwards to For<0, size>::merge. Note that size comes from the svals tuple type, not from the key tuple.
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta); // For<> is defined outside this block
+        }
+#else
+        template <typename... VR>
+        __device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR...>& val, unsigned int delta, int width)
+        { // Variadic tuple overload: forwards to For<0, size>::copyShfl, with size taken from val's tuple type.
+            For<0, thrust::tuple_size<thrust::tuple<VR...> >::value>::copyShfl(val, delta, width); // For<> is defined outside this block
+        }
+        template <typename... VP, typename... VR>
+        __device__ __forceinline__ void copyVals(const thrust::tuple<VP...>& svals, const thrust::tuple<VR...>& val, unsigned int tid, unsigned int delta)
+        { // Variadic tuple overload: forwards to For<0, size>::copy, with size taken from svals' tuple type.
+            For<0, thrust::tuple_size<thrust::tuple<VP...> >::value>::copy(svals, val, tid, delta); // For<> is defined outside this block
+        }
+
+        template <typename K, typename... VR, class Cmp>
+        __device__ __forceinline__ void mergeShfl(K& key, const thrust::tuple<VR...>& val, const Cmp& cmp, unsigned int delta, int width)
+        { // Single-key merge step: the candidate key is whatever shfl_down(key, delta, width) returns (helper defined elsewhere).
+            K reg = shfl_down(key, delta, width);
+            // cmp(candidate, current) == true means the candidate replaces the current key.
+            if (cmp(reg, key))
+            {
+                key = reg;
+                copyValsShfl(val, delta, width); // update the value tuple alongside the key
+            }
+        }
+        template <typename K, typename... VP, typename... VR, class Cmp>
+        __device__ __forceinline__ void merge(volatile K* skeys, K& key, const thrust::tuple<VP...>& svals,
+                                              const thrust::tuple<VR...>& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
+        { // Single-key merge step through the skeys buffer: the candidate is the entry delta slots after tid.
+            K reg = skeys[tid + delta];
+            // No bounds check is done here: the caller must make sure tid + delta is a valid index into skeys.
+            if (cmp(reg, key))
+            {
+                skeys[tid] = key = reg; // the winner is written to both the caller's key and slot tid
+                copyVals(svals, val, tid, delta); // update the values alongside the key
+            }
+        }
+        template <typename... KR, typename... VR, class... Cmp>
+        __device__ __forceinline__ void mergeShfl(const thrust::tuple<KR...>& key,
+                                                  const thrust::tuple<VR...>& val,
+                                                  const thrust::tuple<Cmp...>& cmp,
+                                                  unsigned int delta, int width)
+        { // Tuple-of-keys overload: forwards to For<0, size>::mergeShfl, with size taken from the key tuple type.
+            For<0, thrust::tuple_size<thrust::tuple<KR...> >::value>::mergeShfl(key, val, cmp, delta, width); // For<> is defined outside this block
+        }
+        template <typename... KP, typename... KR, typename... VP, typename... VR, class... Cmp>
+        __device__ __forceinline__ void merge(const thrust::tuple<KP...>& skeys,
+                                              const thrust::tuple<KR...>& key,
+                                              const thrust::tuple<VP...>& svals,
+                                              const thrust::tuple<VR...>& val,
+                                              const thrust::tuple<Cmp...>& cmp,
+                                              unsigned int tid, unsigned int delta)
+        { // Tuple-of-keys overload: forwards to For<0, size>::merge. Note that size comes from the svals tuple type, not from the key tuple.
+            For<0, thrust::tuple_size<thrust::tuple<VP...> >::value>::merge(skeys, key, svals, val, cmp, tid, delta); // For<> is defined outside this block
+        }
+
+#endif
+        //////////////////////////////////////////////////////
+        // Generic
+
+        template <unsigned int N> struct Generic // fallback reductor: Dispatcher picks it when N is not a power of two or N > 1024
+        {
+            template <class KP, class KR, class VP, class VR, class Cmp>
+            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+            {
+                loadToSmem(skeys, key, tid); // helper defined elsewhere; presumably stores key into slot tid of skeys
+                loadValsToSmem(svals, val, tid); // helper defined elsewhere; presumably stores val into slot tid of svals
+                if (N >= 32)
+                    __syncthreads();
+                // Halving steps: a step with stride d is compiled in only when N >= 2 * d, and merges slot tid + d into slot tid for tid < d.
+                if (N >= 2048)
+                {
+                    if (tid < 1024)
+                        merge(skeys, key, svals, val, cmp, tid, 1024);
+                    // the barrier sits outside the tid guard, so every thread that calls reduce reaches it
+                    __syncthreads();
+                }
+                if (N >= 1024)
+                {
+                    if (tid < 512)
+                        merge(skeys, key, svals, val, cmp, tid, 512);
+
+                    __syncthreads();
+                }
+                if (N >= 512)
+                {
+                    if (tid < 256)
+                        merge(skeys, key, svals, val, cmp, tid, 256);
+
+                    __syncthreads();
+                }
+                if (N >= 256)
+                {
+                    if (tid < 128)
+                        merge(skeys, key, svals, val, cmp, tid, 128);
+
+                    __syncthreads();
+                }
+                if (N >= 128)
+                {
+                    if (tid < 64)
+                        merge(skeys, key, svals, val, cmp, tid, 64);
+
+                    __syncthreads();
+                }
+                if (N >= 64)
+                {
+                    if (tid < 32)
+                        merge(skeys, key, svals, val, cmp, tid, 32); // no __syncthreads() follows from this step onwards
+                }
+                // NOTE(review): the tail below is not guarded by N, so with N < 32 the stride-16 step still reads slots up to index 31; also, for non-power-of-two N the slots above the largest compiled-in stride pair are never merged - confirm callers.
+                if (tid < 16)
+                {
+                    merge(skeys, key, svals, val, cmp, tid, 16); // NOTE(review): presumably relies on warp-synchronous execution between these steps - confirm
+                    merge(skeys, key, svals, val, cmp, tid, 8);
+                    merge(skeys, key, svals, val, cmp, tid, 4);
+                    merge(skeys, key, svals, val, cmp, tid, 2);
+                    merge(skeys, key, svals, val, cmp, tid, 1); // after this step the merged result is in slot 0 / thread 0's key and val
+                }
+            }
+        };
+
+        template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
+        struct Unroll // compile-time loop: performs a merge step with stride I, then recurses with I / 2 until I reaches 0
+        {
+            static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
+            {
+                mergeShfl(key, val, cmp, I, N); // I is passed as delta, N as width
+                Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
+            }
+            static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+            {
+                merge(skeys, key, svals, val, cmp, tid, I); // I is passed as delta
+                Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
+            }
+        };
+        template <class KP, class KR, class VP, class VR, class Cmp>
+        struct Unroll<0, KP, KR, VP, VR, Cmp> // recursion terminator: stride 0 does nothing
+        {
+            static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
+            {
+            }
+            static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
+            {
+            }
+        };
+
+        template <unsigned int N> struct WarpOptimized // Dispatcher picks this when N <= 32 and N is a power of two
+        {
+            template <class KP, class KR, class VP, class VR, class Cmp>
+            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+            {
+            #if 0 // __CUDA_ARCH__ >= 300 (shuffle-based path; disabled, this branch is never compiled)
+                CV_UNUSED(skeys);
+                CV_UNUSED(svals);
+                CV_UNUSED(tid);
+
+                Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
+            #else
+                loadToSmem(skeys, key, tid); // helper defined elsewhere; presumably stores key into slot tid
+                loadToSmem(svals, val, tid); // values go through loadToSmem here (Generic uses loadValsToSmem)
+                // No __syncthreads() is issued on this path; only threads with tid < N / 2 run the unrolled merge steps N/2, N/4, ..., 1.
+                if (tid < N / 2)
+                    Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
+            #endif
+            }
+        };
+
+        template <unsigned int N> struct GenericOptimized32 // Dispatcher picks this when 32 < N <= 1024 and N is a power of two
+        {
+            enum { M = N / 32 }; // number of groups of 32 slots; also the size of the second-stage reduction
+
+            template <class KP, class KR, class VP, class VR, class Cmp>
+            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+            {
+                const unsigned int laneId = Warp::laneId(); // helper defined elsewhere; presumably the thread's index within its warp
+
+            #if 0 // __CUDA_ARCH__ >= 300 (shuffle-based path; disabled, this branch is never compiled)
+                Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);
+
+                if (laneId == 0)
+                {
+                    loadToSmem(skeys, key, tid / 32);
+                    loadToSmem(svals, val, tid / 32);
+                }
+            #else
+                loadToSmem(skeys, key, tid);
+                loadToSmem(svals, val, tid);
+                // Stage 1: lanes 0..15 run merge steps with strides 16, 8, 4, 2, 1 on the slots starting at their own tid.
+                if (laneId < 16)
+                    Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
+
+                __syncthreads();
+                // Lane 0 stores its key/val into slot tid / 32, so the per-group results end up in slots 0..M-1.
+                if (laneId == 0)
+                {
+                    loadToSmem(skeys, key, tid / 32);
+                    loadToSmem(svals, val, tid / 32);
+                }
+            #endif
+
+                __syncthreads();
+                // Every thread reloads its key from slot tid; on the active path val is not reloaded (that call exists only in the disabled branch).
+                loadFromSmem(skeys, key, tid);
+                // Stage 2: threads 0..31 run merge steps with strides M/2, ..., 1 over the compacted slots.
+                if (tid < 32)
+                {
+                #if 0 // __CUDA_ARCH__ >= 300 (disabled)
+                    loadFromSmem(svals, val, tid);
+
+                    Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
+                #else
+                    Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
+                #endif
+                }
+            }
+        };
+
+        template <bool val, class T1, class T2> struct StaticIf; // compile-time type selection: ::type is T1 when val is true, T2 otherwise
+        template <class T1, class T2> struct StaticIf<true, T1, T2>
+        {
+            typedef T1 type; // condition is true: select the first type
+        };
+        template <class T1, class T2> struct StaticIf<false, T1, T2>
+        {
+            typedef T2 type; // condition is false: select the second type
+        };
+
+        template <unsigned int N> struct IsPowerOf2 // compile-time test: value is nonzero only when N is a power of two
+        {
+            enum { value = ((N != 0) && !(N & (N - 1))) }; // N & (N - 1) clears the lowest set bit, so it is 0 only if at most one bit is set; N != 0 excludes zero
+        };
+
+        template <unsigned int N> struct Dispatcher // selects the reduction implementation for a given N at compile time
+        {
+            typedef typename StaticIf<
+                (N <= 32) && IsPowerOf2<N>::value, // power of two up to 32: WarpOptimized
+                WarpOptimized<N>,
+                typename StaticIf<
+                    (N <= 1024) && IsPowerOf2<N>::value, // larger power of two up to 1024: GenericOptimized32
+                    GenericOptimized32<N>,
+                    Generic<N> // everything else (non-power-of-two N, or N > 1024)
+                >::type
+            >::type reductor;
+        };
+    }
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/transform_detail.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/transform_detail.hpp
new file mode 100644
index 000000000000..191984882754
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/transform_detail.hpp
@@ -0,0 +1,392 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_TRANSFORM_DETAIL_HPP
+#define OPENCV_CUDA_TRANSFORM_DETAIL_HPP
+
+#include "../common.hpp"
+#include "../vec_traits.hpp"
+#include "../functional.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace transform_detail
+    {
+        //! Read Write Traits
+
+        template <typename T, typename D, int shift> struct UnaryReadWriteTraits // types used by the unary transformSmart kernel for its grouped read and write
+        {
+            typedef typename TypeVec<T, shift>::vec_type read_type; // TypeVec is declared elsewhere; presumably a vector of `shift` T elements
+            typedef typename TypeVec<D, shift>::vec_type write_type; // same, for the destination element type D
+        };
+
+        template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits // types used by the binary transformSmart kernel for its grouped reads and write
+        {
+            typedef typename TypeVec<T1, shift>::vec_type read_type1; // TypeVec is declared elsewhere; presumably a vector of `shift` T1 elements
+            typedef typename TypeVec<T2, shift>::vec_type read_type2; // same, for the second source type T2
+            typedef typename TypeVec<D, shift>::vec_type write_type; // same, for the destination element type D
+        };
+
+        //! Transform kernels
+
+        template <int shift> struct OpUnroller; // applies op to each of `shift` components; specialized below for 1, 2, 3, 4 and 8
+        template <> struct OpUnroller<1>
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
+            { // unary: dst.x is written only if mask(y, x_shifted) is true; op is taken by non-const reference here
+                if (mask(y, x_shifted))
+                    dst.x = op(src.x);
+            }
+            // Binary form: same masking rule, op receives the matching component of both sources.
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.x = op(src1.x, src2.x);
+            }
+        };
+        template <> struct OpUnroller<2> // two components: .x at column x_shifted, .y at column x_shifted + 1
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
+            { // unary: each component is written only if the mask is true at its own column; op is taken by non-const reference here
+                if (mask(y, x_shifted))
+                    dst.x = op(src.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src.y);
+            }
+            // Binary form: same masking rule, op receives the matching component of both sources.
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.x = op(src1.x, src2.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src1.y, src2.y);
+            }
+        };
+        template <> struct OpUnroller<3> // three components: .x, .y, .z at columns x_shifted, x_shifted + 1, x_shifted + 2
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
+            { // unary: each component is written only if the mask is true at its own column
+                if (mask(y, x_shifted))
+                    dst.x = op(src.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src.y);
+                if (mask(y, x_shifted + 2))
+                    dst.z = op(src.z);
+            }
+            // Binary form: same masking rule, op receives the matching component of both sources.
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.x = op(src1.x, src2.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src1.y, src2.y);
+                if (mask(y, x_shifted + 2))
+                    dst.z = op(src1.z, src2.z);
+            }
+        };
+        template <> struct OpUnroller<4> // four components: .x, .y, .z, .w at columns x_shifted .. x_shifted + 3
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
+            { // unary: each component is written only if the mask is true at its own column
+                if (mask(y, x_shifted))
+                    dst.x = op(src.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src.y);
+                if (mask(y, x_shifted + 2))
+                    dst.z = op(src.z);
+                if (mask(y, x_shifted + 3))
+                    dst.w = op(src.w);
+            }
+            // Binary form: same masking rule, op receives the matching component of both sources.
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.x = op(src1.x, src2.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src1.y, src2.y);
+                if (mask(y, x_shifted + 2))
+                    dst.z = op(src1.z, src2.z);
+                if (mask(y, x_shifted + 3))
+                    dst.w = op(src1.w, src2.w);
+            }
+        };
+        template <> struct OpUnroller<8> // eight components: members .a0 .. .a7 at columns x_shifted .. x_shifted + 7
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
+            { // unary: each component is written only if the mask is true at its own column
+                if (mask(y, x_shifted))
+                    dst.a0 = op(src.a0);
+                if (mask(y, x_shifted + 1))
+                    dst.a1 = op(src.a1);
+                if (mask(y, x_shifted + 2))
+                    dst.a2 = op(src.a2);
+                if (mask(y, x_shifted + 3))
+                    dst.a3 = op(src.a3);
+                if (mask(y, x_shifted + 4))
+                    dst.a4 = op(src.a4);
+                if (mask(y, x_shifted + 5))
+                    dst.a5 = op(src.a5);
+                if (mask(y, x_shifted + 6))
+                    dst.a6 = op(src.a6);
+                if (mask(y, x_shifted + 7))
+                    dst.a7 = op(src.a7);
+            }
+            // Binary form: same masking rule, op receives the matching component of both sources.
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.a0 = op(src1.a0, src2.a0);
+                if (mask(y, x_shifted + 1))
+                    dst.a1 = op(src1.a1, src2.a1);
+                if (mask(y, x_shifted + 2))
+                    dst.a2 = op(src1.a2, src2.a2);
+                if (mask(y, x_shifted + 3))
+                    dst.a3 = op(src1.a3, src2.a3);
+                if (mask(y, x_shifted + 4))
+                    dst.a4 = op(src1.a4, src2.a4);
+                if (mask(y, x_shifted + 5))
+                    dst.a5 = op(src1.a5, src2.a5);
+                if (mask(y, x_shifted + 6))
+                    dst.a6 = op(src1.a6, src2.a6);
+                if (mask(y, x_shifted + 7))
+                    dst.a7 = op(src1.a7, src2.a7);
+            }
+        };
+
+        template <typename T, typename D, typename UnOp, typename Mask>
+        static __global__ void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
+        { // Unary kernel: each thread handles ft::smart_shift consecutive elements of row y, starting at column x_shifted.
+            typedef TransformFunctorTraits<UnOp> ft;
+            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
+            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;
+            // x indexes groups of smart_shift elements; x_shifted is the column of the group's first element.
+            const int x = threadIdx.x + blockIdx.x * blockDim.x;
+            const int y = threadIdx.y + blockIdx.y * blockDim.y;
+            const int x_shifted = x * ft::smart_shift;
+            // Rows beyond src_.rows are skipped.
+            if (y < src_.rows)
+            {
+                const T* src = src_.ptr(y);
+                D* dst = dst_.ptr(y);
+                // Whole group lies inside the row: read it through a read_type cast and let OpUnroller write the masked components.
+                if (x_shifted + ft::smart_shift - 1 < src_.cols)
+                {
+                    const read_type src_n_el = ((const read_type*)src)[x]; // the cast relies on the alignment checks made in TransformDispatcher<true>::call
+                    OpUnroller<ft::smart_shift>::unroll(src_n_el, ((write_type*)dst)[x], mask, op, x_shifted, y);
+                }
+                else
+                { // group crosses the row end: handle the remaining columns one element at a time
+                    for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
+                    {
+                        if (mask(y, real_x))
+                            dst[real_x] = op(src[real_x]);
+                    }
+                }
+            }
+        }
+
+        template <typename T, typename D, typename UnOp, typename Mask>
+        __global__ static void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
+        { // Unary kernel, one element per thread: dst(y, x) = op(src(y, x)) wherever mask(y, x) is true.
+            const int x = blockDim.x * blockIdx.x + threadIdx.x;
+            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+            // The bounds test comes first, so mask is only evaluated for coordinates inside src.
+            if (x < src.cols && y < src.rows && mask(y, x))
+            {
+                dst.ptr(y)[x] = op(src.ptr(y)[x]);
+            }
+        }
+
+        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+        static __global__ void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
+            const Mask mask, const BinOp op)
+        { // Binary kernel: each thread handles ft::smart_shift consecutive elements of row y; bounds are taken from src1_ only.
+            typedef TransformFunctorTraits<BinOp> ft;
+            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
+            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
+            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;
+            // x indexes groups of smart_shift elements; x_shifted is the column of the group's first element.
+            const int x = threadIdx.x + blockIdx.x * blockDim.x;
+            const int y = threadIdx.y + blockIdx.y * blockDim.y;
+            const int x_shifted = x * ft::smart_shift;
+            // Rows beyond src1_.rows are skipped.
+            if (y < src1_.rows)
+            {
+                const T1* src1 = src1_.ptr(y);
+                const T2* src2 = src2_.ptr(y);
+                D* dst = dst_.ptr(y);
+                // Whole group lies inside the row: read both sources through vector-type casts and let OpUnroller write the masked components.
+                if (x_shifted + ft::smart_shift - 1 < src1_.cols)
+                {
+                    const read_type1 src1_n_el = ((const read_type1*)src1)[x]; // the casts rely on the alignment checks made in TransformDispatcher<true>::call
+                    const read_type2 src2_n_el = ((const read_type2*)src2)[x];
+                    // dst is passed as a write_type reference; only components whose mask is true get assigned
+                    OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, ((write_type*)dst)[x], mask, op, x_shifted, y);
+                }
+                else
+                { // group crosses the row end: handle the remaining columns one element at a time
+                    for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
+                    {
+                        if (mask(y, real_x))
+                            dst[real_x] = op(src1[real_x], src2[real_x]);
+                    }
+                }
+            }
+        }
+
+        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+        static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
+            const Mask mask, const BinOp op)
+        { // Binary kernel, one element per thread: dst(y, x) = op(src1(y, x), src2(y, x)) wherever mask(y, x) is true.
+            const int x = blockDim.x * blockIdx.x + threadIdx.x;
+            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+            // Bounds come from src1 only; the bounds test runs before mask is evaluated.
+            if (x < src1.cols && y < src1.rows && mask(y, x))
+            {
+                const T1 src1_data = src1.ptr(y)[x];
+                const T2 src2_data = src2.ptr(y)[x];
+                dst.ptr(y)[x] = op(src1_data, src2_data);
+            }
+        }
+
+        template <bool UseSmart> struct TransformDispatcher; // host-side launcher; <false> launches transformSimple, <true> tries transformSmart
+        template<> struct TransformDispatcher<false>
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
+            { // unary: one thread per element, block size taken from TransformFunctorTraits<UnOp>
+                typedef TransformFunctorTraits<UnOp> ft;
+                // the grid is sized from src, rounding up so partial blocks at the edges are covered
+                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
+                const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);
+                // note the argument order: call() takes (op, mask) but the kernel takes (mask, op)
+                transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
+                cudaSafeCall( cudaGetLastError() );
+                // on stream 0 the call waits for the device before returning; on any other stream it returns without that wait
+                if (stream == 0)
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+            // Binary form: same launch scheme, sized from src1.
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
+            {
+                typedef TransformFunctorTraits<BinOp> ft;
+
+                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
+                const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);
+
+                transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
+                cudaSafeCall( cudaGetLastError() );
+
+                if (stream == 0)
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
+        template<> struct TransformDispatcher<true> // launches transformSmart, falling back to TransformDispatcher<false> when alignment checks fail
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
+            {
+                typedef TransformFunctorTraits<UnOp> ft;
+                // compile-time guard: this specialization must not be instantiated with smart_shift == 1
+                CV_StaticAssert(ft::smart_shift != 1, "");
+                // both the base pointer and the row step of src and dst must be aligned to smart_shift elements; otherwise use the simple kernel
+                if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) ||
+                    !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
+                {
+                    TransformDispatcher<false>::call(src, dst, op, mask, stream);
+                    return;
+                }
+                // each thread covers smart_shift columns, so the grid's x extent is divided by threads.x * smart_shift
+                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
+                const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);
+                // note the argument order: call() takes (op, mask) but the kernel takes (mask, op)
+                transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
+                cudaSafeCall( cudaGetLastError() );
+                // on stream 0 the call waits for the device before returning
+                if (stream == 0)
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+            // Binary form: same scheme, with the alignment checks extended to src2 and sizes taken from src1.
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
+            {
+                typedef TransformFunctorTraits<BinOp> ft;
+
+                CV_StaticAssert(ft::smart_shift != 1, "");
+
+                if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src1.step, ft::smart_shift * sizeof(T1)) ||
+                    !isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(src2.step, ft::smart_shift * sizeof(T2)) ||
+                    !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
+                {
+                    TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
+                    return;
+                }
+
+                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
+                const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);
+
+                transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
+                cudaSafeCall( cudaGetLastError() );
+
+                if (stream == 0)
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
+    } // namespace transform_detail
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_TRANSFORM_DETAIL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/type_traits_detail.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/type_traits_detail.hpp
new file mode 100644
index 000000000000..a78bd2c0d858
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/type_traits_detail.hpp
@@ -0,0 +1,191 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
+#define OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
+
+#include "../common.hpp"
+#include "../vec_traits.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace type_traits_detail
+    {
+        template <bool, typename T1, typename T2> struct Select { typedef T1 type; }; // compile-time conditional: the primary template yields T1
+        template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; }; // a false condition yields T2
+
+        template <typename T> struct IsSignedIntergral { enum {value = 0}; }; // default: not a listed signed integral type ("Intergral" is the actual identifier; IsIntegral refers to it)
+        template <> struct IsSignedIntergral<schar> { enum {value = 1}; }; // value = 1 for schar, short, int and for char1, short1, int1
+        template <> struct IsSignedIntergral<char1> { enum {value = 1}; };
+        template <> struct IsSignedIntergral<short> { enum {value = 1}; };
+        template <> struct IsSignedIntergral<short1> { enum {value = 1}; };
+        template <> struct IsSignedIntergral<int> { enum {value = 1}; };
+        template <> struct IsSignedIntergral<int1> { enum {value = 1}; };
+
+        template <typename T> struct IsUnsignedIntegral { enum {value = 0}; }; // default: not a listed unsigned integral type
+        template <> struct IsUnsignedIntegral<uchar> { enum {value = 1}; }; // value = 1 for uchar, ushort, uint and for uchar1, ushort1, uint1
+        template <> struct IsUnsignedIntegral<uchar1> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<ushort> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<ushort1> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<uint> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<uint1> { enum {value = 1}; };
+
+        template <typename T> struct IsIntegral { enum {value = IsSignedIntergral<T>::value || IsUnsignedIntegral<T>::value}; }; // true when either the signed or the unsigned trait matches
+        template <> struct IsIntegral<char> { enum {value = 1}; }; // plain char is added explicitly
+        template <> struct IsIntegral<bool> { enum {value = 1}; }; // bool is added explicitly as well
+
+        template <typename T> struct IsFloat { enum {value = 0}; }; // default: not a floating-point type
+        template <> struct IsFloat<float> { enum {value = 1}; }; // value = 1 only for float and double
+        template <> struct IsFloat<double> { enum {value = 1}; };
+
+        template <typename T> struct IsVec { enum {value = 0}; }; // default: T is not one of the vector types listed below
+        template <> struct IsVec<uchar1> { enum {value = 1}; }; // listed: {uchar, char, ushort, short, uint, int, float, double} x suffix {1, 2, 3, 4, 8}
+        template <> struct IsVec<uchar2> { enum {value = 1}; };
+        template <> struct IsVec<uchar3> { enum {value = 1}; };
+        template <> struct IsVec<uchar4> { enum {value = 1}; };
+        template <> struct IsVec<uchar8> { enum {value = 1}; };
+        template <> struct IsVec<char1> { enum {value = 1}; };
+        template <> struct IsVec<char2> { enum {value = 1}; };
+        template <> struct IsVec<char3> { enum {value = 1}; };
+        template <> struct IsVec<char4> { enum {value = 1}; };
+        template <> struct IsVec<char8> { enum {value = 1}; };
+        template <> struct IsVec<ushort1> { enum {value = 1}; };
+        template <> struct IsVec<ushort2> { enum {value = 1}; };
+        template <> struct IsVec<ushort3> { enum {value = 1}; };
+        template <> struct IsVec<ushort4> { enum {value = 1}; };
+        template <> struct IsVec<ushort8> { enum {value = 1}; };
+        template <> struct IsVec<short1> { enum {value = 1}; };
+        template <> struct IsVec<short2> { enum {value = 1}; };
+        template <> struct IsVec<short3> { enum {value = 1}; };
+        template <> struct IsVec<short4> { enum {value = 1}; };
+        template <> struct IsVec<short8> { enum {value = 1}; };
+        template <> struct IsVec<uint1> { enum {value = 1}; };
+        template <> struct IsVec<uint2> { enum {value = 1}; };
+        template <> struct IsVec<uint3> { enum {value = 1}; };
+        template <> struct IsVec<uint4> { enum {value = 1}; };
+        template <> struct IsVec<uint8> { enum {value = 1}; };
+        template <> struct IsVec<int1> { enum {value = 1}; };
+        template <> struct IsVec<int2> { enum {value = 1}; };
+        template <> struct IsVec<int3> { enum {value = 1}; };
+        template <> struct IsVec<int4> { enum {value = 1}; };
+        template <> struct IsVec<int8> { enum {value = 1}; };
+        template <> struct IsVec<float1> { enum {value = 1}; };
+        template <> struct IsVec<float2> { enum {value = 1}; };
+        template <> struct IsVec<float3> { enum {value = 1}; };
+        template <> struct IsVec<float4> { enum {value = 1}; };
+        template <> struct IsVec<float8> { enum {value = 1}; };
+        template <> struct IsVec<double1> { enum {value = 1}; };
+        template <> struct IsVec<double2> { enum {value = 1}; };
+        template <> struct IsVec<double3> { enum {value = 1}; };
+        template <> struct IsVec<double4> { enum {value = 1}; };
+        template <> struct IsVec<double8> { enum {value = 1}; };
+
+        template <class U> struct AddParameterType { typedef const U& type; }; // default: a non-reference U becomes const U&
+        template <class U> struct AddParameterType<U&> { typedef U& type; }; // a reference type is kept as the same reference
+        template <> struct AddParameterType<void> { typedef void type; }; // void maps to void
+
+        template <class U> struct ReferenceTraits
+        {
+            enum { value = false }; // primary template: U did not match U&
+            typedef U type; // U unchanged
+        };
+        template <class U> struct ReferenceTraits<U&>
+        {
+            enum { value = true }; // matched a reference type
+            typedef U type; // the referred-to type, without the &
+        };
+
+        template <class U> struct PointerTraits
+        {
+            enum { value = false }; // primary template: U matched neither U* nor U*&
+            typedef void type; // no pointee, so type is void
+        };
+        template <class U> struct PointerTraits<U*>
+        {
+            enum { value = true }; // matched a pointer
+            typedef U type; // the pointee type
+        };
+        template <class U> struct PointerTraits<U*&>
+        {
+            enum { value = true }; // a reference to a pointer is treated the same way
+            typedef U type; // the pointee type
+        };
+
+        template <class U> struct UnConst
+        {
+            typedef U type; // primary template: U is passed through unchanged
+            enum { value = 0 }; // no const was removed
+        };
+        template <class U> struct UnConst<const U>
+        {
+            typedef U type; // const U -> U
+            enum { value = 1 };
+        };
+        template <class U> struct UnConst<const U&>
+        {
+            typedef U& type; // const U& -> U& (the reference is kept)
+            enum { value = 1 };
+        };
+
+        template <class U> struct UnVolatile
+        {
+            typedef U type; // primary template: U is passed through unchanged
+            enum { value = 0 }; // no volatile was removed
+        };
+        template <class U> struct UnVolatile<volatile U>
+        {
+            typedef U type; // volatile U -> U
+            enum { value = 1 };
+        };
+        template <class U> struct UnVolatile<volatile U&>
+        {
+            typedef U& type; // volatile U& -> U& (the reference is kept)
+            enum { value = 1 };
+        };
+    } // namespace type_traits_detail
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/vec_distance_detail.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/vec_distance_detail.hpp
new file mode 100644
index 000000000000..8283a99560d9
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/detail/vec_distance_detail.hpp
@@ -0,0 +1,121 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
+#define OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
+
+#include "../datamov_utils.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace vec_distance_detail
+    {
+        template <int THREAD_DIM, int N> struct UnrollVecDiffCached // compile-time recursion: each level consumes one cached element and recurses with N - 1
+        {
+            template <typename Dist, typename T1, typename T2>
+            static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
+            {
+                if (ind < len) // bounds-checked variant: once ind reaches len nothing is done and the recursion stops
+                {
+                    T1 val1 = *vecCached++;
+                    // The second operand is loaded via ForceGlob with offset ind (presumably vecGlob[ind]; confirm in datamov_utils.hpp).
+                    T2 val2;
+                    ForceGlob<T2>::Load(vecGlob, ind, val2);
+                    // Fold the pair into dist.
+                    dist.reduceIter(val1, val2);
+                    // Next level: next cached element, index advanced by THREAD_DIM.
+                    UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
+                }
+            }
+
+            template <typename Dist, typename T1, typename T2>
+            static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
+            {
+                T1 val1 = *vecCached++;
+                // Unchecked variant: always loads with offset 0, then moves vecGlob itself on by THREAD_DIM.
+                T2 val2;
+                ForceGlob<T2>::Load(vecGlob, 0, val2);
+                vecGlob += THREAD_DIM;
+
+                dist.reduceIter(val1, val2);
+
+                UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);
+            }
+        };
+        template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0> // N == 0 specialization: ends the template recursion
+        {
+            template <typename Dist, typename T1, typename T2>
+            static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
+            { // no-op
+            }
+
+            template <typename Dist, typename T1, typename T2>
+            static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
+            { // no-op
+            }
+        };
+
+        template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator; // declaration only; the bool argument picks one of two specializations
+        template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false> // LEN_EQ_MAX_LEN == false: bounds-checked path
+        {
+            template <typename Dist, typename T1, typename T2>
+            static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
+            { // MAX_LEN / THREAD_DIM unrolled steps, each compared against len; tid is the first index
+                UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);
+            }
+        };
+        template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true> // LEN_EQ_MAX_LEN == true: unchecked path
+        {
+            template <typename Dist, typename T1, typename T2>
+            static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
+            { // len is not used here; vecGlob is offset by tid before the MAX_LEN / THREAD_DIM unrolled steps
+                UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
+            }
+        };
+    } // namespace vec_distance_detail
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/dynamic_smem.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/dynamic_smem.hpp
new file mode 100644
index 000000000000..42570c6830c4
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/dynamic_smem.hpp
@@ -0,0 +1,88 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_DYNAMIC_SMEM_HPP
+#define OPENCV_CUDA_DYNAMIC_SMEM_HPP
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template<class T> struct DynamicSharedMem // converts implicitly to a T* view of the extern __shared__ array __smem
+    {
+        __device__ __forceinline__ operator T*()
+        {
+            extern __shared__ int __smem[]; // unsized extern shared array, declared with element type int
+            return (T*)__smem; // reinterpreted as T*
+        }
+
+        __device__ __forceinline__ operator const T*() const
+        {
+            extern __shared__ int __smem[]; // same declaration as in the non-const conversion
+            return (T*)__smem;
+        }
+    };
+
+    // specialize for double to avoid unaligned memory access compile errors
+    template<> struct DynamicSharedMem<double>
+    {
+        __device__ __forceinline__ operator double*()
+        {
+            extern __shared__ double __smem_d[]; // separate array name, declared with element type double
+            return (double*)__smem_d;
+        }
+
+        __device__ __forceinline__ operator const double*() const
+        {
+            extern __shared__ double __smem_d[]; // same declaration as in the non-const conversion
+            return (double*)__smem_d;
+        }
+    };
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_DYNAMIC_SMEM_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/emulation.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/emulation.hpp
new file mode 100644
index 000000000000..17dc1171a237
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/emulation.hpp
@@ -0,0 +1,269 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_EMULATION_HPP_
+#define OPENCV_CUDA_EMULATION_HPP_
+
+#include "common.hpp"
+#include "warp_reduce.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    struct Emulation
+    {
+
+        static __device__ __forceinline__ int syncthreadsOr(int pred)
+        {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
+                // compilation stub only: below arch 200 this returns 0 and ignores pred
+                return 0;
+#else
+                return __syncthreads_or(pred); // forwards to the CUDA intrinsic
+#endif
+        }
+
+        template<int CTA_SIZE>
+        static __forceinline__ __device__ int Ballot(int predicate)
+        {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
+            return __ballot(predicate); // NOTE(review): CUDA docs list the non-_sync __ballot as unavailable for compute capability 7.x+; confirm before instantiating
+#else
+            __shared__ volatile int cta_buffer[CTA_SIZE]; // fallback: CTA_SIZE shared slots, indexed by threadIdx.x
+
+            int tid = threadIdx.x;
+            cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0; // a true predicate sets the bit numbered by the low 5 bits of tid
+            return warp_reduce(cta_buffer); // per-thread values are combined by warp_reduce
+#endif
+        }
+
+        struct smem
+        {
+            enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U }; // every bit of an unsigned int except the top 5
+
+            template<typename T>
+            static __device__ __forceinline__ T atomicInc(T* address, T val)
+            {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                T count; // emulation path; val is not used here
+                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U); // thread index shifted into the bits above TAG_MASK
+                do
+                {
+                    count = *address & TAG_MASK;
+                    count = tag | (count + 1);
+                    *address = count;
+                } while (*address != count); // retry until the value read back is this thread's tagged write
+
+                return (count & TAG_MASK) - 1; // untagged value before the increment
+#else
+                return ::atomicInc(address, val);
+#endif
+            }
+
+            template<typename T>
+            static __device__ __forceinline__ T atomicAdd(T* address, T val)
+            {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                T count; // emulation path, same tagging scheme as atomicInc but adding val
+                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+                do
+                {
+                    count = *address & TAG_MASK;
+                    count = tag | (count + val);
+                    *address = count;
+                } while (*address != count); // retry until the value read back is this thread's tagged write
+
+                return (count & TAG_MASK) - val; // untagged value before the addition
+#else
+                return ::atomicAdd(address, val);
+#endif
+            }
+
+            template<typename T>
+            static __device__ __forceinline__ T atomicMin(T* address, T val)
+            {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                T count = ::min(*address, val); // emulation path: candidate minimum, computed once
+                do
+                {
+                    *address = count;
+                } while (*address > count); // rewrite while a larger value is read back
+
+                return count; // NOTE(review): this returns the stored minimum, while the #else path returns ::atomicMin's result; confirm callers do not rely on either
+#else
+                return ::atomicMin(address, val);
+#endif
+            }
+        }; // struct smem
+
+        struct glob
+        {
+            static __device__ __forceinline__ int atomicAdd(int* address, int val)
+            {
+                return ::atomicAdd(address, val); // forwards to the CUDA intrinsic
+            }
+            static __device__ __forceinline__ unsigned int atomicAdd(unsigned int* address, unsigned int val)
+            {
+                return ::atomicAdd(address, val); // forwards to the CUDA intrinsic
+            }
+            static __device__ __forceinline__ float atomicAdd(float* address, float val)
+            {
+            #if __CUDA_ARCH__ >= 200
+                return ::atomicAdd(address, val);
+            #else
+                int* address_as_i = (int*) address; // fallback: compare-and-swap on the float's bit pattern viewed as int
+                int old = *address_as_i, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_i, assumed,
+                        __float_as_int(val + __int_as_float(assumed)));
+                } while (assumed != old); // retry while atomicCAS returned something other than the assumed value
+                return __int_as_float(old); // value held before this call's addition
+            #endif
+            }
+            static __device__ __forceinline__ double atomicAdd(double* address, double val)
+            {
+            #if __CUDA_ARCH__ >= 130
+                unsigned long long int* address_as_ull = (unsigned long long int*) address; // compare-and-swap on the double's bit pattern viewed as unsigned long long
+                unsigned long long int old = *address_as_ull, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(val + __longlong_as_double(assumed)));
+                } while (assumed != old); // retry while atomicCAS returned something other than the assumed value
+                return __longlong_as_double(old); // value held before this call's addition
+            #else
+                CV_UNUSED(address);
+                CV_UNUSED(val);
+                return 0.0; // stub: no update is performed on this path
+            #endif
+            }
+
+            static __device__ __forceinline__ int atomicMin(int* address, int val)
+            {
+                return ::atomicMin(address, val); // forwards to the CUDA intrinsic
+            }
+            static __device__ __forceinline__ float atomicMin(float* address, float val)
+            {
+            #if __CUDA_ARCH__ >= 120
+                int* address_as_i = (int*) address; // compare-and-swap on the float's bit pattern viewed as int
+                int old = *address_as_i, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_i, assumed,
+                        __float_as_int(::fminf(val, __int_as_float(assumed))));
+                } while (assumed != old); // retry while atomicCAS returned something other than the assumed value
+                return __int_as_float(old); // value held before this call's update
+            #else
+                CV_UNUSED(address);
+                CV_UNUSED(val);
+                return 0.0f; // stub: no update is performed on this path
+            #endif
+            }
+            static __device__ __forceinline__ double atomicMin(double* address, double val)
+            {
+            #if __CUDA_ARCH__ >= 130
+                unsigned long long int* address_as_ull = (unsigned long long int*) address; // compare-and-swap on the double's bit pattern viewed as unsigned long long
+                unsigned long long int old = *address_as_ull, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(::fmin(val, __longlong_as_double(assumed))));
+                } while (assumed != old); // retry while atomicCAS returned something other than the assumed value
+                return __longlong_as_double(old); // value held before this call's update
+            #else
+                CV_UNUSED(address);
+                CV_UNUSED(val);
+                return 0.0; // stub: no update is performed on this path
+            #endif
+            }
+
+            static __device__ __forceinline__ int atomicMax(int* address, int val)
+            {
+                return ::atomicMax(address, val); // forwards to the CUDA intrinsic
+            }
+            static __device__ __forceinline__ float atomicMax(float* address, float val)
+            {
+            #if __CUDA_ARCH__ >= 120
+                int* address_as_i = (int*) address; // compare-and-swap on the float's bit pattern viewed as int
+                int old = *address_as_i, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_i, assumed,
+                        __float_as_int(::fmaxf(val, __int_as_float(assumed))));
+                } while (assumed != old); // retry while atomicCAS returned something other than the assumed value
+                return __int_as_float(old); // value held before this call's update
+            #else
+                CV_UNUSED(address);
+                CV_UNUSED(val);
+                return 0.0f; // stub: no update is performed on this path
+            #endif
+            }
+            static __device__ __forceinline__ double atomicMax(double* address, double val)
+            {
+            #if __CUDA_ARCH__ >= 130
+                unsigned long long int* address_as_ull = (unsigned long long int*) address; // compare-and-swap on the double's bit pattern viewed as unsigned long long
+                unsigned long long int old = *address_as_ull, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
+                } while (assumed != old); // retry while atomicCAS returned something other than the assumed value
+                return __longlong_as_double(old); // value held before this call's update
+            #else
+                CV_UNUSED(address);
+                CV_UNUSED(val);
+                return 0.0; // stub: no update is performed on this path
+            #endif
+            }
+        };
+    }; //struct Emulation
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif /* OPENCV_CUDA_EMULATION_HPP_ */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/filters.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/filters.hpp
new file mode 100644
index 000000000000..bf3147edb95f
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/filters.hpp
@@ -0,0 +1,293 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_FILTERS_HPP
+#define OPENCV_CUDA_FILTERS_HPP
+
+#include "saturate_cast.hpp"
+#include "vec_traits.hpp"
+#include "vec_math.hpp"
+#include "type_traits.hpp"
+#include "nppdefs.h"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    // Point sampler: converts both coordinates to int with __float2int_rz and reads that single element of src.
+    template <typename Ptr2D> struct PointFilter
+    {
+        typedef typename Ptr2D::elem_type elem_type;
+        typedef float index_type;
+        // fx / fy are accepted but not used (both are marked CV_UNUSED).
+        explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_, float fx = 0.f, float fy = 0.f)
+        : src(src_)
+        {
+            CV_UNUSED(fx); CV_UNUSED(fy);
+        }
+        __device__ __forceinline__ elem_type operator ()(float y, float x) const
+        {
+            const int row = __float2int_rz(y);
+            const int col = __float2int_rz(x);
+            return src(row, col);
+        }
+        Ptr2D src;
+    };
+
+    template <typename Ptr2D> struct LinearFilter // samples src at a fractional (y, x) by blending the surrounding 2x2 elements
+    {
+        typedef typename Ptr2D::elem_type elem_type;
+        typedef float index_type;
+        // fx / fy are accepted but not used (both are marked CV_UNUSED).
+        explicit __host__ __device__ __forceinline__ LinearFilter(const Ptr2D& src_, float fx = 0.f, float fy = 0.f)
+        : src(src_)
+        {
+            CV_UNUSED(fx);
+            CV_UNUSED(fy);
+        }
+        __device__ __forceinline__ elem_type operator ()(float y, float x) const
+        {
+            typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
+            // Accumulator in the float work type, started from VecTraits<work_type>::all(0).
+            work_type out = VecTraits<work_type>::all(0);
+            // (y1, x1): both coordinates converted with the round-down intrinsic.
+            const int x1 = __float2int_rd(x);
+            const int y1 = __float2int_rd(y);
+            if (x1 <= NPP_MIN_32S || x1 >= NPP_MAX_32S || y1 <= NPP_MIN_32S || y1 >= NPP_MAX_32S) // index on a 32-bit limit; presumably guards the +1 below
+            {
+                elem_type src_reg = src(y1, x1);
+                out = out + src_reg * 1.0f; // single sample with weight 1
+                return saturate_cast<elem_type>(out);
+            }
+            const int x2 = x1 + 1;
+            const int y2 = y1 + 1;
+            // Each neighbour is weighted by the product of the distances from (y, x) to the opposite neighbour.
+            elem_type src_reg = src(y1, x1);
+            out = out + src_reg * ((x2 - x) * (y2 - y));
+
+            src_reg = src(y1, x2);
+            out = out + src_reg * ((x - x1) * (y2 - y));
+
+            src_reg = src(y2, x1);
+            out = out + src_reg * ((x2 - x) * (y - y1));
+
+            src_reg = src(y2, x2);
+            out = out + src_reg * ((x - x1) * (y - y1));
+            // Convert the float accumulator back to the element type.
+            return saturate_cast<elem_type>(out);
+        }
+
+        Ptr2D src;
+    };
+
+    template <typename Ptr2D> struct CubicFilter // samples src at a fractional (y, x) using bicubicCoeff weights in both directions
+    {
+        typedef typename Ptr2D::elem_type elem_type;
+        typedef float index_type;
+        typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
+        // fx / fy are accepted but not used (both are marked CV_UNUSED).
+        explicit __host__ __device__ __forceinline__ CubicFilter(const Ptr2D& src_, float fx = 0.f, float fy = 0.f)
+        : src(src_)
+        {
+            CV_UNUSED(fx);
+            CV_UNUSED(fy);
+        }
+        // 1-D weight as a piecewise cubic of |x_|; zero for |x_| >= 2 (cubic convolution kernel with a = -0.5).
+        static __device__ __forceinline__ float bicubicCoeff(float x_)
+        {
+            float x = fabsf(x_);
+            if (x <= 1.0f)
+            {
+                return x * x * (1.5f * x - 2.5f) + 1.0f; // 1.5|x|^3 - 2.5|x|^2 + 1
+            }
+            else if (x < 2.0f)
+            {
+                return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f; // -0.5|x|^3 + 2.5|x|^2 - 4|x| + 2
+            }
+            else
+            {
+                return 0.0f;
+            }
+        }
+        // Weighted sum over the integer positions within 2 of (y, x), divided by the total weight.
+        __device__ elem_type operator ()(float y, float x) const
+        {
+            const float xmin = ::ceilf(x - 2.0f);
+            const float xmax = ::floorf(x + 2.0f);
+
+            const float ymin = ::ceilf(y - 2.0f);
+            const float ymax = ::floorf(y + 2.0f);
+
+            work_type sum = VecTraits<work_type>::all(0);
+            float wsum = 0.0f;
+
+            for (float cy = ymin; cy <= ymax; cy += 1.0f)
+            {
+                for (float cx = xmin; cx <= xmax; cx += 1.0f)
+                {
+                    const float w = bicubicCoeff(x - cx) * bicubicCoeff(y - cy); // separable 2-D weight
+                    sum = sum + w * src(__float2int_rd(cy), __float2int_rd(cx));
+                    wsum += w;
+                }
+            }
+            // A zero total weight yields an all-zero result instead of dividing by zero.
+            work_type res = (!wsum)? VecTraits<work_type>::all(0) : sum / wsum;
+
+            return saturate_cast<elem_type>(res);
+        }
+
+        Ptr2D src;
+    };
+    // Area sampler for integer scaling: sums the source elements under one output position, each weighted by `scale`.
+    template <typename Ptr2D> struct IntegerAreaFilter
+    {
+        typedef typename Ptr2D::elem_type elem_type;
+        typedef float index_type;
+
+        // Keeps the source accessor and both scale factors; the per-sample weight 1 / (scale_x * scale_y) is precomputed.
+        explicit __host__ __device__ __forceinline__ IntegerAreaFilter(const Ptr2D& src_, float scale_x_, float scale_y_)
+            : src(src_), scale_x(scale_x_), scale_y(scale_y_), scale(1.f / (scale_x * scale_y)) {}
+
+        __device__ __forceinline__ elem_type operator ()(float y, float x) const
+        {
+            typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
+
+            // Horizontal span [x * scale_x, x * scale_x + scale_x), narrowed to whole columns.
+            const float left = x * scale_x;
+            const float right = left + scale_x;
+            const int col_begin = __float2int_ru(left);
+            const int col_end = __float2int_rd(right);
+
+            // Vertical span, handled the same way.
+            const float top = y * scale_y;
+            const float bottom = top + scale_y;
+            const int row_begin = __float2int_ru(top);
+            const int row_end = __float2int_rd(bottom);
+
+            work_type acc = VecTraits<work_type>::all(0.f);
+            for (int row = row_begin; row < row_end; ++row)
+            {
+                for (int col = col_begin; col < col_end; ++col)
+                    acc = acc + src(row, col) * scale;
+            }
+            return saturate_cast<elem_type>(acc);
+        }
+
+        Ptr2D src;
+        float scale_x, scale_y ,scale;
+    };
+
+    template <typename Ptr2D> struct AreaFilter // area sampler that also weights the partially covered border rows/columns
+    {
+        typedef typename Ptr2D::elem_type elem_type;
+        typedef float index_type;
+        // Stores the source accessor and the two scale factors; width / haight are left untouched.
+        explicit __host__ __device__ __forceinline__ AreaFilter(const Ptr2D& src_, float scale_x_, float scale_y_)
+            : src(src_), scale_x(scale_x_), scale_y(scale_y_){}
+
+        __device__ __forceinline__ elem_type operator ()(float y, float x) const
+        {
+            float fsx1 = x * scale_x;
+            float fsx2 = fsx1 + scale_x;
+            // Whole columns lying inside [fsx1, fsx2): round the start up and the end down.
+            int sx1 = __float2int_ru(fsx1);
+            int sx2 = __float2int_rd(fsx2);
+
+            float fsy1 = y * scale_y;
+            float fsy2 = fsy1 + scale_y;
+            // Whole rows lying inside [fsy1, fsy2).
+            int sy1 = __float2int_ru(fsy1);
+            int sy2 = __float2int_rd(fsy2);
+            // Reciprocal of the covered area; each extent is capped by the distance from the start to src.width / src.height.
+            float scale = 1.f / (fminf(scale_x, src.width - fsx1) * fminf(scale_y, src.height - fsy1));
+
+            typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
+            work_type out = VecTraits<work_type>::all(0.f);
+            // Fully covered rows: whole columns at full weight, then the fractional left and right columns.
+            for (int dy = sy1; dy < sy2; ++dy)
+            {
+                for (int dx = sx1; dx < sx2; ++dx)
+                    out = out + src(dy, dx) * scale;
+
+                if (sx1 > fsx1)
+                    out = out + src(dy, (sx1 -1) ) * ((sx1 - fsx1) * scale);
+
+                if (sx2 < fsx2)
+                    out = out + src(dy, sx2) * ((fsx2 -sx2) * scale);
+            }
+            // Fractional row above the whole rows (row sy1 - 1), weighted by (sy1 - fsy1).
+            if (sy1 > fsy1)
+                for (int dx = sx1; dx < sx2; ++dx)
+                    out = out + src( (sy1 - 1) , dx) * ((sy1 -fsy1) * scale);
+            // Fractional row below the whole rows (row sy2), weighted by (fsy2 - sy2).
+            if (sy2 < fsy2)
+                for (int dx = sx1; dx < sx2; ++dx)
+                    out = out + src(sy2, dx) * ((fsy2 -sy2) * scale);
+            // Four corner elements, each weighted by the product of its two fractional extents.
+            if ((sy1 > fsy1) &&  (sx1 > fsx1))
+                out = out + src( (sy1 - 1) , (sx1 - 1)) * ((sy1 -fsy1) * (sx1 -fsx1) * scale);
+
+            if ((sy1 > fsy1) &&  (sx2 < fsx2))
+                out = out + src( (sy1 - 1) , sx2) * ((sy1 -fsy1) * (fsx2 -sx2) * scale);
+
+            if ((sy2 < fsy2) &&  (sx2 < fsx2))
+                out = out + src(sy2, sx2) * ((fsy2 -sy2) * (fsx2 -sx2) * scale);
+
+            if ((sy2 < fsy2) &&  (sx1 > fsx1))
+                out = out + src(sy2, (sx1 - 1)) * ((fsy2 -sy2) * (sx1 -fsx1) * scale);
+
+            return saturate_cast<elem_type>(out);
+        }
+
+        Ptr2D src;
+        float scale_x, scale_y;
+        int width, haight; // NOTE(review): never initialised or read in this struct; "haight" looks like a typo of "height" - confirm before renaming
+    };
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_FILTERS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/funcattrib.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/funcattrib.hpp
new file mode 100644
index 000000000000..f58208048882
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/funcattrib.hpp
@@ -0,0 +1,79 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_DEVICE_FUNCATTRIB_HPP
+#define OPENCV_CUDA_DEVICE_FUNCATTRIB_HPP
+
+#include <cstdio>
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    // Prints the attributes that cudaFuncGetAttributes reports for `func` to stdout, then flushes stdout.
+    template<class Func>
+    void printFuncAttrib(Func& func)
+    {
+        cudaFuncAttributes attrs = cudaFuncAttributes(); // value-initialised: a failed query prints zeros, not indeterminate memory
+        cudaFuncGetAttributes(&attrs, func);
+
+        printf("=== Function stats ===\n");
+        printf("Name: \n");
+        printf("sharedSizeBytes    = %zu\n", attrs.sharedSizeBytes); // the three *SizeBytes fields are size_t, so %zu (not %d)
+        printf("constSizeBytes     = %zu\n", attrs.constSizeBytes);
+        printf("localSizeBytes     = %zu\n", attrs.localSizeBytes);
+        printf("maxThreadsPerBlock = %d\n", attrs.maxThreadsPerBlock);
+        printf("numRegs            = %d\n", attrs.numRegs);
+        printf("ptxVersion         = %d\n", attrs.ptxVersion);
+        printf("binaryVersion      = %d\n", attrs.binaryVersion);
+        printf("\n");
+        fflush(stdout);
+    }
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif  /* OPENCV_CUDA_DEVICE_FUNCATTRIB_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/functional.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/functional.hpp
new file mode 100644
index 000000000000..9f53d87527c1
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/functional.hpp
@@ -0,0 +1,805 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_FUNCTIONAL_HPP
+#define OPENCV_CUDA_FUNCTIONAL_HPP
+
+#include <functional>
+#include "saturate_cast.hpp"
+#include "vec_traits.hpp"
+#include "type_traits.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    // Function Objects
+    template<typename Argument, typename Result> struct unary_function // base of the one-argument functors in this header
+    {
+        typedef Argument argument_type; // type the functor takes
+        typedef Result result_type;     // type the functor produces
+    };
+    template<typename Argument1, typename Argument2, typename Result> struct binary_function // base of the two-argument functors in this header
+    {
+        typedef Argument1 first_argument_type;  // type of the left operand
+        typedef Argument2 second_argument_type; // type of the right operand
+        typedef Result result_type;             // type the functor produces
+    };
+
+    // Arithmetic Operations
+    // Addition functor: operator() yields lhs + rhs.
+    template <typename T> struct plus : binary_function<T, T, T>
+    {
+        __host__ __device__ __forceinline__ plus() {}
+        __host__ __device__ __forceinline__ plus(const plus&) {}
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return lhs + rhs;
+        }
+    };
+
+    // Subtraction functor: operator() yields lhs - rhs.
+    template <typename T> struct minus : binary_function<T, T, T>
+    {
+        __host__ __device__ __forceinline__ minus() {}
+        __host__ __device__ __forceinline__ minus(const minus&) {}
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return lhs - rhs;
+        }
+    };
+
+    // Multiplication functor: operator() yields lhs * rhs.
+    template <typename T> struct multiplies : binary_function<T, T, T>
+    {
+        __host__ __device__ __forceinline__ multiplies() {}
+        __host__ __device__ __forceinline__ multiplies(const multiplies&) {}
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return lhs * rhs;
+        }
+    };
+
+    // Division functor: operator() yields lhs / rhs.
+    template <typename T> struct divides : binary_function<T, T, T>
+    {
+        __host__ __device__ __forceinline__ divides() {}
+        __host__ __device__ __forceinline__ divides(const divides&) {}
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return lhs / rhs;
+        }
+    };
+
+    // Remainder functor: operator() yields lhs % rhs.
+    template <typename T> struct modulus : binary_function<T, T, T>
+    {
+        __host__ __device__ __forceinline__ modulus() {}
+        __host__ __device__ __forceinline__ modulus(const modulus&) {}
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return lhs % rhs;
+        }
+    };
+
+    template <typename T> struct negate : unary_function<T, T> // unary minus functor: operator() yields -value
+    {
+        __host__ __device__ __forceinline__ negate() {}
+        __host__ __device__ __forceinline__ negate(const negate&) {}
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType value) const
+        {
+            return -value;
+        }
+    };
+
+    // Comparison Operations
+    // Equality predicate: operator() yields lhs == rhs.
+    template <typename T> struct equal_to : binary_function<T, T, bool>
+    {
+        __host__ __device__ __forceinline__ equal_to() {}
+        __host__ __device__ __forceinline__ equal_to(const equal_to&) {}
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return lhs == rhs;
+        }
+    };
+
+    // Inequality predicate: operator() yields lhs != rhs.
+    template <typename T> struct not_equal_to : binary_function<T, T, bool>
+    {
+        __host__ __device__ __forceinline__ not_equal_to() {}
+        __host__ __device__ __forceinline__ not_equal_to(const not_equal_to&) {}
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return lhs != rhs;
+        }
+    };
+
+    // Strict greater-than predicate: operator() yields lhs > rhs.
+    template <typename T> struct greater : binary_function<T, T, bool>
+    {
+        __host__ __device__ __forceinline__ greater() {}
+        __host__ __device__ __forceinline__ greater(const greater&) {}
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return lhs > rhs;
+        }
+    };
+
+    // Strict less-than predicate: operator() yields lhs < rhs.
+    template <typename T> struct less : binary_function<T, T, bool>
+    {
+        __host__ __device__ __forceinline__ less() {}
+        __host__ __device__ __forceinline__ less(const less&) {}
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return lhs < rhs;
+        }
+    };
+
+    // Greater-or-equal predicate: operator() yields lhs >= rhs.
+    template <typename T> struct greater_equal : binary_function<T, T, bool>
+    {
+        __host__ __device__ __forceinline__ greater_equal() {}
+        __host__ __device__ __forceinline__ greater_equal(const greater_equal&) {}
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return lhs >= rhs;
+        }
+    };
+
+    // Less-or-equal predicate: operator() yields lhs <= rhs.
+    template <typename T> struct less_equal : binary_function<T, T, bool>
+    {
+        __host__ __device__ __forceinline__ less_equal() {}
+        __host__ __device__ __forceinline__ less_equal(const less_equal&) {}
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return lhs <= rhs;
+        }
+    };
+
+    // Logical Operations
+    // Logical conjunction: operator() yields lhs && rhs.
+    template <typename T> struct logical_and : binary_function<T, T, bool>
+    {
+        __host__ __device__ __forceinline__ logical_and() {}
+        __host__ __device__ __forceinline__ logical_and(const logical_and&) {}
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return lhs && rhs;
+        }
+    };
+
+    // Logical disjunction: operator() yields lhs || rhs.
+    template <typename T> struct logical_or : binary_function<T, T, bool>
+    {
+        __host__ __device__ __forceinline__ logical_or() {}
+        __host__ __device__ __forceinline__ logical_or(const logical_or&) {}
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return lhs || rhs;
+        }
+    };
+
+    template <typename T> struct logical_not : unary_function<T, bool> // logical negation: operator() yields !value
+    {
+        __host__ __device__ __forceinline__ logical_not() {}
+        __host__ __device__ __forceinline__ logical_not(const logical_not&) {}
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType value) const
+        {
+            return !value;
+        }
+    };
+
+    // Bitwise Operations
+    // Bitwise AND functor: operator() yields lhs & rhs.
+    template <typename T> struct bit_and : binary_function<T, T, T>
+    {
+        __host__ __device__ __forceinline__ bit_and() {}
+        __host__ __device__ __forceinline__ bit_and(const bit_and&) {}
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return lhs & rhs;
+        }
+    };
+
+    // Bitwise OR functor: operator() yields lhs | rhs.
+    template <typename T> struct bit_or : binary_function<T, T, T>
+    {
+        __host__ __device__ __forceinline__ bit_or() {}
+        __host__ __device__ __forceinline__ bit_or(const bit_or&) {}
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return lhs | rhs;
+        }
+    };
+
+    // Bitwise XOR functor: operator() yields lhs ^ rhs.
+    template <typename T> struct bit_xor : binary_function<T, T, T>
+    {
+        __host__ __device__ __forceinline__ bit_xor() {}
+        __host__ __device__ __forceinline__ bit_xor(const bit_xor&) {}
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return lhs ^ rhs;
+        }
+    };
+
+    template <typename T> struct bit_not : unary_function<T, T> // bitwise complement: operator() yields ~value
+    {
+        __host__ __device__ __forceinline__ bit_not() {}
+        __host__ __device__ __forceinline__ bit_not(const bit_not&) {}
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType value) const
+        {
+            return ~value;
+        }
+    };
+
+    // Generalized Identity Operations
+    template <typename T> struct identity : unary_function<T, T> // pass-through functor: operator() hands its argument back
+    {
+        __host__ __device__ __forceinline__ identity() {}
+        __host__ __device__ __forceinline__ identity(const identity&) {}
+        __device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType value) const
+        {
+            return value;
+        }
+    };
+
+    template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1> // selects the first of two arguments
+    {
+        __host__ __device__ __forceinline__ project1st() {}
+        __host__ __device__ __forceinline__ project1st(const project1st&) {}
+        __device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType first, typename TypeTraits<T2>::ParameterType second) const
+        {
+            return first;
+        }
+    };
+
+    template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2> // selects the second of two arguments
+    {
+        __host__ __device__ __forceinline__ project2nd() {}
+        __host__ __device__ __forceinline__ project2nd(const project2nd&) {}
+        __device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType first, typename TypeTraits<T2>::ParameterType second) const
+        {
+            return second;
+        }
+    };
+
+    // Min/Max Operations
+
+#define OPENCV_CUDA_IMPLEMENT_MINMAX(name, type, op) /* emits the full specialization name<type> */ \
+    template <> struct name<type> : binary_function<type, type, type> \
+    { \
+        __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} /* forwards to op */ \
+        __host__ __device__ __forceinline__ name() {}\
+        __host__ __device__ __forceinline__ name(const name&) {}\
+    };
+
+    template <typename T> struct maximum : binary_function<T, T, T> // generic case: unqualified max() of the two arguments
+    {
+        __host__ __device__ __forceinline__ maximum() {}
+        __host__ __device__ __forceinline__ maximum(const maximum&) {}
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return max(a, b);
+        }
+    };
+    // Full specializations for the built-in scalar types; each names the global function it forwards to.
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, uchar, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, schar, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, char, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, ushort, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, short, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, int, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, uint, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, float, ::fmax)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, double, ::fmax)
+
+    template <typename T> struct minimum : binary_function<T, T, T> // generic case: unqualified min() of the two arguments
+    {
+        __host__ __device__ __forceinline__ minimum() {}
+        __host__ __device__ __forceinline__ minimum(const minimum&) {}
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        {
+            return min(a, b);
+        }
+    };
+    // Full specializations for the built-in scalar types; each names the global function it forwards to.
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, uchar, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, schar, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, char, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, ushort, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, short, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, int, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, uint, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, float, ::fmin)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, double, ::fmin)
+
+#undef OPENCV_CUDA_IMPLEMENT_MINMAX
+
+    // Math functions
+
+    template <typename T> struct abs_func : unary_function<T, T>
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType x) const
+        {
+            return abs(x);
+        }
+
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char>
+    {
+        __device__ __forceinline__ unsigned char operator ()(unsigned char x) const
+        {
+            return x;
+        }
+
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<signed char> : unary_function<signed char, signed char>
+    {
+        __device__ __forceinline__ signed char operator ()(signed char x) const
+        {
+            return ::abs((int)x);
+        }
+
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<char> : unary_function<char, char>
+    {
+        __device__ __forceinline__ char operator ()(char x) const
+        {
+            return ::abs((int)x);
+        }
+
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short>
+    {
+        __device__ __forceinline__ unsigned short operator ()(unsigned short x) const
+        {
+            return x;
+        }
+
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<short> : unary_function<short, short>
+    {
+        __device__ __forceinline__ short operator ()(short x) const
+        {
+            return ::abs((int)x);
+        }
+
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    // abs_func<unsigned int>: an unsigned operand is already non-negative, so the
+    // call operator simply hands its argument back.
+    template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int>
+    {
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
+
+        // Device-only call operator (identity).
+        __device__ __forceinline__ unsigned int operator ()(unsigned int value) const { return value; }
+    };
+    // abs_func<int>: forwards the operand straight to the global ::abs and returns
+    // whatever that call produces.
+    template <> struct abs_func<int> : unary_function<int, int>
+    {
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
+
+        // Device-only call operator.
+        __device__ __forceinline__ int operator ()(int value) const { return ::abs(value); }
+    };
+    // abs_func<float>: forwards the operand to the single-precision ::fabsf and
+    // returns its result.
+    template <> struct abs_func<float> : unary_function<float, float>
+    {
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
+
+        // Device-only call operator.
+        __device__ __forceinline__ float operator ()(float value) const { return ::fabsf(value); }
+    };
+    // abs_func<double>: forwards the operand to the double-precision ::fabs and
+    // returns its result.
+    template <> struct abs_func<double> : unary_function<double, double>
+    {
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
+
+        // Device-only call operator.
+        __device__ __forceinline__ double operator ()(double value) const { return ::fabs(value); }
+    };
+
+#define OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(name, func) /* emits name##_func<T> plus a <double> specialization */ \
+    template <typename T> struct name ## _func : unary_function<T, float> \
+    { \
+        __device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v) const \
+        { \
+            return func ## f(v); /* pastes 'f' onto the last token of func, e.g. ::sqrt becomes ::sqrtf */ \
+        } \
+        __host__ __device__ __forceinline__ name ## _func() {} \
+        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
+    }; \
+    template <> struct name ## _func<double> : unary_function<double, double> \
+    { \
+        __device__ __forceinline__ double operator ()(double v) const \
+        { \
+            return func(v); /* the double specialization calls func unchanged */ \
+        } \
+        __host__ __device__ __forceinline__ name ## _func() {} \
+        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
+    };
+
+#define OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR(name, func) /* two-argument counterpart: emits name##_func<T> plus a <double> specialization */ \
+    template <typename T> struct name ## _func : binary_function<T, T, float> \
+    { \
+        __device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v1, typename TypeTraits<T>::ParameterType v2) const \
+        { \
+            return func ## f(v1, v2); /* pastes 'f' onto the last token of func, e.g. ::pow becomes ::powf */ \
+        } \
+        __host__ __device__ __forceinline__ name ## _func() {} \
+        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
+    }; \
+    template <> struct name ## _func<double> : binary_function<double, double, double> \
+    { \
+        __device__ __forceinline__ double operator ()(double v1, double v2) const \
+        { \
+            return func(v1, v2); /* the double specialization calls func unchanged */ \
+        } \
+        __host__ __device__ __forceinline__ name ## _func() {} \
+        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
+    };
+
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt) // each line defines <name>_func<T> (calls func##f) and <name>_func<double> (calls func)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(exp, ::exp)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(exp10, ::exp10)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(log, ::log)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(log2, ::log2)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(log10, ::log10)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(sin, ::sin)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(cos, ::cos)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(tan, ::tan)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(asin, ::asin)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(acos, ::acos)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(atan, ::atan)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(sinh, ::sinh)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(cosh, ::cosh)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(tanh, ::tanh)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(asinh, ::asinh)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(acosh, ::acosh)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(atanh, ::atanh)
+    // Two-argument functors generated the same way.
+    OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR(hypot, ::hypot)
+    OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR(atan2, ::atan2)
+    OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR(pow, ::pow)
+    // The generator macros are dropped once every functor has been emitted.
+    #undef OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR
+    #undef OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR_NO_DOUBLE // NOTE(review): no matching #define is visible in this part of the file
+    #undef OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR
+
+    // Sum of the squares of two operands (a*a + b*b); no square root is taken.
+    // NOTE(review): the base is binary_function<T, T, float> while operator() returns T - confirm intent.
+    template<typename T> struct hypot_sqr_func : binary_function<T, T, float>
+    {
+        __host__ __device__ __forceinline__ hypot_sqr_func() {}
+        __host__ __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func&) {}
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
+        { return a * a + b * b; }
+    };
+
+    // Saturate Cast Functor
+    // Functor form of saturate_cast: converts its T argument to D via saturate_cast<D>.
+    template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>
+    {
+        __host__ __device__ __forceinline__ saturate_cast_func() {}
+        __host__ __device__ __forceinline__ saturate_cast_func(const saturate_cast_func&) {}
+
+        // Device-only call operator.
+        __device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType src) const { return saturate_cast<D>(src); }
+    };
+
+    // Threshold Functors
+    // Binary threshold: evaluates (src > thresh) * maxVal, i.e. maxVal is multiplied by 1
+    // when the input exceeds thresh and by 0 otherwise.
+    template <typename T> struct thresh_binary_func : unary_function<T, T>
+    {
+        T thresh;
+        T maxVal;
+
+        // The default constructor does not initialise thresh or maxVal explicitly.
+        __host__ __device__ __forceinline__ thresh_binary_func() {}
+        __host__ __device__ __forceinline__ thresh_binary_func(T threshold, T maximum) : thresh(threshold), maxVal(maximum) {}
+        __host__ __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& that) : thresh(that.thresh), maxVal(that.maxVal) {}
+
+        // Device-only call operator.
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType v) const
+        { return (v > thresh) * maxVal; }
+    };
+
+    // Inverted binary threshold: evaluates (src <= thresh) * maxVal, i.e. maxVal is multiplied
+    // by 1 when the input does not exceed thresh and by 0 otherwise.
+    template <typename T> struct thresh_binary_inv_func : unary_function<T, T>
+    {
+        T thresh;
+        T maxVal;
+
+        // The default constructor does not initialise thresh or maxVal explicitly.
+        __host__ __device__ __forceinline__ thresh_binary_inv_func() {}
+        __host__ __device__ __forceinline__ thresh_binary_inv_func(T threshold, T maximum) : thresh(threshold), maxVal(maximum) {}
+        __host__ __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& that) : thresh(that.thresh), maxVal(that.maxVal) {}
+
+        // Device-only call operator.
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType v) const
+        { return (v <= thresh) * maxVal; }
+    };
+
+    // Truncating threshold: hands the input and thresh to the minimum<T> functor and
+    // returns whatever it yields.
+    template <typename T> struct thresh_trunc_func : unary_function<T, T>
+    {
+        T thresh;
+
+        __host__ __device__ __forceinline__ thresh_trunc_func() {}
+        // maxVal_ is accepted and deliberately ignored (see CV_UNUSED).
+        explicit __host__ __device__ __forceinline__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {CV_UNUSED(maxVal_);}
+        __host__ __device__ __forceinline__ thresh_trunc_func(const thresh_trunc_func& that) : thresh(that.thresh) {}
+
+        // Device-only call operator.
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType v) const
+        { return minimum<T>()(v, thresh); }
+    };
+
+    // To-zero threshold: evaluates (src > thresh) * src, so inputs above thresh are
+    // multiplied by 1 and all others by 0.
+    template <typename T> struct thresh_to_zero_func : unary_function<T, T>
+    {
+        T thresh;
+
+        __host__ __device__ __forceinline__ thresh_to_zero_func() {}
+        // maxVal_ is accepted and deliberately ignored (see CV_UNUSED).
+        explicit __host__ __device__ __forceinline__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {CV_UNUSED(maxVal_);}
+        __host__ __device__ __forceinline__ thresh_to_zero_func(const thresh_to_zero_func& that) : thresh(that.thresh) {}
+
+        // Device-only call operator.
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType v) const
+        { return (v > thresh) * v; }
+    };
+
+    // Inverted to-zero threshold: evaluates (src <= thresh) * src, so inputs not above
+    // thresh are multiplied by 1 and all others by 0.
+    template <typename T> struct thresh_to_zero_inv_func : unary_function<T, T>
+    {
+        T thresh;
+
+        __host__ __device__ __forceinline__ thresh_to_zero_inv_func() {}
+        // maxVal_ is accepted and deliberately ignored (see CV_UNUSED).
+        explicit __host__ __device__ __forceinline__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {CV_UNUSED(maxVal_);}
+        __host__ __device__ __forceinline__ thresh_to_zero_inv_func(const thresh_to_zero_inv_func& that) : thresh(that.thresh) {}
+
+        // Device-only call operator.
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType v) const
+        { return (v <= thresh) * v; }
+    };
+
+    // Function Object Adaptors
+    // Adaptor that stores a unary predicate and returns the logical negation of
+    // whatever that predicate returns.
+    template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool>
+    {
+        Predicate pred;
+
+        __host__ __device__ __forceinline__ unary_negate() {}
+        explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}
+        __host__ __device__ __forceinline__ unary_negate(const unary_negate& that) : pred(that.pred) {}
+
+        // Device-only call operator: evaluates !pred(arg).
+        __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType arg) const
+        { return !pred(arg); }
+    };
+
+    // Convenience factory: wraps `pred` in a unary_negate adaptor.
+    template <typename Predicate>
+    __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)
+    { return unary_negate<Predicate>(pred); }
+
+    // Adaptor that stores a binary predicate and returns the logical negation of
+    // whatever that predicate returns.
+    template <typename Predicate> struct binary_negate : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>
+    {
+        Predicate pred;
+
+        __host__ __device__ __forceinline__ binary_negate() {}
+        explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}
+        __host__ __device__ __forceinline__ binary_negate(const binary_negate& that) : pred(that.pred) {}
+
+        // Device-only call operator: evaluates !pred(lhs, rhs).
+        __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType lhs,
+                                                   typename TypeTraits<typename Predicate::second_argument_type>::ParameterType rhs) const
+        { return !pred(lhs, rhs); }
+    };
+
+    // Convenience factory: wraps `pred` in a binary_negate adaptor.
+    template <typename BinaryPredicate>
+    __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)
+    { return binary_negate<BinaryPredicate>(pred); }
+
+    // Adaptor that fixes the first argument of a binary functor, producing a unary
+    // functor over the remaining (second) argument.
+    template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
+    {
+        Op op;
+        typename Op::first_argument_type arg1;
+
+        __host__ __device__ __forceinline__ binder1st() {}
+        __host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}
+        __host__ __device__ __forceinline__ binder1st(const binder1st& that) : op(that.op), arg1(that.arg1) {}
+
+        // Device-only call operator: evaluates op(arg1, rhs).
+        __device__ __forceinline__ typename Op::result_type operator ()(typename TypeTraits<typename Op::second_argument_type>::ParameterType rhs) const
+        { return op(arg1, rhs); }
+    };
+
+    // Convenience factory: builds a binder1st from `op`, converting `x` to Op::first_argument_type.
+    template <typename Op, typename T>
+    __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
+    { return binder1st<Op>(op, typename Op::first_argument_type(x)); }
+
+    // Adaptor that fixes the second argument of a binary functor, producing a unary
+    // functor over the remaining (first) argument.
+    template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
+    {
+        Op op;
+        typename Op::second_argument_type arg2;
+
+        __host__ __device__ __forceinline__ binder2nd() {}
+        __host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}
+        __host__ __device__ __forceinline__ binder2nd(const binder2nd& that) : op(that.op), arg2(that.arg2) {}
+
+        // Device-only call operator: evaluates op(lhs, arg2).
+        __forceinline__ __device__ typename Op::result_type operator ()(typename TypeTraits<typename Op::first_argument_type>::ParameterType lhs) const
+        { return op(lhs, arg2); }
+    };
+
+    // Convenience factory: builds a binder2nd from `op`, converting `x` to Op::second_argument_type.
+    template <typename Op, typename T>
+    __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
+    { return binder2nd<Op>(op, typename Op::second_argument_type(x)); }
+
+    // Functor Traits
+    template <typename F> struct IsUnaryFunction // value is nonzero when the Yes overload of check() is selected for an F
+    {
+        typedef char Yes;
+        struct No {Yes a[2];}; // two chars, so sizeof(No) differs from sizeof(Yes)
+        // Overload chosen when an F converts to some unary_function<T, D>; the variadic overload is the fallback.
+        template <typename T, typename D> static Yes check(unary_function<T, D>);
+        static No check(...);
+        // Only declared in this struct: it is used inside sizeof to name a value of type F.
+        static F makeF();
+
+        enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
+    };
+
+    template <typename F> struct IsBinaryFunction // value is nonzero when the Yes overload of check() is selected for an F
+    {
+        typedef char Yes;
+        struct No {Yes a[2];}; // two chars, so sizeof(No) differs from sizeof(Yes)
+        // Overload chosen when an F converts to some binary_function<T1, T2, D>; the variadic overload is the fallback.
+        template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>);
+        static No check(...);
+        // Only declared in this struct: it is used inside sizeof to name a value of type F.
+        static F makeF();
+
+        enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
+    };
+
+    namespace functional_detail // helpers that derive a compile-time `shift` constant from a functor's argument/result types
+    {
+        template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; }; // default: shift 1
+        template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; }; // 1-byte destination: shift 4
+        template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; }; // 2-byte destination: shift 2
+        // Looks up UnOpShift from sizeof(T) (source) and sizeof(D) (destination).
+        template <typename T, typename D> struct DefaultUnaryShift
+        {
+            enum { shift = UnOpShift<sizeof(T), sizeof(D)>::shift };
+        };
+        // Same table for two-input functors; only the destination size selects the specialization.
+        template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };
+        template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };
+        template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };
+        // Looks up BinOpShift from the sizes of both argument types and the result type.
+        template <typename T1, typename T2, typename D> struct DefaultBinaryShift
+        {
+            enum { shift = BinOpShift<sizeof(T1), sizeof(T2), sizeof(D)>::shift };
+        };
+        // Picks the unary or binary lookup depending on IsUnaryFunction<Func>::value.
+        template <typename Func, bool unary = IsUnaryFunction<Func>::value> struct ShiftDispatcher;
+        template <typename Func> struct ShiftDispatcher<Func, true>
+        {
+            enum { shift = DefaultUnaryShift<typename Func::argument_type, typename Func::result_type>::shift };
+        };
+        template <typename Func> struct ShiftDispatcher<Func, false> // anything not detected as unary is treated as binary
+        {
+            enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift };
+        };
+    }
+
+    template <typename Func> struct DefaultTransformShift // exposes the shift picked by functional_detail::ShiftDispatcher for Func
+    {
+        enum { shift = functional_detail::ShiftDispatcher<Func>::shift };
+    };
+
+    template <typename Func> struct DefaultTransformFunctorTraits // default compile-time constants attached to a functor type
+    {
+        enum { simple_block_dim_x = 16 };
+        enum { simple_block_dim_y = 16 };
+        // NOTE(review): the simple_/smart_ prefixes presumably correspond to two transform kernel variants - confirm in the transform headers.
+        enum { smart_block_dim_x = 16 };
+        enum { smart_block_dim_y = 16 };
+        enum { smart_shift = DefaultTransformShift<Func>::shift };
+    };
+
+    template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {}; // inherits every default; specialise per functor to override
+
+#define OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(type) /* emits only the head of a TransformFunctorTraits<type> specialization; the expansion contains no struct body */ \
+    template <> struct TransformFunctorTraits< type > : DefaultTransformFunctorTraits< type >
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_FUNCTIONAL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/limits.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/limits.hpp
new file mode 100644
index 000000000000..7e15ed629a39
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/limits.hpp
@@ -0,0 +1,128 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_LIMITS_HPP
+#define OPENCV_CUDA_LIMITS_HPP
+
+#include <limits.h>
+#include <float.h>
+#include "common.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+template <class T> struct numeric_limits;
+// Explicit specialisations: an is_signed flag plus device-callable min()/max() (and epsilon() for float/double).
+template <> struct numeric_limits<bool>
+{
+    static const bool is_signed = false;
+    static __device__ __forceinline__ bool min() { return false; }
+    static __device__ __forceinline__ bool max() { return true;  }
+};
+
+template <> struct numeric_limits<signed char>
+{
+    static const bool is_signed = true;
+    static __device__ __forceinline__ signed char min() { return SCHAR_MIN; }
+    static __device__ __forceinline__ signed char max() { return SCHAR_MAX; }
+};
+
+template <> struct numeric_limits<unsigned char>
+{
+    static const bool is_signed = false;
+    static __device__ __forceinline__ unsigned char min() { return 0; }
+    static __device__ __forceinline__ unsigned char max() { return UCHAR_MAX; }
+};
+
+template <> struct numeric_limits<short>
+{
+    static const bool is_signed = true;
+    static __device__ __forceinline__ short min() { return SHRT_MIN; }
+    static __device__ __forceinline__ short max() { return SHRT_MAX; }
+};
+
+template <> struct numeric_limits<unsigned short>
+{
+    static const bool is_signed = false;
+    static __device__ __forceinline__ unsigned short min() { return 0; }
+    static __device__ __forceinline__ unsigned short max() { return USHRT_MAX; }
+};
+
+template <> struct numeric_limits<int>
+{
+    static const bool is_signed = true;
+    static __device__ __forceinline__ int min() { return INT_MIN; }
+    static __device__ __forceinline__ int max() { return INT_MAX; }
+};
+
+template <> struct numeric_limits<unsigned int>
+{
+    static const bool is_signed = false;
+    static __device__ __forceinline__ unsigned int min() { return 0; }
+    static __device__ __forceinline__ unsigned int max() { return UINT_MAX; }
+};
+
+template <> struct numeric_limits<float>
+{
+    static const bool is_signed = true;
+    static __device__ __forceinline__ float min() { return FLT_MIN; }
+    static __device__ __forceinline__ float max() { return FLT_MAX; }
+    static __device__ __forceinline__ float epsilon() { return FLT_EPSILON; }
+};
+
+template <> struct numeric_limits<double>
+{
+    static const bool is_signed = true;
+    static __device__ __forceinline__ double min() { return DBL_MIN; }
+    static __device__ __forceinline__ double max() { return DBL_MAX; }
+    static __device__ __forceinline__ double epsilon() { return DBL_EPSILON; }
+};
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_LIMITS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/reduce.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/reduce.hpp
new file mode 100644
index 000000000000..fb74de95a8ab
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/reduce.hpp
@@ -0,0 +1,230 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_REDUCE_HPP
+#define OPENCV_CUDA_REDUCE_HPP
+
+#ifndef THRUST_DEBUG // eliminate -Wundef warning
+#define THRUST_DEBUG 0
+#endif
+
+#include <thrust/tuple.h>
+#include "detail/reduce.hpp"
+#include "detail/reduce_key_val.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device // front-ends that forward to the reduce_detail / reduce_key_val_detail dispatchers
+{
+    template <int N, typename T, class Op> // single-value form: forwards to the reductor selected by reduce_detail::Dispatcher<N>
+    __device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op)
+    {
+        reduce_detail::Dispatcher<N>::reductor::template reduce<volatile T*, T&, const Op&>(smem, val, tid, op);
+    }
+    template <unsigned int N, typename K, typename V, class Cmp> // key/value form with one key, one value and one comparator
+    __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp)
+    {
+        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, volatile V*, V&, const Cmp&>(skeys, key, svals, val, tid, cmp);
+    }
+#if (CUDART_VERSION < 12040) // details: https://github.com/opencv/opencv_contrib/issues/3690
+    template <int N, // tuple form of reduce, spelled with ten explicit template slots per thrust::tuple
+              typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+              typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
+              class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
+    __device__ __forceinline__ void reduce(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                           const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                           unsigned int tid,
+                                           const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
+    {
+        reduce_detail::Dispatcher<N>::reductor::template reduce<
+                const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>&,
+                const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>&,
+                const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>&>(smem, val, tid, op);
+    }
+    // Key/value form: one scalar key, a tuple of values, one comparator.
+    template <unsigned int N,
+              typename K,
+              typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+              class Cmp>
+    __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key,
+                                                 const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                 unsigned int tid, const Cmp& cmp)
+    {
+        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&,
+                const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
+                const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
+                const Cmp&>(skeys, key, svals, val, tid, cmp);
+    }
+    // Key/value form: tuples of keys, values and comparators.
+    template <unsigned int N,
+              typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
+              typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
+              typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+              class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
+    __device__ __forceinline__ void reduceKeyVal(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
+                                                 const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
+                                                 const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                 unsigned int tid,
+                                                 const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp)
+    {
+        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<
+                const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>&,
+                const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>&,
+                const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
+                const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
+                const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>&
+                >(skeys, key, svals, val, tid, cmp);
+    }
+#else // CUDART_VERSION >= 12040: the same three tuple overloads written with template parameter packs
+    template <int N, typename... P, typename... R, class... Op>
+    __device__ __forceinline__ void reduce(const thrust::tuple<P...>& smem, const thrust::tuple<R...>& val, unsigned int tid, const thrust::tuple<Op...>& op)
+    {
+        reduce_detail::Dispatcher<N>::reductor::template reduce<const thrust::tuple<P...>&, const thrust::tuple<R...>&, const thrust::tuple<Op...>&>(smem, val, tid, op);
+    }
+    // Key/value form: one scalar key, a tuple of values, one comparator.
+    template <unsigned int N, typename K, typename... VP, typename... VR, class Cmp>
+    __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, const thrust::tuple<VP...>& svals, const thrust::tuple<VR...>& val, unsigned int tid, const Cmp& cmp)
+    {
+        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, const thrust::tuple<VP...>&, const thrust::tuple<VR...>&, const Cmp&>(skeys, key, svals, val, tid, cmp);
+    }
+    // Key/value form: tuples of keys, values and comparators.
+    template <unsigned int N, typename... KP, typename... KR, typename... VP, typename... VR, class... Cmp>
+    __device__ __forceinline__ void reduceKeyVal(const thrust::tuple<KP...>& skeys, const thrust::tuple<KR...>& key, const thrust::tuple<VP...>& svals, const thrust::tuple<VR...>& val, unsigned int tid, const thrust::tuple<Cmp...>& cmp)
+    {
+        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<const thrust::tuple<KP...>&, const thrust::tuple<KR...>&, const thrust::tuple<VP...>&, const thrust::tuple<VR...>&, const thrust::tuple<Cmp...>&>(skeys, key, svals, val, tid, cmp);
+    }
+#endif
+
+    // smem_tuple: one overload per arity from 1 to 10; each pointer is cast to a volatile pointer and packed with thrust::make_tuple
+
+    template <typename T0>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*>
+    smem_tuple(T0* t0)
+    {
+        return thrust::make_tuple((volatile T0*) t0);
+    }
+
+    template <typename T0, typename T1>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*>
+    smem_tuple(T0* t0, T1* t1)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1);
+    }
+
+    template <typename T0, typename T1, typename T2>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*>
+    smem_tuple(T0* t0, T1* t1, T2* t2)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2);
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*>
+    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3);
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*>
+    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4);
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*>
+    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5);
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*>
+    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6);
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*>
+    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7);
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*>
+    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8);
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*, volatile T9*>
+    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9);
+    }
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_REDUCE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/saturate_cast.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/saturate_cast.hpp
new file mode 100644
index 000000000000..c3a3d1cb8338
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/saturate_cast.hpp
@@ -0,0 +1,292 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_SATURATE_CAST_HPP
+#define OPENCV_CUDA_SATURATE_CAST_HPP
+
+#include "common.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); } // primary templates: plain _Tp(v) conversion, no clamping
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); } // one overload per source type; _Tp must be named explicitly by the caller
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }
+
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v) // schar -> uchar via PTX cvt with .sat
+    {
+        uint res = 0;
+        int vi = v; // copy into an int so the input can be bound with the "r" (32-bit register) constraint
+        asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi));
+        return res;
+    }
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v) // short -> uchar via PTX cvt with .sat
+    {
+        uint res = 0;
+        asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v)); // "h": 16-bit register operand
+        return res;
+    }
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v) // ushort -> uchar via PTX cvt with .sat
+    {
+        uint res = 0;
+        asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v) // int -> uchar via PTX cvt with .sat
+    {
+        uint res = 0;
+        asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v) // uint -> uchar via PTX cvt with .sat
+    {
+        uint res = 0;
+        asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v) // float -> uchar; .rni requests rounding to the nearest integer
+    {
+        uint res = 0;
+        asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v) // double -> uchar
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 // direct f64 conversion only when compiling for arch >= 130
+        uint res = 0;
+        asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v));
+        return res;
+    #else // otherwise narrow to float and reuse the float specialization
+        return saturate_cast<uchar>((float)v);
+    #endif
+    }
+
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v) // uchar -> schar via PTX cvt with .sat
+    {
+        uint res = 0;
+        uint vi = v; // copy into a uint so the input can be bound with the "r" (32-bit register) constraint
+        asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi));
+        return res; // the 32-bit register value is converted to schar by the return
+    }
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(short v) // short -> schar via PTX cvt with .sat
+    {
+        uint res = 0;
+        asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v)); // "h": 16-bit register operand
+        return res;
+    }
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v) // ushort -> schar via PTX cvt with .sat
+    {
+        uint res = 0;
+        asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(int v) // int -> schar via PTX cvt with .sat
+    {
+        uint res = 0;
+        asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v) // uint -> schar via PTX cvt with .sat
+    {
+        uint res = 0;
+        asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(float v) // float -> schar; .rni requests rounding to the nearest integer
+    {
+        uint res = 0;
+        asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(double v) // double -> schar
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 // direct f64 conversion only when compiling for arch >= 130
+        uint res = 0;
+        asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v));
+        return res;
+    #else // otherwise narrow to float and reuse the float specialization
+        return saturate_cast<schar>((float)v);
+    #endif
+    }
+
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v) // schar -> ushort via PTX cvt with .sat
+    {
+        ushort res = 0;
+        int vi = v; // copy into an int so the input can be bound with the "r" (32-bit register) constraint
+        asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi)); // "=h": 16-bit register output
+        return res;
+    }
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v) // short -> ushort via PTX cvt with .sat
+    {
+        ushort res = 0;
+        asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v) // int -> ushort via PTX cvt with .sat
+    {
+        ushort res = 0;
+        asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v) // uint -> ushort via PTX cvt with .sat
+    {
+        ushort res = 0;
+        asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v) // float -> ushort; .rni requests rounding to the nearest integer
+    {
+        ushort res = 0;
+        asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v) // double -> ushort
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 // direct f64 conversion only when compiling for arch >= 130
+        ushort res = 0;
+        asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v));
+        return res;
+    #else // otherwise narrow to float and reuse the float specialization
+        return saturate_cast<ushort>((float)v);
+    #endif
+    }
+
+    template<> __device__ __forceinline__ short saturate_cast<short>(ushort v) // ushort -> short via PTX cvt with .sat
+    {
+        short res = 0;
+        asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v)); // "h": 16-bit register operands on both sides
+        return res;
+    }
+    template<> __device__ __forceinline__ short saturate_cast<short>(int v) // int -> short via PTX cvt with .sat
+    {
+        short res = 0;
+        asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ short saturate_cast<short>(uint v) // uint -> short via PTX cvt with .sat
+    {
+        short res = 0;
+        asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ short saturate_cast<short>(float v) // float -> short; .rni requests rounding to the nearest integer
+    {
+        short res = 0;
+        asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ short saturate_cast<short>(double v) // double -> short
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 // direct f64 conversion only when compiling for arch >= 130
+        short res = 0;
+        asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v));
+        return res;
+    #else // otherwise narrow to float and reuse the float specialization
+        return saturate_cast<short>((float)v);
+    #endif
+    }
+
+    template<> __device__ __forceinline__ int saturate_cast<int>(uint v) // uint -> int via PTX cvt with .sat
+    {
+        int res = 0;
+        asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ int saturate_cast<int>(float v) // float -> int through the __float2int_rn intrinsic
+    {
+        return __float2int_rn(v); // NOTE(review): presumably relied on to clamp out-of-range inputs - confirm in the CUDA math API docs
+    }
+    template<> __device__ __forceinline__ int saturate_cast<int>(double v) // double -> int
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 // double intrinsic only when compiling for arch >= 130
+        return __double2int_rn(v);
+    #else // otherwise narrow to float and reuse the float specialization
+        return saturate_cast<int>((float)v);
+    #endif
+    }
+
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(schar v) // schar -> uint via PTX cvt with .sat
+    {
+        uint res = 0;
+        int vi = v; // copy into an int so the input can be bound with the "r" (32-bit register) constraint
+        asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi));
+        return res;
+    }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(short v) // short -> uint via PTX cvt with .sat
+    {
+        uint res = 0;
+        asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v)); // "h": 16-bit register operand
+        return res;
+    }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(int v) // int -> uint via PTX cvt with .sat
+    {
+        uint res = 0;
+        asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(float v) // float -> uint through the __float2uint_rn intrinsic
+    {
+        return __float2uint_rn(v); // NOTE(review): presumably relied on to clamp out-of-range inputs - confirm in the CUDA math API docs
+    }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(double v) // double -> uint
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 // double intrinsic only when compiling for arch >= 130
+        return __double2uint_rn(v);
+    #else // otherwise narrow to float and reuse the float specialization
+        return saturate_cast<uint>((float)v);
+    #endif
+    }
+}}}
+
+//! @endcond
+
+#endif /* OPENCV_CUDA_SATURATE_CAST_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/scan.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/scan.hpp
new file mode 100644
index 000000000000..e128fb0962ef
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/scan.hpp
@@ -0,0 +1,258 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_SCAN_HPP
+#define OPENCV_CUDA_SCAN_HPP
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/utility.hpp"
+#include "opencv2/core/cuda/warp.hpp"
+#include "opencv2/core/cuda/warp_shuffle.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    enum ScanKind { EXCLUSIVE = 0,  INCLUSIVE = 1 }; // INCLUSIVE: an element's result includes its own value; EXCLUSIVE: only the elements before it
+
+    template <ScanKind Kind, typename T, typename F> struct WarpScan // scan functor over 32-entry segments of a caller-supplied buffer, combining with F
+    {
+        __device__ __forceinline__ WarpScan() {}
+        __device__ __forceinline__ WarpScan(const WarpScan& other) { CV_UNUSED(other); }
+        // Scans in place the 32-entry segment that contains idx and returns the result for entry idx.
+        __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
+        {
+            const unsigned int lane = idx & 31; // position of idx inside its 32-entry segment
+            F op;
+            // Offsets double each step (1, 2, 4, 8, 16); the lane guards keep every read inside the segment.
+            if ( lane >=  1) ptr [idx ] = op(ptr [idx -  1], ptr [idx]);
+            if ( lane >=  2) ptr [idx ] = op(ptr [idx -  2], ptr [idx]);
+            if ( lane >=  4) ptr [idx ] = op(ptr [idx -  4], ptr [idx]);
+            if ( lane >=  8) ptr [idx ] = op(ptr [idx -  8], ptr [idx]);
+            if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
+            // NOTE(review): no barrier between the steps - presumably relies on a warp's threads running in lockstep; confirm.
+            if( Kind == INCLUSIVE )
+                return ptr [idx];
+            else
+                return (lane > 0) ? ptr [idx - 1] : 0; // exclusive: the left neighbour's inclusive value, 0 for lane 0
+        }
+
+        __device__ __forceinline__ unsigned int index(const unsigned int tid) // thread id -> buffer index (identity here)
+        {
+            return tid;
+        }
+
+        __device__ __forceinline__ void init(volatile T *ptr){} // intentionally empty for this layout
+
+        static const int warp_offset      = 0; // base index BlockScan uses when it stores per-warp totals
+
+        typedef WarpScan<INCLUSIVE, T, F>  merge; // inclusive variant; BlockScan instantiates it as Sc::merge
+    };
+
+    template <ScanKind Kind , typename T, typename F> struct WarpScanNoComp // like WarpScan but without per-step lane comparisons
+    {
+        __device__ __forceinline__ WarpScanNoComp() {}
+        __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { CV_UNUSED(other); }
+        // Scans in place around ptr[idx]; reads up to 16 entries below idx unconditionally.
+        __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
+        {
+            const unsigned int lane = threadIdx.x & 31; // taken from the thread index, not from idx
+            F op;
+            // Unguarded doubling steps; index() leaves 16 entries in front of each warp's data for these reads.
+            ptr [idx ] = op(ptr [idx -  1], ptr [idx]);
+            ptr [idx ] = op(ptr [idx -  2], ptr [idx]);
+            ptr [idx ] = op(ptr [idx -  4], ptr [idx]);
+            ptr [idx ] = op(ptr [idx -  8], ptr [idx]);
+            ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
+            // NOTE(review): presumably the 16 leading entries must hold F's neutral value (0) - confirm against callers.
+            if( Kind == INCLUSIVE )
+                return ptr [idx];
+            else
+                return (lane > 0) ? ptr [idx - 1] : 0; // exclusive: the left neighbour's inclusive value, 0 for lane 0
+        }
+
+        __device__ __forceinline__ unsigned int index(const unsigned int tid) // thread id -> padded buffer index
+        {
+            return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask); // warp * stride + 16 leading entries + lane
+        }
+
+        __device__ __forceinline__ void init(volatile T *ptr) // zeroes one entry per calling thread
+        {
+            ptr[threadIdx.x] = 0;
+        }
+
+        static const int warp_smem_stride = 32 + 16 + 1; // entries reserved per warp in the padded layout
+        static const int warp_offset      = 16; // base index BlockScan uses when it stores per-warp totals
+        static const int warp_log         = 5; // log2(32): tid >> 5 is the warp number
+        static const int warp_mask        = 31; // tid & 31 is the lane number
+
+        typedef WarpScanNoComp<INCLUSIVE, T, F> merge; // inclusive variant; BlockScan instantiates it as Sc::merge
+    };
+
+    template <ScanKind Kind , typename T, typename Sc, typename F> struct BlockScan // block-wide scan assembled from the per-warp scanner Sc
+    {
+        __device__ __forceinline__ BlockScan() {}
+        __device__ __forceinline__ BlockScan(const BlockScan& other) { CV_UNUSED(other); }
+        // Scans the values in ptr (laid out according to Sc::index) in place and returns the calling thread's result.
+        __device__ __forceinline__ T operator()(volatile T *ptr)
+        {
+            const unsigned int tid  = threadIdx.x;
+            const unsigned int lane = tid & warp_mask;
+            const unsigned int warp = tid >> warp_log;
+
+            Sc scan;
+            typename Sc::merge merge_scan; // inclusive scanner used below to combine the per-warp totals
+            const unsigned int idx = scan.index(tid);
+            // Phase 1: every warp scans its own segment.
+            T val = scan(ptr, idx);
+            __syncthreads ();
+            // Phase 2: warp 0 runs the scanner's init hook on the buffer.
+            if( warp == 0)
+                scan.init(ptr);
+            __syncthreads ();
+            // Phase 3: the last lane of each warp stores a per-warp value at warp_offset + warp.
+            if( lane == 31 )
+                ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
+            __syncthreads ();
+            // Phase 4: warp 0 scans the stored per-warp values with the inclusive scanner.
+            if( warp == 0 )
+                merge_scan(ptr, idx);
+            __syncthreads();
+            // Phase 5: NOTE(review): the previous warps' total is added with operator+, not F - confirm F is always addition.
+            if ( warp > 0)
+                val = ptr [scan.warp_offset + warp - 1] + val;
+            __syncthreads ();
+            // Phase 6: publish the final value back into the buffer.
+            ptr[idx] = val;
+            __syncthreads ();
+
+            return val ;
+        }
+
+        static const int warp_log  = 5; // log2(32): tid >> 5 is the warp number
+        static const int warp_mask = 31; // tid & 31 is the lane number
+    };
+
+    template <typename T>
+    __device__ T warpScanInclusive(T idata, volatile T* s_Data, unsigned int tid) // inclusive prefix sum of idata across the lanes of a warp
+    {
+    #if __CUDA_ARCH__ >= 300 // shuffle-based path; s_Data and tid are not used here
+        const unsigned int laneId = cv::cuda::device::Warp::laneId();
+
+        // scan built on the warp shuffle helpers
+        #pragma unroll
+        for (int i = 1; i <= (OPENCV_CUDA_WARP_SIZE / 2); i *= 2)
+        {
+            const T n = cv::cuda::device::shfl_up(idata, i); // presumably the value held i lanes below - confirm in warp_shuffle.hpp
+            if (laneId >= i) // lanes below i have no partner at that distance, so they skip the add
+                  idata += n;
+        }
+
+        return idata;
+    #else // buffer-based path (also what the preprocessor selects when __CUDA_ARCH__ is undefined)
+        unsigned int pos = 2 * tid - (tid & (OPENCV_CUDA_WARP_SIZE - 1)); // two warp-sized slots per warp: zeros first, then data
+        s_Data[pos] = 0;
+        pos += OPENCV_CUDA_WARP_SIZE;
+        s_Data[pos] = idata;
+        // Unguarded steps at distances 1..16: low lanes read the zeroed slot below their data.
+        s_Data[pos] += s_Data[pos - 1];
+        s_Data[pos] += s_Data[pos - 2];
+        s_Data[pos] += s_Data[pos - 4];
+        s_Data[pos] += s_Data[pos - 8];
+        s_Data[pos] += s_Data[pos - 16];
+
+        return s_Data[pos];
+    #endif
+    }
+
+    template <typename T> // exclusive warp scan derived from the inclusive one
+    __device__ __forceinline__ T warpScanExclusive(T idata, volatile T* s_Data, unsigned int tid)
+    {
+        return warpScanInclusive(idata, s_Data, tid) - idata; // drop this thread's own contribution from the inclusive result
+    }
+
+    template <int tiNumScanThreads, typename T> // tiNumScanThreads: compile-time number of participating threads
+    __device__ T blockScanInclusive(T idata, volatile T* s_Data, unsigned int tid) // inclusive prefix sum of idata over tiNumScanThreads threads
+    {
+        if (tiNumScanThreads > OPENCV_CUDA_WARP_SIZE) // more than one warp: scan per warp, then scan the warp totals
+        {
+            //Bottom-level inclusive warp scan
+            T warpResult = warpScanInclusive(idata, s_Data, tid);
+
+            //Save top elements of each warp for exclusive warp scan
+            //sync to wait for warp scans to complete (because s_Data is being overwritten)
+            __syncthreads();
+            if ((tid & (OPENCV_CUDA_WARP_SIZE - 1)) == (OPENCV_CUDA_WARP_SIZE - 1)) // last lane of each warp
+            {
+                s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE] = warpResult; // slot index = warp number
+            }
+
+            //wait until every warp's top element has been stored
+            __syncthreads();
+
+            if (tid < (tiNumScanThreads / OPENCV_CUDA_WARP_SIZE) ) // one thread per stored warp total
+            {
+                //grab top warp elements
+                T val = s_Data[tid];
+                //calculate exclusive scan and write back to shared memory
+                s_Data[tid] = warpScanExclusive(val, s_Data, tid);
+            }
+
+            //wait for the scanned warp totals before combining them with the per-warp results
+            __syncthreads();
+
+            return warpResult + s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE];
+        }
+        else // a single warp suffices
+        {
+            return warpScanInclusive(idata, s_Data, tid);
+        }
+    }
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_SCAN_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/simd_functions.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/simd_functions.hpp
new file mode 100644
index 000000000000..3d8c2e0d8e56
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/simd_functions.hpp
@@ -0,0 +1,869 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/*
+ * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ *
+ *   Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ *   Neither the name of NVIDIA Corporation nor the names of its contributors
+ *   may be used to endorse or promote products derived from this software
+ *   without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef OPENCV_CUDA_SIMD_FUNCTIONS_HPP
+#define OPENCV_CUDA_SIMD_FUNCTIONS_HPP
+
+#include "common.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    // 2: functions operating on the two 16-bit halfwords packed into each 32-bit word
+
+    static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+        // Adds the two 16-bit halfwords of a and b independently; __CUDA_ARCH__ selects the implementation.
+    #if __CUDA_ARCH__ >= 300
+        asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // r (0) is also passed as the fourth operand
+    #elif __CUDA_ARCH__ >= 200
+        asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // low halfword (.h0)
+        asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // high halfword (.h1)
+    #else // plain-C fallback (also what the preprocessor selects when __CUDA_ARCH__ is undefined)
+        unsigned int s;
+        s = a ^ b;          // sum bits
+        r = a + b;          // actual sum
+        s = s ^ r;          // determine carry-ins for each bit position
+        s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
+        r = r - s;          // subtract out carry-out from low word
+    #endif
+        // NOTE(review): the PTX paths request .sat while the fallback wraps each halfword - confirm this is intended.
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+        // Subtracts the two 16-bit halfwords of b from those of a independently; __CUDA_ARCH__ selects the implementation.
+    #if __CUDA_ARCH__ >= 300
+        asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // r (0) is also passed as the fourth operand
+    #elif __CUDA_ARCH__ >= 200
+        asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // low halfword (.h0)
+        asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // high halfword (.h1)
+    #else // plain-C fallback (also what the preprocessor selects when __CUDA_ARCH__ is undefined)
+        unsigned int s;
+        s = a ^ b;          // difference bits, ignoring borrows
+        r = a - b;          // actual 32-bit difference
+        s = s ^ r;          // determine borrow-ins for each bit position
+        s = s & 0x00010000; // borrow to high word
+        r = r + s;          // compensate for borrow from low word
+    #endif
+        // NOTE(review): the PTX paths request .sat while the fallback wraps each halfword - confirm this is intended.
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+        // Absolute difference |a - b| of each unsigned 16-bit halfword; __CUDA_ARCH__ selects the implementation.
+    #if __CUDA_ARCH__ >= 300
+        asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // r (0) is also passed as the fourth operand
+    #elif __CUDA_ARCH__ >= 200
+        asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // low halfword (.h0)
+        asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // high halfword (.h1)
+    #else // plain-C fallback: per-halfword max minus per-halfword min
+        unsigned int s, t, u, v;
+        s = a & 0x0000ffff; // extract low halfword
+        r = b & 0x0000ffff; // extract low halfword
+        u = ::max(r, s);    // maximum of low halfwords
+        v = ::min(r, s);    // minimum of low halfwords
+        s = a & 0xffff0000; // extract high halfword
+        r = b & 0xffff0000; // extract high halfword
+        t = ::max(r, s);    // maximum of high halfwords
+        s = ::min(r, s);    // minimum of high halfwords
+        r = u | t;          // maximum of both halfwords
+        s = v | s;          // minimum of both halfwords
+        r = r - s;          // |a - b| = max(a,b) - min(a,b);
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
+    {
+        // Average of each 16-bit halfword of a and b, rounded down.
+        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
+        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
+        const unsigned int sharedBits = a & b;
+
+        // Clear bit 0 of each halfword before shifting so the low bit of the
+        // upper halfword cannot leak into the lower halfword.
+        const unsigned int halvedDiff = ((a ^ b) & 0xfffefffe) >> 1;
+
+        // Each halfword sum is at most 0xffff, so the 32-bit add cannot carry across halfwords.
+        return sharedBits + halvedDiff;
+    }
+
+    static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+        // Average of each 16-bit halfword of a and b, rounded up: (a + b + 1) / 2 per halfword.
+    #if __CUDA_ARCH__ >= 300
+        asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // r (0) is also passed as the fourth operand
+    #else
+        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
+        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
+        unsigned int s;
+        s = a ^ b;
+        r = a | b;
+        s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries
+        s = s >> 1;
+        r = r - s;
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+        // Per-halfword equality test: yields a 0/1 flag per halfword (bits 0 and 16 in the fallback path).
+    #if __CUDA_ARCH__ >= 300
+        asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // r (0) is also passed as the fourth operand
+    #else
+        // inspired by Alan Mycroft's null-byte detection algorithm:
+        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
+        unsigned int c;
+        r = a ^ b;          // 0x0000 if a == b
+        c = r | 0x80008000; // set msbs, to catch carry out
+        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
+        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
+        c = r & ~c;         // msb = 1, if r was 0x0000
+        r = c >> 15;        // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
+    {
+        unsigned int r, c;
+        // Per-halfword equality test returning a mask: 0xffff where the halfwords match, 0x0000 otherwise.
+    #if __CUDA_ARCH__ >= 300 // (r << 16) - r expands bit 0 to 0x0000ffff and bit 16 to 0xffff0000 (mod 2^32)
+        r = vseteq2(a, b);
+        c = r << 16;        // convert bool
+        r = c - r;          //  into mask
+    #else
+        // inspired by Alan Mycroft's null-byte detection algorithm:
+        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
+        r = a ^ b;          // 0x0000 if a == b
+        c = r | 0x80008000; // set msbs, to catch carry out
+        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
+        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
+        c = r & ~c;         // msb = 1, if r was 0x0000
+        r = c >> 15;        // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+        // Per-halfword unsigned a >= b test: yields a 0/1 flag per halfword (bits 0 and 16 in the fallback path).
+    #if __CUDA_ARCH__ >= 300
+        asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // r (0) is also passed as the fourth operand
+    #else
+        unsigned int c;
+        asm("not.b32 %0, %0;" : "+r"(b)); // b = ~b, complemented in place
+        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
+        c = c & 0x80008000; // msb = carry-outs
+        r = c >> 15;        // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
+    {
+        unsigned int r, c;
+        // Per-halfword unsigned a >= b test returning a mask: 0xffff where true, 0x0000 otherwise.
+    #if __CUDA_ARCH__ >= 300 // (r << 16) - r expands bit 0 to 0x0000ffff and bit 16 to 0xffff0000 (mod 2^32)
+        r = vsetge2(a, b);
+        c = r << 16;        // convert bool
+        r = c - r;          //  into mask
+    #else
+        asm("not.b32 %0, %0;" : "+r"(b)); // b = ~b, complemented in place
+        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
+        c = c & 0x80008000; // msb = carry-outs
+        r = c >> 15;        // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+        // Per-halfword unsigned a > b test: yields a 0/1 flag per halfword (bits 0 and 16 in the fallback path).
+    #if __CUDA_ARCH__ >= 300
+        asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // r (0) is also passed as the fourth operand
+    #else
+        unsigned int c;
+        asm("not.b32 %0, %0;" : "+r"(b)); // b = ~b, complemented in place
+        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
+        c = c & 0x80008000; // msbs = carry-outs
+        r = c >> 15;        // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
+    {
+        unsigned int r, c;
+        // Per-halfword unsigned a > b test returning a mask: 0xffff where true, 0x0000 otherwise.
+    #if __CUDA_ARCH__ >= 300 // (r << 16) - r expands bit 0 to 0x0000ffff and bit 16 to 0xffff0000 (mod 2^32)
+        r = vsetgt2(a, b);
+        c = r << 16;        // convert bool
+        r = c - r;          //  into mask
+    #else
+        asm("not.b32 %0, %0;" : "+r"(b)); // b = ~b, complemented in place
+        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
+        c = c & 0x80008000; // msbs = carry-outs
+        r = c >> 15;        // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
+    {   // Per 16-bit halfword: 1 if a <= b (unsigned compare), else 0.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // software emulation path
+        unsigned int c;
+        asm("not.b32 %0, %0;" : "+r"(a)); // a = ~a (local copy only)
+        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
+        c = c & 0x80008000; // msb = carry-outs
+        r = c >> 15;        // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
+    {   // Per 16-bit halfword: 0xffff if a <= b (unsigned compare), else 0x0000.
+        unsigned int r, c;
+
+    #if __CUDA_ARCH__ >= 300
+        r = vsetle2(a, b);  // 0 or 1 per halfword
+        c = r << 16;        // convert bool
+        r = c - r;          //  into mask
+    #else // software emulation path
+        asm("not.b32 %0, %0;" : "+r"(a)); // a = ~a (local copy only)
+        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
+        c = c & 0x80008000; // msb = carry-outs
+        r = c >> 15;        // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
+    {   // Per 16-bit halfword: 1 if a < b (unsigned compare), else 0.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // software emulation path
+        unsigned int c;
+        asm("not.b32 %0, %0;" : "+r"(a)); // a = ~a (local copy only)
+        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
+        c = c & 0x80008000; // msb = carry-outs
+        r = c >> 15;        // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
+    {   // Per 16-bit halfword: 0xffff if a < b (unsigned compare), else 0x0000.
+        unsigned int r, c;
+
+    #if __CUDA_ARCH__ >= 300
+        r = vsetlt2(a, b);  // 0 or 1 per halfword
+        c = r << 16;        // convert bool
+        r = c - r;          //  into mask
+    #else // software emulation path
+        asm("not.b32 %0, %0;" : "+r"(a)); // a = ~a (local copy only)
+        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
+        c = c & 0x80008000; // msb = carry-outs
+        r = c >> 15;        // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
+    {   // Per 16-bit halfword: 1 if a != b, else 0.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm ("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // software emulation path
+        // inspired by Alan Mycroft's null-byte detection algorithm:
+        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
+        unsigned int c;
+        r = a ^ b;          // 0x0000 if a == b
+        c = r | 0x80008000; // set msbs, to catch carry out
+        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
+        c = r | c;          // msb = 1, if r was not 0x0000
+        c = c & 0x80008000; // extract msbs
+        r = c >> 15;        // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
+    {   // Per 16-bit halfword: 0xffff if a != b, else 0x0000.
+        unsigned int r, c;
+
+    #if __CUDA_ARCH__ >= 300
+        r = vsetne2(a, b);  // 0 or 1 per halfword
+        c = r << 16;        // convert bool
+        r = c - r;          //  into mask
+    #else // software emulation path
+        // inspired by Alan Mycroft's null-byte detection algorithm:
+        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
+        r = a ^ b;          // 0x0000 if a == b
+        c = r | 0x80008000; // set msbs, to catch carry out
+        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
+        c = r | c;          // msb = 1, if r was not 0x0000
+        c = c & 0x80008000; // extract msbs
+        r = c >> 15;        // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
+    {   // Per 16-bit halfword: unsigned maximum of a and b.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #elif __CUDA_ARCH__ >= 200 // one scalar vmax per halfword selector (.h0, .h1); r is also passed as the last input
+        asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // plain C++ fallback: handle each halfword separately
+        unsigned int s, t, u;
+        r = a & 0x0000ffff; // extract low halfword
+        s = b & 0x0000ffff; // extract low halfword
+        t = ::max(r, s);    // maximum of low halfwords
+        r = a & 0xffff0000; // extract high halfword
+        s = b & 0xffff0000; // extract high halfword
+        u = ::max(r, s);    // maximum of high halfwords
+        r = t | u;          // combine halfword maximums
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
+    {   // Per 16-bit halfword: unsigned minimum of a and b.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #elif __CUDA_ARCH__ >= 200 // one scalar vmin per halfword selector (.h0, .h1); r is also passed as the last input
+        asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // plain C++ fallback: handle each halfword separately
+        unsigned int s, t, u;
+        r = a & 0x0000ffff; // extract low halfword
+        s = b & 0x0000ffff; // extract low halfword
+        t = ::min(r, s);    // minimum of low halfwords
+        r = a & 0xffff0000; // extract high halfword
+        s = b & 0xffff0000; // extract high halfword
+        u = ::min(r, s);    // minimum of high halfwords
+        r = t | u;          // combine halfword minimums
+    #endif
+
+        return r;
+    }
+
+    // 4
+
+    static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
+    {   // Per byte: unsigned sum a + b; no carry propagates between bytes.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #elif __CUDA_ARCH__ >= 200 // one scalar vadd per byte selector (.b0 .. .b3); r is also passed as the last input
+        asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // NOTE(review): this path drops the per-byte carry-out (wraps), while the asm paths request .sat -- confirm intended
+        unsigned int s, t;
+        s = a ^ b;          // sum bits
+        r = a & 0x7f7f7f7f; // clear msbs
+        t = b & 0x7f7f7f7f; // clear msbs
+        s = s & 0x80808080; // msb sum bits
+        r = r + t;          // add without msbs, record carry-out in msbs
+        r = r ^ s;          // sum of msb sum and carry-in bits, w/o carry-out
+    #endif /* __CUDA_ARCH__ >= 300 */
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
+    {   // Per byte: unsigned difference a - b; no borrow propagates between bytes.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #elif __CUDA_ARCH__ >= 200 // one scalar vsub per byte selector (.b0 .. .b3); r is also passed as the last input
+        asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // NOTE(review): this path drops the per-byte borrow (wraps), while the asm paths request .sat -- confirm intended
+        unsigned int s, t;
+        s = a ^ ~b;         // inverted sum bits
+        r = a | 0x80808080; // set msbs
+        t = b & 0x7f7f7f7f; // clear msbs
+        s = s & 0x80808080; // inverted msb sum bits
+        r = r - t;          // subtract w/o msbs, record inverted borrows in msb
+        r = r ^ s;          // combine inverted msb sum bits and borrows
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
+    {   // Per byte: unsigned average (a + b) / 2, rounded down.
+        unsigned int r, s;
+
+        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
+        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
+        s = a ^ b;          // bits set in exactly one operand
+        r = a & b;          // bits set in both operands
+        s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
+        s = s >> 1;         // halve the differing bits
+        s = r + s;          // (a & b) + ((a ^ b) >> 1)
+
+        return s;
+    }
+
+    static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
+    {   // Per byte: unsigned rounded average (a + b + 1) / 2.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // software emulation path
+        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
+        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
+        unsigned int c;
+        c = a ^ b;          // bits set in exactly one operand
+        r = a | b;          // bits set in either operand
+        c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
+        c = c >> 1;         // halve the differing bits
+        r = r - c;          // (a | b) - ((a ^ b) >> 1)
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
+    {   // Per byte: 1 if a == b, else 0.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // software emulation path
+        // inspired by Alan Mycroft's null-byte detection algorithm:
+        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
+        unsigned int c;
+        r = a ^ b;          // 0x00 if a == b
+        c = r | 0x80808080; // set msbs, to catch carry out
+        r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
+        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
+        c = r & ~c;         // msb = 1, if r was 0x00
+        r = c >> 7;         // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
+    {   // Per byte: 0xff if a == b, else 0x00.
+        unsigned int r, t;
+
+    #if __CUDA_ARCH__ >= 300
+        r = vseteq4(a, b);  // 0 or 1 per byte
+        t = r << 8;         // convert bool
+        r = t - r;          //  to mask
+    #else // software emulation path
+        // inspired by Alan Mycroft's null-byte detection algorithm:
+        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
+        t = a ^ b;          // 0x00 if a == b
+        r = t | 0x80808080; // set msbs, to catch carry out
+        t = t ^ r;          // extract msbs, msb = 1 if t < 0x80
+        r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
+        r = t & ~r;         // msb = 1, if t was 0x00
+        t = r >> 7;         // build mask
+        t = r - t;          //  from
+        r = t | r;          //   msbs
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
+    {   // Per byte: 1 if a <= b (unsigned compare), else 0.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // software emulation path
+        unsigned int c;
+        asm("not.b32 %0, %0;" : "+r"(a)); // a = ~a (local copy only)
+        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
+        c = c & 0x80808080; // msb = carry-outs
+        r = c >> 7;         // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
+    {   // Per byte: 0xff if a <= b (unsigned compare), else 0x00.
+        unsigned int r, c;
+
+    #if __CUDA_ARCH__ >= 300
+        r = vsetle4(a, b);  // 0 or 1 per byte
+        c = r << 8;         // convert bool
+        r = c - r;          //  to mask
+    #else // software emulation path
+        asm("not.b32 %0, %0;" : "+r"(a)); // a = ~a (local copy only)
+        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
+        c = c & 0x80808080; // msbs = carry-outs
+        r = c >> 7;         // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
+    {   // Per byte: 1 if a < b (unsigned compare), else 0.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // software emulation path
+        unsigned int c;
+        asm("not.b32 %0, %0;" : "+r"(a)); // a = ~a (local copy only)
+        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
+        c = c & 0x80808080; // msb = carry-outs
+        r = c >> 7;         // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
+    {   // Per byte: 0xff if a < b (unsigned compare), else 0x00.
+        unsigned int r, c;
+
+    #if __CUDA_ARCH__ >= 300
+        r = vsetlt4(a, b);  // 0 or 1 per byte
+        c = r << 8;         // convert bool
+        r = c - r;          //  to mask
+    #else // software emulation path
+        asm("not.b32 %0, %0;" : "+r"(a)); // a = ~a (local copy only)
+        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
+        c = c & 0x80808080; // msbs = carry-outs
+        r = c >> 7;         // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
+    {   // Per byte: 1 if a >= b (unsigned compare), else 0.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // software emulation path
+        unsigned int c;
+        asm("not.b32 %0, %0;" : "+r"(b)); // b = ~b (local copy only)
+        c = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
+        c = c & 0x80808080; // msb = carry-outs
+        r = c >> 7;         // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
+    {   // Per byte: 0xff if a >= b (unsigned compare), else 0x00.
+        unsigned int r, s;
+
+    #if __CUDA_ARCH__ >= 300
+        r = vsetge4(a, b);  // 0 or 1 per byte
+        s = r << 8;         // convert bool
+        r = s - r;          //  to mask
+    #else // software emulation path
+        asm ("not.b32 %0,%0;" : "+r"(b)); // b = ~b (local copy only)
+        r = vavrg4 (a, b);  // (a + ~b + 1) / 2 = (a - b) / 2
+        r = r & 0x80808080; // msb = carry-outs
+        s = r >> 7;         // build mask
+        s = r - s;          //  from
+        r = s | r;          //   msbs
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
+    {   // Per byte: 1 if a > b (unsigned compare), else 0.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // software emulation path
+        unsigned int c;
+        asm("not.b32 %0, %0;" : "+r"(b)); // b = ~b (local copy only)
+        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
+        c = c & 0x80808080; // msb = carry-outs
+        r = c >> 7;         // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
+    {   // Per byte: 0xff if a > b (unsigned compare), else 0x00.
+        unsigned int r, c;
+
+    #if __CUDA_ARCH__ >= 300
+        r = vsetgt4(a, b);  // 0 or 1 per byte
+        c = r << 8;         // convert bool
+        r = c - r;          //  to mask
+    #else // software emulation path
+        asm("not.b32 %0, %0;" : "+r"(b)); // b = ~b (local copy only)
+        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
+        c = c & 0x80808080; // msb = carry-outs
+        r = c >> 7;         // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
+    {   // Per byte: 1 if a != b, else 0.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // software emulation path
+        // inspired by Alan Mycroft's null-byte detection algorithm:
+        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
+        unsigned int c;
+        r = a ^ b;          // 0x00 if a == b
+        c = r | 0x80808080; // set msbs, to catch carry out
+        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
+        c = r | c;          // msb = 1, if r was not 0x00
+        c = c & 0x80808080; // extract msbs
+        r = c >> 7;         // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
+    {   // Per byte: 0xff if a != b, else 0x00.
+        unsigned int r, c;
+
+    #if __CUDA_ARCH__ >= 300
+        r = vsetne4(a, b);  // 0 or 1 per byte
+        c = r << 8;         // convert bool
+        r = c - r;          //  to mask
+    #else // software emulation path
+        // inspired by Alan Mycroft's null-byte detection algorithm:
+        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
+        r = a ^ b;          // 0x00 if a == b
+        c = r | 0x80808080; // set msbs, to catch carry out
+        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
+        c = r | c;          // msb = 1, if r was not 0x00
+        c = c & 0x80808080; // extract msbs
+        r = c >> 7;         // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
+    {   // Per byte: unsigned absolute difference |a - b|.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #elif __CUDA_ARCH__ >= 200 // one scalar vabsdiff per byte selector (.b0 .. .b3); r is also passed as the last input
+        asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // software emulation path
+        unsigned int s;
+        s = vcmpge4(a, b);  // mask = 0xff if a >= b
+        r = a ^ b;          // bits in which a and b differ
+        s = (r &  s) ^ b;   // select a when a >= b, else select b => max(a,b)
+        r = s ^ r;          // max ^ (a ^ b) = the other operand => min(a,b)
+        r = s - r;          // |a - b| = max(a,b) - min(a,b); no borrow crosses bytes since max >= min
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
+    {   // Per byte: unsigned maximum of a and b.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #elif __CUDA_ARCH__ >= 200 // one scalar vmax per byte selector (.b0 .. .b3); r is also passed as the last input
+        asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // software emulation path
+        unsigned int s;
+        s = vcmpge4(a, b);  // mask = 0xff if a >= b
+        r = a & s;          // select a when a >= b
+        s = b & ~s;         // select b when a < b
+        r = r | s;          // combine byte selections
+    #endif
+
+        return r;           // byte-wise unsigned maximum
+    }
+
+    static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
+    {   // Per byte: unsigned minimum of a and b.
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #elif __CUDA_ARCH__ >= 200 // one scalar vmin per byte selector (.b0 .. .b3); r is also passed as the last input
+        asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else // software emulation path
+        unsigned int s;
+        s = vcmpge4(b, a);  // mask = 0xff if b >= a (operands swapped)
+        r = a & s;          // select a when b >= a
+        s = b & ~s;         // select b when b < a
+        r = r | s;          // combine byte selections
+    #endif
+
+        return r;           // byte-wise unsigned minimum
+    }
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_SIMD_FUNCTIONS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/transform.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/transform.hpp
new file mode 100644
index 000000000000..42aa6ea170f2
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/transform.hpp
@@ -0,0 +1,75 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_TRANSFORM_HPP
+#define OPENCV_CUDA_TRANSFORM_HPP
+
+#include "common.hpp"
+#include "utility.hpp"
+#include "detail/transform_detail.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template <typename T, typename D, typename UnOp, typename Mask>
+    static inline void transform(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)
+    {   // Unary overload: forwards every argument unchanged to TransformDispatcher<flag>::call.
+        typedef TransformFunctorTraits<UnOp> ft; // traits of the functor type; only ft::smart_shift is read here
+        transform_detail::TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream); // flag is true only if both VecTraits cn == 1 and smart_shift != 1
+    }
+
+    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+    static inline void transform(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, const Mask& mask, cudaStream_t stream)
+    {   // Binary overload: forwards every argument unchanged to TransformDispatcher<flag>::call.
+        typedef TransformFunctorTraits<BinOp> ft; // traits of the functor type; only ft::smart_shift is read here
+        transform_detail::TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream); // flag is true only if all three VecTraits cn == 1 and smart_shift != 1
+    }
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_TRANSFORM_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/type_traits.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/type_traits.hpp
new file mode 100644
index 000000000000..8b7a3fd16832
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/type_traits.hpp
@@ -0,0 +1,90 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_TYPE_TRAITS_HPP
+#define OPENCV_CUDA_TYPE_TRAITS_HPP
+
+#include "detail/type_traits_detail.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template <typename T> struct IsSimpleParameter
+    {   // value = IsIntegral<T> || IsFloat<T> || PointerTraits<ReferenceTraits<T>::type>, all taken from type_traits_detail.
+        enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value ||
+            type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<T>::type>::value};
+    };
+
+    template <typename T> struct TypeTraits
+    {   // Bundles the type_traits_detail queries for T into typedefs and enum flags.
+        typedef typename type_traits_detail::UnConst<T>::type                                                NonConstType;
+        typedef typename type_traits_detail::UnVolatile<T>::type                                             NonVolatileType;
+        typedef typename type_traits_detail::UnVolatile<typename type_traits_detail::UnConst<T>::type>::type UnqualifiedType;
+        typedef typename type_traits_detail::PointerTraits<UnqualifiedType>::type                            PointeeType;
+        typedef typename type_traits_detail::ReferenceTraits<T>::type                                        ReferredType;
+
+        enum { isConst          = type_traits_detail::UnConst<T>::value };
+        enum { isVolatile       = type_traits_detail::UnVolatile<T>::value };
+
+        enum { isReference      = type_traits_detail::ReferenceTraits<UnqualifiedType>::value };
+        enum { isPointer        = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value };
+
+        enum { isUnsignedInt    = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
+        enum { isSignedInt      = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value }; // NOTE(review): "Intergral" spelling presumably matches the declaration in detail/type_traits_detail.hpp -- do not rename here alone
+        enum { isIntegral       = type_traits_detail::IsIntegral<UnqualifiedType>::value };
+        enum { isFloat          = type_traits_detail::IsFloat<UnqualifiedType>::value };
+        enum { isArith          = isIntegral || isFloat };
+        enum { isVec            = type_traits_detail::IsVec<UnqualifiedType>::value };
+
+        typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,
+            T, typename type_traits_detail::AddParameterType<T>::type>::type ParameterType; // presumably T when the flag is set, else AddParameterType<T>::type -- confirm against Select
+    };
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_TYPE_TRAITS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/utility.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/utility.hpp
new file mode 100644
index 000000000000..7f5db48a500f
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/utility.hpp
@@ -0,0 +1,230 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_UTILITY_HPP
+#define OPENCV_CUDA_UTILITY_HPP
+
+#include "saturate_cast.hpp"
+#include "datamov_utils.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    struct CV_EXPORTS ThrustAllocator // abstract allocator interface (allocate/deallocate are pure virtual)
+    {
+        typedef uchar value_type; // element type of the pointers handed out below
+        virtual ~ThrustAllocator(); // virtual destructor; only declared here, defined outside this header
+        virtual __device__ __host__ uchar* allocate(size_t numBytes) = 0; // implementation-defined; presumably returns a block of numBytes bytes
+        virtual __device__ __host__ void deallocate(uchar* ptr, size_t numBytes) = 0; // implementation-defined; presumably releases a block obtained from allocate()
+        static ThrustAllocator& getAllocator(); // declared only; presumably returns the allocator currently in effect
+        static void setAllocator(ThrustAllocator* allocator); // declared only; presumably replaces the allocator returned by getAllocator()
+    };
+    #define OPENCV_CUDA_LOG_WARP_SIZE        (5) // log2 of the warp size
+    #define OPENCV_CUDA_WARP_SIZE            (1 << OPENCV_CUDA_LOG_WARP_SIZE) // 1 << 5 == 32
+    #define OPENCV_CUDA_LOG_MEM_BANKS        ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
+    #define OPENCV_CUDA_MEM_BANKS            (1 << OPENCV_CUDA_LOG_MEM_BANKS) // bank count derived from the log2 value above
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // swap
+
+    template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b) // exchanges a and b through one temporary copy
+    {
+        const T previousA = a; // keep the old value of a while it is overwritten
+        a = b;
+        b = previousA;
+    }
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Mask Reader
+
+    struct SingleMask // predicate functor: true where the mask byte at (y, x) is non-zero
+    {
+        explicit __host__ __device__ __forceinline__ SingleMask(PtrStepb mask_) : mask(mask_) {}
+        __host__ __device__ __forceinline__ SingleMask(const SingleMask& mask_) : mask(mask_.mask) {}
+
+        __device__ __forceinline__ bool operator()(int y, int x) const
+        {
+            const uchar* const maskRow = mask.ptr(y); // row y of the mask
+            return maskRow[x] != 0;
+        }
+        PtrStepb mask; // mask image, held by value
+    };
+
+    struct SingleMaskChannels // predicate functor: tests the mask byte at (y, x / channels)
+    {
+        __host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_)
+            : mask(mask_), channels(channels_) {}
+        __host__ __device__ __forceinline__ SingleMaskChannels(const SingleMaskChannels& mask_)
+            : mask(mask_.mask), channels(mask_.channels) {}
+
+        __device__ __forceinline__ bool operator()(int y, int x) const
+        {
+            const int maskColumn = x / channels; // integer division
+            return mask.ptr(y)[maskColumn] != 0;
+        }
+        PtrStepb mask; // mask image, held by value
+        int channels;  // divisor applied to x before indexing
+    };
+
+    struct MaskCollection // predicate functor over an array of masks, one of which is current at a time
+    {
+        explicit __host__ __device__ __forceinline__ MaskCollection(PtrStepb* maskCollection_)
+            : maskCollection(maskCollection_) {}
+        __device__ __forceinline__ MaskCollection(const MaskCollection& masks_)
+            : maskCollection(masks_.maskCollection), curMask(masks_.curMask) {}
+        __device__ __forceinline__ void next()
+        {
+            curMask = *maskCollection; // take the mask under the cursor...
+            ++maskCollection;          // ...then advance the cursor by one
+        }
+        __device__ __forceinline__ void setMask(int z)
+        {
+            curMask = maskCollection[z]; // indexed relative to the current cursor position
+        }
+        __device__ __forceinline__ bool operator()(int y, int x) const
+        {
+            if (curMask.data == 0)
+                return true; // a mask with a null data pointer accepts every position
+            uchar val;
+            ForceGlob<uchar>::Load(curMask.ptr(y), x, val);
+            return val != 0;
+        }
+        const PtrStepb* maskCollection; // cursor into the caller-provided mask array
+        PtrStepb curMask;               // mask consulted by operator()
+    };
+
+    struct WithOutMask // "no mask" functor: accepts every position
+    {
+        __host__ __device__ __forceinline__ WithOutMask(){}
+        __host__ __device__ __forceinline__ WithOutMask(const WithOutMask&){}
+
+        // Same member names as MaskCollection::next() / setMask(); both are
+        // no-ops here because this type carries no mask state.
+        __device__ __forceinline__ void next() const {}
+        __device__ __forceinline__ void setMask(int) const {}
+
+        // Call operators: the coordinates are ignored and the answer is always true.
+        __device__ __forceinline__ bool operator()(int, int) const
+        {
+            return true;
+        }
+
+        __device__ __forceinline__ bool operator()(int, int, int) const
+        {
+            return true;
+        }
+
+        // Static equivalents of the call operators, usable without an instance.
+        static __device__ __forceinline__ bool check(int, int)
+        {
+            return true;
+        }
+
+        static __device__ __forceinline__ bool check(int, int, int)
+        {
+            return true;
+        }
+    };
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Solve linear system
+
+    // solve 2x2 linear system Ax=b
+    template <typename T> __device__ __forceinline__ bool solve2x2(const T A[2][2], const T b[2], T x[2])
+    {
+        // Cramer's rule. The determinant is evaluated in T; when it compares equal
+        // to zero the function returns false and x is left unmodified.
+        const T det = A[0][0] * A[1][1] - A[1][0] * A[0][1];
+        const bool solvable = (det != 0);
+        if (!solvable)
+            return false;
+
+        // The reciprocal is taken in double; each component is narrowed back to T
+        // through saturate_cast.
+        const double invdet = 1.0 / det;
+        x[0] = saturate_cast<T>(invdet * (b[0] * A[1][1] - b[1] * A[0][1]));
+        x[1] = saturate_cast<T>(invdet * (A[0][0] * b[1] - A[1][0] * b[0]));
+
+        return true;
+    }
+
+    // solve 3x3 linear system Ax=b
+    template <typename T> __device__ __forceinline__ bool solve3x3(const T A[3][3], const T b[3], T x[3])
+    {
+        // Determinant of A by cofactor expansion along its first row, evaluated in T.
+        const T det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])
+                    - A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])
+                    + A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);
+        const bool solvable = (det != 0);
+        if (!solvable)
+            return false;
+
+        // Cramer's rule: component k uses the determinant of A with column k replaced by b.
+        const double invdet = 1.0 / det;
+
+        x[0] = saturate_cast<T>(invdet *
+            (b[0]    * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
+             A[0][1] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) +
+             A[0][2] * (b[1]    * A[2][1] - A[1][1] * b[2]   )));
+
+        x[1] = saturate_cast<T>(invdet *
+            (A[0][0] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) -
+             b[0]    * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
+             A[0][2] * (A[1][0] * b[2]    - b[1]    * A[2][0])));
+
+        x[2] = saturate_cast<T>(invdet *
+            (A[0][0] * (A[1][1] * b[2]    - b[1]    * A[2][1]) -
+             A[0][1] * (A[1][0] * b[2]    - b[1]    * A[2][0]) +
+             b[0]    * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));
+
+        return true;
+    }
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_UTILITY_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/vec_distance.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/vec_distance.hpp
new file mode 100644
index 000000000000..ef6e51087d33
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/vec_distance.hpp
@@ -0,0 +1,232 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_VEC_DISTANCE_HPP
+#define OPENCV_CUDA_VEC_DISTANCE_HPP
+
+#include "reduce.hpp"
+#include "functional.hpp"
+#include "detail/vec_distance_detail.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template <typename T> struct L1Dist // primary template: accumulates in int whatever T is
+    {
+        typedef int result_type;
+        typedef int value_type;
+
+        __device__ __forceinline__ L1Dist() : mySum(0) {}
+
+        // Fold one pair into the running sum through the __sad intrinsic (|val1 - val2| + mySum).
+        __device__ __forceinline__ void reduceIter(int val1, int val2)
+        {
+            mySum = __sad(val1, val2, mySum);
+        }
+        // Delegates the cross-thread combination to reduce<THREAD_DIM> with plus<int>.
+        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
+        {
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
+        }
+        // Conversion operator exposing the accumulated value.
+        __device__ __forceinline__ operator int() const
+        {
+            return mySum;
+        }
+        int mySum; // running sum of absolute differences
+    };
+    template <> struct L1Dist<float> // float specialization: accumulates |val1 - val2| in a float
+    {
+        typedef float value_type;
+        typedef float result_type;
+
+        __device__ __forceinline__ L1Dist() : mySum(0.0f) {}
+
+        __device__ __forceinline__ void reduceIter(float val1, float val2)
+        {
+            // ::fabsf keeps the whole update in single precision (::fabs may select the double overload).
+            mySum += ::fabsf(val1 - val2);
+        }
+        // Delegates the cross-thread combination to reduce<THREAD_DIM> with plus<float>.
+        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
+        {
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
+        }
+
+        __device__ __forceinline__ operator float() const
+        {
+            return mySum;
+        }
+        float mySum; // running sum of absolute differences
+    };
+
+    struct L2Dist // sums squared differences; the square root is applied on read-out
+    {
+        typedef float result_type;
+        typedef float value_type;
+
+        __device__ __forceinline__ L2Dist() : mySum(0.0f) {}
+
+        // Add (val1 - val2)^2 to the running sum.
+        __device__ __forceinline__ void reduceIter(float val1, float val2)
+        {
+            const float delta = val1 - val2;
+            mySum += delta * delta;
+        }
+        // Delegates the cross-thread combination to reduce<THREAD_DIM> with plus<float>.
+        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
+        {
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
+        }
+        // The stored value is the sum of squares; conversion yields its square root.
+        __device__ __forceinline__ operator float() const
+        {
+            return sqrtf(mySum);
+        }
+        float mySum; // sum of squared differences (root not yet taken)
+    };
+
+    struct HammingDist // counts differing bits between pairs of int operands
+    {
+        typedef int result_type;
+        typedef int value_type;
+
+        __device__ __forceinline__ HammingDist() : mySum(0) {}
+
+        // XOR marks the differing bits; __popc counts them.
+        __device__ __forceinline__ void reduceIter(int val1, int val2)
+        {
+            const int differingBits = __popc(val1 ^ val2);
+            mySum += differingBits;
+        }
+        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
+        {
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
+        }
+
+        __device__ __forceinline__ operator int() const
+        {
+            return mySum;
+        }
+        int mySum; // number of differing bits accumulated so far
+    };
+
+    // calc distance between two vectors in global memory
+    template <int THREAD_DIM, typename Dist, typename T1, typename T2>
+    __device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
+    {
+        // The caller with index tid visits elements tid, tid + THREAD_DIM, ... below len.
+        for (int i = tid; i < len; i += THREAD_DIM)
+        {
+            T1 val1;
+            ForceGlob<T1>::Load(vec1, i, val1);
+            T2 val2;
+            ForceGlob<T2>::Load(vec2, i, val2);
+            dist.reduceIter(val1, val2);
+        }
+        // reduceAll is a member template of the dependent type Dist: without the
+        // `template` keyword the `<` is parsed as less-than by conforming compilers.
+        dist.template reduceAll<THREAD_DIM>(smem, tid);
+    }
+
+    // calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
+    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
+    __device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
+    {
+        vec_distance_detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);
+        // Dist is a dependent type, so the member template call needs the `template` disambiguator.
+        dist.template reduceAll<THREAD_DIM>(smem, tid);
+    }
+
+    // calc distance between two vectors in global memory
+    template <int THREAD_DIM, typename T1> struct VecDiffGlobal
+    {
+        // Only the pointer is used; the four defaulted, unnamed parameters are ignored
+        // (they line up with the argument list of VecDiffCachedRegister's constructor).
+        explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0)
+            : vec1(vec1_) {}
+
+        // Forwards to calcVecDiffGlobal with the stored pointer as the first vector.
+        template <typename T2, typename Dist>
+        __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
+        {
+            calcVecDiffGlobal<THREAD_DIM>(vec1, vec2, len, dist, smem, tid);
+        }
+        const T1* vec1; // first vector, as passed to the constructor
+    };
+
+    // calc distance between two vectors, first vector is cached in register memory, second vector is in global memory
+    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister
+    {
+        template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)
+        {
+            if (glob_tid < len)
+                smem[glob_tid] = vec1[glob_tid];
+            __syncthreads();
+            // Past the barrier: every thread with glob_tid < len has stored vec1[glob_tid] into smem[glob_tid].
+            U* vec1ValsPtr = vec1Vals;
+            // Copy elements tid, tid + THREAD_DIM, ... (below MAX_LEN) of smem into consecutive vec1Vals slots.
+            #pragma unroll
+            for (int i = tid; i < MAX_LEN; i += THREAD_DIM)
+                *vec1ValsPtr++ = smem[i];
+            // Second barrier before returning; presumably lets callers reuse smem afterwards (TODO confirm against callers).
+            __syncthreads();
+        }
+        // Runs calcVecDiffCached with vec1Vals as the cached first vector; smem and tid are passed through unchanged.
+        template <typename T2, typename Dist>
+        __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
+        {
+            calcVecDiffCached<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>(vec1Vals, vec2, len, dist, smem, tid);
+        }
+        // Cached slice of the first vector: MAX_LEN / THREAD_DIM elements per object.
+        U vec1Vals[MAX_LEN / THREAD_DIM];
+    };
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_VEC_DISTANCE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/vec_math.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/vec_math.hpp
new file mode 100644
index 000000000000..80b130368187
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/vec_math.hpp
@@ -0,0 +1,923 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_VECMATH_HPP
+#define OPENCV_CUDA_VECMATH_HPP
+
+#include "vec_traits.hpp"
+#include "saturate_cast.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+
+// saturate_cast
+
+namespace vec_math_detail
+{
+    template <int cn, typename VecD> struct SatCastHelper; // primary template: specialized per channel count below
+    template <typename VecD> struct SatCastHelper<1, VecD> // 1 channel: x
+    {
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& src)
+        {
+            typedef typename VecTraits<VecD>::elem_type Elem;
+            return VecTraits<VecD>::make(saturate_cast<Elem>(src.x));
+        }
+    };
+    template <typename VecD> struct SatCastHelper<2, VecD> // 2 channels: x, y
+    {
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& src)
+        {
+            typedef typename VecTraits<VecD>::elem_type Elem;
+            return VecTraits<VecD>::make(saturate_cast<Elem>(src.x), saturate_cast<Elem>(src.y));
+        }
+    };
+    template <typename VecD> struct SatCastHelper<3, VecD> // 3 channels: x, y, z
+    {
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& src)
+        {
+            typedef typename VecTraits<VecD>::elem_type Elem;
+            return VecTraits<VecD>::make(saturate_cast<Elem>(src.x), saturate_cast<Elem>(src.y), saturate_cast<Elem>(src.z));
+        }
+    };
+    template <typename VecD> struct SatCastHelper<4, VecD> // 4 channels: x, y, z, w
+    {
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& src)
+        {
+            typedef typename VecTraits<VecD>::elem_type Elem;
+            return VecTraits<VecD>::make(saturate_cast<Elem>(src.x), saturate_cast<Elem>(src.y), saturate_cast<Elem>(src.z), saturate_cast<Elem>(src.w));
+        }
+    };
+    // Entry point: picks the specialization from the destination vector's channel count (VecTraits<VecD>::cn).
+    template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_helper(const VecS& v)
+    {
+        return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);
+    }
+}
+
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar1& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); } // 1-component sources; T is the destination vector type
+template<typename T> static __device__ __forceinline__ T saturate_cast(const char1& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort1& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const short1& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uint1& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const int1& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const float1& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const double1& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+// 2-component sources
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar2& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const char2& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort2& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const short2& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uint2& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const int2& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const float2& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const double2& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+// 3-component sources
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar3& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const char3& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort3& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const short3& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uint3& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const int3& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const float3& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const double3& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+// 4-component sources
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar4& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const char4& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort4& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const short4& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uint4& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const int4& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const float4& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+template<typename T> static __device__ __forceinline__ T saturate_cast(const double4& v) { return vec_math_detail::SatCastHelper<VecTraits<T>::cn, T>::cast(v); }
+
+// unary operators
+
+#define CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(op, input_type, output_type) /* emits operator `op` for the 1- to 4-component vectors of input_type */ \
+    __device__ __forceinline__ output_type ## 1 operator op(const input_type ## 1 & vec) \
+    { \
+        return VecTraits<output_type ## 1>::make(op (vec.x)); \
+    } \
+    __device__ __forceinline__ output_type ## 2 operator op(const input_type ## 2 & vec) \
+    { \
+        return VecTraits<output_type ## 2>::make(op (vec.x), op (vec.y)); \
+    } \
+    __device__ __forceinline__ output_type ## 3 operator op(const input_type ## 3 & vec) \
+    { \
+        return VecTraits<output_type ## 3>::make(op (vec.x), op (vec.y), op (vec.z)); \
+    } \
+    __device__ __forceinline__ output_type ## 4 operator op(const input_type ## 4 & vec) \
+    { \
+        return VecTraits<output_type ## 4>::make(op (vec.x), op (vec.y), op (vec.z), op (vec.w)); \
+    }
+// unary minus: char, short, int, float and double vectors; the result has the operand's type
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, char, char)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, short, short)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, int, int)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, double, double)
+// logical not: every listed element type; the result is always a uchar vector
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, double, uchar)
+// bitwise complement: integer vectors only; the result has the operand's type
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, char, char)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, short, short)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, int, int)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uint, uint)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_UNARY_OP
+
+// unary functions
+
+#define CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(func_name, func, input_type, output_type) /* emits func_name() applying func to each component of 1- to 4-component vectors */ \
+    __device__ __forceinline__ output_type ## 1 func_name(const input_type ## 1 & vec) \
+    { \
+        return VecTraits<output_type ## 1>::make(func (vec.x)); \
+    } \
+    __device__ __forceinline__ output_type ## 2 func_name(const input_type ## 2 & vec) \
+    { \
+        return VecTraits<output_type ## 2>::make(func (vec.x), func (vec.y)); \
+    } \
+    __device__ __forceinline__ output_type ## 3 func_name(const input_type ## 3 & vec) \
+    { \
+        return VecTraits<output_type ## 3>::make(func (vec.x), func (vec.y), func (vec.z)); \
+    } \
+    __device__ __forceinline__ output_type ## 4 func_name(const input_type ## 4 & vec) \
+    { \
+        return VecTraits<output_type ## 4>::make(func (vec.x), func (vec.y), func (vec.z), func (vec.w)); \
+    }
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::fabsf, float, float) // abs: instantiated here for float vectors only
+// sqrt: integer and float vectors go through ::sqrtf and yield float vectors; double vectors use ::sqrt
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrt, double, double)
+// exp: same type mapping as sqrt (::expf -> float vectors, ::exp for double vectors)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::exp, double, double)
+// exp2: ::exp2f -> float vectors, ::exp2 for double vectors
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2, double, double)
+// exp10: ::exp10f -> float vectors, ::exp10 for double vectors
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10, double, double)
+// log: ::logf -> float vectors, ::log for double vectors
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::log, double, double)
+// log2: ::log2f -> float vectors, ::log2 for double vectors
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2, double, double)
+// log10: ::log10f -> float vectors, ::log10 for double vectors
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10, double, double)
+// sin: ::sinf -> float vectors, ::sin for double vectors
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sin, double, double)
+// cos: ::cosf -> float vectors, ::cos for double vectors
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cos, double, double)
+// tan: ::tanf -> float vectors, ::tan for double vectors
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tan, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asin, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acos, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atan, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinh, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::cosh, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanh, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinh, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acosh, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanh, double, double)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
+
+// binary operators (vec & vec)
+//
+// Defines `operator op` for input_type##1 .. input_type##4 (both operands of the same vector
+// type). The built-in `op` is applied to each pair of matching components and the results are
+// packed with VecTraits<output_type##N>::make, so the result type is output_type##N.
+
+#define CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(op, input_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 \
+    operator op(const input_type ## 1 & lhs, const input_type ## 1 & rhs) \
+    { return VecTraits<output_type ## 1>::make(lhs.x op rhs.x); } \
+    __device__ __forceinline__ output_type ## 2 \
+    operator op(const input_type ## 2 & lhs, const input_type ## 2 & rhs) \
+    { return VecTraits<output_type ## 2>::make(lhs.x op rhs.x, lhs.y op rhs.y); } \
+    __device__ __forceinline__ output_type ## 3 \
+    operator op(const input_type ## 3 & lhs, const input_type ## 3 & rhs) \
+    { return VecTraits<output_type ## 3>::make(lhs.x op rhs.x, lhs.y op rhs.y, lhs.z op rhs.z); } \
+    __device__ __forceinline__ output_type ## 4 \
+    operator op(const input_type ## 4 & lhs, const input_type ## 4 & rhs) \
+    { return VecTraits<output_type ## 4>::make(lhs.x op rhs.x, lhs.y op rhs.y, lhs.z op rhs.z, lhs.w op rhs.w); }
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, uchar, int) // arithmetic (+ - * /): uchar, char, ushort and short vectors return intN; int, uint, float and double keep their type
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, char, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, ushort, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, short, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, uchar, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, char, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, ushort, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, short, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, uchar, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, char, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, ushort, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, short, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, uchar, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, char, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, ushort, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, short, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, uchar, uchar) // comparisons (== != > < >= <=): every input type returns ucharN
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, uchar, uchar) // logical (&& ||): every input type returns ucharN
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, uchar, uchar) // bitwise (& | ^): integer vectors only, result type equals input type
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, uint, uint)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, uint, uint)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, uint, uint)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_OP // helper macro is only needed for the instantiations above
+
+// binary operators (vec & scalar)
+//
+// Defines `operator op` between input_type##1 .. input_type##4 and a single
+// scalar_type value, in both operand orders (vector op scalar and
+// scalar op vector). The scalar is combined with every component using the
+// built-in `op`; operand order is preserved, which matters for `-`, `/` and
+// the ordered comparisons. Results are packed with
+// VecTraits<output_type##N>::make, so the result type is output_type##N.
+// The scalar is taken by value, the vector by const reference.
+
+#define CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(op, input_type, scalar_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 \
+    operator op(const input_type ## 1 & vec, scalar_type scalar) \
+    { return VecTraits<output_type ## 1>::make(vec.x op scalar); } \
+    __device__ __forceinline__ output_type ## 1 \
+    operator op(scalar_type scalar, const input_type ## 1 & vec) \
+    { return VecTraits<output_type ## 1>::make(scalar op vec.x); } \
+    __device__ __forceinline__ output_type ## 2 \
+    operator op(const input_type ## 2 & vec, scalar_type scalar) \
+    { return VecTraits<output_type ## 2>::make(vec.x op scalar, vec.y op scalar); } \
+    __device__ __forceinline__ output_type ## 2 \
+    operator op(scalar_type scalar, const input_type ## 2 & vec) \
+    { return VecTraits<output_type ## 2>::make(scalar op vec.x, scalar op vec.y); } \
+    __device__ __forceinline__ output_type ## 3 \
+    operator op(const input_type ## 3 & vec, scalar_type scalar) \
+    { return VecTraits<output_type ## 3>::make(vec.x op scalar, vec.y op scalar, vec.z op scalar); } \
+    __device__ __forceinline__ output_type ## 3 \
+    operator op(scalar_type scalar, const input_type ## 3 & vec) \
+    { return VecTraits<output_type ## 3>::make(scalar op vec.x, scalar op vec.y, scalar op vec.z); } \
+    __device__ __forceinline__ output_type ## 4 \
+    operator op(const input_type ## 4 & vec, scalar_type scalar) \
+    { return VecTraits<output_type ## 4>::make(vec.x op scalar, vec.y op scalar, vec.z op scalar, vec.w op scalar); } \
+    __device__ __forceinline__ output_type ## 4 \
+    operator op(scalar_type scalar, const input_type ## 4 & vec) \
+    { return VecTraits<output_type ## 4>::make(scalar op vec.x, scalar op vec.y, scalar op vec.z, scalar op vec.w); }
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uchar, int, int) // arithmetic (+ - * /): in every row the result element type equals the scalar type
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, char, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, ushort, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, short, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uchar, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, char, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, ushort, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, short, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uchar, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, char, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, ushort, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, short, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uchar, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, char, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, ushort, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, short, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, uchar, uchar, uchar) // comparisons (== != > < >= <=): scalar has the vector's element type, result is ucharN
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, uchar, uchar, uchar) // logical (&& ||): scalar has the vector's element type, result is ucharN
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, uchar, uchar, uchar) // bitwise (& | ^): integer types only; vector, scalar and result share one element type
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, uint, uint, uint)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, uint, uint, uint)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, uint, uint, uint)
+
+#undef CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP // helper macro is only needed for the instantiations above
+
+// binary functions (vec & vec)
+//
+// Defines func_name(a, b) for input_type##1 .. input_type##4 (both arguments of the same vector
+// type). `func` is called on each pair of matching components, with no explicit casts, and the
+// results are packed with VecTraits<output_type##N>::make, so the result type is output_type##N.
+
+#define CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(func_name, func, input_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 \
+    func_name(const input_type ## 1 & lhs, const input_type ## 1 & rhs) \
+    { return VecTraits<output_type ## 1>::make(func(lhs.x, rhs.x)); } \
+    __device__ __forceinline__ output_type ## 2 \
+    func_name(const input_type ## 2 & lhs, const input_type ## 2 & rhs) \
+    { return VecTraits<output_type ## 2>::make(func(lhs.x, rhs.x), func(lhs.y, rhs.y)); } \
+    __device__ __forceinline__ output_type ## 3 \
+    func_name(const input_type ## 3 & lhs, const input_type ## 3 & rhs) \
+    { return VecTraits<output_type ## 3>::make(func(lhs.x, rhs.x), func(lhs.y, rhs.y), func(lhs.z, rhs.z)); } \
+    __device__ __forceinline__ output_type ## 4 \
+    func_name(const input_type ## 4 & lhs, const input_type ## 4 & rhs) \
+    { return VecTraits<output_type ## 4>::make(func(lhs.x, rhs.x), func(lhs.y, rhs.y), func(lhs.z, rhs.z), func(lhs.w, rhs.w)); }
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, uchar, uchar) // max/min: result type equals input type; ::max/::min for integers, ::fmaxf/::fminf for float, ::fmax/::fmin for double
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::fmaxf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::fmax, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::fminf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::fmin, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, uchar, float) // hypot/atan2: integer and float vectors call the float function and return floatN; double vectors call the double function
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypot, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, char, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, short, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, int, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2, double, double)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC // helper macro is only needed for the instantiations above
+
+// binary functions (vec & scalar)
+//
+// Defines func_name for input_type##1 .. input_type##4 paired with one scalar_type value, in
+// both argument orders. Every argument is converted to output_type before `func` is called,
+// and the per-component results are packed with VecTraits<output_type##N>::make.
+
+#define CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(func_name, func, input_type, scalar_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 func_name(const input_type ## 1 & vec, scalar_type scalar) \
+    { return VecTraits<output_type ## 1>::make(func(static_cast<output_type>(vec.x), static_cast<output_type>(scalar))); } \
+    __device__ __forceinline__ output_type ## 1 func_name(scalar_type scalar, const input_type ## 1 & vec) \
+    { return VecTraits<output_type ## 1>::make(func(static_cast<output_type>(scalar), static_cast<output_type>(vec.x))); } \
+    __device__ __forceinline__ output_type ## 2 func_name(const input_type ## 2 & vec, scalar_type scalar) \
+    { return VecTraits<output_type ## 2>::make(func(static_cast<output_type>(vec.x), static_cast<output_type>(scalar)), \
+        func(static_cast<output_type>(vec.y), static_cast<output_type>(scalar))); } \
+    __device__ __forceinline__ output_type ## 2 func_name(scalar_type scalar, const input_type ## 2 & vec) \
+    { return VecTraits<output_type ## 2>::make(func(static_cast<output_type>(scalar), static_cast<output_type>(vec.x)), \
+        func(static_cast<output_type>(scalar), static_cast<output_type>(vec.y))); } \
+    __device__ __forceinline__ output_type ## 3 func_name(const input_type ## 3 & vec, scalar_type scalar) \
+    { return VecTraits<output_type ## 3>::make(func(static_cast<output_type>(vec.x), static_cast<output_type>(scalar)), \
+        func(static_cast<output_type>(vec.y), static_cast<output_type>(scalar)), \
+        func(static_cast<output_type>(vec.z), static_cast<output_type>(scalar))); } \
+    __device__ __forceinline__ output_type ## 3 func_name(scalar_type scalar, const input_type ## 3 & vec) \
+    { return VecTraits<output_type ## 3>::make(func(static_cast<output_type>(scalar), static_cast<output_type>(vec.x)), \
+        func(static_cast<output_type>(scalar), static_cast<output_type>(vec.y)), \
+        func(static_cast<output_type>(scalar), static_cast<output_type>(vec.z))); } \
+    __device__ __forceinline__ output_type ## 4 func_name(const input_type ## 4 & vec, scalar_type scalar) \
+    { return VecTraits<output_type ## 4>::make(func(static_cast<output_type>(vec.x), static_cast<output_type>(scalar)), \
+        func(static_cast<output_type>(vec.y), static_cast<output_type>(scalar)), \
+        func(static_cast<output_type>(vec.z), static_cast<output_type>(scalar)), \
+        func(static_cast<output_type>(vec.w), static_cast<output_type>(scalar))); } \
+    __device__ __forceinline__ output_type ## 4 func_name(scalar_type scalar, const input_type ## 4 & vec) \
+    { return VecTraits<output_type ## 4>::make(func(static_cast<output_type>(scalar), static_cast<output_type>(vec.x)), \
+        func(static_cast<output_type>(scalar), static_cast<output_type>(vec.y)), \
+        func(static_cast<output_type>(scalar), static_cast<output_type>(vec.z)), \
+        func(static_cast<output_type>(scalar), static_cast<output_type>(vec.w))); }
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, uchar, uchar, uchar) // vector/scalar min() overloads; arguments are presumably (func_name, func, input_type, scalar_type, output_type) - confirm against the macro definition
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, uchar, float, float) // float variants use ::fminf
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, uchar, double, double) // double variants use ::fmin
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, float, float, float) // no same-type ::min variant is instantiated for float or double
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, uchar, float, float) // vector/scalar hypot() overloads; only float (::hypotf) and double (::hypot) variants are instantiated
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, uchar, double, double) // arguments are presumably (func_name, func, input_type, scalar_type, output_type) - confirm against the macro definition
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, uchar, float, float) // vector/scalar atan2() overloads; only float (::atan2f) and double (::atan2) variants are instantiated
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, uchar, double, double) // arguments are presumably (func_name, func, input_type, scalar_type, output_type) - confirm against the macro definition
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, double, double, double)
+
+#undef CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC
+
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_VECMATH_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/vec_traits.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/vec_traits.hpp
new file mode 100644
index 000000000000..b5ff281a0b8a
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/vec_traits.hpp
@@ -0,0 +1,288 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_VEC_TRAITS_HPP
+#define OPENCV_CUDA_VEC_TRAITS_HPP
+
+#include "common.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template<typename T, int N> struct TypeVec; // declared only; each specialization exposes a vec_type typedef for the pair (T, N)
+
+    struct __align__(8) uchar8 // eight uchar components a0..a7, declared with __align__(8)
+    {
+        uchar a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7) // builds a uchar8 from its components, in order
+    {
+        uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(8) char8 // eight components a0..a7, declared with __align__(8); note the element type is schar, not plain char
+    {
+        schar a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) // builds a char8 from its components, in order
+    {
+        char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(16) ushort8 // eight ushort components a0..a7, declared with __align__(16)
+    {
+        ushort a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7) // builds a ushort8 from its components, in order
+    {
+        ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(16) short8 // eight short components a0..a7, declared with __align__(16)
+    {
+        short a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7) // builds a short8 from its components, in order
+    {
+        short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(32) uint8 // eight uint components a0..a7, declared with __align__(32)
+    {
+        uint a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7) // builds a uint8 from its components, in order
+    {
+        uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(32) int8 // eight int components a0..a7, declared with __align__(32)
+    {
+        int a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7) // builds an int8 from its components, in order
+    {
+        int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(32) float8 // eight float components a0..a7, declared with __align__(32)
+    {
+        float a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7) // builds a float8 from its components, in order
+    {
+        float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct double8 // eight double components a0..a7; NOTE(review): no __align__ attribute here, unlike the other 8-component structs in this header - confirm this is intended
+    {
+        double a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7) // builds a double8 from its components, in order
+    {
+        double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_TYPE_VEC(type) /* specializes TypeVec for `type` and for its 1/2/3/4/8-component vector types */ \
+    template<> struct TypeVec<type, 1> { typedef type vec_type; }; /* scalar with N == 1 stays the plain scalar */ \
+    template<> struct TypeVec<type ## 1, 1> { typedef type ## 1 vec_type; }; /* a vector type with matching N maps to itself */ \
+    template<> struct TypeVec<type, 2> { typedef type ## 2 vec_type; }; /* scalar with N > 1 maps to the N-component vector */ \
+    template<> struct TypeVec<type ## 2, 2> { typedef type ## 2 vec_type; }; \
+    template<> struct TypeVec<type, 3> { typedef type ## 3 vec_type; }; \
+    template<> struct TypeVec<type ## 3, 3> { typedef type ## 3 vec_type; }; \
+    template<> struct TypeVec<type, 4> { typedef type ## 4 vec_type; }; \
+    template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; }; \
+    template<> struct TypeVec<type, 8> { typedef type ## 8 vec_type; }; \
+    template<> struct TypeVec<type ## 8, 8> { typedef type ## 8 vec_type; };
+
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(uchar) // emit the TypeVec specializations for each element type listed here
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(char)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(ushort)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(short)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(int)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(uint)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(float)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(double)
+
+    #undef OPENCV_CUDA_IMPLEMENT_TYPE_VEC
+
+    template<> struct TypeVec<schar, 1> { typedef schar vec_type; }; // schar with N == 1 stays plain schar
+    template<> struct TypeVec<schar, 2> { typedef char2 vec_type; }; // for N > 1 schar maps to the char-named vector types
+    template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
+    template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
+    template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };
+
+    template<> struct TypeVec<bool, 1> { typedef uchar vec_type; }; // bool is mapped to uchar and the uchar vector types for every supported N
+    template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };
+    template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
+    template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
+    template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };
+
+    template<typename T> struct VecTraits; // declared only; each specialization provides elem_type, cn, all() and make()
+
+#define OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(type) /* emits VecTraits for `type` and for type1/2/3/4/8 */ \
+    template<> struct VecTraits<type> /* scalar case: all() and make() return their argument (or *v) unchanged */ \
+    { \
+        typedef type elem_type; /* component type */ \
+        enum {cn=1}; /* number of components */ \
+        static __device__ __host__ __forceinline__ type all(type v) {return v;} \
+        static __device__ __host__ __forceinline__ type make(type x) {return x;} \
+        static __device__ __host__ __forceinline__ type make(const type* v) {return *v;} \
+    }; \
+    template<> struct VecTraits<type ## 1> /* one-component vector: wraps the value with make_<type>1 */ \
+    { \
+        typedef type elem_type; \
+        enum {cn=1}; \
+        static __device__ __host__ __forceinline__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
+        static __device__ __host__ __forceinline__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
+        static __device__ __host__ __forceinline__ type ## 1 make(const type* v) {return make_ ## type ## 1(*v);} \
+    }; \
+    template<> struct VecTraits<type ## 2> /* vector cases: all(v) replicates v into every component */ \
+    { \
+        typedef type elem_type; \
+        enum {cn=2}; \
+        static __device__ __host__ __forceinline__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
+        static __device__ __host__ __forceinline__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
+        static __device__ __host__ __forceinline__ type ## 2 make(const type* v) {return make_ ## type ## 2(v[0], v[1]);} /* reads cn consecutive elements from v */ \
+    }; \
+    template<> struct VecTraits<type ## 3> \
+    { \
+        typedef type elem_type; \
+        enum {cn=3}; \
+        static __device__ __host__ __forceinline__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
+        static __device__ __host__ __forceinline__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
+        static __device__ __host__ __forceinline__ type ## 3 make(const type* v) {return make_ ## type ## 3(v[0], v[1], v[2]);} \
+    }; \
+    template<> struct VecTraits<type ## 4> \
+    { \
+        typedef type elem_type; \
+        enum {cn=4}; \
+        static __device__ __host__ __forceinline__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
+        static __device__ __host__ __forceinline__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
+        static __device__ __host__ __forceinline__ type ## 4 make(const type* v) {return make_ ## type ## 4(v[0], v[1], v[2], v[3]);} \
+    }; \
+    template<> struct VecTraits<type ## 8> /* uses the make_<type>8 helpers defined earlier in this header */ \
+    { \
+        typedef type elem_type; \
+        enum {cn=8}; \
+        static __device__ __host__ __forceinline__ type ## 8 all(type v) {return make_ ## type ## 8(v, v, v, v, v, v, v, v);} \
+        static __device__ __host__ __forceinline__ type ## 8 make(type a0, type a1, type a2, type a3, type a4, type a5, type a6, type a7) {return make_ ## type ## 8(a0, a1, a2, a3, a4, a5, a6, a7);} \
+        static __device__ __host__ __forceinline__ type ## 8 make(const type* v) {return make_ ## type ## 8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);} \
+    };
+
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(uchar) // emit VecTraits for each element type listed here; char is not in this list and is specialized by hand in this header
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(ushort)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(short)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(int)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(uint)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(float)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(double)
+
+    #undef OPENCV_CUDA_IMPLEMENT_VEC_TRAITS
+
+    template<> struct VecTraits<char> // scalar traits for plain char: one component, all()/make() return the value unchanged
+    {
+        typedef char elem_type;
+        enum {cn=1};
+        static __device__ __host__ __forceinline__ char all(char v) {return v;}
+        static __device__ __host__ __forceinline__ char make(char x) {return x;}
+        static __device__ __host__ __forceinline__ char make(const char* x) {return *x;} // reads one element through the pointer
+    };
+    template<> struct VecTraits<schar> // scalar traits for schar: one component, all()/make() return the value unchanged
+    {
+        typedef schar elem_type;
+        enum {cn=1};
+        static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
+        static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
+        static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;} // reads one element through the pointer
+    };
+    template<> struct VecTraits<char1> // char1 traits; the element type is schar, not plain char
+    {
+        typedef schar elem_type;
+        enum {cn=1};
+        static __device__ __host__ __forceinline__ char1 all(schar v) {return make_char1(v);}
+        static __device__ __host__ __forceinline__ char1 make(schar x) {return make_char1(x);}
+        static __device__ __host__ __forceinline__ char1 make(const schar* v) {return make_char1(v[0]);} // reads v[0]
+    };
+    template<> struct VecTraits<char2> // char2 traits; the element type is schar, not plain char
+    {
+        typedef schar elem_type;
+        enum {cn=2};
+        static __device__ __host__ __forceinline__ char2 all(schar v) {return make_char2(v, v);} // replicates v into both components
+        static __device__ __host__ __forceinline__ char2 make(schar x, schar y) {return make_char2(x, y);}
+        static __device__ __host__ __forceinline__ char2 make(const schar* v) {return make_char2(v[0], v[1]);} // reads v[0..1]
+    };
+    template<> struct VecTraits<char3> // char3 traits; the element type is schar, not plain char
+    {
+        typedef schar elem_type;
+        enum {cn=3};
+        static __device__ __host__ __forceinline__ char3 all(schar v) {return make_char3(v, v, v);} // replicates v into all three components
+        static __device__ __host__ __forceinline__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);}
+        static __device__ __host__ __forceinline__ char3 make(const schar* v) {return make_char3(v[0], v[1], v[2]);} // reads v[0..2]
+    };
+    template<> struct VecTraits<char4> // char4 traits; the element type is schar, not plain char
+    {
+        typedef schar elem_type;
+        enum {cn=4};
+        static __device__ __host__ __forceinline__ char4 all(schar v) {return make_char4(v, v, v, v);} // replicates v into all four components
+        static __device__ __host__ __forceinline__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);}
+        static __device__ __host__ __forceinline__ char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);} // reads v[0..3]
+    };
+    template<> struct VecTraits<char8> // char8 traits; the element type is schar, matching the char8 struct defined in this header
+    {
+        typedef schar elem_type;
+        enum {cn=8};
+        static __device__ __host__ __forceinline__ char8 all(schar v) {return make_char8(v, v, v, v, v, v, v, v);} // replicates v into all eight components
+        static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}
+        static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);} // reads v[0..7]
+    };
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_VEC_TRAITS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/warp.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/warp.hpp
new file mode 100644
index 000000000000..8af7e6a212af
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/warp.hpp
@@ -0,0 +1,139 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_DEVICE_WARP_HPP
+#define OPENCV_CUDA_DEVICE_WARP_HPP
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    struct Warp // static helpers in which each calling thread handles the elements selected by its lane id
+    {
+        enum
+        {
+            LOG_WARP_SIZE = 5,
+            WARP_SIZE     = 1 << LOG_WARP_SIZE, // 32
+            STRIDE        = WARP_SIZE // step between consecutive elements handled by one lane
+        };
+
+        /** \brief Returns the warp lane ID of the calling thread. */
+        static __device__ __forceinline__ unsigned int laneId()
+        {
+            unsigned int ret;
+            asm("mov.u32 %0, %%laneid;" : "=r"(ret) ); // copies the PTX %laneid special register into ret
+            return ret;
+        }
+        /** \brief Assigns value to the elements beg + laneId(), beg + laneId() + STRIDE, ... that lie before end. */
+        template<typename It, typename T>
+        static __device__ __forceinline__ void fill(It beg, It end, const T& value)
+        {
+            for(It t = beg + laneId(); t < end; t += STRIDE)
+                *t = value;
+        }
+        /** \brief Copies every STRIDE-th input from beg + laneId() to out. NOTE(review): out is not offset by the lane id here - confirm callers pass a per-lane out. */
+        template<typename InIt, typename OutIt>
+        static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
+        {
+            for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
+                *out = *t;
+            return out; // out advanced by STRIDE once per element handled
+        }
+        /** \brief Writes op(*t) for every STRIDE-th input from beg + laneId() to out. NOTE(review): out is not offset by the lane id here - confirm callers pass a per-lane out. */
+        template<typename InIt, typename OutIt, class UnOp>
+        static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)
+        {
+            for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
+                *out = op(*t);
+            return out; // out advanced by STRIDE once per element handled
+        }
+        /** \brief Binary form: writes op(*t1, *t2) for every STRIDE-th input pair from lane offset. NOTE(review): out is not offset by the lane id, unlike both inputs - confirm. */
+        template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
+        static __device__ __forceinline__ OutIt transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
+        {
+            unsigned int lane = laneId();
+
+            InIt1 t1 = beg1 + lane;
+            InIt2 t2 = beg2 + lane;
+            for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE) // only the first range has an end bound
+                *out = op(*t1, *t2);
+            return out;
+        }
+        /** \brief Lanes 0..15 fold ptr[lane + 16], + 8, + 4, + 2, + 1 into ptr[lane] with op (ptr[0..31] are accessed); every caller returns ptr[0]. */
+        template <class T, class BinOp>
+        static __device__ __forceinline__ T reduce(volatile T *ptr, BinOp op)
+        {
+            const unsigned int lane = laneId();
+            // NOTE(review): no explicit synchronization between the steps is visible; this appears to assume the lanes run in lockstep - confirm.
+            if (lane < 16)
+            {
+                T partial = ptr[lane];
+
+                ptr[lane] = partial = op(partial, ptr[lane + 16]);
+                ptr[lane] = partial = op(partial, ptr[lane + 8]);
+                ptr[lane] = partial = op(partial, ptr[lane + 4]);
+                ptr[lane] = partial = op(partial, ptr[lane + 2]);
+                ptr[lane] = partial = op(partial, ptr[lane + 1]);
+            }
+
+            return *ptr;
+        }
+        /** \brief Writes value + i to element beg + i for i = laneId(), laneId() + STRIDE, ... while beg + i lies before end. */
+        template<typename OutIt, typename T>
+        static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
+        {
+            unsigned int lane = laneId();
+            value += lane; // start at this lane's own offset
+
+            for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)
+                *t = value;
+        }
+    };
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif /* OPENCV_CUDA_DEVICE_WARP_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/warp_reduce.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/warp_reduce.hpp
new file mode 100644
index 000000000000..530303d2495f
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/warp_reduce.hpp
@@ -0,0 +1,76 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_WARP_REDUCE_HPP__
+#define OPENCV_CUDA_WARP_REDUCE_HPP__
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template <class T> // values are combined with operator+
+    __device__ __forceinline__ T warp_reduce(volatile T *ptr , const unsigned int tid = threadIdx.x)
+    {
+        const unsigned int lane = tid & 31; // index of thread in warp (0..31)
+        // Lanes 0..15 fold ptr[tid + 16], + 8, + 4, + 2, + 1 into ptr[tid]; the result is then read from slot tid - lane (the lane-0 slot).
+        if (lane < 16)
+        {
+            T partial = ptr[tid];
+            // NOTE(review): no explicit synchronization between the steps is visible; this appears to assume the lanes run in lockstep - confirm.
+            ptr[tid] = partial = partial + ptr[tid + 16];
+            ptr[tid] = partial = partial + ptr[tid + 8];
+            ptr[tid] = partial = partial + ptr[tid + 4];
+            ptr[tid] = partial = partial + ptr[tid + 2];
+            ptr[tid] = partial = partial + ptr[tid + 1];
+        }
+
+        return ptr[tid - lane]; // every caller, including lanes 16..31, reads the lane-0 slot
+    }
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif /* OPENCV_CUDA_WARP_REDUCE_HPP__ */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/warp_shuffle.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/warp_shuffle.hpp
new file mode 100644
index 000000000000..0da54aee99bf
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda/warp_shuffle.hpp
@@ -0,0 +1,162 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_WARP_SHUFFLE_HPP
+#define OPENCV_CUDA_WARP_SHUFFLE_HPP
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+#if __CUDACC_VER_MAJOR__ >= 9 // with CUDA 9 or newer, map the legacy shuffle names onto the *_sync intrinsics
+#  define __shfl(x, y, z) __shfl_sync(0xFFFFFFFFU, x, y, z) // 0xFFFFFFFFU is passed as the first (mask) argument
+#  define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
+#  define __shfl_down(x, y, z) __shfl_down_sync(0xFFFFFFFFU, x, y, z)
+#endif // these names are #undef-ed again after the namespace is closed
+    template <typename T> // generic overload: forwards val, srcLane and width to __shfl
+    __device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        return __shfl(val, srcLane, width);
+    #else
+        return T(); // fallback when __CUDA_ARCH__ is undefined or below 300: value-initialized T
+    #endif
+    }
+    __device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize) // unsigned overload: shuffles the value as int and casts the result back
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        return (unsigned int) __shfl((int) val, srcLane, width);
+    #else
+        return 0; // fallback when __CUDA_ARCH__ is undefined or below 300
+    #endif
+    }
+    __device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize) // double overload: shuffles the value as two int halves
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        int lo = __double2loint(val); // low half of val as int
+        int hi = __double2hiint(val); // high half of val as int
+
+        lo = __shfl(lo, srcLane, width);
+        hi = __shfl(hi, srcLane, width);
+
+        return __hiloint2double(hi, lo); // reassemble the double from the two shuffled halves
+    #else
+        return 0.0; // fallback when __CUDA_ARCH__ is undefined or below 300
+    #endif
+    }
+
+    template <typename T> // generic overload: forwards val, delta and width to __shfl_down
+    __device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        return __shfl_down(val, delta, width);
+    #else
+        return T(); // fallback when __CUDA_ARCH__ is undefined or below 300: value-initialized T
+    #endif
+    }
+    __device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize) // unsigned overload: shuffles the value as int and casts the result back
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        return (unsigned int) __shfl_down((int) val, delta, width);
+    #else
+        return 0; // fallback when __CUDA_ARCH__ is undefined or below 300
+    #endif
+    }
+    __device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize) // double overload: shuffles the value as two int halves
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        int lo = __double2loint(val); // low half of val as int
+        int hi = __double2hiint(val); // high half of val as int
+
+        lo = __shfl_down(lo, delta, width);
+        hi = __shfl_down(hi, delta, width);
+
+        return __hiloint2double(hi, lo); // reassemble the double from the two shuffled halves
+    #else
+        return 0.0; // fallback when __CUDA_ARCH__ is undefined or below 300
+    #endif
+    }
+
+    template <typename T> // generic overload: forwards val, delta and width to __shfl_up
+    __device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize)
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        return __shfl_up(val, delta, width);
+    #else
+        return T(); // fallback when __CUDA_ARCH__ is undefined or below 300: value-initialized T
+    #endif
+    }
+    __device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize) // unsigned overload: shuffles the value as int and casts the result back
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        return (unsigned int) __shfl_up((int) val, delta, width);
+    #else
+        return 0; // fallback when __CUDA_ARCH__ is undefined or below 300
+    #endif
+    }
+    __device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize) // double overload: shuffles the value as two int halves
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        int lo = __double2loint(val); // low half of val as int
+        int hi = __double2hiint(val); // high half of val as int
+
+        lo = __shfl_up(lo, delta, width);
+        hi = __shfl_up(hi, delta, width);
+
+        return __hiloint2double(hi, lo); // reassemble the double from the two shuffled halves
+    #else
+        return 0.0; // fallback when __CUDA_ARCH__ is undefined or below 300
+    #endif
+    }
+}}}
+
+#  undef __shfl
+#  undef __shfl_up
+#  undef __shfl_down
+
+//! @endcond
+
+#endif // OPENCV_CUDA_WARP_SHUFFLE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda_stream_accessor.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda_stream_accessor.hpp
new file mode 100644
index 000000000000..deaf356fff43
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda_stream_accessor.hpp
@@ -0,0 +1,86 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_CUDA_STREAM_ACCESSOR_HPP
+#define OPENCV_CORE_CUDA_STREAM_ACCESSOR_HPP
+
+#ifndef __cplusplus
+#  error cuda_stream_accessor.hpp header must be compiled as C++
+#endif
+
+/** @file cuda_stream_accessor.hpp
+ * This is the only header file that depends on the CUDA Runtime API. All other headers are independent of it.
+ */
+
+#include <cuda_runtime.h>
+#include "opencv2/core/cuda.hpp"
+
+namespace cv
+{
+    namespace cuda
+    {
+
+//! @addtogroup cudacore_struct
+//! @{
+
+        /** @brief Class that enables getting cudaStream_t from cuda::Stream
+         */
+        struct StreamAccessor
+        {
+            CV_EXPORTS static cudaStream_t getStream(const Stream& stream);  //!< cuda::Stream -> cudaStream_t
+            CV_EXPORTS static Stream wrapStream(cudaStream_t stream);        //!< cudaStream_t -> cuda::Stream
+        };
+
+        /** @brief Class that enables getting cudaEvent_t from cuda::Event
+         */
+        struct EventAccessor
+        {
+            CV_EXPORTS static cudaEvent_t getEvent(const Event& event);  //!< cuda::Event -> cudaEvent_t
+            CV_EXPORTS static Event wrapEvent(cudaEvent_t event);        //!< cudaEvent_t -> cuda::Event
+        };
+
+//! @}
+
+    }
+}
+
+#endif /* OPENCV_CORE_CUDA_STREAM_ACCESSOR_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda_types.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda_types.hpp
new file mode 100644
index 000000000000..b33f06179d13
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cuda_types.hpp
@@ -0,0 +1,144 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_CUDA_TYPES_HPP
+#define OPENCV_CORE_CUDA_TYPES_HPP
+
+#ifndef __cplusplus
+#  error cuda_types.hpp header must be compiled as C++
+#endif
+
+#if defined(__OPENCV_BUILD) && defined(__clang__)
+#pragma clang diagnostic ignored "-Winconsistent-missing-override"
+#endif
+#if defined(__OPENCV_BUILD) && defined(__GNUC__) && __GNUC__ >= 5
+#pragma GCC diagnostic ignored "-Wsuggest-override"
+#endif
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+#ifdef __CUDACC__  // qualifier macro placed in front of the member functions of the structs in this header
+    #define __CV_CUDA_HOST_DEVICE__ __host__ __device__ __forceinline__
+#else              // __CUDACC__ not defined: the qualifier expands to nothing
+    #define __CV_CUDA_HOST_DEVICE__
+#endif
+
+namespace cv
+{
+    namespace cuda
+    {
+
+        // Simple lightweight structures that encapsulate information about an image on the device.
+        // They are intended to be passed to nvcc-compiled code; GpuMat depends on headers that nvcc can't compile.
+
+        template <typename T> struct DevPtr  // holds one raw T* and nothing else
+        {
+            typedef T elem_type;
+            typedef int index_type;
+
+            enum { elem_size = sizeof(elem_type) };  // size of one element in bytes
+
+            T* data;  // the wrapped pointer; this struct declares no destructor, so nothing is freed here
+
+            __CV_CUDA_HOST_DEVICE__ DevPtr() : data(0) {}              // null pointer by default
+            __CV_CUDA_HOST_DEVICE__ DevPtr(T* data_) : data(data_) {}  // not explicit: a T* converts implicitly
+
+            __CV_CUDA_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
+            __CV_CUDA_HOST_DEVICE__ operator       T*()       { return data; }  // implicit conversions back to the raw pointer
+            __CV_CUDA_HOST_DEVICE__ operator const T*() const { return data; }
+        };
+
+        template <typename T> struct PtrSz : public DevPtr<T>  // DevPtr plus a size field
+        {
+            __CV_CUDA_HOST_DEVICE__ PtrSz() : size(0) {}
+            __CV_CUDA_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr<T>(data_), size(size_) {}
+
+            size_t size;  // NOTE(review): unit (elements vs. bytes) is not visible in this header - confirm with callers
+        };
+
+        template <typename T> struct PtrStep : public DevPtr<T>
+        {
+            size_t step;  // distance in bytes between the starts of consecutive rows (see ptr())
+
+            __CV_CUDA_HOST_DEVICE__ PtrStep() : step(0) {}
+            __CV_CUDA_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}
+            // row y starts y * step bytes past data, hence the detour through char*
+            __CV_CUDA_HOST_DEVICE__       T* ptr(int y = 0)       { return (      T*)((      char*)this->data + y * step); }
+            __CV_CUDA_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)((const char*)this->data + y * step); }
+            // element x of row y
+            __CV_CUDA_HOST_DEVICE__       T& operator ()(int y, int x)       { return *(ptr(y) + x); }
+            __CV_CUDA_HOST_DEVICE__ const T& operator ()(int y, int x) const { return *(ptr(y) + x); }
+        };
+
+        template <typename T> struct PtrStepSz : public PtrStep<T>  // PtrStep plus the rows/cols extent
+        {
+            __CV_CUDA_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}
+            __CV_CUDA_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_)  // arguments: rows first, then cols
+                : PtrStep<T>(data_, step_), cols(cols_), rows(rows_) {}
+
+            template <typename U>  // element-type conversion: same step/cols/rows, data pointer C-cast to T*
+            explicit PtrStepSz(const PtrStepSz<U>& d) : PtrStep<T>((T*)d.data, d.step), cols(d.cols), rows(d.rows){}  // NOTE(review): not marked __CV_CUDA_HOST_DEVICE__, unlike the other constructors
+
+            int cols;  // declared before rows, although the constructor takes rows first
+            int rows;
+        };
+
+        typedef PtrStepSz<unsigned char> PtrStepSzb;    // shorthand aliases of PtrStepSz for four element types
+        typedef PtrStepSz<unsigned short> PtrStepSzus;
+        typedef PtrStepSz<float> PtrStepSzf;
+        typedef PtrStepSz<int> PtrStepSzi;
+
+        typedef PtrStep<unsigned char> PtrStepb;        // the same four element types for PtrStep (no rows/cols)
+        typedef PtrStep<unsigned short> PtrStepus;
+        typedef PtrStep<float> PtrStepf;
+        typedef PtrStep<int> PtrStepi;
+
+    }
+}
+
+//! @endcond
+
+#endif /* OPENCV_CORE_CUDA_TYPES_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cv_cpu_dispatch.h b/3rdparty/opencv/opencv410/build/include/opencv2/core/cv_cpu_dispatch.h
new file mode 100644
index 000000000000..0817e7ec7066
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cv_cpu_dispatch.h
@@ -0,0 +1,386 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#if defined __OPENCV_BUILD \
+
+#include "cv_cpu_config.h"
+#include "cv_cpu_helper.h"
+
+#ifdef CV_CPU_DISPATCH_MODE  // a dispatch mode is set: the namespace name is built from opt_ and the mode via __CV_CAT
+#define CV_CPU_OPTIMIZATION_NAMESPACE __CV_CAT(opt_, CV_CPU_DISPATCH_MODE)
+#define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace __CV_CAT(opt_, CV_CPU_DISPATCH_MODE) {
+#define CV_CPU_OPTIMIZATION_NAMESPACE_END }
+#else  // no dispatch mode: use the cpu_baseline namespace and flag that via CV_CPU_BASELINE_MODE
+#define CV_CPU_OPTIMIZATION_NAMESPACE cpu_baseline
+#define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace cpu_baseline {
+#define CV_CPU_OPTIMIZATION_NAMESPACE_END }
+#define CV_CPU_BASELINE_MODE 1
+#endif // CV_CPU_DISPATCH_MODE
+
+
+#define __CV_CPU_DISPATCH_CHAIN_END(fn, args, mode, ...)  /* done: the END link expands to nothing and stops the chain */
+#define __CV_CPU_DISPATCH(fn, args, mode, ...) __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // pastes mode onto __CV_CPU_DISPATCH_CHAIN_ and invokes that link with the remaining modes
+#define __CV_CPU_DISPATCH_EXPAND(fn, args, ...) __CV_EXPAND(__CV_CPU_DISPATCH(fn, args, __VA_ARGS__)) // extra expansion pass before the first mode is split off
+#define CV_CPU_DISPATCH(fn, args, ...) __CV_CPU_DISPATCH_EXPAND(fn, args, __VA_ARGS__, END) // entry point: appends END as the last mode, then expands the chain
+
+
+#if defined CV_ENABLE_INTRINSICS \
+    && !defined CV_DISABLE_OPTIMIZATION \
+    && !defined __CUDACC__ /* do not include SSE/AVX/NEON headers for NVCC compiler */ \
+
+#ifdef CV_CPU_COMPILE_SSE2
+#  include <emmintrin.h>
+#  define CV_MMX 1
+#  define CV_SSE 1
+#  define CV_SSE2 1
+#endif
+#ifdef CV_CPU_COMPILE_SSE3
+#  include <pmmintrin.h>
+#  define CV_SSE3 1
+#endif
+#ifdef CV_CPU_COMPILE_SSSE3
+#  include <tmmintrin.h>
+#  define CV_SSSE3 1
+#endif
+#ifdef CV_CPU_COMPILE_SSE4_1
+#  include <smmintrin.h>
+#  define CV_SSE4_1 1
+#endif
+#ifdef CV_CPU_COMPILE_SSE4_2
+#  include <nmmintrin.h>
+#  define CV_SSE4_2 1
+#endif
+#ifdef CV_CPU_COMPILE_POPCNT
+#  ifdef _MSC_VER
+#    include <nmmintrin.h>
+#    if defined(_M_X64)
+#      define CV_POPCNT_U64 (int)_mm_popcnt_u64
+#    endif
+#    define CV_POPCNT_U32 _mm_popcnt_u32
+#  else
+#    include <popcntintrin.h>
+#    if defined(__x86_64__)
+#      define CV_POPCNT_U64 __builtin_popcountll
+#    endif
+#    define CV_POPCNT_U32 __builtin_popcount
+#  endif
+#  define CV_POPCNT 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX
+#  include <immintrin.h>
+#  define CV_AVX 1
+#endif
+#ifdef CV_CPU_COMPILE_FP16
+#  if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64)
+#    include <arm_neon.h>
+#  else
+#    include <immintrin.h>
+#  endif
+#  define CV_FP16 1
+#endif
+#ifdef CV_CPU_COMPILE_NEON_DOTPROD
+#  include <arm_neon.h>
+#  define CV_NEON_DOT 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX2
+#  include <immintrin.h>
+#  define CV_AVX2 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX_512F
+#  include <immintrin.h>
+#  define CV_AVX_512F 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_COMMON
+#  define CV_AVX512_COMMON 1
+#  define CV_AVX_512CD 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_KNL
+#  define CV_AVX512_KNL 1
+#  define CV_AVX_512ER 1
+#  define CV_AVX_512PF 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_KNM
+#  define CV_AVX512_KNM 1
+#  define CV_AVX_5124FMAPS 1
+#  define CV_AVX_5124VNNIW 1
+#  define CV_AVX_512VPOPCNTDQ 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_SKX
+#  define CV_AVX512_SKX 1
+#  define CV_AVX_512VL 1
+#  define CV_AVX_512BW 1
+#  define CV_AVX_512DQ 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_CNL
+#  define CV_AVX512_CNL 1
+#  define CV_AVX_512IFMA 1
+#  define CV_AVX_512VBMI 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_CLX
+#  define CV_AVX512_CLX 1
+#  define CV_AVX_512VNNI 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_ICL
+#  define CV_AVX512_ICL 1
+#  undef CV_AVX_512IFMA
+#  define CV_AVX_512IFMA 1
+#  undef CV_AVX_512VBMI
+#  define CV_AVX_512VBMI 1
+#  undef CV_AVX_512VNNI
+#  define CV_AVX_512VNNI 1
+#  define CV_AVX_512VBMI2 1
+#  define CV_AVX_512BITALG 1
+#  define CV_AVX_512VPOPCNTDQ 1
+#endif
+#ifdef CV_CPU_COMPILE_FMA3
+#  define CV_FMA3 1
+#endif
+
+#if defined _WIN32 && (defined(_M_ARM) || defined(_M_ARM64)) && (defined(CV_CPU_COMPILE_NEON) || !defined(_MSC_VER))
+# include <intrin.h>   // lowercase spelling: <Intrin.h> is not found on case-sensitive file systems
+# include <arm_neon.h>
+# define CV_NEON 1     // Windows on ARM/ARM64 branch
+#elif defined(__ARM_NEON)
+#  include <arm_neon.h>
+#  define CV_NEON 1    // any other compiler that predefines __ARM_NEON
+#endif
+
+#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_vector_071)
+# include<riscv_vector.h>
+# define CV_RVV071 1
+#endif
+
+#ifdef CV_CPU_COMPILE_VSX
+#  include <altivec.h>
+#  undef vector
+#  undef pixel
+#  undef bool
+#  define CV_VSX 1
+#endif
+
+#ifdef CV_CPU_COMPILE_VSX3
+#  define CV_VSX3 1
+#endif
+
+#ifdef CV_CPU_COMPILE_MSA
+#  include "hal/msa_macros.h"
+#  define CV_MSA 1
+#endif
+
+#ifdef CV_CPU_COMPILE_LSX
+#  include <lsxintrin.h>
+#  define CV_LSX 1
+#endif
+
+#ifdef CV_CPU_COMPILE_LASX
+#  include <lasxintrin.h>
+#  define CV_LASX 1
+#endif
+
+#ifdef __EMSCRIPTEN__
+#  define CV_WASM_SIMD 1
+#  include <wasm_simd128.h>
+#endif
+
+#if defined CV_CPU_COMPILE_RVV
+#  define CV_RVV 1
+#  include <riscv_vector.h>
+#endif
+
+#endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__
+
+#if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX
+struct VZeroUpperGuard {  // scope guard: calls _mm256_zeroupper() on construction and again on destruction
+#ifdef __GNUC__
+    __attribute__((always_inline))
+#endif
+    inline VZeroUpperGuard() { _mm256_zeroupper(); }
+#ifdef __GNUC__
+    __attribute__((always_inline))
+#endif
+    inline ~VZeroUpperGuard() { _mm256_zeroupper(); }
+};
+#define __CV_AVX_GUARD VZeroUpperGuard __vzeroupper_guard; CV_UNUSED(__vzeroupper_guard);
+#endif // CV_CPU_COMPILE_AVX && !CV_CPU_BASELINE_COMPILE_AVX
+
+#ifdef __CV_AVX_GUARD  // public spelling: declares the guard object when it exists, otherwise expands to nothing
+#define CV_AVX_GUARD __CV_AVX_GUARD
+#else
+#define CV_AVX_GUARD
+#endif
+
+#endif // __OPENCV_BUILD
+
+
+
+#if !defined __OPENCV_BUILD /* Compatibility code */ \
+    && !defined __CUDACC__ /* do not include SSE/AVX/NEON headers for NVCC compiler */
+#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
+#  include <emmintrin.h>
+#  define CV_MMX 1
+#  define CV_SSE 1
+#  define CV_SSE2 1
+#elif defined _WIN32 && (defined(_M_ARM) || defined(_M_ARM64)) && (defined(CV_CPU_COMPILE_NEON) || !defined(_MSC_VER))
+# include <intrin.h>   // lowercase spelling: <Intrin.h> is not found on case-sensitive file systems
+# include <arm_neon.h>
+# define CV_NEON 1
+#elif defined(__ARM_NEON)
+#  include <arm_neon.h>
+#  define CV_NEON 1
+#elif defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
+#  include <altivec.h>
+#  undef vector        // remove the vector / pixel / bool names again right after including altivec.h
+#  undef pixel
+#  undef bool
+#  define CV_VSX 1
+#endif
+
+#ifdef __F16C__        // checked independently of the SIMD family selected above
+#  include <immintrin.h>
+#  define CV_FP16 1
+#endif
+
+#endif // !__OPENCV_BUILD && !__CUDACC__ (Compatibility code)
+
+
+
+#ifndef CV_MMX
+#  define CV_MMX 0
+#endif
+#ifndef CV_SSE
+#  define CV_SSE 0
+#endif
+#ifndef CV_SSE2
+#  define CV_SSE2 0
+#endif
+#ifndef CV_SSE3
+#  define CV_SSE3 0
+#endif
+#ifndef CV_SSSE3
+#  define CV_SSSE3 0
+#endif
+#ifndef CV_SSE4_1
+#  define CV_SSE4_1 0
+#endif
+#ifndef CV_SSE4_2
+#  define CV_SSE4_2 0
+#endif
+#ifndef CV_POPCNT
+#  define CV_POPCNT 0
+#endif
+#ifndef CV_AVX
+#  define CV_AVX 0
+#endif
+#ifndef CV_FP16
+#  define CV_FP16 0
+#endif
+#ifndef CV_AVX2
+#  define CV_AVX2 0
+#endif
+#ifndef CV_FMA3
+#  define CV_FMA3 0
+#endif
+#ifndef CV_AVX_512F
+#  define CV_AVX_512F 0
+#endif
+#ifndef CV_AVX_512BW
+#  define CV_AVX_512BW 0
+#endif
+#ifndef CV_AVX_512CD
+#  define CV_AVX_512CD 0
+#endif
+#ifndef CV_AVX_512DQ
+#  define CV_AVX_512DQ 0
+#endif
+#ifndef CV_AVX_512ER
+#  define CV_AVX_512ER 0
+#endif
+#ifndef CV_AVX_512IFMA
+#  define CV_AVX_512IFMA 0
+#endif
+#define CV_AVX_512IFMA512 CV_AVX_512IFMA // deprecated
+#ifndef CV_AVX_512PF
+#  define CV_AVX_512PF 0
+#endif
+#ifndef CV_AVX_512VBMI
+#  define CV_AVX_512VBMI 0
+#endif
+#ifndef CV_AVX_512VL
+#  define CV_AVX_512VL 0
+#endif
+#ifndef CV_AVX_5124FMAPS
+#  define CV_AVX_5124FMAPS 0
+#endif
+#ifndef CV_AVX_5124VNNIW
+#  define CV_AVX_5124VNNIW 0
+#endif
+#ifndef CV_AVX_512VPOPCNTDQ
+#  define CV_AVX_512VPOPCNTDQ 0
+#endif
+#ifndef CV_AVX_512VNNI
+#  define CV_AVX_512VNNI 0
+#endif
+#ifndef CV_AVX_512VBMI2
+#  define CV_AVX_512VBMI2 0
+#endif
+#ifndef CV_AVX_512BITALG
+#  define CV_AVX_512BITALG 0
+#endif
+#ifndef CV_AVX512_COMMON
+#  define CV_AVX512_COMMON 0
+#endif
+#ifndef CV_AVX512_KNL
+#  define CV_AVX512_KNL 0
+#endif
+#ifndef CV_AVX512_KNM
+#  define CV_AVX512_KNM 0
+#endif
+#ifndef CV_AVX512_SKX
+#  define CV_AVX512_SKX 0
+#endif
+#ifndef CV_AVX512_CNL
+#  define CV_AVX512_CNL 0
+#endif
+#ifndef CV_AVX512_CLX
+#  define CV_AVX512_CLX 0
+#endif
+#ifndef CV_AVX512_ICL
+#  define CV_AVX512_ICL 0
+#endif
+
+#ifndef CV_NEON
+#  define CV_NEON 0
+#endif
+
+#ifndef CV_RVV071
+#  define CV_RVV071 0
+#endif
+
+#ifndef CV_VSX
+#  define CV_VSX 0
+#endif
+
+#ifndef CV_VSX3
+#  define CV_VSX3 0
+#endif
+
+#ifndef CV_MSA
+#  define CV_MSA 0
+#endif
+
+#ifndef CV_WASM_SIMD
+#  define CV_WASM_SIMD 0
+#endif
+
+#ifndef CV_RVV
+#  define CV_RVV 0
+#endif
+
+#ifndef CV_LSX
+#  define CV_LSX 0
+#endif
+
+#ifndef CV_LASX
+#  define CV_LASX 0
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cv_cpu_helper.h b/3rdparty/opencv/opencv410/build/include/opencv2/core/cv_cpu_helper.h
new file mode 100644
index 000000000000..04b00d202443
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cv_cpu_helper.h
@@ -0,0 +1,613 @@
+// AUTOGENERATED, DO NOT EDIT
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE
+#  define CV_TRY_SSE 1
+#  define CV_CPU_FORCE_SSE 1
+#  define CV_CPU_HAS_SUPPORT_SSE 1
+#  define CV_CPU_CALL_SSE(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_SSE_(fn, args) return (opt_SSE::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE
+#  define CV_TRY_SSE 1
+#  define CV_CPU_FORCE_SSE 0
+#  define CV_CPU_HAS_SUPPORT_SSE (cv::checkHardwareSupport(CV_CPU_SSE))
+#  define CV_CPU_CALL_SSE(fn, args) if (CV_CPU_HAS_SUPPORT_SSE) return (opt_SSE::fn args)
+#  define CV_CPU_CALL_SSE_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE) return (opt_SSE::fn args)
+#else
+#  define CV_TRY_SSE 0
+#  define CV_CPU_FORCE_SSE 0
+#  define CV_CPU_HAS_SUPPORT_SSE 0
+#  define CV_CPU_CALL_SSE(fn, args)
+#  define CV_CPU_CALL_SSE_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_SSE(fn, args, mode, ...)  CV_CPU_CALL_SSE(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE2
+#  define CV_TRY_SSE2 1
+#  define CV_CPU_FORCE_SSE2 1
+#  define CV_CPU_HAS_SUPPORT_SSE2 1
+#  define CV_CPU_CALL_SSE2(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_SSE2_(fn, args) return (opt_SSE2::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE2
+#  define CV_TRY_SSE2 1
+#  define CV_CPU_FORCE_SSE2 0
+#  define CV_CPU_HAS_SUPPORT_SSE2 (cv::checkHardwareSupport(CV_CPU_SSE2))
+#  define CV_CPU_CALL_SSE2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE2) return (opt_SSE2::fn args)
+#  define CV_CPU_CALL_SSE2_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE2) return (opt_SSE2::fn args)
+#else
+#  define CV_TRY_SSE2 0
+#  define CV_CPU_FORCE_SSE2 0
+#  define CV_CPU_HAS_SUPPORT_SSE2 0
+#  define CV_CPU_CALL_SSE2(fn, args)
+#  define CV_CPU_CALL_SSE2_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_SSE2(fn, args, mode, ...)  CV_CPU_CALL_SSE2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE3
+#  define CV_TRY_SSE3 1
+#  define CV_CPU_FORCE_SSE3 1
+#  define CV_CPU_HAS_SUPPORT_SSE3 1
+#  define CV_CPU_CALL_SSE3(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_SSE3_(fn, args) return (opt_SSE3::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE3
+#  define CV_TRY_SSE3 1
+#  define CV_CPU_FORCE_SSE3 0
+#  define CV_CPU_HAS_SUPPORT_SSE3 (cv::checkHardwareSupport(CV_CPU_SSE3))
+#  define CV_CPU_CALL_SSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSE3) return (opt_SSE3::fn args)
+#  define CV_CPU_CALL_SSE3_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE3) return (opt_SSE3::fn args)
+#else
+#  define CV_TRY_SSE3 0
+#  define CV_CPU_FORCE_SSE3 0
+#  define CV_CPU_HAS_SUPPORT_SSE3 0
+#  define CV_CPU_CALL_SSE3(fn, args)
+#  define CV_CPU_CALL_SSE3_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_SSE3(fn, args, mode, ...)  CV_CPU_CALL_SSE3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSSE3
+#  define CV_TRY_SSSE3 1
+#  define CV_CPU_FORCE_SSSE3 1
+#  define CV_CPU_HAS_SUPPORT_SSSE3 1
+#  define CV_CPU_CALL_SSSE3(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_SSSE3_(fn, args) return (opt_SSSE3::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSSE3
+#  define CV_TRY_SSSE3 1
+#  define CV_CPU_FORCE_SSSE3 0
+#  define CV_CPU_HAS_SUPPORT_SSSE3 (cv::checkHardwareSupport(CV_CPU_SSSE3))
+#  define CV_CPU_CALL_SSSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSSE3) return (opt_SSSE3::fn args)
+#  define CV_CPU_CALL_SSSE3_(fn, args) if (CV_CPU_HAS_SUPPORT_SSSE3) return (opt_SSSE3::fn args)
+#else
+#  define CV_TRY_SSSE3 0
+#  define CV_CPU_FORCE_SSSE3 0
+#  define CV_CPU_HAS_SUPPORT_SSSE3 0
+#  define CV_CPU_CALL_SSSE3(fn, args)
+#  define CV_CPU_CALL_SSSE3_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_SSSE3(fn, args, mode, ...)  CV_CPU_CALL_SSSE3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_1
+#  define CV_TRY_SSE4_1 1
+#  define CV_CPU_FORCE_SSE4_1 1
+#  define CV_CPU_HAS_SUPPORT_SSE4_1 1
+#  define CV_CPU_CALL_SSE4_1(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_SSE4_1_(fn, args) return (opt_SSE4_1::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_1
+#  define CV_TRY_SSE4_1 1
+#  define CV_CPU_FORCE_SSE4_1 0
+#  define CV_CPU_HAS_SUPPORT_SSE4_1 (cv::checkHardwareSupport(CV_CPU_SSE4_1))
+#  define CV_CPU_CALL_SSE4_1(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_1) return (opt_SSE4_1::fn args)
+#  define CV_CPU_CALL_SSE4_1_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_1) return (opt_SSE4_1::fn args)
+#else
+#  define CV_TRY_SSE4_1 0
+#  define CV_CPU_FORCE_SSE4_1 0
+#  define CV_CPU_HAS_SUPPORT_SSE4_1 0
+#  define CV_CPU_CALL_SSE4_1(fn, args)
+#  define CV_CPU_CALL_SSE4_1_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_SSE4_1(fn, args, mode, ...)  CV_CPU_CALL_SSE4_1(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_2
+#  define CV_TRY_SSE4_2 1
+#  define CV_CPU_FORCE_SSE4_2 1
+#  define CV_CPU_HAS_SUPPORT_SSE4_2 1
+#  define CV_CPU_CALL_SSE4_2(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_SSE4_2_(fn, args) return (opt_SSE4_2::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_2
+#  define CV_TRY_SSE4_2 1
+#  define CV_CPU_FORCE_SSE4_2 0
+#  define CV_CPU_HAS_SUPPORT_SSE4_2 (cv::checkHardwareSupport(CV_CPU_SSE4_2))
+#  define CV_CPU_CALL_SSE4_2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_2) return (opt_SSE4_2::fn args)
+#  define CV_CPU_CALL_SSE4_2_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_2) return (opt_SSE4_2::fn args)
+#else
+#  define CV_TRY_SSE4_2 0
+#  define CV_CPU_FORCE_SSE4_2 0
+#  define CV_CPU_HAS_SUPPORT_SSE4_2 0
+#  define CV_CPU_CALL_SSE4_2(fn, args)
+#  define CV_CPU_CALL_SSE4_2_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_SSE4_2(fn, args, mode, ...)  CV_CPU_CALL_SSE4_2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_POPCNT
+#  define CV_TRY_POPCNT 1
+#  define CV_CPU_FORCE_POPCNT 1
+#  define CV_CPU_HAS_SUPPORT_POPCNT 1
+#  define CV_CPU_CALL_POPCNT(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_POPCNT_(fn, args) return (opt_POPCNT::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_POPCNT
+#  define CV_TRY_POPCNT 1
+#  define CV_CPU_FORCE_POPCNT 0
+#  define CV_CPU_HAS_SUPPORT_POPCNT (cv::checkHardwareSupport(CV_CPU_POPCNT))
+#  define CV_CPU_CALL_POPCNT(fn, args) if (CV_CPU_HAS_SUPPORT_POPCNT) return (opt_POPCNT::fn args)
+#  define CV_CPU_CALL_POPCNT_(fn, args) if (CV_CPU_HAS_SUPPORT_POPCNT) return (opt_POPCNT::fn args)
+#else
+#  define CV_TRY_POPCNT 0
+#  define CV_CPU_FORCE_POPCNT 0
+#  define CV_CPU_HAS_SUPPORT_POPCNT 0
+#  define CV_CPU_CALL_POPCNT(fn, args)
+#  define CV_CPU_CALL_POPCNT_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_POPCNT(fn, args, mode, ...)  CV_CPU_CALL_POPCNT(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX
+#  define CV_TRY_AVX 1
+#  define CV_CPU_FORCE_AVX 1
+#  define CV_CPU_HAS_SUPPORT_AVX 1
+#  define CV_CPU_CALL_AVX(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX_(fn, args) return (opt_AVX::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX
+#  define CV_TRY_AVX 1
+#  define CV_CPU_FORCE_AVX 0
+#  define CV_CPU_HAS_SUPPORT_AVX (cv::checkHardwareSupport(CV_CPU_AVX))
+#  define CV_CPU_CALL_AVX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX) return (opt_AVX::fn args)
+#  define CV_CPU_CALL_AVX_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX) return (opt_AVX::fn args)
+#else
+#  define CV_TRY_AVX 0
+#  define CV_CPU_FORCE_AVX 0
+#  define CV_CPU_HAS_SUPPORT_AVX 0
+#  define CV_CPU_CALL_AVX(fn, args)
+#  define CV_CPU_CALL_AVX_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_AVX(fn, args, mode, ...)  CV_CPU_CALL_AVX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FP16
+#  define CV_TRY_FP16 1
+#  define CV_CPU_FORCE_FP16 1
+#  define CV_CPU_HAS_SUPPORT_FP16 1
+#  define CV_CPU_CALL_FP16(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_FP16_(fn, args) return (opt_FP16::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FP16
+#  define CV_TRY_FP16 1
+#  define CV_CPU_FORCE_FP16 0
+#  define CV_CPU_HAS_SUPPORT_FP16 (cv::checkHardwareSupport(CV_CPU_FP16))
+#  define CV_CPU_CALL_FP16(fn, args) if (CV_CPU_HAS_SUPPORT_FP16) return (opt_FP16::fn args)
+#  define CV_CPU_CALL_FP16_(fn, args) if (CV_CPU_HAS_SUPPORT_FP16) return (opt_FP16::fn args)
+#else
+#  define CV_TRY_FP16 0
+#  define CV_CPU_FORCE_FP16 0
+#  define CV_CPU_HAS_SUPPORT_FP16 0
+#  define CV_CPU_CALL_FP16(fn, args)
+#  define CV_CPU_CALL_FP16_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_FP16(fn, args, mode, ...)  CV_CPU_CALL_FP16(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX2
+#  define CV_TRY_AVX2 1
+#  define CV_CPU_FORCE_AVX2 1
+#  define CV_CPU_HAS_SUPPORT_AVX2 1
+#  define CV_CPU_CALL_AVX2(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX2_(fn, args) return (opt_AVX2::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX2
+#  define CV_TRY_AVX2 1
+#  define CV_CPU_FORCE_AVX2 0
+#  define CV_CPU_HAS_SUPPORT_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2))
+#  define CV_CPU_CALL_AVX2(fn, args) if (CV_CPU_HAS_SUPPORT_AVX2) return (opt_AVX2::fn args)
+#  define CV_CPU_CALL_AVX2_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX2) return (opt_AVX2::fn args)
+#else
+#  define CV_TRY_AVX2 0
+#  define CV_CPU_FORCE_AVX2 0
+#  define CV_CPU_HAS_SUPPORT_AVX2 0
+#  define CV_CPU_CALL_AVX2(fn, args)
+#  define CV_CPU_CALL_AVX2_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_AVX2(fn, args, mode, ...)  CV_CPU_CALL_AVX2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FMA3
+#  define CV_TRY_FMA3 1
+#  define CV_CPU_FORCE_FMA3 1
+#  define CV_CPU_HAS_SUPPORT_FMA3 1
+#  define CV_CPU_CALL_FMA3(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_FMA3_(fn, args) return (opt_FMA3::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FMA3
+#  define CV_TRY_FMA3 1
+#  define CV_CPU_FORCE_FMA3 0
+#  define CV_CPU_HAS_SUPPORT_FMA3 (cv::checkHardwareSupport(CV_CPU_FMA3))
+#  define CV_CPU_CALL_FMA3(fn, args) if (CV_CPU_HAS_SUPPORT_FMA3) return (opt_FMA3::fn args)
+#  define CV_CPU_CALL_FMA3_(fn, args) if (CV_CPU_HAS_SUPPORT_FMA3) return (opt_FMA3::fn args)
+#else
+#  define CV_TRY_FMA3 0
+#  define CV_CPU_FORCE_FMA3 0
+#  define CV_CPU_HAS_SUPPORT_FMA3 0
+#  define CV_CPU_CALL_FMA3(fn, args)
+#  define CV_CPU_CALL_FMA3_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_FMA3(fn, args, mode, ...)  CV_CPU_CALL_FMA3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX_512F // forced: no runtime check
+#  define CV_TRY_AVX_512F 1
+#  define CV_CPU_FORCE_AVX_512F 1
+#  define CV_CPU_HAS_SUPPORT_AVX_512F 1
+#  define CV_CPU_CALL_AVX_512F(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX_512F_(fn, args) return (opt_AVX_512F::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX_512F // dispatched: runtime check
+#  define CV_TRY_AVX_512F 1
+#  define CV_CPU_FORCE_AVX_512F 0
+#  define CV_CPU_HAS_SUPPORT_AVX_512F (cv::checkHardwareSupport(CV_CPU_AVX_512F))
+#  define CV_CPU_CALL_AVX_512F(fn, args) if (CV_CPU_HAS_SUPPORT_AVX_512F) return (opt_AVX_512F::fn args)
+#  define CV_CPU_CALL_AVX_512F_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX_512F) return (opt_AVX_512F::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_AVX_512F 0
+#  define CV_CPU_FORCE_AVX_512F 0
+#  define CV_CPU_HAS_SUPPORT_AVX_512F 0
+#  define CV_CPU_CALL_AVX_512F(fn, args)
+#  define CV_CPU_CALL_AVX_512F_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_AVX_512F(fn, args, mode, ...)  CV_CPU_CALL_AVX_512F(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try AVX_512F, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_COMMON // forced: no runtime check
+#  define CV_TRY_AVX512_COMMON 1
+#  define CV_CPU_FORCE_AVX512_COMMON 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_COMMON 1
+#  define CV_CPU_CALL_AVX512_COMMON(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_COMMON_(fn, args) return (opt_AVX512_COMMON::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_COMMON // dispatched: runtime check
+#  define CV_TRY_AVX512_COMMON 1
+#  define CV_CPU_FORCE_AVX512_COMMON 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_COMMON (cv::checkHardwareSupport(CV_CPU_AVX512_COMMON))
+#  define CV_CPU_CALL_AVX512_COMMON(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_COMMON) return (opt_AVX512_COMMON::fn args)
+#  define CV_CPU_CALL_AVX512_COMMON_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_COMMON) return (opt_AVX512_COMMON::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_AVX512_COMMON 0
+#  define CV_CPU_FORCE_AVX512_COMMON 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_COMMON 0
+#  define CV_CPU_CALL_AVX512_COMMON(fn, args)
+#  define CV_CPU_CALL_AVX512_COMMON_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_COMMON(fn, args, mode, ...)  CV_CPU_CALL_AVX512_COMMON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try AVX512_COMMON, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_KNL // forced: no runtime check
+#  define CV_TRY_AVX512_KNL 1
+#  define CV_CPU_FORCE_AVX512_KNL 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNL 1
+#  define CV_CPU_CALL_AVX512_KNL(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_KNL_(fn, args) return (opt_AVX512_KNL::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_KNL // dispatched: runtime check
+#  define CV_TRY_AVX512_KNL 1
+#  define CV_CPU_FORCE_AVX512_KNL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNL (cv::checkHardwareSupport(CV_CPU_AVX512_KNL))
+#  define CV_CPU_CALL_AVX512_KNL(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_KNL) return (opt_AVX512_KNL::fn args)
+#  define CV_CPU_CALL_AVX512_KNL_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_KNL) return (opt_AVX512_KNL::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_AVX512_KNL 0
+#  define CV_CPU_FORCE_AVX512_KNL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNL 0
+#  define CV_CPU_CALL_AVX512_KNL(fn, args)
+#  define CV_CPU_CALL_AVX512_KNL_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_KNL(fn, args, mode, ...)  CV_CPU_CALL_AVX512_KNL(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try AVX512_KNL, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_KNM // forced: no runtime check
+#  define CV_TRY_AVX512_KNM 1
+#  define CV_CPU_FORCE_AVX512_KNM 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNM 1
+#  define CV_CPU_CALL_AVX512_KNM(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_KNM_(fn, args) return (opt_AVX512_KNM::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_KNM // dispatched: runtime check
+#  define CV_TRY_AVX512_KNM 1
+#  define CV_CPU_FORCE_AVX512_KNM 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNM (cv::checkHardwareSupport(CV_CPU_AVX512_KNM))
+#  define CV_CPU_CALL_AVX512_KNM(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_KNM) return (opt_AVX512_KNM::fn args)
+#  define CV_CPU_CALL_AVX512_KNM_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_KNM) return (opt_AVX512_KNM::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_AVX512_KNM 0
+#  define CV_CPU_FORCE_AVX512_KNM 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNM 0
+#  define CV_CPU_CALL_AVX512_KNM(fn, args)
+#  define CV_CPU_CALL_AVX512_KNM_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_KNM(fn, args, mode, ...)  CV_CPU_CALL_AVX512_KNM(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try AVX512_KNM, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_SKX // forced: no runtime check
+#  define CV_TRY_AVX512_SKX 1
+#  define CV_CPU_FORCE_AVX512_SKX 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_SKX 1
+#  define CV_CPU_CALL_AVX512_SKX(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_SKX_(fn, args) return (opt_AVX512_SKX::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX // dispatched: runtime check
+#  define CV_TRY_AVX512_SKX 1
+#  define CV_CPU_FORCE_AVX512_SKX 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_SKX (cv::checkHardwareSupport(CV_CPU_AVX512_SKX))
+#  define CV_CPU_CALL_AVX512_SKX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_SKX) return (opt_AVX512_SKX::fn args)
+#  define CV_CPU_CALL_AVX512_SKX_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_SKX) return (opt_AVX512_SKX::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_AVX512_SKX 0
+#  define CV_CPU_FORCE_AVX512_SKX 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_SKX 0
+#  define CV_CPU_CALL_AVX512_SKX(fn, args)
+#  define CV_CPU_CALL_AVX512_SKX_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_SKX(fn, args, mode, ...)  CV_CPU_CALL_AVX512_SKX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try AVX512_SKX, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_CNL // forced: no runtime check
+#  define CV_TRY_AVX512_CNL 1
+#  define CV_CPU_FORCE_AVX512_CNL 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_CNL 1
+#  define CV_CPU_CALL_AVX512_CNL(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_CNL_(fn, args) return (opt_AVX512_CNL::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_CNL // dispatched: runtime check
+#  define CV_TRY_AVX512_CNL 1
+#  define CV_CPU_FORCE_AVX512_CNL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_CNL (cv::checkHardwareSupport(CV_CPU_AVX512_CNL))
+#  define CV_CPU_CALL_AVX512_CNL(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_CNL) return (opt_AVX512_CNL::fn args)
+#  define CV_CPU_CALL_AVX512_CNL_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_CNL) return (opt_AVX512_CNL::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_AVX512_CNL 0
+#  define CV_CPU_FORCE_AVX512_CNL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_CNL 0
+#  define CV_CPU_CALL_AVX512_CNL(fn, args)
+#  define CV_CPU_CALL_AVX512_CNL_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_CNL(fn, args, mode, ...)  CV_CPU_CALL_AVX512_CNL(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try AVX512_CNL, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_CLX // forced: no runtime check
+#  define CV_TRY_AVX512_CLX 1
+#  define CV_CPU_FORCE_AVX512_CLX 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_CLX 1
+#  define CV_CPU_CALL_AVX512_CLX(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_CLX_(fn, args) return (opt_AVX512_CLX::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_CLX // dispatched: runtime check
+#  define CV_TRY_AVX512_CLX 1
+#  define CV_CPU_FORCE_AVX512_CLX 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_CLX (cv::checkHardwareSupport(CV_CPU_AVX512_CLX))
+#  define CV_CPU_CALL_AVX512_CLX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_CLX) return (opt_AVX512_CLX::fn args)
+#  define CV_CPU_CALL_AVX512_CLX_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_CLX) return (opt_AVX512_CLX::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_AVX512_CLX 0
+#  define CV_CPU_FORCE_AVX512_CLX 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_CLX 0
+#  define CV_CPU_CALL_AVX512_CLX(fn, args)
+#  define CV_CPU_CALL_AVX512_CLX_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_CLX(fn, args, mode, ...)  CV_CPU_CALL_AVX512_CLX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try AVX512_CLX, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_ICL // forced: no runtime check
+#  define CV_TRY_AVX512_ICL 1
+#  define CV_CPU_FORCE_AVX512_ICL 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_ICL 1
+#  define CV_CPU_CALL_AVX512_ICL(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_ICL_(fn, args) return (opt_AVX512_ICL::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_ICL // dispatched: runtime check
+#  define CV_TRY_AVX512_ICL 1
+#  define CV_CPU_FORCE_AVX512_ICL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_ICL (cv::checkHardwareSupport(CV_CPU_AVX512_ICL))
+#  define CV_CPU_CALL_AVX512_ICL(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_ICL) return (opt_AVX512_ICL::fn args)
+#  define CV_CPU_CALL_AVX512_ICL_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_ICL) return (opt_AVX512_ICL::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_AVX512_ICL 0
+#  define CV_CPU_FORCE_AVX512_ICL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_ICL 0
+#  define CV_CPU_CALL_AVX512_ICL(fn, args)
+#  define CV_CPU_CALL_AVX512_ICL_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_ICL(fn, args, mode, ...)  CV_CPU_CALL_AVX512_ICL(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try AVX512_ICL, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON // forced: no runtime check
+#  define CV_TRY_NEON 1
+#  define CV_CPU_FORCE_NEON 1
+#  define CV_CPU_HAS_SUPPORT_NEON 1
+#  define CV_CPU_CALL_NEON(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_NEON_(fn, args) return (opt_NEON::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON // dispatched: runtime check
+#  define CV_TRY_NEON 1
+#  define CV_CPU_FORCE_NEON 0
+#  define CV_CPU_HAS_SUPPORT_NEON (cv::checkHardwareSupport(CV_CPU_NEON))
+#  define CV_CPU_CALL_NEON(fn, args) if (CV_CPU_HAS_SUPPORT_NEON) return (opt_NEON::fn args)
+#  define CV_CPU_CALL_NEON_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON) return (opt_NEON::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_NEON 0
+#  define CV_CPU_FORCE_NEON 0
+#  define CV_CPU_HAS_SUPPORT_NEON 0
+#  define CV_CPU_CALL_NEON(fn, args)
+#  define CV_CPU_CALL_NEON_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...)  CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try NEON, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON_DOTPROD // forced: no runtime check
+#  define CV_TRY_NEON_DOTPROD 1
+#  define CV_CPU_FORCE_NEON_DOTPROD 1
+#  define CV_CPU_HAS_SUPPORT_NEON_DOTPROD 1
+#  define CV_CPU_CALL_NEON_DOTPROD(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_NEON_DOTPROD_(fn, args) return (opt_NEON_DOTPROD::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON_DOTPROD // dispatched: runtime check
+#  define CV_TRY_NEON_DOTPROD 1
+#  define CV_CPU_FORCE_NEON_DOTPROD 0
+#  define CV_CPU_HAS_SUPPORT_NEON_DOTPROD (cv::checkHardwareSupport(CV_CPU_NEON_DOTPROD))
+#  define CV_CPU_CALL_NEON_DOTPROD(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_DOTPROD) return (opt_NEON_DOTPROD::fn args)
+#  define CV_CPU_CALL_NEON_DOTPROD_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_DOTPROD) return (opt_NEON_DOTPROD::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_NEON_DOTPROD 0
+#  define CV_CPU_FORCE_NEON_DOTPROD 0
+#  define CV_CPU_HAS_SUPPORT_NEON_DOTPROD 0
+#  define CV_CPU_CALL_NEON_DOTPROD(fn, args)
+#  define CV_CPU_CALL_NEON_DOTPROD_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_NEON_DOTPROD(fn, args, mode, ...)  CV_CPU_CALL_NEON_DOTPROD(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try NEON_DOTPROD, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON_FP16 // forced: no runtime check
+#  define CV_TRY_NEON_FP16 1
+#  define CV_CPU_FORCE_NEON_FP16 1
+#  define CV_CPU_HAS_SUPPORT_NEON_FP16 1
+#  define CV_CPU_CALL_NEON_FP16(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_NEON_FP16_(fn, args) return (opt_NEON_FP16::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON_FP16 // dispatched: runtime check
+#  define CV_TRY_NEON_FP16 1
+#  define CV_CPU_FORCE_NEON_FP16 0
+#  define CV_CPU_HAS_SUPPORT_NEON_FP16 (cv::checkHardwareSupport(CV_CPU_NEON_FP16))
+#  define CV_CPU_CALL_NEON_FP16(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_FP16) return (opt_NEON_FP16::fn args)
+#  define CV_CPU_CALL_NEON_FP16_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_FP16) return (opt_NEON_FP16::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_NEON_FP16 0
+#  define CV_CPU_FORCE_NEON_FP16 0
+#  define CV_CPU_HAS_SUPPORT_NEON_FP16 0
+#  define CV_CPU_CALL_NEON_FP16(fn, args)
+#  define CV_CPU_CALL_NEON_FP16_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_NEON_FP16(fn, args, mode, ...)  CV_CPU_CALL_NEON_FP16(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try NEON_FP16, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON_BF16 // forced: no runtime check
+#  define CV_TRY_NEON_BF16 1
+#  define CV_CPU_FORCE_NEON_BF16 1
+#  define CV_CPU_HAS_SUPPORT_NEON_BF16 1
+#  define CV_CPU_CALL_NEON_BF16(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_NEON_BF16_(fn, args) return (opt_NEON_BF16::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON_BF16 // dispatched: runtime check
+#  define CV_TRY_NEON_BF16 1
+#  define CV_CPU_FORCE_NEON_BF16 0
+#  define CV_CPU_HAS_SUPPORT_NEON_BF16 (cv::checkHardwareSupport(CV_CPU_NEON_BF16))
+#  define CV_CPU_CALL_NEON_BF16(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_BF16) return (opt_NEON_BF16::fn args)
+#  define CV_CPU_CALL_NEON_BF16_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_BF16) return (opt_NEON_BF16::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_NEON_BF16 0
+#  define CV_CPU_FORCE_NEON_BF16 0
+#  define CV_CPU_HAS_SUPPORT_NEON_BF16 0
+#  define CV_CPU_CALL_NEON_BF16(fn, args)
+#  define CV_CPU_CALL_NEON_BF16_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_NEON_BF16(fn, args, mode, ...)  CV_CPU_CALL_NEON_BF16(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try NEON_BF16, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_MSA // forced: no runtime check
+#  define CV_TRY_MSA 1
+#  define CV_CPU_FORCE_MSA 1
+#  define CV_CPU_HAS_SUPPORT_MSA 1
+#  define CV_CPU_CALL_MSA(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_MSA_(fn, args) return (opt_MSA::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_MSA // dispatched: runtime check
+#  define CV_TRY_MSA 1
+#  define CV_CPU_FORCE_MSA 0
+#  define CV_CPU_HAS_SUPPORT_MSA (cv::checkHardwareSupport(CV_CPU_MSA))
+#  define CV_CPU_CALL_MSA(fn, args) if (CV_CPU_HAS_SUPPORT_MSA) return (opt_MSA::fn args)
+#  define CV_CPU_CALL_MSA_(fn, args) if (CV_CPU_HAS_SUPPORT_MSA) return (opt_MSA::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_MSA 0
+#  define CV_CPU_FORCE_MSA 0
+#  define CV_CPU_HAS_SUPPORT_MSA 0
+#  define CV_CPU_CALL_MSA(fn, args)
+#  define CV_CPU_CALL_MSA_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_MSA(fn, args, mode, ...)  CV_CPU_CALL_MSA(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try MSA, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_VSX // forced: no runtime check
+#  define CV_TRY_VSX 1
+#  define CV_CPU_FORCE_VSX 1
+#  define CV_CPU_HAS_SUPPORT_VSX 1
+#  define CV_CPU_CALL_VSX(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_VSX_(fn, args) return (opt_VSX::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_VSX // dispatched: runtime check
+#  define CV_TRY_VSX 1
+#  define CV_CPU_FORCE_VSX 0
+#  define CV_CPU_HAS_SUPPORT_VSX (cv::checkHardwareSupport(CV_CPU_VSX))
+#  define CV_CPU_CALL_VSX(fn, args) if (CV_CPU_HAS_SUPPORT_VSX) return (opt_VSX::fn args)
+#  define CV_CPU_CALL_VSX_(fn, args) if (CV_CPU_HAS_SUPPORT_VSX) return (opt_VSX::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_VSX 0
+#  define CV_CPU_FORCE_VSX 0
+#  define CV_CPU_HAS_SUPPORT_VSX 0
+#  define CV_CPU_CALL_VSX(fn, args)
+#  define CV_CPU_CALL_VSX_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_VSX(fn, args, mode, ...)  CV_CPU_CALL_VSX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try VSX, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_VSX3 // forced: no runtime check
+#  define CV_TRY_VSX3 1
+#  define CV_CPU_FORCE_VSX3 1
+#  define CV_CPU_HAS_SUPPORT_VSX3 1
+#  define CV_CPU_CALL_VSX3(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_VSX3_(fn, args) return (opt_VSX3::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_VSX3 // dispatched: runtime check
+#  define CV_TRY_VSX3 1
+#  define CV_CPU_FORCE_VSX3 0
+#  define CV_CPU_HAS_SUPPORT_VSX3 (cv::checkHardwareSupport(CV_CPU_VSX3))
+#  define CV_CPU_CALL_VSX3(fn, args) if (CV_CPU_HAS_SUPPORT_VSX3) return (opt_VSX3::fn args)
+#  define CV_CPU_CALL_VSX3_(fn, args) if (CV_CPU_HAS_SUPPORT_VSX3) return (opt_VSX3::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_VSX3 0
+#  define CV_CPU_FORCE_VSX3 0
+#  define CV_CPU_HAS_SUPPORT_VSX3 0
+#  define CV_CPU_CALL_VSX3(fn, args)
+#  define CV_CPU_CALL_VSX3_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_VSX3(fn, args, mode, ...)  CV_CPU_CALL_VSX3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try VSX3, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_RVV // forced: no runtime check
+#  define CV_TRY_RVV 1
+#  define CV_CPU_FORCE_RVV 1
+#  define CV_CPU_HAS_SUPPORT_RVV 1
+#  define CV_CPU_CALL_RVV(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_RVV_(fn, args) return (opt_RVV::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_RVV // dispatched: runtime check
+#  define CV_TRY_RVV 1
+#  define CV_CPU_FORCE_RVV 0
+#  define CV_CPU_HAS_SUPPORT_RVV (cv::checkHardwareSupport(CV_CPU_RVV))
+#  define CV_CPU_CALL_RVV(fn, args) if (CV_CPU_HAS_SUPPORT_RVV) return (opt_RVV::fn args)
+#  define CV_CPU_CALL_RVV_(fn, args) if (CV_CPU_HAS_SUPPORT_RVV) return (opt_RVV::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_RVV 0
+#  define CV_CPU_FORCE_RVV 0
+#  define CV_CPU_HAS_SUPPORT_RVV 0
+#  define CV_CPU_CALL_RVV(fn, args)
+#  define CV_CPU_CALL_RVV_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...)  CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try RVV, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LSX // forced: no runtime check
+#  define CV_TRY_LSX 1
+#  define CV_CPU_FORCE_LSX 1
+#  define CV_CPU_HAS_SUPPORT_LSX 1
+#  define CV_CPU_CALL_LSX(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_LSX_(fn, args) return (opt_LSX::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_LSX // dispatched: runtime check
+#  define CV_TRY_LSX 1
+#  define CV_CPU_FORCE_LSX 0
+#  define CV_CPU_HAS_SUPPORT_LSX (cv::checkHardwareSupport(CV_CPU_LSX))
+#  define CV_CPU_CALL_LSX(fn, args) if (CV_CPU_HAS_SUPPORT_LSX) return (opt_LSX::fn args)
+#  define CV_CPU_CALL_LSX_(fn, args) if (CV_CPU_HAS_SUPPORT_LSX) return (opt_LSX::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_LSX 0
+#  define CV_CPU_FORCE_LSX 0
+#  define CV_CPU_HAS_SUPPORT_LSX 0
+#  define CV_CPU_CALL_LSX(fn, args)
+#  define CV_CPU_CALL_LSX_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_LSX(fn, args, mode, ...)  CV_CPU_CALL_LSX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try LSX, then the next mode's chain
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LASX // forced: no runtime check
+#  define CV_TRY_LASX 1
+#  define CV_CPU_FORCE_LASX 1
+#  define CV_CPU_HAS_SUPPORT_LASX 1
+#  define CV_CPU_CALL_LASX(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_LASX_(fn, args) return (opt_LASX::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_LASX // dispatched: runtime check
+#  define CV_TRY_LASX 1
+#  define CV_CPU_FORCE_LASX 0
+#  define CV_CPU_HAS_SUPPORT_LASX (cv::checkHardwareSupport(CV_CPU_LASX))
+#  define CV_CPU_CALL_LASX(fn, args) if (CV_CPU_HAS_SUPPORT_LASX) return (opt_LASX::fn args)
+#  define CV_CPU_CALL_LASX_(fn, args) if (CV_CPU_HAS_SUPPORT_LASX) return (opt_LASX::fn args)
+#else // disabled: call macros are empty
+#  define CV_TRY_LASX 0
+#  define CV_CPU_FORCE_LASX 0
+#  define CV_CPU_HAS_SUPPORT_LASX 0
+#  define CV_CPU_CALL_LASX(fn, args)
+#  define CV_CPU_CALL_LASX_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_LASX(fn, args, mode, ...)  CV_CPU_CALL_LASX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) // try LASX, then the next mode's chain
+
+#define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args) // unconditional call of cpu_baseline::fn
+#define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...)  CV_CPU_CALL_BASELINE(fn, args) /* last in sequence: mode and any further arguments are ignored */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cvdef.h b/3rdparty/opencv/opencv410/build/include/opencv2/core/cvdef.h
new file mode 100644
index 000000000000..748ecb9eceeb
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cvdef.h
@@ -0,0 +1,932 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_CVDEF_H
+#define OPENCV_CORE_CVDEF_H
+
+#include "opencv2/core/version.hpp"
+
+//! @addtogroup core_utils
+//! @{
+
+#ifdef OPENCV_INCLUDE_PORT_FILE  // User-provided header file with custom platform configuration
+#include OPENCV_INCLUDE_PORT_FILE
+#endif
+
+#if !defined CV_DOXYGEN && !defined CV_IGNORE_DEBUG_BUILD_GUARD
+#if (defined(_MSC_VER) && (defined(DEBUG) || defined(_DEBUG))) || \
+    (defined(_GLIBCXX_DEBUG) || defined(_GLIBCXX_DEBUG_PEDANTIC))
+// Guard against mixing binary-incompatible binaries / runtimes: wraps declarations in namespace debug_build_guard
+// https://github.com/opencv/opencv/pull/9161
+#define CV__DEBUG_NS_BEGIN namespace debug_build_guard {
+#define CV__DEBUG_NS_END }
+namespace cv { namespace debug_build_guard { } using namespace debug_build_guard; }
+#endif
+#endif
+
+#ifndef CV__DEBUG_NS_BEGIN // guard not active: both markers expand to nothing
+#define CV__DEBUG_NS_BEGIN
+#define CV__DEBUG_NS_END
+#endif
+
+
+#ifdef __OPENCV_BUILD
+#include "cvconfig.h"
+#endif
+
+#ifndef __CV_EXPAND
+#define __CV_EXPAND(x) x // identity: expands to its argument unchanged
+#endif
+
+#ifndef __CV_CAT
+#define __CV_CAT__(x, y) x ## y // raw token paste; x and y are not macro-expanded at this level
+#define __CV_CAT_(x, y) __CV_CAT__(x, y) // indirection so x and y are expanded before pasting
+#define __CV_CAT(x, y) __CV_CAT_(x, y) // public entry point: paste x and y after expansion
+#endif
+
+#define __CV_VA_NUM_ARGS_HELPER(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N // selects its 11th argument
+#define __CV_VA_NUM_ARGS(...) __CV_EXPAND(__CV_VA_NUM_ARGS_HELPER(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)) // number of arguments passed, valid for 1..10
+
+#ifdef CV_Func // CV_Func: expression naming the enclosing function, chosen per compiler
+// keep current value (through OpenCV port file)
+#elif defined __GNUC__ || (defined (__cplusplus) && (__cplusplus >= 201103)) // GNU-compatible compiler, or any C++11 compiler
+#define CV_Func __func__
+#elif defined __clang__ && (__clang_major__ * 100 + __clang_minor__ >= 305) // clang 3.5 or newer
+#define CV_Func __func__
+#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901) // C99 or newer
+#define CV_Func __func__
+#elif defined _MSC_VER
+#define CV_Func __FUNCTION__
+#elif defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 600)
+#define CV_Func __FUNCTION__
+#elif defined __IBMCPP__ && __IBMCPP__ >=500
+#define CV_Func __FUNCTION__
+#elif defined __BORLANDC__ && (__BORLANDC__ >= 0x550)
+#define CV_Func __FUNC__
+#else // unknown compiler: fall back to a fixed string
+#define CV_Func "<unknown>"
+#endif
+
+//! @cond IGNORED
+
+//////////////// static assert /////////////////
+#define CVAUX_CONCAT_EXP(a, b) a##b // raw token paste of a and b
+#define CVAUX_CONCAT(a, b) CVAUX_CONCAT_EXP(a,b) // pastes a and b after they have been macro-expanded (e.g. __LINE__)
+
+#if defined(__clang__) // clang: choose static_assert / _Static_assert by feature test
+#  ifndef __has_extension
+#    define __has_extension __has_feature /* compatibility, for older versions of clang */
+#  endif
+#  if __has_extension(cxx_static_assert)
+#    define CV_StaticAssert(condition, reason)    static_assert((condition), reason " " #condition)
+#  elif __has_extension(c_static_assert)
+#    define CV_StaticAssert(condition, reason)    _Static_assert((condition), reason " " #condition)
+#  endif
+#elif defined(__GNUC__) // GCC: static_assert only when C++0x / C++11 mode is active
+#  if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L)
+#    define CV_StaticAssert(condition, reason)    static_assert((condition), reason " " #condition)
+#  endif
+#elif defined(_MSC_VER)
+#  if _MSC_VER >= 1600 /* MSVC 10 */
+#    define CV_StaticAssert(condition, reason)    static_assert((condition), reason " " #condition)
+#  endif
+#endif
+#ifndef CV_StaticAssert // nothing selected above: emulate a static assertion
+#  if !defined(__clang__) && defined(__GNUC__) && (__GNUC__*100 + __GNUC_MINOR__ > 302) // GCC newer than 3.2: call a function declared with __attribute__((error))
+#    define CV_StaticAssert(condition, reason) ({ extern int __attribute__((error("CV_StaticAssert: " reason " " #condition))) CV_StaticAssert(); ((condition) ? 0 : CV_StaticAssert()); })
+#  else // otherwise: sizeof of a template that is only defined for <true>
+namespace cv {
+     template <bool x> struct CV_StaticAssert_failed;
+     template <> struct CV_StaticAssert_failed<true> { enum { val = 1 }; };
+     template<int x> struct CV_StaticAssert_test {};
+}
+#    define CV_StaticAssert(condition, reason)\
+       typedef cv::CV_StaticAssert_test< sizeof(cv::CV_StaticAssert_failed< static_cast<bool>(condition) >) > CVAUX_CONCAT(CV_StaticAssert_failed_at_, __LINE__)
+#  endif
+#endif
+
+// Suppress warning "-Wdeprecated-declarations" / C4996
+#if defined(_MSC_VER)
+    #define CV_DO_PRAGMA(x) __pragma(x) // MSVC: x is passed to __pragma as-is
+#elif defined(__GNUC__)
+    #define CV_DO_PRAGMA(x) _Pragma (#x) // GNU-compatible: x is stringized for _Pragma
+#else
+    #define CV_DO_PRAGMA(x) // other compilers: expands to nothing
+#endif
+
+#ifdef _MSC_VER // MSVC: push the warning state and disable C4996; END pops it
+#define CV_SUPPRESS_DEPRECATED_START \
+    CV_DO_PRAGMA(warning(push)) \
+    CV_DO_PRAGMA(warning(disable: 4996))
+#define CV_SUPPRESS_DEPRECATED_END CV_DO_PRAGMA(warning(pop))
+#elif defined (__clang__) || ((__GNUC__)  && (__GNUC__*100 + __GNUC_MINOR__ > 405)) // clang, or GCC newer than 4.5: push diagnostics and ignore -Wdeprecated-declarations
+#define CV_SUPPRESS_DEPRECATED_START \
+    CV_DO_PRAGMA(GCC diagnostic push) \
+    CV_DO_PRAGMA(GCC diagnostic ignored "-Wdeprecated-declarations")
+#define CV_SUPPRESS_DEPRECATED_END CV_DO_PRAGMA(GCC diagnostic pop)
+#else // other compilers: both markers expand to nothing
+#define CV_SUPPRESS_DEPRECATED_START
+#define CV_SUPPRESS_DEPRECATED_END
+#endif
+
+#define CV_UNUSED(name) (void)name // casts name to void; name is not parenthesized, so pass a plain identifier
+
+//! @endcond
+
+// Undefine problematic macros that system headers (windows.h in particular) sometimes define
+#undef small
+#undef min
+#undef max
+#undef abs
+#undef Complex
+
+#if defined __cplusplus
+#include <limits>
+#else
+#include <limits.h>
+#endif
+
+#include "opencv2/core/hal/interface.h"
+
+#if defined __ICL // CV_ICC takes the value of the first Intel compiler macro that is defined; otherwise it stays undefined
+#  define CV_ICC   __ICL
+#elif defined __ICC
+#  define CV_ICC   __ICC
+#elif defined __ECL
+#  define CV_ICC   __ECL
+#elif defined __ECC
+#  define CV_ICC   __ECC
+#elif defined __INTEL_COMPILER
+#  define CV_ICC   __INTEL_COMPILER
+#endif
+
+#if defined _WIN32 // Windows: map to the __cdecl / __stdcall calling-convention keywords
+#  define CV_CDECL __cdecl
+#  define CV_STDCALL __stdcall
+#else // elsewhere: both expand to nothing
+#  define CV_CDECL
+#  define CV_STDCALL
+#endif
+
+#ifndef CV_INLINE // a predefined CV_INLINE is kept as-is
+#  if defined __cplusplus
+#    define CV_INLINE static inline
+#  elif defined _MSC_VER // C compiled with MSVC
+#    define CV_INLINE __inline
+#  else // any other C compiler: plain static, no inline keyword
+#    define CV_INLINE static
+#  endif
+#endif
+
+#ifndef CV_ALWAYS_INLINE // a predefined CV_ALWAYS_INLINE is kept as-is
+#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) // GCC 3.1 or newer
+#define CV_ALWAYS_INLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#define CV_ALWAYS_INLINE __forceinline
+#else // no forced-inline spelling selected: plain inline
+#define CV_ALWAYS_INLINE inline
+#endif
+#endif
+
+#if defined CV_DISABLE_OPTIMIZATION || (defined CV_ICC && !defined CV_ENABLE_UNROLLED) // optimizations off, or Intel compiler without a predefined CV_ENABLE_UNROLLED
+#  define CV_ENABLE_UNROLLED 0
+#else // NOTE(review): this branch also re-defines a predefined CV_ENABLE_UNROLLED to 1
+#  define CV_ENABLE_UNROLLED 1
+#endif
+
+#ifdef __GNUC__ // CV_DECL_ALIGNED(x): alignment specifier of x, spelled per compiler
+#  define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
+#elif defined _MSC_VER
+#  define CV_DECL_ALIGNED(x) __declspec(align(x))
+#else // other compilers: expands to nothing, so no alignment is requested
+#  define CV_DECL_ALIGNED(x)
+#endif
+
+/* CPU features and intrinsics support: numeric feature IDs, mirrored by enum CpuFeatures below */
+#define CV_CPU_NONE             0
+#define CV_CPU_MMX              1
+#define CV_CPU_SSE              2
+#define CV_CPU_SSE2             3
+#define CV_CPU_SSE3             4
+#define CV_CPU_SSSE3            5
+#define CV_CPU_SSE4_1           6
+#define CV_CPU_SSE4_2           7
+#define CV_CPU_POPCNT           8
+#define CV_CPU_FP16             9
+#define CV_CPU_AVX              10
+#define CV_CPU_AVX2             11
+#define CV_CPU_FMA3             12
+
+#define CV_CPU_AVX_512F         13
+#define CV_CPU_AVX_512BW        14
+#define CV_CPU_AVX_512CD        15
+#define CV_CPU_AVX_512DQ        16
+#define CV_CPU_AVX_512ER        17
+#define CV_CPU_AVX_512IFMA512   18 // deprecated alias of CV_CPU_AVX_512IFMA (same value)
+#define CV_CPU_AVX_512IFMA      18
+#define CV_CPU_AVX_512PF        19
+#define CV_CPU_AVX_512VBMI      20
+#define CV_CPU_AVX_512VL        21
+#define CV_CPU_AVX_512VBMI2     22
+#define CV_CPU_AVX_512VNNI      23
+#define CV_CPU_AVX_512BITALG    24
+#define CV_CPU_AVX_512VPOPCNTDQ 25
+#define CV_CPU_AVX_5124VNNIW    26
+#define CV_CPU_AVX_5124FMAPS    27
+
+#define CV_CPU_NEON             100
+#define CV_CPU_NEON_DOTPROD     101
+#define CV_CPU_NEON_FP16        102
+#define CV_CPU_NEON_BF16        103
+
+#define CV_CPU_MSA              150
+
+#define CV_CPU_RISCVV           170
+
+#define CV_CPU_VSX              200
+#define CV_CPU_VSX3             201
+
+#define CV_CPU_RVV              210
+
+#define CV_CPU_LSX              230
+#define CV_CPU_LASX             231
+
+// CPU features groups
+#define CV_CPU_AVX512_SKX       256
+#define CV_CPU_AVX512_COMMON    257
+#define CV_CPU_AVX512_KNL       258
+#define CV_CPU_AVX512_KNM       259
+#define CV_CPU_AVX512_CNL       260
+#define CV_CPU_AVX512_CLX       261
+#define CV_CPU_AVX512_ICL       262
+
+// When adding to this list, remember to update the enum that follows
+#define CV_HARDWARE_MAX_FEATURE 512
+
+/** @brief Available CPU features (enumerator values equal the corresponding CV_CPU_* macros).
+*/
+enum CpuFeatures {
+    CPU_MMX             = 1,
+    CPU_SSE             = 2,
+    CPU_SSE2            = 3,
+    CPU_SSE3            = 4,
+    CPU_SSSE3           = 5,
+    CPU_SSE4_1          = 6,
+    CPU_SSE4_2          = 7,
+    CPU_POPCNT          = 8,
+    CPU_FP16            = 9,
+    CPU_AVX             = 10,
+    CPU_AVX2            = 11,
+    CPU_FMA3            = 12,
+
+    CPU_AVX_512F        = 13,
+    CPU_AVX_512BW       = 14,
+    CPU_AVX_512CD       = 15,
+    CPU_AVX_512DQ       = 16,
+    CPU_AVX_512ER       = 17,
+    CPU_AVX_512IFMA512  = 18, // deprecated alias of CPU_AVX_512IFMA (same value)
+    CPU_AVX_512IFMA     = 18,
+    CPU_AVX_512PF       = 19,
+    CPU_AVX_512VBMI     = 20,
+    CPU_AVX_512VL       = 21,
+    CPU_AVX_512VBMI2    = 22,
+    CPU_AVX_512VNNI     = 23,
+    CPU_AVX_512BITALG   = 24,
+    CPU_AVX_512VPOPCNTDQ= 25,
+    CPU_AVX_5124VNNIW   = 26,
+    CPU_AVX_5124FMAPS   = 27,
+
+    CPU_NEON            = 100,
+    CPU_NEON_DOTPROD    = 101,
+    CPU_NEON_FP16       = 102,
+    CPU_NEON_BF16       = 103,
+
+    CPU_MSA             = 150,
+
+    CPU_RISCVV          = 170,
+
+    CPU_VSX             = 200,
+    CPU_VSX3            = 201,
+
+    CPU_RVV             = 210,
+
+    CPU_LSX             = 230,
+    CPU_LASX            = 231,
+
+    CPU_AVX512_SKX      = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL
+    CPU_AVX512_COMMON   = 257, //!< Common instructions AVX-512F/CD for all CPUs that support AVX-512
+    CPU_AVX512_KNL      = 258, //!< Knights Landing with AVX-512F/CD/ER/PF
+    CPU_AVX512_KNM      = 259, //!< Knights Mill with AVX-512F/CD/ER/PF/4FMAPS/4VNNIW/VPOPCNTDQ
+    CPU_AVX512_CNL      = 260, //!< Cannon Lake with AVX-512F/CD/BW/DQ/VL/IFMA/VBMI
+    CPU_AVX512_CLX      = 261, //!< Cascade Lake with AVX-512F/CD/BW/DQ/VL/VNNI
+    CPU_AVX512_ICL      = 262, //!< Ice Lake with AVX-512F/CD/BW/DQ/VL/IFMA/VBMI/VNNI/VBMI2/BITALG/VPOPCNTDQ
+
+    CPU_MAX_FEATURE     = 512  // see CV_HARDWARE_MAX_FEATURE
+};
+
+
+#include "cv_cpu_dispatch.h"
+
+#if !defined(CV_STRONG_ALIGNMENT) && defined(__arm__) && !(defined(__aarch64__) || defined(_M_ARM64))
+// int*, int64* should be properly aligned pointers on ARMv7
+#define CV_STRONG_ALIGNMENT 1
+#endif
+#if !defined(CV_STRONG_ALIGNMENT)
+#define CV_STRONG_ALIGNMENT 0
+#endif
+
+/* fundamental constants */
+#define CV_PI   3.1415926535897932384626433832795
+#define CV_2PI  6.283185307179586476925286766559
+#define CV_LOG2 0.69314718055994530941723212145818
+
+#if defined __ARM_FP16_FORMAT_IEEE \
+    && !defined __CUDACC__
+#  define CV_FP16_TYPE 1
+#else
+#  define CV_FP16_TYPE 0
+#endif
+
+typedef union Cv16suf  // overlays a short, a ushort and (when CV_FP16_TYPE is set) an __fp16 in the same storage
+{
+    short i;   // signed integer view
+    ushort u;  // unsigned integer view
+#if CV_FP16_TYPE
+    __fp16 h;  // half-float view; compiled in only when CV_FP16_TYPE is non-zero
+#endif
+}
+Cv16suf;
+
+typedef union Cv32suf  // overlays an int, an unsigned and a float in the same storage
+{
+    int i;       // signed integer view
+    unsigned u;  // unsigned view; hfloat's float conversions do their bit arithmetic on this member
+    float f;     // floating-point view
+}
+Cv32suf;
+
+typedef union Cv64suf  // overlays an int64, a uint64 and a double in the same storage
+{
+    int64 i;   // signed integer view
+    uint64 u;  // unsigned integer view
+    double f;  // floating-point view
+}
+Cv64suf;
+
+#ifndef OPENCV_ABI_COMPATIBILITY
+#define OPENCV_ABI_COMPATIBILITY 400
+#endif
+
+#ifdef __OPENCV_BUILD
+#  define DISABLE_OPENCV_3_COMPATIBILITY
+#  define OPENCV_DISABLE_DEPRECATED_COMPATIBILITY
+#endif
+
+#ifndef CV_EXPORTS
+# if (defined _WIN32 || defined WINCE || defined __CYGWIN__) && defined(CVAPI_EXPORTS)
+#   define CV_EXPORTS __declspec(dllexport)
+# elif defined __GNUC__ && __GNUC__ >= 4 && (defined(CVAPI_EXPORTS) || defined(__APPLE__))
+#   define CV_EXPORTS __attribute__ ((visibility ("default")))
+# endif
+#endif
+
+#ifndef CV_EXPORTS
+# define CV_EXPORTS
+#endif
+
+#ifdef _MSC_VER
+#   define CV_EXPORTS_TEMPLATE
+#else
+#   define CV_EXPORTS_TEMPLATE CV_EXPORTS
+#endif
+
+#ifndef CV_DEPRECATED
+#  if defined(__GNUC__)
+#    define CV_DEPRECATED __attribute__ ((deprecated))
+#  elif defined(_MSC_VER)
+#    define CV_DEPRECATED __declspec(deprecated)
+#  else
+#    define CV_DEPRECATED
+#  endif
+#endif
+
+#ifndef CV_DEPRECATED_EXTERNAL
+#  if defined(__OPENCV_BUILD)
+#    define CV_DEPRECATED_EXTERNAL /* nothing */
+#  else
+#    define CV_DEPRECATED_EXTERNAL CV_DEPRECATED
+#  endif
+#endif
+
+
+#ifndef CV_EXTERN_C
+#  ifdef __cplusplus
+#    define CV_EXTERN_C extern "C"
+#  else
+#    define CV_EXTERN_C
+#  endif
+#endif
+
+/* special informative macros for wrapper generators */
+#define CV_EXPORTS_W CV_EXPORTS
+#define CV_EXPORTS_W_SIMPLE CV_EXPORTS
+#define CV_EXPORTS_AS(synonym) CV_EXPORTS
+#define CV_EXPORTS_W_MAP CV_EXPORTS
+#define CV_EXPORTS_W_PARAMS CV_EXPORTS
+#define CV_IN_OUT
+#define CV_OUT
+#define CV_PROP
+#define CV_PROP_RW
+#define CV_WRAP
+#define CV_WRAP_AS(synonym)
+#define CV_WRAP_MAPPABLE(mappable)
+#define CV_WRAP_PHANTOM(phantom_header)
+#define CV_WRAP_DEFAULT(val)
+/* Indicates that the function parameter has filesystem path semantic */
+#define CV_WRAP_FILE_PATH
+
+/****************************************************************************************\
+*                                  Matrix type (Mat)                                     *
+\****************************************************************************************/
+
+#define CV_MAX_DIM              32
+#define CV_MAT_CN_MASK          ((CV_CN_MAX - 1) << CV_CN_SHIFT)
+#define CV_MAT_CN(flags)        ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)
+#define CV_MAT_TYPE_MASK        (CV_DEPTH_MAX*CV_CN_MAX - 1)
+#define CV_MAT_TYPE(flags)      ((flags) & CV_MAT_TYPE_MASK)
+#define CV_MAT_CONT_FLAG_SHIFT  14
+#define CV_MAT_CONT_FLAG        (1 << CV_MAT_CONT_FLAG_SHIFT)
+#define CV_IS_MAT_CONT(flags)   ((flags) & CV_MAT_CONT_FLAG)
+#define CV_IS_CONT_MAT          CV_IS_MAT_CONT
+#define CV_SUBMAT_FLAG_SHIFT    15  /* bit index of the sub-matrix flag */
+#define CV_SUBMAT_FLAG          (1 << CV_SUBMAT_FLAG_SHIFT)  /* mask with only that bit set */
+#define CV_IS_SUBMAT(flags)     ((flags) & CV_SUBMAT_FLAG)  /* non-zero when the CV_SUBMAT_FLAG bit is set in flags */
+
+/** Size of each channel item,
+   0x28442211 = 0010 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
+#define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15)
+
+#define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type))
+
+#ifndef MIN
+#  define MIN(a,b)  ((a) > (b) ? (b) : (a))  /* smaller of a and b; each argument may be evaluated twice */
+#endif
+
+#ifndef MAX
+#  define MAX(a,b)  ((a) < (b) ? (b) : (a))  /* larger of a and b; each argument may be evaluated twice */
+#endif
+
+/** min & max without jumps: (cmp) - 1 is 0 or all ones, and that mask picks a or b through XOR (integer operands only) */
+#define CV_IMIN(a, b)  ((a) ^ (((a)^(b)) & (((a) < (b)) - 1)))
+#define CV_IMAX(a, b)  ((a) ^ (((a)^(b)) & (((a) > (b)) - 1)))
+#define CV_SWAP(a,b,t) ((t) = (a), (a) = (b), (b) = (t))  /* exchanges a and b using t as scratch storage */
+#define CV_CMP(a,b)    (((a) > (b)) - ((a) < (b)))  /* 1 if a > b, -1 if a < b, otherwise 0 */
+#define CV_SIGN(a)     CV_CMP((a),0)  /* CV_CMP against zero */
+
+///////////////////////////////////////// Enum operators ///////////////////////////////////////
+
+/**
+
+Provides compatibility operators for both classical and C++11 enum classes,
+as well as exposing the C++11 enum class members for backwards compatibility
+
+@code
+    // Provides operators required for flag enums
+    CV_ENUM_FLAGS(AccessFlag)
+
+    // Exposes the listed members of the enum class AccessFlag to the current namespace
+    CV_ENUM_CLASS_EXPOSE(AccessFlag, ACCESS_READ [, ACCESS_WRITE [, ...] ]);
+@endcode
+*/
+
+#define __CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST)                                              \
+static const EnumType MEMBER_CONST = EnumType::MEMBER_CONST;                                          \
+
+#define __CV_ENUM_CLASS_EXPOSE_2(EnumType, MEMBER_CONST, ...)                                         \
+__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST);                                                     \
+__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_1(EnumType, __VA_ARGS__));                                         \
+
+#define __CV_ENUM_CLASS_EXPOSE_3(EnumType, MEMBER_CONST, ...)                                         \
+__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST);                                                     \
+__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_2(EnumType, __VA_ARGS__));                                         \
+
+#define __CV_ENUM_CLASS_EXPOSE_4(EnumType, MEMBER_CONST, ...)                                         \
+__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST);                                                     \
+__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_3(EnumType, __VA_ARGS__));                                         \
+
+#define __CV_ENUM_CLASS_EXPOSE_5(EnumType, MEMBER_CONST, ...)                                         \
+__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST);                                                     \
+__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_4(EnumType, __VA_ARGS__));                                         \
+
+#define __CV_ENUM_CLASS_EXPOSE_6(EnumType, MEMBER_CONST, ...)                                         \
+__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST);                                                     \
+__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_5(EnumType, __VA_ARGS__));                                         \
+
+#define __CV_ENUM_CLASS_EXPOSE_7(EnumType, MEMBER_CONST, ...)                                         \
+__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST);                                                     \
+__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_6(EnumType, __VA_ARGS__));                                         \
+
+#define __CV_ENUM_CLASS_EXPOSE_8(EnumType, MEMBER_CONST, ...)                                         \
+__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST);                                                     \
+__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_7(EnumType, __VA_ARGS__));                                         \
+
+#define __CV_ENUM_CLASS_EXPOSE_9(EnumType, MEMBER_CONST, ...)                                         \
+__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST);                                                     \
+__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_8(EnumType, __VA_ARGS__));                                         \
+
+#define __CV_ENUM_FLAGS_LOGICAL_NOT(EnumType)                                                         \
+static inline bool operator!(const EnumType& val)                                                     \
+{                                                                                                     \
+    typedef std::underlying_type<EnumType>::type UnderlyingType;                                      \
+    return !static_cast<UnderlyingType>(val);                                                         \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_LOGICAL_NOT_EQ(Arg1Type, Arg2Type)                                            \
+static inline bool operator!=(const Arg1Type& a, const Arg2Type& b)                                   \
+{                                                                                                     \
+    return static_cast<int>(a) != static_cast<int>(b);                                                \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_LOGICAL_EQ(Arg1Type, Arg2Type)                                                \
+static inline bool operator==(const Arg1Type& a, const Arg2Type& b)                                   \
+{                                                                                                     \
+    return static_cast<int>(a) == static_cast<int>(b);                                                \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_BITWISE_NOT(EnumType)                                                         \
+static inline EnumType operator~(const EnumType& val)                                                 \
+{                                                                                                     \
+    typedef std::underlying_type<EnumType>::type UnderlyingType;                                      \
+    return static_cast<EnumType>(~static_cast<UnderlyingType>(val));                                  \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_BITWISE_OR(EnumType, Arg1Type, Arg2Type)                                      \
+static inline EnumType operator|(const Arg1Type& a, const Arg2Type& b)                                \
+{                                                                                                     \
+    typedef std::underlying_type<EnumType>::type UnderlyingType;                                      \
+    return static_cast<EnumType>(static_cast<UnderlyingType>(a) | static_cast<UnderlyingType>(b));    \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_BITWISE_AND(EnumType, Arg1Type, Arg2Type)                                     \
+static inline EnumType operator&(const Arg1Type& a, const Arg2Type& b)                                \
+{                                                                                                     \
+    typedef std::underlying_type<EnumType>::type UnderlyingType;                                      \
+    return static_cast<EnumType>(static_cast<UnderlyingType>(a) & static_cast<UnderlyingType>(b));    \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_BITWISE_XOR(EnumType, Arg1Type, Arg2Type)                                     \
+static inline EnumType operator^(const Arg1Type& a, const Arg2Type& b)                                \
+{                                                                                                     \
+    typedef std::underlying_type<EnumType>::type UnderlyingType;                                      \
+    return static_cast<EnumType>(static_cast<UnderlyingType>(a) ^ static_cast<UnderlyingType>(b));    \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_BITWISE_OR_EQ(EnumType, Arg1Type)                                             \
+static inline EnumType& operator|=(EnumType& _this, const Arg1Type& val)                              \
+{                                                                                                     \
+    _this = static_cast<EnumType>(static_cast<int>(_this) | static_cast<int>(val));                   \
+    return _this;                                                                                     \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_BITWISE_AND_EQ(EnumType, Arg1Type)                                            \
+static inline EnumType& operator&=(EnumType& _this, const Arg1Type& val)                              \
+{                                                                                                     \
+    _this = static_cast<EnumType>(static_cast<int>(_this) & static_cast<int>(val));                   \
+    return _this;                                                                                     \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_BITWISE_XOR_EQ(EnumType, Arg1Type)                                            \
+static inline EnumType& operator^=(EnumType& _this, const Arg1Type& val)                              \
+{                                                                                                     \
+    _this = static_cast<EnumType>(static_cast<int>(_this) ^ static_cast<int>(val));                   \
+    return _this;                                                                                     \
+}                                                                                                     \
+
+#define CV_ENUM_CLASS_EXPOSE(EnumType, ...)                                                           \
+__CV_EXPAND(__CV_CAT(__CV_ENUM_CLASS_EXPOSE_, __CV_VA_NUM_ARGS(__VA_ARGS__))(EnumType, __VA_ARGS__)); \
+
+#define CV_ENUM_FLAGS(EnumType)                                                                       \
+__CV_ENUM_FLAGS_LOGICAL_NOT      (EnumType)                                                           \
+__CV_ENUM_FLAGS_LOGICAL_EQ       (EnumType, int)                                                      \
+__CV_ENUM_FLAGS_LOGICAL_NOT_EQ   (EnumType, int)                                                      \
+                                                                                                      \
+__CV_ENUM_FLAGS_BITWISE_NOT      (EnumType)                                                           \
+__CV_ENUM_FLAGS_BITWISE_OR       (EnumType, EnumType, EnumType)                                       \
+__CV_ENUM_FLAGS_BITWISE_AND      (EnumType, EnumType, EnumType)                                       \
+__CV_ENUM_FLAGS_BITWISE_XOR      (EnumType, EnumType, EnumType)                                       \
+                                                                                                      \
+__CV_ENUM_FLAGS_BITWISE_OR_EQ    (EnumType, EnumType)                                                 \
+__CV_ENUM_FLAGS_BITWISE_AND_EQ   (EnumType, EnumType)                                                 \
+__CV_ENUM_FLAGS_BITWISE_XOR_EQ   (EnumType, EnumType)                                                 \
+
+/****************************************************************************************\
+*                                    static analysis                                     *
+\****************************************************************************************/
+
+// In practice, some macros are not processed correctly (noreturn is not detected).
+// We need to use simplified definition for them.
+#ifndef CV_STATIC_ANALYSIS
+# if defined(__KLOCWORK__) || defined(__clang_analyzer__) || defined(__COVERITY__)
+#   define CV_STATIC_ANALYSIS 1
+# endif
+#else
+# if defined(CV_STATIC_ANALYSIS) && !(__CV_CAT(1, CV_STATIC_ANALYSIS) == 1)  // defined and not empty
+#   if 0 == CV_STATIC_ANALYSIS
+#     undef CV_STATIC_ANALYSIS
+#   endif
+# endif
+#endif
+
+/****************************************************************************************\
+*                                    Thread sanitizer                                    *
+\****************************************************************************************/
+#ifndef CV_THREAD_SANITIZER
+# if defined(__has_feature)
+#   if __has_feature(thread_sanitizer)
+#     define CV_THREAD_SANITIZER
+#   endif
+# endif
+#endif
+
+/****************************************************************************************\
+*          exchange-add operation for atomic operations on reference counters            *
+\****************************************************************************************/
+
+#ifdef CV_XADD
+  // a user-defined CV_XADD wins; each variant below adds delta to *addr and evaluates to the value *addr held before
+#elif defined __GNUC__ || defined __clang__
+#  if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)  && !defined __INTEL_COMPILER
+#    ifdef __ATOMIC_ACQ_REL
+#      define CV_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
+#    else
+#      define CV_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
+#    endif
+#  else
+#    if defined __ATOMIC_ACQ_REL && !defined __clang__
+       // version for gcc >= 4.7
+#      define CV_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
+#    else
+#      define CV_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
+#    endif
+#  endif
+#elif defined _MSC_VER && !defined RC_INVOKED
+#  include <intrin.h>
+#  define CV_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)(addr), (delta))
+#else
+  #ifdef OPENCV_FORCE_UNSAFE_XADD
+    CV_INLINE int CV_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; }  // plain read-modify-write, no atomic primitive
+  #else
+    #error "OpenCV: can't define safe CV_XADD macro for current platform (unsupported). Define CV_XADD macro through custom port header (see OPENCV_INCLUDE_PORT_FILE)"
+  #endif
+#endif
+
+
+/****************************************************************************************\
+*                                  CV_NORETURN attribute                                 *
+\****************************************************************************************/
+
+#ifndef CV_NORETURN
+#  if defined(__GNUC__)
+#    define CV_NORETURN __attribute__((__noreturn__))
+#  elif defined(_MSC_VER) && (_MSC_VER >= 1300)
+#    define CV_NORETURN __declspec(noreturn)
+#  else
+#    define CV_NORETURN /* nothing by default */
+#  endif
+#endif
+
+/****************************************************************************************\
+*                       CV_NODISCARD_STD attribute (C++17)                               *
+* encourages the compiler to issue a warning if the return value is discarded            *
+\****************************************************************************************/
+#ifndef CV_NODISCARD_STD
+#  ifndef __has_cpp_attribute
+//   workaround preprocessor non-compliance https://reviews.llvm.org/D57851
+#    define __has_cpp_attribute(__x) 0
+#  endif
+#  if __has_cpp_attribute(nodiscard)
+#    define CV_NODISCARD_STD [[nodiscard]]
+#  elif __cplusplus >= 201703L
+//   available when compiler is C++17 compliant
+#    define CV_NODISCARD_STD [[nodiscard]]
+#  elif defined(__INTEL_COMPILER)
+     // see above, available when C++17 is enabled
+#  elif defined(_MSC_VER) && _MSC_VER >= 1911 && _MSVC_LANG >= 201703L
+//   available with VS2017 v15.3+ with /std:c++17 or higher; works on functions and classes
+#    define CV_NODISCARD_STD [[nodiscard]]
+#  elif defined(__GNUC__) && (((__GNUC__ * 100) + __GNUC_MINOR__) >= 700) && (__cplusplus >= 201103L)
+//   available with GCC 7.0+; works on functions, works or silently fails on classes
+#    define CV_NODISCARD_STD [[nodiscard]]
+#  elif defined(__GNUC__) && (((__GNUC__ * 100) + __GNUC_MINOR__) >= 408) && (__cplusplus >= 201103L)
+//   available with GCC 4.8+ but it usually does nothing and can fail noisily -- therefore not used
+//   define CV_NODISCARD_STD [[gnu::warn_unused_result]]
+#  endif
+#endif
+#ifndef CV_NODISCARD_STD
+#  define CV_NODISCARD_STD /* nothing by default */
+#endif
+
+
+/****************************************************************************************\
+*                                    C++ 11                                              *
+\****************************************************************************************/
+#ifdef __cplusplus
+// MSVC was stuck at __cplusplus == 199711L for a long time, even where it supports C++11,
+// so check _MSC_VER instead. See:
+// <https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus>
+#  if defined(_MSC_VER)
+#    if _MSC_VER < 1800
+#      error "OpenCV 4.x+ requires enabled C++11 support"
+#    endif
+#  elif __cplusplus < 201103L
+#    error "OpenCV 4.x+ requires enabled C++11 support"
+#  endif
+#endif
+
+#ifndef CV_CXX11
+#  define CV_CXX11 1
+#endif
+
+#ifndef CV_OVERRIDE
+#  define CV_OVERRIDE override
+#endif
+
+#ifndef CV_FINAL
+#  define CV_FINAL final
+#endif
+
+#ifndef CV_NOEXCEPT
+#  define CV_NOEXCEPT noexcept
+#endif
+
+#ifndef CV_CONSTEXPR
+#  define CV_CONSTEXPR constexpr
+#endif
+
+// Integer types portability
+#ifdef __cplusplus
+#include <cstdint>
+namespace cv {
+using std::int8_t;
+using std::uint8_t;
+using std::int16_t;
+using std::uint16_t;
+using std::int32_t;
+using std::uint32_t;
+using std::int64_t;
+using std::uint64_t;
+}
+#else // pure C
+#include <stdint.h>
+#endif
+
+#ifdef __cplusplus
+namespace cv
+{
+
+class hfloat  // 16-bit half-precision value with explicit construction from float and implicit conversion back
+{
+public:
+#if CV_FP16_TYPE
+    // Native storage: the value is kept in an __fp16 member and converted with plain casts.
+    hfloat() : h(0) {}
+    explicit hfloat(float x) { h = (__fp16)x; }
+    operator float() const { return (float)h; }
+protected:
+    __fp16 h;
+
+#else
+    hfloat() : w(0) {}
+    explicit hfloat(float x)  // encodes x into the 16-bit pattern stored in w
+    {
+    #if CV_FP16 && CV_AVX2
+        __m128 v = _mm_load_ss(&x);
+        w = (ushort)_mm_cvtsi128_si32(_mm_cvtps_ph(v, 0));
+    #else
+        Cv32suf in;
+        in.f = x;
+        unsigned sign = in.u & 0x80000000;
+        in.u ^= sign;  // clear the sign bit; it is merged back into w at the end
+
+        if( in.u >= 0x47800000 )  // 0x47800000 is 65536.0f: magnitude too large for half, infinity, or NaN
+            w = (ushort)(in.u > 0x7f800000 ? 0x7e00 : 0x7c00);  // above the infinity pattern means NaN -> 0x7e00, else 0x7c00
+        else
+        {
+            if (in.u < 0x38800000)  // 0x38800000 is 2^-14: below it the half result has a zero exponent field
+            {
+                in.f += 0.5f;  // aligns the float mantissa with the half subnormal step (2^-24)
+                w = (ushort)(in.u - 0x3f000000);  // 0x3f000000 is the bit pattern of 0.5f; what remains is the mantissa
+            }
+            else
+            {
+                unsigned t = in.u + 0xc8000fff;  // wraps mod 2^32: subtracts 0x38000000 (exponent rebias) and adds 0xfff
+                w = (ushort)((t + ((in.u >> 13) & 1)) >> 13);  // adding bit 13 makes exact ties round to even
+            }
+        }
+
+        w = (ushort)(w | (sign >> 16));
+    #endif
+    }
+
+    operator float() const  // decodes the 16-bit pattern in w back to float
+    {
+    #if CV_FP16 && CV_AVX2
+        float f;
+        _mm_store_ss(&f, _mm_cvtph_ps(_mm_cvtsi32_si128(w)));
+        return f;
+    #else
+        Cv32suf out;
+        // t: exponent and mantissa shifted into float position with the exponent rebias (0x38000000) added
+        unsigned t = ((w & 0x7fff) << 13) + 0x38000000;
+        unsigned sign = (w & 0x8000) << 16;
+        unsigned e = w & 0x7c00;
+        // out.u is pre-loaded for the e == 0 branch, which subtracts 2^-14 (6.103515625e-05f) from it as a float
+        out.u = t + (1 << 23);
+        out.u = (e >= 0x7c00 ? t + 0x38000000 :
+                 e == 0 ? (static_cast<void>(out.f -= 6.103515625e-05f), out.u) : t) | sign;
+        return out.f;
+    #endif
+    }
+
+protected:
+    ushort w;
+
+#endif
+};
+
+inline hfloat hfloatFromBits(ushort w) {  // builds an hfloat from the raw 16-bit pattern w
+#if CV_FP16_TYPE
+    Cv16suf u;
+    u.u = w;
+    hfloat res(float(u.h));
+    return res;
+#else
+    Cv32suf out;
+    // Expand the pattern to a float first; hfloat(float) then produces the stored value.
+    unsigned t = ((w & 0x7fff) << 13) + 0x38000000;
+    unsigned sign = (w & 0x8000) << 16;
+    unsigned e = w & 0x7c00;
+    // out.u is pre-loaded for the e == 0 branch, which subtracts 2^-14 (6.103515625e-05f) from it as a float
+    out.u = t + (1 << 23);
+    out.u = (e >= 0x7c00 ? t + 0x38000000 :
+            e == 0 ? (static_cast<void>(out.f -= 6.103515625e-05f), out.u) : t) | sign;
+    hfloat res(out.f);
+    return res;
+#endif
+}
+
+#if !defined(__OPENCV_BUILD) && !(defined __STDCPP_FLOAT16_T__) && !(defined __ARM_NEON)
+typedef hfloat float16_t;
+#endif
+
+}
+#endif
+
+//! @}
+
+#ifndef __cplusplus
+#include "opencv2/core/fast_math.hpp" // define cvRound(double)
+#endif
+
+#endif // OPENCV_CORE_CVDEF_H
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cvstd.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cvstd.hpp
new file mode 100644
index 000000000000..d216d267ef06
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cvstd.hpp
@@ -0,0 +1,189 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_CVSTD_HPP
+#define OPENCV_CORE_CVSTD_HPP
+
+#ifndef __cplusplus
+#  error cvstd.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core/cvdef.h"
+#include <cstddef>
+#include <cstring>
+#include <cctype>
+
+#include <string>
+
+// import useful primitives from stl
+#  include <algorithm>
+#  include <utility>
+#  include <cstdlib> //for abs(int)
+#  include <cmath>
+
+namespace cv
+{
+    static inline uchar abs(uchar a) { return a; }  // abs overloads for unsigned types return the argument unchanged
+    static inline ushort abs(ushort a) { return a; }
+    static inline unsigned abs(unsigned a) { return a; }
+    static inline uint64 abs(uint64 a) { return a; }
+    // Bring the std:: functions into cv:: so they can be called as cv::min, cv::abs, ... next to the overloads above.
+    using std::min;
+    using std::max;
+    using std::abs;
+    using std::swap;
+    using std::sqrt;
+    using std::exp;
+    using std::pow;
+    using std::log;
+}
+
+#include "cvstd_wrapper.hpp"
+
+namespace cv {
+
+//! @addtogroup core_utils
+//! @{
+
+//////////////////////////// memory management functions ////////////////////////////
+
+/** @brief Allocates an aligned memory buffer.
+
+The function allocates the buffer of the specified size and returns it. When the buffer size is 16
+bytes or more, the returned buffer is aligned to 16 bytes.
+@param bufSize Allocated buffer size.
+ */
+CV_EXPORTS void* fastMalloc(size_t bufSize);
+
+/** @brief Deallocates a memory buffer.
+
+The function deallocates the buffer allocated with fastMalloc . If NULL pointer is passed, the
+function does nothing. C version of the function clears the pointer *pptr* to avoid problems with
+double memory deallocation.
+@param ptr Pointer to the allocated buffer.
+ */
+CV_EXPORTS void fastFree(void* ptr);
+
+/*!
+  STL-style allocator for _Tp whose storage comes from cv::fastMalloc() and is released with cv::fastFree()
+*/
+template<typename _Tp> class Allocator
+{
+public:
+    typedef _Tp value_type;
+    typedef value_type* pointer;
+    typedef const value_type* const_pointer;
+    typedef value_type& reference;
+    typedef const value_type& const_reference;
+    typedef size_t size_type;
+    typedef ptrdiff_t difference_type;
+    template<typename U> struct rebind { typedef Allocator<U> other; };  // struct, so that 'other' is publicly accessible
+
+    explicit Allocator() {}
+    ~Allocator() {}
+    Allocator(Allocator const&) {}  // not explicit: allocators must be copy-initializable (returned / passed by value)
+    template<typename U>
+    explicit Allocator(Allocator<U> const&) {}
+
+    // address
+    pointer address(reference r) { return &r; }
+    const_pointer address(const_reference r) { return &r; }
+    // raw storage only: allocate() constructs nothing and deallocate() destroys nothing
+    pointer allocate(size_type count, const void* =0) { return reinterpret_cast<pointer>(fastMalloc(count * sizeof (_Tp))); }
+    void deallocate(pointer p, size_type) { fastFree(p); }
+    // element lifetime: copy-construct v in place at p / run the destructor of *p
+    void construct(pointer p, const _Tp& v) { new(static_cast<void*>(p)) _Tp(v); }
+    void destroy(pointer p) { p->~_Tp(); }
+    // largest element count whose size in bytes still fits in size_type (never less than 1)
+    size_type max_size() const { return cv::max(static_cast<size_type>(-1)/sizeof(_Tp), static_cast<size_type>(1)); }
+};
+
+//! @} core_utils
+
+
+//! @addtogroup core_basic
+//! @{
+
+//////////////////////////////// string class ////////////////////////////////
+
+class CV_EXPORTS FileNode; //for string constructor from FileNode
+
+typedef std::string String;
+
+#ifndef OPENCV_DISABLE_STRING_LOWER_UPPER_CONVERSIONS
+
+//! @cond IGNORED
+namespace details {
+// std::tolower is int->int and needs a value representable as unsigned char, so convert through unsigned char
+static inline char char_tolower(char ch)
+{
+    return static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
+}
+// std::toupper is int->int with the same requirement on its argument
+static inline char char_toupper(char ch)
+{
+    return static_cast<char>(std::toupper(static_cast<unsigned char>(ch)));
+}
+} // namespace details
+//! @endcond
+
+//! Returns a copy of str in which every character has been passed through details::char_tolower.
+static inline std::string toLowerCase(const std::string& str) {
+    std::string lowered(str);
+    for (char& c : lowered) c = details::char_tolower(c);
+    return lowered;
+}
+
+//! Returns a copy of str in which every character has been passed through details::char_toupper.
+static inline std::string toUpperCase(const std::string& str) {
+    std::string raised(str);
+    for (char& c : raised) c = details::char_toupper(c);
+    return raised;
+}
+
+#endif // OPENCV_DISABLE_STRING_LOWER_UPPER_CONVERSIONS
+
+//! @} core_basic
+} // cv
+
+#endif //OPENCV_CORE_CVSTD_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cvstd.inl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cvstd.inl.hpp
new file mode 100644
index 000000000000..37ad1e69067c
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cvstd.inl.hpp
@@ -0,0 +1,197 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_CVSTDINL_HPP
+#define OPENCV_CORE_CVSTDINL_HPP
+
+#include <complex>
+#include <ostream>
+#include <sstream>
+
+//! @cond IGNORED
+
+#ifdef _MSC_VER
+#pragma warning( push )
+#pragma warning( disable: 4127 )
+#endif
+
+namespace cv
+{
+
+template<typename _Tp> class DataType< std::complex<_Tp> >  // DataType traits for std::complex<_Tp>, derived from the traits of _Tp
+{
+public:
+    typedef std::complex<_Tp>  value_type;
+    typedef value_type         work_type;
+    typedef _Tp                channel_type;  // type of a single component
+
+    enum { generic_type = 0,
+           depth        = DataType<channel_type>::depth,  // depth is taken from the component type
+           channels     = 2,  // two components per value (presumably real and imaginary part)
+           fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8),  // component fmt plus (channels - 1) shifted left by 8
+           type         = CV_MAKETYPE(depth, channels) };
+
+    typedef Vec<channel_type, channels> vec_type;  // Vec with one element per channel
+};
+
+static inline
+std::ostream& operator << (std::ostream& out, Ptr<Formatted> fmtd)
+{
+    fmtd->reset();  // reset() is always called before the first next()
+    for (const char* chunk = fmtd->next(); chunk != 0; chunk = fmtd->next())  // a null pointer from next() ends the output
+        out << chunk;
+    return out;
+}
+
+static inline  // streams whatever Formatter::get()->format() produces for the matrix
+std::ostream& operator << (std::ostream& out, const Mat& mtx)
+{
+    return out << Formatter::get()->format(mtx);
+}
+
+static inline  // obtains a Mat through getMat(ACCESS_READ) and streams that
+std::ostream& operator << (std::ostream& out, const UMat& m)
+{
+    return out << m.getMat(ACCESS_READ);
+}
+
+template<typename _Tp> static inline  // prints a Complex as "(re,im)"
+std::ostream& operator << (std::ostream& out, const Complex<_Tp>& c)
+{
+    return out << "(" << c.re << "," << c.im << ")";
+}
+
+template<typename _Tp> static inline  // wraps the point vector in a Mat and streams its formatted form
+std::ostream& operator << (std::ostream& out, const std::vector<Point_<_Tp> >& vec)
+{
+    return out << Formatter::get()->format(Mat(vec));
+}
+
+
+template<typename _Tp> static inline  // wraps the 3D point vector in a Mat and streams its formatted form
+std::ostream& operator << (std::ostream& out, const std::vector<Point3_<_Tp> >& vec)
+{
+    return out << Formatter::get()->format(Mat(vec));
+}
+
+
+template<typename _Tp, int m, int n> static inline  // wraps the Matx in a Mat and streams its formatted form
+std::ostream& operator << (std::ostream& out, const Matx<_Tp, m, n>& matx)
+{
+    return out << Formatter::get()->format(Mat(matx));
+}
+
+template<typename _Tp> static inline
+std::ostream& operator << (std::ostream& out, const Point_<_Tp>& p)
+{
+    // Written as "[x, y]"; the stream is returned so that insertions can be chained.
+    return out << "[" << p.x << ", " << p.y << "]";
+}
+
+template<typename _Tp> static inline
+std::ostream& operator << (std::ostream& out, const Point3_<_Tp>& p)
+{
+    // Written as "[x, y, z]"; the stream is returned so that insertions can be chained.
+    return out << "[" << p.x << ", " << p.y << ", " << p.z << "]";
+}
+
+template<typename _Tp, int n> static inline
+std::ostream& operator << (std::ostream& out, const Vec<_Tp, n>& vec)
+{
+    // Element types whose depth is at most CV_32S are cast to int before
+    // being streamed; all other element types are streamed as they are.
+    const bool printAsInt = cv::traits::Depth<_Tp>::value <= CV_32S;
+
+    out << "[";
+    for (int idx = 0; idx < n; ++idx)
+    {
+        if (printAsInt)
+            out << (int)vec[idx];
+        else
+            out << vec[idx];
+
+        // elements are separated by ", "; the last one is followed by "]"
+        out << (idx < n - 1 ? ", " : "]");
+    }
+
+    return out;
+}
+
+template<typename _Tp> static inline  // prints a Size_ as "[width x height]"
+std::ostream& operator << (std::ostream& out, const Size_<_Tp>& size)
+{
+    return out << "[" << size.width << " x " << size.height << "]";
+}
+
+template<typename _Tp> static inline  // prints a Rect_ as "[width x height from (x, y)]"
+std::ostream& operator << (std::ostream& out, const Rect_<_Tp>& rect)
+{
+    return out << "[" << rect.width << " x " << rect.height << " from (" << rect.x << ", " << rect.y << ")]";
+}
+
+static inline std::ostream& operator << (std::ostream& out, const MatSize& msize)
+{
+    const int dimCount = msize.dims();
+    for( int d = 0; d < dimCount; d++ )
+    {
+        if( d > 0 )  // " x " goes between extents, never before the first or after the last
+            out << " x ";
+        out << msize[d];
+    }
+    return out;
+}
+
+static inline std::ostream &operator<< (std::ostream &s, const cv::Range &r)  // const reference: const ranges and temporaries can be printed too
+{
+    return s << "[" << r.start << " : " << r.end << ")";  // printed as "[start : end)"
+}
+
+} // cv
+
+#ifdef _MSC_VER
+#pragma warning( pop )
+#endif
+
+//! @endcond
+
+#endif // OPENCV_CORE_CVSTDINL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/cvstd_wrapper.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/cvstd_wrapper.hpp
new file mode 100644
index 000000000000..25e0041f28e3
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/cvstd_wrapper.hpp
@@ -0,0 +1,154 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_CVSTD_WRAPPER_HPP
+#define OPENCV_CORE_CVSTD_WRAPPER_HPP
+
+#include "opencv2/core/cvdef.h"
+
+#include <string>
+#include <memory>  // std::shared_ptr
+#include <type_traits>  // std::enable_if
+
+namespace cv {
+
+using std::nullptr_t;
+
+//! @addtogroup core_basic
+//! @{
+
+#ifdef CV_DOXYGEN
+
+template <typename _Tp> using Ptr = std::shared_ptr<_Tp>;  // In ideal world it should look like this, but we need some compatibility workarounds below
+
+template<typename _Tp, typename ... A1> static inline
+Ptr<_Tp> makePtr(const A1&... a1) { return std::make_shared<_Tp>(a1...); }
+
+#else  // cv::Ptr with compatibility workarounds
+
+// It should be defined for C-API types only.
+// C++ types should use regular "delete" operator.
+template<typename Y> struct DefaultDeleter;
+#if 0
+{
+    void operator()(Y* p) const;
+};
+#endif
+
+namespace sfinae {
+template<typename C, typename Ret, typename... Args>
+struct has_parenthesis_operator  // compile-time check for a member operator() of C that can be called with Args...
+{
+private:
+    template<typename T>  // viable only if T::operator()(Args...) is well-formed. NOTE(review): is_same<...>::type names a type for either outcome, so Ret does not appear to filter this overload - confirm
+    static CV_CONSTEXPR std::true_type has_parenthesis_operator_check(typename std::is_same<typename std::decay<decltype(std::declval<T>().operator()(std::declval<Args>()...))>::type, Ret>::type*);
+
+    template<typename> static CV_CONSTEXPR std::false_type has_parenthesis_operator_check(...);  // variadic fallback, picked when the overload above is not viable
+
+    typedef decltype(has_parenthesis_operator_check<C>(0)) type;  // std::true_type or std::false_type, depending on the overload chosen for C
+
+public:
+#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/)
+    static CV_CONSTEXPR bool value = type::value;
+#else
+    // support MSVS 2013: value is declared as a static const int instead of a CV_CONSTEXPR bool
+    static const int value = type::value;
+#endif
+};
+} // namespace sfinae
+
+template <typename T, typename = void>
+struct has_custom_delete  // primary template: derives from std::false_type
+        : public std::false_type {};
+
+// Force has_custom_delete to std::false_type when NVCC is compiling CUDA source files
+#ifndef __CUDACC__
+template <typename T>  // specialization used when has_parenthesis_operator reports an operator()(T*) on DefaultDeleter<T>
+struct has_custom_delete<T, typename std::enable_if< sfinae::has_parenthesis_operator<DefaultDeleter<T>, void, T*>::value >::type >
+        : public std::true_type {};
+#endif
+
+template<typename T>
+struct Ptr : public std::shared_ptr<T>  // std::shared_ptr<T> extended with DefaultDeleter support and the OpenCV 3.x helper methods below
+{
+#if 0
+    using std::shared_ptr<T>::shared_ptr;  // GCC 5.x can't handle this
+#else
+    inline Ptr() CV_NOEXCEPT : std::shared_ptr<T>() {}
+    inline Ptr(nullptr_t) CV_NOEXCEPT : std::shared_ptr<T>(nullptr) {}
+    template<typename Y, typename D> inline Ptr(Y* p, D d) : std::shared_ptr<T>(p, d) {}  // explicit deleter d: DefaultDeleter is not consulted
+    template<typename D> inline Ptr(nullptr_t, D d) : std::shared_ptr<T>(nullptr, d) {}
+
+    template<typename Y> inline Ptr(const Ptr<Y>& r, T* ptr) CV_NOEXCEPT : std::shared_ptr<T>(r, ptr) {}  // forwards to the two-argument (shared_ptr, pointer) constructor of std::shared_ptr
+
+    inline Ptr(const Ptr<T>& o) CV_NOEXCEPT : std::shared_ptr<T>(o) {}
+    inline Ptr(Ptr<T>&& o) CV_NOEXCEPT : std::shared_ptr<T>(std::move(o)) {}
+
+    template<typename Y> inline Ptr(const Ptr<Y>& o) CV_NOEXCEPT : std::shared_ptr<T>(o) {}
+    template<typename Y> inline Ptr(Ptr<Y>&& o) CV_NOEXCEPT : std::shared_ptr<T>(std::move(o)) {}
+#endif
+    inline Ptr(const std::shared_ptr<T>& o) CV_NOEXCEPT : std::shared_ptr<T>(o) {}
+    inline Ptr(std::shared_ptr<T>&& o) CV_NOEXCEPT : std::shared_ptr<T>(std::move(o)) {}
+
+    // Tag overload for types with a custom DefaultDeleter, e.g. Ptr<IplImage>(...): the pointer is owned together with DefaultDeleter<Y>
+    template<typename Y>
+    inline Ptr(const std::true_type&, Y* ptr) : std::shared_ptr<T>(ptr, DefaultDeleter<Y>()) {}
+
+    // Tag overload for types without a custom deleter, e.g. Ptr<std::string>(...): std::shared_ptr's default deletion is used
+    template<typename Y>
+    inline Ptr(const std::false_type&, Y* ptr) : std::shared_ptr<T>(ptr) {}
+
+    template<typename Y = T>  // raw-pointer constructor: has_custom_delete<Y> selects one of the two tag overloads above
+    inline Ptr(Y* ptr) : Ptr(has_custom_delete<Y>(), ptr) {}
+
+    // Tag overload for types with a custom DefaultDeleter, e.g. Ptr<IplImage>: resets to ptr with DefaultDeleter<Y> as deleter
+    template<typename Y>
+    inline void reset(const std::true_type&, Y* ptr) { std::shared_ptr<T>::reset(ptr, DefaultDeleter<Y>()); }
+
+    // Tag overload for types without a custom deleter, e.g. Ptr<std::string>: plain std::shared_ptr reset
+    template<typename Y>
+    inline void reset(const std::false_type&, Y* ptr) { std::shared_ptr<T>::reset(ptr); }
+
+    template<typename Y>  // has_custom_delete<Y> selects one of the two tag overloads above
+    inline void reset(Y* ptr) { Ptr<T>::reset(has_custom_delete<Y>(), ptr); }
+
+    template<class Y, class Deleter>
+    void reset(Y* ptr, Deleter d) { std::shared_ptr<T>::reset(ptr, d); }
+
+    void reset() CV_NOEXCEPT { std::shared_ptr<T>::reset(); }
+
+    Ptr& operator=(const Ptr& o) { std::shared_ptr<T>::operator =(o); return *this; }  // NOTE(review): no move assignment is declared, so assigning from an rvalue Ptr uses this copy overload
+    template<typename Y> inline Ptr& operator=(const Ptr<Y>& o) { std::shared_ptr<T>::operator =(o); return *this; }
+
+    T* operator->() const CV_NOEXCEPT { return std::shared_ptr<T>::get();}
+    typename std::add_lvalue_reference<T>::type operator*() const CV_NOEXCEPT { return *std::shared_ptr<T>::get(); }
+
+    // OpenCV 3.x methods (not a part of standard C++ library)
+    inline void release() { std::shared_ptr<T>::reset(); }  // same effect as reset() without arguments
+    inline operator T* () const { return std::shared_ptr<T>::get(); }  // implicit conversion to the stored raw pointer
+    inline bool empty() const { return std::shared_ptr<T>::get() == nullptr; }  // true when the stored pointer is null
+
+    template<typename Y> inline
+    Ptr<Y> staticCast() const CV_NOEXCEPT { return std::static_pointer_cast<Y>(*this); }
+
+    template<typename Y> inline
+    Ptr<Y> constCast() const CV_NOEXCEPT { return std::const_pointer_cast<Y>(*this); }
+
+    template<typename Y> inline
+    Ptr<Y> dynamicCast() const CV_NOEXCEPT { return std::dynamic_pointer_cast<Y>(*this); }
+};
+
+template<typename _Tp, typename ... A1> static inline
+Ptr<_Tp> makePtr(const A1&... a1)
+{
+    static_assert( !has_custom_delete<_Tp>::value, "Can't use this makePtr with custom DefaultDeleter");
+    return Ptr<_Tp>(std::make_shared<_Tp>(a1...));  // the object is created by std::make_shared, then wrapped in a Ptr
+}
+
+#endif // CV_DOXYGEN
+
+//! @} core_basic
+} // cv
+
+#endif //OPENCV_CORE_CVSTD_WRAPPER_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/detail/async_promise.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/detail/async_promise.hpp
new file mode 100644
index 000000000000..c039ec046a08
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/detail/async_promise.hpp
@@ -0,0 +1,69 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_ASYNC_PROMISE_HPP
+#define OPENCV_CORE_ASYNC_PROMISE_HPP
+
+#include "../async.hpp"
+
+#include "exception_ptr.hpp"
+
+namespace cv {
+
+/** @addtogroup core_async
+@{
+*/
+
+
+/** @brief Provides result of asynchronous operations
+
+*/
+class CV_EXPORTS AsyncPromise
+{
+public:
+    ~AsyncPromise() CV_NOEXCEPT;
+    AsyncPromise() CV_NOEXCEPT;
+    explicit AsyncPromise(const AsyncPromise& o) CV_NOEXCEPT;
+    AsyncPromise& operator=(const AsyncPromise& o) CV_NOEXCEPT;
+    void release() CV_NOEXCEPT;
+
+    /** Returns associated AsyncArray
+    @note Can be called once
+    */
+    AsyncArray getArrayResult();
+
+    /** Stores asynchronous result.
+    @param[in] value result
+    */
+    void setValue(InputArray value);
+
+    // TODO "move" setters
+
+#if CV__EXCEPTION_PTR
+    /** Stores exception.
+    @param[in] exception exception to be raised in AsyncArray
+    */
+    void setException(std::exception_ptr exception);
+#endif
+
+    /** Stores exception.
+    @param[in] exception exception to be raised in AsyncArray
+    */
+    void setException(const cv::Exception& exception);
+
+    explicit AsyncPromise(AsyncPromise&& o) CV_NOEXCEPT { p = o.p; o.p = NULL; }  // move construction: takes over o's implementation pointer and leaves o.p null
+    AsyncPromise& operator=(AsyncPromise&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; }  // move assignment: exchanges the implementation pointers
+
+
+    // PImpl: the implementation type is shared with AsyncArray
+    typedef struct AsyncArray::Impl Impl; friend struct AsyncArray::Impl;
+    inline void* _getImpl() const CV_NOEXCEPT { return p; }  // raw implementation pointer, returned as void*
+protected:
+    Impl* p;
+};
+
+
+//! @}
+} // namespace
+#endif // OPENCV_CORE_ASYNC_PROMISE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/detail/dispatch_helper.impl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/detail/dispatch_helper.impl.hpp
new file mode 100644
index 000000000000..d6ec67692200
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/detail/dispatch_helper.impl.hpp
@@ -0,0 +1,49 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_DETAIL_DISPATCH_HELPER_IMPL_HPP
+#define OPENCV_CORE_DETAIL_DISPATCH_HELPER_IMPL_HPP
+
+//! @cond IGNORED
+
+namespace cv {
+namespace detail {
+
+template<template<typename> class Functor, typename... Args>
+static inline void depthDispatch(const int depth, Args&&... args)
+{
+    switch (depth)  // pick the Functor instantiation that belongs to the depth code and call it with the forwarded arguments
+    {
+        case CV_8U:
+            Functor<uint8_t>{}(std::forward<Args>(args)...);
+            return;
+        case CV_8S:
+            Functor<int8_t>{}(std::forward<Args>(args)...);
+            return;
+        case CV_16U:
+            Functor<uint16_t>{}(std::forward<Args>(args)...);
+            return;
+        case CV_16S:
+            Functor<int16_t>{}(std::forward<Args>(args)...);
+            return;
+        case CV_32S:
+            Functor<int32_t>{}(std::forward<Args>(args)...);
+            return;
+        case CV_32F:
+            Functor<float>{}(std::forward<Args>(args)...);
+            return;
+        case CV_64F:
+            Functor<double>{}(std::forward<Args>(args)...);
+            return;
+        case CV_16F:  // no handler of its own: shares the error path with every other depth value
+        default:
+            CV_Error(cv::Error::BadDepth, "Unsupported matrix type.");
+    }
+}
+
+}}
+
+//! @endcond
+
+#endif //OPENCV_CORE_DETAIL_DISPATCH_HELPER_IMPL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/detail/exception_ptr.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/detail/exception_ptr.hpp
new file mode 100644
index 000000000000..a1a591e45582
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/detail/exception_ptr.hpp
@@ -0,0 +1,21 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_DETAILS_EXCEPTION_PTR_H
+#define OPENCV_CORE_DETAILS_EXCEPTION_PTR_H
+
+#ifndef CV__EXCEPTION_PTR  // skipped when the macro was already defined before this header is included
+#  if defined(__ANDROID__) && defined(ATOMIC_INT_LOCK_FREE) && ATOMIC_INT_LOCK_FREE < 2
+#    define CV__EXCEPTION_PTR 0  // Not supported, details: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58938
+#  else
+#    define CV__EXCEPTION_PTR 1
+#  endif
+#endif
+#ifndef CV__EXCEPTION_PTR  // NOTE(review): the block above always leaves the macro defined, so this branch looks unreachable
+#  define CV__EXCEPTION_PTR 0
+#elif CV__EXCEPTION_PTR  // <exception> is pulled in only when the macro is non-zero
+#  include <exception>  // std::exception_ptr
+#endif
+
+#endif // OPENCV_CORE_DETAILS_EXCEPTION_PTR_H
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/directx.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/directx.hpp
new file mode 100644
index 000000000000..056a85a1bc99
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/directx.hpp
@@ -0,0 +1,184 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_DIRECTX_HPP
+#define OPENCV_CORE_DIRECTX_HPP
+
+#include "mat.hpp"
+#include "ocl.hpp"
+
+#if !defined(__d3d11_h__)
+struct ID3D11Device;
+struct ID3D11Texture2D;
+#endif
+
+#if !defined(__d3d10_h__)
+struct ID3D10Device;
+struct ID3D10Texture2D;
+#endif
+
+#if !defined(_D3D9_H_)
+struct IDirect3DDevice9;
+struct IDirect3DDevice9Ex;
+struct IDirect3DSurface9;
+#endif
+
+
+namespace cv { namespace directx {
+
+namespace ocl {
+using namespace cv::ocl;
+
+//! @addtogroup core_directx
+// This section describes OpenCL and DirectX interoperability.
+//
+// To enable DirectX support, configure OpenCV using CMake with WITH_DIRECTX=ON . Note, DirectX is
+// supported only on Windows.
+//
+// To use OpenCL functionality you should first initialize OpenCL context from DirectX resource.
+//
+//! @{
+
+// TODO static functions in the Context class
+//! @brief Creates OpenCL context from D3D11 device
+//
+//! @param pD3D11Device - pointer to D3D11 device
+//! @return Returns reference to OpenCL Context
+CV_EXPORTS Context& initializeContextFromD3D11Device(ID3D11Device* pD3D11Device);
+
+//! @brief Creates OpenCL context from D3D10 device
+//
+//! @param pD3D10Device - pointer to D3D10 device
+//! @return Returns reference to OpenCL Context
+CV_EXPORTS Context& initializeContextFromD3D10Device(ID3D10Device* pD3D10Device);
+
+//! @brief Creates OpenCL context from Direct3DDevice9Ex device
+//
+//! @param pDirect3DDevice9Ex - pointer to Direct3DDevice9Ex device
+//! @return Returns reference to OpenCL Context
+CV_EXPORTS Context& initializeContextFromDirect3DDevice9Ex(IDirect3DDevice9Ex* pDirect3DDevice9Ex);
+
+//! @brief Creates OpenCL context from Direct3DDevice9 device
+//
+//! @param pDirect3DDevice9 - pointer to Direct3Device9 device
+//! @return Returns reference to OpenCL Context
+CV_EXPORTS Context& initializeContextFromDirect3DDevice9(IDirect3DDevice9* pDirect3DDevice9);
+
+//! @}
+
+} // namespace cv::directx::ocl
+
+//! @addtogroup core_directx
+//! @{
+
+//! @brief Converts InputArray to ID3D11Texture2D. If destination texture format is DXGI_FORMAT_NV12 then
+//!        input UMat expected to be in BGR format and data will be downsampled and color-converted to NV12.
+//
+//! @note Note: Destination texture must be allocated by application. Function does memory copy from src to
+//!             pD3D11Texture2D
+//
+//! @param src - source InputArray
+//! @param pD3D11Texture2D - destination D3D11 texture
+CV_EXPORTS void convertToD3D11Texture2D(InputArray src, ID3D11Texture2D* pD3D11Texture2D);
+
+//! @brief Converts ID3D11Texture2D to OutputArray. If input texture format is DXGI_FORMAT_NV12 then
+//!        data will be upsampled and color-converted to BGR format.
+//
+//! @note Note: Destination matrix will be re-allocated if it has not enough memory to match texture size.
+//!             function does memory copy from pD3D11Texture2D to dst
+//
+//! @param pD3D11Texture2D - source D3D11 texture
+//! @param dst             - destination OutputArray
+CV_EXPORTS void convertFromD3D11Texture2D(ID3D11Texture2D* pD3D11Texture2D, OutputArray dst);
+
+//! @brief Converts InputArray to ID3D10Texture2D
+//
+//! @note Note: function does memory copy from src to
+//!             pD3D10Texture2D
+//
+//! @param src             - source InputArray
+//! @param pD3D10Texture2D - destination D3D10 texture
+CV_EXPORTS void convertToD3D10Texture2D(InputArray src, ID3D10Texture2D* pD3D10Texture2D);
+
+//! @brief Converts ID3D10Texture2D to OutputArray
+//
+//! @note Note: function does memory copy from pD3D10Texture2D
+//!             to dst
+//
+//! @param pD3D10Texture2D - source D3D10 texture
+//! @param dst             - destination OutputArray
+CV_EXPORTS void convertFromD3D10Texture2D(ID3D10Texture2D* pD3D10Texture2D, OutputArray dst);
+
+//! @brief Converts InputArray to IDirect3DSurface9
+//
+//! @note Note: function does memory copy from src to
+//!             pDirect3DSurface9
+//
+//! @param src                 - source InputArray
+//! @param pDirect3DSurface9   - destination Direct3D 9 surface
+//! @param surfaceSharedHandle - shared handle
+CV_EXPORTS void convertToDirect3DSurface9(InputArray src, IDirect3DSurface9* pDirect3DSurface9, void* surfaceSharedHandle = NULL);
+
+//! @brief Converts IDirect3DSurface9 to OutputArray
+//
+//! @note Note: function does memory copy from pDirect3DSurface9
+//!             to dst
+//
+//! @param pDirect3DSurface9   - source Direct3D 9 surface
+//! @param dst                 - destination OutputArray
+//! @param surfaceSharedHandle - shared handle
+CV_EXPORTS void convertFromDirect3DSurface9(IDirect3DSurface9* pDirect3DSurface9, OutputArray dst, void* surfaceSharedHandle = NULL);
+
+//! @brief Get OpenCV type from DirectX type
+//! @param iDXGI_FORMAT - enum DXGI_FORMAT for D3D10/D3D11
+//! @return OpenCV type or -1 if there is no equivalent
+CV_EXPORTS int getTypeFromDXGI_FORMAT(const int iDXGI_FORMAT); // enum DXGI_FORMAT for D3D10/D3D11
+
+//! @brief Get OpenCV type from DirectX type
+//! @param iD3DFORMAT - enum D3DFORMAT for D3D9
+//! @return OpenCV type or -1 if there is no equivalent
+CV_EXPORTS int getTypeFromD3DFORMAT(const int iD3DFORMAT); // enum D3DFORMAT for D3D9
+
+//! @}
+
+} } // namespace cv::directx
+
+#endif // OPENCV_CORE_DIRECTX_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/dualquaternion.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/dualquaternion.hpp
new file mode 100644
index 000000000000..1f644e9dc83b
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/dualquaternion.hpp
@@ -0,0 +1,979 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2020, Huawei Technologies Co., Ltd. All rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: Liangqian Kong <kongliangqian@huawei.com>
+//         Longbu Wang <wanglongbu@huawei.com>
+#ifndef OPENCV_CORE_DUALQUATERNION_HPP
+#define OPENCV_CORE_DUALQUATERNION_HPP
+
+#include <opencv2/core/quaternion.hpp>
+#include <opencv2/core/affine.hpp>
+
+namespace cv{
+//! @addtogroup core
+//! @{
+
+template <typename _Tp> class DualQuat;
+template <typename _Tp> std::ostream& operator<<(std::ostream&, const DualQuat<_Tp>&);
+
+/**
+ * Dual quaternions were introduced to describe rotation together with translation while ordinary
+ * quaternions can only describe rotation. It can be used for shortest path pose interpolation,
+ * local pose optimization or volumetric deformation. More details can be found
+ * - https://en.wikipedia.org/wiki/Dual_quaternion
+ * - ["A beginners guide to dual-quaternions: what they are, how they work, and how to use them for 3D character hierarchies", Ben Kenwright, 2012](https://borodust.org/public/shared/beginner_dual_quats.pdf)
+ * - ["Dual Quaternions", Yan-Bin Jia, 2013](http://web.cs.iastate.edu/~cs577/handouts/dual-quaternion.pdf)
+ * - ["Geometric Skinning with Approximate Dual Quaternion Blending", Kavan, 2008](https://www.cs.utah.edu/~ladislav/kavan08geometric/kavan08geometric)
+ * - http://rodolphe-vaillant.fr/?e=29
+ *
+ * A unit dual quaternion can be classically represented as:
+ * \f[
+ * \begin{equation}
+ * \begin{split}
+ * \sigma &= \left(r+\frac{\epsilon}{2}tr\right)\\
+ * &= [w, x, y, z, w\_, x\_, y\_, z\_]
+ * \end{split}
+ * \end{equation}
+ * \f]
+ * where \f$r, t\f$ represents the rotation (ordinary unit quaternion) and translation (pure ordinary quaternion) respectively.
+ *
+ * A general dual quaternion, which consists of two quaternions, is usually represented in the form:
+ * \f[
+ * \sigma = p + \epsilon q
+ * \f]
+ * where the introduced dual unit \f$\epsilon\f$ satisfies \f$\epsilon^2 = \epsilon^3 =...=0\f$, and \f$p, q\f$ are quaternions.
+ *
+ * Alternatively, dual quaternions can also be interpreted as four components which are all [dual numbers](https://www.cs.utah.edu/~ladislav/kavan08geometric/kavan08geometric):
+ * \f[
+ * \sigma = \hat{q}_w + \hat{q}_xi + \hat{q}_yj + \hat{q}_zk
+ * \f]
+ * If we set \f$\hat{q}_x, \hat{q}_y\f$ and \f$\hat{q}_z\f$ equal to 0, a dual quaternion is transformed to a dual number. see normalize().
+ *
+ * If you want to create a dual quaternion, you can use:
+ *
+ * ```
+ * using namespace cv;
+ * double angle = CV_PI;
+ *
+ * // create from eight number
+ * DualQuatd dq1(1, 2, 3, 4, 5, 6, 7, 8); //p = [1,2,3,4]. q=[5,6,7,8]
+ *
+ * // create from Vec
+ * Vec<double, 8> v{1,2,3,4,5,6,7,8};
+ * DualQuatd dq_v{v};
+ *
+ * // create from two quaternion
+ * Quatd p(1, 2, 3, 4);
+ * Quatd q(5, 6, 7, 8);
+ * DualQuatd dq2 = DualQuatd::createFromQuat(p, q);
+ *
+ * // create from an angle, an axis and a translation
+ * Vec3d axis{0, 0, 1};
+ * Vec3d trans{3, 4, 5};
+ * DualQuatd dq3 = DualQuatd::createFromAngleAxisTrans(angle, axis, trans);
+ *
+ * // If you already have an instance of class Affine3, then you can use
+ * Affine3d R = dq3.toAffine3();
+ * DualQuatd dq4 = DualQuatd::createFromAffine3(R);
+ *
+ * // or create directly by affine transformation matrix Rt
+ * // see createFromMat() in detail for the form of Rt
+ * Matx44d Rt = dq3.toMat();
+ * DualQuatd dq5 = DualQuatd::createFromMat(Rt);
+ *
+ * // Any rotation + translation movement can
+ * // be expressed as a rotation + translation around the same line in space (expressed by Plucker
+ * // coords), and here's a way to represent it this way.
+ * Vec3d axis{1, 1, 1}; // axis will be normalized in createFromPitch
+ * Vec3d trans{3, 4 ,5};
+ * axis = axis / std::sqrt(axis.dot(axis));// The formula for computing moment that I use below requires a normalized axis
+ * Vec3d moment = 1.0 / 2 * (trans.cross(axis) + axis.cross(trans.cross(axis)) *
+ *                            std::cos(angle / 2) / std::sin(angle / 2));
+ * double d = trans.dot(axis);
+ * DualQuatd dq6 = DualQuatd::createFromPitch(angle, d, axis, moment);
+ * ```
+ *
+ * A point \f$v=(x, y, z)\f$ in form of dual quaternion is \f$[1+\epsilon v]=[1,0,0,0,0,x,y,z]\f$.
+ * The transformation of a point \f$v_1\f$ to another point \f$v_2\f$ under the dual quaternion \f$\sigma\f$ is
+ * \f[
+ * 1 + \epsilon v_2 = \sigma * (1 + \epsilon v_1) * \sigma^{\star}
+ * \f]
+ * where \f$\sigma^{\star}=p^*-\epsilon q^*.\f$
+ *
+ * A line in the \f$Pl\ddot{u}cker\f$ coordinates \f$(\hat{l}, m)\f$ defined by the dual quaternion \f$l=\hat{l}+\epsilon m\f$.
+ * To transform a line, \f[l_2 = \sigma * l_1 * \sigma^*,\f] where \f$\sigma=r+\frac{\epsilon}{2}rt\f$ and
+ * \f$\sigma^*=p^*+\epsilon q^*\f$.
+ *
+ * To extract the Vec<double, 8> or Vec<float, 8>, see toVec();
+ *
+ * To extract the affine transformation matrix, see toMat();
+ *
+ * To extract the instance of Affine3, see toAffine3();
+ *
+ * If two dual quaternions \f$q_0, q_1\f$ need to be interpolated, you can use sclerp()
+ * ```
+ * DualQuatd::sclerp(q0, q1, t)
+ * ```
+ * or dqblend().
+ * ```
+ * DualQuatd::dqblend(q0, q1, t)
+ * ```
+ * With more than two dual quaternions to be blended, you can use generalized linear dual quaternion blending
+ * with the corresponding weights, i.e. gdqblend().
+ *
+ */
+template <typename _Tp>
+class CV_EXPORTS DualQuat{
+    static_assert(std::is_floating_point<_Tp>::value, "Dual quaternion only make sense with type of float or double");
+    using value_type = _Tp;
+
+public:
+    static constexpr _Tp CV_DUAL_QUAT_EPS = (_Tp)1.e-6;
+
+    DualQuat();
+
+    /**
+     * @brief create from eight same type numbers.
+     */
+    DualQuat(const _Tp w, const _Tp x, const _Tp y, const _Tp z, const _Tp w_, const _Tp x_, const _Tp y_, const _Tp z_);
+
+    /**
+     * @brief create from a double or float vector.
+     */
+    DualQuat(const Vec<_Tp, 8> &q);
+
+    _Tp w, x, y, z, w_, x_, y_, z_;
+
+    /**
+     * @brief create Dual Quaternion from two same type quaternions p and q.
+     * A Dual Quaternion \f$\sigma\f$ has the form:
+     * \f[\sigma = p + \epsilon q\f]
+     * where p and q are defined as follows:
+     * \f[\begin{equation}
+     *    \begin{split}
+     *    p &= w + x\boldsymbol{i} + y\boldsymbol{j} + z\boldsymbol{k}\\
+     *    q &= w\_ + x\_\boldsymbol{i} + y\_\boldsymbol{j} + z\_\boldsymbol{k}.
+     *    \end{split}
+     *   \end{equation}
+     * \f]
+     * The p and q are the real part and dual part respectively.
+     * @param realPart a quaternion, real part of dual quaternion.
+     * @param dualPart a quaternion, dual part of dual quaternion.
+     * @sa Quat
+    */
+    static DualQuat<_Tp> createFromQuat(const Quat<_Tp> &realPart, const Quat<_Tp> &dualPart);
+
+    /**
+     * @brief create a dual quaternion from a rotation angle \f$\theta\f$, a rotation axis
+     * \f$\boldsymbol{u}\f$ and a translation \f$\boldsymbol{t}\f$.
+     * It generates a dual quaternion \f$\sigma\f$ in the form of
+     * \f[\begin{equation}
+     *    \begin{split}
+     *    \sigma &= r + \frac{\epsilon}{2}\boldsymbol{t}r \\
+     *           &= [\cos(\frac{\theta}{2}), \boldsymbol{u}\sin(\frac{\theta}{2})]
+     *           + \frac{\epsilon}{2}[0, \boldsymbol{t}][[\cos(\frac{\theta}{2}),
+     *           \boldsymbol{u}\sin(\frac{\theta}{2})]]\\
+     *           &= \cos(\frac{\theta}{2}) + \boldsymbol{u}\sin(\frac{\theta}{2})
+     *           + \frac{\epsilon}{2}(-(\boldsymbol{t} \cdot \boldsymbol{u})\sin(\frac{\theta}{2})
+     *           + \boldsymbol{t}\cos(\frac{\theta}{2}) + \boldsymbol{u} \times \boldsymbol{t} \sin(\frac{\theta}{2})).
+     *    \end{split}
+     *    \end{equation}\f]
+     * @param angle rotation angle.
+     * @param axis rotation axis.
+     * @param translation a vector of length 3.
+     * @note Axis will be normalized in this function. And translation is applied
+     * after the rotation. Use @ref createFromQuat(r, r * t / 2) to create a dual quaternion
+     * which translation is applied before rotation.
+     * @sa Quat
+     */
+    static DualQuat<_Tp> createFromAngleAxisTrans(const _Tp angle, const Vec<_Tp, 3> &axis, const Vec<_Tp, 3> &translation);
+
+    /**
+     * @brief Create a dual quaternion from an affine transformation matrix \f$M\f$.
+     * A dual quaternion consists of a rotation \f$r=[e_0,e_1,e_2,e_3]\f$ and a translation \f$t=[\Delta x,\Delta y,\Delta z]\f$. The
+     * affine transformation matrix \f$M\f$ has the form
+     * \f[
+     * \begin{bmatrix}
+     * 1-2(e_2^2 +e_3^2) &2(e_1e_2-e_0e_3) &2(e_0e_2+e_1e_3) &\Delta x\\
+     * 2(e_0e_3+e_1e_2)  &1-2(e_1^2+e_3^2) &2(e_2e_3-e_0e_1) &\Delta y\\
+     * 2(e_1e_3-e_0e_2)  &2(e_0e_1+e_2e_3) &1-2(e_1^2+e_2^2) &\Delta z\\
+     * 0&0&0&1
+     * \end{bmatrix}
+     * \f]
+     *  if A is a matrix consisting of  n points to be transformed, this could be achieved by
+     * \f[
+     *  new\_A = M * A
+     * \f]
+     * where A has the form
+     * \f[
+     * \begin{bmatrix}
+     * x_0& x_1& x_2&...&x_n\\
+     * y_0& y_1& y_2&...&y_n\\
+     * z_0& z_1& z_2&...&z_n\\
+     * 1&1&1&...&1
+     * \end{bmatrix}
+     * \f]
+     * where the same subscript represent the same point. The size of A should be \f$[4,n]\f$.
+     * and the same size for matrix new_A.
+     * @param _R 4x4 matrix that represents rotations and translation.
+     * @note Translation is applied after the rotation. Use createFromQuat(r, r * t / 2) to create
+     * a dual quaternion which translation is applied before rotation.
+     */
+    static DualQuat<_Tp> createFromMat(InputArray _R);
+
+    /**
+     * @brief create dual quaternion from an affine matrix. The definition of affine matrix can refer to  createFromMat()
+     */
+    static DualQuat<_Tp> createFromAffine3(const Affine3<_Tp> &R);
+
+    /**
+     * @brief A dual quaternion is a vector in form of
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * \sigma &=\boldsymbol{p} + \epsilon \boldsymbol{q}\\
+     * &= \cos\frac{\hat{\theta}}{2}+\overline{\hat{l}}\sin\frac{\hat{\theta}}{2}
+     * \end{split}
+     * \end{equation}
+     * \f]
+     * where \f$\hat{\theta}\f$ is dual angle and \f$\overline{\hat{l}}\f$ is dual axis:
+     * \f[
+     * \hat{\theta}=\theta + \epsilon d,\\
+     * \overline{\hat{l}}= \hat{l} +\epsilon m.
+     * \f]
+     * In this representation, \f$\theta\f$ is rotation angle and \f$(\hat{l},m)\f$ is the screw axis, d is the translation distance along the axis.
+     *
+     * @param angle rotation angle.
+     * @param d translation along the rotation axis.
+     * @param axis rotation axis represented by quaternion with w = 0.
+     * @param moment the moment of line, and it should be orthogonal to axis.
+     * @note Translation is applied after the rotation. Use createFromQuat(r, r * t / 2) to create
+     * a dual quaternion which translation is applied before rotation.
+     */
+    static DualQuat<_Tp> createFromPitch(const _Tp angle, const _Tp d, const Vec<_Tp, 3> &axis, const Vec<_Tp, 3> &moment);
+
+    /**
+     * @brief return a quaternion which represent the real part of dual quaternion.
+     * The definition of real part is in createFromQuat().
+     * @sa createFromQuat, getDualPart
+     */
+    Quat<_Tp> getRealPart() const;
+
+    /**
+     * @brief return a quaternion which represent the dual part of dual quaternion.
+     * The definition of dual part is in createFromQuat().
+     * @sa createFromQuat, getRealPart
+     */
+    Quat<_Tp> getDualPart() const;
+
+    /**
+     * @brief return the conjugate of a dual quaternion.
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * \sigma^* &= (p + \epsilon q)^*\\
+     *          &= (p^* + \epsilon q^*)
+     * \end{split}
+     * \end{equation}
+     * \f]
+     * @param dq a dual quaternion.
+     */
+    template <typename T>
+    friend DualQuat<T> conjugate(const DualQuat<T> &dq);
+
+    /**
+     * @brief return the conjugate of a dual quaternion.
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * \sigma^* &= (p + \epsilon q)^*\\
+     *          &= (p^* + \epsilon q^*)
+     * \end{split}
+     * \end{equation}
+     * \f]
+     */
+    DualQuat<_Tp> conjugate() const;
+
+    /**
+     * @brief return the rotation in quaternion form.
+     */
+    Quat<_Tp> getRotation(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the translation vector.
+     * The rotation \f$r\f$ in this dual quaternion \f$\sigma\f$ is applied before translation \f$t\f$.
+     * The dual quaternion \f$\sigma\f$ is defined as
+     * \f[\begin{equation}
+     * \begin{split}
+     * \sigma &= p + \epsilon q \\
+     *        &= r + \frac{\epsilon}{2}{t}r.
+     * \end{split}
+     * \end{equation}\f]
+     * Thus, the translation can be obtained as follows
+     * \f[t = 2qp^*.\f]
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion assume to be a unit dual quaternion
+     * and this function will save some computations.
+     * @note This dual quaternion's translation is applied after the rotation.
+     */
+    Vec<_Tp, 3> getTranslation(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the norm \f$||\sigma||\f$ of dual quaternion \f$\sigma = p + \epsilon q\f$.
+     * \f[
+     *  \begin{equation}
+     *  \begin{split}
+     *  ||\sigma|| &= \sqrt{\sigma * \sigma^*} \\
+     *        &= ||p|| + \epsilon \frac{p \cdot q}{||p||}.
+     *  \end{split}
+     *  \end{equation}
+     *  \f]
+     * Generally speaking, the norm of a not unit dual
+     * quaternion is a dual number. For convenience, we return it in the form of a dual quaternion
+     * , i.e.
+     * \f[ ||\sigma|| = [||p||, 0, 0, 0, \frac{p \cdot q}{||p||}, 0, 0, 0].\f]
+     *
+     * @note The data type of dual number is dual quaternion.
+     */
+    DualQuat<_Tp> norm() const;
+
+    /**
+     * @brief return a normalized dual quaternion.
+     * A dual quaternion can be expressed as
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * \sigma &= p + \epsilon q\\
+     * &=||\sigma||\left(r+\frac{\epsilon}{2}tr\right)
+     * \end{split}
+     * \end{equation}
+     * \f]
+     * where \f$r, t\f$ represents the rotation (ordinary quaternion) and translation (pure ordinary quaternion) respectively,
+     * and \f$||\sigma||\f$ is the norm of dual quaternion(a dual number).
+     * A dual quaternion is unit if and only if
+     * \f[
+     * ||p||=1, p \cdot q=0
+     * \f]
+     * where \f$\cdot\f$ means dot product.
+     * The process of normalization is
+     * \f[
+     * \sigma_{u}=\frac{\sigma}{||\sigma||}
+     * \f]
+     * Next, we simply prove that \f$\sigma_u\f$ is a unit dual quaternion:
+     * \f[
+     * \renewcommand{\Im}{\operatorname{Im}}
+     * \begin{equation}
+     * \begin{split}
+     * \sigma_{u}=\frac{\sigma}{||\sigma||}&=\frac{p + \epsilon q}{||p||+\epsilon\frac{p\cdot q}{||p||}}\\
+     * &=\frac{p}{||p||}+\epsilon\left(\frac{q}{||p||}-p\frac{p\cdot q}{||p||^3}\right)\\
+     * &=\frac{p}{||p||}+\epsilon\frac{1}{||p||^2}\left(qp^{*}-p\cdot q\right)\frac{p}{||p||}\\
+     * &=\frac{p}{||p||}+\epsilon\frac{1}{||p||^2}\Im(qp^*)\frac{p}{||p||}.\\
+     * \end{split}
+     * \end{equation}
+     * \f]
+     * As expected, the real part is a rotation and dual part is a pure quaternion.
+     */
+    DualQuat<_Tp> normalize() const;
+
+    /**
+     * @brief if \f$\sigma = p + \epsilon q\f$ is a dual quaternion, p is not zero,
+     * the inverse dual quaternion is
+     * \f[\sigma^{-1} = \frac{\sigma^*}{||\sigma||^2}, \f]
+     * or equivalently,
+     * \f[\sigma^{-1} = p^{-1} - \epsilon p^{-1}qp^{-1}.\f]
+     * @param dq a dual quaternion.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, dual quaternion dq assume to be a unit dual quaternion
+     * and this function will save some computations.
+     */
+    template <typename T>
+    friend DualQuat<T> inv(const DualQuat<T> &dq, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief if \f$\sigma = p + \epsilon q\f$ is a dual quaternion, p is not zero,
+     * the inverse dual quaternion is
+     * \f[\sigma^{-1} = \frac{\sigma^*}{||\sigma||^2}, \f]
+     * or equivalently,
+     * \f[\sigma^{-1} = p^{-1} - \epsilon p^{-1}qp^{-1}.\f]
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion assume to be a unit dual quaternion
+     * and this function will save some computations.
+     */
+    DualQuat<_Tp> inv(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the dot product of two dual quaternion.
+     * @param p other dual quaternion.
+     */
+    _Tp dot(DualQuat<_Tp> p) const;
+
+    /**
+     * @brief return the value of \f$p^t\f$ where p is a dual quaternion.
+     * This could be calculated as:
+     * \f[
+     * p^t = \exp(t\ln p)
+     * \f]
+     * @param dq a dual quaternion.
+     * @param t index of power function.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, dual quaternion dq assume to be a unit dual quaternion
+     * and this function will save some computations.
+     */
+    template <typename T>
+    friend DualQuat<T> power(const DualQuat<T> &dq, const T t, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief return the value of \f$p^t\f$ where p is a dual quaternion.
+     * This could be calculated as:
+     * \f[
+     * p^t = \exp(t\ln p)
+     * \f]
+     *
+     * @param t index of power function.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion assume to be a unit dual quaternion
+     * and this function will save some computations.
+     */
+    DualQuat<_Tp> power(const _Tp t, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the value of \f$p^q\f$ where p and q are dual quaternions.
+     * This could be calculated as:
+     * \f[
+     * p^q = \exp(q\ln p)
+     * \f]
+     * @param p a dual quaternion.
+     * @param q a dual quaternion.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, dual quaternion p assume to be a dual unit quaternion
+     * and this function will save some computations.
+     */
+    template <typename T>
+    friend DualQuat<T> power(const DualQuat<T>& p, const DualQuat<T>& q, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief return the value of \f$p^q\f$ where p and q are dual quaternions.
+     * This could be calculated as:
+     * \f[
+     * p^q = \exp(q\ln p)
+     * \f]
+     *
+     * @param q a dual quaternion
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion assume to be a dual unit quaternion
+     * and this function will save some computations.
+     */
+    DualQuat<_Tp> power(const DualQuat<_Tp>& q, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the value of the exponential function.
+     * @param dq a dual quaternion.
+     */
+    template <typename T>
+    friend DualQuat<T> exp(const DualQuat<T> &dq);
+
+    /**
+     * @brief return the value of the exponential function.
+     */
+    DualQuat<_Tp> exp() const;
+
+    /**
+     * @brief return the value of the logarithm function.
+     *
+     * @param dq a dual quaternion.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, dual quaternion dq assume to be a unit dual quaternion
+     * and this function will save some computations.
+     */
+    template <typename T>
+    friend DualQuat<T> log(const DualQuat<T> &dq, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief return the value of the logarithm function.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion assume to be a unit dual quaternion
+     * and this function will save some computations.
+     */
+    DualQuat<_Tp> log(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief Transform this dual quaternion to a vector.
+     */
+    Vec<_Tp, 8> toVec() const;
+
+    /**
+     * @brief Transform this dual quaternion to an affine transformation matrix
+     * the form of matrix, see createFromMat().
+     */
+    Matx<_Tp, 4, 4> toMat(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+      * @brief Transform this dual quaternion to an instance of Affine3.
+      */
+    Affine3<_Tp> toAffine3(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief The screw linear interpolation(ScLERP) is an extension of spherical linear interpolation of dual quaternion.
+     * If \f$\sigma_1\f$ and \f$\sigma_2\f$ are two dual quaternions representing the initial and final pose.
+     * The interpolation of ScLERP function can be defined as:
+     * \f[
+     * ScLERP(t;\sigma_1,\sigma_2) = \sigma_1 * (\sigma_1^{-1} * \sigma_2)^t, t\in[0,1]
+     * \f]
+     *
+     * @param q1 a dual quaternion representing the initial pose.
+     * @param q2 a dual quaternion representing the final pose.
+     * @param t interpolation parameter
+     * @param directChange if true, it always return the shortest path.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion assume to be a unit dual quaternion
+     * and this function will save some computations.
+     *
+     * For example
+     * ```
+     * double angle1 = CV_PI / 2;
+     * Vec3d axis{0, 0, 1};
+     * Vec3d t(0, 0, 3);
+     * DualQuatd initial = DualQuatd::createFromAngleAxisTrans(angle1, axis, t);
+     * double angle2 = CV_PI;
+     * DualQuatd final = DualQuatd::createFromAngleAxisTrans(angle2, axis, t);
+     * DualQuatd inter = DualQuatd::sclerp(initial, final, 0.5);
+     * ```
+     */
+    static DualQuat<_Tp> sclerp(const DualQuat<_Tp> &q1, const DualQuat<_Tp> &q2, const _Tp t,
+                                bool directChange=true, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+    /**
+     * @brief The method of Dual Quaternion linear Blending(DQB) is to compute a transformation between dual quaternion
+     * \f$q_1\f$ and \f$q_2\f$ and can be defined as:
+     * \f[
+     * DQB(t;{\boldsymbol{q}}_1,{\boldsymbol{q}}_2)=
+     * \frac{(1-t){\boldsymbol{q}}_1+t{\boldsymbol{q}}_2}{||(1-t){\boldsymbol{q}}_1+t{\boldsymbol{q}}_2||}.
+     * \f]
+     * where \f$q_1\f$ and \f$q_2\f$ are unit dual quaternions representing the input transformations.
+     * If you want to use DQB that works for more than two rigid transformations, see @ref gdqblend
+     *
+     * @param q1 a unit dual quaternion representing the input transformations.
+     * @param q2 a unit dual quaternion representing the input transformations.
+     * @param t parameter \f$t\in[0,1]\f$.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion assume to be a unit dual quaternion
+     * and this function will save some computations.
+     *
+     * @sa gdqblend
+     */
+    static DualQuat<_Tp> dqblend(const DualQuat<_Tp> &q1, const DualQuat<_Tp> &q2, const _Tp t,
+                                   QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+    /**
+     * @brief The generalized Dual Quaternion linear Blending works for more than two rigid transformations.
+     * If these transformations are expressed as unit dual quaternions \f$q_1,...,q_n\f$ with convex weights
+     * \f$w = (w_1,...,w_n)\f$, the generalized DQB is simply
+     * \f[
+     * gDQB(\boldsymbol{w};{\boldsymbol{q}}_1,...,{\boldsymbol{q}}_n)=\frac{w_1{\boldsymbol{q}}_1+...+w_n{\boldsymbol{q}}_n}
+     * {||w_1{\boldsymbol{q}}_1+...+w_n{\boldsymbol{q}}_n||}.
+     * \f]
+     * @param dualquat vector of dual quaternions
+     * @param weights vector of weights, the size of weights should be the same as dualquat, and the weights should
+     * satisfy \f$\sum_0^n w_{i} = 1\f$ and \f$w_i>0\f$.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, these dual quaternions assume to be unit quaternions
+     * and this function will save some computations.
+     * @note the type of weights' element should be the same as the data type of dual quaternion inside the dualquat.
+     */
+    template <int cn>
+    static DualQuat<_Tp> gdqblend(const Vec<DualQuat<_Tp>, cn> &dualquat, InputArray weights,
+                                QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+    /**
+     * @brief The generalized Dual Quaternion linear Blending works for more than two rigid transformations.
+     * If these transformations are expressed as unit dual quaternions \f$q_1,...,q_n\f$ with convex weights
+     * \f$w = (w_1,...,w_n)\f$, the generalized DQB is simply
+     * \f[
+     * gDQB(\boldsymbol{w};{\boldsymbol{q}}_1,...,{\boldsymbol{q}}_n)=\frac{w_1{\boldsymbol{q}}_1+...+w_n{\boldsymbol{q}}_n}
+     * {||w_1{\boldsymbol{q}}_1+...+w_n{\boldsymbol{q}}_n||}.
+     * \f]
+     * @param dualquat The dual quaternions which have 8 channels and 1 row or 1 col.
+     * @param weights vector of weights, the size of weights should be the same as dualquat, and the weights should
+     * satisfy \f$\sum_0^n w_{i} = 1\f$ and \f$w_i>0\f$.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, these dual quaternions assume to be unit quaternions
+     * and this function will save some computations.
+     * @note the type of weights' element should be the same as the data type of dual quaternion inside the dualquat.
+     */
+    static DualQuat<_Tp> gdqblend(InputArray dualquat, InputArray weights,
+                                QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+    /**
+     * @brief Return opposite dual quaternion \f$-p\f$
+     * which satisfies \f$p + (-p) = 0.\f$
+     *
+     * For example
+     * ```
+     * DualQuatd q{1, 2, 3, 4, 5, 6, 7, 8};
+     * std::cout << -q << std::endl; // [-1, -2, -3, -4, -5, -6, -7, -8]
+     * ```
+     */
+    DualQuat<_Tp> operator-() const;
+
+    /**
+     * @brief return true if two dual quaternions p and q are nearly equal, i.e. when the absolute
+     * value of each \f$p_i\f$ and \f$q_i\f$ is less than CV_DUAL_QUAT_EPS.
+     */
+    bool operator==(const DualQuat<_Tp>&) const;
+
+    /**
+     * @brief Subtraction operator of two dual quaternions p and q.
+     * It returns a new dual quaternion that each value is the sum of \f$p_i\f$ and \f$-q_i\f$.
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12};
+     * std::cout << p - q << std::endl; //[-4, -4, -4, -4, -4, -4, -4, -4]
+     * ```
+     */
+    DualQuat<_Tp> operator-(const DualQuat<_Tp>&) const;
+
+    /**
+     * @brief Subtraction assignment operator of two dual quaternions p and q.
+     * It subtracts right operand from the left operand and assign the result to left operand.
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12};
+     * p -= q; // equivalent to p = p - q
+     * std::cout << p << std::endl; //[-4, -4, -4, -4, -4, -4, -4, -4]
+     *
+     * ```
+     */
+    DualQuat<_Tp>& operator-=(const DualQuat<_Tp>&);
+
+    /**
+     * @brief Addition operator of two dual quaternions p and q.
+     * It returns a new dual quaternion that each value is the sum of \f$p_i\f$ and \f$q_i\f$.
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12};
+     * std::cout << p + q << std::endl; //[6, 8, 10, 12, 14, 16, 18, 20]
+     * ```
+     */
+    DualQuat<_Tp> operator+(const DualQuat<_Tp>&) const;
+
+    /**
+     * @brief Addition assignment operator of two dual quaternions p and q.
+     * It adds right operand to the left operand and assign the result to left operand.
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12};
+     * p += q; // equivalent to p = p + q
+     * std::cout << p << std::endl; //[6, 8, 10, 12, 14, 16, 18, 20]
+     *
+     * ```
+     */
+    DualQuat<_Tp>& operator+=(const DualQuat<_Tp>&);
+
+    /**
+     * @brief Multiplication assignment operator of two dual quaternions.
+     * It multiplies right operand with the left operand and assign the result to left operand.
+     *
+     * Rule of dual quaternion multiplication:
+     * The dual quaternion can be written as an ordered pair of quaternions [A, B]. Thus
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * q &= [A, B][C, D]\\
+     * &=[AC, AD + BC]
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12};
+     * p *= q;
+     * std::cout << p << std::endl; //[-60, 12, 30, 24, -216, 80, 124, 120]
+     * ```
+     */
+    DualQuat<_Tp>& operator*=(const DualQuat<_Tp>&);
+
+    /**
+     * @brief Multiplication assignment operator of a dual quaternion and a scalar.
+     * It multiplies right operand with the left operand and assign the result to left operand.
+     *
+     * Rule of dual quaternion multiplication with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * s &= [w, x, y, z, w\_, x\_, y\_, z\_] * s\\
+     *  &=[w   s, x   s, y   s, z   s, w\_  \space  s, x\_  \space  s, y\_ \space  s, z\_ \space  s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double s = 2.0;
+     * p *= s;
+     * std::cout << p << std::endl; //[2, 4, 6, 8, 10, 12, 14, 16]
+     * ```
+     * @note the type of scalar should be equal to the dual quaternion.
+     */
+    DualQuat<_Tp> operator*=(const _Tp s);
+
+
+    /**
+     * @brief Multiplication operator of two dual quaternions q and p.
+     * Multiplies values on either side of the operator.
+     *
+     * Rule of dual quaternion multiplication:
+     * The dual quaternion can be written as an ordered pair of quaternions [A, B]. Thus
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * q &= [A, B][C, D]\\
+     * &=[AC, AD + BC]
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12};
+     * std::cout << p * q << std::endl; //[-60, 12, 30, 24, -216, 80, 124, 120]
+     * ```
+     */
+    DualQuat<_Tp> operator*(const DualQuat<_Tp>&) const;
+
+    /**
+     * @brief Division operator of a dual quaternion and a scalar.
+     * It divides the left operand by the right operand and returns the result; the left operand is not modified.
+     *
+     * Rule of dual quaternion division with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / s &= [w, x, y, z, w\_, x\_, y\_, z\_] / s\\
+     * &=[w/s, x/s, y/s, z/s, w\_/s, x\_/s, y\_/s, z\_/s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double s = 2.0;
+     * p /= s; // equivalent to p = p / s
+     * std::cout << p << std::endl; //[0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4]
+     * ```
+     * @note the type of scalar should be equal to this dual quaternion.
+     */
+    DualQuat<_Tp> operator/(const _Tp s) const;
+
+    /**
+     * @brief Division operator of two dual quaternions p and q.
+     * Divides left hand operand by right hand operand.
+     *
+     * Rule of dual quaternion division with a dual quaternion:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / q &= p * q.inv()\\
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12};
+     * std::cout << p / q << std::endl; // equivalent to p * q.inv()
+     * ```
+     */
+    DualQuat<_Tp> operator/(const DualQuat<_Tp>&) const;
+
+    /**
+     * @brief Division assignment operator of two dual quaternions p and q.
+     * It divides left operand with the right operand and assign the result to left operand.
+     *
+     * Rule of dual quaternion division with a dual quaternion:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / q&= p * q.inv()\\
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12};
+     * p /= q; // equivalent to p = p * q.inv()
+     * std::cout << p << std::endl;
+     * ```
+     */
+    DualQuat<_Tp>& operator/=(const DualQuat<_Tp>&);
+
+    /**
+     * @brief Division assignment operator of a dual quaternions and a scalar.
+     * It divides left operand with the right operand and assign the result to left operand.
+     *
+     * Rule of dual quaternion division with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / s &= [w, x, y, z, w\_, x\_, y\_ ,z\_] / s\\
+     * &=[w / s, x / s, y / s, z / s, w\_ / \space s, x\_ / \space s, y\_ / \space s, z\_ / \space s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double s = 2.0;
+     * p /= s; // equivalent to p = p / s
+     * std::cout << p << std::endl; //[0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0]
+     * ```
+     * @note the type of scalar should be equal to the dual quaternion.
+     */
+    Quat<_Tp>& operator/=(const _Tp s);
+
+    /**
+     * @brief Addition operator of a scalar and a dual quaternions.
+     * Adds the right hand operand to the left hand operand.
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double scalar = 2.0;
+     * std::cout << scalar + p << std::endl; //[3.0, 2, 3, 4, 5, 6, 7, 8]
+     * ```
+     * @note the type of scalar should be equal to the dual quaternion.
+     */
+    template <typename T>
+    friend DualQuat<T> cv::operator+(const T s, const DualQuat<T>&);
+
+    /**
+     * @brief Addition operator of a dual quaternions and a scalar.
+     * Adds the right hand operand to the left hand operand.
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double scalar = 2.0;
+     * std::cout << p + scalar << std::endl; //[3.0, 2, 3, 4, 5, 6, 7, 8]
+     * ```
+     * @note the type of scalar should be equal to the dual quaternion.
+     */
+    template <typename T>
+    friend DualQuat<T> cv::operator+(const DualQuat<T>&, const T s);
+
+    /**
+     * @brief Multiplication operator of a scalar and a dual quaternions.
+     * It multiplies the left operand with the right operand and returns the result; neither operand is modified.
+     *
+     * Rule of dual quaternion multiplication with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * s &= [w, x, y, z, w\_, x\_, y\_, z\_] * s\\
+     * &=[w s, x s, y s, z s, w\_ \space s, x\_ \space s, y\_ \space s, z\_ \space s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double s = 2.0;
+     * std::cout << s * p << std::endl; //[2, 4, 6, 8, 10, 12, 14, 16]
+     * ```
+     * @note the type of scalar should be equal to the dual quaternion.
+     */
+    template <typename T>
+    friend DualQuat<T> cv::operator*(const T s, const DualQuat<T>&);
+
+    /**
+     * @brief Subtraction operator of a dual quaternion and a scalar.
+     * Subtracts right hand operand from left hand operand.
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double scalar = 2.0;
+     * std::cout << p - scalar << std::endl; //[-1, 2, 3, 4, 5, 6, 7, 8]
+     * ```
+     * @note the type of scalar should be equal to the dual quaternion.
+     */
+    template <typename T>
+    friend DualQuat<T> cv::operator-(const DualQuat<T>&, const T s);
+
+    /**
+     * @brief Subtraction operator of a scalar and a dual quaternions.
+     * Subtracts right hand operand from left hand operand.
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double scalar = 2.0;
+     * std::cout << scalar - p << std::endl; //[1.0, -2, -3, -4, -5, -6, -7, -8]
+     * ```
+     * @note the type of scalar should be equal to the dual quaternion.
+     */
+    template <typename T>
+    friend DualQuat<T> cv::operator-(const T s, const DualQuat<T>&);
+
+    /**
+     * @brief Multiplication operator of a dual quaternions and a scalar.
+     * It multiplies the left operand with the right operand and returns the result; neither operand is modified.
+     *
+     * Rule of dual quaternion multiplication with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * s &= [w, x, y, z, w\_, x\_, y\_, z\_] * s\\
+     * &=[w s, x s, y s, z s, w\_ \space s, x\_ \space s, y\_ \space s, z\_ \space s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double s = 2.0;
+     * std::cout << p * s << std::endl; //[2, 4, 6, 8, 10, 12, 14, 16]
+     * ```
+     * @note the type of scalar should be equal to the dual quaternion.
+     */
+    template <typename T>
+    friend DualQuat<T> cv::operator*(const DualQuat<T>&, const T s);
+
+    template <typename S>
+    friend std::ostream& cv::operator<<(std::ostream&, const DualQuat<S>&);
+
+};
+
+using DualQuatd = DualQuat<double>;
+using DualQuatf = DualQuat<float>;
+
+//! @} core
+}//namespace
+
+#include "dualquaternion.inl.hpp"
+
+#endif /* OPENCV_CORE_DUALQUATERNION_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/dualquaternion.inl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/dualquaternion.inl.hpp
new file mode 100644
index 000000000000..1a68f12d305b
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/dualquaternion.inl.hpp
@@ -0,0 +1,487 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2020, Huawei Technologies Co., Ltd. All rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: Liangqian Kong <kongliangqian@huawei.com>
+//         Longbu Wang <wanglongbu@huawei.com>
+
+#ifndef OPENCV_CORE_DUALQUATERNION_INL_HPP
+#define OPENCV_CORE_DUALQUATERNION_INL_HPP
+
+#ifndef OPENCV_CORE_DUALQUATERNION_HPP
+#error This is not a standalone header. Include dualquaternion.hpp instead.
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////
+//Implementation
+namespace cv {
+
+template <typename T>
+DualQuat<T>::DualQuat():w(0), x(0), y(0), z(0), w_(0), x_(0), y_(0), z_(0){}
+
+template <typename T>
+DualQuat<T>::DualQuat(const T vw, const T vx, const T vy, const T vz, const T _w, const T _x, const T _y, const T _z):
+                      w(vw), x(vx), y(vy), z(vz), w_(_w), x_(_x), y_(_y), z_(_z){}
+
+template <typename T>
+DualQuat<T>::DualQuat(const Vec<T, 8> &q):w(q[0]), x(q[1]), y(q[2]), z(q[3]),
+                                          w_(q[4]), x_(q[5]), y_(q[6]), z_(q[7]){}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::createFromQuat(const Quat<T> &realPart, const Quat<T> &dualPart)
+{
+    T w = realPart.w;
+    T x = realPart.x;
+    T y = realPart.y;
+    T z = realPart.z;
+    T w_ = dualPart.w;
+    T x_ = dualPart.x;
+    T y_ = dualPart.y;
+    T z_ = dualPart.z;
+    return DualQuat<T>(w, x, y, z, w_, x_, y_, z_);
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::createFromAngleAxisTrans(const T angle, const Vec<T, 3> &axis, const Vec<T, 3> &trans)
+{
+    Quat<T> r = Quat<T>::createFromAngleAxis(angle, axis);
+    Quat<T> t{0, trans[0], trans[1], trans[2]};
+    return createFromQuat(r, t * r * T(0.5));
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::createFromMat(InputArray _R)
+{
+    CV_CheckTypeEQ(_R.type(), cv::traits::Type<T>::value, "");
+    if (_R.size() != Size(4, 4))
+    {
+        CV_Error(Error::StsBadArg, "The input matrix must have 4 columns and 4 rows");
+    }
+    Mat R = _R.getMat();
+    Quat<T> r = Quat<T>::createFromRotMat(R.colRange(0, 3).rowRange(0, 3));
+    Quat<T> trans(0, R.at<T>(0, 3), R.at<T>(1, 3), R.at<T>(2, 3));
+    return createFromQuat(r, trans * r * T(0.5));
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::createFromAffine3(const Affine3<T> &R)
+{
+    return createFromMat(R.matrix);
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::createFromPitch(const T angle, const T d, const Vec<T, 3> &axis, const Vec<T, 3> &moment)
+{
+    T half_angle = angle * T(0.5), half_d = d * T(0.5);
+    Quat<T> qaxis = Quat<T>(0, axis[0], axis[1], axis[2]).normalize();
+    Quat<T> qmoment = Quat<T>(0, moment[0], moment[1], moment[2]);
+    qmoment -= qaxis * axis.dot(moment);
+    Quat<T> dual = -half_d * std::sin(half_angle) + std::sin(half_angle) * qmoment +
+        half_d * std::cos(half_angle) * qaxis;
+    return createFromQuat(Quat<T>::createFromAngleAxis(angle, axis), dual);
+}
+
+template <typename T>
+inline bool DualQuat<T>::operator==(const DualQuat<T> &q) const
+{
+    return (abs(w - q.w) < CV_DUAL_QUAT_EPS && abs(x - q.x) < CV_DUAL_QUAT_EPS &&
+            abs(y - q.y) < CV_DUAL_QUAT_EPS && abs(z - q.z) < CV_DUAL_QUAT_EPS &&
+            abs(w_ - q.w_) < CV_DUAL_QUAT_EPS && abs(x_ - q.x_) < CV_DUAL_QUAT_EPS &&
+            abs(y_ - q.y_) < CV_DUAL_QUAT_EPS && abs(z_ - q.z_) < CV_DUAL_QUAT_EPS);
+}
+
+template <typename T>
+inline Quat<T> DualQuat<T>::getRealPart() const
+{
+    return Quat<T>(w, x, y, z);
+}
+
+template <typename T>
+inline Quat<T> DualQuat<T>::getDualPart() const
+{
+    return Quat<T>(w_, x_, y_, z_);
+}
+
+template <typename T>
+inline DualQuat<T> conjugate(const DualQuat<T> &dq)
+{
+    return dq.conjugate();
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::conjugate() const
+{
+    return DualQuat<T>(w, -x, -y, -z, w_, -x_, -y_, -z_);
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::norm() const
+{
+    Quat<T> real = getRealPart();
+    T realNorm = real.norm();
+    Quat<T> dual = getDualPart();
+    if (realNorm < CV_DUAL_QUAT_EPS){
+        return DualQuat<T>(0, 0, 0, 0, 0, 0, 0, 0);
+    }
+    return DualQuat<T>(realNorm, 0, 0, 0, real.dot(dual) / realNorm, 0, 0, 0);
+}
+
+template <typename T>
+inline Quat<T> DualQuat<T>::getRotation(QuatAssumeType assumeUnit) const
+{
+    if (assumeUnit)
+    {
+        return getRealPart();
+    }
+    return getRealPart().normalize();
+}
+
+template <typename T>
+inline Vec<T, 3> DualQuat<T>::getTranslation(QuatAssumeType assumeUnit) const
+{
+    Quat<T> trans = T(2.0) * (getDualPart() * getRealPart().inv(assumeUnit));
+    return Vec<T, 3>{trans[1], trans[2], trans[3]};
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::normalize() const
+{
+    Quat<T> p = getRealPart();
+    Quat<T> q = getDualPart();
+    T p_norm = p.norm();
+    if (p_norm < CV_DUAL_QUAT_EPS)
+    {
+        CV_Error(Error::StsBadArg, "Cannot normalize this dual quaternion: the norm is too small.");
+    }
+    Quat<T> p_nr = p / p_norm;
+    Quat<T> q_nr = q / p_norm;
+    return createFromQuat(p_nr, q_nr - p_nr * p_nr.dot(q_nr));
+}
+
+template <typename T>
+inline T DualQuat<T>::dot(DualQuat<T> q) const
+{
+    return q.w * w + q.x * x + q.y * y + q.z * z + q.w_ * w_ + q.x_ * x_ + q.y_ * y_ + q.z_ * z_;
+}
+
+template <typename T>
+inline DualQuat<T> inv(const DualQuat<T> &dq, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT)
+{
+    return dq.inv(assumeUnit);
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::inv(QuatAssumeType assumeUnit) const
+{
+    const Quat<T> realInv = getRealPart().inv(assumeUnit); // inverted once, reused for all three factors
+    const Quat<T> dual = getDualPart();
+    return createFromQuat(realInv, -realInv * dual * realInv); // dual part of the inverse: -p^{-1} q p^{-1}
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator-(const DualQuat<T> &q) const
+{
+    return DualQuat<T>(w - q.w, x - q.x, y - q.y, z - q.z, w_ - q.w_, x_ - q.x_, y_ - q.y_, z_ - q.z_);
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator-() const
+{
+    return DualQuat<T>(-w, -x, -y, -z, -w_, -x_, -y_, -z_);
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator+(const DualQuat<T> &q) const
+{
+    return DualQuat<T>(w + q.w, x + q.x, y + q.y, z + q.z, w_ + q.w_, x_ + q.x_, y_ + q.y_, z_ + q.z_);
+}
+
+template <typename T>
+inline DualQuat<T>& DualQuat<T>::operator+=(const DualQuat<T> &q)
+{
+    *this = *this + q;
+    return *this;
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator*(const DualQuat<T> &q) const
+{
+    Quat<T> A = getRealPart();
+    Quat<T> B = getDualPart();
+    Quat<T> C = q.getRealPart();
+    Quat<T> D = q.getDualPart();
+    return DualQuat<T>::createFromQuat(A * C, A * D + B * C);
+}
+
+template <typename T>
+inline DualQuat<T>& DualQuat<T>::operator*=(const DualQuat<T> &q)
+{
+    *this = *this * q;
+    return *this;
+}
+
+template <typename T>
+inline DualQuat<T> operator+(const T a, const DualQuat<T> &q)
+{
+    return DualQuat<T>(a + q.w, q.x, q.y, q.z, q.w_, q.x_, q.y_, q.z_);
+}
+
+template <typename T>
+inline DualQuat<T> operator+(const DualQuat<T> &q, const T a)
+{
+    return DualQuat<T>(a + q.w, q.x, q.y, q.z, q.w_, q.x_, q.y_, q.z_);
+}
+
+template <typename T>
+inline DualQuat<T> operator-(const DualQuat<T> &q, const T a)
+{
+    return DualQuat<T>(q.w - a, q.x, q.y, q.z, q.w_, q.x_, q.y_, q.z_);
+}
+
+template <typename T>
+inline DualQuat<T>& DualQuat<T>::operator-=(const DualQuat<T> &q)
+{
+    *this = *this - q;
+    return *this;
+}
+
+template <typename T>
+inline DualQuat<T> operator-(const T a, const DualQuat<T> &q)
+{
+    return DualQuat<T>(a - q.w, -q.x, -q.y, -q.z, -q.w_, -q.x_, -q.y_, -q.z_);
+}
+
+template <typename T>
+inline DualQuat<T> operator*(const T a, const DualQuat<T> &q)
+{
+    return DualQuat<T>(q.w * a, q.x * a, q.y * a, q.z * a, q.w_ * a, q.x_ * a, q.y_ * a, q.z_ * a);
+}
+
+template <typename T>
+inline DualQuat<T> operator*(const DualQuat<T> &q, const T a)
+{
+    return DualQuat<T>(q.w * a, q.x * a, q.y * a, q.z * a, q.w_ * a, q.x_ * a, q.y_ * a, q.z_ * a);
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator/(const T a) const
+{
+    return DualQuat<T>(w / a, x / a, y / a, z / a, w_ / a, x_ / a, y_ / a, z_ / a);
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator/(const DualQuat<T> &q) const
+{
+    return *this * q.inv();
+}
+
+template <typename T>
+inline DualQuat<T>& DualQuat<T>::operator/=(const DualQuat<T> &q)
+{
+    *this = *this / q;
+    return *this;
+}
+
+template <typename T>
+std::ostream & operator<<(std::ostream &os, const DualQuat<T> &q)
+{
+    os << "DualQuat " << Vec<T, 8>{q.w, q.x, q.y, q.z, q.w_, q.x_, q.y_, q.z_};
+    return os;
+}
+
+template <typename T>
+inline DualQuat<T> exp(const DualQuat<T> &dq)
+{
+    return dq.exp();
+}
+
+namespace detail {
+
+template <typename _Tp>
+Matx<_Tp, 4, 4> jacob_exp(const Quat<_Tp> &q)
+{
+    _Tp nv = std::sqrt(q.x * q.x + q.y * q.y + q.z * q.z);
+    _Tp sinc_nv = abs(nv) < cv::DualQuat<_Tp>::CV_DUAL_QUAT_EPS ? _Tp(1.0) - nv * nv * _Tp(1.0/6.0) : std::sin(nv) / nv;
+    _Tp csiii_nv = abs(nv) < cv::DualQuat<_Tp>::CV_DUAL_QUAT_EPS ? -_Tp(1.0/3.0) : (std::cos(nv) - sinc_nv) / nv / nv;
+    Matx<_Tp, 4, 4> J_exp_quat {
+        std::cos(nv), -sinc_nv * q.x,  -sinc_nv * q.y,  -sinc_nv * q.z,
+        sinc_nv * q.x, csiii_nv * q.x * q.x + sinc_nv, csiii_nv * q.x * q.y, csiii_nv * q.x * q.z,
+        sinc_nv * q.y, csiii_nv * q.y * q.x, csiii_nv * q.y * q.y + sinc_nv, csiii_nv * q.y * q.z,
+        sinc_nv * q.z, csiii_nv * q.z * q.x, csiii_nv * q.z * q.y, csiii_nv * q.z * q.z + sinc_nv
+    };
+    return std::exp(q.w) * J_exp_quat;
+}
+
+} // namespace detail
+
+template <typename T>
+DualQuat<T> DualQuat<T>::exp() const
+{
+    Quat<T> real = getRealPart();
+    return createFromQuat(real.exp(), Quat<T>(detail::jacob_exp(real) * getDualPart().toVec()));
+}
+
+template <typename T>
+DualQuat<T> log(const DualQuat<T> &dq, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT)
+{
+    return dq.log(assumeUnit);
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::log(QuatAssumeType assumeUnit) const
+{
+    Quat<T> plog = getRealPart().log(assumeUnit);
+    Matx<T, 4, 4> jacob = detail::jacob_exp(plog);
+    return createFromQuat(plog, Quat<T>(jacob.inv() * getDualPart().toVec()));
+}
+
+template <typename T>
+inline DualQuat<T> power(const DualQuat<T> &dq, const T t, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT)
+{
+    return dq.power(t, assumeUnit);
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::power(const T t, QuatAssumeType assumeUnit) const
+{
+    return (t * log(assumeUnit)).exp();
+}
+
+template <typename T>
+inline DualQuat<T> power(const DualQuat<T> &p, const DualQuat<T> &q, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT)
+{
+    return p.power(q, assumeUnit);
+}
+
+template <typename T>
+inline DualQuat<T> DualQuat<T>::power(const DualQuat<T> &q, QuatAssumeType assumeUnit) const
+{
+    return (q * log(assumeUnit)).exp();
+}
+
+template <typename T>
+inline Vec<T, 8> DualQuat<T>::toVec() const
+{
+   return Vec<T, 8>(w, x, y, z, w_, x_, y_, z_);
+}
+
+template <typename T>
+Affine3<T> DualQuat<T>::toAffine3(QuatAssumeType assumeUnit) const
+{
+    return Affine3<T>(toMat(assumeUnit));
+}
+
+template <typename T>
+Matx<T, 4, 4> DualQuat<T>::toMat(QuatAssumeType assumeUnit) const
+{
+    Matx<T, 4, 4> rot44 = getRotation(assumeUnit).toRotMat4x4();
+    Vec<T, 3> translation = getTranslation(assumeUnit);
+    rot44(0, 3) = translation[0];
+    rot44(1, 3) = translation[1];
+    rot44(2, 3) = translation[2];
+    return rot44;
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::sclerp(const DualQuat<T> &q0, const DualQuat<T> &q1, const T t, bool directChange, QuatAssumeType assumeUnit)
+{
+    DualQuat<T> v0(q0), v1(q1);
+    if (!assumeUnit)
+    {
+        v0 = v0.normalize();
+        v1 = v1.normalize();
+    }
+    Quat<T> v0Real = v0.getRealPart();
+    Quat<T> v1Real = v1.getRealPart();
+    if (directChange && v1Real.dot(v0Real) < 0)
+    {
+        v0 = -v0;
+    }
+    DualQuat<T> v0inv1 = v0.inv() * v1;
+    return v0 * v0inv1.power(t, QUAT_ASSUME_UNIT);
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::dqblend(const DualQuat<T> &q1, const DualQuat<T> &q2, const T t, QuatAssumeType assumeUnit)
+{
+    DualQuat<T> v1(q1), v2(q2);
+    if (!assumeUnit)
+    {
+        v1 = v1.normalize();
+        v2 = v2.normalize();
+    }
+    if (v1.getRotation(assumeUnit).dot(v2.getRotation(assumeUnit)) < 0)
+    {
+        return ((1 - t) * v1 - t * v2).normalize();
+    }
+    return ((1 - t) * v1 + t * v2).normalize();
+}
+
+template <typename T>
+DualQuat<T> DualQuat<T>::gdqblend(InputArray _dualquat, InputArray _weight, QuatAssumeType assumeUnit)
+{
+    CV_CheckTypeEQ(_weight.type(), cv::traits::Type<T>::value, "");
+    CV_CheckTypeEQ(_dualquat.type(), CV_MAKETYPE(CV_MAT_DEPTH(cv::traits::Type<T>::value), 8), "");
+    Size dq_s = _dualquat.size();
+    if (dq_s != _weight.size() || (dq_s.height != 1 && dq_s.width != 1))
+    {
+        CV_Error(Error::StsBadArg, "The size of weight must be the same as dualquat, both of them should be (1, n) or (n, 1)");
+    }
+    Mat dualquat = _dualquat.getMat(), weight = _weight.getMat();
+    const int cn = std::max(dq_s.width, dq_s.height);
+    if (!assumeUnit)
+    {
+        // Normalize a private copy: getMat() shares the caller's buffer and an input must stay untouched.
+        dualquat = dualquat.clone();
+        for (int i = 0; i < cn; ++i)
+            dualquat.at<Vec<T, 8>>(i) = DualQuat<T>{dualquat.at<Vec<T, 8>>(i)}.normalize().toVec();
+    }
+    Vec<T, 8> dq_blend = dualquat.at<Vec<T, 8>>(0) * weight.at<T>(0);
+    Quat<T> q0 = DualQuat<T> {dualquat.at<Vec<T, 8>>(0)}.getRotation(assumeUnit);
+    for (int i = 1; i < cn; ++i)
+    {
+        T k = q0.dot(DualQuat<T>{dualquat.at<Vec<T, 8>>(i)}.getRotation(assumeUnit)) < 0 ? -1: 1; // negate terms whose rotation has a negative dot product with the first
+        dq_blend = dq_blend + dualquat.at<Vec<T, 8>>(i) * k * weight.at<T>(i);
+    }
+    return DualQuat<T>{dq_blend}.normalize();
+}
+
+template <typename T>
+template <int cn>
+DualQuat<T> DualQuat<T>::gdqblend(const Vec<DualQuat<T>, cn> &_dualquat, InputArray _weight, QuatAssumeType assumeUnit)
+{
+    if (cn == 0)
+    {
+        return DualQuat<T>(1, 0, 0, 0, 0, 0, 0, 0);
+    }
+    // Element depth must follow T (the InputArray overload checks it); a fixed CV_64F broke DualQuat<float>.
+    Mat dualquat_mat(cn, 1, CV_MAKETYPE(CV_MAT_DEPTH(cv::traits::Type<T>::value), 8));
+    for (int i = 0; i < cn ; ++i)
+    {
+        dualquat_mat.at<Vec<T, 8>>(i) = _dualquat[i].toVec();
+    }
+    return gdqblend(dualquat_mat, _weight, assumeUnit);
+}
+
+} //namespace cv
+
+#endif /*OPENCV_CORE_DUALQUATERNION_INL_HPP*/
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/eigen.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/eigen.hpp
new file mode 100644
index 000000000000..231c6805c0ca
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/eigen.hpp
@@ -0,0 +1,403 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#ifndef OPENCV_CORE_EIGEN_HPP
+#define OPENCV_CORE_EIGEN_HPP
+
+#ifndef EIGEN_WORLD_VERSION
+#error "Wrong usage of OpenCV's Eigen utility header. Include Eigen's headers first. See https://github.com/opencv/opencv/issues/17366"
+#endif
+
+#include "opencv2/core.hpp"
+
+#if defined _MSC_VER && _MSC_VER >= 1200
+#ifndef NOMINMAX
+#define NOMINMAX // fix https://github.com/opencv/opencv/issues/17548
+#endif
+#pragma warning( disable: 4714 ) //__forceinline is not inlined
+#pragma warning( disable: 4127 ) //conditional expression is constant
+#pragma warning( disable: 4244 ) //conversion from '__int64' to 'int', possible loss of data
+#endif
+
+#if !defined(OPENCV_DISABLE_EIGEN_TENSOR_SUPPORT)
+#if EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3
+#include <unsupported/Eigen/CXX11/Tensor>
+#define OPENCV_EIGEN_TENSOR_SUPPORT 1
+#endif  // EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3
+#endif  // !defined(OPENCV_DISABLE_EIGEN_TENSOR_SUPPORT)
+
+namespace cv
+{
+
+/** @addtogroup core_eigen
+These functions are provided for OpenCV-Eigen interoperability. They convert `Mat`
+objects to corresponding `Eigen::Matrix` objects and vice-versa. Consult the [Eigen
+documentation](https://eigen.tuxfamily.org/dox/group__TutorialMatrixClass.html) for
+information about the `Matrix` template type.
+
+@note Using these functions requires the `Eigen/Dense` or similar header to be
+included before this header.
+*/
+//! @{
+
+#if defined(OPENCV_EIGEN_TENSOR_SUPPORT) || defined(CV_DOXYGEN)
+/** @brief Converts an Eigen::Tensor to a cv::Mat.
+
+The method converts an Eigen::Tensor with shape (H x W x C) to a cv::Mat where:
+ H = number of rows
+ W = number of columns
+ C = number of channels
+
+Usage:
+\code
+Eigen::Tensor<float, 3, Eigen::RowMajor> a_tensor(...);
+// populate tensor with values
+Mat a_mat;
+eigen2cv(a_tensor, a_mat);
+\endcode
+*/
+template <typename _Tp, int _layout> static inline
+void eigen2cv( const Eigen::Tensor<_Tp, 3, _layout> &src, OutputArray dst )
+{
+    if( !(_layout & Eigen::RowMajorBit) )
+    {
+        const std::array<int, 3> shuffle{2, 1, 0};
+        Eigen::Tensor<_Tp, 3, !_layout> row_major_tensor = src.swap_layout().shuffle(shuffle);
+        Mat _src(src.dimension(0), src.dimension(1), CV_MAKETYPE(DataType<_Tp>::type, src.dimension(2)), row_major_tensor.data());
+        _src.copyTo(dst);
+    }
+    else
+    {
+        Mat _src(src.dimension(0), src.dimension(1), CV_MAKETYPE(DataType<_Tp>::type, src.dimension(2)), (void *)src.data());
+        _src.copyTo(dst);
+    }
+}
+
+/** @brief Converts a cv::Mat to an Eigen::Tensor.
+
+The method converts a cv::Mat to an Eigen Tensor with shape (H x W x C) where:
+ H = number of rows
+ W = number of columns
+ C = number of channels
+
+Usage:
+\code
+Mat a_mat(...);
+// populate Mat with values
+Eigen::Tensor<float, 3, Eigen::RowMajor> a_tensor(...);
+cv2eigen(a_mat, a_tensor);
+\endcode
+*/
+template <typename _Tp, int _layout> static inline
+void cv2eigen( const Mat &src, Eigen::Tensor<_Tp, 3, _layout> &dst )
+{
+    if( !(_layout & Eigen::RowMajorBit) )
+    {
+        Eigen::Tensor<_Tp, 3, !_layout> row_major_tensor(src.rows, src.cols, src.channels());
+        Mat _dst(src.rows, src.cols, CV_MAKETYPE(DataType<_Tp>::type, src.channels()), row_major_tensor.data());
+        if (src.type() == _dst.type())
+            src.copyTo(_dst);
+        else
+            src.convertTo(_dst, _dst.type());
+        const std::array<int, 3> shuffle{2, 1, 0};
+        dst = row_major_tensor.swap_layout().shuffle(shuffle);
+    }
+    else
+    {
+        dst.resize(src.rows, src.cols, src.channels());
+        Mat _dst(src.rows, src.cols, CV_MAKETYPE(DataType<_Tp>::type, src.channels()), dst.data());
+        if (src.type() == _dst.type())
+            src.copyTo(_dst);
+        else
+            src.convertTo(_dst, _dst.type());
+    }
+}
+
+/** @brief Maps cv::Mat data to an Eigen::TensorMap.
+
+The method wraps an existing Mat data array with an Eigen TensorMap of shape (H x W x C) where:
+ H = number of rows
+ W = number of columns
+ C = number of channels
+
+Explicit instantiation of the return type is required.
+
+@note Caller should be aware of the lifetime of the cv::Mat instance and take appropriate safety measures.
+The cv::Mat instance will retain ownership of the data and the Eigen::TensorMap will lose access when the cv::Mat data is deallocated.
+
+The example below initializes a cv::Mat and produces an Eigen::TensorMap:
+\code
+float arr[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+Mat a_mat(2, 2, CV_32FC3, arr);
+Eigen::TensorMap<Eigen::Tensor<float, 3, Eigen::RowMajor>> a_tensormap = cv2eigen_tensormap<float>(a_mat);
+\endcode
+*/
+template <typename _Tp> static inline
+Eigen::TensorMap<Eigen::Tensor<_Tp, 3, Eigen::RowMajor>> cv2eigen_tensormap(InputArray src)
+{
+    Mat mat = src.getMat();
+    CV_CheckTypeEQ(mat.type(), CV_MAKETYPE(traits::Type<_Tp>::value, mat.channels()), "");
+    return Eigen::TensorMap<Eigen::Tensor<_Tp, 3, Eigen::RowMajor>>((_Tp *)mat.data, mat.rows, mat.cols, mat.channels());
+}
+#endif // OPENCV_EIGEN_TENSOR_SUPPORT
+
+template<typename _Tp, int _rows, int _cols, int _options, int _maxRows, int _maxCols> static inline
+void eigen2cv( const Eigen::Matrix<_Tp, _rows, _cols, _options, _maxRows, _maxCols>& src, OutputArray dst )
+{
+    if( !(src.Flags & Eigen::RowMajorBit) )
+    {
+        Mat _src(src.cols(), src.rows(), traits::Type<_Tp>::value,
+              (void*)src.data(), src.outerStride()*sizeof(_Tp));
+        transpose(_src, dst);
+    }
+    else
+    {
+        Mat _src(src.rows(), src.cols(), traits::Type<_Tp>::value,
+                 (void*)src.data(), src.outerStride()*sizeof(_Tp));
+        _src.copyTo(dst);
+    }
+}
+
+// Matx case
+template<typename _Tp, int _rows, int _cols, int _options, int _maxRows, int _maxCols> static inline
+void eigen2cv( const Eigen::Matrix<_Tp, _rows, _cols, _options, _maxRows, _maxCols>& src,
+               Matx<_Tp, _rows, _cols>& dst )
+{
+    if( !(src.Flags & Eigen::RowMajorBit) )
+    {
+        dst = Matx<_Tp, _cols, _rows>(static_cast<const _Tp*>(src.data())).t();
+    }
+    else
+    {
+        dst = Matx<_Tp, _rows, _cols>(static_cast<const _Tp*>(src.data()));
+    }
+}
+
+template<typename _Tp, int _rows, int _cols, int _options, int _maxRows, int _maxCols> static inline
+void cv2eigen( const Mat& src,
+               Eigen::Matrix<_Tp, _rows, _cols, _options, _maxRows, _maxCols>& dst )
+{
+    CV_DbgAssert(src.rows == _rows && src.cols == _cols);
+    if( !(dst.Flags & Eigen::RowMajorBit) )
+    {
+        const Mat _dst(src.cols, src.rows, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp)));
+        if( src.type() == _dst.type() )
+            transpose(src, _dst);
+        else if( src.cols == src.rows )
+        {
+            src.convertTo(_dst, _dst.type());
+            transpose(_dst, _dst);
+        }
+        else
+            Mat(src.t()).convertTo(_dst, _dst.type());
+    }
+    else
+    {
+        const Mat _dst(src.rows, src.cols, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp)));
+        src.convertTo(_dst, _dst.type());
+    }
+}
+
+// Matx case
+template<typename _Tp, int _rows, int _cols, int _options, int _maxRows, int _maxCols> static inline
+void cv2eigen( const Matx<_Tp, _rows, _cols>& src,
+               Eigen::Matrix<_Tp, _rows, _cols, _options, _maxRows, _maxCols>& dst )
+{
+    if( !(dst.Flags & Eigen::RowMajorBit) )
+    {
+        const Mat _dst(_cols, _rows, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp)));
+        transpose(src, _dst);
+    }
+    else
+    {
+        const Mat _dst(_rows, _cols, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp)));
+        Mat(src).copyTo(_dst);
+    }
+}
+
+template<typename _Tp>  static inline
+void cv2eigen( const Mat& src,
+               Eigen::Matrix<_Tp, Eigen::Dynamic, Eigen::Dynamic>& dst )
+{
+    dst.resize(src.rows, src.cols);
+    if( !(dst.Flags & Eigen::RowMajorBit) )
+    {
+        const Mat _dst(src.cols, src.rows, traits::Type<_Tp>::value,
+             dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp)));
+        if( src.type() == _dst.type() )
+            transpose(src, _dst);
+        else if( src.cols == src.rows )
+        {
+            src.convertTo(_dst, _dst.type());
+            transpose(_dst, _dst);
+        }
+        else
+            Mat(src.t()).convertTo(_dst, _dst.type());
+    }
+    else
+    {
+        const Mat _dst(src.rows, src.cols, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp)));
+        src.convertTo(_dst, _dst.type());
+    }
+}
+
+// Matx case
+template<typename _Tp, int _rows, int _cols> static inline
+void cv2eigen( const Matx<_Tp, _rows, _cols>& src,
+               Eigen::Matrix<_Tp, Eigen::Dynamic, Eigen::Dynamic>& dst )
+{
+    dst.resize(_rows, _cols);
+    if( !(dst.Flags & Eigen::RowMajorBit) )
+    {
+        const Mat _dst(_cols, _rows, traits::Type<_Tp>::value,
+             dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp)));
+        transpose(src, _dst);
+    }
+    else
+    {
+        const Mat _dst(_rows, _cols, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp)));
+        Mat(src).copyTo(_dst);
+    }
+}
+
+template<typename _Tp> static inline
+void cv2eigen( const Mat& src,
+               Eigen::Matrix<_Tp, Eigen::Dynamic, 1>& dst )
+{
+    CV_Assert(src.cols == 1);
+    dst.resize(src.rows);
+
+    if( !(dst.Flags & Eigen::RowMajorBit) )
+    {
+        const Mat _dst(src.cols, src.rows, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp)));
+        if( src.type() == _dst.type() )
+            transpose(src, _dst);
+        else
+            Mat(src.t()).convertTo(_dst, _dst.type());
+    }
+    else
+    {
+        const Mat _dst(src.rows, src.cols, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp)));
+        src.convertTo(_dst, _dst.type());
+    }
+}
+
+// Matx case
+template<typename _Tp, int _rows> static inline
+void cv2eigen( const Matx<_Tp, _rows, 1>& src,
+               Eigen::Matrix<_Tp, Eigen::Dynamic, 1>& dst )
+{
+    dst.resize(_rows);
+
+    if( !(dst.Flags & Eigen::RowMajorBit) )
+    {
+        const Mat _dst(1, _rows, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp)));
+        transpose(src, _dst);
+    }
+    else
+    {
+        const Mat _dst(_rows, 1, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp)));
+        Mat(src).copyTo(_dst); // Matx has no copyTo(); go through a Mat as the other Matx overloads do
+    }
+}
+
+
+template<typename _Tp> static inline
+void cv2eigen( const Mat& src,
+               Eigen::Matrix<_Tp, 1, Eigen::Dynamic>& dst )
+{
+    CV_Assert(src.rows == 1);
+    dst.resize(src.cols);
+    if( !(dst.Flags & Eigen::RowMajorBit) )
+    {
+        const Mat _dst(src.cols, src.rows, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp)));
+        if( src.type() == _dst.type() )
+            transpose(src, _dst);
+        else
+            Mat(src.t()).convertTo(_dst, _dst.type());
+    }
+    else
+    {
+        const Mat _dst(src.rows, src.cols, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp)));
+        src.convertTo(_dst, _dst.type());
+    }
+}
+
+//Matx
+template<typename _Tp, int _cols> static inline
+void cv2eigen( const Matx<_Tp, 1, _cols>& src,
+               Eigen::Matrix<_Tp, 1, Eigen::Dynamic>& dst )
+{
+    dst.resize(_cols);
+    if( !(dst.Flags & Eigen::RowMajorBit) )
+    {
+        const Mat _dst(_cols, 1, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp)));
+        transpose(src, _dst);
+    }
+    else
+    {
+        const Mat _dst(1, _cols, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp)));
+        Mat(src).copyTo(_dst);
+    }
+}
+
+//! @}
+
+} // cv
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/fast_math.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/fast_math.hpp
new file mode 100644
index 000000000000..a28c3fbedfe6
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/fast_math.hpp
@@ -0,0 +1,433 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_FAST_MATH_HPP
+#define OPENCV_CORE_FAST_MATH_HPP
+
+#include "opencv2/core/cvdef.h"
+
+//! @addtogroup core_utils
+//! @{
+
+/****************************************************************************************\
+*                                      fast math                                         *
+\****************************************************************************************/
+
+#ifdef __cplusplus
+#  include <cmath>
+#else
+#  ifdef __BORLANDC__
+#    include <fastmath.h>
+#  else
+#    include <math.h>
+#  endif
+#endif
+
+#if defined(__CUDACC__)
+  // nothing, intrinsics/asm code is not supported
+#else
+  #if ((defined _MSC_VER && defined _M_X64) \
+      || (defined __GNUC__ && defined __SSE2__)) \
+      && !defined(OPENCV_SKIP_INCLUDE_EMMINTRIN_H)
+    #include <emmintrin.h> // declares the _mm_* intrinsics that the cvRound overloads in this header use
+  #endif
+
+  #if defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8 \
+      && !defined(OPENCV_SKIP_INCLUDE_ALTIVEC_H)
+    #include <altivec.h>
+    #undef vector // NOTE(review): presumably altivec.h defines vector/bool/pixel as macros - confirm
+    #undef bool   // dropped right after the include so the macro cannot reach code that includes this header
+    #undef pixel  // same as above
+  #endif
+
+  #if defined(CV_INLINE_ROUND_FLT)
+    // user-specified version: CV_INLINE_ROUND_FLT was already defined before this point
+    // CV_INLINE_ROUND_DBL should be defined too (cvRound(double) tests for it separately)
+  #elif defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON) && !defined __SOFTFP__
+    // 1. general scheme: run _asm_string on _value via the float temp, then return the int result
+    #define ARM_ROUND(_value, _asm_string) \
+        int res; \
+        float temp; \
+        CV_UNUSED(temp); \
+        __asm__(_asm_string : [res] "=r" (res), [temp] "=w" (temp) : [value] "w" (_value)); \
+        return res
+    // 2. version for double (the clang and non-clang strings differ only in the %P operand modifier)
+    #ifdef __clang__
+        #define CV_INLINE_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
+    #else
+        #define CV_INLINE_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
+    #endif
+    // 3. version for float (same scheme, using the f32 form of the conversion instruction)
+    #define CV_INLINE_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
+  #elif defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8
+    // P8 and newer machines can convert fp32/64 to int quickly.
+    #define CV_INLINE_ROUND_DBL(value) \
+        int out; \
+        double temp; \
+        __asm__( "fctiw %[temp],%[in]\n\tmfvsrwz %[out],%[temp]\n\t" : [out] "=r" (out), [temp] "=d" (temp) : [in] "d" ((double)(value)) : ); \
+        return out;
+
+    // FP32 also works with the FP64 routine above, which casts its argument to double
+    #define CV_INLINE_ROUND_FLT(value) CV_INLINE_ROUND_DBL(value)
+  #endif
+
+  #ifdef CV_INLINE_ISINF_FLT
+    // user-specified version: CV_INLINE_ISINF_FLT was already defined before this point
+    // CV_INLINE_ISINF_DBL should be defined too (cvIsInf(double) tests for it separately)
+  #elif defined __PPC64__ && defined _ARCH_PWR9 && defined(scalar_test_data_class)
+    #define CV_INLINE_ISINF_DBL(value) return scalar_test_data_class(value, 0x30); // NOTE(review): 0x30 presumably selects the +/-infinity classes - confirm
+    #define CV_INLINE_ISINF_FLT(value) CV_INLINE_ISINF_DBL(value) // the float version reuses the double macro
+  #endif
+
+  #ifdef CV_INLINE_ISNAN_FLT
+    // user-specified version: CV_INLINE_ISNAN_FLT was already defined before this point
+    // CV_INLINE_ISNAN_DBL should be defined too (cvIsNaN(double) tests for it separately)
+  #elif defined __PPC64__ && defined _ARCH_PWR9 && defined(scalar_test_data_class)
+    #define CV_INLINE_ISNAN_DBL(value) return scalar_test_data_class(value, 0x40); // NOTE(review): 0x40 presumably selects the NaN class - confirm
+    #define CV_INLINE_ISNAN_FLT(value) CV_INLINE_ISNAN_DBL(value) // the float version reuses the double macro
+  #endif
+
+  #if !defined(OPENCV_USE_FASTMATH_BUILTINS) \
+    && ( \
+        defined(__x86_64__) || defined(__i686__) \
+        || defined(__arm__) \
+        || defined(__PPC64__) \
+    )
+    /* Use builtin C math functions when available. Dedicated hardware is available to
+       round and convert FP values on the architectures listed above. */
+    #define OPENCV_USE_FASTMATH_BUILTINS 1
+  #endif
+
+  /* Enable builtin math functions if possible, desired, and available.
+     Note that not all math functions inline equally. E.g. lrint will not inline
+     without the -fno-math-errno option. */
+  #if defined(CV_ICC)
+    // nothing
+  #elif defined(OPENCV_USE_FASTMATH_BUILTINS) && OPENCV_USE_FASTMATH_BUILTINS
+    #if defined(__clang__) // clang: each builtin is used only if __has_builtin reports it
+      #define CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
+      #if !defined(CV_INLINE_ISNAN_DBL) && __has_builtin(__builtin_isnan)
+        #define CV_INLINE_ISNAN_DBL(value) return __builtin_isnan(value);
+      #endif
+      #if !defined(CV_INLINE_ISNAN_FLT) && __has_builtin(__builtin_isnan)
+        #define CV_INLINE_ISNAN_FLT(value) return __builtin_isnan(value);
+      #endif
+      #if !defined(CV_INLINE_ISINF_DBL) && __has_builtin(__builtin_isinf)
+        #define CV_INLINE_ISINF_DBL(value) return __builtin_isinf(value);
+      #endif
+      #if !defined(CV_INLINE_ISINF_FLT) && __has_builtin(__builtin_isinf)
+        #define CV_INLINE_ISINF_FLT(value) return __builtin_isinf(value);
+      #endif
+    #elif defined(__GNUC__) // GCC: the float macros use the f-suffixed builtins
+      #define CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS
+      #if !defined(CV_INLINE_ISNAN_DBL)
+        #define CV_INLINE_ISNAN_DBL(value) return __builtin_isnan(value);
+      #endif
+      #if !defined(CV_INLINE_ISNAN_FLT)
+        #define CV_INLINE_ISNAN_FLT(value) return __builtin_isnanf(value);
+      #endif
+      #if !defined(CV_INLINE_ISINF_DBL)
+        #define CV_INLINE_ISINF_DBL(value) return __builtin_isinf(value);
+      #endif
+      #if !defined(CV_INLINE_ISINF_FLT)
+        #define CV_INLINE_ISINF_FLT(value) return __builtin_isinff(value);
+      #endif
+    #elif defined(_MSC_VER) // MSVC: isnan/isinf from the math header included above; no CV__FASTMATH_ENABLE_* macro is set
+      #if !defined(CV_INLINE_ISNAN_DBL)
+        #define CV_INLINE_ISNAN_DBL(value) return isnan(value);
+      #endif
+      #if !defined(CV_INLINE_ISNAN_FLT)
+        #define CV_INLINE_ISNAN_FLT(value) return isnan(value);
+      #endif
+      #if !defined(CV_INLINE_ISINF_DBL)
+        #define CV_INLINE_ISINF_DBL(value) return isinf(value);
+      #endif
+      #if !defined(CV_INLINE_ISINF_FLT)
+        #define CV_INLINE_ISINF_FLT(value) return isinf(value);
+      #endif
+    #endif
+  #endif
+
+#endif // defined(__CUDACC__)
+
+/** @brief Rounds floating-point number to the nearest integer.
+
+ @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
+ result is not defined.
+ */
+CV_INLINE int
+cvRound( double value )
+{
+#if defined CV_INLINE_ROUND_DBL
+    CV_INLINE_ROUND_DBL(value); // the macro expands to statements that end in a return
+#elif ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __SSE2__)) && !defined(__CUDACC__)
+    __m128d t = _mm_set_sd( value ); // load the scalar into an SSE register
+    return _mm_cvtsd_si32(t); // SSE2 scalar double -> 32-bit int conversion
+#elif defined _MSC_VER && defined _M_IX86
+    int t;
+    __asm
+    {
+        fld value;
+        fistp t;
+    }
+    return t; // t was stored by the fistp instruction above
+#elif defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \
+      defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
+    return (int)__builtin_lrint(value); // compiler builtin form of lrint
+#else
+    return (int)lrint(value); // fallback: lrint from the math header included above
+#endif
+}
+
+
+/** @brief Rounds floating-point number to the nearest integer not larger than the original.
+
+ The function computes an integer i such that:
+ \f[i \le \texttt{value} < i+1\f]
+ @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
+ result is not defined.
+ */
+CV_INLINE int cvFloor( double value )
+{
+#if defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \
+    defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
+    return (int)__builtin_floor(value); // compiler builtin form of floor
+#elif defined __loongarch64
+    int i;
+    double tmp;
+    __asm__ ("ftintrm.l.d     %[tmp],    %[in]       \n\t"
+             "movfr2gr.d      %[i],      %[tmp]      \n\t"
+             : [i] "=r" (i), [tmp] "=f" (tmp)
+             : [in] "f" (value)
+             :);
+    return i; // i is an output operand of the asm statement above
+#else
+    int i = (int)value; // the cast truncates toward zero
+    return i - (i > value); // truncation rounds negative non-integers up, so step down by one in that case
+#endif
+}
+
+/** @brief Rounds floating-point number to the nearest integer not smaller than the original.
+
+ The function computes an integer i such that:
+ \f[i-1 < \texttt{value} \le i\f]
+ @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
+ result is not defined.
+ */
+CV_INLINE int cvCeil( double value )
+{
+#if defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \
+    defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
+    return (int)__builtin_ceil(value); // compiler builtin form of ceil
+#elif defined __loongarch64
+    int i;
+    double tmp;
+    __asm__ ("ftintrp.l.d     %[tmp],    %[in]       \n\t"
+             "movfr2gr.d      %[i],      %[tmp]      \n\t"
+             : [i] "=r" (i), [tmp] "=f" (tmp)
+             : [in] "f" (value)
+             :);
+    return i; // i is an output operand of the asm statement above
+#else
+    int i = (int)value; // the cast truncates toward zero
+    return i + (i < value); // truncation rounds positive non-integers down, so step up by one in that case
+#endif
+}
+
+/** @brief Determines if the argument is Not A Number.
+
+ @param value The input floating-point value
+
+ The function returns 1 if the argument is Not A Number (as defined by IEEE754 standard), 0
+ otherwise. */
+CV_INLINE int cvIsNaN( double value )
+{
+#if defined CV_INLINE_ISNAN_DBL
+    CV_INLINE_ISNAN_DBL(value); // the macro expands to a complete return statement
+#else
+    Cv64suf ieee754;
+    ieee754.f = value; // store as double, then inspect the same bits through the integer member u
+    return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) + // upper 32 bits with the sign bit cleared ...
+           ((unsigned)ieee754.u != 0) > 0x7ff00000; // ... plus 1 if any lower bit is set; only a NaN ends up above 0x7ff00000
+#endif
+}
+
+/** @brief Determines if the argument is Infinity.
+
+ @param value The input floating-point value
+
+ The function returns 1 if the argument is a plus or minus infinity (as defined by IEEE754 standard)
+ and 0 otherwise. */
+CV_INLINE int cvIsInf( double value )
+{
+#if defined CV_INLINE_ISINF_DBL
+    CV_INLINE_ISINF_DBL(value); // the macro expands to a complete return statement
+#elif defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__PPC64__) || defined(__loongarch64)
+    Cv64suf ieee754;
+    ieee754.f = value; // store as double, then inspect the same bits through the integer member u
+    return (ieee754.u & 0x7fffffffffffffff) == // clear the sign bit so +inf and -inf compare alike
+                        0x7ff0000000000000; // all exponent bits set, no mantissa bits set
+#else
+    Cv64suf ieee754;
+    ieee754.f = value; // store as double, then inspect the same bits through the integer member u
+    return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 && // same test as above, done on the upper 32 bits ...
+            (unsigned)ieee754.u == 0; // ... and on the lower 32 bits separately
+#endif
+}
+
+#ifdef __cplusplus
+
+/** @overload */
+CV_INLINE int cvRound(float value)
+{
+#if defined CV_INLINE_ROUND_FLT
+    CV_INLINE_ROUND_FLT(value); // the macro expands to statements that end in a return
+#elif ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __SSE2__)) && !defined(__CUDACC__)
+    __m128 t = _mm_set_ss( value ); // load the scalar into an SSE register
+    return _mm_cvtss_si32(t); // SSE scalar float -> 32-bit int conversion
+#elif defined _MSC_VER && defined _M_IX86
+    int t;
+    __asm
+    {
+        fld value;
+        fistp t;
+    }
+    return t; // t was stored by the fistp instruction above
+#elif defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \
+      defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
+    return (int)__builtin_lrintf(value); // compiler builtin form of lrintf
+#else
+    return (int)lrintf(value); // fallback: lrintf from the math header included above
+#endif
+}
+
+/** @overload */
+CV_INLINE int cvRound( int value )
+{
+    return value; // already an int: returned unchanged
+}
+
+/** @overload */
+CV_INLINE int cvFloor( float value )
+{
+#if defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \
+    defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
+    return (int)__builtin_floorf(value); // compiler builtin form of floorf
+#elif defined __loongarch__
+    int i;
+    float tmp;
+    __asm__ ("ftintrm.w.s     %[tmp],    %[in]       \n\t"
+             "movfr2gr.s      %[i],      %[tmp]      \n\t"
+             : [i] "=r" (i), [tmp] "=f" (tmp)
+             : [in] "f" (value)
+             :);
+    return i; // i is an output operand of the asm statement above
+#else
+    int i = (int)value; // the cast truncates toward zero
+    return i - (i > value); // truncation rounds negative non-integers up, so step down by one in that case
+#endif
+}
+
+/** @overload */
+CV_INLINE int cvFloor( int value )
+{
+    return value; // already an int: returned unchanged
+}
+
+/** @overload */
+CV_INLINE int cvCeil( float value )
+{
+#if defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \
+    defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
+    return (int)__builtin_ceilf(value); // compiler builtin form of ceilf
+#elif defined __loongarch__
+    int i;
+    float tmp;
+    __asm__ ("ftintrp.w.s     %[tmp],    %[in]       \n\t"
+             "movfr2gr.s      %[i],      %[tmp]      \n\t"
+             : [i] "=r" (i), [tmp] "=f" (tmp)
+             : [in] "f" (value)
+             :);
+    return i; // i is an output operand of the asm statement above
+#else
+    int i = (int)value; // the cast truncates toward zero
+    return i + (i < value); // truncation rounds positive non-integers down, so step up by one in that case
+#endif
+}
+
+/** @overload */
+CV_INLINE int cvCeil( int value )
+{
+    return value; // already an int: returned unchanged
+}
+
+/** @overload */
+CV_INLINE int cvIsNaN( float value )
+{
+#if defined CV_INLINE_ISNAN_FLT
+    CV_INLINE_ISNAN_FLT(value); // the macro expands to a complete return statement
+#else
+    Cv32suf ieee754;
+    ieee754.f = value; // store as float, then inspect the same bits through the integer member u
+    return (ieee754.u & 0x7fffffff) > 0x7f800000; // sign cleared; anything above the all-ones-exponent pattern is a NaN
+#endif
+}
+
+/** @overload */
+CV_INLINE int cvIsInf( float value )
+{
+#if defined CV_INLINE_ISINF_FLT
+    CV_INLINE_ISINF_FLT(value); // the macro expands to a complete return statement
+#else
+    Cv32suf ieee754;
+    ieee754.f = value; // store as float, then inspect the same bits through the integer member u
+    return (ieee754.u & 0x7fffffff) == 0x7f800000; // sign cleared; all exponent bits set and no mantissa bits set
+#endif
+}
+
+#endif // __cplusplus
+
+//! @} core_utils
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/hal.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/hal.hpp
new file mode 100644
index 000000000000..deca4e9539ae
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/hal.hpp
@@ -0,0 +1,260 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HAL_HPP
+#define OPENCV_HAL_HPP
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/cvstd.hpp"
+#include "opencv2/core/hal/interface.h"
+
+namespace cv { namespace hal {
+
+//! @addtogroup core_hal_functions
+//! @{
+
+CV_EXPORTS int normHamming(const uchar* a, int n);
+CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n);
+
+CV_EXPORTS int normHamming(const uchar* a, int n, int cellSize);
+CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n, int cellSize);
+
+CV_EXPORTS int LU32f(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+CV_EXPORTS int LU64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+CV_EXPORTS bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+CV_EXPORTS bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+CV_EXPORTS void SVD32f(float* At, size_t astep, float* W, float* U, size_t ustep, float* Vt, size_t vstep, int m, int n, int flags);
+CV_EXPORTS void SVD64f(double* At, size_t astep, double* W, double* U, size_t ustep, double* Vt, size_t vstep, int m, int n, int flags);
+CV_EXPORTS int QR32f(float* A, size_t astep, int m, int n, int k, float* b, size_t bstep, float* hFactors);
+CV_EXPORTS int QR64f(double* A, size_t astep, int m, int n, int k, double* b, size_t bstep, double* hFactors);
+
+CV_EXPORTS void gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
+                        float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
+                        int m_a, int n_a, int n_d, int flags);
+CV_EXPORTS void gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
+                        double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
+                        int m_a, int n_a, int n_d, int flags);
+CV_EXPORTS void gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
+                        float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
+                        int m_a, int n_a, int n_d, int flags);
+CV_EXPORTS void gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
+                        double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
+                        int m_a, int n_a, int n_d, int flags);
+
+CV_EXPORTS int normL1_(const uchar* a, const uchar* b, int n);
+CV_EXPORTS float normL1_(const float* a, const float* b, int n);
+CV_EXPORTS float normL2Sqr_(const float* a, const float* b, int n);
+
+CV_EXPORTS void exp32f(const float* src, float* dst, int n);
+CV_EXPORTS void exp64f(const double* src, double* dst, int n);
+CV_EXPORTS void log32f(const float* src, float* dst, int n);
+CV_EXPORTS void log64f(const double* src, double* dst, int n);
+
+CV_EXPORTS void cartToPolar32f(const float* x, const float* y, float* mag, float* angle, int n, bool angleInDegrees);
+CV_EXPORTS void cartToPolar64f(const double* x, const double* y, double* mag, double* angle, int n, bool angleInDegrees);
+CV_EXPORTS void fastAtan32f(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
+CV_EXPORTS void fastAtan64f(const double* y, const double* x, double* dst, int n, bool angleInDegrees);
+CV_EXPORTS void magnitude32f(const float* x, const float* y, float* dst, int n);
+CV_EXPORTS void magnitude64f(const double* x, const double* y, double* dst, int n);
+CV_EXPORTS void polarToCart32f(const float* mag, const float* angle, float* x, float* y, int n, bool angleInDegrees);
+CV_EXPORTS void polarToCart64f(const double* mag, const double* angle, double* x, double* y, int n, bool angleInDegrees);
+CV_EXPORTS void sqrt32f(const float* src, float* dst, int len);
+CV_EXPORTS void sqrt64f(const double* src, double* dst, int len);
+CV_EXPORTS void invSqrt32f(const float* src, float* dst, int len);
+CV_EXPORTS void invSqrt64f(const double* src, double* dst, int len);
+
+CV_EXPORTS void split8u(const uchar* src, uchar** dst, int len, int cn );
+CV_EXPORTS void split16u(const ushort* src, ushort** dst, int len, int cn );
+CV_EXPORTS void split32s(const int* src, int** dst, int len, int cn );
+CV_EXPORTS void split64s(const int64* src, int64** dst, int len, int cn );
+
+CV_EXPORTS void merge8u(const uchar** src, uchar* dst, int len, int cn );
+CV_EXPORTS void merge16u(const ushort** src, ushort* dst, int len, int cn );
+CV_EXPORTS void merge32s(const int** src, int* dst, int len, int cn );
+CV_EXPORTS void merge64s(const int64** src, int64* dst, int len, int cn );
+
+CV_EXPORTS void add8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void sub8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void max8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void min8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void absdiff8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void and8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void or8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void xor8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void not8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+
+CV_EXPORTS void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
+
+CV_EXPORTS void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
+
+CV_EXPORTS void recip8u( const uchar *, size_t, const uchar * src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip8s( const schar *, size_t, const schar * src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip16u( const ushort *, size_t, const ushort * src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip16s( const short *, size_t, const short * src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip32s( const int *, size_t, const int * src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip32f( const float *, size_t, const float * src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip64f( const double *, size_t, const double * src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
+
+CV_EXPORTS void addWeighted8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _scalars );
+CV_EXPORTS void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scalars );
+CV_EXPORTS void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scalars );
+CV_EXPORTS void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scalars );
+CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scalars );
+CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
+CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
+
+CV_EXPORTS void cvt16f32f( const hfloat* src, float* dst, int len );
+CV_EXPORTS void cvt32f16f( const float* src, hfloat* dst, int len );
+
+CV_EXPORTS void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len );
+CV_EXPORTS void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len );
+
+struct CV_EXPORTS DFT1D // abstract interface: instances come from create() and are run through apply()
+{
+    static Ptr<DFT1D> create(int len, int count, int depth, int flags, bool * useBuffer = 0); // factory; useBuffer is optional (defaults to a null pointer)
+    virtual void apply(const uchar *src, uchar *dst) = 0; // pure virtual: reads from src, writes to dst
+    virtual ~DFT1D() {} // virtual so that destruction through a DFT1D pointer reaches the derived class
+};
+
+struct CV_EXPORTS DFT2D // abstract interface: instances come from create() and are run through apply()
+{
+    static Ptr<DFT2D> create(int width, int height, int depth,
+                             int src_channels, int dst_channels,
+                             int flags, int nonzero_rows = 0); // factory; nonzero_rows is optional (defaults to 0)
+    virtual void apply(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) = 0; // pure virtual: source and destination are passed as raw pointer + step
+    virtual ~DFT2D() {} // virtual so that destruction through a DFT2D pointer reaches the derived class
+};
+
+struct CV_EXPORTS DCT2D // abstract interface: instances come from create() and are run through apply()
+{
+    static Ptr<DCT2D> create(int width, int height, int depth, int flags); // factory for an implementation object
+    virtual void apply(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) = 0; // pure virtual: source and destination are passed as raw pointer + step
+    virtual ~DCT2D() {} // virtual so that destruction through a DCT2D pointer reaches the derived class
+};
+
+//! @} core_hal_functions
+
+//=============================================================================
+// for binary compatibility with 3.0
+
+//! @cond IGNORED
+
+CV_EXPORTS int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+CV_EXPORTS int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+CV_EXPORTS bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+CV_EXPORTS bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+
+CV_EXPORTS void exp(const float* src, float* dst, int n);
+CV_EXPORTS void exp(const double* src, double* dst, int n);
+CV_EXPORTS void log(const float* src, float* dst, int n);
+CV_EXPORTS void log(const double* src, double* dst, int n);
+
+CV_EXPORTS void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
+CV_EXPORTS void magnitude(const float* x, const float* y, float* dst, int n);
+CV_EXPORTS void magnitude(const double* x, const double* y, double* dst, int n);
+CV_EXPORTS void sqrt(const float* src, float* dst, int len);
+CV_EXPORTS void sqrt(const double* src, double* dst, int len);
+CV_EXPORTS void invSqrt(const float* src, float* dst, int len);
+CV_EXPORTS void invSqrt(const double* src, double* dst, int len);
+
+//! @endcond
+
+}} //cv::hal
+
+#endif //OPENCV_HAL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/interface.h b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/interface.h
new file mode 100644
index 000000000000..6f0a83d35928
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/interface.h
@@ -0,0 +1,190 @@
+#ifndef OPENCV_CORE_HAL_INTERFACE_H
+#define OPENCV_CORE_HAL_INTERFACE_H
+
+//! @addtogroup core_hal_interface
+//! @{
+
+//! @name Return codes
+//! @{
+#define CV_HAL_ERROR_OK 0
+#define CV_HAL_ERROR_NOT_IMPLEMENTED 1
+#define CV_HAL_ERROR_UNKNOWN -1
+//! @}
+
+#ifdef __cplusplus
+#include <cstddef>
+#else
+#include <stddef.h>
+#include <stdbool.h>
+#endif
+
+//! @name Data types
+//! primitive types
+//! - schar  - signed 1 byte integer
+//! - uchar  - unsigned 1 byte integer
+//! - short  - signed 2 byte integer
+//! - ushort - unsigned 2 byte integer
+//! - int    - signed 4 byte integer
+//! - uint   - unsigned 4 byte integer
+//! - int64  - signed 8 byte integer
+//! - uint64 - unsigned 8 byte integer
+//! @{
+#if !defined _MSC_VER && !defined __BORLANDC__
+#  if defined __cplusplus && __cplusplus >= 201103L && !defined __APPLE__
+#    include <cstdint>
+#    ifdef __NEWLIB__
+        typedef unsigned int uint;
+#    else
+        typedef std::uint32_t uint;
+#    endif
+#  else
+#    include <stdint.h>
+     typedef uint32_t uint;
+#  endif
+#else
+   typedef unsigned uint;
+#endif
+
+typedef signed char schar;
+
+#ifndef __IPL_H__
+   typedef unsigned char uchar;
+   typedef unsigned short ushort;
+#endif
+
+#if defined _MSC_VER || defined __BORLANDC__
+   typedef __int64 int64;
+   typedef unsigned __int64 uint64;
+#  define CV_BIG_INT(n)   n##I64
+#  define CV_BIG_UINT(n)  n##UI64
+#else
+   typedef int64_t int64;
+   typedef uint64_t uint64;
+#  define CV_BIG_INT(n)   n##LL
+#  define CV_BIG_UINT(n)  n##ULL
+#endif
+
+#define CV_USRTYPE1 (void)"CV_USRTYPE1 support has been dropped in OpenCV 4.0"
+
+#define CV_CN_MAX     512
+#define CV_CN_SHIFT   3
+#define CV_DEPTH_MAX  (1 << CV_CN_SHIFT)
+
+#define CV_8U   0
+#define CV_8S   1
+#define CV_16U  2
+#define CV_16S  3
+#define CV_32S  4
+#define CV_32F  5
+#define CV_64F  6
+#define CV_16F  7
+
+#define CV_MAT_DEPTH_MASK       (CV_DEPTH_MAX - 1)
+#define CV_MAT_DEPTH(flags)     ((flags) & CV_MAT_DEPTH_MASK)
+
+#define CV_MAKETYPE(depth,cn) (CV_MAT_DEPTH(depth) + (((cn)-1) << CV_CN_SHIFT))
+#define CV_MAKE_TYPE CV_MAKETYPE
+
+#define CV_8UC1 CV_MAKETYPE(CV_8U,1)
+#define CV_8UC2 CV_MAKETYPE(CV_8U,2)
+#define CV_8UC3 CV_MAKETYPE(CV_8U,3)
+#define CV_8UC4 CV_MAKETYPE(CV_8U,4)
+#define CV_8UC(n) CV_MAKETYPE(CV_8U,(n))
+
+#define CV_8SC1 CV_MAKETYPE(CV_8S,1)
+#define CV_8SC2 CV_MAKETYPE(CV_8S,2)
+#define CV_8SC3 CV_MAKETYPE(CV_8S,3)
+#define CV_8SC4 CV_MAKETYPE(CV_8S,4)
+#define CV_8SC(n) CV_MAKETYPE(CV_8S,(n))
+
+#define CV_16UC1 CV_MAKETYPE(CV_16U,1)
+#define CV_16UC2 CV_MAKETYPE(CV_16U,2)
+#define CV_16UC3 CV_MAKETYPE(CV_16U,3)
+#define CV_16UC4 CV_MAKETYPE(CV_16U,4)
+#define CV_16UC(n) CV_MAKETYPE(CV_16U,(n))
+
+#define CV_16SC1 CV_MAKETYPE(CV_16S,1)
+#define CV_16SC2 CV_MAKETYPE(CV_16S,2)
+#define CV_16SC3 CV_MAKETYPE(CV_16S,3)
+#define CV_16SC4 CV_MAKETYPE(CV_16S,4)
+#define CV_16SC(n) CV_MAKETYPE(CV_16S,(n))
+
+#define CV_32SC1 CV_MAKETYPE(CV_32S,1)
+#define CV_32SC2 CV_MAKETYPE(CV_32S,2)
+#define CV_32SC3 CV_MAKETYPE(CV_32S,3)
+#define CV_32SC4 CV_MAKETYPE(CV_32S,4)
+#define CV_32SC(n) CV_MAKETYPE(CV_32S,(n))
+
+#define CV_32FC1 CV_MAKETYPE(CV_32F,1)
+#define CV_32FC2 CV_MAKETYPE(CV_32F,2)
+#define CV_32FC3 CV_MAKETYPE(CV_32F,3)
+#define CV_32FC4 CV_MAKETYPE(CV_32F,4)
+#define CV_32FC(n) CV_MAKETYPE(CV_32F,(n))
+
+#define CV_64FC1 CV_MAKETYPE(CV_64F,1)
+#define CV_64FC2 CV_MAKETYPE(CV_64F,2)
+#define CV_64FC3 CV_MAKETYPE(CV_64F,3)
+#define CV_64FC4 CV_MAKETYPE(CV_64F,4)
+#define CV_64FC(n) CV_MAKETYPE(CV_64F,(n))
+
+#define CV_16FC1 CV_MAKETYPE(CV_16F,1)
+#define CV_16FC2 CV_MAKETYPE(CV_16F,2)
+#define CV_16FC3 CV_MAKETYPE(CV_16F,3)
+#define CV_16FC4 CV_MAKETYPE(CV_16F,4)
+#define CV_16FC(n) CV_MAKETYPE(CV_16F,(n))
+//! @}
+
+//! @name Comparison operation
+//! @sa cv::CmpTypes
+//! @{
+#define CV_HAL_CMP_EQ 0
+#define CV_HAL_CMP_GT 1
+#define CV_HAL_CMP_GE 2
+#define CV_HAL_CMP_LT 3
+#define CV_HAL_CMP_LE 4
+#define CV_HAL_CMP_NE 5
+//! @}
+
+//! @name Border processing modes
+//! @sa cv::BorderTypes
+//! @{
+#define CV_HAL_BORDER_CONSTANT 0
+#define CV_HAL_BORDER_REPLICATE 1
+#define CV_HAL_BORDER_REFLECT 2
+#define CV_HAL_BORDER_WRAP 3
+#define CV_HAL_BORDER_REFLECT_101 4
+#define CV_HAL_BORDER_TRANSPARENT 5
+#define CV_HAL_BORDER_ISOLATED 16
+//! @}
+
+//! @name DFT flags
+//! @{
+#define CV_HAL_DFT_INVERSE        1
+#define CV_HAL_DFT_SCALE          2
+#define CV_HAL_DFT_ROWS           4
+#define CV_HAL_DFT_COMPLEX_OUTPUT 16
+#define CV_HAL_DFT_REAL_OUTPUT    32
+#define CV_HAL_DFT_TWO_STAGE      64
+#define CV_HAL_DFT_STAGE_COLS    128
+#define CV_HAL_DFT_IS_CONTINUOUS 512
+#define CV_HAL_DFT_IS_INPLACE 1024
+//! @}
+
+//! @name SVD flags
+//! @{
+#define CV_HAL_SVD_NO_UV    1
+#define CV_HAL_SVD_SHORT_UV 2
+#define CV_HAL_SVD_MODIFY_A 4
+#define CV_HAL_SVD_FULL_UV  8
+//! @}
+
+//! @name Gemm flags
+//! @{
+#define CV_HAL_GEMM_1_T 1
+#define CV_HAL_GEMM_2_T 2
+#define CV_HAL_GEMM_3_T 4
+//! @}
+
+//! @}
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin.hpp
new file mode 100644
index 000000000000..27beccd9ab9f
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin.hpp
@@ -0,0 +1,1256 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HAL_INTRIN_HPP
+#define OPENCV_HAL_INTRIN_HPP
+
+#include <cmath>
+#include <float.h>
+#include <stdlib.h>
+#include "opencv2/core/cvdef.h"
+
+#if defined(__GNUC__) && __GNUC__ == 12
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
+#define OPENCV_HAL_ADD(a, b) ((a) + (b))
+#define OPENCV_HAL_AND(a, b) ((a) & (b))
+#define OPENCV_HAL_NOP(a) (a)
+#define OPENCV_HAL_1ST(a, b) (a)
+
+namespace {
+// Index of the lowest set bit of value (number of trailing zero bits). The result for value == 0 is not the same in every branch (e.g. 32 in the clang-cl branch), so pass a non-zero value.
+inline unsigned int trailingZeros32(unsigned int value) {
+#if defined(_MSC_VER)
+#if (_MSC_VER < 1700) || defined(_M_ARM) || defined(_M_ARM64)
+    unsigned long index = 0;
+    _BitScanForward(&index, value);
+    return (unsigned int)index;
+#elif defined(__clang__)
+    // clang-cl doesn't export _tzcnt_u32 for non BMI systems
+    return value ? __builtin_ctz(value) : 32;
+#else
+    return _tzcnt_u32(value);
+#endif
+#elif defined(__GNUC__) || defined(__GNUG__)
+    return __builtin_ctz(value);
+#elif defined(__ICC) || defined(__INTEL_COMPILER)
+    return _bit_scan_forward(value);
+#elif defined(__clang__)
+    return __builtin_ctz(value); // replaces llvm.cttz.i32(value, true), an LLVM IR intrinsic name that is not callable from C++
+#else
+    // portable fallback: isolate the lowest set bit (value & -value), then look its position up via a De Bruijn multiplication
+    static const int MultiplyDeBruijnBitPosition[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
+    return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27];
+#endif
+}} // trailingZeros32, anonymous namespace
+
+// Unlike the HAL API, which lives in cv::hal,
+// the intrinsics are put directly into the cv namespace
+// to make them easier to access from within OpenCV code.
+namespace cv {
+
+namespace hal {
+
+enum StoreMode
+{
+    STORE_UNALIGNED = 0,
+    STORE_ALIGNED = 1,
+    STORE_ALIGNED_NOCACHE = 2
+};
+
+}
+
+// TODO FIXIT: Don't use "God" traits. Split on separate cases.
+template<typename _Tp> struct V_TypeTraits
+{
+};
+
+#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_) \
+    template<> struct V_TypeTraits<type> \
+    { \
+        typedef type       value_type; \
+        typedef int_type_  int_type; \
+        typedef uint_type_ uint_type; \
+        typedef abs_type_  abs_type; \
+        typedef w_type_    w_type; \
+        typedef q_type_    q_type; \
+        typedef sum_type_  sum_type; \
+    \
+        /* value -> int_type: write one union member, read back the other */ \
+        static inline int_type reinterpret_int(type x) \
+        { \
+            union { type as_value; int_type as_int; } pun; \
+            pun.as_value = x; return pun.as_int; \
+        } \
+    \
+        /* int_type -> value: the same union used in the opposite direction */ \
+        static inline type reinterpret_from_int(int_type x) \
+        { \
+            union { type as_value; int_type as_int; } pun; \
+            pun.as_int = x; return pun.as_value; \
+        } \
+    }
+
+#define CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(type, int_type_, uint_type_, abs_type_, w_type_, sum_type_) \
+    template<> struct V_TypeTraits<type> \
+    { \
+        typedef type       value_type; \
+        typedef int_type_  int_type; \
+        typedef uint_type_ uint_type; \
+        typedef abs_type_  abs_type; \
+        typedef w_type_    w_type; \
+        typedef sum_type_  sum_type; \
+    \
+        /* value -> int_type: write one union member, read back the other */ \
+        static inline int_type reinterpret_int(type x) \
+        { \
+            union { type as_value; int_type as_int; } pun; \
+            pun.as_value = x; return pun.as_int; \
+        } \
+    \
+        /* int_type -> value: the same union used in the opposite direction */ \
+        static inline type reinterpret_from_int(int_type x) \
+        { \
+            union { type as_value; int_type as_int; } pun; \
+            pun.as_int = x; return pun.as_value; \
+        } \
+    }
+
+CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned);
+CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int);
+CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned);
+CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(uint64, int64, uint64, uint64, void, uint64);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int64, int64, uint64, uint64, void, int64);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double);
+
+#ifndef CV_DOXYGEN
+
+#ifndef CV_CPU_OPTIMIZATION_HAL_NAMESPACE
+#ifdef CV_FORCE_SIMD128_CPP
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_EMULATOR_CPP
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_EMULATOR_CPP {
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
+#elif defined(CV_CPU_DISPATCH_MODE)
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
+#else
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline {
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
+#endif
+#endif // CV_CPU_OPTIMIZATION_HAL_NAMESPACE
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
+#endif
+}
+
+#ifdef CV_DOXYGEN
+#   undef CV_AVX2
+#   undef CV_SSE2
+#   undef CV_NEON
+#   undef CV_VSX
+#   undef CV_FP16
+#   undef CV_MSA
+#   undef CV_RVV
+#endif
+
+#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_LSX) && !defined(CV_FORCE_SIMD128_CPP)
+#define CV__SIMD_FORWARD 128
+#include "opencv2/core/hal/intrin_forward.hpp"
+#endif
+
+#if CV_SSE2 && !defined(CV_FORCE_SIMD128_CPP)
+
+#include "opencv2/core/hal/intrin_sse_em.hpp"
+#include "opencv2/core/hal/intrin_sse.hpp"
+
+#elif CV_NEON && !defined(CV_FORCE_SIMD128_CPP)
+
+#include "opencv2/core/hal/intrin_neon.hpp"
+
+#elif CV_RVV071 && !defined(CV_FORCE_SIMD128_CPP)
+#define CV_SIMD128_CPP 0
+#include "opencv2/core/hal/intrin_rvv071.hpp"
+
+#elif CV_VSX && !defined(CV_FORCE_SIMD128_CPP)
+
+#include "opencv2/core/hal/intrin_vsx.hpp"
+
+#elif CV_MSA && !defined(CV_FORCE_SIMD128_CPP)
+
+#include "opencv2/core/hal/intrin_msa.hpp"
+
+#elif CV_WASM_SIMD && !defined(CV_FORCE_SIMD128_CPP)
+#include "opencv2/core/hal/intrin_wasm.hpp"
+
+#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP)
+#if defined(CV_RVV_SCALABLE)
+#include "opencv2/core/hal/intrin_rvv_scalable.hpp"
+#else
+#include "opencv2/core/hal/intrin_rvv.hpp"
+#endif
+
+#elif CV_LSX && !defined(CV_FORCE_SIMD128_CPP)
+
+#include "opencv2/core/hal/intrin_lsx.hpp"
+
+#else
+
+#include "opencv2/core/hal/intrin_cpp.hpp"
+
+#endif
+
+// AVX2 can be used together with SSE2, so
+// we define those two sets of intrinsics at once.
+// Most of the intrinsics do not conflict (the proper overloaded variant is
+// resolved by the argument types, e.g. v_float32x4 ~ SSE2, v_float32x8 ~ AVX2),
+// but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load().
+// Correspondingly, the wide intrinsics (which are mapped to the "widest"
+// available instruction set) will get vx_ prefix
+// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())
+#if CV_AVX2
+
+#define CV__SIMD_FORWARD 256
+#include "opencv2/core/hal/intrin_forward.hpp"
+#include "opencv2/core/hal/intrin_avx.hpp"
+
+#endif
+
+// AVX512 can be used together with SSE2 and AVX2, so
+// we define those sets of intrinsics at once.
+// Some of the AVX512 intrinsics get the v512_ prefix instead of v_, e.g. v512_load() vs v_load().
+// Wide intrinsics will be mapped to their v512_ counterparts in this case (e.g. vx_load() => v512_load()).
+#if CV_AVX512_SKX
+
+#define CV__SIMD_FORWARD 512
+#include "opencv2/core/hal/intrin_forward.hpp"
+#include "opencv2/core/hal/intrin_avx512.hpp"
+
+#endif
+
+#if CV_LASX
+
+#define CV__SIMD_FORWARD 256
+#include "opencv2/core/hal/intrin_forward.hpp"
+#include "opencv2/core/hal/intrin_lasx.hpp"
+
+#endif
+
+//! @cond IGNORED
+
+namespace cv {
+
+#ifndef CV_DOXYGEN
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+#endif
+
+#ifndef CV_SIMD128
+#define CV_SIMD128 0
+#endif
+
+#ifndef CV_SIMD128_CPP
+#define CV_SIMD128_CPP 0
+#endif
+
+#ifndef CV_SIMD128_64F
+#define CV_SIMD128_64F 0
+#endif
+
+#ifndef CV_SIMD256
+#define CV_SIMD256 0
+#endif
+
+#ifndef CV_SIMD256_64F
+#define CV_SIMD256_64F 0
+#endif
+
+#ifndef CV_SIMD512
+#define CV_SIMD512 0
+#endif
+
+#ifndef CV_SIMD512_64F
+#define CV_SIMD512_64F 0
+#endif
+
+#ifndef CV_SIMD128_FP16
+#define CV_SIMD128_FP16 0
+#endif
+
+#ifndef CV_SIMD256_FP16
+#define CV_SIMD256_FP16 0
+#endif
+
+#ifndef CV_SIMD512_FP16
+#define CV_SIMD512_FP16 0
+#endif
+
+#ifndef CV_SIMD_SCALABLE
+#define CV_SIMD_SCALABLE 0
+#endif
+
+#ifndef CV_SIMD_SCALABLE_64F
+#define CV_SIMD_SCALABLE_64F 0
+#endif
+
+//==================================================================================================
+
+template<typename _Tp> struct V_RegTraits
+{
+};
+
+#define CV_DEF_REG_TRAITS(prefix, _reg, lane_type, suffix, _u_reg, _w_reg, _q_reg, _int_reg, _round_reg) /* Specializes V_RegTraits<_reg>; prefix, lane_type and suffix are accepted but not used in the body. */ \
+    template<> struct V_RegTraits<_reg> \
+    { \
+        typedef _reg reg;             /* the register type itself */ \
+        typedef _u_reg u_reg;         /* e.g. v_uint8x16 for v_int8x16 */ \
+        typedef _w_reg w_reg;         /* e.g. v_int16x8 for v_int8x16; void where none is given */ \
+        typedef _q_reg q_reg;         /* e.g. v_int32x4 for v_int8x16; void where none is given */ \
+        typedef _int_reg int_reg;     /* e.g. v_int32x4 for v_float32x4 */ \
+        typedef _round_reg round_reg; /* e.g. v_int32x4 for v_float32x4; void for the integer registers */ \
+    }
+
+#if CV_SIMD128 || CV_SIMD128_CPP
+    CV_DEF_REG_TRAITS(v, v_uint8x16, uchar, u8, v_uint8x16, v_uint16x8, v_uint32x4, v_int8x16, void);
+    CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void);
+    CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void);
+    CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
+    CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
+    CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
+#if CV_SIMD128_64F || CV_SIMD128_CPP
+    CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4);
+#else
+    CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4);
+#endif
+    CV_DEF_REG_TRAITS(v, v_uint64x2, uint64, u64, v_uint64x2, void, void, v_int64x2, void);
+    CV_DEF_REG_TRAITS(v, v_int64x2, int64, s64, v_uint64x2, void, void, v_int64x2, void);
+#if CV_SIMD128_64F
+    CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4);
+#endif
+#endif
+
+#if CV_SIMD256
+    CV_DEF_REG_TRAITS(v256, v_uint8x32, uchar, u8, v_uint8x32, v_uint16x16, v_uint32x8, v_int8x32, void);
+    CV_DEF_REG_TRAITS(v256, v_int8x32, schar, s8, v_uint8x32, v_int16x16, v_int32x8, v_int8x32, void);
+    CV_DEF_REG_TRAITS(v256, v_uint16x16, ushort, u16, v_uint16x16, v_uint32x8, v_uint64x4, v_int16x16, void);
+    CV_DEF_REG_TRAITS(v256, v_int16x16, short, s16, v_uint16x16, v_int32x8, v_int64x4, v_int16x16, void);
+    CV_DEF_REG_TRAITS(v256, v_uint32x8, unsigned, u32, v_uint32x8, v_uint64x4, void, v_int32x8, void);
+    CV_DEF_REG_TRAITS(v256, v_int32x8, int, s32, v_uint32x8, v_int64x4, void, v_int32x8, void);
+    CV_DEF_REG_TRAITS(v256, v_float32x8, float, f32, v_float32x8, v_float64x4, void, v_int32x8, v_int32x8);
+    CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void);
+    CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void);
+    CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
+#endif
+
+#if CV_SIMD512
+    CV_DEF_REG_TRAITS(v512, v_uint8x64, uchar, u8, v_uint8x64, v_uint16x32, v_uint32x16, v_int8x64, void);
+    CV_DEF_REG_TRAITS(v512, v_int8x64, schar, s8, v_uint8x64, v_int16x32, v_int32x16, v_int8x64, void);
+    CV_DEF_REG_TRAITS(v512, v_uint16x32, ushort, u16, v_uint16x32, v_uint32x16, v_uint64x8, v_int16x32, void);
+    CV_DEF_REG_TRAITS(v512, v_int16x32, short, s16, v_uint16x32, v_int32x16, v_int64x8, v_int16x32, void);
+    CV_DEF_REG_TRAITS(v512, v_uint32x16, unsigned, u32, v_uint32x16, v_uint64x8, void, v_int32x16, void);
+    CV_DEF_REG_TRAITS(v512, v_int32x16, int, s32, v_uint32x16, v_int64x8, void, v_int32x16, void);
+    CV_DEF_REG_TRAITS(v512, v_float32x16, float, f32, v_float32x16, v_float64x8, void, v_int32x16, v_int32x16);
+    CV_DEF_REG_TRAITS(v512, v_uint64x8, uint64, u64, v_uint64x8, void, void, v_int64x8, void);
+    CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void);
+    CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16);
+#endif
+#if CV_SIMD_SCALABLE
+    CV_DEF_REG_TRAITS(v, v_uint8, uchar, u8, v_uint8, v_uint16, v_uint32, v_int8, void);
+    CV_DEF_REG_TRAITS(v, v_int8, schar, s8, v_uint8, v_int16, v_int32, v_int8, void);
+    CV_DEF_REG_TRAITS(v, v_uint16, ushort, u16, v_uint16, v_uint32, v_uint64, v_int16, void);
+    CV_DEF_REG_TRAITS(v, v_int16, short, s16, v_uint16, v_int32, v_int64, v_int16, void);
+    CV_DEF_REG_TRAITS(v, v_uint32, unsigned, u32, v_uint32, v_uint64, void, v_int32, void);
+    CV_DEF_REG_TRAITS(v, v_int32, int, s32, v_uint32, v_int64, void, v_int32, void);
+    CV_DEF_REG_TRAITS(v, v_float32, float, f32, v_float32, v_float64, void, v_int32, v_int32);
+    CV_DEF_REG_TRAITS(v, v_uint64, uint64, u64, v_uint64, void, void, v_int64, void);
+    CV_DEF_REG_TRAITS(v, v_int64, int64, s64, v_uint64, void, void, v_int64, void);
+    CV_DEF_REG_TRAITS(v, v_float64, double, f64, v_float64, void, void, v_int64, v_int32);
+#endif
+//! @endcond
+
+#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
+#define CV__SIMD_NAMESPACE simd512
+namespace CV__SIMD_NAMESPACE {
+    #define CV_SIMD 1
+    #define CV_SIMD_64F CV_SIMD512_64F
+    #define CV_SIMD_FP16 CV_SIMD512_FP16
+    #define CV_SIMD_WIDTH 64
+//! @addtogroup core_hal_intrin
+//! @{
+    //! @brief Maximum available vector register capacity 8-bit unsigned integer values
+    typedef v_uint8x64    v_uint8;
+    //! @brief Maximum available vector register capacity 8-bit signed integer values
+    typedef v_int8x64     v_int8;
+    //! @brief Maximum available vector register capacity 16-bit unsigned integer values
+    typedef v_uint16x32   v_uint16;
+    //! @brief Maximum available vector register capacity 16-bit signed integer values
+    typedef v_int16x32    v_int16;
+    //! @brief Maximum available vector register capacity 32-bit unsigned integer values
+    typedef v_uint32x16   v_uint32;
+    //! @brief Maximum available vector register capacity 32-bit signed integer values
+    typedef v_int32x16    v_int32;
+    //! @brief Maximum available vector register capacity 64-bit unsigned integer values
+    typedef v_uint64x8    v_uint64;
+    //! @brief Maximum available vector register capacity 64-bit signed integer values
+    typedef v_int64x8     v_int64;
+    //! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
+    typedef v_float32x16  v_float32;
+    #if CV_SIMD512_64F
+    //! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
+    typedef v_float64x8   v_float64;
+    #endif
+//! @}
+
+    #define VXPREFIX(func) v512##func
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
+#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
+#define CV__SIMD_NAMESPACE simd256
+namespace CV__SIMD_NAMESPACE {
+    #define CV_SIMD 1
+    #define CV_SIMD_64F CV_SIMD256_64F
+    #define CV_SIMD_FP16 CV_SIMD256_FP16
+    #define CV_SIMD_WIDTH 32
+//! @addtogroup core_hal_intrin
+//! @{
+    //! @brief Maximum available vector register capacity 8-bit unsigned integer values
+    typedef v_uint8x32   v_uint8;
+    //! @brief Maximum available vector register capacity 8-bit signed integer values
+    typedef v_int8x32    v_int8;
+    //! @brief Maximum available vector register capacity 16-bit unsigned integer values
+    typedef v_uint16x16  v_uint16;
+    //! @brief Maximum available vector register capacity 16-bit signed integer values
+    typedef v_int16x16   v_int16;
+    //! @brief Maximum available vector register capacity 32-bit unsigned integer values
+    typedef v_uint32x8   v_uint32;
+    //! @brief Maximum available vector register capacity 32-bit signed integer values
+    typedef v_int32x8    v_int32;
+    //! @brief Maximum available vector register capacity 64-bit unsigned integer values
+    typedef v_uint64x4   v_uint64;
+    //! @brief Maximum available vector register capacity 64-bit signed integer values
+    typedef v_int64x4    v_int64;
+    //! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
+    typedef v_float32x8  v_float32;
+    #if CV_SIMD256_64F
+    //! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
+    typedef v_float64x4  v_float64;
+    #endif
+//! @}
+
+    #define VXPREFIX(func) v256##func
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
+#elif (CV_SIMD128 || CV_SIMD128_CPP) && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
+#if defined CV_SIMD128_CPP // NOTE(review): CV_SIMD128_CPP is defaulted to 0 earlier in this file when undefined, so this test looks always true and the simd128 name unreachable - confirm whether "#if CV_SIMD128_CPP" was intended
+#define CV__SIMD_NAMESPACE simd128_cpp
+#else
+#define CV__SIMD_NAMESPACE simd128
+#endif
+namespace CV__SIMD_NAMESPACE {
+    #define CV_SIMD CV_SIMD128
+    #define CV_SIMD_64F CV_SIMD128_64F
+    #define CV_SIMD_WIDTH 16
+//! @addtogroup core_hal_intrin
+//! @{
+    //! @brief Maximum available vector register capacity 8-bit unsigned integer values
+    typedef v_uint8x16  v_uint8;
+    //! @brief Maximum available vector register capacity 8-bit signed integer values
+    typedef v_int8x16   v_int8;
+    //! @brief Maximum available vector register capacity 16-bit unsigned integer values
+    typedef v_uint16x8  v_uint16;
+    //! @brief Maximum available vector register capacity 16-bit signed integer values
+    typedef v_int16x8   v_int16;
+    //! @brief Maximum available vector register capacity 32-bit unsigned integer values
+    typedef v_uint32x4  v_uint32;
+    //! @brief Maximum available vector register capacity 32-bit signed integer values
+    typedef v_int32x4   v_int32;
+    //! @brief Maximum available vector register capacity 64-bit unsigned integer values
+    typedef v_uint64x2  v_uint64;
+    //! @brief Maximum available vector register capacity 64-bit signed integer values
+    typedef v_int64x2   v_int64;
+    //! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
+    typedef v_float32x4 v_float32;
+    #if CV_SIMD128_64F
+    //! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
+    typedef v_float64x2 v_float64;
+    #endif
+//! @}
+
+    #define VXPREFIX(func) v##func
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
+
+#elif CV_SIMD_SCALABLE
+#define CV__SIMD_NAMESPACE simd
+namespace CV__SIMD_NAMESPACE {
+    #define CV_SIMD 0
+    #define CV_SIMD_WIDTH 128  /* 1024/8 */
+
+    #define VXPREFIX(func) v##func
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
+
+#endif
+
+//! @cond IGNORED
+#ifndef CV_SIMD_64F
+#define CV_SIMD_64F 0
+#endif
+
+namespace CV__SIMD_NAMESPACE {
+//! @addtogroup core_hal_intrin
+//! @{
+    //! @name Wide init with value
+    //! @{
+    //! @brief Create maximum available capacity vector with elements set to a specific value
+    inline v_uint8 vx_setall_u8(uchar v) { return VXPREFIX(_setall_u8)(v); }
+    inline v_int8 vx_setall_s8(schar v) { return VXPREFIX(_setall_s8)(v); }
+    inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); }
+    inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); }
+    inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); }
+    inline v_uint32 vx_setall_u32(unsigned v) { return VXPREFIX(_setall_u32)(v); }
+    inline v_float32 vx_setall_f32(float v) { return VXPREFIX(_setall_f32)(v); }
+    inline v_int64 vx_setall_s64(int64 v) { return VXPREFIX(_setall_s64)(v); }
+    inline v_uint64 vx_setall_u64(uint64 v) { return VXPREFIX(_setall_u64)(v); }
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
+    inline v_float64 vx_setall_f64(double v) { return VXPREFIX(_setall_f64)(v); }
+#endif
+    //! @}
+
+    //! @name Wide init with zero
+    //! @{
+    //! @brief Create maximum available capacity vector with elements set to zero
+    inline v_uint8 vx_setzero_u8() { return VXPREFIX(_setzero_u8)(); }
+    inline v_int8 vx_setzero_s8() { return VXPREFIX(_setzero_s8)(); }
+    inline v_uint16 vx_setzero_u16() { return VXPREFIX(_setzero_u16)(); }
+    inline v_int16 vx_setzero_s16() { return VXPREFIX(_setzero_s16)(); }
+    inline v_int32 vx_setzero_s32() { return VXPREFIX(_setzero_s32)(); }
+    inline v_uint32 vx_setzero_u32() { return VXPREFIX(_setzero_u32)(); }
+    inline v_float32 vx_setzero_f32() { return VXPREFIX(_setzero_f32)(); }
+    inline v_int64 vx_setzero_s64() { return VXPREFIX(_setzero_s64)(); }
+    inline v_uint64 vx_setzero_u64() { return VXPREFIX(_setzero_u64)(); }
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
+    inline v_float64 vx_setzero_f64() { return VXPREFIX(_setzero_f64)(); }
+#endif
+    //! @}
+
+    //! @name Wide load from memory
+    //! @{
+    //! @brief Load maximum available capacity register contents from memory
+    inline v_uint8 vx_load(const uchar * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_int8 vx_load(const schar * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_uint32 vx_load(const unsigned * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_float32 vx_load(const float * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_int64 vx_load(const int64 * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_uint64 vx_load(const uint64 * ptr) { return VXPREFIX(_load)(ptr); }
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
+    inline v_float64 vx_load(const double * ptr) { return VXPREFIX(_load)(ptr); }
+#endif
+    //! @}
+
+    //! @name Wide load from memory(aligned)
+    //! @{
+    //! @brief Load maximum available capacity register contents from memory(aligned)
+    inline v_uint8 vx_load_aligned(const uchar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_int8 vx_load_aligned(const schar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_uint32 vx_load_aligned(const unsigned * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_float32 vx_load_aligned(const float * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_int64 vx_load_aligned(const int64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_uint64 vx_load_aligned(const uint64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
+    inline v_float64 vx_load_aligned(const double * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+#endif
+    //! @}
+
+    //! @name Wide load lower half from memory
+    //! @{
+    //! @brief Load lower half of maximum available capacity register from memory; every overload forwards to the function named by VXPREFIX(_load_low)
+    inline v_uint8 vx_load_low(const uchar * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_int8 vx_load_low(const schar * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_uint32 vx_load_low(const unsigned * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_float32 vx_load_low(const float * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_int64 vx_load_low(const int64 * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_uint64 vx_load_low(const uint64 * ptr) { return VXPREFIX(_load_low)(ptr); }
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F // the double overload is compiled only when one of these flags is non-zero
+    inline v_float64 vx_load_low(const double * ptr) { return VXPREFIX(_load_low)(ptr); }
+#endif // CV_SIMD_64F || CV_SIMD_SCALABLE_64F
+    //! @}
+
+    //! @name Wide load halves from memory
+    //! @{
+    //! @brief Load maximum available capacity register contents from two memory blocks; ptr0 and ptr1 are passed, in that order, to the function named by VXPREFIX(_load_halves)
+    inline v_uint8 vx_load_halves(const uchar * ptr0, const uchar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_int8 vx_load_halves(const schar * ptr0, const schar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_uint32 vx_load_halves(const unsigned * ptr0, const unsigned * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_float32 vx_load_halves(const float * ptr0, const float * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_int64 vx_load_halves(const int64 * ptr0, const int64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_uint64 vx_load_halves(const uint64 * ptr0, const uint64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F // the double overload is compiled only when one of these flags is non-zero
+    inline v_float64 vx_load_halves(const double * ptr0, const double * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+#endif // CV_SIMD_64F || CV_SIMD_SCALABLE_64F
+    //! @}
+
+    //! @name Wide LUT of elements
+    //! @{
+    //! @brief Load maximum available capacity register contents with elements of array ptr selected by the indexes in idx; forwards to the function named by VXPREFIX(_lut)
+    inline v_uint8 vx_lut(const uchar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_int8 vx_lut(const schar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_uint32 vx_lut(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_float32 vx_lut(const float* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_int64 vx_lut(const int64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_uint64 vx_lut(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F // the double overload is compiled only when one of these flags is non-zero
+    inline v_float64 vx_lut(const double* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+#endif // CV_SIMD_64F || CV_SIMD_SCALABLE_64F
+    //! @}
+
+    //! @name Wide LUT of element pairs
+    //! @{
+    //! @brief Load maximum available capacity register contents with pairs of elements of array ptr selected by the indexes in idx; forwards to the function named by VXPREFIX(_lut_pairs)
+    inline v_uint8 vx_lut_pairs(const uchar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_int8 vx_lut_pairs(const schar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_uint32 vx_lut_pairs(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_float32 vx_lut_pairs(const float* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_int64 vx_lut_pairs(const int64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_uint64 vx_lut_pairs(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F // the double overload is compiled only when one of these flags is non-zero
+    inline v_float64 vx_lut_pairs(const double* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+#endif // CV_SIMD_64F || CV_SIMD_SCALABLE_64F
+    //! @}
+
+    //! @name Wide LUT of element quads
+    //! @{
+    //! @brief Load maximum available capacity register contents with quads of elements of array ptr selected by the indexes in idx (overloads are provided here for 8/16/32-bit element types only)
+    inline v_uint8 vx_lut_quads(const uchar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_int8 vx_lut_quads(const schar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_uint16 vx_lut_quads(const ushort* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_int16 vx_lut_quads(const short* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_int32 vx_lut_quads(const int* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_uint32 vx_lut_quads(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_float32 vx_lut_quads(const float* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    //! @}
+
+    //! @name Wide load with double expansion
+    //! @{
+    //! @brief Load maximum available capacity register contents from memory with double expansion: the returned vector type is the next wider one (e.g. uchar -> v_uint16, int -> v_int64, hfloat -> v_float32)
+    inline v_uint16 vx_load_expand(const uchar * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_int16 vx_load_expand(const schar * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_uint32 vx_load_expand(const ushort * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_int32 vx_load_expand(const short* ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_int64 vx_load_expand(const int* ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_uint64 vx_load_expand(const unsigned* ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_float32 vx_load_expand(const hfloat * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    //! @}
+
+    //! @name Wide load with quad expansion
+    //! @{
+    //! @brief Load maximum available capacity register contents from memory with quad expansion: 8-bit source elements, 32-bit result vectors (uchar -> v_uint32, schar -> v_int32)
+    inline v_uint32 vx_load_expand_q(const uchar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
+    inline v_int32 vx_load_expand_q(const schar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
+    //! @}
+
+    /** @brief SIMD processing state cleanup call; forwards to the function named by VXPREFIX(_cleanup) */
+    inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
+
+#if !CV_SIMD_SCALABLE && !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP))
+    // Compatibility layer: named wrappers (v_add, v_and, v_eq, ...) implemented with the vector types' operators and members
+
+    template<typename T> struct VTraits { // compile-time traits read from vector type T
+        static inline int vlanes() { return T::nlanes; } // lane count as a callable
+        enum { nlanes = T::nlanes, max_nlanes = T::nlanes }; // in this layer both constants mirror T::nlanes
+        using lane_type = typename T::lane_type; // scalar element type of T
+    };
+
+    #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) /* v_add/v_sub as named forms of operators + and -, plus a variadic v_add */ \
+    inline _Tpvec v_add(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a + b; \
+    } \
+    inline _Tpvec v_sub(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a - b; \
+    } \
+    template<typename... Args> \
+    inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
+        return v_add(f1 + f2, vf...); /* add the first two operands, then recurse on the remaining ones */ \
+    }
+    #define OPENCV_HAL_WRAP_SHIFT_OP(_Tpvec) /* v_shr/v_shl as named forms of operators >> and << with shift count n */ \
+    inline _Tpvec v_shr(const _Tpvec& a, int n) \
+    { \
+        return a >> n; \
+    } \
+    inline _Tpvec v_shl(const _Tpvec& a, int n) \
+    { \
+        return a << n; \
+    }
+
+    OPENCV_HAL_WRAP_SHIFT_OP(v_uint16) // shift wrappers are instantiated for 16/32/64-bit integer vectors only
+    OPENCV_HAL_WRAP_SHIFT_OP(v_uint32)
+    OPENCV_HAL_WRAP_SHIFT_OP(v_uint64)
+    OPENCV_HAL_WRAP_SHIFT_OP(v_int16)
+    OPENCV_HAL_WRAP_SHIFT_OP(v_int32)
+    OPENCV_HAL_WRAP_SHIFT_OP(v_int64)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
+    #endif
+    #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
+    // when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x8)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x4)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x2)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_int16x8)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_int32x4)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_int64x2)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
+        #endif
+    #endif // CV_SIMD_WIDTH != 16 && CV_SIMD128
+    #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
+    // when we use CV_SIMD256 with 512 bit SIMD (e.g. AVX512)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x16)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x8)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x4)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_int16x16)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_int32x8)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_int64x4)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
+        #endif
+    #endif // CV_SIMD_WIDTH != 32 && CV_SIMD256
+
+    #define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) /* v_and/v_or/v_xor as named forms of operators &, | and ^ */ \
+    inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a & b; \
+    } \
+    inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a | b; \
+    } \
+    inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a ^ b; \
+    }
+
+    #define OPENCV_HAL_WRAP_NOT_OP(_Tpvec) /* v_not as the named form of operator ~ */ \
+    inline _Tpvec v_not(const _Tpvec& a) \
+    { \
+        return ~a; \
+    }
+
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32)
+    OPENCV_HAL_WRAP_NOT_OP(v_uint8) // v_not is instantiated for integer vectors only (no float32/float64 entries)
+    OPENCV_HAL_WRAP_NOT_OP(v_uint16)
+    OPENCV_HAL_WRAP_NOT_OP(v_uint32)
+    OPENCV_HAL_WRAP_NOT_OP(v_uint64)
+    OPENCV_HAL_WRAP_NOT_OP(v_int8)
+    OPENCV_HAL_WRAP_NOT_OP(v_int16)
+    OPENCV_HAL_WRAP_NOT_OP(v_int32)
+    OPENCV_HAL_WRAP_NOT_OP(v_int64)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64)
+    #endif
+    #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 // 128-bit types used alongside wider (256/512 bit) SIMD
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x4)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x2)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x16)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x4)
+        OPENCV_HAL_WRAP_NOT_OP(v_uint8x16)
+        OPENCV_HAL_WRAP_NOT_OP(v_uint16x8)
+        OPENCV_HAL_WRAP_NOT_OP(v_uint32x4)
+        OPENCV_HAL_WRAP_NOT_OP(v_uint64x2)
+        OPENCV_HAL_WRAP_NOT_OP(v_int8x16)
+        OPENCV_HAL_WRAP_NOT_OP(v_int16x8)
+        OPENCV_HAL_WRAP_NOT_OP(v_int32x4)
+        OPENCV_HAL_WRAP_NOT_OP(v_int64x2)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x2)
+        #endif
+    #endif // CV_SIMD_WIDTH != 16 && CV_SIMD128
+    #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 // 256-bit types used alongside 512 bit SIMD
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x16)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x8)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x4)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x32)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x8)
+        OPENCV_HAL_WRAP_NOT_OP(v_uint8x32)
+        OPENCV_HAL_WRAP_NOT_OP(v_uint16x16)
+        OPENCV_HAL_WRAP_NOT_OP(v_uint32x8)
+        OPENCV_HAL_WRAP_NOT_OP(v_uint64x4)
+        OPENCV_HAL_WRAP_NOT_OP(v_int8x32)
+        OPENCV_HAL_WRAP_NOT_OP(v_int16x16)
+        OPENCV_HAL_WRAP_NOT_OP(v_int32x8)
+        OPENCV_HAL_WRAP_NOT_OP(v_int64x4)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x4)
+        #endif
+    #endif // CV_SIMD_WIDTH != 32 && CV_SIMD256
+
+    #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) /* v_mul as the named form of operator *, plus a variadic v_mul */ \
+    inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a * b; \
+    } \
+    template<typename... Args> \
+    inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
+        return v_mul(f1 * f2, vf...); /* multiply the first two operands, then recurse on the remaining ones */ \
+    }
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8) // note: no 64-bit integer vectors appear in the v_mul instantiation lists
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
+    #endif
+    #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 // 128-bit types used alongside wider (256/512 bit) SIMD
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2)
+        #endif
+    #endif // CV_SIMD_WIDTH != 16 && CV_SIMD128
+    #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 // 256-bit types used alongside 512 bit SIMD
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4)
+        #endif
+    #endif // CV_SIMD_WIDTH != 32 && CV_SIMD256
+
+    #define OPENCV_HAL_WRAP_BIN_OP_DIV(_Tpvec) /* v_div as the named form of operator / */ \
+    inline _Tpvec v_div(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a / b; \
+    }
+    OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32) // v_div is instantiated for floating-point vectors only
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64)
+    #endif
+    #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 // 128-bit types used alongside wider (256/512 bit) SIMD
+        OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x4)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x2)
+        #endif
+    #endif // CV_SIMD_WIDTH != 16 && CV_SIMD128
+    #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 // 256-bit types used alongside 512 bit SIMD
+        OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x8)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x4)
+        #endif
+    #endif // CV_SIMD_WIDTH != 32 && CV_SIMD256
+
+    #define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) /* defines v_<intrin>(a, b) as "a op b" */ \
+    inline _Tpvec v_##intrin(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a op b; \
+    }
+    #define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) /* v_eq/v_ne only; used below for the 64-bit integer vectors */ \
+    inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a == b; \
+    } \
+    inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a != b; \
+    }
+
+    #define OPENCV_HAL_WRAP_CMP(_Tpvec) /* full set: v_eq, v_ne, v_lt, v_gt, v_le, v_ge */ \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ne, !=) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, lt, <) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, gt, >) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, le, <=) \
+    OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ge, >=)
+
+    OPENCV_HAL_WRAP_CMP(v_uint8)
+    OPENCV_HAL_WRAP_CMP(v_uint16)
+    OPENCV_HAL_WRAP_CMP(v_uint32)
+    OPENCV_HAL_WRAP_EQ_OP(v_uint64) // 64-bit integer vectors get equality wrappers only
+    OPENCV_HAL_WRAP_CMP(v_int8)
+    OPENCV_HAL_WRAP_CMP(v_int16)
+    OPENCV_HAL_WRAP_CMP(v_int32)
+    OPENCV_HAL_WRAP_EQ_OP(v_int64)
+    OPENCV_HAL_WRAP_CMP(v_float32)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_CMP(v_float64)
+    #endif
+    #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 // 128-bit types used alongside wider (256/512 bit) SIMD
+        OPENCV_HAL_WRAP_CMP(v_uint8x16)
+        OPENCV_HAL_WRAP_CMP(v_uint16x8)
+        OPENCV_HAL_WRAP_CMP(v_uint32x4)
+        OPENCV_HAL_WRAP_EQ_OP(v_uint64x2)
+        OPENCV_HAL_WRAP_CMP(v_int8x16)
+        OPENCV_HAL_WRAP_CMP(v_int16x8)
+        OPENCV_HAL_WRAP_CMP(v_int32x4)
+        OPENCV_HAL_WRAP_EQ_OP(v_int64x2)
+        OPENCV_HAL_WRAP_CMP(v_float32x4)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_CMP(v_float64x2)
+        #endif
+    #endif // CV_SIMD_WIDTH != 16 && CV_SIMD128
+    #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 // 256-bit types used alongside 512 bit SIMD
+        OPENCV_HAL_WRAP_CMP(v_uint8x32)
+        OPENCV_HAL_WRAP_CMP(v_uint16x16)
+        OPENCV_HAL_WRAP_CMP(v_uint32x8)
+        OPENCV_HAL_WRAP_EQ_OP(v_uint64x4)
+        OPENCV_HAL_WRAP_CMP(v_int8x32)
+        OPENCV_HAL_WRAP_CMP(v_int16x16)
+        OPENCV_HAL_WRAP_CMP(v_int32x8)
+        OPENCV_HAL_WRAP_EQ_OP(v_int64x4)
+        OPENCV_HAL_WRAP_CMP(v_float32x8)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_CMP(v_float64x4)
+        #endif
+    #endif // CV_SIMD_WIDTH != 32 && CV_SIMD256
+
+    //////////// get0: free-function v_get0(v) that returns v.get0() ////////////
+    #define OPENCV_HAL_WRAP_GRT0(_Tpvec) /* macro name is spelled GRT0; the generated function is v_get0 */ \
+    inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \
+    { \
+        return v.get0(); \
+    }
+
+    OPENCV_HAL_WRAP_GRT0(v_uint8)
+    OPENCV_HAL_WRAP_GRT0(v_int8)
+    OPENCV_HAL_WRAP_GRT0(v_uint16)
+    OPENCV_HAL_WRAP_GRT0(v_int16)
+    OPENCV_HAL_WRAP_GRT0(v_uint32)
+    OPENCV_HAL_WRAP_GRT0(v_int32)
+    OPENCV_HAL_WRAP_GRT0(v_uint64)
+    OPENCV_HAL_WRAP_GRT0(v_int64)
+    OPENCV_HAL_WRAP_GRT0(v_float32)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_GRT0(v_float64)
+    #endif
+    #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 // 128-bit types used alongside wider (256/512 bit) SIMD
+        OPENCV_HAL_WRAP_GRT0(v_uint8x16)
+        OPENCV_HAL_WRAP_GRT0(v_uint16x8)
+        OPENCV_HAL_WRAP_GRT0(v_uint32x4)
+        OPENCV_HAL_WRAP_GRT0(v_uint64x2)
+        OPENCV_HAL_WRAP_GRT0(v_int8x16)
+        OPENCV_HAL_WRAP_GRT0(v_int16x8)
+        OPENCV_HAL_WRAP_GRT0(v_int32x4)
+        OPENCV_HAL_WRAP_GRT0(v_int64x2)
+        OPENCV_HAL_WRAP_GRT0(v_float32x4)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_GRT0(v_float64x2)
+        #endif
+    #endif // CV_SIMD_WIDTH != 16 && CV_SIMD128
+    #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 // 256-bit types used alongside 512 bit SIMD
+        OPENCV_HAL_WRAP_GRT0(v_uint8x32)
+        OPENCV_HAL_WRAP_GRT0(v_uint16x16)
+        OPENCV_HAL_WRAP_GRT0(v_uint32x8)
+        OPENCV_HAL_WRAP_GRT0(v_uint64x4)
+        OPENCV_HAL_WRAP_GRT0(v_int8x32)
+        OPENCV_HAL_WRAP_GRT0(v_int16x16)
+        OPENCV_HAL_WRAP_GRT0(v_int32x8)
+        OPENCV_HAL_WRAP_GRT0(v_int64x4)
+        OPENCV_HAL_WRAP_GRT0(v_float32x8)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_GRT0(v_float64x4)
+        #endif
+    #endif // CV_SIMD_WIDTH != 32 && CV_SIMD256
+
+    #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) /* v_extract_highest: v_extract_n at the last lane index (nlanes-1) */ \
+    inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
+    { \
+        return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
+    }
+
+    OPENCV_HAL_WRAP_EXTRACT(v_uint8)
+    OPENCV_HAL_WRAP_EXTRACT(v_int8)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint16)
+    OPENCV_HAL_WRAP_EXTRACT(v_int16)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint32)
+    OPENCV_HAL_WRAP_EXTRACT(v_int32)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint64)
+    OPENCV_HAL_WRAP_EXTRACT(v_int64)
+    OPENCV_HAL_WRAP_EXTRACT(v_float32)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_EXTRACT(v_float64)
+    #endif
+    #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 // 128-bit types used alongside wider (256/512 bit) SIMD
+        OPENCV_HAL_WRAP_EXTRACT(v_uint8x16)
+        OPENCV_HAL_WRAP_EXTRACT(v_uint16x8)
+        OPENCV_HAL_WRAP_EXTRACT(v_uint32x4)
+        OPENCV_HAL_WRAP_EXTRACT(v_uint64x2)
+        OPENCV_HAL_WRAP_EXTRACT(v_int8x16)
+        OPENCV_HAL_WRAP_EXTRACT(v_int16x8)
+        OPENCV_HAL_WRAP_EXTRACT(v_int32x4)
+        OPENCV_HAL_WRAP_EXTRACT(v_int64x2)
+        OPENCV_HAL_WRAP_EXTRACT(v_float32x4)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_EXTRACT(v_float64x2)
+        #endif
+    #endif // CV_SIMD_WIDTH != 16 && CV_SIMD128
+    #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 // 256-bit types used alongside 512 bit SIMD
+        OPENCV_HAL_WRAP_EXTRACT(v_uint8x32)
+        OPENCV_HAL_WRAP_EXTRACT(v_uint16x16)
+        OPENCV_HAL_WRAP_EXTRACT(v_uint32x8)
+        OPENCV_HAL_WRAP_EXTRACT(v_uint64x4)
+        OPENCV_HAL_WRAP_EXTRACT(v_int8x32)
+        OPENCV_HAL_WRAP_EXTRACT(v_int16x16)
+        OPENCV_HAL_WRAP_EXTRACT(v_int32x8)
+        OPENCV_HAL_WRAP_EXTRACT(v_int64x4)
+        OPENCV_HAL_WRAP_EXTRACT(v_float32x8)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_EXTRACT(v_float64x4)
+        #endif
+    #endif // CV_SIMD_WIDTH != 32 && CV_SIMD256
+
+    #define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) /* v_broadcast_highest: v_broadcast_element at the last lane index (nlanes-1) */ \
+    inline _Tpvec v_broadcast_highest(const _Tpvec& v) \
+    { \
+        return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v); \
+    }
+
+    OPENCV_HAL_WRAP_BROADCAST(v_uint32) // instantiated for 32-bit lane vectors only
+    OPENCV_HAL_WRAP_BROADCAST(v_int32)
+    OPENCV_HAL_WRAP_BROADCAST(v_float32)
+    #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 // 128-bit types used alongside wider (256/512 bit) SIMD
+        OPENCV_HAL_WRAP_BROADCAST(v_uint32x4)
+        OPENCV_HAL_WRAP_BROADCAST(v_int32x4)
+        OPENCV_HAL_WRAP_BROADCAST(v_float32x4)
+    #endif // CV_SIMD_WIDTH != 16 && CV_SIMD128
+    #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 // 256-bit types used alongside 512 bit SIMD
+        OPENCV_HAL_WRAP_BROADCAST(v_uint32x8)
+        OPENCV_HAL_WRAP_BROADCAST(v_int32x8)
+        OPENCV_HAL_WRAP_BROADCAST(v_float32x8)
+    #endif // CV_SIMD_WIDTH != 32 && CV_SIMD256
+
+#endif // !CV_SIMD_SCALABLE && !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP))
+
+#if (CV_NEON /* || CV_others */) && !defined(CV_FORCE_SIMD128_CPP)
+// Compatibility layer for backends that have been cleaned up (presumably: operator overloads removed; the wrappers below call v_add/v_mul instead of + and *).
+    #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) /* variadic v_add only, expressed through the two-argument v_add */ \
+    template<typename... Args> \
+    inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
+        return v_add(v_add(f1, f2), vf...); \
+    }
+
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
+    #endif
+
+    #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) /* variadic v_mul only, expressed through the two-argument v_mul */ \
+    template<typename... Args> \
+    inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
+        return v_mul(v_mul(f1, f2), vf...); \
+    }
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8) // note: no 64-bit integer vectors appear in this list
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
+    #endif
+
+    #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) /* v_extract_highest: v_extract_n at the last lane index (nlanes-1) */ \
+    inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
+    { \
+        return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
+    }
+
+    OPENCV_HAL_WRAP_EXTRACT(v_uint8)
+    OPENCV_HAL_WRAP_EXTRACT(v_int8)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint16)
+    OPENCV_HAL_WRAP_EXTRACT(v_int16)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint32)
+    OPENCV_HAL_WRAP_EXTRACT(v_int32)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint64)
+    OPENCV_HAL_WRAP_EXTRACT(v_int64)
+    OPENCV_HAL_WRAP_EXTRACT(v_float32)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_EXTRACT(v_float64)
+    #endif
+
+    #define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) /* v_broadcast_highest: v_broadcast_element at the last lane index (nlanes-1) */ \
+    inline _Tpvec v_broadcast_highest(const _Tpvec& v) \
+    { \
+        return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v); \
+    }
+
+    OPENCV_HAL_WRAP_BROADCAST(v_uint32) // instantiated for 32-bit lane vectors only
+    OPENCV_HAL_WRAP_BROADCAST(v_int32)
+    OPENCV_HAL_WRAP_BROADCAST(v_float32)
+
+#endif // CV_NEON && !defined(CV_FORCE_SIMD128_CPP)
+
+//! @cond IGNORED
+
+    // Legacy alias kept for backward compatibility: delegates to v_store(dst, v).
+    template<typename _Tp, typename _Tvec>
+    static inline void vx_store(_Tp* dst, const _Tvec& v) { v_store(dst, v); }
+    // Legacy alias kept for backward compatibility: delegates to v_store_aligned(dst, v).
+    template<typename _Tp, typename _Tvec>
+    static inline void vx_store_aligned(_Tp* dst, const _Tvec& v) { v_store_aligned(dst, v); }
+
+//! @endcond
+
+
+//! @}
+    #undef VXPREFIX
+} // namespace
+
+
+#ifndef CV_SIMD_FP16
+#define CV_SIMD_FP16 0  //!< Defined to 1 on native support of operations with float16x8_t / float16x16_t (SIMD256) types
+#endif
+
+#ifndef CV_SIMD
+#define CV_SIMD 0  //!< Fallback: 0 when CV_SIMD has not been defined by this point
+#endif
+
+#include "simd_utils.impl.hpp"
+
+#ifndef CV_DOXYGEN
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+#endif
+
+} // cv::
+
+//! @endcond
+
+#if defined(__GNUC__) && __GNUC__ == 12
+#pragma GCC diagnostic pop
+#endif
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_avx.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_avx.hpp
new file mode 100644
index 000000000000..eed609f80ef6
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_avx.hpp
@@ -0,0 +1,3177 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_HAL_INTRIN_AVX_HPP
+#define OPENCV_HAL_INTRIN_AVX_HPP
+
+#define CV_SIMD256 1 // set unconditionally: this header provides the 256-bit vector types (v_uint8x32, ...)
+#define CV_SIMD256_64F 1 // NOTE(review): presumably signals double-precision 256-bit support; the matching type is defined outside this view
+#define CV_SIMD256_FP16 0  // no native operations with FP16 type. Only load/store from float32x8 are available (if CV_FP16 == 1)
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+///////// Utils ////////////
+
+inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi) // 256-bit value: lo in the low 128 bits, hi inserted at 128-bit position 1
+{ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); }
+
+inline __m256 _v256_combine(const __m128& lo, const __m128& hi) // same, for packed float
+{ return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); }
+
+inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi) // same, for packed double
+{ return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1); }
+
+inline int _v_cvtsi256_si32(const __m256i& a) // lowest 32-bit element of a, read through its low 128-bit half
+{ return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); }
+
+inline __m256i _v256_shuffle_odd_64(const __m256i& v) // result 64-bit lanes = source lanes 0, 2, 1, 3 (middle two swapped)
+{ return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)); }
+
+inline __m256d _v256_shuffle_odd_64(const __m256d& v) // same lane order, for packed double
+{ return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); }
+
+template<int imm> // imm is passed through as the intrinsic's immediate control value
+inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
+{ return _mm256_permute2x128_si256(a, b, imm); }
+
+template<int imm> // packed-float overload
+inline __m256 _v256_permute2x128(const __m256& a, const __m256& b)
+{ return _mm256_permute2f128_ps(a, b, imm); }
+
+template<int imm> // packed-double overload
+inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
+{ return _mm256_permute2f128_pd(a, b, imm); }
+
+template<int imm, typename _Tpvec> // wrapper for vector structs: permutes the .val members and re-wraps the result in _Tpvec
+inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
+{ return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }
+
+template<int imm> // imm is passed through as the intrinsic's immediate control value
+inline __m256i _v256_permute4x64(const __m256i& a)
+{ return _mm256_permute4x64_epi64(a, imm); }
+
+template<int imm> // packed-double overload
+inline __m256d _v256_permute4x64(const __m256d& a)
+{ return _mm256_permute4x64_pd(a, imm); }
+
+template<int imm, typename _Tpvec> // wrapper for vector structs: permutes the .val member and re-wraps the result in _Tpvec
+inline _Tpvec v256_permute4x64(const _Tpvec& a)
+{ return _Tpvec(_v256_permute4x64<imm>(a.val)); }
+
+inline __m128i _v256_extract_high(const __m256i& v) // 128-bit half at position 1 (the upper half) of v
+{ return _mm256_extracti128_si256(v, 1); }
+
+inline __m128  _v256_extract_high(const __m256& v) // same, for packed float
+{ return _mm256_extractf128_ps(v, 1); }
+
+inline __m128d _v256_extract_high(const __m256d& v) // same, for packed double
+{ return _mm256_extractf128_pd(v, 1); }
+
+inline __m128i _v256_extract_low(const __m256i& v) // lower 128 bits of v, obtained with a cast intrinsic
+{ return _mm256_castsi256_si128(v); }
+
+inline __m128  _v256_extract_low(const __m256& v) // same, for packed float
+{ return _mm256_castps256_ps128(v); }
+
+inline __m128d _v256_extract_low(const __m256d& v) // same, for packed double
+{ return _mm256_castpd256_pd128(v); }
+
+inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
+{
+    // Clamp every unsigned 32-bit lane to 65535 first, then pack both inputs down to 16-bit lanes.
+    const __m256i limit = _mm256_set1_epi32(65535);
+    return _mm256_packus_epi32(_mm256_min_epu32(a, limit),
+                               _mm256_min_epu32(b, limit));
+}
+
+template<int i> // i: compile-time index of the 8-bit element to read
+inline int _v256_extract_epi8(const __m256i& a)
+{
+#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/)) // condition under which the 256-bit extract is called directly
+    return _mm256_extract_epi8(a, i);
+#else // fallback: select the 128-bit half holding element i (i >> 4), then extract at i & 15 within that half
+    __m128i b = _mm256_extractf128_si256(a, ((i) >> 4));
+    return _mm_extract_epi8(b, i & 15);  // SSE4.1
+#endif
+}
+
+template<int i> // i: compile-time index of the 16-bit element to read
+inline int _v256_extract_epi16(const __m256i& a)
+{
+#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/)) // NOTE(review): guarded by the epi8 feature macro; confirm it is meant to cover epi16 too
+    return _mm256_extract_epi16(a, i);
+#else // fallback: select the 128-bit half holding element i (i >> 3), then extract at i & 7 within that half
+    __m128i b = _mm256_extractf128_si256(a, ((i) >> 3));
+    return _mm_extract_epi16(b, i & 7);  // SSE2
+#endif
+}
+
+template<int i> // i: compile-time index of the 32-bit element to read
+inline int _v256_extract_epi32(const __m256i& a)
+{
+#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/)) // NOTE(review): guarded by the epi8 feature macro; confirm it is meant to cover epi32 too
+    return _mm256_extract_epi32(a, i);
+#else // fallback: select the 128-bit half holding element i (i >> 2), then extract at i & 3 within that half
+    __m128i b = _mm256_extractf128_si256(a, ((i) >> 2));
+    return _mm_extract_epi32(b, i & 3);  // SSE4.1
+#endif
+}
+
+template<int i> // i: compile-time index of the 64-bit element to read
+inline int64 _v256_extract_epi64(const __m256i& a)
+{
+#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/)) // NOTE(review): guarded by the epi8 feature macro; confirm it is meant to cover epi64 too
+    return _mm256_extract_epi64(a, i);
+#else // fallback: select the 128-bit half holding element i (i >> 1), then extract at i & 1 within that half
+    __m128i b = _mm256_extractf128_si256(a, ((i) >> 1));
+    return _mm_extract_epi64(b, i & 1);  // SSE4.1
+#endif
+}
+
+///////// Types ////////////
+
+struct v_uint8x32 // 32 uchar lanes stored in one __m256i
+{
+    typedef uchar lane_type; // scalar type of a single lane
+    enum { nlanes = 32 }; // number of lanes
+    __m256i val; // underlying register value
+
+    explicit v_uint8x32(__m256i v) : val(v) {} // wrap an existing register value
+    v_uint8x32(uchar v0,  uchar v1,  uchar v2,  uchar v3,
+               uchar v4,  uchar v5,  uchar v6,  uchar v7,
+               uchar v8,  uchar v9,  uchar v10, uchar v11,
+               uchar v12, uchar v13, uchar v14, uchar v15,
+               uchar v16, uchar v17, uchar v18, uchar v19,
+               uchar v20, uchar v21, uchar v22, uchar v23,
+               uchar v24, uchar v25, uchar v26, uchar v27,
+               uchar v28, uchar v29, uchar v30, uchar v31)
+    { // element-wise construction: arguments are passed to setr in order v0..v31, each cast to char
+        val = _mm256_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
+            (char)v4,  (char)v5,  (char)v6 , (char)v7,  (char)v8,  (char)v9,
+            (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15,
+            (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21,
+            (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
+            (char)v28, (char)v29, (char)v30, (char)v31);
+    }
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint8x32() {} // default ctor does not initialize val
+
+    uchar get0() const { return (uchar)_v_cvtsi256_si32(val); } // lane 0: lowest 32 bits of val narrowed to uchar
+};
+
+struct v_int8x32 // 32 schar lanes stored in one __m256i
+{
+    typedef schar lane_type; // scalar type of a single lane
+    enum { nlanes = 32 }; // number of lanes
+    __m256i val; // underlying register value
+
+    explicit v_int8x32(__m256i v) : val(v) {} // wrap an existing register value
+    v_int8x32(schar v0,  schar v1,  schar v2,  schar v3,
+              schar v4,  schar v5,  schar v6,  schar v7,
+              schar v8,  schar v9,  schar v10, schar v11,
+              schar v12, schar v13, schar v14, schar v15,
+              schar v16, schar v17, schar v18, schar v19,
+              schar v20, schar v21, schar v22, schar v23,
+              schar v24, schar v25, schar v26, schar v27,
+              schar v28, schar v29, schar v30, schar v31)
+    { // element-wise construction: arguments are passed to setr in order v0..v31
+        val = _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
+            v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+            v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
+    }
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int8x32() {} // default ctor does not initialize val
+
+    schar get0() const { return (schar)_v_cvtsi256_si32(val); } // lane 0: lowest 32 bits of val narrowed to schar
+};
+
+struct v_uint16x16
+{
+    // One 256-bit integer register interpreted as 16 lanes of ushort.
+    typedef ushort lane_type;
+    enum { nlanes = 16 };
+    __m256i val;
+
+    // Wraps an already populated register.
+    explicit v_uint16x16(__m256i v) : val(v) {}
+    // Per-lane constructor: each lane is cast to short, the argument type of _mm256_setr_epi16.
+    v_uint16x16(ushort v0, ushort v1, ushort v2,  ushort v3,  ushort v4,  ushort v5,  ushort v6,  ushort v7,
+                ushort v8, ushort v9, ushort v10, ushort v11, ushort v12, ushort v13, ushort v14, ushort v15)
+        : val(_mm256_setr_epi16((short)v0, (short)v1, (short)v2,  (short)v3,  (short)v4,  (short)v5,  (short)v6,  (short)v7,
+                                (short)v8, (short)v9, (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15))
+    {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint16x16() {}
+
+    // First lane: the value of _v_cvtsi256_si32(val) narrowed to ushort.
+    ushort get0() const { return static_cast<ushort>(_v_cvtsi256_si32(val)); }
+};
+
+struct v_int16x16
+{
+    // One 256-bit integer register interpreted as 16 lanes of short.
+    typedef short lane_type;
+    enum { nlanes = 16 };
+    __m256i val;
+
+    explicit v_int16x16(__m256i v) : val(v) {} // wraps an already populated register
+    // Per-lane constructor: v0..v15 are handed to _mm256_setr_epi16 in argument order.
+    v_int16x16(short v0, short v1, short v2,  short v3,  short v4,  short v5,  short v6,  short v7,
+               short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15)
+        : val(_mm256_setr_epi16(v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
+                                v8, v9, v10, v11, v12, v13, v14, v15))
+    {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int16x16() {}
+
+    // First lane: the value of _v_cvtsi256_si32(val) narrowed to short.
+    short get0() const { return static_cast<short>(_v_cvtsi256_si32(val)); }
+};
+
+struct v_uint32x8
+{   // One 256-bit integer register interpreted as 8 lanes of unsigned.
+    typedef unsigned lane_type;
+    enum { nlanes = 8 };
+    __m256i val;
+
+    explicit v_uint32x8(__m256i v) : val(v) {} // wraps an already populated register
+    v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
+               unsigned v4, unsigned v5, unsigned v6, unsigned v7)
+    {   // _mm256_setr_epi32 takes int arguments: convert explicitly (same value as the former implicit conversion)
+        val = _mm256_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3,
+                                (int)v4, (int)v5, (int)v6, (int)v7);
+    }
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint32x8() {}
+
+    unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); } // first lane, via _v_cvtsi256_si32
+};
+
+struct v_int32x8
+{
+    // One 256-bit integer register interpreted as 8 lanes of int.
+    typedef int lane_type;
+    enum { nlanes = 8 };
+    __m256i val;
+
+    explicit v_int32x8(__m256i v) : val(v) {} // wraps an already populated register
+    v_int32x8(int v0, int v1, int v2, int v3, // per-lane constructor, forwarded to _mm256_setr_epi32
+              int v4, int v5, int v6, int v7)
+        : val(_mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7))
+    {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int32x8() {}
+
+    int get0() const { const int first = _v_cvtsi256_si32(val); return first; } // first lane
+};
+
+struct v_float32x8
+{
+    // One 256-bit float register interpreted as 8 lanes of float.
+    typedef float lane_type;
+    enum { nlanes = 8 };
+    __m256 val;
+
+    explicit v_float32x8(__m256 v) : val(v) {} // wraps an already populated register
+    v_float32x8(float v0, float v1, float v2, float v3, // per-lane constructor, forwarded to _mm256_setr_ps
+                float v4, float v5, float v6, float v7)
+        : val(_mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7))
+    {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_float32x8() {}
+
+    float get0() const { const __m128 low = _mm256_castps256_ps128(val); return _mm_cvtss_f32(low); } // first lane
+};
+
+struct v_uint64x4
+{   // One 256-bit integer register interpreted as 4 lanes of uint64.
+    typedef uint64 lane_type;
+    enum { nlanes = 4 };
+    __m256i val;
+
+    explicit v_uint64x4(__m256i v) : val(v) {} // wraps an already populated register
+    v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3) // lanes are cast to int64 for _mm256_setr_epi64x
+        : val(_mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3)) {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint64x4() {}
+
+    uint64 get0() const // first 64-bit lane
+    {
+    #if defined __x86_64__ || defined _M_X64 // 64-bit targets: one 64-bit extraction
+        return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val));
+    #else // otherwise assemble the lane from two 32-bit extractions
+        const unsigned lo32 = (unsigned)_mm_cvtsi128_si32(_mm256_castsi256_si128(val));
+        const unsigned hi32 = (unsigned)_mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32)));
+        return ((uint64)hi32 << 32) | lo32;
+    #endif
+    }
+};
+
+struct v_int64x4
+{   // One 256-bit integer register interpreted as 4 lanes of int64.
+    typedef int64 lane_type;
+    enum { nlanes = 4 };
+    __m256i val;
+
+    explicit v_int64x4(__m256i v) : val(v) {} // wraps an already populated register
+    v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3) // per-lane constructor, forwarded to _mm256_setr_epi64x
+        : val(_mm256_setr_epi64x(v0, v1, v2, v3)) {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int64x4() {}
+
+    int64 get0() const // first 64-bit lane
+    {
+    #if defined __x86_64__ || defined _M_X64 // 64-bit targets: one 64-bit extraction
+        return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val));
+    #else // otherwise assemble the lane from two 32-bit extractions
+        const unsigned lo32 = (unsigned)_mm_cvtsi128_si32(_mm256_castsi256_si128(val));
+        const unsigned hi32 = (unsigned)_mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32)));
+        return (int64)(((uint64)hi32 << 32) | lo32);
+    #endif
+    }
+};
+
+struct v_float64x4
+{   // One 256-bit double register interpreted as 4 lanes of double.
+    typedef double lane_type;
+    enum { nlanes = 4 };
+    __m256d val;
+
+    explicit v_float64x4(__m256d v) : val(v) {} // wraps an already populated register
+    v_float64x4(double v0, double v1, double v2, double v3) // per-lane constructor, forwarded to _mm256_setr_pd
+        : val(_mm256_setr_pd(v0, v1, v2, v3)) {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_float64x4() {}
+
+    double get0() const { const __m128d low = _mm256_castpd256_pd128(val); return _mm_cvtsd_f64(low); } // first lane
+};
+
+//////////////// Load and store operations ///////////////
+// Integer load/store family. Pointers are cast to __m256i* / __m128i*; names without "aligned" use the loadu/storeu intrinsics, v_store_aligned_nocache uses _mm256_stream_si256.
+#define OPENCV_HAL_IMPL_AVX_LOADSTORE(_Tpvec, _Tp)                    \
+    inline _Tpvec v256_load(const _Tp* ptr)                           \
+    { return _Tpvec(_mm256_loadu_si256((const __m256i*)ptr)); }       \
+    inline _Tpvec v256_load_aligned(const _Tp* ptr)                   \
+    { return _Tpvec(_mm256_load_si256((const __m256i*)ptr)); }        \
+    inline _Tpvec v256_load_low(const _Tp* ptr)                       \
+    {                                                                 \
+        __m128i v128 = _mm_loadu_si128((const __m128i*)ptr);          \
+        return _Tpvec(_mm256_castsi128_si256(v128));                  \
+    }                                                                 \
+    inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1)  \
+    {                                                                 \
+        __m128i vlo = _mm_loadu_si128((const __m128i*)ptr0);          \
+        __m128i vhi = _mm_loadu_si128((const __m128i*)ptr1);          \
+        return _Tpvec(_v256_combine(vlo, vhi));                       \
+    }                                                                 \
+    inline void v_store(_Tp* ptr, const _Tpvec& a)                    \
+    { _mm256_storeu_si256((__m256i*)ptr, a.val); }                    \
+    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)            \
+    { _mm256_store_si256((__m256i*)ptr, a.val); }                     \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)    \
+    { _mm256_stream_si256((__m256i*)ptr, a.val); }                    \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+    { \
+        if( mode == hal::STORE_UNALIGNED ) \
+            _mm256_storeu_si256((__m256i*)ptr, a.val); \
+        else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
+            _mm256_stream_si256((__m256i*)ptr, a.val); \
+        else \
+            _mm256_store_si256((__m256i*)ptr, a.val); \
+    } \
+    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                \
+    { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); }    \
+    inline void v_store_high(_Tp* ptr, const _Tpvec& a)               \
+    { _mm_storeu_si128((__m128i*)ptr, _v256_extract_high(a.val)); }
+// One instantiation per integer vector type. NOTE(review): v256_load_low only loads 128 bits; the upper half comes from a cast and is presumably unspecified - confirm before relying on it.
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint8x32,  uchar)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int8x32,   schar)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint16x16, ushort)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int16x16,  short)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint32x8,  unsigned)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int32x8,   int)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint64x4,  uint64)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4,   int64)
+// Float/double load/store family: the same function set as the integer macro, calling the _mm256_*_<suffix> intrinsics directly on _Tp pointers; halfreg is the matching 128-bit register type.
+#define OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg)   \
+    inline _Tpvec v256_load(const _Tp* ptr)                               \
+    { return _Tpvec(_mm256_loadu_##suffix(ptr)); }                        \
+    inline _Tpvec v256_load_aligned(const _Tp* ptr)                       \
+    { return _Tpvec(_mm256_load_##suffix(ptr)); }                         \
+    inline _Tpvec v256_load_low(const _Tp* ptr)                           \
+    {                                                                     \
+        return _Tpvec(_mm256_cast##suffix##128_##suffix##256              \
+                     (_mm_loadu_##suffix(ptr)));                          \
+    }                                                                     \
+    inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1)      \
+    {                                                                     \
+        halfreg vlo = _mm_loadu_##suffix(ptr0);                           \
+        halfreg vhi = _mm_loadu_##suffix(ptr1);                           \
+        return _Tpvec(_v256_combine(vlo, vhi));                           \
+    }                                                                     \
+    inline void v_store(_Tp* ptr, const _Tpvec& a)                        \
+    { _mm256_storeu_##suffix(ptr, a.val); }                               \
+    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)                \
+    { _mm256_store_##suffix(ptr, a.val); }                                \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)        \
+    { _mm256_stream_##suffix(ptr, a.val); }                               \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+    { \
+        if( mode == hal::STORE_UNALIGNED ) \
+            _mm256_storeu_##suffix(ptr, a.val); \
+        else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
+            _mm256_stream_##suffix(ptr, a.val); \
+        else \
+            _mm256_store_##suffix(ptr, a.val); \
+    } \
+    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                    \
+    { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); }               \
+    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                   \
+    { _mm_storeu_##suffix(ptr, _v256_extract_high(a.val)); }
+// Instantiated for float (ps, __m128) and double (pd, __m128d).
+OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float32x8, float,  ps, __m128)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d)
+// Defines v_reinterpret_as_<suffix>(const _Tpvecf&): applies `cast` to a.val and wraps the result in _Tpvec.
+#define OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
+    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a)   \
+    { return _Tpvec(cast(a.val)); }
+// Integer init helpers: v256_setzero_<suffix>(), v256_setall_<suffix>(v), and v_reinterpret_as_<suffix>() from every vector type (OPENCV_HAL_NOP for integer sources - presumably a pass-through, defined elsewhere).
+#define OPENCV_HAL_IMPL_AVX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s)          \
+    inline _Tpvec v256_setzero_##suffix()                                        \
+    { return _Tpvec(_mm256_setzero_si256()); }                                   \
+    inline _Tpvec v256_setall_##suffix(_Tp v)                                    \
+    { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); }                        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32,  suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32,   suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16,  suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8,  suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8,   suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4,  suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4,   suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float32x8, suffix, _mm256_castps_si256)   \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float64x4, suffix, _mm256_castpd_si256)
+// Arguments: vector type, lane type, function suffix, _mm256_set1_ suffix, and the C type the scalar is cast to before _mm256_set1_*.
+OPENCV_HAL_IMPL_AVX_INIT(v_uint8x32,  uchar,    u8,  epi8,   char)
+OPENCV_HAL_IMPL_AVX_INIT(v_int8x32,   schar,    s8,  epi8,   char)
+OPENCV_HAL_IMPL_AVX_INIT(v_uint16x16, ushort,   u16, epi16,  short)
+OPENCV_HAL_IMPL_AVX_INIT(v_int16x16,  short,    s16, epi16,  short)
+OPENCV_HAL_IMPL_AVX_INIT(v_uint32x8,  unsigned, u32, epi32,  int)
+OPENCV_HAL_IMPL_AVX_INIT(v_int32x8,   int,      s32, epi32,  int)
+OPENCV_HAL_IMPL_AVX_INIT(v_uint64x4,  uint64,   u64, epi64x, int64)
+OPENCV_HAL_IMPL_AVX_INIT(v_int64x4,   int64,    s64, epi64x, int64)
+// Float/double init helpers: v256_setzero_<suffix>(), v256_setall_<suffix>(v), and v_reinterpret_as_<suffix>() from every integer vector type via `cast`.
+#define OPENCV_HAL_IMPL_AVX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
+    inline _Tpvec v256_setzero_##suffix()                                \
+    { return _Tpvec(_mm256_setzero_##zsuffix()); }                       \
+    inline _Tpvec v256_setall_##suffix(_Tp v)                            \
+    { return _Tpvec(_mm256_set1_##zsuffix(v)); }                         \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32,  suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32,   suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16,  suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8,  suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8,   suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4,  suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4,   suffix, cast)
+// Float-to-float and float-to-double reinterprets are not generated here; they are written out as plain functions after these two lines.
+OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float32x8, float,  f32, ps, _mm256_castsi256_ps)
+OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float64x4, double, f64, pd, _mm256_castsi256_pd)
+
+// Identity overload: an f32 vector reinterpreted as f32 is handed back unchanged; the f64 source goes through _mm256_castpd_ps.
+inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a) { return a; }
+inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
+{ const __m256 as_ps = _mm256_castpd_ps(a.val); return v_float32x8(as_ps); }
+
+// Identity overload: an f64 vector reinterpreted as f64 is handed back unchanged; the f32 source goes through _mm256_castps_pd.
+inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a) { return a; }
+inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
+{ const __m256d as_pd = _mm256_castps_pd(a.val); return v_float64x4(as_pd); }
+
+/* Recombine */
+/*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm)                    \
+    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)    \
+    { return _Tpvec(perm(a.val, b.val, 0x20)); }                     \
+    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)   \
+    { return _Tpvec(perm(a.val, b.val, 0x31)); }                     \
+    inline void v_recombine(const _Tpvec& a, const _Tpvec& b,        \
+                             _Tpvec& c, _Tpvec& d)                   \
+    { c = v_combine_low(a, b); d = v_combine_high(a, b); }
+
+#define OPENCV_HAL_IMPL_AVX_UNPACKS(_Tpvec, suffix)                  \
+    OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, _mm256_permute2x128_si256)   \
+    inline void v_zip(const _Tpvec& a0, const _Tpvec& a1,            \
+                             _Tpvec& b0, _Tpvec& b1)                 \
+    {                                                                \
+        __m256i v0 = _v256_shuffle_odd_64(a0.val);                   \
+        __m256i v1 = _v256_shuffle_odd_64(a1.val);                   \
+        b0.val = _mm256_unpacklo_##suffix(v0, v1);                   \
+        b1.val = _mm256_unpackhi_##suffix(v0, v1);                   \
+    }
+
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint8x32,  epi8)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_int8x32,   epi8)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint16x16, epi16)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_int16x16,  epi16)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint32x8,  epi32)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_int32x8,   epi32)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint64x4,  epi64)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_int64x4,   epi64)
+OPENCV_HAL_IMPL_AVX_COMBINE(v_float32x8, _mm256_permute2f128_ps)
+OPENCV_HAL_IMPL_AVX_COMBINE(v_float64x4, _mm256_permute2f128_pd)
+
+inline void v_zip(const v_float32x8& a0, const v_float32x8& a1, v_float32x8& b0, v_float32x8& b1)
+{
+    __m256 v0 = _mm256_unpacklo_ps(a0.val, a1.val);
+    __m256 v1 = _mm256_unpackhi_ps(a0.val, a1.val);
+    v_recombine(v_float32x8(v0), v_float32x8(v1), b0, b1);
+}
+
+inline void v_zip(const v_float64x4& a0, const v_float64x4& a1, v_float64x4& b0, v_float64x4& b1)
+{
+    __m256d v0 = _v_shuffle_odd_64(a0.val);
+    __m256d v1 = _v_shuffle_odd_64(a1.val);
+    b0.val = _mm256_unpacklo_pd(v0, v1);
+    b1.val = _mm256_unpackhi_pd(v0, v1);
+}*/
+
+//////////////// Variant Value reordering ///////////////
+
+// unpacks: v256_unpacklo / v256_unpackhi forward to _mm256_unpacklo_<suffix> / _mm256_unpackhi_<suffix>
+#define OPENCV_HAL_IMPL_AVX_UNPACK(_Tpvec, suffix)                 \
+    inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b)  \
+    { return _Tpvec(_mm256_unpacklo_##suffix(a.val, b.val)); }     \
+    inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b)  \
+    { return _Tpvec(_mm256_unpackhi_##suffix(a.val, b.val)); }
+// suffix picks the unpack intrinsic for the lane width (epi8 .. epi64, ps, pd)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_uint8x32,  epi8)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_int8x32,   epi8)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_uint16x16, epi16)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_int16x16,  epi16)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_uint32x8,  epi32)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_int32x8,   epi32)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_uint64x4,  epi64)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_int64x4,   epi64)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_float32x8, ps)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_float64x4, pd)
+
+// blend: v256_blend<m>(a, b) passes the compile-time mask m to _mm256_blend_<suffix>(a.val, b.val, m)
+#define OPENCV_HAL_IMPL_AVX_BLEND(_Tpvec, suffix)               \
+    template<int m>                                             \
+    inline _Tpvec v256_blend(const _Tpvec& a, const _Tpvec& b)  \
+    { return _Tpvec(_mm256_blend_##suffix(a.val, b.val, m)); }
+// Not instantiated for 8-bit or 64-bit integer vectors; the 64-bit integer overloads are written out by hand after this list.
+OPENCV_HAL_IMPL_AVX_BLEND(v_uint16x16, epi16)
+OPENCV_HAL_IMPL_AVX_BLEND(v_int16x16,  epi16)
+OPENCV_HAL_IMPL_AVX_BLEND(v_uint32x8,  epi32)
+OPENCV_HAL_IMPL_AVX_BLEND(v_int32x8,   epi32)
+OPENCV_HAL_IMPL_AVX_BLEND(v_float32x8, ps)
+OPENCV_HAL_IMPL_AVX_BLEND(v_float64x4, pd)
+
+template<int m>
+inline v_uint64x4 v256_blend(const v_uint64x4& a, const v_uint64x4& b)
+{
+    enum {M0 = m};                        // mask with one bit per 64-bit lane (comments below assume m in 0..15)
+    enum {M1 = (M0 | (M0 << 2)) & 0x33};  // keep bits 0,1 in place and move bits 2,3 up to positions 4,5
+    enum {M2 = (M1 | (M1 << 1)) & 0x55};  // spread the four mask bits to positions 0,2,4,6
+    enum {MM =  M2 | (M2 << 1)};          // duplicate each bit: one mask bit per 32-bit half of a lane
+    return v_uint64x4(_mm256_blend_epi32(a.val, b.val, MM)); // 64-bit blend carried out as a 32-bit blend
+}
+// Signed 64-bit blend: reuses the unsigned implementation on the same raw register contents.
+template<int m> inline v_int64x4 v256_blend(const v_int64x4& a, const v_int64x4& b)
+{ const v_uint64x4 ua(a.val), ub(b.val); return v_int64x4(v256_blend<m>(ua, ub).val); }
+
+// shuffle: v256_shuffle<m>(a) calls _mm256_<intrin>(a.val, m) with the compile-time control m
+// todo: emulate 64bit
+#define OPENCV_HAL_IMPL_AVX_SHUFFLE(_Tpvec, intrin)  \
+    template<int m>                                  \
+    inline _Tpvec v256_shuffle(const _Tpvec& a)      \
+    { return _Tpvec(_mm256_##intrin(a.val, m)); }
+// 32-bit integer vectors use shuffle_epi32; float and double vectors use permute_ps / permute_pd
+OPENCV_HAL_IMPL_AVX_SHUFFLE(v_uint32x8,  shuffle_epi32)
+OPENCV_HAL_IMPL_AVX_SHUFFLE(v_int32x8,   shuffle_epi32)
+OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float32x8, permute_ps)
+OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float64x4, permute_pd)
+
+// Stores v256_unpacklo(a, b) in ab0, then v256_unpackhi(a, b) in ab1 (in that order).
+template<typename _Tpvec> inline
+void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
+{
+    ab0 = v256_unpacklo(a, b); ab1 = v256_unpackhi(a, b);
+}
+
+template<typename _Tpvec> inline // generic form for the __m256i-backed vectors
+_Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
+{ const __m256i mixed = _mm256_blend_epi32(a.val, b.val, 0xf0); return _Tpvec(mixed); }
+// float overload: same 0xf0 mask (32-bit elements 4..7 taken from b), routed through v256_blend
+inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
+{ return v256_blend<0xf0>(a, b); }
+// double overload: mask 0xc takes the two upper 64-bit elements from b
+inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
+{ return v256_blend<0xc>(a, b); }
+
+// Forwards to v256_permute2x128<0x21>(a, b); that helper is not defined in this part of the file.
+template<typename _Tpvec> inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
+{ return v256_permute2x128<0x21>(a, b); }
+
+template<typename _Tpvec> inline // integer form: _mm256_alignr_epi8 with a shift of 8 bytes (64 bits)
+_Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
+{ const __m256i shifted = _mm256_alignr_epi8(a.val, b.val, 8); return _Tpvec(shifted); }
+inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b) // double form via _mm256_shuffle_pd
+{ const __m256d picked = _mm256_shuffle_pd(b.val, a.val, _MM_SHUFFLE(0, 0, 1, 1)); return v_float64x4(picked); }
+// todo: emulate float32
+
+// Calls v256_permute2x128<1> with the same vector as both operands.
+template<typename _Tpvec> inline _Tpvec v256_swap_halves(const _Tpvec& a)
+{ return v256_permute2x128<1>(a, a); }
+
+// Calls v256_permute4x64 with the control _MM_SHUFFLE(0, 1, 2, 3).
+template<typename _Tpvec> inline _Tpvec v256_reverse_64(const _Tpvec& a)
+{ return v256_permute4x64<_MM_SHUFFLE(0, 1, 2, 3)>(a); }
+
+// ZIP: v_combine_low / v_combine_high / v_recombine / v_zip for one vector type, built from v256_permute2x128, v256_alignr_128, v256_combine_diagonal and v256_zip
+#define OPENCV_HAL_IMPL_AVX_ZIP(_Tpvec)                              \
+    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)    \
+    { return v256_permute2x128<0x20>(a, b); }                        \
+    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)   \
+    { return v256_permute2x128<0x31>(a, b); }                        \
+    inline void v_recombine(const _Tpvec& a, const _Tpvec& b,        \
+                             _Tpvec& c, _Tpvec& d)                   \
+    {                                                                \
+        _Tpvec a1b0 = v256_alignr_128(a, b);                         \
+        c = v256_combine_diagonal(a, a1b0);                          \
+        d = v256_combine_diagonal(a1b0, b);                          \
+    }                                                                \
+    inline void v_zip(const _Tpvec& a, const _Tpvec& b,              \
+                      _Tpvec& ab0, _Tpvec& ab1)                      \
+    {                                                                \
+        _Tpvec ab0ab2, ab1ab3;                                       \
+        v256_zip(a, b, ab0ab2, ab1ab3);                              \
+        v_recombine(ab0ab2, ab1ab3, ab0, ab1);                       \
+    }
+// v_zip = v256_zip (unpacklo / unpackhi) followed by v_recombine of the two intermediate vectors
+OPENCV_HAL_IMPL_AVX_ZIP(v_uint8x32)
+OPENCV_HAL_IMPL_AVX_ZIP(v_int8x32)
+OPENCV_HAL_IMPL_AVX_ZIP(v_uint16x16)
+OPENCV_HAL_IMPL_AVX_ZIP(v_int16x16)
+OPENCV_HAL_IMPL_AVX_ZIP(v_uint32x8)
+OPENCV_HAL_IMPL_AVX_ZIP(v_int32x8)
+OPENCV_HAL_IMPL_AVX_ZIP(v_uint64x4)
+OPENCV_HAL_IMPL_AVX_ZIP(v_int64x4)
+OPENCV_HAL_IMPL_AVX_ZIP(v_float32x8)
+OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4)
+
+////////// Arithmetic, bitwise and comparison operations /////////
+
+/* Element-wise binary and unary operations */
+
+/** Arithmetics: each use defines `operator op` and `operator op=` that forward to the given intrinsic **/
+#define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin)            \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)  \
+    { return _Tpvec(intrin(a.val, b.val)); }                          \
+    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)    \
+    { a.val = intrin(a.val, b.val); return a; }
+// 8- and 16-bit + and - use the saturating adds/subs intrinsics; 32- and 64-bit lanes use the plain add/sub (and mullo for 32-bit *)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32,  _mm256_adds_epu8)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32,  _mm256_subs_epu8)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32,   _mm256_adds_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32,   _mm256_subs_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16,  _mm256_adds_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16,  _mm256_subs_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8,  _mm256_add_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8,  _mm256_sub_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8,  _mm256_mullo_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8,   _mm256_add_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8,   _mm256_sub_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8,   _mm256_mullo_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4,  _mm256_add_epi64)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4,  _mm256_sub_epi64)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4,   _mm256_add_epi64)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4,   _mm256_sub_epi64)
+// Floating-point + - * / map one-to-one onto the _ps / _pd intrinsics
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps)
+OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps)
+OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
+OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
+OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
+
+// saturating multiply 8-bit, 16-bit
+inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
+{
+    v_uint16x16 wide0, wide1;            // 16-bit products, filled in by v_mul_expand
+    v_mul_expand(a, b, wide0, wide1);
+    return v_pack(wide0, wide1);         // narrow both product vectors back to 8-bit lanes
+}
+inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
+{
+    v_int16x16 wide0, wide1;             // 16-bit products, filled in by v_mul_expand
+    v_mul_expand(a, b, wide0, wide1);
+    return v_pack(wide0, wide1);         // narrow both product vectors back to 8-bit lanes
+}
+inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
+{
+    const __m256i lo16 = _mm256_mullo_epi16(a.val, b.val);        // low 16 bits of each product
+    const __m256i hi16 = _mm256_mulhi_epu16(a.val, b.val);        // high 16 bits (unsigned multiply)
+    const __m256i prod32_a = _mm256_unpacklo_epi16(lo16, hi16);   // interleave low/high words into 32-bit products
+    const __m256i prod32_b = _mm256_unpackhi_epi16(lo16, hi16);
+    return v_uint16x16(_v256_packs_epu32(prod32_a, prod32_b));    // pack the 32-bit products back to 16-bit lanes
+}
+inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
+{
+    const __m256i lo16 = _mm256_mullo_epi16(a.val, b.val);        // low 16 bits of each product
+    const __m256i hi16 = _mm256_mulhi_epi16(a.val, b.val);        // high 16 bits (signed multiply)
+    const __m256i prod32_a = _mm256_unpacklo_epi16(lo16, hi16);   // interleave low/high words into 32-bit products
+    const __m256i prod32_b = _mm256_unpackhi_epi16(lo16, hi16);
+    return v_int16x16(_mm256_packs_epi32(prod32_a, prod32_b));    // signed-saturating pack back to 16-bit lanes
+}
+inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
+{ return a = a * b; } // compound form: stores a * b back into a
+inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
+{ return a = a * b; } // compound form: stores a * b back into a
+inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
+{ return a = a * b; } // compound form: stores a * b back into a
+inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
+{ return a = a * b; } // compound form: stores a * b back into a
+
+/** Non-saturating arithmetics: each use defines func(a, b) that forwards to the given intrinsic **/
+#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
+    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b)   \
+    { return _Tpvec(intrin(a.val, b.val)); }
+// v_add_wrap / v_sub_wrap / v_mul_wrap use the plain (non-"s") add / sub / mullo intrinsics
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32,  _mm256_add_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32,   _mm256_add_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16,  _mm256_add_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32,  _mm256_sub_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32,   _mm256_sub_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16,  _mm256_sub_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_uint16x16, _mm256_mullo_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_int16x16,  _mm256_mullo_epi16)
+
+inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
+{   // 8-bit wrap-around multiply built from two 16-bit multiplies (even bytes and odd bytes)
+    __m256i ad = _mm256_srai_epi16(a.val, 8);   // odd bytes moved down into the low byte of each 16-bit lane
+    __m256i bd = _mm256_srai_epi16(b.val, 8);
+    __m256i p0 = _mm256_mullo_epi16(a.val, b.val); // even
+    __m256i p1 = _mm256_slli_epi16(_mm256_mullo_epi16(ad, bd), 8); // odd
+    // 0xFF in every odd byte selects p1 there; the cast makes the unsigned-literal-to-int conversion explicit.
+    const __m256i b01 = _mm256_set1_epi32((int)0xFF00FF00);
+    return v_uint8x32(_mm256_blendv_epi8(p0, p1, b01));
+}
+// Signed 8-bit wrap-around multiply: runs the unsigned overload on the same bits
+// (via v_reinterpret_as_u8) and reinterprets the product back as signed.
+inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
+{ return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); }
+
+//  Multiply and expand: both 8-bit inputs are widened with v_expand, then multiplied as 16-bit lanes
+inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
+                         v_uint16x16& c, v_uint16x16& d)
+{
+    v_uint16x16 a_part0, a_part1, b_part0, b_part1;
+    v_expand(a, a_part0, a_part1);          // two widened parts of a
+    v_expand(b, b_part0, b_part1);          // two widened parts of b
+    c = v_mul_wrap(a_part0, b_part0);
+    d = v_mul_wrap(a_part1, b_part1);
+}
+
+inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
+                         v_int16x16& c, v_int16x16& d)
+{
+    v_int16x16 a_part0, a_part1, b_part0, b_part1;
+    v_expand(a, a_part0, a_part1);          // two widened parts of a
+    v_expand(b, b_part0, b_part1);          // two widened parts of b
+    c = v_mul_wrap(a_part0, b_part0);       // 16-bit products of the first parts
+    d = v_mul_wrap(a_part1, b_part1);       // 16-bit products of the second parts
+}
+
+inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
+                         v_int32x8& c, v_int32x8& d)
+{
+    // High 16 bits of every signed product; v_mul_wrap supplies the low 16 bits.
+    const v_int16x16 high_words(_mm256_mulhi_epi16(a.val, b.val));
+    const v_int16x16 low_words = v_mul_wrap(a, b);
+    v_int16x16 zipped0, zipped1;
+    v_zip(low_words, high_words, zipped0, zipped1); // interleave low/high words, then view them as 32-bit lanes
+    c = v_reinterpret_as_s32(zipped0);
+    d = v_reinterpret_as_s32(zipped1);
+}
+
+inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
+                         v_uint32x8& c, v_uint32x8& d)
+{
+    // High 16 bits of every unsigned product; v_mul_wrap supplies the low 16 bits.
+    const v_uint16x16 high_words(_mm256_mulhi_epu16(a.val, b.val));
+    const v_uint16x16 low_words = v_mul_wrap(a, b);
+    v_uint16x16 zipped0, zipped1;
+    v_zip(low_words, high_words, zipped0, zipped1); // interleave low/high words, then view them as 32-bit lanes
+    c = v_reinterpret_as_u32(zipped0);
+    d = v_reinterpret_as_u32(zipped1);
+}
+
+inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
+                         v_uint64x4& c, v_uint64x4& d)
+{
+    const v_uint64x4 even_prod(_mm256_mul_epu32(a.val, b.val)); // 64-bit products of the low 32 bits of each 64-bit lane
+    const v_uint64x4 odd_prod(_mm256_mul_epu32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32))); // same for the high 32 bits
+    v_zip(even_prod, odd_prod, c, d);
+}
+
+inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { const __m256i hi = _mm256_mulhi_epi16(a.val, b.val); return v_int16x16(hi); } // high 16 bits of signed products
+inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { const __m256i hi = _mm256_mulhi_epu16(a.val, b.val); return v_uint16x16(hi); } // high 16 bits of unsigned products
+
+/** Bitwise shifts: operator << / >> with a runtime count and v_shl<imm> / v_shr<imm> with a compile-time count **/
+#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai)  \
+    inline _Tpuvec operator << (const _Tpuvec& a, int imm)            \
+    { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); }             \
+    inline _Tpsvec operator << (const _Tpsvec& a, int imm)            \
+    { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); }             \
+    inline _Tpuvec operator >> (const _Tpuvec& a, int imm)            \
+    { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); }             \
+    inline _Tpsvec operator >> (const _Tpsvec& a, int imm)            \
+    { return _Tpsvec(srai(a.val, imm)); }                             \
+    template<int imm>                                                 \
+    inline _Tpuvec v_shl(const _Tpuvec& a)                            \
+    { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); }             \
+    template<int imm>                                                 \
+    inline _Tpsvec v_shl(const _Tpsvec& a)                            \
+    { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); }             \
+    template<int imm>                                                 \
+    inline _Tpuvec v_shr(const _Tpuvec& a)                            \
+    { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); }             \
+    template<int imm>                                                 \
+    inline _Tpsvec v_shr(const _Tpsvec& a)                            \
+    { return _Tpsvec(srai(a.val, imm)); }
+// Right shifts are logical (srli) for the unsigned type and use the supplied srai function for the signed type
+OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint16x16, v_int16x16, epi16, _mm256_srai_epi16)
+OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint32x8,  v_int32x8,  epi32, _mm256_srai_epi32)
+
+inline __m256i _mm256_srai_epi64xx(const __m256i a, int imm)
+{   // 64-bit arithmetic right shift emulated with logical shifts: add a sign-bit bias, shift, subtract the shifted bias
+    const __m256i sign_bias = _mm256_set1_epi64x((int64)1 << 63);
+    const __m256i biased_shift = _mm256_srli_epi64(_mm256_add_epi64(a, sign_bias), imm);
+    return _mm256_sub_epi64(biased_shift, _mm256_srli_epi64(sign_bias, imm)); // NOTE(review): behaviour for imm >= 64 not verified here
+}
+OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4,  v_int64x4,  epi64, _mm256_srai_epi64xx)
+
+
+/** Bitwise logic: &, |, ^ (with their compound forms, via OPENCV_HAL_IMPL_AVX_BIN_OP) and unary ~ **/
+#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const)  \
+    OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix)   \
+    OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix)    \
+    OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix)   \
+    inline _Tpvec operator ~ (const _Tpvec& a)                   \
+    { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); }
+// operator ~ is an XOR with not_const, which every instantiation sets to an all-ones register
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32,   si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int8x32,    si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint16x16,  si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int16x16,   si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint32x8,   si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int32x8,    si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint64x4,   si256, _mm256_set1_epi64x(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int64x4,    si256, _mm256_set1_epi64x(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float32x8,  ps,    _mm256_castsi256_ps(_mm256_set1_epi32(-1)))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float64x4,  pd,    _mm256_castsi256_pd(_mm256_set1_epi32(-1)))
+
+/** Select: v_select(mask, a, b) = _mm256_blendv_<suffix>(b, a, mask), i.e. a where the mask is set, b elsewhere **/
+#define OPENCV_HAL_IMPL_AVX_SELECT(_Tpvec, suffix)                               \
+    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(_mm256_blendv_##suffix(b.val, a.val, mask.val)); }
+// 16- and 32-bit integer vectors also use the byte-wise epi8 blend - presumably masks are all-ones / all-zeros per lane (e.g. comparison results); TODO confirm
+OPENCV_HAL_IMPL_AVX_SELECT(v_uint8x32,  epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_int8x32,   epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_uint16x16, epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_int16x16,  epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_uint32x8,  epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_int32x8,   epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps)
+OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd)
+
+/** Comparison: each operator returns a vector of the operand type holding the per-lane result of the compare intrinsic **/
+#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec)                     \
+    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)  \
+    { return ~(a == b); }                                         \
+    inline _Tpvec operator <  (const _Tpvec& a, const _Tpvec& b)  \
+    { return b > a; }                                             \
+    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b)  \
+    { return ~(a < b); }                                          \
+    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b)  \
+    { return b >= a; }
+// CMP_OP_OV derives !=, <, >= and <= from == and >. In CMP_OP_INT the unsigned '>' XORs both operands with sbit and then uses the signed cmpgt.
+#define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit)   \
+    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b)      \
+    { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); }             \
+    inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b)       \
+    {                                                                    \
+        __m256i smask = _mm256_set1_##suffix(sbit);                      \
+        return _Tpuvec(_mm256_cmpgt_##suffix(                            \
+                       _mm256_xor_si256(a.val, smask),                   \
+                       _mm256_xor_si256(b.val, smask)));                 \
+    }                                                                    \
+    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b)      \
+    { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); }             \
+    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b)       \
+    { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); }             \
+    OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec)                               \
+    OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec)
+
+OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint8x32,  v_int8x32,  epi8,  (char)-128)
+OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768)
+OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8,  v_int32x8,  epi32, (int)0x80000000)
+// 64-bit lanes: only == and != are defined by the macro below.
+#define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec)                 \
+    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); }         \
+    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+    { return ~(a == b); }
+
+OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
+OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
+
+#define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix)    \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); }
+
+#define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix)               \
+    OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ,  _Tpvec, suffix)     \
+    OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix)     \
+    OPENCV_HAL_IMPL_AVX_CMP_FLT(<,  _CMP_LT_OQ,  _Tpvec, suffix)     \
+    OPENCV_HAL_IMPL_AVX_CMP_FLT(>,  _CMP_GT_OQ,  _Tpvec, suffix)     \
+    OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ,  _Tpvec, suffix)     \
+    OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ,  _Tpvec, suffix)
+
+OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps) // all six operators use ordered, non-signalling (_OQ) predicates
+OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd) // NOTE(review): with _CMP_NEQ_OQ a lane containing NaN compares false for != as well; confirm this is intended
+
+inline v_float32x8 v_not_nan(const v_float32x8& a) // mask of the lanes that are not NaN (ordered compare of a with itself)
+{ return v_float32x8(_mm256_cmp_ps(a.val, a.val, _CMP_ORD_Q)); }
+inline v_float64x4 v_not_nan(const v_float64x4& a) // double-precision counterpart
+{ return v_float64x4(_mm256_cmp_pd(a.val, a.val, _CMP_ORD_Q)); }
+
+/** min/max: v_min and v_max for every 8/16/32-bit integer type and both float types, each bound to the matching intrinsic **/
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32,  _mm256_min_epu8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32,  _mm256_max_epu8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int8x32,   _mm256_min_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int8x32,   _mm256_max_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint16x16, _mm256_min_epu16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint16x16, _mm256_max_epu16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int16x16,  _mm256_min_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int16x16,  _mm256_max_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint32x8,  _mm256_min_epu32)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint32x8,  _mm256_max_epu32)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int32x8,   _mm256_min_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int32x8,   _mm256_max_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float32x8, _mm256_min_ps)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float32x8, _mm256_max_ps)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float64x4, _mm256_min_pd)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd)
+
+/** Rotate: move whole byte lanes across the 256-bit register; the two-vector forms refill the vacated lanes from b **/
+template<int imm>
+inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
+{
+    // alignr byte offsets for imm < 16 and imm > 16; masked with 0xFF (presumably so the unused branch still gets a valid immediate)
+    enum {SHIFT_LO = (16 - imm) & 0xFF, SHIFT_HI = (32 - imm) & 0xFF};
+
+    if (imm == 0)  return a;
+    if (imm == 32) return b;
+    if (imm > 32)  return v_uint8x32();
+    // bridge = [low 128 bits: high half of b, high 128 bits: low half of a]
+    const __m256i bridge = _mm256_permute2x128_si256(a.val, b.val, 0x03);
+    if (imm == 16) return v_uint8x32(bridge);
+    return imm < 16 ? v_uint8x32(_mm256_alignr_epi8(a.val, bridge, SHIFT_LO))
+                    : v_uint8x32(_mm256_alignr_epi8(bridge, b.val, SHIFT_HI)); // 16 < imm < 32
+}
+
+template<int imm> // two-vector right rotate: a moves down by imm bytes and the low bytes of b refill the top
+inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
+{
+    enum {SHIFT_HI = (imm - 16) & 0xFF}; // alignr byte offset used when imm > 16
+    if (imm == 0)  return a;
+    if (imm == 32) return b;
+    if (imm > 32)  return v_uint8x32();
+
+    // bridge = [low 128 bits: high half of a, high 128 bits: low half of b]
+    const __m256i bridge = _mm256_permute2x128_si256(a.val, b.val, 0x21);
+    if (imm == 16) return v_uint8x32(bridge);
+    return imm < 16 ? v_uint8x32(_mm256_alignr_epi8(bridge, a.val, imm))
+                    : v_uint8x32(_mm256_alignr_epi8(b.val, bridge, SHIFT_HI));
+}
+
+template<int imm> // single-vector left rotate: bytes move up by imm and the vacated low bytes become zero
+inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
+{
+    // per-128-bit byte offsets for the imm > 16 and imm < 16 paths
+    enum {SHIFT_GT16 = (imm - 16) & 0xFF, SHIFT_LT16 = (16 - imm) & 0xFF};
+
+    if (imm == 0) return a;
+    if (imm > 32) return v_uint8x32();
+
+    // control bit 3 zeroes the low 128 bits; the high 128 bits receive the low half of a
+    const __m256i lifted = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
+    if (imm == 16) return v_uint8x32(lifted);
+    return imm < 16 ? v_uint8x32(_mm256_alignr_epi8(a.val, lifted, SHIFT_LT16))
+                    : v_uint8x32(_mm256_slli_si256(lifted, SHIFT_GT16));
+}
+
+template<int imm> // single-vector right rotate: bytes move down by imm and the vacated high bytes become zero
+inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
+{
+    enum {SHIFT_GT16 = (imm - 16) & 0xFF}; // per-128-bit byte offset for the imm > 16 path
+
+    if (imm == 0) return a;
+    if (imm > 32) return v_uint8x32();
+
+    // control bit 7 zeroes the high 128 bits; the low 128 bits receive the high half of a
+    const __m256i dropped = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
+    if (imm == 16) return v_uint8x32(dropped);
+    return imm < 16 ? v_uint8x32(_mm256_alignr_epi8(dropped, a.val, imm))
+                    : v_uint8x32(_mm256_srli_si256(dropped, SHIFT_GT16));
+}
+
+#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast)     \
+    template<int imm>                                             \
+    inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b)        \
+    {                                                             \
+        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)};  \
+        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a),    \
+                                       v_reinterpret_as_u8(b));   \
+        return _Tpvec(cast(ret.val));                             \
+    }                                                             \
+    template<int imm>                                             \
+    inline _Tpvec intrin(const _Tpvec& a)                         \
+    {                                                             \
+        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)};  \
+        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a));   \
+        return _Tpvec(cast(ret.val));                             \
+    }
+// ROTATE_CAST forwards to the byte-vector rotate with imm scaled to bytes (IMMxW) and applies cast to the raw result.
+#define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec)                                  \
+    OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left,  _Tpvec, OPENCV_HAL_NOP) \
+    OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)
+// Integer vector types pass OPENCV_HAL_NOP as the cast (presumably an identity macro; it is defined outside this section).
+OPENCV_HAL_IMPL_AVX_ROTATE(v_int8x32)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_uint16x16)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_int16x16)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_uint32x8)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_int32x8)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_uint64x4)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_int64x4)
+// Float vector types convert the integer register back with the castsi256 intrinsics.
+OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left,  v_float32x8, _mm256_castsi256_ps)
+OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps)
+OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left,  v_float64x4, _mm256_castsi256_pd)
+OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd)
+
+/** Reverse: return the lanes of a in the opposite order **/
+inline v_uint8x32 v_reverse(const v_uint8x32 &a)
+{
+    // reverse the bytes inside each 128-bit half, then exchange the two halves
+    static const __m256i byte_order = _mm256_setr_epi8(
+            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    const __m256i halves = _mm256_shuffle_epi8(a.val, byte_order);
+    return v_uint8x32(_mm256_permute2x128_si256(halves, halves, 1));
+}
+
+inline v_int8x32 v_reverse(const v_int8x32 &a) // same bits, routed through the unsigned overload
+{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
+
+inline v_uint16x16 v_reverse(const v_uint16x16 &a)
+{
+    // same two-step scheme, moving byte pairs so that every 16-bit lane stays intact
+    static const __m256i pair_order = _mm256_setr_epi8(
+            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
+            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+    const __m256i halves = _mm256_shuffle_epi8(a.val, pair_order);
+    return v_uint16x16(_mm256_permute2x128_si256(halves, halves, 1));
+}
+
+inline v_int16x16 v_reverse(const v_int16x16 &a) // routed through the unsigned 16-bit overload
+{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
+
+inline v_uint32x8 v_reverse(const v_uint32x8 &a)
+{
+    static const __m256i lane_order = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+    return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, lane_order)); // one permute spanning all eight lanes
+}
+
+inline v_int32x8 v_reverse(const v_int32x8 &a) // routed through the unsigned 32-bit overload
+{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_float32x8 v_reverse(const v_float32x8 &a) // float lanes are reordered as 32-bit integers
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_uint64x4 v_reverse(const v_uint64x4 &a) // immediate selects source lanes 3, 2, 1, 0 for destinations 0..3
+{ return v_uint64x4(_mm256_permute4x64_epi64(a.val, _MM_SHUFFLE(0, 1, 2, 3))); }
+
+inline v_int64x4 v_reverse(const v_int64x4 &a) // routed through the unsigned 64-bit overload
+{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
+
+inline v_float64x4 v_reverse(const v_float64x4 &a) // double lanes are reordered as 64-bit integers
+{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
+
+////////// Reduce and mask /////////
+
+/** Reduce: fold all lanes of a vector into one scalar **/
+inline unsigned v_reduce_sum(const v_uint8x32& a)
+{
+    const __m256i sad4 = _mm256_sad_epu8(a.val, _mm256_setzero_si256()); // SAD against zero: one partial sum per 64-bit lane
+    const __m128i sad2 = _mm_add_epi32(_v256_extract_low(sad4), _v256_extract_high(sad4));
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(sad2, _mm_unpackhi_epi64(sad2, sad2)));
+}
+inline int v_reduce_sum(const v_int8x32& a)
+{
+    const __m256i sad4 = _mm256_sad_epu8(_mm256_xor_si256(a.val, _mm256_set1_epi8((schar)-128)), _mm256_setzero_si256()); // sign bit flipped: every lane summed as value + 128
+    const __m128i sad2 = _mm_add_epi32(_v256_extract_low(sad4), _v256_extract_high(sad4));
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(sad2, _mm_unpackhi_epi64(sad2, sad2))) - 4096; // 4096 = 32 lanes * 128
+}
+#define OPENCV_HAL_IMPL_AVX_REDUCE_32(_Tpvec, sctype, func, intrin) \
+    inline sctype v_reduce_##func(const _Tpvec& a) \
+    { \
+        __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \
+        val = intrin(val, _mm_srli_si128(val,8)); \
+        val = intrin(val, _mm_srli_si128(val,4)); \
+        val = intrin(val, _mm_srli_si128(val,2)); \
+        val = intrin(val, _mm_srli_si128(val,1)); \
+        return (sctype)_mm_cvtsi128_si32(val); \
+    }
+// 8-bit lanes: combine the two halves, then fold with byte shifts of 8, 4, 2 and 1 so that byte 0 holds the result.
+OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, min, _mm_min_epu8)
+OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32,  schar, min, _mm_min_epi8)
+OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, max, _mm_max_epu8)
+OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32,  schar, max, _mm_max_epi8)
+
+#define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
+    inline sctype v_reduce_##func(const _Tpvec& a)                  \
+    {                                                               \
+        __m128i v0 = _v256_extract_low(a.val);                      \
+        __m128i v1 = _v256_extract_high(a.val);                     \
+        v0 = intrin(v0, v1);                                        \
+        v0 = intrin(v0, _mm_srli_si128(v0, 8));                     \
+        v0 = intrin(v0, _mm_srli_si128(v0, 4));                     \
+        v0 = intrin(v0, _mm_srli_si128(v0, 2));                     \
+        return (sctype) _mm_cvtsi128_si32(v0);                      \
+    }
+// 16-bit lanes: combine the two halves, then fold with byte shifts of 8, 4 and 2; the cast keeps the low 16-bit lane.
+OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, min, _mm_min_epu16)
+OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16,  short,  min, _mm_min_epi16)
+OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, max, _mm_max_epu16)
+OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16,  short,  max, _mm_max_epi16)
+
+#define OPENCV_HAL_IMPL_AVX_REDUCE_8(_Tpvec, sctype, func, intrin) \
+    inline sctype v_reduce_##func(const _Tpvec& a)                 \
+    {                                                              \
+        __m128i v0 = _v256_extract_low(a.val);                     \
+        __m128i v1 = _v256_extract_high(a.val);                    \
+        v0 = intrin(v0, v1);                                       \
+        v0 = intrin(v0, _mm_srli_si128(v0, 8));                    \
+        v0 = intrin(v0, _mm_srli_si128(v0, 4));                    \
+        return (sctype) _mm_cvtsi128_si32(v0);                     \
+    }
+// 32-bit lanes: combine the two halves, then fold with byte shifts of 8 and 4; lane 0 is returned.
+OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, min, _mm_min_epu32)
+OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8,  int,      min, _mm_min_epi32)
+OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, max, _mm_max_epu32)
+OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8,  int,      max, _mm_max_epi32)
+
+#define OPENCV_HAL_IMPL_AVX_REDUCE_FLT(func, intrin)                  \
+    inline float v_reduce_##func(const v_float32x8& a)                \
+    {                                                                 \
+        __m128 v0 = _v256_extract_low(a.val);                         \
+        __m128 v1 = _v256_extract_high(a.val);                        \
+        v0 = intrin(v0, v1);                                          \
+        v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 3, 2))); \
+        v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 1))); \
+        return _mm_cvtss_f32(v0);                                     \
+    }
+// Float lanes: after combining the halves, the permutes bring elements 2,3 and then element 1 down to be folded into element 0.
+OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
+OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)
+
+inline int v_reduce_sum(const v_int32x8& a) // sum of all eight 32-bit lanes
+{
+    // two horizontal adds leave each 128-bit half holding the sum of its own four lanes
+    const __m256i pairs = _mm256_hadd_epi32(a.val, a.val);
+    const __m256i quads = _mm256_hadd_epi32(pairs, pairs);
+
+    // add the upper half's total onto the lower half's and read lane 0
+    const __m128i upper = _v256_extract_high(quads);
+    return _mm_cvtsi128_si32(_mm_add_epi32(_v256_extract_low(quads), upper));
+}
+
+inline unsigned v_reduce_sum(const v_uint32x8& a) // reuses the signed 32-bit sum; the int result converts to unsigned on return
+{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
+
+inline int v_reduce_sum(const v_int16x16& a) // adds the two expanded halves first (v_expand_low/high presumably widen to 32-bit lanes)
+{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+inline unsigned v_reduce_sum(const v_uint16x16& a) // unsigned counterpart of the overload above
+{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+
+inline float v_reduce_sum(const v_float32x8& a) // sum of all eight float lanes
+{
+    // two horizontal adds leave each 128-bit half holding the sum of its own four lanes
+    const __m256 pairs = _mm256_hadd_ps(a.val, a.val);
+    const __m256 quads = _mm256_hadd_ps(pairs, pairs);
+
+    // add the upper half's total onto the lower half's and read element 0
+    const __m128 upper = _v256_extract_high(quads);
+    return _mm_cvtss_f32(_mm_add_ps(_v256_extract_low(quads), upper));
+}
+
+inline uint64 v_reduce_sum(const v_uint64x4& a)
+{
+    uint64 CV_DECL_ALIGNED(32) pair[2]; // receives lane0+lane2 and lane1+lane3
+    _mm_store_si128((__m128i*)pair, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
+    return pair[0] + pair[1];
+}
+inline int64 v_reduce_sum(const v_int64x4& a)
+{
+    int64 CV_DECL_ALIGNED(32) pair[2]; // signed variant: add the two halves, then the two remaining lanes
+    _mm_store_si128((__m128i*)pair, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
+    return pair[0] + pair[1];
+}
+inline double v_reduce_sum(const v_float64x4& a)
+{
+    const __m256d pairs = _mm256_hadd_pd(a.val, a.val); // each 128-bit half now holds the sum of its two lanes
+    return _mm_cvtsd_f64(_mm_add_pd(_v256_extract_low(pairs), _v256_extract_high(pairs)));
+}
+
+inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
+                                 const v_float32x8& c, const v_float32x8& d)
+{
+    const __m256 sums_ab = _mm256_hadd_ps(a.val, b.val); // pairwise sums of a and b, per 128-bit half
+    const __m256 sums_cd = _mm256_hadd_ps(c.val, d.val); // pairwise sums of c and d, per 128-bit half
+    return v_float32x8(_mm256_hadd_ps(sums_ab, sums_cd)); // each half: [sum a, sum b, sum c, sum d] over that half's four lanes
+}
+
+inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
+{
+    const __m256i sad4 = _mm256_sad_epu8(a.val, b.val); // one partial sum of |a - b| per 64-bit lane
+    const __m128i sad2 = _mm_add_epi32(_v256_extract_low(sad4), _v256_extract_high(sad4));
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(sad2, _mm_unpackhi_epi64(sad2, sad2)));
+}
+inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b) // sum of |a[i] - b[i]| over the 32 signed byte lanes
+{
+    __m256i half = _mm256_set1_epi8((schar)-128); // sign-bit flip maps [-128,127] onto [0,255] in order; adding 0x7f wrapped -128 to 255
+    half = _mm256_sad_epu8(_mm256_xor_si256(a.val, half), _mm256_xor_si256(b.val, half)); // unsigned SAD of the re-biased lanes
+    __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
+}
+inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
+{
+    v_uint32x8 lo, hi; // filled by v_expand with the two halves of the 16-bit differences
+    v_expand(v_add_wrap(a - b, b - a), lo, hi);
+    return v_reduce_sum(lo + hi);
+}
+inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
+{
+    v_uint32x8 lo, hi;
+    v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), lo, hi); // max - min with wrap-around, read as unsigned
+    return v_reduce_sum(lo + hi);
+}
+inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b) // larger lane minus smaller lane, then summed
+{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); }
+inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
+{
+    const v_int32x8 negate = a < b; // comparison mask for the lanes where a < b
+    const v_int32x8 diff = a - b;
+    return v_reduce_sum(v_reinterpret_as_u32((diff ^ negate) - negate)); // (d ^ m) - m equals -d where m is all ones
+}
+inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
+{
+    const v_float32x8 abs_mask(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); // every bit except the sign bit
+    return v_reduce_sum((a - b) & abs_mask);
+}
+
+/** Popcount: number of set bits in every lane **/
+inline v_uint8x32 v_popcount(const v_uint8x32& a)
+{
+    // set-bit count of each nibble value 0..15, repeated for the second 128-bit half of the shuffle
+    const __m256i nibble_bits = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+                                                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
+    const __m256i low4 = _mm256_set1_epi8(0x0F);
+    const __m256i lo_cnt = _mm256_shuffle_epi8(nibble_bits, _mm256_and_si256(a.val, low4));
+    const __m256i hi_cnt = _mm256_shuffle_epi8(nibble_bits, _mm256_and_si256(_mm256_srli_epi16(a.val, 4), low4));
+    return v_uint8x32(_mm256_add_epi8(lo_cnt, hi_cnt));
+}
+inline v_uint16x16 v_popcount(const v_uint16x16& a)
+{
+    v_uint8x32 counts = v_popcount(v_reinterpret_as_u8(a));
+    counts += v_rotate_right<1>(counts); // byte 0 of each lane now also holds byte 1's count
+    return v_reinterpret_as_u16(counts) & v256_setall_u16(0x00ff);
+}
+inline v_uint32x8 v_popcount(const v_uint32x8& a)
+{
+    v_uint8x32 counts = v_popcount(v_reinterpret_as_u8(a));
+    counts += v_rotate_right<1>(counts);
+    counts += v_rotate_right<2>(counts); // byte 0 of each lane now holds the sum of all four bytes
+    return v_reinterpret_as_u32(counts) & v256_setall_u32(0x000000ff);
+}
+inline v_uint64x4 v_popcount(const v_uint64x4& a) // SAD against zero adds up the eight byte counts of each 64-bit lane
+{ return v_uint64x4(_mm256_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm256_setzero_si256())); }
+inline v_uint8x32 v_popcount(const v_int8x32& a) // signed inputs are counted through the unsigned overload of the same width
+{ return v_popcount(v_reinterpret_as_u8(a)); }
+inline v_uint16x16 v_popcount(const v_int16x16& a) // result type is the unsigned vector
+{ return v_popcount(v_reinterpret_as_u16(a)); }
+inline v_uint32x8 v_popcount(const v_int32x8& a) // result type is the unsigned vector
+{ return v_popcount(v_reinterpret_as_u32(a)); }
+inline v_uint64x4 v_popcount(const v_int64x4& a) // result type is the unsigned vector
+{ return v_popcount(v_reinterpret_as_u64(a)); }
+
+/** Mask: v_signmask packs the top bit of every lane into the low bits of an int **/
+inline int v_signmask(const v_int8x32& a) // one bit per byte lane, 32 bits in total
+{ return _mm256_movemask_epi8(a.val); }
+inline int v_signmask(const v_uint8x32& a)
+{ return v_signmask(v_reinterpret_as_s8(a)); }
+
+inline int v_signmask(const v_int16x16& a) // goes through v_pack(a, a) and the byte mask; only the low 16 bits are kept
+{ return v_signmask(v_pack(a, a)) & 0xFFFF; }
+inline int v_signmask(const v_uint16x16& a)
+{ return v_signmask(v_reinterpret_as_s16(a)); }
+
+inline int v_signmask(const v_float32x8& a) // one bit per 32-bit lane
+{ return _mm256_movemask_ps(a.val); }
+inline int v_signmask(const v_float64x4& a) // one bit per 64-bit lane
+{ return _mm256_movemask_pd(a.val); }
+
+inline int v_signmask(const v_int32x8& a) // 32-bit integers reuse the float movemask
+{ return v_signmask(v_reinterpret_as_f32(a)); }
+inline int v_signmask(const v_uint32x8& a)
+{ return v_signmask(v_reinterpret_as_f32(a)); }
+
+inline int v_signmask(const v_int64x4& a) // 64-bit integers reuse the double movemask
+{ return v_signmask(v_reinterpret_as_f64(a)); }
+inline int v_signmask(const v_uint64x4& a)
+{ return v_signmask(v_reinterpret_as_f64(a)); }
+
+inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } // trailingZeros32 (presumably a count of trailing zero bits) applied to the byte-level sign mask
+inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } // wider lanes: byte position divided by the lane size in bytes
+inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_int32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_uint32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_int64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+
+/** Checks: v_check_all is true when the sign mask equals allmask (one bit per lane); v_check_any when the mask is non-zero **/
+#define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, allmask) \
+    inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
+    inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
+OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, -1)
+OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, -1)
+OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, 255)
+OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, 255)
+OPENCV_HAL_IMPL_AVX_CHECK(v_uint64x4, 15)
+OPENCV_HAL_IMPL_AVX_CHECK(v_int64x4, 15)
+OPENCV_HAL_IMPL_AVX_CHECK(v_float32x8, 255)
+OPENCV_HAL_IMPL_AVX_CHECK(v_float64x4, 15)
+// 16-bit lanes use the byte-level mask; 0xaaaaaaaa keeps the odd-numbered bits, i.e. the upper byte of every 16-bit lane.
+#define OPENCV_HAL_IMPL_AVX_CHECK_SHORT(_Tpvec)  \
+    inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; } \
+    inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; }
+OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_uint16x16)
+OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16)
+
+////////// Other math /////////
+
+/** Some frequent operations: v_fma and v_muladd compute a * b + c (one fused fmadd when CV_FMA3 is set, otherwise multiply then add) **/
+#if CV_FMA3
+#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix)                            \
+    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)    \
+    { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); }            \
+    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+    { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); }
+#else
+#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix)                                    \
+    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)            \
+    { return _Tpvec(_mm256_add_##suffix(_mm256_mul_##suffix(a.val, b.val), c.val)); } \
+    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)         \
+    { return _Tpvec(_mm256_add_##suffix(_mm256_mul_##suffix(a.val, b.val), c.val)); }
+#endif
+// MISC adds v_sqrt, v_sqr_magnitude (a*a + b*b) and v_magnitude (the square root of that sum), built on v_fma.
+#define OPENCV_HAL_IMPL_AVX_MISC(_Tpvec, suffix)                              \
+    inline _Tpvec v_sqrt(const _Tpvec& x)                                     \
+    { return _Tpvec(_mm256_sqrt_##suffix(x.val)); }                           \
+    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b)           \
+    { return v_fma(a, a, b * b); }                                            \
+    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)               \
+    { return v_sqrt(v_fma(a, a, b*b)); }
+// MULADD is instantiated before MISC because the MISC bodies call v_fma.
+OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
+OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
+OPENCV_HAL_IMPL_AVX_MISC(v_float32x8, ps)
+OPENCV_HAL_IMPL_AVX_MISC(v_float64x4, pd)
+
+inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) // integer a * b + c via the vector operators
+{
+    return a * b + c;
+}
+
+inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) // alias of the integer v_fma
+{
+    return v_fma(a, b, c);
+}
+
+inline v_float32x8 v_invsqrt(const v_float32x8& x)
+{
+    // rsqrt estimate y, improved once as y * (1.5 - (y * y) * (0.5 * x))
+    const v_float32x8 half_x = x * v256_setall_f32(0.5);
+    v_float32x8 est(_mm256_rsqrt_ps(x.val));
+    // todo: _mm256_fnmsub_ps
+    est *= v256_setall_f32(1.5) - ((est * est) * half_x);
+    return est;
+}
+
+// double precision: plain division of 1 by the square root
+inline v_float64x4 v_invsqrt(const v_float64x4& x)
+{ return v256_setall_f64(1.) / v_sqrt(x); }
+
+/** Absolute values: the integer overloads return the unsigned vector type; the float overloads clear the sign bit **/
+#define OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix)         \
+    inline v_u##_Tpvec v_abs(const v_##_Tpvec& x)       \
+    { return v_u##_Tpvec(_mm256_abs_##suffix(x.val)); }
+
+OPENCV_HAL_IMPL_AVX_ABS(int8x32,  epi8)
+OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16)
+OPENCV_HAL_IMPL_AVX_ABS(int32x8,  epi32)
+
+inline v_float32x8 v_abs(const v_float32x8& x) // AND with 0x7fffffff in every lane
+{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); }
+inline v_float64x4 v_abs(const v_float64x4& x) // all-ones shifted right by one gives the 64-bit mask without the sign bit
+{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); }
+
+/** Absolute difference: |a - b| per lane; signed integer inputs produce the unsigned vector type **/
+inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b) // combines a - b and b - a (presumably saturating, so one of them is zero)
+{ return v_add_wrap(a - b,  b - a); }
+inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b) // same scheme for 16-bit lanes
+{ return v_add_wrap(a - b,  b - a); }
+inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b) // larger lane minus smaller lane
+{ return v_max(a, b) - v_min(a, b); }
+
+inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
+{
+    const v_int8x32 diff = v_sub_wrap(a, b);
+    const v_int8x32 negate = a < b; // comparison mask for the lanes where a < b
+    return v_reinterpret_as_u8(v_sub_wrap(diff ^ negate, negate)); // (d ^ m) - m negates d where m is all ones
+}
+
+inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b) // max - min with wrap-around, read as unsigned
+{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
+
+inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
+{
+    const v_int32x8 diff = a - b;
+    const v_int32x8 negate = a < b; // comparison mask for the lanes where a < b
+    return v_reinterpret_as_u32((diff ^ negate) - negate);
+}
+
+inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b) // v_abs of the plain difference
+{ return v_abs(a - b); }
+
+inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b) // v_abs of the plain difference
+{ return v_abs(a - b); }
+
+/** Saturating absolute difference: the result keeps the signed input type **/
+inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
+{
+    const v_int8x32 diff = a - b;
+    const v_int8x32 negate = a < b; // comparison mask for the lanes where a < b
+    return (diff ^ negate) - negate; // (d ^ m) - m negates d where m is all ones
+}
+inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b) // larger lane minus smaller lane
+{ return v_max(a, b) - v_min(a, b); }
+
+////////// Conversions /////////
+
+/** Rounding: float/double to 32-bit integer lanes (round = cvt, trunc = cvtt, floor/ceil = round first, then truncate) **/
+inline v_int32x8 v_round(const v_float32x8& a)
+{ return v_int32x8(_mm256_cvtps_epi32(a.val)); }
+
+inline v_int32x8 v_round(const v_float64x4& a) // only the low four lanes carry results; the upper 128 bits come from a cast and are not set
+{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); }
+
+inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b) // a fills the low four lanes, b the high four
+{
+    __m128i ai = _mm256_cvtpd_epi32(a.val), bi = _mm256_cvtpd_epi32(b.val);
+    return v_int32x8(_v256_combine(ai, bi));
+}
+
+inline v_int32x8 v_trunc(const v_float32x8& a)
+{ return v_int32x8(_mm256_cvttps_epi32(a.val)); }
+
+inline v_int32x8 v_trunc(const v_float64x4& a) // as with v_round(double): only the low four lanes are set
+{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvttpd_epi32(a.val))); }
+
+inline v_int32x8 v_floor(const v_float32x8& a)
+{ return v_int32x8(_mm256_cvttps_epi32(_mm256_floor_ps(a.val))); }
+
+inline v_int32x8 v_floor(const v_float64x4& a) // floor in double precision, then the double v_trunc
+{ return v_trunc(v_float64x4(_mm256_floor_pd(a.val))); }
+
+inline v_int32x8 v_ceil(const v_float32x8& a)
+{ return v_int32x8(_mm256_cvttps_epi32(_mm256_ceil_ps(a.val))); }
+
+inline v_int32x8 v_ceil(const v_float64x4& a) // ceil in double precision, then the double v_trunc
+{ return v_trunc(v_float64x4(_mm256_ceil_pd(a.val))); }
+
+/** To float: conversions to single and double precision vectors **/
+inline v_float32x8 v_cvt_f32(const v_int32x8& a)
+{ return v_float32x8(_mm256_cvtepi32_ps(a.val)); }
+
+inline v_float32x8 v_cvt_f32(const v_float64x4& a) // only the low four lanes carry results; the upper 128 bits come from a cast and are not set
+{ return v_float32x8(_mm256_castps128_ps256(_mm256_cvtpd_ps(a.val))); }
+
+inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b) // a fills the low four lanes, b the high four
+{
+    __m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val);
+    return v_float32x8(_v256_combine(af, bf));
+}
+
+inline v_float64x4 v_cvt_f64(const v_int32x8& a) // converts the low four lanes
+{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_low(a.val))); }
+
+inline v_float64x4 v_cvt_f64_high(const v_int32x8& a) // converts the high four lanes
+{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_high(a.val))); }
+
+inline v_float64x4 v_cvt_f64(const v_float32x8& a) // converts the low four lanes
+{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_low(a.val))); }
+
+inline v_float64x4 v_cvt_f64_high(const v_float32x8& a) // converts the high four lanes
+{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); }
+
+// int64 -> double conversion; from (Mysticial and wim) https://stackoverflow.com/q/41144668
+inline v_float64x4 v_cvt_f64(const v_int64x4& v)
+{
+    // constants are the bit patterns of doubles, so integer payloads can be dropped into their mantissas
+    __m256i magic_i_lo   = _mm256_set1_epi64x(0x4330000000000000); // 2^52
+    __m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
+    __m256i magic_i_all  = _mm256_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
+    __m256d magic_d_all  = _mm256_castsi256_pd(magic_i_all);
+
+    // Blend the 32 least significant bits of v with magic_i_lo (mask 0x55 takes the even 32-bit elements from v)
+    __m256i v_lo         = _mm256_blend_epi32(magic_i_lo, v.val, 0x55);
+    // Extract the 32 most significant bits of v
+    __m256i v_hi         = _mm256_srli_epi64(v.val, 32);
+    // Flip the msb of v_hi and blend with 0x45300000
+            v_hi         = _mm256_xor_si256(v_hi, magic_i_hi32);
+    // Compute in double precision
+    __m256d v_hi_dbl     = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all);
+    // (v_hi - magic_d_all) + v_lo  Do not assume associativity of floating point addition
+    __m256d result       = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo));
+    return v_float64x4(result);
+}
+
+////////////// Lookup table access ////////////////////
+
+inline v_int8x32 v256_lut(const schar* tab, const int* idx)
+{
+    return v_int8x32(_mm256_setr_epi8(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
+                                      tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]],
+                                      tab[idx[16]], tab[idx[17]], tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]],
+                                      tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]], tab[idx[30]], tab[idx[31]]));
+}
+inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx)
+{
+    return v_int8x32(_mm256_setr_epi16(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]), *(const short*)(tab + idx[ 3]),
+                                       *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]), *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]),
+                                       *(const short*)(tab + idx[ 8]), *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]),
+                                       *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]), *(const short*)(tab + idx[15])));
+}
+inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx)
+{   // Hardware gather of eight 32-bit words (four bytes each); scale 1 makes every idx[k] a byte offset from tab.
+    return v_int8x32(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 1));
+}
+inline v_uint8x32 v256_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); } // unsigned overload: forwards to the schar version
+inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); } // unsigned overload: forwards to the schar version
+inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); } // unsigned overload: forwards to the schar version
+
+inline v_int16x16 v256_lut(const short* tab, const int* idx)
+{   // Scalar gather: result element k is tab[idx[k]] for k = 0..15.
+    return v_int16x16(_mm256_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
+                                        tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
+}
+inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx)
+{   // Hardware gather of eight 32-bit words (two shorts each); scale 2 turns idx[k] into an offset counted in shorts.
+    return v_int16x16(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 2));
+}
+inline v_int16x16 v256_lut_quads(const short* tab, const int* idx)
+{   // Gathers four 64-bit words (four shorts each) at tab + idx[k]; only idx[0..3] are loaded.
+#if defined(__GNUC__)
+    return v_int16x16(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 2));// Original note: the intrinsic seems to have a wrong definition here, hence the long long int* cast
+#else
+    return v_int16x16(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 2));
+#endif
+}
+inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); } // unsigned overload: forwards to the short version
+inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); } // unsigned overload: forwards to the short version
+inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); } // unsigned overload: forwards to the short version
+
+inline v_int32x8 v256_lut(const int* tab, const int* idx)
+{   // Hardware gather: result element k is tab[idx[k]] for k = 0..7 (scale 4 = sizeof(int)).
+    return v_int32x8(_mm256_i32gather_epi32(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
+}
+inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
+{   // Gathers four 64-bit words (two ints each) starting at tab + idx[k]; only idx[0..3] are loaded.
+#if defined(__GNUC__)
+    return v_int32x8(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
+#else
+    return v_int32x8(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
+#endif
+}
+inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
+{   // Two unaligned 128-bit loads (four ints each) at tab + idx[0] and tab + idx[1], joined by _v256_combine (presumably low half first).
+    return v_int32x8(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
+}
+inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); } // unsigned overload: forwards to the int version
+inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); } // unsigned overload: forwards to the int version
+inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); } // unsigned overload: forwards to the int version
+
+inline v_int64x4 v256_lut(const int64* tab, const int* idx)
+{   // Hardware gather: result element k is tab[idx[k]] for k = 0..3 (scale 8 = size of a 64-bit element).
+#if defined(__GNUC__)
+    return v_int64x4(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 8));
+#else
+    return v_int64x4(_mm256_i32gather_epi64(tab, _mm_loadu_si128((const __m128i*)idx), 8));
+#endif
+}
+inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx)
+{   // Two unaligned 128-bit loads (two 64-bit elements each) at tab + idx[0] and tab + idx[1], joined by _v256_combine.
+    return v_int64x4(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
+}
+inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); } // unsigned overload: forwards to the int64 version
+inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); } // unsigned overload: forwards to the int64 version
+
+inline v_float32x8 v256_lut(const float* tab, const int* idx)
+{   // Hardware gather: result element k is tab[idx[k]] for k = 0..7.
+    return v_float32x8(_mm256_i32gather_ps(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
+}
+inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_pairs((const int *)tab, idx)); } // forwards to the int version through pointer and vector reinterpretation
+inline v_float32x8 v256_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_quads((const int *)tab, idx)); } // forwards to the int version through pointer and vector reinterpretation
+
+inline v_float64x4 v256_lut(const double* tab, const int* idx)
+{   // Hardware gather: result element k is tab[idx[k]] for k = 0..3; only four indices are loaded.
+    return v_float64x4(_mm256_i32gather_pd(tab, _mm_loadu_si128((const __m128i*)idx), 8));
+}
+inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_v256_combine(_mm_loadu_pd(tab + idx[0]), _mm_loadu_pd(tab + idx[1]))); } // two unaligned loads of two doubles each, at tab + idx[0] and tab + idx[1]
+
+inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
+{   // Gather with the indices already in a vector register: result element k is tab[idxvec[k]].
+    return v_int32x8(_mm256_i32gather_epi32(tab, idxvec.val, 4));
+}
+
+inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec)
+{   // Unsigned overload: forwards to the int version and reinterprets the result.
+    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
+}
+
+inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec)
+{   // Gather of eight floats: result element k is tab[idxvec[k]].
+    return v_float32x8(_mm256_i32gather_ps(tab, idxvec.val, 4));
+}
+
+inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec)
+{   // Gather of four doubles; only the low 128 bits of idxvec (its first four indices) are used.
+    return v_float64x4(_mm256_i32gather_pd(tab, _mm256_castsi256_si128(idxvec.val), 8));
+}
+
+inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
+{   // For each of the eight indices, loads the two adjacent floats at tab + idx[k] and splits them between x and y.
+    int CV_DECL_ALIGNED(32) idx[8];
+    v_store_aligned(idx, idxvec);                                 // spill the indices so they can serve as scalar offsets
+    __m128 z = _mm_setzero_ps();
+    __m128 xy01, xy45, xy23, xy67;
+    xy01 = _mm_loadl_pi(z, (const __m64*)(tab + idx[0]));         // low 64 bits  <- pair at idx[0]
+    xy01 = _mm_loadh_pi(xy01, (const __m64*)(tab + idx[1]));      // high 64 bits <- pair at idx[1]
+    xy45 = _mm_loadl_pi(z, (const __m64*)(tab + idx[4]));
+    xy45 = _mm_loadh_pi(xy45, (const __m64*)(tab + idx[5]));
+    __m256 xy0145 = _v256_combine(xy01, xy45);                    // presumably xy01 in the low half, xy45 in the high half
+    xy23 = _mm_loadl_pi(z, (const __m64*)(tab + idx[2]));
+    xy23 = _mm_loadh_pi(xy23, (const __m64*)(tab + idx[3]));
+    xy67 = _mm_loadl_pi(z, (const __m64*)(tab + idx[6]));
+    xy67 = _mm_loadh_pi(xy67, (const __m64*)(tab + idx[7]));
+    __m256 xy2367 = _v256_combine(xy23, xy67);
+    // Two rounds of per-half unpacking separate the first floats of the pairs from the second ones.
+    __m256 xxyy0145 = _mm256_unpacklo_ps(xy0145, xy2367);
+    __m256 xxyy2367 = _mm256_unpackhi_ps(xy0145, xy2367);
+    // Under the _v256_combine assumption above: x[k] = tab[idx[k]], y[k] = tab[idx[k] + 1].
+    x = v_float32x8(_mm256_unpacklo_ps(xxyy0145, xxyy2367));
+    y = v_float32x8(_mm256_unpackhi_ps(xxyy0145, xxyy2367));
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y)
+{   // Loads the two adjacent doubles at tab + offsets[k] for four indices and splits them between x and y.
+    int CV_DECL_ALIGNED(32) offsets[4];
+    v_store_low(offsets, idxvec);
+    const __m128d pair0 = _mm_loadu_pd(tab + offsets[0]);
+    const __m128d pair1 = _mm_loadu_pd(tab + offsets[1]);
+    const __m128d pair2 = _mm_loadu_pd(tab + offsets[2]);
+    const __m128d pair3 = _mm_loadu_pd(tab + offsets[3]);
+    // Group pairs 0/2 and 1/3 so that the per-half unpack below restores index order.
+    __m256d pairs02 = _v256_combine(pair0, pair2);
+    __m256d pairs13 = _v256_combine(pair1, pair3);
+    // unpacklo collects the first double of every pair, unpackhi the second.
+    x = v_float64x4(_mm256_unpacklo_pd(pairs02, pairs13));
+    y = v_float64x4(_mm256_unpackhi_pd(pairs02, pairs13));
+}
+
+inline v_int8x32 v_interleave_pairs(const v_int8x32& vec)
+{   // Within each 128-bit half the bytes are reordered to 0,2,1,3, 4,6,5,7, 8,10,9,11, 12,14,13,15.
+    return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200)));
+}
+inline v_uint8x32 v_interleave_pairs(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } // unsigned overload: forwards to the signed version
+inline v_int8x32 v_interleave_quads(const v_int8x32& vec)
+{   // Within each 128-bit half the bytes are reordered to 0,4,1,5, 2,6,3,7, 8,12,9,13, 10,14,11,15.
+    return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400)));
+}
+inline v_uint8x32 v_interleave_quads(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } // unsigned overload: forwards to the signed version
+
+inline v_int16x16 v_interleave_pairs(const v_int16x16& vec)
+{   // Within each 128-bit half the 16-bit elements are reordered to 0,2,1,3, 4,6,5,7 (done with a byte shuffle).
+    return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100)));
+}
+inline v_uint16x16 v_interleave_pairs(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } // unsigned overload: forwards to the signed version
+inline v_int16x16 v_interleave_quads(const v_int16x16& vec)
+{   // Within each 128-bit half the 16-bit elements are reordered to 0,4,1,5, 2,6,3,7 (done with a byte shuffle).
+    return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100)));
+}
+inline v_uint16x16 v_interleave_quads(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } // unsigned overload: forwards to the signed version
+
+inline v_int32x8 v_interleave_pairs(const v_int32x8& vec)
+{   // Within each 128-bit half the 32-bit elements are reordered to 0,2,1,3.
+    return v_int32x8(_mm256_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+inline v_uint32x8 v_interleave_pairs(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // unsigned overload: forwards to the signed version
+inline v_float32x8 v_interleave_pairs(const v_float32x8& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // float overload: forwards to the s32 version via reinterpretation
+
+inline v_int8x32 v_pack_triplets(const v_int8x32& vec)
+{   // Byte shuffle keeps bytes 0,1,2, 4,5,6, 8,9,10, 12,13,14 of each 128-bit half (every fourth byte is dropped);
+    return v_int8x32(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100))),
+                                                 _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); // dword order 0,1,2,4,5,6,7,7: both 12-byte groups become contiguous, the last 8 bytes are filler
+}
+inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } // unsigned overload: forwards to the signed version
+
+inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
+{   // Byte shuffle keeps 16-bit elements 0,1,2, 4,5,6 of each 128-bit half (elements 3 is dropped, 7 trails as filler);
+    return v_int16x16(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100))),
+                                                  _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); // dword order 0,1,2,4,5,6,7,7: both 12-byte groups become contiguous, the last 8 bytes are filler
+}
+inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } // unsigned overload: forwards to the signed version
+
+inline v_int32x8 v_pack_triplets(const v_int32x8& vec)
+{   // Cross-lane permute to element order 0,1,2,4,5,6,7,7: elements 3 is dropped, the last two slots are filler.
+    return v_int32x8(_mm256_permutevar8x32_epi32(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
+}
+inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); } // unsigned overload: forwards to the signed version
+inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
+{   // Same element order (0,1,2,4,5,6,7,7) using the float permute.
+    return v_float32x8(_mm256_permutevar8x32_ps(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
+}
+
+////////// Matrix operations /////////
+
+//////// Dot Product ////////
+
+// 16 >> 32
+inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
+{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); } // 32-bit lane k = a[2k]*b[2k] + a[2k+1]*b[2k+1]
+inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
+{ return v_dotprod(a, b) + c; } // same dot product with accumulator c added
+
+// 32 >> 64
+inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
+{   // 64-bit lane k = a[2k]*b[2k] + a[2k+1]*b[2k+1], each product widened to 64 bits.
+    const __m256i prod_odd  = _mm256_mul_epi32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
+    const __m256i prod_even = _mm256_mul_epi32(a.val, b.val);
+    return v_int64x4(_mm256_add_epi64(prod_even, prod_odd));
+}
+inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
+{ return v_dotprod(a, b) + c; } // same dot product with accumulator c added
+
+// 8 >> 32
+inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
+{   // Each 32-bit lane receives the sum of the four u8*u8 products that cover it.
+    const __m256i zero    = _mm256_setzero_si256();
+    const __m256i odd_sel = _mm256_set1_epi32(0xFF00FF00);          // marks the high byte of every 16-bit lane
+    // Zero-extend the even-positioned and the odd-positioned bytes into 16-bit lanes.
+    const __m256i a_even  = _mm256_blendv_epi8(a.val, zero, odd_sel);
+    const __m256i b_even  = _mm256_blendv_epi8(b.val, zero, odd_sel);
+    const __m256i a_odd   = _mm256_srli_epi16(a.val, 8);
+    const __m256i b_odd   = _mm256_srli_epi16(b.val, 8);
+    const __m256i sum_even = _mm256_madd_epi16(a_even, b_even);     // adjacent 16-bit products added into 32 bits
+    const __m256i sum_odd  = _mm256_madd_epi16(a_odd, b_odd);
+    return v_uint32x8(_mm256_add_epi32(sum_even, sum_odd));
+}
+inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
+{ return v_dotprod_expand(a, b) + c; } // same dot product with accumulator c added
+
+inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
+{   // Each 32-bit lane receives the sum of the four s8*s8 products that cover it.
+    // Odd-positioned bytes: an arithmetic shift sign-extends them into 16-bit lanes.
+    const __m256i a_odd  = _mm256_srai_epi16(a.val, 8);
+    const __m256i b_odd  = _mm256_srai_epi16(b.val, 8);
+    // Even-positioned bytes: move every byte up by one position first, then sign-extend the same way.
+    const __m256i a_even = _mm256_srai_epi16(_mm256_bslli_epi128(a.val, 1), 8);
+    const __m256i b_even = _mm256_srai_epi16(_mm256_bslli_epi128(b.val, 1), 8);
+    const __m256i sum_even = _mm256_madd_epi16(a_even, b_even);
+    const __m256i sum_odd  = _mm256_madd_epi16(a_odd, b_odd);
+    return v_int32x8(_mm256_add_epi32(sum_even, sum_odd));
+}
+inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
+{ return v_dotprod_expand(a, b) + c; } // same dot product with accumulator c added
+
+// 16 >> 64
+inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
+{   // Each 64-bit lane receives the sum of the four u16*u16 products that cover it.
+    const __m256i zero    = _mm256_setzero_si256();
+    const __m256i prod_lo = _mm256_mullo_epi16(a.val, b.val);
+    const __m256i prod_hi = _mm256_mulhi_epu16(a.val, b.val);
+
+    // Stitch low and high halves into full 32-bit products (per 128-bit half: elements 0..3, then 4..7).
+    const __m256i prod0123 = _mm256_unpacklo_epi16(prod_lo, prod_hi);
+    const __m256i prod4567 = _mm256_unpackhi_epi16(prod_lo, prod_hi);
+
+    // Zero-extend to 64 bits and add neighbouring products.
+    const __m256i sum01_23 = _mm256_add_epi64(_mm256_blend_epi32(prod0123, zero, 0xAA),
+                                              _mm256_srli_epi64(prod0123, 32));
+    const __m256i sum45_67 = _mm256_add_epi64(_mm256_blend_epi32(prod4567, zero, 0xAA),
+                                              _mm256_srli_epi64(prod4567, 32));
+    // Regroup so that each 64-bit lane adds the two partial sums that belong to it.
+    const __m256i first  = _mm256_unpacklo_epi64(sum01_23, sum45_67);
+    const __m256i second = _mm256_unpackhi_epi64(sum01_23, sum45_67);
+    return v_uint64x4(_mm256_add_epi64(first, second));
+}
+inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
+{ return v_dotprod_expand(a, b) + c; } // same dot product with accumulator c added
+
+inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
+{   // Each 64-bit lane receives the sum of the four s16*s16 products that cover it.
+    const __m256i pair_sums = _mm256_madd_epi16(a.val, b.val);   // 32-bit sums of adjacent products
+    const __m256i sign_bits = _mm256_srai_epi32(pair_sums, 31);  // all-ones where a sum is negative
+    // Sign-extend the 32-bit sums to 64 bits by interleaving them with their sign words.
+    const __m256i wide_lo = _mm256_unpacklo_epi32(pair_sums, sign_bits);
+    const __m256i wide_hi = _mm256_unpackhi_epi32(pair_sums, sign_bits);
+
+    // Regroup so that neighbouring pair sums meet in the same 64-bit lane, then add.
+    const __m256i first  = _mm256_unpacklo_epi64(wide_lo, wide_hi);
+    const __m256i second = _mm256_unpackhi_epi64(wide_lo, wide_hi);
+    return v_int64x4(_mm256_add_epi64(first, second));
+}
+inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
+{ return v_dotprod_expand(a, b) + c; } // same dot product with accumulator c added
+
+// 32 >> 64f
+inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
+{ return v_cvt_f64(v_dotprod(a, b)); } // 64-bit integer dot product, then converted with v_cvt_f64
+inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
+{ return v_dotprod_expand(a, b) + c; } // same, with accumulator c added in floating point
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32
+inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b)
+{ return v_dotprod(a, b); } // no separate fast path here: forwards to v_dotprod
+inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
+{ return v_dotprod(a, b, c); } // forwards to the accumulating v_dotprod
+
+// 32 >> 64
+inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b)
+{ return v_dotprod(a, b); } // no separate fast path here: forwards to v_dotprod
+inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
+{ return v_dotprod(a, b, c); } // forwards to the accumulating v_dotprod
+
+// 8 >> 32
+inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b)
+{ return v_dotprod_expand(a, b); } // no separate fast path here: forwards to v_dotprod_expand
+inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
+{ return v_dotprod_expand(a, b, c); } // forwards to the accumulating v_dotprod_expand
+
+inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b)
+{ return v_dotprod_expand(a, b); } // no separate fast path here: forwards to v_dotprod_expand
+inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
+{ return v_dotprod_expand(a, b, c); } // forwards to the accumulating v_dotprod_expand
+
+// 16 >> 64
+inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
+{   // Fast variant: sums the same u16*u16 products, but skips the final regrouping, so which products land in which 64-bit lane differs.
+    __m256i mullo = _mm256_mullo_epi16(a.val, b.val);              // low 16 bits of each product
+    __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);              // high 16 bits of each product
+    __m256i mul0  = _mm256_unpacklo_epi16(mullo, mulhi);           // full 32-bit products, elements 0..3 of each 128-bit half
+    __m256i mul1  = _mm256_unpackhi_epi16(mullo, mulhi);           // full 32-bit products, elements 4..7 of each 128-bit half
+
+    __m256i p02   = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);  // even products, zero-extended to 64 bits
+    __m256i p13   = _mm256_srli_epi64(mul0, 32);                             // odd products, zero-extended to 64 bits
+    __m256i p46   = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
+    __m256i p57   = _mm256_srli_epi64(mul1, 32);
+
+    __m256i p15_  = _mm256_add_epi64(p02, p13);
+    __m256i p9d_  = _mm256_add_epi64(p46, p57);
+
+    return v_uint64x4(_mm256_add_epi64(p15_, p9d_));               // partial sums added lane by lane, without unpacklo/unpackhi regrouping
+}
+inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
+{ return v_dotprod_expand_fast(a, b) + c; } // same, with accumulator c added
+
+inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
+{   // Fast variant: the widened pair sums are added lane by lane without regrouping them first.
+    const __m256i pair_sums = _mm256_madd_epi16(a.val, b.val);   // 32-bit sums of adjacent products
+    const __m256i sign_bits = _mm256_srai_epi32(pair_sums, 31);  // all-ones where a sum is negative
+    const __m256i wide_lo   = _mm256_unpacklo_epi32(pair_sums, sign_bits);
+    const __m256i wide_hi   = _mm256_unpackhi_epi32(pair_sums, sign_bits);
+    return v_int64x4(_mm256_add_epi64(wide_lo, wide_hi));
+}
+inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
+{ return v_dotprod_expand_fast(a, b) + c; } // same, with accumulator c added
+
+// 32 >> 64f
+inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
+{ return v_dotprod_expand(a, b); } // no separate fast path here: forwards to v_dotprod_expand
+inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
+{ return v_dotprod_expand(a, b, c); } // forwards to the accumulating v_dotprod_expand
+
+#define OPENCV_HAL_AVX_SPLAT2_PS(a, im) /* replicate float `im` of each 128-bit half of `a` across that half */ \
+    v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
+
+inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
+                            const v_float32x8& m1, const v_float32x8& m2,
+                            const v_float32x8& m3)
+{   // Per 128-bit half: v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3, built from the last term inwards (v_fma presumably computes a*b + c).
+    v_float32x8 splat0 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
+    v_float32x8 splat1 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
+    v_float32x8 splat2 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
+    v_float32x8 splat3 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3);
+    return v_fma(splat0, m0, v_fma(splat1, m1, v_fma(splat2, m2, splat3 * m3)));
+}
+
+inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
+                               const v_float32x8& m1, const v_float32x8& m2,
+                               const v_float32x8& a)
+{   // Per 128-bit half: v[0]*m0 + v[1]*m1 + v[2]*m2 + a; element 3 of v is not used.
+    v_float32x8 splat0 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
+    v_float32x8 splat1 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
+    v_float32x8 splat2 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
+    return v_fma(splat0, m0, v_fma(splat1, m1, v_fma(splat2, m2, a)));
+}
+
+#define OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) /* 4x4 transpose of 32-bit elements, done independently in each 128-bit half */ \
+    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,              \
+                               const _Tpvec& a2, const _Tpvec& a3,              \
+                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3)  \
+    {                                                                           \
+        __m256i t0 = cast_from(_mm256_unpacklo_##suffix(a0.val, a1.val));       \
+        __m256i t1 = cast_from(_mm256_unpacklo_##suffix(a2.val, a3.val));       \
+        __m256i t2 = cast_from(_mm256_unpackhi_##suffix(a0.val, a1.val));       \
+        __m256i t3 = cast_from(_mm256_unpackhi_##suffix(a2.val, a3.val));       \
+        b0.val = cast_to(_mm256_unpacklo_epi64(t0, t1));                        \
+        b1.val = cast_to(_mm256_unpackhi_epi64(t0, t1));                        \
+        b2.val = cast_to(_mm256_unpacklo_epi64(t2, t3));                        \
+        b3.val = cast_to(_mm256_unpackhi_epi64(t2, t3));                        \
+    }
+
+OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_uint32x8,  epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) // integer vectors need no cast around the 64-bit unpacks
+OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_int32x8,   epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) // integer vectors need no cast around the 64-bit unpacks
+OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_castsi256_ps) // floats are cast to integer registers for the 64-bit unpacks and back
+
+//////////////// Value reordering ///////////////
+
+/* Expand */
+#define OPENCV_HAL_IMPL_AVX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) /* widening helpers: `intrin` converts one 128-bit half to a 256-bit vector */ \
+    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+    {                                                               \
+        b0.val = intrin(_v256_extract_low(a.val));                  \
+        b1.val = intrin(_v256_extract_high(a.val));                 \
+    }                                                               \
+    inline _Tpwvec v_expand_low(const _Tpvec& a)                    \
+    { return _Tpwvec(intrin(_v256_extract_low(a.val))); }           \
+    inline _Tpwvec v_expand_high(const _Tpvec& a)                   \
+    { return _Tpwvec(intrin(_v256_extract_high(a.val))); }          \
+    inline _Tpwvec v256_load_expand(const _Tp* ptr)                 \
+    {                                                               \
+        __m128i a = _mm_loadu_si128((const __m128i*)ptr);           \
+        return _Tpwvec(intrin(a));                                  \
+    }
+
+OPENCV_HAL_IMPL_AVX_EXPAND(v_uint8x32,  v_uint16x16, uchar,    _mm256_cvtepu8_epi16)  // u8  -> u16, zero-extended
+OPENCV_HAL_IMPL_AVX_EXPAND(v_int8x32,   v_int16x16,  schar,    _mm256_cvtepi8_epi16)  // s8  -> s16, sign-extended
+OPENCV_HAL_IMPL_AVX_EXPAND(v_uint16x16, v_uint32x8,  ushort,   _mm256_cvtepu16_epi32) // u16 -> u32, zero-extended
+OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16,  v_int32x8,   short,    _mm256_cvtepi16_epi32) // s16 -> s32, sign-extended
+OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8,  v_uint64x4,  unsigned, _mm256_cvtepu32_epi64) // u32 -> u64, zero-extended
+OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8,   v_int64x4,   int,      _mm256_cvtepi32_epi64) // s32 -> s64, sign-extended
+
+#define OPENCV_HAL_IMPL_AVX_EXPAND_Q(_Tpvec, _Tp, intrin) /* loads 8 bytes from ptr and widens them with `intrin` */ \
+    inline _Tpvec v256_load_expand_q(const _Tp* ptr)        \
+    {                                                       \
+        __m128i a = _mm_loadl_epi64((const __m128i*)ptr);   \
+        return _Tpvec(intrin(a));                           \
+    }
+
+OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_uint32x8, uchar, _mm256_cvtepu8_epi32) // eight u8 -> eight u32, zero-extended
+OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_int32x8,  schar, _mm256_cvtepi8_epi32) // eight s8 -> eight s32, sign-extended
+
+/* pack */
+// 16
+inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
+{ return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); } // signed saturation to 8 bits; _v256_shuffle_odd_64 presumably restores element order after the per-half pack
+
+inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
+{   // Clamp to 255 before packing: packus reads its inputs as signed 16-bit values.
+    const __m256i limit     = _mm256_set1_epi16(255);
+    const __m256i a_clamped = _mm256_min_epu16(a.val, limit);
+    const __m256i b_clamped = _mm256_min_epu16(b.val, limit);
+    return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a_clamped, b_clamped)));
+}
+
+inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
+{   // Signed 16-bit input packed to 8 bits with unsigned saturation (packus).
+    return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val)));
+}
+
+inline void v_pack_store(schar* ptr, const v_int16x16& a)
+{ v_store_low(ptr, v_pack(a, a)); } // packs a with itself and stores only the low half of the result
+
+inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
+{   // Clamps to 255, packs to 8 bits and stores only the low half of the packed vector.
+    const __m256i limit   = _mm256_set1_epi16(255);
+    const __m256i clamped = _mm256_min_epu16(a.val, limit);
+    const __m256i packed  = _v256_shuffle_odd_64(_mm256_packus_epi16(clamped, clamped));
+    v_store_low(ptr, v_uint8x32(packed));
+}
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
+{ v_store_low(ptr, v_pack_u(a, a)); } // packs a with itself (unsigned saturation) and stores only the low half
+
+template<int n> inline
+v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
+{   // Adds 1 << (n-1), shifts right by n, then packs; n > 0 is assumed so the shifted values can be treated as signed.
+    v_uint16x16 half = v256_setall_u16((short)(1 << (n-1)));
+    v_int16x16 a_shifted = v_reinterpret_as_s16((a + half) >> n);
+    v_int16x16 b_shifted = v_reinterpret_as_s16((b + half) >> n);
+    return v_pack_u(a_shifted, b_shifted);
+}
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
+{   // Single-vector store form: same rounding shift, result written through v_pack_u_store.
+    v_uint16x16 half = v256_setall_u16((short)(1 << (n-1)));
+    v_pack_u_store(ptr, v_reinterpret_as_s16((a + half) >> n));
+}
+
+template<int n> inline
+v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
+{   // Adds 1 << (n-1), shifts right by n, then packs signed input with v_pack_u.
+    v_int16x16 half = v256_setall_s16((short)(1 << (n-1)));
+    return v_pack_u((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
+{   // Single-vector store form: same rounding shift, result written through v_pack_u_store.
+    v_int16x16 half = v256_setall_s16((short)(1 << (n-1)));
+    v_pack_u_store(ptr, (a + half) >> n);
+}
+
+template<int n> inline
+v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
+{   // Adds 1 << (n-1), shifts right by n, then packs with v_pack.
+    v_int16x16 half = v256_setall_s16((short)(1 << (n-1)));
+    return v_pack((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
+{   // Single-vector store form: same rounding shift, result written through v_pack_store.
+    v_int16x16 half = v256_setall_s16((short)(1 << (n-1)));
+    v_pack_store(ptr, (a + half) >> n);
+}
+
+// 32
+inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
+{ return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); } // signed saturation to 16 bits
+
+inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
+{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); } // unsigned pack through the _v256_packs_epu32 helper (defined elsewhere in the file)
+
+inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
+{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); } // signed input, unsigned saturation to 16 bits
+
+inline void v_pack_store(short* ptr, const v_int32x8& a)
+{ v_store_low(ptr, v_pack(a, a)); } // packs a with itself and stores only the low half of the result
+
+inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
+{   // Clamps to 65535, packs to 16 bits and stores only the low half of the packed vector.
+    const __m256i limit   = _mm256_set1_epi32(65535);
+    const __m256i clamped = _mm256_min_epu32(a.val, limit);
+    const __m256i packed  = _v256_shuffle_odd_64(_mm256_packus_epi32(clamped, clamped));
+    v_store_low(ptr, v_uint16x16(packed));
+}
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
+{ v_store_low(ptr, v_pack_u(a, a)); } // packs a with itself (unsigned saturation) and stores only the low half
+
+
+template<int n> inline
+v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
+{   // Adds 1 << (n-1), shifts right by n, then packs; n > 0 is assumed so the shifted values can be treated as signed.
+    v_uint32x8 half = v256_setall_u32(1 << (n-1));
+    v_int32x8 a_shifted = v_reinterpret_as_s32((a + half) >> n);
+    v_int32x8 b_shifted = v_reinterpret_as_s32((b + half) >> n);
+    return v_pack_u(a_shifted, b_shifted);
+}
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
+{   // Single-vector store form: same rounding shift, result written through v_pack_u_store.
+    v_uint32x8 half = v256_setall_u32(1 << (n-1));
+    v_pack_u_store(ptr, v_reinterpret_as_s32((a + half) >> n));
+}
+
+template<int n> inline
+v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
+{   // Adds 1 << (n-1), shifts right by n, then packs signed input with v_pack_u.
+    v_int32x8 half = v256_setall_s32(1 << (n-1));
+    return v_pack_u((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
+{   // Single-vector store form: same rounding shift, result written through v_pack_u_store.
+    v_int32x8 half = v256_setall_s32(1 << (n-1));
+    v_pack_u_store(ptr, (a + half) >> n);
+}
+
+template<int n> inline
+v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
+{   // Adds 1 << (n-1), shifts right by n, then packs with v_pack.
+    v_int32x8 half = v256_setall_s32(1 << (n-1));
+    return v_pack((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x8& a)
+{   // Single-vector store form: same rounding shift, result written through v_pack_store.
+    v_int32x8 half = v256_setall_s32(1 << (n-1));
+    v_pack_store(ptr, (a + half) >> n);
+}
+
+// 64
+// Non-saturating pack
+inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
+{   // Non-saturating pack: only the low 32 bits of every 64-bit element are kept.
+    const __m256i a_low32 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0)); // dwords 0 and 2 moved to the front of each 128-bit half
+    const __m256i b_low32 = _mm256_shuffle_epi32(b.val, _MM_SHUFFLE(0, 0, 2, 0));
+    const __m256i merged  = _mm256_unpacklo_epi64(a_low32, b_low32);               // a0, a1, b0, b1, a2, a3, b2, b3
+    return v_uint32x8(_v256_shuffle_odd_64(merged));
+}
+
+inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
+{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); } // signed overload: forwards to the unsigned, non-saturating pack
+
+inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
+{   // Keeps the low 32 bits of every 64-bit element and stores only the low half of the reordered vector.
+    const __m256i low32 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
+    v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(low32)));
+}
+
+inline void v_pack_store(int* ptr, const v_int64x4& b)
+{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); } // signed overload: forwards to the unsigned version
+
+template<int n> inline
+v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
+{   // Adds 1 << (n-1), shifts right by n, then applies the non-saturating 64 -> 32 pack.
+    v_uint64x4 half = v256_setall_u64((uint64)1 << (n-1));
+    return v_pack((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
+{   // Single-vector store form: same rounding shift, result written through v_pack_store.
+    v_uint64x4 half = v256_setall_u64((uint64)1 << (n-1));
+    v_pack_store(ptr, (a + half) >> n);
+}
+
+template<int n> inline
+v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
+{   // Adds 1 << (n-1), shifts right by n, then applies the non-saturating 64 -> 32 pack.
+    v_int64x4 half = v256_setall_s64((int64)1 << (n-1));
+    return v_pack((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x4& a)
+{   // Single-vector store form: same rounding shift, result written through v_pack_store.
+    v_int64x4 half = v256_setall_s64((int64)1 << (n-1));
+    v_pack_store(ptr, (a + half) >> n);
+}
+
+// pack boolean
+inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
+{   // Narrows 16-bit mask lanes to 8 bits with signed saturation (an all-ones lane is -1 and stays all-ones).
+    const __m256i packed = _mm256_packs_epi16(a.val, b.val);
+    return v_uint8x32(_v256_shuffle_odd_64(packed));
+}
+
+inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b,
+                           const v_uint32x8& c, const v_uint32x8& d)
+{   // Narrows four 32-bit mask vectors to one 8-bit vector in two signed-saturating pack steps.
+    const __m256i ab16 = _mm256_packs_epi32(a.val, b.val);
+    const __m256i cd16 = _mm256_packs_epi32(c.val, d.val);
+    const __m256i bytes = _v256_shuffle_odd_64(_mm256_packs_epi16(ab16, cd16));
+    // The dword shuffle completes the reordering (intended result order presumably a, b, c, d).
+    return v_uint8x32(_mm256_shuffle_epi32(bytes, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
+                           const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f,
+                           const v_uint64x4& g, const v_uint64x4& h)
+{   // Narrows eight 64-bit mask vectors to one 8-bit vector; assumes every element is all-zeros or all-ones.
+    __m256i ab = _mm256_packs_epi32(a.val, b.val);                 // 1st stage: each 64-bit element becomes two equal 16-bit lanes
+    __m256i cd = _mm256_packs_epi32(c.val, d.val);
+    __m256i ef = _mm256_packs_epi32(e.val, f.val);
+    __m256i gh = _mm256_packs_epi32(g.val, h.val);
+    // 2nd stage reads those 16-bit pairs as 32-bit values, leaving one 16-bit lane per input element.
+    __m256i abcd = _mm256_packs_epi32(ab, cd);
+    __m256i efgh = _mm256_packs_epi32(ef, gh);
+    __m256i pkall = _v256_shuffle_odd_64(_mm256_packs_epi16(abcd, efgh)); // 3rd stage: one byte per input element
+    // Rotate each 128-bit half by 8 bytes and interleave 16-bit units to finish the element reordering.
+    __m256i rev = _mm256_alignr_epi8(pkall, pkall, 8);
+    return v_uint8x32(_mm256_unpacklo_epi16(pkall, rev));
+}
+
+/* Recombine */
+// it's up there with the load and store operations
+
+/* Extract */
+#define OPENCV_HAL_IMPL_AVX_EXTRACT(_Tpvec) /* v_extract<s>(a, b) is defined as v_rotate_right<s>(a, b) */ \
+    template<int s>                                            \
+    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)  \
+    { return v_rotate_right<s>(a, b); }
+
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint8x32)   // one v_extract overload per vector type
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_int8x32)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint16x16)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_int16x16)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint32x8)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_int32x8)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint64x4)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
+
+template<int i>
+inline uchar v_extract_n(v_uint8x32 a)
+{   // Returns the 8-bit element selected by compile-time index i, via the _v256_extract_epi8 helper.
+    return (uchar)_v256_extract_epi8<i>(a.val);
+}
+
+template<int i>
+inline schar v_extract_n(v_int8x32 a)
+{   // Signed overload: extracts through the unsigned version and casts the result.
+    return (schar)v_extract_n<i>(v_reinterpret_as_u8(a));
+}
+
+template<int i>
+inline ushort v_extract_n(v_uint16x16 a)
+{   // Returns the 16-bit element selected by compile-time index i, via the _v256_extract_epi16 helper.
+    return (ushort)_v256_extract_epi16<i>(a.val);
+}
+
+template<int i>
+inline short v_extract_n(v_int16x16 a)
+{   // Signed overload: extracts through the unsigned version and casts the result.
+    return (short)v_extract_n<i>(v_reinterpret_as_u16(a));
+}
+
+template<int i>
+inline uint v_extract_n(v_uint32x8 a)
+{   // Returns the 32-bit element selected by compile-time index i, via the _v256_extract_epi32 helper.
+    return (uint)_v256_extract_epi32<i>(a.val);
+}
+
+template<int i>
+inline int v_extract_n(v_int32x8 a)
+{   // Signed overload: extracts through the unsigned version and casts the result.
+    return (int)v_extract_n<i>(v_reinterpret_as_u32(a));
+}
+
+template<int i>
+inline uint64 v_extract_n(v_uint64x4 a)
+{   // Returns the 64-bit element selected by compile-time index i, via the _v256_extract_epi64 helper.
+    return (uint64)_v256_extract_epi64<i>(a.val);
+}
+
+template<int i>
+inline int64 v_extract_n(v_int64x4 v)
+{   // Signed overload: extracts through the unsigned version and casts the result.
+    return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
+}
+
+template<int i>
+inline float v_extract_n(v_float32x8 v)
+{   // Extracts element i as raw 32 bits, then reads those bits back as a float through a union.
+    union { uint iv; float fv; } d;
+    d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
+    return d.fv;
+}
+
+template<int i>
+inline double v_extract_n(v_float64x4 v)
+{   // Extracts element i as raw 64 bits, then reads those bits back as a double through a union.
+    union { uint64 iv; double dv; } d;
+    d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
+    return d.dv;
+}
+
+template<int i>
+inline v_uint32x8 v_broadcast_element(v_uint32x8 a)
+{   // Copies 32-bit element i of `a` into all eight lanes using a cross-lane permute with a uniform index vector.
+    const __m256i perm = _mm256_set1_epi32((char)i); // plain local instead of a function-local static: no initialization guard to check on every call
+    return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
+}
+
+template<int i>
+inline v_int32x8 v_broadcast_element(const v_int32x8 &a)
+{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); } // signed overload: forwards to the u32 version
+
+template<int i>
+inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
+{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); } // float overload: forwards to the u32 version via reinterpretation
+
+
+///////////////////// load deinterleave /////////////////////////////
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
+{   // Reads 64 interleaved bytes (a0 b0 a1 b1 ...) from ptr and separates them into a and b.
+    const __m256i first_half  = _mm256_loadu_si256((const __m256i*)ptr);
+    const __m256i second_half = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+    // Within each 128-bit lane: even-positioned bytes go to the low 8 bytes, odd-positioned ones to the high 8.
+    const __m256i split = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+                                           0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+    const __m256i grouped0 = _mm256_shuffle_epi8(first_half, split);
+    const __m256i grouped1 = _mm256_shuffle_epi8(second_half, split);
+    // Pair the low lanes of both loads, and likewise the high lanes ...
+    const __m256i low_lanes  = _mm256_permute2x128_si256(grouped0, grouped1, 0 + 2*16);
+    const __m256i high_lanes = _mm256_permute2x128_si256(grouped0, grouped1, 1 + 3*16);
+    // ... so that a 64-bit unpack leaves each stream contiguous and in order.
+    a = v_uint8x32(_mm256_unpacklo_epi64(low_lanes, high_lanes));
+    b = v_uint8x32(_mm256_unpackhi_epi64(low_lanes, high_lanes));
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
+{   // Reads 32 ushort values at ptr (two unaligned loads); even-position elements go to a, odd-position ones to b.
+    __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+
+    const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+                                               0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); // per 128-bit lane: even 16-bit elements first, then odd
+    __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
+    __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
+    __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16); // { p0 low lane, p1 low lane }
+    __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16); // { p0 high lane, p1 high lane }
+    __m256i a0 = _mm256_unpacklo_epi64(pl, ph); // low 64 bits of each lane: even-position elements
+    __m256i b0 = _mm256_unpackhi_epi64(pl, ph); // high 64 bits of each lane: odd-position elements
+    a = v_uint16x16(a0);
+    b = v_uint16x16(b0);
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
+{   // Reads 16 unsigned values at ptr (two unaligned loads); even-position elements go to a, odd-position ones to b.
+    __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+
+    enum { sh = 0+2*4+1*16+3*64 }; // _mm256_shuffle_epi32 control: dword order 0,2,1,3 within each 128-bit lane
+    __m256i p0 = _mm256_shuffle_epi32(ab0, sh);
+    __m256i p1 = _mm256_shuffle_epi32(ab1, sh);
+    __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16); // { p0 low lane, p1 low lane }
+    __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16); // { p0 high lane, p1 high lane }
+    __m256i a0 = _mm256_unpacklo_epi64(pl, ph); // low 64 bits of each lane: even-position elements
+    __m256i b0 = _mm256_unpackhi_epi64(pl, ph); // high 64 bits of each lane: odd-position elements
+    a = v_uint32x8(a0);
+    b = v_uint32x8(b0);
+}
+
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
+{   // Reads 8 uint64 values at ptr (two unaligned loads); even-position elements go to a, odd-position ones to b.
+    __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
+
+    __m256i pl = _mm256_permute2x128_si256(ab0, ab1, 0 + 2*16); // { ab0 low lane, ab1 low lane }
+    __m256i ph = _mm256_permute2x128_si256(ab0, ab1, 1 + 3*16); // { ab0 high lane, ab1 high lane }
+    __m256i a0 = _mm256_unpacklo_epi64(pl, ph); // low qword of each lane: even-position elements
+    __m256i b0 = _mm256_unpackhi_epi64(pl, ph); // high qword of each lane: odd-position elements
+    a = v_uint64x4(a0);
+    b = v_uint64x4(b0);
+}
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c )
+{   // Reads 96 bytes at ptr (three unaligned loads) and splits the 3-way interleaved bytes into a, b and c.
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
+
+    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16); // { bgr0 low lane, bgr2 low lane }
+    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16); // { bgr0 high lane, bgr2 high lane }
+
+    const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
+                                               0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); // blendv mask: -1 bytes take the second operand
+    const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
+                                               -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1); // second byte-select mask, offset by one position
+
+    __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1); // gathers one channel's bytes (still permuted within lanes)
+    __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
+    __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
+
+    const __m256i
+    sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
+                            0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
+    sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
+                            1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
+    sh_r = _mm256_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
+                            2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
+    b0 = _mm256_shuffle_epi8(b0, sh_b); // per-lane byte shuffles put each channel back into element order
+    g0 = _mm256_shuffle_epi8(g0, sh_g);
+    r0 = _mm256_shuffle_epi8(r0, sh_r);
+
+    a = v_uint8x32(b0);
+    b = v_uint8x32(g0);
+    c = v_uint8x32(r0);
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c )
+{   // Reads 48 ushort values at ptr (three unaligned loads) and splits the 3-way interleaved elements into a, b and c.
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+
+    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16); // { bgr0 low lane, bgr2 low lane }
+    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16); // { bgr0 high lane, bgr2 high lane }
+
+    const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
+                                               0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0); // blendv mask in byte pairs: -1 takes the second operand
+    const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+                                               -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0); // second pair-wise select mask
+    __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1); // gathers one channel's elements (still permuted within lanes)
+    __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
+    __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
+    const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
+                                                 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+    const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
+                                                 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
+    const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
+                                                 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+    b0 = _mm256_shuffle_epi8(b0, sh_b); // per-lane shuffles (moving byte pairs) reorder each channel's elements
+    g0 = _mm256_shuffle_epi8(g0, sh_g);
+    r0 = _mm256_shuffle_epi8(r0, sh_r);
+
+    a = v_uint16x16(b0);
+    b = v_uint16x16(g0);
+    c = v_uint16x16(r0);
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c )
+{   // Reads 24 unsigned values at ptr (three unaligned loads) and splits the 3-way interleaved elements into a, b and c.
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+
+    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16); // { bgr0 low lane, bgr2 low lane }
+    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16); // { bgr0 high lane, bgr2 high lane }
+
+    __m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92); // set bit k of the immediate: dword k comes from the second operand
+    __m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24);
+    __m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92);
+
+    b0 = _mm256_shuffle_epi32(b0, 0x6c); // per-lane dword shuffles reorder each channel's elements
+    g0 = _mm256_shuffle_epi32(g0, 0xb1);
+    r0 = _mm256_shuffle_epi32(r0, 0xc6);
+
+    a = v_uint32x8(b0);
+    b = v_uint32x8(g0);
+    c = v_uint32x8(r0);
+}
+
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c )
+{   // Reads 12 uint64 values at ptr (three unaligned loads) and splits the 3-way interleaved elements into a, b and c.
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+
+    __m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0); // low half of bgr0, high half of bgr1
+    __m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0); // low half of bgr1, high half of bgr2
+    __m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b); // low half of bgr2 + high half of bgr0, qwords reversed
+    __m256i b0 = _mm256_unpacklo_epi64(s01, s20r); // elements 0, 3, 6, 9 of the input
+    __m256i g0 = _mm256_alignr_epi8(s12, s01, 8); // per lane: high qword of s01 followed by low qword of s12 (elements 1, 4, 7, 10)
+    __m256i r0 = _mm256_unpackhi_epi64(s20r, s12); // elements 2, 5, 8, 11 of the input
+
+    a = v_uint64x4(b0);
+    b = v_uint64x4(g0);
+    c = v_uint64x4(r0);
+}
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d )
+{   // Reads 128 bytes at ptr (four unaligned loads) and splits the 4-way interleaved bytes into a, b, c and d.
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
+    __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));
+    const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+                                               0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); // per lane: group every 4th byte, so each dword holds one channel
+
+    __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
+    __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
+    __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
+    __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
+
+    __m256i p01l = _mm256_unpacklo_epi32(p0, p1); // first two channels' dwords of p0/p1, interleaved
+    __m256i p01h = _mm256_unpackhi_epi32(p0, p1); // last two channels' dwords of p0/p1, interleaved
+    __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
+    __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
+
+    __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16); // { low lane of first arg, low lane of second arg }
+    __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16); // { high lane of first arg, high lane of second arg }
+    __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
+    __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
+
+    __m256i b0 = _mm256_unpacklo_epi32(pll, plh); // final dword interleave yields one full channel per register
+    __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
+    __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
+    __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
+
+    a = v_uint8x32(b0);
+    b = v_uint8x32(g0);
+    c = v_uint8x32(r0);
+    d = v_uint8x32(a0);
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d )
+{   // Reads 64 ushort values at ptr (four unaligned loads) and splits the 4-way interleaved elements into a, b, c and d.
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+    __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48));
+    const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+                                               0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); // per lane: pair up same-channel 16-bit elements, one channel per dword
+    __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
+    __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
+    __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
+    __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
+
+    __m256i p01l = _mm256_unpacklo_epi32(p0, p1); // first two channels' dwords of p0/p1, interleaved
+    __m256i p01h = _mm256_unpackhi_epi32(p0, p1); // last two channels' dwords of p0/p1, interleaved
+    __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
+    __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
+
+    __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16); // { low lane of first arg, low lane of second arg }
+    __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16); // { high lane of first arg, high lane of second arg }
+    __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
+    __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
+
+    __m256i b0 = _mm256_unpacklo_epi32(pll, plh); // final dword interleave yields one full channel per register
+    __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
+    __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
+    __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
+
+    a = v_uint16x16(b0);
+    b = v_uint16x16(g0);
+    c = v_uint16x16(r0);
+    d = v_uint16x16(a0);
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
+{   // Reads 32 unsigned values at ptr (four unaligned loads) and splits the 4-way interleaved elements into a, b, c and d.
+    __m256i p0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i p1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+    __m256i p2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+    __m256i p3 = _mm256_loadu_si256((const __m256i*)(ptr + 24));
+
+    __m256i p01l = _mm256_unpacklo_epi32(p0, p1); // first two channels of p0/p1, interleaved per lane
+    __m256i p01h = _mm256_unpackhi_epi32(p0, p1); // last two channels of p0/p1, interleaved per lane
+    __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
+    __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
+
+    __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16); // { low lane of first arg, low lane of second arg }
+    __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16); // { high lane of first arg, high lane of second arg }
+    __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
+    __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
+
+    __m256i b0 = _mm256_unpacklo_epi32(pll, plh); // final dword interleave yields one full channel per register
+    __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
+    __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
+    __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
+
+    a = v_uint32x8(b0);
+    b = v_uint32x8(g0);
+    c = v_uint32x8(r0);
+    d = v_uint32x8(a0);
+}
+
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d )
+{   // Reads 16 uint64 values at ptr (four unaligned loads) and splits the 4-way interleaved elements into a, b, c and d.
+    __m256i bgra0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgra1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
+    __m256i bgra2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+    __m256i bgra3 = _mm256_loadu_si256((const __m256i*)(ptr + 12));
+
+    __m256i l02 = _mm256_permute2x128_si256(bgra0, bgra2, 0 + 2*16); // low lanes of bgra0/bgra2: first two channels
+    __m256i h02 = _mm256_permute2x128_si256(bgra0, bgra2, 1 + 3*16); // high lanes of bgra0/bgra2: last two channels
+    __m256i l13 = _mm256_permute2x128_si256(bgra1, bgra3, 0 + 2*16);
+    __m256i h13 = _mm256_permute2x128_si256(bgra1, bgra3, 1 + 3*16);
+
+    __m256i b0 = _mm256_unpacklo_epi64(l02, l13); // qword interleave yields one full channel per register
+    __m256i g0 = _mm256_unpackhi_epi64(l02, l13);
+    __m256i r0 = _mm256_unpacklo_epi64(h02, h13);
+    __m256i a0 = _mm256_unpackhi_epi64(h02, h13);
+
+    a = v_uint64x4(b0);
+    b = v_uint64x4(g0);
+    c = v_uint64x4(r0);
+    d = v_uint64x4(a0);
+}
+
+///////////////////////////// store interleave /////////////////////////////////////
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{   // Writes 64 bytes to ptr with the elements of x and y alternating (x0 y0 x1 y1 ...); mode picks the store instruction.
+    __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val); // interleave the low 8 bytes of each 128-bit lane
+    __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val); // interleave the high 8 bytes of each 128-bit lane
+
+    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); // { xy_l low lane, xy_h low lane }: first 32 output bytes
+    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); // { xy_l high lane, xy_h high lane }: last 32 output bytes
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal (streaming) stores
+    {
+        _mm256_stream_si256((__m256i*)ptr, xy0);
+        _mm256_stream_si256((__m256i*)(ptr + 32), xy1);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // aligned stores
+    {
+        _mm256_store_si256((__m256i*)ptr, xy0);
+        _mm256_store_si256((__m256i*)(ptr + 32), xy1);
+    }
+    else // any other mode: unaligned stores
+    {
+        _mm256_storeu_si256((__m256i*)ptr, xy0);
+        _mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
+    }
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{   // Writes 32 ushort values to ptr with the elements of x and y alternating; mode picks the store instruction.
+    __m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val); // interleave the low 4 elements of each 128-bit lane
+    __m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val); // interleave the high 4 elements of each 128-bit lane
+
+    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); // { xy_l low lane, xy_h low lane }: first half of the output
+    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); // { xy_l high lane, xy_h high lane }: second half of the output
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal (streaming) stores
+    {
+        _mm256_stream_si256((__m256i*)ptr, xy0);
+        _mm256_stream_si256((__m256i*)(ptr + 16), xy1);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // aligned stores
+    {
+        _mm256_store_si256((__m256i*)ptr, xy0);
+        _mm256_store_si256((__m256i*)(ptr + 16), xy1);
+    }
+    else // any other mode: unaligned stores
+    {
+        _mm256_storeu_si256((__m256i*)ptr, xy0);
+        _mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
+    }
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{   // Writes 16 unsigned values to ptr with the elements of x and y alternating; mode picks the store instruction.
+    __m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val); // interleave the low 2 elements of each 128-bit lane
+    __m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val); // interleave the high 2 elements of each 128-bit lane
+
+    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); // { xy_l low lane, xy_h low lane }: first half of the output
+    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); // { xy_l high lane, xy_h high lane }: second half of the output
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal (streaming) stores
+    {
+        _mm256_stream_si256((__m256i*)ptr, xy0);
+        _mm256_stream_si256((__m256i*)(ptr + 8), xy1);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // aligned stores
+    {
+        _mm256_store_si256((__m256i*)ptr, xy0);
+        _mm256_store_si256((__m256i*)(ptr + 8), xy1);
+    }
+    else // any other mode: unaligned stores
+    {
+        _mm256_storeu_si256((__m256i*)ptr, xy0);
+        _mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
+    }
+}
+
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{   // Writes 8 uint64 values to ptr with the elements of x and y alternating; mode picks the store instruction.
+    __m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val); // pair the low qword of each lane of x with that of y
+    __m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val); // pair the high qword of each lane of x with that of y
+
+    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); // { xy_l low lane, xy_h low lane }: first half of the output
+    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); // { xy_l high lane, xy_h high lane }: second half of the output
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal (streaming) stores
+    {
+        _mm256_stream_si256((__m256i*)ptr, xy0);
+        _mm256_stream_si256((__m256i*)(ptr + 4), xy1);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // aligned stores
+    {
+        _mm256_store_si256((__m256i*)ptr, xy0);
+        _mm256_store_si256((__m256i*)(ptr + 4), xy1);
+    }
+    else // any other mode: unaligned stores
+    {
+        _mm256_storeu_si256((__m256i*)ptr, xy0);
+        _mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
+    }
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, const v_uint8x32& c,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{   // Writes 96 bytes to ptr with a, b and c interleaved 3-way; mode picks the store instruction.
+    const __m256i sh_b = _mm256_setr_epi8(
+            0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
+            0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
+    const __m256i sh_g = _mm256_setr_epi8(
+            5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
+            5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
+    const __m256i sh_r = _mm256_setr_epi8(
+            10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
+            10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
+
+    __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b); // pre-permute each channel within its 128-bit lanes for the blends below
+    __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
+    __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);
+
+    const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
+                                               0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); // blendv mask: -1 bytes take the second operand
+    const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
+                                               0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); // same stride-3 pattern, one byte later
+
+    __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1); // each p* mixes bytes of all three channels
+    __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
+    __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
+
+    __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16); // { p0 low lane, p1 low lane }
+    __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16); // { p2 low lane, p0 high lane }
+    __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16); // { p1 high lane, p2 high lane }
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal (streaming) stores
+    {
+        _mm256_stream_si256((__m256i*)ptr, bgr0);
+        _mm256_stream_si256((__m256i*)(ptr + 32), bgr1);
+        _mm256_stream_si256((__m256i*)(ptr + 64), bgr2);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // aligned stores
+    {
+        _mm256_store_si256((__m256i*)ptr, bgr0);
+        _mm256_store_si256((__m256i*)(ptr + 32), bgr1);
+        _mm256_store_si256((__m256i*)(ptr + 64), bgr2);
+    }
+    else // any other mode: unaligned stores
+    {
+        _mm256_storeu_si256((__m256i*)ptr, bgr0);
+        _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
+        _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
+    }
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, const v_uint16x16& c,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{   // Writes 48 ushort values to ptr with a, b and c interleaved 3-way; mode picks the store instruction.
+    const __m256i sh_b = _mm256_setr_epi8(
+         0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
+         0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+    const __m256i sh_g = _mm256_setr_epi8(
+         10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
+         10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
+    const __m256i sh_r = _mm256_setr_epi8(
+         4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
+         4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+
+    __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b); // pre-permute each channel within its 128-bit lanes for the blends below
+    __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
+    __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);
+
+    const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
+                                               0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0); // blendv mask in byte pairs: -1 takes the second operand
+    const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+                                               -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0); // second pair-wise select mask
+
+    __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1); // each p* mixes elements of all three channels
+    __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
+    __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
+
+    __m256i bgr0 = _mm256_permute2x128_si256(p0, p2, 0 + 2*16); // { p0 low lane, p2 low lane }
+    //__m256i bgr1 = p1;
+    __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16); // { p0 high lane, p2 high lane }; p1 is stored as-is in the middle
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal (streaming) stores
+    {
+        _mm256_stream_si256((__m256i*)ptr, bgr0);
+        _mm256_stream_si256((__m256i*)(ptr + 16), p1);
+        _mm256_stream_si256((__m256i*)(ptr + 32), bgr2);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // aligned stores
+    {
+        _mm256_store_si256((__m256i*)ptr, bgr0);
+        _mm256_store_si256((__m256i*)(ptr + 16), p1);
+        _mm256_store_si256((__m256i*)(ptr + 32), bgr2);
+    }
+    else // any other mode: unaligned stores
+    {
+        _mm256_storeu_si256((__m256i*)ptr, bgr0);
+        _mm256_storeu_si256((__m256i*)(ptr + 16), p1);
+        _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
+    }
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, const v_uint32x8& c,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{   // Writes 24 unsigned values to ptr with a, b and c interleaved 3-way; mode picks the store instruction.
+    __m256i b0 = _mm256_shuffle_epi32(a.val, 0x6c); // pre-permute each channel's dwords within its 128-bit lanes
+    __m256i g0 = _mm256_shuffle_epi32(b.val, 0xb1);
+    __m256i r0 = _mm256_shuffle_epi32(c.val, 0xc6);
+
+    __m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24); // set bit k of the immediate: dword k comes from the second operand
+    __m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24);
+    __m256i p2 = _mm256_blend_epi32(_mm256_blend_epi32(r0, b0, 0x92), g0, 0x24);
+
+    __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16); // { p0 low lane, p1 low lane }
+    //__m256i bgr1 = p2;
+    __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16); // { p0 high lane, p1 high lane }; p2 is stored as-is in the middle
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal (streaming) stores
+    {
+        _mm256_stream_si256((__m256i*)ptr, bgr0);
+        _mm256_stream_si256((__m256i*)(ptr + 8), p2);
+        _mm256_stream_si256((__m256i*)(ptr + 16), bgr2);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // aligned stores
+    {
+        _mm256_store_si256((__m256i*)ptr, bgr0);
+        _mm256_store_si256((__m256i*)(ptr + 8), p2);
+        _mm256_store_si256((__m256i*)(ptr + 16), bgr2);
+    }
+    else // any other mode: unaligned stores
+    {
+        _mm256_storeu_si256((__m256i*)ptr, bgr0);
+        _mm256_storeu_si256((__m256i*)(ptr + 8), p2);
+        _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
+    }
+}
+
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{   // Writes 12 uint64 values to ptr with a, b and c interleaved 3-way; mode picks the store instruction.
+    __m256i s01 = _mm256_unpacklo_epi64(a.val, b.val); // { a0 b0 | a2 b2 }
+    __m256i s12 = _mm256_unpackhi_epi64(b.val, c.val); // { b1 c1 | b3 c3 }
+    __m256i s20 = _mm256_blend_epi32(c.val, a.val, 0xcc); // { c0 a1 | c2 a3 }: odd qwords taken from a
+
+    __m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16); // { a0 b0 c0 a1 }
+    __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f); // { b1 c1 a2 b2 }: low half from s12, high half from s01
+    __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16); // { c2 a3 b3 c3 }
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal (streaming) stores
+    {
+        _mm256_stream_si256((__m256i*)ptr, bgr0);
+        _mm256_stream_si256((__m256i*)(ptr + 4), bgr1);
+        _mm256_stream_si256((__m256i*)(ptr + 8), bgr2);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // aligned stores
+    {
+        _mm256_store_si256((__m256i*)ptr, bgr0);
+        _mm256_store_si256((__m256i*)(ptr + 4), bgr1);
+        _mm256_store_si256((__m256i*)(ptr + 8), bgr2);
+    }
+    else // any other mode: unaligned stores
+    {
+        _mm256_storeu_si256((__m256i*)ptr, bgr0);
+        _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
+        _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
+    }
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b,
+                                const v_uint8x32& c, const v_uint8x32& d,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{   // Writes 128 bytes to ptr with a, b, c and d interleaved 4-way; mode picks the store instruction.
+    __m256i bg0 = _mm256_unpacklo_epi8(a.val, b.val); // a/b byte pairs, low half of each lane
+    __m256i bg1 = _mm256_unpackhi_epi8(a.val, b.val); // a/b byte pairs, high half of each lane
+    __m256i ra0 = _mm256_unpacklo_epi8(c.val, d.val); // c/d byte pairs, low half of each lane
+    __m256i ra1 = _mm256_unpackhi_epi8(c.val, d.val); // c/d byte pairs, high half of each lane
+
+    __m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0); // combine pairs into a,b,c,d quads (still lane-local order)
+    __m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0);
+    __m256i bgra2_ = _mm256_unpacklo_epi16(bg1, ra1);
+    __m256i bgra3_ = _mm256_unpackhi_epi16(bg1, ra1);
+
+    __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16); // low lanes of bgra0_/bgra1_
+    __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16); // high lanes of bgra0_/bgra1_
+    __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16); // low lanes of bgra2_/bgra3_
+    __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16); // high lanes of bgra2_/bgra3_
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal (streaming) stores
+    {
+        _mm256_stream_si256((__m256i*)ptr, bgra0);
+        _mm256_stream_si256((__m256i*)(ptr + 32), bgra1);
+        _mm256_stream_si256((__m256i*)(ptr + 64), bgra2);
+        _mm256_stream_si256((__m256i*)(ptr + 96), bgra3);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // aligned stores
+    {
+        _mm256_store_si256((__m256i*)ptr, bgra0);
+        _mm256_store_si256((__m256i*)(ptr + 32), bgra1);
+        _mm256_store_si256((__m256i*)(ptr + 64), bgra2);
+        _mm256_store_si256((__m256i*)(ptr + 96), bgra3);
+    }
+    else // any other mode: unaligned stores
+    {
+        _mm256_storeu_si256((__m256i*)ptr, bgra0);
+        _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
+        _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
+        _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
+    }
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b,
+                                const v_uint16x16& c, const v_uint16x16& d,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{   // Writes 64 ushort values to ptr with a, b, c and d interleaved 4-way; mode picks the store instruction.
+    __m256i bg0 = _mm256_unpacklo_epi16(a.val, b.val); // a/b element pairs, low half of each lane
+    __m256i bg1 = _mm256_unpackhi_epi16(a.val, b.val); // a/b element pairs, high half of each lane
+    __m256i ra0 = _mm256_unpacklo_epi16(c.val, d.val); // c/d element pairs, low half of each lane
+    __m256i ra1 = _mm256_unpackhi_epi16(c.val, d.val); // c/d element pairs, high half of each lane
+
+    __m256i bgra0_ = _mm256_unpacklo_epi32(bg0, ra0); // combine pairs into a,b,c,d quads (still lane-local order)
+    __m256i bgra1_ = _mm256_unpackhi_epi32(bg0, ra0);
+    __m256i bgra2_ = _mm256_unpacklo_epi32(bg1, ra1);
+    __m256i bgra3_ = _mm256_unpackhi_epi32(bg1, ra1);
+
+    __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16); // low lanes of bgra0_/bgra1_
+    __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16); // high lanes of bgra0_/bgra1_
+    __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16); // low lanes of bgra2_/bgra3_
+    __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16); // high lanes of bgra2_/bgra3_
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal (streaming) stores
+    {
+        _mm256_stream_si256((__m256i*)ptr, bgra0);
+        _mm256_stream_si256((__m256i*)(ptr + 16), bgra1);
+        _mm256_stream_si256((__m256i*)(ptr + 32), bgra2);
+        _mm256_stream_si256((__m256i*)(ptr + 48), bgra3);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // aligned stores
+    {
+        _mm256_store_si256((__m256i*)ptr, bgra0);
+        _mm256_store_si256((__m256i*)(ptr + 16), bgra1);
+        _mm256_store_si256((__m256i*)(ptr + 32), bgra2);
+        _mm256_store_si256((__m256i*)(ptr + 48), bgra3);
+    }
+    else // any other mode: unaligned stores
+    {
+        _mm256_storeu_si256((__m256i*)ptr, bgra0);
+        _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
+        _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
+        _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
+    }
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b,
+                                const v_uint32x8& c, const v_uint32x8& d,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{   // Writes 32 unsigned values to ptr with a, b, c and d interleaved 4-way; mode picks the store instruction.
+    __m256i bg0 = _mm256_unpacklo_epi32(a.val, b.val); // a/b element pairs, low half of each lane
+    __m256i bg1 = _mm256_unpackhi_epi32(a.val, b.val); // a/b element pairs, high half of each lane
+    __m256i ra0 = _mm256_unpacklo_epi32(c.val, d.val); // c/d element pairs, low half of each lane
+    __m256i ra1 = _mm256_unpackhi_epi32(c.val, d.val); // c/d element pairs, high half of each lane
+
+    __m256i bgra0_ = _mm256_unpacklo_epi64(bg0, ra0); // combine pairs into a,b,c,d quads (still lane-local order)
+    __m256i bgra1_ = _mm256_unpackhi_epi64(bg0, ra0);
+    __m256i bgra2_ = _mm256_unpacklo_epi64(bg1, ra1);
+    __m256i bgra3_ = _mm256_unpackhi_epi64(bg1, ra1);
+
+    __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16); // low lanes of bgra0_/bgra1_
+    __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16); // high lanes of bgra0_/bgra1_
+    __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16); // low lanes of bgra2_/bgra3_
+    __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16); // high lanes of bgra2_/bgra3_
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal (streaming) stores
+    {
+        _mm256_stream_si256((__m256i*)ptr, bgra0);
+        _mm256_stream_si256((__m256i*)(ptr + 8), bgra1);
+        _mm256_stream_si256((__m256i*)(ptr + 16), bgra2);
+        _mm256_stream_si256((__m256i*)(ptr + 24), bgra3);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // aligned stores
+    {
+        _mm256_store_si256((__m256i*)ptr, bgra0);
+        _mm256_store_si256((__m256i*)(ptr + 8), bgra1);
+        _mm256_store_si256((__m256i*)(ptr + 16), bgra2);
+        _mm256_store_si256((__m256i*)(ptr + 24), bgra3);
+    }
+    else // any other mode: unaligned stores
+    {
+        _mm256_storeu_si256((__m256i*)ptr, bgra0);
+        _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
+        _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
+        _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
+    }
+}
+
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b,
+                                const v_uint64x4& c, const v_uint64x4& d,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{   // Writes 16 uint64 values to ptr with a, b, c and d interleaved 4-way; mode picks the store instruction.
+    __m256i bg0 = _mm256_unpacklo_epi64(a.val, b.val); // { a0 b0 | a2 b2 }
+    __m256i bg1 = _mm256_unpackhi_epi64(a.val, b.val); // { a1 b1 | a3 b3 }
+    __m256i ra0 = _mm256_unpacklo_epi64(c.val, d.val); // { c0 d0 | c2 d2 }
+    __m256i ra1 = _mm256_unpackhi_epi64(c.val, d.val); // { c1 d1 | c3 d3 }
+
+    __m256i bgra0 = _mm256_permute2x128_si256(bg0, ra0, 0 + 2*16); // { a0 b0 c0 d0 }
+    __m256i bgra1 = _mm256_permute2x128_si256(bg1, ra1, 0 + 2*16); // { a1 b1 c1 d1 }
+    __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16); // { a2 b2 c2 d2 }
+    __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16); // { a3 b3 c3 d3 }
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal (streaming) stores
+    {
+        _mm256_stream_si256((__m256i*)ptr, bgra0);
+        _mm256_stream_si256((__m256i*)(ptr + 4), bgra1);
+        _mm256_stream_si256((__m256i*)(ptr + 8), bgra2);
+        _mm256_stream_si256((__m256i*)(ptr + 12), bgra3);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // aligned stores
+    {
+        _mm256_store_si256((__m256i*)ptr, bgra0);
+        _mm256_store_si256((__m256i*)(ptr + 4), bgra1);
+        _mm256_store_si256((__m256i*)(ptr + 8), bgra2);
+        _mm256_store_si256((__m256i*)(ptr + 12), bgra3);
+    }
+    else // any other mode: unaligned stores
+    {
+        _mm256_storeu_si256((__m256i*)ptr, bgra0);
+        _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
+        _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
+        _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
+    }
+}
+
+#define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
+{ /* 2-channel load: run the _Tpvec1 overload on the same memory, then reinterpret the results as _Tpvec0 */ \
+    _Tpvec1 a1, b1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
+{ /* 3-channel load, same forwarding scheme */ \
+    _Tpvec1 a1, b1, c1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
+{ /* 4-channel load, same forwarding scheme */ \
+    _Tpvec1 a1, b1, c1, d1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+    d0 = v_reinterpret_as_##suffix0(d1); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
+{ /* 2-channel store: reinterpret the inputs as _Tpvec1 and forward, passing mode through */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, mode);      \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
+{ /* 3-channel store, same forwarding scheme */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode);  \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                const _Tpvec0& c0, const _Tpvec0& d0, \
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
+{ /* 4-channel store, same forwarding scheme */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
+}
+
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8) // signed 8-bit forwards to the uchar overloads
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16) // signed 16-bit forwards to the ushort overloads
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32) // signed 32-bit forwards to the unsigned overloads
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32) // float forwards to the unsigned overloads
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64) // signed 64-bit forwards to the uint64 overloads
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64) // double forwards to the uint64 overloads
+
+//
+// FP16
+//
+
+inline v_float32x8 v256_load_expand(const hfloat* ptr)
+{   // Widens the eight half-precision values at ptr into a v_float32x8.
+#if CV_FP16
+    return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)))); // unaligned 128-bit load + hardware half->float conversion
+#else
+    float CV_DECL_ALIGNED(32) widened[8]; // 32-byte aligned scratch buffer for the aligned load below
+    for (int k = 0; k < 8; ++k)
+        widened[k] = static_cast<float>(ptr[k]);
+    return v256_load_aligned(widened);
+#endif
+}
+
+inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
+{   // Narrows the eight floats of a to half precision and writes them to ptr.
+#if CV_FP16
+    const __m128i halves = _mm256_cvtps_ph(a.val, 0); // hardware float->half conversion, rounding immediate 0
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), halves);
+#else
+    float CV_DECL_ALIGNED(32) lanes[8]; // 32-byte aligned scratch buffer for the aligned store below
+    v_store_aligned(lanes, a);
+    for (int k = 0; k < 8; ++k)
+        ptr[k] = hfloat(lanes[k]);
+#endif
+}
+
+//
+// end of FP16
+//
+
+inline void v256_cleanup() { _mm256_zeroall(); } // issues _mm256_zeroall(), which zeroes all YMM registers
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+} // cv::
+
+#endif // OPENCV_HAL_INTRIN_AVX_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_avx512.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_avx512.hpp
new file mode 100644
index 000000000000..e59b8d92ebe9
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_avx512.hpp
@@ -0,0 +1,3090 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_HAL_INTRIN_AVX512_HPP
+#define OPENCV_HAL_INTRIN_AVX512_HPP
+
+#if defined(_MSC_VER) && (_MSC_VER < 1920/*MSVS2019*/)
+# pragma warning(disable:4146)  // unary minus operator applied to unsigned type, result still unsigned
+# pragma warning(disable:4309)  // 'argument': truncation of constant value
+# pragma warning(disable:4310)  // cast truncates constant value
+#endif
+
+#define CVT_ROUND_MODES_IMPLEMENTED 0
+
+#define CV_SIMD512 1  // this header provides the 512-bit vector types
+#define CV_SIMD512_64F 1  // ...including the double-precision v_float64x8
+#define CV_SIMD512_FP16 0  // no native operations with FP16 type. Only load/store to/from float32x16 are provided (v512_load_expand / v_pack_store)
+
+#define _v512_set_epu64(a7, a6, a5, a4, a3, a2, a1, a0) _mm512_set_epi64((int64)(a7),(int64)(a6),(int64)(a5),(int64)(a4),(int64)(a3),(int64)(a2),(int64)(a1),(int64)(a0)) // casts each argument to int64 and forwards to _mm512_set_epi64 in the same order
+#define _v512_set_epu32(a15, a14, a13, a12, a11, a10,  a9,  a8,  a7,  a6,  a5,  a4,  a3,  a2,  a1,  a0) \
+        _mm512_set_epi64(((int64)(a15)<<32)|(int64)(a14), ((int64)(a13)<<32)|(int64)(a12), ((int64)(a11)<<32)|(int64)(a10), ((int64)( a9)<<32)|(int64)( a8), \
+                         ((int64)( a7)<<32)|(int64)( a6), ((int64)( a5)<<32)|(int64)( a4), ((int64)( a3)<<32)|(int64)( a2), ((int64)( a1)<<32)|(int64)( a0)) // each adjacent pair becomes one 64-bit value (odd-numbered argument in the upper 32 bits). NOTE(review): (int64) of a negative signed argument would sign-extend into the upper half; the callers visible in this section pass unsigned or non-negative values
+#define _v512_set_epu16(a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
+                        a15, a14, a13, a12, a11, a10,  a9,  a8,  a7,  a6,  a5,  a4,  a3,  a2,  a1,  a0) \
+        _v512_set_epu32(((unsigned)(a31)<<16)|(unsigned)(a30), ((unsigned)(a29)<<16)|(unsigned)(a28), ((unsigned)(a27)<<16)|(unsigned)(a26), ((unsigned)(a25)<<16)|(unsigned)(a24), \
+                        ((unsigned)(a23)<<16)|(unsigned)(a22), ((unsigned)(a21)<<16)|(unsigned)(a20), ((unsigned)(a19)<<16)|(unsigned)(a18), ((unsigned)(a17)<<16)|(unsigned)(a16), \
+                        ((unsigned)(a15)<<16)|(unsigned)(a14), ((unsigned)(a13)<<16)|(unsigned)(a12), ((unsigned)(a11)<<16)|(unsigned)(a10), ((unsigned)( a9)<<16)|(unsigned)( a8), \
+                        ((unsigned)( a7)<<16)|(unsigned)( a6), ((unsigned)( a5)<<16)|(unsigned)( a4), ((unsigned)( a3)<<16)|(unsigned)( a2), ((unsigned)( a1)<<16)|(unsigned)( a0)) // packs two arguments per 32-bit word (odd-numbered argument shifted into the upper 16 bits), then delegates to _v512_set_epu32
+#define _v512_set_epu8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
+                       a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
+                       a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
+                       a15, a14, a13, a12, a11, a10,  a9,  a8,  a7,  a6,  a5,  a4,  a3,  a2,  a1,  a0) \
+        _v512_set_epu32(((unsigned)(a63)<<24)|((unsigned)(a62)<<16)|((unsigned)(a61)<<8)|(unsigned)(a60),((unsigned)(a59)<<24)|((unsigned)(a58)<<16)|((unsigned)(a57)<<8)|(unsigned)(a56), \
+                        ((unsigned)(a55)<<24)|((unsigned)(a54)<<16)|((unsigned)(a53)<<8)|(unsigned)(a52),((unsigned)(a51)<<24)|((unsigned)(a50)<<16)|((unsigned)(a49)<<8)|(unsigned)(a48), \
+                        ((unsigned)(a47)<<24)|((unsigned)(a46)<<16)|((unsigned)(a45)<<8)|(unsigned)(a44),((unsigned)(a43)<<24)|((unsigned)(a42)<<16)|((unsigned)(a41)<<8)|(unsigned)(a40), \
+                        ((unsigned)(a39)<<24)|((unsigned)(a38)<<16)|((unsigned)(a37)<<8)|(unsigned)(a36),((unsigned)(a35)<<24)|((unsigned)(a34)<<16)|((unsigned)(a33)<<8)|(unsigned)(a32), \
+                        ((unsigned)(a31)<<24)|((unsigned)(a30)<<16)|((unsigned)(a29)<<8)|(unsigned)(a28),((unsigned)(a27)<<24)|((unsigned)(a26)<<16)|((unsigned)(a25)<<8)|(unsigned)(a24), \
+                        ((unsigned)(a23)<<24)|((unsigned)(a22)<<16)|((unsigned)(a21)<<8)|(unsigned)(a20),((unsigned)(a19)<<24)|((unsigned)(a18)<<16)|((unsigned)(a17)<<8)|(unsigned)(a16), \
+                        ((unsigned)(a15)<<24)|((unsigned)(a14)<<16)|((unsigned)(a13)<<8)|(unsigned)(a12),((unsigned)(a11)<<24)|((unsigned)(a10)<<16)|((unsigned)( a9)<<8)|(unsigned)( a8), \
+                        ((unsigned)( a7)<<24)|((unsigned)( a6)<<16)|((unsigned)( a5)<<8)|(unsigned)( a4),((unsigned)( a3)<<24)|((unsigned)( a2)<<16)|((unsigned)( a1)<<8)|(unsigned)( a0)) // packs four arguments per 32-bit word (highest-numbered argument in the top byte); arguments are not masked, so they are expected to fit in 8 bits
+#define _v512_set_epi8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
+                       a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
+                       a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
+                       a15, a14, a13, a12, a11, a10,  a9,  a8,  a7,  a6,  a5,  a4,  a3,  a2,  a1,  a0) \
+        _v512_set_epu8((uchar)(a63), (uchar)(a62), (uchar)(a61), (uchar)(a60), (uchar)(a59), (uchar)(a58), (uchar)(a57), (uchar)(a56), \
+                       (uchar)(a55), (uchar)(a54), (uchar)(a53), (uchar)(a52), (uchar)(a51), (uchar)(a50), (uchar)(a49), (uchar)(a48), \
+                       (uchar)(a47), (uchar)(a46), (uchar)(a45), (uchar)(a44), (uchar)(a43), (uchar)(a42), (uchar)(a41), (uchar)(a40), \
+                       (uchar)(a39), (uchar)(a38), (uchar)(a37), (uchar)(a36), (uchar)(a35), (uchar)(a34), (uchar)(a33), (uchar)(a32), \
+                       (uchar)(a31), (uchar)(a30), (uchar)(a29), (uchar)(a28), (uchar)(a27), (uchar)(a26), (uchar)(a25), (uchar)(a24), \
+                       (uchar)(a23), (uchar)(a22), (uchar)(a21), (uchar)(a20), (uchar)(a19), (uchar)(a18), (uchar)(a17), (uchar)(a16), \
+                       (uchar)(a15), (uchar)(a14), (uchar)(a13), (uchar)(a12), (uchar)(a11), (uchar)(a10), (uchar)( a9), (uchar)( a8), \
+                       (uchar)( a7), (uchar)( a6), (uchar)( a5), (uchar)( a4), (uchar)( a3), (uchar)( a2), (uchar)( a1), (uchar)( a0)) // signed variant: each argument is cast to uchar first so negative values do not spill into neighbouring bytes when packed
+
+#ifndef _mm512_cvtpd_pslo // only supplies a definition when no macro of this name exists
+#ifdef _mm512_zextsi256_si512 // NOTE(review): tests the si256 zero-extend name, but the line below uses the ps256 one - confirm this is intended
+#define _mm512_cvtpd_pslo(a) _mm512_zextps256_ps512(_mm512_cvtpd_ps(a))
+#else
+// Fallback when the preferred zero-extending intrinsic is unavailable: widen with a cast instead.
+#define _mm512_cvtpd_pslo(a) _mm512_castps256_ps512(_mm512_cvtpd_ps(a))
+#endif
+#endif
+///////// Utils ////////////
+
+namespace // unnamed namespace: helpers for moving 256-bit halves in and out of 512-bit registers
+{
+
+inline __m512i _v512_combine(const __m256i& lo, const __m256i& hi) // widens lo with a cast, then inserts hi at 256-bit position 1
+{ return _mm512_inserti32x8(_mm512_castsi256_si512(lo), hi, 1); }
+
+inline __m512 _v512_combine(const __m256& lo, const __m256& hi) // float overload of the above
+{ return _mm512_insertf32x8(_mm512_castps256_ps512(lo), hi, 1); }
+
+inline __m512d _v512_combine(const __m256d& lo, const __m256d& hi) // double overload of the above
+{ return _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1); }
+
+inline int _v_cvtsi512_si32(const __m512i& a) // reads a 32-bit integer from the low 128-bit view of the register
+{ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); }
+
+inline __m256i _v512_extract_high(const __m512i& v) // extracts the 256-bit block at position 1
+{ return _mm512_extracti32x8_epi32(v, 1); }
+
+inline __m256  _v512_extract_high(const __m512& v) // float overload
+{ return _mm512_extractf32x8_ps(v, 1); }
+
+inline __m256d _v512_extract_high(const __m512d& v) // double overload
+{ return _mm512_extractf64x4_pd(v, 1); }
+
+inline __m256i _v512_extract_low(const __m512i& v) // low half obtained with a narrowing cast intrinsic
+{ return _mm512_castsi512_si256(v); }
+
+inline __m256  _v512_extract_low(const __m512& v) // float overload
+{ return _mm512_castps512_ps256(v); }
+
+inline __m256d _v512_extract_low(const __m512d& v) // double overload
+{ return _mm512_castpd512_pd256(v); }
+
+inline __m512i _v512_insert(const __m512i& a, const __m256i& b) // inserts b at 256-bit position 0 of a (contrast with _v512_combine, which uses position 1)
+{ return _mm512_inserti32x8(a, b, 0); }
+
+inline __m512 _v512_insert(const __m512& a, const __m256& b) // float overload
+{ return _mm512_insertf32x8(a, b, 0); }
+
+inline __m512d _v512_insert(const __m512d& a, const __m256d& b) // double overload
+{ return _mm512_insertf64x4(a, b, 0); }
+
+}
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+///////// Types ////////////
+
+struct v_uint8x64
+{
+    typedef uchar lane_type;
+    enum { nlanes = 64 };
+    __m512i val;
+    // Constructors: uninitialized, from a raw register, or from 64 individual lane values.
+    v_uint8x64() {}
+    explicit v_uint8x64(__m512i v) : val(v) {}
+    v_uint8x64(uchar v0,  uchar v1,  uchar v2,  uchar v3,
+               uchar v4,  uchar v5,  uchar v6,  uchar v7,
+               uchar v8,  uchar v9,  uchar v10, uchar v11,
+               uchar v12, uchar v13, uchar v14, uchar v15,
+               uchar v16, uchar v17, uchar v18, uchar v19,
+               uchar v20, uchar v21, uchar v22, uchar v23,
+               uchar v24, uchar v25, uchar v26, uchar v27,
+               uchar v28, uchar v29, uchar v30, uchar v31,
+               uchar v32, uchar v33, uchar v34, uchar v35,
+               uchar v36, uchar v37, uchar v38, uchar v39,
+               uchar v40, uchar v41, uchar v42, uchar v43,
+               uchar v44, uchar v45, uchar v46, uchar v47,
+               uchar v48, uchar v49, uchar v50, uchar v51,
+               uchar v52, uchar v53, uchar v54, uchar v55,
+               uchar v56, uchar v57, uchar v58, uchar v59,
+               uchar v60, uchar v61, uchar v62, uchar v63)
+        : val(_v512_set_epu8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
+                             v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
+                             v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+                             v15, v14, v13, v12, v11, v10, v9,  v8,  v7,  v6,  v5,  v4,  v3,  v2,  v1,  v0))
+    {}
+
+    static inline v_uint8x64 zero() { return v_uint8x64(_mm512_setzero_si512()); }
+
+    // First lane: the register's low 32 bits truncated to uchar.
+    uchar get0() const { return (uchar)_v_cvtsi512_si32(val); }
+};
+
+struct v_int8x64
+{
+    typedef schar lane_type;
+    enum { nlanes = 64 };
+    __m512i val;
+    // Constructors: uninitialized, from a raw register, or from 64 individual lane values.
+    v_int8x64() {}
+    explicit v_int8x64(__m512i v) : val(v) {}
+    v_int8x64(schar v0,  schar v1,  schar v2,  schar v3,
+              schar v4,  schar v5,  schar v6,  schar v7,
+              schar v8,  schar v9,  schar v10, schar v11,
+              schar v12, schar v13, schar v14, schar v15,
+              schar v16, schar v17, schar v18, schar v19,
+              schar v20, schar v21, schar v22, schar v23,
+              schar v24, schar v25, schar v26, schar v27,
+              schar v28, schar v29, schar v30, schar v31,
+              schar v32, schar v33, schar v34, schar v35,
+              schar v36, schar v37, schar v38, schar v39,
+              schar v40, schar v41, schar v42, schar v43,
+              schar v44, schar v45, schar v46, schar v47,
+              schar v48, schar v49, schar v50, schar v51,
+              schar v52, schar v53, schar v54, schar v55,
+              schar v56, schar v57, schar v58, schar v59,
+              schar v60, schar v61, schar v62, schar v63)
+        : val(_v512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
+                             v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
+                             v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+                             v15, v14, v13, v12, v11, v10, v9,  v8,  v7,  v6,  v5,  v4,  v3,  v2,  v1,  v0))
+    {}
+
+    static inline v_int8x64 zero() { return v_int8x64(_mm512_setzero_si512()); }
+
+    // First lane: the register's low 32 bits truncated to schar.
+    schar get0() const { return (schar)_v_cvtsi512_si32(val); }
+};
+
+struct v_uint16x32
+{
+    typedef ushort lane_type;
+    enum { nlanes = 32 };
+    __m512i val;
+    // Constructors: uninitialized, from a raw register, or from 32 individual lane values.
+    v_uint16x32() {}
+    explicit v_uint16x32(__m512i v) : val(v) {}
+    v_uint16x32(ushort v0,  ushort v1,  ushort v2,  ushort v3,
+                ushort v4,  ushort v5,  ushort v6,  ushort v7,
+                ushort v8,  ushort v9,  ushort v10, ushort v11,
+                ushort v12, ushort v13, ushort v14, ushort v15,
+                ushort v16, ushort v17, ushort v18, ushort v19,
+                ushort v20, ushort v21, ushort v22, ushort v23,
+                ushort v24, ushort v25, ushort v26, ushort v27,
+                ushort v28, ushort v29, ushort v30, ushort v31)
+        : val(_v512_set_epu16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+                              v15, v14, v13, v12, v11, v10, v9,  v8,  v7,  v6,  v5,  v4,  v3,  v2,  v1,  v0))
+    {}
+
+    static inline v_uint16x32 zero() { return v_uint16x32(_mm512_setzero_si512()); }
+
+    // First lane: the register's low 32 bits truncated to ushort.
+    ushort get0() const { return (ushort)_v_cvtsi512_si32(val); }
+};
+
+struct v_int16x32
+{
+    typedef short lane_type;
+    enum { nlanes = 32 };
+    __m512i val;
+    // Constructors: uninitialized, from a raw register, or from 32 lane values (cast to ushort before packing).
+    v_int16x32() {}
+    explicit v_int16x32(__m512i v) : val(v) {}
+    v_int16x32(short v0,  short v1,  short v2,  short v3,  short v4,  short v5,  short v6,  short v7,
+               short v8,  short v9,  short v10, short v11, short v12, short v13, short v14, short v15,
+               short v16, short v17, short v18, short v19, short v20, short v21, short v22, short v23,
+               short v24, short v25, short v26, short v27, short v28, short v29, short v30, short v31)
+        : val(_v512_set_epu16((ushort)v31, (ushort)v30, (ushort)v29, (ushort)v28, (ushort)v27, (ushort)v26, (ushort)v25, (ushort)v24,
+                              (ushort)v23, (ushort)v22, (ushort)v21, (ushort)v20, (ushort)v19, (ushort)v18, (ushort)v17, (ushort)v16,
+                              (ushort)v15, (ushort)v14, (ushort)v13, (ushort)v12, (ushort)v11, (ushort)v10, (ushort)v9 , (ushort)v8,
+                              (ushort)v7 , (ushort)v6 , (ushort)v5 , (ushort)v4 , (ushort)v3 , (ushort)v2 , (ushort)v1 , (ushort)v0))
+    {}
+
+    static inline v_int16x32 zero() { return v_int16x32(_mm512_setzero_si512()); }
+
+    // First lane: the register's low 32 bits truncated to short.
+    short get0() const { return (short)_v_cvtsi512_si32(val); }
+};
+
+struct v_uint32x16
+{
+    typedef unsigned lane_type;
+    enum { nlanes = 16 };
+    __m512i val;
+    // Constructors: uninitialized, from a raw register, or from 16 lane values (cast to int for the intrinsic).
+    v_uint32x16() {}
+    explicit v_uint32x16(__m512i v) : val(v) {}
+    v_uint32x16(unsigned v0,  unsigned v1,  unsigned v2,  unsigned v3,
+                unsigned v4,  unsigned v5,  unsigned v6,  unsigned v7,
+                unsigned v8,  unsigned v9,  unsigned v10, unsigned v11,
+                unsigned v12, unsigned v13, unsigned v14, unsigned v15)
+        : val(_mm512_setr_epi32((int)v0,  (int)v1,  (int)v2,  (int)v3, (int)v4,  (int)v5,  (int)v6,  (int)v7,
+                                (int)v8,  (int)v9,  (int)v10, (int)v11, (int)v12, (int)v13, (int)v14, (int)v15))
+    {}
+
+    static inline v_uint32x16 zero() { return v_uint32x16(_mm512_setzero_si512()); }
+
+    // First lane: the register's low 32 bits reinterpreted as unsigned.
+    unsigned get0() const { return (unsigned)_v_cvtsi512_si32(val); }
+};
+
+struct v_int32x16
+{
+    typedef int lane_type;
+    enum { nlanes = 16 };
+    __m512i val;
+    // Constructors: uninitialized, from a raw register, or from 16 individual lane values.
+    v_int32x16() {}
+    explicit v_int32x16(__m512i v) : val(v) {}
+    v_int32x16(int v0, int v1, int v2,  int v3,  int v4,  int v5,  int v6,  int v7,
+               int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15)
+        : val(_mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
+    {}
+
+    static inline v_int32x16 zero() { return v_int32x16(_mm512_setzero_si512()); }
+
+    // First lane: the register's low 32 bits.
+    int get0() const { return _v_cvtsi512_si32(val); }
+};
+
+struct v_float32x16
+{
+    typedef float lane_type;
+    enum { nlanes = 16 };
+    __m512 val;
+    // Constructors: uninitialized, from a raw register, or from 16 individual lane values.
+    v_float32x16() {}
+    explicit v_float32x16(__m512 v) : val(v) {}
+    v_float32x16(float v0, float v1, float v2,  float v3,  float v4,  float v5,  float v6,  float v7,
+                 float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15)
+        : val(_mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
+    {}
+
+    static inline v_float32x16 zero() { return v_float32x16(_mm512_setzero_ps()); }
+
+    // First lane, read through the low 128-bit view of the register.
+    float get0() const { return _mm_cvtss_f32(_mm512_castps512_ps128(val)); }
+};
+
+struct v_uint64x8
+{
+    typedef uint64 lane_type;
+    enum { nlanes = 8 };
+    __m512i val;
+    // Constructors: uninitialized, from a raw register, or from 8 lane values (cast to int64 for the intrinsic).
+    v_uint64x8() {}
+    explicit v_uint64x8(__m512i v) : val(v) {}
+    v_uint64x8(uint64 v0, uint64 v1, uint64 v2, uint64 v3, uint64 v4, uint64 v5, uint64 v6, uint64 v7)
+        : val(_mm512_setr_epi64((int64)v0, (int64)v1, (int64)v2, (int64)v3, (int64)v4, (int64)v5, (int64)v6, (int64)v7)) {}
+
+    static inline v_uint64x8 zero() { return v_uint64x8(_mm512_setzero_si512()); }
+
+    uint64 get0() const
+    {
+    #if !(defined __x86_64__ || defined _M_X64)
+        int lo32 = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
+        int hi32 = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
+        return (unsigned)lo32 | ((uint64)(unsigned)hi32 << 32);
+    #else
+        return (uint64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
+    #endif
+    }
+};
+
+struct v_int64x8
+{
+    typedef int64 lane_type;
+    enum { nlanes = 8 };
+    __m512i val;
+    // Constructors: uninitialized, from a raw register, or from 8 individual lane values.
+    v_int64x8() {}
+    explicit v_int64x8(__m512i v) : val(v) {}
+    v_int64x8(int64 v0, int64 v1, int64 v2, int64 v3, int64 v4, int64 v5, int64 v6, int64 v7)
+        : val(_mm512_setr_epi64(v0, v1, v2, v3, v4, v5, v6, v7)) {}
+
+    static inline v_int64x8 zero() { return v_int64x8(_mm512_setzero_si512()); }
+
+    int64 get0() const
+    {
+    #if !(defined __x86_64__ || defined _M_X64)
+        int lo32 = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
+        int hi32 = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
+        return (int64)((unsigned)lo32 | ((uint64)(unsigned)hi32 << 32));
+    #else
+        return (int64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
+    #endif
+    }
+};
+
+struct v_float64x8
+{
+    typedef double lane_type;
+    enum { nlanes = 8 };
+    __m512d val;
+    // Constructors: uninitialized, from a raw register, or from 8 individual lane values.
+    v_float64x8() {}
+    explicit v_float64x8(__m512d v) : val(v) {}
+    v_float64x8(double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7)
+        : val(_mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7)) {}
+
+    static inline v_float64x8 zero() { return v_float64x8(_mm512_setzero_pd()); }
+
+    double get0() const { return _mm_cvtsd_f64(_mm512_castpd512_pd128(val)); }
+};
+
+//////////////// Load and store operations ///////////////
+
+#define OPENCV_HAL_IMPL_AVX512_LOADSTORE(_Tpvec, _Tp) /* load/store overloads for an integer-lane vector type */ \
+    inline _Tpvec v512_load(const _Tp* ptr) /* unaligned full-width load */ \
+    { return _Tpvec(_mm512_loadu_si512((const __m512i*)ptr)); }       \
+    inline _Tpvec v512_load_aligned(const _Tp* ptr)                   \
+    { return _Tpvec(_mm512_load_si512((const __m512i*)ptr)); }        \
+    inline _Tpvec v512_load_low(const _Tp* ptr) /* loads 256 bits; widened with a cast intrinsic, not a zero-extend */ \
+    {                                                                 \
+        __m256i v256 = _mm256_loadu_si256((const __m256i*)ptr);       \
+        return _Tpvec(_mm512_castsi256_si512(v256));                  \
+    }                                                                 \
+    inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) /* ptr0 supplies the lo argument, ptr1 the hi argument of _v512_combine */ \
+    {                                                                 \
+        __m256i vlo = _mm256_loadu_si256((const __m256i*)ptr0);       \
+        __m256i vhi = _mm256_loadu_si256((const __m256i*)ptr1);       \
+        return _Tpvec(_v512_combine(vlo, vhi));                       \
+    }                                                                 \
+    inline void v_store(_Tp* ptr, const _Tpvec& a) /* unaligned store */ \
+    { _mm512_storeu_si512((__m512i*)ptr, a.val); }                    \
+    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)            \
+    { _mm512_store_si512((__m512i*)ptr, a.val); }                     \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) /* uses the stream store intrinsic */ \
+    { _mm512_stream_si512((__m512i*)ptr, a.val); }                    \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) /* any mode other than the two tested falls through to the aligned store */ \
+    { \
+        if( mode == hal::STORE_UNALIGNED ) \
+            _mm512_storeu_si512((__m512i*)ptr, a.val); \
+        else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
+            _mm512_stream_si512((__m512i*)ptr, a.val); \
+        else \
+            _mm512_store_si512((__m512i*)ptr, a.val); \
+    } \
+    inline void v_store_low(_Tp* ptr, const _Tpvec& a) /* unaligned store of the low 256 bits */ \
+    { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_low(a.val)); }    \
+    inline void v_store_high(_Tp* ptr, const _Tpvec& a) /* unaligned store of the high 256 bits */ \
+    { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_high(a.val)); }
+
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint8x64,  uchar) // one instantiation per integer vector type
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int8x64,   schar)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint16x32, ushort)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int16x32,  short)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint32x16,  unsigned)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int32x16,   int)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint64x8,  uint64)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int64x8,   int64)
+
+#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) /* floating-point counterpart; suffix picks the ps/pd intrinsics, halfreg is the 256-bit register type */ \
+    inline _Tpvec v512_load(const _Tp* ptr) /* unaligned full-width load */ \
+    { return _Tpvec(_mm512_loadu_##suffix(ptr)); }                        \
+    inline _Tpvec v512_load_aligned(const _Tp* ptr)                       \
+    { return _Tpvec(_mm512_load_##suffix(ptr)); }                         \
+    inline _Tpvec v512_load_low(const _Tp* ptr) /* loads 256 bits; widened with a cast intrinsic, not a zero-extend */ \
+    {                                                                     \
+        return _Tpvec(_mm512_cast##suffix##256_##suffix##512              \
+                     (_mm256_loadu_##suffix(ptr)));                       \
+    }                                                                     \
+    inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) /* ptr0 supplies the lo argument, ptr1 the hi argument of _v512_combine */ \
+    {                                                                     \
+        halfreg vlo = _mm256_loadu_##suffix(ptr0);                        \
+        halfreg vhi = _mm256_loadu_##suffix(ptr1);                        \
+        return _Tpvec(_v512_combine(vlo, vhi));                           \
+    }                                                                     \
+    inline void v_store(_Tp* ptr, const _Tpvec& a) /* unaligned store */ \
+    { _mm512_storeu_##suffix(ptr, a.val); }                               \
+    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)                \
+    { _mm512_store_##suffix(ptr, a.val); }                                \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) /* uses the stream store intrinsic */ \
+    { _mm512_stream_##suffix(ptr, a.val); }                               \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) /* any mode other than the two tested falls through to the aligned store */ \
+    { \
+        if( mode == hal::STORE_UNALIGNED ) \
+            _mm512_storeu_##suffix(ptr, a.val); \
+        else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
+            _mm512_stream_##suffix(ptr, a.val); \
+        else \
+            _mm512_store_##suffix(ptr, a.val); \
+    } \
+    inline void v_store_low(_Tp* ptr, const _Tpvec& a) /* unaligned store of the low 256 bits */ \
+    { _mm256_storeu_##suffix(ptr, _v512_extract_low(a.val)); }            \
+    inline void v_store_high(_Tp* ptr, const _Tpvec& a) /* unaligned store of the high 256 bits */ \
+    { _mm256_storeu_##suffix(ptr, _v512_extract_high(a.val)); }
+
+OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float32x16, float,  ps, __m256) // single precision
+OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d) // double precision
+
+#define OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, _Tpvecf, suffix, cast) /* defines v_reinterpret_as_<suffix>(_Tpvecf): applies cast to the source register and wraps it in _Tpvec */ \
+    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a)   \
+    { return _Tpvec(cast(a.val)); }
+
+#define OPENCV_HAL_IMPL_AVX512_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) /* setzero/setall plus reinterpret casts from all ten vector types into an integer vector type */ \
+    inline _Tpvec v512_setzero_##suffix()                                          \
+    { return _Tpvec(_mm512_setzero_si512()); }                                     \
+    inline _Tpvec v512_setall_##suffix(_Tp v) /* v is cast to ctype_s before the set1 intrinsic named by ssuffix */ \
+    { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); }                          \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64,   suffix, OPENCV_HAL_NOP)      \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64,    suffix, OPENCV_HAL_NOP)      \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32,  suffix, OPENCV_HAL_NOP)      \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32,   suffix, OPENCV_HAL_NOP)      \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16,  suffix, OPENCV_HAL_NOP)      \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16,   suffix, OPENCV_HAL_NOP)      \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8,   suffix, OPENCV_HAL_NOP)      \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8,    suffix, OPENCV_HAL_NOP)      \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float32x16, suffix, _mm512_castps_si512) \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float64x8,  suffix, _mm512_castpd_si512)
+
+OPENCV_HAL_IMPL_AVX512_INIT(v_uint8x64,  uchar,    u8,  epi8,   char) // columns: vector type, lane type, name suffix, set1 suffix, cast type for set1
+OPENCV_HAL_IMPL_AVX512_INIT(v_int8x64,   schar,    s8,  epi8,   char)
+OPENCV_HAL_IMPL_AVX512_INIT(v_uint16x32, ushort,   u16, epi16,  short)
+OPENCV_HAL_IMPL_AVX512_INIT(v_int16x32,  short,    s16, epi16,  short)
+OPENCV_HAL_IMPL_AVX512_INIT(v_uint32x16, unsigned, u32, epi32,  int)
+OPENCV_HAL_IMPL_AVX512_INIT(v_int32x16,  int,      s32, epi32,  int)
+OPENCV_HAL_IMPL_AVX512_INIT(v_uint64x8,  uint64,   u64, epi64,  int64)
+OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8,   int64,    s64, epi64,  int64)
+
+#define OPENCV_HAL_IMPL_AVX512_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) /* setzero/setall plus reinterpret casts from the eight integer vector types into a float vector type */ \
+    inline _Tpvec v512_setzero_##suffix()                                   \
+    { return _Tpvec(_mm512_setzero_##zsuffix()); }                          \
+    inline _Tpvec v512_setall_##suffix(_Tp v)                               \
+    { return _Tpvec(_mm512_set1_##zsuffix(v)); }                            \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64,  suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64,   suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32,  suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16,  suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8,  suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8,   suffix, cast)
+
+OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float32x16, float,  f32, ps, _mm512_castsi512_ps) // float<->float casts are written out by hand after these two lines
+OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float64x8,  double, f64, pd, _mm512_castsi512_pd)
+
+inline v_float32x16 v_reinterpret_as_f32(const v_float32x16& a) // same type: returned unchanged
+{ return a; }
+inline v_float32x16 v_reinterpret_as_f32(const v_float64x8& a) // rewraps the register through _mm512_castpd_ps
+{ return v_float32x16(_mm512_castpd_ps(a.val)); }
+
+inline v_float64x8 v_reinterpret_as_f64(const v_float64x8& a) // same type: returned unchanged
+{ return a; }
+inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a) // rewraps the register through _mm512_castps_pd
+{ return v_float64x8(_mm512_castps_pd(a.val)); }
+
+// FP16: conversions between 16 packed half floats and a v_float32x16
+inline v_float32x16 v512_load_expand(const hfloat* ptr)
+{
+    const __m256i halves = _mm256_loadu_si256((const __m256i*)ptr);
+    return v_float32x16(_mm512_cvtph_ps(halves));
+}
+inline void v_pack_store(hfloat* ptr, const v_float32x16& a)
+{
+    // The immediate 0 is forwarded unchanged as the conversion intrinsic's second argument.
+    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtps_ph(a.val, 0));
+}
+
+/* Recombine & ZIP */
+inline void v_zip(const v_int8x64& a, const v_int8x64& b, v_int8x64& ab0, v_int8x64& ab1)
+{
+#if !CV_AVX_512VBMI // without VBMI: interleave bytes with the unpack intrinsics, then reorder 64-bit chunks
+    const __m512i unpacked_lo = _mm512_unpacklo_epi8(a.val, b.val);
+    const __m512i unpacked_hi = _mm512_unpackhi_epi8(a.val, b.val);
+    ab0 = v_int8x64(_mm512_permutex2var_epi64(unpacked_lo, _v512_set_epu64(11, 10, 3, 2,  9,  8, 1, 0), unpacked_hi));
+    ab1 = v_int8x64(_mm512_permutex2var_epi64(unpacked_lo, _v512_set_epu64(15, 14, 7, 6, 13, 12, 5, 4), unpacked_hi));
+#else
+    const __m512i idx_lo = _v512_set_epu8( 95,  31,  94,  30,  93,  29,  92,  28,  91,  27,  90,  26,  89,  25,  88,  24,
+                                           87,  23,  86,  22,  85,  21,  84,  20,  83,  19,  82,  18,  81,  17,  80,  16,
+                                           79,  15,  78,  14,  77,  13,  76,  12,  75,  11,  74,  10,  73,   9,  72,   8,
+                                           71,   7,  70,   6,  69,   5,  68,   4,  67,   3,  66,   2,  65,   1,  64,   0);
+    ab0 = v_int8x64(_mm512_permutex2var_epi8(a.val, idx_lo, b.val));
+    const __m512i idx_hi = _v512_set_epu8(127,  63, 126,  62, 125,  61, 124,  60, 123,  59, 122,  58, 121,  57, 120,  56,
+                                          119,  55, 118,  54, 117,  53, 116,  52, 115,  51, 114,  50, 113,  49, 112,  48,
+                                          111,  47, 110,  46, 109,  45, 108,  44, 107,  43, 106,  42, 105,  41, 104,  40,
+                                          103,  39, 102,  38, 101,  37, 100,  36,  99,  35,  98,  34,  97,  33,  96,  32);
+    ab1 = v_int8x64(_mm512_permutex2var_epi8(a.val, idx_hi, b.val));
+#endif
+}
+inline void v_zip(const v_int16x32& a, const v_int16x32& b, v_int16x32& ab0, v_int16x32& ab1)
+{
+    const __m512i idx_lo = _v512_set_epu16(47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41,  9, 40,  8,
+                                           39,  7, 38,  6, 37,  5, 36,  4, 35,  3, 34,  2, 33,  1, 32,  0);
+    const __m512i idx_hi = _v512_set_epu16(63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24,
+                                           55, 23, 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16);
+    ab0 = v_int16x32(_mm512_permutex2var_epi16(a.val, idx_lo, b.val)); // outputs are written in the original order: ab0, then ab1
+    ab1 = v_int16x32(_mm512_permutex2var_epi16(a.val, idx_hi, b.val));
+}
+inline void v_zip(const v_int32x16& a, const v_int32x16& b, v_int32x16& ab0, v_int32x16& ab1)
+{
+    const __m512i idx_lo = _v512_set_epu32(23,  7, 22,  6, 21,  5, 20,  4, 19,  3, 18,  2, 17, 1, 16, 0);
+    const __m512i idx_hi = _v512_set_epu32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
+    ab0 = v_int32x16(_mm512_permutex2var_epi32(a.val, idx_lo, b.val)); // outputs are written in the original order: ab0, then ab1
+    ab1 = v_int32x16(_mm512_permutex2var_epi32(a.val, idx_hi, b.val));
+}
+inline void v_zip(const v_int64x8& a, const v_int64x8& b, v_int64x8& ab0, v_int64x8& ab1)
+{
+    const __m512i idx_lo = _v512_set_epu64(11, 3, 10, 2,  9, 1,  8, 0);
+    const __m512i idx_hi = _v512_set_epu64(15, 7, 14, 6, 13, 5, 12, 4);
+    ab0 = v_int64x8(_mm512_permutex2var_epi64(a.val, idx_lo, b.val)); // outputs are written in the original order: ab0, then ab1
+    ab1 = v_int64x8(_mm512_permutex2var_epi64(a.val, idx_hi, b.val));
+}
+
+inline void v_zip(const v_uint8x64&  a, const v_uint8x64&  b, v_uint8x64& ab0, v_uint8x64& ab1)
+{   // Forwarded to the signed 8-bit overload through reinterpret casts.
+    v_int8x64 zipped_lo, zipped_hi;
+    v_zip(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b), zipped_lo, zipped_hi);
+    ab0 = v_reinterpret_as_u8(zipped_lo);
+    ab1 = v_reinterpret_as_u8(zipped_hi);
+}
+inline void v_zip(const v_uint16x32&  a, const v_uint16x32&  b, v_uint16x32& ab0, v_uint16x32& ab1)
+{   // Forwarded to the signed 16-bit overload through reinterpret casts.
+    v_int16x32 zipped_lo, zipped_hi;
+    v_zip(v_reinterpret_as_s16(a), v_reinterpret_as_s16(b), zipped_lo, zipped_hi);
+    ab0 = v_reinterpret_as_u16(zipped_lo);
+    ab1 = v_reinterpret_as_u16(zipped_hi);
+}
+inline void v_zip(const v_uint32x16&  a, const v_uint32x16&  b, v_uint32x16& ab0, v_uint32x16& ab1)
+{   // Forwarded to the signed 32-bit overload through reinterpret casts.
+    v_int32x16 zipped_lo, zipped_hi;
+    v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), zipped_lo, zipped_hi);
+    ab0 = v_reinterpret_as_u32(zipped_lo);
+    ab1 = v_reinterpret_as_u32(zipped_hi);
+}
+inline void v_zip(const v_uint64x8&  a, const v_uint64x8&  b, v_uint64x8& ab0, v_uint64x8& ab1)
+{   // Forwarded to the signed 64-bit overload through reinterpret casts.
+    v_int64x8 zipped_lo, zipped_hi;
+    v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), zipped_lo, zipped_hi);
+    ab0 = v_reinterpret_as_u64(zipped_lo);
+    ab1 = v_reinterpret_as_u64(zipped_hi);
+}
+inline void v_zip(const v_float32x16&  a, const v_float32x16&  b, v_float32x16& ab0, v_float32x16& ab1)
+{   // Float lanes are zipped as 32-bit integers and reinterpreted back.
+    v_int32x16 zipped_lo, zipped_hi;
+    v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), zipped_lo, zipped_hi);
+    ab0 = v_reinterpret_as_f32(zipped_lo);
+    ab1 = v_reinterpret_as_f32(zipped_hi);
+}
+inline void v_zip(const v_float64x8&  a, const v_float64x8&  b, v_float64x8& ab0, v_float64x8& ab1)
+{   // Double lanes are zipped as 64-bit integers and reinterpreted back.
+    v_int64x8 zipped_lo, zipped_hi;
+    v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), zipped_lo, zipped_hi);
+    ab0 = v_reinterpret_as_f64(zipped_lo);
+    ab1 = v_reinterpret_as_f64(zipped_hi);
+}
+
+#define OPENCV_HAL_IMPL_AVX512_COMBINE(_Tpvec, suffix) /* half-register recombination; the suffix parameter is not referenced in the body */ \
+    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) /* low half of a, then low half of b */ \
+    { return _Tpvec(_v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val))); } \
+    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) /* starts from b and inserts a's high half at position 0 */ \
+    { return _Tpvec(_v512_insert(b.val, _v512_extract_high(a.val))); }                    \
+    inline void v_recombine(const _Tpvec& a, const _Tpvec& b, /* c gets the v_combine_low result, d the v_combine_high result */ \
+                                  _Tpvec& c, _Tpvec& d)                                   \
+    {                                                                                     \
+        c.val = _v512_combine(_v512_extract_low(a.val),_v512_extract_low(b.val));         \
+        d.val = _v512_insert(b.val,_v512_extract_high(a.val));                            \
+    }
+
+
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint8x64,   epi8) // one instantiation per vector type
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_int8x64,    epi8)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint16x32,  epi16)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_int16x32,   epi16)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint32x16,  epi32)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_int32x16,   epi32)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint64x8,   epi64)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_int64x8,    epi64)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_float32x16, ps)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_float64x8,  pd)
+
+////////// Arithmetic, bitwise and comparison operations /////////
+
+/* Element-wise binary and unary operations */
+
+/** Non-saturating arithmetics: each generated function passes a.val and b.val to one intrinsic and wraps the result **/
+#define OPENCV_HAL_IMPL_AVX512_BIN_FUNC(fname, _Tpvec, op)  \
+    inline _Tpvec fname(const _Tpvec& a, const _Tpvec& b)   \
+    { return _Tpvec(op(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint8x64, _mm512_add_epi8)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int8x64, _mm512_add_epi8)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint16x32, _mm512_add_epi16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int16x32, _mm512_add_epi16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint8x64, _mm512_sub_epi8)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int8x64, _mm512_sub_epi8)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint16x32, _mm512_sub_epi16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int16x32, _mm512_sub_epi16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_uint16x32, _mm512_mullo_epi16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_int16x32, _mm512_mullo_epi16)
+
+inline v_uint8x64 v_mul_wrap(const v_uint8x64& a, const v_uint8x64& b)
+{   // Works on 16-bit lanes: the low byte and the high byte of every lane are multiplied separately, then merged.
+    const __m512i a_hi = _mm512_srai_epi16(a.val, 8);
+    const __m512i b_hi = _mm512_srai_epi16(b.val, 8);
+    const __m512i even = _mm512_mullo_epi16(a.val, b.val);                      // low byte of each lane is the even-byte product
+    const __m512i odd  = _mm512_slli_epi16(_mm512_mullo_epi16(a_hi, b_hi), 8);  // odd-byte product moved into the high byte
+    return v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, even, odd));   // set mask bits (1, 3, 5, ...) select `odd`
+}
+// Signed 8-bit wrap-around multiply. The low 8 bits of a product do not depend on signedness,
+// so this forwards to the v_uint8x64 overload on reinterpreted operands.
+inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b)
+{ return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); }
+
+#define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) /* defines `a bin_op b` and `a bin_op= b` via intrin */ \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)     \
+    { return _Tpvec(intrin(a.val, b.val)); }                             \
+    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)       \
+    { a.val = intrin(a.val, b.val); return a; }
+
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64)
+// 32- and 64-bit multiplication uses the mullo intrinsics, i.e. it keeps the low half of every product.
+OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64)
+
+/** Saturating arithmetics: + and - on 8- and 16-bit lanes use the adds/subs intrinsics **/
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64,  _mm512_adds_epu8)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64,  _mm512_subs_epu8)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64,   _mm512_adds_epi8)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64,   _mm512_subs_epi8)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32,  _mm512_adds_epi16)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32,  _mm512_subs_epi16)
+// Floating-point +, -, * and / map one-to-one onto the ps/pd intrinsics.
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd)
+
+// Saturating multiply: 16-bit products come from v_mul_expand and are narrowed back to 8 bits with v_pack.
+inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b)
+{
+    v_uint16x32 p0, p1;
+    v_mul_expand(a, b, p0, p1);
+    return v_pack(p0, p1);
+}
+inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b)
+{   // Signed counterpart: 16-bit products from v_mul_expand are narrowed back to 8 bits with v_pack.
+    v_int16x32 p0, p1;
+    v_mul_expand(a, b, p0, p1);
+    return v_pack(p0, p1);
+}
+inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b)
+{   // Rebuild the 32-bit products by interleaving their low and high 16-bit halves.
+    const __m512i lo16 = _mm512_mullo_epi16(a.val, b.val);
+    const __m512i hi16 = _mm512_mulhi_epu16(a.val, b.val);
+    const __m512i prod_a = _mm512_unpacklo_epi16(lo16, hi16);
+    const __m512i prod_b = _mm512_unpackhi_epi16(lo16, hi16);
+    // Clamp to 65535 before packing: _mm512_packus_epi32 interprets its inputs as signed 32-bit values.
+    const __m512i limit = _mm512_set1_epi32(65535);
+    return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(prod_a, limit), _mm512_min_epu32(prod_b, limit)));
+}
+inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b)
+{   // Interleave the low/high product halves into 32-bit values, then narrow them with signed saturation.
+    const __m512i lo16 = _mm512_mullo_epi16(a.val, b.val);
+    const __m512i hi16 = _mm512_mulhi_epi16(a.val, b.val);
+    const __m512i prod_a = _mm512_unpacklo_epi16(lo16, hi16);
+    const __m512i prod_b = _mm512_unpackhi_epi16(lo16, hi16);
+    return v_int16x32(_mm512_packs_epi32(prod_a, prod_b));
+}
+
+inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b)
+{ return a = a * b; }  // compound form of the 8-bit unsigned operator *
+inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b)
+{ return a = a * b; }  // compound form of the 8-bit signed operator *
+inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b)
+{ return a = a * b; }  // compound form of the 16-bit unsigned operator *
+inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b)
+{ return a = a * b; }  // compound form of the 16-bit signed operator *
+
+inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); } // high 16 bits of each signed product
+inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); } // high 16 bits of each unsigned product
+
+// Multiply and expand: both inputs are widened to 16-bit lanes first, so every 8-bit product fits without wrapping.
+inline void v_mul_expand(const v_uint8x64& a, const v_uint8x64& b,
+                         v_uint16x32& c, v_uint16x32& d)
+{
+    v_uint16x32 wa0, wa1, wb0, wb1;
+    v_expand(a, wa0, wa1);
+    v_expand(b, wb0, wb1);
+    c = v_mul_wrap(wa0, wb0);
+    d = v_mul_wrap(wa1, wb1);
+}
+
+inline void v_mul_expand(const v_int8x64& a, const v_int8x64& b,
+                         v_int16x32& c, v_int16x32& d)
+{   // Widen both inputs to 16-bit lanes with v_expand, then multiply the matching halves.
+    v_int16x32 wa0, wa1, wb0, wb1;
+    v_expand(a, wa0, wa1);
+    v_expand(b, wb0, wb1);
+    c = v_mul_wrap(wa0, wb0);
+    d = v_mul_wrap(wa1, wb1);
+}
+
+inline void v_mul_expand(const v_int16x32& a, const v_int16x32& b,
+                         v_int32x16& c, v_int32x16& d)
+{
+    // v_mul_wrap gives the low 16 bits of each product and v_mul_hi the high 16; zipping them pairs the two halves.
+    v_int16x32 z0, z1;
+    v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), z0, z1);
+    c = v_reinterpret_as_s32(z0);
+    d = v_reinterpret_as_s32(z1);
+}
+
+inline void v_mul_expand(const v_uint16x32& a, const v_uint16x32& b,
+                         v_uint32x16& c, v_uint32x16& d)
+{
+    // v_mul_wrap gives the low 16 bits of each product and v_mul_hi the high 16; zipping them pairs the two halves.
+    v_uint16x32 z0, z1;
+    v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), z0, z1);
+    c = v_reinterpret_as_u32(z0);
+    d = v_reinterpret_as_u32(z1);
+}
+
+inline void v_mul_expand(const v_uint32x16& a, const v_uint32x16& b, v_uint64x8& c, v_uint64x8& d)
+{   // _mm512_mul_epu32 multiplies the low 32 bits of every 64-bit lane; shifting by 32 exposes the odd elements.
+    const v_uint64x8 even(_mm512_mul_epu32(a.val, b.val));
+    const v_uint64x8 odd(_mm512_mul_epu32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32)));
+    v_zip(even, odd, c, d);
+}
+
+inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b, v_int64x8& c, v_int64x8& d)
+{   // _mm512_mul_epi32 reads only the low 32 bits of every 64-bit lane, so a logical shift suffices for the odd elements.
+    const v_int64x8 even(_mm512_mul_epi32(a.val, b.val));
+    const v_int64x8 odd(_mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32)));
+    v_zip(even, odd, c, d);
+}
+
+/** Bitwise shifts: operator << / >> take a run-time count, v_shl<imm> / v_shr<imm> a compile-time one; right shifts are logical (srli) for _Tpuvec and arithmetic (srai) for _Tpsvec **/
+#define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \
+    inline _Tpuvec operator << (const _Tpuvec& a, int imm)        \
+    { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); }         \
+    inline _Tpsvec operator << (const _Tpsvec& a, int imm)        \
+    { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); }         \
+    inline _Tpuvec operator >> (const _Tpuvec& a, int imm)        \
+    { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); }         \
+    inline _Tpsvec operator >> (const _Tpsvec& a, int imm)        \
+    { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); }         \
+    template<int imm>                                             \
+    inline _Tpuvec v_shl(const _Tpuvec& a)                        \
+    { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); }         \
+    template<int imm>                                             \
+    inline _Tpsvec v_shl(const _Tpsvec& a)                        \
+    { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); }         \
+    template<int imm>                                             \
+    inline _Tpuvec v_shr(const _Tpuvec& a)                        \
+    { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); }         \
+    template<int imm>                                             \
+    inline _Tpsvec v_shr(const _Tpsvec& a)                        \
+    { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); }
+
+OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint16x32, v_int16x32, epi16)
+OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint32x16, v_int32x16, epi32)
+OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8,  v_int64x8,  epi64)
+
+
+/** Bitwise logic: &, | and ^ (plus compound forms, via OPENCV_HAL_IMPL_AVX512_BIN_OP) and ~, which is XOR with the all-ones constant not_const **/
+#define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \
+    OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix)  \
+    OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix)   \
+    OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix)  \
+    inline _Tpvec operator ~ (const _Tpvec& a)                     \
+    { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); }
+
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64,   si512, _mm512_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int8x64,    si512, _mm512_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint16x32,  si512, _mm512_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int16x32,   si512, _mm512_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint32x16,  si512, _mm512_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int32x16,   si512, _mm512_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint64x8,   si512, _mm512_set1_epi64(-1))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int64x8,    si512, _mm512_set1_epi64(-1))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float32x16, ps,    _mm512_castsi512_ps(_mm512_set1_epi32(-1)))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float64x8,  pd,    _mm512_castsi512_pd(_mm512_set1_epi32(-1)))
+
+/** Select: per lane, v_select yields b where mask compares equal to zero and a everywhere else **/
+#define OPENCV_HAL_IMPL_AVX512_SELECT(_Tpvec, suffix, zsuf)                      \
+    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(_mm512_mask_blend_##suffix(_mm512_cmp_##suffix##_mask(mask.val, _mm512_setzero_##zsuf(), _MM_CMPINT_EQ), a.val, b.val)); }
+
+OPENCV_HAL_IMPL_AVX512_SELECT(v_uint8x64,   epi8, si512)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_int8x64,    epi8, si512)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_uint16x32, epi16, si512)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_int16x32,  epi16, si512)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_uint32x16, epi32, si512)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_int32x16,  epi32, si512)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_uint64x8,  epi64, si512)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_int64x8,   epi64, si512)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_float32x16,   ps,    ps)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8,    pd,    pd)
+
+/** Comparison: the integer operators return tval (all bits set) in lanes where the predicate holds and zero elsewhere **/
+#define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)               \
+    { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); }
+// '>' and '>=' are expressed as not-less-or-equal (NLE) and not-less-than (NLT).
+#define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval)              \
+    OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ,  _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE,  _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_INT(<,  _MM_CMPINT_LT,  _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_INT(>,  _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE,  _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)
+
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64,   epu8,  epi8, (char)-1)
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64,    epi8,  epi8, (char)-1)
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint16x32, epu16, epi16, (short)-1)
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int16x32,  epi16, epi16, (short)-1)
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint32x16, epu32, epi32, (int)-1)
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int32x16,  epi32, epi32, (int)-1)
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8,  epu64, epi64, (int64)-1)
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8,   epi64, epi64, (int64)-1)
+
+#define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)               \
+    { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); }
+// All six float predicates are the ordered, quiet (_OQ) variants, so each of them - including != - is false when an operand is NaN.
+#define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval)           \
+    OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ,  _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_FLT(<,  _CMP_LT_OQ,  _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_FLT(>,  _CMP_GT_OQ,  _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ,  _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ,  _Tpvec, sufcmp, sufset, tval)
+
+OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1)
+OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8,  pd, epi64, (int64)-1)
+
+inline v_float32x16 v_not_nan(const v_float32x16& a) // all bits set in lanes where a is ordered with itself (not NaN), zero elsewhere
+{ return v_float32x16(_mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.val, a.val, _CMP_ORD_Q), (int)-1))); }
+inline v_float64x8 v_not_nan(const v_float64x8& a) // all bits set in lanes where a is ordered with itself (not NaN), zero elsewhere
+{ return v_float64x8(_mm512_castsi512_pd(_mm512_maskz_set1_epi64(_mm512_cmp_pd_mask(a.val, a.val, _CMP_ORD_Q), (int64)-1))); }
+
+/** min/max: element-wise v_min / v_max for every vector type, each forwarding to the matching _mm512_min_* / _mm512_max_* intrinsic **/
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint8x64,   _mm512_min_epu8)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint8x64,   _mm512_max_epu8)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int8x64,    _mm512_min_epi8)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int8x64,    _mm512_max_epi8)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint16x32,  _mm512_min_epu16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint16x32,  _mm512_max_epu16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int16x32,   _mm512_min_epi16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int16x32,   _mm512_max_epi16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint32x16,  _mm512_min_epu32)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint32x16,  _mm512_max_epu32)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int32x16,   _mm512_min_epi32)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int32x16,   _mm512_max_epi32)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint64x8,   _mm512_min_epu64)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint64x8,   _mm512_max_epu64)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int64x8,    _mm512_min_epi64)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int64x8,    _mm512_max_epi64)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float32x16, _mm512_min_ps)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float32x16, _mm512_max_ps)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float64x8,  _mm512_min_pd)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float64x8,  _mm512_max_pd)
+
+/** Rotate: _v_rotate_right<prec, imm4, part, imm32> is the helper used when CV_AVX_512VBMI is off; its caller passes prec = (imm%4 != 0), imm4 = imm%4, part = (imm/4 > 15), imm32 = imm/4 **/
+namespace {
+    template<bool prec, int imm4, bool part, int imm32>
+    struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }};
+    template<int imm4, int imm32>
+    struct _v_rotate_right<true, imm4, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
+    {
+        return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32    ),    imm4 *8),
+                                         _mm512_slli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 + 1), (4-imm4)*8)));
+    }};
+    template<int imm4>
+    struct _v_rotate_right<true, imm4, false, 15> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
+    {
+        return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, 15),    imm4 *8),
+                                         _mm512_slli_epi32(                                b.val, (4-imm4)*8)));
+    }};
+    template<int imm4, int imm32>
+    struct _v_rotate_right<true, imm4, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
+    {
+        return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16),    imm4 *8),
+                                         _mm512_slli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 15), (4-imm4)*8)));
+    }};
+    template<int imm4>
+    struct _v_rotate_right<true, imm4, true, 31> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
+    { return v_int8x64(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, 15), imm4*8)); }};
+    template<int imm32>
+    struct _v_rotate_right<false, 0, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
+    { return v_int8x64(_mm512_alignr_epi32(b.val, a.val, imm32)); }};
+    template<>
+    struct _v_rotate_right<false, 0, false, 0> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64&) { return a; }};
+    template<int imm32>
+    struct _v_rotate_right<false, 0, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
+    { return v_int8x64(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16)); }};
+    template<>
+    struct _v_rotate_right<false, 0, true, 16> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) { return b; }};
+    template<>
+    struct _v_rotate_right<false, 0, true, 32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }};
+}
+template<int imm> inline v_int8x64 v_rotate_right(const v_int8x64& a, const v_int8x64& b)
+{   // Result byte i comes from position i + imm of the sequence a (bytes 0..63) followed by b (bytes 64..127); imm >= 128 returns v_int8x64().
+    return imm >= 128 ? v_int8x64() :
+#if CV_AVX_512VBMI // NOTE(review): for 64 < imm < 128 indices past 127 appear to wrap back into a.val here, while the #else path shifts in zeros - confirm
+    v_int8x64(_mm512_permutex2var_epi8(a.val,
+    _v512_set_epu8(0x3f + imm, 0x3e + imm, 0x3d + imm, 0x3c + imm, 0x3b + imm, 0x3a + imm, 0x39 + imm, 0x38 + imm,
+                   0x37 + imm, 0x36 + imm, 0x35 + imm, 0x34 + imm, 0x33 + imm, 0x32 + imm, 0x31 + imm, 0x30 + imm,
+                   0x2f + imm, 0x2e + imm, 0x2d + imm, 0x2c + imm, 0x2b + imm, 0x2a + imm, 0x29 + imm, 0x28 + imm,
+                   0x27 + imm, 0x26 + imm, 0x25 + imm, 0x24 + imm, 0x23 + imm, 0x22 + imm, 0x21 + imm, 0x20 + imm,
+                   0x1f + imm, 0x1e + imm, 0x1d + imm, 0x1c + imm, 0x1b + imm, 0x1a + imm, 0x19 + imm, 0x18 + imm,
+                   0x17 + imm, 0x16 + imm, 0x15 + imm, 0x14 + imm, 0x13 + imm, 0x12 + imm, 0x11 + imm, 0x10 + imm,
+                   0x0f + imm, 0x0e + imm, 0x0d + imm, 0x0c + imm, 0x0b + imm, 0x0a + imm, 0x09 + imm, 0x08 + imm,
+                   0x07 + imm, 0x06 + imm, 0x05 + imm, 0x04 + imm, 0x03 + imm, 0x02 + imm, 0x01 + imm, 0x00 + imm), b.val));
+#else // split imm into whole 32-bit elements (imm/4) and leftover bytes (imm%4) for the alignr-based helper
+    _v_rotate_right<imm%4!=0, imm%4, (imm/4 > 15), imm/4>::eval(a, b);
+#endif
+}
+template<int imm>
+inline v_int8x64 v_rotate_left(const v_int8x64& a, const v_int8x64& b)
+{   // Mirror of the two-argument v_rotate_right: a moves up by imm bytes and the freed low bytes are taken from the top of b.
+    if (imm == 0) return a;
+    if (imm == 64) return b;
+    if (imm >= 128) return v_int8x64();
+#if CV_AVX_512VBMI // NOTE(review): for 64 < imm < 128 negative indices appear to wrap into a.val here, while the #else path shifts in zeros - confirm
+    return v_int8x64(_mm512_permutex2var_epi8(b.val,
+           _v512_set_epi8(0x7f - imm,0x7e - imm,0x7d - imm,0x7c - imm,0x7b - imm,0x7a - imm,0x79 - imm,0x78 - imm,
+                          0x77 - imm,0x76 - imm,0x75 - imm,0x74 - imm,0x73 - imm,0x72 - imm,0x71 - imm,0x70 - imm,
+                          0x6f - imm,0x6e - imm,0x6d - imm,0x6c - imm,0x6b - imm,0x6a - imm,0x69 - imm,0x68 - imm,
+                          0x67 - imm,0x66 - imm,0x65 - imm,0x64 - imm,0x63 - imm,0x62 - imm,0x61 - imm,0x60 - imm,
+                          0x5f - imm,0x5e - imm,0x5d - imm,0x5c - imm,0x5b - imm,0x5a - imm,0x59 - imm,0x58 - imm,
+                          0x57 - imm,0x56 - imm,0x55 - imm,0x54 - imm,0x53 - imm,0x52 - imm,0x51 - imm,0x50 - imm,
+                          0x4f - imm,0x4e - imm,0x4d - imm,0x4c - imm,0x4b - imm,0x4a - imm,0x49 - imm,0x48 - imm,
+                          0x47 - imm,0x46 - imm,0x45 - imm,0x44 - imm,0x43 - imm,0x42 - imm,0x41 - imm,0x40 - imm), a.val));
+#else // expressed through v_rotate_right with the operands swapped (imm < 64) or with a zero low half (imm >= 64)
+    return imm < 64 ? v_rotate_right<64 - imm>(b, a) : v_rotate_right<128 - imm>(v512_setzero_s8(), b);
+#endif
+}
+template<int imm>
+inline v_int8x64 v_rotate_right(const v_int8x64& a)
+{   // Single-vector form: bytes of a move down by imm positions and the vacated top imm bytes become zero.
+    if (imm == 0) return a;
+    if (imm >= 64) return v_int8x64();
+#if CV_AVX_512VBMI // the zeroing mask keeps only the low 64 - imm result bytes
+    return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF >> imm,
+           _v512_set_epu8(0x3f + imm,0x3e + imm,0x3d + imm,0x3c + imm,0x3b + imm,0x3a + imm,0x39 + imm,0x38 + imm,
+                          0x37 + imm,0x36 + imm,0x35 + imm,0x34 + imm,0x33 + imm,0x32 + imm,0x31 + imm,0x30 + imm,
+                          0x2f + imm,0x2e + imm,0x2d + imm,0x2c + imm,0x2b + imm,0x2a + imm,0x29 + imm,0x28 + imm,
+                          0x27 + imm,0x26 + imm,0x25 + imm,0x24 + imm,0x23 + imm,0x22 + imm,0x21 + imm,0x20 + imm,
+                          0x1f + imm,0x1e + imm,0x1d + imm,0x1c + imm,0x1b + imm,0x1a + imm,0x19 + imm,0x18 + imm,
+                          0x17 + imm,0x16 + imm,0x15 + imm,0x14 + imm,0x13 + imm,0x12 + imm,0x11 + imm,0x10 + imm,
+                          0x0f + imm,0x0e + imm,0x0d + imm,0x0c + imm,0x0b + imm,0x0a + imm,0x09 + imm,0x08 + imm,
+                          0x07 + imm,0x06 + imm,0x05 + imm,0x04 + imm,0x03 + imm,0x02 + imm,0x01 + imm,0x00 + imm), a.val));
+#else // same as the two-argument form with an all-zero second operand
+    return v_rotate_right<imm>(a, v512_setzero_s8());
+#endif
+}
+template<int imm>
+inline v_int8x64 v_rotate_left(const v_int8x64& a)
+{   // Single-vector form: bytes of a move up by imm positions and the vacated low imm bytes become zero.
+    if (imm == 0) return a;
+    if (imm >= 64) return v_int8x64();
+#if CV_AVX_512VBMI // the zeroing mask clears the low imm result bytes
+    return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF << imm,
+           _v512_set_epi8(0x3f - imm,0x3e - imm,0x3d - imm,0x3c - imm,0x3b - imm,0x3a - imm,0x39 - imm,0x38 - imm,
+                          0x37 - imm,0x36 - imm,0x35 - imm,0x34 - imm,0x33 - imm,0x32 - imm,0x31 - imm,0x30 - imm,
+                          0x2f - imm,0x2e - imm,0x2d - imm,0x2c - imm,0x2b - imm,0x2a - imm,0x29 - imm,0x28 - imm,
+                          0x27 - imm,0x26 - imm,0x25 - imm,0x24 - imm,0x23 - imm,0x22 - imm,0x21 - imm,0x20 - imm,
+                          0x1f - imm,0x1e - imm,0x1d - imm,0x1c - imm,0x1b - imm,0x1a - imm,0x19 - imm,0x18 - imm,
+                          0x17 - imm,0x16 - imm,0x15 - imm,0x14 - imm,0x13 - imm,0x12 - imm,0x11 - imm,0x10 - imm,
+                          0x0f - imm,0x0e - imm,0x0d - imm,0x0c - imm,0x0b - imm,0x0a - imm,0x09 - imm,0x08 - imm,
+                          0x07 - imm,0x06 - imm,0x05 - imm,0x04 - imm,0x03 - imm,0x02 - imm,0x01 - imm,0x00 - imm), a.val));
+#else // right-rotate the pair (zero, a) by 64 - imm bytes
+    return v_rotate_right<64 - imm>(v512_setzero_s8(), a);
+#endif
+}
+
+#define OPENCV_HAL_IMPL_AVX512_ROTATE_PM(_Tpvec, suffix) /* scales imm from lanes to bytes and forwards to the v_int8x64 rotations */         \
+template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)                                                            \
+{ return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); }      \
+template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)                                                           \
+{ return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); }     \
+template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a)                                                                             \
+{ return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); }                              \
+template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a)                                                                            \
+{ return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); }
+
+#define OPENCV_HAL_IMPL_AVX512_ROTATE_EC(_Tpvec, suffix) /* lane rotations built from mask compress + expand; imm counts lanes */          \
+template<int imm>                                                                                                                          \
+inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)                                                                              \
+{                                                                                                                                          \
+    enum { SHIFT2 = (_Tpvec::nlanes - imm) };                                                                                              \
+    enum { MASK = ((1 << _Tpvec::nlanes) - 1) };                                                                                           \
+    if (imm == 0) return a;                                                                                                                \
+    if (imm == _Tpvec::nlanes) return b;                                                                                                   \
+    if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero();                                                                                    \
+    return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << SHIFT2)&MASK, b.val), (MASK << (imm))&MASK, a.val)); \
+}                                                                                                                                          \
+template<int imm>                                                                                                                          \
+inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)                                                                             \
+{                                                                                                                                          \
+    enum { SHIFT2 = (_Tpvec::nlanes - imm) };                                                                                              \
+    enum { MASK = ((1 << _Tpvec::nlanes) - 1) };                                                                                           \
+    if (imm == 0) return a;                                                                                                                \
+    if (imm == _Tpvec::nlanes) return b;                                                                                                   \
+    if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero();                                                                                    \
+    return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << (imm))&MASK, a.val), (MASK << SHIFT2)&MASK, b.val)); \
+}                                                                                                                                          \
+template<int imm>                                                                                                                          \
+inline _Tpvec v_rotate_left(const _Tpvec& a)                                                                                               \
+{                                                                                                                                          \
+    if (imm == 0) return a;                                                                                                                \
+    if (imm >= _Tpvec::nlanes) return _Tpvec::zero();                                                                                      \
+    return _Tpvec(_mm512_maskz_expand_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val));                                              \
+}                                                                                                                                          \
+template<int imm>                                                                                                                          \
+inline _Tpvec v_rotate_right(const _Tpvec& a)                                                                                              \
+{                                                                                                                                          \
+    if (imm == 0) return a;                                                                                                                \
+    if (imm >= _Tpvec::nlanes) return _Tpvec::zero();                                                                                      \
+    return _Tpvec(_mm512_maskz_compress_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val));                                            \
+}
+// 8- and 16-bit types go through the byte rotations (ROTATE_PM); 32-/64-bit and floating-point types use compress/expand (ROTATE_EC).
+OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint8x64,   u8)
+OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint16x32,  u16)
+OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_int16x32,   s16)
+OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint32x16,  epi32)
+OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int32x16,   epi32)
+OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint64x8,   epi64)
+OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8,    epi64)
+OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps)
+OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8,  pd)
+
+/** Reverse: v_reverse returns the lanes of its argument in the opposite order **/
+inline v_uint8x64 v_reverse(const v_uint8x64 &a)
+{
+#if CV_AVX_512VBMI
+    static const __m512i byte_idx = _mm512_set_epi32(    // one full-width byte permute: result byte i reads source byte 63 - i
+            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
+            0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f,
+            0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f,
+            0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
+    return v_uint8x64(_mm512_permutexvar_epi8(byte_idx, a.val));
+#else
+    static const __m512i in_lane_rev = _mm512_set_epi32( // step 1: reverse the 16 bytes inside every 128-bit lane
+            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
+            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
+            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
+            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
+    static const __m512i lane_swap = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6); // step 2: reverse the order of the four lanes
+    const __m512i lanes_reversed = _mm512_shuffle_epi8(a.val, in_lane_rev);
+    return v_uint8x64(_mm512_permutexvar_epi64(lane_swap, lanes_reversed));
+#endif
+}
+
+inline v_int8x64 v_reverse(const v_int8x64 &a) // forwards to the v_uint8x64 overload via reinterpretation
+{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
+
+inline v_uint16x32 v_reverse(const v_uint16x32 &a)
+{
+#if CV_AVX_512VBMI
+    static const __m512i word_idx = _mm512_set_epi32(    // one full-width 16-bit permute: result lane i reads source lane 31 - i
+            0x00000001, 0x00020003, 0x00040005, 0x00060007,
+            0x00080009, 0x000a000b, 0x000c000d, 0x000e000f,
+            0x00100011, 0x00120013, 0x00140015, 0x00160017,
+            0x00180019, 0x001a001b, 0x001c001d, 0x001e001f);
+    return v_uint16x32(_mm512_permutexvar_epi16(word_idx, a.val));
+#else
+    static const __m512i in_lane_rev = _mm512_set_epi32( // step 1: reverse the eight 16-bit elements inside every 128-bit lane
+            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
+            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
+            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
+            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
+    static const __m512i lane_swap = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6); // step 2: reverse the order of the four lanes
+    const __m512i lanes_reversed = _mm512_shuffle_epi8(a.val, in_lane_rev);
+    return v_uint16x32(_mm512_permutexvar_epi64(lane_swap, lanes_reversed));
+#endif
+}
+
+inline v_int16x32 v_reverse(const v_int16x32 &a)
+{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
+
+inline v_uint32x16 v_reverse(const v_uint32x16 &a)   // returns a with its 16 32-bit lanes in reverse order
+{
+    static const __m512i dword_rev_idx = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);   // lane 0 reads source lane 15, lane 15 reads source lane 0
+    return v_uint32x16(_mm512_permutexvar_epi32(dword_rev_idx, a.val));
+}
+
+inline v_int32x16 v_reverse(const v_int32x16 &a)   // signed variant: delegates to the unsigned 32-bit reversal
+{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_float32x16 v_reverse(const v_float32x16 &a)   // float variant: lanes are moved as raw 32-bit patterns
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_uint64x8 v_reverse(const v_uint64x8 &a)   // returns a with its eight 64-bit lanes in reverse order
+{
+    static const __m512i qword_rev_idx = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);   // lane 0 reads source lane 7, lane 7 reads source lane 0
+    return v_uint64x8(_mm512_permutexvar_epi64(qword_rev_idx, a.val));
+}
+
+inline v_int64x8 v_reverse(const v_int64x8 &a)   // signed variant: delegates to the unsigned 64-bit reversal
+{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
+
+inline v_float64x8 v_reverse(const v_float64x8 &a)   // double variant: lanes are moved as raw 64-bit patterns
+{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
+
+////////// Reduce /////////
+
+/** Reduce **/
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64(a, b) ((a) + (b)) // scalar add used as the final "scop" step of the sum reductions; parenthesized so the expansion stays one operand
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_8(sctype, func, _Tpvec, ifunc, scop)                                          \
+    inline sctype v_reduce_##func(const _Tpvec& a)                                                                  \
+    { __m256i folded = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); /* 8 -> 4 lanes */      \
+      sctype CV_DECL_ALIGNED(64) lanes[2];                                                                          \
+      _mm_store_si128((__m128i*)lanes, _mm_##ifunc(_mm256_castsi256_si128(folded), _mm256_extracti128_si256(folded, 1))); /* 4 -> 2 lanes */ \
+      return scop(lanes[0], lanes[1]); /* the last pair is combined in scalar code */ }
+OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, min, v_uint64x8, min_epu64, min)
+OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, max, v_uint64x8, max_epu64, max)
+OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, sum, v_uint64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
+OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  min, v_int64x8,  min_epi64, min)
+OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  max, v_int64x8,  max_epi64, max)
+OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  sum, v_int64x8,  add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
+
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_8F(func, ifunc, scop)                                         \
+    inline double v_reduce_##func(const v_float64x8& a)                                             \
+    { __m256d folded = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); /* 8 -> 4 lanes */ \
+      double CV_DECL_ALIGNED(64) lanes[2];                                                          \
+      _mm_store_pd(lanes, _mm_##ifunc(_mm256_castpd256_pd128(folded), _mm256_extractf128_pd(folded, 1))); /* 4 -> 2 lanes */ \
+      return scop(lanes[0], lanes[1]); /* the last pair is combined in scalar code */ }
+OPENCV_HAL_IMPL_AVX512_REDUCE_8F(min, min_pd, min)
+OPENCV_HAL_IMPL_AVX512_REDUCE_8F(max, max_pd, max)
+OPENCV_HAL_IMPL_AVX512_REDUCE_8F(sum, add_pd, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
+
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_16(sctype, func, _Tpvec, ifunc)                                 \
+    inline sctype v_reduce_##func(const _Tpvec& a)                                                    \
+    { __m256i folded = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); /* 16 -> 8 lanes */ \
+      __m128i acc = _mm_##ifunc(_mm256_castsi256_si128(folded), _mm256_extracti128_si256(folded, 1)); /* 8 -> 4 lanes */ \
+      acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 8)); /* combine with the upper 8 bytes */            \
+      acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 4)); /* combine with the neighbouring 32-bit lane */ \
+      return (sctype)_mm_cvtsi128_si32(acc); /* the result sits in the lowest 32-bit lane */ }
+OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, min, v_uint32x16, min_epu32)
+OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, max, v_uint32x16, max_epu32)
+OPENCV_HAL_IMPL_AVX512_REDUCE_16(int,  min, v_int32x16,  min_epi32)
+OPENCV_HAL_IMPL_AVX512_REDUCE_16(int,  max, v_int32x16,  max_epi32)
+
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_16F(func, ifunc)                                            \
+    inline float v_reduce_##func(const v_float32x16& a)                                           \
+    { __m256 folded = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); /* 16 -> 8 lanes */ \
+      __m128 acc = _mm_##ifunc(_mm256_castps256_ps128(folded), _mm256_extractf128_ps(folded, 1)); /* 8 -> 4 lanes */ \
+      acc = _mm_##ifunc(acc, _mm_permute_ps(acc, _MM_SHUFFLE(0, 0, 3, 2))); /* bring lanes 2,3 down to 0,1 */ \
+      acc = _mm_##ifunc(acc, _mm_permute_ps(acc, _MM_SHUFFLE(0, 0, 0, 1))); /* bring lane 1 down to 0 */ \
+      return _mm_cvtss_f32(acc); /* the result sits in the lowest lane */ }
+OPENCV_HAL_IMPL_AVX512_REDUCE_16F(min, min_ps)
+OPENCV_HAL_IMPL_AVX512_REDUCE_16F(max, max_ps)
+
+inline float v_reduce_sum(const v_float32x16& a)   // horizontal sum of the 16 float lanes
+{
+    const __m256 sum8 = _mm256_add_ps(_v512_extract_low(a.val), _v512_extract_high(a.val));
+    __m128 sum4 = _mm_add_ps(_mm256_castps256_ps128(sum8), _mm256_extractf128_ps(sum8, 1));
+    sum4 = _mm_hadd_ps(sum4, sum4);                    // 4 -> 2 partial sums
+    return _mm_cvtss_f32(_mm_hadd_ps(sum4, sum4));     // 2 -> 1, read from the lowest lane
+}
+inline int v_reduce_sum(const v_int32x16& a)   // horizontal sum of the 16 int lanes
+{
+    const __m256i sum8 = _mm256_add_epi32(_v512_extract_low(a.val), _v512_extract_high(a.val));
+    __m128i sum4 = _mm_add_epi32(_mm256_castsi256_si128(sum8), _mm256_extracti128_si256(sum8, 1));
+    sum4 = _mm_hadd_epi32(sum4, sum4);                     // 4 -> 2 partial sums
+    return _mm_cvtsi128_si32(_mm_hadd_epi32(sum4, sum4));  // 2 -> 1, read from the lowest lane
+}
+inline uint v_reduce_sum(const v_uint32x16& a)   // unsigned variant: reuses the signed sum and casts the result back
+{ return (uint)v_reduce_sum(v_reinterpret_as_s32(a)); }
+
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_32(sctype, func, _Tpvec, ifunc)                                 \
+    inline sctype v_reduce_##func(const _Tpvec& a)                                                    \
+    { __m256i folded = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); /* 32 -> 16 lanes */ \
+      __m128i acc = _mm_##ifunc(_mm256_castsi256_si128(folded), _mm256_extracti128_si256(folded, 1)); /* 16 -> 8 lanes */ \
+      acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 8)); /* byte shifts halve the live lanes each step */ \
+      acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 4));                                                 \
+      acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 2));                                                 \
+      return (sctype)_mm_cvtsi128_si32(acc); /* the cast keeps the lowest 16-bit lane */ }
+OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, min, v_uint16x32, min_epu16)
+OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, max, v_uint16x32, max_epu16)
+OPENCV_HAL_IMPL_AVX512_REDUCE_32(short,  min, v_int16x32,  min_epi16)
+OPENCV_HAL_IMPL_AVX512_REDUCE_32(short,  max, v_int16x32,  max_epi16)
+
+inline int v_reduce_sum(const v_int16x32& a)    // sum of the 16-bit lanes: both halves are expanded first, then added and reduced by the wider-lane overload
+{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+inline uint v_reduce_sum(const v_uint16x32& a)  // unsigned counterpart, same expand-then-reduce scheme
+{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc)                                 \
+    inline sctype v_reduce_##func(const _Tpvec& a)                                                    \
+    { __m256i folded = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); /* 64 -> 32 lanes */ \
+      __m128i acc = _mm_##ifunc(_mm256_castsi256_si128(folded), _mm256_extracti128_si256(folded, 1)); /* 32 -> 16 lanes */ \
+      acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 8)); /* byte shifts halve the live lanes each step */ \
+      acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 4));                                                 \
+      acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 2));                                                 \
+      acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 1));                                                 \
+      return (sctype)_mm_cvtsi128_si32(acc); /* the cast keeps the lowest byte */ }
+OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, min, v_uint8x64, min_epu8)
+OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, max, v_uint8x64, max_epu8)
+OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, min, v_int8x64,  min_epi8)
+OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, max, v_int8x64,  max_epi8)
+
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(sctype, _Tpvec, suffix)                                    \
+    inline sctype v_reduce_sum(const _Tpvec& a)                                                         \
+    {   __m512i a16 = _mm512_add_epi16(_mm512_cvt##suffix##_epi16(_v512_extract_low(a.val)),            \
+                                       _mm512_cvt##suffix##_epi16(_v512_extract_high(a.val))); /* widen both 32-byte halves to 16 bit, then add */ \
+        a16 = _mm512_cvtepi16_epi32(_mm256_add_epi16(_v512_extract_low(a16), _v512_extract_high(a16))); /* add the 16-bit halves, then widen to 32 bit */ \
+        __m256i a8 = _mm256_add_epi32(_v512_extract_low(a16), _v512_extract_high(a16)); /* 16 -> 8 lanes */ \
+        __m128i a4 = _mm_add_epi32(_mm256_castsi256_si128(a8), _mm256_extracti128_si256(a8, 1)); /* 8 -> 4 lanes */ \
+        a4 = _mm_hadd_epi32(a4, a4);                                                                    \
+        return (sctype)_mm_cvtsi128_si32(_mm_hadd_epi32(a4, a4)); /* total ends up in the lowest lane */ }
+OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(uint, v_uint8x64, epu8)
+OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(int,  v_int8x64,  epi8)
+
+inline v_float32x16 v_reduce_sum4(const v_float32x16& a, const v_float32x16& b,   // two rounds of pairwise horizontal adds over a, b, c, d
+                                  const v_float32x16& c, const v_float32x16& d)
+{
+    const __m256 ab_lo = _mm256_hadd_ps(_v512_extract_low(a.val), _v512_extract_low(b.val));
+    const __m256 ab_hi = _mm256_hadd_ps(_v512_extract_high(a.val), _v512_extract_high(b.val));
+    const __m256 cd_lo = _mm256_hadd_ps(_v512_extract_low(c.val), _v512_extract_low(d.val));
+    const __m256 cd_hi = _mm256_hadd_ps(_v512_extract_high(c.val), _v512_extract_high(d.val));
+    return v_float32x16(_v512_combine(_mm256_hadd_ps(ab_lo, cd_lo), _mm256_hadd_ps(ab_hi, cd_hi)));   // low halves form the low 256 bits, high halves the high 256 bits
+}
+
+inline unsigned v_reduce_sad(const v_uint8x64& a, const v_uint8x64& b)   // sum of absolute differences over all 64 unsigned bytes
+{
+    const __m512i sad8 = _mm512_sad_epu8(a.val, b.val);   // one partial sum per 64-bit lane
+    const __m256i sad4 = _mm256_add_epi32(_v512_extract_low(sad8), _v512_extract_high(sad8));
+    const __m128i sad2 = _mm_add_epi32(_mm256_castsi256_si128(sad4), _mm256_extracti128_si256(sad4, 1));
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(sad2, _mm_unpackhi_epi64(sad2, sad2)));
+}
+inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b)   // sum of absolute differences over all 64 signed bytes
+{
+    const __m512i bias = _mm512_set1_epi8(-128);   // wrap-around add of 0x80 flips the sign bit of both inputs, so the unsigned SAD can be reused
+    const __m512i sad8 = _mm512_sad_epu8(_mm512_add_epi8(a.val, bias), _mm512_add_epi8(b.val, bias));
+    const __m256i sad4 = _mm256_add_epi32(_v512_extract_low(sad8), _v512_extract_high(sad8));
+    const __m128i sad2 = _mm_add_epi32(_mm256_castsi256_si128(sad4), _mm256_extracti128_si256(sad4, 1));
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(sad2, _mm_unpackhi_epi64(sad2, sad2)));
+}
+inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b)   // NOTE(review): presumably relies on operator- saturating for unsigned 16-bit vectors (defined elsewhere) — confirm
+{ return v_reduce_sum(v_add_wrap(a - b, b - a)); }
+inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b)     // max - min with wrap-around subtraction, summed as unsigned 16-bit lanes
+{ return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); }
+inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b)   // max - min per lane, then horizontal sum
+{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); }
+inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b)     // max - min per lane, reinterpreted as unsigned before summing
+{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); }
+inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b)    // masks off the sign bit of (a - b) to get |a - b|, then sums
+{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); }
+inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b)     // same sign-bit mask, 64-bit lanes
+{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); }
+
+/** Popcount **/
+inline v_uint8x64 v_popcount(const v_int8x64& a)   // per-byte population count (number of set bits in each of the 64 bytes)
+{
+#if CV_AVX_512BITALG
+    return v_uint8x64(_mm512_popcnt_epi8(a.val));
+#elif CV_AVX_512VBMI   // 7-bit table lookup: table0 covers byte values 0..63, table1 covers 64..127 (entries listed from the highest index down); bit 7 is added separately
+    __m512i _popcnt_table0 = _v512_set_epu8(6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,   // entries 63..48: popcount(63) == 6, popcount(48) == 2
+                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
+                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
+                                            4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
+    __m512i _popcnt_table1 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,   // entries 127..112: popcount(127) == 7, popcount(112) == 3
+                                            6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
+                                            6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
+                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1);
+    return v_uint8x64(_mm512_sub_epi8(_mm512_permutex2var_epi8(_popcnt_table0, a.val, _popcnt_table1), _mm512_movm_epi8(_mm512_movepi8_mask(a.val))));   // subtracting the 0xFF sign mask adds 1 for bytes whose top bit is set
+#else
+    __m512i _popcnt_table = _mm512_set4_epi32(0x04030302, 0x03020201, 0x03020201, 0x02010100);   // popcount of every 4-bit value, repeated in each 128-bit lane
+    __m512i _popcnt_mask = _mm512_set1_epi8(0x0F);
+
+    return v_uint8x64(_mm512_add_epi8(_mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(                  a.val,     _popcnt_mask)),   // count for the low nibble ...
+                                      _mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(_mm512_srli_epi16(a.val, 4), _popcnt_mask))));  // ... plus count for the high nibble
+#endif
+}
+inline v_uint16x32 v_popcount(const v_int16x32& a)   // per-lane population count for 16-bit lanes
+{
+#if CV_AVX_512BITALG
+    return v_uint16x32(_mm512_popcnt_epi16(a.val));
+#elif CV_AVX_512VPOPCNTDQ
+    const __m512i zeros = _mm512_setzero_si512();   // interleaving with zeros widens each 16-bit lane to 32 bits
+    return v_uint16x32(_mm512_packs_epi32(_mm512_popcnt_epi32(_mm512_unpacklo_epi16(a.val, zeros)),
+                                          _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zeros))));
+#else
+    v_uint8x64 byte_counts = v_popcount(v_reinterpret_as_s8(a));
+    byte_counts += v_rotate_right<1>(byte_counts);   // low byte of each lane now also holds the count of its high byte
+    return v_reinterpret_as_u16(byte_counts) & v512_setall_u16(0x00ff);
+#endif
+}
+inline v_uint32x16 v_popcount(const v_int32x16& a)   // per-lane population count for 32-bit lanes
+{
+#if CV_AVX_512VPOPCNTDQ
+    return v_uint32x16(_mm512_popcnt_epi32(a.val));
+#else
+    v_uint8x64 byte_counts = v_popcount(v_reinterpret_as_s8(a));
+    byte_counts += v_rotate_right<1>(byte_counts);   // fold neighbouring byte counts ...
+    byte_counts += v_rotate_right<2>(byte_counts);   // ... so the lowest byte of each lane holds the sum of all four
+    return v_reinterpret_as_u32(byte_counts) & v512_setall_u32(0x000000ff);
+#endif
+}
+inline v_uint64x8 v_popcount(const v_int64x8& a)   // per-lane population count for 64-bit lanes
+{
+#if CV_AVX_512VPOPCNTDQ
+    return v_uint64x8(_mm512_popcnt_epi64(a.val));
+#else
+    return v_uint64x8(_mm512_sad_epu8(v_popcount(v_reinterpret_as_s8(a)).val, _mm512_setzero_si512()));   // SAD against zero adds up the eight byte counts inside every 64-bit lane
+#endif
+}
+
+
+inline v_uint8x64  v_popcount(const v_uint8x64&  a) { return v_popcount(v_reinterpret_as_s8 (a)); }   // unsigned overloads forward to the signed implementations of the same lane width
+inline v_uint16x32 v_popcount(const v_uint16x32& a) { return v_popcount(v_reinterpret_as_s16(a)); }
+inline v_uint32x16 v_popcount(const v_uint32x16& a) { return v_popcount(v_reinterpret_as_s32(a)); }
+inline v_uint64x8  v_popcount(const v_uint64x8&  a) { return v_popcount(v_reinterpret_as_s64(a)); }
+
+
+////////// Other math /////////
+
+/** Some frequent operations **/
+#if CV_FMA3   // with FMA3 both v_fma and v_muladd map to one fused multiply-add intrinsic
+#define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix)                         \
+    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)    \
+    { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); }            \
+    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+    { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); }
+#else   // without FMA3 the same a * b + c is computed as a separate multiply followed by an add
+#define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix)                                 \
+    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)            \
+    { return _Tpvec(_mm512_add_##suffix(_mm512_mul_##suffix(a.val, b.val), c.val)); } \
+    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)         \
+    { return _Tpvec(_mm512_add_##suffix(_mm512_mul_##suffix(a.val, b.val), c.val)); }
+#endif
+
+#define OPENCV_HAL_IMPL_AVX512_MISC(_Tpvec, suffix)                           \
+    inline _Tpvec v_sqrt(const _Tpvec& x)                                     \
+    { return _Tpvec(_mm512_sqrt_##suffix(x.val)); }                           \
+    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b)           \
+    { return v_fma(a, a, b * b); } /* a*a + b*b */                            \
+    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)               \
+    { return v_sqrt(v_fma(a, a, b * b)); } /* sqrt(a*a + b*b) */
+
+OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps)   // v_fma / v_muladd for both floating-point vector types
+OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8,  pd)
+OPENCV_HAL_IMPL_AVX512_MISC(v_float32x16, ps)     // v_sqrt / v_sqr_magnitude / v_magnitude; these call v_fma, so they follow the MULADD instantiations
+OPENCV_HAL_IMPL_AVX512_MISC(v_float64x8,  pd)
+
+inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)   // integer a * b + c built from the vector operators
+{ return a * b + c; }
+inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)   // alias of v_fma for 32-bit integers
+{ return v_fma(a, b, c); }
+
+inline v_float32x16 v_invsqrt(const v_float32x16& x)   // approximate 1 / sqrt(x) per float lane
+{
+#if CV_AVX_512ER
+    return v_float32x16(_mm512_rsqrt28_ps(x.val));
+#else
+    const v_float32x16 half_x = x * v512_setall_f32(0.5);
+    v_float32x16 estimate  = v_float32x16(_mm512_rsqrt14_ps(x.val));   // initial hardware estimate
+    estimate *= v512_setall_f32(1.5) - ((estimate * estimate) * half_x);   // refinement: e * (1.5 - 0.5 * x * e * e)
+    return estimate;
+#endif
+}
+
+inline v_float64x8 v_invsqrt(const v_float64x8& x)   // 1 / sqrt(x) per double lane
+{
+#if CV_AVX_512ER
+    return v_float64x8(_mm512_rsqrt28_pd(x.val));
+#else
+    return v512_setall_f64(1.) / v_sqrt(x);   // plain division by the square root; the rsqrt14-based variant below is kept disabled
+//    v_float64x8 half = x * v512_setall_f64(0.5);
+//    v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val));
+//    t *= v512_setall_f64(1.5) - ((t * t) * half);
+//    t *= v512_setall_f64(1.5) - ((t * t) * half);
+//    return t;
+#endif
+}
+
+/** Absolute values **/
+#define OPENCV_HAL_IMPL_AVX512_ABS(_Tpvec, _Tpuvec, suffix) /* v_abs for signed integer vectors; the result has the unsigned type of the same lane width */ \
+    inline _Tpuvec v_abs(const _Tpvec& x)                   \
+    { return _Tpuvec(_mm512_abs_##suffix(x.val)); }
+
+OPENCV_HAL_IMPL_AVX512_ABS(v_int8x64,    v_uint8x64,    epi8)
+OPENCV_HAL_IMPL_AVX512_ABS(v_int16x32,   v_uint16x32,  epi16)
+OPENCV_HAL_IMPL_AVX512_ABS(v_int32x16,   v_uint32x16,  epi32)
+OPENCV_HAL_IMPL_AVX512_ABS(v_int64x8,    v_uint64x8,   epi64)
+
+inline v_float32x16 v_abs(const v_float32x16& x)   // per-lane absolute value of 16 floats
+{
+#ifdef _mm512_abs_ps   // guard on the intrinsic this branch actually calls; when it is not a macro the bit-mask fallback below is used
+    return v_float32x16(_mm512_abs_ps(x.val));
+#else
+    return v_float32x16(_mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x.val),   // clear the sign bit of every 32-bit lane
+                        _v512_set_epu64(0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF,
+                                        0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF))));
+#endif
+}
+
+inline v_float64x8 v_abs(const v_float64x8& x)   // per-lane absolute value of 8 doubles
+{
+#ifdef _mm512_abs_pd   // the intrinsic path is only taken when the compiler exposes _mm512_abs_pd as a macro
+    #if defined __GNUC__ && (__GNUC__ < 7 || (__GNUC__ == 7 && __GNUC_MINOR__ <= 3) || (__GNUC__ == 8 && __GNUC_MINOR__ <= 2))
+        // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87476
+        return v_float64x8(_mm512_abs_pd(_mm512_castpd_ps(x.val)));
+    #else
+        return v_float64x8(_mm512_abs_pd(x.val));
+    #endif
+#else
+    return v_float64x8(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(x.val),   // clear the sign bit of every 64-bit lane
+                       _v512_set_epu64(0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF,
+                                       0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF))));
+#endif
+}
+
+/** Absolute difference **/
+inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b)      // NOTE(review): presumably relies on operator- saturating for unsigned 8-bit vectors (defined elsewhere) — confirm
+{ return v_add_wrap(a - b,  b - a); }
+inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b)   // same scheme as the 8-bit overload
+{ return v_add_wrap(a - b,  b - a); }
+inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b)   // |a - b| computed as max - min
+{ return v_max(a, b) - v_min(a, b); }
+
+inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b)   // |a - b| for signed bytes, returned as unsigned
+{
+    const v_int8x64 diff = v_sub_wrap(a, b);
+    const v_int8x64 neg_mask = a < b;   // comparison result used as a per-lane mask
+    return v_reinterpret_as_u8(v_sub_wrap(diff ^ neg_mask, neg_mask));   // (diff ^ mask) - mask negates the lanes selected by the mask
+}
+
+inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)   // |a - b| as max - min with wrap-around subtraction
+{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
+
+inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b)   // |a - b| for signed 32-bit lanes, returned as unsigned
+{
+    const v_int32x16 diff = a - b;
+    const v_int32x16 neg_mask = a < b;   // comparison result used as a per-lane mask
+    return v_reinterpret_as_u32((diff ^ neg_mask) - neg_mask);   // negates the lanes selected by the mask
+}
+
+inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b)   // |a - b| per float lane
+{ return v_abs(a - b); }
+
+inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b)   // |a - b| per double lane
+{ return v_abs(a - b); }
+
+/** Saturating absolute difference **/
+inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b)   // absolute difference that stays in the signed 8-bit type
+{
+    const v_int8x64 diff = a - b;
+    const v_int8x64 neg_mask = a < b;   // comparison result used as a per-lane mask
+    return (diff ^ neg_mask) - neg_mask;   // negates the lanes selected by the mask
+}
+inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b)   // signed 16-bit variant computed as max - min
+{ return v_max(a, b) - v_min(a, b); }
+
+////////// Conversions /////////
+
+/** Rounding **/
+inline v_int32x16 v_round(const v_float32x16& a)   // float -> int32 using the conversion intrinsic's current rounding mode
+{ return v_int32x16(_mm512_cvtps_epi32(a.val)); }
+
+inline v_int32x16 v_round(const v_float64x8& a)   // 8 doubles -> 8 int32 in the low half; the cast leaves the upper 256 bits unspecified
+{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(a.val))); }
+
+inline v_int32x16 v_round(const v_float64x8& a, const v_float64x8& b)   // a fills the low eight lanes, b the high eight
+{ return v_int32x16(_v512_combine(_mm512_cvtpd_epi32(a.val), _mm512_cvtpd_epi32(b.val))); }
+
+inline v_int32x16 v_trunc(const v_float32x16& a)   // float -> int32 with truncation
+{ return v_int32x16(_mm512_cvttps_epi32(a.val)); }
+
+inline v_int32x16 v_trunc(const v_float64x8& a)   // 8 doubles -> 8 int32 in the low half with truncation; upper 256 bits unspecified
+{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvttpd_epi32(a.val))); }
+
+#if CVT_ROUND_MODES_IMPLEMENTED   // variant using conversions that take an explicit rounding mode
+inline v_int32x16 v_floor(const v_float32x16& a)
+{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); }
+
+inline v_int32x16 v_floor(const v_float64x8& a)
+{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); }
+
+inline v_int32x16 v_ceil(const v_float32x16& a)
+{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); }
+
+inline v_int32x16 v_ceil(const v_float64x8& a)
+{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); }
+#else   // fallback: round in floating point first (roundscale imm 1 = toward -inf, imm 2 = toward +inf), then convert
+inline v_int32x16 v_floor(const v_float32x16& a)
+{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 1))); }
+
+inline v_int32x16 v_floor(const v_float64x8& a)   // result occupies the low eight lanes
+{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 1)))); }
+
+inline v_int32x16 v_ceil(const v_float32x16& a)
+{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 2))); }
+
+inline v_int32x16 v_ceil(const v_float64x8& a)   // result occupies the low eight lanes
+{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 2)))); }
+#endif
+
+/** To float **/
+inline v_float32x16 v_cvt_f32(const v_int32x16& a)   // int32 -> float, lane by lane
+{ return v_float32x16(_mm512_cvtepi32_ps(a.val)); }
+
+inline v_float32x16 v_cvt_f32(const v_float64x8& a)   // 8 doubles -> floats placed in the lower half of the result
+{ return v_float32x16(_mm512_cvtpd_pslo(a.val)); }
+
+inline v_float32x16 v_cvt_f32(const v_float64x8& a, const v_float64x8& b)   // a fills the low eight lanes, b the high eight
+{ return v_float32x16(_v512_combine(_mm512_cvtpd_ps(a.val), _mm512_cvtpd_ps(b.val))); }
+
+inline v_float64x8 v_cvt_f64(const v_int32x16& a)   // converts the low eight int32 lanes
+{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_low(a.val))); }
+
+inline v_float64x8 v_cvt_f64_high(const v_int32x16& a)   // converts the high eight int32 lanes
+{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_high(a.val))); }
+
+inline v_float64x8 v_cvt_f64(const v_float32x16& a)   // converts the low eight float lanes
+{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_low(a.val))); }
+
+inline v_float64x8 v_cvt_f64_high(const v_float32x16& a)   // converts the high eight float lanes
+{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); }
+
+// Signed int64 -> double conversion; the fallback is from (Mysticial and wim) https://stackoverflow.com/q/41144668
+inline v_float64x8 v_cvt_f64(const v_int64x8& v)
+{
+#if CV_AVX_512DQ
+    return v_float64x8(_mm512_cvtepi64_pd(v.val));
+#else
+    // constants encoded as floating-point bit patterns
+    __m512i magic_i_lo   = _mm512_set1_epi64(0x4330000000000000); // 2^52
+    __m512i magic_i_hi32 = _mm512_set1_epi64(0x4530000080000000); // 2^84 + 2^63
+    __m512i magic_i_all  = _mm512_set1_epi64(0x4530000080100000); // 2^84 + 2^63 + 2^52
+    __m512d magic_d_all  = _mm512_castsi512_pd(magic_i_all);
+
+    // Blend the 32 least significant bits of v with magic_i_lo (mask 0x5555 selects the even 32-bit lanes from v)
+    __m512i v_lo         = _mm512_mask_blend_epi32(0x5555, magic_i_lo, v.val);
+    // Extract the 32 most significant bits of v
+    __m512i v_hi         = _mm512_srli_epi64(v.val, 32);
+    // Flip the msb of v_hi and blend with 0x45300000
+            v_hi         = _mm512_xor_si512(v_hi, magic_i_hi32);
+    // Compute in double precision
+    __m512d v_hi_dbl     = _mm512_sub_pd(_mm512_castsi512_pd(v_hi), magic_d_all);
+    // (v_hi - magic_d_all) + v_lo  Do not assume associativity of floating point addition
+    __m512d result       = _mm512_add_pd(v_hi_dbl, _mm512_castsi512_pd(v_lo));
+    return v_float64x8(result);
+#endif
+}
+
+////////////// Lookup table access ////////////////////
+
+inline v_int8x64 v512_lut(const schar* tab, const int* idx)   // 64 byte lookups tab[idx[i]]; every gather reads a 32-bit word at byte offset idx[i] and only its low byte is kept
+{
+    const __m128i q0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 1));
+    const __m128i q1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
+    const __m128i q2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 2), (const int *)tab, 1));
+    const __m128i q3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 3), (const int *)tab, 1));
+    return v_int8x64(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(q0), q1, 1), q2, 2), q3, 3));   // q0..q3 become 128-bit lanes 0..3
+}
+inline v_int8x64 v512_lut_pairs(const schar* tab, const int* idx)   // 32 indices; the low two bytes of each gathered word are kept
+{
+    const __m256i lo_half = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 1));
+    const __m256i hi_half = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
+    return v_int8x64(_v512_combine(lo_half, hi_half));
+}
+inline v_int8x64 v512_lut_quads(const schar* tab, const int* idx)   // 16 indices; each gathered 32-bit word supplies four consecutive bytes
+{
+    return v_int8x64(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 1));
+}
+inline v_uint8x64 v512_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut((const schar *)tab, idx)); }   // unsigned overloads forward to the signed ones
+inline v_uint8x64 v512_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_pairs((const schar *)tab, idx)); }
+inline v_uint8x64 v512_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_quads((const schar *)tab, idx)); }
+
+inline v_int16x32 v512_lut(const short* tab, const int* idx)   // 32 lookups tab[idx[i]]; each gather reads a 32-bit word (scale 2) and only its low 16 bits are kept
+{
+    const __m256i lo_half = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 2));
+    const __m256i hi_half = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 2));
+    return v_int16x32(_v512_combine(lo_half, hi_half));
+}
+inline v_int16x32 v512_lut_pairs(const short* tab, const int* idx)   // 16 indices; each gathered 32-bit word supplies two consecutive shorts
+{
+    return v_int16x32(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 2));
+}
+inline v_int16x32 v512_lut_quads(const short* tab, const int* idx)   // 8 indices; each gathered 64-bit word supplies four consecutive shorts
+{
+#if defined(__GNUC__)   // the two branches differ only in the pointer type the gather's base address is cast to
+    return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 2));
+#else
+    return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 2));
+#endif
+}
+inline v_uint16x32 v512_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut((const short *)tab, idx)); }   // unsigned overloads forward to the signed ones
+inline v_uint16x32 v512_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_pairs((const short *)tab, idx)); }
+inline v_uint16x32 v512_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_quads((const short *)tab, idx)); }
+
+inline v_int32x16 v512_lut(const int* tab, const int* idx)   // 16 lookups tab[idx[i]] via one 32-bit gather
+{
+    return v_int32x16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
+}
+inline v_int32x16 v512_lut_pairs(const int* tab, const int* idx)   // 8 indices; each gathered 64-bit word supplies two consecutive ints
+{
+#if defined(__GNUC__)   // the two branches differ only in the pointer type the gather's base address is cast to
+    return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 4));
+#else
+    return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 4));
+#endif
+}
+inline v_int32x16 v512_lut_quads(const int* tab, const int* idx)   // 4 indices; four unaligned 128-bit loads at tab + idx[k] fill lanes 0..3
+{
+    return v_int32x16(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
+                          _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
+                          _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
+                          _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
+                          _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
+}
+inline v_uint32x16 v512_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut((const int *)tab, idx)); }   // unsigned overloads forward to the signed ones
+inline v_uint32x16 v512_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_pairs((const int *)tab, idx)); }
+inline v_uint32x16 v512_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_quads((const int *)tab, idx)); }
+
+inline v_int64x8 v512_lut(const int64* tab, const int* idx)   // 8 lookups tab[idx[i]] via one 64-bit gather
+{
+#if defined(__GNUC__)   // the two branches differ only in the pointer type passed as the gather's base address
+    return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 8));
+#else
+    return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), tab , 8));
+#endif
+}
+inline v_int64x8 v512_lut_pairs(const int64* tab, const int* idx)   // 4 indices; four unaligned 128-bit loads at tab + idx[k] fill lanes 0..3
+{
+    return v_int64x8(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
+                         _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
+                         _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
+                         _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
+                         _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
+}
+inline v_uint64x8 v512_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut((const int64 *)tab, idx)); }   // unsigned overloads forward to the signed ones
+inline v_uint64x8 v512_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut_pairs((const int64 *)tab, idx)); }
+
+inline v_float32x16 v512_lut(const float* tab, const int* idx)   // 16 lookups tab[idx[i]] via one float gather
+{
+    return v_float32x16(_mm512_i32gather_ps(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
+}
+inline v_float32x16 v512_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_pairs((const int *)tab, idx)); }   // pairs/quads reuse the int overloads on the raw 32-bit patterns
+inline v_float32x16 v512_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_quads((const int *)tab, idx)); }
+
+inline v_float64x8 v512_lut(const double* tab, const int* idx)   // 8 lookups tab[idx[i]] via one double gather
+{
+    return v_float64x8(_mm512_i32gather_pd(_mm256_loadu_si256((const __m256i*)idx), tab, 8));
+}
+inline v_float64x8 v512_lut_pairs(const double* tab, const int* idx)   // 4 indices; four unaligned 128-bit loads at tab + idx[k] fill lanes 0..3
+{
+        return v_float64x8(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_castpd128_pd512(
+                               _mm_loadu_pd(tab + idx[0])),
+                               _mm_loadu_pd(tab + idx[1]), 1),
+                               _mm_loadu_pd(tab + idx[2]), 2),
+                               _mm_loadu_pd(tab + idx[3]), 3));
+}
+
+inline v_int32x16 v_lut(const int* tab, const v_int32x16& idxvec)   // gathers tab[idxvec[i]] for all 16 lanes
+{
+    return v_int32x16(_mm512_i32gather_epi32(idxvec.val, tab, 4));
+}
+
+inline v_uint32x16 v_lut(const unsigned* tab, const v_int32x16& idxvec)   // unsigned overload forwards to the int gather
+{
+    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
+}
+
+inline v_float32x16 v_lut(const float* tab, const v_int32x16& idxvec)   // gathers tab[idxvec[i]] for all 16 lanes
+{
+    return v_float32x16(_mm512_i32gather_ps(idxvec.val, tab, 4));
+}
+
+inline v_float64x8 v_lut(const double* tab, const v_int32x16& idxvec)   // only the low eight indices of idxvec are used
+{
+    return v_float64x8(_mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8));
+}
+
+inline void v_lut_deinterleave(const float* tab, const v_int32x16& idxvec, v_float32x16& x, v_float32x16& y) {
+    const __m512i offsets = idxvec.val;                // one element index per output lane
+    x.val = _mm512_i32gather_ps(offsets, tab, 4);      // x[k] = tab[idx[k]]
+    y.val = _mm512_i32gather_ps(offsets, tab + 1, 4);  // y[k] = tab[idx[k] + 1]
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x16& idxvec, v_float64x8& x, v_float64x8& y) {
+    const __m256i offsets = _v512_extract_low(idxvec.val); // both gathers share the same 8 indices
+    x.val = _mm512_i32gather_pd(offsets, tab, 8);          // x[k] = tab[idx[k]]
+    y.val = _mm512_i32gather_pd(offsets, tab + 1, 8);      // y[k] = tab[idx[k] + 1]
+}
+
+inline v_int8x64 v_interleave_pairs(const v_int8x64& vec) // per group of 4 bytes: (b0, b1, b2, b3) -> (b0, b2, b1, b3)
+{
+    return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0d0e0c, 0x0b090a08, 0x07050604, 0x03010200))); // 16-byte shuffle control, repeated for every 128-bit lane
+}
+inline v_uint8x64 v_interleave_pairs(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } // unsigned variant forwards to the signed one
+inline v_int8x64 v_interleave_quads(const v_int8x64& vec) // per group of 8 bytes: (b0..b7) -> (b0, b4, b1, b5, b2, b6, b3, b7)
+{
+    return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0b0e0a, 0x0d090c08, 0x07030602, 0x05010400))); // same control for every 128-bit lane
+}
+inline v_uint8x64 v_interleave_quads(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } // unsigned variant forwards to the signed one
+
+inline v_int16x32 v_interleave_pairs(const v_int16x32& vec) // per group of 4 elements: (e0, e1, e2, e3) -> (e0, e2, e1, e3)
+{
+    return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100))); // byte control moves whole 16-bit elements (byte pairs)
+}
+inline v_uint16x32 v_interleave_pairs(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } // unsigned variant forwards to the signed one
+inline v_int16x32 v_interleave_quads(const v_int16x32& vec) // per 128-bit lane: (e0..e7) -> (e0, e4, e1, e5, e2, e6, e3, e7)
+{
+    return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0706, 0x0d0c0504, 0x0b0a0302, 0x09080100))); // same control for every 128-bit lane
+}
+inline v_uint16x32 v_interleave_quads(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } // unsigned variant forwards to the signed one
+
+inline v_int32x16 v_interleave_pairs(const v_int32x16& vec)
+{   // Per 128-bit lane: (x0, x1, x2, x3) -> (x0, x2, x1, x3), the same element order the 8- and 16-bit overloads produce.
+    return v_int32x16(_mm512_shuffle_epi32(vec.val, _MM_PERM_DBCA)); // 0xD8 == _MM_SHUFFLE(3, 1, 2, 0); _MM_PERM_ACBD (0x27) swapped x0 and x3 instead
+}
+inline v_uint32x16 v_interleave_pairs(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // unsigned variant forwards to the s32 overload
+inline v_float32x16 v_interleave_pairs(const v_float32x16& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // float variant: lanes are moved as 32-bit words by the s32 overload
+
+inline v_int8x64 v_pack_triplets(const v_int8x64& vec) // keeps bytes 0,1,2 of every group of 4 and packs them together; the tail of the result is filler
+{
+    return v_int8x64(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a, // step 2: dword indices 0,1,2,4,5,6,8,9,10,12,13,14, then 15 as filler
+                                                              0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), // (reading assumes _v512_set_epu64 lists elements high to low - TODO confirm)
+                                              _mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0xffffff0f, 0x0e0d0c0a, 0x09080605, 0x04020100)))); // step 1: per 128-bit lane, squeeze 12 kept bytes into the low 3 dwords
+}
+inline v_uint8x64 v_pack_triplets(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } // unsigned variant forwards to the signed one
+
+inline v_int16x32 v_pack_triplets(const v_int16x32& vec) // keeps elements 0,1,2 of every group of 4 and packs them together; the tail of the result is filler
+{
+    return v_int16x32(_mm512_permutexvar_epi16(_v512_set_epu64(0x001f001f001f001f, 0x001f001f001f001f, 0x001e001d001c001a, 0x0019001800160015, // 16-bit source indices; 0x1f (element 31) fills the unused tail
+                                                               0x0014001200110010, 0x000e000d000c000a, 0x0009000800060005, 0x0004000200010000), vec.val)); // low end: 0,1,2,4,5,6,8,9,10,... (every 4th index skipped)
+}
+inline v_uint16x32 v_pack_triplets(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } // unsigned variant forwards to the signed one
+
+inline v_int32x16 v_pack_triplets(const v_int32x16& vec) // keeps elements 0,1,2 of every group of 4 and packs them together; the tail of the result is filler
+{
+    return v_int32x16(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a, // dword source indices; 15 fills the unused tail
+                                                               0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val)); // low end: 0,1,2,4,5,6,8,9,10,12,13,14
+}
+inline v_uint32x16 v_pack_triplets(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); } // unsigned variant forwards to the signed one
+inline v_float32x16 v_pack_triplets(const v_float32x16& vec) // float variant: same index table, applied with the ps permute
+{
+    return v_float32x16(_mm512_permutexvar_ps(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
+                                                              0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
+}
+
+////////// Matrix operations /////////
+
+//////// Dot Product ////////
+
+// 16 >> 32
+inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b) // result[k] = a[2k]*b[2k] + a[2k+1]*b[2k+1] as 32-bit lanes
+{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); }
+inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) // accumulating overload: pairwise dot product plus c
+{ return v_dotprod(a, b) + c; }
+
+// 32 >> 64
+inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
+{   // _mm512_mul_epi32 multiplies the low signed dword of every qword, so the odd dwords are shifted down first.
+    const __m512i aOdd = _mm512_srli_epi64(a.val, 32), bOdd = _mm512_srli_epi64(b.val, 32);
+    const __m512i pairSum = _mm512_add_epi64(_mm512_mul_epi32(a.val, b.val), _mm512_mul_epi32(aOdd, bOdd));
+    return v_int64x8(pairSum);
+}
+inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
+{ const v_int64x8 ab = v_dotprod(a, b); return ab + c; }
+
+// 8 >> 32
+inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
+{
+    // Split every 16-bit lane into its low byte and its high byte, each zero-extended to 16 bits.
+    const __m512i zero = _mm512_setzero_si512();
+    const __m512i aLow = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, zero);
+    const __m512i bLow = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, zero);
+    const __m512i aHigh = _mm512_srli_epi16(a.val, 8);
+    const __m512i bHigh = _mm512_srli_epi16(b.val, 8);
+    // madd_epi16 multiplies 16-bit lanes and adds adjacent products into 32-bit lanes.
+    const __m512i sumLow = _mm512_madd_epi16(aLow, bLow), sumHigh = _mm512_madd_epi16(aHigh, bHigh);
+    return v_uint32x16(_mm512_add_epi32(sumLow, sumHigh));
+}
+inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
+{ const v_uint32x16 ab = v_dotprod_expand(a, b); return ab + c; }
+
+inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
+{
+    // High byte of each 16-bit lane: arithmetic shift right by 8 sign-extends it in place.
+    const __m512i aHigh = _mm512_srai_epi16(a.val, 8);
+    const __m512i bHigh = _mm512_srai_epi16(b.val, 8);
+    // Low byte: move it into the high byte position with a one-byte lane shift, then sign-extend the same way.
+    const __m512i aLow = _mm512_srai_epi16(_mm512_bslli_epi128(a.val, 1), 8);
+    const __m512i bLow = _mm512_srai_epi16(_mm512_bslli_epi128(b.val, 1), 8);
+    // madd_epi16 multiplies 16-bit lanes and adds adjacent products into 32-bit lanes.
+    const __m512i sumLow = _mm512_madd_epi16(aLow, bLow), sumHigh = _mm512_madd_epi16(aHigh, bHigh);
+    return v_int32x16(_mm512_add_epi32(sumLow, sumHigh));
+}
+inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
+{ const v_int32x16 ab = v_dotprod_expand(a, b); return ab + c; }
+
+// 16 >> 64
+inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b) // each 64-bit result lane = sum of 4 adjacent unsigned 16x16 products
+{
+    __m512i mullo = _mm512_mullo_epi16(a.val, b.val); // low 16 bits of every product
+    __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val); // high 16 bits of every (unsigned) product
+    __m512i mul0  = _mm512_unpacklo_epi16(mullo, mulhi); // full 32-bit products of elements 0..3 of each 128-bit lane
+    __m512i mul1  = _mm512_unpackhi_epi16(mullo, mulhi); // full 32-bit products of elements 4..7 of each 128-bit lane
+
+    __m512i p02   = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512()); // even dwords of mul0, zero-extended to 64 bits
+    __m512i p13   = _mm512_srli_epi64(mul0, 32);                                   // odd dwords of mul0, zero-extended to 64 bits
+    __m512i p46   = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512()); // even dwords of mul1
+    __m512i p57   = _mm512_srli_epi64(mul1, 32);                                   // odd dwords of mul1
+
+    __m512i p15_  = _mm512_add_epi64(p02, p13); // per lane: (p0+p1, p2+p3)
+    __m512i p9d_  = _mm512_add_epi64(p46, p57); // per lane: (p4+p5, p6+p7)
+
+    return v_uint64x8(_mm512_add_epi64(
+        _mm512_unpacklo_epi64(p15_, p9d_), // per lane: (p0+p1, p4+p5)
+        _mm512_unpackhi_epi64(p15_, p9d_)  // per lane: (p2+p3, p6+p7)
+    ));
+}
+inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) // accumulating overload
+{ return v_dotprod_expand(a, b) + c; }
+
+inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
+{
+    const __m512i pairSums = _mm512_madd_epi16(a.val, b.val);                       // 32-bit sums of adjacent 16-bit products
+    const __m512i oddWide  = _mm512_srai_epi64(pairSums, 32);                       // odd dwords, sign-extended to 64 bits
+    const __m512i evenWide = _mm512_srai_epi64(_mm512_bslli_epi128(pairSums, 4), 32); // even dwords moved up 4 bytes, then sign-extended
+    return v_int64x8(_mm512_add_epi64(evenWide, oddWide));
+}
+inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
+{ const v_int64x8 ab = v_dotprod_expand(a, b); return ab + c; }
+
+// 32 >> 64f
+inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b) // 64-bit integer pairwise dot product, converted to double by v_cvt_f64
+{ return v_cvt_f64(v_dotprod(a, b)); }
+inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) // accumulating overload: the addition of c is done in double
+{ return v_dotprod_expand(a, b) + c; }
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32
+inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b) // no separate fast path here: forwards to v_dotprod
+{ return v_dotprod(a, b); }
+inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) // forwards to the accumulating v_dotprod
+{ return v_dotprod(a, b, c); }
+
+// 32 >> 64
+inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b) // forwards to v_dotprod
+{ return v_dotprod(a, b); }
+inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c) // forwards to the accumulating v_dotprod
+{ return v_dotprod(a, b, c); }
+
+// 8 >> 32
+inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b) // forwards to v_dotprod_expand
+{ return v_dotprod_expand(a, b); }
+inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c) // forwards to the accumulating v_dotprod_expand
+{ return v_dotprod_expand(a, b, c); }
+
+inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b) // forwards to v_dotprod_expand
+{ return v_dotprod_expand(a, b); }
+inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c) // forwards to the accumulating v_dotprod_expand
+{ return v_dotprod_expand(a, b, c); }
+
+// 16 >> 64
+inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b) // same products as v_dotprod_expand, but without the final unpack regrouping
+{
+    __m512i mullo = _mm512_mullo_epi16(a.val, b.val); // low 16 bits of every product
+    __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val); // high 16 bits of every (unsigned) product
+    __m512i mul0  = _mm512_unpacklo_epi16(mullo, mulhi); // full 32-bit products of elements 0..3 of each 128-bit lane
+    __m512i mul1  = _mm512_unpackhi_epi16(mullo, mulhi); // full 32-bit products of elements 4..7 of each 128-bit lane
+
+    __m512i p02   = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512()); // even dwords of mul0, zero-extended to 64 bits
+    __m512i p13   = _mm512_srli_epi64(mul0, 32);                                   // odd dwords of mul0
+    __m512i p46   = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512()); // even dwords of mul1
+    __m512i p57   = _mm512_srli_epi64(mul1, 32);                                   // odd dwords of mul1
+
+    __m512i p15_  = _mm512_add_epi64(p02, p13); // per lane: (p0+p1, p2+p3)
+    __m512i p9d_  = _mm512_add_epi64(p46, p57); // per lane: (p4+p5, p6+p7)
+    return v_uint64x8(_mm512_add_epi64(p15_, p9d_)); // per lane: (p0+p1+p4+p5, p2+p3+p6+p7) - grouping differs from v_dotprod_expand, total is the same
+}
+inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) // accumulating overload
+{ return v_dotprod_expand_fast(a, b) + c; }
+
+inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b) // forwards to v_dotprod_expand
+{ return v_dotprod_expand(a, b); }
+inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c) // forwards to the accumulating v_dotprod_expand
+{ return v_dotprod_expand(a, b, c); }
+
+// 32 >> 64f
+inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b)
+{ const v_float64x8 exact = v_dotprod_expand(a, b); return exact; } // the regular 32 >> 64f routine serves as the fast path
+inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
+{ const v_float64x8 ab = v_dotprod_expand(a, b); return ab + c; }   // accumulate c in double
+
+
+#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \
+    v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im))) // broadcasts float #im (0..3) of each 128-bit lane across that lane
+
+inline v_float32x16 v_matmul(const v_float32x16& v,
+                             const v_float32x16& m0, const v_float32x16& m1,
+                             const v_float32x16& m2, const v_float32x16& m3)
+{   // Per 128-bit lane: v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3, accumulated innermost term first.
+    v_float32x16 acc = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3) * m3;
+    acc = v_fma(OPENCV_HAL_AVX512_SPLAT2_PS(v, 2), m2, acc);
+    acc = v_fma(OPENCV_HAL_AVX512_SPLAT2_PS(v, 1), m1, acc);
+    acc = v_fma(OPENCV_HAL_AVX512_SPLAT2_PS(v, 0), m0, acc);
+    return acc;
+}
+
+inline v_float32x16 v_matmuladd(const v_float32x16& v,
+                                const v_float32x16& m0, const v_float32x16& m1,
+                                const v_float32x16& m2, const v_float32x16& a)
+{   // Per 128-bit lane: v[0]*m0 + v[1]*m1 + v[2]*m2 + a; v[3] is not used.
+    v_float32x16 acc = v_fma(OPENCV_HAL_AVX512_SPLAT2_PS(v, 2), m2, a);
+    acc = v_fma(OPENCV_HAL_AVX512_SPLAT2_PS(v, 1), m1, acc);
+    acc = v_fma(OPENCV_HAL_AVX512_SPLAT2_PS(v, 0), m0, acc);
+    return acc;
+}
+
+#define OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
+    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,              \
+                               const _Tpvec& a2, const _Tpvec& a3,              \
+                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3)  \
+    {                                                                           \
+        __m512i t0 = cast_from(_mm512_unpacklo_##suffix(a0.val, a1.val));       \
+        __m512i t1 = cast_from(_mm512_unpacklo_##suffix(a2.val, a3.val));       \
+        __m512i t2 = cast_from(_mm512_unpackhi_##suffix(a0.val, a1.val));       \
+        __m512i t3 = cast_from(_mm512_unpackhi_##suffix(a2.val, a3.val));       \
+        b0.val = cast_to(_mm512_unpacklo_epi64(t0, t1));                        \
+        b1.val = cast_to(_mm512_unpackhi_epi64(t0, t1));                        \
+        b2.val = cast_to(_mm512_unpacklo_epi64(t2, t3));                        \
+        b3.val = cast_to(_mm512_unpackhi_epi64(t2, t3));                        \
+    } // two unpack rounds (32-bit, then 64-bit): transposes the 4x4 block held in each 128-bit lane of a0..a3 independently
+
+OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_uint32x16,  epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) // integer types need no register-type cast
+OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_int32x16,   epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_float32x16, ps, _mm512_castps_si512, _mm512_castsi512_ps) // float: cast to __m512i for the 64-bit unpacks and back
+
+//////////////// Value reordering ///////////////
+
+/* Expand */
+#define OPENCV_HAL_IMPL_AVX512_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
+    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+    {                                                               \
+        b0.val = intrin(_v512_extract_low(a.val));                  \
+        b1.val = intrin(_v512_extract_high(a.val));                 \
+    }                                                               \
+    inline _Tpwvec v_expand_low(const _Tpvec& a)                    \
+    { return _Tpwvec(intrin(_v512_extract_low(a.val))); }           \
+    inline _Tpwvec v_expand_high(const _Tpvec& a)                   \
+    { return _Tpwvec(intrin(_v512_extract_high(a.val))); }          \
+    inline _Tpwvec v512_load_expand(const _Tp* ptr)                 \
+    {                                                               \
+        __m256i a = _mm256_loadu_si256((const __m256i*)ptr);        \
+        return _Tpwvec(intrin(a));                                  \
+    } // generates v_expand / v_expand_low / v_expand_high / v512_load_expand; v512_load_expand reads 256 bits (unaligned) from ptr and widens them with intrin
+
+OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint8x64,  v_uint16x32, uchar,    _mm512_cvtepu8_epi16)  // unsigned sources use the zero-extending cvtepu* conversions
+OPENCV_HAL_IMPL_AVX512_EXPAND(v_int8x64,   v_int16x32,  schar,    _mm512_cvtepi8_epi16)  // signed sources use the sign-extending cvtepi* conversions
+OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint16x32, v_uint32x16, ushort,   _mm512_cvtepu16_epi32)
+OPENCV_HAL_IMPL_AVX512_EXPAND(v_int16x32,  v_int32x16,  short,    _mm512_cvtepi16_epi32)
+OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint32x16, v_uint64x8,  unsigned, _mm512_cvtepu32_epi64)
+OPENCV_HAL_IMPL_AVX512_EXPAND(v_int32x16,  v_int64x8,   int,      _mm512_cvtepi32_epi64)
+
+#define OPENCV_HAL_IMPL_AVX512_EXPAND_Q(_Tpvec, _Tp, intrin) \
+    inline _Tpvec v512_load_expand_q(const _Tp* ptr)         \
+    {                                                        \
+        __m128i a = _mm_loadu_si128((const __m128i*)ptr);    \
+        return _Tpvec(intrin(a));                            \
+    } // quad expand: reads 16 bytes (unaligned) from ptr and widens each 8-bit value to 32 bits
+
+OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_uint32x16, uchar, _mm512_cvtepu8_epi32) // zero-extend
+OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_int32x16,  schar, _mm512_cvtepi8_epi32) // sign-extend
+
+/* pack */
+// 16
+inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b) // signed-saturating 16 -> 8 pack of a and b
+{ return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); } // packs works per 128-bit lane; the qword permute regroups the per-lane halves (presumably a's results low, b's high - depends on _v512_set_epu64 argument order)
+
+inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b) // unsigned-saturating 16 -> 8 pack
+{
+    const __m512i t = _mm512_set1_epi16(255); // clamp value: min_epu16 saturates before the truncating convert
+    return v_uint8x64(_v512_combine(_mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, t)), _mm512_cvtepi16_epi8(_mm512_min_epu16(b.val, t)))); // two 256-bit halves joined by _v512_combine
+}
+
+inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b) // signed input, unsigned-saturating output (packus)
+{
+    return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val))); // same qword regrouping as the signed v_pack
+}
+
+inline void v_pack_store(schar* ptr, const v_int16x32& a) // packs a with itself and stores via v_store_low
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(uchar* ptr, const v_uint16x32& a) // writes 32 bytes (one 256-bit unaligned store)
+{
+    const __m512i m = _mm512_set1_epi16(255); // saturation bound applied before truncation
+    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, m)));
+}
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x32& a) // packs a with itself (unsigned saturation) and stores via v_store_low
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+template<int n> inline
+v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
+{
+    // n > 0 is assumed, so each shifted 16-bit value can be handed to the signed-input pack.
+    const v_uint16x32 half = v512_setall_u16((short)(1 << (n-1)));
+    return v_pack_u(v_reinterpret_as_s16((a + half) >> n),
+                    v_reinterpret_as_s16((b + half) >> n));
+}
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
+{
+    const v_uint16x32 half = v512_setall_u16((short)(1 << (n-1))); // rounding term: 2^(n-1)
+    v_pack_u_store(ptr, v_reinterpret_as_s16((a + half) >> n));
+}
+
+template<int n> inline
+v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
+{
+    const v_int16x32 half = v512_setall_s16((short)(1 << (n-1))); // rounding term: 2^(n-1)
+    return v_pack_u((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
+{
+    const v_int16x32 half = v512_setall_s16((short)(1 << (n-1))); // rounding term: 2^(n-1)
+    v_pack_u_store(ptr, (a + half) >> n);
+}
+
+template<int n> inline
+v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
+{
+    const v_int16x32 half = v512_setall_s16((short)(1 << (n-1))); // rounding term: 2^(n-1)
+    return v_pack((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
+{
+    const v_int16x32 half = v512_setall_s16((short)(1 << (n-1))); // rounding term: 2^(n-1)
+    v_pack_store(ptr, (a + half) >> n);
+}
+
+// 32
+inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b) // signed-saturating 32 -> 16 pack of a and b
+{ return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); } // qword permute regroups the per-128-bit-lane halves that packs produces
+
+inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b) // unsigned-saturating 32 -> 16 pack
+{
+    const __m512i m = _mm512_set1_epi32(65535); // clamp value: min_epu32 saturates before the truncating convert
+    return v_uint16x32(_v512_combine(_mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)), _mm512_cvtepi32_epi16(_mm512_min_epu32(b.val, m)))); // two 256-bit halves joined by _v512_combine
+}
+
+inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b) // signed input, unsigned-saturating output (packus)
+{ return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); }
+
+inline void v_pack_store(short* ptr, const v_int32x16& a) // packs a with itself and stores via v_store_low
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(ushort* ptr, const v_uint32x16& a) // writes 16 ushort values (one 256-bit unaligned store)
+{
+    const __m512i m = _mm512_set1_epi32(65535); // saturation bound applied before truncation
+    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)));
+}
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x16& a) // packs a with itself (unsigned saturation) and stores via v_store_low
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+
+template<int n> inline
+v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
+{
+    const v_uint32x16 half = v512_setall_u32(1 << (n-1)); // rounding term: 2^(n-1)
+    return v_pack_u(v_reinterpret_as_s32((a + half) >> n),
+                    v_reinterpret_as_s32((b + half) >> n));
+}
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
+{
+    const v_uint32x16 half = v512_setall_u32(1 << (n-1)); // rounding term: 2^(n-1)
+    v_pack_u_store(ptr, v_reinterpret_as_s32((a + half) >> n));
+}
+
+template<int n> inline
+v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
+{
+    const v_int32x16 half = v512_setall_s32(1 << (n-1)); // rounding term: 2^(n-1)
+    return v_pack_u((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
+{
+    const v_int32x16 half = v512_setall_s32(1 << (n-1)); // rounding term: 2^(n-1)
+    v_pack_u_store(ptr, (a + half) >> n);
+}
+
+template<int n> inline
+v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
+{
+    const v_int32x16 half = v512_setall_s32(1 << (n-1)); // rounding term: 2^(n-1)
+    return v_pack((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x16& a)
+{
+    const v_int32x16 half = v512_setall_s32(1 << (n-1)); // rounding term: 2^(n-1)
+    v_pack_store(ptr, (a + half) >> n);
+}
+
+// 64
+// Non-saturating pack
+inline v_uint32x16 v_pack(const v_uint64x8& a, const v_uint64x8& b) // truncating 64 -> 32 pack (cvtepi64_epi32 keeps the low 32 bits, no saturation)
+{ return v_uint32x16(_v512_combine(_mm512_cvtepi64_epi32(a.val), _mm512_cvtepi64_epi32(b.val))); }
+
+inline v_int32x16 v_pack(const v_int64x8& a, const v_int64x8& b) // signed variant: forwards to the unsigned truncating pack
+{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
+
+inline void v_pack_store(unsigned* ptr, const v_uint64x8& a) // writes 8 truncated 32-bit values (one 256-bit unaligned store)
+{ _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi64_epi32(a.val)); }
+
+inline void v_pack_store(int* ptr, const v_int64x8& b) // signed variant: forwards to the unsigned store
+{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
+
+template<int n> inline
+v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
+{
+    const v_uint64x8 half = v512_setall_u64((uint64)1 << (n-1)); // rounding term: 2^(n-1), built in 64 bits
+    return v_pack((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
+{
+    const v_uint64x8 half = v512_setall_u64((uint64)1 << (n-1)); // rounding term: 2^(n-1)
+    v_pack_store(ptr, (a + half) >> n);
+}
+
+template<int n> inline
+v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
+{
+    const v_int64x8 half = v512_setall_s64((int64)1 << (n-1)); // rounding term: 2^(n-1)
+    return v_pack((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x8& a)
+{
+    const v_int64x8 half = v512_setall_s64((int64)1 << (n-1)); // rounding term: 2^(n-1)
+    v_pack_store(ptr, (a + half) >> n);
+}
+
+// pack boolean
+inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b) // NOTE(review): inputs are presumably boolean masks (all-ones / all-zeros lanes), which the signed-saturating pack keeps intact
+{ return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); } // qword permute regroups the per-128-bit-lane output of packs
+
+inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b,
+                           const v_uint32x16& c, const v_uint32x16& d) // four 32-bit inputs narrowed to one 8-bit vector
+{
+    __m512i ab = _mm512_packs_epi32(a.val, b.val); // 32 -> 16, per 128-bit lane
+    __m512i cd = _mm512_packs_epi32(c.val, d.val);
+
+    return v_uint8x64(_mm512_permutexvar_epi32(_v512_set_epu32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), _mm512_packs_epi16(ab, cd))); // 16 -> 8, then a dword permute regroups the per-lane pieces
+}
+
+inline v_uint8x64 v_pack_b(const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
+                           const v_uint64x8& d, const v_uint64x8& e, const v_uint64x8& f,
+                           const v_uint64x8& g, const v_uint64x8& h) // eight 64-bit inputs narrowed to one 8-bit vector
+{
+    __m512i ab = _mm512_packs_epi32(a.val, b.val); // each 64-bit lane is treated as two 32-bit halves here
+    __m512i cd = _mm512_packs_epi32(c.val, d.val);
+    __m512i ef = _mm512_packs_epi32(e.val, f.val);
+    __m512i gh = _mm512_packs_epi32(g.val, h.val);
+
+    __m512i abcd = _mm512_packs_epi32(ab, cd); // second 32 -> 16 narrowing round
+    __m512i efgh = _mm512_packs_epi32(ef, gh);
+
+    return v_uint8x64(_mm512_permutexvar_epi16(_v512_set_epu16(31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4,
+                                                               27, 19, 11, 3, 26, 18, 10, 2, 25, 17,  9, 1, 24, 16,  8, 0), _mm512_packs_epi16(abcd, efgh))); // final 16 -> 8, then a 16-bit permute regroups the per-lane pieces
+}
+
+/* Recombine */
+// it's up there with the load and store operations
+
+/* Extract */
+#define OPENCV_HAL_IMPL_AVX512_EXTRACT(_Tpvec)                \
+    template<int s>                                           \
+    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+    { return v_rotate_right<s>(a, b); } // v_extract<s>(a, b) is a direct alias of the two-operand v_rotate_right<s>
+
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint8x64)  // one v_extract template per 512-bit vector type
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int8x64)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint16x32)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int16x32)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint32x16)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int32x16)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint64x8)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8)
+
+#define OPENCV_HAL_IMPL_AVX512_EXTRACT_N(_Tpvec, _Tp) \
+template<int i> inline _Tp v_extract_n(_Tpvec v) { return v_rotate_right<i>(v).get0(); } // rotates v right by i lanes, then reads the first lane with get0()
+
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint8x64, uchar)    // second argument is the scalar type returned by v_extract_n
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int8x64, schar)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint16x32, ushort)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int16x32, short)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint32x16, uint)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int32x16, int)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint64x8, uint64)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int64x8, int64)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float32x16, float)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float64x8, double)
+
+template<int i>
+inline v_uint32x16 v_broadcast_element(v_uint32x16 a)
+{   // Copies 32-bit lane i of a into all 16 lanes (each permute index selects lane i).
+    const __m512i perm = _mm512_set1_epi32((char)i); // plain local constant: a function-local static can add a guarded one-time initialisation to every call
+    return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
+}
+
+template<int i>
+inline v_int32x16 v_broadcast_element(const v_int32x16 &a) // signed variant: forwards to the u32 implementation
+{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
+
+template<int i>
+inline v_float32x16 v_broadcast_element(const v_float32x16 &a) // float variant: lanes are moved as 32-bit words by the u32 implementation
+{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
+
+
+///////////////////// load deinterleave /////////////////////////////
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b ) // reads 128 interleaved bytes: a gets the even-indexed bytes, b the odd-indexed ones
+{
+    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);        // bytes 0..63 (unaligned)
+    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); // bytes 64..127
+#if CV_AVX_512VBMI // VBMI path: one two-source byte permute per output
+    __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
+                                    94,  92,  90,  88,  86,  84,  82,  80,  78,  76,  74,  72,  70,  68, 66, 64,
+                                    62,  60,  58,  56,  54,  52,  50,  48,  46,  44,  42,  40,  38,  36, 34, 32,
+                                    30,  28,  26,  24,  22,  20,  18,  16,  14,  12,  10,   8,   6,   4,  2,  0); // even byte indices across ab0:ab1
+    __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
+                                    95,  93,  91,  89,  87,  85,  83,  81,  79,  77,  75,  73,  71,  69, 67, 65,
+                                    63,  61,  59,  57,  55,  53,  51,  49,  47,  45,  43,  41,  39,  37, 35, 33,
+                                    31,  29,  27,  25,  23,  21,  19,  17,  15,  13,  11,   9,   7,   5,  3,  1); // odd byte indices across ab0:ab1
+    a = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask0, ab1));
+    b = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask1, ab1));
+#else // fallback: split even/odd bytes inside each 128-bit lane, then gather the qwords
+    __m512i mask0 = _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200); // per lane: even bytes to the low qword, odd bytes to the high qword
+    __m512i a0b0 = _mm512_shuffle_epi8(ab0, mask0);
+    __m512i a1b1 = _mm512_shuffle_epi8(ab1, mask0);
+    __m512i mask1 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0); // even qwords of a0b0:a1b1
+    __m512i mask2 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1); // odd qwords of a0b0:a1b1
+    a = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask1, a1b1));
+    b = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask2, a1b1));
+#endif
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b ) // reads 64 interleaved ushort values: a gets the even-indexed ones, b the odd-indexed ones
+{
+    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);        // elements 0..31 (unaligned)
+    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); // elements 32..63
+    __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
+                                    30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10,  8,  6,  4,  2,  0); // even element indices across ab0:ab1
+    __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
+                                    31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11,  9,  7,  5,  3,  1); // odd element indices across ab0:ab1
+    a = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask0, ab1));
+    b = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask1, ab1));
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b ) // reads 32 interleaved 32-bit values: a gets the even-indexed ones, b the odd-indexed ones
+{
+    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);        // elements 0..15 (unaligned)
+    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); // elements 16..31
+    __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); // even element indices across ab0:ab1
+    __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); // odd element indices across ab0:ab1
+    a = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask0, ab1));
+    b = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask1, ab1));
+}
+
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b ) // reads 16 interleaved 64-bit values: a gets the even-indexed ones, b the odd-indexed ones
+{
+    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);       // elements 0..7 (unaligned)
+    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 8)); // elements 8..15
+    __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0); // even element indices across ab0:ab1
+    __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1); // odd element indices across ab0:ab1
+    a = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask0, ab1));
+    b = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask1, ab1));
+}
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c ) // reads 192 bytes of 3-channel data and splits them into a, b, c; three build-time variants follow
+{
+    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);         // bytes 0..63 (unaligned)
+    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));  // bytes 64..127
+    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 128)); // bytes 128..191
+
+#if CV_AVX_512VBMI2 // variant 1: byte permutes plus mask compress/expand
+    __m512i mask0 = _v512_set_epu8(126, 123, 120, 117, 114, 111, 108, 105, 102,  99,  96,  93,  90,  87,  84, 81,
+                                    78,  75,  72,  69,  66,  63,  60,  57,  54,  51,  48,  45,  42,  39,  36, 33,
+                                    30,  27,  24,  21,  18,  15,  12,   9,   6,   3,   0,  62,  59,  56,  53, 50,
+                                    47,  44,  41,  38,  35,  32,  29,  26,  23,  20,  17,  14,  11,   8,   5,  2);
+    __m512i r0b01 = _mm512_permutex2var_epi8(bgr0, mask0, bgr1); // same stride-3 index table applied to (bgr0, bgr1) ...
+    __m512i b1g12 = _mm512_permutex2var_epi8(bgr1, mask0, bgr2); // ... and to (bgr1, bgr2)
+    __m512i r12b2 = _mm512_permutex2var_epi8(bgr1,
+                    _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101,  98,  95,  92,  89,  86,  83, 80,
+                                    77,  74,  71,  68,  65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100, 97,
+                                    94,  91,  88,  85,  82,  79,  76,  73,  70,  67,  64,  61,  58,  55,  52, 49,
+                                    46,  43,  40,  37,  34,  31,  28,  25,  22,  19,  16,  13,  10,   7,   4,  1), bgr2);
+    a = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01));
+    b = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0));
+    c = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2));
+#elif CV_AVX_512VBMI // variant 2: byte blends followed by one two-source byte permute per output
+    __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0); // the same blend mask is reused for all three register pairs
+    __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1);
+    __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2);
+    a = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101,  98,  95,  92,  89,  86,  83,  80,
+                                                                    77,  74,  71,  68,  65,  63,  61,  60,  58,  57,  55,  54,  52,  51,  49,  48,
+                                                                    46,  45,  43,  42,  40,  39,  37,  36,  34,  33,  31,  30,  28,  27,  25,  24,
+                                                                    23,  21,  20,  18,  17,  15,  14,  12,  11,   9,   8,   6,   5,   3,   2,   0), bgr2));
+    b = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63,  61,  60,  58,  57,  55,  54,  52,  51,  49,  48,  46,  45,  43,  42,  40,
+                                                                    39,  37,  36,  34,  33,  31,  30,  28,  27,  25,  24,  23,  21,  20,  18,  17,
+                                                                    15,  14,  12,  11,   9,   8,   6,   5,   3,   2,   0, 126, 123, 120, 117, 114,
+                                                                   111, 108, 105, 102,  99,  96,  93,  90,  87,  84,  81,  78,  75,  72,  69,  66), bgr0));
+    c = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63,  60,  57,  54,  51,  48,  45,  42,  39,  36,  33,  30,  27,  24,  21,  18,
+                                                                    15,  12,   9,   6,   3,   0, 125, 122, 119, 116, 113, 110, 107, 104, 101,  98,
+                                                                    95,  92,  89,  86,  83,  80,  77,  74,  71,  68,  65,  62,  59,  56,  53,  50,
+                                                                    47,  44,  41,  38,  35,  32,  29,  26,  23,  20,  17,  14,  11,   8,   5,   2), bgr1));
+#else // variant 3: no byte permutes available, so the work is done on 16-bit pairs and finished with byte blends
+    __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
+                                    45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12,  9,  6,  3,  0);
+    __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1); // the same stride-3 word table is applied to each rotated register pair
+    __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
+    __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);
+
+    __m512i b0g0 = _mm512_mask_blend_epi32(0xf800, b01g1, r12b2); // top 5 dwords come from r12b2, the rest from b01g1
+    __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
+                                                                   14, 11,  8,  5,  2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0);
+    __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11);
+    a = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1)); // even bytes from b0g0, odd bytes from r0b1
+    c = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1)); // even bytes from r0b1, odd bytes from g1r1
+    b = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001))); // blend, then swap the two bytes of every 16-bit pair
+#endif
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c )
+{ // Reads 96 consecutive ushort values starting at ptr and writes the three de-interleaved vectors to a, b and c.
+    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); // elements 0..31 (unaligned load)
+    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); // elements 32..63
+    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); // elements 64..95
+    // Read right to left, mask0 steps through the indices by 3: 0..63 first, then 34..61.
+    __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
+                                    45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12,  9,  6,  3,  0); // NOTE(review): lane order of _v512_set_epu16 is defined elsewhere - confirm
+    __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1); // same index table applied to the pair (bgr0, bgr1)
+    __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2); // ... to the pair (bgr1, bgr2)
+    __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0); // ... to the pair (bgr2, bgr0)
+    // The three partial results are stitched into full output vectors below.
+    a = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2)); // 32-bit lanes 11..15 come from r12b2, the rest from b01g1
+    b = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
+                                                                    14, 11,  8,  5,  2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0)); // indices >= 32 select from g20r0, smaller ones from bgr1
+    c = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11)); // 32-bit lanes 11..15 of g20r0 followed by lanes 0..10 of r12b2
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c )
+{ // Reads 48 consecutive 32-bit values starting at ptr and writes the three de-interleaved vectors to a, b and c.
+    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); // elements 0..15 (unaligned load)
+    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); // elements 16..31
+    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); // elements 32..47
+    // Read right to left, mask0 steps through the indices by 3: 0..30 first, then 17..29.
+    __m512i mask0 = _v512_set_epu32(29, 26, 23, 20, 17, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); // NOTE(review): lane order of _v512_set_epu32 is defined elsewhere - confirm
+    __m512i b01r1 = _mm512_permutex2var_epi32(bgr0, mask0, bgr1); // same index table applied to the pair (bgr0, bgr1)
+    __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2); // ... to the pair (bgr1, bgr2)
+    __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0); // ... to the pair (bgr2, bgr0)
+    // The three partial results are stitched into full output vectors below.
+    a = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2)); // lanes 11..15 come from g12b2, the rest from b01r1
+    b = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11)); // lanes 11..15 of r20g0 followed by lanes 0..10 of g12b2
+    c = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0)); // indices >= 16 select from r20g0, smaller ones from bgr1
+}
+
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c )
+{ // Reads 24 consecutive 64-bit values starting at ptr and writes the three de-interleaved vectors to a, b and c.
+    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); // elements 0..7 (unaligned load)
+    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8)); // elements 8..15
+    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); // elements 16..23
+    // Read right to left, mask0 steps through the indices by 3: 0..15 first, then 10 and 13.
+    __m512i mask0 = _v512_set_epu64(13, 10, 15, 12, 9, 6, 3, 0); // NOTE(review): lane order of _v512_set_epu64 is defined elsewhere - confirm
+    __m512i b01g1 = _mm512_permutex2var_epi64(bgr0, mask0, bgr1); // same index table applied to the pair (bgr0, bgr1)
+    __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2); // ... to the pair (bgr1, bgr2)
+    __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0); // ... to the pair (bgr2, bgr0)
+    // The three partial results are stitched into full output vectors below.
+    a = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2)); // lanes 6..7 come from r12b2, the rest from b01g1
+    c = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6)); // lanes 6..7 of g20r0 followed by lanes 0..5 of r12b2
+    b = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0)); // indices >= 8 select from g20r0, smaller ones from bgr1
+}
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c, v_uint8x64& d )
+{ // Reads 256 consecutive bytes starting at ptr and writes the four de-interleaved vectors to a, b, c and d.
+    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); // bytes 0..63 (unaligned load)
+    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); // bytes 64..127
+    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 128)); // bytes 128..191
+    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 192)); // bytes 192..255
+    // Two implementations follow; which one is compiled depends on CV_AVX_512VBMI.
+#if CV_AVX_512VBMI
+    __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
+                                    94,  92,  90,  88,  86,  84,  82,  80,  78,  76,  74,  72,  70,  68, 66, 64,
+                                    62,  60,  58,  56,  54,  52,  50,  48,  46,  44,  42,  40,  38,  36, 34, 32,
+                                    30,  28,  26,  24,  22,  20,  18,  16,  14,  12,  10,   8,   6,   4,  2,  0); // all even byte indices 0..126
+    __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
+                                    95,  93,  91,  89,  87,  85,  83,  81,  79,  77,  75,  73,  71,  69, 67, 65,
+                                    63,  61,  59,  57,  55,  53,  51,  49,  47,  45,  43,  41,  39,  37, 35, 33,
+                                    31,  29,  27,  25,  23,  21,  19,  17,  15,  13,  11,   9,   7,   5,  3,  1); // all odd byte indices 1..127
+    // First pass: split each pair of inputs into its even-indexed and odd-indexed bytes.
+    __m512i br01 = _mm512_permutex2var_epi8(bgra0, mask0, bgra1);
+    __m512i ga01 = _mm512_permutex2var_epi8(bgra0, mask1, bgra1);
+    __m512i br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3);
+    __m512i ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3);
+    // Second pass: the same even/odd split applied to the first-pass results gives the four outputs.
+    a = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23));
+    c = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23));
+    b = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23));
+    d = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23));
+#else
+    __m512i mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400); // per 128-bit lane: bytes 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15
+    __m512i b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask); // after this each 32-bit element holds four bytes that shared the same index modulo 4
+    __m512i b1g1r1a1 = _mm512_shuffle_epi8(bgra1, mask);
+    __m512i b2g2r2a2 = _mm512_shuffle_epi8(bgra2, mask);
+    __m512i b3g3r3a3 = _mm512_shuffle_epi8(bgra3, mask);
+    // From here on the split works on whole 32-bit elements.
+    __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); // even 32-bit indices
+    __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); // odd 32-bit indices
+    // First pass: even/odd split of each shuffled input pair.
+    __m512i br01 = _mm512_permutex2var_epi32(b0g0r0a0, mask0, b1g1r1a1);
+    __m512i ga01 = _mm512_permutex2var_epi32(b0g0r0a0, mask1, b1g1r1a1);
+    __m512i br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3);
+    __m512i ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3);
+    // Second pass: the same even/odd split applied to the first-pass results gives the four outputs.
+    a = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23));
+    c = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23));
+    b = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23));
+    d = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23));
+#endif
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c, v_uint16x32& d )
+{ // De-interleaves 128 consecutive ushort values (four 512-bit loads) into a, b, c and d.
+    const __m512i src0 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
+    const __m512i src1 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr + 32));
+    const __m512i src2 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr + 64));
+    const __m512i src3 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr + 96));
+    // Index tables holding all even / all odd positions of a two-vector (64-element) source.
+    const __m512i even_idx = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
+                                             30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10,  8,  6,  4,  2,  0);
+    const __m512i odd_idx = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
+                                            31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11,  9,  7,  5,  3,  1);
+    // Pass 1: even/odd split of each input pair.
+    const __m512i even01 = _mm512_permutex2var_epi16(src0, even_idx, src1);
+    const __m512i odd01 = _mm512_permutex2var_epi16(src0, odd_idx, src1);
+    const __m512i even23 = _mm512_permutex2var_epi16(src2, even_idx, src3);
+    const __m512i odd23 = _mm512_permutex2var_epi16(src2, odd_idx, src3);
+    // Pass 2: even/odd split of the pass-1 results yields the four outputs.
+    a = v_uint16x32(_mm512_permutex2var_epi16(even01, even_idx, even23));
+    c = v_uint16x32(_mm512_permutex2var_epi16(even01, odd_idx, even23));
+    b = v_uint16x32(_mm512_permutex2var_epi16(odd01, even_idx, odd23));
+    d = v_uint16x32(_mm512_permutex2var_epi16(odd01, odd_idx, odd23));
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c, v_uint32x16& d )
+{ // De-interleaves 64 consecutive 32-bit values (four 512-bit loads) into a, b, c and d.
+    const __m512i src0 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
+    const __m512i src1 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr + 16));
+    const __m512i src2 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr + 32));
+    const __m512i src3 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr + 48));
+    // Index tables holding all even / all odd positions of a two-vector (32-element) source.
+    const __m512i even_idx = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+    const __m512i odd_idx = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
+    // Pass 1: even/odd split of each input pair.
+    const __m512i even01 = _mm512_permutex2var_epi32(src0, even_idx, src1);
+    const __m512i odd01 = _mm512_permutex2var_epi32(src0, odd_idx, src1);
+    const __m512i even23 = _mm512_permutex2var_epi32(src2, even_idx, src3);
+    const __m512i odd23 = _mm512_permutex2var_epi32(src2, odd_idx, src3);
+    // Pass 2: even/odd split of the pass-1 results yields the four outputs.
+    a = v_uint32x16(_mm512_permutex2var_epi32(even01, even_idx, even23));
+    c = v_uint32x16(_mm512_permutex2var_epi32(even01, odd_idx, even23));
+    b = v_uint32x16(_mm512_permutex2var_epi32(odd01, even_idx, odd23));
+    d = v_uint32x16(_mm512_permutex2var_epi32(odd01, odd_idx, odd23));
+}
+
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c, v_uint64x8& d )
+{ // De-interleaves 32 consecutive 64-bit values (four 512-bit loads) into a, b, c and d.
+    const __m512i src0 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
+    const __m512i src1 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr + 8));
+    const __m512i src2 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr + 16));
+    const __m512i src3 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr + 24));
+    // Index tables holding all even / all odd positions of a two-vector (16-element) source.
+    const __m512i even_idx = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
+    const __m512i odd_idx = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
+    // Pass 1: even/odd split of each input pair.
+    const __m512i even01 = _mm512_permutex2var_epi64(src0, even_idx, src1);
+    const __m512i odd01 = _mm512_permutex2var_epi64(src0, odd_idx, src1);
+    const __m512i even23 = _mm512_permutex2var_epi64(src2, even_idx, src3);
+    const __m512i odd23 = _mm512_permutex2var_epi64(src2, odd_idx, src3);
+    // Pass 2: even/odd split of the pass-1 results yields the four outputs.
+    a = v_uint64x8(_mm512_permutex2var_epi64(even01, even_idx, even23));
+    c = v_uint64x8(_mm512_permutex2var_epi64(even01, odd_idx, even23));
+    b = v_uint64x8(_mm512_permutex2var_epi64(odd01, even_idx, odd23));
+    d = v_uint64x8(_mm512_permutex2var_epi64(odd01, odd_idx, odd23));
+}
+
+///////////////////////////// store interleave /////////////////////////////////////
+
+// Zips x with y (v_zip) and writes the two resulting vectors to ptr and ptr + 64 using the requested store mode.
+inline void v_store_interleave( uchar* ptr, const v_uint8x64& x, const v_uint8x64& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    v_uint8x64 lo, hi;
+    v_zip(x, y, lo, hi);
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE: // streaming stores
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr), lo.val);
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr + 64), hi.val);
+        break;
+    case hal::STORE_ALIGNED: // aligned stores
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr), lo.val);
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr + 64), hi.val);
+        break;
+    default: // every other mode uses unaligned stores
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), lo.val);
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr + 64), hi.val);
+    }
+}
+
+// Zips x with y (v_zip) and writes the two resulting vectors to ptr and ptr + 32 using the requested store mode.
+inline void v_store_interleave( ushort* ptr, const v_uint16x32& x, const v_uint16x32& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    v_uint16x32 lo, hi;
+    v_zip(x, y, lo, hi);
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE: // streaming stores
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr), lo.val);
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr + 32), hi.val);
+        break;
+    case hal::STORE_ALIGNED: // aligned stores
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr), lo.val);
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr + 32), hi.val);
+        break;
+    default: // every other mode uses unaligned stores
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), lo.val);
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr + 32), hi.val);
+    }
+}
+
+// Zips x with y (v_zip) and writes the two resulting vectors to ptr and ptr + 16 using the requested store mode.
+inline void v_store_interleave( unsigned* ptr, const v_uint32x16& x, const v_uint32x16& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    v_uint32x16 lo, hi;
+    v_zip(x, y, lo, hi);
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE: // streaming stores
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr), lo.val);
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr + 16), hi.val);
+        break;
+    case hal::STORE_ALIGNED: // aligned stores
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr), lo.val);
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr + 16), hi.val);
+        break;
+    default: // every other mode uses unaligned stores
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), lo.val);
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr + 16), hi.val);
+    }
+}
+
+// Zips x with y (v_zip) and writes the two resulting vectors to ptr and ptr + 8 using the requested store mode.
+inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64x8& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    v_uint64x8 lo, hi;
+    v_zip(x, y, lo, hi);
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE: // streaming stores
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr), lo.val);
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr + 8), hi.val);
+        break;
+    case hal::STORE_ALIGNED: // aligned stores
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr), lo.val);
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr + 8), hi.val);
+        break;
+    default: // every other mode uses unaligned stores
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), lo.val);
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr + 8), hi.val);
+    }
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, const v_uint8x64& c,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{ // Combines a, b and c into three 512-bit vectors (bgr0..bgr2) and stores them at ptr, ptr + 64 and ptr + 128.
+#if CV_AVX_512VBMI
+    __m512i mask0 = _v512_set_epu8(127,  84,  20, 126,  83,  19, 125,  82,  18, 124,  81,  17, 123,  80,  16, 122,
+                                    79,  15, 121,  78,  14, 120,  77,  13, 119,  76,  12, 118,  75,  11, 117,  74,
+                                    10, 116,  73,   9, 115,  72,   8, 114,  71,   7, 113,  70,   6, 112,  69,   5,
+                                   111,  68,   4, 110,  67,   3, 109,  66,   2, 108,  65,   1, 107,  64,   0, 106); // byte indices into the pair (b, c)
+    __m512i mask1 = _v512_set_epu8( 21,  42, 105,  20,  41, 104,  19,  40, 103,  18,  39, 102,  17,  38, 101,  16,
+                                    37, 100,  15,  36,  99,  14,  35,  98,  13,  34,  97,  12,  33,  96,  11,  32,
+                                    95,  10,  31,  94,   9,  30,  93,   8,  29,  92,   7,  28,  91,   6,  27,  90,
+                                     5,  26,  89,   4,  25,  88,   3,  24,  87,   2,  23,  86,   1,  22,  85,   0); // byte indices into the pair (a, c)
+    __m512i mask2 = _v512_set_epu8(106, 127,  63, 105, 126,  62, 104, 125,  61, 103, 124,  60, 102, 123,  59, 101,
+                                   122,  58, 100, 121,  57,  99, 120,  56,  98, 119,  55,  97, 118,  54,  96, 117,
+                                    53,  95, 116,  52,  94, 115,  51,  93, 114,  50,  92, 113,  49,  91, 112,  48,
+                                    90, 111,  47,  89, 110,  46,  88, 109,  45,  87, 108,  44,  86, 107,  43,  85); // byte indices into the pair (a, b)
+    __m512i r2g0r0 = _mm512_permutex2var_epi8(b.val, mask0, c.val);
+    __m512i b0r1b1 = _mm512_permutex2var_epi8(a.val, mask1, c.val);
+    __m512i g1b2g2 = _mm512_permutex2var_epi8(a.val, mask2, b.val);
+    // In each blend, byte lanes with index % 3 == 0 come from the second operand.
+    __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1);
+    __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2);
+    __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0);
+#else
+    __m512i g1g0 = _mm512_shuffle_epi8(b.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)); // swaps the two bytes of every 16-bit pair of b
+    __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, g1g0); // even bytes from a, odd bytes from g1g0
+    __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, c.val, a.val); // even bytes from c, odd bytes from a
+    __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, c.val); // even bytes from g1g0, odd bytes from c
+    // The remaining steps move whole 16-bit pairs; the index tables match the ushort 3-channel store.
+    __m512i mask0 = _v512_set_epu16(42, 10, 31, 41,  9, 30, 40,  8, 29, 39,  7, 28, 38,  6, 27, 37,
+                                     5, 26, 36,  4, 25, 35,  3, 24, 34,  2, 23, 33,  1, 22, 32,  0);
+    __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
+                                    47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
+    __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
+                                    26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
+    __m512i b0g0b2 = _mm512_permutex2var_epi16(b0g0, mask0, r0b1);
+    __m512i r1b1r0 = _mm512_permutex2var_epi16(b0g0, mask1, g1r1);
+    __m512i g2r2g1 = _mm512_permutex2var_epi16(r0b1, mask2, g1r1);
+    // In each blend, 16-bit lanes with index % 3 == 2 come from the second operand.
+    __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
+    __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
+    __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);
+#endif
+    // Only the store instruction differs between the three branches below.
+    if( mode == hal::STORE_ALIGNED_NOCACHE )
+    { // streaming stores
+        _mm512_stream_si512((__m512i*)ptr, bgr0);
+        _mm512_stream_si512((__m512i*)(ptr + 64), bgr1);
+        _mm512_stream_si512((__m512i*)(ptr + 128), bgr2);
+    }
+    else if( mode == hal::STORE_ALIGNED )
+    { // aligned stores
+        _mm512_store_si512((__m512i*)ptr, bgr0);
+        _mm512_store_si512((__m512i*)(ptr + 64), bgr1);
+        _mm512_store_si512((__m512i*)(ptr + 128), bgr2);
+    }
+    else
+    { // unaligned stores for every other mode
+        _mm512_storeu_si512((__m512i*)ptr, bgr0);
+        _mm512_storeu_si512((__m512i*)(ptr + 64), bgr1);
+        _mm512_storeu_si512((__m512i*)(ptr + 128), bgr2);
+    }
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, const v_uint16x32& c,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{ // Combines a, b and c into three 512-bit vectors (bgr0..bgr2) and stores them at ptr, ptr + 32 and ptr + 64.
+    __m512i mask0 = _v512_set_epu16(42, 10, 31, 41,  9, 30, 40,  8, 29, 39,  7, 28, 38,  6, 27, 37,
+                                     5, 26, 36,  4, 25, 35,  3, 24, 34,  2, 23, 33,  1, 22, 32,  0); // 16-bit indices into the pair (a, b)
+    __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
+                                    47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42); // 16-bit indices into the pair (a, c)
+    __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
+                                    26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21); // 16-bit indices into the pair (b, c)
+    __m512i b0g0b2 = _mm512_permutex2var_epi16(a.val, mask0, b.val);
+    __m512i r1b1r0 = _mm512_permutex2var_epi16(a.val, mask1, c.val);
+    __m512i g2r2g1 = _mm512_permutex2var_epi16(b.val, mask2, c.val);
+    // In each blend, 16-bit lanes with index % 3 == 2 come from the second operand.
+    __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
+    __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
+    __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);
+    // Only the store instruction differs between the three branches below.
+    if( mode == hal::STORE_ALIGNED_NOCACHE )
+    { // streaming stores
+        _mm512_stream_si512((__m512i*)ptr, bgr0);
+        _mm512_stream_si512((__m512i*)(ptr + 32), bgr1);
+        _mm512_stream_si512((__m512i*)(ptr + 64), bgr2);
+    }
+    else if( mode == hal::STORE_ALIGNED )
+    { // aligned stores
+        _mm512_store_si512((__m512i*)ptr, bgr0);
+        _mm512_store_si512((__m512i*)(ptr + 32), bgr1);
+        _mm512_store_si512((__m512i*)(ptr + 64), bgr2);
+    }
+    else
+    { // unaligned stores for every other mode
+        _mm512_storeu_si512((__m512i*)ptr, bgr0);
+        _mm512_storeu_si512((__m512i*)(ptr + 32), bgr1);
+        _mm512_storeu_si512((__m512i*)(ptr + 64), bgr2);
+    }
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, const v_uint32x16& c,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{ // Combines a, b and c into three 512-bit vectors (bgr0..bgr2) and stores them at ptr, ptr + 16 and ptr + 32.
+    __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21); // 32-bit indices into the pair (a, b)
+    __m512i mask1 = _v512_set_epu32(31, 10, 25, 30,  9, 24, 29,  8, 23, 28,  7, 22, 27,  6, 21, 26); // 32-bit indices into the pair (a, c)
+    __m512i g1b2g2 = _mm512_permutex2var_epi32(a.val, mask0, b.val);
+    __m512i r2r1b1 = _mm512_permutex2var_epi32(a.val, mask1, c.val);
+    // bgr0 = a0,b0,c0,a1,b1,c1,...: the low lanes of a, b and c are expanded into every third lane.
+    __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, a.val), 0x2492, b.val), 0x4924, c.val);
+    __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2); // lanes with index % 3 == 0 come from g1b2g2
+    __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1); // lanes with index % 3 == 0 come from r2r1b1
+    // Only the store instruction differs between the three branches below.
+    if( mode == hal::STORE_ALIGNED_NOCACHE )
+    { // streaming stores
+        _mm512_stream_si512((__m512i*)ptr, bgr0);
+        _mm512_stream_si512((__m512i*)(ptr + 16), bgr1);
+        _mm512_stream_si512((__m512i*)(ptr + 32), bgr2);
+    }
+    else if( mode == hal::STORE_ALIGNED )
+    { // aligned stores
+        _mm512_store_si512((__m512i*)ptr, bgr0);
+        _mm512_store_si512((__m512i*)(ptr + 16), bgr1);
+        _mm512_store_si512((__m512i*)(ptr + 32), bgr2);
+    }
+    else
+    { // unaligned stores for every other mode
+        _mm512_storeu_si512((__m512i*)ptr, bgr0);
+        _mm512_storeu_si512((__m512i*)(ptr + 16), bgr1);
+        _mm512_storeu_si512((__m512i*)(ptr + 32), bgr2);
+    }
+}
+
+inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{ // Combines a, b and c into three 512-bit vectors (bgr0..bgr2) and stores them at ptr, ptr + 8 and ptr + 16.
+    __m512i mask0 = _v512_set_epu64( 5, 12,  7,  4, 11,  6,  3, 10); // 64-bit indices into the pair (a, c)
+    __m512i mask1 = _v512_set_epu64(15,  7,  4, 14,  6,  3, 13,  5); // 64-bit indices into the pair (b, c)
+    __m512i r1b1b2 = _mm512_permutex2var_epi64(a.val, mask0, c.val);
+    __m512i g2r2g1 = _mm512_permutex2var_epi64(b.val, mask1, c.val);
+    // bgr0 = a0,b0,c0,a1,b1,c1,a2,b2: the low lanes of a, b and c are expanded into every third lane.
+    __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, a.val), 0x92, b.val), 0x24, c.val);
+    __m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2); // lanes 0,1,3,4,6,7 come from r1b1b2; lanes 2 and 5 from g2r2g1
+    __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1); // lanes 0,1,3,4,6,7 come from g2r2g1; lanes 2 and 5 from r1b1b2
+    // Only the store instruction differs between the three branches below.
+    if( mode == hal::STORE_ALIGNED_NOCACHE )
+    { // streaming stores
+        _mm512_stream_si512((__m512i*)ptr, bgr0);
+        _mm512_stream_si512((__m512i*)(ptr + 8), bgr1);
+        _mm512_stream_si512((__m512i*)(ptr + 16), bgr2);
+    }
+    else if( mode == hal::STORE_ALIGNED )
+    { // aligned stores
+        _mm512_store_si512((__m512i*)ptr, bgr0);
+        _mm512_store_si512((__m512i*)(ptr + 8), bgr1);
+        _mm512_store_si512((__m512i*)(ptr + 16), bgr2);
+    }
+    else
+    { // unaligned stores for every other mode
+        _mm512_storeu_si512((__m512i*)ptr, bgr0);
+        _mm512_storeu_si512((__m512i*)(ptr + 8), bgr1);
+        _mm512_storeu_si512((__m512i*)(ptr + 16), bgr2);
+    }
+}
+
+// Interleaves a, b, c and d with two rounds of v_zip and stores the four resulting vectors from ptr onwards.
+inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b,
+                                const v_uint8x64& c, const v_uint8x64& d,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    v_uint8x64 ac_lo, ac_hi, bd_lo, bd_hi;
+    v_zip(a, c, ac_lo, ac_hi);
+    v_zip(b, d, bd_lo, bd_hi);
+    v_uint8x64 q0, q1, q2, q3;
+    v_zip(ac_lo, bd_lo, q0, q1);
+    v_zip(ac_hi, bd_hi, q2, q3);
+    // q0..q3 are written in order; only the store instruction depends on mode.
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE: // streaming stores
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr), q0.val);
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr + 64), q1.val);
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr + 128), q2.val);
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr + 192), q3.val);
+        break;
+    case hal::STORE_ALIGNED: // aligned stores
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr), q0.val);
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr + 64), q1.val);
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr + 128), q2.val);
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr + 192), q3.val);
+        break;
+    default: // every other mode uses unaligned stores
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), q0.val);
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr + 64), q1.val);
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr + 128), q2.val);
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr + 192), q3.val);
+    }
+}
+
+// Interleaves a, b, c and d with two rounds of v_zip and stores the four resulting vectors from ptr onwards.
+inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b,
+                                const v_uint16x32& c, const v_uint16x32& d,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    v_uint16x32 ac_lo, ac_hi, bd_lo, bd_hi;
+    v_zip(a, c, ac_lo, ac_hi);
+    v_zip(b, d, bd_lo, bd_hi);
+    v_uint16x32 q0, q1, q2, q3;
+    v_zip(ac_lo, bd_lo, q0, q1);
+    v_zip(ac_hi, bd_hi, q2, q3);
+    // q0..q3 are written in order; only the store instruction depends on mode.
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE: // streaming stores
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr), q0.val);
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr + 32), q1.val);
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr + 64), q2.val);
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr + 96), q3.val);
+        break;
+    case hal::STORE_ALIGNED: // aligned stores
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr), q0.val);
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr + 32), q1.val);
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr + 64), q2.val);
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr + 96), q3.val);
+        break;
+    default: // every other mode uses unaligned stores
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), q0.val);
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr + 32), q1.val);
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr + 64), q2.val);
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr + 96), q3.val);
+    }
+}
+
+// Interleaves a, b, c and d with two rounds of v_zip and stores the four resulting vectors from ptr onwards.
+inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b,
+                                const v_uint32x16& c, const v_uint32x16& d,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    v_uint32x16 ac_lo, ac_hi, bd_lo, bd_hi;
+    v_zip(a, c, ac_lo, ac_hi);
+    v_zip(b, d, bd_lo, bd_hi);
+    v_uint32x16 q0, q1, q2, q3;
+    v_zip(ac_lo, bd_lo, q0, q1);
+    v_zip(ac_hi, bd_hi, q2, q3);
+    // q0..q3 are written in order; only the store instruction depends on mode.
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE: // streaming stores
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr), q0.val);
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr + 16), q1.val);
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr + 32), q2.val);
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr + 48), q3.val);
+        break;
+    case hal::STORE_ALIGNED: // aligned stores
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr), q0.val);
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr + 16), q1.val);
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr + 32), q2.val);
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr + 48), q3.val);
+        break;
+    default: // every other mode uses unaligned stores
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), q0.val);
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr + 16), q1.val);
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr + 32), q2.val);
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr + 48), q3.val);
+    }
+}
+
+// Interleaves a, b, c and d with two rounds of v_zip and stores the four resulting vectors from ptr onwards.
+inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b,
+                                const v_uint64x8& c, const v_uint64x8& d,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    v_uint64x8 ac_lo, ac_hi, bd_lo, bd_hi;
+    v_zip(a, c, ac_lo, ac_hi);
+    v_zip(b, d, bd_lo, bd_hi);
+    v_uint64x8 q0, q1, q2, q3;
+    v_zip(ac_lo, bd_lo, q0, q1);
+    v_zip(ac_hi, bd_hi, q2, q3);
+    // q0..q3 are written in order; only the store instruction depends on mode.
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE: // streaming stores
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr), q0.val);
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr + 8), q1.val);
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr + 16), q2.val);
+        _mm512_stream_si512(reinterpret_cast<__m512i*>(ptr + 24), q3.val);
+        break;
+    case hal::STORE_ALIGNED: // aligned stores
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr), q0.val);
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr + 8), q1.val);
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr + 16), q2.val);
+        _mm512_store_si512(reinterpret_cast<__m512i*>(ptr + 24), q3.val);
+        break;
+    default: // every other mode uses unaligned stores
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), q0.val);
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr + 8), q1.val);
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr + 16), q2.val);
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr + 24), q3.val);
+    }
+}
+
+#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) /* defines _Tp0/_Tpvec0 overloads that forward to the _Tp1/_Tpvec1 overloads */ \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
+{ /* 2-channel load: pointer is cast to _Tp1, results are reinterpreted back via v_reinterpret_as_<suffix0> */ \
+    _Tpvec1 a1, b1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
+{ /* 3-channel load, same forwarding scheme */ \
+    _Tpvec1 a1, b1, c1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
+{ /* 4-channel load, same forwarding scheme */ \
+    _Tpvec1 a1, b1, c1, d1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+    d0 = v_reinterpret_as_##suffix0(d1); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
+{ /* 2-channel store: inputs are reinterpreted via v_reinterpret_as_<suffix1>, mode is passed through */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, mode);      \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
+{ /* 3-channel store, same forwarding scheme */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode);  \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                const _Tpvec0& c0, const _Tpvec0& d0, \
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
+{ /* 4-channel store, same forwarding scheme */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
+}
+
+OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int8x64, schar, s8, v_uint8x64, uchar, u8) // schar overloads forward to the uchar ones
+OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int16x32, short, s16, v_uint16x32, ushort, u16) // short overloads forward to the ushort ones
+OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int32x16, int, s32, v_uint32x16, unsigned, u32) // int overloads forward to the unsigned ones
+OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float32x16, float, f32, v_uint32x16, unsigned, u32) // float overloads forward to the unsigned ones
+OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int64x8, int64, s64, v_uint64x8, uint64, u64) // int64 overloads forward to the uint64 ones
+OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8, uint64, u64) // double overloads forward to the uint64 ones
+
+////////// Mask and checks /////////
+
+/** Mask **/
+inline int64 v_signmask(const v_int8x64& a) { return static_cast<int64>(_mm512_movepi8_mask(a.val)); } // one bit per byte lane
+inline int v_signmask(const v_int16x32& a) { return static_cast<int>(_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT)); } // bit set where lane < 0
+inline int v_signmask(const v_int32x16& a) { return static_cast<int>(_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT)); } // bit set where lane < 0
+inline int v_signmask(const v_int64x8& a) { return static_cast<int>(_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT)); } // bit set where lane < 0
+
+inline int64 v_signmask(const v_uint8x64& a) { const v_int8x64 bits = v_reinterpret_as_s8(a); return v_signmask(bits); } // forwards to the signed overload
+inline int v_signmask(const v_uint16x32& a) { const v_int16x32 bits = v_reinterpret_as_s16(a); return v_signmask(bits); }
+inline int v_signmask(const v_uint32x16& a) { const v_int32x16 bits = v_reinterpret_as_s32(a); return v_signmask(bits); }
+inline int v_signmask(const v_uint64x8& a) { const v_int64x8 bits = v_reinterpret_as_s64(a); return v_signmask(bits); }
+inline int v_signmask(const v_float32x16& a) { const v_int32x16 bits = v_reinterpret_as_s32(a); return v_signmask(bits); }
+inline int v_signmask(const v_float64x8& a) { const v_int64x8 bits = v_reinterpret_as_s64(a); return v_signmask(bits); }
+
+/** Checks: v_check_all is true when no lane is >= 0 (every lane negative); v_check_any when at least one lane is negative **/
+inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
+inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_movepi8_mask(a.val); }
+inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
+inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
+inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
+inline bool v_check_any(const v_int32x16& a) { return (bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
+inline bool v_check_all(const v_int64x8& a) { return !(bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
+inline bool v_check_any(const v_int64x8& a) { return (bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
+// Floating-point and unsigned registers are reinterpreted as signed integers of the same lane width.
+inline bool v_check_all(const v_float32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); }
+inline bool v_check_any(const v_float32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
+inline bool v_check_all(const v_float64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); }
+inline bool v_check_any(const v_float64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }
+inline bool v_check_all(const v_uint8x64& a) { return v_check_all(v_reinterpret_as_s8(a)); }
+inline bool v_check_all(const v_uint16x32& a) { return v_check_all(v_reinterpret_as_s16(a)); }
+inline bool v_check_all(const v_uint32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); }
+inline bool v_check_all(const v_uint64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); }
+inline bool v_check_any(const v_uint8x64& a) { return v_check_any(v_reinterpret_as_s8(a)); }
+inline bool v_check_any(const v_uint16x32& a) { return v_check_any(v_reinterpret_as_s16(a)); }
+inline bool v_check_any(const v_uint32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
+inline bool v_check_any(const v_uint64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }
+
+inline int v_scan_forward(const v_int8x64& a)
+{
+    const int64 signs = _mm512_movepi8_mask(a.val); // one sign bit per 8-bit lane
+    const int low32 = (int)signs;                    // lower half of the mask
+    return signs == 0 ? 0 : (low32 != 0 ? trailingZeros32(low32) : 32 + trailingZeros32((int)(signs >> 32)));
+}
+inline int v_scan_forward(const v_uint8x64& a) { return v_scan_forward(v_reinterpret_as_s8(a)); }
+inline int v_scan_forward(const v_int16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
+inline int v_scan_forward(const v_uint16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
+inline int v_scan_forward(const v_int32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; } // mask is per 16-bit lane: two mask bits per 32-bit lane
+inline int v_scan_forward(const v_uint32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
+inline int v_scan_forward(const v_float32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
+inline int v_scan_forward(const v_int64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; } // four 16-bit mask bits per 64-bit lane
+inline int v_scan_forward(const v_uint64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
+inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
+
+inline void v512_cleanup() { _mm256_zeroall(); } // resets vector register state by calling _mm256_zeroall
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+} // cv::
+
+#endif // OPENCV_HAL_INTRIN_AVX_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_cpp.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_cpp.hpp
new file mode 100644
index 000000000000..8619fec60c53
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_cpp.hpp
@@ -0,0 +1,3317 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HAL_INTRIN_CPP_HPP
+#define OPENCV_HAL_INTRIN_CPP_HPP
+
+#include <limits>
+#include <cstring>
+#include <algorithm>
+#include "opencv2/core/utility.hpp"
+#include "opencv2/core/saturate.hpp"
+
+//! @cond IGNORED
+#define CV_SIMD128_CPP 1 // identifies this scalar (plain C++) intrinsic implementation
+#if defined(CV_FORCE_SIMD128_CPP) // opt-in: advertise 128-bit SIMD, including 64-bit float, on top of this implementation
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+#endif
+#if defined(CV_DOXYGEN) // documentation build: expose every register width
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+#define CV_SIMD256 1
+#define CV_SIMD256_64F 1
+#define CV_SIMD512 1
+#define CV_SIMD512_64F 1
+#else
+#define CV_SIMD256 0 // Explicitly disable SIMD256 and SIMD512 support for scalar intrinsic implementation
+#define CV_SIMD512 0 // to avoid warnings during compilation
+#endif
+//! @endcond
+
+namespace cv
+{
+
+#ifndef CV_DOXYGEN
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+#endif
+
+/** @addtogroup core_hal_intrin
+
+"Universal intrinsics" is a set of types and functions intended to simplify vectorization of code on
+different platforms. Currently a few different SIMD extensions on different architectures are supported.
+128 bit registers of various types support is implemented for a wide range of architectures
+including x86(__SSE/SSE2/SSE4.2__), ARM(__NEON__), PowerPC(__VSX__), MIPS(__MSA__).
+256 bit long registers are supported on x86(__AVX2__) and 512 bit long registers are supported on x86(__AVX512__).
+In case when there is no SIMD extension available during compilation, fallback C++ implementation of intrinsics
+will be chosen and code will work as expected although it could be slower.
+
+### Types
+
+There are several types representing vector registers of packed values; each type is
+implemented as a structure based on one SIMD register.
+
+- cv::v_uint8 and cv::v_int8: 8-bit integer values (unsigned/signed) - char
+- cv::v_uint16 and cv::v_int16: 16-bit integer values (unsigned/signed) - short
+- cv::v_uint32 and cv::v_int32: 32-bit integer values (unsigned/signed) - int
+- cv::v_uint64 and cv::v_int64: 64-bit integer values (unsigned/signed) - int64
+- cv::v_float32: 32-bit floating point values (signed) - float
+- cv::v_float64: 64-bit floating point values (signed) - double
+
+The exact bit length (and number of values) of the listed types is deduced at compile time and depends on the SIMD
+capabilities of the architecture chosen when the library is compiled. All the types contain the __nlanes__ enumeration,
+which can be used to check the exact number of values in the type.
+
+In case the exact bit length of the type is important it is possible to use specific fixed length register types.
+
+There are several types representing 128-bit registers.
+
+- cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char
+- cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short
+- cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int
+- cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64
+- cv::v_float32x4: four 32-bit floating point values (signed) - float
+- cv::v_float64x2: two 64-bit floating point values (signed) - double
+
+There are several types representing 256-bit registers.
+
+- cv::v_uint8x32 and cv::v_int8x32: thirty two 8-bit integer values (unsigned/signed) - char
+- cv::v_uint16x16 and cv::v_int16x16: sixteen 16-bit integer values (unsigned/signed) - short
+- cv::v_uint32x8 and cv::v_int32x8: eight 32-bit integer values (unsigned/signed) - int
+- cv::v_uint64x4 and cv::v_int64x4: four 64-bit integer values (unsigned/signed) - int64
+- cv::v_float32x8: eight 32-bit floating point values (signed) - float
+- cv::v_float64x4: four 64-bit floating point values (signed) - double
+
+@note
+256 bit registers are currently implemented for the AVX2 SIMD extension only; if you want to use these types directly,
+don't forget to check the CV_SIMD256 preprocessor definition:
+@code
+#if CV_SIMD256
+//...
+#endif
+@endcode
+
+There are several types representing 512-bit registers.
+
+- cv::v_uint8x64 and cv::v_int8x64: sixty four 8-bit integer values (unsigned/signed) - char
+- cv::v_uint16x32 and cv::v_int16x32: thirty two 16-bit integer values (unsigned/signed) - short
+- cv::v_uint32x16 and cv::v_int32x16: sixteen 32-bit integer values (unsigned/signed) - int
+- cv::v_uint64x8 and cv::v_int64x8: eight 64-bit integer values (unsigned/signed) - int64
+- cv::v_float32x16: sixteen 32-bit floating point values (signed) - float
+- cv::v_float64x8: eight 64-bit floating point values (signed) - double
+@note
+512 bit registers are currently implemented for the AVX512 SIMD extension only; if you want to use these types directly,
+don't forget to check the CV_SIMD512 preprocessor definition.
+
+@note
+cv::v_float64x2 is not implemented in the NEON variant; if you want to use this type, don't forget to
+check the CV_SIMD128_64F preprocessor definition.
+
+### Load and store operations
+
+These operations set the contents of a register, either explicitly or by loading them from a memory
+block, and save the contents of a register to a memory block.
+
+There are variable-size register load operations that return a register of the maximum available size,
+depending on the capabilities of the chosen platform.
+- Constructors:
+@ref v_reg::v_reg(const _Tp *ptr) "from memory",
+- Other create methods:
+vx_setall_s8, vx_setall_u8, ...,
+vx_setzero_u8, vx_setzero_s8, ...
+- Memory load operations:
+vx_load, vx_load_aligned, vx_load_low, vx_load_halves,
+- Memory operations with expansion of values:
+vx_load_expand, vx_load_expand_q
+
+Also there are fixed size register load/store operations.
+
+For 128 bit registers
+- Constructors:
+@ref v_reg::v_reg(const _Tp *ptr) "from memory",
+@ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ...
+- Other create methods:
+@ref v_setall_s8, @ref v_setall_u8, ...,
+@ref v_setzero_u8, @ref v_setzero_s8, ...
+- Memory load operations:
+@ref v_load, @ref v_load_aligned, @ref v_load_low, @ref v_load_halves,
+- Memory operations with expansion of values:
+@ref v_load_expand, @ref v_load_expand_q
+
+For 256 bit registers(check CV_SIMD256 preprocessor definition)
+- Constructors:
+@ref v_reg::v_reg(const _Tp *ptr) "from memory",
+@ref v_reg::v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) "from four values", ...
+- Other create methods:
+@ref v256_setall_s8, @ref v256_setall_u8, ...,
+@ref v256_setzero_u8, @ref v256_setzero_s8, ...
+- Memory load operations:
+@ref v256_load, @ref v256_load_aligned, @ref v256_load_low, @ref v256_load_halves,
+- Memory operations with expansion of values:
+@ref v256_load_expand, @ref v256_load_expand_q
+
+For 512 bit registers(check CV_SIMD512 preprocessor definition)
+- Constructors:
+@ref v_reg::v_reg(const _Tp *ptr) "from memory",
+@ref v_reg::v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, _Tp s4, _Tp s5, _Tp s6, _Tp s7) "from eight values", ...
+- Other create methods:
+@ref v512_setall_s8, @ref v512_setall_u8, ...,
+@ref v512_setzero_u8, @ref v512_setzero_s8, ...
+- Memory load operations:
+@ref v512_load, @ref v512_load_aligned, @ref v512_load_low, @ref v512_load_halves,
+- Memory operations with expansion of values:
+@ref v512_load_expand, @ref v512_load_expand_q
+
+Store to memory operations are similar across different platform capabilities:
+@ref v_store, @ref v_store_aligned,
+@ref v_store_high, @ref v_store_low
+
+### Value reordering
+
+These operations reorder or recombine elements in one or multiple vectors.
+
+- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
+- Expand: @ref v_expand, @ref v_expand_low, @ref v_expand_high
+- Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u,
+@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
+- Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
+- Reverse: @ref v_reverse
+- Extract: @ref v_extract
+
+
+### Arithmetic, bitwise and comparison operations
+
+Element-wise binary and unary operations.
+
+- Arithmetics:
+@ref operator +(const v_reg &a, const v_reg &b) "+",
+@ref operator -(const v_reg &a, const v_reg &b) "-",
+@ref operator *(const v_reg &a, const v_reg &b) "*",
+@ref operator /(const v_reg &a, const v_reg &b) "/",
+@ref v_mul_expand
+
+- Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
+
+- Bitwise shifts:
+@ref operator <<(const v_reg &a, int s) "<<",
+@ref operator >>(const v_reg &a, int s) ">>",
+@ref v_shl, @ref v_shr
+
+- Bitwise logic:
+@ref operator &(const v_reg &a, const v_reg &b) "&",
+@ref operator |(const v_reg &a, const v_reg &b) "|",
+@ref operator ^(const v_reg &a, const v_reg &b) "^",
+@ref operator ~(const v_reg &a) "~"
+
+- Comparison:
+@ref operator >(const v_reg &a, const v_reg &b) ">",
+@ref operator >=(const v_reg &a, const v_reg &b) ">=",
+@ref operator <(const v_reg &a, const v_reg &b) "<",
+@ref operator <=(const v_reg &a, const v_reg &b) "<=",
+@ref operator ==(const v_reg &a, const v_reg &b) "==",
+@ref operator !=(const v_reg &a, const v_reg &b) "!="
+
+- min/max: @ref v_min, @ref v_max
+
+### Reduce and mask
+
+Most of these operations return only one value.
+
+- Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum, @ref v_popcount
+- Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
+
+### Other math
+
+- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
+- Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
+
+### Conversions
+
+Different type conversions and casts:
+
+- Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc,
+- To float: @ref v_cvt_f32, @ref v_cvt_f64
+- Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ...
+
+### Matrix operations
+
+In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_dotprod_fast,
+@ref v_dotprod_expand, @ref v_dotprod_expand_fast, @ref v_matmul, @ref v_transpose4x4
+
+### Usability
+
+Most operations are implemented only for some subset of the available types; the following tables
+show the applicability of different operations to the types.
+
+Regular integers:
+
+| Operations\\Types | uint 8 | int 8 | uint 16 | int 16 | uint 32 | int 32 |
+|-------------------|:-:|:-:|:-:|:-:|:-:|:-:|
+|load, store        | x | x | x | x | x | x |
+|interleave         | x | x | x | x | x | x |
+|expand             | x | x | x | x | x | x |
+|expand_low         | x | x | x | x | x | x |
+|expand_high        | x | x | x | x | x | x |
+|expand_q           | x | x |   |   |   |   |
+|add, sub           | x | x | x | x | x | x |
+|add_wrap, sub_wrap | x | x | x | x |   |   |
+|mul_wrap           | x | x | x | x |   |   |
+|mul                | x | x | x | x | x | x |
+|mul_expand         | x | x | x | x | x |   |
+|compare            | x | x | x | x | x | x |
+|shift              |   |   | x | x | x | x |
+|dotprod            |   |   |   | x |   | x |
+|dotprod_fast       |   |   |   | x |   | x |
+|dotprod_expand     | x | x | x | x |   | x |
+|dotprod_expand_fast| x | x | x | x |   | x |
+|logical            | x | x | x | x | x | x |
+|min, max           | x | x | x | x | x | x |
+|absdiff            | x | x | x | x | x | x |
+|absdiffs           |   | x |   | x |   |   |
+|reduce             | x | x | x | x | x | x |
+|mask               | x | x | x | x | x | x |
+|pack               | x | x | x | x | x | x |
+|pack_u             | x |   | x |   |   |   |
+|pack_b             | x |   |   |   |   |   |
+|unpack             | x | x | x | x | x | x |
+|extract            | x | x | x | x | x | x |
+|rotate (lanes)     | x | x | x | x | x | x |
+|cvt_flt32          |   |   |   |   |   | x |
+|cvt_flt64          |   |   |   |   |   | x |
+|transpose4x4       |   |   |   |   | x | x |
+|reverse            | x | x | x | x | x | x |
+|extract_n          | x | x | x | x | x | x |
+|broadcast_element  |   |   |   |   | x | x |
+
+Big integers:
+
+| Operations\\Types | uint 64 | int 64 |
+|-------------------|:-:|:-:|
+|load, store        | x | x |
+|add, sub           | x | x |
+|shift              | x | x |
+|logical            | x | x |
+|reverse            | x | x |
+|extract            | x | x |
+|rotate (lanes)     | x | x |
+|cvt_flt64          |   | x |
+|extract_n          | x | x |
+
+Floating point:
+
+| Operations\\Types | float 32 | float 64 |
+|-------------------|:-:|:-:|
+|load, store        | x | x |
+|interleave         | x |   |
+|add, sub           | x | x |
+|mul                | x | x |
+|div                | x | x |
+|compare            | x | x |
+|min, max           | x | x |
+|absdiff            | x | x |
+|reduce             | x |   |
+|mask               | x | x |
+|unpack             | x | x |
+|cvt_flt32          |   | x |
+|cvt_flt64          | x |   |
+|sqrt, abs          | x | x |
+|float math         | x | x |
+|transpose4x4       | x |   |
+|extract            | x | x |
+|rotate (lanes)     | x | x |
+|reverse            | x | x |
+|extract_n          | x | x |
+|broadcast_element  | x |   |
+
+ @{ */
+
+template<typename _Tp, int n> struct v_reg // scalar emulation of a SIMD register: n lanes of type _Tp
+{
+//! @cond IGNORED
+    typedef _Tp lane_type;
+    enum { nlanes = n };
+//! @endcond
+
+    /** @brief Constructor
+
+    Initializes register with data from memory
+    @param ptr pointer to memory block with data for register; n consecutive values are read */
+    explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
+
+    /** @brief Constructor
+
+    Initializes register with two 64-bit values (only lanes 0 and 1 are written) */
+    v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
+
+    /** @brief Constructor
+
+    Initializes register with four 32-bit values (only lanes 0..3 are written) */
+    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
+
+    /** @brief Constructor
+
+    Initializes register with eight 16-bit values (only lanes 0..7 are written) */
+    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
+           _Tp s4, _Tp s5, _Tp s6, _Tp s7)
+    {
+        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
+        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
+    }
+
+    /** @brief Constructor
+
+    Initializes register with sixteen 8-bit values (only lanes 0..15 are written) */
+    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
+           _Tp s4, _Tp s5, _Tp s6, _Tp s7,
+           _Tp s8, _Tp s9, _Tp s10, _Tp s11,
+           _Tp s12, _Tp s13, _Tp s14, _Tp s15)
+    {
+        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
+        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
+        s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
+        s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
+    }
+
+    /** @brief Default constructor
+
+    Does not initialize anything; lane values are indeterminate until assigned */
+    v_reg() {}
+
+    /** @brief Copy constructor (copies all n lanes) */
+    v_reg(const v_reg<_Tp, n> & r)
+    {
+        for( int i = 0; i < n; i++ )
+            s[i] = r.s[i];
+    }
+    /** @brief Access first value
+
+    Returns value of the first lane according to register type, for example:
+    @code{.cpp}
+    v_int32x4 r(1, 2, 3, 4);
+    int v = r.get0(); // returns 1
+    v_uint64x2 q(1, 2);
+    uint64_t w = q.get0(); // returns 1
+    @endcode
+    */
+    _Tp get0() const { return s[0]; }
+
+//! @cond IGNORED
+    _Tp get(const int i) const { return s[i]; } // value of lane i; the index is not range-checked
+    v_reg<_Tp, n> high() const // upper n/2 lanes moved to the lower half, upper half zeroed
+    {
+        v_reg<_Tp, n> c;
+        int i;
+        for( i = 0; i < n/2; i++ )
+        {
+            c.s[i] = s[i+(n/2)];
+            c.s[i+(n/2)] = 0;
+        }
+        return c;
+    }
+
+    static v_reg<_Tp, n> zero() // register with every lane set to 0
+    {
+        v_reg<_Tp, n> c;
+        for( int i = 0; i < n; i++ )
+            c.s[i] = (_Tp)0;
+        return c;
+    }
+
+    static v_reg<_Tp, n> all(_Tp s) // register with every lane set to s
+    {
+        v_reg<_Tp, n> c;
+        for( int i = 0; i < n; i++ )
+            c.s[i] = s;
+        return c;
+    }
+
+    template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const // raw byte copy into another lane layout
+    {
+        size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n); // copy no more than either register holds
+        v_reg<_Tp2, n2> c;
+        std::memcpy(&c.s[0], &s[0], bytes);
+        return c;
+    }
+
+    v_reg& operator=(const v_reg<_Tp, n> & r) // lane-by-lane copy
+    {
+        for( int i = 0; i < n; i++ )
+            s[i] = r.s[i];
+        return *this;
+    }
+
+    _Tp s[n]; // lane storage
+//! @endcond
+};
+
+/** @brief 128-bit register: sixteen unsigned 8-bit integer lanes */
+using v_uint8x16 = v_reg<uchar, 16>;
+/** @brief 128-bit register: sixteen signed 8-bit integer lanes */
+using v_int8x16 = v_reg<schar, 16>;
+/** @brief 128-bit register: eight unsigned 16-bit integer lanes */
+using v_uint16x8 = v_reg<ushort, 8>;
+/** @brief 128-bit register: eight signed 16-bit integer lanes */
+using v_int16x8 = v_reg<short, 8>;
+/** @brief 128-bit register: four unsigned 32-bit integer lanes */
+using v_uint32x4 = v_reg<unsigned, 4>;
+/** @brief 128-bit register: four signed 32-bit integer lanes */
+using v_int32x4 = v_reg<int, 4>;
+/** @brief 128-bit register: four single-precision (32-bit) floating point lanes */
+using v_float32x4 = v_reg<float, 4>;
+/** @brief 128-bit register: two double-precision (64-bit) floating point lanes */
+using v_float64x2 = v_reg<double, 2>;
+/** @brief 128-bit register: two unsigned 64-bit integer lanes */
+using v_uint64x2 = v_reg<uint64, 2>;
+/** @brief 128-bit register: two signed 64-bit integer lanes */
+using v_int64x2 = v_reg<int64, 2>;
+
+#if CV_SIMD256
+/** @brief 256-bit register: thirty two unsigned 8-bit integer lanes */
+using v_uint8x32 = v_reg<uchar, 32>;
+/** @brief 256-bit register: thirty two signed 8-bit integer lanes */
+using v_int8x32 = v_reg<schar, 32>;
+/** @brief 256-bit register: sixteen unsigned 16-bit integer lanes */
+using v_uint16x16 = v_reg<ushort, 16>;
+/** @brief 256-bit register: sixteen signed 16-bit integer lanes */
+using v_int16x16 = v_reg<short, 16>;
+/** @brief 256-bit register: eight unsigned 32-bit integer lanes */
+using v_uint32x8 = v_reg<unsigned, 8>;
+/** @brief 256-bit register: eight signed 32-bit integer lanes */
+using v_int32x8 = v_reg<int, 8>;
+/** @brief 256-bit register: eight single-precision (32-bit) floating point lanes */
+using v_float32x8 = v_reg<float, 8>;
+/** @brief 256-bit register: four double-precision (64-bit) floating point lanes */
+using v_float64x4 = v_reg<double, 4>;
+/** @brief 256-bit register: four unsigned 64-bit integer lanes */
+using v_uint64x4 = v_reg<uint64, 4>;
+/** @brief 256-bit register: four signed 64-bit integer lanes */
+using v_int64x4 = v_reg<int64, 4>;
+#endif
+
+#if CV_SIMD512
+/** @brief 512-bit register: sixty four unsigned 8-bit integer lanes */
+using v_uint8x64 = v_reg<uchar, 64>;
+/** @brief 512-bit register: sixty four signed 8-bit integer lanes */
+using v_int8x64 = v_reg<schar, 64>;
+/** @brief 512-bit register: thirty two unsigned 16-bit integer lanes */
+using v_uint16x32 = v_reg<ushort, 32>;
+/** @brief 512-bit register: thirty two signed 16-bit integer lanes */
+using v_int16x32 = v_reg<short, 32>;
+/** @brief 512-bit register: sixteen unsigned 32-bit integer lanes */
+using v_uint32x16 = v_reg<unsigned, 16>;
+/** @brief 512-bit register: sixteen signed 32-bit integer lanes */
+using v_int32x16 = v_reg<int, 16>;
+/** @brief 512-bit register: sixteen single-precision (32-bit) floating point lanes */
+using v_float32x16 = v_reg<float, 16>;
+/** @brief 512-bit register: eight double-precision (64-bit) floating point lanes */
+using v_float64x8 = v_reg<double, 8>;
+/** @brief 512-bit register: eight unsigned 64-bit integer lanes */
+using v_uint64x8 = v_reg<uint64, 8>;
+/** @brief 512-bit register: eight signed 64-bit integer lanes */
+using v_int64x8 = v_reg<int64, 8>;
+#endif
+
+enum { // register widths; presumably in bytes (16 matches the 16 uchar lanes of v_uint8x16)
+    simd128_width = 16,
+#if CV_SIMD256
+    simd256_width = 32,
+#endif
+#if CV_SIMD512
+    simd512_width = 64,
+    simdmax_width = simd512_width // widest register width enabled by the CV_SIMD* switches
+#elif CV_SIMD256
+    simdmax_width = simd256_width
+#else
+    simdmax_width = simd128_width
+#endif
+};
+
+/** @brief Add values
+
+For all types. */
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+/** @brief Subtract values
+
+For all types. */
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+/** @brief Multiply values
+
+For 8-, 16- and 32-bit integer types and floating types. */
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+/** @brief Divide values
+
+For floating types only. */
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+
+/** @brief Bitwise AND
+
+Only for integer types. */
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+/** @brief Bitwise OR
+
+Only for integer types. */
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+/** @brief Bitwise XOR
+
+Only for integer types.*/
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+/** @brief Bitwise NOT
+
+Only for integer types.*/
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a);
+
+
+#ifndef CV_DOXYGEN
+
+#define CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, ...) /* invokes macro_name(T, ...) once per integer lane type */ \
+__CV_EXPAND(macro_name(uchar, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(schar, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(ushort, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(short, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(unsigned, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(int, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(uint64, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(int64, __VA_ARGS__)) \
+
+#define CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, ...) /* invokes macro_name(T, ...) for float and double */ \
+__CV_EXPAND(macro_name(float, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(double, __VA_ARGS__)) \
+
+#define CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(macro_name, ...) /* integer lane types followed by floating point ones */ \
+CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \
+CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \
+
+#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) /* defines lane-wise operators bin_op and bin_op= for v_reg<_Tp, n> */ \
+template<int n> inline \
+v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tp, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); /* each lane result is converted back to _Tp via saturate_cast */ \
+    return c; \
+} \
+template<int n> inline \
+v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    for( int i = 0; i < n; i++ ) \
+        a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
+    return a; \
+}
+
+#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op)
+
+CV__HAL_INTRIN_IMPL_BIN_OP(+)
+CV__HAL_INTRIN_IMPL_BIN_OP(-)
+CV__HAL_INTRIN_IMPL_BIN_OP(*)
+CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /) // division is generated for float and double only
+
+#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) /* defines operators bit_op and bit_op= working on the integer image of each lane */ \
+template<int n> CV_INLINE \
+v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tp, n> c; \
+    typedef typename V_TypeTraits<_Tp>::int_type itype; /* integer carrier for a lane's bits; presumably same width as _Tp - confirm in V_TypeTraits */ \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
+                                                        V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
+    return c; \
+} \
+template<int n> CV_INLINE \
+v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    typedef typename V_TypeTraits<_Tp>::int_type itype; \
+    for( int i = 0; i < n; i++ ) \
+        a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
+                                                        V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
+    return a; \
+}
+
+#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) /* instantiates the operator pair for every integer and FP lane type */ \
+CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \
+CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */
+
+
+CV__HAL_INTRIN_IMPL_BIT_OP(&)
+CV__HAL_INTRIN_IMPL_BIT_OP(|)
+CV__HAL_INTRIN_IMPL_BIT_OP(^)
+
+#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) /* defines lane-wise operator ~; dummy is unused and only fills the expander's extra argument */ \
+template<int n> CV_INLINE \
+v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \
+{ \
+    v_reg<_Tp, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); \
+    return c; \
+} \
+
+CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~) // generated for integer lane types only
+
+#endif  // !CV_DOXYGEN
+
+
+//! @brief Helper macro: defines func(a) that applies cfunc to every lane of a and returns v_reg<_Tp2, n>
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
+template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
+{ \
+    v_reg<_Tp2, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = cfunc(a.s[i]); \
+    return c; \
+}
+
+/** @brief Square root of elements
+
+Only for floating point types.*/
+OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)  // v_sqrt(a): std::sqrt on each lane, same lane type
+
+//! @cond IGNORED
+OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)  // v_sin(a): std::sin on each lane
+OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)  // v_cos(a): std::cos on each lane
+OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)  // v_exp(a): std::exp on each lane
+OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)  // v_log(a): std::log on each lane
+//! @endcond
+
+/** @brief Absolute value of elements
+
+Only for floating point types.*/
+OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
+                          typename V_TypeTraits<_Tp>::abs_type)  // result lanes use the traits' abs_type
+
+//! @brief Helper macro
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) /* func(a, b): cfunc applied to each pair of lanes */ \
+template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tp, n> picked; \
+    for( int lane = 0; lane < n; ++lane ) \
+        picked.s[lane] = cfunc(a.s[lane], b.s[lane]); \
+    return picked; \
+}
+
+//! @brief Helper macro
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) /* func(a): folds cfunc over all lanes, seeded with lane 0 */ \
+template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
+{ \
+    _Tp best = a.s[0]; \
+    for( int lane = 1; lane < n; ++lane ) \
+        best = cfunc(best, a.s[lane]); \
+    return best; \
+}
+
+/** @brief Choose min values for each pair
+
+Scheme:
+@code
+{A1 A2 ...}
+{B1 B2 ...}
+--------------
+{min(A1,B1) min(A2,B2) ...}
+@endcode
+For all types except 64-bit integer. */
+OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)  // v_min(a, b): std::min per pair of lanes
+
+/** @brief Choose max values for each pair
+
+Scheme:
+@code
+{A1 A2 ...}
+{B1 B2 ...}
+--------------
+{max(A1,B1) max(A2,B2) ...}
+@endcode
+For all types except 64-bit integer. */
+OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)  // v_max(a, b): std::max per pair of lanes
+
+/** @brief Find one min value
+
+Scheme:
+@code
+{A1 A2 A3 ...} => min(A1,A2,A3,...)
+@endcode
+For all types except 64-bit integer and 64-bit floating point types. */
+OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)  // v_reduce_min(a): std::min folded over the lanes
+
+/** @brief Find one max value
+
+Scheme:
+@code
+{A1 A2 A3 ...} => max(A1,A2,A3,...)
+@endcode
+For all types except 64-bit integer and 64-bit floating point types. */
+OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)  // v_reduce_max(a): std::max folded over the lanes
+
+static const unsigned char popCountTable[] =  // popCountTable[b] is the number of set bits in byte value b (256 entries)
+{
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
+};
+/** @brief Count the 1 bits in the vector lanes and return result as corresponding unsigned type
+
+Scheme:
+@code
+{A1 A2 A3 ...} => {popcount(A1), popcount(A2), popcount(A3), ...}
+@endcode
+For all integer types. */
+template<typename _Tp, int n>
+inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_popcount(const v_reg<_Tp, n>& a)
+{
+    v_reg<typename V_TypeTraits<_Tp>::abs_type, n> counts = v_reg<typename V_TypeTraits<_Tp>::abs_type, n>::zero();
+    for (int byteIdx = 0; byteIdx < n*(int)sizeof(_Tp); ++byteIdx)  // visit every byte of the register
+        counts.s[byteIdx/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[byteIdx]];  // add to the owning lane
+    return counts;
+}
+
+
+//! @cond IGNORED
+template<typename _Tp, int n>
+inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                      v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
+{
+    for( int i = 0; i < n; i++ )  // per lane: minval gets the smaller value, maxval the larger
+    {
+        const _Tp lo = std::min(a.s[i], b.s[i]), hi = std::max(a.s[i], b.s[i]);  // read both inputs before any store
+        minval.s[i] = lo; maxval.s[i] = hi;  // safe even when minval/maxval alias a or b (in-place use)
+    }
+}
+//! @endcond
+
+//! @brief Helper macro
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) /* per-lane comparison: all-ones bit pattern when true, zero when false */ \
+template<typename _Tp, int n> \
+inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    typedef typename V_TypeTraits<_Tp>::int_type mask_int; \
+    v_reg<_Tp, n> mask; \
+    for( int lane = 0; lane < n; ++lane ) \
+        mask.s[lane] = V_TypeTraits<_Tp>::reinterpret_from_int((mask_int)-(int)(a.s[lane] cmp_op b.s[lane])); \
+    return mask; \
+}
+
+/** @brief Less-than comparison
+
+For all types except 64-bit integer values. */
+OPENCV_HAL_IMPL_CMP_OP(<)  // operator <  returning a per-lane mask
+
+/** @brief Greater-than comparison
+
+For all types except 64-bit integer values. */
+OPENCV_HAL_IMPL_CMP_OP(>)  // operator >  returning a per-lane mask
+
+/** @brief Less-than or equal comparison
+
+For all types except 64-bit integer values. */
+OPENCV_HAL_IMPL_CMP_OP(<=)  // operator <= returning a per-lane mask
+
+/** @brief Greater-than or equal comparison
+
+For all types except 64-bit integer values. */
+OPENCV_HAL_IMPL_CMP_OP(>=)  // operator >= returning a per-lane mask
+
+/** @brief Equal comparison */
+OPENCV_HAL_IMPL_CMP_OP(==)  // operator == returning a per-lane mask
+
+/** @brief Not equal comparison */
+OPENCV_HAL_IMPL_CMP_OP(!=)  // operator != returning a per-lane mask
+
+template<int n>
+inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
+{
+    typedef typename V_TypeTraits<float>::int_type mask_int;
+    v_reg<float, n> mask;
+    for (int lane = 0; lane < n; ++lane)  // a lane compares equal to itself unless it holds NaN
+        mask.s[lane] = V_TypeTraits<float>::reinterpret_from_int((mask_int)-(int)(a.s[lane] == a.s[lane]));
+    return mask;
+}
+template<int n>
+inline v_reg<double, n> v_not_nan(const v_reg<double, n>& a)
+{
+    typedef typename V_TypeTraits<double>::int_type mask_int;
+    v_reg<double, n> mask;
+    for (int lane = 0; lane < n; ++lane)  // a lane compares equal to itself unless it holds NaN
+        mask.s[lane] = V_TypeTraits<double>::reinterpret_from_int((mask_int)-(int)(a.s[lane] == a.s[lane]));
+    return mask;
+}
+
+//! @brief Helper macro
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) /* func(a, b): cast_op(a bin_op b) per lane */ \
+template<typename _Tp, int n> \
+inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    typedef _Tp2 out_lane; \
+    v_reg<out_lane, n> out; \
+    for( int lane = 0; lane < n; ++lane ) \
+        out.s[lane] = cast_op(a.s[lane] bin_op b.s[lane]); \
+    return out; \
+}
+
+/** @brief Add values without saturation
+
+For 8- and 16-bit integer values. */
+OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp)  // v_add_wrap(a, b): (_Tp)(a + b) per lane
+
+/** @brief Subtract values without saturation
+
+For 8- and 16-bit integer values. */
+OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp)  // v_sub_wrap(a, b): (_Tp)(a - b) per lane
+
+/** @brief Multiply values without saturation
+
+For 8- and 16-bit integer values. */
+OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp)  // v_mul_wrap(a, b): (_Tp)(a * b) per lane
+
+//! @cond IGNORED
+template<typename T> inline T _absdiff(T a, T b)
+{
+    return b < a ? a - b : b - a;  // subtract the smaller operand from the larger one
+}
+//! @endcond
+
+/** @brief Absolute difference
+
+Returns \f$ |a - b| \f$ converted to corresponding unsigned type.
+Example:
+@code{.cpp}
+v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1}
+v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3}
+@endcode
+For 8-, 16-, 32-bit integer source types. */
+template<typename _Tp, int n>
+inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
+{
+    typedef typename V_TypeTraits<_Tp>::abs_type rtype;
+    v_reg<rtype, n> c;
+    const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? ((rtype)1 << (sizeof(rtype)*8 - 1)) : 0);  // sign bit; shift an rtype 1 so a signed int is never shifted into its sign bit
+    for( int i = 0; i < n; i++ )
+    {
+        rtype ua = a.s[i] ^ mask;  // flipping the sign bit maps signed order onto unsigned order
+        rtype ub = b.s[i] ^ mask;  // (mask is 0 for unsigned lane types, leaving the value unchanged)
+        c.s[i] = _absdiff(ua, ub);  // unsigned larger-minus-smaller equals |a - b|
+    }
+    return c;
+}
+
+/** @overload
+
+For 32-bit floating point values */
+template<int n> inline v_reg<float, n> v_absdiff(const v_reg<float, n>& a, const v_reg<float, n>& b)
+{
+    v_reg<float, n> diff;
+    for( int lane = 0; lane < diff.nlanes; ++lane )  // per-lane |a - b|
+        diff.s[lane] = _absdiff(a.s[lane], b.s[lane]);
+    return diff;
+}
+
+/** @overload
+
+For 64-bit floating point values */
+template<int n> inline v_reg<double, n> v_absdiff(const v_reg<double, n>& a, const v_reg<double, n>& b)
+{
+    v_reg<double, n> diff;
+    for( int lane = 0; lane < diff.nlanes; ++lane )  // per-lane |a - b|
+        diff.s[lane] = _absdiff(a.s[lane], b.s[lane]);
+    return diff;
+}
+
+/** @brief Saturating absolute difference
+
+Returns \f$ saturate(|a - b|) \f$ .
+For 8-, 16-bit signed integer source types. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> clamped;
+    for( int lane = 0; lane < n; ++lane )  // |a - b| saturated back into the lane type
+        clamped.s[lane] = saturate_cast<_Tp>(std::abs(a.s[lane] - b.s[lane]));
+    return clamped;
+}
+
+/** @brief Inverse square root
+
+Returns \f$ 1/sqrt(a) \f$
+For floating point types only. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
+{
+    v_reg<_Tp, n> inv;
+    for( int lane = 0; lane < n; ++lane )  // reciprocal of the square root, lane by lane
+        inv.s[lane] = 1.f/std::sqrt(a.s[lane]);
+    return inv;
+}
+
+/** @brief Magnitude
+
+Returns \f$ sqrt(a^2 + b^2) \f$
+For floating point types only. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> mag;
+    for( int lane = 0; lane < n; ++lane )  // sqrt(a^2 + b^2) per lane
+        mag.s[lane] = std::sqrt(a.s[lane]*a.s[lane] + b.s[lane]*b.s[lane]);
+    return mag;
+}
+
+/** @brief Square of the magnitude
+
+Returns \f$ a^2 + b^2 \f$
+For floating point types only. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> sq;
+    for( int lane = 0; lane < n; ++lane )  // a^2 + b^2 per lane, no square root
+        sq.s[lane] = a.s[lane]*a.s[lane] + b.s[lane]*b.s[lane];
+    return sq;
+}
+
+/** @brief Multiply and add
+
+ Returns \f$ a*b + c \f$
+ For floating point types and signed 32bit int only. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                           const v_reg<_Tp, n>& c)
+{
+    v_reg<_Tp, n> acc;
+    for( int lane = 0; lane < n; ++lane )  // a*b + c per lane, written as a plain multiply followed by an add
+        acc.s[lane] = a.s[lane]*b.s[lane] + c.s[lane];
+    return acc;
+}
+
+/** @brief A synonym for v_fma */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                              const v_reg<_Tp, n>& c)
+{
+    return v_fma(a, b, c);  // pure alias: forwards all three operands unchanged
+}
+
+/** @brief Dot product of elements
+
+Multiply values in two registers and sum adjacent result pairs.
+
+Scheme:
+@code
+  {A1 A2 ...} // 16-bit
+x {B1 B2 ...} // 16-bit
+-------------
+{A1B1+A2B2 ...} // 32-bit
+
+@endcode
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
+v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
+    v_reg<wide_t, n/2> sums;
+    for( int pair = 0; pair < (n/2); ++pair )  // widen, multiply, then add each adjacent pair of products
+        sums.s[pair] = (wide_t)a.s[pair*2]*b.s[pair*2] + (wide_t)a.s[pair*2+1]*b.s[pair*2+1];
+    return sums;
+}
+
+/** @brief Dot product of elements
+
+Same as cv::v_dotprod, but add a third element to the sum of adjacent pairs.
+Scheme:
+@code
+  {A1 A2 ...} // 16-bit
+x {B1 B2 ...} // 16-bit
+-------------
+  {A1B1+A2B2+C1 ...} // 32-bit
+@endcode
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
+v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+          const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
+    v_reg<wide_t, n/2> sums;
+    for( int pair = 0; pair < (n/2); ++pair )  // pairwise products summed, plus the matching lane of c
+        sums.s[pair] = (wide_t)a.s[pair*2]*b.s[pair*2] + (wide_t)a.s[pair*2+1]*b.s[pair*2+1] + c.s[pair];
+    return sums;
+}
+
+/** @brief Fast Dot product of elements
+
+Same as cv::v_dotprod, but it may perform an unordered sum between result pairs on some platforms;
+this intrinsic can be used if only the sum among all lanes matters,
+and it should yield better performance on the affected platforms.
+
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
+v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{ return v_dotprod(a, b); }  // this implementation simply forwards to the ordered v_dotprod
+
+/** @brief Fast Dot product of elements
+
+Same as cv::v_dotprod_fast, but add a third element to the sum of adjacent pairs.
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
+v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+               const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
+{ return v_dotprod(a, b, c); }  // this implementation simply forwards to the ordered v_dotprod
+
+/** @brief Dot product of elements and expand
+
+Multiply values in two registers and expand the sum of adjacent result pairs.
+
+Scheme:
+@code
+  {A1 A2 A3 A4 ...} // 8-bit
+x {B1 B2 B3 B4 ...} // 8-bit
+-------------
+  {A1B1+A2B2+A3B3+A4B4 ...} // 32-bit
+
+@endcode
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
+v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    typedef typename V_TypeTraits<_Tp>::q_type quad_t;
+    v_reg<quad_t, n/4> sums;
+    for( int g = 0; g < (n/4); ++g )  // each output lane sums four adjacent widened products
+        sums.s[g] = (quad_t)a.s[g*4    ]*b.s[g*4    ] + (quad_t)a.s[g*4 + 1]*b.s[g*4 + 1] +
+                    (quad_t)a.s[g*4 + 2]*b.s[g*4 + 2] + (quad_t)a.s[g*4 + 3]*b.s[g*4 + 3];
+    return sums;
+}
+
+/** @brief Dot product of elements
+
+Same as cv::v_dotprod_expand, but add a third element to the sum of adjacent pairs.
+Scheme:
+@code
+  {A1 A2 A3 A4 ...} // 8-bit
+x {B1 B2 B3 B4 ...} // 8-bit
+-------------
+  {A1B1+A2B2+A3B3+A4B4+C1 ...} // 32-bit
+@endcode
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
+v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                 const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
+{
+    typedef typename V_TypeTraits<_Tp>::q_type quad_t;
+    v_reg<quad_t, n/4> sums;
+    for( int g = 0; g < (n/4); ++g )  // four adjacent widened products plus the matching lane of c
+        sums.s[g] = (quad_t)a.s[g*4    ]*b.s[g*4    ] + (quad_t)a.s[g*4 + 1]*b.s[g*4 + 1] +
+                    (quad_t)a.s[g*4 + 2]*b.s[g*4 + 2] + (quad_t)a.s[g*4 + 3]*b.s[g*4 + 3] + c.s[g];
+    return sums;
+}
+
+/** @brief Fast Dot product of elements and expand
+
+Multiply values in two registers and expand the sum of adjacent result pairs.
+
+Same as cv::v_dotprod_expand, but it may perform an unordered sum between result pairs on some platforms;
+this intrinsic can be used if only the sum among all lanes matters,
+and it should yield better performance on the affected platforms.
+
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
+v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{ return v_dotprod_expand(a, b); }  // this implementation simply forwards to the ordered v_dotprod_expand
+
+/** @brief Fast Dot product of elements
+
+Same as cv::v_dotprod_expand_fast, but add a third element to the sum of adjacent pairs.
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
+v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                      const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
+{ return v_dotprod_expand(a, b, c); }  // this implementation simply forwards to the ordered v_dotprod_expand
+
+/** @brief Multiply and expand
+
+Multiply the values of two registers and store the results in two registers with a wider pack type.
+Scheme:
+@code
+  {A B C D} // 32-bit
+x {E F G H} // 32-bit
+---------------
+{AE BF}         // 64-bit
+        {CG DH} // 64-bit
+@endcode
+Example:
+@code{.cpp}
+v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2}
+v_uint64x2 c, d; // results
+v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8}
+@endcode
+Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4).
+*/
+template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                                                       v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
+                                                       v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
+    for( int k = 0; k < (n/2); ++k )
+    {
+        c.s[k] = (wide_t)a.s[k]*b.s[k];              // products of the lower half go to c
+        d.s[k] = (wide_t)a.s[k+(n/2)]*b.s[k+(n/2)];  // products of the upper half go to d
+    }
+}
+
+/** @brief Multiply and extract high part
+
+Multiply the values of two registers and store the high part of the results.
+Implemented only for 16-bit source types (v_int16x8, v_uint16x8). Returns \f$ a*b >> 16 \f$
+*/
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
+    v_reg<_Tp, n> high;
+    for (int lane = 0; lane < n; ++lane)  // widen, multiply, keep the bits above the lane width
+        high.s[lane] = (_Tp)(((wide_t)a.s[lane] * b.s[lane]) >> sizeof(_Tp)*8);
+    return high;
+}
+
+//! @cond IGNORED
+template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
+                                                 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
+    for( int pair = 0; pair < (n/2); ++pair )
+    {
+        c.s[pair] = (wide_t)a.s[pair*2] + a.s[pair*2+1];  // widened sum of each adjacent pair
+    }
+}
+//! @endcond
+
+//! @brief Helper macro
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) /* shifts every lane by the same run-time bit count */ \
+template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
+{ \
+    v_reg<_Tp, n> shifted; \
+    for( int lane = 0; lane < n; ++lane ) \
+        shifted.s[lane] = (_Tp)(a.s[lane] shift_op imm); \
+    return shifted; \
+}
+
+/** @brief Bitwise shift left
+
+For 16-, 32- and 64-bit integer values. */
+OPENCV_HAL_IMPL_SHIFT_OP(<< )  // operator << (register, int)
+
+/** @brief Bitwise shift right
+
+For 16-, 32- and 64-bit integer values. */
+OPENCV_HAL_IMPL_SHIFT_OP(>> )  // operator >> (register, int)
+
+//! @brief Helper macro
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
+template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
+{ \
+    v_reg<_Tp, n> shifted; \
+    for (int lane = 0; lane < n; ++lane) \
+    { \
+        const int src = lane opA imm; \
+        if (src < 0 || src >= n) \
+        { \
+            shifted.s[lane] = 0; /* source lane falls outside the register */ \
+        } \
+        else \
+        { \
+            shifted.s[lane] = a.s[src]; \
+        } \
+    } \
+    return shifted; \
+} \
+template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tp, n> merged; \
+    for (int lane = 0; lane < n; ++lane) \
+    { \
+        const int fromA = lane opA imm; \
+        const int fromB = lane opA imm opB n; \
+        if (0 <= fromB && fromB < n) \
+        { \
+            merged.s[lane] = b.s[fromB]; /* lanes vacated by a are filled from b */ \
+        } \
+        else if (0 <= fromA && fromA < n) \
+        { \
+            merged.s[lane] = a.s[fromA]; \
+        } \
+        else \
+        { \
+            merged.s[lane] = 0; \
+        } \
+    } \
+    return merged; \
+}
+
+/** @brief Element shift left among vector
+
+For all types */
+OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left,  -, +)  // v_rotate_left: result lane i reads source lane i - imm
+
+/** @brief Element shift right among vector
+
+For all types */
+OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)  // v_rotate_right: result lane i reads source lane i + imm
+
+/** @brief Sum packed values
+
+Scheme:
+@code
+{A1 A2 A3 ...} => sum{A1,A2,A3,...}
+@endcode
+*/
+template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
+{
+    typename V_TypeTraits<_Tp>::sum_type total = a.s[0];  // accumulate in the traits' sum_type
+    for( int lane = 1; lane < n; ++lane )
+        total += a.s[lane];
+    return total;
+}
+
+/** @brief Sums all elements of each input vector, returns the vector of sums
+
+ Scheme:
+ @code
+ result[0] = a[0] + a[1] + a[2] + a[3]
+ result[1] = b[0] + b[1] + b[2] + b[3]
+ result[2] = c[0] + c[1] + c[2] + c[3]
+ result[3] = d[0] + d[1] + d[2] + d[3]
+ @endcode
+*/
+template<int n> inline v_reg<float, n> v_reduce_sum4(const v_reg<float, n>& a, const v_reg<float, n>& b,
+    const v_reg<float, n>& c, const v_reg<float, n>& d)
+{
+    v_reg<float, n> sums;
+    for(int q = 0; q < (n/4); ++q)  // one group of four lanes at a time
+    {
+        sums.s[q*4 + 0] = a.s[q*4 + 0] + a.s[q*4 + 1] + a.s[q*4 + 2] + a.s[q*4 + 3];
+        sums.s[q*4 + 1] = b.s[q*4 + 0] + b.s[q*4 + 1] + b.s[q*4 + 2] + b.s[q*4 + 3];
+        sums.s[q*4 + 2] = c.s[q*4 + 0] + c.s[q*4 + 1] + c.s[q*4 + 2] + c.s[q*4 + 3];
+        sums.s[q*4 + 3] = d.s[q*4 + 0] + d.s[q*4 + 1] + d.s[q*4 + 2] + d.s[q*4 + 3];
+    }
+    return sums;
+}
+
+/** @brief Sum absolute differences of values
+
+Scheme:
+@code
+{A1 A2 A3 ...} {B1 B2 B3 ...} => sum{abs(A1-B1),abs(A2-B2),abs(A3-B3),...}
+@endcode
+For all types except 64-bit types.*/
+template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    // Take the per-lane |a - b| in the unsigned abs_type via v_absdiff and only then accumulate.
+    // Calling _absdiff directly on signed lanes yields the signed lane type, so a difference above
+    // the signed maximum (e.g. schar -128 vs 127) wrapped negative and corrupted the unsigned sum.
+    return v_reduce_sum(v_absdiff(a, b));  // lanes are summed in index order, starting from lane 0
+}
+
+/** @brief Get negative values mask
+@deprecated v_signmask depends on a lane count heavily and therefore isn't universal enough
+
+Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
+Example:
+@code{.cpp}
+v_int32x4 r; // set to {-1, -1, 1, 1}
+int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
+@endcode
+*/
+template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
+{
+    int bits = 0;
+    for( int lane = 0; lane < n; ++lane )  // bit `lane` is set when the lane's integer view is negative
+        bits |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]) < 0) << lane;
+    return bits;
+}
+
+/** @brief Get first negative lane index
+
+Returned value is an index of first negative lane (undefined for input of all positive values)
+Example:
+@code{.cpp}
+v_int32x4 r; // set to {0, 0, -1, -1}
+int idx = v_scan_forward(r); // idx = 2
+@endcode
+*/
+template <typename _Tp, int n> inline int v_scan_forward(const v_reg<_Tp, n>& a)
+{
+    for (int lane = 0; lane < n; ++lane)
+        if(V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]) < 0)
+            return lane;  // first lane whose integer view is negative
+    return 0;  // no negative lane found
+}
+
+/** @brief Check if all packed values are less than zero
+
+Unsigned values will be cast to signed: `uchar 254 => char -2`.
+*/
+template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
+{
+    for( int lane = 0; lane < n; ++lane )
+        if( !(V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]) < 0) )
+            return false;  // found a lane that is not negative
+    return true;
+}
+
+/** @brief Check if any of packed values is less than zero
+
+Unsigned values will be cast to signed: `uchar 254 => char -2`.
+*/
+template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
+{
+    for( int lane = 0; lane < n; ++lane )
+        if( V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]) < 0 )
+            return true;  // one negative lane is enough
+    return false;
+}
+
+/** @brief Per-element select (blend operation)
+
+Return value will be built by combining values _a_ and _b_ using the following scheme:
+    result[i] = mask[i] ? a[i] : b[i];
+
+@note: _mask_ element values are restricted to these values:
+- 0: select element from _b_
+- 0xff/0xffff/etc: select element from _a_
+(fully compatible with bitwise-based operator)
+*/
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
+                                                           const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    typedef V_TypeTraits<_Tp> Traits;
+    typedef typename Traits::int_type int_type;
+    v_reg<_Tp, n> blended;
+    for( int lane = 0; lane < n; ++lane )
+    {
+        int_type m = Traits::reinterpret_int(mask.s[lane]);
+        CV_DbgAssert(m == 0 || m == (~(int_type)0));  // restrict mask values: 0 or 0xff/0xffff/etc
+        blended.s[lane] = (m != 0) ? a.s[lane] : b.s[lane];
+    }
+    return blended;
+}
+
+/** @brief Expand values to the wider pack type
+
+Copy contents of register to two registers with 2x wider pack type.
+Scheme:
+@code
+ int32x4     int64x2 int64x2
+{A B C D} ==> {A B} , {C D}
+@endcode */
+template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
+                            v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
+                            v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
+{
+    for( int k = 0; k < (n/2); ++k )
+    {
+        b0.s[k] = a.s[k];          // lower half, converted to the wider lane type
+        b1.s[k] = a.s[k+(n/2)];    // upper half, converted to the wider lane type
+    }
+}
+
+/** @brief Expand lower values to the wider pack type
+
+Same as cv::v_expand, but return lower half of the vector.
+
+Scheme:
+@code
+ int32x4     int64x2
+{A B C D} ==> {A B}
+@endcode */
+template<typename _Tp, int n>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
+v_expand_low(const v_reg<_Tp, n>& a)
+{
+    v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> widened;
+    for( int k = 0; k < (n/2); ++k )  // copy the first n/2 lanes into the wider lane type
+        widened.s[k] = a.s[k];
+    return widened;
+}
+
+/** @brief Expand higher values to the wider pack type
+
+Same as cv::v_expand_low, but expand higher half of the vector instead.
+
+Scheme:
+@code
+ int32x4     int64x2
+{A B C D} ==> {C D}
+@endcode */
+template<typename _Tp, int n>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
+v_expand_high(const v_reg<_Tp, n>& a)
+{
+    v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> widened;
+    for( int k = 0; k < (n/2); ++k )  // copy the last n/2 lanes into the wider lane type
+        widened.s[k] = a.s[k+(n/2)];
+    return widened;
+}
+
+//! @cond IGNORED
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
+    v_reinterpret_as_int(const v_reg<_Tp, n>& a)
+{
+    v_reg<typename V_TypeTraits<_Tp>::int_type, n> asInt;
+    for( int lane = 0; lane < n; ++lane )  // lane-wise conversion through the traits helper
+        asInt.s[lane] = V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]);
+    return asInt;
+}
+
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
+    v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
+{
+    v_reg<typename V_TypeTraits<_Tp>::uint_type, n> asUint;
+    for( int lane = 0; lane < n; ++lane )  // lane-wise conversion through the traits helper
+        asUint.s[lane] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[lane]);
+    return asUint;
+}
+//! @endcond
+
+/** @brief Interleave two vectors
+
+Scheme:
+@code
+  {A1 A2 A3 A4}
+  {B1 B2 B3 B4}
+---------------
+  {A1 B1 A2 B2} and {A3 B3 A4 B4}
+@endcode
+For all types except 64-bit.
+*/
+template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
+                                               v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
+{
+    // Interleave into temporaries first: callers may pass the inputs as outputs
+    // (e.g. v_zip(a, b, a, b)), and writing b0/b1 directly would clobber lanes not yet read.
+    v_reg<_Tp, n> lo, hi;
+    for( int i = 0; i < n/2; i++ )
+    {
+        lo.s[i*2] = a0.s[i];          // lower halves of a0/a1 interleave into b0
+        lo.s[i*2+1] = a1.s[i];
+        hi.s[i*2] = a0.s[i+n/2];      // upper halves of a0/a1 interleave into b1
+        hi.s[i*2+1] = a1.s[i+n/2];
+    }
+    b0 = lo; b1 = hi;
+}
+
+/** @brief Load register contents from memory
+
+@param ptr pointer to memory block with data
+@return register object
+
+@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
+
+@note Use vx_load version to get maximum available register length result
+
+@note Alignment requirement:
+if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough).
+Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`).
+ */
+template<typename _Tp>
+inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));  // strict-alignment builds only: checks ptr against the lane size
+#endif
+    return v_reg<_Tp, simd128_width / sizeof(_Tp)>(ptr);  // built through v_reg's pointer constructor
+}
+
+#if CV_SIMD256
+/** @brief Load 256-bit length register contents from memory
+
+@param ptr pointer to memory block with data
+@return register object
+
+@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x32, int ==> cv::v_int32x8, etc.
+
+@note Check CV_SIMD256 preprocessor definition prior to use.
+Use vx_load version to get maximum available register length result
+
+@note Alignment requirement:
+if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough).
+Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`).
+ */
+template<typename _Tp>
+inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));  // strict-alignment builds only: checks ptr against the lane size
+#endif
+    return v_reg<_Tp, simd256_width / sizeof(_Tp)>(ptr);  // built through v_reg's pointer constructor
+}
+#endif
+
+#if CV_SIMD512
+/** @brief Load 512-bit length register contents from memory
+
+@param ptr pointer to memory block with data
+@return register object
+
+@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x64, int ==> cv::v_int32x16, etc.
+
+@note Check CV_SIMD512 preprocessor definition prior to use.
+Use vx_load version to get maximum available register length result
+
+@note Alignment requirement:
+if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough).
+Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`).
+ */
+template<typename _Tp>
+inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));  // strict-alignment builds only: checks ptr against the lane size
+#endif
+    return v_reg<_Tp, simd512_width / sizeof(_Tp)>(ptr);  // built through v_reg's pointer constructor
+}
+#endif
+
+/** @brief Load register contents from memory (aligned)
+
+similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary in case of SIMD128, 32-byte - SIMD256, etc)
+
+@note Use vx_load_aligned version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_aligned(const _Tp* ptr)
+{
+    CV_Assert(isAligned<sizeof(v_reg<_Tp, simd128_width / sizeof(_Tp)>)>(ptr));  // always checked: ptr vs. the whole register size
+    return v_reg<_Tp, simd128_width / sizeof(_Tp)>(ptr);  // built through v_reg's pointer constructor
+}
+
+#if CV_SIMD256
+/** @brief Load register contents from memory (aligned)
+
+similar to cv::v256_load, but source memory block should be aligned (to 32-byte boundary in case of SIMD256, 64-byte - SIMD512, etc)
+
+@note Check CV_SIMD256 preprocessor definition prior to use.
+Use vx_load_aligned version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_aligned(const _Tp* ptr)
+{
+    CV_Assert(isAligned<sizeof(v_reg<_Tp, simd256_width / sizeof(_Tp)>)>(ptr));  // always checked: ptr vs. the whole register size
+    return v_reg<_Tp, simd256_width / sizeof(_Tp)>(ptr);  // built through v_reg's pointer constructor
+}
+#endif
+
+#if CV_SIMD512
+/** @brief Load register contents from memory (aligned)
+
+similar to cv::v512_load, but source memory block should be aligned (to 64-byte boundary in case of SIMD512, etc)
+
+@note Check CV_SIMD512 preprocessor definition prior to use.
+Use vx_load_aligned version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_aligned(const _Tp* ptr)
+{
+    CV_Assert(isAligned<sizeof(v_reg<_Tp, simd512_width / sizeof(_Tp)>)>(ptr));  // always checked: ptr vs. the whole register size
+    return v_reg<_Tp, simd512_width / sizeof(_Tp)>(ptr);  // built through v_reg's pointer constructor
+}
+#endif
+
+/** @brief Load 64-bits of data to lower part (high part is undefined).
+
+@param ptr memory block containing data for first half (0..n/2)
+
+@code{.cpp}
+int lo[2] = { 1, 2 };
+v_int32x4 r = v_load_low(lo);
+@endcode
+
+@note Use vx_load_low version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_low(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    v_reg<_Tp, simd128_width / sizeof(_Tp)> reg;
+    const int half = reg.nlanes/2;
+    // Only the first half of the lanes is written; the rest is left as default-constructed.
+    for( int lane = 0; lane < half; ++lane )
+        reg.s[lane] = ptr[lane];
+    return reg;
+}
+
+#if CV_SIMD256
+/** @brief Load 128-bits of data to lower part (high part is undefined).
+
+@param ptr memory block containing data for first half (0..n/2)
+
+@code{.cpp}
+int lo[4] = { 1, 2, 3, 4 };
+v_int32x8 r = v256_load_low(lo);
+@endcode
+
+@note Check CV_SIMD256 preprocessor definition prior to use.
+Use vx_load_low version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_low(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    v_reg<_Tp, simd256_width / sizeof(_Tp)> reg;
+    const int half = reg.nlanes / 2;
+    // Only the first half of the lanes is written; the rest is left as default-constructed.
+    for (int lane = 0; lane < half; ++lane)
+        reg.s[lane] = ptr[lane];
+    return reg;
+}
+#endif
+
+#if CV_SIMD512
+/** @brief Load 256-bits of data to lower part (high part is undefined).
+
+@param ptr memory block containing data for first half (0..n/2)
+
+@code{.cpp}
+int lo[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+v_int32x16 r = v512_load_low(lo);
+@endcode
+
+@note Check CV_SIMD512 preprocessor definition prior to use.
+Use vx_load_low version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_low(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    v_reg<_Tp, simd512_width / sizeof(_Tp)> reg;
+    const int half = reg.nlanes / 2;
+    // Only the first half of the lanes is written; the rest is left as default-constructed.
+    for (int lane = 0; lane < half; ++lane)
+        reg.s[lane] = ptr[lane];
+    return reg;
+}
+#endif
+
+/** @brief Build a register from two separate memory blocks, one per half
+
+@param loptr memory block holding the elements of the first half (0..n/2)
+@param hiptr memory block holding the elements of the second half (n/2..n)
+
+@code{.cpp}
+int lo[2] = { 1, 2 }, hi[2] = { 3, 4 };
+v_int32x4 r = v_load_halves(lo, hi);
+@endcode
+
+@note Use the vx_load_halves variant to get a result of the maximum available register length
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(loptr));
+    CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
+#endif
+    v_reg<_Tp, simd128_width / sizeof(_Tp)> res;
+    const int half = res.nlanes / 2;
+    for( int k = 0; k < half; ++k )
+        res.s[k] = loptr[k];
+    for( int k = 0; k < half; ++k )
+        res.s[half + k] = hiptr[k];
+    return res;
+}
+
+#if CV_SIMD256
+/** @brief Build a 256-bit register from two separate memory blocks, one per half
+
+@param loptr memory block holding the elements of the first half (0..n/2)
+@param hiptr memory block holding the elements of the second half (n/2..n)
+
+@code{.cpp}
+int lo[4] = { 1, 2, 3, 4 }, hi[4] = { 5, 6, 7, 8 };
+v_int32x8 r = v256_load_halves(lo, hi);
+@endcode
+
+@note Check the CV_SIMD256 preprocessor definition prior to use.
+Use the vx_load_halves variant to get a result of the maximum available register length
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_halves(const _Tp* loptr, const _Tp* hiptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(loptr));
+    CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
+#endif
+    v_reg<_Tp, simd256_width / sizeof(_Tp)> res;
+    const int half = res.nlanes / 2;
+    for( int k = 0; k < half; ++k )
+        res.s[k] = loptr[k];
+    for( int k = 0; k < half; ++k )
+        res.s[half + k] = hiptr[k];
+    return res;
+}
+#endif
+
+#if CV_SIMD512
+/** @brief Build a 512-bit register from two separate memory blocks, one per half
+
+@param loptr memory block holding the elements of the first half (0..n/2)
+@param hiptr memory block holding the elements of the second half (n/2..n)
+
+@code{.cpp}
+int lo[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, hi[8] = { 9, 10, 11, 12, 13, 14, 15, 16 };
+v_int32x16 r = v512_load_halves(lo, hi);
+@endcode
+
+@note Check the CV_SIMD512 preprocessor definition prior to use.
+Use the vx_load_halves variant to get a result of the maximum available register length
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_halves(const _Tp* loptr, const _Tp* hiptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(loptr));
+    CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
+#endif
+    v_reg<_Tp, simd512_width / sizeof(_Tp)> res;
+    const int half = res.nlanes / 2;
+    for( int k = 0; k < half; ++k )
+        res.s[k] = loptr[k];
+    for( int k = 0; k < half; ++k )
+        res.s[half + k] = hiptr[k];
+    return res;
+}
+#endif
+
+/** @brief Load from memory, widening every element to a type twice as wide
+
+Like cv::v_load, except that the resulting pack type is 2x wider than the memory type.
+
+@code{.cpp}
+short buf[4] = {1, 2, 3, 4}; // type is int16
+v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
+@endcode
+For 8-, 16-, 32-bit integer source types.
+
+@note Use the vx_load_expand variant to get a result of the maximum available register length
+*/
+template<typename _Tp>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
+v_load_expand(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
+    typedef v_reg<wide_t, simd128_width / sizeof(wide_t)> wide_vec;
+    wide_vec res;
+    const int lanes = res.nlanes;
+    for( int k = 0; k < lanes; ++k )
+        res.s[k] = ptr[k];
+    return res;
+}
+
+#if CV_SIMD256
+/** @brief Load from memory, widening every element to a type twice as wide
+
+Like cv::v256_load, except that the resulting pack type is 2x wider than the memory type.
+
+@code{.cpp}
+short buf[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // type is int16
+v_int32x8 r = v256_load_expand(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8} - type is int32
+@endcode
+For 8-, 16-, 32-bit integer source types.
+
+@note Check the CV_SIMD256 preprocessor definition prior to use.
+Use the vx_load_expand variant to get a result of the maximum available register length
+*/
+template<typename _Tp>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
+v256_load_expand(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
+    typedef v_reg<wide_t, simd256_width / sizeof(wide_t)> wide_vec;
+    wide_vec res;
+    const int lanes = res.nlanes;
+    for( int k = 0; k < lanes; ++k )
+        res.s[k] = ptr[k];
+    return res;
+}
+#endif
+
+#if CV_SIMD512
+/** @brief Load from memory, widening every element to a type twice as wide
+
+Like cv::v512_load, except that the resulting pack type is 2x wider than the memory type.
+
+@code{.cpp}
+short buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; // type is int16
+v_int32x16 r = v512_load_expand(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} - type is int32
+@endcode
+For 8-, 16-, 32-bit integer source types.
+
+@note Check the CV_SIMD512 preprocessor definition prior to use.
+Use the vx_load_expand variant to get a result of the maximum available register length
+*/
+template<typename _Tp>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
+v512_load_expand(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
+    typedef v_reg<wide_t, simd512_width / sizeof(wide_t)> wide_vec;
+    wide_vec res;
+    const int lanes = res.nlanes;
+    for( int k = 0; k < lanes; ++k )
+        res.s[k] = ptr[k];
+    return res;
+}
+#endif
+
+/** @brief Load from memory, widening every element to a type four times as wide
+
+Like cv::v_load_expand, except that the result type is 4 times wider than the source.
+@code{.cpp}
+char buf[4] = {1, 2, 3, 4}; // type is int8
+v_int32x4 r = v_load_expand_q(buf); // r = {1, 2, 3, 4} - type is int32
+@endcode
+For 8-bit integer source types.
+
+@note Use the vx_load_expand_q variant to get a result of the maximum available register length
+*/
+template<typename _Tp>
+inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
+v_load_expand_q(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    typedef typename V_TypeTraits<_Tp>::q_type quad_t;
+    typedef v_reg<quad_t, simd128_width / sizeof(quad_t)> quad_vec;
+    quad_vec res;
+    const int lanes = res.nlanes;
+    for( int k = 0; k < lanes; ++k )
+        res.s[k] = ptr[k];
+    return res;
+}
+
+#if CV_SIMD256
+/** @brief Load from memory, widening every element to a type four times as wide
+
+Like cv::v256_load_expand, except that the result type is 4 times wider than the source.
+@code{.cpp}
+char buf[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // type is int8
+v_int32x8 r = v256_load_expand_q(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8} - type is int32
+@endcode
+For 8-bit integer source types.
+
+@note Check the CV_SIMD256 preprocessor definition prior to use.
+Use the vx_load_expand_q variant to get a result of the maximum available register length
+*/
+template<typename _Tp>
+inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
+v256_load_expand_q(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    typedef typename V_TypeTraits<_Tp>::q_type quad_t;
+    typedef v_reg<quad_t, simd256_width / sizeof(quad_t)> quad_vec;
+    quad_vec res;
+    const int lanes = res.nlanes;
+    for( int k = 0; k < lanes; ++k )
+        res.s[k] = ptr[k];
+    return res;
+}
+#endif
+
+#if CV_SIMD512
+/** @brief Load from memory, widening every element to a type four times as wide
+
+Like cv::v512_load_expand, except that the result type is 4 times wider than the source.
+@code{.cpp}
+char buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; // type is int8
+v_int32x16 r = v512_load_expand_q(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} - type is int32
+@endcode
+For 8-bit integer source types.
+
+@note Check the CV_SIMD512 preprocessor definition prior to use.
+Use the vx_load_expand_q variant to get a result of the maximum available register length
+*/
+template<typename _Tp>
+inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
+v512_load_expand_q(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    typedef typename V_TypeTraits<_Tp>::q_type quad_t;
+    typedef v_reg<quad_t, simd512_width / sizeof(quad_t)> quad_vec;
+    quad_vec res;
+    const int lanes = res.nlanes;
+    for( int k = 0; k < lanes; ++k )
+        res.s[k] = ptr[k];
+    return res;
+}
+#endif
+
+/** @brief Load and deinterleave (2 channels)
+
+Reads 2*n interleaved elements from memory and splits them into 2 registers.
+Scheme:
+@code
+{A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
+                                                            v_reg<_Tp, n>& b)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    for( int lane = 0; lane < n; ++lane )
+    {
+        const int base = 2 * lane;
+        a.s[lane] = ptr[base];
+        b.s[lane] = ptr[base + 1];
+    }
+}
+
+/** @brief Load and deinterleave (3 channels)
+
+Reads 3*n interleaved elements from memory and splits them into 3 registers.
+Scheme:
+@code
+{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
+                                                            v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    for( int lane = 0; lane < n; ++lane )
+    {
+        const int base = 3 * lane;
+        a.s[lane] = ptr[base];
+        b.s[lane] = ptr[base + 1];
+        c.s[lane] = ptr[base + 2];
+    }
+}
+
+/** @brief Load and deinterleave (4 channels)
+
+Reads 4*n interleaved elements from memory and splits them into 4 registers.
+Scheme:
+@code
+{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n>
+inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
+                                v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
+                                v_reg<_Tp, n>& d)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    for( int lane = 0; lane < n; ++lane )
+    {
+        const int base = 4 * lane;
+        a.s[lane] = ptr[base];
+        b.s[lane] = ptr[base + 1];
+        c.s[lane] = ptr[base + 2];
+        d.s[lane] = ptr[base + 3];
+    }
+}
+
+/** @brief Interleave and store (2 channels)
+
+Writes the elements of 2 registers to memory in interleaved order; the store-mode argument is not used here.
+Scheme:
+@code
+{A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n>
+inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
+                               const v_reg<_Tp, n>& b,
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    for( int lane = 0; lane < n; ++lane )
+    {
+        const int base = 2 * lane;
+        ptr[base] = a.s[lane];
+        ptr[base + 1] = b.s[lane];
+    }
+}
+
+/** @brief Interleave and store (3 channels)
+
+Writes the elements of 3 registers to memory in interleaved order; the store-mode argument is not used here.
+Scheme:
+@code
+{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n>
+inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
+                                const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    for( int lane = 0; lane < n; ++lane )
+    {
+        const int base = 3 * lane;
+        ptr[base] = a.s[lane];
+        ptr[base + 1] = b.s[lane];
+        ptr[base + 2] = c.s[lane];
+    }
+}
+
+/** @brief Interleave and store (4 channels)
+
+Writes the elements of 4 registers to memory in interleaved order; the store-mode argument is not used here.
+Scheme:
+@code
+{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
+                                                            const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
+                                                            const v_reg<_Tp, n>& d,
+                                                            hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    for( int lane = 0; lane < n; ++lane )
+    {
+        const int base = 4 * lane;
+        ptr[base] = a.s[lane];
+        ptr[base + 1] = b.s[lane];
+        ptr[base + 2] = c.s[lane];
+        ptr[base + 3] = d.s[lane];
+    }
+}
+
+/** @brief Store data to memory
+
+Copies all n register elements to memory, in lane order.
+Scheme:
+@code
+  REG {A B C D} ==> MEM {A B C D}
+@endcode
+Pointer can be unaligned (with CV_STRONG_ALIGNMENT only isAligned<sizeof(_Tp)> is asserted). */
+template<typename _Tp, int n>
+inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    for( int lane = 0; lane < n; ++lane )
+        ptr[lane] = a.s[lane];
+}
+
+template<typename _Tp, int n> // overload of v_store that additionally accepts a store-mode hint
+inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    v_store(ptr, a); // the mode argument is unused here; forwards to the two-argument v_store
+}
+
+/** @brief Store data to memory (lower half)
+
+Copies the first n/2 register elements to memory.
+Scheme:
+@code
+  REG {A B C D} ==> MEM {A B}
+@endcode */
+template<typename _Tp, int n>
+inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    for( int lane = 0; lane < n / 2; ++lane )
+        ptr[lane] = a.s[lane];
+}
+
+/** @brief Store data to memory (higher half)
+
+Copies n/2 register elements, starting at lane n/2, to the beginning of the memory block.
+Scheme:
+@code
+  REG {A B C D} ==> MEM {C D}
+@endcode */
+template<typename _Tp, int n>
+inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    for( int lane = n / 2; lane < 2 * (n / 2); ++lane )
+        ptr[lane - n / 2] = a.s[lane];
+}
+
+/** @brief Store data to memory (aligned)
+
+Store register contents to memory after asserting the pointer alignment.
+Scheme:
+@code
+  REG {A B C D} ==> MEM {A B C D}
+@endcode
+Pointer __should__ be aligned: isAligned<sizeof(v_reg<_Tp, n>)> is asserted (presumably 16 bytes for 128-bit registers). */
+template<typename _Tp, int n>
+inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+    CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
+    v_store(ptr, a);
+}
+
+template<typename _Tp, int n> // in this implementation: same assert and same plain v_store as v_store_aligned
+inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+    CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
+    v_store(ptr, a); // no cache-related handling is done here
+}
+
+template<typename _Tp, int n> // overload of v_store_aligned that additionally accepts a store-mode hint
+inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
+{
+    CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
+    v_store(ptr, a); // the mode argument is unused here
+}
+
+/** @brief Build a vector from the lower halves of two vectors
+
+Scheme:
+@code
+  {A1 A2 A3 A4}
+  {B1 B2 B3 B4}
+---------------
+  {A1 A2 B1 B2}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> res;
+    const int half = n / 2;
+    for( int k = 0; k < half; ++k )
+        res.s[k] = a.s[k];
+    for( int k = 0; k < half; ++k )
+        res.s[half + k] = b.s[k];
+    return res;
+}
+
+/** @brief Build a vector from the upper halves of two vectors
+
+Scheme:
+@code
+  {A1 A2 A3 A4}
+  {B1 B2 B3 B4}
+---------------
+  {A3 A4 B3 B4}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> res;
+    const int half = n / 2;
+    for( int k = 0; k < half; ++k )
+        res.s[k] = a.s[half + k];
+    for( int k = 0; k < half; ++k )
+        res.s[half + k] = b.s[half + k];
+    return res;
+}
+
+/** @brief Produce both the low-half and the high-half combination of two vectors
+
+@code{.cpp}
+low = cv::v_combine_low(a, b);
+high = cv::v_combine_high(a, b);
+@endcode */
+template<typename _Tp, int n>
+inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                        v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
+{
+    for( int lo = 0, hi = n / 2; lo < n / 2; ++lo, ++hi )
+    {
+        low.s[lo] = a.s[lo];
+        low.s[hi] = b.s[lo];
+        high.s[lo] = a.s[hi];
+        high.s[hi] = b.s[hi];
+    }
+}
+
+/** @brief Vector reverse order
+
+Returns a vector holding the elements of the input in reversed lane order.
+Scheme:
+@code
+  REG {A1 ... An} ==> REG {An ... A1}
+@endcode
+For all types. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_reverse(const v_reg<_Tp, n>& a)
+{
+    v_reg<_Tp, n> res;
+    for( int k = 0; k < n; ++k )
+        res.s[n - 1 - k] = a.s[k];
+    return res;
+}
+
+/** @brief Vector extract: the tail of a (from lane s on) followed by the head of b
+
+Scheme:
+@code
+  {A1 A2 A3 A4}
+  {B1 B2 B3 B4}
+========================
+shift = 1  {A2 A3 A4 B1}
+shift = 2  {A3 A4 B1 B2}
+shift = 3  {A4 B1 B2 B3}
+@endcode
+Restriction: 0 <= shift < nlanes
+
+Usage:
+@code
+v_int32x4 a, b, c;
+c = v_extract<2>(a, b);
+@endcode
+For all types. */
+template<int s, typename _Tp, int n>
+inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> res;
+    const int from_a = n - s;
+    int lane = 0;
+    for( ; lane < from_a; ++lane )
+        res.s[lane] = a.s[lane + s];
+    for( ; lane < n; ++lane )
+        res.s[lane] = b.s[lane - from_a];
+    return res;
+}
+
+/** @brief Extract a single element of a vector
+
+Scheme:
+Return the s-th element of v (s is a template argument).
+Restriction: 0 <= s < nlanes (checked with CV_DbgAssert)
+
+Usage:
+@code
+v_int32x4 a;
+int r;
+r = v_extract_n<2>(a);
+@endcode
+For all types. */
+template<int s, typename _Tp, int n>
+inline _Tp v_extract_n(const v_reg<_Tp, n>& v)
+{
+    CV_DbgAssert(s >= 0 && s < n);
+    return v.s[s];
+}
+
+/** @brief Broadcast i-th element of vector to all lanes
+
+Scheme:
+@code
+{ v[0] v[1] v[2] ... v[SZ] } => { v[i], v[i], v[i] ... v[i] }
+@endcode
+Restriction: 0 <= i < nlanes (checked with CV_DbgAssert)
+Supported types: 32-bit integers and floats (s32/u32/f32)
+ */
+template<int i, typename _Tp, int n>
+inline v_reg<_Tp, n> v_broadcast_element(const v_reg<_Tp, n>& a)
+{
+    CV_DbgAssert(i >= 0 && i < n);
+    return v_reg<_Tp, n>::all(a.s[i]); // result is built by v_reg::all from the selected element
+}
+
+/** @brief Round elements
+
+Applies cvRound to every lane. Input is a float vector ==> output is an int vector.
+@note Only for floating point types.
+*/
+template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
+{
+    v_reg<int, n> rounded;
+    for( int lane = 0; lane < n; ++lane )
+        rounded.s[lane] = cvRound(a.s[lane]);
+    return rounded;
+}
+
+/** @overload Rounds two double vectors: a fills lanes 0..n-1 of the result, b fills lanes n..2n-1. */
+template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a, const v_reg<double, n>& b)
+{
+    v_reg<int, n*2> rounded;
+    for( int lane = 0; lane < n; ++lane )
+    {
+        rounded.s[lane] = cvRound(a.s[lane]);
+        rounded.s[n + lane] = cvRound(b.s[lane]);
+    }
+    return rounded;
+}
+
+/** @brief Floor elements
+
+Applies cvFloor to every lane. Input is a float vector ==> output is an int vector.
+@note Only for floating point types.
+*/
+template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
+{
+    v_reg<int, n> floored;
+    for( int lane = 0; lane < n; ++lane )
+        floored.s[lane] = cvFloor(a.s[lane]);
+    return floored;
+}
+
+/** @brief Ceil elements
+
+Applies cvCeil to every lane. Input is a float vector ==> output is an int vector.
+@note Only for floating point types.
+*/
+template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
+{
+    v_reg<int, n> ceiled;
+    for( int lane = 0; lane < n; ++lane )
+        ceiled.s[lane] = cvCeil(a.s[lane]);
+    return ceiled;
+}
+
+/** @brief Truncate elements
+
+Converts every lane to int with a plain cast. Input is a float vector ==> output is an int vector.
+@note Only for floating point types.
+*/
+template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
+{
+    v_reg<int, n> truncated;
+    for( int lane = 0; lane < n; ++lane )
+        truncated.s[lane] = static_cast<int>(a.s[lane]);
+    return truncated;
+}
+
+/** @overload Rounds n doubles into lanes 0..n-1 of the result; lanes n..2n-1 are set to 0. */
+template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
+{
+    v_reg<int, n*2> res;
+    for( int k = 0; k < n; ++k )
+        res.s[k] = cvRound(a.s[k]);
+    // the upper lanes have no source element and are zero-filled
+    for( int k = n; k < 2 * n; ++k )
+        res.s[k] = 0;
+    return res;
+}
+
+/** @overload Floors n doubles into lanes 0..n-1 of the result; lanes n..2n-1 are set to 0. */
+template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
+{
+    v_reg<int, n*2> res;
+    for( int k = 0; k < n; ++k )
+        res.s[k] = cvFloor(a.s[k]);
+    // the upper lanes have no source element and are zero-filled
+    for( int k = n; k < 2 * n; ++k )
+        res.s[k] = 0;
+    return res;
+}
+
+/** @overload Ceils n doubles into lanes 0..n-1 of the result; lanes n..2n-1 are set to 0. */
+template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
+{
+    v_reg<int, n*2> res;
+    for( int k = 0; k < n; ++k )
+        res.s[k] = cvCeil(a.s[k]);
+    // the upper lanes have no source element and are zero-filled
+    for( int k = n; k < 2 * n; ++k )
+        res.s[k] = 0;
+    return res;
+}
+
+/** @overload Truncates n doubles into lanes 0..n-1 of the result; lanes n..2n-1 are set to 0. */
+template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
+{
+    v_reg<int, n*2> res;
+    for( int k = 0; k < n; ++k )
+        res.s[k] = static_cast<int>(a.s[k]);
+    // the upper lanes have no source element and are zero-filled
+    for( int k = n; k < 2 * n; ++k )
+        res.s[k] = 0;
+    return res;
+}
+
+/** @brief Convert every lane to float
+
+Supported input type is cv::v_int32. */
+template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
+{
+    v_reg<float, n> res;
+    for( int lane = 0; lane < n; ++lane )
+        res.s[lane] = static_cast<float>(a.s[lane]);
+    return res;
+}
+
+/** @brief Convert to float into the lower half of the result (upper half is set to 0)
+
+Supported input type is cv::v_float64. */
+template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a)
+{
+    v_reg<float, n*2> res;
+    for( int k = 0; k < n; ++k )
+        res.s[k] = static_cast<float>(a.s[k]);
+    // the upper lanes have no source element and are zero-filled
+    for( int k = n; k < 2 * n; ++k )
+        res.s[k] = 0;
+    return res;
+}
+
+/** @brief Convert two double vectors to one float vector (a fills the lower half, b the upper half)
+
+Supported input type is cv::v_float64. */
+template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
+{
+    v_reg<float, n*2> res;
+    for( int lane = 0; lane < n; ++lane )
+    {
+        res.s[lane] = static_cast<float>(a.s[lane]);
+        res.s[n + lane] = static_cast<float>(b.s[lane]);
+    }
+    return res;
+}
+
+/** @brief Convert the lower half (lanes 0..n/2-1) to double
+
+Supported input type is cv::v_int32. */
+template<int n> CV_INLINE v_reg<double, n/2> v_cvt_f64(const v_reg<int, n>& a)
+{
+    v_reg<double, (n/2)> res;
+    for( int lane = 0; lane < n / 2; ++lane )
+        res.s[lane] = static_cast<double>(a.s[lane]);
+    return res;
+}
+
+/** @brief Convert the upper half (lanes n/2..) of the vector to double
+
+Supported input type is cv::v_int32. */
+template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<int, n>& a)
+{
+    v_reg<double, (n/2)> res;
+    for( int lane = 0; lane < n / 2; ++lane )
+        res.s[lane] = static_cast<double>(a.s[n / 2 + lane]);
+    return res;
+}
+
+/** @brief Convert the lower half (lanes 0..n/2-1) to double
+
+Supported input type is cv::v_float32. */
+template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64(const v_reg<float, n>& a)
+{
+    v_reg<double, (n/2)> res;
+    for( int lane = 0; lane < n / 2; ++lane )
+        res.s[lane] = static_cast<double>(a.s[lane]);
+    return res;
+}
+
+/** @brief Convert the upper half (lanes n/2..) of the vector to double
+
+Supported input type is cv::v_float32. */
+template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<float, n>& a)
+{
+    v_reg<double, (n/2)> res;
+    for( int lane = 0; lane < n / 2; ++lane )
+        res.s[lane] = static_cast<double>(a.s[n / 2 + lane]);
+    return res;
+}
+
+/** @brief Convert every lane to double
+
+Supported input type is cv::v_int64. */
+template<int n> CV_INLINE v_reg<double, n> v_cvt_f64(const v_reg<int64, n>& a)
+{
+    v_reg<double, n> res;
+    for( int lane = 0; lane < n; ++lane )
+        res.s[lane] = static_cast<double>(a.s[lane]);
+    return res;
+}
+
+
+template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut(const _Tp* tab, const int* idx)
+{   // table lookup: lane k receives tab[idx[k]]
+    v_reg<_Tp, simd128_width / sizeof(_Tp)> res;
+    for( int k = 0; k < res.nlanes; ++k )
+        res.s[k] = tab[idx[k]];
+    return res;
+}
+template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_pairs(const _Tp* tab, const int* idx)
+{   // each index selects 2 consecutive table elements: lanes 2j and 2j+1 receive tab[idx[j]] and tab[idx[j]+1]
+    v_reg<_Tp, simd128_width / sizeof(_Tp)> res;
+    for( int k = 0; k < res.nlanes; ++k )
+        res.s[k] = tab[idx[k >> 1] + (k & 1)];
+    return res;
+}
+template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_quads(const _Tp* tab, const int* idx)
+{   // each index selects 4 consecutive table elements: lanes 4j..4j+3 receive tab[idx[j]]..tab[idx[j]+3]
+    v_reg<_Tp, simd128_width / sizeof(_Tp)> res;
+    for( int k = 0; k < res.nlanes; ++k )
+        res.s[k] = tab[idx[k >> 2] + (k & 3)];
+    return res;
+}
+
+template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
+{   // table lookup with indices held in a register: lane k receives tab[idx.s[k]]
+    v_reg<int, n> res;
+    for( int k = 0; k < n; ++k )
+        res.s[k] = tab[idx.s[k]];
+    return res;
+}
+
+template<int n> inline v_reg<unsigned, n> v_lut(const unsigned* tab, const v_reg<int, n>& idx)
+{   // table lookup with indices held in a register: lane i receives tab[idx.s[i]]
+    v_reg<unsigned, n> c; // element type matches the declared return type (it was v_reg<int, n>)
+    for (int i = 0; i < n; i++)
+        c.s[i] = tab[idx.s[i]];
+    return c;
+}
+
+template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
+{   // table lookup with indices held in a register: lane k receives tab[idx.s[k]]
+    v_reg<float, n> res;
+    for( int k = 0; k < n; ++k )
+        res.s[k] = tab[idx.s[k]];
+    return res;
+}
+
+template<int n> inline v_reg<double, n/2> v_lut(const double* tab, const v_reg<int, n>& idx)
+{   // only the first n/2 indices are used: lane k receives tab[idx.s[k]]
+    v_reg<double, n/2> res;
+    for( int k = 0; k < n / 2; ++k )
+        res.s[k] = tab[idx.s[k]];
+    return res;
+}
+
+
+template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
+                                               v_reg<float, n>& x, v_reg<float, n>& y)
+{   // for each lane: x receives tab[idx], y receives the element right after it
+    for( int lane = 0; lane < n; ++lane )
+    {
+        const float* pair = tab + idx.s[lane];
+        x.s[lane] = pair[0];
+        y.s[lane] = pair[1];
+    }
+}
+
+template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
+                                               v_reg<double, n>& x, v_reg<double, n>& y)
+{   // uses the first n indices: x receives tab[idx], y receives the element right after it
+    for( int lane = 0; lane < n; ++lane )
+    {
+        const double* pair = tab + idx.s[lane];
+        x.s[lane] = pair[0];
+        y.s[lane] = pair[1];
+    }
+}
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec)
+{   // within every group of 4 lanes the two middle elements are swapped: {0 1 2 3} -> {0 2 1 3}
+    v_reg<_Tp, n> res;
+    for( int base = 0; base < (n/4)*4; base += 4 )
+    {
+        res.s[base] = vec.s[base];
+        res.s[base + 1] = vec.s[base + 2];
+        res.s[base + 2] = vec.s[base + 1];
+        res.s[base + 3] = vec.s[base + 3];
+    }
+    return res;
+}
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec)
+{   // within every group of 8 lanes: {0 1 2 3 4 5 6 7} -> {0 4 1 5 2 6 3 7}
+    v_reg<_Tp, n> res;
+    for( int base = 0; base < (n/8)*8; base += 8 )
+    {
+        res.s[base] = vec.s[base];
+        res.s[base + 1] = vec.s[base + 4];
+        res.s[base + 2] = vec.s[base + 1];
+        res.s[base + 3] = vec.s[base + 5];
+        res.s[base + 4] = vec.s[base + 2];
+        res.s[base + 5] = vec.s[base + 6];
+        res.s[base + 6] = vec.s[base + 3];
+        res.s[base + 7] = vec.s[base + 7];
+    }
+    return res;
+}
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
+{   // keeps the first 3 elements of every group of 4, packed contiguously; the remaining tail lanes are not written
+    v_reg<_Tp, n> res;
+    for( int src = 0, dst = 0; src < (n/4)*4; src += 4, dst += 3 )
+    {
+        res.s[dst] = vec.s[src];
+        res.s[dst + 1] = vec.s[src + 1];
+        res.s[dst + 2] = vec.s[src + 2];
+    }
+    return res;
+}
+
+/** @brief Transpose 4x4 matrix
+
+Scheme:
+@code
+a0  {A1 A2 A3 A4}
+a1  {B1 B2 B3 B4}
+a2  {C1 C2 C3 C4}
+a3  {D1 D2 D3 D4}
+===============
+b0  {A1 B1 C1 D1}
+b1  {A2 B2 C2 D2}
+b2  {A3 B3 C3 D3}
+b3  {A4 B4 C4 D4}
+@endcode
+Applied independently to each group of 4 lanes; a0..a3 are only read, b0..b3 receive the result. */
+template<typename _Tp, int n>
+inline void v_transpose4x4( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
+                            const v_reg<_Tp, n>& a2, const v_reg<_Tp, n>& a3,
+                            v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1,
+                            v_reg<_Tp, n>& b2, v_reg<_Tp, n>& b3 )
+{
+    for (int i = 0; i < n / 4; i++)
+    {
+        b0.s[0 + i*4] = a0.s[0 + i*4]; b0.s[1 + i*4] = a1.s[0 + i*4];
+        b0.s[2 + i*4] = a2.s[0 + i*4]; b0.s[3 + i*4] = a3.s[0 + i*4];
+        b1.s[0 + i*4] = a0.s[1 + i*4]; b1.s[1 + i*4] = a1.s[1 + i*4];
+        b1.s[2 + i*4] = a2.s[1 + i*4]; b1.s[3 + i*4] = a3.s[1 + i*4];
+        b2.s[0 + i*4] = a0.s[2 + i*4]; b2.s[1 + i*4] = a1.s[2 + i*4];
+        b2.s[2 + i*4] = a2.s[2 + i*4]; b2.s[3 + i*4] = a3.s[2 + i*4];
+        b3.s[0 + i*4] = a0.s[3 + i*4]; b3.s[1 + i*4] = a1.s[3 + i*4];
+        b3.s[2 + i*4] = a2.s[3 + i*4]; b3.s[3 + i*4] = a3.s[3 + i*4];
+    }
+}
+
+//! @brief Helper macro: defines prefix##_setzero_##suffix(), which returns _Tpvec::zero()
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix) \
+inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); }
+
+//! @name Init with zero
+//! @{
+//! @brief Create new vector with zero elements (v_, v256_ and v512_ prefixed variants)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, v, u8)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, v, s8)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, v, u16)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, v, s16)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, v, u32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, v, s32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, v, f32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, v, f64)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, v, u64)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, v, s64)
+
+#if CV_SIMD256
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x32, v256, u8)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x32, v256, s8)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x16, v256, u16)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x16, v256, s16)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x8, v256, u32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x8, v256, s32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x8, v256, f32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x4, v256, f64)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x4, v256, u64)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x4, v256, s64)
+#endif
+
+#if CV_SIMD512
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x64, v512, u8)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x64, v512, s8)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x32, v512, u16)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x32, v512, s16)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x16, v512, u32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x16, v512, s32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x16, v512, f32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x8, v512, f64)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x8, v512, u64)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x8, v512, s64)
+#endif
+//! @}
+
+//! @brief Helper macro: defines prefix##_setall_##suffix(_Tp val), which returns _Tpvec::all(val)
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix) \
+inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
+
+//! @name Init with value
+//! @{
+//! @brief Create new vector with elements set to a specific value (v_, v256_ and v512_ prefixed variants)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, v, u8)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, v, s8)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, v, u16)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, v, s16)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, v, u32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, v, s32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, v, f32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, v, f64)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, v, u64)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, v, s64)
+
+#if CV_SIMD256
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x32, uchar, v256, u8)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x32, schar, v256, s8)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x16, ushort, v256, u16)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x16, short, v256, s16)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x8, unsigned, v256, u32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x8, int, v256, s32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x8, float, v256, f32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x4, double, v256, f64)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x4, uint64, v256, u64)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x4, int64, v256, s64)
+#endif
+
+#if CV_SIMD512
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x64, uchar, v512, u8)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x64, schar, v512, s8)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x32, ushort, v512, u16)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x32, short, v512, s16)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x16, unsigned, v512, u32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x16, int, v512, s32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x16, float, v512, f32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x8, double, v512, f64)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x8, uint64, v512, u64)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x8, int64, v512, s64)
+#endif
+//! @}
+
+//! @brief Helper macro: defines v_reinterpret_as_##suffix(), forwarding to v_reg::reinterpret_as with a lane count that keeps the total byte size
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tp, suffix) \
+template<typename _Tp0, int n0> inline v_reg<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)> \
+    v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
+{ return a.template reinterpret_as<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)>(); }
+
+//! @name Reinterpret
+//! @{
+//! @brief Convert vector to different type without modifying underlying data.
+OPENCV_HAL_IMPL_C_REINTERPRET(uchar, u8)
+OPENCV_HAL_IMPL_C_REINTERPRET(schar, s8)
+OPENCV_HAL_IMPL_C_REINTERPRET(ushort, u16)
+OPENCV_HAL_IMPL_C_REINTERPRET(short, s16)
+OPENCV_HAL_IMPL_C_REINTERPRET(unsigned, u32)
+OPENCV_HAL_IMPL_C_REINTERPRET(int, s32)
+OPENCV_HAL_IMPL_C_REINTERPRET(float, f32)
+OPENCV_HAL_IMPL_C_REINTERPRET(double, f64)
+OPENCV_HAL_IMPL_C_REINTERPRET(uint64, u64)
+OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64)
+//! @}
+
+//! @brief Helper macro: defines v_shl for one lane type
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \
+template<int nbits, int lanes> inline v_reg<_Tp, lanes> v_shl(const v_reg<_Tp, lanes>& src) \
+{ return src << nbits; }
+
+//! @name Left shift
+//! @{
+//! @brief Shift left by the compile-time count passed as the first template argument
+OPENCV_HAL_IMPL_C_SHIFTL(ushort)
+OPENCV_HAL_IMPL_C_SHIFTL(short)
+OPENCV_HAL_IMPL_C_SHIFTL(unsigned)
+OPENCV_HAL_IMPL_C_SHIFTL(int)
+OPENCV_HAL_IMPL_C_SHIFTL(uint64)
+OPENCV_HAL_IMPL_C_SHIFTL(int64)
+//! @}
+
+//! @brief Helper macro: defines v_shr for one lane type
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \
+template<int nbits, int lanes> inline v_reg<_Tp, lanes> v_shr(const v_reg<_Tp, lanes>& src) \
+{ return src >> nbits; }
+
+//! @name Right shift
+//! @{
+//! @brief Shift right by the compile-time count passed as the first template argument
+OPENCV_HAL_IMPL_C_SHIFTR(ushort)
+OPENCV_HAL_IMPL_C_SHIFTR(short)
+OPENCV_HAL_IMPL_C_SHIFTR(unsigned)
+OPENCV_HAL_IMPL_C_SHIFTR(int)
+OPENCV_HAL_IMPL_C_SHIFTR(uint64)
+OPENCV_HAL_IMPL_C_SHIFTR(int64)
+//! @}
+
+//! @brief Helper macro: defines v_rshr (add 1 << (shift - 1), then shift right) for one lane type
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tp) \
+template<int shift, int n> inline v_reg<_Tp, n> v_rshr(const v_reg<_Tp, n>& a) \
+{ \
+    const auto half = (_Tp)1 << (shift - 1); /* rounding bias, left in the promoted type */ \
+    v_reg<_Tp, n> rounded; \
+    for( int lane = 0; lane < n; ++lane ) rounded.s[lane] = (_Tp)((a.s[lane] + half) >> shift); \
+    return rounded; \
+}
+
+//! @name Rounding shift
+//! @{
+//! @brief Rounding shift right by the compile-time count passed as the first template argument
+OPENCV_HAL_IMPL_C_RSHIFTR(ushort)
+OPENCV_HAL_IMPL_C_RSHIFTR(short)
+OPENCV_HAL_IMPL_C_RSHIFTR(unsigned)
+OPENCV_HAL_IMPL_C_RSHIFTR(int)
+OPENCV_HAL_IMPL_C_RSHIFTR(uint64)
+OPENCV_HAL_IMPL_C_RSHIFTR(int64)
+//! @}
+
+//! @brief Helper macro: defines one v_pack / v_pack_u overload
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_PACK(_Tp, _Tpn, pack_suffix, cast) \
+template<int n> inline v_reg<_Tpn, 2*n> v_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tpn, 2*n> packed; \
+    /* a fills lanes [0, n), b fills lanes [n, 2n) */ \
+    for( int lane = 0; lane < n; ++lane ) \
+        packed.s[lane] = cast<_Tpn>(a.s[lane]); \
+    for( int lane = 0; lane < n; ++lane ) \
+        packed.s[n + lane] = cast<_Tpn>(b.s[lane]); \
+    return packed; \
+}
+
+//! @name Pack
+//! @{
+//! @brief Pack values from two vectors to one
+//!
+//! The returned vector has twice as many elements as each input vector. The variant with the _u_
+//! suffix also converts to the corresponding unsigned type.
+//!
+//! - pack: for 16-, 32- and 64-bit integer input types
+//! - pack_u: for 16- and 32-bit signed integer input types
+//!
+//! @note All variants except 64-bit use saturation.
+OPENCV_HAL_IMPL_C_PACK(ushort, uchar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(short, schar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(unsigned, ushort, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(int, short, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(uint64, unsigned, pack, static_cast)
+OPENCV_HAL_IMPL_C_PACK(int64, int, pack, static_cast)
+OPENCV_HAL_IMPL_C_PACK(short, uchar, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(int, ushort, pack_u, saturate_cast)
+//! @}
+
+//! @brief Helper macro: defines one v_rshr_pack / v_rshr_pack_u overload
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tp, _Tpn, pack_suffix, cast) \
+template<int shift, int n> inline v_reg<_Tpn, 2*n> v_rshr_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    const auto half = (_Tp)1 << (shift - 1); /* rounding bias, left in the promoted type */ \
+    v_reg<_Tpn, 2*n> packed; \
+    for( int lane = 0; lane < n; ++lane ) \
+        packed.s[lane] = cast<_Tpn>((a.s[lane] + half) >> shift); \
+    for( int lane = 0; lane < n; ++lane ) \
+        packed.s[n + lane] = cast<_Tpn>((b.s[lane] + half) >> shift); \
+    return packed; \
+}
+
+//! @name Pack with rounding shift
+//! @{
+//! @brief Pack values from two vectors to one with rounding shift
+//!
+//! Values from the input vectors are shifted right by _shift_ bits (the first template argument) with
+//! rounding, converted to the narrower type and returned. The variant with _u_ suffix converts to unsigned type.
+//!
+//! - pack: for 16-, 32- and 64-bit integer input types
+//! - pack_u: for 16- and 32-bit signed integer input types
+//!
+//! @note All variants except 64-bit use saturation.
+OPENCV_HAL_IMPL_C_RSHR_PACK(ushort, uchar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(short, schar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(unsigned, ushort, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(int, short, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(uint64, unsigned, pack, static_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(int64, int, pack, static_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(short, uchar, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(int, ushort, pack_u, saturate_cast)
+//! @}
+
+//! @brief Helper macro: defines one v_pack_store / v_pack_u_store overload
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \
+template<int n> inline void v_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \
+{ \
+    for( int lane = 0; lane < n; ++lane ) \
+        *(ptr + lane) = cast<_Tpn>(a.s[lane]); \
+}
+
+//! @name Pack and store
+//! @{
+//! @brief Store values from the input vector into memory with pack
+//!
+//! Each of the n input lanes is converted to the narrower type and written to ptr[0..n).
+//! The variant with _u_ suffix converts to the corresponding unsigned type.
+//!
+//! - pack: for 16-, 32- and 64-bit integer input types
+//! - pack_u: for 16- and 32-bit signed integer input types
+//!
+//! @note All variants except 64-bit use saturation.
+OPENCV_HAL_IMPL_C_PACK_STORE(ushort, uchar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(short, schar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(unsigned, ushort, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(int, short, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(uint64, unsigned, pack, static_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(int64, int, pack, static_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(short, uchar, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(int, ushort, pack_u, saturate_cast)
+//! @}
+
+//! @brief Helper macro: defines one v_rshr_pack_store / v_rshr_pack_u_store overload
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \
+template<int shift, int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \
+{ \
+    for( int lane = 0; lane < n; ++lane ) \
+        *(ptr + lane) = cast<_Tpn>((((_Tp)1 << (shift - 1)) + a.s[lane]) >> shift); \
+}
+
+//! @name Pack and store with rounding shift
+//! @{
+//! @brief Store values from the input vector into memory with pack
+//!
+//! Values are shifted right by _shift_ bits (the first template argument) with rounding, converted to
+//! the narrower type and stored into memory. The variant with _u_ suffix converts to unsigned type.
+//!
+//! - pack: for 16-, 32- and 64-bit integer input types
+//! - pack_u: for 16- and 32-bit signed integer input types
+//!
+//! @note All variants except 64-bit use saturation.
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(ushort, uchar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, schar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(unsigned, ushort, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, short, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(uint64, unsigned, pack, static_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int64, int, pack, static_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, uchar, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, ushort, pack_u, saturate_cast)
+//! @}
+
+//! @cond IGNORED
+template<typename _Tpm, typename _Tp, int n>
+inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    // C-style narrowing of each lane: a goes to mptr[0..n), b goes to mptr[n..2n).
+    _Tpm* const lo = mptr;
+    _Tpm* const hi = mptr + n;
+    for (int lane = 0; lane < n; ++lane)
+    { lo[lane] = (_Tpm)a.s[lane]; hi[lane] = (_Tpm)b.s[lane]; }
+}
+//! @endcond
+
+//! @name Pack boolean values
+//! @{
+//! @brief Pack boolean values from multiple vectors to one unsigned 8-bit integer vector
+//!
+//! @note Must provide valid boolean values to guarantee same result for all architectures.
+
+/** @brief Pack boolean values into an unsigned 8-bit vector
+For 16-bit boolean values
+
+Scheme:
+@code
+a  {0xFFFF 0 0 0xFFFF 0 0xFFFF 0xFFFF 0}
+b  {0xFFFF 0 0xFFFF 0 0 0xFFFF 0 0xFFFF}
+===============
+{
+   0xFF 0 0 0xFF 0 0xFF 0xFF 0
+   0xFF 0 0xFF 0 0 0xFF 0 0xFF
+}
+@endcode */
+
+template<int n> inline v_reg<uchar, 2*n> v_pack_b(const v_reg<ushort, n>& a, const v_reg<ushort, n>& b)
+{
+    v_reg<uchar, 2*n> packed;
+    _pack_b(&packed.s[0], a, b);  // a -> lanes [0, n), b -> lanes [n, 2n)
+    return packed;
+}
+
+/** @overload
+For 32-bit boolean values
+
+Scheme:
+@code
+a  {0xFFFF.. 0 0 0xFFFF..}
+b  {0 0xFFFF.. 0xFFFF.. 0}
+c  {0xFFFF.. 0 0xFFFF.. 0}
+d  {0 0xFFFF.. 0 0xFFFF..}
+===============
+{
+   0xFF 0 0 0xFF 0 0xFF 0xFF 0
+   0xFF 0 0xFF 0 0 0xFF 0 0xFF
+}
+@endcode */
+
+template<int n> inline v_reg<uchar, 4*n> v_pack_b(const v_reg<unsigned, n>& a, const v_reg<unsigned, n>& b,
+                                                  const v_reg<unsigned, n>& c, const v_reg<unsigned, n>& d)
+{
+    v_reg<uchar, 4*n> packed;
+    _pack_b(&packed.s[0], a, b);    // first half of the result: a then b
+    _pack_b(&packed.s[2*n], c, d);  // second half of the result: c then d
+    return packed;
+}
+
+/** @overload
+For 64-bit boolean values
+
+Scheme:
+@code
+a  {0xFFFF.. 0}
+b  {0 0xFFFF..}
+c  {0xFFFF.. 0}
+d  {0 0xFFFF..}
+
+e  {0xFFFF.. 0}
+f  {0xFFFF.. 0}
+g  {0 0xFFFF..}
+h  {0 0xFFFF..}
+===============
+{
+   0xFF 0 0 0xFF 0xFF 0 0 0xFF
+   0xFF 0 0xFF 0 0 0xFF 0 0xFF
+}
+@endcode */
+template<int n> inline v_reg<uchar, 8*n> v_pack_b(const v_reg<uint64, n>& a, const v_reg<uint64, n>& b,
+                                                  const v_reg<uint64, n>& c, const v_reg<uint64, n>& d,
+                                                  const v_reg<uint64, n>& e, const v_reg<uint64, n>& f,
+                                                  const v_reg<uint64, n>& g, const v_reg<uint64, n>& h)
+{
+    v_reg<uchar, 8*n> packed;  // filled in argument order, n lanes per input
+    _pack_b(&packed.s[0], a, b);
+    _pack_b(&packed.s[2*n], c, d);
+    _pack_b(&packed.s[4*n], e, f);
+    _pack_b(&packed.s[6*n], g, h);
+    return packed;
+}
+//! @}
+
+/** @brief Matrix multiplication
+
+Scheme:
+@code
+{A0 A1 A2 A3}   |V0|
+{B0 B1 B2 B3}   |V1|
+{C0 C1 C2 C3}   |V2|
+{D0 D1 D2 D3} x |V3|
+====================
+{R0 R1 R2 R3}, where:
+R0 = A0V0 + B0V1 + C0V2 + D0V3,
+R1 = A1V0 + B1V1 + C1V2 + D1V3
+...
+@endcode
+*/
+template<int n>
+inline v_reg<float, n> v_matmul(const v_reg<float, n>& v,
+                                const v_reg<float, n>& a, const v_reg<float, n>& b,
+                                const v_reg<float, n>& c, const v_reg<float, n>& d)
+{
+    // Every complete group of four lanes is processed independently: v's four lanes weight a, b, c and d.
+    v_reg<float, n> res;
+    for (int base = 0; base + 4 <= n; base += 4)
+    {
+        const float v0 = v.s[base], v1 = v.s[base + 1], v2 = v.s[base + 2], v3 = v.s[base + 3];
+        for (int k = base; k < base + 4; ++k)
+            res.s[k] = v0 * a.s[k] + v1 * b.s[k] + v2 * c.s[k] + v3 * d.s[k];
+    }
+    return res;
+}
+
+/** @brief Matrix multiplication and add
+
+Scheme:
+@code
+{A0 A1 A2 A3}   |V0|   |D0|
+{B0 B1 B2 B3}   |V1|   |D1|
+{C0 C1 C2 C3} x |V2| + |D2|
+====================   |D3|
+{R0 R1 R2 R3}, where:
+R0 = A0V0 + B0V1 + C0V2 + D0,
+R1 = A1V0 + B1V1 + C1V2 + D1
+...
+@endcode
+*/
+template<int n>
+inline v_reg<float, n> v_matmuladd(const v_reg<float, n>& v,
+                                   const v_reg<float, n>& a, const v_reg<float, n>& b,
+                                   const v_reg<float, n>& c, const v_reg<float, n>& d)
+{
+    // Per group of four lanes: the first three lanes of v weight a, b and c; d is added unscaled.
+    v_reg<float, n> res;
+    for (int base = 0; base + 4 <= n; base += 4)
+    {
+        const float v0 = v.s[base], v1 = v.s[base + 1], v2 = v.s[base + 2];
+        for (int k = base; k < base + 4; ++k)
+            res.s[k] = v0 * a.s[k] + v1 * b.s[k] + v2 * c.s[k] + d.s[k];
+    }
+    return res;
+}
+
+
+template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b)
+{ const auto hi = v_cvt_f64_high(a) * v_cvt_f64_high(b); return v_fma(v_cvt_f64(a), v_cvt_f64(b), hi); }
+template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b,
+                                                           const v_reg<double, n/2>& c)
+{ const auto hi = v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c); return v_fma(v_cvt_f64(a), v_cvt_f64(b), hi); }
+
+template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b)
+{ const v_reg<double, n/2> sum = v_dotprod_expand(a, b); return sum; } // forwards to v_dotprod_expand
+template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b,
+                                                                const v_reg<double, n/2>& c)
+{ const v_reg<double, n/2> sum = v_dotprod_expand(a, b, c); return sum; } // forwards to v_dotprod_expand
+
+////// FP16 support ///////
+
+inline v_reg<float, simd128_width / sizeof(float)>
+v_load_expand(const hfloat* ptr)
+{
+    // Reads one hfloat per output lane and stores it converted to float.
+    v_reg<float, simd128_width / sizeof(float)> widened;
+    const int lanes = widened.nlanes;
+    for( int lane = 0; lane < lanes; ++lane )
+        widened.s[lane] = ptr[lane];
+    return widened;
+}
+#if CV_SIMD256
+inline v_reg<float, simd256_width / sizeof(float)>
+v256_load_expand(const hfloat* ptr)
+{
+    // Reads one hfloat per output lane and stores it converted to float.
+    v_reg<float, simd256_width / sizeof(float)> widened;
+    const int lanes = widened.nlanes;
+    for (int lane = 0; lane < lanes; ++lane)
+        widened.s[lane] = ptr[lane];
+    return widened;
+}
+#endif
+#if CV_SIMD512
+inline v_reg<float, simd512_width / sizeof(float)>
+v512_load_expand(const hfloat* ptr)
+{
+    // Reads one hfloat per output lane and stores it converted to float.
+    v_reg<float, simd512_width / sizeof(float)> widened;
+    const int lanes = widened.nlanes;
+    for (int lane = 0; lane < lanes; ++lane)
+        widened.s[lane] = ptr[lane];
+    return widened;
+}
+#endif
+
+template<int n> inline void
+v_pack_store(hfloat* ptr, const v_reg<float, n>& v)
+{
+    // Converts every float lane to hfloat and writes it to consecutive slots of ptr.
+    const int lanes = v.nlanes;
+    for( int lane = 0; lane < lanes; ++lane )
+        ptr[lane] = hfloat(v.s[lane]);
+}
+
+inline void v_cleanup() {} // no-op: empty body in this implementation
+#if CV_SIMD256
+inline void v256_cleanup() {} // no-op, only declared when CV_SIMD256 is enabled
+#endif
+#if CV_SIMD512
+inline void v512_cleanup() {} // no-op, only declared when CV_SIMD512 is enabled
+#endif
+
+//! @}
+
+#ifndef CV_DOXYGEN
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+#endif
+}
+
+#if !defined(CV_DOXYGEN)
+#undef CV_SIMD256
+#undef CV_SIMD512
+#endif
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_forward.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_forward.hpp
new file mode 100644
index 000000000000..28f67cc9ef97
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_forward.hpp
@@ -0,0 +1,191 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef CV__SIMD_FORWARD
+#error "Need to pre-define forward width"
+#endif
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+/** Types: pick the vector type names and the function prefix for the requested forward width **/
+#if CV__SIMD_FORWARD == 1024
+// TODO: 1024-bit forward declarations are not provided; including with this width is a hard error
+#error "1024-long ops not implemented yet"
+#elif CV__SIMD_FORWARD == 512
+// 512: v512_-prefixed functions and the 512-bit vector types
+#define __CV_VX(fun)   v512_##fun
+#define __CV_V_UINT8   v_uint8x64
+#define __CV_V_INT8    v_int8x64
+#define __CV_V_UINT16  v_uint16x32
+#define __CV_V_INT16   v_int16x32
+#define __CV_V_UINT32  v_uint32x16
+#define __CV_V_INT32   v_int32x16
+#define __CV_V_UINT64  v_uint64x8
+#define __CV_V_INT64   v_int64x8
+#define __CV_V_FLOAT32 v_float32x16
+#define __CV_V_FLOAT64 v_float64x8
+struct v_uint8x64;
+struct v_int8x64;
+struct v_uint16x32;
+struct v_int16x32;
+struct v_uint32x16;
+struct v_int32x16;
+struct v_uint64x8;
+struct v_int64x8;
+struct v_float32x16;
+struct v_float64x8;
+#elif CV__SIMD_FORWARD == 256
+// 256: v256_-prefixed functions and the 256-bit vector types
+#define __CV_VX(fun)   v256_##fun
+#define __CV_V_UINT8   v_uint8x32
+#define __CV_V_INT8    v_int8x32
+#define __CV_V_UINT16  v_uint16x16
+#define __CV_V_INT16   v_int16x16
+#define __CV_V_UINT32  v_uint32x8
+#define __CV_V_INT32   v_int32x8
+#define __CV_V_UINT64  v_uint64x4
+#define __CV_V_INT64   v_int64x4
+#define __CV_V_FLOAT32 v_float32x8
+#define __CV_V_FLOAT64 v_float64x4
+struct v_uint8x32;
+struct v_int8x32;
+struct v_uint16x16;
+struct v_int16x16;
+struct v_uint32x8;
+struct v_int32x8;
+struct v_uint64x4;
+struct v_int64x4;
+struct v_float32x8;
+struct v_float64x4;
+#else
+// 128: fallback for any other value of CV__SIMD_FORWARD; functions use the plain v_ prefix
+#define __CV_VX(fun)   v_##fun
+#define __CV_V_UINT8   v_uint8x16
+#define __CV_V_INT8    v_int8x16
+#define __CV_V_UINT16  v_uint16x8
+#define __CV_V_INT16   v_int16x8
+#define __CV_V_UINT32  v_uint32x4
+#define __CV_V_INT32   v_int32x4
+#define __CV_V_UINT64  v_uint64x2
+#define __CV_V_INT64   v_int64x2
+#define __CV_V_FLOAT32 v_float32x4
+#define __CV_V_FLOAT64 v_float64x2
+struct v_uint8x16;
+struct v_int8x16;
+struct v_uint16x8;
+struct v_int16x8;
+struct v_uint32x4;
+struct v_int32x4;
+struct v_uint64x2;
+struct v_int64x2;
+struct v_float32x4;
+struct v_float64x2;
+#endif // CV__SIMD_FORWARD width selection
+
+/** Value reordering (forward declarations only; definitions live in the width-specific headers) **/
+
+// Expansion: one input vector, two output vectors of the next wider lane type
+void v_expand(const __CV_V_UINT8&,  __CV_V_UINT16&, __CV_V_UINT16&);
+void v_expand(const __CV_V_INT8&,   __CV_V_INT16&,  __CV_V_INT16&);
+void v_expand(const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&);
+void v_expand(const __CV_V_INT16&,  __CV_V_INT32&,  __CV_V_INT32&);
+void v_expand(const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&);
+void v_expand(const __CV_V_INT32&,  __CV_V_INT64&,  __CV_V_INT64&);
+// Low Expansion: returns a single vector of the next wider lane type
+__CV_V_UINT16 v_expand_low(const __CV_V_UINT8&);
+__CV_V_INT16  v_expand_low(const __CV_V_INT8&);
+__CV_V_UINT32 v_expand_low(const __CV_V_UINT16&);
+__CV_V_INT32  v_expand_low(const __CV_V_INT16&);
+__CV_V_UINT64 v_expand_low(const __CV_V_UINT32&);
+__CV_V_INT64  v_expand_low(const __CV_V_INT32&);
+// High Expansion: returns a single vector of the next wider lane type
+__CV_V_UINT16 v_expand_high(const __CV_V_UINT8&);
+__CV_V_INT16  v_expand_high(const __CV_V_INT8&);
+__CV_V_UINT32 v_expand_high(const __CV_V_UINT16&);
+__CV_V_INT32  v_expand_high(const __CV_V_INT16&);
+__CV_V_UINT64 v_expand_high(const __CV_V_UINT32&);
+__CV_V_INT64  v_expand_high(const __CV_V_INT32&);
+// Load & Low Expansion (function name takes the width prefix chosen by __CV_VX)
+__CV_V_UINT16 __CV_VX(load_expand)(const uchar*);
+__CV_V_INT16  __CV_VX(load_expand)(const schar*);
+__CV_V_UINT32 __CV_VX(load_expand)(const ushort*);
+__CV_V_INT32  __CV_VX(load_expand)(const short*);
+__CV_V_UINT64 __CV_VX(load_expand)(const uint*);
+__CV_V_INT64  __CV_VX(load_expand)(const int*);
+// Load lower 8-bit and expand into 32-bit
+__CV_V_UINT32 __CV_VX(load_expand_q)(const uchar*);
+__CV_V_INT32  __CV_VX(load_expand_q)(const schar*);
+
+// Saturating Pack: two input vectors, one output vector of the next narrower lane type
+__CV_V_UINT8  v_pack(const __CV_V_UINT16&, const __CV_V_UINT16&);
+__CV_V_INT8   v_pack(const __CV_V_INT16&,  const __CV_V_INT16&);
+__CV_V_UINT16 v_pack(const __CV_V_UINT32&, const __CV_V_UINT32&);
+__CV_V_INT16  v_pack(const __CV_V_INT32&,  const __CV_V_INT32&);
+// Non-saturating Pack (64-bit inputs)
+__CV_V_UINT32 v_pack(const __CV_V_UINT64&, const __CV_V_UINT64&);
+__CV_V_INT32  v_pack(const __CV_V_INT64&,  const __CV_V_INT64&);
+// Pack signed integers with unsigned saturation (signed inputs, unsigned result type)
+__CV_V_UINT8  v_pack_u(const __CV_V_INT16&, const __CV_V_INT16&);
+__CV_V_UINT16 v_pack_u(const __CV_V_INT32&, const __CV_V_INT32&);
+
+/** Arithmetic, bitwise and comparison operations **/
+
+// Non-saturating multiply (VSX builds declare a single template instead of per-type overloads)
+#if CV_VSX
+template<typename Tvec>
+Tvec v_mul_wrap(const Tvec& a, const Tvec& b);
+#else
+__CV_V_UINT8  v_mul_wrap(const __CV_V_UINT8&,  const __CV_V_UINT8&);
+__CV_V_INT8   v_mul_wrap(const __CV_V_INT8&,   const __CV_V_INT8&);
+__CV_V_UINT16 v_mul_wrap(const __CV_V_UINT16&, const __CV_V_UINT16&);
+__CV_V_INT16  v_mul_wrap(const __CV_V_INT16&,  const __CV_V_INT16&);
+#endif // CV_VSX
+
+// Multiply and expand: two inputs, two outputs of the next wider lane type
+#if CV_VSX
+template<typename Tvec, typename Twvec>
+void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d);
+#else
+void v_mul_expand(const __CV_V_UINT8&,  const __CV_V_UINT8&,  __CV_V_UINT16&, __CV_V_UINT16&);
+void v_mul_expand(const __CV_V_INT8&,   const __CV_V_INT8&,   __CV_V_INT16&,  __CV_V_INT16&);
+void v_mul_expand(const __CV_V_UINT16&, const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&);
+void v_mul_expand(const __CV_V_INT16&,  const __CV_V_INT16&,  __CV_V_INT32&,  __CV_V_INT32&);
+void v_mul_expand(const __CV_V_UINT32&, const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&);
+void v_mul_expand(const __CV_V_INT32&,  const __CV_V_INT32&,  __CV_V_INT64&,  __CV_V_INT64&);
+#endif // CV_VSX
+
+// Conversions between the int32/int64 and float32/float64 vector types
+__CV_V_FLOAT32 v_cvt_f32(const __CV_V_INT32& a);
+__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a);
+__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a, const __CV_V_FLOAT64& b);
+__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT32& a);
+__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_INT32& a);
+__CV_V_FLOAT64 v_cvt_f64(const __CV_V_FLOAT32& a);
+__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_FLOAT32& a);
+__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT64& a);
+
+/** Cleanup: drop every helper macro so the header can be included again with another width **/
+#undef CV__SIMD_FORWARD // the includer must define it again before the next include (see the #error at the top)
+#undef __CV_VX
+#undef __CV_V_UINT8
+#undef __CV_V_INT8
+#undef __CV_V_UINT16
+#undef __CV_V_INT16
+#undef __CV_V_UINT32
+#undef __CV_V_INT32
+#undef __CV_V_UINT64
+#undef __CV_V_INT64
+#undef __CV_V_FLOAT32
+#undef __CV_V_FLOAT64
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+} // cv::
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_lasx.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_lasx.hpp
new file mode 100644
index 000000000000..4a98dbf96ebe
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_lasx.hpp
@@ -0,0 +1,3024 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_HAL_INTRIN_LASX_HPP
+#define OPENCV_HAL_INTRIN_LASX_HPP
+
+#include <lsxintrin.h>
+#include <lasxintrin.h>
+
+#define CV_SIMD256 1
+#define CV_SIMD256_64F 1
+#define CV_SIMD256_FP16 0
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+///////// Utils ////////////
+
+inline __m256i _v256_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7, char v8,  char v9,
+                    char v10, char v11, char v12, char v13, char v14, char v15, char v16, char v17, char v18, char v19,
+                    char v20, char v21, char v22, char v23, char v24, char v25, char v26, char v27, char v28, char v29,
+                    char v30, char v31)
+{
+    // Arguments are placed in the order given: v0 is the first initializer, v31 the last.
+    const v32i8 lanes = { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+                          v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 };
+    return (__m256i)lanes;
+}
+
+inline __m256i _v256_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7, char v8,  char v9,
+                   char v10, char v11, char v12, char v13, char v14, char v15, char v16, char v17, char v18, char v19,
+                   char v20, char v21, char v22, char v23, char v24, char v25, char v26, char v27, char v28, char v29,
+                   char v30, char v31)
+{
+    // Arguments are placed in reverse order: v31 is the first initializer, v0 the last.
+    const v32i8 lanes = { v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+                          v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0 };
+    return (__m256i)lanes;
+}
+
+inline __m256i _v256_setr_h(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7,
+                            short v8,  short v9, short v10, short v11, short v12, short v13, short v14, short v15) {
+    const v16i16 lanes = { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 }; // given order
+    return (__m256i)lanes;
+}
+
+inline __m256i _v256_setr_w(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7) {
+    const v8i32 lanes = { v0, v1, v2, v3, v4, v5, v6, v7 }; // initializers in the order given
+    return (__m256i)lanes;
+}
+
+inline __m256i _v256_set_w(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7) {
+    const v8i32 lanes = { v7, v6, v5, v4, v3, v2, v1, v0 }; // initializers in reverse argument order
+    return (__m256i)lanes;
+}
+
+inline __m256i _v256_setall_w(int v0) {
+    const v8i32 lanes = { v0, v0, v0, v0, v0, v0, v0, v0 }; // same value in all eight initializers
+    return (__m256i)lanes;
+}
+
+inline __m256i _v256_setr_d(int64 v0, int64 v1, int64 v2, int64 v3) {
+    const v4i64 lanes = { v0, v1, v2, v3 }; // initializers in the order given
+    return (__m256i)lanes;
+}
+
+inline __m256i _v256_set_d(int64 v0, int64 v1, int64 v2, int64 v3) {
+    const v4i64 lanes = { v3, v2, v1, v0 }; // initializers in reverse argument order
+    return (__m256i)lanes;
+}
+
+inline __m256 _v256_setr_ps(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7) {
+    const v8f32 lanes = { v0, v1, v2, v3, v4, v5, v6, v7 }; // initializers in the order given
+    return (__m256)lanes;
+}
+
+inline __m256 _v256_setall_ps(float f32) {
+    const v8f32 lanes = { f32, f32, f32, f32, f32, f32, f32, f32 }; // same value in all eight initializers
+    return (__m256)lanes;
+}
+
+inline __m256d _v256_setr_pd(double v0, double v1, double v2, double v3) {
+    const v4f64 lanes = { v0, v1, v2, v3 }; // initializers in the order given
+    return (__m256d)lanes;
+}
+
+inline __m256d _v256_setall_pd(double f64) {
+    const v4f64 lanes = { f64, f64, f64, f64 }; // same value in all four initializers
+    return (__m256d)lanes;
+}
+
+inline __m256i _lasx_packus_h(const __m256i& a, const __m256i& b) {
+    const __m256i packed = __lasx_xvssrarni_bu_h(b, a, 0); // operands passed as (b, a), shift count 0
+    return packed;
+}
+
+inline __m256i _lasx_packs_h(const __m256i& a, const __m256i& b) {
+    const __m256i packed = __lasx_xvssrarni_b_h(b, a, 0); // operands passed as (b, a), shift count 0
+    return packed;
+}
+
+inline __m256i _lasx_packus_w(const __m256i& a, const __m256i& b) {
+    const __m256i packed = __lasx_xvssrarni_hu_w(b, a, 0); // operands passed as (b, a), shift count 0
+    return packed;
+}
+
+inline __m256i _lasx_packs_w(const __m256i& a, const __m256i& b) {
+    const __m256i packed = __lasx_xvssrarni_h_w(b, a, 0); // operands passed as (b, a), shift count 0
+    return packed;
+}
+
+inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi) // 256-bit value built from two 128-bit arguments via xvpermi_q, imm 0x02
+{ return __lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02); } // NOTE(review): each __m128i argument is dereferenced through a __m256i pointer, which appears to read past the 128-bit object - confirm this is safe
+
+inline __m256 _v256_combine(const __m128& lo, const __m128& hi) // float variant: same permute, result cast to __m256
+{ return __m256(__lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02)); }
+
+inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi) // double variant: same permute, result cast to __m256d
+{ return __m256d(__lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02)); }
+
+inline __m256i _v256_shuffle_odd_64(const __m256i& v)
+{ const __m256i shuffled = __lasx_xvpermi_d(v, 0xd8); return shuffled; }
+
+inline __m256d _v256_shuffle_odd_64(const __m256d& v)
+{ const __m256i shuffled = __lasx_xvpermi_d(*((__m256i*)&v), 0xd8); return __m256d(shuffled); }
+
+// LASX: only use for permutes that do NOT need zero clearing; imm is passed to xvpermi_q unchanged
+template<int imm>
+inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
+{ return __lasx_xvpermi_q(a, b, imm); }
+
+template<int imm>
+inline __m256 _v256_permute2x128(const __m256& a, const __m256& b) // float variant: operands viewed as __m256i, result cast to __m256
+{ return __m256(__lasx_xvpermi_q(*((__m256i*)&a), *((__m256i*)&b), imm)); }
+
+template<int imm>
+inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b) // double variant: operands viewed as __m256i, result cast to __m256d
+{ return __m256d(__lasx_xvpermi_q(*((__m256i*)&a), *((__m256i*)&b), imm)); }
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b) // wrapper for types that expose the raw register as .val
+{ return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }
+
+template<int imm>
+inline __m256i _v256_permute4x64(const __m256i& a) // imm is passed to xvpermi_d unchanged
+{ return __lasx_xvpermi_d(a, imm); }
+
+template<int imm>
+inline __m256d _v256_permute4x64(const __m256d& a) // double variant: operand viewed as __m256i, result cast to __m256d
+{ return __m256d(__lasx_xvpermi_d(*((__m256i*)&a), imm)); }
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v256_permute4x64(const _Tpvec& a) // wrapper for types that expose the raw register as .val
+{ return _Tpvec(_v256_permute4x64<imm>(a.val)); }
+
+inline __m128i _v256_extract_high(const __m256i& v)
+{ __m256i temp256i = __lasx_xvpermi_d(v, 0x4E); // presumably 0x4E swaps the two 128-bit halves (2-bit selectors 2,3,0,1) - confirm against the LASX manual
+  return *((__m128i*)&temp256i); } // returns the first 128 bits of the permuted copy
+
+inline __m128  _v256_extract_high(const __m256& v) // float variant: reuses the integer overload and casts the result
+{ return __m128(_v256_extract_high(*((__m256i*)&v))); }
+
+inline __m128d _v256_extract_high(const __m256d& v) // double variant: reuses the integer overload and casts the result
+{ return __m128d(_v256_extract_high(*((__m256i*)&v))); }
+
+inline __m128i _v256_extract_low(const __m256i& v) // reads the first 128 bits of v through a __m128i pointer
+{ return *((__m128i*)&v); }
+
+inline __m128  _v256_extract_low(const __m256& v) // float variant: reuses the integer overload and casts the result
+{ return __m128(_v256_extract_low(*((__m256i*)&v))); }
+
+inline __m128d _v256_extract_low(const __m256d& v) // double variant: reuses the integer overload and casts the result
+{ return __m128d(_v256_extract_low(*((__m256i*)&v))); }
+
+inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b) {
+    const __m256i packed = __lasx_xvssrlrni_hu_w(b, a, 0); // operands passed as (b, a), shift count 0
+    return packed;
+}
+
+template<int i>
+inline int _v256_extract_b(const __m256i& a)
+{
+    int stored = 0;                       // pre-zeroed: bytes the store does not touch stay 0
+    __lasx_xvstelm_b(a, &stored, 0, i);   // element i of a is written at the start of stored
+    return stored;
+}
+
+template<int i>
+inline int _v256_extract_h(const __m256i& a)
+{
+    int stored = 0;                       // pre-zeroed: bytes the store does not touch stay 0
+    __lasx_xvstelm_h(a, &stored, 0, i);   // element i of a is written at the start of stored
+    return stored;
+}
+
+template<int i>
+inline int _v256_extract_w(const __m256i& a) {
+    const int lane = __lasx_xvpickve2gr_w(a, i); // element index i is a compile-time constant
+    return lane;
+}
+
+template<int i>
+inline int64 _v256_extract_d(const __m256i& a) {
+    const int64 lane = __lasx_xvpickve2gr_d(a, i); // element index i is a compile-time constant
+    return lane;
+}
+
+///////// Types ////////////
+
+struct v_uint8x32
+{
+    typedef uchar lane_type;
+    enum { nlanes = 32 };
+    __m256i val;
+
+    explicit v_uint8x32(__m256i v) : val(v) {}
+    // Builds the register from 32 lane values; each one is cast to char for _v256_setr_b.
+    v_uint8x32(uchar v0,  uchar v1,  uchar v2,  uchar v3,
+               uchar v4,  uchar v5,  uchar v6,  uchar v7,
+               uchar v8,  uchar v9,  uchar v10, uchar v11,
+               uchar v12, uchar v13, uchar v14, uchar v15,
+               uchar v16, uchar v17, uchar v18, uchar v19,
+               uchar v20, uchar v21, uchar v22, uchar v23,
+               uchar v24, uchar v25, uchar v26, uchar v27,
+               uchar v28, uchar v29, uchar v30, uchar v31)
+        : val(_v256_setr_b((char)v0, (char)v1, (char)v2, (char)v3,
+            (char)v4,  (char)v5,  (char)v6 , (char)v7,  (char)v8,  (char)v9,
+            (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15,
+            (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21,
+            (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
+            (char)v28, (char)v29, (char)v30, (char)v31))
+    {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint8x32() {}
+
+    uchar get0() const {
+        uchar lane0 = 0;
+        __lasx_xvstelm_b(val, &lane0, 0, 0);
+        return lane0;
+    }
+};
+
+struct v_int8x32
+{
+    typedef schar lane_type;
+    enum { nlanes = 32 };
+    __m256i val;
+
+    explicit v_int8x32(__m256i v) : val(v) {}
+    // Builds the register from 32 lane values handed to _v256_setr_b in the order given.
+    v_int8x32(schar v0,  schar v1,  schar v2,  schar v3,
+              schar v4,  schar v5,  schar v6,  schar v7,
+              schar v8,  schar v9,  schar v10, schar v11,
+              schar v12, schar v13, schar v14, schar v15,
+              schar v16, schar v17, schar v18, schar v19,
+              schar v20, schar v21, schar v22, schar v23,
+              schar v24, schar v25, schar v26, schar v27,
+              schar v28, schar v29, schar v30, schar v31)
+        : val(_v256_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
+            v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+            v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31))
+    {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int8x32() {}
+
+    schar get0() const {
+        schar lane0 = 0;
+        __lasx_xvstelm_b(val, &lane0, 0, 0);
+        return lane0;
+    }
+};
+
+struct v_uint16x16
+{
+    typedef ushort lane_type;
+    enum { nlanes = 16 };
+    __m256i val;
+
+    explicit v_uint16x16(__m256i v) : val(v) {}
+    // Builds the register from 16 lane values; each one is cast to short for _v256_setr_h.
+    v_uint16x16(ushort v0,  ushort v1,  ushort v2,  ushort v3,
+                ushort v4,  ushort v5,  ushort v6,  ushort v7,
+                ushort v8,  ushort v9,  ushort v10, ushort v11,
+                ushort v12, ushort v13, ushort v14, ushort v15)
+        : val(_v256_setr_h((short)v0, (short)v1, (short)v2, (short)v3,
+            (short)v4,  (short)v5,  (short)v6,  (short)v7,  (short)v8,  (short)v9,
+            (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15))
+    {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint16x16() {}
+
+    ushort get0() const {
+        ushort lane0 = 0;
+        __lasx_xvstelm_h(val, &lane0, 0, 0);
+        return lane0;
+    }
+};
+
+struct v_int16x16
+{
+    typedef short lane_type;
+    enum { nlanes = 16 };
+    __m256i val;
+    // Adopt an already-populated 256-bit register as-is.
+    explicit v_int16x16(__m256i v) : val(v) {}
+    v_int16x16(short v0,  short v1,  short v2,  short v3,
+               short v4,  short v5,  short v6,  short v7,
+               short v8,  short v9,  short v10, short v11,
+               short v12, short v13, short v14, short v15)
+        : val(_v256_setr_h(v0,  v1,  v2,  v3,  v4,  v5,
+                           v6,  v7,  v8,  v9,  v10, v11,
+                           v12, v13, v14, v15))
+    {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int16x16() {}
+    // Lane 0, obtained by storing element 0 of the register into a scalar.
+    short get0() const {
+        short first = 0;
+        __lasx_xvstelm_h(val, &first, 0, 0);
+        return first;
+    }
+};
+
+struct v_uint32x8
+{
+    typedef unsigned lane_type;
+    enum { nlanes = 8 };
+    __m256i val;
+    // Adopt an already-populated 256-bit register as-is.
+    explicit v_uint32x8(__m256i v) : val(v) {}
+    v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
+               unsigned v4, unsigned v5, unsigned v6, unsigned v7)
+        : val(_v256_setr_w((unsigned)v0, (unsigned)v1, (unsigned)v2,
+                           (unsigned)v3, (unsigned)v4, (unsigned)v5,
+                           (unsigned)v6, (unsigned)v7))
+    {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint32x8() {}
+    // Lane 0, read through the unsigned 32-bit element pick intrinsic.
+    unsigned get0() const { return __lasx_xvpickve2gr_wu(val, 0); }
+};
+
+struct v_int32x8
+{
+    typedef int lane_type;
+    enum { nlanes = 8 };
+    __m256i val;
+    // Adopt an already-populated 256-bit register as-is.
+    explicit v_int32x8(__m256i v) : val(v) {}
+    v_int32x8(int v0, int v1, int v2, int v3,
+              int v4, int v5, int v6, int v7)
+        : val(_v256_setr_w(v0, v1, v2, v3,
+                           v4, v5, v6, v7))
+    {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int32x8() {}
+    // Lane 0, read through the signed 32-bit element pick intrinsic.
+    int get0() const { return __lasx_xvpickve2gr_w(val, 0); }
+};
+
+struct v_float32x8
+{
+    typedef float lane_type;
+    enum { nlanes = 8 };
+    __m256 val;
+    // Wrap a float register, or reinterpret the bits of an integer register.
+    explicit v_float32x8(__m256 v) : val(v) {}
+    explicit v_float32x8(__m256i v) : val(*((__m256*)&v)) {}
+    v_float32x8(float v0, float v1, float v2, float v3,
+                float v4, float v5, float v6, float v7)
+        : val(_v256_setr_ps(v0, v1, v2, v3,
+                            v4, v5, v6, v7))
+    {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_float32x8() {}
+    // Lane 0 as a float.
+    float get0() const {
+        float first = 0;
+        __lasx_xvstelm_w(*((__m256i*)&val), &first, 0, 0);
+        return first;
+    }
+    // Lane 0 with its 32 bits stored straight into an int (no numeric conversion).
+    int get0toint() const {
+        int firstBits = 0;
+        __lasx_xvstelm_w(*((__m256i*)&val), &firstBits, 0, 0);
+        return firstBits;
+    }
+};
+
+struct v_uint64x4
+{
+    typedef uint64 lane_type;
+    enum { nlanes = 4 };
+    __m256i val;
+    // Adopt an already-populated 256-bit register as-is.
+    explicit v_uint64x4(__m256i v) : val(v) {}
+    v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
+        : val(_v256_setr_d((int64)v0, (int64)v1, (int64)v2, (int64)v3)) {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint64x4() {}
+    // Lane 0, read through the unsigned 64-bit element pick intrinsic.
+    uint64 get0() const {
+        const uint64 first = __lasx_xvpickve2gr_du(val, 0);
+        return first;
+    }
+};
+
+struct v_int64x4
+{
+    typedef int64 lane_type;
+    enum { nlanes = 4 };
+    __m256i val;
+    // Adopt an already-populated 256-bit register as-is.
+    explicit v_int64x4(__m256i v) : val(v) {}
+    v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
+        : val(_v256_setr_d(v0, v1, v2, v3)) {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int64x4() {}
+    // Lane 0, read through the signed 64-bit element pick intrinsic.
+    int64 get0() const {
+        const int64 first = __lasx_xvpickve2gr_d(val, 0);
+        return first;
+    }
+};
+
+struct v_float64x4
+{
+    typedef double lane_type;
+    enum { nlanes = 4 };
+    __m256d val;
+    // Wrap a double register, or reinterpret the bits of an integer register.
+    explicit v_float64x4(__m256d v) : val(v) {}
+    explicit v_float64x4(__m256i v) : val(*((__m256d*)&v)) {}
+    v_float64x4(double v0, double v1, double v2, double v3)
+        : val(_v256_setr_pd(v0, v1, v2, v3)) {}
+    /* coverity[uninit_ctor]: suppress warning */
+    v_float64x4() {}
+    // Lane 0 as a double.
+    double get0() const {
+        double first = 0;
+        __lasx_xvstelm_d(*((__m256i*)&val), &first, 0, 0);
+        return first;
+    }
+    // Lane 0 with its 64 bits stored straight into an int64 (no numeric conversion).
+    int64 get0toint64() const {
+        int64 firstBits = 0;
+        __lasx_xvstelm_d(*((__m256i*)&val), &firstBits, 0, 0);
+        return firstBits;
+    }
+};
+
+//////////////// Load and store operations ///////////////
+
+#define OPENCV_HAL_IMPL_LASX_LOADSTORE(_Tpvec, _Tp)                   \
+    inline _Tpvec v256_load(const _Tp* ptr)                           \
+    { return _Tpvec(__lasx_xvld(ptr, 0)); }                           \
+    inline _Tpvec v256_load_aligned(const _Tp* ptr)                   \
+    { return _Tpvec(__lasx_xvld(ptr, 0)); }                           \
+    inline _Tpvec v256_load_low(const _Tp* ptr)                       \
+    {                                                                 \
+        __m128i lo128 = __lsx_vld(ptr, 0);                            \
+        return _Tpvec(*((__m256i*)&lo128));                           \
+    }                                                                 \
+    inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1)  \
+    {                                                                 \
+        __m128i lo128 = __lsx_vld(ptr0, 0);                           \
+        __m128i hi128 = __lsx_vld(ptr1, 0);                           \
+        return _Tpvec(_v256_combine(lo128, hi128));                   \
+    }                                                                 \
+    inline void v_store(_Tp* ptr, const _Tpvec& a)                    \
+    { __lasx_xvst(a.val, ptr, 0); }                                   \
+    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)            \
+    { __lasx_xvst(a.val, ptr, 0); }                                   \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)    \
+    { __lasx_xvst(a.val, ptr, 0); }                                   \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+    {                                                                 \
+        /* STORE_UNALIGNED, STORE_ALIGNED_NOCACHE and every other  */ \
+        /* mode all issue the same __lasx_xvst, so the mode        */ \
+        /* argument does not influence the store that is emitted;  */ \
+        /* it is only kept for the common v_store() signature.     */ \
+        (void)mode;                                                   \
+        __lasx_xvst(a.val, ptr, 0);                                   \
+    }                                                                 \
+    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                \
+    { __lsx_vst(_v256_extract_low(a.val), ptr, 0); }                  \
+    inline void v_store_high(_Tp* ptr, const _Tpvec& a)               \
+    { __lsx_vst(_v256_extract_high(a.val), ptr, 0); }
+
+OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint8x32,  uchar)
+OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int8x32,   schar)
+OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint16x16, ushort)
+OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int16x16,  short)
+OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint32x8,  unsigned)
+OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int32x8,   int)
+OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint64x4,  uint64)
+OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int64x4,   int64)
+
+
+#define OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg)          \
+    inline _Tpvec v256_load(const _Tp* ptr)                               \
+    { return _Tpvec(__lasx_xvld(ptr, 0)); }                               \
+    inline _Tpvec v256_load_aligned(const _Tp* ptr)                       \
+    { return _Tpvec(__lasx_xvld(ptr, 0)); }                               \
+    inline _Tpvec v256_load_low(const _Tp* ptr)                           \
+    {                                                                     \
+        __m128i lo128 = __lsx_vld(ptr, 0);                                \
+        return _Tpvec(*((__m256i*)&lo128));                               \
+    }                                                                     \
+    inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1)      \
+    {                                                                     \
+        halfreg lo128 = __lsx_vld(ptr0, 0);                               \
+        halfreg hi128 = __lsx_vld(ptr1, 0);                               \
+        return _Tpvec(_v256_combine(lo128, hi128));                       \
+    }                                                                     \
+    inline void v_store(_Tp* ptr, const _Tpvec& a)                        \
+    { __lasx_xvst(a.val, ptr, 0); }                                       \
+    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)                \
+    { __lasx_xvst(a.val, ptr, 0); }                                       \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)        \
+    { __lasx_xvst(a.val, ptr, 0); }                                       \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+    {                                                                     \
+        /* STORE_UNALIGNED, STORE_ALIGNED_NOCACHE and every other  */     \
+        /* mode all issue the same __lasx_xvst, so the mode        */     \
+        /* argument does not influence the store that is emitted;  */     \
+        /* it is only kept for the common v_store() signature.     */     \
+        (void)mode;                                                       \
+        __lasx_xvst(a.val, ptr, 0);                                       \
+    }                                                                     \
+    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                    \
+    { __lsx_vst(_v256_extract_low(a.val), ptr, 0); }                      \
+    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                   \
+    { __lsx_vst(_v256_extract_high(a.val), ptr, 0); }
+
+OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(v_float32x8, float, __m128i)
+OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(v_float64x4, double, __m128i)
+
+
+inline __m256i _lasx_256_castps_si256(const __m256& v)
+{ return (__m256i)v; }  // vector-to-vector cast of equal size: the bits are kept, not converted
+
+inline __m256i _lasx_256_castpd_si256(const __m256d& v)
+{ return (__m256i)v; }  // vector-to-vector cast of equal size: the bits are kept, not converted
+
+#define OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
+    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a)   \
+    { return _Tpvec(cast(a.val)); } /* one v_reinterpret_as_<suffix>(_Tpvecf) overload: a.val is passed through `cast` and wrapped in _Tpvec */
+
+#define OPENCV_HAL_IMPL_LASX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s)          \
+    inline _Tpvec v256_setzero_##suffix()      /* vector built from the scalar 0 */ \
+    { return _Tpvec(__lasx_xvreplgr2vr_d(0)); }                                   \
+    inline _Tpvec v256_setall_##suffix(_Tp v)  /* v is cast to ctype_s, then replicated */ \
+    { return _Tpvec(__lasx_xvreplgr2vr_##ssuffix((ctype_s)v)); }                  \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32,  suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32,   suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int16x16,  suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint32x8,  suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int32x8,   suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint64x4,  suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int64x4,   suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_float32x8, suffix, _lasx_256_castps_si256) \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_float64x4, suffix, _lasx_256_castpd_si256)
+// Arguments: vector type, lane type, v256_*/v_reinterpret_as_* suffix, xvreplgr2vr suffix, scalar type used for the setall cast.
+OPENCV_HAL_IMPL_LASX_INIT(v_uint8x32,  uchar,    u8,  b,   int)
+OPENCV_HAL_IMPL_LASX_INIT(v_int8x32,   schar,    s8,  b,   int)
+OPENCV_HAL_IMPL_LASX_INIT(v_uint16x16, ushort,   u16, h,  int)
+OPENCV_HAL_IMPL_LASX_INIT(v_int16x16,  short,    s16, h,  int)
+OPENCV_HAL_IMPL_LASX_INIT(v_uint32x8,  unsigned, u32, w,  int)
+OPENCV_HAL_IMPL_LASX_INIT(v_int32x8,   int,      s32, w,  int)
+OPENCV_HAL_IMPL_LASX_INIT(v_uint64x4,  uint64,   u64, d, long int)
+OPENCV_HAL_IMPL_LASX_INIT(v_int64x4,   int64,    s64, d, long int)
+
+
+inline __m256 _lasx_256_castsi256_ps(const __m256i &v)
+{ return (__m256)v; }  // vector-to-vector cast of equal size: the bits are kept, not converted
+
+inline __m256d _lasx_256_castsi256_pd(const __m256i &v)
+{ return (__m256d)v; }  // vector-to-vector cast of equal size: the bits are kept, not converted
+
+#define OPENCV_HAL_IMPL_LASX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
+    inline _Tpvec v256_setzero_##suffix()  /* uses the _Tpvec(__m256i) constructor */ \
+    { return _Tpvec(__lasx_xvreplgr2vr_d(0)); }                           \
+    inline _Tpvec v256_setall_##suffix(_Tp v)  /* delegates to _v256_setall_<zsuffix> */ \
+    { return _Tpvec(_v256_setall_##zsuffix(v)); }                   \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32,  suffix, cast)          \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32,   suffix, cast)          \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, cast)          \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int16x16,  suffix, cast)          \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint32x8,  suffix, cast)          \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int32x8,   suffix, cast)          \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint64x4,  suffix, cast)          \
+    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int64x4,   suffix, cast)
+// Only integer -> float reinterprets are generated here; float <-> float overloads are written out separately.
+OPENCV_HAL_IMPL_LASX_INIT_FLT(v_float32x8, float,  f32, ps, _lasx_256_castsi256_ps)
+OPENCV_HAL_IMPL_LASX_INIT_FLT(v_float64x4, double, f64, pd, _lasx_256_castsi256_pd)
+
+inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a)
+{ return a; }  // already the requested type
+inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
+{ const __m256i bits = _lasx_256_castps_si256(__m256(a.val)); return v_float32x8(bits); }
+
+inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
+{ return a; }  // already the requested type
+inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
+{ const __m256i bits = _lasx_256_castpd_si256(__m256d(a.val)); return v_float64x4(bits); }
+
+
+//////////////// Variant Value reordering ///////////////
+
+// unpacks
+#define OPENCV_HAL_IMPL_LASX_UNPACK(_Tpvec, suffix) /* note: b is the first intrinsic operand, a the second */ \
+    inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b)   \
+    { return _Tpvec(__lasx_xvilvl_##suffix(__m256i(b.val), __m256i(a.val))); }        \
+    inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b)   \
+    { return _Tpvec(__lasx_xvilvh_##suffix(__m256i(b.val), __m256i(a.val))); }
+// The float types reuse the integer interleave of matching lane width (w / d); their registers are cast to __m256i above.
+OPENCV_HAL_IMPL_LASX_UNPACK(v_uint8x32,  b)
+OPENCV_HAL_IMPL_LASX_UNPACK(v_int8x32,   b)
+OPENCV_HAL_IMPL_LASX_UNPACK(v_uint16x16, h)
+OPENCV_HAL_IMPL_LASX_UNPACK(v_int16x16,  h)
+OPENCV_HAL_IMPL_LASX_UNPACK(v_uint32x8,  w)
+OPENCV_HAL_IMPL_LASX_UNPACK(v_int32x8,   w)
+OPENCV_HAL_IMPL_LASX_UNPACK(v_uint64x4,  d)
+OPENCV_HAL_IMPL_LASX_UNPACK(v_int64x4,   d)
+OPENCV_HAL_IMPL_LASX_UNPACK(v_float32x8, w)
+OPENCV_HAL_IMPL_LASX_UNPACK(v_float64x4, d)
+
+
+// shuffle
+// todo: emulate 64bit
+#define OPENCV_HAL_IMPL_LASX_SHUFFLE(_Tpvec, intrin)  \
+    template<int m>  /* m is forwarded unchanged as the xvshuf4i immediate */ \
+    inline _Tpvec v256_shuffle(const _Tpvec& a)      \
+    { return _Tpvec(__lasx_xvshuf4i_##intrin(a.val, m)); }
+// Only the 32-bit integer vector types get a macro-generated v256_shuffle.
+OPENCV_HAL_IMPL_LASX_SHUFFLE(v_uint32x8,  w)
+OPENCV_HAL_IMPL_LASX_SHUFFLE(v_int32x8,   w)
+
+template<int m>
+inline v_float32x8 v256_shuffle(const v_float32x8 &a)
+{ const __m256i bits = *((__m256i*)&a.val); return v_float32x8(__lasx_xvshuf4i_w(bits, m)); }
+
+template<int m>
+inline v_float64x4 v256_shuffle(const v_float64x4 &a)
+{
+    // Bit i of m selects the source lane for output lane i: lanes 0 and 1 take
+    // a[0] or a[1], lanes 2 and 3 take a[2] or a[3]. Each choice becomes one
+    // 2-bit lane index of the xvpermi_d immediate. Building it as a constant
+    // expression keeps the value usable as an instruction immediate.
+    const int imm8 = ((m & 0b0001) ? 0b00000001 : 0)            // lane 0: 0 or 1
+                   | ((m & 0b0010) ? 0b00000100 : 0)            // lane 1: 0 or 1
+                   | ((m & 0b0100) ? 0b00110000 : 0b00100000)   // lane 2: 2 or 3
+                   | ((m & 0b1000) ? 0b11000000 : 0b10000000);  // lane 3: 2 or 3
+    return v_float64x4(__lasx_xvpermi_d(*((__m256i*)&a.val), imm8));
+}
+template<typename _Tpvec>
+inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
+{   // Writes both interleaves of a and b; ab0 is assigned before ab1 is computed.
+    ab0 = v256_unpacklo(a, b);  // xvilvl-based interleave
+    ab1 = v256_unpackhi(a, b);  // xvilvh-based interleave
+}
+
+template<typename _Tpvec>
+inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
+{ return _Tpvec(__lasx_xvpermi_q(a.val, b.val, 0x12)); }  // NOTE(review): 0x12 presumably keeps a's low and b's high 128 bits - confirm against the xvpermi.q encoding
+// The float overloads below repeat the same call; the result goes through the __m256i constructors of the float types.
+inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
+{ return v_float32x8(__lasx_xvpermi_q(a.val, b.val, 0x12)); }
+
+inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
+{ return v_float64x4(__lasx_xvpermi_q(a.val, b.val, 0x12)); }
+
+template<typename _Tpvec>
+inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
+{ return v256_permute2x128<0x03>(a, b); }  // thin wrapper; the selector 0x03 is interpreted by v256_permute2x128 (defined elsewhere in this header)
+
+inline __m256i _v256_alignr_b(const __m256i &a, const __m256i &b, const int imm)
+{
+    if (imm == 8)
+        return __lasx_xvshuf4i_d(b, a, 0x9); // b.d1 a.d0 b.d3 a.d2
+    // Any other offset: build per-byte shuffle indices (0..15 in each 128-bit half, plus imm).
+    const __m256i laneBytes = _v256_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                                           0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+    const __m256i selector = __lasx_xvadd_b(__lasx_xvreplgr2vr_b(imm), laneBytes);
+    return __lasx_xvshuf_b(a, b, selector);
+}
+
+template<typename _Tpvec>
+inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
+{ return _Tpvec(_v256_alignr_b(a.val, b.val, 8)); }  // imm == 8 selects the xvshuf4i_d branch of _v256_alignr_b
+inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b)
+{ return v_float64x4(__lasx_xvshuf4i_d(b.val, a.val, 0x9)); } // b.d1 a.d0 b.d3 a.d2
+// todo: emulate float32
+
+template<typename _Tpvec>
+inline _Tpvec v256_swap_halves(const _Tpvec& a)
+{ return v256_permute2x128<1>(a, a); }  // same vector passed as both sources; selector 1 is interpreted by v256_permute2x128 (defined elsewhere)
+
+template<typename _Tpvec>
+inline _Tpvec v256_reverse_64(const _Tpvec& a)
+{ return v256_permute4x64<0x1b>(a); }  // 0x1b holds the 2-bit fields 0,1,2,3 from high to low; presumably reverses the four 64-bit lanes - confirm in v256_permute4x64
+
+
+// ZIP
+#define OPENCV_HAL_IMPL_LASX_ZIP(_Tpvec)                             \
+    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)    \
+    { return v256_permute2x128<0x02>(a, b); }                        \
+    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)   \
+    { return v256_permute2x128<0x13>(a, b); }                        \
+    inline void v_recombine(const _Tpvec& a, const _Tpvec& b,        \
+                             _Tpvec& c, _Tpvec& d)                   \
+    {                                                                \
+        _Tpvec a1b0 = v256_alignr_128(a, b); /* a1b0: by its name, a's high half with b's low half */ \
+        c = v256_combine_diagonal(a, a1b0);                          \
+        d = v256_combine_diagonal(a1b0, b);                          \
+    }                                                                \
+    inline void v_zip(const _Tpvec& a, const _Tpvec& b,              \
+                      _Tpvec& ab0, _Tpvec& ab1)                      \
+    {                                                                \
+        _Tpvec ab0ab2, ab1ab3;               /* raw v256_zip output, named after the pieces it holds */ \
+        v256_zip(a, b, ab0ab2, ab1ab3);                              \
+        v_recombine(ab0ab2, ab1ab3, ab0, ab1); /* regroup the 128-bit halves into the two outputs */ \
+    }
+// Generates v_combine_low, v_combine_high, v_recombine and v_zip for every 256-bit vector type.
+OPENCV_HAL_IMPL_LASX_ZIP(v_uint8x32)
+OPENCV_HAL_IMPL_LASX_ZIP(v_int8x32)
+OPENCV_HAL_IMPL_LASX_ZIP(v_uint16x16)
+OPENCV_HAL_IMPL_LASX_ZIP(v_int16x16)
+OPENCV_HAL_IMPL_LASX_ZIP(v_uint32x8)
+OPENCV_HAL_IMPL_LASX_ZIP(v_int32x8)
+OPENCV_HAL_IMPL_LASX_ZIP(v_uint64x4)
+OPENCV_HAL_IMPL_LASX_ZIP(v_int64x4)
+OPENCV_HAL_IMPL_LASX_ZIP(v_float32x8)
+OPENCV_HAL_IMPL_LASX_ZIP(v_float64x4)
+
+////////// Arithmetic, bitwise and comparison operations /////////
+
+/** Arithmetics **/
+#define OPENCV_HAL_IMPL_LASX_BIN_OP(bin_op, _Tpvec, intrin) /* defines `a op b` and `a op= b` via intrin(a.val, b.val) */ \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)  \
+    { return _Tpvec(intrin(a.val, b.val)); }                          \
+    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)    \
+    { a.val = intrin(a.val, b.val); return a; }
+// 8- and 16-bit lanes bind + and - to the xvsadd/xvssub intrinsics; 32- and 64-bit lanes use xvadd/xvsub (and xvmul for 32-bit *).
+OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint8x32,  __lasx_xvsadd_bu)
+OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint8x32,  __lasx_xvssub_bu)
+OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int8x32,   __lasx_xvsadd_b)
+OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int8x32,   __lasx_xvssub_b)
+OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint16x16, __lasx_xvsadd_hu)
+OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint16x16, __lasx_xvssub_hu)
+OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int16x16,  __lasx_xvsadd_h)
+OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int16x16,  __lasx_xvssub_h)
+OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint32x8,  __lasx_xvadd_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint32x8,  __lasx_xvsub_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_uint32x8,  __lasx_xvmul_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int32x8,   __lasx_xvadd_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int32x8,   __lasx_xvsub_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_int32x8,   __lasx_xvmul_w)
+OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint64x4,  __lasx_xvadd_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint64x4,  __lasx_xvsub_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int64x4,   __lasx_xvadd_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int64x4,   __lasx_xvsub_d)
+// Floating-point lanes: + - * / map directly onto the xvfadd/xvfsub/xvfmul/xvfdiv intrinsics.
+OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float32x8, __lasx_xvfadd_s)
+OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float32x8, __lasx_xvfsub_s)
+OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float32x8, __lasx_xvfmul_s)
+OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float32x8, __lasx_xvfdiv_s)
+OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float64x4, __lasx_xvfadd_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float64x4, __lasx_xvfsub_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float64x4, __lasx_xvfmul_d)
+OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float64x4, __lasx_xvfdiv_d)
+
+// saturating multiply 8-bit, 16-bit
+inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
+{
+    v_uint16x16 prodLo, prodHi;          // 16-bit products from v_mul_expand
+    v_mul_expand(a, b, prodLo, prodHi);
+    return v_pack(prodLo, prodHi);       // v_pack narrows them back to 8-bit lanes
+}
+inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
+{
+    v_int16x16 prodLo, prodHi;           // 16-bit products from v_mul_expand
+    v_mul_expand(a, b, prodLo, prodHi);
+    return v_pack(prodLo, prodHi);       // v_pack narrows them back to 8-bit lanes
+}
+inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
+{
+    const __m256i lowBits  = __lasx_xvmul_h(a.val, b.val);   // xvmul_h part of each product
+    const __m256i highBits = __lasx_xvmuh_hu(a.val, b.val);  // xvmuh_hu part of each product
+    const __m256i wideLo = __lasx_xvilvl_h(highBits, lowBits);
+    const __m256i wideHi = __lasx_xvilvh_h(highBits, lowBits);
+    return v_uint16x16(_v256_packs_epu32(wideLo, wideHi));
+}
+inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
+{
+    const __m256i lowBits  = __lasx_xvmul_h(a.val, b.val);   // xvmul_h part of each product
+    const __m256i highBits = __lasx_xvmuh_h(a.val, b.val);   // xvmuh_h part of each product
+    const __m256i wideLo = __lasx_xvilvl_h(highBits, lowBits);
+    const __m256i wideHi = __lasx_xvilvh_h(highBits, lowBits);
+    return v_int16x16(_lasx_packs_w(wideLo, wideHi));
+}
+inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
+{ return a = a * b; }
+inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
+{ return a = a * b; }
+inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
+{ return a = a * b; }
+inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
+{ return a = a * b; }
+
+/** Non-saturating arithmetics **/
+
+#define OPENCV_HAL_IMPL_LASX_BIN_FUNC(func, _Tpvec, intrin) /* defines func(a, b) as intrin(a.val, b.val) wrapped in _Tpvec */ \
+    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b)    \
+    { return _Tpvec(intrin(a.val, b.val)); }
+// Signed and unsigned variants of each width share the same plain xvadd/xvsub/xvmul intrinsic.
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_uint8x32,  __lasx_xvadd_b)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_int8x32,   __lasx_xvadd_b)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_uint16x16, __lasx_xvadd_h)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_int16x16,  __lasx_xvadd_h)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_uint8x32,  __lasx_xvsub_b)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_int8x32,   __lasx_xvsub_b)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_uint16x16, __lasx_xvsub_h)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_int16x16,  __lasx_xvsub_h)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_uint16x16, __lasx_xvmul_h)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_int16x16,  __lasx_xvmul_h)
+
+inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
+{
+    const __m256i evenProducts = __lasx_xvmulwev_h_bu(a.val, b.val);  // widening multiply, "ev" variant
+    const __m256i oddProducts  = __lasx_xvmulwod_h_bu(a.val, b.val);  // widening multiply, "od" variant
+    return v_uint8x32(__lasx_xvpackev_b(oddProducts, evenProducts));
+}
+
+inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
+{   // Reuses the unsigned implementation on the reinterpreted operands.
+    const v_uint8x32 wrapped = v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)); return v_reinterpret_as_s8(wrapped);
+}
+
+//  Multiply and expand
+inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
+                         v_uint16x16& c, v_uint16x16& d)
+{
+    v_uint16x16 aLo, aHi, bLo, bHi;   // operands widened to 16-bit lanes by v_expand
+    v_expand(a, aLo, aHi);
+    v_expand(b, bLo, bHi);
+    c = v_mul_wrap(aLo, bLo);
+    d = v_mul_wrap(aHi, bHi);
+}
+
+inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
+                         v_int16x16& c, v_int16x16& d)
+{
+    v_int16x16 aLo, aHi, bLo, bHi;    // operands widened to 16-bit lanes by v_expand
+    v_expand(a, aLo, aHi);
+    v_expand(b, bLo, bHi);
+    c = v_mul_wrap(aLo, bLo);
+    d = v_mul_wrap(aHi, bHi);
+}
+
+inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
+                         v_int32x8& c, v_int32x8& d)
+{
+    // xvmuh_h and v_mul_wrap give two 16-bit parts per product; v_zip pairs them up.
+    const v_int16x16 hiHalves(__lasx_xvmuh_h(a.val, b.val));
+    const v_int16x16 loHalves = v_mul_wrap(a, b);
+    v_int16x16 zipLo, zipHi;
+    v_zip(loHalves, hiHalves, zipLo, zipHi);
+    c = v_reinterpret_as_s32(zipLo);
+    d = v_reinterpret_as_s32(zipHi);
+}
+
+inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
+                         v_uint32x8& c, v_uint32x8& d)
+{
+    // xvmuh_hu and v_mul_wrap give two 16-bit parts per product; v_zip pairs them up.
+    const v_uint16x16 hiHalves(__lasx_xvmuh_hu(a.val, b.val));
+    const v_uint16x16 loHalves = v_mul_wrap(a, b);
+    v_uint16x16 zipLo, zipHi;
+    v_zip(loHalves, hiHalves, zipLo, zipHi);
+    c = v_reinterpret_as_u32(zipLo);
+    d = v_reinterpret_as_u32(zipHi);
+}
+
+inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
+                         v_uint64x4& c, v_uint64x4& d)
+{
+    const v_uint64x4 evenProducts(__lasx_xvmulwev_d_wu(a.val, b.val));  // "ev" widening multiply
+    const v_uint64x4 oddProducts(__lasx_xvmulwod_d_wu(a.val, b.val));   // "od" widening multiply
+    v_zip(evenProducts, oddProducts, c, d);
+}
+
+inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { const __m256i hi = __lasx_xvmuh_h(a.val, b.val); return v_int16x16(hi); }
+inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { const __m256i hi = __lasx_xvmuh_hu(a.val, b.val); return v_uint16x16(hi); }
+
+/** Bitwise shifts **/
+#define OPENCV_HAL_IMPL_LASX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) /* srai: intrinsic used for >> on the signed type */ \
+    inline _Tpuvec operator << (const _Tpuvec& a, int imm)                                        \
+    { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }           \
+    inline _Tpsvec operator << (const _Tpsvec& a, int imm)                                        \
+    { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }           \
+    inline _Tpuvec operator >> (const _Tpuvec& a, int imm)   /* unsigned >> uses xvsrl */          \
+    { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }           \
+    inline _Tpsvec operator >> (const _Tpsvec& a, int imm)   /* signed >> uses the srai argument */ \
+    { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }                            \
+    template<int imm>    /* v_shl / v_shr: same operations with the count as a template argument */ \
+    inline _Tpuvec v_shl(const _Tpuvec& a)                                                        \
+    { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }           \
+    template<int imm>                                                                             \
+    inline _Tpsvec v_shl(const _Tpsvec& a)                                                        \
+    { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }           \
+    template<int imm>                                                                             \
+    inline _Tpuvec v_shr(const _Tpuvec& a)                                                        \
+    { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }           \
+    template<int imm>                                                                             \
+    inline _Tpsvec v_shr(const _Tpsvec& a)                                                        \
+    { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }
+// The shift count is replicated into a vector and applied through the vector-shift intrinsics; no 8-bit variants are generated.
+OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint16x16, v_int16x16, h, __lasx_xvsra_h)
+OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint32x8,  v_int32x8,  w, __lasx_xvsra_w)
+OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4,  v_int64x4,  d, __lasx_xvsra_d)
+
+
+/** Bitwise logic **/
+#define OPENCV_HAL_IMPL_LASX_LOGIC_OP(_Tpvec, suffix, not_const) /* not_const is not referenced by this macro body */ \
+    OPENCV_HAL_IMPL_LASX_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix)   \
+    OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix)    \
+    OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix)   \
+    inline _Tpvec operator ~ (const _Tpvec& a)  /* NOT written as a byte-wise NOR with immediate 0 */ \
+    { return _Tpvec(__lasx_xvnori_b(a.val, 0)); }
+// Integer types: &, |, ^ (with their compound forms) and ~ all use the whole-register "v" intrinsics.
+OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32,   v, __lasx_xvreplgr2vr_w(-1))
+OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int8x32,    v, __lasx_xvreplgr2vr_w(-1))
+OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint16x16,  v, __lasx_xvreplgr2vr_w(-1))
+OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int16x16,   v, __lasx_xvreplgr2vr_w(-1))
+OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint32x8,   v, __lasx_xvreplgr2vr_w(-1))
+OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int32x8,    v, __lasx_xvreplgr2vr_w(-1))
+OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint64x4,   v, __lasx_xvreplgr2vr_d(-1))
+OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int64x4,    v, __lasx_xvreplgr2vr_d(-1))
+
+#define OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) /* operands are read as __m256i through pointer casts */ \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)                            \
+    { return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); }                    \
+    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)  /* result converted back with `cast` */ \
+    { __m256i c = intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val))); a.val = cast(c); return a; }
+// Float counterpart of the integer logic macro: same &, |, ^ intrinsics, applied to the raw register bits.
+#define OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(_Tpvec, suffix, not_const, cast)       \
+    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix, cast)      \
+    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix, cast)       \
+    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix, cast)      \
+    inline _Tpvec operator ~ (const _Tpvec& a)  /* NOT written as XOR with not_const */ \
+    { return _Tpvec(__lasx_xvxor_##suffix(*((__m256i*)(&a.val)), not_const)); }
+// not_const is -1 replicated into every 32-bit (float) or 64-bit (double) element.
+OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float32x8,  v, __lasx_xvreplgr2vr_w(-1), _lasx_256_castsi256_ps)
+OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float64x4,  v, __lasx_xvreplgr2vr_d(-1), _lasx_256_castsi256_pd)
+
+/** Select **/
+#define OPENCV_HAL_IMPL_LASX_SELECT(_Tpvec) /* note the intrinsic operand order: (b, a, mask) */ \
+    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(__lasx_xvbitsel_v(b.val, a.val, mask.val)); }
+// NOTE(review): presumably yields a where mask bits are set and b elsewhere - confirm against xvbitsel.v. No 64-bit integer overloads are generated.
+OPENCV_HAL_IMPL_LASX_SELECT(v_uint8x32)
+OPENCV_HAL_IMPL_LASX_SELECT(v_int8x32)
+OPENCV_HAL_IMPL_LASX_SELECT(v_uint16x16)
+OPENCV_HAL_IMPL_LASX_SELECT(v_int16x16)
+OPENCV_HAL_IMPL_LASX_SELECT(v_uint32x8)
+OPENCV_HAL_IMPL_LASX_SELECT(v_int32x8)
+// Float overloads: same intrinsic, with the registers read as __m256i through pointer casts.
+inline v_float32x8 v_select(const v_float32x8 &mask, const v_float32x8 &a, const v_float32x8 &b)
+{ return v_float32x8(__lasx_xvbitsel_v(*((__m256i*)&b.val), *((__m256i*)&a.val), *((__m256i*)&mask.val))); }
+
+inline v_float64x4 v_select(const v_float64x4 &mask, const v_float64x4 &a, const v_float64x4 &b)
+{ return v_float64x4(__lasx_xvbitsel_v(*((__m256i*)&b.val), *((__m256i*)&a.val), *((__m256i*)&mask.val))); }
+
+/** Comparison **/
+#define OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpvec) /* derives !=, <, >=, <= from an existing == and > */ \
+    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)   \
+    { return ~(a == b); }                                          \
+    inline _Tpvec operator <  (const _Tpvec& a, const _Tpvec& b)   \
+    { return b > a; }                                              \
+    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b)   \
+    { return ~(a < b); }                                           \
+    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b)   \
+    { return b >= a; }
+// == uses xvseq for both signednesses; > is xvslt with swapped operands, using the unsigned suffix for the unsigned type.
+#define OPENCV_HAL_IMPL_LASX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix)   \
+    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b)          \
+    { return _Tpuvec(__lasx_xvseq_##suffix(a.val, b.val)); }                 \
+    inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b)           \
+    {                                                                        \
+        return _Tpuvec(__lasx_xvslt_##usuffix(b.val, a.val));                \
+    }                                                                        \
+    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b)          \
+    { return _Tpsvec(__lasx_xvseq_##suffix(a.val, b.val)); }                 \
+    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b)           \
+    { return _Tpsvec(__lasx_xvslt_##suffix(b.val, a.val)); }                 \
+    OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpuvec)                                  \
+    OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpsvec)
+// Full comparison sets for the 8-, 16- and 32-bit integer types.
+OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint8x32,  v_int8x32,  b, bu)
+OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint16x16, v_int16x16, h, hu)
+OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint32x8,  v_int32x8,  w, wu)
+
+#define OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(_Tpvec, suffix)         \
+    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b)  \
+    { return _Tpvec(__lasx_xvseq_##suffix(a.val, b.val)); }       \
+    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)  \
+    { return ~(a == b); }
+// This macro supplies only == and != for the 64-bit integer vectors; != is the bitwise complement of ==.
+OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_uint64x4, d)
+OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_int64x4, d)
+
+#define OPENCV_HAL_IMPL_LASX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix)    \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)     \
+    { return _Tpvec(__lasx_##suffix##_##ssuffix(a.val, b.val)); }
+// ==, !=, < and <= are bound one-to-one to xvfcmp_ceq / cne / clt / cle; the token pasting yields e.g. __lasx_xvfcmp_ceq_s.
+#define OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(_Tpvec, ssuffix)              \
+    OPENCV_HAL_IMPL_LASX_CMP_FLT(==, xvfcmp_ceq, _Tpvec, ssuffix)     \
+    OPENCV_HAL_IMPL_LASX_CMP_FLT(!=, xvfcmp_cne, _Tpvec, ssuffix)     \
+    OPENCV_HAL_IMPL_LASX_CMP_FLT(<,  xvfcmp_clt, _Tpvec, ssuffix)     \
+    OPENCV_HAL_IMPL_LASX_CMP_FLT(<=, xvfcmp_cle, _Tpvec, ssuffix)
+// NOTE(review): NaN handling of these operators follows the xvfcmp_c* condition codes - confirm against the LASX manual.
+OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float32x8, s)
+OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float64x4, d)
+
+inline v_float32x8 operator > (const v_float32x8 &a, const v_float32x8 &b)
+{ return v_float32x8(__lasx_xvfcmp_clt_s(b.val, a.val)); }
+// a > b is evaluated as b < a and a >= b as b <= a: the same clt/cle intrinsics are called with swapped operands.
+inline v_float32x8 operator >= (const v_float32x8 &a, const v_float32x8 &b)
+{ return v_float32x8(__lasx_xvfcmp_cle_s(b.val, a.val)); }
+// Double-precision versions of the two operators above.
+inline v_float64x4 operator > (const v_float64x4 &a, const v_float64x4 &b)
+{ return v_float64x4(__lasx_xvfcmp_clt_d(b.val, a.val)); }
+
+inline v_float64x4 operator >= (const v_float64x4 &a, const v_float64x4 &b)
+{ return v_float64x4(__lasx_xvfcmp_cle_d(b.val, a.val)); }
+
+inline v_float32x8 v_not_nan(const v_float32x8& a)  // mask from comparing a with itself under the "cor" condition
+{ return v_float32x8(__lasx_xvfcmp_cor_s(a.val, a.val)); }
+inline v_float64x4 v_not_nan(const v_float64x4& a)  // double-precision counterpart
+{ return v_float64x4(__lasx_xvfcmp_cor_d(a.val, a.val)); }
+
+/** min/max: v_min / v_max bound to the LASX xvmin/xvmax (integer) and xvfmin/xvfmax (float) intrinsics **/
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint8x32,  __lasx_xvmin_bu)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint8x32,  __lasx_xvmax_bu)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int8x32,   __lasx_xvmin_b)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int8x32,   __lasx_xvmax_b)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint16x16, __lasx_xvmin_hu)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint16x16, __lasx_xvmax_hu)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int16x16,  __lasx_xvmin_h)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int16x16,  __lasx_xvmax_h)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint32x8,  __lasx_xvmin_wu)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint32x8,  __lasx_xvmax_wu)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int32x8,   __lasx_xvmin_w)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int32x8,   __lasx_xvmax_w)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_float32x8, __lasx_xvfmin_s)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_float32x8, __lasx_xvfmax_s)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_float64x4, __lasx_xvfmin_d)
+OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_float64x4, __lasx_xvfmax_d)
+
+/** Rotate: byte-granular forms on v_uint8x32; imm is a compile-time byte count **/
+template<int imm>
+inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
+{
+    enum {IMM_R = (16 - imm) & 0xFF};
+    enum {IMM_R2 = (32 - imm) & 0xFF};
+    // Trivial counts first: 0 returns a, 32 returns b, anything larger returns a default-constructed vector.
+    if (imm == 0)  return a;
+    if (imm == 32) return b;
+    if (imm > 32)  return v_uint8x32();
+    // swap is assembled from both inputs by _v256_permute2x128<0x21>; the remaining cases byte-align it with a or b.
+    __m256i swap = _v256_permute2x128<0x21>(a.val, b.val);
+    if (imm == 16) return v_uint8x32(swap);
+    if (imm < 16)  return v_uint8x32(_v256_alignr_b(a.val, swap, IMM_R));
+    return v_uint8x32(_v256_alignr_b(swap, b.val, IMM_R2)); // imm < 32
+}
+
+template<int imm>
+inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
+{
+    enum {IMM_L = (imm - 16) & 0xFF};
+    // Trivial counts first: 0 returns a, 32 returns b, anything larger returns a default-constructed vector.
+    if (imm == 0)  return a;
+    if (imm == 32) return b;
+    if (imm > 32)  return v_uint8x32();
+    // swap is assembled from both inputs by _v256_permute2x128<0x03>; the remaining cases byte-align it with a or b.
+    __m256i swap = _v256_permute2x128<0x03>(a.val, b.val);
+    if (imm == 16) return v_uint8x32(swap);
+    if (imm < 16)  return v_uint8x32(_v256_alignr_b(swap, a.val, imm));
+    return v_uint8x32(_v256_alignr_b(b.val, swap, IMM_L));
+}
+
+template<int imm>
+inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
+{
+    enum {IMM_L = (imm - 16) & 0xFF};
+    enum {IMM_R = (16 - imm) & 0xFF};
+
+    if (imm == 0) return a;
+    if (imm >= 32) return v_uint8x32(); // imm == 32 is handled here so xvbsll_v below never gets a byte count of 16
+
+    // swapz: a's low 128-bit half moved into the high half, with the low half taken from vzero
+    __m256i vzero = __lasx_xvreplgr2vr_w(0);
+    __m256i swapz = __lasx_xvpermi_q(a.val, vzero, 0x20);
+    if (imm == 16) return v_uint8x32(swapz);
+    if (imm < 16)  return v_uint8x32(_v256_alignr_b(a.val, swapz, IMM_R));
+    return v_uint8x32(__lasx_xvbsll_v(swapz, IMM_L)); // 16 < imm < 32, so IMM_L is 1..15
+}
+
+template<int imm>
+inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
+{
+    enum {IMM_L = (imm - 16) & 0xFF};
+
+    if (imm == 0) return a;
+    if (imm >= 32) return v_uint8x32(); // imm == 32 is handled here so xvbsrl_v below never gets a byte count of 16
+
+    // swapz: a's high 128-bit half moved into the low half, with the high half taken from vzero
+    __m256i vzero = __lasx_xvreplgr2vr_w(0);
+    __m256i swapz = __lasx_xvpermi_q(vzero, a.val, 0x21);
+    if (imm == 16) return v_uint8x32(swapz);
+    if (imm < 16)  return v_uint8x32(_v256_alignr_b(swapz, a.val, imm));
+    return v_uint8x32(__lasx_xvbsrl_v(swapz, IMM_L)); // 16 < imm < 32, so IMM_L is 1..15
+}
+
+#define OPENCV_HAL_IMPL_LASX_ROTATE_CAST(intrin, _Tpvec, cast)    \
+    template<int imm>                                             \
+    inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b)        \
+    {                                                             \
+        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)};  \
+        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a),    \
+                                       v_reinterpret_as_u8(b));   \
+        return _Tpvec(cast(ret.val));                             \
+    }                                                             \
+    template<int imm>                                             \
+    inline _Tpvec intrin(const _Tpvec& a)                         \
+    {                                                             \
+        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)};  \
+        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a));   \
+        return _Tpvec(cast(ret.val));                             \
+    }
+// IMMxW turns the lane count imm into a byte count before forwarding to the v_uint8x32 overloads; cast converts the result back.
+#define OPENCV_HAL_IMPL_LASX_ROTATE(_Tpvec)                                  \
+    OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left,  _Tpvec, OPENCV_HAL_NOP) \
+    OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)
+// Integer vector types are instantiated with OPENCV_HAL_NOP as the cast.
+OPENCV_HAL_IMPL_LASX_ROTATE(v_int8x32)
+OPENCV_HAL_IMPL_LASX_ROTATE(v_uint16x16)
+OPENCV_HAL_IMPL_LASX_ROTATE(v_int16x16)
+OPENCV_HAL_IMPL_LASX_ROTATE(v_uint32x8)
+OPENCV_HAL_IMPL_LASX_ROTATE(v_int32x8)
+OPENCV_HAL_IMPL_LASX_ROTATE(v_uint64x4)
+OPENCV_HAL_IMPL_LASX_ROTATE(v_int64x4)
+// Float vector types pass the _lasx_256_castsi256_ps / _pd helpers as the cast.
+OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left,  v_float32x8, _lasx_256_castsi256_ps)
+OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, v_float32x8, _lasx_256_castsi256_ps)
+OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left,  v_float64x4, _lasx_256_castsi256_pd)
+OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, v_float64x4, _lasx_256_castsi256_pd)
+
+/** Reverse: perm lists byte indices 15..0 for each 128-bit half; xvpermi_q(vec, vec, 1) then exchanges the two halves **/
+inline v_uint8x32 v_reverse(const v_uint8x32 &a)
+{
+    static const __m256i perm = _v256_setr_b(
+            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    __m256i vec = __lasx_xvshuf_b(a.val, a.val, perm);
+    return v_uint8x32(__lasx_xvpermi_q(vec, vec, 1));
+}
+// Signed bytes are reversed through the unsigned overload.
+inline v_int8x32 v_reverse(const v_int8x32 &a)
+{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
+
+inline v_uint16x16 v_reverse(const v_uint16x16 &a)
+{
+    __m256i vec = __lasx_xvshuf4i_h(a.val, 0x1B);    // step 1: halfword shuffle with control 0x1B
+    vec = __lasx_xvshuf4i_w(vec, 0x4E);              // step 2: word shuffle with control 0x4E
+    return v_uint16x16(__lasx_xvpermi_d(vec, 0x4E)); // step 3: 64-bit permute with control 0x4E
+}
+// Signed 16-bit lanes are reversed through the unsigned overload.
+inline v_int16x16 v_reverse(const v_int16x16 &a)
+{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
+
+inline v_uint32x8 v_reverse(const v_uint32x8 &a)
+{
+    __m256i vec = __lasx_xvshuf4i_w(a.val, 0x1B);   // step 1: word shuffle with control 0x1B
+    return v_uint32x8(__lasx_xvpermi_d(vec, 0x4E)); // step 2: 64-bit permute with control 0x4E
+}
+// int32 and float32 vectors are reversed through the uint32 overload via reinterpretation.
+inline v_int32x8 v_reverse(const v_int32x8 &a)
+{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_float32x8 v_reverse(const v_float32x8 &a)
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_uint64x4 v_reverse(const v_uint64x4 &a)
+{
+    return v_uint64x4(__lasx_xvpermi_d(a.val, 0x1b)); // single 64-bit permute; 0x1b encodes source order 3,2,1,0
+}
+// int64 and float64 vectors are reversed through the uint64 overload via reinterpretation.
+inline v_int64x4 v_reverse(const v_int64x4 &a)
+{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
+
+inline v_float64x4 v_reverse(const v_float64x4 &a)
+{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
+
+////////// Reduce and mask /////////
+
+/** Reduce **/
+// Returns a[0]+a[1]+...+a[31] by widening pairwise sums: 8 -> 16 -> 32 -> 64 -> 128 bits.
+inline unsigned v_reduce_sum(const v_uint8x32& a)
+{
+    __m256i sum16  = __lasx_xvhaddw_hu_bu(a.val, a.val);
+    __m256i sum32  = __lasx_xvhaddw_wu_hu(sum16, sum16);
+    __m256i sum64  = __lasx_xvhaddw_du_wu(sum32, sum32);
+    __m256i sum128 = __lasx_xvhaddw_qu_du(sum64, sum64);
+    return (unsigned)(((v8u32)sum128)[0]+((v8u32)sum128)[4]);
+}
+// Signed variant: the same widening chain built from the signed xvhaddw intrinsics.
+inline int v_reduce_sum(const v_int8x32& a)
+{
+    __m256i sum16  = __lasx_xvhaddw_h_b(a.val, a.val);
+    __m256i sum32  = __lasx_xvhaddw_w_h(sum16, sum16);
+    __m256i sum64  = __lasx_xvhaddw_d_w(sum32, sum32);
+    __m256i sum128 = __lasx_xvhaddw_q_d(sum64, sum64);
+    return (int)(((v8i32)sum128)[0]+((v8i32)sum128)[4]);
+}
+
+#define OPENCV_HAL_IMPL_LASX_REDUCE_32(_Tpvec, sctype, func, intrin) \
+    inline sctype v_reduce_##func(const _Tpvec& a) \
+    { \
+        __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \
+        val = intrin(val, __lsx_vbsrl_v(val,8));    \
+        val = intrin(val, __lsx_vbsrl_v(val,4));    \
+        val = intrin(val, __lsx_vbsrl_v(val,2));    \
+        val = intrin(val, __lsx_vbsrl_v(val,1));    \
+        return (sctype)__lsx_vpickve2gr_w(val, 0);  \
+    }
+// 8-bit min/max: fold the two 128-bit halves, then fold against byte shifts of 8, 4, 2 and 1; word 0 is cast to sctype.
+OPENCV_HAL_IMPL_LASX_REDUCE_32(v_uint8x32, uchar, min, __lsx_vmin_bu)
+OPENCV_HAL_IMPL_LASX_REDUCE_32(v_int8x32,  schar, min, __lsx_vmin_b)
+OPENCV_HAL_IMPL_LASX_REDUCE_32(v_uint8x32, uchar, max, __lsx_vmax_bu)
+OPENCV_HAL_IMPL_LASX_REDUCE_32(v_int8x32,  schar, max, __lsx_vmax_b)
+
+#define OPENCV_HAL_IMPL_LASX_REDUCE_16(_Tpvec, sctype, func, intrin) \
+    inline sctype v_reduce_##func(const _Tpvec& a)                   \
+    {                                                                \
+        __m128i v0 = _v256_extract_low(a.val);                       \
+        __m128i v1 = _v256_extract_high(a.val);                      \
+        v0 = intrin(v0, v1);                                         \
+        v0 = intrin(v0, __lsx_vbsrl_v(v0, 8));                       \
+        v0 = intrin(v0, __lsx_vbsrl_v(v0, 4));                       \
+        v0 = intrin(v0, __lsx_vbsrl_v(v0, 2));                       \
+        return (sctype) __lsx_vpickve2gr_w(v0, 0);                   \
+    }
+// 16-bit min/max: fold the two 128-bit halves, then fold against byte shifts of 8, 4 and 2; word 0 is cast to sctype.
+OPENCV_HAL_IMPL_LASX_REDUCE_16(v_uint16x16, ushort, min, __lsx_vmin_hu)
+OPENCV_HAL_IMPL_LASX_REDUCE_16(v_int16x16,  short,  min, __lsx_vmin_h)
+OPENCV_HAL_IMPL_LASX_REDUCE_16(v_uint16x16, ushort, max, __lsx_vmax_hu)
+OPENCV_HAL_IMPL_LASX_REDUCE_16(v_int16x16,  short,  max, __lsx_vmax_h)
+
+#define OPENCV_HAL_IMPL_LASX_REDUCE_8(_Tpvec, sctype, func, intrin) \
+    inline sctype v_reduce_##func(const _Tpvec& a)                  \
+    {                                                               \
+        __m128i v0 = _v256_extract_low(a.val);                      \
+        __m128i v1 = _v256_extract_high(a.val);                     \
+        v0 = intrin(v0, v1);                                        \
+        v0 = intrin(v0, __lsx_vbsrl_v(v0, 8));                      \
+        v0 = intrin(v0, __lsx_vbsrl_v(v0, 4));                      \
+        return (sctype) __lsx_vpickve2gr_w(v0, 0);                  \
+    }
+// 32-bit min/max: fold the two 128-bit halves, then fold against byte shifts of 8 and 4; word 0 is cast to sctype.
+OPENCV_HAL_IMPL_LASX_REDUCE_8(v_uint32x8, unsigned, min, __lsx_vmin_wu)
+OPENCV_HAL_IMPL_LASX_REDUCE_8(v_int32x8,  int,      min, __lsx_vmin_w)
+OPENCV_HAL_IMPL_LASX_REDUCE_8(v_uint32x8, unsigned, max, __lsx_vmax_wu)
+OPENCV_HAL_IMPL_LASX_REDUCE_8(v_int32x8,  int,      max, __lsx_vmax_w)
+
+#define OPENCV_HAL_IMPL_LASX_REDUCE_FLT(func, intrin)                 \
+    inline float v_reduce_##func(const v_float32x8& a)                \
+    {                                                                 \
+        __m128 v0 = _v256_extract_low(a.val);                         \
+        __m128 v1 = _v256_extract_high(a.val);                        \
+        v0 = intrin(v0, v1);                                          \
+        v0 = intrin(v0, __m128(__lsx_vpermi_w(*((__m128i*)&v0), *((__m128i*)&v0), 0x0e))); \
+        v0 = intrin(v0, __m128(__lsx_vpermi_w(*((__m128i*)&v0), *((__m128i*)&v0), 0x01))); \
+        float *fvalue = (float*)&v0;                                  \
+        return fvalue[0];                                             \
+    }
+// Float min/max: fold the two halves, fold twice more against __lsx_vpermi_w shuffles (0x0e, 0x01), then read lane 0.
+OPENCV_HAL_IMPL_LASX_REDUCE_FLT(min, __lsx_vfmin_s)
+OPENCV_HAL_IMPL_LASX_REDUCE_FLT(max, __lsx_vfmax_s)
+
+inline int v_reduce_sum(const v_int32x8& a)
+{
+    __m256i sum64  = __lasx_xvhaddw_d_w(a.val, a.val);
+    __m256i sum128 = __lasx_xvhaddw_q_d(sum64, sum64);
+    return (int)(((v8i32)sum128)[0]+((v8i32)sum128)[4]);
+}
+// Unsigned 32-bit lanes reuse the signed reduction on the reinterpreted vector.
+inline unsigned v_reduce_sum(const v_uint32x8& a)
+{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
+// 16-bit sums add v_expand_low(a) and v_expand_high(a) first, then reduce that vector.
+inline int v_reduce_sum(const v_int16x16& a)
+{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+inline unsigned v_reduce_sum(const v_uint16x16& a)
+{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+
+inline float v_reduce_sum(const v_float32x8& a)
+{
+    // Accumulate one group of four lanes per step, in the same left-to-right order for each group.
+    const float *lanes = (const float*)&a;
+    float total = 0;
+    for (int base = 0; base < 8; base += 4)
+        total += lanes[base] + lanes[base+1] + lanes[base+2] + lanes[base+3];
+    return total;
+}
+
+inline uint64 v_reduce_sum(const v_uint64x4& a)
+{
+    __m256i pair_sum = __lasx_xvhaddw_qu_du(a.val, a.val);   // widening pairwise add, unsigned
+    return (uint64)(((v4u64)pair_sum)[0] + ((v4u64)pair_sum)[2]);
+}
+inline int64 v_reduce_sum(const v_int64x4& a)
+{
+    __m256i pair_sum = __lasx_xvhaddw_q_d(a.val, a.val);     // widening pairwise add, signed
+    return (int64)(((v4i64)pair_sum)[0] + ((v4i64)pair_sum)[2]);
+}
+inline double v_reduce_sum(const v_float64x4& a)
+{
+    const double *lanes = (const double*)&a;                 // scalar sum of the four lanes, left to right
+    return lanes[0] + lanes[1] + lanes[2] + lanes[3];
+}
+
+inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
+                                 const v_float32x8& c, const v_float32x8& d)
+{
+    // View every input register as eight packed floats.
+    const float *fa = (const float*)&a, *fb = (const float*)&b;
+    const float *fc = (const float*)&c, *fd = (const float*)&d;
+    // Result lanes 0..3: sums over lanes 0..3 of a, b, c and d respectively.
+    const float lo_a = fa[0] + fa[1] + fa[2] + fa[3];
+    const float lo_b = fb[0] + fb[1] + fb[2] + fb[3];
+    const float lo_c = fc[0] + fc[1] + fc[2] + fc[3];
+    const float lo_d = fd[0] + fd[1] + fd[2] + fd[3];
+    // Result lanes 4..7: sums over lanes 4..7 of a, b, c and d respectively.
+    const float hi_a = fa[4] + fa[5] + fa[6] + fa[7];
+    const float hi_b = fb[4] + fb[5] + fb[6] + fb[7];
+    const float hi_c = fc[4] + fc[5] + fc[6] + fc[7];
+    const float hi_d = fd[4] + fd[5] + fd[6] + fd[7];
+    return v_float32x8(lo_a, lo_b, lo_c, lo_d, hi_a, hi_b, hi_c, hi_d);
+}
+
+inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
+{
+    __m256i absd   = __lasx_xvabsd_bu(a.val, b.val);         // per-byte absolute difference
+    __m256i sum16  = __lasx_xvhaddw_hu_bu(absd, absd);
+    __m256i sum32  = __lasx_xvhaddw_wu_hu(sum16, sum16);
+    __m256i sum64  = __lasx_xvhaddw_du_wu(sum32, sum32);
+    __m256i sum128 = __lasx_xvhaddw_qu_du(sum64, sum64);
+    return (unsigned)(((v8u32)sum128)[0]+((v8u32)sum128)[4]);
+}
+inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
+{
+    __m256i absd   = __lasx_xvabsd_b(a.val, b.val);          // signed inputs, summed with the unsigned chain
+    __m256i sum16  = __lasx_xvhaddw_hu_bu(absd, absd);
+    __m256i sum32  = __lasx_xvhaddw_wu_hu(sum16, sum16);
+    __m256i sum64  = __lasx_xvhaddw_du_wu(sum32, sum32);
+    __m256i sum128 = __lasx_xvhaddw_qu_du(sum64, sum64);
+    return (unsigned)(((v8u32)sum128)[0]+((v8u32)sum128)[4]);
+}
+inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
+{
+    v_uint32x8 lo, hi;   // the two 32-bit halves produced by v_expand
+    v_expand(v_add_wrap(a - b, b - a), lo, hi);
+    return v_reduce_sum(lo + hi);
+}
+inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
+{
+    v_uint32x8 lo, hi;   // max - min is reinterpreted as unsigned before expansion
+    v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), lo, hi);
+    return v_reduce_sum(lo + hi);
+}
+inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
+{
+    return v_reduce_sum(v_max(a, b) - v_min(a, b));
+}
+inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
+{
+    v_int32x8 neg = a < b;   // comparison mask; (x ^ neg) - neg flips the sign of the masked lanes
+    return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ neg) - neg));
+}
+inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
+{
+    v_float32x8 diff = a - b;   // the AND with 0x7fffffff below drops the top bit of every 32-bit lane
+    return v_reduce_sum(v_float32x8(*((__m256i*)&diff.val) & __lasx_xvreplgr2vr_w(0x7fffffff)));
+}
+
+/** Popcount: unsigned overloads call xvpcnt directly; signed overloads reinterpret to unsigned and return unsigned vectors **/
+inline v_uint8x32 v_popcount(const v_uint8x32& a)
+{ return v_uint8x32(__lasx_xvpcnt_b(a.val)); }
+inline v_uint16x16 v_popcount(const v_uint16x16& a)
+{ return v_uint16x16(__lasx_xvpcnt_h(a.val)); }
+inline v_uint32x8 v_popcount(const v_uint32x8& a)
+{ return v_uint32x8(__lasx_xvpcnt_w(a.val)); }
+inline v_uint64x4 v_popcount(const v_uint64x4& a)
+{ return v_uint64x4(__lasx_xvpcnt_d(a.val)); }
+inline v_uint8x32 v_popcount(const v_int8x32& a)
+{ return v_popcount(v_reinterpret_as_u8(a)); }
+inline v_uint16x16 v_popcount(const v_int16x16& a)
+{ return v_popcount(v_reinterpret_as_u16(a)); }
+inline v_uint32x8 v_popcount(const v_int32x8& a)
+{ return v_popcount(v_reinterpret_as_u32(a)); }
+inline v_uint64x4 v_popcount(const v_int64x4& a)
+{ return v_popcount(v_reinterpret_as_u64(a)); }
+
+inline int v_signmask(const v_int8x32& a)
+{
+    __m256i bits = __lasx_xvmskltz_b(a.val);
+    int lo = __lasx_xvpickve2gr_w(bits, 0);         // word 0 supplies mask bits 0..15
+    int hi = __lasx_xvpickve2gr_w(bits, 4) << 16;   // word 4 supplies mask bits 16..31
+    return lo | hi;
+}
+inline int v_signmask(const v_uint8x32& a)
+{ return v_signmask(v_reinterpret_as_s8(a)); }
+
+inline int v_signmask(const v_int16x16& a)   // packs a with itself to bytes and keeps the low 16 mask bits
+{ return v_signmask(v_pack(a, a)) & 0xFFFF; }
+inline int v_signmask(const v_uint16x16& a)  // forwards to the signed 16-bit overload
+{ return v_signmask(v_reinterpret_as_s16(a)); }
+
+inline int v_signmask(const v_int32x8& a)
+{
+    __m256i result = __lasx_xvmskltz_w(a.val);
+    int mask = __lasx_xvpickve2gr_w(result, 0);          // word 0 supplies mask bits 0..3
+    mask |= (__lasx_xvpickve2gr_w(result, 4) << 4);      // word 4 supplies mask bits 4..7
+    return mask;
+}
+inline int v_signmask(const v_uint32x8& a)   // reinterpret by value rather than casting away const through a pointer
+{ return v_signmask(v_reinterpret_as_s32(a)); }
+
+inline int v_signmask(const v_int64x4& a)
+{
+    __m256i result = __lasx_xvmskltz_d(a.val);
+    int mask = __lasx_xvpickve2gr_d(result, 0);          // 64-bit element 0, narrowed to int
+    mask |= (__lasx_xvpickve2gr_w(result, 4) << 2);      // 32-bit word 4, moved up to mask bits 2..3
+    return mask;
+}
+inline int v_signmask(const v_uint64x4& a)   // forwards to the signed 64-bit overload
+{ return v_signmask(v_reinterpret_as_s64(a)); }
+
+inline int v_signmask(const v_float32x8& a)
+{ return v_signmask(v_reinterpret_as_s32(a)); }
+// Float masks reinterpret by value, like the integer overloads, rather than through a const-dropping pointer cast.
+inline int v_signmask(const v_float64x4& a)
+{ return v_signmask(v_reinterpret_as_s64(a)); }
+
+inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } // trailing zeros of the byte sign mask
+inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } // byte position / 2
+inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_int32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } // byte position / 4
+inline int v_scan_forward(const v_uint32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_int64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } // byte position / 8
+inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+
+/** Checks: v_check_all compares v_signmask(a) with the all-lanes mask; v_check_any tests it against zero **/
+#define OPENCV_HAL_IMPL_LASX_CHECK(_Tpvec, allmask) \
+    inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
+    inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
+OPENCV_HAL_IMPL_LASX_CHECK(v_uint8x32, -1)
+OPENCV_HAL_IMPL_LASX_CHECK(v_int8x32, -1)
+OPENCV_HAL_IMPL_LASX_CHECK(v_uint32x8, 255)
+OPENCV_HAL_IMPL_LASX_CHECK(v_int32x8, 255)
+OPENCV_HAL_IMPL_LASX_CHECK(v_uint64x4, 15)
+OPENCV_HAL_IMPL_LASX_CHECK(v_int64x4, 15)
+OPENCV_HAL_IMPL_LASX_CHECK(v_float32x8, 255)
+OPENCV_HAL_IMPL_LASX_CHECK(v_float64x4, 15)
+// 16-bit lanes: take the byte-wise sign mask and look only at the odd bit positions selected by 0xaaaaaaaa.
+#define OPENCV_HAL_IMPL_LASX_CHECK_SHORT(_Tpvec)  \
+    inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; } \
+    inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; }
+OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_uint16x16)
+OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_int16x16)
+
+////////// Other math /////////
+
+/** Some frequent operations: v_fma / v_muladd, v_sqrt, v_sqr_magnitude and v_magnitude for the float vector types **/
+#define OPENCV_HAL_IMPL_LASX_MULADD(_Tpvec, suffix)                            \
+    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)     \
+    { return _Tpvec(__lasx_xvfmadd_##suffix(a.val, b.val, c.val)); }           \
+    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)  \
+    { return _Tpvec(__lasx_xvfmadd_##suffix(a.val, b.val, c.val)); }           \
+    inline _Tpvec v_sqrt(const _Tpvec& x)                                      \
+    { return _Tpvec(__lasx_xvfsqrt_##suffix(x.val)); }                         \
+    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b)            \
+    { return v_fma(a, a, b * b); }                                             \
+    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)                \
+    { return v_sqrt(v_fma(a, a, b*b)); }
+// v_sqr_magnitude is v_fma(a, a, b * b); v_magnitude is the v_sqrt of that same expression.
+OPENCV_HAL_IMPL_LASX_MULADD(v_float32x8, s)
+OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d)
+// Integer variant: xvmadd_w receives the addend c as its first operand, followed by the factors a and b.
+inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
+{
+    return v_int32x8(__lasx_xvmadd_w(c.val, a.val, b.val));
+}
+// For v_int32x8, v_muladd is an alias of v_fma.
+inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
+{
+    return v_fma(a, b, c);
+}
+
+inline v_float32x8 v_invsqrt(const v_float32x8& x)
+{ return v_float32x8(__lasx_xvfrsqrt_s(x.val)); }
+// Both overloads forward directly to the xvfrsqrt intrinsic of matching element width.
+inline v_float64x4 v_invsqrt(const v_float64x4& x)
+{ return v_float64x4(__lasx_xvfrsqrt_d(x.val)); }
+
+/** Absolute values: integers use xvabsd against a zero vector and return the unsigned type; floats AND with a 0x7f..f mask **/
+#define OPENCV_HAL_IMPL_LASX_ABS(_Tpvec, suffix)         \
+    inline v_u##_Tpvec v_abs(const v_##_Tpvec& x)        \
+    { return v_u##_Tpvec(__lasx_xvabsd_##suffix(x.val, __lasx_xvreplgr2vr_w(0))); }
+// The macro argument is the type name without its "v_" prefix so that both v_<type> and v_u<type> can be formed.
+OPENCV_HAL_IMPL_LASX_ABS(int8x32,  b)
+OPENCV_HAL_IMPL_LASX_ABS(int16x16, h)
+OPENCV_HAL_IMPL_LASX_ABS(int32x8,  w)
+// Float overloads view the vector object as __m256i and mask off the top bit of every lane.
+inline v_float32x8 v_abs(const v_float32x8& x)
+{ return v_float32x8(*((__m256i*)&x) & __lasx_xvreplgr2vr_w(0x7fffffff)); }
+inline v_float64x4 v_abs(const v_float64x4& x)
+{ return v_float64x4(*((__m256i*)&x) & __lasx_xvreplgr2vr_d(0x7fffffffffffffff)); }
+
+/** Absolute difference: integer overloads map to xvabsd and always return the unsigned vector type **/
+inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
+{ return (v_uint8x32)__lasx_xvabsd_bu(a.val, b.val); }
+inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
+{ return (v_uint16x16)__lasx_xvabsd_hu(a.val, b.val); }
+inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
+{ return (v_uint32x8)__lasx_xvabsd_wu(a.val, b.val); }
+// Signed inputs use the signed xvabsd forms.
+inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
+{ return (v_uint8x32)__lasx_xvabsd_b(a.val, b.val); }
+inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
+{ return (v_uint16x16)__lasx_xvabsd_h(a.val, b.val); }
+inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
+{ return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); }
+// Float overloads are computed as v_abs(a - b).
+inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
+{ return v_abs(a - b); }
+
+inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
+{ return v_abs(a - b); }
+
+/** Saturating absolute difference (signed result type) **/
+inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
+{
+    v_int8x32 diff = a - b;
+    v_int8x32 neg = a < b;   // comparison mask used to flip the sign of diff where a < b
+    return (diff ^ neg) - neg;
+}
+inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)   // larger operand minus the smaller one
+{ return v_max(a, b) - v_min(a, b); }
+
+////////// Conversions /////////
+
+/** Rounding: v_round uses xvftint, v_trunc uses xvftintrz, v_floor / v_ceil apply xvfrintrm / xvfrintrp first **/
+inline v_int32x8 v_round(const v_float32x8& a)
+{ return v_int32x8(__lasx_xvftint_w_s(a.val)); }
+// f64 -> i32: xvftint_w_d consumes two double vectors, so xvpermi_d reorders the 64-bit groups of its result.
+inline v_int32x8 v_round(const v_float64x4& a)
+{ __m256i t = __lasx_xvftint_w_d(a.val, a.val);
+  return v_int32x8(__lasx_xvpermi_d(t, 0x88)); }
+// Two-vector form: b is passed first to the intrinsic, then the 64-bit groups are reordered with selector 3,1,2,0.
+inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b)
+{
+    __m256i abi = __lasx_xvftint_w_d(b.val, a.val);
+    return v_int32x8(__lasx_xvpermi_d(abi, 0b11011000)); //3120
+}
+
+inline v_int32x8 v_trunc(const v_float32x8& a)
+{ return v_int32x8(__lasx_xvftintrz_w_s(a.val)); }
+// Same layout fix-up as v_round(const v_float64x4&), using the round-toward-zero conversion.
+inline v_int32x8 v_trunc(const v_float64x4& a)
+{ __m256i t = __lasx_xvftintrz_w_d(a.val, a.val);
+  return v_int32x8(__lasx_xvpermi_d(t, 0x88)); }
+// floor: apply xvfrintrm to the input, then convert the result with the truncating conversion.
+inline v_int32x8 v_floor(const v_float32x8& a)
+{ return v_int32x8(__lasx_xvftintrz_w_s(__m256(__lasx_xvfrintrm_s(a.val)))); }
+
+inline v_int32x8 v_floor(const v_float64x4& a)
+{ return v_trunc(v_float64x4(__lasx_xvfrintrm_d(a.val))); }
+// ceil: apply xvfrintrp to the input, then convert the result with the truncating conversion.
+inline v_int32x8 v_ceil(const v_float32x8& a)
+{ return v_int32x8(__lasx_xvftintrz_w_s(__m256(__lasx_xvfrintrp_s(a.val)))); }
+
+inline v_int32x8 v_ceil(const v_float64x4& a)
+{ return v_trunc(v_float64x4(__lasx_xvfrintrp_d(a.val))); }
+
+/** To float: int32 -> f32 directly; f64 -> f32 narrows with xvfcvt_s_d and fixes lane order with xvpermi_d **/
+inline v_float32x8 v_cvt_f32(const v_int32x8& a)
+{ return v_float32x8(__lasx_xvffint_s_w(a.val)); }
+// Single-vector form: a is passed as both operands and selector 0x88 picks 64-bit groups 0 and 2 twice.
+inline v_float32x8 v_cvt_f32(const v_float64x4& a)
+{ return v_float32x8(__lasx_xvpermi_d(__lasx_xvfcvt_s_d(a.val, a.val), 0x88)); }
+// Two-vector form: the result is reordered with selector 0x8D after the narrowing conversion.
+inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
+{
+    __m256 abf = __lasx_xvfcvt_s_d(a.val, b.val);  //warning: the order of a,b differs from the xvfcvt.s.d instruction
+    return v_float32x8(__lasx_xvpermi_d(abf, 0x8D));
+}
+
+inline v_float64x4 v_cvt_f64(const v_int32x8& a)
+{
+    // Selector 0x10 puts 64-bit groups 0 and 1 of a at the bottom of the two 128-bit halves.
+    return v_float64x4(__lasx_xvffintl_d_w(__lasx_xvpermi_d(a.val, 0x10)));
+}
+
+inline v_float64x4 v_cvt_f64_high(const v_int32x8& a)
+{
+    // Selector 0x32 puts 64-bit groups 2 and 3 of a at the bottom of the two 128-bit halves.
+    return v_float64x4(__lasx_xvffintl_d_w(__lasx_xvpermi_d(a.val, 0x32)));
+}
+
+inline v_float64x4 v_cvt_f64(const v_float32x8& a)
+{
+    __m256i lower = __lasx_xvpermi_d(a.val, 0x10);   // same group placement as the int32 overload
+    return v_float64x4(__lasx_xvfcvtl_d_s((__m256)lower));
+}
+
+inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
+{
+    __m256i upper = __lasx_xvpermi_d(a.val, 0x32);   // same group placement as the int32 high overload
+    return v_float64x4(__lasx_xvfcvtl_d_s((__m256)upper));
+}
+
+inline v_float64x4 v_cvt_f64(const v_int64x4& v)     // direct 64-bit integer to double conversion
+{ return v_float64x4(__lasx_xvffint_d_l(v.val)); }
+
+////////////// Lookup table access ////////////////////
+
+inline v_int8x32 v256_lut(const schar* tab, const int* idx)  // gathers tab[idx[i]] for i = 0..31
+{
+    return v_int8x32(_v256_setr_b(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]],
+                                  tab[idx[ 6]], tab[idx[ 7]], tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]],
+                                  tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]], tab[idx[16]], tab[idx[17]],
+                                  tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]],
+                                  tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]],
+                                  tab[idx[30]], tab[idx[31]]));
+}
+inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx)  // 16 two-byte reads at tab + idx[i], read as short
+{
+    return v_int8x32(_v256_setr_h(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]),
+                                  *(const short*)(tab + idx[ 3]), *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]),
+                                  *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]), *(const short*)(tab + idx[ 8]),
+                                  *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]),
+                                  *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]),
+                                  *(const short*)(tab + idx[15])));
+}
+inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx)  // 8 four-byte reads at tab + idx[i], read as int
+{
+    return v_int8x32(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
+                                  *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]),
+                                  *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]),
+                                  *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7])));
+}
+inline v_uint8x32 v256_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); } // unsigned wrappers reuse the schar versions
+inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); }
+inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); }
+
+inline v_int16x16 v256_lut(const short* tab, const int* idx)  // gathers tab[idx[i]] for i = 0..15
+{
+    return v_int16x16(_v256_setr_h(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]],
+                                   tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]], tab[idx[ 8]], tab[idx[ 9]],
+                                   tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]],
+                                   tab[idx[15]]));
+}
+inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx)  // 8 reads of two shorts each, read as int
+{
+    return v_int16x16(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
+                                   *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]),
+                                   *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]),
+                                   *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]) ));
+}
+inline v_int16x16 v256_lut_quads(const short* tab, const int* idx)  // 4 reads of four shorts each, read as long long
+{
+    return v_int16x16(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]),
+                                   *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) ));
+
+}
+inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); } // unsigned wrappers reuse the short versions
+inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); }
+inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); }
+
+inline v_int32x8 v256_lut(const int* tab, const int* idx)  // gathers tab[idx[i]] for i = 0..7
+{
+    return v_int32x8(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
+                                  *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]),
+                                  *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]),
+                                  *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]) ));
+}
+inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)  // 4 reads of two ints each, read as long long
+{
+    return v_int32x8(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]),
+                                  *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) ));
+}
+inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)  // two 128-bit loads at tab + idx[0] and tab + idx[1]
+{
+    return v_int32x8(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0)));
+}
+inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); } // unsigned wrappers reuse the int versions
+inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); }
+inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); }
+
+inline v_int64x4 v256_lut(const int64* tab, const int* idx) // gather 4 64-bit values: lane k = tab[idx[k]]
+{
+    return v_int64x4(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]),
+                                  *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) ));
+}
+inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx) // gather 2 pairs of adjacent 64-bit values, using idx[0] and idx[1] only
+{
+    return v_int64x4(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0))); // two 128-bit loads joined into one 256-bit value
+}
+inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); } // unsigned variants forward to the signed ones and reinterpret the result
+inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); }
+
+inline v_float32x8 v256_lut(const float* tab, const int* idx) // gather 8 floats: lane k = tab[idx[k]]
+{
+    return v_float32x8(_v256_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
+                                     tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
+}
+inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_pairs((const int *)tab, idx)); } // pairs/quads reuse the int gathers on the float table and reinterpret the result
+inline v_float32x8 v256_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_quads((const int *)tab, idx)); }
+
+inline v_float64x4 v256_lut(const double* tab, const int* idx) // gather 4 doubles: lane k = tab[idx[k]]
+{
+    return v_float64x4(_v256_setr_pd(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
+}
+inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) // gather 2 pairs of adjacent doubles, using idx[0] and idx[1] only
+{ return v_float64x4(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0))); } // two 128-bit loads joined into one 256-bit value
+
+inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec) // gather 8 ints from tab, lane k taken at the index held in lane k of idxvec
+{
+    const int *idx = (const int*)&idxvec.val; // view the index lanes as an int array; keeps the const of idxvec, as the float/double overloads do
+    return v256_lut(tab, idx);
+}
+
+inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec) // unsigned gather: forwards to the int overload
+{
+    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec)); // the table is viewed as int and the result reinterpreted as unsigned
+}
+
+inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec) // gather 8 floats at the indices held in idxvec
+{
+    const int *lanes = (const int*)&idxvec.val; // view the index lanes as a plain int array
+    return v256_lut(tab, lanes);
+}
+
+inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec) // gather 4 doubles at the indices held in idxvec
+{
+    const int *lanes = (const int*)&idxvec.val; // the double gather reads only lanes[0..3]
+    return v256_lut(tab, lanes);
+}
+
+inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
+{
+    // Each of the 8 indices selects two adjacent floats in tab; the first of
+    // every pair ends up in x and the second in y.
+    const int *lanes = (const int*)&idxvec.val;
+    // Build 128-bit values that hold two pairs each: the first load supplies
+    // 64-bit element 0, element 0 of the second load is inserted as element 1.
+    const __m128i pair01 = __lsx_vextrins_d(__lsx_vld(tab + lanes[0], 0), __lsx_vld(tab + lanes[1], 0), 0x10);
+    const __m128i pair45 = __lsx_vextrins_d(__lsx_vld(tab + lanes[4], 0), __lsx_vld(tab + lanes[5], 0), 0x10);
+    const __m128i pair23 = __lsx_vextrins_d(__lsx_vld(tab + lanes[2], 0), __lsx_vld(tab + lanes[3], 0), 0x10);
+    const __m128i pair67 = __lsx_vextrins_d(__lsx_vld(tab + lanes[6], 0), __lsx_vld(tab + lanes[7], 0), 0x10);
+    const __m256i pairs0145 = _v256_combine(pair01, pair45);
+    const __m256i pairs2367 = _v256_combine(pair23, pair67);
+
+    const __m256i mixed_lo = __lasx_xvilvl_w(pairs2367, pairs0145);
+    const __m256i mixed_hi = __lasx_xvilvh_w(pairs2367, pairs0145);
+
+    // A second round of word interleaves completes the x/y separation.
+    x = v_float32x8(__lasx_xvilvl_w(mixed_hi, mixed_lo));
+    y = v_float32x8(__lasx_xvilvh_w(mixed_hi, mixed_lo));
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y)
+{
+    // Only index lanes 0..3 are used; each selects two adjacent doubles (one 128-bit load).
+    const int *idx = (const int*)&idxvec.val;
+    __m128i xy0 = __lsx_vld(tab + idx[0], 0);
+    __m128i xy2 = __lsx_vld(tab + idx[2], 0);
+    __m128i xy1 = __lsx_vld(tab + idx[1], 0);
+    __m128i xy3 = __lsx_vld(tab + idx[3], 0);
+    __m256i xy02 = _v256_combine(xy0, xy2); // pairs 0 and 2 share one 256-bit value
+    __m256i xy13 = _v256_combine(xy1, xy3); // pairs 1 and 3 share the other
+
+    x = v_float64x4(__lasx_xvilvl_d(xy13, xy02)); // first doubles of the four pairs
+    y = v_float64x4(__lasx_xvilvh_d(xy13, xy02)); // second doubles of the four pairs
+}
+
+inline v_int8x32 v_interleave_pairs(const v_int8x32& vec) // byte shuffle: every 4 bytes are reordered from (0,1,2,3) to (0,2,1,3)
+{
+    return v_int8x32(__lasx_xvshuf_b(vec.val, vec.val,
+                       _v256_set_d(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200))); // same 16-byte index pattern for both 128-bit halves
+}
+inline v_uint8x32 v_interleave_pairs(const v_uint8x32& vec) // unsigned variant: reinterpret, reuse the signed shuffle, reinterpret back
+{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
+inline v_int8x32 v_interleave_quads(const v_int8x32& vec) // byte shuffle: every 8 bytes are reordered from (0..7) to (0,4,1,5,2,6,3,7)
+{
+    return v_int8x32(__lasx_xvshuf_b(vec.val, vec.val,
+                       _v256_set_d(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400))); // same 16-byte index pattern for both 128-bit halves
+}
+inline v_uint8x32 v_interleave_quads(const v_uint8x32& vec) // unsigned variant: reinterpret, reuse the signed shuffle, reinterpret back
+{ return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x16 v_interleave_pairs(const v_int16x16& vec) // every 4 16-bit elements are reordered from (0,1,2,3) to (0,2,1,3)
+{
+    return v_int16x16(__lasx_xvshuf_b(vec.val, vec.val,
+                        _v256_set_d(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100))); // byte indices move whole 2-byte elements
+}
+inline v_uint16x16 v_interleave_pairs(const v_uint16x16& vec) // unsigned variant: reinterpret, reuse the signed shuffle, reinterpret back
+{ return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
+inline v_int16x16 v_interleave_quads(const v_int16x16& vec) // every 8 16-bit elements are reordered from (0..7) to (0,4,1,5,2,6,3,7)
+{
+    return v_int16x16(__lasx_xvshuf_b(vec.val, vec.val,
+                        _v256_set_d(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100))); // byte indices move whole 2-byte elements
+}
+inline v_uint16x16 v_interleave_quads(const v_uint16x16& vec) // unsigned variant: reinterpret, reuse the signed shuffle, reinterpret back
+{ return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
+
+inline v_int32x8 v_interleave_pairs(const v_int32x8& vec) // every 4 32-bit elements are reordered from (0,1,2,3) to (0,2,1,3)
+{
+    return v_int32x8(__lasx_xvshuf4i_w(vec.val, 0xd8)); // 0xd8 = 0b11'01'10'00, i.e. source positions 0,2,1,3
+}
+inline v_uint32x8 v_interleave_pairs(const v_uint32x8& vec) // unsigned variant forwards to the signed one
+{ return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+inline v_float32x8 v_interleave_pairs(const v_float32x8& vec) // float variant forwards to the signed integer one
+{ return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+
+inline v_int8x32 v_pack_triplets(const v_int8x32& vec) // keep 3 of every 4 bytes and compact them towards the low end
+{
+    const __m256i zeros = __lasx_xvreplgr2vr_w(0); // second shuffle source, all zero
+    const __m256i compacted = __lasx_xvshuf_b(zeros, vec.val, // byte shuffle within each 128-bit half; the index table skips bytes 3, 7, 11
+                   _v256_set_d(0x1211100f0e0d0c0a, 0x0908060504020100, 0x1211100f0e0d0c0a, 0x0908060504020100));
+    return v_int8x32(__lasx_xvperm_w(compacted, // word permutation across the whole vector joins the two halves
+                       _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
+}
+inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec) // unsigned variant forwards to the signed one
+{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x16 v_pack_triplets(const v_int16x16& vec) // keep 3 of every 4 16-bit elements and compact them towards the low end
+{
+    const __m256i zeros = __lasx_xvreplgr2vr_w(0); // second shuffle source, all zero
+    const __m256i compacted = __lasx_xvshuf_b(zeros, vec.val, // byte shuffle within each 128-bit half; the index table skips bytes 6 and 7
+                   _v256_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100, 0x11100f0e0d0c0b0a, 0x0908050403020100));
+    return v_int16x16(__lasx_xvperm_w(compacted, // word permutation across the whole vector joins the two halves
+                        _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
+}
+inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) // unsigned variant forwards to the signed one
+{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
+
+inline v_int32x8 v_pack_triplets(const v_int32x8& vec) // word permutation: result lanes 0..5 take source lanes 0,1,2,4,5,6; lanes 6 and 7 both take lane 7
+{
+    return v_int32x8(__lasx_xvperm_w(vec.val,
+                       _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); // index table, highest lanes listed first
+}
+inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec) // unsigned variant forwards to the signed one
+{ return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
+inline v_float32x8 v_pack_triplets(const v_float32x8& vec) // same word permutation as the integer overloads, applied to the float lanes
+{
+    return v_float32x8(__lasx_xvperm_w(*(const __m256i*)(&vec.val), // view the float lanes as integer lanes without casting away const
+                         _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
+}
+
+////////// Matrix operations /////////
+
+//////// Dot Product ////////
+
+// 16 >> 32: products of 16-bit lanes, added in adjacent pairs into 32-bit lanes
+inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
+{ return v_int32x8(__lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val))); } // even-lane products + odd-lane products
+
+inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
+{ return v_dotprod(a, b) + c; } // same dot product with accumulator c added
+
+// 32 >> 64: products of 32-bit lanes, added in adjacent pairs into 64-bit lanes
+inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
+{
+    const __m256i even_products = __lasx_xvmulwev_d_w(a.val, b.val); // widened products of the even lanes
+    return v_int64x4(__lasx_xvmaddwod_d_w(even_products, a.val, b.val)); // odd-lane products are multiply-added on top
+}
+inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
+{
+    const __m256i c_plus_even = __lasx_xvmaddwev_d_w(c.val, a.val, b.val); // start from accumulator c, add even-lane products
+    return v_int64x4(__lasx_xvmaddwod_d_w(c_plus_even, a.val, b.val)); // then add odd-lane products
+}
+
+// 8 >> 32: products of 8-bit lanes, four adjacent products summed into each 32-bit lane
+inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
+{
+    const __m256i ev16 = __lasx_xvmulwev_h_bu(a.val, b.val); // 16-bit products of the even bytes
+    const __m256i od16 = __lasx_xvmulwod_h_bu(a.val, b.val); // 16-bit products of the odd bytes
+    const __m256i ev32 = __lasx_xvhaddw_wu_hu(ev16, ev16);   // adjacent 16-bit products added into 32 bits
+    const __m256i od32 = __lasx_xvhaddw_wu_hu(od16, od16);
+    return v_uint32x8(__lasx_xvadd_w(ev32, od32));
+}
+inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
+{ return v_dotprod_expand(a, b) + c; } // accumulating form
+
+inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
+{
+    const __m256i ev16 = __lasx_xvmulwev_h_b(a.val, b.val); // signed counterparts of the steps above
+    const __m256i od16 = __lasx_xvmulwod_h_b(a.val, b.val);
+    const __m256i ev32 = __lasx_xvhaddw_w_h(ev16, ev16);
+    const __m256i od32 = __lasx_xvhaddw_w_h(od16, od16);
+    return v_int32x8(__lasx_xvadd_w(ev32, od32));
+}
+inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
+{ return v_dotprod_expand(a, b) + c; } // accumulating form
+
+// 16 >> 64: products of 16-bit lanes, four adjacent products summed into each 64-bit lane
+inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
+{
+    const __m256i ev32 = __lasx_xvmulwev_w_hu(a.val, b.val); // 32-bit products of the even 16-bit lanes
+    const __m256i od32 = __lasx_xvmulwod_w_hu(a.val, b.val); // 32-bit products of the odd 16-bit lanes
+    const __m256i ev64 = __lasx_xvhaddw_du_wu(ev32, ev32);   // adjacent 32-bit products added into 64 bits
+    const __m256i od64 = __lasx_xvhaddw_du_wu(od32, od32);
+    return v_uint64x4(__lasx_xvadd_d(ev64, od64));
+}
+inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
+{ return v_dotprod_expand(a, b) + c; } // accumulating form
+
+inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
+{
+    const __m256i ev32 = __lasx_xvmulwev_w_h(a.val, b.val); // signed counterparts of the steps above
+    const __m256i od32 = __lasx_xvmulwod_w_h(a.val, b.val);
+    const __m256i ev64 = __lasx_xvhaddw_d_w(ev32, ev32);
+    const __m256i od64 = __lasx_xvhaddw_d_w(od32, od32);
+    return v_int64x4(__lasx_xvadd_d(ev64, od64));
+}
+
+inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
+{ return v_dotprod_expand(a, b) + c; } // accumulating form
+
+// 32 >> 64f: integer dot product of 32-bit lanes, converted to double
+inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
+{ return v_cvt_f64(v_dotprod(a, b)); } // the 64-bit integer sums are converted lane by lane
+inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
+{ return v_dotprod_expand(a, b) + c; } // accumulating form; c is added after the conversion
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32: the "fast" variants below forward to the regular dot products
+inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b)
+{ return v_dotprod(a, b); }
+inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
+{ return v_dotprod(a, b, c); }
+
+// 32 >> 64: forwards to v_dotprod
+inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b)
+{ return v_dotprod(a, b); }
+inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
+{ return v_dotprod(a, b, c); }
+
+// 8 >> 32: forwards to v_dotprod_expand
+inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b)
+{ return v_dotprod_expand(a, b); }
+inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
+{ return v_dotprod_expand(a, b, c); }
+
+inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b)
+{ return v_dotprod_expand(a, b); }
+inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
+{ return v_dotprod_expand(a, b, c); }
+
+// 16 >> 64, fast variant: the partial sums are combined in a different lane order than v_dotprod_expand
+inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
+{
+    const __m256i ev32 = __lasx_xvmulwev_w_hu(a.val, b.val); // 32-bit products of the even 16-bit lanes
+    const __m256i od32 = __lasx_xvmulwod_w_hu(a.val, b.val); // 32-bit products of the odd 16-bit lanes
+    const __m256i ev64 = __lasx_xvhaddw_du_wu(ev32, ev32);   // adjacent 32-bit products added into 64 bits
+    const __m256i od64 = __lasx_xvhaddw_du_wu(od32, od32);
+    return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(od64, ev64), __lasx_xvilvh_d(od64, ev64))); // interleave the partial sums, then add
+}
+inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
+{ return v_dotprod_expand_fast(a, b) + c; } // accumulating form
+
+inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b) // signed 16 >> 64, fast variant
+{
+    const __m256i sums32 = __lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val)); // 32-bit pair sums, as in v_dotprod
+    const __m256i signs = __lasx_xvsrai_w(sums32, 31);      // arithmetic shift by 31: every lane becomes its sign replicated
+    const __m256i wide_lo = __lasx_xvilvl_w(signs, sums32); // sum + sign word form a sign-extended 64-bit value
+    const __m256i wide_hi = __lasx_xvilvh_w(signs, sums32);
+    return v_int64x4(__lasx_xvadd_d(wide_lo, wide_hi));
+}
+inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
+{ return v_dotprod_expand_fast(a, b) + c; } // accumulating form
+
+// 32 >> 64f: the fast variants forward to v_dotprod_expand
+inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
+{ return v_dotprod_expand(a, b); }
+inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
+{ return v_dotprod_expand(a, b, c); }
+
+
+#define OPENCV_HAL_LASX_SPLAT2_PS(a, im) /* rearrange the 32-bit lanes of a by shuffle immediate im; arguments are parenthesized so any expression can be passed */ \
+    v_float32x8(__lasx_xvpermi_w((a).val, (a).val, (im)))
+
+inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0, // combines m0..m3, each scaled by one element of v
+                            const v_float32x8& m1, const v_float32x8& m2,
+                            const v_float32x8& m3)
+{
+    v_float32x8 v04 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0);    // names suggest elements 0 and 4 of v, one per 128-bit half
+    v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55); // elements 1 and 5
+    v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA); // elements 2 and 6
+    v_float32x8 v37 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xFF); // elements 3 and 7
+    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); // assuming v_fma(a, b, c) is a*b + c: v04*m0 + v15*m1 + v26*m2 + v37*m3
+}
+
+inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0, // like v_matmul with three matrix rows and an additive term a
+                               const v_float32x8& m1, const v_float32x8& m2,
+                               const v_float32x8& a)
+{
+    v_float32x8 v04 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0);    // names suggest elements 0 and 4 of v, one per 128-bit half
+    v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55); // elements 1 and 5
+    v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA); // elements 2 and 6
+    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a))); // assuming v_fma(a, b, c) is a*b + c: v04*m0 + v15*m1 + v26*m2 + a
+}
+
+
+#define OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(_Tpvec, cast_from, cast_to)           \
+    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,              \
+                               const _Tpvec& a2, const _Tpvec& a3,              \
+                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3)  \
+    {   /* 32-bit interleaves of (a0,a1) and (a2,a3), then 64-bit interleaves */ \
+        __m256i t0 = cast_from(__lasx_xvilvl_w(a1.val, a0.val));                \
+        __m256i t1 = cast_from(__lasx_xvilvl_w(a3.val, a2.val));                \
+        __m256i t2 = cast_from(__lasx_xvilvh_w(a1.val, a0.val));                \
+        __m256i t3 = cast_from(__lasx_xvilvh_w(a3.val, a2.val));                \
+        b0.val = cast_to(__lasx_xvilvl_d(t1, t0));                              \
+        b1.val = cast_to(__lasx_xvilvh_d(t1, t0));                              \
+        b2.val = cast_to(__lasx_xvilvl_d(t3, t2));                              \
+        b3.val = cast_to(__lasx_xvilvh_d(t3, t2));                              \
+    }
+
+OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(v_uint32x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) // integer vectors need no casts around the interleaves
+OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(v_int32x8,  OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+
+inline void v_transpose4x4(const v_float32x8 &a0, const v_float32x8 &a1, // float version of the integer v_transpose4x4
+                           const v_float32x8 &a2, const v_float32x8 &a3,
+                           v_float32x8 &b0, v_float32x8 &b1, v_float32x8 &b2, v_float32x8 &b3)
+{
+    const __m256i lo01 = __lasx_xvilvl_w(__m256i(a1.val), __m256i(a0.val)); // 32-bit interleaves of the row pairs, on integer views of the floats
+    const __m256i lo23 = __lasx_xvilvl_w(__m256i(a3.val), __m256i(a2.val));
+    const __m256i hi01 = __lasx_xvilvh_w(__m256i(a1.val), __m256i(a0.val));
+    const __m256i hi23 = __lasx_xvilvh_w(__m256i(a3.val), __m256i(a2.val));
+    b0.val = __m256(__lasx_xvilvl_d(lo23, lo01)); // 64-bit interleaves produce the output rows
+    b1.val = __m256(__lasx_xvilvh_d(lo23, lo01));
+    b2.val = __m256(__lasx_xvilvl_d(hi23, hi01));
+    b3.val = __m256(__lasx_xvilvh_d(hi23, hi01));
+}
+
+//////////////// Value reordering ///////////////
+
+/* Expand: generates v_expand, v_expand_low, v_expand_high and v256_load_expand for one narrow/wide vector type pair */
+#define OPENCV_HAL_IMPL_LASX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin)     \
+    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1)   \
+    {                                                                 \
+        b0.val = intrin(a.val);                                       \
+        b1.val = intrin(__lasx_xvpermi_q(a.val, a.val, 0x11));        \
+    }                                                                 \
+    inline _Tpwvec v_expand_low(const _Tpvec& a)                      \
+    { return _Tpwvec(intrin(a.val)); }                                \
+    inline _Tpwvec v_expand_high(const _Tpvec& a)                     \
+    { return _Tpwvec(intrin(__lasx_xvpermi_q(a.val, a.val, 0x11))); } \
+    inline _Tpwvec v256_load_expand(const _Tp* ptr)                   \
+    {   /* NOTE(review): the 128-bit local is read through a __m256i pointer; presumably intrin uses only its low half - confirm */ \
+        __m128i a = __lsx_vld(ptr, 0);                                \
+        return _Tpwvec(intrin(*((__m256i*)&a)));                      \
+    }
+
+OPENCV_HAL_IMPL_LASX_EXPAND(v_uint8x32,  v_uint16x16, uchar,    __lasx_vext2xv_hu_bu) // 8 -> 16 bit, unsigned
+OPENCV_HAL_IMPL_LASX_EXPAND(v_int8x32,   v_int16x16,  schar,    __lasx_vext2xv_h_b)   // 8 -> 16 bit, signed
+OPENCV_HAL_IMPL_LASX_EXPAND(v_uint16x16, v_uint32x8,  ushort,   __lasx_vext2xv_wu_hu) // 16 -> 32 bit, unsigned
+OPENCV_HAL_IMPL_LASX_EXPAND(v_int16x16,  v_int32x8,   short,    __lasx_vext2xv_w_h)   // 16 -> 32 bit, signed
+OPENCV_HAL_IMPL_LASX_EXPAND(v_uint32x8,  v_uint64x4,  unsigned, __lasx_vext2xv_du_wu) // 32 -> 64 bit, unsigned
+OPENCV_HAL_IMPL_LASX_EXPAND(v_int32x8,   v_int64x4,   int,      __lasx_vext2xv_d_w)   // 32 -> 64 bit, signed
+
+#define OPENCV_HAL_IMPL_LASX_EXPAND_Q(_Tpvec, _Tp, intrin)   \
+    inline _Tpvec v256_load_expand_q(const _Tp* ptr)         \
+    {   /* 8 result lanes consume 8 source bytes: load one doubleword so nothing past ptr[7] is read */ \
+        __m128i a = __lsx_vldrepl_d(ptr, 0);                 \
+        return _Tpvec(intrin(*((__m256i*)&a)));              \
+    }
+
+OPENCV_HAL_IMPL_LASX_EXPAND_Q(v_uint32x8, uchar, __lasx_vext2xv_wu_bu) // 8 -> 32 bit, unsigned
+OPENCV_HAL_IMPL_LASX_EXPAND_Q(v_int32x8,  schar, __lasx_vext2xv_w_b)   // 8 -> 32 bit, signed
+
+/* pack: narrow two vectors into one vector of half-width elements */
+// 16 -> 8 bit
+inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b) // narrow a and b with the _lasx_packs_h helper, then fix the 64-bit block order
+{ return v_int8x32(_v256_shuffle_odd_64(_lasx_packs_h(a.val, b.val))); }
+
+inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b) // unsigned narrowing; shift amount 0 means no shift is applied
+{ return v_uint8x32(_v256_shuffle_odd_64(__lasx_xvssrlrni_bu_h(b.val, a.val, 0))); }
+
+inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b) // signed input, unsigned 8-bit output, via the _lasx_packus_h helper
+{
+    return v_uint8x32(_v256_shuffle_odd_64(_lasx_packus_h(a.val, b.val)));
+}
+
+inline void v_pack_store(schar* ptr, const v_int16x16& a) // pack a with itself and store the low half of the result
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(uchar *ptr, const v_uint16x16& a) // unsigned counterpart
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x16& a) // signed-to-unsigned counterpart
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+template<int n> inline
+v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b) // shift right by n while narrowing unsigned 16 -> 8 bit
+{
+    const __m256i narrowed = __lasx_xvssrlrni_bu_h(b.val, a.val, n);
+    return v_uint8x32(_v256_shuffle_odd_64(narrowed)); // fix the 64-bit block order after the narrowing
+}
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a) // same narrowing of a alone, written as 16 bytes
+{
+    const __m256i narrowed = __lasx_xvssrlrni_bu_h(a.val, a.val, n);
+    __lasx_xvstelm_d(narrowed, ptr, 0, 0); // 64-bit element 0 -> bytes 0..7
+    __lasx_xvstelm_d(narrowed, ptr, 8, 2); // 64-bit element 2 -> bytes 8..15
+}
+
+template<int n> inline
+v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b) // signed 16-bit input, unsigned 8-bit output
+{
+    const __m256i narrowed = __lasx_xvssrarni_bu_h(b.val, a.val, n);
+    return v_uint8x32(_v256_shuffle_odd_64(narrowed));
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a) // store form of v_rshr_pack_u
+{
+    const __m256i narrowed = __lasx_xvssrarni_bu_h(a.val, a.val, n);
+    __lasx_xvstelm_d(narrowed, ptr, 0, 0); // 64-bit element 0 -> bytes 0..7
+    __lasx_xvstelm_d(narrowed, ptr, 8, 2); // 64-bit element 2 -> bytes 8..15
+}
+
+template<int n> inline
+v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b) // signed 16 -> 8 bit
+{
+    const __m256i narrowed = __lasx_xvssrarni_b_h(b.val, a.val, n);
+    return v_int8x32(_v256_shuffle_odd_64(narrowed));
+}
+
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x16& a) // store form of the signed narrowing
+{
+    const __m256i narrowed = __lasx_xvssrarni_b_h(a.val, a.val, n);
+    __lasx_xvstelm_d(narrowed, ptr, 0, 0); // 64-bit element 0 -> bytes 0..7
+    __lasx_xvstelm_d(narrowed, ptr, 8, 2); // 64-bit element 2 -> bytes 8..15
+}
+
+// 32 -> 16 bit
+inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b) // narrow a and b with the _lasx_packs_w helper, then fix the 64-bit block order
+{ return v_int16x16(_v256_shuffle_odd_64(_lasx_packs_w(a.val, b.val))); }
+
+inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b) // unsigned narrowing via the _v256_packs_epu32 helper
+{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }
+
+inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b) // signed input, unsigned 16-bit output, via the _lasx_packus_w helper
+{ return v_uint16x16(_v256_shuffle_odd_64(_lasx_packus_w(a.val, b.val))); }
+
+inline void v_pack_store(short* ptr, const v_int32x8& a) // pack a with itself and store the low half of the result
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(ushort* ptr, const v_uint32x8& a) // unsigned narrowing of a alone, written as 16 bytes
+{
+    __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, 0); // shift amount 0: narrowing only
+    __lasx_xvstelm_d(res, ptr, 0, 0); // 64-bit element 0 -> bytes 0..7
+    __lasx_xvstelm_d(res, ptr, 8, 2); // 64-bit element 2 -> bytes 8..15
+}
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x8& a) // signed-to-unsigned counterpart
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+template<int n> inline
+v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b) // shift right by n while narrowing unsigned 32 -> 16 bit
+{ return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrlrni_hu_w(b.val, a.val, n))); }
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a) // same narrowing of a alone, written as 16 bytes
+{
+    __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, n);
+    __lasx_xvstelm_d(res, ptr, 0, 0); // 64-bit element 0 -> bytes 0..7
+    __lasx_xvstelm_d(res, ptr, 8, 2); // 64-bit element 2 -> bytes 8..15
+}
+
+template<int n> inline
+v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b) // signed 32-bit input, unsigned 16-bit output
+{ return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_hu_w(b.val, a.val, n))); }
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a) // store form of v_rshr_pack_u
+{
+    __m256i res = __lasx_xvssrarni_hu_w(a.val, a.val, n);
+    __lasx_xvstelm_d(res, ptr, 0, 0); // 64-bit element 0 -> bytes 0..7
+    __lasx_xvstelm_d(res, ptr, 8, 2); // 64-bit element 2 -> bytes 8..15
+}
+
+template<int n> inline
+v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b) // signed 32 -> 16 bit
+{ return v_int16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_h_w(b.val, a.val, n))); }
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x8& a) // store form of the signed narrowing
+{
+    __m256i res = __lasx_xvssrarni_h_w(a.val, a.val, n);
+    __lasx_xvstelm_d(res, ptr, 0, 0); // 64-bit element 0 -> bytes 0..7
+    __lasx_xvstelm_d(res, ptr, 8, 2); // 64-bit element 2 -> bytes 8..15
+}
+
+// 64 -> 32 bit
+// Non-saturating pack: the even 32-bit words of each input are kept
+inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
+{
+    const __m256i even_words = __lasx_xvpickev_w(b.val, a.val); // even words of a and b
+    return v_uint32x8(_v256_shuffle_odd_64(even_words));        // fix the 64-bit block order
+}
+
+inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b) // signed variant forwards to the unsigned one
+{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
+
+inline void v_pack_store(unsigned* ptr, const v_uint64x4& a) // narrow a alone and store the low half of the result
+{
+    const __m256i gathered = __lasx_xvshuf4i_w(a.val, 0x08); // 0x08: words 0 and 2 of every 4-word group move to the front
+    v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(gathered)));
+}
+
+inline void v_pack_store(int* ptr, const v_int64x4& b) // signed variant forwards to the unsigned one
+{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
+
+template<int n> inline
+v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b) // shift right by n while narrowing unsigned 64 -> 32 bit
+{ return v_uint32x8(_v256_shuffle_odd_64(__lasx_xvsrlrni_w_d(b.val, a.val, n))); }
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a) // same narrowing of a alone, written as 16 bytes
+{
+    __m256i res = __lasx_xvsrlrni_w_d(a.val, a.val, n);
+    __lasx_xvstelm_d(res, ptr, 0, 0); // 64-bit element 0 -> bytes 0..7
+    __lasx_xvstelm_d(res, ptr, 8, 2); // 64-bit element 2 -> bytes 8..15
+}
+
+template<int n> inline
+v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b) // signed 64 -> 32 bit
+{ return v_int32x8(_v256_shuffle_odd_64(__lasx_xvsrarni_w_d(b.val, a.val, n))); }
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x4& a) // store form of the signed narrowing
+{
+    __m256i res = __lasx_xvsrarni_w_d(a.val, a.val, n);
+    __lasx_xvstelm_d(res, ptr, 0, 0); // 64-bit element 0 -> bytes 0..7
+    __lasx_xvstelm_d(res, ptr, 8, 2); // 64-bit element 2 -> bytes 8..15
+}
+
+// pack boolean: narrow several wide mask vectors into one vector of 8-bit elements
+inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b) // two 16-bit inputs -> 32 bytes
+{
+    __m256i ab = _lasx_packs_h(a.val, b.val);
+    return v_uint8x32(_v256_shuffle_odd_64(ab)); // fix the 64-bit block order after the narrowing
+}
+
+inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b, // four 32-bit inputs -> 32 bytes
+                           const v_uint32x8& c, const v_uint32x8& d)
+{
+    __m256i ab = _lasx_packs_w(a.val, b.val); // 32 -> 16 bit
+    __m256i cd = _lasx_packs_w(c.val, d.val);
+
+    __m256i abcd = _v256_shuffle_odd_64(_lasx_packs_h(ab, cd)); // 16 -> 8 bit
+    return v_uint8x32(__lasx_xvshuf4i_w(abcd, 0xd8)); // 0xd8 swaps the two middle words of every 4-word group
+}
+
+inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c, // eight 64-bit inputs -> 32 bytes
+                           const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f,
+                           const v_uint64x4& g, const v_uint64x4& h)
+{
+    __m256i ab = _lasx_packs_w(a.val, b.val); // first narrowing round, input pairs
+    __m256i cd = _lasx_packs_w(c.val, d.val);
+    __m256i ef = _lasx_packs_w(e.val, f.val);
+    __m256i gh = _lasx_packs_w(g.val, h.val);
+
+    __m256i abcd = _lasx_packs_w(ab, cd); // second narrowing round
+    __m256i efgh = _lasx_packs_w(ef, gh);
+    __m256i pkall = _v256_shuffle_odd_64(_lasx_packs_h(abcd, efgh)); // third round, down to 8-bit elements
+
+    __m256i rev = _v256_alignr_b(pkall, pkall, 8); // pkall combined with itself at a byte offset of 8
+    return v_uint8x32(__lasx_xvilvl_h(rev, pkall)); // final 16-bit interleave of pkall and rev
+}
+
+/* Recombine */
+// It's up there with the load and store operations.
+
+/* Extract: v_extract<s>(a, b) is defined as v_rotate_right<s>(a, b) for every vector type */
+#define OPENCV_HAL_IMPL_LASX_EXTRACT(_Tpvec)                    \
+    template<int s>                                             \
+    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)   \
+    { return v_rotate_right<s>(a, b); }
+
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint8x32) // one instantiation per vector type
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_int8x32)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint16x16)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_int16x16)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint32x8)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_int32x8)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint64x4)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_int64x4)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_float32x8)
+OPENCV_HAL_IMPL_LASX_EXTRACT(v_float64x4)
+
+template<int i>
+inline uchar v_extract_n(v_uint8x32 a) // returns element i (compile-time index) of a
+{
+    return (uchar)_v256_extract_b<i>(a.val);
+}
+
+template<int i>
+inline schar v_extract_n(v_int8x32 a) // signed variant: extract through the unsigned overload, then cast
+{
+    return (schar)v_extract_n<i>(v_reinterpret_as_u8(a));
+}
+
+template<int i>
+inline ushort v_extract_n(v_uint16x16 a) // returns element i of a
+{
+    return (ushort)_v256_extract_h<i>(a.val);
+}
+
+template<int i>
+inline short v_extract_n(v_int16x16 a) // signed variant: extract through the unsigned overload, then cast
+{
+    return (short)v_extract_n<i>(v_reinterpret_as_u16(a));
+}
+
+template<int i>
+inline uint v_extract_n(v_uint32x8 a) // returns element i of a
+{
+    return (uint)_v256_extract_w<i>(a.val);
+}
+
+template<int i>
+inline int v_extract_n(v_int32x8 a) // signed variant: extract through the unsigned overload, then cast
+{
+    return (int)v_extract_n<i>(v_reinterpret_as_u32(a));
+}
+
+template<int i>
+inline uint64 v_extract_n(v_uint64x4 a) // returns element i of a
+{
+    return (uint64)_v256_extract_d<i>(a.val);
+}
+
+template<int i>
+inline int64 v_extract_n(v_int64x4 v) // signed variant: extract through the unsigned overload, then cast
+{
+    return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
+}
+
+template<int i>
+inline float v_extract_n(v_float32x8 v) // returns element i of v as a float
+{
+    union { uint bits; float value; } pun; // the lane is fetched as an integer and read back as float
+    pun.bits = v_extract_n<i>(v_reinterpret_as_u32(v));
+    return pun.value;
+}
+
+template<int i>
+inline double v_extract_n(v_float64x4 v) // returns element i of v as a double
+{
+    union { uint64 bits; double value; } pun; // the lane is fetched as an integer and read back as double
+    pun.bits = v_extract_n<i>(v_reinterpret_as_u64(v));
+    return pun.value;
+}
+
+template<int i>
+inline v_uint32x8 v_broadcast_element(v_uint32x8 a) // every result lane receives the lane of a selected by compile-time index i
+{
+    const __m256i perm = __lasx_xvreplgr2vr_w((char)i); // plain local instead of a function-local static: i is a constant, so no guarded one-time initialization is needed
+    return v_uint32x8(__lasx_xvperm_w(a.val, perm));    // word permutation with the same index in all lanes
+}
+
+template<int i>
+inline v_int32x8 v_broadcast_element(const v_int32x8 &a) // signed variant forwards to the unsigned overload
+{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
+
+template<int i>
+inline v_float32x8 v_broadcast_element(const v_float32x8 &a) // float variant forwards to the unsigned overload
+{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
+
+///////////////////// load deinterleave /////////////////////////////
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x32& a, v_uint8x32& b) // split 64 interleaved bytes into two vectors
+{
+    const __m256i first  = __lasx_xvld(ptr, 0);  // bytes 0..31
+    const __m256i second = __lasx_xvld(ptr, 32); // bytes 32..63
+
+    const __m256i evens = __lasx_xvpickev_b(second, first); // even-positioned bytes of both loads
+    const __m256i odds  = __lasx_xvpickod_b(second, first); // odd-positioned bytes of both loads
+
+    a.val = __lasx_xvpermi_d(evens, 0xd8); // 0xd8 swaps the two middle 64-bit elements
+    b.val = __lasx_xvpermi_d(odds, 0xd8);
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b ) // split 32 interleaved 16-bit values into two vectors
+{
+    const __m256i first  = __lasx_xvld(ptr, 0);  // bytes 0..31
+    const __m256i second = __lasx_xvld(ptr, 32); // bytes 32..63
+
+    const __m256i evens = __lasx_xvpickev_h(second, first); // even-positioned 16-bit elements of both loads
+    const __m256i odds  = __lasx_xvpickod_h(second, first); // odd-positioned 16-bit elements of both loads
+
+    a.val = __lasx_xvpermi_d(evens, 0xd8); // 0xd8 swaps the two middle 64-bit elements
+    b.val = __lasx_xvpermi_d(odds, 0xd8);
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b ) // split 16 interleaved 32-bit values into two vectors
+{
+    const __m256i first  = __lasx_xvld(ptr, 0);  // bytes 0..31
+    const __m256i second = __lasx_xvld(ptr, 32); // bytes 32..63
+
+    const __m256i evens = __lasx_xvpickev_w(second, first); // even-positioned 32-bit elements of both loads
+    const __m256i odds  = __lasx_xvpickod_w(second, first); // odd-positioned 32-bit elements of both loads
+
+    a.val = __lasx_xvpermi_d(evens, 0xd8); // 0xd8 swaps the two middle 64-bit elements
+    b.val = __lasx_xvpermi_d(odds, 0xd8);
+}
+
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b ) // split 8 interleaved 64-bit values into two vectors
+{
+    const __m256i first  = __lasx_xvld(ptr, 0);  // bytes 0..31
+    const __m256i second = __lasx_xvld(ptr, 32); // bytes 32..63
+
+    const __m256i halves_a = __lasx_xvpermi_q(first, second, 0x02); // 128-bit halves regrouped across the two loads
+    const __m256i halves_b = __lasx_xvpermi_q(first, second, 0x13);
+    const __m256i firsts  = __lasx_xvilvl_d(halves_b, halves_a); // low 64-bit element of every 128-bit half
+    const __m256i seconds = __lasx_xvilvh_d(halves_b, halves_a); // high 64-bit element of every 128-bit half
+    a = v_uint64x4(firsts);
+    b = v_uint64x4(seconds);
+}
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c ) // split 96 bytes of 3-channel data into three vectors
+{
+    __m256i bgr0 = __lasx_xvld(ptr, 0);  // bytes 0..31
+    __m256i bgr1 = __lasx_xvld(ptr, 32); // bytes 32..63
+    __m256i bgr2 = __lasx_xvld(ptr, 64); // bytes 64..95
+
+    __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);  // 128-bit halves of the first and third load regrouped
+    __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
+
+    const __m256i m0 = _v256_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, // byte-select masks with period 3
+                                    0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+    const __m256i m1 = _v256_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
+                                    -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
+
+    __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m0), bgr1, m1); // blend the three sources so each result holds one channel
+    __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m1), bgr1, m0);
+    __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m0), s02_high, m1);
+
+    const __m256i
+    sh_b = _v256_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, // per-channel byte orders, same pattern for both 128-bit halves
+                        0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
+    sh_g = _v256_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
+                        1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
+    sh_r = _v256_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
+                        2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
+    b0 = __lasx_xvshuf_b(b0, b0, sh_b); // reorder the bytes of each channel
+    g0 = __lasx_xvshuf_b(g0, g0, sh_g);
+    r0 = __lasx_xvshuf_b(r0, r0, sh_r);
+
+    a = v_uint8x32(b0);
+    b = v_uint8x32(g0);
+    c = v_uint8x32(r0);
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c ) // split 48 16-bit values of 3-channel data into three vectors
+{
+    __m256i bgr0 = __lasx_xvld(ptr, 0);  // bytes 0..31
+    __m256i bgr1 = __lasx_xvld(ptr, 32); // bytes 32..63
+    __m256i bgr2 = __lasx_xvld(ptr, 64); // bytes 64..95
+
+    __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);  // 128-bit halves of the first and third load regrouped
+    __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
+
+    const __m256i m0 = _v256_setr_b(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, // select masks covering whole 2-byte elements, period 3 elements
+                                    0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
+    const __m256i m1 = _v256_setr_b(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+                                    -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
+    __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m0), bgr1, m1); // blend the three sources so each result holds one channel
+    __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m0), s02_high, m1);
+    __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m1), bgr1, m0);
+    const __m256i sh_b = _v256_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, // per-channel byte orders, same pattern for both 128-bit halves
+                                      0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+    const __m256i sh_g = _v256_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
+                                      2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
+    const __m256i sh_r = _v256_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
+                                      4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+    b0 = __lasx_xvshuf_b(b0, b0, sh_b); // reorder the elements of each channel
+    g0 = __lasx_xvshuf_b(g0, g0, sh_g);
+    r0 = __lasx_xvshuf_b(r0, r0, sh_r);
+
+    a = v_uint16x16(b0);
+    b = v_uint16x16(g0);
+    c = v_uint16x16(r0);
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c ) // 3-channel deinterleaving load: reads 96 bytes (24 unsigned) at ptr into a, b, c
+{
+    __m256i bgr0 = __lasx_xvld(ptr, 0);  // three 32-byte loads at offsets 0, 32 and 64
+    __m256i bgr1 = __lasx_xvld(ptr, 32);
+    __m256i bgr2 = __lasx_xvld(ptr, 64);
+
+    __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);  // two 128-bit-half recombinations of the first and last load
+    __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
+
+    __m256i m24 = _v256_set_w(0, 0, -1, 0, 0, -1, 0, 0);  // 32-bit select masks (-1 = all bits set)
+    __m256i m92 = _v256_set_w(-1, 0, 0, -1, 0, 0, -1, 0);
+    __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m24), bgr1, m92);  // bitwise selects between the three vectors under the masks
+    __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m92), bgr1, m24);
+    __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m24), s02_high, m92);
+
+    b0 = __lasx_xvshuf4i_w(b0, 0x6c);  // per-channel 32-bit word shuffles, one immediate pattern each
+    g0 = __lasx_xvshuf4i_w(g0, 0xb1);
+    r0 = __lasx_xvshuf4i_w(r0, 0xc6);
+
+    a = v_uint32x8(b0);  // wrap the raw registers in the typed vector structs
+    b = v_uint32x8(g0);
+    c = v_uint32x8(r0);
+}
+
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c ) // 3-channel deinterleaving load: reads 96 bytes (12 uint64) at ptr into a, b, c
+{
+    __m256i bgr0 = __lasx_xvld(ptr, 0);  // three 32-byte loads at offsets 0, 32 and 64
+    __m256i bgr1 = __lasx_xvld(ptr, 32);
+    __m256i bgr2 = __lasx_xvld(ptr, 64);
+
+    __m256i s01 = __lasx_xvpermi_q(bgr0, bgr1, 0x12); // get bgr0 low 128 and bgr1 high 128
+    __m256i s12 = __lasx_xvpermi_q(bgr1, bgr2, 0x12); // same selector applied to the (bgr1, bgr2) pair
+    __m256i s20r = __lasx_xvpermi_d(__lasx_xvpermi_q(bgr2, bgr0, 0x12), 0x1b); // (bgr2, bgr0) pair, then a 64-bit element permutation (0x1b)
+    __m256i b0 = __lasx_xvilvl_d(s20r, s01);  // 64-bit interleave / byte-align steps assemble the three outputs
+    __m256i g0 = _v256_alignr_b(s12, s01, 8);
+    __m256i r0 = __lasx_xvilvh_d(s12, s20r);
+
+    a = v_uint64x4(b0);  // wrap the raw registers in the typed vector structs
+    b = v_uint64x4(g0);
+    c = v_uint64x4(r0);
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d)
+{
+    // 4-channel deinterleaving load: reads 128 bytes (four 32-byte vectors)
+    // from ptr and splits them into the output vectors a, b, c and d.
+    const __m256i lane_order = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7);  // final 32-bit word permutation
+
+    const __m256i src0 = __lasx_xvld(ptr, 0);
+    const __m256i src1 = __lasx_xvld(ptr, 32);
+    const __m256i src2 = __lasx_xvld(ptr, 64);
+    const __m256i src3 = __lasx_xvld(ptr, 96);
+
+    // Stage 1: xvpickev/xvpickod split each vector pair by byte parity.
+    const __m256i even01 = __lasx_xvpickev_b(src1, src0);
+    const __m256i odd01  = __lasx_xvpickod_b(src1, src0);
+    const __m256i even23 = __lasx_xvpickev_b(src3, src2);
+    const __m256i odd23  = __lasx_xvpickod_b(src3, src2);
+
+    // Stage 2: split by parity once more, then reorder words with lane_order.
+    a.val = __lasx_xvperm_w(__lasx_xvpickev_b(even23, even01), lane_order);
+    b.val = __lasx_xvperm_w(__lasx_xvpickev_b(odd23, odd01), lane_order);
+    c.val = __lasx_xvperm_w(__lasx_xvpickod_b(even23, even01), lane_order);
+    d.val = __lasx_xvperm_w(__lasx_xvpickod_b(odd23, odd01), lane_order);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d)
+{
+    // 4-channel deinterleaving load: reads 128 bytes (four 32-byte vectors)
+    // from ptr and splits them into the output vectors a, b, c and d.
+    const __m256i lane_order = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7);  // final 32-bit word permutation
+
+    const __m256i src0 = __lasx_xvld(ptr, 0);
+    const __m256i src1 = __lasx_xvld(ptr, 32);
+    const __m256i src2 = __lasx_xvld(ptr, 64);
+    const __m256i src3 = __lasx_xvld(ptr, 96);
+
+    // Stage 1: xvpickev/xvpickod split each vector pair by halfword parity.
+    const __m256i even01 = __lasx_xvpickev_h(src1, src0);
+    const __m256i odd01  = __lasx_xvpickod_h(src1, src0);
+    const __m256i even23 = __lasx_xvpickev_h(src3, src2);
+    const __m256i odd23  = __lasx_xvpickod_h(src3, src2);
+
+    // Stage 2: split by parity once more, then reorder words with lane_order.
+    a.val = __lasx_xvperm_w(__lasx_xvpickev_h(even23, even01), lane_order);
+    b.val = __lasx_xvperm_w(__lasx_xvpickev_h(odd23, odd01), lane_order);
+    c.val = __lasx_xvperm_w(__lasx_xvpickod_h(even23, even01), lane_order);
+    d.val = __lasx_xvperm_w(__lasx_xvpickod_h(odd23, odd01), lane_order);
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
+{
+    // 4-channel deinterleaving load: reads 128 bytes (four 32-byte vectors)
+    // from ptr and distributes them over the output vectors a, b, c and d.
+    const __m256i v0 = __lasx_xvld(ptr, 0);
+    const __m256i v1 = __lasx_xvld(ptr, 32);
+    const __m256i v2 = __lasx_xvld(ptr, 64);
+    const __m256i v3 = __lasx_xvld(ptr, 96);
+
+    // Word-interleave the vector pairs (v0, v1) and (v2, v3).
+    const __m256i lo01 = __lasx_xvilvl_w(v1, v0);
+    const __m256i hi01 = __lasx_xvilvh_w(v1, v0);
+    const __m256i lo23 = __lasx_xvilvl_w(v3, v2);
+    const __m256i hi23 = __lasx_xvilvh_w(v3, v2);
+
+    // Regroup the 128-bit halves of the interleaved pairs.
+    const __m256i lo_a = __lasx_xvpermi_q(lo01, lo23, 0x02);
+    const __m256i lo_b = __lasx_xvpermi_q(lo01, lo23, 0x13);
+    const __m256i hi_a = __lasx_xvpermi_q(hi01, hi23, 0x02);
+    const __m256i hi_b = __lasx_xvpermi_q(hi01, hi23, 0x13);
+
+    // A last word interleave produces the four output vectors.
+    a = v_uint32x8(__lasx_xvilvl_w(lo_b, lo_a));
+    b = v_uint32x8(__lasx_xvilvh_w(lo_b, lo_a));
+    c = v_uint32x8(__lasx_xvilvl_w(hi_b, hi_a));
+    d = v_uint32x8(__lasx_xvilvh_w(hi_b, hi_a));
+}
+
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d )
+{
+    // 4-channel deinterleaving load: reads 128 bytes (four 32-byte vectors,
+    // i.e. 16 uint64 values) from ptr and distributes them over the output
+    // vectors a, b, c and d.
+    const __m256i v0 = __lasx_xvld(ptr, 0);
+    const __m256i v1 = __lasx_xvld(ptr, 32);
+    const __m256i v2 = __lasx_xvld(ptr, 64);
+    const __m256i v3 = __lasx_xvld(ptr, 96);
+
+    // Regroup 128-bit halves: vectors 0/2 together and vectors 1/3 together.
+    const __m256i even_lo = __lasx_xvpermi_q(v0, v2, 0x02);
+    const __m256i even_hi = __lasx_xvpermi_q(v0, v2, 0x13);
+    const __m256i odd_lo  = __lasx_xvpermi_q(v1, v3, 0x02);
+    const __m256i odd_hi  = __lasx_xvpermi_q(v1, v3, 0x13);
+
+    // Interleaving the 64-bit elements of those groups yields the outputs.
+    a = v_uint64x4(__lasx_xvilvl_d(odd_lo, even_lo));
+    b = v_uint64x4(__lasx_xvilvh_d(odd_lo, even_lo));
+    c = v_uint64x4(__lasx_xvilvl_d(odd_hi, even_hi));
+    d = v_uint64x4(__lasx_xvilvh_d(odd_hi, even_hi));
+}
+
+///////////////////////////// store interleave /////////////////////////////////////
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
+{
+    // 2-channel interleaving store of x and y (64 bytes written to ptr).
+    // The store mode argument is accepted but not used.
+    const __m256i lo_pairs = __lasx_xvilvl_b(y.val, x.val);
+    const __m256i hi_pairs = __lasx_xvilvh_b(y.val, x.val);
+
+    // Recombine the 128-bit halves of both results and emit two 32-byte stores.
+    __lasx_xvst(__lasx_xvpermi_q(hi_pairs, lo_pairs, 0 + 2*16), (__m256i*)ptr, 0);
+    __lasx_xvst(__lasx_xvpermi_q(hi_pairs, lo_pairs, 1 + 3*16), (__m256i*)ptr, 32*1);
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
+{
+    // 2-channel interleaving store of x and y (64 bytes written to ptr).
+    // The store mode argument is accepted but not used.
+    const __m256i lo_pairs = __lasx_xvilvl_h(y.val, x.val);
+    const __m256i hi_pairs = __lasx_xvilvh_h(y.val, x.val);
+
+    // Recombine the 128-bit halves of both results and emit two 32-byte stores.
+    __lasx_xvst(__lasx_xvpermi_q(hi_pairs, lo_pairs, 0 + 2*16), (__m256i*)ptr, 0);
+    __lasx_xvst(__lasx_xvpermi_q(hi_pairs, lo_pairs, 1 + 3*16), (__m256i*)ptr, 16*2);
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
+{
+    // 2-channel interleaving store of x and y (64 bytes written to ptr).
+    // The store mode argument is accepted but not used.
+    const __m256i lo_pairs = __lasx_xvilvl_w(y.val, x.val);
+    const __m256i hi_pairs = __lasx_xvilvh_w(y.val, x.val);
+
+    // Recombine the 128-bit halves of both results and emit two 32-byte stores.
+    __lasx_xvst(__lasx_xvpermi_q(hi_pairs, lo_pairs, 0 + 2*16), (__m256i*)ptr, 0);
+    __lasx_xvst(__lasx_xvpermi_q(hi_pairs, lo_pairs, 1 + 3*16), (__m256i*)ptr, 8*4);
+}
+
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y,
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
+{
+    // 2-channel interleaving store of x and y (64 bytes written to ptr).
+    // The store mode argument is accepted but not used.
+    const __m256i lo_pairs = __lasx_xvilvl_d(y.val, x.val);
+    const __m256i hi_pairs = __lasx_xvilvh_d(y.val, x.val);
+
+    // Recombine the 128-bit halves of both results and emit two 32-byte stores.
+    __lasx_xvst(__lasx_xvpermi_q(hi_pairs, lo_pairs, 0 + 2*16), (__m256i*)ptr, 0);
+    __lasx_xvst(__lasx_xvpermi_q(hi_pairs, lo_pairs, 1 + 3*16), (__m256i*)ptr, 4*8);
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, const v_uint8x32& c, // 3-channel interleaving store: writes 96 bytes to ptr
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) // the store mode is accepted but not used
+{
+    const __m256i sh_b = _v256_setr_b( // byte-shuffle index tables, one per input; both 16-entry rows are identical
+            0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
+            0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
+    const __m256i sh_g = _v256_setr_b(
+            5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
+            5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
+    const __m256i sh_r = _v256_setr_b(
+            10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
+            10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
+
+    __m256i b0 = __lasx_xvshuf_b(a.val, a.val, sh_b); // pre-shuffle each input with its own table
+    __m256i g0 = __lasx_xvshuf_b(b.val, b.val, sh_g);
+    __m256i r0 = __lasx_xvshuf_b(c.val, c.val, sh_r);
+
+    const __m256i m0 = _v256_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, // m0/m1: byte select masks, every third byte set (-1)
+                                    0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+    const __m256i m1 = _v256_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
+                                    0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
+
+    __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, m0), r0, m1); // bitwise selects between the shuffled inputs; the three lines rotate the operand roles
+    __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, m0), b0, m1);
+    __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, m0), g0, m1);
+
+    __m256i bgr0 = __lasx_xvpermi_q(p1, p0, 0 + 2*16); // recombine 128-bit halves of p0/p1/p2 into the three output vectors
+    __m256i bgr1 = __lasx_xvpermi_q(p0, p2, 0 + 3*16);
+    __m256i bgr2 = __lasx_xvpermi_q(p2, p1, 1 + 3*16);
+
+    __lasx_xvst(bgr0, (__m256i*)ptr, 0); // three 32-byte stores at offsets 0, 32 and 64
+    __lasx_xvst(bgr1, (__m256i*)ptr, 32);
+    __lasx_xvst(bgr2, (__m256i*)ptr, 64);
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, const v_uint16x16& c, // 3-channel interleaving store: writes 96 bytes to ptr
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) // the store mode is accepted but not used
+{
+    const __m256i sh_b = _v256_setr_b( // byte-shuffle index tables, one per input; indices come in pairs (whole 16-bit elements)
+         0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
+         0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+    const __m256i sh_g = _v256_setr_b(
+         10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
+         10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
+    const __m256i sh_r = _v256_setr_b(
+         4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
+         4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+
+    __m256i b0 = __lasx_xvshuf_b(a.val, a.val, sh_b); // pre-shuffle each input with its own table
+    __m256i g0 = __lasx_xvshuf_b(b.val, b.val, sh_g);
+    __m256i r0 = __lasx_xvshuf_b(c.val, c.val, sh_r);
+
+    const __m256i m0 = _v256_setr_b(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, // m0/m1: byte select masks; the -1 pairs flag whole 16-bit elements
+                                    0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
+    const __m256i m1 = _v256_setr_b(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+                                    -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
+
+    __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, m0), r0, m1); // bitwise selects between the shuffled inputs; the three lines rotate the operand roles
+    __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, m0), b0, m1);
+    __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, m0), g0, m1);
+
+    __m256i bgr0 = __lasx_xvpermi_q(p2, p0, 0 + 2*16); // first and last output vectors are built from halves of p0 and p2
+    __m256i bgr2 = __lasx_xvpermi_q(p2, p0, 1 + 3*16);
+
+    __lasx_xvst(bgr0, (__m256i*)ptr, 0);
+    __lasx_xvst(p1,   (__m256i*)ptr, 16*2); // p1 is stored unchanged as the middle 32 bytes
+    __lasx_xvst(bgr2, (__m256i*)ptr, 32*2);
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, const v_uint32x8& c, // 3-channel interleaving store: writes 96 bytes to ptr
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) // the store mode is accepted but not used
+{
+    __m256i b0 = __lasx_xvshuf4i_w(a.val, 0x6c); // pre-shuffle the 32-bit words of each input with its own immediate pattern
+    __m256i g0 = __lasx_xvshuf4i_w(b.val, 0xb1);
+    __m256i r0 = __lasx_xvshuf4i_w(c.val, 0xc6);
+
+    __m256i bitmask_1 = _v256_set_w(-1, 0, 0, -1, 0, 0, -1, 0); // 32-bit select masks (-1 = all bits set)
+    __m256i bitmask_2 = _v256_set_w(0, 0, -1, 0, 0, -1, 0, 0);
+
+    __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, bitmask_1), r0, bitmask_2); // bitwise selects between the shuffled inputs; the three lines rotate the operand roles
+    __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, bitmask_1), b0, bitmask_2);
+    __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, bitmask_1), g0, bitmask_2);
+
+    __m256i bgr0 = __lasx_xvpermi_q(p1, p0, 0 + 2*16); // first and last output vectors are built from halves of p0 and p1
+    __m256i bgr2 = __lasx_xvpermi_q(p1, p0, 1 + 3*16);
+
+    __lasx_xvst(bgr0, (__m256i*)ptr, 0);
+    __lasx_xvst(p2,   (__m256i*)ptr, 8*4); // p2 is stored unchanged as the middle 32 bytes
+    __lasx_xvst(bgr2, (__m256i*)ptr, 16*4);
+}
+
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c, // 3-channel interleaving store: writes 96 bytes to ptr
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) // the store mode is accepted but not used
+{
+    __m256i s01 = __lasx_xvilvl_d(b.val, a.val); // pairwise 64-bit combinations of the inputs: (a,b), (b,c) and (c,a)
+    __m256i s12 = __lasx_xvilvh_d(c.val, b.val);
+    __m256i s20 = __lasx_xvpermi_w(a.val, c.val, 0xe4);
+
+    __m256i bgr0 = __lasx_xvpermi_q(s20, s01, 0 + 2*16); // pick 128-bit halves of the pair vectors for each output vector
+    __m256i bgr1 = __lasx_xvpermi_q(s01, s12, 0x30);
+    __m256i bgr2 = __lasx_xvpermi_q(s12, s20, 1 + 3*16);
+
+    __lasx_xvst(bgr0, (__m256i*)ptr, 0); // three 32-byte stores at offsets 0, 32 and 64
+    __lasx_xvst(bgr1, (__m256i*)ptr, 4*8);
+    __lasx_xvst(bgr2, (__m256i*)ptr, 8*8);
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b,
+                                const v_uint8x32& c, const v_uint8x32& d,
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
+{
+    // 4-channel interleaving store of a, b, c, d (128 bytes written to ptr).
+    // The store mode argument is accepted but not used.
+    const __m256i ab_lo = __lasx_xvilvl_b(b.val, a.val);
+    const __m256i ab_hi = __lasx_xvilvh_b(b.val, a.val);
+    const __m256i cd_lo = __lasx_xvilvl_b(d.val, c.val);
+    const __m256i cd_hi = __lasx_xvilvh_b(d.val, c.val);
+
+    // Merge the (a,b) pairs with the (c,d) pairs at halfword granularity.
+    const __m256i q0 = __lasx_xvilvl_h(cd_lo, ab_lo);
+    const __m256i q1 = __lasx_xvilvh_h(cd_lo, ab_lo);
+    const __m256i q2 = __lasx_xvilvl_h(cd_hi, ab_hi);
+    const __m256i q3 = __lasx_xvilvh_h(cd_hi, ab_hi);
+
+    // Recombine 128-bit halves and emit four 32-byte stores in address order.
+    __lasx_xvst(__lasx_xvpermi_q(q1, q0, 0 + 2*16), (__m256i*)ptr, 0);
+    __lasx_xvst(__lasx_xvpermi_q(q3, q2, 0 + 2*16), (__m256i*)ptr, 32);
+    __lasx_xvst(__lasx_xvpermi_q(q1, q0, 1 + 3*16), (__m256i*)ptr, 64);
+    __lasx_xvst(__lasx_xvpermi_q(q3, q2, 1 + 3*16), (__m256i*)ptr, 96);
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b,
+                                const v_uint16x16& c, const v_uint16x16& d,
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
+{
+    // 4-channel interleaving store of a, b, c, d (128 bytes written to ptr).
+    // The store mode argument is accepted but not used.
+    const __m256i ab_lo = __lasx_xvilvl_h(b.val, a.val);
+    const __m256i ab_hi = __lasx_xvilvh_h(b.val, a.val);
+    const __m256i cd_lo = __lasx_xvilvl_h(d.val, c.val);
+    const __m256i cd_hi = __lasx_xvilvh_h(d.val, c.val);
+
+    // Merge the (a,b) pairs with the (c,d) pairs at word granularity.
+    const __m256i q0 = __lasx_xvilvl_w(cd_lo, ab_lo);
+    const __m256i q1 = __lasx_xvilvh_w(cd_lo, ab_lo);
+    const __m256i q2 = __lasx_xvilvl_w(cd_hi, ab_hi);
+    const __m256i q3 = __lasx_xvilvh_w(cd_hi, ab_hi);
+
+    // Recombine 128-bit halves and emit four 32-byte stores in address order.
+    __lasx_xvst(__lasx_xvpermi_q(q1, q0, 0 + 2*16), (__m256i*)ptr, 0);
+    __lasx_xvst(__lasx_xvpermi_q(q3, q2, 0 + 2*16), (__m256i*)ptr, 16*2);
+    __lasx_xvst(__lasx_xvpermi_q(q1, q0, 1 + 3*16), (__m256i*)ptr, 32*2);
+    __lasx_xvst(__lasx_xvpermi_q(q3, q2, 1 + 3*16), (__m256i*)ptr, 48*2);
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b,
+                                const v_uint32x8& c, const v_uint32x8& d,
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
+{
+    // 4-channel interleaving store of a, b, c, d (128 bytes written to ptr).
+    // The store mode argument is accepted but not used.
+    const __m256i ab_lo = __lasx_xvilvl_w(b.val, a.val);
+    const __m256i ab_hi = __lasx_xvilvh_w(b.val, a.val);
+    const __m256i cd_lo = __lasx_xvilvl_w(d.val, c.val);
+    const __m256i cd_hi = __lasx_xvilvh_w(d.val, c.val);
+
+    // Merge the (a,b) pairs with the (c,d) pairs at doubleword granularity.
+    const __m256i q0 = __lasx_xvilvl_d(cd_lo, ab_lo);
+    const __m256i q1 = __lasx_xvilvh_d(cd_lo, ab_lo);
+    const __m256i q2 = __lasx_xvilvl_d(cd_hi, ab_hi);
+    const __m256i q3 = __lasx_xvilvh_d(cd_hi, ab_hi);
+
+    // Recombine 128-bit halves and emit four 32-byte stores in address order.
+    __lasx_xvst(__lasx_xvpermi_q(q1, q0, 0 + 2*16), (__m256i*)ptr, 0);
+    __lasx_xvst(__lasx_xvpermi_q(q3, q2, 0 + 2*16), (__m256i*)ptr, 8*4);
+    __lasx_xvst(__lasx_xvpermi_q(q1, q0, 1 + 3*16), (__m256i*)ptr, 16*4);
+    __lasx_xvst(__lasx_xvpermi_q(q3, q2, 1 + 3*16), (__m256i*)ptr, 24*4);
+}
+
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b,
+                                const v_uint64x4& c, const v_uint64x4& d,
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
+{
+    // 4-channel interleaving store of a, b, c, d (128 bytes written to ptr).
+    // The store mode argument is accepted but not used.
+    // First pair up the 64-bit elements of (a,b) and of (c,d).
+    const __m256i ab_lo = __lasx_xvilvl_d(b.val, a.val);
+    const __m256i ab_hi = __lasx_xvilvh_d(b.val, a.val);
+    const __m256i cd_lo = __lasx_xvilvl_d(d.val, c.val);
+    const __m256i cd_hi = __lasx_xvilvh_d(d.val, c.val);
+
+    // Recombine 128-bit halves of the pairs and emit four 32-byte stores
+    // in address order.
+    __lasx_xvst(__lasx_xvpermi_q(cd_lo, ab_lo, 0 + 2*16), (__m256i*)ptr, 0);
+    __lasx_xvst(__lasx_xvpermi_q(cd_hi, ab_hi, 0 + 2*16), (__m256i*)(ptr), 4*8);
+    __lasx_xvst(__lasx_xvpermi_q(cd_lo, ab_lo, 1 + 3*16), (__m256i*)(ptr), 8*8);
+    __lasx_xvst(__lasx_xvpermi_q(cd_hi, ab_hi, 1 + 3*16), (__m256i*)(ptr), 12*8);
+}
+
+
+#define OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) /* defines _Tp0/_Tpvec0 overloads that forward to the existing _Tp1/_Tpvec1 overloads */ \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) /* 2-channel load */ \
+{ \
+    _Tpvec1 a1, b1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1); /* load through the pointer cast to _Tp1 */ \
+    a0 = v_reinterpret_as_##suffix0(a1); /* reinterpret the results as the requested vector type */ \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) /* 3-channel load */ \
+{ \
+    _Tpvec1 a1, b1, c1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) /* 4-channel load */ \
+{ \
+    _Tpvec1 a1, b1, c1, d1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+    d0 = v_reinterpret_as_##suffix0(d1); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, /* 2-channel store */ \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) /* mode is not forwarded */ \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); /* reinterpret the inputs as the _Tpvec1 type */ \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1/*, mode*/);      \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, /* 3-channel store */ \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1/*, mode*/);  \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, /* 4-channel store */ \
+                                const _Tpvec0& c0, const _Tpvec0& d0, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1/*, mode*/); \
+}
+
+OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8) // signed 8-bit via the unsigned 8-bit overloads
+OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16) // signed 16-bit via unsigned 16-bit
+OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32) // signed 32-bit via unsigned 32-bit
+OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32) // float via unsigned 32-bit
+OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64) // signed 64-bit via unsigned 64-bit
+OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64) // double via unsigned 64-bit
+
+//
+// FP16
+//
+
+inline v_float32x8 v256_load_expand(const hfloat* ptr) // loads hfloat values from ptr and returns them widened to float (8 elements in the fallback path)
+{
+#if CV_FP16
+    // Steps: 1) 128-bit load, 2) 64-bit element permute, 3) half -> float conversion.
+   return v_float32x8(__lasx_xvfcvtl_s_h(__lasx_xvpermi_d(__lsx_vld((const __m128i*)ptr, 0), 0x10))); // NOTE(review): a 128-bit __lsx_vld result is fed to a 256-bit LASX op -- confirm this branch builds when CV_FP16 is enabled
+#else
+    float CV_DECL_ALIGNED(32) buf[8]; // scalar fallback: convert element by element into an aligned buffer
+    for (int i = 0; i < 8; i++)
+        buf[i] = (float)ptr[i];
+    return v256_load_aligned(buf);
+#endif
+}
+
+inline void v_pack_store(hfloat* ptr, const v_float32x8& a) // converts the float lanes of a to hfloat and writes them to ptr (8 elements in the fallback path)
+{
+#if CV_FP16
+    __m256i ah = __lasx_xvfcvt_h_s(a.val, a.val); // float -> half conversion intrinsic, same vector passed for both operands
+    __lsx_vst((_m128i)ah, ptr, 0); // NOTE(review): `_m128i` looks like a typo for `__m128i`, and ah is a 256-bit value -- confirm this branch builds and stores the intended lanes when CV_FP16 is enabled
+#else
+    float CV_DECL_ALIGNED(32) buf[8]; // scalar fallback: spill to an aligned buffer, then convert element by element
+    v_store_aligned(buf, a);
+    for (int i = 0; i < 8; i++)
+        ptr[i] = hfloat(buf[i]);
+#endif
+}
+
+//
+// end of FP16
+//
+
+inline void v256_cleanup() {} // no-op: the body is intentionally empty
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+} // cv::
+
+#endif // OPENCV_HAL_INTRIN_LASX_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_lsx.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_lsx.hpp
new file mode 100644
index 000000000000..6e3290426f77
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_lsx.hpp
@@ -0,0 +1,2538 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_HAL_INTRIN_LSX_HPP
+#define OPENCV_HAL_INTRIN_LSX_HPP
+
+#include <lsxintrin.h>
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+#define CV_SIMD128_FP16 0
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+/////////// Utils ////////
+
+inline __m128i _v128_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6,
+        char v7, char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15)
+{
+    const v16i8 lanes = { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 };  // arguments in lane order: v0 is element 0
+    return (__m128i)lanes;
+}
+
+inline __m128i _v128_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6,
+        char v7, char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15)
+{
+    const v16i8 lanes = { v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0 };  // arguments in reverse lane order: v15 is element 0
+    return (__m128i)lanes;
+}
+
+inline __m128i _v128_setr_h(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+{
+    const v8i16 lanes = { v0, v1, v2, v3, v4, v5, v6, v7 };  // arguments in lane order: v0 is element 0
+    return (__m128i)lanes;
+}
+
+inline __m128i _v128_setr_w(int v0, int v1, int v2, int v3) // builds a vector of four 32-bit ints in argument order (v0 is element 0)
+{
+    return (__m128i)v4i32{ v0, v1, v2, v3 };
+}
+
+inline __m128i _v128_set_w(int v0, int v1, int v2, int v3) // builds a vector of four 32-bit ints in reverse argument order (v3 is element 0)
+{
+    return (__m128i)v4i32{ v3, v2, v1, v0 };
+}
+
+inline __m128i _v128_setall_w(int v0) // replicates v0 via __lsx_vreplgr2vr_w (presumably into all four 32-bit lanes -- confirm against the LSX manual)
+{
+    return __lsx_vreplgr2vr_w(v0);
+}
+
+inline __m128i _v128_setr_d(int64 v0, int64 v1) // builds a vector of two 64-bit ints in argument order (v0 is element 0)
+{
+    return (__m128i)v2i64{ v0, v1 };
+}
+
+inline __m128i _v128_set_d(int64 v0, int64 v1) // builds a vector of two 64-bit ints in reverse argument order (v1 is element 0)
+{
+    return (__m128i)v2i64{ v1, v0 };
+}
+
+inline __m128 _v128_setr_ps(float v0, float v1, float v2, float v3) // builds a vector of four floats in argument order (v0 is element 0)
+{
+    return (__m128)v4f32{ v0, v1, v2, v3 };
+}
+
+inline __m128 _v128_setall_ps(float v0) // builds a vector whose four float elements all equal v0
+{
+    return (__m128)v4f32{ v0, v0, v0, v0 };
+}
+
+inline __m128d _v128_setr_pd(double v0, double v1) // builds a vector of two doubles in argument order (v0 is element 0)
+{
+    return (__m128d)v2f64{ v0, v1 };
+}
+
+inline __m128d _v128_setall_pd(double v0) // builds a vector whose two double elements both equal v0
+{
+    return (__m128d)v2f64{ v0, v0 };
+}
+
+inline __m128i _lsx_packus_h(const __m128i& a, const __m128i& b) // packs a and b via the halfword -> unsigned-byte narrowing intrinsic with shift 0; note operands are passed as (b, a)
+{
+    return __lsx_vssrarni_bu_h(b, a, 0);
+}
+
+inline __m128i _lsx_packs_h(const __m128i& a, const __m128i& b) // packs a and b via the halfword -> signed-byte narrowing intrinsic with shift 0; note operands are passed as (b, a)
+{
+    return __lsx_vssrarni_b_h(b, a, 0);
+}
+
+inline __m128i _lsx_packus_w(const __m128i& a, const __m128i& b) // packs a and b via the word -> unsigned-halfword narrowing intrinsic with shift 0; note operands are passed as (b, a)
+{
+    return __lsx_vssrarni_hu_w(b, a, 0);
+}
+
+/////// Types ///////
+
+// Wrapper around one __m128i value viewed as 16 unsigned 8-bit lanes.
+struct v_uint8x16
+{
+    using lane_type = uchar;
+    enum { nlanes = 16 };
+
+    v_uint8x16() {}  // val is left uninitialized
+    explicit v_uint8x16(__m128i v) : val(v) {}
+    // Builds the vector from individual lane values; v0 becomes lane 0.
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+        : val(_v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15)) {}
+
+    uchar get0() const  // value of lane 0
+    {
+        return static_cast<uchar>(__lsx_vpickve2gr_bu(val, 0));
+    }
+
+    __m128i val;
+};
+
+// Wrapper around one __m128i value viewed as 16 signed 8-bit lanes.
+struct v_int8x16
+{
+    using lane_type = schar;
+    enum { nlanes = 16 };
+
+    v_int8x16() {}  // val is left uninitialized
+    explicit v_int8x16(__m128i v) : val(v) {}
+    // Builds the vector from individual lane values; v0 becomes lane 0.
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+        : val(_v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15)) {}
+
+    schar get0() const  // value of lane 0
+    {
+        return static_cast<schar>(__lsx_vpickve2gr_b(val, 0));
+    }
+
+    __m128i val;
+};
+
+// Wrapper around one __m128i value viewed as 8 unsigned 16-bit lanes.
+struct v_uint16x8
+{
+    using lane_type = ushort;
+    enum { nlanes = 8 };
+
+    v_uint16x8() {}  // val is left uninitialized
+    explicit v_uint16x8(__m128i v) : val(v) {}
+    // Builds the vector from individual lane values; v0 becomes lane 0.
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+        : val(_v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7)) {}
+
+    ushort get0() const  // value of lane 0
+    {
+        return static_cast<ushort>(__lsx_vpickve2gr_hu(val, 0));
+    }
+
+    __m128i val;
+};
+
+// Wrapper around one __m128i value viewed as 8 signed 16-bit lanes.
+struct v_int16x8
+{
+    using lane_type = short;
+    enum { nlanes = 8 };
+
+    v_int16x8() {}  // val is left uninitialized
+    explicit v_int16x8(__m128i v) : val(v) {}
+    // Builds the vector from individual lane values; v0 becomes lane 0.
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+        : val(_v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7)) {}
+
+    short get0() const  // value of lane 0
+    {
+        return static_cast<short>(__lsx_vpickve2gr_h(val, 0));
+    }
+
+    __m128i val;
+};
+
+// Wrapper around one __m128i value viewed as 4 unsigned 32-bit lanes.
+struct v_uint32x4
+{
+    using lane_type = unsigned;
+    enum { nlanes = 4 };
+
+    v_uint32x4() {}  // val is left uninitialized
+    explicit v_uint32x4(__m128i v) : val(v) {}
+    // Builds the vector from individual lane values; v0 becomes lane 0.
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+        : val(_v128_setr_w(v0, v1, v2, v3)) {}
+
+    unsigned get0() const  // value of lane 0
+    {
+        return static_cast<unsigned>(__lsx_vpickve2gr_wu(val, 0));
+    }
+
+    __m128i val;
+};
+
+// Wrapper around one __m128i value viewed as 4 signed 32-bit lanes.
+struct v_int32x4
+{
+    using lane_type = int;
+    enum { nlanes = 4 };
+
+    v_int32x4() {}  // val is left uninitialized
+    explicit v_int32x4(__m128i v) : val(v) {}
+    // Builds the vector from individual lane values; v0 becomes lane 0.
+    v_int32x4(int v0, int v1, int v2, int v3)
+        : val(_v128_setr_w(v0, v1, v2, v3)) {}
+
+    int get0() const  // value of lane 0
+    {
+        return static_cast<int>(__lsx_vpickve2gr_w(val, 0));
+    }
+
+    __m128i val;
+};
+
+// Wrapper around one __m128 value viewed as 4 single-precision float lanes.
+struct v_float32x4
+{
+    using lane_type = float;
+    enum { nlanes = 4 };
+
+    v_float32x4() {}  // val is left uninitialized
+    explicit v_float32x4(__m128 v) : val(v) {}
+    explicit v_float32x4(__m128i v) : val(*reinterpret_cast<__m128*>(&v)) {}  // reuses the bits of v unchanged
+    v_float32x4(float v0, float v1, float v2, float v3)
+        : val(_v128_setr_ps(v0, v1, v2, v3)) {}
+
+    float get0() const  // value of lane 0, read as raw bits and reinterpreted through a union
+    {
+        union { int iv; float fv; } bits;
+        bits.iv = __lsx_vpickve2gr_w(val, 0);
+        return bits.fv;
+    }
+
+    // Converts the lanes to 32-bit integers with __lsx_vftintrz_w_s and returns lane 0.
+    int get0toint() const
+    {
+        const __m128i as_int = __lsx_vftintrz_w_s(val);
+        return static_cast<int>(__lsx_vpickve2gr_w(as_int, 0));
+    }
+
+    __m128 val;
+};
+
+// Wrapper around one __m128i value viewed as 2 unsigned 64-bit lanes.
+struct v_uint64x2
+{
+    using lane_type = uint64;
+    enum { nlanes = 2 };
+
+    v_uint64x2() {}  // val is left uninitialized
+    explicit v_uint64x2(__m128i v) : val(v) {}
+    // Builds the vector from individual lane values; v0 becomes lane 0.
+    v_uint64x2(uint64 v0, uint64 v1)
+        : val(_v128_setr_d(v0, v1)) {}
+
+    uint64 get0() const  // value of lane 0
+    {
+        return static_cast<uint64>(__lsx_vpickve2gr_du(val, 0));
+    }
+
+    __m128i val;
+};
+
+struct v_int64x2 // wrapper around one __m128i value viewed as 2 signed 64-bit lanes
+{
+    typedef int64 lane_type;
+    enum { nlanes = 2};
+
+    v_int64x2() {} // val is left uninitialized
+    explicit v_int64x2(__m128i v) : val(v) {} // wraps an existing register value
+    v_int64x2(int64 v0, int64 v1) // builds the vector from lane values; v0 becomes lane 0
+    {
+        val = _v128_setr_d(v0, v1);
+    }
+
+    int64 get0() const // value of lane 0; returns lane_type so a negative lane keeps its sign
+    {
+        return __lsx_vpickve2gr_d(val, 0);
+    }
+
+    __m128i val;
+};
+
+// Wrapper around one __m128d value viewed as 2 double-precision float lanes.
+struct v_float64x2
+{
+    using lane_type = double;
+    enum { nlanes = 2 };
+
+    v_float64x2() {}  // val is left uninitialized
+    explicit v_float64x2(__m128d v) : val(v) {}
+    explicit v_float64x2(__m128i v) : val(*reinterpret_cast<__m128d*>(&v)) {}  // reuses the bits of v unchanged
+    v_float64x2(double v0, double v1)
+        : val(_v128_setr_pd(v0, v1)) {}
+
+    double get0() const  // value of lane 0, read as raw bits and reinterpreted through a union
+    {
+        union { int64 iv; double fv; } bits;
+        bits.iv = __lsx_vpickve2gr_d(val, 0);
+        return bits.fv;
+    }
+
+    // Converts the lanes to 64-bit integers with __lsx_vftintrz_l_d and returns lane 0.
+    int64 get0toint64() const
+    {
+        const __m128i as_int = __lsx_vftintrz_l_d(val);
+        return static_cast<int64>(__lsx_vpickve2gr_d(as_int, 0));
+    }
+
+    __m128d val;
+};
+
+////////////// Load and store operations /////////
+
+#define OPENCV_HAL_IMPL_LSX_LOADSTORE(_Tpvec, _Tp) /* load/store overloads for a vector type _Tpvec with element type _Tp */ \
+    inline _Tpvec v_load(const _Tp* ptr)                               \
+    { return _Tpvec(__lsx_vld(ptr, 0)); }                              \
+    inline _Tpvec v_load_aligned(const _Tp* ptr) /* same instruction as v_load */ \
+    { return _Tpvec(__lsx_vld(ptr, 0)); }                              \
+    inline _Tpvec v_load_low(const _Tp* ptr) /* 64-bit replicating load */ \
+    { return _Tpvec(__lsx_vldrepl_d(ptr, 0)); }                        \
+    inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) /* combines 64 bits from ptr0 with 64 bits from ptr1 */ \
+    {                                                                  \
+        __m128i vl = __lsx_vldrepl_d(ptr0, 0);                         \
+        __m128i vh = __lsx_vldrepl_d(ptr1, 0);                         \
+        return _Tpvec(__lsx_vilvl_d(vh, vl));                          \
+    }                                                                  \
+    inline void v_store(_Tp* ptr, const _Tpvec& a)                     \
+    { __lsx_vst(a.val, ptr, 0); }                                      \
+    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) /* same instruction as v_store */ \
+    { __lsx_vst(a.val, ptr, 0); }                                      \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) /* same instruction as v_store */ \
+    { __lsx_vst(a.val, ptr, 0); }                                      \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)\
+    { /* every mode branch below issues the identical __lsx_vst */ \
+        if ( mode == hal::STORE_UNALIGNED)                             \
+            __lsx_vst(a.val, ptr, 0);                                  \
+        else if ( mode == hal::STORE_ALIGNED_NOCACHE)                  \
+            __lsx_vst(a.val, ptr, 0);                                  \
+        else                                                           \
+            __lsx_vst(a.val, ptr, 0);                                  \
+    }                                                                  \
+    inline void v_store_low(_Tp* ptr, const _Tpvec& a) /* stores 64-bit element 0 */ \
+    {  __lsx_vstelm_d(a.val, ptr, 0, 0); }                             \
+    inline void v_store_high(_Tp* ptr, const _Tpvec& a) /* stores 64-bit element 1 */ \
+    {  __lsx_vstelm_d(a.val, ptr, 0, 1); }                             \
+
+OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint8x16,  uchar) // instantiations for the eight integer vector types
+OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int8x16,   schar)
+OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint16x8, ushort)
+OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int16x8,  short)
+OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint32x4,  unsigned)
+OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int32x4,   int)
+OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint64x2,  uint64)
+OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int64x2,   int64)
+
+#define OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg) /* halfreg: float vector type (__m128 / __m128d below) the integer load result is cast to */ \
+    inline _Tpvec v_load(const _Tp* ptr)                               \
+    { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); }                     \
+    inline _Tpvec v_load_aligned(const _Tp* ptr) /* same vld as v_load */ \
+    { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); }                     \
+    inline _Tpvec v_load_low(const _Tp* ptr) /* vldrepl_d: one 64-bit element, replicated */ \
+    { return _Tpvec((halfreg)__lsx_vldrepl_d(ptr, 0)); }               \
+    inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1)      \
+    {                                                                  \
+        __m128i vl = __lsx_vldrepl_d(ptr0, 0);                         \
+        __m128i vh = __lsx_vldrepl_d(ptr1, 0);                         \
+        return _Tpvec((halfreg)__lsx_vilvl_d(vh, vl)); /* low half from ptr0, high half from ptr1 */ \
+    }                                                                  \
+    inline void v_store(_Tp* ptr, const _Tpvec& a)                     \
+    {  __lsx_vst((__m128i)a.val, ptr, 0); }                            \
+    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)             \
+    {  __lsx_vst((__m128i)a.val, ptr, 0); }                            \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)     \
+    {  __lsx_vst((__m128i)a.val, ptr, 0); }                            \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)\
+    { /* every StoreMode branch below issues the identical vst */      \
+        if( mode == hal::STORE_UNALIGNED)                              \
+            __lsx_vst((__m128i)a.val, ptr, 0);                         \
+        else if( mode == hal::STORE_ALIGNED_NOCACHE)                   \
+            __lsx_vst((__m128i)a.val, ptr, 0);                         \
+        else                                                           \
+            __lsx_vst((__m128i)a.val, ptr, 0);                         \
+    }                                                                  \
+    inline void v_store_low(_Tp* ptr, const _Tpvec& a) /* stores 64-bit element 0 */ \
+    {  __lsx_vstelm_d((__m128i)a.val, ptr, 0, 0); }                    \
+    inline void v_store_high(_Tp* ptr, const _Tpvec& a) /* stores 64-bit element 1 */ \
+    {  __lsx_vstelm_d((__m128i)a.val, ptr, 0, 1); }                    \
+
+OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float32x4, float, __m128)
+OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float64x2, double, __m128d)
+
+inline __m128i _lsx_128_castps_si128(const __m128& v)
+{ return (__m128i)v; }   // float vector viewed as an integer vector (same-size vector cast)
+
+inline __m128i _lsx_128_castpd_si128(const __m128d& v)
+{ return (__m128i)v; }   // double vector viewed as an integer vector (same-size vector cast)
+
+#define OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, _Tpvecf, suffix, cast) /* defines v_reinterpret_as_<suffix>(_Tpvecf) returning _Tpvec(cast(a.val)) */ \
+    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a)    \
+    { return _Tpvec(cast(a.val)); }
+
+#define OPENCV_HAL_IMPL_LSX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) /* v_setzero_/v_setall_/v_reinterpret_as_<suffix> for one integer vector type */ \
+    inline _Tpvec v_setzero_##suffix()                                            \
+    { return _Tpvec(__lsx_vldi(0)); }                                             \
+    inline _Tpvec v_setall_##suffix(_Tp v) /* v is cast to ctype_s, then replicated with vreplgr2vr */ \
+    { return _Tpvec(__lsx_vreplgr2vr_##ssuffix((ctype_s)v)); }                    \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16,  suffix, OPENCV_HAL_NOP) /* integer sources use OPENCV_HAL_NOP as the cast */ \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16,   suffix, OPENCV_HAL_NOP)         \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8,  suffix, OPENCV_HAL_NOP)         \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8,   suffix, OPENCV_HAL_NOP)         \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4,  suffix, OPENCV_HAL_NOP)         \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4,   suffix, OPENCV_HAL_NOP)         \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2,  suffix, OPENCV_HAL_NOP)         \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2,   suffix, OPENCV_HAL_NOP)         \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float32x4, suffix, _lsx_128_castps_si128) /* float sources go through the *_si128 helpers */ \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float64x2, suffix, _lsx_128_castpd_si128)  \
+
+OPENCV_HAL_IMPL_LSX_INIT(v_uint8x16,  uchar,    u8,   b,  int)
+OPENCV_HAL_IMPL_LSX_INIT(v_int8x16,   schar,    s8,   b,  int)
+OPENCV_HAL_IMPL_LSX_INIT(v_uint16x8,  ushort,   u16,  h,  int)
+OPENCV_HAL_IMPL_LSX_INIT(v_int16x8,   short,    s16,  h,  int)
+OPENCV_HAL_IMPL_LSX_INIT(v_uint32x4,  unsigned, u32,  w,  int)
+OPENCV_HAL_IMPL_LSX_INIT(v_int32x4,   int,      s32,  w,  int)
+OPENCV_HAL_IMPL_LSX_INIT(v_uint64x2,  uint64,   u64,  d,  long int)   // 64-bit lanes pass long int as ctype_s
+OPENCV_HAL_IMPL_LSX_INIT(v_int64x2,   int64,    s64,  d,  long int)
+
+inline __m128 _lsx_128_castsi128_ps(const __m128i &v)
+{ return (__m128)v; }    // integer vector viewed as a float vector (same-size vector cast)
+
+inline __m128d _lsx_128_castsi128_pd(const __m128i &v)
+{ return (__m128d)v; }   // integer vector viewed as a double vector (same-size vector cast)
+
+#define OPENCV_HAL_IMPL_LSX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) /* v_setzero_/v_setall_/v_reinterpret_as_<suffix> for one float vector type */ \
+    inline _Tpvec v_setzero_##suffix()                                      \
+    { return _Tpvec(__lsx_vldi(0)); }                                       \
+    inline _Tpvec v_setall_##suffix(_Tp v) /* _v128_setall_<zsuffix> is defined outside this macro */ \
+    { return _Tpvec(_v128_setall_##zsuffix(v)); }                           \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16,     suffix,   cast) /* integer sources are converted with cast() */ \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16,      suffix,   cast)        \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8,     suffix,   cast)        \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8,      suffix,   cast)        \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4,     suffix,   cast)        \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4,      suffix,   cast)        \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2,     suffix,   cast)        \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2,      suffix,   cast)        \
+
+OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float32x4, float,  f32, ps, _lsx_128_castsi128_ps)
+OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float64x2, double, f64, pd, _lsx_128_castsi128_pd)
+
+inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a)
+{ return a; }   // already f32: handed back unchanged
+inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a)
+{ const __m128i bits = _lsx_128_castps_si128((__m128)a.val); return v_float32x4(bits); }
+
+inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a)
+{ return a; }   // already f64: handed back unchanged
+inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a)
+{ const __m128i bits = _lsx_128_castpd_si128((__m128d)a.val); return v_float64x2(bits); }
+
+//////////////// Variant Value reordering ///////////////
+
+// unpacks: v128_unpacklo / v128_unpackhi apply vilvl / vilvh at the given lane width to a and b (both cast to __m128i)
+#define OPENCV_HAL_IMPL_LSX_UNPACK(_Tpvec, suffix)                            \
+    inline _Tpvec v128_unpacklo(const _Tpvec& a, const _Tpvec& b)             \
+    { return _Tpvec(__lsx_vilvl_##suffix(__m128i(b.val), __m128i(a.val))); }  \
+    inline _Tpvec v128_unpackhi(const _Tpvec& a, const _Tpvec& b)             \
+    { return _Tpvec(__lsx_vilvh_##suffix(__m128i(b.val), __m128i(a.val))); }  \
+
+OPENCV_HAL_IMPL_LSX_UNPACK(v_uint8x16,  b)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_int8x16,   b)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_uint16x8,  h)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_int16x8,   h)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_uint32x4,  w)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_int32x4,   w)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_uint64x2,  d)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_int64x2,   d)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_float32x4, w)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_float64x2, d)
+
+// ZIP: v_combine_low/high, v_recombine and v_zip, generated for every 128-bit vector type
+#define OPENCV_HAL_IMPL_LSX_ZIP(_Tpvec)                               \
+    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)  /* vilvl_d on the 64-bit elements of a and b */ \
+    { return (_Tpvec)__lsx_vilvl_d((__m128i)b.val, (__m128i)a.val); } \
+    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) /* vilvh_d on the 64-bit elements of a and b */ \
+    { return (_Tpvec)__lsx_vilvh_d((__m128i)b.val, (__m128i)a.val); } \
+    inline void v_recombine(const _Tpvec& a, const _Tpvec& b,         \
+                            _Tpvec& c, _Tpvec& d)                     \
+    {                                                                 \
+        __m128i a1 = (__m128i)a.val,  b1 = (__m128i)b.val;            \
+        c = _Tpvec(__lsx_vilvl_d(b1, a1)); /* same intrinsic and operands as v_combine_low */ \
+        d = _Tpvec(__lsx_vilvh_d(b1, a1)); /* same intrinsic and operands as v_combine_high */ \
+    }                                                                 \
+    inline void v_zip(const _Tpvec& a, const _Tpvec& b,               \
+                      _Tpvec& ab0, _Tpvec& ab1)                       \
+    {                                                                 \
+        ab0 = v128_unpacklo(a, b);                                    \
+        ab1 = v128_unpackhi(a, b);                                    \
+    }
+
+OPENCV_HAL_IMPL_LSX_ZIP(v_uint8x16)
+OPENCV_HAL_IMPL_LSX_ZIP(v_int8x16)
+OPENCV_HAL_IMPL_LSX_ZIP(v_uint16x8)
+OPENCV_HAL_IMPL_LSX_ZIP(v_int16x8)
+OPENCV_HAL_IMPL_LSX_ZIP(v_uint32x4)
+OPENCV_HAL_IMPL_LSX_ZIP(v_int32x4)
+OPENCV_HAL_IMPL_LSX_ZIP(v_uint64x2)
+OPENCV_HAL_IMPL_LSX_ZIP(v_int64x2)
+OPENCV_HAL_IMPL_LSX_ZIP(v_float32x4)
+OPENCV_HAL_IMPL_LSX_ZIP(v_float64x2)
+
+////////// Arithmetic, bitwise and comparison operations /////////
+
+/** Arithmetics: each use defines operator bin_op and operator bin_op= from one two-operand intrinsic **/
+#define OPENCV_HAL_IMPL_LSX_BIN_OP(bin_op, _Tpvec, intrin)           \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)  \
+    { return _Tpvec(intrin(a.val, b.val)); }                          \
+    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)    \
+    { a.val = intrin(a.val, b.val); return a; }
+
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint8x16,  __lsx_vsadd_bu)   // 8- and 16-bit lanes map + / - to the vsadd / vssub intrinsics
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint8x16,  __lsx_vssub_bu)
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int8x16,   __lsx_vsadd_b)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int8x16,   __lsx_vssub_b)
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint16x8,  __lsx_vsadd_hu)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint16x8,  __lsx_vssub_hu)
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int16x8,   __lsx_vsadd_h)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int16x8,   __lsx_vssub_h)
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint32x4,  __lsx_vadd_w)     // 32- and 64-bit lanes map to plain vadd / vsub / vmul
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint32x4,  __lsx_vsub_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_uint32x4,  __lsx_vmul_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int32x4,   __lsx_vadd_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int32x4,   __lsx_vsub_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_int32x4,   __lsx_vmul_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint64x2,  __lsx_vadd_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint64x2,  __lsx_vsub_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int64x2,   __lsx_vadd_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int64x2,   __lsx_vsub_d)
+
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s)    // float lanes: vfadd / vfsub / vfmul / vfdiv
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float32x4, __lsx_vfsub_s)
+OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float32x4, __lsx_vfmul_s)
+OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float32x4, __lsx_vfdiv_s)
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float64x2, __lsx_vfadd_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float64x2, __lsx_vfsub_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float64x2, __lsx_vfmul_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float64x2, __lsx_vfdiv_d)
+
+// Saturating multiply, 8-bit lanes: widen the products to 16 bits, then narrow them again with v_pack.
+inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b)
+{
+    v_uint16x8 lo, hi;
+    v_mul_expand(a, b, lo, hi);
+    return v_pack(lo, hi);
+}
+inline v_int8x16 operator * (const v_int8x16& a, const v_int8x16& b)
+{
+    v_int16x8 lo, hi;
+    v_mul_expand(a, b, lo, hi);
+    return v_pack(lo, hi);
+}
+inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b)
+{
+    const __m128i lhs = a.val, rhs = b.val;
+    const __m128i even = __lsx_vmulwev_w_hu(lhs, rhs);   // widened products of the even lanes
+    const __m128i odd  = __lsx_vmulwod_w_hu(lhs, rhs);   // widened products of the odd lanes
+    const __m128i lo   = __lsx_vilvl_w(odd, even);       // interleave back: products of lanes 0..3
+    const __m128i hi   = __lsx_vilvh_w(odd, even);       // products of lanes 4..7
+    return (v_uint16x8)__lsx_vssrlrni_hu_w(hi, lo, 0);   // narrow to 16 bits with a zero shift
+}
+inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b)
+{
+    const __m128i lhs = a.val, rhs = b.val;
+    const __m128i even = __lsx_vmulwev_w_h(lhs, rhs);    // widened products of the even lanes
+    const __m128i odd  = __lsx_vmulwod_w_h(lhs, rhs);    // widened products of the odd lanes
+    const __m128i lo   = __lsx_vilvl_w(odd, even);       // interleave back: products of lanes 0..3
+    const __m128i hi   = __lsx_vilvh_w(odd, even);       // products of lanes 4..7
+    return (v_int16x8)__lsx_vssrarni_h_w(hi, lo, 0);     // narrow to 16 bits with a zero shift
+}
+inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
+{ return a = a * b; }   // compound form forwards to the saturating operator *
+inline v_int8x16& operator *= (v_int8x16& a, const v_int8x16& b)
+{ return a = a * b; }
+inline v_uint16x8& operator *= (v_uint16x8& a, const v_uint16x8& b)
+{ return a = a * b; }
+inline v_int16x8& operator *= (v_int16x8& a, const v_int16x8& b)
+{ return a = a * b; }
+
+/** Non-saturating arithmetics: named func(a, b) wrappers around one two-operand intrinsic **/
+
+#define OPENCV_HAL_IMPL_LSX_BIN_FUNC(func, _Tpvec, intrin)         \
+    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b)           \
+    { return _Tpvec(intrin(a.val, b.val)); }                       \
+
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint8x16,  __lsx_vadd_b)   // signed and unsigned share the same vadd/vsub/vmul intrinsic
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int8x16,   __lsx_vadd_b)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint16x8,  __lsx_vadd_h)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int16x8,   __lsx_vadd_h)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint8x16,  __lsx_vsub_b)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int8x16,   __lsx_vsub_b)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint16x8,  __lsx_vsub_h)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int16x8,   __lsx_vsub_h)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_uint16x8,  __lsx_vmul_h)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_int16x8,   __lsx_vmul_h)
+
+inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
+{
+    const __m128i lhs = a.val, rhs = b.val;
+    const __m128i even = __lsx_vmulwev_h_bu(lhs, rhs);   // 16-bit products of the even byte lanes
+    const __m128i odd  = __lsx_vmulwod_h_bu(lhs, rhs);   // 16-bit products of the odd byte lanes
+    return v_uint8x16(__lsx_vpackev_b(odd, even));        // pack the even (low) byte of every product
+}
+
+inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
+{   // signed lanes are multiplied through the unsigned overload and reinterpreted back
+    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
+}
+
+// Multiply and expand: c receives the widened products of the low half of the lanes, d those of the high half
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
+                         v_uint16x8& c, v_uint16x8& d)
+{
+    const __m128i lhs = a.val, rhs = b.val;
+    const __m128i even = __lsx_vmulwev_h_bu(lhs, rhs);   // widened products of even lanes
+    const __m128i odd  = __lsx_vmulwod_h_bu(lhs, rhs);   // widened products of odd lanes
+    c.val = __lsx_vilvl_h(odd, even);                    // interleave to restore lane order
+    d.val = __lsx_vilvh_h(odd, even);
+}
+inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
+                         v_int16x8& c, v_int16x8& d)
+{
+    const __m128i lhs = a.val, rhs = b.val;
+    const __m128i even = __lsx_vmulwev_h_b(lhs, rhs);
+    const __m128i odd  = __lsx_vmulwod_h_b(lhs, rhs);
+    c.val = __lsx_vilvl_h(odd, even);
+    d.val = __lsx_vilvh_h(odd, even);
+}
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
+                         v_int32x4& c, v_int32x4& d)
+{
+    const __m128i lhs = a.val, rhs = b.val;
+    const __m128i even = __lsx_vmulwev_w_h(lhs, rhs);
+    const __m128i odd  = __lsx_vmulwod_w_h(lhs, rhs);
+    c.val = __lsx_vilvl_w(odd, even);
+    d.val = __lsx_vilvh_w(odd, even);
+}
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
+                         v_uint32x4& c, v_uint32x4& d)
+{
+    const __m128i lhs = a.val, rhs = b.val;
+    const __m128i even = __lsx_vmulwev_w_hu(lhs, rhs);
+    const __m128i odd  = __lsx_vmulwod_w_hu(lhs, rhs);
+    c.val = __lsx_vilvl_w(odd, even);
+    d.val = __lsx_vilvh_w(odd, even);
+}
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d)
+{
+    const __m128i lhs = a.val, rhs = b.val;
+    const __m128i even = __lsx_vmulwev_d_wu(lhs, rhs);
+    const __m128i odd  = __lsx_vmulwod_d_wu(lhs, rhs);
+    c.val = __lsx_vilvl_d(odd, even);
+    d.val = __lsx_vilvh_d(odd, even);
+}
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{ const __m128i high = __lsx_vmuh_h(a.val, b.val); return v_int16x8(high); }     // signed vmuh
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{ const __m128i high = __lsx_vmuh_hu(a.val, b.val); return v_uint16x8(high); }   // unsigned vmuh
+
+/** Bitwise shifts: operators << / >> take a runtime count, v_shl<imm> / v_shr<imm> a compile-time count **/
+#define OPENCV_HAL_IMPL_LSX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai)                 \
+    inline _Tpuvec operator << (const _Tpuvec& a, int imm) /* count is replicated into a vector with vreplgr2vr */ \
+    { return _Tpuvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); }  \
+    inline _Tpsvec operator << (const _Tpsvec& a, int imm)                           \
+    { return _Tpsvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); }  \
+    inline _Tpuvec operator >> (const _Tpuvec& a, int imm) /* unsigned right shift uses vsrl */ \
+    { return _Tpuvec(__lsx_vsrl_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); }  \
+    inline _Tpsvec operator >> (const _Tpsvec& a, int imm) /* signed right shift uses the srai argument (a __lsx_vsra_* intrinsic below) */ \
+    { return _Tpsvec(srai(a.val, __lsx_vreplgr2vr_##suffix(imm))); }                 \
+    template<int imm>                                                                \
+    inline _Tpuvec v_shl(const _Tpuvec& a)                                           \
+    { return _Tpuvec(__lsx_vslli_##suffix(a.val, imm)); }                            \
+    template<int imm>                                                                \
+    inline _Tpsvec v_shl(const _Tpsvec& a)                                           \
+    { return _Tpsvec(__lsx_vslli_##suffix(a.val, imm)); }                            \
+    template<int imm>                                                                \
+    inline _Tpuvec v_shr(const _Tpuvec& a)                                           \
+    { return _Tpuvec(__lsx_vsrli_##suffix(a.val, imm)); }                            \
+    template<int imm>                                                                \
+    inline _Tpsvec v_shr(const _Tpsvec& a)                                           \
+    { return _Tpsvec(__lsx_vsrai_##suffix(a.val, imm)); }                            \
+
+OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint16x8, v_int16x8, h, __lsx_vsra_h)
+OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint32x4, v_int32x4, w, __lsx_vsra_w)
+OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint64x2, v_int64x2, d, __lsx_vsra_d)
+
+/** Bitwise logic: &, |, ^ (with their compound forms) and ~ for the integer vector types **/
+#define OPENCV_HAL_IMPL_LSX_LOGIC_OP(_Tpvec, suffix)                                 \
+    OPENCV_HAL_IMPL_LSX_BIN_OP(&, _Tpvec, __lsx_vand_##suffix)                       \
+    OPENCV_HAL_IMPL_LSX_BIN_OP(|, _Tpvec, __lsx_vor_##suffix)                        \
+    OPENCV_HAL_IMPL_LSX_BIN_OP(^, _Tpvec, __lsx_vxor_##suffix)                       \
+    inline _Tpvec operator ~(const _Tpvec& a) /* complement obtained as NOR with immediate 0 */ \
+    { return _Tpvec(__lsx_vnori_b(a.val, 0)); }                                      \
+
+OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint8x16,   v)
+OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int8x16,    v)
+OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint16x8,   v)
+OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int16x8,    v)
+OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint32x4,   v)
+OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int32x4,    v)
+OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint64x2,   v)
+OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int64x2,    v)
+
+#define OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) /* bitwise op on float vectors: operands are cast to __m128i; cast() converts the result back for op= */ \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)                 \
+    { return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); }                   \
+    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)                   \
+    { __m128i c = intrin((__m128i)(a.val), (__m128i)b.val);                          \
+      a.val = cast(c);                                                               \
+      return a;}
+
+#define OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(_Tpvec, cast) /* &, |, ^ and ~ for one float vector type */ \
+    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(&, _Tpvec, __lsx_vand_v, cast)                  \
+    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(|, _Tpvec, __lsx_vor_v, cast)                   \
+    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(^, _Tpvec, __lsx_vxor_v, cast)                  \
+    inline _Tpvec operator ~ (const _Tpvec& a) /* complement obtained as NOR with immediate 0 */ \
+    { return _Tpvec(__lsx_vnori_b((__m128i)(a.val), 0)); }                           \
+
+OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float32x4, _lsx_128_castsi128_ps)
+OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float64x2, _lsx_128_castsi128_pd)
+
+/** Select: v_select(mask, a, b) via vbitsel_v(b, a, mask) - presumably bits of a where mask is set, else bits of b (confirm against the LSX manual) **/
+#define OPENCV_HAL_IMPL_LSX_SELECT(_Tpvec)                                           \
+    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b)     \
+    { return _Tpvec(__lsx_vbitsel_v(b.val, a.val, mask.val)); }                      \
+
+OPENCV_HAL_IMPL_LSX_SELECT(v_uint8x16)
+OPENCV_HAL_IMPL_LSX_SELECT(v_int8x16)
+OPENCV_HAL_IMPL_LSX_SELECT(v_uint16x8)
+OPENCV_HAL_IMPL_LSX_SELECT(v_int16x8)
+OPENCV_HAL_IMPL_LSX_SELECT(v_uint32x4)
+OPENCV_HAL_IMPL_LSX_SELECT(v_int32x4)
+
+inline v_float32x4 v_select(const v_float32x4 &mask, const v_float32x4 &a, const v_float32x4 &b)
+{ const __m128i m = (__m128i)mask.val; return v_float32x4(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, m)); }
+inline v_float64x2 v_select(const v_float64x2 &mask, const v_float64x2 &a, const v_float64x2 &b)
+{ const __m128i m = (__m128i)mask.val; return v_float64x2(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, m)); }
+
+/** Comparison: !=, <, >= and <= are derived here from the == and > operators of the same type **/
+#define OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpvec)                            \
+    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)         \
+    { return ~( a == b ); }                                              \
+    inline _Tpvec operator <  (const _Tpvec& a, const _Tpvec& b)         \
+    { return b > a ; }                                                   \
+    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b)         \
+    { return ~(a < b); }                                                 \
+    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b)         \
+    { return b >= a; }                                                   \
+
+#define OPENCV_HAL_IMPL_LSX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) /* == and > for an unsigned/signed pair; the rest comes from CMP_OP_OV */ \
+    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b)          \
+    { return _Tpuvec(__lsx_vseq_##suffix(a.val, b.val)); }                   \
+    inline _Tpuvec operator >  (const _Tpuvec& a, const _Tpuvec& b) /* a > b written as vslt(b, a), unsigned variant */ \
+    { return _Tpuvec(__lsx_vslt_##usuffix(b.val, a.val)); }                  \
+    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b)          \
+    { return _Tpsvec(__lsx_vseq_##suffix(a.val, b.val)); }                   \
+    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) /* a > b written as vslt(b, a), signed variant */ \
+    { return _Tpsvec(__lsx_vslt_##suffix(b.val, a.val)); }                   \
+    OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpuvec)                                   \
+    OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpsvec)
+
+OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint8x16,  v_int8x16,  b, bu)
+OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint16x8,  v_int16x8,  h, hu)
+OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint32x4,  v_int32x4,  w, wu)
+
+#define OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(_Tpvec, suffix) /* 64-bit lanes: this macro defines only == and != */ \
+    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b)  \
+    { return _Tpvec(__lsx_vseq_##suffix(a.val, b.val)); }         \
+    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)  \
+    { return ~(a == b); }
+
+OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_uint64x2, d)
+OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_int64x2, d)
+
+#define OPENCV_HAL_IMPL_LSX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) /* operator bin_op forwards to __lsx_<suffix>_<ssuffix>(a.val, b.val) */ \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)       \
+    { return _Tpvec(__lsx_##suffix##_##ssuffix(a.val, b.val)); }           \
+
+#define OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(_Tpvec, ssuffix) /* ==, !=, < and <= for one float vector type */ \
+    OPENCV_HAL_IMPL_LSX_CMP_FLT(==, vfcmp_ceq, _Tpvec, ssuffix)            \
+    OPENCV_HAL_IMPL_LSX_CMP_FLT(!=, vfcmp_cne, _Tpvec, ssuffix)            \
+    OPENCV_HAL_IMPL_LSX_CMP_FLT(<,  vfcmp_clt, _Tpvec, ssuffix)            \
+    OPENCV_HAL_IMPL_LSX_CMP_FLT(<=, vfcmp_cle, _Tpvec, ssuffix)            \
+
+OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float32x4, s)
+OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float64x2, d)
+
+inline v_float32x4 operator > (const v_float32x4 &a, const v_float32x4 &b)
+{ return b < a; }    // operator < is vfcmp_clt_s, so this is vfcmp_clt_s(b.val, a.val)
+
+inline v_float32x4 operator >= (const v_float32x4 &a, const v_float32x4 &b)
+{ return b <= a; }   // operator <= is vfcmp_cle_s, so this is vfcmp_cle_s(b.val, a.val)
+
+inline v_float64x2 operator > (const v_float64x2 &a, const v_float64x2 &b)
+{ return b < a; }    // operator < is vfcmp_clt_d, so this is vfcmp_clt_d(b.val, a.val)
+
+inline v_float64x2 operator >= (const v_float64x2 &a, const v_float64x2 &b)
+{ return b <= a; }   // operator <= is vfcmp_cle_d, so this is vfcmp_cle_d(b.val, a.val)
+
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ return (v_float32x4)__lsx_vfcmp_cor_s(a.val, a.val); }   // vfcmp_cor of a with itself
+
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ return (v_float64x2)__lsx_vfcmp_cor_d(a.val, a.val); }   // vfcmp_cor of a with itself
+
+/** min/max: v_min / v_max per vector type, each a BIN_FUNC wrapper around the matching vmin / vmax (vfmin / vfmax for floats) intrinsic **/
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint8x16,  __lsx_vmin_bu)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint8x16,  __lsx_vmax_bu)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int8x16,   __lsx_vmin_b)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int8x16,   __lsx_vmax_b)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint16x8,  __lsx_vmin_hu)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint16x8,  __lsx_vmax_hu)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int16x8,   __lsx_vmin_h)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int16x8,   __lsx_vmax_h)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint32x4,  __lsx_vmin_wu)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint32x4,  __lsx_vmax_wu)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int32x4,   __lsx_vmin_w)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int32x4,   __lsx_vmax_w)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float32x4, __lsx_vfmin_s)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float32x4, __lsx_vfmax_s)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float64x2, __lsx_vfmin_d)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float64x2, __lsx_vfmax_d)
+
+template <int imm,   // byte offset; the defaulted flags below classify it so that exactly one specialization matches
+    bool is_invalid = ((imm < 0) || (imm > 16)),
+    bool is_first = (imm == 0),
+    bool is_half = (imm == 8),
+    bool is_second = (imm == 16),
+    bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
+class v_lsx_palignr_u8_class;   // primary template: declaration only
+
+template <int imm>
+class v_lsx_palignr_u8_class<imm, true, false, false, false, false>;   // out-of-range imm: declared but never defined
+
+template <int imm>
+class v_lsx_palignr_u8_class<imm, false, true, false, false, false>    // imm == 0: result is a
+{
+public:
+    inline __m128i operator()(const __m128i& a, const __m128i& b) const
+    {
+        CV_UNUSED(b);
+        return a;
+    }
+};
+
+template <int imm>
+class v_lsx_palignr_u8_class<imm, false, false, true, false, false>    // imm == 8: one 64-bit shuffle
+{
+public:
+    inline __m128i operator()(const __m128i& a, const __m128i& b) const
+    {
+        return __lsx_vshuf4i_d(a, b, 0x9);   // presumably high half of a followed by low half of b - confirm against the vshuf4i.d description
+    }
+};
+
+template <int imm>
+class v_lsx_palignr_u8_class<imm, false, false, false, true, false>    // imm == 16: result is b
+{
+public:
+    inline __m128i operator()(const __m128i& a, const __m128i& b) const
+    {
+        CV_UNUSED(a);
+        return b;
+    }
+};
+
+template <int imm>
+class v_lsx_palignr_u8_class<imm, false, false, false, false, true>    // any other imm in 1..15
+{
+public:
+    inline __m128i operator()(const __m128i& a, const __m128i& b) const
+    {
+        enum { imm2 = (sizeof(__m128i) - imm) };   // complementary byte count (16 - imm)
+        return __lsx_vor_v(__lsx_vbsrl_v(a, imm), __lsx_vbsll_v(b, imm2));   // a byte-shifted right by imm, OR-ed with b byte-shifted left by imm2
+    }
+};
+
+template <int imm>
+inline __m128i v_lsx_palignr_u8(const __m128i& a, const __m128i& b)    // entry point: checks imm, then dispatches on it
+{
+    CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_lsx_palignr_u8");
+    return v_lsx_palignr_u8_class<imm>()(a, b);
+}
+/** Rotate: v_rotate_left / v_rotate_right by imm lanes; the one-operand forms use a plain byte shift, the two-operand forms go through v_lsx_palignr_u8 **/
+#define OPENCV_HAL_IMPL_LSX_ROTATE_CAST(_Tpvec, cast)                                   \
+    template<int imm>                                                                   \
+    inline _Tpvec v_rotate_right(const _Tpvec &a)                                       \
+    {                                                                                   \
+        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type))}; /* lane count converted to bytes */ \
+        __m128i ret = __lsx_vbsrl_v((__m128i)a.val, imm2);                              \
+        return _Tpvec(cast(ret));                                                       \
+    }                                                                                   \
+    template<int imm>                                                                   \
+    inline _Tpvec v_rotate_left(const _Tpvec &a)                                        \
+    {                                                                                   \
+        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type))};                      \
+        __m128i ret = __lsx_vbsll_v((__m128i)a.val, imm2);                              \
+        return _Tpvec(cast(ret));                                                       \
+    }                                                                                   \
+    template<int imm>                                                                   \
+    inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)                      \
+    {                                                                                   \
+        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type))};                      \
+        return _Tpvec(cast(v_lsx_palignr_u8<imm2>((__m128i)a.val, (__m128i)b.val)));    \
+    }                                                                                   \
+    template<int imm>                                                                   \
+    inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)                       \
+    {                                                                                   \
+        enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type))}; /* left by imm lanes == palignr by the complementary byte count, operands swapped */ \
+        return _Tpvec(cast(v_lsx_palignr_u8<imm2>((__m128i)b.val, (__m128i)a.val)));    \
+    }
+
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint8x16, OPENCV_HAL_NOP)   // integer vectors: result stays __m128i, so the cast is OPENCV_HAL_NOP
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int8x16,  OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint16x8, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int16x8,  OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint32x4, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int32x4,  OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint64x2, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int64x2,  OPENCV_HAL_NOP)
+
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float32x4, _lsx_128_castsi128_ps)   // float vectors: __m128i result is converted back
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float64x2, _lsx_128_castsi128_pd)
+
+/** Reverse **/
+inline v_uint8x16 v_reverse(const v_uint8x16 &a)
+{
+    const __m128i in_word = __lsx_vshuf4i_b(a.val, 0x1B);   // 0x1B selects 3,2,1,0: reverse bytes inside each group of four
+    return v_uint8x16(__lsx_vshuf4i_w(in_word, 0x1B));      // then reverse the order of the four 32-bit words
+}
+
+inline v_int8x16 v_reverse(const v_int8x16 &a)
+{ const v_uint8x16 rev = v_reverse(v_reinterpret_as_u8(a)); return v_reinterpret_as_s8(rev); }
+
+inline v_uint16x8 v_reverse(const v_uint16x8 &a)
+{
+    const __m128i in_half = __lsx_vshuf4i_h(a.val, 0x1B);   // reverse the four halfwords inside each group of four
+    return v_uint16x8(__lsx_vshuf4i_w(in_half, 0x4E));      // 0x4E selects 2,3,0,1: swap the two 64-bit halves
+}
+
+inline v_int16x8 v_reverse(const v_int16x8 &a)
+{ const v_uint16x8 rev = v_reverse(v_reinterpret_as_u16(a)); return v_reinterpret_as_s16(rev); }
+
+inline v_uint32x4 v_reverse(const v_uint32x4 &a)
+{ const __m128i rev = __lsx_vshuf4i_w(a.val, 0x1B); return v_uint32x4(rev); }
+
+inline v_int32x4 v_reverse(const v_int32x4 &a)
+{ const __m128i rev = __lsx_vshuf4i_w(a.val, 0x1B); return v_int32x4(rev); }
+
+inline v_uint64x2 v_reverse(const v_uint64x2 &a)
+{ const __m128i rev = __lsx_vshuf4i_w(a.val, 0x4E); return v_uint64x2(rev); }
+
+inline v_int64x2 v_reverse(const v_int64x2 &a)
+{ const __m128i rev = __lsx_vshuf4i_w(a.val, 0x4E); return v_int64x2(rev); }
+
+inline v_float32x4 v_reverse(const v_float32x4 &a)
+{ const v_uint32x4 rev = v_reverse(v_reinterpret_as_u32(a)); return v_reinterpret_as_f32(rev); }
+
+inline v_float64x2 v_reverse(const v_float64x2 &a)
+{ const v_uint64x2 rev = v_reverse(v_reinterpret_as_u64(a)); return v_reinterpret_as_f64(rev); }
+
+////////////// Reduce and mask ////////////
+
+/** Reduce **/
+// Returns the sum of all 16 lanes: a[0]+a[1]+...+a[15]
+inline unsigned v_reduce_sum(const v_uint8x16& a)
+{
+    const __m128i sum16  = __lsx_vhaddw_hu_bu(a.val, a.val);   // pairwise widening add: bytes -> 16-bit sums
+    const __m128i sum32  = __lsx_vhaddw_wu_hu(sum16, sum16);   // 16-bit -> 32-bit sums
+    const __m128i sum64  = __lsx_vhaddw_du_wu(sum32, sum32);   // 32-bit -> 64-bit sums
+    const __m128i sum128 = __lsx_vhaddw_qu_du(sum64, sum64);   // 64-bit -> single 128-bit sum
+    return (unsigned)__lsx_vpickve2gr_w(sum128, 0);
+}
+
+inline int v_reduce_sum(const v_int8x16 &a)
+{
+    const __m128i sum16  = __lsx_vhaddw_h_b(a.val, a.val);     // signed variant of the same widening chain
+    const __m128i sum32  = __lsx_vhaddw_w_h(sum16, sum16);
+    const __m128i sum64  = __lsx_vhaddw_d_w(sum32, sum32);
+    const __m128i sum128 = __lsx_vhaddw_q_d(sum64, sum64);
+    return (int)__lsx_vpickve2gr_w(sum128, 0);
+}
+
+// Horizontal min/max over 16 byte lanes: the vector is folded onto itself with
+// byte shifts of 8, 4, 2 and 1, and the result is read from lane 0.
+#define OPENCV_HAL_IMPL_LSX_REDUCE_16(_Tpvec, sctype, func, intrin)            \
+    inline sctype v_reduce_##func(const _Tpvec& a)                             \
+    {                                                                          \
+        __m128i folded = intrin(a.val, __lsx_vbsrl_v(a.val, 8));               \
+        folded = intrin(folded, __lsx_vbsrl_v(folded, 4));                     \
+        folded = intrin(folded, __lsx_vbsrl_v(folded, 2));                     \
+        folded = intrin(folded, __lsx_vbsrl_v(folded, 1));                     \
+        return (sctype)__lsx_vpickve2gr_b(folded, 0);                          \
+    }
+
+OPENCV_HAL_IMPL_LSX_REDUCE_16(v_uint8x16, uchar, min, __lsx_vmin_bu)
+OPENCV_HAL_IMPL_LSX_REDUCE_16(v_uint8x16, uchar, max, __lsx_vmax_bu)
+OPENCV_HAL_IMPL_LSX_REDUCE_16(v_int8x16,  schar, min, __lsx_vmin_b)
+OPENCV_HAL_IMPL_LSX_REDUCE_16(v_int8x16,  schar, max, __lsx_vmax_b)
+
+// Horizontal min/max over 8 halfword lanes: fold with byte shifts of 8, 4 and 2,
+// then read lane 0.
+#define OPENCV_HAL_IMPL_LSX_REDUCE_8(_Tpvec, sctype, func, intrin)             \
+    inline sctype v_reduce_##func(const _Tpvec &a)                             \
+    {                                                                          \
+        __m128i folded = intrin(a.val, __lsx_vbsrl_v(a.val, 8));               \
+        folded = intrin(folded, __lsx_vbsrl_v(folded, 4));                     \
+        folded = intrin(folded, __lsx_vbsrl_v(folded, 2));                     \
+        return (sctype)__lsx_vpickve2gr_h(folded, 0);                          \
+    }
+
+OPENCV_HAL_IMPL_LSX_REDUCE_8(v_uint16x8, ushort, min, __lsx_vmin_hu)
+OPENCV_HAL_IMPL_LSX_REDUCE_8(v_uint16x8, ushort, max, __lsx_vmax_hu)
+OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8,  short,  min, __lsx_vmin_h)
+OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8,  short,  max, __lsx_vmax_h)
+
+// Horizontal min/max over 4 word lanes: fold with byte shifts of 8 and 4,
+// then read lane 0.
+#define OPENCV_HAL_IMPL_LSX_REDUCE_4(_Tpvec, sctype, func, intrin)             \
+    inline sctype v_reduce_##func(const _Tpvec &a)                             \
+    {                                                                          \
+        __m128i folded = intrin(a.val, __lsx_vbsrl_v(a.val, 8));               \
+        folded = intrin(folded, __lsx_vbsrl_v(folded, 4));                     \
+        return (sctype)__lsx_vpickve2gr_w(folded, 0);                          \
+    }
+
+OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, min, __lsx_vmin_wu)
+OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, max, __lsx_vmax_wu)
+OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4,  int,      min, __lsx_vmin_w)
+OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4,  int,      max, __lsx_vmax_w)
+
+// Horizontal min/max over 4 float lanes. The byte shifts operate on the integer
+// view of the register, hence the casts around __lsx_vbsrl_v.
+#define OPENCV_HAL_IMPL_LSX_REDUCE_FLT(func, intrin)                           \
+    inline float v_reduce_##func(const v_float32x4 &a)                         \
+    {                                                                          \
+        __m128 folded = a.val;                                                 \
+        folded = intrin(folded, (__m128)__lsx_vbsrl_v((__m128i)folded, 8));    \
+        folded = intrin(folded, (__m128)__lsx_vbsrl_v((__m128i)folded, 4));    \
+        return ((float*)&folded)[0];                                           \
+    }
+
+OPENCV_HAL_IMPL_LSX_REDUCE_FLT(min, __lsx_vfmin_s)
+OPENCV_HAL_IMPL_LSX_REDUCE_FLT(max, __lsx_vfmax_s)
+
+// v_reduce_sum for 32- and 16-bit integer lanes.
+// Each widening horizontal add pairs up adjacent lanes and doubles the lane width;
+// after the final 64 -> 128 bit step the total is read from the low word.
+// Signed overloads use the signed adds and return int; unsigned overloads use the
+// unsigned adds and return unsigned.
+inline int v_reduce_sum(const v_int32x4 &a)
+{
+    __m128i t1 = __lsx_vhaddw_d_w(a.val, a.val);
+    __m128i t2 = __lsx_vhaddw_q_d(t1, t1);
+    return (int)__lsx_vpickve2gr_w(t2, 0);
+}
+
+inline unsigned v_reduce_sum(const v_uint32x4 &a)
+{
+    __m128i t1 = __lsx_vhaddw_du_wu(a.val, a.val);
+    __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
+    // Cast to the declared return type (matches the 8-bit unsigned overload).
+    return (unsigned)__lsx_vpickve2gr_w(t2, 0);
+}
+
+inline int v_reduce_sum(const v_int16x8 &a)
+{
+    __m128i t1 = __lsx_vhaddw_w_h(a.val, a.val);
+    __m128i t2 = __lsx_vhaddw_d_w(t1, t1);
+    __m128i t3 = __lsx_vhaddw_q_d(t2, t2);
+    return (int)__lsx_vpickve2gr_w(t3, 0);
+}
+
+inline unsigned v_reduce_sum(const v_uint16x8 &a)
+{
+    __m128i t1 = __lsx_vhaddw_wu_hu(a.val, a.val);
+    __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
+    __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
+    // Cast to the declared return type (matches the 8-bit unsigned overload).
+    return (unsigned)__lsx_vpickve2gr_w(t3, 0);
+}
+
+// Sum of the four float lanes, evaluated as (a0 + a2) + (a1 + a3).
+inline float v_reduce_sum(const v_float32x4 &a)
+{
+    // Shift the upper 64 bits down and add them onto the lower 64 bits.
+    const __m128i upper = __lsx_vbsrl_v((__m128i)a.val, 8);
+    __m128 halves = __lsx_vfadd_s(a.val, (__m128)upper);
+    // The two remaining partial sums are added in scalar code.
+    float *lanes = (float*)&halves;
+    return (float)(lanes[0] + lanes[1]);
+}
+
+// Sum of the two 64-bit lanes; the result is taken from the low 64 bits of the
+// widened 128-bit total.
+inline uint64 v_reduce_sum(const v_uint64x2 &a)
+{
+    const __m128i total = __lsx_vhaddw_qu_du(a.val, a.val);
+    return (uint64)__lsx_vpickve2gr_du(total, 0);
+}
+
+inline int64 v_reduce_sum(const v_int64x2 &a)
+{
+    const __m128i total = __lsx_vhaddw_q_d(a.val, a.val);
+    return (int64)__lsx_vpickve2gr_d(total, 0);
+}
+
+// Sum of the two double lanes, computed in scalar code.
+inline double v_reduce_sum(const v_float64x2 &a)
+{
+    // Read the lanes straight from the vector member through a pointer-to-const,
+    // so the access neither drops the const qualifier nor depends on the layout
+    // of the wrapper struct.
+    const double *lanes = (const double*)&a.val;
+    return lanes[0] + lanes[1];
+}
+
+// Reduces four float vectors at once: lane 0 of the result is the sum of the lanes
+// of a, lane 1 that of b, lane 2 that of c and lane 3 that of d.
+// Lane layouts in the comments below follow the documented behaviour of
+// __lsx_vilvl_w / __lsx_vilvh_w (interleave low / high words, second operand first).
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+    // The interleave intrinsics work on the integer view of the registers.
+    __m128i a0 = (__m128i)a.val;
+    __m128i b0 = (__m128i)b.val;
+    __m128i c0 = (__m128i)c.val;
+    __m128i d0 = (__m128i)d.val;
+    __m128i ac_l = __lsx_vilvl_w(c0, a0);   // a[0] c[0] a[1] c[1]
+    __m128i ac_h = __lsx_vilvh_w(c0, a0);   // a[2] c[2] a[3] c[3]
+    __m128i bd_l = __lsx_vilvl_w(d0, b0);   // b[0] d[0] b[1] d[1]
+    __m128i bd_h = __lsx_vilvh_w(d0, b0);   // b[2] d[2] b[3] d[3]
+    // Partial sums: ac = a02 c02 a13 c13, bd = b02 d02 b13 d13 (xij = x[i] + x[j]).
+    __m128  ac   = __lsx_vfadd_s((__m128)ac_l, (__m128)ac_h);
+    __m128  bd   = __lsx_vfadd_s((__m128)bd_l, (__m128)bd_h);
+    // Interleaving again lines the partial sums up as (a02 b02 c02 d02) and
+    // (a13 b13 c13 d13); their sum is the per-input total.
+    return v_float32x4(__lsx_vfadd_s((__m128)__lsx_vilvl_w((__m128i)bd, (__m128i)ac),
+                       (__m128)__lsx_vilvh_w((__m128i)bd, (__m128i)ac)));
+}
+
+// v_reduce_sad: sum over all lanes of |a[i] - b[i]|.
+// The per-lane absolute difference is computed first and then accumulated with
+// unsigned widening horizontal adds; the total is read from the low word.
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{
+    const __m128i diff = __lsx_vabsd_b(a.val, b.val);
+    __m128i acc = __lsx_vhaddw_hu_bu(diff, diff);
+    acc = __lsx_vhaddw_wu_hu(acc, acc);
+    acc = __lsx_vhaddw_du_wu(acc, acc);
+    acc = __lsx_vhaddw_qu_du(acc, acc);
+    return (unsigned)__lsx_vpickve2gr_w(acc, 0);
+}
+
+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{
+    const __m128i diff = __lsx_vabsd_bu(a.val, b.val);
+    __m128i acc = __lsx_vhaddw_hu_bu(diff, diff);
+    acc = __lsx_vhaddw_wu_hu(acc, acc);
+    acc = __lsx_vhaddw_du_wu(acc, acc);
+    acc = __lsx_vhaddw_qu_du(acc, acc);
+    return (unsigned)__lsx_vpickve2gr_w(acc, 0);
+}
+
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{
+    const __m128i diff = __lsx_vabsd_hu(a.val, b.val);
+    __m128i acc = __lsx_vhaddw_wu_hu(diff, diff);
+    acc = __lsx_vhaddw_du_wu(acc, acc);
+    acc = __lsx_vhaddw_qu_du(acc, acc);
+    return (unsigned)__lsx_vpickve2gr_w(acc, 0);
+}
+
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{
+    const __m128i diff = __lsx_vabsd_h(a.val, b.val);
+    __m128i acc = __lsx_vhaddw_wu_hu(diff, diff);
+    acc = __lsx_vhaddw_du_wu(acc, acc);
+    acc = __lsx_vhaddw_qu_du(acc, acc);
+    return (unsigned)__lsx_vpickve2gr_w(acc, 0);
+}
+
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{
+    const __m128i diff = __lsx_vabsd_wu(a.val, b.val);
+    __m128i acc = __lsx_vhaddw_du_wu(diff, diff);
+    acc = __lsx_vhaddw_qu_du(acc, acc);
+    return (unsigned)__lsx_vpickve2gr_w(acc, 0);
+}
+
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{
+    const __m128i diff = __lsx_vabsd_w(a.val, b.val);
+    __m128i acc = __lsx_vhaddw_du_wu(diff, diff);
+    acc = __lsx_vhaddw_qu_du(acc, acc);
+    return (unsigned)__lsx_vpickve2gr_w(acc, 0);
+}
+
+// Float variant: sum of |a[i] - b[i]| over the four lanes.
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{
+    const v_float32x4 diff = a - b;
+    // Clearing the top bit of every 32-bit lane yields the absolute value.
+    const __m128i abs_bits = (__m128i)diff.val & __lsx_vreplgr2vr_w(0x7fffffff);
+    return v_reduce_sum(v_float32x4(abs_bits));
+}
+
+/** Popcount **/
+// Defines v_popcount for input type _Tp: applies __lsx_vpcnt_<suffix> to the lanes
+// and wraps the result in _Tpvec. Signed inputs return the unsigned vector type of
+// the same lane width.
+// The macro expands to a complete function definition, so invocations take no
+// trailing ';' (which would leave an empty declaration at namespace scope).
+#define OPENCV_HAL_IMPL_LSX_POPCOUNT(_Tpvec, _Tp, suffix)                  \
+inline _Tpvec v_popcount(const _Tp& a)                                     \
+{ return _Tpvec(__lsx_vpcnt_##suffix(a.val)); }
+
+OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint8x16,  v_uint8x16,  b)
+OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint8x16,  v_int8x16,   b)
+OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint16x8,  v_uint16x8,  h)
+OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint16x8,  v_int16x8,   h)
+OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint32x4,  v_uint32x4,  w)
+OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint32x4,  v_int32x4,   w)
+OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint64x2,  v_uint64x2,  d)
+OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint64x2,  v_int64x2,   d)
+
+/** Mask **/
+// Defines reinterpret_int(ft) -> tt: returns the bits of x viewed as the signed
+// integer type tt of the same size, via a union.
+#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt)              \
+inline tt reinterpret_int(ft x)                              \
+{                                                            \
+    union { ft src; tt dst; } pun;                           \
+    pun.src = x;                                             \
+    return pun.dst;                                          \
+}
+OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
+OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
+OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
+OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
+OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
+OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
+OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)
+
+// v_signmask: packs the "lane < 0" flags produced by __lsx_vmskltz_* into an int,
+// read from the low word of the mask register. Each lane width has one overload
+// that does the work; the other signedness forwards to it via a reinterpret.
+inline int v_signmask(const v_int8x16& a)
+{
+    const __m128i mask = __lsx_vmskltz_b(a.val);
+    return __lsx_vpickve2gr_w(mask, 0);
+}
+inline int v_signmask(const v_uint8x16& a)
+{
+    return v_signmask(v_reinterpret_as_s8(a));
+}
+
+inline int v_signmask(const v_int16x8 &a)
+{
+    const __m128i mask = __lsx_vmskltz_h(a.val);
+    return __lsx_vpickve2gr_w(mask, 0);
+}
+inline int v_signmask(const v_uint16x8 &a)
+{
+    return v_signmask(v_reinterpret_as_s16(a));
+}
+
+inline int v_signmask(const v_uint32x4& a)
+{
+    const __m128i mask = __lsx_vmskltz_w(a.val);
+    return __lsx_vpickve2gr_w(mask, 0);
+}
+inline int v_signmask(const v_int32x4& a)
+{
+    return v_signmask(v_reinterpret_as_u32(a));
+}
+
+inline int v_signmask(const v_uint64x2& a)
+{
+    const __m128i mask = __lsx_vmskltz_d(a.val);
+    return __lsx_vpickve2gr_w(mask, 0);
+}
+inline int v_signmask(const v_int64x2& a)
+{
+    return v_signmask(v_reinterpret_as_u64(a));
+}
+
+// Floating-point overloads: only the lane bit patterns matter, so the work is
+// delegated to the integer overload of the same lane width.
+// The conversion uses v_reinterpret_as_* rather than a pointer cast between
+// unrelated vector wrapper structs, which would discard const and break
+// strict-aliasing rules.
+inline int v_signmask(const v_float32x4& a)
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+
+inline int v_signmask(const v_float64x2& a)
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+
+// v_scan_forward: every overload takes the byte-wise sign mask of the vector and
+// passes it to trailingZeros32; dividing that bit position by the lane size in
+// bytes (2, 4 or 8) converts it to a lane index.
+inline int v_scan_forward(const v_int8x16& a)
+{
+    return trailingZeros32(v_signmask(v_reinterpret_as_s8(a)));
+}
+inline int v_scan_forward(const v_uint8x16& a)
+{
+    return trailingZeros32(v_signmask(v_reinterpret_as_s8(a)));
+}
+inline int v_scan_forward(const v_int16x8& a)
+{
+    return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2;
+}
+inline int v_scan_forward(const v_uint16x8& a)
+{
+    return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2;
+}
+inline int v_scan_forward(const v_int32x4& a)
+{
+    return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4;
+}
+inline int v_scan_forward(const v_uint32x4& a)
+{
+    return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4;
+}
+inline int v_scan_forward(const v_float32x4& a)
+{
+    return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4;
+}
+inline int v_scan_forward(const v_int64x2& a)
+{
+    return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8;
+}
+inline int v_scan_forward(const v_uint64x2& a)
+{
+    return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8;
+}
+inline int v_scan_forward(const v_float64x2& a)
+{
+    return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8;
+}
+
+/** Checks **/
+// Defines v_check_all / v_check_any on top of v_signmask:
+//   v_check_all(a) - true when the sign mask equals allmask (one bit per lane set);
+//   v_check_any(a) - true when the sign mask is non-zero.
+// allmask is (1 << lane count) - 1: 65535, 255, 15 or 3.
+// The macro expands to complete function definitions, so invocations take no
+// trailing ';'.
+#define OPENCV_HAL_IMPL_LSX_CHECK(_Tpvec, allmask) \
+    inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
+    inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
+OPENCV_HAL_IMPL_LSX_CHECK(v_uint8x16, 65535)
+OPENCV_HAL_IMPL_LSX_CHECK(v_int8x16, 65535)
+OPENCV_HAL_IMPL_LSX_CHECK(v_uint16x8, 255)
+OPENCV_HAL_IMPL_LSX_CHECK(v_int16x8, 255)
+OPENCV_HAL_IMPL_LSX_CHECK(v_uint32x4, 15)
+OPENCV_HAL_IMPL_LSX_CHECK(v_int32x4, 15)
+OPENCV_HAL_IMPL_LSX_CHECK(v_uint64x2, 3)
+OPENCV_HAL_IMPL_LSX_CHECK(v_int64x2, 3)
+OPENCV_HAL_IMPL_LSX_CHECK(v_float32x4, 15)
+OPENCV_HAL_IMPL_LSX_CHECK(v_float64x2, 3)
+
+///////////// Other math /////////////
+
+/** Some frequent operations **/
+// Floating-point helpers for one vector type:
+//   v_fma / v_muladd  - __lsx_vfmadd_<suffix>(a, b, c)
+//   v_sqrt            - per-lane __lsx_vfsqrt_<suffix>
+//   v_sqr_magnitude   - v_fma(a, a, b * b)
+//   v_magnitude       - v_sqrt of v_sqr_magnitude
+#define OPENCV_HAL_IMPL_LSX_MULADD(_Tpvec, suffix)                              \
+    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)      \
+    { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); }              \
+    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec &b, const _Tpvec& c)   \
+    { return v_fma(a, b, c); }                                                  \
+    inline _Tpvec v_sqrt(const _Tpvec& x)                                       \
+    { return _Tpvec(__lsx_vfsqrt_##suffix(x.val)); }                            \
+    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b)             \
+    { return v_fma(a, a, b * b); }                                              \
+    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)                 \
+    { return v_sqrt(v_sqr_magnitude(a, b)); }
+
+OPENCV_HAL_IMPL_LSX_MULADD(v_float32x4, s)
+OPENCV_HAL_IMPL_LSX_MULADD(v_float64x2, d)
+
+// Integer multiply-add for 32-bit lanes.
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    // __lsx_vmadd_w takes the addend first, then the two factors.
+    const __m128i acc = __lsx_vmadd_w(c.val, a.val, b.val);
+    return v_int32x4(acc);
+}
+
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_fma(a, b, c);
+}
+
+// v_invsqrt: per-lane reciprocal square root via __lsx_vfrsqrt_{s,d}.
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{ return v_float32x4(__lsx_vfrsqrt_s(x.val)); }
+
+inline v_float64x2 v_invsqrt(const v_float64x2& x)
+{ return v_float64x2(__lsx_vfrsqrt_d(x.val)); }
+
+/** Absolute values **/
+// v_abs for signed integer vectors: the absolute difference between x and an
+// all-zero vector, returned as the unsigned vector type of the same lane width.
+#define OPENCV_HAL_IMPL_LSX_ABS(_Tpvec, suffix)                          \
+    inline v_u##_Tpvec v_abs(const v_##_Tpvec& x)                        \
+    {                                                                    \
+        const __m128i zero = __lsx_vldi(0);                              \
+        return v_u##_Tpvec(__lsx_vabsd_##suffix(x.val, zero));           \
+    }
+
+OPENCV_HAL_IMPL_LSX_ABS(int8x16, b)
+OPENCV_HAL_IMPL_LSX_ABS(int16x8, h)
+OPENCV_HAL_IMPL_LSX_ABS(int32x4, w)
+
+// v_abs for floating-point vectors: clears the top (sign) bit of every lane.
+// The integer view is obtained by casting the vector member itself, so the
+// expression neither drops the const qualifier of x nor depends on the layout
+// of the wrapper struct.
+inline v_float32x4 v_abs(const v_float32x4& x)
+{ return v_float32x4((__m128i)x.val & __lsx_vreplgr2vr_w(0x7fffffff)); }
+inline v_float64x2 v_abs(const v_float64x2& x)
+{ return v_float64x2((__m128i)x.val & __lsx_vreplgr2vr_d(0x7fffffffffffffff)); }
+
+/** Absolute difference **/
+
+// v_absdiff: per-lane |a - b|. Integer overloads (signed and unsigned inputs)
+// return the unsigned vector type of the same lane width.
+inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
+{
+    return v_uint8x16(__lsx_vabsd_bu(a.val, b.val));
+}
+inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
+{
+    return v_uint16x8(__lsx_vabsd_hu(a.val, b.val));
+}
+inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
+{
+    return v_uint32x4(__lsx_vabsd_wu(a.val, b.val));
+}
+
+inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
+{
+    return v_uint8x16(__lsx_vabsd_b(a.val, b.val));
+}
+inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
+{
+    return v_uint16x8(__lsx_vabsd_h(a.val, b.val));
+}
+inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
+{
+    return v_uint32x4(__lsx_vabsd_w(a.val, b.val));
+}
+
+// Floating-point overloads: subtract, then take the absolute value.
+inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
+{
+    return v_abs(a - b);
+}
+
+inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
+{
+    return v_abs(a - b);
+}
+
+/** Saturating absolute difference **/
+inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
+{
+    const v_int8x16 diff = a - b;
+    const v_int8x16 swap_mask = a < b;
+    // (diff ^ mask) - mask flips the sign of the lanes selected by the a < b mask.
+    return (diff ^ swap_mask) - swap_mask;
+}
+inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
+{
+    // Larger operand minus smaller operand, lane by lane.
+    const v_int16x8 hi = v_max(a, b);
+    const v_int16x8 lo = v_min(a, b);
+    return hi - lo;
+}
+
+///////// Conversions /////////
+
+/** Rounding **/
+// Float -> int32 conversions. The double overloads taking a single vector pass it
+// as both operands of the narrowing intrinsic.
+inline v_int32x4 v_round(const v_float32x4& a)
+{
+    return v_int32x4(__lsx_vftint_w_s(a.val));
+}
+
+inline v_int32x4 v_round(const v_float64x2& a)
+{
+    return v_int32x4(__lsx_vftint_w_d(a.val, a.val));
+}
+
+// Two double vectors -> one int32 vector; note that b is the first intrinsic operand.
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{
+    return v_int32x4(__lsx_vftint_w_d(b.val, a.val));
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a)
+{
+    return v_int32x4(__lsx_vftintrz_w_s(a.val));
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a)
+{
+    return v_int32x4(__lsx_vftintrz_w_d(a.val, a.val));
+}
+
+// v_floor / v_ceil: apply __lsx_vfrintrm / __lsx_vfrintrp first (still floating
+// point), then convert with truncation.
+inline v_int32x4 v_floor(const v_float32x4& a)
+{
+    const __m128 floored = __m128(__lsx_vfrintrm_s(a.val));
+    return v_int32x4(__lsx_vftintrz_w_s(floored));
+}
+
+inline v_int32x4 v_floor(const v_float64x2& a)
+{
+    const v_float64x2 floored = v_float64x2(__lsx_vfrintrm_d(a.val));
+    return v_trunc(floored);
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{
+    const __m128 ceiled = __m128(__lsx_vfrintrp_s(a.val));
+    return v_int32x4(__lsx_vftintrz_w_s(ceiled));
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a)
+{
+    const v_float64x2 ceiled = v_float64x2(__lsx_vfrintrp_d(a.val));
+    return v_trunc(ceiled);
+}
+
+/** To float **/
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{
+    return v_float32x4(__lsx_vffint_s_w(a.val));
+}
+
+// A single double vector is passed as both operands of the narrowing conversion.
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{
+    return v_float32x4(__lsx_vfcvt_s_d(a.val, a.val));
+}
+
+// Two double vectors -> one float vector; note that b is the first intrinsic operand.
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{
+    return v_float32x4(__lsx_vfcvt_s_d(b.val, a.val));
+}
+
+// v_cvt_f64 uses the "l" (low) form of the widening intrinsics,
+// v_cvt_f64_high the "h" (high) form.
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{
+    return v_float64x2(__lsx_vffintl_d_w(a.val));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{
+    return v_float64x2(__lsx_vffinth_d_w(a.val));
+}
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{
+    return v_float64x2(__lsx_vfcvtl_d_s(a.val));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{
+    return v_float64x2(__lsx_vfcvth_d_s(a.val));
+}
+
+inline v_float64x2 v_cvt_f64(const v_int64x2& v)
+{
+    return v_float64x2(__lsx_vffint_d_l(v.val));
+}
+
+
+//////////////// Lookup table access ////////////////
+// 8-bit table lookups.
+//   v_lut       - 16 indices, one element each: tab[idx[i]]
+//   v_lut_pairs -  8 indices, two consecutive elements starting at tab + idx[i]
+//   v_lut_quads -  4 indices, four consecutive elements starting at tab + idx[i]
+inline v_int8x16 v_lut(const schar* tab, const int* idx)
+{
+    return v_int8x16(_v128_setr_b(
+        tab[idx[0]],  tab[idx[1]],  tab[idx[2]],  tab[idx[3]],
+        tab[idx[4]],  tab[idx[5]],  tab[idx[6]],  tab[idx[7]],
+        tab[idx[8]],  tab[idx[9]],  tab[idx[10]], tab[idx[11]],
+        tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
+}
+
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
+{
+    // Each pair is fetched with a single 16-bit load.
+    return v_int8x16(_v128_setr_h(
+        *(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]),
+        *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]),
+        *(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]),
+        *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])));
+}
+
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
+{
+    // Each quad is fetched with a single 32-bit load.
+    return v_int8x16(_v128_setr_w(
+        *(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
+        *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
+}
+
+// Unsigned variants forward to the signed implementations.
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx)
+{
+    return v_reinterpret_as_u8(v_lut((const schar*)tab, idx));
+}
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx)
+{
+    return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx));
+}
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx)
+{
+    return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx));
+}
+
+// 16-bit table lookups: 8 single elements, 4 pairs (32-bit loads) or
+// 2 quads (64-bit loads).
+inline v_int16x8 v_lut(const short* tab, const int* idx)
+{
+    return v_int16x8(_v128_setr_h(
+        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
+        tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
+}
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
+{
+    return v_int16x8(_v128_setr_w(
+        *(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
+        *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
+}
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
+{
+    return v_int16x8(_v128_setr_d(
+        *(const int64_t*)(tab + idx[0]),
+        *(const int64_t*)(tab + idx[1])));
+}
+
+// Unsigned variants forward to the signed implementations.
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx)
+{
+    return v_reinterpret_as_u16(v_lut((const short *)tab, idx));
+}
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx)
+{
+    return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx));
+}
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx)
+{
+    return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx));
+}
+
+// 32-bit table lookups: 4 single elements, 2 pairs (64-bit loads) or one quad
+// (a full 128-bit load from tab + idx[0]).
+inline v_int32x4 v_lut(const int* tab, const int* idx)
+{
+    return v_int32x4(_v128_setr_w(
+        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
+}
+inline v_int32x4 v_lut_pairs(const int *tab, const int* idx)
+{
+    return v_int32x4(_v128_setr_d(
+        *(const int64_t*)(tab + idx[0]),
+        *(const int64_t*)(tab + idx[1])));
+}
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
+{
+    return v_int32x4(__lsx_vld(tab + idx[0], 0));
+}
+
+// Unsigned variants forward to the signed implementations.
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx)
+{
+    return v_reinterpret_as_u32(v_lut((const int *)tab, idx));
+}
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx)
+{
+    return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx));
+}
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx)
+{
+    return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx));
+}
+
+// 64-bit table lookups: 2 single elements, or one pair loaded as a full
+// 128-bit vector from tab + idx[0].
+inline v_int64x2 v_lut(const int64_t* tab, const int *idx)
+{
+    return v_int64x2(_v128_setr_d(tab[idx[0]], tab[idx[1]]));
+}
+inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
+{
+    return v_int64x2(__lsx_vld(tab + idx[0], 0));
+}
+
+// Unsigned variants forward to the signed implementations.
+inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
+{
+    return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx));
+}
+inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx)
+{
+    return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx));
+}
+
+// Floating-point table lookups, mirroring the integer versions.
+inline v_float32x4 v_lut(const float* tab, const int* idx)
+{
+    return v_float32x4(_v128_setr_ps(
+        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
+}
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
+{
+    // Each pair of floats is fetched with one 64-bit (double-typed) load.
+    return v_float32x4((__m128)_v128_setr_pd(
+        *(const double*)(tab + idx[0]),
+        *(const double*)(tab + idx[1])));
+}
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
+{
+    return v_float32x4((__m128)__lsx_vld(tab + idx[0], 0));
+}
+
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{
+    return v_float64x2(_v128_setr_pd(tab[idx[0]], tab[idx[1]]));
+}
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
+{
+    return v_float64x2((__m128d)__lsx_vld(tab + idx[0], 0));
+}
+
+// v_lut overloads taking the indices in a vector register.
+// The lanes of idxvec are read in place through a pointer-to-const and handed to
+// the pointer-based overloads; idxvec is never modified. The double overload only
+// consumes the first two indices (see v_lut(const double*, const int*)).
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+    const int *idx = (const int*)&idxvec.val;
+    return v_lut(tab, idx);
+}
+
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
+{
+    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+    const int *idx = (const int*)&idxvec.val;
+    return v_lut(tab, idx);
+}
+
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    const int *idx = (const int*)&idxvec.val;
+    return v_lut(tab, idx);
+}
+
+// Gathers four (x, y) float pairs, one at tab + idxvec[i], and splits them:
+//   x = { tab[idx0], tab[idx1], tab[idx2], tab[idx3] }
+//   y = { tab[idx0 + 1], tab[idx1 + 1], tab[idx2 + 1], tab[idx3 + 1] }
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+    const int *idx = (const int*)&idxvec.val;
+    // Load exactly one 64-bit (x, y) pair per index. A full 128-bit load would
+    // read two extra floats past the pair and could run off the end of the table
+    // for the last entry. __lsx_vldrepl_d broadcasts the pair to both halves;
+    // only the low half is consumed below.
+    __m128i xy0  = __lsx_vldrepl_d(tab + idx[0], 0);
+    __m128i xy1  = __lsx_vldrepl_d(tab + idx[1], 0);
+    __m128i xy2  = __lsx_vldrepl_d(tab + idx[2], 0);
+    __m128i xy3  = __lsx_vldrepl_d(tab + idx[3], 0);
+    __m128i xy01 = __lsx_vilvl_d(xy1, xy0);           // x0 y0 x1 y1
+    __m128i xy23 = __lsx_vilvl_d(xy3, xy2);           // x2 y2 x3 y3
+    __m128i xxyy02 = __lsx_vilvl_w(xy23, xy01);       // x0 x2 y0 y2
+    __m128i xxyy13 = __lsx_vilvh_w(xy23, xy01);       // x1 x3 y1 y3
+    x = v_float32x4((__m128)__lsx_vilvl_w(xxyy13, xxyy02));
+    y = v_float32x4((__m128)__lsx_vilvh_w(xxyy13, xxyy02));
+}
+
+// Double variant: loads two (x, y) pairs, addressed by the first two lanes of
+// idxvec, and splits them into x = { x0, x1 } and y = { y0, y1 }.
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    const int* lanes = (const int*)&idxvec.val;
+    const __m128i pair0 = __lsx_vld(tab + lanes[0], 0);
+    const __m128i pair1 = __lsx_vld(tab + lanes[1], 0);
+    // Low halves hold the x values, high halves the y values.
+    x = v_float64x2((__m128d)__lsx_vilvl_d(pair1, pair0));
+    y = v_float64x2((__m128d)__lsx_vilvh_d(pair1, pair0));
+}
+
+// v_interleave_pairs / v_interleave_quads: fixed lane permutations applied
+// independently to each 64-bit half (8- and 16-bit lanes) or to the whole vector
+// (32-bit lanes). The 8- and 16-bit versions use a byte shuffle whose control
+// vector lists, lowest byte first, the source byte index for every output byte;
+// both shuffle inputs are the same vector.
+
+// 8-bit lanes, per group of four: 0 1 2 3 -> 0 2 1 3.
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
+{
+    return v_int8x16(__lsx_vshuf_b(vec.val, vec.val,
+                _v128_setr_d(0x0705060403010200, 0x0f0d0e0c0b090a08)));
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
+{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
+// 8-bit lanes, per group of eight: 0 1 2 3 4 5 6 7 -> 0 4 1 5 2 6 3 7.
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
+{
+    return v_int8x16(__lsx_vshuf_b(vec.val, vec.val,
+                _v128_setr_d(0x0703060205010400, 0x0f0b0e0a0d090c08)));
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
+{ return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
+
+// 16-bit lanes, per group of four: 0 1 2 3 -> 0 2 1 3 (the mask moves byte pairs).
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
+{
+    return v_int16x8(__lsx_vshuf_b(vec.val, vec.val,
+                _v128_setr_d(0x0706030205040100, 0x0f0e0b0a0d0c0908)));
+}
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec)
+{ return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
+// 16-bit lanes, whole vector: 0 1 2 3 4 5 6 7 -> 0 4 1 5 2 6 3 7.
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
+{
+    return v_int16x8(__lsx_vshuf_b(vec.val, vec.val,
+                _v128_setr_d(0x0b0a030209080100, 0x0f0e07060d0c0504)));
+}
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec)
+{ return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
+
+// 32-bit lanes: 0 1 2 3 -> 0 2 1 3 (word shuffle selector 0xd8 = 0b11'01'10'00).
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
+{
+    return v_int32x4(__lsx_vshuf4i_w(vec.val, 0xd8));
+}
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec)
+{ return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
+{ return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+
+// v_pack_triplets: keeps the first three lanes of every group of four and packs
+// them contiguously at the start of the result (lanes 3, 7, 11, ... are dropped).
+// The control vector is given high half first (_v128_set_d). Per the documented
+// __lsx_vshuf_b behaviour, indices 0x00-0x0f pick bytes of vec and 0x10 and above
+// pick bytes of the zero vector; the trailing output bytes after the packed
+// triplets are filler.
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{
+    __m128i zero = __lsx_vldi(0);
+    // Source bytes: 0 1 2, 4 5 6, 8 9 10, 12 13 14, then filler (15, 16, 17, 18).
+    return v_int8x16(__lsx_vshuf_b(zero, vec.val,
+           _v128_set_d(0x1211100f0e0d0c0a, 0x0908060504020100)));
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
+{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{
+    __m128i zero = __lsx_vldi(0);
+    // Source halfwords: 0 1 2, 4 5 6, then filler (halfword 7 and a zero halfword).
+    return v_int16x8(__lsx_vshuf_b(zero, vec.val,
+           _v128_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100)));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
+{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
+
+// With four 32-bit lanes the single triplet is already at the front, so the
+// vector is returned unchanged.
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
+
+//////////// Matrix operations /////////
+
+/////////// Dot Product /////////
+
+// v_dotprod: multiplies lanes pairwise with widening and adds each pair of
+// adjacent products into one double-width lane; the three-argument form also
+// adds the accumulator c.
+
+// 16 >> 32
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{
+    // Products of the even lanes first, then accumulate the products of the odd lanes.
+    const __m128i even_products = __lsx_vmulwev_w_h(a.val, b.val);
+    return v_int32x4(__lsx_vmaddwod_w_h(even_products, a.val, b.val));
+}
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    const __m128i with_even = __lsx_vmaddwev_w_h(c.val, a.val, b.val);
+    return v_int32x4(__lsx_vmaddwod_w_h(with_even, a.val, b.val));
+}
+
+// 32 >> 64
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
+{
+    const __m128i even_products = __lsx_vmulwev_d_w(a.val, b.val);
+    return v_int64x2(__lsx_vmaddwod_d_w(even_products, a.val, b.val));
+}
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    const __m128i with_even = __lsx_vmaddwev_d_w(c.val, a.val, b.val);
+    return v_int64x2(__lsx_vmaddwod_d_w(with_even, a.val, b.val));
+}
+
+// v_dotprod_expand: each output lane is the sum of four adjacent widened products.
+// Even- and odd-lane products are formed separately, adjacent products inside
+// each set are added with a widening horizontal add, and the two sets are summed.
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{
+    const __m128i even_products = __lsx_vmulwev_h_bu(a.val, b.val);
+    const __m128i odd_products  = __lsx_vmulwod_h_bu(a.val, b.val);
+    const __m128i even_pairs    = __lsx_vhaddw_wu_hu(even_products, even_products);
+    const __m128i odd_pairs     = __lsx_vhaddw_wu_hu(odd_products, odd_products);
+    return v_uint32x4(__lsx_vadd_w(even_pairs, odd_pairs));
+}
+
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{
+    return v_dotprod_expand(a, b) + c;
+}
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
+{
+    const __m128i even_products = __lsx_vmulwev_h_b(a.val, b.val);
+    const __m128i odd_products  = __lsx_vmulwod_h_b(a.val, b.val);
+    const __m128i even_pairs    = __lsx_vhaddw_w_h(even_products, even_products);
+    const __m128i odd_pairs     = __lsx_vhaddw_w_h(odd_products, odd_products);
+    return v_int32x4(__lsx_vadd_w(even_pairs, odd_pairs));
+}
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{
+    return v_dotprod_expand(a, b) + c;
+}
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{
+    const __m128i even_products = __lsx_vmulwev_w_hu(a.val, b.val);
+    const __m128i odd_products  = __lsx_vmulwod_w_hu(a.val, b.val);
+    const __m128i even_pairs    = __lsx_vhaddw_du_wu(even_products, even_products);
+    const __m128i odd_pairs     = __lsx_vhaddw_du_wu(odd_products, odd_products);
+    return v_uint64x2(__lsx_vadd_d(even_pairs, odd_pairs));
+}
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{
+    return v_dotprod_expand(a, b) + c;
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
+{
+    const __m128i even_products = __lsx_vmulwev_w_h(a.val, b.val);
+    const __m128i odd_products  = __lsx_vmulwod_w_h(a.val, b.val);
+    const __m128i even_pairs    = __lsx_vhaddw_d_w(even_products, even_products);
+    const __m128i odd_pairs     = __lsx_vhaddw_d_w(odd_products, odd_products);
+    return v_int64x2(__lsx_vadd_d(even_pairs, odd_pairs));
+}
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{
+    return v_dotprod_expand(a, b) + c;
+}
+
+// 32 >> 64f: the integer dot product converted to double.
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{
+    return v_cvt_f64(v_dotprod(a, b));
+}
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{
+    return v_dotprod_expand(a, b) + c;
+}
+
+
+///////// Fast Dot Product //////
+
+// "Fast" dot products. Most overloads simply forward to the regular versions;
+// the two 16 >> 64 overloads have their own implementations.
+
+// 16 >> 32
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    return v_dotprod(a, b);
+}
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    return v_dotprod(a, b, c);
+}
+
+// 32 >> 64
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{
+    return v_dotprod(a, b);
+}
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    return v_dotprod(a, b, c);
+}
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{
+    return v_dotprod_expand(a, b);
+}
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{
+    return v_dotprod_expand(a, b, c);
+}
+
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{
+    return v_dotprod_expand(a, b);
+}
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{
+    return v_dotprod_expand(a, b, c);
+}
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{
+    const __m128i even_products = __lsx_vmulwev_w_hu(a.val, b.val);
+    const __m128i odd_products  = __lsx_vmulwod_w_hu(a.val, b.val);
+    const __m128i even_pairs    = __lsx_vhaddw_du_wu(even_products, even_products);
+    const __m128i odd_pairs     = __lsx_vhaddw_du_wu(odd_products, odd_products);
+    // Collapse each set to one total and place the two totals in the two output lanes.
+    const __m128i even_total    = __lsx_vhaddw_qu_du(even_pairs, even_pairs);
+    const __m128i odd_total     = __lsx_vhaddw_qu_du(odd_pairs, odd_pairs);
+    return v_uint64x2(__lsx_vilvl_d(even_total, odd_total));
+}
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{
+    return v_dotprod_expand_fast(a, b) + c;
+}
+
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    // 32-bit pairwise dot product, as in v_dotprod(int16).
+    const __m128i even_products = __lsx_vmulwev_w_h(a.val, b.val);
+    const __m128i dot32 = __lsx_vmaddwod_w_h(even_products, a.val, b.val);
+    // Sign-extend the four 32-bit sums to 64 bits by interleaving them with their
+    // sign words, then add the low and high halves.
+    const __m128i sign_words = __lsx_vsrai_w(dot32, 31);
+    const __m128i low_ext    = __lsx_vilvl_w(sign_words, dot32);
+    const __m128i high_ext   = __lsx_vilvh_w(sign_words, dot32);
+    return v_int64x2(__lsx_vadd_d(low_ext, high_ext));
+}
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{
+    return v_dotprod_expand_fast(a, b) + c;
+}
+
+// 32 >> 64f
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{
+    return v_dotprod_expand(a, b);
+}
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{
+    return v_dotprod_expand(a, b, c);
+}
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3)
+{
+    // Result is (v[0]*m0 + v[1]*m1) + (v[2]*m2 + v[3]*m3); every shuffle broadcasts one lane of v.
+    const __m128i lanes = (__m128i)v.val;
+    const __m128 term0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(lanes, 0x0), m0.val);
+    const __m128 term1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(lanes, 0x55), m1.val);
+    const __m128 term2 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(lanes, 0xAA), m2.val);
+    const __m128 term3 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(lanes, 0xFF), m3.val);
+    return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(term0, term1), __lsx_vfadd_s(term2, term3)));
+}
+
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& a)
+{
+    // Result is (v[0]*m0 + v[1]*m1) + (v[2]*m2 + a), the last term fused; lane 3 of v is not read.
+    const __m128i lanes = (__m128i)v.val;
+    const __m128 term0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(lanes, 0x0), m0.val);
+    const __m128 term1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(lanes, 0x55), m1.val);
+    const __m128 fused2 = __lsx_vfmadd_s((__m128)__lsx_vshuf4i_w(lanes, 0xAA), m2.val, a.val);
+    return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(term0, term1), fused2));
+}
+
+#define OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(_Tpvec, cast_from, cast_to) /* defines v_transpose4x4 for a 4 x 32-bit vector type */ \
+    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,                            \
+                               const _Tpvec& a2, const _Tpvec& a3,                            \
+                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3)                \
+   {                                                                                          \
+       __m128i t0 = cast_from(__lsx_vilvl_w(a1.val, a0.val));                                 \
+       __m128i t1 = cast_from(__lsx_vilvl_w(a3.val, a2.val));                                 \
+       __m128i t2 = cast_from(__lsx_vilvh_w(a1.val, a0.val));                                 \
+       __m128i t3 = cast_from(__lsx_vilvh_w(a3.val, a2.val));                                 \
+       b0.val = cast_to(__lsx_vilvl_d(t1, t0));                                               \
+       b1.val = cast_to(__lsx_vilvh_d(t1, t0));                                               \
+       b2.val = cast_to(__lsx_vilvl_d(t3, t2));                                               \
+       b3.val = cast_to(__lsx_vilvh_d(t3, t2));                                               \
+   } // two interleave stages (32-bit, then 64-bit); all inputs are read into t0..t3 before b0..b3 are written
+
+OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_uint32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP) // integer lanes: both cast hooks are OPENCV_HAL_NOP
+OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_int32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP) // signed variant, same hooks
+
+inline void v_transpose4x4(const v_float32x4& a0, const v_float32x4& a1,
+                           const v_float32x4& a2, const v_float32x4& a3,
+                           v_float32x4& b0, v_float32x4& b1, v_float32x4& b2, v_float32x4& b3)
+{
+    // Work on the raw bit patterns; all four inputs are read before any output is written.
+    const __m128i r0 = (__m128i)a0.val, r1 = (__m128i)a1.val, r2 = (__m128i)a2.val, r3 = (__m128i)a3.val;
+    // Stage 1: interleave the 32-bit lanes of row pairs (0,1) and (2,3), low and high halves.
+    const __m128i lo01 = __lsx_vilvl_w(r1, r0), hi01 = __lsx_vilvh_w(r1, r0);
+    const __m128i lo23 = __lsx_vilvl_w(r3, r2), hi23 = __lsx_vilvh_w(r3, r2);
+    // Stage 2: interleave the 64-bit halves of the stage-1 results to form the output rows.
+    b0.val = (__m128)__lsx_vilvl_d(lo23, lo01);
+    b1.val = (__m128)__lsx_vilvh_d(lo23, lo01);
+    b2.val = (__m128)__lsx_vilvl_d(hi23, hi01);
+    b3.val = (__m128)__lsx_vilvh_d(hi23, hi01);
+}
+
+////////////////// Value reordering ////////////////
+
+/* Expand */
+#define OPENCV_HAL_IMPL_LSX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin_lo, intrin_hi) /* widening helpers: _Tpvec -> _Tpwvec */ \
+    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1)                \
+    {                                                                              \
+        b0.val = intrin_lo(a.val, 0);                                              \
+        b1.val = intrin_hi(a.val);                                                 \
+    }                                                                              \
+    inline _Tpwvec v_expand_low(const _Tpvec& a)                                   \
+    { return _Tpwvec(intrin_lo(a.val, 0)); }                                       \
+    inline _Tpwvec v_expand_high(const _Tpvec& a)                                  \
+    { return _Tpwvec(intrin_hi(a.val)); }                                          \
+    inline _Tpwvec v_load_expand(const _Tp* ptr)                                   \
+    {                                                                              \
+        __m128i a = __lsx_vld(ptr, 0);                                             \
+        return _Tpwvec(intrin_lo(a, 0));                                           \
+    } // NOTE(review): v_load_expand loads through __lsx_vld but only widens via intrin_lo; if __lsx_vld always reads 16 bytes this touches memory past the consumed lanes - confirm
+
+OPENCV_HAL_IMPL_LSX_EXPAND(v_uint8x16, v_uint16x8, uchar,     __lsx_vsllwil_hu_bu, __lsx_vexth_hu_bu) // uchar    -> v_uint16x8
+OPENCV_HAL_IMPL_LSX_EXPAND(v_int8x16,  v_int16x8,  schar,     __lsx_vsllwil_h_b,   __lsx_vexth_h_b) // schar    -> v_int16x8
+OPENCV_HAL_IMPL_LSX_EXPAND(v_uint16x8, v_uint32x4, ushort,    __lsx_vsllwil_wu_hu, __lsx_vexth_wu_hu) // ushort   -> v_uint32x4
+OPENCV_HAL_IMPL_LSX_EXPAND(v_int16x8,  v_int32x4,  short,     __lsx_vsllwil_w_h,   __lsx_vexth_w_h) // short    -> v_int32x4
+OPENCV_HAL_IMPL_LSX_EXPAND(v_uint32x4, v_uint64x2, unsigned,  __lsx_vsllwil_du_wu, __lsx_vexth_du_wu) // unsigned -> v_uint64x2
+OPENCV_HAL_IMPL_LSX_EXPAND(v_int32x4,  v_int64x2,  int,       __lsx_vsllwil_d_w,   __lsx_vexth_d_w) // int      -> v_int64x2
+
+#define OPENCV_HAL_IMPL_LSX_EXPAND_Q(_Tpvec, _Tp, intrin_lo, intrin_hi) /* v_load_expand_q: load, then widen twice */ \
+    inline _Tpvec v_load_expand_q(const _Tp* ptr)                                \
+    {                                                                            \
+        __m128i a = __lsx_vld(ptr, 0);                                           \
+        __m128i b = intrin_lo(a, 0);                                             \
+        return _Tpvec(intrin_hi(b, 0));                                          \
+    } // despite their names, both hooks are instantiated below with vsllwil (shift 0) intrinsics: first 8 -> 16 bit, then 16 -> 32 bit
+
+OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_uint32x4, uchar, __lsx_vsllwil_hu_bu, __lsx_vsllwil_wu_hu) // uchar -> v_uint32x4
+OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_int32x4,  schar, __lsx_vsllwil_h_b,   __lsx_vsllwil_w_h) // schar -> v_int32x4
+
+/* pack */
+// 16
+inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b) // signed 16 -> 8 bit pack of a and b, via the _lsx_packs_h helper (defined outside this section)
+{ return v_int8x16(_lsx_packs_h(a.val, b.val)); }
+
+inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b) // unsigned 16 -> 8 bit pack: vssrlrni narrowing intrinsic with shift amount 0
+{ return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, 0)); }
+
+inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) // signed input, unsigned 8-bit output, via the _lsx_packus_h helper (defined outside this section)
+{ return v_uint8x16(_lsx_packus_h(a.val, b.val)); }
+
+inline void v_pack_store(schar* ptr, const v_int16x8& a) // packs a with itself and writes the result through v_store_low
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(uchar* ptr, const v_uint16x8& a) // unsigned counterpart: v_pack(a, a) written through v_store_low
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x8& a) // signed -> unsigned counterpart: v_pack_u(a, a) written through v_store_low
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+template<int n> inline
+v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b) // unsigned 16 -> 8 bit: narrowing right shift by the compile-time amount n (vssrlrni)
+{ return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a) // same narrowing applied to (a, a); 64-bit element 0 of the result is stored at ptr
+{ __lsx_vstelm_d(__lsx_vssrlrni_bu_h(a.val, a.val, n), ptr, 0, 0); }
+
+template<int n> inline
+v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b) // signed input, unsigned 8-bit output: arithmetic-shift narrowing by n (vssrarni, _bu form)
+{ return v_uint8x16(__lsx_vssrarni_bu_h(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a) // store form of v_rshr_pack_u: 64-bit element 0 of the narrowed (a, a) goes to ptr
+{ __lsx_vstelm_d(__lsx_vssrarni_bu_h(a.val, a.val, n), ptr, 0, 0); }
+
+template<int n> inline
+v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b) // signed 16 -> 8 bit: arithmetic-shift narrowing by n (vssrarni)
+{ return v_int8x16(__lsx_vssrarni_b_h(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x8& a) // store form of the signed variant: 64-bit element 0 of the narrowed (a, a) goes to ptr
+{ __lsx_vstelm_d(__lsx_vssrarni_b_h(a.val, a.val, n), ptr, 0, 0); }
+
+//32
+inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b) // signed 32 -> 16 bit pack: vssrarni narrowing intrinsic with shift amount 0
+{ return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, 0)); }
+
+inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b) // unsigned 32 -> 16 bit pack: vssrlrni with shift amount 0
+{ return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, 0)); }
+
+inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b) // signed input, unsigned 16-bit output: vssrarni (_hu form) with shift amount 0
+{ return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, 0)); }
+
+inline void v_pack_store(short* ptr, const v_int32x4& a) // packs a with itself and writes the result through v_store_low
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(ushort *ptr, const v_uint32x4& a) // narrows (a, a) and stores 64-bit element 0 of the result at ptr
+{ __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, 0), ptr,  0, 0); }
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x4& a) // signed -> unsigned store form: 64-bit element 0 of the narrowed (a, a) goes to ptr
+{ __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, 0), ptr, 0, 0); }
+
+template<int n> inline
+v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b) // unsigned 32 -> 16 bit: narrowing right shift by the compile-time amount n (vssrlrni)
+{ return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a) // same narrowing applied to (a, a); 64-bit element 0 of the result is stored at ptr
+{ __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, n), ptr, 0, 0); }
+
+template<int n> inline
+v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b) // signed input, unsigned 16-bit output: arithmetic-shift narrowing by n (vssrarni, _hu form)
+{ return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a) // store form of v_rshr_pack_u: 64-bit element 0 of the narrowed (a, a) goes to ptr
+{ __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, n), ptr, 0, 0); }
+
+template<int n> inline
+v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b) // signed 32 -> 16 bit: arithmetic-shift narrowing by n (vssrarni)
+{ return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x4& a) // store form of the signed variant: 64-bit element 0 of the narrowed (a, a) goes to ptr
+{ __lsx_vstelm_d(__lsx_vssrarni_h_w(a.val, a.val, n), ptr, 0, 0); }
+
+// 64
+// Non-saturating pack
+inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b) // 64 -> 32 bit pack: picks the even 32-bit words (the low word of every 64-bit lane) of a, then b
+{ return v_uint32x4(__lsx_vpickev_w(b.val, a.val)); }
+
+inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b) // signed overload: reinterpret as unsigned, reuse the overload above, reinterpret back
+{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
+
+inline void v_pack_store(unsigned* ptr, const v_uint64x2& a) // shuffle 0x08 puts words 0 and 2 of a into the low 64 bits, which are then stored at ptr
+{ __lsx_vstelm_d(__lsx_vshuf4i_w(a.val, 0x08), ptr, 0, 0); }
+
+inline void v_pack_store(int *ptr, const v_int64x2& a) // signed overload: forwards to the unsigned version on the reinterpreted vector
+{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(a)); }
+
+template<int n> inline
+v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b) // unsigned 64 -> 32 bit: narrowing right shift by the compile-time amount n (vsrlrni, no saturating 's' in the mnemonic)
+{ return v_uint32x4(__lsx_vsrlrni_w_d(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a) // same narrowing applied to (a, a); 64-bit element 0 of the result is stored at ptr
+{ __lsx_vstelm_d(__lsx_vsrlrni_w_d(a.val, a.val, n), ptr, 0, 0); }
+
+template<int n> inline
+v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b) // signed 64 -> 32 bit: arithmetic-shift narrowing by n (vsrarni)
+{ return v_int32x4(__lsx_vsrarni_w_d(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x2& a) // store form of the signed variant: 64-bit element 0 of the narrowed (a, a) goes to ptr
+{ __lsx_vstelm_d(__lsx_vsrarni_w_d(a.val, a.val, n), ptr, 0, 0); }
+
+// pack boolean
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) // boolean pack 16 -> 8 bit; note the signed narrowing intrinsic (vssrarni_b_h) with shift 0 is applied to the unsigned vectors
+{ return v_uint8x16(__lsx_vssrarni_b_h(b.val, a.val, 0)); }
+
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    // Two narrowing steps (shift 0): 32-bit lanes to 16-bit per input pair, then 16-bit to 8-bit.
+    const __m128i half_ab = __lsx_vssrarni_h_w(b.val, a.val, 0), half_cd = __lsx_vssrarni_h_w(d.val, c.val, 0);
+    return v_uint8x16(__lsx_vssrarni_b_h(half_cd, half_ab, 0));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    // Step 1 (shift 0): narrow each input pair from 64-bit lanes to 32-bit lanes.
+    const __m128i w_ab = __lsx_vssrarni_w_d(b.val, a.val, 0), w_cd = __lsx_vssrarni_w_d(d.val, c.val, 0);
+    const __m128i w_ef = __lsx_vssrarni_w_d(f.val, e.val, 0), w_gh = __lsx_vssrarni_w_d(h.val, g.val, 0);
+    // Step 2: 32-bit to 16-bit; the return statement performs the final 16-bit to 8-bit step.
+    const __m128i h_abcd = __lsx_vssrarni_h_w(w_cd, w_ab, 0);
+    const __m128i h_efgh = __lsx_vssrarni_h_w(w_gh, w_ef, 0);
+
+    return v_uint8x16(__lsx_vssrarni_b_h(h_efgh, h_abcd, 0));
+}
+
+/* Recombine */
+// it's up there with load and store operations
+
+/* Extract */
+#define OPENCV_HAL_IMPL_LSX_EXTRACT(_Tpvec) /* v_extract<s> */  \
+    template<int s>                                            \
+    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)  \
+    { return v_rotate_right<s>(a, b); } // v_extract<s>(a, b) is defined as v_rotate_right<s>(a, b)
+
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint8x16) // one instantiation per 128-bit vector type
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_int8x16)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint16x8)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_int16x8)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint32x4)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_int32x4)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint64x2)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_int64x2)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_float32x4)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_float64x2)
+
+#define OPENCV_HAL_IMPL_LSX_EXTRACT_N(_Tpvec, _Twvec, intrin) /* v_extract_n<i> */ \
+template<int i>                                                           \
+inline _Twvec v_extract_n(const _Tpvec& a)                                \
+{ return (_Twvec)intrin(a.val, i); } // lane i is fetched with intrin and cast to the scalar type (named _Twvec here, although it is a scalar)
+
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint8x16, uchar,   __lsx_vpickve2gr_b) // unsigned variants reuse the signed intrinsic and rely on the cast above
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int8x16,  schar,   __lsx_vpickve2gr_b)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint16x8, ushort,  __lsx_vpickve2gr_h)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int16x8,  short,   __lsx_vpickve2gr_h)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint32x4, uint,    __lsx_vpickve2gr_w)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int32x4,  int,     __lsx_vpickve2gr_w)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint64x2, uint64,  __lsx_vpickve2gr_d)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int64x2,  int64,   __lsx_vpickve2gr_d)
+
+template<int i>
+inline float v_extract_n(const v_float32x4& v)
+{
+    // Fetch lane i as a 32-bit integer, then read the same bits back as a float through a union.
+    union { uint raw; float value; } lane = { static_cast<uint>(__lsx_vpickve2gr_w(v.val, i)) };
+    return lane.value;
+}
+
+template<int i>
+inline double v_extract_n(const v_float64x2& v)
+{
+    // Fetch lane i as a 64-bit integer, then read the same bits back as a double through a union.
+    union { uint64 raw; double value; } lane = { static_cast<uint64>(__lsx_vpickve2gr_d(v.val, i)) };
+    return lane.value;
+}
+
+template<int i>
+inline v_uint32x4 v_broadcast_element(const v_uint32x4& a) // replicates 32-bit lane i of a into all four lanes (vreplvei)
+{ return v_uint32x4(__lsx_vreplvei_w(a.val, i)); }
+
+template<int i>
+inline v_int32x4 v_broadcast_element(const v_int32x4& a) // signed overload, same intrinsic
+{ return v_int32x4(__lsx_vreplvei_w(a.val, i)); }
+
+template<int i>
+inline v_float32x4 v_broadcast_element(const v_float32x4& a) // float overload: bits are cast to the integer vector type and back around the same intrinsic
+{ return v_float32x4((__m128)__lsx_vreplvei_w((__m128i)a.val, i)); }
+
+/////////////////// load deinterleave //////////////////////////////
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
+{
+    // Of the 32 bytes loaded, the even-indexed ones form a and the odd-indexed ones form b.
+    const __m128i first = __lsx_vld(ptr, 0);
+    const __m128i second = __lsx_vld(ptr, 16);
+    a.val = __lsx_vpickev_b(second, first);
+    b.val = __lsx_vpickod_b(second, first);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
+{
+    // Even-indexed 16-bit elements of the 32 loaded bytes form a, odd-indexed ones form b.
+    const __m128i first = __lsx_vld(ptr, 0), second = __lsx_vld(ptr, 16);
+    a.val = __lsx_vpickev_h(second, first);
+    b.val = __lsx_vpickod_h(second, first);
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
+{
+    // Even-indexed 32-bit elements of the 32 loaded bytes form a, odd-indexed ones form b.
+    const __m128i first = __lsx_vld(ptr, 0), second = __lsx_vld(ptr, 16);
+    a.val = __lsx_vpickev_w(second, first);
+    b.val = __lsx_vpickod_w(second, first);
+}
+
+inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b)
+{
+    // The low 64-bit halves of the two loads form a, the high halves form b.
+    const __m128i first = __lsx_vld(ptr, 0), second = __lsx_vld(ptr, 16);
+    a.val = __lsx_vilvl_d(second, first);
+    b.val = __lsx_vilvh_d(second, first);
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c) // splits 48 interleaved bytes (a0 b0 c0 a1 ...) into the three channels a, b, c
+{
+    __m128i t0 = __lsx_vld(ptr, 0); // bytes 0..15
+    __m128i t1 = __lsx_vld(ptr, 16); // bytes 16..31
+    __m128i t2 = __lsx_vld(ptr, 32); // bytes 32..47
+    const __m128i shuff0 = _v128_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); // bit-select mask, all-ones at byte positions 2, 5, 8, 11, 14
+    const __m128i shuff1 = _v128_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); // bit-select mask, all-ones at byte positions 1, 4, 7, 10, 13
+    __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff0); // blend of t0 and t1 under the mask: gathers the channel-a bytes of the first 32 bytes into one vector
+    __m128i b0 = __lsx_vbitsel_v(t1, t0, shuff1); // same idea for channel b
+    __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0); // same idea for channel c
+    const __m128i shuff_a = _v128_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29); // byte indices for vshuf_b; values 16..31 address t2
+    const __m128i shuff_b = _v128_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30);
+    const __m128i shuff_c = _v128_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31);
+
+    a.val = __lsx_vshuf_b(t2, a0, shuff_a); // reorder the gathered bytes and append the remaining ones from t2
+    b.val = __lsx_vshuf_b(t2, b0, shuff_b);
+    c.val = __lsx_vshuf_b(t2, c0, shuff_c);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c) // splits 24 interleaved 16-bit elements (a0 b0 c0 a1 ...) into a, b, c
+{
+    __m128i t0 = __lsx_vld(ptr, 0); // elements 0..7
+    __m128i t1 = __lsx_vld(ptr, 16); // elements 8..15
+    __m128i t2 = __lsx_vld(ptr, 32); // elements 16..23
+    const __m128i shuff0 = _v128_setr_h(0, 0, -1, 0, 0, -1, 0, 0); // bit-select mask, all-ones at 16-bit positions 2 and 5
+    const __m128i shuff1 = _v128_setr_h(0, -1, 0, 0, -1, 0, 0, -1); // bit-select mask, all-ones at 16-bit positions 1, 4 and 7
+
+    __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff1); // blend of t0 and t1 under the mask: gathers the channel-a elements of the first two loads
+    __m128i b0 = __lsx_vbitsel_v(t0, t1, shuff0); // same idea for channel b
+    __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0); // same idea for channel c
+
+    const __m128i shuff_a = _v128_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 20, 21, 26, 27); // byte indices for vshuf_b (two per 16-bit element); values 16..31 address t2
+    const __m128i shuff_b = _v128_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 16, 17, 22, 23, 28, 29);
+    const __m128i shuff_c = _v128_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31);
+
+    a.val = __lsx_vshuf_b(t2, a0, shuff_a); // reorder the gathered elements and append the remaining ones from t2
+    b.val = __lsx_vshuf_b(t2, b0, shuff_b);
+    c.val = __lsx_vshuf_b(t2, c0, shuff_c);
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c) // splits 12 interleaved 32-bit elements into a, b, c
+{
+    __m128i t0 = __lsx_vld(ptr, 0); // a0 b0 c0 a1
+    __m128i t1 = __lsx_vld(ptr, 16); // b1 c1 a2 b2
+    __m128i t2 = __lsx_vld(ptr, 32); // c2 a3 b3 c3
+
+    __m128i a0 = __lsx_vpermi_w(t1, t0, 0xAC); // a0 a1 a2 a2
+    __m128i b0 = __lsx_vpermi_w(t1, t0, 0xC5); // b0 b0 b1 b2
+    __m128i c0 = __lsx_vpermi_w(t1, t0, 0x5A); // c0 c0 c1 c1
+
+    a.val = __lsx_vextrins_w(a0, t2, 0x31); // lane 3 <- t2 lane 1 (a3)
+    b0    = __lsx_vshuf4i_w(b0, 0x38); // b0 b1 b2 b0
+    c0    = __lsx_vshuf4i_w(c0, 0x8); // c0 c1 c0 c0
+    b.val = __lsx_vextrins_w(b0, t2, 0x32); // lane 3 <- t2 lane 2 (b3)
+    c.val = __lsx_vpermi_w(t2, c0, 0xC4); // c0 c1 from c0, then t2 lanes 0 and 3 (c2 c3)
+}
+
+inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c) // splits 6 interleaved 64-bit elements into a, b, c
+{
+    __m128i t0 = __lsx_vld(ptr, 0); // a0 b0
+    __m128i t1 = __lsx_vld(ptr, 16); // c0 a1
+    __m128i t2 = __lsx_vld(ptr, 32); // b1 c1
+
+    a.val = __lsx_vshuf4i_d(t0, t1, 0xC); // a0 a1
+    b.val = __lsx_vshuf4i_d(t0, t2, 0x9); // b0 b1
+    c.val = __lsx_vshuf4i_d(t1, t2, 0xC); // c0 c1
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
+{
+    // 64 interleaved bytes, loaded as four 16-byte vectors.
+    const __m128i q0 = __lsx_vld(ptr, 0);
+    const __m128i q1 = __lsx_vld(ptr, 16);
+    const __m128i q2 = __lsx_vld(ptr, 32);
+    const __m128i q3 = __lsx_vld(ptr, 48);
+
+    // Pass 1: even-indexed bytes carry channels a and c, odd-indexed bytes carry b and d.
+    const __m128i even01 = __lsx_vpickev_b(q1, q0), odd01 = __lsx_vpickod_b(q1, q0);
+    const __m128i even23 = __lsx_vpickev_b(q3, q2), odd23 = __lsx_vpickod_b(q3, q2);
+    // Pass 2: the same even/odd split separates a from c and b from d.
+    a.val = __lsx_vpickev_b(even23, even01);
+    b.val = __lsx_vpickev_b(odd23, odd01);
+    c.val = __lsx_vpickod_b(even23, even01);
+    d.val = __lsx_vpickod_b(odd23, odd01);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
+{
+    // 32 interleaved 16-bit elements, loaded as four 16-byte vectors.
+    const __m128i q0 = __lsx_vld(ptr, 0);
+    const __m128i q1 = __lsx_vld(ptr, 16);
+    const __m128i q2 = __lsx_vld(ptr, 32);
+    const __m128i q3 = __lsx_vld(ptr, 48);
+
+    // Pass 1: even-indexed elements carry channels a and c, odd-indexed elements carry b and d.
+    const __m128i even01 = __lsx_vpickev_h(q1, q0), odd01 = __lsx_vpickod_h(q1, q0);
+    const __m128i even23 = __lsx_vpickev_h(q3, q2), odd23 = __lsx_vpickod_h(q3, q2);
+    // Pass 2: the same even/odd split separates a from c and b from d.
+    a.val = __lsx_vpickev_h(even23, even01);
+    b.val = __lsx_vpickev_h(odd23, odd01);
+    c.val = __lsx_vpickod_h(even23, even01);
+    d.val = __lsx_vpickod_h(odd23, odd01);
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
+{
+    // Four loads of (a, b, c, d) quadruples; the split below is a 4x4 transpose of 32-bit lanes.
+    const __m128i q0 = __lsx_vld(ptr, 0);
+    const __m128i q1 = __lsx_vld(ptr, 16);
+    const __m128i q2 = __lsx_vld(ptr, 32);
+    const __m128i q3 = __lsx_vld(ptr, 48);
+    // Stage 1: interleave 32-bit lanes of load pairs (0,1) and (2,3), low and high halves.
+    const __m128i lo01 = __lsx_vilvl_w(q1, q0), hi01 = __lsx_vilvh_w(q1, q0);
+    const __m128i lo23 = __lsx_vilvl_w(q3, q2), hi23 = __lsx_vilvh_w(q3, q2);
+    // Stage 2: interleave the 64-bit halves of the stage-1 results.
+    a.val = __lsx_vilvl_d(lo23, lo01);
+    b.val = __lsx_vilvh_d(lo23, lo01);
+    c.val = __lsx_vilvl_d(hi23, hi01);
+    d.val = __lsx_vilvh_d(hi23, hi01);
+}
+
+inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
+{
+    const __m128i ab_first = __lsx_vld(ptr, 0);
+    const __m128i cd_first = __lsx_vld(ptr, 16);
+    const __m128i ab_second = __lsx_vld(ptr, 32);
+    const __m128i cd_second = __lsx_vld(ptr, 48);
+    // Loads 0 and 2 hold (a, b) pairs, loads 1 and 3 hold (c, d) pairs; regroup their 64-bit halves.
+    a.val = __lsx_vilvl_d(ab_second, ab_first);
+    b.val = __lsx_vilvh_d(ab_second, ab_first);
+    c.val = __lsx_vilvl_d(cd_second, cd_first);
+    d.val = __lsx_vilvh_d(cd_second, cd_first);
+}
+
+////////////////////////// store interleave ////////////////////////////////
+
+inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    // Interleave a and b byte by byte and write the 32 resulting bytes; the mode argument is unused.
+    const __m128i pairs_lo = __lsx_vilvl_b(b.val, a.val);
+    const __m128i pairs_hi = __lsx_vilvh_b(b.val, a.val);
+    __lsx_vst(pairs_lo, ptr, 0);
+    __lsx_vst(pairs_hi, ptr, 16);
+}
+
+inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    // Interleave the 16-bit lanes of a and b and write 32 bytes; the mode argument is unused.
+    const __m128i pairs_lo = __lsx_vilvl_h(b.val, a.val);
+    const __m128i pairs_hi = __lsx_vilvh_h(b.val, a.val);
+    __lsx_vst(pairs_lo, ptr, 0);
+    __lsx_vst(pairs_hi, ptr, 16);
+}
+
+inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    // Interleave the 32-bit lanes of a and b and write 32 bytes; the mode argument is unused.
+    const __m128i pairs_lo = __lsx_vilvl_w(b.val, a.val);
+    const __m128i pairs_hi = __lsx_vilvh_w(b.val, a.val);
+    __lsx_vst(pairs_lo, ptr, 0);
+    __lsx_vst(pairs_hi, ptr, 16);
+}
+
+inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    // Interleave the 64-bit lanes of a and b and write 32 bytes; the mode argument is unused.
+    const __m128i pairs_lo = __lsx_vilvl_d(b.val, a.val);
+    const __m128i pairs_hi = __lsx_vilvh_d(b.val, a.val);
+    __lsx_vst(pairs_lo, ptr, 0);
+    __lsx_vst(pairs_hi, ptr, 16);
+}
+
+inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, const v_uint8x16& c, // writes 48 bytes built from a, b and c via the shuffle tables below
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) // mode is unnamed and unused
+{
+    __m128i ab_lo = __lsx_vilvl_b(b.val, a.val); // a and b interleaved byte-wise, low halves
+    __m128i ab_hi = __lsx_vilvh_b(b.val, a.val); // a and b interleaved byte-wise, high halves
+    __m128i v_c = c.val;
+    const __m128i shuff0 = _v128_setr_b(0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10); // byte indices for vshuf_b; values 16..31 address the first vshuf_b operand
+    const __m128i shuff1 = _v128_setr_b(11, 21, 12, 13, 22, 14, 15, 23, 0, 0, 0, 0, 0, 0, 0, 0); // only the first 8 entries matter: shuff2 keeps just bytes 0..7 of this result
+    const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 24, 18, 19, 25, 20, 21);
+    const __m128i shuff3 = _v128_setr_b(26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31);
+    __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4); // low 64 bits of ab_hi combined with high 64 bits of c
+
+    __m128i dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0); // first 16 output bytes
+    __m128i dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1); // first half of the middle 16 bytes
+    __m128i dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3); // last 16 output bytes
+    dst1 = __lsx_vshuf_b(abc, dst1, shuff2); // completes the middle 16 bytes from abc
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+}
+
+inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, const v_uint16x8& c, // writes 48 bytes built from the 16-bit lanes of a, b and c
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) // mode is unnamed and unused
+{
+    __m128i ab_lo = __lsx_vilvl_h(b.val, a.val); // a and b interleaved per 16-bit lane, low halves
+    __m128i ab_hi = __lsx_vilvh_h(b.val, a.val); // a and b interleaved per 16-bit lane, high halves
+    __m128i v_c = c.val;
+    const __m128i shuff0 = _v128_setr_b(0, 1, 2, 3, 16, 17, 4, 5, 6, 7, 18, 19, 8, 9, 10, 11); // byte indices for vshuf_b (two per element); values 16..31 address the first vshuf_b operand
+    const __m128i shuff1 = _v128_setr_b(20, 21, 12, 13, 14, 15, 22, 23, 0, 0, 0, 0, 0, 0, 0, 0); // only the first 8 entries matter: shuff2 keeps just bytes 0..7 of this result
+    const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 24, 25, 20, 21);
+    const __m128i shuff3 = _v128_setr_b(6, 7, 26, 27, 8, 9, 10, 11, 28, 29, 12, 13, 14, 15, 30, 31);
+    __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4); // low 64 bits of ab_hi combined with high 64 bits of c
+
+    __m128i dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0); // first 16 output bytes
+    __m128i dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1); // first half of the middle 16 bytes
+    __m128i dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3); // last 16 output bytes
+    dst1 = __lsx_vshuf_b(abc, dst1, shuff2); // completes the middle 16 bytes from abc
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+}
+
+inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, const v_uint32x4& c, // writes a0 b0 c0 a1 b1 c1 ... (12 x 32 bit)
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) // mode is unnamed and unused
+{
+    __m128i v_c = c.val;
+    __m128i ab_lo = __lsx_vilvl_w(b.val, a.val);  //a0 b0 a1 b1
+    __m128i ab_hi = __lsx_vilvh_w(b.val, a.val);  //a2 b2 a3 b3
+    __m128i bc_od = __lsx_vpackod_w(v_c, b.val); // b1 c1 b3 c3
+
+    __m128i dst0 = __lsx_vshuf4i_w(ab_lo, 0xB4);  //a0 b0 b1 a1
+    __m128i dst1 = __lsx_vilvl_d(ab_hi, bc_od); //b1 c1 a2 b2
+    __m128i dst2 = __lsx_vpermi_w(bc_od, ab_hi, 0xE8); //a2, a3, b3, c3
+
+    dst0 = __lsx_vextrins_w(dst0, v_c, 0x20); // lane 2 <- c0
+    dst2 = __lsx_vextrins_w(dst2, v_c, 0x2); // lane 0 <- c2
+    __lsx_vst(dst0, ptr, 0);  //a0 b0 c0 a1
+    __lsx_vst(dst1, ptr, 16); //b1 c1 a2 b2
+    __lsx_vst(dst2, ptr, 32); //c2 a3 b3 c3
+}
+
+inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, // writes a0 b0 c0 a1 b1 c1 (6 x 64 bit)
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) // mode is unnamed and unused
+{
+    __m128i dst0 = __lsx_vilvl_d(b.val, a.val); // a0 b0
+    __m128i dst1 = __lsx_vpermi_w(a.val, c.val, 0xE4); // c0 a1 (low 64 bits of c, high 64 bits of a)
+    __m128i dst2 = __lsx_vilvh_d(c.val, b.val); // b1 c1
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+}
+
+inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                               const v_uint8x16& c, const v_uint8x16& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    // Step 1: byte-interleave a with b and c with d.
+    const __m128i ab_l = __lsx_vilvl_b(b.val, a.val);
+    const __m128i ab_h = __lsx_vilvh_b(b.val, a.val);
+    const __m128i cd_l = __lsx_vilvl_b(d.val, c.val);
+    const __m128i cd_h = __lsx_vilvh_b(d.val, c.val);
+    // Step 2: interleave the 16-bit (a,b) pairs with the (c,d) pairs, then write 64 bytes.
+    const __m128i quad0 = __lsx_vilvl_h(cd_l, ab_l);
+    const __m128i quad1 = __lsx_vilvh_h(cd_l, ab_l);
+    const __m128i quad2 = __lsx_vilvl_h(cd_h, ab_h);
+    const __m128i quad3 = __lsx_vilvh_h(cd_h, ab_h);
+    __lsx_vst(quad0, ptr, 0);
+    __lsx_vst(quad1, ptr, 16);
+    __lsx_vst(quad2, ptr, 32);
+    __lsx_vst(quad3, ptr, 48);
+}
+
+inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                               const v_uint16x8& c, const v_uint16x8& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    // Step 1: interleave the 16-bit lanes of a with b and of c with d.
+    const __m128i ab_l = __lsx_vilvl_h(b.val, a.val);
+    const __m128i ab_h = __lsx_vilvh_h(b.val, a.val);
+    const __m128i cd_l = __lsx_vilvl_h(d.val, c.val);
+    const __m128i cd_h = __lsx_vilvh_h(d.val, c.val);
+    // Step 2: interleave the 32-bit (a,b) pairs with the (c,d) pairs, then write 64 bytes.
+    const __m128i quad0 = __lsx_vilvl_w(cd_l, ab_l);
+    const __m128i quad1 = __lsx_vilvh_w(cd_l, ab_l);
+    const __m128i quad2 = __lsx_vilvl_w(cd_h, ab_h);
+    const __m128i quad3 = __lsx_vilvh_w(cd_h, ab_h);
+    __lsx_vst(quad0, ptr, 0);
+    __lsx_vst(quad1, ptr, 16);
+    __lsx_vst(quad2, ptr, 32);
+    __lsx_vst(quad3, ptr, 48);
+}
+
+inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                               const v_uint32x4& c, const v_uint32x4& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    // Step 1: interleave the 32-bit lanes of a with b and of c with d.
+    const __m128i ab_l = __lsx_vilvl_w(b.val, a.val);
+    const __m128i ab_h = __lsx_vilvh_w(b.val, a.val);
+    const __m128i cd_l = __lsx_vilvl_w(d.val, c.val);
+    const __m128i cd_h = __lsx_vilvh_w(d.val, c.val);
+    // Step 2: interleave the 64-bit (a,b) pairs with the (c,d) pairs, then write 64 bytes.
+    const __m128i quad0 = __lsx_vilvl_d(cd_l, ab_l);
+    const __m128i quad1 = __lsx_vilvh_d(cd_l, ab_l);
+    const __m128i quad2 = __lsx_vilvl_d(cd_h, ab_h);
+    const __m128i quad3 = __lsx_vilvh_d(cd_h, ab_h);
+    __lsx_vst(quad0, ptr, 0);
+    __lsx_vst(quad1, ptr, 16);
+    __lsx_vst(quad2, ptr, 32);
+    __lsx_vst(quad3, ptr, 48);
+}
+
+inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               const v_uint64x2& c, const v_uint64x2& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    // Low lanes of (a,b) and (c,d) form the first 32 bytes, high lanes the second 32 bytes.
+    const __m128i ab_first = __lsx_vilvl_d(b.val, a.val);
+    const __m128i ab_second = __lsx_vilvh_d(b.val, a.val);
+    const __m128i cd_first = __lsx_vilvl_d(d.val, c.val);
+    const __m128i cd_second = __lsx_vilvh_d(d.val, c.val);
+    __lsx_vst(ab_first, ptr, 0);
+    __lsx_vst(cd_first, ptr, 16);
+    __lsx_vst(ab_second, ptr, 32);
+    __lsx_vst(cd_second, ptr, 48);
+}
+
+#define OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) /* adapters: type 0 via type 1 */ \
+inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0)                        \
+{                                                                                                 \
+    _Tpvec1 a1, b1;                                                                               \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1);                                                \
+    a0 = v_reinterpret_as_##suffix0(a1);                                                          \
+    b0 = v_reinterpret_as_##suffix0(b1);                                                          \
+}                                                                                                 \
+inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0)           \
+{                                                                                                 \
+    _Tpvec1 a1, b1, c1;                                                                           \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1);                                            \
+    a0 = v_reinterpret_as_##suffix0(a1);                                                          \
+    b0 = v_reinterpret_as_##suffix0(b1);                                                          \
+    c0 = v_reinterpret_as_##suffix0(c1);                                                          \
+}                                                                                                 \
+inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0,                        \
+                                _Tpvec0& c0, _Tpvec0& d0)                                         \
+{                                                                                                 \
+    _Tpvec1 a1, b1, c1, d1;                                                                       \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1);                                        \
+    a0 = v_reinterpret_as_##suffix0(a1);                                                          \
+    b0 = v_reinterpret_as_##suffix0(b1);                                                          \
+    c0 = v_reinterpret_as_##suffix0(c1);                                                          \
+    d0 = v_reinterpret_as_##suffix0(d1);                                                          \
+}                                                                                                 \
+inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0,                   \
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)                      \
+{                                                                                                 \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0);                                                  \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0);                                                  \
+    v_store_interleave((_Tp1*)ptr, a1, b1);                                                     \
+}                                                                                                 \
+inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0,\
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)                      \
+{                                                                                                 \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0);                                                  \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0);                                                  \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0);                                                  \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1);                                                 \
+}                                                                                                 \
+inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0,                   \
+                               const _Tpvec0& c0, const _Tpvec0& d0,                              \
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)                      \
+{                                                                                                 \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0);                                                  \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0);                                                  \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0);                                                  \
+    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0);                                                  \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1);                                             \
+} // every generated overload casts the pointer, reinterprets the vectors and calls the _Tpvec1 overload; the mode argument is not forwarded
+
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8) // schar  handled through the uchar overloads
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16) // short  handled through the ushort overloads
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32) // int    handled through the unsigned overloads
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32) // float  handled through the unsigned overloads
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64) // int64  handled through the uint64 overloads
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64) // double handled through the uint64 overloads
+
+//
+// FP16
+//
+
+inline v_float32x4 v_load_expand(const hfloat* ptr) // loads half-precision values from ptr and returns them widened to a v_float32x4
+{
+#if CV_FP16
+    return v_float32x4(__lsx_vfcvtl_s_h((__m128)__lsx_vld(ptr, 0))); // NOTE(review): the loaded vector is cast to the float type __m128 before the half -> float conversion; confirm this matches the intrinsic's parameter type
+#else
+    float CV_DECL_ALIGNED(32) buf[4]; // scalar fallback: convert ptr[0..3] one element at a time
+    for (int i = 0; i < 4; i++)
+        buf[i] = (float)ptr[i];
+    return v_float32x4((__m128)__lsx_vld(buf, 0)); // reload the four converted floats as one vector
+#endif
+}
+
+inline void v_pack_store(hfloat* ptr, const v_float32x4& a) // converts the four floats of a to hfloat and writes them to ptr[0..3]
+{
+#if CV_FP16
+    __m128i res = (__m128i)__lsx_vfcvt_h_s(a.val, a.val); // the cast must name __m128i, the 128-bit integer vector type used throughout this header
+    __lsx_vstelm_d(res, ptr, 0, 0); // store 64-bit element 0 of the converted vector at ptr
+#else
+    float CV_DECL_ALIGNED(32) buf[4]; // scalar fallback: spill the lanes, then convert one by one
+    v_store_aligned(buf, a);
+    for (int i = 0; i < 4; i++)
+        ptr[i] = hfloat(buf[i]);
+#endif
+}
+
+//
+// end of FP16
+//
+
+inline void v_cleanup() {} // empty body: this backend does nothing in v_cleanup
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+} // cv::
+
+#endif // OPENCV_HAL_INTRIN_LSX_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_msa.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_msa.hpp
new file mode 100644
index 000000000000..23d6ebd3d1d0
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_msa.hpp
@@ -0,0 +1,1887 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_HAL_INTRIN_MSA_HPP
+#define OPENCV_HAL_INTRIN_MSA_HPP
+
+#include <algorithm>
+#include "opencv2/core/utility.hpp"
+
+namespace cv
+{
+
+//! @cond IGNORED
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#define CV_SIMD128 1
+
+// MSA implements 128-bit wide vector registers that are shared with the 64-bit wide floating-point unit registers.
+// MSA and an FPU cannot both be present unless the FPU has 64-bit floating-point registers.
+#define CV_SIMD128_64F 1
+
+struct v_uint8x16
+{
+    typedef uchar lane_type;
+    enum { nlanes = 16 };
+    // Wrapped MSA vector value (16 x uchar).
+    v16u8 val;
+
+    // Default construction leaves val uninitialized.
+    v_uint8x16() {}
+    // Wraps an existing MSA vector.
+    explicit v_uint8x16(v16u8 raw) : val(raw) {}
+    // Loads the 16 arguments, in order, through a temporary array.
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        uchar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = msa_ld1q_u8(lanes);
+    }
+    // Reads lane 0 as a scalar.
+    uchar get0() const { return msa_getq_lane_u8(val, 0); }
+};
+
+struct v_int8x16
+{
+    typedef schar lane_type;
+    enum { nlanes = 16 };
+    // Wrapped MSA vector value (16 x schar).
+    v16i8 val;
+
+    // Default construction leaves val uninitialized.
+    v_int8x16() {}
+    // Wraps an existing MSA vector.
+    explicit v_int8x16(v16i8 raw) : val(raw) {}
+    // Loads the 16 arguments, in order, through a temporary array.
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+               schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        schar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = msa_ld1q_s8(lanes);
+    }
+    // Reads lane 0 as a scalar.
+    schar get0() const { return msa_getq_lane_s8(val, 0); }
+};
+
+struct v_uint16x8
+{
+    typedef ushort lane_type;
+    enum { nlanes = 8 };
+    // Wrapped MSA vector value (8 x ushort).
+    v8u16 val;
+
+    // Default construction leaves val uninitialized.
+    v_uint16x8() {}
+    // Wraps an existing MSA vector.
+    explicit v_uint16x8(v8u16 raw) : val(raw) {}
+    // Loads the 8 arguments, in order, through a temporary array.
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        ushort lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = msa_ld1q_u16(lanes);
+    }
+    // Reads lane 0 as a scalar.
+    ushort get0() const { return msa_getq_lane_u16(val, 0); }
+};
+
+struct v_int16x8
+{
+    typedef short lane_type;
+    enum { nlanes = 8 };
+    // Wrapped MSA vector value (8 x short).
+    v8i16 val;
+
+    // Default construction leaves val uninitialized.
+    v_int16x8() {}
+    // Wraps an existing MSA vector.
+    explicit v_int16x8(v8i16 raw) : val(raw) {}
+    // Loads the 8 arguments, in order, through a temporary array.
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        short lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = msa_ld1q_s16(lanes);
+    }
+    // Reads lane 0 as a scalar.
+    short get0() const { return msa_getq_lane_s16(val, 0); }
+};
+
+struct v_uint32x4
+{
+    typedef unsigned int lane_type;
+    enum { nlanes = 4 };
+    // Wrapped MSA vector value (4 x unsigned int).
+    v4u32 val;
+
+    // Default construction leaves val uninitialized.
+    v_uint32x4() {}
+    // Wraps an existing MSA vector.
+    explicit v_uint32x4(v4u32 raw) : val(raw) {}
+    // Loads the 4 arguments, in order, through a temporary array.
+    v_uint32x4(unsigned int v0, unsigned int v1, unsigned int v2, unsigned int v3)
+    {
+        unsigned int lanes[] = {v0, v1, v2, v3};
+        val = msa_ld1q_u32(lanes);
+    }
+    // Reads lane 0 as a scalar.
+    unsigned int get0() const { return msa_getq_lane_u32(val, 0); }
+};
+
+struct v_int32x4
+{
+    typedef int lane_type;
+    enum { nlanes = 4 };
+    // Wrapped MSA vector value (4 x int).
+    v4i32 val;
+
+    // Default construction leaves val uninitialized.
+    v_int32x4() {}
+    // Wraps an existing MSA vector.
+    explicit v_int32x4(v4i32 raw) : val(raw) {}
+    // Loads the 4 arguments, in order, through a temporary array.
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        int lanes[] = {v0, v1, v2, v3};
+        val = msa_ld1q_s32(lanes);
+    }
+    // Reads lane 0 as a scalar.
+    int get0() const { return msa_getq_lane_s32(val, 0); }
+};
+
+struct v_float32x4
+{
+    typedef float lane_type;
+    enum { nlanes = 4 };
+    // Wrapped MSA vector value (4 x float).
+    v4f32 val;
+
+    // Default construction leaves val uninitialized.
+    v_float32x4() {}
+    // Wraps an existing MSA vector.
+    explicit v_float32x4(v4f32 raw) : val(raw) {}
+    // Loads the 4 arguments, in order, through a temporary array.
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        float lanes[] = {v0, v1, v2, v3};
+        val = msa_ld1q_f32(lanes);
+    }
+    // Reads lane 0 as a scalar.
+    float get0() const { return msa_getq_lane_f32(val, 0); }
+};
+
+struct v_uint64x2
+{
+    typedef uint64 lane_type;
+    enum { nlanes = 2 };
+    // Wrapped MSA vector value (2 x uint64).
+    v2u64 val;
+
+    // Default construction leaves val uninitialized.
+    v_uint64x2() {}
+    // Wraps an existing MSA vector.
+    explicit v_uint64x2(v2u64 raw) : val(raw) {}
+    // Loads the 2 arguments, in order, through a temporary array.
+    v_uint64x2(uint64 v0, uint64 v1)
+    {
+        uint64 lanes[] = {v0, v1};
+        val = msa_ld1q_u64(lanes);
+    }
+    // Reads lane 0 as a scalar.
+    uint64 get0() const { return msa_getq_lane_u64(val, 0); }
+};
+
+struct v_int64x2
+{
+    typedef int64 lane_type;
+    enum { nlanes = 2 };
+    // Wrapped MSA vector value (2 x int64).
+    v2i64 val;
+
+    // Default construction leaves val uninitialized.
+    v_int64x2() {}
+    // Wraps an existing MSA vector.
+    explicit v_int64x2(v2i64 raw) : val(raw) {}
+    // Loads the 2 arguments, in order, through a temporary array.
+    v_int64x2(int64 v0, int64 v1)
+    {
+        int64 lanes[] = {v0, v1};
+        val = msa_ld1q_s64(lanes);
+    }
+    // Reads lane 0 as a scalar.
+    int64 get0() const { return msa_getq_lane_s64(val, 0); }
+};
+
+struct v_float64x2
+{
+    typedef double lane_type;
+    enum { nlanes = 2 };
+    // Wrapped MSA vector value (2 x double).
+    v2f64 val;
+
+    // Default construction leaves val uninitialized.
+    v_float64x2() {}
+    // Wraps an existing MSA vector.
+    explicit v_float64x2(v2f64 raw) : val(raw) {}
+    // Loads the 2 arguments, in order, through a temporary array.
+    v_float64x2(double v0, double v1)
+    {
+        double lanes[] = {v0, v1};
+        val = msa_ld1q_f64(lanes);
+    }
+    // Reads lane 0 as a scalar.
+    double get0() const { return msa_getq_lane_f64(val, 0); }
+};
+
+#define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \
+inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \
+inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \
+inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \
+inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \
+inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \
+inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(MSA_TPV_REINTERPRET(v8i16, v.val)); } \
+inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(MSA_TPV_REINTERPRET(v4u32, v.val)); } \
+inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(MSA_TPV_REINTERPRET(v4i32, v.val)); } \
+inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(MSA_TPV_REINTERPRET(v2u64, v.val)); } \
+inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(MSA_TPV_REINTERPRET(v2i64, v.val)); } \
+inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(MSA_TPV_REINTERPRET(v4f32, v.val)); } \
+inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(MSA_TPV_REINTERPRET(v2f64, v.val)); }
+// One instantiation per vector type: v_setzero_<suffix>(), v_setall_<suffix>(v) and ten v_reinterpret_as_<x>() overloads taking that type.
+OPENCV_HAL_IMPL_MSA_INIT(uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_MSA_INIT(int8x16, schar, s8)
+OPENCV_HAL_IMPL_MSA_INIT(uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_MSA_INIT(int16x8, short, s16)
+OPENCV_HAL_IMPL_MSA_INIT(uint32x4, unsigned int, u32)
+OPENCV_HAL_IMPL_MSA_INIT(int32x4, int, s32)
+OPENCV_HAL_IMPL_MSA_INIT(uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_MSA_INIT(int64x2, int64, s64)
+OPENCV_HAL_IMPL_MSA_INIT(float32x4, float, f32)
+OPENCV_HAL_IMPL_MSA_INIT(float64x2, double, f64)
+
+#define OPENCV_HAL_IMPL_MSA_PACK(_Tpvec, _Tpwvec, pack, mov, rshr) \
+inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
+{ \
+    return _Tpvec(mov(a.val, b.val)); \
+} \
+template<int n> inline \
+_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
+{ \
+    return _Tpvec(rshr(a.val, b.val, n)); \
+}
+// Each line defines v_<pack>(a, b) and v_rshr_<pack><n>(a, b) for one wide -> narrow pair; the pack_u rows take signed input and return unsigned lanes.
+OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_uint16x8, pack, msa_qpack_u16, msa_qrpackr_u16)
+OPENCV_HAL_IMPL_MSA_PACK(v_int8x16, v_int16x8, pack, msa_qpack_s16, msa_qrpackr_s16)
+OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_uint32x4, pack, msa_qpack_u32, msa_qrpackr_u32)
+OPENCV_HAL_IMPL_MSA_PACK(v_int16x8, v_int32x4, pack, msa_qpack_s32, msa_qrpackr_s32)
+OPENCV_HAL_IMPL_MSA_PACK(v_uint32x4, v_uint64x2, pack, msa_pack_u64, msa_rpackr_u64)
+OPENCV_HAL_IMPL_MSA_PACK(v_int32x4, v_int64x2, pack, msa_pack_s64, msa_rpackr_s64)
+OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_int16x8, pack_u, msa_qpacku_s16, msa_qrpackru_s16)
+OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_int32x4, pack_u, msa_qpacku_s32, msa_qrpackru_s32)
+
+#define OPENCV_HAL_IMPL_MSA_PACK_STORE(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
+inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
+{ \
+    hreg a1 = mov(a.val); \
+    msa_st1_##suffix(ptr, a1); \
+} \
+template<int n> inline \
+void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
+{ \
+    hreg a1 = rshr(a.val, n); \
+    msa_st1_##suffix(ptr, a1); \
+}
+// Each line defines v_<pack>_store(ptr, a) and v_rshr_<pack>_store<n>(ptr, a); the _Tpvec argument is not referenced by the macro body.
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_uint16x8, pack, msa_qmovn_u16, msa_qrshrn_n_u16)
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int8x16, schar, v8i8, s8, v_int16x8, pack, msa_qmovn_s16, msa_qrshrn_n_s16)
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_uint32x4, pack, msa_qmovn_u32, msa_qrshrn_n_u32)
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int16x8, short, v4i16, s16, v_int32x4, pack, msa_qmovn_s32, msa_qrshrn_n_s32)
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint32x4, unsigned, v2u32, u32, v_uint64x2, pack, msa_movn_u64, msa_rshrn_n_u64)
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int32x4, int, v2i32, s32, v_int64x2, pack, msa_movn_s64, msa_rshrn_n_s64)
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_int16x8, pack_u, msa_qmovun_s16, msa_qrshrun_n_s16)
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_int32x4, pack_u, msa_qmovun_s32, msa_qrshrun_n_s32)
+
+// pack boolean
+// Boolean pack of two 16-bit lane vectors into one 8-bit lane vector.
+// Implemented as a single msa_pack_u16 of the two raw registers.
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_uint8x16(msa_pack_u16(a.val, b.val)); }
+
+// Boolean pack of four 32-bit lane vectors: each pair is packed to 16 bits, then both results to 8 bits.
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{ return v_uint8x16(msa_pack_u16(msa_pack_u32(a.val, b.val),
+                                 msa_pack_u32(c.val, d.val))); }
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{   // Boolean pack of eight 64-bit lane vectors through the chain msa_pack_u64 -> msa_pack_u32 -> msa_pack_u16.
+    v8u16 from_abcd = msa_pack_u32(msa_pack_u64(a.val, b.val), msa_pack_u64(c.val, d.val));
+    v8u16 from_efgh = msa_pack_u32(msa_pack_u64(e.val, f.val), msa_pack_u64(g.val, h.val));
+    return v_uint8x16(msa_pack_u16(from_abcd, from_efgh));
+}
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{   // Starts with m0 times lane 0 of v, then multiply-accumulates m1..m3 with lanes 1..3 (via the *_lane_f32 helpers).
+    v4f32 coeffs = v.val;
+    v4f32 acc = msa_mulq_lane_f32(m0.val, coeffs, 0);
+    acc = msa_mlaq_lane_f32(acc, m1.val, coeffs, 1);
+    acc = msa_mlaq_lane_f32(acc, m2.val, coeffs, 2);
+    acc = msa_mlaq_lane_f32(acc, m3.val, coeffs, 3);
+    return v_float32x4(acc);
+}
+
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a)
+{   // Like v_matmul for lanes 0..2 of v, but the fourth term is the plain addend a (lane 3 of v is not used).
+    v4f32 coeffs = v.val;
+    v4f32 acc = msa_mulq_lane_f32(m0.val, coeffs, 0);
+    acc = msa_mlaq_lane_f32(acc, m1.val, coeffs, 1);
+    acc = msa_mlaq_lane_f32(acc, m2.val, coeffs, 2);
+    acc = msa_addq_f32(acc, a.val);
+    return v_float32x4(acc);
+}
+
+#define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+} \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a.val = intrin(a.val, b.val); \
+    return a; \
+}
+// Operator tables: 8- and 16-bit + and - map to the msa_qaddq_ / msa_qsubq_ helpers; 32/64-bit and floating-point types use the plain add/sub/mul/div helpers.
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64)
+
+// saturating multiply 8-bit, 16-bit
+#define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec)         \
+inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
+{                                                            \
+    _Tpwvec c, d;                                            \
+    v_mul_expand(a, b, c, d);                                \
+    return v_pack(c, d);                                     \
+}                                                            \
+inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
+{a = a * b; return a; }
+// operator* widens with v_mul_expand, then narrows with v_pack. NOTE(review): v_mul_expand is defined after these instantiations - confirm a declaration is visible here.
+OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16,  v_int16x8)
+OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8)
+OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int16x8,  v_int32x4)
+OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint16x8, v_uint32x4)
+
+//  Multiply and expand
+inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
+                         v_int16x8& c, v_int16x8& d)
+{
+    // Interleave each operand with zeros, widen pairwise (msa_paddlq_s8), multiply in 16-bit lanes; c and d get the two halves.
+    v16i8 a_part0, a_part1, b_part0, b_part1;
+    ILVRL_B2_SB(a.val, msa_dupq_n_s8(0), a_part0, a_part1);
+    ILVRL_B2_SB(b.val, msa_dupq_n_s8(0), b_part0, b_part1);
+    c.val = msa_mulq_s16(msa_paddlq_s8(a_part0), msa_paddlq_s8(b_part0));
+    d.val = msa_mulq_s16(msa_paddlq_s8(a_part1), msa_paddlq_s8(b_part1));
+}
+
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
+                         v_uint16x8& c, v_uint16x8& d)
+{
+    // Interleave each operand with zeros, widen pairwise (msa_paddlq_u8), multiply in 16-bit lanes; c and d get the two halves.
+    v16u8 a_part0, a_part1, b_part0, b_part1;
+    ILVRL_B2_UB(a.val, msa_dupq_n_u8(0), a_part0, a_part1);
+    ILVRL_B2_UB(b.val, msa_dupq_n_u8(0), b_part0, b_part1);
+    c.val = msa_mulq_u16(msa_paddlq_u8(a_part0), msa_paddlq_u8(b_part0));
+    d.val = msa_mulq_u16(msa_paddlq_u8(a_part1), msa_paddlq_u8(b_part1));
+}
+
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
+                         v_int32x4& c, v_int32x4& d)
+{
+    // Interleave each operand with zeros, widen pairwise (msa_paddlq_s16), multiply in 32-bit lanes; c and d get the two halves.
+    v8i16 a_part0, a_part1, b_part0, b_part1;
+    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_part0, a_part1);
+    ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_part0, b_part1);
+    c.val = msa_mulq_s32(msa_paddlq_s16(a_part0), msa_paddlq_s16(b_part0));
+    d.val = msa_mulq_s32(msa_paddlq_s16(a_part1), msa_paddlq_s16(b_part1));
+}
+
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
+                         v_uint32x4& c, v_uint32x4& d)
+{
+    // Interleave each operand with zeros, widen pairwise (msa_paddlq_u16), multiply in 32-bit lanes; c and d get the two halves.
+    v8u16 a_part0, a_part1, b_part0, b_part1;
+    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_part0, a_part1);
+    ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_part0, b_part1);
+    c.val = msa_mulq_u32(msa_paddlq_u16(a_part0), msa_paddlq_u16(b_part0));
+    d.val = msa_mulq_u32(msa_paddlq_u16(a_part1), msa_paddlq_u16(b_part1));
+}
+
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d)
+{
+    // Interleave each operand with zeros, widen pairwise (msa_paddlq_u32), multiply in 64-bit lanes; c and d get the two halves.
+    v4u32 a_part0, a_part1, b_part0, b_part1;
+    ILVRL_W2_UW(a.val, msa_dupq_n_u32(0), a_part0, a_part1);
+    ILVRL_W2_UW(b.val, msa_dupq_n_u32(0), b_part0, b_part1);
+    c.val = msa_mulq_u64(msa_paddlq_u32(a_part0), msa_paddlq_u32(b_part0));
+    d.val = msa_mulq_u64(msa_paddlq_u32(a_part1), msa_paddlq_u32(b_part1));
+}
+
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{
+    // Forms the 32-bit products the same way v_mul_expand does, then narrows them with msa_packr_s32(..., 16).
+    v8i16 a_part0, a_part1, b_part0, b_part1;
+    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_part0, a_part1);
+    ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_part0, b_part1);
+    v4i32 prod0 = msa_mulq_s32(msa_paddlq_s16(a_part0), msa_paddlq_s16(b_part0));
+    v4i32 prod1 = msa_mulq_s32(msa_paddlq_s16(a_part1), msa_paddlq_s16(b_part1));
+    return v_int16x8(msa_packr_s32(prod0, prod1, 16));
+}
+
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{
+    // Forms the 32-bit products the same way v_mul_expand does, then narrows them with msa_packr_u32(..., 16).
+    v8u16 a_part0, a_part1, b_part0, b_part1;
+    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_part0, a_part1);
+    ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_part0, b_part1);
+    v4u32 prod0 = msa_mulq_u32(msa_paddlq_u16(a_part0), msa_paddlq_u16(b_part0));
+    v4u32 prod1 = msa_mulq_u32(msa_paddlq_u16(a_part1), msa_paddlq_u16(b_part1));
+    return v_uint16x8(msa_packr_u32(prod0, prod1, 16));
+}
+
+//////// Dot Product ////////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{ return v_int32x4(msa_dotp_s_w(a.val, b.val)); } // 16-bit inputs, 32-bit results, via msa_dotp_s_w
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{ return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val)); } // accumulating form: c is the first operand of msa_dpadd_s_w
+
+// 32 >> 64
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
+{ return v_int64x2(msa_dotp_s_d(a.val, b.val)); } // 32-bit inputs, 64-bit results, via msa_dotp_s_d
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{ return v_int64x2(msa_dpadd_s_d(c.val , a.val, b.val)); } // accumulating form: c is the first operand of msa_dpadd_s_d
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{   // View the inputs as 16-bit lanes, split each lane into its low and high byte, and sum the two partial dot products.
+    v8u16 a_low  = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
+    v8u16 b_low  = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
+    v8u16 a_high = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
+    v8u16 b_high = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
+    v4u32 acc    = msa_dotp_u_w(a_low, b_low);
+    return v_uint32x4(msa_dpadd_u_w(acc, a_high, b_high));
+}
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{   // Same low/high byte split as the two-argument overload, but the accumulation starts from c.
+    v8u16 a_low  = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
+    v8u16 b_low  = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
+    v8u16 a_high = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
+    v8u16 b_high = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
+    v4u32 acc    = msa_dpadd_u_w(c.val, a_low, b_low);
+    return v_uint32x4(msa_dpadd_u_w(acc, a_high, b_high));
+}
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
+{
+    v8i16 prod = msa_dotp_s_h(a.val, b.val); // 8-bit inputs -> 16-bit partial dot products
+    return v_int32x4(msa_hadd_s32(prod, prod)); // msa_hadd_s32 combines the 16-bit partials into 32-bit lanes
+}
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
+                                  const v_int32x4& c)
+{ return v_dotprod_expand(a, b) + c; } // accumulating overload: adds c to the two-argument result
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{   // View the inputs as 32-bit lanes, split each lane into its low and high 16 bits, and sum the two partial dot products.
+    v4u32 a_low  = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
+    v4u32 b_low  = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
+    v4u32 a_high = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
+    v4u32 b_high = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
+    v2u64 acc    = msa_dotp_u_d(a_low, b_low);
+    return v_uint64x2(msa_dpadd_u_d(acc, a_high, b_high));
+}
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b,
+                                   const v_uint64x2& c)
+{   // Same low/high 16-bit split as the two-argument overload, but the accumulation starts from c.
+    v4u32 a_low  = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
+    v4u32 b_low  = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
+    v4u32 a_high = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
+    v4u32 b_high = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
+    v2u64 acc    = msa_dpadd_u_d(c.val, a_low, b_low);
+    return v_uint64x2(msa_dpadd_u_d(acc, a_high, b_high));
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
+{
+    v4i32 prod = msa_dotp_s_w(a.val, b.val); // 16-bit inputs -> 32-bit partial dot products
+    return v_int64x2(msa_hadd_s64(prod, prod)); // msa_hadd_s64 combines the 32-bit partials into 64-bit lanes
+}
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{ return v_dotprod_expand(a, b) + c; } // accumulating overload: adds c to the two-argument result
+
+// 32 >> 64f
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{ return v_cvt_f64(v_dotprod(a, b)); } // integer v_dotprod result converted with v_cvt_f64 (not defined in this part of the file)
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_dotprod_expand(a, b) + c; } // accumulating overload: adds c to the two-argument result
+
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{ return v_dotprod(a, b); } // every _fast overload in this group simply forwards to its non-fast counterpart
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{ return v_dotprod(a, b, c); }
+
+// 32 >> 64
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod(a, b); }
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{ return v_dotprod(a, b, c); }
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{ return v_dotprod_expand(a, b); }
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{ return v_dotprod_expand(a, b, c); }
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{ return v_dotprod_expand(a, b); }
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{ return v_dotprod_expand(a, b, c); }
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_dotprod_expand(a, b); }
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{ return v_dotprod_expand(a, b, c); }
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{ return v_dotprod_expand(a, b); }
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{ return v_dotprod_expand(a, b, c); }
+
+// 32 >> 64f
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod_expand(a, b); }
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_dotprod_expand(a, b, c); }
+
+#define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \
+OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix)   \
+OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix)   \
+OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix)   \
+inline _Tpvec operator ~ (const _Tpvec& a) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \
+}
+// &, |, ^ (with the compound forms from OPENCV_HAL_IMPL_MSA_BIN_OP) and ~ for every integer vector type; ~ always goes through msa_mvnq_u8 on a v16u8 view.
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint8x16, v16u8, u8)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int8x16, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint16x8, v8u16, u16)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int16x8, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint32x4, v4u32, u32)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int32x4, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64)
+
+#define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \
+inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
+{ \
+    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \
+} \
+inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
+{ \
+    a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \
+    return a; \
+}
+// Bitwise &, |, ^ on v_float32x4, performed on the v4i32 reinterpretation of the operands.
+OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32)
+OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32)
+OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32)
+
+inline v_float32x4 operator ~ (const v_float32x4& a)
+{   // bitwise NOT of the float lanes, computed on their v4i32 reinterpretation
+    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
+}
+
+/* v_abs */
+#define OPENCV_HAL_IMPL_MSA_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
+inline _Tpuvec v_abs(const _Tpsvec& a) \
+{ \
+    return v_reinterpret_as_##usuffix(_Tpsvec(msa_absq_##ssuffix(a.val))); \
+}
+// v_abs for signed 8/16/32-bit vectors; the result is reinterpreted as the unsigned vector of the same lane width.
+OPENCV_HAL_IMPL_MSA_ABS(v_uint8x16, v_int8x16, u8, s8)
+OPENCV_HAL_IMPL_MSA_ABS(v_uint16x8, v_int16x8, u16, s16)
+OPENCV_HAL_IMPL_MSA_ABS(v_uint32x4, v_int32x4, u32, s32)
+
+/* v_abs(float), v_sqrt, v_invsqrt */
+#define OPENCV_HAL_IMPL_MSA_BASIC_FUNC(_Tpvec, func, intrin) \
+inline _Tpvec func(const _Tpvec& a) \
+{ \
+    return _Tpvec(intrin(a.val)); \
+}
+// Unary float helpers: v_abs, v_sqrt and v_invsqrt for v_float32x4 and v_float64x2.
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_abs, msa_absq_f32)
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_abs, msa_absq_f64)
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_sqrt, msa_sqrtq_f32)
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_invsqrt, msa_rsqrtq_f32)
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64)
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64)
+
+#define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \
+inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
+{ \
+    return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \
+} \
+inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
+{ \
+    a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \
+    return a; \
+}
+// Bitwise &, |, ^ on v_float64x2, performed on the v2i64 reinterpretation of the operands.
+OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64)
+OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64)
+OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64)
+
+inline v_float64x2 operator ~ (const v_float64x2& a)
+{   // bitwise NOT of the double lanes; the operand is viewed as v4i32 and inverted with the 32-bit helper msa_mvnq_s32
+    return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
+}
+
+// TODO: exp, log, sin, cos
+
+#define OPENCV_HAL_IMPL_MSA_BIN_FUNC(_Tpvec, func, intrin) \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+// v_min / v_max for all 8/16/32-bit integer vectors and both float vector types.
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_min, msa_minq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_max, msa_maxq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_min, msa_minq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_max, msa_maxq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_min, msa_minq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_max, msa_maxq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_min, msa_minq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_max, msa_maxq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_min, msa_minq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_max, msa_maxq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_min, msa_minq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_max, msa_maxq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_min, msa_minq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_max, msa_maxq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64)
+
+#define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); }
+// Despite the INT in its name the macro is instantiated for float vectors too; not_suffix picks the msa_mvnq_ helper that inverts the == mask for !=.
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int8x16, v16i8, s8, u8)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint16x8, v8u16, u16, u16)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int16x8, v8i16, s16, u16)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint32x4, v4u32, u32, u32)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int32x4, v4i32, s32, u32)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float32x4, v4f32, f32, u32)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint64x2, v2u64, u64, u64)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int64x2, v2i64, s64, u64)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float64x2, v2f64, f64, u64)
+
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ceqq_f32(a.val, a.val))); } // mask from comparing a with itself for equality
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ceqq_f64(a.val, a.val))); } // mask from comparing a with itself for equality
+
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_add_wrap, msa_addq_u8) // _wrap variants (8/16-bit only) use the plain msa_addq_/msa_subq_/msa_mulq_ helpers
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_add_wrap, msa_addq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_add_wrap, msa_addq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_add_wrap, msa_addq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_sub_wrap, msa_subq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_sub_wrap, msa_subq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_sub_wrap, msa_subq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_sub_wrap, msa_subq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_mul_wrap, msa_mulq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_mul_wrap, msa_mulq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_mul_wrap, msa_mulq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_mul_wrap, msa_mulq_s16)
+// v_absdiff for unsigned integer and floating-point vectors (same vector type in and out).
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_absdiff, msa_abdq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_absdiff, msa_abdq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_absdiff, msa_abdq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_absdiff, msa_abdq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_absdiff, msa_abdq_f64)
+
+/** Saturating absolute difference **/
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_absdiffs, msa_qabdq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_absdiffs, msa_qabdq_s16)
+
+#define OPENCV_HAL_IMPL_MSA_BIN_FUNC2(_Tpvec, _Tpvec2, _Tpv, func, intrin) \
+inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec2(MSA_TPV_REINTERPRET(_Tpv, intrin(a.val, b.val))); \
+}
+// v_absdiff on signed vectors returns the unsigned vector type of the same lane width.
+OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int8x16, v_uint8x16, v16u8, v_absdiff, msa_abdq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int16x8, v_uint16x8, v8u16, v_absdiff, msa_abdq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int32x4, v_uint32x4, v4u32, v_absdiff, msa_abdq_s32)
+
+/* v_magnitude, v_sqr_magnitude, v_fma, v_muladd */
+inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
+{   // v_sqrt of a*a accumulated with b*b (msa_mulq_f32 followed by msa_mlaq_f32)
+    const v4f32 sum_sq = msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val);
+    return v_sqrt(v_float32x4(sum_sq));
+}
+
+inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
+{   // same sum as v_magnitude, without the square root
+    return v_float32x4(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val));
+}
+// Multiply-accumulate of a and b onto c; c is passed as the accumulator operand of msa_mlaq_f32.
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    return v_float32x4(msa_mlaq_f32(c.val, a.val, b.val));
+}
+// Integer counterpart of the float v_fma, using msa_mlaq_s32.
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_int32x4(msa_mlaq_s32(c.val, a.val, b.val));
+}
+// v_muladd is an alias: it forwards to v_fma.
+inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    return v_fma(a, b, c);
+}
+// v_muladd is an alias: it forwards to v_fma.
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_fma(a, b, c);
+}
+
+inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
+{   // v_sqrt of a*a accumulated with b*b (msa_mulq_f64 followed by msa_mlaq_f64)
+    const v2f64 sum_sq = msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val);
+    return v_sqrt(v_float64x2(sum_sq));
+}
+
+inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
+{   // same sum as v_magnitude, without the square root
+    return v_float64x2(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
+}
+// Multiply-accumulate of a and b onto c; c is passed as the accumulator operand of msa_mlaq_f64.
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    return v_float64x2(msa_mlaq_f64(c.val, a.val, b.val));
+}
+// v_muladd is an alias: it forwards to v_fma.
+inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    return v_fma(a, b, c);
+}
+
+// Shifts: operator<< / operator>> take a run-time count (trade efficiency for convenience); v_shl/v_shr/v_rshr take it as a template constant.
+#define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
+inline _Tpvec operator << (const _Tpvec& a, int n) \
+{ return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } /* n is cast to _Tps and broadcast into a count vector */ \
+inline _Tpvec operator >> (const _Tpvec& a, int n) \
+{ return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } /* same broadcast, right shift */ \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+{ return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return _Tpvec(msa_shrq_n_##suffix(a.val, n)); } \
+template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
+{ return _Tpvec(msa_rshrq_n_##suffix(a.val, n)); } /* NOTE(review): presumably the rounding right shift -- confirm against msa_rshrq_n_* */
+
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint8x16, u8, schar, s8)   // arguments: vector type, shift-intrinsic suffix, scalar type of the count, suffix of the count vector
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int8x16, s8, schar, s8)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint16x8, u16, short, s16)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int16x8, s16, short, s16)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint32x4, u32, int, s32)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int32x4, s32, int, s32)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint64x2, u64, int64, s64)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int64x2, s64, int64, s64)
+
+/* v_rotate_right, v_rotate_left: built on msa_extq_*; the one-operand forms pair the input with an all-zero vector */
+#define OPENCV_HAL_IMPL_MSA_ROTATE_OP(_Tpvec, _Tpv, _Tpvs, suffix) \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
+{ /* extract at offset n from the pair (a, zero) */ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##suffix(0), n))); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{ /* extract at offset nlanes - n from the pair (zero, a) */ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(msa_dupq_n_##suffix(0), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
+{ /* n == 0 is special-cased; NOTE(review): presumably because offset nlanes - 0 is outside what msa_extq_* accepts -- confirm */ \
+    return a; \
+} \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
+{ /* two-operand form: extract at offset n from the pair (a, b) */ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), n))); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
+{ /* two-operand form: extract at offset nlanes - n from the pair (b, a) */ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, b.val), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ /* n == 0: a is returned unchanged and b is deliberately ignored */ \
+    CV_UNUSED(b); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint8x16, v16u8, v16i8, s8)   // arguments: vector type, its raw type, signed raw type handed to msa_extq_*, intrinsic suffix
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int8x16, v16i8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint16x8, v8u16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int16x8, v8i16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint32x4, v4u32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int32x4, v4i32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float32x4, v4f32, v4i32, s32)  // float lanes are rotated through their signed-integer view
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint64x2, v2u64, v2i64, s64)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int64x2, v2i64, v2i64, s64)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float64x2, v2f64, v2i64, s64)
+
+#define OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) /* same load intrinsic as v_load */ \
+{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \
+inline _Tpvec v_load_low(const _Tp* ptr) /* half-width load combined with a zero half */ \
+{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr), msa_dup_n_##suffix((_Tp)0))); } \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) /* two half-width loads combined */ \
+{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr0), msa_ld1_##suffix(ptr1))); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ msa_st1q_##suffix(ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) /* same store intrinsic as v_store */ \
+{ msa_st1q_##suffix(ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) /* same store intrinsic as v_store */ \
+{ msa_st1q_##suffix(ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) /* the mode argument is ignored */ \
+{ msa_st1q_##suffix(ptr, a.val); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ /* copy lanes [0, nlanes/2) one by one */ \
+    const int half = _Tpvec::nlanes / 2; \
+    for( int lane = 0; lane < half; lane++ ) \
+        ptr[lane] = a.val[lane]; \
+} \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ /* copy lanes [nlanes/2, nlanes) to ptr[0 .. nlanes/2) */ \
+    const int half = _Tpvec::nlanes / 2; \
+    for( int lane = 0; lane < half; lane++ ) \
+        ptr[lane] = a.val[half + lane]; \
+}
+
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int64x2, int64, s64)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float32x4, float, f32)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float64x2, double, f64)
+
+
+/** Reverse **/
+inline v_uint8x16 v_reverse(const v_uint8x16 &a)
+{
+    const v16i8 shuffle_ctrl = (v16i8)((v2i64){0x08090A0B0C0D0E0F, 0x0001020304050607}); // byte indices in descending order, packed into two 64-bit words
+    return v_uint8x16((v16u8)__builtin_msa_vshf_b(shuffle_ctrl, msa_dupq_n_s8(0), (v16i8)a.val));
+}
+
+inline v_int8x16 v_reverse(const v_int8x16 &a) // signed lanes go through the unsigned overload
+{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
+
+inline v_uint16x8 v_reverse(const v_uint16x8 &a)
+{
+    const v8i16 shuffle_ctrl = (v8i16)((v2i64){0x0004000500060007, 0x0000000100020003}); // halfword indices in descending order, packed into two 64-bit words
+    return v_uint16x8((v8u16)__builtin_msa_vshf_h(shuffle_ctrl, msa_dupq_n_s16(0), (v8i16)a.val));
+}
+
+inline v_int16x8 v_reverse(const v_int16x8 &a) // signed lanes go through the unsigned overload
+{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
+
+inline v_uint32x4 v_reverse(const v_uint32x4 &a)
+{
+    // Mirror the four 32-bit lanes: output lane i takes input lane 3 - i.
+    v_uint32x4 mirrored;
+    for (int lane = 0; lane < 4; ++lane)
+        mirrored.val[lane] = a.val[3 - lane];
+
+    return mirrored;
+}
+
+inline v_int32x4 v_reverse(const v_int32x4 &a) // signed lanes go through the unsigned overload
+{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_float32x4 v_reverse(const v_float32x4 &a) // float lanes go through the unsigned overload
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_uint64x2 v_reverse(const v_uint64x2 &a)
+{
+    v_uint64x2 swapped; // the two 64-bit lanes exchanged
+    swapped.val[1] = a.val[0];
+    swapped.val[0] = a.val[1];
+    return swapped;
+}
+
+inline v_int64x2 v_reverse(const v_int64x2 &a) // signed lanes go through the unsigned overload
+{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
+
+inline v_float64x2 v_reverse(const v_float64x2 &a) // double lanes go through the unsigned overload
+{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
+
+
+#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(func, cfunc) \
+inline unsigned short v_reduce_##func(const v_uint16x8& a) \
+{ /* 8 unsigned 16-bit lanes: widen 16 -> 32 -> 64 bits, combining the two halves with func at each step */ \
+    v8u16 lo16, hi16; \
+    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), lo16, hi16); \
+    v4u32 best32 = msa_##func##q_u32(msa_paddlq_u16(lo16), msa_paddlq_u16(hi16)); \
+    v4u32 lo32, hi32; \
+    ILVRL_W2_UW(best32, msa_dupq_n_u32(0), lo32, hi32); \
+    v2u64 best64 = msa_##func##q_u64(msa_paddlq_u32(lo32), msa_paddlq_u32(hi32)); \
+    return (unsigned short)cfunc(best64[0], best64[1]); \
+}
+
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(max, std::max)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(min, std::min)
+
+#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(func, cfunc) \
+inline short v_reduce_##func(const v_int16x8& a) \
+{ /* 8 signed 16-bit lanes: widen 16 -> 32 -> 64 bits, combining the two halves with func at each step */ \
+    v8i16 lo16, hi16; \
+    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), lo16, hi16); \
+    v4i32 best32 = msa_##func##q_s32(msa_paddlq_s16(lo16), msa_paddlq_s16(hi16)); \
+    v4i32 lo32, hi32; \
+    ILVRL_W2_SW(best32, msa_dupq_n_s32(0), lo32, hi32); \
+    v2i64 best64 = msa_##func##q_s64(msa_paddlq_s32(lo32), msa_paddlq_s32(hi32)); \
+    return (short)cfunc(best64[0], best64[1]); \
+}
+
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(max, std::max)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(min, std::min)
+
+#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(_Tpvec, scalartype, func, cfunc) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ /* four lanes: cfunc over lanes (0,1) and (2,3), then cfunc over the two results */ \
+    return (scalartype)cfunc(cfunc(a.val[0], a.val[1]), cfunc(a.val[2], a.val[3])); \
+}
+
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)  // arguments: vector type, returned scalar type, name suffix, scalar combiner
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, max, std::max)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, min, std::min)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, max, std::max)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, min, std::min)
+
+
+#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(_Tpvec, scalartype, _Tpvec2, func) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ /* 16 lanes: v_expand into two wider vectors, combine them lane-wise, then reuse the 8-lane reduction */ \
+    _Tpvec2 lo_half, hi_half; \
+    v_expand(a, lo_half, hi_half); \
+    return (scalartype)v_reduce_##func(v_##func(lo_half, hi_half)); \
+}
+
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_uint8x16, uchar, v_uint16x8, min)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_uint8x16, uchar, v_uint16x8, max)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_int8x16, char, v_int16x8, min)  // NOTE(review): returns plain char (implementation-defined signedness) while the lane type used for v_int8x16 elsewhere is schar -- confirm
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_int8x16, char, v_int16x8, max)
+
+
+
+#define OPENCV_HAL_IMPL_MSA_REDUCE_SUM(_Tpvec, scalartype, suffix) \
+inline scalartype v_reduce_sum(const _Tpvec& a) \
+{ /* delegates to msa_sum_<suffix> and casts the result to scalartype */ \
+    return (scalartype)msa_sum_##suffix(a.val); \
+}
+
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint8x16, unsigned short, u8)  // integer instantiations return a type wider than the lane
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int8x16, short, s8)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint16x8, unsigned, u16)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int16x8, int, s16)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint32x4, uint64_t, u32)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int32x4, int64_t, s32)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_float32x4, float, f32)
+
+inline uint64 v_reduce_sum(const v_uint64x2& a) // two-lane vectors: add lane 0 and lane 1 directly
+{ return (uint64)(msa_getq_lane_u64(a.val, 0) + msa_getq_lane_u64(a.val, 1)); }
+inline int64 v_reduce_sum(const v_int64x2& a) // signed counterpart
+{ return (int64)(msa_getq_lane_s64(a.val, 0) + msa_getq_lane_s64(a.val, 1)); }
+inline double v_reduce_sum(const v_float64x2& a)
+{ // double counterpart: lane 0 + lane 1
+    return msa_getq_lane_f64(a.val, 0) + msa_getq_lane_f64(a.val, 1);
+}
+
+/* v_reduce_sum4, v_reduce_sad */
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+    const v4i32 ai = MSA_TPV_REINTERPRET(v4i32, a.val), bi = MSA_TPV_REINTERPRET(v4i32, b.val);
+    const v4i32 ci = MSA_TPV_REINTERPRET(v4i32, c.val), di = MSA_TPV_REINTERPRET(v4i32, d.val);
+    const v4f32 ab = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(bi, ai)), MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(bi, ai))); // a0+a1 b0+b1 a2+a3 b2+b3
+    const v4f32 cd = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(di, ci)), MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(di, ci))); // c0+c1 d0+d1 c2+c3 d2+d3
+    const v2i64 ab64 = MSA_TPV_REINTERPRET(v2i64, ab), cd64 = MSA_TPV_REINTERPRET(v2i64, cd);
+    // Add the two 64-bit-interleaved arrangements of the pair sums so each output lane holds the full sum of one input.
+    return v_float32x4(msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvrq_s64(cd64, ab64)), MSA_TPV_REINTERPRET(v4f32, msa_ilvlq_s64(cd64, ab64))));
+}
+
+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{   // Absolute differences of the 16 byte lanes, pairwise-widened 8 -> 16 -> 32 bits, then summed.
+    v16u8 abs_diff = msa_abdq_u8(a.val, b.val);
+    v8u16 sums16 = msa_paddlq_u8(abs_diff);
+    v4u32 sums32 = msa_paddlq_u16(sums16);
+    return msa_sum_u32(sums32);
+}
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{   // Signed inputs: the signed difference intrinsic's result is reinterpreted as unsigned bytes first.
+    v16u8 abs_diff = MSA_TPV_REINTERPRET(v16u8, msa_abdq_s8(a.val, b.val));
+    v8u16 sums16 = msa_paddlq_u8(abs_diff);
+    v4u32 sums32 = msa_paddlq_u16(sums16);
+    return msa_sum_u32(sums32);
+}
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{   // Absolute differences of the 8 halfword lanes, pairwise-widened to 32 bits, then summed.
+    v8u16 abs_diff = msa_abdq_u16(a.val, b.val);
+    v4u32 sums32 = msa_paddlq_u16(abs_diff);
+    return msa_sum_u32(sums32);
+}
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{   // Signed inputs: the signed difference intrinsic's result is reinterpreted as unsigned halfwords first.
+    v8u16 abs_diff = MSA_TPV_REINTERPRET(v8u16, msa_abdq_s16(a.val, b.val));
+    v4u32 sums32 = msa_paddlq_u16(abs_diff);
+    return msa_sum_u32(sums32);
+}
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{   // 32-bit lanes need no widening: take the absolute differences and sum them.
+    v4u32 abs_diff = msa_abdq_u32(a.val, b.val);
+    return msa_sum_u32(abs_diff);
+}
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{   // Signed inputs: the signed difference intrinsic's result is reinterpreted as unsigned words first.
+    v4u32 abs_diff = MSA_TPV_REINTERPRET(v4u32, msa_abdq_s32(a.val, b.val));
+    return msa_sum_u32(abs_diff);
+}
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{   // Float variant: result stays in float.
+    v4f32 abs_diff = msa_abdq_f32(a.val, b.val);
+    return msa_sum_f32(abs_diff);
+}
+
+/* v_popcount: per-lane bit counts; signed and unsigned inputs of one width both return the unsigned vector of that width */
+#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(_Tpvec) \
+inline v_uint8x16 v_popcount(const _Tpvec& a) \
+{ /* count through the signed 8-bit view, hand back as unsigned */ \
+    v16u8 counts = MSA_TPV_REINTERPRET(v16u8, msa_cntq_s8(MSA_TPV_REINTERPRET(v16i8, a.val))); \
+    return v_uint8x16(counts); \
+}
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_uint8x16)
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_int8x16)
+
+#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(_Tpvec) \
+inline v_uint16x8 v_popcount(const _Tpvec& a) \
+{ /* count through the signed 16-bit view, hand back as unsigned */ \
+    v8u16 counts = MSA_TPV_REINTERPRET(v8u16, msa_cntq_s16(MSA_TPV_REINTERPRET(v8i16, a.val))); \
+    return v_uint16x8(counts); \
+}
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_uint16x8)
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_int16x8)
+
+#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(_Tpvec) \
+inline v_uint32x4 v_popcount(const _Tpvec& a) \
+{ /* count through the signed 32-bit view, hand back as unsigned */ \
+    v4u32 counts = MSA_TPV_REINTERPRET(v4u32, msa_cntq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))); \
+    return v_uint32x4(counts); \
+}
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_uint32x4)
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_int32x4)
+
+#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(_Tpvec) \
+inline v_uint64x2 v_popcount(const _Tpvec& a) \
+{ /* count through the signed 64-bit view, hand back as unsigned */ \
+    v2u64 counts = MSA_TPV_REINTERPRET(v2u64, msa_cntq_s64(MSA_TPV_REINTERPRET(v2i64, a.val))); \
+    return v_uint64x2(counts); \
+}
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_uint64x2)
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_int64x2)
+
+inline int v_signmask(const v_uint8x16& a)
+{   // Top bit of each byte is shifted down to bit 0, then left by the per-lane amounts 0..7; each 8-lane half is summed and the second half lands in bits 8..15.
+    v8i8 lane_shift = msa_create_s8(CV_BIG_UINT(0x0706050403020100));
+    v16u8 placed = msa_shlq_u8(msa_shrq_n_u8(a.val, 7), msa_combine_s8(lane_shift, lane_shift));
+    v8u16 sums16 = msa_paddlq_u8(placed);
+    v4u32 sums32 = msa_paddlq_u16(sums16);
+    v2u64 sums64 = msa_paddlq_u32(sums32);
+    return (int)msa_getq_lane_u64(sums64, 0) + ((int)msa_getq_lane_u64(sums64, 1) << 8);
+}
+inline int v_signmask(const v_int8x16& a) // signed lanes go through the unsigned overload
+{ return v_signmask(v_reinterpret_as_u8(a)); }
+
+inline int v_signmask(const v_uint16x8& a)
+{   // Top bit of each halfword is shifted down to bit 0, then left by the per-lane amounts 0..3; each 4-lane half is summed and the second half lands in bits 4..7.
+    v4i16 lane_shift = msa_create_s16(CV_BIG_UINT(0x0003000200010000));
+    v8u16 placed = msa_shlq_u16(msa_shrq_n_u16(a.val, 15), msa_combine_s16(lane_shift, lane_shift));
+    v4u32 sums32 = msa_paddlq_u16(placed);
+    v2u64 sums64 = msa_paddlq_u32(sums32);
+    return (int)msa_getq_lane_u64(sums64, 0) + ((int)msa_getq_lane_u64(sums64, 1) << 4);
+}
+inline int v_signmask(const v_int16x8& a) // signed lanes go through the unsigned overload
+{ return v_signmask(v_reinterpret_as_u16(a)); }
+
+inline int v_signmask(const v_uint32x4& a)
+{   // Top bit of each word is shifted down to bit 0, then left by the per-lane amounts 0..1; each 2-lane half is summed and the second half lands in bits 2..3.
+    v2i32 lane_shift = msa_create_s32(CV_BIG_UINT(0x0000000100000000));
+    v4u32 placed = msa_shlq_u32(msa_shrq_n_u32(a.val, 31), msa_combine_s32(lane_shift, lane_shift));
+    v2u64 sums64 = msa_paddlq_u32(placed);
+    return (int)msa_getq_lane_u64(sums64, 0) + ((int)msa_getq_lane_u64(sums64, 1) << 2);
+}
+inline int v_signmask(const v_int32x4& a) // signed lanes go through the unsigned overload
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+inline int v_signmask(const v_float32x4& a) // float lanes go through the unsigned overload
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+
+inline int v_signmask(const v_uint64x2& a)
+{   // Two lanes: top bit of lane 0 becomes bit 0 of the result, top bit of lane 1 becomes bit 1.
+    v2u64 top_bits = msa_shrq_n_u64(a.val, 63);
+    return (int)msa_getq_lane_u64(top_bits, 0) + ((int)msa_getq_lane_u64(top_bits, 1) << 1);
+}
+inline int v_signmask(const v_int64x2& a) // signed lanes go through the unsigned overload
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+inline int v_signmask(const v_float64x2& a) // double lanes go through the unsigned overload
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+
+inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); } // every overload: trailingZeros32 (defined elsewhere) applied to the lane sign mask
+inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); } // NOTE(review): result for an all-zero mask depends on trailingZeros32 -- confirm
+
+#define OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(_Tpvec, _Tpvec2, suffix, shift) \
+inline bool v_check_all(const v_##_Tpvec& a) \
+{ /* complement a and keep only each lane's top bit: true when no lane has anything left */ \
+    _Tpvec2 missing_msb = msa_shrq_n_##suffix(msa_mvnq_##suffix(a.val), shift); \
+    v2u64 halves = MSA_TPV_REINTERPRET(v2u64, missing_msb); \
+    return (msa_getq_lane_u64(halves, 0) | msa_getq_lane_u64(halves, 1)) == 0; \
+} \
+inline bool v_check_any(const v_##_Tpvec& a) \
+{ /* keep only each lane's top bit: true when at least one lane has it set */ \
+    _Tpvec2 present_msb = msa_shrq_n_##suffix(a.val, shift); \
+    v2u64 halves = MSA_TPV_REINTERPRET(v2u64, present_msb); \
+    return (msa_getq_lane_u64(halves, 0) | msa_getq_lane_u64(halves, 1)) != 0; \
+}
+
+OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint8x16, v16u8, u8, 7)
+OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint16x8, v8u16, u16, 15)
+OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint32x4, v4u32, u32, 31)
+OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint64x2, v2u64, u64, 63)
+
+inline bool v_check_all(const v_int8x16& a) // signed and float overloads below reinterpret to the unsigned vector of the same lane width
+{ return v_check_all(v_reinterpret_as_u8(a)); }
+inline bool v_check_all(const v_int16x8& a)
+{ return v_check_all(v_reinterpret_as_u16(a)); }
+inline bool v_check_all(const v_int32x4& a)
+{ return v_check_all(v_reinterpret_as_u32(a)); }
+inline bool v_check_all(const v_float32x4& a)
+{ return v_check_all(v_reinterpret_as_u32(a)); }
+
+inline bool v_check_any(const v_int8x16& a) // same forwarding scheme for v_check_any
+{ return v_check_any(v_reinterpret_as_u8(a)); }
+inline bool v_check_any(const v_int16x8& a)
+{ return v_check_any(v_reinterpret_as_u16(a)); }
+inline bool v_check_any(const v_int32x4& a)
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+inline bool v_check_any(const v_float32x4& a)
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+
+inline bool v_check_all(const v_int64x2& a) // 64-bit lanes: forwarded to the v_uint64x2 overloads
+{ return v_check_all(v_reinterpret_as_u64(a)); }
+inline bool v_check_all(const v_float64x2& a)
+{ return v_check_all(v_reinterpret_as_u64(a)); }
+inline bool v_check_any(const v_int64x2& a)
+{ return v_check_any(v_reinterpret_as_u64(a)); }
+inline bool v_check_any(const v_float64x2& a)
+{ return v_check_any(v_reinterpret_as_u64(a)); }
+
+/* v_select: bit-select between a and b under mask, done on the byte view of all three operands */
+#define OPENCV_HAL_IMPL_MSA_SELECT(_Tpvec, _Tpv, _Tpvu) \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ /* operands reach msa_bslq_u8 in the order (mask, b, a) */ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_bslq_u8(MSA_TPV_REINTERPRET(_Tpvu, mask.val), \
+                  MSA_TPV_REINTERPRET(_Tpvu, b.val), MSA_TPV_REINTERPRET(_Tpvu, a.val)))); \
+}
+
+OPENCV_HAL_IMPL_MSA_SELECT(v_uint8x16, v16u8, v16u8)  // arguments: vector type, its raw type, byte view used for the select (always v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_int8x16, v16i8, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_uint16x8, v8u16, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_int16x8, v8i16, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_uint32x4, v4u32, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_int32x4, v4i32, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_float32x4, v4f32, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_float64x2, v2f64, v16u8)
+
+#define OPENCV_HAL_IMPL_MSA_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix, ssuffix, _Tpv, _Tpvs) \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ /* interleave each half of a with zeros, then pairwise-add-widen so b0/b1 hold the widened halves */ \
+    _Tpv right_half = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
+    _Tpv left_half = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
+    b0.val = msa_paddlq_##suffix(right_half); \
+    b1.val = msa_paddlq_##suffix(left_half); \
+} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) \
+{ /* same computation as b0 of v_expand */ \
+    _Tpv right_half = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
+    return _Tpwvec(msa_paddlq_##suffix(right_half)); \
+} \
+inline _Tpwvec v_expand_high(const _Tpvec& a) \
+{ /* same computation as b1 of v_expand */ \
+    _Tpv left_half = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
+    return _Tpwvec(msa_paddlq_##suffix(left_half)); \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ /* half-width load widened by msa_movl_* */ \
+    return _Tpwvec(msa_movl_##suffix(msa_ld1_##suffix(ptr))); \
+}
+
+OPENCV_HAL_IMPL_MSA_EXPAND(v_uint8x16, v_uint16x8, uchar, u8, s8, v16u8, v16i8)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_int8x16, v_int16x8, schar, s8, s8, v16i8, v16i8)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_uint16x8, v_uint32x4, ushort, u16, s16, v8u16, v8i16)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_int16x8, v_int32x4, short, s16, s16, v8i16, v8i16)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_uint32x4, v_uint64x2, uint, u32, s32, v4u32, v4i32)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_int32x4, v_int64x2, int, s32, s32, v4i32, v4i32)
+
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{ // Reads exactly four bytes, ptr[0..3], and places each in its own 32-bit lane.
+    return v_uint32x4((v4u32){ptr[0], ptr[1], ptr[2], ptr[3]});
+}
+
+inline v_int32x4 v_load_expand_q(const schar* ptr)
+{ // Signed counterpart: four signed bytes, each converted to a 32-bit lane.
+    return v_int32x4((v4i32){ptr[0], ptr[1], ptr[2], ptr[3]});
+}
+
+/* v_zip, v_combine_low, v_combine_high, v_recombine: all built on the msa_ilvrq_* / msa_ilvlq_* interleave intrinsics */
+#define OPENCV_HAL_IMPL_MSA_UNPACKS(_Tpvec, _Tpv, _Tpvs, ssuffix) \
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
+{ /* lane-width interleave of (a1, a0): b0 from msa_ilvrq, b1 from msa_ilvlq */ \
+    b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
+    b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
+} \
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
+{ /* 64-bit interleave of (b, a) via msa_ilvrq_s64, independent of the lane width */ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
+} \
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
+{ /* 64-bit interleave of (b, a) via msa_ilvlq_s64 */ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
+} \
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
+{ /* c uses the v_combine_low expression, d the v_combine_high expression */ \
+    c.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
+    d.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
+}
+
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint8x16, v16u8, v16i8, s8)   // arguments: vector type, its raw type, signed raw type for the interleave, intrinsic suffix
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_int8x16, v16i8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint16x8, v8u16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_int16x8, v8i16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint32x4, v4u32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_int32x4, v4i32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_float32x4, v4f32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_float64x2, v2f64, v2i64, s64)
+
+/* v_extract<s>(a, b): msa_extq_* applied to the pair (a, b) at compile-time offset s */
+#define OPENCV_HAL_IMPL_MSA_EXTRACT(_Tpvec, _Tpv, _Tpvs, suffix) \
+template <int s> \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+{ /* both inputs are viewed as _Tpvs for the intrinsic and the result is viewed back as _Tpv */ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), s))); \
+}
+
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint8x16, v16u8, v16i8, s8)   // arguments: vector type, its raw type, signed raw type handed to msa_extq_*, intrinsic suffix
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_int8x16, v16i8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint16x8, v8u16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_int16x8, v8i16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint32x4, v4u32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_int32x4, v4i32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint64x2, v2u64, v2i64, s64)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_int64x2, v2i64, v2i64, s64)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_float32x4, v4f32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_float64x2, v2f64, v2i64, s64)
+
+/* v_round, v_floor, v_ceil, v_trunc */
+inline v_int32x4 v_round(const v_float32x4& a)
+{   // Float -> int32 per lane via msa_cvttintq_s32_f32.
+    return v_int32x4(msa_cvttintq_s32_f32(a.val));
+}
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{
+    v4i32 converted = msa_cvttintq_s32_f32(a.val); // adding the compare mask lowers lanes whose converted value exceeds a
+    return v_int32x4(msa_addq_s32(converted, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(msa_cvtfintq_f32_s32(converted), a.val))));
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{
+    v4i32 converted = msa_cvttintq_s32_f32(a.val); // subtracting the compare mask raises lanes whose converted value is below a
+    return v_int32x4(msa_subq_s32(converted, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(a.val, msa_cvtfintq_f32_s32(converted)))));
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a)
+{   // Float -> int32 per lane via the truncating conversion msa_cvttruncq_s32_f32.
+    return v_int32x4(msa_cvttruncq_s32_f32(a.val));
+}
+
+inline v_int32x4 v_round(const v_float64x2& a)
+{   // Two doubles -> two int64, packed to int32 together with a zero vector.
+    return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_dupq_n_s64(0)));
+}
+
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{   // Four doubles from a and b -> int64 each, packed into one int32 vector.
+    return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_cvttintq_s64_f64(b.val)));
+}
+
+inline v_int32x4 v_floor(const v_float64x2& a)
+{
+    v2f64 integral = msa_cvtrintq_f64(a.val); // integral-valued doubles; adding the compare mask lowers lanes where integral > a
+    return v_int32x4(msa_pack_s64(msa_addq_s64(msa_cvttruncq_s64_f64(integral), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(integral, a.val))), msa_dupq_n_s64(0)));
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a)
+{
+    v2f64 integral = msa_cvtrintq_f64(a.val); // integral-valued doubles; subtracting the compare mask raises lanes where a > integral
+    return v_int32x4(msa_pack_s64(msa_subq_s64(msa_cvttruncq_s64_f64(integral), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a.val, integral))), msa_dupq_n_s64(0)));
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a)
+{   // Truncating double -> int64 conversion, packed to int32 together with a zero vector.
+    return v_int32x4(msa_pack_s64(msa_cvttruncq_s64_f64(a.val), msa_dupq_n_s64(0)));
+}
+
+#define OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(_Tpvec, _Tpv, _Tpvs, ssuffix) \
+inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
+                           const _Tpvec& a2, const _Tpvec& a3, \
+                           _Tpvec& b0, _Tpvec& b1, \
+                           _Tpvec& b2, _Tpvec& b3) \
+{ /* stage 1: 32-bit interleave of rows (a0,a1) and (a2,a3); stage 2: 64-bit interleave of the stage-1 results */ \
+    _Tpv rows01_r = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
+    _Tpv rows01_l = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
+    _Tpv rows23_r = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \
+    _Tpv rows23_l = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \
+    b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, rows23_r), MSA_TPV_REINTERPRET(v2i64, rows01_r))); \
+    b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, rows23_r), MSA_TPV_REINTERPRET(v2i64, rows01_r))); \
+    b2.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, rows23_l), MSA_TPV_REINTERPRET(v2i64, rows01_l))); \
+    b3.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, rows23_l), MSA_TPV_REINTERPRET(v2i64, rows01_l))); \
+}
+
+OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_uint32x4, v4u32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_int32x4, v4i32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_float32x4, v4f32, v4i32, s32)
+
+#define OPENCV_HAL_IMPL_MSA_INTERLEAVED(_Tpvec, _Tp, suffix) \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
+{ /* 2-channel load: msa_ld2q_* fills a and b through pointers to their raw vectors */ \
+    msa_ld2q_##suffix(ptr, &a.val, &b.val); \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
+{ /* 3-channel load via msa_ld3q_* */ \
+    msa_ld3q_##suffix(ptr, &a.val, &b.val, &c.val); \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
+                                v_##_Tpvec& c, v_##_Tpvec& d) \
+{ /* 4-channel load via msa_ld4q_* */ \
+    msa_ld4q_##suffix(ptr, &a.val, &b.val, &c.val, &d.val); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ /* 2-channel store via msa_st2q_*; the store mode is accepted but ignored */ \
+    msa_st2q_##suffix(ptr, a.val, b.val); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ /* 3-channel store via msa_st3q_*; the store mode is accepted but ignored */ \
+    msa_st3q_##suffix(ptr, a.val, b.val, c.val); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
+{ /* 4-channel store via msa_st4q_*; the store mode is accepted but ignored */ \
+    msa_st4q_##suffix(ptr, a.val, b.val, c.val, d.val); \
+}
+
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint8x16, uchar, u8)  // arguments: vector type without the v_ prefix, element type, intrinsic suffix
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(int8x16, schar, s8)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(int16x8, short, s16)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(int32x4, int, s32)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(float32x4, float, f32)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(int64x2, int64, s64)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(float64x2, double, f64)
+
+/* v_cvt_f32, v_cvt_f64, v_cvt_f64_high */
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{   // int32 -> float per lane.
+    return v_float32x4(msa_cvtfintq_f32_s32(a.val));
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{   // Two doubles narrowed to float; the second conversion operand is a zero vector.
+    return v_float32x4(msa_cvtfq_f32_f64(a.val, msa_dupq_n_f64(0.0)));
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{   // Four doubles (a, then b) narrowed into one float vector.
+    return v_float32x4(msa_cvtfq_f32_f64(a.val, b.val));
+}
+
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{ // int32 -> float first, then msa_cvtflq_f64_f32 widens two of the float lanes to double
+    return v_float64x2(msa_cvtflq_f64_f32(msa_cvtfintq_f32_s32(a.val)));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{ // same two-step conversion, using msa_cvtfhq_f64_f32 for the other two lanes
+    return v_float64x2(msa_cvtfhq_f64_f32(msa_cvtfintq_f32_s32(a.val)));
+}
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{ // float -> double via msa_cvtflq_f64_f32
+    return v_float64x2(msa_cvtflq_f64_f32(a.val));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{ // float -> double via msa_cvtfhq_f64_f32
+    return v_float64x2(msa_cvtfhq_f64_f32(a.val));
+}
+
+inline v_float64x2 v_cvt_f64(const v_int64x2& a)
+{ // int64 -> double per lane
+    return v_float64x2(msa_cvtfintq_f64_s64(a.val));
+}
+
+////////////// Lookup table access ////////////////////
+inline v_int8x16 v_lut(const schar* tab, const int* idx)
+{
+    // Gather: lane i = tab[idx[i]] for i in 0..15, staged in an aligned scratch array and loaded as one vector.
+    schar CV_DECL_ALIGNED(32) gathered[16] = {
+        tab[idx[ 0]],
+        tab[idx[ 1]],
+        tab[idx[ 2]],
+        tab[idx[ 3]],
+        tab[idx[ 4]],
+        tab[idx[ 5]],
+        tab[idx[ 6]],
+        tab[idx[ 7]],
+        tab[idx[ 8]],
+        tab[idx[ 9]],
+        tab[idx[10]],
+        tab[idx[11]],
+        tab[idx[12]],
+        tab[idx[13]],
+        tab[idx[14]],
+        tab[idx[15]]
+    };
+    return v_int8x16(msa_ld1q_s8(gathered));
+}
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
+{
+    // Gather 8 pairs: idx[k] selects two consecutive table entries, tab[idx[k]] and tab[idx[k] + 1], for k in 0..7.
+    schar CV_DECL_ALIGNED(32) gathered[16] = {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[3]],
+        tab[idx[3] + 1],
+        tab[idx[4]],
+        tab[idx[4] + 1],
+        tab[idx[5]],
+        tab[idx[5] + 1],
+        tab[idx[6]],
+        tab[idx[6] + 1],
+        tab[idx[7]],
+        tab[idx[7] + 1]
+    };
+    return v_int8x16(msa_ld1q_s8(gathered));
+}
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
+{
+    // Gather 4 quads: idx[k] selects four consecutive table entries, tab[idx[k]] .. tab[idx[k] + 3], for k in 0..3.
+    schar CV_DECL_ALIGNED(32) gathered[16] = {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[0] + 2],
+        tab[idx[0] + 3],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[1] + 2],
+        tab[idx[1] + 3],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[2] + 2],
+        tab[idx[2] + 3],
+        tab[idx[3]],
+        tab[idx[3] + 1],
+        tab[idx[3] + 2],
+        tab[idx[3] + 3]
+    };
+    return v_int8x16(msa_ld1q_s8(gathered));
+}
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut(reinterpret_cast<const schar*>(tab), idx)); } // unsigned tables reuse the schar gathers; the cast keeps tab const
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs(reinterpret_cast<const schar*>(tab), idx)); }
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads(reinterpret_cast<const schar*>(tab), idx)); }
+
+
+inline v_int16x8 v_lut(const short* tab, const int* idx)
+{
+    // Gather: lane i = tab[idx[i]] for i in 0..7, staged in an aligned scratch array and loaded as one vector.
+    short CV_DECL_ALIGNED(32) gathered[8] = {
+        tab[idx[0]],
+        tab[idx[1]],
+        tab[idx[2]],
+        tab[idx[3]],
+        tab[idx[4]],
+        tab[idx[5]],
+        tab[idx[6]],
+        tab[idx[7]]
+    };
+    return v_int16x8(msa_ld1q_s16(gathered));
+}
+// Gathers four pairs of consecutive 16-bit values from a table.
+// Output lanes 2k and 2k+1 hold tab[idx[k]] and tab[idx[k] + 1] for k = 0..3.
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
+{
+    // Start address of every pair inside the table.
+    const short* const p0 = tab + idx[0];
+    const short* const p1 = tab + idx[1];
+    const short* const p2 = tab + idx[2];
+    const short* const p3 = tab + idx[3];
+
+    // Stage the values in lane order before handing the buffer to msa_ld1q_s16.
+    short CV_DECL_ALIGNED(32) staged[8] =
+        { p0[0], p0[1], p1[0], p1[1], p2[0], p2[1], p3[0], p3[1] };
+    return v_int16x8(msa_ld1q_s16(staged));
+}
+// Joins the msa_ld1_s16 load at tab + idx[0] (first half) with the one at tab + idx[1] (second half).
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx) {
+    return v_int16x8(msa_combine_s16(msa_ld1_s16(&tab[idx[0]]), msa_ld1_s16(&tab[idx[1]])));
+}
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut(reinterpret_cast<const short*>(tab), idx)); } // unsigned lookups reuse the signed gathers
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs(reinterpret_cast<const short*>(tab), idx)); }
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads(reinterpret_cast<const short*>(tab), idx)); }
+
+// Gathers four 32-bit integers: output lane k holds tab[idx[k]] for k = 0..3.
+inline v_int32x4 v_lut(const int* tab, const int* idx)
+{
+    // Resolve the four table positions up front.
+    const int i0 = idx[0], i1 = idx[1];
+    const int i2 = idx[2], i3 = idx[3];
+
+    // Stage the selected elements in lane order for msa_ld1q_s32.
+    int CV_DECL_ALIGNED(32) staged[4] = { tab[i0], tab[i1], tab[i2], tab[i3] };
+    return v_int32x4(msa_ld1q_s32(staged));
+}
+// Joins the msa_ld1_s32 load at tab + idx[0] (first half) with the one at tab + idx[1] (second half).
+inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) {
+    return v_int32x4(msa_combine_s32(msa_ld1_s32(&tab[idx[0]]), msa_ld1_s32(&tab[idx[1]])));
+}
+// A single msa_ld1q_s32 load starting at tab[idx[0]]; only idx[0] is consulted.
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx) {
+    return v_int32x4(msa_ld1q_s32(&tab[idx[0]]));
+}
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut(reinterpret_cast<const int*>(tab), idx)); } // unsigned lookups reuse the signed gathers
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs(reinterpret_cast<const int*>(tab), idx)); }
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads(reinterpret_cast<const int*>(tab), idx)); }
+
+// Builds the vector from tab[idx[0]] (first element) and tab[idx[1]] (second element).
+inline v_int64x2 v_lut(const int64_t* tab, const int* idx) {
+    return v_int64x2(msa_combine_s64(msa_create_s64(*(tab + idx[0])), msa_create_s64(*(tab + idx[1]))));
+}
+// A single msa_ld1q_s64 load starting at tab[idx[0]]; only idx[0] is consulted.
+inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) {
+    return v_int64x2(msa_ld1q_s64(&tab[idx[0]]));
+}
+inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut(reinterpret_cast<const int64_t*>(tab), idx)); } // unsigned lookups reuse the signed gathers
+inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs(reinterpret_cast<const int64_t*>(tab), idx)); }
+
+// Gathers four floats: output lane k holds tab[idx[k]] for k = 0..3.
+inline v_float32x4 v_lut(const float* tab, const int* idx)
+{
+    float CV_DECL_ALIGNED(32) staged[4];
+
+    // Copy the selected elements into lane order.
+    for (int lane = 0; lane < 4; ++lane)
+        staged[lane] = tab[idx[lane]];
+
+    return v_float32x4(msa_ld1q_f32(staged));
+}
+// Gathers two pairs of consecutive floats: lanes 0..1 hold tab[idx[0]] and
+// tab[idx[0] + 1], lanes 2..3 hold tab[idx[1]] and tab[idx[1] + 1].
+// The elements are copied one float at a time, so tab + idx[k] needs no
+// 8-byte alignment and the table is only ever accessed as float.
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
+{
+    float CV_DECL_ALIGNED(32) elems[4] = { tab[idx[0]], tab[idx[0] + 1], tab[idx[1]], tab[idx[1] + 1] };
+    return v_float32x4(msa_ld1q_f32(elems));
+}
+// A single msa_ld1q_f32 load starting at tab[idx[0]]; only idx[0] is consulted.
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx) {
+    return v_float32x4(msa_ld1q_f32(&tab[idx[0]]));
+}
+
+// Gathers four ints whose table positions are the four lanes of idxvec.
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+    int CV_DECL_ALIGNED(32) lanes[4];
+    v_store_aligned(lanes, idxvec);  // spill the indices so they can be used as scalars
+    return v_int32x4(tab[lanes[0]], tab[lanes[1]], tab[lanes[2]], tab[lanes[3]]);
+}
+
+// Gathers four unsigned ints whose table positions are the four lanes of idxvec.
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
+{
+    // Read each index straight out of its lane.
+    const int i0 = msa_getq_lane_s32(idxvec.val, 0);
+    const int i1 = msa_getq_lane_s32(idxvec.val, 1);
+    const int i2 = msa_getq_lane_s32(idxvec.val, 2);
+    const int i3 = msa_getq_lane_s32(idxvec.val, 3);
+    unsigned CV_DECL_ALIGNED(32) staged[4] = { tab[i0], tab[i1], tab[i2], tab[i3] };
+    return v_uint32x4(msa_ld1q_u32(staged));
+}
+
+// Gathers four floats whose table positions are the four lanes of idxvec.
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+    int CV_DECL_ALIGNED(32) lanes[4];
+    v_store_aligned(lanes, idxvec);  // spill the indices so they can be used as scalars
+    return v_float32x4(tab[lanes[0]], tab[lanes[1]], tab[lanes[2]], tab[lanes[3]]);
+}
+
+// Loads a float pair at each tab + idx[k]; x is built from the even 32-bit elements of the loaded data (msa_ilvevq_s32), y from the odd ones (msa_ilvodq_s32).
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+    int CV_DECL_ALIGNED(32) idx[4];
+    v_store_aligned(idx, idxvec);
+    const v4i32 xy02 = MSA_TPV_REINTERPRET(v4i32, msa_combine_f32(msa_ld1_f32(tab + idx[0]), msa_ld1_f32(tab + idx[2])));
+    const v4i32 xy13 = MSA_TPV_REINTERPRET(v4i32, msa_combine_f32(msa_ld1_f32(tab + idx[1]), msa_ld1_f32(tab + idx[3])));
+    x = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(xy13, xy02)));
+    y = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(xy13, xy02)));
+}
+
+// Byte shuffle with selector 0,2,1,3, 4,6,5,7, ... (least significant byte first): the two middle bytes of each group of four trade places.
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
+{
+    return v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0705060403010200, 0x0F0D0E0C0B090A08}), msa_dupq_n_s8(0), vec.val));
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
+{ const v_int8x16 shuffled = v_interleave_pairs(v_reinterpret_as_s8(vec)); return v_reinterpret_as_u8(shuffled); }
+// Byte shuffle with selector 0,4,1,5, 2,6,3,7 per 8-byte half (least significant byte first): interleaves the two 4-byte groups of each half.
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
+{
+    return v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0703060205010400, 0x0F0B0E0A0D090C08}), msa_dupq_n_s8(0), vec.val));
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { const v_int8x16 shuffled = v_interleave_quads(v_reinterpret_as_s8(vec)); return v_reinterpret_as_u8(shuffled); }
+
+// 16-bit shuffle with selector 0,2,1,3, 4,6,5,7 (least significant element first): the two middle elements of each group of four trade places.
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
+{
+    return v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0003000100020000, 0x0007000500060004}), msa_dupq_n_s16(0), vec.val));
+}
+
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { const v_int16x8 shuffled = v_interleave_pairs(v_reinterpret_as_s16(vec)); return v_reinterpret_as_u16(shuffled); }
+
+// 16-bit shuffle with selector 0,4,1,5, 2,6,3,7 (least significant element first): interleaves the lower four elements with the upper four.
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
+{
+    return v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0005000100040000, 0x0007000300060002}), msa_dupq_n_s16(0), vec.val));
+}
+
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { const v_int16x8 shuffled = v_interleave_quads(v_reinterpret_as_s16(vec)); return v_reinterpret_as_u16(shuffled); }
+
+// Swaps the two middle 32-bit lanes: (a0, a1, a2, a3) -> (a0, a2, a1, a3).
+// The unsigned and float overloads below run the same shuffle on
+// reinterpreted bits.
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
+{
+    const int a0 = vec.val[0], a1 = vec.val[1];
+    const int a2 = vec.val[2], a3 = vec.val[3];
+    return v_int32x4(a0, a2, a1, a3);
+}
+
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { const v_int32x4 shuffled = v_interleave_pairs(v_reinterpret_as_s32(vec)); return v_reinterpret_as_u32(shuffled); }
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { const v_int32x4 shuffled = v_interleave_pairs(v_reinterpret_as_s32(vec)); return v_reinterpret_as_f32(shuffled); }
+
+// Byte shuffle with selector 0,1,2, 4,5,6, 8,9,10, 12,13,14, 16..19: every fourth byte is skipped; selectors 16..19 fall outside vec (presumably the zero operand - confirm against the MSA vshf definition).
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{
+    return v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0908060504020100, 0x131211100E0D0C0A}), msa_dupq_n_s8(0), vec.val));
+}
+
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { const v_int8x16 packed = v_pack_triplets(v_reinterpret_as_s8(vec)); return v_reinterpret_as_u8(packed); }
+
+// 16-bit shuffle with selector 0,1,2, 4,5,6, 8,9: every fourth element is skipped; selectors 8..9 fall outside vec (presumably the zero operand - confirm). The 32-bit overloads return their input unchanged.
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{
+    return v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000200010000, 0x0009000800060005}), msa_dupq_n_s16(0), vec.val));
+}
+
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { const v_int16x8 packed = v_pack_triplets(v_reinterpret_as_s16(vec)); return v_reinterpret_as_u16(packed); }
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
+
+// Gathers two doubles: lane 0 holds tab[idx[0]], lane 1 holds tab[idx[1]].
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{
+    // Read both elements first, then stage them in lane order.
+    const double first = tab[idx[0]];
+    const double second = tab[idx[1]];
+    double CV_DECL_ALIGNED(32) staged[2] = { first, second };
+    return v_float64x2(msa_ld1q_f64(staged));
+}
+
+// A single msa_ld1q_f64 load starting at tab[idx[0]]; only idx[0] is consulted.
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) {
+    return v_float64x2(msa_ld1q_f64(&tab[idx[0]]));
+}
+
+// Gathers two doubles whose table positions are lanes 0 and 1 of idxvec.
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    int CV_DECL_ALIGNED(32) lanes[4];  // all four lanes are stored; only the first two are used
+    v_store_aligned(lanes, idxvec);
+    return v_float64x2(tab[lanes[0]], tab[lanes[1]]);
+}
+
+// Loads two doubles at tab + idx[0] and at tab + idx[1]; x is built from the even 64-bit elements (msa_ilvevq_s64), y from the odd ones (msa_ilvodq_s64).
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    int CV_DECL_ALIGNED(32) idx[4];
+    v_store_aligned(idx, idxvec);
+    const v2i64 xy0 = MSA_TPV_REINTERPRET(v2i64, msa_ld1q_f64(tab + idx[0]));
+    const v2i64 xy1 = MSA_TPV_REINTERPRET(v2i64, msa_ld1q_f64(tab + idx[1]));
+    x = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvevq_s64(xy1, xy0)));
+    y = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvodq_s64(xy1, xy0)));
+}
+
+// Returns one lane of a: the vector is rotated right by i with v_rotate_right, then get0() reads lane 0.
+template<int i, typename _Tp>
+inline typename _Tp::lane_type v_extract_n(const _Tp& a) {
+    return v_rotate_right<i>(a).get0();
+}
+
+// Replicates one lane of a into every lane: the scalar is read with
+// v_extract_n<i>() and splatted with the v_setall_* helper of the same type.
+// One overload per 32-bit vector type (unsigned, signed, float).
+template<int i>
+inline v_uint32x4 v_broadcast_element(const v_uint32x4& a) {
+    return v_setall_u32(v_extract_n<i>(a));
+}
+template<int i>
+inline v_int32x4 v_broadcast_element(const v_int32x4& a) {
+    return v_setall_s32(v_extract_n<i>(a));
+}
+template<int i>
+inline v_float32x4 v_broadcast_element(const v_float32x4& a) {
+    return v_setall_f32(v_extract_n<i>(a));
+}
+
+////// FP16 support ///////
+#if CV_FP16
+inline v_float32x4 v_load_expand(const hfloat* ptr) // CV_FP16 path: loads a v4f16 from ptr and converts it with msa_cvt_f32_f16
+{
+#ifndef msa_ld1_f16
+    v4f16 v = (v4f16)msa_ld1_s16((const short*)ptr); // no f16 load macro: load through the 16-bit integer load and cast the vector type
+#else
+    v4f16 v = msa_ld1_f16((const __fp16*)ptr);
+#endif
+    return v_float32x4(msa_cvt_f32_f16(v));
+}
+
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v) // CV_FP16 path: converts v with msa_cvt_f16_f32 and stores the result at ptr
+{
+    v4f16 hv = msa_cvt_f16_f32(v.val);
+
+#ifndef msa_st1_f16
+    msa_st1_s16((short*)ptr, (int16x4_t)hv); // NOTE(review): int16x4_t is a NEON type name; confirm it exists for MSA builds (v4i16 may be intended)
+#else
+    msa_st1_f16((__fp16*)ptr, hv);
+#endif
+}
+#else
+// Scalar fallback: converts four hfloat values to float one by one, then loads them with v_load.
+inline v_float32x4 v_load_expand(const hfloat* ptr)
+{
+    float widened[4] = { (float)ptr[0], (float)ptr[1],
+                         (float)ptr[2], (float)ptr[3] };
+    return v_load(widened);
+}
+
+// Scalar fallback: spills the four floats with v_store and converts each one to hfloat at ptr.
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
+{
+    float spilled[4];
+    v_store(spilled, v);
+    for (int lane = 0; lane != 4; ++lane) ptr[lane] = (hfloat)spilled[lane];
+}
+#endif
+
+inline void v_cleanup() {} // intentionally empty in this backend
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_neon.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_neon.hpp
new file mode 100644
index 000000000000..6e843d68ea67
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_neon.hpp
@@ -0,0 +1,2655 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HAL_INTRIN_NEON_HPP
+#define OPENCV_HAL_INTRIN_NEON_HPP
+
+#include <algorithm>
+#include "opencv2/core/utility.hpp"
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#define CV_SIMD128 1 // always set by this NEON header
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define CV_SIMD128_64F 1 // v_float64x2 and the f64 helpers below are compiled only in this case
+#else
+#define CV_SIMD128_64F 0
+#endif
+
+// The following macro checks if the code is being compiled for the
+// AArch64 execution state of Armv8, to enable the 128-bit
+// intrinsics. The macro `__ARM_64BIT_STATE` is the one recommended by
+// the Arm C Language Extension (ACLE) specifications [1] to check the
+// availability of 128-bit intrinsics, and it is supported by clang
+// and gcc. The macro `_M_ARM64` is the equivalent one for Microsoft
+// Visual Studio [2] .
+//
+// [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
+// [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
+#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
+#define CV_NEON_AARCH64 1 // selects AArch64-only intrinsics such as vmull_high_* further down
+#else
+#define CV_NEON_AARCH64 0 // always defined, so it can be tested with plain #if
+#endif
+
+
+//////////// Utils ////////////
+
+#if CV_SIMD128_64F
+#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) /* _v128_unzip for 128-bit types: c = vuzp1q(a, b), d = vuzp2q(a, b) */ \
+    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
+    { c = vuzp1q_##suffix(a, b); d = vuzp2q_##suffix(a, b); }
+#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) /* same helper for the 64-bit vector types (vuzp1 / vuzp2) */ \
+    inline void _v128_unzip(const _Tpv&a, const _Tpv&b, _Tpv& c, _Tpv& d) \
+    { c = vuzp1_##suffix(a, b); d = vuzp2_##suffix(a, b); }
+#else
+#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) /* fallback: vuzpq returns both halves in one _Tpvx2 value */ \
+    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
+    { _Tpvx2 ab = vuzpq_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
+#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) /* fallback for the 64-bit vector types (vuzp) */ \
+    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
+    { _Tpvx2 ab = vuzp_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
+#endif
+
+#if CV_SIMD128_64F
+#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) /* defines vreinterpretq_<suffix>_f64 and vreinterpretq_f64_<suffix> as plain vector casts */ \
+    template <typename T> static inline \
+    _Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
+    template <typename T> static inline \
+    float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
+#else
+#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) // expands to nothing when CV_SIMD128_64F is 0
+#endif
+
+#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(_Tpv, _Tpvl, suffix) /* unzip helpers for the 128-bit and 64-bit type plus the f64 reinterprets */ \
+    OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) \
+    OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpvl##_t, _Tpvl##x2_t, suffix) \
+    OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)
+
+#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(_Tpv, _Tpvl, suffix) /* 64-bit integer lanes: reinterprets only, no unzip helpers */ \
+    OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)
+
+#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(_Tpv, _Tpvl, suffix) /* double lanes: only the 128-bit unzip helper */ \
+    OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix)
+
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint8x16, uint8x8,  u8)
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int8x16,  int8x8,   s8)
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16)
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8,  int16x4,  s16)
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32)
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4,  int32x2,  s32)
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32)
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64)
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2,  int64x1,  s64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1,f64) // v_float64x2 exists only when CV_SIMD128_64F is set
+#endif
+
+//////////// Compatibility layer ////////////
+template<typename T> struct VTraits {  // public view of a vector type's lane count and lane type
+    typedef typename T::lane_type lane_type;
+    enum { nlanes = T::nlanes, max_nlanes = T::nlanes };
+    static inline int vlanes() { return nlanes; }
+};
+
+// Returns the value produced by the vector type's get0() member (lane 0 for the types in this file).
+template<typename T>
+inline typename VTraits<T>::lane_type v_get0(const T& v) {
+    return v.get0();
+}
+//////////// Types ////////////
+
+// 128-bit NEON vector holding 16 unsigned 8-bit lanes.
+struct v_uint8x16
+{
+    uint8x16_t val;  // the wrapped NEON register value
+
+    v_uint8x16() {}  // val is left uninitialized
+    explicit v_uint8x16(uint8x16_t v) : val(v) {}
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        const uchar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = vld1q_u8(lanes);  // v0 ends up in lane 0
+    }
+
+private:
+    // Lane count, lane type and get0() are exposed only through VTraits<> and v_get0().
+    friend struct VTraits<v_uint8x16>;
+    enum { nlanes = 16 };
+    typedef uchar lane_type;
+
+    friend typename VTraits<v_uint8x16>::lane_type v_get0<v_uint8x16>(const v_uint8x16& v);
+    uchar get0() const { return vgetq_lane_u8(val, 0); }
+};
+
+// 128-bit NEON vector holding 16 signed 8-bit lanes.
+struct v_int8x16
+{
+    int8x16_t val;  // the wrapped NEON register value
+
+    v_int8x16() {}  // val is left uninitialized
+    explicit v_int8x16(int8x16_t v) : val(v) {}
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+               schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        const schar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = vld1q_s8(lanes);  // v0 ends up in lane 0
+    }
+
+private:
+    // Lane count, lane type and get0() are exposed only through VTraits<> and v_get0().
+    friend struct VTraits<v_int8x16>;
+    enum { nlanes = 16 };
+    typedef schar lane_type;
+
+    friend typename VTraits<v_int8x16>::lane_type v_get0<v_int8x16>(const v_int8x16& v);
+    schar get0() const { return vgetq_lane_s8(val, 0); }
+};
+
+// 128-bit NEON vector holding 8 unsigned 16-bit lanes.
+struct v_uint16x8
+{
+    uint16x8_t val;  // the wrapped NEON register value
+
+    v_uint16x8() {}  // val is left uninitialized
+    explicit v_uint16x8(uint16x8_t v) : val(v) {}
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        const ushort lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = vld1q_u16(lanes);  // v0 ends up in lane 0
+    }
+
+private:
+    // Lane count, lane type and get0() are exposed only through VTraits<> and v_get0().
+    friend struct VTraits<v_uint16x8>;
+    enum { nlanes = 8 };
+    typedef ushort lane_type;
+
+    friend typename VTraits<v_uint16x8>::lane_type v_get0<v_uint16x8>(const v_uint16x8& v);
+    ushort get0() const { return vgetq_lane_u16(val, 0); }
+};
+
+// 128-bit NEON vector holding 8 signed 16-bit lanes.
+struct v_int16x8
+{
+    int16x8_t val;  // the wrapped NEON register value
+
+    v_int16x8() {}  // val is left uninitialized
+    explicit v_int16x8(int16x8_t v) : val(v) {}
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        const short lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = vld1q_s16(lanes);  // v0 ends up in lane 0
+    }
+
+private:
+    // Lane count, lane type and get0() are exposed only through VTraits<> and v_get0().
+    friend struct VTraits<v_int16x8>;
+    enum { nlanes = 8 };
+    typedef short lane_type;
+
+    friend typename VTraits<v_int16x8>::lane_type v_get0<v_int16x8>(const v_int16x8& v);
+    short get0() const { return vgetq_lane_s16(val, 0); }
+};
+
+// 128-bit NEON vector holding 4 unsigned 32-bit lanes.
+struct v_uint32x4
+{
+    uint32x4_t val;  // the wrapped NEON register value
+
+    v_uint32x4() {}  // val is left uninitialized
+    explicit v_uint32x4(uint32x4_t v) : val(v) {}
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    {
+        const unsigned lanes[] = {v0, v1, v2, v3};
+        val = vld1q_u32(lanes);  // v0 ends up in lane 0
+    }
+
+private:
+    // Lane count, lane type and get0() are exposed only through VTraits<> and v_get0().
+    friend struct VTraits<v_uint32x4>;
+    enum { nlanes = 4 };
+    typedef unsigned lane_type;
+
+    friend typename VTraits<v_uint32x4>::lane_type v_get0<v_uint32x4>(const v_uint32x4& v);
+    unsigned get0() const { return vgetq_lane_u32(val, 0); }
+};
+
+// 128-bit NEON vector holding 4 signed 32-bit lanes.
+struct v_int32x4
+{
+    int32x4_t val;  // the wrapped NEON register value
+
+    v_int32x4() {}  // val is left uninitialized
+    explicit v_int32x4(int32x4_t v) : val(v) {}
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        const int lanes[] = {v0, v1, v2, v3};
+        val = vld1q_s32(lanes);  // v0 ends up in lane 0
+    }
+
+private:
+    // Lane count, lane type and get0() are exposed only through VTraits<> and v_get0().
+    friend struct VTraits<v_int32x4>;
+    enum { nlanes = 4 };
+    typedef int lane_type;
+
+    friend typename VTraits<v_int32x4>::lane_type v_get0<v_int32x4>(const v_int32x4& v);
+    int get0() const { return vgetq_lane_s32(val, 0); }
+};
+
+// 128-bit NEON vector holding 4 single-precision float lanes.
+struct v_float32x4
+{
+    float32x4_t val;  // the wrapped NEON register value
+
+    v_float32x4() {}  // val is left uninitialized
+    explicit v_float32x4(float32x4_t v) : val(v) {}
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        const float lanes[] = {v0, v1, v2, v3};
+        val = vld1q_f32(lanes);  // v0 ends up in lane 0
+    }
+
+private:
+    // Lane count, lane type and get0() are exposed only through VTraits<> and v_get0().
+    friend struct VTraits<v_float32x4>;
+    enum { nlanes = 4 };
+    typedef float lane_type;
+
+    friend typename VTraits<v_float32x4>::lane_type v_get0<v_float32x4>(const v_float32x4& v);
+    float get0() const { return vgetq_lane_f32(val, 0); }
+};
+
+// 128-bit NEON vector holding 2 unsigned 64-bit lanes.
+struct v_uint64x2
+{
+    uint64x2_t val;  // the wrapped NEON register value
+
+    v_uint64x2() {}  // val is left uninitialized
+    explicit v_uint64x2(uint64x2_t v) : val(v) {}
+    v_uint64x2(uint64 v0, uint64 v1)
+    {
+        const uint64 lanes[] = {v0, v1};
+        val = vld1q_u64(lanes);  // v0 ends up in lane 0
+    }
+private:
+    // Lane count, lane type and get0() are exposed only through VTraits<> and v_get0().
+    friend struct VTraits<v_uint64x2>;
+    enum { nlanes = 2 };
+    typedef uint64 lane_type;
+
+    friend typename VTraits<v_uint64x2>::lane_type v_get0<v_uint64x2>(const v_uint64x2& v);
+    uint64 get0() const { return vgetq_lane_u64(val, 0); }
+};
+
+// 128-bit NEON vector holding 2 signed 64-bit lanes.
+struct v_int64x2
+{
+    int64x2_t val;  // the wrapped NEON register value
+
+    v_int64x2() {}  // val is left uninitialized
+    explicit v_int64x2(int64x2_t v) : val(v) {}
+    v_int64x2(int64 v0, int64 v1)
+    {
+        const int64 lanes[] = {v0, v1};
+        val = vld1q_s64(lanes);  // v0 ends up in lane 0
+    }
+
+private:
+    // Lane count, lane type and get0() are exposed only through VTraits<> and v_get0().
+    friend struct VTraits<v_int64x2>;
+    enum { nlanes = 2 };
+    typedef int64 lane_type;
+
+    friend typename VTraits<v_int64x2>::lane_type v_get0<v_int64x2>(const v_int64x2& v);
+    int64 get0() const { return vgetq_lane_s64(val, 0); }
+};
+
+#if CV_SIMD128_64F
+// 128-bit NEON vector holding 2 double-precision float lanes.
+struct v_float64x2
+{
+    float64x2_t val;  // the wrapped NEON register value
+
+    v_float64x2() {}  // val is left uninitialized
+    explicit v_float64x2(float64x2_t v) : val(v) {}
+    v_float64x2(double v0, double v1)
+    {
+        const double lanes[] = {v0, v1};
+        val = vld1q_f64(lanes);  // v0 ends up in lane 0
+    }
+
+private:
+    // Lane count, lane type and get0() are exposed only through VTraits<> and v_get0().
+    friend struct VTraits<v_float64x2>;
+    enum { nlanes = 2 };
+    typedef double lane_type;
+
+    friend typename VTraits<v_float64x2>::lane_type v_get0<v_float64x2>(const v_float64x2& v);
+    double get0() const { return vgetq_lane_f64(val, 0); }
+};
+#endif
+
+#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) /* per vector type: v_setzero_*, v_setall_*, and v_reinterpret_as_* overloads to every non-f64 type */ \
+inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
+inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
+inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } /* identity form so the same-type cast below resolves */ \
+inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
+inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
+inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
+inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
+inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
+inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
+inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
+inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
+inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
+
+OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
+OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
+OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
+OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
+OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
+#if CV_SIMD128_64F
+#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) /* adds the v_reinterpret_as_f64 overload for one source type */ \
+inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
+OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64) // v_float64x2 gets the common helpers as well
+OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
+OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
+OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
+OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
+OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
+OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
+OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
+#endif
+
+#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) /* v_<pack>, v_<pack>_store, v_rshr_<pack><n>, v_rshr_<pack>_store<n> */ \
+inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) /* narrow a and b with mov; a fills the low half, b the high half */ \
+{ \
+    hreg a1 = mov(a.val), b1 = mov(b.val); \
+    return _Tpvec(vcombine_##suffix(a1, b1)); \
+} \
+inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) /* narrow a and store the half-width result at ptr */ \
+{ \
+    hreg a1 = mov(a.val); \
+    vst1_##suffix(ptr, a1); \
+} \
+template<int n> inline \
+_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) /* as v_<pack>, but narrowing through rshr(x, n) */ \
+{ \
+    hreg a1 = rshr(a.val, n); \
+    hreg b1 = rshr(b.val, n); \
+    return _Tpvec(vcombine_##suffix(a1, b1)); \
+} \
+template<int n> inline \
+void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) /* as v_<pack>_store, but narrowing through rshr(x, n) */ \
+{ \
+    hreg a1 = rshr(a.val, n); \
+    vst1_##suffix(ptr, a1); \
+}
+
+OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, pack, vqmovn_u16, vqrshrn_n_u16)
+OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, pack, vqmovn_s16, vqrshrn_n_s16)
+OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, pack, vqmovn_u32, vqrshrn_n_u32)
+OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, pack, vqmovn_s32, vqrshrn_n_s32)
+OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, pack, vmovn_u64, vrshrn_n_u64) // 64 -> 32 bit uses the non-saturating vmovn / vrshrn forms
+OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn_s64, vrshrn_n_s64)
+
+OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16) // pack_u: signed input, unsigned output
+OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)
+
+// pack boolean
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{   // narrow every 16-bit lane with vmovn_u16; a fills lanes 0..7, b lanes 8..15
+    const uint8x8_t lo = vmovn_u16(a.val), hi = vmovn_u16(b.val);
+    return v_uint8x16(vcombine_u8(lo, hi));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{   // two vmovn narrowing steps, 32 -> 16 -> 8 bits; output lanes follow a, b, c, d order
+    const uint8x8_t lo = vmovn_u16(vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val)));
+    const uint8x8_t hi = vmovn_u16(vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val)));
+    return v_uint8x16(vcombine_u8(lo, hi));
+}
+
+// Narrows eight vectors of 64-bit lanes to one vector of bytes (a..h order) in three vmovn steps.
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    // 64 -> 32 bits per input pair, then 32 -> 16 bits.
+    const uint16x4_t ab = vmovn_u32(vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val)));
+    const uint16x4_t cd = vmovn_u32(vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val)));
+    const uint16x4_t ef = vmovn_u32(vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val)));
+    const uint16x4_t gh = vmovn_u32(vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val)));
+    // 16 -> 8 bits for each half of the result.
+    return v_uint8x16(vcombine_u8(vmovn_u16(vcombine_u16(ab, cd)), vmovn_u16(vcombine_u16(ef, gh))));
+}
+
+// Returns v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3, accumulated with vmlaq_lane_f32.
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{
+    const float32x2_t lo = vget_low_f32(v.val), hi = vget_high_f32(v.val);
+    float32x4_t acc = vmulq_lane_f32(m0.val, lo, 0);
+    acc = vmlaq_lane_f32(acc, m1.val, lo, 1);
+    acc = vmlaq_lane_f32(acc, m2.val, hi, 0);
+    return v_float32x4(vmlaq_lane_f32(acc, m3.val, hi, 1));
+}
+
+// Returns v[0]*m0 + v[1]*m1 + v[2]*m2 + a; lane 3 of v is not used.
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a)
+{
+    const float32x2_t lo = vget_low_f32(v.val), hi = vget_high_f32(v.val);
+    float32x4_t acc = vmulq_lane_f32(m0.val, lo, 0);
+    acc = vmlaq_lane_f32(acc, m1.val, lo, 1);
+    acc = vmlaq_lane_f32(acc, m2.val, hi, 0);
+    return v_float32x4(vaddq_f32(acc, a.val));
+}
+
+#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) /* defines bin_op(a, b) as intrin applied to the two wrapped registers */ \
+inline _Tpvec bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint8x16, vqaddq_u8) // 8- and 16-bit add/sub map to the saturating vqadd / vqsub intrinsics
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint8x16, vqsubq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int8x16, vqaddq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int8x16, vqsubq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint16x8, vqaddq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint16x8, vqsubq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int16x8, vqaddq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int16x8, vqsubq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int32x4, vaddq_s32) // 32- and 64-bit integer ops use the plain vadd / vsub / vmul intrinsics
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int32x4, vsubq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_int32x4, vmulq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint32x4, vaddq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint32x4, vsubq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_uint32x4, vmulq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float32x4, vaddq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float32x4, vsubq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float32x4, vmulq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int64x2, vaddq_s64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int64x2, vsubq_s64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint64x2, vaddq_u64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint64x2, vsubq_u64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float32x4, vdivq_f32) // vdivq_* is used only under CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float64x2, vaddq_f64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float64x2, vsubq_f64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float64x2, vmulq_f64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float64x2, vdivq_f64)
+#else
+inline v_float32x4 v_div (const v_float32x4& a, const v_float32x4& b)
+{   // no vdivq here: start from the vrecpeq_f32 estimate of 1/b, refine it twice with vrecpsq_f32, multiply by a
+    float32x4_t inv = vrecpeq_f32(b.val);
+    for (int step = 0; step < 2; ++step)
+        inv = vmulq_f32(vrecpsq_f32(b.val, inv), inv);
+    return v_float32x4(vmulq_f32(a.val, inv));
+}
+#endif
+
+// saturating multiply 8-bit, 16-bit
+#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) /* v_mul = v_mul_expand into two wide vectors, then v_pack back */ \
+    inline _Tpvec v_mul (const _Tpvec& a, const _Tpvec& b)  \
+    {                                                            \
+        _Tpwvec c, d;                                            \
+        v_mul_expand(a, b, c, d);                                \
+        return v_pack(c, d);                                     \
+    }
+
+OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16,  v_int16x8)
+OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8)
+OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int16x8,  v_int32x4)
+OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint16x8, v_uint32x4)
+
+//  Multiply and expand
+inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
+                         v_int16x8& c, v_int16x8& d)
+{
+    c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));   // widening products of lanes 0..7
+#if !CV_NEON_AARCH64
+    d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val)); // widening products of lanes 8..15
+#else  // AArch64: multiply the high halves directly
+    d.val = vmull_high_s8(a.val, b.val);
+#endif
+}
+
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
+                         v_uint16x8& c, v_uint16x8& d)
+{
+    c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_u8(a.val, b.val);
+#else // #if CV_NEON_AARCH64
+    d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
+#endif // #if CV_NEON_AARCH64
+}
+
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
+                         v_int32x4& c, v_int32x4& d)
+{
+    c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_s16(a.val, b.val);
+#else // #if CV_NEON_AARCH64
+    d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
+#endif // #if CV_NEON_AARCH64
+}
+
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
+                         v_uint32x4& c, v_uint32x4& d)
+{
+    c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_u16(a.val, b.val);
+#else // #if CV_NEON_AARCH64
+    d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
+#endif // #if CV_NEON_AARCH64
+}
+
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d)
+{
+    c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_u32(a.val, b.val);
+#else // #if CV_NEON_AARCH64
+    d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
+#endif // #if CV_NEON_AARCH64
+}
+
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{   // Per lane: form the 32-bit product a[i] * b[i] and keep only its upper 16 bits.
+    const int32x4_t prod_lo = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
+#if CV_NEON_AARCH64
+    const int32x4_t prod_hi = vmull_high_s16(a.val, b.val);
+#else // #if CV_NEON_AARCH64
+    const int32x4_t prod_hi = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
+#endif // #if CV_NEON_AARCH64
+    const int16x4_t top_lo = vshrn_n_s32(prod_lo, 16);
+    const int16x4_t top_hi = vshrn_n_s32(prod_hi, 16);
+    return v_int16x8(vcombine_s16(top_lo, top_hi));
+}
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{   // Per lane: form the 32-bit product a[i] * b[i] and keep only its upper 16 bits.
+    const uint32x4_t prod_lo = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
+#if CV_NEON_AARCH64
+    const uint32x4_t prod_hi = vmull_high_u16(a.val, b.val);
+#else // #if CV_NEON_AARCH64
+    const uint32x4_t prod_hi = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
+#endif // #if CV_NEON_AARCH64
+    const uint16x4_t top_lo = vshrn_n_u32(prod_lo, 16);
+    const uint16x4_t top_hi = vshrn_n_u32(prod_hi, 16);
+    return v_uint16x8(vcombine_u16(top_lo, top_hi));
+}
+
+//////// Dot Product ////////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{
+    // De-interleave both inputs with _v128_unzip (defined elsewhere in this header).
+    // Presumably `evens` = even lanes of a then of b, `odds` = the odd lanes, giving
+    // result[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1] -- TODO confirm against _v128_unzip.
+    int16x8_t evens, odds;
+    _v128_unzip(a.val, b.val, evens, odds);
+    const int32x4_t even_products = vmull_s16(vget_low_s16(evens), vget_high_s16(evens));
+    const int32x4_t total = vmlal_s16(even_products, vget_low_s16(odds), vget_high_s16(odds));
+    return v_int32x4(total);
+}
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    // Accumulating form: the pairwise products are added on top of c.
+    // Presumably _v128_unzip (defined elsewhere) yields even lanes of a then b in `evens`
+    // and the odd lanes in `odds` -- TODO confirm.
+    int16x8_t evens, odds;
+    _v128_unzip(a.val, b.val, evens, odds);
+    const int32x4_t with_evens = vmlal_s16(c.val, vget_low_s16(evens), vget_high_s16(evens));
+    const int32x4_t total = vmlal_s16(with_evens, vget_low_s16(odds), vget_high_s16(odds));
+    return v_int32x4(total);
+}
+
+// 32 >> 64
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
+{
+    // De-interleave both inputs with _v128_unzip (defined elsewhere in this header).
+    // Presumably `evens` = even lanes of a then of b, `odds` = the odd lanes, giving
+    // result[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1] in 64 bits -- TODO confirm.
+    int32x4_t evens, odds;
+    _v128_unzip(a.val, b.val, evens, odds);
+    const int64x2_t even_products = vmull_s32(vget_low_s32(evens), vget_high_s32(evens));
+    const int64x2_t total = vmlal_s32(even_products, vget_low_s32(odds), vget_high_s32(odds));
+    return v_int64x2(total);
+}
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    // Accumulating form: the pairwise 64-bit products are added on top of c.
+    // Presumably _v128_unzip (defined elsewhere) yields even lanes of a then b in `evens`
+    // and the odd lanes in `odds` -- TODO confirm.
+    int32x4_t evens, odds;
+    _v128_unzip(a.val, b.val, evens, odds);
+    const int64x2_t with_evens = vmlal_s32(c.val, vget_low_s32(evens), vget_high_s32(evens));
+    const int64x2_t total = vmlal_s32(with_evens, vget_low_s32(odds), vget_high_s32(odds));
+    return v_int64x2(total);
+}
+
+// 8 >> 32
+#ifdef CV_NEON_DOT
+#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(_Tpvec1, _Tpvec2, suffix) \
+inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b)   \
+{ /* no accumulator supplied: start vdotq from an all-zero vector */ \
+    return _Tpvec1(vdotq_##suffix(vdupq_n_##suffix(0), a.val, b.val));\
+} \
+inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
+{ /* c is passed to vdotq as the accumulator operand */ \
+    return _Tpvec1(vdotq_##suffix(c.val, a.val, b.val)); \
+}
+
+OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_uint32x4, v_uint8x16, u32)
+OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_int32x4,  v_int8x16,  s32)
+#else
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{   // Fallback without CV_NEON_DOT: 8-bit products summed in groups into 32-bit lanes.
+    const uint8x16_t zero   = vreinterpretq_u8_u32(vdupq_n_u32(0));
+    const uint8x16_t mask   = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));  // keeps the low byte of each 16-bit lane
+    const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
+    const uint16x8_t mask32 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));  // keeps the low 16 bits of each 32-bit lane
+    // even: product of the low bytes of each 16-bit lane; odd: product of the high bytes (shifted down by 8).
+    uint16x8_t even = vmulq_u16(vreinterpretq_u16_u8(vbslq_u8(mask, a.val, zero)),
+                                vreinterpretq_u16_u8(vbslq_u8(mask, b.val, zero)));
+    uint16x8_t odd  = vmulq_u16(vshrq_n_u16(vreinterpretq_u16_u8(a.val), 8),
+                                vshrq_n_u16(vreinterpretq_u16_u8(b.val), 8));
+    // s0 adds the 16-bit products sitting in the low half of each 32-bit lane, s1 those in the high half.
+    uint32x4_t s0 = vaddq_u32(vreinterpretq_u32_u16(vbslq_u16(mask32, even, zero32)),
+                              vreinterpretq_u32_u16(vbslq_u16(mask32, odd,  zero32)));
+    uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
+                              vshrq_n_u32(vreinterpretq_u32_u16(odd),  16));
+    return v_uint32x4(vaddq_u32(s0, s1));
+}
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
+                                   const v_uint32x4& c)
+{   // Accumulating overload: two-argument result plus c.
+    return v_add(v_dotprod_expand(a, b), c);
+}
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
+{   // Output lane i is the sum of a[j]*b[j] over the four lanes j = 4i .. 4i+3.
+    int16x8_t p0  = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
+    int16x8_t p1  = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
+    // Widen while adding neighbours: two (-128)*(-128) products sum to 32768, which wraps in a 16-bit add.
+    int32x4_t q0 = vpaddlq_s16(p0);
+    int32x4_t q1 = vpaddlq_s16(p1);
+    int32x2_t lo = vpadd_s32(vget_low_s32(q0), vget_high_s32(q0));
+    int32x2_t hi = vpadd_s32(vget_low_s32(q1), vget_high_s32(q1));
+    return v_int32x4(vcombine_s32(lo, hi));
+}
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
+                                  const v_int32x4& c)
+{   // Accumulating overload: two-argument result plus c.
+    return v_add(v_dotprod_expand(a, b), c);
+}
+#endif
+// 16 >> 64: 16-bit lanes multiplied pairwise, groups of products summed into 64-bit lanes
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{
+    const uint16x8_t zero = vreinterpretq_u16_u32(vdupq_n_u32(0));
+    const uint16x8_t mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));  // keeps the low 16 bits of each 32-bit lane
+    // even: product of the low 16-bit halves of each 32-bit lane; odd: product of the high halves.
+    uint32x4_t even = vmulq_u32(vreinterpretq_u32_u16(vbslq_u16(mask, a.val, zero)),
+                                vreinterpretq_u32_u16(vbslq_u16(mask, b.val, zero)));
+    uint32x4_t odd  = vmulq_u32(vshrq_n_u32(vreinterpretq_u32_u16(a.val), 16),
+                                vshrq_n_u32(vreinterpretq_u32_u16(b.val), 16));
+    uint32x4_t uzp1, uzp2;
+    _v128_unzip(even, odd, uzp1, uzp2);  // defined elsewhere; presumably splits even/odd lanes -- TODO confirm
+    uint64x2_t s0  = vaddl_u32(vget_low_u32(uzp1), vget_high_u32(uzp1));  // widening adds: sums are formed in 64 bits
+    uint64x2_t s1  = vaddl_u32(vget_low_u32(uzp2), vget_high_u32(uzp2));
+    return v_uint64x2(vaddq_u64(s0, s1));
+}
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{ return v_add(v_dotprod_expand(a, b), c); }  // accumulating overload: two-argument result plus c
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
+{
+    int32x4_t p0  = vmull_s16(vget_low_s16(a.val),  vget_low_s16(b.val));
+    int32x4_t p1  = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
+    // Output lane 0 sums the four products in p0, lane 1 the four in p1. Neighbouring products are
+    // added while widening to 64 bits: two (-32768)*(-32768) products would wrap in a 32-bit add.
+    int64x2_t q0 = vpaddlq_s32(p0);
+    int64x2_t q1 = vpaddlq_s32(p1);
+    // Fold each pair of 64-bit partial sums into a single lane.
+    int64x1_t lo = vadd_s64(vget_low_s64(q0), vget_high_s64(q0));
+    int64x1_t hi = vadd_s64(vget_low_s64(q1), vget_high_s64(q1));
+    return v_int64x2(vcombine_s64(lo, hi));
+}
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
+                                  const v_int64x2& c)
+{ return v_add(v_dotprod_expand(a, b), c); }  // accumulating overload: two-argument result plus c
+
+// 32 >> 64f
+#if CV_SIMD128_64F
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{ return v_cvt_f64(v_dotprod(a, b)); }  // 64-bit integer dot product, then converted to double lanes
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a,   const v_int32x4& b,
+                                    const v_float64x2& c)
+{ return v_add(v_dotprod_expand(a, b), c); }  // accumulating overload: the addition happens in double precision
+#endif
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    // result[i] = a[i]*b[i] + a[i+4]*b[i+4]: lanes are paired across the two halves
+    // instead of adjacently, so no de-interleave step is needed.
+    // The low-half product is identical on both architectures and is computed once here.
+    const int32x4_t low_products = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
+#if CV_NEON_AARCH64
+    // AArch64 multiply-accumulates the upper halves without extracting them first.
+    const int32x4_t total = vmlal_high_s16(low_products, a.val, b.val);
+#else
+    const int32x4_t total = vmlal_s16(low_products, vget_high_s16(a.val), vget_high_s16(b.val));
+#endif
+    return v_int32x4(total);
+}
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    // result[i] = c[i] + a[i]*b[i] + a[i+4]*b[i+4]: lanes are paired across the two
+    // halves instead of adjacently, so no de-interleave step is needed.
+    // The low-half accumulate is identical on both architectures and is computed once here.
+    const int32x4_t with_low = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
+#if CV_NEON_AARCH64
+    // AArch64 multiply-accumulates the upper halves without extracting them first.
+    const int32x4_t total = vmlal_high_s16(with_low, a.val, b.val);
+#else
+    const int32x4_t total = vmlal_s16(with_low, vget_high_s16(a.val), vget_high_s16(b.val));
+#endif
+    return v_int32x4(total);
+}
+
+// 32 >> 64
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{
+    // result[i] = a[i]*b[i] + a[i+2]*b[i+2] in 64 bits: lanes are paired across the two
+    // halves instead of adjacently, so no de-interleave step is needed.
+    // The low-half product is identical on both architectures and is computed once here.
+    const int64x2_t low_products = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
+#if CV_NEON_AARCH64
+    // AArch64 multiply-accumulates the upper halves without extracting them first.
+    const int64x2_t total = vmlal_high_s32(low_products, a.val, b.val);
+#else
+    const int64x2_t total = vmlal_s32(low_products, vget_high_s32(a.val), vget_high_s32(b.val));
+#endif
+    return v_int64x2(total);
+}
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    // result[i] = c[i] + a[i]*b[i] + a[i+2]*b[i+2] in 64 bits: lanes are paired across
+    // the two halves instead of adjacently, so no de-interleave step is needed.
+    // The low-half accumulate is identical on both architectures and is computed once here.
+    const int64x2_t with_low = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
+#if CV_NEON_AARCH64
+    // AArch64 multiply-accumulates the upper halves without extracting them first.
+    const int64x2_t total = vmlal_high_s32(with_low, a.val, b.val);
+#else
+    const int64x2_t total = vmlal_s32(with_low, vget_high_s32(a.val), vget_high_s32(b.val));
+#endif
+    return v_int64x2(total);
+}
+
+// 8 >> 32
+#ifdef CV_NEON_DOT
+#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(_Tpvec1, _Tpvec2, suffix) \
+inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b)   \
+{ /* with CV_NEON_DOT the fast variant just forwards to v_dotprod_expand */ \
+    return v_dotprod_expand(a, b); \
+} \
+inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
+{ /* `suffix` is not referenced anywhere in this macro body */ \
+    return v_dotprod_expand(a, b, c); \
+}
+
+OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_uint32x4, v_uint8x16, u32)
+OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_int32x4,  v_int8x16,  s32)
+#else
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{   // result[i] = sum of a[j]*b[j] for j in {i, i+4, i+8, i+12}; the grouping differs from v_dotprod_expand.
+    uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));    // products of lanes 0..7
+    uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));  // products of lanes 8..15
+    uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));       // widening add: sums are formed in 32 bits
+    uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));
+    return v_uint32x4(vaddq_u32(s0, s1));
+}
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{   // Accumulating overload: two-argument result plus c.
+    return v_add(v_dotprod_expand_fast(a, b), c);
+}
+
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{   // result[i] = sum of a[j]*b[j], j in {i, i+4, i+8, i+12}; widened before adding so (-128)*(-128)*2 cannot wrap in int16.
+    int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
+    int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
+    return v_int32x4(vaddq_s32(vaddl_s16(vget_low_s16(p0), vget_low_s16(p1)), vaddl_s16(vget_high_s16(p0), vget_high_s16(p1))));
+}
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{   // Accumulating overload: two-argument result plus c.
+    return v_add(v_dotprod_expand_fast(a, b), c);
+}
+#endif
+
+// 16 >> 64: 16-bit lanes multiplied pairwise, groups of four products summed into 64-bit lanes
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{
+    uint32x4_t p0  = vmull_u16(vget_low_u16(a.val),  vget_low_u16(b.val));   // products of lanes 0..3
+    uint32x4_t p1  = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));  // products of lanes 4..7
+    uint64x2_t s0  = vaddl_u32(vget_low_u32(p0), vget_high_u32(p0));         // widening add: sums are formed in 64 bits
+    uint64x2_t s1  = vaddl_u32(vget_low_u32(p1), vget_high_u32(p1));
+    return v_uint64x2(vaddq_u64(s0, s1));
+}
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{ return v_add(v_dotprod_expand_fast(a, b), c); }  // accumulating overload: two-argument result plus c
+
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{   // result[i] = sum of a[j]*b[j], j in {i, i+2, i+4, i+6}; widened before adding so (-32768)*(-32768)*2 cannot wrap in int32.
+    int32x4_t p0 = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
+    int32x4_t p1 = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
+    return v_int64x2(vaddq_s64(vaddl_s32(vget_low_s32(p0), vget_low_s32(p1)), vaddl_s32(vget_high_s32(p0), vget_high_s32(p1))));
+}
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{ return v_add(v_dotprod_expand_fast(a, b), c); }  // accumulating overload: two-argument result plus c
+
+// 32 >> 64f
+#if CV_SIMD128_64F
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_cvt_f64(v_dotprod_fast(a, b)); }  // 64-bit integer fast dot product, then converted to double lanes
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_add(v_dotprod_expand_fast(a, b), c); }  // accumulating overload: the addition happens in double precision
+#endif
+
+
+#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
+    OPENCV_HAL_IMPL_NEON_BIN_OP(v_and, _Tpvec, vandq_##suffix) \
+    OPENCV_HAL_IMPL_NEON_BIN_OP(v_or, _Tpvec, vorrq_##suffix) \
+    OPENCV_HAL_IMPL_NEON_BIN_OP(v_xor, _Tpvec, veorq_##suffix) \
+    inline _Tpvec v_not (const _Tpvec& a) \
+    { /* bitwise NOT goes through a u8 view, so one vmvnq_u8 serves every lane width */ \
+        return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
+    }
+// Bitwise v_and / v_or / v_xor / v_not for every integer vector type.
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
+
+#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
+inline v_float32x4 bin_op (const v_float32x4& a, const v_float32x4& b) \
+{ /* operate on the raw bits: view as s32, apply the integer intrinsic, view back as f32 */ \
+    return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
+}
+// Bitwise v_and / v_or / v_xor on float vectors, applied to the bit patterns.
+OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_and, vandq_s32)
+OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_or, vorrq_s32)
+OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_xor, veorq_s32)
+// Bitwise NOT of the float bit patterns, via the same s32 view.
+inline v_float32x4 v_not (const v_float32x4& a)
+{
+    return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
+}
+
+#if CV_SIMD128_64F
+inline v_float32x4 v_sqrt(const v_float32x4& x)
+{   // This branch uses the vsqrtq_f32 intrinsic directly.
+    return v_float32x4(vsqrtq_f32(x.val));
+}
+// 1 / sqrt(x) per lane, computed as a division by v_sqrt(x).
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{
+    v_float32x4 one = v_setall_f32(1.0f);
+    return v_div(one, v_sqrt(x));
+}
+#else
+inline v_float32x4 v_sqrt(const v_float32x4& x)
+{   // Fallback without vsqrtq_f32: sqrt(x) is formed as x * (1 / sqrt(x)).
+    float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));  // clamp so the estimate never sees a value below FLT_MIN
+    float32x4_t e = vrsqrteq_f32(x1);                         // reciprocal square-root estimate
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);      // refinement pass 1
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);      // refinement pass 2
+    return v_float32x4(vmulq_f32(x.val, e));                  // multiplies the unclamped x, so x == 0 yields 0
+}
+// Fallback 1 / sqrt(x): estimate plus two refinement passes; unlike v_sqrt above, x is not clamped.
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{
+    float32x4_t e = vrsqrteq_f32(x.val);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
+    return v_float32x4(e);
+}
+#endif
+
+#define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
+inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }
+// v_abs of a signed integer vector returns the unsigned vector type of the same lane width.
+OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
+OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
+OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)
+// Float overload: takes its argument by value and returns the same vector type.
+inline v_float32x4 v_abs(v_float32x4 x)
+{ return v_float32x4(vabsq_f32(x.val)); }
+
+#if CV_SIMD128_64F
+#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
+inline v_float64x2 bin_op (const v_float64x2& a, const v_float64x2& b) \
+{ /* operate on the raw bits: view as s64, apply the integer intrinsic, view back as f64 */ \
+    return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
+}
+// Bitwise v_and / v_or / v_xor on double vectors, applied to the bit patterns.
+OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_and, vandq_s64)
+OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_or, vorrq_s64)
+OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_xor, veorq_s64)
+// Bitwise NOT of the double bit patterns; goes through an s32 view, which inverts the same 128 bits.
+inline v_float64x2 v_not (const v_float64x2& a)
+{
+    return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
+}
+
+inline v_float64x2 v_sqrt(const v_float64x2& x)
+{   // Per-lane square root via the vsqrtq_f64 intrinsic.
+    return v_float64x2(vsqrtq_f64(x.val));
+}
+// 1 / sqrt(x) per lane, computed as a division by v_sqrt(x).
+inline v_float64x2 v_invsqrt(const v_float64x2& x)
+{
+    v_float64x2 one = v_setall_f64(1.0);  // double literal: the splat is for double lanes, not float
+    return v_div(one, v_sqrt(x));
+}
+// Per-lane absolute value; takes its argument by value.
+inline v_float64x2 v_abs(v_float64x2 x)
+{ return v_float64x2(vabsq_f64(x.val)); }
+#endif
+
+// TODO: exp, log, sin, cos
+
+#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ /* thin wrapper: apply the two-operand intrinsic to the raw vectors and re-wrap */ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+// Per-lane v_min / v_max for every 8/16/32-bit integer type and for float.
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
+#endif
+
+#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
+inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
+inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) /* v_ne = bitwise NOT of the equality mask */ \
+{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
+inline _Tpvec v_lt (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
+inline _Tpvec v_gt (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
+inline _Tpvec v_le (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
+inline _Tpvec v_ge (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
+// `cast` turns the unsigned mask produced by the comparison intrinsic back into _Tpvec's raw type.
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
+#if defined(__aarch64__) || defined(_M_ARM64)
+static inline uint64x2_t vmvnq_u64(uint64x2_t a)
+{   // Local 64-bit bitwise-NOT helper: XOR against an all-ones vector.
+    uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
+    return veorq_u64(a, vx);
+}
+//OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
+//OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
+static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b)  // only v_eq / v_ne are defined for 64-bit integers here
+{ return v_uint64x2(vceqq_u64(a.val, b.val)); }
+static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b)
+{ return v_uint64x2(vmvnq_u64(vceqq_u64(a.val, b.val))); }
+static inline v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b)  // unsigned mask reinterpreted as signed lanes
+{ return v_int64x2(vreinterpretq_s64_u64(vceqq_s64(a.val, b.val))); }
+static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b)
+{ return v_int64x2(vreinterpretq_s64_u64(vmvnq_u64(vceqq_s64(a.val, b.val)))); }
+#else
+static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b)
+{   // A 64-bit lane is equal only when both of its 32-bit words compare equal.
+    const uint32x4_t word_eq = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
+    const uint32x4_t both_words_eq = vandq_u32(word_eq, vrev64q_u32(word_eq));
+    return v_uint64x2(vreinterpretq_u64_u32(both_words_eq));
+}
+static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b)
+{
+    // Build the 64-bit equality mask from 32-bit word compares, then invert it with an XOR.
+    const uint32x4_t word_eq = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
+    const uint32x4_t both_words_eq = vandq_u32(word_eq, vrev64q_u32(word_eq));
+    const uint64x2_t all_ones = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
+    return v_uint64x2(veorq_u64(vreinterpretq_u64_u32(both_words_eq), all_ones));
+}
+static inline v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b)
+{   // Equality is sign-agnostic: reuse the unsigned overload on reinterpreted lanes.
+    return v_reinterpret_as_s64(v_eq(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b)));
+}
+static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b)
+{   // Same delegation for inequality.
+    return v_reinterpret_as_s64(v_ne(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b)));
+}
+#endif
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
+#endif
+
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }  // self-comparison mask: a lane compares unequal to itself only when it is NaN
+#if CV_SIMD128_64F
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ return v_float64x2(vreinterpretq_f64_u64(vceqq_f64(a.val, a.val))); }  // same self-comparison for double lanes
+#endif
+
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)  // *_wrap: plain vaddq/vsubq/vmulq, i.e. the non-saturating intrinsics
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)
+// v_absdiff for unsigned and floating-point types: same vector type in and out.
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
+#endif
+
+/** Saturating absolute difference: saturating subtract (vqsub) followed by saturating abs (vqabs); result stays signed **/
+inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
+{ return v_int8x16(vqabsq_s8(vqsubq_s8(a.val, b.val))); }
+inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
+{ return v_int16x8(vqabsq_s16(vqsubq_s16(a.val, b.val))); }
+
+#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
+inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
+{ /* like OPENCV_HAL_IMPL_NEON_BIN_FUNC, but the result is cast and returned as a different vector type */ \
+    return _Tpvec2(cast(intrin(a.val, b.val))); \
+}
+// v_absdiff of signed vectors returns the unsigned vector type of the same lane width.
+OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
+
+inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
+{   // sqrt(a*a + b*b) per lane.
+    const float32x4_t sum_of_squares = vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val);
+    return v_sqrt(v_float32x4(sum_of_squares));
+}
+inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
+{   // a*a + b*b per lane, without the square root.
+    const float32x4_t a_squared = vmulq_f32(a.val, a.val);
+    return v_float32x4(vmlaq_f32(a_squared, b.val, b.val));
+}
+
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{   // Returns a * b + c per lane; note the intrinsics take the addend c as their first operand.
+#if CV_SIMD128_64F
+    // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined),
+    // also adds FMA support both for single- and double-precision floating-point vectors
+    return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
+#else
+    return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
+#endif
+}
+// Integer overload: a * b + c per 32-bit lane via vmlaq_s32.
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
+}
+// v_muladd simply forwards to v_fma.
+inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    return v_fma(a, b, c);
+}
+// Integer v_muladd, likewise forwarding to v_fma.
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_fma(a, b, c);
+}
+
+#if CV_SIMD128_64F
+inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
+{   // sqrt(a*a + b*b) per lane; here the two squares are multiplied separately and then added.
+    v_float64x2 x(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
+    return v_sqrt(x);
+}
+// a*a + b*b per lane, without the square root.
+inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
+{
+    return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
+}
+// a * b + c per lane; vfmaq_f64 takes the addend c as its first operand.
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    return v_float64x2(vfmaq_f64(c.val, a.val, b.val));
+}
+// v_muladd simply forwards to v_fma.
+inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    return v_fma(a, b, c);
+}
+#endif
+
+// trade efficiency for convenience: the run-time-count overloads splat n into a vector for vshlq
+#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
+inline _Tpvec v_shl (const _Tpvec& a, int n) \
+{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
+inline _Tpvec v_shr (const _Tpvec& a, int n) /* right shift = vshlq with a negated count */ \
+{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) /* compile-time count: immediate forms */ \
+{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
+template<int n> inline _Tpvec v_rshr(const _Tpvec& a) /* vrshrq_n: the rounding right shift */ \
+{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
+// Arguments: vector type, its intrinsic suffix, signed scalar type for the count, signed suffix for the splat.
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
+
+#define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) /* one-vector form: the other vext operand is all zeros */ \
+{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, VTraits<_Tpvec>::nlanes - n)); } \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) /* n == 0 would pass nlanes as the vext index, so it is specialized */ \
+{ return a; } \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) /* two-vector form: vext over a and b */ \
+{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(vextq_##suffix(b.val, a.val, VTraits<_Tpvec>::nlanes - n)); } \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ CV_UNUSED(b); return a; }
+// Instantiated for every 128-bit vector type; the f64 variant only when CV_SIMD128_64F is set.
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
+#endif
+
+#if defined(__clang__) && defined(__aarch64__)
+// avoid LD2 instruction. details: https://github.com/opencv/opencv/issues/14863
+#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ /* reads 64 bits through a 1-byte-aligned uint64 typedef, then builds the vector from scalars */ \
+typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
+uint64 v = *(unaligned_uint64*)ptr; /* the C-style cast also drops const from ptr */ \
+return _Tpvec(v_reinterpret_as_##suffix(v_uint64x2(v, (uint64)123456))); /* NOTE(review): upper half is 123456 here but 0 in the #else branch; callers presumably must not rely on it -- confirm */ \
+}
+#else
+#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_load_low(const _Tp* ptr) /* vld1 fills the lower half, the upper half is zero */ \
+{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); }
+#endif
+
+#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(vld1q_##suffix(ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) /* same vld1q as v_load: no separate aligned path here */ \
+{ return _Tpvec(vld1q_##suffix(ptr)); } \
+OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) /* lower half from ptr0, upper half from ptr1 */ \
+{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ vst1q_##suffix(ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) /* all full-width store variants use the same vst1q */ \
+{ vst1q_##suffix(ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ vst1q_##suffix(ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) /* the mode argument is ignored */ \
+{ vst1q_##suffix(ptr, a.val); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) /* writes only the lower 64 bits */ \
+{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) /* writes only the upper 64 bits */ \
+{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
+// Arguments: vector type, scalar element type, intrinsic suffix.
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
+#endif
+
+inline unsigned v_reduce_sum(const v_uint8x16& a)
+{
+#if CV_NEON_AARCH64
+    // One across-vector widening add yields the 16-bit total directly.
+    return static_cast<unsigned>(vaddlvq_u8(a.val));
+#else // #if CV_NEON_AARCH64
+    const uint32x4_t quads = vpaddlq_u16(vpaddlq_u8(a.val));  // 16 x u8 -> 8 x u16 -> 4 x u32
+    const uint32x2_t pairs = vpadd_u32(vget_low_u32(quads), vget_high_u32(quads));
+    return vget_lane_u32(vpadd_u32(pairs, pairs), 0);
+#endif // #if CV_NEON_AARCH64
+}
+inline int v_reduce_sum(const v_int8x16& a)
+{
+#if CV_NEON_AARCH64
+    // One across-vector widening add yields the 16-bit total directly.
+    return static_cast<int>(vaddlvq_s8(a.val));
+#else // #if CV_NEON_AARCH64
+    const int32x4_t quads = vpaddlq_s16(vpaddlq_s8(a.val));  // 16 x s8 -> 8 x s16 -> 4 x s32
+    const int32x2_t pairs = vpadd_s32(vget_low_s32(quads), vget_high_s32(quads));
+    return vget_lane_s32(vpadd_s32(pairs, pairs), 0);
+#endif // #if CV_NEON_AARCH64
+}
+inline unsigned v_reduce_sum(const v_uint16x8& a)
+{
+#if CV_NEON_AARCH64
+    // One across-vector widening add yields the 32-bit total directly.
+    return static_cast<unsigned>(vaddlvq_u16(a.val));
+#else // #if CV_NEON_AARCH64
+    const uint32x4_t quads = vpaddlq_u16(a.val);  // 8 x u16 -> 4 x u32
+    const uint32x2_t pairs = vpadd_u32(vget_low_u32(quads), vget_high_u32(quads));
+    return vget_lane_u32(vpadd_u32(pairs, pairs), 0);
+#endif // #if CV_NEON_AARCH64
+}
+inline int v_reduce_sum(const v_int16x8& a) // Returns the signed sum of all 8 signed 16-bit lanes of a.
+{
+#if CV_NEON_AARCH64
+    int32_t t0 = vaddlvq_s16(a.val); // widening add across the whole vector, accumulated in 32 bits
+    return t0;
+#else // #if CV_NEON_AARCH64
+    int32x4_t t0 = vpaddlq_s16(a.val); // pairwise widening add: 8 x s16 -> 4 x s32
+    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0)); // 4 partial sums -> 2
+    return vget_lane_s32(vpadd_s32(t1, t1), 0); // 2 -> 1; lane 0 holds the total
+#endif // #if CV_NEON_AARCH64
+}
+
+#if CV_NEON_AARCH64 // AArch64: the reduction over 16 lanes is a single across-vector intrinsic (v<op>vq_<suffix>)
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    return v##vectorfunc##vq_##suffix(a.val); \
+}
+#else // #if CV_NEON_AARCH64 -- fallback: fold the 16 lanes with four pairwise steps (16 -> 8 -> 4 -> 2 -> 1)
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); /* 8 pairwise results */ \
+    a0 = vp##vectorfunc##_##suffix(a0, a0); /* 4 distinct results, duplicated in the upper half */ \
+    a0 = vp##vectorfunc##_##suffix(a0, a0); /* 2 distinct results */ \
+    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); /* last step; lane 0 covers all 16 lanes */ \
+}
+#endif // #if CV_NEON_AARCH64
+
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, max, max, u8) // defines v_reduce_max(v_uint8x16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, min, min, u8) // defines v_reduce_min(v_uint8x16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, max, max, s8) // defines v_reduce_max(v_int8x16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8) // defines v_reduce_min(v_int8x16)
+
+#if CV_NEON_AARCH64 // AArch64: the reduction over 8 lanes is a single across-vector intrinsic (v<op>vq_<suffix>)
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    return v##vectorfunc##vq_##suffix(a.val); \
+}
+#else // #if CV_NEON_AARCH64 -- fallback: fold the 8 lanes with three pairwise steps (8 -> 4 -> 2 -> 1)
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); /* 4 pairwise results */ \
+    a0 = vp##vectorfunc##_##suffix(a0, a0); /* 2 distinct results, duplicated in the upper half */ \
+    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); /* last step; lane 0 covers all 8 lanes */ \
+}
+#endif // #if CV_NEON_AARCH64
+
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16) // defines v_reduce_max(v_uint16x8)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16) // defines v_reduce_min(v_uint16x8)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16) // defines v_reduce_max(v_int16x8)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16) // defines v_reduce_min(v_int16x8)
+
+#if CV_NEON_AARCH64 // AArch64: the reduction over 4 lanes is a single across-vector intrinsic (v<op>vq_<suffix>)
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    return v##vectorfunc##vq_##suffix(a.val); \
+}
+#else // #if CV_NEON_AARCH64 -- fallback: fold the 4 lanes with two pairwise steps (4 -> 2 -> 1)
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); /* op(a[0],a[1]), op(a[2],a[3]) */ \
+    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); /* only lane 0 = op(a0[0],a0[1]) is read, so the second operand does not affect the result */ \
+}
+#endif // #if CV_NEON_AARCH64
+
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32) // v_reduce_sum / v_reduce_max / v_reduce_min for v_uint32x4
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32) // same three reductions for v_int32x4
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32) // same three reductions for v_float32x4
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
+
+inline uint64 v_reduce_sum(const v_uint64x2& a) // Returns the sum of the two unsigned 64-bit lanes of a.
+{
+#if CV_NEON_AARCH64
+    return vaddvq_u64(a.val); // add across the vector
+#else // #if CV_NEON_AARCH64
+    return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0); // add the low and high halves, then read the single lane
+#endif // #if CV_NEON_AARCH64
+}
+inline int64 v_reduce_sum(const v_int64x2& a) // Returns the sum of the two signed 64-bit lanes of a.
+{
+#if CV_NEON_AARCH64
+    return vaddvq_s64(a.val); // add across the vector
+#else // #if CV_NEON_AARCH64
+    return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0); // add the low and high halves, then read the single lane
+#endif // #if CV_NEON_AARCH64
+}
+#if CV_SIMD128_64F
+inline double v_reduce_sum(const v_float64x2& a) // Returns the sum of the two double lanes of a.
+{
+    return vaddvq_f64(a.val); // no fallback branch here; NOTE(review): presumably CV_SIMD128_64F is only set on AArch64 -- confirm
+}
+#endif
+
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, // Returns (sum(a), sum(b), sum(c), sum(d)): lane i is the sum of the 4 lanes of the i-th argument.
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+#if CV_NEON_AARCH64
+    float32x4_t ab = vpaddq_f32(a.val, b.val); // a0+a1 a2+a3 b0+b1 b2+b3
+    float32x4_t cd = vpaddq_f32(c.val, d.val); // c0+c1 c2+c3 d0+d1 d2+d3
+    return v_float32x4(vpaddq_f32(ab, cd));  // sumA sumB sumC sumD
+#else // #if CV_NEON_AARCH64
+    float32x4x2_t ab = vtrnq_f32(a.val, b.val); // val[0] = a0 b0 a2 b2, val[1] = a1 b1 a3 b3
+    float32x4x2_t cd = vtrnq_f32(c.val, d.val); // val[0] = c0 d0 c2 d2, val[1] = c1 d1 c3 d3
+
+    float32x4_t u0 = vaddq_f32(ab.val[0], ab.val[1]); // a0+a1 b0+b1 a2+a3 b2+b3
+    float32x4_t u1 = vaddq_f32(cd.val[0], cd.val[1]); // c0+c1 d0+d1 c2+c3 d2+d3
+
+    float32x4_t v0 = vcombine_f32(vget_low_f32(u0), vget_low_f32(u1)); // a0+a1 b0+b1 c0+c1 d0+d1
+    float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1)); // a2+a3 b2+b3 c2+c3 d2+d3
+
+    return v_float32x4(vaddq_f32(v0, v1)); // sumA sumB sumC sumD
+#endif // #if CV_NEON_AARCH64
+}
+
+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) // Returns the sum over all 16 lanes of |a[i] - b[i]|.
+{
+#if CV_NEON_AARCH64
+    uint8x16_t t0 = vabdq_u8(a.val, b.val); // per-lane absolute difference
+    uint16_t t1 = vaddlvq_u8(t0); // widening add across the vector
+    return t1;
+#else // #if CV_NEON_AARCH64
+    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val))); // absolute differences, then pairwise widening adds down to 4 x u32
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); // 4 partial sums -> 2
+    return vget_lane_u32(vpadd_u32(t1, t1), 0); // 2 -> 1; lane 0 holds the total
+#endif // #if CV_NEON_AARCH64
+}
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) // Returns the sum over all 16 lanes of |a[i] - b[i]| for signed 8-bit lanes.
+{
+#if CV_NEON_AARCH64
+    uint8x16_t t0 = vreinterpretq_u8_s8(vabdq_s8(a.val, b.val)); // the difference bits are summed as unsigned 8-bit values
+    uint16_t t1 = vaddlvq_u8(t0); // widening add across the vector
+    return t1;
+#else // #if CV_NEON_AARCH64
+    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val)))); // differences reinterpreted as u8, then pairwise widening adds down to 4 x u32
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); // 4 partial sums -> 2
+    return vget_lane_u32(vpadd_u32(t1, t1), 0); // 2 -> 1; lane 0 holds the total
+#endif // #if CV_NEON_AARCH64
+}
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) // Returns the sum over all 8 lanes of |a[i] - b[i]|.
+{
+#if CV_NEON_AARCH64
+    uint16x8_t t0 = vabdq_u16(a.val, b.val); // per-lane absolute difference
+    uint32_t t1 = vaddlvq_u16(t0); // widening add across the vector
+    return t1;
+#else // #if CV_NEON_AARCH64
+    uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val)); // absolute differences, then pairwise widening add to 4 x u32
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); // 4 partial sums -> 2
+    return vget_lane_u32(vpadd_u32(t1, t1), 0); // 2 -> 1; lane 0 holds the total
+#endif // #if CV_NEON_AARCH64
+}
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) // Returns the sum over all 8 lanes of |a[i] - b[i]| for signed 16-bit lanes.
+{
+#if CV_NEON_AARCH64
+    uint16x8_t t0 = vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)); // the difference bits are summed as unsigned 16-bit values
+    uint32_t t1 = vaddlvq_u16(t0); // widening add across the vector
+    return t1;
+#else // #if CV_NEON_AARCH64
+    uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val))); // differences reinterpreted as u16, then pairwise widening add to 4 x u32
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); // 4 partial sums -> 2
+    return vget_lane_u32(vpadd_u32(t1, t1), 0); // 2 -> 1; lane 0 holds the total
+#endif // #if CV_NEON_AARCH64
+}
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) // Returns the sum over all 4 lanes of |a[i] - b[i]|; the sum is accumulated in 32 bits (no widening).
+{
+#if CV_NEON_AARCH64
+    uint32x4_t t0 = vabdq_u32(a.val, b.val); // per-lane absolute difference
+    uint32_t t1 = vaddvq_u32(t0); // add across the vector
+    return t1;
+#else // #if CV_NEON_AARCH64
+    uint32x4_t t0 = vabdq_u32(a.val, b.val); // per-lane absolute difference
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); // 4 values -> 2
+    return vget_lane_u32(vpadd_u32(t1, t1), 0); // 2 -> 1; lane 0 holds the total
+#endif // #if CV_NEON_AARCH64
+}
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b) // Returns the sum over all 4 lanes of |a[i] - b[i]| for signed 32-bit lanes; accumulated in 32 bits.
+{
+#if CV_NEON_AARCH64
+    uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val)); // the difference bits are summed as unsigned 32-bit values
+    uint32_t t1 = vaddvq_u32(t0); // add across the vector
+    return t1;
+#else // #if CV_NEON_AARCH64
+    uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val)); // the difference bits are summed as unsigned 32-bit values
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); // 4 values -> 2
+    return vget_lane_u32(vpadd_u32(t1, t1), 0); // 2 -> 1; lane 0 holds the total
+#endif // #if CV_NEON_AARCH64
+}
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) // Returns the sum over all 4 lanes of |a[i] - b[i]|.
+{
+#if CV_NEON_AARCH64
+    float32x4_t t0 = vabdq_f32(a.val, b.val); // per-lane absolute difference
+    return vaddvq_f32(t0); // add across the vector
+#else // #if CV_NEON_AARCH64
+    float32x4_t t0 = vabdq_f32(a.val, b.val); // per-lane absolute difference
+    float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0)); // 4 values -> 2
+    return vget_lane_f32(vpadd_f32(t1, t1), 0); // 2 -> 1; lane 0 holds the total
+#endif // #if CV_NEON_AARCH64
+}
+
+inline v_uint8x16 v_popcount(const v_uint8x16& a) // v_popcount: per-lane count of set bits, returned as an unsigned vector of the same lane width.
+{ return v_uint8x16(vcntq_u8(a.val)); }
+inline v_uint8x16 v_popcount(const v_int8x16& a) // signed input is reinterpreted as unsigned bytes before counting
+{ return v_uint8x16(vcntq_u8(vreinterpretq_u8_s8(a.val))); }
+inline v_uint16x8 v_popcount(const v_uint16x8& a) // count bits per byte, then one pairwise widening add merges each 2 byte counts
+{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u16(a.val)))); }
+inline v_uint16x8 v_popcount(const v_int16x8& a)
+{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s16(a.val)))); }
+inline v_uint32x4 v_popcount(const v_uint32x4& a) // two pairwise widening adds merge each 4 byte counts
+{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u32(a.val))))); }
+inline v_uint32x4 v_popcount(const v_int32x4& a)
+{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s32(a.val))))); }
+inline v_uint64x2 v_popcount(const v_uint64x2& a) // three pairwise widening adds merge each 8 byte counts
+{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(a.val)))))); }
+inline v_uint64x2 v_popcount(const v_int64x2& a)
+{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s64(a.val)))))); }
+
+inline int v_signmask(const v_uint8x16& a) // Returns a 16-bit mask: bit i is the most significant bit of lane i.
+{
+#if CV_NEON_AARCH64
+    const int8x16_t signPosition = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}; // per-lane left-shift amounts (lane index modulo 8)
+    const uint8x16_t byteOrder = {0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15}; // pairs byte i with byte i+8 inside one 16-bit lane
+    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), signPosition); // isolate each MSB as 0/1, then move it to bit (i % 8)
+    uint8x16_t v1 = vqtbl1q_u8(v0, byteOrder); // after the shuffle, bytes 8..15 land in the high byte of each 16-bit lane
+    uint32_t t0 = vaddlvq_u16(vreinterpretq_u16_u8(v1)); // every lane contributes a distinct bit, so the sum equals the OR of all bits
+    return t0;
+#else // #if CV_NEON_AARCH64
+    int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100)); // shift amounts 0..7 for the lanes of one 64-bit half
+    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0)); // isolate each MSB as 0/1, then move it to bit (i % 8)
+    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0))); // sum the 8 bytes of each half into one 64-bit lane
+    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8); // low half gives bits 0..7, high half bits 8..15
+#endif // #if CV_NEON_AARCH64
+}
+
+inline int v_signmask(const v_int8x16& a) // signed lanes: same bits, reinterpreted as unsigned
+{ return v_signmask(v_reinterpret_as_u8(a)); }
+
+inline int v_signmask(const v_uint16x8& a) // Returns an 8-bit mask: bit i is the most significant bit of lane i.
+{
+#if CV_NEON_AARCH64
+    const int16x8_t signPosition = {0,1,2,3,4,5,6,7}; // per-lane left-shift amounts (the lane index)
+    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), signPosition); // isolate each MSB as 0/1, then move it to bit i
+    uint32_t t0 = vaddlvq_u16(v0); // every lane contributes a distinct bit, so the sum equals the OR of all bits
+    return t0;
+#else // #if CV_NEON_AARCH64
+    int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000)); // shift amounts 0..3 for the lanes of one 64-bit half
+    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0)); // isolate each MSB as 0/1, then move it to bit (i % 4)
+    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0)); // sum the 4 lanes of each half into one 64-bit lane
+    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4); // low half gives bits 0..3, high half bits 4..7
+#endif // #if CV_NEON_AARCH64
+}
+inline int v_signmask(const v_int16x8& a) // signed lanes: same bits, reinterpreted as unsigned
+{ return v_signmask(v_reinterpret_as_u16(a)); }
+
+inline int v_signmask(const v_uint32x4& a) // Returns a 4-bit mask: bit i is the most significant bit of lane i.
+{
+#if CV_NEON_AARCH64
+    const int32x4_t signPosition = {0,1,2,3}; // per-lane left-shift amounts (the lane index)
+    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), signPosition); // isolate each MSB as 0/1, then move it to bit i
+    uint32_t t0 = vaddvq_u32(v0); // every lane contributes a distinct bit, so the sum equals the OR of all bits
+    return t0;
+#else // #if CV_NEON_AARCH64
+    int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000)); // shift amounts 0 and 1 for the lanes of one 64-bit half
+    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0)); // isolate each MSB as 0/1, then move it to bit (i % 2)
+    uint64x2_t v1 = vpaddlq_u32(v0); // sum the 2 lanes of each half into one 64-bit lane
+    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2); // low half gives bits 0..1, high half bits 2..3
+#endif // #if CV_NEON_AARCH64
+}
+inline int v_signmask(const v_int32x4& a) // signed lanes: same bits, reinterpreted as unsigned
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+inline int v_signmask(const v_float32x4& a) // float lanes: the top bit is the IEEE sign bit
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+inline int v_signmask(const v_uint64x2& a) // Returns a 2-bit mask: bit i is the most significant bit of lane i.
+{
+#if CV_NEON_AARCH64
+    const int64x2_t signPosition = {0,1}; // per-lane left-shift amounts (the lane index)
+    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), signPosition); // isolate each MSB as 0/1, then move it to bit i
+    int t0 = (int)vaddvq_u64(v0); // the two lanes contribute distinct bits, so the sum equals their OR
+    return t0;
+#else // #if CV_NEON_AARCH64
+    int64x1_t m0 = vdup_n_s64(0); // shift amount 0 for both lanes; the lane position is applied in the return expression
+    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0)); // each lane becomes 0 or 1
+    return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1); // lane 0 gives bit 0, lane 1 gives bit 1
+#endif // #if CV_NEON_AARCH64
+}
+inline int v_signmask(const v_int64x2& a) // signed lanes: same bits, reinterpreted as unsigned
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+#if CV_SIMD128_64F
+inline int v_signmask(const v_float64x2& a) // double lanes: the top bit is the IEEE sign bit
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+#endif
+
+inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); } // v_scan_forward: trailing-zero count of v_signmask(a), i.e. the index of the lowest lane whose top bit is set
+inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); } // NOTE(review): the result when no lane has its top bit set depends on trailingZeros32(0), defined elsewhere -- confirm
+inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
+#if CV_SIMD128_64F
+inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
+#endif
+
+#if CV_NEON_AARCH64 // AArch64: test the top bit of the across-vector minimum (all) or maximum (any)
+    #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
+    inline bool v_check_all(const v_##_Tpvec& a) \
+    { \
+        return (vminvq_##suffix(a.val) >> shift) != 0; /* the smallest unsigned lane has its top bit set only if every lane does */ \
+    } \
+    inline bool v_check_any(const v_##_Tpvec& a) \
+    { \
+        return (vmaxvq_##suffix(a.val) >> shift) != 0; /* the largest unsigned lane has its top bit set if any lane does */ \
+    }
+#else // #if CV_NEON_AARCH64 -- fallback: reduce every lane to its top bit, then OR the two 64-bit halves
+    #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
+    inline bool v_check_all(const v_##_Tpvec& a) \
+    { \
+        _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); /* invert first: a lane becomes 1 where its top bit was clear */ \
+        uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
+        return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; /* no lane with a clear top bit */ \
+    } \
+    inline bool v_check_any(const v_##_Tpvec& a) \
+    { \
+        _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); /* a lane becomes 1 where its top bit was set */ \
+        uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
+        return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; /* at least one lane with a set top bit */ \
+    }
+#endif // #if CV_NEON_AARCH64
+
+OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7) // shift = lane width - 1, i.e. the position of the top bit
+OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
+OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
+
+inline bool v_check_all(const v_uint64x2& a) // True when the most significant bit is set in both 64-bit lanes.
+{
+    const uint64x2_t top_bits = vshrq_n_u64(a.val, 63); // each lane becomes 0 or 1
+    return vgetq_lane_u64(top_bits, 0) == 1 && vgetq_lane_u64(top_bits, 1) == 1;
+}
+inline bool v_check_any(const v_uint64x2& a) // True when the most significant bit is set in at least one 64-bit lane.
+{
+    const uint64x2_t top_bits = vshrq_n_u64(a.val, 63); // each lane becomes 0 or 1
+    return vgetq_lane_u64(top_bits, 0) != 0 || vgetq_lane_u64(top_bits, 1) != 0;
+}
+
+inline bool v_check_all(const v_int8x16& a) // signed and float overloads forward to the unsigned overload of the same lane width; only the top bit of each lane matters
+{ return v_check_all(v_reinterpret_as_u8(a)); }
+inline bool v_check_all(const v_int16x8& a)
+{ return v_check_all(v_reinterpret_as_u16(a)); }
+inline bool v_check_all(const v_int32x4& a)
+{ return v_check_all(v_reinterpret_as_u32(a)); }
+inline bool v_check_all(const v_float32x4& a)
+{ return v_check_all(v_reinterpret_as_u32(a)); }
+
+inline bool v_check_any(const v_int8x16& a) // same forwarding scheme for v_check_any
+{ return v_check_any(v_reinterpret_as_u8(a)); }
+inline bool v_check_any(const v_int16x8& a)
+{ return v_check_any(v_reinterpret_as_u16(a)); }
+inline bool v_check_any(const v_int32x4& a)
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+inline bool v_check_any(const v_float32x4& a)
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+
+inline bool v_check_all(const v_int64x2& a) // 64-bit lanes forward to the v_uint64x2 overloads
+{ return v_check_all(v_reinterpret_as_u64(a)); }
+inline bool v_check_any(const v_int64x2& a)
+{ return v_check_any(v_reinterpret_as_u64(a)); }
+#if CV_SIMD128_64F
+inline bool v_check_all(const v_float64x2& a)
+{ return v_check_all(v_reinterpret_as_u64(a)); }
+inline bool v_check_any(const v_float64x2& a)
+{ return v_check_any(v_reinterpret_as_u64(a)); }
+#endif
+
+#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) /* defines v_select(mask, a, b); usuffix is the unsigned type of the same lane width */ \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); /* bitwise select: bits of a where mask bits are 1, bits of b elsewhere */ \
+}
+
+OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
+OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
+OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
+OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
+OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
+OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
+OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
+#endif
+
+#if CV_NEON_AARCH64 // AArch64 variant: the upper half is widened directly with vmovl_high_*
+#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); /* lower half of a, each lane widened to twice its size */ \
+    b1.val = vmovl_high_##suffix(a.val); /* upper half of a, widened */ \
+} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
+} \
+inline _Tpwvec v_expand_high(const _Tpvec& a) \
+{ \
+    return _Tpwvec(vmovl_high_##suffix(a.val)); \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); /* loads one 64-bit half vector from ptr and widens it */ \
+}
+#else // fallback variant: the upper half is extracted with vget_high_* before widening
+#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); /* lower half of a, each lane widened to twice its size */ \
+    b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); /* upper half of a, widened */ \
+} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
+} \
+inline _Tpwvec v_expand_high(const _Tpvec& a) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); /* loads one 64-bit half vector from ptr and widens it */ \
+}
+#endif
+
+OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8) // arguments: (source vector, widened vector, element type, NEON suffix)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
+
+inline v_uint32x4 v_load_expand_q(const uchar* ptr) // Reads 4 bytes at ptr and zero-extends each one into a 32-bit lane.
+{
+    typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint; // 1-byte-aligned alias, so ptr needs no 4-byte alignment
+    const uint8x8_t packed = vcreate_u8(*(unaligned_uint*)ptr); // one 32-bit read placed in the low part of a 64-bit vector
+    const uint16x4_t widened = vget_low_u16(vmovl_u8(packed)); // u8 -> u16, keep the 4 lanes that came from memory
+    return v_uint32x4(vmovl_u16(widened)); // u16 -> u32
+}
+
+inline v_int32x4 v_load_expand_q(const schar* ptr) // Reads 4 bytes at ptr and sign-extends each one into a 32-bit lane.
+{
+    typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint; // 1-byte-aligned alias, so ptr needs no 4-byte alignment
+    const int8x8_t packed = vcreate_s8(*(unaligned_uint*)ptr); // one 32-bit read placed in the low part of a 64-bit vector
+    const int16x4_t widened = vget_low_s16(vmovl_s8(packed)); // s8 -> s16, keep the 4 lanes that came from memory
+    return v_int32x4(vmovl_s16(widened)); // s16 -> s32
+}
+
+#if defined(__aarch64__) || defined(_M_ARM64) // 64-bit ARM: v_zip uses the separate vzip1q / vzip2q intrinsics
+#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
+inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
+{ \
+    b0.val = vzip1q_##suffix(a0.val, a1.val); /* interleaved lower halves: a0[0] a1[0] a0[1] a1[1] ... */ \
+    b1.val = vzip2q_##suffix(a0.val, a1.val); /* interleaved upper halves */ \
+} \
+inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); /* (low half of a, low half of b) */ \
+} \
+inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); /* (high half of a, high half of b) */ \
+} \
+inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); /* c = both low halves */ \
+    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); /* d = both high halves */ \
+}
+#else // other targets: v_zip uses vzipq, which returns both interleaved vectors at once
+#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
+inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
+{ \
+    _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
+    b0.val = p.val[0]; /* interleaved lower halves */ \
+    b1.val = p.val[1]; /* interleaved upper halves */ \
+} \
+inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); /* (low half of a, low half of b) */ \
+} \
+inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); /* (high half of a, high half of b) */ \
+} \
+inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); /* c = both low halves */ \
+    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); /* d = both high halves */ \
+}
+#endif
+
+OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8) // the first argument is the vector type name without its v_ prefix
+OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
+OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
+OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
+OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
+#endif
+
+inline v_uint8x16 v_reverse(const v_uint8x16 &a) // Returns a with its 16 lanes in reverse order.
+{
+    const uint8x16_t halves_reversed = vrev64q_u8(a.val); // reverse the lanes inside each 64-bit half
+    return v_uint8x16(vextq_u8(halves_reversed, halves_reversed, 8)); // rotate by 8 lanes to swap the two halves
+}
+
+inline v_int8x16 v_reverse(const v_int8x16 &a) // lane reversal does not depend on signedness: forward to the unsigned overload
+{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
+
+inline v_uint16x8 v_reverse(const v_uint16x8 &a) // Returns a with its 8 lanes in reverse order.
+{
+    const uint16x8_t halves_reversed = vrev64q_u16(a.val); // reverse the lanes inside each 64-bit half
+    return v_uint16x8(vextq_u16(halves_reversed, halves_reversed, 4)); // rotate by 4 lanes to swap the two halves
+}
+
+inline v_int16x8 v_reverse(const v_int16x8 &a) // lane reversal does not depend on signedness: forward to the unsigned overload
+{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
+
+inline v_uint32x4 v_reverse(const v_uint32x4 &a) // Returns a with its 4 lanes in reverse order.
+{
+    const uint32x4_t halves_reversed = vrev64q_u32(a.val); // swap the two lanes inside each 64-bit half
+    return v_uint32x4(vextq_u32(halves_reversed, halves_reversed, 2)); // rotate by 2 lanes to swap the two halves
+}
+
+inline v_int32x4 v_reverse(const v_int32x4 &a) // lane reversal only moves bits: forward to the v_uint32x4 overload
+{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_float32x4 v_reverse(const v_float32x4 &a) // float lanes are reversed through the same 32-bit unsigned path
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_uint64x2 v_reverse(const v_uint64x2 &a) // Returns a with its two 64-bit lanes swapped.
+{
+    // Split the vector into its halves and recombine them in the opposite order.
+    const uint64x1_t upper = vget_high_u64(a.val);
+    const uint64x1_t lower = vget_low_u64(a.val);
+    return v_uint64x2(vcombine_u64(upper, lower));
+}
+
+inline v_int64x2 v_reverse(const v_int64x2 &a) // lane reversal only moves bits: forward to the v_uint64x2 overload
+{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
+
+#if CV_SIMD128_64F
+inline v_float64x2 v_reverse(const v_float64x2 &a) // double lanes are swapped through the same 64-bit unsigned path
+{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
+#endif
+
+#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) /* defines v_extract<s>(a, b) for one vector type */ \
+template <int s> \
+inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); /* lanes s.. of a followed by the first s lanes of b; s is a compile-time lane index */ \
+}
+
+OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
+OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
+OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
+OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
+OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
+OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
+#endif
+
+#define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) /* defines v_extract_n<i>(v): returns lane i of v as a scalar; i is a compile-time lane index */ \
+template<int i> inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); }
+
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int64x2, int64, s64)
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float32x4, float, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float64x2, double, f64)
+#endif
+
+#define OPENCV_HAL_IMPL_NEON_BROADCAST(_Tpvec, _Tp, suffix) /* defines v_broadcast_element<i>(v): reads lane i with v_extract_n and passes it to v_setall_<suffix> (defined elsewhere) */ \
+template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { _Tp t = v_extract_n<i>(v); return v_setall_##suffix(t); }
+
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_int64x2, int64, s64)
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64)
+#endif
+
+#if CV_SIMD128_64F // NOTE(review): CV_SIMD128_64F is used here to select the AArch64 instruction path -- confirm it implies AArch64
+inline v_int32x4 v_round(const v_float32x4& a) // Converts each float lane to the nearest int32.
+{
+    float32x4_t a_ = a.val;
+    int32x4_t result;
+#if defined _MSC_VER
+    result = vcvtnq_s32_f32(a_); // MSVC branch: use the intrinsic instead of GNU-style inline asm
+#else
+    __asm__ ("fcvtns %0.4s, %1.4s" /* FCVTNS: float to signed int, rounding to nearest with ties to even */
+             : "=w"(result)
+             : "w"(a_)
+             : /* No clobbers */);
+#endif
+    return v_int32x4(result);
+}
+#else
+inline v_int32x4 v_round(const v_float32x4& a) // Fallback without a round-to-nearest conversion instruction.
+{
+    // See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007
+    float32x4_t delta = vdupq_n_f32(12582912.0f); // 12582912 = 1.5 * 2^23; NOTE(review): presumably add/subtract leaves a rounded to an integer in the current FP rounding mode, for a limited input range -- see the link above
+    return v_int32x4(vcvtq_s32_f32(vsubq_f32(vaddq_f32(a.val, delta), delta))); // the value is converted with vcvtq after the add/subtract
+}
+#endif
+inline v_int32x4 v_floor(const v_float32x4& a) // Rounds each float lane toward negative infinity and returns int32 lanes.
+{
+    const int32x4_t truncated = vcvtq_s32_f32(a.val); // round toward zero first
+    const uint32x4_t overshoot = vcgtq_f32(vcvtq_f32_s32(truncated), a.val); // all-ones (-1 as s32) where truncation landed above a
+    return v_int32x4(vaddq_s32(truncated, vreinterpretq_s32_u32(overshoot))); // adding -1 corrects exactly those lanes
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a) // Rounds each float lane toward positive infinity and returns int32 lanes.
+{
+    const int32x4_t truncated = vcvtq_s32_f32(a.val); // round toward zero first
+    const uint32x4_t undershoot = vcgtq_f32(a.val, vcvtq_f32_s32(truncated)); // all-ones (-1 as s32) where truncation landed below a
+    return v_int32x4(vsubq_s32(truncated, vreinterpretq_s32_u32(undershoot))); // subtracting -1 adds 1 to exactly those lanes
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a) // Converts each float lane to int32, rounding toward zero (vcvtq_s32_f32).
+{ return v_int32x4(vcvtq_s32_f32(a.val)); }
+
+#if CV_SIMD128_64F
+inline v_int32x4 v_round(const v_float64x2& a) // Rounds the 2 double lanes to nearest; results go to lanes 0 and 1, lanes 2 and 3 are zero.
+{
+    static const int32x2_t zero = vdup_n_s32(0); // fills the upper half of the result
+    return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), zero)); // f64 -> s64 (round to nearest, ties to even), then narrow to s32
+}
+
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) // Rounds 4 doubles to nearest: lanes 0..1 come from a, lanes 2..3 from b.
+{
+    return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), vmovn_s64(vcvtnq_s64_f64(b.val)))); // each half: f64 -> s64 (ties to even), then narrow to s32
+}
+
+inline v_int32x4 v_floor(const v_float64x2& a) // Floors the 2 double lanes into result lanes 0 and 1; lanes 2 and 3 are zero.
+{
+    static const int32x2_t upper_zero = vdup_n_s32(0); // fills the upper half of the result
+    const int64x2_t truncated = vcvtq_s64_f64(a.val); // round toward zero first
+    const uint64x2_t overshoot = vcgtq_f64(vcvtq_f64_s64(truncated), a.val); // all-ones (-1 as s64) where truncation landed above a
+    const int64x2_t floored = vaddq_s64(truncated, vreinterpretq_s64_u64(overshoot)); // adding -1 corrects those lanes
+    return v_int32x4(vcombine_s32(vmovn_s64(floored), upper_zero));
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a) // Ceils the 2 double lanes into result lanes 0 and 1; lanes 2 and 3 are zero.
+{
+    static const int32x2_t upper_zero = vdup_n_s32(0); // fills the upper half of the result
+    const int64x2_t truncated = vcvtq_s64_f64(a.val); // round toward zero first
+    const uint64x2_t undershoot = vcgtq_f64(a.val, vcvtq_f64_s64(truncated)); // all-ones (-1 as s64) where truncation landed below a
+    const int64x2_t ceiled = vsubq_s64(truncated, vreinterpretq_s64_u64(undershoot)); // subtracting -1 adds 1 to those lanes
+    return v_int32x4(vcombine_s32(vmovn_s64(ceiled), upper_zero));
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a) // Truncates the 2 double lanes toward zero into result lanes 0 and 1; lanes 2 and 3 are zero.
+{
+    static const int32x2_t zero = vdup_n_s32(0); // fills the upper half of the result
+    return v_int32x4(vcombine_s32(vmovn_s64(vcvtq_s64_f64(a.val)), zero)); // vcvtq_* rounds toward zero; vcvtaq_* rounds to nearest (ties away), which would turn 1.7 into 2
+}
+#endif
+
+#if CV_NEON_AARCH64 // AArch64: transpose with vtrn1q/vtrn2q; here suffix is only the type letter (u, s, f) and the lane width is pasted on
+#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
+inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
+                         const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
+                         v_##_Tpvec& b0, v_##_Tpvec& b1, \
+                         v_##_Tpvec& b2, v_##_Tpvec& b3) \
+{ \
+    /* -- Pass 1: 64b transpose */ \
+    _Tpvec##_t t0 = vreinterpretq_##suffix##32_##suffix##64( /* t0 = a00 a01 a20 a21 */ \
+                        vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
+                                            vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
+    _Tpvec##_t t1 = vreinterpretq_##suffix##32_##suffix##64( /* t1 = a10 a11 a30 a31 */ \
+                        vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
+                                            vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
+    _Tpvec##_t t2 = vreinterpretq_##suffix##32_##suffix##64( /* t2 = a02 a03 a22 a23 */ \
+                        vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
+                                            vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
+    _Tpvec##_t t3 = vreinterpretq_##suffix##32_##suffix##64( /* t3 = a12 a13 a32 a33 */ \
+                        vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
+                                            vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
+    /* -- Pass 2: 32b transpose */ \
+    b0.val = vtrn1q_##suffix##32(t0, t1); /* a00 a10 a20 a30 */ \
+    b1.val = vtrn2q_##suffix##32(t0, t1); /* a01 a11 a21 a31 */ \
+    b2.val = vtrn1q_##suffix##32(t2, t3); /* a02 a12 a22 a32 */ \
+    b3.val = vtrn2q_##suffix##32(t2, t3); /* a03 a13 a23 a33 */ \
+}
+
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u)
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s)
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f)
+#else // #if CV_NEON_AARCH64 -- fallback: vtrnq on row pairs, then recombine halves; here suffix is the full NEON suffix (u32, s32, f32)
+#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
+inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
+                         const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
+                         v_##_Tpvec& b0, v_##_Tpvec& b1, \
+                         v_##_Tpvec& b2, v_##_Tpvec& b3) \
+{ \
+    /* m00 m01 m02 m03 */ \
+    /* m10 m11 m12 m13 */ \
+    /* m20 m21 m22 m23 */ \
+    /* m30 m31 m32 m33 */ \
+    _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
+    _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
+    /* m00 m10 m02 m12 */ \
+    /* m01 m11 m03 m13 */ \
+    /* m20 m30 m22 m32 */ \
+    /* m21 m31 m23 m33 */ \
+    b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); /* m00 m10 m20 m30 */ \
+    b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); /* m01 m11 m21 m31 */ \
+    b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); /* m02 m12 m22 m32 */ \
+    b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); /* m03 m13 m23 m33 */ \
+}
+
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
+#endif // #if CV_NEON_AARCH64
+
+#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) /* defines 2-, 3- and 4-channel v_load_deinterleave / v_store_interleave on top of vld2q..vld4q and vst2q..vst4q */ \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
+{ \
+    _Tpvec##x2_t v = vld2q_##suffix(ptr); /* 2-element structure load: splits alternating elements into two vectors */ \
+    a.val = v.val[0]; \
+    b.val = v.val[1]; \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
+{ \
+    _Tpvec##x3_t v = vld3q_##suffix(ptr); /* 3-element structure load */ \
+    a.val = v.val[0]; \
+    b.val = v.val[1]; \
+    c.val = v.val[2]; \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
+                                v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    _Tpvec##x4_t v = vld4q_##suffix(ptr); /* 4-element structure load */ \
+    a.val = v.val[0]; \
+    b.val = v.val[1]; \
+    c.val = v.val[2]; \
+    d.val = v.val[3]; \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) /* the store mode argument is accepted but unused */ \
+{ \
+    _Tpvec##x2_t v; \
+    v.val[0] = a.val; \
+    v.val[1] = b.val; \
+    vst2q_##suffix(ptr, v); /* 2-element structure store: writes a[0] b[0] a[1] b[1] ... */ \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    _Tpvec##x3_t v; \
+    v.val[0] = a.val; \
+    v.val[1] = b.val; \
+    v.val[2] = c.val; \
+    vst3q_##suffix(ptr, v); /* 3-element structure store */ \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
+{ \
+    _Tpvec##x4_t v; \
+    v.val[0] = a.val; \
+    v.val[1] = b.val; \
+    v.val[2] = c.val; \
+    v.val[3] = d.val; \
+    vst4q_##suffix(ptr, v); /* 4-element structure store */ \
+}
+
+// Interleaved load/store for 64-bit lanes, built from single-lane vld1 / vst1
+// accesses at consecutive element offsets of ptr (2, 3 and 4 channel variants).
+#define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
+inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
+{ \
+    const tp##x1_t a_lo = vld1_##suffix(ptr); \
+    const tp##x1_t b_lo = vld1_##suffix(ptr + 1); \
+    const tp##x1_t a_hi = vld1_##suffix(ptr + 2); \
+    const tp##x1_t b_hi = vld1_##suffix(ptr + 3); \
+    a = v_##tp##x2(vcombine_##suffix(a_lo, a_hi)); \
+    b = v_##tp##x2(vcombine_##suffix(b_lo, b_hi)); \
+} \
+ \
+inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, v_##tp##x2& c ) \
+{ \
+    const tp##x1_t a_lo = vld1_##suffix(ptr); \
+    const tp##x1_t b_lo = vld1_##suffix(ptr + 1); \
+    const tp##x1_t c_lo = vld1_##suffix(ptr + 2); \
+    const tp##x1_t a_hi = vld1_##suffix(ptr + 3); \
+    const tp##x1_t b_hi = vld1_##suffix(ptr + 4); \
+    const tp##x1_t c_hi = vld1_##suffix(ptr + 5); \
+    a = v_##tp##x2(vcombine_##suffix(a_lo, a_hi)); \
+    b = v_##tp##x2(vcombine_##suffix(b_lo, b_hi)); \
+    c = v_##tp##x2(vcombine_##suffix(c_lo, c_hi)); \
+} \
+ \
+inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, v_##tp##x2& c, v_##tp##x2& d ) \
+{ \
+    const tp##x1_t a_lo = vld1_##suffix(ptr); \
+    const tp##x1_t b_lo = vld1_##suffix(ptr + 1); \
+    const tp##x1_t c_lo = vld1_##suffix(ptr + 2); \
+    const tp##x1_t d_lo = vld1_##suffix(ptr + 3); \
+    const tp##x1_t a_hi = vld1_##suffix(ptr + 4); \
+    const tp##x1_t b_hi = vld1_##suffix(ptr + 5); \
+    const tp##x1_t c_hi = vld1_##suffix(ptr + 6); \
+    const tp##x1_t d_hi = vld1_##suffix(ptr + 7); \
+    a = v_##tp##x2(vcombine_##suffix(a_lo, a_hi)); \
+    b = v_##tp##x2(vcombine_##suffix(b_lo, b_hi)); \
+    c = v_##tp##x2(vcombine_##suffix(c_lo, c_hi)); \
+    d = v_##tp##x2(vcombine_##suffix(d_lo, d_hi)); \
+} \
+ \
+inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    vst1_##suffix(ptr, vget_low_##suffix(a.val));      /* a, low lane  */ \
+    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val));  /* b, low lane  */ \
+    vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); /* a, high lane */ \
+    vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); /* b, high lane */ \
+} \
+ \
+inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
+                                const v_##tp##x2& b, const v_##tp##x2& c, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    vst1_##suffix(ptr, vget_low_##suffix(a.val));      /* a, low lane  */ \
+    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val));  /* b, low lane  */ \
+    vst1_##suffix(ptr + 2, vget_low_##suffix(c.val));  /* c, low lane  */ \
+    vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); /* a, high lane */ \
+    vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); /* b, high lane */ \
+    vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); /* c, high lane */ \
+} \
+ \
+inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
+                                const v_##tp##x2& c, const v_##tp##x2& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    vst1_##suffix(ptr, vget_low_##suffix(a.val));      /* a, low lane  */ \
+    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val));  /* b, low lane  */ \
+    vst1_##suffix(ptr + 2, vget_low_##suffix(c.val));  /* c, low lane  */ \
+    vst1_##suffix(ptr + 3, vget_low_##suffix(d.val));  /* d, low lane  */ \
+    vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); /* a, high lane */ \
+    vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); /* b, high lane */ \
+    vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); /* c, high lane */ \
+    vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); /* d, high lane */ \
+}
+
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8) // 8-bit unsigned lanes
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8) // 8-bit signed lanes
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16) // 16-bit unsigned lanes
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16) // 16-bit signed lanes
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32) // 32-bit unsigned lanes
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32) // 32-bit signed lanes
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32) // single-precision lanes
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64) // only when 64-bit float vectors are enabled
+#endif
+
+OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64) // 64-bit lanes use the vld1 / vst1 based macro
+OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64) // unsigned 64-bit lanes, same macro
+
+// Converts each of the four signed 32-bit lanes of a to single precision
+// with vcvtq_f32_s32.
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{ return v_float32x4(vcvtq_f32_s32(a.val)); }
+
+#if CV_SIMD128_64F
+// Narrows the two doubles of a into lanes 0-1; lanes 2-3 are set to 0.0f.
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{
+    return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vdup_n_f32(0.0f)));
+}
+
+// Narrows two double vectors into one float vector: a supplies lanes 0-1,
+// b supplies lanes 2-3.
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{ return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val))); }
+
+// Converts the low two int32 lanes of a to double. The lanes are widened to
+// int64 first so the result is exact; a float intermediate rounds |x| > 2^24.
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{ return v_float64x2(vcvtq_f64_s64(vmovl_s32(vget_low_s32(a.val)))); }
+
+// Converts the high two int32 lanes of a to double. The lanes are widened to
+// int64 first so the result is exact; a float intermediate rounds |x| > 2^24.
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{ return v_float64x2(vcvtq_f64_s64(vmovl_s32(vget_high_s32(a.val)))); }
+
+// Widens the low two float lanes of a to double.
+// The high two lanes are handled by v_cvt_f64_high.
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{ return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val))); }
+
+// Widens the high two float lanes of a to double; the counterpart of
+// v_cvt_f64, which takes the low two lanes.
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{ return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val))); }
+
+// Converts both signed 64-bit lanes of a to double (vcvtq_f64_s64).
+inline v_float64x2 v_cvt_f64(const v_int64x2& a) { return v_float64x2(vcvtq_f64_s64(a.val)); }
+
+#endif
+
+////////////// Lookup table access ////////////////////
+
+// Gathers sixteen table entries, tab[idx[0]] .. tab[idx[15]], into an aligned
+// scratch array and loads that array as a single vector.
+inline v_int8x16 v_lut(const schar* tab, const int* idx)
+{
+    schar CV_DECL_ALIGNED(32) gathered[16];
+    gathered[ 0] = tab[idx[ 0]];
+    gathered[ 1] = tab[idx[ 1]];
+    gathered[ 2] = tab[idx[ 2]];
+    gathered[ 3] = tab[idx[ 3]];
+    gathered[ 4] = tab[idx[ 4]];
+    gathered[ 5] = tab[idx[ 5]];
+    gathered[ 6] = tab[idx[ 6]];
+    gathered[ 7] = tab[idx[ 7]];
+    gathered[ 8] = tab[idx[ 8]];
+    gathered[ 9] = tab[idx[ 9]];
+    gathered[10] = tab[idx[10]];
+    gathered[11] = tab[idx[11]];
+    gathered[12] = tab[idx[12]];
+    gathered[13] = tab[idx[13]];
+    gathered[14] = tab[idx[14]];
+    gathered[15] = tab[idx[15]];
+    return v_int8x16(vld1q_s8(gathered));
+}
+// For each of idx[0] .. idx[7], takes the entry at that index and the one
+// right after it, producing eight two-byte pairs in index order.
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
+{
+    schar CV_DECL_ALIGNED(32) pairs[16];
+    pairs[ 0] = tab[idx[0]];
+    pairs[ 1] = tab[idx[0] + 1];
+    pairs[ 2] = tab[idx[1]];
+    pairs[ 3] = tab[idx[1] + 1];
+    pairs[ 4] = tab[idx[2]];
+    pairs[ 5] = tab[idx[2] + 1];
+    pairs[ 6] = tab[idx[3]];
+    pairs[ 7] = tab[idx[3] + 1];
+    pairs[ 8] = tab[idx[4]];
+    pairs[ 9] = tab[idx[4] + 1];
+    pairs[10] = tab[idx[5]];
+    pairs[11] = tab[idx[5] + 1];
+    pairs[12] = tab[idx[6]];
+    pairs[13] = tab[idx[6] + 1];
+    pairs[14] = tab[idx[7]];
+    pairs[15] = tab[idx[7] + 1];
+    return v_int8x16(vld1q_s8(pairs));
+}
+// For each of idx[0] .. idx[3], takes four consecutive entries starting at
+// that index, producing four four-byte groups in index order.
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
+{
+    schar CV_DECL_ALIGNED(32) quads[16];
+    quads[ 0] = tab[idx[0]];
+    quads[ 1] = tab[idx[0] + 1];
+    quads[ 2] = tab[idx[0] + 2];
+    quads[ 3] = tab[idx[0] + 3];
+    quads[ 4] = tab[idx[1]];
+    quads[ 5] = tab[idx[1] + 1];
+    quads[ 6] = tab[idx[1] + 2];
+    quads[ 7] = tab[idx[1] + 3];
+    quads[ 8] = tab[idx[2]];
+    quads[ 9] = tab[idx[2] + 1];
+    quads[10] = tab[idx[2] + 2];
+    quads[11] = tab[idx[2] + 3];
+    quads[12] = tab[idx[3]];
+    quads[13] = tab[idx[3] + 1];
+    quads[14] = tab[idx[3] + 2];
+    quads[15] = tab[idx[3] + 3];
+    return v_int8x16(vld1q_s8(quads));
+}
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut(reinterpret_cast<const schar*>(tab), idx)); } // unsigned view of the schar gather; tab stays const
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs(reinterpret_cast<const schar*>(tab), idx)); } // unsigned view of the schar pair gather
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads(reinterpret_cast<const schar*>(tab), idx)); } // unsigned view of the schar quad gather
+
+// Gathers eight table entries, tab[idx[0]] .. tab[idx[7]], into an aligned
+// scratch array and loads that array as a single vector.
+inline v_int16x8 v_lut(const short* tab, const int* idx)
+{
+    short CV_DECL_ALIGNED(32) gathered[8];
+    gathered[0] = tab[idx[0]];
+    gathered[1] = tab[idx[1]];
+    gathered[2] = tab[idx[2]];
+    gathered[3] = tab[idx[3]];
+    gathered[4] = tab[idx[4]];
+    gathered[5] = tab[idx[5]];
+    gathered[6] = tab[idx[6]];
+    gathered[7] = tab[idx[7]];
+    return v_int16x8(vld1q_s16(gathered));
+}
+// For each of idx[0] .. idx[3], takes the entry at that index and the one
+// right after it, producing four pairs in index order.
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
+{
+    short CV_DECL_ALIGNED(32) pairs[8];
+    pairs[0] = tab[idx[0]];
+    pairs[1] = tab[idx[0] + 1];
+    pairs[2] = tab[idx[1]];
+    pairs[3] = tab[idx[1] + 1];
+    pairs[4] = tab[idx[2]];
+    pairs[5] = tab[idx[2] + 1];
+    pairs[6] = tab[idx[3]];
+    pairs[7] = tab[idx[3] + 1];
+    return v_int16x8(vld1q_s16(pairs));
+}
+// Two 4-lane loads: the shorts starting at tab[idx[0]] fill the low half and
+// those starting at tab[idx[1]] fill the high half.
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
+{ return v_int16x8(vcombine_s16(vld1_s16(tab + idx[0]), vld1_s16(tab + idx[1]))); }
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut(reinterpret_cast<const short*>(tab), idx)); } // unsigned view of the short gather; tab stays const
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs(reinterpret_cast<const short*>(tab), idx)); } // unsigned view of the short pair gather
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads(reinterpret_cast<const short*>(tab), idx)); } // unsigned view of the short quad gather
+
+// Gathers tab[idx[0]] .. tab[idx[3]] into an aligned scratch array and loads
+// that array as a single vector.
+inline v_int32x4 v_lut(const int* tab, const int* idx)
+{
+    int CV_DECL_ALIGNED(32) gathered[4];
+    gathered[0] = tab[idx[0]];
+    gathered[1] = tab[idx[1]];
+    gathered[2] = tab[idx[2]];
+    gathered[3] = tab[idx[3]];
+    return v_int32x4(vld1q_s32(gathered));
+}
+// Two 2-lane loads: tab[idx[0]], tab[idx[0]+1] fill lanes 0-1 and
+// tab[idx[1]], tab[idx[1]+1] fill lanes 2-3.
+inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
+{ return v_int32x4(vcombine_s32(vld1_s32(tab + idx[0]), vld1_s32(tab + idx[1]))); }
+// Loads the four consecutive ints that start at tab[idx[0]]; only idx[0]
+// is read.
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
+{ return v_int32x4(vld1q_s32(tab + idx[0])); }
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut(reinterpret_cast<const int*>(tab), idx)); } // unsigned view of the int gather; tab stays const
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs(reinterpret_cast<const int*>(tab), idx)); } // unsigned view of the int pair gather
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads(reinterpret_cast<const int*>(tab), idx)); } // unsigned view of the int quad load
+
+// Builds a two-lane vector from tab[idx[0]] (low lane) and tab[idx[1]]
+// (high lane).
+inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
+{ return v_int64x2(vcombine_s64(vcreate_s64(tab[idx[0]]), vcreate_s64(tab[idx[1]]))); }
+// Loads the two consecutive 64-bit entries that start at tab[idx[0]]; only
+// idx[0] is read.
+inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
+{ return v_int64x2(vld1q_s64(tab + idx[0])); }
+inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut(reinterpret_cast<const int64_t*>(tab), idx)); } // unsigned view of the int64 gather
+inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs(reinterpret_cast<const int64_t*>(tab), idx)); } // unsigned view of the int64 pair load
+
+// Gathers tab[idx[0]] .. tab[idx[3]] into an aligned scratch array and loads
+// that array as a single vector.
+inline v_float32x4 v_lut(const float* tab, const int* idx)
+{
+    float CV_DECL_ALIGNED(32) gathered[4];
+    gathered[0] = tab[idx[0]];
+    gathered[1] = tab[idx[1]];
+    gathered[2] = tab[idx[2]];
+    gathered[3] = tab[idx[3]];
+    return v_float32x4(vld1q_f32(gathered));
+}
+// Loads the pairs (tab[idx[0]], tab[idx[0]+1]) and (tab[idx[1]], tab[idx[1]+1])
+// into lanes 0-1 and 2-3 respectively.
+// Uses element-typed 2-lane NEON loads, as the int overload of v_lut_pairs
+// does, instead of reading the floats through a uint64 lvalue: that cast
+// accessed float storage as an unrelated integer type.
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
+{
+    const float32x2_t lo = vld1_f32(tab + idx[0]);
+    const float32x2_t hi = vld1_f32(tab + idx[1]);
+    return v_float32x4(vcombine_f32(lo, hi));
+}
+// Loads the four consecutive floats that start at tab[idx[0]]; only idx[0]
+// is read.
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
+{ return v_float32x4(vld1q_f32(tab + idx[0])); }
+
+// Gathers four ints from tab, taking the indices from the lanes of idxvec
+// instead of from an index array.
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+    int CV_DECL_ALIGNED(32) gathered[4];
+    gathered[0] = tab[vgetq_lane_s32(idxvec.val, 0)];
+    gathered[1] = tab[vgetq_lane_s32(idxvec.val, 1)];
+    gathered[2] = tab[vgetq_lane_s32(idxvec.val, 2)];
+    gathered[3] = tab[vgetq_lane_s32(idxvec.val, 3)];
+    return v_int32x4(vld1q_s32(gathered));
+}
+
+// Gathers four unsigned values from tab, taking the indices from the lanes of
+// idxvec instead of from an index array.
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
+{
+    unsigned CV_DECL_ALIGNED(32) gathered[4];
+    gathered[0] = tab[vgetq_lane_s32(idxvec.val, 0)];
+    gathered[1] = tab[vgetq_lane_s32(idxvec.val, 1)];
+    gathered[2] = tab[vgetq_lane_s32(idxvec.val, 2)];
+    gathered[3] = tab[vgetq_lane_s32(idxvec.val, 3)];
+    return v_uint32x4(vld1q_u32(gathered));
+}
+
+// Gathers four floats from tab, taking the indices from the lanes of idxvec
+// instead of from an index array.
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+    float CV_DECL_ALIGNED(32) gathered[4];
+    gathered[0] = tab[vgetq_lane_s32(idxvec.val, 0)];
+    gathered[1] = tab[vgetq_lane_s32(idxvec.val, 1)];
+    gathered[2] = tab[vgetq_lane_s32(idxvec.val, 2)];
+    gathered[3] = tab[vgetq_lane_s32(idxvec.val, 3)];
+    return v_float32x4(vld1q_f32(gathered));
+}
+
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) // x[i] = tab[idx[i]], y[i] = tab[idx[i] + 1]
+{ // NOTE(review): the commented-out body below is a disabled variant; why it was disabled is not recorded here
+    /*int CV_DECL_ALIGNED(32) idx[4];
+    v_store(idx, idxvec);
+
+    float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2]));
+    float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3]));
+
+    float32x4x2_t xxyy = vuzpq_f32(xy02, xy13);
+    x = v_float32x4(xxyy.val[0]);
+    y = v_float32x4(xxyy.val[1]);*/
+    int CV_DECL_ALIGNED(32) idx[4]; // scalar copy of the four indices
+    v_store_aligned(idx, idxvec);
+
+    x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); // entry at each index
+    y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]); // entry right after each index
+}
+
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) // same byte shuffle applied to the low and the high 8-byte half
+{ // table constant read low byte first: 0,2,1,3,4,6,5,7 (assumes vcreate_s8 maps the lowest byte to lane 0 - confirm against ACLE)
+    return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0705060403010200)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0705060403010200))));
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } // unsigned wrapper over the signed overload
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec) // same byte shuffle applied to the low and the high 8-byte half
+{ // table constant read low byte first: 0,4,1,5,2,6,3,7 (assumes vcreate_s8 maps the lowest byte to lane 0 - confirm against ACLE)
+    return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0703060205010400)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0703060205010400))));
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } // unsigned wrapper over the signed overload
+
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) // done as a byte-table shuffle on the vector reinterpreted as s8
+{ // table constant read low byte first: 0,1,4,5,2,3,6,7 - i.e. 16-bit elements 0,2,1,3 of each half (assumes low byte maps to lane 0)
+    return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)))));
+}
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } // unsigned wrapper over the signed overload
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec) // zips the low half of vec with its high half
+{
+    int16x4x2_t res = vzip_s16(vget_low_s16(vec.val), vget_high_s16(vec.val)); // two zipped 4-lane results
+    return v_int16x8(vcombine_s16(res.val[0], res.val[1])); // first zip result becomes the low half
+}
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } // unsigned wrapper over the signed overload
+
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec) // zips the low half of vec with its high half
+{
+    int32x2x2_t res = vzip_s32(vget_low_s32(vec.val), vget_high_s32(vec.val)); // two zipped 2-lane results
+    return v_int32x4(vcombine_s32(res.val[0], res.val[1])); // first zip result becomes the low half
+}
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // unsigned wrapper over the signed overload
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // float wrapper: lanes are shuffled as raw 32-bit values
+
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec) // each half is table-shuffled with its own constant, recombined, then passed through vextq_s8 with a zero vector and offset 2
+{ // NOTE(review): presumably drops every 4th byte so that 3-byte groups become contiguous - verify against the generic v_pack_triplets contract
+    return v_int8x16(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0605040201000000)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0807060504020100))), vdupq_n_s8(0), 2));
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } // unsigned wrapper over the signed overload
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec) // only the low half is table-shuffled; the high half is passed through, then vextq_s8 with a zero vector and offset 2
+{ // NOTE(review): presumably drops the 4th 16-bit element so that 3-element groups become contiguous - verify against the generic contract
+    return v_int16x8(vreinterpretq_s16_s8(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0504030201000000)), vget_high_s8(vreinterpretq_s8_s16(vec.val))), vdupq_n_s8(0), 2)));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } // unsigned wrapper over the signed overload
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } // 32-bit lanes: input is returned unchanged
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } // 32-bit lanes: input is returned unchanged
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } // 32-bit lanes: input is returned unchanged
+
+#if CV_SIMD128_64F
+// Gathers tab[idx[0]] and tab[idx[1]] into an aligned scratch array and loads
+// that array as a single vector.
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{
+    double CV_DECL_ALIGNED(32) gathered[2];
+    gathered[0] = tab[idx[0]];
+    gathered[1] = tab[idx[1]];
+    return v_float64x2(vld1q_f64(gathered));
+}
+
+// Loads the two consecutive doubles that start at tab[idx[0]]; only idx[0]
+// is read.
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
+{ return v_float64x2(vld1q_f64(tab + idx[0])); }
+
+// Gathers two doubles from tab using lanes 0 and 1 of idxvec as indices;
+// lanes 2 and 3 of idxvec are not read.
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    double CV_DECL_ALIGNED(32) gathered[2];
+    gathered[0] = tab[vgetq_lane_s32(idxvec.val, 0)];
+    gathered[1] = tab[vgetq_lane_s32(idxvec.val, 1)];
+    return v_float64x2(vld1q_f64(gathered));
+}
+
+// x receives tab[i0], tab[i1] and y receives tab[i0+1], tab[i1+1], where i0 and i1 are lanes 0 and 1 of idxvec.
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    int CV_DECL_ALIGNED(32) lane_idx[4];
+    v_store_aligned(lane_idx, idxvec);
+    x = v_float64x2(tab[lane_idx[0]], tab[lane_idx[1]]);
+    y = v_float64x2(tab[lane_idx[0] + 1], tab[lane_idx[1] + 1]);
+}
+#endif
+
+////// FP16 support ///////
+#if CV_FP16
+inline v_float32x4 v_load_expand(const hfloat* ptr) // loads four half-precision values and widens them to float
+{
+    float16x4_t v =
+    #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
+        (float16x4_t)vld1_s16((const short*)ptr); // no vld1_f16 macro: load as 16-bit integers, then cast the vector type
+    #else
+        vld1_f16((const __fp16*)ptr); // vld1_f16 macro available: load directly
+    #endif
+    return v_float32x4(vcvt_f32_f16(v)); // half -> single precision
+}
+
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v) // narrows four floats to half precision and stores them at ptr
+{
+    float16x4_t hv = vcvt_f16_f32(v.val); // single -> half precision
+
+    #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
+        vst1_s16((short*)ptr, (int16x4_t)hv); // no vst1_f16 macro: cast the vector type and store as 16-bit integers
+    #else
+        vst1_f16((__fp16*)ptr, hv); // vst1_f16 macro available: store directly
+    #endif
+}
+#else
+// Fallback without CV_FP16: converts four hfloat values to float one by one, then loads them as a vector.
+inline v_float32x4 v_load_expand(const hfloat* ptr)
+{
+    float widened[4];
+    for( int lane = 0; lane < 4; lane++ ) widened[lane] = (float)ptr[lane];
+    return v_load(widened);
+}
+
+// Fallback without CV_FP16: spills v to a float buffer and converts each lane to hfloat on the way to ptr.
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
+{
+    float lanes[4];
+    v_store(lanes, v);
+    for( int lane = 0; lane < 4; lane++ ) ptr[lane] = hfloat(lanes[lane]);
+}
+#endif
+
+inline void v_cleanup() {} // empty body: nothing is done in this backend
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv.hpp
new file mode 100644
index 000000000000..d446a05db5c2
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv.hpp
@@ -0,0 +1,3345 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// The original implementation has been contributed by Yin Zhang.
+// Copyright (C) 2020, Institute of Software, Chinese Academy of Sciences.
+
+#ifndef OPENCV_HAL_INTRIN_RVV_HPP
+#define OPENCV_HAL_INTRIN_RVV_HPP
+
+#include <algorithm>
+
+// RVV intrinsics have been renamed in version 0.11, so we need to include
+// compatibility headers:
+// https://github.com/riscv-non-isa/rvv-intrinsic-doc/tree/master/auto-generated/rvv-v0p10-compatible-headers
+#if defined(__riscv_v_intrinsic) &&  __riscv_v_intrinsic>10999
+#include "intrin_rvv_010_compat_non-policy.hpp"
+#include "intrin_rvv_010_compat_overloaded-non-policy.hpp"
+#endif
+
+
+// Building for T-Head C906 core with RVV 0.7.1 using toolchain
+// https://github.com/T-head-Semi/xuantie-gnu-toolchain
+// with option '-march=rv64gcv0p7'
+#ifdef __THEAD_VERSION__
+#   if __riscv_v == 7000
+#       include <fenv.h>
+#       define CV_RVV_THEAD_0_7
+#   endif
+#endif
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#define CV_SIMD128 1 // 128-bit universal intrinsics are provided by this header
+#ifndef CV_RVV_THEAD_0_7
+#   define CV_SIMD128_64F 1 // double-precision vector types are enabled
+#else
+#   define CV_SIMD128_64F 0 // switched off when CV_RVV_THEAD_0_7 (T-Head, __riscv_v == 7000) is set
+#endif
+
+//////////// Unsupported native intrinsics in C++ ////////////
+// The following types have been defined in clang, but not in GCC yet.
+#ifndef __clang__
+
+// Plain-array stand-in used on non-clang builds: eight uchar lanes.
+struct vuint8mf2_t
+{
+    uchar val[8] = {0};
+    vuint8mf2_t() {}
+    // Copies eight lanes starting at ptr.
+    vuint8mf2_t(const uchar* ptr)
+    {
+        for (int lane = 0; lane != 8; ++lane)
+            val[lane] = ptr[lane];
+    }
+};
+// Plain-array stand-in used on non-clang builds: eight schar lanes.
+struct vint8mf2_t
+{
+    schar val[8] = {0};
+    vint8mf2_t() {}
+    // Copies eight lanes starting at ptr.
+    vint8mf2_t(const schar* ptr)
+    {
+        for (int lane = 0; lane != 8; ++lane)
+            val[lane] = ptr[lane];
+    }
+};
+// Plain-array stand-in used on non-clang builds: four ushort lanes.
+struct vuint16mf2_t
+{
+    ushort val[4] = {0};
+    vuint16mf2_t() {}
+    // Copies four lanes starting at ptr.
+    vuint16mf2_t(const ushort* ptr)
+    {
+        for (int lane = 0; lane != 4; ++lane)
+            val[lane] = ptr[lane];
+    }
+};
+// Plain-array stand-in used on non-clang builds: four short lanes.
+struct vint16mf2_t
+{
+    short val[4] = {0};
+    vint16mf2_t() {}
+    // Copies four lanes starting at ptr.
+    vint16mf2_t(const short* ptr)
+    {
+        for (int lane = 0; lane != 4; ++lane)
+            val[lane] = ptr[lane];
+    }
+};
+// Plain-array stand-in used on non-clang builds: two unsigned lanes.
+struct vuint32mf2_t
+{
+    unsigned val[2] = {0};
+    vuint32mf2_t() {}
+    vuint32mf2_t(const unsigned* ptr) // copies both lanes from ptr
+    {
+        for (int lane = 0; lane != 2; ++lane) val[lane] = ptr[lane];
+    }
+};
+// Plain-array stand-in used on non-clang builds: two int lanes.
+struct vint32mf2_t
+{
+    int val[2] = {0};
+    vint32mf2_t() {}
+    vint32mf2_t(const int* ptr) // copies both lanes from ptr
+    {
+        for (int lane = 0; lane != 2; ++lane) val[lane] = ptr[lane];
+    }
+};
+// Plain-array stand-in used on non-clang builds: two float lanes.
+struct vfloat32mf2_t
+{
+    float val[2] = {0};
+    vfloat32mf2_t() {}
+    vfloat32mf2_t(const float* ptr) // copies both lanes from ptr
+    {
+        for (int lane = 0; lane != 2; ++lane) val[lane] = ptr[lane];
+    }
+};
+// Plain-array stand-in used on non-clang builds.
+// Holds a single uint64 lane.
+struct vuint64mf2_t
+{
+    uint64 val[1] = {0};
+    vuint64mf2_t() {}
+    // Reads the one lane from *ptr.
+    vuint64mf2_t(const uint64* ptr) { val[0] = *ptr; }
+};
+// Plain-array stand-in used on non-clang builds.
+// Holds a single int64 lane.
+struct vint64mf2_t
+{
+    int64 val[1] = {0};
+    vint64mf2_t() {}
+    // Reads the one lane from *ptr.
+    vint64mf2_t(const int64* ptr) { val[0] = *ptr; }
+};
+// Plain-array stand-in used on non-clang builds.
+// Holds a single double lane.
+struct vfloat64mf2_t
+{
+    double val[1] = {0};
+    vfloat64mf2_t() {}
+    // Reads the one lane from *ptr.
+    vfloat64mf2_t(const double* ptr) { val[0] = *ptr; }
+};
+// Plain-array stand-in used on non-clang builds: four uchar lanes.
+struct vuint8mf4_t
+{
+    uchar val[4] = {0};
+    vuint8mf4_t() {}
+    // Copies four lanes starting at ptr.
+    vuint8mf4_t(const uchar* ptr)
+    {
+        for (int lane = 0; lane != 4; ++lane)
+            val[lane] = ptr[lane];
+    }
+};
+// Plain-array stand-in used on non-clang builds: four schar lanes.
+struct vint8mf4_t
+{
+    schar val[4] = {0};
+    vint8mf4_t() {}
+    // Copies four lanes starting at ptr.
+    vint8mf4_t(const schar* ptr)
+    {
+        for (int lane = 0; lane != 4; ++lane)
+            val[lane] = ptr[lane];
+    }
+};
+
+// Emulated load/store for the array-backed fractional types: the load builds the
+// struct from the pointer, the store copies a fixed lane count; vl is unused.
+#define OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(_Tpvec, _Tp, suffix, width, n) \
+inline _Tpvec vle##width##_v_##suffix##mf2(const _Tp* ptr, size_t vl) \
+{ \
+    CV_UNUSED(vl); \
+    return _Tpvec(ptr); \
+} \
+inline void vse##width##_v_##suffix##mf2(_Tp* ptr, _Tpvec v, size_t vl) \
+{ \
+    CV_UNUSED(vl); \
+    for (int lane = 0; lane < n; ++lane) \
+        ptr[lane] = v.val[lane]; \
+}
+
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint8mf2_t, uint8_t, u8, 8, 8) // vle8_v_u8mf2 / vse8_v_u8mf2, 8 lanes
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint8mf2_t, int8_t, i8, 8, 8) // vle8_v_i8mf2 / vse8_v_i8mf2, 8 lanes
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint16mf2_t, uint16_t, u16, 16, 4) // 16-bit unsigned, 4 lanes
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint16mf2_t, int16_t, i16, 16, 4) // 16-bit signed, 4 lanes
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint32mf2_t, uint32_t, u32, 32, 2) // 32-bit unsigned, 2 lanes
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint32mf2_t, int32_t, i32, 32, 2) // 32-bit signed, 2 lanes
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat32mf2_t, float32_t, f32, 32, 2) // 32-bit float, 2 lanes
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint64mf2_t, uint64_t, u64, 64, 1) // 64-bit unsigned, 1 lane
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint64mf2_t, int64_t, i64, 64, 1) // 64-bit signed, 1 lane
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat64mf2_t, float64_t, f64, 64, 1) // 64-bit float, 1 lane
+
+
+// Emulated widening conversion for the array-backed types: every lane is cast to
+// the wider scalar type, then the buffer is loaded with the m1 load of that width.
+#define OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(_Tpwvec, _Tpvec, _wTp, wcvt, suffix, width, n) \
+inline _Tpwvec wcvt (_Tpvec v, size_t vl) \
+{ \
+    _wTp widened[n]; \
+    for (int lane = 0; lane < n; ++lane) \
+        widened[lane] = (_wTp)v.val[lane]; \
+    return vle##width##_v_##suffix##m1(widened, vl); \
+}
+
+OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint16m1_t, vuint8mf2_t, ushort, vwcvtu_x_x_v_u16m1, u16, 16, 8) // 8 uchar lanes -> ushort
+OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint16m1_t, vint8mf2_t, short, vwcvt_x_x_v_i16m1, i16, 16, 8) // 8 schar lanes -> short
+OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint32m1_t, vuint16mf2_t, unsigned, vwcvtu_x_x_v_u32m1, u32, 32, 4) // 4 ushort lanes -> unsigned
+OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint32m1_t, vint16mf2_t, int, vwcvt_x_x_v_i32m1, i32, 32, 4) // 4 short lanes -> int
+OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint64m1_t, vuint32mf2_t, uint64, vwcvtu_x_x_v_u64m1, u64, 64, 2) // 2 unsigned lanes -> uint64
+OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint64m1_t, vint32mf2_t, int64, vwcvt_x_x_v_i64m1, i64, 64, 2) // 2 int lanes -> int64
+
+// Emulated load for the array-backed vuint8mf4_t: builds the struct from base
+// (its constructor copies four lanes); the vector length argument is not used.
+inline vuint8mf4_t vle8_v_u8mf4 (const uint8_t *base, size_t /*vl*/)
+{ return vuint8mf4_t(base);
+}
+// Emulated load for the array-backed vint8mf4_t: builds the struct from base
+// (its constructor copies four lanes); the vector length argument is not used.
+inline vint8mf4_t vle8_v_i8mf4 (const int8_t *base, size_t /*vl*/)
+{ return vint8mf4_t(base);
+}
+
+// Emulated widening conversion: casts each of the four uchar lanes of src to
+// ushort and reloads the buffer through vle16_v_u16mf2.
+inline vuint16mf2_t vwcvtu_x_x_v_u16mf2 (vuint8mf4_t src, size_t vl)
+{
+    ushort widened[4];
+    for (int lane = 0; lane < 4; ++lane)
+        widened[lane] = (ushort)src.val[lane];
+    return vle16_v_u16mf2(widened, vl);
+}
+// Emulated widening conversion: casts each of the four schar lanes of src to
+// short and reloads the buffer through vle16_v_i16mf2.
+inline vint16mf2_t vwcvt_x_x_v_i16mf2 (vint8mf4_t src, size_t vl)
+{
+    short widened[4];
+    for (int lane = 0; lane < 4; ++lane)
+        widened[lane] = (short)src.val[lane];
+    return vle16_v_i16mf2(widened, vl);
+}
+#endif
+
+//////////// Types ////////////
+
+#ifndef __clang__
+// Non-clang build: the sixteen uchar lanes live in a plain array and are moved
+// to or from a vuint8m1_t with explicit stores/loads of nlanes elements.
+struct v_uint8x16
+{
+    typedef uchar lane_type;
+    enum { nlanes = 16 };
+
+    v_uint8x16() {}
+    // Stores the lanes of a native register into val.
+    explicit v_uint8x16(vuint8m1_t v)
+    {
+        vse8_v_u8m1(val, v, nlanes);
+    }
+    // Fills the lanes from sixteen scalars, v0 first.
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        const uchar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        for (int k = 0; k < nlanes; ++k)
+            val[k] = lanes[k];
+    }
+    // Loads val back into a native register.
+    operator vuint8m1_t() const
+    {
+        return vle8_v_u8m1(val, nlanes);
+    }
+    uchar get0() const { return val[0]; } // lane 0
+
+    uchar val[16];
+};
+
+// Non-clang build: the sixteen schar lanes live in a plain array and are moved
+// to or from a vint8m1_t with explicit stores/loads of nlanes elements.
+struct v_int8x16
+{
+    typedef schar lane_type;
+    enum { nlanes = 16 };
+
+    v_int8x16() {}
+    // Stores the lanes of a native register into val.
+    explicit v_int8x16(vint8m1_t v)
+    {
+        vse8_v_i8m1(val, v, nlanes);
+    }
+    // Fills the lanes from sixteen scalars, v0 first.
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        const schar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        for (int k = 0; k < nlanes; ++k)
+            val[k] = lanes[k];
+    }
+    // Loads val back into a native register.
+    operator vint8m1_t() const
+    {
+        return vle8_v_i8m1(val, nlanes);
+    }
+    schar get0() const { return val[0]; } // lane 0
+
+    schar val[16];
+};
+
+// Non-clang build: the eight ushort lanes live in a plain array and are moved
+// to or from a vuint16m1_t with explicit stores/loads of nlanes elements.
+struct v_uint16x8
+{
+    typedef ushort lane_type;
+    enum { nlanes = 8 };
+
+    v_uint16x8() {}
+    // Stores the lanes of a native register into val.
+    explicit v_uint16x8(vuint16m1_t v)
+    {
+        vse16_v_u16m1(val, v, nlanes);
+    }
+    // Fills the lanes from eight scalars, v0 first.
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        const ushort lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        for (int k = 0; k < nlanes; ++k)
+            val[k] = lanes[k];
+    }
+    // Loads val back into a native register.
+    operator vuint16m1_t() const
+    {
+        return vle16_v_u16m1(val, nlanes);
+    }
+    ushort get0() const { return val[0]; } // lane 0
+
+    ushort val[8];
+};
+
+// Non-clang build: the eight short lanes live in a plain array and are moved
+// to or from a vint16m1_t with explicit stores/loads of nlanes elements.
+struct v_int16x8
+{
+    typedef short lane_type;
+    enum { nlanes = 8 };
+
+    v_int16x8() {}
+    // Stores the lanes of a native register into val.
+    explicit v_int16x8(vint16m1_t v)
+    {
+        vse16_v_i16m1(val, v, nlanes);
+    }
+    // Fills the lanes from eight scalars, v0 first.
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        const short lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        for (int k = 0; k < nlanes; ++k)
+            val[k] = lanes[k];
+    }
+    // Loads val back into a native register.
+    operator vint16m1_t() const
+    {
+        return vle16_v_i16m1(val, nlanes);
+    }
+    short get0() const { return val[0]; } // lane 0
+
+    short val[8];
+};
+
+// Non-clang build: the four unsigned lanes live in a plain array and are moved
+// to or from a vuint32m1_t with explicit stores/loads of nlanes elements.
+struct v_uint32x4
+{
+    typedef unsigned lane_type;
+    enum { nlanes = 4 };
+
+    v_uint32x4() {}
+    // Stores the lanes of a native register into val.
+    explicit v_uint32x4(vuint32m1_t v)
+    {
+        vse32_v_u32m1(val, v, nlanes);
+    }
+    // Fills the lanes from four scalars, v0 first.
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    {
+        const unsigned lanes[] = {v0, v1, v2, v3};
+        for (int k = 0; k < nlanes; ++k)
+            val[k] = lanes[k];
+    }
+    // Loads val back into a native register.
+    operator vuint32m1_t() const
+    {
+        return vle32_v_u32m1(val, nlanes);
+    }
+    unsigned get0() const { return val[0]; } // lane 0
+
+    unsigned val[4];
+};
+
+// Non-clang build: the four int lanes live in a plain array and are moved
+// to or from a vint32m1_t with explicit stores/loads of nlanes elements.
+struct v_int32x4
+{
+    typedef int lane_type;
+    enum { nlanes = 4 };
+
+    v_int32x4() {}
+    // Stores the lanes of a native register into val.
+    explicit v_int32x4(vint32m1_t v)
+    {
+        vse32_v_i32m1(val, v, nlanes);
+    }
+    // Fills the lanes from four scalars, v0 first.
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        const int lanes[] = {v0, v1, v2, v3};
+        for (int k = 0; k < nlanes; ++k)
+            val[k] = lanes[k];
+    }
+    // Loads val back into a native register.
+    operator vint32m1_t() const
+    {
+        return vle32_v_i32m1(val, nlanes);
+    }
+    int get0() const { return val[0]; } // lane 0
+    int val[4];
+};
+
+// Non-clang build: the four float lanes live in a plain array and are moved
+// to or from a vfloat32m1_t with explicit stores/loads of nlanes elements.
+struct v_float32x4
+{
+    typedef float lane_type;
+    enum { nlanes = 4 };
+
+    v_float32x4() {}
+    // Stores the lanes of a native register into val.
+    explicit v_float32x4(vfloat32m1_t v)
+    {
+        vse32_v_f32m1(val, v, nlanes);
+    }
+    // Fills the lanes from four scalars, v0 first.
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        const float lanes[] = {v0, v1, v2, v3};
+        for (int k = 0; k < nlanes; ++k)
+            val[k] = lanes[k];
+    }
+    // Loads val back into a native register.
+    operator vfloat32m1_t() const
+    {
+        return vle32_v_f32m1(val, nlanes);
+    }
+    float get0() const { return val[0]; } // lane 0
+    float val[4];
+};
+
+// Non-clang build: the two uint64 lanes live in a plain array and are moved
+// to or from a vuint64m1_t with explicit stores/loads of nlanes elements.
+struct v_uint64x2
+{
+    typedef uint64 lane_type;
+    enum { nlanes = 2 };
+
+    v_uint64x2() {}
+    // Stores the lanes of a native register into val.
+    explicit v_uint64x2(vuint64m1_t v)
+    {
+        vse64_v_u64m1(val, v, nlanes);
+    }
+    // Fills the lanes from two scalars, v0 first.
+    v_uint64x2(uint64 v0, uint64 v1)
+    {
+        const uint64 lanes[] = {v0, v1};
+        for (int k = 0; k < nlanes; ++k)
+            val[k] = lanes[k];
+    }
+    // Loads val back into a native register.
+    operator vuint64m1_t() const
+    {
+        return vle64_v_u64m1(val, nlanes);
+    }
+    uint64 get0() const { return val[0]; } // lane 0
+
+    uint64 val[2];
+};
+
+// Non-clang build: the two int64 lanes live in a plain array and are moved
+// to or from a vint64m1_t with explicit stores/loads of nlanes elements.
+struct v_int64x2
+{
+    typedef int64 lane_type;
+    enum { nlanes = 2 };
+
+    v_int64x2() {}
+    // Stores the lanes of a native register into val.
+    explicit v_int64x2(vint64m1_t v)
+    {
+        vse64_v_i64m1(val, v, nlanes);
+    }
+    // Fills the lanes from two scalars, v0 first.
+    v_int64x2(int64 v0, int64 v1)
+    {
+        const int64 lanes[] = {v0, v1};
+        for (int k = 0; k < nlanes; ++k)
+            val[k] = lanes[k];
+    }
+    // Loads val back into a native register.
+    operator vint64m1_t() const
+    {
+        return vle64_v_i64m1(val, nlanes);
+    }
+    int64 get0() const { return val[0]; } // lane 0
+
+    int64 val[2];
+};
+
+#if CV_SIMD128_64F
+// Non-clang build: the two double lanes live in a plain array and are moved
+// to or from a vfloat64m1_t with explicit stores/loads of nlanes elements.
+struct v_float64x2
+{
+    typedef double lane_type;
+    enum { nlanes = 2 };
+
+    v_float64x2() {}
+    // Stores the lanes of a native register into val.
+    explicit v_float64x2(vfloat64m1_t v)
+    {
+        vse64_v_f64m1(val, v, nlanes);
+    }
+    // Fills the lanes from two scalars, v0 first.
+    v_float64x2(double v0, double v1)
+    {
+        const double lanes[] = {v0, v1};
+        for (int k = 0; k < nlanes; ++k)
+            val[k] = lanes[k];
+    }
+    // Loads val back into a native register.
+    operator vfloat64m1_t() const
+    {
+        return vle64_v_f64m1(val, nlanes);
+    }
+    double get0() const { return val[0]; } // lane 0
+
+    double val[2];
+};
+#endif
+#else
+// Clang build: lanes are kept in val and accessed as a whole register through
+// pval, which is initialised to point at this object's own val.
+struct v_uint8x16
+{
+    typedef uchar lane_type;
+    enum { nlanes = 16 };
+
+    v_uint8x16() {}
+    explicit v_uint8x16(vuint8m1_t v) { *pval = v; }
+    // Loads the sixteen scalars (v0 first) as one register and stores it.
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        uchar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        *pval = vle8_v_u8m1(lanes, nlanes);
+    }
+    operator vuint8m1_t() const { return *pval; }
+    uchar get0() const { return vmv_x(*pval); }
+    // Copy operations transfer the lane data only; pval itself is never copied,
+    // so it keeps pointing at the destination's own storage.
+    v_uint8x16(const v_uint8x16& vec)
+    {
+        *pval = *(vec.pval);
+    }
+    v_uint8x16& operator=(const v_uint8x16& vec)
+    {
+        *pval = *(vec.pval);
+        return *this;
+    }
+    // Lane storage, followed by the register-typed pointer to it. val must be
+    // declared first because pval's initialiser refers to it.
+    uchar val[16];
+    vuint8m1_t* pval = (vuint8m1_t*)val;
+};
+
+// Clang build: lanes are kept in val and accessed as a whole register through
+// pval, which is initialised to point at this object's own val.
+struct v_int8x16
+{
+    typedef schar lane_type;
+    enum { nlanes = 16 };
+
+    v_int8x16() {}
+    explicit v_int8x16(vint8m1_t v) { *pval = v; }
+    // Loads the sixteen scalars (v0 first) as one register and stores it.
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        schar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        *pval = vle8_v_i8m1(lanes, nlanes);
+    }
+    operator vint8m1_t() const { return *pval; }
+    schar get0() const { return vmv_x(*pval); }
+    // Copy operations transfer the lane data only; pval itself is never copied,
+    // so it keeps pointing at the destination's own storage.
+    v_int8x16(const v_int8x16& vec)
+    {
+        *pval = *(vec.pval);
+    }
+    v_int8x16& operator=(const v_int8x16& vec)
+    {
+        *pval = *(vec.pval);
+        return *this;
+    }
+    // Lane storage, followed by the register-typed pointer to it. val must be
+    // declared first because pval's initialiser refers to it.
+    schar val[16];
+    vint8m1_t* pval = (vint8m1_t*)val;
+};
+
+// 8 x ushort vector. Lane storage lives in `val`; it is read and written as
+// a native vuint16m1_t through `pval`, which points at this object's `val`.
+struct v_uint16x8
+{
+    typedef ushort lane_type;
+    enum { nlanes = 8 };
+
+    v_uint16x8() {}
+
+    // Wrap a native register value.
+    explicit v_uint16x8(vuint16m1_t v) { *pval = v; }
+
+    // Build from eight lane values, lowest lane first.
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        ushort lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        *pval = vle16_v_u16m1(lanes, nlanes);
+    }
+
+    // Copies transfer the lane data only; `pval` itself is never copied and
+    // keeps pointing at the destination object's own `val`.
+    inline v_uint16x8(const v_uint16x8& vec) { *pval = *(vec.pval); }
+    inline v_uint16x8& operator=(const v_uint16x8& vec)
+    {
+        *pval = *(vec.pval);
+        return *this;
+    }
+
+    operator vuint16m1_t() const { return *pval; }
+    ushort get0() const { return vmv_x(*pval); }   // value of lane 0
+
+    ushort val[8];
+    vuint16m1_t* pval = (vuint16m1_t*)val;
+};
+
+// 8 x short vector. Lane storage lives in `val`; it is read and written as
+// a native vint16m1_t through `pval`, which points at this object's `val`.
+struct v_int16x8
+{
+    typedef short lane_type;
+    enum { nlanes = 8 };
+
+    v_int16x8() {}
+
+    // Wrap a native register value.
+    explicit v_int16x8(vint16m1_t v) { *pval = v; }
+
+    // Build from eight lane values, lowest lane first.
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        short lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        *pval = vle16_v_i16m1(lanes, nlanes);
+    }
+
+    // Copies transfer the lane data only; `pval` itself is never copied and
+    // keeps pointing at the destination object's own `val`.
+    inline v_int16x8(const v_int16x8& vec) { *pval = *(vec.pval); }
+    inline v_int16x8& operator=(const v_int16x8& vec)
+    {
+        *pval = *(vec.pval);
+        return *this;
+    }
+
+    operator vint16m1_t() const { return *pval; }
+    short get0() const { return vmv_x(*pval); }   // value of lane 0
+
+    short val[8];
+    vint16m1_t* pval = (vint16m1_t*)val;
+};
+
+// 4 x unsigned vector. Lane storage lives in `val`; it is read and written as
+// a native vuint32m1_t through `pval`, which points at this object's `val`.
+struct v_uint32x4
+{
+    typedef unsigned lane_type;
+    enum { nlanes = 4 };
+
+    v_uint32x4() {}
+
+    // Wrap a native register value.
+    explicit v_uint32x4(vuint32m1_t v) { *pval = v; }
+
+    // Build from four lane values, lowest lane first.
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    {
+        unsigned lanes[] = {v0, v1, v2, v3};
+        *pval = vle32_v_u32m1(lanes, nlanes);
+    }
+
+    // Copies transfer the lane data only; `pval` itself is never copied and
+    // keeps pointing at the destination object's own `val`.
+    inline v_uint32x4(const v_uint32x4& vec) { *pval = *(vec.pval); }
+    inline v_uint32x4& operator=(const v_uint32x4& vec)
+    {
+        *pval = *(vec.pval);
+        return *this;
+    }
+
+    operator vuint32m1_t() const { return *pval; }
+    unsigned get0() const { return vmv_x(*pval); }   // value of lane 0
+
+    unsigned val[4];
+    vuint32m1_t* pval = (vuint32m1_t*)val;
+};
+
+// 4 x int vector. Lane storage lives in `val`; it is read and written as
+// a native vint32m1_t through `pval`, which points at this object's `val`.
+struct v_int32x4
+{
+    typedef int lane_type;
+    enum { nlanes = 4 };
+
+    v_int32x4() {}
+
+    // Wrap a native register value.
+    explicit v_int32x4(vint32m1_t v) { *pval = v; }
+
+    // Build from four lane values, lowest lane first.
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        int lanes[] = {v0, v1, v2, v3};
+        *pval = vle32_v_i32m1(lanes, nlanes);
+    }
+
+    // Copies transfer the lane data only; `pval` itself is never copied and
+    // keeps pointing at the destination object's own `val`.
+    inline v_int32x4(const v_int32x4& vec) { *pval = *(vec.pval); }
+    inline v_int32x4& operator=(const v_int32x4& vec)
+    {
+        *pval = *(vec.pval);
+        return *this;
+    }
+
+    operator vint32m1_t() const { return *pval; }
+    int get0() const { return vmv_x(*pval); }   // value of lane 0
+
+    int val[4];
+    vint32m1_t* pval = (vint32m1_t*)val;
+};
+
+// 4 x float vector. Lane storage lives in `val`; it is read and written as
+// a native vfloat32m1_t through `pval`, which points at this object's `val`.
+struct v_float32x4
+{
+    typedef float lane_type;
+    enum { nlanes = 4 };
+
+    v_float32x4() {}
+    // Wrap a native register value.
+    explicit v_float32x4(vfloat32m1_t v) { *pval = v; }
+
+    // Build from four lane values, lowest lane first.
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        float lanes[] = {v0, v1, v2, v3};
+        *pval = vle32_v_f32m1(lanes, nlanes);
+    }
+
+    // Copies transfer the lane data only; `pval` itself is never copied and
+    // keeps pointing at the destination object's own `val`.
+    inline v_float32x4(const v_float32x4& vec) { *pval = *(vec.pval); }
+    inline v_float32x4& operator=(const v_float32x4& vec)
+    {
+        *pval = *(vec.pval);
+        return *this;
+    }
+
+    operator vfloat32m1_t() const { return *pval; }
+    float get0() const { return vfmv_f(*pval); }   // value of lane 0
+
+    float val[4];
+    vfloat32m1_t* pval = (vfloat32m1_t*)val;
+};
+
+// 2 x uint64 vector. Lane storage lives in `val`; it is read and written as
+// a native vuint64m1_t through `pval`, which points at this object's `val`.
+struct v_uint64x2
+{
+    typedef uint64 lane_type;
+    enum { nlanes = 2 };
+
+    v_uint64x2() {}
+
+    // Wrap a native register value.
+    explicit v_uint64x2(vuint64m1_t v) { *pval = v; }
+
+    // Build from two lane values, lowest lane first.
+    v_uint64x2(uint64 v0, uint64 v1)
+    {
+        uint64 lanes[] = {v0, v1};
+        *pval = vle64_v_u64m1(lanes, nlanes);
+    }
+
+    // Copies transfer the lane data only; `pval` itself is never copied and
+    // keeps pointing at the destination object's own `val`.
+    inline v_uint64x2(const v_uint64x2& vec) { *pval = *(vec.pval); }
+    inline v_uint64x2& operator=(const v_uint64x2& vec)
+    {
+        *pval = *(vec.pval);
+        return *this;
+    }
+
+    operator vuint64m1_t() const { return *pval; }
+    uint64 get0() const { return vmv_x(*pval); }   // value of lane 0
+
+    uint64 val[2];
+    vuint64m1_t* pval = (vuint64m1_t*)val;
+};
+
+// 2 x int64 vector. Lane storage lives in `val`; it is read and written as
+// a native vint64m1_t through `pval`, which points at this object's `val`.
+struct v_int64x2
+{
+    typedef int64 lane_type;
+    enum { nlanes = 2 };
+
+    v_int64x2() {}
+
+    // Wrap a native register value.
+    explicit v_int64x2(vint64m1_t v) { *pval = v; }
+
+    // Build from two lane values, lowest lane first.
+    v_int64x2(int64 v0, int64 v1)
+    {
+        int64 lanes[] = {v0, v1};
+        *pval = vle64_v_i64m1(lanes, nlanes);
+    }
+
+    // Copies transfer the lane data only; `pval` itself is never copied and
+    // keeps pointing at the destination object's own `val`.
+    inline v_int64x2(const v_int64x2& vec) { *pval = *(vec.pval); }
+    inline v_int64x2& operator=(const v_int64x2& vec)
+    {
+        *pval = *(vec.pval);
+        return *this;
+    }
+
+    operator vint64m1_t() const { return *pval; }
+    int64 get0() const { return vmv_x(*pval); }   // value of lane 0
+
+    int64 val[2];
+    vint64m1_t* pval = (vint64m1_t*)val;
+};
+
+#if CV_SIMD128_64F
+// 2 x double vector. Lane storage lives in `val`; it is read and written as
+// a native vfloat64m1_t through `pval`, which points at this object's `val`.
+struct v_float64x2
+{
+    typedef double lane_type;
+    enum { nlanes = 2 };
+
+    v_float64x2() {}
+
+    // Wrap a native register value.
+    explicit v_float64x2(vfloat64m1_t v) { *pval = v; }
+
+    // Build from two lane values, lowest lane first.
+    v_float64x2(double v0, double v1)
+    {
+        double lanes[] = {v0, v1};
+        *pval = vle64_v_f64m1(lanes, nlanes);
+    }
+
+    // Copies transfer the lane data only; `pval` itself is never copied and
+    // keeps pointing at the destination object's own `val`.
+    inline v_float64x2(const v_float64x2& vec) { *pval = *(vec.pval); }
+    inline v_float64x2& operator=(const v_float64x2& vec)
+    {
+        *pval = *(vec.pval);
+        return *this;
+    }
+
+    operator vfloat64m1_t() const { return *pval; }
+    double get0() const { return vfmv_f(*pval); }   // value of lane 0
+
+    double val[2];
+    vfloat64m1_t* pval = (vfloat64m1_t*)val;
+};
+#endif // CV_SIMD128_64F
+#endif // __clang__
+
+//////////// Initial ////////////
+// OPENCV_HAL_IMPL_RVV_INIT_INTEGER defines v_setzero_<suffix1>() and v_setall_<suffix1>(v): each wraps vmv_v_x_<suffix2>m1(0 or v, vl) in v_<_Tpvec>.
+#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \
+inline v_##_Tpvec v_setzero_##suffix1() \
+{ \
+    return v_##_Tpvec(vmv_v_x_##suffix2##m1(0, vl)); \
+} \
+inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
+{ \
+    return v_##_Tpvec(vmv_v_x_##suffix2##m1(v, vl)); \
+}
+
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8x16, uchar, u8, u8, 16)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8x16, schar, s8, i8, 16)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16x8, ushort, u16, u16, 8)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16x8, short, s16, i16, 8)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32x4, unsigned, u32, u32, 4)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32x4, int, s32, i32, 4)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64x2, uint64, u64, u64, 2)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64x2, int64, s64, i64, 2)
+
+#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \
+inline v_##_Tpv v_setzero_##suffix() \
+{ \
+    return v_##_Tpv(vfmv_v_f_##suffix##m1(0, vl)); \
+} \
+inline v_##_Tpv v_setall_##suffix(_Tp v) \
+{ \
+    return v_##_Tpv(vfmv_v_f_##suffix##m1(v, vl)); \
+}
+// The macro above defines v_setzero_<suffix>() and v_setall_<suffix>(v) via vfmv_v_f_<suffix>m1; the f64 instantiation is only emitted when CV_SIMD128_64F is set.
+OPENCV_HAL_IMPL_RVV_INIT_FP(float32x4, float, f32, 4)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_INIT_FP(float64x2, double, f64, 2)
+#endif
+
+//////////// Reinterpret ////////////
+// Same-type case: v_reinterpret_as_<suffix>(v) on a v_<_Tpvec> simply returns a copy of v.
+#define OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(_Tpvec, suffix) \
+inline v_##_Tpvec v_reinterpret_as_##suffix(const v_##_Tpvec& v) { return v; }
+
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint8x16, u8)
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int8x16, s8)
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint16x8, u16)
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int16x8, s16)
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint32x4, u32)
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int32x4, s32)
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float32x4, f32)
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint64x2, u64)
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int64x2, s64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float64x2, f64)
+#endif
+
+#define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \
+inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
+{ \
+    return v_##_Tpvec1(vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v));\
+} \
+inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
+{ \
+    return v_##_Tpvec2(vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v));\
+}
+// The macro above emits both directions of a reinterpret done with one vreinterpret call; instantiated below for same-width type changes and same-signedness width changes.
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, int8x16, u8, s8, u8, i8)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, int16x8, u16, s16, u16, i16)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, int32x4, u32, s32, u32, i32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, float32x4, u32, f32, u32, f32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32x4, float32x4, s32, f32, i32, f32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64x2, int64x2, u64, s64, u64, i64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64x2, float64x2, u64, f64, u64, f64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64x2, float64x2, s64, f64, i64, f64)
+#endif
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint16x8, u8, u16, u8, u16)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint32x4, u8, u32, u8, u32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint64x2, u8, u64, u8, u64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, uint32x4, u16, u32, u16, u32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, uint64x2, u16, u64, u16, u64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, uint64x2, u32, u64, u32, u64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int16x8, s8, s16, i8, i16)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int32x4, s8, s32, i8, i32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int64x2, s8, s64, i8, i64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16x8, int32x4, s16, s32, i16, i32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16x8, int64x2, s16, s64, i16, i64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32x4, int64x2, s32, s64, i32, i64)
+
+
+#define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
+inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
+{ \
+    return v_##_Tpvec1(vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v)));\
+} \
+inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
+{ \
+    return v_##_Tpvec2(vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v)));\
+}
+// The macro above chains two vreinterpret calls: the element kind changes at width2 and the element width changes within kind nsuffix1 (e.g. i16 -> u16 -> u8).
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int16x8, u8, s16, u, i, 8, 16)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int32x4, u8, s32, u, i, 8, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int64x2, u8, s64, u, i, 8, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int8x16, u16, s8, u, i, 16, 8)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int32x4, u16, s32, u, i, 16, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int64x2, u16, s64, u, i, 16, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int8x16, u32, s8, u, i, 32, 8)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int16x8, u32, s16, u, i, 32, 16)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int64x2, u32, s64, u, i, 32, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int8x16, u64, s8, u, i, 64, 8)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int16x8, u64, s16, u, i, 64, 16)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int32x4, u64, s32, u, i, 64, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, float32x4, u8, f32, u, f, 8, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, float32x4, u16, f32, u, f, 16, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, float32x4, u64, f32, u, f, 64, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float32x4, s8, f32, i, f, 8, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float32x4, s16, f32, i, f, 16, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64x2, float32x4, s64, f32, i, f, 64, 32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, float64x2, u8, f64, u, f, 8, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, float64x2, u16, f64, u, f, 16, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, float64x2, u32, f64, u, f, 32, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float64x2, s8, f64, i, f, 8, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float64x2, s16, f64, i, f, 16, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32x4, float64x2, s32, f64, i, f, 32, 64)
+#endif
+
+// Three times reinterpret
+#if CV_SIMD128_64F
+inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v)
+{   // f64 bits are reinterpreted in three steps: f64 -> u64 -> u32 -> f32
+    return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v))));
+}
+inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v)
+{   // reverse chain: f32 -> u32 -> u64 -> f64
+    return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v))));
+}
+#endif
+
+////////////// Extract //////////////
+// v_extract<s>(a, b): slides a down by s lanes, then slides b up by (nlanes - s) on top of it; v_extract_n<i>(v): slides v down by i lanes and reads lane 0 via `vmv`.
+#define OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(_Tpvec, _Tp, suffix, vmv, vl) \
+template <int s> \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, s, vl), b, _Tpvec::nlanes - s, vl)); \
+} \
+template<int i> inline _Tp v_extract_n(_Tpvec v) \
+{ \
+    return _Tp(vmv(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), v, i, vl))); \
+}
+
+
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint8x16, uchar, u8, vmv_x_s_u8m1_u8, 16)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int8x16, schar, i8, vmv_x_s_i8m1_i8, 16)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint16x8, ushort, u16, vmv_x_s_u16m1_u16, 8)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int16x8, short, i16, vmv_x_s_i16m1_i16, 8)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint32x4, uint, u32, vmv_x_s_u32m1_u32, 4)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int32x4, int, i32, vmv_x_s_i32m1_i32, 4)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint64x2, uint64, u64, vmv_x_s_u64m1_u64, 2)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int64x2, int64, i64, vmv_x_s_i64m1_i64, 2)
+
+#define OPENCV_HAL_IMPL_RVV_EXTRACT_FP(_Tpvec, _Tp, suffix, vmv, vl) \
+template <int s> \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, s, vl), b, _Tpvec::nlanes - s, vl)); \
+} \
+template<int i> inline _Tp v_extract_n(_Tpvec v) \
+{ \
+    return _Tp(vmv(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), v, i, vl))); \
+}
+// Floating-point counterpart of the integer extract macro: same slide sequence, with vfmv_v_f supplying the zero vector and `vmv` reading lane 0.
+OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32x4, float, f32, vfmv_f_s_f32m1_f32, 4)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64x2, double, f64, vfmv_f_s_f64m1_f64, 2)
+#endif
+
+////////////// Load/Store //////////////
+// Per vector type: every full-width load/store variant issues the same vle/vse intrinsic with vl lanes (the StoreMode argument is ignored); the low/high variants transfer hvl lanes, v_store_high sliding the upper half down first.
+#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ \
+    return _Tpvec(vle##width##_v_##suffix##m1(ptr, vl)); \
+} \
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ \
+    return _Tpvec(vle##width##_v_##suffix##m1(ptr, vl)); \
+} \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ \
+    _Tpvec res = _Tpvec(vle##width##_v_##suffix##m1(ptr, hvl)); \
+    return res; \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width##_v_##suffix##m1(ptr, a, vl); \
+} \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width##_v_##suffix##m1(ptr, a, vl); \
+} \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width##_v_##suffix##m1(ptr, a, vl); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ \
+    vse##width##_v_##suffix##m1(ptr, a, vl); \
+} \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width##_v_##suffix##m1(ptr, a, hvl); \
+} \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width##_v_##suffix##m1(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \
+}
+
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8x16, vuint8m1_t, uchar, 8, 16, 8, u8, vmv_v_x_u8m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8x16, vint8m1_t, schar, 8, 16, 8, i8, vmv_v_x_i8m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16x8, vuint16m1_t, ushort, 4, 8, 16, u16, vmv_v_x_u16m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16x8, vint16m1_t, short, 4, 8, 16, i16, vmv_v_x_i16m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32x4, vuint32m1_t, unsigned, 2, 4, 32, u32, vmv_v_x_u32m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32x4, vint32m1_t, int, 2, 4, 32, i32, vmv_v_x_i32m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64x2, vuint64m1_t, uint64, 1, 2, 64, u64, vmv_v_x_u64m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64x2, vint64m1_t, int64, 1, 2, 64, i64, vmv_v_x_i64m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32x4, vfloat32m1_t, float, 2, 4, 32, f32, vfmv_v_f_f32m1)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 2, 64, f64, vfmv_v_f_f64m1)
+#endif
+
+// Lanes 0..7 come from ptr0[0..7], lanes 8..15 from ptr1[0..7].
+inline v_int8x16 v_load_halves(const schar* ptr0, const schar* ptr1)
+{
+    schar joined[16] = {
+        ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr0[4], ptr0[5], ptr0[6], ptr0[7],
+        ptr1[0], ptr1[1], ptr1[2], ptr1[3], ptr1[4], ptr1[5], ptr1[6], ptr1[7]
+    };
+    return v_int8x16(vle8_v_i8m1(joined, 16));
+}
+inline v_uint8x16 v_load_halves(const uchar* ptr0, const uchar* ptr1) { return v_reinterpret_as_u8(v_load_halves(reinterpret_cast<const schar*>(ptr0), reinterpret_cast<const schar*>(ptr1))); } // unsigned variant: signed load, bits reinterpreted; const is kept on the pointers
+
+// Lanes 0..3 come from ptr0[0..3], lanes 4..7 from ptr1[0..3].
+inline v_int16x8 v_load_halves(const short* ptr0, const short* ptr1)
+{
+    short joined[8] = {
+        ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr1[0], ptr1[1], ptr1[2], ptr1[3]
+    };
+    return v_int16x8(vle16_v_i16m1(joined, 8));
+}
+inline v_uint16x8 v_load_halves(const ushort* ptr0, const ushort* ptr1) { return v_reinterpret_as_u16(v_load_halves(reinterpret_cast<const short*>(ptr0), reinterpret_cast<const short*>(ptr1))); } // unsigned variant: signed load, bits reinterpreted; const is kept on the pointers
+
+// Lanes 0..1 come from ptr0[0..1], lanes 2..3 from ptr1[0..1].
+inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1)
+{
+    int joined[4] = {
+        ptr0[0], ptr0[1], ptr1[0], ptr1[1]
+    };
+    return v_int32x4(vle32_v_i32m1(joined, 4));
+}
+// Lanes 0..1 come from ptr0[0..1], lanes 2..3 from ptr1[0..1].
+inline v_float32x4 v_load_halves(const float* ptr0, const float* ptr1)
+{
+    float joined[4] = {
+        ptr0[0], ptr0[1], ptr1[0], ptr1[1]
+    };
+    return v_float32x4(vle32_v_f32m1(joined, 4));
+}
+inline v_uint32x4 v_load_halves(const unsigned* ptr0, const unsigned* ptr1) { return v_reinterpret_as_u32(v_load_halves(reinterpret_cast<const int*>(ptr0), reinterpret_cast<const int*>(ptr1))); } // unsigned variant: signed load, bits reinterpreted; const is kept on the pointers
+
+// Lane 0 comes from ptr0[0], lane 1 from ptr1[0].
+inline v_int64x2 v_load_halves(const int64* ptr0, const int64* ptr1)
+{
+    int64 joined[2] = {
+        ptr0[0], ptr1[0]
+    };
+    return v_int64x2(vle64_v_i64m1(joined, 2));
+}
+inline v_uint64x2 v_load_halves(const uint64* ptr0, const uint64* ptr1) { return v_reinterpret_as_u64(v_load_halves(reinterpret_cast<const int64*>(ptr0), reinterpret_cast<const int64*>(ptr1))); } // unsigned variant: signed load, bits reinterpreted; const is kept on the pointers
+
+#if CV_SIMD128_64F
+// Lane 0 comes from ptr0[0], lane 1 from ptr1[0].
+inline v_float64x2 v_load_halves(const double* ptr0, const double* ptr1)
+{
+    double joined[2] = {
+        ptr0[0], ptr1[0]
+    };
+    return v_float64x2(vle64_v_f64m1(joined, 2));
+}
+#endif
+
+
+////////////// Lookup table access ////////////////////
+
+// Table gather: lane i of the result is tab[idx[i]] for i = 0..15.
+inline v_int8x16 v_lut(const schar* tab, const int* idx)
+{
+    schar gathered[16] = {
+        tab[idx[0]],
+        tab[idx[1]],
+        tab[idx[2]],
+        tab[idx[3]],
+        tab[idx[4]],
+        tab[idx[5]],
+        tab[idx[6]],
+        tab[idx[7]],
+        tab[idx[8]],
+        tab[idx[9]],
+        tab[idx[10]],
+        tab[idx[11]],
+        tab[idx[12]],
+        tab[idx[13]],
+        tab[idx[14]],
+        tab[idx[15]]
+    };
+    return v_int8x16(vle8_v_i8m1(gathered, 16));
+}
+// Pair gather: lanes 2k and 2k+1 are tab[idx[k]] and tab[idx[k] + 1], k = 0..7.
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
+{
+    schar gathered[16] = {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[3]],
+        tab[idx[3] + 1],
+        tab[idx[4]],
+        tab[idx[4] + 1],
+        tab[idx[5]],
+        tab[idx[5] + 1],
+        tab[idx[6]],
+        tab[idx[6] + 1],
+        tab[idx[7]],
+        tab[idx[7] + 1]
+    };
+    return v_int8x16(vle8_v_i8m1(gathered, 16));
+}
+// Quad gather: lanes 4k..4k+3 are tab[idx[k]] .. tab[idx[k] + 3], k = 0..3.
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
+{
+    schar gathered[16] = {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[0] + 2],
+        tab[idx[0] + 3],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[1] + 2],
+        tab[idx[1] + 3],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[2] + 2],
+        tab[idx[2] + 3],
+        tab[idx[3]],
+        tab[idx[3] + 1],
+        tab[idx[3] + 2],
+        tab[idx[3] + 3]
+    };
+    return v_int8x16(vle8_v_i8m1(gathered, 16));
+}
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut(reinterpret_cast<const schar*>(tab), idx)); } // uchar variants forward to the schar versions (const kept) and reinterpret the bits
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs(reinterpret_cast<const schar*>(tab), idx)); }
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads(reinterpret_cast<const schar*>(tab), idx)); }
+
+// Table gather: lane i of the result is tab[idx[i]] for i = 0..7.
+inline v_int16x8 v_lut(const short* tab, const int* idx)
+{
+    short gathered[8] = {
+        tab[idx[0]],
+        tab[idx[1]],
+        tab[idx[2]],
+        tab[idx[3]],
+        tab[idx[4]],
+        tab[idx[5]],
+        tab[idx[6]],
+        tab[idx[7]]
+    };
+    return v_int16x8(vle16_v_i16m1(gathered, 8));
+}
+// Pair gather: lanes 2k and 2k+1 are tab[idx[k]] and tab[idx[k] + 1], k = 0..3.
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
+{
+    short gathered[8] = {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[3]],
+        tab[idx[3] + 1]
+    };
+    return v_int16x8(vle16_v_i16m1(gathered, 8));
+}
+// Quad gather: lanes 4k..4k+3 are tab[idx[k]] .. tab[idx[k] + 3], k = 0..1.
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
+{
+    short gathered[8] = {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[0] + 2],
+        tab[idx[0] + 3],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[1] + 2],
+        tab[idx[1] + 3]
+    };
+    return v_int16x8(vle16_v_i16m1(gathered, 8));
+}
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut(reinterpret_cast<const short*>(tab), idx)); } // ushort variants forward to the short versions (const kept) and reinterpret the bits
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs(reinterpret_cast<const short*>(tab), idx)); }
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads(reinterpret_cast<const short*>(tab), idx)); }
+
+// Table gather: lane i of the result is tab[idx[i]] for i = 0..3.
+inline v_int32x4 v_lut(const int* tab, const int* idx)
+{
+    int gathered[4] = {
+        tab[idx[0]],
+        tab[idx[1]],
+        tab[idx[2]],
+        tab[idx[3]]
+    };
+    return v_int32x4(vle32_v_i32m1(gathered, 4));
+}
+// Pair gather: lanes 2k and 2k+1 are tab[idx[k]] and tab[idx[k] + 1], k = 0..1.
+inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
+{
+    int gathered[4] = {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1]
+    };
+    return v_int32x4(vle32_v_i32m1(gathered, 4));
+}
+// One contiguous load: the four lanes are tab[idx[0]] .. tab[idx[0] + 3].
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx) {
+    return v_int32x4(vle32_v_i32m1(&tab[idx[0]], 4));
+}
+
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut(reinterpret_cast<const int*>(tab), idx)); } // unsigned variants forward to the int versions (const kept) and reinterpret the bits
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs(reinterpret_cast<const int*>(tab), idx)); }
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads(reinterpret_cast<const int*>(tab), idx)); }
+
+// Table gather: lane 0 is tab[idx[0]], lane 1 is tab[idx[1]].
+inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
+{
+    int64_t gathered[2] = {
+        tab[idx[0]],
+        tab[idx[1]]
+    };
+    return v_int64x2(vle64_v_i64m1(gathered, 2));
+}
+// One contiguous load: the two lanes are tab[idx[0]] and tab[idx[0] + 1].
+inline v_int64x2 v_lut_pairs(const int64* tab, const int* idx) {
+    return v_int64x2(vle64_v_i64m1(&tab[idx[0]], 2));
+}
+inline v_uint64x2 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut(reinterpret_cast<const int64_t*>(tab), idx)); } // uint64 variants forward to the signed versions and reinterpret the bits
+inline v_uint64x2 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs(reinterpret_cast<const int64_t*>(tab), idx)); }
+
+// Table gather: lane i of the result is tab[idx[i]] for i = 0..3.
+inline v_float32x4 v_lut(const float* tab, const int* idx)
+{
+    float gathered[4] = {
+        tab[idx[0]],
+        tab[idx[1]],
+        tab[idx[2]],
+        tab[idx[3]]
+    };
+    return v_float32x4(vle32_v_f32m1(gathered, 4));
+}
+// Pair gather: lanes 2k and 2k+1 are tab[idx[k]] and tab[idx[k] + 1], k = 0..1.
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
+{
+    float gathered[4] = {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1]
+    };
+    return v_float32x4(vle32_v_f32m1(gathered, 4));
+}
+// One contiguous load: the four lanes are tab[idx[0]] .. tab[idx[0] + 3].
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx) {
+    return v_float32x4(vle32_v_f32m1(&tab[idx[0]], 4));
+}
+
+// Gather with indices held in a vector: lane i is tab[lane i of idxvec].
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+    int gathered[4] = {
+        tab[v_extract_n<0>(idxvec)],
+        tab[v_extract_n<1>(idxvec)],
+        tab[v_extract_n<2>(idxvec)],
+        tab[v_extract_n<3>(idxvec)]
+    };
+    return v_int32x4(vle32_v_i32m1(gathered, 4));
+}
+
+// Gather with indices held in a vector: lane i is tab[lane i of idxvec].
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
+{
+    unsigned gathered[4] = {
+        tab[v_extract_n<0>(idxvec)],
+        tab[v_extract_n<1>(idxvec)],
+        tab[v_extract_n<2>(idxvec)],
+        tab[v_extract_n<3>(idxvec)]
+    };
+    return v_uint32x4(vle32_v_u32m1(gathered, 4));
+}
+
+// Gather with indices held in a vector: lane i is tab[lane i of idxvec].
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+    float gathered[4] = {
+        tab[v_extract_n<0>(idxvec)],
+        tab[v_extract_n<1>(idxvec)],
+        tab[v_extract_n<2>(idxvec)],
+        tab[v_extract_n<3>(idxvec)]
+    };
+    return v_float32x4(vle32_v_f32m1(gathered, 4));
+}
+
+// For each lane i of idxvec: x lane i = tab[index], y lane i = tab[index + 1].
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+    int offs[4];
+    v_store_aligned(offs, idxvec);
+    x = v_float32x4(tab[offs[0]], tab[offs[1]], tab[offs[2]], tab[offs[3]]);
+    y = v_float32x4(tab[offs[0] + 1], tab[offs[1] + 1], tab[offs[2] + 1], tab[offs[3] + 1]);
+}
+
+#if CV_SIMD128_64F
+// Table gather: lane 0 is tab[idx[0]], lane 1 is tab[idx[1]].
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{
+    double gathered[2] = {
+        tab[idx[0]],
+        tab[idx[1]]
+    };
+    return v_float64x2(vle64_v_f64m1(gathered, 2));
+}
+
+// One contiguous load: the two lanes are tab[idx[0]] and tab[idx[0] + 1].
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) {
+    return v_float64x2(vle64_v_f64m1(&tab[idx[0]], 2));
+}
+
+// Gather using only lanes 0 and 1 of idxvec as table indices.
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    double gathered[2] = {
+        tab[v_extract_n<0>(idxvec)],
+        tab[v_extract_n<1>(idxvec)]
+    };
+    return v_float64x2(vle64_v_f64m1(gathered, 2));
+}
+
+// Uses lanes 0 and 1 of idxvec: x lane i = tab[index], y lane i = tab[index + 1].
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    int offs[4] = {0};
+    v_store_aligned(offs, idxvec);
+    x = v_float64x2(tab[offs[0]], tab[offs[1]]);
+    y = v_float64x2(tab[offs[0] + 1], tab[offs[1] + 1]);
+}
+#endif
+
+////////////// Pack boolean ////////////////////
+
+// Stores a then b into one 16-element buffer and narrows it 16 -> 8 bits with a zero shift.
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) {
+    ushort wide[16] = {0};
+    v_store(&wide[0], a);
+    v_store(&wide[8], b);
+    return v_uint8x16(vnsrl_wx_u8m1(vle16_v_u16m2(wide, 16), 0, 16));
+}
+
+// Stores a..d into one 16-element buffer and narrows it 32 -> 16 -> 8 bits with zero shifts.
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d) {
+    unsigned wide[16] = {0};
+    v_store(&wide[0], a);
+    v_store(&wide[4], b);
+    v_store(&wide[8], c);
+    v_store(&wide[12], d);
+    return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vle32_v_u32m4(wide, 16), 0, 16), 0, 16));
+}
+
+// Stores a..h into one 16-element buffer and narrows it 64 -> 32 -> 16 -> 8 bits with zero shifts.
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h) {
+    uint64 wide[16] = {0};
+    v_store(&wide[0], a);
+    v_store(&wide[2], b);
+    v_store(&wide[4], c);
+    v_store(&wide[6], d);
+    v_store(&wide[8], e);
+    v_store(&wide[10], f);
+    v_store(&wide[12], g);
+    v_store(&wide[14], h);
+    return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vnsrl_wx_u32m4(vle64_v_u64m8(wide, 16), 0, 16), 0, 16), 0, 16));
+}
+
+////////////// Arithmetics //////////////
+#define OPENCV_HAL_IMPL_RVV_BIN_OP(bin_op, _Tpvec, intrin, vl) \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a, b, vl)); \
+} \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a = _Tpvec(intrin(a, b, vl)); \
+    return a; \
+}
+// Note on the list below: 8- and 16-bit + and - use the saturating vsadd*/vssub* intrinsics, 32/64-bit use plain vadd/vsub; no operator* is generated here for 8- and 16-bit lanes.
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint8x16, vsaddu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint8x16, vssubu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint8x16, vdivu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int8x16, vsadd_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int8x16, vssub_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int8x16, vdiv_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint16x8, vsaddu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint16x8, vssubu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint16x8, vdivu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int16x8, vsadd_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int16x8, vssub_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int16x8, vdiv_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint32x4, vadd_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint32x4, vsub_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint32x4, vmul_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint32x4, vdivu_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int32x4, vadd_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int32x4, vsub_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int32x4, vmul_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int32x4, vdiv_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float32x4, vfadd_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float32x4, vfsub_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float32x4, vfmul_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float32x4, vfdiv_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint64x2, vadd_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint64x2, vsub_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint64x2, vmul_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint64x2, vdivu_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int64x2, vadd_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int64x2, vsub_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int64x2, vmul_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int64x2, vdiv_vv_i64m1, 2)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float64x2, vfadd_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float64x2, vfsub_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float64x2, vfmul_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float64x2, vfdiv_vv_f64m1, 2)
+#endif
+
+
+////////////// Bitwise logic //////////////
+// For each integer vector type: &, |, ^ (and their compound-assignment forms) via OPENCV_HAL_IMPL_RVV_BIN_OP, plus unary ~ via vnot_v.
+#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_BIN_OP(&, _Tpvec, vand_vv_##suffix##m1, vl) \
+OPENCV_HAL_IMPL_RVV_BIN_OP(|, _Tpvec, vor_vv_##suffix##m1, vl) \
+OPENCV_HAL_IMPL_RVV_BIN_OP(^, _Tpvec, vxor_vv_##suffix##m1, vl) \
+inline _Tpvec operator ~ (const _Tpvec& a) \
+{ \
+    return _Tpvec(vnot_v_##suffix##m1(a, vl)); \
+}
+
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8x16, u8, 16)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8x16, i8, 16)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16x8, u16, 8)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16x8, i16, 8)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32x4, u32, 4)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32x4, i32, 4)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64x2, u64, 2)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64x2, i64, 2)
+
+#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(bin_op, intrin) \
+/* Bitwise bin_op for v_float32x4: both operands are reinterpreted as i32 lanes, */ \
+/* combined with the integer intrinsic, and the bits are reinterpreted back as f32. */ \
+inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
+{ \
+    const vint32m1_t bits = intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), 4); \
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(bits)); \
+} \
+inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
+{ a = a bin_op b; return a; }
+
+OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(&, vand_vv_i32m1) // each row pairs an operator with the i32 intrinsic applied to the raw bits
+OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(|, vor_vv_i32m1)
+OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(^, vxor_vv_i32m1)
+
+inline v_float32x4 operator ~ (const v_float32x4& a) // bitwise complement of the raw 32-bit lane patterns
+{
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a), 4))); // f32 -> i32 view, vnot, back to f32
+}
+
+#if CV_SIMD128_64F
+#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(bin_op, intrin) \
+/* Bitwise bin_op for v_float64x2: both operands are reinterpreted as i64 lanes, */ \
+/* combined with the integer intrinsic, and the bits are reinterpreted back as f64. */ \
+inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
+{ \
+    const vint64m1_t bits = intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), 2); \
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(bits)); \
+} \
+inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
+{ a = a bin_op b; return a; }
+
+OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(&, vand_vv_i64m1) // each row pairs an operator with the i64 intrinsic applied to the raw bits
+OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(|, vor_vv_i64m1)
+OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(^, vxor_vv_i64m1)
+
+inline v_float64x2 operator ~ (const v_float64x2& a) // bitwise complement of the raw 64-bit lane patterns
+{
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a), 2))); // f64 -> i64 view, vnot, back to f64
+}
+#endif
+
+////////////// Bitwise shifts //////////////
+
+#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, suffix, vl) \
+/* Shift operators for an unsigned vector type. The run-time count is */ \
+/* converted to uint8_t before it is handed to the intrinsic. */ \
+inline _Tpvec operator << (const _Tpvec& a, int n) \
+{ \
+    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
+} \
+inline _Tpvec operator >> (const _Tpvec& a, int n) \
+{ \
+    /* Unsigned types shift right with vsrl. */ \
+    return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n), vl)); \
+} \
+/* The compile-time-count variants forward to the two operators above. */ \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+{ return a << n; } \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return a >> n; }
+
+#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, suffix, vl) \
+/* Shift operators for a signed vector type. The run-time count is */ \
+/* converted to uint8_t before it is handed to the intrinsic. */ \
+inline _Tpvec operator << (const _Tpvec& a, int n) \
+{ \
+    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
+} \
+inline _Tpvec operator >> (const _Tpvec& a, int n) \
+{ \
+    /* Signed types shift right with vsra rather than vsrl. */ \
+    return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n), vl)); \
+} \
+/* The compile-time-count variants forward to the two operators above. */ \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+{ return a << n; } \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return a >> n; }
+
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8x16, u8, 16) // unsigned types: >> is implemented with vsrl
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16x8, u16, 8)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32x4, u32, 4)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64x2, u64, 2)
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8x16, i8, 16) // signed types: >> is implemented with vsra
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16x8, i16, 8)
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32x4, i32, 4)
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 2)
+
+
+////////////// Comparison //////////////
+
+#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, vl) /* one integer comparison operator that returns a vector, not a bool */ \
+inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    uint64_t ones = -1; /* all 64 bits set; used as the scalar operand of the merge */ \
+    return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b, vl), vmv_v_x_##suffix##m1(0, vl), ones, vl)); /* merges a zero vector with `ones` under the comparison mask */ \
+}
+
+#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, vl) /* one floating-point comparison operator that returns a vector, not a bool */ \
+inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    union { uint64 u; double d; } ones; ones.u = -1; /* ones.d is a double whose 64 bits are all set */ \
+    return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b, vl), vfmv_v_f_##suffix##m1(0, vl), ones.d, vl)); /* NOTE(review): the f32 instantiation receives this double as its scalar - confirm the converted value still has every bit set */ \
+}
+
+#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width, vl) /* all six comparison operators for an unsigned type; width selects the _b<width> mask intrinsic */ \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmsltu_vv_##suffix##m1_b##width, suffix, vl) /* ordering comparisons use the unsigned (...u) intrinsics */ \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgtu_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsleu_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsgeu_vv_##suffix##m1_b##width, suffix, vl)
+
+#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, width, vl) /* all six comparison operators for a signed type; width selects the _b<width> mask intrinsic */ \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmslt_vv_##suffix##m1_b##width, suffix, vl) /* ordering comparisons use the signed intrinsics */ \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgt_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsle_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsge_vv_##suffix##m1_b##width, suffix, vl)
+
+#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, width, vl) /* all six comparison operators for a floating-point type, built on the vmf* intrinsics */ \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ==, vmfeq_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, !=, vmfne_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <, vmflt_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >, vmfgt_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <=, vmfle_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >=, vmfge_vv_##suffix##m1_b##width, suffix, vl)
+
+
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8x16, u8, 8, 16) // arguments: vector type, type suffix, mask width (_b<width>), lane count
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16x8, u16, 16, 8)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32x4, u32, 32, 4)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64x2, u64, 64, 2)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8x16, i8, 8, 16)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16x8, i16, 16, 8)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32x4, i32, 32, 4)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64x2, i64, 64, 2)
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32x4, f32, 32, 4)
+#if CV_SIMD128_64F // v_float64x2 comparisons exist only when CV_SIMD128_64F is non-zero
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64x2, f64, 64, 2)
+#endif // CV_SIMD128_64F
+
+inline v_float32x4 v_not_nan(const v_float32x4& a) // per-lane mask from comparing a with itself
+{ return a == a; } // a NaN lane is not equal to itself, so its mask lane is the "false" value
+
+#if CV_SIMD128_64F
+inline v_float64x2 v_not_nan(const v_float64x2& a) // per-lane mask from comparing a with itself
+{ return a == a; } // a NaN lane is not equal to itself, so its mask lane is the "false" value
+#endif
+
+////////////// Min/Max //////////////
+
+#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) /* defines func(a, b) as a direct wrapper around a two-operand intrinsic */ \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a, b, vl)); /* intrinsic result is wrapped back into the OpenCV vector type */ \
+}
+
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16) // unsigned types use vminu/vmaxu, signed vmin/vmax, floats vfmin/vfmax
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_min, vminu_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_max, vmaxu_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_min, vmin_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_max, vmax_vv_i64m1, 2)
+#if CV_SIMD128_64F // v_float64x2 min/max exist only when CV_SIMD128_64F is non-zero
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)
+#endif // CV_SIMD128_64F
+
+////////////// Arithmetics wrap //////////////
+
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16) // the *_wrap functions map to the plain vadd/vsub/vmul intrinsics, 8- and 16-bit types only
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
+
+////////////// Reduce //////////////
+
+#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, vl, red) \
+inline scalartype v_reduce_sum(const _Tpvec& a)  \
+{ \
+    /* One all-zero wide vector serves as both the destination and the initial sum. */ \
+    const _nwTpvec zero = vmv_v_x_##wsuffix##m1(0, vl); \
+    const _nwTpvec total = v##red##_vs_##suffix##m1_##wsuffix##m1(zero, a, zero, vl); \
+    return static_cast<scalartype>(_wTpvec(total).get0()); \
+}
+
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8x16, v_uint16x8, vuint16m1_t, unsigned, u8, u16, 16, wredsumu) // 8/16/32-bit inputs use the widening reductions (wredsum / wredsumu)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8x16, v_int16x8, vint16m1_t, int, i8, i16, 16, wredsum)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16x8, v_uint32x4, vuint32m1_t, unsigned, u16, u32, 8, wredsumu)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16x8, v_int32x4, vint32m1_t, int, i16, i32, 8, wredsum)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32x4, v_uint64x2, vuint64m1_t, unsigned, u32, u64, 4, wredsumu) // the 64-bit accumulator is cast to `unsigned` on return
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32x4, v_int64x2, vint64m1_t, int, i32, i64, 4, wredsum) // the 64-bit accumulator is cast to `int` on return
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64x2, v_uint64x2, vuint64m1_t, uint64, u64, u64, 2, redsum) // 64-bit inputs reduce at their own width (redsum)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64x2, v_int64x2, vint64m1_t, int64, i64, i64, 2, redsum)
+
+#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, vl, red) \
+inline scalartype v_reduce_sum(const _Tpvec& a)  \
+{ \
+    /* One all-zero vector serves as both the destination and the initial sum. */ \
+    const _nwTpvec zero = vfmv_v_f_##wsuffix##m1(0, vl); \
+    const _nwTpvec total = v##red##_vs_##suffix##m1_##wsuffix##m1(zero, a, zero, vl); \
+    return static_cast<scalartype>(_wTpvec(total).get0()); \
+}
+
+// vfredsum for float has been renamed to vfredosum; the GNU toolchain uses the new name as well.
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32x4, v_float32x4, vfloat32m1_t, float, f32, f32, 4, fredosum) // float sums stay at the input width (same type given for the wide parameters)
+#if CV_SIMD128_64F // the v_float64x2 sum exists only when CV_SIMD128_64F is non-zero
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64x2, v_float64x2, vfloat64m1_t, double, f64, f64, 2, fredosum)
+#endif // CV_SIMD128_64F
+
+
+#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) /* defines v_reduce_<func>(a), a horizontal reduction returning one scalar */ \
+inline scalartype v_reduce_##func(const _Tpvec& a)  \
+{ \
+    _Tpvec res = _Tpvec(v##red##_vs_##suffix##m1_##suffix##m1(a, a, a, vl)); /* a is passed as destination, source vector and initial scalar alike */ \
+    return scalartype(res.get0()); /* the reduced value is read from lane 0 */ \
+}
+
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, min, uchar, u8, 16, redminu) // v_reduce_min for the 8/16/32-bit integer types and float
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, min, schar, i8, 16, redmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, min, ushort, u16, 8, redminu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, min, short, i16, 8, redmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, min, unsigned, u32, 4, redminu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, min, int, i32, 4, redmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, min, float, f32, 4, fredmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, max, uchar, u8, 16, redmaxu) // v_reduce_max for the same set of types
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, max, schar, i8, 16, redmax)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, max, ushort, u16, 8, redmaxu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, max, short, i16, 8, redmax)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, max, unsigned, u32, 4, redmaxu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, max, int, i32, 4, redmax)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 4, fredmax)
+
+
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+    // Result lane i is the horizontal sum of the i-th argument (a, b, c, d).
+    float sums[4];
+    sums[0] = v_reduce_sum(a);
+    sums[1] = v_reduce_sum(b);
+    sums[2] = v_reduce_sum(c);
+    sums[3] = v_reduce_sum(d);
+    // The four scalar sums are reloaded from memory as one vector.
+    return v_float32x4(vle32_v_f32m1(sums, 4));
+}
+
+////////////// Square-Root //////////////
+
+inline v_float32x4 v_sqrt(const v_float32x4& x) // per-lane square root
+{
+    return v_float32x4(vfsqrt_v_f32m1(x, 4)); // vl = 4: all four f32 lanes
+}
+
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{
+    // Computed as 1 / sqrt(x) using the regular vector division operator.
+    return v_setall_f32(1.0f) / v_sqrt(x);
+}
+
+#if CV_SIMD128_64F
+inline v_float64x2 v_sqrt(const v_float64x2& x) // per-lane square root of the two f64 lanes
+{
+    return v_float64x2(vfsqrt_v_f64m1(x, 2)); // vl = 2 like every other v_float64x2 operation (was 4, twice the lane count)
+}
+
+inline v_float64x2 v_invsqrt(const v_float64x2& x)
+{
+    // Computed as 1 / sqrt(x) using the regular vector division operator.
+    return v_setall_f64(1.0f) / v_sqrt(x);
+}
+#endif
+
+inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
+{
+    // sqrt(a*a + b*b): vfmacc adds b*b onto the product a*a before the root is taken.
+    return v_sqrt(v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a, 4), b, b, 4)));
+}
+
+inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) // per-lane a*a + b*b, without the square root
+{
+    return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a, 4), b, b, 4)); // vfmul forms a*a, vfmacc adds b*b onto it
+}
+
+#if CV_SIMD128_64F
+inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
+{
+    // sqrt(a*a + b*b): vfmacc adds b*b onto the product a*a before the root is taken.
+    return v_sqrt(v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a, 2), b, b, 2)));
+}
+
+inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b) // per-lane a*a + b*b, without the square root
+{
+    return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a, 2), b, b, 2)); // vfmul forms a*a, vfmacc adds b*b onto it
+}
+#endif
+
+////////////// Multiply-Add //////////////
+
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) // per-lane a*b + c
+{
+    return v_float32x4(vfmacc_vv_f32m1(c, a, b, 4)); // the addend c is passed first, as the accumulator operand
+}
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) // per-lane a*b + c on 32-bit integers
+{
+    return v_int32x4(vmacc_vv_i32m1(c, a, b, 4)); // integer multiply-accumulate; c is the accumulator operand
+}
+
+inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) // same result as v_fma(a, b, c)
+{
+    return v_fma(a, b, c);
+}
+
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) // same result as v_fma(a, b, c)
+{
+    return v_fma(a, b, c);
+}
+
+#if CV_SIMD128_64F
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) // per-lane a*b + c
+{
+    return v_float64x2(vfmacc_vv_f64m1(c, a, b, 2)); // the addend c is passed first, as the accumulator operand
+}
+
+inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) // same result as v_fma(a, b, c)
+{
+    return v_fma(a, b, c);
+}
+#endif
+
+////////////// Check all/any //////////////
+
+// Clang builds use the overloaded vcpop intrinsic, so no cast such as (vuint64m1_t) is needed.
+#ifndef __clang__
+#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, vl) /* non-clang variant: tests bit `shift` of every lane */ \
+inline bool v_check_all(const _Tpvec& a) /* true when bit `shift` is set in every lane */ \
+{ \
+    auto v0 = vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a, vl), shift, vl); /* complement, then shift right: non-zero only where the tested bit was clear */ \
+    v_uint32x4 v = v_uint32x4(v_reinterpret_as_u32(_Tpvec(v0))); /* viewed as four 32-bit words so they can be OR-ed together */ \
+    return (v.val[0] | v.val[1] | v.val[2] | v.val[3]) == 0; \
+} \
+inline bool v_check_any(const _Tpvec& a) /* true when bit `shift` is set in at least one lane */ \
+{ \
+    auto v0 = vsrl_vx_##suffix##m1(a, shift, vl); /* shift right: non-zero only where the tested bit was set */ \
+    v_uint32x4 v = v_uint32x4(v_reinterpret_as_u32(_Tpvec(v0))); \
+    return (v.val[0] | v.val[1] | v.val[2] | v.val[3]) != 0; \
+}
+
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 16) // shift = lane width in bits - 1, i.e. the most significant bit
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 8)
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 4)
+//OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint64x2, u64, 63, 2) -- disabled; v_uint64x2 has hand-written overloads instead
+inline bool v_check_all(const v_uint64x2& a)
+{
+    const v_uint64x2 top_clear(vsrl_vx_u64m1(vnot_v_u64m1(a, 2), 63, 2)); // 1 in each lane whose bit 63 is clear
+    return top_clear.val[0] == 0 && top_clear.val[1] == 0;
+}
+inline bool v_check_any(const v_uint64x2& a)
+{
+    const v_uint64x2 top_set(vsrl_vx_u64m1(a, 63, 2)); // 1 in each lane whose bit 63 is set
+    return top_set.val[0] != 0 || top_set.val[1] != 0;
+}
+
+inline bool v_check_all(const v_int8x16& a) // signed and float overloads forward to the unsigned overload of the same lane width
+{ return v_check_all(v_reinterpret_as_u8(a)); }
+inline bool v_check_any(const v_int8x16& a)
+{ return v_check_any(v_reinterpret_as_u8(a)); }
+
+inline bool v_check_all(const v_int16x8& a)
+{ return v_check_all(v_reinterpret_as_u16(a)); }
+inline bool v_check_any(const v_int16x8& a)
+{ return v_check_any(v_reinterpret_as_u16(a)); }
+
+inline bool v_check_all(const v_int32x4& a)
+{ return v_check_all(v_reinterpret_as_u32(a)); }
+inline bool v_check_any(const v_int32x4& a)
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+
+inline bool v_check_all(const v_float32x4& a) // float vectors are passed through v_reinterpret_as_u32 first
+{ return v_check_all(v_reinterpret_as_u32(a)); }
+inline bool v_check_any(const v_float32x4& a)
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+
+inline bool v_check_all(const v_int64x2& a)
+{ return v_check_all(v_reinterpret_as_u64(a)); }
+inline bool v_check_any(const v_int64x2& a)
+{ return v_check_any(v_reinterpret_as_u64(a)); }
+
+#if CV_SIMD128_64F // v_float64x2 overloads exist only when CV_SIMD128_64F is non-zero
+inline bool v_check_all(const v_float64x2& a)
+{ return v_check_all(v_reinterpret_as_u64(a)); }
+inline bool v_check_any(const v_float64x2& a)
+{ return v_check_any(v_reinterpret_as_u64(a)); }
+#endif // CV_SIMD128_64F
+#else
+#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, vl) /* clang variant: counts the lanes that compare less than zero */ \
+inline bool v_check_all(const _Tpvec& a) /* true when every one of the vl lanes is negative */ \
+{ \
+    return vcpop(vmslt(a, 0, vl), vl) == vl; /* vmslt builds the "lane < 0" mask, vcpop counts its set bits */ \
+} \
+inline bool v_check_any(const _Tpvec& a) /* true when at least one lane is negative */ \
+{ \
+    return vcpop(vmslt(a, 0, vl), vl) != 0; \
+}
+
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int8x16, 16) // instantiated for the signed types only; arguments are vector type and lane count
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int16x8, 8)
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int32x4, 4)
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int64x2, 2)
+
+
+inline bool v_check_all(const v_uint8x16& a) // unsigned and float overloads forward to the signed overload of the same lane width
+{ return v_check_all(v_reinterpret_as_s8(a)); }
+inline bool v_check_any(const v_uint8x16& a)
+{ return v_check_any(v_reinterpret_as_s8(a)); }
+
+inline bool v_check_all(const v_uint16x8& a)
+{ return v_check_all(v_reinterpret_as_s16(a)); }
+inline bool v_check_any(const v_uint16x8& a)
+{ return v_check_any(v_reinterpret_as_s16(a)); }
+
+inline bool v_check_all(const v_uint32x4& a)
+{ return v_check_all(v_reinterpret_as_s32(a)); }
+inline bool v_check_any(const v_uint32x4& a)
+{ return v_check_any(v_reinterpret_as_s32(a)); }
+
+inline bool v_check_all(const v_float32x4& a) // float vectors are passed through v_reinterpret_as_s32 first
+{ return v_check_all(v_reinterpret_as_s32(a)); }
+inline bool v_check_any(const v_float32x4& a)
+{ return v_check_any(v_reinterpret_as_s32(a)); }
+
+inline bool v_check_all(const v_uint64x2& a)
+{ return v_check_all(v_reinterpret_as_s64(a)); }
+inline bool v_check_any(const v_uint64x2& a)
+{ return v_check_any(v_reinterpret_as_s64(a)); }
+
+#if CV_SIMD128_64F // v_float64x2 overloads exist only when CV_SIMD128_64F is non-zero
+inline bool v_check_all(const v_float64x2& a)
+{ return v_check_all(v_reinterpret_as_s64(a)); }
+inline bool v_check_any(const v_float64x2& a)
+{ return v_check_any(v_reinterpret_as_s64(a)); }
+#endif // CV_SIMD128_64F
+#endif
+////////////// abs //////////////
+
+#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) /* defines v_<abs>(a, b); the result has the same type as the inputs */ \
+inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return v_max(a, b) - v_min(a, b); /* larger lane minus smaller lane, using the type's own operator - */ \
+}
+
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8x16, absdiff) // v_absdiff for the unsigned and floating-point types
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16x8, absdiff)
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32x4, absdiff)
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32x4, absdiff)
+#if CV_SIMD128_64F // the v_float64x2 overload exists only when CV_SIMD128_64F is non-zero
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64x2, absdiff)
+#endif // CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8x16, absdiffs) // v_absdiffs: signed inputs, result keeps the signed type
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs)
+
+#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(ivec, uvec, itype, utype, isuf, usuf, vlen) \
+inline uvec v_absdiff(const ivec& a, const ivec& b) \
+{ \
+    const itype larger = vmax_vv_##isuf(a, b, vlen); \
+    const itype diff = vsub_vv_##isuf(larger, vmin_vv_##isuf(a, b, vlen), vlen); \
+    return uvec(vreinterpret_v_##isuf##_##usuf(diff)); /* signed inputs, result reinterpreted as the unsigned type */ \
+}
+
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vint8m1_t, vuint8m1_t, i8m1, u8m1, 16) // v_absdiff on signed inputs returns the unsigned type of the same width
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vint16m1_t, vuint16m1_t, i16m1, u16m1, 8)
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vint32m1_t, vuint32m1_t, i32m1, u32m1, 4)
+
+#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) /* defines v_abs(a) with result type _Tprvec */ \
+inline _Tprvec v_abs(const _Tpvec& a) \
+{ \
+    return v_absdiff(a, v_setzero_##suffix()); /* absolute value expressed as the absolute difference from zero */ \
+}
+
+OPENCV_HAL_IMPL_RVV_ABS(v_uint8x16, v_int8x16, s8) // arguments: result type, input type, v_setzero_ suffix; signed inputs yield unsigned results
+OPENCV_HAL_IMPL_RVV_ABS(v_uint16x8, v_int16x8, s16)
+OPENCV_HAL_IMPL_RVV_ABS(v_uint32x4, v_int32x4, s32)
+OPENCV_HAL_IMPL_RVV_ABS(v_float32x4, v_float32x4, f32) // float inputs keep their type
+#if CV_SIMD128_64F // the v_float64x2 overload exists only when CV_SIMD128_64F is non-zero
+OPENCV_HAL_IMPL_RVV_ABS(v_float64x2, v_float64x2, f64)
+#endif // CV_SIMD128_64F
+
+
+#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) /* defines v_reduce_sad(a, b), the sum of per-lane absolute differences */ \
+inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return v_reduce_sum(v_absdiff(a, b)); /* composed from the v_absdiff and v_reduce_sum overloads for the type */ \
+}
+
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8x16, unsigned) // every integer type returns `unsigned`
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8x16, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16x8, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16x8, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32x4, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32x4, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32x4, float) // the float overload returns float
+
+////////////// Select //////////////
+
+#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, merge, ne, vl) /* defines v_select(mask, a, b), a per-lane choice between a and b */ \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(merge(ne(mask, 0, vl), b, a, vl)); /* `ne` flags the lanes where mask != 0; b and a are handed to `merge` in that order */ \
+}
+
+OPENCV_HAL_IMPL_RVV_SELECT(v_uint8x16, vmerge_vvm_u8m1, vmsne_vx_u8m1_b8, 16) // arguments: vector type, merge intrinsic, "not equal to scalar" intrinsic, lane count
+OPENCV_HAL_IMPL_RVV_SELECT(v_int8x16, vmerge_vvm_i8m1, vmsne_vx_i8m1_b8, 16)
+OPENCV_HAL_IMPL_RVV_SELECT(v_uint16x8, vmerge_vvm_u16m1, vmsne_vx_u16m1_b16, 8)
+OPENCV_HAL_IMPL_RVV_SELECT(v_int16x8, vmerge_vvm_i16m1, vmsne_vx_i16m1_b16, 8)
+OPENCV_HAL_IMPL_RVV_SELECT(v_uint32x4, vmerge_vvm_u32m1, vmsne_vx_u32m1_b32, 4)
+OPENCV_HAL_IMPL_RVV_SELECT(v_int32x4, vmerge_vvm_i32m1, vmsne_vx_i32m1_b32, 4)
+OPENCV_HAL_IMPL_RVV_SELECT(v_float32x4, vmerge_vvm_f32m1, vmfne_vf_f32m1_b32, 4) // float masks are tested with a floating-point compare against 0
+#if CV_SIMD128_64F // the v_float64x2 overload exists only when CV_SIMD128_64F is non-zero
+OPENCV_HAL_IMPL_RVV_SELECT(v_float64x2, vmerge_vvm_f64m1, vmfne_vf_f64m1_b64, 2)
+#endif // CV_SIMD128_64F
+
+////////////// Rotate shift //////////////
+
+#define OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(_Tpvec, suffix, vl) /* lane-wise v_rotate_left / v_rotate_right for an integer type, built on vslideup / vslidedown */ \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) /* one-vector form: slide a down by n lanes */ \
+{ \
+    return _Tpvec(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl)); /* an all-zero vector is supplied as the destination operand */ \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) /* one-vector form: slide a up by n lanes over a zero vector */ \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl)); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) /* n == 0 is specialised to return a unchanged */ \
+{ return a; } \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) /* two-vector form: a slid down by n, then b slid up by nlanes - n on top */ \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl), b, _Tpvec::nlanes - n, vl)); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) /* two-vector form: b slid down by nlanes - n, then a slid up by n on top */ \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), b, _Tpvec::nlanes - n, vl), a, n, vl)); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) /* n == 0: b contributes nothing */ \
+{ CV_UNUSED(b); return a; }
+
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint8x16, u8, 16) // arguments: vector type, intrinsic type suffix, lane count
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int8x16, i8, 16)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint16x8, u16, 8)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int16x8, i16, 8)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint32x4, u32, 4)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int32x4, i32, 4)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint64x2, u64, 2)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int64x2, i64, 2)
+
+#define OPENCV_HAL_IMPL_RVV_ROTATE_FP(_Tpvec, suffix, vl) /* floating-point counterpart of ROTATE_INTEGER; the zero vector comes from vfmv_v_f instead of vmv_v_x */ \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) /* one-vector form: slide a down by n lanes */ \
+{ \
+    return _Tpvec(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl)); /* an all-zero vector is supplied as the destination operand */ \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) /* one-vector form: slide a up by n lanes over a zero vector */ \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl)); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) /* n == 0 is specialised to return a unchanged */ \
+{ return a; } \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) /* two-vector form: a slid down by n, then b slid up by nlanes - n on top */ \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl), b, _Tpvec::nlanes - n, vl)); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) /* two-vector form: b slid down by nlanes - n, then a slid up by n on top */ \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), b, _Tpvec::nlanes - n, vl), a, n, vl)); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) /* n == 0: b contributes nothing */ \
+{ CV_UNUSED(b); return a; }
+
+OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32x4, f32, 4) // arguments: vector type, intrinsic type suffix, lane count
+#if CV_SIMD128_64F // the v_float64x2 rotates exist only when CV_SIMD128_64F is non-zero
+OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64x2, f64, 2)
+#endif // CV_SIMD128_64F
+
+////////////// Convert to float //////////////
+
+inline v_float32x4 v_cvt_f32(const v_int32x4& a) // converts four int32 lanes to four float lanes
+{
+    return v_float32x4(vfcvt_f_x_v_f32m1(a, 4)); // same-width integer-to-float conversion intrinsic
+}
+
+#if CV_SIMD128_64F
+#ifndef __clang__
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{
+    // The two doubles of a are followed by two zeros, so result lanes 2 and 3 are 0.
+    double padded[4] = {a.val[0], a.val[1], 0, 0};
+    return v_float32x4(vfncvt_f_f_w_f32m1(vle64_v_f64m2(padded, 4), 4));
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{
+    // a supplies result lanes 0..1 and b supplies lanes 2..3.
+    double joined[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
+    return v_float32x4(vfncvt_f_f_w_f32m1(vle64_v_f64m2(joined, 4), 4));
+}
+#else
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{
+    vfloat64m2_t wide = vset_v_f64m1_f64m2(vfmv_v_f_f64m2(0, 4), 0, a); // a placed in part 0 of an all-zero f64m2 group
+    return v_float32x4(vfncvt_f_f_w_f32m1(wide, 4));
+}
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{
+    vfloat64m2_t wide = vset_v_f64m1_f64m2(vlmul_ext_v_f64m1_f64m2(a), 1, b); // a extended to an f64m2 group, b written to part 1
+    return v_float32x4(vfncvt_f_f_w_f32m1(wide, 4));
+}
+#endif
+
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{
+    // All four int32 lanes are widened to double and spilled to memory;
+    // only the results of lanes 0 and 1 are reloaded and returned.
+    double widened[4] = {0};
+    vse64_v_f64m2(widened, vfwcvt_f_x_v_f64m2(a, 4), 4);
+    double lower[2];
+    lower[0] = widened[0]; lower[1] = widened[1];
+    return v_float64x2(vle64_v_f64m1(lower, 2));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{
+    // All four int32 lanes are widened to double and spilled to memory;
+    // only the results of lanes 2 and 3 are reloaded and returned.
+    double widened[4] = {0};
+    vse64_v_f64m2(widened, vfwcvt_f_x_v_f64m2(a, 4), 4);
+    double upper[2];
+    upper[0] = widened[2]; upper[1] = widened[3];
+    return v_float64x2(vle64_v_f64m1(upper, 2));
+}
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{
+    // All four float lanes are widened to double and spilled to memory;
+    // only the results of lanes 0 and 1 are reloaded and returned.
+    double widened[4] = {0};
+    vse64_v_f64m2(widened, vfwcvt_f_f_v_f64m2(a, 4), 4);
+    double lower[2];
+    lower[0] = widened[0]; lower[1] = widened[1];
+    return v_float64x2(vle64_v_f64m1(lower, 2));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{
+    // All four float lanes are widened to double and spilled to memory;
+    // only the results of lanes 2 and 3 are reloaded and returned.
+    double widened[4] = {0};
+    vse64_v_f64m2(widened, vfwcvt_f_f_v_f64m2(a, 4), 4);
+    double upper[2];
+    upper[0] = widened[2]; upper[1] = widened[3];
+    return v_float64x2(vle64_v_f64m1(upper, 2));
+}
+
+inline v_float64x2 v_cvt_f64(const v_int64x2& a) // converts two int64 lanes to two double lanes
+{
+    return v_float64x2(vfcvt_f_x_v_f64m1(a, 2)); // same-width integer-to-float conversion intrinsic
+}
+#endif
+
+////////////// Broadcast //////////////
+
+#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) /* defines v_broadcast_element<i>(v): lane i of v copied into every lane */ \
+template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) \
+{ \
+    return v_setall_##suffix(v_extract_n<i>(v)); /* extract lane i as a scalar, then splat it with v_setall_<suffix> */ \
+}
+
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint8x16, u8) // second argument is the v_setall_ suffix (s8/s16/... for the signed types)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_int8x16, s8)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint16x8, u16)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_int16x8, s16)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32x4, u32)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32x4, s32)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint64x2, u64)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_int64x2, s64)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32x4, f32)
+#if CV_SIMD128_64F // the v_float64x2 overload exists only when CV_SIMD128_64F is non-zero
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_float64x2, f64)
+#endif // CV_SIMD128_64F
+
+////////////// Transpose4x4 //////////////
+
+#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, _Tp, suffix) /* defines v_transpose4x4 for v_<_Tpvec>; the suffix argument is not used in the body */ \
+inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
+                         const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
+                         v_##_Tpvec& b0, v_##_Tpvec& b1, \
+                         v_##_Tpvec& b2, v_##_Tpvec& b3) \
+{ \
+    _Tp elems0[4] = /* output row k gathers lane k of a0, a1, a2 and a3, in that order */ \
+    { \
+        v_extract_n<0>(a0), \
+        v_extract_n<0>(a1), \
+        v_extract_n<0>(a2), \
+        v_extract_n<0>(a3) \
+    }; \
+    b0 = v_load(elems0); /* each gathered row is reloaded from the scalar array */ \
+    _Tp elems1[4] = \
+    { \
+        v_extract_n<1>(a0), \
+        v_extract_n<1>(a1), \
+        v_extract_n<1>(a2), \
+        v_extract_n<1>(a3) \
+    }; \
+    b1 = v_load(elems1); \
+    _Tp elems2[4] = \
+    { \
+        v_extract_n<2>(a0), \
+        v_extract_n<2>(a1), \
+        v_extract_n<2>(a2), \
+        v_extract_n<2>(a3) \
+    }; \
+    b2 = v_load(elems2); \
+    _Tp elems3[4] = \
+    { \
+        v_extract_n<3>(a0), \
+        v_extract_n<3>(a1), \
+        v_extract_n<3>(a2), \
+        v_extract_n<3>(a3) \
+    }; \
+    b3 = v_load(elems3); \
+}
+
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(uint32x4, unsigned, u32) // first argument omits the v_ prefix; the macro prepends it
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(int32x4, int, i32)
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(float32x4, float, f32)
+
+////////////// Reverse //////////////
+
+#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_reverse(const _Tpvec& a)  \
+{ \
+    /* Lanes are reversed through memory: store, copy back to front, reload. */ \
+    _Tp lanes[_Tpvec::nlanes] = {0}; \
+    _Tp flipped[_Tpvec::nlanes] = {0}; \
+    v_store(lanes, a); \
+    const int last = _Tpvec::nlanes - 1; \
+    for (int src = last; src >= 0; --src) \
+        flipped[last - src] = lanes[src]; \
+    return v_load(flipped); \
+}
+
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8x16, uchar, u8) // arguments: vector type, scalar lane type, type suffix
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int8x16, schar, i8)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int16x8, short, i16)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int32x4, int, i32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_float32x4, float, f32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int64x2, int64, i64)
+#if CV_SIMD128_64F // the v_float64x2 overload exists only when CV_SIMD128_64F is non-zero
+OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, f64)
+#endif // CV_SIMD128_64F
+
+//////////// Value reordering ////////////
+
+#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt, vl) /* widening helpers from _Tpvec to the double-width type _Tpwvec */ \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) /* b0 receives the widened low half of a, b1 the widened high half */ \
+{ \
+    _Tp lptr[_Tpvec::nlanes/2] = {0}; \
+    _Tp hptr[_Tpvec::nlanes/2] = {0}; \
+    v_store_low(lptr, a); /* each half goes through a scalar buffer */ \
+    v_store_high(hptr, a); \
+    b0 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); /* reloaded as a half-length (mf2) vector, then widened by wcvt */ \
+    b1 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \
+} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) /* widened low half only */ \
+{ \
+    _Tp lptr[_Tpvec::nlanes/2] = {0}; \
+    v_store_low(lptr, a); \
+    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \
+} \
+inline _Tpwvec v_expand_high(const _Tpvec& a) /* widened high half only */ \
+{ \
+    _Tp hptr[_Tpvec::nlanes/2] = {0}; \
+    v_store_high(hptr, a); \
+    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) /* loads vl narrow elements straight from ptr and widens them */ \
+{ \
+    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(ptr, vl), vl)); \
+}
+
+OPENCV_HAL_IMPL_RVV_EXPAND(v_uint16x8, uchar, v_uint8x16, 8, u8, vwcvtu_x_x_v_u16m1, 8) // arguments: wide type, narrow scalar, narrow type, narrow bit width, narrow suffix, widening intrinsic, wide lane count
+OPENCV_HAL_IMPL_RVV_EXPAND(v_int16x8, schar, v_int8x16, 8, i8, vwcvt_x_x_v_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_uint32x4, ushort, v_uint16x8, 16, u16, vwcvtu_x_x_v_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_int32x4, short, v_int16x8, 16, i16, vwcvt_x_x_v_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_uint64x2, uint, v_uint32x4, 32, u32, vwcvtu_x_x_v_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_int64x2, int, v_int32x4, 32, i32, vwcvt_x_x_v_i64m1, 2)
+
+inline v_uint32x4 v_load_expand_q(const uchar* ptr) // loads four bytes from ptr and widens them to four 32-bit lanes
+{
+    return v_uint32x4(vwcvtu_x_x_v_u32m1(vwcvtu_x_x_v_u16mf2(vle8_v_u8mf4(ptr, 4), 4), 4)); // two unsigned widening steps: u8 -> u16 -> u32
+}
+
+inline v_int32x4 v_load_expand_q(const schar* ptr) // loads four signed bytes from ptr and widens them to four 32-bit lanes
+{
+    return v_int32x4(vwcvt_x_x_v_i32m1(vwcvt_x_x_v_i16mf2(vle8_v_i8mf4(ptr, 4), 4), 4)); // two signed widening steps: i8 -> i16 -> i32
+}
+
+// Narrowing packs: both wide inputs are stored back-to-back in one array, reloaded as a single m2 vector and narrowed with shr/rshr.
+#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, shr, hvl, vl) \
+inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
+{ \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, b); \
+    return _Tpvec(shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl)); /* shift amount 0: narrowing only */ \
+} \
+inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
+{ \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); /* upper half of arr is zero-filled */ \
+    vse##hwidth##_v_##hsuffix##m1(ptr, shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl), hvl); /* writes hvl narrow elements */ \
+} \
+template<int n> inline \
+_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \
+{ \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, b); \
+    return _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)); /* n is the right-shift amount */ \
+} \
+template<int n> inline \
+void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a) \
+{ \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
+    vse##hwidth##_v_##hsuffix##m1(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)), hvl); /* writes hvl narrow elements */ \
+}
+
+OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 8, 16, u8, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1, 8, 16)
+OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 8, 16, i8, i16, vnclip_wx_i8m1, vnclip_wx_i8m1, 8, 16)
+OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 16, 32, u16, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1, 4, 8)
+OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 16, 32, i16, i32, vnclip_wx_i16m1, vnclip_wx_i16m1, 4, 8)
+OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 32, 64, u32, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1, 2, 4) // 64->32: v_pack uses vnsrl, v_rshr_pack uses vnclipu
+OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 32, 64, i32, i64, vnclip_wx_i32m1, vnsra_wx_i32m1, 2, 4) // 64->32: v_pack uses vnsra, v_rshr_pack uses vnclip
+
+// Signed -> unsigned narrowing packs: negative lanes are clamped to 0 (vmax with 0), reinterpreted as unsigned, then narrowed with rshr.
+#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, cast, hvl, vl) \
+inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
+{ \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, b); \
+    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl)); \
+} \
+inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
+{ \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); /* upper half of arr is zero-filled */ \
+    vse##hwidth##_v_##hsuffix##m1(ptr, rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl), hvl); /* writes hvl narrow elements */ \
+} \
+template<int n> inline \
+_Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \
+{ \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, b); \
+    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl)); /* n is the right-shift amount */ \
+} \
+template<int n> inline \
+void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a) \
+{ \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
+    vse##hwidth##_v_##hsuffix##m1(ptr, rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl), hvl); /* store only hvl elements, like v_pack_u_store; a full v_store would write past the packed half */ \
+}
+
+OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 8, 16, u8, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2, 8, 16)
+OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 16, 32, u16, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2, 4, 8)
+
+// Lane shuffles done with scalar copies through stack buffers: v_zip, v_combine_low/high and v_recombine.
+#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, suffix) \
+inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
+{ \
+    _Tp ptra0[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra1[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb0[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb1[v_##_Tpvec::nlanes] = {0}; \
+    v_store(ptra0, a0); \
+    v_store(ptra1, a1); \
+    int i; \
+    for( i = 0; i < v_##_Tpvec::nlanes/2; i++ ) /* lower halves of a0/a1 interleave into b0 */ \
+    { \
+        ptrb0[i*2] = ptra0[i]; \
+        ptrb0[i*2+1] = ptra1[i]; \
+    } \
+    for( ; i < v_##_Tpvec::nlanes; i++ ) /* upper halves of a0/a1 interleave into b1 */ \
+    { \
+        ptrb1[i*2-v_##_Tpvec::nlanes] = ptra0[i]; \
+        ptrb1[i*2-v_##_Tpvec::nlanes+1] = ptra1[i]; \
+    } \
+    b0 = v_load(ptrb0); \
+    b1 = v_load(ptrb1); \
+} \
+inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    _Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \
+    v_store_low(ptra, a); \
+    v_store_low(ptrb, b); \
+    return v_load_halves(ptra, ptrb); /* result: low half of a followed by low half of b */ \
+} \
+inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    _Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \
+    v_store_high(ptra, a); \
+    v_store_high(ptrb, b); \
+    return v_load_halves(ptra, ptrb); /* result: high half of a followed by high half of b */ \
+} \
+inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    c = v_combine_low(a, b); \
+    d = v_combine_high(a, b); \
+}
+
+OPENCV_HAL_IMPL_RVV_UNPACKS(uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_RVV_UNPACKS(int8x16, schar, i8)
+OPENCV_HAL_IMPL_RVV_UNPACKS(uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_RVV_UNPACKS(int16x8, short, i16)
+OPENCV_HAL_IMPL_RVV_UNPACKS(uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_RVV_UNPACKS(int32x4, int, i32)
+OPENCV_HAL_IMPL_RVV_UNPACKS(float32x4, float, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, f64)
+#endif
+
+// Interleaved load/store and in-register interleave helpers, written as scalar loops over stack buffers.
+#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp) \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
+{ \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    int i, i2; \
+    for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) /* reads 2*nlanes elements: even ones go to a, odd ones to b */ \
+    { \
+        ptra[i] = ptr[i2]; \
+        ptrb[i] = ptr[i2+1]; \
+    } \
+    a = v_load(ptra); \
+    b = v_load(ptrb); \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
+{ \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
+    int i, i3; \
+    for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) /* reads 3*nlanes elements, one of every three per output */ \
+    { \
+        ptra[i] = ptr[i3]; \
+        ptrb[i] = ptr[i3+1]; \
+        ptrc[i] = ptr[i3+2]; \
+    } \
+    a = v_load(ptra); \
+    b = v_load(ptrb); \
+    c = v_load(ptrc); \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
+                                v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
+    int i, i4; \
+    for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) /* reads 4*nlanes elements, one of every four per output */ \
+    { \
+        ptra[i] = ptr[i4]; \
+        ptrb[i] = ptr[i4+1]; \
+        ptrc[i] = ptr[i4+2]; \
+        ptrd[i] = ptr[i4+3]; \
+    } \
+    a = v_load(ptra); \
+    b = v_load(ptrb); \
+    c = v_load(ptrc); \
+    d = v_load(ptrd); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    int i, i2; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    v_store(ptra, a); \
+    v_store(ptrb, b); \
+    for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) /* writes 2*nlanes elements as a0 b0 a1 b1 ...; the mode argument is ignored */ \
+    { \
+        ptr[i2] = ptra[i]; \
+        ptr[i2+1] = ptrb[i]; \
+    } \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    int i, i3; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
+    v_store(ptra, a); \
+    v_store(ptrb, b); \
+    v_store(ptrc, c); \
+    for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) /* writes 3*nlanes elements as a0 b0 c0 a1 ... */ \
+    { \
+        ptr[i3] = ptra[i]; \
+        ptr[i3+1] = ptrb[i]; \
+        ptr[i3+2] = ptrc[i]; \
+    } \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
+{ \
+    int i, i4; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
+    v_store(ptra, a); \
+    v_store(ptrb, b); \
+    v_store(ptrc, c); \
+    v_store(ptrd, d); \
+    for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) /* writes 4*nlanes elements as a0 b0 c0 d0 a1 ... */ \
+    { \
+        ptr[i4] = ptra[i]; \
+        ptr[i4+1] = ptrb[i]; \
+        ptr[i4+2] = ptrc[i]; \
+        ptr[i4+3] = ptrd[i]; \
+    } \
+} \
+inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \
+{ \
+    _Tp ptr[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
+    v_store(ptrvec, vec); \
+    for (int i = 0; i < v_##_Tpvec::nlanes/4; i++) /* NOTE(review): runs zero times for 2-lane vectors, so all-zero lanes are returned - confirm intent */ \
+    { \
+        ptr[4*i  ] = ptrvec[4*i  ]; \
+        ptr[4*i+1] = ptrvec[4*i+2]; /* the two middle lanes of each group of four are swapped */ \
+        ptr[4*i+2] = ptrvec[4*i+1]; \
+        ptr[4*i+3] = ptrvec[4*i+3]; \
+    } \
+    return v_load(ptr); \
+} \
+inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) \
+{ \
+    _Tp ptr[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
+    v_store(ptrvec, vec); \
+    for (int i = 0; i < v_##_Tpvec::nlanes/8; i++) /* NOTE(review): runs zero times for 4- and 2-lane vectors, so all-zero lanes are returned - confirm intent */ \
+    { \
+        ptr[8*i  ] = ptrvec[8*i  ]; /* each group of eight is reordered to 0 4 1 5 2 6 3 7 */ \
+        ptr[8*i+1] = ptrvec[8*i+4]; \
+        ptr[8*i+2] = ptrvec[8*i+1]; \
+        ptr[8*i+3] = ptrvec[8*i+5]; \
+        ptr[8*i+4] = ptrvec[8*i+2]; \
+        ptr[8*i+5] = ptrvec[8*i+6]; \
+        ptr[8*i+6] = ptrvec[8*i+3]; \
+        ptr[8*i+7] = ptrvec[8*i+7]; \
+    } \
+    return v_load(ptr); \
+}
+
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8x16, uchar)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8x16, schar)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16x8, ushort)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16x8, short)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32x4, unsigned)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32x4, int)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32x4, float)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64x2, uint64)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64x2, int64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64x2, double)
+#endif
+
+//////////// PopCount ////////////
+// Lookup table with 256 entries: popCountTable[b] is the number of set bits in the byte value b.
+static const unsigned char popCountTable[] =
+{
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
+};
+// v_popcount: per-lane count of set bits, summed byte by byte through popCountTable; the result is always an unsigned vector.
+#define OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(_rTpvec, _Tpvec, _rTp, _Tp, suffix) \
+inline _rTpvec v_popcount(const _Tpvec& a) \
+{ \
+    uchar ptra[16] = {0}; \
+    v_store(ptra, v_reinterpret_as_u8(a)); /* view the whole vector as 16 raw bytes */ \
+    _rTp ptr[_Tpvec::nlanes] = {0}; \
+    v_store(ptr, v_setzero_##suffix()); \
+    for (int i = 0; i < _Tpvec::nlanes*(int)sizeof(_Tp); i++) \
+        ptr[i/sizeof(_Tp)] += popCountTable[ptra[i]]; /* byte i belongs to lane i/sizeof(_Tp) */ \
+    return v_load(ptr); \
+}
+
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_uint8x16, uchar, uchar, u8)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_int8x16, uchar, schar, u8)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_uint16x8, ushort, ushort, u16)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_int16x8, ushort, short, u16)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_uint32x4, unsigned, unsigned, u32)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_int32x4, unsigned, int, u32)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_uint64x2, uint64, uint64, u64)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_int64x2, uint64, int64, u64)
+
+//////////// SignMask ////////////
+
+#ifndef __clang__
+#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, _Tp, suffix, vl, shift) /* non-clang build: bit i of the result is the top bit of lane i */ \
+inline int v_signmask(const _Tpvec& a) \
+{ \
+    int mask = 0; \
+    _Tpvec tmp = _Tpvec(vsrl_vx_##suffix##m1(a, shift, vl)); /* logical shift leaves only the top bit, in bit 0 of each lane */ \
+    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
+        mask |= (int)(tmp.val[i]) << i; /* reads lanes by subscripting val; presumably relies on a compiler vector extension */ \
+    return mask; \
+}
+
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint8x16, uchar, u8, 16, 7)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint16x8, ushort, u16, 8, 15)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint32x4, unsigned, u32, 4, 31)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint64x2, uint64, u64, 2, 63)
+// Signed and floating-point overloads reinterpret the bits as the same-width unsigned vector and reuse the versions above.
+inline int v_signmask(const v_int8x16& a)
+{ return v_signmask(v_reinterpret_as_u8(a)); }
+inline int v_signmask(const v_int16x8& a)
+{ return v_signmask(v_reinterpret_as_u16(a)); }
+inline int v_signmask(const v_int32x4& a)
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+inline int v_signmask(const v_float32x4& a)
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+inline int v_signmask(const v_int64x2& a)
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+#if CV_SIMD128_64F
+inline int v_signmask(const v_float64x2& a)
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+#endif
+
+#else
+#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, width, vl) /* clang build: bit i of the result is set when lane i is negative */ \
+inline int v_signmask(const _Tpvec& a) \
+{ \
+    uint8_t ans[16] = {0}; /* receives the mask bytes stored by vsm */ \
+    vsm(ans, vmslt(a, 0, vl), vl); /* compare each lane with 0 and store the resulting mask */ \
+    return (int(ans[0]) | (int(ans[1]) << 8)) & ((1 << (vl)) - 1); /* vl <= 16, so two bytes suffice; avoids reading the byte array through an int* */ \
+}
+
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int8x16, 8, 16)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int16x8, 16, 8)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int32x4, 32, 4)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int64x2, 64, 2)
+// Unsigned and floating-point overloads reinterpret the bits as the same-width signed vector and reuse the versions above.
+inline int v_signmask(const v_uint8x16& a)
+{ return v_signmask(v_reinterpret_as_s8(a)); }
+inline int v_signmask(const v_uint16x8& a)
+{ return v_signmask(v_reinterpret_as_s16(a)); }
+inline int v_signmask(const v_uint32x4& a)
+{ return v_signmask(v_reinterpret_as_s32(a)); }
+inline int v_signmask(const v_float32x4& a)
+{ return v_signmask(v_reinterpret_as_s32(a)); }
+inline int v_signmask(const v_uint64x2& a)
+{ return v_signmask(v_reinterpret_as_s64(a)); }
+#if CV_SIMD128_64F
+inline int v_signmask(const v_float64x2& a)
+{ return v_signmask(v_reinterpret_as_s64(a)); }
+#endif
+
+#endif
+
+//////////// Scan forward ////////////
+// v_scan_forward: index of the first lane whose sign (top) bit is set; 0 is also returned when no lane has it set.
+#define OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(_Tpvec, _Tp, suffix) \
+inline int v_scan_forward(const _Tpvec& a) \
+{ \
+    /* Test the lane sign bits via v_signmask: converting a stored lane to int loses them for unsigned 8/16-bit, 64-bit and float lanes. */ \
+    const int mask = v_signmask(a); \
+    for (int i = 0; i < _Tpvec::nlanes; i++) \
+        if((mask >> i) & 1) \
+            return i; \
+    return 0; \
+}
+
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float32x4, float, f32)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int64x2, int64, s64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float64x2, double, f64)
+#endif
+
+//////////// Pack triplets ////////////
+// v_pack_triplets: drops every 4th element of the 8/16-bit vectors with a byte-wise gather; 4-lane vectors are returned unchanged.
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{
+    const uint64 ptr[2] = {0x0908060504020100, 0xFFFFFF0F0E0D0C0A}; // byte indices for the gather; presumably read in little-endian byte order (0,1,2,4,5,6,8,9,...) - confirm
+    const v_uint64x2 flags(vle64_v_u64m1(ptr, 2));
+    return v_reinterpret_as_s8(v_uint8x16(
+            vrgather_vv_u8m1(
+                v_reinterpret_as_u8(vec),
+                v_reinterpret_as_u8(flags),
+                16)));
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
+{
+    return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec)));
+}
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{
+    const uint64 ptr[2] = {0x0908050403020100, 0xFFFF0F0E0D0C0B0A}; // byte indices again; each 16-bit lane is moved as a pair of bytes
+    const v_uint64x2 flags(vle64_v_u64m1(ptr, 2));
+    return v_reinterpret_as_s16(v_uint8x16(
+            vrgather_vv_u8m1(
+                v_reinterpret_as_u8(vec),
+                v_reinterpret_as_u8(flags),
+                16)));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
+{
+    return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec)));
+}
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
+
+////// FP16 support ///////
+// hfloat <-> float32x4 conversion: vector convert intrinsics when CV_FP16 is set, otherwise a scalar loop over 4 elements.
+#if CV_FP16
+inline v_float32x4 v_load_expand(const hfloat* ptr)
+{
+    return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr, 4), 4));
+}
+
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
+{
+    vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v, 4), 4);
+}
+#else
+inline v_float32x4 v_load_expand(const hfloat* ptr)
+{
+    const int N = 4;
+    float buf[N];
+    for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i]; // per-element hfloat -> float conversion
+    return v_load(buf);
+}
+
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
+{
+    const int N = 4;
+    float buf[N];
+    v_store(buf, v);
+    for( int i = 0; i < N; i++ ) ptr[i] = hfloat(buf[i]); // per-element float -> hfloat conversion
+}
+#endif
+
+////////////// Rounding //////////////
+// float32x4 -> int32x4 conversions; v_floor and v_ceil shift the input by 0.5 and reuse the same convert intrinsic as v_round.
+inline v_int32x4 v_round(const v_float32x4& a)
+{
+    return v_int32x4(vfcvt_x_f_v_i32m1(a, 4)); // presumably rounds according to the current dynamic rounding mode - confirm
+}
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{
+    v_float32x4 ZP5 = v_setall_f32(0.5f);
+    v_float32x4 t = a - ZP5; // NOTE(review): if the convert rounds ties to even, an odd integer input such as 1.0f becomes 0.5f -> 0 - confirm
+    return v_int32x4(vfcvt_x_f_v_i32m1(t, 4));
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{
+    v_float32x4 ZP5 = v_setall_f32(0.5f);
+    v_float32x4 t = a + ZP5; // NOTE(review): same tie concern as v_floor, e.g. 1.0f becomes 1.5f -> 2 under ties-to-even - confirm
+    return v_int32x4(vfcvt_x_f_v_i32m1(t, 4));
+}
+// v_trunc (float32x4): convert to int32 rounding toward zero; the CV_RVV_THEAD_0_7 path switches the C rounding mode around the convert.
+inline v_int32x4 v_trunc(const v_float32x4& a)
+{
+#ifndef CV_RVV_THEAD_0_7
+    return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a, 4));
+#else
+    const int old_round = fegetround(); fesetround(FE_TOWARDZERO); // fesetround() returns a status code, so the previous mode must come from fegetround()
+    vint32m1_t val = vfcvt_x_f_v_i32m1(a, 4);
+    fesetround(old_round); // restore the caller's rounding mode
+    return v_int32x4(val);
+#endif
+}
+#if CV_SIMD128_64F
+#ifndef __clang__
+inline v_int32x4 v_round(const v_float64x2& a) // non-clang build: lanes are read by subscripting val and staged in a 4-element array
+{
+    double arr[4] = {a.val[0], a.val[1], 0, 0}; // the two upper result lanes come from zeros
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4)); // narrowing double -> int32 convert
+}
+
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{
+    double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]}; // a fills result lanes 0-1, b fills lanes 2-3
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
+}
+
+inline v_int32x4 v_floor(const v_float64x2& a)
+{
+    double arr[4] = {a.val[0]-0.5f, a.val[1]-0.5f, 0, 0}; // NOTE(review): relies on the convert's rounding of x-0.5; odd integer inputs may land on a tie - confirm
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a)
+{
+    double arr[4] = {a.val[0]+0.5f, a.val[1]+0.5f, 0, 0}; // NOTE(review): same tie concern as v_floor - confirm
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
+}
+// v_trunc (float64x2): narrowing convert toward zero into lanes 0-1; lanes 2-3 come from zeros.
+inline v_int32x4 v_trunc(const v_float64x2& a)
+{
+    double arr[4] = {a.val[0], a.val[1], 0, 0};
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+#ifndef CV_RVV_THEAD_0_7
+    return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp, 4));
+#else
+    const int old_round = fegetround(); fesetround(FE_TOWARDZERO); // fesetround() returns a status code, so the previous mode must come from fegetround()
+    vint32m1_t val = vfncvt_x_f_w_i32m1(tmp, 4);
+    fesetround(old_round); // restore the caller's rounding mode
+    return v_int32x4(val);
+#endif
+}
+
+#else
+inline v_int32x4 v_round(const v_float64x2& a) // clang build: the m2 operand is assembled with vset/vlmul_ext instead of a stack array
+{
+    vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4);
+    return v_int32x4(vfncvt_x_f_w_i32m1(vset_v_f64m1_f64m2(zero, 0, a), 4)); // a goes into part 0, part 1 stays zero
+}
+
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{
+    vfloat64m2_t dst = vlmul_ext_v_f64m1_f64m2(a);
+    return v_int32x4(vfncvt_x_f_w_i32m1(vset_v_f64m1_f64m2(dst, 1, b), 4)); // a in part 0, b in part 1
+}
+
+inline v_int32x4 v_floor(const v_float64x2& a)
+{
+    vfloat64m2_t dst = vfmv_v_f_f64m2(0, 4);
+    dst = vset_v_f64m1_f64m2(dst, 0, a);
+    dst = vfsub_vf_f64m2(dst, 0.5, 2); // vl is 2 here: only the lanes holding a are shifted
+    return v_int32x4(vfncvt_x_f_w_i32m1(dst, 4)); // NOTE(review): same x-0.5 tie concern as the float32 v_floor - confirm
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a)
+{
+    vfloat64m2_t dst = vfmv_v_f_f64m2(0, 4);
+    dst = vset_v_f64m1_f64m2(dst, 0, a);
+    dst = vfadd_vf_f64m2(dst, 0.5, 2); // vl is 2 here: only the lanes holding a are shifted
+    return v_int32x4(vfncvt_x_f_w_i32m1(dst, 4));
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a)
+{
+    vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4);
+    return v_int32x4(vfncvt_rtz_x_f_w_i32m1(vset_v_f64m1_f64m2(zero, 0, a), 4)); // rtz variant of the convert; no CV_RVV_THEAD_0_7 fallback in this branch
+}
+#endif
+#endif
+
+
+//////// Dot Product ////////
+
+// 16 >> 32: widening multiply of all 8 lanes, then each even-indexed product is added to the following odd-indexed one
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{
+    int ptr[8] = {0};
+    v_int32x4 t1, t2;
+    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8); // 8 products of 32 bits each
+    v_load_deinterleave(ptr, t1, t2); // t1 = products 0,2,4,6; t2 = products 1,3,5,7
+    return t1 + t2;
+}
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    int ptr[8] = {0};
+    v_int32x4 t1, t2;
+    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
+    v_load_deinterleave(ptr, t1, t2);
+    return t1 + t2 + c; // same as above with accumulator c added
+}
+
+// 32 >> 64: widening multiply of all 4 lanes, then each even-indexed product is added to the following odd-indexed one
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
+{
+    int64 ptr[4] = {0};
+    v_int64x2 t1, t2;
+    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4); // 4 products of 64 bits each
+    v_load_deinterleave(ptr, t1, t2); // t1 = products 0,2; t2 = products 1,3
+    return t1 + t2;
+}
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    int64 ptr[4] = {0};
+    v_int64x2 t1, t2;
+    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
+    v_load_deinterleave(ptr, t1, t2);
+    return t1 + t2 + c; // same as above with accumulator c added
+}
+
+// 8 >> 32: products are widened to 32 bits and every 4 consecutive products are summed into one lane
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{
+    unsigned ptr[16] = {0};
+    v_uint32x4 t1, t2, t3, t4;
+    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16); // 8x8 -> 16-bit products, then widened to 32 bits
+    v_load_deinterleave(ptr, t1, t2, t3, t4); // tK holds products K-1, K+3, K+7, K+11
+    return t1 + t2 + t3 + t4;
+}
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
+                                   const v_uint32x4& c)
+{
+    unsigned ptr[16] = {0};
+    v_uint32x4 t1, t2, t3, t4;
+    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4 + c; // same as above with accumulator c added
+}
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
+{
+    int ptr[16] = {0};
+    v_int32x4 t1, t2, t3, t4;
+    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16); // signed variant of the sequence above
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4;
+}
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
+                                  const v_int32x4& c)
+{
+    int ptr[16] = {0};
+    v_int32x4 t1, t2, t3, t4;
+    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4 + c;
+}
+
+// 16 >> 64: products are widened to 64 bits and every 4 consecutive products are summed into one lane
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{
+    uint64 ptr[8] = {0};
+    v_uint64x2 t1, t2, t3, t4;
+    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8); // 16x16 -> 32-bit products, then widened to 64 bits
+    v_load_deinterleave(ptr, t1, t2, t3, t4); // tK holds products K-1 and K+3
+    return t1 + t2 + t3 + t4;
+}
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{
+    uint64 ptr[8] = {0};
+    v_uint64x2 t1, t2, t3, t4;
+    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4 + c; // same as above with accumulator c added
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
+{
+    int64 ptr[8] = {0};
+    v_int64x2 t1, t2, t3, t4;
+    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8); // signed variant of the sequence above
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4;
+}
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
+                                  const v_int64x2& c)
+{
+    int64 ptr[8] = {0};
+    v_int64x2 t1, t2, t3, t4;
+    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4 + c;
+}
+
+// 32 >> 64f: the integer v_dotprod result is converted to double with v_cvt_f64
+#if CV_SIMD128_64F
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{ return v_cvt_f64(v_dotprod(a, b)); }
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a,   const v_int32x4& b,
+                                    const v_float64x2& c)
+{ return v_dotprod_expand(a, b) + c; } // accumulator c is added after the conversion
+#endif
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32: unlike v_dotprod, lane i is product i plus product i+4 (the two halves of the product array are added)
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    int ptr[8] = {0};
+    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8); // 8 products of 32 bits each
+    v_int32x4 t1 = v_load(ptr);
+    v_int32x4 t2 = v_load(ptr+4);
+    return t1 + t2;
+}
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    int ptr[8] = {0};
+    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
+    v_int32x4 t1 = v_load(ptr);
+    v_int32x4 t2 = v_load(ptr+4);
+    return t1 + t2 + c; // same as above with accumulator c added
+}
+
+// 32 >> 64: lane i is product i plus product i+2 (the two halves of the product array are added)
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{
+    int64 ptr[4] = {0};
+    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4); // 4 products of 64 bits each
+    v_int64x2 t1 = v_load(ptr);
+    v_int64x2 t2 = v_load(ptr+2);
+    return t1 + t2;
+}
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    int64 ptr[4] = {0};
+    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
+    v_int64x2 t1 = v_load(ptr);
+    v_int64x2 t2 = v_load(ptr+2);
+    return t1 + t2 + c; // same as above with accumulator c added
+}
+
+
+// 8 >> 32: the 16 widened products are added as four consecutive 4-lane chunks, so lane i sums products i, i+4, i+8, i+12
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{
+    unsigned ptr[16] = {0};
+    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16); // 8x8 -> 16-bit products, then widened to 32 bits
+    v_uint32x4 t1 = v_load(ptr);
+    v_uint32x4 t2 = v_load(ptr+4);
+    v_uint32x4 t3 = v_load(ptr+8);
+    v_uint32x4 t4 = v_load(ptr+12);
+    return t1 + t2 + t3 + t4;
+}
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{
+    unsigned ptr[16] = {0};
+    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
+    v_uint32x4 t1 = v_load(ptr);
+    v_uint32x4 t2 = v_load(ptr+4);
+    v_uint32x4 t3 = v_load(ptr+8);
+    v_uint32x4 t4 = v_load(ptr+12);
+    return t1 + t2 + t3 + t4 + c; // same as above with accumulator c added
+}
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{
+    int ptr[16] = {0};
+    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16); // signed variant of the sequence above
+    v_int32x4 t1 = v_load(ptr);
+    v_int32x4 t2 = v_load(ptr+4);
+    v_int32x4 t3 = v_load(ptr+8);
+    v_int32x4 t4 = v_load(ptr+12);
+    return t1 + t2 + t3 + t4;
+}
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{
+    int ptr[16] = {0};
+    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
+    v_int32x4 t1 = v_load(ptr);
+    v_int32x4 t2 = v_load(ptr+4);
+    v_int32x4 t3 = v_load(ptr+8);
+    v_int32x4 t4 = v_load(ptr+12);
+    return t1 + t2 + t3 + t4 + c;
+}
+
+// 16 >> 64: the 8 widened products are added as four consecutive 2-lane chunks, so lane i sums products i, i+2, i+4, i+6
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{
+    uint64 ptr[8] = {0};
+    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8); // 16x16 -> 32-bit products, then widened to 64 bits
+    v_uint64x2 t1 = v_load(ptr);
+    v_uint64x2 t2 = v_load(ptr+2);
+    v_uint64x2 t3 = v_load(ptr+4);
+    v_uint64x2 t4 = v_load(ptr+6);
+    return t1 + t2 + t3 + t4;
+}
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{
+    uint64 ptr[8] = {0};
+    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
+    v_uint64x2 t1 = v_load(ptr);
+    v_uint64x2 t2 = v_load(ptr+2);
+    v_uint64x2 t3 = v_load(ptr+4);
+    v_uint64x2 t4 = v_load(ptr+6);
+    return t1 + t2 + t3 + t4 + c; // same as above with accumulator c added
+}
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    int64 ptr[8] = {0};
+    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8); // signed variant of the sequence above
+    v_int64x2 t1 = v_load(ptr);
+    v_int64x2 t2 = v_load(ptr+2);
+    v_int64x2 t3 = v_load(ptr+4);
+    v_int64x2 t4 = v_load(ptr+6);
+    return t1 + t2 + t3 + t4;
+}
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{
+    int64 ptr[8] = {0};
+    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
+    v_int64x2 t1 = v_load(ptr);
+    v_int64x2 t2 = v_load(ptr+2);
+    v_int64x2 t3 = v_load(ptr+4);
+    v_int64x2 t4 = v_load(ptr+6);
+    return t1 + t2 + t3 + t4 + c;
+}
+
+// 32 >> 64f: the integer v_dotprod_fast result is converted to double with v_cvt_f64
+#if CV_SIMD128_64F
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_cvt_f64(v_dotprod_fast(a, b)); }
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_dotprod_expand_fast(a, b) + c; } // accumulator c is added after the conversion
+#endif
+
+// v_matmul: m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3]; v_matmuladd uses only v[0..2] and then adds the vector a.
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{
+    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v), 4); // start with m0 scaled by lane 0 of v
+    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1, 4); // accumulate the next scaled vector
+    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2, 4);
+    res = vfmacc_vf_f32m1(res, v_extract_n<3>(v), m3, 4);
+    return v_float32x4(res);
+}
+
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a)
+{
+    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v), 4);
+    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1, 4);
+    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2, 4);
+    return v_float32x4(res) + a; // lane 3 of v is not used; a is added unscaled
+}
+// v_mul_expand: widening multiply of all lanes; c receives the first half of the products and d the second half.
+#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _Tpw, suffix, wmul, width, vl, hvl) \
+inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
+{ \
+    _Tpw ptr[_Tpwvec::nlanes*2] = {0}; \
+    vse##width##_v_##suffix##m2(ptr, wmul(a, b, vl), vl); /* all vl wide products are staged in ptr */ \
+    c = _Tpwvec(vle##width##_v_##suffix##m1(ptr, hvl)); \
+    d = _Tpwvec(vle##width##_v_##suffix##m1(ptr+_Tpwvec::nlanes, hvl)); \
+}
+
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8x16, v_uint16x8, ushort, u16, vwmulu_vv_u16m2, 16, 16, 8)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8x16, v_int16x8, short, i16, vwmul_vv_i16m2, 16, 16, 8)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16x8, v_uint32x4, unsigned, u32, vwmulu_vv_u32m2, 32, 8, 4)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16x8, v_int32x4, int, i32, vwmul_vv_i32m2, 32, 8, 4)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32x4, v_uint64x2, uint64, u64, vwmulu_vv_u64m2, 64, 4, 2)
+
+// v_mul_hi: widening 16x16 multiply followed by a narrowing right shift by 16, i.e. the upper 16 bits of each product.
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{
+    return v_int16x8(vnsra_wx_i16m1(vwmul_vv_i32m2(a, b, 8), 16, 8));
+}
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{
+    return v_uint16x8(vnsrl_wx_u16m1(vwmulu_vv_u32m2(a, b, 8), 16, 8));
+}
+
+
+//////// Saturating Multiply ////////
+// operator* / operator*= for 8- and 16-bit vectors: multiply with v_mul_expand, then narrow the two wide halves with v_pack.
+#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _wTpvec) \
+inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _wTpvec c, d; \
+    v_mul_expand(a, b, c, d); \
+    return v_pack(c, d); /* narrowing back to the lane type happens in v_pack */ \
+} \
+inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a = a * b; \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8x16, v_uint16x8)
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8x16, v_int16x8)
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16x8, v_uint32x4)
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16x8, v_int32x4)
+// v_cleanup: empty function body in this backend.
+inline void v_cleanup() {}
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+} // namespace cv
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv071.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv071.hpp
new file mode 100644
index 000000000000..5681ae211de6
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv071.hpp
@@ -0,0 +1,2899 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// Copyright (C) 2015, PingTouGe Semiconductor Co., Ltd., all rights reserved.
+
+#ifndef OPENCV_HAL_INTRIN_RISCVV_HPP
+#define OPENCV_HAL_INTRIN_RISCVV_HPP
+
+#include <float.h>
+#include <algorithm>
+#include "opencv2/core/utility.hpp"
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+//////////// Types ////////////
+// Sixteen uchar lanes held in a single vuint8m1_t value.
+struct v_uint8x16
+{
+    typedef uchar lane_type;
+    enum { nlanes = 16 };
+
+    vuint8m1_t val;
+
+    v_uint8x16() {} // val is left unset
+    explicit v_uint8x16(vuint8m1_t v) : val(v) {}
+    // Loads the sixteen scalars in argument order, v0 first.
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        uchar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = vle8_v_u8m1(reinterpret_cast<unsigned char*>(lanes), nlanes);
+    }
+    // Reads the first element of val.
+    uchar get0() const { return vmv_x_s_u8m1_u8(val); }
+};
+
+// Sixteen schar lanes held in a single vint8m1_t value.
+struct v_int8x16
+{
+    typedef schar lane_type;
+    enum { nlanes = 16 };
+
+    vint8m1_t val;
+
+    v_int8x16() {} // val is left unset
+    explicit v_int8x16(vint8m1_t v) : val(v) {}
+    // Loads the sixteen scalars in argument order, v0 first.
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+               schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        schar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = vle8_v_i8m1(lanes, nlanes);
+    }
+    // Reads the first element of val.
+    schar get0() const { return vmv_x_s_i8m1_i8(val); }
+};
+
+// Eight ushort lanes held in a single vuint16m1_t value.
+struct v_uint16x8
+{
+    typedef ushort lane_type;
+    enum { nlanes = 8 };
+
+    vuint16m1_t val;
+
+    v_uint16x8() {} // val is left unset
+    explicit v_uint16x8(vuint16m1_t v) : val(v) {}
+    // Loads the eight scalars in argument order, v0 first.
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        ushort lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = vle16_v_u16m1(reinterpret_cast<unsigned short*>(lanes), nlanes);
+    }
+    // Reads the first element of val.
+    ushort get0() const { return vmv_x_s_u16m1_u16(val); }
+};
+
+// Eight short lanes held in a single vint16m1_t value.
+struct v_int16x8
+{
+    typedef short lane_type;
+    enum { nlanes = 8 };
+
+    vint16m1_t val;
+
+    v_int16x8() {} // val is left unset
+    explicit v_int16x8(vint16m1_t v) : val(v) {}
+    // Loads the eight scalars in argument order, v0 first.
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        short lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = vle16_v_i16m1(lanes, nlanes);
+    }
+    // Reads the first element of val.
+    short get0() const { return vmv_x_s_i16m1_i16(val); }
+};
+
+// Four unsigned lanes held in a single vuint32m1_t value.
+struct v_uint32x4
+{
+    typedef unsigned lane_type;
+    enum { nlanes = 4 };
+
+    vuint32m1_t val;
+
+    v_uint32x4() {} // val is left unset
+    explicit v_uint32x4(vuint32m1_t v) : val(v) {}
+    // Loads the four scalars in argument order, v0 first.
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    {
+        unsigned lanes[] = {v0, v1, v2, v3};
+        val = vle32_v_u32m1(lanes, nlanes);
+    }
+    // Reads the first element of val.
+    unsigned get0() const { return vmv_x_s_u32m1_u32(val); }
+};
+
+// Four int lanes held in a single vint32m1_t value.
+struct v_int32x4
+{
+    typedef int lane_type;
+    enum { nlanes = 4 };
+
+    vint32m1_t val;
+    v_int32x4() {} // val is left unset
+    explicit v_int32x4(vint32m1_t v) : val(v) {}
+    // Loads the four scalars in argument order, v0 first.
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        int lanes[] = {v0, v1, v2, v3};
+        val = vle32_v_i32m1(lanes, nlanes);
+    }
+    // Reads the first element of val.
+    int get0() const { return vmv_x_s_i32m1_i32(val); }
+};
+
+// Four float lanes held in a single vfloat32m1_t value.
+struct v_float32x4
+{
+    typedef float lane_type;
+    enum { nlanes = 4 };
+
+    vfloat32m1_t val;
+    v_float32x4() {} // val is left unset
+    explicit v_float32x4(vfloat32m1_t v) : val(v) {}
+    // Loads the four scalars in argument order, v0 first.
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        float lanes[] = {v0, v1, v2, v3};
+        val = vle32_v_f32m1(lanes, nlanes);
+    }
+    // Reads the first element of val.
+    float get0() const { return vfmv_f_s_f32m1_f32(val); }
+};
+
+// Two uint64 lanes held in a single vuint64m1_t value.
+struct v_uint64x2
+{
+    typedef uint64 lane_type;
+    enum { nlanes = 2 };
+
+    vuint64m1_t val;
+    v_uint64x2() {} // val is left unset
+    explicit v_uint64x2(vuint64m1_t v) : val(v) {}
+    // Loads the two scalars in argument order, v0 first.
+    v_uint64x2(uint64 v0, uint64 v1)
+    {
+        uint64 lanes[] = {v0, v1};
+        val = vle64_v_u64m1(reinterpret_cast<unsigned long*>(lanes), nlanes);
+    }
+    // Reads the first element of val.
+    uint64 get0() const { return vmv_x_s_u64m1_u64(val); }
+};
+
+// Two int64 lanes held in a single vint64m1_t value.
+struct v_int64x2
+{
+    typedef int64 lane_type;
+    enum { nlanes = 2 };
+
+    vint64m1_t val;
+    v_int64x2() {} // val is left unset
+    explicit v_int64x2(vint64m1_t v) : val(v) {}
+    // Loads the two scalars in argument order, v0 first.
+    v_int64x2(int64 v0, int64 v1)
+    {
+        int64 lanes[] = {v0, v1};
+        val = vle64_v_i64m1(reinterpret_cast<long*>(lanes), nlanes);
+    }
+    // Reads the first element of val.
+    int64 get0() const { return vmv_x_s_i64m1_i64(val); }
+};
+
+// Two double lanes held in a single vfloat64m1_t value.
+struct v_float64x2
+{
+    typedef double lane_type;
+    enum { nlanes = 2 };
+
+    vfloat64m1_t val;
+    v_float64x2() {} // val is left unset
+    explicit v_float64x2(vfloat64m1_t v) : val(v) {}
+    // Loads the two scalars in argument order, v0 first.
+    v_float64x2(double v0, double v1)
+    {
+        double lanes[] = {v0, v1};
+        val = vle64_v_f64m1(lanes, nlanes);
+    }
+    // Reads the first element of val.
+    double get0() const { return vfmv_f_s_f64m1_f64(val); }
+};
+/*
+#define OPENCV_HAL_IMPL_RISCVV_INIT(_Tpv, _Tp, suffix) \
+inline _Tp##m1_t vreinterpret_v_##suffix##m1_##suffix##m1(_Tp##m1_t v) { return v; } \
+inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16((vuint8m1_t)(v.val)); } \
+inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16((vint8m1_t)(v.val)); } \
+inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8((vuint16m1_t)(v.val)); } \
+inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpret_v_i8m1_i16m1(v.val)); } \
+inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4((vuint32m1_t)(v.val)); } \
+inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4((vint32m1_t)(v.val)); } \
+inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2((vuint64m1_t)(v.val)); } \
+inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2((vint64m1_t)(v.val)); } \
+inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4((vfloat32m1_t)(v.val)); }\
+inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2((vfloat64m1_t)(v.val)); }
+
+
+OPENCV_HAL_IMPL_RISCVV_INIT(uint8x16, vuint8, u8)
+OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, i8)
+OPENCV_HAL_IMPL_RISCVV_INIT(uint16x8, vuint16, u16)
+OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, i16)
+OPENCV_HAL_IMPL_RISCVV_INIT(uint32x4, vuint32, u32)
+OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, i32)
+OPENCV_HAL_IMPL_RISCVV_INIT(uint64x2, vuint64, u64)
+OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, i64)
+OPENCV_HAL_IMPL_RISCVV_INIT(float64x2, vfloat64, f64)
+OPENCV_HAL_IMPL_RISCVV_INIT(float32x4, vfloat32, f32)
+*/
+inline v_uint8x16 v_reinterpret_as_u8(const v_uint8x16& v) { return v_uint8x16(v.val); } // this group: v_uint8x16 viewed as each vector type via vreinterpret chains (same-type case just copies val)
+inline v_int8x16 v_reinterpret_as_s8(const v_uint8x16& v) { return v_int8x16(vreinterpret_v_u8m1_i8m1(v.val)); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_uint8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(v.val)); }
+inline v_int16x8 v_reinterpret_as_s16(const v_uint8x16& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(vreinterpret_v_u8m1_u16m1(v.val))); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_uint8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(v.val)); }
+inline v_int32x4 v_reinterpret_as_s32(const v_uint8x16& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u8m1_u32m1(v.val))); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_uint8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(v.val)); }
+inline v_int64x2 v_reinterpret_as_s64(const v_uint8x16& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u8m1_u64m1(v.val))); }
+inline v_float32x4 v_reinterpret_as_f32(const v_uint8x16& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u8m1_u32m1(v.val))); }
+inline v_float64x2 v_reinterpret_as_f64(const v_uint8x16& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u8m1_u64m1(v.val))); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_int8x16& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(v.val)); } // this group: v_int8x16 viewed as each vector type via vreinterpret chains
+inline v_int8x16 v_reinterpret_as_s8(const v_int8x16& v) { return v_int8x16(v.val); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_int8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(vreinterpret_v_i8m1_u8m1(v.val))); }
+inline v_int16x8 v_reinterpret_as_s16(const v_int8x16& v) { return v_int16x8(vreinterpret_v_i8m1_i16m1(v.val)); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_int8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(vreinterpret_v_i8m1_u8m1(v.val))); }
+inline v_int32x4 v_reinterpret_as_s32(const v_int8x16& v) { return v_int32x4(vreinterpret_v_i8m1_i32m1(v.val)); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_int8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(vreinterpret_v_i8m1_u8m1(v.val))); }
+inline v_int64x2 v_reinterpret_as_s64(const v_int8x16& v) { return v_int64x2(vreinterpret_v_i8m1_i64m1(v.val)); }
+inline v_float32x4 v_reinterpret_as_f32(const v_int8x16& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i8m1_i32m1(v.val))); }
+inline v_float64x2 v_reinterpret_as_f64(const v_int8x16& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i8m1_i64m1(v.val))); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_uint16x8& v) { return v_uint8x16(vreinterpret_v_u16m1_u8m1(v.val)); } // this group: v_uint16x8 viewed as each vector type via vreinterpret chains
+inline v_int8x16 v_reinterpret_as_s8(const v_uint16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(vreinterpret_v_u16m1_i16m1(v.val))); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_uint16x8& v) { return v_uint16x8(v.val); }
+inline v_int16x8 v_reinterpret_as_s16(const v_uint16x8& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(v.val)); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_uint16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(v.val)); }
+inline v_int32x4 v_reinterpret_as_s32(const v_uint16x8& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u16m1_u32m1(v.val))); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_uint16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(v.val)); }
+inline v_int64x2 v_reinterpret_as_s64(const v_uint16x8& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u16m1_u64m1(v.val))); }
+inline v_float32x4 v_reinterpret_as_f32(const v_uint16x8& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u16m1_u32m1(v.val))); }
+inline v_float64x2 v_reinterpret_as_f64(const v_uint16x8& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u16m1_u64m1(v.val))); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_int16x8& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(v.val))); } // this group: v_int16x8 viewed as each vector type via vreinterpret chains
+inline v_int8x16 v_reinterpret_as_s8(const v_int16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(v.val)); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_int16x8& v) { return v_uint16x8(vreinterpret_v_i16m1_u16m1(v.val)); }
+inline v_int16x8 v_reinterpret_as_s16(const v_int16x8& v) { return v_int16x8(v.val); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_int16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(vreinterpret_v_i16m1_u16m1(v.val))); }
+inline v_int32x4 v_reinterpret_as_s32(const v_int16x8& v) { return v_int32x4(vreinterpret_v_i16m1_i32m1(v.val)); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_int16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(vreinterpret_v_i16m1_u16m1(v.val))); }
+inline v_int64x2 v_reinterpret_as_s64(const v_int16x8& v) { return v_int64x2(vreinterpret_v_i16m1_i64m1(v.val)); }
+inline v_float32x4 v_reinterpret_as_f32(const v_int16x8& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i16m1_i32m1(v.val))); }
+inline v_float64x2 v_reinterpret_as_f64(const v_int16x8& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i16m1_i64m1(v.val))); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_uint32x4& v) { return v_uint8x16(vreinterpret_v_u32m1_u8m1(v.val)); } // this group: v_uint32x4 viewed as each vector type via vreinterpret chains
+inline v_int8x16 v_reinterpret_as_s8(const v_uint32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_u32m1_i32m1(v.val))); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_uint32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(v.val)); }
+inline v_int16x8 v_reinterpret_as_s16(const v_uint32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_u32m1_i32m1(v.val))); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_uint32x4& v) { return v_uint32x4(v.val); }
+inline v_int32x4 v_reinterpret_as_s32(const v_uint32x4& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(v.val)); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_uint32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(v.val)); }
+inline v_int64x2 v_reinterpret_as_s64(const v_uint32x4& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u32m1_u64m1(v.val))); }
+inline v_float32x4 v_reinterpret_as_f32(const v_uint32x4& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(v.val)); }
+inline v_float64x2 v_reinterpret_as_f64(const v_uint32x4& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(v.val))); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_int32x4& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(v.val))); } // this group: v_int32x4 viewed as each vector type via vreinterpret chains
+inline v_int8x16 v_reinterpret_as_s8(const v_int32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(v.val)); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_int32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_i32m1_u32m1(v.val))); }
+inline v_int16x8 v_reinterpret_as_s16(const v_int32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(v.val)); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_int32x4& v) { return v_uint32x4(vreinterpret_v_i32m1_u32m1(v.val)); }
+inline v_int32x4 v_reinterpret_as_s32(const v_int32x4& v) { return v_int32x4(v.val); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_int32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_i32m1_u32m1(v.val))); }
+inline v_int64x2 v_reinterpret_as_s64(const v_int32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(v.val)); }
+inline v_float32x4 v_reinterpret_as_f32(const v_int32x4& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(v.val)); }
+inline v_float64x2 v_reinterpret_as_f64(const v_int32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(v.val))); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_uint64x2& v) { return v_uint8x16(vreinterpret_v_u64m1_u8m1(v.val)); } // this group: v_uint64x2 viewed as each vector type via vreinterpret chains
+inline v_int8x16 v_reinterpret_as_s8(const v_uint64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_u64m1_i64m1(v.val))); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_uint64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(v.val)); }
+inline v_int16x8 v_reinterpret_as_s16(const v_uint64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_u64m1_i64m1(v.val))); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_uint64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(v.val)); }
+inline v_int32x4 v_reinterpret_as_s32(const v_uint64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_u64m1_i64m1(v.val))); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_uint64x2& v) { return v_uint64x2(v.val); }
+inline v_int64x2 v_reinterpret_as_s64(const v_uint64x2& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(v.val)); }
+inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(v.val))); }
+inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(v.val)); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_int64x2& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(v.val))); } // this group: v_int64x2 viewed as each vector type via vreinterpret chains
+inline v_int8x16 v_reinterpret_as_s8(const v_int64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(v.val)); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_int64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_i64m1_u64m1(v.val))); }
+inline v_int16x8 v_reinterpret_as_s16(const v_int64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(v.val)); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_int64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_i64m1_u64m1(v.val))); }
+inline v_int32x4 v_reinterpret_as_s32(const v_int64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(v.val)); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_int64x2& v) { return v_uint64x2(vreinterpret_v_i64m1_u64m1(v.val)); }
+inline v_int64x2 v_reinterpret_as_s64(const v_int64x2& v) { return v_int64x2(v.val); }
+inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(v.val))); }
+inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(v.val)); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_float32x4& v) { return v_uint8x16(vreinterpret_v_u32m1_u8m1(vreinterpret_v_f32m1_u32m1(v.val))); } // this group: v_float32x4 viewed as each vector type; every chain first goes through a 32-bit integer type
+inline v_int8x16 v_reinterpret_as_s8(const v_float32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_f32m1_i32m1(v.val))); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_float32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_f32m1_u32m1(v.val))); }
+inline v_int16x8 v_reinterpret_as_s16(const v_float32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_f32m1_i32m1(v.val))); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_float32x4& v) { return v_uint32x4(vreinterpret_v_f32m1_u32m1(v.val)); }
+inline v_int32x4 v_reinterpret_as_s32(const v_float32x4& v) { return v_int32x4(vreinterpret_v_f32m1_i32m1(v.val)); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_float32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v.val))); }
+inline v_int64x2 v_reinterpret_as_s64(const v_float32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val))); }
+inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& v) { return v_float32x4(v.val); }
+inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val)))); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_float64x2& v) { return v_uint8x16(vreinterpret_v_u64m1_u8m1(vreinterpret_v_f64m1_u64m1(v.val))); } // this group: v_float64x2 viewed as each vector type; every chain first goes through a 64-bit integer type
+inline v_int8x16 v_reinterpret_as_s8(const v_float64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_f64m1_i64m1(v.val))); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_float64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_f64m1_u64m1(v.val))); }
+inline v_int16x8 v_reinterpret_as_s16(const v_float64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_f64m1_i64m1(v.val))); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_float64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v.val))); }
+inline v_int32x4 v_reinterpret_as_s32(const v_float64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val))); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_float64x2& v) { return v_uint64x2(vreinterpret_v_f64m1_u64m1(v.val)); }
+inline v_int64x2 v_reinterpret_as_s64(const v_float64x2& v) { return v_int64x2(vreinterpret_v_f64m1_i64m1(v.val)); }
+inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val)))); }
+inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& v) { return v_float64x2(v.val); }
+
+#define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) /* defines v_setzero_<suffix>() and v_setall_<suffix>(__Tp), both splatting a scalar over num lanes with vmv_v_x_<len>m1 */ \
+inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num(vmv_v_x_##len##m1(0, num)); }     \
+inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }
+
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16) // v_setzero_u8 / v_setall_u8
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16) // v_setzero_s8 / v_setall_s8; NOTE(review): scalar type is plain char while v_int8x16::lane_type is schar - confirm intended
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8) // v_setzero_u16 / v_setall_u16
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8) // v_setzero_s16 / v_setall_s16
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4) // v_setzero_u32 / v_setall_u32
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4) // v_setzero_s32 / v_setall_s32
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2) // v_setzero_u64 / v_setall_u64; NOTE(review): assumes unsigned long matches the 64-bit lane - confirm for the target ABI
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2) // v_setzero_s64 / v_setall_s64; same assumption for long
+inline v_float32x4 v_setzero_f32() { return v_float32x4(vfmv_v_f_f32m1(0.0f, v_float32x4::nlanes)); } // every lane = 0
+inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v, v_float32x4::nlanes)); } // every lane = v
+
+inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0.0, v_float64x2::nlanes)); } // every lane = 0
+inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v, v_float64x2::nlanes)); } // every lane = v
+
+
+#define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) /* defines operator bin_op and its compound-assignment form for intrinsics called without a lane-count argument */ \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+} \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a.val = intrin(a.val, b.val); /* result overwrites a */ \
+    return a; \
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) /* same operator pair, for intrinsics that take the lane count num as their last argument */ \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val, num)); \
+} \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a.val = intrin(a.val, b.val, num); /* result overwrites a */ \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16) // 8- and 16-bit + / - use the saturating vsadd*/vssub* intrinsics
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vadd_vv_i32m1, 4) // 32- and 64-bit integers use the plain vadd/vsub/vmul intrinsics
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vsub_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vadd_vv_i64m1, 2) // 64-bit lanes get + and - only in this list
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vsub_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4) // float32 +, -, * ; division is written out by hand below
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
+// Lane-wise quotient a / b over the four float lanes.
+inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
+{
+    return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, v_float32x4::nlanes));
+}
+inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
+{
+    return a = a / b; // same division, stored back into a
+}
+
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2) // float64 +, -, * ; division is written out by hand below
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)
+// Lane-wise quotient a / b over the two double lanes.
+inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b)
+{
+    return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, v_float64x2::nlanes));
+}
+inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b)
+{
+    return a = a / b; // same division, stored back into a
+}
+// TODO: exp, log, sin, cos
+
+#define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) /* defines func(a, b) as a wrapper around a two-operand intrinsic called without a lane count */ \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(_Tpvec, func, intrin, num) /* defines func(a, b) as a wrapper around a two-operand intrinsic that takes the lane count num last */ \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val, num)); \
+}
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16) // v_min / v_max pairs: unsigned types use vminu/vmaxu, signed use vmin/vmax, floats use vfmin/vfmax
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)
+
+// Lane-wise square root of the four float lanes of x,
+// computed with the vfsqrt intrinsic.
+inline v_float32x4 v_sqrt(const v_float32x4& x)
+{ return v_float32x4(vfsqrt_v_f32m1(x.val, v_float32x4::nlanes)); }
+
+// Lane-wise 1 / sqrt(x): the square root is taken first, then vfrdiv
+// divides the scalar 1 by each lane of that root.
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{ return v_float32x4(vfrdiv_vf_f32m1(vfsqrt_v_f32m1(x.val, v_float32x4::nlanes), 1.0f, v_float32x4::nlanes)); }
+
+// Lane-wise sqrt(a*a + b*b); b*b is accumulated onto a*a with vfmacc before v_sqrt is applied.
+inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
+{
+    return v_sqrt(v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4)));
+}
+
+// Lane-wise a*a + b*b (no square root): a*a comes from vfmul and
+// b*b is accumulated onto it with vfmacc.
+inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
+{ return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, v_float32x4::nlanes), b.val, b.val, v_float32x4::nlanes)); }
+
+// Lane-wise a * b + c on four floats, issued as a single
+// vfmadd multiply-add intrinsic.
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{ return v_float32x4(vfmadd_vv_f32m1(a.val, b.val, c.val, v_float32x4::nlanes)); }
+
+// Lane-wise a * b + c on four 32-bit integers, issued as a single
+// vmadd multiply-add intrinsic.
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{ return v_int32x4(vmadd_vv_i32m1(a.val, b.val, c.val, v_int32x4::nlanes)); }
+
+// Multiply-add on four floats; this overload simply forwards its
+// three arguments to v_fma.
+inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{ return v_fma(a, b, c); }
+
+// Multiply-add on four 32-bit integers; this overload simply forwards
+// its three arguments to v_fma.
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{ return v_fma(a, b, c); }
+
+// Returns v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3; each lane of v is broadcast with vrgather first.
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3)
+{
+    vfloat32m1_t acc = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);
+    acc = vfmacc_vv_f32m1(acc, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);
+    acc = vfmacc_vv_f32m1(acc, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);
+    acc = vfmacc_vv_f32m1(acc, vrgather_vx_f32m1(v.val, 3, 4), m3.val, 4);
+    return v_float32x4(acc);
+}
+
+// Returns v[0]*m0 + v[1]*m1 + v[2]*m2 + a; lane 3 of v is not read.
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& a)
+{
+    vfloat32m1_t acc = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);
+    acc = vfmacc_vv_f32m1(acc, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);
+    acc = vfmacc_vv_f32m1(acc, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);
+    acc = vfadd_vv_f32m1(acc, a.val, 4);
+    return v_float32x4(acc);
+}
+
+// Lane-wise square root of the two double lanes of x,
+// computed with the vfsqrt intrinsic.
+inline v_float64x2 v_sqrt(const v_float64x2& x)
+{ return v_float64x2(vfsqrt_v_f64m1(x.val, v_float64x2::nlanes)); }
+
+// Lane-wise 1 / sqrt(x): the square root is taken first, then vfrdiv
+// divides the scalar 1 by each lane of that root.
+inline v_float64x2 v_invsqrt(const v_float64x2& x)
+{ return v_float64x2(vfrdiv_vf_f64m1(vfsqrt_v_f64m1(x.val, v_float64x2::nlanes), 1.0, v_float64x2::nlanes)); }
+
+// Lane-wise sqrt(a*a + b*b); b*b is accumulated onto a*a with vfmacc before v_sqrt is applied.
+inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
+{
+    return v_sqrt(v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2)));
+}
+
+// Lane-wise a*a + b*b (no square root): a*a comes from vfmul and
+// b*b is accumulated onto it with vfmacc.
+inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
+{ return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, v_float64x2::nlanes), b.val, b.val, v_float64x2::nlanes)); }
+
+// Lane-wise a * b + c on two doubles, issued as a single
+// vfmadd multiply-add intrinsic.
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{ return v_float64x2(vfmadd_vv_f64m1(a.val, b.val, c.val, v_float64x2::nlanes)); }
+
+// Multiply-add on two doubles; this overload simply forwards its
+// three arguments to v_fma.
+inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{ return v_fma(a, b, c); }
+
+#define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) /* defines &, |, ^ (with their compound forms, via BIN_OPN) and unary ~ for an integer vector type */ \
+    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
+    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
+    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
+    inline _Tpvec operator ~ (const _Tpvec & a) \
+    { \
+        return _Tpvec(vnot_v_##suffix(a.val, num)); \
+    }
+
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint8x16, u8m1, 16) // bitwise operators for every integer vector type; float types are handled separately
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint16x8, u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint32x4, u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint64x2, u64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int8x16,  i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int16x8,  i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4,  i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2,  i64m1, 2)
+
+#define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) /* bitwise operator pair for v_float32x4: operands are reinterpreted as i32, combined with intrin, and reinterpreted back to f32 */ \
+inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
+{ \
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \
+} \
+inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
+{ \
+    a.val = vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4)); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1) // v_float32x4 & and &=
+OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1) // v_float32x4 | and |=
+OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1) // v_float32x4 ^ and ^=
+
+// Bitwise NOT of a float vector: the lanes are reinterpreted as 32-bit
+// integers, inverted with vnot, and reinterpreted as floats again.
+inline v_float32x4 operator ~ (const v_float32x4& a)
+{ return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), v_float32x4::nlanes))); }
+
+#define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) /* bitwise operator pair for v_float64x2: operands are reinterpreted as i64, combined with intrin, and reinterpreted back to f64 */ \
+inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
+{ \
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2))); \
+} \
+inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
+{ \
+    a.val = vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2)); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1) // v_float64x2 & and &=
+OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1) // v_float64x2 | and |=
+OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1) // v_float64x2 ^ and ^=
+
+// Bitwise NOT of a double vector: the lanes are reinterpreted as 64-bit
+// integers, inverted with vnot, and reinterpreted as doubles again.
+inline v_float64x2 operator ~ (const v_float64x2& a)
+{ return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), v_float64x2::nlanes))); }
+// Upper 16 bits of each signed 16x16-bit product, taken directly
+// from the vmulh intrinsic.
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{ return v_int16x8(vmulh_vv_i16m1(a.val, b.val, v_int16x8::nlanes)); }
+// Upper 16 bits of each unsigned 16x16-bit product, taken directly
+// from the vmulhu intrinsic.
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_uint16x8(vmulhu_vv_u16m1(a.val, b.val, v_uint16x8::nlanes)); }
+
+//#define OPENCV_HAL_IMPL_RISCVV_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
+//inline _Tpuvec v_abs(const _Tpsvec& a) {    \
+//    E##xm1_t mask=vmflt_vf_e32xm1_f32m1(x.val, 0.0, 4);
+
+//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint8x16, v_int8x16, u8, s8)
+//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint16x8, v_int16x8, u16, s16)
+//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint32x4, v_int32x4, u32, s32)
+
+inline v_uint32x4 v_abs(v_int32x4 x)
+{   // Replace only the lanes flagged negative with (0 - x); other lanes keep x. Result is viewed as unsigned.
+    vbool32_t is_negative = vmslt_vx_i32m1_b32(x.val, 0, 4);
+    return v_uint32x4(vreinterpret_v_i32m1_u32m1(vrsub_vx_i32m1_m(is_negative, x.val, x.val, 0, 4)));
+}
+
+inline v_uint16x8 v_abs(v_int16x8 x)
+{   // Replace only the lanes flagged negative with (0 - x); other lanes keep x. Result is viewed as unsigned.
+    vbool16_t is_negative = vmslt_vx_i16m1_b16(x.val, 0, 8);
+    return v_uint16x8(vreinterpret_v_i16m1_u16m1(vrsub_vx_i16m1_m(is_negative, x.val, x.val, 0, 8)));
+}
+
+inline v_uint8x16 v_abs(v_int8x16 x)
+{   // Replace only the lanes flagged negative with (0 - x); other lanes keep x. Result is viewed as unsigned.
+    vbool8_t is_negative = vmslt_vx_i8m1_b8(x.val, 0, 16);
+    return v_uint8x16(vreinterpret_v_i8m1_u8m1(vrsub_vx_i8m1_m(is_negative, x.val, x.val, 0, 16)));
+}
+
+inline v_float32x4 v_abs(v_float32x4 x)
+{   // Sign-inject-XOR of x with itself: the two sign bits cancel, leaving the magnitude.
+    return v_float32x4(vfsgnjx_vv_f32m1(x.val, x.val, 4));
+}
+
+inline v_float64x2 v_abs(v_float64x2 x)
+{   // Sign-inject-XOR of x with itself: the two sign bits cancel, leaving the magnitude.
+    return v_float64x2(vfsgnjx_vv_f64m1(x.val, x.val, 2));
+}
+
+inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
+{   // |a - b|: subtract, then drop the sign by sign-inject-XOR of the difference with itself.
+    vfloat32m1_t diff = vfsub_vv_f32m1(a.val, b.val, 4);
+    return v_float32x4(vfsgnjx_vv_f32m1(diff, diff, 4));
+}
+
+inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
+{   // |a - b|: subtract, then drop the sign by sign-inject-XOR of the difference with itself.
+    vfloat64m1_t diff = vfsub_vv_f64m1(a.val, b.val, 2);
+    return v_float64x2(vfsgnjx_vv_f64m1(diff, diff, 2));
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(bit, num) \
+inline v_uint##bit##x##num v_absdiff(v_uint##bit##x##num a, v_uint##bit##x##num b){    \
+    vuint##bit##m1_t vmax = vmaxu_vv_u##bit##m1(a.val, b.val, num);    \
+    vuint##bit##m1_t vmin = vminu_vv_u##bit##m1(a.val, b.val, num);    \
+    return v_uint##bit##x##num(vsub_vv_u##bit##m1(vmax, vmin, num));\
+}
+
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(8, 16)
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(16, 8)
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(32, 4)
+
+/** Saturating absolute difference **/
+inline v_int8x16 v_absdiffs(v_int8x16 a, v_int8x16 b){
+    vint8m1_t lo = vmin_vv_i8m1(a.val, b.val, 16);
+    vint8m1_t hi = vmax_vv_i8m1(a.val, b.val, 16);
+    return v_int8x16(vssub_vv_i8m1(hi, lo, 16));   // saturating (larger - smaller) per lane
+}
+inline v_int16x8 v_absdiffs(v_int16x8 a, v_int16x8 b){
+    vint16m1_t lo = vmin_vv_i16m1(a.val, b.val, 8);
+    vint16m1_t hi = vmax_vv_i16m1(a.val, b.val, 8);
+    return v_int16x8(vssub_vv_i16m1(hi, lo, 8));   // saturating (larger - smaller) per lane
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF(_Tpvec, _Tpv, num) \
+inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){ /* signed inputs, unsigned result */ \
+     vint##_Tpv##_t lo = vmin_vv_i##_Tpv(a.val, b.val, num);\
+     vint##_Tpv##_t hi = vmax_vv_i##_Tpv(a.val, b.val, num);\
+    return v_uint##_Tpvec(vreinterpret_v_i##_Tpv##_u##_Tpv(vsub_vv_i##_Tpv(hi, lo, num))); /* (larger - smaller), bits viewed as unsigned */ \
+}
+
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF(16x8, 16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF(32x4, 32m1, 4)
+
+//  Multiply and expand
+inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
+                         v_int16x8& c, v_int16x8& d)
+{
+    // Widening multiply of 16 i8 lanes into an i16 LMUL=2 group; c receives part 0, d part 1.
+    vint16m2_t wide = vwmul_vv_i16m2(a.val, b.val, 16);
+    c.val = vget_v_i16m2_i16m1(wide, 0);
+    d.val = vget_v_i16m2_i16m1(wide, 1);
+}
+
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
+                         v_uint16x8& c, v_uint16x8& d)
+{
+    // Widening multiply of 16 u8 lanes into a u16 LMUL=2 group; c receives part 0, d part 1.
+    vuint16m2_t wide = vwmulu_vv_u16m2(a.val, b.val, 16);
+    c.val = vget_v_u16m2_u16m1(wide, 0);
+    d.val = vget_v_u16m2_u16m1(wide, 1);
+}
+
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
+                         v_int32x4& c, v_int32x4& d)
+{
+    // Widening multiply of 8 i16 lanes into an i32 LMUL=2 group; c receives part 0, d part 1.
+    vint32m2_t wide = vwmul_vv_i32m2(a.val, b.val, 8);
+    c.val = vget_v_i32m2_i32m1(wide, 0);
+    d.val = vget_v_i32m2_i32m1(wide, 1);
+}
+
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
+                         v_uint32x4& c, v_uint32x4& d)
+{
+    // Widening multiply of 8 u16 lanes into a u32 LMUL=2 group; c receives part 0, d part 1.
+    vuint32m2_t wide = vwmulu_vv_u32m2(a.val, b.val, 8);
+    c.val = vget_v_u32m2_u32m1(wide, 0);
+    d.val = vget_v_u32m2_u32m1(wide, 1);
+}
+
+inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b,
+                         v_int64x2& c, v_int64x2& d)
+{
+    // Widening multiply of 4 i32 lanes into an i64 LMUL=2 group; c receives part 0, d part 1.
+    vint64m2_t wide = vwmul_vv_i64m2(a.val, b.val, 4);
+    c.val = vget_v_i64m2_i64m1(wide, 0);
+    d.val = vget_v_i64m2_i64m1(wide, 1);
+}
+
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d)
+{
+    // Widening multiply of 4 u32 lanes into a u64 LMUL=2 group; c receives part 0, d part 1.
+    vuint64m2_t wide = vwmulu_vv_u64m2(a.val, b.val, 4);
+    c.val = vget_v_u64m2_u64m1(wide, 0);
+    d.val = vget_v_u64m2_u64m1(wide, 1);
+}
+
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
+//////// Dot Product ////////
+// 16 >> 32
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{
+    // Gather indices: part 0 = {0,2,4,6} (even products), part 1 = {1,3,5,7} (odd products).
+    vuint32m1_t even_idx = vsll_vx_u32m1(vid_v_u32m1(4), 1, 4);
+    vuint32m2_t gather_idx = vundefined_u32m2();
+    gather_idx = vset_v_u32m1_u32m2(gather_idx, 0, even_idx);
+    gather_idx = vset_v_u32m1_u32m2(gather_idx, 1, vadd_vx_u32m1(even_idx, 1, 4));
+    // Widen-multiply all 8 lanes, regroup by the indices above, then add part 0 to part 1.
+    vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    prod = vrgather_vv_i32m2(prod, gather_idx, 8);
+    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(prod, 0), vget_v_i32m2_i32m1(prod, 1), 4));
+}
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    // Gather indices: part 0 = {0,2,4,6} (even products), part 1 = {1,3,5,7} (odd products).
+    vuint32m1_t even_idx = vsll_vx_u32m1(vid_v_u32m1(4), 1, 4);
+    vuint32m2_t gather_idx = vundefined_u32m2();
+    gather_idx = vset_v_u32m1_u32m2(gather_idx, 0, even_idx);
+    gather_idx = vset_v_u32m1_u32m2(gather_idx, 1, vadd_vx_u32m1(even_idx, 1, 4));
+    // Widen-multiply all 8 lanes, regroup, add part 0 to part 1, then add the accumulator c.
+    vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    prod = vrgather_vv_i32m2(prod, gather_idx, 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(prod, 0),vget_v_i32m2_i32m1(prod, 1), 4), c.val, 4));
+}
+
+// 32 >> 64
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
+{
+    // Gather indices: part 0 = {0,2} (even products), part 1 = {1,3} (odd products).
+    vuint64m1_t even_idx = vsll_vx_u64m1(vid_v_u64m1(2), 1, 2);
+    vuint64m2_t gather_idx = vundefined_u64m2();
+    gather_idx = vset_v_u64m1_u64m2(gather_idx, 0, even_idx);
+    gather_idx = vset_v_u64m1_u64m2(gather_idx, 1, vadd_vx_u64m1(even_idx, 1, 2));
+    // Widen-multiply all 4 lanes, regroup by the indices above, then add part 0 to part 1.
+    vint64m2_t prod = vwmul_vv_i64m2(a.val, b.val, 4);
+    prod = vrgather_vv_i64m2(prod, gather_idx, 4);
+    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(prod, 0), vget_v_i64m2_i64m1(prod, 1), 2));
+}
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    // Gather indices: part 0 = {0,2} (even products), part 1 = {1,3} (odd products).
+    vuint64m1_t even_idx = vsll_vx_u64m1(vid_v_u64m1(2), 1, 2);
+    vuint64m2_t gather_idx = vundefined_u64m2();
+    gather_idx = vset_v_u64m1_u64m2(gather_idx, 0, even_idx);
+    gather_idx = vset_v_u64m1_u64m2(gather_idx, 1, vadd_vx_u64m1(even_idx, 1, 2));
+    // Widen-multiply all 4 lanes, regroup, add part 0 to part 1, then add the accumulator c.
+    vint64m2_t prod = vwmul_vv_i64m2(a.val, b.val, 4);
+    prod = vrgather_vv_i64m2(prod, gather_idx, 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(prod, 0), vget_v_i64m2_i64m1(prod, 1), 2), c.val, 2));
+}
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{
+    vuint32m4_t vindex32 = vundefined_u32m4();
+    vuint32m1_t vindex0 = vid_v_u32m1(4);
+    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
+    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
+    vuint16m2_t v1 = vundefined_u16m2();
+    vuint32m2_t v2 = vundefined_u32m2();
+    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
+    v1 = vrgather_vv_u16m2(v1, vindex, 16);
+    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4));
+}
+
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
+                                   const v_uint32x4& c)
+{
+    vuint32m4_t vindex32 = vundefined_u32m4();
+    vuint32m1_t vindex0 = vid_v_u32m1(4);
+    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
+    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
+    vuint16m2_t v1 = vundefined_u16m2();
+    vuint32m2_t v2 = vundefined_u32m2();
+    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
+    v1 = vrgather_vv_u16m2(v1, vindex, 16);
+    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4));
+}
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
+{
+    vuint32m4_t vindex32 = vundefined_u32m4();
+    vuint32m1_t vindex0 = vid_v_u32m1(4);
+    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
+    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
+    vint16m2_t v1 = vundefined_i16m2();
+    vint32m2_t v2 = vundefined_i32m2();
+    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
+    v1 = vrgather_vv_i16m2(v1, vindex, 16);
+    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4));
+}
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
+                                   const v_int32x4& c)
+{
+    vuint32m4_t vindex32 = vundefined_u32m4();
+    vuint32m1_t vindex0 = vid_v_u32m1(4);
+    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
+    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
+    vint16m2_t v1 = vundefined_i16m2();
+    vint32m2_t v2 = vundefined_i32m2();
+    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
+    v1 = vrgather_vv_i16m2(v1, vindex, 16);
+    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4));
+}
+
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{
+    vuint64m4_t vindex64 = vundefined_u64m4();
+    vuint64m1_t vindex0 = vid_v_u64m1(2);
+    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
+    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
+    vuint32m2_t v1 = vundefined_u32m2();
+    vuint64m2_t v2 = vundefined_u64m2();
+    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
+    v1 = vrgather_vv_u32m2(v1, vindex, 8);
+    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2));
+}
+
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b,
+                                   const v_uint64x2& c)
+{
+    vuint64m4_t vindex64 = vundefined_u64m4();
+    vuint64m1_t vindex0 = vid_v_u64m1(2);
+    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
+    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
+    vuint32m2_t v1 = vundefined_u32m2();
+    vuint64m2_t v2 = vundefined_u64m2();
+    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
+    v1 = vrgather_vv_u32m2(v1, vindex, 8);
+    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2));
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
+{
+    vuint64m4_t vindex64 = vundefined_u64m4();
+    vuint64m1_t vindex0 = vid_v_u64m1(2);
+    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
+    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
+    vint32m2_t v1 = vundefined_i32m2();
+    vint64m2_t v2 = vundefined_i64m2();
+    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
+    v1 = vrgather_vv_i32m2(v1, vindex, 8);
+    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2));
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
+                                   const v_int64x2& c)
+{
+    vuint64m4_t vindex64 = vundefined_u64m4();
+    vuint64m1_t vindex0 = vid_v_u64m1(2);
+    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
+    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
+    vint32m2_t v1 = vundefined_i32m2();
+    vint64m2_t v2 = vundefined_i64m2();
+    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
+    v1 = vrgather_vv_i32m2(v1, vindex, 8);
+    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2));
+}
+
+//////// Fast Dot Product ////////
+// 16 >> 32
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    // No regrouping step: the two parts of the widened product are added as they come.
+    vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(prod, 0), vget_v_i32m2_i32m1(prod, 1), 4));
+}
+
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    // No regrouping step: add the two parts of the widened product, then the accumulator c.
+    vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(prod, 0), vget_v_i32m2_i32m1(prod, 1), 4), c.val, 4));
+}
+
+// 32 >> 64
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{
+    // No regrouping step: the two parts of the widened product are added as they come.
+    vint64m2_t prod = vwmul_vv_i64m2(a.val, b.val, 4);
+    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(prod, 0), vget_v_i64m2_i64m1(prod, 1), 2));
+}
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    // Widen-multiply the 4 i32 lanes (vl = 4), add the two 2-lane i64 parts (vl = 2), then add c.
+    vint64m2_t prod = vwmul_vv_i64m2(a.val, b.val, 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(prod, 0), vget_v_i64m2_i64m1(prod, 1), 2), c.val, 2));
+}
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{
+    vuint16m2_t v1 = vundefined_u16m2();
+    vuint32m2_t v2 = vundefined_u32m2();
+    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
+    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4));
+}
+
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{
+    vuint16m2_t v1 = vundefined_u16m2();
+    vuint32m2_t v2 = vundefined_u32m2();
+    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
+    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4));
+}
+
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{
+    vint16m2_t v1 = vundefined_i16m2();
+    vint32m2_t v2 = vundefined_i32m2();
+    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
+    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4));
+}
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{
+    vint16m2_t v1 = vundefined_i16m2();
+    vint32m2_t v2 = vundefined_i32m2();
+    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
+    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4));
+}
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{
+    vuint32m2_t v1 = vundefined_u32m2();
+    vuint64m2_t v2 = vundefined_u64m2();
+    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
+    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2));
+}
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{
+    vuint32m2_t v1 = vundefined_u32m2();
+    vuint64m2_t v2 = vundefined_u64m2();
+    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
+    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2));
+}
+
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    vint32m2_t v1 = vundefined_i32m2();
+    vint64m2_t v2 = vundefined_i64m2();
+    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
+    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2));
+}
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{
+    vint32m2_t v1 = vundefined_i32m2();
+    vint64m2_t v2 = vundefined_i64m2();
+    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
+    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2));
+}
+
+
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(_Tpvec, _Tpvec2, len, scalartype, func, intrin, num) \
+inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) /* defines v_reduce_<func> over num lanes */ \
+{\
+    v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); /* zero vector: passed as both destination and seed */ \
+    val = intrin(val, a.val, val, num);    /* reduce a.val with the given (widening) intrinsic */ \
+    return vmv_x_s_##len##m1_##len(val);    /* element 0 of the result, converted to scalartype */ \
+}
+
+
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num, scalerfunc) \
+inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) /* defines v_reduce_<func> over num lanes */ \
+{\
+    v##_Tpvec##m1_t val = vundefined_##_Tpvec2##m1(); /* destination operand only; contents unspecified */ \
+    val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num);    /* a.val is both the vector operand and the seed operand */ \
+    return scalerfunc(val);    /* scalerfunc extracts the scalar result from val */ \
+}
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int32, int64, i64, int, sum, vwredsum_vs_i32m1_i64m1, 4)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint8, uint16, u16, unsigned, sum, vwredsumu_vs_u8m1_u16m1, 16)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint16, uint32, u32, unsigned, sum, vwredsumu_vs_u16m1_u32m1, 8)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu_vs_u32m1_u64m1, 4)
+inline float v_reduce_sum(const v_float32x4& a)
+{
+    vfloat32m1_t val = vfmv_v_f_f32m1(0.0, 4); // zero vector: destination and seed of the ordered sum
+    val = vfredosum_vs_f32m1_f32m1(val, a.val, val, 4);
+    return vfmv_f_s_f32m1_f32(val); // element 0 holds the total
+}
+inline double v_reduce_sum(const v_float64x2& a)
+{
+    vfloat64m1_t val = vfmv_v_f_f64m1(0.0, 2); // zero vector: destination and seed of the ordered sum
+    val = vfredosum_vs_f64m1_f64m1(val, a.val, val, 2);
+    return vfmv_f_s_f64m1_f64(val); // element 0 holds the total
+}
+inline uint64 v_reduce_sum(const v_uint64x2& a)
+{ vuint64m1_t res = vundefined_u64m1(); return vmv_x_s_u64m1_u64(vredsum_vs_u64m1_u64m1(res, a.val, vmv_v_x_u64m1(0, 2), 2)); } // sum of the 2 lanes, seeded with a zero vector
+
+inline int64 v_reduce_sum(const v_int64x2& a)
+{ vint64m1_t res = vundefined_i64m1(); return vmv_x_s_i64m1_i64(vredsum_vs_i64m1_i64m1(res, a.val, vmv_v_x_i64m1(0, 2), 2)); } // sum of the 2 lanes, seeded with a zero vector
+
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8,  i8, int, func, red##func, 16, vmv_x_s_i8m1_i8)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8, vmv_x_s_i16m1_i16)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4, vmv_x_s_i32m1_i32)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2, vmv_x_s_i64m1_i64)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8,  u8, unsigned, func, red##func##u, 16, vmv_x_s_u8m1_u8)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8, vmv_x_s_u16m1_u16)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4, vmv_x_s_u32m1_u32)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4, vfmv_f_s_f32m1_f32)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min)
+
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+    // Ordered sum of each input; a shared zero vector serves as destination and seed.
+    vfloat32m1_t zero = vfmv_v_f_f32m1(0.0, 4);
+    vfloat32m1_t sum_a = vfredosum_vs_f32m1_f32m1(zero, a.val, zero, 4);
+    vfloat32m1_t sum_b = vfredosum_vs_f32m1_f32m1(zero, b.val, zero, 4);
+    vfloat32m1_t sum_c = vfredosum_vs_f32m1_f32m1(zero, c.val, zero, 4);
+    vfloat32m1_t sum_d = vfredosum_vs_f32m1_f32m1(zero, d.val, zero, 4);
+    // Pack the four totals (element 0 of each vector) into one result: each
+    // slide-up keeps the lanes below its offset and places the next total at
+    // that offset, ending with {sum(a), sum(b), sum(c), sum(d)}.
+    vfloat32m1_t packed = vslideup_vx_f32m1(sum_a, sum_b, 1, 4);
+    packed = vslideup_vx_f32m1(packed, sum_c, 2, 4);
+    packed = vslideup_vx_f32m1(packed, sum_d, 3, 4);
+    return v_float32x4(packed);
+}
+
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{
+    vfloat32m1_t diff = vfsub_vv_f32m1(a.val, b.val, 4);
+    // Replace only the lanes that compare below zero with (0 - diff), then sum all lanes in order.
+    vbool32_t is_negative = vmflt_vf_f32m1_b32(diff, 0, 4);
+    vfloat32m1_t abs_diff = vfrsub_vf_f32m1_m(is_negative, diff, diff, 0, 4);
+    vfloat32m1_t zero = vfmv_v_f_f32m1(0.0, 4);
+    return vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, abs_diff, zero, 4));
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \
+inline unsigned v_reduce_sad(const _Tpvec& a, const _Tpvec&b){    \
+    _Tpvec2 x = v_absdiff(a, b);    \
+    return v_reduce_sum(x);    \
+}
+
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int8x16, v_uint8x16)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint8x16, v_uint8x16)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int16x8, v_uint16x8)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint16x8, v_uint16x8)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int32x4, v_uint32x4)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint32x4, v_uint32x4)
+
+#define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num);    \
+    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
+} \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num);    \
+    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
+} \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num);    \
+    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
+} \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num);    \
+    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
+} \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num);    \
+    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
+} \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num);    \
+    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
+} \
+
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int8x16, i8m1,  8, 16, _vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int16x8, i16m1, 16, 8, _vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int32x4, i32m1, 32, 4, _vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int64x2, i64m1, 64, 2, _vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint8x16, u8m1, 8, 16, u_vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint16x8, u16m1, 16, 8, u_vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_)
+
+// f32 comparisons: a result lane has all bits set where the predicate holds and all bits clear otherwise.
+inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b)
+{
+    vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
+    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0, 4), -1, 4);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
+}
+inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b)
+{
+    vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4);
+    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0, 4), -1, 4);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
+}
+inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b)
+{
+    vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4);
+    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0, 4), -1, 4);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
+}
+inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b)
+{
+    vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4);
+    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0, 4), -1, 4);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
+}
+inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
+{
+    vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
+    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0, 4), -1, 4);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
+}
+inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
+{
+    vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4);
+    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0, 4), -1, 4);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
+}
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{
+    vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, a.val, 4); // a lane equals itself only when it is not NaN
+    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0, 4), -1, 4);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
+}
+
+// f64 comparisons: a result lane has all bits set where the predicate holds and all bits clear otherwise.
+inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b)
+{
+    vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
+    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0, 2), -1, 2);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
+}
+inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b)
+{
+    vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
+    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0, 2), -1, 2);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
+}
+inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b)
+{
+    vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2);
+    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0, 2), -1, 2);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
+}
+inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b)
+{
+    vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2);
+    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0, 2), -1, 2);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
+}
+inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
+{
+    vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
+    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0, 2), -1, 2);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
+}
+inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
+{
+    vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2);
+    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0, 2), -1, 2);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
+}
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{
+    vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, a.val, 2); // a lane equals itself only when it is not NaN
+    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0, 2), -1, 2);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
+}
+#define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \
+inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \
+                         const v_##_Tp##32x4& a2, const v_##_Tp##32x4& a3, \
+                         v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \
+                         v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \
+{ \
+    vuint32m4_t vindex = vundefined_u32m4(); \
+    vuint32m1_t vindex0 = vid_v_u32m1(4); \
+    vindex0 = vsll_vx_u32m1(vindex0, 2, 4); /* {0, 4, 8, 12} */ \
+    vindex = vset_v_u32m1_u32m4(vindex, 0, vindex0); \
+    vindex = vset_v_u32m1_u32m4(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); /* {1, 5, 9, 13} */ \
+    vindex = vset_v_u32m1_u32m4(vindex, 2, vadd_vx_u32m1(vindex0, 2, 4)); /* {2, 6, 10, 14} */ \
+    vindex = vset_v_u32m1_u32m4(vindex, 3, vadd_vx_u32m1(vindex0, 3, 4)); /* {3, 7, 11, 15} */ \
+    v##_Tp##32m4_t val = vundefined_##_T##m4();    /* the four input rows packed into one LMUL=4 group */ \
+    val = vset_v_##_T##m1_##_T##m4(val, 0, a0.val);    \
+    val = vset_v_##_T##m1_##_T##m4(val, 1, a1.val);    \
+    val = vset_v_##_T##m1_##_T##m4(val, 2, a2.val);    \
+    val = vset_v_##_T##m1_##_T##m4(val, 3, a3.val);   \
+    val = vrgather_vv_##_T##m4(val, vindex, 16);    /* permute all 16 elements by the stride-4 indices above */ \
+    b0.val = vget_v_##_T##m4_##_T##m1(val, 0);   \
+    b1.val = vget_v_##_T##m4_##_T##m1(val, 1);   \
+    b2.val = vget_v_##_T##m4_##_T##m1(val, 2);   \
+    b3.val = vget_v_##_T##m4_##_T##m1(val, 3);   \
+}
+OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32)
+OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32)
+OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)
+
+
+#define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) /* operator<< (run-time count) and v_shl<n> (compile-time count); suffix is not used in the expansion */ \
+inline _Tpvec operator << (const _Tpvec& a, int n) \
+{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }
+
+#define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) /* operator>>, v_shr<n> and rounding v_rshr<n>; intric selects the shift intrinsic (srl or sra) */ \
+inline _Tpvec operator >> (const _Tpvec& a, int n) \
+{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
+template<int n> inline _Tpvec v_rshr(const _Tpvec& a) /* adds 2^(n-1) before shifting; the constant is formed in 64 bits so n > 32 is defined for 64-bit lanes */ \
+{ return _Tpvec((v##intric##_vx_##_T##m1(vadd_vx_##_T##m1(a.val, 1ULL<<(n-1), num), n, num))); }
+
+// trade efficiency for convenience: one macro expands both the left- and right-shift families for a lane type
+#define OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(suffix, _T, num, intrin) \
+OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(v_##suffix##x##num, suffix, _T, num) \
+OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(v_##suffix##x##num, suffix, _T, num, intrin)
+
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint8, u8, 16, srl) // unsigned types use srl
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint16, u16, 8, srl)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint32, u32, 4, srl)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint64, u64, 2, srl)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int8, i8, 16, sra) // signed types use sra
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int16, i16, 8, sra)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int32, i32, 4, sra)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int64, i64, 2, sra)
+
+#if 0 // compiled out: ascending index-list helpers; the parameter n is ignored by every definition
+#define VUP4(n) {0, 1, 2, 3}
+#define VUP8(n) {0, 1, 2, 3, 4, 5, 6, 7}
+#define VUP16(n) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+#define VUP2(n) {0, 1}
+#endif
+#define OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(_Tpvec, suffix, _T, num, num2, vmv, len) /* v_rotate_left / v_rotate_right by n lanes: one-vector forms slide within a, two-vector forms slide across the a/b pair */ \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{    \
+    suffix##m1_t tmp = vmv##_##_T##m1(0, num); /* destination starts as all zeros */ \
+        tmp = vslideup_vx_##_T##m1_m(vmset_m_##len(num), tmp, a.val, n, num); /* all-set mask: a is slid up by n over the zeroed destination */ \
+        return _Tpvec(tmp);\
+} \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
+{     \
+        suffix##m1_t res = vundefined_##_T##m1(); \
+        return _Tpvec(vslidedown_vx_##_T##m1(res, a.val, n, num));\
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
+{ return a; } \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    suffix##m2_t tmp = vundefined_##_T##m2();    \
+    suffix##m2_t res = vundefined_##_T##m2();    \
+    tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a.val); /* a is the low half of the pair */ \
+    tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, b.val); /* b is the high half of the pair */ \
+        res = vslidedown_vx_##_T##m2(res, tmp, n, num2);\
+        return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 0)); /* low half of the slid pair */ \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    suffix##m2_t tmp = vundefined_##_T##m2();    \
+    suffix##m2_t res = vundefined_##_T##m2();    \
+    tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, b.val); /* b is the low half of the pair */ \
+    tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a.val); /* a is the high half of the pair */ \
+        res = vslideup_vx_##_T##m2(res, tmp, n, num2);\
+        return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 1)); /* high half of the slid pair */ \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    CV_UNUSED(b); return a; \
+}
+
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint8x16, vuint8, u8, 16, 32, vmv_v_x, b8) // args: vector type, register type stem, intrinsic suffix, lanes, lanes of the pair, splat intrinsic, mask suffix
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int8x16, vint8, i8, 16, 32, vmv_v_x, b8)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint16x8, vuint16, u16, 8, 16, vmv_v_x, b16)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int16x8, vint16, i16, 8, 16, vmv_v_x, b16)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint32x4, vuint32, u32, 4, 8, vmv_v_x, b32)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int32x4, vint32, i32, 4, 8, vmv_v_x, b32)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint64x2, vuint64, u64, 2, 4, vmv_v_x, b64)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32) // float types splat with vfmv_v_f
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64)
+
+#if 1
+#define vreinterpret_v_i8m1_i8m1 // expands to nothing, so the 8-bit instantiations' same-type reinterpret becomes a plain parenthesised expression
+#define vreinterpret_v_u8m1_u8m1 // same, for the unsigned 8-bit instantiation
+#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize, ldst_len, ldst_type) /* integer load/store family; half and aligned accesses go through byte-wide vle8/vse8 on ldst_type */ \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+  _Tp2##_t res = vundefined_##len(); \
+  _Tp2##_t res1 = vundefined_##len(); \
+  res = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr0, 8)); /* 8 bytes from ptr0 */ \
+  res1 = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr1, 8)); /* 8 bytes from ptr1 */ \
+  res = vslideup_vx_##len(res, res1, hnum, num); /* ptr1 data is slid up by hnum lanes into res */ \
+  return _Tpvec(res); } \
+inline _Tpvec v_load_low(const _Tp* ptr) /* reads 8 bytes */ \
+{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 8))); }\
+inline _Tpvec v_load_aligned(const _Tp* ptr) /* reads 16 bytes */ \
+{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \
+inline _Tpvec v_load(const _Tp* ptr) /* element-width load of num lanes */ \
+{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) /* writes 8 bytes */ \
+{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 8);}\
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+  _Tp2##_t a0 = vundefined_##len(); \
+  a0 = vslidedown_vx_##len(a0, a.val, hnum, num); /* slide a down by hnum lanes before the 8-byte store */ \
+  vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a0), 8);}\
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) /* same body as v_store_aligned */ \
+{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) /* the mode argument is ignored */ \
+{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); }
+
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8, u8m1, uchar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16,  schar, vint8m1, i8m1, 8, 16, 8, i8m1, schar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16, u8m1, uchar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8,  short,  vint16m1, i16m1, 4, 8, 16, i8m1, schar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32, u8m1, uchar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4,  int,     vint32m1, i32m1, 2, 4, 32, i8m1, schar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64, u8m1, uchar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2,  long,     vint64m1, i64m1, 1, 2, 64, i8m1, schar)
+
+#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) /* float load/store family; byte-wide accesses are reinterpreted u8 -> u<elemsize> -> float and back */ \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+  _Tp2##_t res = vundefined_##len(); \
+  _Tp2##_t res1 = vundefined_##len(); \
+  res = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr0, 8))); \
+  res1 = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr1, 8))); \
+  res = vslideup_vx_##len(res, res1, hnum, num); /* ptr1 data is slid up by hnum lanes into res */ \
+  return _Tpvec(res); } \
+inline _Tpvec v_load_low(const _Tp* ptr) /* reads 8 bytes */ \
+{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 8)))); }\
+inline _Tpvec v_load_aligned(const _Tp* ptr) /* reads 16 bytes */ \
+{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \
+inline _Tpvec v_load(const _Tp* ptr) /* element-width load of num lanes */ \
+{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) /* writes 8 bytes */ \
+{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 8);}\
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+  _Tp2##_t a0 = vundefined_##len(); \
+  a0 = vslidedown_vx_##len(a0, a.val, hnum, num); /* slide a down by hnum lanes before the 8-byte store */ \
+  vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a0)), 8);}\
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) /* same body as v_store_aligned */ \
+{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) /* the mode argument is ignored */ \
+{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); }
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64)
+
+#else
+
+#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) /* alternative using element-width loads/stores throughout; sits in the #else of an "#if 1", so it is not compiled */ \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+  _Tp2##_t res, res1; \
+  res = vle##elemsize##_v_##len(ptr0, hnum); \
+  res1 = vle##elemsize##_v_##len(ptr1, hnum); \
+  res = vslideup_vx_##len(res, res1, hnum, num); \
+  return _Tpvec(res); } \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ return _Tpvec(vle##elemsize##_v_##len(ptr, hnum)); }\
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec((_Tp2##_t)vle##elemsize##_v_##len((const _Tp *)ptr, num)); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ vse##elemsize##_v_##len(ptr, a.val, hnum);}\
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+  _Tp2##_t a0; /* NOTE(review): passed to vslidedown below without being initialised */ \
+  a0 = vslidedown_vx_##len(a0, a.val, hnum, num);    \
+  vse##elemsize##_v_##len(ptr, a0, hnum);}\
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ vse##elemsize##_v_##len(ptr, a.val, num); }
+
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16,  schar, vint8m1, i8m1, 8, 16, 8)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8,  short,  vint16m1, i16m1, 4, 8, 16)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4,  int,     vint32m1, i32m1, 2, 4, 32)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2,  long,     vint64m1, i64m1, 1, 2, 64)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64)
+
+#endif
+
+////////////// Lookup table access ////////////////////
+
+inline v_int8x16 v_lut(const schar* tab, const int* idx) // gathers tab[idx[0]] .. tab[idx[15]] into the 16 lanes
+{
+#if 0 // disabled scalar reference implementation
+    schar CV_DECL_ALIGNED(32) elems[16] =
+    {
+        tab[idx[ 0]],
+        tab[idx[ 1]],
+        tab[idx[ 2]],
+        tab[idx[ 3]],
+        tab[idx[ 4]],
+        tab[idx[ 5]],
+        tab[idx[ 6]],
+        tab[idx[ 7]],
+        tab[idx[ 8]],
+        tab[idx[ 9]],
+        tab[idx[10]],
+        tab[idx[11]],
+        tab[idx[12]],
+        tab[idx[13]],
+        tab[idx[14]],
+        tab[idx[15]]
+    };
+    return v_int8x16(vle8_v_i8m1(elems, 16));
+#else
+#if __riscv_v == 7000 // toolchains reporting __riscv_v == 7000: indexed byte load into 32-bit lanes, then two narrowing steps
+    return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, vle32_v_u32m4((unsigned int *)idx, 16), 16), 0, 16), 0, 16));
+#else // otherwise: indexed load using the 16 indices directly as 32-bit offsets (1-byte elements)
+    return v_int8x16(vloxei32_v_i8m1(tab, vle32_v_u32m4((unsigned int *)idx, 16), 16));
+#endif
+#endif
+}
+
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){ // lanes 2k, 2k+1 = tab[idx[k]], tab[idx[k]+1] for k = 0..7
+#if 0 // disabled scalar reference implementation
+    schar CV_DECL_ALIGNED(32) elems[16] =
+    {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[3]],
+        tab[idx[3] + 1],
+        tab[idx[4]],
+        tab[idx[4] + 1],
+        tab[idx[5]],
+        tab[idx[5] + 1],
+        tab[idx[6]],
+        tab[idx[6] + 1],
+        tab[idx[7]],
+        tab[idx[7] + 1]
+    };
+    return v_int8x16(vle8_v_i8m1(elems, 16));
+#else
+    vuint32m4_t seq, index;
+    vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 8); // the 8 pair base indices
+    seq = vid_v_u32m4(16);                                   // 0..15
+    index = vsrl_vx_u32m4(seq, 1, 16);                       // 0,0,1,1,...,7,7
+    vidx = vrgather_vv_u32m4(vidx, index, 16);               // each base index duplicated
+    index = vadd_vv_u32m4(vand_vx_u32m4(seq, 1, 16), vidx, 16); // base + (0 or 1)
+#if __riscv_v == 7000
+    return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16));
+#else
+    return v_int8x16(vloxei32_v_i8m1(tab, index, 16));
+#endif
+#endif
+}
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) // lanes 4k..4k+3 = tab[idx[k]] .. tab[idx[k]+3] for k = 0..3
+{
+#if 0 // disabled scalar reference implementation
+    schar CV_DECL_ALIGNED(32) elems[16] =
+    {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[0] + 2],
+        tab[idx[0] + 3],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[1] + 2],
+        tab[idx[1] + 3],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[2] + 2],
+        tab[idx[2] + 3],
+        tab[idx[3]],
+        tab[idx[3] + 1],
+        tab[idx[3] + 2],
+        tab[idx[3] + 3]
+    };
+    return v_int8x16(vle8_v_i8m1(elems, 16));
+#else
+    vuint32m4_t seq, index;
+    vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 4); // the 4 quad base indices
+    seq = vid_v_u32m4(16);                                   // 0..15
+    index = vsrl_vx_u32m4(seq, 2, 16);                       // 0,0,0,0,1,1,1,1,...
+    vidx = vrgather_vv_u32m4(vidx, index, 16);               // each base index repeated four times
+    seq = vset_v_u32m1_u32m4(seq, 1, vget_v_u32m4_u32m1(seq, 0)); // copy the first m1 part of seq over parts 1..3 (the in-quad offsets)
+    seq = vset_v_u32m1_u32m4(seq, 2, vget_v_u32m4_u32m1(seq, 0));
+    seq = vset_v_u32m1_u32m4(seq, 3, vget_v_u32m4_u32m1(seq, 0));
+    index = vadd_vv_u32m4(seq, vidx, 16);
+#if __riscv_v == 7000
+    return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16));
+#else
+    return v_int8x16(vloxei32_v_i8m1(tab, index, 16));
+#endif
+#endif
+}
+
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut(reinterpret_cast<const schar*>(tab), idx)); } // unsigned forwarders to the schar versions; const is kept on tab
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs(reinterpret_cast<const schar*>(tab), idx)); }
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads(reinterpret_cast<const schar*>(tab), idx)); }
+
+inline v_int16x8 v_lut(const short* tab, const int* idx) // gathers tab[idx[0]] .. tab[idx[7]] into the 8 lanes
+{
+#if 0 // disabled scalar reference implementation
+    short CV_DECL_ALIGNED(32) elems[8] =
+    {
+        tab[idx[0]],
+        tab[idx[1]],
+        tab[idx[2]],
+        tab[idx[3]],
+        tab[idx[4]],
+        tab[idx[5]],
+        tab[idx[6]],
+        tab[idx[7]]
+    };
+    return v_int16x8(vle16_v_i16m1(elems, 8));
+#else
+#if __riscv_v == 7000 // indices are shifted left by 1 (x2, the size of a short) in both branches
+    return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8), 0, 8));
+#else
+    return v_int16x8(vloxei32_v_i16m1(tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8));
+#endif
+#endif
+}
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) // lanes 2k, 2k+1 = tab[idx[k]], tab[idx[k]+1] for k = 0..3
+{
+#if 0 // disabled scalar reference implementation
+    short CV_DECL_ALIGNED(32) elems[8] =
+    {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[3]],
+        tab[idx[3] + 1]
+    };
+    return v_int16x8(vle16_v_i16m1(elems, 8));
+#else
+    vuint32m2_t seq, index;
+    vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 4); // the 4 pair base indices
+    seq = vid_v_u32m2(8);                                    // 0..7
+    index = vsrl_vx_u32m2(seq, 1, 8);                        // 0,0,1,1,2,2,3,3
+    vidx = vrgather_vv_u32m2(vidx, index, 8);                // each base index duplicated
+    index = vsll_vx_u32m2(vadd_vv_u32m2(vand_vx_u32m2(seq, 1, 8), vidx, 8), 1, 8); // (base + 0/1) shifted left by 1
+#if __riscv_v == 7000
+    return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8));
+#else
+    return v_int16x8(vloxei32_v_i16m1(tab, index, 8));
+#endif
+#endif
+}
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx) // lanes 4k..4k+3 = tab[idx[k]] .. tab[idx[k]+3] for k = 0..1
+{
+#if 0 // disabled scalar reference implementation
+    short CV_DECL_ALIGNED(32) elems[8] =
+    {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[0] + 2],
+        tab[idx[0] + 3],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[1] + 2],
+        tab[idx[1] + 3]
+    };
+    return v_int16x8(vle16_v_i16m1(elems, 8));
+#else
+    vuint32m2_t seq, index;
+    vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 2); // the 2 quad base indices
+    seq = vid_v_u32m2(8);                                    // 0..7
+    index = vsrl_vx_u32m2(seq, 2, 8);                        // 0,0,0,0,1,1,1,1
+    vidx = vrgather_vv_u32m2(vidx, index, 8);                // each base index repeated four times
+    seq = vset_v_u32m1_u32m2(seq, 1, vget_v_u32m2_u32m1(seq, 0)); // copy the first m1 part of seq over the second (the in-quad offsets)
+    index = vsll_vx_u32m2(vadd_vv_u32m2(seq, vidx, 8), 1, 8); // (base + offset) shifted left by 1
+#if __riscv_v == 7000
+    return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8));
+#else
+    return v_int16x8(vloxei32_v_i16m1(tab, index, 8));
+#endif
+#endif
+}
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut(reinterpret_cast<const short*>(tab), idx)); } // unsigned forwarders to the short versions; const is kept on tab
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs(reinterpret_cast<const short*>(tab), idx)); }
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads(reinterpret_cast<const short*>(tab), idx)); }
+
+inline v_int32x4 v_lut(const int* tab, const int* idx) // gathers tab[idx[0]] .. tab[idx[3]] into the 4 lanes
+{
+#if 0 // disabled scalar reference implementation
+    int CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idx[0]],
+        tab[idx[1]],
+        tab[idx[2]],
+        tab[idx[3]]
+    };
+    return v_int32x4(vle32_v_i32m1(elems, 4));
+#else // indices are shifted left by 2 (x4, the size of an int) before the indexed load
+    return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4));
+#endif
+}
+inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) // lanes = tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1]
+{
+#if 0 // disabled scalar reference implementation
+    int CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1]
+    };
+    return v_int32x4(vle32_v_i32m1(elems, 4));
+#else
+    vuint32m1_t seq, index;
+    vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2); // the 2 pair base indices
+    seq = vid_v_u32m1(4);                                    // 0,1,2,3
+    index = vsrl_vx_u32m1(seq, 1, 4);                        // 0,0,1,1
+    vidx = vrgather_vv_u32m1(vidx, index, 4);                // each base index duplicated
+    index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4); // (base + 0/1) shifted left by 2
+    return v_int32x4(vloxei32_v_i32m1(tab, index, 4));
+#endif
+}
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx) // only idx[0] is read
+{
+    return v_int32x4(vle32_v_i32m1(tab+idx[0], 4)); // contiguous load of 4 ints starting at tab[idx[0]]
+}
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut(reinterpret_cast<const int*>(tab), idx)); } // unsigned forwarders to the int versions; const is kept on tab
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs(reinterpret_cast<const int*>(tab), idx)); }
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads(reinterpret_cast<const int*>(tab), idx)); }
+
+inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
+{
+    // Gathers {tab[idx[0]], tab[idx[1]]}: the two int indices are sign-extended to 64 bits, then shifted left by 3 (x8) for the indexed load.
+    return v_int64x2(vloxei64_v_i64m1(tab, vsll_vx_u64m1(vreinterpret_v_i64m1_u64m1(vget_v_i64m2_i64m1(vwadd_vx_i64m2(vle32_v_i32m1(idx, 2), 0, 2), 0)), 3, 2), 2));
+}
+inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) // only idx[0] is read
+{
+    return v_int64x2(vle64_v_i64m1(tab+idx[0], 2)); // contiguous load of 2 elements starting at tab[idx[0]]
+}
+
+inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
+{
+    // Gathers {tab[idx[0]], tab[idx[1]]}: the two int indices are sign-extended to 64 bits, then shifted left by 3 (x8) for the indexed load.
+    return v_uint64x2(vloxei64_v_u64m1(tab, vsll_vx_u64m1(vreinterpret_v_i64m1_u64m1(vget_v_i64m2_i64m1(vwadd_vx_i64m2(vle32_v_i32m1(idx, 2), 0, 2), 0)), 3, 2), 2));
+}
+inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) // only idx[0] is read
+{
+    return v_uint64x2(vle64_v_u64m1(tab+idx[0], 2)); // contiguous load of 2 elements starting at tab[idx[0]]
+}
+
+inline v_float32x4 v_lut(const float* tab, const int* idx) // gathers tab[idx[0]] .. tab[idx[3]] into the 4 lanes
+{
+#if 0 // disabled scalar reference implementation
+    float CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idx[0]],
+        tab[idx[1]],
+        tab[idx[2]],
+        tab[idx[3]]
+    };
+    return v_float32x4(vle32_v_f32m1(elems, 4));
+#else // indices are shifted left by 2 (x4, the size of a float) before the indexed load
+    return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4));
+#endif
+}
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) // lanes = tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1]
+{
+#if 0 // disabled scalar reference implementation
+    float CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idx[0]],
+        tab[idx[0]+1],
+        tab[idx[1]],
+        tab[idx[1]+1]
+    };
+    return v_float32x4(vle32_v_f32m1(elems, 4));
+#else
+    vuint32m1_t seq, index;
+    vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2); // the 2 pair base indices
+    seq = vid_v_u32m1(4);                                    // 0,1,2,3
+    index = vsrl_vx_u32m1(seq, 1, 4);                        // 0,0,1,1
+    vidx = vrgather_vv_u32m1(vidx, index, 4);                // each base index duplicated
+    index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4); // (base + 0/1) shifted left by 2
+    return v_float32x4(vloxei32_v_f32m1(tab, index, 4));
+#endif
+}
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx) // only idx[0] is read
+{
+    return v_float32x4(vle32_v_f32m1(tab + idx[0], 4)); // contiguous load of 4 floats starting at tab[idx[0]]
+}
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{
+    // Gathers {tab[idx[0]], tab[idx[1]]}: the two int indices are sign-extended to 64 bits, then shifted left by 3 (x8) for the indexed load.
+    return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vreinterpret_v_i64m1_u64m1(vget_v_i64m2_i64m1(vwadd_vx_i64m2(vle32_v_i32m1(idx, 2), 0, 2), 0)), 3, 2), 2));
+}
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) // only idx[0] is read
+{
+    return v_float64x2(vle64_v_f64m1(tab+idx[0], 2)); // contiguous load of 2 doubles starting at tab[idx[0]]
+}
+
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) // gather with the four indices supplied in a vector register
+{
+    /*int CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idxvec.val[0]],
+        tab[idxvec.val[1]],
+        tab[idxvec.val[2]],
+        tab[idxvec.val[3]]
+    };*/
+    return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4)); // indices reinterpreted as unsigned and shifted left by 2
+}
+
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) // gather with the four indices supplied in a vector register
+{
+    /*unsigned CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idxvec.val[0]],
+        tab[idxvec.val[1]],
+        tab[idxvec.val[2]],
+        tab[idxvec.val[3]]
+    };*/
+    return v_uint32x4(vloxei32_v_u32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4)); // indices reinterpreted as unsigned and shifted left by 2
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) // gather with the four indices supplied in a vector register
+{
+    /*float CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idxvec.val[0]],
+        tab[idxvec.val[1]],
+        tab[idxvec.val[2]],
+        tab[idxvec.val[3]]
+    };*/
+    return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4)); // indices reinterpreted as unsigned and shifted left by 2
+}
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    // Gathers two doubles: the first two index lanes are widened to 64 bits with a signed widening add of 0, then shifted left by 3.
+    return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vreinterpret_v_i64m1_u64m1(vget_v_i64m2_i64m1(vwadd_vx_i64m2(idxvec.val, 0, 2), 0)), 3, 2), 2));
+}
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) // two-field segment gather: first field to x, second to y
+{
+    vint32m1_t index = vmul_vx_i32m1(idxvec.val, 4, 4); // indices multiplied by 4, the size of a float
+    //vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4);
+
+    //x.val = vlxe_v_f32m1(tab, index_x, 4);
+    //y.val = vlxe_v_f32m1(tab, index_y, 4);
+    vloxseg2ei32_v_f32m1(&x.val, &y.val, tab, vreinterpret_v_i32m1_u32m1(index), 4);
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    int CV_DECL_ALIGNED(32) lane[4]; // scalar copy of the index vector; only lanes 0 and 1 are used
+    v_store_aligned(lane, idxvec);
+    const double *first = tab + lane[0], *second = tab + lane[1];
+    x = v_float64x2(first[0], second[0]); // the element at each index
+    y = v_float64x2(first[1], second[1]); // the element right after each index
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type, elemsize) /* narrowing packs from _Tp2 lanes to _Tp lanes: shr is used with shift 0, intrin with shift n */ \
+inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
+{ \
+    v##_Tp2##m2_t  tmp = vundefined_##_T2##m2();    \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); /* a is the low half of the wide pair */ \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); /* b is the high half of the wide pair */ \
+    return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \
+}\
+template<int n> inline \
+v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
+{ \
+    v##_Tp2##m2_t  tmp = vundefined_##_T2##m2();    \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val);    \
+    return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \
+}\
+inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
+{ \
+    v##_Tp2##m2_t tmp = vundefined_##_T2##m2();    \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); /* high half is zero-filled */ \
+    asm("" ::: "memory"); /* empty asm with a memory clobber; NOTE(review): presumably a compiler workaround - confirm */ \
+    vse##elemsize##_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); /* only num2 narrowed lanes are written */ \
+}\
+template<int n> inline \
+void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
+{ \
+    v##_Tp2##m2_t tmp = vundefined_##_T2##m2();    \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2));    \
+    vse##elemsize##_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \
+}
+OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_wx, vnclip_wx, signed char, 8)
+OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_wx, vnclip_wx, signed short, 16)
+OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_wx, vnsra_wx, int, 32) // 64->32 v_pack / v_pack_store use the plain narrowing shift instead of the clip
+OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_wx, vnclipu_wx, unsigned char, 8)
+OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_wx, vnclipu_wx, unsigned short, 16)
+OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_wx, vnsrl_wx, unsigned int, 32) // 64->32 v_pack / v_pack_store use the plain narrowing shift instead of the clip
+
+// pack boolean
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) // narrows two 16-bit vectors to one 8-bit vector, keeping the low byte of each lane
+{
+    vuint16m2_t tmp = vundefined_u16m2();
+    tmp = vset_v_u16m1_u16m2(tmp, 0, a.val);    // a fills result lanes 0..7
+    tmp = vset_v_u16m1_u16m2(tmp, 1, b.val);    // b fills result lanes 8..15
+    return v_uint8x16(vnsrl_wx_u8m1(tmp, 0, 16)); // narrowing shift by 0
+}
+
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, // narrows four 32-bit vectors to one 8-bit vector
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    vuint32m4_t vabcd = vundefined_u32m4();
+    vuint16m2_t v16 = vundefined_u16m2();
+    vabcd = vset_v_u32m1_u32m4(vabcd, 0, a.val);
+    vabcd = vset_v_u32m1_u32m4(vabcd, 1, b.val);
+    vabcd = vset_v_u32m1_u32m4(vabcd, 2, c.val);
+    vabcd = vset_v_u32m1_u32m4(vabcd, 3, d.val);
+    v16 = vnsrl_wx_u16m2(vabcd, 0, 16);            // 32 -> 16 bits, narrowing shift by 0
+    return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16));  // 16 -> 8 bits, narrowing shift by 0
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, // narrows eight 64-bit vectors to one 8-bit vector
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    vuint64m8_t v64 = vundefined_u64m8();
+    vuint32m4_t v32 = vundefined_u32m4();
+    vuint16m2_t v16 = vundefined_u16m2();
+    v64 = vset_v_u64m1_u64m8(v64, 0, a.val);
+    v64 = vset_v_u64m1_u64m8(v64, 1, b.val);
+    v64 = vset_v_u64m1_u64m8(v64, 2, c.val);
+    v64 = vset_v_u64m1_u64m8(v64, 3, d.val);
+    v64 = vset_v_u64m1_u64m8(v64, 4, e.val);
+    v64 = vset_v_u64m1_u64m8(v64, 5, f.val);
+    v64 = vset_v_u64m1_u64m8(v64, 6, g.val);
+    v64 = vset_v_u64m1_u64m8(v64, 7, h.val);
+    v32 = vnsrl_wx_u32m4(v64, 0, 16);              // 64 -> 32 bits, narrowing shift by 0
+    v16 = vnsrl_wx_u16m2(v32, 0, 16);              // 32 -> 16 bits
+    return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16));  // 16 -> 8 bits
+}
+
+//inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
+//{
+//    int16xm2_u tmp;
+//    tmp.m1[0] = (vint16m1_t)a.val;
+//    tmp.m1[1] = (vint16m1_t)b.val;
+//    e8xm1_t mask = (e8xm1_t)vmsge_vx_e16xm2_i16m2(tmp.v, 0, 16);
+//    return v_uint8x16(vnclipuvi_mask_u8m1_u16m2(vmv_v_x_u8m1(0, 16), (vuint16m2_t)tmp.v, 0, mask, 16));
+//}
+
+#define OPENCV_HAL_IMPL_RISCVV_PACK_U(tp1, num1, tp2, num2, _Tp) /* signed tp2-bit lanes -> unsigned tp1-bit lanes: max with 0, then unsigned narrowing clip */ \
+inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
+{ \
+    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val);    \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val);    \
+    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1); /* negative lanes become 0 */ \
+    return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1));    \
+} \
+inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
+{ \
+    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); /* only the low half is set; the high half stays undefined */ \
+    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
+    return vse##tp1##_v_u##tp1##m1(ptr, vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1), num2); /* only num2 lanes are written */ \
+} \
+template<int n> inline \
+v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
+{ \
+    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val);    \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val);    \
+    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
+    return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), n, num1)); /* same as v_pack_u but with shift n */ \
+} \
+template<int n> inline \
+void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
+{ \
+    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val);    \
+    vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1);\
+    vuint##tp1##m1_t val = vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val_), n, num1);    \
+    return vse##tp1##_v_u##tp1##m1(ptr, val, num2);\
+}
+OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char )
+OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)
+
+
+// saturating multiply 8-bit, 16-bit: widening multiply, then narrowing clip back to the lane width
+#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, num, mul, cvt)   /* num is the lane count of _Tpvec */ \
+    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
+    {                                                           \
+        auto res = mul(a.val, b.val, num);                      \
+        return _Tpvec(cvt(res, 0, num));                        \
+    }                                                           \
+    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)     \
+    { a = a * b; return a; }
+
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16,  16, vwmul_vv_i16m2, vnclip_wx_i8m1)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, 16, vwmulu_vv_u16m2, vnclipu_wx_u8m1)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8,  8, vwmul_vv_i32m2, vnclip_wx_i16m1)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, 8, vwmulu_vv_u32m2, vnclipu_wx_u16m1)
+
+
+static const signed char popCountTable[256] = // popCountTable[b] is the number of set bits in the byte value b
+{
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
+};
+
+inline vuint8m1_t vcnt_u8(vuint8m1_t val){ // per-byte population count of 16 bytes via popCountTable
+#if __riscv_v == 7000
+    vuint8m1_t v0 = vand_vx_u8m1(val, 1, 16); // bit 0 is counted separately; NOTE(review): presumably to keep the table index below 128 on this toolchain - confirm
+    return vadd_vv_u8m1(vloxei8_v_u8m1((unsigned char*)popCountTable, vsrl_vx_u8m1(val, 1, 16), 16), v0, 16); // table lookup on the upper 7 bits, plus bit 0
+#else
+    return vloxei8_v_u8m1((unsigned char*)popCountTable, val, 16); // each byte value indexes the table directly
+#endif
+}
+
+inline v_uint8x16 // population count of each of the 16 unsigned byte lanes
+v_popcount(const v_uint8x16& a)
+{
+    return v_uint8x16(vcnt_u8(a.val));
+}
+
+inline v_uint8x16 // population count of each of the 16 signed byte lanes, returned as unsigned counts
+v_popcount(const v_int8x16& a)
+{
+    return v_uint8x16(vcnt_u8(vreinterpret_v_i8m1_u8m1(a.val))); // reinterpret only: the bit patterns are unchanged
+}
+
+inline v_uint16x8 // population count of each of the 8 unsigned 16-bit lanes
+v_popcount(const v_uint16x8& a)
+{
+    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u16m1_u8m1(a.val)); // bit count of every byte
+    vuint8m1_t seq = vid_v_u8m1(8); // 0, 1, ..., 7
+    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); // 0, 2, ..., 14: the even byte positions
+    return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0)); // widening add of the even-byte and odd-byte counts
+}
+
+inline v_uint16x8 // population count of each of the 8 signed 16-bit lanes, returned as unsigned counts
+v_popcount(const v_int16x8& a)
+{
+    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(a.val))); // bit count of every byte
+    vuint8m1_t seq = vid_v_u8m1(8); // 0, 1, ..., 7
+    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); // 0, 2, ..., 14: the even byte positions
+    return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0)); // widening add of the even-byte and odd-byte counts
+}
+
+inline v_uint32x4 // population count of each of the 4 unsigned 32-bit lanes
+v_popcount(const v_uint32x4& a)
+{
+    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u32m1_u8m1(a.val)); // bit count of every byte
+    vuint8m1_t seq = vid_v_u8m1(8); // 0, 1, ..., 7
+    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); // 0, 2, ..., 14: the even positions
+    vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8); // first pass: 8 sums of adjacent byte counts
+    return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0)); // second pass: 4 sums of adjacent pairs, widened to 16 and then 32 bits
+}
+
+inline v_uint32x4 // population count of each of the 4 signed 32-bit lanes, returned as unsigned counts
+v_popcount(const v_int32x4& a)
+{
+    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(a.val))); // bit count of every byte
+    vuint8m1_t seq = vid_v_u8m1(8); // 0, 1, ..., 7
+    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); // 0, 2, ..., 14: the even positions
+    vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8); // first pass: 8 sums of adjacent byte counts
+    return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0)); // second pass: 4 sums of adjacent pairs, widened to 16 and then 32 bits
+}
+
+inline v_uint64x2 // population count of each of the 2 unsigned 64-bit lanes
+v_popcount(const v_uint64x2& a)
+{
+    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u64m1_u8m1(a.val)); // bit count of every byte
+    vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16); // widen the 16 byte counts to 16 bits
+    vuint16m1_t res1 = vundefined_u16m1();
+    vuint16m1_t res2 = vundefined_u16m1();
+    res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8); // sum of the 8 counts in register 0 of the widened pair
+    res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8); // sum of the 8 counts in register 1 of the widened pair
+    return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2)); // NOTE(review): mapping register 0/1 to lane 0/1 presumably relies on 128-bit vector registers - confirm
+}
+
+inline v_uint64x2 // population count of each of the 2 signed 64-bit lanes, returned as unsigned counts
+v_popcount(const v_int64x2& a)
+{
+    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(a.val))); // bit count of every byte
+    vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16); // widen the 16 byte counts to 16 bits
+    vuint16m1_t res1 = vundefined_u16m1(), res2 = vundefined_u16m1();
+    res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8); // sum of the 8 counts in register 0 of the widened pair
+    res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8); // sum of the 8 counts in register 1 of the widened pair
+    return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2)); // NOTE(review): mapping register 0/1 to lane 0/1 presumably relies on 128-bit vector registers - confirm
+}
+
+#define SMASK 1, 2, 4, 8, 16, 32, 64, 128 // per-bit weights; NOTE(review): the v_signmask overloads that follow do not reference it - confirm it is still needed
+inline int v_signmask(const v_uint8x16& a) // packs the top bit of each of the 16 lanes into bits 0..15 of the result
+{
+    vuint16m1_t res = vundefined_u16m1();
+    vuint8m1_t id = vid_v_u8m1(16); // lane indices 0..15
+    vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16); // 1 << lane index, held as 16-bit values
+    vuint8m1_t t0  = vsrl_vx_u8m1(a.val, 7, 16); // top bit moved down to bit 0
+    vbool8_t mask = vmseq_vx_u8m1_b8(t0, 1, 16); // lanes whose top bit is set
+    res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16); // add up the weights of the selected lanes
+    return vmv_x_s_u16m1_u16(res);
+}
+inline int v_signmask(const v_int8x16& a) // packs the sign of each of the 16 lanes into bits 0..15 of the result
+{
+    vuint16m1_t res = vundefined_u16m1();
+    vuint8m1_t id = vid_v_u8m1(16); // lane indices 0..15
+    vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16); // 1 << lane index, held as 16-bit values
+    vbool8_t mask = vmslt_vx_i8m1_b8(a.val, 0, 16); // negative lanes
+    res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16); // add up the weights of the selected lanes
+    return vmv_x_s_u16m1_u16(res);
+}
+
+inline int v_signmask(const v_int16x8& a) // packs the sign of each of the 8 lanes into bits 0..7 of the result
+{
+    vuint16m1_t res = vundefined_u16m1();
+    vuint16m1_t id = vid_v_u16m1(8); // lane indices 0..7
+    vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8); // 1 << lane index
+    vbool16_t mask = vmslt_vx_i16m1_b16(a.val, 0, 8); // negative lanes
+    res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 8); // vl is 8: num and mask were only produced for 8 lanes
+    return vmv_x_s_u16m1_u16(res);
+}
+inline int v_signmask(const v_uint16x8& a) // packs the top bit of each of the 8 lanes into bits 0..7 of the result
+{
+    vuint16m1_t res = vundefined_u16m1();
+    vuint16m1_t id = vid_v_u16m1(8); // lane indices 0..7
+    vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8); // 1 << lane index
+    vuint16m1_t t0  = vsrl_vx_u16m1(a.val, 15, 8); // top bit moved down to bit 0
+    vbool16_t mask = vmseq_vx_u16m1_b16(t0, 1, 8); // lanes whose top bit is set
+    res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 8); // add up the weights of the selected lanes
+    return vmv_x_s_u16m1_u16(res);
+}
+inline int v_signmask(const v_int32x4& a) // packs the sign of each of the 4 lanes into bits 0..3 of the result
+{
+    vuint32m1_t res = vundefined_u32m1();
+    vuint32m1_t id = vid_v_u32m1(4); // lane indices 0..3
+    vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4); // 1 << lane index
+    vbool32_t mask = vmslt_vx_i32m1_b32(a.val, 0, 4); // negative lanes
+    res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4); // add up the weights of the selected lanes
+    return vmv_x_s_u32m1_u32(res);
+}
+inline int v_signmask(const v_uint32x4& a) // packs the top bit of each of the 4 lanes into bits 0..3 of the result
+{
+    vuint32m1_t res = vundefined_u32m1();
+    vuint32m1_t id = vid_v_u32m1(4); // lane indices 0..3
+    vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4); // 1 << lane index
+    vuint32m1_t t0  = vsrl_vx_u32m1(a.val, 31, 4); // top bit moved down to bit 0
+    vbool32_t mask = vmseq_vx_u32m1_b32(t0, 1, 4); // lanes whose top bit is set
+    res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4); // add up the weights of the selected lanes
+    return vmv_x_s_u32m1_u32(res);
+}
+inline int v_signmask(const v_uint64x2& a) // packs the top bit of each of the 2 lanes into bits 0..1 of the result
+{
+    vuint64m1_t res = vundefined_u64m1();
+    vuint64m1_t id = vid_v_u64m1(2); // lane indices 0..1
+    vuint64m1_t num = vsll_vv_u64m1(vmv_v_x_u64m1(1, 2), id, 2); // 1 << lane index
+    vuint64m1_t t0  = vsrl_vx_u64m1(a.val, 63, 2); // top bit moved down to bit 0
+    vbool64_t mask = vmseq_vx_u64m1_b64(t0, 1, 2); // lanes whose top bit is set
+    res = vredsum_vs_u64m1_u64m1_m(mask, res, num, vmv_v_x_u64m1(0, 2), 2); // add up the weights of the selected lanes
+    return vmv_x_s_u64m1_u64(res); // 64-bit scalar narrowed to the int return type
+}
+inline int v_signmask(const v_int64x2& a) // sign mask of 2 signed 64-bit lanes, computed on the same bits viewed as u64
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+inline int v_signmask(const v_float64x2& a) // sign mask of 2 doubles, computed on the same bits viewed as u64
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+inline int v_signmask(const v_float32x4& a) // sign mask of 4 floats, computed on the same bits viewed as u32
+{
+    return v_signmask(v_reinterpret_as_u32(a));
+    /* Disabled alternative, kept for reference: uses a float "less than 0" compare instead of the sign-bit test.
+    vuint32m1_t res;
+    vuint32m1_t id = vid_v_u32m1(4);
+    vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
+    vbool32_t mask = vmflt_vf_f32m1_b32(a.val, 0, 4);
+    res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
+    return vmv_x_s_u32m1_u32(res);*/
+}
+
+inline int v_scan_forward(const v_int8x16& a) {
+    const int bits = v_signmask(a); // index of the first lane with its sign bit set; 0 when no lane has it
+    if (bits != 0) return trailingZeros32(bits);
+    return 0; }
+inline int v_scan_forward(const v_uint8x16& a) {
+    const int bits = v_signmask(a); // index of the first lane with its top bit set; 0 when no lane has it
+    if (bits != 0) return trailingZeros32(bits);
+    return 0; }
+inline int v_scan_forward(const v_int16x8& a) {
+    const int bits = v_signmask(a); // index of the first lane with its sign bit set; 0 when no lane has it
+    if (bits != 0) return trailingZeros32(bits);
+    return 0; }
+inline int v_scan_forward(const v_uint16x8& a) {
+    const int bits = v_signmask(a); // index of the first lane with its top bit set; 0 when no lane has it
+    if (bits != 0) return trailingZeros32(bits);
+    return 0; }
+inline int v_scan_forward(const v_int32x4& a) {
+    const int bits = v_signmask(a); // index of the first lane with its sign bit set; 0 when no lane has it
+    if (bits != 0) return trailingZeros32(bits);
+    return 0; }
+inline int v_scan_forward(const v_uint32x4& a) {
+    const int bits = v_signmask(a); // index of the first lane with its top bit set; 0 when no lane has it
+    if (bits != 0) return trailingZeros32(bits);
+    return 0; }
+inline int v_scan_forward(const v_float32x4& a) {
+    const int bits = v_signmask(a); // index of the first lane with its sign bit set; 0 when no lane has it
+    if (bits != 0) return trailingZeros32(bits);
+    return 0; }
+inline int v_scan_forward(const v_int64x2& a) {
+    const int bits = v_signmask(a); // index of the first lane with its sign bit set; 0 when no lane has it
+    if (bits != 0) return trailingZeros32(bits);
+    return 0; }
+inline int v_scan_forward(const v_uint64x2& a) {
+    const int bits = v_signmask(a); // index of the first lane with its top bit set; 0 when no lane has it
+    if (bits != 0) return trailingZeros32(bits);
+    return 0; }
+
+#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num, mask_b) \
+inline bool v_check_all(const v_##_Tpvec& a) /* true when every lane has its top bit set */ \
+{ \
+    suffix##m1_t v0 = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); /* top bit of ~a, moved down to bit 0 */ \
+    return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) == 0; /* no lane has a clear top bit */ \
+} \
+inline bool v_check_any(const v_##_Tpvec& a) /* true when at least one lane has its top bit set */ \
+{ \
+    suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); /* top bit of a, moved down to bit 0 */ \
+    return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) != 0; \
+}
+// Arguments: vector type, m1 type prefix, intrinsic suffix, shift (element bits - 1), lane count, mask type suffix.
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8,  u8m1, 7, 16, b8)
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8, b16)
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4, b32)
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2, b64)
+
+inline bool v_check_all(const v_int8x16& a) // forwards to the u8 overload on the same bits
+{ return v_check_all(v_reinterpret_as_u8(a)); }
+inline bool v_check_all(const v_int16x8& a) // forwards to the u16 overload on the same bits
+{ return v_check_all(v_reinterpret_as_u16(a)); }
+inline bool v_check_all(const v_int32x4& a) // forwards to the u32 overload on the same bits
+{ return v_check_all(v_reinterpret_as_u32(a)); }
+inline bool v_check_all(const v_float32x4& a) // forwards to the u32 overload on the same bits
+{ return v_check_all(v_reinterpret_as_u32(a)); }
+inline bool v_check_all(const v_int64x2& a) // forwards to the u64 overload on the same bits
+{ return v_check_all(v_reinterpret_as_u64(a)); }
+inline bool v_check_all(const v_float64x2& a) // forwards to the u64 overload on the same bits
+{ return v_check_all(v_reinterpret_as_u64(a)); }
+
+inline bool v_check_any(const v_int8x16& a) // forwards to the u8 overload on the same bits
+{ return v_check_any(v_reinterpret_as_u8(a)); }
+inline bool v_check_any(const v_int16x8& a) // forwards to the u16 overload on the same bits
+{ return v_check_any(v_reinterpret_as_u16(a)); }
+inline bool v_check_any(const v_int32x4& a) // forwards to the u32 overload on the same bits
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+inline bool v_check_any(const v_float32x4& a) // forwards to the u32 overload on the same bits
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+inline bool v_check_any(const v_int64x2& a) // forwards to the u64 overload on the same bits
+{ return v_check_any(v_reinterpret_as_u64(a)); }
+inline bool v_check_any(const v_float64x2& a) // forwards to the u64 overload on the same bits
+{ return v_check_any(v_reinterpret_as_u64(a)); }
+
+#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num, mask_func) \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) /* lanes where mask != 0 take a, the rest take b */ \
+{ \
+    return _Tpvec(vmerge_vvm_##suffix(mask_func(mask.val, 0, num), b.val, a.val, num)); \
+}
+// Arguments: vector type, intrinsic suffix, mask type (not referenced by the macro body), lane count, "not equal to scalar" compare.
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16,  i8m1, vbool8_t, 16, vmsne_vx_i8m1_b8)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8,  i16m1, vbool16_t, 8, vmsne_vx_i16m1_b16)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4,  i32m1, vbool32_t, 4, vmsne_vx_i32m1_b32)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16, vmsne_vx_u8m1_b8)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8, vmsne_vx_u16m1_b16)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4, vmsne_vx_u32m1_b32)
+inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b) // per lane: a where mask compares not-equal to 0.0f, otherwise b
+{
+    return v_float32x4(vmerge_vvm_f32m1(vmfne_vf_f32m1_b32(mask.val, 0, 4), b.val, a.val, 4)); // NOTE(review): the mask is tested as a float value, not by bit pattern - confirm callers only pass all-ones/all-zeros lanes
+}
+inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b) // per lane: a where mask compares not-equal to 0.0, otherwise b
+{
+    return v_float64x2(vmerge_vvm_f64m1(vmfne_vf_f64m1_b64(mask.val, 0, 2), b.val, a.val, 2)); // NOTE(review): the mask is tested as a double value, not by bit pattern - confirm callers only pass all-ones/all-zeros lanes
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2, num3) \
+inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) /* widens all lanes of a: first half into b0, second half into b1 */ \
+{ \
+    _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1);    \
+    b0.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 0);  \
+    b1.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 1);  \
+} \
+inline v_##_Tpwvec v_expand_low(const _Tpvec& a) /* widens the first num2 lanes of a */ \
+{ \
+    _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num2);    \
+    return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \
+} \
+inline v_##_Tpwvec v_expand_high(const _Tpvec& a) /* widens all lanes and returns register 1 of the widened pair */ \
+{ \
+    _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1);    \
+    return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 1)); \
+} \
+inline v_##_Tpwvec v_load_expand(const _Tp* ptr) /* loads num2 narrow elements from ptr and widens them */ \
+{ \
+    _T2##_t val = vle##num3##_v_##_Tp1(ptr, num2);    \
+    _T1##_t b = vw##add##_vx_##_Tp2##m2(val, 0, num2);    \
+    return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \
+}
+// Arguments: widening op (add/addu), narrow vector, wide vector, scalar type, narrow suffix, narrow lane count, wide element suffix, wide lane count, wide m2 type, narrow m1 type, element bits of the load.
+OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1, 8)
+OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort,  u16m1, 8, u32, 4, vuint32m2, vuint16m1, 16)
+OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint,  u32m1, 4, u64, 2, vuint64m2, vuint32m1, 32)
+OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar,  i8m1, 16, i16, 8, vint16m2, vint8m1, 8)
+OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short,  i16m1, 8, i32, 4, vint32m2, vint16m1, 16)
+OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int,  i32m1, 4, i64, 2, vint64m2, vint32m1, 32)
+
+inline v_uint32x4 v_load_expand_q(const uchar* ptr) // loads 4 bytes from ptr and zero-extends each to 32 bits
+{
+    vuint16m2_t b = vundefined_u16m2();
+    vuint32m2_t c = vundefined_u32m2();
+    vuint8m1_t val = vle8_v_u8m1(ptr, 4);
+    b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4); // widen 8 -> 16 bits by adding zero
+    c = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4); // widen 16 -> 32 bits by adding zero
+    return v_uint32x4(vget_v_u32m2_u32m1(c, 0));
+}
+
+inline v_int32x4 v_load_expand_q(const schar* ptr) // loads 4 signed bytes from ptr and sign-extends each to 32 bits
+{
+    vint16m2_t b = vundefined_i16m2();
+    vint32m2_t c = vundefined_i32m2();
+    vint8m1_t val = vle8_v_i8m1(ptr, 4);
+    b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4); // widen 8 -> 16 bits by adding zero
+    c = vwadd_vv_i32m2(vget_v_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4); // widen 16 -> 32 bits by adding zero
+    return v_int32x4(vget_v_i32m2_i32m1(c, 0));
+}
+#define VITL_16 {0x11011000, 0x13031202, 0x15051404, 0x17071606, 0x19091808, 0x1B0B1A0A, 0x1D0D1C0C, 0x1F0F1E0E} // byte indices 0,16,1,17,...,15,31 (least-significant byte first), packed four per 32-bit word
+#define VITL_8 {0x00080000, 0x00090001, 0x000A0002, 0x000B0003, 0x000C0004, 0x000D0005, 0x000E0006, 0x000F0007} // 16-bit indices 0,8,1,9,...,7,15 (low half first), packed two per 32-bit word
+#define VITL_4 {0x00000000, 0x00000004, 0x00000001, 0x00000005, 0x00000002, 0x00000006, 0x00000003, 0x00000007} // 32-bit indices 0,4,1,5,2,6,3,7
+#define VITL_2 {0, 0, 2, 0, 1, 0, 3, 0} // 64-bit indices 0,2,1,3, each written as a low/high pair of 32-bit words
+
+#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh, refunc) \
+inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) /* interleaves a0 and a1 lane by lane; b0 gets the first half of the result, b1 the second */ \
+{ \
+    v##_Tp##m2_t tmp = vundefined_##_T##m2();\
+    tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a0.val); \
+    tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a1.val); \
+    unsigned mdata[] = VITL_##num; /* interleaving gather indices for this lane count */ \
+    vuint32m2_t mask = vle32_v_u32m2(mdata, 8);    \
+    tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, refunc(mask), num2);    \
+    b0.val = vget_v_##_T##m2_##_T##m1(tmp, 0); \
+    b1.val = vget_v_##_T##m2_##_T##m1(tmp, 1); \
+} \
+inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) /* first numh lanes of a followed by the first numh lanes of b */ \
+{ \
+    v##_Tp##m1_t b0 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num);    \
+    return v_##_Tpvec(b0);\
+} \
+inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) /* last numh lanes of a followed by the last numh lanes of b */ \
+{ \
+    v##_Tp##m1_t b0 = vundefined_##_T##m1(); \
+    v##_Tp##m1_t a0 = vundefined_##_T##m1(); \
+    v##_Tp##m1_t b1 = vundefined_##_T##m1(); \
+    b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num);    \
+    a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num);    \
+    b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num);    \
+    return v_##_Tpvec(b1);\
+} \
+inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) /* c = low halves of a and b, d = high halves of a and b */ \
+{ \
+    v##_Tp##m1_t b0 = vundefined_##_T##m1(); \
+    v##_Tp##m1_t a0 = vundefined_##_T##m1(); \
+    c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num);    \
+    b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num);    \
+    a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num);    \
+    d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num);    \
+}
+// num = lane count, num2 = lanes in the two-register group, numh = num / 2; _UTp and _UT are not referenced by the macro body.
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2,)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2,)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2,)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1, vreinterpret_v_u32m2_u64m2)
+
+inline v_uint8x16 v_reverse(const v_uint8x16 &a) // result lane i is lane 15 - i of a
+{
+    return v_uint8x16(vrgather_vv_u8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16)); // gather with indices 15 - i
+}
+inline v_int8x16 v_reverse(const v_int8x16 &a) // result lane i is lane 15 - i of a
+{
+    return v_int8x16(vrgather_vv_i8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16)); // gather with indices 15 - i
+}
+
+inline v_uint16x8 v_reverse(const v_uint16x8 &a) // result lane i is lane 7 - i of a
+{
+    return v_uint16x8(vrgather_vv_u16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8)); // gather with indices 7 - i
+}
+
+inline v_int16x8 v_reverse(const v_int16x8 &a) // result lane i is lane 7 - i of a
+{
+    return v_int16x8(vrgather_vv_i16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8)); // gather with indices 7 - i
+}
+inline v_uint32x4 v_reverse(const v_uint32x4 &a) // result lane i is lane 3 - i of a
+{
+    return v_uint32x4(vrgather_vv_u32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4)); // gather with indices 3 - i
+}
+
+inline v_int32x4 v_reverse(const v_int32x4 &a) // result lane i is lane 3 - i of a
+{
+    return v_int32x4(vrgather_vv_i32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4)); // gather with indices 3 - i
+}
+
+inline v_float32x4 v_reverse(const v_float32x4 &a) // reverses the 4 lanes through the u32 overload
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_uint64x2 v_reverse(const v_uint64x2 &a) // swaps the two lanes
+{
+    return v_uint64x2(vrgather_vv_u64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2)); // gather with indices 1 - i
+}
+
+inline v_int64x2 v_reverse(const v_int64x2 &a) // swaps the two lanes
+{
+    return v_int64x2(vrgather_vv_i64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2)); // gather with indices 1 - i
+}
+
+inline v_float64x2 v_reverse(const v_float64x2 &a) // swaps the two lanes
+{
+    return v_float64x2(vrgather_vv_f64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2)); // gather with indices 1 - i
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) /* v_extract<n>(a, b) forwards to v_rotate_right<n>(a, b); suffix and size are not referenced */ \
+template <int n> \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+{ return v_rotate_right<n>(a, b);}
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint8x16, u8, 0)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int8x16, s8, 0)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint16x8, u16, 1)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int16x8, s16, 1)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint32x4, u32, 2)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int32x4, s32, 2)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint64x2, u64, 3)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int64x2, s64, 3)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3)
+
+
+#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix, vtype, _vtype, num, mvfunc) /* v_extract_n<i>(v): slide lane i down to lane 0 and read it as a scalar; suffix is not referenced */ \
+template<int i> inline _Tp v_extract_n(_Tpvec v) { vtype tmp = vundefined_##_vtype(); return mvfunc(vslidedown_vx_##_vtype(tmp, v.val, i, num)); }
+
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8, vuint8m1_t, u8m1, 16, vmv_x_s_u8m1_u8)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8, vint8m1_t, i8m1, 16, vmv_x_s_i8m1_i8)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16, vuint16m1_t, u16m1, 8, vmv_x_s_u16m1_u16)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16, vint16m1_t, i16m1, 8, vmv_x_s_i16m1_i16)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32, vuint32m1_t, u32m1, 4, vmv_x_s_u32m1_u32)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32, vint32m1_t, i32m1, 4, vmv_x_s_i32m1_i32)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64, vuint64m1_t, u64m1, 2, vmv_x_s_u64m1_u64)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64, vint64m1_t, i64m1, 2, vmv_x_s_i64m1_i64)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32, vfloat32m1_t, f32m1, 4, vfmv_f_s_f32m1_f32)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64, vfloat64m1_t, f64m1, 2, vfmv_f_s_f64m1_f64)
+
+#define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) /* v_broadcast_element<i>(v): every lane of the result is lane i of v */ \
+template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); }
+
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint8x16, u8, 16)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int8x16, i8, 16)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint16x8, u16, 8)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int16x8, i16, 8)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint32x4, u32, 4)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4)
+
+inline void __builtin_riscv_fsrm(int val) // writes val to the frm (floating-point rounding mode) CSR
+{
+    asm("csrw frm, %0\n\t"
+        :
+        :"r"(val));
+    return;
+}
+
+inline void barrier1(void *arg) { // empty asm with a "memory" clobber that takes arg as an input; emits no instructions
+  __asm__ __volatile__("" : : "r" (arg) : "memory");
+}
+
+inline v_int32x4 v_round(const v_float32x4& a) // float -> int32 with frm = 0 (round to nearest, ties to even)
+{
+    __builtin_riscv_fsrm(0);
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); // isolate the exponent field
+    barrier1(&nan); // presumably keeps the compiler from reordering around the frm write
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // false for lanes with an all-ones exponent (Inf/NaN)
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); // masked-off lanes come from the zero vector
+    __builtin_riscv_fsrm(0);
+    return v_int32x4(val);
+}
+inline v_int32x4 v_floor(const v_float32x4& a) // float -> int32 with frm = 2 (round down)
+{
+    __builtin_riscv_fsrm(2);
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); // isolate the exponent field
+    barrier1(&nan); // presumably keeps the compiler from reordering around the frm write
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // false for lanes with an all-ones exponent (Inf/NaN)
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); // masked-off lanes come from the zero vector
+    __builtin_riscv_fsrm(0); // back to mode 0
+    return v_int32x4(val);
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a) // float -> int32 with frm = 3 (round up)
+{
+    __builtin_riscv_fsrm(3);
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); // isolate the exponent field
+    barrier1(&nan); // presumably keeps the compiler from reordering around the frm write
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // false for lanes with an all-ones exponent (Inf/NaN)
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); // masked-off lanes come from the zero vector
+    __builtin_riscv_fsrm(0); // back to mode 0
+    return v_int32x4(val);
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a) // float -> int32 with frm = 1 (round towards zero)
+{
+    __builtin_riscv_fsrm(1);
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); // isolate the exponent field
+    barrier1(&nan); // presumably keeps the compiler from reordering around the frm write
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // false for lanes with an all-ones exponent (Inf/NaN)
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); // masked-off lanes come from the zero vector
+    __builtin_riscv_fsrm(0); // back to mode 0
+    return v_int32x4(val);
+}
+
+inline v_int32x4 v_round(const v_float64x2& a) // rounds the 2 doubles of a into lanes 0..1; the other two inputs are 0.0
+{
+    __builtin_riscv_fsrm(0); // frm = 0 (round to nearest, ties to even)
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
+    //_val = vset_f64m2(_val, 1, a.val);
+    _val = vset_v_f64m1_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2)); // second register of the pair is filled with zeros
+    barrier1(&_val); // presumably keeps the compiler from reordering around the frm write
+    vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4); // narrowing double -> int32 conversion of all 4 values
+    __builtin_riscv_fsrm(0);
+    return v_int32x4(val);
+}
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) // rounds a into lanes 0..1 and b into lanes 2..3
+{
+    __builtin_riscv_fsrm(0); // frm = 0 (round to nearest, ties to even)
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
+    _val = vset_v_f64m1_f64m2(_val, 1, b.val);
+    barrier1(&_val); // presumably keeps the compiler from reordering around the frm write
+    vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4); // narrowing double -> int32 conversion of all 4 values
+    __builtin_riscv_fsrm(0);
+    return v_int32x4(val);
+}
+inline v_int32x4 v_floor(const v_float64x2& a) // double -> int32 with frm = 2 (round down), going through float32
+{
+    __builtin_riscv_fsrm(2);
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val); // only register 0 of the pair is set
+    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2); // NOTE(review): narrows to float32 before the integer conversion, and writes only 2 lanes - confirm precision and lanes 2..3 are acceptable
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4); // isolate the exponent field
+    barrier1(&nan); // presumably keeps the compiler from reordering around the frm write
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // false for lanes with an all-ones exponent (Inf/NaN)
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); // masked-off lanes come from the zero vector
+    __builtin_riscv_fsrm(0); // back to mode 0
+    return v_int32x4(val);
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a) // double -> int32 with frm = 3 (round up), going through float32
+{
+    __builtin_riscv_fsrm(3);
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val); // only register 0 of the pair is set
+    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2); // NOTE(review): narrows to float32 before the integer conversion, and writes only 2 lanes - confirm precision and lanes 2..3 are acceptable
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4); // isolate the exponent field
+    barrier1(&nan); // presumably keeps the compiler from reordering around the frm write
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // false for lanes with an all-ones exponent (Inf/NaN)
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); // masked-off lanes come from the zero vector
+    __builtin_riscv_fsrm(0); // back to mode 0
+    return v_int32x4(val);
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a) // double -> int32 with frm = 1 (round towards zero), going through float32
+{
+    __builtin_riscv_fsrm(1);
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val); // only register 0 of the pair is set
+    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2); // NOTE(review): narrows to float32 before the integer conversion, and writes only 2 lanes - confirm precision and lanes 2..3 are acceptable
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4); // isolate the exponent field
+    barrier1(&nan); // presumably keeps the compiler from reordering around the frm write
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // false for lanes with an all-ones exponent (Inf/NaN)
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); // masked-off lanes come from the zero vector
+    __builtin_riscv_fsrm(0); // back to mode 0
+    return v_int32x4(val);
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize)    \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) /* 2-field segment load from ptr into a and b */ \
+{ \
+    intrin##2e##elemsize##_v_##_T##m1(&a.val, &b.val, ptr, num); \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) /* 3-field segment load */ \
+{ \
+    intrin##3e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num); \
+}\
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
+                                v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) /* 4-field segment load */ \
+{ \
+    intrin##4e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num); \
+} \
+
+#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize)    \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) /* 2-field segment store; the mode argument is ignored */ \
+{ \
+    intrin##2e##elemsize##_v_##_T##m1(ptr, a.val, b.val, num); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) /* 3-field segment store */ \
+{ \
+    intrin##3e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, num); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) /* 4-field segment store */ \
+{ \
+    intrin##4e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num); \
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T, elemsize) /* emits both the deinterleaving loads and the interleaving stores for one type */ \
+OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T, elemsize)    \
+OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T, elemsize)
+
+// An earlier, incomplete uint8 instantiation was disabled here; uint8 is instantiated with the other unsigned types below.
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8, 8)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16, 16)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32, 32)
+
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8, 8)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16, 16)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32, 32)
+
+#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T, _esize) /* same load/store set as above with vlseg/vsseg written out directly; used for the float and 64-bit types */ \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
+{ vlseg2e##_esize##_v_##_T##m1(&a.val, &b.val, ptr, num);} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
+{ vlseg3e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num);}\
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
+                                v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
+{ vlseg4e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num);} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) /* the mode argument is ignored */ \
+{ vsseg2e##_esize##_v_##_T##m1(ptr, a.val, b.val, num);} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ vsseg3e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, num);} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
+{ vsseg4e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num);}
+
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32, 32)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64, 64)
+
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64, 64)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64, 64)
+
+inline v_float32x4 v_cvt_f32(const v_int32x4& a) // converts each of the 4 int32 lanes to float32
+{
+    return v_float32x4(vfcvt_f_x_v_f32m1(a.val, 4));
+}
+
+#if CV_SIMD128_64F
+inline v_float32x4 v_cvt_f32(const v_float64x2& a) // narrows the 2 doubles of a into lanes 0..1 of the result
+{
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val); // only register 0 of the pair is set
+    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2); // NOTE(review): vl is 2, so lanes 2..3 are not written here - confirm callers do not expect zeros there
+    return v_float32x4(aval);
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) // narrows a into lanes 0..1 and b into lanes 2..3
+{
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
+    _val = vset_v_f64m1_f64m2(_val, 1, b.val);
+    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 4); // narrowing double -> float conversion of all 4 values
+    return v_float32x4(aval);
+}
+
+inline v_float64x2 v_cvt_f64(const v_int32x4& a) // converts lanes 0..1 of a to double
+{
+    // Widen int32 straight to float64: every int32 is exactly representable, whereas a float32 detour rounds magnitudes above 2^24.
+    vfloat64m2_t _val = vfwcvt_f_x_v_f64m2(a.val, 4);
+    return v_float64x2(vget_v_f64m2_f64m1(_val, 0));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) // converts lanes 2..3 of a to double
+{
+    // Widen int32 straight to float64: every int32 is exactly representable, whereas a float32 detour rounds magnitudes above 2^24.
+    vfloat64m2_t _val = vfwcvt_f_x_v_f64m2(a.val, 4);
+    return v_float64x2(vget_v_f64m2_f64m1(_val, 1));
+}
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a) // widens all 4 floats and returns register 0 of the widened pair
+{
+    vfloat64m2_t _val  = vfwcvt_f_f_v_f64m2(a.val, 4);
+    return v_float64x2(vget_v_f64m2_f64m1(_val, 0));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) // widens all 4 floats and returns register 1 of the widened pair
+{
+    vfloat64m2_t _val  = vfwcvt_f_f_v_f64m2(a.val, 4);
+    return v_float64x2(vget_v_f64m2_f64m1(_val, 1));
+}
+
+inline v_float64x2 v_cvt_f64(const v_int64x2& a) // converts each of the 2 int64 lanes to double
+{
+    return v_float64x2(vfcvt_f_x_v_f64m1(a.val, 2));
+}
+
+#endif
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) // within each group of 4 bytes, reorders lanes 0,1,2,3 to 0,2,1,3
+{
+    uint64 mdata[2] = {0x0705060403010200, 0x0F0D0E0C0B090A08}; // byte gather indices, least-significant byte first
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16));
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) // unsigned variant, forwarded to the signed overload on the same bits
+{
+    return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec)));
+}
+
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec) // within each group of 8 bytes, reorders lanes to 0,4,1,5,2,6,3,7
+{
+    uint64 mdata[2] = {0x0703060205010400, 0x0F0B0E0A0D090C08}; // byte gather indices, least-significant byte first
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16));
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) // unsigned variant, forwarded to the signed overload on the same bits
+{
+    return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec)));
+}
+
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) // within each group of 4 16-bit lanes, reorders lanes 0,1,2,3 to 0,2,1,3
+{
+    uint64 mdata[2] = {0x0706030205040100, 0x0F0E0B0A0D0C0908}; // byte gather indices (two per lane), least-significant byte first
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16)))); // the gather runs on the byte view of the vector
+}
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } // unsigned variant via the signed overload
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
+{
+    uint64 idx_words[2] = {0x0B0A030209080100, 0x0F0E07060D0C0504}; // byte indices for 16-bit lanes 0,4,1,5,2,6,3,7
+    vuint8m1_t picked = vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(vle64_v_u64m1(idx_words, 2)), 16);
+    return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(picked)));
+}
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { v_int16x8 shuffled = v_interleave_quads(v_reinterpret_as_s16(vec)); return v_reinterpret_as_u16(shuffled); } // via the signed overload
+
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
+{
+    uint64 idx_words[2] = {0x0B0A090803020100, 0x0F0E0D0C07060504}; // byte indices for 32-bit lanes 0,2,1,3
+    vuint8m1_t picked = vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(vle64_v_u64m1(idx_words, 2)), 16);
+    return v_int32x4(vreinterpret_v_i8m1_i32m1(vreinterpret_v_u8m1_i8m1(picked)));
+}
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { v_int32x4 shuffled = v_interleave_pairs(v_reinterpret_as_s32(vec)); return v_reinterpret_as_u32(shuffled); } // via the signed overload
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { v_int32x4 shuffled = v_interleave_pairs(v_reinterpret_as_s32(vec)); return v_reinterpret_as_f32(shuffled); } // via the signed 32-bit overload
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{
+    // Keeps bytes 0-2, 4-6, 8-10, 12-14 contiguously; the four 0xFF tail indices presumably gather zeros (confirm vs. vrgather spec).
+    uint64 idx_words[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
+    return v_int8x16(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vec.val), vreinterpret_v_u64m1_u8m1(vle64_v_u64m1(idx_words, 2)), 16)));
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { v_int8x16 packed = v_pack_triplets(v_reinterpret_as_s8(vec)); return v_reinterpret_as_u8(packed); } // via the signed overload
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{
+    uint64 idx_words[2] = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A}; // byte indices for 16-bit lanes 0,1,2,4,5,6; 0xFF = unused tail
+    vuint8m1_t picked = vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(vle64_v_u64m1(idx_words, 2)), 16);
+    return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(picked)));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { v_int16x8 packed = v_pack_triplets(v_reinterpret_as_s16(vec)); return v_reinterpret_as_u16(packed); } // via the signed overload
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } // returned unchanged: no 32-bit lane is moved
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } // same identity for unsigned 32-bit lanes
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } // same identity for float32 lanes
+
+#if CV_SIMD128_64F
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{ auto dot = v_dotprod(a, b); return v_cvt_f64(dot); } // integer dot product, then conversion to float64
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a,   const v_int32x4& b,
+                                    const v_float64x2& c)
+{ v_float64x2 dot = v_dotprod_expand(a, b); return dot + c; } // two-argument overload plus accumulator c
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{
+    vint64m2_t prod = vwmul_vv_i64m2(a.val, b.val, 4); // four widened 64-bit products in an LMUL=2 group
+    vint64m1_t folded = vadd_vv_i64m1(vget_v_i64m2_i64m1(prod, 0), vget_v_i64m2_i64m1(prod, 1), 2); // register 0 + register 1
+    return v_float64x2(vfcvt_f_x_v_f64m1(folded, 2));
+}
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ // Two-argument fast dot product with the accumulator c added on top.
+  return v_dotprod_expand_fast(a, b) + c; }
+#endif
+////// FP16 support ///////
+#if __riscv_v == 7000
+inline v_float32x4 v_load_expand(const hfloat* ptr)
+{
+    // Load four half floats, widen them to an LMUL=2 float32 group, keep register 0 of the group.
+    vfloat16m1_t halves = vle16_v_f16m1((__fp16*)ptr, 4);
+    return v_float32x4(vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(halves, 4), 0));
+}
+
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
+{
+    // The narrowing convert used here takes an LMUL=2 source: put v in register 0 of an undefined group.
+    vfloat32m2_t wide = vset_v_f32m1_f32m2(vundefined_f32m2(), 0, v.val);
+    // Narrow the first four lanes to half precision and store them at ptr.
+    vse16_v_f16m1((__fp16*)ptr, vfncvt_f_f_w_f16m1(wide, 4), 4);
+}
+#else
+inline v_float32x4 v_load_expand(const hfloat* ptr)
+{
+    // Load four half floats into a fractional (mf2) register; widening yields one float32 m1 register.
+    vfloat16mf2_t halves = vle16_v_f16mf2((__fp16*)ptr, 4);
+    return v_float32x4(vfwcvt_f_f_v_f32m1(halves, 4));
+}
+
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
+{
+    // No LMUL=2 staging is needed on this path (unlike the __riscv_v == 7000 one): the narrowing
+    // convert takes the float32 m1 register directly and produces a fractional (mf2) half vector.
+    vfloat16mf2_t halves = vfncvt_f_f_w_f16mf2(v.val, 4);
+    vse16_v_f16mf2((__fp16*)ptr, halves, 4);
+}
+#endif
+
+inline void v_cleanup() {} // no-op: the body is empty in this backend
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_010_compat_non-policy.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_010_compat_non-policy.hpp
new file mode 100644
index 000000000000..6e19e3087b55
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_010_compat_non-policy.hpp
@@ -0,0 +1,24395 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copied from
+// https://github.com/riscv-non-isa/rvv-intrinsic-doc/tree/master/auto-generated/rvv-v0p10-compatible-headers
+
+#ifndef __RVV_0P10_COMPATIBLE_HEADERS_NON_OVERLOADED_NON_POLICY_H
+#define __RVV_0P10_COMPATIBLE_HEADERS_NON_OVERLOADED_NON_POLICY_H
+
+
+#if __has_include ("riscv_vector.h")
+#include <riscv_vector.h>
+#endif
+#ifndef __RISCV_VECTOR_H
+#include_next <riscv_vector.h>
+#endif
+
+#define vsetvl_e8mf8(...) __riscv_vsetvl_e8mf8(__VA_ARGS__)
+#define vsetvl_e8mf4(...) __riscv_vsetvl_e8mf4(__VA_ARGS__)
+#define vsetvl_e8mf2(...) __riscv_vsetvl_e8mf2(__VA_ARGS__)
+#define vsetvl_e8m1(...) __riscv_vsetvl_e8m1(__VA_ARGS__)
+#define vsetvl_e8m2(...) __riscv_vsetvl_e8m2(__VA_ARGS__)
+#define vsetvl_e8m4(...) __riscv_vsetvl_e8m4(__VA_ARGS__)
+#define vsetvl_e8m8(...) __riscv_vsetvl_e8m8(__VA_ARGS__)
+#define vsetvl_e16mf4(...) __riscv_vsetvl_e16mf4(__VA_ARGS__)
+#define vsetvl_e16mf2(...) __riscv_vsetvl_e16mf2(__VA_ARGS__)
+#define vsetvl_e16m1(...) __riscv_vsetvl_e16m1(__VA_ARGS__)
+#define vsetvl_e16m2(...) __riscv_vsetvl_e16m2(__VA_ARGS__)
+#define vsetvl_e16m4(...) __riscv_vsetvl_e16m4(__VA_ARGS__)
+#define vsetvl_e16m8(...) __riscv_vsetvl_e16m8(__VA_ARGS__)
+#define vsetvl_e32mf2(...) __riscv_vsetvl_e32mf2(__VA_ARGS__)
+#define vsetvl_e32m1(...) __riscv_vsetvl_e32m1(__VA_ARGS__)
+#define vsetvl_e32m2(...) __riscv_vsetvl_e32m2(__VA_ARGS__)
+#define vsetvl_e32m4(...) __riscv_vsetvl_e32m4(__VA_ARGS__)
+#define vsetvl_e32m8(...) __riscv_vsetvl_e32m8(__VA_ARGS__)
+#define vsetvl_e64m1(...) __riscv_vsetvl_e64m1(__VA_ARGS__)
+#define vsetvl_e64m2(...) __riscv_vsetvl_e64m2(__VA_ARGS__)
+#define vsetvl_e64m4(...) __riscv_vsetvl_e64m4(__VA_ARGS__)
+#define vsetvl_e64m8(...) __riscv_vsetvl_e64m8(__VA_ARGS__)
+#define vsetvlmax_e8mf8(...) __riscv_vsetvlmax_e8mf8(__VA_ARGS__)
+#define vsetvlmax_e8mf4(...) __riscv_vsetvlmax_e8mf4(__VA_ARGS__)
+#define vsetvlmax_e8mf2(...) __riscv_vsetvlmax_e8mf2(__VA_ARGS__)
+#define vsetvlmax_e8m1(...) __riscv_vsetvlmax_e8m1(__VA_ARGS__)
+#define vsetvlmax_e8m2(...) __riscv_vsetvlmax_e8m2(__VA_ARGS__)
+#define vsetvlmax_e8m4(...) __riscv_vsetvlmax_e8m4(__VA_ARGS__)
+#define vsetvlmax_e8m8(...) __riscv_vsetvlmax_e8m8(__VA_ARGS__)
+#define vsetvlmax_e16mf4(...) __riscv_vsetvlmax_e16mf4(__VA_ARGS__)
+#define vsetvlmax_e16mf2(...) __riscv_vsetvlmax_e16mf2(__VA_ARGS__)
+#define vsetvlmax_e16m1(...) __riscv_vsetvlmax_e16m1(__VA_ARGS__)
+#define vsetvlmax_e16m2(...) __riscv_vsetvlmax_e16m2(__VA_ARGS__)
+#define vsetvlmax_e16m4(...) __riscv_vsetvlmax_e16m4(__VA_ARGS__)
+#define vsetvlmax_e16m8(...) __riscv_vsetvlmax_e16m8(__VA_ARGS__)
+#define vsetvlmax_e32mf2(...) __riscv_vsetvlmax_e32mf2(__VA_ARGS__)
+#define vsetvlmax_e32m1(...) __riscv_vsetvlmax_e32m1(__VA_ARGS__)
+#define vsetvlmax_e32m2(...) __riscv_vsetvlmax_e32m2(__VA_ARGS__)
+#define vsetvlmax_e32m4(...) __riscv_vsetvlmax_e32m4(__VA_ARGS__)
+#define vsetvlmax_e32m8(...) __riscv_vsetvlmax_e32m8(__VA_ARGS__)
+#define vsetvlmax_e64m1(...) __riscv_vsetvlmax_e64m1(__VA_ARGS__)
+#define vsetvlmax_e64m2(...) __riscv_vsetvlmax_e64m2(__VA_ARGS__)
+#define vsetvlmax_e64m4(...) __riscv_vsetvlmax_e64m4(__VA_ARGS__)
+#define vsetvlmax_e64m8(...) __riscv_vsetvlmax_e64m8(__VA_ARGS__)
+#define vle16_v_f16mf4(...) __riscv_vle16_v_f16mf4(__VA_ARGS__)
+#define vle16_v_f16mf2(...) __riscv_vle16_v_f16mf2(__VA_ARGS__)
+#define vle16_v_f16m1(...) __riscv_vle16_v_f16m1(__VA_ARGS__)
+#define vle16_v_f16m2(...) __riscv_vle16_v_f16m2(__VA_ARGS__)
+#define vle16_v_f16m4(...) __riscv_vle16_v_f16m4(__VA_ARGS__)
+#define vle16_v_f16m8(...) __riscv_vle16_v_f16m8(__VA_ARGS__)
+#define vle32_v_f32mf2(...) __riscv_vle32_v_f32mf2(__VA_ARGS__)
+#define vle32_v_f32m1(...) __riscv_vle32_v_f32m1(__VA_ARGS__)
+#define vle32_v_f32m2(...) __riscv_vle32_v_f32m2(__VA_ARGS__)
+#define vle32_v_f32m4(...) __riscv_vle32_v_f32m4(__VA_ARGS__)
+#define vle32_v_f32m8(...) __riscv_vle32_v_f32m8(__VA_ARGS__)
+#define vle64_v_f64m1(...) __riscv_vle64_v_f64m1(__VA_ARGS__)
+#define vle64_v_f64m2(...) __riscv_vle64_v_f64m2(__VA_ARGS__)
+#define vle64_v_f64m4(...) __riscv_vle64_v_f64m4(__VA_ARGS__)
+#define vle64_v_f64m8(...) __riscv_vle64_v_f64m8(__VA_ARGS__)
+#define vle8_v_i8mf8(...) __riscv_vle8_v_i8mf8(__VA_ARGS__)
+#define vle8_v_i8mf4(...) __riscv_vle8_v_i8mf4(__VA_ARGS__)
+#define vle8_v_i8mf2(...) __riscv_vle8_v_i8mf2(__VA_ARGS__)
+#define vle8_v_i8m1(...) __riscv_vle8_v_i8m1(__VA_ARGS__)
+#define vle8_v_i8m2(...) __riscv_vle8_v_i8m2(__VA_ARGS__)
+#define vle8_v_i8m4(...) __riscv_vle8_v_i8m4(__VA_ARGS__)
+#define vle8_v_i8m8(...) __riscv_vle8_v_i8m8(__VA_ARGS__)
+#define vle16_v_i16mf4(...) __riscv_vle16_v_i16mf4(__VA_ARGS__)
+#define vle16_v_i16mf2(...) __riscv_vle16_v_i16mf2(__VA_ARGS__)
+#define vle16_v_i16m1(...) __riscv_vle16_v_i16m1(__VA_ARGS__)
+#define vle16_v_i16m2(...) __riscv_vle16_v_i16m2(__VA_ARGS__)
+#define vle16_v_i16m4(...) __riscv_vle16_v_i16m4(__VA_ARGS__)
+#define vle16_v_i16m8(...) __riscv_vle16_v_i16m8(__VA_ARGS__)
+#define vle32_v_i32mf2(...) __riscv_vle32_v_i32mf2(__VA_ARGS__)
+#define vle32_v_i32m1(...) __riscv_vle32_v_i32m1(__VA_ARGS__)
+#define vle32_v_i32m2(...) __riscv_vle32_v_i32m2(__VA_ARGS__)
+#define vle32_v_i32m4(...) __riscv_vle32_v_i32m4(__VA_ARGS__)
+#define vle32_v_i32m8(...) __riscv_vle32_v_i32m8(__VA_ARGS__)
+#define vle64_v_i64m1(...) __riscv_vle64_v_i64m1(__VA_ARGS__)
+#define vle64_v_i64m2(...) __riscv_vle64_v_i64m2(__VA_ARGS__)
+#define vle64_v_i64m4(...) __riscv_vle64_v_i64m4(__VA_ARGS__)
+#define vle64_v_i64m8(...) __riscv_vle64_v_i64m8(__VA_ARGS__)
+#define vle8_v_u8mf8(...) __riscv_vle8_v_u8mf8(__VA_ARGS__)
+#define vle8_v_u8mf4(...) __riscv_vle8_v_u8mf4(__VA_ARGS__)
+#define vle8_v_u8mf2(...) __riscv_vle8_v_u8mf2(__VA_ARGS__)
+#define vle8_v_u8m1(...) __riscv_vle8_v_u8m1(__VA_ARGS__)
+#define vle8_v_u8m2(...) __riscv_vle8_v_u8m2(__VA_ARGS__)
+#define vle8_v_u8m4(...) __riscv_vle8_v_u8m4(__VA_ARGS__)
+#define vle8_v_u8m8(...) __riscv_vle8_v_u8m8(__VA_ARGS__)
+#define vle16_v_u16mf4(...) __riscv_vle16_v_u16mf4(__VA_ARGS__)
+#define vle16_v_u16mf2(...) __riscv_vle16_v_u16mf2(__VA_ARGS__)
+#define vle16_v_u16m1(...) __riscv_vle16_v_u16m1(__VA_ARGS__)
+#define vle16_v_u16m2(...) __riscv_vle16_v_u16m2(__VA_ARGS__)
+#define vle16_v_u16m4(...) __riscv_vle16_v_u16m4(__VA_ARGS__)
+#define vle16_v_u16m8(...) __riscv_vle16_v_u16m8(__VA_ARGS__)
+#define vle32_v_u32mf2(...) __riscv_vle32_v_u32mf2(__VA_ARGS__)
+#define vle32_v_u32m1(...) __riscv_vle32_v_u32m1(__VA_ARGS__)
+#define vle32_v_u32m2(...) __riscv_vle32_v_u32m2(__VA_ARGS__)
+#define vle32_v_u32m4(...) __riscv_vle32_v_u32m4(__VA_ARGS__)
+#define vle32_v_u32m8(...) __riscv_vle32_v_u32m8(__VA_ARGS__)
+#define vle64_v_u64m1(...) __riscv_vle64_v_u64m1(__VA_ARGS__)
+#define vle64_v_u64m2(...) __riscv_vle64_v_u64m2(__VA_ARGS__)
+#define vle64_v_u64m4(...) __riscv_vle64_v_u64m4(__VA_ARGS__)
+#define vle64_v_u64m8(...) __riscv_vle64_v_u64m8(__VA_ARGS__)
+// masked unit-stride loads: each v0.10 "_m" name maps to the "_tumu" (tail-undisturbed, mask-undisturbed) intrinsic
+#define vle16_v_f16mf4_m(...) __riscv_vle16_v_f16mf4_tumu(__VA_ARGS__)
+#define vle16_v_f16mf2_m(...) __riscv_vle16_v_f16mf2_tumu(__VA_ARGS__)
+#define vle16_v_f16m1_m(...) __riscv_vle16_v_f16m1_tumu(__VA_ARGS__)
+#define vle16_v_f16m2_m(...) __riscv_vle16_v_f16m2_tumu(__VA_ARGS__)
+#define vle16_v_f16m4_m(...) __riscv_vle16_v_f16m4_tumu(__VA_ARGS__)
+#define vle16_v_f16m8_m(...) __riscv_vle16_v_f16m8_tumu(__VA_ARGS__)
+#define vle32_v_f32mf2_m(...) __riscv_vle32_v_f32mf2_tumu(__VA_ARGS__)
+#define vle32_v_f32m1_m(...) __riscv_vle32_v_f32m1_tumu(__VA_ARGS__)
+#define vle32_v_f32m2_m(...) __riscv_vle32_v_f32m2_tumu(__VA_ARGS__)
+#define vle32_v_f32m4_m(...) __riscv_vle32_v_f32m4_tumu(__VA_ARGS__)
+#define vle32_v_f32m8_m(...) __riscv_vle32_v_f32m8_tumu(__VA_ARGS__)
+#define vle64_v_f64m1_m(...) __riscv_vle64_v_f64m1_tumu(__VA_ARGS__)
+#define vle64_v_f64m2_m(...) __riscv_vle64_v_f64m2_tumu(__VA_ARGS__)
+#define vle64_v_f64m4_m(...) __riscv_vle64_v_f64m4_tumu(__VA_ARGS__)
+#define vle64_v_f64m8_m(...) __riscv_vle64_v_f64m8_tumu(__VA_ARGS__)
+#define vle8_v_i8mf8_m(...) __riscv_vle8_v_i8mf8_tumu(__VA_ARGS__)
+#define vle8_v_i8mf4_m(...) __riscv_vle8_v_i8mf4_tumu(__VA_ARGS__)
+#define vle8_v_i8mf2_m(...) __riscv_vle8_v_i8mf2_tumu(__VA_ARGS__)
+#define vle8_v_i8m1_m(...) __riscv_vle8_v_i8m1_tumu(__VA_ARGS__)
+#define vle8_v_i8m2_m(...) __riscv_vle8_v_i8m2_tumu(__VA_ARGS__)
+#define vle8_v_i8m4_m(...) __riscv_vle8_v_i8m4_tumu(__VA_ARGS__)
+#define vle8_v_i8m8_m(...) __riscv_vle8_v_i8m8_tumu(__VA_ARGS__)
+#define vle16_v_i16mf4_m(...) __riscv_vle16_v_i16mf4_tumu(__VA_ARGS__)
+#define vle16_v_i16mf2_m(...) __riscv_vle16_v_i16mf2_tumu(__VA_ARGS__)
+#define vle16_v_i16m1_m(...) __riscv_vle16_v_i16m1_tumu(__VA_ARGS__)
+#define vle16_v_i16m2_m(...) __riscv_vle16_v_i16m2_tumu(__VA_ARGS__)
+#define vle16_v_i16m4_m(...) __riscv_vle16_v_i16m4_tumu(__VA_ARGS__)
+#define vle16_v_i16m8_m(...) __riscv_vle16_v_i16m8_tumu(__VA_ARGS__)
+#define vle32_v_i32mf2_m(...) __riscv_vle32_v_i32mf2_tumu(__VA_ARGS__)
+#define vle32_v_i32m1_m(...) __riscv_vle32_v_i32m1_tumu(__VA_ARGS__)
+#define vle32_v_i32m2_m(...) __riscv_vle32_v_i32m2_tumu(__VA_ARGS__)
+#define vle32_v_i32m4_m(...) __riscv_vle32_v_i32m4_tumu(__VA_ARGS__)
+#define vle32_v_i32m8_m(...) __riscv_vle32_v_i32m8_tumu(__VA_ARGS__)
+#define vle64_v_i64m1_m(...) __riscv_vle64_v_i64m1_tumu(__VA_ARGS__)
+#define vle64_v_i64m2_m(...) __riscv_vle64_v_i64m2_tumu(__VA_ARGS__)
+#define vle64_v_i64m4_m(...) __riscv_vle64_v_i64m4_tumu(__VA_ARGS__)
+#define vle64_v_i64m8_m(...) __riscv_vle64_v_i64m8_tumu(__VA_ARGS__)
+#define vle8_v_u8mf8_m(...) __riscv_vle8_v_u8mf8_tumu(__VA_ARGS__)
+#define vle8_v_u8mf4_m(...) __riscv_vle8_v_u8mf4_tumu(__VA_ARGS__)
+#define vle8_v_u8mf2_m(...) __riscv_vle8_v_u8mf2_tumu(__VA_ARGS__)
+#define vle8_v_u8m1_m(...) __riscv_vle8_v_u8m1_tumu(__VA_ARGS__)
+#define vle8_v_u8m2_m(...) __riscv_vle8_v_u8m2_tumu(__VA_ARGS__)
+#define vle8_v_u8m4_m(...) __riscv_vle8_v_u8m4_tumu(__VA_ARGS__)
+#define vle8_v_u8m8_m(...) __riscv_vle8_v_u8m8_tumu(__VA_ARGS__)
+#define vle16_v_u16mf4_m(...) __riscv_vle16_v_u16mf4_tumu(__VA_ARGS__)
+#define vle16_v_u16mf2_m(...) __riscv_vle16_v_u16mf2_tumu(__VA_ARGS__)
+#define vle16_v_u16m1_m(...) __riscv_vle16_v_u16m1_tumu(__VA_ARGS__)
+#define vle16_v_u16m2_m(...) __riscv_vle16_v_u16m2_tumu(__VA_ARGS__)
+#define vle16_v_u16m4_m(...) __riscv_vle16_v_u16m4_tumu(__VA_ARGS__)
+#define vle16_v_u16m8_m(...) __riscv_vle16_v_u16m8_tumu(__VA_ARGS__)
+#define vle32_v_u32mf2_m(...) __riscv_vle32_v_u32mf2_tumu(__VA_ARGS__)
+#define vle32_v_u32m1_m(...) __riscv_vle32_v_u32m1_tumu(__VA_ARGS__)
+#define vle32_v_u32m2_m(...) __riscv_vle32_v_u32m2_tumu(__VA_ARGS__)
+#define vle32_v_u32m4_m(...) __riscv_vle32_v_u32m4_tumu(__VA_ARGS__)
+#define vle32_v_u32m8_m(...) __riscv_vle32_v_u32m8_tumu(__VA_ARGS__)
+#define vle64_v_u64m1_m(...) __riscv_vle64_v_u64m1_tumu(__VA_ARGS__)
+#define vle64_v_u64m2_m(...) __riscv_vle64_v_u64m2_tumu(__VA_ARGS__)
+#define vle64_v_u64m4_m(...) __riscv_vle64_v_u64m4_tumu(__VA_ARGS__)
+#define vle64_v_u64m8_m(...) __riscv_vle64_v_u64m8_tumu(__VA_ARGS__)
+#define vse16_v_f16mf4(...) __riscv_vse16_v_f16mf4(__VA_ARGS__)
+#define vse16_v_f16mf2(...) __riscv_vse16_v_f16mf2(__VA_ARGS__)
+#define vse16_v_f16m1(...) __riscv_vse16_v_f16m1(__VA_ARGS__)
+#define vse16_v_f16m2(...) __riscv_vse16_v_f16m2(__VA_ARGS__)
+#define vse16_v_f16m4(...) __riscv_vse16_v_f16m4(__VA_ARGS__)
+#define vse16_v_f16m8(...) __riscv_vse16_v_f16m8(__VA_ARGS__)
+#define vse32_v_f32mf2(...) __riscv_vse32_v_f32mf2(__VA_ARGS__)
+#define vse32_v_f32m1(...) __riscv_vse32_v_f32m1(__VA_ARGS__)
+#define vse32_v_f32m2(...) __riscv_vse32_v_f32m2(__VA_ARGS__)
+#define vse32_v_f32m4(...) __riscv_vse32_v_f32m4(__VA_ARGS__)
+#define vse32_v_f32m8(...) __riscv_vse32_v_f32m8(__VA_ARGS__)
+#define vse64_v_f64m1(...) __riscv_vse64_v_f64m1(__VA_ARGS__)
+#define vse64_v_f64m2(...) __riscv_vse64_v_f64m2(__VA_ARGS__)
+#define vse64_v_f64m4(...) __riscv_vse64_v_f64m4(__VA_ARGS__)
+#define vse64_v_f64m8(...) __riscv_vse64_v_f64m8(__VA_ARGS__)
+#define vse8_v_i8mf8(...) __riscv_vse8_v_i8mf8(__VA_ARGS__)
+#define vse8_v_i8mf4(...) __riscv_vse8_v_i8mf4(__VA_ARGS__)
+#define vse8_v_i8mf2(...) __riscv_vse8_v_i8mf2(__VA_ARGS__)
+#define vse8_v_i8m1(...) __riscv_vse8_v_i8m1(__VA_ARGS__)
+#define vse8_v_i8m2(...) __riscv_vse8_v_i8m2(__VA_ARGS__)
+#define vse8_v_i8m4(...) __riscv_vse8_v_i8m4(__VA_ARGS__)
+#define vse8_v_i8m8(...) __riscv_vse8_v_i8m8(__VA_ARGS__)
+#define vse16_v_i16mf4(...) __riscv_vse16_v_i16mf4(__VA_ARGS__)
+#define vse16_v_i16mf2(...) __riscv_vse16_v_i16mf2(__VA_ARGS__)
+#define vse16_v_i16m1(...) __riscv_vse16_v_i16m1(__VA_ARGS__)
+#define vse16_v_i16m2(...) __riscv_vse16_v_i16m2(__VA_ARGS__)
+#define vse16_v_i16m4(...) __riscv_vse16_v_i16m4(__VA_ARGS__)
+#define vse16_v_i16m8(...) __riscv_vse16_v_i16m8(__VA_ARGS__)
+#define vse32_v_i32mf2(...) __riscv_vse32_v_i32mf2(__VA_ARGS__)
+#define vse32_v_i32m1(...) __riscv_vse32_v_i32m1(__VA_ARGS__)
+#define vse32_v_i32m2(...) __riscv_vse32_v_i32m2(__VA_ARGS__)
+#define vse32_v_i32m4(...) __riscv_vse32_v_i32m4(__VA_ARGS__)
+#define vse32_v_i32m8(...) __riscv_vse32_v_i32m8(__VA_ARGS__)
+#define vse64_v_i64m1(...) __riscv_vse64_v_i64m1(__VA_ARGS__)
+#define vse64_v_i64m2(...) __riscv_vse64_v_i64m2(__VA_ARGS__)
+#define vse64_v_i64m4(...) __riscv_vse64_v_i64m4(__VA_ARGS__)
+#define vse64_v_i64m8(...) __riscv_vse64_v_i64m8(__VA_ARGS__)
+#define vse8_v_u8mf8(...) __riscv_vse8_v_u8mf8(__VA_ARGS__)
+#define vse8_v_u8mf4(...) __riscv_vse8_v_u8mf4(__VA_ARGS__)
+#define vse8_v_u8mf2(...) __riscv_vse8_v_u8mf2(__VA_ARGS__)
+#define vse8_v_u8m1(...) __riscv_vse8_v_u8m1(__VA_ARGS__)
+#define vse8_v_u8m2(...) __riscv_vse8_v_u8m2(__VA_ARGS__)
+#define vse8_v_u8m4(...) __riscv_vse8_v_u8m4(__VA_ARGS__)
+#define vse8_v_u8m8(...) __riscv_vse8_v_u8m8(__VA_ARGS__)
+#define vse16_v_u16mf4(...) __riscv_vse16_v_u16mf4(__VA_ARGS__)
+#define vse16_v_u16mf2(...) __riscv_vse16_v_u16mf2(__VA_ARGS__)
+#define vse16_v_u16m1(...) __riscv_vse16_v_u16m1(__VA_ARGS__)
+#define vse16_v_u16m2(...) __riscv_vse16_v_u16m2(__VA_ARGS__)
+#define vse16_v_u16m4(...) __riscv_vse16_v_u16m4(__VA_ARGS__)
+#define vse16_v_u16m8(...) __riscv_vse16_v_u16m8(__VA_ARGS__)
+#define vse32_v_u32mf2(...) __riscv_vse32_v_u32mf2(__VA_ARGS__)
+#define vse32_v_u32m1(...) __riscv_vse32_v_u32m1(__VA_ARGS__)
+#define vse32_v_u32m2(...) __riscv_vse32_v_u32m2(__VA_ARGS__)
+#define vse32_v_u32m4(...) __riscv_vse32_v_u32m4(__VA_ARGS__)
+#define vse32_v_u32m8(...) __riscv_vse32_v_u32m8(__VA_ARGS__)
+#define vse64_v_u64m1(...) __riscv_vse64_v_u64m1(__VA_ARGS__)
+#define vse64_v_u64m2(...) __riscv_vse64_v_u64m2(__VA_ARGS__)
+#define vse64_v_u64m4(...) __riscv_vse64_v_u64m4(__VA_ARGS__)
+#define vse64_v_u64m8(...) __riscv_vse64_v_u64m8(__VA_ARGS__)
+// masked unit-stride stores: each v0.10 "_m" name maps to the identically suffixed __riscv_*_m intrinsic
+#define vse16_v_f16mf4_m(...) __riscv_vse16_v_f16mf4_m(__VA_ARGS__)
+#define vse16_v_f16mf2_m(...) __riscv_vse16_v_f16mf2_m(__VA_ARGS__)
+#define vse16_v_f16m1_m(...) __riscv_vse16_v_f16m1_m(__VA_ARGS__)
+#define vse16_v_f16m2_m(...) __riscv_vse16_v_f16m2_m(__VA_ARGS__)
+#define vse16_v_f16m4_m(...) __riscv_vse16_v_f16m4_m(__VA_ARGS__)
+#define vse16_v_f16m8_m(...) __riscv_vse16_v_f16m8_m(__VA_ARGS__)
+#define vse32_v_f32mf2_m(...) __riscv_vse32_v_f32mf2_m(__VA_ARGS__)
+#define vse32_v_f32m1_m(...) __riscv_vse32_v_f32m1_m(__VA_ARGS__)
+#define vse32_v_f32m2_m(...) __riscv_vse32_v_f32m2_m(__VA_ARGS__)
+#define vse32_v_f32m4_m(...) __riscv_vse32_v_f32m4_m(__VA_ARGS__)
+#define vse32_v_f32m8_m(...) __riscv_vse32_v_f32m8_m(__VA_ARGS__)
+#define vse64_v_f64m1_m(...) __riscv_vse64_v_f64m1_m(__VA_ARGS__)
+#define vse64_v_f64m2_m(...) __riscv_vse64_v_f64m2_m(__VA_ARGS__)
+#define vse64_v_f64m4_m(...) __riscv_vse64_v_f64m4_m(__VA_ARGS__)
+#define vse64_v_f64m8_m(...) __riscv_vse64_v_f64m8_m(__VA_ARGS__)
+#define vse8_v_i8mf8_m(...) __riscv_vse8_v_i8mf8_m(__VA_ARGS__)
+#define vse8_v_i8mf4_m(...) __riscv_vse8_v_i8mf4_m(__VA_ARGS__)
+#define vse8_v_i8mf2_m(...) __riscv_vse8_v_i8mf2_m(__VA_ARGS__)
+#define vse8_v_i8m1_m(...) __riscv_vse8_v_i8m1_m(__VA_ARGS__)
+#define vse8_v_i8m2_m(...) __riscv_vse8_v_i8m2_m(__VA_ARGS__)
+#define vse8_v_i8m4_m(...) __riscv_vse8_v_i8m4_m(__VA_ARGS__)
+#define vse8_v_i8m8_m(...) __riscv_vse8_v_i8m8_m(__VA_ARGS__)
+#define vse16_v_i16mf4_m(...) __riscv_vse16_v_i16mf4_m(__VA_ARGS__)
+#define vse16_v_i16mf2_m(...) __riscv_vse16_v_i16mf2_m(__VA_ARGS__)
+#define vse16_v_i16m1_m(...) __riscv_vse16_v_i16m1_m(__VA_ARGS__)
+#define vse16_v_i16m2_m(...) __riscv_vse16_v_i16m2_m(__VA_ARGS__)
+#define vse16_v_i16m4_m(...) __riscv_vse16_v_i16m4_m(__VA_ARGS__)
+#define vse16_v_i16m8_m(...) __riscv_vse16_v_i16m8_m(__VA_ARGS__)
+#define vse32_v_i32mf2_m(...) __riscv_vse32_v_i32mf2_m(__VA_ARGS__)
+#define vse32_v_i32m1_m(...) __riscv_vse32_v_i32m1_m(__VA_ARGS__)
+#define vse32_v_i32m2_m(...) __riscv_vse32_v_i32m2_m(__VA_ARGS__)
+#define vse32_v_i32m4_m(...) __riscv_vse32_v_i32m4_m(__VA_ARGS__)
+#define vse32_v_i32m8_m(...) __riscv_vse32_v_i32m8_m(__VA_ARGS__)
+#define vse64_v_i64m1_m(...) __riscv_vse64_v_i64m1_m(__VA_ARGS__)
+#define vse64_v_i64m2_m(...) __riscv_vse64_v_i64m2_m(__VA_ARGS__)
+#define vse64_v_i64m4_m(...) __riscv_vse64_v_i64m4_m(__VA_ARGS__)
+#define vse64_v_i64m8_m(...) __riscv_vse64_v_i64m8_m(__VA_ARGS__)
+#define vse8_v_u8mf8_m(...) __riscv_vse8_v_u8mf8_m(__VA_ARGS__)
+#define vse8_v_u8mf4_m(...) __riscv_vse8_v_u8mf4_m(__VA_ARGS__)
+#define vse8_v_u8mf2_m(...) __riscv_vse8_v_u8mf2_m(__VA_ARGS__)
+#define vse8_v_u8m1_m(...) __riscv_vse8_v_u8m1_m(__VA_ARGS__)
+#define vse8_v_u8m2_m(...) __riscv_vse8_v_u8m2_m(__VA_ARGS__)
+#define vse8_v_u8m4_m(...) __riscv_vse8_v_u8m4_m(__VA_ARGS__)
+#define vse8_v_u8m8_m(...) __riscv_vse8_v_u8m8_m(__VA_ARGS__)
+#define vse16_v_u16mf4_m(...) __riscv_vse16_v_u16mf4_m(__VA_ARGS__)
+#define vse16_v_u16mf2_m(...) __riscv_vse16_v_u16mf2_m(__VA_ARGS__)
+#define vse16_v_u16m1_m(...) __riscv_vse16_v_u16m1_m(__VA_ARGS__)
+#define vse16_v_u16m2_m(...) __riscv_vse16_v_u16m2_m(__VA_ARGS__)
+#define vse16_v_u16m4_m(...) __riscv_vse16_v_u16m4_m(__VA_ARGS__)
+#define vse16_v_u16m8_m(...) __riscv_vse16_v_u16m8_m(__VA_ARGS__)
+#define vse32_v_u32mf2_m(...) __riscv_vse32_v_u32mf2_m(__VA_ARGS__)
+#define vse32_v_u32m1_m(...) __riscv_vse32_v_u32m1_m(__VA_ARGS__)
+#define vse32_v_u32m2_m(...) __riscv_vse32_v_u32m2_m(__VA_ARGS__)
+#define vse32_v_u32m4_m(...) __riscv_vse32_v_u32m4_m(__VA_ARGS__)
+#define vse32_v_u32m8_m(...) __riscv_vse32_v_u32m8_m(__VA_ARGS__)
+#define vse64_v_u64m1_m(...) __riscv_vse64_v_u64m1_m(__VA_ARGS__)
+#define vse64_v_u64m2_m(...) __riscv_vse64_v_u64m2_m(__VA_ARGS__)
+#define vse64_v_u64m4_m(...) __riscv_vse64_v_u64m4_m(__VA_ARGS__)
+#define vse64_v_u64m8_m(...) __riscv_vse64_v_u64m8_m(__VA_ARGS__)
+#define vlse16_v_f16mf4(...) __riscv_vlse16_v_f16mf4(__VA_ARGS__)
+#define vlse16_v_f16mf2(...) __riscv_vlse16_v_f16mf2(__VA_ARGS__)
+#define vlse16_v_f16m1(...) __riscv_vlse16_v_f16m1(__VA_ARGS__)
+#define vlse16_v_f16m2(...) __riscv_vlse16_v_f16m2(__VA_ARGS__)
+#define vlse16_v_f16m4(...) __riscv_vlse16_v_f16m4(__VA_ARGS__)
+#define vlse16_v_f16m8(...) __riscv_vlse16_v_f16m8(__VA_ARGS__)
+#define vlse32_v_f32mf2(...) __riscv_vlse32_v_f32mf2(__VA_ARGS__)
+#define vlse32_v_f32m1(...) __riscv_vlse32_v_f32m1(__VA_ARGS__)
+#define vlse32_v_f32m2(...) __riscv_vlse32_v_f32m2(__VA_ARGS__)
+#define vlse32_v_f32m4(...) __riscv_vlse32_v_f32m4(__VA_ARGS__)
+#define vlse32_v_f32m8(...) __riscv_vlse32_v_f32m8(__VA_ARGS__)
+#define vlse64_v_f64m1(...) __riscv_vlse64_v_f64m1(__VA_ARGS__)
+#define vlse64_v_f64m2(...) __riscv_vlse64_v_f64m2(__VA_ARGS__)
+#define vlse64_v_f64m4(...) __riscv_vlse64_v_f64m4(__VA_ARGS__)
+#define vlse64_v_f64m8(...) __riscv_vlse64_v_f64m8(__VA_ARGS__)
+#define vlse8_v_i8mf8(...) __riscv_vlse8_v_i8mf8(__VA_ARGS__)
+#define vlse8_v_i8mf4(...) __riscv_vlse8_v_i8mf4(__VA_ARGS__)
+#define vlse8_v_i8mf2(...) __riscv_vlse8_v_i8mf2(__VA_ARGS__)
+#define vlse8_v_i8m1(...) __riscv_vlse8_v_i8m1(__VA_ARGS__)
+#define vlse8_v_i8m2(...) __riscv_vlse8_v_i8m2(__VA_ARGS__)
+#define vlse8_v_i8m4(...) __riscv_vlse8_v_i8m4(__VA_ARGS__)
+#define vlse8_v_i8m8(...) __riscv_vlse8_v_i8m8(__VA_ARGS__)
+#define vlse16_v_i16mf4(...) __riscv_vlse16_v_i16mf4(__VA_ARGS__)
+#define vlse16_v_i16mf2(...) __riscv_vlse16_v_i16mf2(__VA_ARGS__)
+#define vlse16_v_i16m1(...) __riscv_vlse16_v_i16m1(__VA_ARGS__)
+#define vlse16_v_i16m2(...) __riscv_vlse16_v_i16m2(__VA_ARGS__)
+#define vlse16_v_i16m4(...) __riscv_vlse16_v_i16m4(__VA_ARGS__)
+#define vlse16_v_i16m8(...) __riscv_vlse16_v_i16m8(__VA_ARGS__)
+#define vlse32_v_i32mf2(...) __riscv_vlse32_v_i32mf2(__VA_ARGS__)
+#define vlse32_v_i32m1(...) __riscv_vlse32_v_i32m1(__VA_ARGS__)
+#define vlse32_v_i32m2(...) __riscv_vlse32_v_i32m2(__VA_ARGS__)
+#define vlse32_v_i32m4(...) __riscv_vlse32_v_i32m4(__VA_ARGS__)
+#define vlse32_v_i32m8(...) __riscv_vlse32_v_i32m8(__VA_ARGS__)
+#define vlse64_v_i64m1(...) __riscv_vlse64_v_i64m1(__VA_ARGS__)
+#define vlse64_v_i64m2(...) __riscv_vlse64_v_i64m2(__VA_ARGS__)
+#define vlse64_v_i64m4(...) __riscv_vlse64_v_i64m4(__VA_ARGS__)
+#define vlse64_v_i64m8(...) __riscv_vlse64_v_i64m8(__VA_ARGS__)
+#define vlse8_v_u8mf8(...) __riscv_vlse8_v_u8mf8(__VA_ARGS__)
+#define vlse8_v_u8mf4(...) __riscv_vlse8_v_u8mf4(__VA_ARGS__)
+#define vlse8_v_u8mf2(...) __riscv_vlse8_v_u8mf2(__VA_ARGS__)
+#define vlse8_v_u8m1(...) __riscv_vlse8_v_u8m1(__VA_ARGS__)
+#define vlse8_v_u8m2(...) __riscv_vlse8_v_u8m2(__VA_ARGS__)
+#define vlse8_v_u8m4(...) __riscv_vlse8_v_u8m4(__VA_ARGS__)
+#define vlse8_v_u8m8(...) __riscv_vlse8_v_u8m8(__VA_ARGS__)
+#define vlse16_v_u16mf4(...) __riscv_vlse16_v_u16mf4(__VA_ARGS__)
+#define vlse16_v_u16mf2(...) __riscv_vlse16_v_u16mf2(__VA_ARGS__)
+#define vlse16_v_u16m1(...) __riscv_vlse16_v_u16m1(__VA_ARGS__)
+#define vlse16_v_u16m2(...) __riscv_vlse16_v_u16m2(__VA_ARGS__)
+#define vlse16_v_u16m4(...) __riscv_vlse16_v_u16m4(__VA_ARGS__)
+#define vlse16_v_u16m8(...) __riscv_vlse16_v_u16m8(__VA_ARGS__)
+#define vlse32_v_u32mf2(...) __riscv_vlse32_v_u32mf2(__VA_ARGS__)
+#define vlse32_v_u32m1(...) __riscv_vlse32_v_u32m1(__VA_ARGS__)
+#define vlse32_v_u32m2(...) __riscv_vlse32_v_u32m2(__VA_ARGS__)
+#define vlse32_v_u32m4(...) __riscv_vlse32_v_u32m4(__VA_ARGS__)
+#define vlse32_v_u32m8(...) __riscv_vlse32_v_u32m8(__VA_ARGS__)
+#define vlse64_v_u64m1(...) __riscv_vlse64_v_u64m1(__VA_ARGS__)
+#define vlse64_v_u64m2(...) __riscv_vlse64_v_u64m2(__VA_ARGS__)
+#define vlse64_v_u64m4(...) __riscv_vlse64_v_u64m4(__VA_ARGS__)
+#define vlse64_v_u64m8(...) __riscv_vlse64_v_u64m8(__VA_ARGS__)
+// masked strided loads: each v0.10 "_m" name maps to the "_tumu" (tail-undisturbed, mask-undisturbed) intrinsic
+#define vlse16_v_f16mf4_m(...) __riscv_vlse16_v_f16mf4_tumu(__VA_ARGS__)
+#define vlse16_v_f16mf2_m(...) __riscv_vlse16_v_f16mf2_tumu(__VA_ARGS__)
+#define vlse16_v_f16m1_m(...) __riscv_vlse16_v_f16m1_tumu(__VA_ARGS__)
+#define vlse16_v_f16m2_m(...) __riscv_vlse16_v_f16m2_tumu(__VA_ARGS__)
+#define vlse16_v_f16m4_m(...) __riscv_vlse16_v_f16m4_tumu(__VA_ARGS__)
+#define vlse16_v_f16m8_m(...) __riscv_vlse16_v_f16m8_tumu(__VA_ARGS__)
+#define vlse32_v_f32mf2_m(...) __riscv_vlse32_v_f32mf2_tumu(__VA_ARGS__)
+#define vlse32_v_f32m1_m(...) __riscv_vlse32_v_f32m1_tumu(__VA_ARGS__)
+#define vlse32_v_f32m2_m(...) __riscv_vlse32_v_f32m2_tumu(__VA_ARGS__)
+#define vlse32_v_f32m4_m(...) __riscv_vlse32_v_f32m4_tumu(__VA_ARGS__)
+#define vlse32_v_f32m8_m(...) __riscv_vlse32_v_f32m8_tumu(__VA_ARGS__)
+#define vlse64_v_f64m1_m(...) __riscv_vlse64_v_f64m1_tumu(__VA_ARGS__)
+#define vlse64_v_f64m2_m(...) __riscv_vlse64_v_f64m2_tumu(__VA_ARGS__)
+#define vlse64_v_f64m4_m(...) __riscv_vlse64_v_f64m4_tumu(__VA_ARGS__)
+#define vlse64_v_f64m8_m(...) __riscv_vlse64_v_f64m8_tumu(__VA_ARGS__)
+#define vlse8_v_i8mf8_m(...) __riscv_vlse8_v_i8mf8_tumu(__VA_ARGS__)
+#define vlse8_v_i8mf4_m(...) __riscv_vlse8_v_i8mf4_tumu(__VA_ARGS__)
+#define vlse8_v_i8mf2_m(...) __riscv_vlse8_v_i8mf2_tumu(__VA_ARGS__)
+#define vlse8_v_i8m1_m(...) __riscv_vlse8_v_i8m1_tumu(__VA_ARGS__)
+#define vlse8_v_i8m2_m(...) __riscv_vlse8_v_i8m2_tumu(__VA_ARGS__)
+#define vlse8_v_i8m4_m(...) __riscv_vlse8_v_i8m4_tumu(__VA_ARGS__)
+#define vlse8_v_i8m8_m(...) __riscv_vlse8_v_i8m8_tumu(__VA_ARGS__)
+#define vlse16_v_i16mf4_m(...) __riscv_vlse16_v_i16mf4_tumu(__VA_ARGS__)
+#define vlse16_v_i16mf2_m(...) __riscv_vlse16_v_i16mf2_tumu(__VA_ARGS__)
+#define vlse16_v_i16m1_m(...) __riscv_vlse16_v_i16m1_tumu(__VA_ARGS__)
+#define vlse16_v_i16m2_m(...) __riscv_vlse16_v_i16m2_tumu(__VA_ARGS__)
+#define vlse16_v_i16m4_m(...) __riscv_vlse16_v_i16m4_tumu(__VA_ARGS__)
+#define vlse16_v_i16m8_m(...) __riscv_vlse16_v_i16m8_tumu(__VA_ARGS__)
+#define vlse32_v_i32mf2_m(...) __riscv_vlse32_v_i32mf2_tumu(__VA_ARGS__)
+#define vlse32_v_i32m1_m(...) __riscv_vlse32_v_i32m1_tumu(__VA_ARGS__)
+#define vlse32_v_i32m2_m(...) __riscv_vlse32_v_i32m2_tumu(__VA_ARGS__)
+#define vlse32_v_i32m4_m(...) __riscv_vlse32_v_i32m4_tumu(__VA_ARGS__)
+#define vlse32_v_i32m8_m(...) __riscv_vlse32_v_i32m8_tumu(__VA_ARGS__)
+#define vlse64_v_i64m1_m(...) __riscv_vlse64_v_i64m1_tumu(__VA_ARGS__)
+#define vlse64_v_i64m2_m(...) __riscv_vlse64_v_i64m2_tumu(__VA_ARGS__)
+#define vlse64_v_i64m4_m(...) __riscv_vlse64_v_i64m4_tumu(__VA_ARGS__)
+#define vlse64_v_i64m8_m(...) __riscv_vlse64_v_i64m8_tumu(__VA_ARGS__)
+#define vlse8_v_u8mf8_m(...) __riscv_vlse8_v_u8mf8_tumu(__VA_ARGS__)
+#define vlse8_v_u8mf4_m(...) __riscv_vlse8_v_u8mf4_tumu(__VA_ARGS__)
+#define vlse8_v_u8mf2_m(...) __riscv_vlse8_v_u8mf2_tumu(__VA_ARGS__)
+#define vlse8_v_u8m1_m(...) __riscv_vlse8_v_u8m1_tumu(__VA_ARGS__)
+#define vlse8_v_u8m2_m(...) __riscv_vlse8_v_u8m2_tumu(__VA_ARGS__)
+#define vlse8_v_u8m4_m(...) __riscv_vlse8_v_u8m4_tumu(__VA_ARGS__)
+#define vlse8_v_u8m8_m(...) __riscv_vlse8_v_u8m8_tumu(__VA_ARGS__)
+#define vlse16_v_u16mf4_m(...) __riscv_vlse16_v_u16mf4_tumu(__VA_ARGS__)
+#define vlse16_v_u16mf2_m(...) __riscv_vlse16_v_u16mf2_tumu(__VA_ARGS__)
+#define vlse16_v_u16m1_m(...) __riscv_vlse16_v_u16m1_tumu(__VA_ARGS__)
+#define vlse16_v_u16m2_m(...) __riscv_vlse16_v_u16m2_tumu(__VA_ARGS__)
+#define vlse16_v_u16m4_m(...) __riscv_vlse16_v_u16m4_tumu(__VA_ARGS__)
+#define vlse16_v_u16m8_m(...) __riscv_vlse16_v_u16m8_tumu(__VA_ARGS__)
+#define vlse32_v_u32mf2_m(...) __riscv_vlse32_v_u32mf2_tumu(__VA_ARGS__)
+#define vlse32_v_u32m1_m(...) __riscv_vlse32_v_u32m1_tumu(__VA_ARGS__)
+#define vlse32_v_u32m2_m(...) __riscv_vlse32_v_u32m2_tumu(__VA_ARGS__)
+#define vlse32_v_u32m4_m(...) __riscv_vlse32_v_u32m4_tumu(__VA_ARGS__)
+#define vlse32_v_u32m8_m(...) __riscv_vlse32_v_u32m8_tumu(__VA_ARGS__)
+#define vlse64_v_u64m1_m(...) __riscv_vlse64_v_u64m1_tumu(__VA_ARGS__)
+#define vlse64_v_u64m2_m(...) __riscv_vlse64_v_u64m2_tumu(__VA_ARGS__)
+#define vlse64_v_u64m4_m(...) __riscv_vlse64_v_u64m4_tumu(__VA_ARGS__)
+#define vlse64_v_u64m8_m(...) __riscv_vlse64_v_u64m8_tumu(__VA_ARGS__)
+#define vsse16_v_f16mf4(...) __riscv_vsse16_v_f16mf4(__VA_ARGS__) // vsse* (unmasked) aliases: each legacy unprefixed name forwards its arguments unchanged to the __riscv_-prefixed intrinsic of the same name
+#define vsse16_v_f16mf2(...) __riscv_vsse16_v_f16mf2(__VA_ARGS__)
+#define vsse16_v_f16m1(...) __riscv_vsse16_v_f16m1(__VA_ARGS__)
+#define vsse16_v_f16m2(...) __riscv_vsse16_v_f16m2(__VA_ARGS__)
+#define vsse16_v_f16m4(...) __riscv_vsse16_v_f16m4(__VA_ARGS__)
+#define vsse16_v_f16m8(...) __riscv_vsse16_v_f16m8(__VA_ARGS__)
+#define vsse32_v_f32mf2(...) __riscv_vsse32_v_f32mf2(__VA_ARGS__)
+#define vsse32_v_f32m1(...) __riscv_vsse32_v_f32m1(__VA_ARGS__)
+#define vsse32_v_f32m2(...) __riscv_vsse32_v_f32m2(__VA_ARGS__)
+#define vsse32_v_f32m4(...) __riscv_vsse32_v_f32m4(__VA_ARGS__)
+#define vsse32_v_f32m8(...) __riscv_vsse32_v_f32m8(__VA_ARGS__)
+#define vsse64_v_f64m1(...) __riscv_vsse64_v_f64m1(__VA_ARGS__)
+#define vsse64_v_f64m2(...) __riscv_vsse64_v_f64m2(__VA_ARGS__)
+#define vsse64_v_f64m4(...) __riscv_vsse64_v_f64m4(__VA_ARGS__)
+#define vsse64_v_f64m8(...) __riscv_vsse64_v_f64m8(__VA_ARGS__)
+#define vsse8_v_i8mf8(...) __riscv_vsse8_v_i8mf8(__VA_ARGS__) // vsse* aliases, i8..i64 variants
+#define vsse8_v_i8mf4(...) __riscv_vsse8_v_i8mf4(__VA_ARGS__)
+#define vsse8_v_i8mf2(...) __riscv_vsse8_v_i8mf2(__VA_ARGS__)
+#define vsse8_v_i8m1(...) __riscv_vsse8_v_i8m1(__VA_ARGS__)
+#define vsse8_v_i8m2(...) __riscv_vsse8_v_i8m2(__VA_ARGS__)
+#define vsse8_v_i8m4(...) __riscv_vsse8_v_i8m4(__VA_ARGS__)
+#define vsse8_v_i8m8(...) __riscv_vsse8_v_i8m8(__VA_ARGS__)
+#define vsse16_v_i16mf4(...) __riscv_vsse16_v_i16mf4(__VA_ARGS__)
+#define vsse16_v_i16mf2(...) __riscv_vsse16_v_i16mf2(__VA_ARGS__)
+#define vsse16_v_i16m1(...) __riscv_vsse16_v_i16m1(__VA_ARGS__)
+#define vsse16_v_i16m2(...) __riscv_vsse16_v_i16m2(__VA_ARGS__)
+#define vsse16_v_i16m4(...) __riscv_vsse16_v_i16m4(__VA_ARGS__)
+#define vsse16_v_i16m8(...) __riscv_vsse16_v_i16m8(__VA_ARGS__)
+#define vsse32_v_i32mf2(...) __riscv_vsse32_v_i32mf2(__VA_ARGS__)
+#define vsse32_v_i32m1(...) __riscv_vsse32_v_i32m1(__VA_ARGS__)
+#define vsse32_v_i32m2(...) __riscv_vsse32_v_i32m2(__VA_ARGS__)
+#define vsse32_v_i32m4(...) __riscv_vsse32_v_i32m4(__VA_ARGS__)
+#define vsse32_v_i32m8(...) __riscv_vsse32_v_i32m8(__VA_ARGS__)
+#define vsse64_v_i64m1(...) __riscv_vsse64_v_i64m1(__VA_ARGS__)
+#define vsse64_v_i64m2(...) __riscv_vsse64_v_i64m2(__VA_ARGS__)
+#define vsse64_v_i64m4(...) __riscv_vsse64_v_i64m4(__VA_ARGS__)
+#define vsse64_v_i64m8(...) __riscv_vsse64_v_i64m8(__VA_ARGS__)
+#define vsse8_v_u8mf8(...) __riscv_vsse8_v_u8mf8(__VA_ARGS__) // vsse* aliases, u8..u64 variants
+#define vsse8_v_u8mf4(...) __riscv_vsse8_v_u8mf4(__VA_ARGS__)
+#define vsse8_v_u8mf2(...) __riscv_vsse8_v_u8mf2(__VA_ARGS__)
+#define vsse8_v_u8m1(...) __riscv_vsse8_v_u8m1(__VA_ARGS__)
+#define vsse8_v_u8m2(...) __riscv_vsse8_v_u8m2(__VA_ARGS__)
+#define vsse8_v_u8m4(...) __riscv_vsse8_v_u8m4(__VA_ARGS__)
+#define vsse8_v_u8m8(...) __riscv_vsse8_v_u8m8(__VA_ARGS__)
+#define vsse16_v_u16mf4(...) __riscv_vsse16_v_u16mf4(__VA_ARGS__)
+#define vsse16_v_u16mf2(...) __riscv_vsse16_v_u16mf2(__VA_ARGS__)
+#define vsse16_v_u16m1(...) __riscv_vsse16_v_u16m1(__VA_ARGS__)
+#define vsse16_v_u16m2(...) __riscv_vsse16_v_u16m2(__VA_ARGS__)
+#define vsse16_v_u16m4(...) __riscv_vsse16_v_u16m4(__VA_ARGS__)
+#define vsse16_v_u16m8(...) __riscv_vsse16_v_u16m8(__VA_ARGS__)
+#define vsse32_v_u32mf2(...) __riscv_vsse32_v_u32mf2(__VA_ARGS__)
+#define vsse32_v_u32m1(...) __riscv_vsse32_v_u32m1(__VA_ARGS__)
+#define vsse32_v_u32m2(...) __riscv_vsse32_v_u32m2(__VA_ARGS__)
+#define vsse32_v_u32m4(...) __riscv_vsse32_v_u32m4(__VA_ARGS__)
+#define vsse32_v_u32m8(...) __riscv_vsse32_v_u32m8(__VA_ARGS__)
+#define vsse64_v_u64m1(...) __riscv_vsse64_v_u64m1(__VA_ARGS__)
+#define vsse64_v_u64m2(...) __riscv_vsse64_v_u64m2(__VA_ARGS__)
+#define vsse64_v_u64m4(...) __riscv_vsse64_v_u64m4(__VA_ARGS__)
+#define vsse64_v_u64m8(...) __riscv_vsse64_v_u64m8(__VA_ARGS__)
+// masked functions: vsse*_m aliases keep the _m suffix on the __riscv_ target (the vlse*_m aliases earlier in this list map to _tumu instead)
+#define vsse16_v_f16mf4_m(...) __riscv_vsse16_v_f16mf4_m(__VA_ARGS__)
+#define vsse16_v_f16mf2_m(...) __riscv_vsse16_v_f16mf2_m(__VA_ARGS__)
+#define vsse16_v_f16m1_m(...) __riscv_vsse16_v_f16m1_m(__VA_ARGS__)
+#define vsse16_v_f16m2_m(...) __riscv_vsse16_v_f16m2_m(__VA_ARGS__)
+#define vsse16_v_f16m4_m(...) __riscv_vsse16_v_f16m4_m(__VA_ARGS__)
+#define vsse16_v_f16m8_m(...) __riscv_vsse16_v_f16m8_m(__VA_ARGS__)
+#define vsse32_v_f32mf2_m(...) __riscv_vsse32_v_f32mf2_m(__VA_ARGS__)
+#define vsse32_v_f32m1_m(...) __riscv_vsse32_v_f32m1_m(__VA_ARGS__)
+#define vsse32_v_f32m2_m(...) __riscv_vsse32_v_f32m2_m(__VA_ARGS__)
+#define vsse32_v_f32m4_m(...) __riscv_vsse32_v_f32m4_m(__VA_ARGS__)
+#define vsse32_v_f32m8_m(...) __riscv_vsse32_v_f32m8_m(__VA_ARGS__)
+#define vsse64_v_f64m1_m(...) __riscv_vsse64_v_f64m1_m(__VA_ARGS__)
+#define vsse64_v_f64m2_m(...) __riscv_vsse64_v_f64m2_m(__VA_ARGS__)
+#define vsse64_v_f64m4_m(...) __riscv_vsse64_v_f64m4_m(__VA_ARGS__)
+#define vsse64_v_f64m8_m(...) __riscv_vsse64_v_f64m8_m(__VA_ARGS__)
+#define vsse8_v_i8mf8_m(...) __riscv_vsse8_v_i8mf8_m(__VA_ARGS__) // masked vsse* aliases, i8..i64 variants
+#define vsse8_v_i8mf4_m(...) __riscv_vsse8_v_i8mf4_m(__VA_ARGS__)
+#define vsse8_v_i8mf2_m(...) __riscv_vsse8_v_i8mf2_m(__VA_ARGS__)
+#define vsse8_v_i8m1_m(...) __riscv_vsse8_v_i8m1_m(__VA_ARGS__)
+#define vsse8_v_i8m2_m(...) __riscv_vsse8_v_i8m2_m(__VA_ARGS__)
+#define vsse8_v_i8m4_m(...) __riscv_vsse8_v_i8m4_m(__VA_ARGS__)
+#define vsse8_v_i8m8_m(...) __riscv_vsse8_v_i8m8_m(__VA_ARGS__)
+#define vsse16_v_i16mf4_m(...) __riscv_vsse16_v_i16mf4_m(__VA_ARGS__)
+#define vsse16_v_i16mf2_m(...) __riscv_vsse16_v_i16mf2_m(__VA_ARGS__)
+#define vsse16_v_i16m1_m(...) __riscv_vsse16_v_i16m1_m(__VA_ARGS__)
+#define vsse16_v_i16m2_m(...) __riscv_vsse16_v_i16m2_m(__VA_ARGS__)
+#define vsse16_v_i16m4_m(...) __riscv_vsse16_v_i16m4_m(__VA_ARGS__)
+#define vsse16_v_i16m8_m(...) __riscv_vsse16_v_i16m8_m(__VA_ARGS__)
+#define vsse32_v_i32mf2_m(...) __riscv_vsse32_v_i32mf2_m(__VA_ARGS__)
+#define vsse32_v_i32m1_m(...) __riscv_vsse32_v_i32m1_m(__VA_ARGS__)
+#define vsse32_v_i32m2_m(...) __riscv_vsse32_v_i32m2_m(__VA_ARGS__)
+#define vsse32_v_i32m4_m(...) __riscv_vsse32_v_i32m4_m(__VA_ARGS__)
+#define vsse32_v_i32m8_m(...) __riscv_vsse32_v_i32m8_m(__VA_ARGS__)
+#define vsse64_v_i64m1_m(...) __riscv_vsse64_v_i64m1_m(__VA_ARGS__)
+#define vsse64_v_i64m2_m(...) __riscv_vsse64_v_i64m2_m(__VA_ARGS__)
+#define vsse64_v_i64m4_m(...) __riscv_vsse64_v_i64m4_m(__VA_ARGS__)
+#define vsse64_v_i64m8_m(...) __riscv_vsse64_v_i64m8_m(__VA_ARGS__)
+#define vsse8_v_u8mf8_m(...) __riscv_vsse8_v_u8mf8_m(__VA_ARGS__) // masked vsse* aliases, u8..u64 variants
+#define vsse8_v_u8mf4_m(...) __riscv_vsse8_v_u8mf4_m(__VA_ARGS__)
+#define vsse8_v_u8mf2_m(...) __riscv_vsse8_v_u8mf2_m(__VA_ARGS__)
+#define vsse8_v_u8m1_m(...) __riscv_vsse8_v_u8m1_m(__VA_ARGS__)
+#define vsse8_v_u8m2_m(...) __riscv_vsse8_v_u8m2_m(__VA_ARGS__)
+#define vsse8_v_u8m4_m(...) __riscv_vsse8_v_u8m4_m(__VA_ARGS__)
+#define vsse8_v_u8m8_m(...) __riscv_vsse8_v_u8m8_m(__VA_ARGS__)
+#define vsse16_v_u16mf4_m(...) __riscv_vsse16_v_u16mf4_m(__VA_ARGS__)
+#define vsse16_v_u16mf2_m(...) __riscv_vsse16_v_u16mf2_m(__VA_ARGS__)
+#define vsse16_v_u16m1_m(...) __riscv_vsse16_v_u16m1_m(__VA_ARGS__)
+#define vsse16_v_u16m2_m(...) __riscv_vsse16_v_u16m2_m(__VA_ARGS__)
+#define vsse16_v_u16m4_m(...) __riscv_vsse16_v_u16m4_m(__VA_ARGS__)
+#define vsse16_v_u16m8_m(...) __riscv_vsse16_v_u16m8_m(__VA_ARGS__)
+#define vsse32_v_u32mf2_m(...) __riscv_vsse32_v_u32mf2_m(__VA_ARGS__)
+#define vsse32_v_u32m1_m(...) __riscv_vsse32_v_u32m1_m(__VA_ARGS__)
+#define vsse32_v_u32m2_m(...) __riscv_vsse32_v_u32m2_m(__VA_ARGS__)
+#define vsse32_v_u32m4_m(...) __riscv_vsse32_v_u32m4_m(__VA_ARGS__)
+#define vsse32_v_u32m8_m(...) __riscv_vsse32_v_u32m8_m(__VA_ARGS__)
+#define vsse64_v_u64m1_m(...) __riscv_vsse64_v_u64m1_m(__VA_ARGS__)
+#define vsse64_v_u64m2_m(...) __riscv_vsse64_v_u64m2_m(__VA_ARGS__)
+#define vsse64_v_u64m4_m(...) __riscv_vsse64_v_u64m4_m(__VA_ARGS__)
+#define vsse64_v_u64m8_m(...) __riscv_vsse64_v_u64m8_m(__VA_ARGS__)
+#define vloxei8_v_f16mf4(...) __riscv_vloxei8_v_f16mf4(__VA_ARGS__) // vloxei<N>_v_f* aliases: legacy unprefixed names forward their arguments unchanged to the __riscv_-prefixed intrinsics
+#define vloxei8_v_f16mf2(...) __riscv_vloxei8_v_f16mf2(__VA_ARGS__)
+#define vloxei8_v_f16m1(...) __riscv_vloxei8_v_f16m1(__VA_ARGS__)
+#define vloxei8_v_f16m2(...) __riscv_vloxei8_v_f16m2(__VA_ARGS__)
+#define vloxei8_v_f16m4(...) __riscv_vloxei8_v_f16m4(__VA_ARGS__)
+#define vloxei8_v_f16m8(...) __riscv_vloxei8_v_f16m8(__VA_ARGS__)
+#define vloxei16_v_f16mf4(...) __riscv_vloxei16_v_f16mf4(__VA_ARGS__)
+#define vloxei16_v_f16mf2(...) __riscv_vloxei16_v_f16mf2(__VA_ARGS__)
+#define vloxei16_v_f16m1(...) __riscv_vloxei16_v_f16m1(__VA_ARGS__)
+#define vloxei16_v_f16m2(...) __riscv_vloxei16_v_f16m2(__VA_ARGS__)
+#define vloxei16_v_f16m4(...) __riscv_vloxei16_v_f16m4(__VA_ARGS__)
+#define vloxei16_v_f16m8(...) __riscv_vloxei16_v_f16m8(__VA_ARGS__)
+#define vloxei32_v_f16mf4(...) __riscv_vloxei32_v_f16mf4(__VA_ARGS__) // vloxei32 f16 list has no m8 entry
+#define vloxei32_v_f16mf2(...) __riscv_vloxei32_v_f16mf2(__VA_ARGS__)
+#define vloxei32_v_f16m1(...) __riscv_vloxei32_v_f16m1(__VA_ARGS__)
+#define vloxei32_v_f16m2(...) __riscv_vloxei32_v_f16m2(__VA_ARGS__)
+#define vloxei32_v_f16m4(...) __riscv_vloxei32_v_f16m4(__VA_ARGS__)
+#define vloxei64_v_f16mf4(...) __riscv_vloxei64_v_f16mf4(__VA_ARGS__) // vloxei64 f16 list stops at m2 (presumably an index-register LMUL limit; confirm against the RVV intrinsics spec)
+#define vloxei64_v_f16mf2(...) __riscv_vloxei64_v_f16mf2(__VA_ARGS__)
+#define vloxei64_v_f16m1(...) __riscv_vloxei64_v_f16m1(__VA_ARGS__)
+#define vloxei64_v_f16m2(...) __riscv_vloxei64_v_f16m2(__VA_ARGS__)
+#define vloxei8_v_f32mf2(...) __riscv_vloxei8_v_f32mf2(__VA_ARGS__)
+#define vloxei8_v_f32m1(...) __riscv_vloxei8_v_f32m1(__VA_ARGS__)
+#define vloxei8_v_f32m2(...) __riscv_vloxei8_v_f32m2(__VA_ARGS__)
+#define vloxei8_v_f32m4(...) __riscv_vloxei8_v_f32m4(__VA_ARGS__)
+#define vloxei8_v_f32m8(...) __riscv_vloxei8_v_f32m8(__VA_ARGS__)
+#define vloxei16_v_f32mf2(...) __riscv_vloxei16_v_f32mf2(__VA_ARGS__)
+#define vloxei16_v_f32m1(...) __riscv_vloxei16_v_f32m1(__VA_ARGS__)
+#define vloxei16_v_f32m2(...) __riscv_vloxei16_v_f32m2(__VA_ARGS__)
+#define vloxei16_v_f32m4(...) __riscv_vloxei16_v_f32m4(__VA_ARGS__)
+#define vloxei16_v_f32m8(...) __riscv_vloxei16_v_f32m8(__VA_ARGS__)
+#define vloxei32_v_f32mf2(...) __riscv_vloxei32_v_f32mf2(__VA_ARGS__)
+#define vloxei32_v_f32m1(...) __riscv_vloxei32_v_f32m1(__VA_ARGS__)
+#define vloxei32_v_f32m2(...) __riscv_vloxei32_v_f32m2(__VA_ARGS__)
+#define vloxei32_v_f32m4(...) __riscv_vloxei32_v_f32m4(__VA_ARGS__)
+#define vloxei32_v_f32m8(...) __riscv_vloxei32_v_f32m8(__VA_ARGS__)
+#define vloxei64_v_f32mf2(...) __riscv_vloxei64_v_f32mf2(__VA_ARGS__) // vloxei64 f32 list has no m8 entry
+#define vloxei64_v_f32m1(...) __riscv_vloxei64_v_f32m1(__VA_ARGS__)
+#define vloxei64_v_f32m2(...) __riscv_vloxei64_v_f32m2(__VA_ARGS__)
+#define vloxei64_v_f32m4(...) __riscv_vloxei64_v_f32m4(__VA_ARGS__)
+#define vloxei8_v_f64m1(...) __riscv_vloxei8_v_f64m1(__VA_ARGS__)
+#define vloxei8_v_f64m2(...) __riscv_vloxei8_v_f64m2(__VA_ARGS__)
+#define vloxei8_v_f64m4(...) __riscv_vloxei8_v_f64m4(__VA_ARGS__)
+#define vloxei8_v_f64m8(...) __riscv_vloxei8_v_f64m8(__VA_ARGS__)
+#define vloxei16_v_f64m1(...) __riscv_vloxei16_v_f64m1(__VA_ARGS__)
+#define vloxei16_v_f64m2(...) __riscv_vloxei16_v_f64m2(__VA_ARGS__)
+#define vloxei16_v_f64m4(...) __riscv_vloxei16_v_f64m4(__VA_ARGS__)
+#define vloxei16_v_f64m8(...) __riscv_vloxei16_v_f64m8(__VA_ARGS__)
+#define vloxei32_v_f64m1(...) __riscv_vloxei32_v_f64m1(__VA_ARGS__)
+#define vloxei32_v_f64m2(...) __riscv_vloxei32_v_f64m2(__VA_ARGS__)
+#define vloxei32_v_f64m4(...) __riscv_vloxei32_v_f64m4(__VA_ARGS__)
+#define vloxei32_v_f64m8(...) __riscv_vloxei32_v_f64m8(__VA_ARGS__)
+#define vloxei64_v_f64m1(...) __riscv_vloxei64_v_f64m1(__VA_ARGS__)
+#define vloxei64_v_f64m2(...) __riscv_vloxei64_v_f64m2(__VA_ARGS__)
+#define vloxei64_v_f64m4(...) __riscv_vloxei64_v_f64m4(__VA_ARGS__)
+#define vloxei64_v_f64m8(...) __riscv_vloxei64_v_f64m8(__VA_ARGS__)
+#define vluxei8_v_f16mf4(...) __riscv_vluxei8_v_f16mf4(__VA_ARGS__) // vluxei<N>_v_f* aliases: legacy unprefixed names forward their arguments unchanged to the __riscv_-prefixed intrinsics
+#define vluxei8_v_f16mf2(...) __riscv_vluxei8_v_f16mf2(__VA_ARGS__)
+#define vluxei8_v_f16m1(...) __riscv_vluxei8_v_f16m1(__VA_ARGS__)
+#define vluxei8_v_f16m2(...) __riscv_vluxei8_v_f16m2(__VA_ARGS__)
+#define vluxei8_v_f16m4(...) __riscv_vluxei8_v_f16m4(__VA_ARGS__)
+#define vluxei8_v_f16m8(...) __riscv_vluxei8_v_f16m8(__VA_ARGS__)
+#define vluxei16_v_f16mf4(...) __riscv_vluxei16_v_f16mf4(__VA_ARGS__)
+#define vluxei16_v_f16mf2(...) __riscv_vluxei16_v_f16mf2(__VA_ARGS__)
+#define vluxei16_v_f16m1(...) __riscv_vluxei16_v_f16m1(__VA_ARGS__)
+#define vluxei16_v_f16m2(...) __riscv_vluxei16_v_f16m2(__VA_ARGS__)
+#define vluxei16_v_f16m4(...) __riscv_vluxei16_v_f16m4(__VA_ARGS__)
+#define vluxei16_v_f16m8(...) __riscv_vluxei16_v_f16m8(__VA_ARGS__)
+#define vluxei32_v_f16mf4(...) __riscv_vluxei32_v_f16mf4(__VA_ARGS__) // vluxei32 f16 list has no m8 entry
+#define vluxei32_v_f16mf2(...) __riscv_vluxei32_v_f16mf2(__VA_ARGS__)
+#define vluxei32_v_f16m1(...) __riscv_vluxei32_v_f16m1(__VA_ARGS__)
+#define vluxei32_v_f16m2(...) __riscv_vluxei32_v_f16m2(__VA_ARGS__)
+#define vluxei32_v_f16m4(...) __riscv_vluxei32_v_f16m4(__VA_ARGS__)
+#define vluxei64_v_f16mf4(...) __riscv_vluxei64_v_f16mf4(__VA_ARGS__) // vluxei64 f16 list stops at m2 (presumably an index-register LMUL limit; confirm against the RVV intrinsics spec)
+#define vluxei64_v_f16mf2(...) __riscv_vluxei64_v_f16mf2(__VA_ARGS__)
+#define vluxei64_v_f16m1(...) __riscv_vluxei64_v_f16m1(__VA_ARGS__)
+#define vluxei64_v_f16m2(...) __riscv_vluxei64_v_f16m2(__VA_ARGS__)
+#define vluxei8_v_f32mf2(...) __riscv_vluxei8_v_f32mf2(__VA_ARGS__)
+#define vluxei8_v_f32m1(...) __riscv_vluxei8_v_f32m1(__VA_ARGS__)
+#define vluxei8_v_f32m2(...) __riscv_vluxei8_v_f32m2(__VA_ARGS__)
+#define vluxei8_v_f32m4(...) __riscv_vluxei8_v_f32m4(__VA_ARGS__)
+#define vluxei8_v_f32m8(...) __riscv_vluxei8_v_f32m8(__VA_ARGS__)
+#define vluxei16_v_f32mf2(...) __riscv_vluxei16_v_f32mf2(__VA_ARGS__)
+#define vluxei16_v_f32m1(...) __riscv_vluxei16_v_f32m1(__VA_ARGS__)
+#define vluxei16_v_f32m2(...) __riscv_vluxei16_v_f32m2(__VA_ARGS__)
+#define vluxei16_v_f32m4(...) __riscv_vluxei16_v_f32m4(__VA_ARGS__)
+#define vluxei16_v_f32m8(...) __riscv_vluxei16_v_f32m8(__VA_ARGS__)
+#define vluxei32_v_f32mf2(...) __riscv_vluxei32_v_f32mf2(__VA_ARGS__)
+#define vluxei32_v_f32m1(...) __riscv_vluxei32_v_f32m1(__VA_ARGS__)
+#define vluxei32_v_f32m2(...) __riscv_vluxei32_v_f32m2(__VA_ARGS__)
+#define vluxei32_v_f32m4(...) __riscv_vluxei32_v_f32m4(__VA_ARGS__)
+#define vluxei32_v_f32m8(...) __riscv_vluxei32_v_f32m8(__VA_ARGS__)
+#define vluxei64_v_f32mf2(...) __riscv_vluxei64_v_f32mf2(__VA_ARGS__) // vluxei64 f32 list has no m8 entry
+#define vluxei64_v_f32m1(...) __riscv_vluxei64_v_f32m1(__VA_ARGS__)
+#define vluxei64_v_f32m2(...) __riscv_vluxei64_v_f32m2(__VA_ARGS__)
+#define vluxei64_v_f32m4(...) __riscv_vluxei64_v_f32m4(__VA_ARGS__)
+#define vluxei8_v_f64m1(...) __riscv_vluxei8_v_f64m1(__VA_ARGS__)
+#define vluxei8_v_f64m2(...) __riscv_vluxei8_v_f64m2(__VA_ARGS__)
+#define vluxei8_v_f64m4(...) __riscv_vluxei8_v_f64m4(__VA_ARGS__)
+#define vluxei8_v_f64m8(...) __riscv_vluxei8_v_f64m8(__VA_ARGS__)
+#define vluxei16_v_f64m1(...) __riscv_vluxei16_v_f64m1(__VA_ARGS__)
+#define vluxei16_v_f64m2(...) __riscv_vluxei16_v_f64m2(__VA_ARGS__)
+#define vluxei16_v_f64m4(...) __riscv_vluxei16_v_f64m4(__VA_ARGS__)
+#define vluxei16_v_f64m8(...) __riscv_vluxei16_v_f64m8(__VA_ARGS__)
+#define vluxei32_v_f64m1(...) __riscv_vluxei32_v_f64m1(__VA_ARGS__)
+#define vluxei32_v_f64m2(...) __riscv_vluxei32_v_f64m2(__VA_ARGS__)
+#define vluxei32_v_f64m4(...) __riscv_vluxei32_v_f64m4(__VA_ARGS__)
+#define vluxei32_v_f64m8(...) __riscv_vluxei32_v_f64m8(__VA_ARGS__)
+#define vluxei64_v_f64m1(...) __riscv_vluxei64_v_f64m1(__VA_ARGS__)
+#define vluxei64_v_f64m2(...) __riscv_vluxei64_v_f64m2(__VA_ARGS__)
+#define vluxei64_v_f64m4(...) __riscv_vluxei64_v_f64m4(__VA_ARGS__)
+#define vluxei64_v_f64m8(...) __riscv_vluxei64_v_f64m8(__VA_ARGS__)
+#define vloxei8_v_i8mf8(...) __riscv_vloxei8_v_i8mf8(__VA_ARGS__) // vloxei<N>_v_i* aliases: legacy unprefixed names forward their arguments unchanged to the __riscv_-prefixed intrinsics
+#define vloxei8_v_i8mf4(...) __riscv_vloxei8_v_i8mf4(__VA_ARGS__)
+#define vloxei8_v_i8mf2(...) __riscv_vloxei8_v_i8mf2(__VA_ARGS__)
+#define vloxei8_v_i8m1(...) __riscv_vloxei8_v_i8m1(__VA_ARGS__)
+#define vloxei8_v_i8m2(...) __riscv_vloxei8_v_i8m2(__VA_ARGS__)
+#define vloxei8_v_i8m4(...) __riscv_vloxei8_v_i8m4(__VA_ARGS__)
+#define vloxei8_v_i8m8(...) __riscv_vloxei8_v_i8m8(__VA_ARGS__)
+#define vloxei16_v_i8mf8(...) __riscv_vloxei16_v_i8mf8(__VA_ARGS__) // vloxei16 i8 list has no m8 entry
+#define vloxei16_v_i8mf4(...) __riscv_vloxei16_v_i8mf4(__VA_ARGS__)
+#define vloxei16_v_i8mf2(...) __riscv_vloxei16_v_i8mf2(__VA_ARGS__)
+#define vloxei16_v_i8m1(...) __riscv_vloxei16_v_i8m1(__VA_ARGS__)
+#define vloxei16_v_i8m2(...) __riscv_vloxei16_v_i8m2(__VA_ARGS__)
+#define vloxei16_v_i8m4(...) __riscv_vloxei16_v_i8m4(__VA_ARGS__)
+#define vloxei32_v_i8mf8(...) __riscv_vloxei32_v_i8mf8(__VA_ARGS__) // vloxei32 i8 list stops at m2
+#define vloxei32_v_i8mf4(...) __riscv_vloxei32_v_i8mf4(__VA_ARGS__)
+#define vloxei32_v_i8mf2(...) __riscv_vloxei32_v_i8mf2(__VA_ARGS__)
+#define vloxei32_v_i8m1(...) __riscv_vloxei32_v_i8m1(__VA_ARGS__)
+#define vloxei32_v_i8m2(...) __riscv_vloxei32_v_i8m2(__VA_ARGS__)
+#define vloxei64_v_i8mf8(...) __riscv_vloxei64_v_i8mf8(__VA_ARGS__) // vloxei64 i8 list stops at m1 (presumably an index-register LMUL limit; confirm against the RVV intrinsics spec)
+#define vloxei64_v_i8mf4(...) __riscv_vloxei64_v_i8mf4(__VA_ARGS__)
+#define vloxei64_v_i8mf2(...) __riscv_vloxei64_v_i8mf2(__VA_ARGS__)
+#define vloxei64_v_i8m1(...) __riscv_vloxei64_v_i8m1(__VA_ARGS__)
+#define vloxei8_v_i16mf4(...) __riscv_vloxei8_v_i16mf4(__VA_ARGS__)
+#define vloxei8_v_i16mf2(...) __riscv_vloxei8_v_i16mf2(__VA_ARGS__)
+#define vloxei8_v_i16m1(...) __riscv_vloxei8_v_i16m1(__VA_ARGS__)
+#define vloxei8_v_i16m2(...) __riscv_vloxei8_v_i16m2(__VA_ARGS__)
+#define vloxei8_v_i16m4(...) __riscv_vloxei8_v_i16m4(__VA_ARGS__)
+#define vloxei8_v_i16m8(...) __riscv_vloxei8_v_i16m8(__VA_ARGS__)
+#define vloxei16_v_i16mf4(...) __riscv_vloxei16_v_i16mf4(__VA_ARGS__)
+#define vloxei16_v_i16mf2(...) __riscv_vloxei16_v_i16mf2(__VA_ARGS__)
+#define vloxei16_v_i16m1(...) __riscv_vloxei16_v_i16m1(__VA_ARGS__)
+#define vloxei16_v_i16m2(...) __riscv_vloxei16_v_i16m2(__VA_ARGS__)
+#define vloxei16_v_i16m4(...) __riscv_vloxei16_v_i16m4(__VA_ARGS__)
+#define vloxei16_v_i16m8(...) __riscv_vloxei16_v_i16m8(__VA_ARGS__)
+#define vloxei32_v_i16mf4(...) __riscv_vloxei32_v_i16mf4(__VA_ARGS__)
+#define vloxei32_v_i16mf2(...) __riscv_vloxei32_v_i16mf2(__VA_ARGS__)
+#define vloxei32_v_i16m1(...) __riscv_vloxei32_v_i16m1(__VA_ARGS__)
+#define vloxei32_v_i16m2(...) __riscv_vloxei32_v_i16m2(__VA_ARGS__)
+#define vloxei32_v_i16m4(...) __riscv_vloxei32_v_i16m4(__VA_ARGS__)
+#define vloxei64_v_i16mf4(...) __riscv_vloxei64_v_i16mf4(__VA_ARGS__)
+#define vloxei64_v_i16mf2(...) __riscv_vloxei64_v_i16mf2(__VA_ARGS__)
+#define vloxei64_v_i16m1(...) __riscv_vloxei64_v_i16m1(__VA_ARGS__)
+#define vloxei64_v_i16m2(...) __riscv_vloxei64_v_i16m2(__VA_ARGS__)
+#define vloxei8_v_i32mf2(...) __riscv_vloxei8_v_i32mf2(__VA_ARGS__)
+#define vloxei8_v_i32m1(...) __riscv_vloxei8_v_i32m1(__VA_ARGS__)
+#define vloxei8_v_i32m2(...) __riscv_vloxei8_v_i32m2(__VA_ARGS__)
+#define vloxei8_v_i32m4(...) __riscv_vloxei8_v_i32m4(__VA_ARGS__)
+#define vloxei8_v_i32m8(...) __riscv_vloxei8_v_i32m8(__VA_ARGS__)
+#define vloxei16_v_i32mf2(...) __riscv_vloxei16_v_i32mf2(__VA_ARGS__)
+#define vloxei16_v_i32m1(...) __riscv_vloxei16_v_i32m1(__VA_ARGS__)
+#define vloxei16_v_i32m2(...) __riscv_vloxei16_v_i32m2(__VA_ARGS__)
+#define vloxei16_v_i32m4(...) __riscv_vloxei16_v_i32m4(__VA_ARGS__)
+#define vloxei16_v_i32m8(...) __riscv_vloxei16_v_i32m8(__VA_ARGS__)
+#define vloxei32_v_i32mf2(...) __riscv_vloxei32_v_i32mf2(__VA_ARGS__)
+#define vloxei32_v_i32m1(...) __riscv_vloxei32_v_i32m1(__VA_ARGS__)
+#define vloxei32_v_i32m2(...) __riscv_vloxei32_v_i32m2(__VA_ARGS__)
+#define vloxei32_v_i32m4(...) __riscv_vloxei32_v_i32m4(__VA_ARGS__)
+#define vloxei32_v_i32m8(...) __riscv_vloxei32_v_i32m8(__VA_ARGS__)
+#define vloxei64_v_i32mf2(...) __riscv_vloxei64_v_i32mf2(__VA_ARGS__)
+#define vloxei64_v_i32m1(...) __riscv_vloxei64_v_i32m1(__VA_ARGS__)
+#define vloxei64_v_i32m2(...) __riscv_vloxei64_v_i32m2(__VA_ARGS__)
+#define vloxei64_v_i32m4(...) __riscv_vloxei64_v_i32m4(__VA_ARGS__)
+#define vloxei8_v_i64m1(...) __riscv_vloxei8_v_i64m1(__VA_ARGS__)
+#define vloxei8_v_i64m2(...) __riscv_vloxei8_v_i64m2(__VA_ARGS__)
+#define vloxei8_v_i64m4(...) __riscv_vloxei8_v_i64m4(__VA_ARGS__)
+#define vloxei8_v_i64m8(...) __riscv_vloxei8_v_i64m8(__VA_ARGS__)
+#define vloxei16_v_i64m1(...) __riscv_vloxei16_v_i64m1(__VA_ARGS__)
+#define vloxei16_v_i64m2(...) __riscv_vloxei16_v_i64m2(__VA_ARGS__)
+#define vloxei16_v_i64m4(...) __riscv_vloxei16_v_i64m4(__VA_ARGS__)
+#define vloxei16_v_i64m8(...) __riscv_vloxei16_v_i64m8(__VA_ARGS__)
+#define vloxei32_v_i64m1(...) __riscv_vloxei32_v_i64m1(__VA_ARGS__)
+#define vloxei32_v_i64m2(...) __riscv_vloxei32_v_i64m2(__VA_ARGS__)
+#define vloxei32_v_i64m4(...) __riscv_vloxei32_v_i64m4(__VA_ARGS__)
+#define vloxei32_v_i64m8(...) __riscv_vloxei32_v_i64m8(__VA_ARGS__)
+#define vloxei64_v_i64m1(...) __riscv_vloxei64_v_i64m1(__VA_ARGS__)
+#define vloxei64_v_i64m2(...) __riscv_vloxei64_v_i64m2(__VA_ARGS__)
+#define vloxei64_v_i64m4(...) __riscv_vloxei64_v_i64m4(__VA_ARGS__)
+#define vloxei64_v_i64m8(...) __riscv_vloxei64_v_i64m8(__VA_ARGS__)
+#define vluxei8_v_i8mf8(...) __riscv_vluxei8_v_i8mf8(__VA_ARGS__) // vluxei<N>_v_i* aliases: legacy unprefixed names forward their arguments unchanged to the __riscv_-prefixed intrinsics
+#define vluxei8_v_i8mf4(...) __riscv_vluxei8_v_i8mf4(__VA_ARGS__)
+#define vluxei8_v_i8mf2(...) __riscv_vluxei8_v_i8mf2(__VA_ARGS__)
+#define vluxei8_v_i8m1(...) __riscv_vluxei8_v_i8m1(__VA_ARGS__)
+#define vluxei8_v_i8m2(...) __riscv_vluxei8_v_i8m2(__VA_ARGS__)
+#define vluxei8_v_i8m4(...) __riscv_vluxei8_v_i8m4(__VA_ARGS__)
+#define vluxei8_v_i8m8(...) __riscv_vluxei8_v_i8m8(__VA_ARGS__)
+#define vluxei16_v_i8mf8(...) __riscv_vluxei16_v_i8mf8(__VA_ARGS__) // vluxei16 i8 list has no m8 entry
+#define vluxei16_v_i8mf4(...) __riscv_vluxei16_v_i8mf4(__VA_ARGS__)
+#define vluxei16_v_i8mf2(...) __riscv_vluxei16_v_i8mf2(__VA_ARGS__)
+#define vluxei16_v_i8m1(...) __riscv_vluxei16_v_i8m1(__VA_ARGS__)
+#define vluxei16_v_i8m2(...) __riscv_vluxei16_v_i8m2(__VA_ARGS__)
+#define vluxei16_v_i8m4(...) __riscv_vluxei16_v_i8m4(__VA_ARGS__)
+#define vluxei32_v_i8mf8(...) __riscv_vluxei32_v_i8mf8(__VA_ARGS__) // vluxei32 i8 list stops at m2
+#define vluxei32_v_i8mf4(...) __riscv_vluxei32_v_i8mf4(__VA_ARGS__)
+#define vluxei32_v_i8mf2(...) __riscv_vluxei32_v_i8mf2(__VA_ARGS__)
+#define vluxei32_v_i8m1(...) __riscv_vluxei32_v_i8m1(__VA_ARGS__)
+#define vluxei32_v_i8m2(...) __riscv_vluxei32_v_i8m2(__VA_ARGS__)
+#define vluxei64_v_i8mf8(...) __riscv_vluxei64_v_i8mf8(__VA_ARGS__) // vluxei64 i8 list stops at m1 (presumably an index-register LMUL limit; confirm against the RVV intrinsics spec)
+#define vluxei64_v_i8mf4(...) __riscv_vluxei64_v_i8mf4(__VA_ARGS__)
+#define vluxei64_v_i8mf2(...) __riscv_vluxei64_v_i8mf2(__VA_ARGS__)
+#define vluxei64_v_i8m1(...) __riscv_vluxei64_v_i8m1(__VA_ARGS__)
+#define vluxei8_v_i16mf4(...) __riscv_vluxei8_v_i16mf4(__VA_ARGS__)
+#define vluxei8_v_i16mf2(...) __riscv_vluxei8_v_i16mf2(__VA_ARGS__)
+#define vluxei8_v_i16m1(...) __riscv_vluxei8_v_i16m1(__VA_ARGS__)
+#define vluxei8_v_i16m2(...) __riscv_vluxei8_v_i16m2(__VA_ARGS__)
+#define vluxei8_v_i16m4(...) __riscv_vluxei8_v_i16m4(__VA_ARGS__)
+#define vluxei8_v_i16m8(...) __riscv_vluxei8_v_i16m8(__VA_ARGS__)
+#define vluxei16_v_i16mf4(...) __riscv_vluxei16_v_i16mf4(__VA_ARGS__)
+#define vluxei16_v_i16mf2(...) __riscv_vluxei16_v_i16mf2(__VA_ARGS__)
+#define vluxei16_v_i16m1(...) __riscv_vluxei16_v_i16m1(__VA_ARGS__)
+#define vluxei16_v_i16m2(...) __riscv_vluxei16_v_i16m2(__VA_ARGS__)
+#define vluxei16_v_i16m4(...) __riscv_vluxei16_v_i16m4(__VA_ARGS__)
+#define vluxei16_v_i16m8(...) __riscv_vluxei16_v_i16m8(__VA_ARGS__)
+#define vluxei32_v_i16mf4(...) __riscv_vluxei32_v_i16mf4(__VA_ARGS__)
+#define vluxei32_v_i16mf2(...) __riscv_vluxei32_v_i16mf2(__VA_ARGS__)
+#define vluxei32_v_i16m1(...) __riscv_vluxei32_v_i16m1(__VA_ARGS__)
+#define vluxei32_v_i16m2(...) __riscv_vluxei32_v_i16m2(__VA_ARGS__)
+#define vluxei32_v_i16m4(...) __riscv_vluxei32_v_i16m4(__VA_ARGS__)
+#define vluxei64_v_i16mf4(...) __riscv_vluxei64_v_i16mf4(__VA_ARGS__)
+#define vluxei64_v_i16mf2(...) __riscv_vluxei64_v_i16mf2(__VA_ARGS__)
+#define vluxei64_v_i16m1(...) __riscv_vluxei64_v_i16m1(__VA_ARGS__)
+#define vluxei64_v_i16m2(...) __riscv_vluxei64_v_i16m2(__VA_ARGS__)
+#define vluxei8_v_i32mf2(...) __riscv_vluxei8_v_i32mf2(__VA_ARGS__)
+#define vluxei8_v_i32m1(...) __riscv_vluxei8_v_i32m1(__VA_ARGS__)
+#define vluxei8_v_i32m2(...) __riscv_vluxei8_v_i32m2(__VA_ARGS__)
+#define vluxei8_v_i32m4(...) __riscv_vluxei8_v_i32m4(__VA_ARGS__)
+#define vluxei8_v_i32m8(...) __riscv_vluxei8_v_i32m8(__VA_ARGS__)
+#define vluxei16_v_i32mf2(...) __riscv_vluxei16_v_i32mf2(__VA_ARGS__)
+#define vluxei16_v_i32m1(...) __riscv_vluxei16_v_i32m1(__VA_ARGS__)
+#define vluxei16_v_i32m2(...) __riscv_vluxei16_v_i32m2(__VA_ARGS__)
+#define vluxei16_v_i32m4(...) __riscv_vluxei16_v_i32m4(__VA_ARGS__)
+#define vluxei16_v_i32m8(...) __riscv_vluxei16_v_i32m8(__VA_ARGS__)
+#define vluxei32_v_i32mf2(...) __riscv_vluxei32_v_i32mf2(__VA_ARGS__)
+#define vluxei32_v_i32m1(...) __riscv_vluxei32_v_i32m1(__VA_ARGS__)
+#define vluxei32_v_i32m2(...) __riscv_vluxei32_v_i32m2(__VA_ARGS__)
+#define vluxei32_v_i32m4(...) __riscv_vluxei32_v_i32m4(__VA_ARGS__)
+#define vluxei32_v_i32m8(...) __riscv_vluxei32_v_i32m8(__VA_ARGS__)
+#define vluxei64_v_i32mf2(...) __riscv_vluxei64_v_i32mf2(__VA_ARGS__)
+#define vluxei64_v_i32m1(...) __riscv_vluxei64_v_i32m1(__VA_ARGS__)
+#define vluxei64_v_i32m2(...) __riscv_vluxei64_v_i32m2(__VA_ARGS__)
+#define vluxei64_v_i32m4(...) __riscv_vluxei64_v_i32m4(__VA_ARGS__)
+#define vluxei8_v_i64m1(...) __riscv_vluxei8_v_i64m1(__VA_ARGS__)
+#define vluxei8_v_i64m2(...) __riscv_vluxei8_v_i64m2(__VA_ARGS__)
+#define vluxei8_v_i64m4(...) __riscv_vluxei8_v_i64m4(__VA_ARGS__)
+#define vluxei8_v_i64m8(...) __riscv_vluxei8_v_i64m8(__VA_ARGS__)
+#define vluxei16_v_i64m1(...) __riscv_vluxei16_v_i64m1(__VA_ARGS__)
+#define vluxei16_v_i64m2(...) __riscv_vluxei16_v_i64m2(__VA_ARGS__)
+#define vluxei16_v_i64m4(...) __riscv_vluxei16_v_i64m4(__VA_ARGS__)
+#define vluxei16_v_i64m8(...) __riscv_vluxei16_v_i64m8(__VA_ARGS__)
+#define vluxei32_v_i64m1(...) __riscv_vluxei32_v_i64m1(__VA_ARGS__)
+#define vluxei32_v_i64m2(...) __riscv_vluxei32_v_i64m2(__VA_ARGS__)
+#define vluxei32_v_i64m4(...) __riscv_vluxei32_v_i64m4(__VA_ARGS__)
+#define vluxei32_v_i64m8(...) __riscv_vluxei32_v_i64m8(__VA_ARGS__)
+#define vluxei64_v_i64m1(...) __riscv_vluxei64_v_i64m1(__VA_ARGS__)
+#define vluxei64_v_i64m2(...) __riscv_vluxei64_v_i64m2(__VA_ARGS__)
+#define vluxei64_v_i64m4(...) __riscv_vluxei64_v_i64m4(__VA_ARGS__)
+#define vluxei64_v_i64m8(...) __riscv_vluxei64_v_i64m8(__VA_ARGS__)
+#define vloxei8_v_u8mf8(...) __riscv_vloxei8_v_u8mf8(__VA_ARGS__) // vloxei<N>_v_u* aliases: legacy unprefixed names forward their arguments unchanged to the __riscv_-prefixed intrinsics
+#define vloxei8_v_u8mf4(...) __riscv_vloxei8_v_u8mf4(__VA_ARGS__)
+#define vloxei8_v_u8mf2(...) __riscv_vloxei8_v_u8mf2(__VA_ARGS__)
+#define vloxei8_v_u8m1(...) __riscv_vloxei8_v_u8m1(__VA_ARGS__)
+#define vloxei8_v_u8m2(...) __riscv_vloxei8_v_u8m2(__VA_ARGS__)
+#define vloxei8_v_u8m4(...) __riscv_vloxei8_v_u8m4(__VA_ARGS__)
+#define vloxei8_v_u8m8(...) __riscv_vloxei8_v_u8m8(__VA_ARGS__)
+#define vloxei16_v_u8mf8(...) __riscv_vloxei16_v_u8mf8(__VA_ARGS__) // vloxei16 u8 list has no m8 entry
+#define vloxei16_v_u8mf4(...) __riscv_vloxei16_v_u8mf4(__VA_ARGS__)
+#define vloxei16_v_u8mf2(...) __riscv_vloxei16_v_u8mf2(__VA_ARGS__)
+#define vloxei16_v_u8m1(...) __riscv_vloxei16_v_u8m1(__VA_ARGS__)
+#define vloxei16_v_u8m2(...) __riscv_vloxei16_v_u8m2(__VA_ARGS__)
+#define vloxei16_v_u8m4(...) __riscv_vloxei16_v_u8m4(__VA_ARGS__)
+#define vloxei32_v_u8mf8(...) __riscv_vloxei32_v_u8mf8(__VA_ARGS__) // vloxei32 u8 list stops at m2
+#define vloxei32_v_u8mf4(...) __riscv_vloxei32_v_u8mf4(__VA_ARGS__)
+#define vloxei32_v_u8mf2(...) __riscv_vloxei32_v_u8mf2(__VA_ARGS__)
+#define vloxei32_v_u8m1(...) __riscv_vloxei32_v_u8m1(__VA_ARGS__)
+#define vloxei32_v_u8m2(...) __riscv_vloxei32_v_u8m2(__VA_ARGS__)
+#define vloxei64_v_u8mf8(...) __riscv_vloxei64_v_u8mf8(__VA_ARGS__) // vloxei64 u8 list stops at m1 (presumably an index-register LMUL limit; confirm against the RVV intrinsics spec)
+#define vloxei64_v_u8mf4(...) __riscv_vloxei64_v_u8mf4(__VA_ARGS__)
+#define vloxei64_v_u8mf2(...) __riscv_vloxei64_v_u8mf2(__VA_ARGS__)
+#define vloxei64_v_u8m1(...) __riscv_vloxei64_v_u8m1(__VA_ARGS__)
+#define vloxei8_v_u16mf4(...) __riscv_vloxei8_v_u16mf4(__VA_ARGS__)
+#define vloxei8_v_u16mf2(...) __riscv_vloxei8_v_u16mf2(__VA_ARGS__)
+#define vloxei8_v_u16m1(...) __riscv_vloxei8_v_u16m1(__VA_ARGS__)
+#define vloxei8_v_u16m2(...) __riscv_vloxei8_v_u16m2(__VA_ARGS__)
+#define vloxei8_v_u16m4(...) __riscv_vloxei8_v_u16m4(__VA_ARGS__)
+#define vloxei8_v_u16m8(...) __riscv_vloxei8_v_u16m8(__VA_ARGS__)
+#define vloxei16_v_u16mf4(...) __riscv_vloxei16_v_u16mf4(__VA_ARGS__)
+#define vloxei16_v_u16mf2(...) __riscv_vloxei16_v_u16mf2(__VA_ARGS__)
+#define vloxei16_v_u16m1(...) __riscv_vloxei16_v_u16m1(__VA_ARGS__)
+#define vloxei16_v_u16m2(...) __riscv_vloxei16_v_u16m2(__VA_ARGS__)
+#define vloxei16_v_u16m4(...) __riscv_vloxei16_v_u16m4(__VA_ARGS__)
+#define vloxei16_v_u16m8(...) __riscv_vloxei16_v_u16m8(__VA_ARGS__)
+#define vloxei32_v_u16mf4(...) __riscv_vloxei32_v_u16mf4(__VA_ARGS__)
+#define vloxei32_v_u16mf2(...) __riscv_vloxei32_v_u16mf2(__VA_ARGS__)
+#define vloxei32_v_u16m1(...) __riscv_vloxei32_v_u16m1(__VA_ARGS__)
+#define vloxei32_v_u16m2(...) __riscv_vloxei32_v_u16m2(__VA_ARGS__)
+#define vloxei32_v_u16m4(...) __riscv_vloxei32_v_u16m4(__VA_ARGS__)
+#define vloxei64_v_u16mf4(...) __riscv_vloxei64_v_u16mf4(__VA_ARGS__)
+#define vloxei64_v_u16mf2(...) __riscv_vloxei64_v_u16mf2(__VA_ARGS__)
+#define vloxei64_v_u16m1(...) __riscv_vloxei64_v_u16m1(__VA_ARGS__)
+#define vloxei64_v_u16m2(...) __riscv_vloxei64_v_u16m2(__VA_ARGS__)
+#define vloxei8_v_u32mf2(...) __riscv_vloxei8_v_u32mf2(__VA_ARGS__)
+#define vloxei8_v_u32m1(...) __riscv_vloxei8_v_u32m1(__VA_ARGS__)
+#define vloxei8_v_u32m2(...) __riscv_vloxei8_v_u32m2(__VA_ARGS__)
+#define vloxei8_v_u32m4(...) __riscv_vloxei8_v_u32m4(__VA_ARGS__)
+#define vloxei8_v_u32m8(...) __riscv_vloxei8_v_u32m8(__VA_ARGS__)
+#define vloxei16_v_u32mf2(...) __riscv_vloxei16_v_u32mf2(__VA_ARGS__)
+#define vloxei16_v_u32m1(...) __riscv_vloxei16_v_u32m1(__VA_ARGS__)
+#define vloxei16_v_u32m2(...) __riscv_vloxei16_v_u32m2(__VA_ARGS__)
+#define vloxei16_v_u32m4(...) __riscv_vloxei16_v_u32m4(__VA_ARGS__)
+#define vloxei16_v_u32m8(...) __riscv_vloxei16_v_u32m8(__VA_ARGS__)
+#define vloxei32_v_u32mf2(...) __riscv_vloxei32_v_u32mf2(__VA_ARGS__)
+#define vloxei32_v_u32m1(...) __riscv_vloxei32_v_u32m1(__VA_ARGS__)
+#define vloxei32_v_u32m2(...) __riscv_vloxei32_v_u32m2(__VA_ARGS__)
+#define vloxei32_v_u32m4(...) __riscv_vloxei32_v_u32m4(__VA_ARGS__)
+#define vloxei32_v_u32m8(...) __riscv_vloxei32_v_u32m8(__VA_ARGS__)
+#define vloxei64_v_u32mf2(...) __riscv_vloxei64_v_u32mf2(__VA_ARGS__)
+#define vloxei64_v_u32m1(...) __riscv_vloxei64_v_u32m1(__VA_ARGS__)
+#define vloxei64_v_u32m2(...) __riscv_vloxei64_v_u32m2(__VA_ARGS__)
+#define vloxei64_v_u32m4(...) __riscv_vloxei64_v_u32m4(__VA_ARGS__)
+#define vloxei8_v_u64m1(...) __riscv_vloxei8_v_u64m1(__VA_ARGS__)
+#define vloxei8_v_u64m2(...) __riscv_vloxei8_v_u64m2(__VA_ARGS__)
+#define vloxei8_v_u64m4(...) __riscv_vloxei8_v_u64m4(__VA_ARGS__)
+#define vloxei8_v_u64m8(...) __riscv_vloxei8_v_u64m8(__VA_ARGS__)
+#define vloxei16_v_u64m1(...) __riscv_vloxei16_v_u64m1(__VA_ARGS__)
+#define vloxei16_v_u64m2(...) __riscv_vloxei16_v_u64m2(__VA_ARGS__)
+#define vloxei16_v_u64m4(...) __riscv_vloxei16_v_u64m4(__VA_ARGS__)
+#define vloxei16_v_u64m8(...) __riscv_vloxei16_v_u64m8(__VA_ARGS__)
+#define vloxei32_v_u64m1(...) __riscv_vloxei32_v_u64m1(__VA_ARGS__)
+#define vloxei32_v_u64m2(...) __riscv_vloxei32_v_u64m2(__VA_ARGS__)
+#define vloxei32_v_u64m4(...) __riscv_vloxei32_v_u64m4(__VA_ARGS__)
+#define vloxei32_v_u64m8(...) __riscv_vloxei32_v_u64m8(__VA_ARGS__)
+#define vloxei64_v_u64m1(...) __riscv_vloxei64_v_u64m1(__VA_ARGS__)
+#define vloxei64_v_u64m2(...) __riscv_vloxei64_v_u64m2(__VA_ARGS__)
+#define vloxei64_v_u64m4(...) __riscv_vloxei64_v_u64m4(__VA_ARGS__)
+#define vloxei64_v_u64m8(...) __riscv_vloxei64_v_u64m8(__VA_ARGS__)
+#define vluxei8_v_u8mf8(...) __riscv_vluxei8_v_u8mf8(__VA_ARGS__)
+#define vluxei8_v_u8mf4(...) __riscv_vluxei8_v_u8mf4(__VA_ARGS__)
+#define vluxei8_v_u8mf2(...) __riscv_vluxei8_v_u8mf2(__VA_ARGS__)
+#define vluxei8_v_u8m1(...) __riscv_vluxei8_v_u8m1(__VA_ARGS__)
+#define vluxei8_v_u8m2(...) __riscv_vluxei8_v_u8m2(__VA_ARGS__)
+#define vluxei8_v_u8m4(...) __riscv_vluxei8_v_u8m4(__VA_ARGS__)
+#define vluxei8_v_u8m8(...) __riscv_vluxei8_v_u8m8(__VA_ARGS__)
+#define vluxei16_v_u8mf8(...) __riscv_vluxei16_v_u8mf8(__VA_ARGS__)
+#define vluxei16_v_u8mf4(...) __riscv_vluxei16_v_u8mf4(__VA_ARGS__)
+#define vluxei16_v_u8mf2(...) __riscv_vluxei16_v_u8mf2(__VA_ARGS__)
+#define vluxei16_v_u8m1(...) __riscv_vluxei16_v_u8m1(__VA_ARGS__)
+#define vluxei16_v_u8m2(...) __riscv_vluxei16_v_u8m2(__VA_ARGS__)
+#define vluxei16_v_u8m4(...) __riscv_vluxei16_v_u8m4(__VA_ARGS__)
+#define vluxei32_v_u8mf8(...) __riscv_vluxei32_v_u8mf8(__VA_ARGS__)
+#define vluxei32_v_u8mf4(...) __riscv_vluxei32_v_u8mf4(__VA_ARGS__)
+#define vluxei32_v_u8mf2(...) __riscv_vluxei32_v_u8mf2(__VA_ARGS__)
+#define vluxei32_v_u8m1(...) __riscv_vluxei32_v_u8m1(__VA_ARGS__)
+#define vluxei32_v_u8m2(...) __riscv_vluxei32_v_u8m2(__VA_ARGS__)
+#define vluxei64_v_u8mf8(...) __riscv_vluxei64_v_u8mf8(__VA_ARGS__)
+#define vluxei64_v_u8mf4(...) __riscv_vluxei64_v_u8mf4(__VA_ARGS__)
+#define vluxei64_v_u8mf2(...) __riscv_vluxei64_v_u8mf2(__VA_ARGS__)
+#define vluxei64_v_u8m1(...) __riscv_vluxei64_v_u8m1(__VA_ARGS__)
+#define vluxei8_v_u16mf4(...) __riscv_vluxei8_v_u16mf4(__VA_ARGS__)
+#define vluxei8_v_u16mf2(...) __riscv_vluxei8_v_u16mf2(__VA_ARGS__)
+#define vluxei8_v_u16m1(...) __riscv_vluxei8_v_u16m1(__VA_ARGS__)
+#define vluxei8_v_u16m2(...) __riscv_vluxei8_v_u16m2(__VA_ARGS__)
+#define vluxei8_v_u16m4(...) __riscv_vluxei8_v_u16m4(__VA_ARGS__)
+#define vluxei8_v_u16m8(...) __riscv_vluxei8_v_u16m8(__VA_ARGS__)
+#define vluxei16_v_u16mf4(...) __riscv_vluxei16_v_u16mf4(__VA_ARGS__)
+#define vluxei16_v_u16mf2(...) __riscv_vluxei16_v_u16mf2(__VA_ARGS__)
+#define vluxei16_v_u16m1(...) __riscv_vluxei16_v_u16m1(__VA_ARGS__)
+#define vluxei16_v_u16m2(...) __riscv_vluxei16_v_u16m2(__VA_ARGS__)
+#define vluxei16_v_u16m4(...) __riscv_vluxei16_v_u16m4(__VA_ARGS__)
+#define vluxei16_v_u16m8(...) __riscv_vluxei16_v_u16m8(__VA_ARGS__)
+#define vluxei32_v_u16mf4(...) __riscv_vluxei32_v_u16mf4(__VA_ARGS__)
+#define vluxei32_v_u16mf2(...) __riscv_vluxei32_v_u16mf2(__VA_ARGS__)
+#define vluxei32_v_u16m1(...) __riscv_vluxei32_v_u16m1(__VA_ARGS__)
+#define vluxei32_v_u16m2(...) __riscv_vluxei32_v_u16m2(__VA_ARGS__)
+#define vluxei32_v_u16m4(...) __riscv_vluxei32_v_u16m4(__VA_ARGS__)
+#define vluxei64_v_u16mf4(...) __riscv_vluxei64_v_u16mf4(__VA_ARGS__)
+#define vluxei64_v_u16mf2(...) __riscv_vluxei64_v_u16mf2(__VA_ARGS__)
+#define vluxei64_v_u16m1(...) __riscv_vluxei64_v_u16m1(__VA_ARGS__)
+#define vluxei64_v_u16m2(...) __riscv_vluxei64_v_u16m2(__VA_ARGS__)
+#define vluxei8_v_u32mf2(...) __riscv_vluxei8_v_u32mf2(__VA_ARGS__)
+#define vluxei8_v_u32m1(...) __riscv_vluxei8_v_u32m1(__VA_ARGS__)
+#define vluxei8_v_u32m2(...) __riscv_vluxei8_v_u32m2(__VA_ARGS__)
+#define vluxei8_v_u32m4(...) __riscv_vluxei8_v_u32m4(__VA_ARGS__)
+#define vluxei8_v_u32m8(...) __riscv_vluxei8_v_u32m8(__VA_ARGS__)
+#define vluxei16_v_u32mf2(...) __riscv_vluxei16_v_u32mf2(__VA_ARGS__)
+#define vluxei16_v_u32m1(...) __riscv_vluxei16_v_u32m1(__VA_ARGS__)
+#define vluxei16_v_u32m2(...) __riscv_vluxei16_v_u32m2(__VA_ARGS__)
+#define vluxei16_v_u32m4(...) __riscv_vluxei16_v_u32m4(__VA_ARGS__)
+#define vluxei16_v_u32m8(...) __riscv_vluxei16_v_u32m8(__VA_ARGS__)
+#define vluxei32_v_u32mf2(...) __riscv_vluxei32_v_u32mf2(__VA_ARGS__)
+#define vluxei32_v_u32m1(...) __riscv_vluxei32_v_u32m1(__VA_ARGS__)
+#define vluxei32_v_u32m2(...) __riscv_vluxei32_v_u32m2(__VA_ARGS__)
+#define vluxei32_v_u32m4(...) __riscv_vluxei32_v_u32m4(__VA_ARGS__)
+#define vluxei32_v_u32m8(...) __riscv_vluxei32_v_u32m8(__VA_ARGS__)
+#define vluxei64_v_u32mf2(...) __riscv_vluxei64_v_u32mf2(__VA_ARGS__)
+#define vluxei64_v_u32m1(...) __riscv_vluxei64_v_u32m1(__VA_ARGS__)
+#define vluxei64_v_u32m2(...) __riscv_vluxei64_v_u32m2(__VA_ARGS__)
+#define vluxei64_v_u32m4(...) __riscv_vluxei64_v_u32m4(__VA_ARGS__)
+#define vluxei8_v_u64m1(...) __riscv_vluxei8_v_u64m1(__VA_ARGS__)
+#define vluxei8_v_u64m2(...) __riscv_vluxei8_v_u64m2(__VA_ARGS__)
+#define vluxei8_v_u64m4(...) __riscv_vluxei8_v_u64m4(__VA_ARGS__)
+#define vluxei8_v_u64m8(...) __riscv_vluxei8_v_u64m8(__VA_ARGS__)
+#define vluxei16_v_u64m1(...) __riscv_vluxei16_v_u64m1(__VA_ARGS__)
+#define vluxei16_v_u64m2(...) __riscv_vluxei16_v_u64m2(__VA_ARGS__)
+#define vluxei16_v_u64m4(...) __riscv_vluxei16_v_u64m4(__VA_ARGS__)
+#define vluxei16_v_u64m8(...) __riscv_vluxei16_v_u64m8(__VA_ARGS__)
+#define vluxei32_v_u64m1(...) __riscv_vluxei32_v_u64m1(__VA_ARGS__)
+#define vluxei32_v_u64m2(...) __riscv_vluxei32_v_u64m2(__VA_ARGS__)
+#define vluxei32_v_u64m4(...) __riscv_vluxei32_v_u64m4(__VA_ARGS__)
+#define vluxei32_v_u64m8(...) __riscv_vluxei32_v_u64m8(__VA_ARGS__)
+#define vluxei64_v_u64m1(...) __riscv_vluxei64_v_u64m1(__VA_ARGS__)
+#define vluxei64_v_u64m2(...) __riscv_vluxei64_v_u64m2(__VA_ARGS__)
+#define vluxei64_v_u64m4(...) __riscv_vluxei64_v_u64m4(__VA_ARGS__)
+#define vluxei64_v_u64m8(...) __riscv_vluxei64_v_u64m8(__VA_ARGS__)
+// masked functions: each legacy *_m name forwards to the matching __riscv_*_tumu intrinsic
+#define vloxei8_v_f16mf4_m(...) __riscv_vloxei8_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxei8_v_f16mf2_m(...) __riscv_vloxei8_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxei8_v_f16m1_m(...) __riscv_vloxei8_v_f16m1_tumu(__VA_ARGS__)
+#define vloxei8_v_f16m2_m(...) __riscv_vloxei8_v_f16m2_tumu(__VA_ARGS__)
+#define vloxei8_v_f16m4_m(...) __riscv_vloxei8_v_f16m4_tumu(__VA_ARGS__)
+#define vloxei8_v_f16m8_m(...) __riscv_vloxei8_v_f16m8_tumu(__VA_ARGS__)
+#define vloxei16_v_f16mf4_m(...) __riscv_vloxei16_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxei16_v_f16mf2_m(...) __riscv_vloxei16_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxei16_v_f16m1_m(...) __riscv_vloxei16_v_f16m1_tumu(__VA_ARGS__)
+#define vloxei16_v_f16m2_m(...) __riscv_vloxei16_v_f16m2_tumu(__VA_ARGS__)
+#define vloxei16_v_f16m4_m(...) __riscv_vloxei16_v_f16m4_tumu(__VA_ARGS__)
+#define vloxei16_v_f16m8_m(...) __riscv_vloxei16_v_f16m8_tumu(__VA_ARGS__)
+#define vloxei32_v_f16mf4_m(...) __riscv_vloxei32_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxei32_v_f16mf2_m(...) __riscv_vloxei32_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxei32_v_f16m1_m(...) __riscv_vloxei32_v_f16m1_tumu(__VA_ARGS__)
+#define vloxei32_v_f16m2_m(...) __riscv_vloxei32_v_f16m2_tumu(__VA_ARGS__)
+#define vloxei32_v_f16m4_m(...) __riscv_vloxei32_v_f16m4_tumu(__VA_ARGS__)
+#define vloxei64_v_f16mf4_m(...) __riscv_vloxei64_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxei64_v_f16mf2_m(...) __riscv_vloxei64_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxei64_v_f16m1_m(...) __riscv_vloxei64_v_f16m1_tumu(__VA_ARGS__)
+#define vloxei64_v_f16m2_m(...) __riscv_vloxei64_v_f16m2_tumu(__VA_ARGS__)
+#define vloxei8_v_f32mf2_m(...) __riscv_vloxei8_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxei8_v_f32m1_m(...) __riscv_vloxei8_v_f32m1_tumu(__VA_ARGS__)
+#define vloxei8_v_f32m2_m(...) __riscv_vloxei8_v_f32m2_tumu(__VA_ARGS__)
+#define vloxei8_v_f32m4_m(...) __riscv_vloxei8_v_f32m4_tumu(__VA_ARGS__)
+#define vloxei8_v_f32m8_m(...) __riscv_vloxei8_v_f32m8_tumu(__VA_ARGS__)
+#define vloxei16_v_f32mf2_m(...) __riscv_vloxei16_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxei16_v_f32m1_m(...) __riscv_vloxei16_v_f32m1_tumu(__VA_ARGS__)
+#define vloxei16_v_f32m2_m(...) __riscv_vloxei16_v_f32m2_tumu(__VA_ARGS__)
+#define vloxei16_v_f32m4_m(...) __riscv_vloxei16_v_f32m4_tumu(__VA_ARGS__)
+#define vloxei16_v_f32m8_m(...) __riscv_vloxei16_v_f32m8_tumu(__VA_ARGS__)
+#define vloxei32_v_f32mf2_m(...) __riscv_vloxei32_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxei32_v_f32m1_m(...) __riscv_vloxei32_v_f32m1_tumu(__VA_ARGS__)
+#define vloxei32_v_f32m2_m(...) __riscv_vloxei32_v_f32m2_tumu(__VA_ARGS__)
+#define vloxei32_v_f32m4_m(...) __riscv_vloxei32_v_f32m4_tumu(__VA_ARGS__)
+#define vloxei32_v_f32m8_m(...) __riscv_vloxei32_v_f32m8_tumu(__VA_ARGS__)
+#define vloxei64_v_f32mf2_m(...) __riscv_vloxei64_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxei64_v_f32m1_m(...) __riscv_vloxei64_v_f32m1_tumu(__VA_ARGS__)
+#define vloxei64_v_f32m2_m(...) __riscv_vloxei64_v_f32m2_tumu(__VA_ARGS__)
+#define vloxei64_v_f32m4_m(...) __riscv_vloxei64_v_f32m4_tumu(__VA_ARGS__)
+#define vloxei8_v_f64m1_m(...) __riscv_vloxei8_v_f64m1_tumu(__VA_ARGS__)
+#define vloxei8_v_f64m2_m(...) __riscv_vloxei8_v_f64m2_tumu(__VA_ARGS__)
+#define vloxei8_v_f64m4_m(...) __riscv_vloxei8_v_f64m4_tumu(__VA_ARGS__)
+#define vloxei8_v_f64m8_m(...) __riscv_vloxei8_v_f64m8_tumu(__VA_ARGS__)
+#define vloxei16_v_f64m1_m(...) __riscv_vloxei16_v_f64m1_tumu(__VA_ARGS__)
+#define vloxei16_v_f64m2_m(...) __riscv_vloxei16_v_f64m2_tumu(__VA_ARGS__)
+#define vloxei16_v_f64m4_m(...) __riscv_vloxei16_v_f64m4_tumu(__VA_ARGS__)
+#define vloxei16_v_f64m8_m(...) __riscv_vloxei16_v_f64m8_tumu(__VA_ARGS__)
+#define vloxei32_v_f64m1_m(...) __riscv_vloxei32_v_f64m1_tumu(__VA_ARGS__)
+#define vloxei32_v_f64m2_m(...) __riscv_vloxei32_v_f64m2_tumu(__VA_ARGS__)
+#define vloxei32_v_f64m4_m(...) __riscv_vloxei32_v_f64m4_tumu(__VA_ARGS__)
+#define vloxei32_v_f64m8_m(...) __riscv_vloxei32_v_f64m8_tumu(__VA_ARGS__)
+#define vloxei64_v_f64m1_m(...) __riscv_vloxei64_v_f64m1_tumu(__VA_ARGS__)
+#define vloxei64_v_f64m2_m(...) __riscv_vloxei64_v_f64m2_tumu(__VA_ARGS__)
+#define vloxei64_v_f64m4_m(...) __riscv_vloxei64_v_f64m4_tumu(__VA_ARGS__)
+#define vloxei64_v_f64m8_m(...) __riscv_vloxei64_v_f64m8_tumu(__VA_ARGS__)
+#define vluxei8_v_f16mf4_m(...) __riscv_vluxei8_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxei8_v_f16mf2_m(...) __riscv_vluxei8_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxei8_v_f16m1_m(...) __riscv_vluxei8_v_f16m1_tumu(__VA_ARGS__)
+#define vluxei8_v_f16m2_m(...) __riscv_vluxei8_v_f16m2_tumu(__VA_ARGS__)
+#define vluxei8_v_f16m4_m(...) __riscv_vluxei8_v_f16m4_tumu(__VA_ARGS__)
+#define vluxei8_v_f16m8_m(...) __riscv_vluxei8_v_f16m8_tumu(__VA_ARGS__)
+#define vluxei16_v_f16mf4_m(...) __riscv_vluxei16_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxei16_v_f16mf2_m(...) __riscv_vluxei16_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxei16_v_f16m1_m(...) __riscv_vluxei16_v_f16m1_tumu(__VA_ARGS__)
+#define vluxei16_v_f16m2_m(...) __riscv_vluxei16_v_f16m2_tumu(__VA_ARGS__)
+#define vluxei16_v_f16m4_m(...) __riscv_vluxei16_v_f16m4_tumu(__VA_ARGS__)
+#define vluxei16_v_f16m8_m(...) __riscv_vluxei16_v_f16m8_tumu(__VA_ARGS__)
+#define vluxei32_v_f16mf4_m(...) __riscv_vluxei32_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxei32_v_f16mf2_m(...) __riscv_vluxei32_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxei32_v_f16m1_m(...) __riscv_vluxei32_v_f16m1_tumu(__VA_ARGS__)
+#define vluxei32_v_f16m2_m(...) __riscv_vluxei32_v_f16m2_tumu(__VA_ARGS__)
+#define vluxei32_v_f16m4_m(...) __riscv_vluxei32_v_f16m4_tumu(__VA_ARGS__)
+#define vluxei64_v_f16mf4_m(...) __riscv_vluxei64_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxei64_v_f16mf2_m(...) __riscv_vluxei64_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxei64_v_f16m1_m(...) __riscv_vluxei64_v_f16m1_tumu(__VA_ARGS__)
+#define vluxei64_v_f16m2_m(...) __riscv_vluxei64_v_f16m2_tumu(__VA_ARGS__)
+#define vluxei8_v_f32mf2_m(...) __riscv_vluxei8_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxei8_v_f32m1_m(...) __riscv_vluxei8_v_f32m1_tumu(__VA_ARGS__)
+#define vluxei8_v_f32m2_m(...) __riscv_vluxei8_v_f32m2_tumu(__VA_ARGS__)
+#define vluxei8_v_f32m4_m(...) __riscv_vluxei8_v_f32m4_tumu(__VA_ARGS__)
+#define vluxei8_v_f32m8_m(...) __riscv_vluxei8_v_f32m8_tumu(__VA_ARGS__)
+#define vluxei16_v_f32mf2_m(...) __riscv_vluxei16_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxei16_v_f32m1_m(...) __riscv_vluxei16_v_f32m1_tumu(__VA_ARGS__)
+#define vluxei16_v_f32m2_m(...) __riscv_vluxei16_v_f32m2_tumu(__VA_ARGS__)
+#define vluxei16_v_f32m4_m(...) __riscv_vluxei16_v_f32m4_tumu(__VA_ARGS__)
+#define vluxei16_v_f32m8_m(...) __riscv_vluxei16_v_f32m8_tumu(__VA_ARGS__)
+#define vluxei32_v_f32mf2_m(...) __riscv_vluxei32_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxei32_v_f32m1_m(...) __riscv_vluxei32_v_f32m1_tumu(__VA_ARGS__)
+#define vluxei32_v_f32m2_m(...) __riscv_vluxei32_v_f32m2_tumu(__VA_ARGS__)
+#define vluxei32_v_f32m4_m(...) __riscv_vluxei32_v_f32m4_tumu(__VA_ARGS__)
+#define vluxei32_v_f32m8_m(...) __riscv_vluxei32_v_f32m8_tumu(__VA_ARGS__)
+#define vluxei64_v_f32mf2_m(...) __riscv_vluxei64_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxei64_v_f32m1_m(...) __riscv_vluxei64_v_f32m1_tumu(__VA_ARGS__)
+#define vluxei64_v_f32m2_m(...) __riscv_vluxei64_v_f32m2_tumu(__VA_ARGS__)
+#define vluxei64_v_f32m4_m(...) __riscv_vluxei64_v_f32m4_tumu(__VA_ARGS__)
+#define vluxei8_v_f64m1_m(...) __riscv_vluxei8_v_f64m1_tumu(__VA_ARGS__)
+#define vluxei8_v_f64m2_m(...) __riscv_vluxei8_v_f64m2_tumu(__VA_ARGS__)
+#define vluxei8_v_f64m4_m(...) __riscv_vluxei8_v_f64m4_tumu(__VA_ARGS__)
+#define vluxei8_v_f64m8_m(...) __riscv_vluxei8_v_f64m8_tumu(__VA_ARGS__)
+#define vluxei16_v_f64m1_m(...) __riscv_vluxei16_v_f64m1_tumu(__VA_ARGS__)
+#define vluxei16_v_f64m2_m(...) __riscv_vluxei16_v_f64m2_tumu(__VA_ARGS__)
+#define vluxei16_v_f64m4_m(...) __riscv_vluxei16_v_f64m4_tumu(__VA_ARGS__)
+#define vluxei16_v_f64m8_m(...) __riscv_vluxei16_v_f64m8_tumu(__VA_ARGS__)
+#define vluxei32_v_f64m1_m(...) __riscv_vluxei32_v_f64m1_tumu(__VA_ARGS__)
+#define vluxei32_v_f64m2_m(...) __riscv_vluxei32_v_f64m2_tumu(__VA_ARGS__)
+#define vluxei32_v_f64m4_m(...) __riscv_vluxei32_v_f64m4_tumu(__VA_ARGS__)
+#define vluxei32_v_f64m8_m(...) __riscv_vluxei32_v_f64m8_tumu(__VA_ARGS__)
+#define vluxei64_v_f64m1_m(...) __riscv_vluxei64_v_f64m1_tumu(__VA_ARGS__)
+#define vluxei64_v_f64m2_m(...) __riscv_vluxei64_v_f64m2_tumu(__VA_ARGS__)
+#define vluxei64_v_f64m4_m(...) __riscv_vluxei64_v_f64m4_tumu(__VA_ARGS__)
+#define vluxei64_v_f64m8_m(...) __riscv_vluxei64_v_f64m8_tumu(__VA_ARGS__)
+#define vloxei8_v_i8mf8_m(...) __riscv_vloxei8_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxei8_v_i8mf4_m(...) __riscv_vloxei8_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxei8_v_i8mf2_m(...) __riscv_vloxei8_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxei8_v_i8m1_m(...) __riscv_vloxei8_v_i8m1_tumu(__VA_ARGS__)
+#define vloxei8_v_i8m2_m(...) __riscv_vloxei8_v_i8m2_tumu(__VA_ARGS__)
+#define vloxei8_v_i8m4_m(...) __riscv_vloxei8_v_i8m4_tumu(__VA_ARGS__)
+#define vloxei8_v_i8m8_m(...) __riscv_vloxei8_v_i8m8_tumu(__VA_ARGS__)
+#define vloxei16_v_i8mf8_m(...) __riscv_vloxei16_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxei16_v_i8mf4_m(...) __riscv_vloxei16_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxei16_v_i8mf2_m(...) __riscv_vloxei16_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxei16_v_i8m1_m(...) __riscv_vloxei16_v_i8m1_tumu(__VA_ARGS__)
+#define vloxei16_v_i8m2_m(...) __riscv_vloxei16_v_i8m2_tumu(__VA_ARGS__)
+#define vloxei16_v_i8m4_m(...) __riscv_vloxei16_v_i8m4_tumu(__VA_ARGS__)
+#define vloxei32_v_i8mf8_m(...) __riscv_vloxei32_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxei32_v_i8mf4_m(...) __riscv_vloxei32_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxei32_v_i8mf2_m(...) __riscv_vloxei32_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxei32_v_i8m1_m(...) __riscv_vloxei32_v_i8m1_tumu(__VA_ARGS__)
+#define vloxei32_v_i8m2_m(...) __riscv_vloxei32_v_i8m2_tumu(__VA_ARGS__)
+#define vloxei64_v_i8mf8_m(...) __riscv_vloxei64_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxei64_v_i8mf4_m(...) __riscv_vloxei64_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxei64_v_i8mf2_m(...) __riscv_vloxei64_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxei64_v_i8m1_m(...) __riscv_vloxei64_v_i8m1_tumu(__VA_ARGS__)
+#define vloxei8_v_i16mf4_m(...) __riscv_vloxei8_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxei8_v_i16mf2_m(...) __riscv_vloxei8_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxei8_v_i16m1_m(...) __riscv_vloxei8_v_i16m1_tumu(__VA_ARGS__)
+#define vloxei8_v_i16m2_m(...) __riscv_vloxei8_v_i16m2_tumu(__VA_ARGS__)
+#define vloxei8_v_i16m4_m(...) __riscv_vloxei8_v_i16m4_tumu(__VA_ARGS__)
+#define vloxei8_v_i16m8_m(...) __riscv_vloxei8_v_i16m8_tumu(__VA_ARGS__)
+#define vloxei16_v_i16mf4_m(...) __riscv_vloxei16_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxei16_v_i16mf2_m(...) __riscv_vloxei16_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxei16_v_i16m1_m(...) __riscv_vloxei16_v_i16m1_tumu(__VA_ARGS__)
+#define vloxei16_v_i16m2_m(...) __riscv_vloxei16_v_i16m2_tumu(__VA_ARGS__)
+#define vloxei16_v_i16m4_m(...) __riscv_vloxei16_v_i16m4_tumu(__VA_ARGS__)
+#define vloxei16_v_i16m8_m(...) __riscv_vloxei16_v_i16m8_tumu(__VA_ARGS__)
+#define vloxei32_v_i16mf4_m(...) __riscv_vloxei32_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxei32_v_i16mf2_m(...) __riscv_vloxei32_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxei32_v_i16m1_m(...) __riscv_vloxei32_v_i16m1_tumu(__VA_ARGS__)
+#define vloxei32_v_i16m2_m(...) __riscv_vloxei32_v_i16m2_tumu(__VA_ARGS__)
+#define vloxei32_v_i16m4_m(...) __riscv_vloxei32_v_i16m4_tumu(__VA_ARGS__)
+#define vloxei64_v_i16mf4_m(...) __riscv_vloxei64_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxei64_v_i16mf2_m(...) __riscv_vloxei64_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxei64_v_i16m1_m(...) __riscv_vloxei64_v_i16m1_tumu(__VA_ARGS__)
+#define vloxei64_v_i16m2_m(...) __riscv_vloxei64_v_i16m2_tumu(__VA_ARGS__)
+#define vloxei8_v_i32mf2_m(...) __riscv_vloxei8_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxei8_v_i32m1_m(...) __riscv_vloxei8_v_i32m1_tumu(__VA_ARGS__)
+#define vloxei8_v_i32m2_m(...) __riscv_vloxei8_v_i32m2_tumu(__VA_ARGS__)
+#define vloxei8_v_i32m4_m(...) __riscv_vloxei8_v_i32m4_tumu(__VA_ARGS__)
+#define vloxei8_v_i32m8_m(...) __riscv_vloxei8_v_i32m8_tumu(__VA_ARGS__)
+#define vloxei16_v_i32mf2_m(...) __riscv_vloxei16_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxei16_v_i32m1_m(...) __riscv_vloxei16_v_i32m1_tumu(__VA_ARGS__)
+#define vloxei16_v_i32m2_m(...) __riscv_vloxei16_v_i32m2_tumu(__VA_ARGS__)
+#define vloxei16_v_i32m4_m(...) __riscv_vloxei16_v_i32m4_tumu(__VA_ARGS__)
+#define vloxei16_v_i32m8_m(...) __riscv_vloxei16_v_i32m8_tumu(__VA_ARGS__)
+#define vloxei32_v_i32mf2_m(...) __riscv_vloxei32_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxei32_v_i32m1_m(...) __riscv_vloxei32_v_i32m1_tumu(__VA_ARGS__)
+#define vloxei32_v_i32m2_m(...) __riscv_vloxei32_v_i32m2_tumu(__VA_ARGS__)
+#define vloxei32_v_i32m4_m(...) __riscv_vloxei32_v_i32m4_tumu(__VA_ARGS__)
+#define vloxei32_v_i32m8_m(...) __riscv_vloxei32_v_i32m8_tumu(__VA_ARGS__)
+#define vloxei64_v_i32mf2_m(...) __riscv_vloxei64_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxei64_v_i32m1_m(...) __riscv_vloxei64_v_i32m1_tumu(__VA_ARGS__)
+#define vloxei64_v_i32m2_m(...) __riscv_vloxei64_v_i32m2_tumu(__VA_ARGS__)
+#define vloxei64_v_i32m4_m(...) __riscv_vloxei64_v_i32m4_tumu(__VA_ARGS__)
+#define vloxei8_v_i64m1_m(...) __riscv_vloxei8_v_i64m1_tumu(__VA_ARGS__)
+#define vloxei8_v_i64m2_m(...) __riscv_vloxei8_v_i64m2_tumu(__VA_ARGS__)
+#define vloxei8_v_i64m4_m(...) __riscv_vloxei8_v_i64m4_tumu(__VA_ARGS__)
+#define vloxei8_v_i64m8_m(...) __riscv_vloxei8_v_i64m8_tumu(__VA_ARGS__)
+#define vloxei16_v_i64m1_m(...) __riscv_vloxei16_v_i64m1_tumu(__VA_ARGS__)
+#define vloxei16_v_i64m2_m(...) __riscv_vloxei16_v_i64m2_tumu(__VA_ARGS__)
+#define vloxei16_v_i64m4_m(...) __riscv_vloxei16_v_i64m4_tumu(__VA_ARGS__)
+#define vloxei16_v_i64m8_m(...) __riscv_vloxei16_v_i64m8_tumu(__VA_ARGS__)
+#define vloxei32_v_i64m1_m(...) __riscv_vloxei32_v_i64m1_tumu(__VA_ARGS__)
+#define vloxei32_v_i64m2_m(...) __riscv_vloxei32_v_i64m2_tumu(__VA_ARGS__)
+#define vloxei32_v_i64m4_m(...) __riscv_vloxei32_v_i64m4_tumu(__VA_ARGS__)
+#define vloxei32_v_i64m8_m(...) __riscv_vloxei32_v_i64m8_tumu(__VA_ARGS__)
+#define vloxei64_v_i64m1_m(...) __riscv_vloxei64_v_i64m1_tumu(__VA_ARGS__)
+#define vloxei64_v_i64m2_m(...) __riscv_vloxei64_v_i64m2_tumu(__VA_ARGS__)
+#define vloxei64_v_i64m4_m(...) __riscv_vloxei64_v_i64m4_tumu(__VA_ARGS__)
+#define vloxei64_v_i64m8_m(...) __riscv_vloxei64_v_i64m8_tumu(__VA_ARGS__)
+#define vluxei8_v_i8mf8_m(...) __riscv_vluxei8_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxei8_v_i8mf4_m(...) __riscv_vluxei8_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxei8_v_i8mf2_m(...) __riscv_vluxei8_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxei8_v_i8m1_m(...) __riscv_vluxei8_v_i8m1_tumu(__VA_ARGS__)
+#define vluxei8_v_i8m2_m(...) __riscv_vluxei8_v_i8m2_tumu(__VA_ARGS__)
+#define vluxei8_v_i8m4_m(...) __riscv_vluxei8_v_i8m4_tumu(__VA_ARGS__)
+#define vluxei8_v_i8m8_m(...) __riscv_vluxei8_v_i8m8_tumu(__VA_ARGS__)
+#define vluxei16_v_i8mf8_m(...) __riscv_vluxei16_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxei16_v_i8mf4_m(...) __riscv_vluxei16_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxei16_v_i8mf2_m(...) __riscv_vluxei16_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxei16_v_i8m1_m(...) __riscv_vluxei16_v_i8m1_tumu(__VA_ARGS__)
+#define vluxei16_v_i8m2_m(...) __riscv_vluxei16_v_i8m2_tumu(__VA_ARGS__)
+#define vluxei16_v_i8m4_m(...) __riscv_vluxei16_v_i8m4_tumu(__VA_ARGS__)
+#define vluxei32_v_i8mf8_m(...) __riscv_vluxei32_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxei32_v_i8mf4_m(...) __riscv_vluxei32_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxei32_v_i8mf2_m(...) __riscv_vluxei32_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxei32_v_i8m1_m(...) __riscv_vluxei32_v_i8m1_tumu(__VA_ARGS__)
+#define vluxei32_v_i8m2_m(...) __riscv_vluxei32_v_i8m2_tumu(__VA_ARGS__)
+#define vluxei64_v_i8mf8_m(...) __riscv_vluxei64_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxei64_v_i8mf4_m(...) __riscv_vluxei64_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxei64_v_i8mf2_m(...) __riscv_vluxei64_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxei64_v_i8m1_m(...) __riscv_vluxei64_v_i8m1_tumu(__VA_ARGS__)
+#define vluxei8_v_i16mf4_m(...) __riscv_vluxei8_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxei8_v_i16mf2_m(...) __riscv_vluxei8_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxei8_v_i16m1_m(...) __riscv_vluxei8_v_i16m1_tumu(__VA_ARGS__)
+#define vluxei8_v_i16m2_m(...) __riscv_vluxei8_v_i16m2_tumu(__VA_ARGS__)
+#define vluxei8_v_i16m4_m(...) __riscv_vluxei8_v_i16m4_tumu(__VA_ARGS__)
+#define vluxei8_v_i16m8_m(...) __riscv_vluxei8_v_i16m8_tumu(__VA_ARGS__)
+#define vluxei16_v_i16mf4_m(...) __riscv_vluxei16_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxei16_v_i16mf2_m(...) __riscv_vluxei16_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxei16_v_i16m1_m(...) __riscv_vluxei16_v_i16m1_tumu(__VA_ARGS__)
+#define vluxei16_v_i16m2_m(...) __riscv_vluxei16_v_i16m2_tumu(__VA_ARGS__)
+#define vluxei16_v_i16m4_m(...) __riscv_vluxei16_v_i16m4_tumu(__VA_ARGS__)
+#define vluxei16_v_i16m8_m(...) __riscv_vluxei16_v_i16m8_tumu(__VA_ARGS__)
+#define vluxei32_v_i16mf4_m(...) __riscv_vluxei32_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxei32_v_i16mf2_m(...) __riscv_vluxei32_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxei32_v_i16m1_m(...) __riscv_vluxei32_v_i16m1_tumu(__VA_ARGS__)
+#define vluxei32_v_i16m2_m(...) __riscv_vluxei32_v_i16m2_tumu(__VA_ARGS__)
+#define vluxei32_v_i16m4_m(...) __riscv_vluxei32_v_i16m4_tumu(__VA_ARGS__)
+#define vluxei64_v_i16mf4_m(...) __riscv_vluxei64_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxei64_v_i16mf2_m(...) __riscv_vluxei64_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxei64_v_i16m1_m(...) __riscv_vluxei64_v_i16m1_tumu(__VA_ARGS__)
+#define vluxei64_v_i16m2_m(...) __riscv_vluxei64_v_i16m2_tumu(__VA_ARGS__)
+#define vluxei8_v_i32mf2_m(...) __riscv_vluxei8_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxei8_v_i32m1_m(...) __riscv_vluxei8_v_i32m1_tumu(__VA_ARGS__)
+#define vluxei8_v_i32m2_m(...) __riscv_vluxei8_v_i32m2_tumu(__VA_ARGS__)
+#define vluxei8_v_i32m4_m(...) __riscv_vluxei8_v_i32m4_tumu(__VA_ARGS__)
+#define vluxei8_v_i32m8_m(...) __riscv_vluxei8_v_i32m8_tumu(__VA_ARGS__)
+#define vluxei16_v_i32mf2_m(...) __riscv_vluxei16_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxei16_v_i32m1_m(...) __riscv_vluxei16_v_i32m1_tumu(__VA_ARGS__)
+#define vluxei16_v_i32m2_m(...) __riscv_vluxei16_v_i32m2_tumu(__VA_ARGS__)
+#define vluxei16_v_i32m4_m(...) __riscv_vluxei16_v_i32m4_tumu(__VA_ARGS__)
+#define vluxei16_v_i32m8_m(...) __riscv_vluxei16_v_i32m8_tumu(__VA_ARGS__)
+#define vluxei32_v_i32mf2_m(...) __riscv_vluxei32_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxei32_v_i32m1_m(...) __riscv_vluxei32_v_i32m1_tumu(__VA_ARGS__)
+#define vluxei32_v_i32m2_m(...) __riscv_vluxei32_v_i32m2_tumu(__VA_ARGS__)
+#define vluxei32_v_i32m4_m(...) __riscv_vluxei32_v_i32m4_tumu(__VA_ARGS__)
+#define vluxei32_v_i32m8_m(...) __riscv_vluxei32_v_i32m8_tumu(__VA_ARGS__)
+#define vluxei64_v_i32mf2_m(...) __riscv_vluxei64_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxei64_v_i32m1_m(...) __riscv_vluxei64_v_i32m1_tumu(__VA_ARGS__)
+#define vluxei64_v_i32m2_m(...) __riscv_vluxei64_v_i32m2_tumu(__VA_ARGS__)
+#define vluxei64_v_i32m4_m(...) __riscv_vluxei64_v_i32m4_tumu(__VA_ARGS__)
+#define vluxei8_v_i64m1_m(...) __riscv_vluxei8_v_i64m1_tumu(__VA_ARGS__)
+#define vluxei8_v_i64m2_m(...) __riscv_vluxei8_v_i64m2_tumu(__VA_ARGS__)
+#define vluxei8_v_i64m4_m(...) __riscv_vluxei8_v_i64m4_tumu(__VA_ARGS__)
+#define vluxei8_v_i64m8_m(...) __riscv_vluxei8_v_i64m8_tumu(__VA_ARGS__)
+#define vluxei16_v_i64m1_m(...) __riscv_vluxei16_v_i64m1_tumu(__VA_ARGS__)
+#define vluxei16_v_i64m2_m(...) __riscv_vluxei16_v_i64m2_tumu(__VA_ARGS__)
+#define vluxei16_v_i64m4_m(...) __riscv_vluxei16_v_i64m4_tumu(__VA_ARGS__)
+#define vluxei16_v_i64m8_m(...) __riscv_vluxei16_v_i64m8_tumu(__VA_ARGS__)
+#define vluxei32_v_i64m1_m(...) __riscv_vluxei32_v_i64m1_tumu(__VA_ARGS__)
+#define vluxei32_v_i64m2_m(...) __riscv_vluxei32_v_i64m2_tumu(__VA_ARGS__)
+#define vluxei32_v_i64m4_m(...) __riscv_vluxei32_v_i64m4_tumu(__VA_ARGS__)
+#define vluxei32_v_i64m8_m(...) __riscv_vluxei32_v_i64m8_tumu(__VA_ARGS__)
+#define vluxei64_v_i64m1_m(...) __riscv_vluxei64_v_i64m1_tumu(__VA_ARGS__)
+#define vluxei64_v_i64m2_m(...) __riscv_vluxei64_v_i64m2_tumu(__VA_ARGS__)
+#define vluxei64_v_i64m4_m(...) __riscv_vluxei64_v_i64m4_tumu(__VA_ARGS__)
+#define vluxei64_v_i64m8_m(...) __riscv_vluxei64_v_i64m8_tumu(__VA_ARGS__)
+#define vloxei8_v_u8mf8_m(...) __riscv_vloxei8_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxei8_v_u8mf4_m(...) __riscv_vloxei8_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxei8_v_u8mf2_m(...) __riscv_vloxei8_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxei8_v_u8m1_m(...) __riscv_vloxei8_v_u8m1_tumu(__VA_ARGS__)
+#define vloxei8_v_u8m2_m(...) __riscv_vloxei8_v_u8m2_tumu(__VA_ARGS__)
+#define vloxei8_v_u8m4_m(...) __riscv_vloxei8_v_u8m4_tumu(__VA_ARGS__)
+#define vloxei8_v_u8m8_m(...) __riscv_vloxei8_v_u8m8_tumu(__VA_ARGS__)
+#define vloxei16_v_u8mf8_m(...) __riscv_vloxei16_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxei16_v_u8mf4_m(...) __riscv_vloxei16_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxei16_v_u8mf2_m(...) __riscv_vloxei16_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxei16_v_u8m1_m(...) __riscv_vloxei16_v_u8m1_tumu(__VA_ARGS__)
+#define vloxei16_v_u8m2_m(...) __riscv_vloxei16_v_u8m2_tumu(__VA_ARGS__)
+#define vloxei16_v_u8m4_m(...) __riscv_vloxei16_v_u8m4_tumu(__VA_ARGS__)
+#define vloxei32_v_u8mf8_m(...) __riscv_vloxei32_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxei32_v_u8mf4_m(...) __riscv_vloxei32_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxei32_v_u8mf2_m(...) __riscv_vloxei32_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxei32_v_u8m1_m(...) __riscv_vloxei32_v_u8m1_tumu(__VA_ARGS__)
+#define vloxei32_v_u8m2_m(...) __riscv_vloxei32_v_u8m2_tumu(__VA_ARGS__)
+#define vloxei64_v_u8mf8_m(...) __riscv_vloxei64_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxei64_v_u8mf4_m(...) __riscv_vloxei64_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxei64_v_u8mf2_m(...) __riscv_vloxei64_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxei64_v_u8m1_m(...) __riscv_vloxei64_v_u8m1_tumu(__VA_ARGS__)
+#define vloxei8_v_u16mf4_m(...) __riscv_vloxei8_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxei8_v_u16mf2_m(...) __riscv_vloxei8_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxei8_v_u16m1_m(...) __riscv_vloxei8_v_u16m1_tumu(__VA_ARGS__)
+#define vloxei8_v_u16m2_m(...) __riscv_vloxei8_v_u16m2_tumu(__VA_ARGS__)
+#define vloxei8_v_u16m4_m(...) __riscv_vloxei8_v_u16m4_tumu(__VA_ARGS__)
+#define vloxei8_v_u16m8_m(...) __riscv_vloxei8_v_u16m8_tumu(__VA_ARGS__)
+#define vloxei16_v_u16mf4_m(...) __riscv_vloxei16_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxei16_v_u16mf2_m(...) __riscv_vloxei16_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxei16_v_u16m1_m(...) __riscv_vloxei16_v_u16m1_tumu(__VA_ARGS__)
+#define vloxei16_v_u16m2_m(...) __riscv_vloxei16_v_u16m2_tumu(__VA_ARGS__)
+#define vloxei16_v_u16m4_m(...) __riscv_vloxei16_v_u16m4_tumu(__VA_ARGS__)
+#define vloxei16_v_u16m8_m(...) __riscv_vloxei16_v_u16m8_tumu(__VA_ARGS__)
+#define vloxei32_v_u16mf4_m(...) __riscv_vloxei32_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxei32_v_u16mf2_m(...) __riscv_vloxei32_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxei32_v_u16m1_m(...) __riscv_vloxei32_v_u16m1_tumu(__VA_ARGS__)
+#define vloxei32_v_u16m2_m(...) __riscv_vloxei32_v_u16m2_tumu(__VA_ARGS__)
+#define vloxei32_v_u16m4_m(...) __riscv_vloxei32_v_u16m4_tumu(__VA_ARGS__)
+#define vloxei64_v_u16mf4_m(...) __riscv_vloxei64_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxei64_v_u16mf2_m(...) __riscv_vloxei64_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxei64_v_u16m1_m(...) __riscv_vloxei64_v_u16m1_tumu(__VA_ARGS__)
+#define vloxei64_v_u16m2_m(...) __riscv_vloxei64_v_u16m2_tumu(__VA_ARGS__)
+#define vloxei8_v_u32mf2_m(...) __riscv_vloxei8_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxei8_v_u32m1_m(...) __riscv_vloxei8_v_u32m1_tumu(__VA_ARGS__)
+#define vloxei8_v_u32m2_m(...) __riscv_vloxei8_v_u32m2_tumu(__VA_ARGS__)
+#define vloxei8_v_u32m4_m(...) __riscv_vloxei8_v_u32m4_tumu(__VA_ARGS__)
+#define vloxei8_v_u32m8_m(...) __riscv_vloxei8_v_u32m8_tumu(__VA_ARGS__)
+#define vloxei16_v_u32mf2_m(...) __riscv_vloxei16_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxei16_v_u32m1_m(...) __riscv_vloxei16_v_u32m1_tumu(__VA_ARGS__)
+#define vloxei16_v_u32m2_m(...) __riscv_vloxei16_v_u32m2_tumu(__VA_ARGS__)
+#define vloxei16_v_u32m4_m(...) __riscv_vloxei16_v_u32m4_tumu(__VA_ARGS__)
+#define vloxei16_v_u32m8_m(...) __riscv_vloxei16_v_u32m8_tumu(__VA_ARGS__)
+#define vloxei32_v_u32mf2_m(...) __riscv_vloxei32_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxei32_v_u32m1_m(...) __riscv_vloxei32_v_u32m1_tumu(__VA_ARGS__)
+#define vloxei32_v_u32m2_m(...) __riscv_vloxei32_v_u32m2_tumu(__VA_ARGS__)
+#define vloxei32_v_u32m4_m(...) __riscv_vloxei32_v_u32m4_tumu(__VA_ARGS__)
+#define vloxei32_v_u32m8_m(...) __riscv_vloxei32_v_u32m8_tumu(__VA_ARGS__)
+#define vloxei64_v_u32mf2_m(...) __riscv_vloxei64_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxei64_v_u32m1_m(...) __riscv_vloxei64_v_u32m1_tumu(__VA_ARGS__)
+#define vloxei64_v_u32m2_m(...) __riscv_vloxei64_v_u32m2_tumu(__VA_ARGS__)
+#define vloxei64_v_u32m4_m(...) __riscv_vloxei64_v_u32m4_tumu(__VA_ARGS__)
+#define vloxei8_v_u64m1_m(...) __riscv_vloxei8_v_u64m1_tumu(__VA_ARGS__)
+#define vloxei8_v_u64m2_m(...) __riscv_vloxei8_v_u64m2_tumu(__VA_ARGS__)
+#define vloxei8_v_u64m4_m(...) __riscv_vloxei8_v_u64m4_tumu(__VA_ARGS__)
+#define vloxei8_v_u64m8_m(...) __riscv_vloxei8_v_u64m8_tumu(__VA_ARGS__)
+#define vloxei16_v_u64m1_m(...) __riscv_vloxei16_v_u64m1_tumu(__VA_ARGS__)
+#define vloxei16_v_u64m2_m(...) __riscv_vloxei16_v_u64m2_tumu(__VA_ARGS__)
+#define vloxei16_v_u64m4_m(...) __riscv_vloxei16_v_u64m4_tumu(__VA_ARGS__)
+#define vloxei16_v_u64m8_m(...) __riscv_vloxei16_v_u64m8_tumu(__VA_ARGS__)
+#define vloxei32_v_u64m1_m(...) __riscv_vloxei32_v_u64m1_tumu(__VA_ARGS__)
+#define vloxei32_v_u64m2_m(...) __riscv_vloxei32_v_u64m2_tumu(__VA_ARGS__)
+#define vloxei32_v_u64m4_m(...) __riscv_vloxei32_v_u64m4_tumu(__VA_ARGS__)
+#define vloxei32_v_u64m8_m(...) __riscv_vloxei32_v_u64m8_tumu(__VA_ARGS__)
+#define vloxei64_v_u64m1_m(...) __riscv_vloxei64_v_u64m1_tumu(__VA_ARGS__)
+#define vloxei64_v_u64m2_m(...) __riscv_vloxei64_v_u64m2_tumu(__VA_ARGS__)
+#define vloxei64_v_u64m4_m(...) __riscv_vloxei64_v_u64m4_tumu(__VA_ARGS__)
+#define vloxei64_v_u64m8_m(...) __riscv_vloxei64_v_u64m8_tumu(__VA_ARGS__)
+#define vluxei8_v_u8mf8_m(...) __riscv_vluxei8_v_u8mf8_tumu(__VA_ARGS__) // vluxei*_m aliases (u8/u16/u32/u64 suffixes): legacy *_m name -> __riscv_*_tumu, arguments forwarded unchanged
+#define vluxei8_v_u8mf4_m(...) __riscv_vluxei8_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxei8_v_u8mf2_m(...) __riscv_vluxei8_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxei8_v_u8m1_m(...) __riscv_vluxei8_v_u8m1_tumu(__VA_ARGS__)
+#define vluxei8_v_u8m2_m(...) __riscv_vluxei8_v_u8m2_tumu(__VA_ARGS__)
+#define vluxei8_v_u8m4_m(...) __riscv_vluxei8_v_u8m4_tumu(__VA_ARGS__)
+#define vluxei8_v_u8m8_m(...) __riscv_vluxei8_v_u8m8_tumu(__VA_ARGS__)
+#define vluxei16_v_u8mf8_m(...) __riscv_vluxei16_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxei16_v_u8mf4_m(...) __riscv_vluxei16_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxei16_v_u8mf2_m(...) __riscv_vluxei16_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxei16_v_u8m1_m(...) __riscv_vluxei16_v_u8m1_tumu(__VA_ARGS__)
+#define vluxei16_v_u8m2_m(...) __riscv_vluxei16_v_u8m2_tumu(__VA_ARGS__)
+#define vluxei16_v_u8m4_m(...) __riscv_vluxei16_v_u8m4_tumu(__VA_ARGS__)
+#define vluxei32_v_u8mf8_m(...) __riscv_vluxei32_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxei32_v_u8mf4_m(...) __riscv_vluxei32_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxei32_v_u8mf2_m(...) __riscv_vluxei32_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxei32_v_u8m1_m(...) __riscv_vluxei32_v_u8m1_tumu(__VA_ARGS__)
+#define vluxei32_v_u8m2_m(...) __riscv_vluxei32_v_u8m2_tumu(__VA_ARGS__)
+#define vluxei64_v_u8mf8_m(...) __riscv_vluxei64_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxei64_v_u8mf4_m(...) __riscv_vluxei64_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxei64_v_u8mf2_m(...) __riscv_vluxei64_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxei64_v_u8m1_m(...) __riscv_vluxei64_v_u8m1_tumu(__VA_ARGS__)
+#define vluxei8_v_u16mf4_m(...) __riscv_vluxei8_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxei8_v_u16mf2_m(...) __riscv_vluxei8_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxei8_v_u16m1_m(...) __riscv_vluxei8_v_u16m1_tumu(__VA_ARGS__)
+#define vluxei8_v_u16m2_m(...) __riscv_vluxei8_v_u16m2_tumu(__VA_ARGS__)
+#define vluxei8_v_u16m4_m(...) __riscv_vluxei8_v_u16m4_tumu(__VA_ARGS__)
+#define vluxei8_v_u16m8_m(...) __riscv_vluxei8_v_u16m8_tumu(__VA_ARGS__)
+#define vluxei16_v_u16mf4_m(...) __riscv_vluxei16_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxei16_v_u16mf2_m(...) __riscv_vluxei16_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxei16_v_u16m1_m(...) __riscv_vluxei16_v_u16m1_tumu(__VA_ARGS__)
+#define vluxei16_v_u16m2_m(...) __riscv_vluxei16_v_u16m2_tumu(__VA_ARGS__)
+#define vluxei16_v_u16m4_m(...) __riscv_vluxei16_v_u16m4_tumu(__VA_ARGS__)
+#define vluxei16_v_u16m8_m(...) __riscv_vluxei16_v_u16m8_tumu(__VA_ARGS__)
+#define vluxei32_v_u16mf4_m(...) __riscv_vluxei32_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxei32_v_u16mf2_m(...) __riscv_vluxei32_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxei32_v_u16m1_m(...) __riscv_vluxei32_v_u16m1_tumu(__VA_ARGS__)
+#define vluxei32_v_u16m2_m(...) __riscv_vluxei32_v_u16m2_tumu(__VA_ARGS__)
+#define vluxei32_v_u16m4_m(...) __riscv_vluxei32_v_u16m4_tumu(__VA_ARGS__)
+#define vluxei64_v_u16mf4_m(...) __riscv_vluxei64_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxei64_v_u16mf2_m(...) __riscv_vluxei64_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxei64_v_u16m1_m(...) __riscv_vluxei64_v_u16m1_tumu(__VA_ARGS__)
+#define vluxei64_v_u16m2_m(...) __riscv_vluxei64_v_u16m2_tumu(__VA_ARGS__)
+#define vluxei8_v_u32mf2_m(...) __riscv_vluxei8_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxei8_v_u32m1_m(...) __riscv_vluxei8_v_u32m1_tumu(__VA_ARGS__)
+#define vluxei8_v_u32m2_m(...) __riscv_vluxei8_v_u32m2_tumu(__VA_ARGS__)
+#define vluxei8_v_u32m4_m(...) __riscv_vluxei8_v_u32m4_tumu(__VA_ARGS__)
+#define vluxei8_v_u32m8_m(...) __riscv_vluxei8_v_u32m8_tumu(__VA_ARGS__)
+#define vluxei16_v_u32mf2_m(...) __riscv_vluxei16_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxei16_v_u32m1_m(...) __riscv_vluxei16_v_u32m1_tumu(__VA_ARGS__)
+#define vluxei16_v_u32m2_m(...) __riscv_vluxei16_v_u32m2_tumu(__VA_ARGS__)
+#define vluxei16_v_u32m4_m(...) __riscv_vluxei16_v_u32m4_tumu(__VA_ARGS__)
+#define vluxei16_v_u32m8_m(...) __riscv_vluxei16_v_u32m8_tumu(__VA_ARGS__)
+#define vluxei32_v_u32mf2_m(...) __riscv_vluxei32_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxei32_v_u32m1_m(...) __riscv_vluxei32_v_u32m1_tumu(__VA_ARGS__)
+#define vluxei32_v_u32m2_m(...) __riscv_vluxei32_v_u32m2_tumu(__VA_ARGS__)
+#define vluxei32_v_u32m4_m(...) __riscv_vluxei32_v_u32m4_tumu(__VA_ARGS__)
+#define vluxei32_v_u32m8_m(...) __riscv_vluxei32_v_u32m8_tumu(__VA_ARGS__)
+#define vluxei64_v_u32mf2_m(...) __riscv_vluxei64_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxei64_v_u32m1_m(...) __riscv_vluxei64_v_u32m1_tumu(__VA_ARGS__)
+#define vluxei64_v_u32m2_m(...) __riscv_vluxei64_v_u32m2_tumu(__VA_ARGS__)
+#define vluxei64_v_u32m4_m(...) __riscv_vluxei64_v_u32m4_tumu(__VA_ARGS__)
+#define vluxei8_v_u64m1_m(...) __riscv_vluxei8_v_u64m1_tumu(__VA_ARGS__)
+#define vluxei8_v_u64m2_m(...) __riscv_vluxei8_v_u64m2_tumu(__VA_ARGS__)
+#define vluxei8_v_u64m4_m(...) __riscv_vluxei8_v_u64m4_tumu(__VA_ARGS__)
+#define vluxei8_v_u64m8_m(...) __riscv_vluxei8_v_u64m8_tumu(__VA_ARGS__)
+#define vluxei16_v_u64m1_m(...) __riscv_vluxei16_v_u64m1_tumu(__VA_ARGS__)
+#define vluxei16_v_u64m2_m(...) __riscv_vluxei16_v_u64m2_tumu(__VA_ARGS__)
+#define vluxei16_v_u64m4_m(...) __riscv_vluxei16_v_u64m4_tumu(__VA_ARGS__)
+#define vluxei16_v_u64m8_m(...) __riscv_vluxei16_v_u64m8_tumu(__VA_ARGS__)
+#define vluxei32_v_u64m1_m(...) __riscv_vluxei32_v_u64m1_tumu(__VA_ARGS__)
+#define vluxei32_v_u64m2_m(...) __riscv_vluxei32_v_u64m2_tumu(__VA_ARGS__)
+#define vluxei32_v_u64m4_m(...) __riscv_vluxei32_v_u64m4_tumu(__VA_ARGS__)
+#define vluxei32_v_u64m8_m(...) __riscv_vluxei32_v_u64m8_tumu(__VA_ARGS__)
+#define vluxei64_v_u64m1_m(...) __riscv_vluxei64_v_u64m1_tumu(__VA_ARGS__)
+#define vluxei64_v_u64m2_m(...) __riscv_vluxei64_v_u64m2_tumu(__VA_ARGS__)
+#define vluxei64_v_u64m4_m(...) __riscv_vluxei64_v_u64m4_tumu(__VA_ARGS__)
+#define vluxei64_v_u64m8_m(...) __riscv_vluxei64_v_u64m8_tumu(__VA_ARGS__)
+#define vsoxei8_v_f16mf4(...) __riscv_vsoxei8_v_f16mf4(__VA_ARGS__) // vsoxei* aliases without an _m suffix (f16/f32/f64 suffixes): legacy name -> same name with a __riscv_ prefix, arguments forwarded unchanged
+#define vsoxei8_v_f16mf2(...) __riscv_vsoxei8_v_f16mf2(__VA_ARGS__)
+#define vsoxei8_v_f16m1(...) __riscv_vsoxei8_v_f16m1(__VA_ARGS__)
+#define vsoxei8_v_f16m2(...) __riscv_vsoxei8_v_f16m2(__VA_ARGS__)
+#define vsoxei8_v_f16m4(...) __riscv_vsoxei8_v_f16m4(__VA_ARGS__)
+#define vsoxei8_v_f16m8(...) __riscv_vsoxei8_v_f16m8(__VA_ARGS__)
+#define vsoxei16_v_f16mf4(...) __riscv_vsoxei16_v_f16mf4(__VA_ARGS__)
+#define vsoxei16_v_f16mf2(...) __riscv_vsoxei16_v_f16mf2(__VA_ARGS__)
+#define vsoxei16_v_f16m1(...) __riscv_vsoxei16_v_f16m1(__VA_ARGS__)
+#define vsoxei16_v_f16m2(...) __riscv_vsoxei16_v_f16m2(__VA_ARGS__)
+#define vsoxei16_v_f16m4(...) __riscv_vsoxei16_v_f16m4(__VA_ARGS__)
+#define vsoxei16_v_f16m8(...) __riscv_vsoxei16_v_f16m8(__VA_ARGS__)
+#define vsoxei32_v_f16mf4(...) __riscv_vsoxei32_v_f16mf4(__VA_ARGS__)
+#define vsoxei32_v_f16mf2(...) __riscv_vsoxei32_v_f16mf2(__VA_ARGS__)
+#define vsoxei32_v_f16m1(...) __riscv_vsoxei32_v_f16m1(__VA_ARGS__)
+#define vsoxei32_v_f16m2(...) __riscv_vsoxei32_v_f16m2(__VA_ARGS__)
+#define vsoxei32_v_f16m4(...) __riscv_vsoxei32_v_f16m4(__VA_ARGS__)
+#define vsoxei64_v_f16mf4(...) __riscv_vsoxei64_v_f16mf4(__VA_ARGS__)
+#define vsoxei64_v_f16mf2(...) __riscv_vsoxei64_v_f16mf2(__VA_ARGS__)
+#define vsoxei64_v_f16m1(...) __riscv_vsoxei64_v_f16m1(__VA_ARGS__)
+#define vsoxei64_v_f16m2(...) __riscv_vsoxei64_v_f16m2(__VA_ARGS__)
+#define vsoxei8_v_f32mf2(...) __riscv_vsoxei8_v_f32mf2(__VA_ARGS__)
+#define vsoxei8_v_f32m1(...) __riscv_vsoxei8_v_f32m1(__VA_ARGS__)
+#define vsoxei8_v_f32m2(...) __riscv_vsoxei8_v_f32m2(__VA_ARGS__)
+#define vsoxei8_v_f32m4(...) __riscv_vsoxei8_v_f32m4(__VA_ARGS__)
+#define vsoxei8_v_f32m8(...) __riscv_vsoxei8_v_f32m8(__VA_ARGS__)
+#define vsoxei16_v_f32mf2(...) __riscv_vsoxei16_v_f32mf2(__VA_ARGS__)
+#define vsoxei16_v_f32m1(...) __riscv_vsoxei16_v_f32m1(__VA_ARGS__)
+#define vsoxei16_v_f32m2(...) __riscv_vsoxei16_v_f32m2(__VA_ARGS__)
+#define vsoxei16_v_f32m4(...) __riscv_vsoxei16_v_f32m4(__VA_ARGS__)
+#define vsoxei16_v_f32m8(...) __riscv_vsoxei16_v_f32m8(__VA_ARGS__)
+#define vsoxei32_v_f32mf2(...) __riscv_vsoxei32_v_f32mf2(__VA_ARGS__)
+#define vsoxei32_v_f32m1(...) __riscv_vsoxei32_v_f32m1(__VA_ARGS__)
+#define vsoxei32_v_f32m2(...) __riscv_vsoxei32_v_f32m2(__VA_ARGS__)
+#define vsoxei32_v_f32m4(...) __riscv_vsoxei32_v_f32m4(__VA_ARGS__)
+#define vsoxei32_v_f32m8(...) __riscv_vsoxei32_v_f32m8(__VA_ARGS__)
+#define vsoxei64_v_f32mf2(...) __riscv_vsoxei64_v_f32mf2(__VA_ARGS__)
+#define vsoxei64_v_f32m1(...) __riscv_vsoxei64_v_f32m1(__VA_ARGS__)
+#define vsoxei64_v_f32m2(...) __riscv_vsoxei64_v_f32m2(__VA_ARGS__)
+#define vsoxei64_v_f32m4(...) __riscv_vsoxei64_v_f32m4(__VA_ARGS__)
+#define vsoxei8_v_f64m1(...) __riscv_vsoxei8_v_f64m1(__VA_ARGS__)
+#define vsoxei8_v_f64m2(...) __riscv_vsoxei8_v_f64m2(__VA_ARGS__)
+#define vsoxei8_v_f64m4(...) __riscv_vsoxei8_v_f64m4(__VA_ARGS__)
+#define vsoxei8_v_f64m8(...) __riscv_vsoxei8_v_f64m8(__VA_ARGS__)
+#define vsoxei16_v_f64m1(...) __riscv_vsoxei16_v_f64m1(__VA_ARGS__)
+#define vsoxei16_v_f64m2(...) __riscv_vsoxei16_v_f64m2(__VA_ARGS__)
+#define vsoxei16_v_f64m4(...) __riscv_vsoxei16_v_f64m4(__VA_ARGS__)
+#define vsoxei16_v_f64m8(...) __riscv_vsoxei16_v_f64m8(__VA_ARGS__)
+#define vsoxei32_v_f64m1(...) __riscv_vsoxei32_v_f64m1(__VA_ARGS__)
+#define vsoxei32_v_f64m2(...) __riscv_vsoxei32_v_f64m2(__VA_ARGS__)
+#define vsoxei32_v_f64m4(...) __riscv_vsoxei32_v_f64m4(__VA_ARGS__)
+#define vsoxei32_v_f64m8(...) __riscv_vsoxei32_v_f64m8(__VA_ARGS__)
+#define vsoxei64_v_f64m1(...) __riscv_vsoxei64_v_f64m1(__VA_ARGS__)
+#define vsoxei64_v_f64m2(...) __riscv_vsoxei64_v_f64m2(__VA_ARGS__)
+#define vsoxei64_v_f64m4(...) __riscv_vsoxei64_v_f64m4(__VA_ARGS__)
+#define vsoxei64_v_f64m8(...) __riscv_vsoxei64_v_f64m8(__VA_ARGS__)
+#define vsuxei8_v_f16mf4(...) __riscv_vsuxei8_v_f16mf4(__VA_ARGS__) // vsuxei* aliases without an _m suffix (f16/f32/f64 suffixes): legacy name -> same name with a __riscv_ prefix, arguments forwarded unchanged
+#define vsuxei8_v_f16mf2(...) __riscv_vsuxei8_v_f16mf2(__VA_ARGS__)
+#define vsuxei8_v_f16m1(...) __riscv_vsuxei8_v_f16m1(__VA_ARGS__)
+#define vsuxei8_v_f16m2(...) __riscv_vsuxei8_v_f16m2(__VA_ARGS__)
+#define vsuxei8_v_f16m4(...) __riscv_vsuxei8_v_f16m4(__VA_ARGS__)
+#define vsuxei8_v_f16m8(...) __riscv_vsuxei8_v_f16m8(__VA_ARGS__)
+#define vsuxei16_v_f16mf4(...) __riscv_vsuxei16_v_f16mf4(__VA_ARGS__)
+#define vsuxei16_v_f16mf2(...) __riscv_vsuxei16_v_f16mf2(__VA_ARGS__)
+#define vsuxei16_v_f16m1(...) __riscv_vsuxei16_v_f16m1(__VA_ARGS__)
+#define vsuxei16_v_f16m2(...) __riscv_vsuxei16_v_f16m2(__VA_ARGS__)
+#define vsuxei16_v_f16m4(...) __riscv_vsuxei16_v_f16m4(__VA_ARGS__)
+#define vsuxei16_v_f16m8(...) __riscv_vsuxei16_v_f16m8(__VA_ARGS__)
+#define vsuxei32_v_f16mf4(...) __riscv_vsuxei32_v_f16mf4(__VA_ARGS__)
+#define vsuxei32_v_f16mf2(...) __riscv_vsuxei32_v_f16mf2(__VA_ARGS__)
+#define vsuxei32_v_f16m1(...) __riscv_vsuxei32_v_f16m1(__VA_ARGS__)
+#define vsuxei32_v_f16m2(...) __riscv_vsuxei32_v_f16m2(__VA_ARGS__)
+#define vsuxei32_v_f16m4(...) __riscv_vsuxei32_v_f16m4(__VA_ARGS__)
+#define vsuxei64_v_f16mf4(...) __riscv_vsuxei64_v_f16mf4(__VA_ARGS__)
+#define vsuxei64_v_f16mf2(...) __riscv_vsuxei64_v_f16mf2(__VA_ARGS__)
+#define vsuxei64_v_f16m1(...) __riscv_vsuxei64_v_f16m1(__VA_ARGS__)
+#define vsuxei64_v_f16m2(...) __riscv_vsuxei64_v_f16m2(__VA_ARGS__)
+#define vsuxei8_v_f32mf2(...) __riscv_vsuxei8_v_f32mf2(__VA_ARGS__)
+#define vsuxei8_v_f32m1(...) __riscv_vsuxei8_v_f32m1(__VA_ARGS__)
+#define vsuxei8_v_f32m2(...) __riscv_vsuxei8_v_f32m2(__VA_ARGS__)
+#define vsuxei8_v_f32m4(...) __riscv_vsuxei8_v_f32m4(__VA_ARGS__)
+#define vsuxei8_v_f32m8(...) __riscv_vsuxei8_v_f32m8(__VA_ARGS__)
+#define vsuxei16_v_f32mf2(...) __riscv_vsuxei16_v_f32mf2(__VA_ARGS__)
+#define vsuxei16_v_f32m1(...) __riscv_vsuxei16_v_f32m1(__VA_ARGS__)
+#define vsuxei16_v_f32m2(...) __riscv_vsuxei16_v_f32m2(__VA_ARGS__)
+#define vsuxei16_v_f32m4(...) __riscv_vsuxei16_v_f32m4(__VA_ARGS__)
+#define vsuxei16_v_f32m8(...) __riscv_vsuxei16_v_f32m8(__VA_ARGS__)
+#define vsuxei32_v_f32mf2(...) __riscv_vsuxei32_v_f32mf2(__VA_ARGS__)
+#define vsuxei32_v_f32m1(...) __riscv_vsuxei32_v_f32m1(__VA_ARGS__)
+#define vsuxei32_v_f32m2(...) __riscv_vsuxei32_v_f32m2(__VA_ARGS__)
+#define vsuxei32_v_f32m4(...) __riscv_vsuxei32_v_f32m4(__VA_ARGS__)
+#define vsuxei32_v_f32m8(...) __riscv_vsuxei32_v_f32m8(__VA_ARGS__)
+#define vsuxei64_v_f32mf2(...) __riscv_vsuxei64_v_f32mf2(__VA_ARGS__)
+#define vsuxei64_v_f32m1(...) __riscv_vsuxei64_v_f32m1(__VA_ARGS__)
+#define vsuxei64_v_f32m2(...) __riscv_vsuxei64_v_f32m2(__VA_ARGS__)
+#define vsuxei64_v_f32m4(...) __riscv_vsuxei64_v_f32m4(__VA_ARGS__)
+#define vsuxei8_v_f64m1(...) __riscv_vsuxei8_v_f64m1(__VA_ARGS__)
+#define vsuxei8_v_f64m2(...) __riscv_vsuxei8_v_f64m2(__VA_ARGS__)
+#define vsuxei8_v_f64m4(...) __riscv_vsuxei8_v_f64m4(__VA_ARGS__)
+#define vsuxei8_v_f64m8(...) __riscv_vsuxei8_v_f64m8(__VA_ARGS__)
+#define vsuxei16_v_f64m1(...) __riscv_vsuxei16_v_f64m1(__VA_ARGS__)
+#define vsuxei16_v_f64m2(...) __riscv_vsuxei16_v_f64m2(__VA_ARGS__)
+#define vsuxei16_v_f64m4(...) __riscv_vsuxei16_v_f64m4(__VA_ARGS__)
+#define vsuxei16_v_f64m8(...) __riscv_vsuxei16_v_f64m8(__VA_ARGS__)
+#define vsuxei32_v_f64m1(...) __riscv_vsuxei32_v_f64m1(__VA_ARGS__)
+#define vsuxei32_v_f64m2(...) __riscv_vsuxei32_v_f64m2(__VA_ARGS__)
+#define vsuxei32_v_f64m4(...) __riscv_vsuxei32_v_f64m4(__VA_ARGS__)
+#define vsuxei32_v_f64m8(...) __riscv_vsuxei32_v_f64m8(__VA_ARGS__)
+#define vsuxei64_v_f64m1(...) __riscv_vsuxei64_v_f64m1(__VA_ARGS__)
+#define vsuxei64_v_f64m2(...) __riscv_vsuxei64_v_f64m2(__VA_ARGS__)
+#define vsuxei64_v_f64m4(...) __riscv_vsuxei64_v_f64m4(__VA_ARGS__)
+#define vsuxei64_v_f64m8(...) __riscv_vsuxei64_v_f64m8(__VA_ARGS__)
+#define vsoxei8_v_i8mf8(...) __riscv_vsoxei8_v_i8mf8(__VA_ARGS__) // vsoxei* aliases without an _m suffix (i8/i16/i32/i64 suffixes): legacy name -> same name with a __riscv_ prefix, arguments forwarded unchanged
+#define vsoxei8_v_i8mf4(...) __riscv_vsoxei8_v_i8mf4(__VA_ARGS__)
+#define vsoxei8_v_i8mf2(...) __riscv_vsoxei8_v_i8mf2(__VA_ARGS__)
+#define vsoxei8_v_i8m1(...) __riscv_vsoxei8_v_i8m1(__VA_ARGS__)
+#define vsoxei8_v_i8m2(...) __riscv_vsoxei8_v_i8m2(__VA_ARGS__)
+#define vsoxei8_v_i8m4(...) __riscv_vsoxei8_v_i8m4(__VA_ARGS__)
+#define vsoxei8_v_i8m8(...) __riscv_vsoxei8_v_i8m8(__VA_ARGS__)
+#define vsoxei16_v_i8mf8(...) __riscv_vsoxei16_v_i8mf8(__VA_ARGS__)
+#define vsoxei16_v_i8mf4(...) __riscv_vsoxei16_v_i8mf4(__VA_ARGS__)
+#define vsoxei16_v_i8mf2(...) __riscv_vsoxei16_v_i8mf2(__VA_ARGS__)
+#define vsoxei16_v_i8m1(...) __riscv_vsoxei16_v_i8m1(__VA_ARGS__)
+#define vsoxei16_v_i8m2(...) __riscv_vsoxei16_v_i8m2(__VA_ARGS__)
+#define vsoxei16_v_i8m4(...) __riscv_vsoxei16_v_i8m4(__VA_ARGS__)
+#define vsoxei32_v_i8mf8(...) __riscv_vsoxei32_v_i8mf8(__VA_ARGS__)
+#define vsoxei32_v_i8mf4(...) __riscv_vsoxei32_v_i8mf4(__VA_ARGS__)
+#define vsoxei32_v_i8mf2(...) __riscv_vsoxei32_v_i8mf2(__VA_ARGS__)
+#define vsoxei32_v_i8m1(...) __riscv_vsoxei32_v_i8m1(__VA_ARGS__)
+#define vsoxei32_v_i8m2(...) __riscv_vsoxei32_v_i8m2(__VA_ARGS__)
+#define vsoxei64_v_i8mf8(...) __riscv_vsoxei64_v_i8mf8(__VA_ARGS__)
+#define vsoxei64_v_i8mf4(...) __riscv_vsoxei64_v_i8mf4(__VA_ARGS__)
+#define vsoxei64_v_i8mf2(...) __riscv_vsoxei64_v_i8mf2(__VA_ARGS__)
+#define vsoxei64_v_i8m1(...) __riscv_vsoxei64_v_i8m1(__VA_ARGS__)
+#define vsoxei8_v_i16mf4(...) __riscv_vsoxei8_v_i16mf4(__VA_ARGS__)
+#define vsoxei8_v_i16mf2(...) __riscv_vsoxei8_v_i16mf2(__VA_ARGS__)
+#define vsoxei8_v_i16m1(...) __riscv_vsoxei8_v_i16m1(__VA_ARGS__)
+#define vsoxei8_v_i16m2(...) __riscv_vsoxei8_v_i16m2(__VA_ARGS__)
+#define vsoxei8_v_i16m4(...) __riscv_vsoxei8_v_i16m4(__VA_ARGS__)
+#define vsoxei8_v_i16m8(...) __riscv_vsoxei8_v_i16m8(__VA_ARGS__)
+#define vsoxei16_v_i16mf4(...) __riscv_vsoxei16_v_i16mf4(__VA_ARGS__)
+#define vsoxei16_v_i16mf2(...) __riscv_vsoxei16_v_i16mf2(__VA_ARGS__)
+#define vsoxei16_v_i16m1(...) __riscv_vsoxei16_v_i16m1(__VA_ARGS__)
+#define vsoxei16_v_i16m2(...) __riscv_vsoxei16_v_i16m2(__VA_ARGS__)
+#define vsoxei16_v_i16m4(...) __riscv_vsoxei16_v_i16m4(__VA_ARGS__)
+#define vsoxei16_v_i16m8(...) __riscv_vsoxei16_v_i16m8(__VA_ARGS__)
+#define vsoxei32_v_i16mf4(...) __riscv_vsoxei32_v_i16mf4(__VA_ARGS__)
+#define vsoxei32_v_i16mf2(...) __riscv_vsoxei32_v_i16mf2(__VA_ARGS__)
+#define vsoxei32_v_i16m1(...) __riscv_vsoxei32_v_i16m1(__VA_ARGS__)
+#define vsoxei32_v_i16m2(...) __riscv_vsoxei32_v_i16m2(__VA_ARGS__)
+#define vsoxei32_v_i16m4(...) __riscv_vsoxei32_v_i16m4(__VA_ARGS__)
+#define vsoxei64_v_i16mf4(...) __riscv_vsoxei64_v_i16mf4(__VA_ARGS__)
+#define vsoxei64_v_i16mf2(...) __riscv_vsoxei64_v_i16mf2(__VA_ARGS__)
+#define vsoxei64_v_i16m1(...) __riscv_vsoxei64_v_i16m1(__VA_ARGS__)
+#define vsoxei64_v_i16m2(...) __riscv_vsoxei64_v_i16m2(__VA_ARGS__)
+#define vsoxei8_v_i32mf2(...) __riscv_vsoxei8_v_i32mf2(__VA_ARGS__)
+#define vsoxei8_v_i32m1(...) __riscv_vsoxei8_v_i32m1(__VA_ARGS__)
+#define vsoxei8_v_i32m2(...) __riscv_vsoxei8_v_i32m2(__VA_ARGS__)
+#define vsoxei8_v_i32m4(...) __riscv_vsoxei8_v_i32m4(__VA_ARGS__)
+#define vsoxei8_v_i32m8(...) __riscv_vsoxei8_v_i32m8(__VA_ARGS__)
+#define vsoxei16_v_i32mf2(...) __riscv_vsoxei16_v_i32mf2(__VA_ARGS__)
+#define vsoxei16_v_i32m1(...) __riscv_vsoxei16_v_i32m1(__VA_ARGS__)
+#define vsoxei16_v_i32m2(...) __riscv_vsoxei16_v_i32m2(__VA_ARGS__)
+#define vsoxei16_v_i32m4(...) __riscv_vsoxei16_v_i32m4(__VA_ARGS__)
+#define vsoxei16_v_i32m8(...) __riscv_vsoxei16_v_i32m8(__VA_ARGS__)
+#define vsoxei32_v_i32mf2(...) __riscv_vsoxei32_v_i32mf2(__VA_ARGS__)
+#define vsoxei32_v_i32m1(...) __riscv_vsoxei32_v_i32m1(__VA_ARGS__)
+#define vsoxei32_v_i32m2(...) __riscv_vsoxei32_v_i32m2(__VA_ARGS__)
+#define vsoxei32_v_i32m4(...) __riscv_vsoxei32_v_i32m4(__VA_ARGS__)
+#define vsoxei32_v_i32m8(...) __riscv_vsoxei32_v_i32m8(__VA_ARGS__)
+#define vsoxei64_v_i32mf2(...) __riscv_vsoxei64_v_i32mf2(__VA_ARGS__)
+#define vsoxei64_v_i32m1(...) __riscv_vsoxei64_v_i32m1(__VA_ARGS__)
+#define vsoxei64_v_i32m2(...) __riscv_vsoxei64_v_i32m2(__VA_ARGS__)
+#define vsoxei64_v_i32m4(...) __riscv_vsoxei64_v_i32m4(__VA_ARGS__)
+#define vsoxei8_v_i64m1(...) __riscv_vsoxei8_v_i64m1(__VA_ARGS__)
+#define vsoxei8_v_i64m2(...) __riscv_vsoxei8_v_i64m2(__VA_ARGS__)
+#define vsoxei8_v_i64m4(...) __riscv_vsoxei8_v_i64m4(__VA_ARGS__)
+#define vsoxei8_v_i64m8(...) __riscv_vsoxei8_v_i64m8(__VA_ARGS__)
+#define vsoxei16_v_i64m1(...) __riscv_vsoxei16_v_i64m1(__VA_ARGS__)
+#define vsoxei16_v_i64m2(...) __riscv_vsoxei16_v_i64m2(__VA_ARGS__)
+#define vsoxei16_v_i64m4(...) __riscv_vsoxei16_v_i64m4(__VA_ARGS__)
+#define vsoxei16_v_i64m8(...) __riscv_vsoxei16_v_i64m8(__VA_ARGS__)
+#define vsoxei32_v_i64m1(...) __riscv_vsoxei32_v_i64m1(__VA_ARGS__)
+#define vsoxei32_v_i64m2(...) __riscv_vsoxei32_v_i64m2(__VA_ARGS__)
+#define vsoxei32_v_i64m4(...) __riscv_vsoxei32_v_i64m4(__VA_ARGS__)
+#define vsoxei32_v_i64m8(...) __riscv_vsoxei32_v_i64m8(__VA_ARGS__)
+#define vsoxei64_v_i64m1(...) __riscv_vsoxei64_v_i64m1(__VA_ARGS__)
+#define vsoxei64_v_i64m2(...) __riscv_vsoxei64_v_i64m2(__VA_ARGS__)
+#define vsoxei64_v_i64m4(...) __riscv_vsoxei64_v_i64m4(__VA_ARGS__)
+#define vsoxei64_v_i64m8(...) __riscv_vsoxei64_v_i64m8(__VA_ARGS__)
+#define vsuxei8_v_i8mf8(...) __riscv_vsuxei8_v_i8mf8(__VA_ARGS__) // vsuxei* aliases without an _m suffix (i8/i16/i32/i64 suffixes): legacy name -> same name with a __riscv_ prefix, arguments forwarded unchanged
+#define vsuxei8_v_i8mf4(...) __riscv_vsuxei8_v_i8mf4(__VA_ARGS__)
+#define vsuxei8_v_i8mf2(...) __riscv_vsuxei8_v_i8mf2(__VA_ARGS__)
+#define vsuxei8_v_i8m1(...) __riscv_vsuxei8_v_i8m1(__VA_ARGS__)
+#define vsuxei8_v_i8m2(...) __riscv_vsuxei8_v_i8m2(__VA_ARGS__)
+#define vsuxei8_v_i8m4(...) __riscv_vsuxei8_v_i8m4(__VA_ARGS__)
+#define vsuxei8_v_i8m8(...) __riscv_vsuxei8_v_i8m8(__VA_ARGS__)
+#define vsuxei16_v_i8mf8(...) __riscv_vsuxei16_v_i8mf8(__VA_ARGS__)
+#define vsuxei16_v_i8mf4(...) __riscv_vsuxei16_v_i8mf4(__VA_ARGS__)
+#define vsuxei16_v_i8mf2(...) __riscv_vsuxei16_v_i8mf2(__VA_ARGS__)
+#define vsuxei16_v_i8m1(...) __riscv_vsuxei16_v_i8m1(__VA_ARGS__)
+#define vsuxei16_v_i8m2(...) __riscv_vsuxei16_v_i8m2(__VA_ARGS__)
+#define vsuxei16_v_i8m4(...) __riscv_vsuxei16_v_i8m4(__VA_ARGS__)
+#define vsuxei32_v_i8mf8(...) __riscv_vsuxei32_v_i8mf8(__VA_ARGS__)
+#define vsuxei32_v_i8mf4(...) __riscv_vsuxei32_v_i8mf4(__VA_ARGS__)
+#define vsuxei32_v_i8mf2(...) __riscv_vsuxei32_v_i8mf2(__VA_ARGS__)
+#define vsuxei32_v_i8m1(...) __riscv_vsuxei32_v_i8m1(__VA_ARGS__)
+#define vsuxei32_v_i8m2(...) __riscv_vsuxei32_v_i8m2(__VA_ARGS__)
+#define vsuxei64_v_i8mf8(...) __riscv_vsuxei64_v_i8mf8(__VA_ARGS__)
+#define vsuxei64_v_i8mf4(...) __riscv_vsuxei64_v_i8mf4(__VA_ARGS__)
+#define vsuxei64_v_i8mf2(...) __riscv_vsuxei64_v_i8mf2(__VA_ARGS__)
+#define vsuxei64_v_i8m1(...) __riscv_vsuxei64_v_i8m1(__VA_ARGS__)
+#define vsuxei8_v_i16mf4(...) __riscv_vsuxei8_v_i16mf4(__VA_ARGS__)
+#define vsuxei8_v_i16mf2(...) __riscv_vsuxei8_v_i16mf2(__VA_ARGS__)
+#define vsuxei8_v_i16m1(...) __riscv_vsuxei8_v_i16m1(__VA_ARGS__)
+#define vsuxei8_v_i16m2(...) __riscv_vsuxei8_v_i16m2(__VA_ARGS__)
+#define vsuxei8_v_i16m4(...) __riscv_vsuxei8_v_i16m4(__VA_ARGS__)
+#define vsuxei8_v_i16m8(...) __riscv_vsuxei8_v_i16m8(__VA_ARGS__)
+#define vsuxei16_v_i16mf4(...) __riscv_vsuxei16_v_i16mf4(__VA_ARGS__)
+#define vsuxei16_v_i16mf2(...) __riscv_vsuxei16_v_i16mf2(__VA_ARGS__)
+#define vsuxei16_v_i16m1(...) __riscv_vsuxei16_v_i16m1(__VA_ARGS__)
+#define vsuxei16_v_i16m2(...) __riscv_vsuxei16_v_i16m2(__VA_ARGS__)
+#define vsuxei16_v_i16m4(...) __riscv_vsuxei16_v_i16m4(__VA_ARGS__)
+#define vsuxei16_v_i16m8(...) __riscv_vsuxei16_v_i16m8(__VA_ARGS__)
+#define vsuxei32_v_i16mf4(...) __riscv_vsuxei32_v_i16mf4(__VA_ARGS__)
+#define vsuxei32_v_i16mf2(...) __riscv_vsuxei32_v_i16mf2(__VA_ARGS__)
+#define vsuxei32_v_i16m1(...) __riscv_vsuxei32_v_i16m1(__VA_ARGS__)
+#define vsuxei32_v_i16m2(...) __riscv_vsuxei32_v_i16m2(__VA_ARGS__)
+#define vsuxei32_v_i16m4(...) __riscv_vsuxei32_v_i16m4(__VA_ARGS__)
+#define vsuxei64_v_i16mf4(...) __riscv_vsuxei64_v_i16mf4(__VA_ARGS__)
+#define vsuxei64_v_i16mf2(...) __riscv_vsuxei64_v_i16mf2(__VA_ARGS__)
+#define vsuxei64_v_i16m1(...) __riscv_vsuxei64_v_i16m1(__VA_ARGS__)
+#define vsuxei64_v_i16m2(...) __riscv_vsuxei64_v_i16m2(__VA_ARGS__)
+#define vsuxei8_v_i32mf2(...) __riscv_vsuxei8_v_i32mf2(__VA_ARGS__)
+#define vsuxei8_v_i32m1(...) __riscv_vsuxei8_v_i32m1(__VA_ARGS__)
+#define vsuxei8_v_i32m2(...) __riscv_vsuxei8_v_i32m2(__VA_ARGS__)
+#define vsuxei8_v_i32m4(...) __riscv_vsuxei8_v_i32m4(__VA_ARGS__)
+#define vsuxei8_v_i32m8(...) __riscv_vsuxei8_v_i32m8(__VA_ARGS__)
+#define vsuxei16_v_i32mf2(...) __riscv_vsuxei16_v_i32mf2(__VA_ARGS__)
+#define vsuxei16_v_i32m1(...) __riscv_vsuxei16_v_i32m1(__VA_ARGS__)
+#define vsuxei16_v_i32m2(...) __riscv_vsuxei16_v_i32m2(__VA_ARGS__)
+#define vsuxei16_v_i32m4(...) __riscv_vsuxei16_v_i32m4(__VA_ARGS__)
+#define vsuxei16_v_i32m8(...) __riscv_vsuxei16_v_i32m8(__VA_ARGS__)
+#define vsuxei32_v_i32mf2(...) __riscv_vsuxei32_v_i32mf2(__VA_ARGS__)
+#define vsuxei32_v_i32m1(...) __riscv_vsuxei32_v_i32m1(__VA_ARGS__)
+#define vsuxei32_v_i32m2(...) __riscv_vsuxei32_v_i32m2(__VA_ARGS__)
+#define vsuxei32_v_i32m4(...) __riscv_vsuxei32_v_i32m4(__VA_ARGS__)
+#define vsuxei32_v_i32m8(...) __riscv_vsuxei32_v_i32m8(__VA_ARGS__)
+#define vsuxei64_v_i32mf2(...) __riscv_vsuxei64_v_i32mf2(__VA_ARGS__)
+#define vsuxei64_v_i32m1(...) __riscv_vsuxei64_v_i32m1(__VA_ARGS__)
+#define vsuxei64_v_i32m2(...) __riscv_vsuxei64_v_i32m2(__VA_ARGS__)
+#define vsuxei64_v_i32m4(...) __riscv_vsuxei64_v_i32m4(__VA_ARGS__)
+#define vsuxei8_v_i64m1(...) __riscv_vsuxei8_v_i64m1(__VA_ARGS__)
+#define vsuxei8_v_i64m2(...) __riscv_vsuxei8_v_i64m2(__VA_ARGS__)
+#define vsuxei8_v_i64m4(...) __riscv_vsuxei8_v_i64m4(__VA_ARGS__)
+#define vsuxei8_v_i64m8(...) __riscv_vsuxei8_v_i64m8(__VA_ARGS__)
+#define vsuxei16_v_i64m1(...) __riscv_vsuxei16_v_i64m1(__VA_ARGS__)
+#define vsuxei16_v_i64m2(...) __riscv_vsuxei16_v_i64m2(__VA_ARGS__)
+#define vsuxei16_v_i64m4(...) __riscv_vsuxei16_v_i64m4(__VA_ARGS__)
+#define vsuxei16_v_i64m8(...) __riscv_vsuxei16_v_i64m8(__VA_ARGS__)
+#define vsuxei32_v_i64m1(...) __riscv_vsuxei32_v_i64m1(__VA_ARGS__)
+#define vsuxei32_v_i64m2(...) __riscv_vsuxei32_v_i64m2(__VA_ARGS__)
+#define vsuxei32_v_i64m4(...) __riscv_vsuxei32_v_i64m4(__VA_ARGS__)
+#define vsuxei32_v_i64m8(...) __riscv_vsuxei32_v_i64m8(__VA_ARGS__)
+#define vsuxei64_v_i64m1(...) __riscv_vsuxei64_v_i64m1(__VA_ARGS__)
+#define vsuxei64_v_i64m2(...) __riscv_vsuxei64_v_i64m2(__VA_ARGS__)
+#define vsuxei64_v_i64m4(...) __riscv_vsuxei64_v_i64m4(__VA_ARGS__)
+#define vsuxei64_v_i64m8(...) __riscv_vsuxei64_v_i64m8(__VA_ARGS__)
+#define vsoxei8_v_u8mf8(...) __riscv_vsoxei8_v_u8mf8(__VA_ARGS__) // vsoxei* aliases without an _m suffix (u8/u16/u32/u64 suffixes): legacy name -> same name with a __riscv_ prefix, arguments forwarded unchanged
+#define vsoxei8_v_u8mf4(...) __riscv_vsoxei8_v_u8mf4(__VA_ARGS__)
+#define vsoxei8_v_u8mf2(...) __riscv_vsoxei8_v_u8mf2(__VA_ARGS__)
+#define vsoxei8_v_u8m1(...) __riscv_vsoxei8_v_u8m1(__VA_ARGS__)
+#define vsoxei8_v_u8m2(...) __riscv_vsoxei8_v_u8m2(__VA_ARGS__)
+#define vsoxei8_v_u8m4(...) __riscv_vsoxei8_v_u8m4(__VA_ARGS__)
+#define vsoxei8_v_u8m8(...) __riscv_vsoxei8_v_u8m8(__VA_ARGS__)
+#define vsoxei16_v_u8mf8(...) __riscv_vsoxei16_v_u8mf8(__VA_ARGS__)
+#define vsoxei16_v_u8mf4(...) __riscv_vsoxei16_v_u8mf4(__VA_ARGS__)
+#define vsoxei16_v_u8mf2(...) __riscv_vsoxei16_v_u8mf2(__VA_ARGS__)
+#define vsoxei16_v_u8m1(...) __riscv_vsoxei16_v_u8m1(__VA_ARGS__)
+#define vsoxei16_v_u8m2(...) __riscv_vsoxei16_v_u8m2(__VA_ARGS__)
+#define vsoxei16_v_u8m4(...) __riscv_vsoxei16_v_u8m4(__VA_ARGS__)
+#define vsoxei32_v_u8mf8(...) __riscv_vsoxei32_v_u8mf8(__VA_ARGS__)
+#define vsoxei32_v_u8mf4(...) __riscv_vsoxei32_v_u8mf4(__VA_ARGS__)
+#define vsoxei32_v_u8mf2(...) __riscv_vsoxei32_v_u8mf2(__VA_ARGS__)
+#define vsoxei32_v_u8m1(...) __riscv_vsoxei32_v_u8m1(__VA_ARGS__)
+#define vsoxei32_v_u8m2(...) __riscv_vsoxei32_v_u8m2(__VA_ARGS__)
+#define vsoxei64_v_u8mf8(...) __riscv_vsoxei64_v_u8mf8(__VA_ARGS__)
+#define vsoxei64_v_u8mf4(...) __riscv_vsoxei64_v_u8mf4(__VA_ARGS__)
+#define vsoxei64_v_u8mf2(...) __riscv_vsoxei64_v_u8mf2(__VA_ARGS__)
+#define vsoxei64_v_u8m1(...) __riscv_vsoxei64_v_u8m1(__VA_ARGS__)
+#define vsoxei8_v_u16mf4(...) __riscv_vsoxei8_v_u16mf4(__VA_ARGS__)
+#define vsoxei8_v_u16mf2(...) __riscv_vsoxei8_v_u16mf2(__VA_ARGS__)
+#define vsoxei8_v_u16m1(...) __riscv_vsoxei8_v_u16m1(__VA_ARGS__)
+#define vsoxei8_v_u16m2(...) __riscv_vsoxei8_v_u16m2(__VA_ARGS__)
+#define vsoxei8_v_u16m4(...) __riscv_vsoxei8_v_u16m4(__VA_ARGS__)
+#define vsoxei8_v_u16m8(...) __riscv_vsoxei8_v_u16m8(__VA_ARGS__)
+#define vsoxei16_v_u16mf4(...) __riscv_vsoxei16_v_u16mf4(__VA_ARGS__)
+#define vsoxei16_v_u16mf2(...) __riscv_vsoxei16_v_u16mf2(__VA_ARGS__)
+#define vsoxei16_v_u16m1(...) __riscv_vsoxei16_v_u16m1(__VA_ARGS__)
+#define vsoxei16_v_u16m2(...) __riscv_vsoxei16_v_u16m2(__VA_ARGS__)
+#define vsoxei16_v_u16m4(...) __riscv_vsoxei16_v_u16m4(__VA_ARGS__)
+#define vsoxei16_v_u16m8(...) __riscv_vsoxei16_v_u16m8(__VA_ARGS__)
+#define vsoxei32_v_u16mf4(...) __riscv_vsoxei32_v_u16mf4(__VA_ARGS__)
+#define vsoxei32_v_u16mf2(...) __riscv_vsoxei32_v_u16mf2(__VA_ARGS__)
+#define vsoxei32_v_u16m1(...) __riscv_vsoxei32_v_u16m1(__VA_ARGS__)
+#define vsoxei32_v_u16m2(...) __riscv_vsoxei32_v_u16m2(__VA_ARGS__)
+#define vsoxei32_v_u16m4(...) __riscv_vsoxei32_v_u16m4(__VA_ARGS__)
+#define vsoxei64_v_u16mf4(...) __riscv_vsoxei64_v_u16mf4(__VA_ARGS__)
+#define vsoxei64_v_u16mf2(...) __riscv_vsoxei64_v_u16mf2(__VA_ARGS__)
+#define vsoxei64_v_u16m1(...) __riscv_vsoxei64_v_u16m1(__VA_ARGS__)
+#define vsoxei64_v_u16m2(...) __riscv_vsoxei64_v_u16m2(__VA_ARGS__)
+#define vsoxei8_v_u32mf2(...) __riscv_vsoxei8_v_u32mf2(__VA_ARGS__)
+#define vsoxei8_v_u32m1(...) __riscv_vsoxei8_v_u32m1(__VA_ARGS__)
+#define vsoxei8_v_u32m2(...) __riscv_vsoxei8_v_u32m2(__VA_ARGS__)
+#define vsoxei8_v_u32m4(...) __riscv_vsoxei8_v_u32m4(__VA_ARGS__)
+#define vsoxei8_v_u32m8(...) __riscv_vsoxei8_v_u32m8(__VA_ARGS__)
+#define vsoxei16_v_u32mf2(...) __riscv_vsoxei16_v_u32mf2(__VA_ARGS__)
+#define vsoxei16_v_u32m1(...) __riscv_vsoxei16_v_u32m1(__VA_ARGS__)
+#define vsoxei16_v_u32m2(...) __riscv_vsoxei16_v_u32m2(__VA_ARGS__)
+#define vsoxei16_v_u32m4(...) __riscv_vsoxei16_v_u32m4(__VA_ARGS__)
+#define vsoxei16_v_u32m8(...) __riscv_vsoxei16_v_u32m8(__VA_ARGS__)
+#define vsoxei32_v_u32mf2(...) __riscv_vsoxei32_v_u32mf2(__VA_ARGS__)
+#define vsoxei32_v_u32m1(...) __riscv_vsoxei32_v_u32m1(__VA_ARGS__)
+#define vsoxei32_v_u32m2(...) __riscv_vsoxei32_v_u32m2(__VA_ARGS__)
+#define vsoxei32_v_u32m4(...) __riscv_vsoxei32_v_u32m4(__VA_ARGS__)
+#define vsoxei32_v_u32m8(...) __riscv_vsoxei32_v_u32m8(__VA_ARGS__)
+#define vsoxei64_v_u32mf2(...) __riscv_vsoxei64_v_u32mf2(__VA_ARGS__)
+#define vsoxei64_v_u32m1(...) __riscv_vsoxei64_v_u32m1(__VA_ARGS__)
+#define vsoxei64_v_u32m2(...) __riscv_vsoxei64_v_u32m2(__VA_ARGS__)
+#define vsoxei64_v_u32m4(...) __riscv_vsoxei64_v_u32m4(__VA_ARGS__)
+#define vsoxei8_v_u64m1(...) __riscv_vsoxei8_v_u64m1(__VA_ARGS__)
+#define vsoxei8_v_u64m2(...) __riscv_vsoxei8_v_u64m2(__VA_ARGS__)
+#define vsoxei8_v_u64m4(...) __riscv_vsoxei8_v_u64m4(__VA_ARGS__)
+#define vsoxei8_v_u64m8(...) __riscv_vsoxei8_v_u64m8(__VA_ARGS__)
+#define vsoxei16_v_u64m1(...) __riscv_vsoxei16_v_u64m1(__VA_ARGS__)
+#define vsoxei16_v_u64m2(...) __riscv_vsoxei16_v_u64m2(__VA_ARGS__)
+#define vsoxei16_v_u64m4(...) __riscv_vsoxei16_v_u64m4(__VA_ARGS__)
+#define vsoxei16_v_u64m8(...) __riscv_vsoxei16_v_u64m8(__VA_ARGS__)
+#define vsoxei32_v_u64m1(...) __riscv_vsoxei32_v_u64m1(__VA_ARGS__)
+#define vsoxei32_v_u64m2(...) __riscv_vsoxei32_v_u64m2(__VA_ARGS__)
+#define vsoxei32_v_u64m4(...) __riscv_vsoxei32_v_u64m4(__VA_ARGS__)
+#define vsoxei32_v_u64m8(...) __riscv_vsoxei32_v_u64m8(__VA_ARGS__)
+#define vsoxei64_v_u64m1(...) __riscv_vsoxei64_v_u64m1(__VA_ARGS__)
+#define vsoxei64_v_u64m2(...) __riscv_vsoxei64_v_u64m2(__VA_ARGS__)
+#define vsoxei64_v_u64m4(...) __riscv_vsoxei64_v_u64m4(__VA_ARGS__)
+#define vsoxei64_v_u64m8(...) __riscv_vsoxei64_v_u64m8(__VA_ARGS__)
+#define vsuxei8_v_u8mf8(...) __riscv_vsuxei8_v_u8mf8(__VA_ARGS__) // vsuxei* aliases without an _m suffix (u8/u16/u32/u64 suffixes): legacy name -> same name with a __riscv_ prefix, arguments forwarded unchanged
+#define vsuxei8_v_u8mf4(...) __riscv_vsuxei8_v_u8mf4(__VA_ARGS__)
+#define vsuxei8_v_u8mf2(...) __riscv_vsuxei8_v_u8mf2(__VA_ARGS__)
+#define vsuxei8_v_u8m1(...) __riscv_vsuxei8_v_u8m1(__VA_ARGS__)
+#define vsuxei8_v_u8m2(...) __riscv_vsuxei8_v_u8m2(__VA_ARGS__)
+#define vsuxei8_v_u8m4(...) __riscv_vsuxei8_v_u8m4(__VA_ARGS__)
+#define vsuxei8_v_u8m8(...) __riscv_vsuxei8_v_u8m8(__VA_ARGS__)
+#define vsuxei16_v_u8mf8(...) __riscv_vsuxei16_v_u8mf8(__VA_ARGS__)
+#define vsuxei16_v_u8mf4(...) __riscv_vsuxei16_v_u8mf4(__VA_ARGS__)
+#define vsuxei16_v_u8mf2(...) __riscv_vsuxei16_v_u8mf2(__VA_ARGS__)
+#define vsuxei16_v_u8m1(...) __riscv_vsuxei16_v_u8m1(__VA_ARGS__)
+#define vsuxei16_v_u8m2(...) __riscv_vsuxei16_v_u8m2(__VA_ARGS__)
+#define vsuxei16_v_u8m4(...) __riscv_vsuxei16_v_u8m4(__VA_ARGS__)
+#define vsuxei32_v_u8mf8(...) __riscv_vsuxei32_v_u8mf8(__VA_ARGS__)
+#define vsuxei32_v_u8mf4(...) __riscv_vsuxei32_v_u8mf4(__VA_ARGS__)
+#define vsuxei32_v_u8mf2(...) __riscv_vsuxei32_v_u8mf2(__VA_ARGS__)
+#define vsuxei32_v_u8m1(...) __riscv_vsuxei32_v_u8m1(__VA_ARGS__)
+#define vsuxei32_v_u8m2(...) __riscv_vsuxei32_v_u8m2(__VA_ARGS__)
+#define vsuxei64_v_u8mf8(...) __riscv_vsuxei64_v_u8mf8(__VA_ARGS__)
+#define vsuxei64_v_u8mf4(...) __riscv_vsuxei64_v_u8mf4(__VA_ARGS__)
+#define vsuxei64_v_u8mf2(...) __riscv_vsuxei64_v_u8mf2(__VA_ARGS__)
+#define vsuxei64_v_u8m1(...) __riscv_vsuxei64_v_u8m1(__VA_ARGS__)
+#define vsuxei8_v_u16mf4(...) __riscv_vsuxei8_v_u16mf4(__VA_ARGS__)
+#define vsuxei8_v_u16mf2(...) __riscv_vsuxei8_v_u16mf2(__VA_ARGS__)
+#define vsuxei8_v_u16m1(...) __riscv_vsuxei8_v_u16m1(__VA_ARGS__)
+#define vsuxei8_v_u16m2(...) __riscv_vsuxei8_v_u16m2(__VA_ARGS__)
+#define vsuxei8_v_u16m4(...) __riscv_vsuxei8_v_u16m4(__VA_ARGS__)
+#define vsuxei8_v_u16m8(...) __riscv_vsuxei8_v_u16m8(__VA_ARGS__)
+#define vsuxei16_v_u16mf4(...) __riscv_vsuxei16_v_u16mf4(__VA_ARGS__)
+#define vsuxei16_v_u16mf2(...) __riscv_vsuxei16_v_u16mf2(__VA_ARGS__)
+#define vsuxei16_v_u16m1(...) __riscv_vsuxei16_v_u16m1(__VA_ARGS__)
+#define vsuxei16_v_u16m2(...) __riscv_vsuxei16_v_u16m2(__VA_ARGS__)
+#define vsuxei16_v_u16m4(...) __riscv_vsuxei16_v_u16m4(__VA_ARGS__)
+#define vsuxei16_v_u16m8(...) __riscv_vsuxei16_v_u16m8(__VA_ARGS__)
+#define vsuxei32_v_u16mf4(...) __riscv_vsuxei32_v_u16mf4(__VA_ARGS__)
+#define vsuxei32_v_u16mf2(...) __riscv_vsuxei32_v_u16mf2(__VA_ARGS__)
+#define vsuxei32_v_u16m1(...) __riscv_vsuxei32_v_u16m1(__VA_ARGS__)
+#define vsuxei32_v_u16m2(...) __riscv_vsuxei32_v_u16m2(__VA_ARGS__)
+#define vsuxei32_v_u16m4(...) __riscv_vsuxei32_v_u16m4(__VA_ARGS__)
+#define vsuxei64_v_u16mf4(...) __riscv_vsuxei64_v_u16mf4(__VA_ARGS__)
+#define vsuxei64_v_u16mf2(...) __riscv_vsuxei64_v_u16mf2(__VA_ARGS__)
+#define vsuxei64_v_u16m1(...) __riscv_vsuxei64_v_u16m1(__VA_ARGS__)
+#define vsuxei64_v_u16m2(...) __riscv_vsuxei64_v_u16m2(__VA_ARGS__)
+#define vsuxei8_v_u32mf2(...) __riscv_vsuxei8_v_u32mf2(__VA_ARGS__)
+#define vsuxei8_v_u32m1(...) __riscv_vsuxei8_v_u32m1(__VA_ARGS__)
+#define vsuxei8_v_u32m2(...) __riscv_vsuxei8_v_u32m2(__VA_ARGS__)
+#define vsuxei8_v_u32m4(...) __riscv_vsuxei8_v_u32m4(__VA_ARGS__)
+#define vsuxei8_v_u32m8(...) __riscv_vsuxei8_v_u32m8(__VA_ARGS__)
+#define vsuxei16_v_u32mf2(...) __riscv_vsuxei16_v_u32mf2(__VA_ARGS__)
+#define vsuxei16_v_u32m1(...) __riscv_vsuxei16_v_u32m1(__VA_ARGS__)
+#define vsuxei16_v_u32m2(...) __riscv_vsuxei16_v_u32m2(__VA_ARGS__)
+#define vsuxei16_v_u32m4(...) __riscv_vsuxei16_v_u32m4(__VA_ARGS__)
+#define vsuxei16_v_u32m8(...) __riscv_vsuxei16_v_u32m8(__VA_ARGS__)
+#define vsuxei32_v_u32mf2(...) __riscv_vsuxei32_v_u32mf2(__VA_ARGS__)
+#define vsuxei32_v_u32m1(...) __riscv_vsuxei32_v_u32m1(__VA_ARGS__)
+#define vsuxei32_v_u32m2(...) __riscv_vsuxei32_v_u32m2(__VA_ARGS__)
+#define vsuxei32_v_u32m4(...) __riscv_vsuxei32_v_u32m4(__VA_ARGS__)
+#define vsuxei32_v_u32m8(...) __riscv_vsuxei32_v_u32m8(__VA_ARGS__)
+#define vsuxei64_v_u32mf2(...) __riscv_vsuxei64_v_u32mf2(__VA_ARGS__)
+#define vsuxei64_v_u32m1(...) __riscv_vsuxei64_v_u32m1(__VA_ARGS__)
+#define vsuxei64_v_u32m2(...) __riscv_vsuxei64_v_u32m2(__VA_ARGS__)
+#define vsuxei64_v_u32m4(...) __riscv_vsuxei64_v_u32m4(__VA_ARGS__)
+#define vsuxei8_v_u64m1(...) __riscv_vsuxei8_v_u64m1(__VA_ARGS__)
+#define vsuxei8_v_u64m2(...) __riscv_vsuxei8_v_u64m2(__VA_ARGS__)
+#define vsuxei8_v_u64m4(...) __riscv_vsuxei8_v_u64m4(__VA_ARGS__)
+#define vsuxei8_v_u64m8(...) __riscv_vsuxei8_v_u64m8(__VA_ARGS__)
+#define vsuxei16_v_u64m1(...) __riscv_vsuxei16_v_u64m1(__VA_ARGS__)
+#define vsuxei16_v_u64m2(...) __riscv_vsuxei16_v_u64m2(__VA_ARGS__)
+#define vsuxei16_v_u64m4(...) __riscv_vsuxei16_v_u64m4(__VA_ARGS__)
+#define vsuxei16_v_u64m8(...) __riscv_vsuxei16_v_u64m8(__VA_ARGS__)
+#define vsuxei32_v_u64m1(...) __riscv_vsuxei32_v_u64m1(__VA_ARGS__)
+#define vsuxei32_v_u64m2(...) __riscv_vsuxei32_v_u64m2(__VA_ARGS__)
+#define vsuxei32_v_u64m4(...) __riscv_vsuxei32_v_u64m4(__VA_ARGS__)
+#define vsuxei32_v_u64m8(...) __riscv_vsuxei32_v_u64m8(__VA_ARGS__)
+#define vsuxei64_v_u64m1(...) __riscv_vsuxei64_v_u64m1(__VA_ARGS__)
+#define vsuxei64_v_u64m2(...) __riscv_vsuxei64_v_u64m2(__VA_ARGS__)
+#define vsuxei64_v_u64m4(...) __riscv_vsuxei64_v_u64m4(__VA_ARGS__)
+#define vsuxei64_v_u64m8(...) __riscv_vsuxei64_v_u64m8(__VA_ARGS__)
+// masked functions (the "_m"-suffixed aliases)
+#define vsoxei8_v_f16mf4_m(...) __riscv_vsoxei8_v_f16mf4_m(__VA_ARGS__) // vsoxei*_v_f{16,32,64}*_m aliases: each passes its arguments unchanged to the __riscv_-prefixed name
+#define vsoxei8_v_f16mf2_m(...) __riscv_vsoxei8_v_f16mf2_m(__VA_ARGS__)
+#define vsoxei8_v_f16m1_m(...) __riscv_vsoxei8_v_f16m1_m(__VA_ARGS__)
+#define vsoxei8_v_f16m2_m(...) __riscv_vsoxei8_v_f16m2_m(__VA_ARGS__)
+#define vsoxei8_v_f16m4_m(...) __riscv_vsoxei8_v_f16m4_m(__VA_ARGS__)
+#define vsoxei8_v_f16m8_m(...) __riscv_vsoxei8_v_f16m8_m(__VA_ARGS__)
+#define vsoxei16_v_f16mf4_m(...) __riscv_vsoxei16_v_f16mf4_m(__VA_ARGS__)
+#define vsoxei16_v_f16mf2_m(...) __riscv_vsoxei16_v_f16mf2_m(__VA_ARGS__)
+#define vsoxei16_v_f16m1_m(...) __riscv_vsoxei16_v_f16m1_m(__VA_ARGS__)
+#define vsoxei16_v_f16m2_m(...) __riscv_vsoxei16_v_f16m2_m(__VA_ARGS__)
+#define vsoxei16_v_f16m4_m(...) __riscv_vsoxei16_v_f16m4_m(__VA_ARGS__)
+#define vsoxei16_v_f16m8_m(...) __riscv_vsoxei16_v_f16m8_m(__VA_ARGS__)
+#define vsoxei32_v_f16mf4_m(...) __riscv_vsoxei32_v_f16mf4_m(__VA_ARGS__)
+#define vsoxei32_v_f16mf2_m(...) __riscv_vsoxei32_v_f16mf2_m(__VA_ARGS__)
+#define vsoxei32_v_f16m1_m(...) __riscv_vsoxei32_v_f16m1_m(__VA_ARGS__)
+#define vsoxei32_v_f16m2_m(...) __riscv_vsoxei32_v_f16m2_m(__VA_ARGS__)
+#define vsoxei32_v_f16m4_m(...) __riscv_vsoxei32_v_f16m4_m(__VA_ARGS__) // NOTE(review): no vsoxei32_v_f16m8_m follows in this list - presumably no such upstream intrinsic; confirm
+#define vsoxei64_v_f16mf4_m(...) __riscv_vsoxei64_v_f16mf4_m(__VA_ARGS__)
+#define vsoxei64_v_f16mf2_m(...) __riscv_vsoxei64_v_f16mf2_m(__VA_ARGS__)
+#define vsoxei64_v_f16m1_m(...) __riscv_vsoxei64_v_f16m1_m(__VA_ARGS__)
+#define vsoxei64_v_f16m2_m(...) __riscv_vsoxei64_v_f16m2_m(__VA_ARGS__)
+#define vsoxei8_v_f32mf2_m(...) __riscv_vsoxei8_v_f32mf2_m(__VA_ARGS__)
+#define vsoxei8_v_f32m1_m(...) __riscv_vsoxei8_v_f32m1_m(__VA_ARGS__)
+#define vsoxei8_v_f32m2_m(...) __riscv_vsoxei8_v_f32m2_m(__VA_ARGS__)
+#define vsoxei8_v_f32m4_m(...) __riscv_vsoxei8_v_f32m4_m(__VA_ARGS__)
+#define vsoxei8_v_f32m8_m(...) __riscv_vsoxei8_v_f32m8_m(__VA_ARGS__)
+#define vsoxei16_v_f32mf2_m(...) __riscv_vsoxei16_v_f32mf2_m(__VA_ARGS__)
+#define vsoxei16_v_f32m1_m(...) __riscv_vsoxei16_v_f32m1_m(__VA_ARGS__)
+#define vsoxei16_v_f32m2_m(...) __riscv_vsoxei16_v_f32m2_m(__VA_ARGS__)
+#define vsoxei16_v_f32m4_m(...) __riscv_vsoxei16_v_f32m4_m(__VA_ARGS__)
+#define vsoxei16_v_f32m8_m(...) __riscv_vsoxei16_v_f32m8_m(__VA_ARGS__)
+#define vsoxei32_v_f32mf2_m(...) __riscv_vsoxei32_v_f32mf2_m(__VA_ARGS__)
+#define vsoxei32_v_f32m1_m(...) __riscv_vsoxei32_v_f32m1_m(__VA_ARGS__)
+#define vsoxei32_v_f32m2_m(...) __riscv_vsoxei32_v_f32m2_m(__VA_ARGS__)
+#define vsoxei32_v_f32m4_m(...) __riscv_vsoxei32_v_f32m4_m(__VA_ARGS__)
+#define vsoxei32_v_f32m8_m(...) __riscv_vsoxei32_v_f32m8_m(__VA_ARGS__)
+#define vsoxei64_v_f32mf2_m(...) __riscv_vsoxei64_v_f32mf2_m(__VA_ARGS__)
+#define vsoxei64_v_f32m1_m(...) __riscv_vsoxei64_v_f32m1_m(__VA_ARGS__)
+#define vsoxei64_v_f32m2_m(...) __riscv_vsoxei64_v_f32m2_m(__VA_ARGS__)
+#define vsoxei64_v_f32m4_m(...) __riscv_vsoxei64_v_f32m4_m(__VA_ARGS__)
+#define vsoxei8_v_f64m1_m(...) __riscv_vsoxei8_v_f64m1_m(__VA_ARGS__)
+#define vsoxei8_v_f64m2_m(...) __riscv_vsoxei8_v_f64m2_m(__VA_ARGS__)
+#define vsoxei8_v_f64m4_m(...) __riscv_vsoxei8_v_f64m4_m(__VA_ARGS__)
+#define vsoxei8_v_f64m8_m(...) __riscv_vsoxei8_v_f64m8_m(__VA_ARGS__)
+#define vsoxei16_v_f64m1_m(...) __riscv_vsoxei16_v_f64m1_m(__VA_ARGS__)
+#define vsoxei16_v_f64m2_m(...) __riscv_vsoxei16_v_f64m2_m(__VA_ARGS__)
+#define vsoxei16_v_f64m4_m(...) __riscv_vsoxei16_v_f64m4_m(__VA_ARGS__)
+#define vsoxei16_v_f64m8_m(...) __riscv_vsoxei16_v_f64m8_m(__VA_ARGS__)
+#define vsoxei32_v_f64m1_m(...) __riscv_vsoxei32_v_f64m1_m(__VA_ARGS__)
+#define vsoxei32_v_f64m2_m(...) __riscv_vsoxei32_v_f64m2_m(__VA_ARGS__)
+#define vsoxei32_v_f64m4_m(...) __riscv_vsoxei32_v_f64m4_m(__VA_ARGS__)
+#define vsoxei32_v_f64m8_m(...) __riscv_vsoxei32_v_f64m8_m(__VA_ARGS__)
+#define vsoxei64_v_f64m1_m(...) __riscv_vsoxei64_v_f64m1_m(__VA_ARGS__)
+#define vsoxei64_v_f64m2_m(...) __riscv_vsoxei64_v_f64m2_m(__VA_ARGS__)
+#define vsoxei64_v_f64m4_m(...) __riscv_vsoxei64_v_f64m4_m(__VA_ARGS__)
+#define vsoxei64_v_f64m8_m(...) __riscv_vsoxei64_v_f64m8_m(__VA_ARGS__)
+#define vsuxei8_v_f16mf4_m(...) __riscv_vsuxei8_v_f16mf4_m(__VA_ARGS__) // vsuxei*_v_f{16,32,64}*_m aliases: each passes its arguments unchanged to the __riscv_-prefixed name
+#define vsuxei8_v_f16mf2_m(...) __riscv_vsuxei8_v_f16mf2_m(__VA_ARGS__)
+#define vsuxei8_v_f16m1_m(...) __riscv_vsuxei8_v_f16m1_m(__VA_ARGS__)
+#define vsuxei8_v_f16m2_m(...) __riscv_vsuxei8_v_f16m2_m(__VA_ARGS__)
+#define vsuxei8_v_f16m4_m(...) __riscv_vsuxei8_v_f16m4_m(__VA_ARGS__)
+#define vsuxei8_v_f16m8_m(...) __riscv_vsuxei8_v_f16m8_m(__VA_ARGS__)
+#define vsuxei16_v_f16mf4_m(...) __riscv_vsuxei16_v_f16mf4_m(__VA_ARGS__)
+#define vsuxei16_v_f16mf2_m(...) __riscv_vsuxei16_v_f16mf2_m(__VA_ARGS__)
+#define vsuxei16_v_f16m1_m(...) __riscv_vsuxei16_v_f16m1_m(__VA_ARGS__)
+#define vsuxei16_v_f16m2_m(...) __riscv_vsuxei16_v_f16m2_m(__VA_ARGS__)
+#define vsuxei16_v_f16m4_m(...) __riscv_vsuxei16_v_f16m4_m(__VA_ARGS__)
+#define vsuxei16_v_f16m8_m(...) __riscv_vsuxei16_v_f16m8_m(__VA_ARGS__)
+#define vsuxei32_v_f16mf4_m(...) __riscv_vsuxei32_v_f16mf4_m(__VA_ARGS__)
+#define vsuxei32_v_f16mf2_m(...) __riscv_vsuxei32_v_f16mf2_m(__VA_ARGS__)
+#define vsuxei32_v_f16m1_m(...) __riscv_vsuxei32_v_f16m1_m(__VA_ARGS__)
+#define vsuxei32_v_f16m2_m(...) __riscv_vsuxei32_v_f16m2_m(__VA_ARGS__)
+#define vsuxei32_v_f16m4_m(...) __riscv_vsuxei32_v_f16m4_m(__VA_ARGS__)
+#define vsuxei64_v_f16mf4_m(...) __riscv_vsuxei64_v_f16mf4_m(__VA_ARGS__)
+#define vsuxei64_v_f16mf2_m(...) __riscv_vsuxei64_v_f16mf2_m(__VA_ARGS__)
+#define vsuxei64_v_f16m1_m(...) __riscv_vsuxei64_v_f16m1_m(__VA_ARGS__)
+#define vsuxei64_v_f16m2_m(...) __riscv_vsuxei64_v_f16m2_m(__VA_ARGS__)
+#define vsuxei8_v_f32mf2_m(...) __riscv_vsuxei8_v_f32mf2_m(__VA_ARGS__)
+#define vsuxei8_v_f32m1_m(...) __riscv_vsuxei8_v_f32m1_m(__VA_ARGS__)
+#define vsuxei8_v_f32m2_m(...) __riscv_vsuxei8_v_f32m2_m(__VA_ARGS__)
+#define vsuxei8_v_f32m4_m(...) __riscv_vsuxei8_v_f32m4_m(__VA_ARGS__)
+#define vsuxei8_v_f32m8_m(...) __riscv_vsuxei8_v_f32m8_m(__VA_ARGS__)
+#define vsuxei16_v_f32mf2_m(...) __riscv_vsuxei16_v_f32mf2_m(__VA_ARGS__)
+#define vsuxei16_v_f32m1_m(...) __riscv_vsuxei16_v_f32m1_m(__VA_ARGS__)
+#define vsuxei16_v_f32m2_m(...) __riscv_vsuxei16_v_f32m2_m(__VA_ARGS__)
+#define vsuxei16_v_f32m4_m(...) __riscv_vsuxei16_v_f32m4_m(__VA_ARGS__)
+#define vsuxei16_v_f32m8_m(...) __riscv_vsuxei16_v_f32m8_m(__VA_ARGS__)
+#define vsuxei32_v_f32mf2_m(...) __riscv_vsuxei32_v_f32mf2_m(__VA_ARGS__)
+#define vsuxei32_v_f32m1_m(...) __riscv_vsuxei32_v_f32m1_m(__VA_ARGS__)
+#define vsuxei32_v_f32m2_m(...) __riscv_vsuxei32_v_f32m2_m(__VA_ARGS__)
+#define vsuxei32_v_f32m4_m(...) __riscv_vsuxei32_v_f32m4_m(__VA_ARGS__)
+#define vsuxei32_v_f32m8_m(...) __riscv_vsuxei32_v_f32m8_m(__VA_ARGS__)
+#define vsuxei64_v_f32mf2_m(...) __riscv_vsuxei64_v_f32mf2_m(__VA_ARGS__)
+#define vsuxei64_v_f32m1_m(...) __riscv_vsuxei64_v_f32m1_m(__VA_ARGS__)
+#define vsuxei64_v_f32m2_m(...) __riscv_vsuxei64_v_f32m2_m(__VA_ARGS__)
+#define vsuxei64_v_f32m4_m(...) __riscv_vsuxei64_v_f32m4_m(__VA_ARGS__)
+#define vsuxei8_v_f64m1_m(...) __riscv_vsuxei8_v_f64m1_m(__VA_ARGS__)
+#define vsuxei8_v_f64m2_m(...) __riscv_vsuxei8_v_f64m2_m(__VA_ARGS__)
+#define vsuxei8_v_f64m4_m(...) __riscv_vsuxei8_v_f64m4_m(__VA_ARGS__)
+#define vsuxei8_v_f64m8_m(...) __riscv_vsuxei8_v_f64m8_m(__VA_ARGS__)
+#define vsuxei16_v_f64m1_m(...) __riscv_vsuxei16_v_f64m1_m(__VA_ARGS__)
+#define vsuxei16_v_f64m2_m(...) __riscv_vsuxei16_v_f64m2_m(__VA_ARGS__)
+#define vsuxei16_v_f64m4_m(...) __riscv_vsuxei16_v_f64m4_m(__VA_ARGS__)
+#define vsuxei16_v_f64m8_m(...) __riscv_vsuxei16_v_f64m8_m(__VA_ARGS__)
+#define vsuxei32_v_f64m1_m(...) __riscv_vsuxei32_v_f64m1_m(__VA_ARGS__)
+#define vsuxei32_v_f64m2_m(...) __riscv_vsuxei32_v_f64m2_m(__VA_ARGS__)
+#define vsuxei32_v_f64m4_m(...) __riscv_vsuxei32_v_f64m4_m(__VA_ARGS__)
+#define vsuxei32_v_f64m8_m(...) __riscv_vsuxei32_v_f64m8_m(__VA_ARGS__)
+#define vsuxei64_v_f64m1_m(...) __riscv_vsuxei64_v_f64m1_m(__VA_ARGS__)
+#define vsuxei64_v_f64m2_m(...) __riscv_vsuxei64_v_f64m2_m(__VA_ARGS__)
+#define vsuxei64_v_f64m4_m(...) __riscv_vsuxei64_v_f64m4_m(__VA_ARGS__)
+#define vsuxei64_v_f64m8_m(...) __riscv_vsuxei64_v_f64m8_m(__VA_ARGS__)
+#define vsoxei8_v_i8mf8_m(...) __riscv_vsoxei8_v_i8mf8_m(__VA_ARGS__) // vsoxei*_v_i{8,16,32,64}*_m aliases: each passes its arguments unchanged to the __riscv_-prefixed name
+#define vsoxei8_v_i8mf4_m(...) __riscv_vsoxei8_v_i8mf4_m(__VA_ARGS__)
+#define vsoxei8_v_i8mf2_m(...) __riscv_vsoxei8_v_i8mf2_m(__VA_ARGS__)
+#define vsoxei8_v_i8m1_m(...) __riscv_vsoxei8_v_i8m1_m(__VA_ARGS__)
+#define vsoxei8_v_i8m2_m(...) __riscv_vsoxei8_v_i8m2_m(__VA_ARGS__)
+#define vsoxei8_v_i8m4_m(...) __riscv_vsoxei8_v_i8m4_m(__VA_ARGS__)
+#define vsoxei8_v_i8m8_m(...) __riscv_vsoxei8_v_i8m8_m(__VA_ARGS__)
+#define vsoxei16_v_i8mf8_m(...) __riscv_vsoxei16_v_i8mf8_m(__VA_ARGS__)
+#define vsoxei16_v_i8mf4_m(...) __riscv_vsoxei16_v_i8mf4_m(__VA_ARGS__)
+#define vsoxei16_v_i8mf2_m(...) __riscv_vsoxei16_v_i8mf2_m(__VA_ARGS__)
+#define vsoxei16_v_i8m1_m(...) __riscv_vsoxei16_v_i8m1_m(__VA_ARGS__)
+#define vsoxei16_v_i8m2_m(...) __riscv_vsoxei16_v_i8m2_m(__VA_ARGS__)
+#define vsoxei16_v_i8m4_m(...) __riscv_vsoxei16_v_i8m4_m(__VA_ARGS__)
+#define vsoxei32_v_i8mf8_m(...) __riscv_vsoxei32_v_i8mf8_m(__VA_ARGS__)
+#define vsoxei32_v_i8mf4_m(...) __riscv_vsoxei32_v_i8mf4_m(__VA_ARGS__)
+#define vsoxei32_v_i8mf2_m(...) __riscv_vsoxei32_v_i8mf2_m(__VA_ARGS__)
+#define vsoxei32_v_i8m1_m(...) __riscv_vsoxei32_v_i8m1_m(__VA_ARGS__)
+#define vsoxei32_v_i8m2_m(...) __riscv_vsoxei32_v_i8m2_m(__VA_ARGS__)
+#define vsoxei64_v_i8mf8_m(...) __riscv_vsoxei64_v_i8mf8_m(__VA_ARGS__)
+#define vsoxei64_v_i8mf4_m(...) __riscv_vsoxei64_v_i8mf4_m(__VA_ARGS__)
+#define vsoxei64_v_i8mf2_m(...) __riscv_vsoxei64_v_i8mf2_m(__VA_ARGS__)
+#define vsoxei64_v_i8m1_m(...) __riscv_vsoxei64_v_i8m1_m(__VA_ARGS__)
+#define vsoxei8_v_i16mf4_m(...) __riscv_vsoxei8_v_i16mf4_m(__VA_ARGS__)
+#define vsoxei8_v_i16mf2_m(...) __riscv_vsoxei8_v_i16mf2_m(__VA_ARGS__)
+#define vsoxei8_v_i16m1_m(...) __riscv_vsoxei8_v_i16m1_m(__VA_ARGS__)
+#define vsoxei8_v_i16m2_m(...) __riscv_vsoxei8_v_i16m2_m(__VA_ARGS__)
+#define vsoxei8_v_i16m4_m(...) __riscv_vsoxei8_v_i16m4_m(__VA_ARGS__)
+#define vsoxei8_v_i16m8_m(...) __riscv_vsoxei8_v_i16m8_m(__VA_ARGS__)
+#define vsoxei16_v_i16mf4_m(...) __riscv_vsoxei16_v_i16mf4_m(__VA_ARGS__)
+#define vsoxei16_v_i16mf2_m(...) __riscv_vsoxei16_v_i16mf2_m(__VA_ARGS__)
+#define vsoxei16_v_i16m1_m(...) __riscv_vsoxei16_v_i16m1_m(__VA_ARGS__)
+#define vsoxei16_v_i16m2_m(...) __riscv_vsoxei16_v_i16m2_m(__VA_ARGS__)
+#define vsoxei16_v_i16m4_m(...) __riscv_vsoxei16_v_i16m4_m(__VA_ARGS__)
+#define vsoxei16_v_i16m8_m(...) __riscv_vsoxei16_v_i16m8_m(__VA_ARGS__)
+#define vsoxei32_v_i16mf4_m(...) __riscv_vsoxei32_v_i16mf4_m(__VA_ARGS__)
+#define vsoxei32_v_i16mf2_m(...) __riscv_vsoxei32_v_i16mf2_m(__VA_ARGS__)
+#define vsoxei32_v_i16m1_m(...) __riscv_vsoxei32_v_i16m1_m(__VA_ARGS__)
+#define vsoxei32_v_i16m2_m(...) __riscv_vsoxei32_v_i16m2_m(__VA_ARGS__)
+#define vsoxei32_v_i16m4_m(...) __riscv_vsoxei32_v_i16m4_m(__VA_ARGS__)
+#define vsoxei64_v_i16mf4_m(...) __riscv_vsoxei64_v_i16mf4_m(__VA_ARGS__)
+#define vsoxei64_v_i16mf2_m(...) __riscv_vsoxei64_v_i16mf2_m(__VA_ARGS__)
+#define vsoxei64_v_i16m1_m(...) __riscv_vsoxei64_v_i16m1_m(__VA_ARGS__)
+#define vsoxei64_v_i16m2_m(...) __riscv_vsoxei64_v_i16m2_m(__VA_ARGS__)
+#define vsoxei8_v_i32mf2_m(...) __riscv_vsoxei8_v_i32mf2_m(__VA_ARGS__)
+#define vsoxei8_v_i32m1_m(...) __riscv_vsoxei8_v_i32m1_m(__VA_ARGS__)
+#define vsoxei8_v_i32m2_m(...) __riscv_vsoxei8_v_i32m2_m(__VA_ARGS__)
+#define vsoxei8_v_i32m4_m(...) __riscv_vsoxei8_v_i32m4_m(__VA_ARGS__)
+#define vsoxei8_v_i32m8_m(...) __riscv_vsoxei8_v_i32m8_m(__VA_ARGS__)
+#define vsoxei16_v_i32mf2_m(...) __riscv_vsoxei16_v_i32mf2_m(__VA_ARGS__)
+#define vsoxei16_v_i32m1_m(...) __riscv_vsoxei16_v_i32m1_m(__VA_ARGS__)
+#define vsoxei16_v_i32m2_m(...) __riscv_vsoxei16_v_i32m2_m(__VA_ARGS__)
+#define vsoxei16_v_i32m4_m(...) __riscv_vsoxei16_v_i32m4_m(__VA_ARGS__)
+#define vsoxei16_v_i32m8_m(...) __riscv_vsoxei16_v_i32m8_m(__VA_ARGS__)
+#define vsoxei32_v_i32mf2_m(...) __riscv_vsoxei32_v_i32mf2_m(__VA_ARGS__)
+#define vsoxei32_v_i32m1_m(...) __riscv_vsoxei32_v_i32m1_m(__VA_ARGS__)
+#define vsoxei32_v_i32m2_m(...) __riscv_vsoxei32_v_i32m2_m(__VA_ARGS__)
+#define vsoxei32_v_i32m4_m(...) __riscv_vsoxei32_v_i32m4_m(__VA_ARGS__)
+#define vsoxei32_v_i32m8_m(...) __riscv_vsoxei32_v_i32m8_m(__VA_ARGS__)
+#define vsoxei64_v_i32mf2_m(...) __riscv_vsoxei64_v_i32mf2_m(__VA_ARGS__)
+#define vsoxei64_v_i32m1_m(...) __riscv_vsoxei64_v_i32m1_m(__VA_ARGS__)
+#define vsoxei64_v_i32m2_m(...) __riscv_vsoxei64_v_i32m2_m(__VA_ARGS__)
+#define vsoxei64_v_i32m4_m(...) __riscv_vsoxei64_v_i32m4_m(__VA_ARGS__)
+#define vsoxei8_v_i64m1_m(...) __riscv_vsoxei8_v_i64m1_m(__VA_ARGS__)
+#define vsoxei8_v_i64m2_m(...) __riscv_vsoxei8_v_i64m2_m(__VA_ARGS__)
+#define vsoxei8_v_i64m4_m(...) __riscv_vsoxei8_v_i64m4_m(__VA_ARGS__)
+#define vsoxei8_v_i64m8_m(...) __riscv_vsoxei8_v_i64m8_m(__VA_ARGS__)
+#define vsoxei16_v_i64m1_m(...) __riscv_vsoxei16_v_i64m1_m(__VA_ARGS__)
+#define vsoxei16_v_i64m2_m(...) __riscv_vsoxei16_v_i64m2_m(__VA_ARGS__)
+#define vsoxei16_v_i64m4_m(...) __riscv_vsoxei16_v_i64m4_m(__VA_ARGS__)
+#define vsoxei16_v_i64m8_m(...) __riscv_vsoxei16_v_i64m8_m(__VA_ARGS__)
+#define vsoxei32_v_i64m1_m(...) __riscv_vsoxei32_v_i64m1_m(__VA_ARGS__)
+#define vsoxei32_v_i64m2_m(...) __riscv_vsoxei32_v_i64m2_m(__VA_ARGS__)
+#define vsoxei32_v_i64m4_m(...) __riscv_vsoxei32_v_i64m4_m(__VA_ARGS__)
+#define vsoxei32_v_i64m8_m(...) __riscv_vsoxei32_v_i64m8_m(__VA_ARGS__)
+#define vsoxei64_v_i64m1_m(...) __riscv_vsoxei64_v_i64m1_m(__VA_ARGS__)
+#define vsoxei64_v_i64m2_m(...) __riscv_vsoxei64_v_i64m2_m(__VA_ARGS__)
+#define vsoxei64_v_i64m4_m(...) __riscv_vsoxei64_v_i64m4_m(__VA_ARGS__)
+#define vsoxei64_v_i64m8_m(...) __riscv_vsoxei64_v_i64m8_m(__VA_ARGS__)
+#define vsuxei8_v_i8mf8_m(...) __riscv_vsuxei8_v_i8mf8_m(__VA_ARGS__) // vsuxei*_v_i{8,16,32,64}*_m aliases: each passes its arguments unchanged to the __riscv_-prefixed name
+#define vsuxei8_v_i8mf4_m(...) __riscv_vsuxei8_v_i8mf4_m(__VA_ARGS__)
+#define vsuxei8_v_i8mf2_m(...) __riscv_vsuxei8_v_i8mf2_m(__VA_ARGS__)
+#define vsuxei8_v_i8m1_m(...) __riscv_vsuxei8_v_i8m1_m(__VA_ARGS__)
+#define vsuxei8_v_i8m2_m(...) __riscv_vsuxei8_v_i8m2_m(__VA_ARGS__)
+#define vsuxei8_v_i8m4_m(...) __riscv_vsuxei8_v_i8m4_m(__VA_ARGS__)
+#define vsuxei8_v_i8m8_m(...) __riscv_vsuxei8_v_i8m8_m(__VA_ARGS__)
+#define vsuxei16_v_i8mf8_m(...) __riscv_vsuxei16_v_i8mf8_m(__VA_ARGS__)
+#define vsuxei16_v_i8mf4_m(...) __riscv_vsuxei16_v_i8mf4_m(__VA_ARGS__)
+#define vsuxei16_v_i8mf2_m(...) __riscv_vsuxei16_v_i8mf2_m(__VA_ARGS__)
+#define vsuxei16_v_i8m1_m(...) __riscv_vsuxei16_v_i8m1_m(__VA_ARGS__)
+#define vsuxei16_v_i8m2_m(...) __riscv_vsuxei16_v_i8m2_m(__VA_ARGS__)
+#define vsuxei16_v_i8m4_m(...) __riscv_vsuxei16_v_i8m4_m(__VA_ARGS__)
+#define vsuxei32_v_i8mf8_m(...) __riscv_vsuxei32_v_i8mf8_m(__VA_ARGS__)
+#define vsuxei32_v_i8mf4_m(...) __riscv_vsuxei32_v_i8mf4_m(__VA_ARGS__)
+#define vsuxei32_v_i8mf2_m(...) __riscv_vsuxei32_v_i8mf2_m(__VA_ARGS__)
+#define vsuxei32_v_i8m1_m(...) __riscv_vsuxei32_v_i8m1_m(__VA_ARGS__)
+#define vsuxei32_v_i8m2_m(...) __riscv_vsuxei32_v_i8m2_m(__VA_ARGS__)
+#define vsuxei64_v_i8mf8_m(...) __riscv_vsuxei64_v_i8mf8_m(__VA_ARGS__)
+#define vsuxei64_v_i8mf4_m(...) __riscv_vsuxei64_v_i8mf4_m(__VA_ARGS__)
+#define vsuxei64_v_i8mf2_m(...) __riscv_vsuxei64_v_i8mf2_m(__VA_ARGS__)
+#define vsuxei64_v_i8m1_m(...) __riscv_vsuxei64_v_i8m1_m(__VA_ARGS__)
+#define vsuxei8_v_i16mf4_m(...) __riscv_vsuxei8_v_i16mf4_m(__VA_ARGS__)
+#define vsuxei8_v_i16mf2_m(...) __riscv_vsuxei8_v_i16mf2_m(__VA_ARGS__)
+#define vsuxei8_v_i16m1_m(...) __riscv_vsuxei8_v_i16m1_m(__VA_ARGS__)
+#define vsuxei8_v_i16m2_m(...) __riscv_vsuxei8_v_i16m2_m(__VA_ARGS__)
+#define vsuxei8_v_i16m4_m(...) __riscv_vsuxei8_v_i16m4_m(__VA_ARGS__)
+#define vsuxei8_v_i16m8_m(...) __riscv_vsuxei8_v_i16m8_m(__VA_ARGS__)
+#define vsuxei16_v_i16mf4_m(...) __riscv_vsuxei16_v_i16mf4_m(__VA_ARGS__)
+#define vsuxei16_v_i16mf2_m(...) __riscv_vsuxei16_v_i16mf2_m(__VA_ARGS__)
+#define vsuxei16_v_i16m1_m(...) __riscv_vsuxei16_v_i16m1_m(__VA_ARGS__)
+#define vsuxei16_v_i16m2_m(...) __riscv_vsuxei16_v_i16m2_m(__VA_ARGS__)
+#define vsuxei16_v_i16m4_m(...) __riscv_vsuxei16_v_i16m4_m(__VA_ARGS__)
+#define vsuxei16_v_i16m8_m(...) __riscv_vsuxei16_v_i16m8_m(__VA_ARGS__)
+#define vsuxei32_v_i16mf4_m(...) __riscv_vsuxei32_v_i16mf4_m(__VA_ARGS__)
+#define vsuxei32_v_i16mf2_m(...) __riscv_vsuxei32_v_i16mf2_m(__VA_ARGS__)
+#define vsuxei32_v_i16m1_m(...) __riscv_vsuxei32_v_i16m1_m(__VA_ARGS__)
+#define vsuxei32_v_i16m2_m(...) __riscv_vsuxei32_v_i16m2_m(__VA_ARGS__)
+#define vsuxei32_v_i16m4_m(...) __riscv_vsuxei32_v_i16m4_m(__VA_ARGS__)
+#define vsuxei64_v_i16mf4_m(...) __riscv_vsuxei64_v_i16mf4_m(__VA_ARGS__)
+#define vsuxei64_v_i16mf2_m(...) __riscv_vsuxei64_v_i16mf2_m(__VA_ARGS__)
+#define vsuxei64_v_i16m1_m(...) __riscv_vsuxei64_v_i16m1_m(__VA_ARGS__)
+#define vsuxei64_v_i16m2_m(...) __riscv_vsuxei64_v_i16m2_m(__VA_ARGS__)
+#define vsuxei8_v_i32mf2_m(...) __riscv_vsuxei8_v_i32mf2_m(__VA_ARGS__)
+#define vsuxei8_v_i32m1_m(...) __riscv_vsuxei8_v_i32m1_m(__VA_ARGS__)
+#define vsuxei8_v_i32m2_m(...) __riscv_vsuxei8_v_i32m2_m(__VA_ARGS__)
+#define vsuxei8_v_i32m4_m(...) __riscv_vsuxei8_v_i32m4_m(__VA_ARGS__)
+#define vsuxei8_v_i32m8_m(...) __riscv_vsuxei8_v_i32m8_m(__VA_ARGS__)
+#define vsuxei16_v_i32mf2_m(...) __riscv_vsuxei16_v_i32mf2_m(__VA_ARGS__)
+#define vsuxei16_v_i32m1_m(...) __riscv_vsuxei16_v_i32m1_m(__VA_ARGS__)
+#define vsuxei16_v_i32m2_m(...) __riscv_vsuxei16_v_i32m2_m(__VA_ARGS__)
+#define vsuxei16_v_i32m4_m(...) __riscv_vsuxei16_v_i32m4_m(__VA_ARGS__)
+#define vsuxei16_v_i32m8_m(...) __riscv_vsuxei16_v_i32m8_m(__VA_ARGS__)
+#define vsuxei32_v_i32mf2_m(...) __riscv_vsuxei32_v_i32mf2_m(__VA_ARGS__)
+#define vsuxei32_v_i32m1_m(...) __riscv_vsuxei32_v_i32m1_m(__VA_ARGS__)
+#define vsuxei32_v_i32m2_m(...) __riscv_vsuxei32_v_i32m2_m(__VA_ARGS__)
+#define vsuxei32_v_i32m4_m(...) __riscv_vsuxei32_v_i32m4_m(__VA_ARGS__)
+#define vsuxei32_v_i32m8_m(...) __riscv_vsuxei32_v_i32m8_m(__VA_ARGS__)
+#define vsuxei64_v_i32mf2_m(...) __riscv_vsuxei64_v_i32mf2_m(__VA_ARGS__)
+#define vsuxei64_v_i32m1_m(...) __riscv_vsuxei64_v_i32m1_m(__VA_ARGS__)
+#define vsuxei64_v_i32m2_m(...) __riscv_vsuxei64_v_i32m2_m(__VA_ARGS__)
+#define vsuxei64_v_i32m4_m(...) __riscv_vsuxei64_v_i32m4_m(__VA_ARGS__)
+#define vsuxei8_v_i64m1_m(...) __riscv_vsuxei8_v_i64m1_m(__VA_ARGS__)
+#define vsuxei8_v_i64m2_m(...) __riscv_vsuxei8_v_i64m2_m(__VA_ARGS__)
+#define vsuxei8_v_i64m4_m(...) __riscv_vsuxei8_v_i64m4_m(__VA_ARGS__)
+#define vsuxei8_v_i64m8_m(...) __riscv_vsuxei8_v_i64m8_m(__VA_ARGS__)
+#define vsuxei16_v_i64m1_m(...) __riscv_vsuxei16_v_i64m1_m(__VA_ARGS__)
+#define vsuxei16_v_i64m2_m(...) __riscv_vsuxei16_v_i64m2_m(__VA_ARGS__)
+#define vsuxei16_v_i64m4_m(...) __riscv_vsuxei16_v_i64m4_m(__VA_ARGS__)
+#define vsuxei16_v_i64m8_m(...) __riscv_vsuxei16_v_i64m8_m(__VA_ARGS__)
+#define vsuxei32_v_i64m1_m(...) __riscv_vsuxei32_v_i64m1_m(__VA_ARGS__)
+#define vsuxei32_v_i64m2_m(...) __riscv_vsuxei32_v_i64m2_m(__VA_ARGS__)
+#define vsuxei32_v_i64m4_m(...) __riscv_vsuxei32_v_i64m4_m(__VA_ARGS__)
+#define vsuxei32_v_i64m8_m(...) __riscv_vsuxei32_v_i64m8_m(__VA_ARGS__)
+#define vsuxei64_v_i64m1_m(...) __riscv_vsuxei64_v_i64m1_m(__VA_ARGS__)
+#define vsuxei64_v_i64m2_m(...) __riscv_vsuxei64_v_i64m2_m(__VA_ARGS__)
+#define vsuxei64_v_i64m4_m(...) __riscv_vsuxei64_v_i64m4_m(__VA_ARGS__)
+#define vsuxei64_v_i64m8_m(...) __riscv_vsuxei64_v_i64m8_m(__VA_ARGS__)
+#define vsoxei8_v_u8mf8_m(...) __riscv_vsoxei8_v_u8mf8_m(__VA_ARGS__) // vsoxei*_v_u{8,16,32,64}*_m aliases: each passes its arguments unchanged to the __riscv_-prefixed name
+#define vsoxei8_v_u8mf4_m(...) __riscv_vsoxei8_v_u8mf4_m(__VA_ARGS__)
+#define vsoxei8_v_u8mf2_m(...) __riscv_vsoxei8_v_u8mf2_m(__VA_ARGS__)
+#define vsoxei8_v_u8m1_m(...) __riscv_vsoxei8_v_u8m1_m(__VA_ARGS__)
+#define vsoxei8_v_u8m2_m(...) __riscv_vsoxei8_v_u8m2_m(__VA_ARGS__)
+#define vsoxei8_v_u8m4_m(...) __riscv_vsoxei8_v_u8m4_m(__VA_ARGS__)
+#define vsoxei8_v_u8m8_m(...) __riscv_vsoxei8_v_u8m8_m(__VA_ARGS__)
+#define vsoxei16_v_u8mf8_m(...) __riscv_vsoxei16_v_u8mf8_m(__VA_ARGS__)
+#define vsoxei16_v_u8mf4_m(...) __riscv_vsoxei16_v_u8mf4_m(__VA_ARGS__)
+#define vsoxei16_v_u8mf2_m(...) __riscv_vsoxei16_v_u8mf2_m(__VA_ARGS__)
+#define vsoxei16_v_u8m1_m(...) __riscv_vsoxei16_v_u8m1_m(__VA_ARGS__)
+#define vsoxei16_v_u8m2_m(...) __riscv_vsoxei16_v_u8m2_m(__VA_ARGS__)
+#define vsoxei16_v_u8m4_m(...) __riscv_vsoxei16_v_u8m4_m(__VA_ARGS__)
+#define vsoxei32_v_u8mf8_m(...) __riscv_vsoxei32_v_u8mf8_m(__VA_ARGS__)
+#define vsoxei32_v_u8mf4_m(...) __riscv_vsoxei32_v_u8mf4_m(__VA_ARGS__)
+#define vsoxei32_v_u8mf2_m(...) __riscv_vsoxei32_v_u8mf2_m(__VA_ARGS__)
+#define vsoxei32_v_u8m1_m(...) __riscv_vsoxei32_v_u8m1_m(__VA_ARGS__)
+#define vsoxei32_v_u8m2_m(...) __riscv_vsoxei32_v_u8m2_m(__VA_ARGS__)
+#define vsoxei64_v_u8mf8_m(...) __riscv_vsoxei64_v_u8mf8_m(__VA_ARGS__)
+#define vsoxei64_v_u8mf4_m(...) __riscv_vsoxei64_v_u8mf4_m(__VA_ARGS__)
+#define vsoxei64_v_u8mf2_m(...) __riscv_vsoxei64_v_u8mf2_m(__VA_ARGS__)
+#define vsoxei64_v_u8m1_m(...) __riscv_vsoxei64_v_u8m1_m(__VA_ARGS__)
+#define vsoxei8_v_u16mf4_m(...) __riscv_vsoxei8_v_u16mf4_m(__VA_ARGS__)
+#define vsoxei8_v_u16mf2_m(...) __riscv_vsoxei8_v_u16mf2_m(__VA_ARGS__)
+#define vsoxei8_v_u16m1_m(...) __riscv_vsoxei8_v_u16m1_m(__VA_ARGS__)
+#define vsoxei8_v_u16m2_m(...) __riscv_vsoxei8_v_u16m2_m(__VA_ARGS__)
+#define vsoxei8_v_u16m4_m(...) __riscv_vsoxei8_v_u16m4_m(__VA_ARGS__)
+#define vsoxei8_v_u16m8_m(...) __riscv_vsoxei8_v_u16m8_m(__VA_ARGS__)
+#define vsoxei16_v_u16mf4_m(...) __riscv_vsoxei16_v_u16mf4_m(__VA_ARGS__)
+#define vsoxei16_v_u16mf2_m(...) __riscv_vsoxei16_v_u16mf2_m(__VA_ARGS__)
+#define vsoxei16_v_u16m1_m(...) __riscv_vsoxei16_v_u16m1_m(__VA_ARGS__)
+#define vsoxei16_v_u16m2_m(...) __riscv_vsoxei16_v_u16m2_m(__VA_ARGS__)
+#define vsoxei16_v_u16m4_m(...) __riscv_vsoxei16_v_u16m4_m(__VA_ARGS__)
+#define vsoxei16_v_u16m8_m(...) __riscv_vsoxei16_v_u16m8_m(__VA_ARGS__)
+#define vsoxei32_v_u16mf4_m(...) __riscv_vsoxei32_v_u16mf4_m(__VA_ARGS__)
+#define vsoxei32_v_u16mf2_m(...) __riscv_vsoxei32_v_u16mf2_m(__VA_ARGS__)
+#define vsoxei32_v_u16m1_m(...) __riscv_vsoxei32_v_u16m1_m(__VA_ARGS__)
+#define vsoxei32_v_u16m2_m(...) __riscv_vsoxei32_v_u16m2_m(__VA_ARGS__)
+#define vsoxei32_v_u16m4_m(...) __riscv_vsoxei32_v_u16m4_m(__VA_ARGS__)
+#define vsoxei64_v_u16mf4_m(...) __riscv_vsoxei64_v_u16mf4_m(__VA_ARGS__)
+#define vsoxei64_v_u16mf2_m(...) __riscv_vsoxei64_v_u16mf2_m(__VA_ARGS__)
+#define vsoxei64_v_u16m1_m(...) __riscv_vsoxei64_v_u16m1_m(__VA_ARGS__)
+#define vsoxei64_v_u16m2_m(...) __riscv_vsoxei64_v_u16m2_m(__VA_ARGS__)
+#define vsoxei8_v_u32mf2_m(...) __riscv_vsoxei8_v_u32mf2_m(__VA_ARGS__)
+#define vsoxei8_v_u32m1_m(...) __riscv_vsoxei8_v_u32m1_m(__VA_ARGS__)
+#define vsoxei8_v_u32m2_m(...) __riscv_vsoxei8_v_u32m2_m(__VA_ARGS__)
+#define vsoxei8_v_u32m4_m(...) __riscv_vsoxei8_v_u32m4_m(__VA_ARGS__)
+#define vsoxei8_v_u32m8_m(...) __riscv_vsoxei8_v_u32m8_m(__VA_ARGS__)
+#define vsoxei16_v_u32mf2_m(...) __riscv_vsoxei16_v_u32mf2_m(__VA_ARGS__)
+#define vsoxei16_v_u32m1_m(...) __riscv_vsoxei16_v_u32m1_m(__VA_ARGS__)
+#define vsoxei16_v_u32m2_m(...) __riscv_vsoxei16_v_u32m2_m(__VA_ARGS__)
+#define vsoxei16_v_u32m4_m(...) __riscv_vsoxei16_v_u32m4_m(__VA_ARGS__)
+#define vsoxei16_v_u32m8_m(...) __riscv_vsoxei16_v_u32m8_m(__VA_ARGS__)
+#define vsoxei32_v_u32mf2_m(...) __riscv_vsoxei32_v_u32mf2_m(__VA_ARGS__)
+#define vsoxei32_v_u32m1_m(...) __riscv_vsoxei32_v_u32m1_m(__VA_ARGS__)
+#define vsoxei32_v_u32m2_m(...) __riscv_vsoxei32_v_u32m2_m(__VA_ARGS__)
+#define vsoxei32_v_u32m4_m(...) __riscv_vsoxei32_v_u32m4_m(__VA_ARGS__)
+#define vsoxei32_v_u32m8_m(...) __riscv_vsoxei32_v_u32m8_m(__VA_ARGS__)
+#define vsoxei64_v_u32mf2_m(...) __riscv_vsoxei64_v_u32mf2_m(__VA_ARGS__)
+#define vsoxei64_v_u32m1_m(...) __riscv_vsoxei64_v_u32m1_m(__VA_ARGS__)
+#define vsoxei64_v_u32m2_m(...) __riscv_vsoxei64_v_u32m2_m(__VA_ARGS__)
+#define vsoxei64_v_u32m4_m(...) __riscv_vsoxei64_v_u32m4_m(__VA_ARGS__)
+#define vsoxei8_v_u64m1_m(...) __riscv_vsoxei8_v_u64m1_m(__VA_ARGS__)
+#define vsoxei8_v_u64m2_m(...) __riscv_vsoxei8_v_u64m2_m(__VA_ARGS__)
+#define vsoxei8_v_u64m4_m(...) __riscv_vsoxei8_v_u64m4_m(__VA_ARGS__)
+#define vsoxei8_v_u64m8_m(...) __riscv_vsoxei8_v_u64m8_m(__VA_ARGS__)
+#define vsoxei16_v_u64m1_m(...) __riscv_vsoxei16_v_u64m1_m(__VA_ARGS__)
+#define vsoxei16_v_u64m2_m(...) __riscv_vsoxei16_v_u64m2_m(__VA_ARGS__)
+#define vsoxei16_v_u64m4_m(...) __riscv_vsoxei16_v_u64m4_m(__VA_ARGS__)
+#define vsoxei16_v_u64m8_m(...) __riscv_vsoxei16_v_u64m8_m(__VA_ARGS__)
+#define vsoxei32_v_u64m1_m(...) __riscv_vsoxei32_v_u64m1_m(__VA_ARGS__)
+#define vsoxei32_v_u64m2_m(...) __riscv_vsoxei32_v_u64m2_m(__VA_ARGS__)
+#define vsoxei32_v_u64m4_m(...) __riscv_vsoxei32_v_u64m4_m(__VA_ARGS__)
+#define vsoxei32_v_u64m8_m(...) __riscv_vsoxei32_v_u64m8_m(__VA_ARGS__)
+#define vsoxei64_v_u64m1_m(...) __riscv_vsoxei64_v_u64m1_m(__VA_ARGS__)
+#define vsoxei64_v_u64m2_m(...) __riscv_vsoxei64_v_u64m2_m(__VA_ARGS__)
+#define vsoxei64_v_u64m4_m(...) __riscv_vsoxei64_v_u64m4_m(__VA_ARGS__)
+#define vsoxei64_v_u64m8_m(...) __riscv_vsoxei64_v_u64m8_m(__VA_ARGS__)
+#define vsuxei8_v_u8mf8_m(...) __riscv_vsuxei8_v_u8mf8_m(__VA_ARGS__) // vsuxei*_v_u{8,16,32,64}*_m aliases: each passes its arguments unchanged to the __riscv_-prefixed name
+#define vsuxei8_v_u8mf4_m(...) __riscv_vsuxei8_v_u8mf4_m(__VA_ARGS__)
+#define vsuxei8_v_u8mf2_m(...) __riscv_vsuxei8_v_u8mf2_m(__VA_ARGS__)
+#define vsuxei8_v_u8m1_m(...) __riscv_vsuxei8_v_u8m1_m(__VA_ARGS__)
+#define vsuxei8_v_u8m2_m(...) __riscv_vsuxei8_v_u8m2_m(__VA_ARGS__)
+#define vsuxei8_v_u8m4_m(...) __riscv_vsuxei8_v_u8m4_m(__VA_ARGS__)
+#define vsuxei8_v_u8m8_m(...) __riscv_vsuxei8_v_u8m8_m(__VA_ARGS__)
+#define vsuxei16_v_u8mf8_m(...) __riscv_vsuxei16_v_u8mf8_m(__VA_ARGS__)
+#define vsuxei16_v_u8mf4_m(...) __riscv_vsuxei16_v_u8mf4_m(__VA_ARGS__)
+#define vsuxei16_v_u8mf2_m(...) __riscv_vsuxei16_v_u8mf2_m(__VA_ARGS__)
+#define vsuxei16_v_u8m1_m(...) __riscv_vsuxei16_v_u8m1_m(__VA_ARGS__)
+#define vsuxei16_v_u8m2_m(...) __riscv_vsuxei16_v_u8m2_m(__VA_ARGS__)
+#define vsuxei16_v_u8m4_m(...) __riscv_vsuxei16_v_u8m4_m(__VA_ARGS__)
+#define vsuxei32_v_u8mf8_m(...) __riscv_vsuxei32_v_u8mf8_m(__VA_ARGS__)
+#define vsuxei32_v_u8mf4_m(...) __riscv_vsuxei32_v_u8mf4_m(__VA_ARGS__)
+#define vsuxei32_v_u8mf2_m(...) __riscv_vsuxei32_v_u8mf2_m(__VA_ARGS__)
+#define vsuxei32_v_u8m1_m(...) __riscv_vsuxei32_v_u8m1_m(__VA_ARGS__)
+#define vsuxei32_v_u8m2_m(...) __riscv_vsuxei32_v_u8m2_m(__VA_ARGS__)
+#define vsuxei64_v_u8mf8_m(...) __riscv_vsuxei64_v_u8mf8_m(__VA_ARGS__)
+#define vsuxei64_v_u8mf4_m(...) __riscv_vsuxei64_v_u8mf4_m(__VA_ARGS__)
+#define vsuxei64_v_u8mf2_m(...) __riscv_vsuxei64_v_u8mf2_m(__VA_ARGS__)
+#define vsuxei64_v_u8m1_m(...) __riscv_vsuxei64_v_u8m1_m(__VA_ARGS__)
+#define vsuxei8_v_u16mf4_m(...) __riscv_vsuxei8_v_u16mf4_m(__VA_ARGS__)
+#define vsuxei8_v_u16mf2_m(...) __riscv_vsuxei8_v_u16mf2_m(__VA_ARGS__)
+#define vsuxei8_v_u16m1_m(...) __riscv_vsuxei8_v_u16m1_m(__VA_ARGS__)
+#define vsuxei8_v_u16m2_m(...) __riscv_vsuxei8_v_u16m2_m(__VA_ARGS__)
+#define vsuxei8_v_u16m4_m(...) __riscv_vsuxei8_v_u16m4_m(__VA_ARGS__)
+#define vsuxei8_v_u16m8_m(...) __riscv_vsuxei8_v_u16m8_m(__VA_ARGS__)
+#define vsuxei16_v_u16mf4_m(...) __riscv_vsuxei16_v_u16mf4_m(__VA_ARGS__)
+#define vsuxei16_v_u16mf2_m(...) __riscv_vsuxei16_v_u16mf2_m(__VA_ARGS__)
+#define vsuxei16_v_u16m1_m(...) __riscv_vsuxei16_v_u16m1_m(__VA_ARGS__)
+#define vsuxei16_v_u16m2_m(...) __riscv_vsuxei16_v_u16m2_m(__VA_ARGS__)
+#define vsuxei16_v_u16m4_m(...) __riscv_vsuxei16_v_u16m4_m(__VA_ARGS__)
+#define vsuxei16_v_u16m8_m(...) __riscv_vsuxei16_v_u16m8_m(__VA_ARGS__)
+#define vsuxei32_v_u16mf4_m(...) __riscv_vsuxei32_v_u16mf4_m(__VA_ARGS__)
+#define vsuxei32_v_u16mf2_m(...) __riscv_vsuxei32_v_u16mf2_m(__VA_ARGS__)
+#define vsuxei32_v_u16m1_m(...) __riscv_vsuxei32_v_u16m1_m(__VA_ARGS__)
+#define vsuxei32_v_u16m2_m(...) __riscv_vsuxei32_v_u16m2_m(__VA_ARGS__)
+#define vsuxei32_v_u16m4_m(...) __riscv_vsuxei32_v_u16m4_m(__VA_ARGS__)
+#define vsuxei64_v_u16mf4_m(...) __riscv_vsuxei64_v_u16mf4_m(__VA_ARGS__)
+#define vsuxei64_v_u16mf2_m(...) __riscv_vsuxei64_v_u16mf2_m(__VA_ARGS__)
+#define vsuxei64_v_u16m1_m(...) __riscv_vsuxei64_v_u16m1_m(__VA_ARGS__)
+#define vsuxei64_v_u16m2_m(...) __riscv_vsuxei64_v_u16m2_m(__VA_ARGS__)
+#define vsuxei8_v_u32mf2_m(...) __riscv_vsuxei8_v_u32mf2_m(__VA_ARGS__)
+#define vsuxei8_v_u32m1_m(...) __riscv_vsuxei8_v_u32m1_m(__VA_ARGS__)
+#define vsuxei8_v_u32m2_m(...) __riscv_vsuxei8_v_u32m2_m(__VA_ARGS__)
+#define vsuxei8_v_u32m4_m(...) __riscv_vsuxei8_v_u32m4_m(__VA_ARGS__)
+#define vsuxei8_v_u32m8_m(...) __riscv_vsuxei8_v_u32m8_m(__VA_ARGS__)
+#define vsuxei16_v_u32mf2_m(...) __riscv_vsuxei16_v_u32mf2_m(__VA_ARGS__)
+#define vsuxei16_v_u32m1_m(...) __riscv_vsuxei16_v_u32m1_m(__VA_ARGS__)
+#define vsuxei16_v_u32m2_m(...) __riscv_vsuxei16_v_u32m2_m(__VA_ARGS__)
+#define vsuxei16_v_u32m4_m(...) __riscv_vsuxei16_v_u32m4_m(__VA_ARGS__)
+#define vsuxei16_v_u32m8_m(...) __riscv_vsuxei16_v_u32m8_m(__VA_ARGS__)
+#define vsuxei32_v_u32mf2_m(...) __riscv_vsuxei32_v_u32mf2_m(__VA_ARGS__)
+#define vsuxei32_v_u32m1_m(...) __riscv_vsuxei32_v_u32m1_m(__VA_ARGS__)
+#define vsuxei32_v_u32m2_m(...) __riscv_vsuxei32_v_u32m2_m(__VA_ARGS__)
+#define vsuxei32_v_u32m4_m(...) __riscv_vsuxei32_v_u32m4_m(__VA_ARGS__)
+#define vsuxei32_v_u32m8_m(...) __riscv_vsuxei32_v_u32m8_m(__VA_ARGS__)
+#define vsuxei64_v_u32mf2_m(...) __riscv_vsuxei64_v_u32mf2_m(__VA_ARGS__)
+#define vsuxei64_v_u32m1_m(...) __riscv_vsuxei64_v_u32m1_m(__VA_ARGS__)
+#define vsuxei64_v_u32m2_m(...) __riscv_vsuxei64_v_u32m2_m(__VA_ARGS__)
+#define vsuxei64_v_u32m4_m(...) __riscv_vsuxei64_v_u32m4_m(__VA_ARGS__)
+#define vsuxei8_v_u64m1_m(...) __riscv_vsuxei8_v_u64m1_m(__VA_ARGS__)
+#define vsuxei8_v_u64m2_m(...) __riscv_vsuxei8_v_u64m2_m(__VA_ARGS__)
+#define vsuxei8_v_u64m4_m(...) __riscv_vsuxei8_v_u64m4_m(__VA_ARGS__)
+#define vsuxei8_v_u64m8_m(...) __riscv_vsuxei8_v_u64m8_m(__VA_ARGS__)
+#define vsuxei16_v_u64m1_m(...) __riscv_vsuxei16_v_u64m1_m(__VA_ARGS__)
+#define vsuxei16_v_u64m2_m(...) __riscv_vsuxei16_v_u64m2_m(__VA_ARGS__)
+#define vsuxei16_v_u64m4_m(...) __riscv_vsuxei16_v_u64m4_m(__VA_ARGS__)
+#define vsuxei16_v_u64m8_m(...) __riscv_vsuxei16_v_u64m8_m(__VA_ARGS__)
+#define vsuxei32_v_u64m1_m(...) __riscv_vsuxei32_v_u64m1_m(__VA_ARGS__)
+#define vsuxei32_v_u64m2_m(...) __riscv_vsuxei32_v_u64m2_m(__VA_ARGS__)
+#define vsuxei32_v_u64m4_m(...) __riscv_vsuxei32_v_u64m4_m(__VA_ARGS__)
+#define vsuxei32_v_u64m8_m(...) __riscv_vsuxei32_v_u64m8_m(__VA_ARGS__)
+#define vsuxei64_v_u64m1_m(...) __riscv_vsuxei64_v_u64m1_m(__VA_ARGS__)
+#define vsuxei64_v_u64m2_m(...) __riscv_vsuxei64_v_u64m2_m(__VA_ARGS__)
+#define vsuxei64_v_u64m4_m(...) __riscv_vsuxei64_v_u64m4_m(__VA_ARGS__)
+#define vsuxei64_v_u64m8_m(...) __riscv_vsuxei64_v_u64m8_m(__VA_ARGS__)
+#define vle16ff_v_f16mf4(...) __riscv_vle16ff_v_f16mf4(__VA_ARGS__) // vle<W>ff aliases without an _m suffix: forward all arguments to the identically named __riscv_ intrinsic
+#define vle16ff_v_f16mf2(...) __riscv_vle16ff_v_f16mf2(__VA_ARGS__)
+#define vle16ff_v_f16m1(...) __riscv_vle16ff_v_f16m1(__VA_ARGS__)
+#define vle16ff_v_f16m2(...) __riscv_vle16ff_v_f16m2(__VA_ARGS__)
+#define vle16ff_v_f16m4(...) __riscv_vle16ff_v_f16m4(__VA_ARGS__)
+#define vle16ff_v_f16m8(...) __riscv_vle16ff_v_f16m8(__VA_ARGS__)
+#define vle32ff_v_f32mf2(...) __riscv_vle32ff_v_f32mf2(__VA_ARGS__)
+#define vle32ff_v_f32m1(...) __riscv_vle32ff_v_f32m1(__VA_ARGS__)
+#define vle32ff_v_f32m2(...) __riscv_vle32ff_v_f32m2(__VA_ARGS__)
+#define vle32ff_v_f32m4(...) __riscv_vle32ff_v_f32m4(__VA_ARGS__)
+#define vle32ff_v_f32m8(...) __riscv_vle32ff_v_f32m8(__VA_ARGS__)
+#define vle64ff_v_f64m1(...) __riscv_vle64ff_v_f64m1(__VA_ARGS__)
+#define vle64ff_v_f64m2(...) __riscv_vle64ff_v_f64m2(__VA_ARGS__)
+#define vle64ff_v_f64m4(...) __riscv_vle64ff_v_f64m4(__VA_ARGS__)
+#define vle64ff_v_f64m8(...) __riscv_vle64ff_v_f64m8(__VA_ARGS__)
+#define vle8ff_v_i8mf8(...) __riscv_vle8ff_v_i8mf8(__VA_ARGS__)
+#define vle8ff_v_i8mf4(...) __riscv_vle8ff_v_i8mf4(__VA_ARGS__)
+#define vle8ff_v_i8mf2(...) __riscv_vle8ff_v_i8mf2(__VA_ARGS__)
+#define vle8ff_v_i8m1(...) __riscv_vle8ff_v_i8m1(__VA_ARGS__)
+#define vle8ff_v_i8m2(...) __riscv_vle8ff_v_i8m2(__VA_ARGS__)
+#define vle8ff_v_i8m4(...) __riscv_vle8ff_v_i8m4(__VA_ARGS__)
+#define vle8ff_v_i8m8(...) __riscv_vle8ff_v_i8m8(__VA_ARGS__)
+#define vle16ff_v_i16mf4(...) __riscv_vle16ff_v_i16mf4(__VA_ARGS__)
+#define vle16ff_v_i16mf2(...) __riscv_vle16ff_v_i16mf2(__VA_ARGS__)
+#define vle16ff_v_i16m1(...) __riscv_vle16ff_v_i16m1(__VA_ARGS__)
+#define vle16ff_v_i16m2(...) __riscv_vle16ff_v_i16m2(__VA_ARGS__)
+#define vle16ff_v_i16m4(...) __riscv_vle16ff_v_i16m4(__VA_ARGS__)
+#define vle16ff_v_i16m8(...) __riscv_vle16ff_v_i16m8(__VA_ARGS__)
+#define vle32ff_v_i32mf2(...) __riscv_vle32ff_v_i32mf2(__VA_ARGS__)
+#define vle32ff_v_i32m1(...) __riscv_vle32ff_v_i32m1(__VA_ARGS__)
+#define vle32ff_v_i32m2(...) __riscv_vle32ff_v_i32m2(__VA_ARGS__)
+#define vle32ff_v_i32m4(...) __riscv_vle32ff_v_i32m4(__VA_ARGS__)
+#define vle32ff_v_i32m8(...) __riscv_vle32ff_v_i32m8(__VA_ARGS__)
+#define vle64ff_v_i64m1(...) __riscv_vle64ff_v_i64m1(__VA_ARGS__)
+#define vle64ff_v_i64m2(...) __riscv_vle64ff_v_i64m2(__VA_ARGS__)
+#define vle64ff_v_i64m4(...) __riscv_vle64ff_v_i64m4(__VA_ARGS__)
+#define vle64ff_v_i64m8(...) __riscv_vle64ff_v_i64m8(__VA_ARGS__)
+#define vle8ff_v_u8mf8(...) __riscv_vle8ff_v_u8mf8(__VA_ARGS__)
+#define vle8ff_v_u8mf4(...) __riscv_vle8ff_v_u8mf4(__VA_ARGS__)
+#define vle8ff_v_u8mf2(...) __riscv_vle8ff_v_u8mf2(__VA_ARGS__)
+#define vle8ff_v_u8m1(...) __riscv_vle8ff_v_u8m1(__VA_ARGS__)
+#define vle8ff_v_u8m2(...) __riscv_vle8ff_v_u8m2(__VA_ARGS__)
+#define vle8ff_v_u8m4(...) __riscv_vle8ff_v_u8m4(__VA_ARGS__)
+#define vle8ff_v_u8m8(...) __riscv_vle8ff_v_u8m8(__VA_ARGS__)
+#define vle16ff_v_u16mf4(...) __riscv_vle16ff_v_u16mf4(__VA_ARGS__)
+#define vle16ff_v_u16mf2(...) __riscv_vle16ff_v_u16mf2(__VA_ARGS__)
+#define vle16ff_v_u16m1(...) __riscv_vle16ff_v_u16m1(__VA_ARGS__)
+#define vle16ff_v_u16m2(...) __riscv_vle16ff_v_u16m2(__VA_ARGS__)
+#define vle16ff_v_u16m4(...) __riscv_vle16ff_v_u16m4(__VA_ARGS__)
+#define vle16ff_v_u16m8(...) __riscv_vle16ff_v_u16m8(__VA_ARGS__)
+#define vle32ff_v_u32mf2(...) __riscv_vle32ff_v_u32mf2(__VA_ARGS__)
+#define vle32ff_v_u32m1(...) __riscv_vle32ff_v_u32m1(__VA_ARGS__)
+#define vle32ff_v_u32m2(...) __riscv_vle32ff_v_u32m2(__VA_ARGS__)
+#define vle32ff_v_u32m4(...) __riscv_vle32ff_v_u32m4(__VA_ARGS__)
+#define vle32ff_v_u32m8(...) __riscv_vle32ff_v_u32m8(__VA_ARGS__)
+#define vle64ff_v_u64m1(...) __riscv_vle64ff_v_u64m1(__VA_ARGS__)
+#define vle64ff_v_u64m2(...) __riscv_vle64ff_v_u64m2(__VA_ARGS__)
+#define vle64ff_v_u64m4(...) __riscv_vle64ff_v_u64m4(__VA_ARGS__)
+#define vle64ff_v_u64m8(...) __riscv_vle64ff_v_u64m8(__VA_ARGS__)
+// masked functions: each legacy *_m name forwards its arguments unchanged to the __riscv_*_tumu intrinsic (not to __riscv_*_m)
+#define vle16ff_v_f16mf4_m(...) __riscv_vle16ff_v_f16mf4_tumu(__VA_ARGS__)
+#define vle16ff_v_f16mf2_m(...) __riscv_vle16ff_v_f16mf2_tumu(__VA_ARGS__)
+#define vle16ff_v_f16m1_m(...) __riscv_vle16ff_v_f16m1_tumu(__VA_ARGS__)
+#define vle16ff_v_f16m2_m(...) __riscv_vle16ff_v_f16m2_tumu(__VA_ARGS__)
+#define vle16ff_v_f16m4_m(...) __riscv_vle16ff_v_f16m4_tumu(__VA_ARGS__)
+#define vle16ff_v_f16m8_m(...) __riscv_vle16ff_v_f16m8_tumu(__VA_ARGS__)
+#define vle32ff_v_f32mf2_m(...) __riscv_vle32ff_v_f32mf2_tumu(__VA_ARGS__)
+#define vle32ff_v_f32m1_m(...) __riscv_vle32ff_v_f32m1_tumu(__VA_ARGS__)
+#define vle32ff_v_f32m2_m(...) __riscv_vle32ff_v_f32m2_tumu(__VA_ARGS__)
+#define vle32ff_v_f32m4_m(...) __riscv_vle32ff_v_f32m4_tumu(__VA_ARGS__)
+#define vle32ff_v_f32m8_m(...) __riscv_vle32ff_v_f32m8_tumu(__VA_ARGS__)
+#define vle64ff_v_f64m1_m(...) __riscv_vle64ff_v_f64m1_tumu(__VA_ARGS__)
+#define vle64ff_v_f64m2_m(...) __riscv_vle64ff_v_f64m2_tumu(__VA_ARGS__)
+#define vle64ff_v_f64m4_m(...) __riscv_vle64ff_v_f64m4_tumu(__VA_ARGS__)
+#define vle64ff_v_f64m8_m(...) __riscv_vle64ff_v_f64m8_tumu(__VA_ARGS__)
+#define vle8ff_v_i8mf8_m(...) __riscv_vle8ff_v_i8mf8_tumu(__VA_ARGS__)
+#define vle8ff_v_i8mf4_m(...) __riscv_vle8ff_v_i8mf4_tumu(__VA_ARGS__)
+#define vle8ff_v_i8mf2_m(...) __riscv_vle8ff_v_i8mf2_tumu(__VA_ARGS__)
+#define vle8ff_v_i8m1_m(...) __riscv_vle8ff_v_i8m1_tumu(__VA_ARGS__)
+#define vle8ff_v_i8m2_m(...) __riscv_vle8ff_v_i8m2_tumu(__VA_ARGS__)
+#define vle8ff_v_i8m4_m(...) __riscv_vle8ff_v_i8m4_tumu(__VA_ARGS__)
+#define vle8ff_v_i8m8_m(...) __riscv_vle8ff_v_i8m8_tumu(__VA_ARGS__)
+#define vle16ff_v_i16mf4_m(...) __riscv_vle16ff_v_i16mf4_tumu(__VA_ARGS__)
+#define vle16ff_v_i16mf2_m(...) __riscv_vle16ff_v_i16mf2_tumu(__VA_ARGS__)
+#define vle16ff_v_i16m1_m(...) __riscv_vle16ff_v_i16m1_tumu(__VA_ARGS__)
+#define vle16ff_v_i16m2_m(...) __riscv_vle16ff_v_i16m2_tumu(__VA_ARGS__)
+#define vle16ff_v_i16m4_m(...) __riscv_vle16ff_v_i16m4_tumu(__VA_ARGS__)
+#define vle16ff_v_i16m8_m(...) __riscv_vle16ff_v_i16m8_tumu(__VA_ARGS__)
+#define vle32ff_v_i32mf2_m(...) __riscv_vle32ff_v_i32mf2_tumu(__VA_ARGS__)
+#define vle32ff_v_i32m1_m(...) __riscv_vle32ff_v_i32m1_tumu(__VA_ARGS__)
+#define vle32ff_v_i32m2_m(...) __riscv_vle32ff_v_i32m2_tumu(__VA_ARGS__)
+#define vle32ff_v_i32m4_m(...) __riscv_vle32ff_v_i32m4_tumu(__VA_ARGS__)
+#define vle32ff_v_i32m8_m(...) __riscv_vle32ff_v_i32m8_tumu(__VA_ARGS__)
+#define vle64ff_v_i64m1_m(...) __riscv_vle64ff_v_i64m1_tumu(__VA_ARGS__)
+#define vle64ff_v_i64m2_m(...) __riscv_vle64ff_v_i64m2_tumu(__VA_ARGS__)
+#define vle64ff_v_i64m4_m(...) __riscv_vle64ff_v_i64m4_tumu(__VA_ARGS__)
+#define vle64ff_v_i64m8_m(...) __riscv_vle64ff_v_i64m8_tumu(__VA_ARGS__)
+#define vle8ff_v_u8mf8_m(...) __riscv_vle8ff_v_u8mf8_tumu(__VA_ARGS__)
+#define vle8ff_v_u8mf4_m(...) __riscv_vle8ff_v_u8mf4_tumu(__VA_ARGS__)
+#define vle8ff_v_u8mf2_m(...) __riscv_vle8ff_v_u8mf2_tumu(__VA_ARGS__)
+#define vle8ff_v_u8m1_m(...) __riscv_vle8ff_v_u8m1_tumu(__VA_ARGS__)
+#define vle8ff_v_u8m2_m(...) __riscv_vle8ff_v_u8m2_tumu(__VA_ARGS__)
+#define vle8ff_v_u8m4_m(...) __riscv_vle8ff_v_u8m4_tumu(__VA_ARGS__)
+#define vle8ff_v_u8m8_m(...) __riscv_vle8ff_v_u8m8_tumu(__VA_ARGS__)
+#define vle16ff_v_u16mf4_m(...) __riscv_vle16ff_v_u16mf4_tumu(__VA_ARGS__)
+#define vle16ff_v_u16mf2_m(...) __riscv_vle16ff_v_u16mf2_tumu(__VA_ARGS__)
+#define vle16ff_v_u16m1_m(...) __riscv_vle16ff_v_u16m1_tumu(__VA_ARGS__)
+#define vle16ff_v_u16m2_m(...) __riscv_vle16ff_v_u16m2_tumu(__VA_ARGS__)
+#define vle16ff_v_u16m4_m(...) __riscv_vle16ff_v_u16m4_tumu(__VA_ARGS__)
+#define vle16ff_v_u16m8_m(...) __riscv_vle16ff_v_u16m8_tumu(__VA_ARGS__)
+#define vle32ff_v_u32mf2_m(...) __riscv_vle32ff_v_u32mf2_tumu(__VA_ARGS__)
+#define vle32ff_v_u32m1_m(...) __riscv_vle32ff_v_u32m1_tumu(__VA_ARGS__)
+#define vle32ff_v_u32m2_m(...) __riscv_vle32ff_v_u32m2_tumu(__VA_ARGS__)
+#define vle32ff_v_u32m4_m(...) __riscv_vle32ff_v_u32m4_tumu(__VA_ARGS__)
+#define vle32ff_v_u32m8_m(...) __riscv_vle32ff_v_u32m8_tumu(__VA_ARGS__)
+#define vle64ff_v_u64m1_m(...) __riscv_vle64ff_v_u64m1_tumu(__VA_ARGS__)
+#define vle64ff_v_u64m2_m(...) __riscv_vle64ff_v_u64m2_tumu(__VA_ARGS__)
+#define vle64ff_v_u64m4_m(...) __riscv_vle64ff_v_u64m4_tumu(__VA_ARGS__)
+#define vle64ff_v_u64m8_m(...) __riscv_vle64ff_v_u64m8_tumu(__VA_ARGS__)
+#define vlseg2e16_v_f16mf4(...) __riscv_vlseg2e16_v_f16mf4(__VA_ARGS__) // vlseg<N>e<W>_v_f* aliases: forward all arguments to the identically named __riscv_ intrinsic
+#define vlseg3e16_v_f16mf4(...) __riscv_vlseg3e16_v_f16mf4(__VA_ARGS__)
+#define vlseg4e16_v_f16mf4(...) __riscv_vlseg4e16_v_f16mf4(__VA_ARGS__)
+#define vlseg5e16_v_f16mf4(...) __riscv_vlseg5e16_v_f16mf4(__VA_ARGS__)
+#define vlseg6e16_v_f16mf4(...) __riscv_vlseg6e16_v_f16mf4(__VA_ARGS__)
+#define vlseg7e16_v_f16mf4(...) __riscv_vlseg7e16_v_f16mf4(__VA_ARGS__)
+#define vlseg8e16_v_f16mf4(...) __riscv_vlseg8e16_v_f16mf4(__VA_ARGS__)
+#define vlseg2e16_v_f16mf2(...) __riscv_vlseg2e16_v_f16mf2(__VA_ARGS__)
+#define vlseg3e16_v_f16mf2(...) __riscv_vlseg3e16_v_f16mf2(__VA_ARGS__)
+#define vlseg4e16_v_f16mf2(...) __riscv_vlseg4e16_v_f16mf2(__VA_ARGS__)
+#define vlseg5e16_v_f16mf2(...) __riscv_vlseg5e16_v_f16mf2(__VA_ARGS__)
+#define vlseg6e16_v_f16mf2(...) __riscv_vlseg6e16_v_f16mf2(__VA_ARGS__)
+#define vlseg7e16_v_f16mf2(...) __riscv_vlseg7e16_v_f16mf2(__VA_ARGS__)
+#define vlseg8e16_v_f16mf2(...) __riscv_vlseg8e16_v_f16mf2(__VA_ARGS__)
+#define vlseg2e16_v_f16m1(...) __riscv_vlseg2e16_v_f16m1(__VA_ARGS__)
+#define vlseg3e16_v_f16m1(...) __riscv_vlseg3e16_v_f16m1(__VA_ARGS__)
+#define vlseg4e16_v_f16m1(...) __riscv_vlseg4e16_v_f16m1(__VA_ARGS__)
+#define vlseg5e16_v_f16m1(...) __riscv_vlseg5e16_v_f16m1(__VA_ARGS__)
+#define vlseg6e16_v_f16m1(...) __riscv_vlseg6e16_v_f16m1(__VA_ARGS__)
+#define vlseg7e16_v_f16m1(...) __riscv_vlseg7e16_v_f16m1(__VA_ARGS__)
+#define vlseg8e16_v_f16m1(...) __riscv_vlseg8e16_v_f16m1(__VA_ARGS__)
+#define vlseg2e16_v_f16m2(...) __riscv_vlseg2e16_v_f16m2(__VA_ARGS__)
+#define vlseg3e16_v_f16m2(...) __riscv_vlseg3e16_v_f16m2(__VA_ARGS__)
+#define vlseg4e16_v_f16m2(...) __riscv_vlseg4e16_v_f16m2(__VA_ARGS__)
+#define vlseg2e16_v_f16m4(...) __riscv_vlseg2e16_v_f16m4(__VA_ARGS__)
+#define vlseg2e32_v_f32mf2(...) __riscv_vlseg2e32_v_f32mf2(__VA_ARGS__)
+#define vlseg3e32_v_f32mf2(...) __riscv_vlseg3e32_v_f32mf2(__VA_ARGS__)
+#define vlseg4e32_v_f32mf2(...) __riscv_vlseg4e32_v_f32mf2(__VA_ARGS__)
+#define vlseg5e32_v_f32mf2(...) __riscv_vlseg5e32_v_f32mf2(__VA_ARGS__)
+#define vlseg6e32_v_f32mf2(...) __riscv_vlseg6e32_v_f32mf2(__VA_ARGS__)
+#define vlseg7e32_v_f32mf2(...) __riscv_vlseg7e32_v_f32mf2(__VA_ARGS__)
+#define vlseg8e32_v_f32mf2(...) __riscv_vlseg8e32_v_f32mf2(__VA_ARGS__)
+#define vlseg2e32_v_f32m1(...) __riscv_vlseg2e32_v_f32m1(__VA_ARGS__)
+#define vlseg3e32_v_f32m1(...) __riscv_vlseg3e32_v_f32m1(__VA_ARGS__)
+#define vlseg4e32_v_f32m1(...) __riscv_vlseg4e32_v_f32m1(__VA_ARGS__)
+#define vlseg5e32_v_f32m1(...) __riscv_vlseg5e32_v_f32m1(__VA_ARGS__)
+#define vlseg6e32_v_f32m1(...) __riscv_vlseg6e32_v_f32m1(__VA_ARGS__)
+#define vlseg7e32_v_f32m1(...) __riscv_vlseg7e32_v_f32m1(__VA_ARGS__)
+#define vlseg8e32_v_f32m1(...) __riscv_vlseg8e32_v_f32m1(__VA_ARGS__)
+#define vlseg2e32_v_f32m2(...) __riscv_vlseg2e32_v_f32m2(__VA_ARGS__)
+#define vlseg3e32_v_f32m2(...) __riscv_vlseg3e32_v_f32m2(__VA_ARGS__)
+#define vlseg4e32_v_f32m2(...) __riscv_vlseg4e32_v_f32m2(__VA_ARGS__)
+#define vlseg2e32_v_f32m4(...) __riscv_vlseg2e32_v_f32m4(__VA_ARGS__)
+#define vlseg2e64_v_f64m1(...) __riscv_vlseg2e64_v_f64m1(__VA_ARGS__)
+#define vlseg3e64_v_f64m1(...) __riscv_vlseg3e64_v_f64m1(__VA_ARGS__)
+#define vlseg4e64_v_f64m1(...) __riscv_vlseg4e64_v_f64m1(__VA_ARGS__)
+#define vlseg5e64_v_f64m1(...) __riscv_vlseg5e64_v_f64m1(__VA_ARGS__)
+#define vlseg6e64_v_f64m1(...) __riscv_vlseg6e64_v_f64m1(__VA_ARGS__)
+#define vlseg7e64_v_f64m1(...) __riscv_vlseg7e64_v_f64m1(__VA_ARGS__)
+#define vlseg8e64_v_f64m1(...) __riscv_vlseg8e64_v_f64m1(__VA_ARGS__)
+#define vlseg2e64_v_f64m2(...) __riscv_vlseg2e64_v_f64m2(__VA_ARGS__)
+#define vlseg3e64_v_f64m2(...) __riscv_vlseg3e64_v_f64m2(__VA_ARGS__)
+#define vlseg4e64_v_f64m2(...) __riscv_vlseg4e64_v_f64m2(__VA_ARGS__)
+#define vlseg2e64_v_f64m4(...) __riscv_vlseg2e64_v_f64m4(__VA_ARGS__)
+#define vlseg2e16ff_v_f16mf4(...) __riscv_vlseg2e16ff_v_f16mf4(__VA_ARGS__) // vlseg<N>e<W>ff_v_f* aliases: forward all arguments to the identically named __riscv_ intrinsic
+#define vlseg3e16ff_v_f16mf4(...) __riscv_vlseg3e16ff_v_f16mf4(__VA_ARGS__)
+#define vlseg4e16ff_v_f16mf4(...) __riscv_vlseg4e16ff_v_f16mf4(__VA_ARGS__)
+#define vlseg5e16ff_v_f16mf4(...) __riscv_vlseg5e16ff_v_f16mf4(__VA_ARGS__)
+#define vlseg6e16ff_v_f16mf4(...) __riscv_vlseg6e16ff_v_f16mf4(__VA_ARGS__)
+#define vlseg7e16ff_v_f16mf4(...) __riscv_vlseg7e16ff_v_f16mf4(__VA_ARGS__)
+#define vlseg8e16ff_v_f16mf4(...) __riscv_vlseg8e16ff_v_f16mf4(__VA_ARGS__)
+#define vlseg2e16ff_v_f16mf2(...) __riscv_vlseg2e16ff_v_f16mf2(__VA_ARGS__)
+#define vlseg3e16ff_v_f16mf2(...) __riscv_vlseg3e16ff_v_f16mf2(__VA_ARGS__)
+#define vlseg4e16ff_v_f16mf2(...) __riscv_vlseg4e16ff_v_f16mf2(__VA_ARGS__)
+#define vlseg5e16ff_v_f16mf2(...) __riscv_vlseg5e16ff_v_f16mf2(__VA_ARGS__)
+#define vlseg6e16ff_v_f16mf2(...) __riscv_vlseg6e16ff_v_f16mf2(__VA_ARGS__)
+#define vlseg7e16ff_v_f16mf2(...) __riscv_vlseg7e16ff_v_f16mf2(__VA_ARGS__)
+#define vlseg8e16ff_v_f16mf2(...) __riscv_vlseg8e16ff_v_f16mf2(__VA_ARGS__)
+#define vlseg2e16ff_v_f16m1(...) __riscv_vlseg2e16ff_v_f16m1(__VA_ARGS__)
+#define vlseg3e16ff_v_f16m1(...) __riscv_vlseg3e16ff_v_f16m1(__VA_ARGS__)
+#define vlseg4e16ff_v_f16m1(...) __riscv_vlseg4e16ff_v_f16m1(__VA_ARGS__)
+#define vlseg5e16ff_v_f16m1(...) __riscv_vlseg5e16ff_v_f16m1(__VA_ARGS__)
+#define vlseg6e16ff_v_f16m1(...) __riscv_vlseg6e16ff_v_f16m1(__VA_ARGS__)
+#define vlseg7e16ff_v_f16m1(...) __riscv_vlseg7e16ff_v_f16m1(__VA_ARGS__)
+#define vlseg8e16ff_v_f16m1(...) __riscv_vlseg8e16ff_v_f16m1(__VA_ARGS__)
+#define vlseg2e16ff_v_f16m2(...) __riscv_vlseg2e16ff_v_f16m2(__VA_ARGS__)
+#define vlseg3e16ff_v_f16m2(...) __riscv_vlseg3e16ff_v_f16m2(__VA_ARGS__)
+#define vlseg4e16ff_v_f16m2(...) __riscv_vlseg4e16ff_v_f16m2(__VA_ARGS__)
+#define vlseg2e16ff_v_f16m4(...) __riscv_vlseg2e16ff_v_f16m4(__VA_ARGS__)
+#define vlseg2e32ff_v_f32mf2(...) __riscv_vlseg2e32ff_v_f32mf2(__VA_ARGS__)
+#define vlseg3e32ff_v_f32mf2(...) __riscv_vlseg3e32ff_v_f32mf2(__VA_ARGS__)
+#define vlseg4e32ff_v_f32mf2(...) __riscv_vlseg4e32ff_v_f32mf2(__VA_ARGS__)
+#define vlseg5e32ff_v_f32mf2(...) __riscv_vlseg5e32ff_v_f32mf2(__VA_ARGS__)
+#define vlseg6e32ff_v_f32mf2(...) __riscv_vlseg6e32ff_v_f32mf2(__VA_ARGS__)
+#define vlseg7e32ff_v_f32mf2(...) __riscv_vlseg7e32ff_v_f32mf2(__VA_ARGS__)
+#define vlseg8e32ff_v_f32mf2(...) __riscv_vlseg8e32ff_v_f32mf2(__VA_ARGS__)
+#define vlseg2e32ff_v_f32m1(...) __riscv_vlseg2e32ff_v_f32m1(__VA_ARGS__)
+#define vlseg3e32ff_v_f32m1(...) __riscv_vlseg3e32ff_v_f32m1(__VA_ARGS__)
+#define vlseg4e32ff_v_f32m1(...) __riscv_vlseg4e32ff_v_f32m1(__VA_ARGS__)
+#define vlseg5e32ff_v_f32m1(...) __riscv_vlseg5e32ff_v_f32m1(__VA_ARGS__)
+#define vlseg6e32ff_v_f32m1(...) __riscv_vlseg6e32ff_v_f32m1(__VA_ARGS__)
+#define vlseg7e32ff_v_f32m1(...) __riscv_vlseg7e32ff_v_f32m1(__VA_ARGS__)
+#define vlseg8e32ff_v_f32m1(...) __riscv_vlseg8e32ff_v_f32m1(__VA_ARGS__)
+#define vlseg2e32ff_v_f32m2(...) __riscv_vlseg2e32ff_v_f32m2(__VA_ARGS__)
+#define vlseg3e32ff_v_f32m2(...) __riscv_vlseg3e32ff_v_f32m2(__VA_ARGS__)
+#define vlseg4e32ff_v_f32m2(...) __riscv_vlseg4e32ff_v_f32m2(__VA_ARGS__)
+#define vlseg2e32ff_v_f32m4(...) __riscv_vlseg2e32ff_v_f32m4(__VA_ARGS__)
+#define vlseg2e64ff_v_f64m1(...) __riscv_vlseg2e64ff_v_f64m1(__VA_ARGS__)
+#define vlseg3e64ff_v_f64m1(...) __riscv_vlseg3e64ff_v_f64m1(__VA_ARGS__)
+#define vlseg4e64ff_v_f64m1(...) __riscv_vlseg4e64ff_v_f64m1(__VA_ARGS__)
+#define vlseg5e64ff_v_f64m1(...) __riscv_vlseg5e64ff_v_f64m1(__VA_ARGS__)
+#define vlseg6e64ff_v_f64m1(...) __riscv_vlseg6e64ff_v_f64m1(__VA_ARGS__)
+#define vlseg7e64ff_v_f64m1(...) __riscv_vlseg7e64ff_v_f64m1(__VA_ARGS__)
+#define vlseg8e64ff_v_f64m1(...) __riscv_vlseg8e64ff_v_f64m1(__VA_ARGS__)
+#define vlseg2e64ff_v_f64m2(...) __riscv_vlseg2e64ff_v_f64m2(__VA_ARGS__)
+#define vlseg3e64ff_v_f64m2(...) __riscv_vlseg3e64ff_v_f64m2(__VA_ARGS__)
+#define vlseg4e64ff_v_f64m2(...) __riscv_vlseg4e64ff_v_f64m2(__VA_ARGS__)
+#define vlseg2e64ff_v_f64m4(...) __riscv_vlseg2e64ff_v_f64m4(__VA_ARGS__)
+#define vlseg2e8_v_i8mf8(...) __riscv_vlseg2e8_v_i8mf8(__VA_ARGS__) // vlseg<N>e<W>_v_i* aliases: forward all arguments to the identically named __riscv_ intrinsic
+#define vlseg3e8_v_i8mf8(...) __riscv_vlseg3e8_v_i8mf8(__VA_ARGS__)
+#define vlseg4e8_v_i8mf8(...) __riscv_vlseg4e8_v_i8mf8(__VA_ARGS__)
+#define vlseg5e8_v_i8mf8(...) __riscv_vlseg5e8_v_i8mf8(__VA_ARGS__)
+#define vlseg6e8_v_i8mf8(...) __riscv_vlseg6e8_v_i8mf8(__VA_ARGS__)
+#define vlseg7e8_v_i8mf8(...) __riscv_vlseg7e8_v_i8mf8(__VA_ARGS__)
+#define vlseg8e8_v_i8mf8(...) __riscv_vlseg8e8_v_i8mf8(__VA_ARGS__)
+#define vlseg2e8_v_i8mf4(...) __riscv_vlseg2e8_v_i8mf4(__VA_ARGS__)
+#define vlseg3e8_v_i8mf4(...) __riscv_vlseg3e8_v_i8mf4(__VA_ARGS__)
+#define vlseg4e8_v_i8mf4(...) __riscv_vlseg4e8_v_i8mf4(__VA_ARGS__)
+#define vlseg5e8_v_i8mf4(...) __riscv_vlseg5e8_v_i8mf4(__VA_ARGS__)
+#define vlseg6e8_v_i8mf4(...) __riscv_vlseg6e8_v_i8mf4(__VA_ARGS__)
+#define vlseg7e8_v_i8mf4(...) __riscv_vlseg7e8_v_i8mf4(__VA_ARGS__)
+#define vlseg8e8_v_i8mf4(...) __riscv_vlseg8e8_v_i8mf4(__VA_ARGS__)
+#define vlseg2e8_v_i8mf2(...) __riscv_vlseg2e8_v_i8mf2(__VA_ARGS__)
+#define vlseg3e8_v_i8mf2(...) __riscv_vlseg3e8_v_i8mf2(__VA_ARGS__)
+#define vlseg4e8_v_i8mf2(...) __riscv_vlseg4e8_v_i8mf2(__VA_ARGS__)
+#define vlseg5e8_v_i8mf2(...) __riscv_vlseg5e8_v_i8mf2(__VA_ARGS__)
+#define vlseg6e8_v_i8mf2(...) __riscv_vlseg6e8_v_i8mf2(__VA_ARGS__)
+#define vlseg7e8_v_i8mf2(...) __riscv_vlseg7e8_v_i8mf2(__VA_ARGS__)
+#define vlseg8e8_v_i8mf2(...) __riscv_vlseg8e8_v_i8mf2(__VA_ARGS__)
+#define vlseg2e8_v_i8m1(...) __riscv_vlseg2e8_v_i8m1(__VA_ARGS__)
+#define vlseg3e8_v_i8m1(...) __riscv_vlseg3e8_v_i8m1(__VA_ARGS__)
+#define vlseg4e8_v_i8m1(...) __riscv_vlseg4e8_v_i8m1(__VA_ARGS__)
+#define vlseg5e8_v_i8m1(...) __riscv_vlseg5e8_v_i8m1(__VA_ARGS__)
+#define vlseg6e8_v_i8m1(...) __riscv_vlseg6e8_v_i8m1(__VA_ARGS__)
+#define vlseg7e8_v_i8m1(...) __riscv_vlseg7e8_v_i8m1(__VA_ARGS__)
+#define vlseg8e8_v_i8m1(...) __riscv_vlseg8e8_v_i8m1(__VA_ARGS__)
+#define vlseg2e8_v_i8m2(...) __riscv_vlseg2e8_v_i8m2(__VA_ARGS__)
+#define vlseg3e8_v_i8m2(...) __riscv_vlseg3e8_v_i8m2(__VA_ARGS__)
+#define vlseg4e8_v_i8m2(...) __riscv_vlseg4e8_v_i8m2(__VA_ARGS__)
+#define vlseg2e8_v_i8m4(...) __riscv_vlseg2e8_v_i8m4(__VA_ARGS__)
+#define vlseg2e16_v_i16mf4(...) __riscv_vlseg2e16_v_i16mf4(__VA_ARGS__)
+#define vlseg3e16_v_i16mf4(...) __riscv_vlseg3e16_v_i16mf4(__VA_ARGS__)
+#define vlseg4e16_v_i16mf4(...) __riscv_vlseg4e16_v_i16mf4(__VA_ARGS__)
+#define vlseg5e16_v_i16mf4(...) __riscv_vlseg5e16_v_i16mf4(__VA_ARGS__)
+#define vlseg6e16_v_i16mf4(...) __riscv_vlseg6e16_v_i16mf4(__VA_ARGS__)
+#define vlseg7e16_v_i16mf4(...) __riscv_vlseg7e16_v_i16mf4(__VA_ARGS__)
+#define vlseg8e16_v_i16mf4(...) __riscv_vlseg8e16_v_i16mf4(__VA_ARGS__)
+#define vlseg2e16_v_i16mf2(...) __riscv_vlseg2e16_v_i16mf2(__VA_ARGS__)
+#define vlseg3e16_v_i16mf2(...) __riscv_vlseg3e16_v_i16mf2(__VA_ARGS__)
+#define vlseg4e16_v_i16mf2(...) __riscv_vlseg4e16_v_i16mf2(__VA_ARGS__)
+#define vlseg5e16_v_i16mf2(...) __riscv_vlseg5e16_v_i16mf2(__VA_ARGS__)
+#define vlseg6e16_v_i16mf2(...) __riscv_vlseg6e16_v_i16mf2(__VA_ARGS__)
+#define vlseg7e16_v_i16mf2(...) __riscv_vlseg7e16_v_i16mf2(__VA_ARGS__)
+#define vlseg8e16_v_i16mf2(...) __riscv_vlseg8e16_v_i16mf2(__VA_ARGS__)
+#define vlseg2e16_v_i16m1(...) __riscv_vlseg2e16_v_i16m1(__VA_ARGS__)
+#define vlseg3e16_v_i16m1(...) __riscv_vlseg3e16_v_i16m1(__VA_ARGS__)
+#define vlseg4e16_v_i16m1(...) __riscv_vlseg4e16_v_i16m1(__VA_ARGS__)
+#define vlseg5e16_v_i16m1(...) __riscv_vlseg5e16_v_i16m1(__VA_ARGS__)
+#define vlseg6e16_v_i16m1(...) __riscv_vlseg6e16_v_i16m1(__VA_ARGS__)
+#define vlseg7e16_v_i16m1(...) __riscv_vlseg7e16_v_i16m1(__VA_ARGS__)
+#define vlseg8e16_v_i16m1(...) __riscv_vlseg8e16_v_i16m1(__VA_ARGS__)
+#define vlseg2e16_v_i16m2(...) __riscv_vlseg2e16_v_i16m2(__VA_ARGS__)
+#define vlseg3e16_v_i16m2(...) __riscv_vlseg3e16_v_i16m2(__VA_ARGS__)
+#define vlseg4e16_v_i16m2(...) __riscv_vlseg4e16_v_i16m2(__VA_ARGS__)
+#define vlseg2e16_v_i16m4(...) __riscv_vlseg2e16_v_i16m4(__VA_ARGS__)
+#define vlseg2e32_v_i32mf2(...) __riscv_vlseg2e32_v_i32mf2(__VA_ARGS__)
+#define vlseg3e32_v_i32mf2(...) __riscv_vlseg3e32_v_i32mf2(__VA_ARGS__)
+#define vlseg4e32_v_i32mf2(...) __riscv_vlseg4e32_v_i32mf2(__VA_ARGS__)
+#define vlseg5e32_v_i32mf2(...) __riscv_vlseg5e32_v_i32mf2(__VA_ARGS__)
+#define vlseg6e32_v_i32mf2(...) __riscv_vlseg6e32_v_i32mf2(__VA_ARGS__)
+#define vlseg7e32_v_i32mf2(...) __riscv_vlseg7e32_v_i32mf2(__VA_ARGS__)
+#define vlseg8e32_v_i32mf2(...) __riscv_vlseg8e32_v_i32mf2(__VA_ARGS__)
+#define vlseg2e32_v_i32m1(...) __riscv_vlseg2e32_v_i32m1(__VA_ARGS__)
+#define vlseg3e32_v_i32m1(...) __riscv_vlseg3e32_v_i32m1(__VA_ARGS__)
+#define vlseg4e32_v_i32m1(...) __riscv_vlseg4e32_v_i32m1(__VA_ARGS__)
+#define vlseg5e32_v_i32m1(...) __riscv_vlseg5e32_v_i32m1(__VA_ARGS__)
+#define vlseg6e32_v_i32m1(...) __riscv_vlseg6e32_v_i32m1(__VA_ARGS__)
+#define vlseg7e32_v_i32m1(...) __riscv_vlseg7e32_v_i32m1(__VA_ARGS__)
+#define vlseg8e32_v_i32m1(...) __riscv_vlseg8e32_v_i32m1(__VA_ARGS__)
+#define vlseg2e32_v_i32m2(...) __riscv_vlseg2e32_v_i32m2(__VA_ARGS__)
+#define vlseg3e32_v_i32m2(...) __riscv_vlseg3e32_v_i32m2(__VA_ARGS__)
+#define vlseg4e32_v_i32m2(...) __riscv_vlseg4e32_v_i32m2(__VA_ARGS__)
+#define vlseg2e32_v_i32m4(...) __riscv_vlseg2e32_v_i32m4(__VA_ARGS__)
+#define vlseg2e64_v_i64m1(...) __riscv_vlseg2e64_v_i64m1(__VA_ARGS__)
+#define vlseg3e64_v_i64m1(...) __riscv_vlseg3e64_v_i64m1(__VA_ARGS__)
+#define vlseg4e64_v_i64m1(...) __riscv_vlseg4e64_v_i64m1(__VA_ARGS__)
+#define vlseg5e64_v_i64m1(...) __riscv_vlseg5e64_v_i64m1(__VA_ARGS__)
+#define vlseg6e64_v_i64m1(...) __riscv_vlseg6e64_v_i64m1(__VA_ARGS__)
+#define vlseg7e64_v_i64m1(...) __riscv_vlseg7e64_v_i64m1(__VA_ARGS__)
+#define vlseg8e64_v_i64m1(...) __riscv_vlseg8e64_v_i64m1(__VA_ARGS__)
+#define vlseg2e64_v_i64m2(...) __riscv_vlseg2e64_v_i64m2(__VA_ARGS__)
+#define vlseg3e64_v_i64m2(...) __riscv_vlseg3e64_v_i64m2(__VA_ARGS__)
+#define vlseg4e64_v_i64m2(...) __riscv_vlseg4e64_v_i64m2(__VA_ARGS__)
+#define vlseg2e64_v_i64m4(...) __riscv_vlseg2e64_v_i64m4(__VA_ARGS__)
+#define vlseg2e8ff_v_i8mf8(...) __riscv_vlseg2e8ff_v_i8mf8(__VA_ARGS__) // vlseg<N>e<W>ff_v_i* aliases: forward all arguments to the identically named __riscv_ intrinsic
+#define vlseg3e8ff_v_i8mf8(...) __riscv_vlseg3e8ff_v_i8mf8(__VA_ARGS__)
+#define vlseg4e8ff_v_i8mf8(...) __riscv_vlseg4e8ff_v_i8mf8(__VA_ARGS__)
+#define vlseg5e8ff_v_i8mf8(...) __riscv_vlseg5e8ff_v_i8mf8(__VA_ARGS__)
+#define vlseg6e8ff_v_i8mf8(...) __riscv_vlseg6e8ff_v_i8mf8(__VA_ARGS__)
+#define vlseg7e8ff_v_i8mf8(...) __riscv_vlseg7e8ff_v_i8mf8(__VA_ARGS__)
+#define vlseg8e8ff_v_i8mf8(...) __riscv_vlseg8e8ff_v_i8mf8(__VA_ARGS__)
+#define vlseg2e8ff_v_i8mf4(...) __riscv_vlseg2e8ff_v_i8mf4(__VA_ARGS__)
+#define vlseg3e8ff_v_i8mf4(...) __riscv_vlseg3e8ff_v_i8mf4(__VA_ARGS__)
+#define vlseg4e8ff_v_i8mf4(...) __riscv_vlseg4e8ff_v_i8mf4(__VA_ARGS__)
+#define vlseg5e8ff_v_i8mf4(...) __riscv_vlseg5e8ff_v_i8mf4(__VA_ARGS__)
+#define vlseg6e8ff_v_i8mf4(...) __riscv_vlseg6e8ff_v_i8mf4(__VA_ARGS__)
+#define vlseg7e8ff_v_i8mf4(...) __riscv_vlseg7e8ff_v_i8mf4(__VA_ARGS__)
+#define vlseg8e8ff_v_i8mf4(...) __riscv_vlseg8e8ff_v_i8mf4(__VA_ARGS__)
+#define vlseg2e8ff_v_i8mf2(...) __riscv_vlseg2e8ff_v_i8mf2(__VA_ARGS__)
+#define vlseg3e8ff_v_i8mf2(...) __riscv_vlseg3e8ff_v_i8mf2(__VA_ARGS__)
+#define vlseg4e8ff_v_i8mf2(...) __riscv_vlseg4e8ff_v_i8mf2(__VA_ARGS__)
+#define vlseg5e8ff_v_i8mf2(...) __riscv_vlseg5e8ff_v_i8mf2(__VA_ARGS__)
+#define vlseg6e8ff_v_i8mf2(...) __riscv_vlseg6e8ff_v_i8mf2(__VA_ARGS__)
+#define vlseg7e8ff_v_i8mf2(...) __riscv_vlseg7e8ff_v_i8mf2(__VA_ARGS__)
+#define vlseg8e8ff_v_i8mf2(...) __riscv_vlseg8e8ff_v_i8mf2(__VA_ARGS__)
+#define vlseg2e8ff_v_i8m1(...) __riscv_vlseg2e8ff_v_i8m1(__VA_ARGS__)
+#define vlseg3e8ff_v_i8m1(...) __riscv_vlseg3e8ff_v_i8m1(__VA_ARGS__)
+#define vlseg4e8ff_v_i8m1(...) __riscv_vlseg4e8ff_v_i8m1(__VA_ARGS__)
+#define vlseg5e8ff_v_i8m1(...) __riscv_vlseg5e8ff_v_i8m1(__VA_ARGS__)
+#define vlseg6e8ff_v_i8m1(...) __riscv_vlseg6e8ff_v_i8m1(__VA_ARGS__)
+#define vlseg7e8ff_v_i8m1(...) __riscv_vlseg7e8ff_v_i8m1(__VA_ARGS__)
+#define vlseg8e8ff_v_i8m1(...) __riscv_vlseg8e8ff_v_i8m1(__VA_ARGS__)
+#define vlseg2e8ff_v_i8m2(...) __riscv_vlseg2e8ff_v_i8m2(__VA_ARGS__)
+#define vlseg3e8ff_v_i8m2(...) __riscv_vlseg3e8ff_v_i8m2(__VA_ARGS__)
+#define vlseg4e8ff_v_i8m2(...) __riscv_vlseg4e8ff_v_i8m2(__VA_ARGS__)
+#define vlseg2e8ff_v_i8m4(...) __riscv_vlseg2e8ff_v_i8m4(__VA_ARGS__)
+#define vlseg2e16ff_v_i16mf4(...) __riscv_vlseg2e16ff_v_i16mf4(__VA_ARGS__)
+#define vlseg3e16ff_v_i16mf4(...) __riscv_vlseg3e16ff_v_i16mf4(__VA_ARGS__)
+#define vlseg4e16ff_v_i16mf4(...) __riscv_vlseg4e16ff_v_i16mf4(__VA_ARGS__)
+#define vlseg5e16ff_v_i16mf4(...) __riscv_vlseg5e16ff_v_i16mf4(__VA_ARGS__)
+#define vlseg6e16ff_v_i16mf4(...) __riscv_vlseg6e16ff_v_i16mf4(__VA_ARGS__)
+#define vlseg7e16ff_v_i16mf4(...) __riscv_vlseg7e16ff_v_i16mf4(__VA_ARGS__)
+#define vlseg8e16ff_v_i16mf4(...) __riscv_vlseg8e16ff_v_i16mf4(__VA_ARGS__)
+#define vlseg2e16ff_v_i16mf2(...) __riscv_vlseg2e16ff_v_i16mf2(__VA_ARGS__)
+#define vlseg3e16ff_v_i16mf2(...) __riscv_vlseg3e16ff_v_i16mf2(__VA_ARGS__)
+#define vlseg4e16ff_v_i16mf2(...) __riscv_vlseg4e16ff_v_i16mf2(__VA_ARGS__)
+#define vlseg5e16ff_v_i16mf2(...) __riscv_vlseg5e16ff_v_i16mf2(__VA_ARGS__)
+#define vlseg6e16ff_v_i16mf2(...) __riscv_vlseg6e16ff_v_i16mf2(__VA_ARGS__)
+#define vlseg7e16ff_v_i16mf2(...) __riscv_vlseg7e16ff_v_i16mf2(__VA_ARGS__)
+#define vlseg8e16ff_v_i16mf2(...) __riscv_vlseg8e16ff_v_i16mf2(__VA_ARGS__)
+#define vlseg2e16ff_v_i16m1(...) __riscv_vlseg2e16ff_v_i16m1(__VA_ARGS__)
+#define vlseg3e16ff_v_i16m1(...) __riscv_vlseg3e16ff_v_i16m1(__VA_ARGS__)
+#define vlseg4e16ff_v_i16m1(...) __riscv_vlseg4e16ff_v_i16m1(__VA_ARGS__)
+#define vlseg5e16ff_v_i16m1(...) __riscv_vlseg5e16ff_v_i16m1(__VA_ARGS__)
+#define vlseg6e16ff_v_i16m1(...) __riscv_vlseg6e16ff_v_i16m1(__VA_ARGS__)
+#define vlseg7e16ff_v_i16m1(...) __riscv_vlseg7e16ff_v_i16m1(__VA_ARGS__)
+#define vlseg8e16ff_v_i16m1(...) __riscv_vlseg8e16ff_v_i16m1(__VA_ARGS__)
+#define vlseg2e16ff_v_i16m2(...) __riscv_vlseg2e16ff_v_i16m2(__VA_ARGS__)
+#define vlseg3e16ff_v_i16m2(...) __riscv_vlseg3e16ff_v_i16m2(__VA_ARGS__)
+#define vlseg4e16ff_v_i16m2(...) __riscv_vlseg4e16ff_v_i16m2(__VA_ARGS__)
+#define vlseg2e16ff_v_i16m4(...) __riscv_vlseg2e16ff_v_i16m4(__VA_ARGS__)
+#define vlseg2e32ff_v_i32mf2(...) __riscv_vlseg2e32ff_v_i32mf2(__VA_ARGS__)
+#define vlseg3e32ff_v_i32mf2(...) __riscv_vlseg3e32ff_v_i32mf2(__VA_ARGS__)
+#define vlseg4e32ff_v_i32mf2(...) __riscv_vlseg4e32ff_v_i32mf2(__VA_ARGS__)
+#define vlseg5e32ff_v_i32mf2(...) __riscv_vlseg5e32ff_v_i32mf2(__VA_ARGS__)
+#define vlseg6e32ff_v_i32mf2(...) __riscv_vlseg6e32ff_v_i32mf2(__VA_ARGS__)
+#define vlseg7e32ff_v_i32mf2(...) __riscv_vlseg7e32ff_v_i32mf2(__VA_ARGS__)
+#define vlseg8e32ff_v_i32mf2(...) __riscv_vlseg8e32ff_v_i32mf2(__VA_ARGS__)
+#define vlseg2e32ff_v_i32m1(...) __riscv_vlseg2e32ff_v_i32m1(__VA_ARGS__)
+#define vlseg3e32ff_v_i32m1(...) __riscv_vlseg3e32ff_v_i32m1(__VA_ARGS__)
+#define vlseg4e32ff_v_i32m1(...) __riscv_vlseg4e32ff_v_i32m1(__VA_ARGS__)
+#define vlseg5e32ff_v_i32m1(...) __riscv_vlseg5e32ff_v_i32m1(__VA_ARGS__)
+#define vlseg6e32ff_v_i32m1(...) __riscv_vlseg6e32ff_v_i32m1(__VA_ARGS__)
+#define vlseg7e32ff_v_i32m1(...) __riscv_vlseg7e32ff_v_i32m1(__VA_ARGS__)
+#define vlseg8e32ff_v_i32m1(...) __riscv_vlseg8e32ff_v_i32m1(__VA_ARGS__)
+#define vlseg2e32ff_v_i32m2(...) __riscv_vlseg2e32ff_v_i32m2(__VA_ARGS__)
+#define vlseg3e32ff_v_i32m2(...) __riscv_vlseg3e32ff_v_i32m2(__VA_ARGS__)
+#define vlseg4e32ff_v_i32m2(...) __riscv_vlseg4e32ff_v_i32m2(__VA_ARGS__)
+#define vlseg2e32ff_v_i32m4(...) __riscv_vlseg2e32ff_v_i32m4(__VA_ARGS__)
+#define vlseg2e64ff_v_i64m1(...) __riscv_vlseg2e64ff_v_i64m1(__VA_ARGS__)
+#define vlseg3e64ff_v_i64m1(...) __riscv_vlseg3e64ff_v_i64m1(__VA_ARGS__)
+#define vlseg4e64ff_v_i64m1(...) __riscv_vlseg4e64ff_v_i64m1(__VA_ARGS__)
+#define vlseg5e64ff_v_i64m1(...) __riscv_vlseg5e64ff_v_i64m1(__VA_ARGS__)
+#define vlseg6e64ff_v_i64m1(...) __riscv_vlseg6e64ff_v_i64m1(__VA_ARGS__)
+#define vlseg7e64ff_v_i64m1(...) __riscv_vlseg7e64ff_v_i64m1(__VA_ARGS__)
+#define vlseg8e64ff_v_i64m1(...) __riscv_vlseg8e64ff_v_i64m1(__VA_ARGS__)
+#define vlseg2e64ff_v_i64m2(...) __riscv_vlseg2e64ff_v_i64m2(__VA_ARGS__)
+#define vlseg3e64ff_v_i64m2(...) __riscv_vlseg3e64ff_v_i64m2(__VA_ARGS__)
+#define vlseg4e64ff_v_i64m2(...) __riscv_vlseg4e64ff_v_i64m2(__VA_ARGS__)
+#define vlseg2e64ff_v_i64m4(...) __riscv_vlseg2e64ff_v_i64m4(__VA_ARGS__)
+#define vlseg2e8_v_u8mf8(...) __riscv_vlseg2e8_v_u8mf8(__VA_ARGS__)
+#define vlseg3e8_v_u8mf8(...) __riscv_vlseg3e8_v_u8mf8(__VA_ARGS__)
+#define vlseg4e8_v_u8mf8(...) __riscv_vlseg4e8_v_u8mf8(__VA_ARGS__)
+#define vlseg5e8_v_u8mf8(...) __riscv_vlseg5e8_v_u8mf8(__VA_ARGS__)
+#define vlseg6e8_v_u8mf8(...) __riscv_vlseg6e8_v_u8mf8(__VA_ARGS__)
+#define vlseg7e8_v_u8mf8(...) __riscv_vlseg7e8_v_u8mf8(__VA_ARGS__)
+#define vlseg8e8_v_u8mf8(...) __riscv_vlseg8e8_v_u8mf8(__VA_ARGS__)
+#define vlseg2e8_v_u8mf4(...) __riscv_vlseg2e8_v_u8mf4(__VA_ARGS__)
+#define vlseg3e8_v_u8mf4(...) __riscv_vlseg3e8_v_u8mf4(__VA_ARGS__)
+#define vlseg4e8_v_u8mf4(...) __riscv_vlseg4e8_v_u8mf4(__VA_ARGS__)
+#define vlseg5e8_v_u8mf4(...) __riscv_vlseg5e8_v_u8mf4(__VA_ARGS__)
+#define vlseg6e8_v_u8mf4(...) __riscv_vlseg6e8_v_u8mf4(__VA_ARGS__)
+#define vlseg7e8_v_u8mf4(...) __riscv_vlseg7e8_v_u8mf4(__VA_ARGS__)
+#define vlseg8e8_v_u8mf4(...) __riscv_vlseg8e8_v_u8mf4(__VA_ARGS__)
+#define vlseg2e8_v_u8mf2(...) __riscv_vlseg2e8_v_u8mf2(__VA_ARGS__)
+#define vlseg3e8_v_u8mf2(...) __riscv_vlseg3e8_v_u8mf2(__VA_ARGS__)
+#define vlseg4e8_v_u8mf2(...) __riscv_vlseg4e8_v_u8mf2(__VA_ARGS__)
+#define vlseg5e8_v_u8mf2(...) __riscv_vlseg5e8_v_u8mf2(__VA_ARGS__)
+#define vlseg6e8_v_u8mf2(...) __riscv_vlseg6e8_v_u8mf2(__VA_ARGS__)
+#define vlseg7e8_v_u8mf2(...) __riscv_vlseg7e8_v_u8mf2(__VA_ARGS__)
+#define vlseg8e8_v_u8mf2(...) __riscv_vlseg8e8_v_u8mf2(__VA_ARGS__)
+#define vlseg2e8_v_u8m1(...) __riscv_vlseg2e8_v_u8m1(__VA_ARGS__)
+#define vlseg3e8_v_u8m1(...) __riscv_vlseg3e8_v_u8m1(__VA_ARGS__)
+#define vlseg4e8_v_u8m1(...) __riscv_vlseg4e8_v_u8m1(__VA_ARGS__)
+#define vlseg5e8_v_u8m1(...) __riscv_vlseg5e8_v_u8m1(__VA_ARGS__)
+#define vlseg6e8_v_u8m1(...) __riscv_vlseg6e8_v_u8m1(__VA_ARGS__)
+#define vlseg7e8_v_u8m1(...) __riscv_vlseg7e8_v_u8m1(__VA_ARGS__)
+#define vlseg8e8_v_u8m1(...) __riscv_vlseg8e8_v_u8m1(__VA_ARGS__)
+#define vlseg2e8_v_u8m2(...) __riscv_vlseg2e8_v_u8m2(__VA_ARGS__)
+#define vlseg3e8_v_u8m2(...) __riscv_vlseg3e8_v_u8m2(__VA_ARGS__)
+#define vlseg4e8_v_u8m2(...) __riscv_vlseg4e8_v_u8m2(__VA_ARGS__)
+#define vlseg2e8_v_u8m4(...) __riscv_vlseg2e8_v_u8m4(__VA_ARGS__)
+#define vlseg2e16_v_u16mf4(...) __riscv_vlseg2e16_v_u16mf4(__VA_ARGS__) // u16 aliases: unprefixed name -> same name with a __riscv_ prefix, arguments forwarded unchanged (mf4, mf2, m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e16_v_u16mf4(...) __riscv_vlseg3e16_v_u16mf4(__VA_ARGS__)
+#define vlseg4e16_v_u16mf4(...) __riscv_vlseg4e16_v_u16mf4(__VA_ARGS__)
+#define vlseg5e16_v_u16mf4(...) __riscv_vlseg5e16_v_u16mf4(__VA_ARGS__)
+#define vlseg6e16_v_u16mf4(...) __riscv_vlseg6e16_v_u16mf4(__VA_ARGS__)
+#define vlseg7e16_v_u16mf4(...) __riscv_vlseg7e16_v_u16mf4(__VA_ARGS__)
+#define vlseg8e16_v_u16mf4(...) __riscv_vlseg8e16_v_u16mf4(__VA_ARGS__)
+#define vlseg2e16_v_u16mf2(...) __riscv_vlseg2e16_v_u16mf2(__VA_ARGS__)
+#define vlseg3e16_v_u16mf2(...) __riscv_vlseg3e16_v_u16mf2(__VA_ARGS__)
+#define vlseg4e16_v_u16mf2(...) __riscv_vlseg4e16_v_u16mf2(__VA_ARGS__)
+#define vlseg5e16_v_u16mf2(...) __riscv_vlseg5e16_v_u16mf2(__VA_ARGS__)
+#define vlseg6e16_v_u16mf2(...) __riscv_vlseg6e16_v_u16mf2(__VA_ARGS__)
+#define vlseg7e16_v_u16mf2(...) __riscv_vlseg7e16_v_u16mf2(__VA_ARGS__)
+#define vlseg8e16_v_u16mf2(...) __riscv_vlseg8e16_v_u16mf2(__VA_ARGS__)
+#define vlseg2e16_v_u16m1(...) __riscv_vlseg2e16_v_u16m1(__VA_ARGS__)
+#define vlseg3e16_v_u16m1(...) __riscv_vlseg3e16_v_u16m1(__VA_ARGS__)
+#define vlseg4e16_v_u16m1(...) __riscv_vlseg4e16_v_u16m1(__VA_ARGS__)
+#define vlseg5e16_v_u16m1(...) __riscv_vlseg5e16_v_u16m1(__VA_ARGS__)
+#define vlseg6e16_v_u16m1(...) __riscv_vlseg6e16_v_u16m1(__VA_ARGS__)
+#define vlseg7e16_v_u16m1(...) __riscv_vlseg7e16_v_u16m1(__VA_ARGS__)
+#define vlseg8e16_v_u16m1(...) __riscv_vlseg8e16_v_u16m1(__VA_ARGS__)
+#define vlseg2e16_v_u16m2(...) __riscv_vlseg2e16_v_u16m2(__VA_ARGS__)
+#define vlseg3e16_v_u16m2(...) __riscv_vlseg3e16_v_u16m2(__VA_ARGS__)
+#define vlseg4e16_v_u16m2(...) __riscv_vlseg4e16_v_u16m2(__VA_ARGS__)
+#define vlseg2e16_v_u16m4(...) __riscv_vlseg2e16_v_u16m4(__VA_ARGS__)
+#define vlseg2e32_v_u32mf2(...) __riscv_vlseg2e32_v_u32mf2(__VA_ARGS__) // u32 aliases: unprefixed name -> same name with a __riscv_ prefix, arguments forwarded unchanged (mf2, m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e32_v_u32mf2(...) __riscv_vlseg3e32_v_u32mf2(__VA_ARGS__)
+#define vlseg4e32_v_u32mf2(...) __riscv_vlseg4e32_v_u32mf2(__VA_ARGS__)
+#define vlseg5e32_v_u32mf2(...) __riscv_vlseg5e32_v_u32mf2(__VA_ARGS__)
+#define vlseg6e32_v_u32mf2(...) __riscv_vlseg6e32_v_u32mf2(__VA_ARGS__)
+#define vlseg7e32_v_u32mf2(...) __riscv_vlseg7e32_v_u32mf2(__VA_ARGS__)
+#define vlseg8e32_v_u32mf2(...) __riscv_vlseg8e32_v_u32mf2(__VA_ARGS__)
+#define vlseg2e32_v_u32m1(...) __riscv_vlseg2e32_v_u32m1(__VA_ARGS__)
+#define vlseg3e32_v_u32m1(...) __riscv_vlseg3e32_v_u32m1(__VA_ARGS__)
+#define vlseg4e32_v_u32m1(...) __riscv_vlseg4e32_v_u32m1(__VA_ARGS__)
+#define vlseg5e32_v_u32m1(...) __riscv_vlseg5e32_v_u32m1(__VA_ARGS__)
+#define vlseg6e32_v_u32m1(...) __riscv_vlseg6e32_v_u32m1(__VA_ARGS__)
+#define vlseg7e32_v_u32m1(...) __riscv_vlseg7e32_v_u32m1(__VA_ARGS__)
+#define vlseg8e32_v_u32m1(...) __riscv_vlseg8e32_v_u32m1(__VA_ARGS__)
+#define vlseg2e32_v_u32m2(...) __riscv_vlseg2e32_v_u32m2(__VA_ARGS__)
+#define vlseg3e32_v_u32m2(...) __riscv_vlseg3e32_v_u32m2(__VA_ARGS__)
+#define vlseg4e32_v_u32m2(...) __riscv_vlseg4e32_v_u32m2(__VA_ARGS__)
+#define vlseg2e32_v_u32m4(...) __riscv_vlseg2e32_v_u32m4(__VA_ARGS__)
+#define vlseg2e64_v_u64m1(...) __riscv_vlseg2e64_v_u64m1(__VA_ARGS__) // u64 aliases: unprefixed name -> same name with a __riscv_ prefix, arguments forwarded unchanged (m1: seg2-seg8; m2: seg2-seg4; m4: seg2; no fractional-LMUL names appear in this run)
+#define vlseg3e64_v_u64m1(...) __riscv_vlseg3e64_v_u64m1(__VA_ARGS__)
+#define vlseg4e64_v_u64m1(...) __riscv_vlseg4e64_v_u64m1(__VA_ARGS__)
+#define vlseg5e64_v_u64m1(...) __riscv_vlseg5e64_v_u64m1(__VA_ARGS__)
+#define vlseg6e64_v_u64m1(...) __riscv_vlseg6e64_v_u64m1(__VA_ARGS__)
+#define vlseg7e64_v_u64m1(...) __riscv_vlseg7e64_v_u64m1(__VA_ARGS__)
+#define vlseg8e64_v_u64m1(...) __riscv_vlseg8e64_v_u64m1(__VA_ARGS__)
+#define vlseg2e64_v_u64m2(...) __riscv_vlseg2e64_v_u64m2(__VA_ARGS__)
+#define vlseg3e64_v_u64m2(...) __riscv_vlseg3e64_v_u64m2(__VA_ARGS__)
+#define vlseg4e64_v_u64m2(...) __riscv_vlseg4e64_v_u64m2(__VA_ARGS__)
+#define vlseg2e64_v_u64m4(...) __riscv_vlseg2e64_v_u64m4(__VA_ARGS__)
+#define vlseg2e8ff_v_u8mf8(...) __riscv_vlseg2e8ff_v_u8mf8(__VA_ARGS__) // u8 "ff" aliases: unprefixed name -> same name with a __riscv_ prefix, arguments forwarded unchanged (mf8, mf4, mf2, m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e8ff_v_u8mf8(...) __riscv_vlseg3e8ff_v_u8mf8(__VA_ARGS__) // NOTE(review): the "ff" infix presumably marks the fault-only-first load forms - confirm against the RVV intrinsics spec
+#define vlseg4e8ff_v_u8mf8(...) __riscv_vlseg4e8ff_v_u8mf8(__VA_ARGS__)
+#define vlseg5e8ff_v_u8mf8(...) __riscv_vlseg5e8ff_v_u8mf8(__VA_ARGS__)
+#define vlseg6e8ff_v_u8mf8(...) __riscv_vlseg6e8ff_v_u8mf8(__VA_ARGS__)
+#define vlseg7e8ff_v_u8mf8(...) __riscv_vlseg7e8ff_v_u8mf8(__VA_ARGS__)
+#define vlseg8e8ff_v_u8mf8(...) __riscv_vlseg8e8ff_v_u8mf8(__VA_ARGS__)
+#define vlseg2e8ff_v_u8mf4(...) __riscv_vlseg2e8ff_v_u8mf4(__VA_ARGS__)
+#define vlseg3e8ff_v_u8mf4(...) __riscv_vlseg3e8ff_v_u8mf4(__VA_ARGS__)
+#define vlseg4e8ff_v_u8mf4(...) __riscv_vlseg4e8ff_v_u8mf4(__VA_ARGS__)
+#define vlseg5e8ff_v_u8mf4(...) __riscv_vlseg5e8ff_v_u8mf4(__VA_ARGS__)
+#define vlseg6e8ff_v_u8mf4(...) __riscv_vlseg6e8ff_v_u8mf4(__VA_ARGS__)
+#define vlseg7e8ff_v_u8mf4(...) __riscv_vlseg7e8ff_v_u8mf4(__VA_ARGS__)
+#define vlseg8e8ff_v_u8mf4(...) __riscv_vlseg8e8ff_v_u8mf4(__VA_ARGS__)
+#define vlseg2e8ff_v_u8mf2(...) __riscv_vlseg2e8ff_v_u8mf2(__VA_ARGS__)
+#define vlseg3e8ff_v_u8mf2(...) __riscv_vlseg3e8ff_v_u8mf2(__VA_ARGS__)
+#define vlseg4e8ff_v_u8mf2(...) __riscv_vlseg4e8ff_v_u8mf2(__VA_ARGS__)
+#define vlseg5e8ff_v_u8mf2(...) __riscv_vlseg5e8ff_v_u8mf2(__VA_ARGS__)
+#define vlseg6e8ff_v_u8mf2(...) __riscv_vlseg6e8ff_v_u8mf2(__VA_ARGS__)
+#define vlseg7e8ff_v_u8mf2(...) __riscv_vlseg7e8ff_v_u8mf2(__VA_ARGS__)
+#define vlseg8e8ff_v_u8mf2(...) __riscv_vlseg8e8ff_v_u8mf2(__VA_ARGS__)
+#define vlseg2e8ff_v_u8m1(...) __riscv_vlseg2e8ff_v_u8m1(__VA_ARGS__)
+#define vlseg3e8ff_v_u8m1(...) __riscv_vlseg3e8ff_v_u8m1(__VA_ARGS__)
+#define vlseg4e8ff_v_u8m1(...) __riscv_vlseg4e8ff_v_u8m1(__VA_ARGS__)
+#define vlseg5e8ff_v_u8m1(...) __riscv_vlseg5e8ff_v_u8m1(__VA_ARGS__)
+#define vlseg6e8ff_v_u8m1(...) __riscv_vlseg6e8ff_v_u8m1(__VA_ARGS__)
+#define vlseg7e8ff_v_u8m1(...) __riscv_vlseg7e8ff_v_u8m1(__VA_ARGS__)
+#define vlseg8e8ff_v_u8m1(...) __riscv_vlseg8e8ff_v_u8m1(__VA_ARGS__)
+#define vlseg2e8ff_v_u8m2(...) __riscv_vlseg2e8ff_v_u8m2(__VA_ARGS__)
+#define vlseg3e8ff_v_u8m2(...) __riscv_vlseg3e8ff_v_u8m2(__VA_ARGS__)
+#define vlseg4e8ff_v_u8m2(...) __riscv_vlseg4e8ff_v_u8m2(__VA_ARGS__)
+#define vlseg2e8ff_v_u8m4(...) __riscv_vlseg2e8ff_v_u8m4(__VA_ARGS__)
+#define vlseg2e16ff_v_u16mf4(...) __riscv_vlseg2e16ff_v_u16mf4(__VA_ARGS__) // u16 "ff" aliases: unprefixed name -> same name with a __riscv_ prefix, arguments forwarded unchanged (mf4, mf2, m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e16ff_v_u16mf4(...) __riscv_vlseg3e16ff_v_u16mf4(__VA_ARGS__)
+#define vlseg4e16ff_v_u16mf4(...) __riscv_vlseg4e16ff_v_u16mf4(__VA_ARGS__)
+#define vlseg5e16ff_v_u16mf4(...) __riscv_vlseg5e16ff_v_u16mf4(__VA_ARGS__)
+#define vlseg6e16ff_v_u16mf4(...) __riscv_vlseg6e16ff_v_u16mf4(__VA_ARGS__)
+#define vlseg7e16ff_v_u16mf4(...) __riscv_vlseg7e16ff_v_u16mf4(__VA_ARGS__)
+#define vlseg8e16ff_v_u16mf4(...) __riscv_vlseg8e16ff_v_u16mf4(__VA_ARGS__)
+#define vlseg2e16ff_v_u16mf2(...) __riscv_vlseg2e16ff_v_u16mf2(__VA_ARGS__)
+#define vlseg3e16ff_v_u16mf2(...) __riscv_vlseg3e16ff_v_u16mf2(__VA_ARGS__)
+#define vlseg4e16ff_v_u16mf2(...) __riscv_vlseg4e16ff_v_u16mf2(__VA_ARGS__)
+#define vlseg5e16ff_v_u16mf2(...) __riscv_vlseg5e16ff_v_u16mf2(__VA_ARGS__)
+#define vlseg6e16ff_v_u16mf2(...) __riscv_vlseg6e16ff_v_u16mf2(__VA_ARGS__)
+#define vlseg7e16ff_v_u16mf2(...) __riscv_vlseg7e16ff_v_u16mf2(__VA_ARGS__)
+#define vlseg8e16ff_v_u16mf2(...) __riscv_vlseg8e16ff_v_u16mf2(__VA_ARGS__)
+#define vlseg2e16ff_v_u16m1(...) __riscv_vlseg2e16ff_v_u16m1(__VA_ARGS__)
+#define vlseg3e16ff_v_u16m1(...) __riscv_vlseg3e16ff_v_u16m1(__VA_ARGS__)
+#define vlseg4e16ff_v_u16m1(...) __riscv_vlseg4e16ff_v_u16m1(__VA_ARGS__)
+#define vlseg5e16ff_v_u16m1(...) __riscv_vlseg5e16ff_v_u16m1(__VA_ARGS__)
+#define vlseg6e16ff_v_u16m1(...) __riscv_vlseg6e16ff_v_u16m1(__VA_ARGS__)
+#define vlseg7e16ff_v_u16m1(...) __riscv_vlseg7e16ff_v_u16m1(__VA_ARGS__)
+#define vlseg8e16ff_v_u16m1(...) __riscv_vlseg8e16ff_v_u16m1(__VA_ARGS__)
+#define vlseg2e16ff_v_u16m2(...) __riscv_vlseg2e16ff_v_u16m2(__VA_ARGS__)
+#define vlseg3e16ff_v_u16m2(...) __riscv_vlseg3e16ff_v_u16m2(__VA_ARGS__)
+#define vlseg4e16ff_v_u16m2(...) __riscv_vlseg4e16ff_v_u16m2(__VA_ARGS__)
+#define vlseg2e16ff_v_u16m4(...) __riscv_vlseg2e16ff_v_u16m4(__VA_ARGS__)
+#define vlseg2e32ff_v_u32mf2(...) __riscv_vlseg2e32ff_v_u32mf2(__VA_ARGS__) // u32 "ff" aliases: unprefixed name -> same name with a __riscv_ prefix, arguments forwarded unchanged (mf2, m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e32ff_v_u32mf2(...) __riscv_vlseg3e32ff_v_u32mf2(__VA_ARGS__)
+#define vlseg4e32ff_v_u32mf2(...) __riscv_vlseg4e32ff_v_u32mf2(__VA_ARGS__)
+#define vlseg5e32ff_v_u32mf2(...) __riscv_vlseg5e32ff_v_u32mf2(__VA_ARGS__)
+#define vlseg6e32ff_v_u32mf2(...) __riscv_vlseg6e32ff_v_u32mf2(__VA_ARGS__)
+#define vlseg7e32ff_v_u32mf2(...) __riscv_vlseg7e32ff_v_u32mf2(__VA_ARGS__)
+#define vlseg8e32ff_v_u32mf2(...) __riscv_vlseg8e32ff_v_u32mf2(__VA_ARGS__)
+#define vlseg2e32ff_v_u32m1(...) __riscv_vlseg2e32ff_v_u32m1(__VA_ARGS__)
+#define vlseg3e32ff_v_u32m1(...) __riscv_vlseg3e32ff_v_u32m1(__VA_ARGS__)
+#define vlseg4e32ff_v_u32m1(...) __riscv_vlseg4e32ff_v_u32m1(__VA_ARGS__)
+#define vlseg5e32ff_v_u32m1(...) __riscv_vlseg5e32ff_v_u32m1(__VA_ARGS__)
+#define vlseg6e32ff_v_u32m1(...) __riscv_vlseg6e32ff_v_u32m1(__VA_ARGS__)
+#define vlseg7e32ff_v_u32m1(...) __riscv_vlseg7e32ff_v_u32m1(__VA_ARGS__)
+#define vlseg8e32ff_v_u32m1(...) __riscv_vlseg8e32ff_v_u32m1(__VA_ARGS__)
+#define vlseg2e32ff_v_u32m2(...) __riscv_vlseg2e32ff_v_u32m2(__VA_ARGS__)
+#define vlseg3e32ff_v_u32m2(...) __riscv_vlseg3e32ff_v_u32m2(__VA_ARGS__)
+#define vlseg4e32ff_v_u32m2(...) __riscv_vlseg4e32ff_v_u32m2(__VA_ARGS__)
+#define vlseg2e32ff_v_u32m4(...) __riscv_vlseg2e32ff_v_u32m4(__VA_ARGS__)
+#define vlseg2e64ff_v_u64m1(...) __riscv_vlseg2e64ff_v_u64m1(__VA_ARGS__) // u64 "ff" aliases: unprefixed name -> same name with a __riscv_ prefix, arguments forwarded unchanged (m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e64ff_v_u64m1(...) __riscv_vlseg3e64ff_v_u64m1(__VA_ARGS__)
+#define vlseg4e64ff_v_u64m1(...) __riscv_vlseg4e64ff_v_u64m1(__VA_ARGS__)
+#define vlseg5e64ff_v_u64m1(...) __riscv_vlseg5e64ff_v_u64m1(__VA_ARGS__)
+#define vlseg6e64ff_v_u64m1(...) __riscv_vlseg6e64ff_v_u64m1(__VA_ARGS__)
+#define vlseg7e64ff_v_u64m1(...) __riscv_vlseg7e64ff_v_u64m1(__VA_ARGS__)
+#define vlseg8e64ff_v_u64m1(...) __riscv_vlseg8e64ff_v_u64m1(__VA_ARGS__)
+#define vlseg2e64ff_v_u64m2(...) __riscv_vlseg2e64ff_v_u64m2(__VA_ARGS__)
+#define vlseg3e64ff_v_u64m2(...) __riscv_vlseg3e64ff_v_u64m2(__VA_ARGS__)
+#define vlseg4e64ff_v_u64m2(...) __riscv_vlseg4e64ff_v_u64m2(__VA_ARGS__)
+#define vlseg2e64ff_v_u64m4(...) __riscv_vlseg2e64ff_v_u64m4(__VA_ARGS__)
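+// masked functions: each unprefixed *_m name forwards its arguments to the __riscv_-prefixed *_tumu intrinsic with the same base name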
+#define vlseg2e16_v_f16mf4_m(...) __riscv_vlseg2e16_v_f16mf4_tumu(__VA_ARGS__) // f16 masked aliases: each *_m name expands to the __riscv_-prefixed *_tumu name, arguments forwarded unchanged (mf4, mf2, m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e16_v_f16mf4_m(...) __riscv_vlseg3e16_v_f16mf4_tumu(__VA_ARGS__) // NOTE(review): _tumu presumably selects the tail-undisturbed, mask-undisturbed policy form - confirm it matches what callers of the *_m names expect
+#define vlseg4e16_v_f16mf4_m(...) __riscv_vlseg4e16_v_f16mf4_tumu(__VA_ARGS__)
+#define vlseg5e16_v_f16mf4_m(...) __riscv_vlseg5e16_v_f16mf4_tumu(__VA_ARGS__)
+#define vlseg6e16_v_f16mf4_m(...) __riscv_vlseg6e16_v_f16mf4_tumu(__VA_ARGS__)
+#define vlseg7e16_v_f16mf4_m(...) __riscv_vlseg7e16_v_f16mf4_tumu(__VA_ARGS__)
+#define vlseg8e16_v_f16mf4_m(...) __riscv_vlseg8e16_v_f16mf4_tumu(__VA_ARGS__)
+#define vlseg2e16_v_f16mf2_m(...) __riscv_vlseg2e16_v_f16mf2_tumu(__VA_ARGS__)
+#define vlseg3e16_v_f16mf2_m(...) __riscv_vlseg3e16_v_f16mf2_tumu(__VA_ARGS__)
+#define vlseg4e16_v_f16mf2_m(...) __riscv_vlseg4e16_v_f16mf2_tumu(__VA_ARGS__)
+#define vlseg5e16_v_f16mf2_m(...) __riscv_vlseg5e16_v_f16mf2_tumu(__VA_ARGS__)
+#define vlseg6e16_v_f16mf2_m(...) __riscv_vlseg6e16_v_f16mf2_tumu(__VA_ARGS__)
+#define vlseg7e16_v_f16mf2_m(...) __riscv_vlseg7e16_v_f16mf2_tumu(__VA_ARGS__)
+#define vlseg8e16_v_f16mf2_m(...) __riscv_vlseg8e16_v_f16mf2_tumu(__VA_ARGS__)
+#define vlseg2e16_v_f16m1_m(...) __riscv_vlseg2e16_v_f16m1_tumu(__VA_ARGS__)
+#define vlseg3e16_v_f16m1_m(...) __riscv_vlseg3e16_v_f16m1_tumu(__VA_ARGS__)
+#define vlseg4e16_v_f16m1_m(...) __riscv_vlseg4e16_v_f16m1_tumu(__VA_ARGS__)
+#define vlseg5e16_v_f16m1_m(...) __riscv_vlseg5e16_v_f16m1_tumu(__VA_ARGS__)
+#define vlseg6e16_v_f16m1_m(...) __riscv_vlseg6e16_v_f16m1_tumu(__VA_ARGS__)
+#define vlseg7e16_v_f16m1_m(...) __riscv_vlseg7e16_v_f16m1_tumu(__VA_ARGS__)
+#define vlseg8e16_v_f16m1_m(...) __riscv_vlseg8e16_v_f16m1_tumu(__VA_ARGS__)
+#define vlseg2e16_v_f16m2_m(...) __riscv_vlseg2e16_v_f16m2_tumu(__VA_ARGS__)
+#define vlseg3e16_v_f16m2_m(...) __riscv_vlseg3e16_v_f16m2_tumu(__VA_ARGS__)
+#define vlseg4e16_v_f16m2_m(...) __riscv_vlseg4e16_v_f16m2_tumu(__VA_ARGS__)
+#define vlseg2e16_v_f16m4_m(...) __riscv_vlseg2e16_v_f16m4_tumu(__VA_ARGS__)
+#define vlseg2e32_v_f32mf2_m(...) __riscv_vlseg2e32_v_f32mf2_tumu(__VA_ARGS__) // f32 masked aliases: each *_m name expands to the __riscv_-prefixed *_tumu name, arguments forwarded unchanged (mf2, m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e32_v_f32mf2_m(...) __riscv_vlseg3e32_v_f32mf2_tumu(__VA_ARGS__)
+#define vlseg4e32_v_f32mf2_m(...) __riscv_vlseg4e32_v_f32mf2_tumu(__VA_ARGS__)
+#define vlseg5e32_v_f32mf2_m(...) __riscv_vlseg5e32_v_f32mf2_tumu(__VA_ARGS__)
+#define vlseg6e32_v_f32mf2_m(...) __riscv_vlseg6e32_v_f32mf2_tumu(__VA_ARGS__)
+#define vlseg7e32_v_f32mf2_m(...) __riscv_vlseg7e32_v_f32mf2_tumu(__VA_ARGS__)
+#define vlseg8e32_v_f32mf2_m(...) __riscv_vlseg8e32_v_f32mf2_tumu(__VA_ARGS__)
+#define vlseg2e32_v_f32m1_m(...) __riscv_vlseg2e32_v_f32m1_tumu(__VA_ARGS__)
+#define vlseg3e32_v_f32m1_m(...) __riscv_vlseg3e32_v_f32m1_tumu(__VA_ARGS__)
+#define vlseg4e32_v_f32m1_m(...) __riscv_vlseg4e32_v_f32m1_tumu(__VA_ARGS__)
+#define vlseg5e32_v_f32m1_m(...) __riscv_vlseg5e32_v_f32m1_tumu(__VA_ARGS__)
+#define vlseg6e32_v_f32m1_m(...) __riscv_vlseg6e32_v_f32m1_tumu(__VA_ARGS__)
+#define vlseg7e32_v_f32m1_m(...) __riscv_vlseg7e32_v_f32m1_tumu(__VA_ARGS__)
+#define vlseg8e32_v_f32m1_m(...) __riscv_vlseg8e32_v_f32m1_tumu(__VA_ARGS__)
+#define vlseg2e32_v_f32m2_m(...) __riscv_vlseg2e32_v_f32m2_tumu(__VA_ARGS__)
+#define vlseg3e32_v_f32m2_m(...) __riscv_vlseg3e32_v_f32m2_tumu(__VA_ARGS__)
+#define vlseg4e32_v_f32m2_m(...) __riscv_vlseg4e32_v_f32m2_tumu(__VA_ARGS__)
+#define vlseg2e32_v_f32m4_m(...) __riscv_vlseg2e32_v_f32m4_tumu(__VA_ARGS__)
+#define vlseg2e64_v_f64m1_m(...) __riscv_vlseg2e64_v_f64m1_tumu(__VA_ARGS__) // f64 masked aliases: each *_m name expands to the __riscv_-prefixed *_tumu name, arguments forwarded unchanged (m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e64_v_f64m1_m(...) __riscv_vlseg3e64_v_f64m1_tumu(__VA_ARGS__)
+#define vlseg4e64_v_f64m1_m(...) __riscv_vlseg4e64_v_f64m1_tumu(__VA_ARGS__)
+#define vlseg5e64_v_f64m1_m(...) __riscv_vlseg5e64_v_f64m1_tumu(__VA_ARGS__)
+#define vlseg6e64_v_f64m1_m(...) __riscv_vlseg6e64_v_f64m1_tumu(__VA_ARGS__)
+#define vlseg7e64_v_f64m1_m(...) __riscv_vlseg7e64_v_f64m1_tumu(__VA_ARGS__)
+#define vlseg8e64_v_f64m1_m(...) __riscv_vlseg8e64_v_f64m1_tumu(__VA_ARGS__)
+#define vlseg2e64_v_f64m2_m(...) __riscv_vlseg2e64_v_f64m2_tumu(__VA_ARGS__)
+#define vlseg3e64_v_f64m2_m(...) __riscv_vlseg3e64_v_f64m2_tumu(__VA_ARGS__)
+#define vlseg4e64_v_f64m2_m(...) __riscv_vlseg4e64_v_f64m2_tumu(__VA_ARGS__)
+#define vlseg2e64_v_f64m4_m(...) __riscv_vlseg2e64_v_f64m4_tumu(__VA_ARGS__)
+#define vlseg2e16ff_v_f16mf4_m(...) __riscv_vlseg2e16ff_v_f16mf4_tumu(__VA_ARGS__) // f16 masked "ff" aliases: each *_m name expands to the __riscv_-prefixed *_tumu name, arguments forwarded unchanged (mf4, mf2, m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e16ff_v_f16mf4_m(...) __riscv_vlseg3e16ff_v_f16mf4_tumu(__VA_ARGS__) // NOTE(review): "ff" presumably marks the fault-only-first load forms - confirm against the RVV intrinsics spec
+#define vlseg4e16ff_v_f16mf4_m(...) __riscv_vlseg4e16ff_v_f16mf4_tumu(__VA_ARGS__)
+#define vlseg5e16ff_v_f16mf4_m(...) __riscv_vlseg5e16ff_v_f16mf4_tumu(__VA_ARGS__)
+#define vlseg6e16ff_v_f16mf4_m(...) __riscv_vlseg6e16ff_v_f16mf4_tumu(__VA_ARGS__)
+#define vlseg7e16ff_v_f16mf4_m(...) __riscv_vlseg7e16ff_v_f16mf4_tumu(__VA_ARGS__)
+#define vlseg8e16ff_v_f16mf4_m(...) __riscv_vlseg8e16ff_v_f16mf4_tumu(__VA_ARGS__)
+#define vlseg2e16ff_v_f16mf2_m(...) __riscv_vlseg2e16ff_v_f16mf2_tumu(__VA_ARGS__)
+#define vlseg3e16ff_v_f16mf2_m(...) __riscv_vlseg3e16ff_v_f16mf2_tumu(__VA_ARGS__)
+#define vlseg4e16ff_v_f16mf2_m(...) __riscv_vlseg4e16ff_v_f16mf2_tumu(__VA_ARGS__)
+#define vlseg5e16ff_v_f16mf2_m(...) __riscv_vlseg5e16ff_v_f16mf2_tumu(__VA_ARGS__)
+#define vlseg6e16ff_v_f16mf2_m(...) __riscv_vlseg6e16ff_v_f16mf2_tumu(__VA_ARGS__)
+#define vlseg7e16ff_v_f16mf2_m(...) __riscv_vlseg7e16ff_v_f16mf2_tumu(__VA_ARGS__)
+#define vlseg8e16ff_v_f16mf2_m(...) __riscv_vlseg8e16ff_v_f16mf2_tumu(__VA_ARGS__)
+#define vlseg2e16ff_v_f16m1_m(...) __riscv_vlseg2e16ff_v_f16m1_tumu(__VA_ARGS__)
+#define vlseg3e16ff_v_f16m1_m(...) __riscv_vlseg3e16ff_v_f16m1_tumu(__VA_ARGS__)
+#define vlseg4e16ff_v_f16m1_m(...) __riscv_vlseg4e16ff_v_f16m1_tumu(__VA_ARGS__)
+#define vlseg5e16ff_v_f16m1_m(...) __riscv_vlseg5e16ff_v_f16m1_tumu(__VA_ARGS__)
+#define vlseg6e16ff_v_f16m1_m(...) __riscv_vlseg6e16ff_v_f16m1_tumu(__VA_ARGS__)
+#define vlseg7e16ff_v_f16m1_m(...) __riscv_vlseg7e16ff_v_f16m1_tumu(__VA_ARGS__)
+#define vlseg8e16ff_v_f16m1_m(...) __riscv_vlseg8e16ff_v_f16m1_tumu(__VA_ARGS__)
+#define vlseg2e16ff_v_f16m2_m(...) __riscv_vlseg2e16ff_v_f16m2_tumu(__VA_ARGS__)
+#define vlseg3e16ff_v_f16m2_m(...) __riscv_vlseg3e16ff_v_f16m2_tumu(__VA_ARGS__)
+#define vlseg4e16ff_v_f16m2_m(...) __riscv_vlseg4e16ff_v_f16m2_tumu(__VA_ARGS__)
+#define vlseg2e16ff_v_f16m4_m(...) __riscv_vlseg2e16ff_v_f16m4_tumu(__VA_ARGS__)
+#define vlseg2e32ff_v_f32mf2_m(...) __riscv_vlseg2e32ff_v_f32mf2_tumu(__VA_ARGS__) // f32 masked "ff" aliases: each *_m name expands to the __riscv_-prefixed *_tumu name, arguments forwarded unchanged (mf2, m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e32ff_v_f32mf2_m(...) __riscv_vlseg3e32ff_v_f32mf2_tumu(__VA_ARGS__)
+#define vlseg4e32ff_v_f32mf2_m(...) __riscv_vlseg4e32ff_v_f32mf2_tumu(__VA_ARGS__)
+#define vlseg5e32ff_v_f32mf2_m(...) __riscv_vlseg5e32ff_v_f32mf2_tumu(__VA_ARGS__)
+#define vlseg6e32ff_v_f32mf2_m(...) __riscv_vlseg6e32ff_v_f32mf2_tumu(__VA_ARGS__)
+#define vlseg7e32ff_v_f32mf2_m(...) __riscv_vlseg7e32ff_v_f32mf2_tumu(__VA_ARGS__)
+#define vlseg8e32ff_v_f32mf2_m(...) __riscv_vlseg8e32ff_v_f32mf2_tumu(__VA_ARGS__)
+#define vlseg2e32ff_v_f32m1_m(...) __riscv_vlseg2e32ff_v_f32m1_tumu(__VA_ARGS__)
+#define vlseg3e32ff_v_f32m1_m(...) __riscv_vlseg3e32ff_v_f32m1_tumu(__VA_ARGS__)
+#define vlseg4e32ff_v_f32m1_m(...) __riscv_vlseg4e32ff_v_f32m1_tumu(__VA_ARGS__)
+#define vlseg5e32ff_v_f32m1_m(...) __riscv_vlseg5e32ff_v_f32m1_tumu(__VA_ARGS__)
+#define vlseg6e32ff_v_f32m1_m(...) __riscv_vlseg6e32ff_v_f32m1_tumu(__VA_ARGS__)
+#define vlseg7e32ff_v_f32m1_m(...) __riscv_vlseg7e32ff_v_f32m1_tumu(__VA_ARGS__)
+#define vlseg8e32ff_v_f32m1_m(...) __riscv_vlseg8e32ff_v_f32m1_tumu(__VA_ARGS__)
+#define vlseg2e32ff_v_f32m2_m(...) __riscv_vlseg2e32ff_v_f32m2_tumu(__VA_ARGS__)
+#define vlseg3e32ff_v_f32m2_m(...) __riscv_vlseg3e32ff_v_f32m2_tumu(__VA_ARGS__)
+#define vlseg4e32ff_v_f32m2_m(...) __riscv_vlseg4e32ff_v_f32m2_tumu(__VA_ARGS__)
+#define vlseg2e32ff_v_f32m4_m(...) __riscv_vlseg2e32ff_v_f32m4_tumu(__VA_ARGS__)
+#define vlseg2e64ff_v_f64m1_m(...) __riscv_vlseg2e64ff_v_f64m1_tumu(__VA_ARGS__) // f64 masked "ff" aliases: each *_m name expands to the __riscv_-prefixed *_tumu name, arguments forwarded unchanged (m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e64ff_v_f64m1_m(...) __riscv_vlseg3e64ff_v_f64m1_tumu(__VA_ARGS__)
+#define vlseg4e64ff_v_f64m1_m(...) __riscv_vlseg4e64ff_v_f64m1_tumu(__VA_ARGS__)
+#define vlseg5e64ff_v_f64m1_m(...) __riscv_vlseg5e64ff_v_f64m1_tumu(__VA_ARGS__)
+#define vlseg6e64ff_v_f64m1_m(...) __riscv_vlseg6e64ff_v_f64m1_tumu(__VA_ARGS__)
+#define vlseg7e64ff_v_f64m1_m(...) __riscv_vlseg7e64ff_v_f64m1_tumu(__VA_ARGS__)
+#define vlseg8e64ff_v_f64m1_m(...) __riscv_vlseg8e64ff_v_f64m1_tumu(__VA_ARGS__)
+#define vlseg2e64ff_v_f64m2_m(...) __riscv_vlseg2e64ff_v_f64m2_tumu(__VA_ARGS__)
+#define vlseg3e64ff_v_f64m2_m(...) __riscv_vlseg3e64ff_v_f64m2_tumu(__VA_ARGS__)
+#define vlseg4e64ff_v_f64m2_m(...) __riscv_vlseg4e64ff_v_f64m2_tumu(__VA_ARGS__)
+#define vlseg2e64ff_v_f64m4_m(...) __riscv_vlseg2e64ff_v_f64m4_tumu(__VA_ARGS__)
+#define vlseg2e8_v_i8mf8_m(...) __riscv_vlseg2e8_v_i8mf8_tumu(__VA_ARGS__) // i8 masked aliases: each *_m name expands to the __riscv_-prefixed *_tumu name, arguments forwarded unchanged (mf8, mf4, mf2, m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e8_v_i8mf8_m(...) __riscv_vlseg3e8_v_i8mf8_tumu(__VA_ARGS__)
+#define vlseg4e8_v_i8mf8_m(...) __riscv_vlseg4e8_v_i8mf8_tumu(__VA_ARGS__)
+#define vlseg5e8_v_i8mf8_m(...) __riscv_vlseg5e8_v_i8mf8_tumu(__VA_ARGS__)
+#define vlseg6e8_v_i8mf8_m(...) __riscv_vlseg6e8_v_i8mf8_tumu(__VA_ARGS__)
+#define vlseg7e8_v_i8mf8_m(...) __riscv_vlseg7e8_v_i8mf8_tumu(__VA_ARGS__)
+#define vlseg8e8_v_i8mf8_m(...) __riscv_vlseg8e8_v_i8mf8_tumu(__VA_ARGS__)
+#define vlseg2e8_v_i8mf4_m(...) __riscv_vlseg2e8_v_i8mf4_tumu(__VA_ARGS__)
+#define vlseg3e8_v_i8mf4_m(...) __riscv_vlseg3e8_v_i8mf4_tumu(__VA_ARGS__)
+#define vlseg4e8_v_i8mf4_m(...) __riscv_vlseg4e8_v_i8mf4_tumu(__VA_ARGS__)
+#define vlseg5e8_v_i8mf4_m(...) __riscv_vlseg5e8_v_i8mf4_tumu(__VA_ARGS__)
+#define vlseg6e8_v_i8mf4_m(...) __riscv_vlseg6e8_v_i8mf4_tumu(__VA_ARGS__)
+#define vlseg7e8_v_i8mf4_m(...) __riscv_vlseg7e8_v_i8mf4_tumu(__VA_ARGS__)
+#define vlseg8e8_v_i8mf4_m(...) __riscv_vlseg8e8_v_i8mf4_tumu(__VA_ARGS__)
+#define vlseg2e8_v_i8mf2_m(...) __riscv_vlseg2e8_v_i8mf2_tumu(__VA_ARGS__)
+#define vlseg3e8_v_i8mf2_m(...) __riscv_vlseg3e8_v_i8mf2_tumu(__VA_ARGS__)
+#define vlseg4e8_v_i8mf2_m(...) __riscv_vlseg4e8_v_i8mf2_tumu(__VA_ARGS__)
+#define vlseg5e8_v_i8mf2_m(...) __riscv_vlseg5e8_v_i8mf2_tumu(__VA_ARGS__)
+#define vlseg6e8_v_i8mf2_m(...) __riscv_vlseg6e8_v_i8mf2_tumu(__VA_ARGS__)
+#define vlseg7e8_v_i8mf2_m(...) __riscv_vlseg7e8_v_i8mf2_tumu(__VA_ARGS__)
+#define vlseg8e8_v_i8mf2_m(...) __riscv_vlseg8e8_v_i8mf2_tumu(__VA_ARGS__)
+#define vlseg2e8_v_i8m1_m(...) __riscv_vlseg2e8_v_i8m1_tumu(__VA_ARGS__)
+#define vlseg3e8_v_i8m1_m(...) __riscv_vlseg3e8_v_i8m1_tumu(__VA_ARGS__)
+#define vlseg4e8_v_i8m1_m(...) __riscv_vlseg4e8_v_i8m1_tumu(__VA_ARGS__)
+#define vlseg5e8_v_i8m1_m(...) __riscv_vlseg5e8_v_i8m1_tumu(__VA_ARGS__)
+#define vlseg6e8_v_i8m1_m(...) __riscv_vlseg6e8_v_i8m1_tumu(__VA_ARGS__)
+#define vlseg7e8_v_i8m1_m(...) __riscv_vlseg7e8_v_i8m1_tumu(__VA_ARGS__)
+#define vlseg8e8_v_i8m1_m(...) __riscv_vlseg8e8_v_i8m1_tumu(__VA_ARGS__)
+#define vlseg2e8_v_i8m2_m(...) __riscv_vlseg2e8_v_i8m2_tumu(__VA_ARGS__)
+#define vlseg3e8_v_i8m2_m(...) __riscv_vlseg3e8_v_i8m2_tumu(__VA_ARGS__)
+#define vlseg4e8_v_i8m2_m(...) __riscv_vlseg4e8_v_i8m2_tumu(__VA_ARGS__)
+#define vlseg2e8_v_i8m4_m(...) __riscv_vlseg2e8_v_i8m4_tumu(__VA_ARGS__)
+#define vlseg2e16_v_i16mf4_m(...) __riscv_vlseg2e16_v_i16mf4_tumu(__VA_ARGS__) // i16 masked aliases: each *_m name expands to the __riscv_-prefixed *_tumu name, arguments forwarded unchanged (mf4, mf2, m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e16_v_i16mf4_m(...) __riscv_vlseg3e16_v_i16mf4_tumu(__VA_ARGS__)
+#define vlseg4e16_v_i16mf4_m(...) __riscv_vlseg4e16_v_i16mf4_tumu(__VA_ARGS__)
+#define vlseg5e16_v_i16mf4_m(...) __riscv_vlseg5e16_v_i16mf4_tumu(__VA_ARGS__)
+#define vlseg6e16_v_i16mf4_m(...) __riscv_vlseg6e16_v_i16mf4_tumu(__VA_ARGS__)
+#define vlseg7e16_v_i16mf4_m(...) __riscv_vlseg7e16_v_i16mf4_tumu(__VA_ARGS__)
+#define vlseg8e16_v_i16mf4_m(...) __riscv_vlseg8e16_v_i16mf4_tumu(__VA_ARGS__)
+#define vlseg2e16_v_i16mf2_m(...) __riscv_vlseg2e16_v_i16mf2_tumu(__VA_ARGS__)
+#define vlseg3e16_v_i16mf2_m(...) __riscv_vlseg3e16_v_i16mf2_tumu(__VA_ARGS__)
+#define vlseg4e16_v_i16mf2_m(...) __riscv_vlseg4e16_v_i16mf2_tumu(__VA_ARGS__)
+#define vlseg5e16_v_i16mf2_m(...) __riscv_vlseg5e16_v_i16mf2_tumu(__VA_ARGS__)
+#define vlseg6e16_v_i16mf2_m(...) __riscv_vlseg6e16_v_i16mf2_tumu(__VA_ARGS__)
+#define vlseg7e16_v_i16mf2_m(...) __riscv_vlseg7e16_v_i16mf2_tumu(__VA_ARGS__)
+#define vlseg8e16_v_i16mf2_m(...) __riscv_vlseg8e16_v_i16mf2_tumu(__VA_ARGS__)
+#define vlseg2e16_v_i16m1_m(...) __riscv_vlseg2e16_v_i16m1_tumu(__VA_ARGS__)
+#define vlseg3e16_v_i16m1_m(...) __riscv_vlseg3e16_v_i16m1_tumu(__VA_ARGS__)
+#define vlseg4e16_v_i16m1_m(...) __riscv_vlseg4e16_v_i16m1_tumu(__VA_ARGS__)
+#define vlseg5e16_v_i16m1_m(...) __riscv_vlseg5e16_v_i16m1_tumu(__VA_ARGS__)
+#define vlseg6e16_v_i16m1_m(...) __riscv_vlseg6e16_v_i16m1_tumu(__VA_ARGS__)
+#define vlseg7e16_v_i16m1_m(...) __riscv_vlseg7e16_v_i16m1_tumu(__VA_ARGS__)
+#define vlseg8e16_v_i16m1_m(...) __riscv_vlseg8e16_v_i16m1_tumu(__VA_ARGS__)
+#define vlseg2e16_v_i16m2_m(...) __riscv_vlseg2e16_v_i16m2_tumu(__VA_ARGS__)
+#define vlseg3e16_v_i16m2_m(...) __riscv_vlseg3e16_v_i16m2_tumu(__VA_ARGS__)
+#define vlseg4e16_v_i16m2_m(...) __riscv_vlseg4e16_v_i16m2_tumu(__VA_ARGS__)
+#define vlseg2e16_v_i16m4_m(...) __riscv_vlseg2e16_v_i16m4_tumu(__VA_ARGS__)
+#define vlseg2e32_v_i32mf2_m(...) __riscv_vlseg2e32_v_i32mf2_tumu(__VA_ARGS__) // i32 masked aliases: each *_m name expands to the __riscv_-prefixed *_tumu name, arguments forwarded unchanged (mf2, m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e32_v_i32mf2_m(...) __riscv_vlseg3e32_v_i32mf2_tumu(__VA_ARGS__)
+#define vlseg4e32_v_i32mf2_m(...) __riscv_vlseg4e32_v_i32mf2_tumu(__VA_ARGS__)
+#define vlseg5e32_v_i32mf2_m(...) __riscv_vlseg5e32_v_i32mf2_tumu(__VA_ARGS__)
+#define vlseg6e32_v_i32mf2_m(...) __riscv_vlseg6e32_v_i32mf2_tumu(__VA_ARGS__)
+#define vlseg7e32_v_i32mf2_m(...) __riscv_vlseg7e32_v_i32mf2_tumu(__VA_ARGS__)
+#define vlseg8e32_v_i32mf2_m(...) __riscv_vlseg8e32_v_i32mf2_tumu(__VA_ARGS__)
+#define vlseg2e32_v_i32m1_m(...) __riscv_vlseg2e32_v_i32m1_tumu(__VA_ARGS__)
+#define vlseg3e32_v_i32m1_m(...) __riscv_vlseg3e32_v_i32m1_tumu(__VA_ARGS__)
+#define vlseg4e32_v_i32m1_m(...) __riscv_vlseg4e32_v_i32m1_tumu(__VA_ARGS__)
+#define vlseg5e32_v_i32m1_m(...) __riscv_vlseg5e32_v_i32m1_tumu(__VA_ARGS__)
+#define vlseg6e32_v_i32m1_m(...) __riscv_vlseg6e32_v_i32m1_tumu(__VA_ARGS__)
+#define vlseg7e32_v_i32m1_m(...) __riscv_vlseg7e32_v_i32m1_tumu(__VA_ARGS__)
+#define vlseg8e32_v_i32m1_m(...) __riscv_vlseg8e32_v_i32m1_tumu(__VA_ARGS__)
+#define vlseg2e32_v_i32m2_m(...) __riscv_vlseg2e32_v_i32m2_tumu(__VA_ARGS__)
+#define vlseg3e32_v_i32m2_m(...) __riscv_vlseg3e32_v_i32m2_tumu(__VA_ARGS__)
+#define vlseg4e32_v_i32m2_m(...) __riscv_vlseg4e32_v_i32m2_tumu(__VA_ARGS__)
+#define vlseg2e32_v_i32m4_m(...) __riscv_vlseg2e32_v_i32m4_tumu(__VA_ARGS__)
+#define vlseg2e64_v_i64m1_m(...) __riscv_vlseg2e64_v_i64m1_tumu(__VA_ARGS__) // i64 masked aliases: each *_m name expands to the __riscv_-prefixed *_tumu name, arguments forwarded unchanged (m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e64_v_i64m1_m(...) __riscv_vlseg3e64_v_i64m1_tumu(__VA_ARGS__)
+#define vlseg4e64_v_i64m1_m(...) __riscv_vlseg4e64_v_i64m1_tumu(__VA_ARGS__)
+#define vlseg5e64_v_i64m1_m(...) __riscv_vlseg5e64_v_i64m1_tumu(__VA_ARGS__)
+#define vlseg6e64_v_i64m1_m(...) __riscv_vlseg6e64_v_i64m1_tumu(__VA_ARGS__)
+#define vlseg7e64_v_i64m1_m(...) __riscv_vlseg7e64_v_i64m1_tumu(__VA_ARGS__)
+#define vlseg8e64_v_i64m1_m(...) __riscv_vlseg8e64_v_i64m1_tumu(__VA_ARGS__)
+#define vlseg2e64_v_i64m2_m(...) __riscv_vlseg2e64_v_i64m2_tumu(__VA_ARGS__)
+#define vlseg3e64_v_i64m2_m(...) __riscv_vlseg3e64_v_i64m2_tumu(__VA_ARGS__)
+#define vlseg4e64_v_i64m2_m(...) __riscv_vlseg4e64_v_i64m2_tumu(__VA_ARGS__)
+#define vlseg2e64_v_i64m4_m(...) __riscv_vlseg2e64_v_i64m4_tumu(__VA_ARGS__)
+#define vlseg2e8ff_v_i8mf8_m(...) __riscv_vlseg2e8ff_v_i8mf8_tumu(__VA_ARGS__) // i8 masked "ff" aliases: each *_m name expands to the __riscv_-prefixed *_tumu name, arguments forwarded unchanged (mf8, mf4, mf2, m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e8ff_v_i8mf8_m(...) __riscv_vlseg3e8ff_v_i8mf8_tumu(__VA_ARGS__) // NOTE(review): "ff" presumably marks the fault-only-first load forms - confirm against the RVV intrinsics spec
+#define vlseg4e8ff_v_i8mf8_m(...) __riscv_vlseg4e8ff_v_i8mf8_tumu(__VA_ARGS__)
+#define vlseg5e8ff_v_i8mf8_m(...) __riscv_vlseg5e8ff_v_i8mf8_tumu(__VA_ARGS__)
+#define vlseg6e8ff_v_i8mf8_m(...) __riscv_vlseg6e8ff_v_i8mf8_tumu(__VA_ARGS__)
+#define vlseg7e8ff_v_i8mf8_m(...) __riscv_vlseg7e8ff_v_i8mf8_tumu(__VA_ARGS__)
+#define vlseg8e8ff_v_i8mf8_m(...) __riscv_vlseg8e8ff_v_i8mf8_tumu(__VA_ARGS__)
+#define vlseg2e8ff_v_i8mf4_m(...) __riscv_vlseg2e8ff_v_i8mf4_tumu(__VA_ARGS__)
+#define vlseg3e8ff_v_i8mf4_m(...) __riscv_vlseg3e8ff_v_i8mf4_tumu(__VA_ARGS__)
+#define vlseg4e8ff_v_i8mf4_m(...) __riscv_vlseg4e8ff_v_i8mf4_tumu(__VA_ARGS__)
+#define vlseg5e8ff_v_i8mf4_m(...) __riscv_vlseg5e8ff_v_i8mf4_tumu(__VA_ARGS__)
+#define vlseg6e8ff_v_i8mf4_m(...) __riscv_vlseg6e8ff_v_i8mf4_tumu(__VA_ARGS__)
+#define vlseg7e8ff_v_i8mf4_m(...) __riscv_vlseg7e8ff_v_i8mf4_tumu(__VA_ARGS__)
+#define vlseg8e8ff_v_i8mf4_m(...) __riscv_vlseg8e8ff_v_i8mf4_tumu(__VA_ARGS__)
+#define vlseg2e8ff_v_i8mf2_m(...) __riscv_vlseg2e8ff_v_i8mf2_tumu(__VA_ARGS__)
+#define vlseg3e8ff_v_i8mf2_m(...) __riscv_vlseg3e8ff_v_i8mf2_tumu(__VA_ARGS__)
+#define vlseg4e8ff_v_i8mf2_m(...) __riscv_vlseg4e8ff_v_i8mf2_tumu(__VA_ARGS__)
+#define vlseg5e8ff_v_i8mf2_m(...) __riscv_vlseg5e8ff_v_i8mf2_tumu(__VA_ARGS__)
+#define vlseg6e8ff_v_i8mf2_m(...) __riscv_vlseg6e8ff_v_i8mf2_tumu(__VA_ARGS__)
+#define vlseg7e8ff_v_i8mf2_m(...) __riscv_vlseg7e8ff_v_i8mf2_tumu(__VA_ARGS__)
+#define vlseg8e8ff_v_i8mf2_m(...) __riscv_vlseg8e8ff_v_i8mf2_tumu(__VA_ARGS__)
+#define vlseg2e8ff_v_i8m1_m(...) __riscv_vlseg2e8ff_v_i8m1_tumu(__VA_ARGS__)
+#define vlseg3e8ff_v_i8m1_m(...) __riscv_vlseg3e8ff_v_i8m1_tumu(__VA_ARGS__)
+#define vlseg4e8ff_v_i8m1_m(...) __riscv_vlseg4e8ff_v_i8m1_tumu(__VA_ARGS__)
+#define vlseg5e8ff_v_i8m1_m(...) __riscv_vlseg5e8ff_v_i8m1_tumu(__VA_ARGS__)
+#define vlseg6e8ff_v_i8m1_m(...) __riscv_vlseg6e8ff_v_i8m1_tumu(__VA_ARGS__)
+#define vlseg7e8ff_v_i8m1_m(...) __riscv_vlseg7e8ff_v_i8m1_tumu(__VA_ARGS__)
+#define vlseg8e8ff_v_i8m1_m(...) __riscv_vlseg8e8ff_v_i8m1_tumu(__VA_ARGS__)
+#define vlseg2e8ff_v_i8m2_m(...) __riscv_vlseg2e8ff_v_i8m2_tumu(__VA_ARGS__)
+#define vlseg3e8ff_v_i8m2_m(...) __riscv_vlseg3e8ff_v_i8m2_tumu(__VA_ARGS__)
+#define vlseg4e8ff_v_i8m2_m(...) __riscv_vlseg4e8ff_v_i8m2_tumu(__VA_ARGS__)
+#define vlseg2e8ff_v_i8m4_m(...) __riscv_vlseg2e8ff_v_i8m4_tumu(__VA_ARGS__)
+#define vlseg2e16ff_v_i16mf4_m(...) __riscv_vlseg2e16ff_v_i16mf4_tumu(__VA_ARGS__) // i16 masked "ff" aliases: each *_m name expands to the __riscv_-prefixed *_tumu name, arguments forwarded unchanged (mf4, mf2, m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e16ff_v_i16mf4_m(...) __riscv_vlseg3e16ff_v_i16mf4_tumu(__VA_ARGS__)
+#define vlseg4e16ff_v_i16mf4_m(...) __riscv_vlseg4e16ff_v_i16mf4_tumu(__VA_ARGS__)
+#define vlseg5e16ff_v_i16mf4_m(...) __riscv_vlseg5e16ff_v_i16mf4_tumu(__VA_ARGS__)
+#define vlseg6e16ff_v_i16mf4_m(...) __riscv_vlseg6e16ff_v_i16mf4_tumu(__VA_ARGS__)
+#define vlseg7e16ff_v_i16mf4_m(...) __riscv_vlseg7e16ff_v_i16mf4_tumu(__VA_ARGS__)
+#define vlseg8e16ff_v_i16mf4_m(...) __riscv_vlseg8e16ff_v_i16mf4_tumu(__VA_ARGS__)
+#define vlseg2e16ff_v_i16mf2_m(...) __riscv_vlseg2e16ff_v_i16mf2_tumu(__VA_ARGS__)
+#define vlseg3e16ff_v_i16mf2_m(...) __riscv_vlseg3e16ff_v_i16mf2_tumu(__VA_ARGS__)
+#define vlseg4e16ff_v_i16mf2_m(...) __riscv_vlseg4e16ff_v_i16mf2_tumu(__VA_ARGS__)
+#define vlseg5e16ff_v_i16mf2_m(...) __riscv_vlseg5e16ff_v_i16mf2_tumu(__VA_ARGS__)
+#define vlseg6e16ff_v_i16mf2_m(...) __riscv_vlseg6e16ff_v_i16mf2_tumu(__VA_ARGS__)
+#define vlseg7e16ff_v_i16mf2_m(...) __riscv_vlseg7e16ff_v_i16mf2_tumu(__VA_ARGS__)
+#define vlseg8e16ff_v_i16mf2_m(...) __riscv_vlseg8e16ff_v_i16mf2_tumu(__VA_ARGS__)
+#define vlseg2e16ff_v_i16m1_m(...) __riscv_vlseg2e16ff_v_i16m1_tumu(__VA_ARGS__)
+#define vlseg3e16ff_v_i16m1_m(...) __riscv_vlseg3e16ff_v_i16m1_tumu(__VA_ARGS__)
+#define vlseg4e16ff_v_i16m1_m(...) __riscv_vlseg4e16ff_v_i16m1_tumu(__VA_ARGS__)
+#define vlseg5e16ff_v_i16m1_m(...) __riscv_vlseg5e16ff_v_i16m1_tumu(__VA_ARGS__)
+#define vlseg6e16ff_v_i16m1_m(...) __riscv_vlseg6e16ff_v_i16m1_tumu(__VA_ARGS__)
+#define vlseg7e16ff_v_i16m1_m(...) __riscv_vlseg7e16ff_v_i16m1_tumu(__VA_ARGS__)
+#define vlseg8e16ff_v_i16m1_m(...) __riscv_vlseg8e16ff_v_i16m1_tumu(__VA_ARGS__)
+#define vlseg2e16ff_v_i16m2_m(...) __riscv_vlseg2e16ff_v_i16m2_tumu(__VA_ARGS__)
+#define vlseg3e16ff_v_i16m2_m(...) __riscv_vlseg3e16ff_v_i16m2_tumu(__VA_ARGS__)
+#define vlseg4e16ff_v_i16m2_m(...) __riscv_vlseg4e16ff_v_i16m2_tumu(__VA_ARGS__)
+#define vlseg2e16ff_v_i16m4_m(...) __riscv_vlseg2e16ff_v_i16m4_tumu(__VA_ARGS__)
+#define vlseg2e32ff_v_i32mf2_m(...) __riscv_vlseg2e32ff_v_i32mf2_tumu(__VA_ARGS__) // i32 masked "ff" aliases: each *_m name expands to the __riscv_-prefixed *_tumu name, arguments forwarded unchanged (mf2, m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e32ff_v_i32mf2_m(...) __riscv_vlseg3e32ff_v_i32mf2_tumu(__VA_ARGS__)
+#define vlseg4e32ff_v_i32mf2_m(...) __riscv_vlseg4e32ff_v_i32mf2_tumu(__VA_ARGS__)
+#define vlseg5e32ff_v_i32mf2_m(...) __riscv_vlseg5e32ff_v_i32mf2_tumu(__VA_ARGS__)
+#define vlseg6e32ff_v_i32mf2_m(...) __riscv_vlseg6e32ff_v_i32mf2_tumu(__VA_ARGS__)
+#define vlseg7e32ff_v_i32mf2_m(...) __riscv_vlseg7e32ff_v_i32mf2_tumu(__VA_ARGS__)
+#define vlseg8e32ff_v_i32mf2_m(...) __riscv_vlseg8e32ff_v_i32mf2_tumu(__VA_ARGS__)
+#define vlseg2e32ff_v_i32m1_m(...) __riscv_vlseg2e32ff_v_i32m1_tumu(__VA_ARGS__)
+#define vlseg3e32ff_v_i32m1_m(...) __riscv_vlseg3e32ff_v_i32m1_tumu(__VA_ARGS__)
+#define vlseg4e32ff_v_i32m1_m(...) __riscv_vlseg4e32ff_v_i32m1_tumu(__VA_ARGS__)
+#define vlseg5e32ff_v_i32m1_m(...) __riscv_vlseg5e32ff_v_i32m1_tumu(__VA_ARGS__)
+#define vlseg6e32ff_v_i32m1_m(...) __riscv_vlseg6e32ff_v_i32m1_tumu(__VA_ARGS__)
+#define vlseg7e32ff_v_i32m1_m(...) __riscv_vlseg7e32ff_v_i32m1_tumu(__VA_ARGS__)
+#define vlseg8e32ff_v_i32m1_m(...) __riscv_vlseg8e32ff_v_i32m1_tumu(__VA_ARGS__)
+#define vlseg2e32ff_v_i32m2_m(...) __riscv_vlseg2e32ff_v_i32m2_tumu(__VA_ARGS__)
+#define vlseg3e32ff_v_i32m2_m(...) __riscv_vlseg3e32ff_v_i32m2_tumu(__VA_ARGS__)
+#define vlseg4e32ff_v_i32m2_m(...) __riscv_vlseg4e32ff_v_i32m2_tumu(__VA_ARGS__)
+#define vlseg2e32ff_v_i32m4_m(...) __riscv_vlseg2e32ff_v_i32m4_tumu(__VA_ARGS__)
+#define vlseg2e64ff_v_i64m1_m(...) __riscv_vlseg2e64ff_v_i64m1_tumu(__VA_ARGS__) // i64 masked "ff" aliases: each *_m name expands to the __riscv_-prefixed *_tumu name, arguments forwarded unchanged (m1: seg2-seg8; m2: seg2-seg4; m4: seg2)
+#define vlseg3e64ff_v_i64m1_m(...) __riscv_vlseg3e64ff_v_i64m1_tumu(__VA_ARGS__)
+#define vlseg4e64ff_v_i64m1_m(...) __riscv_vlseg4e64ff_v_i64m1_tumu(__VA_ARGS__)
+#define vlseg5e64ff_v_i64m1_m(...) __riscv_vlseg5e64ff_v_i64m1_tumu(__VA_ARGS__)
+#define vlseg6e64ff_v_i64m1_m(...) __riscv_vlseg6e64ff_v_i64m1_tumu(__VA_ARGS__)
+#define vlseg7e64ff_v_i64m1_m(...) __riscv_vlseg7e64ff_v_i64m1_tumu(__VA_ARGS__)
+#define vlseg8e64ff_v_i64m1_m(...) __riscv_vlseg8e64ff_v_i64m1_tumu(__VA_ARGS__)
+#define vlseg2e64ff_v_i64m2_m(...) __riscv_vlseg2e64ff_v_i64m2_tumu(__VA_ARGS__)
+#define vlseg3e64ff_v_i64m2_m(...) __riscv_vlseg3e64ff_v_i64m2_tumu(__VA_ARGS__)
+#define vlseg4e64ff_v_i64m2_m(...) __riscv_vlseg4e64ff_v_i64m2_tumu(__VA_ARGS__)
+#define vlseg2e64ff_v_i64m4_m(...) __riscv_vlseg2e64ff_v_i64m4_tumu(__VA_ARGS__)
+#define vlseg2e8_v_u8mf8_m(...) __riscv_vlseg2e8_v_u8mf8_tumu(__VA_ARGS__) // u8 masked aliases: each *_m name expands to the __riscv_-prefixed *_tumu name, arguments forwarded unchanged
+#define vlseg3e8_v_u8mf8_m(...) __riscv_vlseg3e8_v_u8mf8_tumu(__VA_ARGS__) // NOTE(review): _tumu presumably selects the tail-undisturbed, mask-undisturbed policy form - confirm it matches what callers of the *_m names expect
+#define vlseg4e8_v_u8mf8_m(...) __riscv_vlseg4e8_v_u8mf8_tumu(__VA_ARGS__)
+#define vlseg5e8_v_u8mf8_m(...) __riscv_vlseg5e8_v_u8mf8_tumu(__VA_ARGS__)
+#define vlseg6e8_v_u8mf8_m(...) __riscv_vlseg6e8_v_u8mf8_tumu(__VA_ARGS__)
+#define vlseg7e8_v_u8mf8_m(...) __riscv_vlseg7e8_v_u8mf8_tumu(__VA_ARGS__)
+#define vlseg8e8_v_u8mf8_m(...) __riscv_vlseg8e8_v_u8mf8_tumu(__VA_ARGS__)
+#define vlseg2e8_v_u8mf4_m(...) __riscv_vlseg2e8_v_u8mf4_tumu(__VA_ARGS__)
+#define vlseg3e8_v_u8mf4_m(...) __riscv_vlseg3e8_v_u8mf4_tumu(__VA_ARGS__)
+#define vlseg4e8_v_u8mf4_m(...) __riscv_vlseg4e8_v_u8mf4_tumu(__VA_ARGS__)
+#define vlseg5e8_v_u8mf4_m(...) __riscv_vlseg5e8_v_u8mf4_tumu(__VA_ARGS__)
+#define vlseg6e8_v_u8mf4_m(...) __riscv_vlseg6e8_v_u8mf4_tumu(__VA_ARGS__)
+#define vlseg7e8_v_u8mf4_m(...) __riscv_vlseg7e8_v_u8mf4_tumu(__VA_ARGS__)
+#define vlseg8e8_v_u8mf4_m(...) __riscv_vlseg8e8_v_u8mf4_tumu(__VA_ARGS__)
+#define vlseg2e8_v_u8mf2_m(...) __riscv_vlseg2e8_v_u8mf2_tumu(__VA_ARGS__)
+#define vlseg3e8_v_u8mf2_m(...) __riscv_vlseg3e8_v_u8mf2_tumu(__VA_ARGS__)
+#define vlseg4e8_v_u8mf2_m(...) __riscv_vlseg4e8_v_u8mf2_tumu(__VA_ARGS__)
+#define vlseg5e8_v_u8mf2_m(...) __riscv_vlseg5e8_v_u8mf2_tumu(__VA_ARGS__)
+#define vlseg6e8_v_u8mf2_m(...) __riscv_vlseg6e8_v_u8mf2_tumu(__VA_ARGS__)
+#define vlseg7e8_v_u8mf2_m(...) __riscv_vlseg7e8_v_u8mf2_tumu(__VA_ARGS__)
+#define vlseg8e8_v_u8mf2_m(...) __riscv_vlseg8e8_v_u8mf2_tumu(__VA_ARGS__)
+#define vlseg2e8_v_u8m1_m(...) __riscv_vlseg2e8_v_u8m1_tumu(__VA_ARGS__)
+#define vlseg3e8_v_u8m1_m(...) __riscv_vlseg3e8_v_u8m1_tumu(__VA_ARGS__)
+#define vlseg4e8_v_u8m1_m(...) __riscv_vlseg4e8_v_u8m1_tumu(__VA_ARGS__)
+#define vlseg5e8_v_u8m1_m(...) __riscv_vlseg5e8_v_u8m1_tumu(__VA_ARGS__)
+#define vlseg6e8_v_u8m1_m(...) __riscv_vlseg6e8_v_u8m1_tumu(__VA_ARGS__)
+#define vlseg7e8_v_u8m1_m(...) __riscv_vlseg7e8_v_u8m1_tumu(__VA_ARGS__)
+#define vlseg8e8_v_u8m1_m(...) __riscv_vlseg8e8_v_u8m1_tumu(__VA_ARGS__)
+#define vlseg2e8_v_u8m2_m(...) __riscv_vlseg2e8_v_u8m2_tumu(__VA_ARGS__)
+#define vlseg3e8_v_u8m2_m(...) __riscv_vlseg3e8_v_u8m2_tumu(__VA_ARGS__)
+#define vlseg4e8_v_u8m2_m(...) __riscv_vlseg4e8_v_u8m2_tumu(__VA_ARGS__)
+#define vlseg2e8_v_u8m4_m(...) __riscv_vlseg2e8_v_u8m4_tumu(__VA_ARGS__)
+#define vlseg2e16_v_u16mf4_m(...) __riscv_vlseg2e16_v_u16mf4_tumu(__VA_ARGS__)
+#define vlseg3e16_v_u16mf4_m(...) __riscv_vlseg3e16_v_u16mf4_tumu(__VA_ARGS__)
+#define vlseg4e16_v_u16mf4_m(...) __riscv_vlseg4e16_v_u16mf4_tumu(__VA_ARGS__)
+#define vlseg5e16_v_u16mf4_m(...) __riscv_vlseg5e16_v_u16mf4_tumu(__VA_ARGS__)
+#define vlseg6e16_v_u16mf4_m(...) __riscv_vlseg6e16_v_u16mf4_tumu(__VA_ARGS__)
+#define vlseg7e16_v_u16mf4_m(...) __riscv_vlseg7e16_v_u16mf4_tumu(__VA_ARGS__)
+#define vlseg8e16_v_u16mf4_m(...) __riscv_vlseg8e16_v_u16mf4_tumu(__VA_ARGS__)
+#define vlseg2e16_v_u16mf2_m(...) __riscv_vlseg2e16_v_u16mf2_tumu(__VA_ARGS__)
+#define vlseg3e16_v_u16mf2_m(...) __riscv_vlseg3e16_v_u16mf2_tumu(__VA_ARGS__)
+#define vlseg4e16_v_u16mf2_m(...) __riscv_vlseg4e16_v_u16mf2_tumu(__VA_ARGS__)
+#define vlseg5e16_v_u16mf2_m(...) __riscv_vlseg5e16_v_u16mf2_tumu(__VA_ARGS__)
+#define vlseg6e16_v_u16mf2_m(...) __riscv_vlseg6e16_v_u16mf2_tumu(__VA_ARGS__)
+#define vlseg7e16_v_u16mf2_m(...) __riscv_vlseg7e16_v_u16mf2_tumu(__VA_ARGS__)
+#define vlseg8e16_v_u16mf2_m(...) __riscv_vlseg8e16_v_u16mf2_tumu(__VA_ARGS__)
+#define vlseg2e16_v_u16m1_m(...) __riscv_vlseg2e16_v_u16m1_tumu(__VA_ARGS__)
+#define vlseg3e16_v_u16m1_m(...) __riscv_vlseg3e16_v_u16m1_tumu(__VA_ARGS__)
+#define vlseg4e16_v_u16m1_m(...) __riscv_vlseg4e16_v_u16m1_tumu(__VA_ARGS__)
+#define vlseg5e16_v_u16m1_m(...) __riscv_vlseg5e16_v_u16m1_tumu(__VA_ARGS__)
+#define vlseg6e16_v_u16m1_m(...) __riscv_vlseg6e16_v_u16m1_tumu(__VA_ARGS__)
+#define vlseg7e16_v_u16m1_m(...) __riscv_vlseg7e16_v_u16m1_tumu(__VA_ARGS__)
+#define vlseg8e16_v_u16m1_m(...) __riscv_vlseg8e16_v_u16m1_tumu(__VA_ARGS__)
+#define vlseg2e16_v_u16m2_m(...) __riscv_vlseg2e16_v_u16m2_tumu(__VA_ARGS__)
+#define vlseg3e16_v_u16m2_m(...) __riscv_vlseg3e16_v_u16m2_tumu(__VA_ARGS__)
+#define vlseg4e16_v_u16m2_m(...) __riscv_vlseg4e16_v_u16m2_tumu(__VA_ARGS__)
+#define vlseg2e16_v_u16m4_m(...) __riscv_vlseg2e16_v_u16m4_tumu(__VA_ARGS__)
+#define vlseg2e32_v_u32mf2_m(...) __riscv_vlseg2e32_v_u32mf2_tumu(__VA_ARGS__)
+#define vlseg3e32_v_u32mf2_m(...) __riscv_vlseg3e32_v_u32mf2_tumu(__VA_ARGS__)
+#define vlseg4e32_v_u32mf2_m(...) __riscv_vlseg4e32_v_u32mf2_tumu(__VA_ARGS__)
+#define vlseg5e32_v_u32mf2_m(...) __riscv_vlseg5e32_v_u32mf2_tumu(__VA_ARGS__)
+#define vlseg6e32_v_u32mf2_m(...) __riscv_vlseg6e32_v_u32mf2_tumu(__VA_ARGS__)
+#define vlseg7e32_v_u32mf2_m(...) __riscv_vlseg7e32_v_u32mf2_tumu(__VA_ARGS__)
+#define vlseg8e32_v_u32mf2_m(...) __riscv_vlseg8e32_v_u32mf2_tumu(__VA_ARGS__)
+#define vlseg2e32_v_u32m1_m(...) __riscv_vlseg2e32_v_u32m1_tumu(__VA_ARGS__)
+#define vlseg3e32_v_u32m1_m(...) __riscv_vlseg3e32_v_u32m1_tumu(__VA_ARGS__)
+#define vlseg4e32_v_u32m1_m(...) __riscv_vlseg4e32_v_u32m1_tumu(__VA_ARGS__)
+#define vlseg5e32_v_u32m1_m(...) __riscv_vlseg5e32_v_u32m1_tumu(__VA_ARGS__)
+#define vlseg6e32_v_u32m1_m(...) __riscv_vlseg6e32_v_u32m1_tumu(__VA_ARGS__)
+#define vlseg7e32_v_u32m1_m(...) __riscv_vlseg7e32_v_u32m1_tumu(__VA_ARGS__)
+#define vlseg8e32_v_u32m1_m(...) __riscv_vlseg8e32_v_u32m1_tumu(__VA_ARGS__)
+#define vlseg2e32_v_u32m2_m(...) __riscv_vlseg2e32_v_u32m2_tumu(__VA_ARGS__)
+#define vlseg3e32_v_u32m2_m(...) __riscv_vlseg3e32_v_u32m2_tumu(__VA_ARGS__)
+#define vlseg4e32_v_u32m2_m(...) __riscv_vlseg4e32_v_u32m2_tumu(__VA_ARGS__)
+#define vlseg2e32_v_u32m4_m(...) __riscv_vlseg2e32_v_u32m4_tumu(__VA_ARGS__)
+#define vlseg2e64_v_u64m1_m(...) __riscv_vlseg2e64_v_u64m1_tumu(__VA_ARGS__)
+#define vlseg3e64_v_u64m1_m(...) __riscv_vlseg3e64_v_u64m1_tumu(__VA_ARGS__)
+#define vlseg4e64_v_u64m1_m(...) __riscv_vlseg4e64_v_u64m1_tumu(__VA_ARGS__)
+#define vlseg5e64_v_u64m1_m(...) __riscv_vlseg5e64_v_u64m1_tumu(__VA_ARGS__)
+#define vlseg6e64_v_u64m1_m(...) __riscv_vlseg6e64_v_u64m1_tumu(__VA_ARGS__)
+#define vlseg7e64_v_u64m1_m(...) __riscv_vlseg7e64_v_u64m1_tumu(__VA_ARGS__)
+#define vlseg8e64_v_u64m1_m(...) __riscv_vlseg8e64_v_u64m1_tumu(__VA_ARGS__)
+#define vlseg2e64_v_u64m2_m(...) __riscv_vlseg2e64_v_u64m2_tumu(__VA_ARGS__)
+#define vlseg3e64_v_u64m2_m(...) __riscv_vlseg3e64_v_u64m2_tumu(__VA_ARGS__)
+#define vlseg4e64_v_u64m2_m(...) __riscv_vlseg4e64_v_u64m2_tumu(__VA_ARGS__)
+#define vlseg2e64_v_u64m4_m(...) __riscv_vlseg2e64_v_u64m4_tumu(__VA_ARGS__)
+#define vlseg2e8ff_v_u8mf8_m(...) __riscv_vlseg2e8ff_v_u8mf8_tumu(__VA_ARGS__)
+#define vlseg3e8ff_v_u8mf8_m(...) __riscv_vlseg3e8ff_v_u8mf8_tumu(__VA_ARGS__)
+#define vlseg4e8ff_v_u8mf8_m(...) __riscv_vlseg4e8ff_v_u8mf8_tumu(__VA_ARGS__)
+#define vlseg5e8ff_v_u8mf8_m(...) __riscv_vlseg5e8ff_v_u8mf8_tumu(__VA_ARGS__)
+#define vlseg6e8ff_v_u8mf8_m(...) __riscv_vlseg6e8ff_v_u8mf8_tumu(__VA_ARGS__)
+#define vlseg7e8ff_v_u8mf8_m(...) __riscv_vlseg7e8ff_v_u8mf8_tumu(__VA_ARGS__)
+#define vlseg8e8ff_v_u8mf8_m(...) __riscv_vlseg8e8ff_v_u8mf8_tumu(__VA_ARGS__)
+#define vlseg2e8ff_v_u8mf4_m(...) __riscv_vlseg2e8ff_v_u8mf4_tumu(__VA_ARGS__)
+#define vlseg3e8ff_v_u8mf4_m(...) __riscv_vlseg3e8ff_v_u8mf4_tumu(__VA_ARGS__)
+#define vlseg4e8ff_v_u8mf4_m(...) __riscv_vlseg4e8ff_v_u8mf4_tumu(__VA_ARGS__)
+#define vlseg5e8ff_v_u8mf4_m(...) __riscv_vlseg5e8ff_v_u8mf4_tumu(__VA_ARGS__)
+#define vlseg6e8ff_v_u8mf4_m(...) __riscv_vlseg6e8ff_v_u8mf4_tumu(__VA_ARGS__)
+#define vlseg7e8ff_v_u8mf4_m(...) __riscv_vlseg7e8ff_v_u8mf4_tumu(__VA_ARGS__)
+#define vlseg8e8ff_v_u8mf4_m(...) __riscv_vlseg8e8ff_v_u8mf4_tumu(__VA_ARGS__)
+#define vlseg2e8ff_v_u8mf2_m(...) __riscv_vlseg2e8ff_v_u8mf2_tumu(__VA_ARGS__)
+#define vlseg3e8ff_v_u8mf2_m(...) __riscv_vlseg3e8ff_v_u8mf2_tumu(__VA_ARGS__)
+#define vlseg4e8ff_v_u8mf2_m(...) __riscv_vlseg4e8ff_v_u8mf2_tumu(__VA_ARGS__)
+#define vlseg5e8ff_v_u8mf2_m(...) __riscv_vlseg5e8ff_v_u8mf2_tumu(__VA_ARGS__)
+#define vlseg6e8ff_v_u8mf2_m(...) __riscv_vlseg6e8ff_v_u8mf2_tumu(__VA_ARGS__)
+#define vlseg7e8ff_v_u8mf2_m(...) __riscv_vlseg7e8ff_v_u8mf2_tumu(__VA_ARGS__)
+#define vlseg8e8ff_v_u8mf2_m(...) __riscv_vlseg8e8ff_v_u8mf2_tumu(__VA_ARGS__)
+#define vlseg2e8ff_v_u8m1_m(...) __riscv_vlseg2e8ff_v_u8m1_tumu(__VA_ARGS__)
+#define vlseg3e8ff_v_u8m1_m(...) __riscv_vlseg3e8ff_v_u8m1_tumu(__VA_ARGS__)
+#define vlseg4e8ff_v_u8m1_m(...) __riscv_vlseg4e8ff_v_u8m1_tumu(__VA_ARGS__)
+#define vlseg5e8ff_v_u8m1_m(...) __riscv_vlseg5e8ff_v_u8m1_tumu(__VA_ARGS__)
+#define vlseg6e8ff_v_u8m1_m(...) __riscv_vlseg6e8ff_v_u8m1_tumu(__VA_ARGS__)
+#define vlseg7e8ff_v_u8m1_m(...) __riscv_vlseg7e8ff_v_u8m1_tumu(__VA_ARGS__)
+#define vlseg8e8ff_v_u8m1_m(...) __riscv_vlseg8e8ff_v_u8m1_tumu(__VA_ARGS__)
+#define vlseg2e8ff_v_u8m2_m(...) __riscv_vlseg2e8ff_v_u8m2_tumu(__VA_ARGS__)
+#define vlseg3e8ff_v_u8m2_m(...) __riscv_vlseg3e8ff_v_u8m2_tumu(__VA_ARGS__)
+#define vlseg4e8ff_v_u8m2_m(...) __riscv_vlseg4e8ff_v_u8m2_tumu(__VA_ARGS__)
+#define vlseg2e8ff_v_u8m4_m(...) __riscv_vlseg2e8ff_v_u8m4_tumu(__VA_ARGS__)
+#define vlseg2e16ff_v_u16mf4_m(...) __riscv_vlseg2e16ff_v_u16mf4_tumu(__VA_ARGS__)
+#define vlseg3e16ff_v_u16mf4_m(...) __riscv_vlseg3e16ff_v_u16mf4_tumu(__VA_ARGS__)
+#define vlseg4e16ff_v_u16mf4_m(...) __riscv_vlseg4e16ff_v_u16mf4_tumu(__VA_ARGS__)
+#define vlseg5e16ff_v_u16mf4_m(...) __riscv_vlseg5e16ff_v_u16mf4_tumu(__VA_ARGS__)
+#define vlseg6e16ff_v_u16mf4_m(...) __riscv_vlseg6e16ff_v_u16mf4_tumu(__VA_ARGS__)
+#define vlseg7e16ff_v_u16mf4_m(...) __riscv_vlseg7e16ff_v_u16mf4_tumu(__VA_ARGS__)
+#define vlseg8e16ff_v_u16mf4_m(...) __riscv_vlseg8e16ff_v_u16mf4_tumu(__VA_ARGS__)
+#define vlseg2e16ff_v_u16mf2_m(...) __riscv_vlseg2e16ff_v_u16mf2_tumu(__VA_ARGS__)
+#define vlseg3e16ff_v_u16mf2_m(...) __riscv_vlseg3e16ff_v_u16mf2_tumu(__VA_ARGS__)
+#define vlseg4e16ff_v_u16mf2_m(...) __riscv_vlseg4e16ff_v_u16mf2_tumu(__VA_ARGS__)
+#define vlseg5e16ff_v_u16mf2_m(...) __riscv_vlseg5e16ff_v_u16mf2_tumu(__VA_ARGS__)
+#define vlseg6e16ff_v_u16mf2_m(...) __riscv_vlseg6e16ff_v_u16mf2_tumu(__VA_ARGS__)
+#define vlseg7e16ff_v_u16mf2_m(...) __riscv_vlseg7e16ff_v_u16mf2_tumu(__VA_ARGS__)
+#define vlseg8e16ff_v_u16mf2_m(...) __riscv_vlseg8e16ff_v_u16mf2_tumu(__VA_ARGS__)
+#define vlseg2e16ff_v_u16m1_m(...) __riscv_vlseg2e16ff_v_u16m1_tumu(__VA_ARGS__)
+#define vlseg3e16ff_v_u16m1_m(...) __riscv_vlseg3e16ff_v_u16m1_tumu(__VA_ARGS__)
+#define vlseg4e16ff_v_u16m1_m(...) __riscv_vlseg4e16ff_v_u16m1_tumu(__VA_ARGS__)
+#define vlseg5e16ff_v_u16m1_m(...) __riscv_vlseg5e16ff_v_u16m1_tumu(__VA_ARGS__)
+#define vlseg6e16ff_v_u16m1_m(...) __riscv_vlseg6e16ff_v_u16m1_tumu(__VA_ARGS__)
+#define vlseg7e16ff_v_u16m1_m(...) __riscv_vlseg7e16ff_v_u16m1_tumu(__VA_ARGS__)
+#define vlseg8e16ff_v_u16m1_m(...) __riscv_vlseg8e16ff_v_u16m1_tumu(__VA_ARGS__)
+#define vlseg2e16ff_v_u16m2_m(...) __riscv_vlseg2e16ff_v_u16m2_tumu(__VA_ARGS__)
+#define vlseg3e16ff_v_u16m2_m(...) __riscv_vlseg3e16ff_v_u16m2_tumu(__VA_ARGS__)
+#define vlseg4e16ff_v_u16m2_m(...) __riscv_vlseg4e16ff_v_u16m2_tumu(__VA_ARGS__)
+#define vlseg2e16ff_v_u16m4_m(...) __riscv_vlseg2e16ff_v_u16m4_tumu(__VA_ARGS__)
+#define vlseg2e32ff_v_u32mf2_m(...) __riscv_vlseg2e32ff_v_u32mf2_tumu(__VA_ARGS__)
+#define vlseg3e32ff_v_u32mf2_m(...) __riscv_vlseg3e32ff_v_u32mf2_tumu(__VA_ARGS__)
+#define vlseg4e32ff_v_u32mf2_m(...) __riscv_vlseg4e32ff_v_u32mf2_tumu(__VA_ARGS__)
+#define vlseg5e32ff_v_u32mf2_m(...) __riscv_vlseg5e32ff_v_u32mf2_tumu(__VA_ARGS__)
+#define vlseg6e32ff_v_u32mf2_m(...) __riscv_vlseg6e32ff_v_u32mf2_tumu(__VA_ARGS__)
+#define vlseg7e32ff_v_u32mf2_m(...) __riscv_vlseg7e32ff_v_u32mf2_tumu(__VA_ARGS__)
+#define vlseg8e32ff_v_u32mf2_m(...) __riscv_vlseg8e32ff_v_u32mf2_tumu(__VA_ARGS__)
+#define vlseg2e32ff_v_u32m1_m(...) __riscv_vlseg2e32ff_v_u32m1_tumu(__VA_ARGS__)
+#define vlseg3e32ff_v_u32m1_m(...) __riscv_vlseg3e32ff_v_u32m1_tumu(__VA_ARGS__)
+#define vlseg4e32ff_v_u32m1_m(...) __riscv_vlseg4e32ff_v_u32m1_tumu(__VA_ARGS__)
+#define vlseg5e32ff_v_u32m1_m(...) __riscv_vlseg5e32ff_v_u32m1_tumu(__VA_ARGS__)
+#define vlseg6e32ff_v_u32m1_m(...) __riscv_vlseg6e32ff_v_u32m1_tumu(__VA_ARGS__)
+#define vlseg7e32ff_v_u32m1_m(...) __riscv_vlseg7e32ff_v_u32m1_tumu(__VA_ARGS__)
+#define vlseg8e32ff_v_u32m1_m(...) __riscv_vlseg8e32ff_v_u32m1_tumu(__VA_ARGS__)
+#define vlseg2e32ff_v_u32m2_m(...) __riscv_vlseg2e32ff_v_u32m2_tumu(__VA_ARGS__)
+#define vlseg3e32ff_v_u32m2_m(...) __riscv_vlseg3e32ff_v_u32m2_tumu(__VA_ARGS__)
+#define vlseg4e32ff_v_u32m2_m(...) __riscv_vlseg4e32ff_v_u32m2_tumu(__VA_ARGS__)
+#define vlseg2e32ff_v_u32m4_m(...) __riscv_vlseg2e32ff_v_u32m4_tumu(__VA_ARGS__)
+#define vlseg2e64ff_v_u64m1_m(...) __riscv_vlseg2e64ff_v_u64m1_tumu(__VA_ARGS__)
+#define vlseg3e64ff_v_u64m1_m(...) __riscv_vlseg3e64ff_v_u64m1_tumu(__VA_ARGS__)
+#define vlseg4e64ff_v_u64m1_m(...) __riscv_vlseg4e64ff_v_u64m1_tumu(__VA_ARGS__)
+#define vlseg5e64ff_v_u64m1_m(...) __riscv_vlseg5e64ff_v_u64m1_tumu(__VA_ARGS__)
+#define vlseg6e64ff_v_u64m1_m(...) __riscv_vlseg6e64ff_v_u64m1_tumu(__VA_ARGS__)
+#define vlseg7e64ff_v_u64m1_m(...) __riscv_vlseg7e64ff_v_u64m1_tumu(__VA_ARGS__)
+#define vlseg8e64ff_v_u64m1_m(...) __riscv_vlseg8e64ff_v_u64m1_tumu(__VA_ARGS__)
+#define vlseg2e64ff_v_u64m2_m(...) __riscv_vlseg2e64ff_v_u64m2_tumu(__VA_ARGS__)
+#define vlseg3e64ff_v_u64m2_m(...) __riscv_vlseg3e64ff_v_u64m2_tumu(__VA_ARGS__)
+#define vlseg4e64ff_v_u64m2_m(...) __riscv_vlseg4e64ff_v_u64m2_tumu(__VA_ARGS__)
+#define vlseg2e64ff_v_u64m4_m(...) __riscv_vlseg2e64ff_v_u64m4_tumu(__VA_ARGS__)
+#define vsseg2e16_v_f16mf4(...) __riscv_vsseg2e16_v_f16mf4(__VA_ARGS__)
+#define vsseg3e16_v_f16mf4(...) __riscv_vsseg3e16_v_f16mf4(__VA_ARGS__)
+#define vsseg4e16_v_f16mf4(...) __riscv_vsseg4e16_v_f16mf4(__VA_ARGS__)
+#define vsseg5e16_v_f16mf4(...) __riscv_vsseg5e16_v_f16mf4(__VA_ARGS__)
+#define vsseg6e16_v_f16mf4(...) __riscv_vsseg6e16_v_f16mf4(__VA_ARGS__)
+#define vsseg7e16_v_f16mf4(...) __riscv_vsseg7e16_v_f16mf4(__VA_ARGS__)
+#define vsseg8e16_v_f16mf4(...) __riscv_vsseg8e16_v_f16mf4(__VA_ARGS__)
+#define vsseg2e16_v_f16mf2(...) __riscv_vsseg2e16_v_f16mf2(__VA_ARGS__)
+#define vsseg3e16_v_f16mf2(...) __riscv_vsseg3e16_v_f16mf2(__VA_ARGS__)
+#define vsseg4e16_v_f16mf2(...) __riscv_vsseg4e16_v_f16mf2(__VA_ARGS__)
+#define vsseg5e16_v_f16mf2(...) __riscv_vsseg5e16_v_f16mf2(__VA_ARGS__)
+#define vsseg6e16_v_f16mf2(...) __riscv_vsseg6e16_v_f16mf2(__VA_ARGS__)
+#define vsseg7e16_v_f16mf2(...) __riscv_vsseg7e16_v_f16mf2(__VA_ARGS__)
+#define vsseg8e16_v_f16mf2(...) __riscv_vsseg8e16_v_f16mf2(__VA_ARGS__)
+#define vsseg2e16_v_f16m1(...) __riscv_vsseg2e16_v_f16m1(__VA_ARGS__)
+#define vsseg3e16_v_f16m1(...) __riscv_vsseg3e16_v_f16m1(__VA_ARGS__)
+#define vsseg4e16_v_f16m1(...) __riscv_vsseg4e16_v_f16m1(__VA_ARGS__)
+#define vsseg5e16_v_f16m1(...) __riscv_vsseg5e16_v_f16m1(__VA_ARGS__)
+#define vsseg6e16_v_f16m1(...) __riscv_vsseg6e16_v_f16m1(__VA_ARGS__)
+#define vsseg7e16_v_f16m1(...) __riscv_vsseg7e16_v_f16m1(__VA_ARGS__)
+#define vsseg8e16_v_f16m1(...) __riscv_vsseg8e16_v_f16m1(__VA_ARGS__)
+#define vsseg2e16_v_f16m2(...) __riscv_vsseg2e16_v_f16m2(__VA_ARGS__)
+#define vsseg3e16_v_f16m2(...) __riscv_vsseg3e16_v_f16m2(__VA_ARGS__)
+#define vsseg4e16_v_f16m2(...) __riscv_vsseg4e16_v_f16m2(__VA_ARGS__)
+#define vsseg2e16_v_f16m4(...) __riscv_vsseg2e16_v_f16m4(__VA_ARGS__)
+#define vsseg2e32_v_f32mf2(...) __riscv_vsseg2e32_v_f32mf2(__VA_ARGS__)
+#define vsseg3e32_v_f32mf2(...) __riscv_vsseg3e32_v_f32mf2(__VA_ARGS__)
+#define vsseg4e32_v_f32mf2(...) __riscv_vsseg4e32_v_f32mf2(__VA_ARGS__)
+#define vsseg5e32_v_f32mf2(...) __riscv_vsseg5e32_v_f32mf2(__VA_ARGS__)
+#define vsseg6e32_v_f32mf2(...) __riscv_vsseg6e32_v_f32mf2(__VA_ARGS__)
+#define vsseg7e32_v_f32mf2(...) __riscv_vsseg7e32_v_f32mf2(__VA_ARGS__)
+#define vsseg8e32_v_f32mf2(...) __riscv_vsseg8e32_v_f32mf2(__VA_ARGS__)
+#define vsseg2e32_v_f32m1(...) __riscv_vsseg2e32_v_f32m1(__VA_ARGS__)
+#define vsseg3e32_v_f32m1(...) __riscv_vsseg3e32_v_f32m1(__VA_ARGS__)
+#define vsseg4e32_v_f32m1(...) __riscv_vsseg4e32_v_f32m1(__VA_ARGS__)
+#define vsseg5e32_v_f32m1(...) __riscv_vsseg5e32_v_f32m1(__VA_ARGS__)
+#define vsseg6e32_v_f32m1(...) __riscv_vsseg6e32_v_f32m1(__VA_ARGS__)
+#define vsseg7e32_v_f32m1(...) __riscv_vsseg7e32_v_f32m1(__VA_ARGS__)
+#define vsseg8e32_v_f32m1(...) __riscv_vsseg8e32_v_f32m1(__VA_ARGS__)
+#define vsseg2e32_v_f32m2(...) __riscv_vsseg2e32_v_f32m2(__VA_ARGS__)
+#define vsseg3e32_v_f32m2(...) __riscv_vsseg3e32_v_f32m2(__VA_ARGS__)
+#define vsseg4e32_v_f32m2(...) __riscv_vsseg4e32_v_f32m2(__VA_ARGS__)
+#define vsseg2e32_v_f32m4(...) __riscv_vsseg2e32_v_f32m4(__VA_ARGS__)
+#define vsseg2e64_v_f64m1(...) __riscv_vsseg2e64_v_f64m1(__VA_ARGS__)
+#define vsseg3e64_v_f64m1(...) __riscv_vsseg3e64_v_f64m1(__VA_ARGS__)
+#define vsseg4e64_v_f64m1(...) __riscv_vsseg4e64_v_f64m1(__VA_ARGS__)
+#define vsseg5e64_v_f64m1(...) __riscv_vsseg5e64_v_f64m1(__VA_ARGS__)
+#define vsseg6e64_v_f64m1(...) __riscv_vsseg6e64_v_f64m1(__VA_ARGS__)
+#define vsseg7e64_v_f64m1(...) __riscv_vsseg7e64_v_f64m1(__VA_ARGS__)
+#define vsseg8e64_v_f64m1(...) __riscv_vsseg8e64_v_f64m1(__VA_ARGS__)
+#define vsseg2e64_v_f64m2(...) __riscv_vsseg2e64_v_f64m2(__VA_ARGS__)
+#define vsseg3e64_v_f64m2(...) __riscv_vsseg3e64_v_f64m2(__VA_ARGS__)
+#define vsseg4e64_v_f64m2(...) __riscv_vsseg4e64_v_f64m2(__VA_ARGS__)
+#define vsseg2e64_v_f64m4(...) __riscv_vsseg2e64_v_f64m4(__VA_ARGS__)
+#define vsseg2e8_v_i8mf8(...) __riscv_vsseg2e8_v_i8mf8(__VA_ARGS__)
+#define vsseg3e8_v_i8mf8(...) __riscv_vsseg3e8_v_i8mf8(__VA_ARGS__)
+#define vsseg4e8_v_i8mf8(...) __riscv_vsseg4e8_v_i8mf8(__VA_ARGS__)
+#define vsseg5e8_v_i8mf8(...) __riscv_vsseg5e8_v_i8mf8(__VA_ARGS__)
+#define vsseg6e8_v_i8mf8(...) __riscv_vsseg6e8_v_i8mf8(__VA_ARGS__)
+#define vsseg7e8_v_i8mf8(...) __riscv_vsseg7e8_v_i8mf8(__VA_ARGS__)
+#define vsseg8e8_v_i8mf8(...) __riscv_vsseg8e8_v_i8mf8(__VA_ARGS__)
+#define vsseg2e8_v_i8mf4(...) __riscv_vsseg2e8_v_i8mf4(__VA_ARGS__)
+#define vsseg3e8_v_i8mf4(...) __riscv_vsseg3e8_v_i8mf4(__VA_ARGS__)
+#define vsseg4e8_v_i8mf4(...) __riscv_vsseg4e8_v_i8mf4(__VA_ARGS__)
+#define vsseg5e8_v_i8mf4(...) __riscv_vsseg5e8_v_i8mf4(__VA_ARGS__)
+#define vsseg6e8_v_i8mf4(...) __riscv_vsseg6e8_v_i8mf4(__VA_ARGS__)
+#define vsseg7e8_v_i8mf4(...) __riscv_vsseg7e8_v_i8mf4(__VA_ARGS__)
+#define vsseg8e8_v_i8mf4(...) __riscv_vsseg8e8_v_i8mf4(__VA_ARGS__)
+#define vsseg2e8_v_i8mf2(...) __riscv_vsseg2e8_v_i8mf2(__VA_ARGS__)
+#define vsseg3e8_v_i8mf2(...) __riscv_vsseg3e8_v_i8mf2(__VA_ARGS__)
+#define vsseg4e8_v_i8mf2(...) __riscv_vsseg4e8_v_i8mf2(__VA_ARGS__)
+#define vsseg5e8_v_i8mf2(...) __riscv_vsseg5e8_v_i8mf2(__VA_ARGS__)
+#define vsseg6e8_v_i8mf2(...) __riscv_vsseg6e8_v_i8mf2(__VA_ARGS__)
+#define vsseg7e8_v_i8mf2(...) __riscv_vsseg7e8_v_i8mf2(__VA_ARGS__)
+#define vsseg8e8_v_i8mf2(...) __riscv_vsseg8e8_v_i8mf2(__VA_ARGS__)
+#define vsseg2e8_v_i8m1(...) __riscv_vsseg2e8_v_i8m1(__VA_ARGS__)
+#define vsseg3e8_v_i8m1(...) __riscv_vsseg3e8_v_i8m1(__VA_ARGS__)
+#define vsseg4e8_v_i8m1(...) __riscv_vsseg4e8_v_i8m1(__VA_ARGS__)
+#define vsseg5e8_v_i8m1(...) __riscv_vsseg5e8_v_i8m1(__VA_ARGS__)
+#define vsseg6e8_v_i8m1(...) __riscv_vsseg6e8_v_i8m1(__VA_ARGS__)
+#define vsseg7e8_v_i8m1(...) __riscv_vsseg7e8_v_i8m1(__VA_ARGS__)
+#define vsseg8e8_v_i8m1(...) __riscv_vsseg8e8_v_i8m1(__VA_ARGS__)
+#define vsseg2e8_v_i8m2(...) __riscv_vsseg2e8_v_i8m2(__VA_ARGS__)
+#define vsseg3e8_v_i8m2(...) __riscv_vsseg3e8_v_i8m2(__VA_ARGS__)
+#define vsseg4e8_v_i8m2(...) __riscv_vsseg4e8_v_i8m2(__VA_ARGS__)
+#define vsseg2e8_v_i8m4(...) __riscv_vsseg2e8_v_i8m4(__VA_ARGS__)
+#define vsseg2e16_v_i16mf4(...) __riscv_vsseg2e16_v_i16mf4(__VA_ARGS__)
+#define vsseg3e16_v_i16mf4(...) __riscv_vsseg3e16_v_i16mf4(__VA_ARGS__)
+#define vsseg4e16_v_i16mf4(...) __riscv_vsseg4e16_v_i16mf4(__VA_ARGS__)
+#define vsseg5e16_v_i16mf4(...) __riscv_vsseg5e16_v_i16mf4(__VA_ARGS__)
+#define vsseg6e16_v_i16mf4(...) __riscv_vsseg6e16_v_i16mf4(__VA_ARGS__)
+#define vsseg7e16_v_i16mf4(...) __riscv_vsseg7e16_v_i16mf4(__VA_ARGS__)
+#define vsseg8e16_v_i16mf4(...) __riscv_vsseg8e16_v_i16mf4(__VA_ARGS__)
+#define vsseg2e16_v_i16mf2(...) __riscv_vsseg2e16_v_i16mf2(__VA_ARGS__)
+#define vsseg3e16_v_i16mf2(...) __riscv_vsseg3e16_v_i16mf2(__VA_ARGS__)
+#define vsseg4e16_v_i16mf2(...) __riscv_vsseg4e16_v_i16mf2(__VA_ARGS__)
+#define vsseg5e16_v_i16mf2(...) __riscv_vsseg5e16_v_i16mf2(__VA_ARGS__)
+#define vsseg6e16_v_i16mf2(...) __riscv_vsseg6e16_v_i16mf2(__VA_ARGS__)
+#define vsseg7e16_v_i16mf2(...) __riscv_vsseg7e16_v_i16mf2(__VA_ARGS__)
+#define vsseg8e16_v_i16mf2(...) __riscv_vsseg8e16_v_i16mf2(__VA_ARGS__)
+#define vsseg2e16_v_i16m1(...) __riscv_vsseg2e16_v_i16m1(__VA_ARGS__)
+#define vsseg3e16_v_i16m1(...) __riscv_vsseg3e16_v_i16m1(__VA_ARGS__)
+#define vsseg4e16_v_i16m1(...) __riscv_vsseg4e16_v_i16m1(__VA_ARGS__)
+#define vsseg5e16_v_i16m1(...) __riscv_vsseg5e16_v_i16m1(__VA_ARGS__)
+#define vsseg6e16_v_i16m1(...) __riscv_vsseg6e16_v_i16m1(__VA_ARGS__)
+#define vsseg7e16_v_i16m1(...) __riscv_vsseg7e16_v_i16m1(__VA_ARGS__)
+#define vsseg8e16_v_i16m1(...) __riscv_vsseg8e16_v_i16m1(__VA_ARGS__)
+#define vsseg2e16_v_i16m2(...) __riscv_vsseg2e16_v_i16m2(__VA_ARGS__)
+#define vsseg3e16_v_i16m2(...) __riscv_vsseg3e16_v_i16m2(__VA_ARGS__)
+#define vsseg4e16_v_i16m2(...) __riscv_vsseg4e16_v_i16m2(__VA_ARGS__)
+#define vsseg2e16_v_i16m4(...) __riscv_vsseg2e16_v_i16m4(__VA_ARGS__)
+#define vsseg2e32_v_i32mf2(...) __riscv_vsseg2e32_v_i32mf2(__VA_ARGS__)
+#define vsseg3e32_v_i32mf2(...) __riscv_vsseg3e32_v_i32mf2(__VA_ARGS__)
+#define vsseg4e32_v_i32mf2(...) __riscv_vsseg4e32_v_i32mf2(__VA_ARGS__)
+#define vsseg5e32_v_i32mf2(...) __riscv_vsseg5e32_v_i32mf2(__VA_ARGS__)
+#define vsseg6e32_v_i32mf2(...) __riscv_vsseg6e32_v_i32mf2(__VA_ARGS__)
+#define vsseg7e32_v_i32mf2(...) __riscv_vsseg7e32_v_i32mf2(__VA_ARGS__)
+#define vsseg8e32_v_i32mf2(...) __riscv_vsseg8e32_v_i32mf2(__VA_ARGS__)
+#define vsseg2e32_v_i32m1(...) __riscv_vsseg2e32_v_i32m1(__VA_ARGS__)
+#define vsseg3e32_v_i32m1(...) __riscv_vsseg3e32_v_i32m1(__VA_ARGS__)
+#define vsseg4e32_v_i32m1(...) __riscv_vsseg4e32_v_i32m1(__VA_ARGS__)
+#define vsseg5e32_v_i32m1(...) __riscv_vsseg5e32_v_i32m1(__VA_ARGS__)
+#define vsseg6e32_v_i32m1(...) __riscv_vsseg6e32_v_i32m1(__VA_ARGS__)
+#define vsseg7e32_v_i32m1(...) __riscv_vsseg7e32_v_i32m1(__VA_ARGS__)
+#define vsseg8e32_v_i32m1(...) __riscv_vsseg8e32_v_i32m1(__VA_ARGS__)
+#define vsseg2e32_v_i32m2(...) __riscv_vsseg2e32_v_i32m2(__VA_ARGS__)
+#define vsseg3e32_v_i32m2(...) __riscv_vsseg3e32_v_i32m2(__VA_ARGS__)
+#define vsseg4e32_v_i32m2(...) __riscv_vsseg4e32_v_i32m2(__VA_ARGS__)
+#define vsseg2e32_v_i32m4(...) __riscv_vsseg2e32_v_i32m4(__VA_ARGS__)
+#define vsseg2e64_v_i64m1(...) __riscv_vsseg2e64_v_i64m1(__VA_ARGS__)
+#define vsseg3e64_v_i64m1(...) __riscv_vsseg3e64_v_i64m1(__VA_ARGS__)
+#define vsseg4e64_v_i64m1(...) __riscv_vsseg4e64_v_i64m1(__VA_ARGS__)
+#define vsseg5e64_v_i64m1(...) __riscv_vsseg5e64_v_i64m1(__VA_ARGS__)
+#define vsseg6e64_v_i64m1(...) __riscv_vsseg6e64_v_i64m1(__VA_ARGS__)
+#define vsseg7e64_v_i64m1(...) __riscv_vsseg7e64_v_i64m1(__VA_ARGS__)
+#define vsseg8e64_v_i64m1(...) __riscv_vsseg8e64_v_i64m1(__VA_ARGS__)
+#define vsseg2e64_v_i64m2(...) __riscv_vsseg2e64_v_i64m2(__VA_ARGS__)
+#define vsseg3e64_v_i64m2(...) __riscv_vsseg3e64_v_i64m2(__VA_ARGS__)
+#define vsseg4e64_v_i64m2(...) __riscv_vsseg4e64_v_i64m2(__VA_ARGS__)
+#define vsseg2e64_v_i64m4(...) __riscv_vsseg2e64_v_i64m4(__VA_ARGS__)
+#define vsseg2e8_v_u8mf8(...) __riscv_vsseg2e8_v_u8mf8(__VA_ARGS__)
+#define vsseg3e8_v_u8mf8(...) __riscv_vsseg3e8_v_u8mf8(__VA_ARGS__)
+#define vsseg4e8_v_u8mf8(...) __riscv_vsseg4e8_v_u8mf8(__VA_ARGS__)
+#define vsseg5e8_v_u8mf8(...) __riscv_vsseg5e8_v_u8mf8(__VA_ARGS__)
+#define vsseg6e8_v_u8mf8(...) __riscv_vsseg6e8_v_u8mf8(__VA_ARGS__)
+#define vsseg7e8_v_u8mf8(...) __riscv_vsseg7e8_v_u8mf8(__VA_ARGS__)
+#define vsseg8e8_v_u8mf8(...) __riscv_vsseg8e8_v_u8mf8(__VA_ARGS__)
+#define vsseg2e8_v_u8mf4(...) __riscv_vsseg2e8_v_u8mf4(__VA_ARGS__)
+#define vsseg3e8_v_u8mf4(...) __riscv_vsseg3e8_v_u8mf4(__VA_ARGS__)
+#define vsseg4e8_v_u8mf4(...) __riscv_vsseg4e8_v_u8mf4(__VA_ARGS__)
+#define vsseg5e8_v_u8mf4(...) __riscv_vsseg5e8_v_u8mf4(__VA_ARGS__)
+#define vsseg6e8_v_u8mf4(...) __riscv_vsseg6e8_v_u8mf4(__VA_ARGS__)
+#define vsseg7e8_v_u8mf4(...) __riscv_vsseg7e8_v_u8mf4(__VA_ARGS__)
+#define vsseg8e8_v_u8mf4(...) __riscv_vsseg8e8_v_u8mf4(__VA_ARGS__)
+#define vsseg2e8_v_u8mf2(...) __riscv_vsseg2e8_v_u8mf2(__VA_ARGS__)
+#define vsseg3e8_v_u8mf2(...) __riscv_vsseg3e8_v_u8mf2(__VA_ARGS__)
+#define vsseg4e8_v_u8mf2(...) __riscv_vsseg4e8_v_u8mf2(__VA_ARGS__)
+#define vsseg5e8_v_u8mf2(...) __riscv_vsseg5e8_v_u8mf2(__VA_ARGS__)
+#define vsseg6e8_v_u8mf2(...) __riscv_vsseg6e8_v_u8mf2(__VA_ARGS__)
+#define vsseg7e8_v_u8mf2(...) __riscv_vsseg7e8_v_u8mf2(__VA_ARGS__)
+#define vsseg8e8_v_u8mf2(...) __riscv_vsseg8e8_v_u8mf2(__VA_ARGS__)
+#define vsseg2e8_v_u8m1(...) __riscv_vsseg2e8_v_u8m1(__VA_ARGS__)
+#define vsseg3e8_v_u8m1(...) __riscv_vsseg3e8_v_u8m1(__VA_ARGS__)
+#define vsseg4e8_v_u8m1(...) __riscv_vsseg4e8_v_u8m1(__VA_ARGS__)
+#define vsseg5e8_v_u8m1(...) __riscv_vsseg5e8_v_u8m1(__VA_ARGS__)
+#define vsseg6e8_v_u8m1(...) __riscv_vsseg6e8_v_u8m1(__VA_ARGS__)
+#define vsseg7e8_v_u8m1(...) __riscv_vsseg7e8_v_u8m1(__VA_ARGS__)
+#define vsseg8e8_v_u8m1(...) __riscv_vsseg8e8_v_u8m1(__VA_ARGS__)
+#define vsseg2e8_v_u8m2(...) __riscv_vsseg2e8_v_u8m2(__VA_ARGS__)
+#define vsseg3e8_v_u8m2(...) __riscv_vsseg3e8_v_u8m2(__VA_ARGS__)
+#define vsseg4e8_v_u8m2(...) __riscv_vsseg4e8_v_u8m2(__VA_ARGS__)
+#define vsseg2e8_v_u8m4(...) __riscv_vsseg2e8_v_u8m4(__VA_ARGS__)
+#define vsseg2e16_v_u16mf4(...) __riscv_vsseg2e16_v_u16mf4(__VA_ARGS__)
+#define vsseg3e16_v_u16mf4(...) __riscv_vsseg3e16_v_u16mf4(__VA_ARGS__)
+#define vsseg4e16_v_u16mf4(...) __riscv_vsseg4e16_v_u16mf4(__VA_ARGS__)
+#define vsseg5e16_v_u16mf4(...) __riscv_vsseg5e16_v_u16mf4(__VA_ARGS__)
+#define vsseg6e16_v_u16mf4(...) __riscv_vsseg6e16_v_u16mf4(__VA_ARGS__)
+#define vsseg7e16_v_u16mf4(...) __riscv_vsseg7e16_v_u16mf4(__VA_ARGS__)
+#define vsseg8e16_v_u16mf4(...) __riscv_vsseg8e16_v_u16mf4(__VA_ARGS__)
+#define vsseg2e16_v_u16mf2(...) __riscv_vsseg2e16_v_u16mf2(__VA_ARGS__)
+#define vsseg3e16_v_u16mf2(...) __riscv_vsseg3e16_v_u16mf2(__VA_ARGS__)
+#define vsseg4e16_v_u16mf2(...) __riscv_vsseg4e16_v_u16mf2(__VA_ARGS__)
+#define vsseg5e16_v_u16mf2(...) __riscv_vsseg5e16_v_u16mf2(__VA_ARGS__)
+#define vsseg6e16_v_u16mf2(...) __riscv_vsseg6e16_v_u16mf2(__VA_ARGS__)
+#define vsseg7e16_v_u16mf2(...) __riscv_vsseg7e16_v_u16mf2(__VA_ARGS__)
+#define vsseg8e16_v_u16mf2(...) __riscv_vsseg8e16_v_u16mf2(__VA_ARGS__)
+#define vsseg2e16_v_u16m1(...) __riscv_vsseg2e16_v_u16m1(__VA_ARGS__)
+#define vsseg3e16_v_u16m1(...) __riscv_vsseg3e16_v_u16m1(__VA_ARGS__)
+#define vsseg4e16_v_u16m1(...) __riscv_vsseg4e16_v_u16m1(__VA_ARGS__)
+#define vsseg5e16_v_u16m1(...) __riscv_vsseg5e16_v_u16m1(__VA_ARGS__)
+#define vsseg6e16_v_u16m1(...) __riscv_vsseg6e16_v_u16m1(__VA_ARGS__)
+#define vsseg7e16_v_u16m1(...) __riscv_vsseg7e16_v_u16m1(__VA_ARGS__)
+#define vsseg8e16_v_u16m1(...) __riscv_vsseg8e16_v_u16m1(__VA_ARGS__)
+#define vsseg2e16_v_u16m2(...) __riscv_vsseg2e16_v_u16m2(__VA_ARGS__)
+#define vsseg3e16_v_u16m2(...) __riscv_vsseg3e16_v_u16m2(__VA_ARGS__)
+#define vsseg4e16_v_u16m2(...) __riscv_vsseg4e16_v_u16m2(__VA_ARGS__)
+#define vsseg2e16_v_u16m4(...) __riscv_vsseg2e16_v_u16m4(__VA_ARGS__)
+#define vsseg2e32_v_u32mf2(...) __riscv_vsseg2e32_v_u32mf2(__VA_ARGS__)
+#define vsseg3e32_v_u32mf2(...) __riscv_vsseg3e32_v_u32mf2(__VA_ARGS__)
+#define vsseg4e32_v_u32mf2(...) __riscv_vsseg4e32_v_u32mf2(__VA_ARGS__)
+#define vsseg5e32_v_u32mf2(...) __riscv_vsseg5e32_v_u32mf2(__VA_ARGS__)
+#define vsseg6e32_v_u32mf2(...) __riscv_vsseg6e32_v_u32mf2(__VA_ARGS__)
+#define vsseg7e32_v_u32mf2(...) __riscv_vsseg7e32_v_u32mf2(__VA_ARGS__)
+#define vsseg8e32_v_u32mf2(...) __riscv_vsseg8e32_v_u32mf2(__VA_ARGS__)
+#define vsseg2e32_v_u32m1(...) __riscv_vsseg2e32_v_u32m1(__VA_ARGS__)
+#define vsseg3e32_v_u32m1(...) __riscv_vsseg3e32_v_u32m1(__VA_ARGS__)
+#define vsseg4e32_v_u32m1(...) __riscv_vsseg4e32_v_u32m1(__VA_ARGS__)
+#define vsseg5e32_v_u32m1(...) __riscv_vsseg5e32_v_u32m1(__VA_ARGS__)
+#define vsseg6e32_v_u32m1(...) __riscv_vsseg6e32_v_u32m1(__VA_ARGS__)
+#define vsseg7e32_v_u32m1(...) __riscv_vsseg7e32_v_u32m1(__VA_ARGS__)
+#define vsseg8e32_v_u32m1(...) __riscv_vsseg8e32_v_u32m1(__VA_ARGS__)
+#define vsseg2e32_v_u32m2(...) __riscv_vsseg2e32_v_u32m2(__VA_ARGS__)
+#define vsseg3e32_v_u32m2(...) __riscv_vsseg3e32_v_u32m2(__VA_ARGS__)
+#define vsseg4e32_v_u32m2(...) __riscv_vsseg4e32_v_u32m2(__VA_ARGS__)
+#define vsseg2e32_v_u32m4(...) __riscv_vsseg2e32_v_u32m4(__VA_ARGS__)
+#define vsseg2e64_v_u64m1(...) __riscv_vsseg2e64_v_u64m1(__VA_ARGS__)
+#define vsseg3e64_v_u64m1(...) __riscv_vsseg3e64_v_u64m1(__VA_ARGS__)
+#define vsseg4e64_v_u64m1(...) __riscv_vsseg4e64_v_u64m1(__VA_ARGS__)
+#define vsseg5e64_v_u64m1(...) __riscv_vsseg5e64_v_u64m1(__VA_ARGS__)
+#define vsseg6e64_v_u64m1(...) __riscv_vsseg6e64_v_u64m1(__VA_ARGS__)
+#define vsseg7e64_v_u64m1(...) __riscv_vsseg7e64_v_u64m1(__VA_ARGS__)
+#define vsseg8e64_v_u64m1(...) __riscv_vsseg8e64_v_u64m1(__VA_ARGS__)
+#define vsseg2e64_v_u64m2(...) __riscv_vsseg2e64_v_u64m2(__VA_ARGS__)
+#define vsseg3e64_v_u64m2(...) __riscv_vsseg3e64_v_u64m2(__VA_ARGS__)
+#define vsseg4e64_v_u64m2(...) __riscv_vsseg4e64_v_u64m2(__VA_ARGS__)
+#define vsseg2e64_v_u64m4(...) __riscv_vsseg2e64_v_u64m4(__VA_ARGS__)
+// masked functions
+#define vsseg2e16_v_f16mf4_m(...) __riscv_vsseg2e16_v_f16mf4_m(__VA_ARGS__)
+#define vsseg3e16_v_f16mf4_m(...) __riscv_vsseg3e16_v_f16mf4_m(__VA_ARGS__)
+#define vsseg4e16_v_f16mf4_m(...) __riscv_vsseg4e16_v_f16mf4_m(__VA_ARGS__)
+#define vsseg5e16_v_f16mf4_m(...) __riscv_vsseg5e16_v_f16mf4_m(__VA_ARGS__)
+#define vsseg6e16_v_f16mf4_m(...) __riscv_vsseg6e16_v_f16mf4_m(__VA_ARGS__)
+#define vsseg7e16_v_f16mf4_m(...) __riscv_vsseg7e16_v_f16mf4_m(__VA_ARGS__)
+#define vsseg8e16_v_f16mf4_m(...) __riscv_vsseg8e16_v_f16mf4_m(__VA_ARGS__) // vsseg*_m aliases: each un-prefixed name forwards all arguments unchanged to the same-named __riscv_ intrinsic (suffix stays _m)
+#define vsseg2e16_v_f16mf2_m(...) __riscv_vsseg2e16_v_f16mf2_m(__VA_ARGS__)
+#define vsseg3e16_v_f16mf2_m(...) __riscv_vsseg3e16_v_f16mf2_m(__VA_ARGS__)
+#define vsseg4e16_v_f16mf2_m(...) __riscv_vsseg4e16_v_f16mf2_m(__VA_ARGS__)
+#define vsseg5e16_v_f16mf2_m(...) __riscv_vsseg5e16_v_f16mf2_m(__VA_ARGS__)
+#define vsseg6e16_v_f16mf2_m(...) __riscv_vsseg6e16_v_f16mf2_m(__VA_ARGS__)
+#define vsseg7e16_v_f16mf2_m(...) __riscv_vsseg7e16_v_f16mf2_m(__VA_ARGS__)
+#define vsseg8e16_v_f16mf2_m(...) __riscv_vsseg8e16_v_f16mf2_m(__VA_ARGS__)
+#define vsseg2e16_v_f16m1_m(...) __riscv_vsseg2e16_v_f16m1_m(__VA_ARGS__)
+#define vsseg3e16_v_f16m1_m(...) __riscv_vsseg3e16_v_f16m1_m(__VA_ARGS__)
+#define vsseg4e16_v_f16m1_m(...) __riscv_vsseg4e16_v_f16m1_m(__VA_ARGS__)
+#define vsseg5e16_v_f16m1_m(...) __riscv_vsseg5e16_v_f16m1_m(__VA_ARGS__)
+#define vsseg6e16_v_f16m1_m(...) __riscv_vsseg6e16_v_f16m1_m(__VA_ARGS__)
+#define vsseg7e16_v_f16m1_m(...) __riscv_vsseg7e16_v_f16m1_m(__VA_ARGS__)
+#define vsseg8e16_v_f16m1_m(...) __riscv_vsseg8e16_v_f16m1_m(__VA_ARGS__)
+#define vsseg2e16_v_f16m2_m(...) __riscv_vsseg2e16_v_f16m2_m(__VA_ARGS__)
+#define vsseg3e16_v_f16m2_m(...) __riscv_vsseg3e16_v_f16m2_m(__VA_ARGS__)
+#define vsseg4e16_v_f16m2_m(...) __riscv_vsseg4e16_v_f16m2_m(__VA_ARGS__)
+#define vsseg2e16_v_f16m4_m(...) __riscv_vsseg2e16_v_f16m4_m(__VA_ARGS__)
+#define vsseg2e32_v_f32mf2_m(...) __riscv_vsseg2e32_v_f32mf2_m(__VA_ARGS__)
+#define vsseg3e32_v_f32mf2_m(...) __riscv_vsseg3e32_v_f32mf2_m(__VA_ARGS__)
+#define vsseg4e32_v_f32mf2_m(...) __riscv_vsseg4e32_v_f32mf2_m(__VA_ARGS__)
+#define vsseg5e32_v_f32mf2_m(...) __riscv_vsseg5e32_v_f32mf2_m(__VA_ARGS__)
+#define vsseg6e32_v_f32mf2_m(...) __riscv_vsseg6e32_v_f32mf2_m(__VA_ARGS__)
+#define vsseg7e32_v_f32mf2_m(...) __riscv_vsseg7e32_v_f32mf2_m(__VA_ARGS__)
+#define vsseg8e32_v_f32mf2_m(...) __riscv_vsseg8e32_v_f32mf2_m(__VA_ARGS__)
+#define vsseg2e32_v_f32m1_m(...) __riscv_vsseg2e32_v_f32m1_m(__VA_ARGS__)
+#define vsseg3e32_v_f32m1_m(...) __riscv_vsseg3e32_v_f32m1_m(__VA_ARGS__)
+#define vsseg4e32_v_f32m1_m(...) __riscv_vsseg4e32_v_f32m1_m(__VA_ARGS__)
+#define vsseg5e32_v_f32m1_m(...) __riscv_vsseg5e32_v_f32m1_m(__VA_ARGS__)
+#define vsseg6e32_v_f32m1_m(...) __riscv_vsseg6e32_v_f32m1_m(__VA_ARGS__)
+#define vsseg7e32_v_f32m1_m(...) __riscv_vsseg7e32_v_f32m1_m(__VA_ARGS__)
+#define vsseg8e32_v_f32m1_m(...) __riscv_vsseg8e32_v_f32m1_m(__VA_ARGS__)
+#define vsseg2e32_v_f32m2_m(...) __riscv_vsseg2e32_v_f32m2_m(__VA_ARGS__)
+#define vsseg3e32_v_f32m2_m(...) __riscv_vsseg3e32_v_f32m2_m(__VA_ARGS__)
+#define vsseg4e32_v_f32m2_m(...) __riscv_vsseg4e32_v_f32m2_m(__VA_ARGS__)
+#define vsseg2e32_v_f32m4_m(...) __riscv_vsseg2e32_v_f32m4_m(__VA_ARGS__)
+#define vsseg2e64_v_f64m1_m(...) __riscv_vsseg2e64_v_f64m1_m(__VA_ARGS__)
+#define vsseg3e64_v_f64m1_m(...) __riscv_vsseg3e64_v_f64m1_m(__VA_ARGS__)
+#define vsseg4e64_v_f64m1_m(...) __riscv_vsseg4e64_v_f64m1_m(__VA_ARGS__)
+#define vsseg5e64_v_f64m1_m(...) __riscv_vsseg5e64_v_f64m1_m(__VA_ARGS__)
+#define vsseg6e64_v_f64m1_m(...) __riscv_vsseg6e64_v_f64m1_m(__VA_ARGS__)
+#define vsseg7e64_v_f64m1_m(...) __riscv_vsseg7e64_v_f64m1_m(__VA_ARGS__)
+#define vsseg8e64_v_f64m1_m(...) __riscv_vsseg8e64_v_f64m1_m(__VA_ARGS__)
+#define vsseg2e64_v_f64m2_m(...) __riscv_vsseg2e64_v_f64m2_m(__VA_ARGS__)
+#define vsseg3e64_v_f64m2_m(...) __riscv_vsseg3e64_v_f64m2_m(__VA_ARGS__)
+#define vsseg4e64_v_f64m2_m(...) __riscv_vsseg4e64_v_f64m2_m(__VA_ARGS__)
+#define vsseg2e64_v_f64m4_m(...) __riscv_vsseg2e64_v_f64m4_m(__VA_ARGS__)
+#define vsseg2e8_v_i8mf8_m(...) __riscv_vsseg2e8_v_i8mf8_m(__VA_ARGS__)
+#define vsseg3e8_v_i8mf8_m(...) __riscv_vsseg3e8_v_i8mf8_m(__VA_ARGS__)
+#define vsseg4e8_v_i8mf8_m(...) __riscv_vsseg4e8_v_i8mf8_m(__VA_ARGS__)
+#define vsseg5e8_v_i8mf8_m(...) __riscv_vsseg5e8_v_i8mf8_m(__VA_ARGS__)
+#define vsseg6e8_v_i8mf8_m(...) __riscv_vsseg6e8_v_i8mf8_m(__VA_ARGS__)
+#define vsseg7e8_v_i8mf8_m(...) __riscv_vsseg7e8_v_i8mf8_m(__VA_ARGS__)
+#define vsseg8e8_v_i8mf8_m(...) __riscv_vsseg8e8_v_i8mf8_m(__VA_ARGS__)
+#define vsseg2e8_v_i8mf4_m(...) __riscv_vsseg2e8_v_i8mf4_m(__VA_ARGS__)
+#define vsseg3e8_v_i8mf4_m(...) __riscv_vsseg3e8_v_i8mf4_m(__VA_ARGS__)
+#define vsseg4e8_v_i8mf4_m(...) __riscv_vsseg4e8_v_i8mf4_m(__VA_ARGS__)
+#define vsseg5e8_v_i8mf4_m(...) __riscv_vsseg5e8_v_i8mf4_m(__VA_ARGS__)
+#define vsseg6e8_v_i8mf4_m(...) __riscv_vsseg6e8_v_i8mf4_m(__VA_ARGS__)
+#define vsseg7e8_v_i8mf4_m(...) __riscv_vsseg7e8_v_i8mf4_m(__VA_ARGS__)
+#define vsseg8e8_v_i8mf4_m(...) __riscv_vsseg8e8_v_i8mf4_m(__VA_ARGS__)
+#define vsseg2e8_v_i8mf2_m(...) __riscv_vsseg2e8_v_i8mf2_m(__VA_ARGS__)
+#define vsseg3e8_v_i8mf2_m(...) __riscv_vsseg3e8_v_i8mf2_m(__VA_ARGS__)
+#define vsseg4e8_v_i8mf2_m(...) __riscv_vsseg4e8_v_i8mf2_m(__VA_ARGS__)
+#define vsseg5e8_v_i8mf2_m(...) __riscv_vsseg5e8_v_i8mf2_m(__VA_ARGS__)
+#define vsseg6e8_v_i8mf2_m(...) __riscv_vsseg6e8_v_i8mf2_m(__VA_ARGS__)
+#define vsseg7e8_v_i8mf2_m(...) __riscv_vsseg7e8_v_i8mf2_m(__VA_ARGS__)
+#define vsseg8e8_v_i8mf2_m(...) __riscv_vsseg8e8_v_i8mf2_m(__VA_ARGS__)
+#define vsseg2e8_v_i8m1_m(...) __riscv_vsseg2e8_v_i8m1_m(__VA_ARGS__)
+#define vsseg3e8_v_i8m1_m(...) __riscv_vsseg3e8_v_i8m1_m(__VA_ARGS__)
+#define vsseg4e8_v_i8m1_m(...) __riscv_vsseg4e8_v_i8m1_m(__VA_ARGS__)
+#define vsseg5e8_v_i8m1_m(...) __riscv_vsseg5e8_v_i8m1_m(__VA_ARGS__)
+#define vsseg6e8_v_i8m1_m(...) __riscv_vsseg6e8_v_i8m1_m(__VA_ARGS__)
+#define vsseg7e8_v_i8m1_m(...) __riscv_vsseg7e8_v_i8m1_m(__VA_ARGS__)
+#define vsseg8e8_v_i8m1_m(...) __riscv_vsseg8e8_v_i8m1_m(__VA_ARGS__)
+#define vsseg2e8_v_i8m2_m(...) __riscv_vsseg2e8_v_i8m2_m(__VA_ARGS__)
+#define vsseg3e8_v_i8m2_m(...) __riscv_vsseg3e8_v_i8m2_m(__VA_ARGS__)
+#define vsseg4e8_v_i8m2_m(...) __riscv_vsseg4e8_v_i8m2_m(__VA_ARGS__)
+#define vsseg2e8_v_i8m4_m(...) __riscv_vsseg2e8_v_i8m4_m(__VA_ARGS__)
+#define vsseg2e16_v_i16mf4_m(...) __riscv_vsseg2e16_v_i16mf4_m(__VA_ARGS__)
+#define vsseg3e16_v_i16mf4_m(...) __riscv_vsseg3e16_v_i16mf4_m(__VA_ARGS__)
+#define vsseg4e16_v_i16mf4_m(...) __riscv_vsseg4e16_v_i16mf4_m(__VA_ARGS__)
+#define vsseg5e16_v_i16mf4_m(...) __riscv_vsseg5e16_v_i16mf4_m(__VA_ARGS__)
+#define vsseg6e16_v_i16mf4_m(...) __riscv_vsseg6e16_v_i16mf4_m(__VA_ARGS__)
+#define vsseg7e16_v_i16mf4_m(...) __riscv_vsseg7e16_v_i16mf4_m(__VA_ARGS__)
+#define vsseg8e16_v_i16mf4_m(...) __riscv_vsseg8e16_v_i16mf4_m(__VA_ARGS__)
+#define vsseg2e16_v_i16mf2_m(...) __riscv_vsseg2e16_v_i16mf2_m(__VA_ARGS__)
+#define vsseg3e16_v_i16mf2_m(...) __riscv_vsseg3e16_v_i16mf2_m(__VA_ARGS__)
+#define vsseg4e16_v_i16mf2_m(...) __riscv_vsseg4e16_v_i16mf2_m(__VA_ARGS__)
+#define vsseg5e16_v_i16mf2_m(...) __riscv_vsseg5e16_v_i16mf2_m(__VA_ARGS__)
+#define vsseg6e16_v_i16mf2_m(...) __riscv_vsseg6e16_v_i16mf2_m(__VA_ARGS__)
+#define vsseg7e16_v_i16mf2_m(...) __riscv_vsseg7e16_v_i16mf2_m(__VA_ARGS__)
+#define vsseg8e16_v_i16mf2_m(...) __riscv_vsseg8e16_v_i16mf2_m(__VA_ARGS__)
+#define vsseg2e16_v_i16m1_m(...) __riscv_vsseg2e16_v_i16m1_m(__VA_ARGS__)
+#define vsseg3e16_v_i16m1_m(...) __riscv_vsseg3e16_v_i16m1_m(__VA_ARGS__)
+#define vsseg4e16_v_i16m1_m(...) __riscv_vsseg4e16_v_i16m1_m(__VA_ARGS__)
+#define vsseg5e16_v_i16m1_m(...) __riscv_vsseg5e16_v_i16m1_m(__VA_ARGS__)
+#define vsseg6e16_v_i16m1_m(...) __riscv_vsseg6e16_v_i16m1_m(__VA_ARGS__)
+#define vsseg7e16_v_i16m1_m(...) __riscv_vsseg7e16_v_i16m1_m(__VA_ARGS__)
+#define vsseg8e16_v_i16m1_m(...) __riscv_vsseg8e16_v_i16m1_m(__VA_ARGS__)
+#define vsseg2e16_v_i16m2_m(...) __riscv_vsseg2e16_v_i16m2_m(__VA_ARGS__)
+#define vsseg3e16_v_i16m2_m(...) __riscv_vsseg3e16_v_i16m2_m(__VA_ARGS__)
+#define vsseg4e16_v_i16m2_m(...) __riscv_vsseg4e16_v_i16m2_m(__VA_ARGS__)
+#define vsseg2e16_v_i16m4_m(...) __riscv_vsseg2e16_v_i16m4_m(__VA_ARGS__)
+#define vsseg2e32_v_i32mf2_m(...) __riscv_vsseg2e32_v_i32mf2_m(__VA_ARGS__)
+#define vsseg3e32_v_i32mf2_m(...) __riscv_vsseg3e32_v_i32mf2_m(__VA_ARGS__)
+#define vsseg4e32_v_i32mf2_m(...) __riscv_vsseg4e32_v_i32mf2_m(__VA_ARGS__)
+#define vsseg5e32_v_i32mf2_m(...) __riscv_vsseg5e32_v_i32mf2_m(__VA_ARGS__)
+#define vsseg6e32_v_i32mf2_m(...) __riscv_vsseg6e32_v_i32mf2_m(__VA_ARGS__)
+#define vsseg7e32_v_i32mf2_m(...) __riscv_vsseg7e32_v_i32mf2_m(__VA_ARGS__)
+#define vsseg8e32_v_i32mf2_m(...) __riscv_vsseg8e32_v_i32mf2_m(__VA_ARGS__)
+#define vsseg2e32_v_i32m1_m(...) __riscv_vsseg2e32_v_i32m1_m(__VA_ARGS__)
+#define vsseg3e32_v_i32m1_m(...) __riscv_vsseg3e32_v_i32m1_m(__VA_ARGS__)
+#define vsseg4e32_v_i32m1_m(...) __riscv_vsseg4e32_v_i32m1_m(__VA_ARGS__)
+#define vsseg5e32_v_i32m1_m(...) __riscv_vsseg5e32_v_i32m1_m(__VA_ARGS__)
+#define vsseg6e32_v_i32m1_m(...) __riscv_vsseg6e32_v_i32m1_m(__VA_ARGS__)
+#define vsseg7e32_v_i32m1_m(...) __riscv_vsseg7e32_v_i32m1_m(__VA_ARGS__)
+#define vsseg8e32_v_i32m1_m(...) __riscv_vsseg8e32_v_i32m1_m(__VA_ARGS__)
+#define vsseg2e32_v_i32m2_m(...) __riscv_vsseg2e32_v_i32m2_m(__VA_ARGS__)
+#define vsseg3e32_v_i32m2_m(...) __riscv_vsseg3e32_v_i32m2_m(__VA_ARGS__)
+#define vsseg4e32_v_i32m2_m(...) __riscv_vsseg4e32_v_i32m2_m(__VA_ARGS__)
+#define vsseg2e32_v_i32m4_m(...) __riscv_vsseg2e32_v_i32m4_m(__VA_ARGS__)
+#define vsseg2e64_v_i64m1_m(...) __riscv_vsseg2e64_v_i64m1_m(__VA_ARGS__)
+#define vsseg3e64_v_i64m1_m(...) __riscv_vsseg3e64_v_i64m1_m(__VA_ARGS__)
+#define vsseg4e64_v_i64m1_m(...) __riscv_vsseg4e64_v_i64m1_m(__VA_ARGS__)
+#define vsseg5e64_v_i64m1_m(...) __riscv_vsseg5e64_v_i64m1_m(__VA_ARGS__)
+#define vsseg6e64_v_i64m1_m(...) __riscv_vsseg6e64_v_i64m1_m(__VA_ARGS__)
+#define vsseg7e64_v_i64m1_m(...) __riscv_vsseg7e64_v_i64m1_m(__VA_ARGS__)
+#define vsseg8e64_v_i64m1_m(...) __riscv_vsseg8e64_v_i64m1_m(__VA_ARGS__)
+#define vsseg2e64_v_i64m2_m(...) __riscv_vsseg2e64_v_i64m2_m(__VA_ARGS__)
+#define vsseg3e64_v_i64m2_m(...) __riscv_vsseg3e64_v_i64m2_m(__VA_ARGS__)
+#define vsseg4e64_v_i64m2_m(...) __riscv_vsseg4e64_v_i64m2_m(__VA_ARGS__)
+#define vsseg2e64_v_i64m4_m(...) __riscv_vsseg2e64_v_i64m4_m(__VA_ARGS__)
+#define vsseg2e8_v_u8mf8_m(...) __riscv_vsseg2e8_v_u8mf8_m(__VA_ARGS__)
+#define vsseg3e8_v_u8mf8_m(...) __riscv_vsseg3e8_v_u8mf8_m(__VA_ARGS__)
+#define vsseg4e8_v_u8mf8_m(...) __riscv_vsseg4e8_v_u8mf8_m(__VA_ARGS__)
+#define vsseg5e8_v_u8mf8_m(...) __riscv_vsseg5e8_v_u8mf8_m(__VA_ARGS__)
+#define vsseg6e8_v_u8mf8_m(...) __riscv_vsseg6e8_v_u8mf8_m(__VA_ARGS__)
+#define vsseg7e8_v_u8mf8_m(...) __riscv_vsseg7e8_v_u8mf8_m(__VA_ARGS__)
+#define vsseg8e8_v_u8mf8_m(...) __riscv_vsseg8e8_v_u8mf8_m(__VA_ARGS__)
+#define vsseg2e8_v_u8mf4_m(...) __riscv_vsseg2e8_v_u8mf4_m(__VA_ARGS__)
+#define vsseg3e8_v_u8mf4_m(...) __riscv_vsseg3e8_v_u8mf4_m(__VA_ARGS__)
+#define vsseg4e8_v_u8mf4_m(...) __riscv_vsseg4e8_v_u8mf4_m(__VA_ARGS__)
+#define vsseg5e8_v_u8mf4_m(...) __riscv_vsseg5e8_v_u8mf4_m(__VA_ARGS__)
+#define vsseg6e8_v_u8mf4_m(...) __riscv_vsseg6e8_v_u8mf4_m(__VA_ARGS__)
+#define vsseg7e8_v_u8mf4_m(...) __riscv_vsseg7e8_v_u8mf4_m(__VA_ARGS__)
+#define vsseg8e8_v_u8mf4_m(...) __riscv_vsseg8e8_v_u8mf4_m(__VA_ARGS__)
+#define vsseg2e8_v_u8mf2_m(...) __riscv_vsseg2e8_v_u8mf2_m(__VA_ARGS__)
+#define vsseg3e8_v_u8mf2_m(...) __riscv_vsseg3e8_v_u8mf2_m(__VA_ARGS__)
+#define vsseg4e8_v_u8mf2_m(...) __riscv_vsseg4e8_v_u8mf2_m(__VA_ARGS__)
+#define vsseg5e8_v_u8mf2_m(...) __riscv_vsseg5e8_v_u8mf2_m(__VA_ARGS__)
+#define vsseg6e8_v_u8mf2_m(...) __riscv_vsseg6e8_v_u8mf2_m(__VA_ARGS__)
+#define vsseg7e8_v_u8mf2_m(...) __riscv_vsseg7e8_v_u8mf2_m(__VA_ARGS__)
+#define vsseg8e8_v_u8mf2_m(...) __riscv_vsseg8e8_v_u8mf2_m(__VA_ARGS__)
+#define vsseg2e8_v_u8m1_m(...) __riscv_vsseg2e8_v_u8m1_m(__VA_ARGS__)
+#define vsseg3e8_v_u8m1_m(...) __riscv_vsseg3e8_v_u8m1_m(__VA_ARGS__)
+#define vsseg4e8_v_u8m1_m(...) __riscv_vsseg4e8_v_u8m1_m(__VA_ARGS__)
+#define vsseg5e8_v_u8m1_m(...) __riscv_vsseg5e8_v_u8m1_m(__VA_ARGS__)
+#define vsseg6e8_v_u8m1_m(...) __riscv_vsseg6e8_v_u8m1_m(__VA_ARGS__)
+#define vsseg7e8_v_u8m1_m(...) __riscv_vsseg7e8_v_u8m1_m(__VA_ARGS__)
+#define vsseg8e8_v_u8m1_m(...) __riscv_vsseg8e8_v_u8m1_m(__VA_ARGS__)
+#define vsseg2e8_v_u8m2_m(...) __riscv_vsseg2e8_v_u8m2_m(__VA_ARGS__)
+#define vsseg3e8_v_u8m2_m(...) __riscv_vsseg3e8_v_u8m2_m(__VA_ARGS__)
+#define vsseg4e8_v_u8m2_m(...) __riscv_vsseg4e8_v_u8m2_m(__VA_ARGS__)
+#define vsseg2e8_v_u8m4_m(...) __riscv_vsseg2e8_v_u8m4_m(__VA_ARGS__)
+#define vsseg2e16_v_u16mf4_m(...) __riscv_vsseg2e16_v_u16mf4_m(__VA_ARGS__)
+#define vsseg3e16_v_u16mf4_m(...) __riscv_vsseg3e16_v_u16mf4_m(__VA_ARGS__)
+#define vsseg4e16_v_u16mf4_m(...) __riscv_vsseg4e16_v_u16mf4_m(__VA_ARGS__)
+#define vsseg5e16_v_u16mf4_m(...) __riscv_vsseg5e16_v_u16mf4_m(__VA_ARGS__)
+#define vsseg6e16_v_u16mf4_m(...) __riscv_vsseg6e16_v_u16mf4_m(__VA_ARGS__)
+#define vsseg7e16_v_u16mf4_m(...) __riscv_vsseg7e16_v_u16mf4_m(__VA_ARGS__)
+#define vsseg8e16_v_u16mf4_m(...) __riscv_vsseg8e16_v_u16mf4_m(__VA_ARGS__)
+#define vsseg2e16_v_u16mf2_m(...) __riscv_vsseg2e16_v_u16mf2_m(__VA_ARGS__)
+#define vsseg3e16_v_u16mf2_m(...) __riscv_vsseg3e16_v_u16mf2_m(__VA_ARGS__)
+#define vsseg4e16_v_u16mf2_m(...) __riscv_vsseg4e16_v_u16mf2_m(__VA_ARGS__)
+#define vsseg5e16_v_u16mf2_m(...) __riscv_vsseg5e16_v_u16mf2_m(__VA_ARGS__)
+#define vsseg6e16_v_u16mf2_m(...) __riscv_vsseg6e16_v_u16mf2_m(__VA_ARGS__)
+#define vsseg7e16_v_u16mf2_m(...) __riscv_vsseg7e16_v_u16mf2_m(__VA_ARGS__)
+#define vsseg8e16_v_u16mf2_m(...) __riscv_vsseg8e16_v_u16mf2_m(__VA_ARGS__)
+#define vsseg2e16_v_u16m1_m(...) __riscv_vsseg2e16_v_u16m1_m(__VA_ARGS__)
+#define vsseg3e16_v_u16m1_m(...) __riscv_vsseg3e16_v_u16m1_m(__VA_ARGS__)
+#define vsseg4e16_v_u16m1_m(...) __riscv_vsseg4e16_v_u16m1_m(__VA_ARGS__)
+#define vsseg5e16_v_u16m1_m(...) __riscv_vsseg5e16_v_u16m1_m(__VA_ARGS__)
+#define vsseg6e16_v_u16m1_m(...) __riscv_vsseg6e16_v_u16m1_m(__VA_ARGS__)
+#define vsseg7e16_v_u16m1_m(...) __riscv_vsseg7e16_v_u16m1_m(__VA_ARGS__)
+#define vsseg8e16_v_u16m1_m(...) __riscv_vsseg8e16_v_u16m1_m(__VA_ARGS__)
+#define vsseg2e16_v_u16m2_m(...) __riscv_vsseg2e16_v_u16m2_m(__VA_ARGS__)
+#define vsseg3e16_v_u16m2_m(...) __riscv_vsseg3e16_v_u16m2_m(__VA_ARGS__)
+#define vsseg4e16_v_u16m2_m(...) __riscv_vsseg4e16_v_u16m2_m(__VA_ARGS__)
+#define vsseg2e16_v_u16m4_m(...) __riscv_vsseg2e16_v_u16m4_m(__VA_ARGS__)
+#define vsseg2e32_v_u32mf2_m(...) __riscv_vsseg2e32_v_u32mf2_m(__VA_ARGS__)
+#define vsseg3e32_v_u32mf2_m(...) __riscv_vsseg3e32_v_u32mf2_m(__VA_ARGS__)
+#define vsseg4e32_v_u32mf2_m(...) __riscv_vsseg4e32_v_u32mf2_m(__VA_ARGS__)
+#define vsseg5e32_v_u32mf2_m(...) __riscv_vsseg5e32_v_u32mf2_m(__VA_ARGS__)
+#define vsseg6e32_v_u32mf2_m(...) __riscv_vsseg6e32_v_u32mf2_m(__VA_ARGS__)
+#define vsseg7e32_v_u32mf2_m(...) __riscv_vsseg7e32_v_u32mf2_m(__VA_ARGS__)
+#define vsseg8e32_v_u32mf2_m(...) __riscv_vsseg8e32_v_u32mf2_m(__VA_ARGS__)
+#define vsseg2e32_v_u32m1_m(...) __riscv_vsseg2e32_v_u32m1_m(__VA_ARGS__)
+#define vsseg3e32_v_u32m1_m(...) __riscv_vsseg3e32_v_u32m1_m(__VA_ARGS__)
+#define vsseg4e32_v_u32m1_m(...) __riscv_vsseg4e32_v_u32m1_m(__VA_ARGS__)
+#define vsseg5e32_v_u32m1_m(...) __riscv_vsseg5e32_v_u32m1_m(__VA_ARGS__)
+#define vsseg6e32_v_u32m1_m(...) __riscv_vsseg6e32_v_u32m1_m(__VA_ARGS__)
+#define vsseg7e32_v_u32m1_m(...) __riscv_vsseg7e32_v_u32m1_m(__VA_ARGS__)
+#define vsseg8e32_v_u32m1_m(...) __riscv_vsseg8e32_v_u32m1_m(__VA_ARGS__)
+#define vsseg2e32_v_u32m2_m(...) __riscv_vsseg2e32_v_u32m2_m(__VA_ARGS__)
+#define vsseg3e32_v_u32m2_m(...) __riscv_vsseg3e32_v_u32m2_m(__VA_ARGS__)
+#define vsseg4e32_v_u32m2_m(...) __riscv_vsseg4e32_v_u32m2_m(__VA_ARGS__)
+#define vsseg2e32_v_u32m4_m(...) __riscv_vsseg2e32_v_u32m4_m(__VA_ARGS__)
+#define vsseg2e64_v_u64m1_m(...) __riscv_vsseg2e64_v_u64m1_m(__VA_ARGS__)
+#define vsseg3e64_v_u64m1_m(...) __riscv_vsseg3e64_v_u64m1_m(__VA_ARGS__)
+#define vsseg4e64_v_u64m1_m(...) __riscv_vsseg4e64_v_u64m1_m(__VA_ARGS__)
+#define vsseg5e64_v_u64m1_m(...) __riscv_vsseg5e64_v_u64m1_m(__VA_ARGS__)
+#define vsseg6e64_v_u64m1_m(...) __riscv_vsseg6e64_v_u64m1_m(__VA_ARGS__)
+#define vsseg7e64_v_u64m1_m(...) __riscv_vsseg7e64_v_u64m1_m(__VA_ARGS__)
+#define vsseg8e64_v_u64m1_m(...) __riscv_vsseg8e64_v_u64m1_m(__VA_ARGS__)
+#define vsseg2e64_v_u64m2_m(...) __riscv_vsseg2e64_v_u64m2_m(__VA_ARGS__)
+#define vsseg3e64_v_u64m2_m(...) __riscv_vsseg3e64_v_u64m2_m(__VA_ARGS__)
+#define vsseg4e64_v_u64m2_m(...) __riscv_vsseg4e64_v_u64m2_m(__VA_ARGS__)
+#define vsseg2e64_v_u64m4_m(...) __riscv_vsseg2e64_v_u64m4_m(__VA_ARGS__)
+#define vlsseg2e16_v_f16mf4(...) __riscv_vlsseg2e16_v_f16mf4(__VA_ARGS__) // vlsseg* aliases without a mask suffix: each un-prefixed name forwards all arguments unchanged to the same-named __riscv_ intrinsic
+#define vlsseg3e16_v_f16mf4(...) __riscv_vlsseg3e16_v_f16mf4(__VA_ARGS__)
+#define vlsseg4e16_v_f16mf4(...) __riscv_vlsseg4e16_v_f16mf4(__VA_ARGS__)
+#define vlsseg5e16_v_f16mf4(...) __riscv_vlsseg5e16_v_f16mf4(__VA_ARGS__)
+#define vlsseg6e16_v_f16mf4(...) __riscv_vlsseg6e16_v_f16mf4(__VA_ARGS__)
+#define vlsseg7e16_v_f16mf4(...) __riscv_vlsseg7e16_v_f16mf4(__VA_ARGS__)
+#define vlsseg8e16_v_f16mf4(...) __riscv_vlsseg8e16_v_f16mf4(__VA_ARGS__)
+#define vlsseg2e16_v_f16mf2(...) __riscv_vlsseg2e16_v_f16mf2(__VA_ARGS__)
+#define vlsseg3e16_v_f16mf2(...) __riscv_vlsseg3e16_v_f16mf2(__VA_ARGS__)
+#define vlsseg4e16_v_f16mf2(...) __riscv_vlsseg4e16_v_f16mf2(__VA_ARGS__)
+#define vlsseg5e16_v_f16mf2(...) __riscv_vlsseg5e16_v_f16mf2(__VA_ARGS__)
+#define vlsseg6e16_v_f16mf2(...) __riscv_vlsseg6e16_v_f16mf2(__VA_ARGS__)
+#define vlsseg7e16_v_f16mf2(...) __riscv_vlsseg7e16_v_f16mf2(__VA_ARGS__)
+#define vlsseg8e16_v_f16mf2(...) __riscv_vlsseg8e16_v_f16mf2(__VA_ARGS__)
+#define vlsseg2e16_v_f16m1(...) __riscv_vlsseg2e16_v_f16m1(__VA_ARGS__)
+#define vlsseg3e16_v_f16m1(...) __riscv_vlsseg3e16_v_f16m1(__VA_ARGS__)
+#define vlsseg4e16_v_f16m1(...) __riscv_vlsseg4e16_v_f16m1(__VA_ARGS__)
+#define vlsseg5e16_v_f16m1(...) __riscv_vlsseg5e16_v_f16m1(__VA_ARGS__)
+#define vlsseg6e16_v_f16m1(...) __riscv_vlsseg6e16_v_f16m1(__VA_ARGS__)
+#define vlsseg7e16_v_f16m1(...) __riscv_vlsseg7e16_v_f16m1(__VA_ARGS__)
+#define vlsseg8e16_v_f16m1(...) __riscv_vlsseg8e16_v_f16m1(__VA_ARGS__)
+#define vlsseg2e16_v_f16m2(...) __riscv_vlsseg2e16_v_f16m2(__VA_ARGS__)
+#define vlsseg3e16_v_f16m2(...) __riscv_vlsseg3e16_v_f16m2(__VA_ARGS__)
+#define vlsseg4e16_v_f16m2(...) __riscv_vlsseg4e16_v_f16m2(__VA_ARGS__)
+#define vlsseg2e16_v_f16m4(...) __riscv_vlsseg2e16_v_f16m4(__VA_ARGS__)
+#define vlsseg2e32_v_f32mf2(...) __riscv_vlsseg2e32_v_f32mf2(__VA_ARGS__)
+#define vlsseg3e32_v_f32mf2(...) __riscv_vlsseg3e32_v_f32mf2(__VA_ARGS__)
+#define vlsseg4e32_v_f32mf2(...) __riscv_vlsseg4e32_v_f32mf2(__VA_ARGS__)
+#define vlsseg5e32_v_f32mf2(...) __riscv_vlsseg5e32_v_f32mf2(__VA_ARGS__)
+#define vlsseg6e32_v_f32mf2(...) __riscv_vlsseg6e32_v_f32mf2(__VA_ARGS__)
+#define vlsseg7e32_v_f32mf2(...) __riscv_vlsseg7e32_v_f32mf2(__VA_ARGS__)
+#define vlsseg8e32_v_f32mf2(...) __riscv_vlsseg8e32_v_f32mf2(__VA_ARGS__)
+#define vlsseg2e32_v_f32m1(...) __riscv_vlsseg2e32_v_f32m1(__VA_ARGS__)
+#define vlsseg3e32_v_f32m1(...) __riscv_vlsseg3e32_v_f32m1(__VA_ARGS__)
+#define vlsseg4e32_v_f32m1(...) __riscv_vlsseg4e32_v_f32m1(__VA_ARGS__)
+#define vlsseg5e32_v_f32m1(...) __riscv_vlsseg5e32_v_f32m1(__VA_ARGS__)
+#define vlsseg6e32_v_f32m1(...) __riscv_vlsseg6e32_v_f32m1(__VA_ARGS__)
+#define vlsseg7e32_v_f32m1(...) __riscv_vlsseg7e32_v_f32m1(__VA_ARGS__)
+#define vlsseg8e32_v_f32m1(...) __riscv_vlsseg8e32_v_f32m1(__VA_ARGS__)
+#define vlsseg2e32_v_f32m2(...) __riscv_vlsseg2e32_v_f32m2(__VA_ARGS__)
+#define vlsseg3e32_v_f32m2(...) __riscv_vlsseg3e32_v_f32m2(__VA_ARGS__)
+#define vlsseg4e32_v_f32m2(...) __riscv_vlsseg4e32_v_f32m2(__VA_ARGS__)
+#define vlsseg2e32_v_f32m4(...) __riscv_vlsseg2e32_v_f32m4(__VA_ARGS__)
+#define vlsseg2e64_v_f64m1(...) __riscv_vlsseg2e64_v_f64m1(__VA_ARGS__)
+#define vlsseg3e64_v_f64m1(...) __riscv_vlsseg3e64_v_f64m1(__VA_ARGS__)
+#define vlsseg4e64_v_f64m1(...) __riscv_vlsseg4e64_v_f64m1(__VA_ARGS__)
+#define vlsseg5e64_v_f64m1(...) __riscv_vlsseg5e64_v_f64m1(__VA_ARGS__)
+#define vlsseg6e64_v_f64m1(...) __riscv_vlsseg6e64_v_f64m1(__VA_ARGS__)
+#define vlsseg7e64_v_f64m1(...) __riscv_vlsseg7e64_v_f64m1(__VA_ARGS__)
+#define vlsseg8e64_v_f64m1(...) __riscv_vlsseg8e64_v_f64m1(__VA_ARGS__)
+#define vlsseg2e64_v_f64m2(...) __riscv_vlsseg2e64_v_f64m2(__VA_ARGS__)
+#define vlsseg3e64_v_f64m2(...) __riscv_vlsseg3e64_v_f64m2(__VA_ARGS__)
+#define vlsseg4e64_v_f64m2(...) __riscv_vlsseg4e64_v_f64m2(__VA_ARGS__)
+#define vlsseg2e64_v_f64m4(...) __riscv_vlsseg2e64_v_f64m4(__VA_ARGS__)
+#define vlsseg2e8_v_i8mf8(...) __riscv_vlsseg2e8_v_i8mf8(__VA_ARGS__)
+#define vlsseg3e8_v_i8mf8(...) __riscv_vlsseg3e8_v_i8mf8(__VA_ARGS__)
+#define vlsseg4e8_v_i8mf8(...) __riscv_vlsseg4e8_v_i8mf8(__VA_ARGS__)
+#define vlsseg5e8_v_i8mf8(...) __riscv_vlsseg5e8_v_i8mf8(__VA_ARGS__)
+#define vlsseg6e8_v_i8mf8(...) __riscv_vlsseg6e8_v_i8mf8(__VA_ARGS__)
+#define vlsseg7e8_v_i8mf8(...) __riscv_vlsseg7e8_v_i8mf8(__VA_ARGS__)
+#define vlsseg8e8_v_i8mf8(...) __riscv_vlsseg8e8_v_i8mf8(__VA_ARGS__)
+#define vlsseg2e8_v_i8mf4(...) __riscv_vlsseg2e8_v_i8mf4(__VA_ARGS__)
+#define vlsseg3e8_v_i8mf4(...) __riscv_vlsseg3e8_v_i8mf4(__VA_ARGS__)
+#define vlsseg4e8_v_i8mf4(...) __riscv_vlsseg4e8_v_i8mf4(__VA_ARGS__)
+#define vlsseg5e8_v_i8mf4(...) __riscv_vlsseg5e8_v_i8mf4(__VA_ARGS__)
+#define vlsseg6e8_v_i8mf4(...) __riscv_vlsseg6e8_v_i8mf4(__VA_ARGS__)
+#define vlsseg7e8_v_i8mf4(...) __riscv_vlsseg7e8_v_i8mf4(__VA_ARGS__)
+#define vlsseg8e8_v_i8mf4(...) __riscv_vlsseg8e8_v_i8mf4(__VA_ARGS__)
+#define vlsseg2e8_v_i8mf2(...) __riscv_vlsseg2e8_v_i8mf2(__VA_ARGS__)
+#define vlsseg3e8_v_i8mf2(...) __riscv_vlsseg3e8_v_i8mf2(__VA_ARGS__)
+#define vlsseg4e8_v_i8mf2(...) __riscv_vlsseg4e8_v_i8mf2(__VA_ARGS__)
+#define vlsseg5e8_v_i8mf2(...) __riscv_vlsseg5e8_v_i8mf2(__VA_ARGS__)
+#define vlsseg6e8_v_i8mf2(...) __riscv_vlsseg6e8_v_i8mf2(__VA_ARGS__)
+#define vlsseg7e8_v_i8mf2(...) __riscv_vlsseg7e8_v_i8mf2(__VA_ARGS__)
+#define vlsseg8e8_v_i8mf2(...) __riscv_vlsseg8e8_v_i8mf2(__VA_ARGS__)
+#define vlsseg2e8_v_i8m1(...) __riscv_vlsseg2e8_v_i8m1(__VA_ARGS__)
+#define vlsseg3e8_v_i8m1(...) __riscv_vlsseg3e8_v_i8m1(__VA_ARGS__)
+#define vlsseg4e8_v_i8m1(...) __riscv_vlsseg4e8_v_i8m1(__VA_ARGS__)
+#define vlsseg5e8_v_i8m1(...) __riscv_vlsseg5e8_v_i8m1(__VA_ARGS__)
+#define vlsseg6e8_v_i8m1(...) __riscv_vlsseg6e8_v_i8m1(__VA_ARGS__)
+#define vlsseg7e8_v_i8m1(...) __riscv_vlsseg7e8_v_i8m1(__VA_ARGS__)
+#define vlsseg8e8_v_i8m1(...) __riscv_vlsseg8e8_v_i8m1(__VA_ARGS__)
+#define vlsseg2e8_v_i8m2(...) __riscv_vlsseg2e8_v_i8m2(__VA_ARGS__)
+#define vlsseg3e8_v_i8m2(...) __riscv_vlsseg3e8_v_i8m2(__VA_ARGS__)
+#define vlsseg4e8_v_i8m2(...) __riscv_vlsseg4e8_v_i8m2(__VA_ARGS__)
+#define vlsseg2e8_v_i8m4(...) __riscv_vlsseg2e8_v_i8m4(__VA_ARGS__)
+#define vlsseg2e16_v_i16mf4(...) __riscv_vlsseg2e16_v_i16mf4(__VA_ARGS__)
+#define vlsseg3e16_v_i16mf4(...) __riscv_vlsseg3e16_v_i16mf4(__VA_ARGS__)
+#define vlsseg4e16_v_i16mf4(...) __riscv_vlsseg4e16_v_i16mf4(__VA_ARGS__)
+#define vlsseg5e16_v_i16mf4(...) __riscv_vlsseg5e16_v_i16mf4(__VA_ARGS__)
+#define vlsseg6e16_v_i16mf4(...) __riscv_vlsseg6e16_v_i16mf4(__VA_ARGS__)
+#define vlsseg7e16_v_i16mf4(...) __riscv_vlsseg7e16_v_i16mf4(__VA_ARGS__)
+#define vlsseg8e16_v_i16mf4(...) __riscv_vlsseg8e16_v_i16mf4(__VA_ARGS__)
+#define vlsseg2e16_v_i16mf2(...) __riscv_vlsseg2e16_v_i16mf2(__VA_ARGS__)
+#define vlsseg3e16_v_i16mf2(...) __riscv_vlsseg3e16_v_i16mf2(__VA_ARGS__)
+#define vlsseg4e16_v_i16mf2(...) __riscv_vlsseg4e16_v_i16mf2(__VA_ARGS__)
+#define vlsseg5e16_v_i16mf2(...) __riscv_vlsseg5e16_v_i16mf2(__VA_ARGS__)
+#define vlsseg6e16_v_i16mf2(...) __riscv_vlsseg6e16_v_i16mf2(__VA_ARGS__)
+#define vlsseg7e16_v_i16mf2(...) __riscv_vlsseg7e16_v_i16mf2(__VA_ARGS__)
+#define vlsseg8e16_v_i16mf2(...) __riscv_vlsseg8e16_v_i16mf2(__VA_ARGS__)
+#define vlsseg2e16_v_i16m1(...) __riscv_vlsseg2e16_v_i16m1(__VA_ARGS__)
+#define vlsseg3e16_v_i16m1(...) __riscv_vlsseg3e16_v_i16m1(__VA_ARGS__)
+#define vlsseg4e16_v_i16m1(...) __riscv_vlsseg4e16_v_i16m1(__VA_ARGS__)
+#define vlsseg5e16_v_i16m1(...) __riscv_vlsseg5e16_v_i16m1(__VA_ARGS__)
+#define vlsseg6e16_v_i16m1(...) __riscv_vlsseg6e16_v_i16m1(__VA_ARGS__)
+#define vlsseg7e16_v_i16m1(...) __riscv_vlsseg7e16_v_i16m1(__VA_ARGS__)
+#define vlsseg8e16_v_i16m1(...) __riscv_vlsseg8e16_v_i16m1(__VA_ARGS__)
+#define vlsseg2e16_v_i16m2(...) __riscv_vlsseg2e16_v_i16m2(__VA_ARGS__)
+#define vlsseg3e16_v_i16m2(...) __riscv_vlsseg3e16_v_i16m2(__VA_ARGS__)
+#define vlsseg4e16_v_i16m2(...) __riscv_vlsseg4e16_v_i16m2(__VA_ARGS__)
+#define vlsseg2e16_v_i16m4(...) __riscv_vlsseg2e16_v_i16m4(__VA_ARGS__)
+#define vlsseg2e32_v_i32mf2(...) __riscv_vlsseg2e32_v_i32mf2(__VA_ARGS__)
+#define vlsseg3e32_v_i32mf2(...) __riscv_vlsseg3e32_v_i32mf2(__VA_ARGS__)
+#define vlsseg4e32_v_i32mf2(...) __riscv_vlsseg4e32_v_i32mf2(__VA_ARGS__)
+#define vlsseg5e32_v_i32mf2(...) __riscv_vlsseg5e32_v_i32mf2(__VA_ARGS__)
+#define vlsseg6e32_v_i32mf2(...) __riscv_vlsseg6e32_v_i32mf2(__VA_ARGS__)
+#define vlsseg7e32_v_i32mf2(...) __riscv_vlsseg7e32_v_i32mf2(__VA_ARGS__)
+#define vlsseg8e32_v_i32mf2(...) __riscv_vlsseg8e32_v_i32mf2(__VA_ARGS__)
+#define vlsseg2e32_v_i32m1(...) __riscv_vlsseg2e32_v_i32m1(__VA_ARGS__)
+#define vlsseg3e32_v_i32m1(...) __riscv_vlsseg3e32_v_i32m1(__VA_ARGS__)
+#define vlsseg4e32_v_i32m1(...) __riscv_vlsseg4e32_v_i32m1(__VA_ARGS__)
+#define vlsseg5e32_v_i32m1(...) __riscv_vlsseg5e32_v_i32m1(__VA_ARGS__)
+#define vlsseg6e32_v_i32m1(...) __riscv_vlsseg6e32_v_i32m1(__VA_ARGS__)
+#define vlsseg7e32_v_i32m1(...) __riscv_vlsseg7e32_v_i32m1(__VA_ARGS__)
+#define vlsseg8e32_v_i32m1(...) __riscv_vlsseg8e32_v_i32m1(__VA_ARGS__)
+#define vlsseg2e32_v_i32m2(...) __riscv_vlsseg2e32_v_i32m2(__VA_ARGS__)
+#define vlsseg3e32_v_i32m2(...) __riscv_vlsseg3e32_v_i32m2(__VA_ARGS__)
+#define vlsseg4e32_v_i32m2(...) __riscv_vlsseg4e32_v_i32m2(__VA_ARGS__)
+#define vlsseg2e32_v_i32m4(...) __riscv_vlsseg2e32_v_i32m4(__VA_ARGS__)
+#define vlsseg2e64_v_i64m1(...) __riscv_vlsseg2e64_v_i64m1(__VA_ARGS__)
+#define vlsseg3e64_v_i64m1(...) __riscv_vlsseg3e64_v_i64m1(__VA_ARGS__)
+#define vlsseg4e64_v_i64m1(...) __riscv_vlsseg4e64_v_i64m1(__VA_ARGS__)
+#define vlsseg5e64_v_i64m1(...) __riscv_vlsseg5e64_v_i64m1(__VA_ARGS__)
+#define vlsseg6e64_v_i64m1(...) __riscv_vlsseg6e64_v_i64m1(__VA_ARGS__)
+#define vlsseg7e64_v_i64m1(...) __riscv_vlsseg7e64_v_i64m1(__VA_ARGS__)
+#define vlsseg8e64_v_i64m1(...) __riscv_vlsseg8e64_v_i64m1(__VA_ARGS__)
+#define vlsseg2e64_v_i64m2(...) __riscv_vlsseg2e64_v_i64m2(__VA_ARGS__)
+#define vlsseg3e64_v_i64m2(...) __riscv_vlsseg3e64_v_i64m2(__VA_ARGS__)
+#define vlsseg4e64_v_i64m2(...) __riscv_vlsseg4e64_v_i64m2(__VA_ARGS__)
+#define vlsseg2e64_v_i64m4(...) __riscv_vlsseg2e64_v_i64m4(__VA_ARGS__)
+#define vlsseg2e8_v_u8mf8(...) __riscv_vlsseg2e8_v_u8mf8(__VA_ARGS__)
+#define vlsseg3e8_v_u8mf8(...) __riscv_vlsseg3e8_v_u8mf8(__VA_ARGS__)
+#define vlsseg4e8_v_u8mf8(...) __riscv_vlsseg4e8_v_u8mf8(__VA_ARGS__)
+#define vlsseg5e8_v_u8mf8(...) __riscv_vlsseg5e8_v_u8mf8(__VA_ARGS__)
+#define vlsseg6e8_v_u8mf8(...) __riscv_vlsseg6e8_v_u8mf8(__VA_ARGS__)
+#define vlsseg7e8_v_u8mf8(...) __riscv_vlsseg7e8_v_u8mf8(__VA_ARGS__)
+#define vlsseg8e8_v_u8mf8(...) __riscv_vlsseg8e8_v_u8mf8(__VA_ARGS__)
+#define vlsseg2e8_v_u8mf4(...) __riscv_vlsseg2e8_v_u8mf4(__VA_ARGS__)
+#define vlsseg3e8_v_u8mf4(...) __riscv_vlsseg3e8_v_u8mf4(__VA_ARGS__)
+#define vlsseg4e8_v_u8mf4(...) __riscv_vlsseg4e8_v_u8mf4(__VA_ARGS__)
+#define vlsseg5e8_v_u8mf4(...) __riscv_vlsseg5e8_v_u8mf4(__VA_ARGS__)
+#define vlsseg6e8_v_u8mf4(...) __riscv_vlsseg6e8_v_u8mf4(__VA_ARGS__)
+#define vlsseg7e8_v_u8mf4(...) __riscv_vlsseg7e8_v_u8mf4(__VA_ARGS__)
+#define vlsseg8e8_v_u8mf4(...) __riscv_vlsseg8e8_v_u8mf4(__VA_ARGS__)
+#define vlsseg2e8_v_u8mf2(...) __riscv_vlsseg2e8_v_u8mf2(__VA_ARGS__)
+#define vlsseg3e8_v_u8mf2(...) __riscv_vlsseg3e8_v_u8mf2(__VA_ARGS__)
+#define vlsseg4e8_v_u8mf2(...) __riscv_vlsseg4e8_v_u8mf2(__VA_ARGS__)
+#define vlsseg5e8_v_u8mf2(...) __riscv_vlsseg5e8_v_u8mf2(__VA_ARGS__)
+#define vlsseg6e8_v_u8mf2(...) __riscv_vlsseg6e8_v_u8mf2(__VA_ARGS__)
+#define vlsseg7e8_v_u8mf2(...) __riscv_vlsseg7e8_v_u8mf2(__VA_ARGS__)
+#define vlsseg8e8_v_u8mf2(...) __riscv_vlsseg8e8_v_u8mf2(__VA_ARGS__)
+#define vlsseg2e8_v_u8m1(...) __riscv_vlsseg2e8_v_u8m1(__VA_ARGS__)
+#define vlsseg3e8_v_u8m1(...) __riscv_vlsseg3e8_v_u8m1(__VA_ARGS__)
+#define vlsseg4e8_v_u8m1(...) __riscv_vlsseg4e8_v_u8m1(__VA_ARGS__)
+#define vlsseg5e8_v_u8m1(...) __riscv_vlsseg5e8_v_u8m1(__VA_ARGS__)
+#define vlsseg6e8_v_u8m1(...) __riscv_vlsseg6e8_v_u8m1(__VA_ARGS__)
+#define vlsseg7e8_v_u8m1(...) __riscv_vlsseg7e8_v_u8m1(__VA_ARGS__)
+#define vlsseg8e8_v_u8m1(...) __riscv_vlsseg8e8_v_u8m1(__VA_ARGS__)
+#define vlsseg2e8_v_u8m2(...) __riscv_vlsseg2e8_v_u8m2(__VA_ARGS__)
+#define vlsseg3e8_v_u8m2(...) __riscv_vlsseg3e8_v_u8m2(__VA_ARGS__)
+#define vlsseg4e8_v_u8m2(...) __riscv_vlsseg4e8_v_u8m2(__VA_ARGS__)
+#define vlsseg2e8_v_u8m4(...) __riscv_vlsseg2e8_v_u8m4(__VA_ARGS__)
+#define vlsseg2e16_v_u16mf4(...) __riscv_vlsseg2e16_v_u16mf4(__VA_ARGS__)
+#define vlsseg3e16_v_u16mf4(...) __riscv_vlsseg3e16_v_u16mf4(__VA_ARGS__)
+#define vlsseg4e16_v_u16mf4(...) __riscv_vlsseg4e16_v_u16mf4(__VA_ARGS__)
+#define vlsseg5e16_v_u16mf4(...) __riscv_vlsseg5e16_v_u16mf4(__VA_ARGS__)
+#define vlsseg6e16_v_u16mf4(...) __riscv_vlsseg6e16_v_u16mf4(__VA_ARGS__)
+#define vlsseg7e16_v_u16mf4(...) __riscv_vlsseg7e16_v_u16mf4(__VA_ARGS__)
+#define vlsseg8e16_v_u16mf4(...) __riscv_vlsseg8e16_v_u16mf4(__VA_ARGS__)
+#define vlsseg2e16_v_u16mf2(...) __riscv_vlsseg2e16_v_u16mf2(__VA_ARGS__)
+#define vlsseg3e16_v_u16mf2(...) __riscv_vlsseg3e16_v_u16mf2(__VA_ARGS__)
+#define vlsseg4e16_v_u16mf2(...) __riscv_vlsseg4e16_v_u16mf2(__VA_ARGS__)
+#define vlsseg5e16_v_u16mf2(...) __riscv_vlsseg5e16_v_u16mf2(__VA_ARGS__)
+#define vlsseg6e16_v_u16mf2(...) __riscv_vlsseg6e16_v_u16mf2(__VA_ARGS__)
+#define vlsseg7e16_v_u16mf2(...) __riscv_vlsseg7e16_v_u16mf2(__VA_ARGS__)
+#define vlsseg8e16_v_u16mf2(...) __riscv_vlsseg8e16_v_u16mf2(__VA_ARGS__)
+#define vlsseg2e16_v_u16m1(...) __riscv_vlsseg2e16_v_u16m1(__VA_ARGS__)
+#define vlsseg3e16_v_u16m1(...) __riscv_vlsseg3e16_v_u16m1(__VA_ARGS__)
+#define vlsseg4e16_v_u16m1(...) __riscv_vlsseg4e16_v_u16m1(__VA_ARGS__)
+#define vlsseg5e16_v_u16m1(...) __riscv_vlsseg5e16_v_u16m1(__VA_ARGS__)
+#define vlsseg6e16_v_u16m1(...) __riscv_vlsseg6e16_v_u16m1(__VA_ARGS__)
+#define vlsseg7e16_v_u16m1(...) __riscv_vlsseg7e16_v_u16m1(__VA_ARGS__)
+#define vlsseg8e16_v_u16m1(...) __riscv_vlsseg8e16_v_u16m1(__VA_ARGS__)
+#define vlsseg2e16_v_u16m2(...) __riscv_vlsseg2e16_v_u16m2(__VA_ARGS__)
+#define vlsseg3e16_v_u16m2(...) __riscv_vlsseg3e16_v_u16m2(__VA_ARGS__)
+#define vlsseg4e16_v_u16m2(...) __riscv_vlsseg4e16_v_u16m2(__VA_ARGS__)
+#define vlsseg2e16_v_u16m4(...) __riscv_vlsseg2e16_v_u16m4(__VA_ARGS__)
+#define vlsseg2e32_v_u32mf2(...) __riscv_vlsseg2e32_v_u32mf2(__VA_ARGS__)
+#define vlsseg3e32_v_u32mf2(...) __riscv_vlsseg3e32_v_u32mf2(__VA_ARGS__)
+#define vlsseg4e32_v_u32mf2(...) __riscv_vlsseg4e32_v_u32mf2(__VA_ARGS__)
+#define vlsseg5e32_v_u32mf2(...) __riscv_vlsseg5e32_v_u32mf2(__VA_ARGS__)
+#define vlsseg6e32_v_u32mf2(...) __riscv_vlsseg6e32_v_u32mf2(__VA_ARGS__)
+#define vlsseg7e32_v_u32mf2(...) __riscv_vlsseg7e32_v_u32mf2(__VA_ARGS__)
+#define vlsseg8e32_v_u32mf2(...) __riscv_vlsseg8e32_v_u32mf2(__VA_ARGS__)
+#define vlsseg2e32_v_u32m1(...) __riscv_vlsseg2e32_v_u32m1(__VA_ARGS__)
+#define vlsseg3e32_v_u32m1(...) __riscv_vlsseg3e32_v_u32m1(__VA_ARGS__)
+#define vlsseg4e32_v_u32m1(...) __riscv_vlsseg4e32_v_u32m1(__VA_ARGS__)
+#define vlsseg5e32_v_u32m1(...) __riscv_vlsseg5e32_v_u32m1(__VA_ARGS__)
+#define vlsseg6e32_v_u32m1(...) __riscv_vlsseg6e32_v_u32m1(__VA_ARGS__)
+#define vlsseg7e32_v_u32m1(...) __riscv_vlsseg7e32_v_u32m1(__VA_ARGS__)
+#define vlsseg8e32_v_u32m1(...) __riscv_vlsseg8e32_v_u32m1(__VA_ARGS__)
+#define vlsseg2e32_v_u32m2(...) __riscv_vlsseg2e32_v_u32m2(__VA_ARGS__)
+#define vlsseg3e32_v_u32m2(...) __riscv_vlsseg3e32_v_u32m2(__VA_ARGS__)
+#define vlsseg4e32_v_u32m2(...) __riscv_vlsseg4e32_v_u32m2(__VA_ARGS__)
+#define vlsseg2e32_v_u32m4(...) __riscv_vlsseg2e32_v_u32m4(__VA_ARGS__)
+#define vlsseg2e64_v_u64m1(...) __riscv_vlsseg2e64_v_u64m1(__VA_ARGS__)
+#define vlsseg3e64_v_u64m1(...) __riscv_vlsseg3e64_v_u64m1(__VA_ARGS__)
+#define vlsseg4e64_v_u64m1(...) __riscv_vlsseg4e64_v_u64m1(__VA_ARGS__)
+#define vlsseg5e64_v_u64m1(...) __riscv_vlsseg5e64_v_u64m1(__VA_ARGS__)
+#define vlsseg6e64_v_u64m1(...) __riscv_vlsseg6e64_v_u64m1(__VA_ARGS__)
+#define vlsseg7e64_v_u64m1(...) __riscv_vlsseg7e64_v_u64m1(__VA_ARGS__)
+#define vlsseg8e64_v_u64m1(...) __riscv_vlsseg8e64_v_u64m1(__VA_ARGS__)
+#define vlsseg2e64_v_u64m2(...) __riscv_vlsseg2e64_v_u64m2(__VA_ARGS__)
+#define vlsseg3e64_v_u64m2(...) __riscv_vlsseg3e64_v_u64m2(__VA_ARGS__)
+#define vlsseg4e64_v_u64m2(...) __riscv_vlsseg4e64_v_u64m2(__VA_ARGS__)
+#define vlsseg2e64_v_u64m4(...) __riscv_vlsseg2e64_v_u64m4(__VA_ARGS__)
+// masked functions: each legacy `_m` name forwards its arguments unchanged to the `_tumu` (tail-undisturbed, mask-undisturbed policy) intrinsic
+#define vlsseg2e16_v_f16mf4_m(...) __riscv_vlsseg2e16_v_f16mf4_tumu(__VA_ARGS__)
+#define vlsseg3e16_v_f16mf4_m(...) __riscv_vlsseg3e16_v_f16mf4_tumu(__VA_ARGS__)
+#define vlsseg4e16_v_f16mf4_m(...) __riscv_vlsseg4e16_v_f16mf4_tumu(__VA_ARGS__)
+#define vlsseg5e16_v_f16mf4_m(...) __riscv_vlsseg5e16_v_f16mf4_tumu(__VA_ARGS__)
+#define vlsseg6e16_v_f16mf4_m(...) __riscv_vlsseg6e16_v_f16mf4_tumu(__VA_ARGS__)
+#define vlsseg7e16_v_f16mf4_m(...) __riscv_vlsseg7e16_v_f16mf4_tumu(__VA_ARGS__)
+#define vlsseg8e16_v_f16mf4_m(...) __riscv_vlsseg8e16_v_f16mf4_tumu(__VA_ARGS__)
+#define vlsseg2e16_v_f16mf2_m(...) __riscv_vlsseg2e16_v_f16mf2_tumu(__VA_ARGS__)
+#define vlsseg3e16_v_f16mf2_m(...) __riscv_vlsseg3e16_v_f16mf2_tumu(__VA_ARGS__)
+#define vlsseg4e16_v_f16mf2_m(...) __riscv_vlsseg4e16_v_f16mf2_tumu(__VA_ARGS__)
+#define vlsseg5e16_v_f16mf2_m(...) __riscv_vlsseg5e16_v_f16mf2_tumu(__VA_ARGS__)
+#define vlsseg6e16_v_f16mf2_m(...) __riscv_vlsseg6e16_v_f16mf2_tumu(__VA_ARGS__)
+#define vlsseg7e16_v_f16mf2_m(...) __riscv_vlsseg7e16_v_f16mf2_tumu(__VA_ARGS__)
+#define vlsseg8e16_v_f16mf2_m(...) __riscv_vlsseg8e16_v_f16mf2_tumu(__VA_ARGS__)
+#define vlsseg2e16_v_f16m1_m(...) __riscv_vlsseg2e16_v_f16m1_tumu(__VA_ARGS__)
+#define vlsseg3e16_v_f16m1_m(...) __riscv_vlsseg3e16_v_f16m1_tumu(__VA_ARGS__)
+#define vlsseg4e16_v_f16m1_m(...) __riscv_vlsseg4e16_v_f16m1_tumu(__VA_ARGS__)
+#define vlsseg5e16_v_f16m1_m(...) __riscv_vlsseg5e16_v_f16m1_tumu(__VA_ARGS__)
+#define vlsseg6e16_v_f16m1_m(...) __riscv_vlsseg6e16_v_f16m1_tumu(__VA_ARGS__)
+#define vlsseg7e16_v_f16m1_m(...) __riscv_vlsseg7e16_v_f16m1_tumu(__VA_ARGS__)
+#define vlsseg8e16_v_f16m1_m(...) __riscv_vlsseg8e16_v_f16m1_tumu(__VA_ARGS__)
+#define vlsseg2e16_v_f16m2_m(...) __riscv_vlsseg2e16_v_f16m2_tumu(__VA_ARGS__)
+#define vlsseg3e16_v_f16m2_m(...) __riscv_vlsseg3e16_v_f16m2_tumu(__VA_ARGS__)
+#define vlsseg4e16_v_f16m2_m(...) __riscv_vlsseg4e16_v_f16m2_tumu(__VA_ARGS__)
+#define vlsseg2e16_v_f16m4_m(...) __riscv_vlsseg2e16_v_f16m4_tumu(__VA_ARGS__)
+#define vlsseg2e32_v_f32mf2_m(...) __riscv_vlsseg2e32_v_f32mf2_tumu(__VA_ARGS__)
+#define vlsseg3e32_v_f32mf2_m(...) __riscv_vlsseg3e32_v_f32mf2_tumu(__VA_ARGS__)
+#define vlsseg4e32_v_f32mf2_m(...) __riscv_vlsseg4e32_v_f32mf2_tumu(__VA_ARGS__)
+#define vlsseg5e32_v_f32mf2_m(...) __riscv_vlsseg5e32_v_f32mf2_tumu(__VA_ARGS__)
+#define vlsseg6e32_v_f32mf2_m(...) __riscv_vlsseg6e32_v_f32mf2_tumu(__VA_ARGS__)
+#define vlsseg7e32_v_f32mf2_m(...) __riscv_vlsseg7e32_v_f32mf2_tumu(__VA_ARGS__)
+#define vlsseg8e32_v_f32mf2_m(...) __riscv_vlsseg8e32_v_f32mf2_tumu(__VA_ARGS__)
+#define vlsseg2e32_v_f32m1_m(...) __riscv_vlsseg2e32_v_f32m1_tumu(__VA_ARGS__)
+#define vlsseg3e32_v_f32m1_m(...) __riscv_vlsseg3e32_v_f32m1_tumu(__VA_ARGS__)
+#define vlsseg4e32_v_f32m1_m(...) __riscv_vlsseg4e32_v_f32m1_tumu(__VA_ARGS__)
+#define vlsseg5e32_v_f32m1_m(...) __riscv_vlsseg5e32_v_f32m1_tumu(__VA_ARGS__)
+#define vlsseg6e32_v_f32m1_m(...) __riscv_vlsseg6e32_v_f32m1_tumu(__VA_ARGS__)
+#define vlsseg7e32_v_f32m1_m(...) __riscv_vlsseg7e32_v_f32m1_tumu(__VA_ARGS__)
+#define vlsseg8e32_v_f32m1_m(...) __riscv_vlsseg8e32_v_f32m1_tumu(__VA_ARGS__)
+#define vlsseg2e32_v_f32m2_m(...) __riscv_vlsseg2e32_v_f32m2_tumu(__VA_ARGS__)
+#define vlsseg3e32_v_f32m2_m(...) __riscv_vlsseg3e32_v_f32m2_tumu(__VA_ARGS__)
+#define vlsseg4e32_v_f32m2_m(...) __riscv_vlsseg4e32_v_f32m2_tumu(__VA_ARGS__)
+#define vlsseg2e32_v_f32m4_m(...) __riscv_vlsseg2e32_v_f32m4_tumu(__VA_ARGS__)
+#define vlsseg2e64_v_f64m1_m(...) __riscv_vlsseg2e64_v_f64m1_tumu(__VA_ARGS__)
+#define vlsseg3e64_v_f64m1_m(...) __riscv_vlsseg3e64_v_f64m1_tumu(__VA_ARGS__)
+#define vlsseg4e64_v_f64m1_m(...) __riscv_vlsseg4e64_v_f64m1_tumu(__VA_ARGS__)
+#define vlsseg5e64_v_f64m1_m(...) __riscv_vlsseg5e64_v_f64m1_tumu(__VA_ARGS__)
+#define vlsseg6e64_v_f64m1_m(...) __riscv_vlsseg6e64_v_f64m1_tumu(__VA_ARGS__)
+#define vlsseg7e64_v_f64m1_m(...) __riscv_vlsseg7e64_v_f64m1_tumu(__VA_ARGS__)
+#define vlsseg8e64_v_f64m1_m(...) __riscv_vlsseg8e64_v_f64m1_tumu(__VA_ARGS__)
+#define vlsseg2e64_v_f64m2_m(...) __riscv_vlsseg2e64_v_f64m2_tumu(__VA_ARGS__)
+#define vlsseg3e64_v_f64m2_m(...) __riscv_vlsseg3e64_v_f64m2_tumu(__VA_ARGS__)
+#define vlsseg4e64_v_f64m2_m(...) __riscv_vlsseg4e64_v_f64m2_tumu(__VA_ARGS__)
+#define vlsseg2e64_v_f64m4_m(...) __riscv_vlsseg2e64_v_f64m4_tumu(__VA_ARGS__)
+#define vlsseg2e8_v_i8mf8_m(...) __riscv_vlsseg2e8_v_i8mf8_tumu(__VA_ARGS__)
+#define vlsseg3e8_v_i8mf8_m(...) __riscv_vlsseg3e8_v_i8mf8_tumu(__VA_ARGS__)
+#define vlsseg4e8_v_i8mf8_m(...) __riscv_vlsseg4e8_v_i8mf8_tumu(__VA_ARGS__)
+#define vlsseg5e8_v_i8mf8_m(...) __riscv_vlsseg5e8_v_i8mf8_tumu(__VA_ARGS__)
+#define vlsseg6e8_v_i8mf8_m(...) __riscv_vlsseg6e8_v_i8mf8_tumu(__VA_ARGS__)
+#define vlsseg7e8_v_i8mf8_m(...) __riscv_vlsseg7e8_v_i8mf8_tumu(__VA_ARGS__)
+#define vlsseg8e8_v_i8mf8_m(...) __riscv_vlsseg8e8_v_i8mf8_tumu(__VA_ARGS__)
+#define vlsseg2e8_v_i8mf4_m(...) __riscv_vlsseg2e8_v_i8mf4_tumu(__VA_ARGS__)
+#define vlsseg3e8_v_i8mf4_m(...) __riscv_vlsseg3e8_v_i8mf4_tumu(__VA_ARGS__)
+#define vlsseg4e8_v_i8mf4_m(...) __riscv_vlsseg4e8_v_i8mf4_tumu(__VA_ARGS__)
+#define vlsseg5e8_v_i8mf4_m(...) __riscv_vlsseg5e8_v_i8mf4_tumu(__VA_ARGS__)
+#define vlsseg6e8_v_i8mf4_m(...) __riscv_vlsseg6e8_v_i8mf4_tumu(__VA_ARGS__)
+#define vlsseg7e8_v_i8mf4_m(...) __riscv_vlsseg7e8_v_i8mf4_tumu(__VA_ARGS__)
+#define vlsseg8e8_v_i8mf4_m(...) __riscv_vlsseg8e8_v_i8mf4_tumu(__VA_ARGS__)
+#define vlsseg2e8_v_i8mf2_m(...) __riscv_vlsseg2e8_v_i8mf2_tumu(__VA_ARGS__)
+#define vlsseg3e8_v_i8mf2_m(...) __riscv_vlsseg3e8_v_i8mf2_tumu(__VA_ARGS__)
+#define vlsseg4e8_v_i8mf2_m(...) __riscv_vlsseg4e8_v_i8mf2_tumu(__VA_ARGS__)
+#define vlsseg5e8_v_i8mf2_m(...) __riscv_vlsseg5e8_v_i8mf2_tumu(__VA_ARGS__)
+#define vlsseg6e8_v_i8mf2_m(...) __riscv_vlsseg6e8_v_i8mf2_tumu(__VA_ARGS__)
+#define vlsseg7e8_v_i8mf2_m(...) __riscv_vlsseg7e8_v_i8mf2_tumu(__VA_ARGS__)
+#define vlsseg8e8_v_i8mf2_m(...) __riscv_vlsseg8e8_v_i8mf2_tumu(__VA_ARGS__)
+#define vlsseg2e8_v_i8m1_m(...) __riscv_vlsseg2e8_v_i8m1_tumu(__VA_ARGS__)
+#define vlsseg3e8_v_i8m1_m(...) __riscv_vlsseg3e8_v_i8m1_tumu(__VA_ARGS__)
+#define vlsseg4e8_v_i8m1_m(...) __riscv_vlsseg4e8_v_i8m1_tumu(__VA_ARGS__)
+#define vlsseg5e8_v_i8m1_m(...) __riscv_vlsseg5e8_v_i8m1_tumu(__VA_ARGS__)
+#define vlsseg6e8_v_i8m1_m(...) __riscv_vlsseg6e8_v_i8m1_tumu(__VA_ARGS__)
+#define vlsseg7e8_v_i8m1_m(...) __riscv_vlsseg7e8_v_i8m1_tumu(__VA_ARGS__)
+#define vlsseg8e8_v_i8m1_m(...) __riscv_vlsseg8e8_v_i8m1_tumu(__VA_ARGS__)
+#define vlsseg2e8_v_i8m2_m(...) __riscv_vlsseg2e8_v_i8m2_tumu(__VA_ARGS__)
+#define vlsseg3e8_v_i8m2_m(...) __riscv_vlsseg3e8_v_i8m2_tumu(__VA_ARGS__)
+#define vlsseg4e8_v_i8m2_m(...) __riscv_vlsseg4e8_v_i8m2_tumu(__VA_ARGS__)
+#define vlsseg2e8_v_i8m4_m(...) __riscv_vlsseg2e8_v_i8m4_tumu(__VA_ARGS__)
+#define vlsseg2e16_v_i16mf4_m(...) __riscv_vlsseg2e16_v_i16mf4_tumu(__VA_ARGS__)
+#define vlsseg3e16_v_i16mf4_m(...) __riscv_vlsseg3e16_v_i16mf4_tumu(__VA_ARGS__)
+#define vlsseg4e16_v_i16mf4_m(...) __riscv_vlsseg4e16_v_i16mf4_tumu(__VA_ARGS__)
+#define vlsseg5e16_v_i16mf4_m(...) __riscv_vlsseg5e16_v_i16mf4_tumu(__VA_ARGS__)
+#define vlsseg6e16_v_i16mf4_m(...) __riscv_vlsseg6e16_v_i16mf4_tumu(__VA_ARGS__)
+#define vlsseg7e16_v_i16mf4_m(...) __riscv_vlsseg7e16_v_i16mf4_tumu(__VA_ARGS__)
+#define vlsseg8e16_v_i16mf4_m(...) __riscv_vlsseg8e16_v_i16mf4_tumu(__VA_ARGS__)
+#define vlsseg2e16_v_i16mf2_m(...) __riscv_vlsseg2e16_v_i16mf2_tumu(__VA_ARGS__)
+#define vlsseg3e16_v_i16mf2_m(...) __riscv_vlsseg3e16_v_i16mf2_tumu(__VA_ARGS__)
+#define vlsseg4e16_v_i16mf2_m(...) __riscv_vlsseg4e16_v_i16mf2_tumu(__VA_ARGS__)
+#define vlsseg5e16_v_i16mf2_m(...) __riscv_vlsseg5e16_v_i16mf2_tumu(__VA_ARGS__)
+#define vlsseg6e16_v_i16mf2_m(...) __riscv_vlsseg6e16_v_i16mf2_tumu(__VA_ARGS__)
+#define vlsseg7e16_v_i16mf2_m(...) __riscv_vlsseg7e16_v_i16mf2_tumu(__VA_ARGS__)
+#define vlsseg8e16_v_i16mf2_m(...) __riscv_vlsseg8e16_v_i16mf2_tumu(__VA_ARGS__)
+#define vlsseg2e16_v_i16m1_m(...) __riscv_vlsseg2e16_v_i16m1_tumu(__VA_ARGS__)
+#define vlsseg3e16_v_i16m1_m(...) __riscv_vlsseg3e16_v_i16m1_tumu(__VA_ARGS__)
+#define vlsseg4e16_v_i16m1_m(...) __riscv_vlsseg4e16_v_i16m1_tumu(__VA_ARGS__)
+#define vlsseg5e16_v_i16m1_m(...) __riscv_vlsseg5e16_v_i16m1_tumu(__VA_ARGS__)
+#define vlsseg6e16_v_i16m1_m(...) __riscv_vlsseg6e16_v_i16m1_tumu(__VA_ARGS__)
+#define vlsseg7e16_v_i16m1_m(...) __riscv_vlsseg7e16_v_i16m1_tumu(__VA_ARGS__)
+#define vlsseg8e16_v_i16m1_m(...) __riscv_vlsseg8e16_v_i16m1_tumu(__VA_ARGS__)
+#define vlsseg2e16_v_i16m2_m(...) __riscv_vlsseg2e16_v_i16m2_tumu(__VA_ARGS__)
+#define vlsseg3e16_v_i16m2_m(...) __riscv_vlsseg3e16_v_i16m2_tumu(__VA_ARGS__)
+#define vlsseg4e16_v_i16m2_m(...) __riscv_vlsseg4e16_v_i16m2_tumu(__VA_ARGS__)
+#define vlsseg2e16_v_i16m4_m(...) __riscv_vlsseg2e16_v_i16m4_tumu(__VA_ARGS__)
+#define vlsseg2e32_v_i32mf2_m(...) __riscv_vlsseg2e32_v_i32mf2_tumu(__VA_ARGS__)
+#define vlsseg3e32_v_i32mf2_m(...) __riscv_vlsseg3e32_v_i32mf2_tumu(__VA_ARGS__)
+#define vlsseg4e32_v_i32mf2_m(...) __riscv_vlsseg4e32_v_i32mf2_tumu(__VA_ARGS__)
+#define vlsseg5e32_v_i32mf2_m(...) __riscv_vlsseg5e32_v_i32mf2_tumu(__VA_ARGS__)
+#define vlsseg6e32_v_i32mf2_m(...) __riscv_vlsseg6e32_v_i32mf2_tumu(__VA_ARGS__)
+#define vlsseg7e32_v_i32mf2_m(...) __riscv_vlsseg7e32_v_i32mf2_tumu(__VA_ARGS__)
+#define vlsseg8e32_v_i32mf2_m(...) __riscv_vlsseg8e32_v_i32mf2_tumu(__VA_ARGS__)
+#define vlsseg2e32_v_i32m1_m(...) __riscv_vlsseg2e32_v_i32m1_tumu(__VA_ARGS__)
+#define vlsseg3e32_v_i32m1_m(...) __riscv_vlsseg3e32_v_i32m1_tumu(__VA_ARGS__)
+#define vlsseg4e32_v_i32m1_m(...) __riscv_vlsseg4e32_v_i32m1_tumu(__VA_ARGS__)
+#define vlsseg5e32_v_i32m1_m(...) __riscv_vlsseg5e32_v_i32m1_tumu(__VA_ARGS__)
+#define vlsseg6e32_v_i32m1_m(...) __riscv_vlsseg6e32_v_i32m1_tumu(__VA_ARGS__)
+#define vlsseg7e32_v_i32m1_m(...) __riscv_vlsseg7e32_v_i32m1_tumu(__VA_ARGS__)
+#define vlsseg8e32_v_i32m1_m(...) __riscv_vlsseg8e32_v_i32m1_tumu(__VA_ARGS__)
+#define vlsseg2e32_v_i32m2_m(...) __riscv_vlsseg2e32_v_i32m2_tumu(__VA_ARGS__)
+#define vlsseg3e32_v_i32m2_m(...) __riscv_vlsseg3e32_v_i32m2_tumu(__VA_ARGS__)
+#define vlsseg4e32_v_i32m2_m(...) __riscv_vlsseg4e32_v_i32m2_tumu(__VA_ARGS__)
+#define vlsseg2e32_v_i32m4_m(...) __riscv_vlsseg2e32_v_i32m4_tumu(__VA_ARGS__)
+#define vlsseg2e64_v_i64m1_m(...) __riscv_vlsseg2e64_v_i64m1_tumu(__VA_ARGS__)
+#define vlsseg3e64_v_i64m1_m(...) __riscv_vlsseg3e64_v_i64m1_tumu(__VA_ARGS__)
+#define vlsseg4e64_v_i64m1_m(...) __riscv_vlsseg4e64_v_i64m1_tumu(__VA_ARGS__)
+#define vlsseg5e64_v_i64m1_m(...) __riscv_vlsseg5e64_v_i64m1_tumu(__VA_ARGS__)
+#define vlsseg6e64_v_i64m1_m(...) __riscv_vlsseg6e64_v_i64m1_tumu(__VA_ARGS__)
+#define vlsseg7e64_v_i64m1_m(...) __riscv_vlsseg7e64_v_i64m1_tumu(__VA_ARGS__)
+#define vlsseg8e64_v_i64m1_m(...) __riscv_vlsseg8e64_v_i64m1_tumu(__VA_ARGS__)
+#define vlsseg2e64_v_i64m2_m(...) __riscv_vlsseg2e64_v_i64m2_tumu(__VA_ARGS__)
+#define vlsseg3e64_v_i64m2_m(...) __riscv_vlsseg3e64_v_i64m2_tumu(__VA_ARGS__)
+#define vlsseg4e64_v_i64m2_m(...) __riscv_vlsseg4e64_v_i64m2_tumu(__VA_ARGS__)
+#define vlsseg2e64_v_i64m4_m(...) __riscv_vlsseg2e64_v_i64m4_tumu(__VA_ARGS__)
+#define vlsseg2e8_v_u8mf8_m(...) __riscv_vlsseg2e8_v_u8mf8_tumu(__VA_ARGS__)
+#define vlsseg3e8_v_u8mf8_m(...) __riscv_vlsseg3e8_v_u8mf8_tumu(__VA_ARGS__)
+#define vlsseg4e8_v_u8mf8_m(...) __riscv_vlsseg4e8_v_u8mf8_tumu(__VA_ARGS__)
+#define vlsseg5e8_v_u8mf8_m(...) __riscv_vlsseg5e8_v_u8mf8_tumu(__VA_ARGS__)
+#define vlsseg6e8_v_u8mf8_m(...) __riscv_vlsseg6e8_v_u8mf8_tumu(__VA_ARGS__)
+#define vlsseg7e8_v_u8mf8_m(...) __riscv_vlsseg7e8_v_u8mf8_tumu(__VA_ARGS__)
+#define vlsseg8e8_v_u8mf8_m(...) __riscv_vlsseg8e8_v_u8mf8_tumu(__VA_ARGS__)
+#define vlsseg2e8_v_u8mf4_m(...) __riscv_vlsseg2e8_v_u8mf4_tumu(__VA_ARGS__)
+#define vlsseg3e8_v_u8mf4_m(...) __riscv_vlsseg3e8_v_u8mf4_tumu(__VA_ARGS__)
+#define vlsseg4e8_v_u8mf4_m(...) __riscv_vlsseg4e8_v_u8mf4_tumu(__VA_ARGS__)
+#define vlsseg5e8_v_u8mf4_m(...) __riscv_vlsseg5e8_v_u8mf4_tumu(__VA_ARGS__)
+#define vlsseg6e8_v_u8mf4_m(...) __riscv_vlsseg6e8_v_u8mf4_tumu(__VA_ARGS__)
+#define vlsseg7e8_v_u8mf4_m(...) __riscv_vlsseg7e8_v_u8mf4_tumu(__VA_ARGS__)
+#define vlsseg8e8_v_u8mf4_m(...) __riscv_vlsseg8e8_v_u8mf4_tumu(__VA_ARGS__)
+#define vlsseg2e8_v_u8mf2_m(...) __riscv_vlsseg2e8_v_u8mf2_tumu(__VA_ARGS__)
+#define vlsseg3e8_v_u8mf2_m(...) __riscv_vlsseg3e8_v_u8mf2_tumu(__VA_ARGS__)
+#define vlsseg4e8_v_u8mf2_m(...) __riscv_vlsseg4e8_v_u8mf2_tumu(__VA_ARGS__)
+#define vlsseg5e8_v_u8mf2_m(...) __riscv_vlsseg5e8_v_u8mf2_tumu(__VA_ARGS__)
+#define vlsseg6e8_v_u8mf2_m(...) __riscv_vlsseg6e8_v_u8mf2_tumu(__VA_ARGS__)
+#define vlsseg7e8_v_u8mf2_m(...) __riscv_vlsseg7e8_v_u8mf2_tumu(__VA_ARGS__)
+#define vlsseg8e8_v_u8mf2_m(...) __riscv_vlsseg8e8_v_u8mf2_tumu(__VA_ARGS__)
+#define vlsseg2e8_v_u8m1_m(...) __riscv_vlsseg2e8_v_u8m1_tumu(__VA_ARGS__)
+#define vlsseg3e8_v_u8m1_m(...) __riscv_vlsseg3e8_v_u8m1_tumu(__VA_ARGS__)
+#define vlsseg4e8_v_u8m1_m(...) __riscv_vlsseg4e8_v_u8m1_tumu(__VA_ARGS__)
+#define vlsseg5e8_v_u8m1_m(...) __riscv_vlsseg5e8_v_u8m1_tumu(__VA_ARGS__)
+#define vlsseg6e8_v_u8m1_m(...) __riscv_vlsseg6e8_v_u8m1_tumu(__VA_ARGS__)
+#define vlsseg7e8_v_u8m1_m(...) __riscv_vlsseg7e8_v_u8m1_tumu(__VA_ARGS__)
+#define vlsseg8e8_v_u8m1_m(...) __riscv_vlsseg8e8_v_u8m1_tumu(__VA_ARGS__)
+#define vlsseg2e8_v_u8m2_m(...) __riscv_vlsseg2e8_v_u8m2_tumu(__VA_ARGS__)
+#define vlsseg3e8_v_u8m2_m(...) __riscv_vlsseg3e8_v_u8m2_tumu(__VA_ARGS__)
+#define vlsseg4e8_v_u8m2_m(...) __riscv_vlsseg4e8_v_u8m2_tumu(__VA_ARGS__)
+#define vlsseg2e8_v_u8m4_m(...) __riscv_vlsseg2e8_v_u8m4_tumu(__VA_ARGS__)
+#define vlsseg2e16_v_u16mf4_m(...) __riscv_vlsseg2e16_v_u16mf4_tumu(__VA_ARGS__)
+#define vlsseg3e16_v_u16mf4_m(...) __riscv_vlsseg3e16_v_u16mf4_tumu(__VA_ARGS__)
+#define vlsseg4e16_v_u16mf4_m(...) __riscv_vlsseg4e16_v_u16mf4_tumu(__VA_ARGS__)
+#define vlsseg5e16_v_u16mf4_m(...) __riscv_vlsseg5e16_v_u16mf4_tumu(__VA_ARGS__)
+#define vlsseg6e16_v_u16mf4_m(...) __riscv_vlsseg6e16_v_u16mf4_tumu(__VA_ARGS__)
+#define vlsseg7e16_v_u16mf4_m(...) __riscv_vlsseg7e16_v_u16mf4_tumu(__VA_ARGS__)
+#define vlsseg8e16_v_u16mf4_m(...) __riscv_vlsseg8e16_v_u16mf4_tumu(__VA_ARGS__)
+#define vlsseg2e16_v_u16mf2_m(...) __riscv_vlsseg2e16_v_u16mf2_tumu(__VA_ARGS__)
+#define vlsseg3e16_v_u16mf2_m(...) __riscv_vlsseg3e16_v_u16mf2_tumu(__VA_ARGS__)
+#define vlsseg4e16_v_u16mf2_m(...) __riscv_vlsseg4e16_v_u16mf2_tumu(__VA_ARGS__)
+#define vlsseg5e16_v_u16mf2_m(...) __riscv_vlsseg5e16_v_u16mf2_tumu(__VA_ARGS__)
+#define vlsseg6e16_v_u16mf2_m(...) __riscv_vlsseg6e16_v_u16mf2_tumu(__VA_ARGS__)
+#define vlsseg7e16_v_u16mf2_m(...) __riscv_vlsseg7e16_v_u16mf2_tumu(__VA_ARGS__)
+#define vlsseg8e16_v_u16mf2_m(...) __riscv_vlsseg8e16_v_u16mf2_tumu(__VA_ARGS__)
+#define vlsseg2e16_v_u16m1_m(...) __riscv_vlsseg2e16_v_u16m1_tumu(__VA_ARGS__)
+#define vlsseg3e16_v_u16m1_m(...) __riscv_vlsseg3e16_v_u16m1_tumu(__VA_ARGS__)
+#define vlsseg4e16_v_u16m1_m(...) __riscv_vlsseg4e16_v_u16m1_tumu(__VA_ARGS__)
+#define vlsseg5e16_v_u16m1_m(...) __riscv_vlsseg5e16_v_u16m1_tumu(__VA_ARGS__)
+#define vlsseg6e16_v_u16m1_m(...) __riscv_vlsseg6e16_v_u16m1_tumu(__VA_ARGS__)
+#define vlsseg7e16_v_u16m1_m(...) __riscv_vlsseg7e16_v_u16m1_tumu(__VA_ARGS__)
+#define vlsseg8e16_v_u16m1_m(...) __riscv_vlsseg8e16_v_u16m1_tumu(__VA_ARGS__)
+#define vlsseg2e16_v_u16m2_m(...) __riscv_vlsseg2e16_v_u16m2_tumu(__VA_ARGS__)
+#define vlsseg3e16_v_u16m2_m(...) __riscv_vlsseg3e16_v_u16m2_tumu(__VA_ARGS__)
+#define vlsseg4e16_v_u16m2_m(...) __riscv_vlsseg4e16_v_u16m2_tumu(__VA_ARGS__)
+#define vlsseg2e16_v_u16m4_m(...) __riscv_vlsseg2e16_v_u16m4_tumu(__VA_ARGS__)
+#define vlsseg2e32_v_u32mf2_m(...) __riscv_vlsseg2e32_v_u32mf2_tumu(__VA_ARGS__)
+#define vlsseg3e32_v_u32mf2_m(...) __riscv_vlsseg3e32_v_u32mf2_tumu(__VA_ARGS__)
+#define vlsseg4e32_v_u32mf2_m(...) __riscv_vlsseg4e32_v_u32mf2_tumu(__VA_ARGS__)
+#define vlsseg5e32_v_u32mf2_m(...) __riscv_vlsseg5e32_v_u32mf2_tumu(__VA_ARGS__)
+#define vlsseg6e32_v_u32mf2_m(...) __riscv_vlsseg6e32_v_u32mf2_tumu(__VA_ARGS__)
+#define vlsseg7e32_v_u32mf2_m(...) __riscv_vlsseg7e32_v_u32mf2_tumu(__VA_ARGS__)
+#define vlsseg8e32_v_u32mf2_m(...) __riscv_vlsseg8e32_v_u32mf2_tumu(__VA_ARGS__)
+#define vlsseg2e32_v_u32m1_m(...) __riscv_vlsseg2e32_v_u32m1_tumu(__VA_ARGS__)
+#define vlsseg3e32_v_u32m1_m(...) __riscv_vlsseg3e32_v_u32m1_tumu(__VA_ARGS__)
+#define vlsseg4e32_v_u32m1_m(...) __riscv_vlsseg4e32_v_u32m1_tumu(__VA_ARGS__)
+#define vlsseg5e32_v_u32m1_m(...) __riscv_vlsseg5e32_v_u32m1_tumu(__VA_ARGS__)
+#define vlsseg6e32_v_u32m1_m(...) __riscv_vlsseg6e32_v_u32m1_tumu(__VA_ARGS__)
+#define vlsseg7e32_v_u32m1_m(...) __riscv_vlsseg7e32_v_u32m1_tumu(__VA_ARGS__)
+#define vlsseg8e32_v_u32m1_m(...) __riscv_vlsseg8e32_v_u32m1_tumu(__VA_ARGS__)
+#define vlsseg2e32_v_u32m2_m(...) __riscv_vlsseg2e32_v_u32m2_tumu(__VA_ARGS__)
+#define vlsseg3e32_v_u32m2_m(...) __riscv_vlsseg3e32_v_u32m2_tumu(__VA_ARGS__)
+#define vlsseg4e32_v_u32m2_m(...) __riscv_vlsseg4e32_v_u32m2_tumu(__VA_ARGS__)
+#define vlsseg2e32_v_u32m4_m(...) __riscv_vlsseg2e32_v_u32m4_tumu(__VA_ARGS__)
+#define vlsseg2e64_v_u64m1_m(...) __riscv_vlsseg2e64_v_u64m1_tumu(__VA_ARGS__)
+#define vlsseg3e64_v_u64m1_m(...) __riscv_vlsseg3e64_v_u64m1_tumu(__VA_ARGS__)
+#define vlsseg4e64_v_u64m1_m(...) __riscv_vlsseg4e64_v_u64m1_tumu(__VA_ARGS__)
+#define vlsseg5e64_v_u64m1_m(...) __riscv_vlsseg5e64_v_u64m1_tumu(__VA_ARGS__)
+#define vlsseg6e64_v_u64m1_m(...) __riscv_vlsseg6e64_v_u64m1_tumu(__VA_ARGS__)
+#define vlsseg7e64_v_u64m1_m(...) __riscv_vlsseg7e64_v_u64m1_tumu(__VA_ARGS__)
+#define vlsseg8e64_v_u64m1_m(...) __riscv_vlsseg8e64_v_u64m1_tumu(__VA_ARGS__)
+#define vlsseg2e64_v_u64m2_m(...) __riscv_vlsseg2e64_v_u64m2_tumu(__VA_ARGS__)
+#define vlsseg3e64_v_u64m2_m(...) __riscv_vlsseg3e64_v_u64m2_tumu(__VA_ARGS__)
+#define vlsseg4e64_v_u64m2_m(...) __riscv_vlsseg4e64_v_u64m2_tumu(__VA_ARGS__)
+#define vlsseg2e64_v_u64m4_m(...) __riscv_vlsseg2e64_v_u64m4_tumu(__VA_ARGS__)
+#define vssseg2e16_v_f16mf4(...) __riscv_vssseg2e16_v_f16mf4(__VA_ARGS__) // vssseg* aliases start here; unmasked again (NAME -> __riscv_NAME, arguments forwarded unchanged) until the next "masked functions" marker
+#define vssseg3e16_v_f16mf4(...) __riscv_vssseg3e16_v_f16mf4(__VA_ARGS__)
+#define vssseg4e16_v_f16mf4(...) __riscv_vssseg4e16_v_f16mf4(__VA_ARGS__)
+#define vssseg5e16_v_f16mf4(...) __riscv_vssseg5e16_v_f16mf4(__VA_ARGS__)
+#define vssseg6e16_v_f16mf4(...) __riscv_vssseg6e16_v_f16mf4(__VA_ARGS__)
+#define vssseg7e16_v_f16mf4(...) __riscv_vssseg7e16_v_f16mf4(__VA_ARGS__)
+#define vssseg8e16_v_f16mf4(...) __riscv_vssseg8e16_v_f16mf4(__VA_ARGS__)
+#define vssseg2e16_v_f16mf2(...) __riscv_vssseg2e16_v_f16mf2(__VA_ARGS__)
+#define vssseg3e16_v_f16mf2(...) __riscv_vssseg3e16_v_f16mf2(__VA_ARGS__)
+#define vssseg4e16_v_f16mf2(...) __riscv_vssseg4e16_v_f16mf2(__VA_ARGS__)
+#define vssseg5e16_v_f16mf2(...) __riscv_vssseg5e16_v_f16mf2(__VA_ARGS__)
+#define vssseg6e16_v_f16mf2(...) __riscv_vssseg6e16_v_f16mf2(__VA_ARGS__)
+#define vssseg7e16_v_f16mf2(...) __riscv_vssseg7e16_v_f16mf2(__VA_ARGS__)
+#define vssseg8e16_v_f16mf2(...) __riscv_vssseg8e16_v_f16mf2(__VA_ARGS__)
+#define vssseg2e16_v_f16m1(...) __riscv_vssseg2e16_v_f16m1(__VA_ARGS__)
+#define vssseg3e16_v_f16m1(...) __riscv_vssseg3e16_v_f16m1(__VA_ARGS__)
+#define vssseg4e16_v_f16m1(...) __riscv_vssseg4e16_v_f16m1(__VA_ARGS__)
+#define vssseg5e16_v_f16m1(...) __riscv_vssseg5e16_v_f16m1(__VA_ARGS__)
+#define vssseg6e16_v_f16m1(...) __riscv_vssseg6e16_v_f16m1(__VA_ARGS__)
+#define vssseg7e16_v_f16m1(...) __riscv_vssseg7e16_v_f16m1(__VA_ARGS__)
+#define vssseg8e16_v_f16m1(...) __riscv_vssseg8e16_v_f16m1(__VA_ARGS__)
+#define vssseg2e16_v_f16m2(...) __riscv_vssseg2e16_v_f16m2(__VA_ARGS__)
+#define vssseg3e16_v_f16m2(...) __riscv_vssseg3e16_v_f16m2(__VA_ARGS__)
+#define vssseg4e16_v_f16m2(...) __riscv_vssseg4e16_v_f16m2(__VA_ARGS__)
+#define vssseg2e16_v_f16m4(...) __riscv_vssseg2e16_v_f16m4(__VA_ARGS__)
+#define vssseg2e32_v_f32mf2(...) __riscv_vssseg2e32_v_f32mf2(__VA_ARGS__)
+#define vssseg3e32_v_f32mf2(...) __riscv_vssseg3e32_v_f32mf2(__VA_ARGS__)
+#define vssseg4e32_v_f32mf2(...) __riscv_vssseg4e32_v_f32mf2(__VA_ARGS__)
+#define vssseg5e32_v_f32mf2(...) __riscv_vssseg5e32_v_f32mf2(__VA_ARGS__)
+#define vssseg6e32_v_f32mf2(...) __riscv_vssseg6e32_v_f32mf2(__VA_ARGS__)
+#define vssseg7e32_v_f32mf2(...) __riscv_vssseg7e32_v_f32mf2(__VA_ARGS__)
+#define vssseg8e32_v_f32mf2(...) __riscv_vssseg8e32_v_f32mf2(__VA_ARGS__)
+#define vssseg2e32_v_f32m1(...) __riscv_vssseg2e32_v_f32m1(__VA_ARGS__)
+#define vssseg3e32_v_f32m1(...) __riscv_vssseg3e32_v_f32m1(__VA_ARGS__)
+#define vssseg4e32_v_f32m1(...) __riscv_vssseg4e32_v_f32m1(__VA_ARGS__)
+#define vssseg5e32_v_f32m1(...) __riscv_vssseg5e32_v_f32m1(__VA_ARGS__)
+#define vssseg6e32_v_f32m1(...) __riscv_vssseg6e32_v_f32m1(__VA_ARGS__)
+#define vssseg7e32_v_f32m1(...) __riscv_vssseg7e32_v_f32m1(__VA_ARGS__)
+#define vssseg8e32_v_f32m1(...) __riscv_vssseg8e32_v_f32m1(__VA_ARGS__)
+#define vssseg2e32_v_f32m2(...) __riscv_vssseg2e32_v_f32m2(__VA_ARGS__)
+#define vssseg3e32_v_f32m2(...) __riscv_vssseg3e32_v_f32m2(__VA_ARGS__)
+#define vssseg4e32_v_f32m2(...) __riscv_vssseg4e32_v_f32m2(__VA_ARGS__)
+#define vssseg2e32_v_f32m4(...) __riscv_vssseg2e32_v_f32m4(__VA_ARGS__)
+#define vssseg2e64_v_f64m1(...) __riscv_vssseg2e64_v_f64m1(__VA_ARGS__)
+#define vssseg3e64_v_f64m1(...) __riscv_vssseg3e64_v_f64m1(__VA_ARGS__)
+#define vssseg4e64_v_f64m1(...) __riscv_vssseg4e64_v_f64m1(__VA_ARGS__)
+#define vssseg5e64_v_f64m1(...) __riscv_vssseg5e64_v_f64m1(__VA_ARGS__)
+#define vssseg6e64_v_f64m1(...) __riscv_vssseg6e64_v_f64m1(__VA_ARGS__)
+#define vssseg7e64_v_f64m1(...) __riscv_vssseg7e64_v_f64m1(__VA_ARGS__)
+#define vssseg8e64_v_f64m1(...) __riscv_vssseg8e64_v_f64m1(__VA_ARGS__)
+#define vssseg2e64_v_f64m2(...) __riscv_vssseg2e64_v_f64m2(__VA_ARGS__)
+#define vssseg3e64_v_f64m2(...) __riscv_vssseg3e64_v_f64m2(__VA_ARGS__)
+#define vssseg4e64_v_f64m2(...) __riscv_vssseg4e64_v_f64m2(__VA_ARGS__)
+#define vssseg2e64_v_f64m4(...) __riscv_vssseg2e64_v_f64m4(__VA_ARGS__)
+#define vssseg2e8_v_i8mf8(...) __riscv_vssseg2e8_v_i8mf8(__VA_ARGS__)
+#define vssseg3e8_v_i8mf8(...) __riscv_vssseg3e8_v_i8mf8(__VA_ARGS__)
+#define vssseg4e8_v_i8mf8(...) __riscv_vssseg4e8_v_i8mf8(__VA_ARGS__)
+#define vssseg5e8_v_i8mf8(...) __riscv_vssseg5e8_v_i8mf8(__VA_ARGS__)
+#define vssseg6e8_v_i8mf8(...) __riscv_vssseg6e8_v_i8mf8(__VA_ARGS__)
+#define vssseg7e8_v_i8mf8(...) __riscv_vssseg7e8_v_i8mf8(__VA_ARGS__)
+#define vssseg8e8_v_i8mf8(...) __riscv_vssseg8e8_v_i8mf8(__VA_ARGS__)
+#define vssseg2e8_v_i8mf4(...) __riscv_vssseg2e8_v_i8mf4(__VA_ARGS__)
+#define vssseg3e8_v_i8mf4(...) __riscv_vssseg3e8_v_i8mf4(__VA_ARGS__)
+#define vssseg4e8_v_i8mf4(...) __riscv_vssseg4e8_v_i8mf4(__VA_ARGS__)
+#define vssseg5e8_v_i8mf4(...) __riscv_vssseg5e8_v_i8mf4(__VA_ARGS__)
+#define vssseg6e8_v_i8mf4(...) __riscv_vssseg6e8_v_i8mf4(__VA_ARGS__)
+#define vssseg7e8_v_i8mf4(...) __riscv_vssseg7e8_v_i8mf4(__VA_ARGS__)
+#define vssseg8e8_v_i8mf4(...) __riscv_vssseg8e8_v_i8mf4(__VA_ARGS__)
+#define vssseg2e8_v_i8mf2(...) __riscv_vssseg2e8_v_i8mf2(__VA_ARGS__)
+#define vssseg3e8_v_i8mf2(...) __riscv_vssseg3e8_v_i8mf2(__VA_ARGS__)
+#define vssseg4e8_v_i8mf2(...) __riscv_vssseg4e8_v_i8mf2(__VA_ARGS__)
+#define vssseg5e8_v_i8mf2(...) __riscv_vssseg5e8_v_i8mf2(__VA_ARGS__)
+#define vssseg6e8_v_i8mf2(...) __riscv_vssseg6e8_v_i8mf2(__VA_ARGS__)
+#define vssseg7e8_v_i8mf2(...) __riscv_vssseg7e8_v_i8mf2(__VA_ARGS__)
+#define vssseg8e8_v_i8mf2(...) __riscv_vssseg8e8_v_i8mf2(__VA_ARGS__)
+#define vssseg2e8_v_i8m1(...) __riscv_vssseg2e8_v_i8m1(__VA_ARGS__)
+#define vssseg3e8_v_i8m1(...) __riscv_vssseg3e8_v_i8m1(__VA_ARGS__)
+#define vssseg4e8_v_i8m1(...) __riscv_vssseg4e8_v_i8m1(__VA_ARGS__)
+#define vssseg5e8_v_i8m1(...) __riscv_vssseg5e8_v_i8m1(__VA_ARGS__)
+#define vssseg6e8_v_i8m1(...) __riscv_vssseg6e8_v_i8m1(__VA_ARGS__)
+#define vssseg7e8_v_i8m1(...) __riscv_vssseg7e8_v_i8m1(__VA_ARGS__)
+#define vssseg8e8_v_i8m1(...) __riscv_vssseg8e8_v_i8m1(__VA_ARGS__)
+#define vssseg2e8_v_i8m2(...) __riscv_vssseg2e8_v_i8m2(__VA_ARGS__)
+#define vssseg3e8_v_i8m2(...) __riscv_vssseg3e8_v_i8m2(__VA_ARGS__)
+#define vssseg4e8_v_i8m2(...) __riscv_vssseg4e8_v_i8m2(__VA_ARGS__)
+#define vssseg2e8_v_i8m4(...) __riscv_vssseg2e8_v_i8m4(__VA_ARGS__)
+#define vssseg2e16_v_i16mf4(...) __riscv_vssseg2e16_v_i16mf4(__VA_ARGS__)
+#define vssseg3e16_v_i16mf4(...) __riscv_vssseg3e16_v_i16mf4(__VA_ARGS__)
+#define vssseg4e16_v_i16mf4(...) __riscv_vssseg4e16_v_i16mf4(__VA_ARGS__)
+#define vssseg5e16_v_i16mf4(...) __riscv_vssseg5e16_v_i16mf4(__VA_ARGS__)
+#define vssseg6e16_v_i16mf4(...) __riscv_vssseg6e16_v_i16mf4(__VA_ARGS__)
+#define vssseg7e16_v_i16mf4(...) __riscv_vssseg7e16_v_i16mf4(__VA_ARGS__)
+#define vssseg8e16_v_i16mf4(...) __riscv_vssseg8e16_v_i16mf4(__VA_ARGS__)
+#define vssseg2e16_v_i16mf2(...) __riscv_vssseg2e16_v_i16mf2(__VA_ARGS__)
+#define vssseg3e16_v_i16mf2(...) __riscv_vssseg3e16_v_i16mf2(__VA_ARGS__)
+#define vssseg4e16_v_i16mf2(...) __riscv_vssseg4e16_v_i16mf2(__VA_ARGS__)
+#define vssseg5e16_v_i16mf2(...) __riscv_vssseg5e16_v_i16mf2(__VA_ARGS__)
+#define vssseg6e16_v_i16mf2(...) __riscv_vssseg6e16_v_i16mf2(__VA_ARGS__)
+#define vssseg7e16_v_i16mf2(...) __riscv_vssseg7e16_v_i16mf2(__VA_ARGS__)
+#define vssseg8e16_v_i16mf2(...) __riscv_vssseg8e16_v_i16mf2(__VA_ARGS__)
+#define vssseg2e16_v_i16m1(...) __riscv_vssseg2e16_v_i16m1(__VA_ARGS__)
+#define vssseg3e16_v_i16m1(...) __riscv_vssseg3e16_v_i16m1(__VA_ARGS__)
+#define vssseg4e16_v_i16m1(...) __riscv_vssseg4e16_v_i16m1(__VA_ARGS__)
+#define vssseg5e16_v_i16m1(...) __riscv_vssseg5e16_v_i16m1(__VA_ARGS__)
+#define vssseg6e16_v_i16m1(...) __riscv_vssseg6e16_v_i16m1(__VA_ARGS__)
+#define vssseg7e16_v_i16m1(...) __riscv_vssseg7e16_v_i16m1(__VA_ARGS__)
+#define vssseg8e16_v_i16m1(...) __riscv_vssseg8e16_v_i16m1(__VA_ARGS__)
+#define vssseg2e16_v_i16m2(...) __riscv_vssseg2e16_v_i16m2(__VA_ARGS__)
+#define vssseg3e16_v_i16m2(...) __riscv_vssseg3e16_v_i16m2(__VA_ARGS__)
+#define vssseg4e16_v_i16m2(...) __riscv_vssseg4e16_v_i16m2(__VA_ARGS__)
+#define vssseg2e16_v_i16m4(...) __riscv_vssseg2e16_v_i16m4(__VA_ARGS__)
+#define vssseg2e32_v_i32mf2(...) __riscv_vssseg2e32_v_i32mf2(__VA_ARGS__)
+#define vssseg3e32_v_i32mf2(...) __riscv_vssseg3e32_v_i32mf2(__VA_ARGS__)
+#define vssseg4e32_v_i32mf2(...) __riscv_vssseg4e32_v_i32mf2(__VA_ARGS__)
+#define vssseg5e32_v_i32mf2(...) __riscv_vssseg5e32_v_i32mf2(__VA_ARGS__)
+#define vssseg6e32_v_i32mf2(...) __riscv_vssseg6e32_v_i32mf2(__VA_ARGS__)
+#define vssseg7e32_v_i32mf2(...) __riscv_vssseg7e32_v_i32mf2(__VA_ARGS__)
+#define vssseg8e32_v_i32mf2(...) __riscv_vssseg8e32_v_i32mf2(__VA_ARGS__)
+#define vssseg2e32_v_i32m1(...) __riscv_vssseg2e32_v_i32m1(__VA_ARGS__)
+#define vssseg3e32_v_i32m1(...) __riscv_vssseg3e32_v_i32m1(__VA_ARGS__)
+#define vssseg4e32_v_i32m1(...) __riscv_vssseg4e32_v_i32m1(__VA_ARGS__)
+#define vssseg5e32_v_i32m1(...) __riscv_vssseg5e32_v_i32m1(__VA_ARGS__)
+#define vssseg6e32_v_i32m1(...) __riscv_vssseg6e32_v_i32m1(__VA_ARGS__)
+#define vssseg7e32_v_i32m1(...) __riscv_vssseg7e32_v_i32m1(__VA_ARGS__)
+#define vssseg8e32_v_i32m1(...) __riscv_vssseg8e32_v_i32m1(__VA_ARGS__)
+#define vssseg2e32_v_i32m2(...) __riscv_vssseg2e32_v_i32m2(__VA_ARGS__)
+#define vssseg3e32_v_i32m2(...) __riscv_vssseg3e32_v_i32m2(__VA_ARGS__)
+#define vssseg4e32_v_i32m2(...) __riscv_vssseg4e32_v_i32m2(__VA_ARGS__)
+#define vssseg2e32_v_i32m4(...) __riscv_vssseg2e32_v_i32m4(__VA_ARGS__)
+#define vssseg2e64_v_i64m1(...) __riscv_vssseg2e64_v_i64m1(__VA_ARGS__)
+#define vssseg3e64_v_i64m1(...) __riscv_vssseg3e64_v_i64m1(__VA_ARGS__)
+#define vssseg4e64_v_i64m1(...) __riscv_vssseg4e64_v_i64m1(__VA_ARGS__)
+#define vssseg5e64_v_i64m1(...) __riscv_vssseg5e64_v_i64m1(__VA_ARGS__)
+#define vssseg6e64_v_i64m1(...) __riscv_vssseg6e64_v_i64m1(__VA_ARGS__)
+#define vssseg7e64_v_i64m1(...) __riscv_vssseg7e64_v_i64m1(__VA_ARGS__)
+#define vssseg8e64_v_i64m1(...) __riscv_vssseg8e64_v_i64m1(__VA_ARGS__)
+#define vssseg2e64_v_i64m2(...) __riscv_vssseg2e64_v_i64m2(__VA_ARGS__)
+#define vssseg3e64_v_i64m2(...) __riscv_vssseg3e64_v_i64m2(__VA_ARGS__)
+#define vssseg4e64_v_i64m2(...) __riscv_vssseg4e64_v_i64m2(__VA_ARGS__)
+#define vssseg2e64_v_i64m4(...) __riscv_vssseg2e64_v_i64m4(__VA_ARGS__)
+#define vssseg2e8_v_u8mf8(...) __riscv_vssseg2e8_v_u8mf8(__VA_ARGS__)
+#define vssseg3e8_v_u8mf8(...) __riscv_vssseg3e8_v_u8mf8(__VA_ARGS__)
+#define vssseg4e8_v_u8mf8(...) __riscv_vssseg4e8_v_u8mf8(__VA_ARGS__)
+#define vssseg5e8_v_u8mf8(...) __riscv_vssseg5e8_v_u8mf8(__VA_ARGS__)
+#define vssseg6e8_v_u8mf8(...) __riscv_vssseg6e8_v_u8mf8(__VA_ARGS__)
+#define vssseg7e8_v_u8mf8(...) __riscv_vssseg7e8_v_u8mf8(__VA_ARGS__)
+#define vssseg8e8_v_u8mf8(...) __riscv_vssseg8e8_v_u8mf8(__VA_ARGS__)
+#define vssseg2e8_v_u8mf4(...) __riscv_vssseg2e8_v_u8mf4(__VA_ARGS__)
+#define vssseg3e8_v_u8mf4(...) __riscv_vssseg3e8_v_u8mf4(__VA_ARGS__)
+#define vssseg4e8_v_u8mf4(...) __riscv_vssseg4e8_v_u8mf4(__VA_ARGS__)
+#define vssseg5e8_v_u8mf4(...) __riscv_vssseg5e8_v_u8mf4(__VA_ARGS__)
+#define vssseg6e8_v_u8mf4(...) __riscv_vssseg6e8_v_u8mf4(__VA_ARGS__)
+#define vssseg7e8_v_u8mf4(...) __riscv_vssseg7e8_v_u8mf4(__VA_ARGS__)
+#define vssseg8e8_v_u8mf4(...) __riscv_vssseg8e8_v_u8mf4(__VA_ARGS__)
+#define vssseg2e8_v_u8mf2(...) __riscv_vssseg2e8_v_u8mf2(__VA_ARGS__)
+#define vssseg3e8_v_u8mf2(...) __riscv_vssseg3e8_v_u8mf2(__VA_ARGS__)
+#define vssseg4e8_v_u8mf2(...) __riscv_vssseg4e8_v_u8mf2(__VA_ARGS__)
+#define vssseg5e8_v_u8mf2(...) __riscv_vssseg5e8_v_u8mf2(__VA_ARGS__)
+#define vssseg6e8_v_u8mf2(...) __riscv_vssseg6e8_v_u8mf2(__VA_ARGS__)
+#define vssseg7e8_v_u8mf2(...) __riscv_vssseg7e8_v_u8mf2(__VA_ARGS__)
+#define vssseg8e8_v_u8mf2(...) __riscv_vssseg8e8_v_u8mf2(__VA_ARGS__)
+#define vssseg2e8_v_u8m1(...) __riscv_vssseg2e8_v_u8m1(__VA_ARGS__)
+#define vssseg3e8_v_u8m1(...) __riscv_vssseg3e8_v_u8m1(__VA_ARGS__)
+#define vssseg4e8_v_u8m1(...) __riscv_vssseg4e8_v_u8m1(__VA_ARGS__)
+#define vssseg5e8_v_u8m1(...) __riscv_vssseg5e8_v_u8m1(__VA_ARGS__)
+#define vssseg6e8_v_u8m1(...) __riscv_vssseg6e8_v_u8m1(__VA_ARGS__)
+#define vssseg7e8_v_u8m1(...) __riscv_vssseg7e8_v_u8m1(__VA_ARGS__)
+#define vssseg8e8_v_u8m1(...) __riscv_vssseg8e8_v_u8m1(__VA_ARGS__)
+#define vssseg2e8_v_u8m2(...) __riscv_vssseg2e8_v_u8m2(__VA_ARGS__)
+#define vssseg3e8_v_u8m2(...) __riscv_vssseg3e8_v_u8m2(__VA_ARGS__)
+#define vssseg4e8_v_u8m2(...) __riscv_vssseg4e8_v_u8m2(__VA_ARGS__)
+#define vssseg2e8_v_u8m4(...) __riscv_vssseg2e8_v_u8m4(__VA_ARGS__)
+#define vssseg2e16_v_u16mf4(...) __riscv_vssseg2e16_v_u16mf4(__VA_ARGS__)
+#define vssseg3e16_v_u16mf4(...) __riscv_vssseg3e16_v_u16mf4(__VA_ARGS__)
+#define vssseg4e16_v_u16mf4(...) __riscv_vssseg4e16_v_u16mf4(__VA_ARGS__)
+#define vssseg5e16_v_u16mf4(...) __riscv_vssseg5e16_v_u16mf4(__VA_ARGS__)
+#define vssseg6e16_v_u16mf4(...) __riscv_vssseg6e16_v_u16mf4(__VA_ARGS__)
+#define vssseg7e16_v_u16mf4(...) __riscv_vssseg7e16_v_u16mf4(__VA_ARGS__)
+#define vssseg8e16_v_u16mf4(...) __riscv_vssseg8e16_v_u16mf4(__VA_ARGS__)
+#define vssseg2e16_v_u16mf2(...) __riscv_vssseg2e16_v_u16mf2(__VA_ARGS__)
+#define vssseg3e16_v_u16mf2(...) __riscv_vssseg3e16_v_u16mf2(__VA_ARGS__)
+#define vssseg4e16_v_u16mf2(...) __riscv_vssseg4e16_v_u16mf2(__VA_ARGS__)
+#define vssseg5e16_v_u16mf2(...) __riscv_vssseg5e16_v_u16mf2(__VA_ARGS__)
+#define vssseg6e16_v_u16mf2(...) __riscv_vssseg6e16_v_u16mf2(__VA_ARGS__)
+#define vssseg7e16_v_u16mf2(...) __riscv_vssseg7e16_v_u16mf2(__VA_ARGS__)
+#define vssseg8e16_v_u16mf2(...) __riscv_vssseg8e16_v_u16mf2(__VA_ARGS__)
+#define vssseg2e16_v_u16m1(...) __riscv_vssseg2e16_v_u16m1(__VA_ARGS__)
+#define vssseg3e16_v_u16m1(...) __riscv_vssseg3e16_v_u16m1(__VA_ARGS__)
+#define vssseg4e16_v_u16m1(...) __riscv_vssseg4e16_v_u16m1(__VA_ARGS__)
+#define vssseg5e16_v_u16m1(...) __riscv_vssseg5e16_v_u16m1(__VA_ARGS__)
+#define vssseg6e16_v_u16m1(...) __riscv_vssseg6e16_v_u16m1(__VA_ARGS__)
+#define vssseg7e16_v_u16m1(...) __riscv_vssseg7e16_v_u16m1(__VA_ARGS__)
+#define vssseg8e16_v_u16m1(...) __riscv_vssseg8e16_v_u16m1(__VA_ARGS__)
+#define vssseg2e16_v_u16m2(...) __riscv_vssseg2e16_v_u16m2(__VA_ARGS__)
+#define vssseg3e16_v_u16m2(...) __riscv_vssseg3e16_v_u16m2(__VA_ARGS__)
+#define vssseg4e16_v_u16m2(...) __riscv_vssseg4e16_v_u16m2(__VA_ARGS__)
+#define vssseg2e16_v_u16m4(...) __riscv_vssseg2e16_v_u16m4(__VA_ARGS__)
+#define vssseg2e32_v_u32mf2(...) __riscv_vssseg2e32_v_u32mf2(__VA_ARGS__)
+#define vssseg3e32_v_u32mf2(...) __riscv_vssseg3e32_v_u32mf2(__VA_ARGS__)
+#define vssseg4e32_v_u32mf2(...) __riscv_vssseg4e32_v_u32mf2(__VA_ARGS__)
+#define vssseg5e32_v_u32mf2(...) __riscv_vssseg5e32_v_u32mf2(__VA_ARGS__)
+#define vssseg6e32_v_u32mf2(...) __riscv_vssseg6e32_v_u32mf2(__VA_ARGS__)
+#define vssseg7e32_v_u32mf2(...) __riscv_vssseg7e32_v_u32mf2(__VA_ARGS__)
+#define vssseg8e32_v_u32mf2(...) __riscv_vssseg8e32_v_u32mf2(__VA_ARGS__)
+#define vssseg2e32_v_u32m1(...) __riscv_vssseg2e32_v_u32m1(__VA_ARGS__)
+#define vssseg3e32_v_u32m1(...) __riscv_vssseg3e32_v_u32m1(__VA_ARGS__)
+#define vssseg4e32_v_u32m1(...) __riscv_vssseg4e32_v_u32m1(__VA_ARGS__)
+#define vssseg5e32_v_u32m1(...) __riscv_vssseg5e32_v_u32m1(__VA_ARGS__)
+#define vssseg6e32_v_u32m1(...) __riscv_vssseg6e32_v_u32m1(__VA_ARGS__)
+#define vssseg7e32_v_u32m1(...) __riscv_vssseg7e32_v_u32m1(__VA_ARGS__)
+#define vssseg8e32_v_u32m1(...) __riscv_vssseg8e32_v_u32m1(__VA_ARGS__)
+#define vssseg2e32_v_u32m2(...) __riscv_vssseg2e32_v_u32m2(__VA_ARGS__)
+#define vssseg3e32_v_u32m2(...) __riscv_vssseg3e32_v_u32m2(__VA_ARGS__)
+#define vssseg4e32_v_u32m2(...) __riscv_vssseg4e32_v_u32m2(__VA_ARGS__)
+#define vssseg2e32_v_u32m4(...) __riscv_vssseg2e32_v_u32m4(__VA_ARGS__)
+#define vssseg2e64_v_u64m1(...) __riscv_vssseg2e64_v_u64m1(__VA_ARGS__)
+#define vssseg3e64_v_u64m1(...) __riscv_vssseg3e64_v_u64m1(__VA_ARGS__)
+#define vssseg4e64_v_u64m1(...) __riscv_vssseg4e64_v_u64m1(__VA_ARGS__)
+#define vssseg5e64_v_u64m1(...) __riscv_vssseg5e64_v_u64m1(__VA_ARGS__)
+#define vssseg6e64_v_u64m1(...) __riscv_vssseg6e64_v_u64m1(__VA_ARGS__)
+#define vssseg7e64_v_u64m1(...) __riscv_vssseg7e64_v_u64m1(__VA_ARGS__)
+#define vssseg8e64_v_u64m1(...) __riscv_vssseg8e64_v_u64m1(__VA_ARGS__)
+#define vssseg2e64_v_u64m2(...) __riscv_vssseg2e64_v_u64m2(__VA_ARGS__)
+#define vssseg3e64_v_u64m2(...) __riscv_vssseg3e64_v_u64m2(__VA_ARGS__)
+#define vssseg4e64_v_u64m2(...) __riscv_vssseg4e64_v_u64m2(__VA_ARGS__)
+#define vssseg2e64_v_u64m4(...) __riscv_vssseg2e64_v_u64m4(__VA_ARGS__)
+// masked functions: every *_m alias below forwards its arguments unchanged to the __riscv_-prefixed intrinsic of the same name
+#define vssseg2e16_v_f16mf4_m(...) __riscv_vssseg2e16_v_f16mf4_m(__VA_ARGS__) // floating-point variants (f16, f32, f64) come first
+#define vssseg3e16_v_f16mf4_m(...) __riscv_vssseg3e16_v_f16mf4_m(__VA_ARGS__)
+#define vssseg4e16_v_f16mf4_m(...) __riscv_vssseg4e16_v_f16mf4_m(__VA_ARGS__)
+#define vssseg5e16_v_f16mf4_m(...) __riscv_vssseg5e16_v_f16mf4_m(__VA_ARGS__)
+#define vssseg6e16_v_f16mf4_m(...) __riscv_vssseg6e16_v_f16mf4_m(__VA_ARGS__)
+#define vssseg7e16_v_f16mf4_m(...) __riscv_vssseg7e16_v_f16mf4_m(__VA_ARGS__)
+#define vssseg8e16_v_f16mf4_m(...) __riscv_vssseg8e16_v_f16mf4_m(__VA_ARGS__)
+#define vssseg2e16_v_f16mf2_m(...) __riscv_vssseg2e16_v_f16mf2_m(__VA_ARGS__)
+#define vssseg3e16_v_f16mf2_m(...) __riscv_vssseg3e16_v_f16mf2_m(__VA_ARGS__)
+#define vssseg4e16_v_f16mf2_m(...) __riscv_vssseg4e16_v_f16mf2_m(__VA_ARGS__)
+#define vssseg5e16_v_f16mf2_m(...) __riscv_vssseg5e16_v_f16mf2_m(__VA_ARGS__)
+#define vssseg6e16_v_f16mf2_m(...) __riscv_vssseg6e16_v_f16mf2_m(__VA_ARGS__)
+#define vssseg7e16_v_f16mf2_m(...) __riscv_vssseg7e16_v_f16mf2_m(__VA_ARGS__)
+#define vssseg8e16_v_f16mf2_m(...) __riscv_vssseg8e16_v_f16mf2_m(__VA_ARGS__)
+#define vssseg2e16_v_f16m1_m(...) __riscv_vssseg2e16_v_f16m1_m(__VA_ARGS__)
+#define vssseg3e16_v_f16m1_m(...) __riscv_vssseg3e16_v_f16m1_m(__VA_ARGS__)
+#define vssseg4e16_v_f16m1_m(...) __riscv_vssseg4e16_v_f16m1_m(__VA_ARGS__)
+#define vssseg5e16_v_f16m1_m(...) __riscv_vssseg5e16_v_f16m1_m(__VA_ARGS__)
+#define vssseg6e16_v_f16m1_m(...) __riscv_vssseg6e16_v_f16m1_m(__VA_ARGS__)
+#define vssseg7e16_v_f16m1_m(...) __riscv_vssseg7e16_v_f16m1_m(__VA_ARGS__)
+#define vssseg8e16_v_f16m1_m(...) __riscv_vssseg8e16_v_f16m1_m(__VA_ARGS__)
+#define vssseg2e16_v_f16m2_m(...) __riscv_vssseg2e16_v_f16m2_m(__VA_ARGS__)
+#define vssseg3e16_v_f16m2_m(...) __riscv_vssseg3e16_v_f16m2_m(__VA_ARGS__)
+#define vssseg4e16_v_f16m2_m(...) __riscv_vssseg4e16_v_f16m2_m(__VA_ARGS__)
+#define vssseg2e16_v_f16m4_m(...) __riscv_vssseg2e16_v_f16m4_m(__VA_ARGS__)
+#define vssseg2e32_v_f32mf2_m(...) __riscv_vssseg2e32_v_f32mf2_m(__VA_ARGS__)
+#define vssseg3e32_v_f32mf2_m(...) __riscv_vssseg3e32_v_f32mf2_m(__VA_ARGS__)
+#define vssseg4e32_v_f32mf2_m(...) __riscv_vssseg4e32_v_f32mf2_m(__VA_ARGS__)
+#define vssseg5e32_v_f32mf2_m(...) __riscv_vssseg5e32_v_f32mf2_m(__VA_ARGS__)
+#define vssseg6e32_v_f32mf2_m(...) __riscv_vssseg6e32_v_f32mf2_m(__VA_ARGS__)
+#define vssseg7e32_v_f32mf2_m(...) __riscv_vssseg7e32_v_f32mf2_m(__VA_ARGS__)
+#define vssseg8e32_v_f32mf2_m(...) __riscv_vssseg8e32_v_f32mf2_m(__VA_ARGS__)
+#define vssseg2e32_v_f32m1_m(...) __riscv_vssseg2e32_v_f32m1_m(__VA_ARGS__)
+#define vssseg3e32_v_f32m1_m(...) __riscv_vssseg3e32_v_f32m1_m(__VA_ARGS__)
+#define vssseg4e32_v_f32m1_m(...) __riscv_vssseg4e32_v_f32m1_m(__VA_ARGS__)
+#define vssseg5e32_v_f32m1_m(...) __riscv_vssseg5e32_v_f32m1_m(__VA_ARGS__)
+#define vssseg6e32_v_f32m1_m(...) __riscv_vssseg6e32_v_f32m1_m(__VA_ARGS__)
+#define vssseg7e32_v_f32m1_m(...) __riscv_vssseg7e32_v_f32m1_m(__VA_ARGS__)
+#define vssseg8e32_v_f32m1_m(...) __riscv_vssseg8e32_v_f32m1_m(__VA_ARGS__)
+#define vssseg2e32_v_f32m2_m(...) __riscv_vssseg2e32_v_f32m2_m(__VA_ARGS__)
+#define vssseg3e32_v_f32m2_m(...) __riscv_vssseg3e32_v_f32m2_m(__VA_ARGS__)
+#define vssseg4e32_v_f32m2_m(...) __riscv_vssseg4e32_v_f32m2_m(__VA_ARGS__)
+#define vssseg2e32_v_f32m4_m(...) __riscv_vssseg2e32_v_f32m4_m(__VA_ARGS__)
+#define vssseg2e64_v_f64m1_m(...) __riscv_vssseg2e64_v_f64m1_m(__VA_ARGS__)
+#define vssseg3e64_v_f64m1_m(...) __riscv_vssseg3e64_v_f64m1_m(__VA_ARGS__)
+#define vssseg4e64_v_f64m1_m(...) __riscv_vssseg4e64_v_f64m1_m(__VA_ARGS__)
+#define vssseg5e64_v_f64m1_m(...) __riscv_vssseg5e64_v_f64m1_m(__VA_ARGS__)
+#define vssseg6e64_v_f64m1_m(...) __riscv_vssseg6e64_v_f64m1_m(__VA_ARGS__)
+#define vssseg7e64_v_f64m1_m(...) __riscv_vssseg7e64_v_f64m1_m(__VA_ARGS__)
+#define vssseg8e64_v_f64m1_m(...) __riscv_vssseg8e64_v_f64m1_m(__VA_ARGS__)
+#define vssseg2e64_v_f64m2_m(...) __riscv_vssseg2e64_v_f64m2_m(__VA_ARGS__)
+#define vssseg3e64_v_f64m2_m(...) __riscv_vssseg3e64_v_f64m2_m(__VA_ARGS__)
+#define vssseg4e64_v_f64m2_m(...) __riscv_vssseg4e64_v_f64m2_m(__VA_ARGS__)
+#define vssseg2e64_v_f64m4_m(...) __riscv_vssseg2e64_v_f64m4_m(__VA_ARGS__)
+#define vssseg2e8_v_i8mf8_m(...) __riscv_vssseg2e8_v_i8mf8_m(__VA_ARGS__) // masked (_m) signed-integer aliases, i8 through i64; arguments pass through unchanged to the __riscv_ name
+#define vssseg3e8_v_i8mf8_m(...) __riscv_vssseg3e8_v_i8mf8_m(__VA_ARGS__)
+#define vssseg4e8_v_i8mf8_m(...) __riscv_vssseg4e8_v_i8mf8_m(__VA_ARGS__)
+#define vssseg5e8_v_i8mf8_m(...) __riscv_vssseg5e8_v_i8mf8_m(__VA_ARGS__)
+#define vssseg6e8_v_i8mf8_m(...) __riscv_vssseg6e8_v_i8mf8_m(__VA_ARGS__)
+#define vssseg7e8_v_i8mf8_m(...) __riscv_vssseg7e8_v_i8mf8_m(__VA_ARGS__)
+#define vssseg8e8_v_i8mf8_m(...) __riscv_vssseg8e8_v_i8mf8_m(__VA_ARGS__)
+#define vssseg2e8_v_i8mf4_m(...) __riscv_vssseg2e8_v_i8mf4_m(__VA_ARGS__)
+#define vssseg3e8_v_i8mf4_m(...) __riscv_vssseg3e8_v_i8mf4_m(__VA_ARGS__)
+#define vssseg4e8_v_i8mf4_m(...) __riscv_vssseg4e8_v_i8mf4_m(__VA_ARGS__)
+#define vssseg5e8_v_i8mf4_m(...) __riscv_vssseg5e8_v_i8mf4_m(__VA_ARGS__)
+#define vssseg6e8_v_i8mf4_m(...) __riscv_vssseg6e8_v_i8mf4_m(__VA_ARGS__)
+#define vssseg7e8_v_i8mf4_m(...) __riscv_vssseg7e8_v_i8mf4_m(__VA_ARGS__)
+#define vssseg8e8_v_i8mf4_m(...) __riscv_vssseg8e8_v_i8mf4_m(__VA_ARGS__)
+#define vssseg2e8_v_i8mf2_m(...) __riscv_vssseg2e8_v_i8mf2_m(__VA_ARGS__)
+#define vssseg3e8_v_i8mf2_m(...) __riscv_vssseg3e8_v_i8mf2_m(__VA_ARGS__)
+#define vssseg4e8_v_i8mf2_m(...) __riscv_vssseg4e8_v_i8mf2_m(__VA_ARGS__)
+#define vssseg5e8_v_i8mf2_m(...) __riscv_vssseg5e8_v_i8mf2_m(__VA_ARGS__)
+#define vssseg6e8_v_i8mf2_m(...) __riscv_vssseg6e8_v_i8mf2_m(__VA_ARGS__)
+#define vssseg7e8_v_i8mf2_m(...) __riscv_vssseg7e8_v_i8mf2_m(__VA_ARGS__)
+#define vssseg8e8_v_i8mf2_m(...) __riscv_vssseg8e8_v_i8mf2_m(__VA_ARGS__)
+#define vssseg2e8_v_i8m1_m(...) __riscv_vssseg2e8_v_i8m1_m(__VA_ARGS__)
+#define vssseg3e8_v_i8m1_m(...) __riscv_vssseg3e8_v_i8m1_m(__VA_ARGS__)
+#define vssseg4e8_v_i8m1_m(...) __riscv_vssseg4e8_v_i8m1_m(__VA_ARGS__)
+#define vssseg5e8_v_i8m1_m(...) __riscv_vssseg5e8_v_i8m1_m(__VA_ARGS__)
+#define vssseg6e8_v_i8m1_m(...) __riscv_vssseg6e8_v_i8m1_m(__VA_ARGS__)
+#define vssseg7e8_v_i8m1_m(...) __riscv_vssseg7e8_v_i8m1_m(__VA_ARGS__)
+#define vssseg8e8_v_i8m1_m(...) __riscv_vssseg8e8_v_i8m1_m(__VA_ARGS__)
+#define vssseg2e8_v_i8m2_m(...) __riscv_vssseg2e8_v_i8m2_m(__VA_ARGS__)
+#define vssseg3e8_v_i8m2_m(...) __riscv_vssseg3e8_v_i8m2_m(__VA_ARGS__)
+#define vssseg4e8_v_i8m2_m(...) __riscv_vssseg4e8_v_i8m2_m(__VA_ARGS__)
+#define vssseg2e8_v_i8m4_m(...) __riscv_vssseg2e8_v_i8m4_m(__VA_ARGS__)
+#define vssseg2e16_v_i16mf4_m(...) __riscv_vssseg2e16_v_i16mf4_m(__VA_ARGS__)
+#define vssseg3e16_v_i16mf4_m(...) __riscv_vssseg3e16_v_i16mf4_m(__VA_ARGS__)
+#define vssseg4e16_v_i16mf4_m(...) __riscv_vssseg4e16_v_i16mf4_m(__VA_ARGS__)
+#define vssseg5e16_v_i16mf4_m(...) __riscv_vssseg5e16_v_i16mf4_m(__VA_ARGS__)
+#define vssseg6e16_v_i16mf4_m(...) __riscv_vssseg6e16_v_i16mf4_m(__VA_ARGS__)
+#define vssseg7e16_v_i16mf4_m(...) __riscv_vssseg7e16_v_i16mf4_m(__VA_ARGS__)
+#define vssseg8e16_v_i16mf4_m(...) __riscv_vssseg8e16_v_i16mf4_m(__VA_ARGS__)
+#define vssseg2e16_v_i16mf2_m(...) __riscv_vssseg2e16_v_i16mf2_m(__VA_ARGS__)
+#define vssseg3e16_v_i16mf2_m(...) __riscv_vssseg3e16_v_i16mf2_m(__VA_ARGS__)
+#define vssseg4e16_v_i16mf2_m(...) __riscv_vssseg4e16_v_i16mf2_m(__VA_ARGS__)
+#define vssseg5e16_v_i16mf2_m(...) __riscv_vssseg5e16_v_i16mf2_m(__VA_ARGS__)
+#define vssseg6e16_v_i16mf2_m(...) __riscv_vssseg6e16_v_i16mf2_m(__VA_ARGS__)
+#define vssseg7e16_v_i16mf2_m(...) __riscv_vssseg7e16_v_i16mf2_m(__VA_ARGS__)
+#define vssseg8e16_v_i16mf2_m(...) __riscv_vssseg8e16_v_i16mf2_m(__VA_ARGS__)
+#define vssseg2e16_v_i16m1_m(...) __riscv_vssseg2e16_v_i16m1_m(__VA_ARGS__)
+#define vssseg3e16_v_i16m1_m(...) __riscv_vssseg3e16_v_i16m1_m(__VA_ARGS__)
+#define vssseg4e16_v_i16m1_m(...) __riscv_vssseg4e16_v_i16m1_m(__VA_ARGS__)
+#define vssseg5e16_v_i16m1_m(...) __riscv_vssseg5e16_v_i16m1_m(__VA_ARGS__)
+#define vssseg6e16_v_i16m1_m(...) __riscv_vssseg6e16_v_i16m1_m(__VA_ARGS__)
+#define vssseg7e16_v_i16m1_m(...) __riscv_vssseg7e16_v_i16m1_m(__VA_ARGS__)
+#define vssseg8e16_v_i16m1_m(...) __riscv_vssseg8e16_v_i16m1_m(__VA_ARGS__)
+#define vssseg2e16_v_i16m2_m(...) __riscv_vssseg2e16_v_i16m2_m(__VA_ARGS__)
+#define vssseg3e16_v_i16m2_m(...) __riscv_vssseg3e16_v_i16m2_m(__VA_ARGS__)
+#define vssseg4e16_v_i16m2_m(...) __riscv_vssseg4e16_v_i16m2_m(__VA_ARGS__)
+#define vssseg2e16_v_i16m4_m(...) __riscv_vssseg2e16_v_i16m4_m(__VA_ARGS__)
+#define vssseg2e32_v_i32mf2_m(...) __riscv_vssseg2e32_v_i32mf2_m(__VA_ARGS__)
+#define vssseg3e32_v_i32mf2_m(...) __riscv_vssseg3e32_v_i32mf2_m(__VA_ARGS__)
+#define vssseg4e32_v_i32mf2_m(...) __riscv_vssseg4e32_v_i32mf2_m(__VA_ARGS__)
+#define vssseg5e32_v_i32mf2_m(...) __riscv_vssseg5e32_v_i32mf2_m(__VA_ARGS__)
+#define vssseg6e32_v_i32mf2_m(...) __riscv_vssseg6e32_v_i32mf2_m(__VA_ARGS__)
+#define vssseg7e32_v_i32mf2_m(...) __riscv_vssseg7e32_v_i32mf2_m(__VA_ARGS__)
+#define vssseg8e32_v_i32mf2_m(...) __riscv_vssseg8e32_v_i32mf2_m(__VA_ARGS__)
+#define vssseg2e32_v_i32m1_m(...) __riscv_vssseg2e32_v_i32m1_m(__VA_ARGS__)
+#define vssseg3e32_v_i32m1_m(...) __riscv_vssseg3e32_v_i32m1_m(__VA_ARGS__)
+#define vssseg4e32_v_i32m1_m(...) __riscv_vssseg4e32_v_i32m1_m(__VA_ARGS__)
+#define vssseg5e32_v_i32m1_m(...) __riscv_vssseg5e32_v_i32m1_m(__VA_ARGS__)
+#define vssseg6e32_v_i32m1_m(...) __riscv_vssseg6e32_v_i32m1_m(__VA_ARGS__)
+#define vssseg7e32_v_i32m1_m(...) __riscv_vssseg7e32_v_i32m1_m(__VA_ARGS__)
+#define vssseg8e32_v_i32m1_m(...) __riscv_vssseg8e32_v_i32m1_m(__VA_ARGS__)
+#define vssseg2e32_v_i32m2_m(...) __riscv_vssseg2e32_v_i32m2_m(__VA_ARGS__)
+#define vssseg3e32_v_i32m2_m(...) __riscv_vssseg3e32_v_i32m2_m(__VA_ARGS__)
+#define vssseg4e32_v_i32m2_m(...) __riscv_vssseg4e32_v_i32m2_m(__VA_ARGS__)
+#define vssseg2e32_v_i32m4_m(...) __riscv_vssseg2e32_v_i32m4_m(__VA_ARGS__)
+#define vssseg2e64_v_i64m1_m(...) __riscv_vssseg2e64_v_i64m1_m(__VA_ARGS__)
+#define vssseg3e64_v_i64m1_m(...) __riscv_vssseg3e64_v_i64m1_m(__VA_ARGS__)
+#define vssseg4e64_v_i64m1_m(...) __riscv_vssseg4e64_v_i64m1_m(__VA_ARGS__)
+#define vssseg5e64_v_i64m1_m(...) __riscv_vssseg5e64_v_i64m1_m(__VA_ARGS__)
+#define vssseg6e64_v_i64m1_m(...) __riscv_vssseg6e64_v_i64m1_m(__VA_ARGS__)
+#define vssseg7e64_v_i64m1_m(...) __riscv_vssseg7e64_v_i64m1_m(__VA_ARGS__)
+#define vssseg8e64_v_i64m1_m(...) __riscv_vssseg8e64_v_i64m1_m(__VA_ARGS__)
+#define vssseg2e64_v_i64m2_m(...) __riscv_vssseg2e64_v_i64m2_m(__VA_ARGS__)
+#define vssseg3e64_v_i64m2_m(...) __riscv_vssseg3e64_v_i64m2_m(__VA_ARGS__)
+#define vssseg4e64_v_i64m2_m(...) __riscv_vssseg4e64_v_i64m2_m(__VA_ARGS__)
+#define vssseg2e64_v_i64m4_m(...) __riscv_vssseg2e64_v_i64m4_m(__VA_ARGS__)
+#define vssseg2e8_v_u8mf8_m(...) __riscv_vssseg2e8_v_u8mf8_m(__VA_ARGS__) // masked (_m) unsigned-integer aliases, u8 through u64; arguments pass through unchanged to the __riscv_ name
+#define vssseg3e8_v_u8mf8_m(...) __riscv_vssseg3e8_v_u8mf8_m(__VA_ARGS__)
+#define vssseg4e8_v_u8mf8_m(...) __riscv_vssseg4e8_v_u8mf8_m(__VA_ARGS__)
+#define vssseg5e8_v_u8mf8_m(...) __riscv_vssseg5e8_v_u8mf8_m(__VA_ARGS__)
+#define vssseg6e8_v_u8mf8_m(...) __riscv_vssseg6e8_v_u8mf8_m(__VA_ARGS__)
+#define vssseg7e8_v_u8mf8_m(...) __riscv_vssseg7e8_v_u8mf8_m(__VA_ARGS__)
+#define vssseg8e8_v_u8mf8_m(...) __riscv_vssseg8e8_v_u8mf8_m(__VA_ARGS__)
+#define vssseg2e8_v_u8mf4_m(...) __riscv_vssseg2e8_v_u8mf4_m(__VA_ARGS__)
+#define vssseg3e8_v_u8mf4_m(...) __riscv_vssseg3e8_v_u8mf4_m(__VA_ARGS__)
+#define vssseg4e8_v_u8mf4_m(...) __riscv_vssseg4e8_v_u8mf4_m(__VA_ARGS__)
+#define vssseg5e8_v_u8mf4_m(...) __riscv_vssseg5e8_v_u8mf4_m(__VA_ARGS__)
+#define vssseg6e8_v_u8mf4_m(...) __riscv_vssseg6e8_v_u8mf4_m(__VA_ARGS__)
+#define vssseg7e8_v_u8mf4_m(...) __riscv_vssseg7e8_v_u8mf4_m(__VA_ARGS__)
+#define vssseg8e8_v_u8mf4_m(...) __riscv_vssseg8e8_v_u8mf4_m(__VA_ARGS__)
+#define vssseg2e8_v_u8mf2_m(...) __riscv_vssseg2e8_v_u8mf2_m(__VA_ARGS__)
+#define vssseg3e8_v_u8mf2_m(...) __riscv_vssseg3e8_v_u8mf2_m(__VA_ARGS__)
+#define vssseg4e8_v_u8mf2_m(...) __riscv_vssseg4e8_v_u8mf2_m(__VA_ARGS__)
+#define vssseg5e8_v_u8mf2_m(...) __riscv_vssseg5e8_v_u8mf2_m(__VA_ARGS__)
+#define vssseg6e8_v_u8mf2_m(...) __riscv_vssseg6e8_v_u8mf2_m(__VA_ARGS__)
+#define vssseg7e8_v_u8mf2_m(...) __riscv_vssseg7e8_v_u8mf2_m(__VA_ARGS__)
+#define vssseg8e8_v_u8mf2_m(...) __riscv_vssseg8e8_v_u8mf2_m(__VA_ARGS__)
+#define vssseg2e8_v_u8m1_m(...) __riscv_vssseg2e8_v_u8m1_m(__VA_ARGS__)
+#define vssseg3e8_v_u8m1_m(...) __riscv_vssseg3e8_v_u8m1_m(__VA_ARGS__)
+#define vssseg4e8_v_u8m1_m(...) __riscv_vssseg4e8_v_u8m1_m(__VA_ARGS__)
+#define vssseg5e8_v_u8m1_m(...) __riscv_vssseg5e8_v_u8m1_m(__VA_ARGS__)
+#define vssseg6e8_v_u8m1_m(...) __riscv_vssseg6e8_v_u8m1_m(__VA_ARGS__)
+#define vssseg7e8_v_u8m1_m(...) __riscv_vssseg7e8_v_u8m1_m(__VA_ARGS__)
+#define vssseg8e8_v_u8m1_m(...) __riscv_vssseg8e8_v_u8m1_m(__VA_ARGS__)
+#define vssseg2e8_v_u8m2_m(...) __riscv_vssseg2e8_v_u8m2_m(__VA_ARGS__)
+#define vssseg3e8_v_u8m2_m(...) __riscv_vssseg3e8_v_u8m2_m(__VA_ARGS__)
+#define vssseg4e8_v_u8m2_m(...) __riscv_vssseg4e8_v_u8m2_m(__VA_ARGS__)
+#define vssseg2e8_v_u8m4_m(...) __riscv_vssseg2e8_v_u8m4_m(__VA_ARGS__)
+#define vssseg2e16_v_u16mf4_m(...) __riscv_vssseg2e16_v_u16mf4_m(__VA_ARGS__)
+#define vssseg3e16_v_u16mf4_m(...) __riscv_vssseg3e16_v_u16mf4_m(__VA_ARGS__)
+#define vssseg4e16_v_u16mf4_m(...) __riscv_vssseg4e16_v_u16mf4_m(__VA_ARGS__)
+#define vssseg5e16_v_u16mf4_m(...) __riscv_vssseg5e16_v_u16mf4_m(__VA_ARGS__)
+#define vssseg6e16_v_u16mf4_m(...) __riscv_vssseg6e16_v_u16mf4_m(__VA_ARGS__)
+#define vssseg7e16_v_u16mf4_m(...) __riscv_vssseg7e16_v_u16mf4_m(__VA_ARGS__)
+#define vssseg8e16_v_u16mf4_m(...) __riscv_vssseg8e16_v_u16mf4_m(__VA_ARGS__)
+#define vssseg2e16_v_u16mf2_m(...) __riscv_vssseg2e16_v_u16mf2_m(__VA_ARGS__)
+#define vssseg3e16_v_u16mf2_m(...) __riscv_vssseg3e16_v_u16mf2_m(__VA_ARGS__)
+#define vssseg4e16_v_u16mf2_m(...) __riscv_vssseg4e16_v_u16mf2_m(__VA_ARGS__)
+#define vssseg5e16_v_u16mf2_m(...) __riscv_vssseg5e16_v_u16mf2_m(__VA_ARGS__)
+#define vssseg6e16_v_u16mf2_m(...) __riscv_vssseg6e16_v_u16mf2_m(__VA_ARGS__)
+#define vssseg7e16_v_u16mf2_m(...) __riscv_vssseg7e16_v_u16mf2_m(__VA_ARGS__)
+#define vssseg8e16_v_u16mf2_m(...) __riscv_vssseg8e16_v_u16mf2_m(__VA_ARGS__)
+#define vssseg2e16_v_u16m1_m(...) __riscv_vssseg2e16_v_u16m1_m(__VA_ARGS__)
+#define vssseg3e16_v_u16m1_m(...) __riscv_vssseg3e16_v_u16m1_m(__VA_ARGS__)
+#define vssseg4e16_v_u16m1_m(...) __riscv_vssseg4e16_v_u16m1_m(__VA_ARGS__)
+#define vssseg5e16_v_u16m1_m(...) __riscv_vssseg5e16_v_u16m1_m(__VA_ARGS__)
+#define vssseg6e16_v_u16m1_m(...) __riscv_vssseg6e16_v_u16m1_m(__VA_ARGS__)
+#define vssseg7e16_v_u16m1_m(...) __riscv_vssseg7e16_v_u16m1_m(__VA_ARGS__)
+#define vssseg8e16_v_u16m1_m(...) __riscv_vssseg8e16_v_u16m1_m(__VA_ARGS__)
+#define vssseg2e16_v_u16m2_m(...) __riscv_vssseg2e16_v_u16m2_m(__VA_ARGS__)
+#define vssseg3e16_v_u16m2_m(...) __riscv_vssseg3e16_v_u16m2_m(__VA_ARGS__)
+#define vssseg4e16_v_u16m2_m(...) __riscv_vssseg4e16_v_u16m2_m(__VA_ARGS__)
+#define vssseg2e16_v_u16m4_m(...) __riscv_vssseg2e16_v_u16m4_m(__VA_ARGS__)
+#define vssseg2e32_v_u32mf2_m(...) __riscv_vssseg2e32_v_u32mf2_m(__VA_ARGS__)
+#define vssseg3e32_v_u32mf2_m(...) __riscv_vssseg3e32_v_u32mf2_m(__VA_ARGS__)
+#define vssseg4e32_v_u32mf2_m(...) __riscv_vssseg4e32_v_u32mf2_m(__VA_ARGS__)
+#define vssseg5e32_v_u32mf2_m(...) __riscv_vssseg5e32_v_u32mf2_m(__VA_ARGS__)
+#define vssseg6e32_v_u32mf2_m(...) __riscv_vssseg6e32_v_u32mf2_m(__VA_ARGS__)
+#define vssseg7e32_v_u32mf2_m(...) __riscv_vssseg7e32_v_u32mf2_m(__VA_ARGS__)
+#define vssseg8e32_v_u32mf2_m(...) __riscv_vssseg8e32_v_u32mf2_m(__VA_ARGS__)
+#define vssseg2e32_v_u32m1_m(...) __riscv_vssseg2e32_v_u32m1_m(__VA_ARGS__)
+#define vssseg3e32_v_u32m1_m(...) __riscv_vssseg3e32_v_u32m1_m(__VA_ARGS__)
+#define vssseg4e32_v_u32m1_m(...) __riscv_vssseg4e32_v_u32m1_m(__VA_ARGS__)
+#define vssseg5e32_v_u32m1_m(...) __riscv_vssseg5e32_v_u32m1_m(__VA_ARGS__)
+#define vssseg6e32_v_u32m1_m(...) __riscv_vssseg6e32_v_u32m1_m(__VA_ARGS__)
+#define vssseg7e32_v_u32m1_m(...) __riscv_vssseg7e32_v_u32m1_m(__VA_ARGS__)
+#define vssseg8e32_v_u32m1_m(...) __riscv_vssseg8e32_v_u32m1_m(__VA_ARGS__)
+#define vssseg2e32_v_u32m2_m(...) __riscv_vssseg2e32_v_u32m2_m(__VA_ARGS__)
+#define vssseg3e32_v_u32m2_m(...) __riscv_vssseg3e32_v_u32m2_m(__VA_ARGS__)
+#define vssseg4e32_v_u32m2_m(...) __riscv_vssseg4e32_v_u32m2_m(__VA_ARGS__)
+#define vssseg2e32_v_u32m4_m(...) __riscv_vssseg2e32_v_u32m4_m(__VA_ARGS__)
+#define vssseg2e64_v_u64m1_m(...) __riscv_vssseg2e64_v_u64m1_m(__VA_ARGS__)
+#define vssseg3e64_v_u64m1_m(...) __riscv_vssseg3e64_v_u64m1_m(__VA_ARGS__)
+#define vssseg4e64_v_u64m1_m(...) __riscv_vssseg4e64_v_u64m1_m(__VA_ARGS__)
+#define vssseg5e64_v_u64m1_m(...) __riscv_vssseg5e64_v_u64m1_m(__VA_ARGS__)
+#define vssseg6e64_v_u64m1_m(...) __riscv_vssseg6e64_v_u64m1_m(__VA_ARGS__)
+#define vssseg7e64_v_u64m1_m(...) __riscv_vssseg7e64_v_u64m1_m(__VA_ARGS__)
+#define vssseg8e64_v_u64m1_m(...) __riscv_vssseg8e64_v_u64m1_m(__VA_ARGS__)
+#define vssseg2e64_v_u64m2_m(...) __riscv_vssseg2e64_v_u64m2_m(__VA_ARGS__)
+#define vssseg3e64_v_u64m2_m(...) __riscv_vssseg3e64_v_u64m2_m(__VA_ARGS__)
+#define vssseg4e64_v_u64m2_m(...) __riscv_vssseg4e64_v_u64m2_m(__VA_ARGS__)
+#define vssseg2e64_v_u64m4_m(...) __riscv_vssseg2e64_v_u64m4_m(__VA_ARGS__)
+#define vloxseg2ei8_v_f16mf4(...) __riscv_vloxseg2ei8_v_f16mf4(__VA_ARGS__) // no _m suffix: these f16 vloxseg aliases are not part of the masked functions group that precedes them; ei8/ei16/ei32/ei64 is presumably the index element width (TODO confirm)
+#define vloxseg3ei8_v_f16mf4(...) __riscv_vloxseg3ei8_v_f16mf4(__VA_ARGS__)
+#define vloxseg4ei8_v_f16mf4(...) __riscv_vloxseg4ei8_v_f16mf4(__VA_ARGS__)
+#define vloxseg5ei8_v_f16mf4(...) __riscv_vloxseg5ei8_v_f16mf4(__VA_ARGS__)
+#define vloxseg6ei8_v_f16mf4(...) __riscv_vloxseg6ei8_v_f16mf4(__VA_ARGS__)
+#define vloxseg7ei8_v_f16mf4(...) __riscv_vloxseg7ei8_v_f16mf4(__VA_ARGS__)
+#define vloxseg8ei8_v_f16mf4(...) __riscv_vloxseg8ei8_v_f16mf4(__VA_ARGS__)
+#define vloxseg2ei8_v_f16mf2(...) __riscv_vloxseg2ei8_v_f16mf2(__VA_ARGS__)
+#define vloxseg3ei8_v_f16mf2(...) __riscv_vloxseg3ei8_v_f16mf2(__VA_ARGS__)
+#define vloxseg4ei8_v_f16mf2(...) __riscv_vloxseg4ei8_v_f16mf2(__VA_ARGS__)
+#define vloxseg5ei8_v_f16mf2(...) __riscv_vloxseg5ei8_v_f16mf2(__VA_ARGS__)
+#define vloxseg6ei8_v_f16mf2(...) __riscv_vloxseg6ei8_v_f16mf2(__VA_ARGS__)
+#define vloxseg7ei8_v_f16mf2(...) __riscv_vloxseg7ei8_v_f16mf2(__VA_ARGS__)
+#define vloxseg8ei8_v_f16mf2(...) __riscv_vloxseg8ei8_v_f16mf2(__VA_ARGS__)
+#define vloxseg2ei8_v_f16m1(...) __riscv_vloxseg2ei8_v_f16m1(__VA_ARGS__)
+#define vloxseg3ei8_v_f16m1(...) __riscv_vloxseg3ei8_v_f16m1(__VA_ARGS__)
+#define vloxseg4ei8_v_f16m1(...) __riscv_vloxseg4ei8_v_f16m1(__VA_ARGS__)
+#define vloxseg5ei8_v_f16m1(...) __riscv_vloxseg5ei8_v_f16m1(__VA_ARGS__)
+#define vloxseg6ei8_v_f16m1(...) __riscv_vloxseg6ei8_v_f16m1(__VA_ARGS__)
+#define vloxseg7ei8_v_f16m1(...) __riscv_vloxseg7ei8_v_f16m1(__VA_ARGS__)
+#define vloxseg8ei8_v_f16m1(...) __riscv_vloxseg8ei8_v_f16m1(__VA_ARGS__)
+#define vloxseg2ei8_v_f16m2(...) __riscv_vloxseg2ei8_v_f16m2(__VA_ARGS__)
+#define vloxseg3ei8_v_f16m2(...) __riscv_vloxseg3ei8_v_f16m2(__VA_ARGS__)
+#define vloxseg4ei8_v_f16m2(...) __riscv_vloxseg4ei8_v_f16m2(__VA_ARGS__)
+#define vloxseg2ei8_v_f16m4(...) __riscv_vloxseg2ei8_v_f16m4(__VA_ARGS__)
+#define vloxseg2ei16_v_f16mf4(...) __riscv_vloxseg2ei16_v_f16mf4(__VA_ARGS__)
+#define vloxseg3ei16_v_f16mf4(...) __riscv_vloxseg3ei16_v_f16mf4(__VA_ARGS__)
+#define vloxseg4ei16_v_f16mf4(...) __riscv_vloxseg4ei16_v_f16mf4(__VA_ARGS__)
+#define vloxseg5ei16_v_f16mf4(...) __riscv_vloxseg5ei16_v_f16mf4(__VA_ARGS__)
+#define vloxseg6ei16_v_f16mf4(...) __riscv_vloxseg6ei16_v_f16mf4(__VA_ARGS__)
+#define vloxseg7ei16_v_f16mf4(...) __riscv_vloxseg7ei16_v_f16mf4(__VA_ARGS__)
+#define vloxseg8ei16_v_f16mf4(...) __riscv_vloxseg8ei16_v_f16mf4(__VA_ARGS__)
+#define vloxseg2ei16_v_f16mf2(...) __riscv_vloxseg2ei16_v_f16mf2(__VA_ARGS__)
+#define vloxseg3ei16_v_f16mf2(...) __riscv_vloxseg3ei16_v_f16mf2(__VA_ARGS__)
+#define vloxseg4ei16_v_f16mf2(...) __riscv_vloxseg4ei16_v_f16mf2(__VA_ARGS__)
+#define vloxseg5ei16_v_f16mf2(...) __riscv_vloxseg5ei16_v_f16mf2(__VA_ARGS__)
+#define vloxseg6ei16_v_f16mf2(...) __riscv_vloxseg6ei16_v_f16mf2(__VA_ARGS__)
+#define vloxseg7ei16_v_f16mf2(...) __riscv_vloxseg7ei16_v_f16mf2(__VA_ARGS__)
+#define vloxseg8ei16_v_f16mf2(...) __riscv_vloxseg8ei16_v_f16mf2(__VA_ARGS__)
+#define vloxseg2ei16_v_f16m1(...) __riscv_vloxseg2ei16_v_f16m1(__VA_ARGS__)
+#define vloxseg3ei16_v_f16m1(...) __riscv_vloxseg3ei16_v_f16m1(__VA_ARGS__)
+#define vloxseg4ei16_v_f16m1(...) __riscv_vloxseg4ei16_v_f16m1(__VA_ARGS__)
+#define vloxseg5ei16_v_f16m1(...) __riscv_vloxseg5ei16_v_f16m1(__VA_ARGS__)
+#define vloxseg6ei16_v_f16m1(...) __riscv_vloxseg6ei16_v_f16m1(__VA_ARGS__)
+#define vloxseg7ei16_v_f16m1(...) __riscv_vloxseg7ei16_v_f16m1(__VA_ARGS__)
+#define vloxseg8ei16_v_f16m1(...) __riscv_vloxseg8ei16_v_f16m1(__VA_ARGS__)
+#define vloxseg2ei16_v_f16m2(...) __riscv_vloxseg2ei16_v_f16m2(__VA_ARGS__)
+#define vloxseg3ei16_v_f16m2(...) __riscv_vloxseg3ei16_v_f16m2(__VA_ARGS__)
+#define vloxseg4ei16_v_f16m2(...) __riscv_vloxseg4ei16_v_f16m2(__VA_ARGS__)
+#define vloxseg2ei16_v_f16m4(...) __riscv_vloxseg2ei16_v_f16m4(__VA_ARGS__)
+#define vloxseg2ei32_v_f16mf4(...) __riscv_vloxseg2ei32_v_f16mf4(__VA_ARGS__)
+#define vloxseg3ei32_v_f16mf4(...) __riscv_vloxseg3ei32_v_f16mf4(__VA_ARGS__)
+#define vloxseg4ei32_v_f16mf4(...) __riscv_vloxseg4ei32_v_f16mf4(__VA_ARGS__)
+#define vloxseg5ei32_v_f16mf4(...) __riscv_vloxseg5ei32_v_f16mf4(__VA_ARGS__)
+#define vloxseg6ei32_v_f16mf4(...) __riscv_vloxseg6ei32_v_f16mf4(__VA_ARGS__)
+#define vloxseg7ei32_v_f16mf4(...) __riscv_vloxseg7ei32_v_f16mf4(__VA_ARGS__)
+#define vloxseg8ei32_v_f16mf4(...) __riscv_vloxseg8ei32_v_f16mf4(__VA_ARGS__)
+#define vloxseg2ei32_v_f16mf2(...) __riscv_vloxseg2ei32_v_f16mf2(__VA_ARGS__)
+#define vloxseg3ei32_v_f16mf2(...) __riscv_vloxseg3ei32_v_f16mf2(__VA_ARGS__)
+#define vloxseg4ei32_v_f16mf2(...) __riscv_vloxseg4ei32_v_f16mf2(__VA_ARGS__)
+#define vloxseg5ei32_v_f16mf2(...) __riscv_vloxseg5ei32_v_f16mf2(__VA_ARGS__)
+#define vloxseg6ei32_v_f16mf2(...) __riscv_vloxseg6ei32_v_f16mf2(__VA_ARGS__)
+#define vloxseg7ei32_v_f16mf2(...) __riscv_vloxseg7ei32_v_f16mf2(__VA_ARGS__)
+#define vloxseg8ei32_v_f16mf2(...) __riscv_vloxseg8ei32_v_f16mf2(__VA_ARGS__)
+#define vloxseg2ei32_v_f16m1(...) __riscv_vloxseg2ei32_v_f16m1(__VA_ARGS__)
+#define vloxseg3ei32_v_f16m1(...) __riscv_vloxseg3ei32_v_f16m1(__VA_ARGS__)
+#define vloxseg4ei32_v_f16m1(...) __riscv_vloxseg4ei32_v_f16m1(__VA_ARGS__)
+#define vloxseg5ei32_v_f16m1(...) __riscv_vloxseg5ei32_v_f16m1(__VA_ARGS__)
+#define vloxseg6ei32_v_f16m1(...) __riscv_vloxseg6ei32_v_f16m1(__VA_ARGS__)
+#define vloxseg7ei32_v_f16m1(...) __riscv_vloxseg7ei32_v_f16m1(__VA_ARGS__)
+#define vloxseg8ei32_v_f16m1(...) __riscv_vloxseg8ei32_v_f16m1(__VA_ARGS__)
+#define vloxseg2ei32_v_f16m2(...) __riscv_vloxseg2ei32_v_f16m2(__VA_ARGS__)
+#define vloxseg3ei32_v_f16m2(...) __riscv_vloxseg3ei32_v_f16m2(__VA_ARGS__)
+#define vloxseg4ei32_v_f16m2(...) __riscv_vloxseg4ei32_v_f16m2(__VA_ARGS__)
+#define vloxseg2ei32_v_f16m4(...) __riscv_vloxseg2ei32_v_f16m4(__VA_ARGS__)
+#define vloxseg2ei64_v_f16mf4(...) __riscv_vloxseg2ei64_v_f16mf4(__VA_ARGS__)
+#define vloxseg3ei64_v_f16mf4(...) __riscv_vloxseg3ei64_v_f16mf4(__VA_ARGS__)
+#define vloxseg4ei64_v_f16mf4(...) __riscv_vloxseg4ei64_v_f16mf4(__VA_ARGS__)
+#define vloxseg5ei64_v_f16mf4(...) __riscv_vloxseg5ei64_v_f16mf4(__VA_ARGS__)
+#define vloxseg6ei64_v_f16mf4(...) __riscv_vloxseg6ei64_v_f16mf4(__VA_ARGS__)
+#define vloxseg7ei64_v_f16mf4(...) __riscv_vloxseg7ei64_v_f16mf4(__VA_ARGS__)
+#define vloxseg8ei64_v_f16mf4(...) __riscv_vloxseg8ei64_v_f16mf4(__VA_ARGS__)
+#define vloxseg2ei64_v_f16mf2(...) __riscv_vloxseg2ei64_v_f16mf2(__VA_ARGS__)
+#define vloxseg3ei64_v_f16mf2(...) __riscv_vloxseg3ei64_v_f16mf2(__VA_ARGS__)
+#define vloxseg4ei64_v_f16mf2(...) __riscv_vloxseg4ei64_v_f16mf2(__VA_ARGS__)
+#define vloxseg5ei64_v_f16mf2(...) __riscv_vloxseg5ei64_v_f16mf2(__VA_ARGS__)
+#define vloxseg6ei64_v_f16mf2(...) __riscv_vloxseg6ei64_v_f16mf2(__VA_ARGS__)
+#define vloxseg7ei64_v_f16mf2(...) __riscv_vloxseg7ei64_v_f16mf2(__VA_ARGS__)
+#define vloxseg8ei64_v_f16mf2(...) __riscv_vloxseg8ei64_v_f16mf2(__VA_ARGS__)
+#define vloxseg2ei64_v_f16m1(...) __riscv_vloxseg2ei64_v_f16m1(__VA_ARGS__)
+#define vloxseg3ei64_v_f16m1(...) __riscv_vloxseg3ei64_v_f16m1(__VA_ARGS__)
+#define vloxseg4ei64_v_f16m1(...) __riscv_vloxseg4ei64_v_f16m1(__VA_ARGS__)
+#define vloxseg5ei64_v_f16m1(...) __riscv_vloxseg5ei64_v_f16m1(__VA_ARGS__)
+#define vloxseg6ei64_v_f16m1(...) __riscv_vloxseg6ei64_v_f16m1(__VA_ARGS__)
+#define vloxseg7ei64_v_f16m1(...) __riscv_vloxseg7ei64_v_f16m1(__VA_ARGS__)
+#define vloxseg8ei64_v_f16m1(...) __riscv_vloxseg8ei64_v_f16m1(__VA_ARGS__)
+#define vloxseg2ei64_v_f16m2(...) __riscv_vloxseg2ei64_v_f16m2(__VA_ARGS__)
+#define vloxseg3ei64_v_f16m2(...) __riscv_vloxseg3ei64_v_f16m2(__VA_ARGS__)
+#define vloxseg4ei64_v_f16m2(...) __riscv_vloxseg4ei64_v_f16m2(__VA_ARGS__)
+#define vloxseg2ei8_v_f32mf2(...) __riscv_vloxseg2ei8_v_f32mf2(__VA_ARGS__) // f32 variants of the vloxseg pass-through aliases (mf2, m1, m2 and m4 suffixes); arguments are forwarded unchanged
+#define vloxseg3ei8_v_f32mf2(...) __riscv_vloxseg3ei8_v_f32mf2(__VA_ARGS__)
+#define vloxseg4ei8_v_f32mf2(...) __riscv_vloxseg4ei8_v_f32mf2(__VA_ARGS__)
+#define vloxseg5ei8_v_f32mf2(...) __riscv_vloxseg5ei8_v_f32mf2(__VA_ARGS__)
+#define vloxseg6ei8_v_f32mf2(...) __riscv_vloxseg6ei8_v_f32mf2(__VA_ARGS__)
+#define vloxseg7ei8_v_f32mf2(...) __riscv_vloxseg7ei8_v_f32mf2(__VA_ARGS__)
+#define vloxseg8ei8_v_f32mf2(...) __riscv_vloxseg8ei8_v_f32mf2(__VA_ARGS__)
+#define vloxseg2ei8_v_f32m1(...) __riscv_vloxseg2ei8_v_f32m1(__VA_ARGS__)
+#define vloxseg3ei8_v_f32m1(...) __riscv_vloxseg3ei8_v_f32m1(__VA_ARGS__)
+#define vloxseg4ei8_v_f32m1(...) __riscv_vloxseg4ei8_v_f32m1(__VA_ARGS__)
+#define vloxseg5ei8_v_f32m1(...) __riscv_vloxseg5ei8_v_f32m1(__VA_ARGS__)
+#define vloxseg6ei8_v_f32m1(...) __riscv_vloxseg6ei8_v_f32m1(__VA_ARGS__)
+#define vloxseg7ei8_v_f32m1(...) __riscv_vloxseg7ei8_v_f32m1(__VA_ARGS__)
+#define vloxseg8ei8_v_f32m1(...) __riscv_vloxseg8ei8_v_f32m1(__VA_ARGS__)
+#define vloxseg2ei8_v_f32m2(...) __riscv_vloxseg2ei8_v_f32m2(__VA_ARGS__)
+#define vloxseg3ei8_v_f32m2(...) __riscv_vloxseg3ei8_v_f32m2(__VA_ARGS__)
+#define vloxseg4ei8_v_f32m2(...) __riscv_vloxseg4ei8_v_f32m2(__VA_ARGS__)
+#define vloxseg2ei8_v_f32m4(...) __riscv_vloxseg2ei8_v_f32m4(__VA_ARGS__)
+#define vloxseg2ei16_v_f32mf2(...) __riscv_vloxseg2ei16_v_f32mf2(__VA_ARGS__)
+#define vloxseg3ei16_v_f32mf2(...) __riscv_vloxseg3ei16_v_f32mf2(__VA_ARGS__)
+#define vloxseg4ei16_v_f32mf2(...) __riscv_vloxseg4ei16_v_f32mf2(__VA_ARGS__)
+#define vloxseg5ei16_v_f32mf2(...) __riscv_vloxseg5ei16_v_f32mf2(__VA_ARGS__)
+#define vloxseg6ei16_v_f32mf2(...) __riscv_vloxseg6ei16_v_f32mf2(__VA_ARGS__)
+#define vloxseg7ei16_v_f32mf2(...) __riscv_vloxseg7ei16_v_f32mf2(__VA_ARGS__)
+#define vloxseg8ei16_v_f32mf2(...) __riscv_vloxseg8ei16_v_f32mf2(__VA_ARGS__)
+#define vloxseg2ei16_v_f32m1(...) __riscv_vloxseg2ei16_v_f32m1(__VA_ARGS__)
+#define vloxseg3ei16_v_f32m1(...) __riscv_vloxseg3ei16_v_f32m1(__VA_ARGS__)
+#define vloxseg4ei16_v_f32m1(...) __riscv_vloxseg4ei16_v_f32m1(__VA_ARGS__)
+#define vloxseg5ei16_v_f32m1(...) __riscv_vloxseg5ei16_v_f32m1(__VA_ARGS__)
+#define vloxseg6ei16_v_f32m1(...) __riscv_vloxseg6ei16_v_f32m1(__VA_ARGS__)
+#define vloxseg7ei16_v_f32m1(...) __riscv_vloxseg7ei16_v_f32m1(__VA_ARGS__)
+#define vloxseg8ei16_v_f32m1(...) __riscv_vloxseg8ei16_v_f32m1(__VA_ARGS__)
+#define vloxseg2ei16_v_f32m2(...) __riscv_vloxseg2ei16_v_f32m2(__VA_ARGS__)
+#define vloxseg3ei16_v_f32m2(...) __riscv_vloxseg3ei16_v_f32m2(__VA_ARGS__)
+#define vloxseg4ei16_v_f32m2(...) __riscv_vloxseg4ei16_v_f32m2(__VA_ARGS__)
+#define vloxseg2ei16_v_f32m4(...) __riscv_vloxseg2ei16_v_f32m4(__VA_ARGS__)
+#define vloxseg2ei32_v_f32mf2(...) __riscv_vloxseg2ei32_v_f32mf2(__VA_ARGS__)
+#define vloxseg3ei32_v_f32mf2(...) __riscv_vloxseg3ei32_v_f32mf2(__VA_ARGS__)
+#define vloxseg4ei32_v_f32mf2(...) __riscv_vloxseg4ei32_v_f32mf2(__VA_ARGS__)
+#define vloxseg5ei32_v_f32mf2(...) __riscv_vloxseg5ei32_v_f32mf2(__VA_ARGS__)
+#define vloxseg6ei32_v_f32mf2(...) __riscv_vloxseg6ei32_v_f32mf2(__VA_ARGS__)
+#define vloxseg7ei32_v_f32mf2(...) __riscv_vloxseg7ei32_v_f32mf2(__VA_ARGS__)
+#define vloxseg8ei32_v_f32mf2(...) __riscv_vloxseg8ei32_v_f32mf2(__VA_ARGS__)
+#define vloxseg2ei32_v_f32m1(...) __riscv_vloxseg2ei32_v_f32m1(__VA_ARGS__)
+#define vloxseg3ei32_v_f32m1(...) __riscv_vloxseg3ei32_v_f32m1(__VA_ARGS__)
+#define vloxseg4ei32_v_f32m1(...) __riscv_vloxseg4ei32_v_f32m1(__VA_ARGS__)
+#define vloxseg5ei32_v_f32m1(...) __riscv_vloxseg5ei32_v_f32m1(__VA_ARGS__)
+#define vloxseg6ei32_v_f32m1(...) __riscv_vloxseg6ei32_v_f32m1(__VA_ARGS__)
+#define vloxseg7ei32_v_f32m1(...) __riscv_vloxseg7ei32_v_f32m1(__VA_ARGS__)
+#define vloxseg8ei32_v_f32m1(...) __riscv_vloxseg8ei32_v_f32m1(__VA_ARGS__)
+#define vloxseg2ei32_v_f32m2(...) __riscv_vloxseg2ei32_v_f32m2(__VA_ARGS__)
+#define vloxseg3ei32_v_f32m2(...) __riscv_vloxseg3ei32_v_f32m2(__VA_ARGS__)
+#define vloxseg4ei32_v_f32m2(...) __riscv_vloxseg4ei32_v_f32m2(__VA_ARGS__)
+#define vloxseg2ei32_v_f32m4(...) __riscv_vloxseg2ei32_v_f32m4(__VA_ARGS__)
+#define vloxseg2ei64_v_f32mf2(...) __riscv_vloxseg2ei64_v_f32mf2(__VA_ARGS__)
+#define vloxseg3ei64_v_f32mf2(...) __riscv_vloxseg3ei64_v_f32mf2(__VA_ARGS__)
+#define vloxseg4ei64_v_f32mf2(...) __riscv_vloxseg4ei64_v_f32mf2(__VA_ARGS__)
+#define vloxseg5ei64_v_f32mf2(...) __riscv_vloxseg5ei64_v_f32mf2(__VA_ARGS__)
+#define vloxseg6ei64_v_f32mf2(...) __riscv_vloxseg6ei64_v_f32mf2(__VA_ARGS__)
+#define vloxseg7ei64_v_f32mf2(...) __riscv_vloxseg7ei64_v_f32mf2(__VA_ARGS__)
+#define vloxseg8ei64_v_f32mf2(...) __riscv_vloxseg8ei64_v_f32mf2(__VA_ARGS__)
+#define vloxseg2ei64_v_f32m1(...) __riscv_vloxseg2ei64_v_f32m1(__VA_ARGS__)
+#define vloxseg3ei64_v_f32m1(...) __riscv_vloxseg3ei64_v_f32m1(__VA_ARGS__)
+#define vloxseg4ei64_v_f32m1(...) __riscv_vloxseg4ei64_v_f32m1(__VA_ARGS__)
+#define vloxseg5ei64_v_f32m1(...) __riscv_vloxseg5ei64_v_f32m1(__VA_ARGS__)
+#define vloxseg6ei64_v_f32m1(...) __riscv_vloxseg6ei64_v_f32m1(__VA_ARGS__)
+#define vloxseg7ei64_v_f32m1(...) __riscv_vloxseg7ei64_v_f32m1(__VA_ARGS__)
+#define vloxseg8ei64_v_f32m1(...) __riscv_vloxseg8ei64_v_f32m1(__VA_ARGS__)
+#define vloxseg2ei64_v_f32m2(...) __riscv_vloxseg2ei64_v_f32m2(__VA_ARGS__)
+#define vloxseg3ei64_v_f32m2(...) __riscv_vloxseg3ei64_v_f32m2(__VA_ARGS__)
+#define vloxseg4ei64_v_f32m2(...) __riscv_vloxseg4ei64_v_f32m2(__VA_ARGS__)
+#define vloxseg2ei64_v_f32m4(...) __riscv_vloxseg2ei64_v_f32m4(__VA_ARGS__)
+#define vloxseg2ei8_v_f64m1(...) __riscv_vloxseg2ei8_v_f64m1(__VA_ARGS__) // f64 variants of the vloxseg pass-through aliases; arguments are forwarded unchanged to the __riscv_ name
+#define vloxseg3ei8_v_f64m1(...) __riscv_vloxseg3ei8_v_f64m1(__VA_ARGS__)
+#define vloxseg4ei8_v_f64m1(...) __riscv_vloxseg4ei8_v_f64m1(__VA_ARGS__)
+#define vloxseg5ei8_v_f64m1(...) __riscv_vloxseg5ei8_v_f64m1(__VA_ARGS__)
+#define vloxseg6ei8_v_f64m1(...) __riscv_vloxseg6ei8_v_f64m1(__VA_ARGS__)
+#define vloxseg7ei8_v_f64m1(...) __riscv_vloxseg7ei8_v_f64m1(__VA_ARGS__)
+#define vloxseg8ei8_v_f64m1(...) __riscv_vloxseg8ei8_v_f64m1(__VA_ARGS__)
+#define vloxseg2ei8_v_f64m2(...) __riscv_vloxseg2ei8_v_f64m2(__VA_ARGS__)
+#define vloxseg3ei8_v_f64m2(...) __riscv_vloxseg3ei8_v_f64m2(__VA_ARGS__)
+#define vloxseg4ei8_v_f64m2(...) __riscv_vloxseg4ei8_v_f64m2(__VA_ARGS__)
+#define vloxseg2ei8_v_f64m4(...) __riscv_vloxseg2ei8_v_f64m4(__VA_ARGS__)
+#define vloxseg2ei16_v_f64m1(...) __riscv_vloxseg2ei16_v_f64m1(__VA_ARGS__)
+#define vloxseg3ei16_v_f64m1(...) __riscv_vloxseg3ei16_v_f64m1(__VA_ARGS__)
+#define vloxseg4ei16_v_f64m1(...) __riscv_vloxseg4ei16_v_f64m1(__VA_ARGS__)
+#define vloxseg5ei16_v_f64m1(...) __riscv_vloxseg5ei16_v_f64m1(__VA_ARGS__)
+#define vloxseg6ei16_v_f64m1(...) __riscv_vloxseg6ei16_v_f64m1(__VA_ARGS__)
+#define vloxseg7ei16_v_f64m1(...) __riscv_vloxseg7ei16_v_f64m1(__VA_ARGS__)
+#define vloxseg8ei16_v_f64m1(...) __riscv_vloxseg8ei16_v_f64m1(__VA_ARGS__)
+#define vloxseg2ei16_v_f64m2(...) __riscv_vloxseg2ei16_v_f64m2(__VA_ARGS__)
+#define vloxseg3ei16_v_f64m2(...) __riscv_vloxseg3ei16_v_f64m2(__VA_ARGS__)
+#define vloxseg4ei16_v_f64m2(...) __riscv_vloxseg4ei16_v_f64m2(__VA_ARGS__)
+#define vloxseg2ei16_v_f64m4(...) __riscv_vloxseg2ei16_v_f64m4(__VA_ARGS__)
+#define vloxseg2ei32_v_f64m1(...) __riscv_vloxseg2ei32_v_f64m1(__VA_ARGS__)
+#define vloxseg3ei32_v_f64m1(...) __riscv_vloxseg3ei32_v_f64m1(__VA_ARGS__)
+#define vloxseg4ei32_v_f64m1(...) __riscv_vloxseg4ei32_v_f64m1(__VA_ARGS__)
+#define vloxseg5ei32_v_f64m1(...) __riscv_vloxseg5ei32_v_f64m1(__VA_ARGS__)
+#define vloxseg6ei32_v_f64m1(...) __riscv_vloxseg6ei32_v_f64m1(__VA_ARGS__)
+#define vloxseg7ei32_v_f64m1(...) __riscv_vloxseg7ei32_v_f64m1(__VA_ARGS__)
+#define vloxseg8ei32_v_f64m1(...) __riscv_vloxseg8ei32_v_f64m1(__VA_ARGS__)
+#define vloxseg2ei32_v_f64m2(...) __riscv_vloxseg2ei32_v_f64m2(__VA_ARGS__)
+#define vloxseg3ei32_v_f64m2(...) __riscv_vloxseg3ei32_v_f64m2(__VA_ARGS__)
+#define vloxseg4ei32_v_f64m2(...) __riscv_vloxseg4ei32_v_f64m2(__VA_ARGS__)
+#define vloxseg2ei32_v_f64m4(...) __riscv_vloxseg2ei32_v_f64m4(__VA_ARGS__)
+#define vloxseg2ei64_v_f64m1(...) __riscv_vloxseg2ei64_v_f64m1(__VA_ARGS__)
+#define vloxseg3ei64_v_f64m1(...) __riscv_vloxseg3ei64_v_f64m1(__VA_ARGS__)
+#define vloxseg4ei64_v_f64m1(...) __riscv_vloxseg4ei64_v_f64m1(__VA_ARGS__)
+#define vloxseg5ei64_v_f64m1(...) __riscv_vloxseg5ei64_v_f64m1(__VA_ARGS__)
+#define vloxseg6ei64_v_f64m1(...) __riscv_vloxseg6ei64_v_f64m1(__VA_ARGS__)
+#define vloxseg7ei64_v_f64m1(...) __riscv_vloxseg7ei64_v_f64m1(__VA_ARGS__)
+#define vloxseg8ei64_v_f64m1(...) __riscv_vloxseg8ei64_v_f64m1(__VA_ARGS__)
+#define vloxseg2ei64_v_f64m2(...) __riscv_vloxseg2ei64_v_f64m2(__VA_ARGS__)
+#define vloxseg3ei64_v_f64m2(...) __riscv_vloxseg3ei64_v_f64m2(__VA_ARGS__)
+#define vloxseg4ei64_v_f64m2(...) __riscv_vloxseg4ei64_v_f64m2(__VA_ARGS__)
+#define vloxseg2ei64_v_f64m4(...) __riscv_vloxseg2ei64_v_f64m4(__VA_ARGS__)
+#define vluxseg2ei8_v_f16mf4(...) __riscv_vluxseg2ei8_v_f16mf4(__VA_ARGS__)
+#define vluxseg3ei8_v_f16mf4(...) __riscv_vluxseg3ei8_v_f16mf4(__VA_ARGS__)
+#define vluxseg4ei8_v_f16mf4(...) __riscv_vluxseg4ei8_v_f16mf4(__VA_ARGS__)
+#define vluxseg5ei8_v_f16mf4(...) __riscv_vluxseg5ei8_v_f16mf4(__VA_ARGS__)
+#define vluxseg6ei8_v_f16mf4(...) __riscv_vluxseg6ei8_v_f16mf4(__VA_ARGS__)
+#define vluxseg7ei8_v_f16mf4(...) __riscv_vluxseg7ei8_v_f16mf4(__VA_ARGS__)
+#define vluxseg8ei8_v_f16mf4(...) __riscv_vluxseg8ei8_v_f16mf4(__VA_ARGS__)
+#define vluxseg2ei8_v_f16mf2(...) __riscv_vluxseg2ei8_v_f16mf2(__VA_ARGS__)
+#define vluxseg3ei8_v_f16mf2(...) __riscv_vluxseg3ei8_v_f16mf2(__VA_ARGS__)
+#define vluxseg4ei8_v_f16mf2(...) __riscv_vluxseg4ei8_v_f16mf2(__VA_ARGS__)
+#define vluxseg5ei8_v_f16mf2(...) __riscv_vluxseg5ei8_v_f16mf2(__VA_ARGS__)
+#define vluxseg6ei8_v_f16mf2(...) __riscv_vluxseg6ei8_v_f16mf2(__VA_ARGS__)
+#define vluxseg7ei8_v_f16mf2(...) __riscv_vluxseg7ei8_v_f16mf2(__VA_ARGS__)
+#define vluxseg8ei8_v_f16mf2(...) __riscv_vluxseg8ei8_v_f16mf2(__VA_ARGS__)
+#define vluxseg2ei8_v_f16m1(...) __riscv_vluxseg2ei8_v_f16m1(__VA_ARGS__)
+#define vluxseg3ei8_v_f16m1(...) __riscv_vluxseg3ei8_v_f16m1(__VA_ARGS__)
+#define vluxseg4ei8_v_f16m1(...) __riscv_vluxseg4ei8_v_f16m1(__VA_ARGS__)
+#define vluxseg5ei8_v_f16m1(...) __riscv_vluxseg5ei8_v_f16m1(__VA_ARGS__)
+#define vluxseg6ei8_v_f16m1(...) __riscv_vluxseg6ei8_v_f16m1(__VA_ARGS__)
+#define vluxseg7ei8_v_f16m1(...) __riscv_vluxseg7ei8_v_f16m1(__VA_ARGS__)
+#define vluxseg8ei8_v_f16m1(...) __riscv_vluxseg8ei8_v_f16m1(__VA_ARGS__)
+#define vluxseg2ei8_v_f16m2(...) __riscv_vluxseg2ei8_v_f16m2(__VA_ARGS__)
+#define vluxseg3ei8_v_f16m2(...) __riscv_vluxseg3ei8_v_f16m2(__VA_ARGS__)
+#define vluxseg4ei8_v_f16m2(...) __riscv_vluxseg4ei8_v_f16m2(__VA_ARGS__)
+#define vluxseg2ei8_v_f16m4(...) __riscv_vluxseg2ei8_v_f16m4(__VA_ARGS__)
+#define vluxseg2ei16_v_f16mf4(...) __riscv_vluxseg2ei16_v_f16mf4(__VA_ARGS__)
+#define vluxseg3ei16_v_f16mf4(...) __riscv_vluxseg3ei16_v_f16mf4(__VA_ARGS__)
+#define vluxseg4ei16_v_f16mf4(...) __riscv_vluxseg4ei16_v_f16mf4(__VA_ARGS__)
+#define vluxseg5ei16_v_f16mf4(...) __riscv_vluxseg5ei16_v_f16mf4(__VA_ARGS__)
+#define vluxseg6ei16_v_f16mf4(...) __riscv_vluxseg6ei16_v_f16mf4(__VA_ARGS__)
+#define vluxseg7ei16_v_f16mf4(...) __riscv_vluxseg7ei16_v_f16mf4(__VA_ARGS__)
+#define vluxseg8ei16_v_f16mf4(...) __riscv_vluxseg8ei16_v_f16mf4(__VA_ARGS__)
+#define vluxseg2ei16_v_f16mf2(...) __riscv_vluxseg2ei16_v_f16mf2(__VA_ARGS__)
+#define vluxseg3ei16_v_f16mf2(...) __riscv_vluxseg3ei16_v_f16mf2(__VA_ARGS__)
+#define vluxseg4ei16_v_f16mf2(...) __riscv_vluxseg4ei16_v_f16mf2(__VA_ARGS__)
+#define vluxseg5ei16_v_f16mf2(...) __riscv_vluxseg5ei16_v_f16mf2(__VA_ARGS__)
+#define vluxseg6ei16_v_f16mf2(...) __riscv_vluxseg6ei16_v_f16mf2(__VA_ARGS__)
+#define vluxseg7ei16_v_f16mf2(...) __riscv_vluxseg7ei16_v_f16mf2(__VA_ARGS__)
+#define vluxseg8ei16_v_f16mf2(...) __riscv_vluxseg8ei16_v_f16mf2(__VA_ARGS__)
+#define vluxseg2ei16_v_f16m1(...) __riscv_vluxseg2ei16_v_f16m1(__VA_ARGS__)
+#define vluxseg3ei16_v_f16m1(...) __riscv_vluxseg3ei16_v_f16m1(__VA_ARGS__)
+#define vluxseg4ei16_v_f16m1(...) __riscv_vluxseg4ei16_v_f16m1(__VA_ARGS__)
+#define vluxseg5ei16_v_f16m1(...) __riscv_vluxseg5ei16_v_f16m1(__VA_ARGS__)
+#define vluxseg6ei16_v_f16m1(...) __riscv_vluxseg6ei16_v_f16m1(__VA_ARGS__)
+#define vluxseg7ei16_v_f16m1(...) __riscv_vluxseg7ei16_v_f16m1(__VA_ARGS__)
+#define vluxseg8ei16_v_f16m1(...) __riscv_vluxseg8ei16_v_f16m1(__VA_ARGS__)
+#define vluxseg2ei16_v_f16m2(...) __riscv_vluxseg2ei16_v_f16m2(__VA_ARGS__)
+#define vluxseg3ei16_v_f16m2(...) __riscv_vluxseg3ei16_v_f16m2(__VA_ARGS__)
+#define vluxseg4ei16_v_f16m2(...) __riscv_vluxseg4ei16_v_f16m2(__VA_ARGS__)
+#define vluxseg2ei16_v_f16m4(...) __riscv_vluxseg2ei16_v_f16m4(__VA_ARGS__)
+#define vluxseg2ei32_v_f16mf4(...) __riscv_vluxseg2ei32_v_f16mf4(__VA_ARGS__)
+#define vluxseg3ei32_v_f16mf4(...) __riscv_vluxseg3ei32_v_f16mf4(__VA_ARGS__)
+#define vluxseg4ei32_v_f16mf4(...) __riscv_vluxseg4ei32_v_f16mf4(__VA_ARGS__)
+#define vluxseg5ei32_v_f16mf4(...) __riscv_vluxseg5ei32_v_f16mf4(__VA_ARGS__)
+#define vluxseg6ei32_v_f16mf4(...) __riscv_vluxseg6ei32_v_f16mf4(__VA_ARGS__)
+#define vluxseg7ei32_v_f16mf4(...) __riscv_vluxseg7ei32_v_f16mf4(__VA_ARGS__)
+#define vluxseg8ei32_v_f16mf4(...) __riscv_vluxseg8ei32_v_f16mf4(__VA_ARGS__)
+#define vluxseg2ei32_v_f16mf2(...) __riscv_vluxseg2ei32_v_f16mf2(__VA_ARGS__)
+#define vluxseg3ei32_v_f16mf2(...) __riscv_vluxseg3ei32_v_f16mf2(__VA_ARGS__)
+#define vluxseg4ei32_v_f16mf2(...) __riscv_vluxseg4ei32_v_f16mf2(__VA_ARGS__)
+#define vluxseg5ei32_v_f16mf2(...) __riscv_vluxseg5ei32_v_f16mf2(__VA_ARGS__)
+#define vluxseg6ei32_v_f16mf2(...) __riscv_vluxseg6ei32_v_f16mf2(__VA_ARGS__)
+#define vluxseg7ei32_v_f16mf2(...) __riscv_vluxseg7ei32_v_f16mf2(__VA_ARGS__)
+#define vluxseg8ei32_v_f16mf2(...) __riscv_vluxseg8ei32_v_f16mf2(__VA_ARGS__)
+#define vluxseg2ei32_v_f16m1(...) __riscv_vluxseg2ei32_v_f16m1(__VA_ARGS__)
+#define vluxseg3ei32_v_f16m1(...) __riscv_vluxseg3ei32_v_f16m1(__VA_ARGS__)
+#define vluxseg4ei32_v_f16m1(...) __riscv_vluxseg4ei32_v_f16m1(__VA_ARGS__)
+#define vluxseg5ei32_v_f16m1(...) __riscv_vluxseg5ei32_v_f16m1(__VA_ARGS__)
+#define vluxseg6ei32_v_f16m1(...) __riscv_vluxseg6ei32_v_f16m1(__VA_ARGS__)
+#define vluxseg7ei32_v_f16m1(...) __riscv_vluxseg7ei32_v_f16m1(__VA_ARGS__)
+#define vluxseg8ei32_v_f16m1(...) __riscv_vluxseg8ei32_v_f16m1(__VA_ARGS__)
+#define vluxseg2ei32_v_f16m2(...) __riscv_vluxseg2ei32_v_f16m2(__VA_ARGS__)
+#define vluxseg3ei32_v_f16m2(...) __riscv_vluxseg3ei32_v_f16m2(__VA_ARGS__)
+#define vluxseg4ei32_v_f16m2(...) __riscv_vluxseg4ei32_v_f16m2(__VA_ARGS__)
+#define vluxseg2ei32_v_f16m4(...) __riscv_vluxseg2ei32_v_f16m4(__VA_ARGS__)
+#define vluxseg2ei64_v_f16mf4(...) __riscv_vluxseg2ei64_v_f16mf4(__VA_ARGS__)
+#define vluxseg3ei64_v_f16mf4(...) __riscv_vluxseg3ei64_v_f16mf4(__VA_ARGS__)
+#define vluxseg4ei64_v_f16mf4(...) __riscv_vluxseg4ei64_v_f16mf4(__VA_ARGS__)
+#define vluxseg5ei64_v_f16mf4(...) __riscv_vluxseg5ei64_v_f16mf4(__VA_ARGS__)
+#define vluxseg6ei64_v_f16mf4(...) __riscv_vluxseg6ei64_v_f16mf4(__VA_ARGS__)
+#define vluxseg7ei64_v_f16mf4(...) __riscv_vluxseg7ei64_v_f16mf4(__VA_ARGS__)
+#define vluxseg8ei64_v_f16mf4(...) __riscv_vluxseg8ei64_v_f16mf4(__VA_ARGS__)
+#define vluxseg2ei64_v_f16mf2(...) __riscv_vluxseg2ei64_v_f16mf2(__VA_ARGS__)
+#define vluxseg3ei64_v_f16mf2(...) __riscv_vluxseg3ei64_v_f16mf2(__VA_ARGS__)
+#define vluxseg4ei64_v_f16mf2(...) __riscv_vluxseg4ei64_v_f16mf2(__VA_ARGS__)
+#define vluxseg5ei64_v_f16mf2(...) __riscv_vluxseg5ei64_v_f16mf2(__VA_ARGS__)
+#define vluxseg6ei64_v_f16mf2(...) __riscv_vluxseg6ei64_v_f16mf2(__VA_ARGS__)
+#define vluxseg7ei64_v_f16mf2(...) __riscv_vluxseg7ei64_v_f16mf2(__VA_ARGS__)
+#define vluxseg8ei64_v_f16mf2(...) __riscv_vluxseg8ei64_v_f16mf2(__VA_ARGS__)
+#define vluxseg2ei64_v_f16m1(...) __riscv_vluxseg2ei64_v_f16m1(__VA_ARGS__)
+#define vluxseg3ei64_v_f16m1(...) __riscv_vluxseg3ei64_v_f16m1(__VA_ARGS__)
+#define vluxseg4ei64_v_f16m1(...) __riscv_vluxseg4ei64_v_f16m1(__VA_ARGS__)
+#define vluxseg5ei64_v_f16m1(...) __riscv_vluxseg5ei64_v_f16m1(__VA_ARGS__)
+#define vluxseg6ei64_v_f16m1(...) __riscv_vluxseg6ei64_v_f16m1(__VA_ARGS__)
+#define vluxseg7ei64_v_f16m1(...) __riscv_vluxseg7ei64_v_f16m1(__VA_ARGS__)
+#define vluxseg8ei64_v_f16m1(...) __riscv_vluxseg8ei64_v_f16m1(__VA_ARGS__)
+#define vluxseg2ei64_v_f16m2(...) __riscv_vluxseg2ei64_v_f16m2(__VA_ARGS__)
+#define vluxseg3ei64_v_f16m2(...) __riscv_vluxseg3ei64_v_f16m2(__VA_ARGS__)
+#define vluxseg4ei64_v_f16m2(...) __riscv_vluxseg4ei64_v_f16m2(__VA_ARGS__)
+#define vluxseg2ei8_v_f32mf2(...) __riscv_vluxseg2ei8_v_f32mf2(__VA_ARGS__)
+#define vluxseg3ei8_v_f32mf2(...) __riscv_vluxseg3ei8_v_f32mf2(__VA_ARGS__)
+#define vluxseg4ei8_v_f32mf2(...) __riscv_vluxseg4ei8_v_f32mf2(__VA_ARGS__)
+#define vluxseg5ei8_v_f32mf2(...) __riscv_vluxseg5ei8_v_f32mf2(__VA_ARGS__)
+#define vluxseg6ei8_v_f32mf2(...) __riscv_vluxseg6ei8_v_f32mf2(__VA_ARGS__)
+#define vluxseg7ei8_v_f32mf2(...) __riscv_vluxseg7ei8_v_f32mf2(__VA_ARGS__)
+#define vluxseg8ei8_v_f32mf2(...) __riscv_vluxseg8ei8_v_f32mf2(__VA_ARGS__)
+#define vluxseg2ei8_v_f32m1(...) __riscv_vluxseg2ei8_v_f32m1(__VA_ARGS__)
+#define vluxseg3ei8_v_f32m1(...) __riscv_vluxseg3ei8_v_f32m1(__VA_ARGS__)
+#define vluxseg4ei8_v_f32m1(...) __riscv_vluxseg4ei8_v_f32m1(__VA_ARGS__)
+#define vluxseg5ei8_v_f32m1(...) __riscv_vluxseg5ei8_v_f32m1(__VA_ARGS__)
+#define vluxseg6ei8_v_f32m1(...) __riscv_vluxseg6ei8_v_f32m1(__VA_ARGS__)
+#define vluxseg7ei8_v_f32m1(...) __riscv_vluxseg7ei8_v_f32m1(__VA_ARGS__)
+#define vluxseg8ei8_v_f32m1(...) __riscv_vluxseg8ei8_v_f32m1(__VA_ARGS__)
+#define vluxseg2ei8_v_f32m2(...) __riscv_vluxseg2ei8_v_f32m2(__VA_ARGS__)
+#define vluxseg3ei8_v_f32m2(...) __riscv_vluxseg3ei8_v_f32m2(__VA_ARGS__)
+#define vluxseg4ei8_v_f32m2(...) __riscv_vluxseg4ei8_v_f32m2(__VA_ARGS__)
+#define vluxseg2ei8_v_f32m4(...) __riscv_vluxseg2ei8_v_f32m4(__VA_ARGS__)
+#define vluxseg2ei16_v_f32mf2(...) __riscv_vluxseg2ei16_v_f32mf2(__VA_ARGS__)
+#define vluxseg3ei16_v_f32mf2(...) __riscv_vluxseg3ei16_v_f32mf2(__VA_ARGS__)
+#define vluxseg4ei16_v_f32mf2(...) __riscv_vluxseg4ei16_v_f32mf2(__VA_ARGS__)
+#define vluxseg5ei16_v_f32mf2(...) __riscv_vluxseg5ei16_v_f32mf2(__VA_ARGS__)
+#define vluxseg6ei16_v_f32mf2(...) __riscv_vluxseg6ei16_v_f32mf2(__VA_ARGS__)
+#define vluxseg7ei16_v_f32mf2(...) __riscv_vluxseg7ei16_v_f32mf2(__VA_ARGS__)
+#define vluxseg8ei16_v_f32mf2(...) __riscv_vluxseg8ei16_v_f32mf2(__VA_ARGS__)
+#define vluxseg2ei16_v_f32m1(...) __riscv_vluxseg2ei16_v_f32m1(__VA_ARGS__)
+#define vluxseg3ei16_v_f32m1(...) __riscv_vluxseg3ei16_v_f32m1(__VA_ARGS__)
+#define vluxseg4ei16_v_f32m1(...) __riscv_vluxseg4ei16_v_f32m1(__VA_ARGS__)
+#define vluxseg5ei16_v_f32m1(...) __riscv_vluxseg5ei16_v_f32m1(__VA_ARGS__)
+#define vluxseg6ei16_v_f32m1(...) __riscv_vluxseg6ei16_v_f32m1(__VA_ARGS__)
+#define vluxseg7ei16_v_f32m1(...) __riscv_vluxseg7ei16_v_f32m1(__VA_ARGS__)
+#define vluxseg8ei16_v_f32m1(...) __riscv_vluxseg8ei16_v_f32m1(__VA_ARGS__)
+#define vluxseg2ei16_v_f32m2(...) __riscv_vluxseg2ei16_v_f32m2(__VA_ARGS__)
+#define vluxseg3ei16_v_f32m2(...) __riscv_vluxseg3ei16_v_f32m2(__VA_ARGS__)
+#define vluxseg4ei16_v_f32m2(...) __riscv_vluxseg4ei16_v_f32m2(__VA_ARGS__)
+#define vluxseg2ei16_v_f32m4(...) __riscv_vluxseg2ei16_v_f32m4(__VA_ARGS__)
+#define vluxseg2ei32_v_f32mf2(...) __riscv_vluxseg2ei32_v_f32mf2(__VA_ARGS__)
+#define vluxseg3ei32_v_f32mf2(...) __riscv_vluxseg3ei32_v_f32mf2(__VA_ARGS__)
+#define vluxseg4ei32_v_f32mf2(...) __riscv_vluxseg4ei32_v_f32mf2(__VA_ARGS__)
+#define vluxseg5ei32_v_f32mf2(...) __riscv_vluxseg5ei32_v_f32mf2(__VA_ARGS__)
+#define vluxseg6ei32_v_f32mf2(...) __riscv_vluxseg6ei32_v_f32mf2(__VA_ARGS__)
+#define vluxseg7ei32_v_f32mf2(...) __riscv_vluxseg7ei32_v_f32mf2(__VA_ARGS__)
+#define vluxseg8ei32_v_f32mf2(...) __riscv_vluxseg8ei32_v_f32mf2(__VA_ARGS__)
+#define vluxseg2ei32_v_f32m1(...) __riscv_vluxseg2ei32_v_f32m1(__VA_ARGS__)
+#define vluxseg3ei32_v_f32m1(...) __riscv_vluxseg3ei32_v_f32m1(__VA_ARGS__)
+#define vluxseg4ei32_v_f32m1(...) __riscv_vluxseg4ei32_v_f32m1(__VA_ARGS__)
+#define vluxseg5ei32_v_f32m1(...) __riscv_vluxseg5ei32_v_f32m1(__VA_ARGS__)
+#define vluxseg6ei32_v_f32m1(...) __riscv_vluxseg6ei32_v_f32m1(__VA_ARGS__)
+#define vluxseg7ei32_v_f32m1(...) __riscv_vluxseg7ei32_v_f32m1(__VA_ARGS__)
+#define vluxseg8ei32_v_f32m1(...) __riscv_vluxseg8ei32_v_f32m1(__VA_ARGS__)
+#define vluxseg2ei32_v_f32m2(...) __riscv_vluxseg2ei32_v_f32m2(__VA_ARGS__)
+#define vluxseg3ei32_v_f32m2(...) __riscv_vluxseg3ei32_v_f32m2(__VA_ARGS__)
+#define vluxseg4ei32_v_f32m2(...) __riscv_vluxseg4ei32_v_f32m2(__VA_ARGS__)
+#define vluxseg2ei32_v_f32m4(...) __riscv_vluxseg2ei32_v_f32m4(__VA_ARGS__)
+#define vluxseg2ei64_v_f32mf2(...) __riscv_vluxseg2ei64_v_f32mf2(__VA_ARGS__)
+#define vluxseg3ei64_v_f32mf2(...) __riscv_vluxseg3ei64_v_f32mf2(__VA_ARGS__)
+#define vluxseg4ei64_v_f32mf2(...) __riscv_vluxseg4ei64_v_f32mf2(__VA_ARGS__)
+#define vluxseg5ei64_v_f32mf2(...) __riscv_vluxseg5ei64_v_f32mf2(__VA_ARGS__)
+#define vluxseg6ei64_v_f32mf2(...) __riscv_vluxseg6ei64_v_f32mf2(__VA_ARGS__)
+#define vluxseg7ei64_v_f32mf2(...) __riscv_vluxseg7ei64_v_f32mf2(__VA_ARGS__)
+#define vluxseg8ei64_v_f32mf2(...) __riscv_vluxseg8ei64_v_f32mf2(__VA_ARGS__)
+#define vluxseg2ei64_v_f32m1(...) __riscv_vluxseg2ei64_v_f32m1(__VA_ARGS__)
+#define vluxseg3ei64_v_f32m1(...) __riscv_vluxseg3ei64_v_f32m1(__VA_ARGS__)
+#define vluxseg4ei64_v_f32m1(...) __riscv_vluxseg4ei64_v_f32m1(__VA_ARGS__)
+#define vluxseg5ei64_v_f32m1(...) __riscv_vluxseg5ei64_v_f32m1(__VA_ARGS__)
+#define vluxseg6ei64_v_f32m1(...) __riscv_vluxseg6ei64_v_f32m1(__VA_ARGS__)
+#define vluxseg7ei64_v_f32m1(...) __riscv_vluxseg7ei64_v_f32m1(__VA_ARGS__)
+#define vluxseg8ei64_v_f32m1(...) __riscv_vluxseg8ei64_v_f32m1(__VA_ARGS__)
+#define vluxseg2ei64_v_f32m2(...) __riscv_vluxseg2ei64_v_f32m2(__VA_ARGS__)
+#define vluxseg3ei64_v_f32m2(...) __riscv_vluxseg3ei64_v_f32m2(__VA_ARGS__)
+#define vluxseg4ei64_v_f32m2(...) __riscv_vluxseg4ei64_v_f32m2(__VA_ARGS__)
+#define vluxseg2ei64_v_f32m4(...) __riscv_vluxseg2ei64_v_f32m4(__VA_ARGS__)
+#define vluxseg2ei8_v_f64m1(...) __riscv_vluxseg2ei8_v_f64m1(__VA_ARGS__)
+#define vluxseg3ei8_v_f64m1(...) __riscv_vluxseg3ei8_v_f64m1(__VA_ARGS__)
+#define vluxseg4ei8_v_f64m1(...) __riscv_vluxseg4ei8_v_f64m1(__VA_ARGS__)
+#define vluxseg5ei8_v_f64m1(...) __riscv_vluxseg5ei8_v_f64m1(__VA_ARGS__)
+#define vluxseg6ei8_v_f64m1(...) __riscv_vluxseg6ei8_v_f64m1(__VA_ARGS__)
+#define vluxseg7ei8_v_f64m1(...) __riscv_vluxseg7ei8_v_f64m1(__VA_ARGS__)
+#define vluxseg8ei8_v_f64m1(...) __riscv_vluxseg8ei8_v_f64m1(__VA_ARGS__)
+#define vluxseg2ei8_v_f64m2(...) __riscv_vluxseg2ei8_v_f64m2(__VA_ARGS__)
+#define vluxseg3ei8_v_f64m2(...) __riscv_vluxseg3ei8_v_f64m2(__VA_ARGS__)
+#define vluxseg4ei8_v_f64m2(...) __riscv_vluxseg4ei8_v_f64m2(__VA_ARGS__)
+#define vluxseg2ei8_v_f64m4(...) __riscv_vluxseg2ei8_v_f64m4(__VA_ARGS__)
+#define vluxseg2ei16_v_f64m1(...) __riscv_vluxseg2ei16_v_f64m1(__VA_ARGS__)
+#define vluxseg3ei16_v_f64m1(...) __riscv_vluxseg3ei16_v_f64m1(__VA_ARGS__)
+#define vluxseg4ei16_v_f64m1(...) __riscv_vluxseg4ei16_v_f64m1(__VA_ARGS__)
+#define vluxseg5ei16_v_f64m1(...) __riscv_vluxseg5ei16_v_f64m1(__VA_ARGS__)
+#define vluxseg6ei16_v_f64m1(...) __riscv_vluxseg6ei16_v_f64m1(__VA_ARGS__)
+#define vluxseg7ei16_v_f64m1(...) __riscv_vluxseg7ei16_v_f64m1(__VA_ARGS__)
+#define vluxseg8ei16_v_f64m1(...) __riscv_vluxseg8ei16_v_f64m1(__VA_ARGS__)
+#define vluxseg2ei16_v_f64m2(...) __riscv_vluxseg2ei16_v_f64m2(__VA_ARGS__)
+#define vluxseg3ei16_v_f64m2(...) __riscv_vluxseg3ei16_v_f64m2(__VA_ARGS__)
+#define vluxseg4ei16_v_f64m2(...) __riscv_vluxseg4ei16_v_f64m2(__VA_ARGS__)
+#define vluxseg2ei16_v_f64m4(...) __riscv_vluxseg2ei16_v_f64m4(__VA_ARGS__)
+#define vluxseg2ei32_v_f64m1(...) __riscv_vluxseg2ei32_v_f64m1(__VA_ARGS__)
+#define vluxseg3ei32_v_f64m1(...) __riscv_vluxseg3ei32_v_f64m1(__VA_ARGS__)
+#define vluxseg4ei32_v_f64m1(...) __riscv_vluxseg4ei32_v_f64m1(__VA_ARGS__)
+#define vluxseg5ei32_v_f64m1(...) __riscv_vluxseg5ei32_v_f64m1(__VA_ARGS__)
+#define vluxseg6ei32_v_f64m1(...) __riscv_vluxseg6ei32_v_f64m1(__VA_ARGS__)
+#define vluxseg7ei32_v_f64m1(...) __riscv_vluxseg7ei32_v_f64m1(__VA_ARGS__)
+#define vluxseg8ei32_v_f64m1(...) __riscv_vluxseg8ei32_v_f64m1(__VA_ARGS__)
+#define vluxseg2ei32_v_f64m2(...) __riscv_vluxseg2ei32_v_f64m2(__VA_ARGS__)
+#define vluxseg3ei32_v_f64m2(...) __riscv_vluxseg3ei32_v_f64m2(__VA_ARGS__)
+#define vluxseg4ei32_v_f64m2(...) __riscv_vluxseg4ei32_v_f64m2(__VA_ARGS__)
+#define vluxseg2ei32_v_f64m4(...) __riscv_vluxseg2ei32_v_f64m4(__VA_ARGS__)
+#define vluxseg2ei64_v_f64m1(...) __riscv_vluxseg2ei64_v_f64m1(__VA_ARGS__)
+#define vluxseg3ei64_v_f64m1(...) __riscv_vluxseg3ei64_v_f64m1(__VA_ARGS__)
+#define vluxseg4ei64_v_f64m1(...) __riscv_vluxseg4ei64_v_f64m1(__VA_ARGS__)
+#define vluxseg5ei64_v_f64m1(...) __riscv_vluxseg5ei64_v_f64m1(__VA_ARGS__)
+#define vluxseg6ei64_v_f64m1(...) __riscv_vluxseg6ei64_v_f64m1(__VA_ARGS__)
+#define vluxseg7ei64_v_f64m1(...) __riscv_vluxseg7ei64_v_f64m1(__VA_ARGS__)
+#define vluxseg8ei64_v_f64m1(...) __riscv_vluxseg8ei64_v_f64m1(__VA_ARGS__)
+#define vluxseg2ei64_v_f64m2(...) __riscv_vluxseg2ei64_v_f64m2(__VA_ARGS__)
+#define vluxseg3ei64_v_f64m2(...) __riscv_vluxseg3ei64_v_f64m2(__VA_ARGS__)
+#define vluxseg4ei64_v_f64m2(...) __riscv_vluxseg4ei64_v_f64m2(__VA_ARGS__)
+#define vluxseg2ei64_v_f64m4(...) __riscv_vluxseg2ei64_v_f64m4(__VA_ARGS__)
+#define vloxseg2ei8_v_i8mf8(...) __riscv_vloxseg2ei8_v_i8mf8(__VA_ARGS__)
+#define vloxseg3ei8_v_i8mf8(...) __riscv_vloxseg3ei8_v_i8mf8(__VA_ARGS__)
+#define vloxseg4ei8_v_i8mf8(...) __riscv_vloxseg4ei8_v_i8mf8(__VA_ARGS__)
+#define vloxseg5ei8_v_i8mf8(...) __riscv_vloxseg5ei8_v_i8mf8(__VA_ARGS__)
+#define vloxseg6ei8_v_i8mf8(...) __riscv_vloxseg6ei8_v_i8mf8(__VA_ARGS__)
+#define vloxseg7ei8_v_i8mf8(...) __riscv_vloxseg7ei8_v_i8mf8(__VA_ARGS__)
+#define vloxseg8ei8_v_i8mf8(...) __riscv_vloxseg8ei8_v_i8mf8(__VA_ARGS__)
+#define vloxseg2ei8_v_i8mf4(...) __riscv_vloxseg2ei8_v_i8mf4(__VA_ARGS__)
+#define vloxseg3ei8_v_i8mf4(...) __riscv_vloxseg3ei8_v_i8mf4(__VA_ARGS__)
+#define vloxseg4ei8_v_i8mf4(...) __riscv_vloxseg4ei8_v_i8mf4(__VA_ARGS__)
+#define vloxseg5ei8_v_i8mf4(...) __riscv_vloxseg5ei8_v_i8mf4(__VA_ARGS__)
+#define vloxseg6ei8_v_i8mf4(...) __riscv_vloxseg6ei8_v_i8mf4(__VA_ARGS__)
+#define vloxseg7ei8_v_i8mf4(...) __riscv_vloxseg7ei8_v_i8mf4(__VA_ARGS__)
+#define vloxseg8ei8_v_i8mf4(...) __riscv_vloxseg8ei8_v_i8mf4(__VA_ARGS__)
+#define vloxseg2ei8_v_i8mf2(...) __riscv_vloxseg2ei8_v_i8mf2(__VA_ARGS__)
+#define vloxseg3ei8_v_i8mf2(...) __riscv_vloxseg3ei8_v_i8mf2(__VA_ARGS__)
+#define vloxseg4ei8_v_i8mf2(...) __riscv_vloxseg4ei8_v_i8mf2(__VA_ARGS__)
+#define vloxseg5ei8_v_i8mf2(...) __riscv_vloxseg5ei8_v_i8mf2(__VA_ARGS__)
+#define vloxseg6ei8_v_i8mf2(...) __riscv_vloxseg6ei8_v_i8mf2(__VA_ARGS__)
+#define vloxseg7ei8_v_i8mf2(...) __riscv_vloxseg7ei8_v_i8mf2(__VA_ARGS__)
+#define vloxseg8ei8_v_i8mf2(...) __riscv_vloxseg8ei8_v_i8mf2(__VA_ARGS__)
+#define vloxseg2ei8_v_i8m1(...) __riscv_vloxseg2ei8_v_i8m1(__VA_ARGS__)
+#define vloxseg3ei8_v_i8m1(...) __riscv_vloxseg3ei8_v_i8m1(__VA_ARGS__)
+#define vloxseg4ei8_v_i8m1(...) __riscv_vloxseg4ei8_v_i8m1(__VA_ARGS__)
+#define vloxseg5ei8_v_i8m1(...) __riscv_vloxseg5ei8_v_i8m1(__VA_ARGS__)
+#define vloxseg6ei8_v_i8m1(...) __riscv_vloxseg6ei8_v_i8m1(__VA_ARGS__)
+#define vloxseg7ei8_v_i8m1(...) __riscv_vloxseg7ei8_v_i8m1(__VA_ARGS__)
+#define vloxseg8ei8_v_i8m1(...) __riscv_vloxseg8ei8_v_i8m1(__VA_ARGS__)
+#define vloxseg2ei8_v_i8m2(...) __riscv_vloxseg2ei8_v_i8m2(__VA_ARGS__)
+#define vloxseg3ei8_v_i8m2(...) __riscv_vloxseg3ei8_v_i8m2(__VA_ARGS__)
+#define vloxseg4ei8_v_i8m2(...) __riscv_vloxseg4ei8_v_i8m2(__VA_ARGS__)
+#define vloxseg2ei8_v_i8m4(...) __riscv_vloxseg2ei8_v_i8m4(__VA_ARGS__)
+#define vloxseg2ei16_v_i8mf8(...) __riscv_vloxseg2ei16_v_i8mf8(__VA_ARGS__)
+#define vloxseg3ei16_v_i8mf8(...) __riscv_vloxseg3ei16_v_i8mf8(__VA_ARGS__)
+#define vloxseg4ei16_v_i8mf8(...) __riscv_vloxseg4ei16_v_i8mf8(__VA_ARGS__)
+#define vloxseg5ei16_v_i8mf8(...) __riscv_vloxseg5ei16_v_i8mf8(__VA_ARGS__)
+#define vloxseg6ei16_v_i8mf8(...) __riscv_vloxseg6ei16_v_i8mf8(__VA_ARGS__)
+#define vloxseg7ei16_v_i8mf8(...) __riscv_vloxseg7ei16_v_i8mf8(__VA_ARGS__)
+#define vloxseg8ei16_v_i8mf8(...) __riscv_vloxseg8ei16_v_i8mf8(__VA_ARGS__)
+#define vloxseg2ei16_v_i8mf4(...) __riscv_vloxseg2ei16_v_i8mf4(__VA_ARGS__)
+#define vloxseg3ei16_v_i8mf4(...) __riscv_vloxseg3ei16_v_i8mf4(__VA_ARGS__)
+#define vloxseg4ei16_v_i8mf4(...) __riscv_vloxseg4ei16_v_i8mf4(__VA_ARGS__)
+#define vloxseg5ei16_v_i8mf4(...) __riscv_vloxseg5ei16_v_i8mf4(__VA_ARGS__)
+#define vloxseg6ei16_v_i8mf4(...) __riscv_vloxseg6ei16_v_i8mf4(__VA_ARGS__)
+#define vloxseg7ei16_v_i8mf4(...) __riscv_vloxseg7ei16_v_i8mf4(__VA_ARGS__)
+#define vloxseg8ei16_v_i8mf4(...) __riscv_vloxseg8ei16_v_i8mf4(__VA_ARGS__)
+#define vloxseg2ei16_v_i8mf2(...) __riscv_vloxseg2ei16_v_i8mf2(__VA_ARGS__)
+#define vloxseg3ei16_v_i8mf2(...) __riscv_vloxseg3ei16_v_i8mf2(__VA_ARGS__)
+#define vloxseg4ei16_v_i8mf2(...) __riscv_vloxseg4ei16_v_i8mf2(__VA_ARGS__)
+#define vloxseg5ei16_v_i8mf2(...) __riscv_vloxseg5ei16_v_i8mf2(__VA_ARGS__)
+#define vloxseg6ei16_v_i8mf2(...) __riscv_vloxseg6ei16_v_i8mf2(__VA_ARGS__)
+#define vloxseg7ei16_v_i8mf2(...) __riscv_vloxseg7ei16_v_i8mf2(__VA_ARGS__)
+#define vloxseg8ei16_v_i8mf2(...) __riscv_vloxseg8ei16_v_i8mf2(__VA_ARGS__)
+#define vloxseg2ei16_v_i8m1(...) __riscv_vloxseg2ei16_v_i8m1(__VA_ARGS__)
+#define vloxseg3ei16_v_i8m1(...) __riscv_vloxseg3ei16_v_i8m1(__VA_ARGS__)
+#define vloxseg4ei16_v_i8m1(...) __riscv_vloxseg4ei16_v_i8m1(__VA_ARGS__)
+#define vloxseg5ei16_v_i8m1(...) __riscv_vloxseg5ei16_v_i8m1(__VA_ARGS__)
+#define vloxseg6ei16_v_i8m1(...) __riscv_vloxseg6ei16_v_i8m1(__VA_ARGS__)
+#define vloxseg7ei16_v_i8m1(...) __riscv_vloxseg7ei16_v_i8m1(__VA_ARGS__)
+#define vloxseg8ei16_v_i8m1(...) __riscv_vloxseg8ei16_v_i8m1(__VA_ARGS__)
+#define vloxseg2ei16_v_i8m2(...) __riscv_vloxseg2ei16_v_i8m2(__VA_ARGS__)
+#define vloxseg3ei16_v_i8m2(...) __riscv_vloxseg3ei16_v_i8m2(__VA_ARGS__)
+#define vloxseg4ei16_v_i8m2(...) __riscv_vloxseg4ei16_v_i8m2(__VA_ARGS__)
+#define vloxseg2ei16_v_i8m4(...) __riscv_vloxseg2ei16_v_i8m4(__VA_ARGS__)
+#define vloxseg2ei32_v_i8mf8(...) __riscv_vloxseg2ei32_v_i8mf8(__VA_ARGS__)
+#define vloxseg3ei32_v_i8mf8(...) __riscv_vloxseg3ei32_v_i8mf8(__VA_ARGS__)
+#define vloxseg4ei32_v_i8mf8(...) __riscv_vloxseg4ei32_v_i8mf8(__VA_ARGS__)
+#define vloxseg5ei32_v_i8mf8(...) __riscv_vloxseg5ei32_v_i8mf8(__VA_ARGS__)
+#define vloxseg6ei32_v_i8mf8(...) __riscv_vloxseg6ei32_v_i8mf8(__VA_ARGS__)
+#define vloxseg7ei32_v_i8mf8(...) __riscv_vloxseg7ei32_v_i8mf8(__VA_ARGS__)
+#define vloxseg8ei32_v_i8mf8(...) __riscv_vloxseg8ei32_v_i8mf8(__VA_ARGS__)
+#define vloxseg2ei32_v_i8mf4(...) __riscv_vloxseg2ei32_v_i8mf4(__VA_ARGS__)
+#define vloxseg3ei32_v_i8mf4(...) __riscv_vloxseg3ei32_v_i8mf4(__VA_ARGS__)
+#define vloxseg4ei32_v_i8mf4(...) __riscv_vloxseg4ei32_v_i8mf4(__VA_ARGS__)
+#define vloxseg5ei32_v_i8mf4(...) __riscv_vloxseg5ei32_v_i8mf4(__VA_ARGS__)
+#define vloxseg6ei32_v_i8mf4(...) __riscv_vloxseg6ei32_v_i8mf4(__VA_ARGS__)
+#define vloxseg7ei32_v_i8mf4(...) __riscv_vloxseg7ei32_v_i8mf4(__VA_ARGS__)
+#define vloxseg8ei32_v_i8mf4(...) __riscv_vloxseg8ei32_v_i8mf4(__VA_ARGS__)
+#define vloxseg2ei32_v_i8mf2(...) __riscv_vloxseg2ei32_v_i8mf2(__VA_ARGS__)
+#define vloxseg3ei32_v_i8mf2(...) __riscv_vloxseg3ei32_v_i8mf2(__VA_ARGS__)
+#define vloxseg4ei32_v_i8mf2(...) __riscv_vloxseg4ei32_v_i8mf2(__VA_ARGS__)
+#define vloxseg5ei32_v_i8mf2(...) __riscv_vloxseg5ei32_v_i8mf2(__VA_ARGS__)
+#define vloxseg6ei32_v_i8mf2(...) __riscv_vloxseg6ei32_v_i8mf2(__VA_ARGS__)
+#define vloxseg7ei32_v_i8mf2(...) __riscv_vloxseg7ei32_v_i8mf2(__VA_ARGS__)
+#define vloxseg8ei32_v_i8mf2(...) __riscv_vloxseg8ei32_v_i8mf2(__VA_ARGS__)
+#define vloxseg2ei32_v_i8m1(...) __riscv_vloxseg2ei32_v_i8m1(__VA_ARGS__)
+#define vloxseg3ei32_v_i8m1(...) __riscv_vloxseg3ei32_v_i8m1(__VA_ARGS__)
+#define vloxseg4ei32_v_i8m1(...) __riscv_vloxseg4ei32_v_i8m1(__VA_ARGS__)
+#define vloxseg5ei32_v_i8m1(...) __riscv_vloxseg5ei32_v_i8m1(__VA_ARGS__)
+#define vloxseg6ei32_v_i8m1(...) __riscv_vloxseg6ei32_v_i8m1(__VA_ARGS__)
+#define vloxseg7ei32_v_i8m1(...) __riscv_vloxseg7ei32_v_i8m1(__VA_ARGS__)
+#define vloxseg8ei32_v_i8m1(...) __riscv_vloxseg8ei32_v_i8m1(__VA_ARGS__)
+#define vloxseg2ei32_v_i8m2(...) __riscv_vloxseg2ei32_v_i8m2(__VA_ARGS__)
+#define vloxseg3ei32_v_i8m2(...) __riscv_vloxseg3ei32_v_i8m2(__VA_ARGS__)
+#define vloxseg4ei32_v_i8m2(...) __riscv_vloxseg4ei32_v_i8m2(__VA_ARGS__)
+#define vloxseg2ei64_v_i8mf8(...) __riscv_vloxseg2ei64_v_i8mf8(__VA_ARGS__)
+#define vloxseg3ei64_v_i8mf8(...) __riscv_vloxseg3ei64_v_i8mf8(__VA_ARGS__)
+#define vloxseg4ei64_v_i8mf8(...) __riscv_vloxseg4ei64_v_i8mf8(__VA_ARGS__)
+#define vloxseg5ei64_v_i8mf8(...) __riscv_vloxseg5ei64_v_i8mf8(__VA_ARGS__)
+#define vloxseg6ei64_v_i8mf8(...) __riscv_vloxseg6ei64_v_i8mf8(__VA_ARGS__)
+#define vloxseg7ei64_v_i8mf8(...) __riscv_vloxseg7ei64_v_i8mf8(__VA_ARGS__)
+#define vloxseg8ei64_v_i8mf8(...) __riscv_vloxseg8ei64_v_i8mf8(__VA_ARGS__)
+#define vloxseg2ei64_v_i8mf4(...) __riscv_vloxseg2ei64_v_i8mf4(__VA_ARGS__)
+#define vloxseg3ei64_v_i8mf4(...) __riscv_vloxseg3ei64_v_i8mf4(__VA_ARGS__)
+#define vloxseg4ei64_v_i8mf4(...) __riscv_vloxseg4ei64_v_i8mf4(__VA_ARGS__)
+#define vloxseg5ei64_v_i8mf4(...) __riscv_vloxseg5ei64_v_i8mf4(__VA_ARGS__)
+#define vloxseg6ei64_v_i8mf4(...) __riscv_vloxseg6ei64_v_i8mf4(__VA_ARGS__)
+#define vloxseg7ei64_v_i8mf4(...) __riscv_vloxseg7ei64_v_i8mf4(__VA_ARGS__)
+#define vloxseg8ei64_v_i8mf4(...) __riscv_vloxseg8ei64_v_i8mf4(__VA_ARGS__)
+#define vloxseg2ei64_v_i8mf2(...) __riscv_vloxseg2ei64_v_i8mf2(__VA_ARGS__)
+#define vloxseg3ei64_v_i8mf2(...) __riscv_vloxseg3ei64_v_i8mf2(__VA_ARGS__)
+#define vloxseg4ei64_v_i8mf2(...) __riscv_vloxseg4ei64_v_i8mf2(__VA_ARGS__)
+#define vloxseg5ei64_v_i8mf2(...) __riscv_vloxseg5ei64_v_i8mf2(__VA_ARGS__)
+#define vloxseg6ei64_v_i8mf2(...) __riscv_vloxseg6ei64_v_i8mf2(__VA_ARGS__)
+#define vloxseg7ei64_v_i8mf2(...) __riscv_vloxseg7ei64_v_i8mf2(__VA_ARGS__)
+#define vloxseg8ei64_v_i8mf2(...) __riscv_vloxseg8ei64_v_i8mf2(__VA_ARGS__)
+#define vloxseg2ei64_v_i8m1(...) __riscv_vloxseg2ei64_v_i8m1(__VA_ARGS__)
+#define vloxseg3ei64_v_i8m1(...) __riscv_vloxseg3ei64_v_i8m1(__VA_ARGS__)
+#define vloxseg4ei64_v_i8m1(...) __riscv_vloxseg4ei64_v_i8m1(__VA_ARGS__)
+#define vloxseg5ei64_v_i8m1(...) __riscv_vloxseg5ei64_v_i8m1(__VA_ARGS__)
+#define vloxseg6ei64_v_i8m1(...) __riscv_vloxseg6ei64_v_i8m1(__VA_ARGS__)
+#define vloxseg7ei64_v_i8m1(...) __riscv_vloxseg7ei64_v_i8m1(__VA_ARGS__)
+#define vloxseg8ei64_v_i8m1(...) __riscv_vloxseg8ei64_v_i8m1(__VA_ARGS__)
+#define vloxseg2ei8_v_i16mf4(...) __riscv_vloxseg2ei8_v_i16mf4(__VA_ARGS__)
+#define vloxseg3ei8_v_i16mf4(...) __riscv_vloxseg3ei8_v_i16mf4(__VA_ARGS__)
+#define vloxseg4ei8_v_i16mf4(...) __riscv_vloxseg4ei8_v_i16mf4(__VA_ARGS__)
+#define vloxseg5ei8_v_i16mf4(...) __riscv_vloxseg5ei8_v_i16mf4(__VA_ARGS__)
+#define vloxseg6ei8_v_i16mf4(...) __riscv_vloxseg6ei8_v_i16mf4(__VA_ARGS__)
+#define vloxseg7ei8_v_i16mf4(...) __riscv_vloxseg7ei8_v_i16mf4(__VA_ARGS__)
+#define vloxseg8ei8_v_i16mf4(...) __riscv_vloxseg8ei8_v_i16mf4(__VA_ARGS__)
+#define vloxseg2ei8_v_i16mf2(...) __riscv_vloxseg2ei8_v_i16mf2(__VA_ARGS__)
+#define vloxseg3ei8_v_i16mf2(...) __riscv_vloxseg3ei8_v_i16mf2(__VA_ARGS__)
+#define vloxseg4ei8_v_i16mf2(...) __riscv_vloxseg4ei8_v_i16mf2(__VA_ARGS__)
+#define vloxseg5ei8_v_i16mf2(...) __riscv_vloxseg5ei8_v_i16mf2(__VA_ARGS__)
+#define vloxseg6ei8_v_i16mf2(...) __riscv_vloxseg6ei8_v_i16mf2(__VA_ARGS__)
+#define vloxseg7ei8_v_i16mf2(...) __riscv_vloxseg7ei8_v_i16mf2(__VA_ARGS__)
+#define vloxseg8ei8_v_i16mf2(...) __riscv_vloxseg8ei8_v_i16mf2(__VA_ARGS__)
+#define vloxseg2ei8_v_i16m1(...) __riscv_vloxseg2ei8_v_i16m1(__VA_ARGS__)
+#define vloxseg3ei8_v_i16m1(...) __riscv_vloxseg3ei8_v_i16m1(__VA_ARGS__)
+#define vloxseg4ei8_v_i16m1(...) __riscv_vloxseg4ei8_v_i16m1(__VA_ARGS__)
+#define vloxseg5ei8_v_i16m1(...) __riscv_vloxseg5ei8_v_i16m1(__VA_ARGS__)
+#define vloxseg6ei8_v_i16m1(...) __riscv_vloxseg6ei8_v_i16m1(__VA_ARGS__)
+#define vloxseg7ei8_v_i16m1(...) __riscv_vloxseg7ei8_v_i16m1(__VA_ARGS__)
+#define vloxseg8ei8_v_i16m1(...) __riscv_vloxseg8ei8_v_i16m1(__VA_ARGS__)
+#define vloxseg2ei8_v_i16m2(...) __riscv_vloxseg2ei8_v_i16m2(__VA_ARGS__)
+#define vloxseg3ei8_v_i16m2(...) __riscv_vloxseg3ei8_v_i16m2(__VA_ARGS__)
+#define vloxseg4ei8_v_i16m2(...) __riscv_vloxseg4ei8_v_i16m2(__VA_ARGS__)
+#define vloxseg2ei8_v_i16m4(...) __riscv_vloxseg2ei8_v_i16m4(__VA_ARGS__)
+#define vloxseg2ei16_v_i16mf4(...) __riscv_vloxseg2ei16_v_i16mf4(__VA_ARGS__)
+#define vloxseg3ei16_v_i16mf4(...) __riscv_vloxseg3ei16_v_i16mf4(__VA_ARGS__)
+#define vloxseg4ei16_v_i16mf4(...) __riscv_vloxseg4ei16_v_i16mf4(__VA_ARGS__)
+#define vloxseg5ei16_v_i16mf4(...) __riscv_vloxseg5ei16_v_i16mf4(__VA_ARGS__)
+#define vloxseg6ei16_v_i16mf4(...) __riscv_vloxseg6ei16_v_i16mf4(__VA_ARGS__)
+#define vloxseg7ei16_v_i16mf4(...) __riscv_vloxseg7ei16_v_i16mf4(__VA_ARGS__)
+#define vloxseg8ei16_v_i16mf4(...) __riscv_vloxseg8ei16_v_i16mf4(__VA_ARGS__)
+#define vloxseg2ei16_v_i16mf2(...) __riscv_vloxseg2ei16_v_i16mf2(__VA_ARGS__)
+#define vloxseg3ei16_v_i16mf2(...) __riscv_vloxseg3ei16_v_i16mf2(__VA_ARGS__)
+#define vloxseg4ei16_v_i16mf2(...) __riscv_vloxseg4ei16_v_i16mf2(__VA_ARGS__)
+#define vloxseg5ei16_v_i16mf2(...) __riscv_vloxseg5ei16_v_i16mf2(__VA_ARGS__)
+#define vloxseg6ei16_v_i16mf2(...) __riscv_vloxseg6ei16_v_i16mf2(__VA_ARGS__)
+#define vloxseg7ei16_v_i16mf2(...) __riscv_vloxseg7ei16_v_i16mf2(__VA_ARGS__)
+#define vloxseg8ei16_v_i16mf2(...) __riscv_vloxseg8ei16_v_i16mf2(__VA_ARGS__)
+#define vloxseg2ei16_v_i16m1(...) __riscv_vloxseg2ei16_v_i16m1(__VA_ARGS__)
+#define vloxseg3ei16_v_i16m1(...) __riscv_vloxseg3ei16_v_i16m1(__VA_ARGS__)
+#define vloxseg4ei16_v_i16m1(...) __riscv_vloxseg4ei16_v_i16m1(__VA_ARGS__)
+#define vloxseg5ei16_v_i16m1(...) __riscv_vloxseg5ei16_v_i16m1(__VA_ARGS__)
+#define vloxseg6ei16_v_i16m1(...) __riscv_vloxseg6ei16_v_i16m1(__VA_ARGS__)
+#define vloxseg7ei16_v_i16m1(...) __riscv_vloxseg7ei16_v_i16m1(__VA_ARGS__)
+#define vloxseg8ei16_v_i16m1(...) __riscv_vloxseg8ei16_v_i16m1(__VA_ARGS__)
+#define vloxseg2ei16_v_i16m2(...) __riscv_vloxseg2ei16_v_i16m2(__VA_ARGS__)
+#define vloxseg3ei16_v_i16m2(...) __riscv_vloxseg3ei16_v_i16m2(__VA_ARGS__)
+#define vloxseg4ei16_v_i16m2(...) __riscv_vloxseg4ei16_v_i16m2(__VA_ARGS__)
+#define vloxseg2ei16_v_i16m4(...) __riscv_vloxseg2ei16_v_i16m4(__VA_ARGS__)
+#define vloxseg2ei32_v_i16mf4(...) __riscv_vloxseg2ei32_v_i16mf4(__VA_ARGS__)
+#define vloxseg3ei32_v_i16mf4(...) __riscv_vloxseg3ei32_v_i16mf4(__VA_ARGS__)
+#define vloxseg4ei32_v_i16mf4(...) __riscv_vloxseg4ei32_v_i16mf4(__VA_ARGS__)
+#define vloxseg5ei32_v_i16mf4(...) __riscv_vloxseg5ei32_v_i16mf4(__VA_ARGS__)
+#define vloxseg6ei32_v_i16mf4(...) __riscv_vloxseg6ei32_v_i16mf4(__VA_ARGS__)
+#define vloxseg7ei32_v_i16mf4(...) __riscv_vloxseg7ei32_v_i16mf4(__VA_ARGS__)
+#define vloxseg8ei32_v_i16mf4(...) __riscv_vloxseg8ei32_v_i16mf4(__VA_ARGS__)
+#define vloxseg2ei32_v_i16mf2(...) __riscv_vloxseg2ei32_v_i16mf2(__VA_ARGS__)
+#define vloxseg3ei32_v_i16mf2(...) __riscv_vloxseg3ei32_v_i16mf2(__VA_ARGS__)
+#define vloxseg4ei32_v_i16mf2(...) __riscv_vloxseg4ei32_v_i16mf2(__VA_ARGS__)
+#define vloxseg5ei32_v_i16mf2(...) __riscv_vloxseg5ei32_v_i16mf2(__VA_ARGS__)
+#define vloxseg6ei32_v_i16mf2(...) __riscv_vloxseg6ei32_v_i16mf2(__VA_ARGS__)
+#define vloxseg7ei32_v_i16mf2(...) __riscv_vloxseg7ei32_v_i16mf2(__VA_ARGS__)
+#define vloxseg8ei32_v_i16mf2(...) __riscv_vloxseg8ei32_v_i16mf2(__VA_ARGS__)
+#define vloxseg2ei32_v_i16m1(...) __riscv_vloxseg2ei32_v_i16m1(__VA_ARGS__)
+#define vloxseg3ei32_v_i16m1(...) __riscv_vloxseg3ei32_v_i16m1(__VA_ARGS__)
+#define vloxseg4ei32_v_i16m1(...) __riscv_vloxseg4ei32_v_i16m1(__VA_ARGS__)
+#define vloxseg5ei32_v_i16m1(...) __riscv_vloxseg5ei32_v_i16m1(__VA_ARGS__)
+#define vloxseg6ei32_v_i16m1(...) __riscv_vloxseg6ei32_v_i16m1(__VA_ARGS__)
+#define vloxseg7ei32_v_i16m1(...) __riscv_vloxseg7ei32_v_i16m1(__VA_ARGS__)
+#define vloxseg8ei32_v_i16m1(...) __riscv_vloxseg8ei32_v_i16m1(__VA_ARGS__)
+#define vloxseg2ei32_v_i16m2(...) __riscv_vloxseg2ei32_v_i16m2(__VA_ARGS__)
+#define vloxseg3ei32_v_i16m2(...) __riscv_vloxseg3ei32_v_i16m2(__VA_ARGS__)
+#define vloxseg4ei32_v_i16m2(...) __riscv_vloxseg4ei32_v_i16m2(__VA_ARGS__)
+#define vloxseg2ei32_v_i16m4(...) __riscv_vloxseg2ei32_v_i16m4(__VA_ARGS__)
+#define vloxseg2ei64_v_i16mf4(...) __riscv_vloxseg2ei64_v_i16mf4(__VA_ARGS__)
+#define vloxseg3ei64_v_i16mf4(...) __riscv_vloxseg3ei64_v_i16mf4(__VA_ARGS__)
+#define vloxseg4ei64_v_i16mf4(...) __riscv_vloxseg4ei64_v_i16mf4(__VA_ARGS__)
+#define vloxseg5ei64_v_i16mf4(...) __riscv_vloxseg5ei64_v_i16mf4(__VA_ARGS__)
+#define vloxseg6ei64_v_i16mf4(...) __riscv_vloxseg6ei64_v_i16mf4(__VA_ARGS__)
+#define vloxseg7ei64_v_i16mf4(...) __riscv_vloxseg7ei64_v_i16mf4(__VA_ARGS__)
+#define vloxseg8ei64_v_i16mf4(...) __riscv_vloxseg8ei64_v_i16mf4(__VA_ARGS__)
+#define vloxseg2ei64_v_i16mf2(...) __riscv_vloxseg2ei64_v_i16mf2(__VA_ARGS__)
+#define vloxseg3ei64_v_i16mf2(...) __riscv_vloxseg3ei64_v_i16mf2(__VA_ARGS__)
+#define vloxseg4ei64_v_i16mf2(...) __riscv_vloxseg4ei64_v_i16mf2(__VA_ARGS__)
+#define vloxseg5ei64_v_i16mf2(...) __riscv_vloxseg5ei64_v_i16mf2(__VA_ARGS__)
+#define vloxseg6ei64_v_i16mf2(...) __riscv_vloxseg6ei64_v_i16mf2(__VA_ARGS__)
+#define vloxseg7ei64_v_i16mf2(...) __riscv_vloxseg7ei64_v_i16mf2(__VA_ARGS__)
+#define vloxseg8ei64_v_i16mf2(...) __riscv_vloxseg8ei64_v_i16mf2(__VA_ARGS__)
+#define vloxseg2ei64_v_i16m1(...) __riscv_vloxseg2ei64_v_i16m1(__VA_ARGS__)
+#define vloxseg3ei64_v_i16m1(...) __riscv_vloxseg3ei64_v_i16m1(__VA_ARGS__)
+#define vloxseg4ei64_v_i16m1(...) __riscv_vloxseg4ei64_v_i16m1(__VA_ARGS__)
+#define vloxseg5ei64_v_i16m1(...) __riscv_vloxseg5ei64_v_i16m1(__VA_ARGS__)
+#define vloxseg6ei64_v_i16m1(...) __riscv_vloxseg6ei64_v_i16m1(__VA_ARGS__)
+#define vloxseg7ei64_v_i16m1(...) __riscv_vloxseg7ei64_v_i16m1(__VA_ARGS__)
+#define vloxseg8ei64_v_i16m1(...) __riscv_vloxseg8ei64_v_i16m1(__VA_ARGS__)
+#define vloxseg2ei64_v_i16m2(...) __riscv_vloxseg2ei64_v_i16m2(__VA_ARGS__)
+#define vloxseg3ei64_v_i16m2(...) __riscv_vloxseg3ei64_v_i16m2(__VA_ARGS__)
+#define vloxseg4ei64_v_i16m2(...) __riscv_vloxseg4ei64_v_i16m2(__VA_ARGS__)
+#define vloxseg2ei8_v_i32mf2(...) __riscv_vloxseg2ei8_v_i32mf2(__VA_ARGS__)
+#define vloxseg3ei8_v_i32mf2(...) __riscv_vloxseg3ei8_v_i32mf2(__VA_ARGS__)
+#define vloxseg4ei8_v_i32mf2(...) __riscv_vloxseg4ei8_v_i32mf2(__VA_ARGS__)
+#define vloxseg5ei8_v_i32mf2(...) __riscv_vloxseg5ei8_v_i32mf2(__VA_ARGS__)
+#define vloxseg6ei8_v_i32mf2(...) __riscv_vloxseg6ei8_v_i32mf2(__VA_ARGS__)
+#define vloxseg7ei8_v_i32mf2(...) __riscv_vloxseg7ei8_v_i32mf2(__VA_ARGS__)
+#define vloxseg8ei8_v_i32mf2(...) __riscv_vloxseg8ei8_v_i32mf2(__VA_ARGS__)
+#define vloxseg2ei8_v_i32m1(...) __riscv_vloxseg2ei8_v_i32m1(__VA_ARGS__)
+#define vloxseg3ei8_v_i32m1(...) __riscv_vloxseg3ei8_v_i32m1(__VA_ARGS__)
+#define vloxseg4ei8_v_i32m1(...) __riscv_vloxseg4ei8_v_i32m1(__VA_ARGS__)
+#define vloxseg5ei8_v_i32m1(...) __riscv_vloxseg5ei8_v_i32m1(__VA_ARGS__)
+#define vloxseg6ei8_v_i32m1(...) __riscv_vloxseg6ei8_v_i32m1(__VA_ARGS__)
+#define vloxseg7ei8_v_i32m1(...) __riscv_vloxseg7ei8_v_i32m1(__VA_ARGS__)
+#define vloxseg8ei8_v_i32m1(...) __riscv_vloxseg8ei8_v_i32m1(__VA_ARGS__)
+#define vloxseg2ei8_v_i32m2(...) __riscv_vloxseg2ei8_v_i32m2(__VA_ARGS__)
+#define vloxseg3ei8_v_i32m2(...) __riscv_vloxseg3ei8_v_i32m2(__VA_ARGS__)
+#define vloxseg4ei8_v_i32m2(...) __riscv_vloxseg4ei8_v_i32m2(__VA_ARGS__)
+#define vloxseg2ei8_v_i32m4(...) __riscv_vloxseg2ei8_v_i32m4(__VA_ARGS__)
+#define vloxseg2ei16_v_i32mf2(...) __riscv_vloxseg2ei16_v_i32mf2(__VA_ARGS__)
+#define vloxseg3ei16_v_i32mf2(...) __riscv_vloxseg3ei16_v_i32mf2(__VA_ARGS__)
+#define vloxseg4ei16_v_i32mf2(...) __riscv_vloxseg4ei16_v_i32mf2(__VA_ARGS__)
+#define vloxseg5ei16_v_i32mf2(...) __riscv_vloxseg5ei16_v_i32mf2(__VA_ARGS__)
+#define vloxseg6ei16_v_i32mf2(...) __riscv_vloxseg6ei16_v_i32mf2(__VA_ARGS__)
+#define vloxseg7ei16_v_i32mf2(...) __riscv_vloxseg7ei16_v_i32mf2(__VA_ARGS__)
+#define vloxseg8ei16_v_i32mf2(...) __riscv_vloxseg8ei16_v_i32mf2(__VA_ARGS__)
+#define vloxseg2ei16_v_i32m1(...) __riscv_vloxseg2ei16_v_i32m1(__VA_ARGS__)
+#define vloxseg3ei16_v_i32m1(...) __riscv_vloxseg3ei16_v_i32m1(__VA_ARGS__)
+#define vloxseg4ei16_v_i32m1(...) __riscv_vloxseg4ei16_v_i32m1(__VA_ARGS__)
+#define vloxseg5ei16_v_i32m1(...) __riscv_vloxseg5ei16_v_i32m1(__VA_ARGS__)
+#define vloxseg6ei16_v_i32m1(...) __riscv_vloxseg6ei16_v_i32m1(__VA_ARGS__)
+#define vloxseg7ei16_v_i32m1(...) __riscv_vloxseg7ei16_v_i32m1(__VA_ARGS__)
+#define vloxseg8ei16_v_i32m1(...) __riscv_vloxseg8ei16_v_i32m1(__VA_ARGS__)
+#define vloxseg2ei16_v_i32m2(...) __riscv_vloxseg2ei16_v_i32m2(__VA_ARGS__)
+#define vloxseg3ei16_v_i32m2(...) __riscv_vloxseg3ei16_v_i32m2(__VA_ARGS__)
+#define vloxseg4ei16_v_i32m2(...) __riscv_vloxseg4ei16_v_i32m2(__VA_ARGS__)
+#define vloxseg2ei16_v_i32m4(...) __riscv_vloxseg2ei16_v_i32m4(__VA_ARGS__)
+#define vloxseg2ei32_v_i32mf2(...) __riscv_vloxseg2ei32_v_i32mf2(__VA_ARGS__)
+#define vloxseg3ei32_v_i32mf2(...) __riscv_vloxseg3ei32_v_i32mf2(__VA_ARGS__)
+#define vloxseg4ei32_v_i32mf2(...) __riscv_vloxseg4ei32_v_i32mf2(__VA_ARGS__)
+#define vloxseg5ei32_v_i32mf2(...) __riscv_vloxseg5ei32_v_i32mf2(__VA_ARGS__)
+#define vloxseg6ei32_v_i32mf2(...) __riscv_vloxseg6ei32_v_i32mf2(__VA_ARGS__)
+#define vloxseg7ei32_v_i32mf2(...) __riscv_vloxseg7ei32_v_i32mf2(__VA_ARGS__)
+#define vloxseg8ei32_v_i32mf2(...) __riscv_vloxseg8ei32_v_i32mf2(__VA_ARGS__)
+#define vloxseg2ei32_v_i32m1(...) __riscv_vloxseg2ei32_v_i32m1(__VA_ARGS__)
+#define vloxseg3ei32_v_i32m1(...) __riscv_vloxseg3ei32_v_i32m1(__VA_ARGS__)
+#define vloxseg4ei32_v_i32m1(...) __riscv_vloxseg4ei32_v_i32m1(__VA_ARGS__)
+#define vloxseg5ei32_v_i32m1(...) __riscv_vloxseg5ei32_v_i32m1(__VA_ARGS__)
+#define vloxseg6ei32_v_i32m1(...) __riscv_vloxseg6ei32_v_i32m1(__VA_ARGS__)
+#define vloxseg7ei32_v_i32m1(...) __riscv_vloxseg7ei32_v_i32m1(__VA_ARGS__)
+#define vloxseg8ei32_v_i32m1(...) __riscv_vloxseg8ei32_v_i32m1(__VA_ARGS__)
+#define vloxseg2ei32_v_i32m2(...) __riscv_vloxseg2ei32_v_i32m2(__VA_ARGS__)
+#define vloxseg3ei32_v_i32m2(...) __riscv_vloxseg3ei32_v_i32m2(__VA_ARGS__)
+#define vloxseg4ei32_v_i32m2(...) __riscv_vloxseg4ei32_v_i32m2(__VA_ARGS__)
+#define vloxseg2ei32_v_i32m4(...) __riscv_vloxseg2ei32_v_i32m4(__VA_ARGS__)
+#define vloxseg2ei64_v_i32mf2(...) __riscv_vloxseg2ei64_v_i32mf2(__VA_ARGS__)
+#define vloxseg3ei64_v_i32mf2(...) __riscv_vloxseg3ei64_v_i32mf2(__VA_ARGS__)
+#define vloxseg4ei64_v_i32mf2(...) __riscv_vloxseg4ei64_v_i32mf2(__VA_ARGS__)
+#define vloxseg5ei64_v_i32mf2(...) __riscv_vloxseg5ei64_v_i32mf2(__VA_ARGS__)
+#define vloxseg6ei64_v_i32mf2(...) __riscv_vloxseg6ei64_v_i32mf2(__VA_ARGS__)
+#define vloxseg7ei64_v_i32mf2(...) __riscv_vloxseg7ei64_v_i32mf2(__VA_ARGS__)
+#define vloxseg8ei64_v_i32mf2(...) __riscv_vloxseg8ei64_v_i32mf2(__VA_ARGS__)
+#define vloxseg2ei64_v_i32m1(...) __riscv_vloxseg2ei64_v_i32m1(__VA_ARGS__)
+#define vloxseg3ei64_v_i32m1(...) __riscv_vloxseg3ei64_v_i32m1(__VA_ARGS__)
+#define vloxseg4ei64_v_i32m1(...) __riscv_vloxseg4ei64_v_i32m1(__VA_ARGS__)
+#define vloxseg5ei64_v_i32m1(...) __riscv_vloxseg5ei64_v_i32m1(__VA_ARGS__)
+#define vloxseg6ei64_v_i32m1(...) __riscv_vloxseg6ei64_v_i32m1(__VA_ARGS__)
+#define vloxseg7ei64_v_i32m1(...) __riscv_vloxseg7ei64_v_i32m1(__VA_ARGS__)
+#define vloxseg8ei64_v_i32m1(...) __riscv_vloxseg8ei64_v_i32m1(__VA_ARGS__)
+#define vloxseg2ei64_v_i32m2(...) __riscv_vloxseg2ei64_v_i32m2(__VA_ARGS__)
+#define vloxseg3ei64_v_i32m2(...) __riscv_vloxseg3ei64_v_i32m2(__VA_ARGS__)
+#define vloxseg4ei64_v_i32m2(...) __riscv_vloxseg4ei64_v_i32m2(__VA_ARGS__)
+#define vloxseg2ei64_v_i32m4(...) __riscv_vloxseg2ei64_v_i32m4(__VA_ARGS__)
+#define vloxseg2ei8_v_i64m1(...) __riscv_vloxseg2ei8_v_i64m1(__VA_ARGS__)
+#define vloxseg3ei8_v_i64m1(...) __riscv_vloxseg3ei8_v_i64m1(__VA_ARGS__)
+#define vloxseg4ei8_v_i64m1(...) __riscv_vloxseg4ei8_v_i64m1(__VA_ARGS__)
+#define vloxseg5ei8_v_i64m1(...) __riscv_vloxseg5ei8_v_i64m1(__VA_ARGS__)
+#define vloxseg6ei8_v_i64m1(...) __riscv_vloxseg6ei8_v_i64m1(__VA_ARGS__)
+#define vloxseg7ei8_v_i64m1(...) __riscv_vloxseg7ei8_v_i64m1(__VA_ARGS__)
+#define vloxseg8ei8_v_i64m1(...) __riscv_vloxseg8ei8_v_i64m1(__VA_ARGS__)
+#define vloxseg2ei8_v_i64m2(...) __riscv_vloxseg2ei8_v_i64m2(__VA_ARGS__)
+#define vloxseg3ei8_v_i64m2(...) __riscv_vloxseg3ei8_v_i64m2(__VA_ARGS__)
+#define vloxseg4ei8_v_i64m2(...) __riscv_vloxseg4ei8_v_i64m2(__VA_ARGS__)
+#define vloxseg2ei8_v_i64m4(...) __riscv_vloxseg2ei8_v_i64m4(__VA_ARGS__)
+#define vloxseg2ei16_v_i64m1(...) __riscv_vloxseg2ei16_v_i64m1(__VA_ARGS__)
+#define vloxseg3ei16_v_i64m1(...) __riscv_vloxseg3ei16_v_i64m1(__VA_ARGS__)
+#define vloxseg4ei16_v_i64m1(...) __riscv_vloxseg4ei16_v_i64m1(__VA_ARGS__)
+#define vloxseg5ei16_v_i64m1(...) __riscv_vloxseg5ei16_v_i64m1(__VA_ARGS__)
+#define vloxseg6ei16_v_i64m1(...) __riscv_vloxseg6ei16_v_i64m1(__VA_ARGS__)
+#define vloxseg7ei16_v_i64m1(...) __riscv_vloxseg7ei16_v_i64m1(__VA_ARGS__)
+#define vloxseg8ei16_v_i64m1(...) __riscv_vloxseg8ei16_v_i64m1(__VA_ARGS__)
+#define vloxseg2ei16_v_i64m2(...) __riscv_vloxseg2ei16_v_i64m2(__VA_ARGS__)
+#define vloxseg3ei16_v_i64m2(...) __riscv_vloxseg3ei16_v_i64m2(__VA_ARGS__)
+#define vloxseg4ei16_v_i64m2(...) __riscv_vloxseg4ei16_v_i64m2(__VA_ARGS__)
+#define vloxseg2ei16_v_i64m4(...) __riscv_vloxseg2ei16_v_i64m4(__VA_ARGS__)
+#define vloxseg2ei32_v_i64m1(...) __riscv_vloxseg2ei32_v_i64m1(__VA_ARGS__)
+#define vloxseg3ei32_v_i64m1(...) __riscv_vloxseg3ei32_v_i64m1(__VA_ARGS__)
+#define vloxseg4ei32_v_i64m1(...) __riscv_vloxseg4ei32_v_i64m1(__VA_ARGS__)
+#define vloxseg5ei32_v_i64m1(...) __riscv_vloxseg5ei32_v_i64m1(__VA_ARGS__)
+#define vloxseg6ei32_v_i64m1(...) __riscv_vloxseg6ei32_v_i64m1(__VA_ARGS__)
+#define vloxseg7ei32_v_i64m1(...) __riscv_vloxseg7ei32_v_i64m1(__VA_ARGS__)
+#define vloxseg8ei32_v_i64m1(...) __riscv_vloxseg8ei32_v_i64m1(__VA_ARGS__)
+#define vloxseg2ei32_v_i64m2(...) __riscv_vloxseg2ei32_v_i64m2(__VA_ARGS__)
+#define vloxseg3ei32_v_i64m2(...) __riscv_vloxseg3ei32_v_i64m2(__VA_ARGS__)
+#define vloxseg4ei32_v_i64m2(...) __riscv_vloxseg4ei32_v_i64m2(__VA_ARGS__)
+#define vloxseg2ei32_v_i64m4(...) __riscv_vloxseg2ei32_v_i64m4(__VA_ARGS__)
+#define vloxseg2ei64_v_i64m1(...) __riscv_vloxseg2ei64_v_i64m1(__VA_ARGS__)
+#define vloxseg3ei64_v_i64m1(...) __riscv_vloxseg3ei64_v_i64m1(__VA_ARGS__)
+#define vloxseg4ei64_v_i64m1(...) __riscv_vloxseg4ei64_v_i64m1(__VA_ARGS__)
+#define vloxseg5ei64_v_i64m1(...) __riscv_vloxseg5ei64_v_i64m1(__VA_ARGS__)
+#define vloxseg6ei64_v_i64m1(...) __riscv_vloxseg6ei64_v_i64m1(__VA_ARGS__)
+#define vloxseg7ei64_v_i64m1(...) __riscv_vloxseg7ei64_v_i64m1(__VA_ARGS__)
+#define vloxseg8ei64_v_i64m1(...) __riscv_vloxseg8ei64_v_i64m1(__VA_ARGS__)
+#define vloxseg2ei64_v_i64m2(...) __riscv_vloxseg2ei64_v_i64m2(__VA_ARGS__)
+#define vloxseg3ei64_v_i64m2(...) __riscv_vloxseg3ei64_v_i64m2(__VA_ARGS__)
+#define vloxseg4ei64_v_i64m2(...) __riscv_vloxseg4ei64_v_i64m2(__VA_ARGS__)
+#define vloxseg2ei64_v_i64m4(...) __riscv_vloxseg2ei64_v_i64m4(__VA_ARGS__)
+#define vluxseg2ei8_v_i8mf8(...) __riscv_vluxseg2ei8_v_i8mf8(__VA_ARGS__)
+#define vluxseg3ei8_v_i8mf8(...) __riscv_vluxseg3ei8_v_i8mf8(__VA_ARGS__)
+#define vluxseg4ei8_v_i8mf8(...) __riscv_vluxseg4ei8_v_i8mf8(__VA_ARGS__)
+#define vluxseg5ei8_v_i8mf8(...) __riscv_vluxseg5ei8_v_i8mf8(__VA_ARGS__)
+#define vluxseg6ei8_v_i8mf8(...) __riscv_vluxseg6ei8_v_i8mf8(__VA_ARGS__)
+#define vluxseg7ei8_v_i8mf8(...) __riscv_vluxseg7ei8_v_i8mf8(__VA_ARGS__)
+#define vluxseg8ei8_v_i8mf8(...) __riscv_vluxseg8ei8_v_i8mf8(__VA_ARGS__)
+#define vluxseg2ei8_v_i8mf4(...) __riscv_vluxseg2ei8_v_i8mf4(__VA_ARGS__)
+#define vluxseg3ei8_v_i8mf4(...) __riscv_vluxseg3ei8_v_i8mf4(__VA_ARGS__)
+#define vluxseg4ei8_v_i8mf4(...) __riscv_vluxseg4ei8_v_i8mf4(__VA_ARGS__)
+#define vluxseg5ei8_v_i8mf4(...) __riscv_vluxseg5ei8_v_i8mf4(__VA_ARGS__)
+#define vluxseg6ei8_v_i8mf4(...) __riscv_vluxseg6ei8_v_i8mf4(__VA_ARGS__)
+#define vluxseg7ei8_v_i8mf4(...) __riscv_vluxseg7ei8_v_i8mf4(__VA_ARGS__)
+#define vluxseg8ei8_v_i8mf4(...) __riscv_vluxseg8ei8_v_i8mf4(__VA_ARGS__)
+#define vluxseg2ei8_v_i8mf2(...) __riscv_vluxseg2ei8_v_i8mf2(__VA_ARGS__)
+#define vluxseg3ei8_v_i8mf2(...) __riscv_vluxseg3ei8_v_i8mf2(__VA_ARGS__)
+#define vluxseg4ei8_v_i8mf2(...) __riscv_vluxseg4ei8_v_i8mf2(__VA_ARGS__)
+#define vluxseg5ei8_v_i8mf2(...) __riscv_vluxseg5ei8_v_i8mf2(__VA_ARGS__)
+#define vluxseg6ei8_v_i8mf2(...) __riscv_vluxseg6ei8_v_i8mf2(__VA_ARGS__)
+#define vluxseg7ei8_v_i8mf2(...) __riscv_vluxseg7ei8_v_i8mf2(__VA_ARGS__)
+#define vluxseg8ei8_v_i8mf2(...) __riscv_vluxseg8ei8_v_i8mf2(__VA_ARGS__)
+#define vluxseg2ei8_v_i8m1(...) __riscv_vluxseg2ei8_v_i8m1(__VA_ARGS__)
+#define vluxseg3ei8_v_i8m1(...) __riscv_vluxseg3ei8_v_i8m1(__VA_ARGS__)
+#define vluxseg4ei8_v_i8m1(...) __riscv_vluxseg4ei8_v_i8m1(__VA_ARGS__)
+#define vluxseg5ei8_v_i8m1(...) __riscv_vluxseg5ei8_v_i8m1(__VA_ARGS__)
+#define vluxseg6ei8_v_i8m1(...) __riscv_vluxseg6ei8_v_i8m1(__VA_ARGS__)
+#define vluxseg7ei8_v_i8m1(...) __riscv_vluxseg7ei8_v_i8m1(__VA_ARGS__)
+#define vluxseg8ei8_v_i8m1(...) __riscv_vluxseg8ei8_v_i8m1(__VA_ARGS__)
+#define vluxseg2ei8_v_i8m2(...) __riscv_vluxseg2ei8_v_i8m2(__VA_ARGS__)
+#define vluxseg3ei8_v_i8m2(...) __riscv_vluxseg3ei8_v_i8m2(__VA_ARGS__)
+#define vluxseg4ei8_v_i8m2(...) __riscv_vluxseg4ei8_v_i8m2(__VA_ARGS__)
+#define vluxseg2ei8_v_i8m4(...) __riscv_vluxseg2ei8_v_i8m4(__VA_ARGS__)
+#define vluxseg2ei16_v_i8mf8(...) __riscv_vluxseg2ei16_v_i8mf8(__VA_ARGS__)
+#define vluxseg3ei16_v_i8mf8(...) __riscv_vluxseg3ei16_v_i8mf8(__VA_ARGS__)
+#define vluxseg4ei16_v_i8mf8(...) __riscv_vluxseg4ei16_v_i8mf8(__VA_ARGS__)
+#define vluxseg5ei16_v_i8mf8(...) __riscv_vluxseg5ei16_v_i8mf8(__VA_ARGS__)
+#define vluxseg6ei16_v_i8mf8(...) __riscv_vluxseg6ei16_v_i8mf8(__VA_ARGS__)
+#define vluxseg7ei16_v_i8mf8(...) __riscv_vluxseg7ei16_v_i8mf8(__VA_ARGS__)
+#define vluxseg8ei16_v_i8mf8(...) __riscv_vluxseg8ei16_v_i8mf8(__VA_ARGS__)
+#define vluxseg2ei16_v_i8mf4(...) __riscv_vluxseg2ei16_v_i8mf4(__VA_ARGS__)
+#define vluxseg3ei16_v_i8mf4(...) __riscv_vluxseg3ei16_v_i8mf4(__VA_ARGS__)
+#define vluxseg4ei16_v_i8mf4(...) __riscv_vluxseg4ei16_v_i8mf4(__VA_ARGS__)
+#define vluxseg5ei16_v_i8mf4(...) __riscv_vluxseg5ei16_v_i8mf4(__VA_ARGS__)
+#define vluxseg6ei16_v_i8mf4(...) __riscv_vluxseg6ei16_v_i8mf4(__VA_ARGS__)
+#define vluxseg7ei16_v_i8mf4(...) __riscv_vluxseg7ei16_v_i8mf4(__VA_ARGS__)
+#define vluxseg8ei16_v_i8mf4(...) __riscv_vluxseg8ei16_v_i8mf4(__VA_ARGS__)
+#define vluxseg2ei16_v_i8mf2(...) __riscv_vluxseg2ei16_v_i8mf2(__VA_ARGS__)
+#define vluxseg3ei16_v_i8mf2(...) __riscv_vluxseg3ei16_v_i8mf2(__VA_ARGS__)
+#define vluxseg4ei16_v_i8mf2(...) __riscv_vluxseg4ei16_v_i8mf2(__VA_ARGS__)
+#define vluxseg5ei16_v_i8mf2(...) __riscv_vluxseg5ei16_v_i8mf2(__VA_ARGS__)
+#define vluxseg6ei16_v_i8mf2(...) __riscv_vluxseg6ei16_v_i8mf2(__VA_ARGS__)
+#define vluxseg7ei16_v_i8mf2(...) __riscv_vluxseg7ei16_v_i8mf2(__VA_ARGS__)
+#define vluxseg8ei16_v_i8mf2(...) __riscv_vluxseg8ei16_v_i8mf2(__VA_ARGS__)
+#define vluxseg2ei16_v_i8m1(...) __riscv_vluxseg2ei16_v_i8m1(__VA_ARGS__)
+#define vluxseg3ei16_v_i8m1(...) __riscv_vluxseg3ei16_v_i8m1(__VA_ARGS__)
+#define vluxseg4ei16_v_i8m1(...) __riscv_vluxseg4ei16_v_i8m1(__VA_ARGS__)
+#define vluxseg5ei16_v_i8m1(...) __riscv_vluxseg5ei16_v_i8m1(__VA_ARGS__)
+#define vluxseg6ei16_v_i8m1(...) __riscv_vluxseg6ei16_v_i8m1(__VA_ARGS__)
+#define vluxseg7ei16_v_i8m1(...) __riscv_vluxseg7ei16_v_i8m1(__VA_ARGS__)
+#define vluxseg8ei16_v_i8m1(...) __riscv_vluxseg8ei16_v_i8m1(__VA_ARGS__)
+#define vluxseg2ei16_v_i8m2(...) __riscv_vluxseg2ei16_v_i8m2(__VA_ARGS__)
+#define vluxseg3ei16_v_i8m2(...) __riscv_vluxseg3ei16_v_i8m2(__VA_ARGS__)
+#define vluxseg4ei16_v_i8m2(...) __riscv_vluxseg4ei16_v_i8m2(__VA_ARGS__)
+#define vluxseg2ei16_v_i8m4(...) __riscv_vluxseg2ei16_v_i8m4(__VA_ARGS__)
+#define vluxseg2ei32_v_i8mf8(...) __riscv_vluxseg2ei32_v_i8mf8(__VA_ARGS__)
+#define vluxseg3ei32_v_i8mf8(...) __riscv_vluxseg3ei32_v_i8mf8(__VA_ARGS__)
+#define vluxseg4ei32_v_i8mf8(...) __riscv_vluxseg4ei32_v_i8mf8(__VA_ARGS__)
+#define vluxseg5ei32_v_i8mf8(...) __riscv_vluxseg5ei32_v_i8mf8(__VA_ARGS__)
+#define vluxseg6ei32_v_i8mf8(...) __riscv_vluxseg6ei32_v_i8mf8(__VA_ARGS__)
+#define vluxseg7ei32_v_i8mf8(...) __riscv_vluxseg7ei32_v_i8mf8(__VA_ARGS__)
+#define vluxseg8ei32_v_i8mf8(...) __riscv_vluxseg8ei32_v_i8mf8(__VA_ARGS__)
+#define vluxseg2ei32_v_i8mf4(...) __riscv_vluxseg2ei32_v_i8mf4(__VA_ARGS__)
+#define vluxseg3ei32_v_i8mf4(...) __riscv_vluxseg3ei32_v_i8mf4(__VA_ARGS__)
+#define vluxseg4ei32_v_i8mf4(...) __riscv_vluxseg4ei32_v_i8mf4(__VA_ARGS__)
+#define vluxseg5ei32_v_i8mf4(...) __riscv_vluxseg5ei32_v_i8mf4(__VA_ARGS__)
+#define vluxseg6ei32_v_i8mf4(...) __riscv_vluxseg6ei32_v_i8mf4(__VA_ARGS__)
+#define vluxseg7ei32_v_i8mf4(...) __riscv_vluxseg7ei32_v_i8mf4(__VA_ARGS__)
+#define vluxseg8ei32_v_i8mf4(...) __riscv_vluxseg8ei32_v_i8mf4(__VA_ARGS__)
+#define vluxseg2ei32_v_i8mf2(...) __riscv_vluxseg2ei32_v_i8mf2(__VA_ARGS__)
+#define vluxseg3ei32_v_i8mf2(...) __riscv_vluxseg3ei32_v_i8mf2(__VA_ARGS__)
+#define vluxseg4ei32_v_i8mf2(...) __riscv_vluxseg4ei32_v_i8mf2(__VA_ARGS__)
+#define vluxseg5ei32_v_i8mf2(...) __riscv_vluxseg5ei32_v_i8mf2(__VA_ARGS__)
+#define vluxseg6ei32_v_i8mf2(...) __riscv_vluxseg6ei32_v_i8mf2(__VA_ARGS__)
+#define vluxseg7ei32_v_i8mf2(...) __riscv_vluxseg7ei32_v_i8mf2(__VA_ARGS__)
+#define vluxseg8ei32_v_i8mf2(...) __riscv_vluxseg8ei32_v_i8mf2(__VA_ARGS__)
+#define vluxseg2ei32_v_i8m1(...) __riscv_vluxseg2ei32_v_i8m1(__VA_ARGS__)
+#define vluxseg3ei32_v_i8m1(...) __riscv_vluxseg3ei32_v_i8m1(__VA_ARGS__)
+#define vluxseg4ei32_v_i8m1(...) __riscv_vluxseg4ei32_v_i8m1(__VA_ARGS__)
+#define vluxseg5ei32_v_i8m1(...) __riscv_vluxseg5ei32_v_i8m1(__VA_ARGS__)
+#define vluxseg6ei32_v_i8m1(...) __riscv_vluxseg6ei32_v_i8m1(__VA_ARGS__)
+#define vluxseg7ei32_v_i8m1(...) __riscv_vluxseg7ei32_v_i8m1(__VA_ARGS__)
+#define vluxseg8ei32_v_i8m1(...) __riscv_vluxseg8ei32_v_i8m1(__VA_ARGS__)
+#define vluxseg2ei32_v_i8m2(...) __riscv_vluxseg2ei32_v_i8m2(__VA_ARGS__)
+#define vluxseg3ei32_v_i8m2(...) __riscv_vluxseg3ei32_v_i8m2(__VA_ARGS__)
+#define vluxseg4ei32_v_i8m2(...) __riscv_vluxseg4ei32_v_i8m2(__VA_ARGS__)
+#define vluxseg2ei64_v_i8mf8(...) __riscv_vluxseg2ei64_v_i8mf8(__VA_ARGS__)
+#define vluxseg3ei64_v_i8mf8(...) __riscv_vluxseg3ei64_v_i8mf8(__VA_ARGS__)
+#define vluxseg4ei64_v_i8mf8(...) __riscv_vluxseg4ei64_v_i8mf8(__VA_ARGS__)
+#define vluxseg5ei64_v_i8mf8(...) __riscv_vluxseg5ei64_v_i8mf8(__VA_ARGS__)
+#define vluxseg6ei64_v_i8mf8(...) __riscv_vluxseg6ei64_v_i8mf8(__VA_ARGS__)
+#define vluxseg7ei64_v_i8mf8(...) __riscv_vluxseg7ei64_v_i8mf8(__VA_ARGS__)
+#define vluxseg8ei64_v_i8mf8(...) __riscv_vluxseg8ei64_v_i8mf8(__VA_ARGS__)
+#define vluxseg2ei64_v_i8mf4(...) __riscv_vluxseg2ei64_v_i8mf4(__VA_ARGS__)
+#define vluxseg3ei64_v_i8mf4(...) __riscv_vluxseg3ei64_v_i8mf4(__VA_ARGS__)
+#define vluxseg4ei64_v_i8mf4(...) __riscv_vluxseg4ei64_v_i8mf4(__VA_ARGS__)
+#define vluxseg5ei64_v_i8mf4(...) __riscv_vluxseg5ei64_v_i8mf4(__VA_ARGS__)
+#define vluxseg6ei64_v_i8mf4(...) __riscv_vluxseg6ei64_v_i8mf4(__VA_ARGS__)
+#define vluxseg7ei64_v_i8mf4(...) __riscv_vluxseg7ei64_v_i8mf4(__VA_ARGS__)
+#define vluxseg8ei64_v_i8mf4(...) __riscv_vluxseg8ei64_v_i8mf4(__VA_ARGS__)
+#define vluxseg2ei64_v_i8mf2(...) __riscv_vluxseg2ei64_v_i8mf2(__VA_ARGS__)
+#define vluxseg3ei64_v_i8mf2(...) __riscv_vluxseg3ei64_v_i8mf2(__VA_ARGS__)
+#define vluxseg4ei64_v_i8mf2(...) __riscv_vluxseg4ei64_v_i8mf2(__VA_ARGS__)
+#define vluxseg5ei64_v_i8mf2(...) __riscv_vluxseg5ei64_v_i8mf2(__VA_ARGS__)
+#define vluxseg6ei64_v_i8mf2(...) __riscv_vluxseg6ei64_v_i8mf2(__VA_ARGS__)
+#define vluxseg7ei64_v_i8mf2(...) __riscv_vluxseg7ei64_v_i8mf2(__VA_ARGS__)
+#define vluxseg8ei64_v_i8mf2(...) __riscv_vluxseg8ei64_v_i8mf2(__VA_ARGS__)
+#define vluxseg2ei64_v_i8m1(...) __riscv_vluxseg2ei64_v_i8m1(__VA_ARGS__)
+#define vluxseg3ei64_v_i8m1(...) __riscv_vluxseg3ei64_v_i8m1(__VA_ARGS__)
+#define vluxseg4ei64_v_i8m1(...) __riscv_vluxseg4ei64_v_i8m1(__VA_ARGS__)
+#define vluxseg5ei64_v_i8m1(...) __riscv_vluxseg5ei64_v_i8m1(__VA_ARGS__)
+#define vluxseg6ei64_v_i8m1(...) __riscv_vluxseg6ei64_v_i8m1(__VA_ARGS__)
+#define vluxseg7ei64_v_i8m1(...) __riscv_vluxseg7ei64_v_i8m1(__VA_ARGS__)
+#define vluxseg8ei64_v_i8m1(...) __riscv_vluxseg8ei64_v_i8m1(__VA_ARGS__)
+#define vluxseg2ei8_v_i16mf4(...) __riscv_vluxseg2ei8_v_i16mf4(__VA_ARGS__)
+#define vluxseg3ei8_v_i16mf4(...) __riscv_vluxseg3ei8_v_i16mf4(__VA_ARGS__)
+#define vluxseg4ei8_v_i16mf4(...) __riscv_vluxseg4ei8_v_i16mf4(__VA_ARGS__)
+#define vluxseg5ei8_v_i16mf4(...) __riscv_vluxseg5ei8_v_i16mf4(__VA_ARGS__)
+#define vluxseg6ei8_v_i16mf4(...) __riscv_vluxseg6ei8_v_i16mf4(__VA_ARGS__)
+#define vluxseg7ei8_v_i16mf4(...) __riscv_vluxseg7ei8_v_i16mf4(__VA_ARGS__)
+#define vluxseg8ei8_v_i16mf4(...) __riscv_vluxseg8ei8_v_i16mf4(__VA_ARGS__)
+#define vluxseg2ei8_v_i16mf2(...) __riscv_vluxseg2ei8_v_i16mf2(__VA_ARGS__)
+#define vluxseg3ei8_v_i16mf2(...) __riscv_vluxseg3ei8_v_i16mf2(__VA_ARGS__)
+#define vluxseg4ei8_v_i16mf2(...) __riscv_vluxseg4ei8_v_i16mf2(__VA_ARGS__)
+#define vluxseg5ei8_v_i16mf2(...) __riscv_vluxseg5ei8_v_i16mf2(__VA_ARGS__)
+#define vluxseg6ei8_v_i16mf2(...) __riscv_vluxseg6ei8_v_i16mf2(__VA_ARGS__)
+#define vluxseg7ei8_v_i16mf2(...) __riscv_vluxseg7ei8_v_i16mf2(__VA_ARGS__)
+#define vluxseg8ei8_v_i16mf2(...) __riscv_vluxseg8ei8_v_i16mf2(__VA_ARGS__)
+#define vluxseg2ei8_v_i16m1(...) __riscv_vluxseg2ei8_v_i16m1(__VA_ARGS__)
+#define vluxseg3ei8_v_i16m1(...) __riscv_vluxseg3ei8_v_i16m1(__VA_ARGS__)
+#define vluxseg4ei8_v_i16m1(...) __riscv_vluxseg4ei8_v_i16m1(__VA_ARGS__)
+#define vluxseg5ei8_v_i16m1(...) __riscv_vluxseg5ei8_v_i16m1(__VA_ARGS__)
+#define vluxseg6ei8_v_i16m1(...) __riscv_vluxseg6ei8_v_i16m1(__VA_ARGS__)
+#define vluxseg7ei8_v_i16m1(...) __riscv_vluxseg7ei8_v_i16m1(__VA_ARGS__)
+#define vluxseg8ei8_v_i16m1(...) __riscv_vluxseg8ei8_v_i16m1(__VA_ARGS__)
+#define vluxseg2ei8_v_i16m2(...) __riscv_vluxseg2ei8_v_i16m2(__VA_ARGS__)
+#define vluxseg3ei8_v_i16m2(...) __riscv_vluxseg3ei8_v_i16m2(__VA_ARGS__)
+#define vluxseg4ei8_v_i16m2(...) __riscv_vluxseg4ei8_v_i16m2(__VA_ARGS__)
+#define vluxseg2ei8_v_i16m4(...) __riscv_vluxseg2ei8_v_i16m4(__VA_ARGS__)
+#define vluxseg2ei16_v_i16mf4(...) __riscv_vluxseg2ei16_v_i16mf4(__VA_ARGS__)
+#define vluxseg3ei16_v_i16mf4(...) __riscv_vluxseg3ei16_v_i16mf4(__VA_ARGS__)
+#define vluxseg4ei16_v_i16mf4(...) __riscv_vluxseg4ei16_v_i16mf4(__VA_ARGS__)
+#define vluxseg5ei16_v_i16mf4(...) __riscv_vluxseg5ei16_v_i16mf4(__VA_ARGS__)
+#define vluxseg6ei16_v_i16mf4(...) __riscv_vluxseg6ei16_v_i16mf4(__VA_ARGS__)
+#define vluxseg7ei16_v_i16mf4(...) __riscv_vluxseg7ei16_v_i16mf4(__VA_ARGS__)
+#define vluxseg8ei16_v_i16mf4(...) __riscv_vluxseg8ei16_v_i16mf4(__VA_ARGS__)
+#define vluxseg2ei16_v_i16mf2(...) __riscv_vluxseg2ei16_v_i16mf2(__VA_ARGS__)
+#define vluxseg3ei16_v_i16mf2(...) __riscv_vluxseg3ei16_v_i16mf2(__VA_ARGS__)
+#define vluxseg4ei16_v_i16mf2(...) __riscv_vluxseg4ei16_v_i16mf2(__VA_ARGS__)
+#define vluxseg5ei16_v_i16mf2(...) __riscv_vluxseg5ei16_v_i16mf2(__VA_ARGS__)
+#define vluxseg6ei16_v_i16mf2(...) __riscv_vluxseg6ei16_v_i16mf2(__VA_ARGS__)
+#define vluxseg7ei16_v_i16mf2(...) __riscv_vluxseg7ei16_v_i16mf2(__VA_ARGS__)
+#define vluxseg8ei16_v_i16mf2(...) __riscv_vluxseg8ei16_v_i16mf2(__VA_ARGS__)
+#define vluxseg2ei16_v_i16m1(...) __riscv_vluxseg2ei16_v_i16m1(__VA_ARGS__)
+#define vluxseg3ei16_v_i16m1(...) __riscv_vluxseg3ei16_v_i16m1(__VA_ARGS__)
+#define vluxseg4ei16_v_i16m1(...) __riscv_vluxseg4ei16_v_i16m1(__VA_ARGS__)
+#define vluxseg5ei16_v_i16m1(...) __riscv_vluxseg5ei16_v_i16m1(__VA_ARGS__)
+#define vluxseg6ei16_v_i16m1(...) __riscv_vluxseg6ei16_v_i16m1(__VA_ARGS__)
+#define vluxseg7ei16_v_i16m1(...) __riscv_vluxseg7ei16_v_i16m1(__VA_ARGS__)
+#define vluxseg8ei16_v_i16m1(...) __riscv_vluxseg8ei16_v_i16m1(__VA_ARGS__)
+#define vluxseg2ei16_v_i16m2(...) __riscv_vluxseg2ei16_v_i16m2(__VA_ARGS__)
+#define vluxseg3ei16_v_i16m2(...) __riscv_vluxseg3ei16_v_i16m2(__VA_ARGS__)
+#define vluxseg4ei16_v_i16m2(...) __riscv_vluxseg4ei16_v_i16m2(__VA_ARGS__)
+#define vluxseg2ei16_v_i16m4(...) __riscv_vluxseg2ei16_v_i16m4(__VA_ARGS__)
+#define vluxseg2ei32_v_i16mf4(...) __riscv_vluxseg2ei32_v_i16mf4(__VA_ARGS__)
+#define vluxseg3ei32_v_i16mf4(...) __riscv_vluxseg3ei32_v_i16mf4(__VA_ARGS__)
+#define vluxseg4ei32_v_i16mf4(...) __riscv_vluxseg4ei32_v_i16mf4(__VA_ARGS__)
+#define vluxseg5ei32_v_i16mf4(...) __riscv_vluxseg5ei32_v_i16mf4(__VA_ARGS__)
+#define vluxseg6ei32_v_i16mf4(...) __riscv_vluxseg6ei32_v_i16mf4(__VA_ARGS__)
+#define vluxseg7ei32_v_i16mf4(...) __riscv_vluxseg7ei32_v_i16mf4(__VA_ARGS__)
+#define vluxseg8ei32_v_i16mf4(...) __riscv_vluxseg8ei32_v_i16mf4(__VA_ARGS__)
+#define vluxseg2ei32_v_i16mf2(...) __riscv_vluxseg2ei32_v_i16mf2(__VA_ARGS__)
+#define vluxseg3ei32_v_i16mf2(...) __riscv_vluxseg3ei32_v_i16mf2(__VA_ARGS__)
+#define vluxseg4ei32_v_i16mf2(...) __riscv_vluxseg4ei32_v_i16mf2(__VA_ARGS__)
+#define vluxseg5ei32_v_i16mf2(...) __riscv_vluxseg5ei32_v_i16mf2(__VA_ARGS__)
+#define vluxseg6ei32_v_i16mf2(...) __riscv_vluxseg6ei32_v_i16mf2(__VA_ARGS__)
+#define vluxseg7ei32_v_i16mf2(...) __riscv_vluxseg7ei32_v_i16mf2(__VA_ARGS__)
+#define vluxseg8ei32_v_i16mf2(...) __riscv_vluxseg8ei32_v_i16mf2(__VA_ARGS__)
+#define vluxseg2ei32_v_i16m1(...) __riscv_vluxseg2ei32_v_i16m1(__VA_ARGS__)
+#define vluxseg3ei32_v_i16m1(...) __riscv_vluxseg3ei32_v_i16m1(__VA_ARGS__)
+#define vluxseg4ei32_v_i16m1(...) __riscv_vluxseg4ei32_v_i16m1(__VA_ARGS__)
+#define vluxseg5ei32_v_i16m1(...) __riscv_vluxseg5ei32_v_i16m1(__VA_ARGS__)
+#define vluxseg6ei32_v_i16m1(...) __riscv_vluxseg6ei32_v_i16m1(__VA_ARGS__)
+#define vluxseg7ei32_v_i16m1(...) __riscv_vluxseg7ei32_v_i16m1(__VA_ARGS__)
+#define vluxseg8ei32_v_i16m1(...) __riscv_vluxseg8ei32_v_i16m1(__VA_ARGS__)
+#define vluxseg2ei32_v_i16m2(...) __riscv_vluxseg2ei32_v_i16m2(__VA_ARGS__)
+#define vluxseg3ei32_v_i16m2(...) __riscv_vluxseg3ei32_v_i16m2(__VA_ARGS__)
+#define vluxseg4ei32_v_i16m2(...) __riscv_vluxseg4ei32_v_i16m2(__VA_ARGS__)
+#define vluxseg2ei32_v_i16m4(...) __riscv_vluxseg2ei32_v_i16m4(__VA_ARGS__)
+#define vluxseg2ei64_v_i16mf4(...) __riscv_vluxseg2ei64_v_i16mf4(__VA_ARGS__)
+#define vluxseg3ei64_v_i16mf4(...) __riscv_vluxseg3ei64_v_i16mf4(__VA_ARGS__)
+#define vluxseg4ei64_v_i16mf4(...) __riscv_vluxseg4ei64_v_i16mf4(__VA_ARGS__)
+#define vluxseg5ei64_v_i16mf4(...) __riscv_vluxseg5ei64_v_i16mf4(__VA_ARGS__)
+#define vluxseg6ei64_v_i16mf4(...) __riscv_vluxseg6ei64_v_i16mf4(__VA_ARGS__)
+#define vluxseg7ei64_v_i16mf4(...) __riscv_vluxseg7ei64_v_i16mf4(__VA_ARGS__)
+#define vluxseg8ei64_v_i16mf4(...) __riscv_vluxseg8ei64_v_i16mf4(__VA_ARGS__)
+#define vluxseg2ei64_v_i16mf2(...) __riscv_vluxseg2ei64_v_i16mf2(__VA_ARGS__)
+#define vluxseg3ei64_v_i16mf2(...) __riscv_vluxseg3ei64_v_i16mf2(__VA_ARGS__)
+#define vluxseg4ei64_v_i16mf2(...) __riscv_vluxseg4ei64_v_i16mf2(__VA_ARGS__)
+#define vluxseg5ei64_v_i16mf2(...) __riscv_vluxseg5ei64_v_i16mf2(__VA_ARGS__)
+#define vluxseg6ei64_v_i16mf2(...) __riscv_vluxseg6ei64_v_i16mf2(__VA_ARGS__)
+#define vluxseg7ei64_v_i16mf2(...) __riscv_vluxseg7ei64_v_i16mf2(__VA_ARGS__)
+#define vluxseg8ei64_v_i16mf2(...) __riscv_vluxseg8ei64_v_i16mf2(__VA_ARGS__)
+#define vluxseg2ei64_v_i16m1(...) __riscv_vluxseg2ei64_v_i16m1(__VA_ARGS__)
+#define vluxseg3ei64_v_i16m1(...) __riscv_vluxseg3ei64_v_i16m1(__VA_ARGS__)
+#define vluxseg4ei64_v_i16m1(...) __riscv_vluxseg4ei64_v_i16m1(__VA_ARGS__)
+#define vluxseg5ei64_v_i16m1(...) __riscv_vluxseg5ei64_v_i16m1(__VA_ARGS__)
+#define vluxseg6ei64_v_i16m1(...) __riscv_vluxseg6ei64_v_i16m1(__VA_ARGS__)
+#define vluxseg7ei64_v_i16m1(...) __riscv_vluxseg7ei64_v_i16m1(__VA_ARGS__)
+#define vluxseg8ei64_v_i16m1(...) __riscv_vluxseg8ei64_v_i16m1(__VA_ARGS__)
+#define vluxseg2ei64_v_i16m2(...) __riscv_vluxseg2ei64_v_i16m2(__VA_ARGS__)
+#define vluxseg3ei64_v_i16m2(...) __riscv_vluxseg3ei64_v_i16m2(__VA_ARGS__)
+#define vluxseg4ei64_v_i16m2(...) __riscv_vluxseg4ei64_v_i16m2(__VA_ARGS__)
+#define vluxseg2ei8_v_i32mf2(...) __riscv_vluxseg2ei8_v_i32mf2(__VA_ARGS__)
+#define vluxseg3ei8_v_i32mf2(...) __riscv_vluxseg3ei8_v_i32mf2(__VA_ARGS__)
+#define vluxseg4ei8_v_i32mf2(...) __riscv_vluxseg4ei8_v_i32mf2(__VA_ARGS__)
+#define vluxseg5ei8_v_i32mf2(...) __riscv_vluxseg5ei8_v_i32mf2(__VA_ARGS__)
+#define vluxseg6ei8_v_i32mf2(...) __riscv_vluxseg6ei8_v_i32mf2(__VA_ARGS__)
+#define vluxseg7ei8_v_i32mf2(...) __riscv_vluxseg7ei8_v_i32mf2(__VA_ARGS__)
+#define vluxseg8ei8_v_i32mf2(...) __riscv_vluxseg8ei8_v_i32mf2(__VA_ARGS__)
+#define vluxseg2ei8_v_i32m1(...) __riscv_vluxseg2ei8_v_i32m1(__VA_ARGS__)
+#define vluxseg3ei8_v_i32m1(...) __riscv_vluxseg3ei8_v_i32m1(__VA_ARGS__)
+#define vluxseg4ei8_v_i32m1(...) __riscv_vluxseg4ei8_v_i32m1(__VA_ARGS__)
+#define vluxseg5ei8_v_i32m1(...) __riscv_vluxseg5ei8_v_i32m1(__VA_ARGS__)
+#define vluxseg6ei8_v_i32m1(...) __riscv_vluxseg6ei8_v_i32m1(__VA_ARGS__)
+#define vluxseg7ei8_v_i32m1(...) __riscv_vluxseg7ei8_v_i32m1(__VA_ARGS__)
+#define vluxseg8ei8_v_i32m1(...) __riscv_vluxseg8ei8_v_i32m1(__VA_ARGS__)
+#define vluxseg2ei8_v_i32m2(...) __riscv_vluxseg2ei8_v_i32m2(__VA_ARGS__)
+#define vluxseg3ei8_v_i32m2(...) __riscv_vluxseg3ei8_v_i32m2(__VA_ARGS__)
+#define vluxseg4ei8_v_i32m2(...) __riscv_vluxseg4ei8_v_i32m2(__VA_ARGS__)
+#define vluxseg2ei8_v_i32m4(...) __riscv_vluxseg2ei8_v_i32m4(__VA_ARGS__)
+#define vluxseg2ei16_v_i32mf2(...) __riscv_vluxseg2ei16_v_i32mf2(__VA_ARGS__)
+#define vluxseg3ei16_v_i32mf2(...) __riscv_vluxseg3ei16_v_i32mf2(__VA_ARGS__)
+#define vluxseg4ei16_v_i32mf2(...) __riscv_vluxseg4ei16_v_i32mf2(__VA_ARGS__)
+#define vluxseg5ei16_v_i32mf2(...) __riscv_vluxseg5ei16_v_i32mf2(__VA_ARGS__)
+#define vluxseg6ei16_v_i32mf2(...) __riscv_vluxseg6ei16_v_i32mf2(__VA_ARGS__)
+#define vluxseg7ei16_v_i32mf2(...) __riscv_vluxseg7ei16_v_i32mf2(__VA_ARGS__)
+#define vluxseg8ei16_v_i32mf2(...) __riscv_vluxseg8ei16_v_i32mf2(__VA_ARGS__)
+#define vluxseg2ei16_v_i32m1(...) __riscv_vluxseg2ei16_v_i32m1(__VA_ARGS__)
+#define vluxseg3ei16_v_i32m1(...) __riscv_vluxseg3ei16_v_i32m1(__VA_ARGS__)
+#define vluxseg4ei16_v_i32m1(...) __riscv_vluxseg4ei16_v_i32m1(__VA_ARGS__)
+#define vluxseg5ei16_v_i32m1(...) __riscv_vluxseg5ei16_v_i32m1(__VA_ARGS__)
+#define vluxseg6ei16_v_i32m1(...) __riscv_vluxseg6ei16_v_i32m1(__VA_ARGS__)
+#define vluxseg7ei16_v_i32m1(...) __riscv_vluxseg7ei16_v_i32m1(__VA_ARGS__)
+#define vluxseg8ei16_v_i32m1(...) __riscv_vluxseg8ei16_v_i32m1(__VA_ARGS__)
+#define vluxseg2ei16_v_i32m2(...) __riscv_vluxseg2ei16_v_i32m2(__VA_ARGS__)
+#define vluxseg3ei16_v_i32m2(...) __riscv_vluxseg3ei16_v_i32m2(__VA_ARGS__)
+#define vluxseg4ei16_v_i32m2(...) __riscv_vluxseg4ei16_v_i32m2(__VA_ARGS__)
+#define vluxseg2ei16_v_i32m4(...) __riscv_vluxseg2ei16_v_i32m4(__VA_ARGS__)
+#define vluxseg2ei32_v_i32mf2(...) __riscv_vluxseg2ei32_v_i32mf2(__VA_ARGS__)
+#define vluxseg3ei32_v_i32mf2(...) __riscv_vluxseg3ei32_v_i32mf2(__VA_ARGS__)
+#define vluxseg4ei32_v_i32mf2(...) __riscv_vluxseg4ei32_v_i32mf2(__VA_ARGS__)
+#define vluxseg5ei32_v_i32mf2(...) __riscv_vluxseg5ei32_v_i32mf2(__VA_ARGS__)
+#define vluxseg6ei32_v_i32mf2(...) __riscv_vluxseg6ei32_v_i32mf2(__VA_ARGS__)
+#define vluxseg7ei32_v_i32mf2(...) __riscv_vluxseg7ei32_v_i32mf2(__VA_ARGS__)
+#define vluxseg8ei32_v_i32mf2(...) __riscv_vluxseg8ei32_v_i32mf2(__VA_ARGS__)
+#define vluxseg2ei32_v_i32m1(...) __riscv_vluxseg2ei32_v_i32m1(__VA_ARGS__)
+#define vluxseg3ei32_v_i32m1(...) __riscv_vluxseg3ei32_v_i32m1(__VA_ARGS__)
+#define vluxseg4ei32_v_i32m1(...) __riscv_vluxseg4ei32_v_i32m1(__VA_ARGS__)
+#define vluxseg5ei32_v_i32m1(...) __riscv_vluxseg5ei32_v_i32m1(__VA_ARGS__)
+#define vluxseg6ei32_v_i32m1(...) __riscv_vluxseg6ei32_v_i32m1(__VA_ARGS__)
+#define vluxseg7ei32_v_i32m1(...) __riscv_vluxseg7ei32_v_i32m1(__VA_ARGS__)
+#define vluxseg8ei32_v_i32m1(...) __riscv_vluxseg8ei32_v_i32m1(__VA_ARGS__)
+#define vluxseg2ei32_v_i32m2(...) __riscv_vluxseg2ei32_v_i32m2(__VA_ARGS__)
+#define vluxseg3ei32_v_i32m2(...) __riscv_vluxseg3ei32_v_i32m2(__VA_ARGS__)
+#define vluxseg4ei32_v_i32m2(...) __riscv_vluxseg4ei32_v_i32m2(__VA_ARGS__)
+#define vluxseg2ei32_v_i32m4(...) __riscv_vluxseg2ei32_v_i32m4(__VA_ARGS__)
+#define vluxseg2ei64_v_i32mf2(...) __riscv_vluxseg2ei64_v_i32mf2(__VA_ARGS__)
+#define vluxseg3ei64_v_i32mf2(...) __riscv_vluxseg3ei64_v_i32mf2(__VA_ARGS__)
+#define vluxseg4ei64_v_i32mf2(...) __riscv_vluxseg4ei64_v_i32mf2(__VA_ARGS__)
+#define vluxseg5ei64_v_i32mf2(...) __riscv_vluxseg5ei64_v_i32mf2(__VA_ARGS__)
+#define vluxseg6ei64_v_i32mf2(...) __riscv_vluxseg6ei64_v_i32mf2(__VA_ARGS__)
+#define vluxseg7ei64_v_i32mf2(...) __riscv_vluxseg7ei64_v_i32mf2(__VA_ARGS__)
+#define vluxseg8ei64_v_i32mf2(...) __riscv_vluxseg8ei64_v_i32mf2(__VA_ARGS__)
+#define vluxseg2ei64_v_i32m1(...) __riscv_vluxseg2ei64_v_i32m1(__VA_ARGS__)
+#define vluxseg3ei64_v_i32m1(...) __riscv_vluxseg3ei64_v_i32m1(__VA_ARGS__)
+#define vluxseg4ei64_v_i32m1(...) __riscv_vluxseg4ei64_v_i32m1(__VA_ARGS__)
+#define vluxseg5ei64_v_i32m1(...) __riscv_vluxseg5ei64_v_i32m1(__VA_ARGS__)
+#define vluxseg6ei64_v_i32m1(...) __riscv_vluxseg6ei64_v_i32m1(__VA_ARGS__)
+#define vluxseg7ei64_v_i32m1(...) __riscv_vluxseg7ei64_v_i32m1(__VA_ARGS__)
+#define vluxseg8ei64_v_i32m1(...) __riscv_vluxseg8ei64_v_i32m1(__VA_ARGS__)
+#define vluxseg2ei64_v_i32m2(...) __riscv_vluxseg2ei64_v_i32m2(__VA_ARGS__)
+#define vluxseg3ei64_v_i32m2(...) __riscv_vluxseg3ei64_v_i32m2(__VA_ARGS__)
+#define vluxseg4ei64_v_i32m2(...) __riscv_vluxseg4ei64_v_i32m2(__VA_ARGS__)
+#define vluxseg2ei64_v_i32m4(...) __riscv_vluxseg2ei64_v_i32m4(__VA_ARGS__)
+#define vluxseg2ei8_v_i64m1(...) __riscv_vluxseg2ei8_v_i64m1(__VA_ARGS__)
+#define vluxseg3ei8_v_i64m1(...) __riscv_vluxseg3ei8_v_i64m1(__VA_ARGS__)
+#define vluxseg4ei8_v_i64m1(...) __riscv_vluxseg4ei8_v_i64m1(__VA_ARGS__)
+#define vluxseg5ei8_v_i64m1(...) __riscv_vluxseg5ei8_v_i64m1(__VA_ARGS__)
+#define vluxseg6ei8_v_i64m1(...) __riscv_vluxseg6ei8_v_i64m1(__VA_ARGS__)
+#define vluxseg7ei8_v_i64m1(...) __riscv_vluxseg7ei8_v_i64m1(__VA_ARGS__)
+#define vluxseg8ei8_v_i64m1(...) __riscv_vluxseg8ei8_v_i64m1(__VA_ARGS__)
+#define vluxseg2ei8_v_i64m2(...) __riscv_vluxseg2ei8_v_i64m2(__VA_ARGS__)
+#define vluxseg3ei8_v_i64m2(...) __riscv_vluxseg3ei8_v_i64m2(__VA_ARGS__)
+#define vluxseg4ei8_v_i64m2(...) __riscv_vluxseg4ei8_v_i64m2(__VA_ARGS__)
+#define vluxseg2ei8_v_i64m4(...) __riscv_vluxseg2ei8_v_i64m4(__VA_ARGS__)
+#define vluxseg2ei16_v_i64m1(...) __riscv_vluxseg2ei16_v_i64m1(__VA_ARGS__)
+#define vluxseg3ei16_v_i64m1(...) __riscv_vluxseg3ei16_v_i64m1(__VA_ARGS__)
+#define vluxseg4ei16_v_i64m1(...) __riscv_vluxseg4ei16_v_i64m1(__VA_ARGS__)
+#define vluxseg5ei16_v_i64m1(...) __riscv_vluxseg5ei16_v_i64m1(__VA_ARGS__)
+#define vluxseg6ei16_v_i64m1(...) __riscv_vluxseg6ei16_v_i64m1(__VA_ARGS__)
+#define vluxseg7ei16_v_i64m1(...) __riscv_vluxseg7ei16_v_i64m1(__VA_ARGS__)
+#define vluxseg8ei16_v_i64m1(...) __riscv_vluxseg8ei16_v_i64m1(__VA_ARGS__)
+#define vluxseg2ei16_v_i64m2(...) __riscv_vluxseg2ei16_v_i64m2(__VA_ARGS__)
+#define vluxseg3ei16_v_i64m2(...) __riscv_vluxseg3ei16_v_i64m2(__VA_ARGS__)
+#define vluxseg4ei16_v_i64m2(...) __riscv_vluxseg4ei16_v_i64m2(__VA_ARGS__)
+#define vluxseg2ei16_v_i64m4(...) __riscv_vluxseg2ei16_v_i64m4(__VA_ARGS__)
+#define vluxseg2ei32_v_i64m1(...) __riscv_vluxseg2ei32_v_i64m1(__VA_ARGS__)
+#define vluxseg3ei32_v_i64m1(...) __riscv_vluxseg3ei32_v_i64m1(__VA_ARGS__)
+#define vluxseg4ei32_v_i64m1(...) __riscv_vluxseg4ei32_v_i64m1(__VA_ARGS__)
+#define vluxseg5ei32_v_i64m1(...) __riscv_vluxseg5ei32_v_i64m1(__VA_ARGS__)
+#define vluxseg6ei32_v_i64m1(...) __riscv_vluxseg6ei32_v_i64m1(__VA_ARGS__)
+#define vluxseg7ei32_v_i64m1(...) __riscv_vluxseg7ei32_v_i64m1(__VA_ARGS__)
+#define vluxseg8ei32_v_i64m1(...) __riscv_vluxseg8ei32_v_i64m1(__VA_ARGS__)
+#define vluxseg2ei32_v_i64m2(...) __riscv_vluxseg2ei32_v_i64m2(__VA_ARGS__)
+#define vluxseg3ei32_v_i64m2(...) __riscv_vluxseg3ei32_v_i64m2(__VA_ARGS__)
+#define vluxseg4ei32_v_i64m2(...) __riscv_vluxseg4ei32_v_i64m2(__VA_ARGS__)
+#define vluxseg2ei32_v_i64m4(...) __riscv_vluxseg2ei32_v_i64m4(__VA_ARGS__)
+#define vluxseg2ei64_v_i64m1(...) __riscv_vluxseg2ei64_v_i64m1(__VA_ARGS__)
+#define vluxseg3ei64_v_i64m1(...) __riscv_vluxseg3ei64_v_i64m1(__VA_ARGS__)
+#define vluxseg4ei64_v_i64m1(...) __riscv_vluxseg4ei64_v_i64m1(__VA_ARGS__)
+#define vluxseg5ei64_v_i64m1(...) __riscv_vluxseg5ei64_v_i64m1(__VA_ARGS__)
+#define vluxseg6ei64_v_i64m1(...) __riscv_vluxseg6ei64_v_i64m1(__VA_ARGS__)
+#define vluxseg7ei64_v_i64m1(...) __riscv_vluxseg7ei64_v_i64m1(__VA_ARGS__)
+#define vluxseg8ei64_v_i64m1(...) __riscv_vluxseg8ei64_v_i64m1(__VA_ARGS__)
+#define vluxseg2ei64_v_i64m2(...) __riscv_vluxseg2ei64_v_i64m2(__VA_ARGS__)
+#define vluxseg3ei64_v_i64m2(...) __riscv_vluxseg3ei64_v_i64m2(__VA_ARGS__)
+#define vluxseg4ei64_v_i64m2(...) __riscv_vluxseg4ei64_v_i64m2(__VA_ARGS__)
+#define vluxseg2ei64_v_i64m4(...) __riscv_vluxseg2ei64_v_i64m4(__VA_ARGS__)
+#define vloxseg2ei8_v_u8mf8(...) __riscv_vloxseg2ei8_v_u8mf8(__VA_ARGS__)
+#define vloxseg3ei8_v_u8mf8(...) __riscv_vloxseg3ei8_v_u8mf8(__VA_ARGS__)
+#define vloxseg4ei8_v_u8mf8(...) __riscv_vloxseg4ei8_v_u8mf8(__VA_ARGS__)
+#define vloxseg5ei8_v_u8mf8(...) __riscv_vloxseg5ei8_v_u8mf8(__VA_ARGS__)
+#define vloxseg6ei8_v_u8mf8(...) __riscv_vloxseg6ei8_v_u8mf8(__VA_ARGS__)
+#define vloxseg7ei8_v_u8mf8(...) __riscv_vloxseg7ei8_v_u8mf8(__VA_ARGS__)
+#define vloxseg8ei8_v_u8mf8(...) __riscv_vloxseg8ei8_v_u8mf8(__VA_ARGS__)
+#define vloxseg2ei8_v_u8mf4(...) __riscv_vloxseg2ei8_v_u8mf4(__VA_ARGS__)
+#define vloxseg3ei8_v_u8mf4(...) __riscv_vloxseg3ei8_v_u8mf4(__VA_ARGS__)
+#define vloxseg4ei8_v_u8mf4(...) __riscv_vloxseg4ei8_v_u8mf4(__VA_ARGS__)
+#define vloxseg5ei8_v_u8mf4(...) __riscv_vloxseg5ei8_v_u8mf4(__VA_ARGS__)
+#define vloxseg6ei8_v_u8mf4(...) __riscv_vloxseg6ei8_v_u8mf4(__VA_ARGS__)
+#define vloxseg7ei8_v_u8mf4(...) __riscv_vloxseg7ei8_v_u8mf4(__VA_ARGS__)
+#define vloxseg8ei8_v_u8mf4(...) __riscv_vloxseg8ei8_v_u8mf4(__VA_ARGS__)
+#define vloxseg2ei8_v_u8mf2(...) __riscv_vloxseg2ei8_v_u8mf2(__VA_ARGS__)
+#define vloxseg3ei8_v_u8mf2(...) __riscv_vloxseg3ei8_v_u8mf2(__VA_ARGS__)
+#define vloxseg4ei8_v_u8mf2(...) __riscv_vloxseg4ei8_v_u8mf2(__VA_ARGS__)
+#define vloxseg5ei8_v_u8mf2(...) __riscv_vloxseg5ei8_v_u8mf2(__VA_ARGS__)
+#define vloxseg6ei8_v_u8mf2(...) __riscv_vloxseg6ei8_v_u8mf2(__VA_ARGS__)
+#define vloxseg7ei8_v_u8mf2(...) __riscv_vloxseg7ei8_v_u8mf2(__VA_ARGS__)
+#define vloxseg8ei8_v_u8mf2(...) __riscv_vloxseg8ei8_v_u8mf2(__VA_ARGS__)
+#define vloxseg2ei8_v_u8m1(...) __riscv_vloxseg2ei8_v_u8m1(__VA_ARGS__)
+#define vloxseg3ei8_v_u8m1(...) __riscv_vloxseg3ei8_v_u8m1(__VA_ARGS__)
+#define vloxseg4ei8_v_u8m1(...) __riscv_vloxseg4ei8_v_u8m1(__VA_ARGS__)
+#define vloxseg5ei8_v_u8m1(...) __riscv_vloxseg5ei8_v_u8m1(__VA_ARGS__)
+#define vloxseg6ei8_v_u8m1(...) __riscv_vloxseg6ei8_v_u8m1(__VA_ARGS__)
+#define vloxseg7ei8_v_u8m1(...) __riscv_vloxseg7ei8_v_u8m1(__VA_ARGS__)
+#define vloxseg8ei8_v_u8m1(...) __riscv_vloxseg8ei8_v_u8m1(__VA_ARGS__)
+#define vloxseg2ei8_v_u8m2(...) __riscv_vloxseg2ei8_v_u8m2(__VA_ARGS__)
+#define vloxseg3ei8_v_u8m2(...) __riscv_vloxseg3ei8_v_u8m2(__VA_ARGS__)
+#define vloxseg4ei8_v_u8m2(...) __riscv_vloxseg4ei8_v_u8m2(__VA_ARGS__)
+#define vloxseg2ei8_v_u8m4(...) __riscv_vloxseg2ei8_v_u8m4(__VA_ARGS__)
+#define vloxseg2ei16_v_u8mf8(...) __riscv_vloxseg2ei16_v_u8mf8(__VA_ARGS__)
+#define vloxseg3ei16_v_u8mf8(...) __riscv_vloxseg3ei16_v_u8mf8(__VA_ARGS__)
+#define vloxseg4ei16_v_u8mf8(...) __riscv_vloxseg4ei16_v_u8mf8(__VA_ARGS__)
+#define vloxseg5ei16_v_u8mf8(...) __riscv_vloxseg5ei16_v_u8mf8(__VA_ARGS__)
+#define vloxseg6ei16_v_u8mf8(...) __riscv_vloxseg6ei16_v_u8mf8(__VA_ARGS__)
+#define vloxseg7ei16_v_u8mf8(...) __riscv_vloxseg7ei16_v_u8mf8(__VA_ARGS__)
+#define vloxseg8ei16_v_u8mf8(...) __riscv_vloxseg8ei16_v_u8mf8(__VA_ARGS__)
+#define vloxseg2ei16_v_u8mf4(...) __riscv_vloxseg2ei16_v_u8mf4(__VA_ARGS__)
+#define vloxseg3ei16_v_u8mf4(...) __riscv_vloxseg3ei16_v_u8mf4(__VA_ARGS__)
+#define vloxseg4ei16_v_u8mf4(...) __riscv_vloxseg4ei16_v_u8mf4(__VA_ARGS__)
+#define vloxseg5ei16_v_u8mf4(...) __riscv_vloxseg5ei16_v_u8mf4(__VA_ARGS__)
+#define vloxseg6ei16_v_u8mf4(...) __riscv_vloxseg6ei16_v_u8mf4(__VA_ARGS__)
+#define vloxseg7ei16_v_u8mf4(...) __riscv_vloxseg7ei16_v_u8mf4(__VA_ARGS__)
+#define vloxseg8ei16_v_u8mf4(...) __riscv_vloxseg8ei16_v_u8mf4(__VA_ARGS__)
+#define vloxseg2ei16_v_u8mf2(...) __riscv_vloxseg2ei16_v_u8mf2(__VA_ARGS__)
+#define vloxseg3ei16_v_u8mf2(...) __riscv_vloxseg3ei16_v_u8mf2(__VA_ARGS__)
+#define vloxseg4ei16_v_u8mf2(...) __riscv_vloxseg4ei16_v_u8mf2(__VA_ARGS__)
+#define vloxseg5ei16_v_u8mf2(...) __riscv_vloxseg5ei16_v_u8mf2(__VA_ARGS__)
+#define vloxseg6ei16_v_u8mf2(...) __riscv_vloxseg6ei16_v_u8mf2(__VA_ARGS__)
+#define vloxseg7ei16_v_u8mf2(...) __riscv_vloxseg7ei16_v_u8mf2(__VA_ARGS__)
+#define vloxseg8ei16_v_u8mf2(...) __riscv_vloxseg8ei16_v_u8mf2(__VA_ARGS__)
+#define vloxseg2ei16_v_u8m1(...) __riscv_vloxseg2ei16_v_u8m1(__VA_ARGS__)
+#define vloxseg3ei16_v_u8m1(...) __riscv_vloxseg3ei16_v_u8m1(__VA_ARGS__)
+#define vloxseg4ei16_v_u8m1(...) __riscv_vloxseg4ei16_v_u8m1(__VA_ARGS__)
+#define vloxseg5ei16_v_u8m1(...) __riscv_vloxseg5ei16_v_u8m1(__VA_ARGS__)
+#define vloxseg6ei16_v_u8m1(...) __riscv_vloxseg6ei16_v_u8m1(__VA_ARGS__)
+#define vloxseg7ei16_v_u8m1(...) __riscv_vloxseg7ei16_v_u8m1(__VA_ARGS__)
+#define vloxseg8ei16_v_u8m1(...) __riscv_vloxseg8ei16_v_u8m1(__VA_ARGS__)
+#define vloxseg2ei16_v_u8m2(...) __riscv_vloxseg2ei16_v_u8m2(__VA_ARGS__)
+#define vloxseg3ei16_v_u8m2(...) __riscv_vloxseg3ei16_v_u8m2(__VA_ARGS__)
+#define vloxseg4ei16_v_u8m2(...) __riscv_vloxseg4ei16_v_u8m2(__VA_ARGS__)
+#define vloxseg2ei16_v_u8m4(...) __riscv_vloxseg2ei16_v_u8m4(__VA_ARGS__)
+#define vloxseg2ei32_v_u8mf8(...) __riscv_vloxseg2ei32_v_u8mf8(__VA_ARGS__)
+#define vloxseg3ei32_v_u8mf8(...) __riscv_vloxseg3ei32_v_u8mf8(__VA_ARGS__)
+#define vloxseg4ei32_v_u8mf8(...) __riscv_vloxseg4ei32_v_u8mf8(__VA_ARGS__)
+#define vloxseg5ei32_v_u8mf8(...) __riscv_vloxseg5ei32_v_u8mf8(__VA_ARGS__)
+#define vloxseg6ei32_v_u8mf8(...) __riscv_vloxseg6ei32_v_u8mf8(__VA_ARGS__)
+#define vloxseg7ei32_v_u8mf8(...) __riscv_vloxseg7ei32_v_u8mf8(__VA_ARGS__)
+#define vloxseg8ei32_v_u8mf8(...) __riscv_vloxseg8ei32_v_u8mf8(__VA_ARGS__)
+#define vloxseg2ei32_v_u8mf4(...) __riscv_vloxseg2ei32_v_u8mf4(__VA_ARGS__)
+#define vloxseg3ei32_v_u8mf4(...) __riscv_vloxseg3ei32_v_u8mf4(__VA_ARGS__)
+#define vloxseg4ei32_v_u8mf4(...) __riscv_vloxseg4ei32_v_u8mf4(__VA_ARGS__)
+#define vloxseg5ei32_v_u8mf4(...) __riscv_vloxseg5ei32_v_u8mf4(__VA_ARGS__)
+#define vloxseg6ei32_v_u8mf4(...) __riscv_vloxseg6ei32_v_u8mf4(__VA_ARGS__)
+#define vloxseg7ei32_v_u8mf4(...) __riscv_vloxseg7ei32_v_u8mf4(__VA_ARGS__)
+#define vloxseg8ei32_v_u8mf4(...) __riscv_vloxseg8ei32_v_u8mf4(__VA_ARGS__)
+#define vloxseg2ei32_v_u8mf2(...) __riscv_vloxseg2ei32_v_u8mf2(__VA_ARGS__)
+#define vloxseg3ei32_v_u8mf2(...) __riscv_vloxseg3ei32_v_u8mf2(__VA_ARGS__)
+#define vloxseg4ei32_v_u8mf2(...) __riscv_vloxseg4ei32_v_u8mf2(__VA_ARGS__)
+#define vloxseg5ei32_v_u8mf2(...) __riscv_vloxseg5ei32_v_u8mf2(__VA_ARGS__)
+#define vloxseg6ei32_v_u8mf2(...) __riscv_vloxseg6ei32_v_u8mf2(__VA_ARGS__)
+#define vloxseg7ei32_v_u8mf2(...) __riscv_vloxseg7ei32_v_u8mf2(__VA_ARGS__)
+#define vloxseg8ei32_v_u8mf2(...) __riscv_vloxseg8ei32_v_u8mf2(__VA_ARGS__)
+#define vloxseg2ei32_v_u8m1(...) __riscv_vloxseg2ei32_v_u8m1(__VA_ARGS__)
+#define vloxseg3ei32_v_u8m1(...) __riscv_vloxseg3ei32_v_u8m1(__VA_ARGS__)
+#define vloxseg4ei32_v_u8m1(...) __riscv_vloxseg4ei32_v_u8m1(__VA_ARGS__)
+#define vloxseg5ei32_v_u8m1(...) __riscv_vloxseg5ei32_v_u8m1(__VA_ARGS__)
+#define vloxseg6ei32_v_u8m1(...) __riscv_vloxseg6ei32_v_u8m1(__VA_ARGS__)
+#define vloxseg7ei32_v_u8m1(...) __riscv_vloxseg7ei32_v_u8m1(__VA_ARGS__)
+#define vloxseg8ei32_v_u8m1(...) __riscv_vloxseg8ei32_v_u8m1(__VA_ARGS__)
+#define vloxseg2ei32_v_u8m2(...) __riscv_vloxseg2ei32_v_u8m2(__VA_ARGS__)
+#define vloxseg3ei32_v_u8m2(...) __riscv_vloxseg3ei32_v_u8m2(__VA_ARGS__)
+#define vloxseg4ei32_v_u8m2(...) __riscv_vloxseg4ei32_v_u8m2(__VA_ARGS__)
+#define vloxseg2ei64_v_u8mf8(...) __riscv_vloxseg2ei64_v_u8mf8(__VA_ARGS__)
+#define vloxseg3ei64_v_u8mf8(...) __riscv_vloxseg3ei64_v_u8mf8(__VA_ARGS__)
+#define vloxseg4ei64_v_u8mf8(...) __riscv_vloxseg4ei64_v_u8mf8(__VA_ARGS__)
+#define vloxseg5ei64_v_u8mf8(...) __riscv_vloxseg5ei64_v_u8mf8(__VA_ARGS__)
+#define vloxseg6ei64_v_u8mf8(...) __riscv_vloxseg6ei64_v_u8mf8(__VA_ARGS__)
+#define vloxseg7ei64_v_u8mf8(...) __riscv_vloxseg7ei64_v_u8mf8(__VA_ARGS__)
+#define vloxseg8ei64_v_u8mf8(...) __riscv_vloxseg8ei64_v_u8mf8(__VA_ARGS__)
+#define vloxseg2ei64_v_u8mf4(...) __riscv_vloxseg2ei64_v_u8mf4(__VA_ARGS__)
+#define vloxseg3ei64_v_u8mf4(...) __riscv_vloxseg3ei64_v_u8mf4(__VA_ARGS__)
+#define vloxseg4ei64_v_u8mf4(...) __riscv_vloxseg4ei64_v_u8mf4(__VA_ARGS__)
+#define vloxseg5ei64_v_u8mf4(...) __riscv_vloxseg5ei64_v_u8mf4(__VA_ARGS__)
+#define vloxseg6ei64_v_u8mf4(...) __riscv_vloxseg6ei64_v_u8mf4(__VA_ARGS__)
+#define vloxseg7ei64_v_u8mf4(...) __riscv_vloxseg7ei64_v_u8mf4(__VA_ARGS__)
+#define vloxseg8ei64_v_u8mf4(...) __riscv_vloxseg8ei64_v_u8mf4(__VA_ARGS__)
+#define vloxseg2ei64_v_u8mf2(...) __riscv_vloxseg2ei64_v_u8mf2(__VA_ARGS__)
+#define vloxseg3ei64_v_u8mf2(...) __riscv_vloxseg3ei64_v_u8mf2(__VA_ARGS__)
+#define vloxseg4ei64_v_u8mf2(...) __riscv_vloxseg4ei64_v_u8mf2(__VA_ARGS__)
+#define vloxseg5ei64_v_u8mf2(...) __riscv_vloxseg5ei64_v_u8mf2(__VA_ARGS__)
+#define vloxseg6ei64_v_u8mf2(...) __riscv_vloxseg6ei64_v_u8mf2(__VA_ARGS__)
+#define vloxseg7ei64_v_u8mf2(...) __riscv_vloxseg7ei64_v_u8mf2(__VA_ARGS__)
+#define vloxseg8ei64_v_u8mf2(...) __riscv_vloxseg8ei64_v_u8mf2(__VA_ARGS__)
+#define vloxseg2ei64_v_u8m1(...) __riscv_vloxseg2ei64_v_u8m1(__VA_ARGS__)
+#define vloxseg3ei64_v_u8m1(...) __riscv_vloxseg3ei64_v_u8m1(__VA_ARGS__)
+#define vloxseg4ei64_v_u8m1(...) __riscv_vloxseg4ei64_v_u8m1(__VA_ARGS__)
+#define vloxseg5ei64_v_u8m1(...) __riscv_vloxseg5ei64_v_u8m1(__VA_ARGS__)
+#define vloxseg6ei64_v_u8m1(...) __riscv_vloxseg6ei64_v_u8m1(__VA_ARGS__)
+#define vloxseg7ei64_v_u8m1(...) __riscv_vloxseg7ei64_v_u8m1(__VA_ARGS__)
+#define vloxseg8ei64_v_u8m1(...) __riscv_vloxseg8ei64_v_u8m1(__VA_ARGS__)
+#define vloxseg2ei8_v_u16mf4(...) __riscv_vloxseg2ei8_v_u16mf4(__VA_ARGS__)
+#define vloxseg3ei8_v_u16mf4(...) __riscv_vloxseg3ei8_v_u16mf4(__VA_ARGS__)
+#define vloxseg4ei8_v_u16mf4(...) __riscv_vloxseg4ei8_v_u16mf4(__VA_ARGS__)
+#define vloxseg5ei8_v_u16mf4(...) __riscv_vloxseg5ei8_v_u16mf4(__VA_ARGS__)
+#define vloxseg6ei8_v_u16mf4(...) __riscv_vloxseg6ei8_v_u16mf4(__VA_ARGS__)
+#define vloxseg7ei8_v_u16mf4(...) __riscv_vloxseg7ei8_v_u16mf4(__VA_ARGS__)
+#define vloxseg8ei8_v_u16mf4(...) __riscv_vloxseg8ei8_v_u16mf4(__VA_ARGS__)
+#define vloxseg2ei8_v_u16mf2(...) __riscv_vloxseg2ei8_v_u16mf2(__VA_ARGS__)
+#define vloxseg3ei8_v_u16mf2(...) __riscv_vloxseg3ei8_v_u16mf2(__VA_ARGS__)
+#define vloxseg4ei8_v_u16mf2(...) __riscv_vloxseg4ei8_v_u16mf2(__VA_ARGS__)
+#define vloxseg5ei8_v_u16mf2(...) __riscv_vloxseg5ei8_v_u16mf2(__VA_ARGS__)
+#define vloxseg6ei8_v_u16mf2(...) __riscv_vloxseg6ei8_v_u16mf2(__VA_ARGS__)
+#define vloxseg7ei8_v_u16mf2(...) __riscv_vloxseg7ei8_v_u16mf2(__VA_ARGS__)
+#define vloxseg8ei8_v_u16mf2(...) __riscv_vloxseg8ei8_v_u16mf2(__VA_ARGS__)
+#define vloxseg2ei8_v_u16m1(...) __riscv_vloxseg2ei8_v_u16m1(__VA_ARGS__)
+#define vloxseg3ei8_v_u16m1(...) __riscv_vloxseg3ei8_v_u16m1(__VA_ARGS__)
+#define vloxseg4ei8_v_u16m1(...) __riscv_vloxseg4ei8_v_u16m1(__VA_ARGS__)
+#define vloxseg5ei8_v_u16m1(...) __riscv_vloxseg5ei8_v_u16m1(__VA_ARGS__)
+#define vloxseg6ei8_v_u16m1(...) __riscv_vloxseg6ei8_v_u16m1(__VA_ARGS__)
+#define vloxseg7ei8_v_u16m1(...) __riscv_vloxseg7ei8_v_u16m1(__VA_ARGS__)
+#define vloxseg8ei8_v_u16m1(...) __riscv_vloxseg8ei8_v_u16m1(__VA_ARGS__)
+#define vloxseg2ei8_v_u16m2(...) __riscv_vloxseg2ei8_v_u16m2(__VA_ARGS__)
+#define vloxseg3ei8_v_u16m2(...) __riscv_vloxseg3ei8_v_u16m2(__VA_ARGS__)
+#define vloxseg4ei8_v_u16m2(...) __riscv_vloxseg4ei8_v_u16m2(__VA_ARGS__)
+#define vloxseg2ei8_v_u16m4(...) __riscv_vloxseg2ei8_v_u16m4(__VA_ARGS__)
+#define vloxseg2ei16_v_u16mf4(...) __riscv_vloxseg2ei16_v_u16mf4(__VA_ARGS__)
+#define vloxseg3ei16_v_u16mf4(...) __riscv_vloxseg3ei16_v_u16mf4(__VA_ARGS__)
+#define vloxseg4ei16_v_u16mf4(...) __riscv_vloxseg4ei16_v_u16mf4(__VA_ARGS__)
+#define vloxseg5ei16_v_u16mf4(...) __riscv_vloxseg5ei16_v_u16mf4(__VA_ARGS__)
+#define vloxseg6ei16_v_u16mf4(...) __riscv_vloxseg6ei16_v_u16mf4(__VA_ARGS__)
+#define vloxseg7ei16_v_u16mf4(...) __riscv_vloxseg7ei16_v_u16mf4(__VA_ARGS__)
+#define vloxseg8ei16_v_u16mf4(...) __riscv_vloxseg8ei16_v_u16mf4(__VA_ARGS__)
+#define vloxseg2ei16_v_u16mf2(...) __riscv_vloxseg2ei16_v_u16mf2(__VA_ARGS__)
+#define vloxseg3ei16_v_u16mf2(...) __riscv_vloxseg3ei16_v_u16mf2(__VA_ARGS__)
+#define vloxseg4ei16_v_u16mf2(...) __riscv_vloxseg4ei16_v_u16mf2(__VA_ARGS__)
+#define vloxseg5ei16_v_u16mf2(...) __riscv_vloxseg5ei16_v_u16mf2(__VA_ARGS__)
+#define vloxseg6ei16_v_u16mf2(...) __riscv_vloxseg6ei16_v_u16mf2(__VA_ARGS__)
+#define vloxseg7ei16_v_u16mf2(...) __riscv_vloxseg7ei16_v_u16mf2(__VA_ARGS__)
+#define vloxseg8ei16_v_u16mf2(...) __riscv_vloxseg8ei16_v_u16mf2(__VA_ARGS__)
+#define vloxseg2ei16_v_u16m1(...) __riscv_vloxseg2ei16_v_u16m1(__VA_ARGS__)
+#define vloxseg3ei16_v_u16m1(...) __riscv_vloxseg3ei16_v_u16m1(__VA_ARGS__)
+#define vloxseg4ei16_v_u16m1(...) __riscv_vloxseg4ei16_v_u16m1(__VA_ARGS__)
+#define vloxseg5ei16_v_u16m1(...) __riscv_vloxseg5ei16_v_u16m1(__VA_ARGS__)
+#define vloxseg6ei16_v_u16m1(...) __riscv_vloxseg6ei16_v_u16m1(__VA_ARGS__)
+#define vloxseg7ei16_v_u16m1(...) __riscv_vloxseg7ei16_v_u16m1(__VA_ARGS__)
+#define vloxseg8ei16_v_u16m1(...) __riscv_vloxseg8ei16_v_u16m1(__VA_ARGS__)
+#define vloxseg2ei16_v_u16m2(...) __riscv_vloxseg2ei16_v_u16m2(__VA_ARGS__)
+#define vloxseg3ei16_v_u16m2(...) __riscv_vloxseg3ei16_v_u16m2(__VA_ARGS__)
+#define vloxseg4ei16_v_u16m2(...) __riscv_vloxseg4ei16_v_u16m2(__VA_ARGS__)
+#define vloxseg2ei16_v_u16m4(...) __riscv_vloxseg2ei16_v_u16m4(__VA_ARGS__)
+#define vloxseg2ei32_v_u16mf4(...) __riscv_vloxseg2ei32_v_u16mf4(__VA_ARGS__)
+#define vloxseg3ei32_v_u16mf4(...) __riscv_vloxseg3ei32_v_u16mf4(__VA_ARGS__)
+#define vloxseg4ei32_v_u16mf4(...) __riscv_vloxseg4ei32_v_u16mf4(__VA_ARGS__)
+#define vloxseg5ei32_v_u16mf4(...) __riscv_vloxseg5ei32_v_u16mf4(__VA_ARGS__)
+#define vloxseg6ei32_v_u16mf4(...) __riscv_vloxseg6ei32_v_u16mf4(__VA_ARGS__)
+#define vloxseg7ei32_v_u16mf4(...) __riscv_vloxseg7ei32_v_u16mf4(__VA_ARGS__)
+#define vloxseg8ei32_v_u16mf4(...) __riscv_vloxseg8ei32_v_u16mf4(__VA_ARGS__)
+#define vloxseg2ei32_v_u16mf2(...) __riscv_vloxseg2ei32_v_u16mf2(__VA_ARGS__)
+#define vloxseg3ei32_v_u16mf2(...) __riscv_vloxseg3ei32_v_u16mf2(__VA_ARGS__)
+#define vloxseg4ei32_v_u16mf2(...) __riscv_vloxseg4ei32_v_u16mf2(__VA_ARGS__)
+#define vloxseg5ei32_v_u16mf2(...) __riscv_vloxseg5ei32_v_u16mf2(__VA_ARGS__)
+#define vloxseg6ei32_v_u16mf2(...) __riscv_vloxseg6ei32_v_u16mf2(__VA_ARGS__)
+#define vloxseg7ei32_v_u16mf2(...) __riscv_vloxseg7ei32_v_u16mf2(__VA_ARGS__)
+#define vloxseg8ei32_v_u16mf2(...) __riscv_vloxseg8ei32_v_u16mf2(__VA_ARGS__)
+#define vloxseg2ei32_v_u16m1(...) __riscv_vloxseg2ei32_v_u16m1(__VA_ARGS__)
+#define vloxseg3ei32_v_u16m1(...) __riscv_vloxseg3ei32_v_u16m1(__VA_ARGS__)
+#define vloxseg4ei32_v_u16m1(...) __riscv_vloxseg4ei32_v_u16m1(__VA_ARGS__)
+#define vloxseg5ei32_v_u16m1(...) __riscv_vloxseg5ei32_v_u16m1(__VA_ARGS__)
+#define vloxseg6ei32_v_u16m1(...) __riscv_vloxseg6ei32_v_u16m1(__VA_ARGS__)
+#define vloxseg7ei32_v_u16m1(...) __riscv_vloxseg7ei32_v_u16m1(__VA_ARGS__)
+#define vloxseg8ei32_v_u16m1(...) __riscv_vloxseg8ei32_v_u16m1(__VA_ARGS__)
+#define vloxseg2ei32_v_u16m2(...) __riscv_vloxseg2ei32_v_u16m2(__VA_ARGS__)
+#define vloxseg3ei32_v_u16m2(...) __riscv_vloxseg3ei32_v_u16m2(__VA_ARGS__)
+#define vloxseg4ei32_v_u16m2(...) __riscv_vloxseg4ei32_v_u16m2(__VA_ARGS__)
+#define vloxseg2ei32_v_u16m4(...) __riscv_vloxseg2ei32_v_u16m4(__VA_ARGS__)
+#define vloxseg2ei64_v_u16mf4(...) __riscv_vloxseg2ei64_v_u16mf4(__VA_ARGS__)
+#define vloxseg3ei64_v_u16mf4(...) __riscv_vloxseg3ei64_v_u16mf4(__VA_ARGS__)
+#define vloxseg4ei64_v_u16mf4(...) __riscv_vloxseg4ei64_v_u16mf4(__VA_ARGS__)
+#define vloxseg5ei64_v_u16mf4(...) __riscv_vloxseg5ei64_v_u16mf4(__VA_ARGS__)
+#define vloxseg6ei64_v_u16mf4(...) __riscv_vloxseg6ei64_v_u16mf4(__VA_ARGS__)
+#define vloxseg7ei64_v_u16mf4(...) __riscv_vloxseg7ei64_v_u16mf4(__VA_ARGS__)
+#define vloxseg8ei64_v_u16mf4(...) __riscv_vloxseg8ei64_v_u16mf4(__VA_ARGS__)
+#define vloxseg2ei64_v_u16mf2(...) __riscv_vloxseg2ei64_v_u16mf2(__VA_ARGS__)
+#define vloxseg3ei64_v_u16mf2(...) __riscv_vloxseg3ei64_v_u16mf2(__VA_ARGS__)
+#define vloxseg4ei64_v_u16mf2(...) __riscv_vloxseg4ei64_v_u16mf2(__VA_ARGS__)
+#define vloxseg5ei64_v_u16mf2(...) __riscv_vloxseg5ei64_v_u16mf2(__VA_ARGS__)
+#define vloxseg6ei64_v_u16mf2(...) __riscv_vloxseg6ei64_v_u16mf2(__VA_ARGS__)
+#define vloxseg7ei64_v_u16mf2(...) __riscv_vloxseg7ei64_v_u16mf2(__VA_ARGS__)
+#define vloxseg8ei64_v_u16mf2(...) __riscv_vloxseg8ei64_v_u16mf2(__VA_ARGS__)
+#define vloxseg2ei64_v_u16m1(...) __riscv_vloxseg2ei64_v_u16m1(__VA_ARGS__)
+#define vloxseg3ei64_v_u16m1(...) __riscv_vloxseg3ei64_v_u16m1(__VA_ARGS__)
+#define vloxseg4ei64_v_u16m1(...) __riscv_vloxseg4ei64_v_u16m1(__VA_ARGS__)
+#define vloxseg5ei64_v_u16m1(...) __riscv_vloxseg5ei64_v_u16m1(__VA_ARGS__)
+#define vloxseg6ei64_v_u16m1(...) __riscv_vloxseg6ei64_v_u16m1(__VA_ARGS__)
+#define vloxseg7ei64_v_u16m1(...) __riscv_vloxseg7ei64_v_u16m1(__VA_ARGS__)
+#define vloxseg8ei64_v_u16m1(...) __riscv_vloxseg8ei64_v_u16m1(__VA_ARGS__)
+#define vloxseg2ei64_v_u16m2(...) __riscv_vloxseg2ei64_v_u16m2(__VA_ARGS__)
+#define vloxseg3ei64_v_u16m2(...) __riscv_vloxseg3ei64_v_u16m2(__VA_ARGS__)
+#define vloxseg4ei64_v_u16m2(...) __riscv_vloxseg4ei64_v_u16m2(__VA_ARGS__)
+#define vloxseg2ei8_v_u32mf2(...) __riscv_vloxseg2ei8_v_u32mf2(__VA_ARGS__)
+#define vloxseg3ei8_v_u32mf2(...) __riscv_vloxseg3ei8_v_u32mf2(__VA_ARGS__)
+#define vloxseg4ei8_v_u32mf2(...) __riscv_vloxseg4ei8_v_u32mf2(__VA_ARGS__)
+#define vloxseg5ei8_v_u32mf2(...) __riscv_vloxseg5ei8_v_u32mf2(__VA_ARGS__)
+#define vloxseg6ei8_v_u32mf2(...) __riscv_vloxseg6ei8_v_u32mf2(__VA_ARGS__)
+#define vloxseg7ei8_v_u32mf2(...) __riscv_vloxseg7ei8_v_u32mf2(__VA_ARGS__)
+#define vloxseg8ei8_v_u32mf2(...) __riscv_vloxseg8ei8_v_u32mf2(__VA_ARGS__)
+#define vloxseg2ei8_v_u32m1(...) __riscv_vloxseg2ei8_v_u32m1(__VA_ARGS__)
+#define vloxseg3ei8_v_u32m1(...) __riscv_vloxseg3ei8_v_u32m1(__VA_ARGS__)
+#define vloxseg4ei8_v_u32m1(...) __riscv_vloxseg4ei8_v_u32m1(__VA_ARGS__)
+#define vloxseg5ei8_v_u32m1(...) __riscv_vloxseg5ei8_v_u32m1(__VA_ARGS__)
+#define vloxseg6ei8_v_u32m1(...) __riscv_vloxseg6ei8_v_u32m1(__VA_ARGS__)
+#define vloxseg7ei8_v_u32m1(...) __riscv_vloxseg7ei8_v_u32m1(__VA_ARGS__)
+#define vloxseg8ei8_v_u32m1(...) __riscv_vloxseg8ei8_v_u32m1(__VA_ARGS__)
+#define vloxseg2ei8_v_u32m2(...) __riscv_vloxseg2ei8_v_u32m2(__VA_ARGS__)
+#define vloxseg3ei8_v_u32m2(...) __riscv_vloxseg3ei8_v_u32m2(__VA_ARGS__)
+#define vloxseg4ei8_v_u32m2(...) __riscv_vloxseg4ei8_v_u32m2(__VA_ARGS__)
+#define vloxseg2ei8_v_u32m4(...) __riscv_vloxseg2ei8_v_u32m4(__VA_ARGS__)
+#define vloxseg2ei16_v_u32mf2(...) __riscv_vloxseg2ei16_v_u32mf2(__VA_ARGS__)
+#define vloxseg3ei16_v_u32mf2(...) __riscv_vloxseg3ei16_v_u32mf2(__VA_ARGS__)
+#define vloxseg4ei16_v_u32mf2(...) __riscv_vloxseg4ei16_v_u32mf2(__VA_ARGS__)
+#define vloxseg5ei16_v_u32mf2(...) __riscv_vloxseg5ei16_v_u32mf2(__VA_ARGS__)
+#define vloxseg6ei16_v_u32mf2(...) __riscv_vloxseg6ei16_v_u32mf2(__VA_ARGS__)
+#define vloxseg7ei16_v_u32mf2(...) __riscv_vloxseg7ei16_v_u32mf2(__VA_ARGS__)
+#define vloxseg8ei16_v_u32mf2(...) __riscv_vloxseg8ei16_v_u32mf2(__VA_ARGS__)
+#define vloxseg2ei16_v_u32m1(...) __riscv_vloxseg2ei16_v_u32m1(__VA_ARGS__)
+#define vloxseg3ei16_v_u32m1(...) __riscv_vloxseg3ei16_v_u32m1(__VA_ARGS__)
+#define vloxseg4ei16_v_u32m1(...) __riscv_vloxseg4ei16_v_u32m1(__VA_ARGS__)
+#define vloxseg5ei16_v_u32m1(...) __riscv_vloxseg5ei16_v_u32m1(__VA_ARGS__)
+#define vloxseg6ei16_v_u32m1(...) __riscv_vloxseg6ei16_v_u32m1(__VA_ARGS__)
+#define vloxseg7ei16_v_u32m1(...) __riscv_vloxseg7ei16_v_u32m1(__VA_ARGS__)
+#define vloxseg8ei16_v_u32m1(...) __riscv_vloxseg8ei16_v_u32m1(__VA_ARGS__)
+#define vloxseg2ei16_v_u32m2(...) __riscv_vloxseg2ei16_v_u32m2(__VA_ARGS__)
+#define vloxseg3ei16_v_u32m2(...) __riscv_vloxseg3ei16_v_u32m2(__VA_ARGS__)
+#define vloxseg4ei16_v_u32m2(...) __riscv_vloxseg4ei16_v_u32m2(__VA_ARGS__)
+#define vloxseg2ei16_v_u32m4(...) __riscv_vloxseg2ei16_v_u32m4(__VA_ARGS__)
+#define vloxseg2ei32_v_u32mf2(...) __riscv_vloxseg2ei32_v_u32mf2(__VA_ARGS__)
+#define vloxseg3ei32_v_u32mf2(...) __riscv_vloxseg3ei32_v_u32mf2(__VA_ARGS__)
+#define vloxseg4ei32_v_u32mf2(...) __riscv_vloxseg4ei32_v_u32mf2(__VA_ARGS__)
+#define vloxseg5ei32_v_u32mf2(...) __riscv_vloxseg5ei32_v_u32mf2(__VA_ARGS__)
+#define vloxseg6ei32_v_u32mf2(...) __riscv_vloxseg6ei32_v_u32mf2(__VA_ARGS__)
+#define vloxseg7ei32_v_u32mf2(...) __riscv_vloxseg7ei32_v_u32mf2(__VA_ARGS__)
+#define vloxseg8ei32_v_u32mf2(...) __riscv_vloxseg8ei32_v_u32mf2(__VA_ARGS__)
+#define vloxseg2ei32_v_u32m1(...) __riscv_vloxseg2ei32_v_u32m1(__VA_ARGS__)
+#define vloxseg3ei32_v_u32m1(...) __riscv_vloxseg3ei32_v_u32m1(__VA_ARGS__)
+#define vloxseg4ei32_v_u32m1(...) __riscv_vloxseg4ei32_v_u32m1(__VA_ARGS__)
+#define vloxseg5ei32_v_u32m1(...) __riscv_vloxseg5ei32_v_u32m1(__VA_ARGS__)
+#define vloxseg6ei32_v_u32m1(...) __riscv_vloxseg6ei32_v_u32m1(__VA_ARGS__)
+#define vloxseg7ei32_v_u32m1(...) __riscv_vloxseg7ei32_v_u32m1(__VA_ARGS__)
+#define vloxseg8ei32_v_u32m1(...) __riscv_vloxseg8ei32_v_u32m1(__VA_ARGS__)
+#define vloxseg2ei32_v_u32m2(...) __riscv_vloxseg2ei32_v_u32m2(__VA_ARGS__)
+#define vloxseg3ei32_v_u32m2(...) __riscv_vloxseg3ei32_v_u32m2(__VA_ARGS__)
+#define vloxseg4ei32_v_u32m2(...) __riscv_vloxseg4ei32_v_u32m2(__VA_ARGS__)
+#define vloxseg2ei32_v_u32m4(...) __riscv_vloxseg2ei32_v_u32m4(__VA_ARGS__)
+#define vloxseg2ei64_v_u32mf2(...) __riscv_vloxseg2ei64_v_u32mf2(__VA_ARGS__)
+#define vloxseg3ei64_v_u32mf2(...) __riscv_vloxseg3ei64_v_u32mf2(__VA_ARGS__)
+#define vloxseg4ei64_v_u32mf2(...) __riscv_vloxseg4ei64_v_u32mf2(__VA_ARGS__)
+#define vloxseg5ei64_v_u32mf2(...) __riscv_vloxseg5ei64_v_u32mf2(__VA_ARGS__)
+#define vloxseg6ei64_v_u32mf2(...) __riscv_vloxseg6ei64_v_u32mf2(__VA_ARGS__)
+#define vloxseg7ei64_v_u32mf2(...) __riscv_vloxseg7ei64_v_u32mf2(__VA_ARGS__)
+#define vloxseg8ei64_v_u32mf2(...) __riscv_vloxseg8ei64_v_u32mf2(__VA_ARGS__)
+#define vloxseg2ei64_v_u32m1(...) __riscv_vloxseg2ei64_v_u32m1(__VA_ARGS__)
+#define vloxseg3ei64_v_u32m1(...) __riscv_vloxseg3ei64_v_u32m1(__VA_ARGS__)
+#define vloxseg4ei64_v_u32m1(...) __riscv_vloxseg4ei64_v_u32m1(__VA_ARGS__)
+#define vloxseg5ei64_v_u32m1(...) __riscv_vloxseg5ei64_v_u32m1(__VA_ARGS__)
+#define vloxseg6ei64_v_u32m1(...) __riscv_vloxseg6ei64_v_u32m1(__VA_ARGS__)
+#define vloxseg7ei64_v_u32m1(...) __riscv_vloxseg7ei64_v_u32m1(__VA_ARGS__)
+#define vloxseg8ei64_v_u32m1(...) __riscv_vloxseg8ei64_v_u32m1(__VA_ARGS__)
+#define vloxseg2ei64_v_u32m2(...) __riscv_vloxseg2ei64_v_u32m2(__VA_ARGS__)
+#define vloxseg3ei64_v_u32m2(...) __riscv_vloxseg3ei64_v_u32m2(__VA_ARGS__)
+#define vloxseg4ei64_v_u32m2(...) __riscv_vloxseg4ei64_v_u32m2(__VA_ARGS__)
+#define vloxseg2ei64_v_u32m4(...) __riscv_vloxseg2ei64_v_u32m4(__VA_ARGS__)
+#define vloxseg2ei8_v_u64m1(...) __riscv_vloxseg2ei8_v_u64m1(__VA_ARGS__) // vloxseg<2..8>ei<8|16|32|64>_v_u64* aliases: each un-prefixed name expands to the __riscv_-prefixed intrinsic of the same name, forwarding all arguments via __VA_ARGS__
+#define vloxseg3ei8_v_u64m1(...) __riscv_vloxseg3ei8_v_u64m1(__VA_ARGS__)
+#define vloxseg4ei8_v_u64m1(...) __riscv_vloxseg4ei8_v_u64m1(__VA_ARGS__)
+#define vloxseg5ei8_v_u64m1(...) __riscv_vloxseg5ei8_v_u64m1(__VA_ARGS__)
+#define vloxseg6ei8_v_u64m1(...) __riscv_vloxseg6ei8_v_u64m1(__VA_ARGS__)
+#define vloxseg7ei8_v_u64m1(...) __riscv_vloxseg7ei8_v_u64m1(__VA_ARGS__)
+#define vloxseg8ei8_v_u64m1(...) __riscv_vloxseg8ei8_v_u64m1(__VA_ARGS__)
+#define vloxseg2ei8_v_u64m2(...) __riscv_vloxseg2ei8_v_u64m2(__VA_ARGS__)
+#define vloxseg3ei8_v_u64m2(...) __riscv_vloxseg3ei8_v_u64m2(__VA_ARGS__)
+#define vloxseg4ei8_v_u64m2(...) __riscv_vloxseg4ei8_v_u64m2(__VA_ARGS__)
+#define vloxseg2ei8_v_u64m4(...) __riscv_vloxseg2ei8_v_u64m4(__VA_ARGS__)
+#define vloxseg2ei16_v_u64m1(...) __riscv_vloxseg2ei16_v_u64m1(__VA_ARGS__)
+#define vloxseg3ei16_v_u64m1(...) __riscv_vloxseg3ei16_v_u64m1(__VA_ARGS__)
+#define vloxseg4ei16_v_u64m1(...) __riscv_vloxseg4ei16_v_u64m1(__VA_ARGS__)
+#define vloxseg5ei16_v_u64m1(...) __riscv_vloxseg5ei16_v_u64m1(__VA_ARGS__)
+#define vloxseg6ei16_v_u64m1(...) __riscv_vloxseg6ei16_v_u64m1(__VA_ARGS__)
+#define vloxseg7ei16_v_u64m1(...) __riscv_vloxseg7ei16_v_u64m1(__VA_ARGS__)
+#define vloxseg8ei16_v_u64m1(...) __riscv_vloxseg8ei16_v_u64m1(__VA_ARGS__)
+#define vloxseg2ei16_v_u64m2(...) __riscv_vloxseg2ei16_v_u64m2(__VA_ARGS__)
+#define vloxseg3ei16_v_u64m2(...) __riscv_vloxseg3ei16_v_u64m2(__VA_ARGS__)
+#define vloxseg4ei16_v_u64m2(...) __riscv_vloxseg4ei16_v_u64m2(__VA_ARGS__)
+#define vloxseg2ei16_v_u64m4(...) __riscv_vloxseg2ei16_v_u64m4(__VA_ARGS__)
+#define vloxseg2ei32_v_u64m1(...) __riscv_vloxseg2ei32_v_u64m1(__VA_ARGS__)
+#define vloxseg3ei32_v_u64m1(...) __riscv_vloxseg3ei32_v_u64m1(__VA_ARGS__)
+#define vloxseg4ei32_v_u64m1(...) __riscv_vloxseg4ei32_v_u64m1(__VA_ARGS__)
+#define vloxseg5ei32_v_u64m1(...) __riscv_vloxseg5ei32_v_u64m1(__VA_ARGS__)
+#define vloxseg6ei32_v_u64m1(...) __riscv_vloxseg6ei32_v_u64m1(__VA_ARGS__)
+#define vloxseg7ei32_v_u64m1(...) __riscv_vloxseg7ei32_v_u64m1(__VA_ARGS__)
+#define vloxseg8ei32_v_u64m1(...) __riscv_vloxseg8ei32_v_u64m1(__VA_ARGS__)
+#define vloxseg2ei32_v_u64m2(...) __riscv_vloxseg2ei32_v_u64m2(__VA_ARGS__)
+#define vloxseg3ei32_v_u64m2(...) __riscv_vloxseg3ei32_v_u64m2(__VA_ARGS__)
+#define vloxseg4ei32_v_u64m2(...) __riscv_vloxseg4ei32_v_u64m2(__VA_ARGS__)
+#define vloxseg2ei32_v_u64m4(...) __riscv_vloxseg2ei32_v_u64m4(__VA_ARGS__)
+#define vloxseg2ei64_v_u64m1(...) __riscv_vloxseg2ei64_v_u64m1(__VA_ARGS__)
+#define vloxseg3ei64_v_u64m1(...) __riscv_vloxseg3ei64_v_u64m1(__VA_ARGS__)
+#define vloxseg4ei64_v_u64m1(...) __riscv_vloxseg4ei64_v_u64m1(__VA_ARGS__)
+#define vloxseg5ei64_v_u64m1(...) __riscv_vloxseg5ei64_v_u64m1(__VA_ARGS__)
+#define vloxseg6ei64_v_u64m1(...) __riscv_vloxseg6ei64_v_u64m1(__VA_ARGS__)
+#define vloxseg7ei64_v_u64m1(...) __riscv_vloxseg7ei64_v_u64m1(__VA_ARGS__)
+#define vloxseg8ei64_v_u64m1(...) __riscv_vloxseg8ei64_v_u64m1(__VA_ARGS__)
+#define vloxseg2ei64_v_u64m2(...) __riscv_vloxseg2ei64_v_u64m2(__VA_ARGS__)
+#define vloxseg3ei64_v_u64m2(...) __riscv_vloxseg3ei64_v_u64m2(__VA_ARGS__)
+#define vloxseg4ei64_v_u64m2(...) __riscv_vloxseg4ei64_v_u64m2(__VA_ARGS__)
+#define vloxseg2ei64_v_u64m4(...) __riscv_vloxseg2ei64_v_u64m4(__VA_ARGS__)
+#define vluxseg2ei8_v_u8mf8(...) __riscv_vluxseg2ei8_v_u8mf8(__VA_ARGS__) // vluxseg<2..8>ei<8|16|32|64>_v_u8* aliases: each un-prefixed name expands to the __riscv_-prefixed intrinsic of the same name, forwarding all arguments via __VA_ARGS__
+#define vluxseg3ei8_v_u8mf8(...) __riscv_vluxseg3ei8_v_u8mf8(__VA_ARGS__)
+#define vluxseg4ei8_v_u8mf8(...) __riscv_vluxseg4ei8_v_u8mf8(__VA_ARGS__)
+#define vluxseg5ei8_v_u8mf8(...) __riscv_vluxseg5ei8_v_u8mf8(__VA_ARGS__)
+#define vluxseg6ei8_v_u8mf8(...) __riscv_vluxseg6ei8_v_u8mf8(__VA_ARGS__)
+#define vluxseg7ei8_v_u8mf8(...) __riscv_vluxseg7ei8_v_u8mf8(__VA_ARGS__)
+#define vluxseg8ei8_v_u8mf8(...) __riscv_vluxseg8ei8_v_u8mf8(__VA_ARGS__)
+#define vluxseg2ei8_v_u8mf4(...) __riscv_vluxseg2ei8_v_u8mf4(__VA_ARGS__)
+#define vluxseg3ei8_v_u8mf4(...) __riscv_vluxseg3ei8_v_u8mf4(__VA_ARGS__)
+#define vluxseg4ei8_v_u8mf4(...) __riscv_vluxseg4ei8_v_u8mf4(__VA_ARGS__)
+#define vluxseg5ei8_v_u8mf4(...) __riscv_vluxseg5ei8_v_u8mf4(__VA_ARGS__)
+#define vluxseg6ei8_v_u8mf4(...) __riscv_vluxseg6ei8_v_u8mf4(__VA_ARGS__)
+#define vluxseg7ei8_v_u8mf4(...) __riscv_vluxseg7ei8_v_u8mf4(__VA_ARGS__)
+#define vluxseg8ei8_v_u8mf4(...) __riscv_vluxseg8ei8_v_u8mf4(__VA_ARGS__)
+#define vluxseg2ei8_v_u8mf2(...) __riscv_vluxseg2ei8_v_u8mf2(__VA_ARGS__)
+#define vluxseg3ei8_v_u8mf2(...) __riscv_vluxseg3ei8_v_u8mf2(__VA_ARGS__)
+#define vluxseg4ei8_v_u8mf2(...) __riscv_vluxseg4ei8_v_u8mf2(__VA_ARGS__)
+#define vluxseg5ei8_v_u8mf2(...) __riscv_vluxseg5ei8_v_u8mf2(__VA_ARGS__)
+#define vluxseg6ei8_v_u8mf2(...) __riscv_vluxseg6ei8_v_u8mf2(__VA_ARGS__)
+#define vluxseg7ei8_v_u8mf2(...) __riscv_vluxseg7ei8_v_u8mf2(__VA_ARGS__)
+#define vluxseg8ei8_v_u8mf2(...) __riscv_vluxseg8ei8_v_u8mf2(__VA_ARGS__)
+#define vluxseg2ei8_v_u8m1(...) __riscv_vluxseg2ei8_v_u8m1(__VA_ARGS__)
+#define vluxseg3ei8_v_u8m1(...) __riscv_vluxseg3ei8_v_u8m1(__VA_ARGS__)
+#define vluxseg4ei8_v_u8m1(...) __riscv_vluxseg4ei8_v_u8m1(__VA_ARGS__)
+#define vluxseg5ei8_v_u8m1(...) __riscv_vluxseg5ei8_v_u8m1(__VA_ARGS__)
+#define vluxseg6ei8_v_u8m1(...) __riscv_vluxseg6ei8_v_u8m1(__VA_ARGS__)
+#define vluxseg7ei8_v_u8m1(...) __riscv_vluxseg7ei8_v_u8m1(__VA_ARGS__)
+#define vluxseg8ei8_v_u8m1(...) __riscv_vluxseg8ei8_v_u8m1(__VA_ARGS__)
+#define vluxseg2ei8_v_u8m2(...) __riscv_vluxseg2ei8_v_u8m2(__VA_ARGS__)
+#define vluxseg3ei8_v_u8m2(...) __riscv_vluxseg3ei8_v_u8m2(__VA_ARGS__)
+#define vluxseg4ei8_v_u8m2(...) __riscv_vluxseg4ei8_v_u8m2(__VA_ARGS__)
+#define vluxseg2ei8_v_u8m4(...) __riscv_vluxseg2ei8_v_u8m4(__VA_ARGS__)
+#define vluxseg2ei16_v_u8mf8(...) __riscv_vluxseg2ei16_v_u8mf8(__VA_ARGS__)
+#define vluxseg3ei16_v_u8mf8(...) __riscv_vluxseg3ei16_v_u8mf8(__VA_ARGS__)
+#define vluxseg4ei16_v_u8mf8(...) __riscv_vluxseg4ei16_v_u8mf8(__VA_ARGS__)
+#define vluxseg5ei16_v_u8mf8(...) __riscv_vluxseg5ei16_v_u8mf8(__VA_ARGS__)
+#define vluxseg6ei16_v_u8mf8(...) __riscv_vluxseg6ei16_v_u8mf8(__VA_ARGS__)
+#define vluxseg7ei16_v_u8mf8(...) __riscv_vluxseg7ei16_v_u8mf8(__VA_ARGS__)
+#define vluxseg8ei16_v_u8mf8(...) __riscv_vluxseg8ei16_v_u8mf8(__VA_ARGS__)
+#define vluxseg2ei16_v_u8mf4(...) __riscv_vluxseg2ei16_v_u8mf4(__VA_ARGS__)
+#define vluxseg3ei16_v_u8mf4(...) __riscv_vluxseg3ei16_v_u8mf4(__VA_ARGS__)
+#define vluxseg4ei16_v_u8mf4(...) __riscv_vluxseg4ei16_v_u8mf4(__VA_ARGS__)
+#define vluxseg5ei16_v_u8mf4(...) __riscv_vluxseg5ei16_v_u8mf4(__VA_ARGS__)
+#define vluxseg6ei16_v_u8mf4(...) __riscv_vluxseg6ei16_v_u8mf4(__VA_ARGS__)
+#define vluxseg7ei16_v_u8mf4(...) __riscv_vluxseg7ei16_v_u8mf4(__VA_ARGS__)
+#define vluxseg8ei16_v_u8mf4(...) __riscv_vluxseg8ei16_v_u8mf4(__VA_ARGS__)
+#define vluxseg2ei16_v_u8mf2(...) __riscv_vluxseg2ei16_v_u8mf2(__VA_ARGS__)
+#define vluxseg3ei16_v_u8mf2(...) __riscv_vluxseg3ei16_v_u8mf2(__VA_ARGS__)
+#define vluxseg4ei16_v_u8mf2(...) __riscv_vluxseg4ei16_v_u8mf2(__VA_ARGS__)
+#define vluxseg5ei16_v_u8mf2(...) __riscv_vluxseg5ei16_v_u8mf2(__VA_ARGS__)
+#define vluxseg6ei16_v_u8mf2(...) __riscv_vluxseg6ei16_v_u8mf2(__VA_ARGS__)
+#define vluxseg7ei16_v_u8mf2(...) __riscv_vluxseg7ei16_v_u8mf2(__VA_ARGS__)
+#define vluxseg8ei16_v_u8mf2(...) __riscv_vluxseg8ei16_v_u8mf2(__VA_ARGS__)
+#define vluxseg2ei16_v_u8m1(...) __riscv_vluxseg2ei16_v_u8m1(__VA_ARGS__)
+#define vluxseg3ei16_v_u8m1(...) __riscv_vluxseg3ei16_v_u8m1(__VA_ARGS__)
+#define vluxseg4ei16_v_u8m1(...) __riscv_vluxseg4ei16_v_u8m1(__VA_ARGS__)
+#define vluxseg5ei16_v_u8m1(...) __riscv_vluxseg5ei16_v_u8m1(__VA_ARGS__)
+#define vluxseg6ei16_v_u8m1(...) __riscv_vluxseg6ei16_v_u8m1(__VA_ARGS__)
+#define vluxseg7ei16_v_u8m1(...) __riscv_vluxseg7ei16_v_u8m1(__VA_ARGS__)
+#define vluxseg8ei16_v_u8m1(...) __riscv_vluxseg8ei16_v_u8m1(__VA_ARGS__)
+#define vluxseg2ei16_v_u8m2(...) __riscv_vluxseg2ei16_v_u8m2(__VA_ARGS__)
+#define vluxseg3ei16_v_u8m2(...) __riscv_vluxseg3ei16_v_u8m2(__VA_ARGS__)
+#define vluxseg4ei16_v_u8m2(...) __riscv_vluxseg4ei16_v_u8m2(__VA_ARGS__)
+#define vluxseg2ei16_v_u8m4(...) __riscv_vluxseg2ei16_v_u8m4(__VA_ARGS__)
+#define vluxseg2ei32_v_u8mf8(...) __riscv_vluxseg2ei32_v_u8mf8(__VA_ARGS__)
+#define vluxseg3ei32_v_u8mf8(...) __riscv_vluxseg3ei32_v_u8mf8(__VA_ARGS__)
+#define vluxseg4ei32_v_u8mf8(...) __riscv_vluxseg4ei32_v_u8mf8(__VA_ARGS__)
+#define vluxseg5ei32_v_u8mf8(...) __riscv_vluxseg5ei32_v_u8mf8(__VA_ARGS__)
+#define vluxseg6ei32_v_u8mf8(...) __riscv_vluxseg6ei32_v_u8mf8(__VA_ARGS__)
+#define vluxseg7ei32_v_u8mf8(...) __riscv_vluxseg7ei32_v_u8mf8(__VA_ARGS__)
+#define vluxseg8ei32_v_u8mf8(...) __riscv_vluxseg8ei32_v_u8mf8(__VA_ARGS__)
+#define vluxseg2ei32_v_u8mf4(...) __riscv_vluxseg2ei32_v_u8mf4(__VA_ARGS__)
+#define vluxseg3ei32_v_u8mf4(...) __riscv_vluxseg3ei32_v_u8mf4(__VA_ARGS__)
+#define vluxseg4ei32_v_u8mf4(...) __riscv_vluxseg4ei32_v_u8mf4(__VA_ARGS__)
+#define vluxseg5ei32_v_u8mf4(...) __riscv_vluxseg5ei32_v_u8mf4(__VA_ARGS__)
+#define vluxseg6ei32_v_u8mf4(...) __riscv_vluxseg6ei32_v_u8mf4(__VA_ARGS__)
+#define vluxseg7ei32_v_u8mf4(...) __riscv_vluxseg7ei32_v_u8mf4(__VA_ARGS__)
+#define vluxseg8ei32_v_u8mf4(...) __riscv_vluxseg8ei32_v_u8mf4(__VA_ARGS__)
+#define vluxseg2ei32_v_u8mf2(...) __riscv_vluxseg2ei32_v_u8mf2(__VA_ARGS__)
+#define vluxseg3ei32_v_u8mf2(...) __riscv_vluxseg3ei32_v_u8mf2(__VA_ARGS__)
+#define vluxseg4ei32_v_u8mf2(...) __riscv_vluxseg4ei32_v_u8mf2(__VA_ARGS__)
+#define vluxseg5ei32_v_u8mf2(...) __riscv_vluxseg5ei32_v_u8mf2(__VA_ARGS__)
+#define vluxseg6ei32_v_u8mf2(...) __riscv_vluxseg6ei32_v_u8mf2(__VA_ARGS__)
+#define vluxseg7ei32_v_u8mf2(...) __riscv_vluxseg7ei32_v_u8mf2(__VA_ARGS__)
+#define vluxseg8ei32_v_u8mf2(...) __riscv_vluxseg8ei32_v_u8mf2(__VA_ARGS__)
+#define vluxseg2ei32_v_u8m1(...) __riscv_vluxseg2ei32_v_u8m1(__VA_ARGS__)
+#define vluxseg3ei32_v_u8m1(...) __riscv_vluxseg3ei32_v_u8m1(__VA_ARGS__)
+#define vluxseg4ei32_v_u8m1(...) __riscv_vluxseg4ei32_v_u8m1(__VA_ARGS__)
+#define vluxseg5ei32_v_u8m1(...) __riscv_vluxseg5ei32_v_u8m1(__VA_ARGS__)
+#define vluxseg6ei32_v_u8m1(...) __riscv_vluxseg6ei32_v_u8m1(__VA_ARGS__)
+#define vluxseg7ei32_v_u8m1(...) __riscv_vluxseg7ei32_v_u8m1(__VA_ARGS__)
+#define vluxseg8ei32_v_u8m1(...) __riscv_vluxseg8ei32_v_u8m1(__VA_ARGS__)
+#define vluxseg2ei32_v_u8m2(...) __riscv_vluxseg2ei32_v_u8m2(__VA_ARGS__)
+#define vluxseg3ei32_v_u8m2(...) __riscv_vluxseg3ei32_v_u8m2(__VA_ARGS__)
+#define vluxseg4ei32_v_u8m2(...) __riscv_vluxseg4ei32_v_u8m2(__VA_ARGS__)
+#define vluxseg2ei64_v_u8mf8(...) __riscv_vluxseg2ei64_v_u8mf8(__VA_ARGS__)
+#define vluxseg3ei64_v_u8mf8(...) __riscv_vluxseg3ei64_v_u8mf8(__VA_ARGS__)
+#define vluxseg4ei64_v_u8mf8(...) __riscv_vluxseg4ei64_v_u8mf8(__VA_ARGS__)
+#define vluxseg5ei64_v_u8mf8(...) __riscv_vluxseg5ei64_v_u8mf8(__VA_ARGS__)
+#define vluxseg6ei64_v_u8mf8(...) __riscv_vluxseg6ei64_v_u8mf8(__VA_ARGS__)
+#define vluxseg7ei64_v_u8mf8(...) __riscv_vluxseg7ei64_v_u8mf8(__VA_ARGS__)
+#define vluxseg8ei64_v_u8mf8(...) __riscv_vluxseg8ei64_v_u8mf8(__VA_ARGS__)
+#define vluxseg2ei64_v_u8mf4(...) __riscv_vluxseg2ei64_v_u8mf4(__VA_ARGS__)
+#define vluxseg3ei64_v_u8mf4(...) __riscv_vluxseg3ei64_v_u8mf4(__VA_ARGS__)
+#define vluxseg4ei64_v_u8mf4(...) __riscv_vluxseg4ei64_v_u8mf4(__VA_ARGS__)
+#define vluxseg5ei64_v_u8mf4(...) __riscv_vluxseg5ei64_v_u8mf4(__VA_ARGS__)
+#define vluxseg6ei64_v_u8mf4(...) __riscv_vluxseg6ei64_v_u8mf4(__VA_ARGS__)
+#define vluxseg7ei64_v_u8mf4(...) __riscv_vluxseg7ei64_v_u8mf4(__VA_ARGS__)
+#define vluxseg8ei64_v_u8mf4(...) __riscv_vluxseg8ei64_v_u8mf4(__VA_ARGS__)
+#define vluxseg2ei64_v_u8mf2(...) __riscv_vluxseg2ei64_v_u8mf2(__VA_ARGS__)
+#define vluxseg3ei64_v_u8mf2(...) __riscv_vluxseg3ei64_v_u8mf2(__VA_ARGS__)
+#define vluxseg4ei64_v_u8mf2(...) __riscv_vluxseg4ei64_v_u8mf2(__VA_ARGS__)
+#define vluxseg5ei64_v_u8mf2(...) __riscv_vluxseg5ei64_v_u8mf2(__VA_ARGS__)
+#define vluxseg6ei64_v_u8mf2(...) __riscv_vluxseg6ei64_v_u8mf2(__VA_ARGS__)
+#define vluxseg7ei64_v_u8mf2(...) __riscv_vluxseg7ei64_v_u8mf2(__VA_ARGS__)
+#define vluxseg8ei64_v_u8mf2(...) __riscv_vluxseg8ei64_v_u8mf2(__VA_ARGS__)
+#define vluxseg2ei64_v_u8m1(...) __riscv_vluxseg2ei64_v_u8m1(__VA_ARGS__)
+#define vluxseg3ei64_v_u8m1(...) __riscv_vluxseg3ei64_v_u8m1(__VA_ARGS__)
+#define vluxseg4ei64_v_u8m1(...) __riscv_vluxseg4ei64_v_u8m1(__VA_ARGS__)
+#define vluxseg5ei64_v_u8m1(...) __riscv_vluxseg5ei64_v_u8m1(__VA_ARGS__)
+#define vluxseg6ei64_v_u8m1(...) __riscv_vluxseg6ei64_v_u8m1(__VA_ARGS__)
+#define vluxseg7ei64_v_u8m1(...) __riscv_vluxseg7ei64_v_u8m1(__VA_ARGS__)
+#define vluxseg8ei64_v_u8m1(...) __riscv_vluxseg8ei64_v_u8m1(__VA_ARGS__)
+#define vluxseg2ei8_v_u16mf4(...) __riscv_vluxseg2ei8_v_u16mf4(__VA_ARGS__) // vluxseg<2..8>ei<8|16|32|64>_v_u16* aliases: each un-prefixed name expands to the __riscv_-prefixed intrinsic of the same name, forwarding all arguments via __VA_ARGS__
+#define vluxseg3ei8_v_u16mf4(...) __riscv_vluxseg3ei8_v_u16mf4(__VA_ARGS__)
+#define vluxseg4ei8_v_u16mf4(...) __riscv_vluxseg4ei8_v_u16mf4(__VA_ARGS__)
+#define vluxseg5ei8_v_u16mf4(...) __riscv_vluxseg5ei8_v_u16mf4(__VA_ARGS__)
+#define vluxseg6ei8_v_u16mf4(...) __riscv_vluxseg6ei8_v_u16mf4(__VA_ARGS__)
+#define vluxseg7ei8_v_u16mf4(...) __riscv_vluxseg7ei8_v_u16mf4(__VA_ARGS__)
+#define vluxseg8ei8_v_u16mf4(...) __riscv_vluxseg8ei8_v_u16mf4(__VA_ARGS__)
+#define vluxseg2ei8_v_u16mf2(...) __riscv_vluxseg2ei8_v_u16mf2(__VA_ARGS__)
+#define vluxseg3ei8_v_u16mf2(...) __riscv_vluxseg3ei8_v_u16mf2(__VA_ARGS__)
+#define vluxseg4ei8_v_u16mf2(...) __riscv_vluxseg4ei8_v_u16mf2(__VA_ARGS__)
+#define vluxseg5ei8_v_u16mf2(...) __riscv_vluxseg5ei8_v_u16mf2(__VA_ARGS__)
+#define vluxseg6ei8_v_u16mf2(...) __riscv_vluxseg6ei8_v_u16mf2(__VA_ARGS__)
+#define vluxseg7ei8_v_u16mf2(...) __riscv_vluxseg7ei8_v_u16mf2(__VA_ARGS__)
+#define vluxseg8ei8_v_u16mf2(...) __riscv_vluxseg8ei8_v_u16mf2(__VA_ARGS__)
+#define vluxseg2ei8_v_u16m1(...) __riscv_vluxseg2ei8_v_u16m1(__VA_ARGS__)
+#define vluxseg3ei8_v_u16m1(...) __riscv_vluxseg3ei8_v_u16m1(__VA_ARGS__)
+#define vluxseg4ei8_v_u16m1(...) __riscv_vluxseg4ei8_v_u16m1(__VA_ARGS__)
+#define vluxseg5ei8_v_u16m1(...) __riscv_vluxseg5ei8_v_u16m1(__VA_ARGS__)
+#define vluxseg6ei8_v_u16m1(...) __riscv_vluxseg6ei8_v_u16m1(__VA_ARGS__)
+#define vluxseg7ei8_v_u16m1(...) __riscv_vluxseg7ei8_v_u16m1(__VA_ARGS__)
+#define vluxseg8ei8_v_u16m1(...) __riscv_vluxseg8ei8_v_u16m1(__VA_ARGS__)
+#define vluxseg2ei8_v_u16m2(...) __riscv_vluxseg2ei8_v_u16m2(__VA_ARGS__)
+#define vluxseg3ei8_v_u16m2(...) __riscv_vluxseg3ei8_v_u16m2(__VA_ARGS__)
+#define vluxseg4ei8_v_u16m2(...) __riscv_vluxseg4ei8_v_u16m2(__VA_ARGS__)
+#define vluxseg2ei8_v_u16m4(...) __riscv_vluxseg2ei8_v_u16m4(__VA_ARGS__)
+#define vluxseg2ei16_v_u16mf4(...) __riscv_vluxseg2ei16_v_u16mf4(__VA_ARGS__)
+#define vluxseg3ei16_v_u16mf4(...) __riscv_vluxseg3ei16_v_u16mf4(__VA_ARGS__)
+#define vluxseg4ei16_v_u16mf4(...) __riscv_vluxseg4ei16_v_u16mf4(__VA_ARGS__)
+#define vluxseg5ei16_v_u16mf4(...) __riscv_vluxseg5ei16_v_u16mf4(__VA_ARGS__)
+#define vluxseg6ei16_v_u16mf4(...) __riscv_vluxseg6ei16_v_u16mf4(__VA_ARGS__)
+#define vluxseg7ei16_v_u16mf4(...) __riscv_vluxseg7ei16_v_u16mf4(__VA_ARGS__)
+#define vluxseg8ei16_v_u16mf4(...) __riscv_vluxseg8ei16_v_u16mf4(__VA_ARGS__)
+#define vluxseg2ei16_v_u16mf2(...) __riscv_vluxseg2ei16_v_u16mf2(__VA_ARGS__)
+#define vluxseg3ei16_v_u16mf2(...) __riscv_vluxseg3ei16_v_u16mf2(__VA_ARGS__)
+#define vluxseg4ei16_v_u16mf2(...) __riscv_vluxseg4ei16_v_u16mf2(__VA_ARGS__)
+#define vluxseg5ei16_v_u16mf2(...) __riscv_vluxseg5ei16_v_u16mf2(__VA_ARGS__)
+#define vluxseg6ei16_v_u16mf2(...) __riscv_vluxseg6ei16_v_u16mf2(__VA_ARGS__)
+#define vluxseg7ei16_v_u16mf2(...) __riscv_vluxseg7ei16_v_u16mf2(__VA_ARGS__)
+#define vluxseg8ei16_v_u16mf2(...) __riscv_vluxseg8ei16_v_u16mf2(__VA_ARGS__)
+#define vluxseg2ei16_v_u16m1(...) __riscv_vluxseg2ei16_v_u16m1(__VA_ARGS__)
+#define vluxseg3ei16_v_u16m1(...) __riscv_vluxseg3ei16_v_u16m1(__VA_ARGS__)
+#define vluxseg4ei16_v_u16m1(...) __riscv_vluxseg4ei16_v_u16m1(__VA_ARGS__)
+#define vluxseg5ei16_v_u16m1(...) __riscv_vluxseg5ei16_v_u16m1(__VA_ARGS__)
+#define vluxseg6ei16_v_u16m1(...) __riscv_vluxseg6ei16_v_u16m1(__VA_ARGS__)
+#define vluxseg7ei16_v_u16m1(...) __riscv_vluxseg7ei16_v_u16m1(__VA_ARGS__)
+#define vluxseg8ei16_v_u16m1(...) __riscv_vluxseg8ei16_v_u16m1(__VA_ARGS__)
+#define vluxseg2ei16_v_u16m2(...) __riscv_vluxseg2ei16_v_u16m2(__VA_ARGS__)
+#define vluxseg3ei16_v_u16m2(...) __riscv_vluxseg3ei16_v_u16m2(__VA_ARGS__)
+#define vluxseg4ei16_v_u16m2(...) __riscv_vluxseg4ei16_v_u16m2(__VA_ARGS__)
+#define vluxseg2ei16_v_u16m4(...) __riscv_vluxseg2ei16_v_u16m4(__VA_ARGS__)
+#define vluxseg2ei32_v_u16mf4(...) __riscv_vluxseg2ei32_v_u16mf4(__VA_ARGS__)
+#define vluxseg3ei32_v_u16mf4(...) __riscv_vluxseg3ei32_v_u16mf4(__VA_ARGS__)
+#define vluxseg4ei32_v_u16mf4(...) __riscv_vluxseg4ei32_v_u16mf4(__VA_ARGS__)
+#define vluxseg5ei32_v_u16mf4(...) __riscv_vluxseg5ei32_v_u16mf4(__VA_ARGS__)
+#define vluxseg6ei32_v_u16mf4(...) __riscv_vluxseg6ei32_v_u16mf4(__VA_ARGS__)
+#define vluxseg7ei32_v_u16mf4(...) __riscv_vluxseg7ei32_v_u16mf4(__VA_ARGS__)
+#define vluxseg8ei32_v_u16mf4(...) __riscv_vluxseg8ei32_v_u16mf4(__VA_ARGS__)
+#define vluxseg2ei32_v_u16mf2(...) __riscv_vluxseg2ei32_v_u16mf2(__VA_ARGS__)
+#define vluxseg3ei32_v_u16mf2(...) __riscv_vluxseg3ei32_v_u16mf2(__VA_ARGS__)
+#define vluxseg4ei32_v_u16mf2(...) __riscv_vluxseg4ei32_v_u16mf2(__VA_ARGS__)
+#define vluxseg5ei32_v_u16mf2(...) __riscv_vluxseg5ei32_v_u16mf2(__VA_ARGS__)
+#define vluxseg6ei32_v_u16mf2(...) __riscv_vluxseg6ei32_v_u16mf2(__VA_ARGS__)
+#define vluxseg7ei32_v_u16mf2(...) __riscv_vluxseg7ei32_v_u16mf2(__VA_ARGS__)
+#define vluxseg8ei32_v_u16mf2(...) __riscv_vluxseg8ei32_v_u16mf2(__VA_ARGS__)
+#define vluxseg2ei32_v_u16m1(...) __riscv_vluxseg2ei32_v_u16m1(__VA_ARGS__)
+#define vluxseg3ei32_v_u16m1(...) __riscv_vluxseg3ei32_v_u16m1(__VA_ARGS__)
+#define vluxseg4ei32_v_u16m1(...) __riscv_vluxseg4ei32_v_u16m1(__VA_ARGS__)
+#define vluxseg5ei32_v_u16m1(...) __riscv_vluxseg5ei32_v_u16m1(__VA_ARGS__)
+#define vluxseg6ei32_v_u16m1(...) __riscv_vluxseg6ei32_v_u16m1(__VA_ARGS__)
+#define vluxseg7ei32_v_u16m1(...) __riscv_vluxseg7ei32_v_u16m1(__VA_ARGS__)
+#define vluxseg8ei32_v_u16m1(...) __riscv_vluxseg8ei32_v_u16m1(__VA_ARGS__)
+#define vluxseg2ei32_v_u16m2(...) __riscv_vluxseg2ei32_v_u16m2(__VA_ARGS__)
+#define vluxseg3ei32_v_u16m2(...) __riscv_vluxseg3ei32_v_u16m2(__VA_ARGS__)
+#define vluxseg4ei32_v_u16m2(...) __riscv_vluxseg4ei32_v_u16m2(__VA_ARGS__)
+#define vluxseg2ei32_v_u16m4(...) __riscv_vluxseg2ei32_v_u16m4(__VA_ARGS__)
+#define vluxseg2ei64_v_u16mf4(...) __riscv_vluxseg2ei64_v_u16mf4(__VA_ARGS__)
+#define vluxseg3ei64_v_u16mf4(...) __riscv_vluxseg3ei64_v_u16mf4(__VA_ARGS__)
+#define vluxseg4ei64_v_u16mf4(...) __riscv_vluxseg4ei64_v_u16mf4(__VA_ARGS__)
+#define vluxseg5ei64_v_u16mf4(...) __riscv_vluxseg5ei64_v_u16mf4(__VA_ARGS__)
+#define vluxseg6ei64_v_u16mf4(...) __riscv_vluxseg6ei64_v_u16mf4(__VA_ARGS__)
+#define vluxseg7ei64_v_u16mf4(...) __riscv_vluxseg7ei64_v_u16mf4(__VA_ARGS__)
+#define vluxseg8ei64_v_u16mf4(...) __riscv_vluxseg8ei64_v_u16mf4(__VA_ARGS__)
+#define vluxseg2ei64_v_u16mf2(...) __riscv_vluxseg2ei64_v_u16mf2(__VA_ARGS__)
+#define vluxseg3ei64_v_u16mf2(...) __riscv_vluxseg3ei64_v_u16mf2(__VA_ARGS__)
+#define vluxseg4ei64_v_u16mf2(...) __riscv_vluxseg4ei64_v_u16mf2(__VA_ARGS__)
+#define vluxseg5ei64_v_u16mf2(...) __riscv_vluxseg5ei64_v_u16mf2(__VA_ARGS__)
+#define vluxseg6ei64_v_u16mf2(...) __riscv_vluxseg6ei64_v_u16mf2(__VA_ARGS__)
+#define vluxseg7ei64_v_u16mf2(...) __riscv_vluxseg7ei64_v_u16mf2(__VA_ARGS__)
+#define vluxseg8ei64_v_u16mf2(...) __riscv_vluxseg8ei64_v_u16mf2(__VA_ARGS__)
+#define vluxseg2ei64_v_u16m1(...) __riscv_vluxseg2ei64_v_u16m1(__VA_ARGS__)
+#define vluxseg3ei64_v_u16m1(...) __riscv_vluxseg3ei64_v_u16m1(__VA_ARGS__)
+#define vluxseg4ei64_v_u16m1(...) __riscv_vluxseg4ei64_v_u16m1(__VA_ARGS__)
+#define vluxseg5ei64_v_u16m1(...) __riscv_vluxseg5ei64_v_u16m1(__VA_ARGS__)
+#define vluxseg6ei64_v_u16m1(...) __riscv_vluxseg6ei64_v_u16m1(__VA_ARGS__)
+#define vluxseg7ei64_v_u16m1(...) __riscv_vluxseg7ei64_v_u16m1(__VA_ARGS__)
+#define vluxseg8ei64_v_u16m1(...) __riscv_vluxseg8ei64_v_u16m1(__VA_ARGS__)
+#define vluxseg2ei64_v_u16m2(...) __riscv_vluxseg2ei64_v_u16m2(__VA_ARGS__)
+#define vluxseg3ei64_v_u16m2(...) __riscv_vluxseg3ei64_v_u16m2(__VA_ARGS__)
+#define vluxseg4ei64_v_u16m2(...) __riscv_vluxseg4ei64_v_u16m2(__VA_ARGS__)
+#define vluxseg2ei8_v_u32mf2(...) __riscv_vluxseg2ei8_v_u32mf2(__VA_ARGS__) // vluxseg<2..8>ei<8|16|32|64>_v_u32* aliases: each un-prefixed name expands to the __riscv_-prefixed intrinsic of the same name, forwarding all arguments via __VA_ARGS__
+#define vluxseg3ei8_v_u32mf2(...) __riscv_vluxseg3ei8_v_u32mf2(__VA_ARGS__)
+#define vluxseg4ei8_v_u32mf2(...) __riscv_vluxseg4ei8_v_u32mf2(__VA_ARGS__)
+#define vluxseg5ei8_v_u32mf2(...) __riscv_vluxseg5ei8_v_u32mf2(__VA_ARGS__)
+#define vluxseg6ei8_v_u32mf2(...) __riscv_vluxseg6ei8_v_u32mf2(__VA_ARGS__)
+#define vluxseg7ei8_v_u32mf2(...) __riscv_vluxseg7ei8_v_u32mf2(__VA_ARGS__)
+#define vluxseg8ei8_v_u32mf2(...) __riscv_vluxseg8ei8_v_u32mf2(__VA_ARGS__)
+#define vluxseg2ei8_v_u32m1(...) __riscv_vluxseg2ei8_v_u32m1(__VA_ARGS__)
+#define vluxseg3ei8_v_u32m1(...) __riscv_vluxseg3ei8_v_u32m1(__VA_ARGS__)
+#define vluxseg4ei8_v_u32m1(...) __riscv_vluxseg4ei8_v_u32m1(__VA_ARGS__)
+#define vluxseg5ei8_v_u32m1(...) __riscv_vluxseg5ei8_v_u32m1(__VA_ARGS__)
+#define vluxseg6ei8_v_u32m1(...) __riscv_vluxseg6ei8_v_u32m1(__VA_ARGS__)
+#define vluxseg7ei8_v_u32m1(...) __riscv_vluxseg7ei8_v_u32m1(__VA_ARGS__)
+#define vluxseg8ei8_v_u32m1(...) __riscv_vluxseg8ei8_v_u32m1(__VA_ARGS__)
+#define vluxseg2ei8_v_u32m2(...) __riscv_vluxseg2ei8_v_u32m2(__VA_ARGS__)
+#define vluxseg3ei8_v_u32m2(...) __riscv_vluxseg3ei8_v_u32m2(__VA_ARGS__)
+#define vluxseg4ei8_v_u32m2(...) __riscv_vluxseg4ei8_v_u32m2(__VA_ARGS__)
+#define vluxseg2ei8_v_u32m4(...) __riscv_vluxseg2ei8_v_u32m4(__VA_ARGS__)
+#define vluxseg2ei16_v_u32mf2(...) __riscv_vluxseg2ei16_v_u32mf2(__VA_ARGS__)
+#define vluxseg3ei16_v_u32mf2(...) __riscv_vluxseg3ei16_v_u32mf2(__VA_ARGS__)
+#define vluxseg4ei16_v_u32mf2(...) __riscv_vluxseg4ei16_v_u32mf2(__VA_ARGS__)
+#define vluxseg5ei16_v_u32mf2(...) __riscv_vluxseg5ei16_v_u32mf2(__VA_ARGS__)
+#define vluxseg6ei16_v_u32mf2(...) __riscv_vluxseg6ei16_v_u32mf2(__VA_ARGS__)
+#define vluxseg7ei16_v_u32mf2(...) __riscv_vluxseg7ei16_v_u32mf2(__VA_ARGS__)
+#define vluxseg8ei16_v_u32mf2(...) __riscv_vluxseg8ei16_v_u32mf2(__VA_ARGS__)
+#define vluxseg2ei16_v_u32m1(...) __riscv_vluxseg2ei16_v_u32m1(__VA_ARGS__)
+#define vluxseg3ei16_v_u32m1(...) __riscv_vluxseg3ei16_v_u32m1(__VA_ARGS__)
+#define vluxseg4ei16_v_u32m1(...) __riscv_vluxseg4ei16_v_u32m1(__VA_ARGS__)
+#define vluxseg5ei16_v_u32m1(...) __riscv_vluxseg5ei16_v_u32m1(__VA_ARGS__)
+#define vluxseg6ei16_v_u32m1(...) __riscv_vluxseg6ei16_v_u32m1(__VA_ARGS__)
+#define vluxseg7ei16_v_u32m1(...) __riscv_vluxseg7ei16_v_u32m1(__VA_ARGS__)
+#define vluxseg8ei16_v_u32m1(...) __riscv_vluxseg8ei16_v_u32m1(__VA_ARGS__)
+#define vluxseg2ei16_v_u32m2(...) __riscv_vluxseg2ei16_v_u32m2(__VA_ARGS__)
+#define vluxseg3ei16_v_u32m2(...) __riscv_vluxseg3ei16_v_u32m2(__VA_ARGS__)
+#define vluxseg4ei16_v_u32m2(...) __riscv_vluxseg4ei16_v_u32m2(__VA_ARGS__)
+#define vluxseg2ei16_v_u32m4(...) __riscv_vluxseg2ei16_v_u32m4(__VA_ARGS__)
+#define vluxseg2ei32_v_u32mf2(...) __riscv_vluxseg2ei32_v_u32mf2(__VA_ARGS__)
+#define vluxseg3ei32_v_u32mf2(...) __riscv_vluxseg3ei32_v_u32mf2(__VA_ARGS__)
+#define vluxseg4ei32_v_u32mf2(...) __riscv_vluxseg4ei32_v_u32mf2(__VA_ARGS__)
+#define vluxseg5ei32_v_u32mf2(...) __riscv_vluxseg5ei32_v_u32mf2(__VA_ARGS__)
+#define vluxseg6ei32_v_u32mf2(...) __riscv_vluxseg6ei32_v_u32mf2(__VA_ARGS__)
+#define vluxseg7ei32_v_u32mf2(...) __riscv_vluxseg7ei32_v_u32mf2(__VA_ARGS__)
+#define vluxseg8ei32_v_u32mf2(...) __riscv_vluxseg8ei32_v_u32mf2(__VA_ARGS__)
+#define vluxseg2ei32_v_u32m1(...) __riscv_vluxseg2ei32_v_u32m1(__VA_ARGS__)
+#define vluxseg3ei32_v_u32m1(...) __riscv_vluxseg3ei32_v_u32m1(__VA_ARGS__)
+#define vluxseg4ei32_v_u32m1(...) __riscv_vluxseg4ei32_v_u32m1(__VA_ARGS__)
+#define vluxseg5ei32_v_u32m1(...) __riscv_vluxseg5ei32_v_u32m1(__VA_ARGS__)
+#define vluxseg6ei32_v_u32m1(...) __riscv_vluxseg6ei32_v_u32m1(__VA_ARGS__)
+#define vluxseg7ei32_v_u32m1(...) __riscv_vluxseg7ei32_v_u32m1(__VA_ARGS__)
+#define vluxseg8ei32_v_u32m1(...) __riscv_vluxseg8ei32_v_u32m1(__VA_ARGS__)
+#define vluxseg2ei32_v_u32m2(...) __riscv_vluxseg2ei32_v_u32m2(__VA_ARGS__)
+#define vluxseg3ei32_v_u32m2(...) __riscv_vluxseg3ei32_v_u32m2(__VA_ARGS__)
+#define vluxseg4ei32_v_u32m2(...) __riscv_vluxseg4ei32_v_u32m2(__VA_ARGS__)
+#define vluxseg2ei32_v_u32m4(...) __riscv_vluxseg2ei32_v_u32m4(__VA_ARGS__)
+#define vluxseg2ei64_v_u32mf2(...) __riscv_vluxseg2ei64_v_u32mf2(__VA_ARGS__)
+#define vluxseg3ei64_v_u32mf2(...) __riscv_vluxseg3ei64_v_u32mf2(__VA_ARGS__)
+#define vluxseg4ei64_v_u32mf2(...) __riscv_vluxseg4ei64_v_u32mf2(__VA_ARGS__)
+#define vluxseg5ei64_v_u32mf2(...) __riscv_vluxseg5ei64_v_u32mf2(__VA_ARGS__)
+#define vluxseg6ei64_v_u32mf2(...) __riscv_vluxseg6ei64_v_u32mf2(__VA_ARGS__)
+#define vluxseg7ei64_v_u32mf2(...) __riscv_vluxseg7ei64_v_u32mf2(__VA_ARGS__)
+#define vluxseg8ei64_v_u32mf2(...) __riscv_vluxseg8ei64_v_u32mf2(__VA_ARGS__)
+#define vluxseg2ei64_v_u32m1(...) __riscv_vluxseg2ei64_v_u32m1(__VA_ARGS__)
+#define vluxseg3ei64_v_u32m1(...) __riscv_vluxseg3ei64_v_u32m1(__VA_ARGS__)
+#define vluxseg4ei64_v_u32m1(...) __riscv_vluxseg4ei64_v_u32m1(__VA_ARGS__)
+#define vluxseg5ei64_v_u32m1(...) __riscv_vluxseg5ei64_v_u32m1(__VA_ARGS__)
+#define vluxseg6ei64_v_u32m1(...) __riscv_vluxseg6ei64_v_u32m1(__VA_ARGS__)
+#define vluxseg7ei64_v_u32m1(...) __riscv_vluxseg7ei64_v_u32m1(__VA_ARGS__)
+#define vluxseg8ei64_v_u32m1(...) __riscv_vluxseg8ei64_v_u32m1(__VA_ARGS__)
+#define vluxseg2ei64_v_u32m2(...) __riscv_vluxseg2ei64_v_u32m2(__VA_ARGS__)
+#define vluxseg3ei64_v_u32m2(...) __riscv_vluxseg3ei64_v_u32m2(__VA_ARGS__)
+#define vluxseg4ei64_v_u32m2(...) __riscv_vluxseg4ei64_v_u32m2(__VA_ARGS__)
+#define vluxseg2ei64_v_u32m4(...) __riscv_vluxseg2ei64_v_u32m4(__VA_ARGS__)
+#define vluxseg2ei8_v_u64m1(...) __riscv_vluxseg2ei8_v_u64m1(__VA_ARGS__) // vluxseg<2..8>ei<8|16|32|64>_v_u64* aliases: each un-prefixed name expands to the __riscv_-prefixed intrinsic of the same name, forwarding all arguments via __VA_ARGS__
+#define vluxseg3ei8_v_u64m1(...) __riscv_vluxseg3ei8_v_u64m1(__VA_ARGS__)
+#define vluxseg4ei8_v_u64m1(...) __riscv_vluxseg4ei8_v_u64m1(__VA_ARGS__)
+#define vluxseg5ei8_v_u64m1(...) __riscv_vluxseg5ei8_v_u64m1(__VA_ARGS__)
+#define vluxseg6ei8_v_u64m1(...) __riscv_vluxseg6ei8_v_u64m1(__VA_ARGS__)
+#define vluxseg7ei8_v_u64m1(...) __riscv_vluxseg7ei8_v_u64m1(__VA_ARGS__)
+#define vluxseg8ei8_v_u64m1(...) __riscv_vluxseg8ei8_v_u64m1(__VA_ARGS__)
+#define vluxseg2ei8_v_u64m2(...) __riscv_vluxseg2ei8_v_u64m2(__VA_ARGS__)
+#define vluxseg3ei8_v_u64m2(...) __riscv_vluxseg3ei8_v_u64m2(__VA_ARGS__)
+#define vluxseg4ei8_v_u64m2(...) __riscv_vluxseg4ei8_v_u64m2(__VA_ARGS__)
+#define vluxseg2ei8_v_u64m4(...) __riscv_vluxseg2ei8_v_u64m4(__VA_ARGS__)
+#define vluxseg2ei16_v_u64m1(...) __riscv_vluxseg2ei16_v_u64m1(__VA_ARGS__)
+#define vluxseg3ei16_v_u64m1(...) __riscv_vluxseg3ei16_v_u64m1(__VA_ARGS__)
+#define vluxseg4ei16_v_u64m1(...) __riscv_vluxseg4ei16_v_u64m1(__VA_ARGS__)
+#define vluxseg5ei16_v_u64m1(...) __riscv_vluxseg5ei16_v_u64m1(__VA_ARGS__)
+#define vluxseg6ei16_v_u64m1(...) __riscv_vluxseg6ei16_v_u64m1(__VA_ARGS__)
+#define vluxseg7ei16_v_u64m1(...) __riscv_vluxseg7ei16_v_u64m1(__VA_ARGS__)
+#define vluxseg8ei16_v_u64m1(...) __riscv_vluxseg8ei16_v_u64m1(__VA_ARGS__)
+#define vluxseg2ei16_v_u64m2(...) __riscv_vluxseg2ei16_v_u64m2(__VA_ARGS__)
+#define vluxseg3ei16_v_u64m2(...) __riscv_vluxseg3ei16_v_u64m2(__VA_ARGS__)
+#define vluxseg4ei16_v_u64m2(...) __riscv_vluxseg4ei16_v_u64m2(__VA_ARGS__)
+#define vluxseg2ei16_v_u64m4(...) __riscv_vluxseg2ei16_v_u64m4(__VA_ARGS__)
+#define vluxseg2ei32_v_u64m1(...) __riscv_vluxseg2ei32_v_u64m1(__VA_ARGS__)
+#define vluxseg3ei32_v_u64m1(...) __riscv_vluxseg3ei32_v_u64m1(__VA_ARGS__)
+#define vluxseg4ei32_v_u64m1(...) __riscv_vluxseg4ei32_v_u64m1(__VA_ARGS__)
+#define vluxseg5ei32_v_u64m1(...) __riscv_vluxseg5ei32_v_u64m1(__VA_ARGS__)
+#define vluxseg6ei32_v_u64m1(...) __riscv_vluxseg6ei32_v_u64m1(__VA_ARGS__)
+#define vluxseg7ei32_v_u64m1(...) __riscv_vluxseg7ei32_v_u64m1(__VA_ARGS__)
+#define vluxseg8ei32_v_u64m1(...) __riscv_vluxseg8ei32_v_u64m1(__VA_ARGS__)
+#define vluxseg2ei32_v_u64m2(...) __riscv_vluxseg2ei32_v_u64m2(__VA_ARGS__)
+#define vluxseg3ei32_v_u64m2(...) __riscv_vluxseg3ei32_v_u64m2(__VA_ARGS__)
+#define vluxseg4ei32_v_u64m2(...) __riscv_vluxseg4ei32_v_u64m2(__VA_ARGS__)
+#define vluxseg2ei32_v_u64m4(...) __riscv_vluxseg2ei32_v_u64m4(__VA_ARGS__)
+#define vluxseg2ei64_v_u64m1(...) __riscv_vluxseg2ei64_v_u64m1(__VA_ARGS__)
+#define vluxseg3ei64_v_u64m1(...) __riscv_vluxseg3ei64_v_u64m1(__VA_ARGS__)
+#define vluxseg4ei64_v_u64m1(...) __riscv_vluxseg4ei64_v_u64m1(__VA_ARGS__)
+#define vluxseg5ei64_v_u64m1(...) __riscv_vluxseg5ei64_v_u64m1(__VA_ARGS__)
+#define vluxseg6ei64_v_u64m1(...) __riscv_vluxseg6ei64_v_u64m1(__VA_ARGS__)
+#define vluxseg7ei64_v_u64m1(...) __riscv_vluxseg7ei64_v_u64m1(__VA_ARGS__)
+#define vluxseg8ei64_v_u64m1(...) __riscv_vluxseg8ei64_v_u64m1(__VA_ARGS__)
+#define vluxseg2ei64_v_u64m2(...) __riscv_vluxseg2ei64_v_u64m2(__VA_ARGS__)
+#define vluxseg3ei64_v_u64m2(...) __riscv_vluxseg3ei64_v_u64m2(__VA_ARGS__)
+#define vluxseg4ei64_v_u64m2(...) __riscv_vluxseg4ei64_v_u64m2(__VA_ARGS__)
+#define vluxseg2ei64_v_u64m4(...) __riscv_vluxseg2ei64_v_u64m4(__VA_ARGS__)
+// masked functions: each un-prefixed `_m` name forwards all arguments via __VA_ARGS__ to the __riscv_-prefixed `_tumu` intrinsic (NOTE(review): `_tumu` is presumably the tail-undisturbed, mask-undisturbed policy variant; confirm against the RVV intrinsics spec)
+#define vloxseg2ei8_v_f16mf4_m(...) __riscv_vloxseg2ei8_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_f16mf4_m(...) __riscv_vloxseg3ei8_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_f16mf4_m(...) __riscv_vloxseg4ei8_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_f16mf4_m(...) __riscv_vloxseg5ei8_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_f16mf4_m(...) __riscv_vloxseg6ei8_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_f16mf4_m(...) __riscv_vloxseg7ei8_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_f16mf4_m(...) __riscv_vloxseg8ei8_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_f16mf2_m(...) __riscv_vloxseg2ei8_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_f16mf2_m(...) __riscv_vloxseg3ei8_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_f16mf2_m(...) __riscv_vloxseg4ei8_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_f16mf2_m(...) __riscv_vloxseg5ei8_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_f16mf2_m(...) __riscv_vloxseg6ei8_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_f16mf2_m(...) __riscv_vloxseg7ei8_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_f16mf2_m(...) __riscv_vloxseg8ei8_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_f16m1_m(...) __riscv_vloxseg2ei8_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_f16m1_m(...) __riscv_vloxseg3ei8_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_f16m1_m(...) __riscv_vloxseg4ei8_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_f16m1_m(...) __riscv_vloxseg5ei8_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_f16m1_m(...) __riscv_vloxseg6ei8_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_f16m1_m(...) __riscv_vloxseg7ei8_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_f16m1_m(...) __riscv_vloxseg8ei8_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_f16m2_m(...) __riscv_vloxseg2ei8_v_f16m2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_f16m2_m(...) __riscv_vloxseg3ei8_v_f16m2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_f16m2_m(...) __riscv_vloxseg4ei8_v_f16m2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_f16m4_m(...) __riscv_vloxseg2ei8_v_f16m4_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_f16mf4_m(...) __riscv_vloxseg2ei16_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_f16mf4_m(...) __riscv_vloxseg3ei16_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_f16mf4_m(...) __riscv_vloxseg4ei16_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_f16mf4_m(...) __riscv_vloxseg5ei16_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_f16mf4_m(...) __riscv_vloxseg6ei16_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_f16mf4_m(...) __riscv_vloxseg7ei16_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_f16mf4_m(...) __riscv_vloxseg8ei16_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_f16mf2_m(...) __riscv_vloxseg2ei16_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_f16mf2_m(...) __riscv_vloxseg3ei16_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_f16mf2_m(...) __riscv_vloxseg4ei16_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_f16mf2_m(...) __riscv_vloxseg5ei16_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_f16mf2_m(...) __riscv_vloxseg6ei16_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_f16mf2_m(...) __riscv_vloxseg7ei16_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_f16mf2_m(...) __riscv_vloxseg8ei16_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_f16m1_m(...) __riscv_vloxseg2ei16_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_f16m1_m(...) __riscv_vloxseg3ei16_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_f16m1_m(...) __riscv_vloxseg4ei16_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_f16m1_m(...) __riscv_vloxseg5ei16_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_f16m1_m(...) __riscv_vloxseg6ei16_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_f16m1_m(...) __riscv_vloxseg7ei16_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_f16m1_m(...) __riscv_vloxseg8ei16_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_f16m2_m(...) __riscv_vloxseg2ei16_v_f16m2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_f16m2_m(...) __riscv_vloxseg3ei16_v_f16m2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_f16m2_m(...) __riscv_vloxseg4ei16_v_f16m2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_f16m4_m(...) __riscv_vloxseg2ei16_v_f16m4_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_f16mf4_m(...) __riscv_vloxseg2ei32_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_f16mf4_m(...) __riscv_vloxseg3ei32_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_f16mf4_m(...) __riscv_vloxseg4ei32_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_f16mf4_m(...) __riscv_vloxseg5ei32_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_f16mf4_m(...) __riscv_vloxseg6ei32_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_f16mf4_m(...) __riscv_vloxseg7ei32_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_f16mf4_m(...) __riscv_vloxseg8ei32_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_f16mf2_m(...) __riscv_vloxseg2ei32_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_f16mf2_m(...) __riscv_vloxseg3ei32_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_f16mf2_m(...) __riscv_vloxseg4ei32_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_f16mf2_m(...) __riscv_vloxseg5ei32_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_f16mf2_m(...) __riscv_vloxseg6ei32_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_f16mf2_m(...) __riscv_vloxseg7ei32_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_f16mf2_m(...) __riscv_vloxseg8ei32_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_f16m1_m(...) __riscv_vloxseg2ei32_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_f16m1_m(...) __riscv_vloxseg3ei32_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_f16m1_m(...) __riscv_vloxseg4ei32_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_f16m1_m(...) __riscv_vloxseg5ei32_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_f16m1_m(...) __riscv_vloxseg6ei32_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_f16m1_m(...) __riscv_vloxseg7ei32_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_f16m1_m(...) __riscv_vloxseg8ei32_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_f16m2_m(...) __riscv_vloxseg2ei32_v_f16m2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_f16m2_m(...) __riscv_vloxseg3ei32_v_f16m2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_f16m2_m(...) __riscv_vloxseg4ei32_v_f16m2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_f16m4_m(...) __riscv_vloxseg2ei32_v_f16m4_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_f16mf4_m(...) __riscv_vloxseg2ei64_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_f16mf4_m(...) __riscv_vloxseg3ei64_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_f16mf4_m(...) __riscv_vloxseg4ei64_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_f16mf4_m(...) __riscv_vloxseg5ei64_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_f16mf4_m(...) __riscv_vloxseg6ei64_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_f16mf4_m(...) __riscv_vloxseg7ei64_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_f16mf4_m(...) __riscv_vloxseg8ei64_v_f16mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_f16mf2_m(...) __riscv_vloxseg2ei64_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_f16mf2_m(...) __riscv_vloxseg3ei64_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_f16mf2_m(...) __riscv_vloxseg4ei64_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_f16mf2_m(...) __riscv_vloxseg5ei64_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_f16mf2_m(...) __riscv_vloxseg6ei64_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_f16mf2_m(...) __riscv_vloxseg7ei64_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_f16mf2_m(...) __riscv_vloxseg8ei64_v_f16mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_f16m1_m(...) __riscv_vloxseg2ei64_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_f16m1_m(...) __riscv_vloxseg3ei64_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_f16m1_m(...) __riscv_vloxseg4ei64_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_f16m1_m(...) __riscv_vloxseg5ei64_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_f16m1_m(...) __riscv_vloxseg6ei64_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_f16m1_m(...) __riscv_vloxseg7ei64_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_f16m1_m(...) __riscv_vloxseg8ei64_v_f16m1_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_f16m2_m(...) __riscv_vloxseg2ei64_v_f16m2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_f16m2_m(...) __riscv_vloxseg3ei64_v_f16m2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_f16m2_m(...) __riscv_vloxseg4ei64_v_f16m2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_f32mf2_m(...) __riscv_vloxseg2ei8_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_f32mf2_m(...) __riscv_vloxseg3ei8_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_f32mf2_m(...) __riscv_vloxseg4ei8_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_f32mf2_m(...) __riscv_vloxseg5ei8_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_f32mf2_m(...) __riscv_vloxseg6ei8_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_f32mf2_m(...) __riscv_vloxseg7ei8_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_f32mf2_m(...) __riscv_vloxseg8ei8_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_f32m1_m(...) __riscv_vloxseg2ei8_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_f32m1_m(...) __riscv_vloxseg3ei8_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_f32m1_m(...) __riscv_vloxseg4ei8_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_f32m1_m(...) __riscv_vloxseg5ei8_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_f32m1_m(...) __riscv_vloxseg6ei8_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_f32m1_m(...) __riscv_vloxseg7ei8_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_f32m1_m(...) __riscv_vloxseg8ei8_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_f32m2_m(...) __riscv_vloxseg2ei8_v_f32m2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_f32m2_m(...) __riscv_vloxseg3ei8_v_f32m2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_f32m2_m(...) __riscv_vloxseg4ei8_v_f32m2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_f32m4_m(...) __riscv_vloxseg2ei8_v_f32m4_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_f32mf2_m(...) __riscv_vloxseg2ei16_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_f32mf2_m(...) __riscv_vloxseg3ei16_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_f32mf2_m(...) __riscv_vloxseg4ei16_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_f32mf2_m(...) __riscv_vloxseg5ei16_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_f32mf2_m(...) __riscv_vloxseg6ei16_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_f32mf2_m(...) __riscv_vloxseg7ei16_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_f32mf2_m(...) __riscv_vloxseg8ei16_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_f32m1_m(...) __riscv_vloxseg2ei16_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_f32m1_m(...) __riscv_vloxseg3ei16_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_f32m1_m(...) __riscv_vloxseg4ei16_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_f32m1_m(...) __riscv_vloxseg5ei16_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_f32m1_m(...) __riscv_vloxseg6ei16_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_f32m1_m(...) __riscv_vloxseg7ei16_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_f32m1_m(...) __riscv_vloxseg8ei16_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_f32m2_m(...) __riscv_vloxseg2ei16_v_f32m2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_f32m2_m(...) __riscv_vloxseg3ei16_v_f32m2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_f32m2_m(...) __riscv_vloxseg4ei16_v_f32m2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_f32m4_m(...) __riscv_vloxseg2ei16_v_f32m4_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_f32mf2_m(...) __riscv_vloxseg2ei32_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_f32mf2_m(...) __riscv_vloxseg3ei32_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_f32mf2_m(...) __riscv_vloxseg4ei32_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_f32mf2_m(...) __riscv_vloxseg5ei32_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_f32mf2_m(...) __riscv_vloxseg6ei32_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_f32mf2_m(...) __riscv_vloxseg7ei32_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_f32mf2_m(...) __riscv_vloxseg8ei32_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_f32m1_m(...) __riscv_vloxseg2ei32_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_f32m1_m(...) __riscv_vloxseg3ei32_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_f32m1_m(...) __riscv_vloxseg4ei32_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_f32m1_m(...) __riscv_vloxseg5ei32_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_f32m1_m(...) __riscv_vloxseg6ei32_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_f32m1_m(...) __riscv_vloxseg7ei32_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_f32m1_m(...) __riscv_vloxseg8ei32_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_f32m2_m(...) __riscv_vloxseg2ei32_v_f32m2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_f32m2_m(...) __riscv_vloxseg3ei32_v_f32m2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_f32m2_m(...) __riscv_vloxseg4ei32_v_f32m2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_f32m4_m(...) __riscv_vloxseg2ei32_v_f32m4_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_f32mf2_m(...) __riscv_vloxseg2ei64_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_f32mf2_m(...) __riscv_vloxseg3ei64_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_f32mf2_m(...) __riscv_vloxseg4ei64_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_f32mf2_m(...) __riscv_vloxseg5ei64_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_f32mf2_m(...) __riscv_vloxseg6ei64_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_f32mf2_m(...) __riscv_vloxseg7ei64_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_f32mf2_m(...) __riscv_vloxseg8ei64_v_f32mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_f32m1_m(...) __riscv_vloxseg2ei64_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_f32m1_m(...) __riscv_vloxseg3ei64_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_f32m1_m(...) __riscv_vloxseg4ei64_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_f32m1_m(...) __riscv_vloxseg5ei64_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_f32m1_m(...) __riscv_vloxseg6ei64_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_f32m1_m(...) __riscv_vloxseg7ei64_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_f32m1_m(...) __riscv_vloxseg8ei64_v_f32m1_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_f32m2_m(...) __riscv_vloxseg2ei64_v_f32m2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_f32m2_m(...) __riscv_vloxseg3ei64_v_f32m2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_f32m2_m(...) __riscv_vloxseg4ei64_v_f32m2_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_f32m4_m(...) __riscv_vloxseg2ei64_v_f32m4_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_f64m1_m(...) __riscv_vloxseg2ei8_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_f64m1_m(...) __riscv_vloxseg3ei8_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_f64m1_m(...) __riscv_vloxseg4ei8_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_f64m1_m(...) __riscv_vloxseg5ei8_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_f64m1_m(...) __riscv_vloxseg6ei8_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_f64m1_m(...) __riscv_vloxseg7ei8_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_f64m1_m(...) __riscv_vloxseg8ei8_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_f64m2_m(...) __riscv_vloxseg2ei8_v_f64m2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_f64m2_m(...) __riscv_vloxseg3ei8_v_f64m2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_f64m2_m(...) __riscv_vloxseg4ei8_v_f64m2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_f64m4_m(...) __riscv_vloxseg2ei8_v_f64m4_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_f64m1_m(...) __riscv_vloxseg2ei16_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_f64m1_m(...) __riscv_vloxseg3ei16_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_f64m1_m(...) __riscv_vloxseg4ei16_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_f64m1_m(...) __riscv_vloxseg5ei16_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_f64m1_m(...) __riscv_vloxseg6ei16_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_f64m1_m(...) __riscv_vloxseg7ei16_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_f64m1_m(...) __riscv_vloxseg8ei16_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_f64m2_m(...) __riscv_vloxseg2ei16_v_f64m2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_f64m2_m(...) __riscv_vloxseg3ei16_v_f64m2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_f64m2_m(...) __riscv_vloxseg4ei16_v_f64m2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_f64m4_m(...) __riscv_vloxseg2ei16_v_f64m4_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_f64m1_m(...) __riscv_vloxseg2ei32_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_f64m1_m(...) __riscv_vloxseg3ei32_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_f64m1_m(...) __riscv_vloxseg4ei32_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_f64m1_m(...) __riscv_vloxseg5ei32_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_f64m1_m(...) __riscv_vloxseg6ei32_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_f64m1_m(...) __riscv_vloxseg7ei32_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_f64m1_m(...) __riscv_vloxseg8ei32_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_f64m2_m(...) __riscv_vloxseg2ei32_v_f64m2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_f64m2_m(...) __riscv_vloxseg3ei32_v_f64m2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_f64m2_m(...) __riscv_vloxseg4ei32_v_f64m2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_f64m4_m(...) __riscv_vloxseg2ei32_v_f64m4_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_f64m1_m(...) __riscv_vloxseg2ei64_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_f64m1_m(...) __riscv_vloxseg3ei64_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_f64m1_m(...) __riscv_vloxseg4ei64_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_f64m1_m(...) __riscv_vloxseg5ei64_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_f64m1_m(...) __riscv_vloxseg6ei64_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_f64m1_m(...) __riscv_vloxseg7ei64_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_f64m1_m(...) __riscv_vloxseg8ei64_v_f64m1_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_f64m2_m(...) __riscv_vloxseg2ei64_v_f64m2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_f64m2_m(...) __riscv_vloxseg3ei64_v_f64m2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_f64m2_m(...) __riscv_vloxseg4ei64_v_f64m2_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_f64m4_m(...) __riscv_vloxseg2ei64_v_f64m4_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_f16mf4_m(...) __riscv_vluxseg2ei8_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_f16mf4_m(...) __riscv_vluxseg3ei8_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_f16mf4_m(...) __riscv_vluxseg4ei8_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_f16mf4_m(...) __riscv_vluxseg5ei8_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_f16mf4_m(...) __riscv_vluxseg6ei8_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_f16mf4_m(...) __riscv_vluxseg7ei8_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_f16mf4_m(...) __riscv_vluxseg8ei8_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_f16mf2_m(...) __riscv_vluxseg2ei8_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_f16mf2_m(...) __riscv_vluxseg3ei8_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_f16mf2_m(...) __riscv_vluxseg4ei8_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_f16mf2_m(...) __riscv_vluxseg5ei8_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_f16mf2_m(...) __riscv_vluxseg6ei8_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_f16mf2_m(...) __riscv_vluxseg7ei8_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_f16mf2_m(...) __riscv_vluxseg8ei8_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_f16m1_m(...) __riscv_vluxseg2ei8_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_f16m1_m(...) __riscv_vluxseg3ei8_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_f16m1_m(...) __riscv_vluxseg4ei8_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_f16m1_m(...) __riscv_vluxseg5ei8_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_f16m1_m(...) __riscv_vluxseg6ei8_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_f16m1_m(...) __riscv_vluxseg7ei8_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_f16m1_m(...) __riscv_vluxseg8ei8_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_f16m2_m(...) __riscv_vluxseg2ei8_v_f16m2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_f16m2_m(...) __riscv_vluxseg3ei8_v_f16m2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_f16m2_m(...) __riscv_vluxseg4ei8_v_f16m2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_f16m4_m(...) __riscv_vluxseg2ei8_v_f16m4_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_f16mf4_m(...) __riscv_vluxseg2ei16_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_f16mf4_m(...) __riscv_vluxseg3ei16_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_f16mf4_m(...) __riscv_vluxseg4ei16_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_f16mf4_m(...) __riscv_vluxseg5ei16_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_f16mf4_m(...) __riscv_vluxseg6ei16_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_f16mf4_m(...) __riscv_vluxseg7ei16_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_f16mf4_m(...) __riscv_vluxseg8ei16_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_f16mf2_m(...) __riscv_vluxseg2ei16_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_f16mf2_m(...) __riscv_vluxseg3ei16_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_f16mf2_m(...) __riscv_vluxseg4ei16_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_f16mf2_m(...) __riscv_vluxseg5ei16_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_f16mf2_m(...) __riscv_vluxseg6ei16_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_f16mf2_m(...) __riscv_vluxseg7ei16_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_f16mf2_m(...) __riscv_vluxseg8ei16_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_f16m1_m(...) __riscv_vluxseg2ei16_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_f16m1_m(...) __riscv_vluxseg3ei16_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_f16m1_m(...) __riscv_vluxseg4ei16_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_f16m1_m(...) __riscv_vluxseg5ei16_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_f16m1_m(...) __riscv_vluxseg6ei16_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_f16m1_m(...) __riscv_vluxseg7ei16_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_f16m1_m(...) __riscv_vluxseg8ei16_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_f16m2_m(...) __riscv_vluxseg2ei16_v_f16m2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_f16m2_m(...) __riscv_vluxseg3ei16_v_f16m2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_f16m2_m(...) __riscv_vluxseg4ei16_v_f16m2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_f16m4_m(...) __riscv_vluxseg2ei16_v_f16m4_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_f16mf4_m(...) __riscv_vluxseg2ei32_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_f16mf4_m(...) __riscv_vluxseg3ei32_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_f16mf4_m(...) __riscv_vluxseg4ei32_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_f16mf4_m(...) __riscv_vluxseg5ei32_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_f16mf4_m(...) __riscv_vluxseg6ei32_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_f16mf4_m(...) __riscv_vluxseg7ei32_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_f16mf4_m(...) __riscv_vluxseg8ei32_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_f16mf2_m(...) __riscv_vluxseg2ei32_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_f16mf2_m(...) __riscv_vluxseg3ei32_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_f16mf2_m(...) __riscv_vluxseg4ei32_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_f16mf2_m(...) __riscv_vluxseg5ei32_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_f16mf2_m(...) __riscv_vluxseg6ei32_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_f16mf2_m(...) __riscv_vluxseg7ei32_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_f16mf2_m(...) __riscv_vluxseg8ei32_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_f16m1_m(...) __riscv_vluxseg2ei32_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_f16m1_m(...) __riscv_vluxseg3ei32_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_f16m1_m(...) __riscv_vluxseg4ei32_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_f16m1_m(...) __riscv_vluxseg5ei32_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_f16m1_m(...) __riscv_vluxseg6ei32_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_f16m1_m(...) __riscv_vluxseg7ei32_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_f16m1_m(...) __riscv_vluxseg8ei32_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_f16m2_m(...) __riscv_vluxseg2ei32_v_f16m2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_f16m2_m(...) __riscv_vluxseg3ei32_v_f16m2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_f16m2_m(...) __riscv_vluxseg4ei32_v_f16m2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_f16m4_m(...) __riscv_vluxseg2ei32_v_f16m4_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_f16mf4_m(...) __riscv_vluxseg2ei64_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_f16mf4_m(...) __riscv_vluxseg3ei64_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_f16mf4_m(...) __riscv_vluxseg4ei64_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_f16mf4_m(...) __riscv_vluxseg5ei64_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_f16mf4_m(...) __riscv_vluxseg6ei64_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_f16mf4_m(...) __riscv_vluxseg7ei64_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_f16mf4_m(...) __riscv_vluxseg8ei64_v_f16mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_f16mf2_m(...) __riscv_vluxseg2ei64_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_f16mf2_m(...) __riscv_vluxseg3ei64_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_f16mf2_m(...) __riscv_vluxseg4ei64_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_f16mf2_m(...) __riscv_vluxseg5ei64_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_f16mf2_m(...) __riscv_vluxseg6ei64_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_f16mf2_m(...) __riscv_vluxseg7ei64_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_f16mf2_m(...) __riscv_vluxseg8ei64_v_f16mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_f16m1_m(...) __riscv_vluxseg2ei64_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_f16m1_m(...) __riscv_vluxseg3ei64_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_f16m1_m(...) __riscv_vluxseg4ei64_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_f16m1_m(...) __riscv_vluxseg5ei64_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_f16m1_m(...) __riscv_vluxseg6ei64_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_f16m1_m(...) __riscv_vluxseg7ei64_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_f16m1_m(...) __riscv_vluxseg8ei64_v_f16m1_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_f16m2_m(...) __riscv_vluxseg2ei64_v_f16m2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_f16m2_m(...) __riscv_vluxseg3ei64_v_f16m2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_f16m2_m(...) __riscv_vluxseg4ei64_v_f16m2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_f32mf2_m(...) __riscv_vluxseg2ei8_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_f32mf2_m(...) __riscv_vluxseg3ei8_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_f32mf2_m(...) __riscv_vluxseg4ei8_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_f32mf2_m(...) __riscv_vluxseg5ei8_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_f32mf2_m(...) __riscv_vluxseg6ei8_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_f32mf2_m(...) __riscv_vluxseg7ei8_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_f32mf2_m(...) __riscv_vluxseg8ei8_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_f32m1_m(...) __riscv_vluxseg2ei8_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_f32m1_m(...) __riscv_vluxseg3ei8_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_f32m1_m(...) __riscv_vluxseg4ei8_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_f32m1_m(...) __riscv_vluxseg5ei8_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_f32m1_m(...) __riscv_vluxseg6ei8_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_f32m1_m(...) __riscv_vluxseg7ei8_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_f32m1_m(...) __riscv_vluxseg8ei8_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_f32m2_m(...) __riscv_vluxseg2ei8_v_f32m2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_f32m2_m(...) __riscv_vluxseg3ei8_v_f32m2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_f32m2_m(...) __riscv_vluxseg4ei8_v_f32m2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_f32m4_m(...) __riscv_vluxseg2ei8_v_f32m4_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_f32mf2_m(...) __riscv_vluxseg2ei16_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_f32mf2_m(...) __riscv_vluxseg3ei16_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_f32mf2_m(...) __riscv_vluxseg4ei16_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_f32mf2_m(...) __riscv_vluxseg5ei16_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_f32mf2_m(...) __riscv_vluxseg6ei16_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_f32mf2_m(...) __riscv_vluxseg7ei16_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_f32mf2_m(...) __riscv_vluxseg8ei16_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_f32m1_m(...) __riscv_vluxseg2ei16_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_f32m1_m(...) __riscv_vluxseg3ei16_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_f32m1_m(...) __riscv_vluxseg4ei16_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_f32m1_m(...) __riscv_vluxseg5ei16_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_f32m1_m(...) __riscv_vluxseg6ei16_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_f32m1_m(...) __riscv_vluxseg7ei16_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_f32m1_m(...) __riscv_vluxseg8ei16_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_f32m2_m(...) __riscv_vluxseg2ei16_v_f32m2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_f32m2_m(...) __riscv_vluxseg3ei16_v_f32m2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_f32m2_m(...) __riscv_vluxseg4ei16_v_f32m2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_f32m4_m(...) __riscv_vluxseg2ei16_v_f32m4_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_f32mf2_m(...) __riscv_vluxseg2ei32_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_f32mf2_m(...) __riscv_vluxseg3ei32_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_f32mf2_m(...) __riscv_vluxseg4ei32_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_f32mf2_m(...) __riscv_vluxseg5ei32_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_f32mf2_m(...) __riscv_vluxseg6ei32_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_f32mf2_m(...) __riscv_vluxseg7ei32_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_f32mf2_m(...) __riscv_vluxseg8ei32_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_f32m1_m(...) __riscv_vluxseg2ei32_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_f32m1_m(...) __riscv_vluxseg3ei32_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_f32m1_m(...) __riscv_vluxseg4ei32_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_f32m1_m(...) __riscv_vluxseg5ei32_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_f32m1_m(...) __riscv_vluxseg6ei32_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_f32m1_m(...) __riscv_vluxseg7ei32_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_f32m1_m(...) __riscv_vluxseg8ei32_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_f32m2_m(...) __riscv_vluxseg2ei32_v_f32m2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_f32m2_m(...) __riscv_vluxseg3ei32_v_f32m2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_f32m2_m(...) __riscv_vluxseg4ei32_v_f32m2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_f32m4_m(...) __riscv_vluxseg2ei32_v_f32m4_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_f32mf2_m(...) __riscv_vluxseg2ei64_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_f32mf2_m(...) __riscv_vluxseg3ei64_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_f32mf2_m(...) __riscv_vluxseg4ei64_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_f32mf2_m(...) __riscv_vluxseg5ei64_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_f32mf2_m(...) __riscv_vluxseg6ei64_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_f32mf2_m(...) __riscv_vluxseg7ei64_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_f32mf2_m(...) __riscv_vluxseg8ei64_v_f32mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_f32m1_m(...) __riscv_vluxseg2ei64_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_f32m1_m(...) __riscv_vluxseg3ei64_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_f32m1_m(...) __riscv_vluxseg4ei64_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_f32m1_m(...) __riscv_vluxseg5ei64_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_f32m1_m(...) __riscv_vluxseg6ei64_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_f32m1_m(...) __riscv_vluxseg7ei64_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_f32m1_m(...) __riscv_vluxseg8ei64_v_f32m1_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_f32m2_m(...) __riscv_vluxseg2ei64_v_f32m2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_f32m2_m(...) __riscv_vluxseg3ei64_v_f32m2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_f32m2_m(...) __riscv_vluxseg4ei64_v_f32m2_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_f32m4_m(...) __riscv_vluxseg2ei64_v_f32m4_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_f64m1_m(...) __riscv_vluxseg2ei8_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_f64m1_m(...) __riscv_vluxseg3ei8_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_f64m1_m(...) __riscv_vluxseg4ei8_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_f64m1_m(...) __riscv_vluxseg5ei8_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_f64m1_m(...) __riscv_vluxseg6ei8_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_f64m1_m(...) __riscv_vluxseg7ei8_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_f64m1_m(...) __riscv_vluxseg8ei8_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_f64m2_m(...) __riscv_vluxseg2ei8_v_f64m2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_f64m2_m(...) __riscv_vluxseg3ei8_v_f64m2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_f64m2_m(...) __riscv_vluxseg4ei8_v_f64m2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_f64m4_m(...) __riscv_vluxseg2ei8_v_f64m4_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_f64m1_m(...) __riscv_vluxseg2ei16_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_f64m1_m(...) __riscv_vluxseg3ei16_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_f64m1_m(...) __riscv_vluxseg4ei16_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_f64m1_m(...) __riscv_vluxseg5ei16_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_f64m1_m(...) __riscv_vluxseg6ei16_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_f64m1_m(...) __riscv_vluxseg7ei16_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_f64m1_m(...) __riscv_vluxseg8ei16_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_f64m2_m(...) __riscv_vluxseg2ei16_v_f64m2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_f64m2_m(...) __riscv_vluxseg3ei16_v_f64m2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_f64m2_m(...) __riscv_vluxseg4ei16_v_f64m2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_f64m4_m(...) __riscv_vluxseg2ei16_v_f64m4_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_f64m1_m(...) __riscv_vluxseg2ei32_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_f64m1_m(...) __riscv_vluxseg3ei32_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_f64m1_m(...) __riscv_vluxseg4ei32_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_f64m1_m(...) __riscv_vluxseg5ei32_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_f64m1_m(...) __riscv_vluxseg6ei32_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_f64m1_m(...) __riscv_vluxseg7ei32_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_f64m1_m(...) __riscv_vluxseg8ei32_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_f64m2_m(...) __riscv_vluxseg2ei32_v_f64m2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_f64m2_m(...) __riscv_vluxseg3ei32_v_f64m2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_f64m2_m(...) __riscv_vluxseg4ei32_v_f64m2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_f64m4_m(...) __riscv_vluxseg2ei32_v_f64m4_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_f64m1_m(...) __riscv_vluxseg2ei64_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_f64m1_m(...) __riscv_vluxseg3ei64_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_f64m1_m(...) __riscv_vluxseg4ei64_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_f64m1_m(...) __riscv_vluxseg5ei64_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_f64m1_m(...) __riscv_vluxseg6ei64_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_f64m1_m(...) __riscv_vluxseg7ei64_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_f64m1_m(...) __riscv_vluxseg8ei64_v_f64m1_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_f64m2_m(...) __riscv_vluxseg2ei64_v_f64m2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_f64m2_m(...) __riscv_vluxseg3ei64_v_f64m2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_f64m2_m(...) __riscv_vluxseg4ei64_v_f64m2_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_f64m4_m(...) __riscv_vluxseg2ei64_v_f64m4_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i8mf8_m(...) __riscv_vloxseg2ei8_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_i8mf8_m(...) __riscv_vloxseg3ei8_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_i8mf8_m(...) __riscv_vloxseg4ei8_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_i8mf8_m(...) __riscv_vloxseg5ei8_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_i8mf8_m(...) __riscv_vloxseg6ei8_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_i8mf8_m(...) __riscv_vloxseg7ei8_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_i8mf8_m(...) __riscv_vloxseg8ei8_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i8mf4_m(...) __riscv_vloxseg2ei8_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_i8mf4_m(...) __riscv_vloxseg3ei8_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_i8mf4_m(...) __riscv_vloxseg4ei8_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_i8mf4_m(...) __riscv_vloxseg5ei8_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_i8mf4_m(...) __riscv_vloxseg6ei8_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_i8mf4_m(...) __riscv_vloxseg7ei8_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_i8mf4_m(...) __riscv_vloxseg8ei8_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i8mf2_m(...) __riscv_vloxseg2ei8_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_i8mf2_m(...) __riscv_vloxseg3ei8_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_i8mf2_m(...) __riscv_vloxseg4ei8_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_i8mf2_m(...) __riscv_vloxseg5ei8_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_i8mf2_m(...) __riscv_vloxseg6ei8_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_i8mf2_m(...) __riscv_vloxseg7ei8_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_i8mf2_m(...) __riscv_vloxseg8ei8_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i8m1_m(...) __riscv_vloxseg2ei8_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_i8m1_m(...) __riscv_vloxseg3ei8_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_i8m1_m(...) __riscv_vloxseg4ei8_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_i8m1_m(...) __riscv_vloxseg5ei8_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_i8m1_m(...) __riscv_vloxseg6ei8_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_i8m1_m(...) __riscv_vloxseg7ei8_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_i8m1_m(...) __riscv_vloxseg8ei8_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i8m2_m(...) __riscv_vloxseg2ei8_v_i8m2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_i8m2_m(...) __riscv_vloxseg3ei8_v_i8m2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_i8m2_m(...) __riscv_vloxseg4ei8_v_i8m2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i8m4_m(...) __riscv_vloxseg2ei8_v_i8m4_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i8mf8_m(...) __riscv_vloxseg2ei16_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_i8mf8_m(...) __riscv_vloxseg3ei16_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_i8mf8_m(...) __riscv_vloxseg4ei16_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_i8mf8_m(...) __riscv_vloxseg5ei16_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_i8mf8_m(...) __riscv_vloxseg6ei16_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_i8mf8_m(...) __riscv_vloxseg7ei16_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_i8mf8_m(...) __riscv_vloxseg8ei16_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i8mf4_m(...) __riscv_vloxseg2ei16_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_i8mf4_m(...) __riscv_vloxseg3ei16_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_i8mf4_m(...) __riscv_vloxseg4ei16_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_i8mf4_m(...) __riscv_vloxseg5ei16_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_i8mf4_m(...) __riscv_vloxseg6ei16_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_i8mf4_m(...) __riscv_vloxseg7ei16_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_i8mf4_m(...) __riscv_vloxseg8ei16_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i8mf2_m(...) __riscv_vloxseg2ei16_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_i8mf2_m(...) __riscv_vloxseg3ei16_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_i8mf2_m(...) __riscv_vloxseg4ei16_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_i8mf2_m(...) __riscv_vloxseg5ei16_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_i8mf2_m(...) __riscv_vloxseg6ei16_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_i8mf2_m(...) __riscv_vloxseg7ei16_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_i8mf2_m(...) __riscv_vloxseg8ei16_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i8m1_m(...) __riscv_vloxseg2ei16_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_i8m1_m(...) __riscv_vloxseg3ei16_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_i8m1_m(...) __riscv_vloxseg4ei16_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_i8m1_m(...) __riscv_vloxseg5ei16_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_i8m1_m(...) __riscv_vloxseg6ei16_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_i8m1_m(...) __riscv_vloxseg7ei16_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_i8m1_m(...) __riscv_vloxseg8ei16_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i8m2_m(...) __riscv_vloxseg2ei16_v_i8m2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_i8m2_m(...) __riscv_vloxseg3ei16_v_i8m2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_i8m2_m(...) __riscv_vloxseg4ei16_v_i8m2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i8m4_m(...) __riscv_vloxseg2ei16_v_i8m4_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i8mf8_m(...) __riscv_vloxseg2ei32_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_i8mf8_m(...) __riscv_vloxseg3ei32_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_i8mf8_m(...) __riscv_vloxseg4ei32_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_i8mf8_m(...) __riscv_vloxseg5ei32_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_i8mf8_m(...) __riscv_vloxseg6ei32_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_i8mf8_m(...) __riscv_vloxseg7ei32_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_i8mf8_m(...) __riscv_vloxseg8ei32_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i8mf4_m(...) __riscv_vloxseg2ei32_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_i8mf4_m(...) __riscv_vloxseg3ei32_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_i8mf4_m(...) __riscv_vloxseg4ei32_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_i8mf4_m(...) __riscv_vloxseg5ei32_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_i8mf4_m(...) __riscv_vloxseg6ei32_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_i8mf4_m(...) __riscv_vloxseg7ei32_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_i8mf4_m(...) __riscv_vloxseg8ei32_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i8mf2_m(...) __riscv_vloxseg2ei32_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_i8mf2_m(...) __riscv_vloxseg3ei32_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_i8mf2_m(...) __riscv_vloxseg4ei32_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_i8mf2_m(...) __riscv_vloxseg5ei32_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_i8mf2_m(...) __riscv_vloxseg6ei32_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_i8mf2_m(...) __riscv_vloxseg7ei32_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_i8mf2_m(...) __riscv_vloxseg8ei32_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i8m1_m(...) __riscv_vloxseg2ei32_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_i8m1_m(...) __riscv_vloxseg3ei32_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_i8m1_m(...) __riscv_vloxseg4ei32_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_i8m1_m(...) __riscv_vloxseg5ei32_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_i8m1_m(...) __riscv_vloxseg6ei32_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_i8m1_m(...) __riscv_vloxseg7ei32_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_i8m1_m(...) __riscv_vloxseg8ei32_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i8m2_m(...) __riscv_vloxseg2ei32_v_i8m2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_i8m2_m(...) __riscv_vloxseg3ei32_v_i8m2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_i8m2_m(...) __riscv_vloxseg4ei32_v_i8m2_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_i8mf8_m(...) __riscv_vloxseg2ei64_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_i8mf8_m(...) __riscv_vloxseg3ei64_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_i8mf8_m(...) __riscv_vloxseg4ei64_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_i8mf8_m(...) __riscv_vloxseg5ei64_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_i8mf8_m(...) __riscv_vloxseg6ei64_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_i8mf8_m(...) __riscv_vloxseg7ei64_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_i8mf8_m(...) __riscv_vloxseg8ei64_v_i8mf8_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_i8mf4_m(...) __riscv_vloxseg2ei64_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_i8mf4_m(...) __riscv_vloxseg3ei64_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_i8mf4_m(...) __riscv_vloxseg4ei64_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_i8mf4_m(...) __riscv_vloxseg5ei64_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_i8mf4_m(...) __riscv_vloxseg6ei64_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_i8mf4_m(...) __riscv_vloxseg7ei64_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_i8mf4_m(...) __riscv_vloxseg8ei64_v_i8mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_i8mf2_m(...) __riscv_vloxseg2ei64_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_i8mf2_m(...) __riscv_vloxseg3ei64_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_i8mf2_m(...) __riscv_vloxseg4ei64_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_i8mf2_m(...) __riscv_vloxseg5ei64_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_i8mf2_m(...) __riscv_vloxseg6ei64_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_i8mf2_m(...) __riscv_vloxseg7ei64_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_i8mf2_m(...) __riscv_vloxseg8ei64_v_i8mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_i8m1_m(...) __riscv_vloxseg2ei64_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_i8m1_m(...) __riscv_vloxseg3ei64_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_i8m1_m(...) __riscv_vloxseg4ei64_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_i8m1_m(...) __riscv_vloxseg5ei64_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_i8m1_m(...) __riscv_vloxseg6ei64_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_i8m1_m(...) __riscv_vloxseg7ei64_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_i8m1_m(...) __riscv_vloxseg8ei64_v_i8m1_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i16mf4_m(...) __riscv_vloxseg2ei8_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_i16mf4_m(...) __riscv_vloxseg3ei8_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_i16mf4_m(...) __riscv_vloxseg4ei8_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_i16mf4_m(...) __riscv_vloxseg5ei8_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_i16mf4_m(...) __riscv_vloxseg6ei8_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_i16mf4_m(...) __riscv_vloxseg7ei8_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_i16mf4_m(...) __riscv_vloxseg8ei8_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i16mf2_m(...) __riscv_vloxseg2ei8_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_i16mf2_m(...) __riscv_vloxseg3ei8_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_i16mf2_m(...) __riscv_vloxseg4ei8_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_i16mf2_m(...) __riscv_vloxseg5ei8_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_i16mf2_m(...) __riscv_vloxseg6ei8_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_i16mf2_m(...) __riscv_vloxseg7ei8_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_i16mf2_m(...) __riscv_vloxseg8ei8_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i16m1_m(...) __riscv_vloxseg2ei8_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_i16m1_m(...) __riscv_vloxseg3ei8_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_i16m1_m(...) __riscv_vloxseg4ei8_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_i16m1_m(...) __riscv_vloxseg5ei8_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_i16m1_m(...) __riscv_vloxseg6ei8_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_i16m1_m(...) __riscv_vloxseg7ei8_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_i16m1_m(...) __riscv_vloxseg8ei8_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i16m2_m(...) __riscv_vloxseg2ei8_v_i16m2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_i16m2_m(...) __riscv_vloxseg3ei8_v_i16m2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_i16m2_m(...) __riscv_vloxseg4ei8_v_i16m2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i16m4_m(...) __riscv_vloxseg2ei8_v_i16m4_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i16mf4_m(...) __riscv_vloxseg2ei16_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_i16mf4_m(...) __riscv_vloxseg3ei16_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_i16mf4_m(...) __riscv_vloxseg4ei16_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_i16mf4_m(...) __riscv_vloxseg5ei16_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_i16mf4_m(...) __riscv_vloxseg6ei16_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_i16mf4_m(...) __riscv_vloxseg7ei16_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_i16mf4_m(...) __riscv_vloxseg8ei16_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i16mf2_m(...) __riscv_vloxseg2ei16_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_i16mf2_m(...) __riscv_vloxseg3ei16_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_i16mf2_m(...) __riscv_vloxseg4ei16_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_i16mf2_m(...) __riscv_vloxseg5ei16_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_i16mf2_m(...) __riscv_vloxseg6ei16_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_i16mf2_m(...) __riscv_vloxseg7ei16_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_i16mf2_m(...) __riscv_vloxseg8ei16_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i16m1_m(...) __riscv_vloxseg2ei16_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_i16m1_m(...) __riscv_vloxseg3ei16_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_i16m1_m(...) __riscv_vloxseg4ei16_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_i16m1_m(...) __riscv_vloxseg5ei16_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_i16m1_m(...) __riscv_vloxseg6ei16_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_i16m1_m(...) __riscv_vloxseg7ei16_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_i16m1_m(...) __riscv_vloxseg8ei16_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i16m2_m(...) __riscv_vloxseg2ei16_v_i16m2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_i16m2_m(...) __riscv_vloxseg3ei16_v_i16m2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_i16m2_m(...) __riscv_vloxseg4ei16_v_i16m2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i16m4_m(...) __riscv_vloxseg2ei16_v_i16m4_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i16mf4_m(...) __riscv_vloxseg2ei32_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_i16mf4_m(...) __riscv_vloxseg3ei32_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_i16mf4_m(...) __riscv_vloxseg4ei32_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_i16mf4_m(...) __riscv_vloxseg5ei32_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_i16mf4_m(...) __riscv_vloxseg6ei32_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_i16mf4_m(...) __riscv_vloxseg7ei32_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_i16mf4_m(...) __riscv_vloxseg8ei32_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i16mf2_m(...) __riscv_vloxseg2ei32_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_i16mf2_m(...) __riscv_vloxseg3ei32_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_i16mf2_m(...) __riscv_vloxseg4ei32_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_i16mf2_m(...) __riscv_vloxseg5ei32_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_i16mf2_m(...) __riscv_vloxseg6ei32_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_i16mf2_m(...) __riscv_vloxseg7ei32_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_i16mf2_m(...) __riscv_vloxseg8ei32_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i16m1_m(...) __riscv_vloxseg2ei32_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_i16m1_m(...) __riscv_vloxseg3ei32_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_i16m1_m(...) __riscv_vloxseg4ei32_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_i16m1_m(...) __riscv_vloxseg5ei32_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_i16m1_m(...) __riscv_vloxseg6ei32_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_i16m1_m(...) __riscv_vloxseg7ei32_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_i16m1_m(...) __riscv_vloxseg8ei32_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i16m2_m(...) __riscv_vloxseg2ei32_v_i16m2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_i16m2_m(...) __riscv_vloxseg3ei32_v_i16m2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_i16m2_m(...) __riscv_vloxseg4ei32_v_i16m2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i16m4_m(...) __riscv_vloxseg2ei32_v_i16m4_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_i16mf4_m(...) __riscv_vloxseg2ei64_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_i16mf4_m(...) __riscv_vloxseg3ei64_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_i16mf4_m(...) __riscv_vloxseg4ei64_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_i16mf4_m(...) __riscv_vloxseg5ei64_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_i16mf4_m(...) __riscv_vloxseg6ei64_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_i16mf4_m(...) __riscv_vloxseg7ei64_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_i16mf4_m(...) __riscv_vloxseg8ei64_v_i16mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_i16mf2_m(...) __riscv_vloxseg2ei64_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_i16mf2_m(...) __riscv_vloxseg3ei64_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_i16mf2_m(...) __riscv_vloxseg4ei64_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_i16mf2_m(...) __riscv_vloxseg5ei64_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_i16mf2_m(...) __riscv_vloxseg6ei64_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_i16mf2_m(...) __riscv_vloxseg7ei64_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_i16mf2_m(...) __riscv_vloxseg8ei64_v_i16mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_i16m1_m(...) __riscv_vloxseg2ei64_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_i16m1_m(...) __riscv_vloxseg3ei64_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_i16m1_m(...) __riscv_vloxseg4ei64_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_i16m1_m(...) __riscv_vloxseg5ei64_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_i16m1_m(...) __riscv_vloxseg6ei64_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_i16m1_m(...) __riscv_vloxseg7ei64_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_i16m1_m(...) __riscv_vloxseg8ei64_v_i16m1_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_i16m2_m(...) __riscv_vloxseg2ei64_v_i16m2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_i16m2_m(...) __riscv_vloxseg3ei64_v_i16m2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_i16m2_m(...) __riscv_vloxseg4ei64_v_i16m2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i32mf2_m(...) __riscv_vloxseg2ei8_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_i32mf2_m(...) __riscv_vloxseg3ei8_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_i32mf2_m(...) __riscv_vloxseg4ei8_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_i32mf2_m(...) __riscv_vloxseg5ei8_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_i32mf2_m(...) __riscv_vloxseg6ei8_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_i32mf2_m(...) __riscv_vloxseg7ei8_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_i32mf2_m(...) __riscv_vloxseg8ei8_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i32m1_m(...) __riscv_vloxseg2ei8_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_i32m1_m(...) __riscv_vloxseg3ei8_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_i32m1_m(...) __riscv_vloxseg4ei8_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_i32m1_m(...) __riscv_vloxseg5ei8_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_i32m1_m(...) __riscv_vloxseg6ei8_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_i32m1_m(...) __riscv_vloxseg7ei8_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_i32m1_m(...) __riscv_vloxseg8ei8_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i32m2_m(...) __riscv_vloxseg2ei8_v_i32m2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_i32m2_m(...) __riscv_vloxseg3ei8_v_i32m2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_i32m2_m(...) __riscv_vloxseg4ei8_v_i32m2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i32m4_m(...) __riscv_vloxseg2ei8_v_i32m4_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i32mf2_m(...) __riscv_vloxseg2ei16_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_i32mf2_m(...) __riscv_vloxseg3ei16_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_i32mf2_m(...) __riscv_vloxseg4ei16_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_i32mf2_m(...) __riscv_vloxseg5ei16_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_i32mf2_m(...) __riscv_vloxseg6ei16_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_i32mf2_m(...) __riscv_vloxseg7ei16_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_i32mf2_m(...) __riscv_vloxseg8ei16_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i32m1_m(...) __riscv_vloxseg2ei16_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_i32m1_m(...) __riscv_vloxseg3ei16_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_i32m1_m(...) __riscv_vloxseg4ei16_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_i32m1_m(...) __riscv_vloxseg5ei16_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_i32m1_m(...) __riscv_vloxseg6ei16_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_i32m1_m(...) __riscv_vloxseg7ei16_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_i32m1_m(...) __riscv_vloxseg8ei16_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i32m2_m(...) __riscv_vloxseg2ei16_v_i32m2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_i32m2_m(...) __riscv_vloxseg3ei16_v_i32m2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_i32m2_m(...) __riscv_vloxseg4ei16_v_i32m2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i32m4_m(...) __riscv_vloxseg2ei16_v_i32m4_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i32mf2_m(...) __riscv_vloxseg2ei32_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_i32mf2_m(...) __riscv_vloxseg3ei32_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_i32mf2_m(...) __riscv_vloxseg4ei32_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_i32mf2_m(...) __riscv_vloxseg5ei32_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_i32mf2_m(...) __riscv_vloxseg6ei32_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_i32mf2_m(...) __riscv_vloxseg7ei32_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_i32mf2_m(...) __riscv_vloxseg8ei32_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i32m1_m(...) __riscv_vloxseg2ei32_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_i32m1_m(...) __riscv_vloxseg3ei32_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_i32m1_m(...) __riscv_vloxseg4ei32_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_i32m1_m(...) __riscv_vloxseg5ei32_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_i32m1_m(...) __riscv_vloxseg6ei32_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_i32m1_m(...) __riscv_vloxseg7ei32_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_i32m1_m(...) __riscv_vloxseg8ei32_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i32m2_m(...) __riscv_vloxseg2ei32_v_i32m2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_i32m2_m(...) __riscv_vloxseg3ei32_v_i32m2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_i32m2_m(...) __riscv_vloxseg4ei32_v_i32m2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i32m4_m(...) __riscv_vloxseg2ei32_v_i32m4_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_i32mf2_m(...) __riscv_vloxseg2ei64_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_i32mf2_m(...) __riscv_vloxseg3ei64_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_i32mf2_m(...) __riscv_vloxseg4ei64_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_i32mf2_m(...) __riscv_vloxseg5ei64_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_i32mf2_m(...) __riscv_vloxseg6ei64_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_i32mf2_m(...) __riscv_vloxseg7ei64_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_i32mf2_m(...) __riscv_vloxseg8ei64_v_i32mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_i32m1_m(...) __riscv_vloxseg2ei64_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_i32m1_m(...) __riscv_vloxseg3ei64_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_i32m1_m(...) __riscv_vloxseg4ei64_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_i32m1_m(...) __riscv_vloxseg5ei64_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_i32m1_m(...) __riscv_vloxseg6ei64_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_i32m1_m(...) __riscv_vloxseg7ei64_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_i32m1_m(...) __riscv_vloxseg8ei64_v_i32m1_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_i32m2_m(...) __riscv_vloxseg2ei64_v_i32m2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_i32m2_m(...) __riscv_vloxseg3ei64_v_i32m2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_i32m2_m(...) __riscv_vloxseg4ei64_v_i32m2_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_i32m4_m(...) __riscv_vloxseg2ei64_v_i32m4_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i64m1_m(...) __riscv_vloxseg2ei8_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_i64m1_m(...) __riscv_vloxseg3ei8_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_i64m1_m(...) __riscv_vloxseg4ei8_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_i64m1_m(...) __riscv_vloxseg5ei8_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_i64m1_m(...) __riscv_vloxseg6ei8_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_i64m1_m(...) __riscv_vloxseg7ei8_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_i64m1_m(...) __riscv_vloxseg8ei8_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i64m2_m(...) __riscv_vloxseg2ei8_v_i64m2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_i64m2_m(...) __riscv_vloxseg3ei8_v_i64m2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_i64m2_m(...) __riscv_vloxseg4ei8_v_i64m2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_i64m4_m(...) __riscv_vloxseg2ei8_v_i64m4_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i64m1_m(...) __riscv_vloxseg2ei16_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_i64m1_m(...) __riscv_vloxseg3ei16_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_i64m1_m(...) __riscv_vloxseg4ei16_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_i64m1_m(...) __riscv_vloxseg5ei16_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_i64m1_m(...) __riscv_vloxseg6ei16_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_i64m1_m(...) __riscv_vloxseg7ei16_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_i64m1_m(...) __riscv_vloxseg8ei16_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i64m2_m(...) __riscv_vloxseg2ei16_v_i64m2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_i64m2_m(...) __riscv_vloxseg3ei16_v_i64m2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_i64m2_m(...) __riscv_vloxseg4ei16_v_i64m2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_i64m4_m(...) __riscv_vloxseg2ei16_v_i64m4_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i64m1_m(...) __riscv_vloxseg2ei32_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_i64m1_m(...) __riscv_vloxseg3ei32_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_i64m1_m(...) __riscv_vloxseg4ei32_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_i64m1_m(...) __riscv_vloxseg5ei32_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_i64m1_m(...) __riscv_vloxseg6ei32_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_i64m1_m(...) __riscv_vloxseg7ei32_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_i64m1_m(...) __riscv_vloxseg8ei32_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i64m2_m(...) __riscv_vloxseg2ei32_v_i64m2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_i64m2_m(...) __riscv_vloxseg3ei32_v_i64m2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_i64m2_m(...) __riscv_vloxseg4ei32_v_i64m2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_i64m4_m(...) __riscv_vloxseg2ei32_v_i64m4_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_i64m1_m(...) __riscv_vloxseg2ei64_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_i64m1_m(...) __riscv_vloxseg3ei64_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_i64m1_m(...) __riscv_vloxseg4ei64_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_i64m1_m(...) __riscv_vloxseg5ei64_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_i64m1_m(...) __riscv_vloxseg6ei64_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_i64m1_m(...) __riscv_vloxseg7ei64_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_i64m1_m(...) __riscv_vloxseg8ei64_v_i64m1_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_i64m2_m(...) __riscv_vloxseg2ei64_v_i64m2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_i64m2_m(...) __riscv_vloxseg3ei64_v_i64m2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_i64m2_m(...) __riscv_vloxseg4ei64_v_i64m2_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_i64m4_m(...) __riscv_vloxseg2ei64_v_i64m4_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i8mf8_m(...) __riscv_vluxseg2ei8_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_i8mf8_m(...) __riscv_vluxseg3ei8_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_i8mf8_m(...) __riscv_vluxseg4ei8_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_i8mf8_m(...) __riscv_vluxseg5ei8_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_i8mf8_m(...) __riscv_vluxseg6ei8_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_i8mf8_m(...) __riscv_vluxseg7ei8_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_i8mf8_m(...) __riscv_vluxseg8ei8_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i8mf4_m(...) __riscv_vluxseg2ei8_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_i8mf4_m(...) __riscv_vluxseg3ei8_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_i8mf4_m(...) __riscv_vluxseg4ei8_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_i8mf4_m(...) __riscv_vluxseg5ei8_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_i8mf4_m(...) __riscv_vluxseg6ei8_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_i8mf4_m(...) __riscv_vluxseg7ei8_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_i8mf4_m(...) __riscv_vluxseg8ei8_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i8mf2_m(...) __riscv_vluxseg2ei8_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_i8mf2_m(...) __riscv_vluxseg3ei8_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_i8mf2_m(...) __riscv_vluxseg4ei8_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_i8mf2_m(...) __riscv_vluxseg5ei8_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_i8mf2_m(...) __riscv_vluxseg6ei8_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_i8mf2_m(...) __riscv_vluxseg7ei8_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_i8mf2_m(...) __riscv_vluxseg8ei8_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i8m1_m(...) __riscv_vluxseg2ei8_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_i8m1_m(...) __riscv_vluxseg3ei8_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_i8m1_m(...) __riscv_vluxseg4ei8_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_i8m1_m(...) __riscv_vluxseg5ei8_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_i8m1_m(...) __riscv_vluxseg6ei8_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_i8m1_m(...) __riscv_vluxseg7ei8_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_i8m1_m(...) __riscv_vluxseg8ei8_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i8m2_m(...) __riscv_vluxseg2ei8_v_i8m2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_i8m2_m(...) __riscv_vluxseg3ei8_v_i8m2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_i8m2_m(...) __riscv_vluxseg4ei8_v_i8m2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i8m4_m(...) __riscv_vluxseg2ei8_v_i8m4_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i8mf8_m(...) __riscv_vluxseg2ei16_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_i8mf8_m(...) __riscv_vluxseg3ei16_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_i8mf8_m(...) __riscv_vluxseg4ei16_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_i8mf8_m(...) __riscv_vluxseg5ei16_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_i8mf8_m(...) __riscv_vluxseg6ei16_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_i8mf8_m(...) __riscv_vluxseg7ei16_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_i8mf8_m(...) __riscv_vluxseg8ei16_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i8mf4_m(...) __riscv_vluxseg2ei16_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_i8mf4_m(...) __riscv_vluxseg3ei16_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_i8mf4_m(...) __riscv_vluxseg4ei16_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_i8mf4_m(...) __riscv_vluxseg5ei16_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_i8mf4_m(...) __riscv_vluxseg6ei16_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_i8mf4_m(...) __riscv_vluxseg7ei16_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_i8mf4_m(...) __riscv_vluxseg8ei16_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i8mf2_m(...) __riscv_vluxseg2ei16_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_i8mf2_m(...) __riscv_vluxseg3ei16_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_i8mf2_m(...) __riscv_vluxseg4ei16_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_i8mf2_m(...) __riscv_vluxseg5ei16_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_i8mf2_m(...) __riscv_vluxseg6ei16_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_i8mf2_m(...) __riscv_vluxseg7ei16_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_i8mf2_m(...) __riscv_vluxseg8ei16_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i8m1_m(...) __riscv_vluxseg2ei16_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_i8m1_m(...) __riscv_vluxseg3ei16_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_i8m1_m(...) __riscv_vluxseg4ei16_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_i8m1_m(...) __riscv_vluxseg5ei16_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_i8m1_m(...) __riscv_vluxseg6ei16_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_i8m1_m(...) __riscv_vluxseg7ei16_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_i8m1_m(...) __riscv_vluxseg8ei16_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i8m2_m(...) __riscv_vluxseg2ei16_v_i8m2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_i8m2_m(...) __riscv_vluxseg3ei16_v_i8m2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_i8m2_m(...) __riscv_vluxseg4ei16_v_i8m2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i8m4_m(...) __riscv_vluxseg2ei16_v_i8m4_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i8mf8_m(...) __riscv_vluxseg2ei32_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_i8mf8_m(...) __riscv_vluxseg3ei32_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_i8mf8_m(...) __riscv_vluxseg4ei32_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_i8mf8_m(...) __riscv_vluxseg5ei32_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_i8mf8_m(...) __riscv_vluxseg6ei32_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_i8mf8_m(...) __riscv_vluxseg7ei32_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_i8mf8_m(...) __riscv_vluxseg8ei32_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i8mf4_m(...) __riscv_vluxseg2ei32_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_i8mf4_m(...) __riscv_vluxseg3ei32_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_i8mf4_m(...) __riscv_vluxseg4ei32_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_i8mf4_m(...) __riscv_vluxseg5ei32_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_i8mf4_m(...) __riscv_vluxseg6ei32_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_i8mf4_m(...) __riscv_vluxseg7ei32_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_i8mf4_m(...) __riscv_vluxseg8ei32_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i8mf2_m(...) __riscv_vluxseg2ei32_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_i8mf2_m(...) __riscv_vluxseg3ei32_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_i8mf2_m(...) __riscv_vluxseg4ei32_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_i8mf2_m(...) __riscv_vluxseg5ei32_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_i8mf2_m(...) __riscv_vluxseg6ei32_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_i8mf2_m(...) __riscv_vluxseg7ei32_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_i8mf2_m(...) __riscv_vluxseg8ei32_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i8m1_m(...) __riscv_vluxseg2ei32_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_i8m1_m(...) __riscv_vluxseg3ei32_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_i8m1_m(...) __riscv_vluxseg4ei32_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_i8m1_m(...) __riscv_vluxseg5ei32_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_i8m1_m(...) __riscv_vluxseg6ei32_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_i8m1_m(...) __riscv_vluxseg7ei32_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_i8m1_m(...) __riscv_vluxseg8ei32_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i8m2_m(...) __riscv_vluxseg2ei32_v_i8m2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_i8m2_m(...) __riscv_vluxseg3ei32_v_i8m2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_i8m2_m(...) __riscv_vluxseg4ei32_v_i8m2_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_i8mf8_m(...) __riscv_vluxseg2ei64_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_i8mf8_m(...) __riscv_vluxseg3ei64_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_i8mf8_m(...) __riscv_vluxseg4ei64_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_i8mf8_m(...) __riscv_vluxseg5ei64_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_i8mf8_m(...) __riscv_vluxseg6ei64_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_i8mf8_m(...) __riscv_vluxseg7ei64_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_i8mf8_m(...) __riscv_vluxseg8ei64_v_i8mf8_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_i8mf4_m(...) __riscv_vluxseg2ei64_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_i8mf4_m(...) __riscv_vluxseg3ei64_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_i8mf4_m(...) __riscv_vluxseg4ei64_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_i8mf4_m(...) __riscv_vluxseg5ei64_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_i8mf4_m(...) __riscv_vluxseg6ei64_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_i8mf4_m(...) __riscv_vluxseg7ei64_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_i8mf4_m(...) __riscv_vluxseg8ei64_v_i8mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_i8mf2_m(...) __riscv_vluxseg2ei64_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_i8mf2_m(...) __riscv_vluxseg3ei64_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_i8mf2_m(...) __riscv_vluxseg4ei64_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_i8mf2_m(...) __riscv_vluxseg5ei64_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_i8mf2_m(...) __riscv_vluxseg6ei64_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_i8mf2_m(...) __riscv_vluxseg7ei64_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_i8mf2_m(...) __riscv_vluxseg8ei64_v_i8mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_i8m1_m(...) __riscv_vluxseg2ei64_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_i8m1_m(...) __riscv_vluxseg3ei64_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_i8m1_m(...) __riscv_vluxseg4ei64_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_i8m1_m(...) __riscv_vluxseg5ei64_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_i8m1_m(...) __riscv_vluxseg6ei64_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_i8m1_m(...) __riscv_vluxseg7ei64_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_i8m1_m(...) __riscv_vluxseg8ei64_v_i8m1_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i16mf4_m(...) __riscv_vluxseg2ei8_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_i16mf4_m(...) __riscv_vluxseg3ei8_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_i16mf4_m(...) __riscv_vluxseg4ei8_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_i16mf4_m(...) __riscv_vluxseg5ei8_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_i16mf4_m(...) __riscv_vluxseg6ei8_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_i16mf4_m(...) __riscv_vluxseg7ei8_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_i16mf4_m(...) __riscv_vluxseg8ei8_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i16mf2_m(...) __riscv_vluxseg2ei8_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_i16mf2_m(...) __riscv_vluxseg3ei8_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_i16mf2_m(...) __riscv_vluxseg4ei8_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_i16mf2_m(...) __riscv_vluxseg5ei8_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_i16mf2_m(...) __riscv_vluxseg6ei8_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_i16mf2_m(...) __riscv_vluxseg7ei8_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_i16mf2_m(...) __riscv_vluxseg8ei8_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i16m1_m(...) __riscv_vluxseg2ei8_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_i16m1_m(...) __riscv_vluxseg3ei8_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_i16m1_m(...) __riscv_vluxseg4ei8_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_i16m1_m(...) __riscv_vluxseg5ei8_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_i16m1_m(...) __riscv_vluxseg6ei8_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_i16m1_m(...) __riscv_vluxseg7ei8_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_i16m1_m(...) __riscv_vluxseg8ei8_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i16m2_m(...) __riscv_vluxseg2ei8_v_i16m2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_i16m2_m(...) __riscv_vluxseg3ei8_v_i16m2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_i16m2_m(...) __riscv_vluxseg4ei8_v_i16m2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i16m4_m(...) __riscv_vluxseg2ei8_v_i16m4_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i16mf4_m(...) __riscv_vluxseg2ei16_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_i16mf4_m(...) __riscv_vluxseg3ei16_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_i16mf4_m(...) __riscv_vluxseg4ei16_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_i16mf4_m(...) __riscv_vluxseg5ei16_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_i16mf4_m(...) __riscv_vluxseg6ei16_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_i16mf4_m(...) __riscv_vluxseg7ei16_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_i16mf4_m(...) __riscv_vluxseg8ei16_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i16mf2_m(...) __riscv_vluxseg2ei16_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_i16mf2_m(...) __riscv_vluxseg3ei16_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_i16mf2_m(...) __riscv_vluxseg4ei16_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_i16mf2_m(...) __riscv_vluxseg5ei16_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_i16mf2_m(...) __riscv_vluxseg6ei16_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_i16mf2_m(...) __riscv_vluxseg7ei16_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_i16mf2_m(...) __riscv_vluxseg8ei16_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i16m1_m(...) __riscv_vluxseg2ei16_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_i16m1_m(...) __riscv_vluxseg3ei16_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_i16m1_m(...) __riscv_vluxseg4ei16_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_i16m1_m(...) __riscv_vluxseg5ei16_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_i16m1_m(...) __riscv_vluxseg6ei16_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_i16m1_m(...) __riscv_vluxseg7ei16_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_i16m1_m(...) __riscv_vluxseg8ei16_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i16m2_m(...) __riscv_vluxseg2ei16_v_i16m2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_i16m2_m(...) __riscv_vluxseg3ei16_v_i16m2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_i16m2_m(...) __riscv_vluxseg4ei16_v_i16m2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i16m4_m(...) __riscv_vluxseg2ei16_v_i16m4_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i16mf4_m(...) __riscv_vluxseg2ei32_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_i16mf4_m(...) __riscv_vluxseg3ei32_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_i16mf4_m(...) __riscv_vluxseg4ei32_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_i16mf4_m(...) __riscv_vluxseg5ei32_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_i16mf4_m(...) __riscv_vluxseg6ei32_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_i16mf4_m(...) __riscv_vluxseg7ei32_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_i16mf4_m(...) __riscv_vluxseg8ei32_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i16mf2_m(...) __riscv_vluxseg2ei32_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_i16mf2_m(...) __riscv_vluxseg3ei32_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_i16mf2_m(...) __riscv_vluxseg4ei32_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_i16mf2_m(...) __riscv_vluxseg5ei32_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_i16mf2_m(...) __riscv_vluxseg6ei32_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_i16mf2_m(...) __riscv_vluxseg7ei32_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_i16mf2_m(...) __riscv_vluxseg8ei32_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i16m1_m(...) __riscv_vluxseg2ei32_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_i16m1_m(...) __riscv_vluxseg3ei32_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_i16m1_m(...) __riscv_vluxseg4ei32_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_i16m1_m(...) __riscv_vluxseg5ei32_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_i16m1_m(...) __riscv_vluxseg6ei32_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_i16m1_m(...) __riscv_vluxseg7ei32_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_i16m1_m(...) __riscv_vluxseg8ei32_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i16m2_m(...) __riscv_vluxseg2ei32_v_i16m2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_i16m2_m(...) __riscv_vluxseg3ei32_v_i16m2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_i16m2_m(...) __riscv_vluxseg4ei32_v_i16m2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i16m4_m(...) __riscv_vluxseg2ei32_v_i16m4_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_i16mf4_m(...) __riscv_vluxseg2ei64_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_i16mf4_m(...) __riscv_vluxseg3ei64_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_i16mf4_m(...) __riscv_vluxseg4ei64_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_i16mf4_m(...) __riscv_vluxseg5ei64_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_i16mf4_m(...) __riscv_vluxseg6ei64_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_i16mf4_m(...) __riscv_vluxseg7ei64_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_i16mf4_m(...) __riscv_vluxseg8ei64_v_i16mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_i16mf2_m(...) __riscv_vluxseg2ei64_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_i16mf2_m(...) __riscv_vluxseg3ei64_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_i16mf2_m(...) __riscv_vluxseg4ei64_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_i16mf2_m(...) __riscv_vluxseg5ei64_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_i16mf2_m(...) __riscv_vluxseg6ei64_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_i16mf2_m(...) __riscv_vluxseg7ei64_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_i16mf2_m(...) __riscv_vluxseg8ei64_v_i16mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_i16m1_m(...) __riscv_vluxseg2ei64_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_i16m1_m(...) __riscv_vluxseg3ei64_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_i16m1_m(...) __riscv_vluxseg4ei64_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_i16m1_m(...) __riscv_vluxseg5ei64_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_i16m1_m(...) __riscv_vluxseg6ei64_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_i16m1_m(...) __riscv_vluxseg7ei64_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_i16m1_m(...) __riscv_vluxseg8ei64_v_i16m1_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_i16m2_m(...) __riscv_vluxseg2ei64_v_i16m2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_i16m2_m(...) __riscv_vluxseg3ei64_v_i16m2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_i16m2_m(...) __riscv_vluxseg4ei64_v_i16m2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i32mf2_m(...) __riscv_vluxseg2ei8_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_i32mf2_m(...) __riscv_vluxseg3ei8_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_i32mf2_m(...) __riscv_vluxseg4ei8_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_i32mf2_m(...) __riscv_vluxseg5ei8_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_i32mf2_m(...) __riscv_vluxseg6ei8_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_i32mf2_m(...) __riscv_vluxseg7ei8_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_i32mf2_m(...) __riscv_vluxseg8ei8_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i32m1_m(...) __riscv_vluxseg2ei8_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_i32m1_m(...) __riscv_vluxseg3ei8_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_i32m1_m(...) __riscv_vluxseg4ei8_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_i32m1_m(...) __riscv_vluxseg5ei8_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_i32m1_m(...) __riscv_vluxseg6ei8_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_i32m1_m(...) __riscv_vluxseg7ei8_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_i32m1_m(...) __riscv_vluxseg8ei8_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i32m2_m(...) __riscv_vluxseg2ei8_v_i32m2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_i32m2_m(...) __riscv_vluxseg3ei8_v_i32m2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_i32m2_m(...) __riscv_vluxseg4ei8_v_i32m2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i32m4_m(...) __riscv_vluxseg2ei8_v_i32m4_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i32mf2_m(...) __riscv_vluxseg2ei16_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_i32mf2_m(...) __riscv_vluxseg3ei16_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_i32mf2_m(...) __riscv_vluxseg4ei16_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_i32mf2_m(...) __riscv_vluxseg5ei16_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_i32mf2_m(...) __riscv_vluxseg6ei16_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_i32mf2_m(...) __riscv_vluxseg7ei16_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_i32mf2_m(...) __riscv_vluxseg8ei16_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i32m1_m(...) __riscv_vluxseg2ei16_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_i32m1_m(...) __riscv_vluxseg3ei16_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_i32m1_m(...) __riscv_vluxseg4ei16_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_i32m1_m(...) __riscv_vluxseg5ei16_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_i32m1_m(...) __riscv_vluxseg6ei16_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_i32m1_m(...) __riscv_vluxseg7ei16_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_i32m1_m(...) __riscv_vluxseg8ei16_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i32m2_m(...) __riscv_vluxseg2ei16_v_i32m2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_i32m2_m(...) __riscv_vluxseg3ei16_v_i32m2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_i32m2_m(...) __riscv_vluxseg4ei16_v_i32m2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i32m4_m(...) __riscv_vluxseg2ei16_v_i32m4_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i32mf2_m(...) __riscv_vluxseg2ei32_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_i32mf2_m(...) __riscv_vluxseg3ei32_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_i32mf2_m(...) __riscv_vluxseg4ei32_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_i32mf2_m(...) __riscv_vluxseg5ei32_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_i32mf2_m(...) __riscv_vluxseg6ei32_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_i32mf2_m(...) __riscv_vluxseg7ei32_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_i32mf2_m(...) __riscv_vluxseg8ei32_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i32m1_m(...) __riscv_vluxseg2ei32_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_i32m1_m(...) __riscv_vluxseg3ei32_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_i32m1_m(...) __riscv_vluxseg4ei32_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_i32m1_m(...) __riscv_vluxseg5ei32_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_i32m1_m(...) __riscv_vluxseg6ei32_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_i32m1_m(...) __riscv_vluxseg7ei32_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_i32m1_m(...) __riscv_vluxseg8ei32_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i32m2_m(...) __riscv_vluxseg2ei32_v_i32m2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_i32m2_m(...) __riscv_vluxseg3ei32_v_i32m2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_i32m2_m(...) __riscv_vluxseg4ei32_v_i32m2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i32m4_m(...) __riscv_vluxseg2ei32_v_i32m4_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_i32mf2_m(...) __riscv_vluxseg2ei64_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_i32mf2_m(...) __riscv_vluxseg3ei64_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_i32mf2_m(...) __riscv_vluxseg4ei64_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_i32mf2_m(...) __riscv_vluxseg5ei64_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_i32mf2_m(...) __riscv_vluxseg6ei64_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_i32mf2_m(...) __riscv_vluxseg7ei64_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_i32mf2_m(...) __riscv_vluxseg8ei64_v_i32mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_i32m1_m(...) __riscv_vluxseg2ei64_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_i32m1_m(...) __riscv_vluxseg3ei64_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_i32m1_m(...) __riscv_vluxseg4ei64_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_i32m1_m(...) __riscv_vluxseg5ei64_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_i32m1_m(...) __riscv_vluxseg6ei64_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_i32m1_m(...) __riscv_vluxseg7ei64_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_i32m1_m(...) __riscv_vluxseg8ei64_v_i32m1_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_i32m2_m(...) __riscv_vluxseg2ei64_v_i32m2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_i32m2_m(...) __riscv_vluxseg3ei64_v_i32m2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_i32m2_m(...) __riscv_vluxseg4ei64_v_i32m2_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_i32m4_m(...) __riscv_vluxseg2ei64_v_i32m4_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i64m1_m(...) __riscv_vluxseg2ei8_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_i64m1_m(...) __riscv_vluxseg3ei8_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_i64m1_m(...) __riscv_vluxseg4ei8_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_i64m1_m(...) __riscv_vluxseg5ei8_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_i64m1_m(...) __riscv_vluxseg6ei8_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_i64m1_m(...) __riscv_vluxseg7ei8_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_i64m1_m(...) __riscv_vluxseg8ei8_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i64m2_m(...) __riscv_vluxseg2ei8_v_i64m2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_i64m2_m(...) __riscv_vluxseg3ei8_v_i64m2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_i64m2_m(...) __riscv_vluxseg4ei8_v_i64m2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_i64m4_m(...) __riscv_vluxseg2ei8_v_i64m4_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i64m1_m(...) __riscv_vluxseg2ei16_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_i64m1_m(...) __riscv_vluxseg3ei16_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_i64m1_m(...) __riscv_vluxseg4ei16_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_i64m1_m(...) __riscv_vluxseg5ei16_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_i64m1_m(...) __riscv_vluxseg6ei16_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_i64m1_m(...) __riscv_vluxseg7ei16_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_i64m1_m(...) __riscv_vluxseg8ei16_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i64m2_m(...) __riscv_vluxseg2ei16_v_i64m2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_i64m2_m(...) __riscv_vluxseg3ei16_v_i64m2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_i64m2_m(...) __riscv_vluxseg4ei16_v_i64m2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_i64m4_m(...) __riscv_vluxseg2ei16_v_i64m4_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i64m1_m(...) __riscv_vluxseg2ei32_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_i64m1_m(...) __riscv_vluxseg3ei32_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_i64m1_m(...) __riscv_vluxseg4ei32_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_i64m1_m(...) __riscv_vluxseg5ei32_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_i64m1_m(...) __riscv_vluxseg6ei32_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_i64m1_m(...) __riscv_vluxseg7ei32_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_i64m1_m(...) __riscv_vluxseg8ei32_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i64m2_m(...) __riscv_vluxseg2ei32_v_i64m2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_i64m2_m(...) __riscv_vluxseg3ei32_v_i64m2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_i64m2_m(...) __riscv_vluxseg4ei32_v_i64m2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_i64m4_m(...) __riscv_vluxseg2ei32_v_i64m4_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_i64m1_m(...) __riscv_vluxseg2ei64_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_i64m1_m(...) __riscv_vluxseg3ei64_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_i64m1_m(...) __riscv_vluxseg4ei64_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_i64m1_m(...) __riscv_vluxseg5ei64_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_i64m1_m(...) __riscv_vluxseg6ei64_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_i64m1_m(...) __riscv_vluxseg7ei64_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_i64m1_m(...) __riscv_vluxseg8ei64_v_i64m1_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_i64m2_m(...) __riscv_vluxseg2ei64_v_i64m2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_i64m2_m(...) __riscv_vluxseg3ei64_v_i64m2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_i64m2_m(...) __riscv_vluxseg4ei64_v_i64m2_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_i64m4_m(...) __riscv_vluxseg2ei64_v_i64m4_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u8mf8_m(...) __riscv_vloxseg2ei8_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_u8mf8_m(...) __riscv_vloxseg3ei8_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_u8mf8_m(...) __riscv_vloxseg4ei8_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_u8mf8_m(...) __riscv_vloxseg5ei8_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_u8mf8_m(...) __riscv_vloxseg6ei8_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_u8mf8_m(...) __riscv_vloxseg7ei8_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_u8mf8_m(...) __riscv_vloxseg8ei8_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u8mf4_m(...) __riscv_vloxseg2ei8_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_u8mf4_m(...) __riscv_vloxseg3ei8_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_u8mf4_m(...) __riscv_vloxseg4ei8_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_u8mf4_m(...) __riscv_vloxseg5ei8_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_u8mf4_m(...) __riscv_vloxseg6ei8_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_u8mf4_m(...) __riscv_vloxseg7ei8_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_u8mf4_m(...) __riscv_vloxseg8ei8_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u8mf2_m(...) __riscv_vloxseg2ei8_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_u8mf2_m(...) __riscv_vloxseg3ei8_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_u8mf2_m(...) __riscv_vloxseg4ei8_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_u8mf2_m(...) __riscv_vloxseg5ei8_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_u8mf2_m(...) __riscv_vloxseg6ei8_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_u8mf2_m(...) __riscv_vloxseg7ei8_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_u8mf2_m(...) __riscv_vloxseg8ei8_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u8m1_m(...) __riscv_vloxseg2ei8_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_u8m1_m(...) __riscv_vloxseg3ei8_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_u8m1_m(...) __riscv_vloxseg4ei8_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_u8m1_m(...) __riscv_vloxseg5ei8_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_u8m1_m(...) __riscv_vloxseg6ei8_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_u8m1_m(...) __riscv_vloxseg7ei8_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_u8m1_m(...) __riscv_vloxseg8ei8_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u8m2_m(...) __riscv_vloxseg2ei8_v_u8m2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_u8m2_m(...) __riscv_vloxseg3ei8_v_u8m2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_u8m2_m(...) __riscv_vloxseg4ei8_v_u8m2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u8m4_m(...) __riscv_vloxseg2ei8_v_u8m4_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u8mf8_m(...) __riscv_vloxseg2ei16_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_u8mf8_m(...) __riscv_vloxseg3ei16_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_u8mf8_m(...) __riscv_vloxseg4ei16_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_u8mf8_m(...) __riscv_vloxseg5ei16_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_u8mf8_m(...) __riscv_vloxseg6ei16_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_u8mf8_m(...) __riscv_vloxseg7ei16_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_u8mf8_m(...) __riscv_vloxseg8ei16_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u8mf4_m(...) __riscv_vloxseg2ei16_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_u8mf4_m(...) __riscv_vloxseg3ei16_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_u8mf4_m(...) __riscv_vloxseg4ei16_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_u8mf4_m(...) __riscv_vloxseg5ei16_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_u8mf4_m(...) __riscv_vloxseg6ei16_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_u8mf4_m(...) __riscv_vloxseg7ei16_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_u8mf4_m(...) __riscv_vloxseg8ei16_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u8mf2_m(...) __riscv_vloxseg2ei16_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_u8mf2_m(...) __riscv_vloxseg3ei16_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_u8mf2_m(...) __riscv_vloxseg4ei16_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_u8mf2_m(...) __riscv_vloxseg5ei16_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_u8mf2_m(...) __riscv_vloxseg6ei16_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_u8mf2_m(...) __riscv_vloxseg7ei16_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_u8mf2_m(...) __riscv_vloxseg8ei16_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u8m1_m(...) __riscv_vloxseg2ei16_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_u8m1_m(...) __riscv_vloxseg3ei16_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_u8m1_m(...) __riscv_vloxseg4ei16_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_u8m1_m(...) __riscv_vloxseg5ei16_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_u8m1_m(...) __riscv_vloxseg6ei16_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_u8m1_m(...) __riscv_vloxseg7ei16_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_u8m1_m(...) __riscv_vloxseg8ei16_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u8m2_m(...) __riscv_vloxseg2ei16_v_u8m2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_u8m2_m(...) __riscv_vloxseg3ei16_v_u8m2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_u8m2_m(...) __riscv_vloxseg4ei16_v_u8m2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u8m4_m(...) __riscv_vloxseg2ei16_v_u8m4_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u8mf8_m(...) __riscv_vloxseg2ei32_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_u8mf8_m(...) __riscv_vloxseg3ei32_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_u8mf8_m(...) __riscv_vloxseg4ei32_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_u8mf8_m(...) __riscv_vloxseg5ei32_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_u8mf8_m(...) __riscv_vloxseg6ei32_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_u8mf8_m(...) __riscv_vloxseg7ei32_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_u8mf8_m(...) __riscv_vloxseg8ei32_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u8mf4_m(...) __riscv_vloxseg2ei32_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_u8mf4_m(...) __riscv_vloxseg3ei32_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_u8mf4_m(...) __riscv_vloxseg4ei32_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_u8mf4_m(...) __riscv_vloxseg5ei32_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_u8mf4_m(...) __riscv_vloxseg6ei32_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_u8mf4_m(...) __riscv_vloxseg7ei32_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_u8mf4_m(...) __riscv_vloxseg8ei32_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u8mf2_m(...) __riscv_vloxseg2ei32_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_u8mf2_m(...) __riscv_vloxseg3ei32_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_u8mf2_m(...) __riscv_vloxseg4ei32_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_u8mf2_m(...) __riscv_vloxseg5ei32_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_u8mf2_m(...) __riscv_vloxseg6ei32_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_u8mf2_m(...) __riscv_vloxseg7ei32_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_u8mf2_m(...) __riscv_vloxseg8ei32_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u8m1_m(...) __riscv_vloxseg2ei32_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_u8m1_m(...) __riscv_vloxseg3ei32_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_u8m1_m(...) __riscv_vloxseg4ei32_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_u8m1_m(...) __riscv_vloxseg5ei32_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_u8m1_m(...) __riscv_vloxseg6ei32_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_u8m1_m(...) __riscv_vloxseg7ei32_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_u8m1_m(...) __riscv_vloxseg8ei32_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u8m2_m(...) __riscv_vloxseg2ei32_v_u8m2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_u8m2_m(...) __riscv_vloxseg3ei32_v_u8m2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_u8m2_m(...) __riscv_vloxseg4ei32_v_u8m2_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_u8mf8_m(...) __riscv_vloxseg2ei64_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_u8mf8_m(...) __riscv_vloxseg3ei64_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_u8mf8_m(...) __riscv_vloxseg4ei64_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_u8mf8_m(...) __riscv_vloxseg5ei64_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_u8mf8_m(...) __riscv_vloxseg6ei64_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_u8mf8_m(...) __riscv_vloxseg7ei64_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_u8mf8_m(...) __riscv_vloxseg8ei64_v_u8mf8_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_u8mf4_m(...) __riscv_vloxseg2ei64_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_u8mf4_m(...) __riscv_vloxseg3ei64_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_u8mf4_m(...) __riscv_vloxseg4ei64_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_u8mf4_m(...) __riscv_vloxseg5ei64_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_u8mf4_m(...) __riscv_vloxseg6ei64_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_u8mf4_m(...) __riscv_vloxseg7ei64_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_u8mf4_m(...) __riscv_vloxseg8ei64_v_u8mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_u8mf2_m(...) __riscv_vloxseg2ei64_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_u8mf2_m(...) __riscv_vloxseg3ei64_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_u8mf2_m(...) __riscv_vloxseg4ei64_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_u8mf2_m(...) __riscv_vloxseg5ei64_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_u8mf2_m(...) __riscv_vloxseg6ei64_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_u8mf2_m(...) __riscv_vloxseg7ei64_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_u8mf2_m(...) __riscv_vloxseg8ei64_v_u8mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_u8m1_m(...) __riscv_vloxseg2ei64_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_u8m1_m(...) __riscv_vloxseg3ei64_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_u8m1_m(...) __riscv_vloxseg4ei64_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_u8m1_m(...) __riscv_vloxseg5ei64_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_u8m1_m(...) __riscv_vloxseg6ei64_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_u8m1_m(...) __riscv_vloxseg7ei64_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_u8m1_m(...) __riscv_vloxseg8ei64_v_u8m1_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u16mf4_m(...) __riscv_vloxseg2ei8_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_u16mf4_m(...) __riscv_vloxseg3ei8_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_u16mf4_m(...) __riscv_vloxseg4ei8_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_u16mf4_m(...) __riscv_vloxseg5ei8_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_u16mf4_m(...) __riscv_vloxseg6ei8_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_u16mf4_m(...) __riscv_vloxseg7ei8_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_u16mf4_m(...) __riscv_vloxseg8ei8_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u16mf2_m(...) __riscv_vloxseg2ei8_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_u16mf2_m(...) __riscv_vloxseg3ei8_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_u16mf2_m(...) __riscv_vloxseg4ei8_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_u16mf2_m(...) __riscv_vloxseg5ei8_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_u16mf2_m(...) __riscv_vloxseg6ei8_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_u16mf2_m(...) __riscv_vloxseg7ei8_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_u16mf2_m(...) __riscv_vloxseg8ei8_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u16m1_m(...) __riscv_vloxseg2ei8_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_u16m1_m(...) __riscv_vloxseg3ei8_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_u16m1_m(...) __riscv_vloxseg4ei8_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_u16m1_m(...) __riscv_vloxseg5ei8_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_u16m1_m(...) __riscv_vloxseg6ei8_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_u16m1_m(...) __riscv_vloxseg7ei8_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_u16m1_m(...) __riscv_vloxseg8ei8_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u16m2_m(...) __riscv_vloxseg2ei8_v_u16m2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_u16m2_m(...) __riscv_vloxseg3ei8_v_u16m2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_u16m2_m(...) __riscv_vloxseg4ei8_v_u16m2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u16m4_m(...) __riscv_vloxseg2ei8_v_u16m4_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u16mf4_m(...) __riscv_vloxseg2ei16_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_u16mf4_m(...) __riscv_vloxseg3ei16_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_u16mf4_m(...) __riscv_vloxseg4ei16_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_u16mf4_m(...) __riscv_vloxseg5ei16_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_u16mf4_m(...) __riscv_vloxseg6ei16_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_u16mf4_m(...) __riscv_vloxseg7ei16_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_u16mf4_m(...) __riscv_vloxseg8ei16_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u16mf2_m(...) __riscv_vloxseg2ei16_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_u16mf2_m(...) __riscv_vloxseg3ei16_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_u16mf2_m(...) __riscv_vloxseg4ei16_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_u16mf2_m(...) __riscv_vloxseg5ei16_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_u16mf2_m(...) __riscv_vloxseg6ei16_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_u16mf2_m(...) __riscv_vloxseg7ei16_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_u16mf2_m(...) __riscv_vloxseg8ei16_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u16m1_m(...) __riscv_vloxseg2ei16_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_u16m1_m(...) __riscv_vloxseg3ei16_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_u16m1_m(...) __riscv_vloxseg4ei16_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_u16m1_m(...) __riscv_vloxseg5ei16_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_u16m1_m(...) __riscv_vloxseg6ei16_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_u16m1_m(...) __riscv_vloxseg7ei16_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_u16m1_m(...) __riscv_vloxseg8ei16_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u16m2_m(...) __riscv_vloxseg2ei16_v_u16m2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_u16m2_m(...) __riscv_vloxseg3ei16_v_u16m2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_u16m2_m(...) __riscv_vloxseg4ei16_v_u16m2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u16m4_m(...) __riscv_vloxseg2ei16_v_u16m4_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u16mf4_m(...) __riscv_vloxseg2ei32_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_u16mf4_m(...) __riscv_vloxseg3ei32_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_u16mf4_m(...) __riscv_vloxseg4ei32_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_u16mf4_m(...) __riscv_vloxseg5ei32_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_u16mf4_m(...) __riscv_vloxseg6ei32_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_u16mf4_m(...) __riscv_vloxseg7ei32_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_u16mf4_m(...) __riscv_vloxseg8ei32_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u16mf2_m(...) __riscv_vloxseg2ei32_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_u16mf2_m(...) __riscv_vloxseg3ei32_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_u16mf2_m(...) __riscv_vloxseg4ei32_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_u16mf2_m(...) __riscv_vloxseg5ei32_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_u16mf2_m(...) __riscv_vloxseg6ei32_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_u16mf2_m(...) __riscv_vloxseg7ei32_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_u16mf2_m(...) __riscv_vloxseg8ei32_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u16m1_m(...) __riscv_vloxseg2ei32_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_u16m1_m(...) __riscv_vloxseg3ei32_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_u16m1_m(...) __riscv_vloxseg4ei32_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_u16m1_m(...) __riscv_vloxseg5ei32_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_u16m1_m(...) __riscv_vloxseg6ei32_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_u16m1_m(...) __riscv_vloxseg7ei32_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_u16m1_m(...) __riscv_vloxseg8ei32_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u16m2_m(...) __riscv_vloxseg2ei32_v_u16m2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_u16m2_m(...) __riscv_vloxseg3ei32_v_u16m2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_u16m2_m(...) __riscv_vloxseg4ei32_v_u16m2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u16m4_m(...) __riscv_vloxseg2ei32_v_u16m4_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_u16mf4_m(...) __riscv_vloxseg2ei64_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_u16mf4_m(...) __riscv_vloxseg3ei64_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_u16mf4_m(...) __riscv_vloxseg4ei64_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_u16mf4_m(...) __riscv_vloxseg5ei64_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_u16mf4_m(...) __riscv_vloxseg6ei64_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_u16mf4_m(...) __riscv_vloxseg7ei64_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_u16mf4_m(...) __riscv_vloxseg8ei64_v_u16mf4_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_u16mf2_m(...) __riscv_vloxseg2ei64_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_u16mf2_m(...) __riscv_vloxseg3ei64_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_u16mf2_m(...) __riscv_vloxseg4ei64_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_u16mf2_m(...) __riscv_vloxseg5ei64_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_u16mf2_m(...) __riscv_vloxseg6ei64_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_u16mf2_m(...) __riscv_vloxseg7ei64_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_u16mf2_m(...) __riscv_vloxseg8ei64_v_u16mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_u16m1_m(...) __riscv_vloxseg2ei64_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_u16m1_m(...) __riscv_vloxseg3ei64_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_u16m1_m(...) __riscv_vloxseg4ei64_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_u16m1_m(...) __riscv_vloxseg5ei64_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_u16m1_m(...) __riscv_vloxseg6ei64_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_u16m1_m(...) __riscv_vloxseg7ei64_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_u16m1_m(...) __riscv_vloxseg8ei64_v_u16m1_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_u16m2_m(...) __riscv_vloxseg2ei64_v_u16m2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_u16m2_m(...) __riscv_vloxseg3ei64_v_u16m2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_u16m2_m(...) __riscv_vloxseg4ei64_v_u16m2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u32mf2_m(...) __riscv_vloxseg2ei8_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_u32mf2_m(...) __riscv_vloxseg3ei8_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_u32mf2_m(...) __riscv_vloxseg4ei8_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_u32mf2_m(...) __riscv_vloxseg5ei8_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_u32mf2_m(...) __riscv_vloxseg6ei8_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_u32mf2_m(...) __riscv_vloxseg7ei8_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_u32mf2_m(...) __riscv_vloxseg8ei8_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u32m1_m(...) __riscv_vloxseg2ei8_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_u32m1_m(...) __riscv_vloxseg3ei8_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_u32m1_m(...) __riscv_vloxseg4ei8_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_u32m1_m(...) __riscv_vloxseg5ei8_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_u32m1_m(...) __riscv_vloxseg6ei8_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_u32m1_m(...) __riscv_vloxseg7ei8_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_u32m1_m(...) __riscv_vloxseg8ei8_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u32m2_m(...) __riscv_vloxseg2ei8_v_u32m2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_u32m2_m(...) __riscv_vloxseg3ei8_v_u32m2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_u32m2_m(...) __riscv_vloxseg4ei8_v_u32m2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u32m4_m(...) __riscv_vloxseg2ei8_v_u32m4_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u32mf2_m(...) __riscv_vloxseg2ei16_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_u32mf2_m(...) __riscv_vloxseg3ei16_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_u32mf2_m(...) __riscv_vloxseg4ei16_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_u32mf2_m(...) __riscv_vloxseg5ei16_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_u32mf2_m(...) __riscv_vloxseg6ei16_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_u32mf2_m(...) __riscv_vloxseg7ei16_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_u32mf2_m(...) __riscv_vloxseg8ei16_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u32m1_m(...) __riscv_vloxseg2ei16_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_u32m1_m(...) __riscv_vloxseg3ei16_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_u32m1_m(...) __riscv_vloxseg4ei16_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_u32m1_m(...) __riscv_vloxseg5ei16_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_u32m1_m(...) __riscv_vloxseg6ei16_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_u32m1_m(...) __riscv_vloxseg7ei16_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_u32m1_m(...) __riscv_vloxseg8ei16_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u32m2_m(...) __riscv_vloxseg2ei16_v_u32m2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_u32m2_m(...) __riscv_vloxseg3ei16_v_u32m2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_u32m2_m(...) __riscv_vloxseg4ei16_v_u32m2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u32m4_m(...) __riscv_vloxseg2ei16_v_u32m4_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u32mf2_m(...) __riscv_vloxseg2ei32_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_u32mf2_m(...) __riscv_vloxseg3ei32_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_u32mf2_m(...) __riscv_vloxseg4ei32_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_u32mf2_m(...) __riscv_vloxseg5ei32_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_u32mf2_m(...) __riscv_vloxseg6ei32_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_u32mf2_m(...) __riscv_vloxseg7ei32_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_u32mf2_m(...) __riscv_vloxseg8ei32_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u32m1_m(...) __riscv_vloxseg2ei32_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_u32m1_m(...) __riscv_vloxseg3ei32_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_u32m1_m(...) __riscv_vloxseg4ei32_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_u32m1_m(...) __riscv_vloxseg5ei32_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_u32m1_m(...) __riscv_vloxseg6ei32_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_u32m1_m(...) __riscv_vloxseg7ei32_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_u32m1_m(...) __riscv_vloxseg8ei32_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u32m2_m(...) __riscv_vloxseg2ei32_v_u32m2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_u32m2_m(...) __riscv_vloxseg3ei32_v_u32m2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_u32m2_m(...) __riscv_vloxseg4ei32_v_u32m2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u32m4_m(...) __riscv_vloxseg2ei32_v_u32m4_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_u32mf2_m(...) __riscv_vloxseg2ei64_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_u32mf2_m(...) __riscv_vloxseg3ei64_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_u32mf2_m(...) __riscv_vloxseg4ei64_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_u32mf2_m(...) __riscv_vloxseg5ei64_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_u32mf2_m(...) __riscv_vloxseg6ei64_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_u32mf2_m(...) __riscv_vloxseg7ei64_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_u32mf2_m(...) __riscv_vloxseg8ei64_v_u32mf2_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_u32m1_m(...) __riscv_vloxseg2ei64_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_u32m1_m(...) __riscv_vloxseg3ei64_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_u32m1_m(...) __riscv_vloxseg4ei64_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_u32m1_m(...) __riscv_vloxseg5ei64_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_u32m1_m(...) __riscv_vloxseg6ei64_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_u32m1_m(...) __riscv_vloxseg7ei64_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_u32m1_m(...) __riscv_vloxseg8ei64_v_u32m1_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_u32m2_m(...) __riscv_vloxseg2ei64_v_u32m2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_u32m2_m(...) __riscv_vloxseg3ei64_v_u32m2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_u32m2_m(...) __riscv_vloxseg4ei64_v_u32m2_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_u32m4_m(...) __riscv_vloxseg2ei64_v_u32m4_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u64m1_m(...) __riscv_vloxseg2ei8_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_u64m1_m(...) __riscv_vloxseg3ei8_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_u64m1_m(...) __riscv_vloxseg4ei8_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg5ei8_v_u64m1_m(...) __riscv_vloxseg5ei8_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg6ei8_v_u64m1_m(...) __riscv_vloxseg6ei8_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg7ei8_v_u64m1_m(...) __riscv_vloxseg7ei8_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg8ei8_v_u64m1_m(...) __riscv_vloxseg8ei8_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u64m2_m(...) __riscv_vloxseg2ei8_v_u64m2_tumu(__VA_ARGS__)
+#define vloxseg3ei8_v_u64m2_m(...) __riscv_vloxseg3ei8_v_u64m2_tumu(__VA_ARGS__)
+#define vloxseg4ei8_v_u64m2_m(...) __riscv_vloxseg4ei8_v_u64m2_tumu(__VA_ARGS__)
+#define vloxseg2ei8_v_u64m4_m(...) __riscv_vloxseg2ei8_v_u64m4_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u64m1_m(...) __riscv_vloxseg2ei16_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_u64m1_m(...) __riscv_vloxseg3ei16_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_u64m1_m(...) __riscv_vloxseg4ei16_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg5ei16_v_u64m1_m(...) __riscv_vloxseg5ei16_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg6ei16_v_u64m1_m(...) __riscv_vloxseg6ei16_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg7ei16_v_u64m1_m(...) __riscv_vloxseg7ei16_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg8ei16_v_u64m1_m(...) __riscv_vloxseg8ei16_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u64m2_m(...) __riscv_vloxseg2ei16_v_u64m2_tumu(__VA_ARGS__)
+#define vloxseg3ei16_v_u64m2_m(...) __riscv_vloxseg3ei16_v_u64m2_tumu(__VA_ARGS__)
+#define vloxseg4ei16_v_u64m2_m(...) __riscv_vloxseg4ei16_v_u64m2_tumu(__VA_ARGS__)
+#define vloxseg2ei16_v_u64m4_m(...) __riscv_vloxseg2ei16_v_u64m4_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u64m1_m(...) __riscv_vloxseg2ei32_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_u64m1_m(...) __riscv_vloxseg3ei32_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_u64m1_m(...) __riscv_vloxseg4ei32_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg5ei32_v_u64m1_m(...) __riscv_vloxseg5ei32_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg6ei32_v_u64m1_m(...) __riscv_vloxseg6ei32_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg7ei32_v_u64m1_m(...) __riscv_vloxseg7ei32_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg8ei32_v_u64m1_m(...) __riscv_vloxseg8ei32_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u64m2_m(...) __riscv_vloxseg2ei32_v_u64m2_tumu(__VA_ARGS__)
+#define vloxseg3ei32_v_u64m2_m(...) __riscv_vloxseg3ei32_v_u64m2_tumu(__VA_ARGS__)
+#define vloxseg4ei32_v_u64m2_m(...) __riscv_vloxseg4ei32_v_u64m2_tumu(__VA_ARGS__)
+#define vloxseg2ei32_v_u64m4_m(...) __riscv_vloxseg2ei32_v_u64m4_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_u64m1_m(...) __riscv_vloxseg2ei64_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_u64m1_m(...) __riscv_vloxseg3ei64_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_u64m1_m(...) __riscv_vloxseg4ei64_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg5ei64_v_u64m1_m(...) __riscv_vloxseg5ei64_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg6ei64_v_u64m1_m(...) __riscv_vloxseg6ei64_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg7ei64_v_u64m1_m(...) __riscv_vloxseg7ei64_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg8ei64_v_u64m1_m(...) __riscv_vloxseg8ei64_v_u64m1_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_u64m2_m(...) __riscv_vloxseg2ei64_v_u64m2_tumu(__VA_ARGS__)
+#define vloxseg3ei64_v_u64m2_m(...) __riscv_vloxseg3ei64_v_u64m2_tumu(__VA_ARGS__)
+#define vloxseg4ei64_v_u64m2_m(...) __riscv_vloxseg4ei64_v_u64m2_tumu(__VA_ARGS__)
+#define vloxseg2ei64_v_u64m4_m(...) __riscv_vloxseg2ei64_v_u64m4_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u8mf8_m(...) __riscv_vluxseg2ei8_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_u8mf8_m(...) __riscv_vluxseg3ei8_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_u8mf8_m(...) __riscv_vluxseg4ei8_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_u8mf8_m(...) __riscv_vluxseg5ei8_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_u8mf8_m(...) __riscv_vluxseg6ei8_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_u8mf8_m(...) __riscv_vluxseg7ei8_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_u8mf8_m(...) __riscv_vluxseg8ei8_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u8mf4_m(...) __riscv_vluxseg2ei8_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_u8mf4_m(...) __riscv_vluxseg3ei8_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_u8mf4_m(...) __riscv_vluxseg4ei8_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_u8mf4_m(...) __riscv_vluxseg5ei8_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_u8mf4_m(...) __riscv_vluxseg6ei8_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_u8mf4_m(...) __riscv_vluxseg7ei8_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_u8mf4_m(...) __riscv_vluxseg8ei8_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u8mf2_m(...) __riscv_vluxseg2ei8_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_u8mf2_m(...) __riscv_vluxseg3ei8_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_u8mf2_m(...) __riscv_vluxseg4ei8_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_u8mf2_m(...) __riscv_vluxseg5ei8_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_u8mf2_m(...) __riscv_vluxseg6ei8_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_u8mf2_m(...) __riscv_vluxseg7ei8_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_u8mf2_m(...) __riscv_vluxseg8ei8_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u8m1_m(...) __riscv_vluxseg2ei8_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_u8m1_m(...) __riscv_vluxseg3ei8_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_u8m1_m(...) __riscv_vluxseg4ei8_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_u8m1_m(...) __riscv_vluxseg5ei8_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_u8m1_m(...) __riscv_vluxseg6ei8_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_u8m1_m(...) __riscv_vluxseg7ei8_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_u8m1_m(...) __riscv_vluxseg8ei8_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u8m2_m(...) __riscv_vluxseg2ei8_v_u8m2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_u8m2_m(...) __riscv_vluxseg3ei8_v_u8m2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_u8m2_m(...) __riscv_vluxseg4ei8_v_u8m2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u8m4_m(...) __riscv_vluxseg2ei8_v_u8m4_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u8mf8_m(...) __riscv_vluxseg2ei16_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_u8mf8_m(...) __riscv_vluxseg3ei16_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_u8mf8_m(...) __riscv_vluxseg4ei16_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_u8mf8_m(...) __riscv_vluxseg5ei16_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_u8mf8_m(...) __riscv_vluxseg6ei16_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_u8mf8_m(...) __riscv_vluxseg7ei16_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_u8mf8_m(...) __riscv_vluxseg8ei16_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u8mf4_m(...) __riscv_vluxseg2ei16_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_u8mf4_m(...) __riscv_vluxseg3ei16_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_u8mf4_m(...) __riscv_vluxseg4ei16_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_u8mf4_m(...) __riscv_vluxseg5ei16_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_u8mf4_m(...) __riscv_vluxseg6ei16_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_u8mf4_m(...) __riscv_vluxseg7ei16_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_u8mf4_m(...) __riscv_vluxseg8ei16_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u8mf2_m(...) __riscv_vluxseg2ei16_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_u8mf2_m(...) __riscv_vluxseg3ei16_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_u8mf2_m(...) __riscv_vluxseg4ei16_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_u8mf2_m(...) __riscv_vluxseg5ei16_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_u8mf2_m(...) __riscv_vluxseg6ei16_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_u8mf2_m(...) __riscv_vluxseg7ei16_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_u8mf2_m(...) __riscv_vluxseg8ei16_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u8m1_m(...) __riscv_vluxseg2ei16_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_u8m1_m(...) __riscv_vluxseg3ei16_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_u8m1_m(...) __riscv_vluxseg4ei16_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_u8m1_m(...) __riscv_vluxseg5ei16_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_u8m1_m(...) __riscv_vluxseg6ei16_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_u8m1_m(...) __riscv_vluxseg7ei16_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_u8m1_m(...) __riscv_vluxseg8ei16_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u8m2_m(...) __riscv_vluxseg2ei16_v_u8m2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_u8m2_m(...) __riscv_vluxseg3ei16_v_u8m2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_u8m2_m(...) __riscv_vluxseg4ei16_v_u8m2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u8m4_m(...) __riscv_vluxseg2ei16_v_u8m4_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u8mf8_m(...) __riscv_vluxseg2ei32_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_u8mf8_m(...) __riscv_vluxseg3ei32_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_u8mf8_m(...) __riscv_vluxseg4ei32_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_u8mf8_m(...) __riscv_vluxseg5ei32_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_u8mf8_m(...) __riscv_vluxseg6ei32_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_u8mf8_m(...) __riscv_vluxseg7ei32_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_u8mf8_m(...) __riscv_vluxseg8ei32_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u8mf4_m(...) __riscv_vluxseg2ei32_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_u8mf4_m(...) __riscv_vluxseg3ei32_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_u8mf4_m(...) __riscv_vluxseg4ei32_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_u8mf4_m(...) __riscv_vluxseg5ei32_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_u8mf4_m(...) __riscv_vluxseg6ei32_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_u8mf4_m(...) __riscv_vluxseg7ei32_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_u8mf4_m(...) __riscv_vluxseg8ei32_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u8mf2_m(...) __riscv_vluxseg2ei32_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_u8mf2_m(...) __riscv_vluxseg3ei32_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_u8mf2_m(...) __riscv_vluxseg4ei32_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_u8mf2_m(...) __riscv_vluxseg5ei32_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_u8mf2_m(...) __riscv_vluxseg6ei32_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_u8mf2_m(...) __riscv_vluxseg7ei32_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_u8mf2_m(...) __riscv_vluxseg8ei32_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u8m1_m(...) __riscv_vluxseg2ei32_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_u8m1_m(...) __riscv_vluxseg3ei32_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_u8m1_m(...) __riscv_vluxseg4ei32_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_u8m1_m(...) __riscv_vluxseg5ei32_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_u8m1_m(...) __riscv_vluxseg6ei32_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_u8m1_m(...) __riscv_vluxseg7ei32_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_u8m1_m(...) __riscv_vluxseg8ei32_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u8m2_m(...) __riscv_vluxseg2ei32_v_u8m2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_u8m2_m(...) __riscv_vluxseg3ei32_v_u8m2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_u8m2_m(...) __riscv_vluxseg4ei32_v_u8m2_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_u8mf8_m(...) __riscv_vluxseg2ei64_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_u8mf8_m(...) __riscv_vluxseg3ei64_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_u8mf8_m(...) __riscv_vluxseg4ei64_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_u8mf8_m(...) __riscv_vluxseg5ei64_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_u8mf8_m(...) __riscv_vluxseg6ei64_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_u8mf8_m(...) __riscv_vluxseg7ei64_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_u8mf8_m(...) __riscv_vluxseg8ei64_v_u8mf8_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_u8mf4_m(...) __riscv_vluxseg2ei64_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_u8mf4_m(...) __riscv_vluxseg3ei64_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_u8mf4_m(...) __riscv_vluxseg4ei64_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_u8mf4_m(...) __riscv_vluxseg5ei64_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_u8mf4_m(...) __riscv_vluxseg6ei64_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_u8mf4_m(...) __riscv_vluxseg7ei64_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_u8mf4_m(...) __riscv_vluxseg8ei64_v_u8mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_u8mf2_m(...) __riscv_vluxseg2ei64_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_u8mf2_m(...) __riscv_vluxseg3ei64_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_u8mf2_m(...) __riscv_vluxseg4ei64_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_u8mf2_m(...) __riscv_vluxseg5ei64_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_u8mf2_m(...) __riscv_vluxseg6ei64_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_u8mf2_m(...) __riscv_vluxseg7ei64_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_u8mf2_m(...) __riscv_vluxseg8ei64_v_u8mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_u8m1_m(...) __riscv_vluxseg2ei64_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_u8m1_m(...) __riscv_vluxseg3ei64_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_u8m1_m(...) __riscv_vluxseg4ei64_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_u8m1_m(...) __riscv_vluxseg5ei64_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_u8m1_m(...) __riscv_vluxseg6ei64_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_u8m1_m(...) __riscv_vluxseg7ei64_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_u8m1_m(...) __riscv_vluxseg8ei64_v_u8m1_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u16mf4_m(...) __riscv_vluxseg2ei8_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_u16mf4_m(...) __riscv_vluxseg3ei8_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_u16mf4_m(...) __riscv_vluxseg4ei8_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_u16mf4_m(...) __riscv_vluxseg5ei8_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_u16mf4_m(...) __riscv_vluxseg6ei8_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_u16mf4_m(...) __riscv_vluxseg7ei8_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_u16mf4_m(...) __riscv_vluxseg8ei8_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u16mf2_m(...) __riscv_vluxseg2ei8_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_u16mf2_m(...) __riscv_vluxseg3ei8_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_u16mf2_m(...) __riscv_vluxseg4ei8_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_u16mf2_m(...) __riscv_vluxseg5ei8_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_u16mf2_m(...) __riscv_vluxseg6ei8_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_u16mf2_m(...) __riscv_vluxseg7ei8_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_u16mf2_m(...) __riscv_vluxseg8ei8_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u16m1_m(...) __riscv_vluxseg2ei8_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_u16m1_m(...) __riscv_vluxseg3ei8_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_u16m1_m(...) __riscv_vluxseg4ei8_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_u16m1_m(...) __riscv_vluxseg5ei8_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_u16m1_m(...) __riscv_vluxseg6ei8_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_u16m1_m(...) __riscv_vluxseg7ei8_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_u16m1_m(...) __riscv_vluxseg8ei8_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u16m2_m(...) __riscv_vluxseg2ei8_v_u16m2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_u16m2_m(...) __riscv_vluxseg3ei8_v_u16m2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_u16m2_m(...) __riscv_vluxseg4ei8_v_u16m2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u16m4_m(...) __riscv_vluxseg2ei8_v_u16m4_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u16mf4_m(...) __riscv_vluxseg2ei16_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_u16mf4_m(...) __riscv_vluxseg3ei16_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_u16mf4_m(...) __riscv_vluxseg4ei16_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_u16mf4_m(...) __riscv_vluxseg5ei16_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_u16mf4_m(...) __riscv_vluxseg6ei16_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_u16mf4_m(...) __riscv_vluxseg7ei16_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_u16mf4_m(...) __riscv_vluxseg8ei16_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u16mf2_m(...) __riscv_vluxseg2ei16_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_u16mf2_m(...) __riscv_vluxseg3ei16_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_u16mf2_m(...) __riscv_vluxseg4ei16_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_u16mf2_m(...) __riscv_vluxseg5ei16_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_u16mf2_m(...) __riscv_vluxseg6ei16_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_u16mf2_m(...) __riscv_vluxseg7ei16_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_u16mf2_m(...) __riscv_vluxseg8ei16_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u16m1_m(...) __riscv_vluxseg2ei16_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_u16m1_m(...) __riscv_vluxseg3ei16_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_u16m1_m(...) __riscv_vluxseg4ei16_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_u16m1_m(...) __riscv_vluxseg5ei16_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_u16m1_m(...) __riscv_vluxseg6ei16_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_u16m1_m(...) __riscv_vluxseg7ei16_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_u16m1_m(...) __riscv_vluxseg8ei16_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u16m2_m(...) __riscv_vluxseg2ei16_v_u16m2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_u16m2_m(...) __riscv_vluxseg3ei16_v_u16m2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_u16m2_m(...) __riscv_vluxseg4ei16_v_u16m2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u16m4_m(...) __riscv_vluxseg2ei16_v_u16m4_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u16mf4_m(...) __riscv_vluxseg2ei32_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_u16mf4_m(...) __riscv_vluxseg3ei32_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_u16mf4_m(...) __riscv_vluxseg4ei32_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_u16mf4_m(...) __riscv_vluxseg5ei32_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_u16mf4_m(...) __riscv_vluxseg6ei32_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_u16mf4_m(...) __riscv_vluxseg7ei32_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_u16mf4_m(...) __riscv_vluxseg8ei32_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u16mf2_m(...) __riscv_vluxseg2ei32_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_u16mf2_m(...) __riscv_vluxseg3ei32_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_u16mf2_m(...) __riscv_vluxseg4ei32_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_u16mf2_m(...) __riscv_vluxseg5ei32_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_u16mf2_m(...) __riscv_vluxseg6ei32_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_u16mf2_m(...) __riscv_vluxseg7ei32_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_u16mf2_m(...) __riscv_vluxseg8ei32_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u16m1_m(...) __riscv_vluxseg2ei32_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_u16m1_m(...) __riscv_vluxseg3ei32_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_u16m1_m(...) __riscv_vluxseg4ei32_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_u16m1_m(...) __riscv_vluxseg5ei32_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_u16m1_m(...) __riscv_vluxseg6ei32_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_u16m1_m(...) __riscv_vluxseg7ei32_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_u16m1_m(...) __riscv_vluxseg8ei32_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u16m2_m(...) __riscv_vluxseg2ei32_v_u16m2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_u16m2_m(...) __riscv_vluxseg3ei32_v_u16m2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_u16m2_m(...) __riscv_vluxseg4ei32_v_u16m2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u16m4_m(...) __riscv_vluxseg2ei32_v_u16m4_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_u16mf4_m(...) __riscv_vluxseg2ei64_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_u16mf4_m(...) __riscv_vluxseg3ei64_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_u16mf4_m(...) __riscv_vluxseg4ei64_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_u16mf4_m(...) __riscv_vluxseg5ei64_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_u16mf4_m(...) __riscv_vluxseg6ei64_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_u16mf4_m(...) __riscv_vluxseg7ei64_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_u16mf4_m(...) __riscv_vluxseg8ei64_v_u16mf4_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_u16mf2_m(...) __riscv_vluxseg2ei64_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_u16mf2_m(...) __riscv_vluxseg3ei64_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_u16mf2_m(...) __riscv_vluxseg4ei64_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_u16mf2_m(...) __riscv_vluxseg5ei64_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_u16mf2_m(...) __riscv_vluxseg6ei64_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_u16mf2_m(...) __riscv_vluxseg7ei64_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_u16mf2_m(...) __riscv_vluxseg8ei64_v_u16mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_u16m1_m(...) __riscv_vluxseg2ei64_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_u16m1_m(...) __riscv_vluxseg3ei64_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_u16m1_m(...) __riscv_vluxseg4ei64_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_u16m1_m(...) __riscv_vluxseg5ei64_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_u16m1_m(...) __riscv_vluxseg6ei64_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_u16m1_m(...) __riscv_vluxseg7ei64_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_u16m1_m(...) __riscv_vluxseg8ei64_v_u16m1_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_u16m2_m(...) __riscv_vluxseg2ei64_v_u16m2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_u16m2_m(...) __riscv_vluxseg3ei64_v_u16m2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_u16m2_m(...) __riscv_vluxseg4ei64_v_u16m2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u32mf2_m(...) __riscv_vluxseg2ei8_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_u32mf2_m(...) __riscv_vluxseg3ei8_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_u32mf2_m(...) __riscv_vluxseg4ei8_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_u32mf2_m(...) __riscv_vluxseg5ei8_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_u32mf2_m(...) __riscv_vluxseg6ei8_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_u32mf2_m(...) __riscv_vluxseg7ei8_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_u32mf2_m(...) __riscv_vluxseg8ei8_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u32m1_m(...) __riscv_vluxseg2ei8_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_u32m1_m(...) __riscv_vluxseg3ei8_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_u32m1_m(...) __riscv_vluxseg4ei8_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_u32m1_m(...) __riscv_vluxseg5ei8_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_u32m1_m(...) __riscv_vluxseg6ei8_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_u32m1_m(...) __riscv_vluxseg7ei8_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_u32m1_m(...) __riscv_vluxseg8ei8_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u32m2_m(...) __riscv_vluxseg2ei8_v_u32m2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_u32m2_m(...) __riscv_vluxseg3ei8_v_u32m2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_u32m2_m(...) __riscv_vluxseg4ei8_v_u32m2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u32m4_m(...) __riscv_vluxseg2ei8_v_u32m4_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u32mf2_m(...) __riscv_vluxseg2ei16_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_u32mf2_m(...) __riscv_vluxseg3ei16_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_u32mf2_m(...) __riscv_vluxseg4ei16_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_u32mf2_m(...) __riscv_vluxseg5ei16_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_u32mf2_m(...) __riscv_vluxseg6ei16_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_u32mf2_m(...) __riscv_vluxseg7ei16_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_u32mf2_m(...) __riscv_vluxseg8ei16_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u32m1_m(...) __riscv_vluxseg2ei16_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_u32m1_m(...) __riscv_vluxseg3ei16_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_u32m1_m(...) __riscv_vluxseg4ei16_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_u32m1_m(...) __riscv_vluxseg5ei16_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_u32m1_m(...) __riscv_vluxseg6ei16_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_u32m1_m(...) __riscv_vluxseg7ei16_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_u32m1_m(...) __riscv_vluxseg8ei16_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u32m2_m(...) __riscv_vluxseg2ei16_v_u32m2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_u32m2_m(...) __riscv_vluxseg3ei16_v_u32m2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_u32m2_m(...) __riscv_vluxseg4ei16_v_u32m2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u32m4_m(...) __riscv_vluxseg2ei16_v_u32m4_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u32mf2_m(...) __riscv_vluxseg2ei32_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_u32mf2_m(...) __riscv_vluxseg3ei32_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_u32mf2_m(...) __riscv_vluxseg4ei32_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_u32mf2_m(...) __riscv_vluxseg5ei32_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_u32mf2_m(...) __riscv_vluxseg6ei32_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_u32mf2_m(...) __riscv_vluxseg7ei32_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_u32mf2_m(...) __riscv_vluxseg8ei32_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u32m1_m(...) __riscv_vluxseg2ei32_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_u32m1_m(...) __riscv_vluxseg3ei32_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_u32m1_m(...) __riscv_vluxseg4ei32_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_u32m1_m(...) __riscv_vluxseg5ei32_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_u32m1_m(...) __riscv_vluxseg6ei32_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_u32m1_m(...) __riscv_vluxseg7ei32_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_u32m1_m(...) __riscv_vluxseg8ei32_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u32m2_m(...) __riscv_vluxseg2ei32_v_u32m2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_u32m2_m(...) __riscv_vluxseg3ei32_v_u32m2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_u32m2_m(...) __riscv_vluxseg4ei32_v_u32m2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u32m4_m(...) __riscv_vluxseg2ei32_v_u32m4_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_u32mf2_m(...) __riscv_vluxseg2ei64_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_u32mf2_m(...) __riscv_vluxseg3ei64_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_u32mf2_m(...) __riscv_vluxseg4ei64_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_u32mf2_m(...) __riscv_vluxseg5ei64_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_u32mf2_m(...) __riscv_vluxseg6ei64_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_u32mf2_m(...) __riscv_vluxseg7ei64_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_u32mf2_m(...) __riscv_vluxseg8ei64_v_u32mf2_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_u32m1_m(...) __riscv_vluxseg2ei64_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_u32m1_m(...) __riscv_vluxseg3ei64_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_u32m1_m(...) __riscv_vluxseg4ei64_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_u32m1_m(...) __riscv_vluxseg5ei64_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_u32m1_m(...) __riscv_vluxseg6ei64_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_u32m1_m(...) __riscv_vluxseg7ei64_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_u32m1_m(...) __riscv_vluxseg8ei64_v_u32m1_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_u32m2_m(...) __riscv_vluxseg2ei64_v_u32m2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_u32m2_m(...) __riscv_vluxseg3ei64_v_u32m2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_u32m2_m(...) __riscv_vluxseg4ei64_v_u32m2_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_u32m4_m(...) __riscv_vluxseg2ei64_v_u32m4_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u64m1_m(...) __riscv_vluxseg2ei8_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_u64m1_m(...) __riscv_vluxseg3ei8_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_u64m1_m(...) __riscv_vluxseg4ei8_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg5ei8_v_u64m1_m(...) __riscv_vluxseg5ei8_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg6ei8_v_u64m1_m(...) __riscv_vluxseg6ei8_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg7ei8_v_u64m1_m(...) __riscv_vluxseg7ei8_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg8ei8_v_u64m1_m(...) __riscv_vluxseg8ei8_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u64m2_m(...) __riscv_vluxseg2ei8_v_u64m2_tumu(__VA_ARGS__)
+#define vluxseg3ei8_v_u64m2_m(...) __riscv_vluxseg3ei8_v_u64m2_tumu(__VA_ARGS__)
+#define vluxseg4ei8_v_u64m2_m(...) __riscv_vluxseg4ei8_v_u64m2_tumu(__VA_ARGS__)
+#define vluxseg2ei8_v_u64m4_m(...) __riscv_vluxseg2ei8_v_u64m4_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u64m1_m(...) __riscv_vluxseg2ei16_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_u64m1_m(...) __riscv_vluxseg3ei16_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_u64m1_m(...) __riscv_vluxseg4ei16_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg5ei16_v_u64m1_m(...) __riscv_vluxseg5ei16_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg6ei16_v_u64m1_m(...) __riscv_vluxseg6ei16_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg7ei16_v_u64m1_m(...) __riscv_vluxseg7ei16_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg8ei16_v_u64m1_m(...) __riscv_vluxseg8ei16_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u64m2_m(...) __riscv_vluxseg2ei16_v_u64m2_tumu(__VA_ARGS__)
+#define vluxseg3ei16_v_u64m2_m(...) __riscv_vluxseg3ei16_v_u64m2_tumu(__VA_ARGS__)
+#define vluxseg4ei16_v_u64m2_m(...) __riscv_vluxseg4ei16_v_u64m2_tumu(__VA_ARGS__)
+#define vluxseg2ei16_v_u64m4_m(...) __riscv_vluxseg2ei16_v_u64m4_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u64m1_m(...) __riscv_vluxseg2ei32_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_u64m1_m(...) __riscv_vluxseg3ei32_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_u64m1_m(...) __riscv_vluxseg4ei32_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg5ei32_v_u64m1_m(...) __riscv_vluxseg5ei32_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg6ei32_v_u64m1_m(...) __riscv_vluxseg6ei32_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg7ei32_v_u64m1_m(...) __riscv_vluxseg7ei32_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg8ei32_v_u64m1_m(...) __riscv_vluxseg8ei32_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u64m2_m(...) __riscv_vluxseg2ei32_v_u64m2_tumu(__VA_ARGS__)
+#define vluxseg3ei32_v_u64m2_m(...) __riscv_vluxseg3ei32_v_u64m2_tumu(__VA_ARGS__)
+#define vluxseg4ei32_v_u64m2_m(...) __riscv_vluxseg4ei32_v_u64m2_tumu(__VA_ARGS__)
+#define vluxseg2ei32_v_u64m4_m(...) __riscv_vluxseg2ei32_v_u64m4_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_u64m1_m(...) __riscv_vluxseg2ei64_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_u64m1_m(...) __riscv_vluxseg3ei64_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_u64m1_m(...) __riscv_vluxseg4ei64_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg5ei64_v_u64m1_m(...) __riscv_vluxseg5ei64_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg6ei64_v_u64m1_m(...) __riscv_vluxseg6ei64_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg7ei64_v_u64m1_m(...) __riscv_vluxseg7ei64_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg8ei64_v_u64m1_m(...) __riscv_vluxseg8ei64_v_u64m1_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_u64m2_m(...) __riscv_vluxseg2ei64_v_u64m2_tumu(__VA_ARGS__)
+#define vluxseg3ei64_v_u64m2_m(...) __riscv_vluxseg3ei64_v_u64m2_tumu(__VA_ARGS__)
+#define vluxseg4ei64_v_u64m2_m(...) __riscv_vluxseg4ei64_v_u64m2_tumu(__VA_ARGS__)
+#define vluxseg2ei64_v_u64m4_m(...) __riscv_vluxseg2ei64_v_u64m4_tumu(__VA_ARGS__)
+#define vsoxseg2ei8_v_f16mf4(...) __riscv_vsoxseg2ei8_v_f16mf4(__VA_ARGS__)
+#define vsoxseg3ei8_v_f16mf4(...) __riscv_vsoxseg3ei8_v_f16mf4(__VA_ARGS__)
+#define vsoxseg4ei8_v_f16mf4(...) __riscv_vsoxseg4ei8_v_f16mf4(__VA_ARGS__)
+#define vsoxseg5ei8_v_f16mf4(...) __riscv_vsoxseg5ei8_v_f16mf4(__VA_ARGS__)
+#define vsoxseg6ei8_v_f16mf4(...) __riscv_vsoxseg6ei8_v_f16mf4(__VA_ARGS__)
+#define vsoxseg7ei8_v_f16mf4(...) __riscv_vsoxseg7ei8_v_f16mf4(__VA_ARGS__)
+#define vsoxseg8ei8_v_f16mf4(...) __riscv_vsoxseg8ei8_v_f16mf4(__VA_ARGS__)
+#define vsoxseg2ei8_v_f16mf2(...) __riscv_vsoxseg2ei8_v_f16mf2(__VA_ARGS__)
+#define vsoxseg3ei8_v_f16mf2(...) __riscv_vsoxseg3ei8_v_f16mf2(__VA_ARGS__)
+#define vsoxseg4ei8_v_f16mf2(...) __riscv_vsoxseg4ei8_v_f16mf2(__VA_ARGS__)
+#define vsoxseg5ei8_v_f16mf2(...) __riscv_vsoxseg5ei8_v_f16mf2(__VA_ARGS__)
+#define vsoxseg6ei8_v_f16mf2(...) __riscv_vsoxseg6ei8_v_f16mf2(__VA_ARGS__)
+#define vsoxseg7ei8_v_f16mf2(...) __riscv_vsoxseg7ei8_v_f16mf2(__VA_ARGS__)
+#define vsoxseg8ei8_v_f16mf2(...) __riscv_vsoxseg8ei8_v_f16mf2(__VA_ARGS__)
+#define vsoxseg2ei8_v_f16m1(...) __riscv_vsoxseg2ei8_v_f16m1(__VA_ARGS__)
+#define vsoxseg3ei8_v_f16m1(...) __riscv_vsoxseg3ei8_v_f16m1(__VA_ARGS__)
+#define vsoxseg4ei8_v_f16m1(...) __riscv_vsoxseg4ei8_v_f16m1(__VA_ARGS__)
+#define vsoxseg5ei8_v_f16m1(...) __riscv_vsoxseg5ei8_v_f16m1(__VA_ARGS__)
+#define vsoxseg6ei8_v_f16m1(...) __riscv_vsoxseg6ei8_v_f16m1(__VA_ARGS__)
+#define vsoxseg7ei8_v_f16m1(...) __riscv_vsoxseg7ei8_v_f16m1(__VA_ARGS__)
+#define vsoxseg8ei8_v_f16m1(...) __riscv_vsoxseg8ei8_v_f16m1(__VA_ARGS__)
+#define vsoxseg2ei8_v_f16m2(...) __riscv_vsoxseg2ei8_v_f16m2(__VA_ARGS__)
+#define vsoxseg3ei8_v_f16m2(...) __riscv_vsoxseg3ei8_v_f16m2(__VA_ARGS__)
+#define vsoxseg4ei8_v_f16m2(...) __riscv_vsoxseg4ei8_v_f16m2(__VA_ARGS__)
+#define vsoxseg2ei8_v_f16m4(...) __riscv_vsoxseg2ei8_v_f16m4(__VA_ARGS__)
+#define vsoxseg2ei16_v_f16mf4(...) __riscv_vsoxseg2ei16_v_f16mf4(__VA_ARGS__)
+#define vsoxseg3ei16_v_f16mf4(...) __riscv_vsoxseg3ei16_v_f16mf4(__VA_ARGS__)
+#define vsoxseg4ei16_v_f16mf4(...) __riscv_vsoxseg4ei16_v_f16mf4(__VA_ARGS__)
+#define vsoxseg5ei16_v_f16mf4(...) __riscv_vsoxseg5ei16_v_f16mf4(__VA_ARGS__)
+#define vsoxseg6ei16_v_f16mf4(...) __riscv_vsoxseg6ei16_v_f16mf4(__VA_ARGS__)
+#define vsoxseg7ei16_v_f16mf4(...) __riscv_vsoxseg7ei16_v_f16mf4(__VA_ARGS__)
+#define vsoxseg8ei16_v_f16mf4(...) __riscv_vsoxseg8ei16_v_f16mf4(__VA_ARGS__)
+#define vsoxseg2ei16_v_f16mf2(...) __riscv_vsoxseg2ei16_v_f16mf2(__VA_ARGS__)
+#define vsoxseg3ei16_v_f16mf2(...) __riscv_vsoxseg3ei16_v_f16mf2(__VA_ARGS__)
+#define vsoxseg4ei16_v_f16mf2(...) __riscv_vsoxseg4ei16_v_f16mf2(__VA_ARGS__)
+#define vsoxseg5ei16_v_f16mf2(...) __riscv_vsoxseg5ei16_v_f16mf2(__VA_ARGS__)
+#define vsoxseg6ei16_v_f16mf2(...) __riscv_vsoxseg6ei16_v_f16mf2(__VA_ARGS__)
+#define vsoxseg7ei16_v_f16mf2(...) __riscv_vsoxseg7ei16_v_f16mf2(__VA_ARGS__)
+#define vsoxseg8ei16_v_f16mf2(...) __riscv_vsoxseg8ei16_v_f16mf2(__VA_ARGS__)
+#define vsoxseg2ei16_v_f16m1(...) __riscv_vsoxseg2ei16_v_f16m1(__VA_ARGS__)
+#define vsoxseg3ei16_v_f16m1(...) __riscv_vsoxseg3ei16_v_f16m1(__VA_ARGS__)
+#define vsoxseg4ei16_v_f16m1(...) __riscv_vsoxseg4ei16_v_f16m1(__VA_ARGS__)
+#define vsoxseg5ei16_v_f16m1(...) __riscv_vsoxseg5ei16_v_f16m1(__VA_ARGS__)
+#define vsoxseg6ei16_v_f16m1(...) __riscv_vsoxseg6ei16_v_f16m1(__VA_ARGS__)
+#define vsoxseg7ei16_v_f16m1(...) __riscv_vsoxseg7ei16_v_f16m1(__VA_ARGS__)
+#define vsoxseg8ei16_v_f16m1(...) __riscv_vsoxseg8ei16_v_f16m1(__VA_ARGS__)
+#define vsoxseg2ei16_v_f16m2(...) __riscv_vsoxseg2ei16_v_f16m2(__VA_ARGS__)
+#define vsoxseg3ei16_v_f16m2(...) __riscv_vsoxseg3ei16_v_f16m2(__VA_ARGS__)
+#define vsoxseg4ei16_v_f16m2(...) __riscv_vsoxseg4ei16_v_f16m2(__VA_ARGS__)
+#define vsoxseg2ei16_v_f16m4(...) __riscv_vsoxseg2ei16_v_f16m4(__VA_ARGS__)
+#define vsoxseg2ei32_v_f16mf4(...) __riscv_vsoxseg2ei32_v_f16mf4(__VA_ARGS__)
+#define vsoxseg3ei32_v_f16mf4(...) __riscv_vsoxseg3ei32_v_f16mf4(__VA_ARGS__)
+#define vsoxseg4ei32_v_f16mf4(...) __riscv_vsoxseg4ei32_v_f16mf4(__VA_ARGS__)
+#define vsoxseg5ei32_v_f16mf4(...) __riscv_vsoxseg5ei32_v_f16mf4(__VA_ARGS__)
+#define vsoxseg6ei32_v_f16mf4(...) __riscv_vsoxseg6ei32_v_f16mf4(__VA_ARGS__)
+#define vsoxseg7ei32_v_f16mf4(...) __riscv_vsoxseg7ei32_v_f16mf4(__VA_ARGS__)
+#define vsoxseg8ei32_v_f16mf4(...) __riscv_vsoxseg8ei32_v_f16mf4(__VA_ARGS__)
+#define vsoxseg2ei32_v_f16mf2(...) __riscv_vsoxseg2ei32_v_f16mf2(__VA_ARGS__)
+#define vsoxseg3ei32_v_f16mf2(...) __riscv_vsoxseg3ei32_v_f16mf2(__VA_ARGS__)
+#define vsoxseg4ei32_v_f16mf2(...) __riscv_vsoxseg4ei32_v_f16mf2(__VA_ARGS__)
+#define vsoxseg5ei32_v_f16mf2(...) __riscv_vsoxseg5ei32_v_f16mf2(__VA_ARGS__)
+#define vsoxseg6ei32_v_f16mf2(...) __riscv_vsoxseg6ei32_v_f16mf2(__VA_ARGS__)
+#define vsoxseg7ei32_v_f16mf2(...) __riscv_vsoxseg7ei32_v_f16mf2(__VA_ARGS__)
+#define vsoxseg8ei32_v_f16mf2(...) __riscv_vsoxseg8ei32_v_f16mf2(__VA_ARGS__)
+#define vsoxseg2ei32_v_f16m1(...) __riscv_vsoxseg2ei32_v_f16m1(__VA_ARGS__)
+#define vsoxseg3ei32_v_f16m1(...) __riscv_vsoxseg3ei32_v_f16m1(__VA_ARGS__)
+#define vsoxseg4ei32_v_f16m1(...) __riscv_vsoxseg4ei32_v_f16m1(__VA_ARGS__)
+#define vsoxseg5ei32_v_f16m1(...) __riscv_vsoxseg5ei32_v_f16m1(__VA_ARGS__)
+#define vsoxseg6ei32_v_f16m1(...) __riscv_vsoxseg6ei32_v_f16m1(__VA_ARGS__)
+#define vsoxseg7ei32_v_f16m1(...) __riscv_vsoxseg7ei32_v_f16m1(__VA_ARGS__)
+#define vsoxseg8ei32_v_f16m1(...) __riscv_vsoxseg8ei32_v_f16m1(__VA_ARGS__)
+#define vsoxseg2ei32_v_f16m2(...) __riscv_vsoxseg2ei32_v_f16m2(__VA_ARGS__)
+#define vsoxseg3ei32_v_f16m2(...) __riscv_vsoxseg3ei32_v_f16m2(__VA_ARGS__)
+#define vsoxseg4ei32_v_f16m2(...) __riscv_vsoxseg4ei32_v_f16m2(__VA_ARGS__)
+#define vsoxseg2ei32_v_f16m4(...) __riscv_vsoxseg2ei32_v_f16m4(__VA_ARGS__)
+#define vsoxseg2ei64_v_f16mf4(...) __riscv_vsoxseg2ei64_v_f16mf4(__VA_ARGS__)
+#define vsoxseg3ei64_v_f16mf4(...) __riscv_vsoxseg3ei64_v_f16mf4(__VA_ARGS__)
+#define vsoxseg4ei64_v_f16mf4(...) __riscv_vsoxseg4ei64_v_f16mf4(__VA_ARGS__)
+#define vsoxseg5ei64_v_f16mf4(...) __riscv_vsoxseg5ei64_v_f16mf4(__VA_ARGS__)
+#define vsoxseg6ei64_v_f16mf4(...) __riscv_vsoxseg6ei64_v_f16mf4(__VA_ARGS__)
+#define vsoxseg7ei64_v_f16mf4(...) __riscv_vsoxseg7ei64_v_f16mf4(__VA_ARGS__)
+#define vsoxseg8ei64_v_f16mf4(...) __riscv_vsoxseg8ei64_v_f16mf4(__VA_ARGS__)
+#define vsoxseg2ei64_v_f16mf2(...) __riscv_vsoxseg2ei64_v_f16mf2(__VA_ARGS__)
+#define vsoxseg3ei64_v_f16mf2(...) __riscv_vsoxseg3ei64_v_f16mf2(__VA_ARGS__)
+#define vsoxseg4ei64_v_f16mf2(...) __riscv_vsoxseg4ei64_v_f16mf2(__VA_ARGS__)
+#define vsoxseg5ei64_v_f16mf2(...) __riscv_vsoxseg5ei64_v_f16mf2(__VA_ARGS__)
+#define vsoxseg6ei64_v_f16mf2(...) __riscv_vsoxseg6ei64_v_f16mf2(__VA_ARGS__)
+#define vsoxseg7ei64_v_f16mf2(...) __riscv_vsoxseg7ei64_v_f16mf2(__VA_ARGS__)
+#define vsoxseg8ei64_v_f16mf2(...) __riscv_vsoxseg8ei64_v_f16mf2(__VA_ARGS__)
+#define vsoxseg2ei64_v_f16m1(...) __riscv_vsoxseg2ei64_v_f16m1(__VA_ARGS__)
+#define vsoxseg3ei64_v_f16m1(...) __riscv_vsoxseg3ei64_v_f16m1(__VA_ARGS__)
+#define vsoxseg4ei64_v_f16m1(...) __riscv_vsoxseg4ei64_v_f16m1(__VA_ARGS__)
+#define vsoxseg5ei64_v_f16m1(...) __riscv_vsoxseg5ei64_v_f16m1(__VA_ARGS__)
+#define vsoxseg6ei64_v_f16m1(...) __riscv_vsoxseg6ei64_v_f16m1(__VA_ARGS__)
+#define vsoxseg7ei64_v_f16m1(...) __riscv_vsoxseg7ei64_v_f16m1(__VA_ARGS__)
+#define vsoxseg8ei64_v_f16m1(...) __riscv_vsoxseg8ei64_v_f16m1(__VA_ARGS__)
+#define vsoxseg2ei64_v_f16m2(...) __riscv_vsoxseg2ei64_v_f16m2(__VA_ARGS__)
+#define vsoxseg3ei64_v_f16m2(...) __riscv_vsoxseg3ei64_v_f16m2(__VA_ARGS__)
+#define vsoxseg4ei64_v_f16m2(...) __riscv_vsoxseg4ei64_v_f16m2(__VA_ARGS__)
+#define vsoxseg2ei8_v_f32mf2(...) __riscv_vsoxseg2ei8_v_f32mf2(__VA_ARGS__)
+#define vsoxseg3ei8_v_f32mf2(...) __riscv_vsoxseg3ei8_v_f32mf2(__VA_ARGS__)
+#define vsoxseg4ei8_v_f32mf2(...) __riscv_vsoxseg4ei8_v_f32mf2(__VA_ARGS__)
+#define vsoxseg5ei8_v_f32mf2(...) __riscv_vsoxseg5ei8_v_f32mf2(__VA_ARGS__)
+#define vsoxseg6ei8_v_f32mf2(...) __riscv_vsoxseg6ei8_v_f32mf2(__VA_ARGS__)
+#define vsoxseg7ei8_v_f32mf2(...) __riscv_vsoxseg7ei8_v_f32mf2(__VA_ARGS__)
+#define vsoxseg8ei8_v_f32mf2(...) __riscv_vsoxseg8ei8_v_f32mf2(__VA_ARGS__)
+#define vsoxseg2ei8_v_f32m1(...) __riscv_vsoxseg2ei8_v_f32m1(__VA_ARGS__)
+#define vsoxseg3ei8_v_f32m1(...) __riscv_vsoxseg3ei8_v_f32m1(__VA_ARGS__)
+#define vsoxseg4ei8_v_f32m1(...) __riscv_vsoxseg4ei8_v_f32m1(__VA_ARGS__)
+#define vsoxseg5ei8_v_f32m1(...) __riscv_vsoxseg5ei8_v_f32m1(__VA_ARGS__)
+#define vsoxseg6ei8_v_f32m1(...) __riscv_vsoxseg6ei8_v_f32m1(__VA_ARGS__)
+#define vsoxseg7ei8_v_f32m1(...) __riscv_vsoxseg7ei8_v_f32m1(__VA_ARGS__)
+#define vsoxseg8ei8_v_f32m1(...) __riscv_vsoxseg8ei8_v_f32m1(__VA_ARGS__)
+#define vsoxseg2ei8_v_f32m2(...) __riscv_vsoxseg2ei8_v_f32m2(__VA_ARGS__)
+#define vsoxseg3ei8_v_f32m2(...) __riscv_vsoxseg3ei8_v_f32m2(__VA_ARGS__)
+#define vsoxseg4ei8_v_f32m2(...) __riscv_vsoxseg4ei8_v_f32m2(__VA_ARGS__)
+#define vsoxseg2ei8_v_f32m4(...) __riscv_vsoxseg2ei8_v_f32m4(__VA_ARGS__)
+#define vsoxseg2ei16_v_f32mf2(...) __riscv_vsoxseg2ei16_v_f32mf2(__VA_ARGS__)
+#define vsoxseg3ei16_v_f32mf2(...) __riscv_vsoxseg3ei16_v_f32mf2(__VA_ARGS__)
+#define vsoxseg4ei16_v_f32mf2(...) __riscv_vsoxseg4ei16_v_f32mf2(__VA_ARGS__)
+#define vsoxseg5ei16_v_f32mf2(...) __riscv_vsoxseg5ei16_v_f32mf2(__VA_ARGS__)
+#define vsoxseg6ei16_v_f32mf2(...) __riscv_vsoxseg6ei16_v_f32mf2(__VA_ARGS__)
+#define vsoxseg7ei16_v_f32mf2(...) __riscv_vsoxseg7ei16_v_f32mf2(__VA_ARGS__)
+#define vsoxseg8ei16_v_f32mf2(...) __riscv_vsoxseg8ei16_v_f32mf2(__VA_ARGS__)
+#define vsoxseg2ei16_v_f32m1(...) __riscv_vsoxseg2ei16_v_f32m1(__VA_ARGS__)
+#define vsoxseg3ei16_v_f32m1(...) __riscv_vsoxseg3ei16_v_f32m1(__VA_ARGS__)
+#define vsoxseg4ei16_v_f32m1(...) __riscv_vsoxseg4ei16_v_f32m1(__VA_ARGS__)
+#define vsoxseg5ei16_v_f32m1(...) __riscv_vsoxseg5ei16_v_f32m1(__VA_ARGS__)
+#define vsoxseg6ei16_v_f32m1(...) __riscv_vsoxseg6ei16_v_f32m1(__VA_ARGS__)
+#define vsoxseg7ei16_v_f32m1(...) __riscv_vsoxseg7ei16_v_f32m1(__VA_ARGS__)
+#define vsoxseg8ei16_v_f32m1(...) __riscv_vsoxseg8ei16_v_f32m1(__VA_ARGS__)
+#define vsoxseg2ei16_v_f32m2(...) __riscv_vsoxseg2ei16_v_f32m2(__VA_ARGS__)
+#define vsoxseg3ei16_v_f32m2(...) __riscv_vsoxseg3ei16_v_f32m2(__VA_ARGS__)
+#define vsoxseg4ei16_v_f32m2(...) __riscv_vsoxseg4ei16_v_f32m2(__VA_ARGS__)
+#define vsoxseg2ei16_v_f32m4(...) __riscv_vsoxseg2ei16_v_f32m4(__VA_ARGS__)
+#define vsoxseg2ei32_v_f32mf2(...) __riscv_vsoxseg2ei32_v_f32mf2(__VA_ARGS__)
+#define vsoxseg3ei32_v_f32mf2(...) __riscv_vsoxseg3ei32_v_f32mf2(__VA_ARGS__)
+#define vsoxseg4ei32_v_f32mf2(...) __riscv_vsoxseg4ei32_v_f32mf2(__VA_ARGS__)
+#define vsoxseg5ei32_v_f32mf2(...) __riscv_vsoxseg5ei32_v_f32mf2(__VA_ARGS__)
+#define vsoxseg6ei32_v_f32mf2(...) __riscv_vsoxseg6ei32_v_f32mf2(__VA_ARGS__)
+#define vsoxseg7ei32_v_f32mf2(...) __riscv_vsoxseg7ei32_v_f32mf2(__VA_ARGS__)
+#define vsoxseg8ei32_v_f32mf2(...) __riscv_vsoxseg8ei32_v_f32mf2(__VA_ARGS__)
+#define vsoxseg2ei32_v_f32m1(...) __riscv_vsoxseg2ei32_v_f32m1(__VA_ARGS__)
+#define vsoxseg3ei32_v_f32m1(...) __riscv_vsoxseg3ei32_v_f32m1(__VA_ARGS__)
+#define vsoxseg4ei32_v_f32m1(...) __riscv_vsoxseg4ei32_v_f32m1(__VA_ARGS__)
+#define vsoxseg5ei32_v_f32m1(...) __riscv_vsoxseg5ei32_v_f32m1(__VA_ARGS__)
+#define vsoxseg6ei32_v_f32m1(...) __riscv_vsoxseg6ei32_v_f32m1(__VA_ARGS__)
+#define vsoxseg7ei32_v_f32m1(...) __riscv_vsoxseg7ei32_v_f32m1(__VA_ARGS__)
+#define vsoxseg8ei32_v_f32m1(...) __riscv_vsoxseg8ei32_v_f32m1(__VA_ARGS__)
+#define vsoxseg2ei32_v_f32m2(...) __riscv_vsoxseg2ei32_v_f32m2(__VA_ARGS__)
+#define vsoxseg3ei32_v_f32m2(...) __riscv_vsoxseg3ei32_v_f32m2(__VA_ARGS__)
+#define vsoxseg4ei32_v_f32m2(...) __riscv_vsoxseg4ei32_v_f32m2(__VA_ARGS__)
+#define vsoxseg2ei32_v_f32m4(...) __riscv_vsoxseg2ei32_v_f32m4(__VA_ARGS__)
+#define vsoxseg2ei64_v_f32mf2(...) __riscv_vsoxseg2ei64_v_f32mf2(__VA_ARGS__)
+#define vsoxseg3ei64_v_f32mf2(...) __riscv_vsoxseg3ei64_v_f32mf2(__VA_ARGS__)
+#define vsoxseg4ei64_v_f32mf2(...) __riscv_vsoxseg4ei64_v_f32mf2(__VA_ARGS__)
+#define vsoxseg5ei64_v_f32mf2(...) __riscv_vsoxseg5ei64_v_f32mf2(__VA_ARGS__)
+#define vsoxseg6ei64_v_f32mf2(...) __riscv_vsoxseg6ei64_v_f32mf2(__VA_ARGS__)
+#define vsoxseg7ei64_v_f32mf2(...) __riscv_vsoxseg7ei64_v_f32mf2(__VA_ARGS__)
+#define vsoxseg8ei64_v_f32mf2(...) __riscv_vsoxseg8ei64_v_f32mf2(__VA_ARGS__)
+#define vsoxseg2ei64_v_f32m1(...) __riscv_vsoxseg2ei64_v_f32m1(__VA_ARGS__)
+#define vsoxseg3ei64_v_f32m1(...) __riscv_vsoxseg3ei64_v_f32m1(__VA_ARGS__)
+#define vsoxseg4ei64_v_f32m1(...) __riscv_vsoxseg4ei64_v_f32m1(__VA_ARGS__)
+#define vsoxseg5ei64_v_f32m1(...) __riscv_vsoxseg5ei64_v_f32m1(__VA_ARGS__)
+#define vsoxseg6ei64_v_f32m1(...) __riscv_vsoxseg6ei64_v_f32m1(__VA_ARGS__)
+#define vsoxseg7ei64_v_f32m1(...) __riscv_vsoxseg7ei64_v_f32m1(__VA_ARGS__)
+#define vsoxseg8ei64_v_f32m1(...) __riscv_vsoxseg8ei64_v_f32m1(__VA_ARGS__)
+#define vsoxseg2ei64_v_f32m2(...) __riscv_vsoxseg2ei64_v_f32m2(__VA_ARGS__)
+#define vsoxseg3ei64_v_f32m2(...) __riscv_vsoxseg3ei64_v_f32m2(__VA_ARGS__)
+#define vsoxseg4ei64_v_f32m2(...) __riscv_vsoxseg4ei64_v_f32m2(__VA_ARGS__)
+#define vsoxseg2ei64_v_f32m4(...) __riscv_vsoxseg2ei64_v_f32m4(__VA_ARGS__)
+#define vsoxseg2ei8_v_f64m1(...) __riscv_vsoxseg2ei8_v_f64m1(__VA_ARGS__)  // vsoxseg* f64 aliases (ei8/ei16/ei32/ei64): each unprefixed name expands to the same name with a __riscv_ prefix, passing __VA_ARGS__ through unchanged
+#define vsoxseg3ei8_v_f64m1(...) __riscv_vsoxseg3ei8_v_f64m1(__VA_ARGS__)
+#define vsoxseg4ei8_v_f64m1(...) __riscv_vsoxseg4ei8_v_f64m1(__VA_ARGS__)
+#define vsoxseg5ei8_v_f64m1(...) __riscv_vsoxseg5ei8_v_f64m1(__VA_ARGS__)
+#define vsoxseg6ei8_v_f64m1(...) __riscv_vsoxseg6ei8_v_f64m1(__VA_ARGS__)
+#define vsoxseg7ei8_v_f64m1(...) __riscv_vsoxseg7ei8_v_f64m1(__VA_ARGS__)
+#define vsoxseg8ei8_v_f64m1(...) __riscv_vsoxseg8ei8_v_f64m1(__VA_ARGS__)
+#define vsoxseg2ei8_v_f64m2(...) __riscv_vsoxseg2ei8_v_f64m2(__VA_ARGS__)
+#define vsoxseg3ei8_v_f64m2(...) __riscv_vsoxseg3ei8_v_f64m2(__VA_ARGS__)
+#define vsoxseg4ei8_v_f64m2(...) __riscv_vsoxseg4ei8_v_f64m2(__VA_ARGS__)
+#define vsoxseg2ei8_v_f64m4(...) __riscv_vsoxseg2ei8_v_f64m4(__VA_ARGS__)
+#define vsoxseg2ei16_v_f64m1(...) __riscv_vsoxseg2ei16_v_f64m1(__VA_ARGS__)
+#define vsoxseg3ei16_v_f64m1(...) __riscv_vsoxseg3ei16_v_f64m1(__VA_ARGS__)
+#define vsoxseg4ei16_v_f64m1(...) __riscv_vsoxseg4ei16_v_f64m1(__VA_ARGS__)
+#define vsoxseg5ei16_v_f64m1(...) __riscv_vsoxseg5ei16_v_f64m1(__VA_ARGS__)
+#define vsoxseg6ei16_v_f64m1(...) __riscv_vsoxseg6ei16_v_f64m1(__VA_ARGS__)
+#define vsoxseg7ei16_v_f64m1(...) __riscv_vsoxseg7ei16_v_f64m1(__VA_ARGS__)
+#define vsoxseg8ei16_v_f64m1(...) __riscv_vsoxseg8ei16_v_f64m1(__VA_ARGS__)
+#define vsoxseg2ei16_v_f64m2(...) __riscv_vsoxseg2ei16_v_f64m2(__VA_ARGS__)
+#define vsoxseg3ei16_v_f64m2(...) __riscv_vsoxseg3ei16_v_f64m2(__VA_ARGS__)
+#define vsoxseg4ei16_v_f64m2(...) __riscv_vsoxseg4ei16_v_f64m2(__VA_ARGS__)
+#define vsoxseg2ei16_v_f64m4(...) __riscv_vsoxseg2ei16_v_f64m4(__VA_ARGS__)
+#define vsoxseg2ei32_v_f64m1(...) __riscv_vsoxseg2ei32_v_f64m1(__VA_ARGS__)
+#define vsoxseg3ei32_v_f64m1(...) __riscv_vsoxseg3ei32_v_f64m1(__VA_ARGS__)
+#define vsoxseg4ei32_v_f64m1(...) __riscv_vsoxseg4ei32_v_f64m1(__VA_ARGS__)
+#define vsoxseg5ei32_v_f64m1(...) __riscv_vsoxseg5ei32_v_f64m1(__VA_ARGS__)
+#define vsoxseg6ei32_v_f64m1(...) __riscv_vsoxseg6ei32_v_f64m1(__VA_ARGS__)
+#define vsoxseg7ei32_v_f64m1(...) __riscv_vsoxseg7ei32_v_f64m1(__VA_ARGS__)
+#define vsoxseg8ei32_v_f64m1(...) __riscv_vsoxseg8ei32_v_f64m1(__VA_ARGS__)
+#define vsoxseg2ei32_v_f64m2(...) __riscv_vsoxseg2ei32_v_f64m2(__VA_ARGS__)
+#define vsoxseg3ei32_v_f64m2(...) __riscv_vsoxseg3ei32_v_f64m2(__VA_ARGS__)
+#define vsoxseg4ei32_v_f64m2(...) __riscv_vsoxseg4ei32_v_f64m2(__VA_ARGS__)
+#define vsoxseg2ei32_v_f64m4(...) __riscv_vsoxseg2ei32_v_f64m4(__VA_ARGS__)
+#define vsoxseg2ei64_v_f64m1(...) __riscv_vsoxseg2ei64_v_f64m1(__VA_ARGS__)
+#define vsoxseg3ei64_v_f64m1(...) __riscv_vsoxseg3ei64_v_f64m1(__VA_ARGS__)
+#define vsoxseg4ei64_v_f64m1(...) __riscv_vsoxseg4ei64_v_f64m1(__VA_ARGS__)
+#define vsoxseg5ei64_v_f64m1(...) __riscv_vsoxseg5ei64_v_f64m1(__VA_ARGS__)
+#define vsoxseg6ei64_v_f64m1(...) __riscv_vsoxseg6ei64_v_f64m1(__VA_ARGS__)
+#define vsoxseg7ei64_v_f64m1(...) __riscv_vsoxseg7ei64_v_f64m1(__VA_ARGS__)
+#define vsoxseg8ei64_v_f64m1(...) __riscv_vsoxseg8ei64_v_f64m1(__VA_ARGS__)
+#define vsoxseg2ei64_v_f64m2(...) __riscv_vsoxseg2ei64_v_f64m2(__VA_ARGS__)
+#define vsoxseg3ei64_v_f64m2(...) __riscv_vsoxseg3ei64_v_f64m2(__VA_ARGS__)
+#define vsoxseg4ei64_v_f64m2(...) __riscv_vsoxseg4ei64_v_f64m2(__VA_ARGS__)
+#define vsoxseg2ei64_v_f64m4(...) __riscv_vsoxseg2ei64_v_f64m4(__VA_ARGS__)
+#define vsuxseg2ei8_v_f16mf4(...) __riscv_vsuxseg2ei8_v_f16mf4(__VA_ARGS__)  // vsuxseg* f16 aliases (ei8/ei16/ei32/ei64): each unprefixed name expands to the same name with a __riscv_ prefix, passing __VA_ARGS__ through unchanged; the ei64 group lists no f16m4 entry
+#define vsuxseg3ei8_v_f16mf4(...) __riscv_vsuxseg3ei8_v_f16mf4(__VA_ARGS__)
+#define vsuxseg4ei8_v_f16mf4(...) __riscv_vsuxseg4ei8_v_f16mf4(__VA_ARGS__)
+#define vsuxseg5ei8_v_f16mf4(...) __riscv_vsuxseg5ei8_v_f16mf4(__VA_ARGS__)
+#define vsuxseg6ei8_v_f16mf4(...) __riscv_vsuxseg6ei8_v_f16mf4(__VA_ARGS__)
+#define vsuxseg7ei8_v_f16mf4(...) __riscv_vsuxseg7ei8_v_f16mf4(__VA_ARGS__)
+#define vsuxseg8ei8_v_f16mf4(...) __riscv_vsuxseg8ei8_v_f16mf4(__VA_ARGS__)
+#define vsuxseg2ei8_v_f16mf2(...) __riscv_vsuxseg2ei8_v_f16mf2(__VA_ARGS__)
+#define vsuxseg3ei8_v_f16mf2(...) __riscv_vsuxseg3ei8_v_f16mf2(__VA_ARGS__)
+#define vsuxseg4ei8_v_f16mf2(...) __riscv_vsuxseg4ei8_v_f16mf2(__VA_ARGS__)
+#define vsuxseg5ei8_v_f16mf2(...) __riscv_vsuxseg5ei8_v_f16mf2(__VA_ARGS__)
+#define vsuxseg6ei8_v_f16mf2(...) __riscv_vsuxseg6ei8_v_f16mf2(__VA_ARGS__)
+#define vsuxseg7ei8_v_f16mf2(...) __riscv_vsuxseg7ei8_v_f16mf2(__VA_ARGS__)
+#define vsuxseg8ei8_v_f16mf2(...) __riscv_vsuxseg8ei8_v_f16mf2(__VA_ARGS__)
+#define vsuxseg2ei8_v_f16m1(...) __riscv_vsuxseg2ei8_v_f16m1(__VA_ARGS__)
+#define vsuxseg3ei8_v_f16m1(...) __riscv_vsuxseg3ei8_v_f16m1(__VA_ARGS__)
+#define vsuxseg4ei8_v_f16m1(...) __riscv_vsuxseg4ei8_v_f16m1(__VA_ARGS__)
+#define vsuxseg5ei8_v_f16m1(...) __riscv_vsuxseg5ei8_v_f16m1(__VA_ARGS__)
+#define vsuxseg6ei8_v_f16m1(...) __riscv_vsuxseg6ei8_v_f16m1(__VA_ARGS__)
+#define vsuxseg7ei8_v_f16m1(...) __riscv_vsuxseg7ei8_v_f16m1(__VA_ARGS__)
+#define vsuxseg8ei8_v_f16m1(...) __riscv_vsuxseg8ei8_v_f16m1(__VA_ARGS__)
+#define vsuxseg2ei8_v_f16m2(...) __riscv_vsuxseg2ei8_v_f16m2(__VA_ARGS__)
+#define vsuxseg3ei8_v_f16m2(...) __riscv_vsuxseg3ei8_v_f16m2(__VA_ARGS__)
+#define vsuxseg4ei8_v_f16m2(...) __riscv_vsuxseg4ei8_v_f16m2(__VA_ARGS__)
+#define vsuxseg2ei8_v_f16m4(...) __riscv_vsuxseg2ei8_v_f16m4(__VA_ARGS__)
+#define vsuxseg2ei16_v_f16mf4(...) __riscv_vsuxseg2ei16_v_f16mf4(__VA_ARGS__)
+#define vsuxseg3ei16_v_f16mf4(...) __riscv_vsuxseg3ei16_v_f16mf4(__VA_ARGS__)
+#define vsuxseg4ei16_v_f16mf4(...) __riscv_vsuxseg4ei16_v_f16mf4(__VA_ARGS__)
+#define vsuxseg5ei16_v_f16mf4(...) __riscv_vsuxseg5ei16_v_f16mf4(__VA_ARGS__)
+#define vsuxseg6ei16_v_f16mf4(...) __riscv_vsuxseg6ei16_v_f16mf4(__VA_ARGS__)
+#define vsuxseg7ei16_v_f16mf4(...) __riscv_vsuxseg7ei16_v_f16mf4(__VA_ARGS__)
+#define vsuxseg8ei16_v_f16mf4(...) __riscv_vsuxseg8ei16_v_f16mf4(__VA_ARGS__)
+#define vsuxseg2ei16_v_f16mf2(...) __riscv_vsuxseg2ei16_v_f16mf2(__VA_ARGS__)
+#define vsuxseg3ei16_v_f16mf2(...) __riscv_vsuxseg3ei16_v_f16mf2(__VA_ARGS__)
+#define vsuxseg4ei16_v_f16mf2(...) __riscv_vsuxseg4ei16_v_f16mf2(__VA_ARGS__)
+#define vsuxseg5ei16_v_f16mf2(...) __riscv_vsuxseg5ei16_v_f16mf2(__VA_ARGS__)
+#define vsuxseg6ei16_v_f16mf2(...) __riscv_vsuxseg6ei16_v_f16mf2(__VA_ARGS__)
+#define vsuxseg7ei16_v_f16mf2(...) __riscv_vsuxseg7ei16_v_f16mf2(__VA_ARGS__)
+#define vsuxseg8ei16_v_f16mf2(...) __riscv_vsuxseg8ei16_v_f16mf2(__VA_ARGS__)
+#define vsuxseg2ei16_v_f16m1(...) __riscv_vsuxseg2ei16_v_f16m1(__VA_ARGS__)
+#define vsuxseg3ei16_v_f16m1(...) __riscv_vsuxseg3ei16_v_f16m1(__VA_ARGS__)
+#define vsuxseg4ei16_v_f16m1(...) __riscv_vsuxseg4ei16_v_f16m1(__VA_ARGS__)
+#define vsuxseg5ei16_v_f16m1(...) __riscv_vsuxseg5ei16_v_f16m1(__VA_ARGS__)
+#define vsuxseg6ei16_v_f16m1(...) __riscv_vsuxseg6ei16_v_f16m1(__VA_ARGS__)
+#define vsuxseg7ei16_v_f16m1(...) __riscv_vsuxseg7ei16_v_f16m1(__VA_ARGS__)
+#define vsuxseg8ei16_v_f16m1(...) __riscv_vsuxseg8ei16_v_f16m1(__VA_ARGS__)
+#define vsuxseg2ei16_v_f16m2(...) __riscv_vsuxseg2ei16_v_f16m2(__VA_ARGS__)
+#define vsuxseg3ei16_v_f16m2(...) __riscv_vsuxseg3ei16_v_f16m2(__VA_ARGS__)
+#define vsuxseg4ei16_v_f16m2(...) __riscv_vsuxseg4ei16_v_f16m2(__VA_ARGS__)
+#define vsuxseg2ei16_v_f16m4(...) __riscv_vsuxseg2ei16_v_f16m4(__VA_ARGS__)
+#define vsuxseg2ei32_v_f16mf4(...) __riscv_vsuxseg2ei32_v_f16mf4(__VA_ARGS__)
+#define vsuxseg3ei32_v_f16mf4(...) __riscv_vsuxseg3ei32_v_f16mf4(__VA_ARGS__)
+#define vsuxseg4ei32_v_f16mf4(...) __riscv_vsuxseg4ei32_v_f16mf4(__VA_ARGS__)
+#define vsuxseg5ei32_v_f16mf4(...) __riscv_vsuxseg5ei32_v_f16mf4(__VA_ARGS__)
+#define vsuxseg6ei32_v_f16mf4(...) __riscv_vsuxseg6ei32_v_f16mf4(__VA_ARGS__)
+#define vsuxseg7ei32_v_f16mf4(...) __riscv_vsuxseg7ei32_v_f16mf4(__VA_ARGS__)
+#define vsuxseg8ei32_v_f16mf4(...) __riscv_vsuxseg8ei32_v_f16mf4(__VA_ARGS__)
+#define vsuxseg2ei32_v_f16mf2(...) __riscv_vsuxseg2ei32_v_f16mf2(__VA_ARGS__)
+#define vsuxseg3ei32_v_f16mf2(...) __riscv_vsuxseg3ei32_v_f16mf2(__VA_ARGS__)
+#define vsuxseg4ei32_v_f16mf2(...) __riscv_vsuxseg4ei32_v_f16mf2(__VA_ARGS__)
+#define vsuxseg5ei32_v_f16mf2(...) __riscv_vsuxseg5ei32_v_f16mf2(__VA_ARGS__)
+#define vsuxseg6ei32_v_f16mf2(...) __riscv_vsuxseg6ei32_v_f16mf2(__VA_ARGS__)
+#define vsuxseg7ei32_v_f16mf2(...) __riscv_vsuxseg7ei32_v_f16mf2(__VA_ARGS__)
+#define vsuxseg8ei32_v_f16mf2(...) __riscv_vsuxseg8ei32_v_f16mf2(__VA_ARGS__)
+#define vsuxseg2ei32_v_f16m1(...) __riscv_vsuxseg2ei32_v_f16m1(__VA_ARGS__)
+#define vsuxseg3ei32_v_f16m1(...) __riscv_vsuxseg3ei32_v_f16m1(__VA_ARGS__)
+#define vsuxseg4ei32_v_f16m1(...) __riscv_vsuxseg4ei32_v_f16m1(__VA_ARGS__)
+#define vsuxseg5ei32_v_f16m1(...) __riscv_vsuxseg5ei32_v_f16m1(__VA_ARGS__)
+#define vsuxseg6ei32_v_f16m1(...) __riscv_vsuxseg6ei32_v_f16m1(__VA_ARGS__)
+#define vsuxseg7ei32_v_f16m1(...) __riscv_vsuxseg7ei32_v_f16m1(__VA_ARGS__)
+#define vsuxseg8ei32_v_f16m1(...) __riscv_vsuxseg8ei32_v_f16m1(__VA_ARGS__)
+#define vsuxseg2ei32_v_f16m2(...) __riscv_vsuxseg2ei32_v_f16m2(__VA_ARGS__)
+#define vsuxseg3ei32_v_f16m2(...) __riscv_vsuxseg3ei32_v_f16m2(__VA_ARGS__)
+#define vsuxseg4ei32_v_f16m2(...) __riscv_vsuxseg4ei32_v_f16m2(__VA_ARGS__)
+#define vsuxseg2ei32_v_f16m4(...) __riscv_vsuxseg2ei32_v_f16m4(__VA_ARGS__)
+#define vsuxseg2ei64_v_f16mf4(...) __riscv_vsuxseg2ei64_v_f16mf4(__VA_ARGS__)
+#define vsuxseg3ei64_v_f16mf4(...) __riscv_vsuxseg3ei64_v_f16mf4(__VA_ARGS__)
+#define vsuxseg4ei64_v_f16mf4(...) __riscv_vsuxseg4ei64_v_f16mf4(__VA_ARGS__)
+#define vsuxseg5ei64_v_f16mf4(...) __riscv_vsuxseg5ei64_v_f16mf4(__VA_ARGS__)
+#define vsuxseg6ei64_v_f16mf4(...) __riscv_vsuxseg6ei64_v_f16mf4(__VA_ARGS__)
+#define vsuxseg7ei64_v_f16mf4(...) __riscv_vsuxseg7ei64_v_f16mf4(__VA_ARGS__)
+#define vsuxseg8ei64_v_f16mf4(...) __riscv_vsuxseg8ei64_v_f16mf4(__VA_ARGS__)
+#define vsuxseg2ei64_v_f16mf2(...) __riscv_vsuxseg2ei64_v_f16mf2(__VA_ARGS__)
+#define vsuxseg3ei64_v_f16mf2(...) __riscv_vsuxseg3ei64_v_f16mf2(__VA_ARGS__)
+#define vsuxseg4ei64_v_f16mf2(...) __riscv_vsuxseg4ei64_v_f16mf2(__VA_ARGS__)
+#define vsuxseg5ei64_v_f16mf2(...) __riscv_vsuxseg5ei64_v_f16mf2(__VA_ARGS__)
+#define vsuxseg6ei64_v_f16mf2(...) __riscv_vsuxseg6ei64_v_f16mf2(__VA_ARGS__)
+#define vsuxseg7ei64_v_f16mf2(...) __riscv_vsuxseg7ei64_v_f16mf2(__VA_ARGS__)
+#define vsuxseg8ei64_v_f16mf2(...) __riscv_vsuxseg8ei64_v_f16mf2(__VA_ARGS__)
+#define vsuxseg2ei64_v_f16m1(...) __riscv_vsuxseg2ei64_v_f16m1(__VA_ARGS__)
+#define vsuxseg3ei64_v_f16m1(...) __riscv_vsuxseg3ei64_v_f16m1(__VA_ARGS__)
+#define vsuxseg4ei64_v_f16m1(...) __riscv_vsuxseg4ei64_v_f16m1(__VA_ARGS__)
+#define vsuxseg5ei64_v_f16m1(...) __riscv_vsuxseg5ei64_v_f16m1(__VA_ARGS__)
+#define vsuxseg6ei64_v_f16m1(...) __riscv_vsuxseg6ei64_v_f16m1(__VA_ARGS__)
+#define vsuxseg7ei64_v_f16m1(...) __riscv_vsuxseg7ei64_v_f16m1(__VA_ARGS__)
+#define vsuxseg8ei64_v_f16m1(...) __riscv_vsuxseg8ei64_v_f16m1(__VA_ARGS__)
+#define vsuxseg2ei64_v_f16m2(...) __riscv_vsuxseg2ei64_v_f16m2(__VA_ARGS__)
+#define vsuxseg3ei64_v_f16m2(...) __riscv_vsuxseg3ei64_v_f16m2(__VA_ARGS__)
+#define vsuxseg4ei64_v_f16m2(...) __riscv_vsuxseg4ei64_v_f16m2(__VA_ARGS__)
+#define vsuxseg2ei8_v_f32mf2(...) __riscv_vsuxseg2ei8_v_f32mf2(__VA_ARGS__)  // vsuxseg* f32 aliases (ei8/ei16/ei32/ei64): each unprefixed name expands to the same name with a __riscv_ prefix, passing __VA_ARGS__ through unchanged
+#define vsuxseg3ei8_v_f32mf2(...) __riscv_vsuxseg3ei8_v_f32mf2(__VA_ARGS__)
+#define vsuxseg4ei8_v_f32mf2(...) __riscv_vsuxseg4ei8_v_f32mf2(__VA_ARGS__)
+#define vsuxseg5ei8_v_f32mf2(...) __riscv_vsuxseg5ei8_v_f32mf2(__VA_ARGS__)
+#define vsuxseg6ei8_v_f32mf2(...) __riscv_vsuxseg6ei8_v_f32mf2(__VA_ARGS__)
+#define vsuxseg7ei8_v_f32mf2(...) __riscv_vsuxseg7ei8_v_f32mf2(__VA_ARGS__)
+#define vsuxseg8ei8_v_f32mf2(...) __riscv_vsuxseg8ei8_v_f32mf2(__VA_ARGS__)
+#define vsuxseg2ei8_v_f32m1(...) __riscv_vsuxseg2ei8_v_f32m1(__VA_ARGS__)
+#define vsuxseg3ei8_v_f32m1(...) __riscv_vsuxseg3ei8_v_f32m1(__VA_ARGS__)
+#define vsuxseg4ei8_v_f32m1(...) __riscv_vsuxseg4ei8_v_f32m1(__VA_ARGS__)
+#define vsuxseg5ei8_v_f32m1(...) __riscv_vsuxseg5ei8_v_f32m1(__VA_ARGS__)
+#define vsuxseg6ei8_v_f32m1(...) __riscv_vsuxseg6ei8_v_f32m1(__VA_ARGS__)
+#define vsuxseg7ei8_v_f32m1(...) __riscv_vsuxseg7ei8_v_f32m1(__VA_ARGS__)
+#define vsuxseg8ei8_v_f32m1(...) __riscv_vsuxseg8ei8_v_f32m1(__VA_ARGS__)
+#define vsuxseg2ei8_v_f32m2(...) __riscv_vsuxseg2ei8_v_f32m2(__VA_ARGS__)
+#define vsuxseg3ei8_v_f32m2(...) __riscv_vsuxseg3ei8_v_f32m2(__VA_ARGS__)
+#define vsuxseg4ei8_v_f32m2(...) __riscv_vsuxseg4ei8_v_f32m2(__VA_ARGS__)
+#define vsuxseg2ei8_v_f32m4(...) __riscv_vsuxseg2ei8_v_f32m4(__VA_ARGS__)
+#define vsuxseg2ei16_v_f32mf2(...) __riscv_vsuxseg2ei16_v_f32mf2(__VA_ARGS__)
+#define vsuxseg3ei16_v_f32mf2(...) __riscv_vsuxseg3ei16_v_f32mf2(__VA_ARGS__)
+#define vsuxseg4ei16_v_f32mf2(...) __riscv_vsuxseg4ei16_v_f32mf2(__VA_ARGS__)
+#define vsuxseg5ei16_v_f32mf2(...) __riscv_vsuxseg5ei16_v_f32mf2(__VA_ARGS__)
+#define vsuxseg6ei16_v_f32mf2(...) __riscv_vsuxseg6ei16_v_f32mf2(__VA_ARGS__)
+#define vsuxseg7ei16_v_f32mf2(...) __riscv_vsuxseg7ei16_v_f32mf2(__VA_ARGS__)
+#define vsuxseg8ei16_v_f32mf2(...) __riscv_vsuxseg8ei16_v_f32mf2(__VA_ARGS__)
+#define vsuxseg2ei16_v_f32m1(...) __riscv_vsuxseg2ei16_v_f32m1(__VA_ARGS__)
+#define vsuxseg3ei16_v_f32m1(...) __riscv_vsuxseg3ei16_v_f32m1(__VA_ARGS__)
+#define vsuxseg4ei16_v_f32m1(...) __riscv_vsuxseg4ei16_v_f32m1(__VA_ARGS__)
+#define vsuxseg5ei16_v_f32m1(...) __riscv_vsuxseg5ei16_v_f32m1(__VA_ARGS__)
+#define vsuxseg6ei16_v_f32m1(...) __riscv_vsuxseg6ei16_v_f32m1(__VA_ARGS__)
+#define vsuxseg7ei16_v_f32m1(...) __riscv_vsuxseg7ei16_v_f32m1(__VA_ARGS__)
+#define vsuxseg8ei16_v_f32m1(...) __riscv_vsuxseg8ei16_v_f32m1(__VA_ARGS__)
+#define vsuxseg2ei16_v_f32m2(...) __riscv_vsuxseg2ei16_v_f32m2(__VA_ARGS__)
+#define vsuxseg3ei16_v_f32m2(...) __riscv_vsuxseg3ei16_v_f32m2(__VA_ARGS__)
+#define vsuxseg4ei16_v_f32m2(...) __riscv_vsuxseg4ei16_v_f32m2(__VA_ARGS__)
+#define vsuxseg2ei16_v_f32m4(...) __riscv_vsuxseg2ei16_v_f32m4(__VA_ARGS__)
+#define vsuxseg2ei32_v_f32mf2(...) __riscv_vsuxseg2ei32_v_f32mf2(__VA_ARGS__)
+#define vsuxseg3ei32_v_f32mf2(...) __riscv_vsuxseg3ei32_v_f32mf2(__VA_ARGS__)
+#define vsuxseg4ei32_v_f32mf2(...) __riscv_vsuxseg4ei32_v_f32mf2(__VA_ARGS__)
+#define vsuxseg5ei32_v_f32mf2(...) __riscv_vsuxseg5ei32_v_f32mf2(__VA_ARGS__)
+#define vsuxseg6ei32_v_f32mf2(...) __riscv_vsuxseg6ei32_v_f32mf2(__VA_ARGS__)
+#define vsuxseg7ei32_v_f32mf2(...) __riscv_vsuxseg7ei32_v_f32mf2(__VA_ARGS__)
+#define vsuxseg8ei32_v_f32mf2(...) __riscv_vsuxseg8ei32_v_f32mf2(__VA_ARGS__)
+#define vsuxseg2ei32_v_f32m1(...) __riscv_vsuxseg2ei32_v_f32m1(__VA_ARGS__)
+#define vsuxseg3ei32_v_f32m1(...) __riscv_vsuxseg3ei32_v_f32m1(__VA_ARGS__)
+#define vsuxseg4ei32_v_f32m1(...) __riscv_vsuxseg4ei32_v_f32m1(__VA_ARGS__)
+#define vsuxseg5ei32_v_f32m1(...) __riscv_vsuxseg5ei32_v_f32m1(__VA_ARGS__)
+#define vsuxseg6ei32_v_f32m1(...) __riscv_vsuxseg6ei32_v_f32m1(__VA_ARGS__)
+#define vsuxseg7ei32_v_f32m1(...) __riscv_vsuxseg7ei32_v_f32m1(__VA_ARGS__)
+#define vsuxseg8ei32_v_f32m1(...) __riscv_vsuxseg8ei32_v_f32m1(__VA_ARGS__)
+#define vsuxseg2ei32_v_f32m2(...) __riscv_vsuxseg2ei32_v_f32m2(__VA_ARGS__)
+#define vsuxseg3ei32_v_f32m2(...) __riscv_vsuxseg3ei32_v_f32m2(__VA_ARGS__)
+#define vsuxseg4ei32_v_f32m2(...) __riscv_vsuxseg4ei32_v_f32m2(__VA_ARGS__)
+#define vsuxseg2ei32_v_f32m4(...) __riscv_vsuxseg2ei32_v_f32m4(__VA_ARGS__)
+#define vsuxseg2ei64_v_f32mf2(...) __riscv_vsuxseg2ei64_v_f32mf2(__VA_ARGS__)
+#define vsuxseg3ei64_v_f32mf2(...) __riscv_vsuxseg3ei64_v_f32mf2(__VA_ARGS__)
+#define vsuxseg4ei64_v_f32mf2(...) __riscv_vsuxseg4ei64_v_f32mf2(__VA_ARGS__)
+#define vsuxseg5ei64_v_f32mf2(...) __riscv_vsuxseg5ei64_v_f32mf2(__VA_ARGS__)
+#define vsuxseg6ei64_v_f32mf2(...) __riscv_vsuxseg6ei64_v_f32mf2(__VA_ARGS__)
+#define vsuxseg7ei64_v_f32mf2(...) __riscv_vsuxseg7ei64_v_f32mf2(__VA_ARGS__)
+#define vsuxseg8ei64_v_f32mf2(...) __riscv_vsuxseg8ei64_v_f32mf2(__VA_ARGS__)
+#define vsuxseg2ei64_v_f32m1(...) __riscv_vsuxseg2ei64_v_f32m1(__VA_ARGS__)
+#define vsuxseg3ei64_v_f32m1(...) __riscv_vsuxseg3ei64_v_f32m1(__VA_ARGS__)
+#define vsuxseg4ei64_v_f32m1(...) __riscv_vsuxseg4ei64_v_f32m1(__VA_ARGS__)
+#define vsuxseg5ei64_v_f32m1(...) __riscv_vsuxseg5ei64_v_f32m1(__VA_ARGS__)
+#define vsuxseg6ei64_v_f32m1(...) __riscv_vsuxseg6ei64_v_f32m1(__VA_ARGS__)
+#define vsuxseg7ei64_v_f32m1(...) __riscv_vsuxseg7ei64_v_f32m1(__VA_ARGS__)
+#define vsuxseg8ei64_v_f32m1(...) __riscv_vsuxseg8ei64_v_f32m1(__VA_ARGS__)
+#define vsuxseg2ei64_v_f32m2(...) __riscv_vsuxseg2ei64_v_f32m2(__VA_ARGS__)
+#define vsuxseg3ei64_v_f32m2(...) __riscv_vsuxseg3ei64_v_f32m2(__VA_ARGS__)
+#define vsuxseg4ei64_v_f32m2(...) __riscv_vsuxseg4ei64_v_f32m2(__VA_ARGS__)
+#define vsuxseg2ei64_v_f32m4(...) __riscv_vsuxseg2ei64_v_f32m4(__VA_ARGS__)
+#define vsuxseg2ei8_v_f64m1(...) __riscv_vsuxseg2ei8_v_f64m1(__VA_ARGS__)  // vsuxseg* f64 aliases (ei8/ei16/ei32/ei64): each unprefixed name expands to the same name with a __riscv_ prefix, passing __VA_ARGS__ through unchanged
+#define vsuxseg3ei8_v_f64m1(...) __riscv_vsuxseg3ei8_v_f64m1(__VA_ARGS__)
+#define vsuxseg4ei8_v_f64m1(...) __riscv_vsuxseg4ei8_v_f64m1(__VA_ARGS__)
+#define vsuxseg5ei8_v_f64m1(...) __riscv_vsuxseg5ei8_v_f64m1(__VA_ARGS__)
+#define vsuxseg6ei8_v_f64m1(...) __riscv_vsuxseg6ei8_v_f64m1(__VA_ARGS__)
+#define vsuxseg7ei8_v_f64m1(...) __riscv_vsuxseg7ei8_v_f64m1(__VA_ARGS__)
+#define vsuxseg8ei8_v_f64m1(...) __riscv_vsuxseg8ei8_v_f64m1(__VA_ARGS__)
+#define vsuxseg2ei8_v_f64m2(...) __riscv_vsuxseg2ei8_v_f64m2(__VA_ARGS__)
+#define vsuxseg3ei8_v_f64m2(...) __riscv_vsuxseg3ei8_v_f64m2(__VA_ARGS__)
+#define vsuxseg4ei8_v_f64m2(...) __riscv_vsuxseg4ei8_v_f64m2(__VA_ARGS__)
+#define vsuxseg2ei8_v_f64m4(...) __riscv_vsuxseg2ei8_v_f64m4(__VA_ARGS__)
+#define vsuxseg2ei16_v_f64m1(...) __riscv_vsuxseg2ei16_v_f64m1(__VA_ARGS__)
+#define vsuxseg3ei16_v_f64m1(...) __riscv_vsuxseg3ei16_v_f64m1(__VA_ARGS__)
+#define vsuxseg4ei16_v_f64m1(...) __riscv_vsuxseg4ei16_v_f64m1(__VA_ARGS__)
+#define vsuxseg5ei16_v_f64m1(...) __riscv_vsuxseg5ei16_v_f64m1(__VA_ARGS__)
+#define vsuxseg6ei16_v_f64m1(...) __riscv_vsuxseg6ei16_v_f64m1(__VA_ARGS__)
+#define vsuxseg7ei16_v_f64m1(...) __riscv_vsuxseg7ei16_v_f64m1(__VA_ARGS__)
+#define vsuxseg8ei16_v_f64m1(...) __riscv_vsuxseg8ei16_v_f64m1(__VA_ARGS__)
+#define vsuxseg2ei16_v_f64m2(...) __riscv_vsuxseg2ei16_v_f64m2(__VA_ARGS__)
+#define vsuxseg3ei16_v_f64m2(...) __riscv_vsuxseg3ei16_v_f64m2(__VA_ARGS__)
+#define vsuxseg4ei16_v_f64m2(...) __riscv_vsuxseg4ei16_v_f64m2(__VA_ARGS__)
+#define vsuxseg2ei16_v_f64m4(...) __riscv_vsuxseg2ei16_v_f64m4(__VA_ARGS__)
+#define vsuxseg2ei32_v_f64m1(...) __riscv_vsuxseg2ei32_v_f64m1(__VA_ARGS__)
+#define vsuxseg3ei32_v_f64m1(...) __riscv_vsuxseg3ei32_v_f64m1(__VA_ARGS__)
+#define vsuxseg4ei32_v_f64m1(...) __riscv_vsuxseg4ei32_v_f64m1(__VA_ARGS__)
+#define vsuxseg5ei32_v_f64m1(...) __riscv_vsuxseg5ei32_v_f64m1(__VA_ARGS__)
+#define vsuxseg6ei32_v_f64m1(...) __riscv_vsuxseg6ei32_v_f64m1(__VA_ARGS__)
+#define vsuxseg7ei32_v_f64m1(...) __riscv_vsuxseg7ei32_v_f64m1(__VA_ARGS__)
+#define vsuxseg8ei32_v_f64m1(...) __riscv_vsuxseg8ei32_v_f64m1(__VA_ARGS__)
+#define vsuxseg2ei32_v_f64m2(...) __riscv_vsuxseg2ei32_v_f64m2(__VA_ARGS__)
+#define vsuxseg3ei32_v_f64m2(...) __riscv_vsuxseg3ei32_v_f64m2(__VA_ARGS__)
+#define vsuxseg4ei32_v_f64m2(...) __riscv_vsuxseg4ei32_v_f64m2(__VA_ARGS__)
+#define vsuxseg2ei32_v_f64m4(...) __riscv_vsuxseg2ei32_v_f64m4(__VA_ARGS__)
+#define vsuxseg2ei64_v_f64m1(...) __riscv_vsuxseg2ei64_v_f64m1(__VA_ARGS__)
+#define vsuxseg3ei64_v_f64m1(...) __riscv_vsuxseg3ei64_v_f64m1(__VA_ARGS__)
+#define vsuxseg4ei64_v_f64m1(...) __riscv_vsuxseg4ei64_v_f64m1(__VA_ARGS__)
+#define vsuxseg5ei64_v_f64m1(...) __riscv_vsuxseg5ei64_v_f64m1(__VA_ARGS__)
+#define vsuxseg6ei64_v_f64m1(...) __riscv_vsuxseg6ei64_v_f64m1(__VA_ARGS__)
+#define vsuxseg7ei64_v_f64m1(...) __riscv_vsuxseg7ei64_v_f64m1(__VA_ARGS__)
+#define vsuxseg8ei64_v_f64m1(...) __riscv_vsuxseg8ei64_v_f64m1(__VA_ARGS__)
+#define vsuxseg2ei64_v_f64m2(...) __riscv_vsuxseg2ei64_v_f64m2(__VA_ARGS__)
+#define vsuxseg3ei64_v_f64m2(...) __riscv_vsuxseg3ei64_v_f64m2(__VA_ARGS__)
+#define vsuxseg4ei64_v_f64m2(...) __riscv_vsuxseg4ei64_v_f64m2(__VA_ARGS__)
+#define vsuxseg2ei64_v_f64m4(...) __riscv_vsuxseg2ei64_v_f64m4(__VA_ARGS__)
+#define vsoxseg2ei8_v_i8mf8(...) __riscv_vsoxseg2ei8_v_i8mf8(__VA_ARGS__)  // vsoxseg* i8 aliases (ei8/ei16/ei32/ei64): each unprefixed name expands to the same name with a __riscv_ prefix, passing __VA_ARGS__ through unchanged; the ei32 group lists no i8m4 entry and the ei64 group stops at i8m1
+#define vsoxseg3ei8_v_i8mf8(...) __riscv_vsoxseg3ei8_v_i8mf8(__VA_ARGS__)
+#define vsoxseg4ei8_v_i8mf8(...) __riscv_vsoxseg4ei8_v_i8mf8(__VA_ARGS__)
+#define vsoxseg5ei8_v_i8mf8(...) __riscv_vsoxseg5ei8_v_i8mf8(__VA_ARGS__)
+#define vsoxseg6ei8_v_i8mf8(...) __riscv_vsoxseg6ei8_v_i8mf8(__VA_ARGS__)
+#define vsoxseg7ei8_v_i8mf8(...) __riscv_vsoxseg7ei8_v_i8mf8(__VA_ARGS__)
+#define vsoxseg8ei8_v_i8mf8(...) __riscv_vsoxseg8ei8_v_i8mf8(__VA_ARGS__)
+#define vsoxseg2ei8_v_i8mf4(...) __riscv_vsoxseg2ei8_v_i8mf4(__VA_ARGS__)
+#define vsoxseg3ei8_v_i8mf4(...) __riscv_vsoxseg3ei8_v_i8mf4(__VA_ARGS__)
+#define vsoxseg4ei8_v_i8mf4(...) __riscv_vsoxseg4ei8_v_i8mf4(__VA_ARGS__)
+#define vsoxseg5ei8_v_i8mf4(...) __riscv_vsoxseg5ei8_v_i8mf4(__VA_ARGS__)
+#define vsoxseg6ei8_v_i8mf4(...) __riscv_vsoxseg6ei8_v_i8mf4(__VA_ARGS__)
+#define vsoxseg7ei8_v_i8mf4(...) __riscv_vsoxseg7ei8_v_i8mf4(__VA_ARGS__)
+#define vsoxseg8ei8_v_i8mf4(...) __riscv_vsoxseg8ei8_v_i8mf4(__VA_ARGS__)
+#define vsoxseg2ei8_v_i8mf2(...) __riscv_vsoxseg2ei8_v_i8mf2(__VA_ARGS__)
+#define vsoxseg3ei8_v_i8mf2(...) __riscv_vsoxseg3ei8_v_i8mf2(__VA_ARGS__)
+#define vsoxseg4ei8_v_i8mf2(...) __riscv_vsoxseg4ei8_v_i8mf2(__VA_ARGS__)
+#define vsoxseg5ei8_v_i8mf2(...) __riscv_vsoxseg5ei8_v_i8mf2(__VA_ARGS__)
+#define vsoxseg6ei8_v_i8mf2(...) __riscv_vsoxseg6ei8_v_i8mf2(__VA_ARGS__)
+#define vsoxseg7ei8_v_i8mf2(...) __riscv_vsoxseg7ei8_v_i8mf2(__VA_ARGS__)
+#define vsoxseg8ei8_v_i8mf2(...) __riscv_vsoxseg8ei8_v_i8mf2(__VA_ARGS__)
+#define vsoxseg2ei8_v_i8m1(...) __riscv_vsoxseg2ei8_v_i8m1(__VA_ARGS__)
+#define vsoxseg3ei8_v_i8m1(...) __riscv_vsoxseg3ei8_v_i8m1(__VA_ARGS__)
+#define vsoxseg4ei8_v_i8m1(...) __riscv_vsoxseg4ei8_v_i8m1(__VA_ARGS__)
+#define vsoxseg5ei8_v_i8m1(...) __riscv_vsoxseg5ei8_v_i8m1(__VA_ARGS__)
+#define vsoxseg6ei8_v_i8m1(...) __riscv_vsoxseg6ei8_v_i8m1(__VA_ARGS__)
+#define vsoxseg7ei8_v_i8m1(...) __riscv_vsoxseg7ei8_v_i8m1(__VA_ARGS__)
+#define vsoxseg8ei8_v_i8m1(...) __riscv_vsoxseg8ei8_v_i8m1(__VA_ARGS__)
+#define vsoxseg2ei8_v_i8m2(...) __riscv_vsoxseg2ei8_v_i8m2(__VA_ARGS__)
+#define vsoxseg3ei8_v_i8m2(...) __riscv_vsoxseg3ei8_v_i8m2(__VA_ARGS__)
+#define vsoxseg4ei8_v_i8m2(...) __riscv_vsoxseg4ei8_v_i8m2(__VA_ARGS__)
+#define vsoxseg2ei8_v_i8m4(...) __riscv_vsoxseg2ei8_v_i8m4(__VA_ARGS__)
+#define vsoxseg2ei16_v_i8mf8(...) __riscv_vsoxseg2ei16_v_i8mf8(__VA_ARGS__)
+#define vsoxseg3ei16_v_i8mf8(...) __riscv_vsoxseg3ei16_v_i8mf8(__VA_ARGS__)
+#define vsoxseg4ei16_v_i8mf8(...) __riscv_vsoxseg4ei16_v_i8mf8(__VA_ARGS__)
+#define vsoxseg5ei16_v_i8mf8(...) __riscv_vsoxseg5ei16_v_i8mf8(__VA_ARGS__)
+#define vsoxseg6ei16_v_i8mf8(...) __riscv_vsoxseg6ei16_v_i8mf8(__VA_ARGS__)
+#define vsoxseg7ei16_v_i8mf8(...) __riscv_vsoxseg7ei16_v_i8mf8(__VA_ARGS__)
+#define vsoxseg8ei16_v_i8mf8(...) __riscv_vsoxseg8ei16_v_i8mf8(__VA_ARGS__)
+#define vsoxseg2ei16_v_i8mf4(...) __riscv_vsoxseg2ei16_v_i8mf4(__VA_ARGS__)
+#define vsoxseg3ei16_v_i8mf4(...) __riscv_vsoxseg3ei16_v_i8mf4(__VA_ARGS__)
+#define vsoxseg4ei16_v_i8mf4(...) __riscv_vsoxseg4ei16_v_i8mf4(__VA_ARGS__)
+#define vsoxseg5ei16_v_i8mf4(...) __riscv_vsoxseg5ei16_v_i8mf4(__VA_ARGS__)
+#define vsoxseg6ei16_v_i8mf4(...) __riscv_vsoxseg6ei16_v_i8mf4(__VA_ARGS__)
+#define vsoxseg7ei16_v_i8mf4(...) __riscv_vsoxseg7ei16_v_i8mf4(__VA_ARGS__)
+#define vsoxseg8ei16_v_i8mf4(...) __riscv_vsoxseg8ei16_v_i8mf4(__VA_ARGS__)
+#define vsoxseg2ei16_v_i8mf2(...) __riscv_vsoxseg2ei16_v_i8mf2(__VA_ARGS__)
+#define vsoxseg3ei16_v_i8mf2(...) __riscv_vsoxseg3ei16_v_i8mf2(__VA_ARGS__)
+#define vsoxseg4ei16_v_i8mf2(...) __riscv_vsoxseg4ei16_v_i8mf2(__VA_ARGS__)
+#define vsoxseg5ei16_v_i8mf2(...) __riscv_vsoxseg5ei16_v_i8mf2(__VA_ARGS__)
+#define vsoxseg6ei16_v_i8mf2(...) __riscv_vsoxseg6ei16_v_i8mf2(__VA_ARGS__)
+#define vsoxseg7ei16_v_i8mf2(...) __riscv_vsoxseg7ei16_v_i8mf2(__VA_ARGS__)
+#define vsoxseg8ei16_v_i8mf2(...) __riscv_vsoxseg8ei16_v_i8mf2(__VA_ARGS__)
+#define vsoxseg2ei16_v_i8m1(...) __riscv_vsoxseg2ei16_v_i8m1(__VA_ARGS__)
+#define vsoxseg3ei16_v_i8m1(...) __riscv_vsoxseg3ei16_v_i8m1(__VA_ARGS__)
+#define vsoxseg4ei16_v_i8m1(...) __riscv_vsoxseg4ei16_v_i8m1(__VA_ARGS__)
+#define vsoxseg5ei16_v_i8m1(...) __riscv_vsoxseg5ei16_v_i8m1(__VA_ARGS__)
+#define vsoxseg6ei16_v_i8m1(...) __riscv_vsoxseg6ei16_v_i8m1(__VA_ARGS__)
+#define vsoxseg7ei16_v_i8m1(...) __riscv_vsoxseg7ei16_v_i8m1(__VA_ARGS__)
+#define vsoxseg8ei16_v_i8m1(...) __riscv_vsoxseg8ei16_v_i8m1(__VA_ARGS__)
+#define vsoxseg2ei16_v_i8m2(...) __riscv_vsoxseg2ei16_v_i8m2(__VA_ARGS__)
+#define vsoxseg3ei16_v_i8m2(...) __riscv_vsoxseg3ei16_v_i8m2(__VA_ARGS__)
+#define vsoxseg4ei16_v_i8m2(...) __riscv_vsoxseg4ei16_v_i8m2(__VA_ARGS__)
+#define vsoxseg2ei16_v_i8m4(...) __riscv_vsoxseg2ei16_v_i8m4(__VA_ARGS__)
+#define vsoxseg2ei32_v_i8mf8(...) __riscv_vsoxseg2ei32_v_i8mf8(__VA_ARGS__)
+#define vsoxseg3ei32_v_i8mf8(...) __riscv_vsoxseg3ei32_v_i8mf8(__VA_ARGS__)
+#define vsoxseg4ei32_v_i8mf8(...) __riscv_vsoxseg4ei32_v_i8mf8(__VA_ARGS__)
+#define vsoxseg5ei32_v_i8mf8(...) __riscv_vsoxseg5ei32_v_i8mf8(__VA_ARGS__)
+#define vsoxseg6ei32_v_i8mf8(...) __riscv_vsoxseg6ei32_v_i8mf8(__VA_ARGS__)
+#define vsoxseg7ei32_v_i8mf8(...) __riscv_vsoxseg7ei32_v_i8mf8(__VA_ARGS__)
+#define vsoxseg8ei32_v_i8mf8(...) __riscv_vsoxseg8ei32_v_i8mf8(__VA_ARGS__)
+#define vsoxseg2ei32_v_i8mf4(...) __riscv_vsoxseg2ei32_v_i8mf4(__VA_ARGS__)
+#define vsoxseg3ei32_v_i8mf4(...) __riscv_vsoxseg3ei32_v_i8mf4(__VA_ARGS__)
+#define vsoxseg4ei32_v_i8mf4(...) __riscv_vsoxseg4ei32_v_i8mf4(__VA_ARGS__)
+#define vsoxseg5ei32_v_i8mf4(...) __riscv_vsoxseg5ei32_v_i8mf4(__VA_ARGS__)
+#define vsoxseg6ei32_v_i8mf4(...) __riscv_vsoxseg6ei32_v_i8mf4(__VA_ARGS__)
+#define vsoxseg7ei32_v_i8mf4(...) __riscv_vsoxseg7ei32_v_i8mf4(__VA_ARGS__)
+#define vsoxseg8ei32_v_i8mf4(...) __riscv_vsoxseg8ei32_v_i8mf4(__VA_ARGS__)
+#define vsoxseg2ei32_v_i8mf2(...) __riscv_vsoxseg2ei32_v_i8mf2(__VA_ARGS__)
+#define vsoxseg3ei32_v_i8mf2(...) __riscv_vsoxseg3ei32_v_i8mf2(__VA_ARGS__)
+#define vsoxseg4ei32_v_i8mf2(...) __riscv_vsoxseg4ei32_v_i8mf2(__VA_ARGS__)
+#define vsoxseg5ei32_v_i8mf2(...) __riscv_vsoxseg5ei32_v_i8mf2(__VA_ARGS__)
+#define vsoxseg6ei32_v_i8mf2(...) __riscv_vsoxseg6ei32_v_i8mf2(__VA_ARGS__)
+#define vsoxseg7ei32_v_i8mf2(...) __riscv_vsoxseg7ei32_v_i8mf2(__VA_ARGS__)
+#define vsoxseg8ei32_v_i8mf2(...) __riscv_vsoxseg8ei32_v_i8mf2(__VA_ARGS__)
+#define vsoxseg2ei32_v_i8m1(...) __riscv_vsoxseg2ei32_v_i8m1(__VA_ARGS__)
+#define vsoxseg3ei32_v_i8m1(...) __riscv_vsoxseg3ei32_v_i8m1(__VA_ARGS__)
+#define vsoxseg4ei32_v_i8m1(...) __riscv_vsoxseg4ei32_v_i8m1(__VA_ARGS__)
+#define vsoxseg5ei32_v_i8m1(...) __riscv_vsoxseg5ei32_v_i8m1(__VA_ARGS__)
+#define vsoxseg6ei32_v_i8m1(...) __riscv_vsoxseg6ei32_v_i8m1(__VA_ARGS__)
+#define vsoxseg7ei32_v_i8m1(...) __riscv_vsoxseg7ei32_v_i8m1(__VA_ARGS__)
+#define vsoxseg8ei32_v_i8m1(...) __riscv_vsoxseg8ei32_v_i8m1(__VA_ARGS__)
+#define vsoxseg2ei32_v_i8m2(...) __riscv_vsoxseg2ei32_v_i8m2(__VA_ARGS__)
+#define vsoxseg3ei32_v_i8m2(...) __riscv_vsoxseg3ei32_v_i8m2(__VA_ARGS__)
+#define vsoxseg4ei32_v_i8m2(...) __riscv_vsoxseg4ei32_v_i8m2(__VA_ARGS__)
+#define vsoxseg2ei64_v_i8mf8(...) __riscv_vsoxseg2ei64_v_i8mf8(__VA_ARGS__)
+#define vsoxseg3ei64_v_i8mf8(...) __riscv_vsoxseg3ei64_v_i8mf8(__VA_ARGS__)
+#define vsoxseg4ei64_v_i8mf8(...) __riscv_vsoxseg4ei64_v_i8mf8(__VA_ARGS__)
+#define vsoxseg5ei64_v_i8mf8(...) __riscv_vsoxseg5ei64_v_i8mf8(__VA_ARGS__)
+#define vsoxseg6ei64_v_i8mf8(...) __riscv_vsoxseg6ei64_v_i8mf8(__VA_ARGS__)
+#define vsoxseg7ei64_v_i8mf8(...) __riscv_vsoxseg7ei64_v_i8mf8(__VA_ARGS__)
+#define vsoxseg8ei64_v_i8mf8(...) __riscv_vsoxseg8ei64_v_i8mf8(__VA_ARGS__)
+#define vsoxseg2ei64_v_i8mf4(...) __riscv_vsoxseg2ei64_v_i8mf4(__VA_ARGS__)
+#define vsoxseg3ei64_v_i8mf4(...) __riscv_vsoxseg3ei64_v_i8mf4(__VA_ARGS__)
+#define vsoxseg4ei64_v_i8mf4(...) __riscv_vsoxseg4ei64_v_i8mf4(__VA_ARGS__)
+#define vsoxseg5ei64_v_i8mf4(...) __riscv_vsoxseg5ei64_v_i8mf4(__VA_ARGS__)
+#define vsoxseg6ei64_v_i8mf4(...) __riscv_vsoxseg6ei64_v_i8mf4(__VA_ARGS__)
+#define vsoxseg7ei64_v_i8mf4(...) __riscv_vsoxseg7ei64_v_i8mf4(__VA_ARGS__)
+#define vsoxseg8ei64_v_i8mf4(...) __riscv_vsoxseg8ei64_v_i8mf4(__VA_ARGS__)
+#define vsoxseg2ei64_v_i8mf2(...) __riscv_vsoxseg2ei64_v_i8mf2(__VA_ARGS__)
+#define vsoxseg3ei64_v_i8mf2(...) __riscv_vsoxseg3ei64_v_i8mf2(__VA_ARGS__)
+#define vsoxseg4ei64_v_i8mf2(...) __riscv_vsoxseg4ei64_v_i8mf2(__VA_ARGS__)
+#define vsoxseg5ei64_v_i8mf2(...) __riscv_vsoxseg5ei64_v_i8mf2(__VA_ARGS__)
+#define vsoxseg6ei64_v_i8mf2(...) __riscv_vsoxseg6ei64_v_i8mf2(__VA_ARGS__)
+#define vsoxseg7ei64_v_i8mf2(...) __riscv_vsoxseg7ei64_v_i8mf2(__VA_ARGS__)
+#define vsoxseg8ei64_v_i8mf2(...) __riscv_vsoxseg8ei64_v_i8mf2(__VA_ARGS__)
+#define vsoxseg2ei64_v_i8m1(...) __riscv_vsoxseg2ei64_v_i8m1(__VA_ARGS__)
+#define vsoxseg3ei64_v_i8m1(...) __riscv_vsoxseg3ei64_v_i8m1(__VA_ARGS__)
+#define vsoxseg4ei64_v_i8m1(...) __riscv_vsoxseg4ei64_v_i8m1(__VA_ARGS__)
+#define vsoxseg5ei64_v_i8m1(...) __riscv_vsoxseg5ei64_v_i8m1(__VA_ARGS__)
+#define vsoxseg6ei64_v_i8m1(...) __riscv_vsoxseg6ei64_v_i8m1(__VA_ARGS__)
+#define vsoxseg7ei64_v_i8m1(...) __riscv_vsoxseg7ei64_v_i8m1(__VA_ARGS__)
+#define vsoxseg8ei64_v_i8m1(...) __riscv_vsoxseg8ei64_v_i8m1(__VA_ARGS__)
+#define vsoxseg2ei8_v_i16mf4(...) __riscv_vsoxseg2ei8_v_i16mf4(__VA_ARGS__)  // vsoxseg* i16 aliases: each unprefixed name expands to the same name with a __riscv_ prefix, passing __VA_ARGS__ through unchanged
+#define vsoxseg3ei8_v_i16mf4(...) __riscv_vsoxseg3ei8_v_i16mf4(__VA_ARGS__)
+#define vsoxseg4ei8_v_i16mf4(...) __riscv_vsoxseg4ei8_v_i16mf4(__VA_ARGS__)
+#define vsoxseg5ei8_v_i16mf4(...) __riscv_vsoxseg5ei8_v_i16mf4(__VA_ARGS__)
+#define vsoxseg6ei8_v_i16mf4(...) __riscv_vsoxseg6ei8_v_i16mf4(__VA_ARGS__)
+#define vsoxseg7ei8_v_i16mf4(...) __riscv_vsoxseg7ei8_v_i16mf4(__VA_ARGS__)
+#define vsoxseg8ei8_v_i16mf4(...) __riscv_vsoxseg8ei8_v_i16mf4(__VA_ARGS__)
+#define vsoxseg2ei8_v_i16mf2(...) __riscv_vsoxseg2ei8_v_i16mf2(__VA_ARGS__)
+#define vsoxseg3ei8_v_i16mf2(...) __riscv_vsoxseg3ei8_v_i16mf2(__VA_ARGS__)
+#define vsoxseg4ei8_v_i16mf2(...) __riscv_vsoxseg4ei8_v_i16mf2(__VA_ARGS__)
+#define vsoxseg5ei8_v_i16mf2(...) __riscv_vsoxseg5ei8_v_i16mf2(__VA_ARGS__)
+#define vsoxseg6ei8_v_i16mf2(...) __riscv_vsoxseg6ei8_v_i16mf2(__VA_ARGS__)
+#define vsoxseg7ei8_v_i16mf2(...) __riscv_vsoxseg7ei8_v_i16mf2(__VA_ARGS__)
+#define vsoxseg8ei8_v_i16mf2(...) __riscv_vsoxseg8ei8_v_i16mf2(__VA_ARGS__)
+#define vsoxseg2ei8_v_i16m1(...) __riscv_vsoxseg2ei8_v_i16m1(__VA_ARGS__)
+#define vsoxseg3ei8_v_i16m1(...) __riscv_vsoxseg3ei8_v_i16m1(__VA_ARGS__)
+#define vsoxseg4ei8_v_i16m1(...) __riscv_vsoxseg4ei8_v_i16m1(__VA_ARGS__)
+#define vsoxseg5ei8_v_i16m1(...) __riscv_vsoxseg5ei8_v_i16m1(__VA_ARGS__)
+#define vsoxseg6ei8_v_i16m1(...) __riscv_vsoxseg6ei8_v_i16m1(__VA_ARGS__)
+#define vsoxseg7ei8_v_i16m1(...) __riscv_vsoxseg7ei8_v_i16m1(__VA_ARGS__)
+#define vsoxseg8ei8_v_i16m1(...) __riscv_vsoxseg8ei8_v_i16m1(__VA_ARGS__)
+#define vsoxseg2ei8_v_i16m2(...) __riscv_vsoxseg2ei8_v_i16m2(__VA_ARGS__)
+#define vsoxseg3ei8_v_i16m2(...) __riscv_vsoxseg3ei8_v_i16m2(__VA_ARGS__)
+#define vsoxseg4ei8_v_i16m2(...) __riscv_vsoxseg4ei8_v_i16m2(__VA_ARGS__)
+#define vsoxseg2ei8_v_i16m4(...) __riscv_vsoxseg2ei8_v_i16m4(__VA_ARGS__)
+#define vsoxseg2ei16_v_i16mf4(...) __riscv_vsoxseg2ei16_v_i16mf4(__VA_ARGS__)
+#define vsoxseg3ei16_v_i16mf4(...) __riscv_vsoxseg3ei16_v_i16mf4(__VA_ARGS__)
+#define vsoxseg4ei16_v_i16mf4(...) __riscv_vsoxseg4ei16_v_i16mf4(__VA_ARGS__)
+#define vsoxseg5ei16_v_i16mf4(...) __riscv_vsoxseg5ei16_v_i16mf4(__VA_ARGS__)
+#define vsoxseg6ei16_v_i16mf4(...) __riscv_vsoxseg6ei16_v_i16mf4(__VA_ARGS__)
+#define vsoxseg7ei16_v_i16mf4(...) __riscv_vsoxseg7ei16_v_i16mf4(__VA_ARGS__)
+#define vsoxseg8ei16_v_i16mf4(...) __riscv_vsoxseg8ei16_v_i16mf4(__VA_ARGS__)
+#define vsoxseg2ei16_v_i16mf2(...) __riscv_vsoxseg2ei16_v_i16mf2(__VA_ARGS__)
+#define vsoxseg3ei16_v_i16mf2(...) __riscv_vsoxseg3ei16_v_i16mf2(__VA_ARGS__)
+#define vsoxseg4ei16_v_i16mf2(...) __riscv_vsoxseg4ei16_v_i16mf2(__VA_ARGS__)
+#define vsoxseg5ei16_v_i16mf2(...) __riscv_vsoxseg5ei16_v_i16mf2(__VA_ARGS__)
+#define vsoxseg6ei16_v_i16mf2(...) __riscv_vsoxseg6ei16_v_i16mf2(__VA_ARGS__)
+#define vsoxseg7ei16_v_i16mf2(...) __riscv_vsoxseg7ei16_v_i16mf2(__VA_ARGS__)
+#define vsoxseg8ei16_v_i16mf2(...) __riscv_vsoxseg8ei16_v_i16mf2(__VA_ARGS__)
+#define vsoxseg2ei16_v_i16m1(...) __riscv_vsoxseg2ei16_v_i16m1(__VA_ARGS__)
+#define vsoxseg3ei16_v_i16m1(...) __riscv_vsoxseg3ei16_v_i16m1(__VA_ARGS__)
+#define vsoxseg4ei16_v_i16m1(...) __riscv_vsoxseg4ei16_v_i16m1(__VA_ARGS__)
+#define vsoxseg5ei16_v_i16m1(...) __riscv_vsoxseg5ei16_v_i16m1(__VA_ARGS__)
+#define vsoxseg6ei16_v_i16m1(...) __riscv_vsoxseg6ei16_v_i16m1(__VA_ARGS__)
+#define vsoxseg7ei16_v_i16m1(...) __riscv_vsoxseg7ei16_v_i16m1(__VA_ARGS__)
+#define vsoxseg8ei16_v_i16m1(...) __riscv_vsoxseg8ei16_v_i16m1(__VA_ARGS__)
+#define vsoxseg2ei16_v_i16m2(...) __riscv_vsoxseg2ei16_v_i16m2(__VA_ARGS__)
+#define vsoxseg3ei16_v_i16m2(...) __riscv_vsoxseg3ei16_v_i16m2(__VA_ARGS__)
+#define vsoxseg4ei16_v_i16m2(...) __riscv_vsoxseg4ei16_v_i16m2(__VA_ARGS__)
+#define vsoxseg2ei16_v_i16m4(...) __riscv_vsoxseg2ei16_v_i16m4(__VA_ARGS__)
+#define vsoxseg2ei32_v_i16mf4(...) __riscv_vsoxseg2ei32_v_i16mf4(__VA_ARGS__)
+#define vsoxseg3ei32_v_i16mf4(...) __riscv_vsoxseg3ei32_v_i16mf4(__VA_ARGS__)
+#define vsoxseg4ei32_v_i16mf4(...) __riscv_vsoxseg4ei32_v_i16mf4(__VA_ARGS__)
+#define vsoxseg5ei32_v_i16mf4(...) __riscv_vsoxseg5ei32_v_i16mf4(__VA_ARGS__)
+#define vsoxseg6ei32_v_i16mf4(...) __riscv_vsoxseg6ei32_v_i16mf4(__VA_ARGS__)
+#define vsoxseg7ei32_v_i16mf4(...) __riscv_vsoxseg7ei32_v_i16mf4(__VA_ARGS__)
+#define vsoxseg8ei32_v_i16mf4(...) __riscv_vsoxseg8ei32_v_i16mf4(__VA_ARGS__)
+#define vsoxseg2ei32_v_i16mf2(...) __riscv_vsoxseg2ei32_v_i16mf2(__VA_ARGS__)
+#define vsoxseg3ei32_v_i16mf2(...) __riscv_vsoxseg3ei32_v_i16mf2(__VA_ARGS__)
+#define vsoxseg4ei32_v_i16mf2(...) __riscv_vsoxseg4ei32_v_i16mf2(__VA_ARGS__)
+#define vsoxseg5ei32_v_i16mf2(...) __riscv_vsoxseg5ei32_v_i16mf2(__VA_ARGS__)
+#define vsoxseg6ei32_v_i16mf2(...) __riscv_vsoxseg6ei32_v_i16mf2(__VA_ARGS__)
+#define vsoxseg7ei32_v_i16mf2(...) __riscv_vsoxseg7ei32_v_i16mf2(__VA_ARGS__)
+#define vsoxseg8ei32_v_i16mf2(...) __riscv_vsoxseg8ei32_v_i16mf2(__VA_ARGS__)
+#define vsoxseg2ei32_v_i16m1(...) __riscv_vsoxseg2ei32_v_i16m1(__VA_ARGS__)
+#define vsoxseg3ei32_v_i16m1(...) __riscv_vsoxseg3ei32_v_i16m1(__VA_ARGS__)
+#define vsoxseg4ei32_v_i16m1(...) __riscv_vsoxseg4ei32_v_i16m1(__VA_ARGS__)
+#define vsoxseg5ei32_v_i16m1(...) __riscv_vsoxseg5ei32_v_i16m1(__VA_ARGS__)
+#define vsoxseg6ei32_v_i16m1(...) __riscv_vsoxseg6ei32_v_i16m1(__VA_ARGS__)
+#define vsoxseg7ei32_v_i16m1(...) __riscv_vsoxseg7ei32_v_i16m1(__VA_ARGS__)
+#define vsoxseg8ei32_v_i16m1(...) __riscv_vsoxseg8ei32_v_i16m1(__VA_ARGS__)
+#define vsoxseg2ei32_v_i16m2(...) __riscv_vsoxseg2ei32_v_i16m2(__VA_ARGS__)
+#define vsoxseg3ei32_v_i16m2(...) __riscv_vsoxseg3ei32_v_i16m2(__VA_ARGS__)
+#define vsoxseg4ei32_v_i16m2(...) __riscv_vsoxseg4ei32_v_i16m2(__VA_ARGS__)
+#define vsoxseg2ei32_v_i16m4(...) __riscv_vsoxseg2ei32_v_i16m4(__VA_ARGS__)
+#define vsoxseg2ei64_v_i16mf4(...) __riscv_vsoxseg2ei64_v_i16mf4(__VA_ARGS__)
+#define vsoxseg3ei64_v_i16mf4(...) __riscv_vsoxseg3ei64_v_i16mf4(__VA_ARGS__)
+#define vsoxseg4ei64_v_i16mf4(...) __riscv_vsoxseg4ei64_v_i16mf4(__VA_ARGS__)
+#define vsoxseg5ei64_v_i16mf4(...) __riscv_vsoxseg5ei64_v_i16mf4(__VA_ARGS__)
+#define vsoxseg6ei64_v_i16mf4(...) __riscv_vsoxseg6ei64_v_i16mf4(__VA_ARGS__)
+#define vsoxseg7ei64_v_i16mf4(...) __riscv_vsoxseg7ei64_v_i16mf4(__VA_ARGS__)
+#define vsoxseg8ei64_v_i16mf4(...) __riscv_vsoxseg8ei64_v_i16mf4(__VA_ARGS__)
+#define vsoxseg2ei64_v_i16mf2(...) __riscv_vsoxseg2ei64_v_i16mf2(__VA_ARGS__)
+#define vsoxseg3ei64_v_i16mf2(...) __riscv_vsoxseg3ei64_v_i16mf2(__VA_ARGS__)
+#define vsoxseg4ei64_v_i16mf2(...) __riscv_vsoxseg4ei64_v_i16mf2(__VA_ARGS__)
+#define vsoxseg5ei64_v_i16mf2(...) __riscv_vsoxseg5ei64_v_i16mf2(__VA_ARGS__)
+#define vsoxseg6ei64_v_i16mf2(...) __riscv_vsoxseg6ei64_v_i16mf2(__VA_ARGS__)
+#define vsoxseg7ei64_v_i16mf2(...) __riscv_vsoxseg7ei64_v_i16mf2(__VA_ARGS__)
+#define vsoxseg8ei64_v_i16mf2(...) __riscv_vsoxseg8ei64_v_i16mf2(__VA_ARGS__)
+#define vsoxseg2ei64_v_i16m1(...) __riscv_vsoxseg2ei64_v_i16m1(__VA_ARGS__)
+#define vsoxseg3ei64_v_i16m1(...) __riscv_vsoxseg3ei64_v_i16m1(__VA_ARGS__)
+#define vsoxseg4ei64_v_i16m1(...) __riscv_vsoxseg4ei64_v_i16m1(__VA_ARGS__)
+#define vsoxseg5ei64_v_i16m1(...) __riscv_vsoxseg5ei64_v_i16m1(__VA_ARGS__)
+#define vsoxseg6ei64_v_i16m1(...) __riscv_vsoxseg6ei64_v_i16m1(__VA_ARGS__)
+#define vsoxseg7ei64_v_i16m1(...) __riscv_vsoxseg7ei64_v_i16m1(__VA_ARGS__)
+#define vsoxseg8ei64_v_i16m1(...) __riscv_vsoxseg8ei64_v_i16m1(__VA_ARGS__)
+#define vsoxseg2ei64_v_i16m2(...) __riscv_vsoxseg2ei64_v_i16m2(__VA_ARGS__)
+#define vsoxseg3ei64_v_i16m2(...) __riscv_vsoxseg3ei64_v_i16m2(__VA_ARGS__)
+#define vsoxseg4ei64_v_i16m2(...) __riscv_vsoxseg4ei64_v_i16m2(__VA_ARGS__)
+#define vsoxseg2ei8_v_i32mf2(...) __riscv_vsoxseg2ei8_v_i32mf2(__VA_ARGS__)
+#define vsoxseg3ei8_v_i32mf2(...) __riscv_vsoxseg3ei8_v_i32mf2(__VA_ARGS__)
+#define vsoxseg4ei8_v_i32mf2(...) __riscv_vsoxseg4ei8_v_i32mf2(__VA_ARGS__)
+#define vsoxseg5ei8_v_i32mf2(...) __riscv_vsoxseg5ei8_v_i32mf2(__VA_ARGS__)
+#define vsoxseg6ei8_v_i32mf2(...) __riscv_vsoxseg6ei8_v_i32mf2(__VA_ARGS__)
+#define vsoxseg7ei8_v_i32mf2(...) __riscv_vsoxseg7ei8_v_i32mf2(__VA_ARGS__)
+#define vsoxseg8ei8_v_i32mf2(...) __riscv_vsoxseg8ei8_v_i32mf2(__VA_ARGS__)
+#define vsoxseg2ei8_v_i32m1(...) __riscv_vsoxseg2ei8_v_i32m1(__VA_ARGS__)
+#define vsoxseg3ei8_v_i32m1(...) __riscv_vsoxseg3ei8_v_i32m1(__VA_ARGS__)
+#define vsoxseg4ei8_v_i32m1(...) __riscv_vsoxseg4ei8_v_i32m1(__VA_ARGS__)
+#define vsoxseg5ei8_v_i32m1(...) __riscv_vsoxseg5ei8_v_i32m1(__VA_ARGS__)
+#define vsoxseg6ei8_v_i32m1(...) __riscv_vsoxseg6ei8_v_i32m1(__VA_ARGS__)
+#define vsoxseg7ei8_v_i32m1(...) __riscv_vsoxseg7ei8_v_i32m1(__VA_ARGS__)
+#define vsoxseg8ei8_v_i32m1(...) __riscv_vsoxseg8ei8_v_i32m1(__VA_ARGS__)
+#define vsoxseg2ei8_v_i32m2(...) __riscv_vsoxseg2ei8_v_i32m2(__VA_ARGS__)
+#define vsoxseg3ei8_v_i32m2(...) __riscv_vsoxseg3ei8_v_i32m2(__VA_ARGS__)
+#define vsoxseg4ei8_v_i32m2(...) __riscv_vsoxseg4ei8_v_i32m2(__VA_ARGS__)
+#define vsoxseg2ei8_v_i32m4(...) __riscv_vsoxseg2ei8_v_i32m4(__VA_ARGS__)
+#define vsoxseg2ei16_v_i32mf2(...) __riscv_vsoxseg2ei16_v_i32mf2(__VA_ARGS__)
+#define vsoxseg3ei16_v_i32mf2(...) __riscv_vsoxseg3ei16_v_i32mf2(__VA_ARGS__)
+#define vsoxseg4ei16_v_i32mf2(...) __riscv_vsoxseg4ei16_v_i32mf2(__VA_ARGS__)
+#define vsoxseg5ei16_v_i32mf2(...) __riscv_vsoxseg5ei16_v_i32mf2(__VA_ARGS__)
+#define vsoxseg6ei16_v_i32mf2(...) __riscv_vsoxseg6ei16_v_i32mf2(__VA_ARGS__)
+#define vsoxseg7ei16_v_i32mf2(...) __riscv_vsoxseg7ei16_v_i32mf2(__VA_ARGS__)
+#define vsoxseg8ei16_v_i32mf2(...) __riscv_vsoxseg8ei16_v_i32mf2(__VA_ARGS__)
+#define vsoxseg2ei16_v_i32m1(...) __riscv_vsoxseg2ei16_v_i32m1(__VA_ARGS__)
+#define vsoxseg3ei16_v_i32m1(...) __riscv_vsoxseg3ei16_v_i32m1(__VA_ARGS__)
+#define vsoxseg4ei16_v_i32m1(...) __riscv_vsoxseg4ei16_v_i32m1(__VA_ARGS__)
+#define vsoxseg5ei16_v_i32m1(...) __riscv_vsoxseg5ei16_v_i32m1(__VA_ARGS__)
+#define vsoxseg6ei16_v_i32m1(...) __riscv_vsoxseg6ei16_v_i32m1(__VA_ARGS__)
+#define vsoxseg7ei16_v_i32m1(...) __riscv_vsoxseg7ei16_v_i32m1(__VA_ARGS__)
+#define vsoxseg8ei16_v_i32m1(...) __riscv_vsoxseg8ei16_v_i32m1(__VA_ARGS__)
+#define vsoxseg2ei16_v_i32m2(...) __riscv_vsoxseg2ei16_v_i32m2(__VA_ARGS__)
+#define vsoxseg3ei16_v_i32m2(...) __riscv_vsoxseg3ei16_v_i32m2(__VA_ARGS__)
+#define vsoxseg4ei16_v_i32m2(...) __riscv_vsoxseg4ei16_v_i32m2(__VA_ARGS__)
+#define vsoxseg2ei16_v_i32m4(...) __riscv_vsoxseg2ei16_v_i32m4(__VA_ARGS__)
+#define vsoxseg2ei32_v_i32mf2(...) __riscv_vsoxseg2ei32_v_i32mf2(__VA_ARGS__)
+#define vsoxseg3ei32_v_i32mf2(...) __riscv_vsoxseg3ei32_v_i32mf2(__VA_ARGS__)
+#define vsoxseg4ei32_v_i32mf2(...) __riscv_vsoxseg4ei32_v_i32mf2(__VA_ARGS__)
+#define vsoxseg5ei32_v_i32mf2(...) __riscv_vsoxseg5ei32_v_i32mf2(__VA_ARGS__)
+#define vsoxseg6ei32_v_i32mf2(...) __riscv_vsoxseg6ei32_v_i32mf2(__VA_ARGS__)
+#define vsoxseg7ei32_v_i32mf2(...) __riscv_vsoxseg7ei32_v_i32mf2(__VA_ARGS__)
+#define vsoxseg8ei32_v_i32mf2(...) __riscv_vsoxseg8ei32_v_i32mf2(__VA_ARGS__)
+#define vsoxseg2ei32_v_i32m1(...) __riscv_vsoxseg2ei32_v_i32m1(__VA_ARGS__)
+#define vsoxseg3ei32_v_i32m1(...) __riscv_vsoxseg3ei32_v_i32m1(__VA_ARGS__)
+#define vsoxseg4ei32_v_i32m1(...) __riscv_vsoxseg4ei32_v_i32m1(__VA_ARGS__)
+#define vsoxseg5ei32_v_i32m1(...) __riscv_vsoxseg5ei32_v_i32m1(__VA_ARGS__)
+#define vsoxseg6ei32_v_i32m1(...) __riscv_vsoxseg6ei32_v_i32m1(__VA_ARGS__)
+#define vsoxseg7ei32_v_i32m1(...) __riscv_vsoxseg7ei32_v_i32m1(__VA_ARGS__)
+#define vsoxseg8ei32_v_i32m1(...) __riscv_vsoxseg8ei32_v_i32m1(__VA_ARGS__)
+#define vsoxseg2ei32_v_i32m2(...) __riscv_vsoxseg2ei32_v_i32m2(__VA_ARGS__)
+#define vsoxseg3ei32_v_i32m2(...) __riscv_vsoxseg3ei32_v_i32m2(__VA_ARGS__)
+#define vsoxseg4ei32_v_i32m2(...) __riscv_vsoxseg4ei32_v_i32m2(__VA_ARGS__)
+#define vsoxseg2ei32_v_i32m4(...) __riscv_vsoxseg2ei32_v_i32m4(__VA_ARGS__)
+#define vsoxseg2ei64_v_i32mf2(...) __riscv_vsoxseg2ei64_v_i32mf2(__VA_ARGS__)
+#define vsoxseg3ei64_v_i32mf2(...) __riscv_vsoxseg3ei64_v_i32mf2(__VA_ARGS__)
+#define vsoxseg4ei64_v_i32mf2(...) __riscv_vsoxseg4ei64_v_i32mf2(__VA_ARGS__)
+#define vsoxseg5ei64_v_i32mf2(...) __riscv_vsoxseg5ei64_v_i32mf2(__VA_ARGS__)
+#define vsoxseg6ei64_v_i32mf2(...) __riscv_vsoxseg6ei64_v_i32mf2(__VA_ARGS__)
+#define vsoxseg7ei64_v_i32mf2(...) __riscv_vsoxseg7ei64_v_i32mf2(__VA_ARGS__)
+#define vsoxseg8ei64_v_i32mf2(...) __riscv_vsoxseg8ei64_v_i32mf2(__VA_ARGS__)
+#define vsoxseg2ei64_v_i32m1(...) __riscv_vsoxseg2ei64_v_i32m1(__VA_ARGS__)
+#define vsoxseg3ei64_v_i32m1(...) __riscv_vsoxseg3ei64_v_i32m1(__VA_ARGS__)
+#define vsoxseg4ei64_v_i32m1(...) __riscv_vsoxseg4ei64_v_i32m1(__VA_ARGS__)
+#define vsoxseg5ei64_v_i32m1(...) __riscv_vsoxseg5ei64_v_i32m1(__VA_ARGS__)
+#define vsoxseg6ei64_v_i32m1(...) __riscv_vsoxseg6ei64_v_i32m1(__VA_ARGS__)
+#define vsoxseg7ei64_v_i32m1(...) __riscv_vsoxseg7ei64_v_i32m1(__VA_ARGS__)
+#define vsoxseg8ei64_v_i32m1(...) __riscv_vsoxseg8ei64_v_i32m1(__VA_ARGS__)
+#define vsoxseg2ei64_v_i32m2(...) __riscv_vsoxseg2ei64_v_i32m2(__VA_ARGS__)
+#define vsoxseg3ei64_v_i32m2(...) __riscv_vsoxseg3ei64_v_i32m2(__VA_ARGS__)
+#define vsoxseg4ei64_v_i32m2(...) __riscv_vsoxseg4ei64_v_i32m2(__VA_ARGS__)
+#define vsoxseg2ei64_v_i32m4(...) __riscv_vsoxseg2ei64_v_i32m4(__VA_ARGS__)
+#define vsoxseg2ei8_v_i64m1(...) __riscv_vsoxseg2ei8_v_i64m1(__VA_ARGS__)
+#define vsoxseg3ei8_v_i64m1(...) __riscv_vsoxseg3ei8_v_i64m1(__VA_ARGS__)
+#define vsoxseg4ei8_v_i64m1(...) __riscv_vsoxseg4ei8_v_i64m1(__VA_ARGS__)
+#define vsoxseg5ei8_v_i64m1(...) __riscv_vsoxseg5ei8_v_i64m1(__VA_ARGS__)
+#define vsoxseg6ei8_v_i64m1(...) __riscv_vsoxseg6ei8_v_i64m1(__VA_ARGS__)
+#define vsoxseg7ei8_v_i64m1(...) __riscv_vsoxseg7ei8_v_i64m1(__VA_ARGS__)
+#define vsoxseg8ei8_v_i64m1(...) __riscv_vsoxseg8ei8_v_i64m1(__VA_ARGS__)
+#define vsoxseg2ei8_v_i64m2(...) __riscv_vsoxseg2ei8_v_i64m2(__VA_ARGS__)
+#define vsoxseg3ei8_v_i64m2(...) __riscv_vsoxseg3ei8_v_i64m2(__VA_ARGS__)
+#define vsoxseg4ei8_v_i64m2(...) __riscv_vsoxseg4ei8_v_i64m2(__VA_ARGS__)
+#define vsoxseg2ei8_v_i64m4(...) __riscv_vsoxseg2ei8_v_i64m4(__VA_ARGS__)
+#define vsoxseg2ei16_v_i64m1(...) __riscv_vsoxseg2ei16_v_i64m1(__VA_ARGS__)
+#define vsoxseg3ei16_v_i64m1(...) __riscv_vsoxseg3ei16_v_i64m1(__VA_ARGS__)
+#define vsoxseg4ei16_v_i64m1(...) __riscv_vsoxseg4ei16_v_i64m1(__VA_ARGS__)
+#define vsoxseg5ei16_v_i64m1(...) __riscv_vsoxseg5ei16_v_i64m1(__VA_ARGS__)
+#define vsoxseg6ei16_v_i64m1(...) __riscv_vsoxseg6ei16_v_i64m1(__VA_ARGS__)
+#define vsoxseg7ei16_v_i64m1(...) __riscv_vsoxseg7ei16_v_i64m1(__VA_ARGS__)
+#define vsoxseg8ei16_v_i64m1(...) __riscv_vsoxseg8ei16_v_i64m1(__VA_ARGS__)
+#define vsoxseg2ei16_v_i64m2(...) __riscv_vsoxseg2ei16_v_i64m2(__VA_ARGS__)
+#define vsoxseg3ei16_v_i64m2(...) __riscv_vsoxseg3ei16_v_i64m2(__VA_ARGS__)
+#define vsoxseg4ei16_v_i64m2(...) __riscv_vsoxseg4ei16_v_i64m2(__VA_ARGS__)
+#define vsoxseg2ei16_v_i64m4(...) __riscv_vsoxseg2ei16_v_i64m4(__VA_ARGS__)
+#define vsoxseg2ei32_v_i64m1(...) __riscv_vsoxseg2ei32_v_i64m1(__VA_ARGS__)
+#define vsoxseg3ei32_v_i64m1(...) __riscv_vsoxseg3ei32_v_i64m1(__VA_ARGS__)
+#define vsoxseg4ei32_v_i64m1(...) __riscv_vsoxseg4ei32_v_i64m1(__VA_ARGS__)
+#define vsoxseg5ei32_v_i64m1(...) __riscv_vsoxseg5ei32_v_i64m1(__VA_ARGS__)
+#define vsoxseg6ei32_v_i64m1(...) __riscv_vsoxseg6ei32_v_i64m1(__VA_ARGS__)
+#define vsoxseg7ei32_v_i64m1(...) __riscv_vsoxseg7ei32_v_i64m1(__VA_ARGS__)
+#define vsoxseg8ei32_v_i64m1(...) __riscv_vsoxseg8ei32_v_i64m1(__VA_ARGS__)
+#define vsoxseg2ei32_v_i64m2(...) __riscv_vsoxseg2ei32_v_i64m2(__VA_ARGS__)
+#define vsoxseg3ei32_v_i64m2(...) __riscv_vsoxseg3ei32_v_i64m2(__VA_ARGS__)
+#define vsoxseg4ei32_v_i64m2(...) __riscv_vsoxseg4ei32_v_i64m2(__VA_ARGS__)
+#define vsoxseg2ei32_v_i64m4(...) __riscv_vsoxseg2ei32_v_i64m4(__VA_ARGS__)
+#define vsoxseg2ei64_v_i64m1(...) __riscv_vsoxseg2ei64_v_i64m1(__VA_ARGS__)
+#define vsoxseg3ei64_v_i64m1(...) __riscv_vsoxseg3ei64_v_i64m1(__VA_ARGS__)
+#define vsoxseg4ei64_v_i64m1(...) __riscv_vsoxseg4ei64_v_i64m1(__VA_ARGS__)
+#define vsoxseg5ei64_v_i64m1(...) __riscv_vsoxseg5ei64_v_i64m1(__VA_ARGS__)
+#define vsoxseg6ei64_v_i64m1(...) __riscv_vsoxseg6ei64_v_i64m1(__VA_ARGS__)
+#define vsoxseg7ei64_v_i64m1(...) __riscv_vsoxseg7ei64_v_i64m1(__VA_ARGS__)
+#define vsoxseg8ei64_v_i64m1(...) __riscv_vsoxseg8ei64_v_i64m1(__VA_ARGS__)
+#define vsoxseg2ei64_v_i64m2(...) __riscv_vsoxseg2ei64_v_i64m2(__VA_ARGS__)
+#define vsoxseg3ei64_v_i64m2(...) __riscv_vsoxseg3ei64_v_i64m2(__VA_ARGS__)
+#define vsoxseg4ei64_v_i64m2(...) __riscv_vsoxseg4ei64_v_i64m2(__VA_ARGS__)
+#define vsoxseg2ei64_v_i64m4(...) __riscv_vsoxseg2ei64_v_i64m4(__VA_ARGS__)
+#define vsuxseg2ei8_v_i8mf8(...) __riscv_vsuxseg2ei8_v_i8mf8(__VA_ARGS__)
+#define vsuxseg3ei8_v_i8mf8(...) __riscv_vsuxseg3ei8_v_i8mf8(__VA_ARGS__)
+#define vsuxseg4ei8_v_i8mf8(...) __riscv_vsuxseg4ei8_v_i8mf8(__VA_ARGS__)
+#define vsuxseg5ei8_v_i8mf8(...) __riscv_vsuxseg5ei8_v_i8mf8(__VA_ARGS__)
+#define vsuxseg6ei8_v_i8mf8(...) __riscv_vsuxseg6ei8_v_i8mf8(__VA_ARGS__)
+#define vsuxseg7ei8_v_i8mf8(...) __riscv_vsuxseg7ei8_v_i8mf8(__VA_ARGS__)
+#define vsuxseg8ei8_v_i8mf8(...) __riscv_vsuxseg8ei8_v_i8mf8(__VA_ARGS__)
+#define vsuxseg2ei8_v_i8mf4(...) __riscv_vsuxseg2ei8_v_i8mf4(__VA_ARGS__)
+#define vsuxseg3ei8_v_i8mf4(...) __riscv_vsuxseg3ei8_v_i8mf4(__VA_ARGS__)
+#define vsuxseg4ei8_v_i8mf4(...) __riscv_vsuxseg4ei8_v_i8mf4(__VA_ARGS__)
+#define vsuxseg5ei8_v_i8mf4(...) __riscv_vsuxseg5ei8_v_i8mf4(__VA_ARGS__)
+#define vsuxseg6ei8_v_i8mf4(...) __riscv_vsuxseg6ei8_v_i8mf4(__VA_ARGS__)
+#define vsuxseg7ei8_v_i8mf4(...) __riscv_vsuxseg7ei8_v_i8mf4(__VA_ARGS__)
+#define vsuxseg8ei8_v_i8mf4(...) __riscv_vsuxseg8ei8_v_i8mf4(__VA_ARGS__)
+#define vsuxseg2ei8_v_i8mf2(...) __riscv_vsuxseg2ei8_v_i8mf2(__VA_ARGS__)
+#define vsuxseg3ei8_v_i8mf2(...) __riscv_vsuxseg3ei8_v_i8mf2(__VA_ARGS__)
+#define vsuxseg4ei8_v_i8mf2(...) __riscv_vsuxseg4ei8_v_i8mf2(__VA_ARGS__)
+#define vsuxseg5ei8_v_i8mf2(...) __riscv_vsuxseg5ei8_v_i8mf2(__VA_ARGS__)
+#define vsuxseg6ei8_v_i8mf2(...) __riscv_vsuxseg6ei8_v_i8mf2(__VA_ARGS__)
+#define vsuxseg7ei8_v_i8mf2(...) __riscv_vsuxseg7ei8_v_i8mf2(__VA_ARGS__)
+#define vsuxseg8ei8_v_i8mf2(...) __riscv_vsuxseg8ei8_v_i8mf2(__VA_ARGS__)
+#define vsuxseg2ei8_v_i8m1(...) __riscv_vsuxseg2ei8_v_i8m1(__VA_ARGS__)
+#define vsuxseg3ei8_v_i8m1(...) __riscv_vsuxseg3ei8_v_i8m1(__VA_ARGS__)
+#define vsuxseg4ei8_v_i8m1(...) __riscv_vsuxseg4ei8_v_i8m1(__VA_ARGS__)
+#define vsuxseg5ei8_v_i8m1(...) __riscv_vsuxseg5ei8_v_i8m1(__VA_ARGS__)
+#define vsuxseg6ei8_v_i8m1(...) __riscv_vsuxseg6ei8_v_i8m1(__VA_ARGS__)
+#define vsuxseg7ei8_v_i8m1(...) __riscv_vsuxseg7ei8_v_i8m1(__VA_ARGS__)
+#define vsuxseg8ei8_v_i8m1(...) __riscv_vsuxseg8ei8_v_i8m1(__VA_ARGS__)
+#define vsuxseg2ei8_v_i8m2(...) __riscv_vsuxseg2ei8_v_i8m2(__VA_ARGS__)
+#define vsuxseg3ei8_v_i8m2(...) __riscv_vsuxseg3ei8_v_i8m2(__VA_ARGS__)
+#define vsuxseg4ei8_v_i8m2(...) __riscv_vsuxseg4ei8_v_i8m2(__VA_ARGS__)
+#define vsuxseg2ei8_v_i8m4(...) __riscv_vsuxseg2ei8_v_i8m4(__VA_ARGS__)
+#define vsuxseg2ei16_v_i8mf8(...) __riscv_vsuxseg2ei16_v_i8mf8(__VA_ARGS__)
+#define vsuxseg3ei16_v_i8mf8(...) __riscv_vsuxseg3ei16_v_i8mf8(__VA_ARGS__)
+#define vsuxseg4ei16_v_i8mf8(...) __riscv_vsuxseg4ei16_v_i8mf8(__VA_ARGS__)
+#define vsuxseg5ei16_v_i8mf8(...) __riscv_vsuxseg5ei16_v_i8mf8(__VA_ARGS__)
+#define vsuxseg6ei16_v_i8mf8(...) __riscv_vsuxseg6ei16_v_i8mf8(__VA_ARGS__)
+#define vsuxseg7ei16_v_i8mf8(...) __riscv_vsuxseg7ei16_v_i8mf8(__VA_ARGS__)
+#define vsuxseg8ei16_v_i8mf8(...) __riscv_vsuxseg8ei16_v_i8mf8(__VA_ARGS__)
+#define vsuxseg2ei16_v_i8mf4(...) __riscv_vsuxseg2ei16_v_i8mf4(__VA_ARGS__)
+#define vsuxseg3ei16_v_i8mf4(...) __riscv_vsuxseg3ei16_v_i8mf4(__VA_ARGS__)
+#define vsuxseg4ei16_v_i8mf4(...) __riscv_vsuxseg4ei16_v_i8mf4(__VA_ARGS__)
+#define vsuxseg5ei16_v_i8mf4(...) __riscv_vsuxseg5ei16_v_i8mf4(__VA_ARGS__)
+#define vsuxseg6ei16_v_i8mf4(...) __riscv_vsuxseg6ei16_v_i8mf4(__VA_ARGS__)
+#define vsuxseg7ei16_v_i8mf4(...) __riscv_vsuxseg7ei16_v_i8mf4(__VA_ARGS__)
+#define vsuxseg8ei16_v_i8mf4(...) __riscv_vsuxseg8ei16_v_i8mf4(__VA_ARGS__)
+#define vsuxseg2ei16_v_i8mf2(...) __riscv_vsuxseg2ei16_v_i8mf2(__VA_ARGS__)
+#define vsuxseg3ei16_v_i8mf2(...) __riscv_vsuxseg3ei16_v_i8mf2(__VA_ARGS__)
+#define vsuxseg4ei16_v_i8mf2(...) __riscv_vsuxseg4ei16_v_i8mf2(__VA_ARGS__)
+#define vsuxseg5ei16_v_i8mf2(...) __riscv_vsuxseg5ei16_v_i8mf2(__VA_ARGS__)
+#define vsuxseg6ei16_v_i8mf2(...) __riscv_vsuxseg6ei16_v_i8mf2(__VA_ARGS__)
+#define vsuxseg7ei16_v_i8mf2(...) __riscv_vsuxseg7ei16_v_i8mf2(__VA_ARGS__)
+#define vsuxseg8ei16_v_i8mf2(...) __riscv_vsuxseg8ei16_v_i8mf2(__VA_ARGS__)
+#define vsuxseg2ei16_v_i8m1(...) __riscv_vsuxseg2ei16_v_i8m1(__VA_ARGS__)
+#define vsuxseg3ei16_v_i8m1(...) __riscv_vsuxseg3ei16_v_i8m1(__VA_ARGS__)
+#define vsuxseg4ei16_v_i8m1(...) __riscv_vsuxseg4ei16_v_i8m1(__VA_ARGS__)
+#define vsuxseg5ei16_v_i8m1(...) __riscv_vsuxseg5ei16_v_i8m1(__VA_ARGS__)
+#define vsuxseg6ei16_v_i8m1(...) __riscv_vsuxseg6ei16_v_i8m1(__VA_ARGS__)
+#define vsuxseg7ei16_v_i8m1(...) __riscv_vsuxseg7ei16_v_i8m1(__VA_ARGS__)
+#define vsuxseg8ei16_v_i8m1(...) __riscv_vsuxseg8ei16_v_i8m1(__VA_ARGS__)
+#define vsuxseg2ei16_v_i8m2(...) __riscv_vsuxseg2ei16_v_i8m2(__VA_ARGS__)
+#define vsuxseg3ei16_v_i8m2(...) __riscv_vsuxseg3ei16_v_i8m2(__VA_ARGS__)
+#define vsuxseg4ei16_v_i8m2(...) __riscv_vsuxseg4ei16_v_i8m2(__VA_ARGS__)
+#define vsuxseg2ei16_v_i8m4(...) __riscv_vsuxseg2ei16_v_i8m4(__VA_ARGS__)
+#define vsuxseg2ei32_v_i8mf8(...) __riscv_vsuxseg2ei32_v_i8mf8(__VA_ARGS__)
+#define vsuxseg3ei32_v_i8mf8(...) __riscv_vsuxseg3ei32_v_i8mf8(__VA_ARGS__)
+#define vsuxseg4ei32_v_i8mf8(...) __riscv_vsuxseg4ei32_v_i8mf8(__VA_ARGS__)
+#define vsuxseg5ei32_v_i8mf8(...) __riscv_vsuxseg5ei32_v_i8mf8(__VA_ARGS__)
+#define vsuxseg6ei32_v_i8mf8(...) __riscv_vsuxseg6ei32_v_i8mf8(__VA_ARGS__)
+#define vsuxseg7ei32_v_i8mf8(...) __riscv_vsuxseg7ei32_v_i8mf8(__VA_ARGS__)
+#define vsuxseg8ei32_v_i8mf8(...) __riscv_vsuxseg8ei32_v_i8mf8(__VA_ARGS__)
+#define vsuxseg2ei32_v_i8mf4(...) __riscv_vsuxseg2ei32_v_i8mf4(__VA_ARGS__)
+#define vsuxseg3ei32_v_i8mf4(...) __riscv_vsuxseg3ei32_v_i8mf4(__VA_ARGS__)
+#define vsuxseg4ei32_v_i8mf4(...) __riscv_vsuxseg4ei32_v_i8mf4(__VA_ARGS__)
+#define vsuxseg5ei32_v_i8mf4(...) __riscv_vsuxseg5ei32_v_i8mf4(__VA_ARGS__)
+#define vsuxseg6ei32_v_i8mf4(...) __riscv_vsuxseg6ei32_v_i8mf4(__VA_ARGS__)
+#define vsuxseg7ei32_v_i8mf4(...) __riscv_vsuxseg7ei32_v_i8mf4(__VA_ARGS__)
+#define vsuxseg8ei32_v_i8mf4(...) __riscv_vsuxseg8ei32_v_i8mf4(__VA_ARGS__)
+#define vsuxseg2ei32_v_i8mf2(...) __riscv_vsuxseg2ei32_v_i8mf2(__VA_ARGS__)
+#define vsuxseg3ei32_v_i8mf2(...) __riscv_vsuxseg3ei32_v_i8mf2(__VA_ARGS__)
+#define vsuxseg4ei32_v_i8mf2(...) __riscv_vsuxseg4ei32_v_i8mf2(__VA_ARGS__)
+#define vsuxseg5ei32_v_i8mf2(...) __riscv_vsuxseg5ei32_v_i8mf2(__VA_ARGS__)
+#define vsuxseg6ei32_v_i8mf2(...) __riscv_vsuxseg6ei32_v_i8mf2(__VA_ARGS__)
+#define vsuxseg7ei32_v_i8mf2(...) __riscv_vsuxseg7ei32_v_i8mf2(__VA_ARGS__)
+#define vsuxseg8ei32_v_i8mf2(...) __riscv_vsuxseg8ei32_v_i8mf2(__VA_ARGS__)
+#define vsuxseg2ei32_v_i8m1(...) __riscv_vsuxseg2ei32_v_i8m1(__VA_ARGS__)
+#define vsuxseg3ei32_v_i8m1(...) __riscv_vsuxseg3ei32_v_i8m1(__VA_ARGS__)
+#define vsuxseg4ei32_v_i8m1(...) __riscv_vsuxseg4ei32_v_i8m1(__VA_ARGS__)
+#define vsuxseg5ei32_v_i8m1(...) __riscv_vsuxseg5ei32_v_i8m1(__VA_ARGS__)
+#define vsuxseg6ei32_v_i8m1(...) __riscv_vsuxseg6ei32_v_i8m1(__VA_ARGS__)
+#define vsuxseg7ei32_v_i8m1(...) __riscv_vsuxseg7ei32_v_i8m1(__VA_ARGS__)
+#define vsuxseg8ei32_v_i8m1(...) __riscv_vsuxseg8ei32_v_i8m1(__VA_ARGS__)
+#define vsuxseg2ei32_v_i8m2(...) __riscv_vsuxseg2ei32_v_i8m2(__VA_ARGS__)
+#define vsuxseg3ei32_v_i8m2(...) __riscv_vsuxseg3ei32_v_i8m2(__VA_ARGS__)
+#define vsuxseg4ei32_v_i8m2(...) __riscv_vsuxseg4ei32_v_i8m2(__VA_ARGS__)
+#define vsuxseg2ei64_v_i8mf8(...) __riscv_vsuxseg2ei64_v_i8mf8(__VA_ARGS__)
+#define vsuxseg3ei64_v_i8mf8(...) __riscv_vsuxseg3ei64_v_i8mf8(__VA_ARGS__)
+#define vsuxseg4ei64_v_i8mf8(...) __riscv_vsuxseg4ei64_v_i8mf8(__VA_ARGS__)
+#define vsuxseg5ei64_v_i8mf8(...) __riscv_vsuxseg5ei64_v_i8mf8(__VA_ARGS__)
+#define vsuxseg6ei64_v_i8mf8(...) __riscv_vsuxseg6ei64_v_i8mf8(__VA_ARGS__)
+#define vsuxseg7ei64_v_i8mf8(...) __riscv_vsuxseg7ei64_v_i8mf8(__VA_ARGS__)
+#define vsuxseg8ei64_v_i8mf8(...) __riscv_vsuxseg8ei64_v_i8mf8(__VA_ARGS__)
+#define vsuxseg2ei64_v_i8mf4(...) __riscv_vsuxseg2ei64_v_i8mf4(__VA_ARGS__)
+#define vsuxseg3ei64_v_i8mf4(...) __riscv_vsuxseg3ei64_v_i8mf4(__VA_ARGS__)
+#define vsuxseg4ei64_v_i8mf4(...) __riscv_vsuxseg4ei64_v_i8mf4(__VA_ARGS__)
+#define vsuxseg5ei64_v_i8mf4(...) __riscv_vsuxseg5ei64_v_i8mf4(__VA_ARGS__)
+#define vsuxseg6ei64_v_i8mf4(...) __riscv_vsuxseg6ei64_v_i8mf4(__VA_ARGS__)
+#define vsuxseg7ei64_v_i8mf4(...) __riscv_vsuxseg7ei64_v_i8mf4(__VA_ARGS__)
+#define vsuxseg8ei64_v_i8mf4(...) __riscv_vsuxseg8ei64_v_i8mf4(__VA_ARGS__)
+#define vsuxseg2ei64_v_i8mf2(...) __riscv_vsuxseg2ei64_v_i8mf2(__VA_ARGS__)
+#define vsuxseg3ei64_v_i8mf2(...) __riscv_vsuxseg3ei64_v_i8mf2(__VA_ARGS__)
+#define vsuxseg4ei64_v_i8mf2(...) __riscv_vsuxseg4ei64_v_i8mf2(__VA_ARGS__)
+#define vsuxseg5ei64_v_i8mf2(...) __riscv_vsuxseg5ei64_v_i8mf2(__VA_ARGS__)
+#define vsuxseg6ei64_v_i8mf2(...) __riscv_vsuxseg6ei64_v_i8mf2(__VA_ARGS__)
+#define vsuxseg7ei64_v_i8mf2(...) __riscv_vsuxseg7ei64_v_i8mf2(__VA_ARGS__)
+#define vsuxseg8ei64_v_i8mf2(...) __riscv_vsuxseg8ei64_v_i8mf2(__VA_ARGS__)
+#define vsuxseg2ei64_v_i8m1(...) __riscv_vsuxseg2ei64_v_i8m1(__VA_ARGS__)
+#define vsuxseg3ei64_v_i8m1(...) __riscv_vsuxseg3ei64_v_i8m1(__VA_ARGS__)
+#define vsuxseg4ei64_v_i8m1(...) __riscv_vsuxseg4ei64_v_i8m1(__VA_ARGS__)
+#define vsuxseg5ei64_v_i8m1(...) __riscv_vsuxseg5ei64_v_i8m1(__VA_ARGS__)
+#define vsuxseg6ei64_v_i8m1(...) __riscv_vsuxseg6ei64_v_i8m1(__VA_ARGS__)
+#define vsuxseg7ei64_v_i8m1(...) __riscv_vsuxseg7ei64_v_i8m1(__VA_ARGS__)
+#define vsuxseg8ei64_v_i8m1(...) __riscv_vsuxseg8ei64_v_i8m1(__VA_ARGS__)
+#define vsuxseg2ei8_v_i16mf4(...) __riscv_vsuxseg2ei8_v_i16mf4(__VA_ARGS__)
+#define vsuxseg3ei8_v_i16mf4(...) __riscv_vsuxseg3ei8_v_i16mf4(__VA_ARGS__)
+#define vsuxseg4ei8_v_i16mf4(...) __riscv_vsuxseg4ei8_v_i16mf4(__VA_ARGS__)
+#define vsuxseg5ei8_v_i16mf4(...) __riscv_vsuxseg5ei8_v_i16mf4(__VA_ARGS__)
+#define vsuxseg6ei8_v_i16mf4(...) __riscv_vsuxseg6ei8_v_i16mf4(__VA_ARGS__)
+#define vsuxseg7ei8_v_i16mf4(...) __riscv_vsuxseg7ei8_v_i16mf4(__VA_ARGS__)
+#define vsuxseg8ei8_v_i16mf4(...) __riscv_vsuxseg8ei8_v_i16mf4(__VA_ARGS__)
+#define vsuxseg2ei8_v_i16mf2(...) __riscv_vsuxseg2ei8_v_i16mf2(__VA_ARGS__)
+#define vsuxseg3ei8_v_i16mf2(...) __riscv_vsuxseg3ei8_v_i16mf2(__VA_ARGS__)
+#define vsuxseg4ei8_v_i16mf2(...) __riscv_vsuxseg4ei8_v_i16mf2(__VA_ARGS__)
+#define vsuxseg5ei8_v_i16mf2(...) __riscv_vsuxseg5ei8_v_i16mf2(__VA_ARGS__)
+#define vsuxseg6ei8_v_i16mf2(...) __riscv_vsuxseg6ei8_v_i16mf2(__VA_ARGS__)
+#define vsuxseg7ei8_v_i16mf2(...) __riscv_vsuxseg7ei8_v_i16mf2(__VA_ARGS__)
+#define vsuxseg8ei8_v_i16mf2(...) __riscv_vsuxseg8ei8_v_i16mf2(__VA_ARGS__)
+#define vsuxseg2ei8_v_i16m1(...) __riscv_vsuxseg2ei8_v_i16m1(__VA_ARGS__)
+#define vsuxseg3ei8_v_i16m1(...) __riscv_vsuxseg3ei8_v_i16m1(__VA_ARGS__)
+#define vsuxseg4ei8_v_i16m1(...) __riscv_vsuxseg4ei8_v_i16m1(__VA_ARGS__)
+#define vsuxseg5ei8_v_i16m1(...) __riscv_vsuxseg5ei8_v_i16m1(__VA_ARGS__)
+#define vsuxseg6ei8_v_i16m1(...) __riscv_vsuxseg6ei8_v_i16m1(__VA_ARGS__)
+#define vsuxseg7ei8_v_i16m1(...) __riscv_vsuxseg7ei8_v_i16m1(__VA_ARGS__)
+#define vsuxseg8ei8_v_i16m1(...) __riscv_vsuxseg8ei8_v_i16m1(__VA_ARGS__)
+#define vsuxseg2ei8_v_i16m2(...) __riscv_vsuxseg2ei8_v_i16m2(__VA_ARGS__)
+#define vsuxseg3ei8_v_i16m2(...) __riscv_vsuxseg3ei8_v_i16m2(__VA_ARGS__)
+#define vsuxseg4ei8_v_i16m2(...) __riscv_vsuxseg4ei8_v_i16m2(__VA_ARGS__)
+#define vsuxseg2ei8_v_i16m4(...) __riscv_vsuxseg2ei8_v_i16m4(__VA_ARGS__)
+#define vsuxseg2ei16_v_i16mf4(...) __riscv_vsuxseg2ei16_v_i16mf4(__VA_ARGS__)
+#define vsuxseg3ei16_v_i16mf4(...) __riscv_vsuxseg3ei16_v_i16mf4(__VA_ARGS__)
+#define vsuxseg4ei16_v_i16mf4(...) __riscv_vsuxseg4ei16_v_i16mf4(__VA_ARGS__)
+#define vsuxseg5ei16_v_i16mf4(...) __riscv_vsuxseg5ei16_v_i16mf4(__VA_ARGS__)
+#define vsuxseg6ei16_v_i16mf4(...) __riscv_vsuxseg6ei16_v_i16mf4(__VA_ARGS__)
+#define vsuxseg7ei16_v_i16mf4(...) __riscv_vsuxseg7ei16_v_i16mf4(__VA_ARGS__)
+#define vsuxseg8ei16_v_i16mf4(...) __riscv_vsuxseg8ei16_v_i16mf4(__VA_ARGS__)
+#define vsuxseg2ei16_v_i16mf2(...) __riscv_vsuxseg2ei16_v_i16mf2(__VA_ARGS__)
+#define vsuxseg3ei16_v_i16mf2(...) __riscv_vsuxseg3ei16_v_i16mf2(__VA_ARGS__)
+#define vsuxseg4ei16_v_i16mf2(...) __riscv_vsuxseg4ei16_v_i16mf2(__VA_ARGS__)
+#define vsuxseg5ei16_v_i16mf2(...) __riscv_vsuxseg5ei16_v_i16mf2(__VA_ARGS__)
+#define vsuxseg6ei16_v_i16mf2(...) __riscv_vsuxseg6ei16_v_i16mf2(__VA_ARGS__)
+#define vsuxseg7ei16_v_i16mf2(...) __riscv_vsuxseg7ei16_v_i16mf2(__VA_ARGS__)
+#define vsuxseg8ei16_v_i16mf2(...) __riscv_vsuxseg8ei16_v_i16mf2(__VA_ARGS__)
+#define vsuxseg2ei16_v_i16m1(...) __riscv_vsuxseg2ei16_v_i16m1(__VA_ARGS__)
+#define vsuxseg3ei16_v_i16m1(...) __riscv_vsuxseg3ei16_v_i16m1(__VA_ARGS__)
+#define vsuxseg4ei16_v_i16m1(...) __riscv_vsuxseg4ei16_v_i16m1(__VA_ARGS__)
+#define vsuxseg5ei16_v_i16m1(...) __riscv_vsuxseg5ei16_v_i16m1(__VA_ARGS__)
+#define vsuxseg6ei16_v_i16m1(...) __riscv_vsuxseg6ei16_v_i16m1(__VA_ARGS__)
+#define vsuxseg7ei16_v_i16m1(...) __riscv_vsuxseg7ei16_v_i16m1(__VA_ARGS__)
+#define vsuxseg8ei16_v_i16m1(...) __riscv_vsuxseg8ei16_v_i16m1(__VA_ARGS__)
+#define vsuxseg2ei16_v_i16m2(...) __riscv_vsuxseg2ei16_v_i16m2(__VA_ARGS__)
+#define vsuxseg3ei16_v_i16m2(...) __riscv_vsuxseg3ei16_v_i16m2(__VA_ARGS__)
+#define vsuxseg4ei16_v_i16m2(...) __riscv_vsuxseg4ei16_v_i16m2(__VA_ARGS__)
+#define vsuxseg2ei16_v_i16m4(...) __riscv_vsuxseg2ei16_v_i16m4(__VA_ARGS__)
+#define vsuxseg2ei32_v_i16mf4(...) __riscv_vsuxseg2ei32_v_i16mf4(__VA_ARGS__)
+#define vsuxseg3ei32_v_i16mf4(...) __riscv_vsuxseg3ei32_v_i16mf4(__VA_ARGS__)
+#define vsuxseg4ei32_v_i16mf4(...) __riscv_vsuxseg4ei32_v_i16mf4(__VA_ARGS__)
+#define vsuxseg5ei32_v_i16mf4(...) __riscv_vsuxseg5ei32_v_i16mf4(__VA_ARGS__)
+#define vsuxseg6ei32_v_i16mf4(...) __riscv_vsuxseg6ei32_v_i16mf4(__VA_ARGS__)
+#define vsuxseg7ei32_v_i16mf4(...) __riscv_vsuxseg7ei32_v_i16mf4(__VA_ARGS__)
+#define vsuxseg8ei32_v_i16mf4(...) __riscv_vsuxseg8ei32_v_i16mf4(__VA_ARGS__)
+#define vsuxseg2ei32_v_i16mf2(...) __riscv_vsuxseg2ei32_v_i16mf2(__VA_ARGS__)
+#define vsuxseg3ei32_v_i16mf2(...) __riscv_vsuxseg3ei32_v_i16mf2(__VA_ARGS__)
+#define vsuxseg4ei32_v_i16mf2(...) __riscv_vsuxseg4ei32_v_i16mf2(__VA_ARGS__)
+#define vsuxseg5ei32_v_i16mf2(...) __riscv_vsuxseg5ei32_v_i16mf2(__VA_ARGS__)
+#define vsuxseg6ei32_v_i16mf2(...) __riscv_vsuxseg6ei32_v_i16mf2(__VA_ARGS__)
+#define vsuxseg7ei32_v_i16mf2(...) __riscv_vsuxseg7ei32_v_i16mf2(__VA_ARGS__)
+#define vsuxseg8ei32_v_i16mf2(...) __riscv_vsuxseg8ei32_v_i16mf2(__VA_ARGS__)
+#define vsuxseg2ei32_v_i16m1(...) __riscv_vsuxseg2ei32_v_i16m1(__VA_ARGS__)
+#define vsuxseg3ei32_v_i16m1(...) __riscv_vsuxseg3ei32_v_i16m1(__VA_ARGS__)
+#define vsuxseg4ei32_v_i16m1(...) __riscv_vsuxseg4ei32_v_i16m1(__VA_ARGS__)
+#define vsuxseg5ei32_v_i16m1(...) __riscv_vsuxseg5ei32_v_i16m1(__VA_ARGS__)
+#define vsuxseg6ei32_v_i16m1(...) __riscv_vsuxseg6ei32_v_i16m1(__VA_ARGS__)
+#define vsuxseg7ei32_v_i16m1(...) __riscv_vsuxseg7ei32_v_i16m1(__VA_ARGS__)
+#define vsuxseg8ei32_v_i16m1(...) __riscv_vsuxseg8ei32_v_i16m1(__VA_ARGS__)
+#define vsuxseg2ei32_v_i16m2(...) __riscv_vsuxseg2ei32_v_i16m2(__VA_ARGS__)
+#define vsuxseg3ei32_v_i16m2(...) __riscv_vsuxseg3ei32_v_i16m2(__VA_ARGS__)
+#define vsuxseg4ei32_v_i16m2(...) __riscv_vsuxseg4ei32_v_i16m2(__VA_ARGS__)
+#define vsuxseg2ei32_v_i16m4(...) __riscv_vsuxseg2ei32_v_i16m4(__VA_ARGS__)
+#define vsuxseg2ei64_v_i16mf4(...) __riscv_vsuxseg2ei64_v_i16mf4(__VA_ARGS__)
+#define vsuxseg3ei64_v_i16mf4(...) __riscv_vsuxseg3ei64_v_i16mf4(__VA_ARGS__)
+#define vsuxseg4ei64_v_i16mf4(...) __riscv_vsuxseg4ei64_v_i16mf4(__VA_ARGS__)
+#define vsuxseg5ei64_v_i16mf4(...) __riscv_vsuxseg5ei64_v_i16mf4(__VA_ARGS__)
+#define vsuxseg6ei64_v_i16mf4(...) __riscv_vsuxseg6ei64_v_i16mf4(__VA_ARGS__)
+#define vsuxseg7ei64_v_i16mf4(...) __riscv_vsuxseg7ei64_v_i16mf4(__VA_ARGS__)
+#define vsuxseg8ei64_v_i16mf4(...) __riscv_vsuxseg8ei64_v_i16mf4(__VA_ARGS__)
+#define vsuxseg2ei64_v_i16mf2(...) __riscv_vsuxseg2ei64_v_i16mf2(__VA_ARGS__)
+#define vsuxseg3ei64_v_i16mf2(...) __riscv_vsuxseg3ei64_v_i16mf2(__VA_ARGS__)
+#define vsuxseg4ei64_v_i16mf2(...) __riscv_vsuxseg4ei64_v_i16mf2(__VA_ARGS__)
+#define vsuxseg5ei64_v_i16mf2(...) __riscv_vsuxseg5ei64_v_i16mf2(__VA_ARGS__)
+#define vsuxseg6ei64_v_i16mf2(...) __riscv_vsuxseg6ei64_v_i16mf2(__VA_ARGS__)
+#define vsuxseg7ei64_v_i16mf2(...) __riscv_vsuxseg7ei64_v_i16mf2(__VA_ARGS__)
+#define vsuxseg8ei64_v_i16mf2(...) __riscv_vsuxseg8ei64_v_i16mf2(__VA_ARGS__)
+#define vsuxseg2ei64_v_i16m1(...) __riscv_vsuxseg2ei64_v_i16m1(__VA_ARGS__)
+#define vsuxseg3ei64_v_i16m1(...) __riscv_vsuxseg3ei64_v_i16m1(__VA_ARGS__)
+#define vsuxseg4ei64_v_i16m1(...) __riscv_vsuxseg4ei64_v_i16m1(__VA_ARGS__)
+#define vsuxseg5ei64_v_i16m1(...) __riscv_vsuxseg5ei64_v_i16m1(__VA_ARGS__)
+#define vsuxseg6ei64_v_i16m1(...) __riscv_vsuxseg6ei64_v_i16m1(__VA_ARGS__)
+#define vsuxseg7ei64_v_i16m1(...) __riscv_vsuxseg7ei64_v_i16m1(__VA_ARGS__)
+#define vsuxseg8ei64_v_i16m1(...) __riscv_vsuxseg8ei64_v_i16m1(__VA_ARGS__)
+#define vsuxseg2ei64_v_i16m2(...) __riscv_vsuxseg2ei64_v_i16m2(__VA_ARGS__)
+#define vsuxseg3ei64_v_i16m2(...) __riscv_vsuxseg3ei64_v_i16m2(__VA_ARGS__)
+#define vsuxseg4ei64_v_i16m2(...) __riscv_vsuxseg4ei64_v_i16m2(__VA_ARGS__)
+#define vsuxseg2ei8_v_i32mf2(...) __riscv_vsuxseg2ei8_v_i32mf2(__VA_ARGS__)
+#define vsuxseg3ei8_v_i32mf2(...) __riscv_vsuxseg3ei8_v_i32mf2(__VA_ARGS__)
+#define vsuxseg4ei8_v_i32mf2(...) __riscv_vsuxseg4ei8_v_i32mf2(__VA_ARGS__)
+#define vsuxseg5ei8_v_i32mf2(...) __riscv_vsuxseg5ei8_v_i32mf2(__VA_ARGS__)
+#define vsuxseg6ei8_v_i32mf2(...) __riscv_vsuxseg6ei8_v_i32mf2(__VA_ARGS__)
+#define vsuxseg7ei8_v_i32mf2(...) __riscv_vsuxseg7ei8_v_i32mf2(__VA_ARGS__)
+#define vsuxseg8ei8_v_i32mf2(...) __riscv_vsuxseg8ei8_v_i32mf2(__VA_ARGS__)
+#define vsuxseg2ei8_v_i32m1(...) __riscv_vsuxseg2ei8_v_i32m1(__VA_ARGS__)
+#define vsuxseg3ei8_v_i32m1(...) __riscv_vsuxseg3ei8_v_i32m1(__VA_ARGS__)
+#define vsuxseg4ei8_v_i32m1(...) __riscv_vsuxseg4ei8_v_i32m1(__VA_ARGS__)
+#define vsuxseg5ei8_v_i32m1(...) __riscv_vsuxseg5ei8_v_i32m1(__VA_ARGS__)
+#define vsuxseg6ei8_v_i32m1(...) __riscv_vsuxseg6ei8_v_i32m1(__VA_ARGS__)
+#define vsuxseg7ei8_v_i32m1(...) __riscv_vsuxseg7ei8_v_i32m1(__VA_ARGS__)
+#define vsuxseg8ei8_v_i32m1(...) __riscv_vsuxseg8ei8_v_i32m1(__VA_ARGS__)
+#define vsuxseg2ei8_v_i32m2(...) __riscv_vsuxseg2ei8_v_i32m2(__VA_ARGS__)
+#define vsuxseg3ei8_v_i32m2(...) __riscv_vsuxseg3ei8_v_i32m2(__VA_ARGS__)
+#define vsuxseg4ei8_v_i32m2(...) __riscv_vsuxseg4ei8_v_i32m2(__VA_ARGS__)
+#define vsuxseg2ei8_v_i32m4(...) __riscv_vsuxseg2ei8_v_i32m4(__VA_ARGS__)
+#define vsuxseg2ei16_v_i32mf2(...) __riscv_vsuxseg2ei16_v_i32mf2(__VA_ARGS__)
+#define vsuxseg3ei16_v_i32mf2(...) __riscv_vsuxseg3ei16_v_i32mf2(__VA_ARGS__)
+#define vsuxseg4ei16_v_i32mf2(...) __riscv_vsuxseg4ei16_v_i32mf2(__VA_ARGS__)
+#define vsuxseg5ei16_v_i32mf2(...) __riscv_vsuxseg5ei16_v_i32mf2(__VA_ARGS__)
+#define vsuxseg6ei16_v_i32mf2(...) __riscv_vsuxseg6ei16_v_i32mf2(__VA_ARGS__)
+#define vsuxseg7ei16_v_i32mf2(...) __riscv_vsuxseg7ei16_v_i32mf2(__VA_ARGS__)
+#define vsuxseg8ei16_v_i32mf2(...) __riscv_vsuxseg8ei16_v_i32mf2(__VA_ARGS__)
+#define vsuxseg2ei16_v_i32m1(...) __riscv_vsuxseg2ei16_v_i32m1(__VA_ARGS__)
+#define vsuxseg3ei16_v_i32m1(...) __riscv_vsuxseg3ei16_v_i32m1(__VA_ARGS__)
+#define vsuxseg4ei16_v_i32m1(...) __riscv_vsuxseg4ei16_v_i32m1(__VA_ARGS__)
+#define vsuxseg5ei16_v_i32m1(...) __riscv_vsuxseg5ei16_v_i32m1(__VA_ARGS__)
+#define vsuxseg6ei16_v_i32m1(...) __riscv_vsuxseg6ei16_v_i32m1(__VA_ARGS__)
+#define vsuxseg7ei16_v_i32m1(...) __riscv_vsuxseg7ei16_v_i32m1(__VA_ARGS__)
+#define vsuxseg8ei16_v_i32m1(...) __riscv_vsuxseg8ei16_v_i32m1(__VA_ARGS__)
+#define vsuxseg2ei16_v_i32m2(...) __riscv_vsuxseg2ei16_v_i32m2(__VA_ARGS__)
+#define vsuxseg3ei16_v_i32m2(...) __riscv_vsuxseg3ei16_v_i32m2(__VA_ARGS__)
+#define vsuxseg4ei16_v_i32m2(...) __riscv_vsuxseg4ei16_v_i32m2(__VA_ARGS__)
+#define vsuxseg2ei16_v_i32m4(...) __riscv_vsuxseg2ei16_v_i32m4(__VA_ARGS__)
+#define vsuxseg2ei32_v_i32mf2(...) __riscv_vsuxseg2ei32_v_i32mf2(__VA_ARGS__)
+#define vsuxseg3ei32_v_i32mf2(...) __riscv_vsuxseg3ei32_v_i32mf2(__VA_ARGS__)
+#define vsuxseg4ei32_v_i32mf2(...) __riscv_vsuxseg4ei32_v_i32mf2(__VA_ARGS__)
+#define vsuxseg5ei32_v_i32mf2(...) __riscv_vsuxseg5ei32_v_i32mf2(__VA_ARGS__)
+#define vsuxseg6ei32_v_i32mf2(...) __riscv_vsuxseg6ei32_v_i32mf2(__VA_ARGS__)
+#define vsuxseg7ei32_v_i32mf2(...) __riscv_vsuxseg7ei32_v_i32mf2(__VA_ARGS__)
+#define vsuxseg8ei32_v_i32mf2(...) __riscv_vsuxseg8ei32_v_i32mf2(__VA_ARGS__)
+#define vsuxseg2ei32_v_i32m1(...) __riscv_vsuxseg2ei32_v_i32m1(__VA_ARGS__)
+#define vsuxseg3ei32_v_i32m1(...) __riscv_vsuxseg3ei32_v_i32m1(__VA_ARGS__)
+#define vsuxseg4ei32_v_i32m1(...) __riscv_vsuxseg4ei32_v_i32m1(__VA_ARGS__)
+#define vsuxseg5ei32_v_i32m1(...) __riscv_vsuxseg5ei32_v_i32m1(__VA_ARGS__)
+#define vsuxseg6ei32_v_i32m1(...) __riscv_vsuxseg6ei32_v_i32m1(__VA_ARGS__)
+#define vsuxseg7ei32_v_i32m1(...) __riscv_vsuxseg7ei32_v_i32m1(__VA_ARGS__)
+#define vsuxseg8ei32_v_i32m1(...) __riscv_vsuxseg8ei32_v_i32m1(__VA_ARGS__)
+#define vsuxseg2ei32_v_i32m2(...) __riscv_vsuxseg2ei32_v_i32m2(__VA_ARGS__)
+#define vsuxseg3ei32_v_i32m2(...) __riscv_vsuxseg3ei32_v_i32m2(__VA_ARGS__)
+#define vsuxseg4ei32_v_i32m2(...) __riscv_vsuxseg4ei32_v_i32m2(__VA_ARGS__)
+#define vsuxseg2ei32_v_i32m4(...) __riscv_vsuxseg2ei32_v_i32m4(__VA_ARGS__)
+#define vsuxseg2ei64_v_i32mf2(...) __riscv_vsuxseg2ei64_v_i32mf2(__VA_ARGS__)
+#define vsuxseg3ei64_v_i32mf2(...) __riscv_vsuxseg3ei64_v_i32mf2(__VA_ARGS__)
+#define vsuxseg4ei64_v_i32mf2(...) __riscv_vsuxseg4ei64_v_i32mf2(__VA_ARGS__)
+#define vsuxseg5ei64_v_i32mf2(...) __riscv_vsuxseg5ei64_v_i32mf2(__VA_ARGS__)
+#define vsuxseg6ei64_v_i32mf2(...) __riscv_vsuxseg6ei64_v_i32mf2(__VA_ARGS__)
+#define vsuxseg7ei64_v_i32mf2(...) __riscv_vsuxseg7ei64_v_i32mf2(__VA_ARGS__)
+#define vsuxseg8ei64_v_i32mf2(...) __riscv_vsuxseg8ei64_v_i32mf2(__VA_ARGS__)
+#define vsuxseg2ei64_v_i32m1(...) __riscv_vsuxseg2ei64_v_i32m1(__VA_ARGS__)
+#define vsuxseg3ei64_v_i32m1(...) __riscv_vsuxseg3ei64_v_i32m1(__VA_ARGS__)
+#define vsuxseg4ei64_v_i32m1(...) __riscv_vsuxseg4ei64_v_i32m1(__VA_ARGS__)
+#define vsuxseg5ei64_v_i32m1(...) __riscv_vsuxseg5ei64_v_i32m1(__VA_ARGS__)
+#define vsuxseg6ei64_v_i32m1(...) __riscv_vsuxseg6ei64_v_i32m1(__VA_ARGS__)
+#define vsuxseg7ei64_v_i32m1(...) __riscv_vsuxseg7ei64_v_i32m1(__VA_ARGS__)
+#define vsuxseg8ei64_v_i32m1(...) __riscv_vsuxseg8ei64_v_i32m1(__VA_ARGS__)
+#define vsuxseg2ei64_v_i32m2(...) __riscv_vsuxseg2ei64_v_i32m2(__VA_ARGS__)
+#define vsuxseg3ei64_v_i32m2(...) __riscv_vsuxseg3ei64_v_i32m2(__VA_ARGS__)
+#define vsuxseg4ei64_v_i32m2(...) __riscv_vsuxseg4ei64_v_i32m2(__VA_ARGS__)
+#define vsuxseg2ei64_v_i32m4(...) __riscv_vsuxseg2ei64_v_i32m4(__VA_ARGS__)
+#define vsuxseg2ei8_v_i64m1(...) __riscv_vsuxseg2ei8_v_i64m1(__VA_ARGS__)
+#define vsuxseg3ei8_v_i64m1(...) __riscv_vsuxseg3ei8_v_i64m1(__VA_ARGS__)
+#define vsuxseg4ei8_v_i64m1(...) __riscv_vsuxseg4ei8_v_i64m1(__VA_ARGS__)
+#define vsuxseg5ei8_v_i64m1(...) __riscv_vsuxseg5ei8_v_i64m1(__VA_ARGS__)
+#define vsuxseg6ei8_v_i64m1(...) __riscv_vsuxseg6ei8_v_i64m1(__VA_ARGS__)
+#define vsuxseg7ei8_v_i64m1(...) __riscv_vsuxseg7ei8_v_i64m1(__VA_ARGS__)
+#define vsuxseg8ei8_v_i64m1(...) __riscv_vsuxseg8ei8_v_i64m1(__VA_ARGS__)
+#define vsuxseg2ei8_v_i64m2(...) __riscv_vsuxseg2ei8_v_i64m2(__VA_ARGS__)
+#define vsuxseg3ei8_v_i64m2(...) __riscv_vsuxseg3ei8_v_i64m2(__VA_ARGS__)
+#define vsuxseg4ei8_v_i64m2(...) __riscv_vsuxseg4ei8_v_i64m2(__VA_ARGS__)
+#define vsuxseg2ei8_v_i64m4(...) __riscv_vsuxseg2ei8_v_i64m4(__VA_ARGS__)
+#define vsuxseg2ei16_v_i64m1(...) __riscv_vsuxseg2ei16_v_i64m1(__VA_ARGS__)
+#define vsuxseg3ei16_v_i64m1(...) __riscv_vsuxseg3ei16_v_i64m1(__VA_ARGS__)
+#define vsuxseg4ei16_v_i64m1(...) __riscv_vsuxseg4ei16_v_i64m1(__VA_ARGS__)
+#define vsuxseg5ei16_v_i64m1(...) __riscv_vsuxseg5ei16_v_i64m1(__VA_ARGS__)
+#define vsuxseg6ei16_v_i64m1(...) __riscv_vsuxseg6ei16_v_i64m1(__VA_ARGS__)
+#define vsuxseg7ei16_v_i64m1(...) __riscv_vsuxseg7ei16_v_i64m1(__VA_ARGS__)
+#define vsuxseg8ei16_v_i64m1(...) __riscv_vsuxseg8ei16_v_i64m1(__VA_ARGS__)
+#define vsuxseg2ei16_v_i64m2(...) __riscv_vsuxseg2ei16_v_i64m2(__VA_ARGS__)
+#define vsuxseg3ei16_v_i64m2(...) __riscv_vsuxseg3ei16_v_i64m2(__VA_ARGS__)
+#define vsuxseg4ei16_v_i64m2(...) __riscv_vsuxseg4ei16_v_i64m2(__VA_ARGS__)
+#define vsuxseg2ei16_v_i64m4(...) __riscv_vsuxseg2ei16_v_i64m4(__VA_ARGS__)
+#define vsuxseg2ei32_v_i64m1(...) __riscv_vsuxseg2ei32_v_i64m1(__VA_ARGS__)
+#define vsuxseg3ei32_v_i64m1(...) __riscv_vsuxseg3ei32_v_i64m1(__VA_ARGS__)
+#define vsuxseg4ei32_v_i64m1(...) __riscv_vsuxseg4ei32_v_i64m1(__VA_ARGS__)
+#define vsuxseg5ei32_v_i64m1(...) __riscv_vsuxseg5ei32_v_i64m1(__VA_ARGS__)
+#define vsuxseg6ei32_v_i64m1(...) __riscv_vsuxseg6ei32_v_i64m1(__VA_ARGS__)
+#define vsuxseg7ei32_v_i64m1(...) __riscv_vsuxseg7ei32_v_i64m1(__VA_ARGS__)
+#define vsuxseg8ei32_v_i64m1(...) __riscv_vsuxseg8ei32_v_i64m1(__VA_ARGS__)
+#define vsuxseg2ei32_v_i64m2(...) __riscv_vsuxseg2ei32_v_i64m2(__VA_ARGS__)
+#define vsuxseg3ei32_v_i64m2(...) __riscv_vsuxseg3ei32_v_i64m2(__VA_ARGS__)
+#define vsuxseg4ei32_v_i64m2(...) __riscv_vsuxseg4ei32_v_i64m2(__VA_ARGS__)
+#define vsuxseg2ei32_v_i64m4(...) __riscv_vsuxseg2ei32_v_i64m4(__VA_ARGS__)
+#define vsuxseg2ei64_v_i64m1(...) __riscv_vsuxseg2ei64_v_i64m1(__VA_ARGS__)
+#define vsuxseg3ei64_v_i64m1(...) __riscv_vsuxseg3ei64_v_i64m1(__VA_ARGS__)
+#define vsuxseg4ei64_v_i64m1(...) __riscv_vsuxseg4ei64_v_i64m1(__VA_ARGS__)
+#define vsuxseg5ei64_v_i64m1(...) __riscv_vsuxseg5ei64_v_i64m1(__VA_ARGS__)
+#define vsuxseg6ei64_v_i64m1(...) __riscv_vsuxseg6ei64_v_i64m1(__VA_ARGS__)
+#define vsuxseg7ei64_v_i64m1(...) __riscv_vsuxseg7ei64_v_i64m1(__VA_ARGS__)
+#define vsuxseg8ei64_v_i64m1(...) __riscv_vsuxseg8ei64_v_i64m1(__VA_ARGS__)
+#define vsuxseg2ei64_v_i64m2(...) __riscv_vsuxseg2ei64_v_i64m2(__VA_ARGS__)
+#define vsuxseg3ei64_v_i64m2(...) __riscv_vsuxseg3ei64_v_i64m2(__VA_ARGS__)
+#define vsuxseg4ei64_v_i64m2(...) __riscv_vsuxseg4ei64_v_i64m2(__VA_ARGS__)
+#define vsuxseg2ei64_v_i64m4(...) __riscv_vsuxseg2ei64_v_i64m4(__VA_ARGS__)
+#define vsoxseg2ei8_v_u8mf8(...) __riscv_vsoxseg2ei8_v_u8mf8(__VA_ARGS__)
+#define vsoxseg3ei8_v_u8mf8(...) __riscv_vsoxseg3ei8_v_u8mf8(__VA_ARGS__)
+#define vsoxseg4ei8_v_u8mf8(...) __riscv_vsoxseg4ei8_v_u8mf8(__VA_ARGS__)
+#define vsoxseg5ei8_v_u8mf8(...) __riscv_vsoxseg5ei8_v_u8mf8(__VA_ARGS__)
+#define vsoxseg6ei8_v_u8mf8(...) __riscv_vsoxseg6ei8_v_u8mf8(__VA_ARGS__)
+#define vsoxseg7ei8_v_u8mf8(...) __riscv_vsoxseg7ei8_v_u8mf8(__VA_ARGS__)
+#define vsoxseg8ei8_v_u8mf8(...) __riscv_vsoxseg8ei8_v_u8mf8(__VA_ARGS__)
+#define vsoxseg2ei8_v_u8mf4(...) __riscv_vsoxseg2ei8_v_u8mf4(__VA_ARGS__)
+#define vsoxseg3ei8_v_u8mf4(...) __riscv_vsoxseg3ei8_v_u8mf4(__VA_ARGS__)
+#define vsoxseg4ei8_v_u8mf4(...) __riscv_vsoxseg4ei8_v_u8mf4(__VA_ARGS__)
+#define vsoxseg5ei8_v_u8mf4(...) __riscv_vsoxseg5ei8_v_u8mf4(__VA_ARGS__)
+#define vsoxseg6ei8_v_u8mf4(...) __riscv_vsoxseg6ei8_v_u8mf4(__VA_ARGS__)
+#define vsoxseg7ei8_v_u8mf4(...) __riscv_vsoxseg7ei8_v_u8mf4(__VA_ARGS__)
+#define vsoxseg8ei8_v_u8mf4(...) __riscv_vsoxseg8ei8_v_u8mf4(__VA_ARGS__)
+#define vsoxseg2ei8_v_u8mf2(...) __riscv_vsoxseg2ei8_v_u8mf2(__VA_ARGS__)
+#define vsoxseg3ei8_v_u8mf2(...) __riscv_vsoxseg3ei8_v_u8mf2(__VA_ARGS__)
+#define vsoxseg4ei8_v_u8mf2(...) __riscv_vsoxseg4ei8_v_u8mf2(__VA_ARGS__)
+#define vsoxseg5ei8_v_u8mf2(...) __riscv_vsoxseg5ei8_v_u8mf2(__VA_ARGS__)
+#define vsoxseg6ei8_v_u8mf2(...) __riscv_vsoxseg6ei8_v_u8mf2(__VA_ARGS__)
+#define vsoxseg7ei8_v_u8mf2(...) __riscv_vsoxseg7ei8_v_u8mf2(__VA_ARGS__)
+#define vsoxseg8ei8_v_u8mf2(...) __riscv_vsoxseg8ei8_v_u8mf2(__VA_ARGS__)
+#define vsoxseg2ei8_v_u8m1(...) __riscv_vsoxseg2ei8_v_u8m1(__VA_ARGS__)
+#define vsoxseg3ei8_v_u8m1(...) __riscv_vsoxseg3ei8_v_u8m1(__VA_ARGS__)
+#define vsoxseg4ei8_v_u8m1(...) __riscv_vsoxseg4ei8_v_u8m1(__VA_ARGS__)
+#define vsoxseg5ei8_v_u8m1(...) __riscv_vsoxseg5ei8_v_u8m1(__VA_ARGS__)
+#define vsoxseg6ei8_v_u8m1(...) __riscv_vsoxseg6ei8_v_u8m1(__VA_ARGS__)
+#define vsoxseg7ei8_v_u8m1(...) __riscv_vsoxseg7ei8_v_u8m1(__VA_ARGS__)
+#define vsoxseg8ei8_v_u8m1(...) __riscv_vsoxseg8ei8_v_u8m1(__VA_ARGS__)
+#define vsoxseg2ei8_v_u8m2(...) __riscv_vsoxseg2ei8_v_u8m2(__VA_ARGS__)
+#define vsoxseg3ei8_v_u8m2(...) __riscv_vsoxseg3ei8_v_u8m2(__VA_ARGS__)
+#define vsoxseg4ei8_v_u8m2(...) __riscv_vsoxseg4ei8_v_u8m2(__VA_ARGS__)
+#define vsoxseg2ei8_v_u8m4(...) __riscv_vsoxseg2ei8_v_u8m4(__VA_ARGS__)
+#define vsoxseg2ei16_v_u8mf8(...) __riscv_vsoxseg2ei16_v_u8mf8(__VA_ARGS__)
+#define vsoxseg3ei16_v_u8mf8(...) __riscv_vsoxseg3ei16_v_u8mf8(__VA_ARGS__)
+#define vsoxseg4ei16_v_u8mf8(...) __riscv_vsoxseg4ei16_v_u8mf8(__VA_ARGS__)
+#define vsoxseg5ei16_v_u8mf8(...) __riscv_vsoxseg5ei16_v_u8mf8(__VA_ARGS__)
+#define vsoxseg6ei16_v_u8mf8(...) __riscv_vsoxseg6ei16_v_u8mf8(__VA_ARGS__)
+#define vsoxseg7ei16_v_u8mf8(...) __riscv_vsoxseg7ei16_v_u8mf8(__VA_ARGS__)
+#define vsoxseg8ei16_v_u8mf8(...) __riscv_vsoxseg8ei16_v_u8mf8(__VA_ARGS__)
+#define vsoxseg2ei16_v_u8mf4(...) __riscv_vsoxseg2ei16_v_u8mf4(__VA_ARGS__)
+#define vsoxseg3ei16_v_u8mf4(...) __riscv_vsoxseg3ei16_v_u8mf4(__VA_ARGS__)
+#define vsoxseg4ei16_v_u8mf4(...) __riscv_vsoxseg4ei16_v_u8mf4(__VA_ARGS__)
+#define vsoxseg5ei16_v_u8mf4(...) __riscv_vsoxseg5ei16_v_u8mf4(__VA_ARGS__)
+#define vsoxseg6ei16_v_u8mf4(...) __riscv_vsoxseg6ei16_v_u8mf4(__VA_ARGS__)
+#define vsoxseg7ei16_v_u8mf4(...) __riscv_vsoxseg7ei16_v_u8mf4(__VA_ARGS__)
+#define vsoxseg8ei16_v_u8mf4(...) __riscv_vsoxseg8ei16_v_u8mf4(__VA_ARGS__)
+#define vsoxseg2ei16_v_u8mf2(...) __riscv_vsoxseg2ei16_v_u8mf2(__VA_ARGS__)
+#define vsoxseg3ei16_v_u8mf2(...) __riscv_vsoxseg3ei16_v_u8mf2(__VA_ARGS__)
+#define vsoxseg4ei16_v_u8mf2(...) __riscv_vsoxseg4ei16_v_u8mf2(__VA_ARGS__)
+#define vsoxseg5ei16_v_u8mf2(...) __riscv_vsoxseg5ei16_v_u8mf2(__VA_ARGS__)
+#define vsoxseg6ei16_v_u8mf2(...) __riscv_vsoxseg6ei16_v_u8mf2(__VA_ARGS__)
+#define vsoxseg7ei16_v_u8mf2(...) __riscv_vsoxseg7ei16_v_u8mf2(__VA_ARGS__)
+#define vsoxseg8ei16_v_u8mf2(...) __riscv_vsoxseg8ei16_v_u8mf2(__VA_ARGS__)
+#define vsoxseg2ei16_v_u8m1(...) __riscv_vsoxseg2ei16_v_u8m1(__VA_ARGS__)
+#define vsoxseg3ei16_v_u8m1(...) __riscv_vsoxseg3ei16_v_u8m1(__VA_ARGS__)
+#define vsoxseg4ei16_v_u8m1(...) __riscv_vsoxseg4ei16_v_u8m1(__VA_ARGS__)
+#define vsoxseg5ei16_v_u8m1(...) __riscv_vsoxseg5ei16_v_u8m1(__VA_ARGS__)
+#define vsoxseg6ei16_v_u8m1(...) __riscv_vsoxseg6ei16_v_u8m1(__VA_ARGS__)
+#define vsoxseg7ei16_v_u8m1(...) __riscv_vsoxseg7ei16_v_u8m1(__VA_ARGS__)
+#define vsoxseg8ei16_v_u8m1(...) __riscv_vsoxseg8ei16_v_u8m1(__VA_ARGS__)
+#define vsoxseg2ei16_v_u8m2(...) __riscv_vsoxseg2ei16_v_u8m2(__VA_ARGS__)
+#define vsoxseg3ei16_v_u8m2(...) __riscv_vsoxseg3ei16_v_u8m2(__VA_ARGS__)
+#define vsoxseg4ei16_v_u8m2(...) __riscv_vsoxseg4ei16_v_u8m2(__VA_ARGS__)
+#define vsoxseg2ei16_v_u8m4(...) __riscv_vsoxseg2ei16_v_u8m4(__VA_ARGS__)
+#define vsoxseg2ei32_v_u8mf8(...) __riscv_vsoxseg2ei32_v_u8mf8(__VA_ARGS__)
+#define vsoxseg3ei32_v_u8mf8(...) __riscv_vsoxseg3ei32_v_u8mf8(__VA_ARGS__)
+#define vsoxseg4ei32_v_u8mf8(...) __riscv_vsoxseg4ei32_v_u8mf8(__VA_ARGS__)
+#define vsoxseg5ei32_v_u8mf8(...) __riscv_vsoxseg5ei32_v_u8mf8(__VA_ARGS__)
+#define vsoxseg6ei32_v_u8mf8(...) __riscv_vsoxseg6ei32_v_u8mf8(__VA_ARGS__)
+#define vsoxseg7ei32_v_u8mf8(...) __riscv_vsoxseg7ei32_v_u8mf8(__VA_ARGS__)
+#define vsoxseg8ei32_v_u8mf8(...) __riscv_vsoxseg8ei32_v_u8mf8(__VA_ARGS__)
+#define vsoxseg2ei32_v_u8mf4(...) __riscv_vsoxseg2ei32_v_u8mf4(__VA_ARGS__)
+#define vsoxseg3ei32_v_u8mf4(...) __riscv_vsoxseg3ei32_v_u8mf4(__VA_ARGS__)
+#define vsoxseg4ei32_v_u8mf4(...) __riscv_vsoxseg4ei32_v_u8mf4(__VA_ARGS__)
+#define vsoxseg5ei32_v_u8mf4(...) __riscv_vsoxseg5ei32_v_u8mf4(__VA_ARGS__)
+#define vsoxseg6ei32_v_u8mf4(...) __riscv_vsoxseg6ei32_v_u8mf4(__VA_ARGS__)
+#define vsoxseg7ei32_v_u8mf4(...) __riscv_vsoxseg7ei32_v_u8mf4(__VA_ARGS__)
+#define vsoxseg8ei32_v_u8mf4(...) __riscv_vsoxseg8ei32_v_u8mf4(__VA_ARGS__)
+#define vsoxseg2ei32_v_u8mf2(...) __riscv_vsoxseg2ei32_v_u8mf2(__VA_ARGS__)
+#define vsoxseg3ei32_v_u8mf2(...) __riscv_vsoxseg3ei32_v_u8mf2(__VA_ARGS__)
+#define vsoxseg4ei32_v_u8mf2(...) __riscv_vsoxseg4ei32_v_u8mf2(__VA_ARGS__)
+#define vsoxseg5ei32_v_u8mf2(...) __riscv_vsoxseg5ei32_v_u8mf2(__VA_ARGS__)
+#define vsoxseg6ei32_v_u8mf2(...) __riscv_vsoxseg6ei32_v_u8mf2(__VA_ARGS__)
+#define vsoxseg7ei32_v_u8mf2(...) __riscv_vsoxseg7ei32_v_u8mf2(__VA_ARGS__)
+#define vsoxseg8ei32_v_u8mf2(...) __riscv_vsoxseg8ei32_v_u8mf2(__VA_ARGS__)
+#define vsoxseg2ei32_v_u8m1(...) __riscv_vsoxseg2ei32_v_u8m1(__VA_ARGS__)
+#define vsoxseg3ei32_v_u8m1(...) __riscv_vsoxseg3ei32_v_u8m1(__VA_ARGS__)
+#define vsoxseg4ei32_v_u8m1(...) __riscv_vsoxseg4ei32_v_u8m1(__VA_ARGS__)
+#define vsoxseg5ei32_v_u8m1(...) __riscv_vsoxseg5ei32_v_u8m1(__VA_ARGS__)
+#define vsoxseg6ei32_v_u8m1(...) __riscv_vsoxseg6ei32_v_u8m1(__VA_ARGS__)
+#define vsoxseg7ei32_v_u8m1(...) __riscv_vsoxseg7ei32_v_u8m1(__VA_ARGS__)
+#define vsoxseg8ei32_v_u8m1(...) __riscv_vsoxseg8ei32_v_u8m1(__VA_ARGS__)
+#define vsoxseg2ei32_v_u8m2(...) __riscv_vsoxseg2ei32_v_u8m2(__VA_ARGS__)
+#define vsoxseg3ei32_v_u8m2(...) __riscv_vsoxseg3ei32_v_u8m2(__VA_ARGS__)
+#define vsoxseg4ei32_v_u8m2(...) __riscv_vsoxseg4ei32_v_u8m2(__VA_ARGS__)
+#define vsoxseg2ei64_v_u8mf8(...) __riscv_vsoxseg2ei64_v_u8mf8(__VA_ARGS__)
+#define vsoxseg3ei64_v_u8mf8(...) __riscv_vsoxseg3ei64_v_u8mf8(__VA_ARGS__)
+#define vsoxseg4ei64_v_u8mf8(...) __riscv_vsoxseg4ei64_v_u8mf8(__VA_ARGS__)
+#define vsoxseg5ei64_v_u8mf8(...) __riscv_vsoxseg5ei64_v_u8mf8(__VA_ARGS__)
+#define vsoxseg6ei64_v_u8mf8(...) __riscv_vsoxseg6ei64_v_u8mf8(__VA_ARGS__)
+#define vsoxseg7ei64_v_u8mf8(...) __riscv_vsoxseg7ei64_v_u8mf8(__VA_ARGS__)
+#define vsoxseg8ei64_v_u8mf8(...) __riscv_vsoxseg8ei64_v_u8mf8(__VA_ARGS__)
+#define vsoxseg2ei64_v_u8mf4(...) __riscv_vsoxseg2ei64_v_u8mf4(__VA_ARGS__)
+#define vsoxseg3ei64_v_u8mf4(...) __riscv_vsoxseg3ei64_v_u8mf4(__VA_ARGS__)
+#define vsoxseg4ei64_v_u8mf4(...) __riscv_vsoxseg4ei64_v_u8mf4(__VA_ARGS__)
+#define vsoxseg5ei64_v_u8mf4(...) __riscv_vsoxseg5ei64_v_u8mf4(__VA_ARGS__)
+#define vsoxseg6ei64_v_u8mf4(...) __riscv_vsoxseg6ei64_v_u8mf4(__VA_ARGS__)
+#define vsoxseg7ei64_v_u8mf4(...) __riscv_vsoxseg7ei64_v_u8mf4(__VA_ARGS__)
+#define vsoxseg8ei64_v_u8mf4(...) __riscv_vsoxseg8ei64_v_u8mf4(__VA_ARGS__)
+#define vsoxseg2ei64_v_u8mf2(...) __riscv_vsoxseg2ei64_v_u8mf2(__VA_ARGS__)
+#define vsoxseg3ei64_v_u8mf2(...) __riscv_vsoxseg3ei64_v_u8mf2(__VA_ARGS__)
+#define vsoxseg4ei64_v_u8mf2(...) __riscv_vsoxseg4ei64_v_u8mf2(__VA_ARGS__)
+#define vsoxseg5ei64_v_u8mf2(...) __riscv_vsoxseg5ei64_v_u8mf2(__VA_ARGS__)
+#define vsoxseg6ei64_v_u8mf2(...) __riscv_vsoxseg6ei64_v_u8mf2(__VA_ARGS__)
+#define vsoxseg7ei64_v_u8mf2(...) __riscv_vsoxseg7ei64_v_u8mf2(__VA_ARGS__)
+#define vsoxseg8ei64_v_u8mf2(...) __riscv_vsoxseg8ei64_v_u8mf2(__VA_ARGS__)
+#define vsoxseg2ei64_v_u8m1(...) __riscv_vsoxseg2ei64_v_u8m1(__VA_ARGS__)
+#define vsoxseg3ei64_v_u8m1(...) __riscv_vsoxseg3ei64_v_u8m1(__VA_ARGS__)
+#define vsoxseg4ei64_v_u8m1(...) __riscv_vsoxseg4ei64_v_u8m1(__VA_ARGS__)
+#define vsoxseg5ei64_v_u8m1(...) __riscv_vsoxseg5ei64_v_u8m1(__VA_ARGS__)
+#define vsoxseg6ei64_v_u8m1(...) __riscv_vsoxseg6ei64_v_u8m1(__VA_ARGS__)
+#define vsoxseg7ei64_v_u8m1(...) __riscv_vsoxseg7ei64_v_u8m1(__VA_ARGS__)
+#define vsoxseg8ei64_v_u8m1(...) __riscv_vsoxseg8ei64_v_u8m1(__VA_ARGS__)
+#define vsoxseg2ei8_v_u16mf4(...) __riscv_vsoxseg2ei8_v_u16mf4(__VA_ARGS__)
+#define vsoxseg3ei8_v_u16mf4(...) __riscv_vsoxseg3ei8_v_u16mf4(__VA_ARGS__)
+#define vsoxseg4ei8_v_u16mf4(...) __riscv_vsoxseg4ei8_v_u16mf4(__VA_ARGS__)
+#define vsoxseg5ei8_v_u16mf4(...) __riscv_vsoxseg5ei8_v_u16mf4(__VA_ARGS__)
+#define vsoxseg6ei8_v_u16mf4(...) __riscv_vsoxseg6ei8_v_u16mf4(__VA_ARGS__)
+#define vsoxseg7ei8_v_u16mf4(...) __riscv_vsoxseg7ei8_v_u16mf4(__VA_ARGS__)
+#define vsoxseg8ei8_v_u16mf4(...) __riscv_vsoxseg8ei8_v_u16mf4(__VA_ARGS__)
+#define vsoxseg2ei8_v_u16mf2(...) __riscv_vsoxseg2ei8_v_u16mf2(__VA_ARGS__)
+#define vsoxseg3ei8_v_u16mf2(...) __riscv_vsoxseg3ei8_v_u16mf2(__VA_ARGS__)
+#define vsoxseg4ei8_v_u16mf2(...) __riscv_vsoxseg4ei8_v_u16mf2(__VA_ARGS__)
+#define vsoxseg5ei8_v_u16mf2(...) __riscv_vsoxseg5ei8_v_u16mf2(__VA_ARGS__)
+#define vsoxseg6ei8_v_u16mf2(...) __riscv_vsoxseg6ei8_v_u16mf2(__VA_ARGS__)
+#define vsoxseg7ei8_v_u16mf2(...) __riscv_vsoxseg7ei8_v_u16mf2(__VA_ARGS__)
+#define vsoxseg8ei8_v_u16mf2(...) __riscv_vsoxseg8ei8_v_u16mf2(__VA_ARGS__)
+#define vsoxseg2ei8_v_u16m1(...) __riscv_vsoxseg2ei8_v_u16m1(__VA_ARGS__)
+#define vsoxseg3ei8_v_u16m1(...) __riscv_vsoxseg3ei8_v_u16m1(__VA_ARGS__)
+#define vsoxseg4ei8_v_u16m1(...) __riscv_vsoxseg4ei8_v_u16m1(__VA_ARGS__)
+#define vsoxseg5ei8_v_u16m1(...) __riscv_vsoxseg5ei8_v_u16m1(__VA_ARGS__)
+#define vsoxseg6ei8_v_u16m1(...) __riscv_vsoxseg6ei8_v_u16m1(__VA_ARGS__)
+#define vsoxseg7ei8_v_u16m1(...) __riscv_vsoxseg7ei8_v_u16m1(__VA_ARGS__)
+#define vsoxseg8ei8_v_u16m1(...) __riscv_vsoxseg8ei8_v_u16m1(__VA_ARGS__)
+#define vsoxseg2ei8_v_u16m2(...) __riscv_vsoxseg2ei8_v_u16m2(__VA_ARGS__)
+#define vsoxseg3ei8_v_u16m2(...) __riscv_vsoxseg3ei8_v_u16m2(__VA_ARGS__)
+#define vsoxseg4ei8_v_u16m2(...) __riscv_vsoxseg4ei8_v_u16m2(__VA_ARGS__)
+#define vsoxseg2ei8_v_u16m4(...) __riscv_vsoxseg2ei8_v_u16m4(__VA_ARGS__)
+#define vsoxseg2ei16_v_u16mf4(...) __riscv_vsoxseg2ei16_v_u16mf4(__VA_ARGS__)
+#define vsoxseg3ei16_v_u16mf4(...) __riscv_vsoxseg3ei16_v_u16mf4(__VA_ARGS__)
+#define vsoxseg4ei16_v_u16mf4(...) __riscv_vsoxseg4ei16_v_u16mf4(__VA_ARGS__)
+#define vsoxseg5ei16_v_u16mf4(...) __riscv_vsoxseg5ei16_v_u16mf4(__VA_ARGS__)
+#define vsoxseg6ei16_v_u16mf4(...) __riscv_vsoxseg6ei16_v_u16mf4(__VA_ARGS__)
+#define vsoxseg7ei16_v_u16mf4(...) __riscv_vsoxseg7ei16_v_u16mf4(__VA_ARGS__)
+#define vsoxseg8ei16_v_u16mf4(...) __riscv_vsoxseg8ei16_v_u16mf4(__VA_ARGS__)
+#define vsoxseg2ei16_v_u16mf2(...) __riscv_vsoxseg2ei16_v_u16mf2(__VA_ARGS__)
+#define vsoxseg3ei16_v_u16mf2(...) __riscv_vsoxseg3ei16_v_u16mf2(__VA_ARGS__)
+#define vsoxseg4ei16_v_u16mf2(...) __riscv_vsoxseg4ei16_v_u16mf2(__VA_ARGS__)
+#define vsoxseg5ei16_v_u16mf2(...) __riscv_vsoxseg5ei16_v_u16mf2(__VA_ARGS__)
+#define vsoxseg6ei16_v_u16mf2(...) __riscv_vsoxseg6ei16_v_u16mf2(__VA_ARGS__)
+#define vsoxseg7ei16_v_u16mf2(...) __riscv_vsoxseg7ei16_v_u16mf2(__VA_ARGS__)
+#define vsoxseg8ei16_v_u16mf2(...) __riscv_vsoxseg8ei16_v_u16mf2(__VA_ARGS__)
+#define vsoxseg2ei16_v_u16m1(...) __riscv_vsoxseg2ei16_v_u16m1(__VA_ARGS__)
+#define vsoxseg3ei16_v_u16m1(...) __riscv_vsoxseg3ei16_v_u16m1(__VA_ARGS__)
+#define vsoxseg4ei16_v_u16m1(...) __riscv_vsoxseg4ei16_v_u16m1(__VA_ARGS__)
+#define vsoxseg5ei16_v_u16m1(...) __riscv_vsoxseg5ei16_v_u16m1(__VA_ARGS__)
+#define vsoxseg6ei16_v_u16m1(...) __riscv_vsoxseg6ei16_v_u16m1(__VA_ARGS__)
+#define vsoxseg7ei16_v_u16m1(...) __riscv_vsoxseg7ei16_v_u16m1(__VA_ARGS__)
+#define vsoxseg8ei16_v_u16m1(...) __riscv_vsoxseg8ei16_v_u16m1(__VA_ARGS__)
+#define vsoxseg2ei16_v_u16m2(...) __riscv_vsoxseg2ei16_v_u16m2(__VA_ARGS__)
+#define vsoxseg3ei16_v_u16m2(...) __riscv_vsoxseg3ei16_v_u16m2(__VA_ARGS__)
+#define vsoxseg4ei16_v_u16m2(...) __riscv_vsoxseg4ei16_v_u16m2(__VA_ARGS__)
+#define vsoxseg2ei16_v_u16m4(...) __riscv_vsoxseg2ei16_v_u16m4(__VA_ARGS__)
+#define vsoxseg2ei32_v_u16mf4(...) __riscv_vsoxseg2ei32_v_u16mf4(__VA_ARGS__)
+#define vsoxseg3ei32_v_u16mf4(...) __riscv_vsoxseg3ei32_v_u16mf4(__VA_ARGS__)
+#define vsoxseg4ei32_v_u16mf4(...) __riscv_vsoxseg4ei32_v_u16mf4(__VA_ARGS__)
+#define vsoxseg5ei32_v_u16mf4(...) __riscv_vsoxseg5ei32_v_u16mf4(__VA_ARGS__)
+#define vsoxseg6ei32_v_u16mf4(...) __riscv_vsoxseg6ei32_v_u16mf4(__VA_ARGS__)
+#define vsoxseg7ei32_v_u16mf4(...) __riscv_vsoxseg7ei32_v_u16mf4(__VA_ARGS__)
+#define vsoxseg8ei32_v_u16mf4(...) __riscv_vsoxseg8ei32_v_u16mf4(__VA_ARGS__)
+#define vsoxseg2ei32_v_u16mf2(...) __riscv_vsoxseg2ei32_v_u16mf2(__VA_ARGS__)
+#define vsoxseg3ei32_v_u16mf2(...) __riscv_vsoxseg3ei32_v_u16mf2(__VA_ARGS__)
+#define vsoxseg4ei32_v_u16mf2(...) __riscv_vsoxseg4ei32_v_u16mf2(__VA_ARGS__)
+#define vsoxseg5ei32_v_u16mf2(...) __riscv_vsoxseg5ei32_v_u16mf2(__VA_ARGS__)
+#define vsoxseg6ei32_v_u16mf2(...) __riscv_vsoxseg6ei32_v_u16mf2(__VA_ARGS__)
+#define vsoxseg7ei32_v_u16mf2(...) __riscv_vsoxseg7ei32_v_u16mf2(__VA_ARGS__)
+#define vsoxseg8ei32_v_u16mf2(...) __riscv_vsoxseg8ei32_v_u16mf2(__VA_ARGS__)
+#define vsoxseg2ei32_v_u16m1(...) __riscv_vsoxseg2ei32_v_u16m1(__VA_ARGS__)
+#define vsoxseg3ei32_v_u16m1(...) __riscv_vsoxseg3ei32_v_u16m1(__VA_ARGS__)
+#define vsoxseg4ei32_v_u16m1(...) __riscv_vsoxseg4ei32_v_u16m1(__VA_ARGS__)
+#define vsoxseg5ei32_v_u16m1(...) __riscv_vsoxseg5ei32_v_u16m1(__VA_ARGS__)
+#define vsoxseg6ei32_v_u16m1(...) __riscv_vsoxseg6ei32_v_u16m1(__VA_ARGS__)
+#define vsoxseg7ei32_v_u16m1(...) __riscv_vsoxseg7ei32_v_u16m1(__VA_ARGS__)
+#define vsoxseg8ei32_v_u16m1(...) __riscv_vsoxseg8ei32_v_u16m1(__VA_ARGS__)
+#define vsoxseg2ei32_v_u16m2(...) __riscv_vsoxseg2ei32_v_u16m2(__VA_ARGS__)
+#define vsoxseg3ei32_v_u16m2(...) __riscv_vsoxseg3ei32_v_u16m2(__VA_ARGS__)
+#define vsoxseg4ei32_v_u16m2(...) __riscv_vsoxseg4ei32_v_u16m2(__VA_ARGS__)
+#define vsoxseg2ei32_v_u16m4(...) __riscv_vsoxseg2ei32_v_u16m4(__VA_ARGS__)
+#define vsoxseg2ei64_v_u16mf4(...) __riscv_vsoxseg2ei64_v_u16mf4(__VA_ARGS__)
+#define vsoxseg3ei64_v_u16mf4(...) __riscv_vsoxseg3ei64_v_u16mf4(__VA_ARGS__)
+#define vsoxseg4ei64_v_u16mf4(...) __riscv_vsoxseg4ei64_v_u16mf4(__VA_ARGS__)
+#define vsoxseg5ei64_v_u16mf4(...) __riscv_vsoxseg5ei64_v_u16mf4(__VA_ARGS__)
+#define vsoxseg6ei64_v_u16mf4(...) __riscv_vsoxseg6ei64_v_u16mf4(__VA_ARGS__)
+#define vsoxseg7ei64_v_u16mf4(...) __riscv_vsoxseg7ei64_v_u16mf4(__VA_ARGS__)
+#define vsoxseg8ei64_v_u16mf4(...) __riscv_vsoxseg8ei64_v_u16mf4(__VA_ARGS__)
+#define vsoxseg2ei64_v_u16mf2(...) __riscv_vsoxseg2ei64_v_u16mf2(__VA_ARGS__)
+#define vsoxseg3ei64_v_u16mf2(...) __riscv_vsoxseg3ei64_v_u16mf2(__VA_ARGS__)
+#define vsoxseg4ei64_v_u16mf2(...) __riscv_vsoxseg4ei64_v_u16mf2(__VA_ARGS__)
+#define vsoxseg5ei64_v_u16mf2(...) __riscv_vsoxseg5ei64_v_u16mf2(__VA_ARGS__)
+#define vsoxseg6ei64_v_u16mf2(...) __riscv_vsoxseg6ei64_v_u16mf2(__VA_ARGS__)
+#define vsoxseg7ei64_v_u16mf2(...) __riscv_vsoxseg7ei64_v_u16mf2(__VA_ARGS__)
+#define vsoxseg8ei64_v_u16mf2(...) __riscv_vsoxseg8ei64_v_u16mf2(__VA_ARGS__)
+#define vsoxseg2ei64_v_u16m1(...) __riscv_vsoxseg2ei64_v_u16m1(__VA_ARGS__)
+#define vsoxseg3ei64_v_u16m1(...) __riscv_vsoxseg3ei64_v_u16m1(__VA_ARGS__)
+#define vsoxseg4ei64_v_u16m1(...) __riscv_vsoxseg4ei64_v_u16m1(__VA_ARGS__)
+#define vsoxseg5ei64_v_u16m1(...) __riscv_vsoxseg5ei64_v_u16m1(__VA_ARGS__)
+#define vsoxseg6ei64_v_u16m1(...) __riscv_vsoxseg6ei64_v_u16m1(__VA_ARGS__)
+#define vsoxseg7ei64_v_u16m1(...) __riscv_vsoxseg7ei64_v_u16m1(__VA_ARGS__)
+#define vsoxseg8ei64_v_u16m1(...) __riscv_vsoxseg8ei64_v_u16m1(__VA_ARGS__)
+#define vsoxseg2ei64_v_u16m2(...) __riscv_vsoxseg2ei64_v_u16m2(__VA_ARGS__)
+#define vsoxseg3ei64_v_u16m2(...) __riscv_vsoxseg3ei64_v_u16m2(__VA_ARGS__)
+#define vsoxseg4ei64_v_u16m2(...) __riscv_vsoxseg4ei64_v_u16m2(__VA_ARGS__)
+#define vsoxseg2ei8_v_u32mf2(...) __riscv_vsoxseg2ei8_v_u32mf2(__VA_ARGS__)
+#define vsoxseg3ei8_v_u32mf2(...) __riscv_vsoxseg3ei8_v_u32mf2(__VA_ARGS__)
+#define vsoxseg4ei8_v_u32mf2(...) __riscv_vsoxseg4ei8_v_u32mf2(__VA_ARGS__)
+#define vsoxseg5ei8_v_u32mf2(...) __riscv_vsoxseg5ei8_v_u32mf2(__VA_ARGS__)
+#define vsoxseg6ei8_v_u32mf2(...) __riscv_vsoxseg6ei8_v_u32mf2(__VA_ARGS__)
+#define vsoxseg7ei8_v_u32mf2(...) __riscv_vsoxseg7ei8_v_u32mf2(__VA_ARGS__)
+#define vsoxseg8ei8_v_u32mf2(...) __riscv_vsoxseg8ei8_v_u32mf2(__VA_ARGS__)
+#define vsoxseg2ei8_v_u32m1(...) __riscv_vsoxseg2ei8_v_u32m1(__VA_ARGS__)
+#define vsoxseg3ei8_v_u32m1(...) __riscv_vsoxseg3ei8_v_u32m1(__VA_ARGS__)
+#define vsoxseg4ei8_v_u32m1(...) __riscv_vsoxseg4ei8_v_u32m1(__VA_ARGS__)
+#define vsoxseg5ei8_v_u32m1(...) __riscv_vsoxseg5ei8_v_u32m1(__VA_ARGS__)
+#define vsoxseg6ei8_v_u32m1(...) __riscv_vsoxseg6ei8_v_u32m1(__VA_ARGS__)
+#define vsoxseg7ei8_v_u32m1(...) __riscv_vsoxseg7ei8_v_u32m1(__VA_ARGS__)
+#define vsoxseg8ei8_v_u32m1(...) __riscv_vsoxseg8ei8_v_u32m1(__VA_ARGS__)
+#define vsoxseg2ei8_v_u32m2(...) __riscv_vsoxseg2ei8_v_u32m2(__VA_ARGS__)
+#define vsoxseg3ei8_v_u32m2(...) __riscv_vsoxseg3ei8_v_u32m2(__VA_ARGS__)
+#define vsoxseg4ei8_v_u32m2(...) __riscv_vsoxseg4ei8_v_u32m2(__VA_ARGS__)
+#define vsoxseg2ei8_v_u32m4(...) __riscv_vsoxseg2ei8_v_u32m4(__VA_ARGS__)
+#define vsoxseg2ei16_v_u32mf2(...) __riscv_vsoxseg2ei16_v_u32mf2(__VA_ARGS__)
+#define vsoxseg3ei16_v_u32mf2(...) __riscv_vsoxseg3ei16_v_u32mf2(__VA_ARGS__)
+#define vsoxseg4ei16_v_u32mf2(...) __riscv_vsoxseg4ei16_v_u32mf2(__VA_ARGS__)
+#define vsoxseg5ei16_v_u32mf2(...) __riscv_vsoxseg5ei16_v_u32mf2(__VA_ARGS__)
+#define vsoxseg6ei16_v_u32mf2(...) __riscv_vsoxseg6ei16_v_u32mf2(__VA_ARGS__)
+#define vsoxseg7ei16_v_u32mf2(...) __riscv_vsoxseg7ei16_v_u32mf2(__VA_ARGS__)
+#define vsoxseg8ei16_v_u32mf2(...) __riscv_vsoxseg8ei16_v_u32mf2(__VA_ARGS__)
+#define vsoxseg2ei16_v_u32m1(...) __riscv_vsoxseg2ei16_v_u32m1(__VA_ARGS__)
+#define vsoxseg3ei16_v_u32m1(...) __riscv_vsoxseg3ei16_v_u32m1(__VA_ARGS__)
+#define vsoxseg4ei16_v_u32m1(...) __riscv_vsoxseg4ei16_v_u32m1(__VA_ARGS__)
+#define vsoxseg5ei16_v_u32m1(...) __riscv_vsoxseg5ei16_v_u32m1(__VA_ARGS__)
+#define vsoxseg6ei16_v_u32m1(...) __riscv_vsoxseg6ei16_v_u32m1(__VA_ARGS__)
+#define vsoxseg7ei16_v_u32m1(...) __riscv_vsoxseg7ei16_v_u32m1(__VA_ARGS__)
+#define vsoxseg8ei16_v_u32m1(...) __riscv_vsoxseg8ei16_v_u32m1(__VA_ARGS__)
+#define vsoxseg2ei16_v_u32m2(...) __riscv_vsoxseg2ei16_v_u32m2(__VA_ARGS__)
+#define vsoxseg3ei16_v_u32m2(...) __riscv_vsoxseg3ei16_v_u32m2(__VA_ARGS__)
+#define vsoxseg4ei16_v_u32m2(...) __riscv_vsoxseg4ei16_v_u32m2(__VA_ARGS__)
+#define vsoxseg2ei16_v_u32m4(...) __riscv_vsoxseg2ei16_v_u32m4(__VA_ARGS__)
+#define vsoxseg2ei32_v_u32mf2(...) __riscv_vsoxseg2ei32_v_u32mf2(__VA_ARGS__)
+#define vsoxseg3ei32_v_u32mf2(...) __riscv_vsoxseg3ei32_v_u32mf2(__VA_ARGS__)
+#define vsoxseg4ei32_v_u32mf2(...) __riscv_vsoxseg4ei32_v_u32mf2(__VA_ARGS__)
+#define vsoxseg5ei32_v_u32mf2(...) __riscv_vsoxseg5ei32_v_u32mf2(__VA_ARGS__)
+#define vsoxseg6ei32_v_u32mf2(...) __riscv_vsoxseg6ei32_v_u32mf2(__VA_ARGS__)
+#define vsoxseg7ei32_v_u32mf2(...) __riscv_vsoxseg7ei32_v_u32mf2(__VA_ARGS__)
+#define vsoxseg8ei32_v_u32mf2(...) __riscv_vsoxseg8ei32_v_u32mf2(__VA_ARGS__)
+#define vsoxseg2ei32_v_u32m1(...) __riscv_vsoxseg2ei32_v_u32m1(__VA_ARGS__)
+#define vsoxseg3ei32_v_u32m1(...) __riscv_vsoxseg3ei32_v_u32m1(__VA_ARGS__)
+#define vsoxseg4ei32_v_u32m1(...) __riscv_vsoxseg4ei32_v_u32m1(__VA_ARGS__)
+#define vsoxseg5ei32_v_u32m1(...) __riscv_vsoxseg5ei32_v_u32m1(__VA_ARGS__)
+#define vsoxseg6ei32_v_u32m1(...) __riscv_vsoxseg6ei32_v_u32m1(__VA_ARGS__)
+#define vsoxseg7ei32_v_u32m1(...) __riscv_vsoxseg7ei32_v_u32m1(__VA_ARGS__)
+#define vsoxseg8ei32_v_u32m1(...) __riscv_vsoxseg8ei32_v_u32m1(__VA_ARGS__)
+#define vsoxseg2ei32_v_u32m2(...) __riscv_vsoxseg2ei32_v_u32m2(__VA_ARGS__)
+#define vsoxseg3ei32_v_u32m2(...) __riscv_vsoxseg3ei32_v_u32m2(__VA_ARGS__)
+#define vsoxseg4ei32_v_u32m2(...) __riscv_vsoxseg4ei32_v_u32m2(__VA_ARGS__)
+#define vsoxseg2ei32_v_u32m4(...) __riscv_vsoxseg2ei32_v_u32m4(__VA_ARGS__)
+#define vsoxseg2ei64_v_u32mf2(...) __riscv_vsoxseg2ei64_v_u32mf2(__VA_ARGS__)
+#define vsoxseg3ei64_v_u32mf2(...) __riscv_vsoxseg3ei64_v_u32mf2(__VA_ARGS__)
+#define vsoxseg4ei64_v_u32mf2(...) __riscv_vsoxseg4ei64_v_u32mf2(__VA_ARGS__)
+#define vsoxseg5ei64_v_u32mf2(...) __riscv_vsoxseg5ei64_v_u32mf2(__VA_ARGS__)
+#define vsoxseg6ei64_v_u32mf2(...) __riscv_vsoxseg6ei64_v_u32mf2(__VA_ARGS__)
+#define vsoxseg7ei64_v_u32mf2(...) __riscv_vsoxseg7ei64_v_u32mf2(__VA_ARGS__)
+#define vsoxseg8ei64_v_u32mf2(...) __riscv_vsoxseg8ei64_v_u32mf2(__VA_ARGS__)
+#define vsoxseg2ei64_v_u32m1(...) __riscv_vsoxseg2ei64_v_u32m1(__VA_ARGS__)
+#define vsoxseg3ei64_v_u32m1(...) __riscv_vsoxseg3ei64_v_u32m1(__VA_ARGS__)
+#define vsoxseg4ei64_v_u32m1(...) __riscv_vsoxseg4ei64_v_u32m1(__VA_ARGS__)
+#define vsoxseg5ei64_v_u32m1(...) __riscv_vsoxseg5ei64_v_u32m1(__VA_ARGS__)
+#define vsoxseg6ei64_v_u32m1(...) __riscv_vsoxseg6ei64_v_u32m1(__VA_ARGS__)
+#define vsoxseg7ei64_v_u32m1(...) __riscv_vsoxseg7ei64_v_u32m1(__VA_ARGS__)
+#define vsoxseg8ei64_v_u32m1(...) __riscv_vsoxseg8ei64_v_u32m1(__VA_ARGS__)
+#define vsoxseg2ei64_v_u32m2(...) __riscv_vsoxseg2ei64_v_u32m2(__VA_ARGS__)
+#define vsoxseg3ei64_v_u32m2(...) __riscv_vsoxseg3ei64_v_u32m2(__VA_ARGS__)
+#define vsoxseg4ei64_v_u32m2(...) __riscv_vsoxseg4ei64_v_u32m2(__VA_ARGS__)
+#define vsoxseg2ei64_v_u32m4(...) __riscv_vsoxseg2ei64_v_u32m4(__VA_ARGS__)
+#define vsoxseg2ei8_v_u64m1(...) __riscv_vsoxseg2ei8_v_u64m1(__VA_ARGS__)
+#define vsoxseg3ei8_v_u64m1(...) __riscv_vsoxseg3ei8_v_u64m1(__VA_ARGS__)
+#define vsoxseg4ei8_v_u64m1(...) __riscv_vsoxseg4ei8_v_u64m1(__VA_ARGS__)
+#define vsoxseg5ei8_v_u64m1(...) __riscv_vsoxseg5ei8_v_u64m1(__VA_ARGS__)
+#define vsoxseg6ei8_v_u64m1(...) __riscv_vsoxseg6ei8_v_u64m1(__VA_ARGS__)
+#define vsoxseg7ei8_v_u64m1(...) __riscv_vsoxseg7ei8_v_u64m1(__VA_ARGS__)
+#define vsoxseg8ei8_v_u64m1(...) __riscv_vsoxseg8ei8_v_u64m1(__VA_ARGS__)
+#define vsoxseg2ei8_v_u64m2(...) __riscv_vsoxseg2ei8_v_u64m2(__VA_ARGS__)
+#define vsoxseg3ei8_v_u64m2(...) __riscv_vsoxseg3ei8_v_u64m2(__VA_ARGS__)
+#define vsoxseg4ei8_v_u64m2(...) __riscv_vsoxseg4ei8_v_u64m2(__VA_ARGS__)
+#define vsoxseg2ei8_v_u64m4(...) __riscv_vsoxseg2ei8_v_u64m4(__VA_ARGS__)
+#define vsoxseg2ei16_v_u64m1(...) __riscv_vsoxseg2ei16_v_u64m1(__VA_ARGS__)
+#define vsoxseg3ei16_v_u64m1(...) __riscv_vsoxseg3ei16_v_u64m1(__VA_ARGS__)
+#define vsoxseg4ei16_v_u64m1(...) __riscv_vsoxseg4ei16_v_u64m1(__VA_ARGS__)
+#define vsoxseg5ei16_v_u64m1(...) __riscv_vsoxseg5ei16_v_u64m1(__VA_ARGS__)
+#define vsoxseg6ei16_v_u64m1(...) __riscv_vsoxseg6ei16_v_u64m1(__VA_ARGS__)
+#define vsoxseg7ei16_v_u64m1(...) __riscv_vsoxseg7ei16_v_u64m1(__VA_ARGS__)
+#define vsoxseg8ei16_v_u64m1(...) __riscv_vsoxseg8ei16_v_u64m1(__VA_ARGS__)
+#define vsoxseg2ei16_v_u64m2(...) __riscv_vsoxseg2ei16_v_u64m2(__VA_ARGS__)
+#define vsoxseg3ei16_v_u64m2(...) __riscv_vsoxseg3ei16_v_u64m2(__VA_ARGS__)
+#define vsoxseg4ei16_v_u64m2(...) __riscv_vsoxseg4ei16_v_u64m2(__VA_ARGS__)
+#define vsoxseg2ei16_v_u64m4(...) __riscv_vsoxseg2ei16_v_u64m4(__VA_ARGS__)
+#define vsoxseg2ei32_v_u64m1(...) __riscv_vsoxseg2ei32_v_u64m1(__VA_ARGS__)
+#define vsoxseg3ei32_v_u64m1(...) __riscv_vsoxseg3ei32_v_u64m1(__VA_ARGS__)
+#define vsoxseg4ei32_v_u64m1(...) __riscv_vsoxseg4ei32_v_u64m1(__VA_ARGS__)
+#define vsoxseg5ei32_v_u64m1(...) __riscv_vsoxseg5ei32_v_u64m1(__VA_ARGS__)
+#define vsoxseg6ei32_v_u64m1(...) __riscv_vsoxseg6ei32_v_u64m1(__VA_ARGS__)
+#define vsoxseg7ei32_v_u64m1(...) __riscv_vsoxseg7ei32_v_u64m1(__VA_ARGS__)
+#define vsoxseg8ei32_v_u64m1(...) __riscv_vsoxseg8ei32_v_u64m1(__VA_ARGS__)
+#define vsoxseg2ei32_v_u64m2(...) __riscv_vsoxseg2ei32_v_u64m2(__VA_ARGS__)
+#define vsoxseg3ei32_v_u64m2(...) __riscv_vsoxseg3ei32_v_u64m2(__VA_ARGS__)
+#define vsoxseg4ei32_v_u64m2(...) __riscv_vsoxseg4ei32_v_u64m2(__VA_ARGS__)
+#define vsoxseg2ei32_v_u64m4(...) __riscv_vsoxseg2ei32_v_u64m4(__VA_ARGS__)
+#define vsoxseg2ei64_v_u64m1(...) __riscv_vsoxseg2ei64_v_u64m1(__VA_ARGS__)
+#define vsoxseg3ei64_v_u64m1(...) __riscv_vsoxseg3ei64_v_u64m1(__VA_ARGS__)
+#define vsoxseg4ei64_v_u64m1(...) __riscv_vsoxseg4ei64_v_u64m1(__VA_ARGS__)
+#define vsoxseg5ei64_v_u64m1(...) __riscv_vsoxseg5ei64_v_u64m1(__VA_ARGS__)
+#define vsoxseg6ei64_v_u64m1(...) __riscv_vsoxseg6ei64_v_u64m1(__VA_ARGS__)
+#define vsoxseg7ei64_v_u64m1(...) __riscv_vsoxseg7ei64_v_u64m1(__VA_ARGS__)
+#define vsoxseg8ei64_v_u64m1(...) __riscv_vsoxseg8ei64_v_u64m1(__VA_ARGS__)
+#define vsoxseg2ei64_v_u64m2(...) __riscv_vsoxseg2ei64_v_u64m2(__VA_ARGS__)
+#define vsoxseg3ei64_v_u64m2(...) __riscv_vsoxseg3ei64_v_u64m2(__VA_ARGS__)
+#define vsoxseg4ei64_v_u64m2(...) __riscv_vsoxseg4ei64_v_u64m2(__VA_ARGS__)
+#define vsoxseg2ei64_v_u64m4(...) __riscv_vsoxseg2ei64_v_u64m4(__VA_ARGS__)
+#define vsuxseg2ei8_v_u8mf8(...) __riscv_vsuxseg2ei8_v_u8mf8(__VA_ARGS__)
+#define vsuxseg3ei8_v_u8mf8(...) __riscv_vsuxseg3ei8_v_u8mf8(__VA_ARGS__)
+#define vsuxseg4ei8_v_u8mf8(...) __riscv_vsuxseg4ei8_v_u8mf8(__VA_ARGS__)
+#define vsuxseg5ei8_v_u8mf8(...) __riscv_vsuxseg5ei8_v_u8mf8(__VA_ARGS__)
+#define vsuxseg6ei8_v_u8mf8(...) __riscv_vsuxseg6ei8_v_u8mf8(__VA_ARGS__)
+#define vsuxseg7ei8_v_u8mf8(...) __riscv_vsuxseg7ei8_v_u8mf8(__VA_ARGS__)
+#define vsuxseg8ei8_v_u8mf8(...) __riscv_vsuxseg8ei8_v_u8mf8(__VA_ARGS__)
+#define vsuxseg2ei8_v_u8mf4(...) __riscv_vsuxseg2ei8_v_u8mf4(__VA_ARGS__)
+#define vsuxseg3ei8_v_u8mf4(...) __riscv_vsuxseg3ei8_v_u8mf4(__VA_ARGS__)
+#define vsuxseg4ei8_v_u8mf4(...) __riscv_vsuxseg4ei8_v_u8mf4(__VA_ARGS__)
+#define vsuxseg5ei8_v_u8mf4(...) __riscv_vsuxseg5ei8_v_u8mf4(__VA_ARGS__)
+#define vsuxseg6ei8_v_u8mf4(...) __riscv_vsuxseg6ei8_v_u8mf4(__VA_ARGS__)
+#define vsuxseg7ei8_v_u8mf4(...) __riscv_vsuxseg7ei8_v_u8mf4(__VA_ARGS__)
+#define vsuxseg8ei8_v_u8mf4(...) __riscv_vsuxseg8ei8_v_u8mf4(__VA_ARGS__)
+#define vsuxseg2ei8_v_u8mf2(...) __riscv_vsuxseg2ei8_v_u8mf2(__VA_ARGS__)
+#define vsuxseg3ei8_v_u8mf2(...) __riscv_vsuxseg3ei8_v_u8mf2(__VA_ARGS__)
+#define vsuxseg4ei8_v_u8mf2(...) __riscv_vsuxseg4ei8_v_u8mf2(__VA_ARGS__)
+#define vsuxseg5ei8_v_u8mf2(...) __riscv_vsuxseg5ei8_v_u8mf2(__VA_ARGS__)
+#define vsuxseg6ei8_v_u8mf2(...) __riscv_vsuxseg6ei8_v_u8mf2(__VA_ARGS__)
+#define vsuxseg7ei8_v_u8mf2(...) __riscv_vsuxseg7ei8_v_u8mf2(__VA_ARGS__)
+#define vsuxseg8ei8_v_u8mf2(...) __riscv_vsuxseg8ei8_v_u8mf2(__VA_ARGS__)
+#define vsuxseg2ei8_v_u8m1(...) __riscv_vsuxseg2ei8_v_u8m1(__VA_ARGS__)
+#define vsuxseg3ei8_v_u8m1(...) __riscv_vsuxseg3ei8_v_u8m1(__VA_ARGS__)
+#define vsuxseg4ei8_v_u8m1(...) __riscv_vsuxseg4ei8_v_u8m1(__VA_ARGS__)
+#define vsuxseg5ei8_v_u8m1(...) __riscv_vsuxseg5ei8_v_u8m1(__VA_ARGS__)
+#define vsuxseg6ei8_v_u8m1(...) __riscv_vsuxseg6ei8_v_u8m1(__VA_ARGS__)
+#define vsuxseg7ei8_v_u8m1(...) __riscv_vsuxseg7ei8_v_u8m1(__VA_ARGS__)
+#define vsuxseg8ei8_v_u8m1(...) __riscv_vsuxseg8ei8_v_u8m1(__VA_ARGS__)
+#define vsuxseg2ei8_v_u8m2(...) __riscv_vsuxseg2ei8_v_u8m2(__VA_ARGS__)
+#define vsuxseg3ei8_v_u8m2(...) __riscv_vsuxseg3ei8_v_u8m2(__VA_ARGS__)
+#define vsuxseg4ei8_v_u8m2(...) __riscv_vsuxseg4ei8_v_u8m2(__VA_ARGS__)
+#define vsuxseg2ei8_v_u8m4(...) __riscv_vsuxseg2ei8_v_u8m4(__VA_ARGS__)
+#define vsuxseg2ei16_v_u8mf8(...) __riscv_vsuxseg2ei16_v_u8mf8(__VA_ARGS__)
+#define vsuxseg3ei16_v_u8mf8(...) __riscv_vsuxseg3ei16_v_u8mf8(__VA_ARGS__)
+#define vsuxseg4ei16_v_u8mf8(...) __riscv_vsuxseg4ei16_v_u8mf8(__VA_ARGS__)
+#define vsuxseg5ei16_v_u8mf8(...) __riscv_vsuxseg5ei16_v_u8mf8(__VA_ARGS__)
+#define vsuxseg6ei16_v_u8mf8(...) __riscv_vsuxseg6ei16_v_u8mf8(__VA_ARGS__)
+#define vsuxseg7ei16_v_u8mf8(...) __riscv_vsuxseg7ei16_v_u8mf8(__VA_ARGS__)
+#define vsuxseg8ei16_v_u8mf8(...) __riscv_vsuxseg8ei16_v_u8mf8(__VA_ARGS__)
+#define vsuxseg2ei16_v_u8mf4(...) __riscv_vsuxseg2ei16_v_u8mf4(__VA_ARGS__)
+#define vsuxseg3ei16_v_u8mf4(...) __riscv_vsuxseg3ei16_v_u8mf4(__VA_ARGS__)
+#define vsuxseg4ei16_v_u8mf4(...) __riscv_vsuxseg4ei16_v_u8mf4(__VA_ARGS__)
+#define vsuxseg5ei16_v_u8mf4(...) __riscv_vsuxseg5ei16_v_u8mf4(__VA_ARGS__)
+#define vsuxseg6ei16_v_u8mf4(...) __riscv_vsuxseg6ei16_v_u8mf4(__VA_ARGS__)
+#define vsuxseg7ei16_v_u8mf4(...) __riscv_vsuxseg7ei16_v_u8mf4(__VA_ARGS__)
+#define vsuxseg8ei16_v_u8mf4(...) __riscv_vsuxseg8ei16_v_u8mf4(__VA_ARGS__)
+#define vsuxseg2ei16_v_u8mf2(...) __riscv_vsuxseg2ei16_v_u8mf2(__VA_ARGS__)
+#define vsuxseg3ei16_v_u8mf2(...) __riscv_vsuxseg3ei16_v_u8mf2(__VA_ARGS__)
+#define vsuxseg4ei16_v_u8mf2(...) __riscv_vsuxseg4ei16_v_u8mf2(__VA_ARGS__)
+#define vsuxseg5ei16_v_u8mf2(...) __riscv_vsuxseg5ei16_v_u8mf2(__VA_ARGS__)
+#define vsuxseg6ei16_v_u8mf2(...) __riscv_vsuxseg6ei16_v_u8mf2(__VA_ARGS__)
+#define vsuxseg7ei16_v_u8mf2(...) __riscv_vsuxseg7ei16_v_u8mf2(__VA_ARGS__)
+#define vsuxseg8ei16_v_u8mf2(...) __riscv_vsuxseg8ei16_v_u8mf2(__VA_ARGS__)
+#define vsuxseg2ei16_v_u8m1(...) __riscv_vsuxseg2ei16_v_u8m1(__VA_ARGS__)
+#define vsuxseg3ei16_v_u8m1(...) __riscv_vsuxseg3ei16_v_u8m1(__VA_ARGS__)
+#define vsuxseg4ei16_v_u8m1(...) __riscv_vsuxseg4ei16_v_u8m1(__VA_ARGS__)
+#define vsuxseg5ei16_v_u8m1(...) __riscv_vsuxseg5ei16_v_u8m1(__VA_ARGS__)
+#define vsuxseg6ei16_v_u8m1(...) __riscv_vsuxseg6ei16_v_u8m1(__VA_ARGS__)
+#define vsuxseg7ei16_v_u8m1(...) __riscv_vsuxseg7ei16_v_u8m1(__VA_ARGS__)
+#define vsuxseg8ei16_v_u8m1(...) __riscv_vsuxseg8ei16_v_u8m1(__VA_ARGS__)
+#define vsuxseg2ei16_v_u8m2(...) __riscv_vsuxseg2ei16_v_u8m2(__VA_ARGS__)
+#define vsuxseg3ei16_v_u8m2(...) __riscv_vsuxseg3ei16_v_u8m2(__VA_ARGS__)
+#define vsuxseg4ei16_v_u8m2(...) __riscv_vsuxseg4ei16_v_u8m2(__VA_ARGS__)
+#define vsuxseg2ei16_v_u8m4(...) __riscv_vsuxseg2ei16_v_u8m4(__VA_ARGS__)
+#define vsuxseg2ei32_v_u8mf8(...) __riscv_vsuxseg2ei32_v_u8mf8(__VA_ARGS__)
+#define vsuxseg3ei32_v_u8mf8(...) __riscv_vsuxseg3ei32_v_u8mf8(__VA_ARGS__)
+#define vsuxseg4ei32_v_u8mf8(...) __riscv_vsuxseg4ei32_v_u8mf8(__VA_ARGS__)
+#define vsuxseg5ei32_v_u8mf8(...) __riscv_vsuxseg5ei32_v_u8mf8(__VA_ARGS__)
+#define vsuxseg6ei32_v_u8mf8(...) __riscv_vsuxseg6ei32_v_u8mf8(__VA_ARGS__)
+#define vsuxseg7ei32_v_u8mf8(...) __riscv_vsuxseg7ei32_v_u8mf8(__VA_ARGS__)
+#define vsuxseg8ei32_v_u8mf8(...) __riscv_vsuxseg8ei32_v_u8mf8(__VA_ARGS__)
+#define vsuxseg2ei32_v_u8mf4(...) __riscv_vsuxseg2ei32_v_u8mf4(__VA_ARGS__)
+#define vsuxseg3ei32_v_u8mf4(...) __riscv_vsuxseg3ei32_v_u8mf4(__VA_ARGS__)
+#define vsuxseg4ei32_v_u8mf4(...) __riscv_vsuxseg4ei32_v_u8mf4(__VA_ARGS__)
+#define vsuxseg5ei32_v_u8mf4(...) __riscv_vsuxseg5ei32_v_u8mf4(__VA_ARGS__)
+#define vsuxseg6ei32_v_u8mf4(...) __riscv_vsuxseg6ei32_v_u8mf4(__VA_ARGS__)
+#define vsuxseg7ei32_v_u8mf4(...) __riscv_vsuxseg7ei32_v_u8mf4(__VA_ARGS__)
+#define vsuxseg8ei32_v_u8mf4(...) __riscv_vsuxseg8ei32_v_u8mf4(__VA_ARGS__)
+#define vsuxseg2ei32_v_u8mf2(...) __riscv_vsuxseg2ei32_v_u8mf2(__VA_ARGS__)
+#define vsuxseg3ei32_v_u8mf2(...) __riscv_vsuxseg3ei32_v_u8mf2(__VA_ARGS__)
+#define vsuxseg4ei32_v_u8mf2(...) __riscv_vsuxseg4ei32_v_u8mf2(__VA_ARGS__)
+#define vsuxseg5ei32_v_u8mf2(...) __riscv_vsuxseg5ei32_v_u8mf2(__VA_ARGS__)
+#define vsuxseg6ei32_v_u8mf2(...) __riscv_vsuxseg6ei32_v_u8mf2(__VA_ARGS__)
+#define vsuxseg7ei32_v_u8mf2(...) __riscv_vsuxseg7ei32_v_u8mf2(__VA_ARGS__)
+#define vsuxseg8ei32_v_u8mf2(...) __riscv_vsuxseg8ei32_v_u8mf2(__VA_ARGS__)
+#define vsuxseg2ei32_v_u8m1(...) __riscv_vsuxseg2ei32_v_u8m1(__VA_ARGS__)
+#define vsuxseg3ei32_v_u8m1(...) __riscv_vsuxseg3ei32_v_u8m1(__VA_ARGS__)
+#define vsuxseg4ei32_v_u8m1(...) __riscv_vsuxseg4ei32_v_u8m1(__VA_ARGS__)
+#define vsuxseg5ei32_v_u8m1(...) __riscv_vsuxseg5ei32_v_u8m1(__VA_ARGS__)
+#define vsuxseg6ei32_v_u8m1(...) __riscv_vsuxseg6ei32_v_u8m1(__VA_ARGS__)
+#define vsuxseg7ei32_v_u8m1(...) __riscv_vsuxseg7ei32_v_u8m1(__VA_ARGS__)
+#define vsuxseg8ei32_v_u8m1(...) __riscv_vsuxseg8ei32_v_u8m1(__VA_ARGS__)
+#define vsuxseg2ei32_v_u8m2(...) __riscv_vsuxseg2ei32_v_u8m2(__VA_ARGS__)
+#define vsuxseg3ei32_v_u8m2(...) __riscv_vsuxseg3ei32_v_u8m2(__VA_ARGS__)
+#define vsuxseg4ei32_v_u8m2(...) __riscv_vsuxseg4ei32_v_u8m2(__VA_ARGS__)
+#define vsuxseg2ei64_v_u8mf8(...) __riscv_vsuxseg2ei64_v_u8mf8(__VA_ARGS__)
+#define vsuxseg3ei64_v_u8mf8(...) __riscv_vsuxseg3ei64_v_u8mf8(__VA_ARGS__)
+#define vsuxseg4ei64_v_u8mf8(...) __riscv_vsuxseg4ei64_v_u8mf8(__VA_ARGS__)
+#define vsuxseg5ei64_v_u8mf8(...) __riscv_vsuxseg5ei64_v_u8mf8(__VA_ARGS__)
+#define vsuxseg6ei64_v_u8mf8(...) __riscv_vsuxseg6ei64_v_u8mf8(__VA_ARGS__)
+#define vsuxseg7ei64_v_u8mf8(...) __riscv_vsuxseg7ei64_v_u8mf8(__VA_ARGS__)
+#define vsuxseg8ei64_v_u8mf8(...) __riscv_vsuxseg8ei64_v_u8mf8(__VA_ARGS__)
+#define vsuxseg2ei64_v_u8mf4(...) __riscv_vsuxseg2ei64_v_u8mf4(__VA_ARGS__)
+#define vsuxseg3ei64_v_u8mf4(...) __riscv_vsuxseg3ei64_v_u8mf4(__VA_ARGS__)
+#define vsuxseg4ei64_v_u8mf4(...) __riscv_vsuxseg4ei64_v_u8mf4(__VA_ARGS__)
+#define vsuxseg5ei64_v_u8mf4(...) __riscv_vsuxseg5ei64_v_u8mf4(__VA_ARGS__)
+#define vsuxseg6ei64_v_u8mf4(...) __riscv_vsuxseg6ei64_v_u8mf4(__VA_ARGS__)
+#define vsuxseg7ei64_v_u8mf4(...) __riscv_vsuxseg7ei64_v_u8mf4(__VA_ARGS__)
+#define vsuxseg8ei64_v_u8mf4(...) __riscv_vsuxseg8ei64_v_u8mf4(__VA_ARGS__)
+#define vsuxseg2ei64_v_u8mf2(...) __riscv_vsuxseg2ei64_v_u8mf2(__VA_ARGS__)
+#define vsuxseg3ei64_v_u8mf2(...) __riscv_vsuxseg3ei64_v_u8mf2(__VA_ARGS__)
+#define vsuxseg4ei64_v_u8mf2(...) __riscv_vsuxseg4ei64_v_u8mf2(__VA_ARGS__)
+#define vsuxseg5ei64_v_u8mf2(...) __riscv_vsuxseg5ei64_v_u8mf2(__VA_ARGS__)
+#define vsuxseg6ei64_v_u8mf2(...) __riscv_vsuxseg6ei64_v_u8mf2(__VA_ARGS__)
+#define vsuxseg7ei64_v_u8mf2(...) __riscv_vsuxseg7ei64_v_u8mf2(__VA_ARGS__)
+#define vsuxseg8ei64_v_u8mf2(...) __riscv_vsuxseg8ei64_v_u8mf2(__VA_ARGS__)
+#define vsuxseg2ei64_v_u8m1(...) __riscv_vsuxseg2ei64_v_u8m1(__VA_ARGS__)
+#define vsuxseg3ei64_v_u8m1(...) __riscv_vsuxseg3ei64_v_u8m1(__VA_ARGS__)
+#define vsuxseg4ei64_v_u8m1(...) __riscv_vsuxseg4ei64_v_u8m1(__VA_ARGS__)
+#define vsuxseg5ei64_v_u8m1(...) __riscv_vsuxseg5ei64_v_u8m1(__VA_ARGS__)
+#define vsuxseg6ei64_v_u8m1(...) __riscv_vsuxseg6ei64_v_u8m1(__VA_ARGS__)
+#define vsuxseg7ei64_v_u8m1(...) __riscv_vsuxseg7ei64_v_u8m1(__VA_ARGS__)
+#define vsuxseg8ei64_v_u8m1(...) __riscv_vsuxseg8ei64_v_u8m1(__VA_ARGS__)
+#define vsuxseg2ei8_v_u16mf4(...) __riscv_vsuxseg2ei8_v_u16mf4(__VA_ARGS__)
+#define vsuxseg3ei8_v_u16mf4(...) __riscv_vsuxseg3ei8_v_u16mf4(__VA_ARGS__)
+#define vsuxseg4ei8_v_u16mf4(...) __riscv_vsuxseg4ei8_v_u16mf4(__VA_ARGS__)
+#define vsuxseg5ei8_v_u16mf4(...) __riscv_vsuxseg5ei8_v_u16mf4(__VA_ARGS__)
+#define vsuxseg6ei8_v_u16mf4(...) __riscv_vsuxseg6ei8_v_u16mf4(__VA_ARGS__)
+#define vsuxseg7ei8_v_u16mf4(...) __riscv_vsuxseg7ei8_v_u16mf4(__VA_ARGS__)
+#define vsuxseg8ei8_v_u16mf4(...) __riscv_vsuxseg8ei8_v_u16mf4(__VA_ARGS__)
+#define vsuxseg2ei8_v_u16mf2(...) __riscv_vsuxseg2ei8_v_u16mf2(__VA_ARGS__)
+#define vsuxseg3ei8_v_u16mf2(...) __riscv_vsuxseg3ei8_v_u16mf2(__VA_ARGS__)
+#define vsuxseg4ei8_v_u16mf2(...) __riscv_vsuxseg4ei8_v_u16mf2(__VA_ARGS__)
+#define vsuxseg5ei8_v_u16mf2(...) __riscv_vsuxseg5ei8_v_u16mf2(__VA_ARGS__)
+#define vsuxseg6ei8_v_u16mf2(...) __riscv_vsuxseg6ei8_v_u16mf2(__VA_ARGS__)
+#define vsuxseg7ei8_v_u16mf2(...) __riscv_vsuxseg7ei8_v_u16mf2(__VA_ARGS__)
+#define vsuxseg8ei8_v_u16mf2(...) __riscv_vsuxseg8ei8_v_u16mf2(__VA_ARGS__)
+#define vsuxseg2ei8_v_u16m1(...) __riscv_vsuxseg2ei8_v_u16m1(__VA_ARGS__)
+#define vsuxseg3ei8_v_u16m1(...) __riscv_vsuxseg3ei8_v_u16m1(__VA_ARGS__)
+#define vsuxseg4ei8_v_u16m1(...) __riscv_vsuxseg4ei8_v_u16m1(__VA_ARGS__)
+#define vsuxseg5ei8_v_u16m1(...) __riscv_vsuxseg5ei8_v_u16m1(__VA_ARGS__)
+#define vsuxseg6ei8_v_u16m1(...) __riscv_vsuxseg6ei8_v_u16m1(__VA_ARGS__)
+#define vsuxseg7ei8_v_u16m1(...) __riscv_vsuxseg7ei8_v_u16m1(__VA_ARGS__)
+#define vsuxseg8ei8_v_u16m1(...) __riscv_vsuxseg8ei8_v_u16m1(__VA_ARGS__)
+#define vsuxseg2ei8_v_u16m2(...) __riscv_vsuxseg2ei8_v_u16m2(__VA_ARGS__)
+#define vsuxseg3ei8_v_u16m2(...) __riscv_vsuxseg3ei8_v_u16m2(__VA_ARGS__)
+#define vsuxseg4ei8_v_u16m2(...) __riscv_vsuxseg4ei8_v_u16m2(__VA_ARGS__)
+#define vsuxseg2ei8_v_u16m4(...) __riscv_vsuxseg2ei8_v_u16m4(__VA_ARGS__)
+#define vsuxseg2ei16_v_u16mf4(...) __riscv_vsuxseg2ei16_v_u16mf4(__VA_ARGS__)
+#define vsuxseg3ei16_v_u16mf4(...) __riscv_vsuxseg3ei16_v_u16mf4(__VA_ARGS__)
+#define vsuxseg4ei16_v_u16mf4(...) __riscv_vsuxseg4ei16_v_u16mf4(__VA_ARGS__)
+#define vsuxseg5ei16_v_u16mf4(...) __riscv_vsuxseg5ei16_v_u16mf4(__VA_ARGS__)
+#define vsuxseg6ei16_v_u16mf4(...) __riscv_vsuxseg6ei16_v_u16mf4(__VA_ARGS__)
+#define vsuxseg7ei16_v_u16mf4(...) __riscv_vsuxseg7ei16_v_u16mf4(__VA_ARGS__)
+#define vsuxseg8ei16_v_u16mf4(...) __riscv_vsuxseg8ei16_v_u16mf4(__VA_ARGS__)
+#define vsuxseg2ei16_v_u16mf2(...) __riscv_vsuxseg2ei16_v_u16mf2(__VA_ARGS__)
+#define vsuxseg3ei16_v_u16mf2(...) __riscv_vsuxseg3ei16_v_u16mf2(__VA_ARGS__)
+#define vsuxseg4ei16_v_u16mf2(...) __riscv_vsuxseg4ei16_v_u16mf2(__VA_ARGS__)
+#define vsuxseg5ei16_v_u16mf2(...) __riscv_vsuxseg5ei16_v_u16mf2(__VA_ARGS__)
+#define vsuxseg6ei16_v_u16mf2(...) __riscv_vsuxseg6ei16_v_u16mf2(__VA_ARGS__)
+#define vsuxseg7ei16_v_u16mf2(...) __riscv_vsuxseg7ei16_v_u16mf2(__VA_ARGS__)
+#define vsuxseg8ei16_v_u16mf2(...) __riscv_vsuxseg8ei16_v_u16mf2(__VA_ARGS__)
+#define vsuxseg2ei16_v_u16m1(...) __riscv_vsuxseg2ei16_v_u16m1(__VA_ARGS__)
+#define vsuxseg3ei16_v_u16m1(...) __riscv_vsuxseg3ei16_v_u16m1(__VA_ARGS__)
+#define vsuxseg4ei16_v_u16m1(...) __riscv_vsuxseg4ei16_v_u16m1(__VA_ARGS__)
+#define vsuxseg5ei16_v_u16m1(...) __riscv_vsuxseg5ei16_v_u16m1(__VA_ARGS__)
+#define vsuxseg6ei16_v_u16m1(...) __riscv_vsuxseg6ei16_v_u16m1(__VA_ARGS__)
+#define vsuxseg7ei16_v_u16m1(...) __riscv_vsuxseg7ei16_v_u16m1(__VA_ARGS__)
+#define vsuxseg8ei16_v_u16m1(...) __riscv_vsuxseg8ei16_v_u16m1(__VA_ARGS__)
+#define vsuxseg2ei16_v_u16m2(...) __riscv_vsuxseg2ei16_v_u16m2(__VA_ARGS__)
+#define vsuxseg3ei16_v_u16m2(...) __riscv_vsuxseg3ei16_v_u16m2(__VA_ARGS__)
+#define vsuxseg4ei16_v_u16m2(...) __riscv_vsuxseg4ei16_v_u16m2(__VA_ARGS__)
+#define vsuxseg2ei16_v_u16m4(...) __riscv_vsuxseg2ei16_v_u16m4(__VA_ARGS__)
+#define vsuxseg2ei32_v_u16mf4(...) __riscv_vsuxseg2ei32_v_u16mf4(__VA_ARGS__)
+#define vsuxseg3ei32_v_u16mf4(...) __riscv_vsuxseg3ei32_v_u16mf4(__VA_ARGS__)
+#define vsuxseg4ei32_v_u16mf4(...) __riscv_vsuxseg4ei32_v_u16mf4(__VA_ARGS__)
+#define vsuxseg5ei32_v_u16mf4(...) __riscv_vsuxseg5ei32_v_u16mf4(__VA_ARGS__)
+#define vsuxseg6ei32_v_u16mf4(...) __riscv_vsuxseg6ei32_v_u16mf4(__VA_ARGS__)
+#define vsuxseg7ei32_v_u16mf4(...) __riscv_vsuxseg7ei32_v_u16mf4(__VA_ARGS__)
+#define vsuxseg8ei32_v_u16mf4(...) __riscv_vsuxseg8ei32_v_u16mf4(__VA_ARGS__)
+#define vsuxseg2ei32_v_u16mf2(...) __riscv_vsuxseg2ei32_v_u16mf2(__VA_ARGS__)
+#define vsuxseg3ei32_v_u16mf2(...) __riscv_vsuxseg3ei32_v_u16mf2(__VA_ARGS__)
+#define vsuxseg4ei32_v_u16mf2(...) __riscv_vsuxseg4ei32_v_u16mf2(__VA_ARGS__)
+#define vsuxseg5ei32_v_u16mf2(...) __riscv_vsuxseg5ei32_v_u16mf2(__VA_ARGS__)
+#define vsuxseg6ei32_v_u16mf2(...) __riscv_vsuxseg6ei32_v_u16mf2(__VA_ARGS__)
+#define vsuxseg7ei32_v_u16mf2(...) __riscv_vsuxseg7ei32_v_u16mf2(__VA_ARGS__)
+#define vsuxseg8ei32_v_u16mf2(...) __riscv_vsuxseg8ei32_v_u16mf2(__VA_ARGS__)
+#define vsuxseg2ei32_v_u16m1(...) __riscv_vsuxseg2ei32_v_u16m1(__VA_ARGS__)
+#define vsuxseg3ei32_v_u16m1(...) __riscv_vsuxseg3ei32_v_u16m1(__VA_ARGS__)
+#define vsuxseg4ei32_v_u16m1(...) __riscv_vsuxseg4ei32_v_u16m1(__VA_ARGS__)
+#define vsuxseg5ei32_v_u16m1(...) __riscv_vsuxseg5ei32_v_u16m1(__VA_ARGS__)
+#define vsuxseg6ei32_v_u16m1(...) __riscv_vsuxseg6ei32_v_u16m1(__VA_ARGS__)
+#define vsuxseg7ei32_v_u16m1(...) __riscv_vsuxseg7ei32_v_u16m1(__VA_ARGS__)
+#define vsuxseg8ei32_v_u16m1(...) __riscv_vsuxseg8ei32_v_u16m1(__VA_ARGS__)
+#define vsuxseg2ei32_v_u16m2(...) __riscv_vsuxseg2ei32_v_u16m2(__VA_ARGS__)
+#define vsuxseg3ei32_v_u16m2(...) __riscv_vsuxseg3ei32_v_u16m2(__VA_ARGS__)
+#define vsuxseg4ei32_v_u16m2(...) __riscv_vsuxseg4ei32_v_u16m2(__VA_ARGS__)
+#define vsuxseg2ei32_v_u16m4(...) __riscv_vsuxseg2ei32_v_u16m4(__VA_ARGS__)
+#define vsuxseg2ei64_v_u16mf4(...) __riscv_vsuxseg2ei64_v_u16mf4(__VA_ARGS__)
+#define vsuxseg3ei64_v_u16mf4(...) __riscv_vsuxseg3ei64_v_u16mf4(__VA_ARGS__)
+#define vsuxseg4ei64_v_u16mf4(...) __riscv_vsuxseg4ei64_v_u16mf4(__VA_ARGS__)
+#define vsuxseg5ei64_v_u16mf4(...) __riscv_vsuxseg5ei64_v_u16mf4(__VA_ARGS__)
+#define vsuxseg6ei64_v_u16mf4(...) __riscv_vsuxseg6ei64_v_u16mf4(__VA_ARGS__)
+#define vsuxseg7ei64_v_u16mf4(...) __riscv_vsuxseg7ei64_v_u16mf4(__VA_ARGS__)
+#define vsuxseg8ei64_v_u16mf4(...) __riscv_vsuxseg8ei64_v_u16mf4(__VA_ARGS__)
+#define vsuxseg2ei64_v_u16mf2(...) __riscv_vsuxseg2ei64_v_u16mf2(__VA_ARGS__)
+#define vsuxseg3ei64_v_u16mf2(...) __riscv_vsuxseg3ei64_v_u16mf2(__VA_ARGS__)
+#define vsuxseg4ei64_v_u16mf2(...) __riscv_vsuxseg4ei64_v_u16mf2(__VA_ARGS__)
+#define vsuxseg5ei64_v_u16mf2(...) __riscv_vsuxseg5ei64_v_u16mf2(__VA_ARGS__)
+#define vsuxseg6ei64_v_u16mf2(...) __riscv_vsuxseg6ei64_v_u16mf2(__VA_ARGS__)
+#define vsuxseg7ei64_v_u16mf2(...) __riscv_vsuxseg7ei64_v_u16mf2(__VA_ARGS__)
+#define vsuxseg8ei64_v_u16mf2(...) __riscv_vsuxseg8ei64_v_u16mf2(__VA_ARGS__)
+#define vsuxseg2ei64_v_u16m1(...) __riscv_vsuxseg2ei64_v_u16m1(__VA_ARGS__)
+#define vsuxseg3ei64_v_u16m1(...) __riscv_vsuxseg3ei64_v_u16m1(__VA_ARGS__)
+#define vsuxseg4ei64_v_u16m1(...) __riscv_vsuxseg4ei64_v_u16m1(__VA_ARGS__)
+#define vsuxseg5ei64_v_u16m1(...) __riscv_vsuxseg5ei64_v_u16m1(__VA_ARGS__)
+#define vsuxseg6ei64_v_u16m1(...) __riscv_vsuxseg6ei64_v_u16m1(__VA_ARGS__)
+#define vsuxseg7ei64_v_u16m1(...) __riscv_vsuxseg7ei64_v_u16m1(__VA_ARGS__)
+#define vsuxseg8ei64_v_u16m1(...) __riscv_vsuxseg8ei64_v_u16m1(__VA_ARGS__)
+#define vsuxseg2ei64_v_u16m2(...) __riscv_vsuxseg2ei64_v_u16m2(__VA_ARGS__)
+#define vsuxseg3ei64_v_u16m2(...) __riscv_vsuxseg3ei64_v_u16m2(__VA_ARGS__)
+#define vsuxseg4ei64_v_u16m2(...) __riscv_vsuxseg4ei64_v_u16m2(__VA_ARGS__)
+#define vsuxseg2ei8_v_u32mf2(...) __riscv_vsuxseg2ei8_v_u32mf2(__VA_ARGS__)
+#define vsuxseg3ei8_v_u32mf2(...) __riscv_vsuxseg3ei8_v_u32mf2(__VA_ARGS__)
+#define vsuxseg4ei8_v_u32mf2(...) __riscv_vsuxseg4ei8_v_u32mf2(__VA_ARGS__)
+#define vsuxseg5ei8_v_u32mf2(...) __riscv_vsuxseg5ei8_v_u32mf2(__VA_ARGS__)
+#define vsuxseg6ei8_v_u32mf2(...) __riscv_vsuxseg6ei8_v_u32mf2(__VA_ARGS__)
+#define vsuxseg7ei8_v_u32mf2(...) __riscv_vsuxseg7ei8_v_u32mf2(__VA_ARGS__)
+#define vsuxseg8ei8_v_u32mf2(...) __riscv_vsuxseg8ei8_v_u32mf2(__VA_ARGS__)
+#define vsuxseg2ei8_v_u32m1(...) __riscv_vsuxseg2ei8_v_u32m1(__VA_ARGS__)
+#define vsuxseg3ei8_v_u32m1(...) __riscv_vsuxseg3ei8_v_u32m1(__VA_ARGS__)
+#define vsuxseg4ei8_v_u32m1(...) __riscv_vsuxseg4ei8_v_u32m1(__VA_ARGS__)
+#define vsuxseg5ei8_v_u32m1(...) __riscv_vsuxseg5ei8_v_u32m1(__VA_ARGS__)
+#define vsuxseg6ei8_v_u32m1(...) __riscv_vsuxseg6ei8_v_u32m1(__VA_ARGS__)
+#define vsuxseg7ei8_v_u32m1(...) __riscv_vsuxseg7ei8_v_u32m1(__VA_ARGS__)
+#define vsuxseg8ei8_v_u32m1(...) __riscv_vsuxseg8ei8_v_u32m1(__VA_ARGS__)
+#define vsuxseg2ei8_v_u32m2(...) __riscv_vsuxseg2ei8_v_u32m2(__VA_ARGS__)
+#define vsuxseg3ei8_v_u32m2(...) __riscv_vsuxseg3ei8_v_u32m2(__VA_ARGS__)
+#define vsuxseg4ei8_v_u32m2(...) __riscv_vsuxseg4ei8_v_u32m2(__VA_ARGS__)
+#define vsuxseg2ei8_v_u32m4(...) __riscv_vsuxseg2ei8_v_u32m4(__VA_ARGS__)
+#define vsuxseg2ei16_v_u32mf2(...) __riscv_vsuxseg2ei16_v_u32mf2(__VA_ARGS__)
+#define vsuxseg3ei16_v_u32mf2(...) __riscv_vsuxseg3ei16_v_u32mf2(__VA_ARGS__)
+#define vsuxseg4ei16_v_u32mf2(...) __riscv_vsuxseg4ei16_v_u32mf2(__VA_ARGS__)
+#define vsuxseg5ei16_v_u32mf2(...) __riscv_vsuxseg5ei16_v_u32mf2(__VA_ARGS__)
+#define vsuxseg6ei16_v_u32mf2(...) __riscv_vsuxseg6ei16_v_u32mf2(__VA_ARGS__)
+#define vsuxseg7ei16_v_u32mf2(...) __riscv_vsuxseg7ei16_v_u32mf2(__VA_ARGS__)
+#define vsuxseg8ei16_v_u32mf2(...) __riscv_vsuxseg8ei16_v_u32mf2(__VA_ARGS__)
+#define vsuxseg2ei16_v_u32m1(...) __riscv_vsuxseg2ei16_v_u32m1(__VA_ARGS__)
+#define vsuxseg3ei16_v_u32m1(...) __riscv_vsuxseg3ei16_v_u32m1(__VA_ARGS__)
+#define vsuxseg4ei16_v_u32m1(...) __riscv_vsuxseg4ei16_v_u32m1(__VA_ARGS__)
+#define vsuxseg5ei16_v_u32m1(...) __riscv_vsuxseg5ei16_v_u32m1(__VA_ARGS__)
+#define vsuxseg6ei16_v_u32m1(...) __riscv_vsuxseg6ei16_v_u32m1(__VA_ARGS__)
+#define vsuxseg7ei16_v_u32m1(...) __riscv_vsuxseg7ei16_v_u32m1(__VA_ARGS__)
+#define vsuxseg8ei16_v_u32m1(...) __riscv_vsuxseg8ei16_v_u32m1(__VA_ARGS__)
+#define vsuxseg2ei16_v_u32m2(...) __riscv_vsuxseg2ei16_v_u32m2(__VA_ARGS__)
+#define vsuxseg3ei16_v_u32m2(...) __riscv_vsuxseg3ei16_v_u32m2(__VA_ARGS__)
+#define vsuxseg4ei16_v_u32m2(...) __riscv_vsuxseg4ei16_v_u32m2(__VA_ARGS__)
+#define vsuxseg2ei16_v_u32m4(...) __riscv_vsuxseg2ei16_v_u32m4(__VA_ARGS__)
+#define vsuxseg2ei32_v_u32mf2(...) __riscv_vsuxseg2ei32_v_u32mf2(__VA_ARGS__)
+#define vsuxseg3ei32_v_u32mf2(...) __riscv_vsuxseg3ei32_v_u32mf2(__VA_ARGS__)
+#define vsuxseg4ei32_v_u32mf2(...) __riscv_vsuxseg4ei32_v_u32mf2(__VA_ARGS__)
+#define vsuxseg5ei32_v_u32mf2(...) __riscv_vsuxseg5ei32_v_u32mf2(__VA_ARGS__)
+#define vsuxseg6ei32_v_u32mf2(...) __riscv_vsuxseg6ei32_v_u32mf2(__VA_ARGS__)
+#define vsuxseg7ei32_v_u32mf2(...) __riscv_vsuxseg7ei32_v_u32mf2(__VA_ARGS__)
+#define vsuxseg8ei32_v_u32mf2(...) __riscv_vsuxseg8ei32_v_u32mf2(__VA_ARGS__)
+#define vsuxseg2ei32_v_u32m1(...) __riscv_vsuxseg2ei32_v_u32m1(__VA_ARGS__)
+#define vsuxseg3ei32_v_u32m1(...) __riscv_vsuxseg3ei32_v_u32m1(__VA_ARGS__)
+#define vsuxseg4ei32_v_u32m1(...) __riscv_vsuxseg4ei32_v_u32m1(__VA_ARGS__)
+#define vsuxseg5ei32_v_u32m1(...) __riscv_vsuxseg5ei32_v_u32m1(__VA_ARGS__)
+#define vsuxseg6ei32_v_u32m1(...) __riscv_vsuxseg6ei32_v_u32m1(__VA_ARGS__)
+#define vsuxseg7ei32_v_u32m1(...) __riscv_vsuxseg7ei32_v_u32m1(__VA_ARGS__)
+#define vsuxseg8ei32_v_u32m1(...) __riscv_vsuxseg8ei32_v_u32m1(__VA_ARGS__)
+#define vsuxseg2ei32_v_u32m2(...) __riscv_vsuxseg2ei32_v_u32m2(__VA_ARGS__)
+#define vsuxseg3ei32_v_u32m2(...) __riscv_vsuxseg3ei32_v_u32m2(__VA_ARGS__)
+#define vsuxseg4ei32_v_u32m2(...) __riscv_vsuxseg4ei32_v_u32m2(__VA_ARGS__)
+#define vsuxseg2ei32_v_u32m4(...) __riscv_vsuxseg2ei32_v_u32m4(__VA_ARGS__)
+#define vsuxseg2ei64_v_u32mf2(...) __riscv_vsuxseg2ei64_v_u32mf2(__VA_ARGS__)
+#define vsuxseg3ei64_v_u32mf2(...) __riscv_vsuxseg3ei64_v_u32mf2(__VA_ARGS__)
+#define vsuxseg4ei64_v_u32mf2(...) __riscv_vsuxseg4ei64_v_u32mf2(__VA_ARGS__)
+#define vsuxseg5ei64_v_u32mf2(...) __riscv_vsuxseg5ei64_v_u32mf2(__VA_ARGS__)
+#define vsuxseg6ei64_v_u32mf2(...) __riscv_vsuxseg6ei64_v_u32mf2(__VA_ARGS__)
+#define vsuxseg7ei64_v_u32mf2(...) __riscv_vsuxseg7ei64_v_u32mf2(__VA_ARGS__)
+#define vsuxseg8ei64_v_u32mf2(...) __riscv_vsuxseg8ei64_v_u32mf2(__VA_ARGS__)
+#define vsuxseg2ei64_v_u32m1(...) __riscv_vsuxseg2ei64_v_u32m1(__VA_ARGS__)
+#define vsuxseg3ei64_v_u32m1(...) __riscv_vsuxseg3ei64_v_u32m1(__VA_ARGS__)
+#define vsuxseg4ei64_v_u32m1(...) __riscv_vsuxseg4ei64_v_u32m1(__VA_ARGS__)
+#define vsuxseg5ei64_v_u32m1(...) __riscv_vsuxseg5ei64_v_u32m1(__VA_ARGS__)
+#define vsuxseg6ei64_v_u32m1(...) __riscv_vsuxseg6ei64_v_u32m1(__VA_ARGS__)
+#define vsuxseg7ei64_v_u32m1(...) __riscv_vsuxseg7ei64_v_u32m1(__VA_ARGS__)
+#define vsuxseg8ei64_v_u32m1(...) __riscv_vsuxseg8ei64_v_u32m1(__VA_ARGS__)
+#define vsuxseg2ei64_v_u32m2(...) __riscv_vsuxseg2ei64_v_u32m2(__VA_ARGS__)
+#define vsuxseg3ei64_v_u32m2(...) __riscv_vsuxseg3ei64_v_u32m2(__VA_ARGS__)
+#define vsuxseg4ei64_v_u32m2(...) __riscv_vsuxseg4ei64_v_u32m2(__VA_ARGS__)
+#define vsuxseg2ei64_v_u32m4(...) __riscv_vsuxseg2ei64_v_u32m4(__VA_ARGS__)
+#define vsuxseg2ei8_v_u64m1(...) __riscv_vsuxseg2ei8_v_u64m1(__VA_ARGS__)
+#define vsuxseg3ei8_v_u64m1(...) __riscv_vsuxseg3ei8_v_u64m1(__VA_ARGS__)
+#define vsuxseg4ei8_v_u64m1(...) __riscv_vsuxseg4ei8_v_u64m1(__VA_ARGS__)
+#define vsuxseg5ei8_v_u64m1(...) __riscv_vsuxseg5ei8_v_u64m1(__VA_ARGS__)
+#define vsuxseg6ei8_v_u64m1(...) __riscv_vsuxseg6ei8_v_u64m1(__VA_ARGS__)
+#define vsuxseg7ei8_v_u64m1(...) __riscv_vsuxseg7ei8_v_u64m1(__VA_ARGS__)
+#define vsuxseg8ei8_v_u64m1(...) __riscv_vsuxseg8ei8_v_u64m1(__VA_ARGS__)
+#define vsuxseg2ei8_v_u64m2(...) __riscv_vsuxseg2ei8_v_u64m2(__VA_ARGS__)
+#define vsuxseg3ei8_v_u64m2(...) __riscv_vsuxseg3ei8_v_u64m2(__VA_ARGS__)
+#define vsuxseg4ei8_v_u64m2(...) __riscv_vsuxseg4ei8_v_u64m2(__VA_ARGS__)
+#define vsuxseg2ei8_v_u64m4(...) __riscv_vsuxseg2ei8_v_u64m4(__VA_ARGS__)
+#define vsuxseg2ei16_v_u64m1(...) __riscv_vsuxseg2ei16_v_u64m1(__VA_ARGS__)
+#define vsuxseg3ei16_v_u64m1(...) __riscv_vsuxseg3ei16_v_u64m1(__VA_ARGS__)
+#define vsuxseg4ei16_v_u64m1(...) __riscv_vsuxseg4ei16_v_u64m1(__VA_ARGS__)
+#define vsuxseg5ei16_v_u64m1(...) __riscv_vsuxseg5ei16_v_u64m1(__VA_ARGS__)
+#define vsuxseg6ei16_v_u64m1(...) __riscv_vsuxseg6ei16_v_u64m1(__VA_ARGS__)
+#define vsuxseg7ei16_v_u64m1(...) __riscv_vsuxseg7ei16_v_u64m1(__VA_ARGS__)
+#define vsuxseg8ei16_v_u64m1(...) __riscv_vsuxseg8ei16_v_u64m1(__VA_ARGS__)
+#define vsuxseg2ei16_v_u64m2(...) __riscv_vsuxseg2ei16_v_u64m2(__VA_ARGS__)
+#define vsuxseg3ei16_v_u64m2(...) __riscv_vsuxseg3ei16_v_u64m2(__VA_ARGS__)
+#define vsuxseg4ei16_v_u64m2(...) __riscv_vsuxseg4ei16_v_u64m2(__VA_ARGS__)
+#define vsuxseg2ei16_v_u64m4(...) __riscv_vsuxseg2ei16_v_u64m4(__VA_ARGS__)
+#define vsuxseg2ei32_v_u64m1(...) __riscv_vsuxseg2ei32_v_u64m1(__VA_ARGS__)
+#define vsuxseg3ei32_v_u64m1(...) __riscv_vsuxseg3ei32_v_u64m1(__VA_ARGS__)
+#define vsuxseg4ei32_v_u64m1(...) __riscv_vsuxseg4ei32_v_u64m1(__VA_ARGS__)
+#define vsuxseg5ei32_v_u64m1(...) __riscv_vsuxseg5ei32_v_u64m1(__VA_ARGS__)
+#define vsuxseg6ei32_v_u64m1(...) __riscv_vsuxseg6ei32_v_u64m1(__VA_ARGS__)
+#define vsuxseg7ei32_v_u64m1(...) __riscv_vsuxseg7ei32_v_u64m1(__VA_ARGS__)
+#define vsuxseg8ei32_v_u64m1(...) __riscv_vsuxseg8ei32_v_u64m1(__VA_ARGS__)
+#define vsuxseg2ei32_v_u64m2(...) __riscv_vsuxseg2ei32_v_u64m2(__VA_ARGS__)
+#define vsuxseg3ei32_v_u64m2(...) __riscv_vsuxseg3ei32_v_u64m2(__VA_ARGS__)
+#define vsuxseg4ei32_v_u64m2(...) __riscv_vsuxseg4ei32_v_u64m2(__VA_ARGS__)
+#define vsuxseg2ei32_v_u64m4(...) __riscv_vsuxseg2ei32_v_u64m4(__VA_ARGS__)
+#define vsuxseg2ei64_v_u64m1(...) __riscv_vsuxseg2ei64_v_u64m1(__VA_ARGS__)
+#define vsuxseg3ei64_v_u64m1(...) __riscv_vsuxseg3ei64_v_u64m1(__VA_ARGS__)
+#define vsuxseg4ei64_v_u64m1(...) __riscv_vsuxseg4ei64_v_u64m1(__VA_ARGS__)
+#define vsuxseg5ei64_v_u64m1(...) __riscv_vsuxseg5ei64_v_u64m1(__VA_ARGS__)
+#define vsuxseg6ei64_v_u64m1(...) __riscv_vsuxseg6ei64_v_u64m1(__VA_ARGS__)
+#define vsuxseg7ei64_v_u64m1(...) __riscv_vsuxseg7ei64_v_u64m1(__VA_ARGS__)
+#define vsuxseg8ei64_v_u64m1(...) __riscv_vsuxseg8ei64_v_u64m1(__VA_ARGS__)
+#define vsuxseg2ei64_v_u64m2(...) __riscv_vsuxseg2ei64_v_u64m2(__VA_ARGS__)
+#define vsuxseg3ei64_v_u64m2(...) __riscv_vsuxseg3ei64_v_u64m2(__VA_ARGS__)
+#define vsuxseg4ei64_v_u64m2(...) __riscv_vsuxseg4ei64_v_u64m2(__VA_ARGS__)
+#define vsuxseg2ei64_v_u64m4(...) __riscv_vsuxseg2ei64_v_u64m4(__VA_ARGS__)
+// masked functions: the _m-suffixed names below forward to their __riscv_-prefixed equivalents
+#define vsoxseg2ei8_v_f16mf4_m(...) __riscv_vsoxseg2ei8_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_f16mf4_m(...) __riscv_vsoxseg3ei8_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_f16mf4_m(...) __riscv_vsoxseg4ei8_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_f16mf4_m(...) __riscv_vsoxseg5ei8_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_f16mf4_m(...) __riscv_vsoxseg6ei8_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_f16mf4_m(...) __riscv_vsoxseg7ei8_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_f16mf4_m(...) __riscv_vsoxseg8ei8_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_f16mf2_m(...) __riscv_vsoxseg2ei8_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_f16mf2_m(...) __riscv_vsoxseg3ei8_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_f16mf2_m(...) __riscv_vsoxseg4ei8_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_f16mf2_m(...) __riscv_vsoxseg5ei8_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_f16mf2_m(...) __riscv_vsoxseg6ei8_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_f16mf2_m(...) __riscv_vsoxseg7ei8_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_f16mf2_m(...) __riscv_vsoxseg8ei8_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_f16m1_m(...) __riscv_vsoxseg2ei8_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_f16m1_m(...) __riscv_vsoxseg3ei8_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_f16m1_m(...) __riscv_vsoxseg4ei8_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_f16m1_m(...) __riscv_vsoxseg5ei8_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_f16m1_m(...) __riscv_vsoxseg6ei8_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_f16m1_m(...) __riscv_vsoxseg7ei8_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_f16m1_m(...) __riscv_vsoxseg8ei8_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_f16m2_m(...) __riscv_vsoxseg2ei8_v_f16m2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_f16m2_m(...) __riscv_vsoxseg3ei8_v_f16m2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_f16m2_m(...) __riscv_vsoxseg4ei8_v_f16m2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_f16m4_m(...) __riscv_vsoxseg2ei8_v_f16m4_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_f16mf4_m(...) __riscv_vsoxseg2ei16_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_f16mf4_m(...) __riscv_vsoxseg3ei16_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_f16mf4_m(...) __riscv_vsoxseg4ei16_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_f16mf4_m(...) __riscv_vsoxseg5ei16_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_f16mf4_m(...) __riscv_vsoxseg6ei16_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_f16mf4_m(...) __riscv_vsoxseg7ei16_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_f16mf4_m(...) __riscv_vsoxseg8ei16_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_f16mf2_m(...) __riscv_vsoxseg2ei16_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_f16mf2_m(...) __riscv_vsoxseg3ei16_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_f16mf2_m(...) __riscv_vsoxseg4ei16_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_f16mf2_m(...) __riscv_vsoxseg5ei16_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_f16mf2_m(...) __riscv_vsoxseg6ei16_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_f16mf2_m(...) __riscv_vsoxseg7ei16_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_f16mf2_m(...) __riscv_vsoxseg8ei16_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_f16m1_m(...) __riscv_vsoxseg2ei16_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_f16m1_m(...) __riscv_vsoxseg3ei16_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_f16m1_m(...) __riscv_vsoxseg4ei16_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_f16m1_m(...) __riscv_vsoxseg5ei16_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_f16m1_m(...) __riscv_vsoxseg6ei16_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_f16m1_m(...) __riscv_vsoxseg7ei16_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_f16m1_m(...) __riscv_vsoxseg8ei16_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_f16m2_m(...) __riscv_vsoxseg2ei16_v_f16m2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_f16m2_m(...) __riscv_vsoxseg3ei16_v_f16m2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_f16m2_m(...) __riscv_vsoxseg4ei16_v_f16m2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_f16m4_m(...) __riscv_vsoxseg2ei16_v_f16m4_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_f16mf4_m(...) __riscv_vsoxseg2ei32_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_f16mf4_m(...) __riscv_vsoxseg3ei32_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_f16mf4_m(...) __riscv_vsoxseg4ei32_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_f16mf4_m(...) __riscv_vsoxseg5ei32_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_f16mf4_m(...) __riscv_vsoxseg6ei32_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_f16mf4_m(...) __riscv_vsoxseg7ei32_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_f16mf4_m(...) __riscv_vsoxseg8ei32_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_f16mf2_m(...) __riscv_vsoxseg2ei32_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_f16mf2_m(...) __riscv_vsoxseg3ei32_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_f16mf2_m(...) __riscv_vsoxseg4ei32_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_f16mf2_m(...) __riscv_vsoxseg5ei32_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_f16mf2_m(...) __riscv_vsoxseg6ei32_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_f16mf2_m(...) __riscv_vsoxseg7ei32_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_f16mf2_m(...) __riscv_vsoxseg8ei32_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_f16m1_m(...) __riscv_vsoxseg2ei32_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_f16m1_m(...) __riscv_vsoxseg3ei32_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_f16m1_m(...) __riscv_vsoxseg4ei32_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_f16m1_m(...) __riscv_vsoxseg5ei32_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_f16m1_m(...) __riscv_vsoxseg6ei32_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_f16m1_m(...) __riscv_vsoxseg7ei32_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_f16m1_m(...) __riscv_vsoxseg8ei32_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_f16m2_m(...) __riscv_vsoxseg2ei32_v_f16m2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_f16m2_m(...) __riscv_vsoxseg3ei32_v_f16m2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_f16m2_m(...) __riscv_vsoxseg4ei32_v_f16m2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_f16m4_m(...) __riscv_vsoxseg2ei32_v_f16m4_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_f16mf4_m(...) __riscv_vsoxseg2ei64_v_f16mf4_m(__VA_ARGS__) // Compat aliases: vsoxseg<N>ei64_v_f16*_m -> same name with __riscv_ prefix, arguments forwarded via __VA_ARGS__. This run covers mf4, mf2, m1 and m2 only (no f16m4 entry).
+#define vsoxseg3ei64_v_f16mf4_m(...) __riscv_vsoxseg3ei64_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_f16mf4_m(...) __riscv_vsoxseg4ei64_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_f16mf4_m(...) __riscv_vsoxseg5ei64_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_f16mf4_m(...) __riscv_vsoxseg6ei64_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_f16mf4_m(...) __riscv_vsoxseg7ei64_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_f16mf4_m(...) __riscv_vsoxseg8ei64_v_f16mf4_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_f16mf2_m(...) __riscv_vsoxseg2ei64_v_f16mf2_m(__VA_ARGS__) // f16mf2: seg2..seg8
+#define vsoxseg3ei64_v_f16mf2_m(...) __riscv_vsoxseg3ei64_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_f16mf2_m(...) __riscv_vsoxseg4ei64_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_f16mf2_m(...) __riscv_vsoxseg5ei64_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_f16mf2_m(...) __riscv_vsoxseg6ei64_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_f16mf2_m(...) __riscv_vsoxseg7ei64_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_f16mf2_m(...) __riscv_vsoxseg8ei64_v_f16mf2_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_f16m1_m(...) __riscv_vsoxseg2ei64_v_f16m1_m(__VA_ARGS__) // f16m1: seg2..seg8
+#define vsoxseg3ei64_v_f16m1_m(...) __riscv_vsoxseg3ei64_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_f16m1_m(...) __riscv_vsoxseg4ei64_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_f16m1_m(...) __riscv_vsoxseg5ei64_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_f16m1_m(...) __riscv_vsoxseg6ei64_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_f16m1_m(...) __riscv_vsoxseg7ei64_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_f16m1_m(...) __riscv_vsoxseg8ei64_v_f16m1_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_f16m2_m(...) __riscv_vsoxseg2ei64_v_f16m2_m(__VA_ARGS__) // f16m2: only seg2..seg4 are aliased here
+#define vsoxseg3ei64_v_f16m2_m(...) __riscv_vsoxseg3ei64_v_f16m2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_f16m2_m(...) __riscv_vsoxseg4ei64_v_f16m2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_f32mf2_m(...) __riscv_vsoxseg2ei8_v_f32mf2_m(__VA_ARGS__) // Compat aliases: vsoxseg<N>ei{8,16,32,64}_v_f32*_m -> same name with __riscv_ prefix, arguments forwarded via __VA_ARGS__. Per index width: mf2/m1 seg2..seg8, m2 seg2..seg4, m4 seg2. ei8 names start here.
+#define vsoxseg3ei8_v_f32mf2_m(...) __riscv_vsoxseg3ei8_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_f32mf2_m(...) __riscv_vsoxseg4ei8_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_f32mf2_m(...) __riscv_vsoxseg5ei8_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_f32mf2_m(...) __riscv_vsoxseg6ei8_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_f32mf2_m(...) __riscv_vsoxseg7ei8_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_f32mf2_m(...) __riscv_vsoxseg8ei8_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_f32m1_m(...) __riscv_vsoxseg2ei8_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_f32m1_m(...) __riscv_vsoxseg3ei8_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_f32m1_m(...) __riscv_vsoxseg4ei8_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_f32m1_m(...) __riscv_vsoxseg5ei8_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_f32m1_m(...) __riscv_vsoxseg6ei8_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_f32m1_m(...) __riscv_vsoxseg7ei8_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_f32m1_m(...) __riscv_vsoxseg8ei8_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_f32m2_m(...) __riscv_vsoxseg2ei8_v_f32m2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_f32m2_m(...) __riscv_vsoxseg3ei8_v_f32m2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_f32m2_m(...) __riscv_vsoxseg4ei8_v_f32m2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_f32m4_m(...) __riscv_vsoxseg2ei8_v_f32m4_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_f32mf2_m(...) __riscv_vsoxseg2ei16_v_f32mf2_m(__VA_ARGS__) // ei16 names start here
+#define vsoxseg3ei16_v_f32mf2_m(...) __riscv_vsoxseg3ei16_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_f32mf2_m(...) __riscv_vsoxseg4ei16_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_f32mf2_m(...) __riscv_vsoxseg5ei16_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_f32mf2_m(...) __riscv_vsoxseg6ei16_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_f32mf2_m(...) __riscv_vsoxseg7ei16_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_f32mf2_m(...) __riscv_vsoxseg8ei16_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_f32m1_m(...) __riscv_vsoxseg2ei16_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_f32m1_m(...) __riscv_vsoxseg3ei16_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_f32m1_m(...) __riscv_vsoxseg4ei16_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_f32m1_m(...) __riscv_vsoxseg5ei16_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_f32m1_m(...) __riscv_vsoxseg6ei16_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_f32m1_m(...) __riscv_vsoxseg7ei16_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_f32m1_m(...) __riscv_vsoxseg8ei16_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_f32m2_m(...) __riscv_vsoxseg2ei16_v_f32m2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_f32m2_m(...) __riscv_vsoxseg3ei16_v_f32m2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_f32m2_m(...) __riscv_vsoxseg4ei16_v_f32m2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_f32m4_m(...) __riscv_vsoxseg2ei16_v_f32m4_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_f32mf2_m(...) __riscv_vsoxseg2ei32_v_f32mf2_m(__VA_ARGS__) // ei32 names start here
+#define vsoxseg3ei32_v_f32mf2_m(...) __riscv_vsoxseg3ei32_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_f32mf2_m(...) __riscv_vsoxseg4ei32_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_f32mf2_m(...) __riscv_vsoxseg5ei32_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_f32mf2_m(...) __riscv_vsoxseg6ei32_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_f32mf2_m(...) __riscv_vsoxseg7ei32_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_f32mf2_m(...) __riscv_vsoxseg8ei32_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_f32m1_m(...) __riscv_vsoxseg2ei32_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_f32m1_m(...) __riscv_vsoxseg3ei32_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_f32m1_m(...) __riscv_vsoxseg4ei32_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_f32m1_m(...) __riscv_vsoxseg5ei32_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_f32m1_m(...) __riscv_vsoxseg6ei32_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_f32m1_m(...) __riscv_vsoxseg7ei32_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_f32m1_m(...) __riscv_vsoxseg8ei32_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_f32m2_m(...) __riscv_vsoxseg2ei32_v_f32m2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_f32m2_m(...) __riscv_vsoxseg3ei32_v_f32m2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_f32m2_m(...) __riscv_vsoxseg4ei32_v_f32m2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_f32m4_m(...) __riscv_vsoxseg2ei32_v_f32m4_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_f32mf2_m(...) __riscv_vsoxseg2ei64_v_f32mf2_m(__VA_ARGS__) // ei64 names start here
+#define vsoxseg3ei64_v_f32mf2_m(...) __riscv_vsoxseg3ei64_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_f32mf2_m(...) __riscv_vsoxseg4ei64_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_f32mf2_m(...) __riscv_vsoxseg5ei64_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_f32mf2_m(...) __riscv_vsoxseg6ei64_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_f32mf2_m(...) __riscv_vsoxseg7ei64_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_f32mf2_m(...) __riscv_vsoxseg8ei64_v_f32mf2_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_f32m1_m(...) __riscv_vsoxseg2ei64_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_f32m1_m(...) __riscv_vsoxseg3ei64_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_f32m1_m(...) __riscv_vsoxseg4ei64_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_f32m1_m(...) __riscv_vsoxseg5ei64_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_f32m1_m(...) __riscv_vsoxseg6ei64_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_f32m1_m(...) __riscv_vsoxseg7ei64_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_f32m1_m(...) __riscv_vsoxseg8ei64_v_f32m1_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_f32m2_m(...) __riscv_vsoxseg2ei64_v_f32m2_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_f32m2_m(...) __riscv_vsoxseg3ei64_v_f32m2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_f32m2_m(...) __riscv_vsoxseg4ei64_v_f32m2_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_f32m4_m(...) __riscv_vsoxseg2ei64_v_f32m4_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_f64m1_m(...) __riscv_vsoxseg2ei8_v_f64m1_m(__VA_ARGS__) // Compat aliases: vsoxseg<N>ei{8,16,32,64}_v_f64*_m -> same name with __riscv_ prefix, arguments forwarded via __VA_ARGS__. Per index width: m1 seg2..seg8, m2 seg2..seg4, m4 seg2. ei8 names start here.
+#define vsoxseg3ei8_v_f64m1_m(...) __riscv_vsoxseg3ei8_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_f64m1_m(...) __riscv_vsoxseg4ei8_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_f64m1_m(...) __riscv_vsoxseg5ei8_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_f64m1_m(...) __riscv_vsoxseg6ei8_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_f64m1_m(...) __riscv_vsoxseg7ei8_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_f64m1_m(...) __riscv_vsoxseg8ei8_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_f64m2_m(...) __riscv_vsoxseg2ei8_v_f64m2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_f64m2_m(...) __riscv_vsoxseg3ei8_v_f64m2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_f64m2_m(...) __riscv_vsoxseg4ei8_v_f64m2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_f64m4_m(...) __riscv_vsoxseg2ei8_v_f64m4_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_f64m1_m(...) __riscv_vsoxseg2ei16_v_f64m1_m(__VA_ARGS__) // ei16 names start here
+#define vsoxseg3ei16_v_f64m1_m(...) __riscv_vsoxseg3ei16_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_f64m1_m(...) __riscv_vsoxseg4ei16_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_f64m1_m(...) __riscv_vsoxseg5ei16_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_f64m1_m(...) __riscv_vsoxseg6ei16_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_f64m1_m(...) __riscv_vsoxseg7ei16_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_f64m1_m(...) __riscv_vsoxseg8ei16_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_f64m2_m(...) __riscv_vsoxseg2ei16_v_f64m2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_f64m2_m(...) __riscv_vsoxseg3ei16_v_f64m2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_f64m2_m(...) __riscv_vsoxseg4ei16_v_f64m2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_f64m4_m(...) __riscv_vsoxseg2ei16_v_f64m4_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_f64m1_m(...) __riscv_vsoxseg2ei32_v_f64m1_m(__VA_ARGS__) // ei32 names start here
+#define vsoxseg3ei32_v_f64m1_m(...) __riscv_vsoxseg3ei32_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_f64m1_m(...) __riscv_vsoxseg4ei32_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_f64m1_m(...) __riscv_vsoxseg5ei32_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_f64m1_m(...) __riscv_vsoxseg6ei32_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_f64m1_m(...) __riscv_vsoxseg7ei32_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_f64m1_m(...) __riscv_vsoxseg8ei32_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_f64m2_m(...) __riscv_vsoxseg2ei32_v_f64m2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_f64m2_m(...) __riscv_vsoxseg3ei32_v_f64m2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_f64m2_m(...) __riscv_vsoxseg4ei32_v_f64m2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_f64m4_m(...) __riscv_vsoxseg2ei32_v_f64m4_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_f64m1_m(...) __riscv_vsoxseg2ei64_v_f64m1_m(__VA_ARGS__) // ei64 names start here
+#define vsoxseg3ei64_v_f64m1_m(...) __riscv_vsoxseg3ei64_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_f64m1_m(...) __riscv_vsoxseg4ei64_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_f64m1_m(...) __riscv_vsoxseg5ei64_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_f64m1_m(...) __riscv_vsoxseg6ei64_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_f64m1_m(...) __riscv_vsoxseg7ei64_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_f64m1_m(...) __riscv_vsoxseg8ei64_v_f64m1_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_f64m2_m(...) __riscv_vsoxseg2ei64_v_f64m2_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_f64m2_m(...) __riscv_vsoxseg3ei64_v_f64m2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_f64m2_m(...) __riscv_vsoxseg4ei64_v_f64m2_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_f64m4_m(...) __riscv_vsoxseg2ei64_v_f64m4_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_f16mf4_m(...) __riscv_vsuxseg2ei8_v_f16mf4_m(__VA_ARGS__) // Compat aliases: vsuxseg<N>ei{8,16,32,64}_v_f16*_m -> same name with __riscv_ prefix, arguments forwarded via __VA_ARGS__. Per index width: mf4/mf2/m1 seg2..seg8, m2 seg2..seg4, m4 seg2 (the ei64 run has no m4 entry). ei8 names start here.
+#define vsuxseg3ei8_v_f16mf4_m(...) __riscv_vsuxseg3ei8_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_f16mf4_m(...) __riscv_vsuxseg4ei8_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_f16mf4_m(...) __riscv_vsuxseg5ei8_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_f16mf4_m(...) __riscv_vsuxseg6ei8_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_f16mf4_m(...) __riscv_vsuxseg7ei8_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_f16mf4_m(...) __riscv_vsuxseg8ei8_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_f16mf2_m(...) __riscv_vsuxseg2ei8_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_f16mf2_m(...) __riscv_vsuxseg3ei8_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_f16mf2_m(...) __riscv_vsuxseg4ei8_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_f16mf2_m(...) __riscv_vsuxseg5ei8_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_f16mf2_m(...) __riscv_vsuxseg6ei8_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_f16mf2_m(...) __riscv_vsuxseg7ei8_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_f16mf2_m(...) __riscv_vsuxseg8ei8_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_f16m1_m(...) __riscv_vsuxseg2ei8_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_f16m1_m(...) __riscv_vsuxseg3ei8_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_f16m1_m(...) __riscv_vsuxseg4ei8_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_f16m1_m(...) __riscv_vsuxseg5ei8_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_f16m1_m(...) __riscv_vsuxseg6ei8_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_f16m1_m(...) __riscv_vsuxseg7ei8_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_f16m1_m(...) __riscv_vsuxseg8ei8_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_f16m2_m(...) __riscv_vsuxseg2ei8_v_f16m2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_f16m2_m(...) __riscv_vsuxseg3ei8_v_f16m2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_f16m2_m(...) __riscv_vsuxseg4ei8_v_f16m2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_f16m4_m(...) __riscv_vsuxseg2ei8_v_f16m4_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_f16mf4_m(...) __riscv_vsuxseg2ei16_v_f16mf4_m(__VA_ARGS__) // ei16 names start here
+#define vsuxseg3ei16_v_f16mf4_m(...) __riscv_vsuxseg3ei16_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_f16mf4_m(...) __riscv_vsuxseg4ei16_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_f16mf4_m(...) __riscv_vsuxseg5ei16_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_f16mf4_m(...) __riscv_vsuxseg6ei16_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_f16mf4_m(...) __riscv_vsuxseg7ei16_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_f16mf4_m(...) __riscv_vsuxseg8ei16_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_f16mf2_m(...) __riscv_vsuxseg2ei16_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_f16mf2_m(...) __riscv_vsuxseg3ei16_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_f16mf2_m(...) __riscv_vsuxseg4ei16_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_f16mf2_m(...) __riscv_vsuxseg5ei16_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_f16mf2_m(...) __riscv_vsuxseg6ei16_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_f16mf2_m(...) __riscv_vsuxseg7ei16_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_f16mf2_m(...) __riscv_vsuxseg8ei16_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_f16m1_m(...) __riscv_vsuxseg2ei16_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_f16m1_m(...) __riscv_vsuxseg3ei16_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_f16m1_m(...) __riscv_vsuxseg4ei16_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_f16m1_m(...) __riscv_vsuxseg5ei16_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_f16m1_m(...) __riscv_vsuxseg6ei16_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_f16m1_m(...) __riscv_vsuxseg7ei16_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_f16m1_m(...) __riscv_vsuxseg8ei16_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_f16m2_m(...) __riscv_vsuxseg2ei16_v_f16m2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_f16m2_m(...) __riscv_vsuxseg3ei16_v_f16m2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_f16m2_m(...) __riscv_vsuxseg4ei16_v_f16m2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_f16m4_m(...) __riscv_vsuxseg2ei16_v_f16m4_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_f16mf4_m(...) __riscv_vsuxseg2ei32_v_f16mf4_m(__VA_ARGS__) // ei32 names start here
+#define vsuxseg3ei32_v_f16mf4_m(...) __riscv_vsuxseg3ei32_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_f16mf4_m(...) __riscv_vsuxseg4ei32_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_f16mf4_m(...) __riscv_vsuxseg5ei32_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_f16mf4_m(...) __riscv_vsuxseg6ei32_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_f16mf4_m(...) __riscv_vsuxseg7ei32_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_f16mf4_m(...) __riscv_vsuxseg8ei32_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_f16mf2_m(...) __riscv_vsuxseg2ei32_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_f16mf2_m(...) __riscv_vsuxseg3ei32_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_f16mf2_m(...) __riscv_vsuxseg4ei32_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_f16mf2_m(...) __riscv_vsuxseg5ei32_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_f16mf2_m(...) __riscv_vsuxseg6ei32_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_f16mf2_m(...) __riscv_vsuxseg7ei32_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_f16mf2_m(...) __riscv_vsuxseg8ei32_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_f16m1_m(...) __riscv_vsuxseg2ei32_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_f16m1_m(...) __riscv_vsuxseg3ei32_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_f16m1_m(...) __riscv_vsuxseg4ei32_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_f16m1_m(...) __riscv_vsuxseg5ei32_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_f16m1_m(...) __riscv_vsuxseg6ei32_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_f16m1_m(...) __riscv_vsuxseg7ei32_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_f16m1_m(...) __riscv_vsuxseg8ei32_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_f16m2_m(...) __riscv_vsuxseg2ei32_v_f16m2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_f16m2_m(...) __riscv_vsuxseg3ei32_v_f16m2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_f16m2_m(...) __riscv_vsuxseg4ei32_v_f16m2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_f16m4_m(...) __riscv_vsuxseg2ei32_v_f16m4_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_f16mf4_m(...) __riscv_vsuxseg2ei64_v_f16mf4_m(__VA_ARGS__) // ei64 names start here (mf4, mf2, m1, m2 only)
+#define vsuxseg3ei64_v_f16mf4_m(...) __riscv_vsuxseg3ei64_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_f16mf4_m(...) __riscv_vsuxseg4ei64_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_f16mf4_m(...) __riscv_vsuxseg5ei64_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_f16mf4_m(...) __riscv_vsuxseg6ei64_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_f16mf4_m(...) __riscv_vsuxseg7ei64_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_f16mf4_m(...) __riscv_vsuxseg8ei64_v_f16mf4_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_f16mf2_m(...) __riscv_vsuxseg2ei64_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_f16mf2_m(...) __riscv_vsuxseg3ei64_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_f16mf2_m(...) __riscv_vsuxseg4ei64_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_f16mf2_m(...) __riscv_vsuxseg5ei64_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_f16mf2_m(...) __riscv_vsuxseg6ei64_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_f16mf2_m(...) __riscv_vsuxseg7ei64_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_f16mf2_m(...) __riscv_vsuxseg8ei64_v_f16mf2_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_f16m1_m(...) __riscv_vsuxseg2ei64_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_f16m1_m(...) __riscv_vsuxseg3ei64_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_f16m1_m(...) __riscv_vsuxseg4ei64_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_f16m1_m(...) __riscv_vsuxseg5ei64_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_f16m1_m(...) __riscv_vsuxseg6ei64_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_f16m1_m(...) __riscv_vsuxseg7ei64_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_f16m1_m(...) __riscv_vsuxseg8ei64_v_f16m1_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_f16m2_m(...) __riscv_vsuxseg2ei64_v_f16m2_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_f16m2_m(...) __riscv_vsuxseg3ei64_v_f16m2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_f16m2_m(...) __riscv_vsuxseg4ei64_v_f16m2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_f32mf2_m(...) __riscv_vsuxseg2ei8_v_f32mf2_m(__VA_ARGS__) // Compat aliases: vsuxseg<N>ei{8,16,32,64}_v_f32*_m -> same name with __riscv_ prefix, arguments forwarded via __VA_ARGS__. Per index width: mf2/m1 seg2..seg8, m2 seg2..seg4, m4 seg2. ei8 names start here.
+#define vsuxseg3ei8_v_f32mf2_m(...) __riscv_vsuxseg3ei8_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_f32mf2_m(...) __riscv_vsuxseg4ei8_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_f32mf2_m(...) __riscv_vsuxseg5ei8_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_f32mf2_m(...) __riscv_vsuxseg6ei8_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_f32mf2_m(...) __riscv_vsuxseg7ei8_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_f32mf2_m(...) __riscv_vsuxseg8ei8_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_f32m1_m(...) __riscv_vsuxseg2ei8_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_f32m1_m(...) __riscv_vsuxseg3ei8_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_f32m1_m(...) __riscv_vsuxseg4ei8_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_f32m1_m(...) __riscv_vsuxseg5ei8_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_f32m1_m(...) __riscv_vsuxseg6ei8_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_f32m1_m(...) __riscv_vsuxseg7ei8_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_f32m1_m(...) __riscv_vsuxseg8ei8_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_f32m2_m(...) __riscv_vsuxseg2ei8_v_f32m2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_f32m2_m(...) __riscv_vsuxseg3ei8_v_f32m2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_f32m2_m(...) __riscv_vsuxseg4ei8_v_f32m2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_f32m4_m(...) __riscv_vsuxseg2ei8_v_f32m4_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_f32mf2_m(...) __riscv_vsuxseg2ei16_v_f32mf2_m(__VA_ARGS__) // ei16 names start here
+#define vsuxseg3ei16_v_f32mf2_m(...) __riscv_vsuxseg3ei16_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_f32mf2_m(...) __riscv_vsuxseg4ei16_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_f32mf2_m(...) __riscv_vsuxseg5ei16_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_f32mf2_m(...) __riscv_vsuxseg6ei16_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_f32mf2_m(...) __riscv_vsuxseg7ei16_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_f32mf2_m(...) __riscv_vsuxseg8ei16_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_f32m1_m(...) __riscv_vsuxseg2ei16_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_f32m1_m(...) __riscv_vsuxseg3ei16_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_f32m1_m(...) __riscv_vsuxseg4ei16_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_f32m1_m(...) __riscv_vsuxseg5ei16_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_f32m1_m(...) __riscv_vsuxseg6ei16_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_f32m1_m(...) __riscv_vsuxseg7ei16_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_f32m1_m(...) __riscv_vsuxseg8ei16_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_f32m2_m(...) __riscv_vsuxseg2ei16_v_f32m2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_f32m2_m(...) __riscv_vsuxseg3ei16_v_f32m2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_f32m2_m(...) __riscv_vsuxseg4ei16_v_f32m2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_f32m4_m(...) __riscv_vsuxseg2ei16_v_f32m4_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_f32mf2_m(...) __riscv_vsuxseg2ei32_v_f32mf2_m(__VA_ARGS__) // ei32 names start here
+#define vsuxseg3ei32_v_f32mf2_m(...) __riscv_vsuxseg3ei32_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_f32mf2_m(...) __riscv_vsuxseg4ei32_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_f32mf2_m(...) __riscv_vsuxseg5ei32_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_f32mf2_m(...) __riscv_vsuxseg6ei32_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_f32mf2_m(...) __riscv_vsuxseg7ei32_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_f32mf2_m(...) __riscv_vsuxseg8ei32_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_f32m1_m(...) __riscv_vsuxseg2ei32_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_f32m1_m(...) __riscv_vsuxseg3ei32_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_f32m1_m(...) __riscv_vsuxseg4ei32_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_f32m1_m(...) __riscv_vsuxseg5ei32_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_f32m1_m(...) __riscv_vsuxseg6ei32_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_f32m1_m(...) __riscv_vsuxseg7ei32_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_f32m1_m(...) __riscv_vsuxseg8ei32_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_f32m2_m(...) __riscv_vsuxseg2ei32_v_f32m2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_f32m2_m(...) __riscv_vsuxseg3ei32_v_f32m2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_f32m2_m(...) __riscv_vsuxseg4ei32_v_f32m2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_f32m4_m(...) __riscv_vsuxseg2ei32_v_f32m4_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_f32mf2_m(...) __riscv_vsuxseg2ei64_v_f32mf2_m(__VA_ARGS__) // ei64 names start here
+#define vsuxseg3ei64_v_f32mf2_m(...) __riscv_vsuxseg3ei64_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_f32mf2_m(...) __riscv_vsuxseg4ei64_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_f32mf2_m(...) __riscv_vsuxseg5ei64_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_f32mf2_m(...) __riscv_vsuxseg6ei64_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_f32mf2_m(...) __riscv_vsuxseg7ei64_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_f32mf2_m(...) __riscv_vsuxseg8ei64_v_f32mf2_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_f32m1_m(...) __riscv_vsuxseg2ei64_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_f32m1_m(...) __riscv_vsuxseg3ei64_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_f32m1_m(...) __riscv_vsuxseg4ei64_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_f32m1_m(...) __riscv_vsuxseg5ei64_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_f32m1_m(...) __riscv_vsuxseg6ei64_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_f32m1_m(...) __riscv_vsuxseg7ei64_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_f32m1_m(...) __riscv_vsuxseg8ei64_v_f32m1_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_f32m2_m(...) __riscv_vsuxseg2ei64_v_f32m2_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_f32m2_m(...) __riscv_vsuxseg3ei64_v_f32m2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_f32m2_m(...) __riscv_vsuxseg4ei64_v_f32m2_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_f32m4_m(...) __riscv_vsuxseg2ei64_v_f32m4_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_f64m1_m(...) __riscv_vsuxseg2ei8_v_f64m1_m(__VA_ARGS__) // Compat aliases: vsuxseg<N>ei{8,16,32,64}_v_f64*_m -> same name with __riscv_ prefix, arguments forwarded via __VA_ARGS__. Per index width: m1 seg2..seg8, m2 seg2..seg4, m4 seg2. ei8 names start here.
+#define vsuxseg3ei8_v_f64m1_m(...) __riscv_vsuxseg3ei8_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_f64m1_m(...) __riscv_vsuxseg4ei8_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_f64m1_m(...) __riscv_vsuxseg5ei8_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_f64m1_m(...) __riscv_vsuxseg6ei8_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_f64m1_m(...) __riscv_vsuxseg7ei8_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_f64m1_m(...) __riscv_vsuxseg8ei8_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_f64m2_m(...) __riscv_vsuxseg2ei8_v_f64m2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_f64m2_m(...) __riscv_vsuxseg3ei8_v_f64m2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_f64m2_m(...) __riscv_vsuxseg4ei8_v_f64m2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_f64m4_m(...) __riscv_vsuxseg2ei8_v_f64m4_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_f64m1_m(...) __riscv_vsuxseg2ei16_v_f64m1_m(__VA_ARGS__) // ei16 names start here
+#define vsuxseg3ei16_v_f64m1_m(...) __riscv_vsuxseg3ei16_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_f64m1_m(...) __riscv_vsuxseg4ei16_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_f64m1_m(...) __riscv_vsuxseg5ei16_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_f64m1_m(...) __riscv_vsuxseg6ei16_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_f64m1_m(...) __riscv_vsuxseg7ei16_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_f64m1_m(...) __riscv_vsuxseg8ei16_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_f64m2_m(...) __riscv_vsuxseg2ei16_v_f64m2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_f64m2_m(...) __riscv_vsuxseg3ei16_v_f64m2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_f64m2_m(...) __riscv_vsuxseg4ei16_v_f64m2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_f64m4_m(...) __riscv_vsuxseg2ei16_v_f64m4_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_f64m1_m(...) __riscv_vsuxseg2ei32_v_f64m1_m(__VA_ARGS__) // ei32 names start here
+#define vsuxseg3ei32_v_f64m1_m(...) __riscv_vsuxseg3ei32_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_f64m1_m(...) __riscv_vsuxseg4ei32_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_f64m1_m(...) __riscv_vsuxseg5ei32_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_f64m1_m(...) __riscv_vsuxseg6ei32_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_f64m1_m(...) __riscv_vsuxseg7ei32_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_f64m1_m(...) __riscv_vsuxseg8ei32_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_f64m2_m(...) __riscv_vsuxseg2ei32_v_f64m2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_f64m2_m(...) __riscv_vsuxseg3ei32_v_f64m2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_f64m2_m(...) __riscv_vsuxseg4ei32_v_f64m2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_f64m4_m(...) __riscv_vsuxseg2ei32_v_f64m4_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_f64m1_m(...) __riscv_vsuxseg2ei64_v_f64m1_m(__VA_ARGS__) // ei64 names start here
+#define vsuxseg3ei64_v_f64m1_m(...) __riscv_vsuxseg3ei64_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_f64m1_m(...) __riscv_vsuxseg4ei64_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_f64m1_m(...) __riscv_vsuxseg5ei64_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_f64m1_m(...) __riscv_vsuxseg6ei64_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_f64m1_m(...) __riscv_vsuxseg7ei64_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_f64m1_m(...) __riscv_vsuxseg8ei64_v_f64m1_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_f64m2_m(...) __riscv_vsuxseg2ei64_v_f64m2_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_f64m2_m(...) __riscv_vsuxseg3ei64_v_f64m2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_f64m2_m(...) __riscv_vsuxseg4ei64_v_f64m2_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_f64m4_m(...) __riscv_vsuxseg2ei64_v_f64m4_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i8mf8_m(...) __riscv_vsoxseg2ei8_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_i8mf8_m(...) __riscv_vsoxseg3ei8_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_i8mf8_m(...) __riscv_vsoxseg4ei8_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_i8mf8_m(...) __riscv_vsoxseg5ei8_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_i8mf8_m(...) __riscv_vsoxseg6ei8_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_i8mf8_m(...) __riscv_vsoxseg7ei8_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_i8mf8_m(...) __riscv_vsoxseg8ei8_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i8mf4_m(...) __riscv_vsoxseg2ei8_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_i8mf4_m(...) __riscv_vsoxseg3ei8_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_i8mf4_m(...) __riscv_vsoxseg4ei8_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_i8mf4_m(...) __riscv_vsoxseg5ei8_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_i8mf4_m(...) __riscv_vsoxseg6ei8_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_i8mf4_m(...) __riscv_vsoxseg7ei8_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_i8mf4_m(...) __riscv_vsoxseg8ei8_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i8mf2_m(...) __riscv_vsoxseg2ei8_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_i8mf2_m(...) __riscv_vsoxseg3ei8_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_i8mf2_m(...) __riscv_vsoxseg4ei8_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_i8mf2_m(...) __riscv_vsoxseg5ei8_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_i8mf2_m(...) __riscv_vsoxseg6ei8_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_i8mf2_m(...) __riscv_vsoxseg7ei8_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_i8mf2_m(...) __riscv_vsoxseg8ei8_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i8m1_m(...) __riscv_vsoxseg2ei8_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_i8m1_m(...) __riscv_vsoxseg3ei8_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_i8m1_m(...) __riscv_vsoxseg4ei8_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_i8m1_m(...) __riscv_vsoxseg5ei8_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_i8m1_m(...) __riscv_vsoxseg6ei8_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_i8m1_m(...) __riscv_vsoxseg7ei8_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_i8m1_m(...) __riscv_vsoxseg8ei8_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i8m2_m(...) __riscv_vsoxseg2ei8_v_i8m2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_i8m2_m(...) __riscv_vsoxseg3ei8_v_i8m2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_i8m2_m(...) __riscv_vsoxseg4ei8_v_i8m2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i8m4_m(...) __riscv_vsoxseg2ei8_v_i8m4_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i8mf8_m(...) __riscv_vsoxseg2ei16_v_i8mf8_m(__VA_ARGS__) // vsoxseg*ei16_v_i8*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsoxseg3ei16_v_i8mf8_m(...) __riscv_vsoxseg3ei16_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_i8mf8_m(...) __riscv_vsoxseg4ei16_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_i8mf8_m(...) __riscv_vsoxseg5ei16_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_i8mf8_m(...) __riscv_vsoxseg6ei16_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_i8mf8_m(...) __riscv_vsoxseg7ei16_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_i8mf8_m(...) __riscv_vsoxseg8ei16_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i8mf4_m(...) __riscv_vsoxseg2ei16_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_i8mf4_m(...) __riscv_vsoxseg3ei16_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_i8mf4_m(...) __riscv_vsoxseg4ei16_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_i8mf4_m(...) __riscv_vsoxseg5ei16_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_i8mf4_m(...) __riscv_vsoxseg6ei16_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_i8mf4_m(...) __riscv_vsoxseg7ei16_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_i8mf4_m(...) __riscv_vsoxseg8ei16_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i8mf2_m(...) __riscv_vsoxseg2ei16_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_i8mf2_m(...) __riscv_vsoxseg3ei16_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_i8mf2_m(...) __riscv_vsoxseg4ei16_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_i8mf2_m(...) __riscv_vsoxseg5ei16_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_i8mf2_m(...) __riscv_vsoxseg6ei16_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_i8mf2_m(...) __riscv_vsoxseg7ei16_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_i8mf2_m(...) __riscv_vsoxseg8ei16_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i8m1_m(...) __riscv_vsoxseg2ei16_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_i8m1_m(...) __riscv_vsoxseg3ei16_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_i8m1_m(...) __riscv_vsoxseg4ei16_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_i8m1_m(...) __riscv_vsoxseg5ei16_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_i8m1_m(...) __riscv_vsoxseg6ei16_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_i8m1_m(...) __riscv_vsoxseg7ei16_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_i8m1_m(...) __riscv_vsoxseg8ei16_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i8m2_m(...) __riscv_vsoxseg2ei16_v_i8m2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_i8m2_m(...) __riscv_vsoxseg3ei16_v_i8m2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_i8m2_m(...) __riscv_vsoxseg4ei16_v_i8m2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i8m4_m(...) __riscv_vsoxseg2ei16_v_i8m4_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i8mf8_m(...) __riscv_vsoxseg2ei32_v_i8mf8_m(__VA_ARGS__) // vsoxseg*ei32_v_i8*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsoxseg3ei32_v_i8mf8_m(...) __riscv_vsoxseg3ei32_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_i8mf8_m(...) __riscv_vsoxseg4ei32_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_i8mf8_m(...) __riscv_vsoxseg5ei32_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_i8mf8_m(...) __riscv_vsoxseg6ei32_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_i8mf8_m(...) __riscv_vsoxseg7ei32_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_i8mf8_m(...) __riscv_vsoxseg8ei32_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i8mf4_m(...) __riscv_vsoxseg2ei32_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_i8mf4_m(...) __riscv_vsoxseg3ei32_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_i8mf4_m(...) __riscv_vsoxseg4ei32_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_i8mf4_m(...) __riscv_vsoxseg5ei32_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_i8mf4_m(...) __riscv_vsoxseg6ei32_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_i8mf4_m(...) __riscv_vsoxseg7ei32_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_i8mf4_m(...) __riscv_vsoxseg8ei32_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i8mf2_m(...) __riscv_vsoxseg2ei32_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_i8mf2_m(...) __riscv_vsoxseg3ei32_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_i8mf2_m(...) __riscv_vsoxseg4ei32_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_i8mf2_m(...) __riscv_vsoxseg5ei32_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_i8mf2_m(...) __riscv_vsoxseg6ei32_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_i8mf2_m(...) __riscv_vsoxseg7ei32_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_i8mf2_m(...) __riscv_vsoxseg8ei32_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i8m1_m(...) __riscv_vsoxseg2ei32_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_i8m1_m(...) __riscv_vsoxseg3ei32_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_i8m1_m(...) __riscv_vsoxseg4ei32_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_i8m1_m(...) __riscv_vsoxseg5ei32_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_i8m1_m(...) __riscv_vsoxseg6ei32_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_i8m1_m(...) __riscv_vsoxseg7ei32_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_i8m1_m(...) __riscv_vsoxseg8ei32_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i8m2_m(...) __riscv_vsoxseg2ei32_v_i8m2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_i8m2_m(...) __riscv_vsoxseg3ei32_v_i8m2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_i8m2_m(...) __riscv_vsoxseg4ei32_v_i8m2_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_i8mf8_m(...) __riscv_vsoxseg2ei64_v_i8mf8_m(__VA_ARGS__) // vsoxseg*ei64_v_i8*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsoxseg3ei64_v_i8mf8_m(...) __riscv_vsoxseg3ei64_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_i8mf8_m(...) __riscv_vsoxseg4ei64_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_i8mf8_m(...) __riscv_vsoxseg5ei64_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_i8mf8_m(...) __riscv_vsoxseg6ei64_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_i8mf8_m(...) __riscv_vsoxseg7ei64_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_i8mf8_m(...) __riscv_vsoxseg8ei64_v_i8mf8_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_i8mf4_m(...) __riscv_vsoxseg2ei64_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_i8mf4_m(...) __riscv_vsoxseg3ei64_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_i8mf4_m(...) __riscv_vsoxseg4ei64_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_i8mf4_m(...) __riscv_vsoxseg5ei64_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_i8mf4_m(...) __riscv_vsoxseg6ei64_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_i8mf4_m(...) __riscv_vsoxseg7ei64_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_i8mf4_m(...) __riscv_vsoxseg8ei64_v_i8mf4_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_i8mf2_m(...) __riscv_vsoxseg2ei64_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_i8mf2_m(...) __riscv_vsoxseg3ei64_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_i8mf2_m(...) __riscv_vsoxseg4ei64_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_i8mf2_m(...) __riscv_vsoxseg5ei64_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_i8mf2_m(...) __riscv_vsoxseg6ei64_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_i8mf2_m(...) __riscv_vsoxseg7ei64_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_i8mf2_m(...) __riscv_vsoxseg8ei64_v_i8mf2_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_i8m1_m(...) __riscv_vsoxseg2ei64_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_i8m1_m(...) __riscv_vsoxseg3ei64_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_i8m1_m(...) __riscv_vsoxseg4ei64_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_i8m1_m(...) __riscv_vsoxseg5ei64_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_i8m1_m(...) __riscv_vsoxseg6ei64_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_i8m1_m(...) __riscv_vsoxseg7ei64_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_i8m1_m(...) __riscv_vsoxseg8ei64_v_i8m1_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i16mf4_m(...) __riscv_vsoxseg2ei8_v_i16mf4_m(__VA_ARGS__) // vsoxseg*ei8_v_i16*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsoxseg3ei8_v_i16mf4_m(...) __riscv_vsoxseg3ei8_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_i16mf4_m(...) __riscv_vsoxseg4ei8_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_i16mf4_m(...) __riscv_vsoxseg5ei8_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_i16mf4_m(...) __riscv_vsoxseg6ei8_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_i16mf4_m(...) __riscv_vsoxseg7ei8_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_i16mf4_m(...) __riscv_vsoxseg8ei8_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i16mf2_m(...) __riscv_vsoxseg2ei8_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_i16mf2_m(...) __riscv_vsoxseg3ei8_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_i16mf2_m(...) __riscv_vsoxseg4ei8_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_i16mf2_m(...) __riscv_vsoxseg5ei8_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_i16mf2_m(...) __riscv_vsoxseg6ei8_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_i16mf2_m(...) __riscv_vsoxseg7ei8_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_i16mf2_m(...) __riscv_vsoxseg8ei8_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i16m1_m(...) __riscv_vsoxseg2ei8_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_i16m1_m(...) __riscv_vsoxseg3ei8_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_i16m1_m(...) __riscv_vsoxseg4ei8_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_i16m1_m(...) __riscv_vsoxseg5ei8_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_i16m1_m(...) __riscv_vsoxseg6ei8_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_i16m1_m(...) __riscv_vsoxseg7ei8_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_i16m1_m(...) __riscv_vsoxseg8ei8_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i16m2_m(...) __riscv_vsoxseg2ei8_v_i16m2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_i16m2_m(...) __riscv_vsoxseg3ei8_v_i16m2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_i16m2_m(...) __riscv_vsoxseg4ei8_v_i16m2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i16m4_m(...) __riscv_vsoxseg2ei8_v_i16m4_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i16mf4_m(...) __riscv_vsoxseg2ei16_v_i16mf4_m(__VA_ARGS__) // vsoxseg*ei16_v_i16*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsoxseg3ei16_v_i16mf4_m(...) __riscv_vsoxseg3ei16_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_i16mf4_m(...) __riscv_vsoxseg4ei16_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_i16mf4_m(...) __riscv_vsoxseg5ei16_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_i16mf4_m(...) __riscv_vsoxseg6ei16_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_i16mf4_m(...) __riscv_vsoxseg7ei16_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_i16mf4_m(...) __riscv_vsoxseg8ei16_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i16mf2_m(...) __riscv_vsoxseg2ei16_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_i16mf2_m(...) __riscv_vsoxseg3ei16_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_i16mf2_m(...) __riscv_vsoxseg4ei16_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_i16mf2_m(...) __riscv_vsoxseg5ei16_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_i16mf2_m(...) __riscv_vsoxseg6ei16_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_i16mf2_m(...) __riscv_vsoxseg7ei16_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_i16mf2_m(...) __riscv_vsoxseg8ei16_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i16m1_m(...) __riscv_vsoxseg2ei16_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_i16m1_m(...) __riscv_vsoxseg3ei16_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_i16m1_m(...) __riscv_vsoxseg4ei16_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_i16m1_m(...) __riscv_vsoxseg5ei16_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_i16m1_m(...) __riscv_vsoxseg6ei16_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_i16m1_m(...) __riscv_vsoxseg7ei16_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_i16m1_m(...) __riscv_vsoxseg8ei16_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i16m2_m(...) __riscv_vsoxseg2ei16_v_i16m2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_i16m2_m(...) __riscv_vsoxseg3ei16_v_i16m2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_i16m2_m(...) __riscv_vsoxseg4ei16_v_i16m2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i16m4_m(...) __riscv_vsoxseg2ei16_v_i16m4_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i16mf4_m(...) __riscv_vsoxseg2ei32_v_i16mf4_m(__VA_ARGS__) // vsoxseg*ei32_v_i16*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsoxseg3ei32_v_i16mf4_m(...) __riscv_vsoxseg3ei32_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_i16mf4_m(...) __riscv_vsoxseg4ei32_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_i16mf4_m(...) __riscv_vsoxseg5ei32_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_i16mf4_m(...) __riscv_vsoxseg6ei32_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_i16mf4_m(...) __riscv_vsoxseg7ei32_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_i16mf4_m(...) __riscv_vsoxseg8ei32_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i16mf2_m(...) __riscv_vsoxseg2ei32_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_i16mf2_m(...) __riscv_vsoxseg3ei32_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_i16mf2_m(...) __riscv_vsoxseg4ei32_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_i16mf2_m(...) __riscv_vsoxseg5ei32_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_i16mf2_m(...) __riscv_vsoxseg6ei32_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_i16mf2_m(...) __riscv_vsoxseg7ei32_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_i16mf2_m(...) __riscv_vsoxseg8ei32_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i16m1_m(...) __riscv_vsoxseg2ei32_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_i16m1_m(...) __riscv_vsoxseg3ei32_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_i16m1_m(...) __riscv_vsoxseg4ei32_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_i16m1_m(...) __riscv_vsoxseg5ei32_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_i16m1_m(...) __riscv_vsoxseg6ei32_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_i16m1_m(...) __riscv_vsoxseg7ei32_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_i16m1_m(...) __riscv_vsoxseg8ei32_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i16m2_m(...) __riscv_vsoxseg2ei32_v_i16m2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_i16m2_m(...) __riscv_vsoxseg3ei32_v_i16m2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_i16m2_m(...) __riscv_vsoxseg4ei32_v_i16m2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i16m4_m(...) __riscv_vsoxseg2ei32_v_i16m4_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_i16mf4_m(...) __riscv_vsoxseg2ei64_v_i16mf4_m(__VA_ARGS__) // vsoxseg*ei64_v_i16*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsoxseg3ei64_v_i16mf4_m(...) __riscv_vsoxseg3ei64_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_i16mf4_m(...) __riscv_vsoxseg4ei64_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_i16mf4_m(...) __riscv_vsoxseg5ei64_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_i16mf4_m(...) __riscv_vsoxseg6ei64_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_i16mf4_m(...) __riscv_vsoxseg7ei64_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_i16mf4_m(...) __riscv_vsoxseg8ei64_v_i16mf4_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_i16mf2_m(...) __riscv_vsoxseg2ei64_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_i16mf2_m(...) __riscv_vsoxseg3ei64_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_i16mf2_m(...) __riscv_vsoxseg4ei64_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_i16mf2_m(...) __riscv_vsoxseg5ei64_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_i16mf2_m(...) __riscv_vsoxseg6ei64_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_i16mf2_m(...) __riscv_vsoxseg7ei64_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_i16mf2_m(...) __riscv_vsoxseg8ei64_v_i16mf2_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_i16m1_m(...) __riscv_vsoxseg2ei64_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_i16m1_m(...) __riscv_vsoxseg3ei64_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_i16m1_m(...) __riscv_vsoxseg4ei64_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_i16m1_m(...) __riscv_vsoxseg5ei64_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_i16m1_m(...) __riscv_vsoxseg6ei64_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_i16m1_m(...) __riscv_vsoxseg7ei64_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_i16m1_m(...) __riscv_vsoxseg8ei64_v_i16m1_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_i16m2_m(...) __riscv_vsoxseg2ei64_v_i16m2_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_i16m2_m(...) __riscv_vsoxseg3ei64_v_i16m2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_i16m2_m(...) __riscv_vsoxseg4ei64_v_i16m2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i32mf2_m(...) __riscv_vsoxseg2ei8_v_i32mf2_m(__VA_ARGS__) // vsoxseg*ei8_v_i32*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsoxseg3ei8_v_i32mf2_m(...) __riscv_vsoxseg3ei8_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_i32mf2_m(...) __riscv_vsoxseg4ei8_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_i32mf2_m(...) __riscv_vsoxseg5ei8_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_i32mf2_m(...) __riscv_vsoxseg6ei8_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_i32mf2_m(...) __riscv_vsoxseg7ei8_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_i32mf2_m(...) __riscv_vsoxseg8ei8_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i32m1_m(...) __riscv_vsoxseg2ei8_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_i32m1_m(...) __riscv_vsoxseg3ei8_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_i32m1_m(...) __riscv_vsoxseg4ei8_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_i32m1_m(...) __riscv_vsoxseg5ei8_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_i32m1_m(...) __riscv_vsoxseg6ei8_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_i32m1_m(...) __riscv_vsoxseg7ei8_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_i32m1_m(...) __riscv_vsoxseg8ei8_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i32m2_m(...) __riscv_vsoxseg2ei8_v_i32m2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_i32m2_m(...) __riscv_vsoxseg3ei8_v_i32m2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_i32m2_m(...) __riscv_vsoxseg4ei8_v_i32m2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i32m4_m(...) __riscv_vsoxseg2ei8_v_i32m4_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i32mf2_m(...) __riscv_vsoxseg2ei16_v_i32mf2_m(__VA_ARGS__) // vsoxseg*ei16_v_i32*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsoxseg3ei16_v_i32mf2_m(...) __riscv_vsoxseg3ei16_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_i32mf2_m(...) __riscv_vsoxseg4ei16_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_i32mf2_m(...) __riscv_vsoxseg5ei16_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_i32mf2_m(...) __riscv_vsoxseg6ei16_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_i32mf2_m(...) __riscv_vsoxseg7ei16_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_i32mf2_m(...) __riscv_vsoxseg8ei16_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i32m1_m(...) __riscv_vsoxseg2ei16_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_i32m1_m(...) __riscv_vsoxseg3ei16_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_i32m1_m(...) __riscv_vsoxseg4ei16_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_i32m1_m(...) __riscv_vsoxseg5ei16_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_i32m1_m(...) __riscv_vsoxseg6ei16_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_i32m1_m(...) __riscv_vsoxseg7ei16_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_i32m1_m(...) __riscv_vsoxseg8ei16_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i32m2_m(...) __riscv_vsoxseg2ei16_v_i32m2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_i32m2_m(...) __riscv_vsoxseg3ei16_v_i32m2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_i32m2_m(...) __riscv_vsoxseg4ei16_v_i32m2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i32m4_m(...) __riscv_vsoxseg2ei16_v_i32m4_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i32mf2_m(...) __riscv_vsoxseg2ei32_v_i32mf2_m(__VA_ARGS__) // vsoxseg*ei32_v_i32*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsoxseg3ei32_v_i32mf2_m(...) __riscv_vsoxseg3ei32_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_i32mf2_m(...) __riscv_vsoxseg4ei32_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_i32mf2_m(...) __riscv_vsoxseg5ei32_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_i32mf2_m(...) __riscv_vsoxseg6ei32_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_i32mf2_m(...) __riscv_vsoxseg7ei32_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_i32mf2_m(...) __riscv_vsoxseg8ei32_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i32m1_m(...) __riscv_vsoxseg2ei32_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_i32m1_m(...) __riscv_vsoxseg3ei32_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_i32m1_m(...) __riscv_vsoxseg4ei32_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_i32m1_m(...) __riscv_vsoxseg5ei32_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_i32m1_m(...) __riscv_vsoxseg6ei32_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_i32m1_m(...) __riscv_vsoxseg7ei32_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_i32m1_m(...) __riscv_vsoxseg8ei32_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i32m2_m(...) __riscv_vsoxseg2ei32_v_i32m2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_i32m2_m(...) __riscv_vsoxseg3ei32_v_i32m2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_i32m2_m(...) __riscv_vsoxseg4ei32_v_i32m2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i32m4_m(...) __riscv_vsoxseg2ei32_v_i32m4_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_i32mf2_m(...) __riscv_vsoxseg2ei64_v_i32mf2_m(__VA_ARGS__) // vsoxseg*ei64_v_i32*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsoxseg3ei64_v_i32mf2_m(...) __riscv_vsoxseg3ei64_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_i32mf2_m(...) __riscv_vsoxseg4ei64_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_i32mf2_m(...) __riscv_vsoxseg5ei64_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_i32mf2_m(...) __riscv_vsoxseg6ei64_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_i32mf2_m(...) __riscv_vsoxseg7ei64_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_i32mf2_m(...) __riscv_vsoxseg8ei64_v_i32mf2_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_i32m1_m(...) __riscv_vsoxseg2ei64_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_i32m1_m(...) __riscv_vsoxseg3ei64_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_i32m1_m(...) __riscv_vsoxseg4ei64_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_i32m1_m(...) __riscv_vsoxseg5ei64_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_i32m1_m(...) __riscv_vsoxseg6ei64_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_i32m1_m(...) __riscv_vsoxseg7ei64_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_i32m1_m(...) __riscv_vsoxseg8ei64_v_i32m1_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_i32m2_m(...) __riscv_vsoxseg2ei64_v_i32m2_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_i32m2_m(...) __riscv_vsoxseg3ei64_v_i32m2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_i32m2_m(...) __riscv_vsoxseg4ei64_v_i32m2_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_i32m4_m(...) __riscv_vsoxseg2ei64_v_i32m4_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i64m1_m(...) __riscv_vsoxseg2ei8_v_i64m1_m(__VA_ARGS__) // vsoxseg*ei8_v_i64*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsoxseg3ei8_v_i64m1_m(...) __riscv_vsoxseg3ei8_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_i64m1_m(...) __riscv_vsoxseg4ei8_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_i64m1_m(...) __riscv_vsoxseg5ei8_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_i64m1_m(...) __riscv_vsoxseg6ei8_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_i64m1_m(...) __riscv_vsoxseg7ei8_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_i64m1_m(...) __riscv_vsoxseg8ei8_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i64m2_m(...) __riscv_vsoxseg2ei8_v_i64m2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_i64m2_m(...) __riscv_vsoxseg3ei8_v_i64m2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_i64m2_m(...) __riscv_vsoxseg4ei8_v_i64m2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_i64m4_m(...) __riscv_vsoxseg2ei8_v_i64m4_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i64m1_m(...) __riscv_vsoxseg2ei16_v_i64m1_m(__VA_ARGS__) // vsoxseg*ei16_v_i64*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsoxseg3ei16_v_i64m1_m(...) __riscv_vsoxseg3ei16_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_i64m1_m(...) __riscv_vsoxseg4ei16_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_i64m1_m(...) __riscv_vsoxseg5ei16_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_i64m1_m(...) __riscv_vsoxseg6ei16_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_i64m1_m(...) __riscv_vsoxseg7ei16_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_i64m1_m(...) __riscv_vsoxseg8ei16_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i64m2_m(...) __riscv_vsoxseg2ei16_v_i64m2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_i64m2_m(...) __riscv_vsoxseg3ei16_v_i64m2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_i64m2_m(...) __riscv_vsoxseg4ei16_v_i64m2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_i64m4_m(...) __riscv_vsoxseg2ei16_v_i64m4_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i64m1_m(...) __riscv_vsoxseg2ei32_v_i64m1_m(__VA_ARGS__) // vsoxseg*ei32_v_i64*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsoxseg3ei32_v_i64m1_m(...) __riscv_vsoxseg3ei32_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_i64m1_m(...) __riscv_vsoxseg4ei32_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_i64m1_m(...) __riscv_vsoxseg5ei32_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_i64m1_m(...) __riscv_vsoxseg6ei32_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_i64m1_m(...) __riscv_vsoxseg7ei32_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_i64m1_m(...) __riscv_vsoxseg8ei32_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i64m2_m(...) __riscv_vsoxseg2ei32_v_i64m2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_i64m2_m(...) __riscv_vsoxseg3ei32_v_i64m2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_i64m2_m(...) __riscv_vsoxseg4ei32_v_i64m2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_i64m4_m(...) __riscv_vsoxseg2ei32_v_i64m4_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_i64m1_m(...) __riscv_vsoxseg2ei64_v_i64m1_m(__VA_ARGS__) // vsoxseg*ei64_v_i64*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsoxseg3ei64_v_i64m1_m(...) __riscv_vsoxseg3ei64_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_i64m1_m(...) __riscv_vsoxseg4ei64_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_i64m1_m(...) __riscv_vsoxseg5ei64_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_i64m1_m(...) __riscv_vsoxseg6ei64_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_i64m1_m(...) __riscv_vsoxseg7ei64_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_i64m1_m(...) __riscv_vsoxseg8ei64_v_i64m1_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_i64m2_m(...) __riscv_vsoxseg2ei64_v_i64m2_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_i64m2_m(...) __riscv_vsoxseg3ei64_v_i64m2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_i64m2_m(...) __riscv_vsoxseg4ei64_v_i64m2_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_i64m4_m(...) __riscv_vsoxseg2ei64_v_i64m4_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i8mf8_m(...) __riscv_vsuxseg2ei8_v_i8mf8_m(__VA_ARGS__) // vsuxseg*ei8_v_i8*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsuxseg3ei8_v_i8mf8_m(...) __riscv_vsuxseg3ei8_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_i8mf8_m(...) __riscv_vsuxseg4ei8_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_i8mf8_m(...) __riscv_vsuxseg5ei8_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_i8mf8_m(...) __riscv_vsuxseg6ei8_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_i8mf8_m(...) __riscv_vsuxseg7ei8_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_i8mf8_m(...) __riscv_vsuxseg8ei8_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i8mf4_m(...) __riscv_vsuxseg2ei8_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_i8mf4_m(...) __riscv_vsuxseg3ei8_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_i8mf4_m(...) __riscv_vsuxseg4ei8_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_i8mf4_m(...) __riscv_vsuxseg5ei8_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_i8mf4_m(...) __riscv_vsuxseg6ei8_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_i8mf4_m(...) __riscv_vsuxseg7ei8_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_i8mf4_m(...) __riscv_vsuxseg8ei8_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i8mf2_m(...) __riscv_vsuxseg2ei8_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_i8mf2_m(...) __riscv_vsuxseg3ei8_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_i8mf2_m(...) __riscv_vsuxseg4ei8_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_i8mf2_m(...) __riscv_vsuxseg5ei8_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_i8mf2_m(...) __riscv_vsuxseg6ei8_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_i8mf2_m(...) __riscv_vsuxseg7ei8_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_i8mf2_m(...) __riscv_vsuxseg8ei8_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i8m1_m(...) __riscv_vsuxseg2ei8_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_i8m1_m(...) __riscv_vsuxseg3ei8_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_i8m1_m(...) __riscv_vsuxseg4ei8_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_i8m1_m(...) __riscv_vsuxseg5ei8_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_i8m1_m(...) __riscv_vsuxseg6ei8_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_i8m1_m(...) __riscv_vsuxseg7ei8_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_i8m1_m(...) __riscv_vsuxseg8ei8_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i8m2_m(...) __riscv_vsuxseg2ei8_v_i8m2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_i8m2_m(...) __riscv_vsuxseg3ei8_v_i8m2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_i8m2_m(...) __riscv_vsuxseg4ei8_v_i8m2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i8m4_m(...) __riscv_vsuxseg2ei8_v_i8m4_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i8mf8_m(...) __riscv_vsuxseg2ei16_v_i8mf8_m(__VA_ARGS__) // vsuxseg*ei16_v_i8*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsuxseg3ei16_v_i8mf8_m(...) __riscv_vsuxseg3ei16_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_i8mf8_m(...) __riscv_vsuxseg4ei16_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_i8mf8_m(...) __riscv_vsuxseg5ei16_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_i8mf8_m(...) __riscv_vsuxseg6ei16_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_i8mf8_m(...) __riscv_vsuxseg7ei16_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_i8mf8_m(...) __riscv_vsuxseg8ei16_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i8mf4_m(...) __riscv_vsuxseg2ei16_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_i8mf4_m(...) __riscv_vsuxseg3ei16_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_i8mf4_m(...) __riscv_vsuxseg4ei16_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_i8mf4_m(...) __riscv_vsuxseg5ei16_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_i8mf4_m(...) __riscv_vsuxseg6ei16_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_i8mf4_m(...) __riscv_vsuxseg7ei16_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_i8mf4_m(...) __riscv_vsuxseg8ei16_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i8mf2_m(...) __riscv_vsuxseg2ei16_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_i8mf2_m(...) __riscv_vsuxseg3ei16_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_i8mf2_m(...) __riscv_vsuxseg4ei16_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_i8mf2_m(...) __riscv_vsuxseg5ei16_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_i8mf2_m(...) __riscv_vsuxseg6ei16_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_i8mf2_m(...) __riscv_vsuxseg7ei16_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_i8mf2_m(...) __riscv_vsuxseg8ei16_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i8m1_m(...) __riscv_vsuxseg2ei16_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_i8m1_m(...) __riscv_vsuxseg3ei16_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_i8m1_m(...) __riscv_vsuxseg4ei16_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_i8m1_m(...) __riscv_vsuxseg5ei16_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_i8m1_m(...) __riscv_vsuxseg6ei16_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_i8m1_m(...) __riscv_vsuxseg7ei16_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_i8m1_m(...) __riscv_vsuxseg8ei16_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i8m2_m(...) __riscv_vsuxseg2ei16_v_i8m2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_i8m2_m(...) __riscv_vsuxseg3ei16_v_i8m2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_i8m2_m(...) __riscv_vsuxseg4ei16_v_i8m2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i8m4_m(...) __riscv_vsuxseg2ei16_v_i8m4_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i8mf8_m(...) __riscv_vsuxseg2ei32_v_i8mf8_m(__VA_ARGS__) // vsuxseg*ei32_v_i8*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsuxseg3ei32_v_i8mf8_m(...) __riscv_vsuxseg3ei32_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_i8mf8_m(...) __riscv_vsuxseg4ei32_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_i8mf8_m(...) __riscv_vsuxseg5ei32_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_i8mf8_m(...) __riscv_vsuxseg6ei32_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_i8mf8_m(...) __riscv_vsuxseg7ei32_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_i8mf8_m(...) __riscv_vsuxseg8ei32_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i8mf4_m(...) __riscv_vsuxseg2ei32_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_i8mf4_m(...) __riscv_vsuxseg3ei32_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_i8mf4_m(...) __riscv_vsuxseg4ei32_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_i8mf4_m(...) __riscv_vsuxseg5ei32_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_i8mf4_m(...) __riscv_vsuxseg6ei32_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_i8mf4_m(...) __riscv_vsuxseg7ei32_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_i8mf4_m(...) __riscv_vsuxseg8ei32_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i8mf2_m(...) __riscv_vsuxseg2ei32_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_i8mf2_m(...) __riscv_vsuxseg3ei32_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_i8mf2_m(...) __riscv_vsuxseg4ei32_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_i8mf2_m(...) __riscv_vsuxseg5ei32_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_i8mf2_m(...) __riscv_vsuxseg6ei32_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_i8mf2_m(...) __riscv_vsuxseg7ei32_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_i8mf2_m(...) __riscv_vsuxseg8ei32_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i8m1_m(...) __riscv_vsuxseg2ei32_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_i8m1_m(...) __riscv_vsuxseg3ei32_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_i8m1_m(...) __riscv_vsuxseg4ei32_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_i8m1_m(...) __riscv_vsuxseg5ei32_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_i8m1_m(...) __riscv_vsuxseg6ei32_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_i8m1_m(...) __riscv_vsuxseg7ei32_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_i8m1_m(...) __riscv_vsuxseg8ei32_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i8m2_m(...) __riscv_vsuxseg2ei32_v_i8m2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_i8m2_m(...) __riscv_vsuxseg3ei32_v_i8m2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_i8m2_m(...) __riscv_vsuxseg4ei32_v_i8m2_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_i8mf8_m(...) __riscv_vsuxseg2ei64_v_i8mf8_m(__VA_ARGS__) // vsuxseg*ei64_v_i8*_m aliases: each forwards its arguments unchanged to the __riscv_-prefixed name
+#define vsuxseg3ei64_v_i8mf8_m(...) __riscv_vsuxseg3ei64_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_i8mf8_m(...) __riscv_vsuxseg4ei64_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_i8mf8_m(...) __riscv_vsuxseg5ei64_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_i8mf8_m(...) __riscv_vsuxseg6ei64_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_i8mf8_m(...) __riscv_vsuxseg7ei64_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_i8mf8_m(...) __riscv_vsuxseg8ei64_v_i8mf8_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_i8mf4_m(...) __riscv_vsuxseg2ei64_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_i8mf4_m(...) __riscv_vsuxseg3ei64_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_i8mf4_m(...) __riscv_vsuxseg4ei64_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_i8mf4_m(...) __riscv_vsuxseg5ei64_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_i8mf4_m(...) __riscv_vsuxseg6ei64_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_i8mf4_m(...) __riscv_vsuxseg7ei64_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_i8mf4_m(...) __riscv_vsuxseg8ei64_v_i8mf4_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_i8mf2_m(...) __riscv_vsuxseg2ei64_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_i8mf2_m(...) __riscv_vsuxseg3ei64_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_i8mf2_m(...) __riscv_vsuxseg4ei64_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_i8mf2_m(...) __riscv_vsuxseg5ei64_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_i8mf2_m(...) __riscv_vsuxseg6ei64_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_i8mf2_m(...) __riscv_vsuxseg7ei64_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_i8mf2_m(...) __riscv_vsuxseg8ei64_v_i8mf2_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_i8m1_m(...) __riscv_vsuxseg2ei64_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_i8m1_m(...) __riscv_vsuxseg3ei64_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_i8m1_m(...) __riscv_vsuxseg4ei64_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_i8m1_m(...) __riscv_vsuxseg5ei64_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_i8m1_m(...) __riscv_vsuxseg6ei64_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_i8m1_m(...) __riscv_vsuxseg7ei64_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_i8m1_m(...) __riscv_vsuxseg8ei64_v_i8m1_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i16mf4_m(...) __riscv_vsuxseg2ei8_v_i16mf4_m(__VA_ARGS__) // i16 "_m" vsuxseg aliases (through vsuxseg4ei64_v_i16m2_m): each un-prefixed name forwards all arguments unchanged to its __riscv_-prefixed counterpart; presumably masked indexed segment stores - TODO confirm against the RVV intrinsics spec
+#define vsuxseg3ei8_v_i16mf4_m(...) __riscv_vsuxseg3ei8_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_i16mf4_m(...) __riscv_vsuxseg4ei8_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_i16mf4_m(...) __riscv_vsuxseg5ei8_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_i16mf4_m(...) __riscv_vsuxseg6ei8_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_i16mf4_m(...) __riscv_vsuxseg7ei8_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_i16mf4_m(...) __riscv_vsuxseg8ei8_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i16mf2_m(...) __riscv_vsuxseg2ei8_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_i16mf2_m(...) __riscv_vsuxseg3ei8_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_i16mf2_m(...) __riscv_vsuxseg4ei8_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_i16mf2_m(...) __riscv_vsuxseg5ei8_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_i16mf2_m(...) __riscv_vsuxseg6ei8_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_i16mf2_m(...) __riscv_vsuxseg7ei8_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_i16mf2_m(...) __riscv_vsuxseg8ei8_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i16m1_m(...) __riscv_vsuxseg2ei8_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_i16m1_m(...) __riscv_vsuxseg3ei8_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_i16m1_m(...) __riscv_vsuxseg4ei8_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_i16m1_m(...) __riscv_vsuxseg5ei8_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_i16m1_m(...) __riscv_vsuxseg6ei8_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_i16m1_m(...) __riscv_vsuxseg7ei8_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_i16m1_m(...) __riscv_vsuxseg8ei8_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i16m2_m(...) __riscv_vsuxseg2ei8_v_i16m2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_i16m2_m(...) __riscv_vsuxseg3ei8_v_i16m2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_i16m2_m(...) __riscv_vsuxseg4ei8_v_i16m2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i16m4_m(...) __riscv_vsuxseg2ei8_v_i16m4_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i16mf4_m(...) __riscv_vsuxseg2ei16_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_i16mf4_m(...) __riscv_vsuxseg3ei16_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_i16mf4_m(...) __riscv_vsuxseg4ei16_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_i16mf4_m(...) __riscv_vsuxseg5ei16_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_i16mf4_m(...) __riscv_vsuxseg6ei16_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_i16mf4_m(...) __riscv_vsuxseg7ei16_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_i16mf4_m(...) __riscv_vsuxseg8ei16_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i16mf2_m(...) __riscv_vsuxseg2ei16_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_i16mf2_m(...) __riscv_vsuxseg3ei16_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_i16mf2_m(...) __riscv_vsuxseg4ei16_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_i16mf2_m(...) __riscv_vsuxseg5ei16_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_i16mf2_m(...) __riscv_vsuxseg6ei16_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_i16mf2_m(...) __riscv_vsuxseg7ei16_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_i16mf2_m(...) __riscv_vsuxseg8ei16_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i16m1_m(...) __riscv_vsuxseg2ei16_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_i16m1_m(...) __riscv_vsuxseg3ei16_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_i16m1_m(...) __riscv_vsuxseg4ei16_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_i16m1_m(...) __riscv_vsuxseg5ei16_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_i16m1_m(...) __riscv_vsuxseg6ei16_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_i16m1_m(...) __riscv_vsuxseg7ei16_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_i16m1_m(...) __riscv_vsuxseg8ei16_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i16m2_m(...) __riscv_vsuxseg2ei16_v_i16m2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_i16m2_m(...) __riscv_vsuxseg3ei16_v_i16m2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_i16m2_m(...) __riscv_vsuxseg4ei16_v_i16m2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i16m4_m(...) __riscv_vsuxseg2ei16_v_i16m4_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i16mf4_m(...) __riscv_vsuxseg2ei32_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_i16mf4_m(...) __riscv_vsuxseg3ei32_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_i16mf4_m(...) __riscv_vsuxseg4ei32_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_i16mf4_m(...) __riscv_vsuxseg5ei32_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_i16mf4_m(...) __riscv_vsuxseg6ei32_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_i16mf4_m(...) __riscv_vsuxseg7ei32_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_i16mf4_m(...) __riscv_vsuxseg8ei32_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i16mf2_m(...) __riscv_vsuxseg2ei32_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_i16mf2_m(...) __riscv_vsuxseg3ei32_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_i16mf2_m(...) __riscv_vsuxseg4ei32_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_i16mf2_m(...) __riscv_vsuxseg5ei32_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_i16mf2_m(...) __riscv_vsuxseg6ei32_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_i16mf2_m(...) __riscv_vsuxseg7ei32_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_i16mf2_m(...) __riscv_vsuxseg8ei32_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i16m1_m(...) __riscv_vsuxseg2ei32_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_i16m1_m(...) __riscv_vsuxseg3ei32_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_i16m1_m(...) __riscv_vsuxseg4ei32_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_i16m1_m(...) __riscv_vsuxseg5ei32_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_i16m1_m(...) __riscv_vsuxseg6ei32_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_i16m1_m(...) __riscv_vsuxseg7ei32_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_i16m1_m(...) __riscv_vsuxseg8ei32_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i16m2_m(...) __riscv_vsuxseg2ei32_v_i16m2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_i16m2_m(...) __riscv_vsuxseg3ei32_v_i16m2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_i16m2_m(...) __riscv_vsuxseg4ei32_v_i16m2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i16m4_m(...) __riscv_vsuxseg2ei32_v_i16m4_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_i16mf4_m(...) __riscv_vsuxseg2ei64_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_i16mf4_m(...) __riscv_vsuxseg3ei64_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_i16mf4_m(...) __riscv_vsuxseg4ei64_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_i16mf4_m(...) __riscv_vsuxseg5ei64_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_i16mf4_m(...) __riscv_vsuxseg6ei64_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_i16mf4_m(...) __riscv_vsuxseg7ei64_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_i16mf4_m(...) __riscv_vsuxseg8ei64_v_i16mf4_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_i16mf2_m(...) __riscv_vsuxseg2ei64_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_i16mf2_m(...) __riscv_vsuxseg3ei64_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_i16mf2_m(...) __riscv_vsuxseg4ei64_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_i16mf2_m(...) __riscv_vsuxseg5ei64_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_i16mf2_m(...) __riscv_vsuxseg6ei64_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_i16mf2_m(...) __riscv_vsuxseg7ei64_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_i16mf2_m(...) __riscv_vsuxseg8ei64_v_i16mf2_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_i16m1_m(...) __riscv_vsuxseg2ei64_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_i16m1_m(...) __riscv_vsuxseg3ei64_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_i16m1_m(...) __riscv_vsuxseg4ei64_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_i16m1_m(...) __riscv_vsuxseg5ei64_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_i16m1_m(...) __riscv_vsuxseg6ei64_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_i16m1_m(...) __riscv_vsuxseg7ei64_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_i16m1_m(...) __riscv_vsuxseg8ei64_v_i16m1_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_i16m2_m(...) __riscv_vsuxseg2ei64_v_i16m2_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_i16m2_m(...) __riscv_vsuxseg3ei64_v_i16m2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_i16m2_m(...) __riscv_vsuxseg4ei64_v_i16m2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i32mf2_m(...) __riscv_vsuxseg2ei8_v_i32mf2_m(__VA_ARGS__) // i32 "_m" vsuxseg aliases (through vsuxseg2ei64_v_i32m4_m): each un-prefixed name forwards all arguments unchanged to its __riscv_-prefixed counterpart; presumably masked indexed segment stores - TODO confirm against the RVV intrinsics spec
+#define vsuxseg3ei8_v_i32mf2_m(...) __riscv_vsuxseg3ei8_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_i32mf2_m(...) __riscv_vsuxseg4ei8_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_i32mf2_m(...) __riscv_vsuxseg5ei8_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_i32mf2_m(...) __riscv_vsuxseg6ei8_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_i32mf2_m(...) __riscv_vsuxseg7ei8_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_i32mf2_m(...) __riscv_vsuxseg8ei8_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i32m1_m(...) __riscv_vsuxseg2ei8_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_i32m1_m(...) __riscv_vsuxseg3ei8_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_i32m1_m(...) __riscv_vsuxseg4ei8_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_i32m1_m(...) __riscv_vsuxseg5ei8_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_i32m1_m(...) __riscv_vsuxseg6ei8_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_i32m1_m(...) __riscv_vsuxseg7ei8_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_i32m1_m(...) __riscv_vsuxseg8ei8_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i32m2_m(...) __riscv_vsuxseg2ei8_v_i32m2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_i32m2_m(...) __riscv_vsuxseg3ei8_v_i32m2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_i32m2_m(...) __riscv_vsuxseg4ei8_v_i32m2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i32m4_m(...) __riscv_vsuxseg2ei8_v_i32m4_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i32mf2_m(...) __riscv_vsuxseg2ei16_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_i32mf2_m(...) __riscv_vsuxseg3ei16_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_i32mf2_m(...) __riscv_vsuxseg4ei16_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_i32mf2_m(...) __riscv_vsuxseg5ei16_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_i32mf2_m(...) __riscv_vsuxseg6ei16_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_i32mf2_m(...) __riscv_vsuxseg7ei16_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_i32mf2_m(...) __riscv_vsuxseg8ei16_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i32m1_m(...) __riscv_vsuxseg2ei16_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_i32m1_m(...) __riscv_vsuxseg3ei16_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_i32m1_m(...) __riscv_vsuxseg4ei16_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_i32m1_m(...) __riscv_vsuxseg5ei16_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_i32m1_m(...) __riscv_vsuxseg6ei16_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_i32m1_m(...) __riscv_vsuxseg7ei16_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_i32m1_m(...) __riscv_vsuxseg8ei16_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i32m2_m(...) __riscv_vsuxseg2ei16_v_i32m2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_i32m2_m(...) __riscv_vsuxseg3ei16_v_i32m2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_i32m2_m(...) __riscv_vsuxseg4ei16_v_i32m2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i32m4_m(...) __riscv_vsuxseg2ei16_v_i32m4_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i32mf2_m(...) __riscv_vsuxseg2ei32_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_i32mf2_m(...) __riscv_vsuxseg3ei32_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_i32mf2_m(...) __riscv_vsuxseg4ei32_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_i32mf2_m(...) __riscv_vsuxseg5ei32_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_i32mf2_m(...) __riscv_vsuxseg6ei32_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_i32mf2_m(...) __riscv_vsuxseg7ei32_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_i32mf2_m(...) __riscv_vsuxseg8ei32_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i32m1_m(...) __riscv_vsuxseg2ei32_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_i32m1_m(...) __riscv_vsuxseg3ei32_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_i32m1_m(...) __riscv_vsuxseg4ei32_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_i32m1_m(...) __riscv_vsuxseg5ei32_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_i32m1_m(...) __riscv_vsuxseg6ei32_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_i32m1_m(...) __riscv_vsuxseg7ei32_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_i32m1_m(...) __riscv_vsuxseg8ei32_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i32m2_m(...) __riscv_vsuxseg2ei32_v_i32m2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_i32m2_m(...) __riscv_vsuxseg3ei32_v_i32m2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_i32m2_m(...) __riscv_vsuxseg4ei32_v_i32m2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i32m4_m(...) __riscv_vsuxseg2ei32_v_i32m4_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_i32mf2_m(...) __riscv_vsuxseg2ei64_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_i32mf2_m(...) __riscv_vsuxseg3ei64_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_i32mf2_m(...) __riscv_vsuxseg4ei64_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_i32mf2_m(...) __riscv_vsuxseg5ei64_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_i32mf2_m(...) __riscv_vsuxseg6ei64_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_i32mf2_m(...) __riscv_vsuxseg7ei64_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_i32mf2_m(...) __riscv_vsuxseg8ei64_v_i32mf2_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_i32m1_m(...) __riscv_vsuxseg2ei64_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_i32m1_m(...) __riscv_vsuxseg3ei64_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_i32m1_m(...) __riscv_vsuxseg4ei64_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_i32m1_m(...) __riscv_vsuxseg5ei64_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_i32m1_m(...) __riscv_vsuxseg6ei64_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_i32m1_m(...) __riscv_vsuxseg7ei64_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_i32m1_m(...) __riscv_vsuxseg8ei64_v_i32m1_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_i32m2_m(...) __riscv_vsuxseg2ei64_v_i32m2_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_i32m2_m(...) __riscv_vsuxseg3ei64_v_i32m2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_i32m2_m(...) __riscv_vsuxseg4ei64_v_i32m2_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_i32m4_m(...) __riscv_vsuxseg2ei64_v_i32m4_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i64m1_m(...) __riscv_vsuxseg2ei8_v_i64m1_m(__VA_ARGS__) // i64 "_m" vsuxseg aliases (through vsuxseg2ei64_v_i64m4_m): each un-prefixed name forwards all arguments unchanged to its __riscv_-prefixed counterpart; presumably masked indexed segment stores - TODO confirm against the RVV intrinsics spec
+#define vsuxseg3ei8_v_i64m1_m(...) __riscv_vsuxseg3ei8_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_i64m1_m(...) __riscv_vsuxseg4ei8_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_i64m1_m(...) __riscv_vsuxseg5ei8_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_i64m1_m(...) __riscv_vsuxseg6ei8_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_i64m1_m(...) __riscv_vsuxseg7ei8_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_i64m1_m(...) __riscv_vsuxseg8ei8_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i64m2_m(...) __riscv_vsuxseg2ei8_v_i64m2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_i64m2_m(...) __riscv_vsuxseg3ei8_v_i64m2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_i64m2_m(...) __riscv_vsuxseg4ei8_v_i64m2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_i64m4_m(...) __riscv_vsuxseg2ei8_v_i64m4_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i64m1_m(...) __riscv_vsuxseg2ei16_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_i64m1_m(...) __riscv_vsuxseg3ei16_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_i64m1_m(...) __riscv_vsuxseg4ei16_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_i64m1_m(...) __riscv_vsuxseg5ei16_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_i64m1_m(...) __riscv_vsuxseg6ei16_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_i64m1_m(...) __riscv_vsuxseg7ei16_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_i64m1_m(...) __riscv_vsuxseg8ei16_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i64m2_m(...) __riscv_vsuxseg2ei16_v_i64m2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_i64m2_m(...) __riscv_vsuxseg3ei16_v_i64m2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_i64m2_m(...) __riscv_vsuxseg4ei16_v_i64m2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_i64m4_m(...) __riscv_vsuxseg2ei16_v_i64m4_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i64m1_m(...) __riscv_vsuxseg2ei32_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_i64m1_m(...) __riscv_vsuxseg3ei32_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_i64m1_m(...) __riscv_vsuxseg4ei32_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_i64m1_m(...) __riscv_vsuxseg5ei32_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_i64m1_m(...) __riscv_vsuxseg6ei32_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_i64m1_m(...) __riscv_vsuxseg7ei32_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_i64m1_m(...) __riscv_vsuxseg8ei32_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i64m2_m(...) __riscv_vsuxseg2ei32_v_i64m2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_i64m2_m(...) __riscv_vsuxseg3ei32_v_i64m2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_i64m2_m(...) __riscv_vsuxseg4ei32_v_i64m2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_i64m4_m(...) __riscv_vsuxseg2ei32_v_i64m4_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_i64m1_m(...) __riscv_vsuxseg2ei64_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_i64m1_m(...) __riscv_vsuxseg3ei64_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_i64m1_m(...) __riscv_vsuxseg4ei64_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_i64m1_m(...) __riscv_vsuxseg5ei64_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_i64m1_m(...) __riscv_vsuxseg6ei64_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_i64m1_m(...) __riscv_vsuxseg7ei64_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_i64m1_m(...) __riscv_vsuxseg8ei64_v_i64m1_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_i64m2_m(...) __riscv_vsuxseg2ei64_v_i64m2_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_i64m2_m(...) __riscv_vsuxseg3ei64_v_i64m2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_i64m2_m(...) __riscv_vsuxseg4ei64_v_i64m2_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_i64m4_m(...) __riscv_vsuxseg2ei64_v_i64m4_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u8mf8_m(...) __riscv_vsoxseg2ei8_v_u8mf8_m(__VA_ARGS__) // u8 "_m" vsoxseg aliases (through vsoxseg8ei64_v_u8m1_m): each un-prefixed name forwards all arguments unchanged to its __riscv_-prefixed counterpart; presumably masked indexed segment stores - TODO confirm against the RVV intrinsics spec
+#define vsoxseg3ei8_v_u8mf8_m(...) __riscv_vsoxseg3ei8_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_u8mf8_m(...) __riscv_vsoxseg4ei8_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_u8mf8_m(...) __riscv_vsoxseg5ei8_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_u8mf8_m(...) __riscv_vsoxseg6ei8_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_u8mf8_m(...) __riscv_vsoxseg7ei8_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_u8mf8_m(...) __riscv_vsoxseg8ei8_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u8mf4_m(...) __riscv_vsoxseg2ei8_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_u8mf4_m(...) __riscv_vsoxseg3ei8_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_u8mf4_m(...) __riscv_vsoxseg4ei8_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_u8mf4_m(...) __riscv_vsoxseg5ei8_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_u8mf4_m(...) __riscv_vsoxseg6ei8_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_u8mf4_m(...) __riscv_vsoxseg7ei8_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_u8mf4_m(...) __riscv_vsoxseg8ei8_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u8mf2_m(...) __riscv_vsoxseg2ei8_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_u8mf2_m(...) __riscv_vsoxseg3ei8_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_u8mf2_m(...) __riscv_vsoxseg4ei8_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_u8mf2_m(...) __riscv_vsoxseg5ei8_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_u8mf2_m(...) __riscv_vsoxseg6ei8_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_u8mf2_m(...) __riscv_vsoxseg7ei8_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_u8mf2_m(...) __riscv_vsoxseg8ei8_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u8m1_m(...) __riscv_vsoxseg2ei8_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_u8m1_m(...) __riscv_vsoxseg3ei8_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_u8m1_m(...) __riscv_vsoxseg4ei8_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_u8m1_m(...) __riscv_vsoxseg5ei8_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_u8m1_m(...) __riscv_vsoxseg6ei8_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_u8m1_m(...) __riscv_vsoxseg7ei8_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_u8m1_m(...) __riscv_vsoxseg8ei8_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u8m2_m(...) __riscv_vsoxseg2ei8_v_u8m2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_u8m2_m(...) __riscv_vsoxseg3ei8_v_u8m2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_u8m2_m(...) __riscv_vsoxseg4ei8_v_u8m2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u8m4_m(...) __riscv_vsoxseg2ei8_v_u8m4_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u8mf8_m(...) __riscv_vsoxseg2ei16_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_u8mf8_m(...) __riscv_vsoxseg3ei16_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_u8mf8_m(...) __riscv_vsoxseg4ei16_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_u8mf8_m(...) __riscv_vsoxseg5ei16_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_u8mf8_m(...) __riscv_vsoxseg6ei16_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_u8mf8_m(...) __riscv_vsoxseg7ei16_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_u8mf8_m(...) __riscv_vsoxseg8ei16_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u8mf4_m(...) __riscv_vsoxseg2ei16_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_u8mf4_m(...) __riscv_vsoxseg3ei16_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_u8mf4_m(...) __riscv_vsoxseg4ei16_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_u8mf4_m(...) __riscv_vsoxseg5ei16_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_u8mf4_m(...) __riscv_vsoxseg6ei16_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_u8mf4_m(...) __riscv_vsoxseg7ei16_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_u8mf4_m(...) __riscv_vsoxseg8ei16_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u8mf2_m(...) __riscv_vsoxseg2ei16_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_u8mf2_m(...) __riscv_vsoxseg3ei16_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_u8mf2_m(...) __riscv_vsoxseg4ei16_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_u8mf2_m(...) __riscv_vsoxseg5ei16_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_u8mf2_m(...) __riscv_vsoxseg6ei16_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_u8mf2_m(...) __riscv_vsoxseg7ei16_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_u8mf2_m(...) __riscv_vsoxseg8ei16_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u8m1_m(...) __riscv_vsoxseg2ei16_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_u8m1_m(...) __riscv_vsoxseg3ei16_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_u8m1_m(...) __riscv_vsoxseg4ei16_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_u8m1_m(...) __riscv_vsoxseg5ei16_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_u8m1_m(...) __riscv_vsoxseg6ei16_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_u8m1_m(...) __riscv_vsoxseg7ei16_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_u8m1_m(...) __riscv_vsoxseg8ei16_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u8m2_m(...) __riscv_vsoxseg2ei16_v_u8m2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_u8m2_m(...) __riscv_vsoxseg3ei16_v_u8m2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_u8m2_m(...) __riscv_vsoxseg4ei16_v_u8m2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u8m4_m(...) __riscv_vsoxseg2ei16_v_u8m4_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u8mf8_m(...) __riscv_vsoxseg2ei32_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_u8mf8_m(...) __riscv_vsoxseg3ei32_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_u8mf8_m(...) __riscv_vsoxseg4ei32_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_u8mf8_m(...) __riscv_vsoxseg5ei32_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_u8mf8_m(...) __riscv_vsoxseg6ei32_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_u8mf8_m(...) __riscv_vsoxseg7ei32_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_u8mf8_m(...) __riscv_vsoxseg8ei32_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u8mf4_m(...) __riscv_vsoxseg2ei32_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_u8mf4_m(...) __riscv_vsoxseg3ei32_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_u8mf4_m(...) __riscv_vsoxseg4ei32_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_u8mf4_m(...) __riscv_vsoxseg5ei32_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_u8mf4_m(...) __riscv_vsoxseg6ei32_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_u8mf4_m(...) __riscv_vsoxseg7ei32_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_u8mf4_m(...) __riscv_vsoxseg8ei32_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u8mf2_m(...) __riscv_vsoxseg2ei32_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_u8mf2_m(...) __riscv_vsoxseg3ei32_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_u8mf2_m(...) __riscv_vsoxseg4ei32_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_u8mf2_m(...) __riscv_vsoxseg5ei32_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_u8mf2_m(...) __riscv_vsoxseg6ei32_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_u8mf2_m(...) __riscv_vsoxseg7ei32_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_u8mf2_m(...) __riscv_vsoxseg8ei32_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u8m1_m(...) __riscv_vsoxseg2ei32_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_u8m1_m(...) __riscv_vsoxseg3ei32_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_u8m1_m(...) __riscv_vsoxseg4ei32_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_u8m1_m(...) __riscv_vsoxseg5ei32_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_u8m1_m(...) __riscv_vsoxseg6ei32_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_u8m1_m(...) __riscv_vsoxseg7ei32_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_u8m1_m(...) __riscv_vsoxseg8ei32_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u8m2_m(...) __riscv_vsoxseg2ei32_v_u8m2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_u8m2_m(...) __riscv_vsoxseg3ei32_v_u8m2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_u8m2_m(...) __riscv_vsoxseg4ei32_v_u8m2_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_u8mf8_m(...) __riscv_vsoxseg2ei64_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_u8mf8_m(...) __riscv_vsoxseg3ei64_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_u8mf8_m(...) __riscv_vsoxseg4ei64_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_u8mf8_m(...) __riscv_vsoxseg5ei64_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_u8mf8_m(...) __riscv_vsoxseg6ei64_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_u8mf8_m(...) __riscv_vsoxseg7ei64_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_u8mf8_m(...) __riscv_vsoxseg8ei64_v_u8mf8_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_u8mf4_m(...) __riscv_vsoxseg2ei64_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_u8mf4_m(...) __riscv_vsoxseg3ei64_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_u8mf4_m(...) __riscv_vsoxseg4ei64_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_u8mf4_m(...) __riscv_vsoxseg5ei64_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_u8mf4_m(...) __riscv_vsoxseg6ei64_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_u8mf4_m(...) __riscv_vsoxseg7ei64_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_u8mf4_m(...) __riscv_vsoxseg8ei64_v_u8mf4_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_u8mf2_m(...) __riscv_vsoxseg2ei64_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_u8mf2_m(...) __riscv_vsoxseg3ei64_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_u8mf2_m(...) __riscv_vsoxseg4ei64_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_u8mf2_m(...) __riscv_vsoxseg5ei64_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_u8mf2_m(...) __riscv_vsoxseg6ei64_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_u8mf2_m(...) __riscv_vsoxseg7ei64_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_u8mf2_m(...) __riscv_vsoxseg8ei64_v_u8mf2_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_u8m1_m(...) __riscv_vsoxseg2ei64_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_u8m1_m(...) __riscv_vsoxseg3ei64_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_u8m1_m(...) __riscv_vsoxseg4ei64_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_u8m1_m(...) __riscv_vsoxseg5ei64_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_u8m1_m(...) __riscv_vsoxseg6ei64_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_u8m1_m(...) __riscv_vsoxseg7ei64_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_u8m1_m(...) __riscv_vsoxseg8ei64_v_u8m1_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u16mf4_m(...) __riscv_vsoxseg2ei8_v_u16mf4_m(__VA_ARGS__) // u16 "_m" vsoxseg aliases begin here: each un-prefixed name forwards all arguments unchanged to its __riscv_-prefixed counterpart; presumably masked indexed segment stores - TODO confirm against the RVV intrinsics spec
+#define vsoxseg3ei8_v_u16mf4_m(...) __riscv_vsoxseg3ei8_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_u16mf4_m(...) __riscv_vsoxseg4ei8_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_u16mf4_m(...) __riscv_vsoxseg5ei8_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_u16mf4_m(...) __riscv_vsoxseg6ei8_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_u16mf4_m(...) __riscv_vsoxseg7ei8_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_u16mf4_m(...) __riscv_vsoxseg8ei8_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u16mf2_m(...) __riscv_vsoxseg2ei8_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_u16mf2_m(...) __riscv_vsoxseg3ei8_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_u16mf2_m(...) __riscv_vsoxseg4ei8_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_u16mf2_m(...) __riscv_vsoxseg5ei8_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_u16mf2_m(...) __riscv_vsoxseg6ei8_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_u16mf2_m(...) __riscv_vsoxseg7ei8_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_u16mf2_m(...) __riscv_vsoxseg8ei8_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u16m1_m(...) __riscv_vsoxseg2ei8_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_u16m1_m(...) __riscv_vsoxseg3ei8_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_u16m1_m(...) __riscv_vsoxseg4ei8_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_u16m1_m(...) __riscv_vsoxseg5ei8_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_u16m1_m(...) __riscv_vsoxseg6ei8_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_u16m1_m(...) __riscv_vsoxseg7ei8_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_u16m1_m(...) __riscv_vsoxseg8ei8_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u16m2_m(...) __riscv_vsoxseg2ei8_v_u16m2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_u16m2_m(...) __riscv_vsoxseg3ei8_v_u16m2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_u16m2_m(...) __riscv_vsoxseg4ei8_v_u16m2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u16m4_m(...) __riscv_vsoxseg2ei8_v_u16m4_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u16mf4_m(...) __riscv_vsoxseg2ei16_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_u16mf4_m(...) __riscv_vsoxseg3ei16_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_u16mf4_m(...) __riscv_vsoxseg4ei16_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_u16mf4_m(...) __riscv_vsoxseg5ei16_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_u16mf4_m(...) __riscv_vsoxseg6ei16_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_u16mf4_m(...) __riscv_vsoxseg7ei16_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_u16mf4_m(...) __riscv_vsoxseg8ei16_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u16mf2_m(...) __riscv_vsoxseg2ei16_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_u16mf2_m(...) __riscv_vsoxseg3ei16_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_u16mf2_m(...) __riscv_vsoxseg4ei16_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_u16mf2_m(...) __riscv_vsoxseg5ei16_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_u16mf2_m(...) __riscv_vsoxseg6ei16_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_u16mf2_m(...) __riscv_vsoxseg7ei16_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_u16mf2_m(...) __riscv_vsoxseg8ei16_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u16m1_m(...) __riscv_vsoxseg2ei16_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_u16m1_m(...) __riscv_vsoxseg3ei16_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_u16m1_m(...) __riscv_vsoxseg4ei16_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_u16m1_m(...) __riscv_vsoxseg5ei16_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_u16m1_m(...) __riscv_vsoxseg6ei16_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_u16m1_m(...) __riscv_vsoxseg7ei16_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_u16m1_m(...) __riscv_vsoxseg8ei16_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u16m2_m(...) __riscv_vsoxseg2ei16_v_u16m2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_u16m2_m(...) __riscv_vsoxseg3ei16_v_u16m2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_u16m2_m(...) __riscv_vsoxseg4ei16_v_u16m2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u16m4_m(...) __riscv_vsoxseg2ei16_v_u16m4_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u16mf4_m(...) __riscv_vsoxseg2ei32_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_u16mf4_m(...) __riscv_vsoxseg3ei32_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_u16mf4_m(...) __riscv_vsoxseg4ei32_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_u16mf4_m(...) __riscv_vsoxseg5ei32_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_u16mf4_m(...) __riscv_vsoxseg6ei32_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_u16mf4_m(...) __riscv_vsoxseg7ei32_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_u16mf4_m(...) __riscv_vsoxseg8ei32_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u16mf2_m(...) __riscv_vsoxseg2ei32_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_u16mf2_m(...) __riscv_vsoxseg3ei32_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_u16mf2_m(...) __riscv_vsoxseg4ei32_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_u16mf2_m(...) __riscv_vsoxseg5ei32_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_u16mf2_m(...) __riscv_vsoxseg6ei32_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_u16mf2_m(...) __riscv_vsoxseg7ei32_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_u16mf2_m(...) __riscv_vsoxseg8ei32_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u16m1_m(...) __riscv_vsoxseg2ei32_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_u16m1_m(...) __riscv_vsoxseg3ei32_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_u16m1_m(...) __riscv_vsoxseg4ei32_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_u16m1_m(...) __riscv_vsoxseg5ei32_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_u16m1_m(...) __riscv_vsoxseg6ei32_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_u16m1_m(...) __riscv_vsoxseg7ei32_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_u16m1_m(...) __riscv_vsoxseg8ei32_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u16m2_m(...) __riscv_vsoxseg2ei32_v_u16m2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_u16m2_m(...) __riscv_vsoxseg3ei32_v_u16m2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_u16m2_m(...) __riscv_vsoxseg4ei32_v_u16m2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u16m4_m(...) __riscv_vsoxseg2ei32_v_u16m4_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_u16mf4_m(...) __riscv_vsoxseg2ei64_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_u16mf4_m(...) __riscv_vsoxseg3ei64_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_u16mf4_m(...) __riscv_vsoxseg4ei64_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_u16mf4_m(...) __riscv_vsoxseg5ei64_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_u16mf4_m(...) __riscv_vsoxseg6ei64_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_u16mf4_m(...) __riscv_vsoxseg7ei64_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_u16mf4_m(...) __riscv_vsoxseg8ei64_v_u16mf4_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_u16mf2_m(...) __riscv_vsoxseg2ei64_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_u16mf2_m(...) __riscv_vsoxseg3ei64_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_u16mf2_m(...) __riscv_vsoxseg4ei64_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_u16mf2_m(...) __riscv_vsoxseg5ei64_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_u16mf2_m(...) __riscv_vsoxseg6ei64_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_u16mf2_m(...) __riscv_vsoxseg7ei64_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_u16mf2_m(...) __riscv_vsoxseg8ei64_v_u16mf2_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_u16m1_m(...) __riscv_vsoxseg2ei64_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_u16m1_m(...) __riscv_vsoxseg3ei64_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_u16m1_m(...) __riscv_vsoxseg4ei64_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_u16m1_m(...) __riscv_vsoxseg5ei64_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_u16m1_m(...) __riscv_vsoxseg6ei64_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_u16m1_m(...) __riscv_vsoxseg7ei64_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_u16m1_m(...) __riscv_vsoxseg8ei64_v_u16m1_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_u16m2_m(...) __riscv_vsoxseg2ei64_v_u16m2_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_u16m2_m(...) __riscv_vsoxseg3ei64_v_u16m2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_u16m2_m(...) __riscv_vsoxseg4ei64_v_u16m2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u32mf2_m(...) __riscv_vsoxseg2ei8_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_u32mf2_m(...) __riscv_vsoxseg3ei8_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_u32mf2_m(...) __riscv_vsoxseg4ei8_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_u32mf2_m(...) __riscv_vsoxseg5ei8_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_u32mf2_m(...) __riscv_vsoxseg6ei8_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_u32mf2_m(...) __riscv_vsoxseg7ei8_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_u32mf2_m(...) __riscv_vsoxseg8ei8_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u32m1_m(...) __riscv_vsoxseg2ei8_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_u32m1_m(...) __riscv_vsoxseg3ei8_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_u32m1_m(...) __riscv_vsoxseg4ei8_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_u32m1_m(...) __riscv_vsoxseg5ei8_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_u32m1_m(...) __riscv_vsoxseg6ei8_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_u32m1_m(...) __riscv_vsoxseg7ei8_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_u32m1_m(...) __riscv_vsoxseg8ei8_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u32m2_m(...) __riscv_vsoxseg2ei8_v_u32m2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_u32m2_m(...) __riscv_vsoxseg3ei8_v_u32m2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_u32m2_m(...) __riscv_vsoxseg4ei8_v_u32m2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u32m4_m(...) __riscv_vsoxseg2ei8_v_u32m4_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u32mf2_m(...) __riscv_vsoxseg2ei16_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_u32mf2_m(...) __riscv_vsoxseg3ei16_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_u32mf2_m(...) __riscv_vsoxseg4ei16_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_u32mf2_m(...) __riscv_vsoxseg5ei16_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_u32mf2_m(...) __riscv_vsoxseg6ei16_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_u32mf2_m(...) __riscv_vsoxseg7ei16_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_u32mf2_m(...) __riscv_vsoxseg8ei16_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u32m1_m(...) __riscv_vsoxseg2ei16_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_u32m1_m(...) __riscv_vsoxseg3ei16_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_u32m1_m(...) __riscv_vsoxseg4ei16_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_u32m1_m(...) __riscv_vsoxseg5ei16_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_u32m1_m(...) __riscv_vsoxseg6ei16_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_u32m1_m(...) __riscv_vsoxseg7ei16_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_u32m1_m(...) __riscv_vsoxseg8ei16_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u32m2_m(...) __riscv_vsoxseg2ei16_v_u32m2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_u32m2_m(...) __riscv_vsoxseg3ei16_v_u32m2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_u32m2_m(...) __riscv_vsoxseg4ei16_v_u32m2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u32m4_m(...) __riscv_vsoxseg2ei16_v_u32m4_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u32mf2_m(...) __riscv_vsoxseg2ei32_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_u32mf2_m(...) __riscv_vsoxseg3ei32_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_u32mf2_m(...) __riscv_vsoxseg4ei32_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_u32mf2_m(...) __riscv_vsoxseg5ei32_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_u32mf2_m(...) __riscv_vsoxseg6ei32_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_u32mf2_m(...) __riscv_vsoxseg7ei32_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_u32mf2_m(...) __riscv_vsoxseg8ei32_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u32m1_m(...) __riscv_vsoxseg2ei32_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_u32m1_m(...) __riscv_vsoxseg3ei32_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_u32m1_m(...) __riscv_vsoxseg4ei32_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_u32m1_m(...) __riscv_vsoxseg5ei32_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_u32m1_m(...) __riscv_vsoxseg6ei32_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_u32m1_m(...) __riscv_vsoxseg7ei32_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_u32m1_m(...) __riscv_vsoxseg8ei32_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u32m2_m(...) __riscv_vsoxseg2ei32_v_u32m2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_u32m2_m(...) __riscv_vsoxseg3ei32_v_u32m2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_u32m2_m(...) __riscv_vsoxseg4ei32_v_u32m2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u32m4_m(...) __riscv_vsoxseg2ei32_v_u32m4_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_u32mf2_m(...) __riscv_vsoxseg2ei64_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_u32mf2_m(...) __riscv_vsoxseg3ei64_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_u32mf2_m(...) __riscv_vsoxseg4ei64_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_u32mf2_m(...) __riscv_vsoxseg5ei64_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_u32mf2_m(...) __riscv_vsoxseg6ei64_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_u32mf2_m(...) __riscv_vsoxseg7ei64_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_u32mf2_m(...) __riscv_vsoxseg8ei64_v_u32mf2_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_u32m1_m(...) __riscv_vsoxseg2ei64_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_u32m1_m(...) __riscv_vsoxseg3ei64_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_u32m1_m(...) __riscv_vsoxseg4ei64_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_u32m1_m(...) __riscv_vsoxseg5ei64_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_u32m1_m(...) __riscv_vsoxseg6ei64_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_u32m1_m(...) __riscv_vsoxseg7ei64_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_u32m1_m(...) __riscv_vsoxseg8ei64_v_u32m1_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_u32m2_m(...) __riscv_vsoxseg2ei64_v_u32m2_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_u32m2_m(...) __riscv_vsoxseg3ei64_v_u32m2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_u32m2_m(...) __riscv_vsoxseg4ei64_v_u32m2_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_u32m4_m(...) __riscv_vsoxseg2ei64_v_u32m4_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u64m1_m(...) __riscv_vsoxseg2ei8_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_u64m1_m(...) __riscv_vsoxseg3ei8_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_u64m1_m(...) __riscv_vsoxseg4ei8_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg5ei8_v_u64m1_m(...) __riscv_vsoxseg5ei8_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg6ei8_v_u64m1_m(...) __riscv_vsoxseg6ei8_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg7ei8_v_u64m1_m(...) __riscv_vsoxseg7ei8_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg8ei8_v_u64m1_m(...) __riscv_vsoxseg8ei8_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u64m2_m(...) __riscv_vsoxseg2ei8_v_u64m2_m(__VA_ARGS__)
+#define vsoxseg3ei8_v_u64m2_m(...) __riscv_vsoxseg3ei8_v_u64m2_m(__VA_ARGS__)
+#define vsoxseg4ei8_v_u64m2_m(...) __riscv_vsoxseg4ei8_v_u64m2_m(__VA_ARGS__)
+#define vsoxseg2ei8_v_u64m4_m(...) __riscv_vsoxseg2ei8_v_u64m4_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u64m1_m(...) __riscv_vsoxseg2ei16_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_u64m1_m(...) __riscv_vsoxseg3ei16_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_u64m1_m(...) __riscv_vsoxseg4ei16_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg5ei16_v_u64m1_m(...) __riscv_vsoxseg5ei16_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg6ei16_v_u64m1_m(...) __riscv_vsoxseg6ei16_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg7ei16_v_u64m1_m(...) __riscv_vsoxseg7ei16_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg8ei16_v_u64m1_m(...) __riscv_vsoxseg8ei16_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u64m2_m(...) __riscv_vsoxseg2ei16_v_u64m2_m(__VA_ARGS__)
+#define vsoxseg3ei16_v_u64m2_m(...) __riscv_vsoxseg3ei16_v_u64m2_m(__VA_ARGS__)
+#define vsoxseg4ei16_v_u64m2_m(...) __riscv_vsoxseg4ei16_v_u64m2_m(__VA_ARGS__)
+#define vsoxseg2ei16_v_u64m4_m(...) __riscv_vsoxseg2ei16_v_u64m4_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u64m1_m(...) __riscv_vsoxseg2ei32_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_u64m1_m(...) __riscv_vsoxseg3ei32_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_u64m1_m(...) __riscv_vsoxseg4ei32_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg5ei32_v_u64m1_m(...) __riscv_vsoxseg5ei32_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg6ei32_v_u64m1_m(...) __riscv_vsoxseg6ei32_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg7ei32_v_u64m1_m(...) __riscv_vsoxseg7ei32_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg8ei32_v_u64m1_m(...) __riscv_vsoxseg8ei32_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u64m2_m(...) __riscv_vsoxseg2ei32_v_u64m2_m(__VA_ARGS__)
+#define vsoxseg3ei32_v_u64m2_m(...) __riscv_vsoxseg3ei32_v_u64m2_m(__VA_ARGS__)
+#define vsoxseg4ei32_v_u64m2_m(...) __riscv_vsoxseg4ei32_v_u64m2_m(__VA_ARGS__)
+#define vsoxseg2ei32_v_u64m4_m(...) __riscv_vsoxseg2ei32_v_u64m4_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_u64m1_m(...) __riscv_vsoxseg2ei64_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_u64m1_m(...) __riscv_vsoxseg3ei64_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_u64m1_m(...) __riscv_vsoxseg4ei64_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg5ei64_v_u64m1_m(...) __riscv_vsoxseg5ei64_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg6ei64_v_u64m1_m(...) __riscv_vsoxseg6ei64_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg7ei64_v_u64m1_m(...) __riscv_vsoxseg7ei64_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg8ei64_v_u64m1_m(...) __riscv_vsoxseg8ei64_v_u64m1_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_u64m2_m(...) __riscv_vsoxseg2ei64_v_u64m2_m(__VA_ARGS__)
+#define vsoxseg3ei64_v_u64m2_m(...) __riscv_vsoxseg3ei64_v_u64m2_m(__VA_ARGS__)
+#define vsoxseg4ei64_v_u64m2_m(...) __riscv_vsoxseg4ei64_v_u64m2_m(__VA_ARGS__)
+#define vsoxseg2ei64_v_u64m4_m(...) __riscv_vsoxseg2ei64_v_u64m4_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u8mf8_m(...) __riscv_vsuxseg2ei8_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_u8mf8_m(...) __riscv_vsuxseg3ei8_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_u8mf8_m(...) __riscv_vsuxseg4ei8_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_u8mf8_m(...) __riscv_vsuxseg5ei8_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_u8mf8_m(...) __riscv_vsuxseg6ei8_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_u8mf8_m(...) __riscv_vsuxseg7ei8_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_u8mf8_m(...) __riscv_vsuxseg8ei8_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u8mf4_m(...) __riscv_vsuxseg2ei8_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_u8mf4_m(...) __riscv_vsuxseg3ei8_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_u8mf4_m(...) __riscv_vsuxseg4ei8_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_u8mf4_m(...) __riscv_vsuxseg5ei8_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_u8mf4_m(...) __riscv_vsuxseg6ei8_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_u8mf4_m(...) __riscv_vsuxseg7ei8_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_u8mf4_m(...) __riscv_vsuxseg8ei8_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u8mf2_m(...) __riscv_vsuxseg2ei8_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_u8mf2_m(...) __riscv_vsuxseg3ei8_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_u8mf2_m(...) __riscv_vsuxseg4ei8_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_u8mf2_m(...) __riscv_vsuxseg5ei8_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_u8mf2_m(...) __riscv_vsuxseg6ei8_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_u8mf2_m(...) __riscv_vsuxseg7ei8_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_u8mf2_m(...) __riscv_vsuxseg8ei8_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u8m1_m(...) __riscv_vsuxseg2ei8_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_u8m1_m(...) __riscv_vsuxseg3ei8_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_u8m1_m(...) __riscv_vsuxseg4ei8_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_u8m1_m(...) __riscv_vsuxseg5ei8_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_u8m1_m(...) __riscv_vsuxseg6ei8_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_u8m1_m(...) __riscv_vsuxseg7ei8_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_u8m1_m(...) __riscv_vsuxseg8ei8_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u8m2_m(...) __riscv_vsuxseg2ei8_v_u8m2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_u8m2_m(...) __riscv_vsuxseg3ei8_v_u8m2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_u8m2_m(...) __riscv_vsuxseg4ei8_v_u8m2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u8m4_m(...) __riscv_vsuxseg2ei8_v_u8m4_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u8mf8_m(...) __riscv_vsuxseg2ei16_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_u8mf8_m(...) __riscv_vsuxseg3ei16_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_u8mf8_m(...) __riscv_vsuxseg4ei16_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_u8mf8_m(...) __riscv_vsuxseg5ei16_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_u8mf8_m(...) __riscv_vsuxseg6ei16_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_u8mf8_m(...) __riscv_vsuxseg7ei16_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_u8mf8_m(...) __riscv_vsuxseg8ei16_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u8mf4_m(...) __riscv_vsuxseg2ei16_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_u8mf4_m(...) __riscv_vsuxseg3ei16_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_u8mf4_m(...) __riscv_vsuxseg4ei16_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_u8mf4_m(...) __riscv_vsuxseg5ei16_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_u8mf4_m(...) __riscv_vsuxseg6ei16_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_u8mf4_m(...) __riscv_vsuxseg7ei16_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_u8mf4_m(...) __riscv_vsuxseg8ei16_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u8mf2_m(...) __riscv_vsuxseg2ei16_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_u8mf2_m(...) __riscv_vsuxseg3ei16_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_u8mf2_m(...) __riscv_vsuxseg4ei16_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_u8mf2_m(...) __riscv_vsuxseg5ei16_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_u8mf2_m(...) __riscv_vsuxseg6ei16_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_u8mf2_m(...) __riscv_vsuxseg7ei16_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_u8mf2_m(...) __riscv_vsuxseg8ei16_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u8m1_m(...) __riscv_vsuxseg2ei16_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_u8m1_m(...) __riscv_vsuxseg3ei16_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_u8m1_m(...) __riscv_vsuxseg4ei16_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_u8m1_m(...) __riscv_vsuxseg5ei16_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_u8m1_m(...) __riscv_vsuxseg6ei16_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_u8m1_m(...) __riscv_vsuxseg7ei16_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_u8m1_m(...) __riscv_vsuxseg8ei16_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u8m2_m(...) __riscv_vsuxseg2ei16_v_u8m2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_u8m2_m(...) __riscv_vsuxseg3ei16_v_u8m2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_u8m2_m(...) __riscv_vsuxseg4ei16_v_u8m2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u8m4_m(...) __riscv_vsuxseg2ei16_v_u8m4_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u8mf8_m(...) __riscv_vsuxseg2ei32_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_u8mf8_m(...) __riscv_vsuxseg3ei32_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_u8mf8_m(...) __riscv_vsuxseg4ei32_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_u8mf8_m(...) __riscv_vsuxseg5ei32_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_u8mf8_m(...) __riscv_vsuxseg6ei32_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_u8mf8_m(...) __riscv_vsuxseg7ei32_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_u8mf8_m(...) __riscv_vsuxseg8ei32_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u8mf4_m(...) __riscv_vsuxseg2ei32_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_u8mf4_m(...) __riscv_vsuxseg3ei32_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_u8mf4_m(...) __riscv_vsuxseg4ei32_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_u8mf4_m(...) __riscv_vsuxseg5ei32_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_u8mf4_m(...) __riscv_vsuxseg6ei32_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_u8mf4_m(...) __riscv_vsuxseg7ei32_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_u8mf4_m(...) __riscv_vsuxseg8ei32_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u8mf2_m(...) __riscv_vsuxseg2ei32_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_u8mf2_m(...) __riscv_vsuxseg3ei32_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_u8mf2_m(...) __riscv_vsuxseg4ei32_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_u8mf2_m(...) __riscv_vsuxseg5ei32_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_u8mf2_m(...) __riscv_vsuxseg6ei32_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_u8mf2_m(...) __riscv_vsuxseg7ei32_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_u8mf2_m(...) __riscv_vsuxseg8ei32_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u8m1_m(...) __riscv_vsuxseg2ei32_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_u8m1_m(...) __riscv_vsuxseg3ei32_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_u8m1_m(...) __riscv_vsuxseg4ei32_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_u8m1_m(...) __riscv_vsuxseg5ei32_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_u8m1_m(...) __riscv_vsuxseg6ei32_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_u8m1_m(...) __riscv_vsuxseg7ei32_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_u8m1_m(...) __riscv_vsuxseg8ei32_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u8m2_m(...) __riscv_vsuxseg2ei32_v_u8m2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_u8m2_m(...) __riscv_vsuxseg3ei32_v_u8m2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_u8m2_m(...) __riscv_vsuxseg4ei32_v_u8m2_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_u8mf8_m(...) __riscv_vsuxseg2ei64_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_u8mf8_m(...) __riscv_vsuxseg3ei64_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_u8mf8_m(...) __riscv_vsuxseg4ei64_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_u8mf8_m(...) __riscv_vsuxseg5ei64_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_u8mf8_m(...) __riscv_vsuxseg6ei64_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_u8mf8_m(...) __riscv_vsuxseg7ei64_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_u8mf8_m(...) __riscv_vsuxseg8ei64_v_u8mf8_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_u8mf4_m(...) __riscv_vsuxseg2ei64_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_u8mf4_m(...) __riscv_vsuxseg3ei64_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_u8mf4_m(...) __riscv_vsuxseg4ei64_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_u8mf4_m(...) __riscv_vsuxseg5ei64_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_u8mf4_m(...) __riscv_vsuxseg6ei64_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_u8mf4_m(...) __riscv_vsuxseg7ei64_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_u8mf4_m(...) __riscv_vsuxseg8ei64_v_u8mf4_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_u8mf2_m(...) __riscv_vsuxseg2ei64_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_u8mf2_m(...) __riscv_vsuxseg3ei64_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_u8mf2_m(...) __riscv_vsuxseg4ei64_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_u8mf2_m(...) __riscv_vsuxseg5ei64_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_u8mf2_m(...) __riscv_vsuxseg6ei64_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_u8mf2_m(...) __riscv_vsuxseg7ei64_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_u8mf2_m(...) __riscv_vsuxseg8ei64_v_u8mf2_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_u8m1_m(...) __riscv_vsuxseg2ei64_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_u8m1_m(...) __riscv_vsuxseg3ei64_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_u8m1_m(...) __riscv_vsuxseg4ei64_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_u8m1_m(...) __riscv_vsuxseg5ei64_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_u8m1_m(...) __riscv_vsuxseg6ei64_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_u8m1_m(...) __riscv_vsuxseg7ei64_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_u8m1_m(...) __riscv_vsuxseg8ei64_v_u8m1_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u16mf4_m(...) __riscv_vsuxseg2ei8_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_u16mf4_m(...) __riscv_vsuxseg3ei8_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_u16mf4_m(...) __riscv_vsuxseg4ei8_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_u16mf4_m(...) __riscv_vsuxseg5ei8_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_u16mf4_m(...) __riscv_vsuxseg6ei8_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_u16mf4_m(...) __riscv_vsuxseg7ei8_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_u16mf4_m(...) __riscv_vsuxseg8ei8_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u16mf2_m(...) __riscv_vsuxseg2ei8_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_u16mf2_m(...) __riscv_vsuxseg3ei8_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_u16mf2_m(...) __riscv_vsuxseg4ei8_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_u16mf2_m(...) __riscv_vsuxseg5ei8_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_u16mf2_m(...) __riscv_vsuxseg6ei8_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_u16mf2_m(...) __riscv_vsuxseg7ei8_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_u16mf2_m(...) __riscv_vsuxseg8ei8_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u16m1_m(...) __riscv_vsuxseg2ei8_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_u16m1_m(...) __riscv_vsuxseg3ei8_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_u16m1_m(...) __riscv_vsuxseg4ei8_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_u16m1_m(...) __riscv_vsuxseg5ei8_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_u16m1_m(...) __riscv_vsuxseg6ei8_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_u16m1_m(...) __riscv_vsuxseg7ei8_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_u16m1_m(...) __riscv_vsuxseg8ei8_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u16m2_m(...) __riscv_vsuxseg2ei8_v_u16m2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_u16m2_m(...) __riscv_vsuxseg3ei8_v_u16m2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_u16m2_m(...) __riscv_vsuxseg4ei8_v_u16m2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u16m4_m(...) __riscv_vsuxseg2ei8_v_u16m4_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u16mf4_m(...) __riscv_vsuxseg2ei16_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_u16mf4_m(...) __riscv_vsuxseg3ei16_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_u16mf4_m(...) __riscv_vsuxseg4ei16_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_u16mf4_m(...) __riscv_vsuxseg5ei16_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_u16mf4_m(...) __riscv_vsuxseg6ei16_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_u16mf4_m(...) __riscv_vsuxseg7ei16_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_u16mf4_m(...) __riscv_vsuxseg8ei16_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u16mf2_m(...) __riscv_vsuxseg2ei16_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_u16mf2_m(...) __riscv_vsuxseg3ei16_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_u16mf2_m(...) __riscv_vsuxseg4ei16_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_u16mf2_m(...) __riscv_vsuxseg5ei16_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_u16mf2_m(...) __riscv_vsuxseg6ei16_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_u16mf2_m(...) __riscv_vsuxseg7ei16_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_u16mf2_m(...) __riscv_vsuxseg8ei16_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u16m1_m(...) __riscv_vsuxseg2ei16_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_u16m1_m(...) __riscv_vsuxseg3ei16_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_u16m1_m(...) __riscv_vsuxseg4ei16_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_u16m1_m(...) __riscv_vsuxseg5ei16_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_u16m1_m(...) __riscv_vsuxseg6ei16_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_u16m1_m(...) __riscv_vsuxseg7ei16_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_u16m1_m(...) __riscv_vsuxseg8ei16_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u16m2_m(...) __riscv_vsuxseg2ei16_v_u16m2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_u16m2_m(...) __riscv_vsuxseg3ei16_v_u16m2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_u16m2_m(...) __riscv_vsuxseg4ei16_v_u16m2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u16m4_m(...) __riscv_vsuxseg2ei16_v_u16m4_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u16mf4_m(...) __riscv_vsuxseg2ei32_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_u16mf4_m(...) __riscv_vsuxseg3ei32_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_u16mf4_m(...) __riscv_vsuxseg4ei32_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_u16mf4_m(...) __riscv_vsuxseg5ei32_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_u16mf4_m(...) __riscv_vsuxseg6ei32_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_u16mf4_m(...) __riscv_vsuxseg7ei32_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_u16mf4_m(...) __riscv_vsuxseg8ei32_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u16mf2_m(...) __riscv_vsuxseg2ei32_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_u16mf2_m(...) __riscv_vsuxseg3ei32_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_u16mf2_m(...) __riscv_vsuxseg4ei32_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_u16mf2_m(...) __riscv_vsuxseg5ei32_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_u16mf2_m(...) __riscv_vsuxseg6ei32_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_u16mf2_m(...) __riscv_vsuxseg7ei32_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_u16mf2_m(...) __riscv_vsuxseg8ei32_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u16m1_m(...) __riscv_vsuxseg2ei32_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_u16m1_m(...) __riscv_vsuxseg3ei32_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_u16m1_m(...) __riscv_vsuxseg4ei32_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_u16m1_m(...) __riscv_vsuxseg5ei32_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_u16m1_m(...) __riscv_vsuxseg6ei32_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_u16m1_m(...) __riscv_vsuxseg7ei32_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_u16m1_m(...) __riscv_vsuxseg8ei32_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u16m2_m(...) __riscv_vsuxseg2ei32_v_u16m2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_u16m2_m(...) __riscv_vsuxseg3ei32_v_u16m2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_u16m2_m(...) __riscv_vsuxseg4ei32_v_u16m2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u16m4_m(...) __riscv_vsuxseg2ei32_v_u16m4_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_u16mf4_m(...) __riscv_vsuxseg2ei64_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_u16mf4_m(...) __riscv_vsuxseg3ei64_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_u16mf4_m(...) __riscv_vsuxseg4ei64_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_u16mf4_m(...) __riscv_vsuxseg5ei64_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_u16mf4_m(...) __riscv_vsuxseg6ei64_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_u16mf4_m(...) __riscv_vsuxseg7ei64_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_u16mf4_m(...) __riscv_vsuxseg8ei64_v_u16mf4_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_u16mf2_m(...) __riscv_vsuxseg2ei64_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_u16mf2_m(...) __riscv_vsuxseg3ei64_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_u16mf2_m(...) __riscv_vsuxseg4ei64_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_u16mf2_m(...) __riscv_vsuxseg5ei64_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_u16mf2_m(...) __riscv_vsuxseg6ei64_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_u16mf2_m(...) __riscv_vsuxseg7ei64_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_u16mf2_m(...) __riscv_vsuxseg8ei64_v_u16mf2_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_u16m1_m(...) __riscv_vsuxseg2ei64_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_u16m1_m(...) __riscv_vsuxseg3ei64_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_u16m1_m(...) __riscv_vsuxseg4ei64_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_u16m1_m(...) __riscv_vsuxseg5ei64_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_u16m1_m(...) __riscv_vsuxseg6ei64_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_u16m1_m(...) __riscv_vsuxseg7ei64_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_u16m1_m(...) __riscv_vsuxseg8ei64_v_u16m1_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_u16m2_m(...) __riscv_vsuxseg2ei64_v_u16m2_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_u16m2_m(...) __riscv_vsuxseg3ei64_v_u16m2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_u16m2_m(...) __riscv_vsuxseg4ei64_v_u16m2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u32mf2_m(...) __riscv_vsuxseg2ei8_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_u32mf2_m(...) __riscv_vsuxseg3ei8_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_u32mf2_m(...) __riscv_vsuxseg4ei8_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_u32mf2_m(...) __riscv_vsuxseg5ei8_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_u32mf2_m(...) __riscv_vsuxseg6ei8_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_u32mf2_m(...) __riscv_vsuxseg7ei8_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_u32mf2_m(...) __riscv_vsuxseg8ei8_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u32m1_m(...) __riscv_vsuxseg2ei8_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_u32m1_m(...) __riscv_vsuxseg3ei8_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_u32m1_m(...) __riscv_vsuxseg4ei8_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_u32m1_m(...) __riscv_vsuxseg5ei8_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_u32m1_m(...) __riscv_vsuxseg6ei8_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_u32m1_m(...) __riscv_vsuxseg7ei8_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_u32m1_m(...) __riscv_vsuxseg8ei8_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u32m2_m(...) __riscv_vsuxseg2ei8_v_u32m2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_u32m2_m(...) __riscv_vsuxseg3ei8_v_u32m2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_u32m2_m(...) __riscv_vsuxseg4ei8_v_u32m2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u32m4_m(...) __riscv_vsuxseg2ei8_v_u32m4_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u32mf2_m(...) __riscv_vsuxseg2ei16_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_u32mf2_m(...) __riscv_vsuxseg3ei16_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_u32mf2_m(...) __riscv_vsuxseg4ei16_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_u32mf2_m(...) __riscv_vsuxseg5ei16_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_u32mf2_m(...) __riscv_vsuxseg6ei16_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_u32mf2_m(...) __riscv_vsuxseg7ei16_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_u32mf2_m(...) __riscv_vsuxseg8ei16_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u32m1_m(...) __riscv_vsuxseg2ei16_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_u32m1_m(...) __riscv_vsuxseg3ei16_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_u32m1_m(...) __riscv_vsuxseg4ei16_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_u32m1_m(...) __riscv_vsuxseg5ei16_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_u32m1_m(...) __riscv_vsuxseg6ei16_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_u32m1_m(...) __riscv_vsuxseg7ei16_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_u32m1_m(...) __riscv_vsuxseg8ei16_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u32m2_m(...) __riscv_vsuxseg2ei16_v_u32m2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_u32m2_m(...) __riscv_vsuxseg3ei16_v_u32m2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_u32m2_m(...) __riscv_vsuxseg4ei16_v_u32m2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u32m4_m(...) __riscv_vsuxseg2ei16_v_u32m4_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u32mf2_m(...) __riscv_vsuxseg2ei32_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_u32mf2_m(...) __riscv_vsuxseg3ei32_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_u32mf2_m(...) __riscv_vsuxseg4ei32_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_u32mf2_m(...) __riscv_vsuxseg5ei32_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_u32mf2_m(...) __riscv_vsuxseg6ei32_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_u32mf2_m(...) __riscv_vsuxseg7ei32_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_u32mf2_m(...) __riscv_vsuxseg8ei32_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u32m1_m(...) __riscv_vsuxseg2ei32_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_u32m1_m(...) __riscv_vsuxseg3ei32_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_u32m1_m(...) __riscv_vsuxseg4ei32_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_u32m1_m(...) __riscv_vsuxseg5ei32_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_u32m1_m(...) __riscv_vsuxseg6ei32_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_u32m1_m(...) __riscv_vsuxseg7ei32_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_u32m1_m(...) __riscv_vsuxseg8ei32_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u32m2_m(...) __riscv_vsuxseg2ei32_v_u32m2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_u32m2_m(...) __riscv_vsuxseg3ei32_v_u32m2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_u32m2_m(...) __riscv_vsuxseg4ei32_v_u32m2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u32m4_m(...) __riscv_vsuxseg2ei32_v_u32m4_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_u32mf2_m(...) __riscv_vsuxseg2ei64_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_u32mf2_m(...) __riscv_vsuxseg3ei64_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_u32mf2_m(...) __riscv_vsuxseg4ei64_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_u32mf2_m(...) __riscv_vsuxseg5ei64_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_u32mf2_m(...) __riscv_vsuxseg6ei64_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_u32mf2_m(...) __riscv_vsuxseg7ei64_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_u32mf2_m(...) __riscv_vsuxseg8ei64_v_u32mf2_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_u32m1_m(...) __riscv_vsuxseg2ei64_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_u32m1_m(...) __riscv_vsuxseg3ei64_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_u32m1_m(...) __riscv_vsuxseg4ei64_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_u32m1_m(...) __riscv_vsuxseg5ei64_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_u32m1_m(...) __riscv_vsuxseg6ei64_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_u32m1_m(...) __riscv_vsuxseg7ei64_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_u32m1_m(...) __riscv_vsuxseg8ei64_v_u32m1_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_u32m2_m(...) __riscv_vsuxseg2ei64_v_u32m2_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_u32m2_m(...) __riscv_vsuxseg3ei64_v_u32m2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_u32m2_m(...) __riscv_vsuxseg4ei64_v_u32m2_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_u32m4_m(...) __riscv_vsuxseg2ei64_v_u32m4_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u64m1_m(...) __riscv_vsuxseg2ei8_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_u64m1_m(...) __riscv_vsuxseg3ei8_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_u64m1_m(...) __riscv_vsuxseg4ei8_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg5ei8_v_u64m1_m(...) __riscv_vsuxseg5ei8_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg6ei8_v_u64m1_m(...) __riscv_vsuxseg6ei8_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg7ei8_v_u64m1_m(...) __riscv_vsuxseg7ei8_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg8ei8_v_u64m1_m(...) __riscv_vsuxseg8ei8_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u64m2_m(...) __riscv_vsuxseg2ei8_v_u64m2_m(__VA_ARGS__)
+#define vsuxseg3ei8_v_u64m2_m(...) __riscv_vsuxseg3ei8_v_u64m2_m(__VA_ARGS__)
+#define vsuxseg4ei8_v_u64m2_m(...) __riscv_vsuxseg4ei8_v_u64m2_m(__VA_ARGS__)
+#define vsuxseg2ei8_v_u64m4_m(...) __riscv_vsuxseg2ei8_v_u64m4_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u64m1_m(...) __riscv_vsuxseg2ei16_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_u64m1_m(...) __riscv_vsuxseg3ei16_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_u64m1_m(...) __riscv_vsuxseg4ei16_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg5ei16_v_u64m1_m(...) __riscv_vsuxseg5ei16_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg6ei16_v_u64m1_m(...) __riscv_vsuxseg6ei16_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg7ei16_v_u64m1_m(...) __riscv_vsuxseg7ei16_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg8ei16_v_u64m1_m(...) __riscv_vsuxseg8ei16_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u64m2_m(...) __riscv_vsuxseg2ei16_v_u64m2_m(__VA_ARGS__)
+#define vsuxseg3ei16_v_u64m2_m(...) __riscv_vsuxseg3ei16_v_u64m2_m(__VA_ARGS__)
+#define vsuxseg4ei16_v_u64m2_m(...) __riscv_vsuxseg4ei16_v_u64m2_m(__VA_ARGS__)
+#define vsuxseg2ei16_v_u64m4_m(...) __riscv_vsuxseg2ei16_v_u64m4_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u64m1_m(...) __riscv_vsuxseg2ei32_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_u64m1_m(...) __riscv_vsuxseg3ei32_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_u64m1_m(...) __riscv_vsuxseg4ei32_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg5ei32_v_u64m1_m(...) __riscv_vsuxseg5ei32_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg6ei32_v_u64m1_m(...) __riscv_vsuxseg6ei32_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg7ei32_v_u64m1_m(...) __riscv_vsuxseg7ei32_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg8ei32_v_u64m1_m(...) __riscv_vsuxseg8ei32_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u64m2_m(...) __riscv_vsuxseg2ei32_v_u64m2_m(__VA_ARGS__)
+#define vsuxseg3ei32_v_u64m2_m(...) __riscv_vsuxseg3ei32_v_u64m2_m(__VA_ARGS__)
+#define vsuxseg4ei32_v_u64m2_m(...) __riscv_vsuxseg4ei32_v_u64m2_m(__VA_ARGS__)
+#define vsuxseg2ei32_v_u64m4_m(...) __riscv_vsuxseg2ei32_v_u64m4_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_u64m1_m(...) __riscv_vsuxseg2ei64_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_u64m1_m(...) __riscv_vsuxseg3ei64_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_u64m1_m(...) __riscv_vsuxseg4ei64_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg5ei64_v_u64m1_m(...) __riscv_vsuxseg5ei64_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg6ei64_v_u64m1_m(...) __riscv_vsuxseg6ei64_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg7ei64_v_u64m1_m(...) __riscv_vsuxseg7ei64_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg8ei64_v_u64m1_m(...) __riscv_vsuxseg8ei64_v_u64m1_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_u64m2_m(...) __riscv_vsuxseg2ei64_v_u64m2_m(__VA_ARGS__)
+#define vsuxseg3ei64_v_u64m2_m(...) __riscv_vsuxseg3ei64_v_u64m2_m(__VA_ARGS__)
+#define vsuxseg4ei64_v_u64m2_m(...) __riscv_vsuxseg4ei64_v_u64m2_m(__VA_ARGS__)
+#define vsuxseg2ei64_v_u64m4_m(...) __riscv_vsuxseg2ei64_v_u64m4_m(__VA_ARGS__)
+#define vadd_vv_i8mf8(...) __riscv_vadd_vv_i8mf8(__VA_ARGS__)
+#define vadd_vx_i8mf8(...) __riscv_vadd_vx_i8mf8(__VA_ARGS__)
+#define vadd_vv_i8mf4(...) __riscv_vadd_vv_i8mf4(__VA_ARGS__)
+#define vadd_vx_i8mf4(...) __riscv_vadd_vx_i8mf4(__VA_ARGS__)
+#define vadd_vv_i8mf2(...) __riscv_vadd_vv_i8mf2(__VA_ARGS__)
+#define vadd_vx_i8mf2(...) __riscv_vadd_vx_i8mf2(__VA_ARGS__)
+#define vadd_vv_i8m1(...) __riscv_vadd_vv_i8m1(__VA_ARGS__)
+#define vadd_vx_i8m1(...) __riscv_vadd_vx_i8m1(__VA_ARGS__)
+#define vadd_vv_i8m2(...) __riscv_vadd_vv_i8m2(__VA_ARGS__)
+#define vadd_vx_i8m2(...) __riscv_vadd_vx_i8m2(__VA_ARGS__)
+#define vadd_vv_i8m4(...) __riscv_vadd_vv_i8m4(__VA_ARGS__)
+#define vadd_vx_i8m4(...) __riscv_vadd_vx_i8m4(__VA_ARGS__)
+#define vadd_vv_i8m8(...) __riscv_vadd_vv_i8m8(__VA_ARGS__)
+#define vadd_vx_i8m8(...) __riscv_vadd_vx_i8m8(__VA_ARGS__)
+#define vadd_vv_i16mf4(...) __riscv_vadd_vv_i16mf4(__VA_ARGS__)
+#define vadd_vx_i16mf4(...) __riscv_vadd_vx_i16mf4(__VA_ARGS__)
+#define vadd_vv_i16mf2(...) __riscv_vadd_vv_i16mf2(__VA_ARGS__)
+#define vadd_vx_i16mf2(...) __riscv_vadd_vx_i16mf2(__VA_ARGS__)
+#define vadd_vv_i16m1(...) __riscv_vadd_vv_i16m1(__VA_ARGS__)
+#define vadd_vx_i16m1(...) __riscv_vadd_vx_i16m1(__VA_ARGS__)
+#define vadd_vv_i16m2(...) __riscv_vadd_vv_i16m2(__VA_ARGS__)
+#define vadd_vx_i16m2(...) __riscv_vadd_vx_i16m2(__VA_ARGS__)
+#define vadd_vv_i16m4(...) __riscv_vadd_vv_i16m4(__VA_ARGS__)
+#define vadd_vx_i16m4(...) __riscv_vadd_vx_i16m4(__VA_ARGS__)
+#define vadd_vv_i16m8(...) __riscv_vadd_vv_i16m8(__VA_ARGS__)
+#define vadd_vx_i16m8(...) __riscv_vadd_vx_i16m8(__VA_ARGS__)
+#define vadd_vv_i32mf2(...) __riscv_vadd_vv_i32mf2(__VA_ARGS__)
+#define vadd_vx_i32mf2(...) __riscv_vadd_vx_i32mf2(__VA_ARGS__)
+#define vadd_vv_i32m1(...) __riscv_vadd_vv_i32m1(__VA_ARGS__)
+#define vadd_vx_i32m1(...) __riscv_vadd_vx_i32m1(__VA_ARGS__)
+#define vadd_vv_i32m2(...) __riscv_vadd_vv_i32m2(__VA_ARGS__)
+#define vadd_vx_i32m2(...) __riscv_vadd_vx_i32m2(__VA_ARGS__)
+#define vadd_vv_i32m4(...) __riscv_vadd_vv_i32m4(__VA_ARGS__)
+#define vadd_vx_i32m4(...) __riscv_vadd_vx_i32m4(__VA_ARGS__)
+#define vadd_vv_i32m8(...) __riscv_vadd_vv_i32m8(__VA_ARGS__)
+#define vadd_vx_i32m8(...) __riscv_vadd_vx_i32m8(__VA_ARGS__)
+#define vadd_vv_i64m1(...) __riscv_vadd_vv_i64m1(__VA_ARGS__)
+#define vadd_vx_i64m1(...) __riscv_vadd_vx_i64m1(__VA_ARGS__)
+#define vadd_vv_i64m2(...) __riscv_vadd_vv_i64m2(__VA_ARGS__)
+#define vadd_vx_i64m2(...) __riscv_vadd_vx_i64m2(__VA_ARGS__)
+#define vadd_vv_i64m4(...) __riscv_vadd_vv_i64m4(__VA_ARGS__)
+#define vadd_vx_i64m4(...) __riscv_vadd_vx_i64m4(__VA_ARGS__)
+#define vadd_vv_i64m8(...) __riscv_vadd_vv_i64m8(__VA_ARGS__)
+#define vadd_vx_i64m8(...) __riscv_vadd_vx_i64m8(__VA_ARGS__)
+#define vsub_vv_i8mf8(...) __riscv_vsub_vv_i8mf8(__VA_ARGS__)
+#define vsub_vx_i8mf8(...) __riscv_vsub_vx_i8mf8(__VA_ARGS__)
+#define vsub_vv_i8mf4(...) __riscv_vsub_vv_i8mf4(__VA_ARGS__)
+#define vsub_vx_i8mf4(...) __riscv_vsub_vx_i8mf4(__VA_ARGS__)
+#define vsub_vv_i8mf2(...) __riscv_vsub_vv_i8mf2(__VA_ARGS__)
+#define vsub_vx_i8mf2(...) __riscv_vsub_vx_i8mf2(__VA_ARGS__)
+#define vsub_vv_i8m1(...) __riscv_vsub_vv_i8m1(__VA_ARGS__)
+#define vsub_vx_i8m1(...) __riscv_vsub_vx_i8m1(__VA_ARGS__)
+#define vsub_vv_i8m2(...) __riscv_vsub_vv_i8m2(__VA_ARGS__)
+#define vsub_vx_i8m2(...) __riscv_vsub_vx_i8m2(__VA_ARGS__)
+#define vsub_vv_i8m4(...) __riscv_vsub_vv_i8m4(__VA_ARGS__)
+#define vsub_vx_i8m4(...) __riscv_vsub_vx_i8m4(__VA_ARGS__)
+#define vsub_vv_i8m8(...) __riscv_vsub_vv_i8m8(__VA_ARGS__)
+#define vsub_vx_i8m8(...) __riscv_vsub_vx_i8m8(__VA_ARGS__)
+#define vsub_vv_i16mf4(...) __riscv_vsub_vv_i16mf4(__VA_ARGS__)
+#define vsub_vx_i16mf4(...) __riscv_vsub_vx_i16mf4(__VA_ARGS__)
+#define vsub_vv_i16mf2(...) __riscv_vsub_vv_i16mf2(__VA_ARGS__)
+#define vsub_vx_i16mf2(...) __riscv_vsub_vx_i16mf2(__VA_ARGS__)
+#define vsub_vv_i16m1(...) __riscv_vsub_vv_i16m1(__VA_ARGS__)
+#define vsub_vx_i16m1(...) __riscv_vsub_vx_i16m1(__VA_ARGS__)
+#define vsub_vv_i16m2(...) __riscv_vsub_vv_i16m2(__VA_ARGS__)
+#define vsub_vx_i16m2(...) __riscv_vsub_vx_i16m2(__VA_ARGS__)
+#define vsub_vv_i16m4(...) __riscv_vsub_vv_i16m4(__VA_ARGS__)
+#define vsub_vx_i16m4(...) __riscv_vsub_vx_i16m4(__VA_ARGS__)
+#define vsub_vv_i16m8(...) __riscv_vsub_vv_i16m8(__VA_ARGS__)
+#define vsub_vx_i16m8(...) __riscv_vsub_vx_i16m8(__VA_ARGS__)
+#define vsub_vv_i32mf2(...) __riscv_vsub_vv_i32mf2(__VA_ARGS__)
+#define vsub_vx_i32mf2(...) __riscv_vsub_vx_i32mf2(__VA_ARGS__)
+#define vsub_vv_i32m1(...) __riscv_vsub_vv_i32m1(__VA_ARGS__)
+#define vsub_vx_i32m1(...) __riscv_vsub_vx_i32m1(__VA_ARGS__)
+#define vsub_vv_i32m2(...) __riscv_vsub_vv_i32m2(__VA_ARGS__)
+#define vsub_vx_i32m2(...) __riscv_vsub_vx_i32m2(__VA_ARGS__)
+#define vsub_vv_i32m4(...) __riscv_vsub_vv_i32m4(__VA_ARGS__)
+#define vsub_vx_i32m4(...) __riscv_vsub_vx_i32m4(__VA_ARGS__)
+#define vsub_vv_i32m8(...) __riscv_vsub_vv_i32m8(__VA_ARGS__)
+#define vsub_vx_i32m8(...) __riscv_vsub_vx_i32m8(__VA_ARGS__)
+#define vsub_vv_i64m1(...) __riscv_vsub_vv_i64m1(__VA_ARGS__)
+#define vsub_vx_i64m1(...) __riscv_vsub_vx_i64m1(__VA_ARGS__)
+#define vsub_vv_i64m2(...) __riscv_vsub_vv_i64m2(__VA_ARGS__)
+#define vsub_vx_i64m2(...) __riscv_vsub_vx_i64m2(__VA_ARGS__)
+#define vsub_vv_i64m4(...) __riscv_vsub_vv_i64m4(__VA_ARGS__)
+#define vsub_vx_i64m4(...) __riscv_vsub_vx_i64m4(__VA_ARGS__)
+#define vsub_vv_i64m8(...) __riscv_vsub_vv_i64m8(__VA_ARGS__)
+#define vsub_vx_i64m8(...) __riscv_vsub_vx_i64m8(__VA_ARGS__)
+#define vrsub_vx_i8mf8(...) __riscv_vrsub_vx_i8mf8(__VA_ARGS__)
+#define vrsub_vx_i8mf4(...) __riscv_vrsub_vx_i8mf4(__VA_ARGS__)
+#define vrsub_vx_i8mf2(...) __riscv_vrsub_vx_i8mf2(__VA_ARGS__)
+#define vrsub_vx_i8m1(...) __riscv_vrsub_vx_i8m1(__VA_ARGS__)
+#define vrsub_vx_i8m2(...) __riscv_vrsub_vx_i8m2(__VA_ARGS__)
+#define vrsub_vx_i8m4(...) __riscv_vrsub_vx_i8m4(__VA_ARGS__)
+#define vrsub_vx_i8m8(...) __riscv_vrsub_vx_i8m8(__VA_ARGS__)
+#define vrsub_vx_i16mf4(...) __riscv_vrsub_vx_i16mf4(__VA_ARGS__)
+#define vrsub_vx_i16mf2(...) __riscv_vrsub_vx_i16mf2(__VA_ARGS__)
+#define vrsub_vx_i16m1(...) __riscv_vrsub_vx_i16m1(__VA_ARGS__)
+#define vrsub_vx_i16m2(...) __riscv_vrsub_vx_i16m2(__VA_ARGS__)
+#define vrsub_vx_i16m4(...) __riscv_vrsub_vx_i16m4(__VA_ARGS__)
+#define vrsub_vx_i16m8(...) __riscv_vrsub_vx_i16m8(__VA_ARGS__)
+#define vrsub_vx_i32mf2(...) __riscv_vrsub_vx_i32mf2(__VA_ARGS__)
+#define vrsub_vx_i32m1(...) __riscv_vrsub_vx_i32m1(__VA_ARGS__)
+#define vrsub_vx_i32m2(...) __riscv_vrsub_vx_i32m2(__VA_ARGS__)
+#define vrsub_vx_i32m4(...) __riscv_vrsub_vx_i32m4(__VA_ARGS__)
+#define vrsub_vx_i32m8(...) __riscv_vrsub_vx_i32m8(__VA_ARGS__)
+#define vrsub_vx_i64m1(...) __riscv_vrsub_vx_i64m1(__VA_ARGS__)
+#define vrsub_vx_i64m2(...) __riscv_vrsub_vx_i64m2(__VA_ARGS__)
+#define vrsub_vx_i64m4(...) __riscv_vrsub_vx_i64m4(__VA_ARGS__)
+#define vrsub_vx_i64m8(...) __riscv_vrsub_vx_i64m8(__VA_ARGS__)
+#define vneg_v_i8mf8(...) __riscv_vneg_v_i8mf8(__VA_ARGS__)
+#define vneg_v_i8mf4(...) __riscv_vneg_v_i8mf4(__VA_ARGS__)
+#define vneg_v_i8mf2(...) __riscv_vneg_v_i8mf2(__VA_ARGS__)
+#define vneg_v_i8m1(...) __riscv_vneg_v_i8m1(__VA_ARGS__)
+#define vneg_v_i8m2(...) __riscv_vneg_v_i8m2(__VA_ARGS__)
+#define vneg_v_i8m4(...) __riscv_vneg_v_i8m4(__VA_ARGS__)
+#define vneg_v_i8m8(...) __riscv_vneg_v_i8m8(__VA_ARGS__)
+#define vneg_v_i16mf4(...) __riscv_vneg_v_i16mf4(__VA_ARGS__)
+#define vneg_v_i16mf2(...) __riscv_vneg_v_i16mf2(__VA_ARGS__)
+#define vneg_v_i16m1(...) __riscv_vneg_v_i16m1(__VA_ARGS__)
+#define vneg_v_i16m2(...) __riscv_vneg_v_i16m2(__VA_ARGS__)
+#define vneg_v_i16m4(...) __riscv_vneg_v_i16m4(__VA_ARGS__)
+#define vneg_v_i16m8(...) __riscv_vneg_v_i16m8(__VA_ARGS__)
+#define vneg_v_i32mf2(...) __riscv_vneg_v_i32mf2(__VA_ARGS__)
+#define vneg_v_i32m1(...) __riscv_vneg_v_i32m1(__VA_ARGS__)
+#define vneg_v_i32m2(...) __riscv_vneg_v_i32m2(__VA_ARGS__)
+#define vneg_v_i32m4(...) __riscv_vneg_v_i32m4(__VA_ARGS__)
+#define vneg_v_i32m8(...) __riscv_vneg_v_i32m8(__VA_ARGS__)
+#define vneg_v_i64m1(...) __riscv_vneg_v_i64m1(__VA_ARGS__)
+#define vneg_v_i64m2(...) __riscv_vneg_v_i64m2(__VA_ARGS__)
+#define vneg_v_i64m4(...) __riscv_vneg_v_i64m4(__VA_ARGS__)
+#define vneg_v_i64m8(...) __riscv_vneg_v_i64m8(__VA_ARGS__)
+#define vadd_vv_u8mf8(...) __riscv_vadd_vv_u8mf8(__VA_ARGS__)
+#define vadd_vx_u8mf8(...) __riscv_vadd_vx_u8mf8(__VA_ARGS__)
+#define vadd_vv_u8mf4(...) __riscv_vadd_vv_u8mf4(__VA_ARGS__)
+#define vadd_vx_u8mf4(...) __riscv_vadd_vx_u8mf4(__VA_ARGS__)
+#define vadd_vv_u8mf2(...) __riscv_vadd_vv_u8mf2(__VA_ARGS__)
+#define vadd_vx_u8mf2(...) __riscv_vadd_vx_u8mf2(__VA_ARGS__)
+#define vadd_vv_u8m1(...) __riscv_vadd_vv_u8m1(__VA_ARGS__)
+#define vadd_vx_u8m1(...) __riscv_vadd_vx_u8m1(__VA_ARGS__)
+#define vadd_vv_u8m2(...) __riscv_vadd_vv_u8m2(__VA_ARGS__)
+#define vadd_vx_u8m2(...) __riscv_vadd_vx_u8m2(__VA_ARGS__)
+#define vadd_vv_u8m4(...) __riscv_vadd_vv_u8m4(__VA_ARGS__)
+#define vadd_vx_u8m4(...) __riscv_vadd_vx_u8m4(__VA_ARGS__)
+#define vadd_vv_u8m8(...) __riscv_vadd_vv_u8m8(__VA_ARGS__)
+#define vadd_vx_u8m8(...) __riscv_vadd_vx_u8m8(__VA_ARGS__)
+#define vadd_vv_u16mf4(...) __riscv_vadd_vv_u16mf4(__VA_ARGS__)
+#define vadd_vx_u16mf4(...) __riscv_vadd_vx_u16mf4(__VA_ARGS__)
+#define vadd_vv_u16mf2(...) __riscv_vadd_vv_u16mf2(__VA_ARGS__)
+#define vadd_vx_u16mf2(...) __riscv_vadd_vx_u16mf2(__VA_ARGS__)
+#define vadd_vv_u16m1(...) __riscv_vadd_vv_u16m1(__VA_ARGS__)
+#define vadd_vx_u16m1(...) __riscv_vadd_vx_u16m1(__VA_ARGS__)
+#define vadd_vv_u16m2(...) __riscv_vadd_vv_u16m2(__VA_ARGS__)
+#define vadd_vx_u16m2(...) __riscv_vadd_vx_u16m2(__VA_ARGS__)
+#define vadd_vv_u16m4(...) __riscv_vadd_vv_u16m4(__VA_ARGS__)
+#define vadd_vx_u16m4(...) __riscv_vadd_vx_u16m4(__VA_ARGS__)
+#define vadd_vv_u16m8(...) __riscv_vadd_vv_u16m8(__VA_ARGS__)
+#define vadd_vx_u16m8(...) __riscv_vadd_vx_u16m8(__VA_ARGS__)
+#define vadd_vv_u32mf2(...) __riscv_vadd_vv_u32mf2(__VA_ARGS__)
+#define vadd_vx_u32mf2(...) __riscv_vadd_vx_u32mf2(__VA_ARGS__)
+#define vadd_vv_u32m1(...) __riscv_vadd_vv_u32m1(__VA_ARGS__)
+#define vadd_vx_u32m1(...) __riscv_vadd_vx_u32m1(__VA_ARGS__)
+#define vadd_vv_u32m2(...) __riscv_vadd_vv_u32m2(__VA_ARGS__)
+#define vadd_vx_u32m2(...) __riscv_vadd_vx_u32m2(__VA_ARGS__)
+#define vadd_vv_u32m4(...) __riscv_vadd_vv_u32m4(__VA_ARGS__)
+#define vadd_vx_u32m4(...) __riscv_vadd_vx_u32m4(__VA_ARGS__)
+#define vadd_vv_u32m8(...) __riscv_vadd_vv_u32m8(__VA_ARGS__)
+#define vadd_vx_u32m8(...) __riscv_vadd_vx_u32m8(__VA_ARGS__)
+#define vadd_vv_u64m1(...) __riscv_vadd_vv_u64m1(__VA_ARGS__)
+#define vadd_vx_u64m1(...) __riscv_vadd_vx_u64m1(__VA_ARGS__)
+#define vadd_vv_u64m2(...) __riscv_vadd_vv_u64m2(__VA_ARGS__)
+#define vadd_vx_u64m2(...) __riscv_vadd_vx_u64m2(__VA_ARGS__)
+#define vadd_vv_u64m4(...) __riscv_vadd_vv_u64m4(__VA_ARGS__)
+#define vadd_vx_u64m4(...) __riscv_vadd_vx_u64m4(__VA_ARGS__)
+#define vadd_vv_u64m8(...) __riscv_vadd_vv_u64m8(__VA_ARGS__)
+#define vadd_vx_u64m8(...) __riscv_vadd_vx_u64m8(__VA_ARGS__)
+#define vsub_vv_u8mf8(...) __riscv_vsub_vv_u8mf8(__VA_ARGS__)
+#define vsub_vx_u8mf8(...) __riscv_vsub_vx_u8mf8(__VA_ARGS__)
+#define vsub_vv_u8mf4(...) __riscv_vsub_vv_u8mf4(__VA_ARGS__)
+#define vsub_vx_u8mf4(...) __riscv_vsub_vx_u8mf4(__VA_ARGS__)
+#define vsub_vv_u8mf2(...) __riscv_vsub_vv_u8mf2(__VA_ARGS__)
+#define vsub_vx_u8mf2(...) __riscv_vsub_vx_u8mf2(__VA_ARGS__)
+#define vsub_vv_u8m1(...) __riscv_vsub_vv_u8m1(__VA_ARGS__)
+#define vsub_vx_u8m1(...) __riscv_vsub_vx_u8m1(__VA_ARGS__)
+#define vsub_vv_u8m2(...) __riscv_vsub_vv_u8m2(__VA_ARGS__)
+#define vsub_vx_u8m2(...) __riscv_vsub_vx_u8m2(__VA_ARGS__)
+#define vsub_vv_u8m4(...) __riscv_vsub_vv_u8m4(__VA_ARGS__)
+#define vsub_vx_u8m4(...) __riscv_vsub_vx_u8m4(__VA_ARGS__)
+#define vsub_vv_u8m8(...) __riscv_vsub_vv_u8m8(__VA_ARGS__)
+#define vsub_vx_u8m8(...) __riscv_vsub_vx_u8m8(__VA_ARGS__)
+#define vsub_vv_u16mf4(...) __riscv_vsub_vv_u16mf4(__VA_ARGS__)
+#define vsub_vx_u16mf4(...) __riscv_vsub_vx_u16mf4(__VA_ARGS__)
+#define vsub_vv_u16mf2(...) __riscv_vsub_vv_u16mf2(__VA_ARGS__)
+#define vsub_vx_u16mf2(...) __riscv_vsub_vx_u16mf2(__VA_ARGS__)
+#define vsub_vv_u16m1(...) __riscv_vsub_vv_u16m1(__VA_ARGS__)
+#define vsub_vx_u16m1(...) __riscv_vsub_vx_u16m1(__VA_ARGS__)
+#define vsub_vv_u16m2(...) __riscv_vsub_vv_u16m2(__VA_ARGS__)
+#define vsub_vx_u16m2(...) __riscv_vsub_vx_u16m2(__VA_ARGS__)
+#define vsub_vv_u16m4(...) __riscv_vsub_vv_u16m4(__VA_ARGS__)
+#define vsub_vx_u16m4(...) __riscv_vsub_vx_u16m4(__VA_ARGS__)
+#define vsub_vv_u16m8(...) __riscv_vsub_vv_u16m8(__VA_ARGS__)
+#define vsub_vx_u16m8(...) __riscv_vsub_vx_u16m8(__VA_ARGS__)
+#define vsub_vv_u32mf2(...) __riscv_vsub_vv_u32mf2(__VA_ARGS__)
+#define vsub_vx_u32mf2(...) __riscv_vsub_vx_u32mf2(__VA_ARGS__)
+#define vsub_vv_u32m1(...) __riscv_vsub_vv_u32m1(__VA_ARGS__)
+#define vsub_vx_u32m1(...) __riscv_vsub_vx_u32m1(__VA_ARGS__)
+#define vsub_vv_u32m2(...) __riscv_vsub_vv_u32m2(__VA_ARGS__)
+#define vsub_vx_u32m2(...) __riscv_vsub_vx_u32m2(__VA_ARGS__)
+#define vsub_vv_u32m4(...) __riscv_vsub_vv_u32m4(__VA_ARGS__)
+#define vsub_vx_u32m4(...) __riscv_vsub_vx_u32m4(__VA_ARGS__)
+#define vsub_vv_u32m8(...) __riscv_vsub_vv_u32m8(__VA_ARGS__)
+#define vsub_vx_u32m8(...) __riscv_vsub_vx_u32m8(__VA_ARGS__)
+#define vsub_vv_u64m1(...) __riscv_vsub_vv_u64m1(__VA_ARGS__)
+#define vsub_vx_u64m1(...) __riscv_vsub_vx_u64m1(__VA_ARGS__)
+#define vsub_vv_u64m2(...) __riscv_vsub_vv_u64m2(__VA_ARGS__)
+#define vsub_vx_u64m2(...) __riscv_vsub_vx_u64m2(__VA_ARGS__)
+#define vsub_vv_u64m4(...) __riscv_vsub_vv_u64m4(__VA_ARGS__)
+#define vsub_vx_u64m4(...) __riscv_vsub_vx_u64m4(__VA_ARGS__)
+#define vsub_vv_u64m8(...) __riscv_vsub_vv_u64m8(__VA_ARGS__)
+#define vsub_vx_u64m8(...) __riscv_vsub_vx_u64m8(__VA_ARGS__)
+#define vrsub_vx_u8mf8(...) __riscv_vrsub_vx_u8mf8(__VA_ARGS__)
+#define vrsub_vx_u8mf4(...) __riscv_vrsub_vx_u8mf4(__VA_ARGS__)
+#define vrsub_vx_u8mf2(...) __riscv_vrsub_vx_u8mf2(__VA_ARGS__)
+#define vrsub_vx_u8m1(...) __riscv_vrsub_vx_u8m1(__VA_ARGS__)
+#define vrsub_vx_u8m2(...) __riscv_vrsub_vx_u8m2(__VA_ARGS__)
+#define vrsub_vx_u8m4(...) __riscv_vrsub_vx_u8m4(__VA_ARGS__)
+#define vrsub_vx_u8m8(...) __riscv_vrsub_vx_u8m8(__VA_ARGS__)
+#define vrsub_vx_u16mf4(...) __riscv_vrsub_vx_u16mf4(__VA_ARGS__)
+#define vrsub_vx_u16mf2(...) __riscv_vrsub_vx_u16mf2(__VA_ARGS__)
+#define vrsub_vx_u16m1(...) __riscv_vrsub_vx_u16m1(__VA_ARGS__)
+#define vrsub_vx_u16m2(...) __riscv_vrsub_vx_u16m2(__VA_ARGS__)
+#define vrsub_vx_u16m4(...) __riscv_vrsub_vx_u16m4(__VA_ARGS__)
+#define vrsub_vx_u16m8(...) __riscv_vrsub_vx_u16m8(__VA_ARGS__)
+#define vrsub_vx_u32mf2(...) __riscv_vrsub_vx_u32mf2(__VA_ARGS__)
+#define vrsub_vx_u32m1(...) __riscv_vrsub_vx_u32m1(__VA_ARGS__)
+#define vrsub_vx_u32m2(...) __riscv_vrsub_vx_u32m2(__VA_ARGS__)
+#define vrsub_vx_u32m4(...) __riscv_vrsub_vx_u32m4(__VA_ARGS__)
+#define vrsub_vx_u32m8(...) __riscv_vrsub_vx_u32m8(__VA_ARGS__)
+#define vrsub_vx_u64m1(...) __riscv_vrsub_vx_u64m1(__VA_ARGS__)
+#define vrsub_vx_u64m2(...) __riscv_vrsub_vx_u64m2(__VA_ARGS__)
+#define vrsub_vx_u64m4(...) __riscv_vrsub_vx_u64m4(__VA_ARGS__)
+#define vrsub_vx_u64m8(...) __riscv_vrsub_vx_u64m8(__VA_ARGS__)
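+// masked functions: the legacy *_m add/sub/rsub/neg names forward to the __riscv_*_tumu (tail-undisturbed, mask-undisturbed) policy intrinsics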
+#define vadd_vv_i8mf8_m(...) __riscv_vadd_vv_i8mf8_tumu(__VA_ARGS__)
+#define vadd_vx_i8mf8_m(...) __riscv_vadd_vx_i8mf8_tumu(__VA_ARGS__)
+#define vadd_vv_i8mf4_m(...) __riscv_vadd_vv_i8mf4_tumu(__VA_ARGS__)
+#define vadd_vx_i8mf4_m(...) __riscv_vadd_vx_i8mf4_tumu(__VA_ARGS__)
+#define vadd_vv_i8mf2_m(...) __riscv_vadd_vv_i8mf2_tumu(__VA_ARGS__)
+#define vadd_vx_i8mf2_m(...) __riscv_vadd_vx_i8mf2_tumu(__VA_ARGS__)
+#define vadd_vv_i8m1_m(...) __riscv_vadd_vv_i8m1_tumu(__VA_ARGS__)
+#define vadd_vx_i8m1_m(...) __riscv_vadd_vx_i8m1_tumu(__VA_ARGS__)
+#define vadd_vv_i8m2_m(...) __riscv_vadd_vv_i8m2_tumu(__VA_ARGS__)
+#define vadd_vx_i8m2_m(...) __riscv_vadd_vx_i8m2_tumu(__VA_ARGS__)
+#define vadd_vv_i8m4_m(...) __riscv_vadd_vv_i8m4_tumu(__VA_ARGS__)
+#define vadd_vx_i8m4_m(...) __riscv_vadd_vx_i8m4_tumu(__VA_ARGS__)
+#define vadd_vv_i8m8_m(...) __riscv_vadd_vv_i8m8_tumu(__VA_ARGS__)
+#define vadd_vx_i8m8_m(...) __riscv_vadd_vx_i8m8_tumu(__VA_ARGS__)
+#define vadd_vv_i16mf4_m(...) __riscv_vadd_vv_i16mf4_tumu(__VA_ARGS__)
+#define vadd_vx_i16mf4_m(...) __riscv_vadd_vx_i16mf4_tumu(__VA_ARGS__)
+#define vadd_vv_i16mf2_m(...) __riscv_vadd_vv_i16mf2_tumu(__VA_ARGS__)
+#define vadd_vx_i16mf2_m(...) __riscv_vadd_vx_i16mf2_tumu(__VA_ARGS__)
+#define vadd_vv_i16m1_m(...) __riscv_vadd_vv_i16m1_tumu(__VA_ARGS__)
+#define vadd_vx_i16m1_m(...) __riscv_vadd_vx_i16m1_tumu(__VA_ARGS__)
+#define vadd_vv_i16m2_m(...) __riscv_vadd_vv_i16m2_tumu(__VA_ARGS__)
+#define vadd_vx_i16m2_m(...) __riscv_vadd_vx_i16m2_tumu(__VA_ARGS__)
+#define vadd_vv_i16m4_m(...) __riscv_vadd_vv_i16m4_tumu(__VA_ARGS__)
+#define vadd_vx_i16m4_m(...) __riscv_vadd_vx_i16m4_tumu(__VA_ARGS__)
+#define vadd_vv_i16m8_m(...) __riscv_vadd_vv_i16m8_tumu(__VA_ARGS__)
+#define vadd_vx_i16m8_m(...) __riscv_vadd_vx_i16m8_tumu(__VA_ARGS__)
+#define vadd_vv_i32mf2_m(...) __riscv_vadd_vv_i32mf2_tumu(__VA_ARGS__)
+#define vadd_vx_i32mf2_m(...) __riscv_vadd_vx_i32mf2_tumu(__VA_ARGS__)
+#define vadd_vv_i32m1_m(...) __riscv_vadd_vv_i32m1_tumu(__VA_ARGS__)
+#define vadd_vx_i32m1_m(...) __riscv_vadd_vx_i32m1_tumu(__VA_ARGS__)
+#define vadd_vv_i32m2_m(...) __riscv_vadd_vv_i32m2_tumu(__VA_ARGS__)
+#define vadd_vx_i32m2_m(...) __riscv_vadd_vx_i32m2_tumu(__VA_ARGS__)
+#define vadd_vv_i32m4_m(...) __riscv_vadd_vv_i32m4_tumu(__VA_ARGS__)
+#define vadd_vx_i32m4_m(...) __riscv_vadd_vx_i32m4_tumu(__VA_ARGS__)
+#define vadd_vv_i32m8_m(...) __riscv_vadd_vv_i32m8_tumu(__VA_ARGS__)
+#define vadd_vx_i32m8_m(...) __riscv_vadd_vx_i32m8_tumu(__VA_ARGS__)
+#define vadd_vv_i64m1_m(...) __riscv_vadd_vv_i64m1_tumu(__VA_ARGS__)
+#define vadd_vx_i64m1_m(...) __riscv_vadd_vx_i64m1_tumu(__VA_ARGS__)
+#define vadd_vv_i64m2_m(...) __riscv_vadd_vv_i64m2_tumu(__VA_ARGS__)
+#define vadd_vx_i64m2_m(...) __riscv_vadd_vx_i64m2_tumu(__VA_ARGS__)
+#define vadd_vv_i64m4_m(...) __riscv_vadd_vv_i64m4_tumu(__VA_ARGS__)
+#define vadd_vx_i64m4_m(...) __riscv_vadd_vx_i64m4_tumu(__VA_ARGS__)
+#define vadd_vv_i64m8_m(...) __riscv_vadd_vv_i64m8_tumu(__VA_ARGS__)
+#define vadd_vx_i64m8_m(...) __riscv_vadd_vx_i64m8_tumu(__VA_ARGS__)
+#define vsub_vv_i8mf8_m(...) __riscv_vsub_vv_i8mf8_tumu(__VA_ARGS__)
+#define vsub_vx_i8mf8_m(...) __riscv_vsub_vx_i8mf8_tumu(__VA_ARGS__)
+#define vsub_vv_i8mf4_m(...) __riscv_vsub_vv_i8mf4_tumu(__VA_ARGS__)
+#define vsub_vx_i8mf4_m(...) __riscv_vsub_vx_i8mf4_tumu(__VA_ARGS__)
+#define vsub_vv_i8mf2_m(...) __riscv_vsub_vv_i8mf2_tumu(__VA_ARGS__)
+#define vsub_vx_i8mf2_m(...) __riscv_vsub_vx_i8mf2_tumu(__VA_ARGS__)
+#define vsub_vv_i8m1_m(...) __riscv_vsub_vv_i8m1_tumu(__VA_ARGS__)
+#define vsub_vx_i8m1_m(...) __riscv_vsub_vx_i8m1_tumu(__VA_ARGS__)
+#define vsub_vv_i8m2_m(...) __riscv_vsub_vv_i8m2_tumu(__VA_ARGS__)
+#define vsub_vx_i8m2_m(...) __riscv_vsub_vx_i8m2_tumu(__VA_ARGS__)
+#define vsub_vv_i8m4_m(...) __riscv_vsub_vv_i8m4_tumu(__VA_ARGS__)
+#define vsub_vx_i8m4_m(...) __riscv_vsub_vx_i8m4_tumu(__VA_ARGS__)
+#define vsub_vv_i8m8_m(...) __riscv_vsub_vv_i8m8_tumu(__VA_ARGS__)
+#define vsub_vx_i8m8_m(...) __riscv_vsub_vx_i8m8_tumu(__VA_ARGS__)
+#define vsub_vv_i16mf4_m(...) __riscv_vsub_vv_i16mf4_tumu(__VA_ARGS__)
+#define vsub_vx_i16mf4_m(...) __riscv_vsub_vx_i16mf4_tumu(__VA_ARGS__)
+#define vsub_vv_i16mf2_m(...) __riscv_vsub_vv_i16mf2_tumu(__VA_ARGS__)
+#define vsub_vx_i16mf2_m(...) __riscv_vsub_vx_i16mf2_tumu(__VA_ARGS__)
+#define vsub_vv_i16m1_m(...) __riscv_vsub_vv_i16m1_tumu(__VA_ARGS__)
+#define vsub_vx_i16m1_m(...) __riscv_vsub_vx_i16m1_tumu(__VA_ARGS__)
+#define vsub_vv_i16m2_m(...) __riscv_vsub_vv_i16m2_tumu(__VA_ARGS__)
+#define vsub_vx_i16m2_m(...) __riscv_vsub_vx_i16m2_tumu(__VA_ARGS__)
+#define vsub_vv_i16m4_m(...) __riscv_vsub_vv_i16m4_tumu(__VA_ARGS__)
+#define vsub_vx_i16m4_m(...) __riscv_vsub_vx_i16m4_tumu(__VA_ARGS__)
+#define vsub_vv_i16m8_m(...) __riscv_vsub_vv_i16m8_tumu(__VA_ARGS__)
+#define vsub_vx_i16m8_m(...) __riscv_vsub_vx_i16m8_tumu(__VA_ARGS__)
+#define vsub_vv_i32mf2_m(...) __riscv_vsub_vv_i32mf2_tumu(__VA_ARGS__)
+#define vsub_vx_i32mf2_m(...) __riscv_vsub_vx_i32mf2_tumu(__VA_ARGS__)
+#define vsub_vv_i32m1_m(...) __riscv_vsub_vv_i32m1_tumu(__VA_ARGS__)
+#define vsub_vx_i32m1_m(...) __riscv_vsub_vx_i32m1_tumu(__VA_ARGS__)
+#define vsub_vv_i32m2_m(...) __riscv_vsub_vv_i32m2_tumu(__VA_ARGS__)
+#define vsub_vx_i32m2_m(...) __riscv_vsub_vx_i32m2_tumu(__VA_ARGS__)
+#define vsub_vv_i32m4_m(...) __riscv_vsub_vv_i32m4_tumu(__VA_ARGS__)
+#define vsub_vx_i32m4_m(...) __riscv_vsub_vx_i32m4_tumu(__VA_ARGS__)
+#define vsub_vv_i32m8_m(...) __riscv_vsub_vv_i32m8_tumu(__VA_ARGS__)
+#define vsub_vx_i32m8_m(...) __riscv_vsub_vx_i32m8_tumu(__VA_ARGS__)
+#define vsub_vv_i64m1_m(...) __riscv_vsub_vv_i64m1_tumu(__VA_ARGS__)
+#define vsub_vx_i64m1_m(...) __riscv_vsub_vx_i64m1_tumu(__VA_ARGS__)
+#define vsub_vv_i64m2_m(...) __riscv_vsub_vv_i64m2_tumu(__VA_ARGS__)
+#define vsub_vx_i64m2_m(...) __riscv_vsub_vx_i64m2_tumu(__VA_ARGS__)
+#define vsub_vv_i64m4_m(...) __riscv_vsub_vv_i64m4_tumu(__VA_ARGS__)
+#define vsub_vx_i64m4_m(...) __riscv_vsub_vx_i64m4_tumu(__VA_ARGS__)
+#define vsub_vv_i64m8_m(...) __riscv_vsub_vv_i64m8_tumu(__VA_ARGS__)
+#define vsub_vx_i64m8_m(...) __riscv_vsub_vx_i64m8_tumu(__VA_ARGS__)
+#define vrsub_vx_i8mf8_m(...) __riscv_vrsub_vx_i8mf8_tumu(__VA_ARGS__) // vrsub_vx, i8..i64 suffixes: each legacy *_m name forwards all arguments to __riscv_*_tumu
+#define vrsub_vx_i8mf4_m(...) __riscv_vrsub_vx_i8mf4_tumu(__VA_ARGS__)
+#define vrsub_vx_i8mf2_m(...) __riscv_vrsub_vx_i8mf2_tumu(__VA_ARGS__)
+#define vrsub_vx_i8m1_m(...) __riscv_vrsub_vx_i8m1_tumu(__VA_ARGS__)
+#define vrsub_vx_i8m2_m(...) __riscv_vrsub_vx_i8m2_tumu(__VA_ARGS__)
+#define vrsub_vx_i8m4_m(...) __riscv_vrsub_vx_i8m4_tumu(__VA_ARGS__)
+#define vrsub_vx_i8m8_m(...) __riscv_vrsub_vx_i8m8_tumu(__VA_ARGS__)
+#define vrsub_vx_i16mf4_m(...) __riscv_vrsub_vx_i16mf4_tumu(__VA_ARGS__)
+#define vrsub_vx_i16mf2_m(...) __riscv_vrsub_vx_i16mf2_tumu(__VA_ARGS__)
+#define vrsub_vx_i16m1_m(...) __riscv_vrsub_vx_i16m1_tumu(__VA_ARGS__)
+#define vrsub_vx_i16m2_m(...) __riscv_vrsub_vx_i16m2_tumu(__VA_ARGS__)
+#define vrsub_vx_i16m4_m(...) __riscv_vrsub_vx_i16m4_tumu(__VA_ARGS__)
+#define vrsub_vx_i16m8_m(...) __riscv_vrsub_vx_i16m8_tumu(__VA_ARGS__)
+#define vrsub_vx_i32mf2_m(...) __riscv_vrsub_vx_i32mf2_tumu(__VA_ARGS__)
+#define vrsub_vx_i32m1_m(...) __riscv_vrsub_vx_i32m1_tumu(__VA_ARGS__)
+#define vrsub_vx_i32m2_m(...) __riscv_vrsub_vx_i32m2_tumu(__VA_ARGS__)
+#define vrsub_vx_i32m4_m(...) __riscv_vrsub_vx_i32m4_tumu(__VA_ARGS__)
+#define vrsub_vx_i32m8_m(...) __riscv_vrsub_vx_i32m8_tumu(__VA_ARGS__)
+#define vrsub_vx_i64m1_m(...) __riscv_vrsub_vx_i64m1_tumu(__VA_ARGS__)
+#define vrsub_vx_i64m2_m(...) __riscv_vrsub_vx_i64m2_tumu(__VA_ARGS__)
+#define vrsub_vx_i64m4_m(...) __riscv_vrsub_vx_i64m4_tumu(__VA_ARGS__)
+#define vrsub_vx_i64m8_m(...) __riscv_vrsub_vx_i64m8_tumu(__VA_ARGS__)
+#define vneg_v_i8mf8_m(...) __riscv_vneg_v_i8mf8_tumu(__VA_ARGS__) // vneg_v, i8..i64 suffixes: each legacy *_m name forwards all arguments to __riscv_*_tumu
+#define vneg_v_i8mf4_m(...) __riscv_vneg_v_i8mf4_tumu(__VA_ARGS__)
+#define vneg_v_i8mf2_m(...) __riscv_vneg_v_i8mf2_tumu(__VA_ARGS__)
+#define vneg_v_i8m1_m(...) __riscv_vneg_v_i8m1_tumu(__VA_ARGS__)
+#define vneg_v_i8m2_m(...) __riscv_vneg_v_i8m2_tumu(__VA_ARGS__)
+#define vneg_v_i8m4_m(...) __riscv_vneg_v_i8m4_tumu(__VA_ARGS__)
+#define vneg_v_i8m8_m(...) __riscv_vneg_v_i8m8_tumu(__VA_ARGS__)
+#define vneg_v_i16mf4_m(...) __riscv_vneg_v_i16mf4_tumu(__VA_ARGS__)
+#define vneg_v_i16mf2_m(...) __riscv_vneg_v_i16mf2_tumu(__VA_ARGS__)
+#define vneg_v_i16m1_m(...) __riscv_vneg_v_i16m1_tumu(__VA_ARGS__)
+#define vneg_v_i16m2_m(...) __riscv_vneg_v_i16m2_tumu(__VA_ARGS__)
+#define vneg_v_i16m4_m(...) __riscv_vneg_v_i16m4_tumu(__VA_ARGS__)
+#define vneg_v_i16m8_m(...) __riscv_vneg_v_i16m8_tumu(__VA_ARGS__)
+#define vneg_v_i32mf2_m(...) __riscv_vneg_v_i32mf2_tumu(__VA_ARGS__)
+#define vneg_v_i32m1_m(...) __riscv_vneg_v_i32m1_tumu(__VA_ARGS__)
+#define vneg_v_i32m2_m(...) __riscv_vneg_v_i32m2_tumu(__VA_ARGS__)
+#define vneg_v_i32m4_m(...) __riscv_vneg_v_i32m4_tumu(__VA_ARGS__)
+#define vneg_v_i32m8_m(...) __riscv_vneg_v_i32m8_tumu(__VA_ARGS__)
+#define vneg_v_i64m1_m(...) __riscv_vneg_v_i64m1_tumu(__VA_ARGS__)
+#define vneg_v_i64m2_m(...) __riscv_vneg_v_i64m2_tumu(__VA_ARGS__)
+#define vneg_v_i64m4_m(...) __riscv_vneg_v_i64m4_tumu(__VA_ARGS__)
+#define vneg_v_i64m8_m(...) __riscv_vneg_v_i64m8_tumu(__VA_ARGS__)
+#define vadd_vv_u8mf8_m(...) __riscv_vadd_vv_u8mf8_tumu(__VA_ARGS__) // vadd _vv/_vx, u8..u64 suffixes: each legacy *_m name forwards all arguments to __riscv_*_tumu
+#define vadd_vx_u8mf8_m(...) __riscv_vadd_vx_u8mf8_tumu(__VA_ARGS__)
+#define vadd_vv_u8mf4_m(...) __riscv_vadd_vv_u8mf4_tumu(__VA_ARGS__)
+#define vadd_vx_u8mf4_m(...) __riscv_vadd_vx_u8mf4_tumu(__VA_ARGS__)
+#define vadd_vv_u8mf2_m(...) __riscv_vadd_vv_u8mf2_tumu(__VA_ARGS__)
+#define vadd_vx_u8mf2_m(...) __riscv_vadd_vx_u8mf2_tumu(__VA_ARGS__)
+#define vadd_vv_u8m1_m(...) __riscv_vadd_vv_u8m1_tumu(__VA_ARGS__)
+#define vadd_vx_u8m1_m(...) __riscv_vadd_vx_u8m1_tumu(__VA_ARGS__)
+#define vadd_vv_u8m2_m(...) __riscv_vadd_vv_u8m2_tumu(__VA_ARGS__)
+#define vadd_vx_u8m2_m(...) __riscv_vadd_vx_u8m2_tumu(__VA_ARGS__)
+#define vadd_vv_u8m4_m(...) __riscv_vadd_vv_u8m4_tumu(__VA_ARGS__)
+#define vadd_vx_u8m4_m(...) __riscv_vadd_vx_u8m4_tumu(__VA_ARGS__)
+#define vadd_vv_u8m8_m(...) __riscv_vadd_vv_u8m8_tumu(__VA_ARGS__)
+#define vadd_vx_u8m8_m(...) __riscv_vadd_vx_u8m8_tumu(__VA_ARGS__)
+#define vadd_vv_u16mf4_m(...) __riscv_vadd_vv_u16mf4_tumu(__VA_ARGS__)
+#define vadd_vx_u16mf4_m(...) __riscv_vadd_vx_u16mf4_tumu(__VA_ARGS__)
+#define vadd_vv_u16mf2_m(...) __riscv_vadd_vv_u16mf2_tumu(__VA_ARGS__)
+#define vadd_vx_u16mf2_m(...) __riscv_vadd_vx_u16mf2_tumu(__VA_ARGS__)
+#define vadd_vv_u16m1_m(...) __riscv_vadd_vv_u16m1_tumu(__VA_ARGS__)
+#define vadd_vx_u16m1_m(...) __riscv_vadd_vx_u16m1_tumu(__VA_ARGS__)
+#define vadd_vv_u16m2_m(...) __riscv_vadd_vv_u16m2_tumu(__VA_ARGS__)
+#define vadd_vx_u16m2_m(...) __riscv_vadd_vx_u16m2_tumu(__VA_ARGS__)
+#define vadd_vv_u16m4_m(...) __riscv_vadd_vv_u16m4_tumu(__VA_ARGS__)
+#define vadd_vx_u16m4_m(...) __riscv_vadd_vx_u16m4_tumu(__VA_ARGS__)
+#define vadd_vv_u16m8_m(...) __riscv_vadd_vv_u16m8_tumu(__VA_ARGS__)
+#define vadd_vx_u16m8_m(...) __riscv_vadd_vx_u16m8_tumu(__VA_ARGS__)
+#define vadd_vv_u32mf2_m(...) __riscv_vadd_vv_u32mf2_tumu(__VA_ARGS__)
+#define vadd_vx_u32mf2_m(...) __riscv_vadd_vx_u32mf2_tumu(__VA_ARGS__)
+#define vadd_vv_u32m1_m(...) __riscv_vadd_vv_u32m1_tumu(__VA_ARGS__)
+#define vadd_vx_u32m1_m(...) __riscv_vadd_vx_u32m1_tumu(__VA_ARGS__)
+#define vadd_vv_u32m2_m(...) __riscv_vadd_vv_u32m2_tumu(__VA_ARGS__)
+#define vadd_vx_u32m2_m(...) __riscv_vadd_vx_u32m2_tumu(__VA_ARGS__)
+#define vadd_vv_u32m4_m(...) __riscv_vadd_vv_u32m4_tumu(__VA_ARGS__)
+#define vadd_vx_u32m4_m(...) __riscv_vadd_vx_u32m4_tumu(__VA_ARGS__)
+#define vadd_vv_u32m8_m(...) __riscv_vadd_vv_u32m8_tumu(__VA_ARGS__)
+#define vadd_vx_u32m8_m(...) __riscv_vadd_vx_u32m8_tumu(__VA_ARGS__)
+#define vadd_vv_u64m1_m(...) __riscv_vadd_vv_u64m1_tumu(__VA_ARGS__)
+#define vadd_vx_u64m1_m(...) __riscv_vadd_vx_u64m1_tumu(__VA_ARGS__)
+#define vadd_vv_u64m2_m(...) __riscv_vadd_vv_u64m2_tumu(__VA_ARGS__)
+#define vadd_vx_u64m2_m(...) __riscv_vadd_vx_u64m2_tumu(__VA_ARGS__)
+#define vadd_vv_u64m4_m(...) __riscv_vadd_vv_u64m4_tumu(__VA_ARGS__)
+#define vadd_vx_u64m4_m(...) __riscv_vadd_vx_u64m4_tumu(__VA_ARGS__)
+#define vadd_vv_u64m8_m(...) __riscv_vadd_vv_u64m8_tumu(__VA_ARGS__)
+#define vadd_vx_u64m8_m(...) __riscv_vadd_vx_u64m8_tumu(__VA_ARGS__)
+#define vsub_vv_u8mf8_m(...) __riscv_vsub_vv_u8mf8_tumu(__VA_ARGS__) // vsub _vv/_vx, u8..u64 suffixes: each legacy *_m name forwards all arguments to __riscv_*_tumu
+#define vsub_vx_u8mf8_m(...) __riscv_vsub_vx_u8mf8_tumu(__VA_ARGS__)
+#define vsub_vv_u8mf4_m(...) __riscv_vsub_vv_u8mf4_tumu(__VA_ARGS__)
+#define vsub_vx_u8mf4_m(...) __riscv_vsub_vx_u8mf4_tumu(__VA_ARGS__)
+#define vsub_vv_u8mf2_m(...) __riscv_vsub_vv_u8mf2_tumu(__VA_ARGS__)
+#define vsub_vx_u8mf2_m(...) __riscv_vsub_vx_u8mf2_tumu(__VA_ARGS__)
+#define vsub_vv_u8m1_m(...) __riscv_vsub_vv_u8m1_tumu(__VA_ARGS__)
+#define vsub_vx_u8m1_m(...) __riscv_vsub_vx_u8m1_tumu(__VA_ARGS__)
+#define vsub_vv_u8m2_m(...) __riscv_vsub_vv_u8m2_tumu(__VA_ARGS__)
+#define vsub_vx_u8m2_m(...) __riscv_vsub_vx_u8m2_tumu(__VA_ARGS__)
+#define vsub_vv_u8m4_m(...) __riscv_vsub_vv_u8m4_tumu(__VA_ARGS__)
+#define vsub_vx_u8m4_m(...) __riscv_vsub_vx_u8m4_tumu(__VA_ARGS__)
+#define vsub_vv_u8m8_m(...) __riscv_vsub_vv_u8m8_tumu(__VA_ARGS__)
+#define vsub_vx_u8m8_m(...) __riscv_vsub_vx_u8m8_tumu(__VA_ARGS__)
+#define vsub_vv_u16mf4_m(...) __riscv_vsub_vv_u16mf4_tumu(__VA_ARGS__)
+#define vsub_vx_u16mf4_m(...) __riscv_vsub_vx_u16mf4_tumu(__VA_ARGS__)
+#define vsub_vv_u16mf2_m(...) __riscv_vsub_vv_u16mf2_tumu(__VA_ARGS__)
+#define vsub_vx_u16mf2_m(...) __riscv_vsub_vx_u16mf2_tumu(__VA_ARGS__)
+#define vsub_vv_u16m1_m(...) __riscv_vsub_vv_u16m1_tumu(__VA_ARGS__)
+#define vsub_vx_u16m1_m(...) __riscv_vsub_vx_u16m1_tumu(__VA_ARGS__)
+#define vsub_vv_u16m2_m(...) __riscv_vsub_vv_u16m2_tumu(__VA_ARGS__)
+#define vsub_vx_u16m2_m(...) __riscv_vsub_vx_u16m2_tumu(__VA_ARGS__)
+#define vsub_vv_u16m4_m(...) __riscv_vsub_vv_u16m4_tumu(__VA_ARGS__)
+#define vsub_vx_u16m4_m(...) __riscv_vsub_vx_u16m4_tumu(__VA_ARGS__)
+#define vsub_vv_u16m8_m(...) __riscv_vsub_vv_u16m8_tumu(__VA_ARGS__)
+#define vsub_vx_u16m8_m(...) __riscv_vsub_vx_u16m8_tumu(__VA_ARGS__)
+#define vsub_vv_u32mf2_m(...) __riscv_vsub_vv_u32mf2_tumu(__VA_ARGS__)
+#define vsub_vx_u32mf2_m(...) __riscv_vsub_vx_u32mf2_tumu(__VA_ARGS__)
+#define vsub_vv_u32m1_m(...) __riscv_vsub_vv_u32m1_tumu(__VA_ARGS__)
+#define vsub_vx_u32m1_m(...) __riscv_vsub_vx_u32m1_tumu(__VA_ARGS__)
+#define vsub_vv_u32m2_m(...) __riscv_vsub_vv_u32m2_tumu(__VA_ARGS__)
+#define vsub_vx_u32m2_m(...) __riscv_vsub_vx_u32m2_tumu(__VA_ARGS__)
+#define vsub_vv_u32m4_m(...) __riscv_vsub_vv_u32m4_tumu(__VA_ARGS__)
+#define vsub_vx_u32m4_m(...) __riscv_vsub_vx_u32m4_tumu(__VA_ARGS__)
+#define vsub_vv_u32m8_m(...) __riscv_vsub_vv_u32m8_tumu(__VA_ARGS__)
+#define vsub_vx_u32m8_m(...) __riscv_vsub_vx_u32m8_tumu(__VA_ARGS__)
+#define vsub_vv_u64m1_m(...) __riscv_vsub_vv_u64m1_tumu(__VA_ARGS__)
+#define vsub_vx_u64m1_m(...) __riscv_vsub_vx_u64m1_tumu(__VA_ARGS__)
+#define vsub_vv_u64m2_m(...) __riscv_vsub_vv_u64m2_tumu(__VA_ARGS__)
+#define vsub_vx_u64m2_m(...) __riscv_vsub_vx_u64m2_tumu(__VA_ARGS__)
+#define vsub_vv_u64m4_m(...) __riscv_vsub_vv_u64m4_tumu(__VA_ARGS__)
+#define vsub_vx_u64m4_m(...) __riscv_vsub_vx_u64m4_tumu(__VA_ARGS__)
+#define vsub_vv_u64m8_m(...) __riscv_vsub_vv_u64m8_tumu(__VA_ARGS__)
+#define vsub_vx_u64m8_m(...) __riscv_vsub_vx_u64m8_tumu(__VA_ARGS__)
+#define vrsub_vx_u8mf8_m(...) __riscv_vrsub_vx_u8mf8_tumu(__VA_ARGS__) // vrsub_vx, u8..u64 suffixes: each legacy *_m name forwards all arguments to __riscv_*_tumu
+#define vrsub_vx_u8mf4_m(...) __riscv_vrsub_vx_u8mf4_tumu(__VA_ARGS__)
+#define vrsub_vx_u8mf2_m(...) __riscv_vrsub_vx_u8mf2_tumu(__VA_ARGS__)
+#define vrsub_vx_u8m1_m(...) __riscv_vrsub_vx_u8m1_tumu(__VA_ARGS__)
+#define vrsub_vx_u8m2_m(...) __riscv_vrsub_vx_u8m2_tumu(__VA_ARGS__)
+#define vrsub_vx_u8m4_m(...) __riscv_vrsub_vx_u8m4_tumu(__VA_ARGS__)
+#define vrsub_vx_u8m8_m(...) __riscv_vrsub_vx_u8m8_tumu(__VA_ARGS__)
+#define vrsub_vx_u16mf4_m(...) __riscv_vrsub_vx_u16mf4_tumu(__VA_ARGS__)
+#define vrsub_vx_u16mf2_m(...) __riscv_vrsub_vx_u16mf2_tumu(__VA_ARGS__)
+#define vrsub_vx_u16m1_m(...) __riscv_vrsub_vx_u16m1_tumu(__VA_ARGS__)
+#define vrsub_vx_u16m2_m(...) __riscv_vrsub_vx_u16m2_tumu(__VA_ARGS__)
+#define vrsub_vx_u16m4_m(...) __riscv_vrsub_vx_u16m4_tumu(__VA_ARGS__)
+#define vrsub_vx_u16m8_m(...) __riscv_vrsub_vx_u16m8_tumu(__VA_ARGS__)
+#define vrsub_vx_u32mf2_m(...) __riscv_vrsub_vx_u32mf2_tumu(__VA_ARGS__)
+#define vrsub_vx_u32m1_m(...) __riscv_vrsub_vx_u32m1_tumu(__VA_ARGS__)
+#define vrsub_vx_u32m2_m(...) __riscv_vrsub_vx_u32m2_tumu(__VA_ARGS__)
+#define vrsub_vx_u32m4_m(...) __riscv_vrsub_vx_u32m4_tumu(__VA_ARGS__)
+#define vrsub_vx_u32m8_m(...) __riscv_vrsub_vx_u32m8_tumu(__VA_ARGS__)
+#define vrsub_vx_u64m1_m(...) __riscv_vrsub_vx_u64m1_tumu(__VA_ARGS__)
+#define vrsub_vx_u64m2_m(...) __riscv_vrsub_vx_u64m2_tumu(__VA_ARGS__)
+#define vrsub_vx_u64m4_m(...) __riscv_vrsub_vx_u64m4_tumu(__VA_ARGS__)
+#define vrsub_vx_u64m8_m(...) __riscv_vrsub_vx_u64m8_tumu(__VA_ARGS__)
+#define vwadd_vv_i16mf4(...) __riscv_vwadd_vv_i16mf4(__VA_ARGS__) // vwadd _vv/_vx/_wv/_wx, i16..i64 suffixes: each legacy unprefixed name forwards all arguments to the __riscv_-prefixed intrinsic
+#define vwadd_vx_i16mf4(...) __riscv_vwadd_vx_i16mf4(__VA_ARGS__)
+#define vwadd_wv_i16mf4(...) __riscv_vwadd_wv_i16mf4(__VA_ARGS__)
+#define vwadd_wx_i16mf4(...) __riscv_vwadd_wx_i16mf4(__VA_ARGS__)
+#define vwadd_vv_i16mf2(...) __riscv_vwadd_vv_i16mf2(__VA_ARGS__)
+#define vwadd_vx_i16mf2(...) __riscv_vwadd_vx_i16mf2(__VA_ARGS__)
+#define vwadd_wv_i16mf2(...) __riscv_vwadd_wv_i16mf2(__VA_ARGS__)
+#define vwadd_wx_i16mf2(...) __riscv_vwadd_wx_i16mf2(__VA_ARGS__)
+#define vwadd_vv_i16m1(...) __riscv_vwadd_vv_i16m1(__VA_ARGS__)
+#define vwadd_vx_i16m1(...) __riscv_vwadd_vx_i16m1(__VA_ARGS__)
+#define vwadd_wv_i16m1(...) __riscv_vwadd_wv_i16m1(__VA_ARGS__)
+#define vwadd_wx_i16m1(...) __riscv_vwadd_wx_i16m1(__VA_ARGS__)
+#define vwadd_vv_i16m2(...) __riscv_vwadd_vv_i16m2(__VA_ARGS__)
+#define vwadd_vx_i16m2(...) __riscv_vwadd_vx_i16m2(__VA_ARGS__)
+#define vwadd_wv_i16m2(...) __riscv_vwadd_wv_i16m2(__VA_ARGS__)
+#define vwadd_wx_i16m2(...) __riscv_vwadd_wx_i16m2(__VA_ARGS__)
+#define vwadd_vv_i16m4(...) __riscv_vwadd_vv_i16m4(__VA_ARGS__)
+#define vwadd_vx_i16m4(...) __riscv_vwadd_vx_i16m4(__VA_ARGS__)
+#define vwadd_wv_i16m4(...) __riscv_vwadd_wv_i16m4(__VA_ARGS__)
+#define vwadd_wx_i16m4(...) __riscv_vwadd_wx_i16m4(__VA_ARGS__)
+#define vwadd_vv_i16m8(...) __riscv_vwadd_vv_i16m8(__VA_ARGS__)
+#define vwadd_vx_i16m8(...) __riscv_vwadd_vx_i16m8(__VA_ARGS__)
+#define vwadd_wv_i16m8(...) __riscv_vwadd_wv_i16m8(__VA_ARGS__)
+#define vwadd_wx_i16m8(...) __riscv_vwadd_wx_i16m8(__VA_ARGS__)
+#define vwadd_vv_i32mf2(...) __riscv_vwadd_vv_i32mf2(__VA_ARGS__)
+#define vwadd_vx_i32mf2(...) __riscv_vwadd_vx_i32mf2(__VA_ARGS__)
+#define vwadd_wv_i32mf2(...) __riscv_vwadd_wv_i32mf2(__VA_ARGS__)
+#define vwadd_wx_i32mf2(...) __riscv_vwadd_wx_i32mf2(__VA_ARGS__)
+#define vwadd_vv_i32m1(...) __riscv_vwadd_vv_i32m1(__VA_ARGS__)
+#define vwadd_vx_i32m1(...) __riscv_vwadd_vx_i32m1(__VA_ARGS__)
+#define vwadd_wv_i32m1(...) __riscv_vwadd_wv_i32m1(__VA_ARGS__)
+#define vwadd_wx_i32m1(...) __riscv_vwadd_wx_i32m1(__VA_ARGS__)
+#define vwadd_vv_i32m2(...) __riscv_vwadd_vv_i32m2(__VA_ARGS__)
+#define vwadd_vx_i32m2(...) __riscv_vwadd_vx_i32m2(__VA_ARGS__)
+#define vwadd_wv_i32m2(...) __riscv_vwadd_wv_i32m2(__VA_ARGS__)
+#define vwadd_wx_i32m2(...) __riscv_vwadd_wx_i32m2(__VA_ARGS__)
+#define vwadd_vv_i32m4(...) __riscv_vwadd_vv_i32m4(__VA_ARGS__)
+#define vwadd_vx_i32m4(...) __riscv_vwadd_vx_i32m4(__VA_ARGS__)
+#define vwadd_wv_i32m4(...) __riscv_vwadd_wv_i32m4(__VA_ARGS__)
+#define vwadd_wx_i32m4(...) __riscv_vwadd_wx_i32m4(__VA_ARGS__)
+#define vwadd_vv_i32m8(...) __riscv_vwadd_vv_i32m8(__VA_ARGS__)
+#define vwadd_vx_i32m8(...) __riscv_vwadd_vx_i32m8(__VA_ARGS__)
+#define vwadd_wv_i32m8(...) __riscv_vwadd_wv_i32m8(__VA_ARGS__)
+#define vwadd_wx_i32m8(...) __riscv_vwadd_wx_i32m8(__VA_ARGS__)
+#define vwadd_vv_i64m1(...) __riscv_vwadd_vv_i64m1(__VA_ARGS__)
+#define vwadd_vx_i64m1(...) __riscv_vwadd_vx_i64m1(__VA_ARGS__)
+#define vwadd_wv_i64m1(...) __riscv_vwadd_wv_i64m1(__VA_ARGS__)
+#define vwadd_wx_i64m1(...) __riscv_vwadd_wx_i64m1(__VA_ARGS__)
+#define vwadd_vv_i64m2(...) __riscv_vwadd_vv_i64m2(__VA_ARGS__)
+#define vwadd_vx_i64m2(...) __riscv_vwadd_vx_i64m2(__VA_ARGS__)
+#define vwadd_wv_i64m2(...) __riscv_vwadd_wv_i64m2(__VA_ARGS__)
+#define vwadd_wx_i64m2(...) __riscv_vwadd_wx_i64m2(__VA_ARGS__)
+#define vwadd_vv_i64m4(...) __riscv_vwadd_vv_i64m4(__VA_ARGS__)
+#define vwadd_vx_i64m4(...) __riscv_vwadd_vx_i64m4(__VA_ARGS__)
+#define vwadd_wv_i64m4(...) __riscv_vwadd_wv_i64m4(__VA_ARGS__)
+#define vwadd_wx_i64m4(...) __riscv_vwadd_wx_i64m4(__VA_ARGS__)
+#define vwadd_vv_i64m8(...) __riscv_vwadd_vv_i64m8(__VA_ARGS__)
+#define vwadd_vx_i64m8(...) __riscv_vwadd_vx_i64m8(__VA_ARGS__)
+#define vwadd_wv_i64m8(...) __riscv_vwadd_wv_i64m8(__VA_ARGS__)
+#define vwadd_wx_i64m8(...) __riscv_vwadd_wx_i64m8(__VA_ARGS__)
+#define vwsub_vv_i16mf4(...) __riscv_vwsub_vv_i16mf4(__VA_ARGS__) // vwsub _vv/_vx/_wv/_wx, i16..i64 suffixes: each legacy unprefixed name forwards all arguments to the __riscv_-prefixed intrinsic
+#define vwsub_vx_i16mf4(...) __riscv_vwsub_vx_i16mf4(__VA_ARGS__)
+#define vwsub_wv_i16mf4(...) __riscv_vwsub_wv_i16mf4(__VA_ARGS__)
+#define vwsub_wx_i16mf4(...) __riscv_vwsub_wx_i16mf4(__VA_ARGS__)
+#define vwsub_vv_i16mf2(...) __riscv_vwsub_vv_i16mf2(__VA_ARGS__)
+#define vwsub_vx_i16mf2(...) __riscv_vwsub_vx_i16mf2(__VA_ARGS__)
+#define vwsub_wv_i16mf2(...) __riscv_vwsub_wv_i16mf2(__VA_ARGS__)
+#define vwsub_wx_i16mf2(...) __riscv_vwsub_wx_i16mf2(__VA_ARGS__)
+#define vwsub_vv_i16m1(...) __riscv_vwsub_vv_i16m1(__VA_ARGS__)
+#define vwsub_vx_i16m1(...) __riscv_vwsub_vx_i16m1(__VA_ARGS__)
+#define vwsub_wv_i16m1(...) __riscv_vwsub_wv_i16m1(__VA_ARGS__)
+#define vwsub_wx_i16m1(...) __riscv_vwsub_wx_i16m1(__VA_ARGS__)
+#define vwsub_vv_i16m2(...) __riscv_vwsub_vv_i16m2(__VA_ARGS__)
+#define vwsub_vx_i16m2(...) __riscv_vwsub_vx_i16m2(__VA_ARGS__)
+#define vwsub_wv_i16m2(...) __riscv_vwsub_wv_i16m2(__VA_ARGS__)
+#define vwsub_wx_i16m2(...) __riscv_vwsub_wx_i16m2(__VA_ARGS__)
+#define vwsub_vv_i16m4(...) __riscv_vwsub_vv_i16m4(__VA_ARGS__)
+#define vwsub_vx_i16m4(...) __riscv_vwsub_vx_i16m4(__VA_ARGS__)
+#define vwsub_wv_i16m4(...) __riscv_vwsub_wv_i16m4(__VA_ARGS__)
+#define vwsub_wx_i16m4(...) __riscv_vwsub_wx_i16m4(__VA_ARGS__)
+#define vwsub_vv_i16m8(...) __riscv_vwsub_vv_i16m8(__VA_ARGS__)
+#define vwsub_vx_i16m8(...) __riscv_vwsub_vx_i16m8(__VA_ARGS__)
+#define vwsub_wv_i16m8(...) __riscv_vwsub_wv_i16m8(__VA_ARGS__)
+#define vwsub_wx_i16m8(...) __riscv_vwsub_wx_i16m8(__VA_ARGS__)
+#define vwsub_vv_i32mf2(...) __riscv_vwsub_vv_i32mf2(__VA_ARGS__)
+#define vwsub_vx_i32mf2(...) __riscv_vwsub_vx_i32mf2(__VA_ARGS__)
+#define vwsub_wv_i32mf2(...) __riscv_vwsub_wv_i32mf2(__VA_ARGS__)
+#define vwsub_wx_i32mf2(...) __riscv_vwsub_wx_i32mf2(__VA_ARGS__)
+#define vwsub_vv_i32m1(...) __riscv_vwsub_vv_i32m1(__VA_ARGS__)
+#define vwsub_vx_i32m1(...) __riscv_vwsub_vx_i32m1(__VA_ARGS__)
+#define vwsub_wv_i32m1(...) __riscv_vwsub_wv_i32m1(__VA_ARGS__)
+#define vwsub_wx_i32m1(...) __riscv_vwsub_wx_i32m1(__VA_ARGS__)
+#define vwsub_vv_i32m2(...) __riscv_vwsub_vv_i32m2(__VA_ARGS__)
+#define vwsub_vx_i32m2(...) __riscv_vwsub_vx_i32m2(__VA_ARGS__)
+#define vwsub_wv_i32m2(...) __riscv_vwsub_wv_i32m2(__VA_ARGS__)
+#define vwsub_wx_i32m2(...) __riscv_vwsub_wx_i32m2(__VA_ARGS__)
+#define vwsub_vv_i32m4(...) __riscv_vwsub_vv_i32m4(__VA_ARGS__)
+#define vwsub_vx_i32m4(...) __riscv_vwsub_vx_i32m4(__VA_ARGS__)
+#define vwsub_wv_i32m4(...) __riscv_vwsub_wv_i32m4(__VA_ARGS__)
+#define vwsub_wx_i32m4(...) __riscv_vwsub_wx_i32m4(__VA_ARGS__)
+#define vwsub_vv_i32m8(...) __riscv_vwsub_vv_i32m8(__VA_ARGS__)
+#define vwsub_vx_i32m8(...) __riscv_vwsub_vx_i32m8(__VA_ARGS__)
+#define vwsub_wv_i32m8(...) __riscv_vwsub_wv_i32m8(__VA_ARGS__)
+#define vwsub_wx_i32m8(...) __riscv_vwsub_wx_i32m8(__VA_ARGS__)
+#define vwsub_vv_i64m1(...) __riscv_vwsub_vv_i64m1(__VA_ARGS__)
+#define vwsub_vx_i64m1(...) __riscv_vwsub_vx_i64m1(__VA_ARGS__)
+#define vwsub_wv_i64m1(...) __riscv_vwsub_wv_i64m1(__VA_ARGS__)
+#define vwsub_wx_i64m1(...) __riscv_vwsub_wx_i64m1(__VA_ARGS__)
+#define vwsub_vv_i64m2(...) __riscv_vwsub_vv_i64m2(__VA_ARGS__)
+#define vwsub_vx_i64m2(...) __riscv_vwsub_vx_i64m2(__VA_ARGS__)
+#define vwsub_wv_i64m2(...) __riscv_vwsub_wv_i64m2(__VA_ARGS__)
+#define vwsub_wx_i64m2(...) __riscv_vwsub_wx_i64m2(__VA_ARGS__)
+#define vwsub_vv_i64m4(...) __riscv_vwsub_vv_i64m4(__VA_ARGS__)
+#define vwsub_vx_i64m4(...) __riscv_vwsub_vx_i64m4(__VA_ARGS__)
+#define vwsub_wv_i64m4(...) __riscv_vwsub_wv_i64m4(__VA_ARGS__)
+#define vwsub_wx_i64m4(...) __riscv_vwsub_wx_i64m4(__VA_ARGS__)
+#define vwsub_vv_i64m8(...) __riscv_vwsub_vv_i64m8(__VA_ARGS__)
+#define vwsub_vx_i64m8(...) __riscv_vwsub_vx_i64m8(__VA_ARGS__)
+#define vwsub_wv_i64m8(...) __riscv_vwsub_wv_i64m8(__VA_ARGS__)
+#define vwsub_wx_i64m8(...) __riscv_vwsub_wx_i64m8(__VA_ARGS__)
+#define vwaddu_vv_u16mf4(...) __riscv_vwaddu_vv_u16mf4(__VA_ARGS__) // vwaddu _vv/_vx/_wv/_wx, u16..u64 suffixes: each legacy unprefixed name forwards all arguments to the __riscv_-prefixed intrinsic
+#define vwaddu_vx_u16mf4(...) __riscv_vwaddu_vx_u16mf4(__VA_ARGS__)
+#define vwaddu_wv_u16mf4(...) __riscv_vwaddu_wv_u16mf4(__VA_ARGS__)
+#define vwaddu_wx_u16mf4(...) __riscv_vwaddu_wx_u16mf4(__VA_ARGS__)
+#define vwaddu_vv_u16mf2(...) __riscv_vwaddu_vv_u16mf2(__VA_ARGS__)
+#define vwaddu_vx_u16mf2(...) __riscv_vwaddu_vx_u16mf2(__VA_ARGS__)
+#define vwaddu_wv_u16mf2(...) __riscv_vwaddu_wv_u16mf2(__VA_ARGS__)
+#define vwaddu_wx_u16mf2(...) __riscv_vwaddu_wx_u16mf2(__VA_ARGS__)
+#define vwaddu_vv_u16m1(...) __riscv_vwaddu_vv_u16m1(__VA_ARGS__)
+#define vwaddu_vx_u16m1(...) __riscv_vwaddu_vx_u16m1(__VA_ARGS__)
+#define vwaddu_wv_u16m1(...) __riscv_vwaddu_wv_u16m1(__VA_ARGS__)
+#define vwaddu_wx_u16m1(...) __riscv_vwaddu_wx_u16m1(__VA_ARGS__)
+#define vwaddu_vv_u16m2(...) __riscv_vwaddu_vv_u16m2(__VA_ARGS__)
+#define vwaddu_vx_u16m2(...) __riscv_vwaddu_vx_u16m2(__VA_ARGS__)
+#define vwaddu_wv_u16m2(...) __riscv_vwaddu_wv_u16m2(__VA_ARGS__)
+#define vwaddu_wx_u16m2(...) __riscv_vwaddu_wx_u16m2(__VA_ARGS__)
+#define vwaddu_vv_u16m4(...) __riscv_vwaddu_vv_u16m4(__VA_ARGS__)
+#define vwaddu_vx_u16m4(...) __riscv_vwaddu_vx_u16m4(__VA_ARGS__)
+#define vwaddu_wv_u16m4(...) __riscv_vwaddu_wv_u16m4(__VA_ARGS__)
+#define vwaddu_wx_u16m4(...) __riscv_vwaddu_wx_u16m4(__VA_ARGS__)
+#define vwaddu_vv_u16m8(...) __riscv_vwaddu_vv_u16m8(__VA_ARGS__)
+#define vwaddu_vx_u16m8(...) __riscv_vwaddu_vx_u16m8(__VA_ARGS__)
+#define vwaddu_wv_u16m8(...) __riscv_vwaddu_wv_u16m8(__VA_ARGS__)
+#define vwaddu_wx_u16m8(...) __riscv_vwaddu_wx_u16m8(__VA_ARGS__)
+#define vwaddu_vv_u32mf2(...) __riscv_vwaddu_vv_u32mf2(__VA_ARGS__)
+#define vwaddu_vx_u32mf2(...) __riscv_vwaddu_vx_u32mf2(__VA_ARGS__)
+#define vwaddu_wv_u32mf2(...) __riscv_vwaddu_wv_u32mf2(__VA_ARGS__)
+#define vwaddu_wx_u32mf2(...) __riscv_vwaddu_wx_u32mf2(__VA_ARGS__)
+#define vwaddu_vv_u32m1(...) __riscv_vwaddu_vv_u32m1(__VA_ARGS__)
+#define vwaddu_vx_u32m1(...) __riscv_vwaddu_vx_u32m1(__VA_ARGS__)
+#define vwaddu_wv_u32m1(...) __riscv_vwaddu_wv_u32m1(__VA_ARGS__)
+#define vwaddu_wx_u32m1(...) __riscv_vwaddu_wx_u32m1(__VA_ARGS__)
+#define vwaddu_vv_u32m2(...) __riscv_vwaddu_vv_u32m2(__VA_ARGS__)
+#define vwaddu_vx_u32m2(...) __riscv_vwaddu_vx_u32m2(__VA_ARGS__)
+#define vwaddu_wv_u32m2(...) __riscv_vwaddu_wv_u32m2(__VA_ARGS__)
+#define vwaddu_wx_u32m2(...) __riscv_vwaddu_wx_u32m2(__VA_ARGS__)
+#define vwaddu_vv_u32m4(...) __riscv_vwaddu_vv_u32m4(__VA_ARGS__)
+#define vwaddu_vx_u32m4(...) __riscv_vwaddu_vx_u32m4(__VA_ARGS__)
+#define vwaddu_wv_u32m4(...) __riscv_vwaddu_wv_u32m4(__VA_ARGS__)
+#define vwaddu_wx_u32m4(...) __riscv_vwaddu_wx_u32m4(__VA_ARGS__)
+#define vwaddu_vv_u32m8(...) __riscv_vwaddu_vv_u32m8(__VA_ARGS__)
+#define vwaddu_vx_u32m8(...) __riscv_vwaddu_vx_u32m8(__VA_ARGS__)
+#define vwaddu_wv_u32m8(...) __riscv_vwaddu_wv_u32m8(__VA_ARGS__)
+#define vwaddu_wx_u32m8(...) __riscv_vwaddu_wx_u32m8(__VA_ARGS__)
+#define vwaddu_vv_u64m1(...) __riscv_vwaddu_vv_u64m1(__VA_ARGS__)
+#define vwaddu_vx_u64m1(...) __riscv_vwaddu_vx_u64m1(__VA_ARGS__)
+#define vwaddu_wv_u64m1(...) __riscv_vwaddu_wv_u64m1(__VA_ARGS__)
+#define vwaddu_wx_u64m1(...) __riscv_vwaddu_wx_u64m1(__VA_ARGS__)
+#define vwaddu_vv_u64m2(...) __riscv_vwaddu_vv_u64m2(__VA_ARGS__)
+#define vwaddu_vx_u64m2(...) __riscv_vwaddu_vx_u64m2(__VA_ARGS__)
+#define vwaddu_wv_u64m2(...) __riscv_vwaddu_wv_u64m2(__VA_ARGS__)
+#define vwaddu_wx_u64m2(...) __riscv_vwaddu_wx_u64m2(__VA_ARGS__)
+#define vwaddu_vv_u64m4(...) __riscv_vwaddu_vv_u64m4(__VA_ARGS__)
+#define vwaddu_vx_u64m4(...) __riscv_vwaddu_vx_u64m4(__VA_ARGS__)
+#define vwaddu_wv_u64m4(...) __riscv_vwaddu_wv_u64m4(__VA_ARGS__)
+#define vwaddu_wx_u64m4(...) __riscv_vwaddu_wx_u64m4(__VA_ARGS__)
+#define vwaddu_vv_u64m8(...) __riscv_vwaddu_vv_u64m8(__VA_ARGS__)
+#define vwaddu_vx_u64m8(...) __riscv_vwaddu_vx_u64m8(__VA_ARGS__)
+#define vwaddu_wv_u64m8(...) __riscv_vwaddu_wv_u64m8(__VA_ARGS__)
+#define vwaddu_wx_u64m8(...) __riscv_vwaddu_wx_u64m8(__VA_ARGS__)
+#define vwsubu_vv_u16mf4(...) __riscv_vwsubu_vv_u16mf4(__VA_ARGS__) // vwsubu _vv/_vx/_wv/_wx, u16..u64 suffixes: each legacy unprefixed name forwards all arguments to the __riscv_-prefixed intrinsic
+#define vwsubu_vx_u16mf4(...) __riscv_vwsubu_vx_u16mf4(__VA_ARGS__)
+#define vwsubu_wv_u16mf4(...) __riscv_vwsubu_wv_u16mf4(__VA_ARGS__)
+#define vwsubu_wx_u16mf4(...) __riscv_vwsubu_wx_u16mf4(__VA_ARGS__)
+#define vwsubu_vv_u16mf2(...) __riscv_vwsubu_vv_u16mf2(__VA_ARGS__)
+#define vwsubu_vx_u16mf2(...) __riscv_vwsubu_vx_u16mf2(__VA_ARGS__)
+#define vwsubu_wv_u16mf2(...) __riscv_vwsubu_wv_u16mf2(__VA_ARGS__)
+#define vwsubu_wx_u16mf2(...) __riscv_vwsubu_wx_u16mf2(__VA_ARGS__)
+#define vwsubu_vv_u16m1(...) __riscv_vwsubu_vv_u16m1(__VA_ARGS__)
+#define vwsubu_vx_u16m1(...) __riscv_vwsubu_vx_u16m1(__VA_ARGS__)
+#define vwsubu_wv_u16m1(...) __riscv_vwsubu_wv_u16m1(__VA_ARGS__)
+#define vwsubu_wx_u16m1(...) __riscv_vwsubu_wx_u16m1(__VA_ARGS__)
+#define vwsubu_vv_u16m2(...) __riscv_vwsubu_vv_u16m2(__VA_ARGS__)
+#define vwsubu_vx_u16m2(...) __riscv_vwsubu_vx_u16m2(__VA_ARGS__)
+#define vwsubu_wv_u16m2(...) __riscv_vwsubu_wv_u16m2(__VA_ARGS__)
+#define vwsubu_wx_u16m2(...) __riscv_vwsubu_wx_u16m2(__VA_ARGS__)
+#define vwsubu_vv_u16m4(...) __riscv_vwsubu_vv_u16m4(__VA_ARGS__)
+#define vwsubu_vx_u16m4(...) __riscv_vwsubu_vx_u16m4(__VA_ARGS__)
+#define vwsubu_wv_u16m4(...) __riscv_vwsubu_wv_u16m4(__VA_ARGS__)
+#define vwsubu_wx_u16m4(...) __riscv_vwsubu_wx_u16m4(__VA_ARGS__)
+#define vwsubu_vv_u16m8(...) __riscv_vwsubu_vv_u16m8(__VA_ARGS__)
+#define vwsubu_vx_u16m8(...) __riscv_vwsubu_vx_u16m8(__VA_ARGS__)
+#define vwsubu_wv_u16m8(...) __riscv_vwsubu_wv_u16m8(__VA_ARGS__)
+#define vwsubu_wx_u16m8(...) __riscv_vwsubu_wx_u16m8(__VA_ARGS__)
+#define vwsubu_vv_u32mf2(...) __riscv_vwsubu_vv_u32mf2(__VA_ARGS__)
+#define vwsubu_vx_u32mf2(...) __riscv_vwsubu_vx_u32mf2(__VA_ARGS__)
+#define vwsubu_wv_u32mf2(...) __riscv_vwsubu_wv_u32mf2(__VA_ARGS__)
+#define vwsubu_wx_u32mf2(...) __riscv_vwsubu_wx_u32mf2(__VA_ARGS__)
+#define vwsubu_vv_u32m1(...) __riscv_vwsubu_vv_u32m1(__VA_ARGS__)
+#define vwsubu_vx_u32m1(...) __riscv_vwsubu_vx_u32m1(__VA_ARGS__)
+#define vwsubu_wv_u32m1(...) __riscv_vwsubu_wv_u32m1(__VA_ARGS__)
+#define vwsubu_wx_u32m1(...) __riscv_vwsubu_wx_u32m1(__VA_ARGS__)
+#define vwsubu_vv_u32m2(...) __riscv_vwsubu_vv_u32m2(__VA_ARGS__)
+#define vwsubu_vx_u32m2(...) __riscv_vwsubu_vx_u32m2(__VA_ARGS__)
+#define vwsubu_wv_u32m2(...) __riscv_vwsubu_wv_u32m2(__VA_ARGS__)
+#define vwsubu_wx_u32m2(...) __riscv_vwsubu_wx_u32m2(__VA_ARGS__)
+#define vwsubu_vv_u32m4(...) __riscv_vwsubu_vv_u32m4(__VA_ARGS__)
+#define vwsubu_vx_u32m4(...) __riscv_vwsubu_vx_u32m4(__VA_ARGS__)
+#define vwsubu_wv_u32m4(...) __riscv_vwsubu_wv_u32m4(__VA_ARGS__)
+#define vwsubu_wx_u32m4(...) __riscv_vwsubu_wx_u32m4(__VA_ARGS__)
+#define vwsubu_vv_u32m8(...) __riscv_vwsubu_vv_u32m8(__VA_ARGS__)
+#define vwsubu_vx_u32m8(...) __riscv_vwsubu_vx_u32m8(__VA_ARGS__)
+#define vwsubu_wv_u32m8(...) __riscv_vwsubu_wv_u32m8(__VA_ARGS__)
+#define vwsubu_wx_u32m8(...) __riscv_vwsubu_wx_u32m8(__VA_ARGS__)
+#define vwsubu_vv_u64m1(...) __riscv_vwsubu_vv_u64m1(__VA_ARGS__)
+#define vwsubu_vx_u64m1(...) __riscv_vwsubu_vx_u64m1(__VA_ARGS__)
+#define vwsubu_wv_u64m1(...) __riscv_vwsubu_wv_u64m1(__VA_ARGS__)
+#define vwsubu_wx_u64m1(...) __riscv_vwsubu_wx_u64m1(__VA_ARGS__)
+#define vwsubu_vv_u64m2(...) __riscv_vwsubu_vv_u64m2(__VA_ARGS__)
+#define vwsubu_vx_u64m2(...) __riscv_vwsubu_vx_u64m2(__VA_ARGS__)
+#define vwsubu_wv_u64m2(...) __riscv_vwsubu_wv_u64m2(__VA_ARGS__)
+#define vwsubu_wx_u64m2(...) __riscv_vwsubu_wx_u64m2(__VA_ARGS__)
+#define vwsubu_vv_u64m4(...) __riscv_vwsubu_vv_u64m4(__VA_ARGS__)
+#define vwsubu_vx_u64m4(...) __riscv_vwsubu_vx_u64m4(__VA_ARGS__)
+#define vwsubu_wv_u64m4(...) __riscv_vwsubu_wv_u64m4(__VA_ARGS__)
+#define vwsubu_wx_u64m4(...) __riscv_vwsubu_wx_u64m4(__VA_ARGS__)
+#define vwsubu_vv_u64m8(...) __riscv_vwsubu_vv_u64m8(__VA_ARGS__)
+#define vwsubu_vx_u64m8(...) __riscv_vwsubu_vx_u64m8(__VA_ARGS__)
+#define vwsubu_wv_u64m8(...) __riscv_vwsubu_wv_u64m8(__VA_ARGS__)
+#define vwsubu_wx_u64m8(...) __riscv_vwsubu_wx_u64m8(__VA_ARGS__)
+// masked functions: each legacy *_m name forwards to the matching __riscv_*_tumu intrinsic
+#define vwadd_vv_i16mf4_m(...) __riscv_vwadd_vv_i16mf4_tumu(__VA_ARGS__) // vwadd _vv/_vx/_wv/_wx, i16..i64 suffixes: each legacy *_m name forwards all arguments to __riscv_*_tumu
+#define vwadd_vx_i16mf4_m(...) __riscv_vwadd_vx_i16mf4_tumu(__VA_ARGS__)
+#define vwadd_wv_i16mf4_m(...) __riscv_vwadd_wv_i16mf4_tumu(__VA_ARGS__)
+#define vwadd_wx_i16mf4_m(...) __riscv_vwadd_wx_i16mf4_tumu(__VA_ARGS__)
+#define vwadd_vv_i16mf2_m(...) __riscv_vwadd_vv_i16mf2_tumu(__VA_ARGS__)
+#define vwadd_vx_i16mf2_m(...) __riscv_vwadd_vx_i16mf2_tumu(__VA_ARGS__)
+#define vwadd_wv_i16mf2_m(...) __riscv_vwadd_wv_i16mf2_tumu(__VA_ARGS__)
+#define vwadd_wx_i16mf2_m(...) __riscv_vwadd_wx_i16mf2_tumu(__VA_ARGS__)
+#define vwadd_vv_i16m1_m(...) __riscv_vwadd_vv_i16m1_tumu(__VA_ARGS__)
+#define vwadd_vx_i16m1_m(...) __riscv_vwadd_vx_i16m1_tumu(__VA_ARGS__)
+#define vwadd_wv_i16m1_m(...) __riscv_vwadd_wv_i16m1_tumu(__VA_ARGS__)
+#define vwadd_wx_i16m1_m(...) __riscv_vwadd_wx_i16m1_tumu(__VA_ARGS__)
+#define vwadd_vv_i16m2_m(...) __riscv_vwadd_vv_i16m2_tumu(__VA_ARGS__)
+#define vwadd_vx_i16m2_m(...) __riscv_vwadd_vx_i16m2_tumu(__VA_ARGS__)
+#define vwadd_wv_i16m2_m(...) __riscv_vwadd_wv_i16m2_tumu(__VA_ARGS__)
+#define vwadd_wx_i16m2_m(...) __riscv_vwadd_wx_i16m2_tumu(__VA_ARGS__)
+#define vwadd_vv_i16m4_m(...) __riscv_vwadd_vv_i16m4_tumu(__VA_ARGS__)
+#define vwadd_vx_i16m4_m(...) __riscv_vwadd_vx_i16m4_tumu(__VA_ARGS__)
+#define vwadd_wv_i16m4_m(...) __riscv_vwadd_wv_i16m4_tumu(__VA_ARGS__)
+#define vwadd_wx_i16m4_m(...) __riscv_vwadd_wx_i16m4_tumu(__VA_ARGS__)
+#define vwadd_vv_i16m8_m(...) __riscv_vwadd_vv_i16m8_tumu(__VA_ARGS__)
+#define vwadd_vx_i16m8_m(...) __riscv_vwadd_vx_i16m8_tumu(__VA_ARGS__)
+#define vwadd_wv_i16m8_m(...) __riscv_vwadd_wv_i16m8_tumu(__VA_ARGS__)
+#define vwadd_wx_i16m8_m(...) __riscv_vwadd_wx_i16m8_tumu(__VA_ARGS__)
+#define vwadd_vv_i32mf2_m(...) __riscv_vwadd_vv_i32mf2_tumu(__VA_ARGS__)
+#define vwadd_vx_i32mf2_m(...) __riscv_vwadd_vx_i32mf2_tumu(__VA_ARGS__)
+#define vwadd_wv_i32mf2_m(...) __riscv_vwadd_wv_i32mf2_tumu(__VA_ARGS__)
+#define vwadd_wx_i32mf2_m(...) __riscv_vwadd_wx_i32mf2_tumu(__VA_ARGS__)
+#define vwadd_vv_i32m1_m(...) __riscv_vwadd_vv_i32m1_tumu(__VA_ARGS__)
+#define vwadd_vx_i32m1_m(...) __riscv_vwadd_vx_i32m1_tumu(__VA_ARGS__)
+#define vwadd_wv_i32m1_m(...) __riscv_vwadd_wv_i32m1_tumu(__VA_ARGS__)
+#define vwadd_wx_i32m1_m(...) __riscv_vwadd_wx_i32m1_tumu(__VA_ARGS__)
+#define vwadd_vv_i32m2_m(...) __riscv_vwadd_vv_i32m2_tumu(__VA_ARGS__)
+#define vwadd_vx_i32m2_m(...) __riscv_vwadd_vx_i32m2_tumu(__VA_ARGS__)
+#define vwadd_wv_i32m2_m(...) __riscv_vwadd_wv_i32m2_tumu(__VA_ARGS__)
+#define vwadd_wx_i32m2_m(...) __riscv_vwadd_wx_i32m2_tumu(__VA_ARGS__)
+#define vwadd_vv_i32m4_m(...) __riscv_vwadd_vv_i32m4_tumu(__VA_ARGS__)
+#define vwadd_vx_i32m4_m(...) __riscv_vwadd_vx_i32m4_tumu(__VA_ARGS__)
+#define vwadd_wv_i32m4_m(...) __riscv_vwadd_wv_i32m4_tumu(__VA_ARGS__)
+#define vwadd_wx_i32m4_m(...) __riscv_vwadd_wx_i32m4_tumu(__VA_ARGS__)
+#define vwadd_vv_i32m8_m(...) __riscv_vwadd_vv_i32m8_tumu(__VA_ARGS__)
+#define vwadd_vx_i32m8_m(...) __riscv_vwadd_vx_i32m8_tumu(__VA_ARGS__)
+#define vwadd_wv_i32m8_m(...) __riscv_vwadd_wv_i32m8_tumu(__VA_ARGS__)
+#define vwadd_wx_i32m8_m(...) __riscv_vwadd_wx_i32m8_tumu(__VA_ARGS__)
+#define vwadd_vv_i64m1_m(...) __riscv_vwadd_vv_i64m1_tumu(__VA_ARGS__)
+#define vwadd_vx_i64m1_m(...) __riscv_vwadd_vx_i64m1_tumu(__VA_ARGS__)
+#define vwadd_wv_i64m1_m(...) __riscv_vwadd_wv_i64m1_tumu(__VA_ARGS__)
+#define vwadd_wx_i64m1_m(...) __riscv_vwadd_wx_i64m1_tumu(__VA_ARGS__)
+#define vwadd_vv_i64m2_m(...) __riscv_vwadd_vv_i64m2_tumu(__VA_ARGS__)
+#define vwadd_vx_i64m2_m(...) __riscv_vwadd_vx_i64m2_tumu(__VA_ARGS__)
+#define vwadd_wv_i64m2_m(...) __riscv_vwadd_wv_i64m2_tumu(__VA_ARGS__)
+#define vwadd_wx_i64m2_m(...) __riscv_vwadd_wx_i64m2_tumu(__VA_ARGS__)
+#define vwadd_vv_i64m4_m(...) __riscv_vwadd_vv_i64m4_tumu(__VA_ARGS__)
+#define vwadd_vx_i64m4_m(...) __riscv_vwadd_vx_i64m4_tumu(__VA_ARGS__)
+#define vwadd_wv_i64m4_m(...) __riscv_vwadd_wv_i64m4_tumu(__VA_ARGS__)
+#define vwadd_wx_i64m4_m(...) __riscv_vwadd_wx_i64m4_tumu(__VA_ARGS__)
+#define vwadd_vv_i64m8_m(...) __riscv_vwadd_vv_i64m8_tumu(__VA_ARGS__)
+#define vwadd_vx_i64m8_m(...) __riscv_vwadd_vx_i64m8_tumu(__VA_ARGS__)
+#define vwadd_wv_i64m8_m(...) __riscv_vwadd_wv_i64m8_tumu(__VA_ARGS__)
+#define vwadd_wx_i64m8_m(...) __riscv_vwadd_wx_i64m8_tumu(__VA_ARGS__)
+#define vwsub_vv_i16mf4_m(...) __riscv_vwsub_vv_i16mf4_tumu(__VA_ARGS__) // vwsub _vv/_vx/_wv/_wx, i16..i64 suffixes: each legacy *_m name forwards all arguments to __riscv_*_tumu
+#define vwsub_vx_i16mf4_m(...) __riscv_vwsub_vx_i16mf4_tumu(__VA_ARGS__)
+#define vwsub_wv_i16mf4_m(...) __riscv_vwsub_wv_i16mf4_tumu(__VA_ARGS__)
+#define vwsub_wx_i16mf4_m(...) __riscv_vwsub_wx_i16mf4_tumu(__VA_ARGS__)
+#define vwsub_vv_i16mf2_m(...) __riscv_vwsub_vv_i16mf2_tumu(__VA_ARGS__)
+#define vwsub_vx_i16mf2_m(...) __riscv_vwsub_vx_i16mf2_tumu(__VA_ARGS__)
+#define vwsub_wv_i16mf2_m(...) __riscv_vwsub_wv_i16mf2_tumu(__VA_ARGS__)
+#define vwsub_wx_i16mf2_m(...) __riscv_vwsub_wx_i16mf2_tumu(__VA_ARGS__)
+#define vwsub_vv_i16m1_m(...) __riscv_vwsub_vv_i16m1_tumu(__VA_ARGS__)
+#define vwsub_vx_i16m1_m(...) __riscv_vwsub_vx_i16m1_tumu(__VA_ARGS__)
+#define vwsub_wv_i16m1_m(...) __riscv_vwsub_wv_i16m1_tumu(__VA_ARGS__)
+#define vwsub_wx_i16m1_m(...) __riscv_vwsub_wx_i16m1_tumu(__VA_ARGS__)
+#define vwsub_vv_i16m2_m(...) __riscv_vwsub_vv_i16m2_tumu(__VA_ARGS__)
+#define vwsub_vx_i16m2_m(...) __riscv_vwsub_vx_i16m2_tumu(__VA_ARGS__)
+#define vwsub_wv_i16m2_m(...) __riscv_vwsub_wv_i16m2_tumu(__VA_ARGS__)
+#define vwsub_wx_i16m2_m(...) __riscv_vwsub_wx_i16m2_tumu(__VA_ARGS__)
+#define vwsub_vv_i16m4_m(...) __riscv_vwsub_vv_i16m4_tumu(__VA_ARGS__)
+#define vwsub_vx_i16m4_m(...) __riscv_vwsub_vx_i16m4_tumu(__VA_ARGS__)
+#define vwsub_wv_i16m4_m(...) __riscv_vwsub_wv_i16m4_tumu(__VA_ARGS__)
+#define vwsub_wx_i16m4_m(...) __riscv_vwsub_wx_i16m4_tumu(__VA_ARGS__)
+#define vwsub_vv_i16m8_m(...) __riscv_vwsub_vv_i16m8_tumu(__VA_ARGS__)
+#define vwsub_vx_i16m8_m(...) __riscv_vwsub_vx_i16m8_tumu(__VA_ARGS__)
+#define vwsub_wv_i16m8_m(...) __riscv_vwsub_wv_i16m8_tumu(__VA_ARGS__)
+#define vwsub_wx_i16m8_m(...) __riscv_vwsub_wx_i16m8_tumu(__VA_ARGS__)
+#define vwsub_vv_i32mf2_m(...) __riscv_vwsub_vv_i32mf2_tumu(__VA_ARGS__)
+#define vwsub_vx_i32mf2_m(...) __riscv_vwsub_vx_i32mf2_tumu(__VA_ARGS__)
+#define vwsub_wv_i32mf2_m(...) __riscv_vwsub_wv_i32mf2_tumu(__VA_ARGS__)
+#define vwsub_wx_i32mf2_m(...) __riscv_vwsub_wx_i32mf2_tumu(__VA_ARGS__)
+#define vwsub_vv_i32m1_m(...) __riscv_vwsub_vv_i32m1_tumu(__VA_ARGS__)
+#define vwsub_vx_i32m1_m(...) __riscv_vwsub_vx_i32m1_tumu(__VA_ARGS__)
+#define vwsub_wv_i32m1_m(...) __riscv_vwsub_wv_i32m1_tumu(__VA_ARGS__)
+#define vwsub_wx_i32m1_m(...) __riscv_vwsub_wx_i32m1_tumu(__VA_ARGS__)
+#define vwsub_vv_i32m2_m(...) __riscv_vwsub_vv_i32m2_tumu(__VA_ARGS__)
+#define vwsub_vx_i32m2_m(...) __riscv_vwsub_vx_i32m2_tumu(__VA_ARGS__)
+#define vwsub_wv_i32m2_m(...) __riscv_vwsub_wv_i32m2_tumu(__VA_ARGS__)
+#define vwsub_wx_i32m2_m(...) __riscv_vwsub_wx_i32m2_tumu(__VA_ARGS__)
+#define vwsub_vv_i32m4_m(...) __riscv_vwsub_vv_i32m4_tumu(__VA_ARGS__)
+#define vwsub_vx_i32m4_m(...) __riscv_vwsub_vx_i32m4_tumu(__VA_ARGS__)
+#define vwsub_wv_i32m4_m(...) __riscv_vwsub_wv_i32m4_tumu(__VA_ARGS__)
+#define vwsub_wx_i32m4_m(...) __riscv_vwsub_wx_i32m4_tumu(__VA_ARGS__)
+#define vwsub_vv_i32m8_m(...) __riscv_vwsub_vv_i32m8_tumu(__VA_ARGS__)
+#define vwsub_vx_i32m8_m(...) __riscv_vwsub_vx_i32m8_tumu(__VA_ARGS__)
+#define vwsub_wv_i32m8_m(...) __riscv_vwsub_wv_i32m8_tumu(__VA_ARGS__)
+#define vwsub_wx_i32m8_m(...) __riscv_vwsub_wx_i32m8_tumu(__VA_ARGS__)
+#define vwsub_vv_i64m1_m(...) __riscv_vwsub_vv_i64m1_tumu(__VA_ARGS__)
+#define vwsub_vx_i64m1_m(...) __riscv_vwsub_vx_i64m1_tumu(__VA_ARGS__)
+#define vwsub_wv_i64m1_m(...) __riscv_vwsub_wv_i64m1_tumu(__VA_ARGS__)
+#define vwsub_wx_i64m1_m(...) __riscv_vwsub_wx_i64m1_tumu(__VA_ARGS__)
+#define vwsub_vv_i64m2_m(...) __riscv_vwsub_vv_i64m2_tumu(__VA_ARGS__)
+#define vwsub_vx_i64m2_m(...) __riscv_vwsub_vx_i64m2_tumu(__VA_ARGS__)
+#define vwsub_wv_i64m2_m(...) __riscv_vwsub_wv_i64m2_tumu(__VA_ARGS__)
+#define vwsub_wx_i64m2_m(...) __riscv_vwsub_wx_i64m2_tumu(__VA_ARGS__)
+#define vwsub_vv_i64m4_m(...) __riscv_vwsub_vv_i64m4_tumu(__VA_ARGS__)
+#define vwsub_vx_i64m4_m(...) __riscv_vwsub_vx_i64m4_tumu(__VA_ARGS__)
+#define vwsub_wv_i64m4_m(...) __riscv_vwsub_wv_i64m4_tumu(__VA_ARGS__)
+#define vwsub_wx_i64m4_m(...) __riscv_vwsub_wx_i64m4_tumu(__VA_ARGS__)
+#define vwsub_vv_i64m8_m(...) __riscv_vwsub_vv_i64m8_tumu(__VA_ARGS__)
+#define vwsub_vx_i64m8_m(...) __riscv_vwsub_vx_i64m8_tumu(__VA_ARGS__)
+#define vwsub_wv_i64m8_m(...) __riscv_vwsub_wv_i64m8_tumu(__VA_ARGS__)
+#define vwsub_wx_i64m8_m(...) __riscv_vwsub_wx_i64m8_tumu(__VA_ARGS__)
+#define vwaddu_vv_u16mf4_m(...) __riscv_vwaddu_vv_u16mf4_tumu(__VA_ARGS__)  // vwaddu _m aliases -> __riscv_vwaddu_*_tumu: vv/vx/wv/wx forms for u16mf4..u64m8, args forwarded unchanged
+#define vwaddu_vx_u16mf4_m(...) __riscv_vwaddu_vx_u16mf4_tumu(__VA_ARGS__)
+#define vwaddu_wv_u16mf4_m(...) __riscv_vwaddu_wv_u16mf4_tumu(__VA_ARGS__)
+#define vwaddu_wx_u16mf4_m(...) __riscv_vwaddu_wx_u16mf4_tumu(__VA_ARGS__)
+#define vwaddu_vv_u16mf2_m(...) __riscv_vwaddu_vv_u16mf2_tumu(__VA_ARGS__)
+#define vwaddu_vx_u16mf2_m(...) __riscv_vwaddu_vx_u16mf2_tumu(__VA_ARGS__)
+#define vwaddu_wv_u16mf2_m(...) __riscv_vwaddu_wv_u16mf2_tumu(__VA_ARGS__)
+#define vwaddu_wx_u16mf2_m(...) __riscv_vwaddu_wx_u16mf2_tumu(__VA_ARGS__)
+#define vwaddu_vv_u16m1_m(...) __riscv_vwaddu_vv_u16m1_tumu(__VA_ARGS__)
+#define vwaddu_vx_u16m1_m(...) __riscv_vwaddu_vx_u16m1_tumu(__VA_ARGS__)
+#define vwaddu_wv_u16m1_m(...) __riscv_vwaddu_wv_u16m1_tumu(__VA_ARGS__)
+#define vwaddu_wx_u16m1_m(...) __riscv_vwaddu_wx_u16m1_tumu(__VA_ARGS__)
+#define vwaddu_vv_u16m2_m(...) __riscv_vwaddu_vv_u16m2_tumu(__VA_ARGS__)
+#define vwaddu_vx_u16m2_m(...) __riscv_vwaddu_vx_u16m2_tumu(__VA_ARGS__)
+#define vwaddu_wv_u16m2_m(...) __riscv_vwaddu_wv_u16m2_tumu(__VA_ARGS__)
+#define vwaddu_wx_u16m2_m(...) __riscv_vwaddu_wx_u16m2_tumu(__VA_ARGS__)
+#define vwaddu_vv_u16m4_m(...) __riscv_vwaddu_vv_u16m4_tumu(__VA_ARGS__)
+#define vwaddu_vx_u16m4_m(...) __riscv_vwaddu_vx_u16m4_tumu(__VA_ARGS__)
+#define vwaddu_wv_u16m4_m(...) __riscv_vwaddu_wv_u16m4_tumu(__VA_ARGS__)
+#define vwaddu_wx_u16m4_m(...) __riscv_vwaddu_wx_u16m4_tumu(__VA_ARGS__)
+#define vwaddu_vv_u16m8_m(...) __riscv_vwaddu_vv_u16m8_tumu(__VA_ARGS__)
+#define vwaddu_vx_u16m8_m(...) __riscv_vwaddu_vx_u16m8_tumu(__VA_ARGS__)
+#define vwaddu_wv_u16m8_m(...) __riscv_vwaddu_wv_u16m8_tumu(__VA_ARGS__)
+#define vwaddu_wx_u16m8_m(...) __riscv_vwaddu_wx_u16m8_tumu(__VA_ARGS__)
+#define vwaddu_vv_u32mf2_m(...) __riscv_vwaddu_vv_u32mf2_tumu(__VA_ARGS__)
+#define vwaddu_vx_u32mf2_m(...) __riscv_vwaddu_vx_u32mf2_tumu(__VA_ARGS__)
+#define vwaddu_wv_u32mf2_m(...) __riscv_vwaddu_wv_u32mf2_tumu(__VA_ARGS__)
+#define vwaddu_wx_u32mf2_m(...) __riscv_vwaddu_wx_u32mf2_tumu(__VA_ARGS__)
+#define vwaddu_vv_u32m1_m(...) __riscv_vwaddu_vv_u32m1_tumu(__VA_ARGS__)
+#define vwaddu_vx_u32m1_m(...) __riscv_vwaddu_vx_u32m1_tumu(__VA_ARGS__)
+#define vwaddu_wv_u32m1_m(...) __riscv_vwaddu_wv_u32m1_tumu(__VA_ARGS__)
+#define vwaddu_wx_u32m1_m(...) __riscv_vwaddu_wx_u32m1_tumu(__VA_ARGS__)
+#define vwaddu_vv_u32m2_m(...) __riscv_vwaddu_vv_u32m2_tumu(__VA_ARGS__)
+#define vwaddu_vx_u32m2_m(...) __riscv_vwaddu_vx_u32m2_tumu(__VA_ARGS__)
+#define vwaddu_wv_u32m2_m(...) __riscv_vwaddu_wv_u32m2_tumu(__VA_ARGS__)
+#define vwaddu_wx_u32m2_m(...) __riscv_vwaddu_wx_u32m2_tumu(__VA_ARGS__)
+#define vwaddu_vv_u32m4_m(...) __riscv_vwaddu_vv_u32m4_tumu(__VA_ARGS__)
+#define vwaddu_vx_u32m4_m(...) __riscv_vwaddu_vx_u32m4_tumu(__VA_ARGS__)
+#define vwaddu_wv_u32m4_m(...) __riscv_vwaddu_wv_u32m4_tumu(__VA_ARGS__)
+#define vwaddu_wx_u32m4_m(...) __riscv_vwaddu_wx_u32m4_tumu(__VA_ARGS__)
+#define vwaddu_vv_u32m8_m(...) __riscv_vwaddu_vv_u32m8_tumu(__VA_ARGS__)
+#define vwaddu_vx_u32m8_m(...) __riscv_vwaddu_vx_u32m8_tumu(__VA_ARGS__)
+#define vwaddu_wv_u32m8_m(...) __riscv_vwaddu_wv_u32m8_tumu(__VA_ARGS__)
+#define vwaddu_wx_u32m8_m(...) __riscv_vwaddu_wx_u32m8_tumu(__VA_ARGS__)
+#define vwaddu_vv_u64m1_m(...) __riscv_vwaddu_vv_u64m1_tumu(__VA_ARGS__)
+#define vwaddu_vx_u64m1_m(...) __riscv_vwaddu_vx_u64m1_tumu(__VA_ARGS__)
+#define vwaddu_wv_u64m1_m(...) __riscv_vwaddu_wv_u64m1_tumu(__VA_ARGS__)
+#define vwaddu_wx_u64m1_m(...) __riscv_vwaddu_wx_u64m1_tumu(__VA_ARGS__)
+#define vwaddu_vv_u64m2_m(...) __riscv_vwaddu_vv_u64m2_tumu(__VA_ARGS__)
+#define vwaddu_vx_u64m2_m(...) __riscv_vwaddu_vx_u64m2_tumu(__VA_ARGS__)
+#define vwaddu_wv_u64m2_m(...) __riscv_vwaddu_wv_u64m2_tumu(__VA_ARGS__)
+#define vwaddu_wx_u64m2_m(...) __riscv_vwaddu_wx_u64m2_tumu(__VA_ARGS__)
+#define vwaddu_vv_u64m4_m(...) __riscv_vwaddu_vv_u64m4_tumu(__VA_ARGS__)
+#define vwaddu_vx_u64m4_m(...) __riscv_vwaddu_vx_u64m4_tumu(__VA_ARGS__)
+#define vwaddu_wv_u64m4_m(...) __riscv_vwaddu_wv_u64m4_tumu(__VA_ARGS__)
+#define vwaddu_wx_u64m4_m(...) __riscv_vwaddu_wx_u64m4_tumu(__VA_ARGS__)
+#define vwaddu_vv_u64m8_m(...) __riscv_vwaddu_vv_u64m8_tumu(__VA_ARGS__)
+#define vwaddu_vx_u64m8_m(...) __riscv_vwaddu_vx_u64m8_tumu(__VA_ARGS__)
+#define vwaddu_wv_u64m8_m(...) __riscv_vwaddu_wv_u64m8_tumu(__VA_ARGS__)
+#define vwaddu_wx_u64m8_m(...) __riscv_vwaddu_wx_u64m8_tumu(__VA_ARGS__)
+#define vwsubu_vv_u16mf4_m(...) __riscv_vwsubu_vv_u16mf4_tumu(__VA_ARGS__)  // vwsubu _m aliases -> __riscv_vwsubu_*_tumu: vv/vx/wv/wx forms for u16mf4..u64m8, args forwarded unchanged
+#define vwsubu_vx_u16mf4_m(...) __riscv_vwsubu_vx_u16mf4_tumu(__VA_ARGS__)
+#define vwsubu_wv_u16mf4_m(...) __riscv_vwsubu_wv_u16mf4_tumu(__VA_ARGS__)
+#define vwsubu_wx_u16mf4_m(...) __riscv_vwsubu_wx_u16mf4_tumu(__VA_ARGS__)
+#define vwsubu_vv_u16mf2_m(...) __riscv_vwsubu_vv_u16mf2_tumu(__VA_ARGS__)
+#define vwsubu_vx_u16mf2_m(...) __riscv_vwsubu_vx_u16mf2_tumu(__VA_ARGS__)
+#define vwsubu_wv_u16mf2_m(...) __riscv_vwsubu_wv_u16mf2_tumu(__VA_ARGS__)
+#define vwsubu_wx_u16mf2_m(...) __riscv_vwsubu_wx_u16mf2_tumu(__VA_ARGS__)
+#define vwsubu_vv_u16m1_m(...) __riscv_vwsubu_vv_u16m1_tumu(__VA_ARGS__)
+#define vwsubu_vx_u16m1_m(...) __riscv_vwsubu_vx_u16m1_tumu(__VA_ARGS__)
+#define vwsubu_wv_u16m1_m(...) __riscv_vwsubu_wv_u16m1_tumu(__VA_ARGS__)
+#define vwsubu_wx_u16m1_m(...) __riscv_vwsubu_wx_u16m1_tumu(__VA_ARGS__)
+#define vwsubu_vv_u16m2_m(...) __riscv_vwsubu_vv_u16m2_tumu(__VA_ARGS__)
+#define vwsubu_vx_u16m2_m(...) __riscv_vwsubu_vx_u16m2_tumu(__VA_ARGS__)
+#define vwsubu_wv_u16m2_m(...) __riscv_vwsubu_wv_u16m2_tumu(__VA_ARGS__)
+#define vwsubu_wx_u16m2_m(...) __riscv_vwsubu_wx_u16m2_tumu(__VA_ARGS__)
+#define vwsubu_vv_u16m4_m(...) __riscv_vwsubu_vv_u16m4_tumu(__VA_ARGS__)
+#define vwsubu_vx_u16m4_m(...) __riscv_vwsubu_vx_u16m4_tumu(__VA_ARGS__)
+#define vwsubu_wv_u16m4_m(...) __riscv_vwsubu_wv_u16m4_tumu(__VA_ARGS__)
+#define vwsubu_wx_u16m4_m(...) __riscv_vwsubu_wx_u16m4_tumu(__VA_ARGS__)
+#define vwsubu_vv_u16m8_m(...) __riscv_vwsubu_vv_u16m8_tumu(__VA_ARGS__)
+#define vwsubu_vx_u16m8_m(...) __riscv_vwsubu_vx_u16m8_tumu(__VA_ARGS__)
+#define vwsubu_wv_u16m8_m(...) __riscv_vwsubu_wv_u16m8_tumu(__VA_ARGS__)
+#define vwsubu_wx_u16m8_m(...) __riscv_vwsubu_wx_u16m8_tumu(__VA_ARGS__)
+#define vwsubu_vv_u32mf2_m(...) __riscv_vwsubu_vv_u32mf2_tumu(__VA_ARGS__)
+#define vwsubu_vx_u32mf2_m(...) __riscv_vwsubu_vx_u32mf2_tumu(__VA_ARGS__)
+#define vwsubu_wv_u32mf2_m(...) __riscv_vwsubu_wv_u32mf2_tumu(__VA_ARGS__)
+#define vwsubu_wx_u32mf2_m(...) __riscv_vwsubu_wx_u32mf2_tumu(__VA_ARGS__)
+#define vwsubu_vv_u32m1_m(...) __riscv_vwsubu_vv_u32m1_tumu(__VA_ARGS__)
+#define vwsubu_vx_u32m1_m(...) __riscv_vwsubu_vx_u32m1_tumu(__VA_ARGS__)
+#define vwsubu_wv_u32m1_m(...) __riscv_vwsubu_wv_u32m1_tumu(__VA_ARGS__)
+#define vwsubu_wx_u32m1_m(...) __riscv_vwsubu_wx_u32m1_tumu(__VA_ARGS__)
+#define vwsubu_vv_u32m2_m(...) __riscv_vwsubu_vv_u32m2_tumu(__VA_ARGS__)
+#define vwsubu_vx_u32m2_m(...) __riscv_vwsubu_vx_u32m2_tumu(__VA_ARGS__)
+#define vwsubu_wv_u32m2_m(...) __riscv_vwsubu_wv_u32m2_tumu(__VA_ARGS__)
+#define vwsubu_wx_u32m2_m(...) __riscv_vwsubu_wx_u32m2_tumu(__VA_ARGS__)
+#define vwsubu_vv_u32m4_m(...) __riscv_vwsubu_vv_u32m4_tumu(__VA_ARGS__)
+#define vwsubu_vx_u32m4_m(...) __riscv_vwsubu_vx_u32m4_tumu(__VA_ARGS__)
+#define vwsubu_wv_u32m4_m(...) __riscv_vwsubu_wv_u32m4_tumu(__VA_ARGS__)
+#define vwsubu_wx_u32m4_m(...) __riscv_vwsubu_wx_u32m4_tumu(__VA_ARGS__)
+#define vwsubu_vv_u32m8_m(...) __riscv_vwsubu_vv_u32m8_tumu(__VA_ARGS__)
+#define vwsubu_vx_u32m8_m(...) __riscv_vwsubu_vx_u32m8_tumu(__VA_ARGS__)
+#define vwsubu_wv_u32m8_m(...) __riscv_vwsubu_wv_u32m8_tumu(__VA_ARGS__)
+#define vwsubu_wx_u32m8_m(...) __riscv_vwsubu_wx_u32m8_tumu(__VA_ARGS__)
+#define vwsubu_vv_u64m1_m(...) __riscv_vwsubu_vv_u64m1_tumu(__VA_ARGS__)
+#define vwsubu_vx_u64m1_m(...) __riscv_vwsubu_vx_u64m1_tumu(__VA_ARGS__)
+#define vwsubu_wv_u64m1_m(...) __riscv_vwsubu_wv_u64m1_tumu(__VA_ARGS__)
+#define vwsubu_wx_u64m1_m(...) __riscv_vwsubu_wx_u64m1_tumu(__VA_ARGS__)
+#define vwsubu_vv_u64m2_m(...) __riscv_vwsubu_vv_u64m2_tumu(__VA_ARGS__)
+#define vwsubu_vx_u64m2_m(...) __riscv_vwsubu_vx_u64m2_tumu(__VA_ARGS__)
+#define vwsubu_wv_u64m2_m(...) __riscv_vwsubu_wv_u64m2_tumu(__VA_ARGS__)
+#define vwsubu_wx_u64m2_m(...) __riscv_vwsubu_wx_u64m2_tumu(__VA_ARGS__)
+#define vwsubu_vv_u64m4_m(...) __riscv_vwsubu_vv_u64m4_tumu(__VA_ARGS__)
+#define vwsubu_vx_u64m4_m(...) __riscv_vwsubu_vx_u64m4_tumu(__VA_ARGS__)
+#define vwsubu_wv_u64m4_m(...) __riscv_vwsubu_wv_u64m4_tumu(__VA_ARGS__)
+#define vwsubu_wx_u64m4_m(...) __riscv_vwsubu_wx_u64m4_tumu(__VA_ARGS__)
+#define vwsubu_vv_u64m8_m(...) __riscv_vwsubu_vv_u64m8_tumu(__VA_ARGS__)
+#define vwsubu_vx_u64m8_m(...) __riscv_vwsubu_vx_u64m8_tumu(__VA_ARGS__)
+#define vwsubu_wv_u64m8_m(...) __riscv_vwsubu_wv_u64m8_tumu(__VA_ARGS__)
+#define vwsubu_wx_u64m8_m(...) __riscv_vwsubu_wx_u64m8_tumu(__VA_ARGS__)
+#define vsext_vf2_i16mf4(...) __riscv_vsext_vf2_i16mf4(__VA_ARGS__)  // unmasked vsext aliases (vf2/vf4/vf8 forms): legacy name -> same name with __riscv_ prefix, args forwarded unchanged
+#define vsext_vf2_i16mf2(...) __riscv_vsext_vf2_i16mf2(__VA_ARGS__)
+#define vsext_vf2_i16m1(...) __riscv_vsext_vf2_i16m1(__VA_ARGS__)
+#define vsext_vf2_i16m2(...) __riscv_vsext_vf2_i16m2(__VA_ARGS__)
+#define vsext_vf2_i16m4(...) __riscv_vsext_vf2_i16m4(__VA_ARGS__)
+#define vsext_vf2_i16m8(...) __riscv_vsext_vf2_i16m8(__VA_ARGS__)
+#define vsext_vf4_i32mf2(...) __riscv_vsext_vf4_i32mf2(__VA_ARGS__)
+#define vsext_vf4_i32m1(...) __riscv_vsext_vf4_i32m1(__VA_ARGS__)
+#define vsext_vf4_i32m2(...) __riscv_vsext_vf4_i32m2(__VA_ARGS__)
+#define vsext_vf4_i32m4(...) __riscv_vsext_vf4_i32m4(__VA_ARGS__)
+#define vsext_vf4_i32m8(...) __riscv_vsext_vf4_i32m8(__VA_ARGS__)
+#define vsext_vf8_i64m1(...) __riscv_vsext_vf8_i64m1(__VA_ARGS__)
+#define vsext_vf8_i64m2(...) __riscv_vsext_vf8_i64m2(__VA_ARGS__)
+#define vsext_vf8_i64m4(...) __riscv_vsext_vf8_i64m4(__VA_ARGS__)
+#define vsext_vf8_i64m8(...) __riscv_vsext_vf8_i64m8(__VA_ARGS__)
+#define vsext_vf2_i32mf2(...) __riscv_vsext_vf2_i32mf2(__VA_ARGS__)
+#define vsext_vf2_i32m1(...) __riscv_vsext_vf2_i32m1(__VA_ARGS__)
+#define vsext_vf2_i32m2(...) __riscv_vsext_vf2_i32m2(__VA_ARGS__)
+#define vsext_vf2_i32m4(...) __riscv_vsext_vf2_i32m4(__VA_ARGS__)
+#define vsext_vf2_i32m8(...) __riscv_vsext_vf2_i32m8(__VA_ARGS__)
+#define vsext_vf4_i64m1(...) __riscv_vsext_vf4_i64m1(__VA_ARGS__)
+#define vsext_vf4_i64m2(...) __riscv_vsext_vf4_i64m2(__VA_ARGS__)
+#define vsext_vf4_i64m4(...) __riscv_vsext_vf4_i64m4(__VA_ARGS__)
+#define vsext_vf4_i64m8(...) __riscv_vsext_vf4_i64m8(__VA_ARGS__)
+#define vsext_vf2_i64m1(...) __riscv_vsext_vf2_i64m1(__VA_ARGS__)
+#define vsext_vf2_i64m2(...) __riscv_vsext_vf2_i64m2(__VA_ARGS__)
+#define vsext_vf2_i64m4(...) __riscv_vsext_vf2_i64m4(__VA_ARGS__)
+#define vsext_vf2_i64m8(...) __riscv_vsext_vf2_i64m8(__VA_ARGS__)
+#define vzext_vf2_u16mf4(...) __riscv_vzext_vf2_u16mf4(__VA_ARGS__)  // unmasked vzext aliases (vf2/vf4/vf8 forms): legacy name -> same name with __riscv_ prefix, args forwarded unchanged
+#define vzext_vf2_u16mf2(...) __riscv_vzext_vf2_u16mf2(__VA_ARGS__)
+#define vzext_vf2_u16m1(...) __riscv_vzext_vf2_u16m1(__VA_ARGS__)
+#define vzext_vf2_u16m2(...) __riscv_vzext_vf2_u16m2(__VA_ARGS__)
+#define vzext_vf2_u16m4(...) __riscv_vzext_vf2_u16m4(__VA_ARGS__)
+#define vzext_vf2_u16m8(...) __riscv_vzext_vf2_u16m8(__VA_ARGS__)
+#define vzext_vf4_u32mf2(...) __riscv_vzext_vf4_u32mf2(__VA_ARGS__)
+#define vzext_vf4_u32m1(...) __riscv_vzext_vf4_u32m1(__VA_ARGS__)
+#define vzext_vf4_u32m2(...) __riscv_vzext_vf4_u32m2(__VA_ARGS__)
+#define vzext_vf4_u32m4(...) __riscv_vzext_vf4_u32m4(__VA_ARGS__)
+#define vzext_vf4_u32m8(...) __riscv_vzext_vf4_u32m8(__VA_ARGS__)
+#define vzext_vf8_u64m1(...) __riscv_vzext_vf8_u64m1(__VA_ARGS__)
+#define vzext_vf8_u64m2(...) __riscv_vzext_vf8_u64m2(__VA_ARGS__)
+#define vzext_vf8_u64m4(...) __riscv_vzext_vf8_u64m4(__VA_ARGS__)
+#define vzext_vf8_u64m8(...) __riscv_vzext_vf8_u64m8(__VA_ARGS__)
+#define vzext_vf2_u32mf2(...) __riscv_vzext_vf2_u32mf2(__VA_ARGS__)
+#define vzext_vf2_u32m1(...) __riscv_vzext_vf2_u32m1(__VA_ARGS__)
+#define vzext_vf2_u32m2(...) __riscv_vzext_vf2_u32m2(__VA_ARGS__)
+#define vzext_vf2_u32m4(...) __riscv_vzext_vf2_u32m4(__VA_ARGS__)
+#define vzext_vf2_u32m8(...) __riscv_vzext_vf2_u32m8(__VA_ARGS__)
+#define vzext_vf4_u64m1(...) __riscv_vzext_vf4_u64m1(__VA_ARGS__)
+#define vzext_vf4_u64m2(...) __riscv_vzext_vf4_u64m2(__VA_ARGS__)
+#define vzext_vf4_u64m4(...) __riscv_vzext_vf4_u64m4(__VA_ARGS__)
+#define vzext_vf4_u64m8(...) __riscv_vzext_vf4_u64m8(__VA_ARGS__)
+#define vzext_vf2_u64m1(...) __riscv_vzext_vf2_u64m1(__VA_ARGS__)
+#define vzext_vf2_u64m2(...) __riscv_vzext_vf2_u64m2(__VA_ARGS__)
+#define vzext_vf2_u64m4(...) __riscv_vzext_vf2_u64m4(__VA_ARGS__)
+#define vzext_vf2_u64m8(...) __riscv_vzext_vf2_u64m8(__VA_ARGS__)
+// masked vsext/vzext functions: each legacy *_m name forwards to the matching __riscv_*_tumu intrinsic
+#define vsext_vf2_i16mf4_m(...) __riscv_vsext_vf2_i16mf4_tumu(__VA_ARGS__)  // vsext _m aliases (vf2/vf4/vf8 forms) -> __riscv_vsext_*_tumu, args forwarded unchanged
+#define vsext_vf2_i16mf2_m(...) __riscv_vsext_vf2_i16mf2_tumu(__VA_ARGS__)
+#define vsext_vf2_i16m1_m(...) __riscv_vsext_vf2_i16m1_tumu(__VA_ARGS__)
+#define vsext_vf2_i16m2_m(...) __riscv_vsext_vf2_i16m2_tumu(__VA_ARGS__)
+#define vsext_vf2_i16m4_m(...) __riscv_vsext_vf2_i16m4_tumu(__VA_ARGS__)
+#define vsext_vf2_i16m8_m(...) __riscv_vsext_vf2_i16m8_tumu(__VA_ARGS__)
+#define vsext_vf4_i32mf2_m(...) __riscv_vsext_vf4_i32mf2_tumu(__VA_ARGS__)
+#define vsext_vf4_i32m1_m(...) __riscv_vsext_vf4_i32m1_tumu(__VA_ARGS__)
+#define vsext_vf4_i32m2_m(...) __riscv_vsext_vf4_i32m2_tumu(__VA_ARGS__)
+#define vsext_vf4_i32m4_m(...) __riscv_vsext_vf4_i32m4_tumu(__VA_ARGS__)
+#define vsext_vf4_i32m8_m(...) __riscv_vsext_vf4_i32m8_tumu(__VA_ARGS__)
+#define vsext_vf8_i64m1_m(...) __riscv_vsext_vf8_i64m1_tumu(__VA_ARGS__)
+#define vsext_vf8_i64m2_m(...) __riscv_vsext_vf8_i64m2_tumu(__VA_ARGS__)
+#define vsext_vf8_i64m4_m(...) __riscv_vsext_vf8_i64m4_tumu(__VA_ARGS__)
+#define vsext_vf8_i64m8_m(...) __riscv_vsext_vf8_i64m8_tumu(__VA_ARGS__)
+#define vsext_vf2_i32mf2_m(...) __riscv_vsext_vf2_i32mf2_tumu(__VA_ARGS__)
+#define vsext_vf2_i32m1_m(...) __riscv_vsext_vf2_i32m1_tumu(__VA_ARGS__)
+#define vsext_vf2_i32m2_m(...) __riscv_vsext_vf2_i32m2_tumu(__VA_ARGS__)
+#define vsext_vf2_i32m4_m(...) __riscv_vsext_vf2_i32m4_tumu(__VA_ARGS__)
+#define vsext_vf2_i32m8_m(...) __riscv_vsext_vf2_i32m8_tumu(__VA_ARGS__)
+#define vsext_vf4_i64m1_m(...) __riscv_vsext_vf4_i64m1_tumu(__VA_ARGS__)
+#define vsext_vf4_i64m2_m(...) __riscv_vsext_vf4_i64m2_tumu(__VA_ARGS__)
+#define vsext_vf4_i64m4_m(...) __riscv_vsext_vf4_i64m4_tumu(__VA_ARGS__)
+#define vsext_vf4_i64m8_m(...) __riscv_vsext_vf4_i64m8_tumu(__VA_ARGS__)
+#define vsext_vf2_i64m1_m(...) __riscv_vsext_vf2_i64m1_tumu(__VA_ARGS__)
+#define vsext_vf2_i64m2_m(...) __riscv_vsext_vf2_i64m2_tumu(__VA_ARGS__)
+#define vsext_vf2_i64m4_m(...) __riscv_vsext_vf2_i64m4_tumu(__VA_ARGS__)
+#define vsext_vf2_i64m8_m(...) __riscv_vsext_vf2_i64m8_tumu(__VA_ARGS__)
+#define vzext_vf2_u16mf4_m(...) __riscv_vzext_vf2_u16mf4_tumu(__VA_ARGS__)  // vzext _m aliases (vf2/vf4/vf8 forms) -> __riscv_vzext_*_tumu, args forwarded unchanged
+#define vzext_vf2_u16mf2_m(...) __riscv_vzext_vf2_u16mf2_tumu(__VA_ARGS__)
+#define vzext_vf2_u16m1_m(...) __riscv_vzext_vf2_u16m1_tumu(__VA_ARGS__)
+#define vzext_vf2_u16m2_m(...) __riscv_vzext_vf2_u16m2_tumu(__VA_ARGS__)
+#define vzext_vf2_u16m4_m(...) __riscv_vzext_vf2_u16m4_tumu(__VA_ARGS__)
+#define vzext_vf2_u16m8_m(...) __riscv_vzext_vf2_u16m8_tumu(__VA_ARGS__)
+#define vzext_vf4_u32mf2_m(...) __riscv_vzext_vf4_u32mf2_tumu(__VA_ARGS__)
+#define vzext_vf4_u32m1_m(...) __riscv_vzext_vf4_u32m1_tumu(__VA_ARGS__)
+#define vzext_vf4_u32m2_m(...) __riscv_vzext_vf4_u32m2_tumu(__VA_ARGS__)
+#define vzext_vf4_u32m4_m(...) __riscv_vzext_vf4_u32m4_tumu(__VA_ARGS__)
+#define vzext_vf4_u32m8_m(...) __riscv_vzext_vf4_u32m8_tumu(__VA_ARGS__)
+#define vzext_vf8_u64m1_m(...) __riscv_vzext_vf8_u64m1_tumu(__VA_ARGS__)
+#define vzext_vf8_u64m2_m(...) __riscv_vzext_vf8_u64m2_tumu(__VA_ARGS__)
+#define vzext_vf8_u64m4_m(...) __riscv_vzext_vf8_u64m4_tumu(__VA_ARGS__)
+#define vzext_vf8_u64m8_m(...) __riscv_vzext_vf8_u64m8_tumu(__VA_ARGS__)
+#define vzext_vf2_u32mf2_m(...) __riscv_vzext_vf2_u32mf2_tumu(__VA_ARGS__)
+#define vzext_vf2_u32m1_m(...) __riscv_vzext_vf2_u32m1_tumu(__VA_ARGS__)
+#define vzext_vf2_u32m2_m(...) __riscv_vzext_vf2_u32m2_tumu(__VA_ARGS__)
+#define vzext_vf2_u32m4_m(...) __riscv_vzext_vf2_u32m4_tumu(__VA_ARGS__)
+#define vzext_vf2_u32m8_m(...) __riscv_vzext_vf2_u32m8_tumu(__VA_ARGS__)
+#define vzext_vf4_u64m1_m(...) __riscv_vzext_vf4_u64m1_tumu(__VA_ARGS__)
+#define vzext_vf4_u64m2_m(...) __riscv_vzext_vf4_u64m2_tumu(__VA_ARGS__)
+#define vzext_vf4_u64m4_m(...) __riscv_vzext_vf4_u64m4_tumu(__VA_ARGS__)
+#define vzext_vf4_u64m8_m(...) __riscv_vzext_vf4_u64m8_tumu(__VA_ARGS__)
+#define vzext_vf2_u64m1_m(...) __riscv_vzext_vf2_u64m1_tumu(__VA_ARGS__)
+#define vzext_vf2_u64m2_m(...) __riscv_vzext_vf2_u64m2_tumu(__VA_ARGS__)
+#define vzext_vf2_u64m4_m(...) __riscv_vzext_vf2_u64m4_tumu(__VA_ARGS__)
+#define vzext_vf2_u64m8_m(...) __riscv_vzext_vf2_u64m8_tumu(__VA_ARGS__)
+#define vadc_vvm_i8mf8(...) __riscv_vadc_vvm_i8mf8(__VA_ARGS__)  // vadc aliases, signed types i8mf8..i64m8 (vvm/vxm forms): legacy name -> __riscv_ prefix, args forwarded unchanged
+#define vadc_vxm_i8mf8(...) __riscv_vadc_vxm_i8mf8(__VA_ARGS__)
+#define vadc_vvm_i8mf4(...) __riscv_vadc_vvm_i8mf4(__VA_ARGS__)
+#define vadc_vxm_i8mf4(...) __riscv_vadc_vxm_i8mf4(__VA_ARGS__)
+#define vadc_vvm_i8mf2(...) __riscv_vadc_vvm_i8mf2(__VA_ARGS__)
+#define vadc_vxm_i8mf2(...) __riscv_vadc_vxm_i8mf2(__VA_ARGS__)
+#define vadc_vvm_i8m1(...) __riscv_vadc_vvm_i8m1(__VA_ARGS__)
+#define vadc_vxm_i8m1(...) __riscv_vadc_vxm_i8m1(__VA_ARGS__)
+#define vadc_vvm_i8m2(...) __riscv_vadc_vvm_i8m2(__VA_ARGS__)
+#define vadc_vxm_i8m2(...) __riscv_vadc_vxm_i8m2(__VA_ARGS__)
+#define vadc_vvm_i8m4(...) __riscv_vadc_vvm_i8m4(__VA_ARGS__)
+#define vadc_vxm_i8m4(...) __riscv_vadc_vxm_i8m4(__VA_ARGS__)
+#define vadc_vvm_i8m8(...) __riscv_vadc_vvm_i8m8(__VA_ARGS__)
+#define vadc_vxm_i8m8(...) __riscv_vadc_vxm_i8m8(__VA_ARGS__)
+#define vadc_vvm_i16mf4(...) __riscv_vadc_vvm_i16mf4(__VA_ARGS__)
+#define vadc_vxm_i16mf4(...) __riscv_vadc_vxm_i16mf4(__VA_ARGS__)
+#define vadc_vvm_i16mf2(...) __riscv_vadc_vvm_i16mf2(__VA_ARGS__)
+#define vadc_vxm_i16mf2(...) __riscv_vadc_vxm_i16mf2(__VA_ARGS__)
+#define vadc_vvm_i16m1(...) __riscv_vadc_vvm_i16m1(__VA_ARGS__)
+#define vadc_vxm_i16m1(...) __riscv_vadc_vxm_i16m1(__VA_ARGS__)
+#define vadc_vvm_i16m2(...) __riscv_vadc_vvm_i16m2(__VA_ARGS__)
+#define vadc_vxm_i16m2(...) __riscv_vadc_vxm_i16m2(__VA_ARGS__)
+#define vadc_vvm_i16m4(...) __riscv_vadc_vvm_i16m4(__VA_ARGS__)
+#define vadc_vxm_i16m4(...) __riscv_vadc_vxm_i16m4(__VA_ARGS__)
+#define vadc_vvm_i16m8(...) __riscv_vadc_vvm_i16m8(__VA_ARGS__)
+#define vadc_vxm_i16m8(...) __riscv_vadc_vxm_i16m8(__VA_ARGS__)
+#define vadc_vvm_i32mf2(...) __riscv_vadc_vvm_i32mf2(__VA_ARGS__)
+#define vadc_vxm_i32mf2(...) __riscv_vadc_vxm_i32mf2(__VA_ARGS__)
+#define vadc_vvm_i32m1(...) __riscv_vadc_vvm_i32m1(__VA_ARGS__)
+#define vadc_vxm_i32m1(...) __riscv_vadc_vxm_i32m1(__VA_ARGS__)
+#define vadc_vvm_i32m2(...) __riscv_vadc_vvm_i32m2(__VA_ARGS__)
+#define vadc_vxm_i32m2(...) __riscv_vadc_vxm_i32m2(__VA_ARGS__)
+#define vadc_vvm_i32m4(...) __riscv_vadc_vvm_i32m4(__VA_ARGS__)
+#define vadc_vxm_i32m4(...) __riscv_vadc_vxm_i32m4(__VA_ARGS__)
+#define vadc_vvm_i32m8(...) __riscv_vadc_vvm_i32m8(__VA_ARGS__)
+#define vadc_vxm_i32m8(...) __riscv_vadc_vxm_i32m8(__VA_ARGS__)
+#define vadc_vvm_i64m1(...) __riscv_vadc_vvm_i64m1(__VA_ARGS__)
+#define vadc_vxm_i64m1(...) __riscv_vadc_vxm_i64m1(__VA_ARGS__)
+#define vadc_vvm_i64m2(...) __riscv_vadc_vvm_i64m2(__VA_ARGS__)
+#define vadc_vxm_i64m2(...) __riscv_vadc_vxm_i64m2(__VA_ARGS__)
+#define vadc_vvm_i64m4(...) __riscv_vadc_vvm_i64m4(__VA_ARGS__)
+#define vadc_vxm_i64m4(...) __riscv_vadc_vxm_i64m4(__VA_ARGS__)
+#define vadc_vvm_i64m8(...) __riscv_vadc_vvm_i64m8(__VA_ARGS__)
+#define vadc_vxm_i64m8(...) __riscv_vadc_vxm_i64m8(__VA_ARGS__)
+#define vsbc_vvm_i8mf8(...) __riscv_vsbc_vvm_i8mf8(__VA_ARGS__)  // vsbc aliases, signed types i8mf8..i64m8 (vvm/vxm forms): legacy name -> __riscv_ prefix, args forwarded unchanged
+#define vsbc_vxm_i8mf8(...) __riscv_vsbc_vxm_i8mf8(__VA_ARGS__)
+#define vsbc_vvm_i8mf4(...) __riscv_vsbc_vvm_i8mf4(__VA_ARGS__)
+#define vsbc_vxm_i8mf4(...) __riscv_vsbc_vxm_i8mf4(__VA_ARGS__)
+#define vsbc_vvm_i8mf2(...) __riscv_vsbc_vvm_i8mf2(__VA_ARGS__)
+#define vsbc_vxm_i8mf2(...) __riscv_vsbc_vxm_i8mf2(__VA_ARGS__)
+#define vsbc_vvm_i8m1(...) __riscv_vsbc_vvm_i8m1(__VA_ARGS__)
+#define vsbc_vxm_i8m1(...) __riscv_vsbc_vxm_i8m1(__VA_ARGS__)
+#define vsbc_vvm_i8m2(...) __riscv_vsbc_vvm_i8m2(__VA_ARGS__)
+#define vsbc_vxm_i8m2(...) __riscv_vsbc_vxm_i8m2(__VA_ARGS__)
+#define vsbc_vvm_i8m4(...) __riscv_vsbc_vvm_i8m4(__VA_ARGS__)
+#define vsbc_vxm_i8m4(...) __riscv_vsbc_vxm_i8m4(__VA_ARGS__)
+#define vsbc_vvm_i8m8(...) __riscv_vsbc_vvm_i8m8(__VA_ARGS__)
+#define vsbc_vxm_i8m8(...) __riscv_vsbc_vxm_i8m8(__VA_ARGS__)
+#define vsbc_vvm_i16mf4(...) __riscv_vsbc_vvm_i16mf4(__VA_ARGS__)
+#define vsbc_vxm_i16mf4(...) __riscv_vsbc_vxm_i16mf4(__VA_ARGS__)
+#define vsbc_vvm_i16mf2(...) __riscv_vsbc_vvm_i16mf2(__VA_ARGS__)
+#define vsbc_vxm_i16mf2(...) __riscv_vsbc_vxm_i16mf2(__VA_ARGS__)
+#define vsbc_vvm_i16m1(...) __riscv_vsbc_vvm_i16m1(__VA_ARGS__)
+#define vsbc_vxm_i16m1(...) __riscv_vsbc_vxm_i16m1(__VA_ARGS__)
+#define vsbc_vvm_i16m2(...) __riscv_vsbc_vvm_i16m2(__VA_ARGS__)
+#define vsbc_vxm_i16m2(...) __riscv_vsbc_vxm_i16m2(__VA_ARGS__)
+#define vsbc_vvm_i16m4(...) __riscv_vsbc_vvm_i16m4(__VA_ARGS__)
+#define vsbc_vxm_i16m4(...) __riscv_vsbc_vxm_i16m4(__VA_ARGS__)
+#define vsbc_vvm_i16m8(...) __riscv_vsbc_vvm_i16m8(__VA_ARGS__)
+#define vsbc_vxm_i16m8(...) __riscv_vsbc_vxm_i16m8(__VA_ARGS__)
+#define vsbc_vvm_i32mf2(...) __riscv_vsbc_vvm_i32mf2(__VA_ARGS__)
+#define vsbc_vxm_i32mf2(...) __riscv_vsbc_vxm_i32mf2(__VA_ARGS__)
+#define vsbc_vvm_i32m1(...) __riscv_vsbc_vvm_i32m1(__VA_ARGS__)
+#define vsbc_vxm_i32m1(...) __riscv_vsbc_vxm_i32m1(__VA_ARGS__)
+#define vsbc_vvm_i32m2(...) __riscv_vsbc_vvm_i32m2(__VA_ARGS__)
+#define vsbc_vxm_i32m2(...) __riscv_vsbc_vxm_i32m2(__VA_ARGS__)
+#define vsbc_vvm_i32m4(...) __riscv_vsbc_vvm_i32m4(__VA_ARGS__)
+#define vsbc_vxm_i32m4(...) __riscv_vsbc_vxm_i32m4(__VA_ARGS__)
+#define vsbc_vvm_i32m8(...) __riscv_vsbc_vvm_i32m8(__VA_ARGS__)
+#define vsbc_vxm_i32m8(...) __riscv_vsbc_vxm_i32m8(__VA_ARGS__)
+#define vsbc_vvm_i64m1(...) __riscv_vsbc_vvm_i64m1(__VA_ARGS__)
+#define vsbc_vxm_i64m1(...) __riscv_vsbc_vxm_i64m1(__VA_ARGS__)
+#define vsbc_vvm_i64m2(...) __riscv_vsbc_vvm_i64m2(__VA_ARGS__)
+#define vsbc_vxm_i64m2(...) __riscv_vsbc_vxm_i64m2(__VA_ARGS__)
+#define vsbc_vvm_i64m4(...) __riscv_vsbc_vvm_i64m4(__VA_ARGS__)
+#define vsbc_vxm_i64m4(...) __riscv_vsbc_vxm_i64m4(__VA_ARGS__)
+#define vsbc_vvm_i64m8(...) __riscv_vsbc_vvm_i64m8(__VA_ARGS__)
+#define vsbc_vxm_i64m8(...) __riscv_vsbc_vxm_i64m8(__VA_ARGS__)
+#define vadc_vvm_u8mf8(...) __riscv_vadc_vvm_u8mf8(__VA_ARGS__)  // vadc aliases, unsigned types u8mf8..u64m8 (vvm/vxm forms): legacy name -> __riscv_ prefix, args forwarded unchanged
+#define vadc_vxm_u8mf8(...) __riscv_vadc_vxm_u8mf8(__VA_ARGS__)
+#define vadc_vvm_u8mf4(...) __riscv_vadc_vvm_u8mf4(__VA_ARGS__)
+#define vadc_vxm_u8mf4(...) __riscv_vadc_vxm_u8mf4(__VA_ARGS__)
+#define vadc_vvm_u8mf2(...) __riscv_vadc_vvm_u8mf2(__VA_ARGS__)
+#define vadc_vxm_u8mf2(...) __riscv_vadc_vxm_u8mf2(__VA_ARGS__)
+#define vadc_vvm_u8m1(...) __riscv_vadc_vvm_u8m1(__VA_ARGS__)
+#define vadc_vxm_u8m1(...) __riscv_vadc_vxm_u8m1(__VA_ARGS__)
+#define vadc_vvm_u8m2(...) __riscv_vadc_vvm_u8m2(__VA_ARGS__)
+#define vadc_vxm_u8m2(...) __riscv_vadc_vxm_u8m2(__VA_ARGS__)
+#define vadc_vvm_u8m4(...) __riscv_vadc_vvm_u8m4(__VA_ARGS__)
+#define vadc_vxm_u8m4(...) __riscv_vadc_vxm_u8m4(__VA_ARGS__)
+#define vadc_vvm_u8m8(...) __riscv_vadc_vvm_u8m8(__VA_ARGS__)
+#define vadc_vxm_u8m8(...) __riscv_vadc_vxm_u8m8(__VA_ARGS__)
+#define vadc_vvm_u16mf4(...) __riscv_vadc_vvm_u16mf4(__VA_ARGS__)
+#define vadc_vxm_u16mf4(...) __riscv_vadc_vxm_u16mf4(__VA_ARGS__)
+#define vadc_vvm_u16mf2(...) __riscv_vadc_vvm_u16mf2(__VA_ARGS__)
+#define vadc_vxm_u16mf2(...) __riscv_vadc_vxm_u16mf2(__VA_ARGS__)
+#define vadc_vvm_u16m1(...) __riscv_vadc_vvm_u16m1(__VA_ARGS__)
+#define vadc_vxm_u16m1(...) __riscv_vadc_vxm_u16m1(__VA_ARGS__)
+#define vadc_vvm_u16m2(...) __riscv_vadc_vvm_u16m2(__VA_ARGS__)
+#define vadc_vxm_u16m2(...) __riscv_vadc_vxm_u16m2(__VA_ARGS__)
+#define vadc_vvm_u16m4(...) __riscv_vadc_vvm_u16m4(__VA_ARGS__)
+#define vadc_vxm_u16m4(...) __riscv_vadc_vxm_u16m4(__VA_ARGS__)
+#define vadc_vvm_u16m8(...) __riscv_vadc_vvm_u16m8(__VA_ARGS__)
+#define vadc_vxm_u16m8(...) __riscv_vadc_vxm_u16m8(__VA_ARGS__)
+#define vadc_vvm_u32mf2(...) __riscv_vadc_vvm_u32mf2(__VA_ARGS__)
+#define vadc_vxm_u32mf2(...) __riscv_vadc_vxm_u32mf2(__VA_ARGS__)
+#define vadc_vvm_u32m1(...) __riscv_vadc_vvm_u32m1(__VA_ARGS__)
+#define vadc_vxm_u32m1(...) __riscv_vadc_vxm_u32m1(__VA_ARGS__)
+#define vadc_vvm_u32m2(...) __riscv_vadc_vvm_u32m2(__VA_ARGS__)
+#define vadc_vxm_u32m2(...) __riscv_vadc_vxm_u32m2(__VA_ARGS__)
+#define vadc_vvm_u32m4(...) __riscv_vadc_vvm_u32m4(__VA_ARGS__)
+#define vadc_vxm_u32m4(...) __riscv_vadc_vxm_u32m4(__VA_ARGS__)
+#define vadc_vvm_u32m8(...) __riscv_vadc_vvm_u32m8(__VA_ARGS__)
+#define vadc_vxm_u32m8(...) __riscv_vadc_vxm_u32m8(__VA_ARGS__)
+#define vadc_vvm_u64m1(...) __riscv_vadc_vvm_u64m1(__VA_ARGS__)
+#define vadc_vxm_u64m1(...) __riscv_vadc_vxm_u64m1(__VA_ARGS__)
+#define vadc_vvm_u64m2(...) __riscv_vadc_vvm_u64m2(__VA_ARGS__)
+#define vadc_vxm_u64m2(...) __riscv_vadc_vxm_u64m2(__VA_ARGS__)
+#define vadc_vvm_u64m4(...) __riscv_vadc_vvm_u64m4(__VA_ARGS__)
+#define vadc_vxm_u64m4(...) __riscv_vadc_vxm_u64m4(__VA_ARGS__)
+#define vadc_vvm_u64m8(...) __riscv_vadc_vvm_u64m8(__VA_ARGS__)
+#define vadc_vxm_u64m8(...) __riscv_vadc_vxm_u64m8(__VA_ARGS__)
+#define vsbc_vvm_u8mf8(...) __riscv_vsbc_vvm_u8mf8(__VA_ARGS__)  // vsbc aliases, unsigned types u8mf8..u64m8 (vvm/vxm forms): legacy name -> __riscv_ prefix, args forwarded unchanged
+#define vsbc_vxm_u8mf8(...) __riscv_vsbc_vxm_u8mf8(__VA_ARGS__)
+#define vsbc_vvm_u8mf4(...) __riscv_vsbc_vvm_u8mf4(__VA_ARGS__)
+#define vsbc_vxm_u8mf4(...) __riscv_vsbc_vxm_u8mf4(__VA_ARGS__)
+#define vsbc_vvm_u8mf2(...) __riscv_vsbc_vvm_u8mf2(__VA_ARGS__)
+#define vsbc_vxm_u8mf2(...) __riscv_vsbc_vxm_u8mf2(__VA_ARGS__)
+#define vsbc_vvm_u8m1(...) __riscv_vsbc_vvm_u8m1(__VA_ARGS__)
+#define vsbc_vxm_u8m1(...) __riscv_vsbc_vxm_u8m1(__VA_ARGS__)
+#define vsbc_vvm_u8m2(...) __riscv_vsbc_vvm_u8m2(__VA_ARGS__)
+#define vsbc_vxm_u8m2(...) __riscv_vsbc_vxm_u8m2(__VA_ARGS__)
+#define vsbc_vvm_u8m4(...) __riscv_vsbc_vvm_u8m4(__VA_ARGS__)
+#define vsbc_vxm_u8m4(...) __riscv_vsbc_vxm_u8m4(__VA_ARGS__)
+#define vsbc_vvm_u8m8(...) __riscv_vsbc_vvm_u8m8(__VA_ARGS__)
+#define vsbc_vxm_u8m8(...) __riscv_vsbc_vxm_u8m8(__VA_ARGS__)
+#define vsbc_vvm_u16mf4(...) __riscv_vsbc_vvm_u16mf4(__VA_ARGS__)
+#define vsbc_vxm_u16mf4(...) __riscv_vsbc_vxm_u16mf4(__VA_ARGS__)
+#define vsbc_vvm_u16mf2(...) __riscv_vsbc_vvm_u16mf2(__VA_ARGS__)
+#define vsbc_vxm_u16mf2(...) __riscv_vsbc_vxm_u16mf2(__VA_ARGS__)
+#define vsbc_vvm_u16m1(...) __riscv_vsbc_vvm_u16m1(__VA_ARGS__)
+#define vsbc_vxm_u16m1(...) __riscv_vsbc_vxm_u16m1(__VA_ARGS__)
+#define vsbc_vvm_u16m2(...) __riscv_vsbc_vvm_u16m2(__VA_ARGS__)
+#define vsbc_vxm_u16m2(...) __riscv_vsbc_vxm_u16m2(__VA_ARGS__)
+#define vsbc_vvm_u16m4(...) __riscv_vsbc_vvm_u16m4(__VA_ARGS__)
+#define vsbc_vxm_u16m4(...) __riscv_vsbc_vxm_u16m4(__VA_ARGS__)
+#define vsbc_vvm_u16m8(...) __riscv_vsbc_vvm_u16m8(__VA_ARGS__)
+#define vsbc_vxm_u16m8(...) __riscv_vsbc_vxm_u16m8(__VA_ARGS__)
+#define vsbc_vvm_u32mf2(...) __riscv_vsbc_vvm_u32mf2(__VA_ARGS__)
+#define vsbc_vxm_u32mf2(...) __riscv_vsbc_vxm_u32mf2(__VA_ARGS__)
+#define vsbc_vvm_u32m1(...) __riscv_vsbc_vvm_u32m1(__VA_ARGS__)
+#define vsbc_vxm_u32m1(...) __riscv_vsbc_vxm_u32m1(__VA_ARGS__)
+#define vsbc_vvm_u32m2(...) __riscv_vsbc_vvm_u32m2(__VA_ARGS__)
+#define vsbc_vxm_u32m2(...) __riscv_vsbc_vxm_u32m2(__VA_ARGS__)
+#define vsbc_vvm_u32m4(...) __riscv_vsbc_vvm_u32m4(__VA_ARGS__)
+#define vsbc_vxm_u32m4(...) __riscv_vsbc_vxm_u32m4(__VA_ARGS__)
+#define vsbc_vvm_u32m8(...) __riscv_vsbc_vvm_u32m8(__VA_ARGS__)
+#define vsbc_vxm_u32m8(...) __riscv_vsbc_vxm_u32m8(__VA_ARGS__)
+#define vsbc_vvm_u64m1(...) __riscv_vsbc_vvm_u64m1(__VA_ARGS__)
+#define vsbc_vxm_u64m1(...) __riscv_vsbc_vxm_u64m1(__VA_ARGS__)
+#define vsbc_vvm_u64m2(...) __riscv_vsbc_vvm_u64m2(__VA_ARGS__)
+#define vsbc_vxm_u64m2(...) __riscv_vsbc_vxm_u64m2(__VA_ARGS__)
+#define vsbc_vvm_u64m4(...) __riscv_vsbc_vvm_u64m4(__VA_ARGS__)
+#define vsbc_vxm_u64m4(...) __riscv_vsbc_vxm_u64m4(__VA_ARGS__)
+#define vsbc_vvm_u64m8(...) __riscv_vsbc_vvm_u64m8(__VA_ARGS__)
+#define vsbc_vxm_u64m8(...) __riscv_vsbc_vxm_u64m8(__VA_ARGS__)
+#define vmadc_vvm_i8mf8_b64(...) __riscv_vmadc_vvm_i8mf8_b64(__VA_ARGS__)  // vmadc_* aliases, signed (i8..i64) type suffixes: each unprefixed name expands to the same name with a __riscv_ prefix, forwarding every argument unchanged.
+#define vmadc_vxm_i8mf8_b64(...) __riscv_vmadc_vxm_i8mf8_b64(__VA_ARGS__)  // Four name variants per type suffix are aliased: _vvm, _vxm, _vv, _vx.
+#define vmadc_vv_i8mf8_b64(...) __riscv_vmadc_vv_i8mf8_b64(__VA_ARGS__)  // NOTE(review): the trailing _bN presumably names the mask type of the result - confirm against the RVV intrinsics spec.
+#define vmadc_vx_i8mf8_b64(...) __riscv_vmadc_vx_i8mf8_b64(__VA_ARGS__)
+#define vmadc_vvm_i8mf4_b32(...) __riscv_vmadc_vvm_i8mf4_b32(__VA_ARGS__)
+#define vmadc_vxm_i8mf4_b32(...) __riscv_vmadc_vxm_i8mf4_b32(__VA_ARGS__)
+#define vmadc_vv_i8mf4_b32(...) __riscv_vmadc_vv_i8mf4_b32(__VA_ARGS__)
+#define vmadc_vx_i8mf4_b32(...) __riscv_vmadc_vx_i8mf4_b32(__VA_ARGS__)
+#define vmadc_vvm_i8mf2_b16(...) __riscv_vmadc_vvm_i8mf2_b16(__VA_ARGS__)
+#define vmadc_vxm_i8mf2_b16(...) __riscv_vmadc_vxm_i8mf2_b16(__VA_ARGS__)
+#define vmadc_vv_i8mf2_b16(...) __riscv_vmadc_vv_i8mf2_b16(__VA_ARGS__)
+#define vmadc_vx_i8mf2_b16(...) __riscv_vmadc_vx_i8mf2_b16(__VA_ARGS__)
+#define vmadc_vvm_i8m1_b8(...) __riscv_vmadc_vvm_i8m1_b8(__VA_ARGS__)
+#define vmadc_vxm_i8m1_b8(...) __riscv_vmadc_vxm_i8m1_b8(__VA_ARGS__)
+#define vmadc_vv_i8m1_b8(...) __riscv_vmadc_vv_i8m1_b8(__VA_ARGS__)
+#define vmadc_vx_i8m1_b8(...) __riscv_vmadc_vx_i8m1_b8(__VA_ARGS__)
+#define vmadc_vvm_i8m2_b4(...) __riscv_vmadc_vvm_i8m2_b4(__VA_ARGS__)
+#define vmadc_vxm_i8m2_b4(...) __riscv_vmadc_vxm_i8m2_b4(__VA_ARGS__)
+#define vmadc_vv_i8m2_b4(...) __riscv_vmadc_vv_i8m2_b4(__VA_ARGS__)
+#define vmadc_vx_i8m2_b4(...) __riscv_vmadc_vx_i8m2_b4(__VA_ARGS__)
+#define vmadc_vvm_i8m4_b2(...) __riscv_vmadc_vvm_i8m4_b2(__VA_ARGS__)
+#define vmadc_vxm_i8m4_b2(...) __riscv_vmadc_vxm_i8m4_b2(__VA_ARGS__)
+#define vmadc_vv_i8m4_b2(...) __riscv_vmadc_vv_i8m4_b2(__VA_ARGS__)
+#define vmadc_vx_i8m4_b2(...) __riscv_vmadc_vx_i8m4_b2(__VA_ARGS__)
+#define vmadc_vvm_i8m8_b1(...) __riscv_vmadc_vvm_i8m8_b1(__VA_ARGS__)
+#define vmadc_vxm_i8m8_b1(...) __riscv_vmadc_vxm_i8m8_b1(__VA_ARGS__)
+#define vmadc_vv_i8m8_b1(...) __riscv_vmadc_vv_i8m8_b1(__VA_ARGS__)
+#define vmadc_vx_i8m8_b1(...) __riscv_vmadc_vx_i8m8_b1(__VA_ARGS__)
+#define vmadc_vvm_i16mf4_b64(...) __riscv_vmadc_vvm_i16mf4_b64(__VA_ARGS__)
+#define vmadc_vxm_i16mf4_b64(...) __riscv_vmadc_vxm_i16mf4_b64(__VA_ARGS__)
+#define vmadc_vv_i16mf4_b64(...) __riscv_vmadc_vv_i16mf4_b64(__VA_ARGS__)
+#define vmadc_vx_i16mf4_b64(...) __riscv_vmadc_vx_i16mf4_b64(__VA_ARGS__)
+#define vmadc_vvm_i16mf2_b32(...) __riscv_vmadc_vvm_i16mf2_b32(__VA_ARGS__)
+#define vmadc_vxm_i16mf2_b32(...) __riscv_vmadc_vxm_i16mf2_b32(__VA_ARGS__)
+#define vmadc_vv_i16mf2_b32(...) __riscv_vmadc_vv_i16mf2_b32(__VA_ARGS__)
+#define vmadc_vx_i16mf2_b32(...) __riscv_vmadc_vx_i16mf2_b32(__VA_ARGS__)
+#define vmadc_vvm_i16m1_b16(...) __riscv_vmadc_vvm_i16m1_b16(__VA_ARGS__)
+#define vmadc_vxm_i16m1_b16(...) __riscv_vmadc_vxm_i16m1_b16(__VA_ARGS__)
+#define vmadc_vv_i16m1_b16(...) __riscv_vmadc_vv_i16m1_b16(__VA_ARGS__)
+#define vmadc_vx_i16m1_b16(...) __riscv_vmadc_vx_i16m1_b16(__VA_ARGS__)
+#define vmadc_vvm_i16m2_b8(...) __riscv_vmadc_vvm_i16m2_b8(__VA_ARGS__)
+#define vmadc_vxm_i16m2_b8(...) __riscv_vmadc_vxm_i16m2_b8(__VA_ARGS__)
+#define vmadc_vv_i16m2_b8(...) __riscv_vmadc_vv_i16m2_b8(__VA_ARGS__)
+#define vmadc_vx_i16m2_b8(...) __riscv_vmadc_vx_i16m2_b8(__VA_ARGS__)
+#define vmadc_vvm_i16m4_b4(...) __riscv_vmadc_vvm_i16m4_b4(__VA_ARGS__)
+#define vmadc_vxm_i16m4_b4(...) __riscv_vmadc_vxm_i16m4_b4(__VA_ARGS__)
+#define vmadc_vv_i16m4_b4(...) __riscv_vmadc_vv_i16m4_b4(__VA_ARGS__)
+#define vmadc_vx_i16m4_b4(...) __riscv_vmadc_vx_i16m4_b4(__VA_ARGS__)
+#define vmadc_vvm_i16m8_b2(...) __riscv_vmadc_vvm_i16m8_b2(__VA_ARGS__)
+#define vmadc_vxm_i16m8_b2(...) __riscv_vmadc_vxm_i16m8_b2(__VA_ARGS__)
+#define vmadc_vv_i16m8_b2(...) __riscv_vmadc_vv_i16m8_b2(__VA_ARGS__)
+#define vmadc_vx_i16m8_b2(...) __riscv_vmadc_vx_i16m8_b2(__VA_ARGS__)
+#define vmadc_vvm_i32mf2_b64(...) __riscv_vmadc_vvm_i32mf2_b64(__VA_ARGS__)
+#define vmadc_vxm_i32mf2_b64(...) __riscv_vmadc_vxm_i32mf2_b64(__VA_ARGS__)
+#define vmadc_vv_i32mf2_b64(...) __riscv_vmadc_vv_i32mf2_b64(__VA_ARGS__)
+#define vmadc_vx_i32mf2_b64(...) __riscv_vmadc_vx_i32mf2_b64(__VA_ARGS__)
+#define vmadc_vvm_i32m1_b32(...) __riscv_vmadc_vvm_i32m1_b32(__VA_ARGS__)
+#define vmadc_vxm_i32m1_b32(...) __riscv_vmadc_vxm_i32m1_b32(__VA_ARGS__)
+#define vmadc_vv_i32m1_b32(...) __riscv_vmadc_vv_i32m1_b32(__VA_ARGS__)
+#define vmadc_vx_i32m1_b32(...) __riscv_vmadc_vx_i32m1_b32(__VA_ARGS__)
+#define vmadc_vvm_i32m2_b16(...) __riscv_vmadc_vvm_i32m2_b16(__VA_ARGS__)
+#define vmadc_vxm_i32m2_b16(...) __riscv_vmadc_vxm_i32m2_b16(__VA_ARGS__)
+#define vmadc_vv_i32m2_b16(...) __riscv_vmadc_vv_i32m2_b16(__VA_ARGS__)
+#define vmadc_vx_i32m2_b16(...) __riscv_vmadc_vx_i32m2_b16(__VA_ARGS__)
+#define vmadc_vvm_i32m4_b8(...) __riscv_vmadc_vvm_i32m4_b8(__VA_ARGS__)
+#define vmadc_vxm_i32m4_b8(...) __riscv_vmadc_vxm_i32m4_b8(__VA_ARGS__)
+#define vmadc_vv_i32m4_b8(...) __riscv_vmadc_vv_i32m4_b8(__VA_ARGS__)
+#define vmadc_vx_i32m4_b8(...) __riscv_vmadc_vx_i32m4_b8(__VA_ARGS__)
+#define vmadc_vvm_i32m8_b4(...) __riscv_vmadc_vvm_i32m8_b4(__VA_ARGS__)
+#define vmadc_vxm_i32m8_b4(...) __riscv_vmadc_vxm_i32m8_b4(__VA_ARGS__)
+#define vmadc_vv_i32m8_b4(...) __riscv_vmadc_vv_i32m8_b4(__VA_ARGS__)
+#define vmadc_vx_i32m8_b4(...) __riscv_vmadc_vx_i32m8_b4(__VA_ARGS__)
+#define vmadc_vvm_i64m1_b64(...) __riscv_vmadc_vvm_i64m1_b64(__VA_ARGS__)
+#define vmadc_vxm_i64m1_b64(...) __riscv_vmadc_vxm_i64m1_b64(__VA_ARGS__)
+#define vmadc_vv_i64m1_b64(...) __riscv_vmadc_vv_i64m1_b64(__VA_ARGS__)
+#define vmadc_vx_i64m1_b64(...) __riscv_vmadc_vx_i64m1_b64(__VA_ARGS__)
+#define vmadc_vvm_i64m2_b32(...) __riscv_vmadc_vvm_i64m2_b32(__VA_ARGS__)
+#define vmadc_vxm_i64m2_b32(...) __riscv_vmadc_vxm_i64m2_b32(__VA_ARGS__)
+#define vmadc_vv_i64m2_b32(...) __riscv_vmadc_vv_i64m2_b32(__VA_ARGS__)
+#define vmadc_vx_i64m2_b32(...) __riscv_vmadc_vx_i64m2_b32(__VA_ARGS__)
+#define vmadc_vvm_i64m4_b16(...) __riscv_vmadc_vvm_i64m4_b16(__VA_ARGS__)
+#define vmadc_vxm_i64m4_b16(...) __riscv_vmadc_vxm_i64m4_b16(__VA_ARGS__)
+#define vmadc_vv_i64m4_b16(...) __riscv_vmadc_vv_i64m4_b16(__VA_ARGS__)
+#define vmadc_vx_i64m4_b16(...) __riscv_vmadc_vx_i64m4_b16(__VA_ARGS__)
+#define vmadc_vvm_i64m8_b8(...) __riscv_vmadc_vvm_i64m8_b8(__VA_ARGS__)
+#define vmadc_vxm_i64m8_b8(...) __riscv_vmadc_vxm_i64m8_b8(__VA_ARGS__)
+#define vmadc_vv_i64m8_b8(...) __riscv_vmadc_vv_i64m8_b8(__VA_ARGS__)
+#define vmadc_vx_i64m8_b8(...) __riscv_vmadc_vx_i64m8_b8(__VA_ARGS__)
+#define vmsbc_vvm_i8mf8_b64(...) __riscv_vmsbc_vvm_i8mf8_b64(__VA_ARGS__)  // vmsbc_* aliases, signed (i8..i64) type suffixes: each unprefixed name expands to the same name with a __riscv_ prefix, forwarding every argument unchanged.
+#define vmsbc_vxm_i8mf8_b64(...) __riscv_vmsbc_vxm_i8mf8_b64(__VA_ARGS__)  // Four name variants per type suffix are aliased: _vvm, _vxm, _vv, _vx.
+#define vmsbc_vv_i8mf8_b64(...) __riscv_vmsbc_vv_i8mf8_b64(__VA_ARGS__)  // NOTE(review): the trailing _bN presumably names the mask type of the result - confirm against the RVV intrinsics spec.
+#define vmsbc_vx_i8mf8_b64(...) __riscv_vmsbc_vx_i8mf8_b64(__VA_ARGS__)
+#define vmsbc_vvm_i8mf4_b32(...) __riscv_vmsbc_vvm_i8mf4_b32(__VA_ARGS__)
+#define vmsbc_vxm_i8mf4_b32(...) __riscv_vmsbc_vxm_i8mf4_b32(__VA_ARGS__)
+#define vmsbc_vv_i8mf4_b32(...) __riscv_vmsbc_vv_i8mf4_b32(__VA_ARGS__)
+#define vmsbc_vx_i8mf4_b32(...) __riscv_vmsbc_vx_i8mf4_b32(__VA_ARGS__)
+#define vmsbc_vvm_i8mf2_b16(...) __riscv_vmsbc_vvm_i8mf2_b16(__VA_ARGS__)
+#define vmsbc_vxm_i8mf2_b16(...) __riscv_vmsbc_vxm_i8mf2_b16(__VA_ARGS__)
+#define vmsbc_vv_i8mf2_b16(...) __riscv_vmsbc_vv_i8mf2_b16(__VA_ARGS__)
+#define vmsbc_vx_i8mf2_b16(...) __riscv_vmsbc_vx_i8mf2_b16(__VA_ARGS__)
+#define vmsbc_vvm_i8m1_b8(...) __riscv_vmsbc_vvm_i8m1_b8(__VA_ARGS__)
+#define vmsbc_vxm_i8m1_b8(...) __riscv_vmsbc_vxm_i8m1_b8(__VA_ARGS__)
+#define vmsbc_vv_i8m1_b8(...) __riscv_vmsbc_vv_i8m1_b8(__VA_ARGS__)
+#define vmsbc_vx_i8m1_b8(...) __riscv_vmsbc_vx_i8m1_b8(__VA_ARGS__)
+#define vmsbc_vvm_i8m2_b4(...) __riscv_vmsbc_vvm_i8m2_b4(__VA_ARGS__)
+#define vmsbc_vxm_i8m2_b4(...) __riscv_vmsbc_vxm_i8m2_b4(__VA_ARGS__)
+#define vmsbc_vv_i8m2_b4(...) __riscv_vmsbc_vv_i8m2_b4(__VA_ARGS__)
+#define vmsbc_vx_i8m2_b4(...) __riscv_vmsbc_vx_i8m2_b4(__VA_ARGS__)
+#define vmsbc_vvm_i8m4_b2(...) __riscv_vmsbc_vvm_i8m4_b2(__VA_ARGS__)
+#define vmsbc_vxm_i8m4_b2(...) __riscv_vmsbc_vxm_i8m4_b2(__VA_ARGS__)
+#define vmsbc_vv_i8m4_b2(...) __riscv_vmsbc_vv_i8m4_b2(__VA_ARGS__)
+#define vmsbc_vx_i8m4_b2(...) __riscv_vmsbc_vx_i8m4_b2(__VA_ARGS__)
+#define vmsbc_vvm_i8m8_b1(...) __riscv_vmsbc_vvm_i8m8_b1(__VA_ARGS__)
+#define vmsbc_vxm_i8m8_b1(...) __riscv_vmsbc_vxm_i8m8_b1(__VA_ARGS__)
+#define vmsbc_vv_i8m8_b1(...) __riscv_vmsbc_vv_i8m8_b1(__VA_ARGS__)
+#define vmsbc_vx_i8m8_b1(...) __riscv_vmsbc_vx_i8m8_b1(__VA_ARGS__)
+#define vmsbc_vvm_i16mf4_b64(...) __riscv_vmsbc_vvm_i16mf4_b64(__VA_ARGS__)
+#define vmsbc_vxm_i16mf4_b64(...) __riscv_vmsbc_vxm_i16mf4_b64(__VA_ARGS__)
+#define vmsbc_vv_i16mf4_b64(...) __riscv_vmsbc_vv_i16mf4_b64(__VA_ARGS__)
+#define vmsbc_vx_i16mf4_b64(...) __riscv_vmsbc_vx_i16mf4_b64(__VA_ARGS__)
+#define vmsbc_vvm_i16mf2_b32(...) __riscv_vmsbc_vvm_i16mf2_b32(__VA_ARGS__)
+#define vmsbc_vxm_i16mf2_b32(...) __riscv_vmsbc_vxm_i16mf2_b32(__VA_ARGS__)
+#define vmsbc_vv_i16mf2_b32(...) __riscv_vmsbc_vv_i16mf2_b32(__VA_ARGS__)
+#define vmsbc_vx_i16mf2_b32(...) __riscv_vmsbc_vx_i16mf2_b32(__VA_ARGS__)
+#define vmsbc_vvm_i16m1_b16(...) __riscv_vmsbc_vvm_i16m1_b16(__VA_ARGS__)
+#define vmsbc_vxm_i16m1_b16(...) __riscv_vmsbc_vxm_i16m1_b16(__VA_ARGS__)
+#define vmsbc_vv_i16m1_b16(...) __riscv_vmsbc_vv_i16m1_b16(__VA_ARGS__)
+#define vmsbc_vx_i16m1_b16(...) __riscv_vmsbc_vx_i16m1_b16(__VA_ARGS__)
+#define vmsbc_vvm_i16m2_b8(...) __riscv_vmsbc_vvm_i16m2_b8(__VA_ARGS__)
+#define vmsbc_vxm_i16m2_b8(...) __riscv_vmsbc_vxm_i16m2_b8(__VA_ARGS__)
+#define vmsbc_vv_i16m2_b8(...) __riscv_vmsbc_vv_i16m2_b8(__VA_ARGS__)
+#define vmsbc_vx_i16m2_b8(...) __riscv_vmsbc_vx_i16m2_b8(__VA_ARGS__)
+#define vmsbc_vvm_i16m4_b4(...) __riscv_vmsbc_vvm_i16m4_b4(__VA_ARGS__)
+#define vmsbc_vxm_i16m4_b4(...) __riscv_vmsbc_vxm_i16m4_b4(__VA_ARGS__)
+#define vmsbc_vv_i16m4_b4(...) __riscv_vmsbc_vv_i16m4_b4(__VA_ARGS__)
+#define vmsbc_vx_i16m4_b4(...) __riscv_vmsbc_vx_i16m4_b4(__VA_ARGS__)
+#define vmsbc_vvm_i16m8_b2(...) __riscv_vmsbc_vvm_i16m8_b2(__VA_ARGS__)
+#define vmsbc_vxm_i16m8_b2(...) __riscv_vmsbc_vxm_i16m8_b2(__VA_ARGS__)
+#define vmsbc_vv_i16m8_b2(...) __riscv_vmsbc_vv_i16m8_b2(__VA_ARGS__)
+#define vmsbc_vx_i16m8_b2(...) __riscv_vmsbc_vx_i16m8_b2(__VA_ARGS__)
+#define vmsbc_vvm_i32mf2_b64(...) __riscv_vmsbc_vvm_i32mf2_b64(__VA_ARGS__)
+#define vmsbc_vxm_i32mf2_b64(...) __riscv_vmsbc_vxm_i32mf2_b64(__VA_ARGS__)
+#define vmsbc_vv_i32mf2_b64(...) __riscv_vmsbc_vv_i32mf2_b64(__VA_ARGS__)
+#define vmsbc_vx_i32mf2_b64(...) __riscv_vmsbc_vx_i32mf2_b64(__VA_ARGS__)
+#define vmsbc_vvm_i32m1_b32(...) __riscv_vmsbc_vvm_i32m1_b32(__VA_ARGS__)
+#define vmsbc_vxm_i32m1_b32(...) __riscv_vmsbc_vxm_i32m1_b32(__VA_ARGS__)
+#define vmsbc_vv_i32m1_b32(...) __riscv_vmsbc_vv_i32m1_b32(__VA_ARGS__)
+#define vmsbc_vx_i32m1_b32(...) __riscv_vmsbc_vx_i32m1_b32(__VA_ARGS__)
+#define vmsbc_vvm_i32m2_b16(...) __riscv_vmsbc_vvm_i32m2_b16(__VA_ARGS__)
+#define vmsbc_vxm_i32m2_b16(...) __riscv_vmsbc_vxm_i32m2_b16(__VA_ARGS__)
+#define vmsbc_vv_i32m2_b16(...) __riscv_vmsbc_vv_i32m2_b16(__VA_ARGS__)
+#define vmsbc_vx_i32m2_b16(...) __riscv_vmsbc_vx_i32m2_b16(__VA_ARGS__)
+#define vmsbc_vvm_i32m4_b8(...) __riscv_vmsbc_vvm_i32m4_b8(__VA_ARGS__)
+#define vmsbc_vxm_i32m4_b8(...) __riscv_vmsbc_vxm_i32m4_b8(__VA_ARGS__)
+#define vmsbc_vv_i32m4_b8(...) __riscv_vmsbc_vv_i32m4_b8(__VA_ARGS__)
+#define vmsbc_vx_i32m4_b8(...) __riscv_vmsbc_vx_i32m4_b8(__VA_ARGS__)
+#define vmsbc_vvm_i32m8_b4(...) __riscv_vmsbc_vvm_i32m8_b4(__VA_ARGS__)
+#define vmsbc_vxm_i32m8_b4(...) __riscv_vmsbc_vxm_i32m8_b4(__VA_ARGS__)
+#define vmsbc_vv_i32m8_b4(...) __riscv_vmsbc_vv_i32m8_b4(__VA_ARGS__)
+#define vmsbc_vx_i32m8_b4(...) __riscv_vmsbc_vx_i32m8_b4(__VA_ARGS__)
+#define vmsbc_vvm_i64m1_b64(...) __riscv_vmsbc_vvm_i64m1_b64(__VA_ARGS__)
+#define vmsbc_vxm_i64m1_b64(...) __riscv_vmsbc_vxm_i64m1_b64(__VA_ARGS__)
+#define vmsbc_vv_i64m1_b64(...) __riscv_vmsbc_vv_i64m1_b64(__VA_ARGS__)
+#define vmsbc_vx_i64m1_b64(...) __riscv_vmsbc_vx_i64m1_b64(__VA_ARGS__)
+#define vmsbc_vvm_i64m2_b32(...) __riscv_vmsbc_vvm_i64m2_b32(__VA_ARGS__)
+#define vmsbc_vxm_i64m2_b32(...) __riscv_vmsbc_vxm_i64m2_b32(__VA_ARGS__)
+#define vmsbc_vv_i64m2_b32(...) __riscv_vmsbc_vv_i64m2_b32(__VA_ARGS__)
+#define vmsbc_vx_i64m2_b32(...) __riscv_vmsbc_vx_i64m2_b32(__VA_ARGS__)
+#define vmsbc_vvm_i64m4_b16(...) __riscv_vmsbc_vvm_i64m4_b16(__VA_ARGS__)
+#define vmsbc_vxm_i64m4_b16(...) __riscv_vmsbc_vxm_i64m4_b16(__VA_ARGS__)
+#define vmsbc_vv_i64m4_b16(...) __riscv_vmsbc_vv_i64m4_b16(__VA_ARGS__)
+#define vmsbc_vx_i64m4_b16(...) __riscv_vmsbc_vx_i64m4_b16(__VA_ARGS__)
+#define vmsbc_vvm_i64m8_b8(...) __riscv_vmsbc_vvm_i64m8_b8(__VA_ARGS__)
+#define vmsbc_vxm_i64m8_b8(...) __riscv_vmsbc_vxm_i64m8_b8(__VA_ARGS__)
+#define vmsbc_vv_i64m8_b8(...) __riscv_vmsbc_vv_i64m8_b8(__VA_ARGS__)
+#define vmsbc_vx_i64m8_b8(...) __riscv_vmsbc_vx_i64m8_b8(__VA_ARGS__)
+#define vmadc_vvm_u8mf8_b64(...) __riscv_vmadc_vvm_u8mf8_b64(__VA_ARGS__)  // vmadc_* aliases, unsigned (u8..u64) type suffixes: each unprefixed name expands to the same name with a __riscv_ prefix, forwarding every argument unchanged.
+#define vmadc_vxm_u8mf8_b64(...) __riscv_vmadc_vxm_u8mf8_b64(__VA_ARGS__)  // Four name variants per type suffix are aliased: _vvm, _vxm, _vv, _vx.
+#define vmadc_vv_u8mf8_b64(...) __riscv_vmadc_vv_u8mf8_b64(__VA_ARGS__)  // NOTE(review): the trailing _bN presumably names the mask type of the result - confirm against the RVV intrinsics spec.
+#define vmadc_vx_u8mf8_b64(...) __riscv_vmadc_vx_u8mf8_b64(__VA_ARGS__)
+#define vmadc_vvm_u8mf4_b32(...) __riscv_vmadc_vvm_u8mf4_b32(__VA_ARGS__)
+#define vmadc_vxm_u8mf4_b32(...) __riscv_vmadc_vxm_u8mf4_b32(__VA_ARGS__)
+#define vmadc_vv_u8mf4_b32(...) __riscv_vmadc_vv_u8mf4_b32(__VA_ARGS__)
+#define vmadc_vx_u8mf4_b32(...) __riscv_vmadc_vx_u8mf4_b32(__VA_ARGS__)
+#define vmadc_vvm_u8mf2_b16(...) __riscv_vmadc_vvm_u8mf2_b16(__VA_ARGS__)
+#define vmadc_vxm_u8mf2_b16(...) __riscv_vmadc_vxm_u8mf2_b16(__VA_ARGS__)
+#define vmadc_vv_u8mf2_b16(...) __riscv_vmadc_vv_u8mf2_b16(__VA_ARGS__)
+#define vmadc_vx_u8mf2_b16(...) __riscv_vmadc_vx_u8mf2_b16(__VA_ARGS__)
+#define vmadc_vvm_u8m1_b8(...) __riscv_vmadc_vvm_u8m1_b8(__VA_ARGS__)
+#define vmadc_vxm_u8m1_b8(...) __riscv_vmadc_vxm_u8m1_b8(__VA_ARGS__)
+#define vmadc_vv_u8m1_b8(...) __riscv_vmadc_vv_u8m1_b8(__VA_ARGS__)
+#define vmadc_vx_u8m1_b8(...) __riscv_vmadc_vx_u8m1_b8(__VA_ARGS__)
+#define vmadc_vvm_u8m2_b4(...) __riscv_vmadc_vvm_u8m2_b4(__VA_ARGS__)
+#define vmadc_vxm_u8m2_b4(...) __riscv_vmadc_vxm_u8m2_b4(__VA_ARGS__)
+#define vmadc_vv_u8m2_b4(...) __riscv_vmadc_vv_u8m2_b4(__VA_ARGS__)
+#define vmadc_vx_u8m2_b4(...) __riscv_vmadc_vx_u8m2_b4(__VA_ARGS__)
+#define vmadc_vvm_u8m4_b2(...) __riscv_vmadc_vvm_u8m4_b2(__VA_ARGS__)
+#define vmadc_vxm_u8m4_b2(...) __riscv_vmadc_vxm_u8m4_b2(__VA_ARGS__)
+#define vmadc_vv_u8m4_b2(...) __riscv_vmadc_vv_u8m4_b2(__VA_ARGS__)
+#define vmadc_vx_u8m4_b2(...) __riscv_vmadc_vx_u8m4_b2(__VA_ARGS__)
+#define vmadc_vvm_u8m8_b1(...) __riscv_vmadc_vvm_u8m8_b1(__VA_ARGS__)
+#define vmadc_vxm_u8m8_b1(...) __riscv_vmadc_vxm_u8m8_b1(__VA_ARGS__)
+#define vmadc_vv_u8m8_b1(...) __riscv_vmadc_vv_u8m8_b1(__VA_ARGS__)
+#define vmadc_vx_u8m8_b1(...) __riscv_vmadc_vx_u8m8_b1(__VA_ARGS__)
+#define vmadc_vvm_u16mf4_b64(...) __riscv_vmadc_vvm_u16mf4_b64(__VA_ARGS__)
+#define vmadc_vxm_u16mf4_b64(...) __riscv_vmadc_vxm_u16mf4_b64(__VA_ARGS__)
+#define vmadc_vv_u16mf4_b64(...) __riscv_vmadc_vv_u16mf4_b64(__VA_ARGS__)
+#define vmadc_vx_u16mf4_b64(...) __riscv_vmadc_vx_u16mf4_b64(__VA_ARGS__)
+#define vmadc_vvm_u16mf2_b32(...) __riscv_vmadc_vvm_u16mf2_b32(__VA_ARGS__)
+#define vmadc_vxm_u16mf2_b32(...) __riscv_vmadc_vxm_u16mf2_b32(__VA_ARGS__)
+#define vmadc_vv_u16mf2_b32(...) __riscv_vmadc_vv_u16mf2_b32(__VA_ARGS__)
+#define vmadc_vx_u16mf2_b32(...) __riscv_vmadc_vx_u16mf2_b32(__VA_ARGS__)
+#define vmadc_vvm_u16m1_b16(...) __riscv_vmadc_vvm_u16m1_b16(__VA_ARGS__)
+#define vmadc_vxm_u16m1_b16(...) __riscv_vmadc_vxm_u16m1_b16(__VA_ARGS__)
+#define vmadc_vv_u16m1_b16(...) __riscv_vmadc_vv_u16m1_b16(__VA_ARGS__)
+#define vmadc_vx_u16m1_b16(...) __riscv_vmadc_vx_u16m1_b16(__VA_ARGS__)
+#define vmadc_vvm_u16m2_b8(...) __riscv_vmadc_vvm_u16m2_b8(__VA_ARGS__)
+#define vmadc_vxm_u16m2_b8(...) __riscv_vmadc_vxm_u16m2_b8(__VA_ARGS__)
+#define vmadc_vv_u16m2_b8(...) __riscv_vmadc_vv_u16m2_b8(__VA_ARGS__)
+#define vmadc_vx_u16m2_b8(...) __riscv_vmadc_vx_u16m2_b8(__VA_ARGS__)
+#define vmadc_vvm_u16m4_b4(...) __riscv_vmadc_vvm_u16m4_b4(__VA_ARGS__)
+#define vmadc_vxm_u16m4_b4(...) __riscv_vmadc_vxm_u16m4_b4(__VA_ARGS__)
+#define vmadc_vv_u16m4_b4(...) __riscv_vmadc_vv_u16m4_b4(__VA_ARGS__)
+#define vmadc_vx_u16m4_b4(...) __riscv_vmadc_vx_u16m4_b4(__VA_ARGS__)
+#define vmadc_vvm_u16m8_b2(...) __riscv_vmadc_vvm_u16m8_b2(__VA_ARGS__)
+#define vmadc_vxm_u16m8_b2(...) __riscv_vmadc_vxm_u16m8_b2(__VA_ARGS__)
+#define vmadc_vv_u16m8_b2(...) __riscv_vmadc_vv_u16m8_b2(__VA_ARGS__)
+#define vmadc_vx_u16m8_b2(...) __riscv_vmadc_vx_u16m8_b2(__VA_ARGS__)
+#define vmadc_vvm_u32mf2_b64(...) __riscv_vmadc_vvm_u32mf2_b64(__VA_ARGS__)
+#define vmadc_vxm_u32mf2_b64(...) __riscv_vmadc_vxm_u32mf2_b64(__VA_ARGS__)
+#define vmadc_vv_u32mf2_b64(...) __riscv_vmadc_vv_u32mf2_b64(__VA_ARGS__)
+#define vmadc_vx_u32mf2_b64(...) __riscv_vmadc_vx_u32mf2_b64(__VA_ARGS__)
+#define vmadc_vvm_u32m1_b32(...) __riscv_vmadc_vvm_u32m1_b32(__VA_ARGS__)
+#define vmadc_vxm_u32m1_b32(...) __riscv_vmadc_vxm_u32m1_b32(__VA_ARGS__)
+#define vmadc_vv_u32m1_b32(...) __riscv_vmadc_vv_u32m1_b32(__VA_ARGS__)
+#define vmadc_vx_u32m1_b32(...) __riscv_vmadc_vx_u32m1_b32(__VA_ARGS__)
+#define vmadc_vvm_u32m2_b16(...) __riscv_vmadc_vvm_u32m2_b16(__VA_ARGS__)
+#define vmadc_vxm_u32m2_b16(...) __riscv_vmadc_vxm_u32m2_b16(__VA_ARGS__)
+#define vmadc_vv_u32m2_b16(...) __riscv_vmadc_vv_u32m2_b16(__VA_ARGS__)
+#define vmadc_vx_u32m2_b16(...) __riscv_vmadc_vx_u32m2_b16(__VA_ARGS__)
+#define vmadc_vvm_u32m4_b8(...) __riscv_vmadc_vvm_u32m4_b8(__VA_ARGS__)
+#define vmadc_vxm_u32m4_b8(...) __riscv_vmadc_vxm_u32m4_b8(__VA_ARGS__)
+#define vmadc_vv_u32m4_b8(...) __riscv_vmadc_vv_u32m4_b8(__VA_ARGS__)
+#define vmadc_vx_u32m4_b8(...) __riscv_vmadc_vx_u32m4_b8(__VA_ARGS__)
+#define vmadc_vvm_u32m8_b4(...) __riscv_vmadc_vvm_u32m8_b4(__VA_ARGS__)
+#define vmadc_vxm_u32m8_b4(...) __riscv_vmadc_vxm_u32m8_b4(__VA_ARGS__)
+#define vmadc_vv_u32m8_b4(...) __riscv_vmadc_vv_u32m8_b4(__VA_ARGS__)
+#define vmadc_vx_u32m8_b4(...) __riscv_vmadc_vx_u32m8_b4(__VA_ARGS__)
+#define vmadc_vvm_u64m1_b64(...) __riscv_vmadc_vvm_u64m1_b64(__VA_ARGS__)
+#define vmadc_vxm_u64m1_b64(...) __riscv_vmadc_vxm_u64m1_b64(__VA_ARGS__)
+#define vmadc_vv_u64m1_b64(...) __riscv_vmadc_vv_u64m1_b64(__VA_ARGS__)
+#define vmadc_vx_u64m1_b64(...) __riscv_vmadc_vx_u64m1_b64(__VA_ARGS__)
+#define vmadc_vvm_u64m2_b32(...) __riscv_vmadc_vvm_u64m2_b32(__VA_ARGS__)
+#define vmadc_vxm_u64m2_b32(...) __riscv_vmadc_vxm_u64m2_b32(__VA_ARGS__)
+#define vmadc_vv_u64m2_b32(...) __riscv_vmadc_vv_u64m2_b32(__VA_ARGS__)
+#define vmadc_vx_u64m2_b32(...) __riscv_vmadc_vx_u64m2_b32(__VA_ARGS__)
+#define vmadc_vvm_u64m4_b16(...) __riscv_vmadc_vvm_u64m4_b16(__VA_ARGS__)
+#define vmadc_vxm_u64m4_b16(...) __riscv_vmadc_vxm_u64m4_b16(__VA_ARGS__)
+#define vmadc_vv_u64m4_b16(...) __riscv_vmadc_vv_u64m4_b16(__VA_ARGS__)
+#define vmadc_vx_u64m4_b16(...) __riscv_vmadc_vx_u64m4_b16(__VA_ARGS__)
+#define vmadc_vvm_u64m8_b8(...) __riscv_vmadc_vvm_u64m8_b8(__VA_ARGS__)
+#define vmadc_vxm_u64m8_b8(...) __riscv_vmadc_vxm_u64m8_b8(__VA_ARGS__)
+#define vmadc_vv_u64m8_b8(...) __riscv_vmadc_vv_u64m8_b8(__VA_ARGS__)
+#define vmadc_vx_u64m8_b8(...) __riscv_vmadc_vx_u64m8_b8(__VA_ARGS__)
+#define vmsbc_vvm_u8mf8_b64(...) __riscv_vmsbc_vvm_u8mf8_b64(__VA_ARGS__)  // vmsbc_* aliases, unsigned (u8..u64) type suffixes: each unprefixed name expands to the same name with a __riscv_ prefix, forwarding every argument unchanged.
+#define vmsbc_vxm_u8mf8_b64(...) __riscv_vmsbc_vxm_u8mf8_b64(__VA_ARGS__)  // Four name variants per type suffix are aliased: _vvm, _vxm, _vv, _vx.
+#define vmsbc_vv_u8mf8_b64(...) __riscv_vmsbc_vv_u8mf8_b64(__VA_ARGS__)  // NOTE(review): the trailing _bN presumably names the mask type of the result - confirm against the RVV intrinsics spec.
+#define vmsbc_vx_u8mf8_b64(...) __riscv_vmsbc_vx_u8mf8_b64(__VA_ARGS__)
+#define vmsbc_vvm_u8mf4_b32(...) __riscv_vmsbc_vvm_u8mf4_b32(__VA_ARGS__)
+#define vmsbc_vxm_u8mf4_b32(...) __riscv_vmsbc_vxm_u8mf4_b32(__VA_ARGS__)
+#define vmsbc_vv_u8mf4_b32(...) __riscv_vmsbc_vv_u8mf4_b32(__VA_ARGS__)
+#define vmsbc_vx_u8mf4_b32(...) __riscv_vmsbc_vx_u8mf4_b32(__VA_ARGS__)
+#define vmsbc_vvm_u8mf2_b16(...) __riscv_vmsbc_vvm_u8mf2_b16(__VA_ARGS__)
+#define vmsbc_vxm_u8mf2_b16(...) __riscv_vmsbc_vxm_u8mf2_b16(__VA_ARGS__)
+#define vmsbc_vv_u8mf2_b16(...) __riscv_vmsbc_vv_u8mf2_b16(__VA_ARGS__)
+#define vmsbc_vx_u8mf2_b16(...) __riscv_vmsbc_vx_u8mf2_b16(__VA_ARGS__)
+#define vmsbc_vvm_u8m1_b8(...) __riscv_vmsbc_vvm_u8m1_b8(__VA_ARGS__)
+#define vmsbc_vxm_u8m1_b8(...) __riscv_vmsbc_vxm_u8m1_b8(__VA_ARGS__)
+#define vmsbc_vv_u8m1_b8(...) __riscv_vmsbc_vv_u8m1_b8(__VA_ARGS__)
+#define vmsbc_vx_u8m1_b8(...) __riscv_vmsbc_vx_u8m1_b8(__VA_ARGS__)
+#define vmsbc_vvm_u8m2_b4(...) __riscv_vmsbc_vvm_u8m2_b4(__VA_ARGS__)
+#define vmsbc_vxm_u8m2_b4(...) __riscv_vmsbc_vxm_u8m2_b4(__VA_ARGS__)
+#define vmsbc_vv_u8m2_b4(...) __riscv_vmsbc_vv_u8m2_b4(__VA_ARGS__)
+#define vmsbc_vx_u8m2_b4(...) __riscv_vmsbc_vx_u8m2_b4(__VA_ARGS__)
+#define vmsbc_vvm_u8m4_b2(...) __riscv_vmsbc_vvm_u8m4_b2(__VA_ARGS__)
+#define vmsbc_vxm_u8m4_b2(...) __riscv_vmsbc_vxm_u8m4_b2(__VA_ARGS__)
+#define vmsbc_vv_u8m4_b2(...) __riscv_vmsbc_vv_u8m4_b2(__VA_ARGS__)
+#define vmsbc_vx_u8m4_b2(...) __riscv_vmsbc_vx_u8m4_b2(__VA_ARGS__)
+#define vmsbc_vvm_u8m8_b1(...) __riscv_vmsbc_vvm_u8m8_b1(__VA_ARGS__)
+#define vmsbc_vxm_u8m8_b1(...) __riscv_vmsbc_vxm_u8m8_b1(__VA_ARGS__)
+#define vmsbc_vv_u8m8_b1(...) __riscv_vmsbc_vv_u8m8_b1(__VA_ARGS__)
+#define vmsbc_vx_u8m8_b1(...) __riscv_vmsbc_vx_u8m8_b1(__VA_ARGS__)
+#define vmsbc_vvm_u16mf4_b64(...) __riscv_vmsbc_vvm_u16mf4_b64(__VA_ARGS__)
+#define vmsbc_vxm_u16mf4_b64(...) __riscv_vmsbc_vxm_u16mf4_b64(__VA_ARGS__)
+#define vmsbc_vv_u16mf4_b64(...) __riscv_vmsbc_vv_u16mf4_b64(__VA_ARGS__)
+#define vmsbc_vx_u16mf4_b64(...) __riscv_vmsbc_vx_u16mf4_b64(__VA_ARGS__)
+#define vmsbc_vvm_u16mf2_b32(...) __riscv_vmsbc_vvm_u16mf2_b32(__VA_ARGS__)
+#define vmsbc_vxm_u16mf2_b32(...) __riscv_vmsbc_vxm_u16mf2_b32(__VA_ARGS__)
+#define vmsbc_vv_u16mf2_b32(...) __riscv_vmsbc_vv_u16mf2_b32(__VA_ARGS__)
+#define vmsbc_vx_u16mf2_b32(...) __riscv_vmsbc_vx_u16mf2_b32(__VA_ARGS__)
+#define vmsbc_vvm_u16m1_b16(...) __riscv_vmsbc_vvm_u16m1_b16(__VA_ARGS__)
+#define vmsbc_vxm_u16m1_b16(...) __riscv_vmsbc_vxm_u16m1_b16(__VA_ARGS__)
+#define vmsbc_vv_u16m1_b16(...) __riscv_vmsbc_vv_u16m1_b16(__VA_ARGS__)
+#define vmsbc_vx_u16m1_b16(...) __riscv_vmsbc_vx_u16m1_b16(__VA_ARGS__)
+#define vmsbc_vvm_u16m2_b8(...) __riscv_vmsbc_vvm_u16m2_b8(__VA_ARGS__)
+#define vmsbc_vxm_u16m2_b8(...) __riscv_vmsbc_vxm_u16m2_b8(__VA_ARGS__)
+#define vmsbc_vv_u16m2_b8(...) __riscv_vmsbc_vv_u16m2_b8(__VA_ARGS__)
+#define vmsbc_vx_u16m2_b8(...) __riscv_vmsbc_vx_u16m2_b8(__VA_ARGS__)
+#define vmsbc_vvm_u16m4_b4(...) __riscv_vmsbc_vvm_u16m4_b4(__VA_ARGS__)
+#define vmsbc_vxm_u16m4_b4(...) __riscv_vmsbc_vxm_u16m4_b4(__VA_ARGS__)
+#define vmsbc_vv_u16m4_b4(...) __riscv_vmsbc_vv_u16m4_b4(__VA_ARGS__)
+#define vmsbc_vx_u16m4_b4(...) __riscv_vmsbc_vx_u16m4_b4(__VA_ARGS__)
+#define vmsbc_vvm_u16m8_b2(...) __riscv_vmsbc_vvm_u16m8_b2(__VA_ARGS__)
+#define vmsbc_vxm_u16m8_b2(...) __riscv_vmsbc_vxm_u16m8_b2(__VA_ARGS__)
+#define vmsbc_vv_u16m8_b2(...) __riscv_vmsbc_vv_u16m8_b2(__VA_ARGS__)
+#define vmsbc_vx_u16m8_b2(...) __riscv_vmsbc_vx_u16m8_b2(__VA_ARGS__)
+#define vmsbc_vvm_u32mf2_b64(...) __riscv_vmsbc_vvm_u32mf2_b64(__VA_ARGS__)
+#define vmsbc_vxm_u32mf2_b64(...) __riscv_vmsbc_vxm_u32mf2_b64(__VA_ARGS__)
+#define vmsbc_vv_u32mf2_b64(...) __riscv_vmsbc_vv_u32mf2_b64(__VA_ARGS__)
+#define vmsbc_vx_u32mf2_b64(...) __riscv_vmsbc_vx_u32mf2_b64(__VA_ARGS__)
+#define vmsbc_vvm_u32m1_b32(...) __riscv_vmsbc_vvm_u32m1_b32(__VA_ARGS__)
+#define vmsbc_vxm_u32m1_b32(...) __riscv_vmsbc_vxm_u32m1_b32(__VA_ARGS__)
+#define vmsbc_vv_u32m1_b32(...) __riscv_vmsbc_vv_u32m1_b32(__VA_ARGS__)
+#define vmsbc_vx_u32m1_b32(...) __riscv_vmsbc_vx_u32m1_b32(__VA_ARGS__)
+#define vmsbc_vvm_u32m2_b16(...) __riscv_vmsbc_vvm_u32m2_b16(__VA_ARGS__)
+#define vmsbc_vxm_u32m2_b16(...) __riscv_vmsbc_vxm_u32m2_b16(__VA_ARGS__)
+#define vmsbc_vv_u32m2_b16(...) __riscv_vmsbc_vv_u32m2_b16(__VA_ARGS__)
+#define vmsbc_vx_u32m2_b16(...) __riscv_vmsbc_vx_u32m2_b16(__VA_ARGS__)
+#define vmsbc_vvm_u32m4_b8(...) __riscv_vmsbc_vvm_u32m4_b8(__VA_ARGS__)
+#define vmsbc_vxm_u32m4_b8(...) __riscv_vmsbc_vxm_u32m4_b8(__VA_ARGS__)
+#define vmsbc_vv_u32m4_b8(...) __riscv_vmsbc_vv_u32m4_b8(__VA_ARGS__)
+#define vmsbc_vx_u32m4_b8(...) __riscv_vmsbc_vx_u32m4_b8(__VA_ARGS__)
+#define vmsbc_vvm_u32m8_b4(...) __riscv_vmsbc_vvm_u32m8_b4(__VA_ARGS__)
+#define vmsbc_vxm_u32m8_b4(...) __riscv_vmsbc_vxm_u32m8_b4(__VA_ARGS__)
+#define vmsbc_vv_u32m8_b4(...) __riscv_vmsbc_vv_u32m8_b4(__VA_ARGS__)
+#define vmsbc_vx_u32m8_b4(...) __riscv_vmsbc_vx_u32m8_b4(__VA_ARGS__)
+#define vmsbc_vvm_u64m1_b64(...) __riscv_vmsbc_vvm_u64m1_b64(__VA_ARGS__)
+#define vmsbc_vxm_u64m1_b64(...) __riscv_vmsbc_vxm_u64m1_b64(__VA_ARGS__)
+#define vmsbc_vv_u64m1_b64(...) __riscv_vmsbc_vv_u64m1_b64(__VA_ARGS__)
+#define vmsbc_vx_u64m1_b64(...) __riscv_vmsbc_vx_u64m1_b64(__VA_ARGS__)
+#define vmsbc_vvm_u64m2_b32(...) __riscv_vmsbc_vvm_u64m2_b32(__VA_ARGS__)
+#define vmsbc_vxm_u64m2_b32(...) __riscv_vmsbc_vxm_u64m2_b32(__VA_ARGS__)
+#define vmsbc_vv_u64m2_b32(...) __riscv_vmsbc_vv_u64m2_b32(__VA_ARGS__)
+#define vmsbc_vx_u64m2_b32(...) __riscv_vmsbc_vx_u64m2_b32(__VA_ARGS__)
+#define vmsbc_vvm_u64m4_b16(...) __riscv_vmsbc_vvm_u64m4_b16(__VA_ARGS__)
+#define vmsbc_vxm_u64m4_b16(...) __riscv_vmsbc_vxm_u64m4_b16(__VA_ARGS__)
+#define vmsbc_vv_u64m4_b16(...) __riscv_vmsbc_vv_u64m4_b16(__VA_ARGS__)
+#define vmsbc_vx_u64m4_b16(...) __riscv_vmsbc_vx_u64m4_b16(__VA_ARGS__)
+#define vmsbc_vvm_u64m8_b8(...) __riscv_vmsbc_vvm_u64m8_b8(__VA_ARGS__)
+#define vmsbc_vxm_u64m8_b8(...) __riscv_vmsbc_vxm_u64m8_b8(__VA_ARGS__)
+#define vmsbc_vv_u64m8_b8(...) __riscv_vmsbc_vv_u64m8_b8(__VA_ARGS__)
+#define vmsbc_vx_u64m8_b8(...) __riscv_vmsbc_vx_u64m8_b8(__VA_ARGS__)
+#define vand_vv_i8mf8(...) __riscv_vand_vv_i8mf8(__VA_ARGS__)  // vand_* aliases, signed (i8..i64) type suffixes: each unprefixed name expands to the same name with a __riscv_ prefix, forwarding every argument unchanged.
+#define vand_vx_i8mf8(...) __riscv_vand_vx_i8mf8(__VA_ARGS__)  // Two name variants per type suffix are aliased: _vv and _vx.
+#define vand_vv_i8mf4(...) __riscv_vand_vv_i8mf4(__VA_ARGS__)
+#define vand_vx_i8mf4(...) __riscv_vand_vx_i8mf4(__VA_ARGS__)
+#define vand_vv_i8mf2(...) __riscv_vand_vv_i8mf2(__VA_ARGS__)
+#define vand_vx_i8mf2(...) __riscv_vand_vx_i8mf2(__VA_ARGS__)
+#define vand_vv_i8m1(...) __riscv_vand_vv_i8m1(__VA_ARGS__)
+#define vand_vx_i8m1(...) __riscv_vand_vx_i8m1(__VA_ARGS__)
+#define vand_vv_i8m2(...) __riscv_vand_vv_i8m2(__VA_ARGS__)
+#define vand_vx_i8m2(...) __riscv_vand_vx_i8m2(__VA_ARGS__)
+#define vand_vv_i8m4(...) __riscv_vand_vv_i8m4(__VA_ARGS__)
+#define vand_vx_i8m4(...) __riscv_vand_vx_i8m4(__VA_ARGS__)
+#define vand_vv_i8m8(...) __riscv_vand_vv_i8m8(__VA_ARGS__)
+#define vand_vx_i8m8(...) __riscv_vand_vx_i8m8(__VA_ARGS__)
+#define vand_vv_i16mf4(...) __riscv_vand_vv_i16mf4(__VA_ARGS__)
+#define vand_vx_i16mf4(...) __riscv_vand_vx_i16mf4(__VA_ARGS__)
+#define vand_vv_i16mf2(...) __riscv_vand_vv_i16mf2(__VA_ARGS__)
+#define vand_vx_i16mf2(...) __riscv_vand_vx_i16mf2(__VA_ARGS__)
+#define vand_vv_i16m1(...) __riscv_vand_vv_i16m1(__VA_ARGS__)
+#define vand_vx_i16m1(...) __riscv_vand_vx_i16m1(__VA_ARGS__)
+#define vand_vv_i16m2(...) __riscv_vand_vv_i16m2(__VA_ARGS__)
+#define vand_vx_i16m2(...) __riscv_vand_vx_i16m2(__VA_ARGS__)
+#define vand_vv_i16m4(...) __riscv_vand_vv_i16m4(__VA_ARGS__)
+#define vand_vx_i16m4(...) __riscv_vand_vx_i16m4(__VA_ARGS__)
+#define vand_vv_i16m8(...) __riscv_vand_vv_i16m8(__VA_ARGS__)
+#define vand_vx_i16m8(...) __riscv_vand_vx_i16m8(__VA_ARGS__)
+#define vand_vv_i32mf2(...) __riscv_vand_vv_i32mf2(__VA_ARGS__)
+#define vand_vx_i32mf2(...) __riscv_vand_vx_i32mf2(__VA_ARGS__)
+#define vand_vv_i32m1(...) __riscv_vand_vv_i32m1(__VA_ARGS__)
+#define vand_vx_i32m1(...) __riscv_vand_vx_i32m1(__VA_ARGS__)
+#define vand_vv_i32m2(...) __riscv_vand_vv_i32m2(__VA_ARGS__)
+#define vand_vx_i32m2(...) __riscv_vand_vx_i32m2(__VA_ARGS__)
+#define vand_vv_i32m4(...) __riscv_vand_vv_i32m4(__VA_ARGS__)
+#define vand_vx_i32m4(...) __riscv_vand_vx_i32m4(__VA_ARGS__)
+#define vand_vv_i32m8(...) __riscv_vand_vv_i32m8(__VA_ARGS__)
+#define vand_vx_i32m8(...) __riscv_vand_vx_i32m8(__VA_ARGS__)
+#define vand_vv_i64m1(...) __riscv_vand_vv_i64m1(__VA_ARGS__)
+#define vand_vx_i64m1(...) __riscv_vand_vx_i64m1(__VA_ARGS__)
+#define vand_vv_i64m2(...) __riscv_vand_vv_i64m2(__VA_ARGS__)
+#define vand_vx_i64m2(...) __riscv_vand_vx_i64m2(__VA_ARGS__)
+#define vand_vv_i64m4(...) __riscv_vand_vv_i64m4(__VA_ARGS__)
+#define vand_vx_i64m4(...) __riscv_vand_vx_i64m4(__VA_ARGS__)
+#define vand_vv_i64m8(...) __riscv_vand_vv_i64m8(__VA_ARGS__)
+#define vand_vx_i64m8(...) __riscv_vand_vx_i64m8(__VA_ARGS__)
+#define vor_vv_i8mf8(...) __riscv_vor_vv_i8mf8(__VA_ARGS__)  // vor_* aliases, signed (i8..i64) type suffixes: each unprefixed name expands to the same name with a __riscv_ prefix, forwarding every argument unchanged.
+#define vor_vx_i8mf8(...) __riscv_vor_vx_i8mf8(__VA_ARGS__)  // Two name variants per type suffix are aliased: _vv and _vx.
+#define vor_vv_i8mf4(...) __riscv_vor_vv_i8mf4(__VA_ARGS__)
+#define vor_vx_i8mf4(...) __riscv_vor_vx_i8mf4(__VA_ARGS__)
+#define vor_vv_i8mf2(...) __riscv_vor_vv_i8mf2(__VA_ARGS__)
+#define vor_vx_i8mf2(...) __riscv_vor_vx_i8mf2(__VA_ARGS__)
+#define vor_vv_i8m1(...) __riscv_vor_vv_i8m1(__VA_ARGS__)
+#define vor_vx_i8m1(...) __riscv_vor_vx_i8m1(__VA_ARGS__)
+#define vor_vv_i8m2(...) __riscv_vor_vv_i8m2(__VA_ARGS__)
+#define vor_vx_i8m2(...) __riscv_vor_vx_i8m2(__VA_ARGS__)
+#define vor_vv_i8m4(...) __riscv_vor_vv_i8m4(__VA_ARGS__)
+#define vor_vx_i8m4(...) __riscv_vor_vx_i8m4(__VA_ARGS__)
+#define vor_vv_i8m8(...) __riscv_vor_vv_i8m8(__VA_ARGS__)
+#define vor_vx_i8m8(...) __riscv_vor_vx_i8m8(__VA_ARGS__)
+#define vor_vv_i16mf4(...) __riscv_vor_vv_i16mf4(__VA_ARGS__)
+#define vor_vx_i16mf4(...) __riscv_vor_vx_i16mf4(__VA_ARGS__)
+#define vor_vv_i16mf2(...) __riscv_vor_vv_i16mf2(__VA_ARGS__)
+#define vor_vx_i16mf2(...) __riscv_vor_vx_i16mf2(__VA_ARGS__)
+#define vor_vv_i16m1(...) __riscv_vor_vv_i16m1(__VA_ARGS__)
+#define vor_vx_i16m1(...) __riscv_vor_vx_i16m1(__VA_ARGS__)
+#define vor_vv_i16m2(...) __riscv_vor_vv_i16m2(__VA_ARGS__)
+#define vor_vx_i16m2(...) __riscv_vor_vx_i16m2(__VA_ARGS__)
+#define vor_vv_i16m4(...) __riscv_vor_vv_i16m4(__VA_ARGS__)
+#define vor_vx_i16m4(...) __riscv_vor_vx_i16m4(__VA_ARGS__)
+#define vor_vv_i16m8(...) __riscv_vor_vv_i16m8(__VA_ARGS__)
+#define vor_vx_i16m8(...) __riscv_vor_vx_i16m8(__VA_ARGS__)
+#define vor_vv_i32mf2(...) __riscv_vor_vv_i32mf2(__VA_ARGS__)
+#define vor_vx_i32mf2(...) __riscv_vor_vx_i32mf2(__VA_ARGS__)
+#define vor_vv_i32m1(...) __riscv_vor_vv_i32m1(__VA_ARGS__)
+#define vor_vx_i32m1(...) __riscv_vor_vx_i32m1(__VA_ARGS__)
+#define vor_vv_i32m2(...) __riscv_vor_vv_i32m2(__VA_ARGS__)
+#define vor_vx_i32m2(...) __riscv_vor_vx_i32m2(__VA_ARGS__)
+#define vor_vv_i32m4(...) __riscv_vor_vv_i32m4(__VA_ARGS__)
+#define vor_vx_i32m4(...) __riscv_vor_vx_i32m4(__VA_ARGS__)
+#define vor_vv_i32m8(...) __riscv_vor_vv_i32m8(__VA_ARGS__)
+#define vor_vx_i32m8(...) __riscv_vor_vx_i32m8(__VA_ARGS__)
+#define vor_vv_i64m1(...) __riscv_vor_vv_i64m1(__VA_ARGS__)
+#define vor_vx_i64m1(...) __riscv_vor_vx_i64m1(__VA_ARGS__)
+#define vor_vv_i64m2(...) __riscv_vor_vv_i64m2(__VA_ARGS__)
+#define vor_vx_i64m2(...) __riscv_vor_vx_i64m2(__VA_ARGS__)
+#define vor_vv_i64m4(...) __riscv_vor_vv_i64m4(__VA_ARGS__)
+#define vor_vx_i64m4(...) __riscv_vor_vx_i64m4(__VA_ARGS__)
+#define vor_vv_i64m8(...) __riscv_vor_vv_i64m8(__VA_ARGS__)
+#define vor_vx_i64m8(...) __riscv_vor_vx_i64m8(__VA_ARGS__)
+#define vxor_vv_i8mf8(...) __riscv_vxor_vv_i8mf8(__VA_ARGS__)  // vxor_* aliases, signed (i8..i64) type suffixes: each unprefixed name expands to the same name with a __riscv_ prefix, forwarding every argument unchanged.
+#define vxor_vx_i8mf8(...) __riscv_vxor_vx_i8mf8(__VA_ARGS__)  // Two name variants per type suffix are aliased: _vv and _vx.
+#define vxor_vv_i8mf4(...) __riscv_vxor_vv_i8mf4(__VA_ARGS__)
+#define vxor_vx_i8mf4(...) __riscv_vxor_vx_i8mf4(__VA_ARGS__)
+#define vxor_vv_i8mf2(...) __riscv_vxor_vv_i8mf2(__VA_ARGS__)
+#define vxor_vx_i8mf2(...) __riscv_vxor_vx_i8mf2(__VA_ARGS__)
+#define vxor_vv_i8m1(...) __riscv_vxor_vv_i8m1(__VA_ARGS__)
+#define vxor_vx_i8m1(...) __riscv_vxor_vx_i8m1(__VA_ARGS__)
+#define vxor_vv_i8m2(...) __riscv_vxor_vv_i8m2(__VA_ARGS__)
+#define vxor_vx_i8m2(...) __riscv_vxor_vx_i8m2(__VA_ARGS__)
+#define vxor_vv_i8m4(...) __riscv_vxor_vv_i8m4(__VA_ARGS__)
+#define vxor_vx_i8m4(...) __riscv_vxor_vx_i8m4(__VA_ARGS__)
+#define vxor_vv_i8m8(...) __riscv_vxor_vv_i8m8(__VA_ARGS__)
+#define vxor_vx_i8m8(...) __riscv_vxor_vx_i8m8(__VA_ARGS__)
+#define vxor_vv_i16mf4(...) __riscv_vxor_vv_i16mf4(__VA_ARGS__)
+#define vxor_vx_i16mf4(...) __riscv_vxor_vx_i16mf4(__VA_ARGS__)
+#define vxor_vv_i16mf2(...) __riscv_vxor_vv_i16mf2(__VA_ARGS__)
+#define vxor_vx_i16mf2(...) __riscv_vxor_vx_i16mf2(__VA_ARGS__)
+#define vxor_vv_i16m1(...) __riscv_vxor_vv_i16m1(__VA_ARGS__)
+#define vxor_vx_i16m1(...) __riscv_vxor_vx_i16m1(__VA_ARGS__)
+#define vxor_vv_i16m2(...) __riscv_vxor_vv_i16m2(__VA_ARGS__)
+#define vxor_vx_i16m2(...) __riscv_vxor_vx_i16m2(__VA_ARGS__)
+#define vxor_vv_i16m4(...) __riscv_vxor_vv_i16m4(__VA_ARGS__)
+#define vxor_vx_i16m4(...) __riscv_vxor_vx_i16m4(__VA_ARGS__)
+#define vxor_vv_i16m8(...) __riscv_vxor_vv_i16m8(__VA_ARGS__)
+#define vxor_vx_i16m8(...) __riscv_vxor_vx_i16m8(__VA_ARGS__)
+#define vxor_vv_i32mf2(...) __riscv_vxor_vv_i32mf2(__VA_ARGS__)
+#define vxor_vx_i32mf2(...) __riscv_vxor_vx_i32mf2(__VA_ARGS__)
+#define vxor_vv_i32m1(...) __riscv_vxor_vv_i32m1(__VA_ARGS__)
+#define vxor_vx_i32m1(...) __riscv_vxor_vx_i32m1(__VA_ARGS__)
+#define vxor_vv_i32m2(...) __riscv_vxor_vv_i32m2(__VA_ARGS__)
+#define vxor_vx_i32m2(...) __riscv_vxor_vx_i32m2(__VA_ARGS__)
+#define vxor_vv_i32m4(...) __riscv_vxor_vv_i32m4(__VA_ARGS__)
+#define vxor_vx_i32m4(...) __riscv_vxor_vx_i32m4(__VA_ARGS__)
+#define vxor_vv_i32m8(...) __riscv_vxor_vv_i32m8(__VA_ARGS__)
+#define vxor_vx_i32m8(...) __riscv_vxor_vx_i32m8(__VA_ARGS__)
+#define vxor_vv_i64m1(...) __riscv_vxor_vv_i64m1(__VA_ARGS__)
+#define vxor_vx_i64m1(...) __riscv_vxor_vx_i64m1(__VA_ARGS__)
+#define vxor_vv_i64m2(...) __riscv_vxor_vv_i64m2(__VA_ARGS__)
+#define vxor_vx_i64m2(...) __riscv_vxor_vx_i64m2(__VA_ARGS__)
+#define vxor_vv_i64m4(...) __riscv_vxor_vv_i64m4(__VA_ARGS__)
+#define vxor_vx_i64m4(...) __riscv_vxor_vx_i64m4(__VA_ARGS__)
+#define vxor_vv_i64m8(...) __riscv_vxor_vv_i64m8(__VA_ARGS__)
+#define vxor_vx_i64m8(...) __riscv_vxor_vx_i64m8(__VA_ARGS__)
+#define vand_vv_u8mf8(...) __riscv_vand_vv_u8mf8(__VA_ARGS__)
+#define vand_vx_u8mf8(...) __riscv_vand_vx_u8mf8(__VA_ARGS__)
+#define vand_vv_u8mf4(...) __riscv_vand_vv_u8mf4(__VA_ARGS__)
+#define vand_vx_u8mf4(...) __riscv_vand_vx_u8mf4(__VA_ARGS__)
+#define vand_vv_u8mf2(...) __riscv_vand_vv_u8mf2(__VA_ARGS__)
+#define vand_vx_u8mf2(...) __riscv_vand_vx_u8mf2(__VA_ARGS__)
+#define vand_vv_u8m1(...) __riscv_vand_vv_u8m1(__VA_ARGS__)
+#define vand_vx_u8m1(...) __riscv_vand_vx_u8m1(__VA_ARGS__)
+#define vand_vv_u8m2(...) __riscv_vand_vv_u8m2(__VA_ARGS__)
+#define vand_vx_u8m2(...) __riscv_vand_vx_u8m2(__VA_ARGS__)
+#define vand_vv_u8m4(...) __riscv_vand_vv_u8m4(__VA_ARGS__)
+#define vand_vx_u8m4(...) __riscv_vand_vx_u8m4(__VA_ARGS__)
+#define vand_vv_u8m8(...) __riscv_vand_vv_u8m8(__VA_ARGS__)
+#define vand_vx_u8m8(...) __riscv_vand_vx_u8m8(__VA_ARGS__)
+#define vand_vv_u16mf4(...) __riscv_vand_vv_u16mf4(__VA_ARGS__)
+#define vand_vx_u16mf4(...) __riscv_vand_vx_u16mf4(__VA_ARGS__)
+#define vand_vv_u16mf2(...) __riscv_vand_vv_u16mf2(__VA_ARGS__)
+#define vand_vx_u16mf2(...) __riscv_vand_vx_u16mf2(__VA_ARGS__)
+#define vand_vv_u16m1(...) __riscv_vand_vv_u16m1(__VA_ARGS__)
+#define vand_vx_u16m1(...) __riscv_vand_vx_u16m1(__VA_ARGS__)
+#define vand_vv_u16m2(...) __riscv_vand_vv_u16m2(__VA_ARGS__)
+#define vand_vx_u16m2(...) __riscv_vand_vx_u16m2(__VA_ARGS__)
+#define vand_vv_u16m4(...) __riscv_vand_vv_u16m4(__VA_ARGS__)
+#define vand_vx_u16m4(...) __riscv_vand_vx_u16m4(__VA_ARGS__)
+#define vand_vv_u16m8(...) __riscv_vand_vv_u16m8(__VA_ARGS__)
+#define vand_vx_u16m8(...) __riscv_vand_vx_u16m8(__VA_ARGS__)
+#define vand_vv_u32mf2(...) __riscv_vand_vv_u32mf2(__VA_ARGS__)
+#define vand_vx_u32mf2(...) __riscv_vand_vx_u32mf2(__VA_ARGS__)
+#define vand_vv_u32m1(...) __riscv_vand_vv_u32m1(__VA_ARGS__)
+#define vand_vx_u32m1(...) __riscv_vand_vx_u32m1(__VA_ARGS__)
+#define vand_vv_u32m2(...) __riscv_vand_vv_u32m2(__VA_ARGS__)
+#define vand_vx_u32m2(...) __riscv_vand_vx_u32m2(__VA_ARGS__)
+#define vand_vv_u32m4(...) __riscv_vand_vv_u32m4(__VA_ARGS__)
+#define vand_vx_u32m4(...) __riscv_vand_vx_u32m4(__VA_ARGS__)
+#define vand_vv_u32m8(...) __riscv_vand_vv_u32m8(__VA_ARGS__)
+#define vand_vx_u32m8(...) __riscv_vand_vx_u32m8(__VA_ARGS__)
+#define vand_vv_u64m1(...) __riscv_vand_vv_u64m1(__VA_ARGS__)
+#define vand_vx_u64m1(...) __riscv_vand_vx_u64m1(__VA_ARGS__)
+#define vand_vv_u64m2(...) __riscv_vand_vv_u64m2(__VA_ARGS__)
+#define vand_vx_u64m2(...) __riscv_vand_vx_u64m2(__VA_ARGS__)
+#define vand_vv_u64m4(...) __riscv_vand_vv_u64m4(__VA_ARGS__)
+#define vand_vx_u64m4(...) __riscv_vand_vx_u64m4(__VA_ARGS__)
+#define vand_vv_u64m8(...) __riscv_vand_vv_u64m8(__VA_ARGS__)
+#define vand_vx_u64m8(...) __riscv_vand_vx_u64m8(__VA_ARGS__)
+#define vor_vv_u8mf8(...) __riscv_vor_vv_u8mf8(__VA_ARGS__)
+#define vor_vx_u8mf8(...) __riscv_vor_vx_u8mf8(__VA_ARGS__)
+#define vor_vv_u8mf4(...) __riscv_vor_vv_u8mf4(__VA_ARGS__)
+#define vor_vx_u8mf4(...) __riscv_vor_vx_u8mf4(__VA_ARGS__)
+#define vor_vv_u8mf2(...) __riscv_vor_vv_u8mf2(__VA_ARGS__)
+#define vor_vx_u8mf2(...) __riscv_vor_vx_u8mf2(__VA_ARGS__)
+#define vor_vv_u8m1(...) __riscv_vor_vv_u8m1(__VA_ARGS__)
+#define vor_vx_u8m1(...) __riscv_vor_vx_u8m1(__VA_ARGS__)
+#define vor_vv_u8m2(...) __riscv_vor_vv_u8m2(__VA_ARGS__)
+#define vor_vx_u8m2(...) __riscv_vor_vx_u8m2(__VA_ARGS__)
+#define vor_vv_u8m4(...) __riscv_vor_vv_u8m4(__VA_ARGS__)
+#define vor_vx_u8m4(...) __riscv_vor_vx_u8m4(__VA_ARGS__)
+#define vor_vv_u8m8(...) __riscv_vor_vv_u8m8(__VA_ARGS__)
+#define vor_vx_u8m8(...) __riscv_vor_vx_u8m8(__VA_ARGS__)
+#define vor_vv_u16mf4(...) __riscv_vor_vv_u16mf4(__VA_ARGS__)
+#define vor_vx_u16mf4(...) __riscv_vor_vx_u16mf4(__VA_ARGS__)
+#define vor_vv_u16mf2(...) __riscv_vor_vv_u16mf2(__VA_ARGS__)
+#define vor_vx_u16mf2(...) __riscv_vor_vx_u16mf2(__VA_ARGS__)
+#define vor_vv_u16m1(...) __riscv_vor_vv_u16m1(__VA_ARGS__)
+#define vor_vx_u16m1(...) __riscv_vor_vx_u16m1(__VA_ARGS__)
+#define vor_vv_u16m2(...) __riscv_vor_vv_u16m2(__VA_ARGS__)
+#define vor_vx_u16m2(...) __riscv_vor_vx_u16m2(__VA_ARGS__)
+#define vor_vv_u16m4(...) __riscv_vor_vv_u16m4(__VA_ARGS__)
+#define vor_vx_u16m4(...) __riscv_vor_vx_u16m4(__VA_ARGS__)
+#define vor_vv_u16m8(...) __riscv_vor_vv_u16m8(__VA_ARGS__)
+#define vor_vx_u16m8(...) __riscv_vor_vx_u16m8(__VA_ARGS__)
+#define vor_vv_u32mf2(...) __riscv_vor_vv_u32mf2(__VA_ARGS__)
+#define vor_vx_u32mf2(...) __riscv_vor_vx_u32mf2(__VA_ARGS__)
+#define vor_vv_u32m1(...) __riscv_vor_vv_u32m1(__VA_ARGS__)
+#define vor_vx_u32m1(...) __riscv_vor_vx_u32m1(__VA_ARGS__)
+#define vor_vv_u32m2(...) __riscv_vor_vv_u32m2(__VA_ARGS__)
+#define vor_vx_u32m2(...) __riscv_vor_vx_u32m2(__VA_ARGS__)
+#define vor_vv_u32m4(...) __riscv_vor_vv_u32m4(__VA_ARGS__)
+#define vor_vx_u32m4(...) __riscv_vor_vx_u32m4(__VA_ARGS__)
+#define vor_vv_u32m8(...) __riscv_vor_vv_u32m8(__VA_ARGS__)
+#define vor_vx_u32m8(...) __riscv_vor_vx_u32m8(__VA_ARGS__)
+#define vor_vv_u64m1(...) __riscv_vor_vv_u64m1(__VA_ARGS__)
+#define vor_vx_u64m1(...) __riscv_vor_vx_u64m1(__VA_ARGS__)
+#define vor_vv_u64m2(...) __riscv_vor_vv_u64m2(__VA_ARGS__)
+#define vor_vx_u64m2(...) __riscv_vor_vx_u64m2(__VA_ARGS__)
+#define vor_vv_u64m4(...) __riscv_vor_vv_u64m4(__VA_ARGS__)
+#define vor_vx_u64m4(...) __riscv_vor_vx_u64m4(__VA_ARGS__)
+#define vor_vv_u64m8(...) __riscv_vor_vv_u64m8(__VA_ARGS__)
+#define vor_vx_u64m8(...) __riscv_vor_vx_u64m8(__VA_ARGS__)
+#define vxor_vv_u8mf8(...) __riscv_vxor_vv_u8mf8(__VA_ARGS__)
+#define vxor_vx_u8mf8(...) __riscv_vxor_vx_u8mf8(__VA_ARGS__)
+#define vxor_vv_u8mf4(...) __riscv_vxor_vv_u8mf4(__VA_ARGS__)
+#define vxor_vx_u8mf4(...) __riscv_vxor_vx_u8mf4(__VA_ARGS__)
+#define vxor_vv_u8mf2(...) __riscv_vxor_vv_u8mf2(__VA_ARGS__)
+#define vxor_vx_u8mf2(...) __riscv_vxor_vx_u8mf2(__VA_ARGS__)
+#define vxor_vv_u8m1(...) __riscv_vxor_vv_u8m1(__VA_ARGS__)
+#define vxor_vx_u8m1(...) __riscv_vxor_vx_u8m1(__VA_ARGS__)
+#define vxor_vv_u8m2(...) __riscv_vxor_vv_u8m2(__VA_ARGS__)
+#define vxor_vx_u8m2(...) __riscv_vxor_vx_u8m2(__VA_ARGS__)
+#define vxor_vv_u8m4(...) __riscv_vxor_vv_u8m4(__VA_ARGS__)
+#define vxor_vx_u8m4(...) __riscv_vxor_vx_u8m4(__VA_ARGS__)
+#define vxor_vv_u8m8(...) __riscv_vxor_vv_u8m8(__VA_ARGS__)
+#define vxor_vx_u8m8(...) __riscv_vxor_vx_u8m8(__VA_ARGS__)
+#define vxor_vv_u16mf4(...) __riscv_vxor_vv_u16mf4(__VA_ARGS__)
+#define vxor_vx_u16mf4(...) __riscv_vxor_vx_u16mf4(__VA_ARGS__)
+#define vxor_vv_u16mf2(...) __riscv_vxor_vv_u16mf2(__VA_ARGS__)
+#define vxor_vx_u16mf2(...) __riscv_vxor_vx_u16mf2(__VA_ARGS__)
+#define vxor_vv_u16m1(...) __riscv_vxor_vv_u16m1(__VA_ARGS__)
+#define vxor_vx_u16m1(...) __riscv_vxor_vx_u16m1(__VA_ARGS__)
+#define vxor_vv_u16m2(...) __riscv_vxor_vv_u16m2(__VA_ARGS__)
+#define vxor_vx_u16m2(...) __riscv_vxor_vx_u16m2(__VA_ARGS__)
+#define vxor_vv_u16m4(...) __riscv_vxor_vv_u16m4(__VA_ARGS__)
+#define vxor_vx_u16m4(...) __riscv_vxor_vx_u16m4(__VA_ARGS__)
+#define vxor_vv_u16m8(...) __riscv_vxor_vv_u16m8(__VA_ARGS__)
+#define vxor_vx_u16m8(...) __riscv_vxor_vx_u16m8(__VA_ARGS__)
+#define vxor_vv_u32mf2(...) __riscv_vxor_vv_u32mf2(__VA_ARGS__)
+#define vxor_vx_u32mf2(...) __riscv_vxor_vx_u32mf2(__VA_ARGS__)
+#define vxor_vv_u32m1(...) __riscv_vxor_vv_u32m1(__VA_ARGS__)
+#define vxor_vx_u32m1(...) __riscv_vxor_vx_u32m1(__VA_ARGS__)
+#define vxor_vv_u32m2(...) __riscv_vxor_vv_u32m2(__VA_ARGS__)
+#define vxor_vx_u32m2(...) __riscv_vxor_vx_u32m2(__VA_ARGS__)
+#define vxor_vv_u32m4(...) __riscv_vxor_vv_u32m4(__VA_ARGS__)
+#define vxor_vx_u32m4(...) __riscv_vxor_vx_u32m4(__VA_ARGS__)
+#define vxor_vv_u32m8(...) __riscv_vxor_vv_u32m8(__VA_ARGS__)
+#define vxor_vx_u32m8(...) __riscv_vxor_vx_u32m8(__VA_ARGS__)
+#define vxor_vv_u64m1(...) __riscv_vxor_vv_u64m1(__VA_ARGS__)
+#define vxor_vx_u64m1(...) __riscv_vxor_vx_u64m1(__VA_ARGS__)
+#define vxor_vv_u64m2(...) __riscv_vxor_vv_u64m2(__VA_ARGS__)
+#define vxor_vx_u64m2(...) __riscv_vxor_vx_u64m2(__VA_ARGS__)
+#define vxor_vv_u64m4(...) __riscv_vxor_vv_u64m4(__VA_ARGS__)
+#define vxor_vx_u64m4(...) __riscv_vxor_vx_u64m4(__VA_ARGS__)
+#define vxor_vv_u64m8(...) __riscv_vxor_vv_u64m8(__VA_ARGS__)
+#define vxor_vx_u64m8(...) __riscv_vxor_vx_u64m8(__VA_ARGS__)
+// Masked variants: each *_m name forwards its arguments to the matching __riscv_*_tumu intrinsic ("tumu" is the RVV policy suffix for tail-undisturbed, mask-undisturbed).
+#define vand_vv_i8mf8_m(...) __riscv_vand_vv_i8mf8_tumu(__VA_ARGS__)
+#define vand_vx_i8mf8_m(...) __riscv_vand_vx_i8mf8_tumu(__VA_ARGS__)
+#define vand_vv_i8mf4_m(...) __riscv_vand_vv_i8mf4_tumu(__VA_ARGS__)
+#define vand_vx_i8mf4_m(...) __riscv_vand_vx_i8mf4_tumu(__VA_ARGS__)
+#define vand_vv_i8mf2_m(...) __riscv_vand_vv_i8mf2_tumu(__VA_ARGS__)
+#define vand_vx_i8mf2_m(...) __riscv_vand_vx_i8mf2_tumu(__VA_ARGS__)
+#define vand_vv_i8m1_m(...) __riscv_vand_vv_i8m1_tumu(__VA_ARGS__)
+#define vand_vx_i8m1_m(...) __riscv_vand_vx_i8m1_tumu(__VA_ARGS__)
+#define vand_vv_i8m2_m(...) __riscv_vand_vv_i8m2_tumu(__VA_ARGS__)
+#define vand_vx_i8m2_m(...) __riscv_vand_vx_i8m2_tumu(__VA_ARGS__)
+#define vand_vv_i8m4_m(...) __riscv_vand_vv_i8m4_tumu(__VA_ARGS__)
+#define vand_vx_i8m4_m(...) __riscv_vand_vx_i8m4_tumu(__VA_ARGS__)
+#define vand_vv_i8m8_m(...) __riscv_vand_vv_i8m8_tumu(__VA_ARGS__)
+#define vand_vx_i8m8_m(...) __riscv_vand_vx_i8m8_tumu(__VA_ARGS__)
+#define vand_vv_i16mf4_m(...) __riscv_vand_vv_i16mf4_tumu(__VA_ARGS__)
+#define vand_vx_i16mf4_m(...) __riscv_vand_vx_i16mf4_tumu(__VA_ARGS__)
+#define vand_vv_i16mf2_m(...) __riscv_vand_vv_i16mf2_tumu(__VA_ARGS__)
+#define vand_vx_i16mf2_m(...) __riscv_vand_vx_i16mf2_tumu(__VA_ARGS__)
+#define vand_vv_i16m1_m(...) __riscv_vand_vv_i16m1_tumu(__VA_ARGS__)
+#define vand_vx_i16m1_m(...) __riscv_vand_vx_i16m1_tumu(__VA_ARGS__)
+#define vand_vv_i16m2_m(...) __riscv_vand_vv_i16m2_tumu(__VA_ARGS__)
+#define vand_vx_i16m2_m(...) __riscv_vand_vx_i16m2_tumu(__VA_ARGS__)
+#define vand_vv_i16m4_m(...) __riscv_vand_vv_i16m4_tumu(__VA_ARGS__)
+#define vand_vx_i16m4_m(...) __riscv_vand_vx_i16m4_tumu(__VA_ARGS__)
+#define vand_vv_i16m8_m(...) __riscv_vand_vv_i16m8_tumu(__VA_ARGS__)
+#define vand_vx_i16m8_m(...) __riscv_vand_vx_i16m8_tumu(__VA_ARGS__)
+#define vand_vv_i32mf2_m(...) __riscv_vand_vv_i32mf2_tumu(__VA_ARGS__)
+#define vand_vx_i32mf2_m(...) __riscv_vand_vx_i32mf2_tumu(__VA_ARGS__)
+#define vand_vv_i32m1_m(...) __riscv_vand_vv_i32m1_tumu(__VA_ARGS__)
+#define vand_vx_i32m1_m(...) __riscv_vand_vx_i32m1_tumu(__VA_ARGS__)
+#define vand_vv_i32m2_m(...) __riscv_vand_vv_i32m2_tumu(__VA_ARGS__)
+#define vand_vx_i32m2_m(...) __riscv_vand_vx_i32m2_tumu(__VA_ARGS__)
+#define vand_vv_i32m4_m(...) __riscv_vand_vv_i32m4_tumu(__VA_ARGS__)
+#define vand_vx_i32m4_m(...) __riscv_vand_vx_i32m4_tumu(__VA_ARGS__)
+#define vand_vv_i32m8_m(...) __riscv_vand_vv_i32m8_tumu(__VA_ARGS__)
+#define vand_vx_i32m8_m(...) __riscv_vand_vx_i32m8_tumu(__VA_ARGS__)
+#define vand_vv_i64m1_m(...) __riscv_vand_vv_i64m1_tumu(__VA_ARGS__)
+#define vand_vx_i64m1_m(...) __riscv_vand_vx_i64m1_tumu(__VA_ARGS__)
+#define vand_vv_i64m2_m(...) __riscv_vand_vv_i64m2_tumu(__VA_ARGS__)
+#define vand_vx_i64m2_m(...) __riscv_vand_vx_i64m2_tumu(__VA_ARGS__)
+#define vand_vv_i64m4_m(...) __riscv_vand_vv_i64m4_tumu(__VA_ARGS__)
+#define vand_vx_i64m4_m(...) __riscv_vand_vx_i64m4_tumu(__VA_ARGS__)
+#define vand_vv_i64m8_m(...) __riscv_vand_vv_i64m8_tumu(__VA_ARGS__)
+#define vand_vx_i64m8_m(...) __riscv_vand_vx_i64m8_tumu(__VA_ARGS__)
+#define vor_vv_i8mf8_m(...) __riscv_vor_vv_i8mf8_tumu(__VA_ARGS__)
+#define vor_vx_i8mf8_m(...) __riscv_vor_vx_i8mf8_tumu(__VA_ARGS__)
+#define vor_vv_i8mf4_m(...) __riscv_vor_vv_i8mf4_tumu(__VA_ARGS__)
+#define vor_vx_i8mf4_m(...) __riscv_vor_vx_i8mf4_tumu(__VA_ARGS__)
+#define vor_vv_i8mf2_m(...) __riscv_vor_vv_i8mf2_tumu(__VA_ARGS__)
+#define vor_vx_i8mf2_m(...) __riscv_vor_vx_i8mf2_tumu(__VA_ARGS__)
+#define vor_vv_i8m1_m(...) __riscv_vor_vv_i8m1_tumu(__VA_ARGS__)
+#define vor_vx_i8m1_m(...) __riscv_vor_vx_i8m1_tumu(__VA_ARGS__)
+#define vor_vv_i8m2_m(...) __riscv_vor_vv_i8m2_tumu(__VA_ARGS__)
+#define vor_vx_i8m2_m(...) __riscv_vor_vx_i8m2_tumu(__VA_ARGS__)
+#define vor_vv_i8m4_m(...) __riscv_vor_vv_i8m4_tumu(__VA_ARGS__)
+#define vor_vx_i8m4_m(...) __riscv_vor_vx_i8m4_tumu(__VA_ARGS__)
+#define vor_vv_i8m8_m(...) __riscv_vor_vv_i8m8_tumu(__VA_ARGS__)
+#define vor_vx_i8m8_m(...) __riscv_vor_vx_i8m8_tumu(__VA_ARGS__)
+#define vor_vv_i16mf4_m(...) __riscv_vor_vv_i16mf4_tumu(__VA_ARGS__)
+#define vor_vx_i16mf4_m(...) __riscv_vor_vx_i16mf4_tumu(__VA_ARGS__)
+#define vor_vv_i16mf2_m(...) __riscv_vor_vv_i16mf2_tumu(__VA_ARGS__)
+#define vor_vx_i16mf2_m(...) __riscv_vor_vx_i16mf2_tumu(__VA_ARGS__)
+#define vor_vv_i16m1_m(...) __riscv_vor_vv_i16m1_tumu(__VA_ARGS__)
+#define vor_vx_i16m1_m(...) __riscv_vor_vx_i16m1_tumu(__VA_ARGS__)
+#define vor_vv_i16m2_m(...) __riscv_vor_vv_i16m2_tumu(__VA_ARGS__)
+#define vor_vx_i16m2_m(...) __riscv_vor_vx_i16m2_tumu(__VA_ARGS__)
+#define vor_vv_i16m4_m(...) __riscv_vor_vv_i16m4_tumu(__VA_ARGS__)
+#define vor_vx_i16m4_m(...) __riscv_vor_vx_i16m4_tumu(__VA_ARGS__)
+#define vor_vv_i16m8_m(...) __riscv_vor_vv_i16m8_tumu(__VA_ARGS__)
+#define vor_vx_i16m8_m(...) __riscv_vor_vx_i16m8_tumu(__VA_ARGS__)
+#define vor_vv_i32mf2_m(...) __riscv_vor_vv_i32mf2_tumu(__VA_ARGS__)
+#define vor_vx_i32mf2_m(...) __riscv_vor_vx_i32mf2_tumu(__VA_ARGS__)
+#define vor_vv_i32m1_m(...) __riscv_vor_vv_i32m1_tumu(__VA_ARGS__)
+#define vor_vx_i32m1_m(...) __riscv_vor_vx_i32m1_tumu(__VA_ARGS__)
+#define vor_vv_i32m2_m(...) __riscv_vor_vv_i32m2_tumu(__VA_ARGS__)
+#define vor_vx_i32m2_m(...) __riscv_vor_vx_i32m2_tumu(__VA_ARGS__)
+#define vor_vv_i32m4_m(...) __riscv_vor_vv_i32m4_tumu(__VA_ARGS__)
+#define vor_vx_i32m4_m(...) __riscv_vor_vx_i32m4_tumu(__VA_ARGS__)
+#define vor_vv_i32m8_m(...) __riscv_vor_vv_i32m8_tumu(__VA_ARGS__)
+#define vor_vx_i32m8_m(...) __riscv_vor_vx_i32m8_tumu(__VA_ARGS__)
+#define vor_vv_i64m1_m(...) __riscv_vor_vv_i64m1_tumu(__VA_ARGS__)
+#define vor_vx_i64m1_m(...) __riscv_vor_vx_i64m1_tumu(__VA_ARGS__)
+#define vor_vv_i64m2_m(...) __riscv_vor_vv_i64m2_tumu(__VA_ARGS__)
+#define vor_vx_i64m2_m(...) __riscv_vor_vx_i64m2_tumu(__VA_ARGS__)
+#define vor_vv_i64m4_m(...) __riscv_vor_vv_i64m4_tumu(__VA_ARGS__)
+#define vor_vx_i64m4_m(...) __riscv_vor_vx_i64m4_tumu(__VA_ARGS__)
+#define vor_vv_i64m8_m(...) __riscv_vor_vv_i64m8_tumu(__VA_ARGS__)
+#define vor_vx_i64m8_m(...) __riscv_vor_vx_i64m8_tumu(__VA_ARGS__)
+#define vxor_vv_i8mf8_m(...) __riscv_vxor_vv_i8mf8_tumu(__VA_ARGS__)
+#define vxor_vx_i8mf8_m(...) __riscv_vxor_vx_i8mf8_tumu(__VA_ARGS__)
+#define vxor_vv_i8mf4_m(...) __riscv_vxor_vv_i8mf4_tumu(__VA_ARGS__)
+#define vxor_vx_i8mf4_m(...) __riscv_vxor_vx_i8mf4_tumu(__VA_ARGS__)
+#define vxor_vv_i8mf2_m(...) __riscv_vxor_vv_i8mf2_tumu(__VA_ARGS__)
+#define vxor_vx_i8mf2_m(...) __riscv_vxor_vx_i8mf2_tumu(__VA_ARGS__)
+#define vxor_vv_i8m1_m(...) __riscv_vxor_vv_i8m1_tumu(__VA_ARGS__)
+#define vxor_vx_i8m1_m(...) __riscv_vxor_vx_i8m1_tumu(__VA_ARGS__)
+#define vxor_vv_i8m2_m(...) __riscv_vxor_vv_i8m2_tumu(__VA_ARGS__)
+#define vxor_vx_i8m2_m(...) __riscv_vxor_vx_i8m2_tumu(__VA_ARGS__)
+#define vxor_vv_i8m4_m(...) __riscv_vxor_vv_i8m4_tumu(__VA_ARGS__)
+#define vxor_vx_i8m4_m(...) __riscv_vxor_vx_i8m4_tumu(__VA_ARGS__)
+#define vxor_vv_i8m8_m(...) __riscv_vxor_vv_i8m8_tumu(__VA_ARGS__)
+#define vxor_vx_i8m8_m(...) __riscv_vxor_vx_i8m8_tumu(__VA_ARGS__)
+#define vxor_vv_i16mf4_m(...) __riscv_vxor_vv_i16mf4_tumu(__VA_ARGS__)
+#define vxor_vx_i16mf4_m(...) __riscv_vxor_vx_i16mf4_tumu(__VA_ARGS__)
+#define vxor_vv_i16mf2_m(...) __riscv_vxor_vv_i16mf2_tumu(__VA_ARGS__)
+#define vxor_vx_i16mf2_m(...) __riscv_vxor_vx_i16mf2_tumu(__VA_ARGS__)
+#define vxor_vv_i16m1_m(...) __riscv_vxor_vv_i16m1_tumu(__VA_ARGS__)
+#define vxor_vx_i16m1_m(...) __riscv_vxor_vx_i16m1_tumu(__VA_ARGS__)
+#define vxor_vv_i16m2_m(...) __riscv_vxor_vv_i16m2_tumu(__VA_ARGS__)
+#define vxor_vx_i16m2_m(...) __riscv_vxor_vx_i16m2_tumu(__VA_ARGS__)
+#define vxor_vv_i16m4_m(...) __riscv_vxor_vv_i16m4_tumu(__VA_ARGS__)
+#define vxor_vx_i16m4_m(...) __riscv_vxor_vx_i16m4_tumu(__VA_ARGS__)
+#define vxor_vv_i16m8_m(...) __riscv_vxor_vv_i16m8_tumu(__VA_ARGS__)
+#define vxor_vx_i16m8_m(...) __riscv_vxor_vx_i16m8_tumu(__VA_ARGS__)
+#define vxor_vv_i32mf2_m(...) __riscv_vxor_vv_i32mf2_tumu(__VA_ARGS__)
+#define vxor_vx_i32mf2_m(...) __riscv_vxor_vx_i32mf2_tumu(__VA_ARGS__)
+#define vxor_vv_i32m1_m(...) __riscv_vxor_vv_i32m1_tumu(__VA_ARGS__)
+#define vxor_vx_i32m1_m(...) __riscv_vxor_vx_i32m1_tumu(__VA_ARGS__)
+#define vxor_vv_i32m2_m(...) __riscv_vxor_vv_i32m2_tumu(__VA_ARGS__)
+#define vxor_vx_i32m2_m(...) __riscv_vxor_vx_i32m2_tumu(__VA_ARGS__)
+#define vxor_vv_i32m4_m(...) __riscv_vxor_vv_i32m4_tumu(__VA_ARGS__)
+#define vxor_vx_i32m4_m(...) __riscv_vxor_vx_i32m4_tumu(__VA_ARGS__)
+#define vxor_vv_i32m8_m(...) __riscv_vxor_vv_i32m8_tumu(__VA_ARGS__)
+#define vxor_vx_i32m8_m(...) __riscv_vxor_vx_i32m8_tumu(__VA_ARGS__)
+#define vxor_vv_i64m1_m(...) __riscv_vxor_vv_i64m1_tumu(__VA_ARGS__)
+#define vxor_vx_i64m1_m(...) __riscv_vxor_vx_i64m1_tumu(__VA_ARGS__)
+#define vxor_vv_i64m2_m(...) __riscv_vxor_vv_i64m2_tumu(__VA_ARGS__)
+#define vxor_vx_i64m2_m(...) __riscv_vxor_vx_i64m2_tumu(__VA_ARGS__)
+#define vxor_vv_i64m4_m(...) __riscv_vxor_vv_i64m4_tumu(__VA_ARGS__)
+#define vxor_vx_i64m4_m(...) __riscv_vxor_vx_i64m4_tumu(__VA_ARGS__)
+#define vxor_vv_i64m8_m(...) __riscv_vxor_vv_i64m8_tumu(__VA_ARGS__)
+#define vxor_vx_i64m8_m(...) __riscv_vxor_vx_i64m8_tumu(__VA_ARGS__)
+#define vand_vv_u8mf8_m(...) __riscv_vand_vv_u8mf8_tumu(__VA_ARGS__)
+#define vand_vx_u8mf8_m(...) __riscv_vand_vx_u8mf8_tumu(__VA_ARGS__)
+#define vand_vv_u8mf4_m(...) __riscv_vand_vv_u8mf4_tumu(__VA_ARGS__)
+#define vand_vx_u8mf4_m(...) __riscv_vand_vx_u8mf4_tumu(__VA_ARGS__)
+#define vand_vv_u8mf2_m(...) __riscv_vand_vv_u8mf2_tumu(__VA_ARGS__)
+#define vand_vx_u8mf2_m(...) __riscv_vand_vx_u8mf2_tumu(__VA_ARGS__)
+#define vand_vv_u8m1_m(...) __riscv_vand_vv_u8m1_tumu(__VA_ARGS__)
+#define vand_vx_u8m1_m(...) __riscv_vand_vx_u8m1_tumu(__VA_ARGS__)
+#define vand_vv_u8m2_m(...) __riscv_vand_vv_u8m2_tumu(__VA_ARGS__)
+#define vand_vx_u8m2_m(...) __riscv_vand_vx_u8m2_tumu(__VA_ARGS__)
+#define vand_vv_u8m4_m(...) __riscv_vand_vv_u8m4_tumu(__VA_ARGS__)
+#define vand_vx_u8m4_m(...) __riscv_vand_vx_u8m4_tumu(__VA_ARGS__)
+#define vand_vv_u8m8_m(...) __riscv_vand_vv_u8m8_tumu(__VA_ARGS__)
+#define vand_vx_u8m8_m(...) __riscv_vand_vx_u8m8_tumu(__VA_ARGS__)
+#define vand_vv_u16mf4_m(...) __riscv_vand_vv_u16mf4_tumu(__VA_ARGS__)
+#define vand_vx_u16mf4_m(...) __riscv_vand_vx_u16mf4_tumu(__VA_ARGS__)
+#define vand_vv_u16mf2_m(...) __riscv_vand_vv_u16mf2_tumu(__VA_ARGS__)
+#define vand_vx_u16mf2_m(...) __riscv_vand_vx_u16mf2_tumu(__VA_ARGS__)
+#define vand_vv_u16m1_m(...) __riscv_vand_vv_u16m1_tumu(__VA_ARGS__)
+#define vand_vx_u16m1_m(...) __riscv_vand_vx_u16m1_tumu(__VA_ARGS__)
+#define vand_vv_u16m2_m(...) __riscv_vand_vv_u16m2_tumu(__VA_ARGS__)
+#define vand_vx_u16m2_m(...) __riscv_vand_vx_u16m2_tumu(__VA_ARGS__)
+#define vand_vv_u16m4_m(...) __riscv_vand_vv_u16m4_tumu(__VA_ARGS__)
+#define vand_vx_u16m4_m(...) __riscv_vand_vx_u16m4_tumu(__VA_ARGS__)
+#define vand_vv_u16m8_m(...) __riscv_vand_vv_u16m8_tumu(__VA_ARGS__)
+#define vand_vx_u16m8_m(...) __riscv_vand_vx_u16m8_tumu(__VA_ARGS__)
+#define vand_vv_u32mf2_m(...) __riscv_vand_vv_u32mf2_tumu(__VA_ARGS__)
+#define vand_vx_u32mf2_m(...) __riscv_vand_vx_u32mf2_tumu(__VA_ARGS__)
+#define vand_vv_u32m1_m(...) __riscv_vand_vv_u32m1_tumu(__VA_ARGS__)
+#define vand_vx_u32m1_m(...) __riscv_vand_vx_u32m1_tumu(__VA_ARGS__)
+#define vand_vv_u32m2_m(...) __riscv_vand_vv_u32m2_tumu(__VA_ARGS__)
+#define vand_vx_u32m2_m(...) __riscv_vand_vx_u32m2_tumu(__VA_ARGS__)
+#define vand_vv_u32m4_m(...) __riscv_vand_vv_u32m4_tumu(__VA_ARGS__)
+#define vand_vx_u32m4_m(...) __riscv_vand_vx_u32m4_tumu(__VA_ARGS__)
+#define vand_vv_u32m8_m(...) __riscv_vand_vv_u32m8_tumu(__VA_ARGS__)
+#define vand_vx_u32m8_m(...) __riscv_vand_vx_u32m8_tumu(__VA_ARGS__)
+#define vand_vv_u64m1_m(...) __riscv_vand_vv_u64m1_tumu(__VA_ARGS__)
+#define vand_vx_u64m1_m(...) __riscv_vand_vx_u64m1_tumu(__VA_ARGS__)
+#define vand_vv_u64m2_m(...) __riscv_vand_vv_u64m2_tumu(__VA_ARGS__)
+#define vand_vx_u64m2_m(...) __riscv_vand_vx_u64m2_tumu(__VA_ARGS__)
+#define vand_vv_u64m4_m(...) __riscv_vand_vv_u64m4_tumu(__VA_ARGS__)
+#define vand_vx_u64m4_m(...) __riscv_vand_vx_u64m4_tumu(__VA_ARGS__)
+#define vand_vv_u64m8_m(...) __riscv_vand_vv_u64m8_tumu(__VA_ARGS__)
+#define vand_vx_u64m8_m(...) __riscv_vand_vx_u64m8_tumu(__VA_ARGS__)
+#define vor_vv_u8mf8_m(...) __riscv_vor_vv_u8mf8_tumu(__VA_ARGS__)
+#define vor_vx_u8mf8_m(...) __riscv_vor_vx_u8mf8_tumu(__VA_ARGS__)
+#define vor_vv_u8mf4_m(...) __riscv_vor_vv_u8mf4_tumu(__VA_ARGS__)
+#define vor_vx_u8mf4_m(...) __riscv_vor_vx_u8mf4_tumu(__VA_ARGS__)
+#define vor_vv_u8mf2_m(...) __riscv_vor_vv_u8mf2_tumu(__VA_ARGS__)
+#define vor_vx_u8mf2_m(...) __riscv_vor_vx_u8mf2_tumu(__VA_ARGS__)
+#define vor_vv_u8m1_m(...) __riscv_vor_vv_u8m1_tumu(__VA_ARGS__)
+#define vor_vx_u8m1_m(...) __riscv_vor_vx_u8m1_tumu(__VA_ARGS__)
+#define vor_vv_u8m2_m(...) __riscv_vor_vv_u8m2_tumu(__VA_ARGS__)
+#define vor_vx_u8m2_m(...) __riscv_vor_vx_u8m2_tumu(__VA_ARGS__)
+#define vor_vv_u8m4_m(...) __riscv_vor_vv_u8m4_tumu(__VA_ARGS__)
+#define vor_vx_u8m4_m(...) __riscv_vor_vx_u8m4_tumu(__VA_ARGS__)
+#define vor_vv_u8m8_m(...) __riscv_vor_vv_u8m8_tumu(__VA_ARGS__)
+#define vor_vx_u8m8_m(...) __riscv_vor_vx_u8m8_tumu(__VA_ARGS__)
+#define vor_vv_u16mf4_m(...) __riscv_vor_vv_u16mf4_tumu(__VA_ARGS__)
+#define vor_vx_u16mf4_m(...) __riscv_vor_vx_u16mf4_tumu(__VA_ARGS__)
+#define vor_vv_u16mf2_m(...) __riscv_vor_vv_u16mf2_tumu(__VA_ARGS__)
+#define vor_vx_u16mf2_m(...) __riscv_vor_vx_u16mf2_tumu(__VA_ARGS__)
+#define vor_vv_u16m1_m(...) __riscv_vor_vv_u16m1_tumu(__VA_ARGS__)
+#define vor_vx_u16m1_m(...) __riscv_vor_vx_u16m1_tumu(__VA_ARGS__)
+#define vor_vv_u16m2_m(...) __riscv_vor_vv_u16m2_tumu(__VA_ARGS__)
+#define vor_vx_u16m2_m(...) __riscv_vor_vx_u16m2_tumu(__VA_ARGS__)
+#define vor_vv_u16m4_m(...) __riscv_vor_vv_u16m4_tumu(__VA_ARGS__)
+#define vor_vx_u16m4_m(...) __riscv_vor_vx_u16m4_tumu(__VA_ARGS__)
+#define vor_vv_u16m8_m(...) __riscv_vor_vv_u16m8_tumu(__VA_ARGS__)
+#define vor_vx_u16m8_m(...) __riscv_vor_vx_u16m8_tumu(__VA_ARGS__)
+#define vor_vv_u32mf2_m(...) __riscv_vor_vv_u32mf2_tumu(__VA_ARGS__)
+#define vor_vx_u32mf2_m(...) __riscv_vor_vx_u32mf2_tumu(__VA_ARGS__)
+#define vor_vv_u32m1_m(...) __riscv_vor_vv_u32m1_tumu(__VA_ARGS__)
+#define vor_vx_u32m1_m(...) __riscv_vor_vx_u32m1_tumu(__VA_ARGS__)
+#define vor_vv_u32m2_m(...) __riscv_vor_vv_u32m2_tumu(__VA_ARGS__)
+#define vor_vx_u32m2_m(...) __riscv_vor_vx_u32m2_tumu(__VA_ARGS__)
+#define vor_vv_u32m4_m(...) __riscv_vor_vv_u32m4_tumu(__VA_ARGS__)
+#define vor_vx_u32m4_m(...) __riscv_vor_vx_u32m4_tumu(__VA_ARGS__)
+#define vor_vv_u32m8_m(...) __riscv_vor_vv_u32m8_tumu(__VA_ARGS__)
+#define vor_vx_u32m8_m(...) __riscv_vor_vx_u32m8_tumu(__VA_ARGS__)
+#define vor_vv_u64m1_m(...) __riscv_vor_vv_u64m1_tumu(__VA_ARGS__)
+#define vor_vx_u64m1_m(...) __riscv_vor_vx_u64m1_tumu(__VA_ARGS__)
+#define vor_vv_u64m2_m(...) __riscv_vor_vv_u64m2_tumu(__VA_ARGS__)
+#define vor_vx_u64m2_m(...) __riscv_vor_vx_u64m2_tumu(__VA_ARGS__)
+#define vor_vv_u64m4_m(...) __riscv_vor_vv_u64m4_tumu(__VA_ARGS__)
+#define vor_vx_u64m4_m(...) __riscv_vor_vx_u64m4_tumu(__VA_ARGS__)
+#define vor_vv_u64m8_m(...) __riscv_vor_vv_u64m8_tumu(__VA_ARGS__)
+#define vor_vx_u64m8_m(...) __riscv_vor_vx_u64m8_tumu(__VA_ARGS__)
+#define vxor_vv_u8mf8_m(...) __riscv_vxor_vv_u8mf8_tumu(__VA_ARGS__)
+#define vxor_vx_u8mf8_m(...) __riscv_vxor_vx_u8mf8_tumu(__VA_ARGS__)
+#define vxor_vv_u8mf4_m(...) __riscv_vxor_vv_u8mf4_tumu(__VA_ARGS__)
+#define vxor_vx_u8mf4_m(...) __riscv_vxor_vx_u8mf4_tumu(__VA_ARGS__)
+#define vxor_vv_u8mf2_m(...) __riscv_vxor_vv_u8mf2_tumu(__VA_ARGS__)
+#define vxor_vx_u8mf2_m(...) __riscv_vxor_vx_u8mf2_tumu(__VA_ARGS__)
+#define vxor_vv_u8m1_m(...) __riscv_vxor_vv_u8m1_tumu(__VA_ARGS__)
+#define vxor_vx_u8m1_m(...) __riscv_vxor_vx_u8m1_tumu(__VA_ARGS__)
+#define vxor_vv_u8m2_m(...) __riscv_vxor_vv_u8m2_tumu(__VA_ARGS__)
+#define vxor_vx_u8m2_m(...) __riscv_vxor_vx_u8m2_tumu(__VA_ARGS__)
+#define vxor_vv_u8m4_m(...) __riscv_vxor_vv_u8m4_tumu(__VA_ARGS__)
+#define vxor_vx_u8m4_m(...) __riscv_vxor_vx_u8m4_tumu(__VA_ARGS__)
+#define vxor_vv_u8m8_m(...) __riscv_vxor_vv_u8m8_tumu(__VA_ARGS__)
+#define vxor_vx_u8m8_m(...) __riscv_vxor_vx_u8m8_tumu(__VA_ARGS__)
+#define vxor_vv_u16mf4_m(...) __riscv_vxor_vv_u16mf4_tumu(__VA_ARGS__)
+#define vxor_vx_u16mf4_m(...) __riscv_vxor_vx_u16mf4_tumu(__VA_ARGS__)
+#define vxor_vv_u16mf2_m(...) __riscv_vxor_vv_u16mf2_tumu(__VA_ARGS__)
+#define vxor_vx_u16mf2_m(...) __riscv_vxor_vx_u16mf2_tumu(__VA_ARGS__)
+#define vxor_vv_u16m1_m(...) __riscv_vxor_vv_u16m1_tumu(__VA_ARGS__)
+#define vxor_vx_u16m1_m(...) __riscv_vxor_vx_u16m1_tumu(__VA_ARGS__)
+#define vxor_vv_u16m2_m(...) __riscv_vxor_vv_u16m2_tumu(__VA_ARGS__)
+#define vxor_vx_u16m2_m(...) __riscv_vxor_vx_u16m2_tumu(__VA_ARGS__)
+#define vxor_vv_u16m4_m(...) __riscv_vxor_vv_u16m4_tumu(__VA_ARGS__)
+#define vxor_vx_u16m4_m(...) __riscv_vxor_vx_u16m4_tumu(__VA_ARGS__)
+#define vxor_vv_u16m8_m(...) __riscv_vxor_vv_u16m8_tumu(__VA_ARGS__)
+#define vxor_vx_u16m8_m(...) __riscv_vxor_vx_u16m8_tumu(__VA_ARGS__)
+#define vxor_vv_u32mf2_m(...) __riscv_vxor_vv_u32mf2_tumu(__VA_ARGS__)
+#define vxor_vx_u32mf2_m(...) __riscv_vxor_vx_u32mf2_tumu(__VA_ARGS__)
+#define vxor_vv_u32m1_m(...) __riscv_vxor_vv_u32m1_tumu(__VA_ARGS__)
+#define vxor_vx_u32m1_m(...) __riscv_vxor_vx_u32m1_tumu(__VA_ARGS__)
+#define vxor_vv_u32m2_m(...) __riscv_vxor_vv_u32m2_tumu(__VA_ARGS__)
+#define vxor_vx_u32m2_m(...) __riscv_vxor_vx_u32m2_tumu(__VA_ARGS__)
+#define vxor_vv_u32m4_m(...) __riscv_vxor_vv_u32m4_tumu(__VA_ARGS__)
+#define vxor_vx_u32m4_m(...) __riscv_vxor_vx_u32m4_tumu(__VA_ARGS__)
+#define vxor_vv_u32m8_m(...) __riscv_vxor_vv_u32m8_tumu(__VA_ARGS__)
+#define vxor_vx_u32m8_m(...) __riscv_vxor_vx_u32m8_tumu(__VA_ARGS__)
+#define vxor_vv_u64m1_m(...) __riscv_vxor_vv_u64m1_tumu(__VA_ARGS__)
+#define vxor_vx_u64m1_m(...) __riscv_vxor_vx_u64m1_tumu(__VA_ARGS__)
+#define vxor_vv_u64m2_m(...) __riscv_vxor_vv_u64m2_tumu(__VA_ARGS__)
+#define vxor_vx_u64m2_m(...) __riscv_vxor_vx_u64m2_tumu(__VA_ARGS__)
+#define vxor_vv_u64m4_m(...) __riscv_vxor_vv_u64m4_tumu(__VA_ARGS__)
+#define vxor_vx_u64m4_m(...) __riscv_vxor_vx_u64m4_tumu(__VA_ARGS__)
+#define vxor_vv_u64m8_m(...) __riscv_vxor_vv_u64m8_tumu(__VA_ARGS__)
+#define vxor_vx_u64m8_m(...) __riscv_vxor_vx_u64m8_tumu(__VA_ARGS__)
+#define vnot_v_i8mf8(...) __riscv_vnot_v_i8mf8(__VA_ARGS__) // vnot_v_<type>: unprefixed alias that forwards all arguments to the __riscv_-prefixed intrinsic of the same name
+#define vnot_v_i8mf4(...) __riscv_vnot_v_i8mf4(__VA_ARGS__)
+#define vnot_v_i8mf2(...) __riscv_vnot_v_i8mf2(__VA_ARGS__)
+#define vnot_v_i8m1(...) __riscv_vnot_v_i8m1(__VA_ARGS__)
+#define vnot_v_i8m2(...) __riscv_vnot_v_i8m2(__VA_ARGS__)
+#define vnot_v_i8m4(...) __riscv_vnot_v_i8m4(__VA_ARGS__)
+#define vnot_v_i8m8(...) __riscv_vnot_v_i8m8(__VA_ARGS__)
+#define vnot_v_i16mf4(...) __riscv_vnot_v_i16mf4(__VA_ARGS__)
+#define vnot_v_i16mf2(...) __riscv_vnot_v_i16mf2(__VA_ARGS__)
+#define vnot_v_i16m1(...) __riscv_vnot_v_i16m1(__VA_ARGS__)
+#define vnot_v_i16m2(...) __riscv_vnot_v_i16m2(__VA_ARGS__)
+#define vnot_v_i16m4(...) __riscv_vnot_v_i16m4(__VA_ARGS__)
+#define vnot_v_i16m8(...) __riscv_vnot_v_i16m8(__VA_ARGS__)
+#define vnot_v_i32mf2(...) __riscv_vnot_v_i32mf2(__VA_ARGS__)
+#define vnot_v_i32m1(...) __riscv_vnot_v_i32m1(__VA_ARGS__)
+#define vnot_v_i32m2(...) __riscv_vnot_v_i32m2(__VA_ARGS__)
+#define vnot_v_i32m4(...) __riscv_vnot_v_i32m4(__VA_ARGS__)
+#define vnot_v_i32m8(...) __riscv_vnot_v_i32m8(__VA_ARGS__)
+#define vnot_v_i64m1(...) __riscv_vnot_v_i64m1(__VA_ARGS__)
+#define vnot_v_i64m2(...) __riscv_vnot_v_i64m2(__VA_ARGS__)
+#define vnot_v_i64m4(...) __riscv_vnot_v_i64m4(__VA_ARGS__)
+#define vnot_v_i64m8(...) __riscv_vnot_v_i64m8(__VA_ARGS__)
+#define vnot_v_u8mf8(...) __riscv_vnot_v_u8mf8(__VA_ARGS__)
+#define vnot_v_u8mf4(...) __riscv_vnot_v_u8mf4(__VA_ARGS__)
+#define vnot_v_u8mf2(...) __riscv_vnot_v_u8mf2(__VA_ARGS__)
+#define vnot_v_u8m1(...) __riscv_vnot_v_u8m1(__VA_ARGS__)
+#define vnot_v_u8m2(...) __riscv_vnot_v_u8m2(__VA_ARGS__)
+#define vnot_v_u8m4(...) __riscv_vnot_v_u8m4(__VA_ARGS__)
+#define vnot_v_u8m8(...) __riscv_vnot_v_u8m8(__VA_ARGS__)
+#define vnot_v_u16mf4(...) __riscv_vnot_v_u16mf4(__VA_ARGS__)
+#define vnot_v_u16mf2(...) __riscv_vnot_v_u16mf2(__VA_ARGS__)
+#define vnot_v_u16m1(...) __riscv_vnot_v_u16m1(__VA_ARGS__)
+#define vnot_v_u16m2(...) __riscv_vnot_v_u16m2(__VA_ARGS__)
+#define vnot_v_u16m4(...) __riscv_vnot_v_u16m4(__VA_ARGS__)
+#define vnot_v_u16m8(...) __riscv_vnot_v_u16m8(__VA_ARGS__)
+#define vnot_v_u32mf2(...) __riscv_vnot_v_u32mf2(__VA_ARGS__)
+#define vnot_v_u32m1(...) __riscv_vnot_v_u32m1(__VA_ARGS__)
+#define vnot_v_u32m2(...) __riscv_vnot_v_u32m2(__VA_ARGS__)
+#define vnot_v_u32m4(...) __riscv_vnot_v_u32m4(__VA_ARGS__)
+#define vnot_v_u32m8(...) __riscv_vnot_v_u32m8(__VA_ARGS__)
+#define vnot_v_u64m1(...) __riscv_vnot_v_u64m1(__VA_ARGS__)
+#define vnot_v_u64m2(...) __riscv_vnot_v_u64m2(__VA_ARGS__)
+#define vnot_v_u64m4(...) __riscv_vnot_v_u64m4(__VA_ARGS__)
+#define vnot_v_u64m8(...) __riscv_vnot_v_u64m8(__VA_ARGS__)
+// Masked variants: each vnot_v_*_m name forwards its arguments to the matching __riscv_vnot_v_*_tumu intrinsic ("tumu" is the RVV policy suffix for tail-undisturbed, mask-undisturbed).
+#define vnot_v_i8mf8_m(...) __riscv_vnot_v_i8mf8_tumu(__VA_ARGS__)
+#define vnot_v_i8mf4_m(...) __riscv_vnot_v_i8mf4_tumu(__VA_ARGS__)
+#define vnot_v_i8mf2_m(...) __riscv_vnot_v_i8mf2_tumu(__VA_ARGS__)
+#define vnot_v_i8m1_m(...) __riscv_vnot_v_i8m1_tumu(__VA_ARGS__)
+#define vnot_v_i8m2_m(...) __riscv_vnot_v_i8m2_tumu(__VA_ARGS__)
+#define vnot_v_i8m4_m(...) __riscv_vnot_v_i8m4_tumu(__VA_ARGS__)
+#define vnot_v_i8m8_m(...) __riscv_vnot_v_i8m8_tumu(__VA_ARGS__)
+#define vnot_v_i16mf4_m(...) __riscv_vnot_v_i16mf4_tumu(__VA_ARGS__)
+#define vnot_v_i16mf2_m(...) __riscv_vnot_v_i16mf2_tumu(__VA_ARGS__)
+#define vnot_v_i16m1_m(...) __riscv_vnot_v_i16m1_tumu(__VA_ARGS__)
+#define vnot_v_i16m2_m(...) __riscv_vnot_v_i16m2_tumu(__VA_ARGS__)
+#define vnot_v_i16m4_m(...) __riscv_vnot_v_i16m4_tumu(__VA_ARGS__)
+#define vnot_v_i16m8_m(...) __riscv_vnot_v_i16m8_tumu(__VA_ARGS__)
+#define vnot_v_i32mf2_m(...) __riscv_vnot_v_i32mf2_tumu(__VA_ARGS__)
+#define vnot_v_i32m1_m(...) __riscv_vnot_v_i32m1_tumu(__VA_ARGS__)
+#define vnot_v_i32m2_m(...) __riscv_vnot_v_i32m2_tumu(__VA_ARGS__)
+#define vnot_v_i32m4_m(...) __riscv_vnot_v_i32m4_tumu(__VA_ARGS__)
+#define vnot_v_i32m8_m(...) __riscv_vnot_v_i32m8_tumu(__VA_ARGS__)
+#define vnot_v_i64m1_m(...) __riscv_vnot_v_i64m1_tumu(__VA_ARGS__)
+#define vnot_v_i64m2_m(...) __riscv_vnot_v_i64m2_tumu(__VA_ARGS__)
+#define vnot_v_i64m4_m(...) __riscv_vnot_v_i64m4_tumu(__VA_ARGS__)
+#define vnot_v_i64m8_m(...) __riscv_vnot_v_i64m8_tumu(__VA_ARGS__)
+#define vnot_v_u8mf8_m(...) __riscv_vnot_v_u8mf8_tumu(__VA_ARGS__)
+#define vnot_v_u8mf4_m(...) __riscv_vnot_v_u8mf4_tumu(__VA_ARGS__)
+#define vnot_v_u8mf2_m(...) __riscv_vnot_v_u8mf2_tumu(__VA_ARGS__)
+#define vnot_v_u8m1_m(...) __riscv_vnot_v_u8m1_tumu(__VA_ARGS__)
+#define vnot_v_u8m2_m(...) __riscv_vnot_v_u8m2_tumu(__VA_ARGS__)
+#define vnot_v_u8m4_m(...) __riscv_vnot_v_u8m4_tumu(__VA_ARGS__)
+#define vnot_v_u8m8_m(...) __riscv_vnot_v_u8m8_tumu(__VA_ARGS__)
+#define vnot_v_u16mf4_m(...) __riscv_vnot_v_u16mf4_tumu(__VA_ARGS__)
+#define vnot_v_u16mf2_m(...) __riscv_vnot_v_u16mf2_tumu(__VA_ARGS__)
+#define vnot_v_u16m1_m(...) __riscv_vnot_v_u16m1_tumu(__VA_ARGS__)
+#define vnot_v_u16m2_m(...) __riscv_vnot_v_u16m2_tumu(__VA_ARGS__)
+#define vnot_v_u16m4_m(...) __riscv_vnot_v_u16m4_tumu(__VA_ARGS__)
+#define vnot_v_u16m8_m(...) __riscv_vnot_v_u16m8_tumu(__VA_ARGS__)
+#define vnot_v_u32mf2_m(...) __riscv_vnot_v_u32mf2_tumu(__VA_ARGS__)
+#define vnot_v_u32m1_m(...) __riscv_vnot_v_u32m1_tumu(__VA_ARGS__)
+#define vnot_v_u32m2_m(...) __riscv_vnot_v_u32m2_tumu(__VA_ARGS__)
+#define vnot_v_u32m4_m(...) __riscv_vnot_v_u32m4_tumu(__VA_ARGS__)
+#define vnot_v_u32m8_m(...) __riscv_vnot_v_u32m8_tumu(__VA_ARGS__)
+#define vnot_v_u64m1_m(...) __riscv_vnot_v_u64m1_tumu(__VA_ARGS__)
+#define vnot_v_u64m2_m(...) __riscv_vnot_v_u64m2_tumu(__VA_ARGS__)
+#define vnot_v_u64m4_m(...) __riscv_vnot_v_u64m4_tumu(__VA_ARGS__)
+#define vnot_v_u64m8_m(...) __riscv_vnot_v_u64m8_tumu(__VA_ARGS__)
+#define vsll_vv_i8mf8(...) __riscv_vsll_vv_i8mf8(__VA_ARGS__) // vsll, type suffixes i8mf8..i64m8: each unprefixed name forwards its arguments to __riscv_vsll_*
+#define vsll_vx_i8mf8(...) __riscv_vsll_vx_i8mf8(__VA_ARGS__)
+#define vsll_vv_i8mf4(...) __riscv_vsll_vv_i8mf4(__VA_ARGS__)
+#define vsll_vx_i8mf4(...) __riscv_vsll_vx_i8mf4(__VA_ARGS__)
+#define vsll_vv_i8mf2(...) __riscv_vsll_vv_i8mf2(__VA_ARGS__)
+#define vsll_vx_i8mf2(...) __riscv_vsll_vx_i8mf2(__VA_ARGS__)
+#define vsll_vv_i8m1(...) __riscv_vsll_vv_i8m1(__VA_ARGS__)
+#define vsll_vx_i8m1(...) __riscv_vsll_vx_i8m1(__VA_ARGS__)
+#define vsll_vv_i8m2(...) __riscv_vsll_vv_i8m2(__VA_ARGS__)
+#define vsll_vx_i8m2(...) __riscv_vsll_vx_i8m2(__VA_ARGS__)
+#define vsll_vv_i8m4(...) __riscv_vsll_vv_i8m4(__VA_ARGS__)
+#define vsll_vx_i8m4(...) __riscv_vsll_vx_i8m4(__VA_ARGS__)
+#define vsll_vv_i8m8(...) __riscv_vsll_vv_i8m8(__VA_ARGS__)
+#define vsll_vx_i8m8(...) __riscv_vsll_vx_i8m8(__VA_ARGS__)
+#define vsll_vv_i16mf4(...) __riscv_vsll_vv_i16mf4(__VA_ARGS__)
+#define vsll_vx_i16mf4(...) __riscv_vsll_vx_i16mf4(__VA_ARGS__)
+#define vsll_vv_i16mf2(...) __riscv_vsll_vv_i16mf2(__VA_ARGS__)
+#define vsll_vx_i16mf2(...) __riscv_vsll_vx_i16mf2(__VA_ARGS__)
+#define vsll_vv_i16m1(...) __riscv_vsll_vv_i16m1(__VA_ARGS__)
+#define vsll_vx_i16m1(...) __riscv_vsll_vx_i16m1(__VA_ARGS__)
+#define vsll_vv_i16m2(...) __riscv_vsll_vv_i16m2(__VA_ARGS__)
+#define vsll_vx_i16m2(...) __riscv_vsll_vx_i16m2(__VA_ARGS__)
+#define vsll_vv_i16m4(...) __riscv_vsll_vv_i16m4(__VA_ARGS__)
+#define vsll_vx_i16m4(...) __riscv_vsll_vx_i16m4(__VA_ARGS__)
+#define vsll_vv_i16m8(...) __riscv_vsll_vv_i16m8(__VA_ARGS__)
+#define vsll_vx_i16m8(...) __riscv_vsll_vx_i16m8(__VA_ARGS__)
+#define vsll_vv_i32mf2(...) __riscv_vsll_vv_i32mf2(__VA_ARGS__)
+#define vsll_vx_i32mf2(...) __riscv_vsll_vx_i32mf2(__VA_ARGS__)
+#define vsll_vv_i32m1(...) __riscv_vsll_vv_i32m1(__VA_ARGS__)
+#define vsll_vx_i32m1(...) __riscv_vsll_vx_i32m1(__VA_ARGS__)
+#define vsll_vv_i32m2(...) __riscv_vsll_vv_i32m2(__VA_ARGS__)
+#define vsll_vx_i32m2(...) __riscv_vsll_vx_i32m2(__VA_ARGS__)
+#define vsll_vv_i32m4(...) __riscv_vsll_vv_i32m4(__VA_ARGS__)
+#define vsll_vx_i32m4(...) __riscv_vsll_vx_i32m4(__VA_ARGS__)
+#define vsll_vv_i32m8(...) __riscv_vsll_vv_i32m8(__VA_ARGS__)
+#define vsll_vx_i32m8(...) __riscv_vsll_vx_i32m8(__VA_ARGS__)
+#define vsll_vv_i64m1(...) __riscv_vsll_vv_i64m1(__VA_ARGS__)
+#define vsll_vx_i64m1(...) __riscv_vsll_vx_i64m1(__VA_ARGS__)
+#define vsll_vv_i64m2(...) __riscv_vsll_vv_i64m2(__VA_ARGS__)
+#define vsll_vx_i64m2(...) __riscv_vsll_vx_i64m2(__VA_ARGS__)
+#define vsll_vv_i64m4(...) __riscv_vsll_vv_i64m4(__VA_ARGS__)
+#define vsll_vx_i64m4(...) __riscv_vsll_vx_i64m4(__VA_ARGS__)
+#define vsll_vv_i64m8(...) __riscv_vsll_vv_i64m8(__VA_ARGS__)
+#define vsll_vx_i64m8(...) __riscv_vsll_vx_i64m8(__VA_ARGS__)
+#define vsra_vv_i8mf8(...) __riscv_vsra_vv_i8mf8(__VA_ARGS__) // vsra, type suffixes i8mf8..i64m8: each unprefixed name forwards its arguments to __riscv_vsra_*
+#define vsra_vx_i8mf8(...) __riscv_vsra_vx_i8mf8(__VA_ARGS__)
+#define vsra_vv_i8mf4(...) __riscv_vsra_vv_i8mf4(__VA_ARGS__)
+#define vsra_vx_i8mf4(...) __riscv_vsra_vx_i8mf4(__VA_ARGS__)
+#define vsra_vv_i8mf2(...) __riscv_vsra_vv_i8mf2(__VA_ARGS__)
+#define vsra_vx_i8mf2(...) __riscv_vsra_vx_i8mf2(__VA_ARGS__)
+#define vsra_vv_i8m1(...) __riscv_vsra_vv_i8m1(__VA_ARGS__)
+#define vsra_vx_i8m1(...) __riscv_vsra_vx_i8m1(__VA_ARGS__)
+#define vsra_vv_i8m2(...) __riscv_vsra_vv_i8m2(__VA_ARGS__)
+#define vsra_vx_i8m2(...) __riscv_vsra_vx_i8m2(__VA_ARGS__)
+#define vsra_vv_i8m4(...) __riscv_vsra_vv_i8m4(__VA_ARGS__)
+#define vsra_vx_i8m4(...) __riscv_vsra_vx_i8m4(__VA_ARGS__)
+#define vsra_vv_i8m8(...) __riscv_vsra_vv_i8m8(__VA_ARGS__)
+#define vsra_vx_i8m8(...) __riscv_vsra_vx_i8m8(__VA_ARGS__)
+#define vsra_vv_i16mf4(...) __riscv_vsra_vv_i16mf4(__VA_ARGS__)
+#define vsra_vx_i16mf4(...) __riscv_vsra_vx_i16mf4(__VA_ARGS__)
+#define vsra_vv_i16mf2(...) __riscv_vsra_vv_i16mf2(__VA_ARGS__)
+#define vsra_vx_i16mf2(...) __riscv_vsra_vx_i16mf2(__VA_ARGS__)
+#define vsra_vv_i16m1(...) __riscv_vsra_vv_i16m1(__VA_ARGS__)
+#define vsra_vx_i16m1(...) __riscv_vsra_vx_i16m1(__VA_ARGS__)
+#define vsra_vv_i16m2(...) __riscv_vsra_vv_i16m2(__VA_ARGS__)
+#define vsra_vx_i16m2(...) __riscv_vsra_vx_i16m2(__VA_ARGS__)
+#define vsra_vv_i16m4(...) __riscv_vsra_vv_i16m4(__VA_ARGS__)
+#define vsra_vx_i16m4(...) __riscv_vsra_vx_i16m4(__VA_ARGS__)
+#define vsra_vv_i16m8(...) __riscv_vsra_vv_i16m8(__VA_ARGS__)
+#define vsra_vx_i16m8(...) __riscv_vsra_vx_i16m8(__VA_ARGS__)
+#define vsra_vv_i32mf2(...) __riscv_vsra_vv_i32mf2(__VA_ARGS__)
+#define vsra_vx_i32mf2(...) __riscv_vsra_vx_i32mf2(__VA_ARGS__)
+#define vsra_vv_i32m1(...) __riscv_vsra_vv_i32m1(__VA_ARGS__)
+#define vsra_vx_i32m1(...) __riscv_vsra_vx_i32m1(__VA_ARGS__)
+#define vsra_vv_i32m2(...) __riscv_vsra_vv_i32m2(__VA_ARGS__)
+#define vsra_vx_i32m2(...) __riscv_vsra_vx_i32m2(__VA_ARGS__)
+#define vsra_vv_i32m4(...) __riscv_vsra_vv_i32m4(__VA_ARGS__)
+#define vsra_vx_i32m4(...) __riscv_vsra_vx_i32m4(__VA_ARGS__)
+#define vsra_vv_i32m8(...) __riscv_vsra_vv_i32m8(__VA_ARGS__)
+#define vsra_vx_i32m8(...) __riscv_vsra_vx_i32m8(__VA_ARGS__)
+#define vsra_vv_i64m1(...) __riscv_vsra_vv_i64m1(__VA_ARGS__)
+#define vsra_vx_i64m1(...) __riscv_vsra_vx_i64m1(__VA_ARGS__)
+#define vsra_vv_i64m2(...) __riscv_vsra_vv_i64m2(__VA_ARGS__)
+#define vsra_vx_i64m2(...) __riscv_vsra_vx_i64m2(__VA_ARGS__)
+#define vsra_vv_i64m4(...) __riscv_vsra_vv_i64m4(__VA_ARGS__)
+#define vsra_vx_i64m4(...) __riscv_vsra_vx_i64m4(__VA_ARGS__)
+#define vsra_vv_i64m8(...) __riscv_vsra_vv_i64m8(__VA_ARGS__)
+#define vsra_vx_i64m8(...) __riscv_vsra_vx_i64m8(__VA_ARGS__)
+#define vsll_vv_u8mf8(...) __riscv_vsll_vv_u8mf8(__VA_ARGS__) // vsll, type suffixes u8mf8..u64m8: each unprefixed name forwards its arguments to __riscv_vsll_*
+#define vsll_vx_u8mf8(...) __riscv_vsll_vx_u8mf8(__VA_ARGS__)
+#define vsll_vv_u8mf4(...) __riscv_vsll_vv_u8mf4(__VA_ARGS__)
+#define vsll_vx_u8mf4(...) __riscv_vsll_vx_u8mf4(__VA_ARGS__)
+#define vsll_vv_u8mf2(...) __riscv_vsll_vv_u8mf2(__VA_ARGS__)
+#define vsll_vx_u8mf2(...) __riscv_vsll_vx_u8mf2(__VA_ARGS__)
+#define vsll_vv_u8m1(...) __riscv_vsll_vv_u8m1(__VA_ARGS__)
+#define vsll_vx_u8m1(...) __riscv_vsll_vx_u8m1(__VA_ARGS__)
+#define vsll_vv_u8m2(...) __riscv_vsll_vv_u8m2(__VA_ARGS__)
+#define vsll_vx_u8m2(...) __riscv_vsll_vx_u8m2(__VA_ARGS__)
+#define vsll_vv_u8m4(...) __riscv_vsll_vv_u8m4(__VA_ARGS__)
+#define vsll_vx_u8m4(...) __riscv_vsll_vx_u8m4(__VA_ARGS__)
+#define vsll_vv_u8m8(...) __riscv_vsll_vv_u8m8(__VA_ARGS__)
+#define vsll_vx_u8m8(...) __riscv_vsll_vx_u8m8(__VA_ARGS__)
+#define vsll_vv_u16mf4(...) __riscv_vsll_vv_u16mf4(__VA_ARGS__)
+#define vsll_vx_u16mf4(...) __riscv_vsll_vx_u16mf4(__VA_ARGS__)
+#define vsll_vv_u16mf2(...) __riscv_vsll_vv_u16mf2(__VA_ARGS__)
+#define vsll_vx_u16mf2(...) __riscv_vsll_vx_u16mf2(__VA_ARGS__)
+#define vsll_vv_u16m1(...) __riscv_vsll_vv_u16m1(__VA_ARGS__)
+#define vsll_vx_u16m1(...) __riscv_vsll_vx_u16m1(__VA_ARGS__)
+#define vsll_vv_u16m2(...) __riscv_vsll_vv_u16m2(__VA_ARGS__)
+#define vsll_vx_u16m2(...) __riscv_vsll_vx_u16m2(__VA_ARGS__)
+#define vsll_vv_u16m4(...) __riscv_vsll_vv_u16m4(__VA_ARGS__)
+#define vsll_vx_u16m4(...) __riscv_vsll_vx_u16m4(__VA_ARGS__)
+#define vsll_vv_u16m8(...) __riscv_vsll_vv_u16m8(__VA_ARGS__)
+#define vsll_vx_u16m8(...) __riscv_vsll_vx_u16m8(__VA_ARGS__)
+#define vsll_vv_u32mf2(...) __riscv_vsll_vv_u32mf2(__VA_ARGS__)
+#define vsll_vx_u32mf2(...) __riscv_vsll_vx_u32mf2(__VA_ARGS__)
+#define vsll_vv_u32m1(...) __riscv_vsll_vv_u32m1(__VA_ARGS__)
+#define vsll_vx_u32m1(...) __riscv_vsll_vx_u32m1(__VA_ARGS__)
+#define vsll_vv_u32m2(...) __riscv_vsll_vv_u32m2(__VA_ARGS__)
+#define vsll_vx_u32m2(...) __riscv_vsll_vx_u32m2(__VA_ARGS__)
+#define vsll_vv_u32m4(...) __riscv_vsll_vv_u32m4(__VA_ARGS__)
+#define vsll_vx_u32m4(...) __riscv_vsll_vx_u32m4(__VA_ARGS__)
+#define vsll_vv_u32m8(...) __riscv_vsll_vv_u32m8(__VA_ARGS__)
+#define vsll_vx_u32m8(...) __riscv_vsll_vx_u32m8(__VA_ARGS__)
+#define vsll_vv_u64m1(...) __riscv_vsll_vv_u64m1(__VA_ARGS__)
+#define vsll_vx_u64m1(...) __riscv_vsll_vx_u64m1(__VA_ARGS__)
+#define vsll_vv_u64m2(...) __riscv_vsll_vv_u64m2(__VA_ARGS__)
+#define vsll_vx_u64m2(...) __riscv_vsll_vx_u64m2(__VA_ARGS__)
+#define vsll_vv_u64m4(...) __riscv_vsll_vv_u64m4(__VA_ARGS__)
+#define vsll_vx_u64m4(...) __riscv_vsll_vx_u64m4(__VA_ARGS__)
+#define vsll_vv_u64m8(...) __riscv_vsll_vv_u64m8(__VA_ARGS__)
+#define vsll_vx_u64m8(...) __riscv_vsll_vx_u64m8(__VA_ARGS__)
+#define vsrl_vv_u8mf8(...) __riscv_vsrl_vv_u8mf8(__VA_ARGS__) // vsrl, type suffixes u8mf8..u64m8: each unprefixed name forwards its arguments to __riscv_vsrl_*
+#define vsrl_vx_u8mf8(...) __riscv_vsrl_vx_u8mf8(__VA_ARGS__)
+#define vsrl_vv_u8mf4(...) __riscv_vsrl_vv_u8mf4(__VA_ARGS__)
+#define vsrl_vx_u8mf4(...) __riscv_vsrl_vx_u8mf4(__VA_ARGS__)
+#define vsrl_vv_u8mf2(...) __riscv_vsrl_vv_u8mf2(__VA_ARGS__)
+#define vsrl_vx_u8mf2(...) __riscv_vsrl_vx_u8mf2(__VA_ARGS__)
+#define vsrl_vv_u8m1(...) __riscv_vsrl_vv_u8m1(__VA_ARGS__)
+#define vsrl_vx_u8m1(...) __riscv_vsrl_vx_u8m1(__VA_ARGS__)
+#define vsrl_vv_u8m2(...) __riscv_vsrl_vv_u8m2(__VA_ARGS__)
+#define vsrl_vx_u8m2(...) __riscv_vsrl_vx_u8m2(__VA_ARGS__)
+#define vsrl_vv_u8m4(...) __riscv_vsrl_vv_u8m4(__VA_ARGS__)
+#define vsrl_vx_u8m4(...) __riscv_vsrl_vx_u8m4(__VA_ARGS__)
+#define vsrl_vv_u8m8(...) __riscv_vsrl_vv_u8m8(__VA_ARGS__)
+#define vsrl_vx_u8m8(...) __riscv_vsrl_vx_u8m8(__VA_ARGS__)
+#define vsrl_vv_u16mf4(...) __riscv_vsrl_vv_u16mf4(__VA_ARGS__)
+#define vsrl_vx_u16mf4(...) __riscv_vsrl_vx_u16mf4(__VA_ARGS__)
+#define vsrl_vv_u16mf2(...) __riscv_vsrl_vv_u16mf2(__VA_ARGS__)
+#define vsrl_vx_u16mf2(...) __riscv_vsrl_vx_u16mf2(__VA_ARGS__)
+#define vsrl_vv_u16m1(...) __riscv_vsrl_vv_u16m1(__VA_ARGS__)
+#define vsrl_vx_u16m1(...) __riscv_vsrl_vx_u16m1(__VA_ARGS__)
+#define vsrl_vv_u16m2(...) __riscv_vsrl_vv_u16m2(__VA_ARGS__)
+#define vsrl_vx_u16m2(...) __riscv_vsrl_vx_u16m2(__VA_ARGS__)
+#define vsrl_vv_u16m4(...) __riscv_vsrl_vv_u16m4(__VA_ARGS__)
+#define vsrl_vx_u16m4(...) __riscv_vsrl_vx_u16m4(__VA_ARGS__)
+#define vsrl_vv_u16m8(...) __riscv_vsrl_vv_u16m8(__VA_ARGS__)
+#define vsrl_vx_u16m8(...) __riscv_vsrl_vx_u16m8(__VA_ARGS__)
+#define vsrl_vv_u32mf2(...) __riscv_vsrl_vv_u32mf2(__VA_ARGS__)
+#define vsrl_vx_u32mf2(...) __riscv_vsrl_vx_u32mf2(__VA_ARGS__)
+#define vsrl_vv_u32m1(...) __riscv_vsrl_vv_u32m1(__VA_ARGS__)
+#define vsrl_vx_u32m1(...) __riscv_vsrl_vx_u32m1(__VA_ARGS__)
+#define vsrl_vv_u32m2(...) __riscv_vsrl_vv_u32m2(__VA_ARGS__)
+#define vsrl_vx_u32m2(...) __riscv_vsrl_vx_u32m2(__VA_ARGS__)
+#define vsrl_vv_u32m4(...) __riscv_vsrl_vv_u32m4(__VA_ARGS__)
+#define vsrl_vx_u32m4(...) __riscv_vsrl_vx_u32m4(__VA_ARGS__)
+#define vsrl_vv_u32m8(...) __riscv_vsrl_vv_u32m8(__VA_ARGS__)
+#define vsrl_vx_u32m8(...) __riscv_vsrl_vx_u32m8(__VA_ARGS__)
+#define vsrl_vv_u64m1(...) __riscv_vsrl_vv_u64m1(__VA_ARGS__)
+#define vsrl_vx_u64m1(...) __riscv_vsrl_vx_u64m1(__VA_ARGS__)
+#define vsrl_vv_u64m2(...) __riscv_vsrl_vv_u64m2(__VA_ARGS__)
+#define vsrl_vx_u64m2(...) __riscv_vsrl_vx_u64m2(__VA_ARGS__)
+#define vsrl_vv_u64m4(...) __riscv_vsrl_vv_u64m4(__VA_ARGS__)
+#define vsrl_vx_u64m4(...) __riscv_vsrl_vx_u64m4(__VA_ARGS__)
+#define vsrl_vv_u64m8(...) __riscv_vsrl_vv_u64m8(__VA_ARGS__)
+#define vsrl_vx_u64m8(...) __riscv_vsrl_vx_u64m8(__VA_ARGS__)
+// masked functions: each vsll/vsra/vsrl *_m name forwards its arguments to the matching __riscv_*_tumu intrinsic
+#define vsll_vv_i8mf8_m(...) __riscv_vsll_vv_i8mf8_tumu(__VA_ARGS__)
+#define vsll_vx_i8mf8_m(...) __riscv_vsll_vx_i8mf8_tumu(__VA_ARGS__)
+#define vsll_vv_i8mf4_m(...) __riscv_vsll_vv_i8mf4_tumu(__VA_ARGS__)
+#define vsll_vx_i8mf4_m(...) __riscv_vsll_vx_i8mf4_tumu(__VA_ARGS__)
+#define vsll_vv_i8mf2_m(...) __riscv_vsll_vv_i8mf2_tumu(__VA_ARGS__)
+#define vsll_vx_i8mf2_m(...) __riscv_vsll_vx_i8mf2_tumu(__VA_ARGS__)
+#define vsll_vv_i8m1_m(...) __riscv_vsll_vv_i8m1_tumu(__VA_ARGS__)
+#define vsll_vx_i8m1_m(...) __riscv_vsll_vx_i8m1_tumu(__VA_ARGS__)
+#define vsll_vv_i8m2_m(...) __riscv_vsll_vv_i8m2_tumu(__VA_ARGS__)
+#define vsll_vx_i8m2_m(...) __riscv_vsll_vx_i8m2_tumu(__VA_ARGS__)
+#define vsll_vv_i8m4_m(...) __riscv_vsll_vv_i8m4_tumu(__VA_ARGS__)
+#define vsll_vx_i8m4_m(...) __riscv_vsll_vx_i8m4_tumu(__VA_ARGS__)
+#define vsll_vv_i8m8_m(...) __riscv_vsll_vv_i8m8_tumu(__VA_ARGS__)
+#define vsll_vx_i8m8_m(...) __riscv_vsll_vx_i8m8_tumu(__VA_ARGS__)
+#define vsll_vv_i16mf4_m(...) __riscv_vsll_vv_i16mf4_tumu(__VA_ARGS__)
+#define vsll_vx_i16mf4_m(...) __riscv_vsll_vx_i16mf4_tumu(__VA_ARGS__)
+#define vsll_vv_i16mf2_m(...) __riscv_vsll_vv_i16mf2_tumu(__VA_ARGS__)
+#define vsll_vx_i16mf2_m(...) __riscv_vsll_vx_i16mf2_tumu(__VA_ARGS__)
+#define vsll_vv_i16m1_m(...) __riscv_vsll_vv_i16m1_tumu(__VA_ARGS__)
+#define vsll_vx_i16m1_m(...) __riscv_vsll_vx_i16m1_tumu(__VA_ARGS__)
+#define vsll_vv_i16m2_m(...) __riscv_vsll_vv_i16m2_tumu(__VA_ARGS__)
+#define vsll_vx_i16m2_m(...) __riscv_vsll_vx_i16m2_tumu(__VA_ARGS__)
+#define vsll_vv_i16m4_m(...) __riscv_vsll_vv_i16m4_tumu(__VA_ARGS__)
+#define vsll_vx_i16m4_m(...) __riscv_vsll_vx_i16m4_tumu(__VA_ARGS__)
+#define vsll_vv_i16m8_m(...) __riscv_vsll_vv_i16m8_tumu(__VA_ARGS__)
+#define vsll_vx_i16m8_m(...) __riscv_vsll_vx_i16m8_tumu(__VA_ARGS__)
+#define vsll_vv_i32mf2_m(...) __riscv_vsll_vv_i32mf2_tumu(__VA_ARGS__)
+#define vsll_vx_i32mf2_m(...) __riscv_vsll_vx_i32mf2_tumu(__VA_ARGS__)
+#define vsll_vv_i32m1_m(...) __riscv_vsll_vv_i32m1_tumu(__VA_ARGS__)
+#define vsll_vx_i32m1_m(...) __riscv_vsll_vx_i32m1_tumu(__VA_ARGS__)
+#define vsll_vv_i32m2_m(...) __riscv_vsll_vv_i32m2_tumu(__VA_ARGS__)
+#define vsll_vx_i32m2_m(...) __riscv_vsll_vx_i32m2_tumu(__VA_ARGS__)
+#define vsll_vv_i32m4_m(...) __riscv_vsll_vv_i32m4_tumu(__VA_ARGS__)
+#define vsll_vx_i32m4_m(...) __riscv_vsll_vx_i32m4_tumu(__VA_ARGS__)
+#define vsll_vv_i32m8_m(...) __riscv_vsll_vv_i32m8_tumu(__VA_ARGS__)
+#define vsll_vx_i32m8_m(...) __riscv_vsll_vx_i32m8_tumu(__VA_ARGS__)
+#define vsll_vv_i64m1_m(...) __riscv_vsll_vv_i64m1_tumu(__VA_ARGS__)
+#define vsll_vx_i64m1_m(...) __riscv_vsll_vx_i64m1_tumu(__VA_ARGS__)
+#define vsll_vv_i64m2_m(...) __riscv_vsll_vv_i64m2_tumu(__VA_ARGS__)
+#define vsll_vx_i64m2_m(...) __riscv_vsll_vx_i64m2_tumu(__VA_ARGS__)
+#define vsll_vv_i64m4_m(...) __riscv_vsll_vv_i64m4_tumu(__VA_ARGS__)
+#define vsll_vx_i64m4_m(...) __riscv_vsll_vx_i64m4_tumu(__VA_ARGS__)
+#define vsll_vv_i64m8_m(...) __riscv_vsll_vv_i64m8_tumu(__VA_ARGS__)
+#define vsll_vx_i64m8_m(...) __riscv_vsll_vx_i64m8_tumu(__VA_ARGS__)
+#define vsra_vv_i8mf8_m(...) __riscv_vsra_vv_i8mf8_tumu(__VA_ARGS__) // masked vsra, type suffixes i8mf8..i64m8: each *_m name forwards its arguments to __riscv_vsra_*_tumu
+#define vsra_vx_i8mf8_m(...) __riscv_vsra_vx_i8mf8_tumu(__VA_ARGS__)
+#define vsra_vv_i8mf4_m(...) __riscv_vsra_vv_i8mf4_tumu(__VA_ARGS__)
+#define vsra_vx_i8mf4_m(...) __riscv_vsra_vx_i8mf4_tumu(__VA_ARGS__)
+#define vsra_vv_i8mf2_m(...) __riscv_vsra_vv_i8mf2_tumu(__VA_ARGS__)
+#define vsra_vx_i8mf2_m(...) __riscv_vsra_vx_i8mf2_tumu(__VA_ARGS__)
+#define vsra_vv_i8m1_m(...) __riscv_vsra_vv_i8m1_tumu(__VA_ARGS__)
+#define vsra_vx_i8m1_m(...) __riscv_vsra_vx_i8m1_tumu(__VA_ARGS__)
+#define vsra_vv_i8m2_m(...) __riscv_vsra_vv_i8m2_tumu(__VA_ARGS__)
+#define vsra_vx_i8m2_m(...) __riscv_vsra_vx_i8m2_tumu(__VA_ARGS__)
+#define vsra_vv_i8m4_m(...) __riscv_vsra_vv_i8m4_tumu(__VA_ARGS__)
+#define vsra_vx_i8m4_m(...) __riscv_vsra_vx_i8m4_tumu(__VA_ARGS__)
+#define vsra_vv_i8m8_m(...) __riscv_vsra_vv_i8m8_tumu(__VA_ARGS__)
+#define vsra_vx_i8m8_m(...) __riscv_vsra_vx_i8m8_tumu(__VA_ARGS__)
+#define vsra_vv_i16mf4_m(...) __riscv_vsra_vv_i16mf4_tumu(__VA_ARGS__)
+#define vsra_vx_i16mf4_m(...) __riscv_vsra_vx_i16mf4_tumu(__VA_ARGS__)
+#define vsra_vv_i16mf2_m(...) __riscv_vsra_vv_i16mf2_tumu(__VA_ARGS__)
+#define vsra_vx_i16mf2_m(...) __riscv_vsra_vx_i16mf2_tumu(__VA_ARGS__)
+#define vsra_vv_i16m1_m(...) __riscv_vsra_vv_i16m1_tumu(__VA_ARGS__)
+#define vsra_vx_i16m1_m(...) __riscv_vsra_vx_i16m1_tumu(__VA_ARGS__)
+#define vsra_vv_i16m2_m(...) __riscv_vsra_vv_i16m2_tumu(__VA_ARGS__)
+#define vsra_vx_i16m2_m(...) __riscv_vsra_vx_i16m2_tumu(__VA_ARGS__)
+#define vsra_vv_i16m4_m(...) __riscv_vsra_vv_i16m4_tumu(__VA_ARGS__)
+#define vsra_vx_i16m4_m(...) __riscv_vsra_vx_i16m4_tumu(__VA_ARGS__)
+#define vsra_vv_i16m8_m(...) __riscv_vsra_vv_i16m8_tumu(__VA_ARGS__)
+#define vsra_vx_i16m8_m(...) __riscv_vsra_vx_i16m8_tumu(__VA_ARGS__)
+#define vsra_vv_i32mf2_m(...) __riscv_vsra_vv_i32mf2_tumu(__VA_ARGS__)
+#define vsra_vx_i32mf2_m(...) __riscv_vsra_vx_i32mf2_tumu(__VA_ARGS__)
+#define vsra_vv_i32m1_m(...) __riscv_vsra_vv_i32m1_tumu(__VA_ARGS__)
+#define vsra_vx_i32m1_m(...) __riscv_vsra_vx_i32m1_tumu(__VA_ARGS__)
+#define vsra_vv_i32m2_m(...) __riscv_vsra_vv_i32m2_tumu(__VA_ARGS__)
+#define vsra_vx_i32m2_m(...) __riscv_vsra_vx_i32m2_tumu(__VA_ARGS__)
+#define vsra_vv_i32m4_m(...) __riscv_vsra_vv_i32m4_tumu(__VA_ARGS__)
+#define vsra_vx_i32m4_m(...) __riscv_vsra_vx_i32m4_tumu(__VA_ARGS__)
+#define vsra_vv_i32m8_m(...) __riscv_vsra_vv_i32m8_tumu(__VA_ARGS__)
+#define vsra_vx_i32m8_m(...) __riscv_vsra_vx_i32m8_tumu(__VA_ARGS__)
+#define vsra_vv_i64m1_m(...) __riscv_vsra_vv_i64m1_tumu(__VA_ARGS__)
+#define vsra_vx_i64m1_m(...) __riscv_vsra_vx_i64m1_tumu(__VA_ARGS__)
+#define vsra_vv_i64m2_m(...) __riscv_vsra_vv_i64m2_tumu(__VA_ARGS__)
+#define vsra_vx_i64m2_m(...) __riscv_vsra_vx_i64m2_tumu(__VA_ARGS__)
+#define vsra_vv_i64m4_m(...) __riscv_vsra_vv_i64m4_tumu(__VA_ARGS__)
+#define vsra_vx_i64m4_m(...) __riscv_vsra_vx_i64m4_tumu(__VA_ARGS__)
+#define vsra_vv_i64m8_m(...) __riscv_vsra_vv_i64m8_tumu(__VA_ARGS__)
+#define vsra_vx_i64m8_m(...) __riscv_vsra_vx_i64m8_tumu(__VA_ARGS__)
+#define vsll_vv_u8mf8_m(...) __riscv_vsll_vv_u8mf8_tumu(__VA_ARGS__) // masked vsll, type suffixes u8mf8..u64m8: each *_m name forwards its arguments to __riscv_vsll_*_tumu
+#define vsll_vx_u8mf8_m(...) __riscv_vsll_vx_u8mf8_tumu(__VA_ARGS__)
+#define vsll_vv_u8mf4_m(...) __riscv_vsll_vv_u8mf4_tumu(__VA_ARGS__)
+#define vsll_vx_u8mf4_m(...) __riscv_vsll_vx_u8mf4_tumu(__VA_ARGS__)
+#define vsll_vv_u8mf2_m(...) __riscv_vsll_vv_u8mf2_tumu(__VA_ARGS__)
+#define vsll_vx_u8mf2_m(...) __riscv_vsll_vx_u8mf2_tumu(__VA_ARGS__)
+#define vsll_vv_u8m1_m(...) __riscv_vsll_vv_u8m1_tumu(__VA_ARGS__)
+#define vsll_vx_u8m1_m(...) __riscv_vsll_vx_u8m1_tumu(__VA_ARGS__)
+#define vsll_vv_u8m2_m(...) __riscv_vsll_vv_u8m2_tumu(__VA_ARGS__)
+#define vsll_vx_u8m2_m(...) __riscv_vsll_vx_u8m2_tumu(__VA_ARGS__)
+#define vsll_vv_u8m4_m(...) __riscv_vsll_vv_u8m4_tumu(__VA_ARGS__)
+#define vsll_vx_u8m4_m(...) __riscv_vsll_vx_u8m4_tumu(__VA_ARGS__)
+#define vsll_vv_u8m8_m(...) __riscv_vsll_vv_u8m8_tumu(__VA_ARGS__)
+#define vsll_vx_u8m8_m(...) __riscv_vsll_vx_u8m8_tumu(__VA_ARGS__)
+#define vsll_vv_u16mf4_m(...) __riscv_vsll_vv_u16mf4_tumu(__VA_ARGS__)
+#define vsll_vx_u16mf4_m(...) __riscv_vsll_vx_u16mf4_tumu(__VA_ARGS__)
+#define vsll_vv_u16mf2_m(...) __riscv_vsll_vv_u16mf2_tumu(__VA_ARGS__)
+#define vsll_vx_u16mf2_m(...) __riscv_vsll_vx_u16mf2_tumu(__VA_ARGS__)
+#define vsll_vv_u16m1_m(...) __riscv_vsll_vv_u16m1_tumu(__VA_ARGS__)
+#define vsll_vx_u16m1_m(...) __riscv_vsll_vx_u16m1_tumu(__VA_ARGS__)
+#define vsll_vv_u16m2_m(...) __riscv_vsll_vv_u16m2_tumu(__VA_ARGS__)
+#define vsll_vx_u16m2_m(...) __riscv_vsll_vx_u16m2_tumu(__VA_ARGS__)
+#define vsll_vv_u16m4_m(...) __riscv_vsll_vv_u16m4_tumu(__VA_ARGS__)
+#define vsll_vx_u16m4_m(...) __riscv_vsll_vx_u16m4_tumu(__VA_ARGS__)
+#define vsll_vv_u16m8_m(...) __riscv_vsll_vv_u16m8_tumu(__VA_ARGS__)
+#define vsll_vx_u16m8_m(...) __riscv_vsll_vx_u16m8_tumu(__VA_ARGS__)
+#define vsll_vv_u32mf2_m(...) __riscv_vsll_vv_u32mf2_tumu(__VA_ARGS__)
+#define vsll_vx_u32mf2_m(...) __riscv_vsll_vx_u32mf2_tumu(__VA_ARGS__)
+#define vsll_vv_u32m1_m(...) __riscv_vsll_vv_u32m1_tumu(__VA_ARGS__)
+#define vsll_vx_u32m1_m(...) __riscv_vsll_vx_u32m1_tumu(__VA_ARGS__)
+#define vsll_vv_u32m2_m(...) __riscv_vsll_vv_u32m2_tumu(__VA_ARGS__)
+#define vsll_vx_u32m2_m(...) __riscv_vsll_vx_u32m2_tumu(__VA_ARGS__)
+#define vsll_vv_u32m4_m(...) __riscv_vsll_vv_u32m4_tumu(__VA_ARGS__)
+#define vsll_vx_u32m4_m(...) __riscv_vsll_vx_u32m4_tumu(__VA_ARGS__)
+#define vsll_vv_u32m8_m(...) __riscv_vsll_vv_u32m8_tumu(__VA_ARGS__)
+#define vsll_vx_u32m8_m(...) __riscv_vsll_vx_u32m8_tumu(__VA_ARGS__)
+#define vsll_vv_u64m1_m(...) __riscv_vsll_vv_u64m1_tumu(__VA_ARGS__)
+#define vsll_vx_u64m1_m(...) __riscv_vsll_vx_u64m1_tumu(__VA_ARGS__)
+#define vsll_vv_u64m2_m(...) __riscv_vsll_vv_u64m2_tumu(__VA_ARGS__)
+#define vsll_vx_u64m2_m(...) __riscv_vsll_vx_u64m2_tumu(__VA_ARGS__)
+#define vsll_vv_u64m4_m(...) __riscv_vsll_vv_u64m4_tumu(__VA_ARGS__)
+#define vsll_vx_u64m4_m(...) __riscv_vsll_vx_u64m4_tumu(__VA_ARGS__)
+#define vsll_vv_u64m8_m(...) __riscv_vsll_vv_u64m8_tumu(__VA_ARGS__)
+#define vsll_vx_u64m8_m(...) __riscv_vsll_vx_u64m8_tumu(__VA_ARGS__)
+#define vsrl_vv_u8mf8_m(...) __riscv_vsrl_vv_u8mf8_tumu(__VA_ARGS__) // masked vsrl, type suffixes u8mf8..u64m8: each *_m name forwards its arguments to __riscv_vsrl_*_tumu
+#define vsrl_vx_u8mf8_m(...) __riscv_vsrl_vx_u8mf8_tumu(__VA_ARGS__)
+#define vsrl_vv_u8mf4_m(...) __riscv_vsrl_vv_u8mf4_tumu(__VA_ARGS__)
+#define vsrl_vx_u8mf4_m(...) __riscv_vsrl_vx_u8mf4_tumu(__VA_ARGS__)
+#define vsrl_vv_u8mf2_m(...) __riscv_vsrl_vv_u8mf2_tumu(__VA_ARGS__)
+#define vsrl_vx_u8mf2_m(...) __riscv_vsrl_vx_u8mf2_tumu(__VA_ARGS__)
+#define vsrl_vv_u8m1_m(...) __riscv_vsrl_vv_u8m1_tumu(__VA_ARGS__)
+#define vsrl_vx_u8m1_m(...) __riscv_vsrl_vx_u8m1_tumu(__VA_ARGS__)
+#define vsrl_vv_u8m2_m(...) __riscv_vsrl_vv_u8m2_tumu(__VA_ARGS__)
+#define vsrl_vx_u8m2_m(...) __riscv_vsrl_vx_u8m2_tumu(__VA_ARGS__)
+#define vsrl_vv_u8m4_m(...) __riscv_vsrl_vv_u8m4_tumu(__VA_ARGS__)
+#define vsrl_vx_u8m4_m(...) __riscv_vsrl_vx_u8m4_tumu(__VA_ARGS__)
+#define vsrl_vv_u8m8_m(...) __riscv_vsrl_vv_u8m8_tumu(__VA_ARGS__)
+#define vsrl_vx_u8m8_m(...) __riscv_vsrl_vx_u8m8_tumu(__VA_ARGS__)
+#define vsrl_vv_u16mf4_m(...) __riscv_vsrl_vv_u16mf4_tumu(__VA_ARGS__)
+#define vsrl_vx_u16mf4_m(...) __riscv_vsrl_vx_u16mf4_tumu(__VA_ARGS__)
+#define vsrl_vv_u16mf2_m(...) __riscv_vsrl_vv_u16mf2_tumu(__VA_ARGS__)
+#define vsrl_vx_u16mf2_m(...) __riscv_vsrl_vx_u16mf2_tumu(__VA_ARGS__)
+#define vsrl_vv_u16m1_m(...) __riscv_vsrl_vv_u16m1_tumu(__VA_ARGS__)
+#define vsrl_vx_u16m1_m(...) __riscv_vsrl_vx_u16m1_tumu(__VA_ARGS__)
+#define vsrl_vv_u16m2_m(...) __riscv_vsrl_vv_u16m2_tumu(__VA_ARGS__)
+#define vsrl_vx_u16m2_m(...) __riscv_vsrl_vx_u16m2_tumu(__VA_ARGS__)
+#define vsrl_vv_u16m4_m(...) __riscv_vsrl_vv_u16m4_tumu(__VA_ARGS__)
+#define vsrl_vx_u16m4_m(...) __riscv_vsrl_vx_u16m4_tumu(__VA_ARGS__)
+#define vsrl_vv_u16m8_m(...) __riscv_vsrl_vv_u16m8_tumu(__VA_ARGS__)
+#define vsrl_vx_u16m8_m(...) __riscv_vsrl_vx_u16m8_tumu(__VA_ARGS__)
+#define vsrl_vv_u32mf2_m(...) __riscv_vsrl_vv_u32mf2_tumu(__VA_ARGS__)
+#define vsrl_vx_u32mf2_m(...) __riscv_vsrl_vx_u32mf2_tumu(__VA_ARGS__)
+#define vsrl_vv_u32m1_m(...) __riscv_vsrl_vv_u32m1_tumu(__VA_ARGS__)
+#define vsrl_vx_u32m1_m(...) __riscv_vsrl_vx_u32m1_tumu(__VA_ARGS__)
+#define vsrl_vv_u32m2_m(...) __riscv_vsrl_vv_u32m2_tumu(__VA_ARGS__)
+#define vsrl_vx_u32m2_m(...) __riscv_vsrl_vx_u32m2_tumu(__VA_ARGS__)
+#define vsrl_vv_u32m4_m(...) __riscv_vsrl_vv_u32m4_tumu(__VA_ARGS__)
+#define vsrl_vx_u32m4_m(...) __riscv_vsrl_vx_u32m4_tumu(__VA_ARGS__)
+#define vsrl_vv_u32m8_m(...) __riscv_vsrl_vv_u32m8_tumu(__VA_ARGS__)
+#define vsrl_vx_u32m8_m(...) __riscv_vsrl_vx_u32m8_tumu(__VA_ARGS__)
+#define vsrl_vv_u64m1_m(...) __riscv_vsrl_vv_u64m1_tumu(__VA_ARGS__)
+#define vsrl_vx_u64m1_m(...) __riscv_vsrl_vx_u64m1_tumu(__VA_ARGS__)
+#define vsrl_vv_u64m2_m(...) __riscv_vsrl_vv_u64m2_tumu(__VA_ARGS__)
+#define vsrl_vx_u64m2_m(...) __riscv_vsrl_vx_u64m2_tumu(__VA_ARGS__)
+#define vsrl_vv_u64m4_m(...) __riscv_vsrl_vv_u64m4_tumu(__VA_ARGS__)
+#define vsrl_vx_u64m4_m(...) __riscv_vsrl_vx_u64m4_tumu(__VA_ARGS__)
+#define vsrl_vv_u64m8_m(...) __riscv_vsrl_vv_u64m8_tumu(__VA_ARGS__)
+#define vsrl_vx_u64m8_m(...) __riscv_vsrl_vx_u64m8_tumu(__VA_ARGS__)
+#define vnsra_wv_i8mf8(...) __riscv_vnsra_wv_i8mf8(__VA_ARGS__) // vnsra, type suffixes i8mf8..i32m4: each unprefixed name forwards its arguments to __riscv_vnsra_*
+#define vnsra_wx_i8mf8(...) __riscv_vnsra_wx_i8mf8(__VA_ARGS__)
+#define vnsra_wv_i8mf4(...) __riscv_vnsra_wv_i8mf4(__VA_ARGS__)
+#define vnsra_wx_i8mf4(...) __riscv_vnsra_wx_i8mf4(__VA_ARGS__)
+#define vnsra_wv_i8mf2(...) __riscv_vnsra_wv_i8mf2(__VA_ARGS__)
+#define vnsra_wx_i8mf2(...) __riscv_vnsra_wx_i8mf2(__VA_ARGS__)
+#define vnsra_wv_i8m1(...) __riscv_vnsra_wv_i8m1(__VA_ARGS__)
+#define vnsra_wx_i8m1(...) __riscv_vnsra_wx_i8m1(__VA_ARGS__)
+#define vnsra_wv_i8m2(...) __riscv_vnsra_wv_i8m2(__VA_ARGS__)
+#define vnsra_wx_i8m2(...) __riscv_vnsra_wx_i8m2(__VA_ARGS__)
+#define vnsra_wv_i8m4(...) __riscv_vnsra_wv_i8m4(__VA_ARGS__)
+#define vnsra_wx_i8m4(...) __riscv_vnsra_wx_i8m4(__VA_ARGS__)
+#define vnsra_wv_i16mf4(...) __riscv_vnsra_wv_i16mf4(__VA_ARGS__)
+#define vnsra_wx_i16mf4(...) __riscv_vnsra_wx_i16mf4(__VA_ARGS__)
+#define vnsra_wv_i16mf2(...) __riscv_vnsra_wv_i16mf2(__VA_ARGS__)
+#define vnsra_wx_i16mf2(...) __riscv_vnsra_wx_i16mf2(__VA_ARGS__)
+#define vnsra_wv_i16m1(...) __riscv_vnsra_wv_i16m1(__VA_ARGS__)
+#define vnsra_wx_i16m1(...) __riscv_vnsra_wx_i16m1(__VA_ARGS__)
+#define vnsra_wv_i16m2(...) __riscv_vnsra_wv_i16m2(__VA_ARGS__)
+#define vnsra_wx_i16m2(...) __riscv_vnsra_wx_i16m2(__VA_ARGS__)
+#define vnsra_wv_i16m4(...) __riscv_vnsra_wv_i16m4(__VA_ARGS__)
+#define vnsra_wx_i16m4(...) __riscv_vnsra_wx_i16m4(__VA_ARGS__)
+#define vnsra_wv_i32mf2(...) __riscv_vnsra_wv_i32mf2(__VA_ARGS__)
+#define vnsra_wx_i32mf2(...) __riscv_vnsra_wx_i32mf2(__VA_ARGS__)
+#define vnsra_wv_i32m1(...) __riscv_vnsra_wv_i32m1(__VA_ARGS__)
+#define vnsra_wx_i32m1(...) __riscv_vnsra_wx_i32m1(__VA_ARGS__)
+#define vnsra_wv_i32m2(...) __riscv_vnsra_wv_i32m2(__VA_ARGS__)
+#define vnsra_wx_i32m2(...) __riscv_vnsra_wx_i32m2(__VA_ARGS__)
+#define vnsra_wv_i32m4(...) __riscv_vnsra_wv_i32m4(__VA_ARGS__)
+#define vnsra_wx_i32m4(...) __riscv_vnsra_wx_i32m4(__VA_ARGS__)
+#define vnsrl_wv_u8mf8(...) __riscv_vnsrl_wv_u8mf8(__VA_ARGS__) // vnsrl, type suffixes u8mf8..u32m4: each unprefixed name forwards its arguments to __riscv_vnsrl_*
+#define vnsrl_wx_u8mf8(...) __riscv_vnsrl_wx_u8mf8(__VA_ARGS__)
+#define vnsrl_wv_u8mf4(...) __riscv_vnsrl_wv_u8mf4(__VA_ARGS__)
+#define vnsrl_wx_u8mf4(...) __riscv_vnsrl_wx_u8mf4(__VA_ARGS__)
+#define vnsrl_wv_u8mf2(...) __riscv_vnsrl_wv_u8mf2(__VA_ARGS__)
+#define vnsrl_wx_u8mf2(...) __riscv_vnsrl_wx_u8mf2(__VA_ARGS__)
+#define vnsrl_wv_u8m1(...) __riscv_vnsrl_wv_u8m1(__VA_ARGS__)
+#define vnsrl_wx_u8m1(...) __riscv_vnsrl_wx_u8m1(__VA_ARGS__)
+#define vnsrl_wv_u8m2(...) __riscv_vnsrl_wv_u8m2(__VA_ARGS__)
+#define vnsrl_wx_u8m2(...) __riscv_vnsrl_wx_u8m2(__VA_ARGS__)
+#define vnsrl_wv_u8m4(...) __riscv_vnsrl_wv_u8m4(__VA_ARGS__)
+#define vnsrl_wx_u8m4(...) __riscv_vnsrl_wx_u8m4(__VA_ARGS__)
+#define vnsrl_wv_u16mf4(...) __riscv_vnsrl_wv_u16mf4(__VA_ARGS__)
+#define vnsrl_wx_u16mf4(...) __riscv_vnsrl_wx_u16mf4(__VA_ARGS__)
+#define vnsrl_wv_u16mf2(...) __riscv_vnsrl_wv_u16mf2(__VA_ARGS__)
+#define vnsrl_wx_u16mf2(...) __riscv_vnsrl_wx_u16mf2(__VA_ARGS__)
+#define vnsrl_wv_u16m1(...) __riscv_vnsrl_wv_u16m1(__VA_ARGS__)
+#define vnsrl_wx_u16m1(...) __riscv_vnsrl_wx_u16m1(__VA_ARGS__)
+#define vnsrl_wv_u16m2(...) __riscv_vnsrl_wv_u16m2(__VA_ARGS__)
+#define vnsrl_wx_u16m2(...) __riscv_vnsrl_wx_u16m2(__VA_ARGS__)
+#define vnsrl_wv_u16m4(...) __riscv_vnsrl_wv_u16m4(__VA_ARGS__)
+#define vnsrl_wx_u16m4(...) __riscv_vnsrl_wx_u16m4(__VA_ARGS__)
+#define vnsrl_wv_u32mf2(...) __riscv_vnsrl_wv_u32mf2(__VA_ARGS__)
+#define vnsrl_wx_u32mf2(...) __riscv_vnsrl_wx_u32mf2(__VA_ARGS__)
+#define vnsrl_wv_u32m1(...) __riscv_vnsrl_wv_u32m1(__VA_ARGS__)
+#define vnsrl_wx_u32m1(...) __riscv_vnsrl_wx_u32m1(__VA_ARGS__)
+#define vnsrl_wv_u32m2(...) __riscv_vnsrl_wv_u32m2(__VA_ARGS__)
+#define vnsrl_wx_u32m2(...) __riscv_vnsrl_wx_u32m2(__VA_ARGS__)
+#define vnsrl_wv_u32m4(...) __riscv_vnsrl_wv_u32m4(__VA_ARGS__)
+#define vnsrl_wx_u32m4(...) __riscv_vnsrl_wx_u32m4(__VA_ARGS__)
+// masked functions: each vnsra/vnsrl *_m name forwards its arguments to the matching __riscv_*_tumu intrinsic
+#define vnsra_wv_i8mf8_m(...) __riscv_vnsra_wv_i8mf8_tumu(__VA_ARGS__)
+#define vnsra_wx_i8mf8_m(...) __riscv_vnsra_wx_i8mf8_tumu(__VA_ARGS__)
+#define vnsra_wv_i8mf4_m(...) __riscv_vnsra_wv_i8mf4_tumu(__VA_ARGS__)
+#define vnsra_wx_i8mf4_m(...) __riscv_vnsra_wx_i8mf4_tumu(__VA_ARGS__)
+#define vnsra_wv_i8mf2_m(...) __riscv_vnsra_wv_i8mf2_tumu(__VA_ARGS__)
+#define vnsra_wx_i8mf2_m(...) __riscv_vnsra_wx_i8mf2_tumu(__VA_ARGS__)
+#define vnsra_wv_i8m1_m(...) __riscv_vnsra_wv_i8m1_tumu(__VA_ARGS__)
+#define vnsra_wx_i8m1_m(...) __riscv_vnsra_wx_i8m1_tumu(__VA_ARGS__)
+#define vnsra_wv_i8m2_m(...) __riscv_vnsra_wv_i8m2_tumu(__VA_ARGS__)
+#define vnsra_wx_i8m2_m(...) __riscv_vnsra_wx_i8m2_tumu(__VA_ARGS__)
+#define vnsra_wv_i8m4_m(...) __riscv_vnsra_wv_i8m4_tumu(__VA_ARGS__)
+#define vnsra_wx_i8m4_m(...) __riscv_vnsra_wx_i8m4_tumu(__VA_ARGS__)
+#define vnsra_wv_i16mf4_m(...) __riscv_vnsra_wv_i16mf4_tumu(__VA_ARGS__)
+#define vnsra_wx_i16mf4_m(...) __riscv_vnsra_wx_i16mf4_tumu(__VA_ARGS__)
+#define vnsra_wv_i16mf2_m(...) __riscv_vnsra_wv_i16mf2_tumu(__VA_ARGS__)
+#define vnsra_wx_i16mf2_m(...) __riscv_vnsra_wx_i16mf2_tumu(__VA_ARGS__)
+#define vnsra_wv_i16m1_m(...) __riscv_vnsra_wv_i16m1_tumu(__VA_ARGS__)
+#define vnsra_wx_i16m1_m(...) __riscv_vnsra_wx_i16m1_tumu(__VA_ARGS__)
+#define vnsra_wv_i16m2_m(...) __riscv_vnsra_wv_i16m2_tumu(__VA_ARGS__)
+#define vnsra_wx_i16m2_m(...) __riscv_vnsra_wx_i16m2_tumu(__VA_ARGS__)
+#define vnsra_wv_i16m4_m(...) __riscv_vnsra_wv_i16m4_tumu(__VA_ARGS__)
+#define vnsra_wx_i16m4_m(...) __riscv_vnsra_wx_i16m4_tumu(__VA_ARGS__)
+#define vnsra_wv_i32mf2_m(...) __riscv_vnsra_wv_i32mf2_tumu(__VA_ARGS__)
+#define vnsra_wx_i32mf2_m(...) __riscv_vnsra_wx_i32mf2_tumu(__VA_ARGS__)
+#define vnsra_wv_i32m1_m(...) __riscv_vnsra_wv_i32m1_tumu(__VA_ARGS__)
+#define vnsra_wx_i32m1_m(...) __riscv_vnsra_wx_i32m1_tumu(__VA_ARGS__)
+#define vnsra_wv_i32m2_m(...) __riscv_vnsra_wv_i32m2_tumu(__VA_ARGS__)
+#define vnsra_wx_i32m2_m(...) __riscv_vnsra_wx_i32m2_tumu(__VA_ARGS__)
+#define vnsra_wv_i32m4_m(...) __riscv_vnsra_wv_i32m4_tumu(__VA_ARGS__)
+#define vnsra_wx_i32m4_m(...) __riscv_vnsra_wx_i32m4_tumu(__VA_ARGS__)
+#define vnsrl_wv_u8mf8_m(...) __riscv_vnsrl_wv_u8mf8_tumu(__VA_ARGS__) // masked vnsrl, type suffixes u8mf8..u32m4: each *_m name forwards its arguments to __riscv_vnsrl_*_tumu
+#define vnsrl_wx_u8mf8_m(...) __riscv_vnsrl_wx_u8mf8_tumu(__VA_ARGS__)
+#define vnsrl_wv_u8mf4_m(...) __riscv_vnsrl_wv_u8mf4_tumu(__VA_ARGS__)
+#define vnsrl_wx_u8mf4_m(...) __riscv_vnsrl_wx_u8mf4_tumu(__VA_ARGS__)
+#define vnsrl_wv_u8mf2_m(...) __riscv_vnsrl_wv_u8mf2_tumu(__VA_ARGS__)
+#define vnsrl_wx_u8mf2_m(...) __riscv_vnsrl_wx_u8mf2_tumu(__VA_ARGS__)
+#define vnsrl_wv_u8m1_m(...) __riscv_vnsrl_wv_u8m1_tumu(__VA_ARGS__)
+#define vnsrl_wx_u8m1_m(...) __riscv_vnsrl_wx_u8m1_tumu(__VA_ARGS__)
+#define vnsrl_wv_u8m2_m(...) __riscv_vnsrl_wv_u8m2_tumu(__VA_ARGS__)
+#define vnsrl_wx_u8m2_m(...) __riscv_vnsrl_wx_u8m2_tumu(__VA_ARGS__)
+#define vnsrl_wv_u8m4_m(...) __riscv_vnsrl_wv_u8m4_tumu(__VA_ARGS__)
+#define vnsrl_wx_u8m4_m(...) __riscv_vnsrl_wx_u8m4_tumu(__VA_ARGS__)
+#define vnsrl_wv_u16mf4_m(...) __riscv_vnsrl_wv_u16mf4_tumu(__VA_ARGS__)
+#define vnsrl_wx_u16mf4_m(...) __riscv_vnsrl_wx_u16mf4_tumu(__VA_ARGS__)
+#define vnsrl_wv_u16mf2_m(...) __riscv_vnsrl_wv_u16mf2_tumu(__VA_ARGS__)
+#define vnsrl_wx_u16mf2_m(...) __riscv_vnsrl_wx_u16mf2_tumu(__VA_ARGS__)
+#define vnsrl_wv_u16m1_m(...) __riscv_vnsrl_wv_u16m1_tumu(__VA_ARGS__)
+#define vnsrl_wx_u16m1_m(...) __riscv_vnsrl_wx_u16m1_tumu(__VA_ARGS__)
+#define vnsrl_wv_u16m2_m(...) __riscv_vnsrl_wv_u16m2_tumu(__VA_ARGS__)
+#define vnsrl_wx_u16m2_m(...) __riscv_vnsrl_wx_u16m2_tumu(__VA_ARGS__)
+#define vnsrl_wv_u16m4_m(...) __riscv_vnsrl_wv_u16m4_tumu(__VA_ARGS__)
+#define vnsrl_wx_u16m4_m(...) __riscv_vnsrl_wx_u16m4_tumu(__VA_ARGS__)
+#define vnsrl_wv_u32mf2_m(...) __riscv_vnsrl_wv_u32mf2_tumu(__VA_ARGS__)
+#define vnsrl_wx_u32mf2_m(...) __riscv_vnsrl_wx_u32mf2_tumu(__VA_ARGS__)
+#define vnsrl_wv_u32m1_m(...) __riscv_vnsrl_wv_u32m1_tumu(__VA_ARGS__)
+#define vnsrl_wx_u32m1_m(...) __riscv_vnsrl_wx_u32m1_tumu(__VA_ARGS__)
+#define vnsrl_wv_u32m2_m(...) __riscv_vnsrl_wv_u32m2_tumu(__VA_ARGS__)
+#define vnsrl_wx_u32m2_m(...) __riscv_vnsrl_wx_u32m2_tumu(__VA_ARGS__)
+#define vnsrl_wv_u32m4_m(...) __riscv_vnsrl_wv_u32m4_tumu(__VA_ARGS__)
+#define vnsrl_wx_u32m4_m(...) __riscv_vnsrl_wx_u32m4_tumu(__VA_ARGS__)
+#define vmseq_vv_i8mf8_b64(...) __riscv_vmseq_vv_i8mf8_b64(__VA_ARGS__) // vmseq aliases, signed element types (i8..i64), vv and vx forms: un-prefixed name forwards all arguments to the same-named __riscv_ intrinsic
+#define vmseq_vx_i8mf8_b64(...) __riscv_vmseq_vx_i8mf8_b64(__VA_ARGS__)
+#define vmseq_vv_i8mf4_b32(...) __riscv_vmseq_vv_i8mf4_b32(__VA_ARGS__)
+#define vmseq_vx_i8mf4_b32(...) __riscv_vmseq_vx_i8mf4_b32(__VA_ARGS__)
+#define vmseq_vv_i8mf2_b16(...) __riscv_vmseq_vv_i8mf2_b16(__VA_ARGS__)
+#define vmseq_vx_i8mf2_b16(...) __riscv_vmseq_vx_i8mf2_b16(__VA_ARGS__)
+#define vmseq_vv_i8m1_b8(...) __riscv_vmseq_vv_i8m1_b8(__VA_ARGS__)
+#define vmseq_vx_i8m1_b8(...) __riscv_vmseq_vx_i8m1_b8(__VA_ARGS__)
+#define vmseq_vv_i8m2_b4(...) __riscv_vmseq_vv_i8m2_b4(__VA_ARGS__)
+#define vmseq_vx_i8m2_b4(...) __riscv_vmseq_vx_i8m2_b4(__VA_ARGS__)
+#define vmseq_vv_i8m4_b2(...) __riscv_vmseq_vv_i8m4_b2(__VA_ARGS__)
+#define vmseq_vx_i8m4_b2(...) __riscv_vmseq_vx_i8m4_b2(__VA_ARGS__)
+#define vmseq_vv_i8m8_b1(...) __riscv_vmseq_vv_i8m8_b1(__VA_ARGS__)
+#define vmseq_vx_i8m8_b1(...) __riscv_vmseq_vx_i8m8_b1(__VA_ARGS__)
+#define vmseq_vv_i16mf4_b64(...) __riscv_vmseq_vv_i16mf4_b64(__VA_ARGS__)
+#define vmseq_vx_i16mf4_b64(...) __riscv_vmseq_vx_i16mf4_b64(__VA_ARGS__)
+#define vmseq_vv_i16mf2_b32(...) __riscv_vmseq_vv_i16mf2_b32(__VA_ARGS__)
+#define vmseq_vx_i16mf2_b32(...) __riscv_vmseq_vx_i16mf2_b32(__VA_ARGS__)
+#define vmseq_vv_i16m1_b16(...) __riscv_vmseq_vv_i16m1_b16(__VA_ARGS__)
+#define vmseq_vx_i16m1_b16(...) __riscv_vmseq_vx_i16m1_b16(__VA_ARGS__)
+#define vmseq_vv_i16m2_b8(...) __riscv_vmseq_vv_i16m2_b8(__VA_ARGS__)
+#define vmseq_vx_i16m2_b8(...) __riscv_vmseq_vx_i16m2_b8(__VA_ARGS__)
+#define vmseq_vv_i16m4_b4(...) __riscv_vmseq_vv_i16m4_b4(__VA_ARGS__)
+#define vmseq_vx_i16m4_b4(...) __riscv_vmseq_vx_i16m4_b4(__VA_ARGS__)
+#define vmseq_vv_i16m8_b2(...) __riscv_vmseq_vv_i16m8_b2(__VA_ARGS__)
+#define vmseq_vx_i16m8_b2(...) __riscv_vmseq_vx_i16m8_b2(__VA_ARGS__)
+#define vmseq_vv_i32mf2_b64(...) __riscv_vmseq_vv_i32mf2_b64(__VA_ARGS__)
+#define vmseq_vx_i32mf2_b64(...) __riscv_vmseq_vx_i32mf2_b64(__VA_ARGS__)
+#define vmseq_vv_i32m1_b32(...) __riscv_vmseq_vv_i32m1_b32(__VA_ARGS__)
+#define vmseq_vx_i32m1_b32(...) __riscv_vmseq_vx_i32m1_b32(__VA_ARGS__)
+#define vmseq_vv_i32m2_b16(...) __riscv_vmseq_vv_i32m2_b16(__VA_ARGS__)
+#define vmseq_vx_i32m2_b16(...) __riscv_vmseq_vx_i32m2_b16(__VA_ARGS__)
+#define vmseq_vv_i32m4_b8(...) __riscv_vmseq_vv_i32m4_b8(__VA_ARGS__)
+#define vmseq_vx_i32m4_b8(...) __riscv_vmseq_vx_i32m4_b8(__VA_ARGS__)
+#define vmseq_vv_i32m8_b4(...) __riscv_vmseq_vv_i32m8_b4(__VA_ARGS__)
+#define vmseq_vx_i32m8_b4(...) __riscv_vmseq_vx_i32m8_b4(__VA_ARGS__)
+#define vmseq_vv_i64m1_b64(...) __riscv_vmseq_vv_i64m1_b64(__VA_ARGS__)
+#define vmseq_vx_i64m1_b64(...) __riscv_vmseq_vx_i64m1_b64(__VA_ARGS__)
+#define vmseq_vv_i64m2_b32(...) __riscv_vmseq_vv_i64m2_b32(__VA_ARGS__)
+#define vmseq_vx_i64m2_b32(...) __riscv_vmseq_vx_i64m2_b32(__VA_ARGS__)
+#define vmseq_vv_i64m4_b16(...) __riscv_vmseq_vv_i64m4_b16(__VA_ARGS__)
+#define vmseq_vx_i64m4_b16(...) __riscv_vmseq_vx_i64m4_b16(__VA_ARGS__)
+#define vmseq_vv_i64m8_b8(...) __riscv_vmseq_vv_i64m8_b8(__VA_ARGS__)
+#define vmseq_vx_i64m8_b8(...) __riscv_vmseq_vx_i64m8_b8(__VA_ARGS__)
+#define vmsne_vv_i8mf8_b64(...) __riscv_vmsne_vv_i8mf8_b64(__VA_ARGS__) // vmsne aliases, signed element types (i8..i64), vv and vx forms: un-prefixed name forwards all arguments to the same-named __riscv_ intrinsic
+#define vmsne_vx_i8mf8_b64(...) __riscv_vmsne_vx_i8mf8_b64(__VA_ARGS__)
+#define vmsne_vv_i8mf4_b32(...) __riscv_vmsne_vv_i8mf4_b32(__VA_ARGS__)
+#define vmsne_vx_i8mf4_b32(...) __riscv_vmsne_vx_i8mf4_b32(__VA_ARGS__)
+#define vmsne_vv_i8mf2_b16(...) __riscv_vmsne_vv_i8mf2_b16(__VA_ARGS__)
+#define vmsne_vx_i8mf2_b16(...) __riscv_vmsne_vx_i8mf2_b16(__VA_ARGS__)
+#define vmsne_vv_i8m1_b8(...) __riscv_vmsne_vv_i8m1_b8(__VA_ARGS__)
+#define vmsne_vx_i8m1_b8(...) __riscv_vmsne_vx_i8m1_b8(__VA_ARGS__)
+#define vmsne_vv_i8m2_b4(...) __riscv_vmsne_vv_i8m2_b4(__VA_ARGS__)
+#define vmsne_vx_i8m2_b4(...) __riscv_vmsne_vx_i8m2_b4(__VA_ARGS__)
+#define vmsne_vv_i8m4_b2(...) __riscv_vmsne_vv_i8m4_b2(__VA_ARGS__)
+#define vmsne_vx_i8m4_b2(...) __riscv_vmsne_vx_i8m4_b2(__VA_ARGS__)
+#define vmsne_vv_i8m8_b1(...) __riscv_vmsne_vv_i8m8_b1(__VA_ARGS__)
+#define vmsne_vx_i8m8_b1(...) __riscv_vmsne_vx_i8m8_b1(__VA_ARGS__)
+#define vmsne_vv_i16mf4_b64(...) __riscv_vmsne_vv_i16mf4_b64(__VA_ARGS__)
+#define vmsne_vx_i16mf4_b64(...) __riscv_vmsne_vx_i16mf4_b64(__VA_ARGS__)
+#define vmsne_vv_i16mf2_b32(...) __riscv_vmsne_vv_i16mf2_b32(__VA_ARGS__)
+#define vmsne_vx_i16mf2_b32(...) __riscv_vmsne_vx_i16mf2_b32(__VA_ARGS__)
+#define vmsne_vv_i16m1_b16(...) __riscv_vmsne_vv_i16m1_b16(__VA_ARGS__)
+#define vmsne_vx_i16m1_b16(...) __riscv_vmsne_vx_i16m1_b16(__VA_ARGS__)
+#define vmsne_vv_i16m2_b8(...) __riscv_vmsne_vv_i16m2_b8(__VA_ARGS__)
+#define vmsne_vx_i16m2_b8(...) __riscv_vmsne_vx_i16m2_b8(__VA_ARGS__)
+#define vmsne_vv_i16m4_b4(...) __riscv_vmsne_vv_i16m4_b4(__VA_ARGS__)
+#define vmsne_vx_i16m4_b4(...) __riscv_vmsne_vx_i16m4_b4(__VA_ARGS__)
+#define vmsne_vv_i16m8_b2(...) __riscv_vmsne_vv_i16m8_b2(__VA_ARGS__)
+#define vmsne_vx_i16m8_b2(...) __riscv_vmsne_vx_i16m8_b2(__VA_ARGS__)
+#define vmsne_vv_i32mf2_b64(...) __riscv_vmsne_vv_i32mf2_b64(__VA_ARGS__)
+#define vmsne_vx_i32mf2_b64(...) __riscv_vmsne_vx_i32mf2_b64(__VA_ARGS__)
+#define vmsne_vv_i32m1_b32(...) __riscv_vmsne_vv_i32m1_b32(__VA_ARGS__)
+#define vmsne_vx_i32m1_b32(...) __riscv_vmsne_vx_i32m1_b32(__VA_ARGS__)
+#define vmsne_vv_i32m2_b16(...) __riscv_vmsne_vv_i32m2_b16(__VA_ARGS__)
+#define vmsne_vx_i32m2_b16(...) __riscv_vmsne_vx_i32m2_b16(__VA_ARGS__)
+#define vmsne_vv_i32m4_b8(...) __riscv_vmsne_vv_i32m4_b8(__VA_ARGS__)
+#define vmsne_vx_i32m4_b8(...) __riscv_vmsne_vx_i32m4_b8(__VA_ARGS__)
+#define vmsne_vv_i32m8_b4(...) __riscv_vmsne_vv_i32m8_b4(__VA_ARGS__)
+#define vmsne_vx_i32m8_b4(...) __riscv_vmsne_vx_i32m8_b4(__VA_ARGS__)
+#define vmsne_vv_i64m1_b64(...) __riscv_vmsne_vv_i64m1_b64(__VA_ARGS__)
+#define vmsne_vx_i64m1_b64(...) __riscv_vmsne_vx_i64m1_b64(__VA_ARGS__)
+#define vmsne_vv_i64m2_b32(...) __riscv_vmsne_vv_i64m2_b32(__VA_ARGS__)
+#define vmsne_vx_i64m2_b32(...) __riscv_vmsne_vx_i64m2_b32(__VA_ARGS__)
+#define vmsne_vv_i64m4_b16(...) __riscv_vmsne_vv_i64m4_b16(__VA_ARGS__)
+#define vmsne_vx_i64m4_b16(...) __riscv_vmsne_vx_i64m4_b16(__VA_ARGS__)
+#define vmsne_vv_i64m8_b8(...) __riscv_vmsne_vv_i64m8_b8(__VA_ARGS__)
+#define vmsne_vx_i64m8_b8(...) __riscv_vmsne_vx_i64m8_b8(__VA_ARGS__)
+#define vmslt_vv_i8mf8_b64(...) __riscv_vmslt_vv_i8mf8_b64(__VA_ARGS__) // vmslt aliases, signed element types (i8..i64), vv and vx forms: un-prefixed name forwards all arguments to the same-named __riscv_ intrinsic
+#define vmslt_vx_i8mf8_b64(...) __riscv_vmslt_vx_i8mf8_b64(__VA_ARGS__)
+#define vmslt_vv_i8mf4_b32(...) __riscv_vmslt_vv_i8mf4_b32(__VA_ARGS__)
+#define vmslt_vx_i8mf4_b32(...) __riscv_vmslt_vx_i8mf4_b32(__VA_ARGS__)
+#define vmslt_vv_i8mf2_b16(...) __riscv_vmslt_vv_i8mf2_b16(__VA_ARGS__)
+#define vmslt_vx_i8mf2_b16(...) __riscv_vmslt_vx_i8mf2_b16(__VA_ARGS__)
+#define vmslt_vv_i8m1_b8(...) __riscv_vmslt_vv_i8m1_b8(__VA_ARGS__)
+#define vmslt_vx_i8m1_b8(...) __riscv_vmslt_vx_i8m1_b8(__VA_ARGS__)
+#define vmslt_vv_i8m2_b4(...) __riscv_vmslt_vv_i8m2_b4(__VA_ARGS__)
+#define vmslt_vx_i8m2_b4(...) __riscv_vmslt_vx_i8m2_b4(__VA_ARGS__)
+#define vmslt_vv_i8m4_b2(...) __riscv_vmslt_vv_i8m4_b2(__VA_ARGS__)
+#define vmslt_vx_i8m4_b2(...) __riscv_vmslt_vx_i8m4_b2(__VA_ARGS__)
+#define vmslt_vv_i8m8_b1(...) __riscv_vmslt_vv_i8m8_b1(__VA_ARGS__)
+#define vmslt_vx_i8m8_b1(...) __riscv_vmslt_vx_i8m8_b1(__VA_ARGS__)
+#define vmslt_vv_i16mf4_b64(...) __riscv_vmslt_vv_i16mf4_b64(__VA_ARGS__)
+#define vmslt_vx_i16mf4_b64(...) __riscv_vmslt_vx_i16mf4_b64(__VA_ARGS__)
+#define vmslt_vv_i16mf2_b32(...) __riscv_vmslt_vv_i16mf2_b32(__VA_ARGS__)
+#define vmslt_vx_i16mf2_b32(...) __riscv_vmslt_vx_i16mf2_b32(__VA_ARGS__)
+#define vmslt_vv_i16m1_b16(...) __riscv_vmslt_vv_i16m1_b16(__VA_ARGS__)
+#define vmslt_vx_i16m1_b16(...) __riscv_vmslt_vx_i16m1_b16(__VA_ARGS__)
+#define vmslt_vv_i16m2_b8(...) __riscv_vmslt_vv_i16m2_b8(__VA_ARGS__)
+#define vmslt_vx_i16m2_b8(...) __riscv_vmslt_vx_i16m2_b8(__VA_ARGS__)
+#define vmslt_vv_i16m4_b4(...) __riscv_vmslt_vv_i16m4_b4(__VA_ARGS__)
+#define vmslt_vx_i16m4_b4(...) __riscv_vmslt_vx_i16m4_b4(__VA_ARGS__)
+#define vmslt_vv_i16m8_b2(...) __riscv_vmslt_vv_i16m8_b2(__VA_ARGS__)
+#define vmslt_vx_i16m8_b2(...) __riscv_vmslt_vx_i16m8_b2(__VA_ARGS__)
+#define vmslt_vv_i32mf2_b64(...) __riscv_vmslt_vv_i32mf2_b64(__VA_ARGS__)
+#define vmslt_vx_i32mf2_b64(...) __riscv_vmslt_vx_i32mf2_b64(__VA_ARGS__)
+#define vmslt_vv_i32m1_b32(...) __riscv_vmslt_vv_i32m1_b32(__VA_ARGS__)
+#define vmslt_vx_i32m1_b32(...) __riscv_vmslt_vx_i32m1_b32(__VA_ARGS__)
+#define vmslt_vv_i32m2_b16(...) __riscv_vmslt_vv_i32m2_b16(__VA_ARGS__)
+#define vmslt_vx_i32m2_b16(...) __riscv_vmslt_vx_i32m2_b16(__VA_ARGS__)
+#define vmslt_vv_i32m4_b8(...) __riscv_vmslt_vv_i32m4_b8(__VA_ARGS__)
+#define vmslt_vx_i32m4_b8(...) __riscv_vmslt_vx_i32m4_b8(__VA_ARGS__)
+#define vmslt_vv_i32m8_b4(...) __riscv_vmslt_vv_i32m8_b4(__VA_ARGS__)
+#define vmslt_vx_i32m8_b4(...) __riscv_vmslt_vx_i32m8_b4(__VA_ARGS__)
+#define vmslt_vv_i64m1_b64(...) __riscv_vmslt_vv_i64m1_b64(__VA_ARGS__)
+#define vmslt_vx_i64m1_b64(...) __riscv_vmslt_vx_i64m1_b64(__VA_ARGS__)
+#define vmslt_vv_i64m2_b32(...) __riscv_vmslt_vv_i64m2_b32(__VA_ARGS__)
+#define vmslt_vx_i64m2_b32(...) __riscv_vmslt_vx_i64m2_b32(__VA_ARGS__)
+#define vmslt_vv_i64m4_b16(...) __riscv_vmslt_vv_i64m4_b16(__VA_ARGS__)
+#define vmslt_vx_i64m4_b16(...) __riscv_vmslt_vx_i64m4_b16(__VA_ARGS__)
+#define vmslt_vv_i64m8_b8(...) __riscv_vmslt_vv_i64m8_b8(__VA_ARGS__)
+#define vmslt_vx_i64m8_b8(...) __riscv_vmslt_vx_i64m8_b8(__VA_ARGS__)
+#define vmsle_vv_i8mf8_b64(...) __riscv_vmsle_vv_i8mf8_b64(__VA_ARGS__) // vmsle aliases, signed element types (i8..i64), vv and vx forms: un-prefixed name forwards all arguments to the same-named __riscv_ intrinsic
+#define vmsle_vx_i8mf8_b64(...) __riscv_vmsle_vx_i8mf8_b64(__VA_ARGS__)
+#define vmsle_vv_i8mf4_b32(...) __riscv_vmsle_vv_i8mf4_b32(__VA_ARGS__)
+#define vmsle_vx_i8mf4_b32(...) __riscv_vmsle_vx_i8mf4_b32(__VA_ARGS__)
+#define vmsle_vv_i8mf2_b16(...) __riscv_vmsle_vv_i8mf2_b16(__VA_ARGS__)
+#define vmsle_vx_i8mf2_b16(...) __riscv_vmsle_vx_i8mf2_b16(__VA_ARGS__)
+#define vmsle_vv_i8m1_b8(...) __riscv_vmsle_vv_i8m1_b8(__VA_ARGS__)
+#define vmsle_vx_i8m1_b8(...) __riscv_vmsle_vx_i8m1_b8(__VA_ARGS__)
+#define vmsle_vv_i8m2_b4(...) __riscv_vmsle_vv_i8m2_b4(__VA_ARGS__)
+#define vmsle_vx_i8m2_b4(...) __riscv_vmsle_vx_i8m2_b4(__VA_ARGS__)
+#define vmsle_vv_i8m4_b2(...) __riscv_vmsle_vv_i8m4_b2(__VA_ARGS__)
+#define vmsle_vx_i8m4_b2(...) __riscv_vmsle_vx_i8m4_b2(__VA_ARGS__)
+#define vmsle_vv_i8m8_b1(...) __riscv_vmsle_vv_i8m8_b1(__VA_ARGS__)
+#define vmsle_vx_i8m8_b1(...) __riscv_vmsle_vx_i8m8_b1(__VA_ARGS__)
+#define vmsle_vv_i16mf4_b64(...) __riscv_vmsle_vv_i16mf4_b64(__VA_ARGS__)
+#define vmsle_vx_i16mf4_b64(...) __riscv_vmsle_vx_i16mf4_b64(__VA_ARGS__)
+#define vmsle_vv_i16mf2_b32(...) __riscv_vmsle_vv_i16mf2_b32(__VA_ARGS__)
+#define vmsle_vx_i16mf2_b32(...) __riscv_vmsle_vx_i16mf2_b32(__VA_ARGS__)
+#define vmsle_vv_i16m1_b16(...) __riscv_vmsle_vv_i16m1_b16(__VA_ARGS__)
+#define vmsle_vx_i16m1_b16(...) __riscv_vmsle_vx_i16m1_b16(__VA_ARGS__)
+#define vmsle_vv_i16m2_b8(...) __riscv_vmsle_vv_i16m2_b8(__VA_ARGS__)
+#define vmsle_vx_i16m2_b8(...) __riscv_vmsle_vx_i16m2_b8(__VA_ARGS__)
+#define vmsle_vv_i16m4_b4(...) __riscv_vmsle_vv_i16m4_b4(__VA_ARGS__)
+#define vmsle_vx_i16m4_b4(...) __riscv_vmsle_vx_i16m4_b4(__VA_ARGS__)
+#define vmsle_vv_i16m8_b2(...) __riscv_vmsle_vv_i16m8_b2(__VA_ARGS__)
+#define vmsle_vx_i16m8_b2(...) __riscv_vmsle_vx_i16m8_b2(__VA_ARGS__)
+#define vmsle_vv_i32mf2_b64(...) __riscv_vmsle_vv_i32mf2_b64(__VA_ARGS__)
+#define vmsle_vx_i32mf2_b64(...) __riscv_vmsle_vx_i32mf2_b64(__VA_ARGS__)
+#define vmsle_vv_i32m1_b32(...) __riscv_vmsle_vv_i32m1_b32(__VA_ARGS__)
+#define vmsle_vx_i32m1_b32(...) __riscv_vmsle_vx_i32m1_b32(__VA_ARGS__)
+#define vmsle_vv_i32m2_b16(...) __riscv_vmsle_vv_i32m2_b16(__VA_ARGS__)
+#define vmsle_vx_i32m2_b16(...) __riscv_vmsle_vx_i32m2_b16(__VA_ARGS__)
+#define vmsle_vv_i32m4_b8(...) __riscv_vmsle_vv_i32m4_b8(__VA_ARGS__)
+#define vmsle_vx_i32m4_b8(...) __riscv_vmsle_vx_i32m4_b8(__VA_ARGS__)
+#define vmsle_vv_i32m8_b4(...) __riscv_vmsle_vv_i32m8_b4(__VA_ARGS__)
+#define vmsle_vx_i32m8_b4(...) __riscv_vmsle_vx_i32m8_b4(__VA_ARGS__)
+#define vmsle_vv_i64m1_b64(...) __riscv_vmsle_vv_i64m1_b64(__VA_ARGS__)
+#define vmsle_vx_i64m1_b64(...) __riscv_vmsle_vx_i64m1_b64(__VA_ARGS__)
+#define vmsle_vv_i64m2_b32(...) __riscv_vmsle_vv_i64m2_b32(__VA_ARGS__)
+#define vmsle_vx_i64m2_b32(...) __riscv_vmsle_vx_i64m2_b32(__VA_ARGS__)
+#define vmsle_vv_i64m4_b16(...) __riscv_vmsle_vv_i64m4_b16(__VA_ARGS__)
+#define vmsle_vx_i64m4_b16(...) __riscv_vmsle_vx_i64m4_b16(__VA_ARGS__)
+#define vmsle_vv_i64m8_b8(...) __riscv_vmsle_vv_i64m8_b8(__VA_ARGS__)
+#define vmsle_vx_i64m8_b8(...) __riscv_vmsle_vx_i64m8_b8(__VA_ARGS__)
+#define vmsgt_vv_i8mf8_b64(...) __riscv_vmsgt_vv_i8mf8_b64(__VA_ARGS__) // vmsgt aliases, signed element types (i8..i64), vv and vx forms: un-prefixed name forwards all arguments to the same-named __riscv_ intrinsic
+#define vmsgt_vx_i8mf8_b64(...) __riscv_vmsgt_vx_i8mf8_b64(__VA_ARGS__)
+#define vmsgt_vv_i8mf4_b32(...) __riscv_vmsgt_vv_i8mf4_b32(__VA_ARGS__)
+#define vmsgt_vx_i8mf4_b32(...) __riscv_vmsgt_vx_i8mf4_b32(__VA_ARGS__)
+#define vmsgt_vv_i8mf2_b16(...) __riscv_vmsgt_vv_i8mf2_b16(__VA_ARGS__)
+#define vmsgt_vx_i8mf2_b16(...) __riscv_vmsgt_vx_i8mf2_b16(__VA_ARGS__)
+#define vmsgt_vv_i8m1_b8(...) __riscv_vmsgt_vv_i8m1_b8(__VA_ARGS__)
+#define vmsgt_vx_i8m1_b8(...) __riscv_vmsgt_vx_i8m1_b8(__VA_ARGS__)
+#define vmsgt_vv_i8m2_b4(...) __riscv_vmsgt_vv_i8m2_b4(__VA_ARGS__)
+#define vmsgt_vx_i8m2_b4(...) __riscv_vmsgt_vx_i8m2_b4(__VA_ARGS__)
+#define vmsgt_vv_i8m4_b2(...) __riscv_vmsgt_vv_i8m4_b2(__VA_ARGS__)
+#define vmsgt_vx_i8m4_b2(...) __riscv_vmsgt_vx_i8m4_b2(__VA_ARGS__)
+#define vmsgt_vv_i8m8_b1(...) __riscv_vmsgt_vv_i8m8_b1(__VA_ARGS__)
+#define vmsgt_vx_i8m8_b1(...) __riscv_vmsgt_vx_i8m8_b1(__VA_ARGS__)
+#define vmsgt_vv_i16mf4_b64(...) __riscv_vmsgt_vv_i16mf4_b64(__VA_ARGS__)
+#define vmsgt_vx_i16mf4_b64(...) __riscv_vmsgt_vx_i16mf4_b64(__VA_ARGS__)
+#define vmsgt_vv_i16mf2_b32(...) __riscv_vmsgt_vv_i16mf2_b32(__VA_ARGS__)
+#define vmsgt_vx_i16mf2_b32(...) __riscv_vmsgt_vx_i16mf2_b32(__VA_ARGS__)
+#define vmsgt_vv_i16m1_b16(...) __riscv_vmsgt_vv_i16m1_b16(__VA_ARGS__)
+#define vmsgt_vx_i16m1_b16(...) __riscv_vmsgt_vx_i16m1_b16(__VA_ARGS__)
+#define vmsgt_vv_i16m2_b8(...) __riscv_vmsgt_vv_i16m2_b8(__VA_ARGS__)
+#define vmsgt_vx_i16m2_b8(...) __riscv_vmsgt_vx_i16m2_b8(__VA_ARGS__)
+#define vmsgt_vv_i16m4_b4(...) __riscv_vmsgt_vv_i16m4_b4(__VA_ARGS__)
+#define vmsgt_vx_i16m4_b4(...) __riscv_vmsgt_vx_i16m4_b4(__VA_ARGS__)
+#define vmsgt_vv_i16m8_b2(...) __riscv_vmsgt_vv_i16m8_b2(__VA_ARGS__)
+#define vmsgt_vx_i16m8_b2(...) __riscv_vmsgt_vx_i16m8_b2(__VA_ARGS__)
+#define vmsgt_vv_i32mf2_b64(...) __riscv_vmsgt_vv_i32mf2_b64(__VA_ARGS__)
+#define vmsgt_vx_i32mf2_b64(...) __riscv_vmsgt_vx_i32mf2_b64(__VA_ARGS__)
+#define vmsgt_vv_i32m1_b32(...) __riscv_vmsgt_vv_i32m1_b32(__VA_ARGS__)
+#define vmsgt_vx_i32m1_b32(...) __riscv_vmsgt_vx_i32m1_b32(__VA_ARGS__)
+#define vmsgt_vv_i32m2_b16(...) __riscv_vmsgt_vv_i32m2_b16(__VA_ARGS__)
+#define vmsgt_vx_i32m2_b16(...) __riscv_vmsgt_vx_i32m2_b16(__VA_ARGS__)
+#define vmsgt_vv_i32m4_b8(...) __riscv_vmsgt_vv_i32m4_b8(__VA_ARGS__)
+#define vmsgt_vx_i32m4_b8(...) __riscv_vmsgt_vx_i32m4_b8(__VA_ARGS__)
+#define vmsgt_vv_i32m8_b4(...) __riscv_vmsgt_vv_i32m8_b4(__VA_ARGS__)
+#define vmsgt_vx_i32m8_b4(...) __riscv_vmsgt_vx_i32m8_b4(__VA_ARGS__)
+#define vmsgt_vv_i64m1_b64(...) __riscv_vmsgt_vv_i64m1_b64(__VA_ARGS__)
+#define vmsgt_vx_i64m1_b64(...) __riscv_vmsgt_vx_i64m1_b64(__VA_ARGS__)
+#define vmsgt_vv_i64m2_b32(...) __riscv_vmsgt_vv_i64m2_b32(__VA_ARGS__)
+#define vmsgt_vx_i64m2_b32(...) __riscv_vmsgt_vx_i64m2_b32(__VA_ARGS__)
+#define vmsgt_vv_i64m4_b16(...) __riscv_vmsgt_vv_i64m4_b16(__VA_ARGS__)
+#define vmsgt_vx_i64m4_b16(...) __riscv_vmsgt_vx_i64m4_b16(__VA_ARGS__)
+#define vmsgt_vv_i64m8_b8(...) __riscv_vmsgt_vv_i64m8_b8(__VA_ARGS__)
+#define vmsgt_vx_i64m8_b8(...) __riscv_vmsgt_vx_i64m8_b8(__VA_ARGS__)
+#define vmsge_vv_i8mf8_b64(...) __riscv_vmsge_vv_i8mf8_b64(__VA_ARGS__) // vmsge aliases, signed element types (i8..i64), vv and vx forms: un-prefixed name forwards all arguments to the same-named __riscv_ intrinsic
+#define vmsge_vx_i8mf8_b64(...) __riscv_vmsge_vx_i8mf8_b64(__VA_ARGS__)
+#define vmsge_vv_i8mf4_b32(...) __riscv_vmsge_vv_i8mf4_b32(__VA_ARGS__)
+#define vmsge_vx_i8mf4_b32(...) __riscv_vmsge_vx_i8mf4_b32(__VA_ARGS__)
+#define vmsge_vv_i8mf2_b16(...) __riscv_vmsge_vv_i8mf2_b16(__VA_ARGS__)
+#define vmsge_vx_i8mf2_b16(...) __riscv_vmsge_vx_i8mf2_b16(__VA_ARGS__)
+#define vmsge_vv_i8m1_b8(...) __riscv_vmsge_vv_i8m1_b8(__VA_ARGS__)
+#define vmsge_vx_i8m1_b8(...) __riscv_vmsge_vx_i8m1_b8(__VA_ARGS__)
+#define vmsge_vv_i8m2_b4(...) __riscv_vmsge_vv_i8m2_b4(__VA_ARGS__)
+#define vmsge_vx_i8m2_b4(...) __riscv_vmsge_vx_i8m2_b4(__VA_ARGS__)
+#define vmsge_vv_i8m4_b2(...) __riscv_vmsge_vv_i8m4_b2(__VA_ARGS__)
+#define vmsge_vx_i8m4_b2(...) __riscv_vmsge_vx_i8m4_b2(__VA_ARGS__)
+#define vmsge_vv_i8m8_b1(...) __riscv_vmsge_vv_i8m8_b1(__VA_ARGS__)
+#define vmsge_vx_i8m8_b1(...) __riscv_vmsge_vx_i8m8_b1(__VA_ARGS__)
+#define vmsge_vv_i16mf4_b64(...) __riscv_vmsge_vv_i16mf4_b64(__VA_ARGS__)
+#define vmsge_vx_i16mf4_b64(...) __riscv_vmsge_vx_i16mf4_b64(__VA_ARGS__)
+#define vmsge_vv_i16mf2_b32(...) __riscv_vmsge_vv_i16mf2_b32(__VA_ARGS__)
+#define vmsge_vx_i16mf2_b32(...) __riscv_vmsge_vx_i16mf2_b32(__VA_ARGS__)
+#define vmsge_vv_i16m1_b16(...) __riscv_vmsge_vv_i16m1_b16(__VA_ARGS__)
+#define vmsge_vx_i16m1_b16(...) __riscv_vmsge_vx_i16m1_b16(__VA_ARGS__)
+#define vmsge_vv_i16m2_b8(...) __riscv_vmsge_vv_i16m2_b8(__VA_ARGS__)
+#define vmsge_vx_i16m2_b8(...) __riscv_vmsge_vx_i16m2_b8(__VA_ARGS__)
+#define vmsge_vv_i16m4_b4(...) __riscv_vmsge_vv_i16m4_b4(__VA_ARGS__)
+#define vmsge_vx_i16m4_b4(...) __riscv_vmsge_vx_i16m4_b4(__VA_ARGS__)
+#define vmsge_vv_i16m8_b2(...) __riscv_vmsge_vv_i16m8_b2(__VA_ARGS__)
+#define vmsge_vx_i16m8_b2(...) __riscv_vmsge_vx_i16m8_b2(__VA_ARGS__)
+#define vmsge_vv_i32mf2_b64(...) __riscv_vmsge_vv_i32mf2_b64(__VA_ARGS__)
+#define vmsge_vx_i32mf2_b64(...) __riscv_vmsge_vx_i32mf2_b64(__VA_ARGS__)
+#define vmsge_vv_i32m1_b32(...) __riscv_vmsge_vv_i32m1_b32(__VA_ARGS__)
+#define vmsge_vx_i32m1_b32(...) __riscv_vmsge_vx_i32m1_b32(__VA_ARGS__)
+#define vmsge_vv_i32m2_b16(...) __riscv_vmsge_vv_i32m2_b16(__VA_ARGS__)
+#define vmsge_vx_i32m2_b16(...) __riscv_vmsge_vx_i32m2_b16(__VA_ARGS__)
+#define vmsge_vv_i32m4_b8(...) __riscv_vmsge_vv_i32m4_b8(__VA_ARGS__)
+#define vmsge_vx_i32m4_b8(...) __riscv_vmsge_vx_i32m4_b8(__VA_ARGS__)
+#define vmsge_vv_i32m8_b4(...) __riscv_vmsge_vv_i32m8_b4(__VA_ARGS__)
+#define vmsge_vx_i32m8_b4(...) __riscv_vmsge_vx_i32m8_b4(__VA_ARGS__)
+#define vmsge_vv_i64m1_b64(...) __riscv_vmsge_vv_i64m1_b64(__VA_ARGS__)
+#define vmsge_vx_i64m1_b64(...) __riscv_vmsge_vx_i64m1_b64(__VA_ARGS__)
+#define vmsge_vv_i64m2_b32(...) __riscv_vmsge_vv_i64m2_b32(__VA_ARGS__)
+#define vmsge_vx_i64m2_b32(...) __riscv_vmsge_vx_i64m2_b32(__VA_ARGS__)
+#define vmsge_vv_i64m4_b16(...) __riscv_vmsge_vv_i64m4_b16(__VA_ARGS__)
+#define vmsge_vx_i64m4_b16(...) __riscv_vmsge_vx_i64m4_b16(__VA_ARGS__)
+#define vmsge_vv_i64m8_b8(...) __riscv_vmsge_vv_i64m8_b8(__VA_ARGS__)
+#define vmsge_vx_i64m8_b8(...) __riscv_vmsge_vx_i64m8_b8(__VA_ARGS__)
+#define vmseq_vv_u8mf8_b64(...) __riscv_vmseq_vv_u8mf8_b64(__VA_ARGS__) // vmseq aliases, unsigned element types (u8..u64), vv and vx forms: un-prefixed name forwards all arguments to the same-named __riscv_ intrinsic
+#define vmseq_vx_u8mf8_b64(...) __riscv_vmseq_vx_u8mf8_b64(__VA_ARGS__)
+#define vmseq_vv_u8mf4_b32(...) __riscv_vmseq_vv_u8mf4_b32(__VA_ARGS__)
+#define vmseq_vx_u8mf4_b32(...) __riscv_vmseq_vx_u8mf4_b32(__VA_ARGS__)
+#define vmseq_vv_u8mf2_b16(...) __riscv_vmseq_vv_u8mf2_b16(__VA_ARGS__)
+#define vmseq_vx_u8mf2_b16(...) __riscv_vmseq_vx_u8mf2_b16(__VA_ARGS__)
+#define vmseq_vv_u8m1_b8(...) __riscv_vmseq_vv_u8m1_b8(__VA_ARGS__)
+#define vmseq_vx_u8m1_b8(...) __riscv_vmseq_vx_u8m1_b8(__VA_ARGS__)
+#define vmseq_vv_u8m2_b4(...) __riscv_vmseq_vv_u8m2_b4(__VA_ARGS__)
+#define vmseq_vx_u8m2_b4(...) __riscv_vmseq_vx_u8m2_b4(__VA_ARGS__)
+#define vmseq_vv_u8m4_b2(...) __riscv_vmseq_vv_u8m4_b2(__VA_ARGS__)
+#define vmseq_vx_u8m4_b2(...) __riscv_vmseq_vx_u8m4_b2(__VA_ARGS__)
+#define vmseq_vv_u8m8_b1(...) __riscv_vmseq_vv_u8m8_b1(__VA_ARGS__)
+#define vmseq_vx_u8m8_b1(...) __riscv_vmseq_vx_u8m8_b1(__VA_ARGS__)
+#define vmseq_vv_u16mf4_b64(...) __riscv_vmseq_vv_u16mf4_b64(__VA_ARGS__)
+#define vmseq_vx_u16mf4_b64(...) __riscv_vmseq_vx_u16mf4_b64(__VA_ARGS__)
+#define vmseq_vv_u16mf2_b32(...) __riscv_vmseq_vv_u16mf2_b32(__VA_ARGS__)
+#define vmseq_vx_u16mf2_b32(...) __riscv_vmseq_vx_u16mf2_b32(__VA_ARGS__)
+#define vmseq_vv_u16m1_b16(...) __riscv_vmseq_vv_u16m1_b16(__VA_ARGS__)
+#define vmseq_vx_u16m1_b16(...) __riscv_vmseq_vx_u16m1_b16(__VA_ARGS__)
+#define vmseq_vv_u16m2_b8(...) __riscv_vmseq_vv_u16m2_b8(__VA_ARGS__)
+#define vmseq_vx_u16m2_b8(...) __riscv_vmseq_vx_u16m2_b8(__VA_ARGS__)
+#define vmseq_vv_u16m4_b4(...) __riscv_vmseq_vv_u16m4_b4(__VA_ARGS__)
+#define vmseq_vx_u16m4_b4(...) __riscv_vmseq_vx_u16m4_b4(__VA_ARGS__)
+#define vmseq_vv_u16m8_b2(...) __riscv_vmseq_vv_u16m8_b2(__VA_ARGS__)
+#define vmseq_vx_u16m8_b2(...) __riscv_vmseq_vx_u16m8_b2(__VA_ARGS__)
+#define vmseq_vv_u32mf2_b64(...) __riscv_vmseq_vv_u32mf2_b64(__VA_ARGS__)
+#define vmseq_vx_u32mf2_b64(...) __riscv_vmseq_vx_u32mf2_b64(__VA_ARGS__)
+#define vmseq_vv_u32m1_b32(...) __riscv_vmseq_vv_u32m1_b32(__VA_ARGS__)
+#define vmseq_vx_u32m1_b32(...) __riscv_vmseq_vx_u32m1_b32(__VA_ARGS__)
+#define vmseq_vv_u32m2_b16(...) __riscv_vmseq_vv_u32m2_b16(__VA_ARGS__)
+#define vmseq_vx_u32m2_b16(...) __riscv_vmseq_vx_u32m2_b16(__VA_ARGS__)
+#define vmseq_vv_u32m4_b8(...) __riscv_vmseq_vv_u32m4_b8(__VA_ARGS__)
+#define vmseq_vx_u32m4_b8(...) __riscv_vmseq_vx_u32m4_b8(__VA_ARGS__)
+#define vmseq_vv_u32m8_b4(...) __riscv_vmseq_vv_u32m8_b4(__VA_ARGS__)
+#define vmseq_vx_u32m8_b4(...) __riscv_vmseq_vx_u32m8_b4(__VA_ARGS__)
+#define vmseq_vv_u64m1_b64(...) __riscv_vmseq_vv_u64m1_b64(__VA_ARGS__)
+#define vmseq_vx_u64m1_b64(...) __riscv_vmseq_vx_u64m1_b64(__VA_ARGS__)
+#define vmseq_vv_u64m2_b32(...) __riscv_vmseq_vv_u64m2_b32(__VA_ARGS__)
+#define vmseq_vx_u64m2_b32(...) __riscv_vmseq_vx_u64m2_b32(__VA_ARGS__)
+#define vmseq_vv_u64m4_b16(...) __riscv_vmseq_vv_u64m4_b16(__VA_ARGS__)
+#define vmseq_vx_u64m4_b16(...) __riscv_vmseq_vx_u64m4_b16(__VA_ARGS__)
+#define vmseq_vv_u64m8_b8(...) __riscv_vmseq_vv_u64m8_b8(__VA_ARGS__)
+#define vmseq_vx_u64m8_b8(...) __riscv_vmseq_vx_u64m8_b8(__VA_ARGS__)
+#define vmsne_vv_u8mf8_b64(...) __riscv_vmsne_vv_u8mf8_b64(__VA_ARGS__) // vmsne aliases, unsigned element types (u8..u64), vv and vx forms: un-prefixed name forwards all arguments to the same-named __riscv_ intrinsic
+#define vmsne_vx_u8mf8_b64(...) __riscv_vmsne_vx_u8mf8_b64(__VA_ARGS__)
+#define vmsne_vv_u8mf4_b32(...) __riscv_vmsne_vv_u8mf4_b32(__VA_ARGS__)
+#define vmsne_vx_u8mf4_b32(...) __riscv_vmsne_vx_u8mf4_b32(__VA_ARGS__)
+#define vmsne_vv_u8mf2_b16(...) __riscv_vmsne_vv_u8mf2_b16(__VA_ARGS__)
+#define vmsne_vx_u8mf2_b16(...) __riscv_vmsne_vx_u8mf2_b16(__VA_ARGS__)
+#define vmsne_vv_u8m1_b8(...) __riscv_vmsne_vv_u8m1_b8(__VA_ARGS__)
+#define vmsne_vx_u8m1_b8(...) __riscv_vmsne_vx_u8m1_b8(__VA_ARGS__)
+#define vmsne_vv_u8m2_b4(...) __riscv_vmsne_vv_u8m2_b4(__VA_ARGS__)
+#define vmsne_vx_u8m2_b4(...) __riscv_vmsne_vx_u8m2_b4(__VA_ARGS__)
+#define vmsne_vv_u8m4_b2(...) __riscv_vmsne_vv_u8m4_b2(__VA_ARGS__)
+#define vmsne_vx_u8m4_b2(...) __riscv_vmsne_vx_u8m4_b2(__VA_ARGS__)
+#define vmsne_vv_u8m8_b1(...) __riscv_vmsne_vv_u8m8_b1(__VA_ARGS__)
+#define vmsne_vx_u8m8_b1(...) __riscv_vmsne_vx_u8m8_b1(__VA_ARGS__)
+#define vmsne_vv_u16mf4_b64(...) __riscv_vmsne_vv_u16mf4_b64(__VA_ARGS__)
+#define vmsne_vx_u16mf4_b64(...) __riscv_vmsne_vx_u16mf4_b64(__VA_ARGS__)
+#define vmsne_vv_u16mf2_b32(...) __riscv_vmsne_vv_u16mf2_b32(__VA_ARGS__)
+#define vmsne_vx_u16mf2_b32(...) __riscv_vmsne_vx_u16mf2_b32(__VA_ARGS__)
+#define vmsne_vv_u16m1_b16(...) __riscv_vmsne_vv_u16m1_b16(__VA_ARGS__)
+#define vmsne_vx_u16m1_b16(...) __riscv_vmsne_vx_u16m1_b16(__VA_ARGS__)
+#define vmsne_vv_u16m2_b8(...) __riscv_vmsne_vv_u16m2_b8(__VA_ARGS__)
+#define vmsne_vx_u16m2_b8(...) __riscv_vmsne_vx_u16m2_b8(__VA_ARGS__)
+#define vmsne_vv_u16m4_b4(...) __riscv_vmsne_vv_u16m4_b4(__VA_ARGS__)
+#define vmsne_vx_u16m4_b4(...) __riscv_vmsne_vx_u16m4_b4(__VA_ARGS__)
+#define vmsne_vv_u16m8_b2(...) __riscv_vmsne_vv_u16m8_b2(__VA_ARGS__)
+#define vmsne_vx_u16m8_b2(...) __riscv_vmsne_vx_u16m8_b2(__VA_ARGS__)
+#define vmsne_vv_u32mf2_b64(...) __riscv_vmsne_vv_u32mf2_b64(__VA_ARGS__)
+#define vmsne_vx_u32mf2_b64(...) __riscv_vmsne_vx_u32mf2_b64(__VA_ARGS__)
+#define vmsne_vv_u32m1_b32(...) __riscv_vmsne_vv_u32m1_b32(__VA_ARGS__)
+#define vmsne_vx_u32m1_b32(...) __riscv_vmsne_vx_u32m1_b32(__VA_ARGS__)
+#define vmsne_vv_u32m2_b16(...) __riscv_vmsne_vv_u32m2_b16(__VA_ARGS__)
+#define vmsne_vx_u32m2_b16(...) __riscv_vmsne_vx_u32m2_b16(__VA_ARGS__)
+#define vmsne_vv_u32m4_b8(...) __riscv_vmsne_vv_u32m4_b8(__VA_ARGS__)
+#define vmsne_vx_u32m4_b8(...) __riscv_vmsne_vx_u32m4_b8(__VA_ARGS__)
+#define vmsne_vv_u32m8_b4(...) __riscv_vmsne_vv_u32m8_b4(__VA_ARGS__)
+#define vmsne_vx_u32m8_b4(...) __riscv_vmsne_vx_u32m8_b4(__VA_ARGS__)
+#define vmsne_vv_u64m1_b64(...) __riscv_vmsne_vv_u64m1_b64(__VA_ARGS__)
+#define vmsne_vx_u64m1_b64(...) __riscv_vmsne_vx_u64m1_b64(__VA_ARGS__)
+#define vmsne_vv_u64m2_b32(...) __riscv_vmsne_vv_u64m2_b32(__VA_ARGS__)
+#define vmsne_vx_u64m2_b32(...) __riscv_vmsne_vx_u64m2_b32(__VA_ARGS__)
+#define vmsne_vv_u64m4_b16(...) __riscv_vmsne_vv_u64m4_b16(__VA_ARGS__)
+#define vmsne_vx_u64m4_b16(...) __riscv_vmsne_vx_u64m4_b16(__VA_ARGS__)
+#define vmsne_vv_u64m8_b8(...) __riscv_vmsne_vv_u64m8_b8(__VA_ARGS__)
+#define vmsne_vx_u64m8_b8(...) __riscv_vmsne_vx_u64m8_b8(__VA_ARGS__)
+#define vmsltu_vv_u8mf8_b64(...) __riscv_vmsltu_vv_u8mf8_b64(__VA_ARGS__) // vmsltu aliases, unsigned element types (u8..u64), vv and vx forms: un-prefixed name forwards all arguments to the same-named __riscv_ intrinsic
+#define vmsltu_vx_u8mf8_b64(...) __riscv_vmsltu_vx_u8mf8_b64(__VA_ARGS__)
+#define vmsltu_vv_u8mf4_b32(...) __riscv_vmsltu_vv_u8mf4_b32(__VA_ARGS__)
+#define vmsltu_vx_u8mf4_b32(...) __riscv_vmsltu_vx_u8mf4_b32(__VA_ARGS__)
+#define vmsltu_vv_u8mf2_b16(...) __riscv_vmsltu_vv_u8mf2_b16(__VA_ARGS__)
+#define vmsltu_vx_u8mf2_b16(...) __riscv_vmsltu_vx_u8mf2_b16(__VA_ARGS__)
+#define vmsltu_vv_u8m1_b8(...) __riscv_vmsltu_vv_u8m1_b8(__VA_ARGS__)
+#define vmsltu_vx_u8m1_b8(...) __riscv_vmsltu_vx_u8m1_b8(__VA_ARGS__)
+#define vmsltu_vv_u8m2_b4(...) __riscv_vmsltu_vv_u8m2_b4(__VA_ARGS__)
+#define vmsltu_vx_u8m2_b4(...) __riscv_vmsltu_vx_u8m2_b4(__VA_ARGS__)
+#define vmsltu_vv_u8m4_b2(...) __riscv_vmsltu_vv_u8m4_b2(__VA_ARGS__)
+#define vmsltu_vx_u8m4_b2(...) __riscv_vmsltu_vx_u8m4_b2(__VA_ARGS__)
+#define vmsltu_vv_u8m8_b1(...) __riscv_vmsltu_vv_u8m8_b1(__VA_ARGS__)
+#define vmsltu_vx_u8m8_b1(...) __riscv_vmsltu_vx_u8m8_b1(__VA_ARGS__)
+#define vmsltu_vv_u16mf4_b64(...) __riscv_vmsltu_vv_u16mf4_b64(__VA_ARGS__)
+#define vmsltu_vx_u16mf4_b64(...) __riscv_vmsltu_vx_u16mf4_b64(__VA_ARGS__)
+#define vmsltu_vv_u16mf2_b32(...) __riscv_vmsltu_vv_u16mf2_b32(__VA_ARGS__)
+#define vmsltu_vx_u16mf2_b32(...) __riscv_vmsltu_vx_u16mf2_b32(__VA_ARGS__)
+#define vmsltu_vv_u16m1_b16(...) __riscv_vmsltu_vv_u16m1_b16(__VA_ARGS__)
+#define vmsltu_vx_u16m1_b16(...) __riscv_vmsltu_vx_u16m1_b16(__VA_ARGS__)
+#define vmsltu_vv_u16m2_b8(...) __riscv_vmsltu_vv_u16m2_b8(__VA_ARGS__)
+#define vmsltu_vx_u16m2_b8(...) __riscv_vmsltu_vx_u16m2_b8(__VA_ARGS__)
+#define vmsltu_vv_u16m4_b4(...) __riscv_vmsltu_vv_u16m4_b4(__VA_ARGS__)
+#define vmsltu_vx_u16m4_b4(...) __riscv_vmsltu_vx_u16m4_b4(__VA_ARGS__)
+#define vmsltu_vv_u16m8_b2(...) __riscv_vmsltu_vv_u16m8_b2(__VA_ARGS__)
+#define vmsltu_vx_u16m8_b2(...) __riscv_vmsltu_vx_u16m8_b2(__VA_ARGS__)
+#define vmsltu_vv_u32mf2_b64(...) __riscv_vmsltu_vv_u32mf2_b64(__VA_ARGS__)
+#define vmsltu_vx_u32mf2_b64(...) __riscv_vmsltu_vx_u32mf2_b64(__VA_ARGS__)
+#define vmsltu_vv_u32m1_b32(...) __riscv_vmsltu_vv_u32m1_b32(__VA_ARGS__)
+#define vmsltu_vx_u32m1_b32(...) __riscv_vmsltu_vx_u32m1_b32(__VA_ARGS__)
+#define vmsltu_vv_u32m2_b16(...) __riscv_vmsltu_vv_u32m2_b16(__VA_ARGS__)
+#define vmsltu_vx_u32m2_b16(...) __riscv_vmsltu_vx_u32m2_b16(__VA_ARGS__)
+#define vmsltu_vv_u32m4_b8(...) __riscv_vmsltu_vv_u32m4_b8(__VA_ARGS__)
+#define vmsltu_vx_u32m4_b8(...) __riscv_vmsltu_vx_u32m4_b8(__VA_ARGS__)
+#define vmsltu_vv_u32m8_b4(...) __riscv_vmsltu_vv_u32m8_b4(__VA_ARGS__)
+#define vmsltu_vx_u32m8_b4(...) __riscv_vmsltu_vx_u32m8_b4(__VA_ARGS__)
+#define vmsltu_vv_u64m1_b64(...) __riscv_vmsltu_vv_u64m1_b64(__VA_ARGS__)
+#define vmsltu_vx_u64m1_b64(...) __riscv_vmsltu_vx_u64m1_b64(__VA_ARGS__)
+#define vmsltu_vv_u64m2_b32(...) __riscv_vmsltu_vv_u64m2_b32(__VA_ARGS__)
+#define vmsltu_vx_u64m2_b32(...) __riscv_vmsltu_vx_u64m2_b32(__VA_ARGS__)
+#define vmsltu_vv_u64m4_b16(...) __riscv_vmsltu_vv_u64m4_b16(__VA_ARGS__)
+#define vmsltu_vx_u64m4_b16(...) __riscv_vmsltu_vx_u64m4_b16(__VA_ARGS__)
+#define vmsltu_vv_u64m8_b8(...) __riscv_vmsltu_vv_u64m8_b8(__VA_ARGS__)
+#define vmsltu_vx_u64m8_b8(...) __riscv_vmsltu_vx_u64m8_b8(__VA_ARGS__)
+#define vmsleu_vv_u8mf8_b64(...) __riscv_vmsleu_vv_u8mf8_b64(__VA_ARGS__) // vmsleu aliases, unsigned element types (u8..u64), vv and vx forms: un-prefixed name forwards all arguments to the same-named __riscv_ intrinsic
+#define vmsleu_vx_u8mf8_b64(...) __riscv_vmsleu_vx_u8mf8_b64(__VA_ARGS__)
+#define vmsleu_vv_u8mf4_b32(...) __riscv_vmsleu_vv_u8mf4_b32(__VA_ARGS__)
+#define vmsleu_vx_u8mf4_b32(...) __riscv_vmsleu_vx_u8mf4_b32(__VA_ARGS__)
+#define vmsleu_vv_u8mf2_b16(...) __riscv_vmsleu_vv_u8mf2_b16(__VA_ARGS__)
+#define vmsleu_vx_u8mf2_b16(...) __riscv_vmsleu_vx_u8mf2_b16(__VA_ARGS__)
+#define vmsleu_vv_u8m1_b8(...) __riscv_vmsleu_vv_u8m1_b8(__VA_ARGS__)
+#define vmsleu_vx_u8m1_b8(...) __riscv_vmsleu_vx_u8m1_b8(__VA_ARGS__)
+#define vmsleu_vv_u8m2_b4(...) __riscv_vmsleu_vv_u8m2_b4(__VA_ARGS__)
+#define vmsleu_vx_u8m2_b4(...) __riscv_vmsleu_vx_u8m2_b4(__VA_ARGS__)
+#define vmsleu_vv_u8m4_b2(...) __riscv_vmsleu_vv_u8m4_b2(__VA_ARGS__)
+#define vmsleu_vx_u8m4_b2(...) __riscv_vmsleu_vx_u8m4_b2(__VA_ARGS__)
+#define vmsleu_vv_u8m8_b1(...) __riscv_vmsleu_vv_u8m8_b1(__VA_ARGS__)
+#define vmsleu_vx_u8m8_b1(...) __riscv_vmsleu_vx_u8m8_b1(__VA_ARGS__)
+#define vmsleu_vv_u16mf4_b64(...) __riscv_vmsleu_vv_u16mf4_b64(__VA_ARGS__)
+#define vmsleu_vx_u16mf4_b64(...) __riscv_vmsleu_vx_u16mf4_b64(__VA_ARGS__)
+#define vmsleu_vv_u16mf2_b32(...) __riscv_vmsleu_vv_u16mf2_b32(__VA_ARGS__)
+#define vmsleu_vx_u16mf2_b32(...) __riscv_vmsleu_vx_u16mf2_b32(__VA_ARGS__)
+#define vmsleu_vv_u16m1_b16(...) __riscv_vmsleu_vv_u16m1_b16(__VA_ARGS__)
+#define vmsleu_vx_u16m1_b16(...) __riscv_vmsleu_vx_u16m1_b16(__VA_ARGS__)
+#define vmsleu_vv_u16m2_b8(...) __riscv_vmsleu_vv_u16m2_b8(__VA_ARGS__)
+#define vmsleu_vx_u16m2_b8(...) __riscv_vmsleu_vx_u16m2_b8(__VA_ARGS__)
+#define vmsleu_vv_u16m4_b4(...) __riscv_vmsleu_vv_u16m4_b4(__VA_ARGS__)
+#define vmsleu_vx_u16m4_b4(...) __riscv_vmsleu_vx_u16m4_b4(__VA_ARGS__)
+#define vmsleu_vv_u16m8_b2(...) __riscv_vmsleu_vv_u16m8_b2(__VA_ARGS__)
+#define vmsleu_vx_u16m8_b2(...) __riscv_vmsleu_vx_u16m8_b2(__VA_ARGS__)
+#define vmsleu_vv_u32mf2_b64(...) __riscv_vmsleu_vv_u32mf2_b64(__VA_ARGS__)
+#define vmsleu_vx_u32mf2_b64(...) __riscv_vmsleu_vx_u32mf2_b64(__VA_ARGS__)
+#define vmsleu_vv_u32m1_b32(...) __riscv_vmsleu_vv_u32m1_b32(__VA_ARGS__)
+#define vmsleu_vx_u32m1_b32(...) __riscv_vmsleu_vx_u32m1_b32(__VA_ARGS__)
+#define vmsleu_vv_u32m2_b16(...) __riscv_vmsleu_vv_u32m2_b16(__VA_ARGS__)
+#define vmsleu_vx_u32m2_b16(...) __riscv_vmsleu_vx_u32m2_b16(__VA_ARGS__)
+#define vmsleu_vv_u32m4_b8(...) __riscv_vmsleu_vv_u32m4_b8(__VA_ARGS__)
+#define vmsleu_vx_u32m4_b8(...) __riscv_vmsleu_vx_u32m4_b8(__VA_ARGS__)
+#define vmsleu_vv_u32m8_b4(...) __riscv_vmsleu_vv_u32m8_b4(__VA_ARGS__)
+#define vmsleu_vx_u32m8_b4(...) __riscv_vmsleu_vx_u32m8_b4(__VA_ARGS__)
+#define vmsleu_vv_u64m1_b64(...) __riscv_vmsleu_vv_u64m1_b64(__VA_ARGS__)
+#define vmsleu_vx_u64m1_b64(...) __riscv_vmsleu_vx_u64m1_b64(__VA_ARGS__)
+#define vmsleu_vv_u64m2_b32(...) __riscv_vmsleu_vv_u64m2_b32(__VA_ARGS__)
+#define vmsleu_vx_u64m2_b32(...) __riscv_vmsleu_vx_u64m2_b32(__VA_ARGS__)
+#define vmsleu_vv_u64m4_b16(...) __riscv_vmsleu_vv_u64m4_b16(__VA_ARGS__)
+#define vmsleu_vx_u64m4_b16(...) __riscv_vmsleu_vx_u64m4_b16(__VA_ARGS__)
+#define vmsleu_vv_u64m8_b8(...) __riscv_vmsleu_vv_u64m8_b8(__VA_ARGS__)
+#define vmsleu_vx_u64m8_b8(...) __riscv_vmsleu_vx_u64m8_b8(__VA_ARGS__)
+#define vmsgtu_vv_u8mf8_b64(...) __riscv_vmsgtu_vv_u8mf8_b64(__VA_ARGS__)
+#define vmsgtu_vx_u8mf8_b64(...) __riscv_vmsgtu_vx_u8mf8_b64(__VA_ARGS__)
+#define vmsgtu_vv_u8mf4_b32(...) __riscv_vmsgtu_vv_u8mf4_b32(__VA_ARGS__)
+#define vmsgtu_vx_u8mf4_b32(...) __riscv_vmsgtu_vx_u8mf4_b32(__VA_ARGS__)
+#define vmsgtu_vv_u8mf2_b16(...) __riscv_vmsgtu_vv_u8mf2_b16(__VA_ARGS__)
+#define vmsgtu_vx_u8mf2_b16(...) __riscv_vmsgtu_vx_u8mf2_b16(__VA_ARGS__)
+#define vmsgtu_vv_u8m1_b8(...) __riscv_vmsgtu_vv_u8m1_b8(__VA_ARGS__)
+#define vmsgtu_vx_u8m1_b8(...) __riscv_vmsgtu_vx_u8m1_b8(__VA_ARGS__)
+#define vmsgtu_vv_u8m2_b4(...) __riscv_vmsgtu_vv_u8m2_b4(__VA_ARGS__)
+#define vmsgtu_vx_u8m2_b4(...) __riscv_vmsgtu_vx_u8m2_b4(__VA_ARGS__)
+#define vmsgtu_vv_u8m4_b2(...) __riscv_vmsgtu_vv_u8m4_b2(__VA_ARGS__)
+#define vmsgtu_vx_u8m4_b2(...) __riscv_vmsgtu_vx_u8m4_b2(__VA_ARGS__)
+#define vmsgtu_vv_u8m8_b1(...) __riscv_vmsgtu_vv_u8m8_b1(__VA_ARGS__)
+#define vmsgtu_vx_u8m8_b1(...) __riscv_vmsgtu_vx_u8m8_b1(__VA_ARGS__)
+#define vmsgtu_vv_u16mf4_b64(...) __riscv_vmsgtu_vv_u16mf4_b64(__VA_ARGS__)
+#define vmsgtu_vx_u16mf4_b64(...) __riscv_vmsgtu_vx_u16mf4_b64(__VA_ARGS__)
+#define vmsgtu_vv_u16mf2_b32(...) __riscv_vmsgtu_vv_u16mf2_b32(__VA_ARGS__)
+#define vmsgtu_vx_u16mf2_b32(...) __riscv_vmsgtu_vx_u16mf2_b32(__VA_ARGS__)
+#define vmsgtu_vv_u16m1_b16(...) __riscv_vmsgtu_vv_u16m1_b16(__VA_ARGS__)
+#define vmsgtu_vx_u16m1_b16(...) __riscv_vmsgtu_vx_u16m1_b16(__VA_ARGS__)
+#define vmsgtu_vv_u16m2_b8(...) __riscv_vmsgtu_vv_u16m2_b8(__VA_ARGS__)
+#define vmsgtu_vx_u16m2_b8(...) __riscv_vmsgtu_vx_u16m2_b8(__VA_ARGS__)
+#define vmsgtu_vv_u16m4_b4(...) __riscv_vmsgtu_vv_u16m4_b4(__VA_ARGS__)
+#define vmsgtu_vx_u16m4_b4(...) __riscv_vmsgtu_vx_u16m4_b4(__VA_ARGS__)
+#define vmsgtu_vv_u16m8_b2(...) __riscv_vmsgtu_vv_u16m8_b2(__VA_ARGS__)
+#define vmsgtu_vx_u16m8_b2(...) __riscv_vmsgtu_vx_u16m8_b2(__VA_ARGS__)
+#define vmsgtu_vv_u32mf2_b64(...) __riscv_vmsgtu_vv_u32mf2_b64(__VA_ARGS__)
+#define vmsgtu_vx_u32mf2_b64(...) __riscv_vmsgtu_vx_u32mf2_b64(__VA_ARGS__)
+#define vmsgtu_vv_u32m1_b32(...) __riscv_vmsgtu_vv_u32m1_b32(__VA_ARGS__)
+#define vmsgtu_vx_u32m1_b32(...) __riscv_vmsgtu_vx_u32m1_b32(__VA_ARGS__)
+#define vmsgtu_vv_u32m2_b16(...) __riscv_vmsgtu_vv_u32m2_b16(__VA_ARGS__)
+#define vmsgtu_vx_u32m2_b16(...) __riscv_vmsgtu_vx_u32m2_b16(__VA_ARGS__)
+#define vmsgtu_vv_u32m4_b8(...) __riscv_vmsgtu_vv_u32m4_b8(__VA_ARGS__)
+#define vmsgtu_vx_u32m4_b8(...) __riscv_vmsgtu_vx_u32m4_b8(__VA_ARGS__)
+#define vmsgtu_vv_u32m8_b4(...) __riscv_vmsgtu_vv_u32m8_b4(__VA_ARGS__)
+#define vmsgtu_vx_u32m8_b4(...) __riscv_vmsgtu_vx_u32m8_b4(__VA_ARGS__)
+#define vmsgtu_vv_u64m1_b64(...) __riscv_vmsgtu_vv_u64m1_b64(__VA_ARGS__)
+#define vmsgtu_vx_u64m1_b64(...) __riscv_vmsgtu_vx_u64m1_b64(__VA_ARGS__)
+#define vmsgtu_vv_u64m2_b32(...) __riscv_vmsgtu_vv_u64m2_b32(__VA_ARGS__)
+#define vmsgtu_vx_u64m2_b32(...) __riscv_vmsgtu_vx_u64m2_b32(__VA_ARGS__)
+#define vmsgtu_vv_u64m4_b16(...) __riscv_vmsgtu_vv_u64m4_b16(__VA_ARGS__)
+#define vmsgtu_vx_u64m4_b16(...) __riscv_vmsgtu_vx_u64m4_b16(__VA_ARGS__)
+#define vmsgtu_vv_u64m8_b8(...) __riscv_vmsgtu_vv_u64m8_b8(__VA_ARGS__)
+#define vmsgtu_vx_u64m8_b8(...) __riscv_vmsgtu_vx_u64m8_b8(__VA_ARGS__)
+#define vmsgeu_vv_u8mf8_b64(...) __riscv_vmsgeu_vv_u8mf8_b64(__VA_ARGS__)
+#define vmsgeu_vx_u8mf8_b64(...) __riscv_vmsgeu_vx_u8mf8_b64(__VA_ARGS__)
+#define vmsgeu_vv_u8mf4_b32(...) __riscv_vmsgeu_vv_u8mf4_b32(__VA_ARGS__)
+#define vmsgeu_vx_u8mf4_b32(...) __riscv_vmsgeu_vx_u8mf4_b32(__VA_ARGS__)
+#define vmsgeu_vv_u8mf2_b16(...) __riscv_vmsgeu_vv_u8mf2_b16(__VA_ARGS__)
+#define vmsgeu_vx_u8mf2_b16(...) __riscv_vmsgeu_vx_u8mf2_b16(__VA_ARGS__)
+#define vmsgeu_vv_u8m1_b8(...) __riscv_vmsgeu_vv_u8m1_b8(__VA_ARGS__)
+#define vmsgeu_vx_u8m1_b8(...) __riscv_vmsgeu_vx_u8m1_b8(__VA_ARGS__)
+#define vmsgeu_vv_u8m2_b4(...) __riscv_vmsgeu_vv_u8m2_b4(__VA_ARGS__)
+#define vmsgeu_vx_u8m2_b4(...) __riscv_vmsgeu_vx_u8m2_b4(__VA_ARGS__)
+#define vmsgeu_vv_u8m4_b2(...) __riscv_vmsgeu_vv_u8m4_b2(__VA_ARGS__)
+#define vmsgeu_vx_u8m4_b2(...) __riscv_vmsgeu_vx_u8m4_b2(__VA_ARGS__)
+#define vmsgeu_vv_u8m8_b1(...) __riscv_vmsgeu_vv_u8m8_b1(__VA_ARGS__)
+#define vmsgeu_vx_u8m8_b1(...) __riscv_vmsgeu_vx_u8m8_b1(__VA_ARGS__)
+#define vmsgeu_vv_u16mf4_b64(...) __riscv_vmsgeu_vv_u16mf4_b64(__VA_ARGS__)
+#define vmsgeu_vx_u16mf4_b64(...) __riscv_vmsgeu_vx_u16mf4_b64(__VA_ARGS__)
+#define vmsgeu_vv_u16mf2_b32(...) __riscv_vmsgeu_vv_u16mf2_b32(__VA_ARGS__)
+#define vmsgeu_vx_u16mf2_b32(...) __riscv_vmsgeu_vx_u16mf2_b32(__VA_ARGS__)
+#define vmsgeu_vv_u16m1_b16(...) __riscv_vmsgeu_vv_u16m1_b16(__VA_ARGS__)
+#define vmsgeu_vx_u16m1_b16(...) __riscv_vmsgeu_vx_u16m1_b16(__VA_ARGS__)
+#define vmsgeu_vv_u16m2_b8(...) __riscv_vmsgeu_vv_u16m2_b8(__VA_ARGS__)
+#define vmsgeu_vx_u16m2_b8(...) __riscv_vmsgeu_vx_u16m2_b8(__VA_ARGS__)
+#define vmsgeu_vv_u16m4_b4(...) __riscv_vmsgeu_vv_u16m4_b4(__VA_ARGS__)
+#define vmsgeu_vx_u16m4_b4(...) __riscv_vmsgeu_vx_u16m4_b4(__VA_ARGS__)
+#define vmsgeu_vv_u16m8_b2(...) __riscv_vmsgeu_vv_u16m8_b2(__VA_ARGS__)
+#define vmsgeu_vx_u16m8_b2(...) __riscv_vmsgeu_vx_u16m8_b2(__VA_ARGS__)
+#define vmsgeu_vv_u32mf2_b64(...) __riscv_vmsgeu_vv_u32mf2_b64(__VA_ARGS__)
+#define vmsgeu_vx_u32mf2_b64(...) __riscv_vmsgeu_vx_u32mf2_b64(__VA_ARGS__)
+#define vmsgeu_vv_u32m1_b32(...) __riscv_vmsgeu_vv_u32m1_b32(__VA_ARGS__)
+#define vmsgeu_vx_u32m1_b32(...) __riscv_vmsgeu_vx_u32m1_b32(__VA_ARGS__)
+#define vmsgeu_vv_u32m2_b16(...) __riscv_vmsgeu_vv_u32m2_b16(__VA_ARGS__)
+#define vmsgeu_vx_u32m2_b16(...) __riscv_vmsgeu_vx_u32m2_b16(__VA_ARGS__)
+#define vmsgeu_vv_u32m4_b8(...) __riscv_vmsgeu_vv_u32m4_b8(__VA_ARGS__)
+#define vmsgeu_vx_u32m4_b8(...) __riscv_vmsgeu_vx_u32m4_b8(__VA_ARGS__)
+#define vmsgeu_vv_u32m8_b4(...) __riscv_vmsgeu_vv_u32m8_b4(__VA_ARGS__)
+#define vmsgeu_vx_u32m8_b4(...) __riscv_vmsgeu_vx_u32m8_b4(__VA_ARGS__)
+#define vmsgeu_vv_u64m1_b64(...) __riscv_vmsgeu_vv_u64m1_b64(__VA_ARGS__)
+#define vmsgeu_vx_u64m1_b64(...) __riscv_vmsgeu_vx_u64m1_b64(__VA_ARGS__)
+#define vmsgeu_vv_u64m2_b32(...) __riscv_vmsgeu_vv_u64m2_b32(__VA_ARGS__)
+#define vmsgeu_vx_u64m2_b32(...) __riscv_vmsgeu_vx_u64m2_b32(__VA_ARGS__)
+#define vmsgeu_vv_u64m4_b16(...) __riscv_vmsgeu_vv_u64m4_b16(__VA_ARGS__)
+#define vmsgeu_vx_u64m4_b16(...) __riscv_vmsgeu_vx_u64m4_b16(__VA_ARGS__)
+#define vmsgeu_vv_u64m8_b8(...) __riscv_vmsgeu_vv_u64m8_b8(__VA_ARGS__)
+#define vmsgeu_vx_u64m8_b8(...) __riscv_vmsgeu_vx_u64m8_b8(__VA_ARGS__)
+// masked functions: each un-prefixed vms*_m compare alias expands to the matching __riscv_vms*_mu intrinsic
+#define vmseq_vv_i8mf8_b64_m(...) __riscv_vmseq_vv_i8mf8_b64_mu(__VA_ARGS__)
+#define vmseq_vx_i8mf8_b64_m(...) __riscv_vmseq_vx_i8mf8_b64_mu(__VA_ARGS__)
+#define vmseq_vv_i8mf4_b32_m(...) __riscv_vmseq_vv_i8mf4_b32_mu(__VA_ARGS__)
+#define vmseq_vx_i8mf4_b32_m(...) __riscv_vmseq_vx_i8mf4_b32_mu(__VA_ARGS__)
+#define vmseq_vv_i8mf2_b16_m(...) __riscv_vmseq_vv_i8mf2_b16_mu(__VA_ARGS__)
+#define vmseq_vx_i8mf2_b16_m(...) __riscv_vmseq_vx_i8mf2_b16_mu(__VA_ARGS__)
+#define vmseq_vv_i8m1_b8_m(...) __riscv_vmseq_vv_i8m1_b8_mu(__VA_ARGS__)
+#define vmseq_vx_i8m1_b8_m(...) __riscv_vmseq_vx_i8m1_b8_mu(__VA_ARGS__)
+#define vmseq_vv_i8m2_b4_m(...) __riscv_vmseq_vv_i8m2_b4_mu(__VA_ARGS__)
+#define vmseq_vx_i8m2_b4_m(...) __riscv_vmseq_vx_i8m2_b4_mu(__VA_ARGS__)
+#define vmseq_vv_i8m4_b2_m(...) __riscv_vmseq_vv_i8m4_b2_mu(__VA_ARGS__)
+#define vmseq_vx_i8m4_b2_m(...) __riscv_vmseq_vx_i8m4_b2_mu(__VA_ARGS__)
+#define vmseq_vv_i8m8_b1_m(...) __riscv_vmseq_vv_i8m8_b1_mu(__VA_ARGS__)
+#define vmseq_vx_i8m8_b1_m(...) __riscv_vmseq_vx_i8m8_b1_mu(__VA_ARGS__)
+#define vmseq_vv_i16mf4_b64_m(...) __riscv_vmseq_vv_i16mf4_b64_mu(__VA_ARGS__)
+#define vmseq_vx_i16mf4_b64_m(...) __riscv_vmseq_vx_i16mf4_b64_mu(__VA_ARGS__)
+#define vmseq_vv_i16mf2_b32_m(...) __riscv_vmseq_vv_i16mf2_b32_mu(__VA_ARGS__)
+#define vmseq_vx_i16mf2_b32_m(...) __riscv_vmseq_vx_i16mf2_b32_mu(__VA_ARGS__)
+#define vmseq_vv_i16m1_b16_m(...) __riscv_vmseq_vv_i16m1_b16_mu(__VA_ARGS__)
+#define vmseq_vx_i16m1_b16_m(...) __riscv_vmseq_vx_i16m1_b16_mu(__VA_ARGS__)
+#define vmseq_vv_i16m2_b8_m(...) __riscv_vmseq_vv_i16m2_b8_mu(__VA_ARGS__)
+#define vmseq_vx_i16m2_b8_m(...) __riscv_vmseq_vx_i16m2_b8_mu(__VA_ARGS__)
+#define vmseq_vv_i16m4_b4_m(...) __riscv_vmseq_vv_i16m4_b4_mu(__VA_ARGS__)
+#define vmseq_vx_i16m4_b4_m(...) __riscv_vmseq_vx_i16m4_b4_mu(__VA_ARGS__)
+#define vmseq_vv_i16m8_b2_m(...) __riscv_vmseq_vv_i16m8_b2_mu(__VA_ARGS__)
+#define vmseq_vx_i16m8_b2_m(...) __riscv_vmseq_vx_i16m8_b2_mu(__VA_ARGS__)
+#define vmseq_vv_i32mf2_b64_m(...) __riscv_vmseq_vv_i32mf2_b64_mu(__VA_ARGS__)
+#define vmseq_vx_i32mf2_b64_m(...) __riscv_vmseq_vx_i32mf2_b64_mu(__VA_ARGS__)
+#define vmseq_vv_i32m1_b32_m(...) __riscv_vmseq_vv_i32m1_b32_mu(__VA_ARGS__)
+#define vmseq_vx_i32m1_b32_m(...) __riscv_vmseq_vx_i32m1_b32_mu(__VA_ARGS__)
+#define vmseq_vv_i32m2_b16_m(...) __riscv_vmseq_vv_i32m2_b16_mu(__VA_ARGS__)
+#define vmseq_vx_i32m2_b16_m(...) __riscv_vmseq_vx_i32m2_b16_mu(__VA_ARGS__)
+#define vmseq_vv_i32m4_b8_m(...) __riscv_vmseq_vv_i32m4_b8_mu(__VA_ARGS__)
+#define vmseq_vx_i32m4_b8_m(...) __riscv_vmseq_vx_i32m4_b8_mu(__VA_ARGS__)
+#define vmseq_vv_i32m8_b4_m(...) __riscv_vmseq_vv_i32m8_b4_mu(__VA_ARGS__)
+#define vmseq_vx_i32m8_b4_m(...) __riscv_vmseq_vx_i32m8_b4_mu(__VA_ARGS__)
+#define vmseq_vv_i64m1_b64_m(...) __riscv_vmseq_vv_i64m1_b64_mu(__VA_ARGS__)
+#define vmseq_vx_i64m1_b64_m(...) __riscv_vmseq_vx_i64m1_b64_mu(__VA_ARGS__)
+#define vmseq_vv_i64m2_b32_m(...) __riscv_vmseq_vv_i64m2_b32_mu(__VA_ARGS__)
+#define vmseq_vx_i64m2_b32_m(...) __riscv_vmseq_vx_i64m2_b32_mu(__VA_ARGS__)
+#define vmseq_vv_i64m4_b16_m(...) __riscv_vmseq_vv_i64m4_b16_mu(__VA_ARGS__)
+#define vmseq_vx_i64m4_b16_m(...) __riscv_vmseq_vx_i64m4_b16_mu(__VA_ARGS__)
+#define vmseq_vv_i64m8_b8_m(...) __riscv_vmseq_vv_i64m8_b8_mu(__VA_ARGS__)
+#define vmseq_vx_i64m8_b8_m(...) __riscv_vmseq_vx_i64m8_b8_mu(__VA_ARGS__)
+#define vmsne_vv_i8mf8_b64_m(...) __riscv_vmsne_vv_i8mf8_b64_mu(__VA_ARGS__)
+#define vmsne_vx_i8mf8_b64_m(...) __riscv_vmsne_vx_i8mf8_b64_mu(__VA_ARGS__)
+#define vmsne_vv_i8mf4_b32_m(...) __riscv_vmsne_vv_i8mf4_b32_mu(__VA_ARGS__)
+#define vmsne_vx_i8mf4_b32_m(...) __riscv_vmsne_vx_i8mf4_b32_mu(__VA_ARGS__)
+#define vmsne_vv_i8mf2_b16_m(...) __riscv_vmsne_vv_i8mf2_b16_mu(__VA_ARGS__)
+#define vmsne_vx_i8mf2_b16_m(...) __riscv_vmsne_vx_i8mf2_b16_mu(__VA_ARGS__)
+#define vmsne_vv_i8m1_b8_m(...) __riscv_vmsne_vv_i8m1_b8_mu(__VA_ARGS__)
+#define vmsne_vx_i8m1_b8_m(...) __riscv_vmsne_vx_i8m1_b8_mu(__VA_ARGS__)
+#define vmsne_vv_i8m2_b4_m(...) __riscv_vmsne_vv_i8m2_b4_mu(__VA_ARGS__)
+#define vmsne_vx_i8m2_b4_m(...) __riscv_vmsne_vx_i8m2_b4_mu(__VA_ARGS__)
+#define vmsne_vv_i8m4_b2_m(...) __riscv_vmsne_vv_i8m4_b2_mu(__VA_ARGS__)
+#define vmsne_vx_i8m4_b2_m(...) __riscv_vmsne_vx_i8m4_b2_mu(__VA_ARGS__)
+#define vmsne_vv_i8m8_b1_m(...) __riscv_vmsne_vv_i8m8_b1_mu(__VA_ARGS__)
+#define vmsne_vx_i8m8_b1_m(...) __riscv_vmsne_vx_i8m8_b1_mu(__VA_ARGS__)
+#define vmsne_vv_i16mf4_b64_m(...) __riscv_vmsne_vv_i16mf4_b64_mu(__VA_ARGS__)
+#define vmsne_vx_i16mf4_b64_m(...) __riscv_vmsne_vx_i16mf4_b64_mu(__VA_ARGS__)
+#define vmsne_vv_i16mf2_b32_m(...) __riscv_vmsne_vv_i16mf2_b32_mu(__VA_ARGS__)
+#define vmsne_vx_i16mf2_b32_m(...) __riscv_vmsne_vx_i16mf2_b32_mu(__VA_ARGS__)
+#define vmsne_vv_i16m1_b16_m(...) __riscv_vmsne_vv_i16m1_b16_mu(__VA_ARGS__)
+#define vmsne_vx_i16m1_b16_m(...) __riscv_vmsne_vx_i16m1_b16_mu(__VA_ARGS__)
+#define vmsne_vv_i16m2_b8_m(...) __riscv_vmsne_vv_i16m2_b8_mu(__VA_ARGS__)
+#define vmsne_vx_i16m2_b8_m(...) __riscv_vmsne_vx_i16m2_b8_mu(__VA_ARGS__)
+#define vmsne_vv_i16m4_b4_m(...) __riscv_vmsne_vv_i16m4_b4_mu(__VA_ARGS__)
+#define vmsne_vx_i16m4_b4_m(...) __riscv_vmsne_vx_i16m4_b4_mu(__VA_ARGS__)
+#define vmsne_vv_i16m8_b2_m(...) __riscv_vmsne_vv_i16m8_b2_mu(__VA_ARGS__)
+#define vmsne_vx_i16m8_b2_m(...) __riscv_vmsne_vx_i16m8_b2_mu(__VA_ARGS__)
+#define vmsne_vv_i32mf2_b64_m(...) __riscv_vmsne_vv_i32mf2_b64_mu(__VA_ARGS__)
+#define vmsne_vx_i32mf2_b64_m(...) __riscv_vmsne_vx_i32mf2_b64_mu(__VA_ARGS__)
+#define vmsne_vv_i32m1_b32_m(...) __riscv_vmsne_vv_i32m1_b32_mu(__VA_ARGS__)
+#define vmsne_vx_i32m1_b32_m(...) __riscv_vmsne_vx_i32m1_b32_mu(__VA_ARGS__)
+#define vmsne_vv_i32m2_b16_m(...) __riscv_vmsne_vv_i32m2_b16_mu(__VA_ARGS__)
+#define vmsne_vx_i32m2_b16_m(...) __riscv_vmsne_vx_i32m2_b16_mu(__VA_ARGS__)
+#define vmsne_vv_i32m4_b8_m(...) __riscv_vmsne_vv_i32m4_b8_mu(__VA_ARGS__)
+#define vmsne_vx_i32m4_b8_m(...) __riscv_vmsne_vx_i32m4_b8_mu(__VA_ARGS__)
+#define vmsne_vv_i32m8_b4_m(...) __riscv_vmsne_vv_i32m8_b4_mu(__VA_ARGS__)
+#define vmsne_vx_i32m8_b4_m(...) __riscv_vmsne_vx_i32m8_b4_mu(__VA_ARGS__)
+#define vmsne_vv_i64m1_b64_m(...) __riscv_vmsne_vv_i64m1_b64_mu(__VA_ARGS__)
+#define vmsne_vx_i64m1_b64_m(...) __riscv_vmsne_vx_i64m1_b64_mu(__VA_ARGS__)
+#define vmsne_vv_i64m2_b32_m(...) __riscv_vmsne_vv_i64m2_b32_mu(__VA_ARGS__)
+#define vmsne_vx_i64m2_b32_m(...) __riscv_vmsne_vx_i64m2_b32_mu(__VA_ARGS__)
+#define vmsne_vv_i64m4_b16_m(...) __riscv_vmsne_vv_i64m4_b16_mu(__VA_ARGS__)
+#define vmsne_vx_i64m4_b16_m(...) __riscv_vmsne_vx_i64m4_b16_mu(__VA_ARGS__)
+#define vmsne_vv_i64m8_b8_m(...) __riscv_vmsne_vv_i64m8_b8_mu(__VA_ARGS__)
+#define vmsne_vx_i64m8_b8_m(...) __riscv_vmsne_vx_i64m8_b8_mu(__VA_ARGS__)
+#define vmslt_vv_i8mf8_b64_m(...) __riscv_vmslt_vv_i8mf8_b64_mu(__VA_ARGS__)
+#define vmslt_vx_i8mf8_b64_m(...) __riscv_vmslt_vx_i8mf8_b64_mu(__VA_ARGS__)
+#define vmslt_vv_i8mf4_b32_m(...) __riscv_vmslt_vv_i8mf4_b32_mu(__VA_ARGS__)
+#define vmslt_vx_i8mf4_b32_m(...) __riscv_vmslt_vx_i8mf4_b32_mu(__VA_ARGS__)
+#define vmslt_vv_i8mf2_b16_m(...) __riscv_vmslt_vv_i8mf2_b16_mu(__VA_ARGS__)
+#define vmslt_vx_i8mf2_b16_m(...) __riscv_vmslt_vx_i8mf2_b16_mu(__VA_ARGS__)
+#define vmslt_vv_i8m1_b8_m(...) __riscv_vmslt_vv_i8m1_b8_mu(__VA_ARGS__)
+#define vmslt_vx_i8m1_b8_m(...) __riscv_vmslt_vx_i8m1_b8_mu(__VA_ARGS__)
+#define vmslt_vv_i8m2_b4_m(...) __riscv_vmslt_vv_i8m2_b4_mu(__VA_ARGS__)
+#define vmslt_vx_i8m2_b4_m(...) __riscv_vmslt_vx_i8m2_b4_mu(__VA_ARGS__)
+#define vmslt_vv_i8m4_b2_m(...) __riscv_vmslt_vv_i8m4_b2_mu(__VA_ARGS__)
+#define vmslt_vx_i8m4_b2_m(...) __riscv_vmslt_vx_i8m4_b2_mu(__VA_ARGS__)
+#define vmslt_vv_i8m8_b1_m(...) __riscv_vmslt_vv_i8m8_b1_mu(__VA_ARGS__)
+#define vmslt_vx_i8m8_b1_m(...) __riscv_vmslt_vx_i8m8_b1_mu(__VA_ARGS__)
+#define vmslt_vv_i16mf4_b64_m(...) __riscv_vmslt_vv_i16mf4_b64_mu(__VA_ARGS__)
+#define vmslt_vx_i16mf4_b64_m(...) __riscv_vmslt_vx_i16mf4_b64_mu(__VA_ARGS__)
+#define vmslt_vv_i16mf2_b32_m(...) __riscv_vmslt_vv_i16mf2_b32_mu(__VA_ARGS__)
+#define vmslt_vx_i16mf2_b32_m(...) __riscv_vmslt_vx_i16mf2_b32_mu(__VA_ARGS__)
+#define vmslt_vv_i16m1_b16_m(...) __riscv_vmslt_vv_i16m1_b16_mu(__VA_ARGS__)
+#define vmslt_vx_i16m1_b16_m(...) __riscv_vmslt_vx_i16m1_b16_mu(__VA_ARGS__)
+#define vmslt_vv_i16m2_b8_m(...) __riscv_vmslt_vv_i16m2_b8_mu(__VA_ARGS__)
+#define vmslt_vx_i16m2_b8_m(...) __riscv_vmslt_vx_i16m2_b8_mu(__VA_ARGS__)
+#define vmslt_vv_i16m4_b4_m(...) __riscv_vmslt_vv_i16m4_b4_mu(__VA_ARGS__)
+#define vmslt_vx_i16m4_b4_m(...) __riscv_vmslt_vx_i16m4_b4_mu(__VA_ARGS__)
+#define vmslt_vv_i16m8_b2_m(...) __riscv_vmslt_vv_i16m8_b2_mu(__VA_ARGS__)
+#define vmslt_vx_i16m8_b2_m(...) __riscv_vmslt_vx_i16m8_b2_mu(__VA_ARGS__)
+#define vmslt_vv_i32mf2_b64_m(...) __riscv_vmslt_vv_i32mf2_b64_mu(__VA_ARGS__)
+#define vmslt_vx_i32mf2_b64_m(...) __riscv_vmslt_vx_i32mf2_b64_mu(__VA_ARGS__)
+#define vmslt_vv_i32m1_b32_m(...) __riscv_vmslt_vv_i32m1_b32_mu(__VA_ARGS__)
+#define vmslt_vx_i32m1_b32_m(...) __riscv_vmslt_vx_i32m1_b32_mu(__VA_ARGS__)
+#define vmslt_vv_i32m2_b16_m(...) __riscv_vmslt_vv_i32m2_b16_mu(__VA_ARGS__)
+#define vmslt_vx_i32m2_b16_m(...) __riscv_vmslt_vx_i32m2_b16_mu(__VA_ARGS__)
+#define vmslt_vv_i32m4_b8_m(...) __riscv_vmslt_vv_i32m4_b8_mu(__VA_ARGS__)
+#define vmslt_vx_i32m4_b8_m(...) __riscv_vmslt_vx_i32m4_b8_mu(__VA_ARGS__)
+#define vmslt_vv_i32m8_b4_m(...) __riscv_vmslt_vv_i32m8_b4_mu(__VA_ARGS__)
+#define vmslt_vx_i32m8_b4_m(...) __riscv_vmslt_vx_i32m8_b4_mu(__VA_ARGS__)
+#define vmslt_vv_i64m1_b64_m(...) __riscv_vmslt_vv_i64m1_b64_mu(__VA_ARGS__)
+#define vmslt_vx_i64m1_b64_m(...) __riscv_vmslt_vx_i64m1_b64_mu(__VA_ARGS__)
+#define vmslt_vv_i64m2_b32_m(...) __riscv_vmslt_vv_i64m2_b32_mu(__VA_ARGS__)
+#define vmslt_vx_i64m2_b32_m(...) __riscv_vmslt_vx_i64m2_b32_mu(__VA_ARGS__)
+#define vmslt_vv_i64m4_b16_m(...) __riscv_vmslt_vv_i64m4_b16_mu(__VA_ARGS__)
+#define vmslt_vx_i64m4_b16_m(...) __riscv_vmslt_vx_i64m4_b16_mu(__VA_ARGS__)
+#define vmslt_vv_i64m8_b8_m(...) __riscv_vmslt_vv_i64m8_b8_mu(__VA_ARGS__)
+#define vmslt_vx_i64m8_b8_m(...) __riscv_vmslt_vx_i64m8_b8_mu(__VA_ARGS__)
+#define vmsle_vv_i8mf8_b64_m(...) __riscv_vmsle_vv_i8mf8_b64_mu(__VA_ARGS__)
+#define vmsle_vx_i8mf8_b64_m(...) __riscv_vmsle_vx_i8mf8_b64_mu(__VA_ARGS__)
+#define vmsle_vv_i8mf4_b32_m(...) __riscv_vmsle_vv_i8mf4_b32_mu(__VA_ARGS__)
+#define vmsle_vx_i8mf4_b32_m(...) __riscv_vmsle_vx_i8mf4_b32_mu(__VA_ARGS__)
+#define vmsle_vv_i8mf2_b16_m(...) __riscv_vmsle_vv_i8mf2_b16_mu(__VA_ARGS__)
+#define vmsle_vx_i8mf2_b16_m(...) __riscv_vmsle_vx_i8mf2_b16_mu(__VA_ARGS__)
+#define vmsle_vv_i8m1_b8_m(...) __riscv_vmsle_vv_i8m1_b8_mu(__VA_ARGS__)
+#define vmsle_vx_i8m1_b8_m(...) __riscv_vmsle_vx_i8m1_b8_mu(__VA_ARGS__)
+#define vmsle_vv_i8m2_b4_m(...) __riscv_vmsle_vv_i8m2_b4_mu(__VA_ARGS__)
+#define vmsle_vx_i8m2_b4_m(...) __riscv_vmsle_vx_i8m2_b4_mu(__VA_ARGS__)
+#define vmsle_vv_i8m4_b2_m(...) __riscv_vmsle_vv_i8m4_b2_mu(__VA_ARGS__)
+#define vmsle_vx_i8m4_b2_m(...) __riscv_vmsle_vx_i8m4_b2_mu(__VA_ARGS__)
+#define vmsle_vv_i8m8_b1_m(...) __riscv_vmsle_vv_i8m8_b1_mu(__VA_ARGS__)
+#define vmsle_vx_i8m8_b1_m(...) __riscv_vmsle_vx_i8m8_b1_mu(__VA_ARGS__)
+#define vmsle_vv_i16mf4_b64_m(...) __riscv_vmsle_vv_i16mf4_b64_mu(__VA_ARGS__)
+#define vmsle_vx_i16mf4_b64_m(...) __riscv_vmsle_vx_i16mf4_b64_mu(__VA_ARGS__)
+#define vmsle_vv_i16mf2_b32_m(...) __riscv_vmsle_vv_i16mf2_b32_mu(__VA_ARGS__)
+#define vmsle_vx_i16mf2_b32_m(...) __riscv_vmsle_vx_i16mf2_b32_mu(__VA_ARGS__)
+#define vmsle_vv_i16m1_b16_m(...) __riscv_vmsle_vv_i16m1_b16_mu(__VA_ARGS__)
+#define vmsle_vx_i16m1_b16_m(...) __riscv_vmsle_vx_i16m1_b16_mu(__VA_ARGS__)
+#define vmsle_vv_i16m2_b8_m(...) __riscv_vmsle_vv_i16m2_b8_mu(__VA_ARGS__)
+#define vmsle_vx_i16m2_b8_m(...) __riscv_vmsle_vx_i16m2_b8_mu(__VA_ARGS__)
+#define vmsle_vv_i16m4_b4_m(...) __riscv_vmsle_vv_i16m4_b4_mu(__VA_ARGS__)
+#define vmsle_vx_i16m4_b4_m(...) __riscv_vmsle_vx_i16m4_b4_mu(__VA_ARGS__)
+#define vmsle_vv_i16m8_b2_m(...) __riscv_vmsle_vv_i16m8_b2_mu(__VA_ARGS__)
+#define vmsle_vx_i16m8_b2_m(...) __riscv_vmsle_vx_i16m8_b2_mu(__VA_ARGS__)
+#define vmsle_vv_i32mf2_b64_m(...) __riscv_vmsle_vv_i32mf2_b64_mu(__VA_ARGS__)
+#define vmsle_vx_i32mf2_b64_m(...) __riscv_vmsle_vx_i32mf2_b64_mu(__VA_ARGS__)
+#define vmsle_vv_i32m1_b32_m(...) __riscv_vmsle_vv_i32m1_b32_mu(__VA_ARGS__)
+#define vmsle_vx_i32m1_b32_m(...) __riscv_vmsle_vx_i32m1_b32_mu(__VA_ARGS__)
+#define vmsle_vv_i32m2_b16_m(...) __riscv_vmsle_vv_i32m2_b16_mu(__VA_ARGS__)
+#define vmsle_vx_i32m2_b16_m(...) __riscv_vmsle_vx_i32m2_b16_mu(__VA_ARGS__)
+#define vmsle_vv_i32m4_b8_m(...) __riscv_vmsle_vv_i32m4_b8_mu(__VA_ARGS__)
+#define vmsle_vx_i32m4_b8_m(...) __riscv_vmsle_vx_i32m4_b8_mu(__VA_ARGS__)
+#define vmsle_vv_i32m8_b4_m(...) __riscv_vmsle_vv_i32m8_b4_mu(__VA_ARGS__)
+#define vmsle_vx_i32m8_b4_m(...) __riscv_vmsle_vx_i32m8_b4_mu(__VA_ARGS__)
+#define vmsle_vv_i64m1_b64_m(...) __riscv_vmsle_vv_i64m1_b64_mu(__VA_ARGS__)
+#define vmsle_vx_i64m1_b64_m(...) __riscv_vmsle_vx_i64m1_b64_mu(__VA_ARGS__)
+#define vmsle_vv_i64m2_b32_m(...) __riscv_vmsle_vv_i64m2_b32_mu(__VA_ARGS__)
+#define vmsle_vx_i64m2_b32_m(...) __riscv_vmsle_vx_i64m2_b32_mu(__VA_ARGS__)
+#define vmsle_vv_i64m4_b16_m(...) __riscv_vmsle_vv_i64m4_b16_mu(__VA_ARGS__)
+#define vmsle_vx_i64m4_b16_m(...) __riscv_vmsle_vx_i64m4_b16_mu(__VA_ARGS__)
+#define vmsle_vv_i64m8_b8_m(...) __riscv_vmsle_vv_i64m8_b8_mu(__VA_ARGS__)
+#define vmsle_vx_i64m8_b8_m(...) __riscv_vmsle_vx_i64m8_b8_mu(__VA_ARGS__)
+#define vmsgt_vv_i8mf8_b64_m(...) __riscv_vmsgt_vv_i8mf8_b64_mu(__VA_ARGS__)
+#define vmsgt_vx_i8mf8_b64_m(...) __riscv_vmsgt_vx_i8mf8_b64_mu(__VA_ARGS__)
+#define vmsgt_vv_i8mf4_b32_m(...) __riscv_vmsgt_vv_i8mf4_b32_mu(__VA_ARGS__)
+#define vmsgt_vx_i8mf4_b32_m(...) __riscv_vmsgt_vx_i8mf4_b32_mu(__VA_ARGS__)
+#define vmsgt_vv_i8mf2_b16_m(...) __riscv_vmsgt_vv_i8mf2_b16_mu(__VA_ARGS__)
+#define vmsgt_vx_i8mf2_b16_m(...) __riscv_vmsgt_vx_i8mf2_b16_mu(__VA_ARGS__)
+#define vmsgt_vv_i8m1_b8_m(...) __riscv_vmsgt_vv_i8m1_b8_mu(__VA_ARGS__)
+#define vmsgt_vx_i8m1_b8_m(...) __riscv_vmsgt_vx_i8m1_b8_mu(__VA_ARGS__)
+#define vmsgt_vv_i8m2_b4_m(...) __riscv_vmsgt_vv_i8m2_b4_mu(__VA_ARGS__)
+#define vmsgt_vx_i8m2_b4_m(...) __riscv_vmsgt_vx_i8m2_b4_mu(__VA_ARGS__)
+#define vmsgt_vv_i8m4_b2_m(...) __riscv_vmsgt_vv_i8m4_b2_mu(__VA_ARGS__)
+#define vmsgt_vx_i8m4_b2_m(...) __riscv_vmsgt_vx_i8m4_b2_mu(__VA_ARGS__)
+#define vmsgt_vv_i8m8_b1_m(...) __riscv_vmsgt_vv_i8m8_b1_mu(__VA_ARGS__)
+#define vmsgt_vx_i8m8_b1_m(...) __riscv_vmsgt_vx_i8m8_b1_mu(__VA_ARGS__)
+#define vmsgt_vv_i16mf4_b64_m(...) __riscv_vmsgt_vv_i16mf4_b64_mu(__VA_ARGS__)
+#define vmsgt_vx_i16mf4_b64_m(...) __riscv_vmsgt_vx_i16mf4_b64_mu(__VA_ARGS__)
+#define vmsgt_vv_i16mf2_b32_m(...) __riscv_vmsgt_vv_i16mf2_b32_mu(__VA_ARGS__)
+#define vmsgt_vx_i16mf2_b32_m(...) __riscv_vmsgt_vx_i16mf2_b32_mu(__VA_ARGS__)
+#define vmsgt_vv_i16m1_b16_m(...) __riscv_vmsgt_vv_i16m1_b16_mu(__VA_ARGS__)
+#define vmsgt_vx_i16m1_b16_m(...) __riscv_vmsgt_vx_i16m1_b16_mu(__VA_ARGS__)
+#define vmsgt_vv_i16m2_b8_m(...) __riscv_vmsgt_vv_i16m2_b8_mu(__VA_ARGS__)
+#define vmsgt_vx_i16m2_b8_m(...) __riscv_vmsgt_vx_i16m2_b8_mu(__VA_ARGS__)
+#define vmsgt_vv_i16m4_b4_m(...) __riscv_vmsgt_vv_i16m4_b4_mu(__VA_ARGS__)
+#define vmsgt_vx_i16m4_b4_m(...) __riscv_vmsgt_vx_i16m4_b4_mu(__VA_ARGS__)
+#define vmsgt_vv_i16m8_b2_m(...) __riscv_vmsgt_vv_i16m8_b2_mu(__VA_ARGS__)
+#define vmsgt_vx_i16m8_b2_m(...) __riscv_vmsgt_vx_i16m8_b2_mu(__VA_ARGS__)
+#define vmsgt_vv_i32mf2_b64_m(...) __riscv_vmsgt_vv_i32mf2_b64_mu(__VA_ARGS__)
+#define vmsgt_vx_i32mf2_b64_m(...) __riscv_vmsgt_vx_i32mf2_b64_mu(__VA_ARGS__)
+#define vmsgt_vv_i32m1_b32_m(...) __riscv_vmsgt_vv_i32m1_b32_mu(__VA_ARGS__)
+#define vmsgt_vx_i32m1_b32_m(...) __riscv_vmsgt_vx_i32m1_b32_mu(__VA_ARGS__)
+#define vmsgt_vv_i32m2_b16_m(...) __riscv_vmsgt_vv_i32m2_b16_mu(__VA_ARGS__)
+#define vmsgt_vx_i32m2_b16_m(...) __riscv_vmsgt_vx_i32m2_b16_mu(__VA_ARGS__)
+#define vmsgt_vv_i32m4_b8_m(...) __riscv_vmsgt_vv_i32m4_b8_mu(__VA_ARGS__)
+#define vmsgt_vx_i32m4_b8_m(...) __riscv_vmsgt_vx_i32m4_b8_mu(__VA_ARGS__)
+#define vmsgt_vv_i32m8_b4_m(...) __riscv_vmsgt_vv_i32m8_b4_mu(__VA_ARGS__)
+#define vmsgt_vx_i32m8_b4_m(...) __riscv_vmsgt_vx_i32m8_b4_mu(__VA_ARGS__)
+#define vmsgt_vv_i64m1_b64_m(...) __riscv_vmsgt_vv_i64m1_b64_mu(__VA_ARGS__)
+#define vmsgt_vx_i64m1_b64_m(...) __riscv_vmsgt_vx_i64m1_b64_mu(__VA_ARGS__)
+#define vmsgt_vv_i64m2_b32_m(...) __riscv_vmsgt_vv_i64m2_b32_mu(__VA_ARGS__)
+#define vmsgt_vx_i64m2_b32_m(...) __riscv_vmsgt_vx_i64m2_b32_mu(__VA_ARGS__)
+#define vmsgt_vv_i64m4_b16_m(...) __riscv_vmsgt_vv_i64m4_b16_mu(__VA_ARGS__)
+#define vmsgt_vx_i64m4_b16_m(...) __riscv_vmsgt_vx_i64m4_b16_mu(__VA_ARGS__)
+#define vmsgt_vv_i64m8_b8_m(...) __riscv_vmsgt_vv_i64m8_b8_mu(__VA_ARGS__)
+#define vmsgt_vx_i64m8_b8_m(...) __riscv_vmsgt_vx_i64m8_b8_mu(__VA_ARGS__)
+#define vmsge_vv_i8mf8_b64_m(...) __riscv_vmsge_vv_i8mf8_b64_mu(__VA_ARGS__)
+#define vmsge_vx_i8mf8_b64_m(...) __riscv_vmsge_vx_i8mf8_b64_mu(__VA_ARGS__)
+#define vmsge_vv_i8mf4_b32_m(...) __riscv_vmsge_vv_i8mf4_b32_mu(__VA_ARGS__)
+#define vmsge_vx_i8mf4_b32_m(...) __riscv_vmsge_vx_i8mf4_b32_mu(__VA_ARGS__)
+#define vmsge_vv_i8mf2_b16_m(...) __riscv_vmsge_vv_i8mf2_b16_mu(__VA_ARGS__)
+#define vmsge_vx_i8mf2_b16_m(...) __riscv_vmsge_vx_i8mf2_b16_mu(__VA_ARGS__)
+#define vmsge_vv_i8m1_b8_m(...) __riscv_vmsge_vv_i8m1_b8_mu(__VA_ARGS__)
+#define vmsge_vx_i8m1_b8_m(...) __riscv_vmsge_vx_i8m1_b8_mu(__VA_ARGS__)
+#define vmsge_vv_i8m2_b4_m(...) __riscv_vmsge_vv_i8m2_b4_mu(__VA_ARGS__)
+#define vmsge_vx_i8m2_b4_m(...) __riscv_vmsge_vx_i8m2_b4_mu(__VA_ARGS__)
+#define vmsge_vv_i8m4_b2_m(...) __riscv_vmsge_vv_i8m4_b2_mu(__VA_ARGS__)
+#define vmsge_vx_i8m4_b2_m(...) __riscv_vmsge_vx_i8m4_b2_mu(__VA_ARGS__)
+#define vmsge_vv_i8m8_b1_m(...) __riscv_vmsge_vv_i8m8_b1_mu(__VA_ARGS__)
+#define vmsge_vx_i8m8_b1_m(...) __riscv_vmsge_vx_i8m8_b1_mu(__VA_ARGS__)
+#define vmsge_vv_i16mf4_b64_m(...) __riscv_vmsge_vv_i16mf4_b64_mu(__VA_ARGS__)
+#define vmsge_vx_i16mf4_b64_m(...) __riscv_vmsge_vx_i16mf4_b64_mu(__VA_ARGS__)
+#define vmsge_vv_i16mf2_b32_m(...) __riscv_vmsge_vv_i16mf2_b32_mu(__VA_ARGS__)
+#define vmsge_vx_i16mf2_b32_m(...) __riscv_vmsge_vx_i16mf2_b32_mu(__VA_ARGS__)
+#define vmsge_vv_i16m1_b16_m(...) __riscv_vmsge_vv_i16m1_b16_mu(__VA_ARGS__)
+#define vmsge_vx_i16m1_b16_m(...) __riscv_vmsge_vx_i16m1_b16_mu(__VA_ARGS__)
+#define vmsge_vv_i16m2_b8_m(...) __riscv_vmsge_vv_i16m2_b8_mu(__VA_ARGS__)
+#define vmsge_vx_i16m2_b8_m(...) __riscv_vmsge_vx_i16m2_b8_mu(__VA_ARGS__)
+#define vmsge_vv_i16m4_b4_m(...) __riscv_vmsge_vv_i16m4_b4_mu(__VA_ARGS__)
+#define vmsge_vx_i16m4_b4_m(...) __riscv_vmsge_vx_i16m4_b4_mu(__VA_ARGS__)
+#define vmsge_vv_i16m8_b2_m(...) __riscv_vmsge_vv_i16m8_b2_mu(__VA_ARGS__)
+#define vmsge_vx_i16m8_b2_m(...) __riscv_vmsge_vx_i16m8_b2_mu(__VA_ARGS__)
+#define vmsge_vv_i32mf2_b64_m(...) __riscv_vmsge_vv_i32mf2_b64_mu(__VA_ARGS__)
+#define vmsge_vx_i32mf2_b64_m(...) __riscv_vmsge_vx_i32mf2_b64_mu(__VA_ARGS__)
+#define vmsge_vv_i32m1_b32_m(...) __riscv_vmsge_vv_i32m1_b32_mu(__VA_ARGS__)
+#define vmsge_vx_i32m1_b32_m(...) __riscv_vmsge_vx_i32m1_b32_mu(__VA_ARGS__)
+#define vmsge_vv_i32m2_b16_m(...) __riscv_vmsge_vv_i32m2_b16_mu(__VA_ARGS__)
+#define vmsge_vx_i32m2_b16_m(...) __riscv_vmsge_vx_i32m2_b16_mu(__VA_ARGS__)
+#define vmsge_vv_i32m4_b8_m(...) __riscv_vmsge_vv_i32m4_b8_mu(__VA_ARGS__)
+#define vmsge_vx_i32m4_b8_m(...) __riscv_vmsge_vx_i32m4_b8_mu(__VA_ARGS__)
+#define vmsge_vv_i32m8_b4_m(...) __riscv_vmsge_vv_i32m8_b4_mu(__VA_ARGS__)
+#define vmsge_vx_i32m8_b4_m(...) __riscv_vmsge_vx_i32m8_b4_mu(__VA_ARGS__)
+#define vmsge_vv_i64m1_b64_m(...) __riscv_vmsge_vv_i64m1_b64_mu(__VA_ARGS__)
+#define vmsge_vx_i64m1_b64_m(...) __riscv_vmsge_vx_i64m1_b64_mu(__VA_ARGS__)
+#define vmsge_vv_i64m2_b32_m(...) __riscv_vmsge_vv_i64m2_b32_mu(__VA_ARGS__)
+#define vmsge_vx_i64m2_b32_m(...) __riscv_vmsge_vx_i64m2_b32_mu(__VA_ARGS__)
+#define vmsge_vv_i64m4_b16_m(...) __riscv_vmsge_vv_i64m4_b16_mu(__VA_ARGS__)
+#define vmsge_vx_i64m4_b16_m(...) __riscv_vmsge_vx_i64m4_b16_mu(__VA_ARGS__)
+#define vmsge_vv_i64m8_b8_m(...) __riscv_vmsge_vv_i64m8_b8_mu(__VA_ARGS__)
+#define vmsge_vx_i64m8_b8_m(...) __riscv_vmsge_vx_i64m8_b8_mu(__VA_ARGS__)
+#define vmseq_vv_u8mf8_b64_m(...) __riscv_vmseq_vv_u8mf8_b64_mu(__VA_ARGS__)
+#define vmseq_vx_u8mf8_b64_m(...) __riscv_vmseq_vx_u8mf8_b64_mu(__VA_ARGS__)
+#define vmseq_vv_u8mf4_b32_m(...) __riscv_vmseq_vv_u8mf4_b32_mu(__VA_ARGS__)
+#define vmseq_vx_u8mf4_b32_m(...) __riscv_vmseq_vx_u8mf4_b32_mu(__VA_ARGS__)
+#define vmseq_vv_u8mf2_b16_m(...) __riscv_vmseq_vv_u8mf2_b16_mu(__VA_ARGS__)
+#define vmseq_vx_u8mf2_b16_m(...) __riscv_vmseq_vx_u8mf2_b16_mu(__VA_ARGS__)
+#define vmseq_vv_u8m1_b8_m(...) __riscv_vmseq_vv_u8m1_b8_mu(__VA_ARGS__)
+#define vmseq_vx_u8m1_b8_m(...) __riscv_vmseq_vx_u8m1_b8_mu(__VA_ARGS__)
+#define vmseq_vv_u8m2_b4_m(...) __riscv_vmseq_vv_u8m2_b4_mu(__VA_ARGS__)
+#define vmseq_vx_u8m2_b4_m(...) __riscv_vmseq_vx_u8m2_b4_mu(__VA_ARGS__)
+#define vmseq_vv_u8m4_b2_m(...) __riscv_vmseq_vv_u8m4_b2_mu(__VA_ARGS__)
+#define vmseq_vx_u8m4_b2_m(...) __riscv_vmseq_vx_u8m4_b2_mu(__VA_ARGS__)
+#define vmseq_vv_u8m8_b1_m(...) __riscv_vmseq_vv_u8m8_b1_mu(__VA_ARGS__)
+#define vmseq_vx_u8m8_b1_m(...) __riscv_vmseq_vx_u8m8_b1_mu(__VA_ARGS__)
+#define vmseq_vv_u16mf4_b64_m(...) __riscv_vmseq_vv_u16mf4_b64_mu(__VA_ARGS__)
+#define vmseq_vx_u16mf4_b64_m(...) __riscv_vmseq_vx_u16mf4_b64_mu(__VA_ARGS__)
+#define vmseq_vv_u16mf2_b32_m(...) __riscv_vmseq_vv_u16mf2_b32_mu(__VA_ARGS__)
+#define vmseq_vx_u16mf2_b32_m(...) __riscv_vmseq_vx_u16mf2_b32_mu(__VA_ARGS__)
+#define vmseq_vv_u16m1_b16_m(...) __riscv_vmseq_vv_u16m1_b16_mu(__VA_ARGS__)
+#define vmseq_vx_u16m1_b16_m(...) __riscv_vmseq_vx_u16m1_b16_mu(__VA_ARGS__)
+#define vmseq_vv_u16m2_b8_m(...) __riscv_vmseq_vv_u16m2_b8_mu(__VA_ARGS__)
+#define vmseq_vx_u16m2_b8_m(...) __riscv_vmseq_vx_u16m2_b8_mu(__VA_ARGS__)
+#define vmseq_vv_u16m4_b4_m(...) __riscv_vmseq_vv_u16m4_b4_mu(__VA_ARGS__)
+#define vmseq_vx_u16m4_b4_m(...) __riscv_vmseq_vx_u16m4_b4_mu(__VA_ARGS__)
+#define vmseq_vv_u16m8_b2_m(...) __riscv_vmseq_vv_u16m8_b2_mu(__VA_ARGS__)
+#define vmseq_vx_u16m8_b2_m(...) __riscv_vmseq_vx_u16m8_b2_mu(__VA_ARGS__)
+#define vmseq_vv_u32mf2_b64_m(...) __riscv_vmseq_vv_u32mf2_b64_mu(__VA_ARGS__)
+#define vmseq_vx_u32mf2_b64_m(...) __riscv_vmseq_vx_u32mf2_b64_mu(__VA_ARGS__)
+#define vmseq_vv_u32m1_b32_m(...) __riscv_vmseq_vv_u32m1_b32_mu(__VA_ARGS__)
+#define vmseq_vx_u32m1_b32_m(...) __riscv_vmseq_vx_u32m1_b32_mu(__VA_ARGS__)
+#define vmseq_vv_u32m2_b16_m(...) __riscv_vmseq_vv_u32m2_b16_mu(__VA_ARGS__)
+#define vmseq_vx_u32m2_b16_m(...) __riscv_vmseq_vx_u32m2_b16_mu(__VA_ARGS__)
+#define vmseq_vv_u32m4_b8_m(...) __riscv_vmseq_vv_u32m4_b8_mu(__VA_ARGS__)
+#define vmseq_vx_u32m4_b8_m(...) __riscv_vmseq_vx_u32m4_b8_mu(__VA_ARGS__)
+#define vmseq_vv_u32m8_b4_m(...) __riscv_vmseq_vv_u32m8_b4_mu(__VA_ARGS__)
+#define vmseq_vx_u32m8_b4_m(...) __riscv_vmseq_vx_u32m8_b4_mu(__VA_ARGS__)
+#define vmseq_vv_u64m1_b64_m(...) __riscv_vmseq_vv_u64m1_b64_mu(__VA_ARGS__)
+#define vmseq_vx_u64m1_b64_m(...) __riscv_vmseq_vx_u64m1_b64_mu(__VA_ARGS__)
+#define vmseq_vv_u64m2_b32_m(...) __riscv_vmseq_vv_u64m2_b32_mu(__VA_ARGS__)
+#define vmseq_vx_u64m2_b32_m(...) __riscv_vmseq_vx_u64m2_b32_mu(__VA_ARGS__)
+#define vmseq_vv_u64m4_b16_m(...) __riscv_vmseq_vv_u64m4_b16_mu(__VA_ARGS__)
+#define vmseq_vx_u64m4_b16_m(...) __riscv_vmseq_vx_u64m4_b16_mu(__VA_ARGS__)
+#define vmseq_vv_u64m8_b8_m(...) __riscv_vmseq_vv_u64m8_b8_mu(__VA_ARGS__)
+#define vmseq_vx_u64m8_b8_m(...) __riscv_vmseq_vx_u64m8_b8_mu(__VA_ARGS__)
+#define vmsne_vv_u8mf8_b64_m(...) __riscv_vmsne_vv_u8mf8_b64_mu(__VA_ARGS__) // masked vmsne (u8mf8..u64m8, vv/vx): each legacy *_m name forwards all arguments to the __riscv_*_mu intrinsic
+#define vmsne_vx_u8mf8_b64_m(...) __riscv_vmsne_vx_u8mf8_b64_mu(__VA_ARGS__)
+#define vmsne_vv_u8mf4_b32_m(...) __riscv_vmsne_vv_u8mf4_b32_mu(__VA_ARGS__)
+#define vmsne_vx_u8mf4_b32_m(...) __riscv_vmsne_vx_u8mf4_b32_mu(__VA_ARGS__)
+#define vmsne_vv_u8mf2_b16_m(...) __riscv_vmsne_vv_u8mf2_b16_mu(__VA_ARGS__)
+#define vmsne_vx_u8mf2_b16_m(...) __riscv_vmsne_vx_u8mf2_b16_mu(__VA_ARGS__)
+#define vmsne_vv_u8m1_b8_m(...) __riscv_vmsne_vv_u8m1_b8_mu(__VA_ARGS__)
+#define vmsne_vx_u8m1_b8_m(...) __riscv_vmsne_vx_u8m1_b8_mu(__VA_ARGS__)
+#define vmsne_vv_u8m2_b4_m(...) __riscv_vmsne_vv_u8m2_b4_mu(__VA_ARGS__)
+#define vmsne_vx_u8m2_b4_m(...) __riscv_vmsne_vx_u8m2_b4_mu(__VA_ARGS__)
+#define vmsne_vv_u8m4_b2_m(...) __riscv_vmsne_vv_u8m4_b2_mu(__VA_ARGS__)
+#define vmsne_vx_u8m4_b2_m(...) __riscv_vmsne_vx_u8m4_b2_mu(__VA_ARGS__)
+#define vmsne_vv_u8m8_b1_m(...) __riscv_vmsne_vv_u8m8_b1_mu(__VA_ARGS__)
+#define vmsne_vx_u8m8_b1_m(...) __riscv_vmsne_vx_u8m8_b1_mu(__VA_ARGS__)
+#define vmsne_vv_u16mf4_b64_m(...) __riscv_vmsne_vv_u16mf4_b64_mu(__VA_ARGS__)
+#define vmsne_vx_u16mf4_b64_m(...) __riscv_vmsne_vx_u16mf4_b64_mu(__VA_ARGS__)
+#define vmsne_vv_u16mf2_b32_m(...) __riscv_vmsne_vv_u16mf2_b32_mu(__VA_ARGS__)
+#define vmsne_vx_u16mf2_b32_m(...) __riscv_vmsne_vx_u16mf2_b32_mu(__VA_ARGS__)
+#define vmsne_vv_u16m1_b16_m(...) __riscv_vmsne_vv_u16m1_b16_mu(__VA_ARGS__)
+#define vmsne_vx_u16m1_b16_m(...) __riscv_vmsne_vx_u16m1_b16_mu(__VA_ARGS__)
+#define vmsne_vv_u16m2_b8_m(...) __riscv_vmsne_vv_u16m2_b8_mu(__VA_ARGS__)
+#define vmsne_vx_u16m2_b8_m(...) __riscv_vmsne_vx_u16m2_b8_mu(__VA_ARGS__)
+#define vmsne_vv_u16m4_b4_m(...) __riscv_vmsne_vv_u16m4_b4_mu(__VA_ARGS__)
+#define vmsne_vx_u16m4_b4_m(...) __riscv_vmsne_vx_u16m4_b4_mu(__VA_ARGS__)
+#define vmsne_vv_u16m8_b2_m(...) __riscv_vmsne_vv_u16m8_b2_mu(__VA_ARGS__)
+#define vmsne_vx_u16m8_b2_m(...) __riscv_vmsne_vx_u16m8_b2_mu(__VA_ARGS__)
+#define vmsne_vv_u32mf2_b64_m(...) __riscv_vmsne_vv_u32mf2_b64_mu(__VA_ARGS__)
+#define vmsne_vx_u32mf2_b64_m(...) __riscv_vmsne_vx_u32mf2_b64_mu(__VA_ARGS__)
+#define vmsne_vv_u32m1_b32_m(...) __riscv_vmsne_vv_u32m1_b32_mu(__VA_ARGS__)
+#define vmsne_vx_u32m1_b32_m(...) __riscv_vmsne_vx_u32m1_b32_mu(__VA_ARGS__)
+#define vmsne_vv_u32m2_b16_m(...) __riscv_vmsne_vv_u32m2_b16_mu(__VA_ARGS__)
+#define vmsne_vx_u32m2_b16_m(...) __riscv_vmsne_vx_u32m2_b16_mu(__VA_ARGS__)
+#define vmsne_vv_u32m4_b8_m(...) __riscv_vmsne_vv_u32m4_b8_mu(__VA_ARGS__)
+#define vmsne_vx_u32m4_b8_m(...) __riscv_vmsne_vx_u32m4_b8_mu(__VA_ARGS__)
+#define vmsne_vv_u32m8_b4_m(...) __riscv_vmsne_vv_u32m8_b4_mu(__VA_ARGS__)
+#define vmsne_vx_u32m8_b4_m(...) __riscv_vmsne_vx_u32m8_b4_mu(__VA_ARGS__)
+#define vmsne_vv_u64m1_b64_m(...) __riscv_vmsne_vv_u64m1_b64_mu(__VA_ARGS__)
+#define vmsne_vx_u64m1_b64_m(...) __riscv_vmsne_vx_u64m1_b64_mu(__VA_ARGS__)
+#define vmsne_vv_u64m2_b32_m(...) __riscv_vmsne_vv_u64m2_b32_mu(__VA_ARGS__)
+#define vmsne_vx_u64m2_b32_m(...) __riscv_vmsne_vx_u64m2_b32_mu(__VA_ARGS__)
+#define vmsne_vv_u64m4_b16_m(...) __riscv_vmsne_vv_u64m4_b16_mu(__VA_ARGS__)
+#define vmsne_vx_u64m4_b16_m(...) __riscv_vmsne_vx_u64m4_b16_mu(__VA_ARGS__)
+#define vmsne_vv_u64m8_b8_m(...) __riscv_vmsne_vv_u64m8_b8_mu(__VA_ARGS__)
+#define vmsne_vx_u64m8_b8_m(...) __riscv_vmsne_vx_u64m8_b8_mu(__VA_ARGS__)
+#define vmsltu_vv_u8mf8_b64_m(...) __riscv_vmsltu_vv_u8mf8_b64_mu(__VA_ARGS__) // masked vmsltu (u8mf8..u64m8, vv/vx): each legacy *_m name forwards all arguments to the __riscv_*_mu intrinsic
+#define vmsltu_vx_u8mf8_b64_m(...) __riscv_vmsltu_vx_u8mf8_b64_mu(__VA_ARGS__)
+#define vmsltu_vv_u8mf4_b32_m(...) __riscv_vmsltu_vv_u8mf4_b32_mu(__VA_ARGS__)
+#define vmsltu_vx_u8mf4_b32_m(...) __riscv_vmsltu_vx_u8mf4_b32_mu(__VA_ARGS__)
+#define vmsltu_vv_u8mf2_b16_m(...) __riscv_vmsltu_vv_u8mf2_b16_mu(__VA_ARGS__)
+#define vmsltu_vx_u8mf2_b16_m(...) __riscv_vmsltu_vx_u8mf2_b16_mu(__VA_ARGS__)
+#define vmsltu_vv_u8m1_b8_m(...) __riscv_vmsltu_vv_u8m1_b8_mu(__VA_ARGS__)
+#define vmsltu_vx_u8m1_b8_m(...) __riscv_vmsltu_vx_u8m1_b8_mu(__VA_ARGS__)
+#define vmsltu_vv_u8m2_b4_m(...) __riscv_vmsltu_vv_u8m2_b4_mu(__VA_ARGS__)
+#define vmsltu_vx_u8m2_b4_m(...) __riscv_vmsltu_vx_u8m2_b4_mu(__VA_ARGS__)
+#define vmsltu_vv_u8m4_b2_m(...) __riscv_vmsltu_vv_u8m4_b2_mu(__VA_ARGS__)
+#define vmsltu_vx_u8m4_b2_m(...) __riscv_vmsltu_vx_u8m4_b2_mu(__VA_ARGS__)
+#define vmsltu_vv_u8m8_b1_m(...) __riscv_vmsltu_vv_u8m8_b1_mu(__VA_ARGS__)
+#define vmsltu_vx_u8m8_b1_m(...) __riscv_vmsltu_vx_u8m8_b1_mu(__VA_ARGS__)
+#define vmsltu_vv_u16mf4_b64_m(...) __riscv_vmsltu_vv_u16mf4_b64_mu(__VA_ARGS__)
+#define vmsltu_vx_u16mf4_b64_m(...) __riscv_vmsltu_vx_u16mf4_b64_mu(__VA_ARGS__)
+#define vmsltu_vv_u16mf2_b32_m(...) __riscv_vmsltu_vv_u16mf2_b32_mu(__VA_ARGS__)
+#define vmsltu_vx_u16mf2_b32_m(...) __riscv_vmsltu_vx_u16mf2_b32_mu(__VA_ARGS__)
+#define vmsltu_vv_u16m1_b16_m(...) __riscv_vmsltu_vv_u16m1_b16_mu(__VA_ARGS__)
+#define vmsltu_vx_u16m1_b16_m(...) __riscv_vmsltu_vx_u16m1_b16_mu(__VA_ARGS__)
+#define vmsltu_vv_u16m2_b8_m(...) __riscv_vmsltu_vv_u16m2_b8_mu(__VA_ARGS__)
+#define vmsltu_vx_u16m2_b8_m(...) __riscv_vmsltu_vx_u16m2_b8_mu(__VA_ARGS__)
+#define vmsltu_vv_u16m4_b4_m(...) __riscv_vmsltu_vv_u16m4_b4_mu(__VA_ARGS__)
+#define vmsltu_vx_u16m4_b4_m(...) __riscv_vmsltu_vx_u16m4_b4_mu(__VA_ARGS__)
+#define vmsltu_vv_u16m8_b2_m(...) __riscv_vmsltu_vv_u16m8_b2_mu(__VA_ARGS__)
+#define vmsltu_vx_u16m8_b2_m(...) __riscv_vmsltu_vx_u16m8_b2_mu(__VA_ARGS__)
+#define vmsltu_vv_u32mf2_b64_m(...) __riscv_vmsltu_vv_u32mf2_b64_mu(__VA_ARGS__)
+#define vmsltu_vx_u32mf2_b64_m(...) __riscv_vmsltu_vx_u32mf2_b64_mu(__VA_ARGS__)
+#define vmsltu_vv_u32m1_b32_m(...) __riscv_vmsltu_vv_u32m1_b32_mu(__VA_ARGS__)
+#define vmsltu_vx_u32m1_b32_m(...) __riscv_vmsltu_vx_u32m1_b32_mu(__VA_ARGS__)
+#define vmsltu_vv_u32m2_b16_m(...) __riscv_vmsltu_vv_u32m2_b16_mu(__VA_ARGS__)
+#define vmsltu_vx_u32m2_b16_m(...) __riscv_vmsltu_vx_u32m2_b16_mu(__VA_ARGS__)
+#define vmsltu_vv_u32m4_b8_m(...) __riscv_vmsltu_vv_u32m4_b8_mu(__VA_ARGS__)
+#define vmsltu_vx_u32m4_b8_m(...) __riscv_vmsltu_vx_u32m4_b8_mu(__VA_ARGS__)
+#define vmsltu_vv_u32m8_b4_m(...) __riscv_vmsltu_vv_u32m8_b4_mu(__VA_ARGS__)
+#define vmsltu_vx_u32m8_b4_m(...) __riscv_vmsltu_vx_u32m8_b4_mu(__VA_ARGS__)
+#define vmsltu_vv_u64m1_b64_m(...) __riscv_vmsltu_vv_u64m1_b64_mu(__VA_ARGS__)
+#define vmsltu_vx_u64m1_b64_m(...) __riscv_vmsltu_vx_u64m1_b64_mu(__VA_ARGS__)
+#define vmsltu_vv_u64m2_b32_m(...) __riscv_vmsltu_vv_u64m2_b32_mu(__VA_ARGS__)
+#define vmsltu_vx_u64m2_b32_m(...) __riscv_vmsltu_vx_u64m2_b32_mu(__VA_ARGS__)
+#define vmsltu_vv_u64m4_b16_m(...) __riscv_vmsltu_vv_u64m4_b16_mu(__VA_ARGS__)
+#define vmsltu_vx_u64m4_b16_m(...) __riscv_vmsltu_vx_u64m4_b16_mu(__VA_ARGS__)
+#define vmsltu_vv_u64m8_b8_m(...) __riscv_vmsltu_vv_u64m8_b8_mu(__VA_ARGS__)
+#define vmsltu_vx_u64m8_b8_m(...) __riscv_vmsltu_vx_u64m8_b8_mu(__VA_ARGS__)
+#define vmsleu_vv_u8mf8_b64_m(...) __riscv_vmsleu_vv_u8mf8_b64_mu(__VA_ARGS__) // masked vmsleu (u8mf8..u64m8, vv/vx): each legacy *_m name forwards all arguments to the __riscv_*_mu intrinsic
+#define vmsleu_vx_u8mf8_b64_m(...) __riscv_vmsleu_vx_u8mf8_b64_mu(__VA_ARGS__)
+#define vmsleu_vv_u8mf4_b32_m(...) __riscv_vmsleu_vv_u8mf4_b32_mu(__VA_ARGS__)
+#define vmsleu_vx_u8mf4_b32_m(...) __riscv_vmsleu_vx_u8mf4_b32_mu(__VA_ARGS__)
+#define vmsleu_vv_u8mf2_b16_m(...) __riscv_vmsleu_vv_u8mf2_b16_mu(__VA_ARGS__)
+#define vmsleu_vx_u8mf2_b16_m(...) __riscv_vmsleu_vx_u8mf2_b16_mu(__VA_ARGS__)
+#define vmsleu_vv_u8m1_b8_m(...) __riscv_vmsleu_vv_u8m1_b8_mu(__VA_ARGS__)
+#define vmsleu_vx_u8m1_b8_m(...) __riscv_vmsleu_vx_u8m1_b8_mu(__VA_ARGS__)
+#define vmsleu_vv_u8m2_b4_m(...) __riscv_vmsleu_vv_u8m2_b4_mu(__VA_ARGS__)
+#define vmsleu_vx_u8m2_b4_m(...) __riscv_vmsleu_vx_u8m2_b4_mu(__VA_ARGS__)
+#define vmsleu_vv_u8m4_b2_m(...) __riscv_vmsleu_vv_u8m4_b2_mu(__VA_ARGS__)
+#define vmsleu_vx_u8m4_b2_m(...) __riscv_vmsleu_vx_u8m4_b2_mu(__VA_ARGS__)
+#define vmsleu_vv_u8m8_b1_m(...) __riscv_vmsleu_vv_u8m8_b1_mu(__VA_ARGS__)
+#define vmsleu_vx_u8m8_b1_m(...) __riscv_vmsleu_vx_u8m8_b1_mu(__VA_ARGS__)
+#define vmsleu_vv_u16mf4_b64_m(...) __riscv_vmsleu_vv_u16mf4_b64_mu(__VA_ARGS__)
+#define vmsleu_vx_u16mf4_b64_m(...) __riscv_vmsleu_vx_u16mf4_b64_mu(__VA_ARGS__)
+#define vmsleu_vv_u16mf2_b32_m(...) __riscv_vmsleu_vv_u16mf2_b32_mu(__VA_ARGS__)
+#define vmsleu_vx_u16mf2_b32_m(...) __riscv_vmsleu_vx_u16mf2_b32_mu(__VA_ARGS__)
+#define vmsleu_vv_u16m1_b16_m(...) __riscv_vmsleu_vv_u16m1_b16_mu(__VA_ARGS__)
+#define vmsleu_vx_u16m1_b16_m(...) __riscv_vmsleu_vx_u16m1_b16_mu(__VA_ARGS__)
+#define vmsleu_vv_u16m2_b8_m(...) __riscv_vmsleu_vv_u16m2_b8_mu(__VA_ARGS__)
+#define vmsleu_vx_u16m2_b8_m(...) __riscv_vmsleu_vx_u16m2_b8_mu(__VA_ARGS__)
+#define vmsleu_vv_u16m4_b4_m(...) __riscv_vmsleu_vv_u16m4_b4_mu(__VA_ARGS__)
+#define vmsleu_vx_u16m4_b4_m(...) __riscv_vmsleu_vx_u16m4_b4_mu(__VA_ARGS__)
+#define vmsleu_vv_u16m8_b2_m(...) __riscv_vmsleu_vv_u16m8_b2_mu(__VA_ARGS__)
+#define vmsleu_vx_u16m8_b2_m(...) __riscv_vmsleu_vx_u16m8_b2_mu(__VA_ARGS__)
+#define vmsleu_vv_u32mf2_b64_m(...) __riscv_vmsleu_vv_u32mf2_b64_mu(__VA_ARGS__)
+#define vmsleu_vx_u32mf2_b64_m(...) __riscv_vmsleu_vx_u32mf2_b64_mu(__VA_ARGS__)
+#define vmsleu_vv_u32m1_b32_m(...) __riscv_vmsleu_vv_u32m1_b32_mu(__VA_ARGS__)
+#define vmsleu_vx_u32m1_b32_m(...) __riscv_vmsleu_vx_u32m1_b32_mu(__VA_ARGS__)
+#define vmsleu_vv_u32m2_b16_m(...) __riscv_vmsleu_vv_u32m2_b16_mu(__VA_ARGS__)
+#define vmsleu_vx_u32m2_b16_m(...) __riscv_vmsleu_vx_u32m2_b16_mu(__VA_ARGS__)
+#define vmsleu_vv_u32m4_b8_m(...) __riscv_vmsleu_vv_u32m4_b8_mu(__VA_ARGS__)
+#define vmsleu_vx_u32m4_b8_m(...) __riscv_vmsleu_vx_u32m4_b8_mu(__VA_ARGS__)
+#define vmsleu_vv_u32m8_b4_m(...) __riscv_vmsleu_vv_u32m8_b4_mu(__VA_ARGS__)
+#define vmsleu_vx_u32m8_b4_m(...) __riscv_vmsleu_vx_u32m8_b4_mu(__VA_ARGS__)
+#define vmsleu_vv_u64m1_b64_m(...) __riscv_vmsleu_vv_u64m1_b64_mu(__VA_ARGS__)
+#define vmsleu_vx_u64m1_b64_m(...) __riscv_vmsleu_vx_u64m1_b64_mu(__VA_ARGS__)
+#define vmsleu_vv_u64m2_b32_m(...) __riscv_vmsleu_vv_u64m2_b32_mu(__VA_ARGS__)
+#define vmsleu_vx_u64m2_b32_m(...) __riscv_vmsleu_vx_u64m2_b32_mu(__VA_ARGS__)
+#define vmsleu_vv_u64m4_b16_m(...) __riscv_vmsleu_vv_u64m4_b16_mu(__VA_ARGS__)
+#define vmsleu_vx_u64m4_b16_m(...) __riscv_vmsleu_vx_u64m4_b16_mu(__VA_ARGS__)
+#define vmsleu_vv_u64m8_b8_m(...) __riscv_vmsleu_vv_u64m8_b8_mu(__VA_ARGS__)
+#define vmsleu_vx_u64m8_b8_m(...) __riscv_vmsleu_vx_u64m8_b8_mu(__VA_ARGS__)
+#define vmsgtu_vv_u8mf8_b64_m(...) __riscv_vmsgtu_vv_u8mf8_b64_mu(__VA_ARGS__) // masked vmsgtu (u8mf8..u64m8, vv/vx): each legacy *_m name forwards all arguments to the __riscv_*_mu intrinsic
+#define vmsgtu_vx_u8mf8_b64_m(...) __riscv_vmsgtu_vx_u8mf8_b64_mu(__VA_ARGS__)
+#define vmsgtu_vv_u8mf4_b32_m(...) __riscv_vmsgtu_vv_u8mf4_b32_mu(__VA_ARGS__)
+#define vmsgtu_vx_u8mf4_b32_m(...) __riscv_vmsgtu_vx_u8mf4_b32_mu(__VA_ARGS__)
+#define vmsgtu_vv_u8mf2_b16_m(...) __riscv_vmsgtu_vv_u8mf2_b16_mu(__VA_ARGS__)
+#define vmsgtu_vx_u8mf2_b16_m(...) __riscv_vmsgtu_vx_u8mf2_b16_mu(__VA_ARGS__)
+#define vmsgtu_vv_u8m1_b8_m(...) __riscv_vmsgtu_vv_u8m1_b8_mu(__VA_ARGS__)
+#define vmsgtu_vx_u8m1_b8_m(...) __riscv_vmsgtu_vx_u8m1_b8_mu(__VA_ARGS__)
+#define vmsgtu_vv_u8m2_b4_m(...) __riscv_vmsgtu_vv_u8m2_b4_mu(__VA_ARGS__)
+#define vmsgtu_vx_u8m2_b4_m(...) __riscv_vmsgtu_vx_u8m2_b4_mu(__VA_ARGS__)
+#define vmsgtu_vv_u8m4_b2_m(...) __riscv_vmsgtu_vv_u8m4_b2_mu(__VA_ARGS__)
+#define vmsgtu_vx_u8m4_b2_m(...) __riscv_vmsgtu_vx_u8m4_b2_mu(__VA_ARGS__)
+#define vmsgtu_vv_u8m8_b1_m(...) __riscv_vmsgtu_vv_u8m8_b1_mu(__VA_ARGS__)
+#define vmsgtu_vx_u8m8_b1_m(...) __riscv_vmsgtu_vx_u8m8_b1_mu(__VA_ARGS__)
+#define vmsgtu_vv_u16mf4_b64_m(...) __riscv_vmsgtu_vv_u16mf4_b64_mu(__VA_ARGS__)
+#define vmsgtu_vx_u16mf4_b64_m(...) __riscv_vmsgtu_vx_u16mf4_b64_mu(__VA_ARGS__)
+#define vmsgtu_vv_u16mf2_b32_m(...) __riscv_vmsgtu_vv_u16mf2_b32_mu(__VA_ARGS__)
+#define vmsgtu_vx_u16mf2_b32_m(...) __riscv_vmsgtu_vx_u16mf2_b32_mu(__VA_ARGS__)
+#define vmsgtu_vv_u16m1_b16_m(...) __riscv_vmsgtu_vv_u16m1_b16_mu(__VA_ARGS__)
+#define vmsgtu_vx_u16m1_b16_m(...) __riscv_vmsgtu_vx_u16m1_b16_mu(__VA_ARGS__)
+#define vmsgtu_vv_u16m2_b8_m(...) __riscv_vmsgtu_vv_u16m2_b8_mu(__VA_ARGS__)
+#define vmsgtu_vx_u16m2_b8_m(...) __riscv_vmsgtu_vx_u16m2_b8_mu(__VA_ARGS__)
+#define vmsgtu_vv_u16m4_b4_m(...) __riscv_vmsgtu_vv_u16m4_b4_mu(__VA_ARGS__)
+#define vmsgtu_vx_u16m4_b4_m(...) __riscv_vmsgtu_vx_u16m4_b4_mu(__VA_ARGS__)
+#define vmsgtu_vv_u16m8_b2_m(...) __riscv_vmsgtu_vv_u16m8_b2_mu(__VA_ARGS__)
+#define vmsgtu_vx_u16m8_b2_m(...) __riscv_vmsgtu_vx_u16m8_b2_mu(__VA_ARGS__)
+#define vmsgtu_vv_u32mf2_b64_m(...) __riscv_vmsgtu_vv_u32mf2_b64_mu(__VA_ARGS__)
+#define vmsgtu_vx_u32mf2_b64_m(...) __riscv_vmsgtu_vx_u32mf2_b64_mu(__VA_ARGS__)
+#define vmsgtu_vv_u32m1_b32_m(...) __riscv_vmsgtu_vv_u32m1_b32_mu(__VA_ARGS__)
+#define vmsgtu_vx_u32m1_b32_m(...) __riscv_vmsgtu_vx_u32m1_b32_mu(__VA_ARGS__)
+#define vmsgtu_vv_u32m2_b16_m(...) __riscv_vmsgtu_vv_u32m2_b16_mu(__VA_ARGS__)
+#define vmsgtu_vx_u32m2_b16_m(...) __riscv_vmsgtu_vx_u32m2_b16_mu(__VA_ARGS__)
+#define vmsgtu_vv_u32m4_b8_m(...) __riscv_vmsgtu_vv_u32m4_b8_mu(__VA_ARGS__)
+#define vmsgtu_vx_u32m4_b8_m(...) __riscv_vmsgtu_vx_u32m4_b8_mu(__VA_ARGS__)
+#define vmsgtu_vv_u32m8_b4_m(...) __riscv_vmsgtu_vv_u32m8_b4_mu(__VA_ARGS__)
+#define vmsgtu_vx_u32m8_b4_m(...) __riscv_vmsgtu_vx_u32m8_b4_mu(__VA_ARGS__)
+#define vmsgtu_vv_u64m1_b64_m(...) __riscv_vmsgtu_vv_u64m1_b64_mu(__VA_ARGS__)
+#define vmsgtu_vx_u64m1_b64_m(...) __riscv_vmsgtu_vx_u64m1_b64_mu(__VA_ARGS__)
+#define vmsgtu_vv_u64m2_b32_m(...) __riscv_vmsgtu_vv_u64m2_b32_mu(__VA_ARGS__)
+#define vmsgtu_vx_u64m2_b32_m(...) __riscv_vmsgtu_vx_u64m2_b32_mu(__VA_ARGS__)
+#define vmsgtu_vv_u64m4_b16_m(...) __riscv_vmsgtu_vv_u64m4_b16_mu(__VA_ARGS__)
+#define vmsgtu_vx_u64m4_b16_m(...) __riscv_vmsgtu_vx_u64m4_b16_mu(__VA_ARGS__)
+#define vmsgtu_vv_u64m8_b8_m(...) __riscv_vmsgtu_vv_u64m8_b8_mu(__VA_ARGS__)
+#define vmsgtu_vx_u64m8_b8_m(...) __riscv_vmsgtu_vx_u64m8_b8_mu(__VA_ARGS__)
+#define vmsgeu_vv_u8mf8_b64_m(...) __riscv_vmsgeu_vv_u8mf8_b64_mu(__VA_ARGS__) // masked vmsgeu (u8mf8..u64m8, vv/vx): each legacy *_m name forwards all arguments to the __riscv_*_mu intrinsic
+#define vmsgeu_vx_u8mf8_b64_m(...) __riscv_vmsgeu_vx_u8mf8_b64_mu(__VA_ARGS__)
+#define vmsgeu_vv_u8mf4_b32_m(...) __riscv_vmsgeu_vv_u8mf4_b32_mu(__VA_ARGS__)
+#define vmsgeu_vx_u8mf4_b32_m(...) __riscv_vmsgeu_vx_u8mf4_b32_mu(__VA_ARGS__)
+#define vmsgeu_vv_u8mf2_b16_m(...) __riscv_vmsgeu_vv_u8mf2_b16_mu(__VA_ARGS__)
+#define vmsgeu_vx_u8mf2_b16_m(...) __riscv_vmsgeu_vx_u8mf2_b16_mu(__VA_ARGS__)
+#define vmsgeu_vv_u8m1_b8_m(...) __riscv_vmsgeu_vv_u8m1_b8_mu(__VA_ARGS__)
+#define vmsgeu_vx_u8m1_b8_m(...) __riscv_vmsgeu_vx_u8m1_b8_mu(__VA_ARGS__)
+#define vmsgeu_vv_u8m2_b4_m(...) __riscv_vmsgeu_vv_u8m2_b4_mu(__VA_ARGS__)
+#define vmsgeu_vx_u8m2_b4_m(...) __riscv_vmsgeu_vx_u8m2_b4_mu(__VA_ARGS__)
+#define vmsgeu_vv_u8m4_b2_m(...) __riscv_vmsgeu_vv_u8m4_b2_mu(__VA_ARGS__)
+#define vmsgeu_vx_u8m4_b2_m(...) __riscv_vmsgeu_vx_u8m4_b2_mu(__VA_ARGS__)
+#define vmsgeu_vv_u8m8_b1_m(...) __riscv_vmsgeu_vv_u8m8_b1_mu(__VA_ARGS__)
+#define vmsgeu_vx_u8m8_b1_m(...) __riscv_vmsgeu_vx_u8m8_b1_mu(__VA_ARGS__)
+#define vmsgeu_vv_u16mf4_b64_m(...) __riscv_vmsgeu_vv_u16mf4_b64_mu(__VA_ARGS__)
+#define vmsgeu_vx_u16mf4_b64_m(...) __riscv_vmsgeu_vx_u16mf4_b64_mu(__VA_ARGS__)
+#define vmsgeu_vv_u16mf2_b32_m(...) __riscv_vmsgeu_vv_u16mf2_b32_mu(__VA_ARGS__)
+#define vmsgeu_vx_u16mf2_b32_m(...) __riscv_vmsgeu_vx_u16mf2_b32_mu(__VA_ARGS__)
+#define vmsgeu_vv_u16m1_b16_m(...) __riscv_vmsgeu_vv_u16m1_b16_mu(__VA_ARGS__)
+#define vmsgeu_vx_u16m1_b16_m(...) __riscv_vmsgeu_vx_u16m1_b16_mu(__VA_ARGS__)
+#define vmsgeu_vv_u16m2_b8_m(...) __riscv_vmsgeu_vv_u16m2_b8_mu(__VA_ARGS__)
+#define vmsgeu_vx_u16m2_b8_m(...) __riscv_vmsgeu_vx_u16m2_b8_mu(__VA_ARGS__)
+#define vmsgeu_vv_u16m4_b4_m(...) __riscv_vmsgeu_vv_u16m4_b4_mu(__VA_ARGS__)
+#define vmsgeu_vx_u16m4_b4_m(...) __riscv_vmsgeu_vx_u16m4_b4_mu(__VA_ARGS__)
+#define vmsgeu_vv_u16m8_b2_m(...) __riscv_vmsgeu_vv_u16m8_b2_mu(__VA_ARGS__)
+#define vmsgeu_vx_u16m8_b2_m(...) __riscv_vmsgeu_vx_u16m8_b2_mu(__VA_ARGS__)
+#define vmsgeu_vv_u32mf2_b64_m(...) __riscv_vmsgeu_vv_u32mf2_b64_mu(__VA_ARGS__)
+#define vmsgeu_vx_u32mf2_b64_m(...) __riscv_vmsgeu_vx_u32mf2_b64_mu(__VA_ARGS__)
+#define vmsgeu_vv_u32m1_b32_m(...) __riscv_vmsgeu_vv_u32m1_b32_mu(__VA_ARGS__)
+#define vmsgeu_vx_u32m1_b32_m(...) __riscv_vmsgeu_vx_u32m1_b32_mu(__VA_ARGS__)
+#define vmsgeu_vv_u32m2_b16_m(...) __riscv_vmsgeu_vv_u32m2_b16_mu(__VA_ARGS__)
+#define vmsgeu_vx_u32m2_b16_m(...) __riscv_vmsgeu_vx_u32m2_b16_mu(__VA_ARGS__)
+#define vmsgeu_vv_u32m4_b8_m(...) __riscv_vmsgeu_vv_u32m4_b8_mu(__VA_ARGS__)
+#define vmsgeu_vx_u32m4_b8_m(...) __riscv_vmsgeu_vx_u32m4_b8_mu(__VA_ARGS__)
+#define vmsgeu_vv_u32m8_b4_m(...) __riscv_vmsgeu_vv_u32m8_b4_mu(__VA_ARGS__)
+#define vmsgeu_vx_u32m8_b4_m(...) __riscv_vmsgeu_vx_u32m8_b4_mu(__VA_ARGS__)
+#define vmsgeu_vv_u64m1_b64_m(...) __riscv_vmsgeu_vv_u64m1_b64_mu(__VA_ARGS__)
+#define vmsgeu_vx_u64m1_b64_m(...) __riscv_vmsgeu_vx_u64m1_b64_mu(__VA_ARGS__)
+#define vmsgeu_vv_u64m2_b32_m(...) __riscv_vmsgeu_vv_u64m2_b32_mu(__VA_ARGS__)
+#define vmsgeu_vx_u64m2_b32_m(...) __riscv_vmsgeu_vx_u64m2_b32_mu(__VA_ARGS__)
+#define vmsgeu_vv_u64m4_b16_m(...) __riscv_vmsgeu_vv_u64m4_b16_mu(__VA_ARGS__)
+#define vmsgeu_vx_u64m4_b16_m(...) __riscv_vmsgeu_vx_u64m4_b16_mu(__VA_ARGS__)
+#define vmsgeu_vv_u64m8_b8_m(...) __riscv_vmsgeu_vv_u64m8_b8_mu(__VA_ARGS__)
+#define vmsgeu_vx_u64m8_b8_m(...) __riscv_vmsgeu_vx_u64m8_b8_mu(__VA_ARGS__)
+#define vmin_vv_i8mf8(...) __riscv_vmin_vv_i8mf8(__VA_ARGS__) // vmin (i8mf8..i64m8, vv/vx): each legacy unprefixed name forwards all arguments to the same-named __riscv_ intrinsic
+#define vmin_vx_i8mf8(...) __riscv_vmin_vx_i8mf8(__VA_ARGS__)
+#define vmin_vv_i8mf4(...) __riscv_vmin_vv_i8mf4(__VA_ARGS__)
+#define vmin_vx_i8mf4(...) __riscv_vmin_vx_i8mf4(__VA_ARGS__)
+#define vmin_vv_i8mf2(...) __riscv_vmin_vv_i8mf2(__VA_ARGS__)
+#define vmin_vx_i8mf2(...) __riscv_vmin_vx_i8mf2(__VA_ARGS__)
+#define vmin_vv_i8m1(...) __riscv_vmin_vv_i8m1(__VA_ARGS__)
+#define vmin_vx_i8m1(...) __riscv_vmin_vx_i8m1(__VA_ARGS__)
+#define vmin_vv_i8m2(...) __riscv_vmin_vv_i8m2(__VA_ARGS__)
+#define vmin_vx_i8m2(...) __riscv_vmin_vx_i8m2(__VA_ARGS__)
+#define vmin_vv_i8m4(...) __riscv_vmin_vv_i8m4(__VA_ARGS__)
+#define vmin_vx_i8m4(...) __riscv_vmin_vx_i8m4(__VA_ARGS__)
+#define vmin_vv_i8m8(...) __riscv_vmin_vv_i8m8(__VA_ARGS__)
+#define vmin_vx_i8m8(...) __riscv_vmin_vx_i8m8(__VA_ARGS__)
+#define vmin_vv_i16mf4(...) __riscv_vmin_vv_i16mf4(__VA_ARGS__)
+#define vmin_vx_i16mf4(...) __riscv_vmin_vx_i16mf4(__VA_ARGS__)
+#define vmin_vv_i16mf2(...) __riscv_vmin_vv_i16mf2(__VA_ARGS__)
+#define vmin_vx_i16mf2(...) __riscv_vmin_vx_i16mf2(__VA_ARGS__)
+#define vmin_vv_i16m1(...) __riscv_vmin_vv_i16m1(__VA_ARGS__)
+#define vmin_vx_i16m1(...) __riscv_vmin_vx_i16m1(__VA_ARGS__)
+#define vmin_vv_i16m2(...) __riscv_vmin_vv_i16m2(__VA_ARGS__)
+#define vmin_vx_i16m2(...) __riscv_vmin_vx_i16m2(__VA_ARGS__)
+#define vmin_vv_i16m4(...) __riscv_vmin_vv_i16m4(__VA_ARGS__)
+#define vmin_vx_i16m4(...) __riscv_vmin_vx_i16m4(__VA_ARGS__)
+#define vmin_vv_i16m8(...) __riscv_vmin_vv_i16m8(__VA_ARGS__)
+#define vmin_vx_i16m8(...) __riscv_vmin_vx_i16m8(__VA_ARGS__)
+#define vmin_vv_i32mf2(...) __riscv_vmin_vv_i32mf2(__VA_ARGS__)
+#define vmin_vx_i32mf2(...) __riscv_vmin_vx_i32mf2(__VA_ARGS__)
+#define vmin_vv_i32m1(...) __riscv_vmin_vv_i32m1(__VA_ARGS__)
+#define vmin_vx_i32m1(...) __riscv_vmin_vx_i32m1(__VA_ARGS__)
+#define vmin_vv_i32m2(...) __riscv_vmin_vv_i32m2(__VA_ARGS__)
+#define vmin_vx_i32m2(...) __riscv_vmin_vx_i32m2(__VA_ARGS__)
+#define vmin_vv_i32m4(...) __riscv_vmin_vv_i32m4(__VA_ARGS__)
+#define vmin_vx_i32m4(...) __riscv_vmin_vx_i32m4(__VA_ARGS__)
+#define vmin_vv_i32m8(...) __riscv_vmin_vv_i32m8(__VA_ARGS__)
+#define vmin_vx_i32m8(...) __riscv_vmin_vx_i32m8(__VA_ARGS__)
+#define vmin_vv_i64m1(...) __riscv_vmin_vv_i64m1(__VA_ARGS__)
+#define vmin_vx_i64m1(...) __riscv_vmin_vx_i64m1(__VA_ARGS__)
+#define vmin_vv_i64m2(...) __riscv_vmin_vv_i64m2(__VA_ARGS__)
+#define vmin_vx_i64m2(...) __riscv_vmin_vx_i64m2(__VA_ARGS__)
+#define vmin_vv_i64m4(...) __riscv_vmin_vv_i64m4(__VA_ARGS__)
+#define vmin_vx_i64m4(...) __riscv_vmin_vx_i64m4(__VA_ARGS__)
+#define vmin_vv_i64m8(...) __riscv_vmin_vv_i64m8(__VA_ARGS__)
+#define vmin_vx_i64m8(...) __riscv_vmin_vx_i64m8(__VA_ARGS__)
+#define vmax_vv_i8mf8(...) __riscv_vmax_vv_i8mf8(__VA_ARGS__) // vmax (i8mf8..i64m8, vv/vx): each legacy unprefixed name forwards all arguments to the same-named __riscv_ intrinsic
+#define vmax_vx_i8mf8(...) __riscv_vmax_vx_i8mf8(__VA_ARGS__)
+#define vmax_vv_i8mf4(...) __riscv_vmax_vv_i8mf4(__VA_ARGS__)
+#define vmax_vx_i8mf4(...) __riscv_vmax_vx_i8mf4(__VA_ARGS__)
+#define vmax_vv_i8mf2(...) __riscv_vmax_vv_i8mf2(__VA_ARGS__)
+#define vmax_vx_i8mf2(...) __riscv_vmax_vx_i8mf2(__VA_ARGS__)
+#define vmax_vv_i8m1(...) __riscv_vmax_vv_i8m1(__VA_ARGS__)
+#define vmax_vx_i8m1(...) __riscv_vmax_vx_i8m1(__VA_ARGS__)
+#define vmax_vv_i8m2(...) __riscv_vmax_vv_i8m2(__VA_ARGS__)
+#define vmax_vx_i8m2(...) __riscv_vmax_vx_i8m2(__VA_ARGS__)
+#define vmax_vv_i8m4(...) __riscv_vmax_vv_i8m4(__VA_ARGS__)
+#define vmax_vx_i8m4(...) __riscv_vmax_vx_i8m4(__VA_ARGS__)
+#define vmax_vv_i8m8(...) __riscv_vmax_vv_i8m8(__VA_ARGS__)
+#define vmax_vx_i8m8(...) __riscv_vmax_vx_i8m8(__VA_ARGS__)
+#define vmax_vv_i16mf4(...) __riscv_vmax_vv_i16mf4(__VA_ARGS__)
+#define vmax_vx_i16mf4(...) __riscv_vmax_vx_i16mf4(__VA_ARGS__)
+#define vmax_vv_i16mf2(...) __riscv_vmax_vv_i16mf2(__VA_ARGS__)
+#define vmax_vx_i16mf2(...) __riscv_vmax_vx_i16mf2(__VA_ARGS__)
+#define vmax_vv_i16m1(...) __riscv_vmax_vv_i16m1(__VA_ARGS__)
+#define vmax_vx_i16m1(...) __riscv_vmax_vx_i16m1(__VA_ARGS__)
+#define vmax_vv_i16m2(...) __riscv_vmax_vv_i16m2(__VA_ARGS__)
+#define vmax_vx_i16m2(...) __riscv_vmax_vx_i16m2(__VA_ARGS__)
+#define vmax_vv_i16m4(...) __riscv_vmax_vv_i16m4(__VA_ARGS__)
+#define vmax_vx_i16m4(...) __riscv_vmax_vx_i16m4(__VA_ARGS__)
+#define vmax_vv_i16m8(...) __riscv_vmax_vv_i16m8(__VA_ARGS__)
+#define vmax_vx_i16m8(...) __riscv_vmax_vx_i16m8(__VA_ARGS__)
+#define vmax_vv_i32mf2(...) __riscv_vmax_vv_i32mf2(__VA_ARGS__)
+#define vmax_vx_i32mf2(...) __riscv_vmax_vx_i32mf2(__VA_ARGS__)
+#define vmax_vv_i32m1(...) __riscv_vmax_vv_i32m1(__VA_ARGS__)
+#define vmax_vx_i32m1(...) __riscv_vmax_vx_i32m1(__VA_ARGS__)
+#define vmax_vv_i32m2(...) __riscv_vmax_vv_i32m2(__VA_ARGS__)
+#define vmax_vx_i32m2(...) __riscv_vmax_vx_i32m2(__VA_ARGS__)
+#define vmax_vv_i32m4(...) __riscv_vmax_vv_i32m4(__VA_ARGS__)
+#define vmax_vx_i32m4(...) __riscv_vmax_vx_i32m4(__VA_ARGS__)
+#define vmax_vv_i32m8(...) __riscv_vmax_vv_i32m8(__VA_ARGS__)
+#define vmax_vx_i32m8(...) __riscv_vmax_vx_i32m8(__VA_ARGS__)
+#define vmax_vv_i64m1(...) __riscv_vmax_vv_i64m1(__VA_ARGS__)
+#define vmax_vx_i64m1(...) __riscv_vmax_vx_i64m1(__VA_ARGS__)
+#define vmax_vv_i64m2(...) __riscv_vmax_vv_i64m2(__VA_ARGS__)
+#define vmax_vx_i64m2(...) __riscv_vmax_vx_i64m2(__VA_ARGS__)
+#define vmax_vv_i64m4(...) __riscv_vmax_vv_i64m4(__VA_ARGS__)
+#define vmax_vx_i64m4(...) __riscv_vmax_vx_i64m4(__VA_ARGS__)
+#define vmax_vv_i64m8(...) __riscv_vmax_vv_i64m8(__VA_ARGS__)
+#define vmax_vx_i64m8(...) __riscv_vmax_vx_i64m8(__VA_ARGS__)
+#define vminu_vv_u8mf8(...) __riscv_vminu_vv_u8mf8(__VA_ARGS__) // vminu (u8mf8..u64m8, vv/vx): each legacy unprefixed name forwards all arguments to the same-named __riscv_ intrinsic
+#define vminu_vx_u8mf8(...) __riscv_vminu_vx_u8mf8(__VA_ARGS__)
+#define vminu_vv_u8mf4(...) __riscv_vminu_vv_u8mf4(__VA_ARGS__)
+#define vminu_vx_u8mf4(...) __riscv_vminu_vx_u8mf4(__VA_ARGS__)
+#define vminu_vv_u8mf2(...) __riscv_vminu_vv_u8mf2(__VA_ARGS__)
+#define vminu_vx_u8mf2(...) __riscv_vminu_vx_u8mf2(__VA_ARGS__)
+#define vminu_vv_u8m1(...) __riscv_vminu_vv_u8m1(__VA_ARGS__)
+#define vminu_vx_u8m1(...) __riscv_vminu_vx_u8m1(__VA_ARGS__)
+#define vminu_vv_u8m2(...) __riscv_vminu_vv_u8m2(__VA_ARGS__)
+#define vminu_vx_u8m2(...) __riscv_vminu_vx_u8m2(__VA_ARGS__)
+#define vminu_vv_u8m4(...) __riscv_vminu_vv_u8m4(__VA_ARGS__)
+#define vminu_vx_u8m4(...) __riscv_vminu_vx_u8m4(__VA_ARGS__)
+#define vminu_vv_u8m8(...) __riscv_vminu_vv_u8m8(__VA_ARGS__)
+#define vminu_vx_u8m8(...) __riscv_vminu_vx_u8m8(__VA_ARGS__)
+#define vminu_vv_u16mf4(...) __riscv_vminu_vv_u16mf4(__VA_ARGS__)
+#define vminu_vx_u16mf4(...) __riscv_vminu_vx_u16mf4(__VA_ARGS__)
+#define vminu_vv_u16mf2(...) __riscv_vminu_vv_u16mf2(__VA_ARGS__)
+#define vminu_vx_u16mf2(...) __riscv_vminu_vx_u16mf2(__VA_ARGS__)
+#define vminu_vv_u16m1(...) __riscv_vminu_vv_u16m1(__VA_ARGS__)
+#define vminu_vx_u16m1(...) __riscv_vminu_vx_u16m1(__VA_ARGS__)
+#define vminu_vv_u16m2(...) __riscv_vminu_vv_u16m2(__VA_ARGS__)
+#define vminu_vx_u16m2(...) __riscv_vminu_vx_u16m2(__VA_ARGS__)
+#define vminu_vv_u16m4(...) __riscv_vminu_vv_u16m4(__VA_ARGS__)
+#define vminu_vx_u16m4(...) __riscv_vminu_vx_u16m4(__VA_ARGS__)
+#define vminu_vv_u16m8(...) __riscv_vminu_vv_u16m8(__VA_ARGS__)
+#define vminu_vx_u16m8(...) __riscv_vminu_vx_u16m8(__VA_ARGS__)
+#define vminu_vv_u32mf2(...) __riscv_vminu_vv_u32mf2(__VA_ARGS__)
+#define vminu_vx_u32mf2(...) __riscv_vminu_vx_u32mf2(__VA_ARGS__)
+#define vminu_vv_u32m1(...) __riscv_vminu_vv_u32m1(__VA_ARGS__)
+#define vminu_vx_u32m1(...) __riscv_vminu_vx_u32m1(__VA_ARGS__)
+#define vminu_vv_u32m2(...) __riscv_vminu_vv_u32m2(__VA_ARGS__)
+#define vminu_vx_u32m2(...) __riscv_vminu_vx_u32m2(__VA_ARGS__)
+#define vminu_vv_u32m4(...) __riscv_vminu_vv_u32m4(__VA_ARGS__)
+#define vminu_vx_u32m4(...) __riscv_vminu_vx_u32m4(__VA_ARGS__)
+#define vminu_vv_u32m8(...) __riscv_vminu_vv_u32m8(__VA_ARGS__)
+#define vminu_vx_u32m8(...) __riscv_vminu_vx_u32m8(__VA_ARGS__)
+#define vminu_vv_u64m1(...) __riscv_vminu_vv_u64m1(__VA_ARGS__)
+#define vminu_vx_u64m1(...) __riscv_vminu_vx_u64m1(__VA_ARGS__)
+#define vminu_vv_u64m2(...) __riscv_vminu_vv_u64m2(__VA_ARGS__)
+#define vminu_vx_u64m2(...) __riscv_vminu_vx_u64m2(__VA_ARGS__)
+#define vminu_vv_u64m4(...) __riscv_vminu_vv_u64m4(__VA_ARGS__)
+#define vminu_vx_u64m4(...) __riscv_vminu_vx_u64m4(__VA_ARGS__)
+#define vminu_vv_u64m8(...) __riscv_vminu_vv_u64m8(__VA_ARGS__)
+#define vminu_vx_u64m8(...) __riscv_vminu_vx_u64m8(__VA_ARGS__)
+#define vmaxu_vv_u8mf8(...) __riscv_vmaxu_vv_u8mf8(__VA_ARGS__) // vmaxu (u8mf8..u64m8, vv/vx): each legacy unprefixed name forwards all arguments to the same-named __riscv_ intrinsic
+#define vmaxu_vx_u8mf8(...) __riscv_vmaxu_vx_u8mf8(__VA_ARGS__)
+#define vmaxu_vv_u8mf4(...) __riscv_vmaxu_vv_u8mf4(__VA_ARGS__)
+#define vmaxu_vx_u8mf4(...) __riscv_vmaxu_vx_u8mf4(__VA_ARGS__)
+#define vmaxu_vv_u8mf2(...) __riscv_vmaxu_vv_u8mf2(__VA_ARGS__)
+#define vmaxu_vx_u8mf2(...) __riscv_vmaxu_vx_u8mf2(__VA_ARGS__)
+#define vmaxu_vv_u8m1(...) __riscv_vmaxu_vv_u8m1(__VA_ARGS__)
+#define vmaxu_vx_u8m1(...) __riscv_vmaxu_vx_u8m1(__VA_ARGS__)
+#define vmaxu_vv_u8m2(...) __riscv_vmaxu_vv_u8m2(__VA_ARGS__)
+#define vmaxu_vx_u8m2(...) __riscv_vmaxu_vx_u8m2(__VA_ARGS__)
+#define vmaxu_vv_u8m4(...) __riscv_vmaxu_vv_u8m4(__VA_ARGS__)
+#define vmaxu_vx_u8m4(...) __riscv_vmaxu_vx_u8m4(__VA_ARGS__)
+#define vmaxu_vv_u8m8(...) __riscv_vmaxu_vv_u8m8(__VA_ARGS__)
+#define vmaxu_vx_u8m8(...) __riscv_vmaxu_vx_u8m8(__VA_ARGS__)
+#define vmaxu_vv_u16mf4(...) __riscv_vmaxu_vv_u16mf4(__VA_ARGS__)
+#define vmaxu_vx_u16mf4(...) __riscv_vmaxu_vx_u16mf4(__VA_ARGS__)
+#define vmaxu_vv_u16mf2(...) __riscv_vmaxu_vv_u16mf2(__VA_ARGS__)
+#define vmaxu_vx_u16mf2(...) __riscv_vmaxu_vx_u16mf2(__VA_ARGS__)
+#define vmaxu_vv_u16m1(...) __riscv_vmaxu_vv_u16m1(__VA_ARGS__)
+#define vmaxu_vx_u16m1(...) __riscv_vmaxu_vx_u16m1(__VA_ARGS__)
+#define vmaxu_vv_u16m2(...) __riscv_vmaxu_vv_u16m2(__VA_ARGS__)
+#define vmaxu_vx_u16m2(...) __riscv_vmaxu_vx_u16m2(__VA_ARGS__)
+#define vmaxu_vv_u16m4(...) __riscv_vmaxu_vv_u16m4(__VA_ARGS__)
+#define vmaxu_vx_u16m4(...) __riscv_vmaxu_vx_u16m4(__VA_ARGS__)
+#define vmaxu_vv_u16m8(...) __riscv_vmaxu_vv_u16m8(__VA_ARGS__)
+#define vmaxu_vx_u16m8(...) __riscv_vmaxu_vx_u16m8(__VA_ARGS__)
+#define vmaxu_vv_u32mf2(...) __riscv_vmaxu_vv_u32mf2(__VA_ARGS__)
+#define vmaxu_vx_u32mf2(...) __riscv_vmaxu_vx_u32mf2(__VA_ARGS__)
+#define vmaxu_vv_u32m1(...) __riscv_vmaxu_vv_u32m1(__VA_ARGS__)
+#define vmaxu_vx_u32m1(...) __riscv_vmaxu_vx_u32m1(__VA_ARGS__)
+#define vmaxu_vv_u32m2(...) __riscv_vmaxu_vv_u32m2(__VA_ARGS__)
+#define vmaxu_vx_u32m2(...) __riscv_vmaxu_vx_u32m2(__VA_ARGS__)
+#define vmaxu_vv_u32m4(...) __riscv_vmaxu_vv_u32m4(__VA_ARGS__)
+#define vmaxu_vx_u32m4(...) __riscv_vmaxu_vx_u32m4(__VA_ARGS__)
+#define vmaxu_vv_u32m8(...) __riscv_vmaxu_vv_u32m8(__VA_ARGS__)
+#define vmaxu_vx_u32m8(...) __riscv_vmaxu_vx_u32m8(__VA_ARGS__)
+#define vmaxu_vv_u64m1(...) __riscv_vmaxu_vv_u64m1(__VA_ARGS__)
+#define vmaxu_vx_u64m1(...) __riscv_vmaxu_vx_u64m1(__VA_ARGS__)
+#define vmaxu_vv_u64m2(...) __riscv_vmaxu_vv_u64m2(__VA_ARGS__)
+#define vmaxu_vx_u64m2(...) __riscv_vmaxu_vx_u64m2(__VA_ARGS__)
+#define vmaxu_vv_u64m4(...) __riscv_vmaxu_vv_u64m4(__VA_ARGS__)
+#define vmaxu_vx_u64m4(...) __riscv_vmaxu_vx_u64m4(__VA_ARGS__)
+#define vmaxu_vv_u64m8(...) __riscv_vmaxu_vv_u64m8(__VA_ARGS__)
+#define vmaxu_vx_u64m8(...) __riscv_vmaxu_vx_u64m8(__VA_ARGS__)
+// masked functions
+#define vmin_vv_i8mf8_m(...) __riscv_vmin_vv_i8mf8_tumu(__VA_ARGS__) // masked vmin (i8mf8..i64m8, vv/vx): each legacy *_m name forwards all arguments to the __riscv_*_tumu intrinsic
+#define vmin_vx_i8mf8_m(...) __riscv_vmin_vx_i8mf8_tumu(__VA_ARGS__)
+#define vmin_vv_i8mf4_m(...) __riscv_vmin_vv_i8mf4_tumu(__VA_ARGS__)
+#define vmin_vx_i8mf4_m(...) __riscv_vmin_vx_i8mf4_tumu(__VA_ARGS__)
+#define vmin_vv_i8mf2_m(...) __riscv_vmin_vv_i8mf2_tumu(__VA_ARGS__)
+#define vmin_vx_i8mf2_m(...) __riscv_vmin_vx_i8mf2_tumu(__VA_ARGS__)
+#define vmin_vv_i8m1_m(...) __riscv_vmin_vv_i8m1_tumu(__VA_ARGS__)
+#define vmin_vx_i8m1_m(...) __riscv_vmin_vx_i8m1_tumu(__VA_ARGS__)
+#define vmin_vv_i8m2_m(...) __riscv_vmin_vv_i8m2_tumu(__VA_ARGS__)
+#define vmin_vx_i8m2_m(...) __riscv_vmin_vx_i8m2_tumu(__VA_ARGS__)
+#define vmin_vv_i8m4_m(...) __riscv_vmin_vv_i8m4_tumu(__VA_ARGS__)
+#define vmin_vx_i8m4_m(...) __riscv_vmin_vx_i8m4_tumu(__VA_ARGS__)
+#define vmin_vv_i8m8_m(...) __riscv_vmin_vv_i8m8_tumu(__VA_ARGS__)
+#define vmin_vx_i8m8_m(...) __riscv_vmin_vx_i8m8_tumu(__VA_ARGS__)
+#define vmin_vv_i16mf4_m(...) __riscv_vmin_vv_i16mf4_tumu(__VA_ARGS__)
+#define vmin_vx_i16mf4_m(...) __riscv_vmin_vx_i16mf4_tumu(__VA_ARGS__)
+#define vmin_vv_i16mf2_m(...) __riscv_vmin_vv_i16mf2_tumu(__VA_ARGS__)
+#define vmin_vx_i16mf2_m(...) __riscv_vmin_vx_i16mf2_tumu(__VA_ARGS__)
+#define vmin_vv_i16m1_m(...) __riscv_vmin_vv_i16m1_tumu(__VA_ARGS__)
+#define vmin_vx_i16m1_m(...) __riscv_vmin_vx_i16m1_tumu(__VA_ARGS__)
+#define vmin_vv_i16m2_m(...) __riscv_vmin_vv_i16m2_tumu(__VA_ARGS__)
+#define vmin_vx_i16m2_m(...) __riscv_vmin_vx_i16m2_tumu(__VA_ARGS__)
+#define vmin_vv_i16m4_m(...) __riscv_vmin_vv_i16m4_tumu(__VA_ARGS__)
+#define vmin_vx_i16m4_m(...) __riscv_vmin_vx_i16m4_tumu(__VA_ARGS__)
+#define vmin_vv_i16m8_m(...) __riscv_vmin_vv_i16m8_tumu(__VA_ARGS__)
+#define vmin_vx_i16m8_m(...) __riscv_vmin_vx_i16m8_tumu(__VA_ARGS__)
+#define vmin_vv_i32mf2_m(...) __riscv_vmin_vv_i32mf2_tumu(__VA_ARGS__)
+#define vmin_vx_i32mf2_m(...) __riscv_vmin_vx_i32mf2_tumu(__VA_ARGS__)
+#define vmin_vv_i32m1_m(...) __riscv_vmin_vv_i32m1_tumu(__VA_ARGS__)
+#define vmin_vx_i32m1_m(...) __riscv_vmin_vx_i32m1_tumu(__VA_ARGS__)
+#define vmin_vv_i32m2_m(...) __riscv_vmin_vv_i32m2_tumu(__VA_ARGS__)
+#define vmin_vx_i32m2_m(...) __riscv_vmin_vx_i32m2_tumu(__VA_ARGS__)
+#define vmin_vv_i32m4_m(...) __riscv_vmin_vv_i32m4_tumu(__VA_ARGS__)
+#define vmin_vx_i32m4_m(...) __riscv_vmin_vx_i32m4_tumu(__VA_ARGS__)
+#define vmin_vv_i32m8_m(...) __riscv_vmin_vv_i32m8_tumu(__VA_ARGS__)
+#define vmin_vx_i32m8_m(...) __riscv_vmin_vx_i32m8_tumu(__VA_ARGS__)
+#define vmin_vv_i64m1_m(...) __riscv_vmin_vv_i64m1_tumu(__VA_ARGS__)
+#define vmin_vx_i64m1_m(...) __riscv_vmin_vx_i64m1_tumu(__VA_ARGS__)
+#define vmin_vv_i64m2_m(...) __riscv_vmin_vv_i64m2_tumu(__VA_ARGS__)
+#define vmin_vx_i64m2_m(...) __riscv_vmin_vx_i64m2_tumu(__VA_ARGS__)
+#define vmin_vv_i64m4_m(...) __riscv_vmin_vv_i64m4_tumu(__VA_ARGS__)
+#define vmin_vx_i64m4_m(...) __riscv_vmin_vx_i64m4_tumu(__VA_ARGS__)
+#define vmin_vv_i64m8_m(...) __riscv_vmin_vv_i64m8_tumu(__VA_ARGS__)
+#define vmin_vx_i64m8_m(...) __riscv_vmin_vx_i64m8_tumu(__VA_ARGS__)
+#define vmax_vv_i8mf8_m(...) __riscv_vmax_vv_i8mf8_tumu(__VA_ARGS__) // masked vmax (i8mf8..i64m8, vv/vx): each legacy *_m name forwards all arguments to the __riscv_*_tumu intrinsic
+#define vmax_vx_i8mf8_m(...) __riscv_vmax_vx_i8mf8_tumu(__VA_ARGS__)
+#define vmax_vv_i8mf4_m(...) __riscv_vmax_vv_i8mf4_tumu(__VA_ARGS__)
+#define vmax_vx_i8mf4_m(...) __riscv_vmax_vx_i8mf4_tumu(__VA_ARGS__)
+#define vmax_vv_i8mf2_m(...) __riscv_vmax_vv_i8mf2_tumu(__VA_ARGS__)
+#define vmax_vx_i8mf2_m(...) __riscv_vmax_vx_i8mf2_tumu(__VA_ARGS__)
+#define vmax_vv_i8m1_m(...) __riscv_vmax_vv_i8m1_tumu(__VA_ARGS__)
+#define vmax_vx_i8m1_m(...) __riscv_vmax_vx_i8m1_tumu(__VA_ARGS__)
+#define vmax_vv_i8m2_m(...) __riscv_vmax_vv_i8m2_tumu(__VA_ARGS__)
+#define vmax_vx_i8m2_m(...) __riscv_vmax_vx_i8m2_tumu(__VA_ARGS__)
+#define vmax_vv_i8m4_m(...) __riscv_vmax_vv_i8m4_tumu(__VA_ARGS__)
+#define vmax_vx_i8m4_m(...) __riscv_vmax_vx_i8m4_tumu(__VA_ARGS__)
+#define vmax_vv_i8m8_m(...) __riscv_vmax_vv_i8m8_tumu(__VA_ARGS__)
+#define vmax_vx_i8m8_m(...) __riscv_vmax_vx_i8m8_tumu(__VA_ARGS__)
+#define vmax_vv_i16mf4_m(...) __riscv_vmax_vv_i16mf4_tumu(__VA_ARGS__)
+#define vmax_vx_i16mf4_m(...) __riscv_vmax_vx_i16mf4_tumu(__VA_ARGS__)
+#define vmax_vv_i16mf2_m(...) __riscv_vmax_vv_i16mf2_tumu(__VA_ARGS__)
+#define vmax_vx_i16mf2_m(...) __riscv_vmax_vx_i16mf2_tumu(__VA_ARGS__)
+#define vmax_vv_i16m1_m(...) __riscv_vmax_vv_i16m1_tumu(__VA_ARGS__)
+#define vmax_vx_i16m1_m(...) __riscv_vmax_vx_i16m1_tumu(__VA_ARGS__)
+#define vmax_vv_i16m2_m(...) __riscv_vmax_vv_i16m2_tumu(__VA_ARGS__)
+#define vmax_vx_i16m2_m(...) __riscv_vmax_vx_i16m2_tumu(__VA_ARGS__)
+#define vmax_vv_i16m4_m(...) __riscv_vmax_vv_i16m4_tumu(__VA_ARGS__)
+#define vmax_vx_i16m4_m(...) __riscv_vmax_vx_i16m4_tumu(__VA_ARGS__)
+#define vmax_vv_i16m8_m(...) __riscv_vmax_vv_i16m8_tumu(__VA_ARGS__)
+#define vmax_vx_i16m8_m(...) __riscv_vmax_vx_i16m8_tumu(__VA_ARGS__)
+#define vmax_vv_i32mf2_m(...) __riscv_vmax_vv_i32mf2_tumu(__VA_ARGS__)
+#define vmax_vx_i32mf2_m(...) __riscv_vmax_vx_i32mf2_tumu(__VA_ARGS__)
+#define vmax_vv_i32m1_m(...) __riscv_vmax_vv_i32m1_tumu(__VA_ARGS__)
+#define vmax_vx_i32m1_m(...) __riscv_vmax_vx_i32m1_tumu(__VA_ARGS__)
+#define vmax_vv_i32m2_m(...) __riscv_vmax_vv_i32m2_tumu(__VA_ARGS__)
+#define vmax_vx_i32m2_m(...) __riscv_vmax_vx_i32m2_tumu(__VA_ARGS__)
+#define vmax_vv_i32m4_m(...) __riscv_vmax_vv_i32m4_tumu(__VA_ARGS__)
+#define vmax_vx_i32m4_m(...) __riscv_vmax_vx_i32m4_tumu(__VA_ARGS__)
+#define vmax_vv_i32m8_m(...) __riscv_vmax_vv_i32m8_tumu(__VA_ARGS__)
+#define vmax_vx_i32m8_m(...) __riscv_vmax_vx_i32m8_tumu(__VA_ARGS__)
+#define vmax_vv_i64m1_m(...) __riscv_vmax_vv_i64m1_tumu(__VA_ARGS__)
+#define vmax_vx_i64m1_m(...) __riscv_vmax_vx_i64m1_tumu(__VA_ARGS__)
+#define vmax_vv_i64m2_m(...) __riscv_vmax_vv_i64m2_tumu(__VA_ARGS__)
+#define vmax_vx_i64m2_m(...) __riscv_vmax_vx_i64m2_tumu(__VA_ARGS__)
+#define vmax_vv_i64m4_m(...) __riscv_vmax_vv_i64m4_tumu(__VA_ARGS__)
+#define vmax_vx_i64m4_m(...) __riscv_vmax_vx_i64m4_tumu(__VA_ARGS__)
+#define vmax_vv_i64m8_m(...) __riscv_vmax_vv_i64m8_tumu(__VA_ARGS__)
+#define vmax_vx_i64m8_m(...) __riscv_vmax_vx_i64m8_tumu(__VA_ARGS__)
+#define vminu_vv_u8mf8_m(...) __riscv_vminu_vv_u8mf8_tumu(__VA_ARGS__)
+#define vminu_vx_u8mf8_m(...) __riscv_vminu_vx_u8mf8_tumu(__VA_ARGS__)
+#define vminu_vv_u8mf4_m(...) __riscv_vminu_vv_u8mf4_tumu(__VA_ARGS__)
+#define vminu_vx_u8mf4_m(...) __riscv_vminu_vx_u8mf4_tumu(__VA_ARGS__)
+#define vminu_vv_u8mf2_m(...) __riscv_vminu_vv_u8mf2_tumu(__VA_ARGS__)
+#define vminu_vx_u8mf2_m(...) __riscv_vminu_vx_u8mf2_tumu(__VA_ARGS__)
+#define vminu_vv_u8m1_m(...) __riscv_vminu_vv_u8m1_tumu(__VA_ARGS__)
+#define vminu_vx_u8m1_m(...) __riscv_vminu_vx_u8m1_tumu(__VA_ARGS__)
+#define vminu_vv_u8m2_m(...) __riscv_vminu_vv_u8m2_tumu(__VA_ARGS__)
+#define vminu_vx_u8m2_m(...) __riscv_vminu_vx_u8m2_tumu(__VA_ARGS__)
+#define vminu_vv_u8m4_m(...) __riscv_vminu_vv_u8m4_tumu(__VA_ARGS__)
+#define vminu_vx_u8m4_m(...) __riscv_vminu_vx_u8m4_tumu(__VA_ARGS__)
+#define vminu_vv_u8m8_m(...) __riscv_vminu_vv_u8m8_tumu(__VA_ARGS__)
+#define vminu_vx_u8m8_m(...) __riscv_vminu_vx_u8m8_tumu(__VA_ARGS__)
+#define vminu_vv_u16mf4_m(...) __riscv_vminu_vv_u16mf4_tumu(__VA_ARGS__)
+#define vminu_vx_u16mf4_m(...) __riscv_vminu_vx_u16mf4_tumu(__VA_ARGS__)
+#define vminu_vv_u16mf2_m(...) __riscv_vminu_vv_u16mf2_tumu(__VA_ARGS__)
+#define vminu_vx_u16mf2_m(...) __riscv_vminu_vx_u16mf2_tumu(__VA_ARGS__)
+#define vminu_vv_u16m1_m(...) __riscv_vminu_vv_u16m1_tumu(__VA_ARGS__)
+#define vminu_vx_u16m1_m(...) __riscv_vminu_vx_u16m1_tumu(__VA_ARGS__)
+#define vminu_vv_u16m2_m(...) __riscv_vminu_vv_u16m2_tumu(__VA_ARGS__)
+#define vminu_vx_u16m2_m(...) __riscv_vminu_vx_u16m2_tumu(__VA_ARGS__)
+#define vminu_vv_u16m4_m(...) __riscv_vminu_vv_u16m4_tumu(__VA_ARGS__)
+#define vminu_vx_u16m4_m(...) __riscv_vminu_vx_u16m4_tumu(__VA_ARGS__)
+#define vminu_vv_u16m8_m(...) __riscv_vminu_vv_u16m8_tumu(__VA_ARGS__)
+#define vminu_vx_u16m8_m(...) __riscv_vminu_vx_u16m8_tumu(__VA_ARGS__)
+#define vminu_vv_u32mf2_m(...) __riscv_vminu_vv_u32mf2_tumu(__VA_ARGS__)
+#define vminu_vx_u32mf2_m(...) __riscv_vminu_vx_u32mf2_tumu(__VA_ARGS__)
+#define vminu_vv_u32m1_m(...) __riscv_vminu_vv_u32m1_tumu(__VA_ARGS__)
+#define vminu_vx_u32m1_m(...) __riscv_vminu_vx_u32m1_tumu(__VA_ARGS__)
+#define vminu_vv_u32m2_m(...) __riscv_vminu_vv_u32m2_tumu(__VA_ARGS__)
+#define vminu_vx_u32m2_m(...) __riscv_vminu_vx_u32m2_tumu(__VA_ARGS__)
+#define vminu_vv_u32m4_m(...) __riscv_vminu_vv_u32m4_tumu(__VA_ARGS__)
+#define vminu_vx_u32m4_m(...) __riscv_vminu_vx_u32m4_tumu(__VA_ARGS__)
+#define vminu_vv_u32m8_m(...) __riscv_vminu_vv_u32m8_tumu(__VA_ARGS__)
+#define vminu_vx_u32m8_m(...) __riscv_vminu_vx_u32m8_tumu(__VA_ARGS__)
+#define vminu_vv_u64m1_m(...) __riscv_vminu_vv_u64m1_tumu(__VA_ARGS__)
+#define vminu_vx_u64m1_m(...) __riscv_vminu_vx_u64m1_tumu(__VA_ARGS__)
+#define vminu_vv_u64m2_m(...) __riscv_vminu_vv_u64m2_tumu(__VA_ARGS__)
+#define vminu_vx_u64m2_m(...) __riscv_vminu_vx_u64m2_tumu(__VA_ARGS__)
+#define vminu_vv_u64m4_m(...) __riscv_vminu_vv_u64m4_tumu(__VA_ARGS__)
+#define vminu_vx_u64m4_m(...) __riscv_vminu_vx_u64m4_tumu(__VA_ARGS__)
+#define vminu_vv_u64m8_m(...) __riscv_vminu_vv_u64m8_tumu(__VA_ARGS__)
+#define vminu_vx_u64m8_m(...) __riscv_vminu_vx_u64m8_tumu(__VA_ARGS__)
+#define vmaxu_vv_u8mf8_m(...) __riscv_vmaxu_vv_u8mf8_tumu(__VA_ARGS__)
+#define vmaxu_vx_u8mf8_m(...) __riscv_vmaxu_vx_u8mf8_tumu(__VA_ARGS__)
+#define vmaxu_vv_u8mf4_m(...) __riscv_vmaxu_vv_u8mf4_tumu(__VA_ARGS__)
+#define vmaxu_vx_u8mf4_m(...) __riscv_vmaxu_vx_u8mf4_tumu(__VA_ARGS__)
+#define vmaxu_vv_u8mf2_m(...) __riscv_vmaxu_vv_u8mf2_tumu(__VA_ARGS__)
+#define vmaxu_vx_u8mf2_m(...) __riscv_vmaxu_vx_u8mf2_tumu(__VA_ARGS__)
+#define vmaxu_vv_u8m1_m(...) __riscv_vmaxu_vv_u8m1_tumu(__VA_ARGS__)
+#define vmaxu_vx_u8m1_m(...) __riscv_vmaxu_vx_u8m1_tumu(__VA_ARGS__)
+#define vmaxu_vv_u8m2_m(...) __riscv_vmaxu_vv_u8m2_tumu(__VA_ARGS__)
+#define vmaxu_vx_u8m2_m(...) __riscv_vmaxu_vx_u8m2_tumu(__VA_ARGS__)
+#define vmaxu_vv_u8m4_m(...) __riscv_vmaxu_vv_u8m4_tumu(__VA_ARGS__)
+#define vmaxu_vx_u8m4_m(...) __riscv_vmaxu_vx_u8m4_tumu(__VA_ARGS__)
+#define vmaxu_vv_u8m8_m(...) __riscv_vmaxu_vv_u8m8_tumu(__VA_ARGS__)
+#define vmaxu_vx_u8m8_m(...) __riscv_vmaxu_vx_u8m8_tumu(__VA_ARGS__)
+#define vmaxu_vv_u16mf4_m(...) __riscv_vmaxu_vv_u16mf4_tumu(__VA_ARGS__)
+#define vmaxu_vx_u16mf4_m(...) __riscv_vmaxu_vx_u16mf4_tumu(__VA_ARGS__)
+#define vmaxu_vv_u16mf2_m(...) __riscv_vmaxu_vv_u16mf2_tumu(__VA_ARGS__)
+#define vmaxu_vx_u16mf2_m(...) __riscv_vmaxu_vx_u16mf2_tumu(__VA_ARGS__)
+#define vmaxu_vv_u16m1_m(...) __riscv_vmaxu_vv_u16m1_tumu(__VA_ARGS__)
+#define vmaxu_vx_u16m1_m(...) __riscv_vmaxu_vx_u16m1_tumu(__VA_ARGS__)
+#define vmaxu_vv_u16m2_m(...) __riscv_vmaxu_vv_u16m2_tumu(__VA_ARGS__)
+#define vmaxu_vx_u16m2_m(...) __riscv_vmaxu_vx_u16m2_tumu(__VA_ARGS__)
+#define vmaxu_vv_u16m4_m(...) __riscv_vmaxu_vv_u16m4_tumu(__VA_ARGS__)
+#define vmaxu_vx_u16m4_m(...) __riscv_vmaxu_vx_u16m4_tumu(__VA_ARGS__)
+#define vmaxu_vv_u16m8_m(...) __riscv_vmaxu_vv_u16m8_tumu(__VA_ARGS__)
+#define vmaxu_vx_u16m8_m(...) __riscv_vmaxu_vx_u16m8_tumu(__VA_ARGS__)
+#define vmaxu_vv_u32mf2_m(...) __riscv_vmaxu_vv_u32mf2_tumu(__VA_ARGS__)
+#define vmaxu_vx_u32mf2_m(...) __riscv_vmaxu_vx_u32mf2_tumu(__VA_ARGS__)
+#define vmaxu_vv_u32m1_m(...) __riscv_vmaxu_vv_u32m1_tumu(__VA_ARGS__)
+#define vmaxu_vx_u32m1_m(...) __riscv_vmaxu_vx_u32m1_tumu(__VA_ARGS__)
+#define vmaxu_vv_u32m2_m(...) __riscv_vmaxu_vv_u32m2_tumu(__VA_ARGS__)
+#define vmaxu_vx_u32m2_m(...) __riscv_vmaxu_vx_u32m2_tumu(__VA_ARGS__)
+#define vmaxu_vv_u32m4_m(...) __riscv_vmaxu_vv_u32m4_tumu(__VA_ARGS__)
+#define vmaxu_vx_u32m4_m(...) __riscv_vmaxu_vx_u32m4_tumu(__VA_ARGS__)
+#define vmaxu_vv_u32m8_m(...) __riscv_vmaxu_vv_u32m8_tumu(__VA_ARGS__)
+#define vmaxu_vx_u32m8_m(...) __riscv_vmaxu_vx_u32m8_tumu(__VA_ARGS__)
+#define vmaxu_vv_u64m1_m(...) __riscv_vmaxu_vv_u64m1_tumu(__VA_ARGS__)
+#define vmaxu_vx_u64m1_m(...) __riscv_vmaxu_vx_u64m1_tumu(__VA_ARGS__)
+#define vmaxu_vv_u64m2_m(...) __riscv_vmaxu_vv_u64m2_tumu(__VA_ARGS__)
+#define vmaxu_vx_u64m2_m(...) __riscv_vmaxu_vx_u64m2_tumu(__VA_ARGS__)
+#define vmaxu_vv_u64m4_m(...) __riscv_vmaxu_vv_u64m4_tumu(__VA_ARGS__)
+#define vmaxu_vx_u64m4_m(...) __riscv_vmaxu_vx_u64m4_tumu(__VA_ARGS__)
+#define vmaxu_vv_u64m8_m(...) __riscv_vmaxu_vv_u64m8_tumu(__VA_ARGS__)
+#define vmaxu_vx_u64m8_m(...) __riscv_vmaxu_vx_u64m8_tumu(__VA_ARGS__)
+#define vmul_vv_i8mf8(...) __riscv_vmul_vv_i8mf8(__VA_ARGS__)
+#define vmul_vx_i8mf8(...) __riscv_vmul_vx_i8mf8(__VA_ARGS__)
+#define vmul_vv_i8mf4(...) __riscv_vmul_vv_i8mf4(__VA_ARGS__)
+#define vmul_vx_i8mf4(...) __riscv_vmul_vx_i8mf4(__VA_ARGS__)
+#define vmul_vv_i8mf2(...) __riscv_vmul_vv_i8mf2(__VA_ARGS__)
+#define vmul_vx_i8mf2(...) __riscv_vmul_vx_i8mf2(__VA_ARGS__)
+#define vmul_vv_i8m1(...) __riscv_vmul_vv_i8m1(__VA_ARGS__)
+#define vmul_vx_i8m1(...) __riscv_vmul_vx_i8m1(__VA_ARGS__)
+#define vmul_vv_i8m2(...) __riscv_vmul_vv_i8m2(__VA_ARGS__)
+#define vmul_vx_i8m2(...) __riscv_vmul_vx_i8m2(__VA_ARGS__)
+#define vmul_vv_i8m4(...) __riscv_vmul_vv_i8m4(__VA_ARGS__)
+#define vmul_vx_i8m4(...) __riscv_vmul_vx_i8m4(__VA_ARGS__)
+#define vmul_vv_i8m8(...) __riscv_vmul_vv_i8m8(__VA_ARGS__)
+#define vmul_vx_i8m8(...) __riscv_vmul_vx_i8m8(__VA_ARGS__)
+#define vmul_vv_i16mf4(...) __riscv_vmul_vv_i16mf4(__VA_ARGS__)
+#define vmul_vx_i16mf4(...) __riscv_vmul_vx_i16mf4(__VA_ARGS__)
+#define vmul_vv_i16mf2(...) __riscv_vmul_vv_i16mf2(__VA_ARGS__)
+#define vmul_vx_i16mf2(...) __riscv_vmul_vx_i16mf2(__VA_ARGS__)
+#define vmul_vv_i16m1(...) __riscv_vmul_vv_i16m1(__VA_ARGS__)
+#define vmul_vx_i16m1(...) __riscv_vmul_vx_i16m1(__VA_ARGS__)
+#define vmul_vv_i16m2(...) __riscv_vmul_vv_i16m2(__VA_ARGS__)
+#define vmul_vx_i16m2(...) __riscv_vmul_vx_i16m2(__VA_ARGS__)
+#define vmul_vv_i16m4(...) __riscv_vmul_vv_i16m4(__VA_ARGS__)
+#define vmul_vx_i16m4(...) __riscv_vmul_vx_i16m4(__VA_ARGS__)
+#define vmul_vv_i16m8(...) __riscv_vmul_vv_i16m8(__VA_ARGS__)
+#define vmul_vx_i16m8(...) __riscv_vmul_vx_i16m8(__VA_ARGS__)
+#define vmul_vv_i32mf2(...) __riscv_vmul_vv_i32mf2(__VA_ARGS__)
+#define vmul_vx_i32mf2(...) __riscv_vmul_vx_i32mf2(__VA_ARGS__)
+#define vmul_vv_i32m1(...) __riscv_vmul_vv_i32m1(__VA_ARGS__)
+#define vmul_vx_i32m1(...) __riscv_vmul_vx_i32m1(__VA_ARGS__)
+#define vmul_vv_i32m2(...) __riscv_vmul_vv_i32m2(__VA_ARGS__)
+#define vmul_vx_i32m2(...) __riscv_vmul_vx_i32m2(__VA_ARGS__)
+#define vmul_vv_i32m4(...) __riscv_vmul_vv_i32m4(__VA_ARGS__)
+#define vmul_vx_i32m4(...) __riscv_vmul_vx_i32m4(__VA_ARGS__)
+#define vmul_vv_i32m8(...) __riscv_vmul_vv_i32m8(__VA_ARGS__)
+#define vmul_vx_i32m8(...) __riscv_vmul_vx_i32m8(__VA_ARGS__)
+#define vmul_vv_i64m1(...) __riscv_vmul_vv_i64m1(__VA_ARGS__)
+#define vmul_vx_i64m1(...) __riscv_vmul_vx_i64m1(__VA_ARGS__)
+#define vmul_vv_i64m2(...) __riscv_vmul_vv_i64m2(__VA_ARGS__)
+#define vmul_vx_i64m2(...) __riscv_vmul_vx_i64m2(__VA_ARGS__)
+#define vmul_vv_i64m4(...) __riscv_vmul_vv_i64m4(__VA_ARGS__)
+#define vmul_vx_i64m4(...) __riscv_vmul_vx_i64m4(__VA_ARGS__)
+#define vmul_vv_i64m8(...) __riscv_vmul_vv_i64m8(__VA_ARGS__)
+#define vmul_vx_i64m8(...) __riscv_vmul_vx_i64m8(__VA_ARGS__)
+#define vmulh_vv_i8mf8(...) __riscv_vmulh_vv_i8mf8(__VA_ARGS__)
+#define vmulh_vx_i8mf8(...) __riscv_vmulh_vx_i8mf8(__VA_ARGS__)
+#define vmulh_vv_i8mf4(...) __riscv_vmulh_vv_i8mf4(__VA_ARGS__)
+#define vmulh_vx_i8mf4(...) __riscv_vmulh_vx_i8mf4(__VA_ARGS__)
+#define vmulh_vv_i8mf2(...) __riscv_vmulh_vv_i8mf2(__VA_ARGS__)
+#define vmulh_vx_i8mf2(...) __riscv_vmulh_vx_i8mf2(__VA_ARGS__)
+#define vmulh_vv_i8m1(...) __riscv_vmulh_vv_i8m1(__VA_ARGS__)
+#define vmulh_vx_i8m1(...) __riscv_vmulh_vx_i8m1(__VA_ARGS__)
+#define vmulh_vv_i8m2(...) __riscv_vmulh_vv_i8m2(__VA_ARGS__)
+#define vmulh_vx_i8m2(...) __riscv_vmulh_vx_i8m2(__VA_ARGS__)
+#define vmulh_vv_i8m4(...) __riscv_vmulh_vv_i8m4(__VA_ARGS__)
+#define vmulh_vx_i8m4(...) __riscv_vmulh_vx_i8m4(__VA_ARGS__)
+#define vmulh_vv_i8m8(...) __riscv_vmulh_vv_i8m8(__VA_ARGS__)
+#define vmulh_vx_i8m8(...) __riscv_vmulh_vx_i8m8(__VA_ARGS__)
+#define vmulh_vv_i16mf4(...) __riscv_vmulh_vv_i16mf4(__VA_ARGS__)
+#define vmulh_vx_i16mf4(...) __riscv_vmulh_vx_i16mf4(__VA_ARGS__)
+#define vmulh_vv_i16mf2(...) __riscv_vmulh_vv_i16mf2(__VA_ARGS__)
+#define vmulh_vx_i16mf2(...) __riscv_vmulh_vx_i16mf2(__VA_ARGS__)
+#define vmulh_vv_i16m1(...) __riscv_vmulh_vv_i16m1(__VA_ARGS__)
+#define vmulh_vx_i16m1(...) __riscv_vmulh_vx_i16m1(__VA_ARGS__)
+#define vmulh_vv_i16m2(...) __riscv_vmulh_vv_i16m2(__VA_ARGS__)
+#define vmulh_vx_i16m2(...) __riscv_vmulh_vx_i16m2(__VA_ARGS__)
+#define vmulh_vv_i16m4(...) __riscv_vmulh_vv_i16m4(__VA_ARGS__)
+#define vmulh_vx_i16m4(...) __riscv_vmulh_vx_i16m4(__VA_ARGS__)
+#define vmulh_vv_i16m8(...) __riscv_vmulh_vv_i16m8(__VA_ARGS__)
+#define vmulh_vx_i16m8(...) __riscv_vmulh_vx_i16m8(__VA_ARGS__)
+#define vmulh_vv_i32mf2(...) __riscv_vmulh_vv_i32mf2(__VA_ARGS__)
+#define vmulh_vx_i32mf2(...) __riscv_vmulh_vx_i32mf2(__VA_ARGS__)
+#define vmulh_vv_i32m1(...) __riscv_vmulh_vv_i32m1(__VA_ARGS__)
+#define vmulh_vx_i32m1(...) __riscv_vmulh_vx_i32m1(__VA_ARGS__)
+#define vmulh_vv_i32m2(...) __riscv_vmulh_vv_i32m2(__VA_ARGS__)
+#define vmulh_vx_i32m2(...) __riscv_vmulh_vx_i32m2(__VA_ARGS__)
+#define vmulh_vv_i32m4(...) __riscv_vmulh_vv_i32m4(__VA_ARGS__)
+#define vmulh_vx_i32m4(...) __riscv_vmulh_vx_i32m4(__VA_ARGS__)
+#define vmulh_vv_i32m8(...) __riscv_vmulh_vv_i32m8(__VA_ARGS__)
+#define vmulh_vx_i32m8(...) __riscv_vmulh_vx_i32m8(__VA_ARGS__)
+#define vmulh_vv_i64m1(...) __riscv_vmulh_vv_i64m1(__VA_ARGS__)
+#define vmulh_vx_i64m1(...) __riscv_vmulh_vx_i64m1(__VA_ARGS__)
+#define vmulh_vv_i64m2(...) __riscv_vmulh_vv_i64m2(__VA_ARGS__)
+#define vmulh_vx_i64m2(...) __riscv_vmulh_vx_i64m2(__VA_ARGS__)
+#define vmulh_vv_i64m4(...) __riscv_vmulh_vv_i64m4(__VA_ARGS__)
+#define vmulh_vx_i64m4(...) __riscv_vmulh_vx_i64m4(__VA_ARGS__)
+#define vmulh_vv_i64m8(...) __riscv_vmulh_vv_i64m8(__VA_ARGS__)
+#define vmulh_vx_i64m8(...) __riscv_vmulh_vx_i64m8(__VA_ARGS__)
+#define vmulhsu_vv_i8mf8(...) __riscv_vmulhsu_vv_i8mf8(__VA_ARGS__)
+#define vmulhsu_vx_i8mf8(...) __riscv_vmulhsu_vx_i8mf8(__VA_ARGS__)
+#define vmulhsu_vv_i8mf4(...) __riscv_vmulhsu_vv_i8mf4(__VA_ARGS__)
+#define vmulhsu_vx_i8mf4(...) __riscv_vmulhsu_vx_i8mf4(__VA_ARGS__)
+#define vmulhsu_vv_i8mf2(...) __riscv_vmulhsu_vv_i8mf2(__VA_ARGS__)
+#define vmulhsu_vx_i8mf2(...) __riscv_vmulhsu_vx_i8mf2(__VA_ARGS__)
+#define vmulhsu_vv_i8m1(...) __riscv_vmulhsu_vv_i8m1(__VA_ARGS__)
+#define vmulhsu_vx_i8m1(...) __riscv_vmulhsu_vx_i8m1(__VA_ARGS__)
+#define vmulhsu_vv_i8m2(...) __riscv_vmulhsu_vv_i8m2(__VA_ARGS__)
+#define vmulhsu_vx_i8m2(...) __riscv_vmulhsu_vx_i8m2(__VA_ARGS__)
+#define vmulhsu_vv_i8m4(...) __riscv_vmulhsu_vv_i8m4(__VA_ARGS__)
+#define vmulhsu_vx_i8m4(...) __riscv_vmulhsu_vx_i8m4(__VA_ARGS__)
+#define vmulhsu_vv_i8m8(...) __riscv_vmulhsu_vv_i8m8(__VA_ARGS__)
+#define vmulhsu_vx_i8m8(...) __riscv_vmulhsu_vx_i8m8(__VA_ARGS__)
+#define vmulhsu_vv_i16mf4(...) __riscv_vmulhsu_vv_i16mf4(__VA_ARGS__)
+#define vmulhsu_vx_i16mf4(...) __riscv_vmulhsu_vx_i16mf4(__VA_ARGS__)
+#define vmulhsu_vv_i16mf2(...) __riscv_vmulhsu_vv_i16mf2(__VA_ARGS__)
+#define vmulhsu_vx_i16mf2(...) __riscv_vmulhsu_vx_i16mf2(__VA_ARGS__)
+#define vmulhsu_vv_i16m1(...) __riscv_vmulhsu_vv_i16m1(__VA_ARGS__)
+#define vmulhsu_vx_i16m1(...) __riscv_vmulhsu_vx_i16m1(__VA_ARGS__)
+#define vmulhsu_vv_i16m2(...) __riscv_vmulhsu_vv_i16m2(__VA_ARGS__)
+#define vmulhsu_vx_i16m2(...) __riscv_vmulhsu_vx_i16m2(__VA_ARGS__)
+#define vmulhsu_vv_i16m4(...) __riscv_vmulhsu_vv_i16m4(__VA_ARGS__)
+#define vmulhsu_vx_i16m4(...) __riscv_vmulhsu_vx_i16m4(__VA_ARGS__)
+#define vmulhsu_vv_i16m8(...) __riscv_vmulhsu_vv_i16m8(__VA_ARGS__)
+#define vmulhsu_vx_i16m8(...) __riscv_vmulhsu_vx_i16m8(__VA_ARGS__)
+#define vmulhsu_vv_i32mf2(...) __riscv_vmulhsu_vv_i32mf2(__VA_ARGS__)
+#define vmulhsu_vx_i32mf2(...) __riscv_vmulhsu_vx_i32mf2(__VA_ARGS__)
+#define vmulhsu_vv_i32m1(...) __riscv_vmulhsu_vv_i32m1(__VA_ARGS__)
+#define vmulhsu_vx_i32m1(...) __riscv_vmulhsu_vx_i32m1(__VA_ARGS__)
+#define vmulhsu_vv_i32m2(...) __riscv_vmulhsu_vv_i32m2(__VA_ARGS__)
+#define vmulhsu_vx_i32m2(...) __riscv_vmulhsu_vx_i32m2(__VA_ARGS__)
+#define vmulhsu_vv_i32m4(...) __riscv_vmulhsu_vv_i32m4(__VA_ARGS__)
+#define vmulhsu_vx_i32m4(...) __riscv_vmulhsu_vx_i32m4(__VA_ARGS__)
+#define vmulhsu_vv_i32m8(...) __riscv_vmulhsu_vv_i32m8(__VA_ARGS__)
+#define vmulhsu_vx_i32m8(...) __riscv_vmulhsu_vx_i32m8(__VA_ARGS__)
+#define vmulhsu_vv_i64m1(...) __riscv_vmulhsu_vv_i64m1(__VA_ARGS__)
+#define vmulhsu_vx_i64m1(...) __riscv_vmulhsu_vx_i64m1(__VA_ARGS__)
+#define vmulhsu_vv_i64m2(...) __riscv_vmulhsu_vv_i64m2(__VA_ARGS__)
+#define vmulhsu_vx_i64m2(...) __riscv_vmulhsu_vx_i64m2(__VA_ARGS__)
+#define vmulhsu_vv_i64m4(...) __riscv_vmulhsu_vv_i64m4(__VA_ARGS__)
+#define vmulhsu_vx_i64m4(...) __riscv_vmulhsu_vx_i64m4(__VA_ARGS__)
+#define vmulhsu_vv_i64m8(...) __riscv_vmulhsu_vv_i64m8(__VA_ARGS__)
+#define vmulhsu_vx_i64m8(...) __riscv_vmulhsu_vx_i64m8(__VA_ARGS__)
+#define vmul_vv_u8mf8(...) __riscv_vmul_vv_u8mf8(__VA_ARGS__)
+#define vmul_vx_u8mf8(...) __riscv_vmul_vx_u8mf8(__VA_ARGS__)
+#define vmul_vv_u8mf4(...) __riscv_vmul_vv_u8mf4(__VA_ARGS__)
+#define vmul_vx_u8mf4(...) __riscv_vmul_vx_u8mf4(__VA_ARGS__)
+#define vmul_vv_u8mf2(...) __riscv_vmul_vv_u8mf2(__VA_ARGS__)
+#define vmul_vx_u8mf2(...) __riscv_vmul_vx_u8mf2(__VA_ARGS__)
+#define vmul_vv_u8m1(...) __riscv_vmul_vv_u8m1(__VA_ARGS__)
+#define vmul_vx_u8m1(...) __riscv_vmul_vx_u8m1(__VA_ARGS__)
+#define vmul_vv_u8m2(...) __riscv_vmul_vv_u8m2(__VA_ARGS__)
+#define vmul_vx_u8m2(...) __riscv_vmul_vx_u8m2(__VA_ARGS__)
+#define vmul_vv_u8m4(...) __riscv_vmul_vv_u8m4(__VA_ARGS__)
+#define vmul_vx_u8m4(...) __riscv_vmul_vx_u8m4(__VA_ARGS__)
+#define vmul_vv_u8m8(...) __riscv_vmul_vv_u8m8(__VA_ARGS__)
+#define vmul_vx_u8m8(...) __riscv_vmul_vx_u8m8(__VA_ARGS__)
+#define vmul_vv_u16mf4(...) __riscv_vmul_vv_u16mf4(__VA_ARGS__)
+#define vmul_vx_u16mf4(...) __riscv_vmul_vx_u16mf4(__VA_ARGS__)
+#define vmul_vv_u16mf2(...) __riscv_vmul_vv_u16mf2(__VA_ARGS__)
+#define vmul_vx_u16mf2(...) __riscv_vmul_vx_u16mf2(__VA_ARGS__)
+#define vmul_vv_u16m1(...) __riscv_vmul_vv_u16m1(__VA_ARGS__)
+#define vmul_vx_u16m1(...) __riscv_vmul_vx_u16m1(__VA_ARGS__)
+#define vmul_vv_u16m2(...) __riscv_vmul_vv_u16m2(__VA_ARGS__)
+#define vmul_vx_u16m2(...) __riscv_vmul_vx_u16m2(__VA_ARGS__)
+#define vmul_vv_u16m4(...) __riscv_vmul_vv_u16m4(__VA_ARGS__)
+#define vmul_vx_u16m4(...) __riscv_vmul_vx_u16m4(__VA_ARGS__)
+#define vmul_vv_u16m8(...) __riscv_vmul_vv_u16m8(__VA_ARGS__)
+#define vmul_vx_u16m8(...) __riscv_vmul_vx_u16m8(__VA_ARGS__)
+#define vmul_vv_u32mf2(...) __riscv_vmul_vv_u32mf2(__VA_ARGS__)
+#define vmul_vx_u32mf2(...) __riscv_vmul_vx_u32mf2(__VA_ARGS__)
+#define vmul_vv_u32m1(...) __riscv_vmul_vv_u32m1(__VA_ARGS__)
+#define vmul_vx_u32m1(...) __riscv_vmul_vx_u32m1(__VA_ARGS__)
+#define vmul_vv_u32m2(...) __riscv_vmul_vv_u32m2(__VA_ARGS__)
+#define vmul_vx_u32m2(...) __riscv_vmul_vx_u32m2(__VA_ARGS__)
+#define vmul_vv_u32m4(...) __riscv_vmul_vv_u32m4(__VA_ARGS__)
+#define vmul_vx_u32m4(...) __riscv_vmul_vx_u32m4(__VA_ARGS__)
+#define vmul_vv_u32m8(...) __riscv_vmul_vv_u32m8(__VA_ARGS__)
+#define vmul_vx_u32m8(...) __riscv_vmul_vx_u32m8(__VA_ARGS__)
+#define vmul_vv_u64m1(...) __riscv_vmul_vv_u64m1(__VA_ARGS__)
+#define vmul_vx_u64m1(...) __riscv_vmul_vx_u64m1(__VA_ARGS__)
+#define vmul_vv_u64m2(...) __riscv_vmul_vv_u64m2(__VA_ARGS__)
+#define vmul_vx_u64m2(...) __riscv_vmul_vx_u64m2(__VA_ARGS__)
+#define vmul_vv_u64m4(...) __riscv_vmul_vv_u64m4(__VA_ARGS__)
+#define vmul_vx_u64m4(...) __riscv_vmul_vx_u64m4(__VA_ARGS__)
+#define vmul_vv_u64m8(...) __riscv_vmul_vv_u64m8(__VA_ARGS__)
+#define vmul_vx_u64m8(...) __riscv_vmul_vx_u64m8(__VA_ARGS__)
+#define vmulhu_vv_u8mf8(...) __riscv_vmulhu_vv_u8mf8(__VA_ARGS__)
+#define vmulhu_vx_u8mf8(...) __riscv_vmulhu_vx_u8mf8(__VA_ARGS__)
+#define vmulhu_vv_u8mf4(...) __riscv_vmulhu_vv_u8mf4(__VA_ARGS__)
+#define vmulhu_vx_u8mf4(...) __riscv_vmulhu_vx_u8mf4(__VA_ARGS__)
+#define vmulhu_vv_u8mf2(...) __riscv_vmulhu_vv_u8mf2(__VA_ARGS__)
+#define vmulhu_vx_u8mf2(...) __riscv_vmulhu_vx_u8mf2(__VA_ARGS__)
+#define vmulhu_vv_u8m1(...) __riscv_vmulhu_vv_u8m1(__VA_ARGS__)
+#define vmulhu_vx_u8m1(...) __riscv_vmulhu_vx_u8m1(__VA_ARGS__)
+#define vmulhu_vv_u8m2(...) __riscv_vmulhu_vv_u8m2(__VA_ARGS__)
+#define vmulhu_vx_u8m2(...) __riscv_vmulhu_vx_u8m2(__VA_ARGS__)
+#define vmulhu_vv_u8m4(...) __riscv_vmulhu_vv_u8m4(__VA_ARGS__)
+#define vmulhu_vx_u8m4(...) __riscv_vmulhu_vx_u8m4(__VA_ARGS__)
+#define vmulhu_vv_u8m8(...) __riscv_vmulhu_vv_u8m8(__VA_ARGS__)
+#define vmulhu_vx_u8m8(...) __riscv_vmulhu_vx_u8m8(__VA_ARGS__)
+#define vmulhu_vv_u16mf4(...) __riscv_vmulhu_vv_u16mf4(__VA_ARGS__)
+#define vmulhu_vx_u16mf4(...) __riscv_vmulhu_vx_u16mf4(__VA_ARGS__)
+#define vmulhu_vv_u16mf2(...) __riscv_vmulhu_vv_u16mf2(__VA_ARGS__)
+#define vmulhu_vx_u16mf2(...) __riscv_vmulhu_vx_u16mf2(__VA_ARGS__)
+#define vmulhu_vv_u16m1(...) __riscv_vmulhu_vv_u16m1(__VA_ARGS__)
+#define vmulhu_vx_u16m1(...) __riscv_vmulhu_vx_u16m1(__VA_ARGS__)
+#define vmulhu_vv_u16m2(...) __riscv_vmulhu_vv_u16m2(__VA_ARGS__)
+#define vmulhu_vx_u16m2(...) __riscv_vmulhu_vx_u16m2(__VA_ARGS__)
+#define vmulhu_vv_u16m4(...) __riscv_vmulhu_vv_u16m4(__VA_ARGS__)
+#define vmulhu_vx_u16m4(...) __riscv_vmulhu_vx_u16m4(__VA_ARGS__)
+#define vmulhu_vv_u16m8(...) __riscv_vmulhu_vv_u16m8(__VA_ARGS__)
+#define vmulhu_vx_u16m8(...) __riscv_vmulhu_vx_u16m8(__VA_ARGS__)
+#define vmulhu_vv_u32mf2(...) __riscv_vmulhu_vv_u32mf2(__VA_ARGS__)
+#define vmulhu_vx_u32mf2(...) __riscv_vmulhu_vx_u32mf2(__VA_ARGS__)
+#define vmulhu_vv_u32m1(...) __riscv_vmulhu_vv_u32m1(__VA_ARGS__)
+#define vmulhu_vx_u32m1(...) __riscv_vmulhu_vx_u32m1(__VA_ARGS__)
+#define vmulhu_vv_u32m2(...) __riscv_vmulhu_vv_u32m2(__VA_ARGS__)
+#define vmulhu_vx_u32m2(...) __riscv_vmulhu_vx_u32m2(__VA_ARGS__)
+#define vmulhu_vv_u32m4(...) __riscv_vmulhu_vv_u32m4(__VA_ARGS__)
+#define vmulhu_vx_u32m4(...) __riscv_vmulhu_vx_u32m4(__VA_ARGS__)
+#define vmulhu_vv_u32m8(...) __riscv_vmulhu_vv_u32m8(__VA_ARGS__)
+#define vmulhu_vx_u32m8(...) __riscv_vmulhu_vx_u32m8(__VA_ARGS__)
+#define vmulhu_vv_u64m1(...) __riscv_vmulhu_vv_u64m1(__VA_ARGS__)
+#define vmulhu_vx_u64m1(...) __riscv_vmulhu_vx_u64m1(__VA_ARGS__)
+#define vmulhu_vv_u64m2(...) __riscv_vmulhu_vv_u64m2(__VA_ARGS__)
+#define vmulhu_vx_u64m2(...) __riscv_vmulhu_vx_u64m2(__VA_ARGS__)
+#define vmulhu_vv_u64m4(...) __riscv_vmulhu_vv_u64m4(__VA_ARGS__)
+#define vmulhu_vx_u64m4(...) __riscv_vmulhu_vx_u64m4(__VA_ARGS__)
+#define vmulhu_vv_u64m8(...) __riscv_vmulhu_vv_u64m8(__VA_ARGS__)
+#define vmulhu_vx_u64m8(...) __riscv_vmulhu_vx_u64m8(__VA_ARGS__)
+// masked functions: each legacy `_m` name forwards to the `_tumu` policy intrinsic
+#define vmul_vv_i8mf8_m(...) __riscv_vmul_vv_i8mf8_tumu(__VA_ARGS__)
+#define vmul_vx_i8mf8_m(...) __riscv_vmul_vx_i8mf8_tumu(__VA_ARGS__)
+#define vmul_vv_i8mf4_m(...) __riscv_vmul_vv_i8mf4_tumu(__VA_ARGS__)
+#define vmul_vx_i8mf4_m(...) __riscv_vmul_vx_i8mf4_tumu(__VA_ARGS__)
+#define vmul_vv_i8mf2_m(...) __riscv_vmul_vv_i8mf2_tumu(__VA_ARGS__)
+#define vmul_vx_i8mf2_m(...) __riscv_vmul_vx_i8mf2_tumu(__VA_ARGS__)
+#define vmul_vv_i8m1_m(...) __riscv_vmul_vv_i8m1_tumu(__VA_ARGS__)
+#define vmul_vx_i8m1_m(...) __riscv_vmul_vx_i8m1_tumu(__VA_ARGS__)
+#define vmul_vv_i8m2_m(...) __riscv_vmul_vv_i8m2_tumu(__VA_ARGS__)
+#define vmul_vx_i8m2_m(...) __riscv_vmul_vx_i8m2_tumu(__VA_ARGS__)
+#define vmul_vv_i8m4_m(...) __riscv_vmul_vv_i8m4_tumu(__VA_ARGS__)
+#define vmul_vx_i8m4_m(...) __riscv_vmul_vx_i8m4_tumu(__VA_ARGS__)
+#define vmul_vv_i8m8_m(...) __riscv_vmul_vv_i8m8_tumu(__VA_ARGS__)
+#define vmul_vx_i8m8_m(...) __riscv_vmul_vx_i8m8_tumu(__VA_ARGS__)
+#define vmul_vv_i16mf4_m(...) __riscv_vmul_vv_i16mf4_tumu(__VA_ARGS__)
+#define vmul_vx_i16mf4_m(...) __riscv_vmul_vx_i16mf4_tumu(__VA_ARGS__)
+#define vmul_vv_i16mf2_m(...) __riscv_vmul_vv_i16mf2_tumu(__VA_ARGS__)
+#define vmul_vx_i16mf2_m(...) __riscv_vmul_vx_i16mf2_tumu(__VA_ARGS__)
+#define vmul_vv_i16m1_m(...) __riscv_vmul_vv_i16m1_tumu(__VA_ARGS__)
+#define vmul_vx_i16m1_m(...) __riscv_vmul_vx_i16m1_tumu(__VA_ARGS__)
+#define vmul_vv_i16m2_m(...) __riscv_vmul_vv_i16m2_tumu(__VA_ARGS__)
+#define vmul_vx_i16m2_m(...) __riscv_vmul_vx_i16m2_tumu(__VA_ARGS__)
+#define vmul_vv_i16m4_m(...) __riscv_vmul_vv_i16m4_tumu(__VA_ARGS__)
+#define vmul_vx_i16m4_m(...) __riscv_vmul_vx_i16m4_tumu(__VA_ARGS__)
+#define vmul_vv_i16m8_m(...) __riscv_vmul_vv_i16m8_tumu(__VA_ARGS__)
+#define vmul_vx_i16m8_m(...) __riscv_vmul_vx_i16m8_tumu(__VA_ARGS__)
+#define vmul_vv_i32mf2_m(...) __riscv_vmul_vv_i32mf2_tumu(__VA_ARGS__)
+#define vmul_vx_i32mf2_m(...) __riscv_vmul_vx_i32mf2_tumu(__VA_ARGS__)
+#define vmul_vv_i32m1_m(...) __riscv_vmul_vv_i32m1_tumu(__VA_ARGS__)
+#define vmul_vx_i32m1_m(...) __riscv_vmul_vx_i32m1_tumu(__VA_ARGS__)
+#define vmul_vv_i32m2_m(...) __riscv_vmul_vv_i32m2_tumu(__VA_ARGS__)
+#define vmul_vx_i32m2_m(...) __riscv_vmul_vx_i32m2_tumu(__VA_ARGS__)
+#define vmul_vv_i32m4_m(...) __riscv_vmul_vv_i32m4_tumu(__VA_ARGS__)
+#define vmul_vx_i32m4_m(...) __riscv_vmul_vx_i32m4_tumu(__VA_ARGS__)
+#define vmul_vv_i32m8_m(...) __riscv_vmul_vv_i32m8_tumu(__VA_ARGS__)
+#define vmul_vx_i32m8_m(...) __riscv_vmul_vx_i32m8_tumu(__VA_ARGS__)
+#define vmul_vv_i64m1_m(...) __riscv_vmul_vv_i64m1_tumu(__VA_ARGS__)
+#define vmul_vx_i64m1_m(...) __riscv_vmul_vx_i64m1_tumu(__VA_ARGS__)
+#define vmul_vv_i64m2_m(...) __riscv_vmul_vv_i64m2_tumu(__VA_ARGS__)
+#define vmul_vx_i64m2_m(...) __riscv_vmul_vx_i64m2_tumu(__VA_ARGS__)
+#define vmul_vv_i64m4_m(...) __riscv_vmul_vv_i64m4_tumu(__VA_ARGS__)
+#define vmul_vx_i64m4_m(...) __riscv_vmul_vx_i64m4_tumu(__VA_ARGS__)
+#define vmul_vv_i64m8_m(...) __riscv_vmul_vv_i64m8_tumu(__VA_ARGS__)
+#define vmul_vx_i64m8_m(...) __riscv_vmul_vx_i64m8_tumu(__VA_ARGS__)
+#define vmulh_vv_i8mf8_m(...) __riscv_vmulh_vv_i8mf8_tumu(__VA_ARGS__)
+#define vmulh_vx_i8mf8_m(...) __riscv_vmulh_vx_i8mf8_tumu(__VA_ARGS__)
+#define vmulh_vv_i8mf4_m(...) __riscv_vmulh_vv_i8mf4_tumu(__VA_ARGS__)
+#define vmulh_vx_i8mf4_m(...) __riscv_vmulh_vx_i8mf4_tumu(__VA_ARGS__)
+#define vmulh_vv_i8mf2_m(...) __riscv_vmulh_vv_i8mf2_tumu(__VA_ARGS__)
+#define vmulh_vx_i8mf2_m(...) __riscv_vmulh_vx_i8mf2_tumu(__VA_ARGS__)
+#define vmulh_vv_i8m1_m(...) __riscv_vmulh_vv_i8m1_tumu(__VA_ARGS__)
+#define vmulh_vx_i8m1_m(...) __riscv_vmulh_vx_i8m1_tumu(__VA_ARGS__)
+#define vmulh_vv_i8m2_m(...) __riscv_vmulh_vv_i8m2_tumu(__VA_ARGS__)
+#define vmulh_vx_i8m2_m(...) __riscv_vmulh_vx_i8m2_tumu(__VA_ARGS__)
+#define vmulh_vv_i8m4_m(...) __riscv_vmulh_vv_i8m4_tumu(__VA_ARGS__)
+#define vmulh_vx_i8m4_m(...) __riscv_vmulh_vx_i8m4_tumu(__VA_ARGS__)
+#define vmulh_vv_i8m8_m(...) __riscv_vmulh_vv_i8m8_tumu(__VA_ARGS__)
+#define vmulh_vx_i8m8_m(...) __riscv_vmulh_vx_i8m8_tumu(__VA_ARGS__)
+#define vmulh_vv_i16mf4_m(...) __riscv_vmulh_vv_i16mf4_tumu(__VA_ARGS__)
+#define vmulh_vx_i16mf4_m(...) __riscv_vmulh_vx_i16mf4_tumu(__VA_ARGS__)
+#define vmulh_vv_i16mf2_m(...) __riscv_vmulh_vv_i16mf2_tumu(__VA_ARGS__)
+#define vmulh_vx_i16mf2_m(...) __riscv_vmulh_vx_i16mf2_tumu(__VA_ARGS__)
+#define vmulh_vv_i16m1_m(...) __riscv_vmulh_vv_i16m1_tumu(__VA_ARGS__)
+#define vmulh_vx_i16m1_m(...) __riscv_vmulh_vx_i16m1_tumu(__VA_ARGS__)
+#define vmulh_vv_i16m2_m(...) __riscv_vmulh_vv_i16m2_tumu(__VA_ARGS__)
+#define vmulh_vx_i16m2_m(...) __riscv_vmulh_vx_i16m2_tumu(__VA_ARGS__)
+#define vmulh_vv_i16m4_m(...) __riscv_vmulh_vv_i16m4_tumu(__VA_ARGS__)
+#define vmulh_vx_i16m4_m(...) __riscv_vmulh_vx_i16m4_tumu(__VA_ARGS__)
+#define vmulh_vv_i16m8_m(...) __riscv_vmulh_vv_i16m8_tumu(__VA_ARGS__)
+#define vmulh_vx_i16m8_m(...) __riscv_vmulh_vx_i16m8_tumu(__VA_ARGS__)
+#define vmulh_vv_i32mf2_m(...) __riscv_vmulh_vv_i32mf2_tumu(__VA_ARGS__)
+#define vmulh_vx_i32mf2_m(...) __riscv_vmulh_vx_i32mf2_tumu(__VA_ARGS__)
+#define vmulh_vv_i32m1_m(...) __riscv_vmulh_vv_i32m1_tumu(__VA_ARGS__)
+#define vmulh_vx_i32m1_m(...) __riscv_vmulh_vx_i32m1_tumu(__VA_ARGS__)
+#define vmulh_vv_i32m2_m(...) __riscv_vmulh_vv_i32m2_tumu(__VA_ARGS__)
+#define vmulh_vx_i32m2_m(...) __riscv_vmulh_vx_i32m2_tumu(__VA_ARGS__)
+#define vmulh_vv_i32m4_m(...) __riscv_vmulh_vv_i32m4_tumu(__VA_ARGS__)
+#define vmulh_vx_i32m4_m(...) __riscv_vmulh_vx_i32m4_tumu(__VA_ARGS__)
+#define vmulh_vv_i32m8_m(...) __riscv_vmulh_vv_i32m8_tumu(__VA_ARGS__)
+#define vmulh_vx_i32m8_m(...) __riscv_vmulh_vx_i32m8_tumu(__VA_ARGS__)
+#define vmulh_vv_i64m1_m(...) __riscv_vmulh_vv_i64m1_tumu(__VA_ARGS__)
+#define vmulh_vx_i64m1_m(...) __riscv_vmulh_vx_i64m1_tumu(__VA_ARGS__)
+#define vmulh_vv_i64m2_m(...) __riscv_vmulh_vv_i64m2_tumu(__VA_ARGS__)
+#define vmulh_vx_i64m2_m(...) __riscv_vmulh_vx_i64m2_tumu(__VA_ARGS__)
+#define vmulh_vv_i64m4_m(...) __riscv_vmulh_vv_i64m4_tumu(__VA_ARGS__)
+#define vmulh_vx_i64m4_m(...) __riscv_vmulh_vx_i64m4_tumu(__VA_ARGS__)
+#define vmulh_vv_i64m8_m(...) __riscv_vmulh_vv_i64m8_tumu(__VA_ARGS__)
+#define vmulh_vx_i64m8_m(...) __riscv_vmulh_vx_i64m8_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i8mf8_m(...) __riscv_vmulhsu_vv_i8mf8_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i8mf8_m(...) __riscv_vmulhsu_vx_i8mf8_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i8mf4_m(...) __riscv_vmulhsu_vv_i8mf4_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i8mf4_m(...) __riscv_vmulhsu_vx_i8mf4_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i8mf2_m(...) __riscv_vmulhsu_vv_i8mf2_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i8mf2_m(...) __riscv_vmulhsu_vx_i8mf2_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i8m1_m(...) __riscv_vmulhsu_vv_i8m1_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i8m1_m(...) __riscv_vmulhsu_vx_i8m1_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i8m2_m(...) __riscv_vmulhsu_vv_i8m2_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i8m2_m(...) __riscv_vmulhsu_vx_i8m2_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i8m4_m(...) __riscv_vmulhsu_vv_i8m4_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i8m4_m(...) __riscv_vmulhsu_vx_i8m4_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i8m8_m(...) __riscv_vmulhsu_vv_i8m8_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i8m8_m(...) __riscv_vmulhsu_vx_i8m8_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i16mf4_m(...) __riscv_vmulhsu_vv_i16mf4_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i16mf4_m(...) __riscv_vmulhsu_vx_i16mf4_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i16mf2_m(...) __riscv_vmulhsu_vv_i16mf2_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i16mf2_m(...) __riscv_vmulhsu_vx_i16mf2_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i16m1_m(...) __riscv_vmulhsu_vv_i16m1_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i16m1_m(...) __riscv_vmulhsu_vx_i16m1_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i16m2_m(...) __riscv_vmulhsu_vv_i16m2_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i16m2_m(...) __riscv_vmulhsu_vx_i16m2_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i16m4_m(...) __riscv_vmulhsu_vv_i16m4_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i16m4_m(...) __riscv_vmulhsu_vx_i16m4_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i16m8_m(...) __riscv_vmulhsu_vv_i16m8_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i16m8_m(...) __riscv_vmulhsu_vx_i16m8_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i32mf2_m(...) __riscv_vmulhsu_vv_i32mf2_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i32mf2_m(...) __riscv_vmulhsu_vx_i32mf2_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i32m1_m(...) __riscv_vmulhsu_vv_i32m1_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i32m1_m(...) __riscv_vmulhsu_vx_i32m1_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i32m2_m(...) __riscv_vmulhsu_vv_i32m2_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i32m2_m(...) __riscv_vmulhsu_vx_i32m2_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i32m4_m(...) __riscv_vmulhsu_vv_i32m4_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i32m4_m(...) __riscv_vmulhsu_vx_i32m4_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i32m8_m(...) __riscv_vmulhsu_vv_i32m8_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i32m8_m(...) __riscv_vmulhsu_vx_i32m8_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i64m1_m(...) __riscv_vmulhsu_vv_i64m1_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i64m1_m(...) __riscv_vmulhsu_vx_i64m1_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i64m2_m(...) __riscv_vmulhsu_vv_i64m2_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i64m2_m(...) __riscv_vmulhsu_vx_i64m2_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i64m4_m(...) __riscv_vmulhsu_vv_i64m4_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i64m4_m(...) __riscv_vmulhsu_vx_i64m4_tumu(__VA_ARGS__)
+#define vmulhsu_vv_i64m8_m(...) __riscv_vmulhsu_vv_i64m8_tumu(__VA_ARGS__)
+#define vmulhsu_vx_i64m8_m(...) __riscv_vmulhsu_vx_i64m8_tumu(__VA_ARGS__)
+#define vmul_vv_u8mf8_m(...) __riscv_vmul_vv_u8mf8_tumu(__VA_ARGS__) // vmul (unsigned types), masked: each legacy *_m name forwards to the __riscv_*_tumu intrinsic
+#define vmul_vx_u8mf8_m(...) __riscv_vmul_vx_u8mf8_tumu(__VA_ARGS__)
+#define vmul_vv_u8mf4_m(...) __riscv_vmul_vv_u8mf4_tumu(__VA_ARGS__)
+#define vmul_vx_u8mf4_m(...) __riscv_vmul_vx_u8mf4_tumu(__VA_ARGS__)
+#define vmul_vv_u8mf2_m(...) __riscv_vmul_vv_u8mf2_tumu(__VA_ARGS__)
+#define vmul_vx_u8mf2_m(...) __riscv_vmul_vx_u8mf2_tumu(__VA_ARGS__)
+#define vmul_vv_u8m1_m(...) __riscv_vmul_vv_u8m1_tumu(__VA_ARGS__)
+#define vmul_vx_u8m1_m(...) __riscv_vmul_vx_u8m1_tumu(__VA_ARGS__)
+#define vmul_vv_u8m2_m(...) __riscv_vmul_vv_u8m2_tumu(__VA_ARGS__)
+#define vmul_vx_u8m2_m(...) __riscv_vmul_vx_u8m2_tumu(__VA_ARGS__)
+#define vmul_vv_u8m4_m(...) __riscv_vmul_vv_u8m4_tumu(__VA_ARGS__)
+#define vmul_vx_u8m4_m(...) __riscv_vmul_vx_u8m4_tumu(__VA_ARGS__)
+#define vmul_vv_u8m8_m(...) __riscv_vmul_vv_u8m8_tumu(__VA_ARGS__)
+#define vmul_vx_u8m8_m(...) __riscv_vmul_vx_u8m8_tumu(__VA_ARGS__)
+#define vmul_vv_u16mf4_m(...) __riscv_vmul_vv_u16mf4_tumu(__VA_ARGS__)
+#define vmul_vx_u16mf4_m(...) __riscv_vmul_vx_u16mf4_tumu(__VA_ARGS__)
+#define vmul_vv_u16mf2_m(...) __riscv_vmul_vv_u16mf2_tumu(__VA_ARGS__)
+#define vmul_vx_u16mf2_m(...) __riscv_vmul_vx_u16mf2_tumu(__VA_ARGS__)
+#define vmul_vv_u16m1_m(...) __riscv_vmul_vv_u16m1_tumu(__VA_ARGS__)
+#define vmul_vx_u16m1_m(...) __riscv_vmul_vx_u16m1_tumu(__VA_ARGS__)
+#define vmul_vv_u16m2_m(...) __riscv_vmul_vv_u16m2_tumu(__VA_ARGS__)
+#define vmul_vx_u16m2_m(...) __riscv_vmul_vx_u16m2_tumu(__VA_ARGS__)
+#define vmul_vv_u16m4_m(...) __riscv_vmul_vv_u16m4_tumu(__VA_ARGS__)
+#define vmul_vx_u16m4_m(...) __riscv_vmul_vx_u16m4_tumu(__VA_ARGS__)
+#define vmul_vv_u16m8_m(...) __riscv_vmul_vv_u16m8_tumu(__VA_ARGS__)
+#define vmul_vx_u16m8_m(...) __riscv_vmul_vx_u16m8_tumu(__VA_ARGS__)
+#define vmul_vv_u32mf2_m(...) __riscv_vmul_vv_u32mf2_tumu(__VA_ARGS__)
+#define vmul_vx_u32mf2_m(...) __riscv_vmul_vx_u32mf2_tumu(__VA_ARGS__)
+#define vmul_vv_u32m1_m(...) __riscv_vmul_vv_u32m1_tumu(__VA_ARGS__)
+#define vmul_vx_u32m1_m(...) __riscv_vmul_vx_u32m1_tumu(__VA_ARGS__)
+#define vmul_vv_u32m2_m(...) __riscv_vmul_vv_u32m2_tumu(__VA_ARGS__)
+#define vmul_vx_u32m2_m(...) __riscv_vmul_vx_u32m2_tumu(__VA_ARGS__)
+#define vmul_vv_u32m4_m(...) __riscv_vmul_vv_u32m4_tumu(__VA_ARGS__)
+#define vmul_vx_u32m4_m(...) __riscv_vmul_vx_u32m4_tumu(__VA_ARGS__)
+#define vmul_vv_u32m8_m(...) __riscv_vmul_vv_u32m8_tumu(__VA_ARGS__)
+#define vmul_vx_u32m8_m(...) __riscv_vmul_vx_u32m8_tumu(__VA_ARGS__)
+#define vmul_vv_u64m1_m(...) __riscv_vmul_vv_u64m1_tumu(__VA_ARGS__)
+#define vmul_vx_u64m1_m(...) __riscv_vmul_vx_u64m1_tumu(__VA_ARGS__)
+#define vmul_vv_u64m2_m(...) __riscv_vmul_vv_u64m2_tumu(__VA_ARGS__)
+#define vmul_vx_u64m2_m(...) __riscv_vmul_vx_u64m2_tumu(__VA_ARGS__)
+#define vmul_vv_u64m4_m(...) __riscv_vmul_vv_u64m4_tumu(__VA_ARGS__)
+#define vmul_vx_u64m4_m(...) __riscv_vmul_vx_u64m4_tumu(__VA_ARGS__)
+#define vmul_vv_u64m8_m(...) __riscv_vmul_vv_u64m8_tumu(__VA_ARGS__)
+#define vmul_vx_u64m8_m(...) __riscv_vmul_vx_u64m8_tumu(__VA_ARGS__)
+#define vmulhu_vv_u8mf8_m(...) __riscv_vmulhu_vv_u8mf8_tumu(__VA_ARGS__) // vmulhu, masked: each legacy *_m name forwards to the __riscv_*_tumu intrinsic
+#define vmulhu_vx_u8mf8_m(...) __riscv_vmulhu_vx_u8mf8_tumu(__VA_ARGS__)
+#define vmulhu_vv_u8mf4_m(...) __riscv_vmulhu_vv_u8mf4_tumu(__VA_ARGS__)
+#define vmulhu_vx_u8mf4_m(...) __riscv_vmulhu_vx_u8mf4_tumu(__VA_ARGS__)
+#define vmulhu_vv_u8mf2_m(...) __riscv_vmulhu_vv_u8mf2_tumu(__VA_ARGS__)
+#define vmulhu_vx_u8mf2_m(...) __riscv_vmulhu_vx_u8mf2_tumu(__VA_ARGS__)
+#define vmulhu_vv_u8m1_m(...) __riscv_vmulhu_vv_u8m1_tumu(__VA_ARGS__)
+#define vmulhu_vx_u8m1_m(...) __riscv_vmulhu_vx_u8m1_tumu(__VA_ARGS__)
+#define vmulhu_vv_u8m2_m(...) __riscv_vmulhu_vv_u8m2_tumu(__VA_ARGS__)
+#define vmulhu_vx_u8m2_m(...) __riscv_vmulhu_vx_u8m2_tumu(__VA_ARGS__)
+#define vmulhu_vv_u8m4_m(...) __riscv_vmulhu_vv_u8m4_tumu(__VA_ARGS__)
+#define vmulhu_vx_u8m4_m(...) __riscv_vmulhu_vx_u8m4_tumu(__VA_ARGS__)
+#define vmulhu_vv_u8m8_m(...) __riscv_vmulhu_vv_u8m8_tumu(__VA_ARGS__)
+#define vmulhu_vx_u8m8_m(...) __riscv_vmulhu_vx_u8m8_tumu(__VA_ARGS__)
+#define vmulhu_vv_u16mf4_m(...) __riscv_vmulhu_vv_u16mf4_tumu(__VA_ARGS__)
+#define vmulhu_vx_u16mf4_m(...) __riscv_vmulhu_vx_u16mf4_tumu(__VA_ARGS__)
+#define vmulhu_vv_u16mf2_m(...) __riscv_vmulhu_vv_u16mf2_tumu(__VA_ARGS__)
+#define vmulhu_vx_u16mf2_m(...) __riscv_vmulhu_vx_u16mf2_tumu(__VA_ARGS__)
+#define vmulhu_vv_u16m1_m(...) __riscv_vmulhu_vv_u16m1_tumu(__VA_ARGS__)
+#define vmulhu_vx_u16m1_m(...) __riscv_vmulhu_vx_u16m1_tumu(__VA_ARGS__)
+#define vmulhu_vv_u16m2_m(...) __riscv_vmulhu_vv_u16m2_tumu(__VA_ARGS__)
+#define vmulhu_vx_u16m2_m(...) __riscv_vmulhu_vx_u16m2_tumu(__VA_ARGS__)
+#define vmulhu_vv_u16m4_m(...) __riscv_vmulhu_vv_u16m4_tumu(__VA_ARGS__)
+#define vmulhu_vx_u16m4_m(...) __riscv_vmulhu_vx_u16m4_tumu(__VA_ARGS__)
+#define vmulhu_vv_u16m8_m(...) __riscv_vmulhu_vv_u16m8_tumu(__VA_ARGS__)
+#define vmulhu_vx_u16m8_m(...) __riscv_vmulhu_vx_u16m8_tumu(__VA_ARGS__)
+#define vmulhu_vv_u32mf2_m(...) __riscv_vmulhu_vv_u32mf2_tumu(__VA_ARGS__)
+#define vmulhu_vx_u32mf2_m(...) __riscv_vmulhu_vx_u32mf2_tumu(__VA_ARGS__)
+#define vmulhu_vv_u32m1_m(...) __riscv_vmulhu_vv_u32m1_tumu(__VA_ARGS__)
+#define vmulhu_vx_u32m1_m(...) __riscv_vmulhu_vx_u32m1_tumu(__VA_ARGS__)
+#define vmulhu_vv_u32m2_m(...) __riscv_vmulhu_vv_u32m2_tumu(__VA_ARGS__)
+#define vmulhu_vx_u32m2_m(...) __riscv_vmulhu_vx_u32m2_tumu(__VA_ARGS__)
+#define vmulhu_vv_u32m4_m(...) __riscv_vmulhu_vv_u32m4_tumu(__VA_ARGS__)
+#define vmulhu_vx_u32m4_m(...) __riscv_vmulhu_vx_u32m4_tumu(__VA_ARGS__)
+#define vmulhu_vv_u32m8_m(...) __riscv_vmulhu_vv_u32m8_tumu(__VA_ARGS__)
+#define vmulhu_vx_u32m8_m(...) __riscv_vmulhu_vx_u32m8_tumu(__VA_ARGS__)
+#define vmulhu_vv_u64m1_m(...) __riscv_vmulhu_vv_u64m1_tumu(__VA_ARGS__)
+#define vmulhu_vx_u64m1_m(...) __riscv_vmulhu_vx_u64m1_tumu(__VA_ARGS__)
+#define vmulhu_vv_u64m2_m(...) __riscv_vmulhu_vv_u64m2_tumu(__VA_ARGS__)
+#define vmulhu_vx_u64m2_m(...) __riscv_vmulhu_vx_u64m2_tumu(__VA_ARGS__)
+#define vmulhu_vv_u64m4_m(...) __riscv_vmulhu_vv_u64m4_tumu(__VA_ARGS__)
+#define vmulhu_vx_u64m4_m(...) __riscv_vmulhu_vx_u64m4_tumu(__VA_ARGS__)
+#define vmulhu_vv_u64m8_m(...) __riscv_vmulhu_vv_u64m8_tumu(__VA_ARGS__)
+#define vmulhu_vx_u64m8_m(...) __riscv_vmulhu_vx_u64m8_tumu(__VA_ARGS__)
+#define vdiv_vv_i8mf8(...) __riscv_vdiv_vv_i8mf8(__VA_ARGS__) // vdiv (signed types): each legacy unprefixed name forwards to the same-named __riscv_-prefixed intrinsic
+#define vdiv_vx_i8mf8(...) __riscv_vdiv_vx_i8mf8(__VA_ARGS__)
+#define vdiv_vv_i8mf4(...) __riscv_vdiv_vv_i8mf4(__VA_ARGS__)
+#define vdiv_vx_i8mf4(...) __riscv_vdiv_vx_i8mf4(__VA_ARGS__)
+#define vdiv_vv_i8mf2(...) __riscv_vdiv_vv_i8mf2(__VA_ARGS__)
+#define vdiv_vx_i8mf2(...) __riscv_vdiv_vx_i8mf2(__VA_ARGS__)
+#define vdiv_vv_i8m1(...) __riscv_vdiv_vv_i8m1(__VA_ARGS__)
+#define vdiv_vx_i8m1(...) __riscv_vdiv_vx_i8m1(__VA_ARGS__)
+#define vdiv_vv_i8m2(...) __riscv_vdiv_vv_i8m2(__VA_ARGS__)
+#define vdiv_vx_i8m2(...) __riscv_vdiv_vx_i8m2(__VA_ARGS__)
+#define vdiv_vv_i8m4(...) __riscv_vdiv_vv_i8m4(__VA_ARGS__)
+#define vdiv_vx_i8m4(...) __riscv_vdiv_vx_i8m4(__VA_ARGS__)
+#define vdiv_vv_i8m8(...) __riscv_vdiv_vv_i8m8(__VA_ARGS__)
+#define vdiv_vx_i8m8(...) __riscv_vdiv_vx_i8m8(__VA_ARGS__)
+#define vdiv_vv_i16mf4(...) __riscv_vdiv_vv_i16mf4(__VA_ARGS__)
+#define vdiv_vx_i16mf4(...) __riscv_vdiv_vx_i16mf4(__VA_ARGS__)
+#define vdiv_vv_i16mf2(...) __riscv_vdiv_vv_i16mf2(__VA_ARGS__)
+#define vdiv_vx_i16mf2(...) __riscv_vdiv_vx_i16mf2(__VA_ARGS__)
+#define vdiv_vv_i16m1(...) __riscv_vdiv_vv_i16m1(__VA_ARGS__)
+#define vdiv_vx_i16m1(...) __riscv_vdiv_vx_i16m1(__VA_ARGS__)
+#define vdiv_vv_i16m2(...) __riscv_vdiv_vv_i16m2(__VA_ARGS__)
+#define vdiv_vx_i16m2(...) __riscv_vdiv_vx_i16m2(__VA_ARGS__)
+#define vdiv_vv_i16m4(...) __riscv_vdiv_vv_i16m4(__VA_ARGS__)
+#define vdiv_vx_i16m4(...) __riscv_vdiv_vx_i16m4(__VA_ARGS__)
+#define vdiv_vv_i16m8(...) __riscv_vdiv_vv_i16m8(__VA_ARGS__)
+#define vdiv_vx_i16m8(...) __riscv_vdiv_vx_i16m8(__VA_ARGS__)
+#define vdiv_vv_i32mf2(...) __riscv_vdiv_vv_i32mf2(__VA_ARGS__)
+#define vdiv_vx_i32mf2(...) __riscv_vdiv_vx_i32mf2(__VA_ARGS__)
+#define vdiv_vv_i32m1(...) __riscv_vdiv_vv_i32m1(__VA_ARGS__)
+#define vdiv_vx_i32m1(...) __riscv_vdiv_vx_i32m1(__VA_ARGS__)
+#define vdiv_vv_i32m2(...) __riscv_vdiv_vv_i32m2(__VA_ARGS__)
+#define vdiv_vx_i32m2(...) __riscv_vdiv_vx_i32m2(__VA_ARGS__)
+#define vdiv_vv_i32m4(...) __riscv_vdiv_vv_i32m4(__VA_ARGS__)
+#define vdiv_vx_i32m4(...) __riscv_vdiv_vx_i32m4(__VA_ARGS__)
+#define vdiv_vv_i32m8(...) __riscv_vdiv_vv_i32m8(__VA_ARGS__)
+#define vdiv_vx_i32m8(...) __riscv_vdiv_vx_i32m8(__VA_ARGS__)
+#define vdiv_vv_i64m1(...) __riscv_vdiv_vv_i64m1(__VA_ARGS__)
+#define vdiv_vx_i64m1(...) __riscv_vdiv_vx_i64m1(__VA_ARGS__)
+#define vdiv_vv_i64m2(...) __riscv_vdiv_vv_i64m2(__VA_ARGS__)
+#define vdiv_vx_i64m2(...) __riscv_vdiv_vx_i64m2(__VA_ARGS__)
+#define vdiv_vv_i64m4(...) __riscv_vdiv_vv_i64m4(__VA_ARGS__)
+#define vdiv_vx_i64m4(...) __riscv_vdiv_vx_i64m4(__VA_ARGS__)
+#define vdiv_vv_i64m8(...) __riscv_vdiv_vv_i64m8(__VA_ARGS__)
+#define vdiv_vx_i64m8(...) __riscv_vdiv_vx_i64m8(__VA_ARGS__)
+#define vrem_vv_i8mf8(...) __riscv_vrem_vv_i8mf8(__VA_ARGS__) // vrem (signed types): each legacy unprefixed name forwards to the same-named __riscv_-prefixed intrinsic
+#define vrem_vx_i8mf8(...) __riscv_vrem_vx_i8mf8(__VA_ARGS__)
+#define vrem_vv_i8mf4(...) __riscv_vrem_vv_i8mf4(__VA_ARGS__)
+#define vrem_vx_i8mf4(...) __riscv_vrem_vx_i8mf4(__VA_ARGS__)
+#define vrem_vv_i8mf2(...) __riscv_vrem_vv_i8mf2(__VA_ARGS__)
+#define vrem_vx_i8mf2(...) __riscv_vrem_vx_i8mf2(__VA_ARGS__)
+#define vrem_vv_i8m1(...) __riscv_vrem_vv_i8m1(__VA_ARGS__)
+#define vrem_vx_i8m1(...) __riscv_vrem_vx_i8m1(__VA_ARGS__)
+#define vrem_vv_i8m2(...) __riscv_vrem_vv_i8m2(__VA_ARGS__)
+#define vrem_vx_i8m2(...) __riscv_vrem_vx_i8m2(__VA_ARGS__)
+#define vrem_vv_i8m4(...) __riscv_vrem_vv_i8m4(__VA_ARGS__)
+#define vrem_vx_i8m4(...) __riscv_vrem_vx_i8m4(__VA_ARGS__)
+#define vrem_vv_i8m8(...) __riscv_vrem_vv_i8m8(__VA_ARGS__)
+#define vrem_vx_i8m8(...) __riscv_vrem_vx_i8m8(__VA_ARGS__)
+#define vrem_vv_i16mf4(...) __riscv_vrem_vv_i16mf4(__VA_ARGS__)
+#define vrem_vx_i16mf4(...) __riscv_vrem_vx_i16mf4(__VA_ARGS__)
+#define vrem_vv_i16mf2(...) __riscv_vrem_vv_i16mf2(__VA_ARGS__)
+#define vrem_vx_i16mf2(...) __riscv_vrem_vx_i16mf2(__VA_ARGS__)
+#define vrem_vv_i16m1(...) __riscv_vrem_vv_i16m1(__VA_ARGS__)
+#define vrem_vx_i16m1(...) __riscv_vrem_vx_i16m1(__VA_ARGS__)
+#define vrem_vv_i16m2(...) __riscv_vrem_vv_i16m2(__VA_ARGS__)
+#define vrem_vx_i16m2(...) __riscv_vrem_vx_i16m2(__VA_ARGS__)
+#define vrem_vv_i16m4(...) __riscv_vrem_vv_i16m4(__VA_ARGS__)
+#define vrem_vx_i16m4(...) __riscv_vrem_vx_i16m4(__VA_ARGS__)
+#define vrem_vv_i16m8(...) __riscv_vrem_vv_i16m8(__VA_ARGS__)
+#define vrem_vx_i16m8(...) __riscv_vrem_vx_i16m8(__VA_ARGS__)
+#define vrem_vv_i32mf2(...) __riscv_vrem_vv_i32mf2(__VA_ARGS__)
+#define vrem_vx_i32mf2(...) __riscv_vrem_vx_i32mf2(__VA_ARGS__)
+#define vrem_vv_i32m1(...) __riscv_vrem_vv_i32m1(__VA_ARGS__)
+#define vrem_vx_i32m1(...) __riscv_vrem_vx_i32m1(__VA_ARGS__)
+#define vrem_vv_i32m2(...) __riscv_vrem_vv_i32m2(__VA_ARGS__)
+#define vrem_vx_i32m2(...) __riscv_vrem_vx_i32m2(__VA_ARGS__)
+#define vrem_vv_i32m4(...) __riscv_vrem_vv_i32m4(__VA_ARGS__)
+#define vrem_vx_i32m4(...) __riscv_vrem_vx_i32m4(__VA_ARGS__)
+#define vrem_vv_i32m8(...) __riscv_vrem_vv_i32m8(__VA_ARGS__)
+#define vrem_vx_i32m8(...) __riscv_vrem_vx_i32m8(__VA_ARGS__)
+#define vrem_vv_i64m1(...) __riscv_vrem_vv_i64m1(__VA_ARGS__)
+#define vrem_vx_i64m1(...) __riscv_vrem_vx_i64m1(__VA_ARGS__)
+#define vrem_vv_i64m2(...) __riscv_vrem_vv_i64m2(__VA_ARGS__)
+#define vrem_vx_i64m2(...) __riscv_vrem_vx_i64m2(__VA_ARGS__)
+#define vrem_vv_i64m4(...) __riscv_vrem_vv_i64m4(__VA_ARGS__)
+#define vrem_vx_i64m4(...) __riscv_vrem_vx_i64m4(__VA_ARGS__)
+#define vrem_vv_i64m8(...) __riscv_vrem_vv_i64m8(__VA_ARGS__)
+#define vrem_vx_i64m8(...) __riscv_vrem_vx_i64m8(__VA_ARGS__)
+#define vdivu_vv_u8mf8(...) __riscv_vdivu_vv_u8mf8(__VA_ARGS__) // vdivu (unsigned types): each legacy unprefixed name forwards to the same-named __riscv_-prefixed intrinsic
+#define vdivu_vx_u8mf8(...) __riscv_vdivu_vx_u8mf8(__VA_ARGS__)
+#define vdivu_vv_u8mf4(...) __riscv_vdivu_vv_u8mf4(__VA_ARGS__)
+#define vdivu_vx_u8mf4(...) __riscv_vdivu_vx_u8mf4(__VA_ARGS__)
+#define vdivu_vv_u8mf2(...) __riscv_vdivu_vv_u8mf2(__VA_ARGS__)
+#define vdivu_vx_u8mf2(...) __riscv_vdivu_vx_u8mf2(__VA_ARGS__)
+#define vdivu_vv_u8m1(...) __riscv_vdivu_vv_u8m1(__VA_ARGS__)
+#define vdivu_vx_u8m1(...) __riscv_vdivu_vx_u8m1(__VA_ARGS__)
+#define vdivu_vv_u8m2(...) __riscv_vdivu_vv_u8m2(__VA_ARGS__)
+#define vdivu_vx_u8m2(...) __riscv_vdivu_vx_u8m2(__VA_ARGS__)
+#define vdivu_vv_u8m4(...) __riscv_vdivu_vv_u8m4(__VA_ARGS__)
+#define vdivu_vx_u8m4(...) __riscv_vdivu_vx_u8m4(__VA_ARGS__)
+#define vdivu_vv_u8m8(...) __riscv_vdivu_vv_u8m8(__VA_ARGS__)
+#define vdivu_vx_u8m8(...) __riscv_vdivu_vx_u8m8(__VA_ARGS__)
+#define vdivu_vv_u16mf4(...) __riscv_vdivu_vv_u16mf4(__VA_ARGS__)
+#define vdivu_vx_u16mf4(...) __riscv_vdivu_vx_u16mf4(__VA_ARGS__)
+#define vdivu_vv_u16mf2(...) __riscv_vdivu_vv_u16mf2(__VA_ARGS__)
+#define vdivu_vx_u16mf2(...) __riscv_vdivu_vx_u16mf2(__VA_ARGS__)
+#define vdivu_vv_u16m1(...) __riscv_vdivu_vv_u16m1(__VA_ARGS__)
+#define vdivu_vx_u16m1(...) __riscv_vdivu_vx_u16m1(__VA_ARGS__)
+#define vdivu_vv_u16m2(...) __riscv_vdivu_vv_u16m2(__VA_ARGS__)
+#define vdivu_vx_u16m2(...) __riscv_vdivu_vx_u16m2(__VA_ARGS__)
+#define vdivu_vv_u16m4(...) __riscv_vdivu_vv_u16m4(__VA_ARGS__)
+#define vdivu_vx_u16m4(...) __riscv_vdivu_vx_u16m4(__VA_ARGS__)
+#define vdivu_vv_u16m8(...) __riscv_vdivu_vv_u16m8(__VA_ARGS__)
+#define vdivu_vx_u16m8(...) __riscv_vdivu_vx_u16m8(__VA_ARGS__)
+#define vdivu_vv_u32mf2(...) __riscv_vdivu_vv_u32mf2(__VA_ARGS__)
+#define vdivu_vx_u32mf2(...) __riscv_vdivu_vx_u32mf2(__VA_ARGS__)
+#define vdivu_vv_u32m1(...) __riscv_vdivu_vv_u32m1(__VA_ARGS__)
+#define vdivu_vx_u32m1(...) __riscv_vdivu_vx_u32m1(__VA_ARGS__)
+#define vdivu_vv_u32m2(...) __riscv_vdivu_vv_u32m2(__VA_ARGS__)
+#define vdivu_vx_u32m2(...) __riscv_vdivu_vx_u32m2(__VA_ARGS__)
+#define vdivu_vv_u32m4(...) __riscv_vdivu_vv_u32m4(__VA_ARGS__)
+#define vdivu_vx_u32m4(...) __riscv_vdivu_vx_u32m4(__VA_ARGS__)
+#define vdivu_vv_u32m8(...) __riscv_vdivu_vv_u32m8(__VA_ARGS__)
+#define vdivu_vx_u32m8(...) __riscv_vdivu_vx_u32m8(__VA_ARGS__)
+#define vdivu_vv_u64m1(...) __riscv_vdivu_vv_u64m1(__VA_ARGS__)
+#define vdivu_vx_u64m1(...) __riscv_vdivu_vx_u64m1(__VA_ARGS__)
+#define vdivu_vv_u64m2(...) __riscv_vdivu_vv_u64m2(__VA_ARGS__)
+#define vdivu_vx_u64m2(...) __riscv_vdivu_vx_u64m2(__VA_ARGS__)
+#define vdivu_vv_u64m4(...) __riscv_vdivu_vv_u64m4(__VA_ARGS__)
+#define vdivu_vx_u64m4(...) __riscv_vdivu_vx_u64m4(__VA_ARGS__)
+#define vdivu_vv_u64m8(...) __riscv_vdivu_vv_u64m8(__VA_ARGS__)
+#define vdivu_vx_u64m8(...) __riscv_vdivu_vx_u64m8(__VA_ARGS__)
+#define vremu_vv_u8mf8(...) __riscv_vremu_vv_u8mf8(__VA_ARGS__) // vremu (unsigned types): each legacy unprefixed name forwards to the same-named __riscv_-prefixed intrinsic
+#define vremu_vx_u8mf8(...) __riscv_vremu_vx_u8mf8(__VA_ARGS__)
+#define vremu_vv_u8mf4(...) __riscv_vremu_vv_u8mf4(__VA_ARGS__)
+#define vremu_vx_u8mf4(...) __riscv_vremu_vx_u8mf4(__VA_ARGS__)
+#define vremu_vv_u8mf2(...) __riscv_vremu_vv_u8mf2(__VA_ARGS__)
+#define vremu_vx_u8mf2(...) __riscv_vremu_vx_u8mf2(__VA_ARGS__)
+#define vremu_vv_u8m1(...) __riscv_vremu_vv_u8m1(__VA_ARGS__)
+#define vremu_vx_u8m1(...) __riscv_vremu_vx_u8m1(__VA_ARGS__)
+#define vremu_vv_u8m2(...) __riscv_vremu_vv_u8m2(__VA_ARGS__)
+#define vremu_vx_u8m2(...) __riscv_vremu_vx_u8m2(__VA_ARGS__)
+#define vremu_vv_u8m4(...) __riscv_vremu_vv_u8m4(__VA_ARGS__)
+#define vremu_vx_u8m4(...) __riscv_vremu_vx_u8m4(__VA_ARGS__)
+#define vremu_vv_u8m8(...) __riscv_vremu_vv_u8m8(__VA_ARGS__)
+#define vremu_vx_u8m8(...) __riscv_vremu_vx_u8m8(__VA_ARGS__)
+#define vremu_vv_u16mf4(...) __riscv_vremu_vv_u16mf4(__VA_ARGS__)
+#define vremu_vx_u16mf4(...) __riscv_vremu_vx_u16mf4(__VA_ARGS__)
+#define vremu_vv_u16mf2(...) __riscv_vremu_vv_u16mf2(__VA_ARGS__)
+#define vremu_vx_u16mf2(...) __riscv_vremu_vx_u16mf2(__VA_ARGS__)
+#define vremu_vv_u16m1(...) __riscv_vremu_vv_u16m1(__VA_ARGS__)
+#define vremu_vx_u16m1(...) __riscv_vremu_vx_u16m1(__VA_ARGS__)
+#define vremu_vv_u16m2(...) __riscv_vremu_vv_u16m2(__VA_ARGS__)
+#define vremu_vx_u16m2(...) __riscv_vremu_vx_u16m2(__VA_ARGS__)
+#define vremu_vv_u16m4(...) __riscv_vremu_vv_u16m4(__VA_ARGS__)
+#define vremu_vx_u16m4(...) __riscv_vremu_vx_u16m4(__VA_ARGS__)
+#define vremu_vv_u16m8(...) __riscv_vremu_vv_u16m8(__VA_ARGS__)
+#define vremu_vx_u16m8(...) __riscv_vremu_vx_u16m8(__VA_ARGS__)
+#define vremu_vv_u32mf2(...) __riscv_vremu_vv_u32mf2(__VA_ARGS__)
+#define vremu_vx_u32mf2(...) __riscv_vremu_vx_u32mf2(__VA_ARGS__)
+#define vremu_vv_u32m1(...) __riscv_vremu_vv_u32m1(__VA_ARGS__)
+#define vremu_vx_u32m1(...) __riscv_vremu_vx_u32m1(__VA_ARGS__)
+#define vremu_vv_u32m2(...) __riscv_vremu_vv_u32m2(__VA_ARGS__)
+#define vremu_vx_u32m2(...) __riscv_vremu_vx_u32m2(__VA_ARGS__)
+#define vremu_vv_u32m4(...) __riscv_vremu_vv_u32m4(__VA_ARGS__)
+#define vremu_vx_u32m4(...) __riscv_vremu_vx_u32m4(__VA_ARGS__)
+#define vremu_vv_u32m8(...) __riscv_vremu_vv_u32m8(__VA_ARGS__)
+#define vremu_vx_u32m8(...) __riscv_vremu_vx_u32m8(__VA_ARGS__)
+#define vremu_vv_u64m1(...) __riscv_vremu_vv_u64m1(__VA_ARGS__)
+#define vremu_vx_u64m1(...) __riscv_vremu_vx_u64m1(__VA_ARGS__)
+#define vremu_vv_u64m2(...) __riscv_vremu_vv_u64m2(__VA_ARGS__)
+#define vremu_vx_u64m2(...) __riscv_vremu_vx_u64m2(__VA_ARGS__)
+#define vremu_vv_u64m4(...) __riscv_vremu_vv_u64m4(__VA_ARGS__)
+#define vremu_vx_u64m4(...) __riscv_vremu_vx_u64m4(__VA_ARGS__)
+#define vremu_vv_u64m8(...) __riscv_vremu_vv_u64m8(__VA_ARGS__)
+#define vremu_vx_u64m8(...) __riscv_vremu_vx_u64m8(__VA_ARGS__)
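+// masked functions: each legacy *_m name forwards to the matching __riscv_*_tumu (tail-undisturbed, mask-undisturbed) intrinsic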
+#define vdiv_vv_i8mf8_m(...) __riscv_vdiv_vv_i8mf8_tumu(__VA_ARGS__) // vdiv, masked: each legacy *_m name forwards to the __riscv_*_tumu intrinsic
+#define vdiv_vx_i8mf8_m(...) __riscv_vdiv_vx_i8mf8_tumu(__VA_ARGS__)
+#define vdiv_vv_i8mf4_m(...) __riscv_vdiv_vv_i8mf4_tumu(__VA_ARGS__)
+#define vdiv_vx_i8mf4_m(...) __riscv_vdiv_vx_i8mf4_tumu(__VA_ARGS__)
+#define vdiv_vv_i8mf2_m(...) __riscv_vdiv_vv_i8mf2_tumu(__VA_ARGS__)
+#define vdiv_vx_i8mf2_m(...) __riscv_vdiv_vx_i8mf2_tumu(__VA_ARGS__)
+#define vdiv_vv_i8m1_m(...) __riscv_vdiv_vv_i8m1_tumu(__VA_ARGS__)
+#define vdiv_vx_i8m1_m(...) __riscv_vdiv_vx_i8m1_tumu(__VA_ARGS__)
+#define vdiv_vv_i8m2_m(...) __riscv_vdiv_vv_i8m2_tumu(__VA_ARGS__)
+#define vdiv_vx_i8m2_m(...) __riscv_vdiv_vx_i8m2_tumu(__VA_ARGS__)
+#define vdiv_vv_i8m4_m(...) __riscv_vdiv_vv_i8m4_tumu(__VA_ARGS__)
+#define vdiv_vx_i8m4_m(...) __riscv_vdiv_vx_i8m4_tumu(__VA_ARGS__)
+#define vdiv_vv_i8m8_m(...) __riscv_vdiv_vv_i8m8_tumu(__VA_ARGS__)
+#define vdiv_vx_i8m8_m(...) __riscv_vdiv_vx_i8m8_tumu(__VA_ARGS__)
+#define vdiv_vv_i16mf4_m(...) __riscv_vdiv_vv_i16mf4_tumu(__VA_ARGS__)
+#define vdiv_vx_i16mf4_m(...) __riscv_vdiv_vx_i16mf4_tumu(__VA_ARGS__)
+#define vdiv_vv_i16mf2_m(...) __riscv_vdiv_vv_i16mf2_tumu(__VA_ARGS__)
+#define vdiv_vx_i16mf2_m(...) __riscv_vdiv_vx_i16mf2_tumu(__VA_ARGS__)
+#define vdiv_vv_i16m1_m(...) __riscv_vdiv_vv_i16m1_tumu(__VA_ARGS__)
+#define vdiv_vx_i16m1_m(...) __riscv_vdiv_vx_i16m1_tumu(__VA_ARGS__)
+#define vdiv_vv_i16m2_m(...) __riscv_vdiv_vv_i16m2_tumu(__VA_ARGS__)
+#define vdiv_vx_i16m2_m(...) __riscv_vdiv_vx_i16m2_tumu(__VA_ARGS__)
+#define vdiv_vv_i16m4_m(...) __riscv_vdiv_vv_i16m4_tumu(__VA_ARGS__)
+#define vdiv_vx_i16m4_m(...) __riscv_vdiv_vx_i16m4_tumu(__VA_ARGS__)
+#define vdiv_vv_i16m8_m(...) __riscv_vdiv_vv_i16m8_tumu(__VA_ARGS__)
+#define vdiv_vx_i16m8_m(...) __riscv_vdiv_vx_i16m8_tumu(__VA_ARGS__)
+#define vdiv_vv_i32mf2_m(...) __riscv_vdiv_vv_i32mf2_tumu(__VA_ARGS__)
+#define vdiv_vx_i32mf2_m(...) __riscv_vdiv_vx_i32mf2_tumu(__VA_ARGS__)
+#define vdiv_vv_i32m1_m(...) __riscv_vdiv_vv_i32m1_tumu(__VA_ARGS__)
+#define vdiv_vx_i32m1_m(...) __riscv_vdiv_vx_i32m1_tumu(__VA_ARGS__)
+#define vdiv_vv_i32m2_m(...) __riscv_vdiv_vv_i32m2_tumu(__VA_ARGS__)
+#define vdiv_vx_i32m2_m(...) __riscv_vdiv_vx_i32m2_tumu(__VA_ARGS__)
+#define vdiv_vv_i32m4_m(...) __riscv_vdiv_vv_i32m4_tumu(__VA_ARGS__)
+#define vdiv_vx_i32m4_m(...) __riscv_vdiv_vx_i32m4_tumu(__VA_ARGS__)
+#define vdiv_vv_i32m8_m(...) __riscv_vdiv_vv_i32m8_tumu(__VA_ARGS__)
+#define vdiv_vx_i32m8_m(...) __riscv_vdiv_vx_i32m8_tumu(__VA_ARGS__)
+#define vdiv_vv_i64m1_m(...) __riscv_vdiv_vv_i64m1_tumu(__VA_ARGS__)
+#define vdiv_vx_i64m1_m(...) __riscv_vdiv_vx_i64m1_tumu(__VA_ARGS__)
+#define vdiv_vv_i64m2_m(...) __riscv_vdiv_vv_i64m2_tumu(__VA_ARGS__)
+#define vdiv_vx_i64m2_m(...) __riscv_vdiv_vx_i64m2_tumu(__VA_ARGS__)
+#define vdiv_vv_i64m4_m(...) __riscv_vdiv_vv_i64m4_tumu(__VA_ARGS__)
+#define vdiv_vx_i64m4_m(...) __riscv_vdiv_vx_i64m4_tumu(__VA_ARGS__)
+#define vdiv_vv_i64m8_m(...) __riscv_vdiv_vv_i64m8_tumu(__VA_ARGS__)
+#define vdiv_vx_i64m8_m(...) __riscv_vdiv_vx_i64m8_tumu(__VA_ARGS__)
+#define vrem_vv_i8mf8_m(...) __riscv_vrem_vv_i8mf8_tumu(__VA_ARGS__) // vrem, masked: each legacy *_m name forwards to the __riscv_*_tumu intrinsic
+#define vrem_vx_i8mf8_m(...) __riscv_vrem_vx_i8mf8_tumu(__VA_ARGS__)
+#define vrem_vv_i8mf4_m(...) __riscv_vrem_vv_i8mf4_tumu(__VA_ARGS__)
+#define vrem_vx_i8mf4_m(...) __riscv_vrem_vx_i8mf4_tumu(__VA_ARGS__)
+#define vrem_vv_i8mf2_m(...) __riscv_vrem_vv_i8mf2_tumu(__VA_ARGS__)
+#define vrem_vx_i8mf2_m(...) __riscv_vrem_vx_i8mf2_tumu(__VA_ARGS__)
+#define vrem_vv_i8m1_m(...) __riscv_vrem_vv_i8m1_tumu(__VA_ARGS__)
+#define vrem_vx_i8m1_m(...) __riscv_vrem_vx_i8m1_tumu(__VA_ARGS__)
+#define vrem_vv_i8m2_m(...) __riscv_vrem_vv_i8m2_tumu(__VA_ARGS__)
+#define vrem_vx_i8m2_m(...) __riscv_vrem_vx_i8m2_tumu(__VA_ARGS__)
+#define vrem_vv_i8m4_m(...) __riscv_vrem_vv_i8m4_tumu(__VA_ARGS__)
+#define vrem_vx_i8m4_m(...) __riscv_vrem_vx_i8m4_tumu(__VA_ARGS__)
+#define vrem_vv_i8m8_m(...) __riscv_vrem_vv_i8m8_tumu(__VA_ARGS__)
+#define vrem_vx_i8m8_m(...) __riscv_vrem_vx_i8m8_tumu(__VA_ARGS__)
+#define vrem_vv_i16mf4_m(...) __riscv_vrem_vv_i16mf4_tumu(__VA_ARGS__)
+#define vrem_vx_i16mf4_m(...) __riscv_vrem_vx_i16mf4_tumu(__VA_ARGS__)
+#define vrem_vv_i16mf2_m(...) __riscv_vrem_vv_i16mf2_tumu(__VA_ARGS__)
+#define vrem_vx_i16mf2_m(...) __riscv_vrem_vx_i16mf2_tumu(__VA_ARGS__)
+#define vrem_vv_i16m1_m(...) __riscv_vrem_vv_i16m1_tumu(__VA_ARGS__)
+#define vrem_vx_i16m1_m(...) __riscv_vrem_vx_i16m1_tumu(__VA_ARGS__)
+#define vrem_vv_i16m2_m(...) __riscv_vrem_vv_i16m2_tumu(__VA_ARGS__)
+#define vrem_vx_i16m2_m(...) __riscv_vrem_vx_i16m2_tumu(__VA_ARGS__)
+#define vrem_vv_i16m4_m(...) __riscv_vrem_vv_i16m4_tumu(__VA_ARGS__)
+#define vrem_vx_i16m4_m(...) __riscv_vrem_vx_i16m4_tumu(__VA_ARGS__)
+#define vrem_vv_i16m8_m(...) __riscv_vrem_vv_i16m8_tumu(__VA_ARGS__)
+#define vrem_vx_i16m8_m(...) __riscv_vrem_vx_i16m8_tumu(__VA_ARGS__)
+#define vrem_vv_i32mf2_m(...) __riscv_vrem_vv_i32mf2_tumu(__VA_ARGS__)
+#define vrem_vx_i32mf2_m(...) __riscv_vrem_vx_i32mf2_tumu(__VA_ARGS__)
+#define vrem_vv_i32m1_m(...) __riscv_vrem_vv_i32m1_tumu(__VA_ARGS__)
+#define vrem_vx_i32m1_m(...) __riscv_vrem_vx_i32m1_tumu(__VA_ARGS__)
+#define vrem_vv_i32m2_m(...) __riscv_vrem_vv_i32m2_tumu(__VA_ARGS__)
+#define vrem_vx_i32m2_m(...) __riscv_vrem_vx_i32m2_tumu(__VA_ARGS__)
+#define vrem_vv_i32m4_m(...) __riscv_vrem_vv_i32m4_tumu(__VA_ARGS__)
+#define vrem_vx_i32m4_m(...) __riscv_vrem_vx_i32m4_tumu(__VA_ARGS__)
+#define vrem_vv_i32m8_m(...) __riscv_vrem_vv_i32m8_tumu(__VA_ARGS__)
+#define vrem_vx_i32m8_m(...) __riscv_vrem_vx_i32m8_tumu(__VA_ARGS__)
+#define vrem_vv_i64m1_m(...) __riscv_vrem_vv_i64m1_tumu(__VA_ARGS__)
+#define vrem_vx_i64m1_m(...) __riscv_vrem_vx_i64m1_tumu(__VA_ARGS__)
+#define vrem_vv_i64m2_m(...) __riscv_vrem_vv_i64m2_tumu(__VA_ARGS__)
+#define vrem_vx_i64m2_m(...) __riscv_vrem_vx_i64m2_tumu(__VA_ARGS__)
+#define vrem_vv_i64m4_m(...) __riscv_vrem_vv_i64m4_tumu(__VA_ARGS__)
+#define vrem_vx_i64m4_m(...) __riscv_vrem_vx_i64m4_tumu(__VA_ARGS__)
+#define vrem_vv_i64m8_m(...) __riscv_vrem_vv_i64m8_tumu(__VA_ARGS__)
+#define vrem_vx_i64m8_m(...) __riscv_vrem_vx_i64m8_tumu(__VA_ARGS__)
+#define vdivu_vv_u8mf8_m(...) __riscv_vdivu_vv_u8mf8_tumu(__VA_ARGS__) // vdivu, masked: each legacy *_m name forwards to the __riscv_*_tumu intrinsic
+#define vdivu_vx_u8mf8_m(...) __riscv_vdivu_vx_u8mf8_tumu(__VA_ARGS__)
+#define vdivu_vv_u8mf4_m(...) __riscv_vdivu_vv_u8mf4_tumu(__VA_ARGS__)
+#define vdivu_vx_u8mf4_m(...) __riscv_vdivu_vx_u8mf4_tumu(__VA_ARGS__)
+#define vdivu_vv_u8mf2_m(...) __riscv_vdivu_vv_u8mf2_tumu(__VA_ARGS__)
+#define vdivu_vx_u8mf2_m(...) __riscv_vdivu_vx_u8mf2_tumu(__VA_ARGS__)
+#define vdivu_vv_u8m1_m(...) __riscv_vdivu_vv_u8m1_tumu(__VA_ARGS__)
+#define vdivu_vx_u8m1_m(...) __riscv_vdivu_vx_u8m1_tumu(__VA_ARGS__)
+#define vdivu_vv_u8m2_m(...) __riscv_vdivu_vv_u8m2_tumu(__VA_ARGS__)
+#define vdivu_vx_u8m2_m(...) __riscv_vdivu_vx_u8m2_tumu(__VA_ARGS__)
+#define vdivu_vv_u8m4_m(...) __riscv_vdivu_vv_u8m4_tumu(__VA_ARGS__)
+#define vdivu_vx_u8m4_m(...) __riscv_vdivu_vx_u8m4_tumu(__VA_ARGS__)
+#define vdivu_vv_u8m8_m(...) __riscv_vdivu_vv_u8m8_tumu(__VA_ARGS__)
+#define vdivu_vx_u8m8_m(...) __riscv_vdivu_vx_u8m8_tumu(__VA_ARGS__)
+#define vdivu_vv_u16mf4_m(...) __riscv_vdivu_vv_u16mf4_tumu(__VA_ARGS__)
+#define vdivu_vx_u16mf4_m(...) __riscv_vdivu_vx_u16mf4_tumu(__VA_ARGS__)
+#define vdivu_vv_u16mf2_m(...) __riscv_vdivu_vv_u16mf2_tumu(__VA_ARGS__)
+#define vdivu_vx_u16mf2_m(...) __riscv_vdivu_vx_u16mf2_tumu(__VA_ARGS__)
+#define vdivu_vv_u16m1_m(...) __riscv_vdivu_vv_u16m1_tumu(__VA_ARGS__)
+#define vdivu_vx_u16m1_m(...) __riscv_vdivu_vx_u16m1_tumu(__VA_ARGS__)
+#define vdivu_vv_u16m2_m(...) __riscv_vdivu_vv_u16m2_tumu(__VA_ARGS__)
+#define vdivu_vx_u16m2_m(...) __riscv_vdivu_vx_u16m2_tumu(__VA_ARGS__)
+#define vdivu_vv_u16m4_m(...) __riscv_vdivu_vv_u16m4_tumu(__VA_ARGS__)
+#define vdivu_vx_u16m4_m(...) __riscv_vdivu_vx_u16m4_tumu(__VA_ARGS__)
+#define vdivu_vv_u16m8_m(...) __riscv_vdivu_vv_u16m8_tumu(__VA_ARGS__)
+#define vdivu_vx_u16m8_m(...) __riscv_vdivu_vx_u16m8_tumu(__VA_ARGS__)
+#define vdivu_vv_u32mf2_m(...) __riscv_vdivu_vv_u32mf2_tumu(__VA_ARGS__)
+#define vdivu_vx_u32mf2_m(...) __riscv_vdivu_vx_u32mf2_tumu(__VA_ARGS__)
+#define vdivu_vv_u32m1_m(...) __riscv_vdivu_vv_u32m1_tumu(__VA_ARGS__)
+#define vdivu_vx_u32m1_m(...) __riscv_vdivu_vx_u32m1_tumu(__VA_ARGS__)
+#define vdivu_vv_u32m2_m(...) __riscv_vdivu_vv_u32m2_tumu(__VA_ARGS__)
+#define vdivu_vx_u32m2_m(...) __riscv_vdivu_vx_u32m2_tumu(__VA_ARGS__)
+#define vdivu_vv_u32m4_m(...) __riscv_vdivu_vv_u32m4_tumu(__VA_ARGS__)
+#define vdivu_vx_u32m4_m(...) __riscv_vdivu_vx_u32m4_tumu(__VA_ARGS__)
+#define vdivu_vv_u32m8_m(...) __riscv_vdivu_vv_u32m8_tumu(__VA_ARGS__)
+#define vdivu_vx_u32m8_m(...) __riscv_vdivu_vx_u32m8_tumu(__VA_ARGS__)
+#define vdivu_vv_u64m1_m(...) __riscv_vdivu_vv_u64m1_tumu(__VA_ARGS__)
+#define vdivu_vx_u64m1_m(...) __riscv_vdivu_vx_u64m1_tumu(__VA_ARGS__)
+#define vdivu_vv_u64m2_m(...) __riscv_vdivu_vv_u64m2_tumu(__VA_ARGS__)
+#define vdivu_vx_u64m2_m(...) __riscv_vdivu_vx_u64m2_tumu(__VA_ARGS__)
+#define vdivu_vv_u64m4_m(...) __riscv_vdivu_vv_u64m4_tumu(__VA_ARGS__)
+#define vdivu_vx_u64m4_m(...) __riscv_vdivu_vx_u64m4_tumu(__VA_ARGS__)
+#define vdivu_vv_u64m8_m(...) __riscv_vdivu_vv_u64m8_tumu(__VA_ARGS__)
+#define vdivu_vx_u64m8_m(...) __riscv_vdivu_vx_u64m8_tumu(__VA_ARGS__)
+#define vremu_vv_u8mf8_m(...) __riscv_vremu_vv_u8mf8_tumu(__VA_ARGS__) // vremu, masked: each legacy *_m name forwards to the __riscv_*_tumu intrinsic
+#define vremu_vx_u8mf8_m(...) __riscv_vremu_vx_u8mf8_tumu(__VA_ARGS__)
+#define vremu_vv_u8mf4_m(...) __riscv_vremu_vv_u8mf4_tumu(__VA_ARGS__)
+#define vremu_vx_u8mf4_m(...) __riscv_vremu_vx_u8mf4_tumu(__VA_ARGS__)
+#define vremu_vv_u8mf2_m(...) __riscv_vremu_vv_u8mf2_tumu(__VA_ARGS__)
+#define vremu_vx_u8mf2_m(...) __riscv_vremu_vx_u8mf2_tumu(__VA_ARGS__)
+#define vremu_vv_u8m1_m(...) __riscv_vremu_vv_u8m1_tumu(__VA_ARGS__)
+#define vremu_vx_u8m1_m(...) __riscv_vremu_vx_u8m1_tumu(__VA_ARGS__)
+#define vremu_vv_u8m2_m(...) __riscv_vremu_vv_u8m2_tumu(__VA_ARGS__)
+#define vremu_vx_u8m2_m(...) __riscv_vremu_vx_u8m2_tumu(__VA_ARGS__)
+#define vremu_vv_u8m4_m(...) __riscv_vremu_vv_u8m4_tumu(__VA_ARGS__)
+#define vremu_vx_u8m4_m(...) __riscv_vremu_vx_u8m4_tumu(__VA_ARGS__)
+#define vremu_vv_u8m8_m(...) __riscv_vremu_vv_u8m8_tumu(__VA_ARGS__)
+#define vremu_vx_u8m8_m(...) __riscv_vremu_vx_u8m8_tumu(__VA_ARGS__)
+#define vremu_vv_u16mf4_m(...) __riscv_vremu_vv_u16mf4_tumu(__VA_ARGS__)
+#define vremu_vx_u16mf4_m(...) __riscv_vremu_vx_u16mf4_tumu(__VA_ARGS__)
+#define vremu_vv_u16mf2_m(...) __riscv_vremu_vv_u16mf2_tumu(__VA_ARGS__)
+#define vremu_vx_u16mf2_m(...) __riscv_vremu_vx_u16mf2_tumu(__VA_ARGS__)
+#define vremu_vv_u16m1_m(...) __riscv_vremu_vv_u16m1_tumu(__VA_ARGS__)
+#define vremu_vx_u16m1_m(...) __riscv_vremu_vx_u16m1_tumu(__VA_ARGS__)
+#define vremu_vv_u16m2_m(...) __riscv_vremu_vv_u16m2_tumu(__VA_ARGS__)
+#define vremu_vx_u16m2_m(...) __riscv_vremu_vx_u16m2_tumu(__VA_ARGS__)
+#define vremu_vv_u16m4_m(...) __riscv_vremu_vv_u16m4_tumu(__VA_ARGS__)
+#define vremu_vx_u16m4_m(...) __riscv_vremu_vx_u16m4_tumu(__VA_ARGS__)
+#define vremu_vv_u16m8_m(...) __riscv_vremu_vv_u16m8_tumu(__VA_ARGS__)
+#define vremu_vx_u16m8_m(...) __riscv_vremu_vx_u16m8_tumu(__VA_ARGS__)
+#define vremu_vv_u32mf2_m(...) __riscv_vremu_vv_u32mf2_tumu(__VA_ARGS__)
+#define vremu_vx_u32mf2_m(...) __riscv_vremu_vx_u32mf2_tumu(__VA_ARGS__)
+#define vremu_vv_u32m1_m(...) __riscv_vremu_vv_u32m1_tumu(__VA_ARGS__)
+#define vremu_vx_u32m1_m(...) __riscv_vremu_vx_u32m1_tumu(__VA_ARGS__)
+#define vremu_vv_u32m2_m(...) __riscv_vremu_vv_u32m2_tumu(__VA_ARGS__)
+#define vremu_vx_u32m2_m(...) __riscv_vremu_vx_u32m2_tumu(__VA_ARGS__)
+#define vremu_vv_u32m4_m(...) __riscv_vremu_vv_u32m4_tumu(__VA_ARGS__)
+#define vremu_vx_u32m4_m(...) __riscv_vremu_vx_u32m4_tumu(__VA_ARGS__)
+#define vremu_vv_u32m8_m(...) __riscv_vremu_vv_u32m8_tumu(__VA_ARGS__)
+#define vremu_vx_u32m8_m(...) __riscv_vremu_vx_u32m8_tumu(__VA_ARGS__)
+#define vremu_vv_u64m1_m(...) __riscv_vremu_vv_u64m1_tumu(__VA_ARGS__)
+#define vremu_vx_u64m1_m(...) __riscv_vremu_vx_u64m1_tumu(__VA_ARGS__)
+#define vremu_vv_u64m2_m(...) __riscv_vremu_vv_u64m2_tumu(__VA_ARGS__)
+#define vremu_vx_u64m2_m(...) __riscv_vremu_vx_u64m2_tumu(__VA_ARGS__)
+#define vremu_vv_u64m4_m(...) __riscv_vremu_vv_u64m4_tumu(__VA_ARGS__)
+#define vremu_vx_u64m4_m(...) __riscv_vremu_vx_u64m4_tumu(__VA_ARGS__)
+#define vremu_vv_u64m8_m(...) __riscv_vremu_vv_u64m8_tumu(__VA_ARGS__)
+#define vremu_vx_u64m8_m(...) __riscv_vremu_vx_u64m8_tumu(__VA_ARGS__)
+#define vwmul_vv_i16mf4(...) __riscv_vwmul_vv_i16mf4(__VA_ARGS__) // vwmul (i16..i64 type suffixes): each legacy unprefixed name forwards to the same-named __riscv_-prefixed intrinsic
+#define vwmul_vx_i16mf4(...) __riscv_vwmul_vx_i16mf4(__VA_ARGS__)
+#define vwmul_vv_i16mf2(...) __riscv_vwmul_vv_i16mf2(__VA_ARGS__)
+#define vwmul_vx_i16mf2(...) __riscv_vwmul_vx_i16mf2(__VA_ARGS__)
+#define vwmul_vv_i16m1(...) __riscv_vwmul_vv_i16m1(__VA_ARGS__)
+#define vwmul_vx_i16m1(...) __riscv_vwmul_vx_i16m1(__VA_ARGS__)
+#define vwmul_vv_i16m2(...) __riscv_vwmul_vv_i16m2(__VA_ARGS__)
+#define vwmul_vx_i16m2(...) __riscv_vwmul_vx_i16m2(__VA_ARGS__)
+#define vwmul_vv_i16m4(...) __riscv_vwmul_vv_i16m4(__VA_ARGS__)
+#define vwmul_vx_i16m4(...) __riscv_vwmul_vx_i16m4(__VA_ARGS__)
+#define vwmul_vv_i16m8(...) __riscv_vwmul_vv_i16m8(__VA_ARGS__)
+#define vwmul_vx_i16m8(...) __riscv_vwmul_vx_i16m8(__VA_ARGS__)
+#define vwmul_vv_i32mf2(...) __riscv_vwmul_vv_i32mf2(__VA_ARGS__)
+#define vwmul_vx_i32mf2(...) __riscv_vwmul_vx_i32mf2(__VA_ARGS__)
+#define vwmul_vv_i32m1(...) __riscv_vwmul_vv_i32m1(__VA_ARGS__)
+#define vwmul_vx_i32m1(...) __riscv_vwmul_vx_i32m1(__VA_ARGS__)
+#define vwmul_vv_i32m2(...) __riscv_vwmul_vv_i32m2(__VA_ARGS__)
+#define vwmul_vx_i32m2(...) __riscv_vwmul_vx_i32m2(__VA_ARGS__)
+#define vwmul_vv_i32m4(...) __riscv_vwmul_vv_i32m4(__VA_ARGS__)
+#define vwmul_vx_i32m4(...) __riscv_vwmul_vx_i32m4(__VA_ARGS__)
+#define vwmul_vv_i32m8(...) __riscv_vwmul_vv_i32m8(__VA_ARGS__)
+#define vwmul_vx_i32m8(...) __riscv_vwmul_vx_i32m8(__VA_ARGS__)
+#define vwmul_vv_i64m1(...) __riscv_vwmul_vv_i64m1(__VA_ARGS__)
+#define vwmul_vx_i64m1(...) __riscv_vwmul_vx_i64m1(__VA_ARGS__)
+#define vwmul_vv_i64m2(...) __riscv_vwmul_vv_i64m2(__VA_ARGS__)
+#define vwmul_vx_i64m2(...) __riscv_vwmul_vx_i64m2(__VA_ARGS__)
+#define vwmul_vv_i64m4(...) __riscv_vwmul_vv_i64m4(__VA_ARGS__)
+#define vwmul_vx_i64m4(...) __riscv_vwmul_vx_i64m4(__VA_ARGS__)
+#define vwmul_vv_i64m8(...) __riscv_vwmul_vv_i64m8(__VA_ARGS__)
+#define vwmul_vx_i64m8(...) __riscv_vwmul_vx_i64m8(__VA_ARGS__)
+#define vwmulsu_vv_i16mf4(...) __riscv_vwmulsu_vv_i16mf4(__VA_ARGS__) // vwmulsu (i16..i64 type suffixes): each legacy unprefixed name forwards to the same-named __riscv_-prefixed intrinsic
+#define vwmulsu_vx_i16mf4(...) __riscv_vwmulsu_vx_i16mf4(__VA_ARGS__)
+#define vwmulsu_vv_i16mf2(...) __riscv_vwmulsu_vv_i16mf2(__VA_ARGS__)
+#define vwmulsu_vx_i16mf2(...) __riscv_vwmulsu_vx_i16mf2(__VA_ARGS__)
+#define vwmulsu_vv_i16m1(...) __riscv_vwmulsu_vv_i16m1(__VA_ARGS__)
+#define vwmulsu_vx_i16m1(...) __riscv_vwmulsu_vx_i16m1(__VA_ARGS__)
+#define vwmulsu_vv_i16m2(...) __riscv_vwmulsu_vv_i16m2(__VA_ARGS__)
+#define vwmulsu_vx_i16m2(...) __riscv_vwmulsu_vx_i16m2(__VA_ARGS__)
+#define vwmulsu_vv_i16m4(...) __riscv_vwmulsu_vv_i16m4(__VA_ARGS__)
+#define vwmulsu_vx_i16m4(...) __riscv_vwmulsu_vx_i16m4(__VA_ARGS__)
+#define vwmulsu_vv_i16m8(...) __riscv_vwmulsu_vv_i16m8(__VA_ARGS__)
+#define vwmulsu_vx_i16m8(...) __riscv_vwmulsu_vx_i16m8(__VA_ARGS__)
+#define vwmulsu_vv_i32mf2(...) __riscv_vwmulsu_vv_i32mf2(__VA_ARGS__)
+#define vwmulsu_vx_i32mf2(...) __riscv_vwmulsu_vx_i32mf2(__VA_ARGS__)
+#define vwmulsu_vv_i32m1(...) __riscv_vwmulsu_vv_i32m1(__VA_ARGS__)
+#define vwmulsu_vx_i32m1(...) __riscv_vwmulsu_vx_i32m1(__VA_ARGS__)
+#define vwmulsu_vv_i32m2(...) __riscv_vwmulsu_vv_i32m2(__VA_ARGS__)
+#define vwmulsu_vx_i32m2(...) __riscv_vwmulsu_vx_i32m2(__VA_ARGS__)
+#define vwmulsu_vv_i32m4(...) __riscv_vwmulsu_vv_i32m4(__VA_ARGS__)
+#define vwmulsu_vx_i32m4(...) __riscv_vwmulsu_vx_i32m4(__VA_ARGS__)
+#define vwmulsu_vv_i32m8(...) __riscv_vwmulsu_vv_i32m8(__VA_ARGS__)
+#define vwmulsu_vx_i32m8(...) __riscv_vwmulsu_vx_i32m8(__VA_ARGS__)
+#define vwmulsu_vv_i64m1(...) __riscv_vwmulsu_vv_i64m1(__VA_ARGS__)
+#define vwmulsu_vx_i64m1(...) __riscv_vwmulsu_vx_i64m1(__VA_ARGS__)
+#define vwmulsu_vv_i64m2(...) __riscv_vwmulsu_vv_i64m2(__VA_ARGS__)
+#define vwmulsu_vx_i64m2(...) __riscv_vwmulsu_vx_i64m2(__VA_ARGS__)
+#define vwmulsu_vv_i64m4(...) __riscv_vwmulsu_vv_i64m4(__VA_ARGS__)
+#define vwmulsu_vx_i64m4(...) __riscv_vwmulsu_vx_i64m4(__VA_ARGS__)
+#define vwmulsu_vv_i64m8(...) __riscv_vwmulsu_vv_i64m8(__VA_ARGS__)
+#define vwmulsu_vx_i64m8(...) __riscv_vwmulsu_vx_i64m8(__VA_ARGS__)
+#define vwmulu_vv_u16mf4(...) __riscv_vwmulu_vv_u16mf4(__VA_ARGS__) // vwmulu_*: each unprefixed name forwards its arguments unchanged to the __riscv_-prefixed intrinsic of the same name
+#define vwmulu_vx_u16mf4(...) __riscv_vwmulu_vx_u16mf4(__VA_ARGS__)
+#define vwmulu_vv_u16mf2(...) __riscv_vwmulu_vv_u16mf2(__VA_ARGS__)
+#define vwmulu_vx_u16mf2(...) __riscv_vwmulu_vx_u16mf2(__VA_ARGS__)
+#define vwmulu_vv_u16m1(...) __riscv_vwmulu_vv_u16m1(__VA_ARGS__)
+#define vwmulu_vx_u16m1(...) __riscv_vwmulu_vx_u16m1(__VA_ARGS__)
+#define vwmulu_vv_u16m2(...) __riscv_vwmulu_vv_u16m2(__VA_ARGS__)
+#define vwmulu_vx_u16m2(...) __riscv_vwmulu_vx_u16m2(__VA_ARGS__)
+#define vwmulu_vv_u16m4(...) __riscv_vwmulu_vv_u16m4(__VA_ARGS__)
+#define vwmulu_vx_u16m4(...) __riscv_vwmulu_vx_u16m4(__VA_ARGS__)
+#define vwmulu_vv_u16m8(...) __riscv_vwmulu_vv_u16m8(__VA_ARGS__)
+#define vwmulu_vx_u16m8(...) __riscv_vwmulu_vx_u16m8(__VA_ARGS__)
+#define vwmulu_vv_u32mf2(...) __riscv_vwmulu_vv_u32mf2(__VA_ARGS__)
+#define vwmulu_vx_u32mf2(...) __riscv_vwmulu_vx_u32mf2(__VA_ARGS__)
+#define vwmulu_vv_u32m1(...) __riscv_vwmulu_vv_u32m1(__VA_ARGS__)
+#define vwmulu_vx_u32m1(...) __riscv_vwmulu_vx_u32m1(__VA_ARGS__)
+#define vwmulu_vv_u32m2(...) __riscv_vwmulu_vv_u32m2(__VA_ARGS__)
+#define vwmulu_vx_u32m2(...) __riscv_vwmulu_vx_u32m2(__VA_ARGS__)
+#define vwmulu_vv_u32m4(...) __riscv_vwmulu_vv_u32m4(__VA_ARGS__)
+#define vwmulu_vx_u32m4(...) __riscv_vwmulu_vx_u32m4(__VA_ARGS__)
+#define vwmulu_vv_u32m8(...) __riscv_vwmulu_vv_u32m8(__VA_ARGS__)
+#define vwmulu_vx_u32m8(...) __riscv_vwmulu_vx_u32m8(__VA_ARGS__)
+#define vwmulu_vv_u64m1(...) __riscv_vwmulu_vv_u64m1(__VA_ARGS__)
+#define vwmulu_vx_u64m1(...) __riscv_vwmulu_vx_u64m1(__VA_ARGS__)
+#define vwmulu_vv_u64m2(...) __riscv_vwmulu_vv_u64m2(__VA_ARGS__)
+#define vwmulu_vx_u64m2(...) __riscv_vwmulu_vx_u64m2(__VA_ARGS__)
+#define vwmulu_vv_u64m4(...) __riscv_vwmulu_vv_u64m4(__VA_ARGS__)
+#define vwmulu_vx_u64m4(...) __riscv_vwmulu_vx_u64m4(__VA_ARGS__)
+#define vwmulu_vv_u64m8(...) __riscv_vwmulu_vv_u64m8(__VA_ARGS__)
+#define vwmulu_vx_u64m8(...) __riscv_vwmulu_vx_u64m8(__VA_ARGS__)
+// masked functions: each `_m` name forwards its arguments to the `_tumu`-suffixed __riscv_ intrinsic (`_m` is replaced, not kept). NOTE(review): `_tumu` presumably selects the tail-/mask-undisturbed policy - confirm against the RVV intrinsics spec
+#define vwmul_vv_i16mf4_m(...) __riscv_vwmul_vv_i16mf4_tumu(__VA_ARGS__)
+#define vwmul_vx_i16mf4_m(...) __riscv_vwmul_vx_i16mf4_tumu(__VA_ARGS__)
+#define vwmul_vv_i16mf2_m(...) __riscv_vwmul_vv_i16mf2_tumu(__VA_ARGS__)
+#define vwmul_vx_i16mf2_m(...) __riscv_vwmul_vx_i16mf2_tumu(__VA_ARGS__)
+#define vwmul_vv_i16m1_m(...) __riscv_vwmul_vv_i16m1_tumu(__VA_ARGS__)
+#define vwmul_vx_i16m1_m(...) __riscv_vwmul_vx_i16m1_tumu(__VA_ARGS__)
+#define vwmul_vv_i16m2_m(...) __riscv_vwmul_vv_i16m2_tumu(__VA_ARGS__)
+#define vwmul_vx_i16m2_m(...) __riscv_vwmul_vx_i16m2_tumu(__VA_ARGS__)
+#define vwmul_vv_i16m4_m(...) __riscv_vwmul_vv_i16m4_tumu(__VA_ARGS__)
+#define vwmul_vx_i16m4_m(...) __riscv_vwmul_vx_i16m4_tumu(__VA_ARGS__)
+#define vwmul_vv_i16m8_m(...) __riscv_vwmul_vv_i16m8_tumu(__VA_ARGS__)
+#define vwmul_vx_i16m8_m(...) __riscv_vwmul_vx_i16m8_tumu(__VA_ARGS__)
+#define vwmul_vv_i32mf2_m(...) __riscv_vwmul_vv_i32mf2_tumu(__VA_ARGS__)
+#define vwmul_vx_i32mf2_m(...) __riscv_vwmul_vx_i32mf2_tumu(__VA_ARGS__)
+#define vwmul_vv_i32m1_m(...) __riscv_vwmul_vv_i32m1_tumu(__VA_ARGS__)
+#define vwmul_vx_i32m1_m(...) __riscv_vwmul_vx_i32m1_tumu(__VA_ARGS__)
+#define vwmul_vv_i32m2_m(...) __riscv_vwmul_vv_i32m2_tumu(__VA_ARGS__)
+#define vwmul_vx_i32m2_m(...) __riscv_vwmul_vx_i32m2_tumu(__VA_ARGS__)
+#define vwmul_vv_i32m4_m(...) __riscv_vwmul_vv_i32m4_tumu(__VA_ARGS__)
+#define vwmul_vx_i32m4_m(...) __riscv_vwmul_vx_i32m4_tumu(__VA_ARGS__)
+#define vwmul_vv_i32m8_m(...) __riscv_vwmul_vv_i32m8_tumu(__VA_ARGS__)
+#define vwmul_vx_i32m8_m(...) __riscv_vwmul_vx_i32m8_tumu(__VA_ARGS__)
+#define vwmul_vv_i64m1_m(...) __riscv_vwmul_vv_i64m1_tumu(__VA_ARGS__)
+#define vwmul_vx_i64m1_m(...) __riscv_vwmul_vx_i64m1_tumu(__VA_ARGS__)
+#define vwmul_vv_i64m2_m(...) __riscv_vwmul_vv_i64m2_tumu(__VA_ARGS__)
+#define vwmul_vx_i64m2_m(...) __riscv_vwmul_vx_i64m2_tumu(__VA_ARGS__)
+#define vwmul_vv_i64m4_m(...) __riscv_vwmul_vv_i64m4_tumu(__VA_ARGS__)
+#define vwmul_vx_i64m4_m(...) __riscv_vwmul_vx_i64m4_tumu(__VA_ARGS__)
+#define vwmul_vv_i64m8_m(...) __riscv_vwmul_vv_i64m8_tumu(__VA_ARGS__)
+#define vwmul_vx_i64m8_m(...) __riscv_vwmul_vx_i64m8_tumu(__VA_ARGS__)
+#define vwmulsu_vv_i16mf4_m(...) __riscv_vwmulsu_vv_i16mf4_tumu(__VA_ARGS__) // vwmulsu_*_m: masked names forward their arguments to the `_tumu`-suffixed __riscv_ intrinsics
+#define vwmulsu_vx_i16mf4_m(...) __riscv_vwmulsu_vx_i16mf4_tumu(__VA_ARGS__)
+#define vwmulsu_vv_i16mf2_m(...) __riscv_vwmulsu_vv_i16mf2_tumu(__VA_ARGS__)
+#define vwmulsu_vx_i16mf2_m(...) __riscv_vwmulsu_vx_i16mf2_tumu(__VA_ARGS__)
+#define vwmulsu_vv_i16m1_m(...) __riscv_vwmulsu_vv_i16m1_tumu(__VA_ARGS__)
+#define vwmulsu_vx_i16m1_m(...) __riscv_vwmulsu_vx_i16m1_tumu(__VA_ARGS__)
+#define vwmulsu_vv_i16m2_m(...) __riscv_vwmulsu_vv_i16m2_tumu(__VA_ARGS__)
+#define vwmulsu_vx_i16m2_m(...) __riscv_vwmulsu_vx_i16m2_tumu(__VA_ARGS__)
+#define vwmulsu_vv_i16m4_m(...) __riscv_vwmulsu_vv_i16m4_tumu(__VA_ARGS__)
+#define vwmulsu_vx_i16m4_m(...) __riscv_vwmulsu_vx_i16m4_tumu(__VA_ARGS__)
+#define vwmulsu_vv_i16m8_m(...) __riscv_vwmulsu_vv_i16m8_tumu(__VA_ARGS__)
+#define vwmulsu_vx_i16m8_m(...) __riscv_vwmulsu_vx_i16m8_tumu(__VA_ARGS__)
+#define vwmulsu_vv_i32mf2_m(...) __riscv_vwmulsu_vv_i32mf2_tumu(__VA_ARGS__)
+#define vwmulsu_vx_i32mf2_m(...) __riscv_vwmulsu_vx_i32mf2_tumu(__VA_ARGS__)
+#define vwmulsu_vv_i32m1_m(...) __riscv_vwmulsu_vv_i32m1_tumu(__VA_ARGS__)
+#define vwmulsu_vx_i32m1_m(...) __riscv_vwmulsu_vx_i32m1_tumu(__VA_ARGS__)
+#define vwmulsu_vv_i32m2_m(...) __riscv_vwmulsu_vv_i32m2_tumu(__VA_ARGS__)
+#define vwmulsu_vx_i32m2_m(...) __riscv_vwmulsu_vx_i32m2_tumu(__VA_ARGS__)
+#define vwmulsu_vv_i32m4_m(...) __riscv_vwmulsu_vv_i32m4_tumu(__VA_ARGS__)
+#define vwmulsu_vx_i32m4_m(...) __riscv_vwmulsu_vx_i32m4_tumu(__VA_ARGS__)
+#define vwmulsu_vv_i32m8_m(...) __riscv_vwmulsu_vv_i32m8_tumu(__VA_ARGS__)
+#define vwmulsu_vx_i32m8_m(...) __riscv_vwmulsu_vx_i32m8_tumu(__VA_ARGS__)
+#define vwmulsu_vv_i64m1_m(...) __riscv_vwmulsu_vv_i64m1_tumu(__VA_ARGS__)
+#define vwmulsu_vx_i64m1_m(...) __riscv_vwmulsu_vx_i64m1_tumu(__VA_ARGS__)
+#define vwmulsu_vv_i64m2_m(...) __riscv_vwmulsu_vv_i64m2_tumu(__VA_ARGS__)
+#define vwmulsu_vx_i64m2_m(...) __riscv_vwmulsu_vx_i64m2_tumu(__VA_ARGS__)
+#define vwmulsu_vv_i64m4_m(...) __riscv_vwmulsu_vv_i64m4_tumu(__VA_ARGS__)
+#define vwmulsu_vx_i64m4_m(...) __riscv_vwmulsu_vx_i64m4_tumu(__VA_ARGS__)
+#define vwmulsu_vv_i64m8_m(...) __riscv_vwmulsu_vv_i64m8_tumu(__VA_ARGS__)
+#define vwmulsu_vx_i64m8_m(...) __riscv_vwmulsu_vx_i64m8_tumu(__VA_ARGS__)
+#define vwmulu_vv_u16mf4_m(...) __riscv_vwmulu_vv_u16mf4_tumu(__VA_ARGS__) // vwmulu_*_m: masked names forward their arguments to the `_tumu`-suffixed __riscv_ intrinsics
+#define vwmulu_vx_u16mf4_m(...) __riscv_vwmulu_vx_u16mf4_tumu(__VA_ARGS__)
+#define vwmulu_vv_u16mf2_m(...) __riscv_vwmulu_vv_u16mf2_tumu(__VA_ARGS__)
+#define vwmulu_vx_u16mf2_m(...) __riscv_vwmulu_vx_u16mf2_tumu(__VA_ARGS__)
+#define vwmulu_vv_u16m1_m(...) __riscv_vwmulu_vv_u16m1_tumu(__VA_ARGS__)
+#define vwmulu_vx_u16m1_m(...) __riscv_vwmulu_vx_u16m1_tumu(__VA_ARGS__)
+#define vwmulu_vv_u16m2_m(...) __riscv_vwmulu_vv_u16m2_tumu(__VA_ARGS__)
+#define vwmulu_vx_u16m2_m(...) __riscv_vwmulu_vx_u16m2_tumu(__VA_ARGS__)
+#define vwmulu_vv_u16m4_m(...) __riscv_vwmulu_vv_u16m4_tumu(__VA_ARGS__)
+#define vwmulu_vx_u16m4_m(...) __riscv_vwmulu_vx_u16m4_tumu(__VA_ARGS__)
+#define vwmulu_vv_u16m8_m(...) __riscv_vwmulu_vv_u16m8_tumu(__VA_ARGS__)
+#define vwmulu_vx_u16m8_m(...) __riscv_vwmulu_vx_u16m8_tumu(__VA_ARGS__)
+#define vwmulu_vv_u32mf2_m(...) __riscv_vwmulu_vv_u32mf2_tumu(__VA_ARGS__)
+#define vwmulu_vx_u32mf2_m(...) __riscv_vwmulu_vx_u32mf2_tumu(__VA_ARGS__)
+#define vwmulu_vv_u32m1_m(...) __riscv_vwmulu_vv_u32m1_tumu(__VA_ARGS__)
+#define vwmulu_vx_u32m1_m(...) __riscv_vwmulu_vx_u32m1_tumu(__VA_ARGS__)
+#define vwmulu_vv_u32m2_m(...) __riscv_vwmulu_vv_u32m2_tumu(__VA_ARGS__)
+#define vwmulu_vx_u32m2_m(...) __riscv_vwmulu_vx_u32m2_tumu(__VA_ARGS__)
+#define vwmulu_vv_u32m4_m(...) __riscv_vwmulu_vv_u32m4_tumu(__VA_ARGS__)
+#define vwmulu_vx_u32m4_m(...) __riscv_vwmulu_vx_u32m4_tumu(__VA_ARGS__)
+#define vwmulu_vv_u32m8_m(...) __riscv_vwmulu_vv_u32m8_tumu(__VA_ARGS__)
+#define vwmulu_vx_u32m8_m(...) __riscv_vwmulu_vx_u32m8_tumu(__VA_ARGS__)
+#define vwmulu_vv_u64m1_m(...) __riscv_vwmulu_vv_u64m1_tumu(__VA_ARGS__)
+#define vwmulu_vx_u64m1_m(...) __riscv_vwmulu_vx_u64m1_tumu(__VA_ARGS__)
+#define vwmulu_vv_u64m2_m(...) __riscv_vwmulu_vv_u64m2_tumu(__VA_ARGS__)
+#define vwmulu_vx_u64m2_m(...) __riscv_vwmulu_vx_u64m2_tumu(__VA_ARGS__)
+#define vwmulu_vv_u64m4_m(...) __riscv_vwmulu_vv_u64m4_tumu(__VA_ARGS__)
+#define vwmulu_vx_u64m4_m(...) __riscv_vwmulu_vx_u64m4_tumu(__VA_ARGS__)
+#define vwmulu_vv_u64m8_m(...) __riscv_vwmulu_vv_u64m8_tumu(__VA_ARGS__)
+#define vwmulu_vx_u64m8_m(...) __riscv_vwmulu_vx_u64m8_tumu(__VA_ARGS__)
+#define vmacc_vv_i8mf8(...) __riscv_vmacc_vv_i8mf8_tu(__VA_ARGS__) // vmacc_* (i8..i64 suffixes): unmasked names again - the "masked functions" header above no longer applies; each forwards to the `_tu`-suffixed __riscv_ intrinsic. NOTE(review): `_tu` presumably selects the tail-undisturbed policy - confirm against the RVV intrinsics spec
+#define vmacc_vx_i8mf8(...) __riscv_vmacc_vx_i8mf8_tu(__VA_ARGS__)
+#define vmacc_vv_i8mf4(...) __riscv_vmacc_vv_i8mf4_tu(__VA_ARGS__)
+#define vmacc_vx_i8mf4(...) __riscv_vmacc_vx_i8mf4_tu(__VA_ARGS__)
+#define vmacc_vv_i8mf2(...) __riscv_vmacc_vv_i8mf2_tu(__VA_ARGS__)
+#define vmacc_vx_i8mf2(...) __riscv_vmacc_vx_i8mf2_tu(__VA_ARGS__)
+#define vmacc_vv_i8m1(...) __riscv_vmacc_vv_i8m1_tu(__VA_ARGS__)
+#define vmacc_vx_i8m1(...) __riscv_vmacc_vx_i8m1_tu(__VA_ARGS__)
+#define vmacc_vv_i8m2(...) __riscv_vmacc_vv_i8m2_tu(__VA_ARGS__)
+#define vmacc_vx_i8m2(...) __riscv_vmacc_vx_i8m2_tu(__VA_ARGS__)
+#define vmacc_vv_i8m4(...) __riscv_vmacc_vv_i8m4_tu(__VA_ARGS__)
+#define vmacc_vx_i8m4(...) __riscv_vmacc_vx_i8m4_tu(__VA_ARGS__)
+#define vmacc_vv_i8m8(...) __riscv_vmacc_vv_i8m8_tu(__VA_ARGS__)
+#define vmacc_vx_i8m8(...) __riscv_vmacc_vx_i8m8_tu(__VA_ARGS__)
+#define vmacc_vv_i16mf4(...) __riscv_vmacc_vv_i16mf4_tu(__VA_ARGS__)
+#define vmacc_vx_i16mf4(...) __riscv_vmacc_vx_i16mf4_tu(__VA_ARGS__)
+#define vmacc_vv_i16mf2(...) __riscv_vmacc_vv_i16mf2_tu(__VA_ARGS__)
+#define vmacc_vx_i16mf2(...) __riscv_vmacc_vx_i16mf2_tu(__VA_ARGS__)
+#define vmacc_vv_i16m1(...) __riscv_vmacc_vv_i16m1_tu(__VA_ARGS__)
+#define vmacc_vx_i16m1(...) __riscv_vmacc_vx_i16m1_tu(__VA_ARGS__)
+#define vmacc_vv_i16m2(...) __riscv_vmacc_vv_i16m2_tu(__VA_ARGS__)
+#define vmacc_vx_i16m2(...) __riscv_vmacc_vx_i16m2_tu(__VA_ARGS__)
+#define vmacc_vv_i16m4(...) __riscv_vmacc_vv_i16m4_tu(__VA_ARGS__)
+#define vmacc_vx_i16m4(...) __riscv_vmacc_vx_i16m4_tu(__VA_ARGS__)
+#define vmacc_vv_i16m8(...) __riscv_vmacc_vv_i16m8_tu(__VA_ARGS__)
+#define vmacc_vx_i16m8(...) __riscv_vmacc_vx_i16m8_tu(__VA_ARGS__)
+#define vmacc_vv_i32mf2(...) __riscv_vmacc_vv_i32mf2_tu(__VA_ARGS__)
+#define vmacc_vx_i32mf2(...) __riscv_vmacc_vx_i32mf2_tu(__VA_ARGS__)
+#define vmacc_vv_i32m1(...) __riscv_vmacc_vv_i32m1_tu(__VA_ARGS__)
+#define vmacc_vx_i32m1(...) __riscv_vmacc_vx_i32m1_tu(__VA_ARGS__)
+#define vmacc_vv_i32m2(...) __riscv_vmacc_vv_i32m2_tu(__VA_ARGS__)
+#define vmacc_vx_i32m2(...) __riscv_vmacc_vx_i32m2_tu(__VA_ARGS__)
+#define vmacc_vv_i32m4(...) __riscv_vmacc_vv_i32m4_tu(__VA_ARGS__)
+#define vmacc_vx_i32m4(...) __riscv_vmacc_vx_i32m4_tu(__VA_ARGS__)
+#define vmacc_vv_i32m8(...) __riscv_vmacc_vv_i32m8_tu(__VA_ARGS__)
+#define vmacc_vx_i32m8(...) __riscv_vmacc_vx_i32m8_tu(__VA_ARGS__)
+#define vmacc_vv_i64m1(...) __riscv_vmacc_vv_i64m1_tu(__VA_ARGS__)
+#define vmacc_vx_i64m1(...) __riscv_vmacc_vx_i64m1_tu(__VA_ARGS__)
+#define vmacc_vv_i64m2(...) __riscv_vmacc_vv_i64m2_tu(__VA_ARGS__)
+#define vmacc_vx_i64m2(...) __riscv_vmacc_vx_i64m2_tu(__VA_ARGS__)
+#define vmacc_vv_i64m4(...) __riscv_vmacc_vv_i64m4_tu(__VA_ARGS__)
+#define vmacc_vx_i64m4(...) __riscv_vmacc_vx_i64m4_tu(__VA_ARGS__)
+#define vmacc_vv_i64m8(...) __riscv_vmacc_vv_i64m8_tu(__VA_ARGS__)
+#define vmacc_vx_i64m8(...) __riscv_vmacc_vx_i64m8_tu(__VA_ARGS__)
+#define vnmsac_vv_i8mf8(...) __riscv_vnmsac_vv_i8mf8_tu(__VA_ARGS__) // vnmsac_* (i8..i64 suffixes): unmasked names forward their arguments to the `_tu`-suffixed __riscv_ intrinsics
+#define vnmsac_vx_i8mf8(...) __riscv_vnmsac_vx_i8mf8_tu(__VA_ARGS__)
+#define vnmsac_vv_i8mf4(...) __riscv_vnmsac_vv_i8mf4_tu(__VA_ARGS__)
+#define vnmsac_vx_i8mf4(...) __riscv_vnmsac_vx_i8mf4_tu(__VA_ARGS__)
+#define vnmsac_vv_i8mf2(...) __riscv_vnmsac_vv_i8mf2_tu(__VA_ARGS__)
+#define vnmsac_vx_i8mf2(...) __riscv_vnmsac_vx_i8mf2_tu(__VA_ARGS__)
+#define vnmsac_vv_i8m1(...) __riscv_vnmsac_vv_i8m1_tu(__VA_ARGS__)
+#define vnmsac_vx_i8m1(...) __riscv_vnmsac_vx_i8m1_tu(__VA_ARGS__)
+#define vnmsac_vv_i8m2(...) __riscv_vnmsac_vv_i8m2_tu(__VA_ARGS__)
+#define vnmsac_vx_i8m2(...) __riscv_vnmsac_vx_i8m2_tu(__VA_ARGS__)
+#define vnmsac_vv_i8m4(...) __riscv_vnmsac_vv_i8m4_tu(__VA_ARGS__)
+#define vnmsac_vx_i8m4(...) __riscv_vnmsac_vx_i8m4_tu(__VA_ARGS__)
+#define vnmsac_vv_i8m8(...) __riscv_vnmsac_vv_i8m8_tu(__VA_ARGS__)
+#define vnmsac_vx_i8m8(...) __riscv_vnmsac_vx_i8m8_tu(__VA_ARGS__)
+#define vnmsac_vv_i16mf4(...) __riscv_vnmsac_vv_i16mf4_tu(__VA_ARGS__)
+#define vnmsac_vx_i16mf4(...) __riscv_vnmsac_vx_i16mf4_tu(__VA_ARGS__)
+#define vnmsac_vv_i16mf2(...) __riscv_vnmsac_vv_i16mf2_tu(__VA_ARGS__)
+#define vnmsac_vx_i16mf2(...) __riscv_vnmsac_vx_i16mf2_tu(__VA_ARGS__)
+#define vnmsac_vv_i16m1(...) __riscv_vnmsac_vv_i16m1_tu(__VA_ARGS__)
+#define vnmsac_vx_i16m1(...) __riscv_vnmsac_vx_i16m1_tu(__VA_ARGS__)
+#define vnmsac_vv_i16m2(...) __riscv_vnmsac_vv_i16m2_tu(__VA_ARGS__)
+#define vnmsac_vx_i16m2(...) __riscv_vnmsac_vx_i16m2_tu(__VA_ARGS__)
+#define vnmsac_vv_i16m4(...) __riscv_vnmsac_vv_i16m4_tu(__VA_ARGS__)
+#define vnmsac_vx_i16m4(...) __riscv_vnmsac_vx_i16m4_tu(__VA_ARGS__)
+#define vnmsac_vv_i16m8(...) __riscv_vnmsac_vv_i16m8_tu(__VA_ARGS__)
+#define vnmsac_vx_i16m8(...) __riscv_vnmsac_vx_i16m8_tu(__VA_ARGS__)
+#define vnmsac_vv_i32mf2(...) __riscv_vnmsac_vv_i32mf2_tu(__VA_ARGS__)
+#define vnmsac_vx_i32mf2(...) __riscv_vnmsac_vx_i32mf2_tu(__VA_ARGS__)
+#define vnmsac_vv_i32m1(...) __riscv_vnmsac_vv_i32m1_tu(__VA_ARGS__)
+#define vnmsac_vx_i32m1(...) __riscv_vnmsac_vx_i32m1_tu(__VA_ARGS__)
+#define vnmsac_vv_i32m2(...) __riscv_vnmsac_vv_i32m2_tu(__VA_ARGS__)
+#define vnmsac_vx_i32m2(...) __riscv_vnmsac_vx_i32m2_tu(__VA_ARGS__)
+#define vnmsac_vv_i32m4(...) __riscv_vnmsac_vv_i32m4_tu(__VA_ARGS__)
+#define vnmsac_vx_i32m4(...) __riscv_vnmsac_vx_i32m4_tu(__VA_ARGS__)
+#define vnmsac_vv_i32m8(...) __riscv_vnmsac_vv_i32m8_tu(__VA_ARGS__)
+#define vnmsac_vx_i32m8(...) __riscv_vnmsac_vx_i32m8_tu(__VA_ARGS__)
+#define vnmsac_vv_i64m1(...) __riscv_vnmsac_vv_i64m1_tu(__VA_ARGS__)
+#define vnmsac_vx_i64m1(...) __riscv_vnmsac_vx_i64m1_tu(__VA_ARGS__)
+#define vnmsac_vv_i64m2(...) __riscv_vnmsac_vv_i64m2_tu(__VA_ARGS__)
+#define vnmsac_vx_i64m2(...) __riscv_vnmsac_vx_i64m2_tu(__VA_ARGS__)
+#define vnmsac_vv_i64m4(...) __riscv_vnmsac_vv_i64m4_tu(__VA_ARGS__)
+#define vnmsac_vx_i64m4(...) __riscv_vnmsac_vx_i64m4_tu(__VA_ARGS__)
+#define vnmsac_vv_i64m8(...) __riscv_vnmsac_vv_i64m8_tu(__VA_ARGS__)
+#define vnmsac_vx_i64m8(...) __riscv_vnmsac_vx_i64m8_tu(__VA_ARGS__)
+#define vmadd_vv_i8mf8(...) __riscv_vmadd_vv_i8mf8_tu(__VA_ARGS__) // vmadd_* (i8..i64 suffixes): unmasked names forward their arguments to the `_tu`-suffixed __riscv_ intrinsics
+#define vmadd_vx_i8mf8(...) __riscv_vmadd_vx_i8mf8_tu(__VA_ARGS__)
+#define vmadd_vv_i8mf4(...) __riscv_vmadd_vv_i8mf4_tu(__VA_ARGS__)
+#define vmadd_vx_i8mf4(...) __riscv_vmadd_vx_i8mf4_tu(__VA_ARGS__)
+#define vmadd_vv_i8mf2(...) __riscv_vmadd_vv_i8mf2_tu(__VA_ARGS__)
+#define vmadd_vx_i8mf2(...) __riscv_vmadd_vx_i8mf2_tu(__VA_ARGS__)
+#define vmadd_vv_i8m1(...) __riscv_vmadd_vv_i8m1_tu(__VA_ARGS__)
+#define vmadd_vx_i8m1(...) __riscv_vmadd_vx_i8m1_tu(__VA_ARGS__)
+#define vmadd_vv_i8m2(...) __riscv_vmadd_vv_i8m2_tu(__VA_ARGS__)
+#define vmadd_vx_i8m2(...) __riscv_vmadd_vx_i8m2_tu(__VA_ARGS__)
+#define vmadd_vv_i8m4(...) __riscv_vmadd_vv_i8m4_tu(__VA_ARGS__)
+#define vmadd_vx_i8m4(...) __riscv_vmadd_vx_i8m4_tu(__VA_ARGS__)
+#define vmadd_vv_i8m8(...) __riscv_vmadd_vv_i8m8_tu(__VA_ARGS__)
+#define vmadd_vx_i8m8(...) __riscv_vmadd_vx_i8m8_tu(__VA_ARGS__)
+#define vmadd_vv_i16mf4(...) __riscv_vmadd_vv_i16mf4_tu(__VA_ARGS__)
+#define vmadd_vx_i16mf4(...) __riscv_vmadd_vx_i16mf4_tu(__VA_ARGS__)
+#define vmadd_vv_i16mf2(...) __riscv_vmadd_vv_i16mf2_tu(__VA_ARGS__)
+#define vmadd_vx_i16mf2(...) __riscv_vmadd_vx_i16mf2_tu(__VA_ARGS__)
+#define vmadd_vv_i16m1(...) __riscv_vmadd_vv_i16m1_tu(__VA_ARGS__)
+#define vmadd_vx_i16m1(...) __riscv_vmadd_vx_i16m1_tu(__VA_ARGS__)
+#define vmadd_vv_i16m2(...) __riscv_vmadd_vv_i16m2_tu(__VA_ARGS__)
+#define vmadd_vx_i16m2(...) __riscv_vmadd_vx_i16m2_tu(__VA_ARGS__)
+#define vmadd_vv_i16m4(...) __riscv_vmadd_vv_i16m4_tu(__VA_ARGS__)
+#define vmadd_vx_i16m4(...) __riscv_vmadd_vx_i16m4_tu(__VA_ARGS__)
+#define vmadd_vv_i16m8(...) __riscv_vmadd_vv_i16m8_tu(__VA_ARGS__)
+#define vmadd_vx_i16m8(...) __riscv_vmadd_vx_i16m8_tu(__VA_ARGS__)
+#define vmadd_vv_i32mf2(...) __riscv_vmadd_vv_i32mf2_tu(__VA_ARGS__)
+#define vmadd_vx_i32mf2(...) __riscv_vmadd_vx_i32mf2_tu(__VA_ARGS__)
+#define vmadd_vv_i32m1(...) __riscv_vmadd_vv_i32m1_tu(__VA_ARGS__)
+#define vmadd_vx_i32m1(...) __riscv_vmadd_vx_i32m1_tu(__VA_ARGS__)
+#define vmadd_vv_i32m2(...) __riscv_vmadd_vv_i32m2_tu(__VA_ARGS__)
+#define vmadd_vx_i32m2(...) __riscv_vmadd_vx_i32m2_tu(__VA_ARGS__)
+#define vmadd_vv_i32m4(...) __riscv_vmadd_vv_i32m4_tu(__VA_ARGS__)
+#define vmadd_vx_i32m4(...) __riscv_vmadd_vx_i32m4_tu(__VA_ARGS__)
+#define vmadd_vv_i32m8(...) __riscv_vmadd_vv_i32m8_tu(__VA_ARGS__)
+#define vmadd_vx_i32m8(...) __riscv_vmadd_vx_i32m8_tu(__VA_ARGS__)
+#define vmadd_vv_i64m1(...) __riscv_vmadd_vv_i64m1_tu(__VA_ARGS__)
+#define vmadd_vx_i64m1(...) __riscv_vmadd_vx_i64m1_tu(__VA_ARGS__)
+#define vmadd_vv_i64m2(...) __riscv_vmadd_vv_i64m2_tu(__VA_ARGS__)
+#define vmadd_vx_i64m2(...) __riscv_vmadd_vx_i64m2_tu(__VA_ARGS__)
+#define vmadd_vv_i64m4(...) __riscv_vmadd_vv_i64m4_tu(__VA_ARGS__)
+#define vmadd_vx_i64m4(...) __riscv_vmadd_vx_i64m4_tu(__VA_ARGS__)
+#define vmadd_vv_i64m8(...) __riscv_vmadd_vv_i64m8_tu(__VA_ARGS__)
+#define vmadd_vx_i64m8(...) __riscv_vmadd_vx_i64m8_tu(__VA_ARGS__)
+#define vnmsub_vv_i8mf8(...) __riscv_vnmsub_vv_i8mf8_tu(__VA_ARGS__) // vnmsub_* (i8..i64 suffixes): unmasked names forward their arguments to the `_tu`-suffixed __riscv_ intrinsics
+#define vnmsub_vx_i8mf8(...) __riscv_vnmsub_vx_i8mf8_tu(__VA_ARGS__)
+#define vnmsub_vv_i8mf4(...) __riscv_vnmsub_vv_i8mf4_tu(__VA_ARGS__)
+#define vnmsub_vx_i8mf4(...) __riscv_vnmsub_vx_i8mf4_tu(__VA_ARGS__)
+#define vnmsub_vv_i8mf2(...) __riscv_vnmsub_vv_i8mf2_tu(__VA_ARGS__)
+#define vnmsub_vx_i8mf2(...) __riscv_vnmsub_vx_i8mf2_tu(__VA_ARGS__)
+#define vnmsub_vv_i8m1(...) __riscv_vnmsub_vv_i8m1_tu(__VA_ARGS__)
+#define vnmsub_vx_i8m1(...) __riscv_vnmsub_vx_i8m1_tu(__VA_ARGS__)
+#define vnmsub_vv_i8m2(...) __riscv_vnmsub_vv_i8m2_tu(__VA_ARGS__)
+#define vnmsub_vx_i8m2(...) __riscv_vnmsub_vx_i8m2_tu(__VA_ARGS__)
+#define vnmsub_vv_i8m4(...) __riscv_vnmsub_vv_i8m4_tu(__VA_ARGS__)
+#define vnmsub_vx_i8m4(...) __riscv_vnmsub_vx_i8m4_tu(__VA_ARGS__)
+#define vnmsub_vv_i8m8(...) __riscv_vnmsub_vv_i8m8_tu(__VA_ARGS__)
+#define vnmsub_vx_i8m8(...) __riscv_vnmsub_vx_i8m8_tu(__VA_ARGS__)
+#define vnmsub_vv_i16mf4(...) __riscv_vnmsub_vv_i16mf4_tu(__VA_ARGS__)
+#define vnmsub_vx_i16mf4(...) __riscv_vnmsub_vx_i16mf4_tu(__VA_ARGS__)
+#define vnmsub_vv_i16mf2(...) __riscv_vnmsub_vv_i16mf2_tu(__VA_ARGS__)
+#define vnmsub_vx_i16mf2(...) __riscv_vnmsub_vx_i16mf2_tu(__VA_ARGS__)
+#define vnmsub_vv_i16m1(...) __riscv_vnmsub_vv_i16m1_tu(__VA_ARGS__)
+#define vnmsub_vx_i16m1(...) __riscv_vnmsub_vx_i16m1_tu(__VA_ARGS__)
+#define vnmsub_vv_i16m2(...) __riscv_vnmsub_vv_i16m2_tu(__VA_ARGS__)
+#define vnmsub_vx_i16m2(...) __riscv_vnmsub_vx_i16m2_tu(__VA_ARGS__)
+#define vnmsub_vv_i16m4(...) __riscv_vnmsub_vv_i16m4_tu(__VA_ARGS__)
+#define vnmsub_vx_i16m4(...) __riscv_vnmsub_vx_i16m4_tu(__VA_ARGS__)
+#define vnmsub_vv_i16m8(...) __riscv_vnmsub_vv_i16m8_tu(__VA_ARGS__)
+#define vnmsub_vx_i16m8(...) __riscv_vnmsub_vx_i16m8_tu(__VA_ARGS__)
+#define vnmsub_vv_i32mf2(...) __riscv_vnmsub_vv_i32mf2_tu(__VA_ARGS__)
+#define vnmsub_vx_i32mf2(...) __riscv_vnmsub_vx_i32mf2_tu(__VA_ARGS__)
+#define vnmsub_vv_i32m1(...) __riscv_vnmsub_vv_i32m1_tu(__VA_ARGS__)
+#define vnmsub_vx_i32m1(...) __riscv_vnmsub_vx_i32m1_tu(__VA_ARGS__)
+#define vnmsub_vv_i32m2(...) __riscv_vnmsub_vv_i32m2_tu(__VA_ARGS__)
+#define vnmsub_vx_i32m2(...) __riscv_vnmsub_vx_i32m2_tu(__VA_ARGS__)
+#define vnmsub_vv_i32m4(...) __riscv_vnmsub_vv_i32m4_tu(__VA_ARGS__)
+#define vnmsub_vx_i32m4(...) __riscv_vnmsub_vx_i32m4_tu(__VA_ARGS__)
+#define vnmsub_vv_i32m8(...) __riscv_vnmsub_vv_i32m8_tu(__VA_ARGS__)
+#define vnmsub_vx_i32m8(...) __riscv_vnmsub_vx_i32m8_tu(__VA_ARGS__)
+#define vnmsub_vv_i64m1(...) __riscv_vnmsub_vv_i64m1_tu(__VA_ARGS__)
+#define vnmsub_vx_i64m1(...) __riscv_vnmsub_vx_i64m1_tu(__VA_ARGS__)
+#define vnmsub_vv_i64m2(...) __riscv_vnmsub_vv_i64m2_tu(__VA_ARGS__)
+#define vnmsub_vx_i64m2(...) __riscv_vnmsub_vx_i64m2_tu(__VA_ARGS__)
+#define vnmsub_vv_i64m4(...) __riscv_vnmsub_vv_i64m4_tu(__VA_ARGS__)
+#define vnmsub_vx_i64m4(...) __riscv_vnmsub_vx_i64m4_tu(__VA_ARGS__)
+#define vnmsub_vv_i64m8(...) __riscv_vnmsub_vv_i64m8_tu(__VA_ARGS__)
+#define vnmsub_vx_i64m8(...) __riscv_vnmsub_vx_i64m8_tu(__VA_ARGS__)
+#define vmacc_vv_u8mf8(...) __riscv_vmacc_vv_u8mf8_tu(__VA_ARGS__) // vmacc_* (u8..u64 suffixes): unmasked names forward their arguments to the `_tu`-suffixed __riscv_ intrinsics
+#define vmacc_vx_u8mf8(...) __riscv_vmacc_vx_u8mf8_tu(__VA_ARGS__)
+#define vmacc_vv_u8mf4(...) __riscv_vmacc_vv_u8mf4_tu(__VA_ARGS__)
+#define vmacc_vx_u8mf4(...) __riscv_vmacc_vx_u8mf4_tu(__VA_ARGS__)
+#define vmacc_vv_u8mf2(...) __riscv_vmacc_vv_u8mf2_tu(__VA_ARGS__)
+#define vmacc_vx_u8mf2(...) __riscv_vmacc_vx_u8mf2_tu(__VA_ARGS__)
+#define vmacc_vv_u8m1(...) __riscv_vmacc_vv_u8m1_tu(__VA_ARGS__)
+#define vmacc_vx_u8m1(...) __riscv_vmacc_vx_u8m1_tu(__VA_ARGS__)
+#define vmacc_vv_u8m2(...) __riscv_vmacc_vv_u8m2_tu(__VA_ARGS__)
+#define vmacc_vx_u8m2(...) __riscv_vmacc_vx_u8m2_tu(__VA_ARGS__)
+#define vmacc_vv_u8m4(...) __riscv_vmacc_vv_u8m4_tu(__VA_ARGS__)
+#define vmacc_vx_u8m4(...) __riscv_vmacc_vx_u8m4_tu(__VA_ARGS__)
+#define vmacc_vv_u8m8(...) __riscv_vmacc_vv_u8m8_tu(__VA_ARGS__)
+#define vmacc_vx_u8m8(...) __riscv_vmacc_vx_u8m8_tu(__VA_ARGS__)
+#define vmacc_vv_u16mf4(...) __riscv_vmacc_vv_u16mf4_tu(__VA_ARGS__)
+#define vmacc_vx_u16mf4(...) __riscv_vmacc_vx_u16mf4_tu(__VA_ARGS__)
+#define vmacc_vv_u16mf2(...) __riscv_vmacc_vv_u16mf2_tu(__VA_ARGS__)
+#define vmacc_vx_u16mf2(...) __riscv_vmacc_vx_u16mf2_tu(__VA_ARGS__)
+#define vmacc_vv_u16m1(...) __riscv_vmacc_vv_u16m1_tu(__VA_ARGS__)
+#define vmacc_vx_u16m1(...) __riscv_vmacc_vx_u16m1_tu(__VA_ARGS__)
+#define vmacc_vv_u16m2(...) __riscv_vmacc_vv_u16m2_tu(__VA_ARGS__)
+#define vmacc_vx_u16m2(...) __riscv_vmacc_vx_u16m2_tu(__VA_ARGS__)
+#define vmacc_vv_u16m4(...) __riscv_vmacc_vv_u16m4_tu(__VA_ARGS__)
+#define vmacc_vx_u16m4(...) __riscv_vmacc_vx_u16m4_tu(__VA_ARGS__)
+#define vmacc_vv_u16m8(...) __riscv_vmacc_vv_u16m8_tu(__VA_ARGS__)
+#define vmacc_vx_u16m8(...) __riscv_vmacc_vx_u16m8_tu(__VA_ARGS__)
+#define vmacc_vv_u32mf2(...) __riscv_vmacc_vv_u32mf2_tu(__VA_ARGS__)
+#define vmacc_vx_u32mf2(...) __riscv_vmacc_vx_u32mf2_tu(__VA_ARGS__)
+#define vmacc_vv_u32m1(...) __riscv_vmacc_vv_u32m1_tu(__VA_ARGS__)
+#define vmacc_vx_u32m1(...) __riscv_vmacc_vx_u32m1_tu(__VA_ARGS__)
+#define vmacc_vv_u32m2(...) __riscv_vmacc_vv_u32m2_tu(__VA_ARGS__)
+#define vmacc_vx_u32m2(...) __riscv_vmacc_vx_u32m2_tu(__VA_ARGS__)
+#define vmacc_vv_u32m4(...) __riscv_vmacc_vv_u32m4_tu(__VA_ARGS__)
+#define vmacc_vx_u32m4(...) __riscv_vmacc_vx_u32m4_tu(__VA_ARGS__)
+#define vmacc_vv_u32m8(...) __riscv_vmacc_vv_u32m8_tu(__VA_ARGS__)
+#define vmacc_vx_u32m8(...) __riscv_vmacc_vx_u32m8_tu(__VA_ARGS__)
+#define vmacc_vv_u64m1(...) __riscv_vmacc_vv_u64m1_tu(__VA_ARGS__)
+#define vmacc_vx_u64m1(...) __riscv_vmacc_vx_u64m1_tu(__VA_ARGS__)
+#define vmacc_vv_u64m2(...) __riscv_vmacc_vv_u64m2_tu(__VA_ARGS__)
+#define vmacc_vx_u64m2(...) __riscv_vmacc_vx_u64m2_tu(__VA_ARGS__)
+#define vmacc_vv_u64m4(...) __riscv_vmacc_vv_u64m4_tu(__VA_ARGS__)
+#define vmacc_vx_u64m4(...) __riscv_vmacc_vx_u64m4_tu(__VA_ARGS__)
+#define vmacc_vv_u64m8(...) __riscv_vmacc_vv_u64m8_tu(__VA_ARGS__)
+#define vmacc_vx_u64m8(...) __riscv_vmacc_vx_u64m8_tu(__VA_ARGS__)
+#define vnmsac_vv_u8mf8(...) __riscv_vnmsac_vv_u8mf8_tu(__VA_ARGS__) // vnmsac_* (u8..u64 suffixes): unmasked names forward their arguments to the `_tu`-suffixed __riscv_ intrinsics
+#define vnmsac_vx_u8mf8(...) __riscv_vnmsac_vx_u8mf8_tu(__VA_ARGS__)
+#define vnmsac_vv_u8mf4(...) __riscv_vnmsac_vv_u8mf4_tu(__VA_ARGS__)
+#define vnmsac_vx_u8mf4(...) __riscv_vnmsac_vx_u8mf4_tu(__VA_ARGS__)
+#define vnmsac_vv_u8mf2(...) __riscv_vnmsac_vv_u8mf2_tu(__VA_ARGS__)
+#define vnmsac_vx_u8mf2(...) __riscv_vnmsac_vx_u8mf2_tu(__VA_ARGS__)
+#define vnmsac_vv_u8m1(...) __riscv_vnmsac_vv_u8m1_tu(__VA_ARGS__)
+#define vnmsac_vx_u8m1(...) __riscv_vnmsac_vx_u8m1_tu(__VA_ARGS__)
+#define vnmsac_vv_u8m2(...) __riscv_vnmsac_vv_u8m2_tu(__VA_ARGS__)
+#define vnmsac_vx_u8m2(...) __riscv_vnmsac_vx_u8m2_tu(__VA_ARGS__)
+#define vnmsac_vv_u8m4(...) __riscv_vnmsac_vv_u8m4_tu(__VA_ARGS__)
+#define vnmsac_vx_u8m4(...) __riscv_vnmsac_vx_u8m4_tu(__VA_ARGS__)
+#define vnmsac_vv_u8m8(...) __riscv_vnmsac_vv_u8m8_tu(__VA_ARGS__)
+#define vnmsac_vx_u8m8(...) __riscv_vnmsac_vx_u8m8_tu(__VA_ARGS__)
+#define vnmsac_vv_u16mf4(...) __riscv_vnmsac_vv_u16mf4_tu(__VA_ARGS__)
+#define vnmsac_vx_u16mf4(...) __riscv_vnmsac_vx_u16mf4_tu(__VA_ARGS__)
+#define vnmsac_vv_u16mf2(...) __riscv_vnmsac_vv_u16mf2_tu(__VA_ARGS__)
+#define vnmsac_vx_u16mf2(...) __riscv_vnmsac_vx_u16mf2_tu(__VA_ARGS__)
+#define vnmsac_vv_u16m1(...) __riscv_vnmsac_vv_u16m1_tu(__VA_ARGS__)
+#define vnmsac_vx_u16m1(...) __riscv_vnmsac_vx_u16m1_tu(__VA_ARGS__)
+#define vnmsac_vv_u16m2(...) __riscv_vnmsac_vv_u16m2_tu(__VA_ARGS__)
+#define vnmsac_vx_u16m2(...) __riscv_vnmsac_vx_u16m2_tu(__VA_ARGS__)
+#define vnmsac_vv_u16m4(...) __riscv_vnmsac_vv_u16m4_tu(__VA_ARGS__)
+#define vnmsac_vx_u16m4(...) __riscv_vnmsac_vx_u16m4_tu(__VA_ARGS__)
+#define vnmsac_vv_u16m8(...) __riscv_vnmsac_vv_u16m8_tu(__VA_ARGS__)
+#define vnmsac_vx_u16m8(...) __riscv_vnmsac_vx_u16m8_tu(__VA_ARGS__)
+#define vnmsac_vv_u32mf2(...) __riscv_vnmsac_vv_u32mf2_tu(__VA_ARGS__)
+#define vnmsac_vx_u32mf2(...) __riscv_vnmsac_vx_u32mf2_tu(__VA_ARGS__)
+#define vnmsac_vv_u32m1(...) __riscv_vnmsac_vv_u32m1_tu(__VA_ARGS__)
+#define vnmsac_vx_u32m1(...) __riscv_vnmsac_vx_u32m1_tu(__VA_ARGS__)
+#define vnmsac_vv_u32m2(...) __riscv_vnmsac_vv_u32m2_tu(__VA_ARGS__)
+#define vnmsac_vx_u32m2(...) __riscv_vnmsac_vx_u32m2_tu(__VA_ARGS__)
+#define vnmsac_vv_u32m4(...) __riscv_vnmsac_vv_u32m4_tu(__VA_ARGS__)
+#define vnmsac_vx_u32m4(...) __riscv_vnmsac_vx_u32m4_tu(__VA_ARGS__)
+#define vnmsac_vv_u32m8(...) __riscv_vnmsac_vv_u32m8_tu(__VA_ARGS__)
+#define vnmsac_vx_u32m8(...) __riscv_vnmsac_vx_u32m8_tu(__VA_ARGS__)
+#define vnmsac_vv_u64m1(...) __riscv_vnmsac_vv_u64m1_tu(__VA_ARGS__)
+#define vnmsac_vx_u64m1(...) __riscv_vnmsac_vx_u64m1_tu(__VA_ARGS__)
+#define vnmsac_vv_u64m2(...) __riscv_vnmsac_vv_u64m2_tu(__VA_ARGS__)
+#define vnmsac_vx_u64m2(...) __riscv_vnmsac_vx_u64m2_tu(__VA_ARGS__)
+#define vnmsac_vv_u64m4(...) __riscv_vnmsac_vv_u64m4_tu(__VA_ARGS__)
+#define vnmsac_vx_u64m4(...) __riscv_vnmsac_vx_u64m4_tu(__VA_ARGS__)
+#define vnmsac_vv_u64m8(...) __riscv_vnmsac_vv_u64m8_tu(__VA_ARGS__)
+#define vnmsac_vx_u64m8(...) __riscv_vnmsac_vx_u64m8_tu(__VA_ARGS__)
+#define vmadd_vv_u8mf8(...) __riscv_vmadd_vv_u8mf8_tu(__VA_ARGS__) // vmadd_* (u8..u64 suffixes): unmasked names forward their arguments to the `_tu`-suffixed __riscv_ intrinsics
+#define vmadd_vx_u8mf8(...) __riscv_vmadd_vx_u8mf8_tu(__VA_ARGS__)
+#define vmadd_vv_u8mf4(...) __riscv_vmadd_vv_u8mf4_tu(__VA_ARGS__)
+#define vmadd_vx_u8mf4(...) __riscv_vmadd_vx_u8mf4_tu(__VA_ARGS__)
+#define vmadd_vv_u8mf2(...) __riscv_vmadd_vv_u8mf2_tu(__VA_ARGS__)
+#define vmadd_vx_u8mf2(...) __riscv_vmadd_vx_u8mf2_tu(__VA_ARGS__)
+#define vmadd_vv_u8m1(...) __riscv_vmadd_vv_u8m1_tu(__VA_ARGS__)
+#define vmadd_vx_u8m1(...) __riscv_vmadd_vx_u8m1_tu(__VA_ARGS__)
+#define vmadd_vv_u8m2(...) __riscv_vmadd_vv_u8m2_tu(__VA_ARGS__)
+#define vmadd_vx_u8m2(...) __riscv_vmadd_vx_u8m2_tu(__VA_ARGS__)
+#define vmadd_vv_u8m4(...) __riscv_vmadd_vv_u8m4_tu(__VA_ARGS__)
+#define vmadd_vx_u8m4(...) __riscv_vmadd_vx_u8m4_tu(__VA_ARGS__)
+#define vmadd_vv_u8m8(...) __riscv_vmadd_vv_u8m8_tu(__VA_ARGS__)
+#define vmadd_vx_u8m8(...) __riscv_vmadd_vx_u8m8_tu(__VA_ARGS__)
+#define vmadd_vv_u16mf4(...) __riscv_vmadd_vv_u16mf4_tu(__VA_ARGS__)
+#define vmadd_vx_u16mf4(...) __riscv_vmadd_vx_u16mf4_tu(__VA_ARGS__)
+#define vmadd_vv_u16mf2(...) __riscv_vmadd_vv_u16mf2_tu(__VA_ARGS__)
+#define vmadd_vx_u16mf2(...) __riscv_vmadd_vx_u16mf2_tu(__VA_ARGS__)
+#define vmadd_vv_u16m1(...) __riscv_vmadd_vv_u16m1_tu(__VA_ARGS__)
+#define vmadd_vx_u16m1(...) __riscv_vmadd_vx_u16m1_tu(__VA_ARGS__)
+#define vmadd_vv_u16m2(...) __riscv_vmadd_vv_u16m2_tu(__VA_ARGS__)
+#define vmadd_vx_u16m2(...) __riscv_vmadd_vx_u16m2_tu(__VA_ARGS__)
+#define vmadd_vv_u16m4(...) __riscv_vmadd_vv_u16m4_tu(__VA_ARGS__)
+#define vmadd_vx_u16m4(...) __riscv_vmadd_vx_u16m4_tu(__VA_ARGS__)
+#define vmadd_vv_u16m8(...) __riscv_vmadd_vv_u16m8_tu(__VA_ARGS__)
+#define vmadd_vx_u16m8(...) __riscv_vmadd_vx_u16m8_tu(__VA_ARGS__)
+#define vmadd_vv_u32mf2(...) __riscv_vmadd_vv_u32mf2_tu(__VA_ARGS__)
+#define vmadd_vx_u32mf2(...) __riscv_vmadd_vx_u32mf2_tu(__VA_ARGS__)
+#define vmadd_vv_u32m1(...) __riscv_vmadd_vv_u32m1_tu(__VA_ARGS__)
+#define vmadd_vx_u32m1(...) __riscv_vmadd_vx_u32m1_tu(__VA_ARGS__)
+#define vmadd_vv_u32m2(...) __riscv_vmadd_vv_u32m2_tu(__VA_ARGS__)
+#define vmadd_vx_u32m2(...) __riscv_vmadd_vx_u32m2_tu(__VA_ARGS__)
+#define vmadd_vv_u32m4(...) __riscv_vmadd_vv_u32m4_tu(__VA_ARGS__)
+#define vmadd_vx_u32m4(...) __riscv_vmadd_vx_u32m4_tu(__VA_ARGS__)
+#define vmadd_vv_u32m8(...) __riscv_vmadd_vv_u32m8_tu(__VA_ARGS__)
+#define vmadd_vx_u32m8(...) __riscv_vmadd_vx_u32m8_tu(__VA_ARGS__)
+#define vmadd_vv_u64m1(...) __riscv_vmadd_vv_u64m1_tu(__VA_ARGS__)
+#define vmadd_vx_u64m1(...) __riscv_vmadd_vx_u64m1_tu(__VA_ARGS__)
+#define vmadd_vv_u64m2(...) __riscv_vmadd_vv_u64m2_tu(__VA_ARGS__)
+#define vmadd_vx_u64m2(...) __riscv_vmadd_vx_u64m2_tu(__VA_ARGS__)
+#define vmadd_vv_u64m4(...) __riscv_vmadd_vv_u64m4_tu(__VA_ARGS__)
+#define vmadd_vx_u64m4(...) __riscv_vmadd_vx_u64m4_tu(__VA_ARGS__)
+#define vmadd_vv_u64m8(...) __riscv_vmadd_vv_u64m8_tu(__VA_ARGS__)
+#define vmadd_vx_u64m8(...) __riscv_vmadd_vx_u64m8_tu(__VA_ARGS__)
+#define vnmsub_vv_u8mf8(...) __riscv_vnmsub_vv_u8mf8_tu(__VA_ARGS__) // vnmsub_* (u8.. suffixes): unmasked names forward their arguments to the `_tu`-suffixed __riscv_ intrinsics
+#define vnmsub_vx_u8mf8(...) __riscv_vnmsub_vx_u8mf8_tu(__VA_ARGS__)
+#define vnmsub_vv_u8mf4(...) __riscv_vnmsub_vv_u8mf4_tu(__VA_ARGS__)
+#define vnmsub_vx_u8mf4(...) __riscv_vnmsub_vx_u8mf4_tu(__VA_ARGS__)
+#define vnmsub_vv_u8mf2(...) __riscv_vnmsub_vv_u8mf2_tu(__VA_ARGS__)
+#define vnmsub_vx_u8mf2(...) __riscv_vnmsub_vx_u8mf2_tu(__VA_ARGS__)
+#define vnmsub_vv_u8m1(...) __riscv_vnmsub_vv_u8m1_tu(__VA_ARGS__)
+#define vnmsub_vx_u8m1(...) __riscv_vnmsub_vx_u8m1_tu(__VA_ARGS__)
+#define vnmsub_vv_u8m2(...) __riscv_vnmsub_vv_u8m2_tu(__VA_ARGS__)
+#define vnmsub_vx_u8m2(...) __riscv_vnmsub_vx_u8m2_tu(__VA_ARGS__)
+#define vnmsub_vv_u8m4(...) __riscv_vnmsub_vv_u8m4_tu(__VA_ARGS__)
+#define vnmsub_vx_u8m4(...) __riscv_vnmsub_vx_u8m4_tu(__VA_ARGS__)
+#define vnmsub_vv_u8m8(...) __riscv_vnmsub_vv_u8m8_tu(__VA_ARGS__)
+#define vnmsub_vx_u8m8(...) __riscv_vnmsub_vx_u8m8_tu(__VA_ARGS__)
+#define vnmsub_vv_u16mf4(...) __riscv_vnmsub_vv_u16mf4_tu(__VA_ARGS__)
+#define vnmsub_vx_u16mf4(...) __riscv_vnmsub_vx_u16mf4_tu(__VA_ARGS__)
+#define vnmsub_vv_u16mf2(...) __riscv_vnmsub_vv_u16mf2_tu(__VA_ARGS__)
+#define vnmsub_vx_u16mf2(...) __riscv_vnmsub_vx_u16mf2_tu(__VA_ARGS__)
+#define vnmsub_vv_u16m1(...) __riscv_vnmsub_vv_u16m1_tu(__VA_ARGS__)
+#define vnmsub_vx_u16m1(...) __riscv_vnmsub_vx_u16m1_tu(__VA_ARGS__)
+#define vnmsub_vv_u16m2(...) __riscv_vnmsub_vv_u16m2_tu(__VA_ARGS__)
+#define vnmsub_vx_u16m2(...) __riscv_vnmsub_vx_u16m2_tu(__VA_ARGS__)
+#define vnmsub_vv_u16m4(...) __riscv_vnmsub_vv_u16m4_tu(__VA_ARGS__)
+#define vnmsub_vx_u16m4(...) __riscv_vnmsub_vx_u16m4_tu(__VA_ARGS__)
+#define vnmsub_vv_u16m8(...) __riscv_vnmsub_vv_u16m8_tu(__VA_ARGS__)
+#define vnmsub_vx_u16m8(...) __riscv_vnmsub_vx_u16m8_tu(__VA_ARGS__)
+#define vnmsub_vv_u32mf2(...) __riscv_vnmsub_vv_u32mf2_tu(__VA_ARGS__)
+#define vnmsub_vx_u32mf2(...) __riscv_vnmsub_vx_u32mf2_tu(__VA_ARGS__)
+#define vnmsub_vv_u32m1(...) __riscv_vnmsub_vv_u32m1_tu(__VA_ARGS__)
+#define vnmsub_vx_u32m1(...) __riscv_vnmsub_vx_u32m1_tu(__VA_ARGS__)
+#define vnmsub_vv_u32m2(...) __riscv_vnmsub_vv_u32m2_tu(__VA_ARGS__)
+#define vnmsub_vx_u32m2(...) __riscv_vnmsub_vx_u32m2_tu(__VA_ARGS__)
+#define vnmsub_vv_u32m4(...) __riscv_vnmsub_vv_u32m4_tu(__VA_ARGS__)
+#define vnmsub_vx_u32m4(...) __riscv_vnmsub_vx_u32m4_tu(__VA_ARGS__)
+#define vnmsub_vv_u32m8(...) __riscv_vnmsub_vv_u32m8_tu(__VA_ARGS__)
+#define vnmsub_vx_u32m8(...) __riscv_vnmsub_vx_u32m8_tu(__VA_ARGS__)
+#define vnmsub_vv_u64m1(...) __riscv_vnmsub_vv_u64m1_tu(__VA_ARGS__)
+#define vnmsub_vx_u64m1(...) __riscv_vnmsub_vx_u64m1_tu(__VA_ARGS__)
+#define vnmsub_vv_u64m2(...) __riscv_vnmsub_vv_u64m2_tu(__VA_ARGS__)
+#define vnmsub_vx_u64m2(...) __riscv_vnmsub_vx_u64m2_tu(__VA_ARGS__)
+#define vnmsub_vv_u64m4(...) __riscv_vnmsub_vv_u64m4_tu(__VA_ARGS__)
+#define vnmsub_vx_u64m4(...) __riscv_vnmsub_vx_u64m4_tu(__VA_ARGS__)
+#define vnmsub_vv_u64m8(...) __riscv_vnmsub_vv_u64m8_tu(__VA_ARGS__)
+#define vnmsub_vx_u64m8(...) __riscv_vnmsub_vx_u64m8_tu(__VA_ARGS__)
+// Masked variants: each *_m alias forwards its arguments unchanged to the matching __riscv_-prefixed _tumu intrinsic.
+#define vmacc_vv_i8mf8_m(...) __riscv_vmacc_vv_i8mf8_tumu(__VA_ARGS__)
+#define vmacc_vx_i8mf8_m(...) __riscv_vmacc_vx_i8mf8_tumu(__VA_ARGS__)
+#define vmacc_vv_i8mf4_m(...) __riscv_vmacc_vv_i8mf4_tumu(__VA_ARGS__)
+#define vmacc_vx_i8mf4_m(...) __riscv_vmacc_vx_i8mf4_tumu(__VA_ARGS__)
+#define vmacc_vv_i8mf2_m(...) __riscv_vmacc_vv_i8mf2_tumu(__VA_ARGS__)
+#define vmacc_vx_i8mf2_m(...) __riscv_vmacc_vx_i8mf2_tumu(__VA_ARGS__)
+#define vmacc_vv_i8m1_m(...) __riscv_vmacc_vv_i8m1_tumu(__VA_ARGS__)
+#define vmacc_vx_i8m1_m(...) __riscv_vmacc_vx_i8m1_tumu(__VA_ARGS__)
+#define vmacc_vv_i8m2_m(...) __riscv_vmacc_vv_i8m2_tumu(__VA_ARGS__)
+#define vmacc_vx_i8m2_m(...) __riscv_vmacc_vx_i8m2_tumu(__VA_ARGS__)
+#define vmacc_vv_i8m4_m(...) __riscv_vmacc_vv_i8m4_tumu(__VA_ARGS__)
+#define vmacc_vx_i8m4_m(...) __riscv_vmacc_vx_i8m4_tumu(__VA_ARGS__)
+#define vmacc_vv_i8m8_m(...) __riscv_vmacc_vv_i8m8_tumu(__VA_ARGS__)
+#define vmacc_vx_i8m8_m(...) __riscv_vmacc_vx_i8m8_tumu(__VA_ARGS__)
+#define vmacc_vv_i16mf4_m(...) __riscv_vmacc_vv_i16mf4_tumu(__VA_ARGS__)
+#define vmacc_vx_i16mf4_m(...) __riscv_vmacc_vx_i16mf4_tumu(__VA_ARGS__)
+#define vmacc_vv_i16mf2_m(...) __riscv_vmacc_vv_i16mf2_tumu(__VA_ARGS__)
+#define vmacc_vx_i16mf2_m(...) __riscv_vmacc_vx_i16mf2_tumu(__VA_ARGS__)
+#define vmacc_vv_i16m1_m(...) __riscv_vmacc_vv_i16m1_tumu(__VA_ARGS__)
+#define vmacc_vx_i16m1_m(...) __riscv_vmacc_vx_i16m1_tumu(__VA_ARGS__)
+#define vmacc_vv_i16m2_m(...) __riscv_vmacc_vv_i16m2_tumu(__VA_ARGS__)
+#define vmacc_vx_i16m2_m(...) __riscv_vmacc_vx_i16m2_tumu(__VA_ARGS__)
+#define vmacc_vv_i16m4_m(...) __riscv_vmacc_vv_i16m4_tumu(__VA_ARGS__)
+#define vmacc_vx_i16m4_m(...) __riscv_vmacc_vx_i16m4_tumu(__VA_ARGS__)
+#define vmacc_vv_i16m8_m(...) __riscv_vmacc_vv_i16m8_tumu(__VA_ARGS__)
+#define vmacc_vx_i16m8_m(...) __riscv_vmacc_vx_i16m8_tumu(__VA_ARGS__)
+#define vmacc_vv_i32mf2_m(...) __riscv_vmacc_vv_i32mf2_tumu(__VA_ARGS__)
+#define vmacc_vx_i32mf2_m(...) __riscv_vmacc_vx_i32mf2_tumu(__VA_ARGS__)
+#define vmacc_vv_i32m1_m(...) __riscv_vmacc_vv_i32m1_tumu(__VA_ARGS__)
+#define vmacc_vx_i32m1_m(...) __riscv_vmacc_vx_i32m1_tumu(__VA_ARGS__)
+#define vmacc_vv_i32m2_m(...) __riscv_vmacc_vv_i32m2_tumu(__VA_ARGS__)
+#define vmacc_vx_i32m2_m(...) __riscv_vmacc_vx_i32m2_tumu(__VA_ARGS__)
+#define vmacc_vv_i32m4_m(...) __riscv_vmacc_vv_i32m4_tumu(__VA_ARGS__)
+#define vmacc_vx_i32m4_m(...) __riscv_vmacc_vx_i32m4_tumu(__VA_ARGS__)
+#define vmacc_vv_i32m8_m(...) __riscv_vmacc_vv_i32m8_tumu(__VA_ARGS__)
+#define vmacc_vx_i32m8_m(...) __riscv_vmacc_vx_i32m8_tumu(__VA_ARGS__)
+#define vmacc_vv_i64m1_m(...) __riscv_vmacc_vv_i64m1_tumu(__VA_ARGS__)
+#define vmacc_vx_i64m1_m(...) __riscv_vmacc_vx_i64m1_tumu(__VA_ARGS__)
+#define vmacc_vv_i64m2_m(...) __riscv_vmacc_vv_i64m2_tumu(__VA_ARGS__)
+#define vmacc_vx_i64m2_m(...) __riscv_vmacc_vx_i64m2_tumu(__VA_ARGS__)
+#define vmacc_vv_i64m4_m(...) __riscv_vmacc_vv_i64m4_tumu(__VA_ARGS__)
+#define vmacc_vx_i64m4_m(...) __riscv_vmacc_vx_i64m4_tumu(__VA_ARGS__)
+#define vmacc_vv_i64m8_m(...) __riscv_vmacc_vv_i64m8_tumu(__VA_ARGS__)
+#define vmacc_vx_i64m8_m(...) __riscv_vmacc_vx_i64m8_tumu(__VA_ARGS__)
+#define vnmsac_vv_i8mf8_m(...) __riscv_vnmsac_vv_i8mf8_tumu(__VA_ARGS__) // vnmsac, signed (i8..i64), masked: each *_m alias forwards its arguments unchanged to the __riscv_-prefixed _tumu intrinsic
+#define vnmsac_vx_i8mf8_m(...) __riscv_vnmsac_vx_i8mf8_tumu(__VA_ARGS__)
+#define vnmsac_vv_i8mf4_m(...) __riscv_vnmsac_vv_i8mf4_tumu(__VA_ARGS__)
+#define vnmsac_vx_i8mf4_m(...) __riscv_vnmsac_vx_i8mf4_tumu(__VA_ARGS__)
+#define vnmsac_vv_i8mf2_m(...) __riscv_vnmsac_vv_i8mf2_tumu(__VA_ARGS__)
+#define vnmsac_vx_i8mf2_m(...) __riscv_vnmsac_vx_i8mf2_tumu(__VA_ARGS__)
+#define vnmsac_vv_i8m1_m(...) __riscv_vnmsac_vv_i8m1_tumu(__VA_ARGS__)
+#define vnmsac_vx_i8m1_m(...) __riscv_vnmsac_vx_i8m1_tumu(__VA_ARGS__)
+#define vnmsac_vv_i8m2_m(...) __riscv_vnmsac_vv_i8m2_tumu(__VA_ARGS__)
+#define vnmsac_vx_i8m2_m(...) __riscv_vnmsac_vx_i8m2_tumu(__VA_ARGS__)
+#define vnmsac_vv_i8m4_m(...) __riscv_vnmsac_vv_i8m4_tumu(__VA_ARGS__)
+#define vnmsac_vx_i8m4_m(...) __riscv_vnmsac_vx_i8m4_tumu(__VA_ARGS__)
+#define vnmsac_vv_i8m8_m(...) __riscv_vnmsac_vv_i8m8_tumu(__VA_ARGS__)
+#define vnmsac_vx_i8m8_m(...) __riscv_vnmsac_vx_i8m8_tumu(__VA_ARGS__)
+#define vnmsac_vv_i16mf4_m(...) __riscv_vnmsac_vv_i16mf4_tumu(__VA_ARGS__)
+#define vnmsac_vx_i16mf4_m(...) __riscv_vnmsac_vx_i16mf4_tumu(__VA_ARGS__)
+#define vnmsac_vv_i16mf2_m(...) __riscv_vnmsac_vv_i16mf2_tumu(__VA_ARGS__)
+#define vnmsac_vx_i16mf2_m(...) __riscv_vnmsac_vx_i16mf2_tumu(__VA_ARGS__)
+#define vnmsac_vv_i16m1_m(...) __riscv_vnmsac_vv_i16m1_tumu(__VA_ARGS__)
+#define vnmsac_vx_i16m1_m(...) __riscv_vnmsac_vx_i16m1_tumu(__VA_ARGS__)
+#define vnmsac_vv_i16m2_m(...) __riscv_vnmsac_vv_i16m2_tumu(__VA_ARGS__)
+#define vnmsac_vx_i16m2_m(...) __riscv_vnmsac_vx_i16m2_tumu(__VA_ARGS__)
+#define vnmsac_vv_i16m4_m(...) __riscv_vnmsac_vv_i16m4_tumu(__VA_ARGS__)
+#define vnmsac_vx_i16m4_m(...) __riscv_vnmsac_vx_i16m4_tumu(__VA_ARGS__)
+#define vnmsac_vv_i16m8_m(...) __riscv_vnmsac_vv_i16m8_tumu(__VA_ARGS__)
+#define vnmsac_vx_i16m8_m(...) __riscv_vnmsac_vx_i16m8_tumu(__VA_ARGS__)
+#define vnmsac_vv_i32mf2_m(...) __riscv_vnmsac_vv_i32mf2_tumu(__VA_ARGS__)
+#define vnmsac_vx_i32mf2_m(...) __riscv_vnmsac_vx_i32mf2_tumu(__VA_ARGS__)
+#define vnmsac_vv_i32m1_m(...) __riscv_vnmsac_vv_i32m1_tumu(__VA_ARGS__)
+#define vnmsac_vx_i32m1_m(...) __riscv_vnmsac_vx_i32m1_tumu(__VA_ARGS__)
+#define vnmsac_vv_i32m2_m(...) __riscv_vnmsac_vv_i32m2_tumu(__VA_ARGS__)
+#define vnmsac_vx_i32m2_m(...) __riscv_vnmsac_vx_i32m2_tumu(__VA_ARGS__)
+#define vnmsac_vv_i32m4_m(...) __riscv_vnmsac_vv_i32m4_tumu(__VA_ARGS__)
+#define vnmsac_vx_i32m4_m(...) __riscv_vnmsac_vx_i32m4_tumu(__VA_ARGS__)
+#define vnmsac_vv_i32m8_m(...) __riscv_vnmsac_vv_i32m8_tumu(__VA_ARGS__)
+#define vnmsac_vx_i32m8_m(...) __riscv_vnmsac_vx_i32m8_tumu(__VA_ARGS__)
+#define vnmsac_vv_i64m1_m(...) __riscv_vnmsac_vv_i64m1_tumu(__VA_ARGS__)
+#define vnmsac_vx_i64m1_m(...) __riscv_vnmsac_vx_i64m1_tumu(__VA_ARGS__)
+#define vnmsac_vv_i64m2_m(...) __riscv_vnmsac_vv_i64m2_tumu(__VA_ARGS__)
+#define vnmsac_vx_i64m2_m(...) __riscv_vnmsac_vx_i64m2_tumu(__VA_ARGS__)
+#define vnmsac_vv_i64m4_m(...) __riscv_vnmsac_vv_i64m4_tumu(__VA_ARGS__)
+#define vnmsac_vx_i64m4_m(...) __riscv_vnmsac_vx_i64m4_tumu(__VA_ARGS__)
+#define vnmsac_vv_i64m8_m(...) __riscv_vnmsac_vv_i64m8_tumu(__VA_ARGS__)
+#define vnmsac_vx_i64m8_m(...) __riscv_vnmsac_vx_i64m8_tumu(__VA_ARGS__)
+#define vmadd_vv_i8mf8_m(...) __riscv_vmadd_vv_i8mf8_tumu(__VA_ARGS__) // vmadd, signed (i8..i64), masked: each *_m alias forwards its arguments unchanged to the __riscv_-prefixed _tumu intrinsic
+#define vmadd_vx_i8mf8_m(...) __riscv_vmadd_vx_i8mf8_tumu(__VA_ARGS__)
+#define vmadd_vv_i8mf4_m(...) __riscv_vmadd_vv_i8mf4_tumu(__VA_ARGS__)
+#define vmadd_vx_i8mf4_m(...) __riscv_vmadd_vx_i8mf4_tumu(__VA_ARGS__)
+#define vmadd_vv_i8mf2_m(...) __riscv_vmadd_vv_i8mf2_tumu(__VA_ARGS__)
+#define vmadd_vx_i8mf2_m(...) __riscv_vmadd_vx_i8mf2_tumu(__VA_ARGS__)
+#define vmadd_vv_i8m1_m(...) __riscv_vmadd_vv_i8m1_tumu(__VA_ARGS__)
+#define vmadd_vx_i8m1_m(...) __riscv_vmadd_vx_i8m1_tumu(__VA_ARGS__)
+#define vmadd_vv_i8m2_m(...) __riscv_vmadd_vv_i8m2_tumu(__VA_ARGS__)
+#define vmadd_vx_i8m2_m(...) __riscv_vmadd_vx_i8m2_tumu(__VA_ARGS__)
+#define vmadd_vv_i8m4_m(...) __riscv_vmadd_vv_i8m4_tumu(__VA_ARGS__)
+#define vmadd_vx_i8m4_m(...) __riscv_vmadd_vx_i8m4_tumu(__VA_ARGS__)
+#define vmadd_vv_i8m8_m(...) __riscv_vmadd_vv_i8m8_tumu(__VA_ARGS__)
+#define vmadd_vx_i8m8_m(...) __riscv_vmadd_vx_i8m8_tumu(__VA_ARGS__)
+#define vmadd_vv_i16mf4_m(...) __riscv_vmadd_vv_i16mf4_tumu(__VA_ARGS__)
+#define vmadd_vx_i16mf4_m(...) __riscv_vmadd_vx_i16mf4_tumu(__VA_ARGS__)
+#define vmadd_vv_i16mf2_m(...) __riscv_vmadd_vv_i16mf2_tumu(__VA_ARGS__)
+#define vmadd_vx_i16mf2_m(...) __riscv_vmadd_vx_i16mf2_tumu(__VA_ARGS__)
+#define vmadd_vv_i16m1_m(...) __riscv_vmadd_vv_i16m1_tumu(__VA_ARGS__)
+#define vmadd_vx_i16m1_m(...) __riscv_vmadd_vx_i16m1_tumu(__VA_ARGS__)
+#define vmadd_vv_i16m2_m(...) __riscv_vmadd_vv_i16m2_tumu(__VA_ARGS__)
+#define vmadd_vx_i16m2_m(...) __riscv_vmadd_vx_i16m2_tumu(__VA_ARGS__)
+#define vmadd_vv_i16m4_m(...) __riscv_vmadd_vv_i16m4_tumu(__VA_ARGS__)
+#define vmadd_vx_i16m4_m(...) __riscv_vmadd_vx_i16m4_tumu(__VA_ARGS__)
+#define vmadd_vv_i16m8_m(...) __riscv_vmadd_vv_i16m8_tumu(__VA_ARGS__)
+#define vmadd_vx_i16m8_m(...) __riscv_vmadd_vx_i16m8_tumu(__VA_ARGS__)
+#define vmadd_vv_i32mf2_m(...) __riscv_vmadd_vv_i32mf2_tumu(__VA_ARGS__)
+#define vmadd_vx_i32mf2_m(...) __riscv_vmadd_vx_i32mf2_tumu(__VA_ARGS__)
+#define vmadd_vv_i32m1_m(...) __riscv_vmadd_vv_i32m1_tumu(__VA_ARGS__)
+#define vmadd_vx_i32m1_m(...) __riscv_vmadd_vx_i32m1_tumu(__VA_ARGS__)
+#define vmadd_vv_i32m2_m(...) __riscv_vmadd_vv_i32m2_tumu(__VA_ARGS__)
+#define vmadd_vx_i32m2_m(...) __riscv_vmadd_vx_i32m2_tumu(__VA_ARGS__)
+#define vmadd_vv_i32m4_m(...) __riscv_vmadd_vv_i32m4_tumu(__VA_ARGS__)
+#define vmadd_vx_i32m4_m(...) __riscv_vmadd_vx_i32m4_tumu(__VA_ARGS__)
+#define vmadd_vv_i32m8_m(...) __riscv_vmadd_vv_i32m8_tumu(__VA_ARGS__)
+#define vmadd_vx_i32m8_m(...) __riscv_vmadd_vx_i32m8_tumu(__VA_ARGS__)
+#define vmadd_vv_i64m1_m(...) __riscv_vmadd_vv_i64m1_tumu(__VA_ARGS__)
+#define vmadd_vx_i64m1_m(...) __riscv_vmadd_vx_i64m1_tumu(__VA_ARGS__)
+#define vmadd_vv_i64m2_m(...) __riscv_vmadd_vv_i64m2_tumu(__VA_ARGS__)
+#define vmadd_vx_i64m2_m(...) __riscv_vmadd_vx_i64m2_tumu(__VA_ARGS__)
+#define vmadd_vv_i64m4_m(...) __riscv_vmadd_vv_i64m4_tumu(__VA_ARGS__)
+#define vmadd_vx_i64m4_m(...) __riscv_vmadd_vx_i64m4_tumu(__VA_ARGS__)
+#define vmadd_vv_i64m8_m(...) __riscv_vmadd_vv_i64m8_tumu(__VA_ARGS__)
+#define vmadd_vx_i64m8_m(...) __riscv_vmadd_vx_i64m8_tumu(__VA_ARGS__)
+#define vnmsub_vv_i8mf8_m(...) __riscv_vnmsub_vv_i8mf8_tumu(__VA_ARGS__) // vnmsub, signed (i8..i64), masked: each *_m alias forwards its arguments unchanged to the __riscv_-prefixed _tumu intrinsic
+#define vnmsub_vx_i8mf8_m(...) __riscv_vnmsub_vx_i8mf8_tumu(__VA_ARGS__)
+#define vnmsub_vv_i8mf4_m(...) __riscv_vnmsub_vv_i8mf4_tumu(__VA_ARGS__)
+#define vnmsub_vx_i8mf4_m(...) __riscv_vnmsub_vx_i8mf4_tumu(__VA_ARGS__)
+#define vnmsub_vv_i8mf2_m(...) __riscv_vnmsub_vv_i8mf2_tumu(__VA_ARGS__)
+#define vnmsub_vx_i8mf2_m(...) __riscv_vnmsub_vx_i8mf2_tumu(__VA_ARGS__)
+#define vnmsub_vv_i8m1_m(...) __riscv_vnmsub_vv_i8m1_tumu(__VA_ARGS__)
+#define vnmsub_vx_i8m1_m(...) __riscv_vnmsub_vx_i8m1_tumu(__VA_ARGS__)
+#define vnmsub_vv_i8m2_m(...) __riscv_vnmsub_vv_i8m2_tumu(__VA_ARGS__)
+#define vnmsub_vx_i8m2_m(...) __riscv_vnmsub_vx_i8m2_tumu(__VA_ARGS__)
+#define vnmsub_vv_i8m4_m(...) __riscv_vnmsub_vv_i8m4_tumu(__VA_ARGS__)
+#define vnmsub_vx_i8m4_m(...) __riscv_vnmsub_vx_i8m4_tumu(__VA_ARGS__)
+#define vnmsub_vv_i8m8_m(...) __riscv_vnmsub_vv_i8m8_tumu(__VA_ARGS__)
+#define vnmsub_vx_i8m8_m(...) __riscv_vnmsub_vx_i8m8_tumu(__VA_ARGS__)
+#define vnmsub_vv_i16mf4_m(...) __riscv_vnmsub_vv_i16mf4_tumu(__VA_ARGS__)
+#define vnmsub_vx_i16mf4_m(...) __riscv_vnmsub_vx_i16mf4_tumu(__VA_ARGS__)
+#define vnmsub_vv_i16mf2_m(...) __riscv_vnmsub_vv_i16mf2_tumu(__VA_ARGS__)
+#define vnmsub_vx_i16mf2_m(...) __riscv_vnmsub_vx_i16mf2_tumu(__VA_ARGS__)
+#define vnmsub_vv_i16m1_m(...) __riscv_vnmsub_vv_i16m1_tumu(__VA_ARGS__)
+#define vnmsub_vx_i16m1_m(...) __riscv_vnmsub_vx_i16m1_tumu(__VA_ARGS__)
+#define vnmsub_vv_i16m2_m(...) __riscv_vnmsub_vv_i16m2_tumu(__VA_ARGS__)
+#define vnmsub_vx_i16m2_m(...) __riscv_vnmsub_vx_i16m2_tumu(__VA_ARGS__)
+#define vnmsub_vv_i16m4_m(...) __riscv_vnmsub_vv_i16m4_tumu(__VA_ARGS__)
+#define vnmsub_vx_i16m4_m(...) __riscv_vnmsub_vx_i16m4_tumu(__VA_ARGS__)
+#define vnmsub_vv_i16m8_m(...) __riscv_vnmsub_vv_i16m8_tumu(__VA_ARGS__)
+#define vnmsub_vx_i16m8_m(...) __riscv_vnmsub_vx_i16m8_tumu(__VA_ARGS__)
+#define vnmsub_vv_i32mf2_m(...) __riscv_vnmsub_vv_i32mf2_tumu(__VA_ARGS__)
+#define vnmsub_vx_i32mf2_m(...) __riscv_vnmsub_vx_i32mf2_tumu(__VA_ARGS__)
+#define vnmsub_vv_i32m1_m(...) __riscv_vnmsub_vv_i32m1_tumu(__VA_ARGS__)
+#define vnmsub_vx_i32m1_m(...) __riscv_vnmsub_vx_i32m1_tumu(__VA_ARGS__)
+#define vnmsub_vv_i32m2_m(...) __riscv_vnmsub_vv_i32m2_tumu(__VA_ARGS__)
+#define vnmsub_vx_i32m2_m(...) __riscv_vnmsub_vx_i32m2_tumu(__VA_ARGS__)
+#define vnmsub_vv_i32m4_m(...) __riscv_vnmsub_vv_i32m4_tumu(__VA_ARGS__)
+#define vnmsub_vx_i32m4_m(...) __riscv_vnmsub_vx_i32m4_tumu(__VA_ARGS__)
+#define vnmsub_vv_i32m8_m(...) __riscv_vnmsub_vv_i32m8_tumu(__VA_ARGS__)
+#define vnmsub_vx_i32m8_m(...) __riscv_vnmsub_vx_i32m8_tumu(__VA_ARGS__)
+#define vnmsub_vv_i64m1_m(...) __riscv_vnmsub_vv_i64m1_tumu(__VA_ARGS__)
+#define vnmsub_vx_i64m1_m(...) __riscv_vnmsub_vx_i64m1_tumu(__VA_ARGS__)
+#define vnmsub_vv_i64m2_m(...) __riscv_vnmsub_vv_i64m2_tumu(__VA_ARGS__)
+#define vnmsub_vx_i64m2_m(...) __riscv_vnmsub_vx_i64m2_tumu(__VA_ARGS__)
+#define vnmsub_vv_i64m4_m(...) __riscv_vnmsub_vv_i64m4_tumu(__VA_ARGS__)
+#define vnmsub_vx_i64m4_m(...) __riscv_vnmsub_vx_i64m4_tumu(__VA_ARGS__)
+#define vnmsub_vv_i64m8_m(...) __riscv_vnmsub_vv_i64m8_tumu(__VA_ARGS__)
+#define vnmsub_vx_i64m8_m(...) __riscv_vnmsub_vx_i64m8_tumu(__VA_ARGS__)
+#define vmacc_vv_u8mf8_m(...) __riscv_vmacc_vv_u8mf8_tumu(__VA_ARGS__) // vmacc, unsigned (u8..u64), masked: each *_m alias forwards its arguments unchanged to the __riscv_-prefixed _tumu intrinsic
+#define vmacc_vx_u8mf8_m(...) __riscv_vmacc_vx_u8mf8_tumu(__VA_ARGS__)
+#define vmacc_vv_u8mf4_m(...) __riscv_vmacc_vv_u8mf4_tumu(__VA_ARGS__)
+#define vmacc_vx_u8mf4_m(...) __riscv_vmacc_vx_u8mf4_tumu(__VA_ARGS__)
+#define vmacc_vv_u8mf2_m(...) __riscv_vmacc_vv_u8mf2_tumu(__VA_ARGS__)
+#define vmacc_vx_u8mf2_m(...) __riscv_vmacc_vx_u8mf2_tumu(__VA_ARGS__)
+#define vmacc_vv_u8m1_m(...) __riscv_vmacc_vv_u8m1_tumu(__VA_ARGS__)
+#define vmacc_vx_u8m1_m(...) __riscv_vmacc_vx_u8m1_tumu(__VA_ARGS__)
+#define vmacc_vv_u8m2_m(...) __riscv_vmacc_vv_u8m2_tumu(__VA_ARGS__)
+#define vmacc_vx_u8m2_m(...) __riscv_vmacc_vx_u8m2_tumu(__VA_ARGS__)
+#define vmacc_vv_u8m4_m(...) __riscv_vmacc_vv_u8m4_tumu(__VA_ARGS__)
+#define vmacc_vx_u8m4_m(...) __riscv_vmacc_vx_u8m4_tumu(__VA_ARGS__)
+#define vmacc_vv_u8m8_m(...) __riscv_vmacc_vv_u8m8_tumu(__VA_ARGS__)
+#define vmacc_vx_u8m8_m(...) __riscv_vmacc_vx_u8m8_tumu(__VA_ARGS__)
+#define vmacc_vv_u16mf4_m(...) __riscv_vmacc_vv_u16mf4_tumu(__VA_ARGS__)
+#define vmacc_vx_u16mf4_m(...) __riscv_vmacc_vx_u16mf4_tumu(__VA_ARGS__)
+#define vmacc_vv_u16mf2_m(...) __riscv_vmacc_vv_u16mf2_tumu(__VA_ARGS__)
+#define vmacc_vx_u16mf2_m(...) __riscv_vmacc_vx_u16mf2_tumu(__VA_ARGS__)
+#define vmacc_vv_u16m1_m(...) __riscv_vmacc_vv_u16m1_tumu(__VA_ARGS__)
+#define vmacc_vx_u16m1_m(...) __riscv_vmacc_vx_u16m1_tumu(__VA_ARGS__)
+#define vmacc_vv_u16m2_m(...) __riscv_vmacc_vv_u16m2_tumu(__VA_ARGS__)
+#define vmacc_vx_u16m2_m(...) __riscv_vmacc_vx_u16m2_tumu(__VA_ARGS__)
+#define vmacc_vv_u16m4_m(...) __riscv_vmacc_vv_u16m4_tumu(__VA_ARGS__)
+#define vmacc_vx_u16m4_m(...) __riscv_vmacc_vx_u16m4_tumu(__VA_ARGS__)
+#define vmacc_vv_u16m8_m(...) __riscv_vmacc_vv_u16m8_tumu(__VA_ARGS__)
+#define vmacc_vx_u16m8_m(...) __riscv_vmacc_vx_u16m8_tumu(__VA_ARGS__)
+#define vmacc_vv_u32mf2_m(...) __riscv_vmacc_vv_u32mf2_tumu(__VA_ARGS__)
+#define vmacc_vx_u32mf2_m(...) __riscv_vmacc_vx_u32mf2_tumu(__VA_ARGS__)
+#define vmacc_vv_u32m1_m(...) __riscv_vmacc_vv_u32m1_tumu(__VA_ARGS__)
+#define vmacc_vx_u32m1_m(...) __riscv_vmacc_vx_u32m1_tumu(__VA_ARGS__)
+#define vmacc_vv_u32m2_m(...) __riscv_vmacc_vv_u32m2_tumu(__VA_ARGS__)
+#define vmacc_vx_u32m2_m(...) __riscv_vmacc_vx_u32m2_tumu(__VA_ARGS__)
+#define vmacc_vv_u32m4_m(...) __riscv_vmacc_vv_u32m4_tumu(__VA_ARGS__)
+#define vmacc_vx_u32m4_m(...) __riscv_vmacc_vx_u32m4_tumu(__VA_ARGS__)
+#define vmacc_vv_u32m8_m(...) __riscv_vmacc_vv_u32m8_tumu(__VA_ARGS__)
+#define vmacc_vx_u32m8_m(...) __riscv_vmacc_vx_u32m8_tumu(__VA_ARGS__)
+#define vmacc_vv_u64m1_m(...) __riscv_vmacc_vv_u64m1_tumu(__VA_ARGS__)
+#define vmacc_vx_u64m1_m(...) __riscv_vmacc_vx_u64m1_tumu(__VA_ARGS__)
+#define vmacc_vv_u64m2_m(...) __riscv_vmacc_vv_u64m2_tumu(__VA_ARGS__)
+#define vmacc_vx_u64m2_m(...) __riscv_vmacc_vx_u64m2_tumu(__VA_ARGS__)
+#define vmacc_vv_u64m4_m(...) __riscv_vmacc_vv_u64m4_tumu(__VA_ARGS__)
+#define vmacc_vx_u64m4_m(...) __riscv_vmacc_vx_u64m4_tumu(__VA_ARGS__)
+#define vmacc_vv_u64m8_m(...) __riscv_vmacc_vv_u64m8_tumu(__VA_ARGS__)
+#define vmacc_vx_u64m8_m(...) __riscv_vmacc_vx_u64m8_tumu(__VA_ARGS__)
+#define vnmsac_vv_u8mf8_m(...) __riscv_vnmsac_vv_u8mf8_tumu(__VA_ARGS__) // vnmsac, unsigned (u8..u64), masked: each *_m alias forwards its arguments unchanged to the __riscv_-prefixed _tumu intrinsic
+#define vnmsac_vx_u8mf8_m(...) __riscv_vnmsac_vx_u8mf8_tumu(__VA_ARGS__)
+#define vnmsac_vv_u8mf4_m(...) __riscv_vnmsac_vv_u8mf4_tumu(__VA_ARGS__)
+#define vnmsac_vx_u8mf4_m(...) __riscv_vnmsac_vx_u8mf4_tumu(__VA_ARGS__)
+#define vnmsac_vv_u8mf2_m(...) __riscv_vnmsac_vv_u8mf2_tumu(__VA_ARGS__)
+#define vnmsac_vx_u8mf2_m(...) __riscv_vnmsac_vx_u8mf2_tumu(__VA_ARGS__)
+#define vnmsac_vv_u8m1_m(...) __riscv_vnmsac_vv_u8m1_tumu(__VA_ARGS__)
+#define vnmsac_vx_u8m1_m(...) __riscv_vnmsac_vx_u8m1_tumu(__VA_ARGS__)
+#define vnmsac_vv_u8m2_m(...) __riscv_vnmsac_vv_u8m2_tumu(__VA_ARGS__)
+#define vnmsac_vx_u8m2_m(...) __riscv_vnmsac_vx_u8m2_tumu(__VA_ARGS__)
+#define vnmsac_vv_u8m4_m(...) __riscv_vnmsac_vv_u8m4_tumu(__VA_ARGS__)
+#define vnmsac_vx_u8m4_m(...) __riscv_vnmsac_vx_u8m4_tumu(__VA_ARGS__)
+#define vnmsac_vv_u8m8_m(...) __riscv_vnmsac_vv_u8m8_tumu(__VA_ARGS__)
+#define vnmsac_vx_u8m8_m(...) __riscv_vnmsac_vx_u8m8_tumu(__VA_ARGS__)
+#define vnmsac_vv_u16mf4_m(...) __riscv_vnmsac_vv_u16mf4_tumu(__VA_ARGS__)
+#define vnmsac_vx_u16mf4_m(...) __riscv_vnmsac_vx_u16mf4_tumu(__VA_ARGS__)
+#define vnmsac_vv_u16mf2_m(...) __riscv_vnmsac_vv_u16mf2_tumu(__VA_ARGS__)
+#define vnmsac_vx_u16mf2_m(...) __riscv_vnmsac_vx_u16mf2_tumu(__VA_ARGS__)
+#define vnmsac_vv_u16m1_m(...) __riscv_vnmsac_vv_u16m1_tumu(__VA_ARGS__)
+#define vnmsac_vx_u16m1_m(...) __riscv_vnmsac_vx_u16m1_tumu(__VA_ARGS__)
+#define vnmsac_vv_u16m2_m(...) __riscv_vnmsac_vv_u16m2_tumu(__VA_ARGS__)
+#define vnmsac_vx_u16m2_m(...) __riscv_vnmsac_vx_u16m2_tumu(__VA_ARGS__)
+#define vnmsac_vv_u16m4_m(...) __riscv_vnmsac_vv_u16m4_tumu(__VA_ARGS__)
+#define vnmsac_vx_u16m4_m(...) __riscv_vnmsac_vx_u16m4_tumu(__VA_ARGS__)
+#define vnmsac_vv_u16m8_m(...) __riscv_vnmsac_vv_u16m8_tumu(__VA_ARGS__)
+#define vnmsac_vx_u16m8_m(...) __riscv_vnmsac_vx_u16m8_tumu(__VA_ARGS__)
+#define vnmsac_vv_u32mf2_m(...) __riscv_vnmsac_vv_u32mf2_tumu(__VA_ARGS__)
+#define vnmsac_vx_u32mf2_m(...) __riscv_vnmsac_vx_u32mf2_tumu(__VA_ARGS__)
+#define vnmsac_vv_u32m1_m(...) __riscv_vnmsac_vv_u32m1_tumu(__VA_ARGS__)
+#define vnmsac_vx_u32m1_m(...) __riscv_vnmsac_vx_u32m1_tumu(__VA_ARGS__)
+#define vnmsac_vv_u32m2_m(...) __riscv_vnmsac_vv_u32m2_tumu(__VA_ARGS__)
+#define vnmsac_vx_u32m2_m(...) __riscv_vnmsac_vx_u32m2_tumu(__VA_ARGS__)
+#define vnmsac_vv_u32m4_m(...) __riscv_vnmsac_vv_u32m4_tumu(__VA_ARGS__)
+#define vnmsac_vx_u32m4_m(...) __riscv_vnmsac_vx_u32m4_tumu(__VA_ARGS__)
+#define vnmsac_vv_u32m8_m(...) __riscv_vnmsac_vv_u32m8_tumu(__VA_ARGS__)
+#define vnmsac_vx_u32m8_m(...) __riscv_vnmsac_vx_u32m8_tumu(__VA_ARGS__)
+#define vnmsac_vv_u64m1_m(...) __riscv_vnmsac_vv_u64m1_tumu(__VA_ARGS__)
+#define vnmsac_vx_u64m1_m(...) __riscv_vnmsac_vx_u64m1_tumu(__VA_ARGS__)
+#define vnmsac_vv_u64m2_m(...) __riscv_vnmsac_vv_u64m2_tumu(__VA_ARGS__)
+#define vnmsac_vx_u64m2_m(...) __riscv_vnmsac_vx_u64m2_tumu(__VA_ARGS__)
+#define vnmsac_vv_u64m4_m(...) __riscv_vnmsac_vv_u64m4_tumu(__VA_ARGS__)
+#define vnmsac_vx_u64m4_m(...) __riscv_vnmsac_vx_u64m4_tumu(__VA_ARGS__)
+#define vnmsac_vv_u64m8_m(...) __riscv_vnmsac_vv_u64m8_tumu(__VA_ARGS__)
+#define vnmsac_vx_u64m8_m(...) __riscv_vnmsac_vx_u64m8_tumu(__VA_ARGS__)
+#define vmadd_vv_u8mf8_m(...) __riscv_vmadd_vv_u8mf8_tumu(__VA_ARGS__) // vmadd, unsigned (u8..u64), masked: each *_m alias forwards its arguments unchanged to the __riscv_-prefixed _tumu intrinsic
+#define vmadd_vx_u8mf8_m(...) __riscv_vmadd_vx_u8mf8_tumu(__VA_ARGS__)
+#define vmadd_vv_u8mf4_m(...) __riscv_vmadd_vv_u8mf4_tumu(__VA_ARGS__)
+#define vmadd_vx_u8mf4_m(...) __riscv_vmadd_vx_u8mf4_tumu(__VA_ARGS__)
+#define vmadd_vv_u8mf2_m(...) __riscv_vmadd_vv_u8mf2_tumu(__VA_ARGS__)
+#define vmadd_vx_u8mf2_m(...) __riscv_vmadd_vx_u8mf2_tumu(__VA_ARGS__)
+#define vmadd_vv_u8m1_m(...) __riscv_vmadd_vv_u8m1_tumu(__VA_ARGS__)
+#define vmadd_vx_u8m1_m(...) __riscv_vmadd_vx_u8m1_tumu(__VA_ARGS__)
+#define vmadd_vv_u8m2_m(...) __riscv_vmadd_vv_u8m2_tumu(__VA_ARGS__)
+#define vmadd_vx_u8m2_m(...) __riscv_vmadd_vx_u8m2_tumu(__VA_ARGS__)
+#define vmadd_vv_u8m4_m(...) __riscv_vmadd_vv_u8m4_tumu(__VA_ARGS__)
+#define vmadd_vx_u8m4_m(...) __riscv_vmadd_vx_u8m4_tumu(__VA_ARGS__)
+#define vmadd_vv_u8m8_m(...) __riscv_vmadd_vv_u8m8_tumu(__VA_ARGS__)
+#define vmadd_vx_u8m8_m(...) __riscv_vmadd_vx_u8m8_tumu(__VA_ARGS__)
+#define vmadd_vv_u16mf4_m(...) __riscv_vmadd_vv_u16mf4_tumu(__VA_ARGS__)
+#define vmadd_vx_u16mf4_m(...) __riscv_vmadd_vx_u16mf4_tumu(__VA_ARGS__)
+#define vmadd_vv_u16mf2_m(...) __riscv_vmadd_vv_u16mf2_tumu(__VA_ARGS__)
+#define vmadd_vx_u16mf2_m(...) __riscv_vmadd_vx_u16mf2_tumu(__VA_ARGS__)
+#define vmadd_vv_u16m1_m(...) __riscv_vmadd_vv_u16m1_tumu(__VA_ARGS__)
+#define vmadd_vx_u16m1_m(...) __riscv_vmadd_vx_u16m1_tumu(__VA_ARGS__)
+#define vmadd_vv_u16m2_m(...) __riscv_vmadd_vv_u16m2_tumu(__VA_ARGS__)
+#define vmadd_vx_u16m2_m(...) __riscv_vmadd_vx_u16m2_tumu(__VA_ARGS__)
+#define vmadd_vv_u16m4_m(...) __riscv_vmadd_vv_u16m4_tumu(__VA_ARGS__)
+#define vmadd_vx_u16m4_m(...) __riscv_vmadd_vx_u16m4_tumu(__VA_ARGS__)
+#define vmadd_vv_u16m8_m(...) __riscv_vmadd_vv_u16m8_tumu(__VA_ARGS__)
+#define vmadd_vx_u16m8_m(...) __riscv_vmadd_vx_u16m8_tumu(__VA_ARGS__)
+#define vmadd_vv_u32mf2_m(...) __riscv_vmadd_vv_u32mf2_tumu(__VA_ARGS__)
+#define vmadd_vx_u32mf2_m(...) __riscv_vmadd_vx_u32mf2_tumu(__VA_ARGS__)
+#define vmadd_vv_u32m1_m(...) __riscv_vmadd_vv_u32m1_tumu(__VA_ARGS__)
+#define vmadd_vx_u32m1_m(...) __riscv_vmadd_vx_u32m1_tumu(__VA_ARGS__)
+#define vmadd_vv_u32m2_m(...) __riscv_vmadd_vv_u32m2_tumu(__VA_ARGS__)
+#define vmadd_vx_u32m2_m(...) __riscv_vmadd_vx_u32m2_tumu(__VA_ARGS__)
+#define vmadd_vv_u32m4_m(...) __riscv_vmadd_vv_u32m4_tumu(__VA_ARGS__)
+#define vmadd_vx_u32m4_m(...) __riscv_vmadd_vx_u32m4_tumu(__VA_ARGS__)
+#define vmadd_vv_u32m8_m(...) __riscv_vmadd_vv_u32m8_tumu(__VA_ARGS__)
+#define vmadd_vx_u32m8_m(...) __riscv_vmadd_vx_u32m8_tumu(__VA_ARGS__)
+#define vmadd_vv_u64m1_m(...) __riscv_vmadd_vv_u64m1_tumu(__VA_ARGS__)
+#define vmadd_vx_u64m1_m(...) __riscv_vmadd_vx_u64m1_tumu(__VA_ARGS__)
+#define vmadd_vv_u64m2_m(...) __riscv_vmadd_vv_u64m2_tumu(__VA_ARGS__)
+#define vmadd_vx_u64m2_m(...) __riscv_vmadd_vx_u64m2_tumu(__VA_ARGS__)
+#define vmadd_vv_u64m4_m(...) __riscv_vmadd_vv_u64m4_tumu(__VA_ARGS__)
+#define vmadd_vx_u64m4_m(...) __riscv_vmadd_vx_u64m4_tumu(__VA_ARGS__)
+#define vmadd_vv_u64m8_m(...) __riscv_vmadd_vv_u64m8_tumu(__VA_ARGS__)
+#define vmadd_vx_u64m8_m(...) __riscv_vmadd_vx_u64m8_tumu(__VA_ARGS__)
+#define vnmsub_vv_u8mf8_m(...) __riscv_vnmsub_vv_u8mf8_tumu(__VA_ARGS__) // vnmsub, unsigned (u8..u64), masked: each *_m alias forwards its arguments unchanged to the __riscv_-prefixed _tumu intrinsic
+#define vnmsub_vx_u8mf8_m(...) __riscv_vnmsub_vx_u8mf8_tumu(__VA_ARGS__)
+#define vnmsub_vv_u8mf4_m(...) __riscv_vnmsub_vv_u8mf4_tumu(__VA_ARGS__)
+#define vnmsub_vx_u8mf4_m(...) __riscv_vnmsub_vx_u8mf4_tumu(__VA_ARGS__)
+#define vnmsub_vv_u8mf2_m(...) __riscv_vnmsub_vv_u8mf2_tumu(__VA_ARGS__)
+#define vnmsub_vx_u8mf2_m(...) __riscv_vnmsub_vx_u8mf2_tumu(__VA_ARGS__)
+#define vnmsub_vv_u8m1_m(...) __riscv_vnmsub_vv_u8m1_tumu(__VA_ARGS__)
+#define vnmsub_vx_u8m1_m(...) __riscv_vnmsub_vx_u8m1_tumu(__VA_ARGS__)
+#define vnmsub_vv_u8m2_m(...) __riscv_vnmsub_vv_u8m2_tumu(__VA_ARGS__)
+#define vnmsub_vx_u8m2_m(...) __riscv_vnmsub_vx_u8m2_tumu(__VA_ARGS__)
+#define vnmsub_vv_u8m4_m(...) __riscv_vnmsub_vv_u8m4_tumu(__VA_ARGS__)
+#define vnmsub_vx_u8m4_m(...) __riscv_vnmsub_vx_u8m4_tumu(__VA_ARGS__)
+#define vnmsub_vv_u8m8_m(...) __riscv_vnmsub_vv_u8m8_tumu(__VA_ARGS__)
+#define vnmsub_vx_u8m8_m(...) __riscv_vnmsub_vx_u8m8_tumu(__VA_ARGS__)
+#define vnmsub_vv_u16mf4_m(...) __riscv_vnmsub_vv_u16mf4_tumu(__VA_ARGS__)
+#define vnmsub_vx_u16mf4_m(...) __riscv_vnmsub_vx_u16mf4_tumu(__VA_ARGS__)
+#define vnmsub_vv_u16mf2_m(...) __riscv_vnmsub_vv_u16mf2_tumu(__VA_ARGS__)
+#define vnmsub_vx_u16mf2_m(...) __riscv_vnmsub_vx_u16mf2_tumu(__VA_ARGS__)
+#define vnmsub_vv_u16m1_m(...) __riscv_vnmsub_vv_u16m1_tumu(__VA_ARGS__)
+#define vnmsub_vx_u16m1_m(...) __riscv_vnmsub_vx_u16m1_tumu(__VA_ARGS__)
+#define vnmsub_vv_u16m2_m(...) __riscv_vnmsub_vv_u16m2_tumu(__VA_ARGS__)
+#define vnmsub_vx_u16m2_m(...) __riscv_vnmsub_vx_u16m2_tumu(__VA_ARGS__)
+#define vnmsub_vv_u16m4_m(...) __riscv_vnmsub_vv_u16m4_tumu(__VA_ARGS__)
+#define vnmsub_vx_u16m4_m(...) __riscv_vnmsub_vx_u16m4_tumu(__VA_ARGS__)
+#define vnmsub_vv_u16m8_m(...) __riscv_vnmsub_vv_u16m8_tumu(__VA_ARGS__)
+#define vnmsub_vx_u16m8_m(...) __riscv_vnmsub_vx_u16m8_tumu(__VA_ARGS__)
+#define vnmsub_vv_u32mf2_m(...) __riscv_vnmsub_vv_u32mf2_tumu(__VA_ARGS__)
+#define vnmsub_vx_u32mf2_m(...) __riscv_vnmsub_vx_u32mf2_tumu(__VA_ARGS__)
+#define vnmsub_vv_u32m1_m(...) __riscv_vnmsub_vv_u32m1_tumu(__VA_ARGS__)
+#define vnmsub_vx_u32m1_m(...) __riscv_vnmsub_vx_u32m1_tumu(__VA_ARGS__)
+#define vnmsub_vv_u32m2_m(...) __riscv_vnmsub_vv_u32m2_tumu(__VA_ARGS__)
+#define vnmsub_vx_u32m2_m(...) __riscv_vnmsub_vx_u32m2_tumu(__VA_ARGS__)
+#define vnmsub_vv_u32m4_m(...) __riscv_vnmsub_vv_u32m4_tumu(__VA_ARGS__)
+#define vnmsub_vx_u32m4_m(...) __riscv_vnmsub_vx_u32m4_tumu(__VA_ARGS__)
+#define vnmsub_vv_u32m8_m(...) __riscv_vnmsub_vv_u32m8_tumu(__VA_ARGS__)
+#define vnmsub_vx_u32m8_m(...) __riscv_vnmsub_vx_u32m8_tumu(__VA_ARGS__)
+#define vnmsub_vv_u64m1_m(...) __riscv_vnmsub_vv_u64m1_tumu(__VA_ARGS__)
+#define vnmsub_vx_u64m1_m(...) __riscv_vnmsub_vx_u64m1_tumu(__VA_ARGS__)
+#define vnmsub_vv_u64m2_m(...) __riscv_vnmsub_vv_u64m2_tumu(__VA_ARGS__)
+#define vnmsub_vx_u64m2_m(...) __riscv_vnmsub_vx_u64m2_tumu(__VA_ARGS__)
+#define vnmsub_vv_u64m4_m(...) __riscv_vnmsub_vv_u64m4_tumu(__VA_ARGS__)
+#define vnmsub_vx_u64m4_m(...) __riscv_vnmsub_vx_u64m4_tumu(__VA_ARGS__)
+#define vnmsub_vv_u64m8_m(...) __riscv_vnmsub_vv_u64m8_tumu(__VA_ARGS__)
+#define vnmsub_vx_u64m8_m(...) __riscv_vnmsub_vx_u64m8_tumu(__VA_ARGS__)
+#define vwmacc_vv_i16mf4(...) __riscv_vwmacc_vv_i16mf4_tu(__VA_ARGS__) // vwmacc (type suffixes i16mf4..i64m8), unmasked: each unprefixed alias forwards its arguments unchanged to the __riscv_-prefixed _tu intrinsic
+#define vwmacc_vx_i16mf4(...) __riscv_vwmacc_vx_i16mf4_tu(__VA_ARGS__)
+#define vwmacc_vv_i16mf2(...) __riscv_vwmacc_vv_i16mf2_tu(__VA_ARGS__)
+#define vwmacc_vx_i16mf2(...) __riscv_vwmacc_vx_i16mf2_tu(__VA_ARGS__)
+#define vwmacc_vv_i16m1(...) __riscv_vwmacc_vv_i16m1_tu(__VA_ARGS__)
+#define vwmacc_vx_i16m1(...) __riscv_vwmacc_vx_i16m1_tu(__VA_ARGS__)
+#define vwmacc_vv_i16m2(...) __riscv_vwmacc_vv_i16m2_tu(__VA_ARGS__)
+#define vwmacc_vx_i16m2(...) __riscv_vwmacc_vx_i16m2_tu(__VA_ARGS__)
+#define vwmacc_vv_i16m4(...) __riscv_vwmacc_vv_i16m4_tu(__VA_ARGS__)
+#define vwmacc_vx_i16m4(...) __riscv_vwmacc_vx_i16m4_tu(__VA_ARGS__)
+#define vwmacc_vv_i16m8(...) __riscv_vwmacc_vv_i16m8_tu(__VA_ARGS__)
+#define vwmacc_vx_i16m8(...) __riscv_vwmacc_vx_i16m8_tu(__VA_ARGS__)
+#define vwmacc_vv_i32mf2(...) __riscv_vwmacc_vv_i32mf2_tu(__VA_ARGS__)
+#define vwmacc_vx_i32mf2(...) __riscv_vwmacc_vx_i32mf2_tu(__VA_ARGS__)
+#define vwmacc_vv_i32m1(...) __riscv_vwmacc_vv_i32m1_tu(__VA_ARGS__)
+#define vwmacc_vx_i32m1(...) __riscv_vwmacc_vx_i32m1_tu(__VA_ARGS__)
+#define vwmacc_vv_i32m2(...) __riscv_vwmacc_vv_i32m2_tu(__VA_ARGS__)
+#define vwmacc_vx_i32m2(...) __riscv_vwmacc_vx_i32m2_tu(__VA_ARGS__)
+#define vwmacc_vv_i32m4(...) __riscv_vwmacc_vv_i32m4_tu(__VA_ARGS__)
+#define vwmacc_vx_i32m4(...) __riscv_vwmacc_vx_i32m4_tu(__VA_ARGS__)
+#define vwmacc_vv_i32m8(...) __riscv_vwmacc_vv_i32m8_tu(__VA_ARGS__)
+#define vwmacc_vx_i32m8(...) __riscv_vwmacc_vx_i32m8_tu(__VA_ARGS__)
+#define vwmacc_vv_i64m1(...) __riscv_vwmacc_vv_i64m1_tu(__VA_ARGS__)
+#define vwmacc_vx_i64m1(...) __riscv_vwmacc_vx_i64m1_tu(__VA_ARGS__)
+#define vwmacc_vv_i64m2(...) __riscv_vwmacc_vv_i64m2_tu(__VA_ARGS__)
+#define vwmacc_vx_i64m2(...) __riscv_vwmacc_vx_i64m2_tu(__VA_ARGS__)
+#define vwmacc_vv_i64m4(...) __riscv_vwmacc_vv_i64m4_tu(__VA_ARGS__)
+#define vwmacc_vx_i64m4(...) __riscv_vwmacc_vx_i64m4_tu(__VA_ARGS__)
+#define vwmacc_vv_i64m8(...) __riscv_vwmacc_vv_i64m8_tu(__VA_ARGS__)
+#define vwmacc_vx_i64m8(...) __riscv_vwmacc_vx_i64m8_tu(__VA_ARGS__)
+#define vwmaccsu_vv_i16mf4(...) __riscv_vwmaccsu_vv_i16mf4_tu(__VA_ARGS__) // vwmaccsu (type suffixes i16mf4..i64m8), unmasked: each unprefixed alias forwards its arguments unchanged to the __riscv_-prefixed _tu intrinsic
+#define vwmaccsu_vx_i16mf4(...) __riscv_vwmaccsu_vx_i16mf4_tu(__VA_ARGS__)
+#define vwmaccsu_vv_i16mf2(...) __riscv_vwmaccsu_vv_i16mf2_tu(__VA_ARGS__)
+#define vwmaccsu_vx_i16mf2(...) __riscv_vwmaccsu_vx_i16mf2_tu(__VA_ARGS__)
+#define vwmaccsu_vv_i16m1(...) __riscv_vwmaccsu_vv_i16m1_tu(__VA_ARGS__)
+#define vwmaccsu_vx_i16m1(...) __riscv_vwmaccsu_vx_i16m1_tu(__VA_ARGS__)
+#define vwmaccsu_vv_i16m2(...) __riscv_vwmaccsu_vv_i16m2_tu(__VA_ARGS__)
+#define vwmaccsu_vx_i16m2(...) __riscv_vwmaccsu_vx_i16m2_tu(__VA_ARGS__)
+#define vwmaccsu_vv_i16m4(...) __riscv_vwmaccsu_vv_i16m4_tu(__VA_ARGS__)
+#define vwmaccsu_vx_i16m4(...) __riscv_vwmaccsu_vx_i16m4_tu(__VA_ARGS__)
+#define vwmaccsu_vv_i16m8(...) __riscv_vwmaccsu_vv_i16m8_tu(__VA_ARGS__)
+#define vwmaccsu_vx_i16m8(...) __riscv_vwmaccsu_vx_i16m8_tu(__VA_ARGS__)
+#define vwmaccsu_vv_i32mf2(...) __riscv_vwmaccsu_vv_i32mf2_tu(__VA_ARGS__)
+#define vwmaccsu_vx_i32mf2(...) __riscv_vwmaccsu_vx_i32mf2_tu(__VA_ARGS__)
+#define vwmaccsu_vv_i32m1(...) __riscv_vwmaccsu_vv_i32m1_tu(__VA_ARGS__)
+#define vwmaccsu_vx_i32m1(...) __riscv_vwmaccsu_vx_i32m1_tu(__VA_ARGS__)
+#define vwmaccsu_vv_i32m2(...) __riscv_vwmaccsu_vv_i32m2_tu(__VA_ARGS__)
+#define vwmaccsu_vx_i32m2(...) __riscv_vwmaccsu_vx_i32m2_tu(__VA_ARGS__)
+#define vwmaccsu_vv_i32m4(...) __riscv_vwmaccsu_vv_i32m4_tu(__VA_ARGS__)
+#define vwmaccsu_vx_i32m4(...) __riscv_vwmaccsu_vx_i32m4_tu(__VA_ARGS__)
+#define vwmaccsu_vv_i32m8(...) __riscv_vwmaccsu_vv_i32m8_tu(__VA_ARGS__)
+#define vwmaccsu_vx_i32m8(...) __riscv_vwmaccsu_vx_i32m8_tu(__VA_ARGS__)
+#define vwmaccsu_vv_i64m1(...) __riscv_vwmaccsu_vv_i64m1_tu(__VA_ARGS__)
+#define vwmaccsu_vx_i64m1(...) __riscv_vwmaccsu_vx_i64m1_tu(__VA_ARGS__)
+#define vwmaccsu_vv_i64m2(...) __riscv_vwmaccsu_vv_i64m2_tu(__VA_ARGS__)
+#define vwmaccsu_vx_i64m2(...) __riscv_vwmaccsu_vx_i64m2_tu(__VA_ARGS__)
+#define vwmaccsu_vv_i64m4(...) __riscv_vwmaccsu_vv_i64m4_tu(__VA_ARGS__)
+#define vwmaccsu_vx_i64m4(...) __riscv_vwmaccsu_vx_i64m4_tu(__VA_ARGS__)
+#define vwmaccsu_vv_i64m8(...) __riscv_vwmaccsu_vv_i64m8_tu(__VA_ARGS__)
+#define vwmaccsu_vx_i64m8(...) __riscv_vwmaccsu_vx_i64m8_tu(__VA_ARGS__)
+#define vwmaccus_vx_i16mf4(...) __riscv_vwmaccus_vx_i16mf4_tu(__VA_ARGS__) // vwmaccus_*: only _vx forms are mapped here; each forwards all arguments to the matching __riscv_*_tu intrinsic
+#define vwmaccus_vx_i16mf2(...) __riscv_vwmaccus_vx_i16mf2_tu(__VA_ARGS__)
+#define vwmaccus_vx_i16m1(...) __riscv_vwmaccus_vx_i16m1_tu(__VA_ARGS__)
+#define vwmaccus_vx_i16m2(...) __riscv_vwmaccus_vx_i16m2_tu(__VA_ARGS__)
+#define vwmaccus_vx_i16m4(...) __riscv_vwmaccus_vx_i16m4_tu(__VA_ARGS__)
+#define vwmaccus_vx_i16m8(...) __riscv_vwmaccus_vx_i16m8_tu(__VA_ARGS__)
+#define vwmaccus_vx_i32mf2(...) __riscv_vwmaccus_vx_i32mf2_tu(__VA_ARGS__)
+#define vwmaccus_vx_i32m1(...) __riscv_vwmaccus_vx_i32m1_tu(__VA_ARGS__)
+#define vwmaccus_vx_i32m2(...) __riscv_vwmaccus_vx_i32m2_tu(__VA_ARGS__)
+#define vwmaccus_vx_i32m4(...) __riscv_vwmaccus_vx_i32m4_tu(__VA_ARGS__)
+#define vwmaccus_vx_i32m8(...) __riscv_vwmaccus_vx_i32m8_tu(__VA_ARGS__)
+#define vwmaccus_vx_i64m1(...) __riscv_vwmaccus_vx_i64m1_tu(__VA_ARGS__)
+#define vwmaccus_vx_i64m2(...) __riscv_vwmaccus_vx_i64m2_tu(__VA_ARGS__)
+#define vwmaccus_vx_i64m4(...) __riscv_vwmaccus_vx_i64m4_tu(__VA_ARGS__)
+#define vwmaccus_vx_i64m8(...) __riscv_vwmaccus_vx_i64m8_tu(__VA_ARGS__)
+#define vwmaccu_vv_u16mf4(...) __riscv_vwmaccu_vv_u16mf4_tu(__VA_ARGS__) // vwmaccu_*: unmasked legacy names forward all arguments to the matching __riscv_*_tu intrinsic
+#define vwmaccu_vx_u16mf4(...) __riscv_vwmaccu_vx_u16mf4_tu(__VA_ARGS__)
+#define vwmaccu_vv_u16mf2(...) __riscv_vwmaccu_vv_u16mf2_tu(__VA_ARGS__)
+#define vwmaccu_vx_u16mf2(...) __riscv_vwmaccu_vx_u16mf2_tu(__VA_ARGS__)
+#define vwmaccu_vv_u16m1(...) __riscv_vwmaccu_vv_u16m1_tu(__VA_ARGS__)
+#define vwmaccu_vx_u16m1(...) __riscv_vwmaccu_vx_u16m1_tu(__VA_ARGS__)
+#define vwmaccu_vv_u16m2(...) __riscv_vwmaccu_vv_u16m2_tu(__VA_ARGS__)
+#define vwmaccu_vx_u16m2(...) __riscv_vwmaccu_vx_u16m2_tu(__VA_ARGS__)
+#define vwmaccu_vv_u16m4(...) __riscv_vwmaccu_vv_u16m4_tu(__VA_ARGS__)
+#define vwmaccu_vx_u16m4(...) __riscv_vwmaccu_vx_u16m4_tu(__VA_ARGS__)
+#define vwmaccu_vv_u16m8(...) __riscv_vwmaccu_vv_u16m8_tu(__VA_ARGS__)
+#define vwmaccu_vx_u16m8(...) __riscv_vwmaccu_vx_u16m8_tu(__VA_ARGS__)
+#define vwmaccu_vv_u32mf2(...) __riscv_vwmaccu_vv_u32mf2_tu(__VA_ARGS__)
+#define vwmaccu_vx_u32mf2(...) __riscv_vwmaccu_vx_u32mf2_tu(__VA_ARGS__)
+#define vwmaccu_vv_u32m1(...) __riscv_vwmaccu_vv_u32m1_tu(__VA_ARGS__)
+#define vwmaccu_vx_u32m1(...) __riscv_vwmaccu_vx_u32m1_tu(__VA_ARGS__)
+#define vwmaccu_vv_u32m2(...) __riscv_vwmaccu_vv_u32m2_tu(__VA_ARGS__)
+#define vwmaccu_vx_u32m2(...) __riscv_vwmaccu_vx_u32m2_tu(__VA_ARGS__)
+#define vwmaccu_vv_u32m4(...) __riscv_vwmaccu_vv_u32m4_tu(__VA_ARGS__)
+#define vwmaccu_vx_u32m4(...) __riscv_vwmaccu_vx_u32m4_tu(__VA_ARGS__)
+#define vwmaccu_vv_u32m8(...) __riscv_vwmaccu_vv_u32m8_tu(__VA_ARGS__)
+#define vwmaccu_vx_u32m8(...) __riscv_vwmaccu_vx_u32m8_tu(__VA_ARGS__)
+#define vwmaccu_vv_u64m1(...) __riscv_vwmaccu_vv_u64m1_tu(__VA_ARGS__)
+#define vwmaccu_vx_u64m1(...) __riscv_vwmaccu_vx_u64m1_tu(__VA_ARGS__)
+#define vwmaccu_vv_u64m2(...) __riscv_vwmaccu_vv_u64m2_tu(__VA_ARGS__)
+#define vwmaccu_vx_u64m2(...) __riscv_vwmaccu_vx_u64m2_tu(__VA_ARGS__)
+#define vwmaccu_vv_u64m4(...) __riscv_vwmaccu_vv_u64m4_tu(__VA_ARGS__)
+#define vwmaccu_vx_u64m4(...) __riscv_vwmaccu_vx_u64m4_tu(__VA_ARGS__)
+#define vwmaccu_vv_u64m8(...) __riscv_vwmaccu_vv_u64m8_tu(__VA_ARGS__)
+#define vwmaccu_vx_u64m8(...) __riscv_vwmaccu_vx_u64m8_tu(__VA_ARGS__)
+// masked functions: each legacy *_m name forwards all of its arguments, unchanged, to the matching __riscv_*_tumu intrinsic
+#define vwmacc_vv_i16mf4_m(...) __riscv_vwmacc_vv_i16mf4_tumu(__VA_ARGS__)
+#define vwmacc_vx_i16mf4_m(...) __riscv_vwmacc_vx_i16mf4_tumu(__VA_ARGS__)
+#define vwmacc_vv_i16mf2_m(...) __riscv_vwmacc_vv_i16mf2_tumu(__VA_ARGS__)
+#define vwmacc_vx_i16mf2_m(...) __riscv_vwmacc_vx_i16mf2_tumu(__VA_ARGS__)
+#define vwmacc_vv_i16m1_m(...) __riscv_vwmacc_vv_i16m1_tumu(__VA_ARGS__)
+#define vwmacc_vx_i16m1_m(...) __riscv_vwmacc_vx_i16m1_tumu(__VA_ARGS__)
+#define vwmacc_vv_i16m2_m(...) __riscv_vwmacc_vv_i16m2_tumu(__VA_ARGS__)
+#define vwmacc_vx_i16m2_m(...) __riscv_vwmacc_vx_i16m2_tumu(__VA_ARGS__)
+#define vwmacc_vv_i16m4_m(...) __riscv_vwmacc_vv_i16m4_tumu(__VA_ARGS__)
+#define vwmacc_vx_i16m4_m(...) __riscv_vwmacc_vx_i16m4_tumu(__VA_ARGS__)
+#define vwmacc_vv_i16m8_m(...) __riscv_vwmacc_vv_i16m8_tumu(__VA_ARGS__)
+#define vwmacc_vx_i16m8_m(...) __riscv_vwmacc_vx_i16m8_tumu(__VA_ARGS__)
+#define vwmacc_vv_i32mf2_m(...) __riscv_vwmacc_vv_i32mf2_tumu(__VA_ARGS__)
+#define vwmacc_vx_i32mf2_m(...) __riscv_vwmacc_vx_i32mf2_tumu(__VA_ARGS__)
+#define vwmacc_vv_i32m1_m(...) __riscv_vwmacc_vv_i32m1_tumu(__VA_ARGS__)
+#define vwmacc_vx_i32m1_m(...) __riscv_vwmacc_vx_i32m1_tumu(__VA_ARGS__)
+#define vwmacc_vv_i32m2_m(...) __riscv_vwmacc_vv_i32m2_tumu(__VA_ARGS__)
+#define vwmacc_vx_i32m2_m(...) __riscv_vwmacc_vx_i32m2_tumu(__VA_ARGS__)
+#define vwmacc_vv_i32m4_m(...) __riscv_vwmacc_vv_i32m4_tumu(__VA_ARGS__)
+#define vwmacc_vx_i32m4_m(...) __riscv_vwmacc_vx_i32m4_tumu(__VA_ARGS__)
+#define vwmacc_vv_i32m8_m(...) __riscv_vwmacc_vv_i32m8_tumu(__VA_ARGS__)
+#define vwmacc_vx_i32m8_m(...) __riscv_vwmacc_vx_i32m8_tumu(__VA_ARGS__)
+#define vwmacc_vv_i64m1_m(...) __riscv_vwmacc_vv_i64m1_tumu(__VA_ARGS__)
+#define vwmacc_vx_i64m1_m(...) __riscv_vwmacc_vx_i64m1_tumu(__VA_ARGS__)
+#define vwmacc_vv_i64m2_m(...) __riscv_vwmacc_vv_i64m2_tumu(__VA_ARGS__)
+#define vwmacc_vx_i64m2_m(...) __riscv_vwmacc_vx_i64m2_tumu(__VA_ARGS__)
+#define vwmacc_vv_i64m4_m(...) __riscv_vwmacc_vv_i64m4_tumu(__VA_ARGS__)
+#define vwmacc_vx_i64m4_m(...) __riscv_vwmacc_vx_i64m4_tumu(__VA_ARGS__)
+#define vwmacc_vv_i64m8_m(...) __riscv_vwmacc_vv_i64m8_tumu(__VA_ARGS__)
+#define vwmacc_vx_i64m8_m(...) __riscv_vwmacc_vx_i64m8_tumu(__VA_ARGS__)
+#define vwmaccsu_vv_i16mf4_m(...) __riscv_vwmaccsu_vv_i16mf4_tumu(__VA_ARGS__) // vwmaccsu_*_m: masked legacy names forward all arguments to the matching __riscv_*_tumu intrinsic
+#define vwmaccsu_vx_i16mf4_m(...) __riscv_vwmaccsu_vx_i16mf4_tumu(__VA_ARGS__)
+#define vwmaccsu_vv_i16mf2_m(...) __riscv_vwmaccsu_vv_i16mf2_tumu(__VA_ARGS__)
+#define vwmaccsu_vx_i16mf2_m(...) __riscv_vwmaccsu_vx_i16mf2_tumu(__VA_ARGS__)
+#define vwmaccsu_vv_i16m1_m(...) __riscv_vwmaccsu_vv_i16m1_tumu(__VA_ARGS__)
+#define vwmaccsu_vx_i16m1_m(...) __riscv_vwmaccsu_vx_i16m1_tumu(__VA_ARGS__)
+#define vwmaccsu_vv_i16m2_m(...) __riscv_vwmaccsu_vv_i16m2_tumu(__VA_ARGS__)
+#define vwmaccsu_vx_i16m2_m(...) __riscv_vwmaccsu_vx_i16m2_tumu(__VA_ARGS__)
+#define vwmaccsu_vv_i16m4_m(...) __riscv_vwmaccsu_vv_i16m4_tumu(__VA_ARGS__)
+#define vwmaccsu_vx_i16m4_m(...) __riscv_vwmaccsu_vx_i16m4_tumu(__VA_ARGS__)
+#define vwmaccsu_vv_i16m8_m(...) __riscv_vwmaccsu_vv_i16m8_tumu(__VA_ARGS__)
+#define vwmaccsu_vx_i16m8_m(...) __riscv_vwmaccsu_vx_i16m8_tumu(__VA_ARGS__)
+#define vwmaccsu_vv_i32mf2_m(...) __riscv_vwmaccsu_vv_i32mf2_tumu(__VA_ARGS__)
+#define vwmaccsu_vx_i32mf2_m(...) __riscv_vwmaccsu_vx_i32mf2_tumu(__VA_ARGS__)
+#define vwmaccsu_vv_i32m1_m(...) __riscv_vwmaccsu_vv_i32m1_tumu(__VA_ARGS__)
+#define vwmaccsu_vx_i32m1_m(...) __riscv_vwmaccsu_vx_i32m1_tumu(__VA_ARGS__)
+#define vwmaccsu_vv_i32m2_m(...) __riscv_vwmaccsu_vv_i32m2_tumu(__VA_ARGS__)
+#define vwmaccsu_vx_i32m2_m(...) __riscv_vwmaccsu_vx_i32m2_tumu(__VA_ARGS__)
+#define vwmaccsu_vv_i32m4_m(...) __riscv_vwmaccsu_vv_i32m4_tumu(__VA_ARGS__)
+#define vwmaccsu_vx_i32m4_m(...) __riscv_vwmaccsu_vx_i32m4_tumu(__VA_ARGS__)
+#define vwmaccsu_vv_i32m8_m(...) __riscv_vwmaccsu_vv_i32m8_tumu(__VA_ARGS__)
+#define vwmaccsu_vx_i32m8_m(...) __riscv_vwmaccsu_vx_i32m8_tumu(__VA_ARGS__)
+#define vwmaccsu_vv_i64m1_m(...) __riscv_vwmaccsu_vv_i64m1_tumu(__VA_ARGS__)
+#define vwmaccsu_vx_i64m1_m(...) __riscv_vwmaccsu_vx_i64m1_tumu(__VA_ARGS__)
+#define vwmaccsu_vv_i64m2_m(...) __riscv_vwmaccsu_vv_i64m2_tumu(__VA_ARGS__)
+#define vwmaccsu_vx_i64m2_m(...) __riscv_vwmaccsu_vx_i64m2_tumu(__VA_ARGS__)
+#define vwmaccsu_vv_i64m4_m(...) __riscv_vwmaccsu_vv_i64m4_tumu(__VA_ARGS__)
+#define vwmaccsu_vx_i64m4_m(...) __riscv_vwmaccsu_vx_i64m4_tumu(__VA_ARGS__)
+#define vwmaccsu_vv_i64m8_m(...) __riscv_vwmaccsu_vv_i64m8_tumu(__VA_ARGS__)
+#define vwmaccsu_vx_i64m8_m(...) __riscv_vwmaccsu_vx_i64m8_tumu(__VA_ARGS__)
+#define vwmaccus_vx_i16mf4_m(...) __riscv_vwmaccus_vx_i16mf4_tumu(__VA_ARGS__) // vwmaccus_*_m: only _vx forms are mapped here; each forwards all arguments to the matching __riscv_*_tumu intrinsic
+#define vwmaccus_vx_i16mf2_m(...) __riscv_vwmaccus_vx_i16mf2_tumu(__VA_ARGS__)
+#define vwmaccus_vx_i16m1_m(...) __riscv_vwmaccus_vx_i16m1_tumu(__VA_ARGS__)
+#define vwmaccus_vx_i16m2_m(...) __riscv_vwmaccus_vx_i16m2_tumu(__VA_ARGS__)
+#define vwmaccus_vx_i16m4_m(...) __riscv_vwmaccus_vx_i16m4_tumu(__VA_ARGS__)
+#define vwmaccus_vx_i16m8_m(...) __riscv_vwmaccus_vx_i16m8_tumu(__VA_ARGS__)
+#define vwmaccus_vx_i32mf2_m(...) __riscv_vwmaccus_vx_i32mf2_tumu(__VA_ARGS__)
+#define vwmaccus_vx_i32m1_m(...) __riscv_vwmaccus_vx_i32m1_tumu(__VA_ARGS__)
+#define vwmaccus_vx_i32m2_m(...) __riscv_vwmaccus_vx_i32m2_tumu(__VA_ARGS__)
+#define vwmaccus_vx_i32m4_m(...) __riscv_vwmaccus_vx_i32m4_tumu(__VA_ARGS__)
+#define vwmaccus_vx_i32m8_m(...) __riscv_vwmaccus_vx_i32m8_tumu(__VA_ARGS__)
+#define vwmaccus_vx_i64m1_m(...) __riscv_vwmaccus_vx_i64m1_tumu(__VA_ARGS__)
+#define vwmaccus_vx_i64m2_m(...) __riscv_vwmaccus_vx_i64m2_tumu(__VA_ARGS__)
+#define vwmaccus_vx_i64m4_m(...) __riscv_vwmaccus_vx_i64m4_tumu(__VA_ARGS__)
+#define vwmaccus_vx_i64m8_m(...) __riscv_vwmaccus_vx_i64m8_tumu(__VA_ARGS__)
+#define vwmaccu_vv_u16mf4_m(...) __riscv_vwmaccu_vv_u16mf4_tumu(__VA_ARGS__) // vwmaccu_*_m: masked legacy names forward all arguments to the matching __riscv_*_tumu intrinsic
+#define vwmaccu_vx_u16mf4_m(...) __riscv_vwmaccu_vx_u16mf4_tumu(__VA_ARGS__)
+#define vwmaccu_vv_u16mf2_m(...) __riscv_vwmaccu_vv_u16mf2_tumu(__VA_ARGS__)
+#define vwmaccu_vx_u16mf2_m(...) __riscv_vwmaccu_vx_u16mf2_tumu(__VA_ARGS__)
+#define vwmaccu_vv_u16m1_m(...) __riscv_vwmaccu_vv_u16m1_tumu(__VA_ARGS__)
+#define vwmaccu_vx_u16m1_m(...) __riscv_vwmaccu_vx_u16m1_tumu(__VA_ARGS__)
+#define vwmaccu_vv_u16m2_m(...) __riscv_vwmaccu_vv_u16m2_tumu(__VA_ARGS__)
+#define vwmaccu_vx_u16m2_m(...) __riscv_vwmaccu_vx_u16m2_tumu(__VA_ARGS__)
+#define vwmaccu_vv_u16m4_m(...) __riscv_vwmaccu_vv_u16m4_tumu(__VA_ARGS__)
+#define vwmaccu_vx_u16m4_m(...) __riscv_vwmaccu_vx_u16m4_tumu(__VA_ARGS__)
+#define vwmaccu_vv_u16m8_m(...) __riscv_vwmaccu_vv_u16m8_tumu(__VA_ARGS__)
+#define vwmaccu_vx_u16m8_m(...) __riscv_vwmaccu_vx_u16m8_tumu(__VA_ARGS__)
+#define vwmaccu_vv_u32mf2_m(...) __riscv_vwmaccu_vv_u32mf2_tumu(__VA_ARGS__)
+#define vwmaccu_vx_u32mf2_m(...) __riscv_vwmaccu_vx_u32mf2_tumu(__VA_ARGS__)
+#define vwmaccu_vv_u32m1_m(...) __riscv_vwmaccu_vv_u32m1_tumu(__VA_ARGS__)
+#define vwmaccu_vx_u32m1_m(...) __riscv_vwmaccu_vx_u32m1_tumu(__VA_ARGS__)
+#define vwmaccu_vv_u32m2_m(...) __riscv_vwmaccu_vv_u32m2_tumu(__VA_ARGS__)
+#define vwmaccu_vx_u32m2_m(...) __riscv_vwmaccu_vx_u32m2_tumu(__VA_ARGS__)
+#define vwmaccu_vv_u32m4_m(...) __riscv_vwmaccu_vv_u32m4_tumu(__VA_ARGS__)
+#define vwmaccu_vx_u32m4_m(...) __riscv_vwmaccu_vx_u32m4_tumu(__VA_ARGS__)
+#define vwmaccu_vv_u32m8_m(...) __riscv_vwmaccu_vv_u32m8_tumu(__VA_ARGS__)
+#define vwmaccu_vx_u32m8_m(...) __riscv_vwmaccu_vx_u32m8_tumu(__VA_ARGS__)
+#define vwmaccu_vv_u64m1_m(...) __riscv_vwmaccu_vv_u64m1_tumu(__VA_ARGS__)
+#define vwmaccu_vx_u64m1_m(...) __riscv_vwmaccu_vx_u64m1_tumu(__VA_ARGS__)
+#define vwmaccu_vv_u64m2_m(...) __riscv_vwmaccu_vv_u64m2_tumu(__VA_ARGS__)
+#define vwmaccu_vx_u64m2_m(...) __riscv_vwmaccu_vx_u64m2_tumu(__VA_ARGS__)
+#define vwmaccu_vv_u64m4_m(...) __riscv_vwmaccu_vv_u64m4_tumu(__VA_ARGS__)
+#define vwmaccu_vx_u64m4_m(...) __riscv_vwmaccu_vx_u64m4_tumu(__VA_ARGS__)
+#define vwmaccu_vv_u64m8_m(...) __riscv_vwmaccu_vv_u64m8_tumu(__VA_ARGS__)
+#define vwmaccu_vx_u64m8_m(...) __riscv_vwmaccu_vx_u64m8_tumu(__VA_ARGS__)
+#define vmerge_vvm_i8mf8(mask, op1, op2, vl) __riscv_vmerge_vvm_i8mf8((op1), (op2), (mask), (vl)) // vmerge_* (signed): legacy order is (mask, op1, op2, vl); the __riscv_ intrinsic is called as (op1, op2, mask, vl)
+#define vmerge_vxm_i8mf8(mask, op1, op2, vl) __riscv_vmerge_vxm_i8mf8((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i8mf4(mask, op1, op2, vl) __riscv_vmerge_vvm_i8mf4((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i8mf4(mask, op1, op2, vl) __riscv_vmerge_vxm_i8mf4((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i8mf2(mask, op1, op2, vl) __riscv_vmerge_vvm_i8mf2((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i8mf2(mask, op1, op2, vl) __riscv_vmerge_vxm_i8mf2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i8m1(mask, op1, op2, vl) __riscv_vmerge_vvm_i8m1((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i8m1(mask, op1, op2, vl) __riscv_vmerge_vxm_i8m1((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i8m2(mask, op1, op2, vl) __riscv_vmerge_vvm_i8m2((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i8m2(mask, op1, op2, vl) __riscv_vmerge_vxm_i8m2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i8m4(mask, op1, op2, vl) __riscv_vmerge_vvm_i8m4((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i8m4(mask, op1, op2, vl) __riscv_vmerge_vxm_i8m4((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i8m8(mask, op1, op2, vl) __riscv_vmerge_vvm_i8m8((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i8m8(mask, op1, op2, vl) __riscv_vmerge_vxm_i8m8((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i16mf4(mask, op1, op2, vl) __riscv_vmerge_vvm_i16mf4((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i16mf4(mask, op1, op2, vl) __riscv_vmerge_vxm_i16mf4((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i16mf2(mask, op1, op2, vl) __riscv_vmerge_vvm_i16mf2((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i16mf2(mask, op1, op2, vl) __riscv_vmerge_vxm_i16mf2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i16m1(mask, op1, op2, vl) __riscv_vmerge_vvm_i16m1((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i16m1(mask, op1, op2, vl) __riscv_vmerge_vxm_i16m1((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i16m2(mask, op1, op2, vl) __riscv_vmerge_vvm_i16m2((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i16m2(mask, op1, op2, vl) __riscv_vmerge_vxm_i16m2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i16m4(mask, op1, op2, vl) __riscv_vmerge_vvm_i16m4((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i16m4(mask, op1, op2, vl) __riscv_vmerge_vxm_i16m4((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i16m8(mask, op1, op2, vl) __riscv_vmerge_vvm_i16m8((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i16m8(mask, op1, op2, vl) __riscv_vmerge_vxm_i16m8((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i32mf2(mask, op1, op2, vl) __riscv_vmerge_vvm_i32mf2((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i32mf2(mask, op1, op2, vl) __riscv_vmerge_vxm_i32mf2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i32m1(mask, op1, op2, vl) __riscv_vmerge_vvm_i32m1((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i32m1(mask, op1, op2, vl) __riscv_vmerge_vxm_i32m1((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i32m2(mask, op1, op2, vl) __riscv_vmerge_vvm_i32m2((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i32m2(mask, op1, op2, vl) __riscv_vmerge_vxm_i32m2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i32m4(mask, op1, op2, vl) __riscv_vmerge_vvm_i32m4((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i32m4(mask, op1, op2, vl) __riscv_vmerge_vxm_i32m4((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i32m8(mask, op1, op2, vl) __riscv_vmerge_vvm_i32m8((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i32m8(mask, op1, op2, vl) __riscv_vmerge_vxm_i32m8((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i64m1(mask, op1, op2, vl) __riscv_vmerge_vvm_i64m1((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i64m1(mask, op1, op2, vl) __riscv_vmerge_vxm_i64m1((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i64m2(mask, op1, op2, vl) __riscv_vmerge_vvm_i64m2((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i64m2(mask, op1, op2, vl) __riscv_vmerge_vxm_i64m2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i64m4(mask, op1, op2, vl) __riscv_vmerge_vvm_i64m4((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i64m4(mask, op1, op2, vl) __riscv_vmerge_vxm_i64m4((op1), (op2), (mask), (vl))
+#define vmerge_vvm_i64m8(mask, op1, op2, vl) __riscv_vmerge_vvm_i64m8((op1), (op2), (mask), (vl))
+#define vmerge_vxm_i64m8(mask, op1, op2, vl) __riscv_vmerge_vxm_i64m8((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u8mf8(mask, op1, op2, vl) __riscv_vmerge_vvm_u8mf8((op1), (op2), (mask), (vl)) // vmerge_* (unsigned): legacy order is (mask, op1, op2, vl); the __riscv_ intrinsic is called as (op1, op2, mask, vl)
+#define vmerge_vxm_u8mf8(mask, op1, op2, vl) __riscv_vmerge_vxm_u8mf8((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u8mf4(mask, op1, op2, vl) __riscv_vmerge_vvm_u8mf4((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u8mf4(mask, op1, op2, vl) __riscv_vmerge_vxm_u8mf4((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u8mf2(mask, op1, op2, vl) __riscv_vmerge_vvm_u8mf2((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u8mf2(mask, op1, op2, vl) __riscv_vmerge_vxm_u8mf2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u8m1(mask, op1, op2, vl) __riscv_vmerge_vvm_u8m1((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u8m1(mask, op1, op2, vl) __riscv_vmerge_vxm_u8m1((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u8m2(mask, op1, op2, vl) __riscv_vmerge_vvm_u8m2((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u8m2(mask, op1, op2, vl) __riscv_vmerge_vxm_u8m2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u8m4(mask, op1, op2, vl) __riscv_vmerge_vvm_u8m4((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u8m4(mask, op1, op2, vl) __riscv_vmerge_vxm_u8m4((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u8m8(mask, op1, op2, vl) __riscv_vmerge_vvm_u8m8((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u8m8(mask, op1, op2, vl) __riscv_vmerge_vxm_u8m8((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u16mf4(mask, op1, op2, vl) __riscv_vmerge_vvm_u16mf4((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u16mf4(mask, op1, op2, vl) __riscv_vmerge_vxm_u16mf4((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u16mf2(mask, op1, op2, vl) __riscv_vmerge_vvm_u16mf2((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u16mf2(mask, op1, op2, vl) __riscv_vmerge_vxm_u16mf2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u16m1(mask, op1, op2, vl) __riscv_vmerge_vvm_u16m1((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u16m1(mask, op1, op2, vl) __riscv_vmerge_vxm_u16m1((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u16m2(mask, op1, op2, vl) __riscv_vmerge_vvm_u16m2((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u16m2(mask, op1, op2, vl) __riscv_vmerge_vxm_u16m2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u16m4(mask, op1, op2, vl) __riscv_vmerge_vvm_u16m4((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u16m4(mask, op1, op2, vl) __riscv_vmerge_vxm_u16m4((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u16m8(mask, op1, op2, vl) __riscv_vmerge_vvm_u16m8((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u16m8(mask, op1, op2, vl) __riscv_vmerge_vxm_u16m8((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u32mf2(mask, op1, op2, vl) __riscv_vmerge_vvm_u32mf2((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u32mf2(mask, op1, op2, vl) __riscv_vmerge_vxm_u32mf2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u32m1(mask, op1, op2, vl) __riscv_vmerge_vvm_u32m1((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u32m1(mask, op1, op2, vl) __riscv_vmerge_vxm_u32m1((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u32m2(mask, op1, op2, vl) __riscv_vmerge_vvm_u32m2((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u32m2(mask, op1, op2, vl) __riscv_vmerge_vxm_u32m2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u32m4(mask, op1, op2, vl) __riscv_vmerge_vvm_u32m4((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u32m4(mask, op1, op2, vl) __riscv_vmerge_vxm_u32m4((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u32m8(mask, op1, op2, vl) __riscv_vmerge_vvm_u32m8((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u32m8(mask, op1, op2, vl) __riscv_vmerge_vxm_u32m8((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u64m1(mask, op1, op2, vl) __riscv_vmerge_vvm_u64m1((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u64m1(mask, op1, op2, vl) __riscv_vmerge_vxm_u64m1((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u64m2(mask, op1, op2, vl) __riscv_vmerge_vvm_u64m2((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u64m2(mask, op1, op2, vl) __riscv_vmerge_vxm_u64m2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u64m4(mask, op1, op2, vl) __riscv_vmerge_vvm_u64m4((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u64m4(mask, op1, op2, vl) __riscv_vmerge_vxm_u64m4((op1), (op2), (mask), (vl))
+#define vmerge_vvm_u64m8(mask, op1, op2, vl) __riscv_vmerge_vvm_u64m8((op1), (op2), (mask), (vl))
+#define vmerge_vxm_u64m8(mask, op1, op2, vl) __riscv_vmerge_vxm_u64m8((op1), (op2), (mask), (vl))
+#define vmv_v_v_i8mf8(...) __riscv_vmv_v_v_i8mf8(__VA_ARGS__) // vmv_v_* (signed): plain renames; arguments are forwarded unchanged to the __riscv_-prefixed intrinsic
+#define vmv_v_x_i8mf8(...) __riscv_vmv_v_x_i8mf8(__VA_ARGS__)
+#define vmv_v_v_i8mf4(...) __riscv_vmv_v_v_i8mf4(__VA_ARGS__)
+#define vmv_v_x_i8mf4(...) __riscv_vmv_v_x_i8mf4(__VA_ARGS__)
+#define vmv_v_v_i8mf2(...) __riscv_vmv_v_v_i8mf2(__VA_ARGS__)
+#define vmv_v_x_i8mf2(...) __riscv_vmv_v_x_i8mf2(__VA_ARGS__)
+#define vmv_v_v_i8m1(...) __riscv_vmv_v_v_i8m1(__VA_ARGS__)
+#define vmv_v_x_i8m1(...) __riscv_vmv_v_x_i8m1(__VA_ARGS__)
+#define vmv_v_v_i8m2(...) __riscv_vmv_v_v_i8m2(__VA_ARGS__)
+#define vmv_v_x_i8m2(...) __riscv_vmv_v_x_i8m2(__VA_ARGS__)
+#define vmv_v_v_i8m4(...) __riscv_vmv_v_v_i8m4(__VA_ARGS__)
+#define vmv_v_x_i8m4(...) __riscv_vmv_v_x_i8m4(__VA_ARGS__)
+#define vmv_v_v_i8m8(...) __riscv_vmv_v_v_i8m8(__VA_ARGS__)
+#define vmv_v_x_i8m8(...) __riscv_vmv_v_x_i8m8(__VA_ARGS__)
+#define vmv_v_v_i16mf4(...) __riscv_vmv_v_v_i16mf4(__VA_ARGS__)
+#define vmv_v_x_i16mf4(...) __riscv_vmv_v_x_i16mf4(__VA_ARGS__)
+#define vmv_v_v_i16mf2(...) __riscv_vmv_v_v_i16mf2(__VA_ARGS__)
+#define vmv_v_x_i16mf2(...) __riscv_vmv_v_x_i16mf2(__VA_ARGS__)
+#define vmv_v_v_i16m1(...) __riscv_vmv_v_v_i16m1(__VA_ARGS__)
+#define vmv_v_x_i16m1(...) __riscv_vmv_v_x_i16m1(__VA_ARGS__)
+#define vmv_v_v_i16m2(...) __riscv_vmv_v_v_i16m2(__VA_ARGS__)
+#define vmv_v_x_i16m2(...) __riscv_vmv_v_x_i16m2(__VA_ARGS__)
+#define vmv_v_v_i16m4(...) __riscv_vmv_v_v_i16m4(__VA_ARGS__)
+#define vmv_v_x_i16m4(...) __riscv_vmv_v_x_i16m4(__VA_ARGS__)
+#define vmv_v_v_i16m8(...) __riscv_vmv_v_v_i16m8(__VA_ARGS__)
+#define vmv_v_x_i16m8(...) __riscv_vmv_v_x_i16m8(__VA_ARGS__)
+#define vmv_v_v_i32mf2(...) __riscv_vmv_v_v_i32mf2(__VA_ARGS__)
+#define vmv_v_x_i32mf2(...) __riscv_vmv_v_x_i32mf2(__VA_ARGS__)
+#define vmv_v_v_i32m1(...) __riscv_vmv_v_v_i32m1(__VA_ARGS__)
+#define vmv_v_x_i32m1(...) __riscv_vmv_v_x_i32m1(__VA_ARGS__)
+#define vmv_v_v_i32m2(...) __riscv_vmv_v_v_i32m2(__VA_ARGS__)
+#define vmv_v_x_i32m2(...) __riscv_vmv_v_x_i32m2(__VA_ARGS__)
+#define vmv_v_v_i32m4(...) __riscv_vmv_v_v_i32m4(__VA_ARGS__)
+#define vmv_v_x_i32m4(...) __riscv_vmv_v_x_i32m4(__VA_ARGS__)
+#define vmv_v_v_i32m8(...) __riscv_vmv_v_v_i32m8(__VA_ARGS__)
+#define vmv_v_x_i32m8(...) __riscv_vmv_v_x_i32m8(__VA_ARGS__)
+#define vmv_v_v_i64m1(...) __riscv_vmv_v_v_i64m1(__VA_ARGS__)
+#define vmv_v_x_i64m1(...) __riscv_vmv_v_x_i64m1(__VA_ARGS__)
+#define vmv_v_v_i64m2(...) __riscv_vmv_v_v_i64m2(__VA_ARGS__)
+#define vmv_v_x_i64m2(...) __riscv_vmv_v_x_i64m2(__VA_ARGS__)
+#define vmv_v_v_i64m4(...) __riscv_vmv_v_v_i64m4(__VA_ARGS__)
+#define vmv_v_x_i64m4(...) __riscv_vmv_v_x_i64m4(__VA_ARGS__)
+#define vmv_v_v_i64m8(...) __riscv_vmv_v_v_i64m8(__VA_ARGS__)
+#define vmv_v_x_i64m8(...) __riscv_vmv_v_x_i64m8(__VA_ARGS__)
+#define vmv_v_v_u8mf8(...) __riscv_vmv_v_v_u8mf8(__VA_ARGS__) // vmv_v_* (unsigned): plain renames; arguments are forwarded unchanged to the __riscv_-prefixed intrinsic
+#define vmv_v_x_u8mf8(...) __riscv_vmv_v_x_u8mf8(__VA_ARGS__)
+#define vmv_v_v_u8mf4(...) __riscv_vmv_v_v_u8mf4(__VA_ARGS__)
+#define vmv_v_x_u8mf4(...) __riscv_vmv_v_x_u8mf4(__VA_ARGS__)
+#define vmv_v_v_u8mf2(...) __riscv_vmv_v_v_u8mf2(__VA_ARGS__)
+#define vmv_v_x_u8mf2(...) __riscv_vmv_v_x_u8mf2(__VA_ARGS__)
+#define vmv_v_v_u8m1(...) __riscv_vmv_v_v_u8m1(__VA_ARGS__)
+#define vmv_v_x_u8m1(...) __riscv_vmv_v_x_u8m1(__VA_ARGS__)
+#define vmv_v_v_u8m2(...) __riscv_vmv_v_v_u8m2(__VA_ARGS__)
+#define vmv_v_x_u8m2(...) __riscv_vmv_v_x_u8m2(__VA_ARGS__)
+#define vmv_v_v_u8m4(...) __riscv_vmv_v_v_u8m4(__VA_ARGS__)
+#define vmv_v_x_u8m4(...) __riscv_vmv_v_x_u8m4(__VA_ARGS__)
+#define vmv_v_v_u8m8(...) __riscv_vmv_v_v_u8m8(__VA_ARGS__)
+#define vmv_v_x_u8m8(...) __riscv_vmv_v_x_u8m8(__VA_ARGS__)
+#define vmv_v_v_u16mf4(...) __riscv_vmv_v_v_u16mf4(__VA_ARGS__)
+#define vmv_v_x_u16mf4(...) __riscv_vmv_v_x_u16mf4(__VA_ARGS__)
+#define vmv_v_v_u16mf2(...) __riscv_vmv_v_v_u16mf2(__VA_ARGS__)
+#define vmv_v_x_u16mf2(...) __riscv_vmv_v_x_u16mf2(__VA_ARGS__)
+#define vmv_v_v_u16m1(...) __riscv_vmv_v_v_u16m1(__VA_ARGS__)
+#define vmv_v_x_u16m1(...) __riscv_vmv_v_x_u16m1(__VA_ARGS__)
+#define vmv_v_v_u16m2(...) __riscv_vmv_v_v_u16m2(__VA_ARGS__)
+#define vmv_v_x_u16m2(...) __riscv_vmv_v_x_u16m2(__VA_ARGS__)
+#define vmv_v_v_u16m4(...) __riscv_vmv_v_v_u16m4(__VA_ARGS__)
+#define vmv_v_x_u16m4(...) __riscv_vmv_v_x_u16m4(__VA_ARGS__)
+#define vmv_v_v_u16m8(...) __riscv_vmv_v_v_u16m8(__VA_ARGS__)
+#define vmv_v_x_u16m8(...) __riscv_vmv_v_x_u16m8(__VA_ARGS__)
+#define vmv_v_v_u32mf2(...) __riscv_vmv_v_v_u32mf2(__VA_ARGS__)
+#define vmv_v_x_u32mf2(...) __riscv_vmv_v_x_u32mf2(__VA_ARGS__)
+#define vmv_v_v_u32m1(...) __riscv_vmv_v_v_u32m1(__VA_ARGS__)
+#define vmv_v_x_u32m1(...) __riscv_vmv_v_x_u32m1(__VA_ARGS__)
+#define vmv_v_v_u32m2(...) __riscv_vmv_v_v_u32m2(__VA_ARGS__)
+#define vmv_v_x_u32m2(...) __riscv_vmv_v_x_u32m2(__VA_ARGS__)
+#define vmv_v_v_u32m4(...) __riscv_vmv_v_v_u32m4(__VA_ARGS__)
+#define vmv_v_x_u32m4(...) __riscv_vmv_v_x_u32m4(__VA_ARGS__)
+#define vmv_v_v_u32m8(...) __riscv_vmv_v_v_u32m8(__VA_ARGS__)
+#define vmv_v_x_u32m8(...) __riscv_vmv_v_x_u32m8(__VA_ARGS__)
+#define vmv_v_v_u64m1(...) __riscv_vmv_v_v_u64m1(__VA_ARGS__)
+#define vmv_v_x_u64m1(...) __riscv_vmv_v_x_u64m1(__VA_ARGS__)
+#define vmv_v_v_u64m2(...) __riscv_vmv_v_v_u64m2(__VA_ARGS__)
+#define vmv_v_x_u64m2(...) __riscv_vmv_v_x_u64m2(__VA_ARGS__)
+#define vmv_v_v_u64m4(...) __riscv_vmv_v_v_u64m4(__VA_ARGS__)
+#define vmv_v_x_u64m4(...) __riscv_vmv_v_x_u64m4(__VA_ARGS__)
+#define vmv_v_v_u64m8(...) __riscv_vmv_v_v_u64m8(__VA_ARGS__)
+#define vmv_v_x_u64m8(...) __riscv_vmv_v_x_u64m8(__VA_ARGS__)
+#define vsadd_vv_i8mf8(...) __riscv_vsadd_vv_i8mf8(__VA_ARGS__) // vsadd_*: plain renames; arguments are forwarded unchanged to the __riscv_-prefixed intrinsic
+#define vsadd_vx_i8mf8(...) __riscv_vsadd_vx_i8mf8(__VA_ARGS__)
+#define vsadd_vv_i8mf4(...) __riscv_vsadd_vv_i8mf4(__VA_ARGS__)
+#define vsadd_vx_i8mf4(...) __riscv_vsadd_vx_i8mf4(__VA_ARGS__)
+#define vsadd_vv_i8mf2(...) __riscv_vsadd_vv_i8mf2(__VA_ARGS__)
+#define vsadd_vx_i8mf2(...) __riscv_vsadd_vx_i8mf2(__VA_ARGS__)
+#define vsadd_vv_i8m1(...) __riscv_vsadd_vv_i8m1(__VA_ARGS__)
+#define vsadd_vx_i8m1(...) __riscv_vsadd_vx_i8m1(__VA_ARGS__)
+#define vsadd_vv_i8m2(...) __riscv_vsadd_vv_i8m2(__VA_ARGS__)
+#define vsadd_vx_i8m2(...) __riscv_vsadd_vx_i8m2(__VA_ARGS__)
+#define vsadd_vv_i8m4(...) __riscv_vsadd_vv_i8m4(__VA_ARGS__)
+#define vsadd_vx_i8m4(...) __riscv_vsadd_vx_i8m4(__VA_ARGS__)
+#define vsadd_vv_i8m8(...) __riscv_vsadd_vv_i8m8(__VA_ARGS__)
+#define vsadd_vx_i8m8(...) __riscv_vsadd_vx_i8m8(__VA_ARGS__)
+#define vsadd_vv_i16mf4(...) __riscv_vsadd_vv_i16mf4(__VA_ARGS__)
+#define vsadd_vx_i16mf4(...) __riscv_vsadd_vx_i16mf4(__VA_ARGS__)
+#define vsadd_vv_i16mf2(...) __riscv_vsadd_vv_i16mf2(__VA_ARGS__)
+#define vsadd_vx_i16mf2(...) __riscv_vsadd_vx_i16mf2(__VA_ARGS__)
+#define vsadd_vv_i16m1(...) __riscv_vsadd_vv_i16m1(__VA_ARGS__)
+#define vsadd_vx_i16m1(...) __riscv_vsadd_vx_i16m1(__VA_ARGS__)
+#define vsadd_vv_i16m2(...) __riscv_vsadd_vv_i16m2(__VA_ARGS__)
+#define vsadd_vx_i16m2(...) __riscv_vsadd_vx_i16m2(__VA_ARGS__)
+#define vsadd_vv_i16m4(...) __riscv_vsadd_vv_i16m4(__VA_ARGS__)
+#define vsadd_vx_i16m4(...) __riscv_vsadd_vx_i16m4(__VA_ARGS__)
+#define vsadd_vv_i16m8(...) __riscv_vsadd_vv_i16m8(__VA_ARGS__)
+#define vsadd_vx_i16m8(...) __riscv_vsadd_vx_i16m8(__VA_ARGS__)
+#define vsadd_vv_i32mf2(...) __riscv_vsadd_vv_i32mf2(__VA_ARGS__)
+#define vsadd_vx_i32mf2(...) __riscv_vsadd_vx_i32mf2(__VA_ARGS__)
+#define vsadd_vv_i32m1(...) __riscv_vsadd_vv_i32m1(__VA_ARGS__)
+#define vsadd_vx_i32m1(...) __riscv_vsadd_vx_i32m1(__VA_ARGS__)
+#define vsadd_vv_i32m2(...) __riscv_vsadd_vv_i32m2(__VA_ARGS__)
+#define vsadd_vx_i32m2(...) __riscv_vsadd_vx_i32m2(__VA_ARGS__)
+#define vsadd_vv_i32m4(...) __riscv_vsadd_vv_i32m4(__VA_ARGS__)
+#define vsadd_vx_i32m4(...) __riscv_vsadd_vx_i32m4(__VA_ARGS__)
+#define vsadd_vv_i32m8(...) __riscv_vsadd_vv_i32m8(__VA_ARGS__)
+#define vsadd_vx_i32m8(...) __riscv_vsadd_vx_i32m8(__VA_ARGS__)
+#define vsadd_vv_i64m1(...) __riscv_vsadd_vv_i64m1(__VA_ARGS__)
+#define vsadd_vx_i64m1(...) __riscv_vsadd_vx_i64m1(__VA_ARGS__)
+#define vsadd_vv_i64m2(...) __riscv_vsadd_vv_i64m2(__VA_ARGS__)
+#define vsadd_vx_i64m2(...) __riscv_vsadd_vx_i64m2(__VA_ARGS__)
+#define vsadd_vv_i64m4(...) __riscv_vsadd_vv_i64m4(__VA_ARGS__)
+#define vsadd_vx_i64m4(...) __riscv_vsadd_vx_i64m4(__VA_ARGS__)
+#define vsadd_vv_i64m8(...) __riscv_vsadd_vv_i64m8(__VA_ARGS__)
+#define vsadd_vx_i64m8(...) __riscv_vsadd_vx_i64m8(__VA_ARGS__)
+#define vssub_vv_i8mf8(...) __riscv_vssub_vv_i8mf8(__VA_ARGS__) // vssub_*: plain renames; arguments are forwarded unchanged to the __riscv_-prefixed intrinsic
+#define vssub_vx_i8mf8(...) __riscv_vssub_vx_i8mf8(__VA_ARGS__)
+#define vssub_vv_i8mf4(...) __riscv_vssub_vv_i8mf4(__VA_ARGS__)
+#define vssub_vx_i8mf4(...) __riscv_vssub_vx_i8mf4(__VA_ARGS__)
+#define vssub_vv_i8mf2(...) __riscv_vssub_vv_i8mf2(__VA_ARGS__)
+#define vssub_vx_i8mf2(...) __riscv_vssub_vx_i8mf2(__VA_ARGS__)
+#define vssub_vv_i8m1(...) __riscv_vssub_vv_i8m1(__VA_ARGS__)
+#define vssub_vx_i8m1(...) __riscv_vssub_vx_i8m1(__VA_ARGS__)
+#define vssub_vv_i8m2(...) __riscv_vssub_vv_i8m2(__VA_ARGS__)
+#define vssub_vx_i8m2(...) __riscv_vssub_vx_i8m2(__VA_ARGS__)
+#define vssub_vv_i8m4(...) __riscv_vssub_vv_i8m4(__VA_ARGS__)
+#define vssub_vx_i8m4(...) __riscv_vssub_vx_i8m4(__VA_ARGS__)
+#define vssub_vv_i8m8(...) __riscv_vssub_vv_i8m8(__VA_ARGS__)
+#define vssub_vx_i8m8(...) __riscv_vssub_vx_i8m8(__VA_ARGS__)
+#define vssub_vv_i16mf4(...) __riscv_vssub_vv_i16mf4(__VA_ARGS__)
+#define vssub_vx_i16mf4(...) __riscv_vssub_vx_i16mf4(__VA_ARGS__)
+#define vssub_vv_i16mf2(...) __riscv_vssub_vv_i16mf2(__VA_ARGS__)
+#define vssub_vx_i16mf2(...) __riscv_vssub_vx_i16mf2(__VA_ARGS__)
+#define vssub_vv_i16m1(...) __riscv_vssub_vv_i16m1(__VA_ARGS__)
+#define vssub_vx_i16m1(...) __riscv_vssub_vx_i16m1(__VA_ARGS__)
+#define vssub_vv_i16m2(...) __riscv_vssub_vv_i16m2(__VA_ARGS__)
+#define vssub_vx_i16m2(...) __riscv_vssub_vx_i16m2(__VA_ARGS__)
+#define vssub_vv_i16m4(...) __riscv_vssub_vv_i16m4(__VA_ARGS__)
+#define vssub_vx_i16m4(...) __riscv_vssub_vx_i16m4(__VA_ARGS__)
+#define vssub_vv_i16m8(...) __riscv_vssub_vv_i16m8(__VA_ARGS__)
+#define vssub_vx_i16m8(...) __riscv_vssub_vx_i16m8(__VA_ARGS__)
+#define vssub_vv_i32mf2(...) __riscv_vssub_vv_i32mf2(__VA_ARGS__)
+#define vssub_vx_i32mf2(...) __riscv_vssub_vx_i32mf2(__VA_ARGS__)
+#define vssub_vv_i32m1(...) __riscv_vssub_vv_i32m1(__VA_ARGS__)
+#define vssub_vx_i32m1(...) __riscv_vssub_vx_i32m1(__VA_ARGS__)
+#define vssub_vv_i32m2(...) __riscv_vssub_vv_i32m2(__VA_ARGS__)
+#define vssub_vx_i32m2(...) __riscv_vssub_vx_i32m2(__VA_ARGS__)
+#define vssub_vv_i32m4(...) __riscv_vssub_vv_i32m4(__VA_ARGS__)
+#define vssub_vx_i32m4(...) __riscv_vssub_vx_i32m4(__VA_ARGS__)
+#define vssub_vv_i32m8(...) __riscv_vssub_vv_i32m8(__VA_ARGS__)
+#define vssub_vx_i32m8(...) __riscv_vssub_vx_i32m8(__VA_ARGS__)
+#define vssub_vv_i64m1(...) __riscv_vssub_vv_i64m1(__VA_ARGS__)
+#define vssub_vx_i64m1(...) __riscv_vssub_vx_i64m1(__VA_ARGS__)
+#define vssub_vv_i64m2(...) __riscv_vssub_vv_i64m2(__VA_ARGS__)
+#define vssub_vx_i64m2(...) __riscv_vssub_vx_i64m2(__VA_ARGS__)
+#define vssub_vv_i64m4(...) __riscv_vssub_vv_i64m4(__VA_ARGS__)
+#define vssub_vx_i64m4(...) __riscv_vssub_vx_i64m4(__VA_ARGS__)
+#define vssub_vv_i64m8(...) __riscv_vssub_vv_i64m8(__VA_ARGS__)
+#define vssub_vx_i64m8(...) __riscv_vssub_vx_i64m8(__VA_ARGS__)
+#define vsaddu_vv_u8mf8(...) __riscv_vsaddu_vv_u8mf8(__VA_ARGS__) // vsaddu_*: plain renames; arguments are forwarded unchanged to the __riscv_-prefixed intrinsic
+#define vsaddu_vx_u8mf8(...) __riscv_vsaddu_vx_u8mf8(__VA_ARGS__)
+#define vsaddu_vv_u8mf4(...) __riscv_vsaddu_vv_u8mf4(__VA_ARGS__)
+#define vsaddu_vx_u8mf4(...) __riscv_vsaddu_vx_u8mf4(__VA_ARGS__)
+#define vsaddu_vv_u8mf2(...) __riscv_vsaddu_vv_u8mf2(__VA_ARGS__)
+#define vsaddu_vx_u8mf2(...) __riscv_vsaddu_vx_u8mf2(__VA_ARGS__)
+#define vsaddu_vv_u8m1(...) __riscv_vsaddu_vv_u8m1(__VA_ARGS__)
+#define vsaddu_vx_u8m1(...) __riscv_vsaddu_vx_u8m1(__VA_ARGS__)
+#define vsaddu_vv_u8m2(...) __riscv_vsaddu_vv_u8m2(__VA_ARGS__)
+#define vsaddu_vx_u8m2(...) __riscv_vsaddu_vx_u8m2(__VA_ARGS__)
+#define vsaddu_vv_u8m4(...) __riscv_vsaddu_vv_u8m4(__VA_ARGS__)
+#define vsaddu_vx_u8m4(...) __riscv_vsaddu_vx_u8m4(__VA_ARGS__)
+#define vsaddu_vv_u8m8(...) __riscv_vsaddu_vv_u8m8(__VA_ARGS__)
+#define vsaddu_vx_u8m8(...) __riscv_vsaddu_vx_u8m8(__VA_ARGS__)
+#define vsaddu_vv_u16mf4(...) __riscv_vsaddu_vv_u16mf4(__VA_ARGS__)
+#define vsaddu_vx_u16mf4(...) __riscv_vsaddu_vx_u16mf4(__VA_ARGS__)
+#define vsaddu_vv_u16mf2(...) __riscv_vsaddu_vv_u16mf2(__VA_ARGS__)
+#define vsaddu_vx_u16mf2(...) __riscv_vsaddu_vx_u16mf2(__VA_ARGS__)
+#define vsaddu_vv_u16m1(...) __riscv_vsaddu_vv_u16m1(__VA_ARGS__)
+#define vsaddu_vx_u16m1(...) __riscv_vsaddu_vx_u16m1(__VA_ARGS__)
+#define vsaddu_vv_u16m2(...) __riscv_vsaddu_vv_u16m2(__VA_ARGS__)
+#define vsaddu_vx_u16m2(...) __riscv_vsaddu_vx_u16m2(__VA_ARGS__)
+#define vsaddu_vv_u16m4(...) __riscv_vsaddu_vv_u16m4(__VA_ARGS__)
+#define vsaddu_vx_u16m4(...) __riscv_vsaddu_vx_u16m4(__VA_ARGS__)
+#define vsaddu_vv_u16m8(...) __riscv_vsaddu_vv_u16m8(__VA_ARGS__)
+#define vsaddu_vx_u16m8(...) __riscv_vsaddu_vx_u16m8(__VA_ARGS__)
+#define vsaddu_vv_u32mf2(...) __riscv_vsaddu_vv_u32mf2(__VA_ARGS__)
+#define vsaddu_vx_u32mf2(...) __riscv_vsaddu_vx_u32mf2(__VA_ARGS__)
+#define vsaddu_vv_u32m1(...) __riscv_vsaddu_vv_u32m1(__VA_ARGS__)
+#define vsaddu_vx_u32m1(...) __riscv_vsaddu_vx_u32m1(__VA_ARGS__)
+#define vsaddu_vv_u32m2(...) __riscv_vsaddu_vv_u32m2(__VA_ARGS__)
+#define vsaddu_vx_u32m2(...) __riscv_vsaddu_vx_u32m2(__VA_ARGS__)
+#define vsaddu_vv_u32m4(...) __riscv_vsaddu_vv_u32m4(__VA_ARGS__)
+#define vsaddu_vx_u32m4(...) __riscv_vsaddu_vx_u32m4(__VA_ARGS__)
+#define vsaddu_vv_u32m8(...) __riscv_vsaddu_vv_u32m8(__VA_ARGS__)
+#define vsaddu_vx_u32m8(...) __riscv_vsaddu_vx_u32m8(__VA_ARGS__)
+#define vsaddu_vv_u64m1(...) __riscv_vsaddu_vv_u64m1(__VA_ARGS__)
+#define vsaddu_vx_u64m1(...) __riscv_vsaddu_vx_u64m1(__VA_ARGS__)
+#define vsaddu_vv_u64m2(...) __riscv_vsaddu_vv_u64m2(__VA_ARGS__)
+#define vsaddu_vx_u64m2(...) __riscv_vsaddu_vx_u64m2(__VA_ARGS__)
+#define vsaddu_vv_u64m4(...) __riscv_vsaddu_vv_u64m4(__VA_ARGS__)
+#define vsaddu_vx_u64m4(...) __riscv_vsaddu_vx_u64m4(__VA_ARGS__)
+#define vsaddu_vv_u64m8(...) __riscv_vsaddu_vv_u64m8(__VA_ARGS__)
+#define vsaddu_vx_u64m8(...) __riscv_vsaddu_vx_u64m8(__VA_ARGS__)
+#define vssubu_vv_u8mf8(...) __riscv_vssubu_vv_u8mf8(__VA_ARGS__)
+#define vssubu_vx_u8mf8(...) __riscv_vssubu_vx_u8mf8(__VA_ARGS__)
+#define vssubu_vv_u8mf4(...) __riscv_vssubu_vv_u8mf4(__VA_ARGS__)
+#define vssubu_vx_u8mf4(...) __riscv_vssubu_vx_u8mf4(__VA_ARGS__)
+#define vssubu_vv_u8mf2(...) __riscv_vssubu_vv_u8mf2(__VA_ARGS__)
+#define vssubu_vx_u8mf2(...) __riscv_vssubu_vx_u8mf2(__VA_ARGS__)
+#define vssubu_vv_u8m1(...) __riscv_vssubu_vv_u8m1(__VA_ARGS__)
+#define vssubu_vx_u8m1(...) __riscv_vssubu_vx_u8m1(__VA_ARGS__)
+#define vssubu_vv_u8m2(...) __riscv_vssubu_vv_u8m2(__VA_ARGS__)
+#define vssubu_vx_u8m2(...) __riscv_vssubu_vx_u8m2(__VA_ARGS__)
+#define vssubu_vv_u8m4(...) __riscv_vssubu_vv_u8m4(__VA_ARGS__)
+#define vssubu_vx_u8m4(...) __riscv_vssubu_vx_u8m4(__VA_ARGS__)
+#define vssubu_vv_u8m8(...) __riscv_vssubu_vv_u8m8(__VA_ARGS__)
+#define vssubu_vx_u8m8(...) __riscv_vssubu_vx_u8m8(__VA_ARGS__)
+#define vssubu_vv_u16mf4(...) __riscv_vssubu_vv_u16mf4(__VA_ARGS__)
+#define vssubu_vx_u16mf4(...) __riscv_vssubu_vx_u16mf4(__VA_ARGS__)
+#define vssubu_vv_u16mf2(...) __riscv_vssubu_vv_u16mf2(__VA_ARGS__)
+#define vssubu_vx_u16mf2(...) __riscv_vssubu_vx_u16mf2(__VA_ARGS__)
+#define vssubu_vv_u16m1(...) __riscv_vssubu_vv_u16m1(__VA_ARGS__)
+#define vssubu_vx_u16m1(...) __riscv_vssubu_vx_u16m1(__VA_ARGS__)
+#define vssubu_vv_u16m2(...) __riscv_vssubu_vv_u16m2(__VA_ARGS__)
+#define vssubu_vx_u16m2(...) __riscv_vssubu_vx_u16m2(__VA_ARGS__)
+#define vssubu_vv_u16m4(...) __riscv_vssubu_vv_u16m4(__VA_ARGS__)
+#define vssubu_vx_u16m4(...) __riscv_vssubu_vx_u16m4(__VA_ARGS__)
+#define vssubu_vv_u16m8(...) __riscv_vssubu_vv_u16m8(__VA_ARGS__)
+#define vssubu_vx_u16m8(...) __riscv_vssubu_vx_u16m8(__VA_ARGS__)
+#define vssubu_vv_u32mf2(...) __riscv_vssubu_vv_u32mf2(__VA_ARGS__)
+#define vssubu_vx_u32mf2(...) __riscv_vssubu_vx_u32mf2(__VA_ARGS__)
+#define vssubu_vv_u32m1(...) __riscv_vssubu_vv_u32m1(__VA_ARGS__)
+#define vssubu_vx_u32m1(...) __riscv_vssubu_vx_u32m1(__VA_ARGS__)
+#define vssubu_vv_u32m2(...) __riscv_vssubu_vv_u32m2(__VA_ARGS__)
+#define vssubu_vx_u32m2(...) __riscv_vssubu_vx_u32m2(__VA_ARGS__)
+#define vssubu_vv_u32m4(...) __riscv_vssubu_vv_u32m4(__VA_ARGS__)
+#define vssubu_vx_u32m4(...) __riscv_vssubu_vx_u32m4(__VA_ARGS__)
+#define vssubu_vv_u32m8(...) __riscv_vssubu_vv_u32m8(__VA_ARGS__)
+#define vssubu_vx_u32m8(...) __riscv_vssubu_vx_u32m8(__VA_ARGS__)
+#define vssubu_vv_u64m1(...) __riscv_vssubu_vv_u64m1(__VA_ARGS__)
+#define vssubu_vx_u64m1(...) __riscv_vssubu_vx_u64m1(__VA_ARGS__)
+#define vssubu_vv_u64m2(...) __riscv_vssubu_vv_u64m2(__VA_ARGS__)
+#define vssubu_vx_u64m2(...) __riscv_vssubu_vx_u64m2(__VA_ARGS__)
+#define vssubu_vv_u64m4(...) __riscv_vssubu_vv_u64m4(__VA_ARGS__)
+#define vssubu_vx_u64m4(...) __riscv_vssubu_vx_u64m4(__VA_ARGS__)
+#define vssubu_vv_u64m8(...) __riscv_vssubu_vv_u64m8(__VA_ARGS__)
+#define vssubu_vx_u64m8(...) __riscv_vssubu_vx_u64m8(__VA_ARGS__)
+// masked functions: each legacy `_m` name forwards to the matching `__riscv_*_tumu` intrinsic
+#define vsadd_vv_i8mf8_m(...) __riscv_vsadd_vv_i8mf8_tumu(__VA_ARGS__)
+#define vsadd_vx_i8mf8_m(...) __riscv_vsadd_vx_i8mf8_tumu(__VA_ARGS__)
+#define vsadd_vv_i8mf4_m(...) __riscv_vsadd_vv_i8mf4_tumu(__VA_ARGS__)
+#define vsadd_vx_i8mf4_m(...) __riscv_vsadd_vx_i8mf4_tumu(__VA_ARGS__)
+#define vsadd_vv_i8mf2_m(...) __riscv_vsadd_vv_i8mf2_tumu(__VA_ARGS__)
+#define vsadd_vx_i8mf2_m(...) __riscv_vsadd_vx_i8mf2_tumu(__VA_ARGS__)
+#define vsadd_vv_i8m1_m(...) __riscv_vsadd_vv_i8m1_tumu(__VA_ARGS__)
+#define vsadd_vx_i8m1_m(...) __riscv_vsadd_vx_i8m1_tumu(__VA_ARGS__)
+#define vsadd_vv_i8m2_m(...) __riscv_vsadd_vv_i8m2_tumu(__VA_ARGS__)
+#define vsadd_vx_i8m2_m(...) __riscv_vsadd_vx_i8m2_tumu(__VA_ARGS__)
+#define vsadd_vv_i8m4_m(...) __riscv_vsadd_vv_i8m4_tumu(__VA_ARGS__)
+#define vsadd_vx_i8m4_m(...) __riscv_vsadd_vx_i8m4_tumu(__VA_ARGS__)
+#define vsadd_vv_i8m8_m(...) __riscv_vsadd_vv_i8m8_tumu(__VA_ARGS__)
+#define vsadd_vx_i8m8_m(...) __riscv_vsadd_vx_i8m8_tumu(__VA_ARGS__)
+#define vsadd_vv_i16mf4_m(...) __riscv_vsadd_vv_i16mf4_tumu(__VA_ARGS__)
+#define vsadd_vx_i16mf4_m(...) __riscv_vsadd_vx_i16mf4_tumu(__VA_ARGS__)
+#define vsadd_vv_i16mf2_m(...) __riscv_vsadd_vv_i16mf2_tumu(__VA_ARGS__)
+#define vsadd_vx_i16mf2_m(...) __riscv_vsadd_vx_i16mf2_tumu(__VA_ARGS__)
+#define vsadd_vv_i16m1_m(...) __riscv_vsadd_vv_i16m1_tumu(__VA_ARGS__)
+#define vsadd_vx_i16m1_m(...) __riscv_vsadd_vx_i16m1_tumu(__VA_ARGS__)
+#define vsadd_vv_i16m2_m(...) __riscv_vsadd_vv_i16m2_tumu(__VA_ARGS__)
+#define vsadd_vx_i16m2_m(...) __riscv_vsadd_vx_i16m2_tumu(__VA_ARGS__)
+#define vsadd_vv_i16m4_m(...) __riscv_vsadd_vv_i16m4_tumu(__VA_ARGS__)
+#define vsadd_vx_i16m4_m(...) __riscv_vsadd_vx_i16m4_tumu(__VA_ARGS__)
+#define vsadd_vv_i16m8_m(...) __riscv_vsadd_vv_i16m8_tumu(__VA_ARGS__)
+#define vsadd_vx_i16m8_m(...) __riscv_vsadd_vx_i16m8_tumu(__VA_ARGS__)
+#define vsadd_vv_i32mf2_m(...) __riscv_vsadd_vv_i32mf2_tumu(__VA_ARGS__)
+#define vsadd_vx_i32mf2_m(...) __riscv_vsadd_vx_i32mf2_tumu(__VA_ARGS__)
+#define vsadd_vv_i32m1_m(...) __riscv_vsadd_vv_i32m1_tumu(__VA_ARGS__)
+#define vsadd_vx_i32m1_m(...) __riscv_vsadd_vx_i32m1_tumu(__VA_ARGS__)
+#define vsadd_vv_i32m2_m(...) __riscv_vsadd_vv_i32m2_tumu(__VA_ARGS__)
+#define vsadd_vx_i32m2_m(...) __riscv_vsadd_vx_i32m2_tumu(__VA_ARGS__)
+#define vsadd_vv_i32m4_m(...) __riscv_vsadd_vv_i32m4_tumu(__VA_ARGS__)
+#define vsadd_vx_i32m4_m(...) __riscv_vsadd_vx_i32m4_tumu(__VA_ARGS__)
+#define vsadd_vv_i32m8_m(...) __riscv_vsadd_vv_i32m8_tumu(__VA_ARGS__)
+#define vsadd_vx_i32m8_m(...) __riscv_vsadd_vx_i32m8_tumu(__VA_ARGS__)
+#define vsadd_vv_i64m1_m(...) __riscv_vsadd_vv_i64m1_tumu(__VA_ARGS__)
+#define vsadd_vx_i64m1_m(...) __riscv_vsadd_vx_i64m1_tumu(__VA_ARGS__)
+#define vsadd_vv_i64m2_m(...) __riscv_vsadd_vv_i64m2_tumu(__VA_ARGS__)
+#define vsadd_vx_i64m2_m(...) __riscv_vsadd_vx_i64m2_tumu(__VA_ARGS__)
+#define vsadd_vv_i64m4_m(...) __riscv_vsadd_vv_i64m4_tumu(__VA_ARGS__)
+#define vsadd_vx_i64m4_m(...) __riscv_vsadd_vx_i64m4_tumu(__VA_ARGS__)
+#define vsadd_vv_i64m8_m(...) __riscv_vsadd_vv_i64m8_tumu(__VA_ARGS__)
+#define vsadd_vx_i64m8_m(...) __riscv_vsadd_vx_i64m8_tumu(__VA_ARGS__)
+#define vssub_vv_i8mf8_m(...) __riscv_vssub_vv_i8mf8_tumu(__VA_ARGS__)
+#define vssub_vx_i8mf8_m(...) __riscv_vssub_vx_i8mf8_tumu(__VA_ARGS__)
+#define vssub_vv_i8mf4_m(...) __riscv_vssub_vv_i8mf4_tumu(__VA_ARGS__)
+#define vssub_vx_i8mf4_m(...) __riscv_vssub_vx_i8mf4_tumu(__VA_ARGS__)
+#define vssub_vv_i8mf2_m(...) __riscv_vssub_vv_i8mf2_tumu(__VA_ARGS__)
+#define vssub_vx_i8mf2_m(...) __riscv_vssub_vx_i8mf2_tumu(__VA_ARGS__)
+#define vssub_vv_i8m1_m(...) __riscv_vssub_vv_i8m1_tumu(__VA_ARGS__)
+#define vssub_vx_i8m1_m(...) __riscv_vssub_vx_i8m1_tumu(__VA_ARGS__)
+#define vssub_vv_i8m2_m(...) __riscv_vssub_vv_i8m2_tumu(__VA_ARGS__)
+#define vssub_vx_i8m2_m(...) __riscv_vssub_vx_i8m2_tumu(__VA_ARGS__)
+#define vssub_vv_i8m4_m(...) __riscv_vssub_vv_i8m4_tumu(__VA_ARGS__)
+#define vssub_vx_i8m4_m(...) __riscv_vssub_vx_i8m4_tumu(__VA_ARGS__)
+#define vssub_vv_i8m8_m(...) __riscv_vssub_vv_i8m8_tumu(__VA_ARGS__)
+#define vssub_vx_i8m8_m(...) __riscv_vssub_vx_i8m8_tumu(__VA_ARGS__)
+#define vssub_vv_i16mf4_m(...) __riscv_vssub_vv_i16mf4_tumu(__VA_ARGS__)
+#define vssub_vx_i16mf4_m(...) __riscv_vssub_vx_i16mf4_tumu(__VA_ARGS__)
+#define vssub_vv_i16mf2_m(...) __riscv_vssub_vv_i16mf2_tumu(__VA_ARGS__)
+#define vssub_vx_i16mf2_m(...) __riscv_vssub_vx_i16mf2_tumu(__VA_ARGS__)
+#define vssub_vv_i16m1_m(...) __riscv_vssub_vv_i16m1_tumu(__VA_ARGS__)
+#define vssub_vx_i16m1_m(...) __riscv_vssub_vx_i16m1_tumu(__VA_ARGS__)
+#define vssub_vv_i16m2_m(...) __riscv_vssub_vv_i16m2_tumu(__VA_ARGS__)
+#define vssub_vx_i16m2_m(...) __riscv_vssub_vx_i16m2_tumu(__VA_ARGS__)
+#define vssub_vv_i16m4_m(...) __riscv_vssub_vv_i16m4_tumu(__VA_ARGS__)
+#define vssub_vx_i16m4_m(...) __riscv_vssub_vx_i16m4_tumu(__VA_ARGS__)
+#define vssub_vv_i16m8_m(...) __riscv_vssub_vv_i16m8_tumu(__VA_ARGS__)
+#define vssub_vx_i16m8_m(...) __riscv_vssub_vx_i16m8_tumu(__VA_ARGS__)
+#define vssub_vv_i32mf2_m(...) __riscv_vssub_vv_i32mf2_tumu(__VA_ARGS__)
+#define vssub_vx_i32mf2_m(...) __riscv_vssub_vx_i32mf2_tumu(__VA_ARGS__)
+#define vssub_vv_i32m1_m(...) __riscv_vssub_vv_i32m1_tumu(__VA_ARGS__)
+#define vssub_vx_i32m1_m(...) __riscv_vssub_vx_i32m1_tumu(__VA_ARGS__)
+#define vssub_vv_i32m2_m(...) __riscv_vssub_vv_i32m2_tumu(__VA_ARGS__)
+#define vssub_vx_i32m2_m(...) __riscv_vssub_vx_i32m2_tumu(__VA_ARGS__)
+#define vssub_vv_i32m4_m(...) __riscv_vssub_vv_i32m4_tumu(__VA_ARGS__)
+#define vssub_vx_i32m4_m(...) __riscv_vssub_vx_i32m4_tumu(__VA_ARGS__)
+#define vssub_vv_i32m8_m(...) __riscv_vssub_vv_i32m8_tumu(__VA_ARGS__)
+#define vssub_vx_i32m8_m(...) __riscv_vssub_vx_i32m8_tumu(__VA_ARGS__)
+#define vssub_vv_i64m1_m(...) __riscv_vssub_vv_i64m1_tumu(__VA_ARGS__)
+#define vssub_vx_i64m1_m(...) __riscv_vssub_vx_i64m1_tumu(__VA_ARGS__)
+#define vssub_vv_i64m2_m(...) __riscv_vssub_vv_i64m2_tumu(__VA_ARGS__)
+#define vssub_vx_i64m2_m(...) __riscv_vssub_vx_i64m2_tumu(__VA_ARGS__)
+#define vssub_vv_i64m4_m(...) __riscv_vssub_vv_i64m4_tumu(__VA_ARGS__)
+#define vssub_vx_i64m4_m(...) __riscv_vssub_vx_i64m4_tumu(__VA_ARGS__)
+#define vssub_vv_i64m8_m(...) __riscv_vssub_vv_i64m8_tumu(__VA_ARGS__)
+#define vssub_vx_i64m8_m(...) __riscv_vssub_vx_i64m8_tumu(__VA_ARGS__)
+#define vsaddu_vv_u8mf8_m(...) __riscv_vsaddu_vv_u8mf8_tumu(__VA_ARGS__)
+#define vsaddu_vx_u8mf8_m(...) __riscv_vsaddu_vx_u8mf8_tumu(__VA_ARGS__)
+#define vsaddu_vv_u8mf4_m(...) __riscv_vsaddu_vv_u8mf4_tumu(__VA_ARGS__)
+#define vsaddu_vx_u8mf4_m(...) __riscv_vsaddu_vx_u8mf4_tumu(__VA_ARGS__)
+#define vsaddu_vv_u8mf2_m(...) __riscv_vsaddu_vv_u8mf2_tumu(__VA_ARGS__)
+#define vsaddu_vx_u8mf2_m(...) __riscv_vsaddu_vx_u8mf2_tumu(__VA_ARGS__)
+#define vsaddu_vv_u8m1_m(...) __riscv_vsaddu_vv_u8m1_tumu(__VA_ARGS__)
+#define vsaddu_vx_u8m1_m(...) __riscv_vsaddu_vx_u8m1_tumu(__VA_ARGS__)
+#define vsaddu_vv_u8m2_m(...) __riscv_vsaddu_vv_u8m2_tumu(__VA_ARGS__)
+#define vsaddu_vx_u8m2_m(...) __riscv_vsaddu_vx_u8m2_tumu(__VA_ARGS__)
+#define vsaddu_vv_u8m4_m(...) __riscv_vsaddu_vv_u8m4_tumu(__VA_ARGS__)
+#define vsaddu_vx_u8m4_m(...) __riscv_vsaddu_vx_u8m4_tumu(__VA_ARGS__)
+#define vsaddu_vv_u8m8_m(...) __riscv_vsaddu_vv_u8m8_tumu(__VA_ARGS__)
+#define vsaddu_vx_u8m8_m(...) __riscv_vsaddu_vx_u8m8_tumu(__VA_ARGS__)
+#define vsaddu_vv_u16mf4_m(...) __riscv_vsaddu_vv_u16mf4_tumu(__VA_ARGS__)
+#define vsaddu_vx_u16mf4_m(...) __riscv_vsaddu_vx_u16mf4_tumu(__VA_ARGS__)
+#define vsaddu_vv_u16mf2_m(...) __riscv_vsaddu_vv_u16mf2_tumu(__VA_ARGS__)
+#define vsaddu_vx_u16mf2_m(...) __riscv_vsaddu_vx_u16mf2_tumu(__VA_ARGS__)
+#define vsaddu_vv_u16m1_m(...) __riscv_vsaddu_vv_u16m1_tumu(__VA_ARGS__)
+#define vsaddu_vx_u16m1_m(...) __riscv_vsaddu_vx_u16m1_tumu(__VA_ARGS__)
+#define vsaddu_vv_u16m2_m(...) __riscv_vsaddu_vv_u16m2_tumu(__VA_ARGS__)
+#define vsaddu_vx_u16m2_m(...) __riscv_vsaddu_vx_u16m2_tumu(__VA_ARGS__)
+#define vsaddu_vv_u16m4_m(...) __riscv_vsaddu_vv_u16m4_tumu(__VA_ARGS__)
+#define vsaddu_vx_u16m4_m(...) __riscv_vsaddu_vx_u16m4_tumu(__VA_ARGS__)
+#define vsaddu_vv_u16m8_m(...) __riscv_vsaddu_vv_u16m8_tumu(__VA_ARGS__)
+#define vsaddu_vx_u16m8_m(...) __riscv_vsaddu_vx_u16m8_tumu(__VA_ARGS__)
+#define vsaddu_vv_u32mf2_m(...) __riscv_vsaddu_vv_u32mf2_tumu(__VA_ARGS__)
+#define vsaddu_vx_u32mf2_m(...) __riscv_vsaddu_vx_u32mf2_tumu(__VA_ARGS__)
+#define vsaddu_vv_u32m1_m(...) __riscv_vsaddu_vv_u32m1_tumu(__VA_ARGS__)
+#define vsaddu_vx_u32m1_m(...) __riscv_vsaddu_vx_u32m1_tumu(__VA_ARGS__)
+#define vsaddu_vv_u32m2_m(...) __riscv_vsaddu_vv_u32m2_tumu(__VA_ARGS__)
+#define vsaddu_vx_u32m2_m(...) __riscv_vsaddu_vx_u32m2_tumu(__VA_ARGS__)
+#define vsaddu_vv_u32m4_m(...) __riscv_vsaddu_vv_u32m4_tumu(__VA_ARGS__)
+#define vsaddu_vx_u32m4_m(...) __riscv_vsaddu_vx_u32m4_tumu(__VA_ARGS__)
+#define vsaddu_vv_u32m8_m(...) __riscv_vsaddu_vv_u32m8_tumu(__VA_ARGS__)
+#define vsaddu_vx_u32m8_m(...) __riscv_vsaddu_vx_u32m8_tumu(__VA_ARGS__)
+#define vsaddu_vv_u64m1_m(...) __riscv_vsaddu_vv_u64m1_tumu(__VA_ARGS__)
+#define vsaddu_vx_u64m1_m(...) __riscv_vsaddu_vx_u64m1_tumu(__VA_ARGS__)
+#define vsaddu_vv_u64m2_m(...) __riscv_vsaddu_vv_u64m2_tumu(__VA_ARGS__)
+#define vsaddu_vx_u64m2_m(...) __riscv_vsaddu_vx_u64m2_tumu(__VA_ARGS__)
+#define vsaddu_vv_u64m4_m(...) __riscv_vsaddu_vv_u64m4_tumu(__VA_ARGS__)
+#define vsaddu_vx_u64m4_m(...) __riscv_vsaddu_vx_u64m4_tumu(__VA_ARGS__)
+#define vsaddu_vv_u64m8_m(...) __riscv_vsaddu_vv_u64m8_tumu(__VA_ARGS__)
+#define vsaddu_vx_u64m8_m(...) __riscv_vsaddu_vx_u64m8_tumu(__VA_ARGS__)
+#define vssubu_vv_u8mf8_m(...) __riscv_vssubu_vv_u8mf8_tumu(__VA_ARGS__)
+#define vssubu_vx_u8mf8_m(...) __riscv_vssubu_vx_u8mf8_tumu(__VA_ARGS__)
+#define vssubu_vv_u8mf4_m(...) __riscv_vssubu_vv_u8mf4_tumu(__VA_ARGS__)
+#define vssubu_vx_u8mf4_m(...) __riscv_vssubu_vx_u8mf4_tumu(__VA_ARGS__)
+#define vssubu_vv_u8mf2_m(...) __riscv_vssubu_vv_u8mf2_tumu(__VA_ARGS__)
+#define vssubu_vx_u8mf2_m(...) __riscv_vssubu_vx_u8mf2_tumu(__VA_ARGS__)
+#define vssubu_vv_u8m1_m(...) __riscv_vssubu_vv_u8m1_tumu(__VA_ARGS__)
+#define vssubu_vx_u8m1_m(...) __riscv_vssubu_vx_u8m1_tumu(__VA_ARGS__)
+#define vssubu_vv_u8m2_m(...) __riscv_vssubu_vv_u8m2_tumu(__VA_ARGS__)
+#define vssubu_vx_u8m2_m(...) __riscv_vssubu_vx_u8m2_tumu(__VA_ARGS__)
+#define vssubu_vv_u8m4_m(...) __riscv_vssubu_vv_u8m4_tumu(__VA_ARGS__)
+#define vssubu_vx_u8m4_m(...) __riscv_vssubu_vx_u8m4_tumu(__VA_ARGS__)
+#define vssubu_vv_u8m8_m(...) __riscv_vssubu_vv_u8m8_tumu(__VA_ARGS__)
+#define vssubu_vx_u8m8_m(...) __riscv_vssubu_vx_u8m8_tumu(__VA_ARGS__)
+#define vssubu_vv_u16mf4_m(...) __riscv_vssubu_vv_u16mf4_tumu(__VA_ARGS__)
+#define vssubu_vx_u16mf4_m(...) __riscv_vssubu_vx_u16mf4_tumu(__VA_ARGS__)
+#define vssubu_vv_u16mf2_m(...) __riscv_vssubu_vv_u16mf2_tumu(__VA_ARGS__)
+#define vssubu_vx_u16mf2_m(...) __riscv_vssubu_vx_u16mf2_tumu(__VA_ARGS__)
+#define vssubu_vv_u16m1_m(...) __riscv_vssubu_vv_u16m1_tumu(__VA_ARGS__)
+#define vssubu_vx_u16m1_m(...) __riscv_vssubu_vx_u16m1_tumu(__VA_ARGS__)
+#define vssubu_vv_u16m2_m(...) __riscv_vssubu_vv_u16m2_tumu(__VA_ARGS__)
+#define vssubu_vx_u16m2_m(...) __riscv_vssubu_vx_u16m2_tumu(__VA_ARGS__)
+#define vssubu_vv_u16m4_m(...) __riscv_vssubu_vv_u16m4_tumu(__VA_ARGS__)
+#define vssubu_vx_u16m4_m(...) __riscv_vssubu_vx_u16m4_tumu(__VA_ARGS__)
+#define vssubu_vv_u16m8_m(...) __riscv_vssubu_vv_u16m8_tumu(__VA_ARGS__)
+#define vssubu_vx_u16m8_m(...) __riscv_vssubu_vx_u16m8_tumu(__VA_ARGS__)
+#define vssubu_vv_u32mf2_m(...) __riscv_vssubu_vv_u32mf2_tumu(__VA_ARGS__)
+#define vssubu_vx_u32mf2_m(...) __riscv_vssubu_vx_u32mf2_tumu(__VA_ARGS__)
+#define vssubu_vv_u32m1_m(...) __riscv_vssubu_vv_u32m1_tumu(__VA_ARGS__)
+#define vssubu_vx_u32m1_m(...) __riscv_vssubu_vx_u32m1_tumu(__VA_ARGS__)
+#define vssubu_vv_u32m2_m(...) __riscv_vssubu_vv_u32m2_tumu(__VA_ARGS__)
+#define vssubu_vx_u32m2_m(...) __riscv_vssubu_vx_u32m2_tumu(__VA_ARGS__)
+#define vssubu_vv_u32m4_m(...) __riscv_vssubu_vv_u32m4_tumu(__VA_ARGS__)
+#define vssubu_vx_u32m4_m(...) __riscv_vssubu_vx_u32m4_tumu(__VA_ARGS__)
+#define vssubu_vv_u32m8_m(...) __riscv_vssubu_vv_u32m8_tumu(__VA_ARGS__)
+#define vssubu_vx_u32m8_m(...) __riscv_vssubu_vx_u32m8_tumu(__VA_ARGS__)
+#define vssubu_vv_u64m1_m(...) __riscv_vssubu_vv_u64m1_tumu(__VA_ARGS__)
+#define vssubu_vx_u64m1_m(...) __riscv_vssubu_vx_u64m1_tumu(__VA_ARGS__)
+#define vssubu_vv_u64m2_m(...) __riscv_vssubu_vv_u64m2_tumu(__VA_ARGS__)
+#define vssubu_vx_u64m2_m(...) __riscv_vssubu_vx_u64m2_tumu(__VA_ARGS__)
+#define vssubu_vv_u64m4_m(...) __riscv_vssubu_vv_u64m4_tumu(__VA_ARGS__)
+#define vssubu_vx_u64m4_m(...) __riscv_vssubu_vx_u64m4_tumu(__VA_ARGS__)
+#define vssubu_vv_u64m8_m(...) __riscv_vssubu_vv_u64m8_tumu(__VA_ARGS__)
+#define vssubu_vx_u64m8_m(...) __riscv_vssubu_vx_u64m8_tumu(__VA_ARGS__)
+#define vaadd_vv_i8mf8(...) __riscv_vaadd_vv_i8mf8(__VA_ARGS__)
+#define vaadd_vx_i8mf8(...) __riscv_vaadd_vx_i8mf8(__VA_ARGS__)
+#define vaadd_vv_i8mf4(...) __riscv_vaadd_vv_i8mf4(__VA_ARGS__)
+#define vaadd_vx_i8mf4(...) __riscv_vaadd_vx_i8mf4(__VA_ARGS__)
+#define vaadd_vv_i8mf2(...) __riscv_vaadd_vv_i8mf2(__VA_ARGS__)
+#define vaadd_vx_i8mf2(...) __riscv_vaadd_vx_i8mf2(__VA_ARGS__)
+#define vaadd_vv_i8m1(...) __riscv_vaadd_vv_i8m1(__VA_ARGS__)
+#define vaadd_vx_i8m1(...) __riscv_vaadd_vx_i8m1(__VA_ARGS__)
+#define vaadd_vv_i8m2(...) __riscv_vaadd_vv_i8m2(__VA_ARGS__)
+#define vaadd_vx_i8m2(...) __riscv_vaadd_vx_i8m2(__VA_ARGS__)
+#define vaadd_vv_i8m4(...) __riscv_vaadd_vv_i8m4(__VA_ARGS__)
+#define vaadd_vx_i8m4(...) __riscv_vaadd_vx_i8m4(__VA_ARGS__)
+#define vaadd_vv_i8m8(...) __riscv_vaadd_vv_i8m8(__VA_ARGS__)
+#define vaadd_vx_i8m8(...) __riscv_vaadd_vx_i8m8(__VA_ARGS__)
+#define vaadd_vv_i16mf4(...) __riscv_vaadd_vv_i16mf4(__VA_ARGS__)
+#define vaadd_vx_i16mf4(...) __riscv_vaadd_vx_i16mf4(__VA_ARGS__)
+#define vaadd_vv_i16mf2(...) __riscv_vaadd_vv_i16mf2(__VA_ARGS__)
+#define vaadd_vx_i16mf2(...) __riscv_vaadd_vx_i16mf2(__VA_ARGS__)
+#define vaadd_vv_i16m1(...) __riscv_vaadd_vv_i16m1(__VA_ARGS__)
+#define vaadd_vx_i16m1(...) __riscv_vaadd_vx_i16m1(__VA_ARGS__)
+#define vaadd_vv_i16m2(...) __riscv_vaadd_vv_i16m2(__VA_ARGS__)
+#define vaadd_vx_i16m2(...) __riscv_vaadd_vx_i16m2(__VA_ARGS__)
+#define vaadd_vv_i16m4(...) __riscv_vaadd_vv_i16m4(__VA_ARGS__)
+#define vaadd_vx_i16m4(...) __riscv_vaadd_vx_i16m4(__VA_ARGS__)
+#define vaadd_vv_i16m8(...) __riscv_vaadd_vv_i16m8(__VA_ARGS__)
+#define vaadd_vx_i16m8(...) __riscv_vaadd_vx_i16m8(__VA_ARGS__)
+#define vaadd_vv_i32mf2(...) __riscv_vaadd_vv_i32mf2(__VA_ARGS__)
+#define vaadd_vx_i32mf2(...) __riscv_vaadd_vx_i32mf2(__VA_ARGS__)
+#define vaadd_vv_i32m1(...) __riscv_vaadd_vv_i32m1(__VA_ARGS__)
+#define vaadd_vx_i32m1(...) __riscv_vaadd_vx_i32m1(__VA_ARGS__)
+#define vaadd_vv_i32m2(...) __riscv_vaadd_vv_i32m2(__VA_ARGS__)
+#define vaadd_vx_i32m2(...) __riscv_vaadd_vx_i32m2(__VA_ARGS__)
+#define vaadd_vv_i32m4(...) __riscv_vaadd_vv_i32m4(__VA_ARGS__)
+#define vaadd_vx_i32m4(...) __riscv_vaadd_vx_i32m4(__VA_ARGS__)
+#define vaadd_vv_i32m8(...) __riscv_vaadd_vv_i32m8(__VA_ARGS__)
+#define vaadd_vx_i32m8(...) __riscv_vaadd_vx_i32m8(__VA_ARGS__)
+#define vaadd_vv_i64m1(...) __riscv_vaadd_vv_i64m1(__VA_ARGS__)
+#define vaadd_vx_i64m1(...) __riscv_vaadd_vx_i64m1(__VA_ARGS__)
+#define vaadd_vv_i64m2(...) __riscv_vaadd_vv_i64m2(__VA_ARGS__)
+#define vaadd_vx_i64m2(...) __riscv_vaadd_vx_i64m2(__VA_ARGS__)
+#define vaadd_vv_i64m4(...) __riscv_vaadd_vv_i64m4(__VA_ARGS__)
+#define vaadd_vx_i64m4(...) __riscv_vaadd_vx_i64m4(__VA_ARGS__)
+#define vaadd_vv_i64m8(...) __riscv_vaadd_vv_i64m8(__VA_ARGS__)
+#define vaadd_vx_i64m8(...) __riscv_vaadd_vx_i64m8(__VA_ARGS__)
+#define vasub_vv_i8mf8(...) __riscv_vasub_vv_i8mf8(__VA_ARGS__)
+#define vasub_vx_i8mf8(...) __riscv_vasub_vx_i8mf8(__VA_ARGS__)
+#define vasub_vv_i8mf4(...) __riscv_vasub_vv_i8mf4(__VA_ARGS__)
+#define vasub_vx_i8mf4(...) __riscv_vasub_vx_i8mf4(__VA_ARGS__)
+#define vasub_vv_i8mf2(...) __riscv_vasub_vv_i8mf2(__VA_ARGS__)
+#define vasub_vx_i8mf2(...) __riscv_vasub_vx_i8mf2(__VA_ARGS__)
+#define vasub_vv_i8m1(...) __riscv_vasub_vv_i8m1(__VA_ARGS__)
+#define vasub_vx_i8m1(...) __riscv_vasub_vx_i8m1(__VA_ARGS__)
+#define vasub_vv_i8m2(...) __riscv_vasub_vv_i8m2(__VA_ARGS__)
+#define vasub_vx_i8m2(...) __riscv_vasub_vx_i8m2(__VA_ARGS__)
+#define vasub_vv_i8m4(...) __riscv_vasub_vv_i8m4(__VA_ARGS__)
+#define vasub_vx_i8m4(...) __riscv_vasub_vx_i8m4(__VA_ARGS__)
+#define vasub_vv_i8m8(...) __riscv_vasub_vv_i8m8(__VA_ARGS__)
+#define vasub_vx_i8m8(...) __riscv_vasub_vx_i8m8(__VA_ARGS__)
+#define vasub_vv_i16mf4(...) __riscv_vasub_vv_i16mf4(__VA_ARGS__)
+#define vasub_vx_i16mf4(...) __riscv_vasub_vx_i16mf4(__VA_ARGS__)
+#define vasub_vv_i16mf2(...) __riscv_vasub_vv_i16mf2(__VA_ARGS__)
+#define vasub_vx_i16mf2(...) __riscv_vasub_vx_i16mf2(__VA_ARGS__)
+#define vasub_vv_i16m1(...) __riscv_vasub_vv_i16m1(__VA_ARGS__)
+#define vasub_vx_i16m1(...) __riscv_vasub_vx_i16m1(__VA_ARGS__)
+#define vasub_vv_i16m2(...) __riscv_vasub_vv_i16m2(__VA_ARGS__)
+#define vasub_vx_i16m2(...) __riscv_vasub_vx_i16m2(__VA_ARGS__)
+#define vasub_vv_i16m4(...) __riscv_vasub_vv_i16m4(__VA_ARGS__)
+#define vasub_vx_i16m4(...) __riscv_vasub_vx_i16m4(__VA_ARGS__)
+#define vasub_vv_i16m8(...) __riscv_vasub_vv_i16m8(__VA_ARGS__)
+#define vasub_vx_i16m8(...) __riscv_vasub_vx_i16m8(__VA_ARGS__)
+#define vasub_vv_i32mf2(...) __riscv_vasub_vv_i32mf2(__VA_ARGS__)
+#define vasub_vx_i32mf2(...) __riscv_vasub_vx_i32mf2(__VA_ARGS__)
+#define vasub_vv_i32m1(...) __riscv_vasub_vv_i32m1(__VA_ARGS__)
+#define vasub_vx_i32m1(...) __riscv_vasub_vx_i32m1(__VA_ARGS__)
+#define vasub_vv_i32m2(...) __riscv_vasub_vv_i32m2(__VA_ARGS__)
+#define vasub_vx_i32m2(...) __riscv_vasub_vx_i32m2(__VA_ARGS__)
+#define vasub_vv_i32m4(...) __riscv_vasub_vv_i32m4(__VA_ARGS__)
+#define vasub_vx_i32m4(...) __riscv_vasub_vx_i32m4(__VA_ARGS__)
+#define vasub_vv_i32m8(...) __riscv_vasub_vv_i32m8(__VA_ARGS__)
+#define vasub_vx_i32m8(...) __riscv_vasub_vx_i32m8(__VA_ARGS__)
+#define vasub_vv_i64m1(...) __riscv_vasub_vv_i64m1(__VA_ARGS__)
+#define vasub_vx_i64m1(...) __riscv_vasub_vx_i64m1(__VA_ARGS__)
+#define vasub_vv_i64m2(...) __riscv_vasub_vv_i64m2(__VA_ARGS__)
+#define vasub_vx_i64m2(...) __riscv_vasub_vx_i64m2(__VA_ARGS__)
+#define vasub_vv_i64m4(...) __riscv_vasub_vv_i64m4(__VA_ARGS__)
+#define vasub_vx_i64m4(...) __riscv_vasub_vx_i64m4(__VA_ARGS__)
+#define vasub_vv_i64m8(...) __riscv_vasub_vv_i64m8(__VA_ARGS__)
+#define vasub_vx_i64m8(...) __riscv_vasub_vx_i64m8(__VA_ARGS__)
+#define vaaddu_vv_u8mf8(...) __riscv_vaaddu_vv_u8mf8(__VA_ARGS__)
+#define vaaddu_vx_u8mf8(...) __riscv_vaaddu_vx_u8mf8(__VA_ARGS__)
+#define vaaddu_vv_u8mf4(...) __riscv_vaaddu_vv_u8mf4(__VA_ARGS__)
+#define vaaddu_vx_u8mf4(...) __riscv_vaaddu_vx_u8mf4(__VA_ARGS__)
+#define vaaddu_vv_u8mf2(...) __riscv_vaaddu_vv_u8mf2(__VA_ARGS__)
+#define vaaddu_vx_u8mf2(...) __riscv_vaaddu_vx_u8mf2(__VA_ARGS__)
+#define vaaddu_vv_u8m1(...) __riscv_vaaddu_vv_u8m1(__VA_ARGS__)
+#define vaaddu_vx_u8m1(...) __riscv_vaaddu_vx_u8m1(__VA_ARGS__)
+#define vaaddu_vv_u8m2(...) __riscv_vaaddu_vv_u8m2(__VA_ARGS__)
+#define vaaddu_vx_u8m2(...) __riscv_vaaddu_vx_u8m2(__VA_ARGS__)
+#define vaaddu_vv_u8m4(...) __riscv_vaaddu_vv_u8m4(__VA_ARGS__)
+#define vaaddu_vx_u8m4(...) __riscv_vaaddu_vx_u8m4(__VA_ARGS__)
+#define vaaddu_vv_u8m8(...) __riscv_vaaddu_vv_u8m8(__VA_ARGS__)
+#define vaaddu_vx_u8m8(...) __riscv_vaaddu_vx_u8m8(__VA_ARGS__)
+#define vaaddu_vv_u16mf4(...) __riscv_vaaddu_vv_u16mf4(__VA_ARGS__)
+#define vaaddu_vx_u16mf4(...) __riscv_vaaddu_vx_u16mf4(__VA_ARGS__)
+#define vaaddu_vv_u16mf2(...) __riscv_vaaddu_vv_u16mf2(__VA_ARGS__)
+#define vaaddu_vx_u16mf2(...) __riscv_vaaddu_vx_u16mf2(__VA_ARGS__)
+#define vaaddu_vv_u16m1(...) __riscv_vaaddu_vv_u16m1(__VA_ARGS__)
+#define vaaddu_vx_u16m1(...) __riscv_vaaddu_vx_u16m1(__VA_ARGS__)
+#define vaaddu_vv_u16m2(...) __riscv_vaaddu_vv_u16m2(__VA_ARGS__)
+#define vaaddu_vx_u16m2(...) __riscv_vaaddu_vx_u16m2(__VA_ARGS__)
+#define vaaddu_vv_u16m4(...) __riscv_vaaddu_vv_u16m4(__VA_ARGS__)
+#define vaaddu_vx_u16m4(...) __riscv_vaaddu_vx_u16m4(__VA_ARGS__)
+#define vaaddu_vv_u16m8(...) __riscv_vaaddu_vv_u16m8(__VA_ARGS__)
+#define vaaddu_vx_u16m8(...) __riscv_vaaddu_vx_u16m8(__VA_ARGS__)
+#define vaaddu_vv_u32mf2(...) __riscv_vaaddu_vv_u32mf2(__VA_ARGS__)
+#define vaaddu_vx_u32mf2(...) __riscv_vaaddu_vx_u32mf2(__VA_ARGS__)
+#define vaaddu_vv_u32m1(...) __riscv_vaaddu_vv_u32m1(__VA_ARGS__)
+#define vaaddu_vx_u32m1(...) __riscv_vaaddu_vx_u32m1(__VA_ARGS__)
+#define vaaddu_vv_u32m2(...) __riscv_vaaddu_vv_u32m2(__VA_ARGS__)
+#define vaaddu_vx_u32m2(...) __riscv_vaaddu_vx_u32m2(__VA_ARGS__)
+#define vaaddu_vv_u32m4(...) __riscv_vaaddu_vv_u32m4(__VA_ARGS__)
+#define vaaddu_vx_u32m4(...) __riscv_vaaddu_vx_u32m4(__VA_ARGS__)
+#define vaaddu_vv_u32m8(...) __riscv_vaaddu_vv_u32m8(__VA_ARGS__)
+#define vaaddu_vx_u32m8(...) __riscv_vaaddu_vx_u32m8(__VA_ARGS__)
+#define vaaddu_vv_u64m1(...) __riscv_vaaddu_vv_u64m1(__VA_ARGS__)
+#define vaaddu_vx_u64m1(...) __riscv_vaaddu_vx_u64m1(__VA_ARGS__)
+#define vaaddu_vv_u64m2(...) __riscv_vaaddu_vv_u64m2(__VA_ARGS__)
+#define vaaddu_vx_u64m2(...) __riscv_vaaddu_vx_u64m2(__VA_ARGS__)
+#define vaaddu_vv_u64m4(...) __riscv_vaaddu_vv_u64m4(__VA_ARGS__)
+#define vaaddu_vx_u64m4(...) __riscv_vaaddu_vx_u64m4(__VA_ARGS__)
+#define vaaddu_vv_u64m8(...) __riscv_vaaddu_vv_u64m8(__VA_ARGS__)
+#define vaaddu_vx_u64m8(...) __riscv_vaaddu_vx_u64m8(__VA_ARGS__)
+#define vasubu_vv_u8mf8(...) __riscv_vasubu_vv_u8mf8(__VA_ARGS__)
+#define vasubu_vx_u8mf8(...) __riscv_vasubu_vx_u8mf8(__VA_ARGS__)
+#define vasubu_vv_u8mf4(...) __riscv_vasubu_vv_u8mf4(__VA_ARGS__)
+#define vasubu_vx_u8mf4(...) __riscv_vasubu_vx_u8mf4(__VA_ARGS__)
+#define vasubu_vv_u8mf2(...) __riscv_vasubu_vv_u8mf2(__VA_ARGS__)
+#define vasubu_vx_u8mf2(...) __riscv_vasubu_vx_u8mf2(__VA_ARGS__)
+#define vasubu_vv_u8m1(...) __riscv_vasubu_vv_u8m1(__VA_ARGS__)
+#define vasubu_vx_u8m1(...) __riscv_vasubu_vx_u8m1(__VA_ARGS__)
+#define vasubu_vv_u8m2(...) __riscv_vasubu_vv_u8m2(__VA_ARGS__)
+#define vasubu_vx_u8m2(...) __riscv_vasubu_vx_u8m2(__VA_ARGS__)
+#define vasubu_vv_u8m4(...) __riscv_vasubu_vv_u8m4(__VA_ARGS__)
+#define vasubu_vx_u8m4(...) __riscv_vasubu_vx_u8m4(__VA_ARGS__)
+#define vasubu_vv_u8m8(...) __riscv_vasubu_vv_u8m8(__VA_ARGS__)
+#define vasubu_vx_u8m8(...) __riscv_vasubu_vx_u8m8(__VA_ARGS__)
+#define vasubu_vv_u16mf4(...) __riscv_vasubu_vv_u16mf4(__VA_ARGS__)
+#define vasubu_vx_u16mf4(...) __riscv_vasubu_vx_u16mf4(__VA_ARGS__)
+#define vasubu_vv_u16mf2(...) __riscv_vasubu_vv_u16mf2(__VA_ARGS__)
+#define vasubu_vx_u16mf2(...) __riscv_vasubu_vx_u16mf2(__VA_ARGS__)
+#define vasubu_vv_u16m1(...) __riscv_vasubu_vv_u16m1(__VA_ARGS__)
+#define vasubu_vx_u16m1(...) __riscv_vasubu_vx_u16m1(__VA_ARGS__)
+#define vasubu_vv_u16m2(...) __riscv_vasubu_vv_u16m2(__VA_ARGS__)
+#define vasubu_vx_u16m2(...) __riscv_vasubu_vx_u16m2(__VA_ARGS__)
+#define vasubu_vv_u16m4(...) __riscv_vasubu_vv_u16m4(__VA_ARGS__)
+#define vasubu_vx_u16m4(...) __riscv_vasubu_vx_u16m4(__VA_ARGS__)
+#define vasubu_vv_u16m8(...) __riscv_vasubu_vv_u16m8(__VA_ARGS__)
+#define vasubu_vx_u16m8(...) __riscv_vasubu_vx_u16m8(__VA_ARGS__)
+#define vasubu_vv_u32mf2(...) __riscv_vasubu_vv_u32mf2(__VA_ARGS__)
+#define vasubu_vx_u32mf2(...) __riscv_vasubu_vx_u32mf2(__VA_ARGS__)
+#define vasubu_vv_u32m1(...) __riscv_vasubu_vv_u32m1(__VA_ARGS__)
+#define vasubu_vx_u32m1(...) __riscv_vasubu_vx_u32m1(__VA_ARGS__)
+#define vasubu_vv_u32m2(...) __riscv_vasubu_vv_u32m2(__VA_ARGS__)
+#define vasubu_vx_u32m2(...) __riscv_vasubu_vx_u32m2(__VA_ARGS__)
+#define vasubu_vv_u32m4(...) __riscv_vasubu_vv_u32m4(__VA_ARGS__)
+#define vasubu_vx_u32m4(...) __riscv_vasubu_vx_u32m4(__VA_ARGS__)
+#define vasubu_vv_u32m8(...) __riscv_vasubu_vv_u32m8(__VA_ARGS__)
+#define vasubu_vx_u32m8(...) __riscv_vasubu_vx_u32m8(__VA_ARGS__)
+#define vasubu_vv_u64m1(...) __riscv_vasubu_vv_u64m1(__VA_ARGS__)
+#define vasubu_vx_u64m1(...) __riscv_vasubu_vx_u64m1(__VA_ARGS__)
+#define vasubu_vv_u64m2(...) __riscv_vasubu_vv_u64m2(__VA_ARGS__)
+#define vasubu_vx_u64m2(...) __riscv_vasubu_vx_u64m2(__VA_ARGS__)
+#define vasubu_vv_u64m4(...) __riscv_vasubu_vv_u64m4(__VA_ARGS__)
+#define vasubu_vx_u64m4(...) __riscv_vasubu_vx_u64m4(__VA_ARGS__)
+#define vasubu_vv_u64m8(...) __riscv_vasubu_vv_u64m8(__VA_ARGS__)
+#define vasubu_vx_u64m8(...) __riscv_vasubu_vx_u64m8(__VA_ARGS__)
+// masked functions: legacy `_m` names forward to the `__riscv_*_tumu` intrinsics
+#define vaadd_vv_i8mf8_m(...) __riscv_vaadd_vv_i8mf8_tumu(__VA_ARGS__)
+#define vaadd_vx_i8mf8_m(...) __riscv_vaadd_vx_i8mf8_tumu(__VA_ARGS__) // vaadd masked compat: legacy *_m name -> __riscv_*_tumu intrinsic, arguments forwarded unchanged
+#define vaadd_vv_i8mf4_m(...) __riscv_vaadd_vv_i8mf4_tumu(__VA_ARGS__)
+#define vaadd_vx_i8mf4_m(...) __riscv_vaadd_vx_i8mf4_tumu(__VA_ARGS__)
+#define vaadd_vv_i8mf2_m(...) __riscv_vaadd_vv_i8mf2_tumu(__VA_ARGS__)
+#define vaadd_vx_i8mf2_m(...) __riscv_vaadd_vx_i8mf2_tumu(__VA_ARGS__)
+#define vaadd_vv_i8m1_m(...) __riscv_vaadd_vv_i8m1_tumu(__VA_ARGS__)
+#define vaadd_vx_i8m1_m(...) __riscv_vaadd_vx_i8m1_tumu(__VA_ARGS__)
+#define vaadd_vv_i8m2_m(...) __riscv_vaadd_vv_i8m2_tumu(__VA_ARGS__)
+#define vaadd_vx_i8m2_m(...) __riscv_vaadd_vx_i8m2_tumu(__VA_ARGS__)
+#define vaadd_vv_i8m4_m(...) __riscv_vaadd_vv_i8m4_tumu(__VA_ARGS__)
+#define vaadd_vx_i8m4_m(...) __riscv_vaadd_vx_i8m4_tumu(__VA_ARGS__)
+#define vaadd_vv_i8m8_m(...) __riscv_vaadd_vv_i8m8_tumu(__VA_ARGS__)
+#define vaadd_vx_i8m8_m(...) __riscv_vaadd_vx_i8m8_tumu(__VA_ARGS__)
+#define vaadd_vv_i16mf4_m(...) __riscv_vaadd_vv_i16mf4_tumu(__VA_ARGS__)
+#define vaadd_vx_i16mf4_m(...) __riscv_vaadd_vx_i16mf4_tumu(__VA_ARGS__)
+#define vaadd_vv_i16mf2_m(...) __riscv_vaadd_vv_i16mf2_tumu(__VA_ARGS__)
+#define vaadd_vx_i16mf2_m(...) __riscv_vaadd_vx_i16mf2_tumu(__VA_ARGS__)
+#define vaadd_vv_i16m1_m(...) __riscv_vaadd_vv_i16m1_tumu(__VA_ARGS__)
+#define vaadd_vx_i16m1_m(...) __riscv_vaadd_vx_i16m1_tumu(__VA_ARGS__)
+#define vaadd_vv_i16m2_m(...) __riscv_vaadd_vv_i16m2_tumu(__VA_ARGS__)
+#define vaadd_vx_i16m2_m(...) __riscv_vaadd_vx_i16m2_tumu(__VA_ARGS__)
+#define vaadd_vv_i16m4_m(...) __riscv_vaadd_vv_i16m4_tumu(__VA_ARGS__)
+#define vaadd_vx_i16m4_m(...) __riscv_vaadd_vx_i16m4_tumu(__VA_ARGS__)
+#define vaadd_vv_i16m8_m(...) __riscv_vaadd_vv_i16m8_tumu(__VA_ARGS__)
+#define vaadd_vx_i16m8_m(...) __riscv_vaadd_vx_i16m8_tumu(__VA_ARGS__)
+#define vaadd_vv_i32mf2_m(...) __riscv_vaadd_vv_i32mf2_tumu(__VA_ARGS__)
+#define vaadd_vx_i32mf2_m(...) __riscv_vaadd_vx_i32mf2_tumu(__VA_ARGS__)
+#define vaadd_vv_i32m1_m(...) __riscv_vaadd_vv_i32m1_tumu(__VA_ARGS__)
+#define vaadd_vx_i32m1_m(...) __riscv_vaadd_vx_i32m1_tumu(__VA_ARGS__)
+#define vaadd_vv_i32m2_m(...) __riscv_vaadd_vv_i32m2_tumu(__VA_ARGS__)
+#define vaadd_vx_i32m2_m(...) __riscv_vaadd_vx_i32m2_tumu(__VA_ARGS__)
+#define vaadd_vv_i32m4_m(...) __riscv_vaadd_vv_i32m4_tumu(__VA_ARGS__)
+#define vaadd_vx_i32m4_m(...) __riscv_vaadd_vx_i32m4_tumu(__VA_ARGS__)
+#define vaadd_vv_i32m8_m(...) __riscv_vaadd_vv_i32m8_tumu(__VA_ARGS__)
+#define vaadd_vx_i32m8_m(...) __riscv_vaadd_vx_i32m8_tumu(__VA_ARGS__)
+#define vaadd_vv_i64m1_m(...) __riscv_vaadd_vv_i64m1_tumu(__VA_ARGS__)
+#define vaadd_vx_i64m1_m(...) __riscv_vaadd_vx_i64m1_tumu(__VA_ARGS__)
+#define vaadd_vv_i64m2_m(...) __riscv_vaadd_vv_i64m2_tumu(__VA_ARGS__)
+#define vaadd_vx_i64m2_m(...) __riscv_vaadd_vx_i64m2_tumu(__VA_ARGS__)
+#define vaadd_vv_i64m4_m(...) __riscv_vaadd_vv_i64m4_tumu(__VA_ARGS__)
+#define vaadd_vx_i64m4_m(...) __riscv_vaadd_vx_i64m4_tumu(__VA_ARGS__)
+#define vaadd_vv_i64m8_m(...) __riscv_vaadd_vv_i64m8_tumu(__VA_ARGS__)
+#define vaadd_vx_i64m8_m(...) __riscv_vaadd_vx_i64m8_tumu(__VA_ARGS__)
+#define vasub_vv_i8mf8_m(...) __riscv_vasub_vv_i8mf8_tumu(__VA_ARGS__) // vasub masked compat: legacy *_m name -> __riscv_*_tumu intrinsic, arguments forwarded unchanged
+#define vasub_vx_i8mf8_m(...) __riscv_vasub_vx_i8mf8_tumu(__VA_ARGS__)
+#define vasub_vv_i8mf4_m(...) __riscv_vasub_vv_i8mf4_tumu(__VA_ARGS__)
+#define vasub_vx_i8mf4_m(...) __riscv_vasub_vx_i8mf4_tumu(__VA_ARGS__)
+#define vasub_vv_i8mf2_m(...) __riscv_vasub_vv_i8mf2_tumu(__VA_ARGS__)
+#define vasub_vx_i8mf2_m(...) __riscv_vasub_vx_i8mf2_tumu(__VA_ARGS__)
+#define vasub_vv_i8m1_m(...) __riscv_vasub_vv_i8m1_tumu(__VA_ARGS__)
+#define vasub_vx_i8m1_m(...) __riscv_vasub_vx_i8m1_tumu(__VA_ARGS__)
+#define vasub_vv_i8m2_m(...) __riscv_vasub_vv_i8m2_tumu(__VA_ARGS__)
+#define vasub_vx_i8m2_m(...) __riscv_vasub_vx_i8m2_tumu(__VA_ARGS__)
+#define vasub_vv_i8m4_m(...) __riscv_vasub_vv_i8m4_tumu(__VA_ARGS__)
+#define vasub_vx_i8m4_m(...) __riscv_vasub_vx_i8m4_tumu(__VA_ARGS__)
+#define vasub_vv_i8m8_m(...) __riscv_vasub_vv_i8m8_tumu(__VA_ARGS__)
+#define vasub_vx_i8m8_m(...) __riscv_vasub_vx_i8m8_tumu(__VA_ARGS__)
+#define vasub_vv_i16mf4_m(...) __riscv_vasub_vv_i16mf4_tumu(__VA_ARGS__)
+#define vasub_vx_i16mf4_m(...) __riscv_vasub_vx_i16mf4_tumu(__VA_ARGS__)
+#define vasub_vv_i16mf2_m(...) __riscv_vasub_vv_i16mf2_tumu(__VA_ARGS__)
+#define vasub_vx_i16mf2_m(...) __riscv_vasub_vx_i16mf2_tumu(__VA_ARGS__)
+#define vasub_vv_i16m1_m(...) __riscv_vasub_vv_i16m1_tumu(__VA_ARGS__)
+#define vasub_vx_i16m1_m(...) __riscv_vasub_vx_i16m1_tumu(__VA_ARGS__)
+#define vasub_vv_i16m2_m(...) __riscv_vasub_vv_i16m2_tumu(__VA_ARGS__)
+#define vasub_vx_i16m2_m(...) __riscv_vasub_vx_i16m2_tumu(__VA_ARGS__)
+#define vasub_vv_i16m4_m(...) __riscv_vasub_vv_i16m4_tumu(__VA_ARGS__)
+#define vasub_vx_i16m4_m(...) __riscv_vasub_vx_i16m4_tumu(__VA_ARGS__)
+#define vasub_vv_i16m8_m(...) __riscv_vasub_vv_i16m8_tumu(__VA_ARGS__)
+#define vasub_vx_i16m8_m(...) __riscv_vasub_vx_i16m8_tumu(__VA_ARGS__)
+#define vasub_vv_i32mf2_m(...) __riscv_vasub_vv_i32mf2_tumu(__VA_ARGS__)
+#define vasub_vx_i32mf2_m(...) __riscv_vasub_vx_i32mf2_tumu(__VA_ARGS__)
+#define vasub_vv_i32m1_m(...) __riscv_vasub_vv_i32m1_tumu(__VA_ARGS__)
+#define vasub_vx_i32m1_m(...) __riscv_vasub_vx_i32m1_tumu(__VA_ARGS__)
+#define vasub_vv_i32m2_m(...) __riscv_vasub_vv_i32m2_tumu(__VA_ARGS__)
+#define vasub_vx_i32m2_m(...) __riscv_vasub_vx_i32m2_tumu(__VA_ARGS__)
+#define vasub_vv_i32m4_m(...) __riscv_vasub_vv_i32m4_tumu(__VA_ARGS__)
+#define vasub_vx_i32m4_m(...) __riscv_vasub_vx_i32m4_tumu(__VA_ARGS__)
+#define vasub_vv_i32m8_m(...) __riscv_vasub_vv_i32m8_tumu(__VA_ARGS__)
+#define vasub_vx_i32m8_m(...) __riscv_vasub_vx_i32m8_tumu(__VA_ARGS__)
+#define vasub_vv_i64m1_m(...) __riscv_vasub_vv_i64m1_tumu(__VA_ARGS__)
+#define vasub_vx_i64m1_m(...) __riscv_vasub_vx_i64m1_tumu(__VA_ARGS__)
+#define vasub_vv_i64m2_m(...) __riscv_vasub_vv_i64m2_tumu(__VA_ARGS__)
+#define vasub_vx_i64m2_m(...) __riscv_vasub_vx_i64m2_tumu(__VA_ARGS__)
+#define vasub_vv_i64m4_m(...) __riscv_vasub_vv_i64m4_tumu(__VA_ARGS__)
+#define vasub_vx_i64m4_m(...) __riscv_vasub_vx_i64m4_tumu(__VA_ARGS__)
+#define vasub_vv_i64m8_m(...) __riscv_vasub_vv_i64m8_tumu(__VA_ARGS__)
+#define vasub_vx_i64m8_m(...) __riscv_vasub_vx_i64m8_tumu(__VA_ARGS__)
+#define vaaddu_vv_u8mf8_m(...) __riscv_vaaddu_vv_u8mf8_tumu(__VA_ARGS__) // vaaddu masked compat: legacy *_m name -> __riscv_*_tumu intrinsic, arguments forwarded unchanged
+#define vaaddu_vx_u8mf8_m(...) __riscv_vaaddu_vx_u8mf8_tumu(__VA_ARGS__)
+#define vaaddu_vv_u8mf4_m(...) __riscv_vaaddu_vv_u8mf4_tumu(__VA_ARGS__)
+#define vaaddu_vx_u8mf4_m(...) __riscv_vaaddu_vx_u8mf4_tumu(__VA_ARGS__)
+#define vaaddu_vv_u8mf2_m(...) __riscv_vaaddu_vv_u8mf2_tumu(__VA_ARGS__)
+#define vaaddu_vx_u8mf2_m(...) __riscv_vaaddu_vx_u8mf2_tumu(__VA_ARGS__)
+#define vaaddu_vv_u8m1_m(...) __riscv_vaaddu_vv_u8m1_tumu(__VA_ARGS__)
+#define vaaddu_vx_u8m1_m(...) __riscv_vaaddu_vx_u8m1_tumu(__VA_ARGS__)
+#define vaaddu_vv_u8m2_m(...) __riscv_vaaddu_vv_u8m2_tumu(__VA_ARGS__)
+#define vaaddu_vx_u8m2_m(...) __riscv_vaaddu_vx_u8m2_tumu(__VA_ARGS__)
+#define vaaddu_vv_u8m4_m(...) __riscv_vaaddu_vv_u8m4_tumu(__VA_ARGS__)
+#define vaaddu_vx_u8m4_m(...) __riscv_vaaddu_vx_u8m4_tumu(__VA_ARGS__)
+#define vaaddu_vv_u8m8_m(...) __riscv_vaaddu_vv_u8m8_tumu(__VA_ARGS__)
+#define vaaddu_vx_u8m8_m(...) __riscv_vaaddu_vx_u8m8_tumu(__VA_ARGS__)
+#define vaaddu_vv_u16mf4_m(...) __riscv_vaaddu_vv_u16mf4_tumu(__VA_ARGS__)
+#define vaaddu_vx_u16mf4_m(...) __riscv_vaaddu_vx_u16mf4_tumu(__VA_ARGS__)
+#define vaaddu_vv_u16mf2_m(...) __riscv_vaaddu_vv_u16mf2_tumu(__VA_ARGS__)
+#define vaaddu_vx_u16mf2_m(...) __riscv_vaaddu_vx_u16mf2_tumu(__VA_ARGS__)
+#define vaaddu_vv_u16m1_m(...) __riscv_vaaddu_vv_u16m1_tumu(__VA_ARGS__)
+#define vaaddu_vx_u16m1_m(...) __riscv_vaaddu_vx_u16m1_tumu(__VA_ARGS__)
+#define vaaddu_vv_u16m2_m(...) __riscv_vaaddu_vv_u16m2_tumu(__VA_ARGS__)
+#define vaaddu_vx_u16m2_m(...) __riscv_vaaddu_vx_u16m2_tumu(__VA_ARGS__)
+#define vaaddu_vv_u16m4_m(...) __riscv_vaaddu_vv_u16m4_tumu(__VA_ARGS__)
+#define vaaddu_vx_u16m4_m(...) __riscv_vaaddu_vx_u16m4_tumu(__VA_ARGS__)
+#define vaaddu_vv_u16m8_m(...) __riscv_vaaddu_vv_u16m8_tumu(__VA_ARGS__)
+#define vaaddu_vx_u16m8_m(...) __riscv_vaaddu_vx_u16m8_tumu(__VA_ARGS__)
+#define vaaddu_vv_u32mf2_m(...) __riscv_vaaddu_vv_u32mf2_tumu(__VA_ARGS__)
+#define vaaddu_vx_u32mf2_m(...) __riscv_vaaddu_vx_u32mf2_tumu(__VA_ARGS__)
+#define vaaddu_vv_u32m1_m(...) __riscv_vaaddu_vv_u32m1_tumu(__VA_ARGS__)
+#define vaaddu_vx_u32m1_m(...) __riscv_vaaddu_vx_u32m1_tumu(__VA_ARGS__)
+#define vaaddu_vv_u32m2_m(...) __riscv_vaaddu_vv_u32m2_tumu(__VA_ARGS__)
+#define vaaddu_vx_u32m2_m(...) __riscv_vaaddu_vx_u32m2_tumu(__VA_ARGS__)
+#define vaaddu_vv_u32m4_m(...) __riscv_vaaddu_vv_u32m4_tumu(__VA_ARGS__)
+#define vaaddu_vx_u32m4_m(...) __riscv_vaaddu_vx_u32m4_tumu(__VA_ARGS__)
+#define vaaddu_vv_u32m8_m(...) __riscv_vaaddu_vv_u32m8_tumu(__VA_ARGS__)
+#define vaaddu_vx_u32m8_m(...) __riscv_vaaddu_vx_u32m8_tumu(__VA_ARGS__)
+#define vaaddu_vv_u64m1_m(...) __riscv_vaaddu_vv_u64m1_tumu(__VA_ARGS__)
+#define vaaddu_vx_u64m1_m(...) __riscv_vaaddu_vx_u64m1_tumu(__VA_ARGS__)
+#define vaaddu_vv_u64m2_m(...) __riscv_vaaddu_vv_u64m2_tumu(__VA_ARGS__)
+#define vaaddu_vx_u64m2_m(...) __riscv_vaaddu_vx_u64m2_tumu(__VA_ARGS__)
+#define vaaddu_vv_u64m4_m(...) __riscv_vaaddu_vv_u64m4_tumu(__VA_ARGS__)
+#define vaaddu_vx_u64m4_m(...) __riscv_vaaddu_vx_u64m4_tumu(__VA_ARGS__)
+#define vaaddu_vv_u64m8_m(...) __riscv_vaaddu_vv_u64m8_tumu(__VA_ARGS__)
+#define vaaddu_vx_u64m8_m(...) __riscv_vaaddu_vx_u64m8_tumu(__VA_ARGS__)
+#define vasubu_vv_u8mf8_m(...) __riscv_vasubu_vv_u8mf8_tumu(__VA_ARGS__) // vasubu masked compat: legacy *_m name -> __riscv_*_tumu intrinsic, arguments forwarded unchanged
+#define vasubu_vx_u8mf8_m(...) __riscv_vasubu_vx_u8mf8_tumu(__VA_ARGS__)
+#define vasubu_vv_u8mf4_m(...) __riscv_vasubu_vv_u8mf4_tumu(__VA_ARGS__)
+#define vasubu_vx_u8mf4_m(...) __riscv_vasubu_vx_u8mf4_tumu(__VA_ARGS__)
+#define vasubu_vv_u8mf2_m(...) __riscv_vasubu_vv_u8mf2_tumu(__VA_ARGS__)
+#define vasubu_vx_u8mf2_m(...) __riscv_vasubu_vx_u8mf2_tumu(__VA_ARGS__)
+#define vasubu_vv_u8m1_m(...) __riscv_vasubu_vv_u8m1_tumu(__VA_ARGS__)
+#define vasubu_vx_u8m1_m(...) __riscv_vasubu_vx_u8m1_tumu(__VA_ARGS__)
+#define vasubu_vv_u8m2_m(...) __riscv_vasubu_vv_u8m2_tumu(__VA_ARGS__)
+#define vasubu_vx_u8m2_m(...) __riscv_vasubu_vx_u8m2_tumu(__VA_ARGS__)
+#define vasubu_vv_u8m4_m(...) __riscv_vasubu_vv_u8m4_tumu(__VA_ARGS__)
+#define vasubu_vx_u8m4_m(...) __riscv_vasubu_vx_u8m4_tumu(__VA_ARGS__)
+#define vasubu_vv_u8m8_m(...) __riscv_vasubu_vv_u8m8_tumu(__VA_ARGS__)
+#define vasubu_vx_u8m8_m(...) __riscv_vasubu_vx_u8m8_tumu(__VA_ARGS__)
+#define vasubu_vv_u16mf4_m(...) __riscv_vasubu_vv_u16mf4_tumu(__VA_ARGS__)
+#define vasubu_vx_u16mf4_m(...) __riscv_vasubu_vx_u16mf4_tumu(__VA_ARGS__)
+#define vasubu_vv_u16mf2_m(...) __riscv_vasubu_vv_u16mf2_tumu(__VA_ARGS__)
+#define vasubu_vx_u16mf2_m(...) __riscv_vasubu_vx_u16mf2_tumu(__VA_ARGS__)
+#define vasubu_vv_u16m1_m(...) __riscv_vasubu_vv_u16m1_tumu(__VA_ARGS__)
+#define vasubu_vx_u16m1_m(...) __riscv_vasubu_vx_u16m1_tumu(__VA_ARGS__)
+#define vasubu_vv_u16m2_m(...) __riscv_vasubu_vv_u16m2_tumu(__VA_ARGS__)
+#define vasubu_vx_u16m2_m(...) __riscv_vasubu_vx_u16m2_tumu(__VA_ARGS__)
+#define vasubu_vv_u16m4_m(...) __riscv_vasubu_vv_u16m4_tumu(__VA_ARGS__)
+#define vasubu_vx_u16m4_m(...) __riscv_vasubu_vx_u16m4_tumu(__VA_ARGS__)
+#define vasubu_vv_u16m8_m(...) __riscv_vasubu_vv_u16m8_tumu(__VA_ARGS__)
+#define vasubu_vx_u16m8_m(...) __riscv_vasubu_vx_u16m8_tumu(__VA_ARGS__)
+#define vasubu_vv_u32mf2_m(...) __riscv_vasubu_vv_u32mf2_tumu(__VA_ARGS__)
+#define vasubu_vx_u32mf2_m(...) __riscv_vasubu_vx_u32mf2_tumu(__VA_ARGS__)
+#define vasubu_vv_u32m1_m(...) __riscv_vasubu_vv_u32m1_tumu(__VA_ARGS__)
+#define vasubu_vx_u32m1_m(...) __riscv_vasubu_vx_u32m1_tumu(__VA_ARGS__)
+#define vasubu_vv_u32m2_m(...) __riscv_vasubu_vv_u32m2_tumu(__VA_ARGS__)
+#define vasubu_vx_u32m2_m(...) __riscv_vasubu_vx_u32m2_tumu(__VA_ARGS__)
+#define vasubu_vv_u32m4_m(...) __riscv_vasubu_vv_u32m4_tumu(__VA_ARGS__)
+#define vasubu_vx_u32m4_m(...) __riscv_vasubu_vx_u32m4_tumu(__VA_ARGS__)
+#define vasubu_vv_u32m8_m(...) __riscv_vasubu_vv_u32m8_tumu(__VA_ARGS__)
+#define vasubu_vx_u32m8_m(...) __riscv_vasubu_vx_u32m8_tumu(__VA_ARGS__)
+#define vasubu_vv_u64m1_m(...) __riscv_vasubu_vv_u64m1_tumu(__VA_ARGS__)
+#define vasubu_vx_u64m1_m(...) __riscv_vasubu_vx_u64m1_tumu(__VA_ARGS__)
+#define vasubu_vv_u64m2_m(...) __riscv_vasubu_vv_u64m2_tumu(__VA_ARGS__)
+#define vasubu_vx_u64m2_m(...) __riscv_vasubu_vx_u64m2_tumu(__VA_ARGS__)
+#define vasubu_vv_u64m4_m(...) __riscv_vasubu_vv_u64m4_tumu(__VA_ARGS__)
+#define vasubu_vx_u64m4_m(...) __riscv_vasubu_vx_u64m4_tumu(__VA_ARGS__)
+#define vasubu_vv_u64m8_m(...) __riscv_vasubu_vv_u64m8_tumu(__VA_ARGS__)
+#define vasubu_vx_u64m8_m(...) __riscv_vasubu_vx_u64m8_tumu(__VA_ARGS__)
+#define vsmul_vv_i8mf8(...) __riscv_vsmul_vv_i8mf8(__VA_ARGS__) // vsmul unmasked compat: legacy unprefixed name -> same-named __riscv_ intrinsic, arguments forwarded unchanged
+#define vsmul_vx_i8mf8(...) __riscv_vsmul_vx_i8mf8(__VA_ARGS__)
+#define vsmul_vv_i8mf4(...) __riscv_vsmul_vv_i8mf4(__VA_ARGS__)
+#define vsmul_vx_i8mf4(...) __riscv_vsmul_vx_i8mf4(__VA_ARGS__)
+#define vsmul_vv_i8mf2(...) __riscv_vsmul_vv_i8mf2(__VA_ARGS__)
+#define vsmul_vx_i8mf2(...) __riscv_vsmul_vx_i8mf2(__VA_ARGS__)
+#define vsmul_vv_i8m1(...) __riscv_vsmul_vv_i8m1(__VA_ARGS__)
+#define vsmul_vx_i8m1(...) __riscv_vsmul_vx_i8m1(__VA_ARGS__)
+#define vsmul_vv_i8m2(...) __riscv_vsmul_vv_i8m2(__VA_ARGS__)
+#define vsmul_vx_i8m2(...) __riscv_vsmul_vx_i8m2(__VA_ARGS__)
+#define vsmul_vv_i8m4(...) __riscv_vsmul_vv_i8m4(__VA_ARGS__)
+#define vsmul_vx_i8m4(...) __riscv_vsmul_vx_i8m4(__VA_ARGS__)
+#define vsmul_vv_i8m8(...) __riscv_vsmul_vv_i8m8(__VA_ARGS__)
+#define vsmul_vx_i8m8(...) __riscv_vsmul_vx_i8m8(__VA_ARGS__)
+#define vsmul_vv_i16mf4(...) __riscv_vsmul_vv_i16mf4(__VA_ARGS__)
+#define vsmul_vx_i16mf4(...) __riscv_vsmul_vx_i16mf4(__VA_ARGS__)
+#define vsmul_vv_i16mf2(...) __riscv_vsmul_vv_i16mf2(__VA_ARGS__)
+#define vsmul_vx_i16mf2(...) __riscv_vsmul_vx_i16mf2(__VA_ARGS__)
+#define vsmul_vv_i16m1(...) __riscv_vsmul_vv_i16m1(__VA_ARGS__)
+#define vsmul_vx_i16m1(...) __riscv_vsmul_vx_i16m1(__VA_ARGS__)
+#define vsmul_vv_i16m2(...) __riscv_vsmul_vv_i16m2(__VA_ARGS__)
+#define vsmul_vx_i16m2(...) __riscv_vsmul_vx_i16m2(__VA_ARGS__)
+#define vsmul_vv_i16m4(...) __riscv_vsmul_vv_i16m4(__VA_ARGS__)
+#define vsmul_vx_i16m4(...) __riscv_vsmul_vx_i16m4(__VA_ARGS__)
+#define vsmul_vv_i16m8(...) __riscv_vsmul_vv_i16m8(__VA_ARGS__)
+#define vsmul_vx_i16m8(...) __riscv_vsmul_vx_i16m8(__VA_ARGS__)
+#define vsmul_vv_i32mf2(...) __riscv_vsmul_vv_i32mf2(__VA_ARGS__)
+#define vsmul_vx_i32mf2(...) __riscv_vsmul_vx_i32mf2(__VA_ARGS__)
+#define vsmul_vv_i32m1(...) __riscv_vsmul_vv_i32m1(__VA_ARGS__)
+#define vsmul_vx_i32m1(...) __riscv_vsmul_vx_i32m1(__VA_ARGS__)
+#define vsmul_vv_i32m2(...) __riscv_vsmul_vv_i32m2(__VA_ARGS__)
+#define vsmul_vx_i32m2(...) __riscv_vsmul_vx_i32m2(__VA_ARGS__)
+#define vsmul_vv_i32m4(...) __riscv_vsmul_vv_i32m4(__VA_ARGS__)
+#define vsmul_vx_i32m4(...) __riscv_vsmul_vx_i32m4(__VA_ARGS__)
+#define vsmul_vv_i32m8(...) __riscv_vsmul_vv_i32m8(__VA_ARGS__)
+#define vsmul_vx_i32m8(...) __riscv_vsmul_vx_i32m8(__VA_ARGS__)
+#define vsmul_vv_i64m1(...) __riscv_vsmul_vv_i64m1(__VA_ARGS__)
+#define vsmul_vx_i64m1(...) __riscv_vsmul_vx_i64m1(__VA_ARGS__)
+#define vsmul_vv_i64m2(...) __riscv_vsmul_vv_i64m2(__VA_ARGS__)
+#define vsmul_vx_i64m2(...) __riscv_vsmul_vx_i64m2(__VA_ARGS__)
+#define vsmul_vv_i64m4(...) __riscv_vsmul_vv_i64m4(__VA_ARGS__)
+#define vsmul_vx_i64m4(...) __riscv_vsmul_vx_i64m4(__VA_ARGS__)
+#define vsmul_vv_i64m8(...) __riscv_vsmul_vv_i64m8(__VA_ARGS__)
+#define vsmul_vx_i64m8(...) __riscv_vsmul_vx_i64m8(__VA_ARGS__)
+// masked functions: vsmul legacy *_m names forward their arguments to the __riscv_*_mu intrinsics. NOTE(review): the vaadd/vasub/vssra/vssrl/vnclip *_m wrappers in this header map to _tumu instead - confirm the _mu policy suffix is intended here.
+#define vsmul_vv_i8mf8_m(...) __riscv_vsmul_vv_i8mf8_mu(__VA_ARGS__)
+#define vsmul_vx_i8mf8_m(...) __riscv_vsmul_vx_i8mf8_mu(__VA_ARGS__)
+#define vsmul_vv_i8mf4_m(...) __riscv_vsmul_vv_i8mf4_mu(__VA_ARGS__)
+#define vsmul_vx_i8mf4_m(...) __riscv_vsmul_vx_i8mf4_mu(__VA_ARGS__)
+#define vsmul_vv_i8mf2_m(...) __riscv_vsmul_vv_i8mf2_mu(__VA_ARGS__)
+#define vsmul_vx_i8mf2_m(...) __riscv_vsmul_vx_i8mf2_mu(__VA_ARGS__)
+#define vsmul_vv_i8m1_m(...) __riscv_vsmul_vv_i8m1_mu(__VA_ARGS__)
+#define vsmul_vx_i8m1_m(...) __riscv_vsmul_vx_i8m1_mu(__VA_ARGS__)
+#define vsmul_vv_i8m2_m(...) __riscv_vsmul_vv_i8m2_mu(__VA_ARGS__)
+#define vsmul_vx_i8m2_m(...) __riscv_vsmul_vx_i8m2_mu(__VA_ARGS__)
+#define vsmul_vv_i8m4_m(...) __riscv_vsmul_vv_i8m4_mu(__VA_ARGS__)
+#define vsmul_vx_i8m4_m(...) __riscv_vsmul_vx_i8m4_mu(__VA_ARGS__)
+#define vsmul_vv_i8m8_m(...) __riscv_vsmul_vv_i8m8_mu(__VA_ARGS__)
+#define vsmul_vx_i8m8_m(...) __riscv_vsmul_vx_i8m8_mu(__VA_ARGS__)
+#define vsmul_vv_i16mf4_m(...) __riscv_vsmul_vv_i16mf4_mu(__VA_ARGS__)
+#define vsmul_vx_i16mf4_m(...) __riscv_vsmul_vx_i16mf4_mu(__VA_ARGS__)
+#define vsmul_vv_i16mf2_m(...) __riscv_vsmul_vv_i16mf2_mu(__VA_ARGS__)
+#define vsmul_vx_i16mf2_m(...) __riscv_vsmul_vx_i16mf2_mu(__VA_ARGS__)
+#define vsmul_vv_i16m1_m(...) __riscv_vsmul_vv_i16m1_mu(__VA_ARGS__)
+#define vsmul_vx_i16m1_m(...) __riscv_vsmul_vx_i16m1_mu(__VA_ARGS__)
+#define vsmul_vv_i16m2_m(...) __riscv_vsmul_vv_i16m2_mu(__VA_ARGS__)
+#define vsmul_vx_i16m2_m(...) __riscv_vsmul_vx_i16m2_mu(__VA_ARGS__)
+#define vsmul_vv_i16m4_m(...) __riscv_vsmul_vv_i16m4_mu(__VA_ARGS__)
+#define vsmul_vx_i16m4_m(...) __riscv_vsmul_vx_i16m4_mu(__VA_ARGS__)
+#define vsmul_vv_i16m8_m(...) __riscv_vsmul_vv_i16m8_mu(__VA_ARGS__)
+#define vsmul_vx_i16m8_m(...) __riscv_vsmul_vx_i16m8_mu(__VA_ARGS__)
+#define vsmul_vv_i32mf2_m(...) __riscv_vsmul_vv_i32mf2_mu(__VA_ARGS__)
+#define vsmul_vx_i32mf2_m(...) __riscv_vsmul_vx_i32mf2_mu(__VA_ARGS__)
+#define vsmul_vv_i32m1_m(...) __riscv_vsmul_vv_i32m1_mu(__VA_ARGS__)
+#define vsmul_vx_i32m1_m(...) __riscv_vsmul_vx_i32m1_mu(__VA_ARGS__)
+#define vsmul_vv_i32m2_m(...) __riscv_vsmul_vv_i32m2_mu(__VA_ARGS__)
+#define vsmul_vx_i32m2_m(...) __riscv_vsmul_vx_i32m2_mu(__VA_ARGS__)
+#define vsmul_vv_i32m4_m(...) __riscv_vsmul_vv_i32m4_mu(__VA_ARGS__)
+#define vsmul_vx_i32m4_m(...) __riscv_vsmul_vx_i32m4_mu(__VA_ARGS__)
+#define vsmul_vv_i32m8_m(...) __riscv_vsmul_vv_i32m8_mu(__VA_ARGS__)
+#define vsmul_vx_i32m8_m(...) __riscv_vsmul_vx_i32m8_mu(__VA_ARGS__)
+#define vsmul_vv_i64m1_m(...) __riscv_vsmul_vv_i64m1_mu(__VA_ARGS__)
+#define vsmul_vx_i64m1_m(...) __riscv_vsmul_vx_i64m1_mu(__VA_ARGS__)
+#define vsmul_vv_i64m2_m(...) __riscv_vsmul_vv_i64m2_mu(__VA_ARGS__)
+#define vsmul_vx_i64m2_m(...) __riscv_vsmul_vx_i64m2_mu(__VA_ARGS__)
+#define vsmul_vv_i64m4_m(...) __riscv_vsmul_vv_i64m4_mu(__VA_ARGS__)
+#define vsmul_vx_i64m4_m(...) __riscv_vsmul_vx_i64m4_mu(__VA_ARGS__)
+#define vsmul_vv_i64m8_m(...) __riscv_vsmul_vv_i64m8_mu(__VA_ARGS__)
+#define vsmul_vx_i64m8_m(...) __riscv_vsmul_vx_i64m8_mu(__VA_ARGS__)
+#define vssra_vv_i8mf8(...) __riscv_vssra_vv_i8mf8(__VA_ARGS__) // vssra unmasked compat: legacy unprefixed name -> same-named __riscv_ intrinsic, arguments forwarded unchanged
+#define vssra_vx_i8mf8(...) __riscv_vssra_vx_i8mf8(__VA_ARGS__)
+#define vssra_vv_i8mf4(...) __riscv_vssra_vv_i8mf4(__VA_ARGS__)
+#define vssra_vx_i8mf4(...) __riscv_vssra_vx_i8mf4(__VA_ARGS__)
+#define vssra_vv_i8mf2(...) __riscv_vssra_vv_i8mf2(__VA_ARGS__)
+#define vssra_vx_i8mf2(...) __riscv_vssra_vx_i8mf2(__VA_ARGS__)
+#define vssra_vv_i8m1(...) __riscv_vssra_vv_i8m1(__VA_ARGS__)
+#define vssra_vx_i8m1(...) __riscv_vssra_vx_i8m1(__VA_ARGS__)
+#define vssra_vv_i8m2(...) __riscv_vssra_vv_i8m2(__VA_ARGS__)
+#define vssra_vx_i8m2(...) __riscv_vssra_vx_i8m2(__VA_ARGS__)
+#define vssra_vv_i8m4(...) __riscv_vssra_vv_i8m4(__VA_ARGS__)
+#define vssra_vx_i8m4(...) __riscv_vssra_vx_i8m4(__VA_ARGS__)
+#define vssra_vv_i8m8(...) __riscv_vssra_vv_i8m8(__VA_ARGS__)
+#define vssra_vx_i8m8(...) __riscv_vssra_vx_i8m8(__VA_ARGS__)
+#define vssra_vv_i16mf4(...) __riscv_vssra_vv_i16mf4(__VA_ARGS__)
+#define vssra_vx_i16mf4(...) __riscv_vssra_vx_i16mf4(__VA_ARGS__)
+#define vssra_vv_i16mf2(...) __riscv_vssra_vv_i16mf2(__VA_ARGS__)
+#define vssra_vx_i16mf2(...) __riscv_vssra_vx_i16mf2(__VA_ARGS__)
+#define vssra_vv_i16m1(...) __riscv_vssra_vv_i16m1(__VA_ARGS__)
+#define vssra_vx_i16m1(...) __riscv_vssra_vx_i16m1(__VA_ARGS__)
+#define vssra_vv_i16m2(...) __riscv_vssra_vv_i16m2(__VA_ARGS__)
+#define vssra_vx_i16m2(...) __riscv_vssra_vx_i16m2(__VA_ARGS__)
+#define vssra_vv_i16m4(...) __riscv_vssra_vv_i16m4(__VA_ARGS__)
+#define vssra_vx_i16m4(...) __riscv_vssra_vx_i16m4(__VA_ARGS__)
+#define vssra_vv_i16m8(...) __riscv_vssra_vv_i16m8(__VA_ARGS__)
+#define vssra_vx_i16m8(...) __riscv_vssra_vx_i16m8(__VA_ARGS__)
+#define vssra_vv_i32mf2(...) __riscv_vssra_vv_i32mf2(__VA_ARGS__)
+#define vssra_vx_i32mf2(...) __riscv_vssra_vx_i32mf2(__VA_ARGS__)
+#define vssra_vv_i32m1(...) __riscv_vssra_vv_i32m1(__VA_ARGS__)
+#define vssra_vx_i32m1(...) __riscv_vssra_vx_i32m1(__VA_ARGS__)
+#define vssra_vv_i32m2(...) __riscv_vssra_vv_i32m2(__VA_ARGS__)
+#define vssra_vx_i32m2(...) __riscv_vssra_vx_i32m2(__VA_ARGS__)
+#define vssra_vv_i32m4(...) __riscv_vssra_vv_i32m4(__VA_ARGS__)
+#define vssra_vx_i32m4(...) __riscv_vssra_vx_i32m4(__VA_ARGS__)
+#define vssra_vv_i32m8(...) __riscv_vssra_vv_i32m8(__VA_ARGS__)
+#define vssra_vx_i32m8(...) __riscv_vssra_vx_i32m8(__VA_ARGS__)
+#define vssra_vv_i64m1(...) __riscv_vssra_vv_i64m1(__VA_ARGS__)
+#define vssra_vx_i64m1(...) __riscv_vssra_vx_i64m1(__VA_ARGS__)
+#define vssra_vv_i64m2(...) __riscv_vssra_vv_i64m2(__VA_ARGS__)
+#define vssra_vx_i64m2(...) __riscv_vssra_vx_i64m2(__VA_ARGS__)
+#define vssra_vv_i64m4(...) __riscv_vssra_vv_i64m4(__VA_ARGS__)
+#define vssra_vx_i64m4(...) __riscv_vssra_vx_i64m4(__VA_ARGS__)
+#define vssra_vv_i64m8(...) __riscv_vssra_vv_i64m8(__VA_ARGS__)
+#define vssra_vx_i64m8(...) __riscv_vssra_vx_i64m8(__VA_ARGS__)
+#define vssrl_vv_u8mf8(...) __riscv_vssrl_vv_u8mf8(__VA_ARGS__) // vssrl unmasked compat: legacy unprefixed name -> same-named __riscv_ intrinsic, arguments forwarded unchanged
+#define vssrl_vx_u8mf8(...) __riscv_vssrl_vx_u8mf8(__VA_ARGS__)
+#define vssrl_vv_u8mf4(...) __riscv_vssrl_vv_u8mf4(__VA_ARGS__)
+#define vssrl_vx_u8mf4(...) __riscv_vssrl_vx_u8mf4(__VA_ARGS__)
+#define vssrl_vv_u8mf2(...) __riscv_vssrl_vv_u8mf2(__VA_ARGS__)
+#define vssrl_vx_u8mf2(...) __riscv_vssrl_vx_u8mf2(__VA_ARGS__)
+#define vssrl_vv_u8m1(...) __riscv_vssrl_vv_u8m1(__VA_ARGS__)
+#define vssrl_vx_u8m1(...) __riscv_vssrl_vx_u8m1(__VA_ARGS__)
+#define vssrl_vv_u8m2(...) __riscv_vssrl_vv_u8m2(__VA_ARGS__)
+#define vssrl_vx_u8m2(...) __riscv_vssrl_vx_u8m2(__VA_ARGS__)
+#define vssrl_vv_u8m4(...) __riscv_vssrl_vv_u8m4(__VA_ARGS__)
+#define vssrl_vx_u8m4(...) __riscv_vssrl_vx_u8m4(__VA_ARGS__)
+#define vssrl_vv_u8m8(...) __riscv_vssrl_vv_u8m8(__VA_ARGS__)
+#define vssrl_vx_u8m8(...) __riscv_vssrl_vx_u8m8(__VA_ARGS__)
+#define vssrl_vv_u16mf4(...) __riscv_vssrl_vv_u16mf4(__VA_ARGS__)
+#define vssrl_vx_u16mf4(...) __riscv_vssrl_vx_u16mf4(__VA_ARGS__)
+#define vssrl_vv_u16mf2(...) __riscv_vssrl_vv_u16mf2(__VA_ARGS__)
+#define vssrl_vx_u16mf2(...) __riscv_vssrl_vx_u16mf2(__VA_ARGS__)
+#define vssrl_vv_u16m1(...) __riscv_vssrl_vv_u16m1(__VA_ARGS__)
+#define vssrl_vx_u16m1(...) __riscv_vssrl_vx_u16m1(__VA_ARGS__)
+#define vssrl_vv_u16m2(...) __riscv_vssrl_vv_u16m2(__VA_ARGS__)
+#define vssrl_vx_u16m2(...) __riscv_vssrl_vx_u16m2(__VA_ARGS__)
+#define vssrl_vv_u16m4(...) __riscv_vssrl_vv_u16m4(__VA_ARGS__)
+#define vssrl_vx_u16m4(...) __riscv_vssrl_vx_u16m4(__VA_ARGS__)
+#define vssrl_vv_u16m8(...) __riscv_vssrl_vv_u16m8(__VA_ARGS__)
+#define vssrl_vx_u16m8(...) __riscv_vssrl_vx_u16m8(__VA_ARGS__)
+#define vssrl_vv_u32mf2(...) __riscv_vssrl_vv_u32mf2(__VA_ARGS__)
+#define vssrl_vx_u32mf2(...) __riscv_vssrl_vx_u32mf2(__VA_ARGS__)
+#define vssrl_vv_u32m1(...) __riscv_vssrl_vv_u32m1(__VA_ARGS__)
+#define vssrl_vx_u32m1(...) __riscv_vssrl_vx_u32m1(__VA_ARGS__)
+#define vssrl_vv_u32m2(...) __riscv_vssrl_vv_u32m2(__VA_ARGS__)
+#define vssrl_vx_u32m2(...) __riscv_vssrl_vx_u32m2(__VA_ARGS__)
+#define vssrl_vv_u32m4(...) __riscv_vssrl_vv_u32m4(__VA_ARGS__)
+#define vssrl_vx_u32m4(...) __riscv_vssrl_vx_u32m4(__VA_ARGS__)
+#define vssrl_vv_u32m8(...) __riscv_vssrl_vv_u32m8(__VA_ARGS__)
+#define vssrl_vx_u32m8(...) __riscv_vssrl_vx_u32m8(__VA_ARGS__)
+#define vssrl_vv_u64m1(...) __riscv_vssrl_vv_u64m1(__VA_ARGS__)
+#define vssrl_vx_u64m1(...) __riscv_vssrl_vx_u64m1(__VA_ARGS__)
+#define vssrl_vv_u64m2(...) __riscv_vssrl_vv_u64m2(__VA_ARGS__)
+#define vssrl_vx_u64m2(...) __riscv_vssrl_vx_u64m2(__VA_ARGS__)
+#define vssrl_vv_u64m4(...) __riscv_vssrl_vv_u64m4(__VA_ARGS__)
+#define vssrl_vx_u64m4(...) __riscv_vssrl_vx_u64m4(__VA_ARGS__)
+#define vssrl_vv_u64m8(...) __riscv_vssrl_vv_u64m8(__VA_ARGS__)
+#define vssrl_vx_u64m8(...) __riscv_vssrl_vx_u64m8(__VA_ARGS__)
+// masked functions: vssra/vssrl legacy *_m names forward their arguments unchanged to the __riscv_*_tumu intrinsics
+#define vssra_vv_i8mf8_m(...) __riscv_vssra_vv_i8mf8_tumu(__VA_ARGS__)
+#define vssra_vx_i8mf8_m(...) __riscv_vssra_vx_i8mf8_tumu(__VA_ARGS__)
+#define vssra_vv_i8mf4_m(...) __riscv_vssra_vv_i8mf4_tumu(__VA_ARGS__)
+#define vssra_vx_i8mf4_m(...) __riscv_vssra_vx_i8mf4_tumu(__VA_ARGS__)
+#define vssra_vv_i8mf2_m(...) __riscv_vssra_vv_i8mf2_tumu(__VA_ARGS__)
+#define vssra_vx_i8mf2_m(...) __riscv_vssra_vx_i8mf2_tumu(__VA_ARGS__)
+#define vssra_vv_i8m1_m(...) __riscv_vssra_vv_i8m1_tumu(__VA_ARGS__)
+#define vssra_vx_i8m1_m(...) __riscv_vssra_vx_i8m1_tumu(__VA_ARGS__)
+#define vssra_vv_i8m2_m(...) __riscv_vssra_vv_i8m2_tumu(__VA_ARGS__)
+#define vssra_vx_i8m2_m(...) __riscv_vssra_vx_i8m2_tumu(__VA_ARGS__)
+#define vssra_vv_i8m4_m(...) __riscv_vssra_vv_i8m4_tumu(__VA_ARGS__)
+#define vssra_vx_i8m4_m(...) __riscv_vssra_vx_i8m4_tumu(__VA_ARGS__)
+#define vssra_vv_i8m8_m(...) __riscv_vssra_vv_i8m8_tumu(__VA_ARGS__)
+#define vssra_vx_i8m8_m(...) __riscv_vssra_vx_i8m8_tumu(__VA_ARGS__)
+#define vssra_vv_i16mf4_m(...) __riscv_vssra_vv_i16mf4_tumu(__VA_ARGS__)
+#define vssra_vx_i16mf4_m(...) __riscv_vssra_vx_i16mf4_tumu(__VA_ARGS__)
+#define vssra_vv_i16mf2_m(...) __riscv_vssra_vv_i16mf2_tumu(__VA_ARGS__)
+#define vssra_vx_i16mf2_m(...) __riscv_vssra_vx_i16mf2_tumu(__VA_ARGS__)
+#define vssra_vv_i16m1_m(...) __riscv_vssra_vv_i16m1_tumu(__VA_ARGS__)
+#define vssra_vx_i16m1_m(...) __riscv_vssra_vx_i16m1_tumu(__VA_ARGS__)
+#define vssra_vv_i16m2_m(...) __riscv_vssra_vv_i16m2_tumu(__VA_ARGS__)
+#define vssra_vx_i16m2_m(...) __riscv_vssra_vx_i16m2_tumu(__VA_ARGS__)
+#define vssra_vv_i16m4_m(...) __riscv_vssra_vv_i16m4_tumu(__VA_ARGS__)
+#define vssra_vx_i16m4_m(...) __riscv_vssra_vx_i16m4_tumu(__VA_ARGS__)
+#define vssra_vv_i16m8_m(...) __riscv_vssra_vv_i16m8_tumu(__VA_ARGS__)
+#define vssra_vx_i16m8_m(...) __riscv_vssra_vx_i16m8_tumu(__VA_ARGS__)
+#define vssra_vv_i32mf2_m(...) __riscv_vssra_vv_i32mf2_tumu(__VA_ARGS__)
+#define vssra_vx_i32mf2_m(...) __riscv_vssra_vx_i32mf2_tumu(__VA_ARGS__)
+#define vssra_vv_i32m1_m(...) __riscv_vssra_vv_i32m1_tumu(__VA_ARGS__)
+#define vssra_vx_i32m1_m(...) __riscv_vssra_vx_i32m1_tumu(__VA_ARGS__)
+#define vssra_vv_i32m2_m(...) __riscv_vssra_vv_i32m2_tumu(__VA_ARGS__)
+#define vssra_vx_i32m2_m(...) __riscv_vssra_vx_i32m2_tumu(__VA_ARGS__)
+#define vssra_vv_i32m4_m(...) __riscv_vssra_vv_i32m4_tumu(__VA_ARGS__)
+#define vssra_vx_i32m4_m(...) __riscv_vssra_vx_i32m4_tumu(__VA_ARGS__)
+#define vssra_vv_i32m8_m(...) __riscv_vssra_vv_i32m8_tumu(__VA_ARGS__)
+#define vssra_vx_i32m8_m(...) __riscv_vssra_vx_i32m8_tumu(__VA_ARGS__)
+#define vssra_vv_i64m1_m(...) __riscv_vssra_vv_i64m1_tumu(__VA_ARGS__)
+#define vssra_vx_i64m1_m(...) __riscv_vssra_vx_i64m1_tumu(__VA_ARGS__)
+#define vssra_vv_i64m2_m(...) __riscv_vssra_vv_i64m2_tumu(__VA_ARGS__)
+#define vssra_vx_i64m2_m(...) __riscv_vssra_vx_i64m2_tumu(__VA_ARGS__)
+#define vssra_vv_i64m4_m(...) __riscv_vssra_vv_i64m4_tumu(__VA_ARGS__)
+#define vssra_vx_i64m4_m(...) __riscv_vssra_vx_i64m4_tumu(__VA_ARGS__)
+#define vssra_vv_i64m8_m(...) __riscv_vssra_vv_i64m8_tumu(__VA_ARGS__)
+#define vssra_vx_i64m8_m(...) __riscv_vssra_vx_i64m8_tumu(__VA_ARGS__)
+#define vssrl_vv_u8mf8_m(...) __riscv_vssrl_vv_u8mf8_tumu(__VA_ARGS__) // vssrl masked compat: legacy *_m name -> __riscv_*_tumu intrinsic, arguments forwarded unchanged
+#define vssrl_vx_u8mf8_m(...) __riscv_vssrl_vx_u8mf8_tumu(__VA_ARGS__)
+#define vssrl_vv_u8mf4_m(...) __riscv_vssrl_vv_u8mf4_tumu(__VA_ARGS__)
+#define vssrl_vx_u8mf4_m(...) __riscv_vssrl_vx_u8mf4_tumu(__VA_ARGS__)
+#define vssrl_vv_u8mf2_m(...) __riscv_vssrl_vv_u8mf2_tumu(__VA_ARGS__)
+#define vssrl_vx_u8mf2_m(...) __riscv_vssrl_vx_u8mf2_tumu(__VA_ARGS__)
+#define vssrl_vv_u8m1_m(...) __riscv_vssrl_vv_u8m1_tumu(__VA_ARGS__)
+#define vssrl_vx_u8m1_m(...) __riscv_vssrl_vx_u8m1_tumu(__VA_ARGS__)
+#define vssrl_vv_u8m2_m(...) __riscv_vssrl_vv_u8m2_tumu(__VA_ARGS__)
+#define vssrl_vx_u8m2_m(...) __riscv_vssrl_vx_u8m2_tumu(__VA_ARGS__)
+#define vssrl_vv_u8m4_m(...) __riscv_vssrl_vv_u8m4_tumu(__VA_ARGS__)
+#define vssrl_vx_u8m4_m(...) __riscv_vssrl_vx_u8m4_tumu(__VA_ARGS__)
+#define vssrl_vv_u8m8_m(...) __riscv_vssrl_vv_u8m8_tumu(__VA_ARGS__)
+#define vssrl_vx_u8m8_m(...) __riscv_vssrl_vx_u8m8_tumu(__VA_ARGS__)
+#define vssrl_vv_u16mf4_m(...) __riscv_vssrl_vv_u16mf4_tumu(__VA_ARGS__)
+#define vssrl_vx_u16mf4_m(...) __riscv_vssrl_vx_u16mf4_tumu(__VA_ARGS__)
+#define vssrl_vv_u16mf2_m(...) __riscv_vssrl_vv_u16mf2_tumu(__VA_ARGS__)
+#define vssrl_vx_u16mf2_m(...) __riscv_vssrl_vx_u16mf2_tumu(__VA_ARGS__)
+#define vssrl_vv_u16m1_m(...) __riscv_vssrl_vv_u16m1_tumu(__VA_ARGS__)
+#define vssrl_vx_u16m1_m(...) __riscv_vssrl_vx_u16m1_tumu(__VA_ARGS__)
+#define vssrl_vv_u16m2_m(...) __riscv_vssrl_vv_u16m2_tumu(__VA_ARGS__)
+#define vssrl_vx_u16m2_m(...) __riscv_vssrl_vx_u16m2_tumu(__VA_ARGS__)
+#define vssrl_vv_u16m4_m(...) __riscv_vssrl_vv_u16m4_tumu(__VA_ARGS__)
+#define vssrl_vx_u16m4_m(...) __riscv_vssrl_vx_u16m4_tumu(__VA_ARGS__)
+#define vssrl_vv_u16m8_m(...) __riscv_vssrl_vv_u16m8_tumu(__VA_ARGS__)
+#define vssrl_vx_u16m8_m(...) __riscv_vssrl_vx_u16m8_tumu(__VA_ARGS__)
+#define vssrl_vv_u32mf2_m(...) __riscv_vssrl_vv_u32mf2_tumu(__VA_ARGS__)
+#define vssrl_vx_u32mf2_m(...) __riscv_vssrl_vx_u32mf2_tumu(__VA_ARGS__)
+#define vssrl_vv_u32m1_m(...) __riscv_vssrl_vv_u32m1_tumu(__VA_ARGS__)
+#define vssrl_vx_u32m1_m(...) __riscv_vssrl_vx_u32m1_tumu(__VA_ARGS__)
+#define vssrl_vv_u32m2_m(...) __riscv_vssrl_vv_u32m2_tumu(__VA_ARGS__)
+#define vssrl_vx_u32m2_m(...) __riscv_vssrl_vx_u32m2_tumu(__VA_ARGS__)
+#define vssrl_vv_u32m4_m(...) __riscv_vssrl_vv_u32m4_tumu(__VA_ARGS__)
+#define vssrl_vx_u32m4_m(...) __riscv_vssrl_vx_u32m4_tumu(__VA_ARGS__)
+#define vssrl_vv_u32m8_m(...) __riscv_vssrl_vv_u32m8_tumu(__VA_ARGS__)
+#define vssrl_vx_u32m8_m(...) __riscv_vssrl_vx_u32m8_tumu(__VA_ARGS__)
+#define vssrl_vv_u64m1_m(...) __riscv_vssrl_vv_u64m1_tumu(__VA_ARGS__)
+#define vssrl_vx_u64m1_m(...) __riscv_vssrl_vx_u64m1_tumu(__VA_ARGS__)
+#define vssrl_vv_u64m2_m(...) __riscv_vssrl_vv_u64m2_tumu(__VA_ARGS__)
+#define vssrl_vx_u64m2_m(...) __riscv_vssrl_vx_u64m2_tumu(__VA_ARGS__)
+#define vssrl_vv_u64m4_m(...) __riscv_vssrl_vv_u64m4_tumu(__VA_ARGS__)
+#define vssrl_vx_u64m4_m(...) __riscv_vssrl_vx_u64m4_tumu(__VA_ARGS__)
+#define vssrl_vv_u64m8_m(...) __riscv_vssrl_vv_u64m8_tumu(__VA_ARGS__)
+#define vssrl_vx_u64m8_m(...) __riscv_vssrl_vx_u64m8_tumu(__VA_ARGS__)
+#define vnclip_wv_i8mf8(...) __riscv_vnclip_wv_i8mf8(__VA_ARGS__) // vnclip unmasked compat: legacy unprefixed name -> same-named __riscv_ intrinsic, arguments forwarded unchanged
+#define vnclip_wx_i8mf8(...) __riscv_vnclip_wx_i8mf8(__VA_ARGS__)
+#define vnclip_wv_i8mf4(...) __riscv_vnclip_wv_i8mf4(__VA_ARGS__)
+#define vnclip_wx_i8mf4(...) __riscv_vnclip_wx_i8mf4(__VA_ARGS__)
+#define vnclip_wv_i8mf2(...) __riscv_vnclip_wv_i8mf2(__VA_ARGS__)
+#define vnclip_wx_i8mf2(...) __riscv_vnclip_wx_i8mf2(__VA_ARGS__)
+#define vnclip_wv_i8m1(...) __riscv_vnclip_wv_i8m1(__VA_ARGS__)
+#define vnclip_wx_i8m1(...) __riscv_vnclip_wx_i8m1(__VA_ARGS__)
+#define vnclip_wv_i8m2(...) __riscv_vnclip_wv_i8m2(__VA_ARGS__)
+#define vnclip_wx_i8m2(...) __riscv_vnclip_wx_i8m2(__VA_ARGS__)
+#define vnclip_wv_i8m4(...) __riscv_vnclip_wv_i8m4(__VA_ARGS__)
+#define vnclip_wx_i8m4(...) __riscv_vnclip_wx_i8m4(__VA_ARGS__)
+#define vnclip_wv_i16mf4(...) __riscv_vnclip_wv_i16mf4(__VA_ARGS__)
+#define vnclip_wx_i16mf4(...) __riscv_vnclip_wx_i16mf4(__VA_ARGS__)
+#define vnclip_wv_i16mf2(...) __riscv_vnclip_wv_i16mf2(__VA_ARGS__)
+#define vnclip_wx_i16mf2(...) __riscv_vnclip_wx_i16mf2(__VA_ARGS__)
+#define vnclip_wv_i16m1(...) __riscv_vnclip_wv_i16m1(__VA_ARGS__)
+#define vnclip_wx_i16m1(...) __riscv_vnclip_wx_i16m1(__VA_ARGS__)
+#define vnclip_wv_i16m2(...) __riscv_vnclip_wv_i16m2(__VA_ARGS__)
+#define vnclip_wx_i16m2(...) __riscv_vnclip_wx_i16m2(__VA_ARGS__)
+#define vnclip_wv_i16m4(...) __riscv_vnclip_wv_i16m4(__VA_ARGS__)
+#define vnclip_wx_i16m4(...) __riscv_vnclip_wx_i16m4(__VA_ARGS__)
+#define vnclip_wv_i32mf2(...) __riscv_vnclip_wv_i32mf2(__VA_ARGS__)
+#define vnclip_wx_i32mf2(...) __riscv_vnclip_wx_i32mf2(__VA_ARGS__)
+#define vnclip_wv_i32m1(...) __riscv_vnclip_wv_i32m1(__VA_ARGS__)
+#define vnclip_wx_i32m1(...) __riscv_vnclip_wx_i32m1(__VA_ARGS__)
+#define vnclip_wv_i32m2(...) __riscv_vnclip_wv_i32m2(__VA_ARGS__)
+#define vnclip_wx_i32m2(...) __riscv_vnclip_wx_i32m2(__VA_ARGS__)
+#define vnclip_wv_i32m4(...) __riscv_vnclip_wv_i32m4(__VA_ARGS__)
+#define vnclip_wx_i32m4(...) __riscv_vnclip_wx_i32m4(__VA_ARGS__)
+#define vnclipu_wv_u8mf8(...) __riscv_vnclipu_wv_u8mf8(__VA_ARGS__) // vnclipu unmasked compat: legacy unprefixed name -> same-named __riscv_ intrinsic, arguments forwarded unchanged
+#define vnclipu_wx_u8mf8(...) __riscv_vnclipu_wx_u8mf8(__VA_ARGS__)
+#define vnclipu_wv_u8mf4(...) __riscv_vnclipu_wv_u8mf4(__VA_ARGS__)
+#define vnclipu_wx_u8mf4(...) __riscv_vnclipu_wx_u8mf4(__VA_ARGS__)
+#define vnclipu_wv_u8mf2(...) __riscv_vnclipu_wv_u8mf2(__VA_ARGS__)
+#define vnclipu_wx_u8mf2(...) __riscv_vnclipu_wx_u8mf2(__VA_ARGS__)
+#define vnclipu_wv_u8m1(...) __riscv_vnclipu_wv_u8m1(__VA_ARGS__)
+#define vnclipu_wx_u8m1(...) __riscv_vnclipu_wx_u8m1(__VA_ARGS__)
+#define vnclipu_wv_u8m2(...) __riscv_vnclipu_wv_u8m2(__VA_ARGS__)
+#define vnclipu_wx_u8m2(...) __riscv_vnclipu_wx_u8m2(__VA_ARGS__)
+#define vnclipu_wv_u8m4(...) __riscv_vnclipu_wv_u8m4(__VA_ARGS__)
+#define vnclipu_wx_u8m4(...) __riscv_vnclipu_wx_u8m4(__VA_ARGS__)
+#define vnclipu_wv_u16mf4(...) __riscv_vnclipu_wv_u16mf4(__VA_ARGS__)
+#define vnclipu_wx_u16mf4(...) __riscv_vnclipu_wx_u16mf4(__VA_ARGS__)
+#define vnclipu_wv_u16mf2(...) __riscv_vnclipu_wv_u16mf2(__VA_ARGS__)
+#define vnclipu_wx_u16mf2(...) __riscv_vnclipu_wx_u16mf2(__VA_ARGS__)
+#define vnclipu_wv_u16m1(...) __riscv_vnclipu_wv_u16m1(__VA_ARGS__)
+#define vnclipu_wx_u16m1(...) __riscv_vnclipu_wx_u16m1(__VA_ARGS__)
+#define vnclipu_wv_u16m2(...) __riscv_vnclipu_wv_u16m2(__VA_ARGS__)
+#define vnclipu_wx_u16m2(...) __riscv_vnclipu_wx_u16m2(__VA_ARGS__)
+#define vnclipu_wv_u16m4(...) __riscv_vnclipu_wv_u16m4(__VA_ARGS__)
+#define vnclipu_wx_u16m4(...) __riscv_vnclipu_wx_u16m4(__VA_ARGS__)
+#define vnclipu_wv_u32mf2(...) __riscv_vnclipu_wv_u32mf2(__VA_ARGS__)
+#define vnclipu_wx_u32mf2(...) __riscv_vnclipu_wx_u32mf2(__VA_ARGS__)
+#define vnclipu_wv_u32m1(...) __riscv_vnclipu_wv_u32m1(__VA_ARGS__)
+#define vnclipu_wx_u32m1(...) __riscv_vnclipu_wx_u32m1(__VA_ARGS__)
+#define vnclipu_wv_u32m2(...) __riscv_vnclipu_wv_u32m2(__VA_ARGS__)
+#define vnclipu_wx_u32m2(...) __riscv_vnclipu_wx_u32m2(__VA_ARGS__)
+#define vnclipu_wv_u32m4(...) __riscv_vnclipu_wv_u32m4(__VA_ARGS__)
+#define vnclipu_wx_u32m4(...) __riscv_vnclipu_wx_u32m4(__VA_ARGS__)
+// masked functions
+#define vnclip_wv_i8mf8_m(...) __riscv_vnclip_wv_i8mf8_tumu(__VA_ARGS__)
+#define vnclip_wx_i8mf8_m(...) __riscv_vnclip_wx_i8mf8_tumu(__VA_ARGS__)
+#define vnclip_wv_i8mf4_m(...) __riscv_vnclip_wv_i8mf4_tumu(__VA_ARGS__)
+#define vnclip_wx_i8mf4_m(...) __riscv_vnclip_wx_i8mf4_tumu(__VA_ARGS__)
+#define vnclip_wv_i8mf2_m(...) __riscv_vnclip_wv_i8mf2_tumu(__VA_ARGS__)
+#define vnclip_wx_i8mf2_m(...) __riscv_vnclip_wx_i8mf2_tumu(__VA_ARGS__)
+#define vnclip_wv_i8m1_m(...) __riscv_vnclip_wv_i8m1_tumu(__VA_ARGS__)
+#define vnclip_wx_i8m1_m(...) __riscv_vnclip_wx_i8m1_tumu(__VA_ARGS__)
+#define vnclip_wv_i8m2_m(...) __riscv_vnclip_wv_i8m2_tumu(__VA_ARGS__)
+#define vnclip_wx_i8m2_m(...) __riscv_vnclip_wx_i8m2_tumu(__VA_ARGS__)
+#define vnclip_wv_i8m4_m(...) __riscv_vnclip_wv_i8m4_tumu(__VA_ARGS__)
+#define vnclip_wx_i8m4_m(...) __riscv_vnclip_wx_i8m4_tumu(__VA_ARGS__)
+#define vnclip_wv_i16mf4_m(...) __riscv_vnclip_wv_i16mf4_tumu(__VA_ARGS__)
+#define vnclip_wx_i16mf4_m(...) __riscv_vnclip_wx_i16mf4_tumu(__VA_ARGS__)
+#define vnclip_wv_i16mf2_m(...) __riscv_vnclip_wv_i16mf2_tumu(__VA_ARGS__)
+#define vnclip_wx_i16mf2_m(...) __riscv_vnclip_wx_i16mf2_tumu(__VA_ARGS__)
+#define vnclip_wv_i16m1_m(...) __riscv_vnclip_wv_i16m1_tumu(__VA_ARGS__)
+#define vnclip_wx_i16m1_m(...) __riscv_vnclip_wx_i16m1_tumu(__VA_ARGS__)
+#define vnclip_wv_i16m2_m(...) __riscv_vnclip_wv_i16m2_tumu(__VA_ARGS__)
+#define vnclip_wx_i16m2_m(...) __riscv_vnclip_wx_i16m2_tumu(__VA_ARGS__)
+#define vnclip_wv_i16m4_m(...) __riscv_vnclip_wv_i16m4_tumu(__VA_ARGS__)
+#define vnclip_wx_i16m4_m(...) __riscv_vnclip_wx_i16m4_tumu(__VA_ARGS__)
+#define vnclip_wv_i32mf2_m(...) __riscv_vnclip_wv_i32mf2_tumu(__VA_ARGS__)
+#define vnclip_wx_i32mf2_m(...) __riscv_vnclip_wx_i32mf2_tumu(__VA_ARGS__)
+#define vnclip_wv_i32m1_m(...) __riscv_vnclip_wv_i32m1_tumu(__VA_ARGS__)
+#define vnclip_wx_i32m1_m(...) __riscv_vnclip_wx_i32m1_tumu(__VA_ARGS__)
+#define vnclip_wv_i32m2_m(...) __riscv_vnclip_wv_i32m2_tumu(__VA_ARGS__)
+#define vnclip_wx_i32m2_m(...) __riscv_vnclip_wx_i32m2_tumu(__VA_ARGS__)
+#define vnclip_wv_i32m4_m(...) __riscv_vnclip_wv_i32m4_tumu(__VA_ARGS__)
+#define vnclip_wx_i32m4_m(...) __riscv_vnclip_wx_i32m4_tumu(__VA_ARGS__)
+#define vnclipu_wv_u8mf8_m(...) __riscv_vnclipu_wv_u8mf8_tumu(__VA_ARGS__)
+#define vnclipu_wx_u8mf8_m(...) __riscv_vnclipu_wx_u8mf8_tumu(__VA_ARGS__)
+#define vnclipu_wv_u8mf4_m(...) __riscv_vnclipu_wv_u8mf4_tumu(__VA_ARGS__)
+#define vnclipu_wx_u8mf4_m(...) __riscv_vnclipu_wx_u8mf4_tumu(__VA_ARGS__)
+#define vnclipu_wv_u8mf2_m(...) __riscv_vnclipu_wv_u8mf2_tumu(__VA_ARGS__)
+#define vnclipu_wx_u8mf2_m(...) __riscv_vnclipu_wx_u8mf2_tumu(__VA_ARGS__)
+#define vnclipu_wv_u8m1_m(...) __riscv_vnclipu_wv_u8m1_tumu(__VA_ARGS__)
+#define vnclipu_wx_u8m1_m(...) __riscv_vnclipu_wx_u8m1_tumu(__VA_ARGS__)
+#define vnclipu_wv_u8m2_m(...) __riscv_vnclipu_wv_u8m2_tumu(__VA_ARGS__)
+#define vnclipu_wx_u8m2_m(...) __riscv_vnclipu_wx_u8m2_tumu(__VA_ARGS__)
+#define vnclipu_wv_u8m4_m(...) __riscv_vnclipu_wv_u8m4_tumu(__VA_ARGS__)
+#define vnclipu_wx_u8m4_m(...) __riscv_vnclipu_wx_u8m4_tumu(__VA_ARGS__)
+#define vnclipu_wv_u16mf4_m(...) __riscv_vnclipu_wv_u16mf4_tumu(__VA_ARGS__)
+#define vnclipu_wx_u16mf4_m(...) __riscv_vnclipu_wx_u16mf4_tumu(__VA_ARGS__)
+#define vnclipu_wv_u16mf2_m(...) __riscv_vnclipu_wv_u16mf2_tumu(__VA_ARGS__)
+#define vnclipu_wx_u16mf2_m(...) __riscv_vnclipu_wx_u16mf2_tumu(__VA_ARGS__)
+#define vnclipu_wv_u16m1_m(...) __riscv_vnclipu_wv_u16m1_tumu(__VA_ARGS__)
+#define vnclipu_wx_u16m1_m(...) __riscv_vnclipu_wx_u16m1_tumu(__VA_ARGS__)
+#define vnclipu_wv_u16m2_m(...) __riscv_vnclipu_wv_u16m2_tumu(__VA_ARGS__)
+#define vnclipu_wx_u16m2_m(...) __riscv_vnclipu_wx_u16m2_tumu(__VA_ARGS__)
+#define vnclipu_wv_u16m4_m(...) __riscv_vnclipu_wv_u16m4_tumu(__VA_ARGS__)
+#define vnclipu_wx_u16m4_m(...) __riscv_vnclipu_wx_u16m4_tumu(__VA_ARGS__)
+#define vnclipu_wv_u32mf2_m(...) __riscv_vnclipu_wv_u32mf2_tumu(__VA_ARGS__)
+#define vnclipu_wx_u32mf2_m(...) __riscv_vnclipu_wx_u32mf2_tumu(__VA_ARGS__)
+#define vnclipu_wv_u32m1_m(...) __riscv_vnclipu_wv_u32m1_tumu(__VA_ARGS__)
+#define vnclipu_wx_u32m1_m(...) __riscv_vnclipu_wx_u32m1_tumu(__VA_ARGS__)
+#define vnclipu_wv_u32m2_m(...) __riscv_vnclipu_wv_u32m2_tumu(__VA_ARGS__)
+#define vnclipu_wx_u32m2_m(...) __riscv_vnclipu_wx_u32m2_tumu(__VA_ARGS__)
+#define vnclipu_wv_u32m4_m(...) __riscv_vnclipu_wv_u32m4_tumu(__VA_ARGS__)
+#define vnclipu_wx_u32m4_m(...) __riscv_vnclipu_wx_u32m4_tumu(__VA_ARGS__)
+#define vfadd_vv_f16mf4(...) __riscv_vfadd_vv_f16mf4(__VA_ARGS__)
+#define vfadd_vf_f16mf4(...) __riscv_vfadd_vf_f16mf4(__VA_ARGS__)
+#define vfadd_vv_f16mf2(...) __riscv_vfadd_vv_f16mf2(__VA_ARGS__)
+#define vfadd_vf_f16mf2(...) __riscv_vfadd_vf_f16mf2(__VA_ARGS__)
+#define vfadd_vv_f16m1(...) __riscv_vfadd_vv_f16m1(__VA_ARGS__)
+#define vfadd_vf_f16m1(...) __riscv_vfadd_vf_f16m1(__VA_ARGS__)
+#define vfadd_vv_f16m2(...) __riscv_vfadd_vv_f16m2(__VA_ARGS__)
+#define vfadd_vf_f16m2(...) __riscv_vfadd_vf_f16m2(__VA_ARGS__)
+#define vfadd_vv_f16m4(...) __riscv_vfadd_vv_f16m4(__VA_ARGS__)
+#define vfadd_vf_f16m4(...) __riscv_vfadd_vf_f16m4(__VA_ARGS__)
+#define vfadd_vv_f16m8(...) __riscv_vfadd_vv_f16m8(__VA_ARGS__)
+#define vfadd_vf_f16m8(...) __riscv_vfadd_vf_f16m8(__VA_ARGS__)
+#define vfadd_vv_f32mf2(...) __riscv_vfadd_vv_f32mf2(__VA_ARGS__)
+#define vfadd_vf_f32mf2(...) __riscv_vfadd_vf_f32mf2(__VA_ARGS__)
+#define vfadd_vv_f32m1(...) __riscv_vfadd_vv_f32m1(__VA_ARGS__)
+#define vfadd_vf_f32m1(...) __riscv_vfadd_vf_f32m1(__VA_ARGS__)
+#define vfadd_vv_f32m2(...) __riscv_vfadd_vv_f32m2(__VA_ARGS__)
+#define vfadd_vf_f32m2(...) __riscv_vfadd_vf_f32m2(__VA_ARGS__)
+#define vfadd_vv_f32m4(...) __riscv_vfadd_vv_f32m4(__VA_ARGS__)
+#define vfadd_vf_f32m4(...) __riscv_vfadd_vf_f32m4(__VA_ARGS__)
+#define vfadd_vv_f32m8(...) __riscv_vfadd_vv_f32m8(__VA_ARGS__)
+#define vfadd_vf_f32m8(...) __riscv_vfadd_vf_f32m8(__VA_ARGS__)
+#define vfadd_vv_f64m1(...) __riscv_vfadd_vv_f64m1(__VA_ARGS__)
+#define vfadd_vf_f64m1(...) __riscv_vfadd_vf_f64m1(__VA_ARGS__)
+#define vfadd_vv_f64m2(...) __riscv_vfadd_vv_f64m2(__VA_ARGS__)
+#define vfadd_vf_f64m2(...) __riscv_vfadd_vf_f64m2(__VA_ARGS__)
+#define vfadd_vv_f64m4(...) __riscv_vfadd_vv_f64m4(__VA_ARGS__)
+#define vfadd_vf_f64m4(...) __riscv_vfadd_vf_f64m4(__VA_ARGS__)
+#define vfadd_vv_f64m8(...) __riscv_vfadd_vv_f64m8(__VA_ARGS__)
+#define vfadd_vf_f64m8(...) __riscv_vfadd_vf_f64m8(__VA_ARGS__)
+#define vfsub_vv_f16mf4(...) __riscv_vfsub_vv_f16mf4(__VA_ARGS__)
+#define vfsub_vf_f16mf4(...) __riscv_vfsub_vf_f16mf4(__VA_ARGS__)
+#define vfsub_vv_f16mf2(...) __riscv_vfsub_vv_f16mf2(__VA_ARGS__)
+#define vfsub_vf_f16mf2(...) __riscv_vfsub_vf_f16mf2(__VA_ARGS__)
+#define vfsub_vv_f16m1(...) __riscv_vfsub_vv_f16m1(__VA_ARGS__)
+#define vfsub_vf_f16m1(...) __riscv_vfsub_vf_f16m1(__VA_ARGS__)
+#define vfsub_vv_f16m2(...) __riscv_vfsub_vv_f16m2(__VA_ARGS__)
+#define vfsub_vf_f16m2(...) __riscv_vfsub_vf_f16m2(__VA_ARGS__)
+#define vfsub_vv_f16m4(...) __riscv_vfsub_vv_f16m4(__VA_ARGS__)
+#define vfsub_vf_f16m4(...) __riscv_vfsub_vf_f16m4(__VA_ARGS__)
+#define vfsub_vv_f16m8(...) __riscv_vfsub_vv_f16m8(__VA_ARGS__)
+#define vfsub_vf_f16m8(...) __riscv_vfsub_vf_f16m8(__VA_ARGS__)
+#define vfsub_vv_f32mf2(...) __riscv_vfsub_vv_f32mf2(__VA_ARGS__)
+#define vfsub_vf_f32mf2(...) __riscv_vfsub_vf_f32mf2(__VA_ARGS__)
+#define vfsub_vv_f32m1(...) __riscv_vfsub_vv_f32m1(__VA_ARGS__)
+#define vfsub_vf_f32m1(...) __riscv_vfsub_vf_f32m1(__VA_ARGS__)
+#define vfsub_vv_f32m2(...) __riscv_vfsub_vv_f32m2(__VA_ARGS__)
+#define vfsub_vf_f32m2(...) __riscv_vfsub_vf_f32m2(__VA_ARGS__)
+#define vfsub_vv_f32m4(...) __riscv_vfsub_vv_f32m4(__VA_ARGS__)
+#define vfsub_vf_f32m4(...) __riscv_vfsub_vf_f32m4(__VA_ARGS__)
+#define vfsub_vv_f32m8(...) __riscv_vfsub_vv_f32m8(__VA_ARGS__)
+#define vfsub_vf_f32m8(...) __riscv_vfsub_vf_f32m8(__VA_ARGS__)
+#define vfsub_vv_f64m1(...) __riscv_vfsub_vv_f64m1(__VA_ARGS__)
+#define vfsub_vf_f64m1(...) __riscv_vfsub_vf_f64m1(__VA_ARGS__)
+#define vfsub_vv_f64m2(...) __riscv_vfsub_vv_f64m2(__VA_ARGS__)
+#define vfsub_vf_f64m2(...) __riscv_vfsub_vf_f64m2(__VA_ARGS__)
+#define vfsub_vv_f64m4(...) __riscv_vfsub_vv_f64m4(__VA_ARGS__)
+#define vfsub_vf_f64m4(...) __riscv_vfsub_vf_f64m4(__VA_ARGS__)
+#define vfsub_vv_f64m8(...) __riscv_vfsub_vv_f64m8(__VA_ARGS__)
+#define vfsub_vf_f64m8(...) __riscv_vfsub_vf_f64m8(__VA_ARGS__)
+#define vfrsub_vf_f16mf4(...) __riscv_vfrsub_vf_f16mf4(__VA_ARGS__)
+#define vfrsub_vf_f16mf2(...) __riscv_vfrsub_vf_f16mf2(__VA_ARGS__)
+#define vfrsub_vf_f16m1(...) __riscv_vfrsub_vf_f16m1(__VA_ARGS__)
+#define vfrsub_vf_f16m2(...) __riscv_vfrsub_vf_f16m2(__VA_ARGS__)
+#define vfrsub_vf_f16m4(...) __riscv_vfrsub_vf_f16m4(__VA_ARGS__)
+#define vfrsub_vf_f16m8(...) __riscv_vfrsub_vf_f16m8(__VA_ARGS__)
+#define vfrsub_vf_f32mf2(...) __riscv_vfrsub_vf_f32mf2(__VA_ARGS__)
+#define vfrsub_vf_f32m1(...) __riscv_vfrsub_vf_f32m1(__VA_ARGS__)
+#define vfrsub_vf_f32m2(...) __riscv_vfrsub_vf_f32m2(__VA_ARGS__)
+#define vfrsub_vf_f32m4(...) __riscv_vfrsub_vf_f32m4(__VA_ARGS__)
+#define vfrsub_vf_f32m8(...) __riscv_vfrsub_vf_f32m8(__VA_ARGS__)
+#define vfrsub_vf_f64m1(...) __riscv_vfrsub_vf_f64m1(__VA_ARGS__)
+#define vfrsub_vf_f64m2(...) __riscv_vfrsub_vf_f64m2(__VA_ARGS__)
+#define vfrsub_vf_f64m4(...) __riscv_vfrsub_vf_f64m4(__VA_ARGS__)
+#define vfrsub_vf_f64m8(...) __riscv_vfrsub_vf_f64m8(__VA_ARGS__)
+#define vfneg_v_f16mf4(...) __riscv_vfneg_v_f16mf4(__VA_ARGS__)
+#define vfneg_v_f16mf2(...) __riscv_vfneg_v_f16mf2(__VA_ARGS__)
+#define vfneg_v_f16m1(...) __riscv_vfneg_v_f16m1(__VA_ARGS__)
+#define vfneg_v_f16m2(...) __riscv_vfneg_v_f16m2(__VA_ARGS__)
+#define vfneg_v_f16m4(...) __riscv_vfneg_v_f16m4(__VA_ARGS__)
+#define vfneg_v_f16m8(...) __riscv_vfneg_v_f16m8(__VA_ARGS__)
+#define vfneg_v_f32mf2(...) __riscv_vfneg_v_f32mf2(__VA_ARGS__)
+#define vfneg_v_f32m1(...) __riscv_vfneg_v_f32m1(__VA_ARGS__)
+#define vfneg_v_f32m2(...) __riscv_vfneg_v_f32m2(__VA_ARGS__)
+#define vfneg_v_f32m4(...) __riscv_vfneg_v_f32m4(__VA_ARGS__)
+#define vfneg_v_f32m8(...) __riscv_vfneg_v_f32m8(__VA_ARGS__)
+#define vfneg_v_f64m1(...) __riscv_vfneg_v_f64m1(__VA_ARGS__)
+#define vfneg_v_f64m2(...) __riscv_vfneg_v_f64m2(__VA_ARGS__)
+#define vfneg_v_f64m4(...) __riscv_vfneg_v_f64m4(__VA_ARGS__)
+#define vfneg_v_f64m8(...) __riscv_vfneg_v_f64m8(__VA_ARGS__)
+// masked functions: the aliases above forward to the same-named __riscv_* intrinsic; each vfadd/vfsub/vfrsub/vfneg *_m alias below forwards its arguments unchanged to the __riscv_*_tumu variant instead (NOTE(review): _tumu is presumably the tail-/mask-undisturbed policy - confirm against the RVV intrinsics spec)
+#define vfadd_vv_f16mf4_m(...) __riscv_vfadd_vv_f16mf4_tumu(__VA_ARGS__)
+#define vfadd_vf_f16mf4_m(...) __riscv_vfadd_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfadd_vv_f16mf2_m(...) __riscv_vfadd_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfadd_vf_f16mf2_m(...) __riscv_vfadd_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfadd_vv_f16m1_m(...) __riscv_vfadd_vv_f16m1_tumu(__VA_ARGS__)
+#define vfadd_vf_f16m1_m(...) __riscv_vfadd_vf_f16m1_tumu(__VA_ARGS__)
+#define vfadd_vv_f16m2_m(...) __riscv_vfadd_vv_f16m2_tumu(__VA_ARGS__)
+#define vfadd_vf_f16m2_m(...) __riscv_vfadd_vf_f16m2_tumu(__VA_ARGS__)
+#define vfadd_vv_f16m4_m(...) __riscv_vfadd_vv_f16m4_tumu(__VA_ARGS__)
+#define vfadd_vf_f16m4_m(...) __riscv_vfadd_vf_f16m4_tumu(__VA_ARGS__)
+#define vfadd_vv_f16m8_m(...) __riscv_vfadd_vv_f16m8_tumu(__VA_ARGS__)
+#define vfadd_vf_f16m8_m(...) __riscv_vfadd_vf_f16m8_tumu(__VA_ARGS__)
+#define vfadd_vv_f32mf2_m(...) __riscv_vfadd_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfadd_vf_f32mf2_m(...) __riscv_vfadd_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfadd_vv_f32m1_m(...) __riscv_vfadd_vv_f32m1_tumu(__VA_ARGS__)
+#define vfadd_vf_f32m1_m(...) __riscv_vfadd_vf_f32m1_tumu(__VA_ARGS__)
+#define vfadd_vv_f32m2_m(...) __riscv_vfadd_vv_f32m2_tumu(__VA_ARGS__)
+#define vfadd_vf_f32m2_m(...) __riscv_vfadd_vf_f32m2_tumu(__VA_ARGS__)
+#define vfadd_vv_f32m4_m(...) __riscv_vfadd_vv_f32m4_tumu(__VA_ARGS__)
+#define vfadd_vf_f32m4_m(...) __riscv_vfadd_vf_f32m4_tumu(__VA_ARGS__)
+#define vfadd_vv_f32m8_m(...) __riscv_vfadd_vv_f32m8_tumu(__VA_ARGS__)
+#define vfadd_vf_f32m8_m(...) __riscv_vfadd_vf_f32m8_tumu(__VA_ARGS__)
+#define vfadd_vv_f64m1_m(...) __riscv_vfadd_vv_f64m1_tumu(__VA_ARGS__)
+#define vfadd_vf_f64m1_m(...) __riscv_vfadd_vf_f64m1_tumu(__VA_ARGS__)
+#define vfadd_vv_f64m2_m(...) __riscv_vfadd_vv_f64m2_tumu(__VA_ARGS__)
+#define vfadd_vf_f64m2_m(...) __riscv_vfadd_vf_f64m2_tumu(__VA_ARGS__)
+#define vfadd_vv_f64m4_m(...) __riscv_vfadd_vv_f64m4_tumu(__VA_ARGS__)
+#define vfadd_vf_f64m4_m(...) __riscv_vfadd_vf_f64m4_tumu(__VA_ARGS__)
+#define vfadd_vv_f64m8_m(...) __riscv_vfadd_vv_f64m8_tumu(__VA_ARGS__)
+#define vfadd_vf_f64m8_m(...) __riscv_vfadd_vf_f64m8_tumu(__VA_ARGS__)
+#define vfsub_vv_f16mf4_m(...) __riscv_vfsub_vv_f16mf4_tumu(__VA_ARGS__)
+#define vfsub_vf_f16mf4_m(...) __riscv_vfsub_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfsub_vv_f16mf2_m(...) __riscv_vfsub_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfsub_vf_f16mf2_m(...) __riscv_vfsub_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfsub_vv_f16m1_m(...) __riscv_vfsub_vv_f16m1_tumu(__VA_ARGS__)
+#define vfsub_vf_f16m1_m(...) __riscv_vfsub_vf_f16m1_tumu(__VA_ARGS__)
+#define vfsub_vv_f16m2_m(...) __riscv_vfsub_vv_f16m2_tumu(__VA_ARGS__)
+#define vfsub_vf_f16m2_m(...) __riscv_vfsub_vf_f16m2_tumu(__VA_ARGS__)
+#define vfsub_vv_f16m4_m(...) __riscv_vfsub_vv_f16m4_tumu(__VA_ARGS__)
+#define vfsub_vf_f16m4_m(...) __riscv_vfsub_vf_f16m4_tumu(__VA_ARGS__)
+#define vfsub_vv_f16m8_m(...) __riscv_vfsub_vv_f16m8_tumu(__VA_ARGS__)
+#define vfsub_vf_f16m8_m(...) __riscv_vfsub_vf_f16m8_tumu(__VA_ARGS__)
+#define vfsub_vv_f32mf2_m(...) __riscv_vfsub_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfsub_vf_f32mf2_m(...) __riscv_vfsub_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfsub_vv_f32m1_m(...) __riscv_vfsub_vv_f32m1_tumu(__VA_ARGS__)
+#define vfsub_vf_f32m1_m(...) __riscv_vfsub_vf_f32m1_tumu(__VA_ARGS__)
+#define vfsub_vv_f32m2_m(...) __riscv_vfsub_vv_f32m2_tumu(__VA_ARGS__)
+#define vfsub_vf_f32m2_m(...) __riscv_vfsub_vf_f32m2_tumu(__VA_ARGS__)
+#define vfsub_vv_f32m4_m(...) __riscv_vfsub_vv_f32m4_tumu(__VA_ARGS__)
+#define vfsub_vf_f32m4_m(...) __riscv_vfsub_vf_f32m4_tumu(__VA_ARGS__)
+#define vfsub_vv_f32m8_m(...) __riscv_vfsub_vv_f32m8_tumu(__VA_ARGS__)
+#define vfsub_vf_f32m8_m(...) __riscv_vfsub_vf_f32m8_tumu(__VA_ARGS__)
+#define vfsub_vv_f64m1_m(...) __riscv_vfsub_vv_f64m1_tumu(__VA_ARGS__)
+#define vfsub_vf_f64m1_m(...) __riscv_vfsub_vf_f64m1_tumu(__VA_ARGS__)
+#define vfsub_vv_f64m2_m(...) __riscv_vfsub_vv_f64m2_tumu(__VA_ARGS__)
+#define vfsub_vf_f64m2_m(...) __riscv_vfsub_vf_f64m2_tumu(__VA_ARGS__)
+#define vfsub_vv_f64m4_m(...) __riscv_vfsub_vv_f64m4_tumu(__VA_ARGS__)
+#define vfsub_vf_f64m4_m(...) __riscv_vfsub_vf_f64m4_tumu(__VA_ARGS__)
+#define vfsub_vv_f64m8_m(...) __riscv_vfsub_vv_f64m8_tumu(__VA_ARGS__)
+#define vfsub_vf_f64m8_m(...) __riscv_vfsub_vf_f64m8_tumu(__VA_ARGS__)
+#define vfrsub_vf_f16mf4_m(...) __riscv_vfrsub_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfrsub_vf_f16mf2_m(...) __riscv_vfrsub_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfrsub_vf_f16m1_m(...) __riscv_vfrsub_vf_f16m1_tumu(__VA_ARGS__)
+#define vfrsub_vf_f16m2_m(...) __riscv_vfrsub_vf_f16m2_tumu(__VA_ARGS__)
+#define vfrsub_vf_f16m4_m(...) __riscv_vfrsub_vf_f16m4_tumu(__VA_ARGS__)
+#define vfrsub_vf_f16m8_m(...) __riscv_vfrsub_vf_f16m8_tumu(__VA_ARGS__)
+#define vfrsub_vf_f32mf2_m(...) __riscv_vfrsub_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfrsub_vf_f32m1_m(...) __riscv_vfrsub_vf_f32m1_tumu(__VA_ARGS__)
+#define vfrsub_vf_f32m2_m(...) __riscv_vfrsub_vf_f32m2_tumu(__VA_ARGS__)
+#define vfrsub_vf_f32m4_m(...) __riscv_vfrsub_vf_f32m4_tumu(__VA_ARGS__)
+#define vfrsub_vf_f32m8_m(...) __riscv_vfrsub_vf_f32m8_tumu(__VA_ARGS__)
+#define vfrsub_vf_f64m1_m(...) __riscv_vfrsub_vf_f64m1_tumu(__VA_ARGS__)
+#define vfrsub_vf_f64m2_m(...) __riscv_vfrsub_vf_f64m2_tumu(__VA_ARGS__)
+#define vfrsub_vf_f64m4_m(...) __riscv_vfrsub_vf_f64m4_tumu(__VA_ARGS__)
+#define vfrsub_vf_f64m8_m(...) __riscv_vfrsub_vf_f64m8_tumu(__VA_ARGS__)
+#define vfneg_v_f16mf4_m(...) __riscv_vfneg_v_f16mf4_tumu(__VA_ARGS__)
+#define vfneg_v_f16mf2_m(...) __riscv_vfneg_v_f16mf2_tumu(__VA_ARGS__)
+#define vfneg_v_f16m1_m(...) __riscv_vfneg_v_f16m1_tumu(__VA_ARGS__)
+#define vfneg_v_f16m2_m(...) __riscv_vfneg_v_f16m2_tumu(__VA_ARGS__)
+#define vfneg_v_f16m4_m(...) __riscv_vfneg_v_f16m4_tumu(__VA_ARGS__)
+#define vfneg_v_f16m8_m(...) __riscv_vfneg_v_f16m8_tumu(__VA_ARGS__)
+#define vfneg_v_f32mf2_m(...) __riscv_vfneg_v_f32mf2_tumu(__VA_ARGS__)
+#define vfneg_v_f32m1_m(...) __riscv_vfneg_v_f32m1_tumu(__VA_ARGS__)
+#define vfneg_v_f32m2_m(...) __riscv_vfneg_v_f32m2_tumu(__VA_ARGS__)
+#define vfneg_v_f32m4_m(...) __riscv_vfneg_v_f32m4_tumu(__VA_ARGS__)
+#define vfneg_v_f32m8_m(...) __riscv_vfneg_v_f32m8_tumu(__VA_ARGS__)
+#define vfneg_v_f64m1_m(...) __riscv_vfneg_v_f64m1_tumu(__VA_ARGS__)
+#define vfneg_v_f64m2_m(...) __riscv_vfneg_v_f64m2_tumu(__VA_ARGS__)
+#define vfneg_v_f64m4_m(...) __riscv_vfneg_v_f64m4_tumu(__VA_ARGS__)
+#define vfneg_v_f64m8_m(...) __riscv_vfneg_v_f64m8_tumu(__VA_ARGS__)
+#define vfwadd_vv_f32mf2(...) __riscv_vfwadd_vv_f32mf2(__VA_ARGS__)
+#define vfwadd_vf_f32mf2(...) __riscv_vfwadd_vf_f32mf2(__VA_ARGS__)
+#define vfwadd_wv_f32mf2(...) __riscv_vfwadd_wv_f32mf2(__VA_ARGS__)
+#define vfwadd_wf_f32mf2(...) __riscv_vfwadd_wf_f32mf2(__VA_ARGS__)
+#define vfwadd_vv_f32m1(...) __riscv_vfwadd_vv_f32m1(__VA_ARGS__)
+#define vfwadd_vf_f32m1(...) __riscv_vfwadd_vf_f32m1(__VA_ARGS__)
+#define vfwadd_wv_f32m1(...) __riscv_vfwadd_wv_f32m1(__VA_ARGS__)
+#define vfwadd_wf_f32m1(...) __riscv_vfwadd_wf_f32m1(__VA_ARGS__)
+#define vfwadd_vv_f32m2(...) __riscv_vfwadd_vv_f32m2(__VA_ARGS__)
+#define vfwadd_vf_f32m2(...) __riscv_vfwadd_vf_f32m2(__VA_ARGS__)
+#define vfwadd_wv_f32m2(...) __riscv_vfwadd_wv_f32m2(__VA_ARGS__)
+#define vfwadd_wf_f32m2(...) __riscv_vfwadd_wf_f32m2(__VA_ARGS__)
+#define vfwadd_vv_f32m4(...) __riscv_vfwadd_vv_f32m4(__VA_ARGS__)
+#define vfwadd_vf_f32m4(...) __riscv_vfwadd_vf_f32m4(__VA_ARGS__)
+#define vfwadd_wv_f32m4(...) __riscv_vfwadd_wv_f32m4(__VA_ARGS__)
+#define vfwadd_wf_f32m4(...) __riscv_vfwadd_wf_f32m4(__VA_ARGS__)
+#define vfwadd_vv_f32m8(...) __riscv_vfwadd_vv_f32m8(__VA_ARGS__)
+#define vfwadd_vf_f32m8(...) __riscv_vfwadd_vf_f32m8(__VA_ARGS__)
+#define vfwadd_wv_f32m8(...) __riscv_vfwadd_wv_f32m8(__VA_ARGS__)
+#define vfwadd_wf_f32m8(...) __riscv_vfwadd_wf_f32m8(__VA_ARGS__)
+#define vfwadd_vv_f64m1(...) __riscv_vfwadd_vv_f64m1(__VA_ARGS__)
+#define vfwadd_vf_f64m1(...) __riscv_vfwadd_vf_f64m1(__VA_ARGS__)
+#define vfwadd_wv_f64m1(...) __riscv_vfwadd_wv_f64m1(__VA_ARGS__)
+#define vfwadd_wf_f64m1(...) __riscv_vfwadd_wf_f64m1(__VA_ARGS__)
+#define vfwadd_vv_f64m2(...) __riscv_vfwadd_vv_f64m2(__VA_ARGS__)
+#define vfwadd_vf_f64m2(...) __riscv_vfwadd_vf_f64m2(__VA_ARGS__)
+#define vfwadd_wv_f64m2(...) __riscv_vfwadd_wv_f64m2(__VA_ARGS__)
+#define vfwadd_wf_f64m2(...) __riscv_vfwadd_wf_f64m2(__VA_ARGS__)
+#define vfwadd_vv_f64m4(...) __riscv_vfwadd_vv_f64m4(__VA_ARGS__)
+#define vfwadd_vf_f64m4(...) __riscv_vfwadd_vf_f64m4(__VA_ARGS__)
+#define vfwadd_wv_f64m4(...) __riscv_vfwadd_wv_f64m4(__VA_ARGS__)
+#define vfwadd_wf_f64m4(...) __riscv_vfwadd_wf_f64m4(__VA_ARGS__)
+#define vfwadd_vv_f64m8(...) __riscv_vfwadd_vv_f64m8(__VA_ARGS__)
+#define vfwadd_vf_f64m8(...) __riscv_vfwadd_vf_f64m8(__VA_ARGS__)
+#define vfwadd_wv_f64m8(...) __riscv_vfwadd_wv_f64m8(__VA_ARGS__)
+#define vfwadd_wf_f64m8(...) __riscv_vfwadd_wf_f64m8(__VA_ARGS__)
+#define vfwsub_vv_f32mf2(...) __riscv_vfwsub_vv_f32mf2(__VA_ARGS__)
+#define vfwsub_vf_f32mf2(...) __riscv_vfwsub_vf_f32mf2(__VA_ARGS__)
+#define vfwsub_wv_f32mf2(...) __riscv_vfwsub_wv_f32mf2(__VA_ARGS__)
+#define vfwsub_wf_f32mf2(...) __riscv_vfwsub_wf_f32mf2(__VA_ARGS__)
+#define vfwsub_vv_f32m1(...) __riscv_vfwsub_vv_f32m1(__VA_ARGS__)
+#define vfwsub_vf_f32m1(...) __riscv_vfwsub_vf_f32m1(__VA_ARGS__)
+#define vfwsub_wv_f32m1(...) __riscv_vfwsub_wv_f32m1(__VA_ARGS__)
+#define vfwsub_wf_f32m1(...) __riscv_vfwsub_wf_f32m1(__VA_ARGS__)
+#define vfwsub_vv_f32m2(...) __riscv_vfwsub_vv_f32m2(__VA_ARGS__)
+#define vfwsub_vf_f32m2(...) __riscv_vfwsub_vf_f32m2(__VA_ARGS__)
+#define vfwsub_wv_f32m2(...) __riscv_vfwsub_wv_f32m2(__VA_ARGS__)
+#define vfwsub_wf_f32m2(...) __riscv_vfwsub_wf_f32m2(__VA_ARGS__)
+#define vfwsub_vv_f32m4(...) __riscv_vfwsub_vv_f32m4(__VA_ARGS__)
+#define vfwsub_vf_f32m4(...) __riscv_vfwsub_vf_f32m4(__VA_ARGS__)
+#define vfwsub_wv_f32m4(...) __riscv_vfwsub_wv_f32m4(__VA_ARGS__)
+#define vfwsub_wf_f32m4(...) __riscv_vfwsub_wf_f32m4(__VA_ARGS__)
+#define vfwsub_vv_f32m8(...) __riscv_vfwsub_vv_f32m8(__VA_ARGS__)
+#define vfwsub_vf_f32m8(...) __riscv_vfwsub_vf_f32m8(__VA_ARGS__)
+#define vfwsub_wv_f32m8(...) __riscv_vfwsub_wv_f32m8(__VA_ARGS__)
+#define vfwsub_wf_f32m8(...) __riscv_vfwsub_wf_f32m8(__VA_ARGS__)
+#define vfwsub_vv_f64m1(...) __riscv_vfwsub_vv_f64m1(__VA_ARGS__)
+#define vfwsub_vf_f64m1(...) __riscv_vfwsub_vf_f64m1(__VA_ARGS__)
+#define vfwsub_wv_f64m1(...) __riscv_vfwsub_wv_f64m1(__VA_ARGS__)
+#define vfwsub_wf_f64m1(...) __riscv_vfwsub_wf_f64m1(__VA_ARGS__)
+#define vfwsub_vv_f64m2(...) __riscv_vfwsub_vv_f64m2(__VA_ARGS__)
+#define vfwsub_vf_f64m2(...) __riscv_vfwsub_vf_f64m2(__VA_ARGS__)
+#define vfwsub_wv_f64m2(...) __riscv_vfwsub_wv_f64m2(__VA_ARGS__)
+#define vfwsub_wf_f64m2(...) __riscv_vfwsub_wf_f64m2(__VA_ARGS__)
+#define vfwsub_vv_f64m4(...) __riscv_vfwsub_vv_f64m4(__VA_ARGS__)
+#define vfwsub_vf_f64m4(...) __riscv_vfwsub_vf_f64m4(__VA_ARGS__)
+#define vfwsub_wv_f64m4(...) __riscv_vfwsub_wv_f64m4(__VA_ARGS__)
+#define vfwsub_wf_f64m4(...) __riscv_vfwsub_wf_f64m4(__VA_ARGS__)
+#define vfwsub_vv_f64m8(...) __riscv_vfwsub_vv_f64m8(__VA_ARGS__)
+#define vfwsub_vf_f64m8(...) __riscv_vfwsub_vf_f64m8(__VA_ARGS__)
+#define vfwsub_wv_f64m8(...) __riscv_vfwsub_wv_f64m8(__VA_ARGS__)
+#define vfwsub_wf_f64m8(...) __riscv_vfwsub_wf_f64m8(__VA_ARGS__)
+// masked functions: the aliases above forward to the same-named __riscv_* intrinsic; each vfwadd/vfwsub *_m alias below forwards its arguments unchanged to the __riscv_*_tumu variant instead (NOTE(review): _tumu is presumably the tail-/mask-undisturbed policy - confirm against the RVV intrinsics spec)
+#define vfwadd_vv_f32mf2_m(...) __riscv_vfwadd_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfwadd_vf_f32mf2_m(...) __riscv_vfwadd_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfwadd_wv_f32mf2_m(...) __riscv_vfwadd_wv_f32mf2_tumu(__VA_ARGS__)
+#define vfwadd_wf_f32mf2_m(...) __riscv_vfwadd_wf_f32mf2_tumu(__VA_ARGS__)
+#define vfwadd_vv_f32m1_m(...) __riscv_vfwadd_vv_f32m1_tumu(__VA_ARGS__)
+#define vfwadd_vf_f32m1_m(...) __riscv_vfwadd_vf_f32m1_tumu(__VA_ARGS__)
+#define vfwadd_wv_f32m1_m(...) __riscv_vfwadd_wv_f32m1_tumu(__VA_ARGS__)
+#define vfwadd_wf_f32m1_m(...) __riscv_vfwadd_wf_f32m1_tumu(__VA_ARGS__)
+#define vfwadd_vv_f32m2_m(...) __riscv_vfwadd_vv_f32m2_tumu(__VA_ARGS__)
+#define vfwadd_vf_f32m2_m(...) __riscv_vfwadd_vf_f32m2_tumu(__VA_ARGS__)
+#define vfwadd_wv_f32m2_m(...) __riscv_vfwadd_wv_f32m2_tumu(__VA_ARGS__)
+#define vfwadd_wf_f32m2_m(...) __riscv_vfwadd_wf_f32m2_tumu(__VA_ARGS__)
+#define vfwadd_vv_f32m4_m(...) __riscv_vfwadd_vv_f32m4_tumu(__VA_ARGS__)
+#define vfwadd_vf_f32m4_m(...) __riscv_vfwadd_vf_f32m4_tumu(__VA_ARGS__)
+#define vfwadd_wv_f32m4_m(...) __riscv_vfwadd_wv_f32m4_tumu(__VA_ARGS__)
+#define vfwadd_wf_f32m4_m(...) __riscv_vfwadd_wf_f32m4_tumu(__VA_ARGS__)
+#define vfwadd_vv_f32m8_m(...) __riscv_vfwadd_vv_f32m8_tumu(__VA_ARGS__)
+#define vfwadd_vf_f32m8_m(...) __riscv_vfwadd_vf_f32m8_tumu(__VA_ARGS__)
+#define vfwadd_wv_f32m8_m(...) __riscv_vfwadd_wv_f32m8_tumu(__VA_ARGS__)
+#define vfwadd_wf_f32m8_m(...) __riscv_vfwadd_wf_f32m8_tumu(__VA_ARGS__)
+#define vfwadd_vv_f64m1_m(...) __riscv_vfwadd_vv_f64m1_tumu(__VA_ARGS__)
+#define vfwadd_vf_f64m1_m(...) __riscv_vfwadd_vf_f64m1_tumu(__VA_ARGS__)
+#define vfwadd_wv_f64m1_m(...) __riscv_vfwadd_wv_f64m1_tumu(__VA_ARGS__)
+#define vfwadd_wf_f64m1_m(...) __riscv_vfwadd_wf_f64m1_tumu(__VA_ARGS__)
+#define vfwadd_vv_f64m2_m(...) __riscv_vfwadd_vv_f64m2_tumu(__VA_ARGS__)
+#define vfwadd_vf_f64m2_m(...) __riscv_vfwadd_vf_f64m2_tumu(__VA_ARGS__)
+#define vfwadd_wv_f64m2_m(...) __riscv_vfwadd_wv_f64m2_tumu(__VA_ARGS__)
+#define vfwadd_wf_f64m2_m(...) __riscv_vfwadd_wf_f64m2_tumu(__VA_ARGS__)
+#define vfwadd_vv_f64m4_m(...) __riscv_vfwadd_vv_f64m4_tumu(__VA_ARGS__)
+#define vfwadd_vf_f64m4_m(...) __riscv_vfwadd_vf_f64m4_tumu(__VA_ARGS__)
+#define vfwadd_wv_f64m4_m(...) __riscv_vfwadd_wv_f64m4_tumu(__VA_ARGS__)
+#define vfwadd_wf_f64m4_m(...) __riscv_vfwadd_wf_f64m4_tumu(__VA_ARGS__)
+#define vfwadd_vv_f64m8_m(...) __riscv_vfwadd_vv_f64m8_tumu(__VA_ARGS__)
+#define vfwadd_vf_f64m8_m(...) __riscv_vfwadd_vf_f64m8_tumu(__VA_ARGS__)
+#define vfwadd_wv_f64m8_m(...) __riscv_vfwadd_wv_f64m8_tumu(__VA_ARGS__)
+#define vfwadd_wf_f64m8_m(...) __riscv_vfwadd_wf_f64m8_tumu(__VA_ARGS__)
+#define vfwsub_vv_f32mf2_m(...) __riscv_vfwsub_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfwsub_vf_f32mf2_m(...) __riscv_vfwsub_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfwsub_wv_f32mf2_m(...) __riscv_vfwsub_wv_f32mf2_tumu(__VA_ARGS__)
+#define vfwsub_wf_f32mf2_m(...) __riscv_vfwsub_wf_f32mf2_tumu(__VA_ARGS__)
+#define vfwsub_vv_f32m1_m(...) __riscv_vfwsub_vv_f32m1_tumu(__VA_ARGS__)
+#define vfwsub_vf_f32m1_m(...) __riscv_vfwsub_vf_f32m1_tumu(__VA_ARGS__)
+#define vfwsub_wv_f32m1_m(...) __riscv_vfwsub_wv_f32m1_tumu(__VA_ARGS__)
+#define vfwsub_wf_f32m1_m(...) __riscv_vfwsub_wf_f32m1_tumu(__VA_ARGS__)
+#define vfwsub_vv_f32m2_m(...) __riscv_vfwsub_vv_f32m2_tumu(__VA_ARGS__)
+#define vfwsub_vf_f32m2_m(...) __riscv_vfwsub_vf_f32m2_tumu(__VA_ARGS__)
+#define vfwsub_wv_f32m2_m(...) __riscv_vfwsub_wv_f32m2_tumu(__VA_ARGS__)
+#define vfwsub_wf_f32m2_m(...) __riscv_vfwsub_wf_f32m2_tumu(__VA_ARGS__)
+#define vfwsub_vv_f32m4_m(...) __riscv_vfwsub_vv_f32m4_tumu(__VA_ARGS__)
+#define vfwsub_vf_f32m4_m(...) __riscv_vfwsub_vf_f32m4_tumu(__VA_ARGS__)
+#define vfwsub_wv_f32m4_m(...) __riscv_vfwsub_wv_f32m4_tumu(__VA_ARGS__)
+#define vfwsub_wf_f32m4_m(...) __riscv_vfwsub_wf_f32m4_tumu(__VA_ARGS__)
+#define vfwsub_vv_f32m8_m(...) __riscv_vfwsub_vv_f32m8_tumu(__VA_ARGS__)
+#define vfwsub_vf_f32m8_m(...) __riscv_vfwsub_vf_f32m8_tumu(__VA_ARGS__)
+#define vfwsub_wv_f32m8_m(...) __riscv_vfwsub_wv_f32m8_tumu(__VA_ARGS__)
+#define vfwsub_wf_f32m8_m(...) __riscv_vfwsub_wf_f32m8_tumu(__VA_ARGS__)
+#define vfwsub_vv_f64m1_m(...) __riscv_vfwsub_vv_f64m1_tumu(__VA_ARGS__)
+#define vfwsub_vf_f64m1_m(...) __riscv_vfwsub_vf_f64m1_tumu(__VA_ARGS__)
+#define vfwsub_wv_f64m1_m(...) __riscv_vfwsub_wv_f64m1_tumu(__VA_ARGS__)
+#define vfwsub_wf_f64m1_m(...) __riscv_vfwsub_wf_f64m1_tumu(__VA_ARGS__)
+#define vfwsub_vv_f64m2_m(...) __riscv_vfwsub_vv_f64m2_tumu(__VA_ARGS__)
+#define vfwsub_vf_f64m2_m(...) __riscv_vfwsub_vf_f64m2_tumu(__VA_ARGS__)
+#define vfwsub_wv_f64m2_m(...) __riscv_vfwsub_wv_f64m2_tumu(__VA_ARGS__)
+#define vfwsub_wf_f64m2_m(...) __riscv_vfwsub_wf_f64m2_tumu(__VA_ARGS__)
+#define vfwsub_vv_f64m4_m(...) __riscv_vfwsub_vv_f64m4_tumu(__VA_ARGS__)
+#define vfwsub_vf_f64m4_m(...) __riscv_vfwsub_vf_f64m4_tumu(__VA_ARGS__)
+#define vfwsub_wv_f64m4_m(...) __riscv_vfwsub_wv_f64m4_tumu(__VA_ARGS__)
+#define vfwsub_wf_f64m4_m(...) __riscv_vfwsub_wf_f64m4_tumu(__VA_ARGS__)
+#define vfwsub_vv_f64m8_m(...) __riscv_vfwsub_vv_f64m8_tumu(__VA_ARGS__)
+#define vfwsub_vf_f64m8_m(...) __riscv_vfwsub_vf_f64m8_tumu(__VA_ARGS__)
+#define vfwsub_wv_f64m8_m(...) __riscv_vfwsub_wv_f64m8_tumu(__VA_ARGS__)
+#define vfwsub_wf_f64m8_m(...) __riscv_vfwsub_wf_f64m8_tumu(__VA_ARGS__)
+#define vfmul_vv_f16mf4(...) __riscv_vfmul_vv_f16mf4(__VA_ARGS__)
+#define vfmul_vf_f16mf4(...) __riscv_vfmul_vf_f16mf4(__VA_ARGS__)
+#define vfmul_vv_f16mf2(...) __riscv_vfmul_vv_f16mf2(__VA_ARGS__)
+#define vfmul_vf_f16mf2(...) __riscv_vfmul_vf_f16mf2(__VA_ARGS__)
+#define vfmul_vv_f16m1(...) __riscv_vfmul_vv_f16m1(__VA_ARGS__)
+#define vfmul_vf_f16m1(...) __riscv_vfmul_vf_f16m1(__VA_ARGS__)
+#define vfmul_vv_f16m2(...) __riscv_vfmul_vv_f16m2(__VA_ARGS__)
+#define vfmul_vf_f16m2(...) __riscv_vfmul_vf_f16m2(__VA_ARGS__)
+#define vfmul_vv_f16m4(...) __riscv_vfmul_vv_f16m4(__VA_ARGS__)
+#define vfmul_vf_f16m4(...) __riscv_vfmul_vf_f16m4(__VA_ARGS__)
+#define vfmul_vv_f16m8(...) __riscv_vfmul_vv_f16m8(__VA_ARGS__)
+#define vfmul_vf_f16m8(...) __riscv_vfmul_vf_f16m8(__VA_ARGS__)
+#define vfmul_vv_f32mf2(...) __riscv_vfmul_vv_f32mf2(__VA_ARGS__)
+#define vfmul_vf_f32mf2(...) __riscv_vfmul_vf_f32mf2(__VA_ARGS__)
+#define vfmul_vv_f32m1(...) __riscv_vfmul_vv_f32m1(__VA_ARGS__)
+#define vfmul_vf_f32m1(...) __riscv_vfmul_vf_f32m1(__VA_ARGS__)
+#define vfmul_vv_f32m2(...) __riscv_vfmul_vv_f32m2(__VA_ARGS__)
+#define vfmul_vf_f32m2(...) __riscv_vfmul_vf_f32m2(__VA_ARGS__)
+#define vfmul_vv_f32m4(...) __riscv_vfmul_vv_f32m4(__VA_ARGS__)
+#define vfmul_vf_f32m4(...) __riscv_vfmul_vf_f32m4(__VA_ARGS__)
+#define vfmul_vv_f32m8(...) __riscv_vfmul_vv_f32m8(__VA_ARGS__)
+#define vfmul_vf_f32m8(...) __riscv_vfmul_vf_f32m8(__VA_ARGS__)
+#define vfmul_vv_f64m1(...) __riscv_vfmul_vv_f64m1(__VA_ARGS__)
+#define vfmul_vf_f64m1(...) __riscv_vfmul_vf_f64m1(__VA_ARGS__)
+#define vfmul_vv_f64m2(...) __riscv_vfmul_vv_f64m2(__VA_ARGS__)
+#define vfmul_vf_f64m2(...) __riscv_vfmul_vf_f64m2(__VA_ARGS__)
+#define vfmul_vv_f64m4(...) __riscv_vfmul_vv_f64m4(__VA_ARGS__)
+#define vfmul_vf_f64m4(...) __riscv_vfmul_vf_f64m4(__VA_ARGS__)
+#define vfmul_vv_f64m8(...) __riscv_vfmul_vv_f64m8(__VA_ARGS__)
+#define vfmul_vf_f64m8(...) __riscv_vfmul_vf_f64m8(__VA_ARGS__)
+#define vfdiv_vv_f16mf4(...) __riscv_vfdiv_vv_f16mf4(__VA_ARGS__)
+#define vfdiv_vf_f16mf4(...) __riscv_vfdiv_vf_f16mf4(__VA_ARGS__)
+#define vfdiv_vv_f16mf2(...) __riscv_vfdiv_vv_f16mf2(__VA_ARGS__)
+#define vfdiv_vf_f16mf2(...) __riscv_vfdiv_vf_f16mf2(__VA_ARGS__)
+#define vfdiv_vv_f16m1(...) __riscv_vfdiv_vv_f16m1(__VA_ARGS__)
+#define vfdiv_vf_f16m1(...) __riscv_vfdiv_vf_f16m1(__VA_ARGS__)
+#define vfdiv_vv_f16m2(...) __riscv_vfdiv_vv_f16m2(__VA_ARGS__)
+#define vfdiv_vf_f16m2(...) __riscv_vfdiv_vf_f16m2(__VA_ARGS__)
+#define vfdiv_vv_f16m4(...) __riscv_vfdiv_vv_f16m4(__VA_ARGS__)
+#define vfdiv_vf_f16m4(...) __riscv_vfdiv_vf_f16m4(__VA_ARGS__)
+#define vfdiv_vv_f16m8(...) __riscv_vfdiv_vv_f16m8(__VA_ARGS__)
+#define vfdiv_vf_f16m8(...) __riscv_vfdiv_vf_f16m8(__VA_ARGS__)
+#define vfdiv_vv_f32mf2(...) __riscv_vfdiv_vv_f32mf2(__VA_ARGS__)
+#define vfdiv_vf_f32mf2(...) __riscv_vfdiv_vf_f32mf2(__VA_ARGS__)
+#define vfdiv_vv_f32m1(...) __riscv_vfdiv_vv_f32m1(__VA_ARGS__)
+#define vfdiv_vf_f32m1(...) __riscv_vfdiv_vf_f32m1(__VA_ARGS__)
+#define vfdiv_vv_f32m2(...) __riscv_vfdiv_vv_f32m2(__VA_ARGS__)
+#define vfdiv_vf_f32m2(...) __riscv_vfdiv_vf_f32m2(__VA_ARGS__)
+#define vfdiv_vv_f32m4(...) __riscv_vfdiv_vv_f32m4(__VA_ARGS__)
+#define vfdiv_vf_f32m4(...) __riscv_vfdiv_vf_f32m4(__VA_ARGS__)
+#define vfdiv_vv_f32m8(...) __riscv_vfdiv_vv_f32m8(__VA_ARGS__)
+#define vfdiv_vf_f32m8(...) __riscv_vfdiv_vf_f32m8(__VA_ARGS__)
+#define vfdiv_vv_f64m1(...) __riscv_vfdiv_vv_f64m1(__VA_ARGS__)
+#define vfdiv_vf_f64m1(...) __riscv_vfdiv_vf_f64m1(__VA_ARGS__)
+#define vfdiv_vv_f64m2(...) __riscv_vfdiv_vv_f64m2(__VA_ARGS__)
+#define vfdiv_vf_f64m2(...) __riscv_vfdiv_vf_f64m2(__VA_ARGS__)
+#define vfdiv_vv_f64m4(...) __riscv_vfdiv_vv_f64m4(__VA_ARGS__)
+#define vfdiv_vf_f64m4(...) __riscv_vfdiv_vf_f64m4(__VA_ARGS__)
+#define vfdiv_vv_f64m8(...) __riscv_vfdiv_vv_f64m8(__VA_ARGS__)
+#define vfdiv_vf_f64m8(...) __riscv_vfdiv_vf_f64m8(__VA_ARGS__)
+#define vfrdiv_vf_f16mf4(...) __riscv_vfrdiv_vf_f16mf4(__VA_ARGS__) // vfrdiv aliases: only _vf forms are defined here; arguments are forwarded unchanged to the __riscv_-prefixed intrinsic
+#define vfrdiv_vf_f16mf2(...) __riscv_vfrdiv_vf_f16mf2(__VA_ARGS__)
+#define vfrdiv_vf_f16m1(...) __riscv_vfrdiv_vf_f16m1(__VA_ARGS__)
+#define vfrdiv_vf_f16m2(...) __riscv_vfrdiv_vf_f16m2(__VA_ARGS__)
+#define vfrdiv_vf_f16m4(...) __riscv_vfrdiv_vf_f16m4(__VA_ARGS__)
+#define vfrdiv_vf_f16m8(...) __riscv_vfrdiv_vf_f16m8(__VA_ARGS__)
+#define vfrdiv_vf_f32mf2(...) __riscv_vfrdiv_vf_f32mf2(__VA_ARGS__)
+#define vfrdiv_vf_f32m1(...) __riscv_vfrdiv_vf_f32m1(__VA_ARGS__)
+#define vfrdiv_vf_f32m2(...) __riscv_vfrdiv_vf_f32m2(__VA_ARGS__)
+#define vfrdiv_vf_f32m4(...) __riscv_vfrdiv_vf_f32m4(__VA_ARGS__)
+#define vfrdiv_vf_f32m8(...) __riscv_vfrdiv_vf_f32m8(__VA_ARGS__)
+#define vfrdiv_vf_f64m1(...) __riscv_vfrdiv_vf_f64m1(__VA_ARGS__)
+#define vfrdiv_vf_f64m2(...) __riscv_vfrdiv_vf_f64m2(__VA_ARGS__)
+#define vfrdiv_vf_f64m4(...) __riscv_vfrdiv_vf_f64m4(__VA_ARGS__)
+#define vfrdiv_vf_f64m8(...) __riscv_vfrdiv_vf_f64m8(__VA_ARGS__)
+// masked functions: each *_m alias forwards its arguments to the *_tumu intrinsic (NOTE(review): presumably the tail-/mask-undisturbed policy variant; confirm in the RVV intrinsics spec)
+#define vfmul_vv_f16mf4_m(...) __riscv_vfmul_vv_f16mf4_tumu(__VA_ARGS__)
+#define vfmul_vf_f16mf4_m(...) __riscv_vfmul_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfmul_vv_f16mf2_m(...) __riscv_vfmul_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfmul_vf_f16mf2_m(...) __riscv_vfmul_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfmul_vv_f16m1_m(...) __riscv_vfmul_vv_f16m1_tumu(__VA_ARGS__)
+#define vfmul_vf_f16m1_m(...) __riscv_vfmul_vf_f16m1_tumu(__VA_ARGS__)
+#define vfmul_vv_f16m2_m(...) __riscv_vfmul_vv_f16m2_tumu(__VA_ARGS__)
+#define vfmul_vf_f16m2_m(...) __riscv_vfmul_vf_f16m2_tumu(__VA_ARGS__)
+#define vfmul_vv_f16m4_m(...) __riscv_vfmul_vv_f16m4_tumu(__VA_ARGS__)
+#define vfmul_vf_f16m4_m(...) __riscv_vfmul_vf_f16m4_tumu(__VA_ARGS__)
+#define vfmul_vv_f16m8_m(...) __riscv_vfmul_vv_f16m8_tumu(__VA_ARGS__)
+#define vfmul_vf_f16m8_m(...) __riscv_vfmul_vf_f16m8_tumu(__VA_ARGS__)
+#define vfmul_vv_f32mf2_m(...) __riscv_vfmul_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfmul_vf_f32mf2_m(...) __riscv_vfmul_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfmul_vv_f32m1_m(...) __riscv_vfmul_vv_f32m1_tumu(__VA_ARGS__)
+#define vfmul_vf_f32m1_m(...) __riscv_vfmul_vf_f32m1_tumu(__VA_ARGS__)
+#define vfmul_vv_f32m2_m(...) __riscv_vfmul_vv_f32m2_tumu(__VA_ARGS__)
+#define vfmul_vf_f32m2_m(...) __riscv_vfmul_vf_f32m2_tumu(__VA_ARGS__)
+#define vfmul_vv_f32m4_m(...) __riscv_vfmul_vv_f32m4_tumu(__VA_ARGS__)
+#define vfmul_vf_f32m4_m(...) __riscv_vfmul_vf_f32m4_tumu(__VA_ARGS__)
+#define vfmul_vv_f32m8_m(...) __riscv_vfmul_vv_f32m8_tumu(__VA_ARGS__)
+#define vfmul_vf_f32m8_m(...) __riscv_vfmul_vf_f32m8_tumu(__VA_ARGS__)
+#define vfmul_vv_f64m1_m(...) __riscv_vfmul_vv_f64m1_tumu(__VA_ARGS__)
+#define vfmul_vf_f64m1_m(...) __riscv_vfmul_vf_f64m1_tumu(__VA_ARGS__)
+#define vfmul_vv_f64m2_m(...) __riscv_vfmul_vv_f64m2_tumu(__VA_ARGS__)
+#define vfmul_vf_f64m2_m(...) __riscv_vfmul_vf_f64m2_tumu(__VA_ARGS__)
+#define vfmul_vv_f64m4_m(...) __riscv_vfmul_vv_f64m4_tumu(__VA_ARGS__)
+#define vfmul_vf_f64m4_m(...) __riscv_vfmul_vf_f64m4_tumu(__VA_ARGS__)
+#define vfmul_vv_f64m8_m(...) __riscv_vfmul_vv_f64m8_tumu(__VA_ARGS__)
+#define vfmul_vf_f64m8_m(...) __riscv_vfmul_vf_f64m8_tumu(__VA_ARGS__)
+#define vfdiv_vv_f16mf4_m(...) __riscv_vfdiv_vv_f16mf4_tumu(__VA_ARGS__) // masked vfdiv aliases: *_m name -> *_tumu intrinsic, arguments forwarded unchanged
+#define vfdiv_vf_f16mf4_m(...) __riscv_vfdiv_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfdiv_vv_f16mf2_m(...) __riscv_vfdiv_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfdiv_vf_f16mf2_m(...) __riscv_vfdiv_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfdiv_vv_f16m1_m(...) __riscv_vfdiv_vv_f16m1_tumu(__VA_ARGS__)
+#define vfdiv_vf_f16m1_m(...) __riscv_vfdiv_vf_f16m1_tumu(__VA_ARGS__)
+#define vfdiv_vv_f16m2_m(...) __riscv_vfdiv_vv_f16m2_tumu(__VA_ARGS__)
+#define vfdiv_vf_f16m2_m(...) __riscv_vfdiv_vf_f16m2_tumu(__VA_ARGS__)
+#define vfdiv_vv_f16m4_m(...) __riscv_vfdiv_vv_f16m4_tumu(__VA_ARGS__)
+#define vfdiv_vf_f16m4_m(...) __riscv_vfdiv_vf_f16m4_tumu(__VA_ARGS__)
+#define vfdiv_vv_f16m8_m(...) __riscv_vfdiv_vv_f16m8_tumu(__VA_ARGS__)
+#define vfdiv_vf_f16m8_m(...) __riscv_vfdiv_vf_f16m8_tumu(__VA_ARGS__)
+#define vfdiv_vv_f32mf2_m(...) __riscv_vfdiv_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfdiv_vf_f32mf2_m(...) __riscv_vfdiv_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfdiv_vv_f32m1_m(...) __riscv_vfdiv_vv_f32m1_tumu(__VA_ARGS__)
+#define vfdiv_vf_f32m1_m(...) __riscv_vfdiv_vf_f32m1_tumu(__VA_ARGS__)
+#define vfdiv_vv_f32m2_m(...) __riscv_vfdiv_vv_f32m2_tumu(__VA_ARGS__)
+#define vfdiv_vf_f32m2_m(...) __riscv_vfdiv_vf_f32m2_tumu(__VA_ARGS__)
+#define vfdiv_vv_f32m4_m(...) __riscv_vfdiv_vv_f32m4_tumu(__VA_ARGS__)
+#define vfdiv_vf_f32m4_m(...) __riscv_vfdiv_vf_f32m4_tumu(__VA_ARGS__)
+#define vfdiv_vv_f32m8_m(...) __riscv_vfdiv_vv_f32m8_tumu(__VA_ARGS__)
+#define vfdiv_vf_f32m8_m(...) __riscv_vfdiv_vf_f32m8_tumu(__VA_ARGS__)
+#define vfdiv_vv_f64m1_m(...) __riscv_vfdiv_vv_f64m1_tumu(__VA_ARGS__)
+#define vfdiv_vf_f64m1_m(...) __riscv_vfdiv_vf_f64m1_tumu(__VA_ARGS__)
+#define vfdiv_vv_f64m2_m(...) __riscv_vfdiv_vv_f64m2_tumu(__VA_ARGS__)
+#define vfdiv_vf_f64m2_m(...) __riscv_vfdiv_vf_f64m2_tumu(__VA_ARGS__)
+#define vfdiv_vv_f64m4_m(...) __riscv_vfdiv_vv_f64m4_tumu(__VA_ARGS__)
+#define vfdiv_vf_f64m4_m(...) __riscv_vfdiv_vf_f64m4_tumu(__VA_ARGS__)
+#define vfdiv_vv_f64m8_m(...) __riscv_vfdiv_vv_f64m8_tumu(__VA_ARGS__)
+#define vfdiv_vf_f64m8_m(...) __riscv_vfdiv_vf_f64m8_tumu(__VA_ARGS__)
+#define vfrdiv_vf_f16mf4_m(...) __riscv_vfrdiv_vf_f16mf4_tumu(__VA_ARGS__) // masked vfrdiv aliases (_vf forms only): *_m name -> *_tumu intrinsic, arguments forwarded unchanged
+#define vfrdiv_vf_f16mf2_m(...) __riscv_vfrdiv_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfrdiv_vf_f16m1_m(...) __riscv_vfrdiv_vf_f16m1_tumu(__VA_ARGS__)
+#define vfrdiv_vf_f16m2_m(...) __riscv_vfrdiv_vf_f16m2_tumu(__VA_ARGS__)
+#define vfrdiv_vf_f16m4_m(...) __riscv_vfrdiv_vf_f16m4_tumu(__VA_ARGS__)
+#define vfrdiv_vf_f16m8_m(...) __riscv_vfrdiv_vf_f16m8_tumu(__VA_ARGS__)
+#define vfrdiv_vf_f32mf2_m(...) __riscv_vfrdiv_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfrdiv_vf_f32m1_m(...) __riscv_vfrdiv_vf_f32m1_tumu(__VA_ARGS__)
+#define vfrdiv_vf_f32m2_m(...) __riscv_vfrdiv_vf_f32m2_tumu(__VA_ARGS__)
+#define vfrdiv_vf_f32m4_m(...) __riscv_vfrdiv_vf_f32m4_tumu(__VA_ARGS__)
+#define vfrdiv_vf_f32m8_m(...) __riscv_vfrdiv_vf_f32m8_tumu(__VA_ARGS__)
+#define vfrdiv_vf_f64m1_m(...) __riscv_vfrdiv_vf_f64m1_tumu(__VA_ARGS__)
+#define vfrdiv_vf_f64m2_m(...) __riscv_vfrdiv_vf_f64m2_tumu(__VA_ARGS__)
+#define vfrdiv_vf_f64m4_m(...) __riscv_vfrdiv_vf_f64m4_tumu(__VA_ARGS__)
+#define vfrdiv_vf_f64m8_m(...) __riscv_vfrdiv_vf_f64m8_tumu(__VA_ARGS__)
+#define vfwmul_vv_f32mf2(...) __riscv_vfwmul_vv_f32mf2(__VA_ARGS__) // non-masked vfwmul aliases start here: unprefixed name -> same-named __riscv_-prefixed intrinsic (f32/f64 suffixes only)
+#define vfwmul_vf_f32mf2(...) __riscv_vfwmul_vf_f32mf2(__VA_ARGS__)
+#define vfwmul_vv_f32m1(...) __riscv_vfwmul_vv_f32m1(__VA_ARGS__)
+#define vfwmul_vf_f32m1(...) __riscv_vfwmul_vf_f32m1(__VA_ARGS__)
+#define vfwmul_vv_f32m2(...) __riscv_vfwmul_vv_f32m2(__VA_ARGS__)
+#define vfwmul_vf_f32m2(...) __riscv_vfwmul_vf_f32m2(__VA_ARGS__)
+#define vfwmul_vv_f32m4(...) __riscv_vfwmul_vv_f32m4(__VA_ARGS__)
+#define vfwmul_vf_f32m4(...) __riscv_vfwmul_vf_f32m4(__VA_ARGS__)
+#define vfwmul_vv_f32m8(...) __riscv_vfwmul_vv_f32m8(__VA_ARGS__)
+#define vfwmul_vf_f32m8(...) __riscv_vfwmul_vf_f32m8(__VA_ARGS__)
+#define vfwmul_vv_f64m1(...) __riscv_vfwmul_vv_f64m1(__VA_ARGS__)
+#define vfwmul_vf_f64m1(...) __riscv_vfwmul_vf_f64m1(__VA_ARGS__)
+#define vfwmul_vv_f64m2(...) __riscv_vfwmul_vv_f64m2(__VA_ARGS__)
+#define vfwmul_vf_f64m2(...) __riscv_vfwmul_vf_f64m2(__VA_ARGS__)
+#define vfwmul_vv_f64m4(...) __riscv_vfwmul_vv_f64m4(__VA_ARGS__)
+#define vfwmul_vf_f64m4(...) __riscv_vfwmul_vf_f64m4(__VA_ARGS__)
+#define vfwmul_vv_f64m8(...) __riscv_vfwmul_vv_f64m8(__VA_ARGS__)
+#define vfwmul_vf_f64m8(...) __riscv_vfwmul_vf_f64m8(__VA_ARGS__)
+// masked functions: each vfwmul *_m alias forwards its arguments to the *_tumu intrinsic (NOTE(review): presumably the tail-/mask-undisturbed policy variant; confirm in the RVV intrinsics spec)
+#define vfwmul_vv_f32mf2_m(...) __riscv_vfwmul_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfwmul_vf_f32mf2_m(...) __riscv_vfwmul_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfwmul_vv_f32m1_m(...) __riscv_vfwmul_vv_f32m1_tumu(__VA_ARGS__)
+#define vfwmul_vf_f32m1_m(...) __riscv_vfwmul_vf_f32m1_tumu(__VA_ARGS__)
+#define vfwmul_vv_f32m2_m(...) __riscv_vfwmul_vv_f32m2_tumu(__VA_ARGS__)
+#define vfwmul_vf_f32m2_m(...) __riscv_vfwmul_vf_f32m2_tumu(__VA_ARGS__)
+#define vfwmul_vv_f32m4_m(...) __riscv_vfwmul_vv_f32m4_tumu(__VA_ARGS__)
+#define vfwmul_vf_f32m4_m(...) __riscv_vfwmul_vf_f32m4_tumu(__VA_ARGS__)
+#define vfwmul_vv_f32m8_m(...) __riscv_vfwmul_vv_f32m8_tumu(__VA_ARGS__)
+#define vfwmul_vf_f32m8_m(...) __riscv_vfwmul_vf_f32m8_tumu(__VA_ARGS__)
+#define vfwmul_vv_f64m1_m(...) __riscv_vfwmul_vv_f64m1_tumu(__VA_ARGS__)
+#define vfwmul_vf_f64m1_m(...) __riscv_vfwmul_vf_f64m1_tumu(__VA_ARGS__)
+#define vfwmul_vv_f64m2_m(...) __riscv_vfwmul_vv_f64m2_tumu(__VA_ARGS__)
+#define vfwmul_vf_f64m2_m(...) __riscv_vfwmul_vf_f64m2_tumu(__VA_ARGS__)
+#define vfwmul_vv_f64m4_m(...) __riscv_vfwmul_vv_f64m4_tumu(__VA_ARGS__)
+#define vfwmul_vf_f64m4_m(...) __riscv_vfwmul_vf_f64m4_tumu(__VA_ARGS__)
+#define vfwmul_vv_f64m8_m(...) __riscv_vfwmul_vv_f64m8_tumu(__VA_ARGS__)
+#define vfwmul_vf_f64m8_m(...) __riscv_vfwmul_vf_f64m8_tumu(__VA_ARGS__)
+#define vfmacc_vv_f16mf4(...) __riscv_vfmacc_vv_f16mf4_tu(__VA_ARGS__) // non-masked vfmacc aliases: target is the *_tu intrinsic, not the plain name (NOTE(review): presumably tail-undisturbed policy; confirm in the RVV intrinsics spec)
+#define vfmacc_vf_f16mf4(...) __riscv_vfmacc_vf_f16mf4_tu(__VA_ARGS__)
+#define vfmacc_vv_f16mf2(...) __riscv_vfmacc_vv_f16mf2_tu(__VA_ARGS__)
+#define vfmacc_vf_f16mf2(...) __riscv_vfmacc_vf_f16mf2_tu(__VA_ARGS__)
+#define vfmacc_vv_f16m1(...) __riscv_vfmacc_vv_f16m1_tu(__VA_ARGS__)
+#define vfmacc_vf_f16m1(...) __riscv_vfmacc_vf_f16m1_tu(__VA_ARGS__)
+#define vfmacc_vv_f16m2(...) __riscv_vfmacc_vv_f16m2_tu(__VA_ARGS__)
+#define vfmacc_vf_f16m2(...) __riscv_vfmacc_vf_f16m2_tu(__VA_ARGS__)
+#define vfmacc_vv_f16m4(...) __riscv_vfmacc_vv_f16m4_tu(__VA_ARGS__)
+#define vfmacc_vf_f16m4(...) __riscv_vfmacc_vf_f16m4_tu(__VA_ARGS__)
+#define vfmacc_vv_f16m8(...) __riscv_vfmacc_vv_f16m8_tu(__VA_ARGS__)
+#define vfmacc_vf_f16m8(...) __riscv_vfmacc_vf_f16m8_tu(__VA_ARGS__)
+#define vfmacc_vv_f32mf2(...) __riscv_vfmacc_vv_f32mf2_tu(__VA_ARGS__)
+#define vfmacc_vf_f32mf2(...) __riscv_vfmacc_vf_f32mf2_tu(__VA_ARGS__)
+#define vfmacc_vv_f32m1(...) __riscv_vfmacc_vv_f32m1_tu(__VA_ARGS__)
+#define vfmacc_vf_f32m1(...) __riscv_vfmacc_vf_f32m1_tu(__VA_ARGS__)
+#define vfmacc_vv_f32m2(...) __riscv_vfmacc_vv_f32m2_tu(__VA_ARGS__)
+#define vfmacc_vf_f32m2(...) __riscv_vfmacc_vf_f32m2_tu(__VA_ARGS__)
+#define vfmacc_vv_f32m4(...) __riscv_vfmacc_vv_f32m4_tu(__VA_ARGS__)
+#define vfmacc_vf_f32m4(...) __riscv_vfmacc_vf_f32m4_tu(__VA_ARGS__)
+#define vfmacc_vv_f32m8(...) __riscv_vfmacc_vv_f32m8_tu(__VA_ARGS__)
+#define vfmacc_vf_f32m8(...) __riscv_vfmacc_vf_f32m8_tu(__VA_ARGS__)
+#define vfmacc_vv_f64m1(...) __riscv_vfmacc_vv_f64m1_tu(__VA_ARGS__)
+#define vfmacc_vf_f64m1(...) __riscv_vfmacc_vf_f64m1_tu(__VA_ARGS__)
+#define vfmacc_vv_f64m2(...) __riscv_vfmacc_vv_f64m2_tu(__VA_ARGS__)
+#define vfmacc_vf_f64m2(...) __riscv_vfmacc_vf_f64m2_tu(__VA_ARGS__)
+#define vfmacc_vv_f64m4(...) __riscv_vfmacc_vv_f64m4_tu(__VA_ARGS__)
+#define vfmacc_vf_f64m4(...) __riscv_vfmacc_vf_f64m4_tu(__VA_ARGS__)
+#define vfmacc_vv_f64m8(...) __riscv_vfmacc_vv_f64m8_tu(__VA_ARGS__)
+#define vfmacc_vf_f64m8(...) __riscv_vfmacc_vf_f64m8_tu(__VA_ARGS__)
+#define vfnmacc_vv_f16mf4(...) __riscv_vfnmacc_vv_f16mf4_tu(__VA_ARGS__) // non-masked vfnmacc aliases: unprefixed name -> *_tu intrinsic, arguments forwarded unchanged
+#define vfnmacc_vf_f16mf4(...) __riscv_vfnmacc_vf_f16mf4_tu(__VA_ARGS__)
+#define vfnmacc_vv_f16mf2(...) __riscv_vfnmacc_vv_f16mf2_tu(__VA_ARGS__)
+#define vfnmacc_vf_f16mf2(...) __riscv_vfnmacc_vf_f16mf2_tu(__VA_ARGS__)
+#define vfnmacc_vv_f16m1(...) __riscv_vfnmacc_vv_f16m1_tu(__VA_ARGS__)
+#define vfnmacc_vf_f16m1(...) __riscv_vfnmacc_vf_f16m1_tu(__VA_ARGS__)
+#define vfnmacc_vv_f16m2(...) __riscv_vfnmacc_vv_f16m2_tu(__VA_ARGS__)
+#define vfnmacc_vf_f16m2(...) __riscv_vfnmacc_vf_f16m2_tu(__VA_ARGS__)
+#define vfnmacc_vv_f16m4(...) __riscv_vfnmacc_vv_f16m4_tu(__VA_ARGS__)
+#define vfnmacc_vf_f16m4(...) __riscv_vfnmacc_vf_f16m4_tu(__VA_ARGS__)
+#define vfnmacc_vv_f16m8(...) __riscv_vfnmacc_vv_f16m8_tu(__VA_ARGS__)
+#define vfnmacc_vf_f16m8(...) __riscv_vfnmacc_vf_f16m8_tu(__VA_ARGS__)
+#define vfnmacc_vv_f32mf2(...) __riscv_vfnmacc_vv_f32mf2_tu(__VA_ARGS__)
+#define vfnmacc_vf_f32mf2(...) __riscv_vfnmacc_vf_f32mf2_tu(__VA_ARGS__)
+#define vfnmacc_vv_f32m1(...) __riscv_vfnmacc_vv_f32m1_tu(__VA_ARGS__)
+#define vfnmacc_vf_f32m1(...) __riscv_vfnmacc_vf_f32m1_tu(__VA_ARGS__)
+#define vfnmacc_vv_f32m2(...) __riscv_vfnmacc_vv_f32m2_tu(__VA_ARGS__)
+#define vfnmacc_vf_f32m2(...) __riscv_vfnmacc_vf_f32m2_tu(__VA_ARGS__)
+#define vfnmacc_vv_f32m4(...) __riscv_vfnmacc_vv_f32m4_tu(__VA_ARGS__)
+#define vfnmacc_vf_f32m4(...) __riscv_vfnmacc_vf_f32m4_tu(__VA_ARGS__)
+#define vfnmacc_vv_f32m8(...) __riscv_vfnmacc_vv_f32m8_tu(__VA_ARGS__)
+#define vfnmacc_vf_f32m8(...) __riscv_vfnmacc_vf_f32m8_tu(__VA_ARGS__)
+#define vfnmacc_vv_f64m1(...) __riscv_vfnmacc_vv_f64m1_tu(__VA_ARGS__)
+#define vfnmacc_vf_f64m1(...) __riscv_vfnmacc_vf_f64m1_tu(__VA_ARGS__)
+#define vfnmacc_vv_f64m2(...) __riscv_vfnmacc_vv_f64m2_tu(__VA_ARGS__)
+#define vfnmacc_vf_f64m2(...) __riscv_vfnmacc_vf_f64m2_tu(__VA_ARGS__)
+#define vfnmacc_vv_f64m4(...) __riscv_vfnmacc_vv_f64m4_tu(__VA_ARGS__)
+#define vfnmacc_vf_f64m4(...) __riscv_vfnmacc_vf_f64m4_tu(__VA_ARGS__)
+#define vfnmacc_vv_f64m8(...) __riscv_vfnmacc_vv_f64m8_tu(__VA_ARGS__)
+#define vfnmacc_vf_f64m8(...) __riscv_vfnmacc_vf_f64m8_tu(__VA_ARGS__)
+#define vfmsac_vv_f16mf4(...) __riscv_vfmsac_vv_f16mf4_tu(__VA_ARGS__) // non-masked vfmsac aliases: unprefixed name -> *_tu intrinsic, arguments forwarded unchanged
+#define vfmsac_vf_f16mf4(...) __riscv_vfmsac_vf_f16mf4_tu(__VA_ARGS__)
+#define vfmsac_vv_f16mf2(...) __riscv_vfmsac_vv_f16mf2_tu(__VA_ARGS__)
+#define vfmsac_vf_f16mf2(...) __riscv_vfmsac_vf_f16mf2_tu(__VA_ARGS__)
+#define vfmsac_vv_f16m1(...) __riscv_vfmsac_vv_f16m1_tu(__VA_ARGS__)
+#define vfmsac_vf_f16m1(...) __riscv_vfmsac_vf_f16m1_tu(__VA_ARGS__)
+#define vfmsac_vv_f16m2(...) __riscv_vfmsac_vv_f16m2_tu(__VA_ARGS__)
+#define vfmsac_vf_f16m2(...) __riscv_vfmsac_vf_f16m2_tu(__VA_ARGS__)
+#define vfmsac_vv_f16m4(...) __riscv_vfmsac_vv_f16m4_tu(__VA_ARGS__)
+#define vfmsac_vf_f16m4(...) __riscv_vfmsac_vf_f16m4_tu(__VA_ARGS__)
+#define vfmsac_vv_f16m8(...) __riscv_vfmsac_vv_f16m8_tu(__VA_ARGS__)
+#define vfmsac_vf_f16m8(...) __riscv_vfmsac_vf_f16m8_tu(__VA_ARGS__)
+#define vfmsac_vv_f32mf2(...) __riscv_vfmsac_vv_f32mf2_tu(__VA_ARGS__)
+#define vfmsac_vf_f32mf2(...) __riscv_vfmsac_vf_f32mf2_tu(__VA_ARGS__)
+#define vfmsac_vv_f32m1(...) __riscv_vfmsac_vv_f32m1_tu(__VA_ARGS__)
+#define vfmsac_vf_f32m1(...) __riscv_vfmsac_vf_f32m1_tu(__VA_ARGS__)
+#define vfmsac_vv_f32m2(...) __riscv_vfmsac_vv_f32m2_tu(__VA_ARGS__)
+#define vfmsac_vf_f32m2(...) __riscv_vfmsac_vf_f32m2_tu(__VA_ARGS__)
+#define vfmsac_vv_f32m4(...) __riscv_vfmsac_vv_f32m4_tu(__VA_ARGS__)
+#define vfmsac_vf_f32m4(...) __riscv_vfmsac_vf_f32m4_tu(__VA_ARGS__)
+#define vfmsac_vv_f32m8(...) __riscv_vfmsac_vv_f32m8_tu(__VA_ARGS__)
+#define vfmsac_vf_f32m8(...) __riscv_vfmsac_vf_f32m8_tu(__VA_ARGS__)
+#define vfmsac_vv_f64m1(...) __riscv_vfmsac_vv_f64m1_tu(__VA_ARGS__)
+#define vfmsac_vf_f64m1(...) __riscv_vfmsac_vf_f64m1_tu(__VA_ARGS__)
+#define vfmsac_vv_f64m2(...) __riscv_vfmsac_vv_f64m2_tu(__VA_ARGS__)
+#define vfmsac_vf_f64m2(...) __riscv_vfmsac_vf_f64m2_tu(__VA_ARGS__)
+#define vfmsac_vv_f64m4(...) __riscv_vfmsac_vv_f64m4_tu(__VA_ARGS__)
+#define vfmsac_vf_f64m4(...) __riscv_vfmsac_vf_f64m4_tu(__VA_ARGS__)
+#define vfmsac_vv_f64m8(...) __riscv_vfmsac_vv_f64m8_tu(__VA_ARGS__)
+#define vfmsac_vf_f64m8(...) __riscv_vfmsac_vf_f64m8_tu(__VA_ARGS__)
+#define vfnmsac_vv_f16mf4(...) __riscv_vfnmsac_vv_f16mf4_tu(__VA_ARGS__) // non-masked vfnmsac aliases: unprefixed name -> *_tu intrinsic, arguments forwarded unchanged
+#define vfnmsac_vf_f16mf4(...) __riscv_vfnmsac_vf_f16mf4_tu(__VA_ARGS__)
+#define vfnmsac_vv_f16mf2(...) __riscv_vfnmsac_vv_f16mf2_tu(__VA_ARGS__)
+#define vfnmsac_vf_f16mf2(...) __riscv_vfnmsac_vf_f16mf2_tu(__VA_ARGS__)
+#define vfnmsac_vv_f16m1(...) __riscv_vfnmsac_vv_f16m1_tu(__VA_ARGS__)
+#define vfnmsac_vf_f16m1(...) __riscv_vfnmsac_vf_f16m1_tu(__VA_ARGS__)
+#define vfnmsac_vv_f16m2(...) __riscv_vfnmsac_vv_f16m2_tu(__VA_ARGS__)
+#define vfnmsac_vf_f16m2(...) __riscv_vfnmsac_vf_f16m2_tu(__VA_ARGS__)
+#define vfnmsac_vv_f16m4(...) __riscv_vfnmsac_vv_f16m4_tu(__VA_ARGS__)
+#define vfnmsac_vf_f16m4(...) __riscv_vfnmsac_vf_f16m4_tu(__VA_ARGS__)
+#define vfnmsac_vv_f16m8(...) __riscv_vfnmsac_vv_f16m8_tu(__VA_ARGS__)
+#define vfnmsac_vf_f16m8(...) __riscv_vfnmsac_vf_f16m8_tu(__VA_ARGS__)
+#define vfnmsac_vv_f32mf2(...) __riscv_vfnmsac_vv_f32mf2_tu(__VA_ARGS__)
+#define vfnmsac_vf_f32mf2(...) __riscv_vfnmsac_vf_f32mf2_tu(__VA_ARGS__)
+#define vfnmsac_vv_f32m1(...) __riscv_vfnmsac_vv_f32m1_tu(__VA_ARGS__)
+#define vfnmsac_vf_f32m1(...) __riscv_vfnmsac_vf_f32m1_tu(__VA_ARGS__)
+#define vfnmsac_vv_f32m2(...) __riscv_vfnmsac_vv_f32m2_tu(__VA_ARGS__)
+#define vfnmsac_vf_f32m2(...) __riscv_vfnmsac_vf_f32m2_tu(__VA_ARGS__)
+#define vfnmsac_vv_f32m4(...) __riscv_vfnmsac_vv_f32m4_tu(__VA_ARGS__)
+#define vfnmsac_vf_f32m4(...) __riscv_vfnmsac_vf_f32m4_tu(__VA_ARGS__)
+#define vfnmsac_vv_f32m8(...) __riscv_vfnmsac_vv_f32m8_tu(__VA_ARGS__)
+#define vfnmsac_vf_f32m8(...) __riscv_vfnmsac_vf_f32m8_tu(__VA_ARGS__)
+#define vfnmsac_vv_f64m1(...) __riscv_vfnmsac_vv_f64m1_tu(__VA_ARGS__)
+#define vfnmsac_vf_f64m1(...) __riscv_vfnmsac_vf_f64m1_tu(__VA_ARGS__)
+#define vfnmsac_vv_f64m2(...) __riscv_vfnmsac_vv_f64m2_tu(__VA_ARGS__)
+#define vfnmsac_vf_f64m2(...) __riscv_vfnmsac_vf_f64m2_tu(__VA_ARGS__)
+#define vfnmsac_vv_f64m4(...) __riscv_vfnmsac_vv_f64m4_tu(__VA_ARGS__)
+#define vfnmsac_vf_f64m4(...) __riscv_vfnmsac_vf_f64m4_tu(__VA_ARGS__)
+#define vfnmsac_vv_f64m8(...) __riscv_vfnmsac_vv_f64m8_tu(__VA_ARGS__)
+#define vfnmsac_vf_f64m8(...) __riscv_vfnmsac_vf_f64m8_tu(__VA_ARGS__)
+#define vfmadd_vv_f16mf4(...) __riscv_vfmadd_vv_f16mf4_tu(__VA_ARGS__) // non-masked vfmadd aliases: unprefixed name -> *_tu intrinsic, arguments forwarded unchanged
+#define vfmadd_vf_f16mf4(...) __riscv_vfmadd_vf_f16mf4_tu(__VA_ARGS__)
+#define vfmadd_vv_f16mf2(...) __riscv_vfmadd_vv_f16mf2_tu(__VA_ARGS__)
+#define vfmadd_vf_f16mf2(...) __riscv_vfmadd_vf_f16mf2_tu(__VA_ARGS__)
+#define vfmadd_vv_f16m1(...) __riscv_vfmadd_vv_f16m1_tu(__VA_ARGS__)
+#define vfmadd_vf_f16m1(...) __riscv_vfmadd_vf_f16m1_tu(__VA_ARGS__)
+#define vfmadd_vv_f16m2(...) __riscv_vfmadd_vv_f16m2_tu(__VA_ARGS__)
+#define vfmadd_vf_f16m2(...) __riscv_vfmadd_vf_f16m2_tu(__VA_ARGS__)
+#define vfmadd_vv_f16m4(...) __riscv_vfmadd_vv_f16m4_tu(__VA_ARGS__)
+#define vfmadd_vf_f16m4(...) __riscv_vfmadd_vf_f16m4_tu(__VA_ARGS__)
+#define vfmadd_vv_f16m8(...) __riscv_vfmadd_vv_f16m8_tu(__VA_ARGS__)
+#define vfmadd_vf_f16m8(...) __riscv_vfmadd_vf_f16m8_tu(__VA_ARGS__)
+#define vfmadd_vv_f32mf2(...) __riscv_vfmadd_vv_f32mf2_tu(__VA_ARGS__)
+#define vfmadd_vf_f32mf2(...) __riscv_vfmadd_vf_f32mf2_tu(__VA_ARGS__)
+#define vfmadd_vv_f32m1(...) __riscv_vfmadd_vv_f32m1_tu(__VA_ARGS__)
+#define vfmadd_vf_f32m1(...) __riscv_vfmadd_vf_f32m1_tu(__VA_ARGS__)
+#define vfmadd_vv_f32m2(...) __riscv_vfmadd_vv_f32m2_tu(__VA_ARGS__)
+#define vfmadd_vf_f32m2(...) __riscv_vfmadd_vf_f32m2_tu(__VA_ARGS__)
+#define vfmadd_vv_f32m4(...) __riscv_vfmadd_vv_f32m4_tu(__VA_ARGS__)
+#define vfmadd_vf_f32m4(...) __riscv_vfmadd_vf_f32m4_tu(__VA_ARGS__)
+#define vfmadd_vv_f32m8(...) __riscv_vfmadd_vv_f32m8_tu(__VA_ARGS__)
+#define vfmadd_vf_f32m8(...) __riscv_vfmadd_vf_f32m8_tu(__VA_ARGS__)
+#define vfmadd_vv_f64m1(...) __riscv_vfmadd_vv_f64m1_tu(__VA_ARGS__)
+#define vfmadd_vf_f64m1(...) __riscv_vfmadd_vf_f64m1_tu(__VA_ARGS__)
+#define vfmadd_vv_f64m2(...) __riscv_vfmadd_vv_f64m2_tu(__VA_ARGS__)
+#define vfmadd_vf_f64m2(...) __riscv_vfmadd_vf_f64m2_tu(__VA_ARGS__)
+#define vfmadd_vv_f64m4(...) __riscv_vfmadd_vv_f64m4_tu(__VA_ARGS__)
+#define vfmadd_vf_f64m4(...) __riscv_vfmadd_vf_f64m4_tu(__VA_ARGS__)
+#define vfmadd_vv_f64m8(...) __riscv_vfmadd_vv_f64m8_tu(__VA_ARGS__)
+#define vfmadd_vf_f64m8(...) __riscv_vfmadd_vf_f64m8_tu(__VA_ARGS__)
+#define vfnmadd_vv_f16mf4(...) __riscv_vfnmadd_vv_f16mf4_tu(__VA_ARGS__) // non-masked vfnmadd aliases: unprefixed name -> *_tu intrinsic, arguments forwarded unchanged
+#define vfnmadd_vf_f16mf4(...) __riscv_vfnmadd_vf_f16mf4_tu(__VA_ARGS__)
+#define vfnmadd_vv_f16mf2(...) __riscv_vfnmadd_vv_f16mf2_tu(__VA_ARGS__)
+#define vfnmadd_vf_f16mf2(...) __riscv_vfnmadd_vf_f16mf2_tu(__VA_ARGS__)
+#define vfnmadd_vv_f16m1(...) __riscv_vfnmadd_vv_f16m1_tu(__VA_ARGS__)
+#define vfnmadd_vf_f16m1(...) __riscv_vfnmadd_vf_f16m1_tu(__VA_ARGS__)
+#define vfnmadd_vv_f16m2(...) __riscv_vfnmadd_vv_f16m2_tu(__VA_ARGS__)
+#define vfnmadd_vf_f16m2(...) __riscv_vfnmadd_vf_f16m2_tu(__VA_ARGS__)
+#define vfnmadd_vv_f16m4(...) __riscv_vfnmadd_vv_f16m4_tu(__VA_ARGS__)
+#define vfnmadd_vf_f16m4(...) __riscv_vfnmadd_vf_f16m4_tu(__VA_ARGS__)
+#define vfnmadd_vv_f16m8(...) __riscv_vfnmadd_vv_f16m8_tu(__VA_ARGS__)
+#define vfnmadd_vf_f16m8(...) __riscv_vfnmadd_vf_f16m8_tu(__VA_ARGS__)
+#define vfnmadd_vv_f32mf2(...) __riscv_vfnmadd_vv_f32mf2_tu(__VA_ARGS__)
+#define vfnmadd_vf_f32mf2(...) __riscv_vfnmadd_vf_f32mf2_tu(__VA_ARGS__)
+#define vfnmadd_vv_f32m1(...) __riscv_vfnmadd_vv_f32m1_tu(__VA_ARGS__)
+#define vfnmadd_vf_f32m1(...) __riscv_vfnmadd_vf_f32m1_tu(__VA_ARGS__)
+#define vfnmadd_vv_f32m2(...) __riscv_vfnmadd_vv_f32m2_tu(__VA_ARGS__)
+#define vfnmadd_vf_f32m2(...) __riscv_vfnmadd_vf_f32m2_tu(__VA_ARGS__)
+#define vfnmadd_vv_f32m4(...) __riscv_vfnmadd_vv_f32m4_tu(__VA_ARGS__)
+#define vfnmadd_vf_f32m4(...) __riscv_vfnmadd_vf_f32m4_tu(__VA_ARGS__)
+#define vfnmadd_vv_f32m8(...) __riscv_vfnmadd_vv_f32m8_tu(__VA_ARGS__)
+#define vfnmadd_vf_f32m8(...) __riscv_vfnmadd_vf_f32m8_tu(__VA_ARGS__)
+#define vfnmadd_vv_f64m1(...) __riscv_vfnmadd_vv_f64m1_tu(__VA_ARGS__)
+#define vfnmadd_vf_f64m1(...) __riscv_vfnmadd_vf_f64m1_tu(__VA_ARGS__)
+#define vfnmadd_vv_f64m2(...) __riscv_vfnmadd_vv_f64m2_tu(__VA_ARGS__)
+#define vfnmadd_vf_f64m2(...) __riscv_vfnmadd_vf_f64m2_tu(__VA_ARGS__)
+#define vfnmadd_vv_f64m4(...) __riscv_vfnmadd_vv_f64m4_tu(__VA_ARGS__)
+#define vfnmadd_vf_f64m4(...) __riscv_vfnmadd_vf_f64m4_tu(__VA_ARGS__)
+#define vfnmadd_vv_f64m8(...) __riscv_vfnmadd_vv_f64m8_tu(__VA_ARGS__)
+#define vfnmadd_vf_f64m8(...) __riscv_vfnmadd_vf_f64m8_tu(__VA_ARGS__)
+#define vfmsub_vv_f16mf4(...) __riscv_vfmsub_vv_f16mf4_tu(__VA_ARGS__) // non-masked vfmsub aliases: unprefixed name -> *_tu intrinsic, arguments forwarded unchanged
+#define vfmsub_vf_f16mf4(...) __riscv_vfmsub_vf_f16mf4_tu(__VA_ARGS__)
+#define vfmsub_vv_f16mf2(...) __riscv_vfmsub_vv_f16mf2_tu(__VA_ARGS__)
+#define vfmsub_vf_f16mf2(...) __riscv_vfmsub_vf_f16mf2_tu(__VA_ARGS__)
+#define vfmsub_vv_f16m1(...) __riscv_vfmsub_vv_f16m1_tu(__VA_ARGS__)
+#define vfmsub_vf_f16m1(...) __riscv_vfmsub_vf_f16m1_tu(__VA_ARGS__)
+#define vfmsub_vv_f16m2(...) __riscv_vfmsub_vv_f16m2_tu(__VA_ARGS__)
+#define vfmsub_vf_f16m2(...) __riscv_vfmsub_vf_f16m2_tu(__VA_ARGS__)
+#define vfmsub_vv_f16m4(...) __riscv_vfmsub_vv_f16m4_tu(__VA_ARGS__)
+#define vfmsub_vf_f16m4(...) __riscv_vfmsub_vf_f16m4_tu(__VA_ARGS__)
+#define vfmsub_vv_f16m8(...) __riscv_vfmsub_vv_f16m8_tu(__VA_ARGS__)
+#define vfmsub_vf_f16m8(...) __riscv_vfmsub_vf_f16m8_tu(__VA_ARGS__)
+#define vfmsub_vv_f32mf2(...) __riscv_vfmsub_vv_f32mf2_tu(__VA_ARGS__)
+#define vfmsub_vf_f32mf2(...) __riscv_vfmsub_vf_f32mf2_tu(__VA_ARGS__)
+#define vfmsub_vv_f32m1(...) __riscv_vfmsub_vv_f32m1_tu(__VA_ARGS__)
+#define vfmsub_vf_f32m1(...) __riscv_vfmsub_vf_f32m1_tu(__VA_ARGS__)
+#define vfmsub_vv_f32m2(...) __riscv_vfmsub_vv_f32m2_tu(__VA_ARGS__)
+#define vfmsub_vf_f32m2(...) __riscv_vfmsub_vf_f32m2_tu(__VA_ARGS__)
+#define vfmsub_vv_f32m4(...) __riscv_vfmsub_vv_f32m4_tu(__VA_ARGS__)
+#define vfmsub_vf_f32m4(...) __riscv_vfmsub_vf_f32m4_tu(__VA_ARGS__)
+#define vfmsub_vv_f32m8(...) __riscv_vfmsub_vv_f32m8_tu(__VA_ARGS__)
+#define vfmsub_vf_f32m8(...) __riscv_vfmsub_vf_f32m8_tu(__VA_ARGS__)
+#define vfmsub_vv_f64m1(...) __riscv_vfmsub_vv_f64m1_tu(__VA_ARGS__)
+#define vfmsub_vf_f64m1(...) __riscv_vfmsub_vf_f64m1_tu(__VA_ARGS__)
+#define vfmsub_vv_f64m2(...) __riscv_vfmsub_vv_f64m2_tu(__VA_ARGS__)
+#define vfmsub_vf_f64m2(...) __riscv_vfmsub_vf_f64m2_tu(__VA_ARGS__)
+#define vfmsub_vv_f64m4(...) __riscv_vfmsub_vv_f64m4_tu(__VA_ARGS__)
+#define vfmsub_vf_f64m4(...) __riscv_vfmsub_vf_f64m4_tu(__VA_ARGS__)
+#define vfmsub_vv_f64m8(...) __riscv_vfmsub_vv_f64m8_tu(__VA_ARGS__)
+#define vfmsub_vf_f64m8(...) __riscv_vfmsub_vf_f64m8_tu(__VA_ARGS__)
+#define vfnmsub_vv_f16mf4(...) __riscv_vfnmsub_vv_f16mf4_tu(__VA_ARGS__) // non-masked vfnmsub aliases: unprefixed name -> *_tu intrinsic, arguments forwarded unchanged
+#define vfnmsub_vf_f16mf4(...) __riscv_vfnmsub_vf_f16mf4_tu(__VA_ARGS__)
+#define vfnmsub_vv_f16mf2(...) __riscv_vfnmsub_vv_f16mf2_tu(__VA_ARGS__)
+#define vfnmsub_vf_f16mf2(...) __riscv_vfnmsub_vf_f16mf2_tu(__VA_ARGS__)
+#define vfnmsub_vv_f16m1(...) __riscv_vfnmsub_vv_f16m1_tu(__VA_ARGS__)
+#define vfnmsub_vf_f16m1(...) __riscv_vfnmsub_vf_f16m1_tu(__VA_ARGS__)
+#define vfnmsub_vv_f16m2(...) __riscv_vfnmsub_vv_f16m2_tu(__VA_ARGS__)
+#define vfnmsub_vf_f16m2(...) __riscv_vfnmsub_vf_f16m2_tu(__VA_ARGS__)
+#define vfnmsub_vv_f16m4(...) __riscv_vfnmsub_vv_f16m4_tu(__VA_ARGS__)
+#define vfnmsub_vf_f16m4(...) __riscv_vfnmsub_vf_f16m4_tu(__VA_ARGS__)
+#define vfnmsub_vv_f16m8(...) __riscv_vfnmsub_vv_f16m8_tu(__VA_ARGS__)
+#define vfnmsub_vf_f16m8(...) __riscv_vfnmsub_vf_f16m8_tu(__VA_ARGS__)
+#define vfnmsub_vv_f32mf2(...) __riscv_vfnmsub_vv_f32mf2_tu(__VA_ARGS__)
+#define vfnmsub_vf_f32mf2(...) __riscv_vfnmsub_vf_f32mf2_tu(__VA_ARGS__)
+#define vfnmsub_vv_f32m1(...) __riscv_vfnmsub_vv_f32m1_tu(__VA_ARGS__)
+#define vfnmsub_vf_f32m1(...) __riscv_vfnmsub_vf_f32m1_tu(__VA_ARGS__)
+#define vfnmsub_vv_f32m2(...) __riscv_vfnmsub_vv_f32m2_tu(__VA_ARGS__)
+#define vfnmsub_vf_f32m2(...) __riscv_vfnmsub_vf_f32m2_tu(__VA_ARGS__)
+#define vfnmsub_vv_f32m4(...) __riscv_vfnmsub_vv_f32m4_tu(__VA_ARGS__)
+#define vfnmsub_vf_f32m4(...) __riscv_vfnmsub_vf_f32m4_tu(__VA_ARGS__)
+#define vfnmsub_vv_f32m8(...) __riscv_vfnmsub_vv_f32m8_tu(__VA_ARGS__)
+#define vfnmsub_vf_f32m8(...) __riscv_vfnmsub_vf_f32m8_tu(__VA_ARGS__)
+#define vfnmsub_vv_f64m1(...) __riscv_vfnmsub_vv_f64m1_tu(__VA_ARGS__)
+#define vfnmsub_vf_f64m1(...) __riscv_vfnmsub_vf_f64m1_tu(__VA_ARGS__)
+#define vfnmsub_vv_f64m2(...) __riscv_vfnmsub_vv_f64m2_tu(__VA_ARGS__)
+#define vfnmsub_vf_f64m2(...) __riscv_vfnmsub_vf_f64m2_tu(__VA_ARGS__)
+#define vfnmsub_vv_f64m4(...) __riscv_vfnmsub_vv_f64m4_tu(__VA_ARGS__)
+#define vfnmsub_vf_f64m4(...) __riscv_vfnmsub_vf_f64m4_tu(__VA_ARGS__)
+#define vfnmsub_vv_f64m8(...) __riscv_vfnmsub_vv_f64m8_tu(__VA_ARGS__)
+#define vfnmsub_vf_f64m8(...) __riscv_vfnmsub_vf_f64m8_tu(__VA_ARGS__)
+// masked functions: each *_m alias forwards its arguments to the *_tumu intrinsic (NOTE(review): presumably the tail-/mask-undisturbed policy variant; confirm in the RVV intrinsics spec)
+#define vfmacc_vv_f16mf4_m(...) __riscv_vfmacc_vv_f16mf4_tumu(__VA_ARGS__)
+#define vfmacc_vf_f16mf4_m(...) __riscv_vfmacc_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfmacc_vv_f16mf2_m(...) __riscv_vfmacc_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfmacc_vf_f16mf2_m(...) __riscv_vfmacc_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfmacc_vv_f16m1_m(...) __riscv_vfmacc_vv_f16m1_tumu(__VA_ARGS__)
+#define vfmacc_vf_f16m1_m(...) __riscv_vfmacc_vf_f16m1_tumu(__VA_ARGS__)
+#define vfmacc_vv_f16m2_m(...) __riscv_vfmacc_vv_f16m2_tumu(__VA_ARGS__)
+#define vfmacc_vf_f16m2_m(...) __riscv_vfmacc_vf_f16m2_tumu(__VA_ARGS__)
+#define vfmacc_vv_f16m4_m(...) __riscv_vfmacc_vv_f16m4_tumu(__VA_ARGS__)
+#define vfmacc_vf_f16m4_m(...) __riscv_vfmacc_vf_f16m4_tumu(__VA_ARGS__)
+#define vfmacc_vv_f16m8_m(...) __riscv_vfmacc_vv_f16m8_tumu(__VA_ARGS__)
+#define vfmacc_vf_f16m8_m(...) __riscv_vfmacc_vf_f16m8_tumu(__VA_ARGS__)
+#define vfmacc_vv_f32mf2_m(...) __riscv_vfmacc_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfmacc_vf_f32mf2_m(...) __riscv_vfmacc_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfmacc_vv_f32m1_m(...) __riscv_vfmacc_vv_f32m1_tumu(__VA_ARGS__)
+#define vfmacc_vf_f32m1_m(...) __riscv_vfmacc_vf_f32m1_tumu(__VA_ARGS__)
+#define vfmacc_vv_f32m2_m(...) __riscv_vfmacc_vv_f32m2_tumu(__VA_ARGS__)
+#define vfmacc_vf_f32m2_m(...) __riscv_vfmacc_vf_f32m2_tumu(__VA_ARGS__)
+#define vfmacc_vv_f32m4_m(...) __riscv_vfmacc_vv_f32m4_tumu(__VA_ARGS__)
+#define vfmacc_vf_f32m4_m(...) __riscv_vfmacc_vf_f32m4_tumu(__VA_ARGS__)
+#define vfmacc_vv_f32m8_m(...) __riscv_vfmacc_vv_f32m8_tumu(__VA_ARGS__)
+#define vfmacc_vf_f32m8_m(...) __riscv_vfmacc_vf_f32m8_tumu(__VA_ARGS__)
+#define vfmacc_vv_f64m1_m(...) __riscv_vfmacc_vv_f64m1_tumu(__VA_ARGS__)
+#define vfmacc_vf_f64m1_m(...) __riscv_vfmacc_vf_f64m1_tumu(__VA_ARGS__)
+#define vfmacc_vv_f64m2_m(...) __riscv_vfmacc_vv_f64m2_tumu(__VA_ARGS__)
+#define vfmacc_vf_f64m2_m(...) __riscv_vfmacc_vf_f64m2_tumu(__VA_ARGS__)
+#define vfmacc_vv_f64m4_m(...) __riscv_vfmacc_vv_f64m4_tumu(__VA_ARGS__)
+#define vfmacc_vf_f64m4_m(...) __riscv_vfmacc_vf_f64m4_tumu(__VA_ARGS__)
+#define vfmacc_vv_f64m8_m(...) __riscv_vfmacc_vv_f64m8_tumu(__VA_ARGS__)
+#define vfmacc_vf_f64m8_m(...) __riscv_vfmacc_vf_f64m8_tumu(__VA_ARGS__)
+#define vfnmacc_vv_f16mf4_m(...) __riscv_vfnmacc_vv_f16mf4_tumu(__VA_ARGS__) // masked vfnmacc aliases: *_m name -> *_tumu intrinsic, arguments forwarded unchanged
+#define vfnmacc_vf_f16mf4_m(...) __riscv_vfnmacc_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfnmacc_vv_f16mf2_m(...) __riscv_vfnmacc_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfnmacc_vf_f16mf2_m(...) __riscv_vfnmacc_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfnmacc_vv_f16m1_m(...) __riscv_vfnmacc_vv_f16m1_tumu(__VA_ARGS__)
+#define vfnmacc_vf_f16m1_m(...) __riscv_vfnmacc_vf_f16m1_tumu(__VA_ARGS__)
+#define vfnmacc_vv_f16m2_m(...) __riscv_vfnmacc_vv_f16m2_tumu(__VA_ARGS__)
+#define vfnmacc_vf_f16m2_m(...) __riscv_vfnmacc_vf_f16m2_tumu(__VA_ARGS__)
+#define vfnmacc_vv_f16m4_m(...) __riscv_vfnmacc_vv_f16m4_tumu(__VA_ARGS__)
+#define vfnmacc_vf_f16m4_m(...) __riscv_vfnmacc_vf_f16m4_tumu(__VA_ARGS__)
+#define vfnmacc_vv_f16m8_m(...) __riscv_vfnmacc_vv_f16m8_tumu(__VA_ARGS__)
+#define vfnmacc_vf_f16m8_m(...) __riscv_vfnmacc_vf_f16m8_tumu(__VA_ARGS__)
+#define vfnmacc_vv_f32mf2_m(...) __riscv_vfnmacc_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfnmacc_vf_f32mf2_m(...) __riscv_vfnmacc_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfnmacc_vv_f32m1_m(...) __riscv_vfnmacc_vv_f32m1_tumu(__VA_ARGS__)
+#define vfnmacc_vf_f32m1_m(...) __riscv_vfnmacc_vf_f32m1_tumu(__VA_ARGS__)
+#define vfnmacc_vv_f32m2_m(...) __riscv_vfnmacc_vv_f32m2_tumu(__VA_ARGS__)
+#define vfnmacc_vf_f32m2_m(...) __riscv_vfnmacc_vf_f32m2_tumu(__VA_ARGS__)
+#define vfnmacc_vv_f32m4_m(...) __riscv_vfnmacc_vv_f32m4_tumu(__VA_ARGS__)
+#define vfnmacc_vf_f32m4_m(...) __riscv_vfnmacc_vf_f32m4_tumu(__VA_ARGS__)
+#define vfnmacc_vv_f32m8_m(...) __riscv_vfnmacc_vv_f32m8_tumu(__VA_ARGS__)
+#define vfnmacc_vf_f32m8_m(...) __riscv_vfnmacc_vf_f32m8_tumu(__VA_ARGS__)
+#define vfnmacc_vv_f64m1_m(...) __riscv_vfnmacc_vv_f64m1_tumu(__VA_ARGS__)
+#define vfnmacc_vf_f64m1_m(...) __riscv_vfnmacc_vf_f64m1_tumu(__VA_ARGS__)
+#define vfnmacc_vv_f64m2_m(...) __riscv_vfnmacc_vv_f64m2_tumu(__VA_ARGS__)
+#define vfnmacc_vf_f64m2_m(...) __riscv_vfnmacc_vf_f64m2_tumu(__VA_ARGS__)
+#define vfnmacc_vv_f64m4_m(...) __riscv_vfnmacc_vv_f64m4_tumu(__VA_ARGS__)
+#define vfnmacc_vf_f64m4_m(...) __riscv_vfnmacc_vf_f64m4_tumu(__VA_ARGS__)
+#define vfnmacc_vv_f64m8_m(...) __riscv_vfnmacc_vv_f64m8_tumu(__VA_ARGS__)
+#define vfnmacc_vf_f64m8_m(...) __riscv_vfnmacc_vf_f64m8_tumu(__VA_ARGS__)
+#define vfmsac_vv_f16mf4_m(...) __riscv_vfmsac_vv_f16mf4_tumu(__VA_ARGS__) // masked vfmsac aliases: *_m name -> *_tumu intrinsic, arguments forwarded unchanged
+#define vfmsac_vf_f16mf4_m(...) __riscv_vfmsac_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfmsac_vv_f16mf2_m(...) __riscv_vfmsac_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfmsac_vf_f16mf2_m(...) __riscv_vfmsac_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfmsac_vv_f16m1_m(...) __riscv_vfmsac_vv_f16m1_tumu(__VA_ARGS__)
+#define vfmsac_vf_f16m1_m(...) __riscv_vfmsac_vf_f16m1_tumu(__VA_ARGS__)
+#define vfmsac_vv_f16m2_m(...) __riscv_vfmsac_vv_f16m2_tumu(__VA_ARGS__)
+#define vfmsac_vf_f16m2_m(...) __riscv_vfmsac_vf_f16m2_tumu(__VA_ARGS__)
+#define vfmsac_vv_f16m4_m(...) __riscv_vfmsac_vv_f16m4_tumu(__VA_ARGS__)
+#define vfmsac_vf_f16m4_m(...) __riscv_vfmsac_vf_f16m4_tumu(__VA_ARGS__)
+#define vfmsac_vv_f16m8_m(...) __riscv_vfmsac_vv_f16m8_tumu(__VA_ARGS__)
+#define vfmsac_vf_f16m8_m(...) __riscv_vfmsac_vf_f16m8_tumu(__VA_ARGS__)
+#define vfmsac_vv_f32mf2_m(...) __riscv_vfmsac_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfmsac_vf_f32mf2_m(...) __riscv_vfmsac_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfmsac_vv_f32m1_m(...) __riscv_vfmsac_vv_f32m1_tumu(__VA_ARGS__)
+#define vfmsac_vf_f32m1_m(...) __riscv_vfmsac_vf_f32m1_tumu(__VA_ARGS__)
+#define vfmsac_vv_f32m2_m(...) __riscv_vfmsac_vv_f32m2_tumu(__VA_ARGS__)
+#define vfmsac_vf_f32m2_m(...) __riscv_vfmsac_vf_f32m2_tumu(__VA_ARGS__)
+#define vfmsac_vv_f32m4_m(...) __riscv_vfmsac_vv_f32m4_tumu(__VA_ARGS__)
+#define vfmsac_vf_f32m4_m(...) __riscv_vfmsac_vf_f32m4_tumu(__VA_ARGS__)
+#define vfmsac_vv_f32m8_m(...) __riscv_vfmsac_vv_f32m8_tumu(__VA_ARGS__)
+#define vfmsac_vf_f32m8_m(...) __riscv_vfmsac_vf_f32m8_tumu(__VA_ARGS__)
+#define vfmsac_vv_f64m1_m(...) __riscv_vfmsac_vv_f64m1_tumu(__VA_ARGS__)
+#define vfmsac_vf_f64m1_m(...) __riscv_vfmsac_vf_f64m1_tumu(__VA_ARGS__)
+#define vfmsac_vv_f64m2_m(...) __riscv_vfmsac_vv_f64m2_tumu(__VA_ARGS__)
+#define vfmsac_vf_f64m2_m(...) __riscv_vfmsac_vf_f64m2_tumu(__VA_ARGS__)
+#define vfmsac_vv_f64m4_m(...) __riscv_vfmsac_vv_f64m4_tumu(__VA_ARGS__)
+#define vfmsac_vf_f64m4_m(...) __riscv_vfmsac_vf_f64m4_tumu(__VA_ARGS__)
+#define vfmsac_vv_f64m8_m(...) __riscv_vfmsac_vv_f64m8_tumu(__VA_ARGS__)
+#define vfmsac_vf_f64m8_m(...) __riscv_vfmsac_vf_f64m8_tumu(__VA_ARGS__)
+#define vfnmsac_vv_f16mf4_m(...) __riscv_vfnmsac_vv_f16mf4_tumu(__VA_ARGS__)
+#define vfnmsac_vf_f16mf4_m(...) __riscv_vfnmsac_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfnmsac_vv_f16mf2_m(...) __riscv_vfnmsac_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfnmsac_vf_f16mf2_m(...) __riscv_vfnmsac_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfnmsac_vv_f16m1_m(...) __riscv_vfnmsac_vv_f16m1_tumu(__VA_ARGS__)
+#define vfnmsac_vf_f16m1_m(...) __riscv_vfnmsac_vf_f16m1_tumu(__VA_ARGS__)
+#define vfnmsac_vv_f16m2_m(...) __riscv_vfnmsac_vv_f16m2_tumu(__VA_ARGS__)
+#define vfnmsac_vf_f16m2_m(...) __riscv_vfnmsac_vf_f16m2_tumu(__VA_ARGS__)
+#define vfnmsac_vv_f16m4_m(...) __riscv_vfnmsac_vv_f16m4_tumu(__VA_ARGS__)
+#define vfnmsac_vf_f16m4_m(...) __riscv_vfnmsac_vf_f16m4_tumu(__VA_ARGS__)
+#define vfnmsac_vv_f16m8_m(...) __riscv_vfnmsac_vv_f16m8_tumu(__VA_ARGS__)
+#define vfnmsac_vf_f16m8_m(...) __riscv_vfnmsac_vf_f16m8_tumu(__VA_ARGS__)
+#define vfnmsac_vv_f32mf2_m(...) __riscv_vfnmsac_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfnmsac_vf_f32mf2_m(...) __riscv_vfnmsac_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfnmsac_vv_f32m1_m(...) __riscv_vfnmsac_vv_f32m1_tumu(__VA_ARGS__)
+#define vfnmsac_vf_f32m1_m(...) __riscv_vfnmsac_vf_f32m1_tumu(__VA_ARGS__)
+#define vfnmsac_vv_f32m2_m(...) __riscv_vfnmsac_vv_f32m2_tumu(__VA_ARGS__)
+#define vfnmsac_vf_f32m2_m(...) __riscv_vfnmsac_vf_f32m2_tumu(__VA_ARGS__)
+#define vfnmsac_vv_f32m4_m(...) __riscv_vfnmsac_vv_f32m4_tumu(__VA_ARGS__)
+#define vfnmsac_vf_f32m4_m(...) __riscv_vfnmsac_vf_f32m4_tumu(__VA_ARGS__)
+#define vfnmsac_vv_f32m8_m(...) __riscv_vfnmsac_vv_f32m8_tumu(__VA_ARGS__)
+#define vfnmsac_vf_f32m8_m(...) __riscv_vfnmsac_vf_f32m8_tumu(__VA_ARGS__)
+#define vfnmsac_vv_f64m1_m(...) __riscv_vfnmsac_vv_f64m1_tumu(__VA_ARGS__)
+#define vfnmsac_vf_f64m1_m(...) __riscv_vfnmsac_vf_f64m1_tumu(__VA_ARGS__)
+#define vfnmsac_vv_f64m2_m(...) __riscv_vfnmsac_vv_f64m2_tumu(__VA_ARGS__)
+#define vfnmsac_vf_f64m2_m(...) __riscv_vfnmsac_vf_f64m2_tumu(__VA_ARGS__)
+#define vfnmsac_vv_f64m4_m(...) __riscv_vfnmsac_vv_f64m4_tumu(__VA_ARGS__)
+#define vfnmsac_vf_f64m4_m(...) __riscv_vfnmsac_vf_f64m4_tumu(__VA_ARGS__)
+#define vfnmsac_vv_f64m8_m(...) __riscv_vfnmsac_vv_f64m8_tumu(__VA_ARGS__)
+#define vfnmsac_vf_f64m8_m(...) __riscv_vfnmsac_vf_f64m8_tumu(__VA_ARGS__)
+#define vfmadd_vv_f16mf4_m(...) __riscv_vfmadd_vv_f16mf4_tumu(__VA_ARGS__)
+#define vfmadd_vf_f16mf4_m(...) __riscv_vfmadd_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfmadd_vv_f16mf2_m(...) __riscv_vfmadd_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfmadd_vf_f16mf2_m(...) __riscv_vfmadd_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfmadd_vv_f16m1_m(...) __riscv_vfmadd_vv_f16m1_tumu(__VA_ARGS__)
+#define vfmadd_vf_f16m1_m(...) __riscv_vfmadd_vf_f16m1_tumu(__VA_ARGS__)
+#define vfmadd_vv_f16m2_m(...) __riscv_vfmadd_vv_f16m2_tumu(__VA_ARGS__)
+#define vfmadd_vf_f16m2_m(...) __riscv_vfmadd_vf_f16m2_tumu(__VA_ARGS__)
+#define vfmadd_vv_f16m4_m(...) __riscv_vfmadd_vv_f16m4_tumu(__VA_ARGS__)
+#define vfmadd_vf_f16m4_m(...) __riscv_vfmadd_vf_f16m4_tumu(__VA_ARGS__)
+#define vfmadd_vv_f16m8_m(...) __riscv_vfmadd_vv_f16m8_tumu(__VA_ARGS__)
+#define vfmadd_vf_f16m8_m(...) __riscv_vfmadd_vf_f16m8_tumu(__VA_ARGS__)
+#define vfmadd_vv_f32mf2_m(...) __riscv_vfmadd_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfmadd_vf_f32mf2_m(...) __riscv_vfmadd_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfmadd_vv_f32m1_m(...) __riscv_vfmadd_vv_f32m1_tumu(__VA_ARGS__)
+#define vfmadd_vf_f32m1_m(...) __riscv_vfmadd_vf_f32m1_tumu(__VA_ARGS__)
+#define vfmadd_vv_f32m2_m(...) __riscv_vfmadd_vv_f32m2_tumu(__VA_ARGS__)
+#define vfmadd_vf_f32m2_m(...) __riscv_vfmadd_vf_f32m2_tumu(__VA_ARGS__)
+#define vfmadd_vv_f32m4_m(...) __riscv_vfmadd_vv_f32m4_tumu(__VA_ARGS__)
+#define vfmadd_vf_f32m4_m(...) __riscv_vfmadd_vf_f32m4_tumu(__VA_ARGS__)
+#define vfmadd_vv_f32m8_m(...) __riscv_vfmadd_vv_f32m8_tumu(__VA_ARGS__)
+#define vfmadd_vf_f32m8_m(...) __riscv_vfmadd_vf_f32m8_tumu(__VA_ARGS__)
+#define vfmadd_vv_f64m1_m(...) __riscv_vfmadd_vv_f64m1_tumu(__VA_ARGS__)
+#define vfmadd_vf_f64m1_m(...) __riscv_vfmadd_vf_f64m1_tumu(__VA_ARGS__)
+#define vfmadd_vv_f64m2_m(...) __riscv_vfmadd_vv_f64m2_tumu(__VA_ARGS__)
+#define vfmadd_vf_f64m2_m(...) __riscv_vfmadd_vf_f64m2_tumu(__VA_ARGS__)
+#define vfmadd_vv_f64m4_m(...) __riscv_vfmadd_vv_f64m4_tumu(__VA_ARGS__)
+#define vfmadd_vf_f64m4_m(...) __riscv_vfmadd_vf_f64m4_tumu(__VA_ARGS__)
+#define vfmadd_vv_f64m8_m(...) __riscv_vfmadd_vv_f64m8_tumu(__VA_ARGS__)
+#define vfmadd_vf_f64m8_m(...) __riscv_vfmadd_vf_f64m8_tumu(__VA_ARGS__)
+#define vfnmadd_vv_f16mf4_m(...) __riscv_vfnmadd_vv_f16mf4_tumu(__VA_ARGS__)
+#define vfnmadd_vf_f16mf4_m(...) __riscv_vfnmadd_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfnmadd_vv_f16mf2_m(...) __riscv_vfnmadd_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfnmadd_vf_f16mf2_m(...) __riscv_vfnmadd_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfnmadd_vv_f16m1_m(...) __riscv_vfnmadd_vv_f16m1_tumu(__VA_ARGS__)
+#define vfnmadd_vf_f16m1_m(...) __riscv_vfnmadd_vf_f16m1_tumu(__VA_ARGS__)
+#define vfnmadd_vv_f16m2_m(...) __riscv_vfnmadd_vv_f16m2_tumu(__VA_ARGS__)
+#define vfnmadd_vf_f16m2_m(...) __riscv_vfnmadd_vf_f16m2_tumu(__VA_ARGS__)
+#define vfnmadd_vv_f16m4_m(...) __riscv_vfnmadd_vv_f16m4_tumu(__VA_ARGS__)
+#define vfnmadd_vf_f16m4_m(...) __riscv_vfnmadd_vf_f16m4_tumu(__VA_ARGS__)
+#define vfnmadd_vv_f16m8_m(...) __riscv_vfnmadd_vv_f16m8_tumu(__VA_ARGS__)
+#define vfnmadd_vf_f16m8_m(...) __riscv_vfnmadd_vf_f16m8_tumu(__VA_ARGS__)
+#define vfnmadd_vv_f32mf2_m(...) __riscv_vfnmadd_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfnmadd_vf_f32mf2_m(...) __riscv_vfnmadd_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfnmadd_vv_f32m1_m(...) __riscv_vfnmadd_vv_f32m1_tumu(__VA_ARGS__)
+#define vfnmadd_vf_f32m1_m(...) __riscv_vfnmadd_vf_f32m1_tumu(__VA_ARGS__)
+#define vfnmadd_vv_f32m2_m(...) __riscv_vfnmadd_vv_f32m2_tumu(__VA_ARGS__)
+#define vfnmadd_vf_f32m2_m(...) __riscv_vfnmadd_vf_f32m2_tumu(__VA_ARGS__)
+#define vfnmadd_vv_f32m4_m(...) __riscv_vfnmadd_vv_f32m4_tumu(__VA_ARGS__)
+#define vfnmadd_vf_f32m4_m(...) __riscv_vfnmadd_vf_f32m4_tumu(__VA_ARGS__)
+#define vfnmadd_vv_f32m8_m(...) __riscv_vfnmadd_vv_f32m8_tumu(__VA_ARGS__)
+#define vfnmadd_vf_f32m8_m(...) __riscv_vfnmadd_vf_f32m8_tumu(__VA_ARGS__)
+#define vfnmadd_vv_f64m1_m(...) __riscv_vfnmadd_vv_f64m1_tumu(__VA_ARGS__)
+#define vfnmadd_vf_f64m1_m(...) __riscv_vfnmadd_vf_f64m1_tumu(__VA_ARGS__)
+#define vfnmadd_vv_f64m2_m(...) __riscv_vfnmadd_vv_f64m2_tumu(__VA_ARGS__)
+#define vfnmadd_vf_f64m2_m(...) __riscv_vfnmadd_vf_f64m2_tumu(__VA_ARGS__)
+#define vfnmadd_vv_f64m4_m(...) __riscv_vfnmadd_vv_f64m4_tumu(__VA_ARGS__)
+#define vfnmadd_vf_f64m4_m(...) __riscv_vfnmadd_vf_f64m4_tumu(__VA_ARGS__)
+#define vfnmadd_vv_f64m8_m(...) __riscv_vfnmadd_vv_f64m8_tumu(__VA_ARGS__)
+#define vfnmadd_vf_f64m8_m(...) __riscv_vfnmadd_vf_f64m8_tumu(__VA_ARGS__)
+#define vfmsub_vv_f16mf4_m(...) __riscv_vfmsub_vv_f16mf4_tumu(__VA_ARGS__)
+#define vfmsub_vf_f16mf4_m(...) __riscv_vfmsub_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfmsub_vv_f16mf2_m(...) __riscv_vfmsub_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfmsub_vf_f16mf2_m(...) __riscv_vfmsub_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfmsub_vv_f16m1_m(...) __riscv_vfmsub_vv_f16m1_tumu(__VA_ARGS__)
+#define vfmsub_vf_f16m1_m(...) __riscv_vfmsub_vf_f16m1_tumu(__VA_ARGS__)
+#define vfmsub_vv_f16m2_m(...) __riscv_vfmsub_vv_f16m2_tumu(__VA_ARGS__)
+#define vfmsub_vf_f16m2_m(...) __riscv_vfmsub_vf_f16m2_tumu(__VA_ARGS__)
+#define vfmsub_vv_f16m4_m(...) __riscv_vfmsub_vv_f16m4_tumu(__VA_ARGS__)
+#define vfmsub_vf_f16m4_m(...) __riscv_vfmsub_vf_f16m4_tumu(__VA_ARGS__)
+#define vfmsub_vv_f16m8_m(...) __riscv_vfmsub_vv_f16m8_tumu(__VA_ARGS__)
+#define vfmsub_vf_f16m8_m(...) __riscv_vfmsub_vf_f16m8_tumu(__VA_ARGS__)
+#define vfmsub_vv_f32mf2_m(...) __riscv_vfmsub_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfmsub_vf_f32mf2_m(...) __riscv_vfmsub_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfmsub_vv_f32m1_m(...) __riscv_vfmsub_vv_f32m1_tumu(__VA_ARGS__)
+#define vfmsub_vf_f32m1_m(...) __riscv_vfmsub_vf_f32m1_tumu(__VA_ARGS__)
+#define vfmsub_vv_f32m2_m(...) __riscv_vfmsub_vv_f32m2_tumu(__VA_ARGS__)
+#define vfmsub_vf_f32m2_m(...) __riscv_vfmsub_vf_f32m2_tumu(__VA_ARGS__)
+#define vfmsub_vv_f32m4_m(...) __riscv_vfmsub_vv_f32m4_tumu(__VA_ARGS__)
+#define vfmsub_vf_f32m4_m(...) __riscv_vfmsub_vf_f32m4_tumu(__VA_ARGS__)
+#define vfmsub_vv_f32m8_m(...) __riscv_vfmsub_vv_f32m8_tumu(__VA_ARGS__)
+#define vfmsub_vf_f32m8_m(...) __riscv_vfmsub_vf_f32m8_tumu(__VA_ARGS__)
+#define vfmsub_vv_f64m1_m(...) __riscv_vfmsub_vv_f64m1_tumu(__VA_ARGS__)
+#define vfmsub_vf_f64m1_m(...) __riscv_vfmsub_vf_f64m1_tumu(__VA_ARGS__)
+#define vfmsub_vv_f64m2_m(...) __riscv_vfmsub_vv_f64m2_tumu(__VA_ARGS__)
+#define vfmsub_vf_f64m2_m(...) __riscv_vfmsub_vf_f64m2_tumu(__VA_ARGS__)
+#define vfmsub_vv_f64m4_m(...) __riscv_vfmsub_vv_f64m4_tumu(__VA_ARGS__)
+#define vfmsub_vf_f64m4_m(...) __riscv_vfmsub_vf_f64m4_tumu(__VA_ARGS__)
+#define vfmsub_vv_f64m8_m(...) __riscv_vfmsub_vv_f64m8_tumu(__VA_ARGS__)
+#define vfmsub_vf_f64m8_m(...) __riscv_vfmsub_vf_f64m8_tumu(__VA_ARGS__)
+#define vfnmsub_vv_f16mf4_m(...) __riscv_vfnmsub_vv_f16mf4_tumu(__VA_ARGS__)
+#define vfnmsub_vf_f16mf4_m(...) __riscv_vfnmsub_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfnmsub_vv_f16mf2_m(...) __riscv_vfnmsub_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfnmsub_vf_f16mf2_m(...) __riscv_vfnmsub_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfnmsub_vv_f16m1_m(...) __riscv_vfnmsub_vv_f16m1_tumu(__VA_ARGS__)
+#define vfnmsub_vf_f16m1_m(...) __riscv_vfnmsub_vf_f16m1_tumu(__VA_ARGS__)
+#define vfnmsub_vv_f16m2_m(...) __riscv_vfnmsub_vv_f16m2_tumu(__VA_ARGS__)
+#define vfnmsub_vf_f16m2_m(...) __riscv_vfnmsub_vf_f16m2_tumu(__VA_ARGS__)
+#define vfnmsub_vv_f16m4_m(...) __riscv_vfnmsub_vv_f16m4_tumu(__VA_ARGS__)
+#define vfnmsub_vf_f16m4_m(...) __riscv_vfnmsub_vf_f16m4_tumu(__VA_ARGS__)
+#define vfnmsub_vv_f16m8_m(...) __riscv_vfnmsub_vv_f16m8_tumu(__VA_ARGS__)
+#define vfnmsub_vf_f16m8_m(...) __riscv_vfnmsub_vf_f16m8_tumu(__VA_ARGS__)
+#define vfnmsub_vv_f32mf2_m(...) __riscv_vfnmsub_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfnmsub_vf_f32mf2_m(...) __riscv_vfnmsub_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfnmsub_vv_f32m1_m(...) __riscv_vfnmsub_vv_f32m1_tumu(__VA_ARGS__)
+#define vfnmsub_vf_f32m1_m(...) __riscv_vfnmsub_vf_f32m1_tumu(__VA_ARGS__)
+#define vfnmsub_vv_f32m2_m(...) __riscv_vfnmsub_vv_f32m2_tumu(__VA_ARGS__)
+#define vfnmsub_vf_f32m2_m(...) __riscv_vfnmsub_vf_f32m2_tumu(__VA_ARGS__)
+#define vfnmsub_vv_f32m4_m(...) __riscv_vfnmsub_vv_f32m4_tumu(__VA_ARGS__)
+#define vfnmsub_vf_f32m4_m(...) __riscv_vfnmsub_vf_f32m4_tumu(__VA_ARGS__)
+#define vfnmsub_vv_f32m8_m(...) __riscv_vfnmsub_vv_f32m8_tumu(__VA_ARGS__)
+#define vfnmsub_vf_f32m8_m(...) __riscv_vfnmsub_vf_f32m8_tumu(__VA_ARGS__)
+#define vfnmsub_vv_f64m1_m(...) __riscv_vfnmsub_vv_f64m1_tumu(__VA_ARGS__)
+#define vfnmsub_vf_f64m1_m(...) __riscv_vfnmsub_vf_f64m1_tumu(__VA_ARGS__)
+#define vfnmsub_vv_f64m2_m(...) __riscv_vfnmsub_vv_f64m2_tumu(__VA_ARGS__)
+#define vfnmsub_vf_f64m2_m(...) __riscv_vfnmsub_vf_f64m2_tumu(__VA_ARGS__)
+#define vfnmsub_vv_f64m4_m(...) __riscv_vfnmsub_vv_f64m4_tumu(__VA_ARGS__)
+#define vfnmsub_vf_f64m4_m(...) __riscv_vfnmsub_vf_f64m4_tumu(__VA_ARGS__)
+#define vfnmsub_vv_f64m8_m(...) __riscv_vfnmsub_vv_f64m8_tumu(__VA_ARGS__)
+#define vfnmsub_vf_f64m8_m(...) __riscv_vfnmsub_vf_f64m8_tumu(__VA_ARGS__)
+#define vfwmacc_vv_f32mf2(...) __riscv_vfwmacc_vv_f32mf2_tu(__VA_ARGS__)
+#define vfwmacc_vf_f32mf2(...) __riscv_vfwmacc_vf_f32mf2_tu(__VA_ARGS__)
+#define vfwmacc_vv_f32m1(...) __riscv_vfwmacc_vv_f32m1_tu(__VA_ARGS__)
+#define vfwmacc_vf_f32m1(...) __riscv_vfwmacc_vf_f32m1_tu(__VA_ARGS__)
+#define vfwmacc_vv_f32m2(...) __riscv_vfwmacc_vv_f32m2_tu(__VA_ARGS__)
+#define vfwmacc_vf_f32m2(...) __riscv_vfwmacc_vf_f32m2_tu(__VA_ARGS__)
+#define vfwmacc_vv_f32m4(...) __riscv_vfwmacc_vv_f32m4_tu(__VA_ARGS__)
+#define vfwmacc_vf_f32m4(...) __riscv_vfwmacc_vf_f32m4_tu(__VA_ARGS__)
+#define vfwmacc_vv_f32m8(...) __riscv_vfwmacc_vv_f32m8_tu(__VA_ARGS__)
+#define vfwmacc_vf_f32m8(...) __riscv_vfwmacc_vf_f32m8_tu(__VA_ARGS__)
+#define vfwmacc_vv_f64m1(...) __riscv_vfwmacc_vv_f64m1_tu(__VA_ARGS__)
+#define vfwmacc_vf_f64m1(...) __riscv_vfwmacc_vf_f64m1_tu(__VA_ARGS__)
+#define vfwmacc_vv_f64m2(...) __riscv_vfwmacc_vv_f64m2_tu(__VA_ARGS__)
+#define vfwmacc_vf_f64m2(...) __riscv_vfwmacc_vf_f64m2_tu(__VA_ARGS__)
+#define vfwmacc_vv_f64m4(...) __riscv_vfwmacc_vv_f64m4_tu(__VA_ARGS__)
+#define vfwmacc_vf_f64m4(...) __riscv_vfwmacc_vf_f64m4_tu(__VA_ARGS__)
+#define vfwmacc_vv_f64m8(...) __riscv_vfwmacc_vv_f64m8_tu(__VA_ARGS__)
+#define vfwmacc_vf_f64m8(...) __riscv_vfwmacc_vf_f64m8_tu(__VA_ARGS__)
+#define vfwnmacc_vv_f32mf2(...) __riscv_vfwnmacc_vv_f32mf2_tu(__VA_ARGS__)
+#define vfwnmacc_vf_f32mf2(...) __riscv_vfwnmacc_vf_f32mf2_tu(__VA_ARGS__)
+#define vfwnmacc_vv_f32m1(...) __riscv_vfwnmacc_vv_f32m1_tu(__VA_ARGS__)
+#define vfwnmacc_vf_f32m1(...) __riscv_vfwnmacc_vf_f32m1_tu(__VA_ARGS__)
+#define vfwnmacc_vv_f32m2(...) __riscv_vfwnmacc_vv_f32m2_tu(__VA_ARGS__)
+#define vfwnmacc_vf_f32m2(...) __riscv_vfwnmacc_vf_f32m2_tu(__VA_ARGS__)
+#define vfwnmacc_vv_f32m4(...) __riscv_vfwnmacc_vv_f32m4_tu(__VA_ARGS__)
+#define vfwnmacc_vf_f32m4(...) __riscv_vfwnmacc_vf_f32m4_tu(__VA_ARGS__)
+#define vfwnmacc_vv_f32m8(...) __riscv_vfwnmacc_vv_f32m8_tu(__VA_ARGS__)
+#define vfwnmacc_vf_f32m8(...) __riscv_vfwnmacc_vf_f32m8_tu(__VA_ARGS__)
+#define vfwnmacc_vv_f64m1(...) __riscv_vfwnmacc_vv_f64m1_tu(__VA_ARGS__)
+#define vfwnmacc_vf_f64m1(...) __riscv_vfwnmacc_vf_f64m1_tu(__VA_ARGS__)
+#define vfwnmacc_vv_f64m2(...) __riscv_vfwnmacc_vv_f64m2_tu(__VA_ARGS__)
+#define vfwnmacc_vf_f64m2(...) __riscv_vfwnmacc_vf_f64m2_tu(__VA_ARGS__)
+#define vfwnmacc_vv_f64m4(...) __riscv_vfwnmacc_vv_f64m4_tu(__VA_ARGS__)
+#define vfwnmacc_vf_f64m4(...) __riscv_vfwnmacc_vf_f64m4_tu(__VA_ARGS__)
+#define vfwnmacc_vv_f64m8(...) __riscv_vfwnmacc_vv_f64m8_tu(__VA_ARGS__)
+#define vfwnmacc_vf_f64m8(...) __riscv_vfwnmacc_vf_f64m8_tu(__VA_ARGS__)
+#define vfwmsac_vv_f32mf2(...) __riscv_vfwmsac_vv_f32mf2_tu(__VA_ARGS__)
+#define vfwmsac_vf_f32mf2(...) __riscv_vfwmsac_vf_f32mf2_tu(__VA_ARGS__)
+#define vfwmsac_vv_f32m1(...) __riscv_vfwmsac_vv_f32m1_tu(__VA_ARGS__)
+#define vfwmsac_vf_f32m1(...) __riscv_vfwmsac_vf_f32m1_tu(__VA_ARGS__)
+#define vfwmsac_vv_f32m2(...) __riscv_vfwmsac_vv_f32m2_tu(__VA_ARGS__)
+#define vfwmsac_vf_f32m2(...) __riscv_vfwmsac_vf_f32m2_tu(__VA_ARGS__)
+#define vfwmsac_vv_f32m4(...) __riscv_vfwmsac_vv_f32m4_tu(__VA_ARGS__)
+#define vfwmsac_vf_f32m4(...) __riscv_vfwmsac_vf_f32m4_tu(__VA_ARGS__)
+#define vfwmsac_vv_f32m8(...) __riscv_vfwmsac_vv_f32m8_tu(__VA_ARGS__)
+#define vfwmsac_vf_f32m8(...) __riscv_vfwmsac_vf_f32m8_tu(__VA_ARGS__)
+#define vfwmsac_vv_f64m1(...) __riscv_vfwmsac_vv_f64m1_tu(__VA_ARGS__)
+#define vfwmsac_vf_f64m1(...) __riscv_vfwmsac_vf_f64m1_tu(__VA_ARGS__)
+#define vfwmsac_vv_f64m2(...) __riscv_vfwmsac_vv_f64m2_tu(__VA_ARGS__)
+#define vfwmsac_vf_f64m2(...) __riscv_vfwmsac_vf_f64m2_tu(__VA_ARGS__)
+#define vfwmsac_vv_f64m4(...) __riscv_vfwmsac_vv_f64m4_tu(__VA_ARGS__)
+#define vfwmsac_vf_f64m4(...) __riscv_vfwmsac_vf_f64m4_tu(__VA_ARGS__)
+#define vfwmsac_vv_f64m8(...) __riscv_vfwmsac_vv_f64m8_tu(__VA_ARGS__)
+#define vfwmsac_vf_f64m8(...) __riscv_vfwmsac_vf_f64m8_tu(__VA_ARGS__)
+#define vfwnmsac_vv_f32mf2(...) __riscv_vfwnmsac_vv_f32mf2_tu(__VA_ARGS__)
+#define vfwnmsac_vf_f32mf2(...) __riscv_vfwnmsac_vf_f32mf2_tu(__VA_ARGS__)
+#define vfwnmsac_vv_f32m1(...) __riscv_vfwnmsac_vv_f32m1_tu(__VA_ARGS__)
+#define vfwnmsac_vf_f32m1(...) __riscv_vfwnmsac_vf_f32m1_tu(__VA_ARGS__)
+#define vfwnmsac_vv_f32m2(...) __riscv_vfwnmsac_vv_f32m2_tu(__VA_ARGS__)
+#define vfwnmsac_vf_f32m2(...) __riscv_vfwnmsac_vf_f32m2_tu(__VA_ARGS__)
+#define vfwnmsac_vv_f32m4(...) __riscv_vfwnmsac_vv_f32m4_tu(__VA_ARGS__)
+#define vfwnmsac_vf_f32m4(...) __riscv_vfwnmsac_vf_f32m4_tu(__VA_ARGS__)
+#define vfwnmsac_vv_f32m8(...) __riscv_vfwnmsac_vv_f32m8_tu(__VA_ARGS__)
+#define vfwnmsac_vf_f32m8(...) __riscv_vfwnmsac_vf_f32m8_tu(__VA_ARGS__)
+#define vfwnmsac_vv_f64m1(...) __riscv_vfwnmsac_vv_f64m1_tu(__VA_ARGS__)
+#define vfwnmsac_vf_f64m1(...) __riscv_vfwnmsac_vf_f64m1_tu(__VA_ARGS__)
+#define vfwnmsac_vv_f64m2(...) __riscv_vfwnmsac_vv_f64m2_tu(__VA_ARGS__)
+#define vfwnmsac_vf_f64m2(...) __riscv_vfwnmsac_vf_f64m2_tu(__VA_ARGS__)
+#define vfwnmsac_vv_f64m4(...) __riscv_vfwnmsac_vv_f64m4_tu(__VA_ARGS__)
+#define vfwnmsac_vf_f64m4(...) __riscv_vfwnmsac_vf_f64m4_tu(__VA_ARGS__)
+#define vfwnmsac_vv_f64m8(...) __riscv_vfwnmsac_vv_f64m8_tu(__VA_ARGS__)
+#define vfwnmsac_vf_f64m8(...) __riscv_vfwnmsac_vf_f64m8_tu(__VA_ARGS__)
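+// masked variants: vfwmacc/vfwnmacc/vfwmsac/vfwnmsac *_m forward to the matching __riscv_*_tumu (tail-undisturbed, mask-undisturbed) intrinsics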
+#define vfwmacc_vv_f32mf2_m(...) __riscv_vfwmacc_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfwmacc_vf_f32mf2_m(...) __riscv_vfwmacc_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfwmacc_vv_f32m1_m(...) __riscv_vfwmacc_vv_f32m1_tumu(__VA_ARGS__)
+#define vfwmacc_vf_f32m1_m(...) __riscv_vfwmacc_vf_f32m1_tumu(__VA_ARGS__)
+#define vfwmacc_vv_f32m2_m(...) __riscv_vfwmacc_vv_f32m2_tumu(__VA_ARGS__)
+#define vfwmacc_vf_f32m2_m(...) __riscv_vfwmacc_vf_f32m2_tumu(__VA_ARGS__)
+#define vfwmacc_vv_f32m4_m(...) __riscv_vfwmacc_vv_f32m4_tumu(__VA_ARGS__)
+#define vfwmacc_vf_f32m4_m(...) __riscv_vfwmacc_vf_f32m4_tumu(__VA_ARGS__)
+#define vfwmacc_vv_f32m8_m(...) __riscv_vfwmacc_vv_f32m8_tumu(__VA_ARGS__)
+#define vfwmacc_vf_f32m8_m(...) __riscv_vfwmacc_vf_f32m8_tumu(__VA_ARGS__)
+#define vfwmacc_vv_f64m1_m(...) __riscv_vfwmacc_vv_f64m1_tumu(__VA_ARGS__)
+#define vfwmacc_vf_f64m1_m(...) __riscv_vfwmacc_vf_f64m1_tumu(__VA_ARGS__)
+#define vfwmacc_vv_f64m2_m(...) __riscv_vfwmacc_vv_f64m2_tumu(__VA_ARGS__)
+#define vfwmacc_vf_f64m2_m(...) __riscv_vfwmacc_vf_f64m2_tumu(__VA_ARGS__)
+#define vfwmacc_vv_f64m4_m(...) __riscv_vfwmacc_vv_f64m4_tumu(__VA_ARGS__)
+#define vfwmacc_vf_f64m4_m(...) __riscv_vfwmacc_vf_f64m4_tumu(__VA_ARGS__)
+#define vfwmacc_vv_f64m8_m(...) __riscv_vfwmacc_vv_f64m8_tumu(__VA_ARGS__)
+#define vfwmacc_vf_f64m8_m(...) __riscv_vfwmacc_vf_f64m8_tumu(__VA_ARGS__)
+#define vfwnmacc_vv_f32mf2_m(...) __riscv_vfwnmacc_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfwnmacc_vf_f32mf2_m(...) __riscv_vfwnmacc_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfwnmacc_vv_f32m1_m(...) __riscv_vfwnmacc_vv_f32m1_tumu(__VA_ARGS__)
+#define vfwnmacc_vf_f32m1_m(...) __riscv_vfwnmacc_vf_f32m1_tumu(__VA_ARGS__)
+#define vfwnmacc_vv_f32m2_m(...) __riscv_vfwnmacc_vv_f32m2_tumu(__VA_ARGS__)
+#define vfwnmacc_vf_f32m2_m(...) __riscv_vfwnmacc_vf_f32m2_tumu(__VA_ARGS__)
+#define vfwnmacc_vv_f32m4_m(...) __riscv_vfwnmacc_vv_f32m4_tumu(__VA_ARGS__)
+#define vfwnmacc_vf_f32m4_m(...) __riscv_vfwnmacc_vf_f32m4_tumu(__VA_ARGS__)
+#define vfwnmacc_vv_f32m8_m(...) __riscv_vfwnmacc_vv_f32m8_tumu(__VA_ARGS__)
+#define vfwnmacc_vf_f32m8_m(...) __riscv_vfwnmacc_vf_f32m8_tumu(__VA_ARGS__)
+#define vfwnmacc_vv_f64m1_m(...) __riscv_vfwnmacc_vv_f64m1_tumu(__VA_ARGS__)
+#define vfwnmacc_vf_f64m1_m(...) __riscv_vfwnmacc_vf_f64m1_tumu(__VA_ARGS__)
+#define vfwnmacc_vv_f64m2_m(...) __riscv_vfwnmacc_vv_f64m2_tumu(__VA_ARGS__)
+#define vfwnmacc_vf_f64m2_m(...) __riscv_vfwnmacc_vf_f64m2_tumu(__VA_ARGS__)
+#define vfwnmacc_vv_f64m4_m(...) __riscv_vfwnmacc_vv_f64m4_tumu(__VA_ARGS__)
+#define vfwnmacc_vf_f64m4_m(...) __riscv_vfwnmacc_vf_f64m4_tumu(__VA_ARGS__)
+#define vfwnmacc_vv_f64m8_m(...) __riscv_vfwnmacc_vv_f64m8_tumu(__VA_ARGS__)
+#define vfwnmacc_vf_f64m8_m(...) __riscv_vfwnmacc_vf_f64m8_tumu(__VA_ARGS__)
+#define vfwmsac_vv_f32mf2_m(...) __riscv_vfwmsac_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfwmsac_vf_f32mf2_m(...) __riscv_vfwmsac_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfwmsac_vv_f32m1_m(...) __riscv_vfwmsac_vv_f32m1_tumu(__VA_ARGS__)
+#define vfwmsac_vf_f32m1_m(...) __riscv_vfwmsac_vf_f32m1_tumu(__VA_ARGS__)
+#define vfwmsac_vv_f32m2_m(...) __riscv_vfwmsac_vv_f32m2_tumu(__VA_ARGS__)
+#define vfwmsac_vf_f32m2_m(...) __riscv_vfwmsac_vf_f32m2_tumu(__VA_ARGS__)
+#define vfwmsac_vv_f32m4_m(...) __riscv_vfwmsac_vv_f32m4_tumu(__VA_ARGS__)
+#define vfwmsac_vf_f32m4_m(...) __riscv_vfwmsac_vf_f32m4_tumu(__VA_ARGS__)
+#define vfwmsac_vv_f32m8_m(...) __riscv_vfwmsac_vv_f32m8_tumu(__VA_ARGS__)
+#define vfwmsac_vf_f32m8_m(...) __riscv_vfwmsac_vf_f32m8_tumu(__VA_ARGS__)
+#define vfwmsac_vv_f64m1_m(...) __riscv_vfwmsac_vv_f64m1_tumu(__VA_ARGS__)
+#define vfwmsac_vf_f64m1_m(...) __riscv_vfwmsac_vf_f64m1_tumu(__VA_ARGS__)
+#define vfwmsac_vv_f64m2_m(...) __riscv_vfwmsac_vv_f64m2_tumu(__VA_ARGS__)
+#define vfwmsac_vf_f64m2_m(...) __riscv_vfwmsac_vf_f64m2_tumu(__VA_ARGS__)
+#define vfwmsac_vv_f64m4_m(...) __riscv_vfwmsac_vv_f64m4_tumu(__VA_ARGS__)
+#define vfwmsac_vf_f64m4_m(...) __riscv_vfwmsac_vf_f64m4_tumu(__VA_ARGS__)
+#define vfwmsac_vv_f64m8_m(...) __riscv_vfwmsac_vv_f64m8_tumu(__VA_ARGS__)
+#define vfwmsac_vf_f64m8_m(...) __riscv_vfwmsac_vf_f64m8_tumu(__VA_ARGS__)
+#define vfwnmsac_vv_f32mf2_m(...) __riscv_vfwnmsac_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfwnmsac_vf_f32mf2_m(...) __riscv_vfwnmsac_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfwnmsac_vv_f32m1_m(...) __riscv_vfwnmsac_vv_f32m1_tumu(__VA_ARGS__)
+#define vfwnmsac_vf_f32m1_m(...) __riscv_vfwnmsac_vf_f32m1_tumu(__VA_ARGS__)
+#define vfwnmsac_vv_f32m2_m(...) __riscv_vfwnmsac_vv_f32m2_tumu(__VA_ARGS__)
+#define vfwnmsac_vf_f32m2_m(...) __riscv_vfwnmsac_vf_f32m2_tumu(__VA_ARGS__)
+#define vfwnmsac_vv_f32m4_m(...) __riscv_vfwnmsac_vv_f32m4_tumu(__VA_ARGS__)
+#define vfwnmsac_vf_f32m4_m(...) __riscv_vfwnmsac_vf_f32m4_tumu(__VA_ARGS__)
+#define vfwnmsac_vv_f32m8_m(...) __riscv_vfwnmsac_vv_f32m8_tumu(__VA_ARGS__)
+#define vfwnmsac_vf_f32m8_m(...) __riscv_vfwnmsac_vf_f32m8_tumu(__VA_ARGS__)
+#define vfwnmsac_vv_f64m1_m(...) __riscv_vfwnmsac_vv_f64m1_tumu(__VA_ARGS__)
+#define vfwnmsac_vf_f64m1_m(...) __riscv_vfwnmsac_vf_f64m1_tumu(__VA_ARGS__)
+#define vfwnmsac_vv_f64m2_m(...) __riscv_vfwnmsac_vv_f64m2_tumu(__VA_ARGS__)
+#define vfwnmsac_vf_f64m2_m(...) __riscv_vfwnmsac_vf_f64m2_tumu(__VA_ARGS__)
+#define vfwnmsac_vv_f64m4_m(...) __riscv_vfwnmsac_vv_f64m4_tumu(__VA_ARGS__)
+#define vfwnmsac_vf_f64m4_m(...) __riscv_vfwnmsac_vf_f64m4_tumu(__VA_ARGS__)
+#define vfwnmsac_vv_f64m8_m(...) __riscv_vfwnmsac_vv_f64m8_tumu(__VA_ARGS__)
+#define vfwnmsac_vf_f64m8_m(...) __riscv_vfwnmsac_vf_f64m8_tumu(__VA_ARGS__)
+#define vfsqrt_v_f16mf4(...) __riscv_vfsqrt_v_f16mf4(__VA_ARGS__)
+#define vfsqrt_v_f16mf2(...) __riscv_vfsqrt_v_f16mf2(__VA_ARGS__)
+#define vfsqrt_v_f16m1(...) __riscv_vfsqrt_v_f16m1(__VA_ARGS__)
+#define vfsqrt_v_f16m2(...) __riscv_vfsqrt_v_f16m2(__VA_ARGS__)
+#define vfsqrt_v_f16m4(...) __riscv_vfsqrt_v_f16m4(__VA_ARGS__)
+#define vfsqrt_v_f16m8(...) __riscv_vfsqrt_v_f16m8(__VA_ARGS__)
+#define vfsqrt_v_f32mf2(...) __riscv_vfsqrt_v_f32mf2(__VA_ARGS__)
+#define vfsqrt_v_f32m1(...) __riscv_vfsqrt_v_f32m1(__VA_ARGS__)
+#define vfsqrt_v_f32m2(...) __riscv_vfsqrt_v_f32m2(__VA_ARGS__)
+#define vfsqrt_v_f32m4(...) __riscv_vfsqrt_v_f32m4(__VA_ARGS__)
+#define vfsqrt_v_f32m8(...) __riscv_vfsqrt_v_f32m8(__VA_ARGS__)
+#define vfsqrt_v_f64m1(...) __riscv_vfsqrt_v_f64m1(__VA_ARGS__)
+#define vfsqrt_v_f64m2(...) __riscv_vfsqrt_v_f64m2(__VA_ARGS__)
+#define vfsqrt_v_f64m4(...) __riscv_vfsqrt_v_f64m4(__VA_ARGS__)
+#define vfsqrt_v_f64m8(...) __riscv_vfsqrt_v_f64m8(__VA_ARGS__)
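+// masked variants: vfsqrt_v_*_m forwards to the matching __riscv_vfsqrt_v_*_tumu (tail-undisturbed, mask-undisturbed) intrinsic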
+#define vfsqrt_v_f16mf4_m(...) __riscv_vfsqrt_v_f16mf4_tumu(__VA_ARGS__)
+#define vfsqrt_v_f16mf2_m(...) __riscv_vfsqrt_v_f16mf2_tumu(__VA_ARGS__)
+#define vfsqrt_v_f16m1_m(...) __riscv_vfsqrt_v_f16m1_tumu(__VA_ARGS__)
+#define vfsqrt_v_f16m2_m(...) __riscv_vfsqrt_v_f16m2_tumu(__VA_ARGS__)
+#define vfsqrt_v_f16m4_m(...) __riscv_vfsqrt_v_f16m4_tumu(__VA_ARGS__)
+#define vfsqrt_v_f16m8_m(...) __riscv_vfsqrt_v_f16m8_tumu(__VA_ARGS__)
+#define vfsqrt_v_f32mf2_m(...) __riscv_vfsqrt_v_f32mf2_tumu(__VA_ARGS__)
+#define vfsqrt_v_f32m1_m(...) __riscv_vfsqrt_v_f32m1_tumu(__VA_ARGS__)
+#define vfsqrt_v_f32m2_m(...) __riscv_vfsqrt_v_f32m2_tumu(__VA_ARGS__)
+#define vfsqrt_v_f32m4_m(...) __riscv_vfsqrt_v_f32m4_tumu(__VA_ARGS__)
+#define vfsqrt_v_f32m8_m(...) __riscv_vfsqrt_v_f32m8_tumu(__VA_ARGS__)
+#define vfsqrt_v_f64m1_m(...) __riscv_vfsqrt_v_f64m1_tumu(__VA_ARGS__)
+#define vfsqrt_v_f64m2_m(...) __riscv_vfsqrt_v_f64m2_tumu(__VA_ARGS__)
+#define vfsqrt_v_f64m4_m(...) __riscv_vfsqrt_v_f64m4_tumu(__VA_ARGS__)
+#define vfsqrt_v_f64m8_m(...) __riscv_vfsqrt_v_f64m8_tumu(__VA_ARGS__)
+#define vfrsqrt7_v_f16mf4(...) __riscv_vfrsqrt7_v_f16mf4(__VA_ARGS__)
+#define vfrsqrt7_v_f16mf2(...) __riscv_vfrsqrt7_v_f16mf2(__VA_ARGS__)
+#define vfrsqrt7_v_f16m1(...) __riscv_vfrsqrt7_v_f16m1(__VA_ARGS__)
+#define vfrsqrt7_v_f16m2(...) __riscv_vfrsqrt7_v_f16m2(__VA_ARGS__)
+#define vfrsqrt7_v_f16m4(...) __riscv_vfrsqrt7_v_f16m4(__VA_ARGS__)
+#define vfrsqrt7_v_f16m8(...) __riscv_vfrsqrt7_v_f16m8(__VA_ARGS__)
+#define vfrsqrt7_v_f32mf2(...) __riscv_vfrsqrt7_v_f32mf2(__VA_ARGS__)
+#define vfrsqrt7_v_f32m1(...) __riscv_vfrsqrt7_v_f32m1(__VA_ARGS__)
+#define vfrsqrt7_v_f32m2(...) __riscv_vfrsqrt7_v_f32m2(__VA_ARGS__)
+#define vfrsqrt7_v_f32m4(...) __riscv_vfrsqrt7_v_f32m4(__VA_ARGS__)
+#define vfrsqrt7_v_f32m8(...) __riscv_vfrsqrt7_v_f32m8(__VA_ARGS__)
+#define vfrsqrt7_v_f64m1(...) __riscv_vfrsqrt7_v_f64m1(__VA_ARGS__)
+#define vfrsqrt7_v_f64m2(...) __riscv_vfrsqrt7_v_f64m2(__VA_ARGS__)
+#define vfrsqrt7_v_f64m4(...) __riscv_vfrsqrt7_v_f64m4(__VA_ARGS__)
+#define vfrsqrt7_v_f64m8(...) __riscv_vfrsqrt7_v_f64m8(__VA_ARGS__)
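+// masked variants: vfrsqrt7_v_*_m forwards to the matching __riscv_vfrsqrt7_v_*_tumu (tail-undisturbed, mask-undisturbed) intrinsic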
+#define vfrsqrt7_v_f16mf4_m(...) __riscv_vfrsqrt7_v_f16mf4_tumu(__VA_ARGS__)
+#define vfrsqrt7_v_f16mf2_m(...) __riscv_vfrsqrt7_v_f16mf2_tumu(__VA_ARGS__)
+#define vfrsqrt7_v_f16m1_m(...) __riscv_vfrsqrt7_v_f16m1_tumu(__VA_ARGS__)
+#define vfrsqrt7_v_f16m2_m(...) __riscv_vfrsqrt7_v_f16m2_tumu(__VA_ARGS__)
+#define vfrsqrt7_v_f16m4_m(...) __riscv_vfrsqrt7_v_f16m4_tumu(__VA_ARGS__)
+#define vfrsqrt7_v_f16m8_m(...) __riscv_vfrsqrt7_v_f16m8_tumu(__VA_ARGS__)
+#define vfrsqrt7_v_f32mf2_m(...) __riscv_vfrsqrt7_v_f32mf2_tumu(__VA_ARGS__)
+#define vfrsqrt7_v_f32m1_m(...) __riscv_vfrsqrt7_v_f32m1_tumu(__VA_ARGS__)
+#define vfrsqrt7_v_f32m2_m(...) __riscv_vfrsqrt7_v_f32m2_tumu(__VA_ARGS__)
+#define vfrsqrt7_v_f32m4_m(...) __riscv_vfrsqrt7_v_f32m4_tumu(__VA_ARGS__)
+#define vfrsqrt7_v_f32m8_m(...) __riscv_vfrsqrt7_v_f32m8_tumu(__VA_ARGS__)
+#define vfrsqrt7_v_f64m1_m(...) __riscv_vfrsqrt7_v_f64m1_tumu(__VA_ARGS__)
+#define vfrsqrt7_v_f64m2_m(...) __riscv_vfrsqrt7_v_f64m2_tumu(__VA_ARGS__)
+#define vfrsqrt7_v_f64m4_m(...) __riscv_vfrsqrt7_v_f64m4_tumu(__VA_ARGS__)
+#define vfrsqrt7_v_f64m8_m(...) __riscv_vfrsqrt7_v_f64m8_tumu(__VA_ARGS__)
+#define vfrec7_v_f16mf4(...) __riscv_vfrec7_v_f16mf4(__VA_ARGS__)
+#define vfrec7_v_f16mf2(...) __riscv_vfrec7_v_f16mf2(__VA_ARGS__)
+#define vfrec7_v_f16m1(...) __riscv_vfrec7_v_f16m1(__VA_ARGS__)
+#define vfrec7_v_f16m2(...) __riscv_vfrec7_v_f16m2(__VA_ARGS__)
+#define vfrec7_v_f16m4(...) __riscv_vfrec7_v_f16m4(__VA_ARGS__)
+#define vfrec7_v_f16m8(...) __riscv_vfrec7_v_f16m8(__VA_ARGS__)
+#define vfrec7_v_f32mf2(...) __riscv_vfrec7_v_f32mf2(__VA_ARGS__)
+#define vfrec7_v_f32m1(...) __riscv_vfrec7_v_f32m1(__VA_ARGS__)
+#define vfrec7_v_f32m2(...) __riscv_vfrec7_v_f32m2(__VA_ARGS__)
+#define vfrec7_v_f32m4(...) __riscv_vfrec7_v_f32m4(__VA_ARGS__)
+#define vfrec7_v_f32m8(...) __riscv_vfrec7_v_f32m8(__VA_ARGS__)
+#define vfrec7_v_f64m1(...) __riscv_vfrec7_v_f64m1(__VA_ARGS__)
+#define vfrec7_v_f64m2(...) __riscv_vfrec7_v_f64m2(__VA_ARGS__)
+#define vfrec7_v_f64m4(...) __riscv_vfrec7_v_f64m4(__VA_ARGS__)
+#define vfrec7_v_f64m8(...) __riscv_vfrec7_v_f64m8(__VA_ARGS__)
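+// masked variants: vfrec7_v_*_m forwards to the matching __riscv_vfrec7_v_*_tumu (tail-undisturbed, mask-undisturbed) intrinsic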
+#define vfrec7_v_f16mf4_m(...) __riscv_vfrec7_v_f16mf4_tumu(__VA_ARGS__)
+#define vfrec7_v_f16mf2_m(...) __riscv_vfrec7_v_f16mf2_tumu(__VA_ARGS__)
+#define vfrec7_v_f16m1_m(...) __riscv_vfrec7_v_f16m1_tumu(__VA_ARGS__)
+#define vfrec7_v_f16m2_m(...) __riscv_vfrec7_v_f16m2_tumu(__VA_ARGS__)
+#define vfrec7_v_f16m4_m(...) __riscv_vfrec7_v_f16m4_tumu(__VA_ARGS__)
+#define vfrec7_v_f16m8_m(...) __riscv_vfrec7_v_f16m8_tumu(__VA_ARGS__)
+#define vfrec7_v_f32mf2_m(...) __riscv_vfrec7_v_f32mf2_tumu(__VA_ARGS__)
+#define vfrec7_v_f32m1_m(...) __riscv_vfrec7_v_f32m1_tumu(__VA_ARGS__)
+#define vfrec7_v_f32m2_m(...) __riscv_vfrec7_v_f32m2_tumu(__VA_ARGS__)
+#define vfrec7_v_f32m4_m(...) __riscv_vfrec7_v_f32m4_tumu(__VA_ARGS__)
+#define vfrec7_v_f32m8_m(...) __riscv_vfrec7_v_f32m8_tumu(__VA_ARGS__)
+#define vfrec7_v_f64m1_m(...) __riscv_vfrec7_v_f64m1_tumu(__VA_ARGS__)
+#define vfrec7_v_f64m2_m(...) __riscv_vfrec7_v_f64m2_tumu(__VA_ARGS__)
+#define vfrec7_v_f64m4_m(...) __riscv_vfrec7_v_f64m4_tumu(__VA_ARGS__)
+#define vfrec7_v_f64m8_m(...) __riscv_vfrec7_v_f64m8_tumu(__VA_ARGS__)
+#define vfmin_vv_f16mf4(...) __riscv_vfmin_vv_f16mf4(__VA_ARGS__)
+#define vfmin_vf_f16mf4(...) __riscv_vfmin_vf_f16mf4(__VA_ARGS__)
+#define vfmin_vv_f16mf2(...) __riscv_vfmin_vv_f16mf2(__VA_ARGS__)
+#define vfmin_vf_f16mf2(...) __riscv_vfmin_vf_f16mf2(__VA_ARGS__)
+#define vfmin_vv_f16m1(...) __riscv_vfmin_vv_f16m1(__VA_ARGS__)
+#define vfmin_vf_f16m1(...) __riscv_vfmin_vf_f16m1(__VA_ARGS__)
+#define vfmin_vv_f16m2(...) __riscv_vfmin_vv_f16m2(__VA_ARGS__)
+#define vfmin_vf_f16m2(...) __riscv_vfmin_vf_f16m2(__VA_ARGS__)
+#define vfmin_vv_f16m4(...) __riscv_vfmin_vv_f16m4(__VA_ARGS__)
+#define vfmin_vf_f16m4(...) __riscv_vfmin_vf_f16m4(__VA_ARGS__)
+#define vfmin_vv_f16m8(...) __riscv_vfmin_vv_f16m8(__VA_ARGS__)
+#define vfmin_vf_f16m8(...) __riscv_vfmin_vf_f16m8(__VA_ARGS__)
+#define vfmin_vv_f32mf2(...) __riscv_vfmin_vv_f32mf2(__VA_ARGS__)
+#define vfmin_vf_f32mf2(...) __riscv_vfmin_vf_f32mf2(__VA_ARGS__)
+#define vfmin_vv_f32m1(...) __riscv_vfmin_vv_f32m1(__VA_ARGS__)
+#define vfmin_vf_f32m1(...) __riscv_vfmin_vf_f32m1(__VA_ARGS__)
+#define vfmin_vv_f32m2(...) __riscv_vfmin_vv_f32m2(__VA_ARGS__)
+#define vfmin_vf_f32m2(...) __riscv_vfmin_vf_f32m2(__VA_ARGS__)
+#define vfmin_vv_f32m4(...) __riscv_vfmin_vv_f32m4(__VA_ARGS__)
+#define vfmin_vf_f32m4(...) __riscv_vfmin_vf_f32m4(__VA_ARGS__)
+#define vfmin_vv_f32m8(...) __riscv_vfmin_vv_f32m8(__VA_ARGS__)
+#define vfmin_vf_f32m8(...) __riscv_vfmin_vf_f32m8(__VA_ARGS__)
+#define vfmin_vv_f64m1(...) __riscv_vfmin_vv_f64m1(__VA_ARGS__)
+#define vfmin_vf_f64m1(...) __riscv_vfmin_vf_f64m1(__VA_ARGS__)
+#define vfmin_vv_f64m2(...) __riscv_vfmin_vv_f64m2(__VA_ARGS__)
+#define vfmin_vf_f64m2(...) __riscv_vfmin_vf_f64m2(__VA_ARGS__)
+#define vfmin_vv_f64m4(...) __riscv_vfmin_vv_f64m4(__VA_ARGS__)
+#define vfmin_vf_f64m4(...) __riscv_vfmin_vf_f64m4(__VA_ARGS__)
+#define vfmin_vv_f64m8(...) __riscv_vfmin_vv_f64m8(__VA_ARGS__)
+#define vfmin_vf_f64m8(...) __riscv_vfmin_vf_f64m8(__VA_ARGS__)
+#define vfmax_vv_f16mf4(...) __riscv_vfmax_vv_f16mf4(__VA_ARGS__)
+#define vfmax_vf_f16mf4(...) __riscv_vfmax_vf_f16mf4(__VA_ARGS__)
+#define vfmax_vv_f16mf2(...) __riscv_vfmax_vv_f16mf2(__VA_ARGS__)
+#define vfmax_vf_f16mf2(...) __riscv_vfmax_vf_f16mf2(__VA_ARGS__)
+#define vfmax_vv_f16m1(...) __riscv_vfmax_vv_f16m1(__VA_ARGS__)
+#define vfmax_vf_f16m1(...) __riscv_vfmax_vf_f16m1(__VA_ARGS__)
+#define vfmax_vv_f16m2(...) __riscv_vfmax_vv_f16m2(__VA_ARGS__)
+#define vfmax_vf_f16m2(...) __riscv_vfmax_vf_f16m2(__VA_ARGS__)
+#define vfmax_vv_f16m4(...) __riscv_vfmax_vv_f16m4(__VA_ARGS__)
+#define vfmax_vf_f16m4(...) __riscv_vfmax_vf_f16m4(__VA_ARGS__)
+#define vfmax_vv_f16m8(...) __riscv_vfmax_vv_f16m8(__VA_ARGS__)
+#define vfmax_vf_f16m8(...) __riscv_vfmax_vf_f16m8(__VA_ARGS__)
+#define vfmax_vv_f32mf2(...) __riscv_vfmax_vv_f32mf2(__VA_ARGS__)
+#define vfmax_vf_f32mf2(...) __riscv_vfmax_vf_f32mf2(__VA_ARGS__)
+#define vfmax_vv_f32m1(...) __riscv_vfmax_vv_f32m1(__VA_ARGS__)
+#define vfmax_vf_f32m1(...) __riscv_vfmax_vf_f32m1(__VA_ARGS__)
+#define vfmax_vv_f32m2(...) __riscv_vfmax_vv_f32m2(__VA_ARGS__)
+#define vfmax_vf_f32m2(...) __riscv_vfmax_vf_f32m2(__VA_ARGS__)
+#define vfmax_vv_f32m4(...) __riscv_vfmax_vv_f32m4(__VA_ARGS__)
+#define vfmax_vf_f32m4(...) __riscv_vfmax_vf_f32m4(__VA_ARGS__)
+#define vfmax_vv_f32m8(...) __riscv_vfmax_vv_f32m8(__VA_ARGS__)
+#define vfmax_vf_f32m8(...) __riscv_vfmax_vf_f32m8(__VA_ARGS__)
+#define vfmax_vv_f64m1(...) __riscv_vfmax_vv_f64m1(__VA_ARGS__)
+#define vfmax_vf_f64m1(...) __riscv_vfmax_vf_f64m1(__VA_ARGS__)
+#define vfmax_vv_f64m2(...) __riscv_vfmax_vv_f64m2(__VA_ARGS__)
+#define vfmax_vf_f64m2(...) __riscv_vfmax_vf_f64m2(__VA_ARGS__)
+#define vfmax_vv_f64m4(...) __riscv_vfmax_vv_f64m4(__VA_ARGS__)
+#define vfmax_vf_f64m4(...) __riscv_vfmax_vf_f64m4(__VA_ARGS__)
+#define vfmax_vv_f64m8(...) __riscv_vfmax_vv_f64m8(__VA_ARGS__)
+#define vfmax_vf_f64m8(...) __riscv_vfmax_vf_f64m8(__VA_ARGS__)
+// masked functions: the legacy vfmin/vfmax "_m" names forward all arguments unchanged to the matching __riscv_..._tumu intrinsics (tail-undisturbed, mask-undisturbed policy suffix)
+#define vfmin_vv_f16mf4_m(...) __riscv_vfmin_vv_f16mf4_tumu(__VA_ARGS__)
+#define vfmin_vf_f16mf4_m(...) __riscv_vfmin_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfmin_vv_f16mf2_m(...) __riscv_vfmin_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfmin_vf_f16mf2_m(...) __riscv_vfmin_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfmin_vv_f16m1_m(...) __riscv_vfmin_vv_f16m1_tumu(__VA_ARGS__)
+#define vfmin_vf_f16m1_m(...) __riscv_vfmin_vf_f16m1_tumu(__VA_ARGS__)
+#define vfmin_vv_f16m2_m(...) __riscv_vfmin_vv_f16m2_tumu(__VA_ARGS__)
+#define vfmin_vf_f16m2_m(...) __riscv_vfmin_vf_f16m2_tumu(__VA_ARGS__)
+#define vfmin_vv_f16m4_m(...) __riscv_vfmin_vv_f16m4_tumu(__VA_ARGS__)
+#define vfmin_vf_f16m4_m(...) __riscv_vfmin_vf_f16m4_tumu(__VA_ARGS__)
+#define vfmin_vv_f16m8_m(...) __riscv_vfmin_vv_f16m8_tumu(__VA_ARGS__)
+#define vfmin_vf_f16m8_m(...) __riscv_vfmin_vf_f16m8_tumu(__VA_ARGS__)
+#define vfmin_vv_f32mf2_m(...) __riscv_vfmin_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfmin_vf_f32mf2_m(...) __riscv_vfmin_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfmin_vv_f32m1_m(...) __riscv_vfmin_vv_f32m1_tumu(__VA_ARGS__)
+#define vfmin_vf_f32m1_m(...) __riscv_vfmin_vf_f32m1_tumu(__VA_ARGS__)
+#define vfmin_vv_f32m2_m(...) __riscv_vfmin_vv_f32m2_tumu(__VA_ARGS__)
+#define vfmin_vf_f32m2_m(...) __riscv_vfmin_vf_f32m2_tumu(__VA_ARGS__)
+#define vfmin_vv_f32m4_m(...) __riscv_vfmin_vv_f32m4_tumu(__VA_ARGS__)
+#define vfmin_vf_f32m4_m(...) __riscv_vfmin_vf_f32m4_tumu(__VA_ARGS__)
+#define vfmin_vv_f32m8_m(...) __riscv_vfmin_vv_f32m8_tumu(__VA_ARGS__)
+#define vfmin_vf_f32m8_m(...) __riscv_vfmin_vf_f32m8_tumu(__VA_ARGS__)
+#define vfmin_vv_f64m1_m(...) __riscv_vfmin_vv_f64m1_tumu(__VA_ARGS__)
+#define vfmin_vf_f64m1_m(...) __riscv_vfmin_vf_f64m1_tumu(__VA_ARGS__)
+#define vfmin_vv_f64m2_m(...) __riscv_vfmin_vv_f64m2_tumu(__VA_ARGS__)
+#define vfmin_vf_f64m2_m(...) __riscv_vfmin_vf_f64m2_tumu(__VA_ARGS__)
+#define vfmin_vv_f64m4_m(...) __riscv_vfmin_vv_f64m4_tumu(__VA_ARGS__)
+#define vfmin_vf_f64m4_m(...) __riscv_vfmin_vf_f64m4_tumu(__VA_ARGS__)
+#define vfmin_vv_f64m8_m(...) __riscv_vfmin_vv_f64m8_tumu(__VA_ARGS__)
+#define vfmin_vf_f64m8_m(...) __riscv_vfmin_vf_f64m8_tumu(__VA_ARGS__)
+#define vfmax_vv_f16mf4_m(...) __riscv_vfmax_vv_f16mf4_tumu(__VA_ARGS__)
+#define vfmax_vf_f16mf4_m(...) __riscv_vfmax_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfmax_vv_f16mf2_m(...) __riscv_vfmax_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfmax_vf_f16mf2_m(...) __riscv_vfmax_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfmax_vv_f16m1_m(...) __riscv_vfmax_vv_f16m1_tumu(__VA_ARGS__)
+#define vfmax_vf_f16m1_m(...) __riscv_vfmax_vf_f16m1_tumu(__VA_ARGS__)
+#define vfmax_vv_f16m2_m(...) __riscv_vfmax_vv_f16m2_tumu(__VA_ARGS__)
+#define vfmax_vf_f16m2_m(...) __riscv_vfmax_vf_f16m2_tumu(__VA_ARGS__)
+#define vfmax_vv_f16m4_m(...) __riscv_vfmax_vv_f16m4_tumu(__VA_ARGS__)
+#define vfmax_vf_f16m4_m(...) __riscv_vfmax_vf_f16m4_tumu(__VA_ARGS__)
+#define vfmax_vv_f16m8_m(...) __riscv_vfmax_vv_f16m8_tumu(__VA_ARGS__)
+#define vfmax_vf_f16m8_m(...) __riscv_vfmax_vf_f16m8_tumu(__VA_ARGS__)
+#define vfmax_vv_f32mf2_m(...) __riscv_vfmax_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfmax_vf_f32mf2_m(...) __riscv_vfmax_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfmax_vv_f32m1_m(...) __riscv_vfmax_vv_f32m1_tumu(__VA_ARGS__)
+#define vfmax_vf_f32m1_m(...) __riscv_vfmax_vf_f32m1_tumu(__VA_ARGS__)
+#define vfmax_vv_f32m2_m(...) __riscv_vfmax_vv_f32m2_tumu(__VA_ARGS__)
+#define vfmax_vf_f32m2_m(...) __riscv_vfmax_vf_f32m2_tumu(__VA_ARGS__)
+#define vfmax_vv_f32m4_m(...) __riscv_vfmax_vv_f32m4_tumu(__VA_ARGS__)
+#define vfmax_vf_f32m4_m(...) __riscv_vfmax_vf_f32m4_tumu(__VA_ARGS__)
+#define vfmax_vv_f32m8_m(...) __riscv_vfmax_vv_f32m8_tumu(__VA_ARGS__)
+#define vfmax_vf_f32m8_m(...) __riscv_vfmax_vf_f32m8_tumu(__VA_ARGS__)
+#define vfmax_vv_f64m1_m(...) __riscv_vfmax_vv_f64m1_tumu(__VA_ARGS__)
+#define vfmax_vf_f64m1_m(...) __riscv_vfmax_vf_f64m1_tumu(__VA_ARGS__)
+#define vfmax_vv_f64m2_m(...) __riscv_vfmax_vv_f64m2_tumu(__VA_ARGS__)
+#define vfmax_vf_f64m2_m(...) __riscv_vfmax_vf_f64m2_tumu(__VA_ARGS__)
+#define vfmax_vv_f64m4_m(...) __riscv_vfmax_vv_f64m4_tumu(__VA_ARGS__)
+#define vfmax_vf_f64m4_m(...) __riscv_vfmax_vf_f64m4_tumu(__VA_ARGS__)
+#define vfmax_vv_f64m8_m(...) __riscv_vfmax_vv_f64m8_tumu(__VA_ARGS__)
+#define vfmax_vf_f64m8_m(...) __riscv_vfmax_vf_f64m8_tumu(__VA_ARGS__)
+#define vfsgnj_vv_f16mf4(...) __riscv_vfsgnj_vv_f16mf4(__VA_ARGS__)
+#define vfsgnj_vf_f16mf4(...) __riscv_vfsgnj_vf_f16mf4(__VA_ARGS__)
+#define vfsgnj_vv_f16mf2(...) __riscv_vfsgnj_vv_f16mf2(__VA_ARGS__)
+#define vfsgnj_vf_f16mf2(...) __riscv_vfsgnj_vf_f16mf2(__VA_ARGS__)
+#define vfsgnj_vv_f16m1(...) __riscv_vfsgnj_vv_f16m1(__VA_ARGS__)
+#define vfsgnj_vf_f16m1(...) __riscv_vfsgnj_vf_f16m1(__VA_ARGS__)
+#define vfsgnj_vv_f16m2(...) __riscv_vfsgnj_vv_f16m2(__VA_ARGS__)
+#define vfsgnj_vf_f16m2(...) __riscv_vfsgnj_vf_f16m2(__VA_ARGS__)
+#define vfsgnj_vv_f16m4(...) __riscv_vfsgnj_vv_f16m4(__VA_ARGS__)
+#define vfsgnj_vf_f16m4(...) __riscv_vfsgnj_vf_f16m4(__VA_ARGS__)
+#define vfsgnj_vv_f16m8(...) __riscv_vfsgnj_vv_f16m8(__VA_ARGS__)
+#define vfsgnj_vf_f16m8(...) __riscv_vfsgnj_vf_f16m8(__VA_ARGS__)
+#define vfsgnj_vv_f32mf2(...) __riscv_vfsgnj_vv_f32mf2(__VA_ARGS__)
+#define vfsgnj_vf_f32mf2(...) __riscv_vfsgnj_vf_f32mf2(__VA_ARGS__)
+#define vfsgnj_vv_f32m1(...) __riscv_vfsgnj_vv_f32m1(__VA_ARGS__)
+#define vfsgnj_vf_f32m1(...) __riscv_vfsgnj_vf_f32m1(__VA_ARGS__)
+#define vfsgnj_vv_f32m2(...) __riscv_vfsgnj_vv_f32m2(__VA_ARGS__)
+#define vfsgnj_vf_f32m2(...) __riscv_vfsgnj_vf_f32m2(__VA_ARGS__)
+#define vfsgnj_vv_f32m4(...) __riscv_vfsgnj_vv_f32m4(__VA_ARGS__)
+#define vfsgnj_vf_f32m4(...) __riscv_vfsgnj_vf_f32m4(__VA_ARGS__)
+#define vfsgnj_vv_f32m8(...) __riscv_vfsgnj_vv_f32m8(__VA_ARGS__)
+#define vfsgnj_vf_f32m8(...) __riscv_vfsgnj_vf_f32m8(__VA_ARGS__)
+#define vfsgnj_vv_f64m1(...) __riscv_vfsgnj_vv_f64m1(__VA_ARGS__)
+#define vfsgnj_vf_f64m1(...) __riscv_vfsgnj_vf_f64m1(__VA_ARGS__)
+#define vfsgnj_vv_f64m2(...) __riscv_vfsgnj_vv_f64m2(__VA_ARGS__)
+#define vfsgnj_vf_f64m2(...) __riscv_vfsgnj_vf_f64m2(__VA_ARGS__)
+#define vfsgnj_vv_f64m4(...) __riscv_vfsgnj_vv_f64m4(__VA_ARGS__)
+#define vfsgnj_vf_f64m4(...) __riscv_vfsgnj_vf_f64m4(__VA_ARGS__)
+#define vfsgnj_vv_f64m8(...) __riscv_vfsgnj_vv_f64m8(__VA_ARGS__)
+#define vfsgnj_vf_f64m8(...) __riscv_vfsgnj_vf_f64m8(__VA_ARGS__)
+#define vfsgnjn_vv_f16mf4(...) __riscv_vfsgnjn_vv_f16mf4(__VA_ARGS__)
+#define vfsgnjn_vf_f16mf4(...) __riscv_vfsgnjn_vf_f16mf4(__VA_ARGS__)
+#define vfsgnjn_vv_f16mf2(...) __riscv_vfsgnjn_vv_f16mf2(__VA_ARGS__)
+#define vfsgnjn_vf_f16mf2(...) __riscv_vfsgnjn_vf_f16mf2(__VA_ARGS__)
+#define vfsgnjn_vv_f16m1(...) __riscv_vfsgnjn_vv_f16m1(__VA_ARGS__)
+#define vfsgnjn_vf_f16m1(...) __riscv_vfsgnjn_vf_f16m1(__VA_ARGS__)
+#define vfsgnjn_vv_f16m2(...) __riscv_vfsgnjn_vv_f16m2(__VA_ARGS__)
+#define vfsgnjn_vf_f16m2(...) __riscv_vfsgnjn_vf_f16m2(__VA_ARGS__)
+#define vfsgnjn_vv_f16m4(...) __riscv_vfsgnjn_vv_f16m4(__VA_ARGS__)
+#define vfsgnjn_vf_f16m4(...) __riscv_vfsgnjn_vf_f16m4(__VA_ARGS__)
+#define vfsgnjn_vv_f16m8(...) __riscv_vfsgnjn_vv_f16m8(__VA_ARGS__)
+#define vfsgnjn_vf_f16m8(...) __riscv_vfsgnjn_vf_f16m8(__VA_ARGS__)
+#define vfsgnjn_vv_f32mf2(...) __riscv_vfsgnjn_vv_f32mf2(__VA_ARGS__)
+#define vfsgnjn_vf_f32mf2(...) __riscv_vfsgnjn_vf_f32mf2(__VA_ARGS__)
+#define vfsgnjn_vv_f32m1(...) __riscv_vfsgnjn_vv_f32m1(__VA_ARGS__)
+#define vfsgnjn_vf_f32m1(...) __riscv_vfsgnjn_vf_f32m1(__VA_ARGS__)
+#define vfsgnjn_vv_f32m2(...) __riscv_vfsgnjn_vv_f32m2(__VA_ARGS__)
+#define vfsgnjn_vf_f32m2(...) __riscv_vfsgnjn_vf_f32m2(__VA_ARGS__)
+#define vfsgnjn_vv_f32m4(...) __riscv_vfsgnjn_vv_f32m4(__VA_ARGS__)
+#define vfsgnjn_vf_f32m4(...) __riscv_vfsgnjn_vf_f32m4(__VA_ARGS__)
+#define vfsgnjn_vv_f32m8(...) __riscv_vfsgnjn_vv_f32m8(__VA_ARGS__)
+#define vfsgnjn_vf_f32m8(...) __riscv_vfsgnjn_vf_f32m8(__VA_ARGS__)
+#define vfsgnjn_vv_f64m1(...) __riscv_vfsgnjn_vv_f64m1(__VA_ARGS__)
+#define vfsgnjn_vf_f64m1(...) __riscv_vfsgnjn_vf_f64m1(__VA_ARGS__)
+#define vfsgnjn_vv_f64m2(...) __riscv_vfsgnjn_vv_f64m2(__VA_ARGS__)
+#define vfsgnjn_vf_f64m2(...) __riscv_vfsgnjn_vf_f64m2(__VA_ARGS__)
+#define vfsgnjn_vv_f64m4(...) __riscv_vfsgnjn_vv_f64m4(__VA_ARGS__)
+#define vfsgnjn_vf_f64m4(...) __riscv_vfsgnjn_vf_f64m4(__VA_ARGS__)
+#define vfsgnjn_vv_f64m8(...) __riscv_vfsgnjn_vv_f64m8(__VA_ARGS__)
+#define vfsgnjn_vf_f64m8(...) __riscv_vfsgnjn_vf_f64m8(__VA_ARGS__)
+#define vfsgnjx_vv_f16mf4(...) __riscv_vfsgnjx_vv_f16mf4(__VA_ARGS__)
+#define vfsgnjx_vf_f16mf4(...) __riscv_vfsgnjx_vf_f16mf4(__VA_ARGS__)
+#define vfsgnjx_vv_f16mf2(...) __riscv_vfsgnjx_vv_f16mf2(__VA_ARGS__)
+#define vfsgnjx_vf_f16mf2(...) __riscv_vfsgnjx_vf_f16mf2(__VA_ARGS__)
+#define vfsgnjx_vv_f16m1(...) __riscv_vfsgnjx_vv_f16m1(__VA_ARGS__)
+#define vfsgnjx_vf_f16m1(...) __riscv_vfsgnjx_vf_f16m1(__VA_ARGS__)
+#define vfsgnjx_vv_f16m2(...) __riscv_vfsgnjx_vv_f16m2(__VA_ARGS__)
+#define vfsgnjx_vf_f16m2(...) __riscv_vfsgnjx_vf_f16m2(__VA_ARGS__)
+#define vfsgnjx_vv_f16m4(...) __riscv_vfsgnjx_vv_f16m4(__VA_ARGS__)
+#define vfsgnjx_vf_f16m4(...) __riscv_vfsgnjx_vf_f16m4(__VA_ARGS__)
+#define vfsgnjx_vv_f16m8(...) __riscv_vfsgnjx_vv_f16m8(__VA_ARGS__)
+#define vfsgnjx_vf_f16m8(...) __riscv_vfsgnjx_vf_f16m8(__VA_ARGS__)
+#define vfsgnjx_vv_f32mf2(...) __riscv_vfsgnjx_vv_f32mf2(__VA_ARGS__)
+#define vfsgnjx_vf_f32mf2(...) __riscv_vfsgnjx_vf_f32mf2(__VA_ARGS__)
+#define vfsgnjx_vv_f32m1(...) __riscv_vfsgnjx_vv_f32m1(__VA_ARGS__)
+#define vfsgnjx_vf_f32m1(...) __riscv_vfsgnjx_vf_f32m1(__VA_ARGS__)
+#define vfsgnjx_vv_f32m2(...) __riscv_vfsgnjx_vv_f32m2(__VA_ARGS__)
+#define vfsgnjx_vf_f32m2(...) __riscv_vfsgnjx_vf_f32m2(__VA_ARGS__)
+#define vfsgnjx_vv_f32m4(...) __riscv_vfsgnjx_vv_f32m4(__VA_ARGS__)
+#define vfsgnjx_vf_f32m4(...) __riscv_vfsgnjx_vf_f32m4(__VA_ARGS__)
+#define vfsgnjx_vv_f32m8(...) __riscv_vfsgnjx_vv_f32m8(__VA_ARGS__)
+#define vfsgnjx_vf_f32m8(...) __riscv_vfsgnjx_vf_f32m8(__VA_ARGS__)
+#define vfsgnjx_vv_f64m1(...) __riscv_vfsgnjx_vv_f64m1(__VA_ARGS__)
+#define vfsgnjx_vf_f64m1(...) __riscv_vfsgnjx_vf_f64m1(__VA_ARGS__)
+#define vfsgnjx_vv_f64m2(...) __riscv_vfsgnjx_vv_f64m2(__VA_ARGS__)
+#define vfsgnjx_vf_f64m2(...) __riscv_vfsgnjx_vf_f64m2(__VA_ARGS__)
+#define vfsgnjx_vv_f64m4(...) __riscv_vfsgnjx_vv_f64m4(__VA_ARGS__)
+#define vfsgnjx_vf_f64m4(...) __riscv_vfsgnjx_vf_f64m4(__VA_ARGS__)
+#define vfsgnjx_vv_f64m8(...) __riscv_vfsgnjx_vv_f64m8(__VA_ARGS__)
+#define vfsgnjx_vf_f64m8(...) __riscv_vfsgnjx_vf_f64m8(__VA_ARGS__)
+// masked functions: the legacy vfsgnj/vfsgnjn/vfsgnjx "_m" names forward all arguments unchanged to the matching __riscv_..._tumu intrinsics (tail-undisturbed, mask-undisturbed policy suffix)
+#define vfsgnj_vv_f16mf4_m(...) __riscv_vfsgnj_vv_f16mf4_tumu(__VA_ARGS__)
+#define vfsgnj_vf_f16mf4_m(...) __riscv_vfsgnj_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfsgnj_vv_f16mf2_m(...) __riscv_vfsgnj_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfsgnj_vf_f16mf2_m(...) __riscv_vfsgnj_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfsgnj_vv_f16m1_m(...) __riscv_vfsgnj_vv_f16m1_tumu(__VA_ARGS__)
+#define vfsgnj_vf_f16m1_m(...) __riscv_vfsgnj_vf_f16m1_tumu(__VA_ARGS__)
+#define vfsgnj_vv_f16m2_m(...) __riscv_vfsgnj_vv_f16m2_tumu(__VA_ARGS__)
+#define vfsgnj_vf_f16m2_m(...) __riscv_vfsgnj_vf_f16m2_tumu(__VA_ARGS__)
+#define vfsgnj_vv_f16m4_m(...) __riscv_vfsgnj_vv_f16m4_tumu(__VA_ARGS__)
+#define vfsgnj_vf_f16m4_m(...) __riscv_vfsgnj_vf_f16m4_tumu(__VA_ARGS__)
+#define vfsgnj_vv_f16m8_m(...) __riscv_vfsgnj_vv_f16m8_tumu(__VA_ARGS__)
+#define vfsgnj_vf_f16m8_m(...) __riscv_vfsgnj_vf_f16m8_tumu(__VA_ARGS__)
+#define vfsgnj_vv_f32mf2_m(...) __riscv_vfsgnj_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfsgnj_vf_f32mf2_m(...) __riscv_vfsgnj_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfsgnj_vv_f32m1_m(...) __riscv_vfsgnj_vv_f32m1_tumu(__VA_ARGS__)
+#define vfsgnj_vf_f32m1_m(...) __riscv_vfsgnj_vf_f32m1_tumu(__VA_ARGS__)
+#define vfsgnj_vv_f32m2_m(...) __riscv_vfsgnj_vv_f32m2_tumu(__VA_ARGS__)
+#define vfsgnj_vf_f32m2_m(...) __riscv_vfsgnj_vf_f32m2_tumu(__VA_ARGS__)
+#define vfsgnj_vv_f32m4_m(...) __riscv_vfsgnj_vv_f32m4_tumu(__VA_ARGS__)
+#define vfsgnj_vf_f32m4_m(...) __riscv_vfsgnj_vf_f32m4_tumu(__VA_ARGS__)
+#define vfsgnj_vv_f32m8_m(...) __riscv_vfsgnj_vv_f32m8_tumu(__VA_ARGS__)
+#define vfsgnj_vf_f32m8_m(...) __riscv_vfsgnj_vf_f32m8_tumu(__VA_ARGS__)
+#define vfsgnj_vv_f64m1_m(...) __riscv_vfsgnj_vv_f64m1_tumu(__VA_ARGS__)
+#define vfsgnj_vf_f64m1_m(...) __riscv_vfsgnj_vf_f64m1_tumu(__VA_ARGS__)
+#define vfsgnj_vv_f64m2_m(...) __riscv_vfsgnj_vv_f64m2_tumu(__VA_ARGS__)
+#define vfsgnj_vf_f64m2_m(...) __riscv_vfsgnj_vf_f64m2_tumu(__VA_ARGS__)
+#define vfsgnj_vv_f64m4_m(...) __riscv_vfsgnj_vv_f64m4_tumu(__VA_ARGS__)
+#define vfsgnj_vf_f64m4_m(...) __riscv_vfsgnj_vf_f64m4_tumu(__VA_ARGS__)
+#define vfsgnj_vv_f64m8_m(...) __riscv_vfsgnj_vv_f64m8_tumu(__VA_ARGS__)
+#define vfsgnj_vf_f64m8_m(...) __riscv_vfsgnj_vf_f64m8_tumu(__VA_ARGS__)
+#define vfsgnjn_vv_f16mf4_m(...) __riscv_vfsgnjn_vv_f16mf4_tumu(__VA_ARGS__)
+#define vfsgnjn_vf_f16mf4_m(...) __riscv_vfsgnjn_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfsgnjn_vv_f16mf2_m(...) __riscv_vfsgnjn_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfsgnjn_vf_f16mf2_m(...) __riscv_vfsgnjn_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfsgnjn_vv_f16m1_m(...) __riscv_vfsgnjn_vv_f16m1_tumu(__VA_ARGS__)
+#define vfsgnjn_vf_f16m1_m(...) __riscv_vfsgnjn_vf_f16m1_tumu(__VA_ARGS__)
+#define vfsgnjn_vv_f16m2_m(...) __riscv_vfsgnjn_vv_f16m2_tumu(__VA_ARGS__)
+#define vfsgnjn_vf_f16m2_m(...) __riscv_vfsgnjn_vf_f16m2_tumu(__VA_ARGS__)
+#define vfsgnjn_vv_f16m4_m(...) __riscv_vfsgnjn_vv_f16m4_tumu(__VA_ARGS__)
+#define vfsgnjn_vf_f16m4_m(...) __riscv_vfsgnjn_vf_f16m4_tumu(__VA_ARGS__)
+#define vfsgnjn_vv_f16m8_m(...) __riscv_vfsgnjn_vv_f16m8_tumu(__VA_ARGS__)
+#define vfsgnjn_vf_f16m8_m(...) __riscv_vfsgnjn_vf_f16m8_tumu(__VA_ARGS__)
+#define vfsgnjn_vv_f32mf2_m(...) __riscv_vfsgnjn_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfsgnjn_vf_f32mf2_m(...) __riscv_vfsgnjn_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfsgnjn_vv_f32m1_m(...) __riscv_vfsgnjn_vv_f32m1_tumu(__VA_ARGS__)
+#define vfsgnjn_vf_f32m1_m(...) __riscv_vfsgnjn_vf_f32m1_tumu(__VA_ARGS__)
+#define vfsgnjn_vv_f32m2_m(...) __riscv_vfsgnjn_vv_f32m2_tumu(__VA_ARGS__)
+#define vfsgnjn_vf_f32m2_m(...) __riscv_vfsgnjn_vf_f32m2_tumu(__VA_ARGS__)
+#define vfsgnjn_vv_f32m4_m(...) __riscv_vfsgnjn_vv_f32m4_tumu(__VA_ARGS__)
+#define vfsgnjn_vf_f32m4_m(...) __riscv_vfsgnjn_vf_f32m4_tumu(__VA_ARGS__)
+#define vfsgnjn_vv_f32m8_m(...) __riscv_vfsgnjn_vv_f32m8_tumu(__VA_ARGS__)
+#define vfsgnjn_vf_f32m8_m(...) __riscv_vfsgnjn_vf_f32m8_tumu(__VA_ARGS__)
+#define vfsgnjn_vv_f64m1_m(...) __riscv_vfsgnjn_vv_f64m1_tumu(__VA_ARGS__)
+#define vfsgnjn_vf_f64m1_m(...) __riscv_vfsgnjn_vf_f64m1_tumu(__VA_ARGS__)
+#define vfsgnjn_vv_f64m2_m(...) __riscv_vfsgnjn_vv_f64m2_tumu(__VA_ARGS__)
+#define vfsgnjn_vf_f64m2_m(...) __riscv_vfsgnjn_vf_f64m2_tumu(__VA_ARGS__)
+#define vfsgnjn_vv_f64m4_m(...) __riscv_vfsgnjn_vv_f64m4_tumu(__VA_ARGS__)
+#define vfsgnjn_vf_f64m4_m(...) __riscv_vfsgnjn_vf_f64m4_tumu(__VA_ARGS__)
+#define vfsgnjn_vv_f64m8_m(...) __riscv_vfsgnjn_vv_f64m8_tumu(__VA_ARGS__)
+#define vfsgnjn_vf_f64m8_m(...) __riscv_vfsgnjn_vf_f64m8_tumu(__VA_ARGS__)
+#define vfsgnjx_vv_f16mf4_m(...) __riscv_vfsgnjx_vv_f16mf4_tumu(__VA_ARGS__)
+#define vfsgnjx_vf_f16mf4_m(...) __riscv_vfsgnjx_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfsgnjx_vv_f16mf2_m(...) __riscv_vfsgnjx_vv_f16mf2_tumu(__VA_ARGS__)
+#define vfsgnjx_vf_f16mf2_m(...) __riscv_vfsgnjx_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfsgnjx_vv_f16m1_m(...) __riscv_vfsgnjx_vv_f16m1_tumu(__VA_ARGS__)
+#define vfsgnjx_vf_f16m1_m(...) __riscv_vfsgnjx_vf_f16m1_tumu(__VA_ARGS__)
+#define vfsgnjx_vv_f16m2_m(...) __riscv_vfsgnjx_vv_f16m2_tumu(__VA_ARGS__)
+#define vfsgnjx_vf_f16m2_m(...) __riscv_vfsgnjx_vf_f16m2_tumu(__VA_ARGS__)
+#define vfsgnjx_vv_f16m4_m(...) __riscv_vfsgnjx_vv_f16m4_tumu(__VA_ARGS__)
+#define vfsgnjx_vf_f16m4_m(...) __riscv_vfsgnjx_vf_f16m4_tumu(__VA_ARGS__)
+#define vfsgnjx_vv_f16m8_m(...) __riscv_vfsgnjx_vv_f16m8_tumu(__VA_ARGS__)
+#define vfsgnjx_vf_f16m8_m(...) __riscv_vfsgnjx_vf_f16m8_tumu(__VA_ARGS__)
+#define vfsgnjx_vv_f32mf2_m(...) __riscv_vfsgnjx_vv_f32mf2_tumu(__VA_ARGS__)
+#define vfsgnjx_vf_f32mf2_m(...) __riscv_vfsgnjx_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfsgnjx_vv_f32m1_m(...) __riscv_vfsgnjx_vv_f32m1_tumu(__VA_ARGS__)
+#define vfsgnjx_vf_f32m1_m(...) __riscv_vfsgnjx_vf_f32m1_tumu(__VA_ARGS__)
+#define vfsgnjx_vv_f32m2_m(...) __riscv_vfsgnjx_vv_f32m2_tumu(__VA_ARGS__)
+#define vfsgnjx_vf_f32m2_m(...) __riscv_vfsgnjx_vf_f32m2_tumu(__VA_ARGS__)
+#define vfsgnjx_vv_f32m4_m(...) __riscv_vfsgnjx_vv_f32m4_tumu(__VA_ARGS__)
+#define vfsgnjx_vf_f32m4_m(...) __riscv_vfsgnjx_vf_f32m4_tumu(__VA_ARGS__)
+#define vfsgnjx_vv_f32m8_m(...) __riscv_vfsgnjx_vv_f32m8_tumu(__VA_ARGS__)
+#define vfsgnjx_vf_f32m8_m(...) __riscv_vfsgnjx_vf_f32m8_tumu(__VA_ARGS__)
+#define vfsgnjx_vv_f64m1_m(...) __riscv_vfsgnjx_vv_f64m1_tumu(__VA_ARGS__)
+#define vfsgnjx_vf_f64m1_m(...) __riscv_vfsgnjx_vf_f64m1_tumu(__VA_ARGS__)
+#define vfsgnjx_vv_f64m2_m(...) __riscv_vfsgnjx_vv_f64m2_tumu(__VA_ARGS__)
+#define vfsgnjx_vf_f64m2_m(...) __riscv_vfsgnjx_vf_f64m2_tumu(__VA_ARGS__)
+#define vfsgnjx_vv_f64m4_m(...) __riscv_vfsgnjx_vv_f64m4_tumu(__VA_ARGS__)
+#define vfsgnjx_vf_f64m4_m(...) __riscv_vfsgnjx_vf_f64m4_tumu(__VA_ARGS__)
+#define vfsgnjx_vv_f64m8_m(...) __riscv_vfsgnjx_vv_f64m8_tumu(__VA_ARGS__)
+#define vfsgnjx_vf_f64m8_m(...) __riscv_vfsgnjx_vf_f64m8_tumu(__VA_ARGS__)
+#define vfabs_v_f16mf4(...) __riscv_vfabs_v_f16mf4(__VA_ARGS__)
+#define vfabs_v_f16mf2(...) __riscv_vfabs_v_f16mf2(__VA_ARGS__)
+#define vfabs_v_f16m1(...) __riscv_vfabs_v_f16m1(__VA_ARGS__)
+#define vfabs_v_f16m2(...) __riscv_vfabs_v_f16m2(__VA_ARGS__)
+#define vfabs_v_f16m4(...) __riscv_vfabs_v_f16m4(__VA_ARGS__)
+#define vfabs_v_f16m8(...) __riscv_vfabs_v_f16m8(__VA_ARGS__)
+#define vfabs_v_f32mf2(...) __riscv_vfabs_v_f32mf2(__VA_ARGS__)
+#define vfabs_v_f32m1(...) __riscv_vfabs_v_f32m1(__VA_ARGS__)
+#define vfabs_v_f32m2(...) __riscv_vfabs_v_f32m2(__VA_ARGS__)
+#define vfabs_v_f32m4(...) __riscv_vfabs_v_f32m4(__VA_ARGS__)
+#define vfabs_v_f32m8(...) __riscv_vfabs_v_f32m8(__VA_ARGS__)
+#define vfabs_v_f64m1(...) __riscv_vfabs_v_f64m1(__VA_ARGS__)
+#define vfabs_v_f64m2(...) __riscv_vfabs_v_f64m2(__VA_ARGS__)
+#define vfabs_v_f64m4(...) __riscv_vfabs_v_f64m4(__VA_ARGS__)
+#define vfabs_v_f64m8(...) __riscv_vfabs_v_f64m8(__VA_ARGS__)
+// masked functions: the legacy vfabs "_m" names forward all arguments unchanged to the matching __riscv_..._tumu intrinsics (tail-undisturbed, mask-undisturbed policy suffix)
+#define vfabs_v_f16mf4_m(...) __riscv_vfabs_v_f16mf4_tumu(__VA_ARGS__)
+#define vfabs_v_f16mf2_m(...) __riscv_vfabs_v_f16mf2_tumu(__VA_ARGS__)
+#define vfabs_v_f16m1_m(...) __riscv_vfabs_v_f16m1_tumu(__VA_ARGS__)
+#define vfabs_v_f16m2_m(...) __riscv_vfabs_v_f16m2_tumu(__VA_ARGS__)
+#define vfabs_v_f16m4_m(...) __riscv_vfabs_v_f16m4_tumu(__VA_ARGS__)
+#define vfabs_v_f16m8_m(...) __riscv_vfabs_v_f16m8_tumu(__VA_ARGS__)
+#define vfabs_v_f32mf2_m(...) __riscv_vfabs_v_f32mf2_tumu(__VA_ARGS__)
+#define vfabs_v_f32m1_m(...) __riscv_vfabs_v_f32m1_tumu(__VA_ARGS__)
+#define vfabs_v_f32m2_m(...) __riscv_vfabs_v_f32m2_tumu(__VA_ARGS__)
+#define vfabs_v_f32m4_m(...) __riscv_vfabs_v_f32m4_tumu(__VA_ARGS__)
+#define vfabs_v_f32m8_m(...) __riscv_vfabs_v_f32m8_tumu(__VA_ARGS__)
+#define vfabs_v_f64m1_m(...) __riscv_vfabs_v_f64m1_tumu(__VA_ARGS__)
+#define vfabs_v_f64m2_m(...) __riscv_vfabs_v_f64m2_tumu(__VA_ARGS__)
+#define vfabs_v_f64m4_m(...) __riscv_vfabs_v_f64m4_tumu(__VA_ARGS__)
+#define vfabs_v_f64m8_m(...) __riscv_vfabs_v_f64m8_tumu(__VA_ARGS__)
+#define vmfeq_vv_f16mf4_b64(...) __riscv_vmfeq_vv_f16mf4_b64(__VA_ARGS__)
+#define vmfeq_vf_f16mf4_b64(...) __riscv_vmfeq_vf_f16mf4_b64(__VA_ARGS__)
+#define vmfeq_vv_f16mf2_b32(...) __riscv_vmfeq_vv_f16mf2_b32(__VA_ARGS__)
+#define vmfeq_vf_f16mf2_b32(...) __riscv_vmfeq_vf_f16mf2_b32(__VA_ARGS__)
+#define vmfeq_vv_f16m1_b16(...) __riscv_vmfeq_vv_f16m1_b16(__VA_ARGS__)
+#define vmfeq_vf_f16m1_b16(...) __riscv_vmfeq_vf_f16m1_b16(__VA_ARGS__)
+#define vmfeq_vv_f16m2_b8(...) __riscv_vmfeq_vv_f16m2_b8(__VA_ARGS__)
+#define vmfeq_vf_f16m2_b8(...) __riscv_vmfeq_vf_f16m2_b8(__VA_ARGS__)
+#define vmfeq_vv_f16m4_b4(...) __riscv_vmfeq_vv_f16m4_b4(__VA_ARGS__)
+#define vmfeq_vf_f16m4_b4(...) __riscv_vmfeq_vf_f16m4_b4(__VA_ARGS__)
+#define vmfeq_vv_f16m8_b2(...) __riscv_vmfeq_vv_f16m8_b2(__VA_ARGS__)
+#define vmfeq_vf_f16m8_b2(...) __riscv_vmfeq_vf_f16m8_b2(__VA_ARGS__)
+#define vmfeq_vv_f32mf2_b64(...) __riscv_vmfeq_vv_f32mf2_b64(__VA_ARGS__)
+#define vmfeq_vf_f32mf2_b64(...) __riscv_vmfeq_vf_f32mf2_b64(__VA_ARGS__)
+#define vmfeq_vv_f32m1_b32(...) __riscv_vmfeq_vv_f32m1_b32(__VA_ARGS__)
+#define vmfeq_vf_f32m1_b32(...) __riscv_vmfeq_vf_f32m1_b32(__VA_ARGS__)
+#define vmfeq_vv_f32m2_b16(...) __riscv_vmfeq_vv_f32m2_b16(__VA_ARGS__)
+#define vmfeq_vf_f32m2_b16(...) __riscv_vmfeq_vf_f32m2_b16(__VA_ARGS__)
+#define vmfeq_vv_f32m4_b8(...) __riscv_vmfeq_vv_f32m4_b8(__VA_ARGS__)
+#define vmfeq_vf_f32m4_b8(...) __riscv_vmfeq_vf_f32m4_b8(__VA_ARGS__)
+#define vmfeq_vv_f32m8_b4(...) __riscv_vmfeq_vv_f32m8_b4(__VA_ARGS__)
+#define vmfeq_vf_f32m8_b4(...) __riscv_vmfeq_vf_f32m8_b4(__VA_ARGS__)
+#define vmfeq_vv_f64m1_b64(...) __riscv_vmfeq_vv_f64m1_b64(__VA_ARGS__)
+#define vmfeq_vf_f64m1_b64(...) __riscv_vmfeq_vf_f64m1_b64(__VA_ARGS__)
+#define vmfeq_vv_f64m2_b32(...) __riscv_vmfeq_vv_f64m2_b32(__VA_ARGS__)
+#define vmfeq_vf_f64m2_b32(...) __riscv_vmfeq_vf_f64m2_b32(__VA_ARGS__)
+#define vmfeq_vv_f64m4_b16(...) __riscv_vmfeq_vv_f64m4_b16(__VA_ARGS__)
+#define vmfeq_vf_f64m4_b16(...) __riscv_vmfeq_vf_f64m4_b16(__VA_ARGS__)
+#define vmfeq_vv_f64m8_b8(...) __riscv_vmfeq_vv_f64m8_b8(__VA_ARGS__)
+#define vmfeq_vf_f64m8_b8(...) __riscv_vmfeq_vf_f64m8_b8(__VA_ARGS__)
+#define vmfne_vv_f16mf4_b64(...) __riscv_vmfne_vv_f16mf4_b64(__VA_ARGS__)
+#define vmfne_vf_f16mf4_b64(...) __riscv_vmfne_vf_f16mf4_b64(__VA_ARGS__)
+#define vmfne_vv_f16mf2_b32(...) __riscv_vmfne_vv_f16mf2_b32(__VA_ARGS__)
+#define vmfne_vf_f16mf2_b32(...) __riscv_vmfne_vf_f16mf2_b32(__VA_ARGS__)
+#define vmfne_vv_f16m1_b16(...) __riscv_vmfne_vv_f16m1_b16(__VA_ARGS__)
+#define vmfne_vf_f16m1_b16(...) __riscv_vmfne_vf_f16m1_b16(__VA_ARGS__)
+#define vmfne_vv_f16m2_b8(...) __riscv_vmfne_vv_f16m2_b8(__VA_ARGS__)
+#define vmfne_vf_f16m2_b8(...) __riscv_vmfne_vf_f16m2_b8(__VA_ARGS__)
+#define vmfne_vv_f16m4_b4(...) __riscv_vmfne_vv_f16m4_b4(__VA_ARGS__)
+#define vmfne_vf_f16m4_b4(...) __riscv_vmfne_vf_f16m4_b4(__VA_ARGS__)
+#define vmfne_vv_f16m8_b2(...) __riscv_vmfne_vv_f16m8_b2(__VA_ARGS__)
+#define vmfne_vf_f16m8_b2(...) __riscv_vmfne_vf_f16m8_b2(__VA_ARGS__)
+#define vmfne_vv_f32mf2_b64(...) __riscv_vmfne_vv_f32mf2_b64(__VA_ARGS__)
+#define vmfne_vf_f32mf2_b64(...) __riscv_vmfne_vf_f32mf2_b64(__VA_ARGS__)
+#define vmfne_vv_f32m1_b32(...) __riscv_vmfne_vv_f32m1_b32(__VA_ARGS__)
+#define vmfne_vf_f32m1_b32(...) __riscv_vmfne_vf_f32m1_b32(__VA_ARGS__)
+#define vmfne_vv_f32m2_b16(...) __riscv_vmfne_vv_f32m2_b16(__VA_ARGS__)
+#define vmfne_vf_f32m2_b16(...) __riscv_vmfne_vf_f32m2_b16(__VA_ARGS__)
+#define vmfne_vv_f32m4_b8(...) __riscv_vmfne_vv_f32m4_b8(__VA_ARGS__)
+#define vmfne_vf_f32m4_b8(...) __riscv_vmfne_vf_f32m4_b8(__VA_ARGS__)
+#define vmfne_vv_f32m8_b4(...) __riscv_vmfne_vv_f32m8_b4(__VA_ARGS__)
+#define vmfne_vf_f32m8_b4(...) __riscv_vmfne_vf_f32m8_b4(__VA_ARGS__)
+#define vmfne_vv_f64m1_b64(...) __riscv_vmfne_vv_f64m1_b64(__VA_ARGS__)
+#define vmfne_vf_f64m1_b64(...) __riscv_vmfne_vf_f64m1_b64(__VA_ARGS__)
+#define vmfne_vv_f64m2_b32(...) __riscv_vmfne_vv_f64m2_b32(__VA_ARGS__)
+#define vmfne_vf_f64m2_b32(...) __riscv_vmfne_vf_f64m2_b32(__VA_ARGS__)
+#define vmfne_vv_f64m4_b16(...) __riscv_vmfne_vv_f64m4_b16(__VA_ARGS__)
+#define vmfne_vf_f64m4_b16(...) __riscv_vmfne_vf_f64m4_b16(__VA_ARGS__)
+#define vmfne_vv_f64m8_b8(...) __riscv_vmfne_vv_f64m8_b8(__VA_ARGS__)
+#define vmfne_vf_f64m8_b8(...) __riscv_vmfne_vf_f64m8_b8(__VA_ARGS__)
+#define vmflt_vv_f16mf4_b64(...) __riscv_vmflt_vv_f16mf4_b64(__VA_ARGS__)
+#define vmflt_vf_f16mf4_b64(...) __riscv_vmflt_vf_f16mf4_b64(__VA_ARGS__)
+#define vmflt_vv_f16mf2_b32(...) __riscv_vmflt_vv_f16mf2_b32(__VA_ARGS__)
+#define vmflt_vf_f16mf2_b32(...) __riscv_vmflt_vf_f16mf2_b32(__VA_ARGS__)
+#define vmflt_vv_f16m1_b16(...) __riscv_vmflt_vv_f16m1_b16(__VA_ARGS__)
+#define vmflt_vf_f16m1_b16(...) __riscv_vmflt_vf_f16m1_b16(__VA_ARGS__)
+#define vmflt_vv_f16m2_b8(...) __riscv_vmflt_vv_f16m2_b8(__VA_ARGS__)
+#define vmflt_vf_f16m2_b8(...) __riscv_vmflt_vf_f16m2_b8(__VA_ARGS__)
+#define vmflt_vv_f16m4_b4(...) __riscv_vmflt_vv_f16m4_b4(__VA_ARGS__)
+#define vmflt_vf_f16m4_b4(...) __riscv_vmflt_vf_f16m4_b4(__VA_ARGS__)
+#define vmflt_vv_f16m8_b2(...) __riscv_vmflt_vv_f16m8_b2(__VA_ARGS__)
+#define vmflt_vf_f16m8_b2(...) __riscv_vmflt_vf_f16m8_b2(__VA_ARGS__)
+#define vmflt_vv_f32mf2_b64(...) __riscv_vmflt_vv_f32mf2_b64(__VA_ARGS__)
+#define vmflt_vf_f32mf2_b64(...) __riscv_vmflt_vf_f32mf2_b64(__VA_ARGS__)
+#define vmflt_vv_f32m1_b32(...) __riscv_vmflt_vv_f32m1_b32(__VA_ARGS__)
+#define vmflt_vf_f32m1_b32(...) __riscv_vmflt_vf_f32m1_b32(__VA_ARGS__)
+#define vmflt_vv_f32m2_b16(...) __riscv_vmflt_vv_f32m2_b16(__VA_ARGS__)
+#define vmflt_vf_f32m2_b16(...) __riscv_vmflt_vf_f32m2_b16(__VA_ARGS__)
+#define vmflt_vv_f32m4_b8(...) __riscv_vmflt_vv_f32m4_b8(__VA_ARGS__)
+#define vmflt_vf_f32m4_b8(...) __riscv_vmflt_vf_f32m4_b8(__VA_ARGS__)
+#define vmflt_vv_f32m8_b4(...) __riscv_vmflt_vv_f32m8_b4(__VA_ARGS__)
+#define vmflt_vf_f32m8_b4(...) __riscv_vmflt_vf_f32m8_b4(__VA_ARGS__)
+#define vmflt_vv_f64m1_b64(...) __riscv_vmflt_vv_f64m1_b64(__VA_ARGS__)
+#define vmflt_vf_f64m1_b64(...) __riscv_vmflt_vf_f64m1_b64(__VA_ARGS__)
+#define vmflt_vv_f64m2_b32(...) __riscv_vmflt_vv_f64m2_b32(__VA_ARGS__)
+#define vmflt_vf_f64m2_b32(...) __riscv_vmflt_vf_f64m2_b32(__VA_ARGS__)
+#define vmflt_vv_f64m4_b16(...) __riscv_vmflt_vv_f64m4_b16(__VA_ARGS__)
+#define vmflt_vf_f64m4_b16(...) __riscv_vmflt_vf_f64m4_b16(__VA_ARGS__)
+#define vmflt_vv_f64m8_b8(...) __riscv_vmflt_vv_f64m8_b8(__VA_ARGS__)
+#define vmflt_vf_f64m8_b8(...) __riscv_vmflt_vf_f64m8_b8(__VA_ARGS__)
+#define vmfle_vv_f16mf4_b64(...) __riscv_vmfle_vv_f16mf4_b64(__VA_ARGS__)
+#define vmfle_vf_f16mf4_b64(...) __riscv_vmfle_vf_f16mf4_b64(__VA_ARGS__)
+#define vmfle_vv_f16mf2_b32(...) __riscv_vmfle_vv_f16mf2_b32(__VA_ARGS__)
+#define vmfle_vf_f16mf2_b32(...) __riscv_vmfle_vf_f16mf2_b32(__VA_ARGS__)
+#define vmfle_vv_f16m1_b16(...) __riscv_vmfle_vv_f16m1_b16(__VA_ARGS__)
+#define vmfle_vf_f16m1_b16(...) __riscv_vmfle_vf_f16m1_b16(__VA_ARGS__)
+#define vmfle_vv_f16m2_b8(...) __riscv_vmfle_vv_f16m2_b8(__VA_ARGS__)
+#define vmfle_vf_f16m2_b8(...) __riscv_vmfle_vf_f16m2_b8(__VA_ARGS__)
+#define vmfle_vv_f16m4_b4(...) __riscv_vmfle_vv_f16m4_b4(__VA_ARGS__)
+#define vmfle_vf_f16m4_b4(...) __riscv_vmfle_vf_f16m4_b4(__VA_ARGS__)
+#define vmfle_vv_f16m8_b2(...) __riscv_vmfle_vv_f16m8_b2(__VA_ARGS__)
+#define vmfle_vf_f16m8_b2(...) __riscv_vmfle_vf_f16m8_b2(__VA_ARGS__)
+#define vmfle_vv_f32mf2_b64(...) __riscv_vmfle_vv_f32mf2_b64(__VA_ARGS__)
+#define vmfle_vf_f32mf2_b64(...) __riscv_vmfle_vf_f32mf2_b64(__VA_ARGS__)
+#define vmfle_vv_f32m1_b32(...) __riscv_vmfle_vv_f32m1_b32(__VA_ARGS__)
+#define vmfle_vf_f32m1_b32(...) __riscv_vmfle_vf_f32m1_b32(__VA_ARGS__)
+#define vmfle_vv_f32m2_b16(...) __riscv_vmfle_vv_f32m2_b16(__VA_ARGS__)
+#define vmfle_vf_f32m2_b16(...) __riscv_vmfle_vf_f32m2_b16(__VA_ARGS__)
+#define vmfle_vv_f32m4_b8(...) __riscv_vmfle_vv_f32m4_b8(__VA_ARGS__)
+#define vmfle_vf_f32m4_b8(...) __riscv_vmfle_vf_f32m4_b8(__VA_ARGS__)
+#define vmfle_vv_f32m8_b4(...) __riscv_vmfle_vv_f32m8_b4(__VA_ARGS__)
+#define vmfle_vf_f32m8_b4(...) __riscv_vmfle_vf_f32m8_b4(__VA_ARGS__)
+#define vmfle_vv_f64m1_b64(...) __riscv_vmfle_vv_f64m1_b64(__VA_ARGS__)
+#define vmfle_vf_f64m1_b64(...) __riscv_vmfle_vf_f64m1_b64(__VA_ARGS__)
+#define vmfle_vv_f64m2_b32(...) __riscv_vmfle_vv_f64m2_b32(__VA_ARGS__)
+#define vmfle_vf_f64m2_b32(...) __riscv_vmfle_vf_f64m2_b32(__VA_ARGS__)
+#define vmfle_vv_f64m4_b16(...) __riscv_vmfle_vv_f64m4_b16(__VA_ARGS__)
+#define vmfle_vf_f64m4_b16(...) __riscv_vmfle_vf_f64m4_b16(__VA_ARGS__)
+#define vmfle_vv_f64m8_b8(...) __riscv_vmfle_vv_f64m8_b8(__VA_ARGS__)
+#define vmfle_vf_f64m8_b8(...) __riscv_vmfle_vf_f64m8_b8(__VA_ARGS__)
+#define vmfgt_vv_f16mf4_b64(...) __riscv_vmfgt_vv_f16mf4_b64(__VA_ARGS__)
+#define vmfgt_vf_f16mf4_b64(...) __riscv_vmfgt_vf_f16mf4_b64(__VA_ARGS__)
+#define vmfgt_vv_f16mf2_b32(...) __riscv_vmfgt_vv_f16mf2_b32(__VA_ARGS__)
+#define vmfgt_vf_f16mf2_b32(...) __riscv_vmfgt_vf_f16mf2_b32(__VA_ARGS__)
+#define vmfgt_vv_f16m1_b16(...) __riscv_vmfgt_vv_f16m1_b16(__VA_ARGS__)
+#define vmfgt_vf_f16m1_b16(...) __riscv_vmfgt_vf_f16m1_b16(__VA_ARGS__)
+#define vmfgt_vv_f16m2_b8(...) __riscv_vmfgt_vv_f16m2_b8(__VA_ARGS__)
+#define vmfgt_vf_f16m2_b8(...) __riscv_vmfgt_vf_f16m2_b8(__VA_ARGS__)
+#define vmfgt_vv_f16m4_b4(...) __riscv_vmfgt_vv_f16m4_b4(__VA_ARGS__)
+#define vmfgt_vf_f16m4_b4(...) __riscv_vmfgt_vf_f16m4_b4(__VA_ARGS__)
+#define vmfgt_vv_f16m8_b2(...) __riscv_vmfgt_vv_f16m8_b2(__VA_ARGS__)
+#define vmfgt_vf_f16m8_b2(...) __riscv_vmfgt_vf_f16m8_b2(__VA_ARGS__)
+#define vmfgt_vv_f32mf2_b64(...) __riscv_vmfgt_vv_f32mf2_b64(__VA_ARGS__)
+#define vmfgt_vf_f32mf2_b64(...) __riscv_vmfgt_vf_f32mf2_b64(__VA_ARGS__)
+#define vmfgt_vv_f32m1_b32(...) __riscv_vmfgt_vv_f32m1_b32(__VA_ARGS__)
+#define vmfgt_vf_f32m1_b32(...) __riscv_vmfgt_vf_f32m1_b32(__VA_ARGS__)
+#define vmfgt_vv_f32m2_b16(...) __riscv_vmfgt_vv_f32m2_b16(__VA_ARGS__)
+#define vmfgt_vf_f32m2_b16(...) __riscv_vmfgt_vf_f32m2_b16(__VA_ARGS__)
+#define vmfgt_vv_f32m4_b8(...) __riscv_vmfgt_vv_f32m4_b8(__VA_ARGS__)
+#define vmfgt_vf_f32m4_b8(...) __riscv_vmfgt_vf_f32m4_b8(__VA_ARGS__)
+#define vmfgt_vv_f32m8_b4(...) __riscv_vmfgt_vv_f32m8_b4(__VA_ARGS__)
+#define vmfgt_vf_f32m8_b4(...) __riscv_vmfgt_vf_f32m8_b4(__VA_ARGS__)
+#define vmfgt_vv_f64m1_b64(...) __riscv_vmfgt_vv_f64m1_b64(__VA_ARGS__)
+#define vmfgt_vf_f64m1_b64(...) __riscv_vmfgt_vf_f64m1_b64(__VA_ARGS__)
+#define vmfgt_vv_f64m2_b32(...) __riscv_vmfgt_vv_f64m2_b32(__VA_ARGS__)
+#define vmfgt_vf_f64m2_b32(...) __riscv_vmfgt_vf_f64m2_b32(__VA_ARGS__)
+#define vmfgt_vv_f64m4_b16(...) __riscv_vmfgt_vv_f64m4_b16(__VA_ARGS__)
+#define vmfgt_vf_f64m4_b16(...) __riscv_vmfgt_vf_f64m4_b16(__VA_ARGS__)
+#define vmfgt_vv_f64m8_b8(...) __riscv_vmfgt_vv_f64m8_b8(__VA_ARGS__)
+#define vmfgt_vf_f64m8_b8(...) __riscv_vmfgt_vf_f64m8_b8(__VA_ARGS__)
+#define vmfge_vv_f16mf4_b64(...) __riscv_vmfge_vv_f16mf4_b64(__VA_ARGS__)
+#define vmfge_vf_f16mf4_b64(...) __riscv_vmfge_vf_f16mf4_b64(__VA_ARGS__)
+#define vmfge_vv_f16mf2_b32(...) __riscv_vmfge_vv_f16mf2_b32(__VA_ARGS__)
+#define vmfge_vf_f16mf2_b32(...) __riscv_vmfge_vf_f16mf2_b32(__VA_ARGS__)
+#define vmfge_vv_f16m1_b16(...) __riscv_vmfge_vv_f16m1_b16(__VA_ARGS__)
+#define vmfge_vf_f16m1_b16(...) __riscv_vmfge_vf_f16m1_b16(__VA_ARGS__)
+#define vmfge_vv_f16m2_b8(...) __riscv_vmfge_vv_f16m2_b8(__VA_ARGS__)
+#define vmfge_vf_f16m2_b8(...) __riscv_vmfge_vf_f16m2_b8(__VA_ARGS__)
+#define vmfge_vv_f16m4_b4(...) __riscv_vmfge_vv_f16m4_b4(__VA_ARGS__)
+#define vmfge_vf_f16m4_b4(...) __riscv_vmfge_vf_f16m4_b4(__VA_ARGS__)
+#define vmfge_vv_f16m8_b2(...) __riscv_vmfge_vv_f16m8_b2(__VA_ARGS__)
+#define vmfge_vf_f16m8_b2(...) __riscv_vmfge_vf_f16m8_b2(__VA_ARGS__)
+#define vmfge_vv_f32mf2_b64(...) __riscv_vmfge_vv_f32mf2_b64(__VA_ARGS__)
+#define vmfge_vf_f32mf2_b64(...) __riscv_vmfge_vf_f32mf2_b64(__VA_ARGS__)
+#define vmfge_vv_f32m1_b32(...) __riscv_vmfge_vv_f32m1_b32(__VA_ARGS__)
+#define vmfge_vf_f32m1_b32(...) __riscv_vmfge_vf_f32m1_b32(__VA_ARGS__)
+#define vmfge_vv_f32m2_b16(...) __riscv_vmfge_vv_f32m2_b16(__VA_ARGS__)
+#define vmfge_vf_f32m2_b16(...) __riscv_vmfge_vf_f32m2_b16(__VA_ARGS__)
+#define vmfge_vv_f32m4_b8(...) __riscv_vmfge_vv_f32m4_b8(__VA_ARGS__)
+#define vmfge_vf_f32m4_b8(...) __riscv_vmfge_vf_f32m4_b8(__VA_ARGS__)
+#define vmfge_vv_f32m8_b4(...) __riscv_vmfge_vv_f32m8_b4(__VA_ARGS__)
+#define vmfge_vf_f32m8_b4(...) __riscv_vmfge_vf_f32m8_b4(__VA_ARGS__)
+#define vmfge_vv_f64m1_b64(...) __riscv_vmfge_vv_f64m1_b64(__VA_ARGS__)
+#define vmfge_vf_f64m1_b64(...) __riscv_vmfge_vf_f64m1_b64(__VA_ARGS__)
+#define vmfge_vv_f64m2_b32(...) __riscv_vmfge_vv_f64m2_b32(__VA_ARGS__)
+#define vmfge_vf_f64m2_b32(...) __riscv_vmfge_vf_f64m2_b32(__VA_ARGS__)
+#define vmfge_vv_f64m4_b16(...) __riscv_vmfge_vv_f64m4_b16(__VA_ARGS__)
+#define vmfge_vf_f64m4_b16(...) __riscv_vmfge_vf_f64m4_b16(__VA_ARGS__)
+#define vmfge_vv_f64m8_b8(...) __riscv_vmfge_vv_f64m8_b8(__VA_ARGS__)
+#define vmfge_vf_f64m8_b8(...) __riscv_vmfge_vf_f64m8_b8(__VA_ARGS__)
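+// masked functions: floating-point compares (vmfeq/vmfne/vmflt/vmfle/vmfgt/vmfge); the _m names forward to the _mu intrinsics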
+#define vmfeq_vv_f16mf4_b64_m(...) __riscv_vmfeq_vv_f16mf4_b64_mu(__VA_ARGS__)
+#define vmfeq_vf_f16mf4_b64_m(...) __riscv_vmfeq_vf_f16mf4_b64_mu(__VA_ARGS__)
+#define vmfeq_vv_f16mf2_b32_m(...) __riscv_vmfeq_vv_f16mf2_b32_mu(__VA_ARGS__)
+#define vmfeq_vf_f16mf2_b32_m(...) __riscv_vmfeq_vf_f16mf2_b32_mu(__VA_ARGS__)
+#define vmfeq_vv_f16m1_b16_m(...) __riscv_vmfeq_vv_f16m1_b16_mu(__VA_ARGS__)
+#define vmfeq_vf_f16m1_b16_m(...) __riscv_vmfeq_vf_f16m1_b16_mu(__VA_ARGS__)
+#define vmfeq_vv_f16m2_b8_m(...) __riscv_vmfeq_vv_f16m2_b8_mu(__VA_ARGS__)
+#define vmfeq_vf_f16m2_b8_m(...) __riscv_vmfeq_vf_f16m2_b8_mu(__VA_ARGS__)
+#define vmfeq_vv_f16m4_b4_m(...) __riscv_vmfeq_vv_f16m4_b4_mu(__VA_ARGS__)
+#define vmfeq_vf_f16m4_b4_m(...) __riscv_vmfeq_vf_f16m4_b4_mu(__VA_ARGS__)
+#define vmfeq_vv_f16m8_b2_m(...) __riscv_vmfeq_vv_f16m8_b2_mu(__VA_ARGS__)
+#define vmfeq_vf_f16m8_b2_m(...) __riscv_vmfeq_vf_f16m8_b2_mu(__VA_ARGS__)
+#define vmfeq_vv_f32mf2_b64_m(...) __riscv_vmfeq_vv_f32mf2_b64_mu(__VA_ARGS__)
+#define vmfeq_vf_f32mf2_b64_m(...) __riscv_vmfeq_vf_f32mf2_b64_mu(__VA_ARGS__)
+#define vmfeq_vv_f32m1_b32_m(...) __riscv_vmfeq_vv_f32m1_b32_mu(__VA_ARGS__)
+#define vmfeq_vf_f32m1_b32_m(...) __riscv_vmfeq_vf_f32m1_b32_mu(__VA_ARGS__)
+#define vmfeq_vv_f32m2_b16_m(...) __riscv_vmfeq_vv_f32m2_b16_mu(__VA_ARGS__)
+#define vmfeq_vf_f32m2_b16_m(...) __riscv_vmfeq_vf_f32m2_b16_mu(__VA_ARGS__)
+#define vmfeq_vv_f32m4_b8_m(...) __riscv_vmfeq_vv_f32m4_b8_mu(__VA_ARGS__)
+#define vmfeq_vf_f32m4_b8_m(...) __riscv_vmfeq_vf_f32m4_b8_mu(__VA_ARGS__)
+#define vmfeq_vv_f32m8_b4_m(...) __riscv_vmfeq_vv_f32m8_b4_mu(__VA_ARGS__)
+#define vmfeq_vf_f32m8_b4_m(...) __riscv_vmfeq_vf_f32m8_b4_mu(__VA_ARGS__)
+#define vmfeq_vv_f64m1_b64_m(...) __riscv_vmfeq_vv_f64m1_b64_mu(__VA_ARGS__)
+#define vmfeq_vf_f64m1_b64_m(...) __riscv_vmfeq_vf_f64m1_b64_mu(__VA_ARGS__)
+#define vmfeq_vv_f64m2_b32_m(...) __riscv_vmfeq_vv_f64m2_b32_mu(__VA_ARGS__)
+#define vmfeq_vf_f64m2_b32_m(...) __riscv_vmfeq_vf_f64m2_b32_mu(__VA_ARGS__)
+#define vmfeq_vv_f64m4_b16_m(...) __riscv_vmfeq_vv_f64m4_b16_mu(__VA_ARGS__)
+#define vmfeq_vf_f64m4_b16_m(...) __riscv_vmfeq_vf_f64m4_b16_mu(__VA_ARGS__)
+#define vmfeq_vv_f64m8_b8_m(...) __riscv_vmfeq_vv_f64m8_b8_mu(__VA_ARGS__)
+#define vmfeq_vf_f64m8_b8_m(...) __riscv_vmfeq_vf_f64m8_b8_mu(__VA_ARGS__)
+#define vmfne_vv_f16mf4_b64_m(...) __riscv_vmfne_vv_f16mf4_b64_mu(__VA_ARGS__)
+#define vmfne_vf_f16mf4_b64_m(...) __riscv_vmfne_vf_f16mf4_b64_mu(__VA_ARGS__)
+#define vmfne_vv_f16mf2_b32_m(...) __riscv_vmfne_vv_f16mf2_b32_mu(__VA_ARGS__)
+#define vmfne_vf_f16mf2_b32_m(...) __riscv_vmfne_vf_f16mf2_b32_mu(__VA_ARGS__)
+#define vmfne_vv_f16m1_b16_m(...) __riscv_vmfne_vv_f16m1_b16_mu(__VA_ARGS__)
+#define vmfne_vf_f16m1_b16_m(...) __riscv_vmfne_vf_f16m1_b16_mu(__VA_ARGS__)
+#define vmfne_vv_f16m2_b8_m(...) __riscv_vmfne_vv_f16m2_b8_mu(__VA_ARGS__)
+#define vmfne_vf_f16m2_b8_m(...) __riscv_vmfne_vf_f16m2_b8_mu(__VA_ARGS__)
+#define vmfne_vv_f16m4_b4_m(...) __riscv_vmfne_vv_f16m4_b4_mu(__VA_ARGS__)
+#define vmfne_vf_f16m4_b4_m(...) __riscv_vmfne_vf_f16m4_b4_mu(__VA_ARGS__)
+#define vmfne_vv_f16m8_b2_m(...) __riscv_vmfne_vv_f16m8_b2_mu(__VA_ARGS__)
+#define vmfne_vf_f16m8_b2_m(...) __riscv_vmfne_vf_f16m8_b2_mu(__VA_ARGS__)
+#define vmfne_vv_f32mf2_b64_m(...) __riscv_vmfne_vv_f32mf2_b64_mu(__VA_ARGS__)
+#define vmfne_vf_f32mf2_b64_m(...) __riscv_vmfne_vf_f32mf2_b64_mu(__VA_ARGS__)
+#define vmfne_vv_f32m1_b32_m(...) __riscv_vmfne_vv_f32m1_b32_mu(__VA_ARGS__)
+#define vmfne_vf_f32m1_b32_m(...) __riscv_vmfne_vf_f32m1_b32_mu(__VA_ARGS__)
+#define vmfne_vv_f32m2_b16_m(...) __riscv_vmfne_vv_f32m2_b16_mu(__VA_ARGS__)
+#define vmfne_vf_f32m2_b16_m(...) __riscv_vmfne_vf_f32m2_b16_mu(__VA_ARGS__)
+#define vmfne_vv_f32m4_b8_m(...) __riscv_vmfne_vv_f32m4_b8_mu(__VA_ARGS__)
+#define vmfne_vf_f32m4_b8_m(...) __riscv_vmfne_vf_f32m4_b8_mu(__VA_ARGS__)
+#define vmfne_vv_f32m8_b4_m(...) __riscv_vmfne_vv_f32m8_b4_mu(__VA_ARGS__)
+#define vmfne_vf_f32m8_b4_m(...) __riscv_vmfne_vf_f32m8_b4_mu(__VA_ARGS__)
+#define vmfne_vv_f64m1_b64_m(...) __riscv_vmfne_vv_f64m1_b64_mu(__VA_ARGS__)
+#define vmfne_vf_f64m1_b64_m(...) __riscv_vmfne_vf_f64m1_b64_mu(__VA_ARGS__)
+#define vmfne_vv_f64m2_b32_m(...) __riscv_vmfne_vv_f64m2_b32_mu(__VA_ARGS__)
+#define vmfne_vf_f64m2_b32_m(...) __riscv_vmfne_vf_f64m2_b32_mu(__VA_ARGS__)
+#define vmfne_vv_f64m4_b16_m(...) __riscv_vmfne_vv_f64m4_b16_mu(__VA_ARGS__)
+#define vmfne_vf_f64m4_b16_m(...) __riscv_vmfne_vf_f64m4_b16_mu(__VA_ARGS__)
+#define vmfne_vv_f64m8_b8_m(...) __riscv_vmfne_vv_f64m8_b8_mu(__VA_ARGS__)
+#define vmfne_vf_f64m8_b8_m(...) __riscv_vmfne_vf_f64m8_b8_mu(__VA_ARGS__)
+#define vmflt_vv_f16mf4_b64_m(...) __riscv_vmflt_vv_f16mf4_b64_mu(__VA_ARGS__)
+#define vmflt_vf_f16mf4_b64_m(...) __riscv_vmflt_vf_f16mf4_b64_mu(__VA_ARGS__)
+#define vmflt_vv_f16mf2_b32_m(...) __riscv_vmflt_vv_f16mf2_b32_mu(__VA_ARGS__)
+#define vmflt_vf_f16mf2_b32_m(...) __riscv_vmflt_vf_f16mf2_b32_mu(__VA_ARGS__)
+#define vmflt_vv_f16m1_b16_m(...) __riscv_vmflt_vv_f16m1_b16_mu(__VA_ARGS__)
+#define vmflt_vf_f16m1_b16_m(...) __riscv_vmflt_vf_f16m1_b16_mu(__VA_ARGS__)
+#define vmflt_vv_f16m2_b8_m(...) __riscv_vmflt_vv_f16m2_b8_mu(__VA_ARGS__)
+#define vmflt_vf_f16m2_b8_m(...) __riscv_vmflt_vf_f16m2_b8_mu(__VA_ARGS__)
+#define vmflt_vv_f16m4_b4_m(...) __riscv_vmflt_vv_f16m4_b4_mu(__VA_ARGS__)
+#define vmflt_vf_f16m4_b4_m(...) __riscv_vmflt_vf_f16m4_b4_mu(__VA_ARGS__)
+#define vmflt_vv_f16m8_b2_m(...) __riscv_vmflt_vv_f16m8_b2_mu(__VA_ARGS__)
+#define vmflt_vf_f16m8_b2_m(...) __riscv_vmflt_vf_f16m8_b2_mu(__VA_ARGS__)
+#define vmflt_vv_f32mf2_b64_m(...) __riscv_vmflt_vv_f32mf2_b64_mu(__VA_ARGS__)
+#define vmflt_vf_f32mf2_b64_m(...) __riscv_vmflt_vf_f32mf2_b64_mu(__VA_ARGS__)
+#define vmflt_vv_f32m1_b32_m(...) __riscv_vmflt_vv_f32m1_b32_mu(__VA_ARGS__)
+#define vmflt_vf_f32m1_b32_m(...) __riscv_vmflt_vf_f32m1_b32_mu(__VA_ARGS__)
+#define vmflt_vv_f32m2_b16_m(...) __riscv_vmflt_vv_f32m2_b16_mu(__VA_ARGS__)
+#define vmflt_vf_f32m2_b16_m(...) __riscv_vmflt_vf_f32m2_b16_mu(__VA_ARGS__)
+#define vmflt_vv_f32m4_b8_m(...) __riscv_vmflt_vv_f32m4_b8_mu(__VA_ARGS__)
+#define vmflt_vf_f32m4_b8_m(...) __riscv_vmflt_vf_f32m4_b8_mu(__VA_ARGS__)
+#define vmflt_vv_f32m8_b4_m(...) __riscv_vmflt_vv_f32m8_b4_mu(__VA_ARGS__)
+#define vmflt_vf_f32m8_b4_m(...) __riscv_vmflt_vf_f32m8_b4_mu(__VA_ARGS__)
+#define vmflt_vv_f64m1_b64_m(...) __riscv_vmflt_vv_f64m1_b64_mu(__VA_ARGS__)
+#define vmflt_vf_f64m1_b64_m(...) __riscv_vmflt_vf_f64m1_b64_mu(__VA_ARGS__)
+#define vmflt_vv_f64m2_b32_m(...) __riscv_vmflt_vv_f64m2_b32_mu(__VA_ARGS__)
+#define vmflt_vf_f64m2_b32_m(...) __riscv_vmflt_vf_f64m2_b32_mu(__VA_ARGS__)
+#define vmflt_vv_f64m4_b16_m(...) __riscv_vmflt_vv_f64m4_b16_mu(__VA_ARGS__)
+#define vmflt_vf_f64m4_b16_m(...) __riscv_vmflt_vf_f64m4_b16_mu(__VA_ARGS__)
+#define vmflt_vv_f64m8_b8_m(...) __riscv_vmflt_vv_f64m8_b8_mu(__VA_ARGS__)
+#define vmflt_vf_f64m8_b8_m(...) __riscv_vmflt_vf_f64m8_b8_mu(__VA_ARGS__)
+#define vmfle_vv_f16mf4_b64_m(...) __riscv_vmfle_vv_f16mf4_b64_mu(__VA_ARGS__)
+#define vmfle_vf_f16mf4_b64_m(...) __riscv_vmfle_vf_f16mf4_b64_mu(__VA_ARGS__)
+#define vmfle_vv_f16mf2_b32_m(...) __riscv_vmfle_vv_f16mf2_b32_mu(__VA_ARGS__)
+#define vmfle_vf_f16mf2_b32_m(...) __riscv_vmfle_vf_f16mf2_b32_mu(__VA_ARGS__)
+#define vmfle_vv_f16m1_b16_m(...) __riscv_vmfle_vv_f16m1_b16_mu(__VA_ARGS__)
+#define vmfle_vf_f16m1_b16_m(...) __riscv_vmfle_vf_f16m1_b16_mu(__VA_ARGS__)
+#define vmfle_vv_f16m2_b8_m(...) __riscv_vmfle_vv_f16m2_b8_mu(__VA_ARGS__)
+#define vmfle_vf_f16m2_b8_m(...) __riscv_vmfle_vf_f16m2_b8_mu(__VA_ARGS__)
+#define vmfle_vv_f16m4_b4_m(...) __riscv_vmfle_vv_f16m4_b4_mu(__VA_ARGS__)
+#define vmfle_vf_f16m4_b4_m(...) __riscv_vmfle_vf_f16m4_b4_mu(__VA_ARGS__)
+#define vmfle_vv_f16m8_b2_m(...) __riscv_vmfle_vv_f16m8_b2_mu(__VA_ARGS__)
+#define vmfle_vf_f16m8_b2_m(...) __riscv_vmfle_vf_f16m8_b2_mu(__VA_ARGS__)
+#define vmfle_vv_f32mf2_b64_m(...) __riscv_vmfle_vv_f32mf2_b64_mu(__VA_ARGS__)
+#define vmfle_vf_f32mf2_b64_m(...) __riscv_vmfle_vf_f32mf2_b64_mu(__VA_ARGS__)
+#define vmfle_vv_f32m1_b32_m(...) __riscv_vmfle_vv_f32m1_b32_mu(__VA_ARGS__)
+#define vmfle_vf_f32m1_b32_m(...) __riscv_vmfle_vf_f32m1_b32_mu(__VA_ARGS__)
+#define vmfle_vv_f32m2_b16_m(...) __riscv_vmfle_vv_f32m2_b16_mu(__VA_ARGS__)
+#define vmfle_vf_f32m2_b16_m(...) __riscv_vmfle_vf_f32m2_b16_mu(__VA_ARGS__)
+#define vmfle_vv_f32m4_b8_m(...) __riscv_vmfle_vv_f32m4_b8_mu(__VA_ARGS__)
+#define vmfle_vf_f32m4_b8_m(...) __riscv_vmfle_vf_f32m4_b8_mu(__VA_ARGS__)
+#define vmfle_vv_f32m8_b4_m(...) __riscv_vmfle_vv_f32m8_b4_mu(__VA_ARGS__)
+#define vmfle_vf_f32m8_b4_m(...) __riscv_vmfle_vf_f32m8_b4_mu(__VA_ARGS__)
+#define vmfle_vv_f64m1_b64_m(...) __riscv_vmfle_vv_f64m1_b64_mu(__VA_ARGS__)
+#define vmfle_vf_f64m1_b64_m(...) __riscv_vmfle_vf_f64m1_b64_mu(__VA_ARGS__)
+#define vmfle_vv_f64m2_b32_m(...) __riscv_vmfle_vv_f64m2_b32_mu(__VA_ARGS__)
+#define vmfle_vf_f64m2_b32_m(...) __riscv_vmfle_vf_f64m2_b32_mu(__VA_ARGS__)
+#define vmfle_vv_f64m4_b16_m(...) __riscv_vmfle_vv_f64m4_b16_mu(__VA_ARGS__)
+#define vmfle_vf_f64m4_b16_m(...) __riscv_vmfle_vf_f64m4_b16_mu(__VA_ARGS__)
+#define vmfle_vv_f64m8_b8_m(...) __riscv_vmfle_vv_f64m8_b8_mu(__VA_ARGS__)
+#define vmfle_vf_f64m8_b8_m(...) __riscv_vmfle_vf_f64m8_b8_mu(__VA_ARGS__)
+#define vmfgt_vv_f16mf4_b64_m(...) __riscv_vmfgt_vv_f16mf4_b64_mu(__VA_ARGS__)
+#define vmfgt_vf_f16mf4_b64_m(...) __riscv_vmfgt_vf_f16mf4_b64_mu(__VA_ARGS__)
+#define vmfgt_vv_f16mf2_b32_m(...) __riscv_vmfgt_vv_f16mf2_b32_mu(__VA_ARGS__)
+#define vmfgt_vf_f16mf2_b32_m(...) __riscv_vmfgt_vf_f16mf2_b32_mu(__VA_ARGS__)
+#define vmfgt_vv_f16m1_b16_m(...) __riscv_vmfgt_vv_f16m1_b16_mu(__VA_ARGS__)
+#define vmfgt_vf_f16m1_b16_m(...) __riscv_vmfgt_vf_f16m1_b16_mu(__VA_ARGS__)
+#define vmfgt_vv_f16m2_b8_m(...) __riscv_vmfgt_vv_f16m2_b8_mu(__VA_ARGS__)
+#define vmfgt_vf_f16m2_b8_m(...) __riscv_vmfgt_vf_f16m2_b8_mu(__VA_ARGS__)
+#define vmfgt_vv_f16m4_b4_m(...) __riscv_vmfgt_vv_f16m4_b4_mu(__VA_ARGS__)
+#define vmfgt_vf_f16m4_b4_m(...) __riscv_vmfgt_vf_f16m4_b4_mu(__VA_ARGS__)
+#define vmfgt_vv_f16m8_b2_m(...) __riscv_vmfgt_vv_f16m8_b2_mu(__VA_ARGS__)
+#define vmfgt_vf_f16m8_b2_m(...) __riscv_vmfgt_vf_f16m8_b2_mu(__VA_ARGS__)
+#define vmfgt_vv_f32mf2_b64_m(...) __riscv_vmfgt_vv_f32mf2_b64_mu(__VA_ARGS__)
+#define vmfgt_vf_f32mf2_b64_m(...) __riscv_vmfgt_vf_f32mf2_b64_mu(__VA_ARGS__)
+#define vmfgt_vv_f32m1_b32_m(...) __riscv_vmfgt_vv_f32m1_b32_mu(__VA_ARGS__)
+#define vmfgt_vf_f32m1_b32_m(...) __riscv_vmfgt_vf_f32m1_b32_mu(__VA_ARGS__)
+#define vmfgt_vv_f32m2_b16_m(...) __riscv_vmfgt_vv_f32m2_b16_mu(__VA_ARGS__)
+#define vmfgt_vf_f32m2_b16_m(...) __riscv_vmfgt_vf_f32m2_b16_mu(__VA_ARGS__)
+#define vmfgt_vv_f32m4_b8_m(...) __riscv_vmfgt_vv_f32m4_b8_mu(__VA_ARGS__)
+#define vmfgt_vf_f32m4_b8_m(...) __riscv_vmfgt_vf_f32m4_b8_mu(__VA_ARGS__)
+#define vmfgt_vv_f32m8_b4_m(...) __riscv_vmfgt_vv_f32m8_b4_mu(__VA_ARGS__)
+#define vmfgt_vf_f32m8_b4_m(...) __riscv_vmfgt_vf_f32m8_b4_mu(__VA_ARGS__)
+#define vmfgt_vv_f64m1_b64_m(...) __riscv_vmfgt_vv_f64m1_b64_mu(__VA_ARGS__)
+#define vmfgt_vf_f64m1_b64_m(...) __riscv_vmfgt_vf_f64m1_b64_mu(__VA_ARGS__)
+#define vmfgt_vv_f64m2_b32_m(...) __riscv_vmfgt_vv_f64m2_b32_mu(__VA_ARGS__)
+#define vmfgt_vf_f64m2_b32_m(...) __riscv_vmfgt_vf_f64m2_b32_mu(__VA_ARGS__)
+#define vmfgt_vv_f64m4_b16_m(...) __riscv_vmfgt_vv_f64m4_b16_mu(__VA_ARGS__)
+#define vmfgt_vf_f64m4_b16_m(...) __riscv_vmfgt_vf_f64m4_b16_mu(__VA_ARGS__)
+#define vmfgt_vv_f64m8_b8_m(...) __riscv_vmfgt_vv_f64m8_b8_mu(__VA_ARGS__)
+#define vmfgt_vf_f64m8_b8_m(...) __riscv_vmfgt_vf_f64m8_b8_mu(__VA_ARGS__)
+#define vmfge_vv_f16mf4_b64_m(...) __riscv_vmfge_vv_f16mf4_b64_mu(__VA_ARGS__)
+#define vmfge_vf_f16mf4_b64_m(...) __riscv_vmfge_vf_f16mf4_b64_mu(__VA_ARGS__)
+#define vmfge_vv_f16mf2_b32_m(...) __riscv_vmfge_vv_f16mf2_b32_mu(__VA_ARGS__)
+#define vmfge_vf_f16mf2_b32_m(...) __riscv_vmfge_vf_f16mf2_b32_mu(__VA_ARGS__)
+#define vmfge_vv_f16m1_b16_m(...) __riscv_vmfge_vv_f16m1_b16_mu(__VA_ARGS__)
+#define vmfge_vf_f16m1_b16_m(...) __riscv_vmfge_vf_f16m1_b16_mu(__VA_ARGS__)
+#define vmfge_vv_f16m2_b8_m(...) __riscv_vmfge_vv_f16m2_b8_mu(__VA_ARGS__)
+#define vmfge_vf_f16m2_b8_m(...) __riscv_vmfge_vf_f16m2_b8_mu(__VA_ARGS__)
+#define vmfge_vv_f16m4_b4_m(...) __riscv_vmfge_vv_f16m4_b4_mu(__VA_ARGS__)
+#define vmfge_vf_f16m4_b4_m(...) __riscv_vmfge_vf_f16m4_b4_mu(__VA_ARGS__)
+#define vmfge_vv_f16m8_b2_m(...) __riscv_vmfge_vv_f16m8_b2_mu(__VA_ARGS__)
+#define vmfge_vf_f16m8_b2_m(...) __riscv_vmfge_vf_f16m8_b2_mu(__VA_ARGS__)
+#define vmfge_vv_f32mf2_b64_m(...) __riscv_vmfge_vv_f32mf2_b64_mu(__VA_ARGS__)
+#define vmfge_vf_f32mf2_b64_m(...) __riscv_vmfge_vf_f32mf2_b64_mu(__VA_ARGS__)
+#define vmfge_vv_f32m1_b32_m(...) __riscv_vmfge_vv_f32m1_b32_mu(__VA_ARGS__)
+#define vmfge_vf_f32m1_b32_m(...) __riscv_vmfge_vf_f32m1_b32_mu(__VA_ARGS__)
+#define vmfge_vv_f32m2_b16_m(...) __riscv_vmfge_vv_f32m2_b16_mu(__VA_ARGS__)
+#define vmfge_vf_f32m2_b16_m(...) __riscv_vmfge_vf_f32m2_b16_mu(__VA_ARGS__)
+#define vmfge_vv_f32m4_b8_m(...) __riscv_vmfge_vv_f32m4_b8_mu(__VA_ARGS__)
+#define vmfge_vf_f32m4_b8_m(...) __riscv_vmfge_vf_f32m4_b8_mu(__VA_ARGS__)
+#define vmfge_vv_f32m8_b4_m(...) __riscv_vmfge_vv_f32m8_b4_mu(__VA_ARGS__)
+#define vmfge_vf_f32m8_b4_m(...) __riscv_vmfge_vf_f32m8_b4_mu(__VA_ARGS__)
+#define vmfge_vv_f64m1_b64_m(...) __riscv_vmfge_vv_f64m1_b64_mu(__VA_ARGS__)
+#define vmfge_vf_f64m1_b64_m(...) __riscv_vmfge_vf_f64m1_b64_mu(__VA_ARGS__)
+#define vmfge_vv_f64m2_b32_m(...) __riscv_vmfge_vv_f64m2_b32_mu(__VA_ARGS__)
+#define vmfge_vf_f64m2_b32_m(...) __riscv_vmfge_vf_f64m2_b32_mu(__VA_ARGS__)
+#define vmfge_vv_f64m4_b16_m(...) __riscv_vmfge_vv_f64m4_b16_mu(__VA_ARGS__)
+#define vmfge_vf_f64m4_b16_m(...) __riscv_vmfge_vf_f64m4_b16_mu(__VA_ARGS__)
+#define vmfge_vv_f64m8_b8_m(...) __riscv_vmfge_vv_f64m8_b8_mu(__VA_ARGS__)
+#define vmfge_vf_f64m8_b8_m(...) __riscv_vmfge_vf_f64m8_b8_mu(__VA_ARGS__)
+#define vfclass_v_u16mf4(...) __riscv_vfclass_v_u16mf4(__VA_ARGS__)
+#define vfclass_v_u16mf2(...) __riscv_vfclass_v_u16mf2(__VA_ARGS__)
+#define vfclass_v_u16m1(...) __riscv_vfclass_v_u16m1(__VA_ARGS__)
+#define vfclass_v_u16m2(...) __riscv_vfclass_v_u16m2(__VA_ARGS__)
+#define vfclass_v_u16m4(...) __riscv_vfclass_v_u16m4(__VA_ARGS__)
+#define vfclass_v_u16m8(...) __riscv_vfclass_v_u16m8(__VA_ARGS__)
+#define vfclass_v_u32mf2(...) __riscv_vfclass_v_u32mf2(__VA_ARGS__)
+#define vfclass_v_u32m1(...) __riscv_vfclass_v_u32m1(__VA_ARGS__)
+#define vfclass_v_u32m2(...) __riscv_vfclass_v_u32m2(__VA_ARGS__)
+#define vfclass_v_u32m4(...) __riscv_vfclass_v_u32m4(__VA_ARGS__)
+#define vfclass_v_u32m8(...) __riscv_vfclass_v_u32m8(__VA_ARGS__)
+#define vfclass_v_u64m1(...) __riscv_vfclass_v_u64m1(__VA_ARGS__)
+#define vfclass_v_u64m2(...) __riscv_vfclass_v_u64m2(__VA_ARGS__)
+#define vfclass_v_u64m4(...) __riscv_vfclass_v_u64m4(__VA_ARGS__)
+#define vfclass_v_u64m8(...) __riscv_vfclass_v_u64m8(__VA_ARGS__)
+// masked functions: vfclass; the _m names forward to the _tumu intrinsics
+#define vfclass_v_u16mf4_m(...) __riscv_vfclass_v_u16mf4_tumu(__VA_ARGS__)
+#define vfclass_v_u16mf2_m(...) __riscv_vfclass_v_u16mf2_tumu(__VA_ARGS__)
+#define vfclass_v_u16m1_m(...) __riscv_vfclass_v_u16m1_tumu(__VA_ARGS__)
+#define vfclass_v_u16m2_m(...) __riscv_vfclass_v_u16m2_tumu(__VA_ARGS__)
+#define vfclass_v_u16m4_m(...) __riscv_vfclass_v_u16m4_tumu(__VA_ARGS__)
+#define vfclass_v_u16m8_m(...) __riscv_vfclass_v_u16m8_tumu(__VA_ARGS__)
+#define vfclass_v_u32mf2_m(...) __riscv_vfclass_v_u32mf2_tumu(__VA_ARGS__)
+#define vfclass_v_u32m1_m(...) __riscv_vfclass_v_u32m1_tumu(__VA_ARGS__)
+#define vfclass_v_u32m2_m(...) __riscv_vfclass_v_u32m2_tumu(__VA_ARGS__)
+#define vfclass_v_u32m4_m(...) __riscv_vfclass_v_u32m4_tumu(__VA_ARGS__)
+#define vfclass_v_u32m8_m(...) __riscv_vfclass_v_u32m8_tumu(__VA_ARGS__)
+#define vfclass_v_u64m1_m(...) __riscv_vfclass_v_u64m1_tumu(__VA_ARGS__)
+#define vfclass_v_u64m2_m(...) __riscv_vfclass_v_u64m2_tumu(__VA_ARGS__)
+#define vfclass_v_u64m4_m(...) __riscv_vfclass_v_u64m4_tumu(__VA_ARGS__)
+#define vfclass_v_u64m8_m(...) __riscv_vfclass_v_u64m8_tumu(__VA_ARGS__)
+#define vmerge_vvm_f16mf4(mask, op1, op2, vl) __riscv_vmerge_vvm_f16mf4((op1), (op2), (mask), (vl))
+#define vfmerge_vfm_f16mf4(mask, op1, op2, vl) __riscv_vfmerge_vfm_f16mf4((op1), (op2), (mask), (vl))
+#define vmerge_vvm_f16mf2(mask, op1, op2, vl) __riscv_vmerge_vvm_f16mf2((op1), (op2), (mask), (vl))
+#define vfmerge_vfm_f16mf2(mask, op1, op2, vl) __riscv_vfmerge_vfm_f16mf2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_f16m1(mask, op1, op2, vl) __riscv_vmerge_vvm_f16m1((op1), (op2), (mask), (vl))
+#define vfmerge_vfm_f16m1(mask, op1, op2, vl) __riscv_vfmerge_vfm_f16m1((op1), (op2), (mask), (vl))
+#define vmerge_vvm_f16m2(mask, op1, op2, vl) __riscv_vmerge_vvm_f16m2((op1), (op2), (mask), (vl))
+#define vfmerge_vfm_f16m2(mask, op1, op2, vl) __riscv_vfmerge_vfm_f16m2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_f16m4(mask, op1, op2, vl) __riscv_vmerge_vvm_f16m4((op1), (op2), (mask), (vl))
+#define vfmerge_vfm_f16m4(mask, op1, op2, vl) __riscv_vfmerge_vfm_f16m4((op1), (op2), (mask), (vl))
+#define vmerge_vvm_f16m8(mask, op1, op2, vl) __riscv_vmerge_vvm_f16m8((op1), (op2), (mask), (vl))
+#define vfmerge_vfm_f16m8(mask, op1, op2, vl) __riscv_vfmerge_vfm_f16m8((op1), (op2), (mask), (vl))
+#define vmerge_vvm_f32mf2(mask, op1, op2, vl) __riscv_vmerge_vvm_f32mf2((op1), (op2), (mask), (vl))
+#define vfmerge_vfm_f32mf2(mask, op1, op2, vl) __riscv_vfmerge_vfm_f32mf2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_f32m1(mask, op1, op2, vl) __riscv_vmerge_vvm_f32m1((op1), (op2), (mask), (vl))
+#define vfmerge_vfm_f32m1(mask, op1, op2, vl) __riscv_vfmerge_vfm_f32m1((op1), (op2), (mask), (vl))
+#define vmerge_vvm_f32m2(mask, op1, op2, vl) __riscv_vmerge_vvm_f32m2((op1), (op2), (mask), (vl))
+#define vfmerge_vfm_f32m2(mask, op1, op2, vl) __riscv_vfmerge_vfm_f32m2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_f32m4(mask, op1, op2, vl) __riscv_vmerge_vvm_f32m4((op1), (op2), (mask), (vl))
+#define vfmerge_vfm_f32m4(mask, op1, op2, vl) __riscv_vfmerge_vfm_f32m4((op1), (op2), (mask), (vl))
+#define vmerge_vvm_f32m8(mask, op1, op2, vl) __riscv_vmerge_vvm_f32m8((op1), (op2), (mask), (vl))
+#define vfmerge_vfm_f32m8(mask, op1, op2, vl) __riscv_vfmerge_vfm_f32m8((op1), (op2), (mask), (vl))
+#define vmerge_vvm_f64m1(mask, op1, op2, vl) __riscv_vmerge_vvm_f64m1((op1), (op2), (mask), (vl))
+#define vfmerge_vfm_f64m1(mask, op1, op2, vl) __riscv_vfmerge_vfm_f64m1((op1), (op2), (mask), (vl))
+#define vmerge_vvm_f64m2(mask, op1, op2, vl) __riscv_vmerge_vvm_f64m2((op1), (op2), (mask), (vl))
+#define vfmerge_vfm_f64m2(mask, op1, op2, vl) __riscv_vfmerge_vfm_f64m2((op1), (op2), (mask), (vl))
+#define vmerge_vvm_f64m4(mask, op1, op2, vl) __riscv_vmerge_vvm_f64m4((op1), (op2), (mask), (vl))
+#define vfmerge_vfm_f64m4(mask, op1, op2, vl) __riscv_vfmerge_vfm_f64m4((op1), (op2), (mask), (vl))
+#define vmerge_vvm_f64m8(mask, op1, op2, vl) __riscv_vmerge_vvm_f64m8((op1), (op2), (mask), (vl))
+#define vfmerge_vfm_f64m8(mask, op1, op2, vl) __riscv_vfmerge_vfm_f64m8((op1), (op2), (mask), (vl))
+#define vmv_v_v_f16mf4(...) __riscv_vmv_v_v_f16mf4(__VA_ARGS__)
+#define vfmv_v_f_f16mf4(...) __riscv_vfmv_v_f_f16mf4(__VA_ARGS__)
+#define vmv_v_v_f16mf2(...) __riscv_vmv_v_v_f16mf2(__VA_ARGS__)
+#define vfmv_v_f_f16mf2(...) __riscv_vfmv_v_f_f16mf2(__VA_ARGS__)
+#define vmv_v_v_f16m1(...) __riscv_vmv_v_v_f16m1(__VA_ARGS__)
+#define vfmv_v_f_f16m1(...) __riscv_vfmv_v_f_f16m1(__VA_ARGS__)
+#define vmv_v_v_f16m2(...) __riscv_vmv_v_v_f16m2(__VA_ARGS__)
+#define vfmv_v_f_f16m2(...) __riscv_vfmv_v_f_f16m2(__VA_ARGS__)
+#define vmv_v_v_f16m4(...) __riscv_vmv_v_v_f16m4(__VA_ARGS__)
+#define vfmv_v_f_f16m4(...) __riscv_vfmv_v_f_f16m4(__VA_ARGS__)
+#define vmv_v_v_f16m8(...) __riscv_vmv_v_v_f16m8(__VA_ARGS__)
+#define vfmv_v_f_f16m8(...) __riscv_vfmv_v_f_f16m8(__VA_ARGS__)
+#define vmv_v_v_f32mf2(...) __riscv_vmv_v_v_f32mf2(__VA_ARGS__)
+#define vfmv_v_f_f32mf2(...) __riscv_vfmv_v_f_f32mf2(__VA_ARGS__)
+#define vmv_v_v_f32m1(...) __riscv_vmv_v_v_f32m1(__VA_ARGS__)
+#define vfmv_v_f_f32m1(...) __riscv_vfmv_v_f_f32m1(__VA_ARGS__)
+#define vmv_v_v_f32m2(...) __riscv_vmv_v_v_f32m2(__VA_ARGS__)
+#define vfmv_v_f_f32m2(...) __riscv_vfmv_v_f_f32m2(__VA_ARGS__)
+#define vmv_v_v_f32m4(...) __riscv_vmv_v_v_f32m4(__VA_ARGS__)
+#define vfmv_v_f_f32m4(...) __riscv_vfmv_v_f_f32m4(__VA_ARGS__)
+#define vmv_v_v_f32m8(...) __riscv_vmv_v_v_f32m8(__VA_ARGS__)
+#define vfmv_v_f_f32m8(...) __riscv_vfmv_v_f_f32m8(__VA_ARGS__)
+#define vmv_v_v_f64m1(...) __riscv_vmv_v_v_f64m1(__VA_ARGS__)
+#define vfmv_v_f_f64m1(...) __riscv_vfmv_v_f_f64m1(__VA_ARGS__)
+#define vmv_v_v_f64m2(...) __riscv_vmv_v_v_f64m2(__VA_ARGS__)
+#define vfmv_v_f_f64m2(...) __riscv_vfmv_v_f_f64m2(__VA_ARGS__)
+#define vmv_v_v_f64m4(...) __riscv_vmv_v_v_f64m4(__VA_ARGS__)
+#define vfmv_v_f_f64m4(...) __riscv_vfmv_v_f_f64m4(__VA_ARGS__)
+#define vmv_v_v_f64m8(...) __riscv_vmv_v_v_f64m8(__VA_ARGS__)
+#define vfmv_v_f_f64m8(...) __riscv_vfmv_v_f_f64m8(__VA_ARGS__)
+#define vfcvt_x_f_v_i16mf4(...) __riscv_vfcvt_x_f_v_i16mf4(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i16mf4(...) __riscv_vfcvt_rtz_x_f_v_i16mf4(__VA_ARGS__)
+#define vfcvt_x_f_v_i16mf2(...) __riscv_vfcvt_x_f_v_i16mf2(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i16mf2(...) __riscv_vfcvt_rtz_x_f_v_i16mf2(__VA_ARGS__)
+#define vfcvt_x_f_v_i16m1(...) __riscv_vfcvt_x_f_v_i16m1(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i16m1(...) __riscv_vfcvt_rtz_x_f_v_i16m1(__VA_ARGS__)
+#define vfcvt_x_f_v_i16m2(...) __riscv_vfcvt_x_f_v_i16m2(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i16m2(...) __riscv_vfcvt_rtz_x_f_v_i16m2(__VA_ARGS__)
+#define vfcvt_x_f_v_i16m4(...) __riscv_vfcvt_x_f_v_i16m4(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i16m4(...) __riscv_vfcvt_rtz_x_f_v_i16m4(__VA_ARGS__)
+#define vfcvt_x_f_v_i16m8(...) __riscv_vfcvt_x_f_v_i16m8(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i16m8(...) __riscv_vfcvt_rtz_x_f_v_i16m8(__VA_ARGS__)
+#define vfcvt_xu_f_v_u16mf4(...) __riscv_vfcvt_xu_f_v_u16mf4(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u16mf4(...) __riscv_vfcvt_rtz_xu_f_v_u16mf4(__VA_ARGS__)
+#define vfcvt_xu_f_v_u16mf2(...) __riscv_vfcvt_xu_f_v_u16mf2(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u16mf2(...) __riscv_vfcvt_rtz_xu_f_v_u16mf2(__VA_ARGS__)
+#define vfcvt_xu_f_v_u16m1(...) __riscv_vfcvt_xu_f_v_u16m1(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u16m1(...) __riscv_vfcvt_rtz_xu_f_v_u16m1(__VA_ARGS__)
+#define vfcvt_xu_f_v_u16m2(...) __riscv_vfcvt_xu_f_v_u16m2(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u16m2(...) __riscv_vfcvt_rtz_xu_f_v_u16m2(__VA_ARGS__)
+#define vfcvt_xu_f_v_u16m4(...) __riscv_vfcvt_xu_f_v_u16m4(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u16m4(...) __riscv_vfcvt_rtz_xu_f_v_u16m4(__VA_ARGS__)
+#define vfcvt_xu_f_v_u16m8(...) __riscv_vfcvt_xu_f_v_u16m8(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u16m8(...) __riscv_vfcvt_rtz_xu_f_v_u16m8(__VA_ARGS__)
+#define vfcvt_f_x_v_f16mf4(...) __riscv_vfcvt_f_x_v_f16mf4(__VA_ARGS__)
+#define vfcvt_f_x_v_f16mf2(...) __riscv_vfcvt_f_x_v_f16mf2(__VA_ARGS__)
+#define vfcvt_f_x_v_f16m1(...) __riscv_vfcvt_f_x_v_f16m1(__VA_ARGS__)
+#define vfcvt_f_x_v_f16m2(...) __riscv_vfcvt_f_x_v_f16m2(__VA_ARGS__)
+#define vfcvt_f_x_v_f16m4(...) __riscv_vfcvt_f_x_v_f16m4(__VA_ARGS__)
+#define vfcvt_f_x_v_f16m8(...) __riscv_vfcvt_f_x_v_f16m8(__VA_ARGS__)
+#define vfcvt_f_xu_v_f16mf4(...) __riscv_vfcvt_f_xu_v_f16mf4(__VA_ARGS__)
+#define vfcvt_f_xu_v_f16mf2(...) __riscv_vfcvt_f_xu_v_f16mf2(__VA_ARGS__)
+#define vfcvt_f_xu_v_f16m1(...) __riscv_vfcvt_f_xu_v_f16m1(__VA_ARGS__)
+#define vfcvt_f_xu_v_f16m2(...) __riscv_vfcvt_f_xu_v_f16m2(__VA_ARGS__)
+#define vfcvt_f_xu_v_f16m4(...) __riscv_vfcvt_f_xu_v_f16m4(__VA_ARGS__)
+#define vfcvt_f_xu_v_f16m8(...) __riscv_vfcvt_f_xu_v_f16m8(__VA_ARGS__)
+#define vfcvt_x_f_v_i32mf2(...) __riscv_vfcvt_x_f_v_i32mf2(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i32mf2(...) __riscv_vfcvt_rtz_x_f_v_i32mf2(__VA_ARGS__)
+#define vfcvt_x_f_v_i32m1(...) __riscv_vfcvt_x_f_v_i32m1(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i32m1(...) __riscv_vfcvt_rtz_x_f_v_i32m1(__VA_ARGS__)
+#define vfcvt_x_f_v_i32m2(...) __riscv_vfcvt_x_f_v_i32m2(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i32m2(...) __riscv_vfcvt_rtz_x_f_v_i32m2(__VA_ARGS__)
+#define vfcvt_x_f_v_i32m4(...) __riscv_vfcvt_x_f_v_i32m4(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i32m4(...) __riscv_vfcvt_rtz_x_f_v_i32m4(__VA_ARGS__)
+#define vfcvt_x_f_v_i32m8(...) __riscv_vfcvt_x_f_v_i32m8(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i32m8(...) __riscv_vfcvt_rtz_x_f_v_i32m8(__VA_ARGS__)
+#define vfcvt_xu_f_v_u32mf2(...) __riscv_vfcvt_xu_f_v_u32mf2(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u32mf2(...) __riscv_vfcvt_rtz_xu_f_v_u32mf2(__VA_ARGS__)
+#define vfcvt_xu_f_v_u32m1(...) __riscv_vfcvt_xu_f_v_u32m1(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u32m1(...) __riscv_vfcvt_rtz_xu_f_v_u32m1(__VA_ARGS__)
+#define vfcvt_xu_f_v_u32m2(...) __riscv_vfcvt_xu_f_v_u32m2(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u32m2(...) __riscv_vfcvt_rtz_xu_f_v_u32m2(__VA_ARGS__)
+#define vfcvt_xu_f_v_u32m4(...) __riscv_vfcvt_xu_f_v_u32m4(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u32m4(...) __riscv_vfcvt_rtz_xu_f_v_u32m4(__VA_ARGS__)
+#define vfcvt_xu_f_v_u32m8(...) __riscv_vfcvt_xu_f_v_u32m8(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u32m8(...) __riscv_vfcvt_rtz_xu_f_v_u32m8(__VA_ARGS__)
+#define vfcvt_f_x_v_f32mf2(...) __riscv_vfcvt_f_x_v_f32mf2(__VA_ARGS__)
+#define vfcvt_f_x_v_f32m1(...) __riscv_vfcvt_f_x_v_f32m1(__VA_ARGS__)
+#define vfcvt_f_x_v_f32m2(...) __riscv_vfcvt_f_x_v_f32m2(__VA_ARGS__)
+#define vfcvt_f_x_v_f32m4(...) __riscv_vfcvt_f_x_v_f32m4(__VA_ARGS__)
+#define vfcvt_f_x_v_f32m8(...) __riscv_vfcvt_f_x_v_f32m8(__VA_ARGS__)
+#define vfcvt_f_xu_v_f32mf2(...) __riscv_vfcvt_f_xu_v_f32mf2(__VA_ARGS__)
+#define vfcvt_f_xu_v_f32m1(...) __riscv_vfcvt_f_xu_v_f32m1(__VA_ARGS__)
+#define vfcvt_f_xu_v_f32m2(...) __riscv_vfcvt_f_xu_v_f32m2(__VA_ARGS__)
+#define vfcvt_f_xu_v_f32m4(...) __riscv_vfcvt_f_xu_v_f32m4(__VA_ARGS__)
+#define vfcvt_f_xu_v_f32m8(...) __riscv_vfcvt_f_xu_v_f32m8(__VA_ARGS__)
+#define vfcvt_x_f_v_i64m1(...) __riscv_vfcvt_x_f_v_i64m1(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i64m1(...) __riscv_vfcvt_rtz_x_f_v_i64m1(__VA_ARGS__)
+#define vfcvt_x_f_v_i64m2(...) __riscv_vfcvt_x_f_v_i64m2(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i64m2(...) __riscv_vfcvt_rtz_x_f_v_i64m2(__VA_ARGS__)
+#define vfcvt_x_f_v_i64m4(...) __riscv_vfcvt_x_f_v_i64m4(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i64m4(...) __riscv_vfcvt_rtz_x_f_v_i64m4(__VA_ARGS__)
+#define vfcvt_x_f_v_i64m8(...) __riscv_vfcvt_x_f_v_i64m8(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i64m8(...) __riscv_vfcvt_rtz_x_f_v_i64m8(__VA_ARGS__)
+#define vfcvt_xu_f_v_u64m1(...) __riscv_vfcvt_xu_f_v_u64m1(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u64m1(...) __riscv_vfcvt_rtz_xu_f_v_u64m1(__VA_ARGS__)
+#define vfcvt_xu_f_v_u64m2(...) __riscv_vfcvt_xu_f_v_u64m2(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u64m2(...) __riscv_vfcvt_rtz_xu_f_v_u64m2(__VA_ARGS__)
+#define vfcvt_xu_f_v_u64m4(...) __riscv_vfcvt_xu_f_v_u64m4(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u64m4(...) __riscv_vfcvt_rtz_xu_f_v_u64m4(__VA_ARGS__)
+#define vfcvt_xu_f_v_u64m8(...) __riscv_vfcvt_xu_f_v_u64m8(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u64m8(...) __riscv_vfcvt_rtz_xu_f_v_u64m8(__VA_ARGS__)
+#define vfcvt_f_x_v_f64m1(...) __riscv_vfcvt_f_x_v_f64m1(__VA_ARGS__)
+#define vfcvt_f_x_v_f64m2(...) __riscv_vfcvt_f_x_v_f64m2(__VA_ARGS__)
+#define vfcvt_f_x_v_f64m4(...) __riscv_vfcvt_f_x_v_f64m4(__VA_ARGS__)
+#define vfcvt_f_x_v_f64m8(...) __riscv_vfcvt_f_x_v_f64m8(__VA_ARGS__)
+#define vfcvt_f_xu_v_f64m1(...) __riscv_vfcvt_f_xu_v_f64m1(__VA_ARGS__)
+#define vfcvt_f_xu_v_f64m2(...) __riscv_vfcvt_f_xu_v_f64m2(__VA_ARGS__)
+#define vfcvt_f_xu_v_f64m4(...) __riscv_vfcvt_f_xu_v_f64m4(__VA_ARGS__)
+#define vfcvt_f_xu_v_f64m8(...) __riscv_vfcvt_f_xu_v_f64m8(__VA_ARGS__)
+// masked functions: vfcvt conversions
+#define vfcvt_x_f_v_i16mf4_m(...) __riscv_vfcvt_x_f_v_i16mf4_tumu(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i16mf4_m(...) __riscv_vfcvt_rtz_x_f_v_i16mf4_tumu(__VA_ARGS__)
+#define vfcvt_x_f_v_i16mf2_m(...) __riscv_vfcvt_x_f_v_i16mf2_tumu(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i16mf2_m(...) __riscv_vfcvt_rtz_x_f_v_i16mf2_tumu(__VA_ARGS__)
+#define vfcvt_x_f_v_i16m1_m(...) __riscv_vfcvt_x_f_v_i16m1_tumu(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i16m1_m(...) __riscv_vfcvt_rtz_x_f_v_i16m1_tumu(__VA_ARGS__)
+#define vfcvt_x_f_v_i16m2_m(...) __riscv_vfcvt_x_f_v_i16m2_tumu(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i16m2_m(...) __riscv_vfcvt_rtz_x_f_v_i16m2_tumu(__VA_ARGS__)
+#define vfcvt_x_f_v_i16m4_m(...) __riscv_vfcvt_x_f_v_i16m4_tumu(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i16m4_m(...) __riscv_vfcvt_rtz_x_f_v_i16m4_tumu(__VA_ARGS__)
+#define vfcvt_x_f_v_i16m8_m(...) __riscv_vfcvt_x_f_v_i16m8_tumu(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i16m8_m(...) __riscv_vfcvt_rtz_x_f_v_i16m8_tumu(__VA_ARGS__)
+#define vfcvt_xu_f_v_u16mf4_m(...) __riscv_vfcvt_xu_f_v_u16mf4_tumu(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u16mf4_m(...) __riscv_vfcvt_rtz_xu_f_v_u16mf4_tumu(__VA_ARGS__)
+#define vfcvt_xu_f_v_u16mf2_m(...) __riscv_vfcvt_xu_f_v_u16mf2_tumu(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u16mf2_m(...) __riscv_vfcvt_rtz_xu_f_v_u16mf2_tumu(__VA_ARGS__)
+#define vfcvt_xu_f_v_u16m1_m(...) __riscv_vfcvt_xu_f_v_u16m1_tumu(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u16m1_m(...) __riscv_vfcvt_rtz_xu_f_v_u16m1_tumu(__VA_ARGS__)
+#define vfcvt_xu_f_v_u16m2_m(...) __riscv_vfcvt_xu_f_v_u16m2_tumu(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u16m2_m(...) __riscv_vfcvt_rtz_xu_f_v_u16m2_tumu(__VA_ARGS__)
+#define vfcvt_xu_f_v_u16m4_m(...) __riscv_vfcvt_xu_f_v_u16m4_tumu(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u16m4_m(...) __riscv_vfcvt_rtz_xu_f_v_u16m4_tumu(__VA_ARGS__)
+#define vfcvt_xu_f_v_u16m8_m(...) __riscv_vfcvt_xu_f_v_u16m8_tumu(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u16m8_m(...) __riscv_vfcvt_rtz_xu_f_v_u16m8_tumu(__VA_ARGS__)
+#define vfcvt_f_x_v_f16mf4_m(...) __riscv_vfcvt_f_x_v_f16mf4_tumu(__VA_ARGS__)
+#define vfcvt_f_x_v_f16mf2_m(...) __riscv_vfcvt_f_x_v_f16mf2_tumu(__VA_ARGS__)
+#define vfcvt_f_x_v_f16m1_m(...) __riscv_vfcvt_f_x_v_f16m1_tumu(__VA_ARGS__)
+#define vfcvt_f_x_v_f16m2_m(...) __riscv_vfcvt_f_x_v_f16m2_tumu(__VA_ARGS__)
+#define vfcvt_f_x_v_f16m4_m(...) __riscv_vfcvt_f_x_v_f16m4_tumu(__VA_ARGS__)
+#define vfcvt_f_x_v_f16m8_m(...) __riscv_vfcvt_f_x_v_f16m8_tumu(__VA_ARGS__)
+#define vfcvt_f_xu_v_f16mf4_m(...) __riscv_vfcvt_f_xu_v_f16mf4_tumu(__VA_ARGS__)
+#define vfcvt_f_xu_v_f16mf2_m(...) __riscv_vfcvt_f_xu_v_f16mf2_tumu(__VA_ARGS__)
+#define vfcvt_f_xu_v_f16m1_m(...) __riscv_vfcvt_f_xu_v_f16m1_tumu(__VA_ARGS__)
+#define vfcvt_f_xu_v_f16m2_m(...) __riscv_vfcvt_f_xu_v_f16m2_tumu(__VA_ARGS__)
+#define vfcvt_f_xu_v_f16m4_m(...) __riscv_vfcvt_f_xu_v_f16m4_tumu(__VA_ARGS__)
+#define vfcvt_f_xu_v_f16m8_m(...) __riscv_vfcvt_f_xu_v_f16m8_tumu(__VA_ARGS__)
+#define vfcvt_x_f_v_i32mf2_m(...) __riscv_vfcvt_x_f_v_i32mf2_tumu(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i32mf2_m(...) __riscv_vfcvt_rtz_x_f_v_i32mf2_tumu(__VA_ARGS__)
+#define vfcvt_x_f_v_i32m1_m(...) __riscv_vfcvt_x_f_v_i32m1_tumu(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i32m1_m(...) __riscv_vfcvt_rtz_x_f_v_i32m1_tumu(__VA_ARGS__)
+#define vfcvt_x_f_v_i32m2_m(...) __riscv_vfcvt_x_f_v_i32m2_tumu(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i32m2_m(...) __riscv_vfcvt_rtz_x_f_v_i32m2_tumu(__VA_ARGS__)
+#define vfcvt_x_f_v_i32m4_m(...) __riscv_vfcvt_x_f_v_i32m4_tumu(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i32m4_m(...) __riscv_vfcvt_rtz_x_f_v_i32m4_tumu(__VA_ARGS__)
+#define vfcvt_x_f_v_i32m8_m(...) __riscv_vfcvt_x_f_v_i32m8_tumu(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i32m8_m(...) __riscv_vfcvt_rtz_x_f_v_i32m8_tumu(__VA_ARGS__)
+#define vfcvt_xu_f_v_u32mf2_m(...) __riscv_vfcvt_xu_f_v_u32mf2_tumu(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u32mf2_m(...) __riscv_vfcvt_rtz_xu_f_v_u32mf2_tumu(__VA_ARGS__)
+#define vfcvt_xu_f_v_u32m1_m(...) __riscv_vfcvt_xu_f_v_u32m1_tumu(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u32m1_m(...) __riscv_vfcvt_rtz_xu_f_v_u32m1_tumu(__VA_ARGS__)
+#define vfcvt_xu_f_v_u32m2_m(...) __riscv_vfcvt_xu_f_v_u32m2_tumu(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u32m2_m(...) __riscv_vfcvt_rtz_xu_f_v_u32m2_tumu(__VA_ARGS__)
+#define vfcvt_xu_f_v_u32m4_m(...) __riscv_vfcvt_xu_f_v_u32m4_tumu(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u32m4_m(...) __riscv_vfcvt_rtz_xu_f_v_u32m4_tumu(__VA_ARGS__)
+#define vfcvt_xu_f_v_u32m8_m(...) __riscv_vfcvt_xu_f_v_u32m8_tumu(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u32m8_m(...) __riscv_vfcvt_rtz_xu_f_v_u32m8_tumu(__VA_ARGS__)
+#define vfcvt_f_x_v_f32mf2_m(...) __riscv_vfcvt_f_x_v_f32mf2_tumu(__VA_ARGS__)
+#define vfcvt_f_x_v_f32m1_m(...) __riscv_vfcvt_f_x_v_f32m1_tumu(__VA_ARGS__)
+#define vfcvt_f_x_v_f32m2_m(...) __riscv_vfcvt_f_x_v_f32m2_tumu(__VA_ARGS__)
+#define vfcvt_f_x_v_f32m4_m(...) __riscv_vfcvt_f_x_v_f32m4_tumu(__VA_ARGS__)
+#define vfcvt_f_x_v_f32m8_m(...) __riscv_vfcvt_f_x_v_f32m8_tumu(__VA_ARGS__)
+#define vfcvt_f_xu_v_f32mf2_m(...) __riscv_vfcvt_f_xu_v_f32mf2_tumu(__VA_ARGS__)
+#define vfcvt_f_xu_v_f32m1_m(...) __riscv_vfcvt_f_xu_v_f32m1_tumu(__VA_ARGS__)
+#define vfcvt_f_xu_v_f32m2_m(...) __riscv_vfcvt_f_xu_v_f32m2_tumu(__VA_ARGS__)
+#define vfcvt_f_xu_v_f32m4_m(...) __riscv_vfcvt_f_xu_v_f32m4_tumu(__VA_ARGS__)
+#define vfcvt_f_xu_v_f32m8_m(...) __riscv_vfcvt_f_xu_v_f32m8_tumu(__VA_ARGS__)
+#define vfcvt_x_f_v_i64m1_m(...) __riscv_vfcvt_x_f_v_i64m1_tumu(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i64m1_m(...) __riscv_vfcvt_rtz_x_f_v_i64m1_tumu(__VA_ARGS__)
+#define vfcvt_x_f_v_i64m2_m(...) __riscv_vfcvt_x_f_v_i64m2_tumu(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i64m2_m(...) __riscv_vfcvt_rtz_x_f_v_i64m2_tumu(__VA_ARGS__)
+#define vfcvt_x_f_v_i64m4_m(...) __riscv_vfcvt_x_f_v_i64m4_tumu(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i64m4_m(...) __riscv_vfcvt_rtz_x_f_v_i64m4_tumu(__VA_ARGS__)
+#define vfcvt_x_f_v_i64m8_m(...) __riscv_vfcvt_x_f_v_i64m8_tumu(__VA_ARGS__)
+#define vfcvt_rtz_x_f_v_i64m8_m(...) __riscv_vfcvt_rtz_x_f_v_i64m8_tumu(__VA_ARGS__)
+#define vfcvt_xu_f_v_u64m1_m(...) __riscv_vfcvt_xu_f_v_u64m1_tumu(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u64m1_m(...) __riscv_vfcvt_rtz_xu_f_v_u64m1_tumu(__VA_ARGS__)
+#define vfcvt_xu_f_v_u64m2_m(...) __riscv_vfcvt_xu_f_v_u64m2_tumu(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u64m2_m(...) __riscv_vfcvt_rtz_xu_f_v_u64m2_tumu(__VA_ARGS__)
+#define vfcvt_xu_f_v_u64m4_m(...) __riscv_vfcvt_xu_f_v_u64m4_tumu(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u64m4_m(...) __riscv_vfcvt_rtz_xu_f_v_u64m4_tumu(__VA_ARGS__)
+#define vfcvt_xu_f_v_u64m8_m(...) __riscv_vfcvt_xu_f_v_u64m8_tumu(__VA_ARGS__)
+#define vfcvt_rtz_xu_f_v_u64m8_m(...) __riscv_vfcvt_rtz_xu_f_v_u64m8_tumu(__VA_ARGS__)
+#define vfcvt_f_x_v_f64m1_m(...) __riscv_vfcvt_f_x_v_f64m1_tumu(__VA_ARGS__)
+#define vfcvt_f_x_v_f64m2_m(...) __riscv_vfcvt_f_x_v_f64m2_tumu(__VA_ARGS__)
+#define vfcvt_f_x_v_f64m4_m(...) __riscv_vfcvt_f_x_v_f64m4_tumu(__VA_ARGS__)
+#define vfcvt_f_x_v_f64m8_m(...) __riscv_vfcvt_f_x_v_f64m8_tumu(__VA_ARGS__)
+#define vfcvt_f_xu_v_f64m1_m(...) __riscv_vfcvt_f_xu_v_f64m1_tumu(__VA_ARGS__)
+#define vfcvt_f_xu_v_f64m2_m(...) __riscv_vfcvt_f_xu_v_f64m2_tumu(__VA_ARGS__)
+#define vfcvt_f_xu_v_f64m4_m(...) __riscv_vfcvt_f_xu_v_f64m4_tumu(__VA_ARGS__)
+#define vfcvt_f_xu_v_f64m8_m(...) __riscv_vfcvt_f_xu_v_f64m8_tumu(__VA_ARGS__)
+#define vwcvt_x_x_v_i16mf4(...) __riscv_vwcvt_x_x_v_i16mf4(__VA_ARGS__)
+#define vwcvt_x_x_v_i16mf2(...) __riscv_vwcvt_x_x_v_i16mf2(__VA_ARGS__)
+#define vwcvt_x_x_v_i16m1(...) __riscv_vwcvt_x_x_v_i16m1(__VA_ARGS__)
+#define vwcvt_x_x_v_i16m2(...) __riscv_vwcvt_x_x_v_i16m2(__VA_ARGS__)
+#define vwcvt_x_x_v_i16m4(...) __riscv_vwcvt_x_x_v_i16m4(__VA_ARGS__)
+#define vwcvt_x_x_v_i16m8(...) __riscv_vwcvt_x_x_v_i16m8(__VA_ARGS__)
+#define vwcvtu_x_x_v_u16mf4(...) __riscv_vwcvtu_x_x_v_u16mf4(__VA_ARGS__)
+#define vwcvtu_x_x_v_u16mf2(...) __riscv_vwcvtu_x_x_v_u16mf2(__VA_ARGS__)
+#define vwcvtu_x_x_v_u16m1(...) __riscv_vwcvtu_x_x_v_u16m1(__VA_ARGS__)
+#define vwcvtu_x_x_v_u16m2(...) __riscv_vwcvtu_x_x_v_u16m2(__VA_ARGS__)
+#define vwcvtu_x_x_v_u16m4(...) __riscv_vwcvtu_x_x_v_u16m4(__VA_ARGS__)
+#define vwcvtu_x_x_v_u16m8(...) __riscv_vwcvtu_x_x_v_u16m8(__VA_ARGS__)
+#define vfwcvt_f_x_v_f16mf4(...) __riscv_vfwcvt_f_x_v_f16mf4(__VA_ARGS__)
+#define vfwcvt_f_x_v_f16mf2(...) __riscv_vfwcvt_f_x_v_f16mf2(__VA_ARGS__)
+#define vfwcvt_f_x_v_f16m1(...) __riscv_vfwcvt_f_x_v_f16m1(__VA_ARGS__)
+#define vfwcvt_f_x_v_f16m2(...) __riscv_vfwcvt_f_x_v_f16m2(__VA_ARGS__)
+#define vfwcvt_f_x_v_f16m4(...) __riscv_vfwcvt_f_x_v_f16m4(__VA_ARGS__)
+#define vfwcvt_f_x_v_f16m8(...) __riscv_vfwcvt_f_x_v_f16m8(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f16mf4(...) __riscv_vfwcvt_f_xu_v_f16mf4(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f16mf2(...) __riscv_vfwcvt_f_xu_v_f16mf2(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f16m1(...) __riscv_vfwcvt_f_xu_v_f16m1(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f16m2(...) __riscv_vfwcvt_f_xu_v_f16m2(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f16m4(...) __riscv_vfwcvt_f_xu_v_f16m4(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f16m8(...) __riscv_vfwcvt_f_xu_v_f16m8(__VA_ARGS__)
+#define vfwcvt_x_f_v_i32mf2(...) __riscv_vfwcvt_x_f_v_i32mf2(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i32mf2(...) __riscv_vfwcvt_rtz_x_f_v_i32mf2(__VA_ARGS__)
+#define vfwcvt_x_f_v_i32m1(...) __riscv_vfwcvt_x_f_v_i32m1(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i32m1(...) __riscv_vfwcvt_rtz_x_f_v_i32m1(__VA_ARGS__)
+#define vfwcvt_x_f_v_i32m2(...) __riscv_vfwcvt_x_f_v_i32m2(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i32m2(...) __riscv_vfwcvt_rtz_x_f_v_i32m2(__VA_ARGS__)
+#define vfwcvt_x_f_v_i32m4(...) __riscv_vfwcvt_x_f_v_i32m4(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i32m4(...) __riscv_vfwcvt_rtz_x_f_v_i32m4(__VA_ARGS__)
+#define vfwcvt_x_f_v_i32m8(...) __riscv_vfwcvt_x_f_v_i32m8(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i32m8(...) __riscv_vfwcvt_rtz_x_f_v_i32m8(__VA_ARGS__)
+#define vwcvt_x_x_v_i32mf2(...) __riscv_vwcvt_x_x_v_i32mf2(__VA_ARGS__)
+#define vwcvt_x_x_v_i32m1(...) __riscv_vwcvt_x_x_v_i32m1(__VA_ARGS__)
+#define vwcvt_x_x_v_i32m2(...) __riscv_vwcvt_x_x_v_i32m2(__VA_ARGS__)
+#define vwcvt_x_x_v_i32m4(...) __riscv_vwcvt_x_x_v_i32m4(__VA_ARGS__)
+#define vwcvt_x_x_v_i32m8(...) __riscv_vwcvt_x_x_v_i32m8(__VA_ARGS__)
+#define vwcvtu_x_x_v_u32mf2(...) __riscv_vwcvtu_x_x_v_u32mf2(__VA_ARGS__)
+#define vwcvtu_x_x_v_u32m1(...) __riscv_vwcvtu_x_x_v_u32m1(__VA_ARGS__)
+#define vwcvtu_x_x_v_u32m2(...) __riscv_vwcvtu_x_x_v_u32m2(__VA_ARGS__)
+#define vwcvtu_x_x_v_u32m4(...) __riscv_vwcvtu_x_x_v_u32m4(__VA_ARGS__)
+#define vwcvtu_x_x_v_u32m8(...) __riscv_vwcvtu_x_x_v_u32m8(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u32mf2(...) __riscv_vfwcvt_xu_f_v_u32mf2(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u32mf2(...) __riscv_vfwcvt_rtz_xu_f_v_u32mf2(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u32m1(...) __riscv_vfwcvt_xu_f_v_u32m1(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u32m1(...) __riscv_vfwcvt_rtz_xu_f_v_u32m1(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u32m2(...) __riscv_vfwcvt_xu_f_v_u32m2(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u32m2(...) __riscv_vfwcvt_rtz_xu_f_v_u32m2(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u32m4(...) __riscv_vfwcvt_xu_f_v_u32m4(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u32m4(...) __riscv_vfwcvt_rtz_xu_f_v_u32m4(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u32m8(...) __riscv_vfwcvt_xu_f_v_u32m8(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u32m8(...) __riscv_vfwcvt_rtz_xu_f_v_u32m8(__VA_ARGS__)
+#define vfwcvt_f_x_v_f32mf2(...) __riscv_vfwcvt_f_x_v_f32mf2(__VA_ARGS__)
+#define vfwcvt_f_x_v_f32m1(...) __riscv_vfwcvt_f_x_v_f32m1(__VA_ARGS__)
+#define vfwcvt_f_x_v_f32m2(...) __riscv_vfwcvt_f_x_v_f32m2(__VA_ARGS__)
+#define vfwcvt_f_x_v_f32m4(...) __riscv_vfwcvt_f_x_v_f32m4(__VA_ARGS__)
+#define vfwcvt_f_x_v_f32m8(...) __riscv_vfwcvt_f_x_v_f32m8(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f32mf2(...) __riscv_vfwcvt_f_xu_v_f32mf2(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f32m1(...) __riscv_vfwcvt_f_xu_v_f32m1(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f32m2(...) __riscv_vfwcvt_f_xu_v_f32m2(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f32m4(...) __riscv_vfwcvt_f_xu_v_f32m4(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f32m8(...) __riscv_vfwcvt_f_xu_v_f32m8(__VA_ARGS__)
+#define vfwcvt_f_f_v_f32mf2(...) __riscv_vfwcvt_f_f_v_f32mf2(__VA_ARGS__)
+#define vfwcvt_f_f_v_f32m1(...) __riscv_vfwcvt_f_f_v_f32m1(__VA_ARGS__)
+#define vfwcvt_f_f_v_f32m2(...) __riscv_vfwcvt_f_f_v_f32m2(__VA_ARGS__)
+#define vfwcvt_f_f_v_f32m4(...) __riscv_vfwcvt_f_f_v_f32m4(__VA_ARGS__)
+#define vfwcvt_f_f_v_f32m8(...) __riscv_vfwcvt_f_f_v_f32m8(__VA_ARGS__)
+#define vfwcvt_x_f_v_i64m1(...) __riscv_vfwcvt_x_f_v_i64m1(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i64m1(...) __riscv_vfwcvt_rtz_x_f_v_i64m1(__VA_ARGS__)
+#define vfwcvt_x_f_v_i64m2(...) __riscv_vfwcvt_x_f_v_i64m2(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i64m2(...) __riscv_vfwcvt_rtz_x_f_v_i64m2(__VA_ARGS__)
+#define vfwcvt_x_f_v_i64m4(...) __riscv_vfwcvt_x_f_v_i64m4(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i64m4(...) __riscv_vfwcvt_rtz_x_f_v_i64m4(__VA_ARGS__)
+#define vfwcvt_x_f_v_i64m8(...) __riscv_vfwcvt_x_f_v_i64m8(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i64m8(...) __riscv_vfwcvt_rtz_x_f_v_i64m8(__VA_ARGS__)
+#define vwcvt_x_x_v_i64m1(...) __riscv_vwcvt_x_x_v_i64m1(__VA_ARGS__)
+#define vwcvt_x_x_v_i64m2(...) __riscv_vwcvt_x_x_v_i64m2(__VA_ARGS__)
+#define vwcvt_x_x_v_i64m4(...) __riscv_vwcvt_x_x_v_i64m4(__VA_ARGS__)
+#define vwcvt_x_x_v_i64m8(...) __riscv_vwcvt_x_x_v_i64m8(__VA_ARGS__)
+#define vwcvtu_x_x_v_u64m1(...) __riscv_vwcvtu_x_x_v_u64m1(__VA_ARGS__)
+#define vwcvtu_x_x_v_u64m2(...) __riscv_vwcvtu_x_x_v_u64m2(__VA_ARGS__)
+#define vwcvtu_x_x_v_u64m4(...) __riscv_vwcvtu_x_x_v_u64m4(__VA_ARGS__)
+#define vwcvtu_x_x_v_u64m8(...) __riscv_vwcvtu_x_x_v_u64m8(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u64m1(...) __riscv_vfwcvt_xu_f_v_u64m1(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u64m1(...) __riscv_vfwcvt_rtz_xu_f_v_u64m1(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u64m2(...) __riscv_vfwcvt_xu_f_v_u64m2(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u64m2(...) __riscv_vfwcvt_rtz_xu_f_v_u64m2(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u64m4(...) __riscv_vfwcvt_xu_f_v_u64m4(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u64m4(...) __riscv_vfwcvt_rtz_xu_f_v_u64m4(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u64m8(...) __riscv_vfwcvt_xu_f_v_u64m8(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u64m8(...) __riscv_vfwcvt_rtz_xu_f_v_u64m8(__VA_ARGS__)
+#define vfwcvt_f_x_v_f64m1(...) __riscv_vfwcvt_f_x_v_f64m1(__VA_ARGS__)
+#define vfwcvt_f_x_v_f64m2(...) __riscv_vfwcvt_f_x_v_f64m2(__VA_ARGS__)
+#define vfwcvt_f_x_v_f64m4(...) __riscv_vfwcvt_f_x_v_f64m4(__VA_ARGS__)
+#define vfwcvt_f_x_v_f64m8(...) __riscv_vfwcvt_f_x_v_f64m8(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f64m1(...) __riscv_vfwcvt_f_xu_v_f64m1(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f64m2(...) __riscv_vfwcvt_f_xu_v_f64m2(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f64m4(...) __riscv_vfwcvt_f_xu_v_f64m4(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f64m8(...) __riscv_vfwcvt_f_xu_v_f64m8(__VA_ARGS__)
+#define vfwcvt_f_f_v_f64m1(...) __riscv_vfwcvt_f_f_v_f64m1(__VA_ARGS__)
+#define vfwcvt_f_f_v_f64m2(...) __riscv_vfwcvt_f_f_v_f64m2(__VA_ARGS__)
+#define vfwcvt_f_f_v_f64m4(...) __riscv_vfwcvt_f_f_v_f64m4(__VA_ARGS__)
+#define vfwcvt_f_f_v_f64m8(...) __riscv_vfwcvt_f_f_v_f64m8(__VA_ARGS__)
+// masked functions (vwcvt / vwcvtu / vfwcvt): each `<name>_m(...)` alias below forwards its arguments unchanged to `__riscv_<name>_tumu(...)`
+#define vwcvt_x_x_v_i16mf4_m(...) __riscv_vwcvt_x_x_v_i16mf4_tumu(__VA_ARGS__)
+#define vwcvt_x_x_v_i16mf2_m(...) __riscv_vwcvt_x_x_v_i16mf2_tumu(__VA_ARGS__)
+#define vwcvt_x_x_v_i16m1_m(...) __riscv_vwcvt_x_x_v_i16m1_tumu(__VA_ARGS__)
+#define vwcvt_x_x_v_i16m2_m(...) __riscv_vwcvt_x_x_v_i16m2_tumu(__VA_ARGS__)
+#define vwcvt_x_x_v_i16m4_m(...) __riscv_vwcvt_x_x_v_i16m4_tumu(__VA_ARGS__)
+#define vwcvt_x_x_v_i16m8_m(...) __riscv_vwcvt_x_x_v_i16m8_tumu(__VA_ARGS__)
+#define vwcvtu_x_x_v_u16mf4_m(...) __riscv_vwcvtu_x_x_v_u16mf4_tumu(__VA_ARGS__)
+#define vwcvtu_x_x_v_u16mf2_m(...) __riscv_vwcvtu_x_x_v_u16mf2_tumu(__VA_ARGS__)
+#define vwcvtu_x_x_v_u16m1_m(...) __riscv_vwcvtu_x_x_v_u16m1_tumu(__VA_ARGS__)
+#define vwcvtu_x_x_v_u16m2_m(...) __riscv_vwcvtu_x_x_v_u16m2_tumu(__VA_ARGS__)
+#define vwcvtu_x_x_v_u16m4_m(...) __riscv_vwcvtu_x_x_v_u16m4_tumu(__VA_ARGS__)
+#define vwcvtu_x_x_v_u16m8_m(...) __riscv_vwcvtu_x_x_v_u16m8_tumu(__VA_ARGS__)
+#define vfwcvt_f_x_v_f16mf4_m(...) __riscv_vfwcvt_f_x_v_f16mf4_tumu(__VA_ARGS__)
+#define vfwcvt_f_x_v_f16mf2_m(...) __riscv_vfwcvt_f_x_v_f16mf2_tumu(__VA_ARGS__)
+#define vfwcvt_f_x_v_f16m1_m(...) __riscv_vfwcvt_f_x_v_f16m1_tumu(__VA_ARGS__)
+#define vfwcvt_f_x_v_f16m2_m(...) __riscv_vfwcvt_f_x_v_f16m2_tumu(__VA_ARGS__)
+#define vfwcvt_f_x_v_f16m4_m(...) __riscv_vfwcvt_f_x_v_f16m4_tumu(__VA_ARGS__)
+#define vfwcvt_f_x_v_f16m8_m(...) __riscv_vfwcvt_f_x_v_f16m8_tumu(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f16mf4_m(...) __riscv_vfwcvt_f_xu_v_f16mf4_tumu(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f16mf2_m(...) __riscv_vfwcvt_f_xu_v_f16mf2_tumu(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f16m1_m(...) __riscv_vfwcvt_f_xu_v_f16m1_tumu(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f16m2_m(...) __riscv_vfwcvt_f_xu_v_f16m2_tumu(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f16m4_m(...) __riscv_vfwcvt_f_xu_v_f16m4_tumu(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f16m8_m(...) __riscv_vfwcvt_f_xu_v_f16m8_tumu(__VA_ARGS__)
+#define vfwcvt_x_f_v_i32mf2_m(...) __riscv_vfwcvt_x_f_v_i32mf2_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i32mf2_m(...) __riscv_vfwcvt_rtz_x_f_v_i32mf2_tumu(__VA_ARGS__)
+#define vfwcvt_x_f_v_i32m1_m(...) __riscv_vfwcvt_x_f_v_i32m1_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i32m1_m(...) __riscv_vfwcvt_rtz_x_f_v_i32m1_tumu(__VA_ARGS__)
+#define vfwcvt_x_f_v_i32m2_m(...) __riscv_vfwcvt_x_f_v_i32m2_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i32m2_m(...) __riscv_vfwcvt_rtz_x_f_v_i32m2_tumu(__VA_ARGS__)
+#define vfwcvt_x_f_v_i32m4_m(...) __riscv_vfwcvt_x_f_v_i32m4_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i32m4_m(...) __riscv_vfwcvt_rtz_x_f_v_i32m4_tumu(__VA_ARGS__)
+#define vfwcvt_x_f_v_i32m8_m(...) __riscv_vfwcvt_x_f_v_i32m8_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i32m8_m(...) __riscv_vfwcvt_rtz_x_f_v_i32m8_tumu(__VA_ARGS__)
+#define vwcvt_x_x_v_i32mf2_m(...) __riscv_vwcvt_x_x_v_i32mf2_tumu(__VA_ARGS__)
+#define vwcvt_x_x_v_i32m1_m(...) __riscv_vwcvt_x_x_v_i32m1_tumu(__VA_ARGS__)
+#define vwcvt_x_x_v_i32m2_m(...) __riscv_vwcvt_x_x_v_i32m2_tumu(__VA_ARGS__)
+#define vwcvt_x_x_v_i32m4_m(...) __riscv_vwcvt_x_x_v_i32m4_tumu(__VA_ARGS__)
+#define vwcvt_x_x_v_i32m8_m(...) __riscv_vwcvt_x_x_v_i32m8_tumu(__VA_ARGS__)
+#define vwcvtu_x_x_v_u32mf2_m(...) __riscv_vwcvtu_x_x_v_u32mf2_tumu(__VA_ARGS__)
+#define vwcvtu_x_x_v_u32m1_m(...) __riscv_vwcvtu_x_x_v_u32m1_tumu(__VA_ARGS__)
+#define vwcvtu_x_x_v_u32m2_m(...) __riscv_vwcvtu_x_x_v_u32m2_tumu(__VA_ARGS__)
+#define vwcvtu_x_x_v_u32m4_m(...) __riscv_vwcvtu_x_x_v_u32m4_tumu(__VA_ARGS__)
+#define vwcvtu_x_x_v_u32m8_m(...) __riscv_vwcvtu_x_x_v_u32m8_tumu(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u32mf2_m(...) __riscv_vfwcvt_xu_f_v_u32mf2_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u32mf2_m(...) __riscv_vfwcvt_rtz_xu_f_v_u32mf2_tumu(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u32m1_m(...) __riscv_vfwcvt_xu_f_v_u32m1_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u32m1_m(...) __riscv_vfwcvt_rtz_xu_f_v_u32m1_tumu(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u32m2_m(...) __riscv_vfwcvt_xu_f_v_u32m2_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u32m2_m(...) __riscv_vfwcvt_rtz_xu_f_v_u32m2_tumu(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u32m4_m(...) __riscv_vfwcvt_xu_f_v_u32m4_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u32m4_m(...) __riscv_vfwcvt_rtz_xu_f_v_u32m4_tumu(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u32m8_m(...) __riscv_vfwcvt_xu_f_v_u32m8_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u32m8_m(...) __riscv_vfwcvt_rtz_xu_f_v_u32m8_tumu(__VA_ARGS__)
+#define vfwcvt_f_x_v_f32mf2_m(...) __riscv_vfwcvt_f_x_v_f32mf2_tumu(__VA_ARGS__)
+#define vfwcvt_f_x_v_f32m1_m(...) __riscv_vfwcvt_f_x_v_f32m1_tumu(__VA_ARGS__)
+#define vfwcvt_f_x_v_f32m2_m(...) __riscv_vfwcvt_f_x_v_f32m2_tumu(__VA_ARGS__)
+#define vfwcvt_f_x_v_f32m4_m(...) __riscv_vfwcvt_f_x_v_f32m4_tumu(__VA_ARGS__)
+#define vfwcvt_f_x_v_f32m8_m(...) __riscv_vfwcvt_f_x_v_f32m8_tumu(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f32mf2_m(...) __riscv_vfwcvt_f_xu_v_f32mf2_tumu(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f32m1_m(...) __riscv_vfwcvt_f_xu_v_f32m1_tumu(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f32m2_m(...) __riscv_vfwcvt_f_xu_v_f32m2_tumu(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f32m4_m(...) __riscv_vfwcvt_f_xu_v_f32m4_tumu(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f32m8_m(...) __riscv_vfwcvt_f_xu_v_f32m8_tumu(__VA_ARGS__)
+#define vfwcvt_f_f_v_f32mf2_m(...) __riscv_vfwcvt_f_f_v_f32mf2_tumu(__VA_ARGS__)
+#define vfwcvt_f_f_v_f32m1_m(...) __riscv_vfwcvt_f_f_v_f32m1_tumu(__VA_ARGS__)
+#define vfwcvt_f_f_v_f32m2_m(...) __riscv_vfwcvt_f_f_v_f32m2_tumu(__VA_ARGS__)
+#define vfwcvt_f_f_v_f32m4_m(...) __riscv_vfwcvt_f_f_v_f32m4_tumu(__VA_ARGS__)
+#define vfwcvt_f_f_v_f32m8_m(...) __riscv_vfwcvt_f_f_v_f32m8_tumu(__VA_ARGS__)
+#define vfwcvt_x_f_v_i64m1_m(...) __riscv_vfwcvt_x_f_v_i64m1_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i64m1_m(...) __riscv_vfwcvt_rtz_x_f_v_i64m1_tumu(__VA_ARGS__)
+#define vfwcvt_x_f_v_i64m2_m(...) __riscv_vfwcvt_x_f_v_i64m2_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i64m2_m(...) __riscv_vfwcvt_rtz_x_f_v_i64m2_tumu(__VA_ARGS__)
+#define vfwcvt_x_f_v_i64m4_m(...) __riscv_vfwcvt_x_f_v_i64m4_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i64m4_m(...) __riscv_vfwcvt_rtz_x_f_v_i64m4_tumu(__VA_ARGS__)
+#define vfwcvt_x_f_v_i64m8_m(...) __riscv_vfwcvt_x_f_v_i64m8_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_x_f_v_i64m8_m(...) __riscv_vfwcvt_rtz_x_f_v_i64m8_tumu(__VA_ARGS__)
+#define vwcvt_x_x_v_i64m1_m(...) __riscv_vwcvt_x_x_v_i64m1_tumu(__VA_ARGS__)
+#define vwcvt_x_x_v_i64m2_m(...) __riscv_vwcvt_x_x_v_i64m2_tumu(__VA_ARGS__)
+#define vwcvt_x_x_v_i64m4_m(...) __riscv_vwcvt_x_x_v_i64m4_tumu(__VA_ARGS__)
+#define vwcvt_x_x_v_i64m8_m(...) __riscv_vwcvt_x_x_v_i64m8_tumu(__VA_ARGS__)
+#define vwcvtu_x_x_v_u64m1_m(...) __riscv_vwcvtu_x_x_v_u64m1_tumu(__VA_ARGS__)
+#define vwcvtu_x_x_v_u64m2_m(...) __riscv_vwcvtu_x_x_v_u64m2_tumu(__VA_ARGS__)
+#define vwcvtu_x_x_v_u64m4_m(...) __riscv_vwcvtu_x_x_v_u64m4_tumu(__VA_ARGS__)
+#define vwcvtu_x_x_v_u64m8_m(...) __riscv_vwcvtu_x_x_v_u64m8_tumu(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u64m1_m(...) __riscv_vfwcvt_xu_f_v_u64m1_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u64m1_m(...) __riscv_vfwcvt_rtz_xu_f_v_u64m1_tumu(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u64m2_m(...) __riscv_vfwcvt_xu_f_v_u64m2_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u64m2_m(...) __riscv_vfwcvt_rtz_xu_f_v_u64m2_tumu(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u64m4_m(...) __riscv_vfwcvt_xu_f_v_u64m4_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u64m4_m(...) __riscv_vfwcvt_rtz_xu_f_v_u64m4_tumu(__VA_ARGS__)
+#define vfwcvt_xu_f_v_u64m8_m(...) __riscv_vfwcvt_xu_f_v_u64m8_tumu(__VA_ARGS__)
+#define vfwcvt_rtz_xu_f_v_u64m8_m(...) __riscv_vfwcvt_rtz_xu_f_v_u64m8_tumu(__VA_ARGS__)
+#define vfwcvt_f_x_v_f64m1_m(...) __riscv_vfwcvt_f_x_v_f64m1_tumu(__VA_ARGS__)
+#define vfwcvt_f_x_v_f64m2_m(...) __riscv_vfwcvt_f_x_v_f64m2_tumu(__VA_ARGS__)
+#define vfwcvt_f_x_v_f64m4_m(...) __riscv_vfwcvt_f_x_v_f64m4_tumu(__VA_ARGS__)
+#define vfwcvt_f_x_v_f64m8_m(...) __riscv_vfwcvt_f_x_v_f64m8_tumu(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f64m1_m(...) __riscv_vfwcvt_f_xu_v_f64m1_tumu(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f64m2_m(...) __riscv_vfwcvt_f_xu_v_f64m2_tumu(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f64m4_m(...) __riscv_vfwcvt_f_xu_v_f64m4_tumu(__VA_ARGS__)
+#define vfwcvt_f_xu_v_f64m8_m(...) __riscv_vfwcvt_f_xu_v_f64m8_tumu(__VA_ARGS__)
+#define vfwcvt_f_f_v_f64m1_m(...) __riscv_vfwcvt_f_f_v_f64m1_tumu(__VA_ARGS__)
+#define vfwcvt_f_f_v_f64m2_m(...) __riscv_vfwcvt_f_f_v_f64m2_tumu(__VA_ARGS__)
+#define vfwcvt_f_f_v_f64m4_m(...) __riscv_vfwcvt_f_f_v_f64m4_tumu(__VA_ARGS__)
+#define vfwcvt_f_f_v_f64m8_m(...) __riscv_vfwcvt_f_f_v_f64m8_tumu(__VA_ARGS__)
+#define vfncvt_x_f_w_i8mf8(...) __riscv_vfncvt_x_f_w_i8mf8(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i8mf8(...) __riscv_vfncvt_rtz_x_f_w_i8mf8(__VA_ARGS__)
+#define vfncvt_x_f_w_i8mf4(...) __riscv_vfncvt_x_f_w_i8mf4(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i8mf4(...) __riscv_vfncvt_rtz_x_f_w_i8mf4(__VA_ARGS__)
+#define vfncvt_x_f_w_i8mf2(...) __riscv_vfncvt_x_f_w_i8mf2(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i8mf2(...) __riscv_vfncvt_rtz_x_f_w_i8mf2(__VA_ARGS__)
+#define vfncvt_x_f_w_i8m1(...) __riscv_vfncvt_x_f_w_i8m1(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i8m1(...) __riscv_vfncvt_rtz_x_f_w_i8m1(__VA_ARGS__)
+#define vfncvt_x_f_w_i8m2(...) __riscv_vfncvt_x_f_w_i8m2(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i8m2(...) __riscv_vfncvt_rtz_x_f_w_i8m2(__VA_ARGS__)
+#define vfncvt_x_f_w_i8m4(...) __riscv_vfncvt_x_f_w_i8m4(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i8m4(...) __riscv_vfncvt_rtz_x_f_w_i8m4(__VA_ARGS__)
+#define vncvt_x_x_w_i8mf8(...) __riscv_vncvt_x_x_w_i8mf8(__VA_ARGS__)
+#define vncvt_x_x_w_i8mf4(...) __riscv_vncvt_x_x_w_i8mf4(__VA_ARGS__)
+#define vncvt_x_x_w_i8mf2(...) __riscv_vncvt_x_x_w_i8mf2(__VA_ARGS__)
+#define vncvt_x_x_w_i8m1(...) __riscv_vncvt_x_x_w_i8m1(__VA_ARGS__)
+#define vncvt_x_x_w_i8m2(...) __riscv_vncvt_x_x_w_i8m2(__VA_ARGS__)
+#define vncvt_x_x_w_i8m4(...) __riscv_vncvt_x_x_w_i8m4(__VA_ARGS__)
+#define vncvt_x_x_w_u8mf8(...) __riscv_vncvt_x_x_w_u8mf8(__VA_ARGS__)
+#define vncvt_x_x_w_u8mf4(...) __riscv_vncvt_x_x_w_u8mf4(__VA_ARGS__)
+#define vncvt_x_x_w_u8mf2(...) __riscv_vncvt_x_x_w_u8mf2(__VA_ARGS__)
+#define vncvt_x_x_w_u8m1(...) __riscv_vncvt_x_x_w_u8m1(__VA_ARGS__)
+#define vncvt_x_x_w_u8m2(...) __riscv_vncvt_x_x_w_u8m2(__VA_ARGS__)
+#define vncvt_x_x_w_u8m4(...) __riscv_vncvt_x_x_w_u8m4(__VA_ARGS__)
+#define vfncvt_xu_f_w_u8mf8(...) __riscv_vfncvt_xu_f_w_u8mf8(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u8mf8(...) __riscv_vfncvt_rtz_xu_f_w_u8mf8(__VA_ARGS__)
+#define vfncvt_xu_f_w_u8mf4(...) __riscv_vfncvt_xu_f_w_u8mf4(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u8mf4(...) __riscv_vfncvt_rtz_xu_f_w_u8mf4(__VA_ARGS__)
+#define vfncvt_xu_f_w_u8mf2(...) __riscv_vfncvt_xu_f_w_u8mf2(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u8mf2(...) __riscv_vfncvt_rtz_xu_f_w_u8mf2(__VA_ARGS__)
+#define vfncvt_xu_f_w_u8m1(...) __riscv_vfncvt_xu_f_w_u8m1(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u8m1(...) __riscv_vfncvt_rtz_xu_f_w_u8m1(__VA_ARGS__)
+#define vfncvt_xu_f_w_u8m2(...) __riscv_vfncvt_xu_f_w_u8m2(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u8m2(...) __riscv_vfncvt_rtz_xu_f_w_u8m2(__VA_ARGS__)
+#define vfncvt_xu_f_w_u8m4(...) __riscv_vfncvt_xu_f_w_u8m4(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u8m4(...) __riscv_vfncvt_rtz_xu_f_w_u8m4(__VA_ARGS__)
+#define vfncvt_x_f_w_i16mf4(...) __riscv_vfncvt_x_f_w_i16mf4(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i16mf4(...) __riscv_vfncvt_rtz_x_f_w_i16mf4(__VA_ARGS__)
+#define vfncvt_x_f_w_i16mf2(...) __riscv_vfncvt_x_f_w_i16mf2(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i16mf2(...) __riscv_vfncvt_rtz_x_f_w_i16mf2(__VA_ARGS__)
+#define vfncvt_x_f_w_i16m1(...) __riscv_vfncvt_x_f_w_i16m1(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i16m1(...) __riscv_vfncvt_rtz_x_f_w_i16m1(__VA_ARGS__)
+#define vfncvt_x_f_w_i16m2(...) __riscv_vfncvt_x_f_w_i16m2(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i16m2(...) __riscv_vfncvt_rtz_x_f_w_i16m2(__VA_ARGS__)
+#define vfncvt_x_f_w_i16m4(...) __riscv_vfncvt_x_f_w_i16m4(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i16m4(...) __riscv_vfncvt_rtz_x_f_w_i16m4(__VA_ARGS__)
+#define vncvt_x_x_w_i16mf4(...) __riscv_vncvt_x_x_w_i16mf4(__VA_ARGS__)
+#define vncvt_x_x_w_i16mf2(...) __riscv_vncvt_x_x_w_i16mf2(__VA_ARGS__)
+#define vncvt_x_x_w_i16m1(...) __riscv_vncvt_x_x_w_i16m1(__VA_ARGS__)
+#define vncvt_x_x_w_i16m2(...) __riscv_vncvt_x_x_w_i16m2(__VA_ARGS__)
+#define vncvt_x_x_w_i16m4(...) __riscv_vncvt_x_x_w_i16m4(__VA_ARGS__)
+#define vncvt_x_x_w_u16mf4(...) __riscv_vncvt_x_x_w_u16mf4(__VA_ARGS__)
+#define vncvt_x_x_w_u16mf2(...) __riscv_vncvt_x_x_w_u16mf2(__VA_ARGS__)
+#define vncvt_x_x_w_u16m1(...) __riscv_vncvt_x_x_w_u16m1(__VA_ARGS__)
+#define vncvt_x_x_w_u16m2(...) __riscv_vncvt_x_x_w_u16m2(__VA_ARGS__)
+#define vncvt_x_x_w_u16m4(...) __riscv_vncvt_x_x_w_u16m4(__VA_ARGS__)
+#define vfncvt_xu_f_w_u16mf4(...) __riscv_vfncvt_xu_f_w_u16mf4(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u16mf4(...) __riscv_vfncvt_rtz_xu_f_w_u16mf4(__VA_ARGS__)
+#define vfncvt_xu_f_w_u16mf2(...) __riscv_vfncvt_xu_f_w_u16mf2(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u16mf2(...) __riscv_vfncvt_rtz_xu_f_w_u16mf2(__VA_ARGS__)
+#define vfncvt_xu_f_w_u16m1(...) __riscv_vfncvt_xu_f_w_u16m1(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u16m1(...) __riscv_vfncvt_rtz_xu_f_w_u16m1(__VA_ARGS__)
+#define vfncvt_xu_f_w_u16m2(...) __riscv_vfncvt_xu_f_w_u16m2(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u16m2(...) __riscv_vfncvt_rtz_xu_f_w_u16m2(__VA_ARGS__)
+#define vfncvt_xu_f_w_u16m4(...) __riscv_vfncvt_xu_f_w_u16m4(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u16m4(...) __riscv_vfncvt_rtz_xu_f_w_u16m4(__VA_ARGS__)
+#define vfncvt_f_x_w_f16mf4(...) __riscv_vfncvt_f_x_w_f16mf4(__VA_ARGS__)
+#define vfncvt_f_x_w_f16mf2(...) __riscv_vfncvt_f_x_w_f16mf2(__VA_ARGS__)
+#define vfncvt_f_x_w_f16m1(...) __riscv_vfncvt_f_x_w_f16m1(__VA_ARGS__)
+#define vfncvt_f_x_w_f16m2(...) __riscv_vfncvt_f_x_w_f16m2(__VA_ARGS__)
+#define vfncvt_f_x_w_f16m4(...) __riscv_vfncvt_f_x_w_f16m4(__VA_ARGS__)
+#define vfncvt_f_xu_w_f16mf4(...) __riscv_vfncvt_f_xu_w_f16mf4(__VA_ARGS__)
+#define vfncvt_f_xu_w_f16mf2(...) __riscv_vfncvt_f_xu_w_f16mf2(__VA_ARGS__)
+#define vfncvt_f_xu_w_f16m1(...) __riscv_vfncvt_f_xu_w_f16m1(__VA_ARGS__)
+#define vfncvt_f_xu_w_f16m2(...) __riscv_vfncvt_f_xu_w_f16m2(__VA_ARGS__)
+#define vfncvt_f_xu_w_f16m4(...) __riscv_vfncvt_f_xu_w_f16m4(__VA_ARGS__)
+#define vfncvt_f_f_w_f16mf4(...) __riscv_vfncvt_f_f_w_f16mf4(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f16mf4(...) __riscv_vfncvt_rod_f_f_w_f16mf4(__VA_ARGS__)
+#define vfncvt_f_f_w_f16mf2(...) __riscv_vfncvt_f_f_w_f16mf2(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f16mf2(...) __riscv_vfncvt_rod_f_f_w_f16mf2(__VA_ARGS__)
+#define vfncvt_f_f_w_f16m1(...) __riscv_vfncvt_f_f_w_f16m1(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f16m1(...) __riscv_vfncvt_rod_f_f_w_f16m1(__VA_ARGS__)
+#define vfncvt_f_f_w_f16m2(...) __riscv_vfncvt_f_f_w_f16m2(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f16m2(...) __riscv_vfncvt_rod_f_f_w_f16m2(__VA_ARGS__)
+#define vfncvt_f_f_w_f16m4(...) __riscv_vfncvt_f_f_w_f16m4(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f16m4(...) __riscv_vfncvt_rod_f_f_w_f16m4(__VA_ARGS__)
+#define vfncvt_x_f_w_i32mf2(...) __riscv_vfncvt_x_f_w_i32mf2(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i32mf2(...) __riscv_vfncvt_rtz_x_f_w_i32mf2(__VA_ARGS__)
+#define vfncvt_x_f_w_i32m1(...) __riscv_vfncvt_x_f_w_i32m1(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i32m1(...) __riscv_vfncvt_rtz_x_f_w_i32m1(__VA_ARGS__)
+#define vfncvt_x_f_w_i32m2(...) __riscv_vfncvt_x_f_w_i32m2(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i32m2(...) __riscv_vfncvt_rtz_x_f_w_i32m2(__VA_ARGS__)
+#define vfncvt_x_f_w_i32m4(...) __riscv_vfncvt_x_f_w_i32m4(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i32m4(...) __riscv_vfncvt_rtz_x_f_w_i32m4(__VA_ARGS__)
+#define vncvt_x_x_w_i32mf2(...) __riscv_vncvt_x_x_w_i32mf2(__VA_ARGS__)
+#define vncvt_x_x_w_i32m1(...) __riscv_vncvt_x_x_w_i32m1(__VA_ARGS__)
+#define vncvt_x_x_w_i32m2(...) __riscv_vncvt_x_x_w_i32m2(__VA_ARGS__)
+#define vncvt_x_x_w_i32m4(...) __riscv_vncvt_x_x_w_i32m4(__VA_ARGS__)
+#define vncvt_x_x_w_u32mf2(...) __riscv_vncvt_x_x_w_u32mf2(__VA_ARGS__)
+#define vncvt_x_x_w_u32m1(...) __riscv_vncvt_x_x_w_u32m1(__VA_ARGS__)
+#define vncvt_x_x_w_u32m2(...) __riscv_vncvt_x_x_w_u32m2(__VA_ARGS__)
+#define vncvt_x_x_w_u32m4(...) __riscv_vncvt_x_x_w_u32m4(__VA_ARGS__)
+#define vfncvt_xu_f_w_u32mf2(...) __riscv_vfncvt_xu_f_w_u32mf2(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u32mf2(...) __riscv_vfncvt_rtz_xu_f_w_u32mf2(__VA_ARGS__)
+#define vfncvt_xu_f_w_u32m1(...) __riscv_vfncvt_xu_f_w_u32m1(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u32m1(...) __riscv_vfncvt_rtz_xu_f_w_u32m1(__VA_ARGS__)
+#define vfncvt_xu_f_w_u32m2(...) __riscv_vfncvt_xu_f_w_u32m2(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u32m2(...) __riscv_vfncvt_rtz_xu_f_w_u32m2(__VA_ARGS__)
+#define vfncvt_xu_f_w_u32m4(...) __riscv_vfncvt_xu_f_w_u32m4(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u32m4(...) __riscv_vfncvt_rtz_xu_f_w_u32m4(__VA_ARGS__)
+#define vfncvt_f_x_w_f32mf2(...) __riscv_vfncvt_f_x_w_f32mf2(__VA_ARGS__)
+#define vfncvt_f_x_w_f32m1(...) __riscv_vfncvt_f_x_w_f32m1(__VA_ARGS__)
+#define vfncvt_f_x_w_f32m2(...) __riscv_vfncvt_f_x_w_f32m2(__VA_ARGS__)
+#define vfncvt_f_x_w_f32m4(...) __riscv_vfncvt_f_x_w_f32m4(__VA_ARGS__)
+#define vfncvt_f_xu_w_f32mf2(...) __riscv_vfncvt_f_xu_w_f32mf2(__VA_ARGS__)
+#define vfncvt_f_xu_w_f32m1(...) __riscv_vfncvt_f_xu_w_f32m1(__VA_ARGS__)
+#define vfncvt_f_xu_w_f32m2(...) __riscv_vfncvt_f_xu_w_f32m2(__VA_ARGS__)
+#define vfncvt_f_xu_w_f32m4(...) __riscv_vfncvt_f_xu_w_f32m4(__VA_ARGS__)
+#define vfncvt_f_f_w_f32mf2(...) __riscv_vfncvt_f_f_w_f32mf2(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f32mf2(...) __riscv_vfncvt_rod_f_f_w_f32mf2(__VA_ARGS__)
+#define vfncvt_f_f_w_f32m1(...) __riscv_vfncvt_f_f_w_f32m1(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f32m1(...) __riscv_vfncvt_rod_f_f_w_f32m1(__VA_ARGS__)
+#define vfncvt_f_f_w_f32m2(...) __riscv_vfncvt_f_f_w_f32m2(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f32m2(...) __riscv_vfncvt_rod_f_f_w_f32m2(__VA_ARGS__)
+#define vfncvt_f_f_w_f32m4(...) __riscv_vfncvt_f_f_w_f32m4(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f32m4(...) __riscv_vfncvt_rod_f_f_w_f32m4(__VA_ARGS__)
+// masked functions: a `<name>_m(...)` alias forwards its arguments unchanged to `__riscv_<name>_tumu(...)`
+#define vfncvt_x_f_w_i8mf8_m(...) __riscv_vfncvt_x_f_w_i8mf8_tumu(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i8mf8_m(...) __riscv_vfncvt_rtz_x_f_w_i8mf8_tumu(__VA_ARGS__)
+#define vfncvt_x_f_w_i8mf4_m(...) __riscv_vfncvt_x_f_w_i8mf4_tumu(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i8mf4_m(...) __riscv_vfncvt_rtz_x_f_w_i8mf4_tumu(__VA_ARGS__)
+#define vfncvt_x_f_w_i8mf2_m(...) __riscv_vfncvt_x_f_w_i8mf2_tumu(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i8mf2_m(...) __riscv_vfncvt_rtz_x_f_w_i8mf2_tumu(__VA_ARGS__)
+#define vfncvt_x_f_w_i8m1_m(...) __riscv_vfncvt_x_f_w_i8m1_tumu(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i8m1_m(...) __riscv_vfncvt_rtz_x_f_w_i8m1_tumu(__VA_ARGS__)
+#define vfncvt_x_f_w_i8m2_m(...) __riscv_vfncvt_x_f_w_i8m2_tumu(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i8m2_m(...) __riscv_vfncvt_rtz_x_f_w_i8m2_tumu(__VA_ARGS__)
+#define vfncvt_x_f_w_i8m4_m(...) __riscv_vfncvt_x_f_w_i8m4_tumu(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i8m4_m(...) __riscv_vfncvt_rtz_x_f_w_i8m4_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_i8mf8_m(...) __riscv_vncvt_x_x_w_i8mf8_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_i8mf4_m(...) __riscv_vncvt_x_x_w_i8mf4_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_i8mf2_m(...) __riscv_vncvt_x_x_w_i8mf2_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_i8m1_m(...) __riscv_vncvt_x_x_w_i8m1_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_i8m2_m(...) __riscv_vncvt_x_x_w_i8m2_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_i8m4_m(...) __riscv_vncvt_x_x_w_i8m4_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_u8mf8_m(...) __riscv_vncvt_x_x_w_u8mf8_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_u8mf4_m(...) __riscv_vncvt_x_x_w_u8mf4_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_u8mf2_m(...) __riscv_vncvt_x_x_w_u8mf2_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_u8m1_m(...) __riscv_vncvt_x_x_w_u8m1_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_u8m2_m(...) __riscv_vncvt_x_x_w_u8m2_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_u8m4_m(...) __riscv_vncvt_x_x_w_u8m4_tumu(__VA_ARGS__)
+#define vfncvt_xu_f_w_u8mf8_m(...) __riscv_vfncvt_xu_f_w_u8mf8_tumu(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u8mf8_m(...) __riscv_vfncvt_rtz_xu_f_w_u8mf8_tumu(__VA_ARGS__)
+#define vfncvt_xu_f_w_u8mf4_m(...) __riscv_vfncvt_xu_f_w_u8mf4_tumu(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u8mf4_m(...) __riscv_vfncvt_rtz_xu_f_w_u8mf4_tumu(__VA_ARGS__)
+#define vfncvt_xu_f_w_u8mf2_m(...) __riscv_vfncvt_xu_f_w_u8mf2_tumu(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u8mf2_m(...) __riscv_vfncvt_rtz_xu_f_w_u8mf2_tumu(__VA_ARGS__)
+#define vfncvt_xu_f_w_u8m1_m(...) __riscv_vfncvt_xu_f_w_u8m1_tumu(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u8m1_m(...) __riscv_vfncvt_rtz_xu_f_w_u8m1_tumu(__VA_ARGS__)
+#define vfncvt_xu_f_w_u8m2_m(...) __riscv_vfncvt_xu_f_w_u8m2_tumu(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u8m2_m(...) __riscv_vfncvt_rtz_xu_f_w_u8m2_tumu(__VA_ARGS__)
+#define vfncvt_xu_f_w_u8m4_m(...) __riscv_vfncvt_xu_f_w_u8m4_tumu(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u8m4_m(...) __riscv_vfncvt_rtz_xu_f_w_u8m4_tumu(__VA_ARGS__)
+#define vfncvt_x_f_w_i16mf4_m(...) __riscv_vfncvt_x_f_w_i16mf4_tumu(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i16mf4_m(...) __riscv_vfncvt_rtz_x_f_w_i16mf4_tumu(__VA_ARGS__)
+#define vfncvt_x_f_w_i16mf2_m(...) __riscv_vfncvt_x_f_w_i16mf2_tumu(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i16mf2_m(...) __riscv_vfncvt_rtz_x_f_w_i16mf2_tumu(__VA_ARGS__)
+#define vfncvt_x_f_w_i16m1_m(...) __riscv_vfncvt_x_f_w_i16m1_tumu(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i16m1_m(...) __riscv_vfncvt_rtz_x_f_w_i16m1_tumu(__VA_ARGS__)
+#define vfncvt_x_f_w_i16m2_m(...) __riscv_vfncvt_x_f_w_i16m2_tumu(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i16m2_m(...) __riscv_vfncvt_rtz_x_f_w_i16m2_tumu(__VA_ARGS__)
+#define vfncvt_x_f_w_i16m4_m(...) __riscv_vfncvt_x_f_w_i16m4_tumu(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i16m4_m(...) __riscv_vfncvt_rtz_x_f_w_i16m4_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_i16mf4_m(...) __riscv_vncvt_x_x_w_i16mf4_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_i16mf2_m(...) __riscv_vncvt_x_x_w_i16mf2_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_i16m1_m(...) __riscv_vncvt_x_x_w_i16m1_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_i16m2_m(...) __riscv_vncvt_x_x_w_i16m2_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_i16m4_m(...) __riscv_vncvt_x_x_w_i16m4_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_u16mf4_m(...) __riscv_vncvt_x_x_w_u16mf4_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_u16mf2_m(...) __riscv_vncvt_x_x_w_u16mf2_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_u16m1_m(...) __riscv_vncvt_x_x_w_u16m1_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_u16m2_m(...) __riscv_vncvt_x_x_w_u16m2_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_u16m4_m(...) __riscv_vncvt_x_x_w_u16m4_tumu(__VA_ARGS__)
+#define vfncvt_xu_f_w_u16mf4_m(...) __riscv_vfncvt_xu_f_w_u16mf4_tumu(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u16mf4_m(...) __riscv_vfncvt_rtz_xu_f_w_u16mf4_tumu(__VA_ARGS__)
+#define vfncvt_xu_f_w_u16mf2_m(...) __riscv_vfncvt_xu_f_w_u16mf2_tumu(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u16mf2_m(...) __riscv_vfncvt_rtz_xu_f_w_u16mf2_tumu(__VA_ARGS__)
+#define vfncvt_xu_f_w_u16m1_m(...) __riscv_vfncvt_xu_f_w_u16m1_tumu(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u16m1_m(...) __riscv_vfncvt_rtz_xu_f_w_u16m1_tumu(__VA_ARGS__)
+#define vfncvt_xu_f_w_u16m2_m(...) __riscv_vfncvt_xu_f_w_u16m2_tumu(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u16m2_m(...) __riscv_vfncvt_rtz_xu_f_w_u16m2_tumu(__VA_ARGS__)
+#define vfncvt_xu_f_w_u16m4_m(...) __riscv_vfncvt_xu_f_w_u16m4_tumu(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u16m4_m(...) __riscv_vfncvt_rtz_xu_f_w_u16m4_tumu(__VA_ARGS__)
+#define vfncvt_f_x_w_f16mf4_m(...) __riscv_vfncvt_f_x_w_f16mf4_tumu(__VA_ARGS__)
+#define vfncvt_f_x_w_f16mf2_m(...) __riscv_vfncvt_f_x_w_f16mf2_tumu(__VA_ARGS__)
+#define vfncvt_f_x_w_f16m1_m(...) __riscv_vfncvt_f_x_w_f16m1_tumu(__VA_ARGS__)
+#define vfncvt_f_x_w_f16m2_m(...) __riscv_vfncvt_f_x_w_f16m2_tumu(__VA_ARGS__)
+#define vfncvt_f_x_w_f16m4_m(...) __riscv_vfncvt_f_x_w_f16m4_tumu(__VA_ARGS__)
+#define vfncvt_f_xu_w_f16mf4_m(...) __riscv_vfncvt_f_xu_w_f16mf4_tumu(__VA_ARGS__)
+#define vfncvt_f_xu_w_f16mf2_m(...) __riscv_vfncvt_f_xu_w_f16mf2_tumu(__VA_ARGS__)
+#define vfncvt_f_xu_w_f16m1_m(...) __riscv_vfncvt_f_xu_w_f16m1_tumu(__VA_ARGS__)
+#define vfncvt_f_xu_w_f16m2_m(...) __riscv_vfncvt_f_xu_w_f16m2_tumu(__VA_ARGS__)
+#define vfncvt_f_xu_w_f16m4_m(...) __riscv_vfncvt_f_xu_w_f16m4_tumu(__VA_ARGS__)
+#define vfncvt_f_f_w_f16mf4_m(...) __riscv_vfncvt_f_f_w_f16mf4_tumu(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f16mf4_m(...) __riscv_vfncvt_rod_f_f_w_f16mf4_tumu(__VA_ARGS__)
+#define vfncvt_f_f_w_f16mf2_m(...) __riscv_vfncvt_f_f_w_f16mf2_tumu(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f16mf2_m(...) __riscv_vfncvt_rod_f_f_w_f16mf2_tumu(__VA_ARGS__)
+#define vfncvt_f_f_w_f16m1_m(...) __riscv_vfncvt_f_f_w_f16m1_tumu(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f16m1_m(...) __riscv_vfncvt_rod_f_f_w_f16m1_tumu(__VA_ARGS__)
+#define vfncvt_f_f_w_f16m2_m(...) __riscv_vfncvt_f_f_w_f16m2_tumu(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f16m2_m(...) __riscv_vfncvt_rod_f_f_w_f16m2_tumu(__VA_ARGS__)
+#define vfncvt_f_f_w_f16m4_m(...) __riscv_vfncvt_f_f_w_f16m4_tumu(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f16m4_m(...) __riscv_vfncvt_rod_f_f_w_f16m4_tumu(__VA_ARGS__)
+#define vfncvt_x_f_w_i32mf2_m(...) __riscv_vfncvt_x_f_w_i32mf2_tumu(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i32mf2_m(...) __riscv_vfncvt_rtz_x_f_w_i32mf2_tumu(__VA_ARGS__)
+#define vfncvt_x_f_w_i32m1_m(...) __riscv_vfncvt_x_f_w_i32m1_tumu(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i32m1_m(...) __riscv_vfncvt_rtz_x_f_w_i32m1_tumu(__VA_ARGS__)
+#define vfncvt_x_f_w_i32m2_m(...) __riscv_vfncvt_x_f_w_i32m2_tumu(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i32m2_m(...) __riscv_vfncvt_rtz_x_f_w_i32m2_tumu(__VA_ARGS__)
+#define vfncvt_x_f_w_i32m4_m(...) __riscv_vfncvt_x_f_w_i32m4_tumu(__VA_ARGS__)
+#define vfncvt_rtz_x_f_w_i32m4_m(...) __riscv_vfncvt_rtz_x_f_w_i32m4_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_i32mf2_m(...) __riscv_vncvt_x_x_w_i32mf2_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_i32m1_m(...) __riscv_vncvt_x_x_w_i32m1_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_i32m2_m(...) __riscv_vncvt_x_x_w_i32m2_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_i32m4_m(...) __riscv_vncvt_x_x_w_i32m4_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_u32mf2_m(...) __riscv_vncvt_x_x_w_u32mf2_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_u32m1_m(...) __riscv_vncvt_x_x_w_u32m1_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_u32m2_m(...) __riscv_vncvt_x_x_w_u32m2_tumu(__VA_ARGS__)
+#define vncvt_x_x_w_u32m4_m(...) __riscv_vncvt_x_x_w_u32m4_tumu(__VA_ARGS__)
+#define vfncvt_xu_f_w_u32mf2_m(...) __riscv_vfncvt_xu_f_w_u32mf2_tumu(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u32mf2_m(...) __riscv_vfncvt_rtz_xu_f_w_u32mf2_tumu(__VA_ARGS__)
+#define vfncvt_xu_f_w_u32m1_m(...) __riscv_vfncvt_xu_f_w_u32m1_tumu(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u32m1_m(...) __riscv_vfncvt_rtz_xu_f_w_u32m1_tumu(__VA_ARGS__)
+#define vfncvt_xu_f_w_u32m2_m(...) __riscv_vfncvt_xu_f_w_u32m2_tumu(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u32m2_m(...) __riscv_vfncvt_rtz_xu_f_w_u32m2_tumu(__VA_ARGS__)
+#define vfncvt_xu_f_w_u32m4_m(...) __riscv_vfncvt_xu_f_w_u32m4_tumu(__VA_ARGS__)
+#define vfncvt_rtz_xu_f_w_u32m4_m(...) __riscv_vfncvt_rtz_xu_f_w_u32m4_tumu(__VA_ARGS__)
+#define vfncvt_f_x_w_f32mf2_m(...) __riscv_vfncvt_f_x_w_f32mf2_tumu(__VA_ARGS__)
+#define vfncvt_f_x_w_f32m1_m(...) __riscv_vfncvt_f_x_w_f32m1_tumu(__VA_ARGS__)
+#define vfncvt_f_x_w_f32m2_m(...) __riscv_vfncvt_f_x_w_f32m2_tumu(__VA_ARGS__)
+#define vfncvt_f_x_w_f32m4_m(...) __riscv_vfncvt_f_x_w_f32m4_tumu(__VA_ARGS__)
+#define vfncvt_f_xu_w_f32mf2_m(...) __riscv_vfncvt_f_xu_w_f32mf2_tumu(__VA_ARGS__)
+#define vfncvt_f_xu_w_f32m1_m(...) __riscv_vfncvt_f_xu_w_f32m1_tumu(__VA_ARGS__)
+#define vfncvt_f_xu_w_f32m2_m(...) __riscv_vfncvt_f_xu_w_f32m2_tumu(__VA_ARGS__)
+#define vfncvt_f_xu_w_f32m4_m(...) __riscv_vfncvt_f_xu_w_f32m4_tumu(__VA_ARGS__)
+#define vfncvt_f_f_w_f32mf2_m(...) __riscv_vfncvt_f_f_w_f32mf2_tumu(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f32mf2_m(...) __riscv_vfncvt_rod_f_f_w_f32mf2_tumu(__VA_ARGS__)
+#define vfncvt_f_f_w_f32m1_m(...) __riscv_vfncvt_f_f_w_f32m1_tumu(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f32m1_m(...) __riscv_vfncvt_rod_f_f_w_f32m1_tumu(__VA_ARGS__)
+#define vfncvt_f_f_w_f32m2_m(...) __riscv_vfncvt_f_f_w_f32m2_tumu(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f32m2_m(...) __riscv_vfncvt_rod_f_f_w_f32m2_tumu(__VA_ARGS__)
+#define vfncvt_f_f_w_f32m4_m(...) __riscv_vfncvt_f_f_w_f32m4_tumu(__VA_ARGS__)
+#define vfncvt_rod_f_f_w_f32m4_m(...) __riscv_vfncvt_rod_f_f_w_f32m4_tumu(__VA_ARGS__)
+#define vredsum_vs_i8mf8_i8m1(...) __riscv_vredsum_vs_i8mf8_i8m1_tu(__VA_ARGS__) // start of signed group: vred*_vs_i* names (no _m suffix) forward their arguments to the matching __riscv_*_tu intrinsics
+#define vredsum_vs_i8mf4_i8m1(...) __riscv_vredsum_vs_i8mf4_i8m1_tu(__VA_ARGS__)
+#define vredsum_vs_i8mf2_i8m1(...) __riscv_vredsum_vs_i8mf2_i8m1_tu(__VA_ARGS__)
+#define vredsum_vs_i8m1_i8m1(...) __riscv_vredsum_vs_i8m1_i8m1_tu(__VA_ARGS__)
+#define vredsum_vs_i8m2_i8m1(...) __riscv_vredsum_vs_i8m2_i8m1_tu(__VA_ARGS__)
+#define vredsum_vs_i8m4_i8m1(...) __riscv_vredsum_vs_i8m4_i8m1_tu(__VA_ARGS__)
+#define vredsum_vs_i8m8_i8m1(...) __riscv_vredsum_vs_i8m8_i8m1_tu(__VA_ARGS__)
+#define vredsum_vs_i16mf4_i16m1(...) __riscv_vredsum_vs_i16mf4_i16m1_tu(__VA_ARGS__)
+#define vredsum_vs_i16mf2_i16m1(...) __riscv_vredsum_vs_i16mf2_i16m1_tu(__VA_ARGS__)
+#define vredsum_vs_i16m1_i16m1(...) __riscv_vredsum_vs_i16m1_i16m1_tu(__VA_ARGS__)
+#define vredsum_vs_i16m2_i16m1(...) __riscv_vredsum_vs_i16m2_i16m1_tu(__VA_ARGS__)
+#define vredsum_vs_i16m4_i16m1(...) __riscv_vredsum_vs_i16m4_i16m1_tu(__VA_ARGS__)
+#define vredsum_vs_i16m8_i16m1(...) __riscv_vredsum_vs_i16m8_i16m1_tu(__VA_ARGS__)
+#define vredsum_vs_i32mf2_i32m1(...) __riscv_vredsum_vs_i32mf2_i32m1_tu(__VA_ARGS__)
+#define vredsum_vs_i32m1_i32m1(...) __riscv_vredsum_vs_i32m1_i32m1_tu(__VA_ARGS__)
+#define vredsum_vs_i32m2_i32m1(...) __riscv_vredsum_vs_i32m2_i32m1_tu(__VA_ARGS__)
+#define vredsum_vs_i32m4_i32m1(...) __riscv_vredsum_vs_i32m4_i32m1_tu(__VA_ARGS__)
+#define vredsum_vs_i32m8_i32m1(...) __riscv_vredsum_vs_i32m8_i32m1_tu(__VA_ARGS__)
+#define vredsum_vs_i64m1_i64m1(...) __riscv_vredsum_vs_i64m1_i64m1_tu(__VA_ARGS__)
+#define vredsum_vs_i64m2_i64m1(...) __riscv_vredsum_vs_i64m2_i64m1_tu(__VA_ARGS__)
+#define vredsum_vs_i64m4_i64m1(...) __riscv_vredsum_vs_i64m4_i64m1_tu(__VA_ARGS__)
+#define vredsum_vs_i64m8_i64m1(...) __riscv_vredsum_vs_i64m8_i64m1_tu(__VA_ARGS__)
+#define vredmax_vs_i8mf8_i8m1(...) __riscv_vredmax_vs_i8mf8_i8m1_tu(__VA_ARGS__)
+#define vredmax_vs_i8mf4_i8m1(...) __riscv_vredmax_vs_i8mf4_i8m1_tu(__VA_ARGS__)
+#define vredmax_vs_i8mf2_i8m1(...) __riscv_vredmax_vs_i8mf2_i8m1_tu(__VA_ARGS__)
+#define vredmax_vs_i8m1_i8m1(...) __riscv_vredmax_vs_i8m1_i8m1_tu(__VA_ARGS__)
+#define vredmax_vs_i8m2_i8m1(...) __riscv_vredmax_vs_i8m2_i8m1_tu(__VA_ARGS__)
+#define vredmax_vs_i8m4_i8m1(...) __riscv_vredmax_vs_i8m4_i8m1_tu(__VA_ARGS__)
+#define vredmax_vs_i8m8_i8m1(...) __riscv_vredmax_vs_i8m8_i8m1_tu(__VA_ARGS__)
+#define vredmax_vs_i16mf4_i16m1(...) __riscv_vredmax_vs_i16mf4_i16m1_tu(__VA_ARGS__)
+#define vredmax_vs_i16mf2_i16m1(...) __riscv_vredmax_vs_i16mf2_i16m1_tu(__VA_ARGS__)
+#define vredmax_vs_i16m1_i16m1(...) __riscv_vredmax_vs_i16m1_i16m1_tu(__VA_ARGS__)
+#define vredmax_vs_i16m2_i16m1(...) __riscv_vredmax_vs_i16m2_i16m1_tu(__VA_ARGS__)
+#define vredmax_vs_i16m4_i16m1(...) __riscv_vredmax_vs_i16m4_i16m1_tu(__VA_ARGS__)
+#define vredmax_vs_i16m8_i16m1(...) __riscv_vredmax_vs_i16m8_i16m1_tu(__VA_ARGS__)
+#define vredmax_vs_i32mf2_i32m1(...) __riscv_vredmax_vs_i32mf2_i32m1_tu(__VA_ARGS__)
+#define vredmax_vs_i32m1_i32m1(...) __riscv_vredmax_vs_i32m1_i32m1_tu(__VA_ARGS__)
+#define vredmax_vs_i32m2_i32m1(...) __riscv_vredmax_vs_i32m2_i32m1_tu(__VA_ARGS__)
+#define vredmax_vs_i32m4_i32m1(...) __riscv_vredmax_vs_i32m4_i32m1_tu(__VA_ARGS__)
+#define vredmax_vs_i32m8_i32m1(...) __riscv_vredmax_vs_i32m8_i32m1_tu(__VA_ARGS__)
+#define vredmax_vs_i64m1_i64m1(...) __riscv_vredmax_vs_i64m1_i64m1_tu(__VA_ARGS__)
+#define vredmax_vs_i64m2_i64m1(...) __riscv_vredmax_vs_i64m2_i64m1_tu(__VA_ARGS__)
+#define vredmax_vs_i64m4_i64m1(...) __riscv_vredmax_vs_i64m4_i64m1_tu(__VA_ARGS__)
+#define vredmax_vs_i64m8_i64m1(...) __riscv_vredmax_vs_i64m8_i64m1_tu(__VA_ARGS__)
+#define vredmin_vs_i8mf8_i8m1(...) __riscv_vredmin_vs_i8mf8_i8m1_tu(__VA_ARGS__)
+#define vredmin_vs_i8mf4_i8m1(...) __riscv_vredmin_vs_i8mf4_i8m1_tu(__VA_ARGS__)
+#define vredmin_vs_i8mf2_i8m1(...) __riscv_vredmin_vs_i8mf2_i8m1_tu(__VA_ARGS__)
+#define vredmin_vs_i8m1_i8m1(...) __riscv_vredmin_vs_i8m1_i8m1_tu(__VA_ARGS__)
+#define vredmin_vs_i8m2_i8m1(...) __riscv_vredmin_vs_i8m2_i8m1_tu(__VA_ARGS__)
+#define vredmin_vs_i8m4_i8m1(...) __riscv_vredmin_vs_i8m4_i8m1_tu(__VA_ARGS__)
+#define vredmin_vs_i8m8_i8m1(...) __riscv_vredmin_vs_i8m8_i8m1_tu(__VA_ARGS__)
+#define vredmin_vs_i16mf4_i16m1(...) __riscv_vredmin_vs_i16mf4_i16m1_tu(__VA_ARGS__)
+#define vredmin_vs_i16mf2_i16m1(...) __riscv_vredmin_vs_i16mf2_i16m1_tu(__VA_ARGS__)
+#define vredmin_vs_i16m1_i16m1(...) __riscv_vredmin_vs_i16m1_i16m1_tu(__VA_ARGS__)
+#define vredmin_vs_i16m2_i16m1(...) __riscv_vredmin_vs_i16m2_i16m1_tu(__VA_ARGS__)
+#define vredmin_vs_i16m4_i16m1(...) __riscv_vredmin_vs_i16m4_i16m1_tu(__VA_ARGS__)
+#define vredmin_vs_i16m8_i16m1(...) __riscv_vredmin_vs_i16m8_i16m1_tu(__VA_ARGS__)
+#define vredmin_vs_i32mf2_i32m1(...) __riscv_vredmin_vs_i32mf2_i32m1_tu(__VA_ARGS__)
+#define vredmin_vs_i32m1_i32m1(...) __riscv_vredmin_vs_i32m1_i32m1_tu(__VA_ARGS__)
+#define vredmin_vs_i32m2_i32m1(...) __riscv_vredmin_vs_i32m2_i32m1_tu(__VA_ARGS__)
+#define vredmin_vs_i32m4_i32m1(...) __riscv_vredmin_vs_i32m4_i32m1_tu(__VA_ARGS__)
+#define vredmin_vs_i32m8_i32m1(...) __riscv_vredmin_vs_i32m8_i32m1_tu(__VA_ARGS__)
+#define vredmin_vs_i64m1_i64m1(...) __riscv_vredmin_vs_i64m1_i64m1_tu(__VA_ARGS__)
+#define vredmin_vs_i64m2_i64m1(...) __riscv_vredmin_vs_i64m2_i64m1_tu(__VA_ARGS__)
+#define vredmin_vs_i64m4_i64m1(...) __riscv_vredmin_vs_i64m4_i64m1_tu(__VA_ARGS__)
+#define vredmin_vs_i64m8_i64m1(...) __riscv_vredmin_vs_i64m8_i64m1_tu(__VA_ARGS__)
+#define vredand_vs_i8mf8_i8m1(...) __riscv_vredand_vs_i8mf8_i8m1_tu(__VA_ARGS__)
+#define vredand_vs_i8mf4_i8m1(...) __riscv_vredand_vs_i8mf4_i8m1_tu(__VA_ARGS__)
+#define vredand_vs_i8mf2_i8m1(...) __riscv_vredand_vs_i8mf2_i8m1_tu(__VA_ARGS__)
+#define vredand_vs_i8m1_i8m1(...) __riscv_vredand_vs_i8m1_i8m1_tu(__VA_ARGS__)
+#define vredand_vs_i8m2_i8m1(...) __riscv_vredand_vs_i8m2_i8m1_tu(__VA_ARGS__)
+#define vredand_vs_i8m4_i8m1(...) __riscv_vredand_vs_i8m4_i8m1_tu(__VA_ARGS__)
+#define vredand_vs_i8m8_i8m1(...) __riscv_vredand_vs_i8m8_i8m1_tu(__VA_ARGS__)
+#define vredand_vs_i16mf4_i16m1(...) __riscv_vredand_vs_i16mf4_i16m1_tu(__VA_ARGS__)
+#define vredand_vs_i16mf2_i16m1(...) __riscv_vredand_vs_i16mf2_i16m1_tu(__VA_ARGS__)
+#define vredand_vs_i16m1_i16m1(...) __riscv_vredand_vs_i16m1_i16m1_tu(__VA_ARGS__)
+#define vredand_vs_i16m2_i16m1(...) __riscv_vredand_vs_i16m2_i16m1_tu(__VA_ARGS__)
+#define vredand_vs_i16m4_i16m1(...) __riscv_vredand_vs_i16m4_i16m1_tu(__VA_ARGS__)
+#define vredand_vs_i16m8_i16m1(...) __riscv_vredand_vs_i16m8_i16m1_tu(__VA_ARGS__)
+#define vredand_vs_i32mf2_i32m1(...) __riscv_vredand_vs_i32mf2_i32m1_tu(__VA_ARGS__)
+#define vredand_vs_i32m1_i32m1(...) __riscv_vredand_vs_i32m1_i32m1_tu(__VA_ARGS__)
+#define vredand_vs_i32m2_i32m1(...) __riscv_vredand_vs_i32m2_i32m1_tu(__VA_ARGS__)
+#define vredand_vs_i32m4_i32m1(...) __riscv_vredand_vs_i32m4_i32m1_tu(__VA_ARGS__)
+#define vredand_vs_i32m8_i32m1(...) __riscv_vredand_vs_i32m8_i32m1_tu(__VA_ARGS__)
+#define vredand_vs_i64m1_i64m1(...) __riscv_vredand_vs_i64m1_i64m1_tu(__VA_ARGS__)
+#define vredand_vs_i64m2_i64m1(...) __riscv_vredand_vs_i64m2_i64m1_tu(__VA_ARGS__)
+#define vredand_vs_i64m4_i64m1(...) __riscv_vredand_vs_i64m4_i64m1_tu(__VA_ARGS__)
+#define vredand_vs_i64m8_i64m1(...) __riscv_vredand_vs_i64m8_i64m1_tu(__VA_ARGS__)
+#define vredor_vs_i8mf8_i8m1(...) __riscv_vredor_vs_i8mf8_i8m1_tu(__VA_ARGS__)
+#define vredor_vs_i8mf4_i8m1(...) __riscv_vredor_vs_i8mf4_i8m1_tu(__VA_ARGS__)
+#define vredor_vs_i8mf2_i8m1(...) __riscv_vredor_vs_i8mf2_i8m1_tu(__VA_ARGS__)
+#define vredor_vs_i8m1_i8m1(...) __riscv_vredor_vs_i8m1_i8m1_tu(__VA_ARGS__)
+#define vredor_vs_i8m2_i8m1(...) __riscv_vredor_vs_i8m2_i8m1_tu(__VA_ARGS__)
+#define vredor_vs_i8m4_i8m1(...) __riscv_vredor_vs_i8m4_i8m1_tu(__VA_ARGS__)
+#define vredor_vs_i8m8_i8m1(...) __riscv_vredor_vs_i8m8_i8m1_tu(__VA_ARGS__)
+#define vredor_vs_i16mf4_i16m1(...) __riscv_vredor_vs_i16mf4_i16m1_tu(__VA_ARGS__)
+#define vredor_vs_i16mf2_i16m1(...) __riscv_vredor_vs_i16mf2_i16m1_tu(__VA_ARGS__)
+#define vredor_vs_i16m1_i16m1(...) __riscv_vredor_vs_i16m1_i16m1_tu(__VA_ARGS__)
+#define vredor_vs_i16m2_i16m1(...) __riscv_vredor_vs_i16m2_i16m1_tu(__VA_ARGS__)
+#define vredor_vs_i16m4_i16m1(...) __riscv_vredor_vs_i16m4_i16m1_tu(__VA_ARGS__)
+#define vredor_vs_i16m8_i16m1(...) __riscv_vredor_vs_i16m8_i16m1_tu(__VA_ARGS__)
+#define vredor_vs_i32mf2_i32m1(...) __riscv_vredor_vs_i32mf2_i32m1_tu(__VA_ARGS__)
+#define vredor_vs_i32m1_i32m1(...) __riscv_vredor_vs_i32m1_i32m1_tu(__VA_ARGS__)
+#define vredor_vs_i32m2_i32m1(...) __riscv_vredor_vs_i32m2_i32m1_tu(__VA_ARGS__)
+#define vredor_vs_i32m4_i32m1(...) __riscv_vredor_vs_i32m4_i32m1_tu(__VA_ARGS__)
+#define vredor_vs_i32m8_i32m1(...) __riscv_vredor_vs_i32m8_i32m1_tu(__VA_ARGS__)
+#define vredor_vs_i64m1_i64m1(...) __riscv_vredor_vs_i64m1_i64m1_tu(__VA_ARGS__)
+#define vredor_vs_i64m2_i64m1(...) __riscv_vredor_vs_i64m2_i64m1_tu(__VA_ARGS__)
+#define vredor_vs_i64m4_i64m1(...) __riscv_vredor_vs_i64m4_i64m1_tu(__VA_ARGS__)
+#define vredor_vs_i64m8_i64m1(...) __riscv_vredor_vs_i64m8_i64m1_tu(__VA_ARGS__)
+#define vredxor_vs_i8mf8_i8m1(...) __riscv_vredxor_vs_i8mf8_i8m1_tu(__VA_ARGS__)
+#define vredxor_vs_i8mf4_i8m1(...) __riscv_vredxor_vs_i8mf4_i8m1_tu(__VA_ARGS__)
+#define vredxor_vs_i8mf2_i8m1(...) __riscv_vredxor_vs_i8mf2_i8m1_tu(__VA_ARGS__)
+#define vredxor_vs_i8m1_i8m1(...) __riscv_vredxor_vs_i8m1_i8m1_tu(__VA_ARGS__)
+#define vredxor_vs_i8m2_i8m1(...) __riscv_vredxor_vs_i8m2_i8m1_tu(__VA_ARGS__)
+#define vredxor_vs_i8m4_i8m1(...) __riscv_vredxor_vs_i8m4_i8m1_tu(__VA_ARGS__)
+#define vredxor_vs_i8m8_i8m1(...) __riscv_vredxor_vs_i8m8_i8m1_tu(__VA_ARGS__)
+#define vredxor_vs_i16mf4_i16m1(...) __riscv_vredxor_vs_i16mf4_i16m1_tu(__VA_ARGS__)
+#define vredxor_vs_i16mf2_i16m1(...) __riscv_vredxor_vs_i16mf2_i16m1_tu(__VA_ARGS__)
+#define vredxor_vs_i16m1_i16m1(...) __riscv_vredxor_vs_i16m1_i16m1_tu(__VA_ARGS__)
+#define vredxor_vs_i16m2_i16m1(...) __riscv_vredxor_vs_i16m2_i16m1_tu(__VA_ARGS__)
+#define vredxor_vs_i16m4_i16m1(...) __riscv_vredxor_vs_i16m4_i16m1_tu(__VA_ARGS__)
+#define vredxor_vs_i16m8_i16m1(...) __riscv_vredxor_vs_i16m8_i16m1_tu(__VA_ARGS__)
+#define vredxor_vs_i32mf2_i32m1(...) __riscv_vredxor_vs_i32mf2_i32m1_tu(__VA_ARGS__)
+#define vredxor_vs_i32m1_i32m1(...) __riscv_vredxor_vs_i32m1_i32m1_tu(__VA_ARGS__)
+#define vredxor_vs_i32m2_i32m1(...) __riscv_vredxor_vs_i32m2_i32m1_tu(__VA_ARGS__)
+#define vredxor_vs_i32m4_i32m1(...) __riscv_vredxor_vs_i32m4_i32m1_tu(__VA_ARGS__)
+#define vredxor_vs_i32m8_i32m1(...) __riscv_vredxor_vs_i32m8_i32m1_tu(__VA_ARGS__)
+#define vredxor_vs_i64m1_i64m1(...) __riscv_vredxor_vs_i64m1_i64m1_tu(__VA_ARGS__)
+#define vredxor_vs_i64m2_i64m1(...) __riscv_vredxor_vs_i64m2_i64m1_tu(__VA_ARGS__)
+#define vredxor_vs_i64m4_i64m1(...) __riscv_vredxor_vs_i64m4_i64m1_tu(__VA_ARGS__)
+#define vredxor_vs_i64m8_i64m1(...) __riscv_vredxor_vs_i64m8_i64m1_tu(__VA_ARGS__)
+#define vredsum_vs_u8mf8_u8m1(...) __riscv_vredsum_vs_u8mf8_u8m1_tu(__VA_ARGS__) // start of unsigned group: vred*_vs_u* names (no _m suffix) forward their arguments to the matching __riscv_*_tu intrinsics
+#define vredsum_vs_u8mf4_u8m1(...) __riscv_vredsum_vs_u8mf4_u8m1_tu(__VA_ARGS__)
+#define vredsum_vs_u8mf2_u8m1(...) __riscv_vredsum_vs_u8mf2_u8m1_tu(__VA_ARGS__)
+#define vredsum_vs_u8m1_u8m1(...) __riscv_vredsum_vs_u8m1_u8m1_tu(__VA_ARGS__)
+#define vredsum_vs_u8m2_u8m1(...) __riscv_vredsum_vs_u8m2_u8m1_tu(__VA_ARGS__)
+#define vredsum_vs_u8m4_u8m1(...) __riscv_vredsum_vs_u8m4_u8m1_tu(__VA_ARGS__)
+#define vredsum_vs_u8m8_u8m1(...) __riscv_vredsum_vs_u8m8_u8m1_tu(__VA_ARGS__)
+#define vredsum_vs_u16mf4_u16m1(...) __riscv_vredsum_vs_u16mf4_u16m1_tu(__VA_ARGS__)
+#define vredsum_vs_u16mf2_u16m1(...) __riscv_vredsum_vs_u16mf2_u16m1_tu(__VA_ARGS__)
+#define vredsum_vs_u16m1_u16m1(...) __riscv_vredsum_vs_u16m1_u16m1_tu(__VA_ARGS__)
+#define vredsum_vs_u16m2_u16m1(...) __riscv_vredsum_vs_u16m2_u16m1_tu(__VA_ARGS__)
+#define vredsum_vs_u16m4_u16m1(...) __riscv_vredsum_vs_u16m4_u16m1_tu(__VA_ARGS__)
+#define vredsum_vs_u16m8_u16m1(...) __riscv_vredsum_vs_u16m8_u16m1_tu(__VA_ARGS__)
+#define vredsum_vs_u32mf2_u32m1(...) __riscv_vredsum_vs_u32mf2_u32m1_tu(__VA_ARGS__)
+#define vredsum_vs_u32m1_u32m1(...) __riscv_vredsum_vs_u32m1_u32m1_tu(__VA_ARGS__)
+#define vredsum_vs_u32m2_u32m1(...) __riscv_vredsum_vs_u32m2_u32m1_tu(__VA_ARGS__)
+#define vredsum_vs_u32m4_u32m1(...) __riscv_vredsum_vs_u32m4_u32m1_tu(__VA_ARGS__)
+#define vredsum_vs_u32m8_u32m1(...) __riscv_vredsum_vs_u32m8_u32m1_tu(__VA_ARGS__)
+#define vredsum_vs_u64m1_u64m1(...) __riscv_vredsum_vs_u64m1_u64m1_tu(__VA_ARGS__)
+#define vredsum_vs_u64m2_u64m1(...) __riscv_vredsum_vs_u64m2_u64m1_tu(__VA_ARGS__)
+#define vredsum_vs_u64m4_u64m1(...) __riscv_vredsum_vs_u64m4_u64m1_tu(__VA_ARGS__)
+#define vredsum_vs_u64m8_u64m1(...) __riscv_vredsum_vs_u64m8_u64m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u8mf8_u8m1(...) __riscv_vredmaxu_vs_u8mf8_u8m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u8mf4_u8m1(...) __riscv_vredmaxu_vs_u8mf4_u8m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u8mf2_u8m1(...) __riscv_vredmaxu_vs_u8mf2_u8m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u8m1_u8m1(...) __riscv_vredmaxu_vs_u8m1_u8m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u8m2_u8m1(...) __riscv_vredmaxu_vs_u8m2_u8m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u8m4_u8m1(...) __riscv_vredmaxu_vs_u8m4_u8m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u8m8_u8m1(...) __riscv_vredmaxu_vs_u8m8_u8m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u16mf4_u16m1(...) __riscv_vredmaxu_vs_u16mf4_u16m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u16mf2_u16m1(...) __riscv_vredmaxu_vs_u16mf2_u16m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u16m1_u16m1(...) __riscv_vredmaxu_vs_u16m1_u16m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u16m2_u16m1(...) __riscv_vredmaxu_vs_u16m2_u16m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u16m4_u16m1(...) __riscv_vredmaxu_vs_u16m4_u16m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u16m8_u16m1(...) __riscv_vredmaxu_vs_u16m8_u16m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u32mf2_u32m1(...) __riscv_vredmaxu_vs_u32mf2_u32m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u32m1_u32m1(...) __riscv_vredmaxu_vs_u32m1_u32m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u32m2_u32m1(...) __riscv_vredmaxu_vs_u32m2_u32m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u32m4_u32m1(...) __riscv_vredmaxu_vs_u32m4_u32m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u32m8_u32m1(...) __riscv_vredmaxu_vs_u32m8_u32m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u64m1_u64m1(...) __riscv_vredmaxu_vs_u64m1_u64m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u64m2_u64m1(...) __riscv_vredmaxu_vs_u64m2_u64m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u64m4_u64m1(...) __riscv_vredmaxu_vs_u64m4_u64m1_tu(__VA_ARGS__)
+#define vredmaxu_vs_u64m8_u64m1(...) __riscv_vredmaxu_vs_u64m8_u64m1_tu(__VA_ARGS__)
+#define vredminu_vs_u8mf8_u8m1(...) __riscv_vredminu_vs_u8mf8_u8m1_tu(__VA_ARGS__)
+#define vredminu_vs_u8mf4_u8m1(...) __riscv_vredminu_vs_u8mf4_u8m1_tu(__VA_ARGS__)
+#define vredminu_vs_u8mf2_u8m1(...) __riscv_vredminu_vs_u8mf2_u8m1_tu(__VA_ARGS__)
+#define vredminu_vs_u8m1_u8m1(...) __riscv_vredminu_vs_u8m1_u8m1_tu(__VA_ARGS__)
+#define vredminu_vs_u8m2_u8m1(...) __riscv_vredminu_vs_u8m2_u8m1_tu(__VA_ARGS__)
+#define vredminu_vs_u8m4_u8m1(...) __riscv_vredminu_vs_u8m4_u8m1_tu(__VA_ARGS__)
+#define vredminu_vs_u8m8_u8m1(...) __riscv_vredminu_vs_u8m8_u8m1_tu(__VA_ARGS__)
+#define vredminu_vs_u16mf4_u16m1(...) __riscv_vredminu_vs_u16mf4_u16m1_tu(__VA_ARGS__)
+#define vredminu_vs_u16mf2_u16m1(...) __riscv_vredminu_vs_u16mf2_u16m1_tu(__VA_ARGS__)
+#define vredminu_vs_u16m1_u16m1(...) __riscv_vredminu_vs_u16m1_u16m1_tu(__VA_ARGS__)
+#define vredminu_vs_u16m2_u16m1(...) __riscv_vredminu_vs_u16m2_u16m1_tu(__VA_ARGS__)
+#define vredminu_vs_u16m4_u16m1(...) __riscv_vredminu_vs_u16m4_u16m1_tu(__VA_ARGS__)
+#define vredminu_vs_u16m8_u16m1(...) __riscv_vredminu_vs_u16m8_u16m1_tu(__VA_ARGS__)
+#define vredminu_vs_u32mf2_u32m1(...) __riscv_vredminu_vs_u32mf2_u32m1_tu(__VA_ARGS__)
+#define vredminu_vs_u32m1_u32m1(...) __riscv_vredminu_vs_u32m1_u32m1_tu(__VA_ARGS__)
+#define vredminu_vs_u32m2_u32m1(...) __riscv_vredminu_vs_u32m2_u32m1_tu(__VA_ARGS__)
+#define vredminu_vs_u32m4_u32m1(...) __riscv_vredminu_vs_u32m4_u32m1_tu(__VA_ARGS__)
+#define vredminu_vs_u32m8_u32m1(...) __riscv_vredminu_vs_u32m8_u32m1_tu(__VA_ARGS__)
+#define vredminu_vs_u64m1_u64m1(...) __riscv_vredminu_vs_u64m1_u64m1_tu(__VA_ARGS__)
+#define vredminu_vs_u64m2_u64m1(...) __riscv_vredminu_vs_u64m2_u64m1_tu(__VA_ARGS__)
+#define vredminu_vs_u64m4_u64m1(...) __riscv_vredminu_vs_u64m4_u64m1_tu(__VA_ARGS__)
+#define vredminu_vs_u64m8_u64m1(...) __riscv_vredminu_vs_u64m8_u64m1_tu(__VA_ARGS__)
+#define vredand_vs_u8mf8_u8m1(...) __riscv_vredand_vs_u8mf8_u8m1_tu(__VA_ARGS__)
+#define vredand_vs_u8mf4_u8m1(...) __riscv_vredand_vs_u8mf4_u8m1_tu(__VA_ARGS__)
+#define vredand_vs_u8mf2_u8m1(...) __riscv_vredand_vs_u8mf2_u8m1_tu(__VA_ARGS__)
+#define vredand_vs_u8m1_u8m1(...) __riscv_vredand_vs_u8m1_u8m1_tu(__VA_ARGS__)
+#define vredand_vs_u8m2_u8m1(...) __riscv_vredand_vs_u8m2_u8m1_tu(__VA_ARGS__)
+#define vredand_vs_u8m4_u8m1(...) __riscv_vredand_vs_u8m4_u8m1_tu(__VA_ARGS__)
+#define vredand_vs_u8m8_u8m1(...) __riscv_vredand_vs_u8m8_u8m1_tu(__VA_ARGS__)
+#define vredand_vs_u16mf4_u16m1(...) __riscv_vredand_vs_u16mf4_u16m1_tu(__VA_ARGS__)
+#define vredand_vs_u16mf2_u16m1(...) __riscv_vredand_vs_u16mf2_u16m1_tu(__VA_ARGS__)
+#define vredand_vs_u16m1_u16m1(...) __riscv_vredand_vs_u16m1_u16m1_tu(__VA_ARGS__)
+#define vredand_vs_u16m2_u16m1(...) __riscv_vredand_vs_u16m2_u16m1_tu(__VA_ARGS__)
+#define vredand_vs_u16m4_u16m1(...) __riscv_vredand_vs_u16m4_u16m1_tu(__VA_ARGS__)
+#define vredand_vs_u16m8_u16m1(...) __riscv_vredand_vs_u16m8_u16m1_tu(__VA_ARGS__)
+#define vredand_vs_u32mf2_u32m1(...) __riscv_vredand_vs_u32mf2_u32m1_tu(__VA_ARGS__)
+#define vredand_vs_u32m1_u32m1(...) __riscv_vredand_vs_u32m1_u32m1_tu(__VA_ARGS__)
+#define vredand_vs_u32m2_u32m1(...) __riscv_vredand_vs_u32m2_u32m1_tu(__VA_ARGS__)
+#define vredand_vs_u32m4_u32m1(...) __riscv_vredand_vs_u32m4_u32m1_tu(__VA_ARGS__)
+#define vredand_vs_u32m8_u32m1(...) __riscv_vredand_vs_u32m8_u32m1_tu(__VA_ARGS__)
+#define vredand_vs_u64m1_u64m1(...) __riscv_vredand_vs_u64m1_u64m1_tu(__VA_ARGS__)
+#define vredand_vs_u64m2_u64m1(...) __riscv_vredand_vs_u64m2_u64m1_tu(__VA_ARGS__)
+#define vredand_vs_u64m4_u64m1(...) __riscv_vredand_vs_u64m4_u64m1_tu(__VA_ARGS__)
+#define vredand_vs_u64m8_u64m1(...) __riscv_vredand_vs_u64m8_u64m1_tu(__VA_ARGS__)
+#define vredor_vs_u8mf8_u8m1(...) __riscv_vredor_vs_u8mf8_u8m1_tu(__VA_ARGS__)
+#define vredor_vs_u8mf4_u8m1(...) __riscv_vredor_vs_u8mf4_u8m1_tu(__VA_ARGS__)
+#define vredor_vs_u8mf2_u8m1(...) __riscv_vredor_vs_u8mf2_u8m1_tu(__VA_ARGS__)
+#define vredor_vs_u8m1_u8m1(...) __riscv_vredor_vs_u8m1_u8m1_tu(__VA_ARGS__)
+#define vredor_vs_u8m2_u8m1(...) __riscv_vredor_vs_u8m2_u8m1_tu(__VA_ARGS__)
+#define vredor_vs_u8m4_u8m1(...) __riscv_vredor_vs_u8m4_u8m1_tu(__VA_ARGS__)
+#define vredor_vs_u8m8_u8m1(...) __riscv_vredor_vs_u8m8_u8m1_tu(__VA_ARGS__)
+#define vredor_vs_u16mf4_u16m1(...) __riscv_vredor_vs_u16mf4_u16m1_tu(__VA_ARGS__)
+#define vredor_vs_u16mf2_u16m1(...) __riscv_vredor_vs_u16mf2_u16m1_tu(__VA_ARGS__)
+#define vredor_vs_u16m1_u16m1(...) __riscv_vredor_vs_u16m1_u16m1_tu(__VA_ARGS__)
+#define vredor_vs_u16m2_u16m1(...) __riscv_vredor_vs_u16m2_u16m1_tu(__VA_ARGS__)
+#define vredor_vs_u16m4_u16m1(...) __riscv_vredor_vs_u16m4_u16m1_tu(__VA_ARGS__)
+#define vredor_vs_u16m8_u16m1(...) __riscv_vredor_vs_u16m8_u16m1_tu(__VA_ARGS__)
+#define vredor_vs_u32mf2_u32m1(...) __riscv_vredor_vs_u32mf2_u32m1_tu(__VA_ARGS__)
+#define vredor_vs_u32m1_u32m1(...) __riscv_vredor_vs_u32m1_u32m1_tu(__VA_ARGS__)
+#define vredor_vs_u32m2_u32m1(...) __riscv_vredor_vs_u32m2_u32m1_tu(__VA_ARGS__)
+#define vredor_vs_u32m4_u32m1(...) __riscv_vredor_vs_u32m4_u32m1_tu(__VA_ARGS__)
+#define vredor_vs_u32m8_u32m1(...) __riscv_vredor_vs_u32m8_u32m1_tu(__VA_ARGS__)
+#define vredor_vs_u64m1_u64m1(...) __riscv_vredor_vs_u64m1_u64m1_tu(__VA_ARGS__)
+#define vredor_vs_u64m2_u64m1(...) __riscv_vredor_vs_u64m2_u64m1_tu(__VA_ARGS__)
+#define vredor_vs_u64m4_u64m1(...) __riscv_vredor_vs_u64m4_u64m1_tu(__VA_ARGS__)
+#define vredor_vs_u64m8_u64m1(...) __riscv_vredor_vs_u64m8_u64m1_tu(__VA_ARGS__)
+#define vredxor_vs_u8mf8_u8m1(...) __riscv_vredxor_vs_u8mf8_u8m1_tu(__VA_ARGS__)
+#define vredxor_vs_u8mf4_u8m1(...) __riscv_vredxor_vs_u8mf4_u8m1_tu(__VA_ARGS__)
+#define vredxor_vs_u8mf2_u8m1(...) __riscv_vredxor_vs_u8mf2_u8m1_tu(__VA_ARGS__)
+#define vredxor_vs_u8m1_u8m1(...) __riscv_vredxor_vs_u8m1_u8m1_tu(__VA_ARGS__)
+#define vredxor_vs_u8m2_u8m1(...) __riscv_vredxor_vs_u8m2_u8m1_tu(__VA_ARGS__)
+#define vredxor_vs_u8m4_u8m1(...) __riscv_vredxor_vs_u8m4_u8m1_tu(__VA_ARGS__)
+#define vredxor_vs_u8m8_u8m1(...) __riscv_vredxor_vs_u8m8_u8m1_tu(__VA_ARGS__)
+#define vredxor_vs_u16mf4_u16m1(...) __riscv_vredxor_vs_u16mf4_u16m1_tu(__VA_ARGS__)
+#define vredxor_vs_u16mf2_u16m1(...) __riscv_vredxor_vs_u16mf2_u16m1_tu(__VA_ARGS__)
+#define vredxor_vs_u16m1_u16m1(...) __riscv_vredxor_vs_u16m1_u16m1_tu(__VA_ARGS__)
+#define vredxor_vs_u16m2_u16m1(...) __riscv_vredxor_vs_u16m2_u16m1_tu(__VA_ARGS__)
+#define vredxor_vs_u16m4_u16m1(...) __riscv_vredxor_vs_u16m4_u16m1_tu(__VA_ARGS__)
+#define vredxor_vs_u16m8_u16m1(...) __riscv_vredxor_vs_u16m8_u16m1_tu(__VA_ARGS__)
+#define vredxor_vs_u32mf2_u32m1(...) __riscv_vredxor_vs_u32mf2_u32m1_tu(__VA_ARGS__)
+#define vredxor_vs_u32m1_u32m1(...) __riscv_vredxor_vs_u32m1_u32m1_tu(__VA_ARGS__)
+#define vredxor_vs_u32m2_u32m1(...) __riscv_vredxor_vs_u32m2_u32m1_tu(__VA_ARGS__)
+#define vredxor_vs_u32m4_u32m1(...) __riscv_vredxor_vs_u32m4_u32m1_tu(__VA_ARGS__)
+#define vredxor_vs_u32m8_u32m1(...) __riscv_vredxor_vs_u32m8_u32m1_tu(__VA_ARGS__)
+#define vredxor_vs_u64m1_u64m1(...) __riscv_vredxor_vs_u64m1_u64m1_tu(__VA_ARGS__)
+#define vredxor_vs_u64m2_u64m1(...) __riscv_vredxor_vs_u64m2_u64m1_tu(__VA_ARGS__)
+#define vredxor_vs_u64m4_u64m1(...) __riscv_vredxor_vs_u64m4_u64m1_tu(__VA_ARGS__)
+#define vredxor_vs_u64m8_u64m1(...) __riscv_vredxor_vs_u64m8_u64m1_tu(__VA_ARGS__)
+// masked functions
+#define vredsum_vs_i8mf8_i8m1_m(...) __riscv_vredsum_vs_i8mf8_i8m1_tum(__VA_ARGS__)
+#define vredsum_vs_i8mf4_i8m1_m(...) __riscv_vredsum_vs_i8mf4_i8m1_tum(__VA_ARGS__)
+#define vredsum_vs_i8mf2_i8m1_m(...) __riscv_vredsum_vs_i8mf2_i8m1_tum(__VA_ARGS__)
+#define vredsum_vs_i8m1_i8m1_m(...) __riscv_vredsum_vs_i8m1_i8m1_tum(__VA_ARGS__)
+#define vredsum_vs_i8m2_i8m1_m(...) __riscv_vredsum_vs_i8m2_i8m1_tum(__VA_ARGS__)
+#define vredsum_vs_i8m4_i8m1_m(...) __riscv_vredsum_vs_i8m4_i8m1_tum(__VA_ARGS__)
+#define vredsum_vs_i8m8_i8m1_m(...) __riscv_vredsum_vs_i8m8_i8m1_tum(__VA_ARGS__)
+#define vredsum_vs_i16mf4_i16m1_m(...) __riscv_vredsum_vs_i16mf4_i16m1_tum(__VA_ARGS__)
+#define vredsum_vs_i16mf2_i16m1_m(...) __riscv_vredsum_vs_i16mf2_i16m1_tum(__VA_ARGS__)
+#define vredsum_vs_i16m1_i16m1_m(...) __riscv_vredsum_vs_i16m1_i16m1_tum(__VA_ARGS__)
+#define vredsum_vs_i16m2_i16m1_m(...) __riscv_vredsum_vs_i16m2_i16m1_tum(__VA_ARGS__)
+#define vredsum_vs_i16m4_i16m1_m(...) __riscv_vredsum_vs_i16m4_i16m1_tum(__VA_ARGS__)
+#define vredsum_vs_i16m8_i16m1_m(...) __riscv_vredsum_vs_i16m8_i16m1_tum(__VA_ARGS__)
+#define vredsum_vs_i32mf2_i32m1_m(...) __riscv_vredsum_vs_i32mf2_i32m1_tum(__VA_ARGS__)
+#define vredsum_vs_i32m1_i32m1_m(...) __riscv_vredsum_vs_i32m1_i32m1_tum(__VA_ARGS__)
+#define vredsum_vs_i32m2_i32m1_m(...) __riscv_vredsum_vs_i32m2_i32m1_tum(__VA_ARGS__)
+#define vredsum_vs_i32m4_i32m1_m(...) __riscv_vredsum_vs_i32m4_i32m1_tum(__VA_ARGS__)
+#define vredsum_vs_i32m8_i32m1_m(...) __riscv_vredsum_vs_i32m8_i32m1_tum(__VA_ARGS__)
+#define vredsum_vs_i64m1_i64m1_m(...) __riscv_vredsum_vs_i64m1_i64m1_tum(__VA_ARGS__)
+#define vredsum_vs_i64m2_i64m1_m(...) __riscv_vredsum_vs_i64m2_i64m1_tum(__VA_ARGS__)
+#define vredsum_vs_i64m4_i64m1_m(...) __riscv_vredsum_vs_i64m4_i64m1_tum(__VA_ARGS__)
+#define vredsum_vs_i64m8_i64m1_m(...) __riscv_vredsum_vs_i64m8_i64m1_tum(__VA_ARGS__)
+#define vredmax_vs_i8mf8_i8m1_m(...) __riscv_vredmax_vs_i8mf8_i8m1_tum(__VA_ARGS__)
+#define vredmax_vs_i8mf4_i8m1_m(...) __riscv_vredmax_vs_i8mf4_i8m1_tum(__VA_ARGS__)
+#define vredmax_vs_i8mf2_i8m1_m(...) __riscv_vredmax_vs_i8mf2_i8m1_tum(__VA_ARGS__)
+#define vredmax_vs_i8m1_i8m1_m(...) __riscv_vredmax_vs_i8m1_i8m1_tum(__VA_ARGS__)
+#define vredmax_vs_i8m2_i8m1_m(...) __riscv_vredmax_vs_i8m2_i8m1_tum(__VA_ARGS__)
+#define vredmax_vs_i8m4_i8m1_m(...) __riscv_vredmax_vs_i8m4_i8m1_tum(__VA_ARGS__)
+#define vredmax_vs_i8m8_i8m1_m(...) __riscv_vredmax_vs_i8m8_i8m1_tum(__VA_ARGS__)
+#define vredmax_vs_i16mf4_i16m1_m(...) __riscv_vredmax_vs_i16mf4_i16m1_tum(__VA_ARGS__)
+#define vredmax_vs_i16mf2_i16m1_m(...) __riscv_vredmax_vs_i16mf2_i16m1_tum(__VA_ARGS__)
+#define vredmax_vs_i16m1_i16m1_m(...) __riscv_vredmax_vs_i16m1_i16m1_tum(__VA_ARGS__)
+#define vredmax_vs_i16m2_i16m1_m(...) __riscv_vredmax_vs_i16m2_i16m1_tum(__VA_ARGS__)
+#define vredmax_vs_i16m4_i16m1_m(...) __riscv_vredmax_vs_i16m4_i16m1_tum(__VA_ARGS__)
+#define vredmax_vs_i16m8_i16m1_m(...) __riscv_vredmax_vs_i16m8_i16m1_tum(__VA_ARGS__)
+#define vredmax_vs_i32mf2_i32m1_m(...) __riscv_vredmax_vs_i32mf2_i32m1_tum(__VA_ARGS__)
+#define vredmax_vs_i32m1_i32m1_m(...) __riscv_vredmax_vs_i32m1_i32m1_tum(__VA_ARGS__)
+#define vredmax_vs_i32m2_i32m1_m(...) __riscv_vredmax_vs_i32m2_i32m1_tum(__VA_ARGS__)
+#define vredmax_vs_i32m4_i32m1_m(...) __riscv_vredmax_vs_i32m4_i32m1_tum(__VA_ARGS__)
+#define vredmax_vs_i32m8_i32m1_m(...) __riscv_vredmax_vs_i32m8_i32m1_tum(__VA_ARGS__)
+#define vredmax_vs_i64m1_i64m1_m(...) __riscv_vredmax_vs_i64m1_i64m1_tum(__VA_ARGS__)
+#define vredmax_vs_i64m2_i64m1_m(...) __riscv_vredmax_vs_i64m2_i64m1_tum(__VA_ARGS__)
+#define vredmax_vs_i64m4_i64m1_m(...) __riscv_vredmax_vs_i64m4_i64m1_tum(__VA_ARGS__)
+#define vredmax_vs_i64m8_i64m1_m(...) __riscv_vredmax_vs_i64m8_i64m1_tum(__VA_ARGS__)
+#define vredmin_vs_i8mf8_i8m1_m(...) __riscv_vredmin_vs_i8mf8_i8m1_tum(__VA_ARGS__)
+#define vredmin_vs_i8mf4_i8m1_m(...) __riscv_vredmin_vs_i8mf4_i8m1_tum(__VA_ARGS__)
+#define vredmin_vs_i8mf2_i8m1_m(...) __riscv_vredmin_vs_i8mf2_i8m1_tum(__VA_ARGS__)
+#define vredmin_vs_i8m1_i8m1_m(...) __riscv_vredmin_vs_i8m1_i8m1_tum(__VA_ARGS__)
+#define vredmin_vs_i8m2_i8m1_m(...) __riscv_vredmin_vs_i8m2_i8m1_tum(__VA_ARGS__)
+#define vredmin_vs_i8m4_i8m1_m(...) __riscv_vredmin_vs_i8m4_i8m1_tum(__VA_ARGS__)
+#define vredmin_vs_i8m8_i8m1_m(...) __riscv_vredmin_vs_i8m8_i8m1_tum(__VA_ARGS__)
+#define vredmin_vs_i16mf4_i16m1_m(...) __riscv_vredmin_vs_i16mf4_i16m1_tum(__VA_ARGS__)
+#define vredmin_vs_i16mf2_i16m1_m(...) __riscv_vredmin_vs_i16mf2_i16m1_tum(__VA_ARGS__)
+#define vredmin_vs_i16m1_i16m1_m(...) __riscv_vredmin_vs_i16m1_i16m1_tum(__VA_ARGS__)
+#define vredmin_vs_i16m2_i16m1_m(...) __riscv_vredmin_vs_i16m2_i16m1_tum(__VA_ARGS__)
+#define vredmin_vs_i16m4_i16m1_m(...) __riscv_vredmin_vs_i16m4_i16m1_tum(__VA_ARGS__)
+#define vredmin_vs_i16m8_i16m1_m(...) __riscv_vredmin_vs_i16m8_i16m1_tum(__VA_ARGS__)
+#define vredmin_vs_i32mf2_i32m1_m(...) __riscv_vredmin_vs_i32mf2_i32m1_tum(__VA_ARGS__)
+#define vredmin_vs_i32m1_i32m1_m(...) __riscv_vredmin_vs_i32m1_i32m1_tum(__VA_ARGS__)
+#define vredmin_vs_i32m2_i32m1_m(...) __riscv_vredmin_vs_i32m2_i32m1_tum(__VA_ARGS__)
+#define vredmin_vs_i32m4_i32m1_m(...) __riscv_vredmin_vs_i32m4_i32m1_tum(__VA_ARGS__)
+#define vredmin_vs_i32m8_i32m1_m(...) __riscv_vredmin_vs_i32m8_i32m1_tum(__VA_ARGS__)
+#define vredmin_vs_i64m1_i64m1_m(...) __riscv_vredmin_vs_i64m1_i64m1_tum(__VA_ARGS__)
+#define vredmin_vs_i64m2_i64m1_m(...) __riscv_vredmin_vs_i64m2_i64m1_tum(__VA_ARGS__)
+#define vredmin_vs_i64m4_i64m1_m(...) __riscv_vredmin_vs_i64m4_i64m1_tum(__VA_ARGS__)
+#define vredmin_vs_i64m8_i64m1_m(...) __riscv_vredmin_vs_i64m8_i64m1_tum(__VA_ARGS__)
+#define vredand_vs_i8mf8_i8m1_m(...) __riscv_vredand_vs_i8mf8_i8m1_tum(__VA_ARGS__)
+#define vredand_vs_i8mf4_i8m1_m(...) __riscv_vredand_vs_i8mf4_i8m1_tum(__VA_ARGS__)
+#define vredand_vs_i8mf2_i8m1_m(...) __riscv_vredand_vs_i8mf2_i8m1_tum(__VA_ARGS__)
+#define vredand_vs_i8m1_i8m1_m(...) __riscv_vredand_vs_i8m1_i8m1_tum(__VA_ARGS__)
+#define vredand_vs_i8m2_i8m1_m(...) __riscv_vredand_vs_i8m2_i8m1_tum(__VA_ARGS__)
+#define vredand_vs_i8m4_i8m1_m(...) __riscv_vredand_vs_i8m4_i8m1_tum(__VA_ARGS__)
+#define vredand_vs_i8m8_i8m1_m(...) __riscv_vredand_vs_i8m8_i8m1_tum(__VA_ARGS__)
+#define vredand_vs_i16mf4_i16m1_m(...) __riscv_vredand_vs_i16mf4_i16m1_tum(__VA_ARGS__)
+#define vredand_vs_i16mf2_i16m1_m(...) __riscv_vredand_vs_i16mf2_i16m1_tum(__VA_ARGS__)
+#define vredand_vs_i16m1_i16m1_m(...) __riscv_vredand_vs_i16m1_i16m1_tum(__VA_ARGS__)
+#define vredand_vs_i16m2_i16m1_m(...) __riscv_vredand_vs_i16m2_i16m1_tum(__VA_ARGS__)
+#define vredand_vs_i16m4_i16m1_m(...) __riscv_vredand_vs_i16m4_i16m1_tum(__VA_ARGS__)
+#define vredand_vs_i16m8_i16m1_m(...) __riscv_vredand_vs_i16m8_i16m1_tum(__VA_ARGS__)
+#define vredand_vs_i32mf2_i32m1_m(...) __riscv_vredand_vs_i32mf2_i32m1_tum(__VA_ARGS__)
+#define vredand_vs_i32m1_i32m1_m(...) __riscv_vredand_vs_i32m1_i32m1_tum(__VA_ARGS__)
+#define vredand_vs_i32m2_i32m1_m(...) __riscv_vredand_vs_i32m2_i32m1_tum(__VA_ARGS__)
+#define vredand_vs_i32m4_i32m1_m(...) __riscv_vredand_vs_i32m4_i32m1_tum(__VA_ARGS__)
+#define vredand_vs_i32m8_i32m1_m(...) __riscv_vredand_vs_i32m8_i32m1_tum(__VA_ARGS__)
+#define vredand_vs_i64m1_i64m1_m(...) __riscv_vredand_vs_i64m1_i64m1_tum(__VA_ARGS__)
+#define vredand_vs_i64m2_i64m1_m(...) __riscv_vredand_vs_i64m2_i64m1_tum(__VA_ARGS__)
+#define vredand_vs_i64m4_i64m1_m(...) __riscv_vredand_vs_i64m4_i64m1_tum(__VA_ARGS__)
+#define vredand_vs_i64m8_i64m1_m(...) __riscv_vredand_vs_i64m8_i64m1_tum(__VA_ARGS__)
+#define vredor_vs_i8mf8_i8m1_m(...) __riscv_vredor_vs_i8mf8_i8m1_tum(__VA_ARGS__)
+#define vredor_vs_i8mf4_i8m1_m(...) __riscv_vredor_vs_i8mf4_i8m1_tum(__VA_ARGS__)
+#define vredor_vs_i8mf2_i8m1_m(...) __riscv_vredor_vs_i8mf2_i8m1_tum(__VA_ARGS__)
+#define vredor_vs_i8m1_i8m1_m(...) __riscv_vredor_vs_i8m1_i8m1_tum(__VA_ARGS__)
+#define vredor_vs_i8m2_i8m1_m(...) __riscv_vredor_vs_i8m2_i8m1_tum(__VA_ARGS__)
+#define vredor_vs_i8m4_i8m1_m(...) __riscv_vredor_vs_i8m4_i8m1_tum(__VA_ARGS__)
+#define vredor_vs_i8m8_i8m1_m(...) __riscv_vredor_vs_i8m8_i8m1_tum(__VA_ARGS__)
+#define vredor_vs_i16mf4_i16m1_m(...) __riscv_vredor_vs_i16mf4_i16m1_tum(__VA_ARGS__)
+#define vredor_vs_i16mf2_i16m1_m(...) __riscv_vredor_vs_i16mf2_i16m1_tum(__VA_ARGS__)
+#define vredor_vs_i16m1_i16m1_m(...) __riscv_vredor_vs_i16m1_i16m1_tum(__VA_ARGS__)
+#define vredor_vs_i16m2_i16m1_m(...) __riscv_vredor_vs_i16m2_i16m1_tum(__VA_ARGS__)
+#define vredor_vs_i16m4_i16m1_m(...) __riscv_vredor_vs_i16m4_i16m1_tum(__VA_ARGS__)
+#define vredor_vs_i16m8_i16m1_m(...) __riscv_vredor_vs_i16m8_i16m1_tum(__VA_ARGS__)
+#define vredor_vs_i32mf2_i32m1_m(...) __riscv_vredor_vs_i32mf2_i32m1_tum(__VA_ARGS__)
+#define vredor_vs_i32m1_i32m1_m(...) __riscv_vredor_vs_i32m1_i32m1_tum(__VA_ARGS__)
+#define vredor_vs_i32m2_i32m1_m(...) __riscv_vredor_vs_i32m2_i32m1_tum(__VA_ARGS__)
+#define vredor_vs_i32m4_i32m1_m(...) __riscv_vredor_vs_i32m4_i32m1_tum(__VA_ARGS__)
+#define vredor_vs_i32m8_i32m1_m(...) __riscv_vredor_vs_i32m8_i32m1_tum(__VA_ARGS__)
+#define vredor_vs_i64m1_i64m1_m(...) __riscv_vredor_vs_i64m1_i64m1_tum(__VA_ARGS__)
+#define vredor_vs_i64m2_i64m1_m(...) __riscv_vredor_vs_i64m2_i64m1_tum(__VA_ARGS__)
+#define vredor_vs_i64m4_i64m1_m(...) __riscv_vredor_vs_i64m4_i64m1_tum(__VA_ARGS__)
+#define vredor_vs_i64m8_i64m1_m(...) __riscv_vredor_vs_i64m8_i64m1_tum(__VA_ARGS__)
+#define vredxor_vs_i8mf8_i8m1_m(...) __riscv_vredxor_vs_i8mf8_i8m1_tum(__VA_ARGS__)
+#define vredxor_vs_i8mf4_i8m1_m(...) __riscv_vredxor_vs_i8mf4_i8m1_tum(__VA_ARGS__)
+#define vredxor_vs_i8mf2_i8m1_m(...) __riscv_vredxor_vs_i8mf2_i8m1_tum(__VA_ARGS__)
+#define vredxor_vs_i8m1_i8m1_m(...) __riscv_vredxor_vs_i8m1_i8m1_tum(__VA_ARGS__)
+#define vredxor_vs_i8m2_i8m1_m(...) __riscv_vredxor_vs_i8m2_i8m1_tum(__VA_ARGS__)
+#define vredxor_vs_i8m4_i8m1_m(...) __riscv_vredxor_vs_i8m4_i8m1_tum(__VA_ARGS__)
+#define vredxor_vs_i8m8_i8m1_m(...) __riscv_vredxor_vs_i8m8_i8m1_tum(__VA_ARGS__)
+#define vredxor_vs_i16mf4_i16m1_m(...) __riscv_vredxor_vs_i16mf4_i16m1_tum(__VA_ARGS__)
+#define vredxor_vs_i16mf2_i16m1_m(...) __riscv_vredxor_vs_i16mf2_i16m1_tum(__VA_ARGS__)
+#define vredxor_vs_i16m1_i16m1_m(...) __riscv_vredxor_vs_i16m1_i16m1_tum(__VA_ARGS__)
+#define vredxor_vs_i16m2_i16m1_m(...) __riscv_vredxor_vs_i16m2_i16m1_tum(__VA_ARGS__)
+#define vredxor_vs_i16m4_i16m1_m(...) __riscv_vredxor_vs_i16m4_i16m1_tum(__VA_ARGS__)
+#define vredxor_vs_i16m8_i16m1_m(...) __riscv_vredxor_vs_i16m8_i16m1_tum(__VA_ARGS__)
+#define vredxor_vs_i32mf2_i32m1_m(...) __riscv_vredxor_vs_i32mf2_i32m1_tum(__VA_ARGS__)
+#define vredxor_vs_i32m1_i32m1_m(...) __riscv_vredxor_vs_i32m1_i32m1_tum(__VA_ARGS__)
+#define vredxor_vs_i32m2_i32m1_m(...) __riscv_vredxor_vs_i32m2_i32m1_tum(__VA_ARGS__)
+#define vredxor_vs_i32m4_i32m1_m(...) __riscv_vredxor_vs_i32m4_i32m1_tum(__VA_ARGS__)
+#define vredxor_vs_i32m8_i32m1_m(...) __riscv_vredxor_vs_i32m8_i32m1_tum(__VA_ARGS__)
+#define vredxor_vs_i64m1_i64m1_m(...) __riscv_vredxor_vs_i64m1_i64m1_tum(__VA_ARGS__)
+#define vredxor_vs_i64m2_i64m1_m(...) __riscv_vredxor_vs_i64m2_i64m1_tum(__VA_ARGS__)
+#define vredxor_vs_i64m4_i64m1_m(...) __riscv_vredxor_vs_i64m4_i64m1_tum(__VA_ARGS__)
+#define vredxor_vs_i64m8_i64m1_m(...) __riscv_vredxor_vs_i64m8_i64m1_tum(__VA_ARGS__)
+#define vredsum_vs_u8mf8_u8m1_m(...) __riscv_vredsum_vs_u8mf8_u8m1_tum(__VA_ARGS__)
+#define vredsum_vs_u8mf4_u8m1_m(...) __riscv_vredsum_vs_u8mf4_u8m1_tum(__VA_ARGS__)
+#define vredsum_vs_u8mf2_u8m1_m(...) __riscv_vredsum_vs_u8mf2_u8m1_tum(__VA_ARGS__)
+#define vredsum_vs_u8m1_u8m1_m(...) __riscv_vredsum_vs_u8m1_u8m1_tum(__VA_ARGS__)
+#define vredsum_vs_u8m2_u8m1_m(...) __riscv_vredsum_vs_u8m2_u8m1_tum(__VA_ARGS__)
+#define vredsum_vs_u8m4_u8m1_m(...) __riscv_vredsum_vs_u8m4_u8m1_tum(__VA_ARGS__)
+#define vredsum_vs_u8m8_u8m1_m(...) __riscv_vredsum_vs_u8m8_u8m1_tum(__VA_ARGS__)
+#define vredsum_vs_u16mf4_u16m1_m(...) __riscv_vredsum_vs_u16mf4_u16m1_tum(__VA_ARGS__)
+#define vredsum_vs_u16mf2_u16m1_m(...) __riscv_vredsum_vs_u16mf2_u16m1_tum(__VA_ARGS__)
+#define vredsum_vs_u16m1_u16m1_m(...) __riscv_vredsum_vs_u16m1_u16m1_tum(__VA_ARGS__)
+#define vredsum_vs_u16m2_u16m1_m(...) __riscv_vredsum_vs_u16m2_u16m1_tum(__VA_ARGS__)
+#define vredsum_vs_u16m4_u16m1_m(...) __riscv_vredsum_vs_u16m4_u16m1_tum(__VA_ARGS__)
+#define vredsum_vs_u16m8_u16m1_m(...) __riscv_vredsum_vs_u16m8_u16m1_tum(__VA_ARGS__)
+#define vredsum_vs_u32mf2_u32m1_m(...) __riscv_vredsum_vs_u32mf2_u32m1_tum(__VA_ARGS__)
+#define vredsum_vs_u32m1_u32m1_m(...) __riscv_vredsum_vs_u32m1_u32m1_tum(__VA_ARGS__)
+#define vredsum_vs_u32m2_u32m1_m(...) __riscv_vredsum_vs_u32m2_u32m1_tum(__VA_ARGS__)
+#define vredsum_vs_u32m4_u32m1_m(...) __riscv_vredsum_vs_u32m4_u32m1_tum(__VA_ARGS__)
+#define vredsum_vs_u32m8_u32m1_m(...) __riscv_vredsum_vs_u32m8_u32m1_tum(__VA_ARGS__)
+#define vredsum_vs_u64m1_u64m1_m(...) __riscv_vredsum_vs_u64m1_u64m1_tum(__VA_ARGS__)
+#define vredsum_vs_u64m2_u64m1_m(...) __riscv_vredsum_vs_u64m2_u64m1_tum(__VA_ARGS__)
+#define vredsum_vs_u64m4_u64m1_m(...) __riscv_vredsum_vs_u64m4_u64m1_tum(__VA_ARGS__)
+#define vredsum_vs_u64m8_u64m1_m(...) __riscv_vredsum_vs_u64m8_u64m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u8mf8_u8m1_m(...) __riscv_vredmaxu_vs_u8mf8_u8m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u8mf4_u8m1_m(...) __riscv_vredmaxu_vs_u8mf4_u8m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u8mf2_u8m1_m(...) __riscv_vredmaxu_vs_u8mf2_u8m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u8m1_u8m1_m(...) __riscv_vredmaxu_vs_u8m1_u8m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u8m2_u8m1_m(...) __riscv_vredmaxu_vs_u8m2_u8m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u8m4_u8m1_m(...) __riscv_vredmaxu_vs_u8m4_u8m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u8m8_u8m1_m(...) __riscv_vredmaxu_vs_u8m8_u8m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u16mf4_u16m1_m(...) __riscv_vredmaxu_vs_u16mf4_u16m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u16mf2_u16m1_m(...) __riscv_vredmaxu_vs_u16mf2_u16m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u16m1_u16m1_m(...) __riscv_vredmaxu_vs_u16m1_u16m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u16m2_u16m1_m(...) __riscv_vredmaxu_vs_u16m2_u16m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u16m4_u16m1_m(...) __riscv_vredmaxu_vs_u16m4_u16m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u16m8_u16m1_m(...) __riscv_vredmaxu_vs_u16m8_u16m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u32mf2_u32m1_m(...) __riscv_vredmaxu_vs_u32mf2_u32m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u32m1_u32m1_m(...) __riscv_vredmaxu_vs_u32m1_u32m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u32m2_u32m1_m(...) __riscv_vredmaxu_vs_u32m2_u32m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u32m4_u32m1_m(...) __riscv_vredmaxu_vs_u32m4_u32m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u32m8_u32m1_m(...) __riscv_vredmaxu_vs_u32m8_u32m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u64m1_u64m1_m(...) __riscv_vredmaxu_vs_u64m1_u64m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u64m2_u64m1_m(...) __riscv_vredmaxu_vs_u64m2_u64m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u64m4_u64m1_m(...) __riscv_vredmaxu_vs_u64m4_u64m1_tum(__VA_ARGS__)
+#define vredmaxu_vs_u64m8_u64m1_m(...) __riscv_vredmaxu_vs_u64m8_u64m1_tum(__VA_ARGS__)
+#define vredminu_vs_u8mf8_u8m1_m(...) __riscv_vredminu_vs_u8mf8_u8m1_tum(__VA_ARGS__)
+#define vredminu_vs_u8mf4_u8m1_m(...) __riscv_vredminu_vs_u8mf4_u8m1_tum(__VA_ARGS__)
+#define vredminu_vs_u8mf2_u8m1_m(...) __riscv_vredminu_vs_u8mf2_u8m1_tum(__VA_ARGS__)
+#define vredminu_vs_u8m1_u8m1_m(...) __riscv_vredminu_vs_u8m1_u8m1_tum(__VA_ARGS__)
+#define vredminu_vs_u8m2_u8m1_m(...) __riscv_vredminu_vs_u8m2_u8m1_tum(__VA_ARGS__)
+#define vredminu_vs_u8m4_u8m1_m(...) __riscv_vredminu_vs_u8m4_u8m1_tum(__VA_ARGS__)
+#define vredminu_vs_u8m8_u8m1_m(...) __riscv_vredminu_vs_u8m8_u8m1_tum(__VA_ARGS__)
+#define vredminu_vs_u16mf4_u16m1_m(...) __riscv_vredminu_vs_u16mf4_u16m1_tum(__VA_ARGS__)
+#define vredminu_vs_u16mf2_u16m1_m(...) __riscv_vredminu_vs_u16mf2_u16m1_tum(__VA_ARGS__)
+#define vredminu_vs_u16m1_u16m1_m(...) __riscv_vredminu_vs_u16m1_u16m1_tum(__VA_ARGS__)
+#define vredminu_vs_u16m2_u16m1_m(...) __riscv_vredminu_vs_u16m2_u16m1_tum(__VA_ARGS__)
+#define vredminu_vs_u16m4_u16m1_m(...) __riscv_vredminu_vs_u16m4_u16m1_tum(__VA_ARGS__)
+#define vredminu_vs_u16m8_u16m1_m(...) __riscv_vredminu_vs_u16m8_u16m1_tum(__VA_ARGS__)
+#define vredminu_vs_u32mf2_u32m1_m(...) __riscv_vredminu_vs_u32mf2_u32m1_tum(__VA_ARGS__)
+#define vredminu_vs_u32m1_u32m1_m(...) __riscv_vredminu_vs_u32m1_u32m1_tum(__VA_ARGS__)
+#define vredminu_vs_u32m2_u32m1_m(...) __riscv_vredminu_vs_u32m2_u32m1_tum(__VA_ARGS__)
+#define vredminu_vs_u32m4_u32m1_m(...) __riscv_vredminu_vs_u32m4_u32m1_tum(__VA_ARGS__)
+#define vredminu_vs_u32m8_u32m1_m(...) __riscv_vredminu_vs_u32m8_u32m1_tum(__VA_ARGS__)
+#define vredminu_vs_u64m1_u64m1_m(...) __riscv_vredminu_vs_u64m1_u64m1_tum(__VA_ARGS__)
+#define vredminu_vs_u64m2_u64m1_m(...) __riscv_vredminu_vs_u64m2_u64m1_tum(__VA_ARGS__)
+#define vredminu_vs_u64m4_u64m1_m(...) __riscv_vredminu_vs_u64m4_u64m1_tum(__VA_ARGS__)
+#define vredminu_vs_u64m8_u64m1_m(...) __riscv_vredminu_vs_u64m8_u64m1_tum(__VA_ARGS__)
+#define vredand_vs_u8mf8_u8m1_m(...) __riscv_vredand_vs_u8mf8_u8m1_tum(__VA_ARGS__)
+#define vredand_vs_u8mf4_u8m1_m(...) __riscv_vredand_vs_u8mf4_u8m1_tum(__VA_ARGS__)
+#define vredand_vs_u8mf2_u8m1_m(...) __riscv_vredand_vs_u8mf2_u8m1_tum(__VA_ARGS__)
+#define vredand_vs_u8m1_u8m1_m(...) __riscv_vredand_vs_u8m1_u8m1_tum(__VA_ARGS__)
+#define vredand_vs_u8m2_u8m1_m(...) __riscv_vredand_vs_u8m2_u8m1_tum(__VA_ARGS__)
+#define vredand_vs_u8m4_u8m1_m(...) __riscv_vredand_vs_u8m4_u8m1_tum(__VA_ARGS__)
+#define vredand_vs_u8m8_u8m1_m(...) __riscv_vredand_vs_u8m8_u8m1_tum(__VA_ARGS__)
+#define vredand_vs_u16mf4_u16m1_m(...) __riscv_vredand_vs_u16mf4_u16m1_tum(__VA_ARGS__)
+#define vredand_vs_u16mf2_u16m1_m(...) __riscv_vredand_vs_u16mf2_u16m1_tum(__VA_ARGS__)
+#define vredand_vs_u16m1_u16m1_m(...) __riscv_vredand_vs_u16m1_u16m1_tum(__VA_ARGS__)
+#define vredand_vs_u16m2_u16m1_m(...) __riscv_vredand_vs_u16m2_u16m1_tum(__VA_ARGS__)
+#define vredand_vs_u16m4_u16m1_m(...) __riscv_vredand_vs_u16m4_u16m1_tum(__VA_ARGS__)
+#define vredand_vs_u16m8_u16m1_m(...) __riscv_vredand_vs_u16m8_u16m1_tum(__VA_ARGS__)
+#define vredand_vs_u32mf2_u32m1_m(...) __riscv_vredand_vs_u32mf2_u32m1_tum(__VA_ARGS__)
+#define vredand_vs_u32m1_u32m1_m(...) __riscv_vredand_vs_u32m1_u32m1_tum(__VA_ARGS__)
+#define vredand_vs_u32m2_u32m1_m(...) __riscv_vredand_vs_u32m2_u32m1_tum(__VA_ARGS__)
+#define vredand_vs_u32m4_u32m1_m(...) __riscv_vredand_vs_u32m4_u32m1_tum(__VA_ARGS__)
+#define vredand_vs_u32m8_u32m1_m(...) __riscv_vredand_vs_u32m8_u32m1_tum(__VA_ARGS__)
+#define vredand_vs_u64m1_u64m1_m(...) __riscv_vredand_vs_u64m1_u64m1_tum(__VA_ARGS__)
+#define vredand_vs_u64m2_u64m1_m(...) __riscv_vredand_vs_u64m2_u64m1_tum(__VA_ARGS__)
+#define vredand_vs_u64m4_u64m1_m(...) __riscv_vredand_vs_u64m4_u64m1_tum(__VA_ARGS__)
+#define vredand_vs_u64m8_u64m1_m(...) __riscv_vredand_vs_u64m8_u64m1_tum(__VA_ARGS__)
+#define vredor_vs_u8mf8_u8m1_m(...) __riscv_vredor_vs_u8mf8_u8m1_tum(__VA_ARGS__)
+#define vredor_vs_u8mf4_u8m1_m(...) __riscv_vredor_vs_u8mf4_u8m1_tum(__VA_ARGS__)
+#define vredor_vs_u8mf2_u8m1_m(...) __riscv_vredor_vs_u8mf2_u8m1_tum(__VA_ARGS__)
+#define vredor_vs_u8m1_u8m1_m(...) __riscv_vredor_vs_u8m1_u8m1_tum(__VA_ARGS__)
+#define vredor_vs_u8m2_u8m1_m(...) __riscv_vredor_vs_u8m2_u8m1_tum(__VA_ARGS__)
+#define vredor_vs_u8m4_u8m1_m(...) __riscv_vredor_vs_u8m4_u8m1_tum(__VA_ARGS__)
+#define vredor_vs_u8m8_u8m1_m(...) __riscv_vredor_vs_u8m8_u8m1_tum(__VA_ARGS__)
+#define vredor_vs_u16mf4_u16m1_m(...) __riscv_vredor_vs_u16mf4_u16m1_tum(__VA_ARGS__)
+#define vredor_vs_u16mf2_u16m1_m(...) __riscv_vredor_vs_u16mf2_u16m1_tum(__VA_ARGS__)
+#define vredor_vs_u16m1_u16m1_m(...) __riscv_vredor_vs_u16m1_u16m1_tum(__VA_ARGS__)
+#define vredor_vs_u16m2_u16m1_m(...) __riscv_vredor_vs_u16m2_u16m1_tum(__VA_ARGS__)
+#define vredor_vs_u16m4_u16m1_m(...) __riscv_vredor_vs_u16m4_u16m1_tum(__VA_ARGS__)
+#define vredor_vs_u16m8_u16m1_m(...) __riscv_vredor_vs_u16m8_u16m1_tum(__VA_ARGS__)
+#define vredor_vs_u32mf2_u32m1_m(...) __riscv_vredor_vs_u32mf2_u32m1_tum(__VA_ARGS__)
+#define vredor_vs_u32m1_u32m1_m(...) __riscv_vredor_vs_u32m1_u32m1_tum(__VA_ARGS__)
+#define vredor_vs_u32m2_u32m1_m(...) __riscv_vredor_vs_u32m2_u32m1_tum(__VA_ARGS__)
+#define vredor_vs_u32m4_u32m1_m(...) __riscv_vredor_vs_u32m4_u32m1_tum(__VA_ARGS__)
+#define vredor_vs_u32m8_u32m1_m(...) __riscv_vredor_vs_u32m8_u32m1_tum(__VA_ARGS__)
+#define vredor_vs_u64m1_u64m1_m(...) __riscv_vredor_vs_u64m1_u64m1_tum(__VA_ARGS__)
+#define vredor_vs_u64m2_u64m1_m(...) __riscv_vredor_vs_u64m2_u64m1_tum(__VA_ARGS__)
+#define vredor_vs_u64m4_u64m1_m(...) __riscv_vredor_vs_u64m4_u64m1_tum(__VA_ARGS__)
+#define vredor_vs_u64m8_u64m1_m(...) __riscv_vredor_vs_u64m8_u64m1_tum(__VA_ARGS__)
+#define vredxor_vs_u8mf8_u8m1_m(...) __riscv_vredxor_vs_u8mf8_u8m1_tum(__VA_ARGS__)
+#define vredxor_vs_u8mf4_u8m1_m(...) __riscv_vredxor_vs_u8mf4_u8m1_tum(__VA_ARGS__)
+#define vredxor_vs_u8mf2_u8m1_m(...) __riscv_vredxor_vs_u8mf2_u8m1_tum(__VA_ARGS__)
+#define vredxor_vs_u8m1_u8m1_m(...) __riscv_vredxor_vs_u8m1_u8m1_tum(__VA_ARGS__)
+#define vredxor_vs_u8m2_u8m1_m(...) __riscv_vredxor_vs_u8m2_u8m1_tum(__VA_ARGS__)
+#define vredxor_vs_u8m4_u8m1_m(...) __riscv_vredxor_vs_u8m4_u8m1_tum(__VA_ARGS__)
+#define vredxor_vs_u8m8_u8m1_m(...) __riscv_vredxor_vs_u8m8_u8m1_tum(__VA_ARGS__)
+#define vredxor_vs_u16mf4_u16m1_m(...) __riscv_vredxor_vs_u16mf4_u16m1_tum(__VA_ARGS__)
+#define vredxor_vs_u16mf2_u16m1_m(...) __riscv_vredxor_vs_u16mf2_u16m1_tum(__VA_ARGS__)
+#define vredxor_vs_u16m1_u16m1_m(...) __riscv_vredxor_vs_u16m1_u16m1_tum(__VA_ARGS__)
+#define vredxor_vs_u16m2_u16m1_m(...) __riscv_vredxor_vs_u16m2_u16m1_tum(__VA_ARGS__)
+#define vredxor_vs_u16m4_u16m1_m(...) __riscv_vredxor_vs_u16m4_u16m1_tum(__VA_ARGS__)
+#define vredxor_vs_u16m8_u16m1_m(...) __riscv_vredxor_vs_u16m8_u16m1_tum(__VA_ARGS__)
+#define vredxor_vs_u32mf2_u32m1_m(...) __riscv_vredxor_vs_u32mf2_u32m1_tum(__VA_ARGS__)
+#define vredxor_vs_u32m1_u32m1_m(...) __riscv_vredxor_vs_u32m1_u32m1_tum(__VA_ARGS__)
+#define vredxor_vs_u32m2_u32m1_m(...) __riscv_vredxor_vs_u32m2_u32m1_tum(__VA_ARGS__)
+#define vredxor_vs_u32m4_u32m1_m(...) __riscv_vredxor_vs_u32m4_u32m1_tum(__VA_ARGS__)
+#define vredxor_vs_u32m8_u32m1_m(...) __riscv_vredxor_vs_u32m8_u32m1_tum(__VA_ARGS__)
+#define vredxor_vs_u64m1_u64m1_m(...) __riscv_vredxor_vs_u64m1_u64m1_tum(__VA_ARGS__)
+#define vredxor_vs_u64m2_u64m1_m(...) __riscv_vredxor_vs_u64m2_u64m1_tum(__VA_ARGS__)
+#define vredxor_vs_u64m4_u64m1_m(...) __riscv_vredxor_vs_u64m4_u64m1_tum(__VA_ARGS__)
+#define vredxor_vs_u64m8_u64m1_m(...) __riscv_vredxor_vs_u64m8_u64m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i8mf8_i16m1(...) __riscv_vwredsum_vs_i8mf8_i16m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i8mf4_i16m1(...) __riscv_vwredsum_vs_i8mf4_i16m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i8mf2_i16m1(...) __riscv_vwredsum_vs_i8mf2_i16m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i8m1_i16m1(...) __riscv_vwredsum_vs_i8m1_i16m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i8m2_i16m1(...) __riscv_vwredsum_vs_i8m2_i16m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i8m4_i16m1(...) __riscv_vwredsum_vs_i8m4_i16m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i8m8_i16m1(...) __riscv_vwredsum_vs_i8m8_i16m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i16mf4_i32m1(...) __riscv_vwredsum_vs_i16mf4_i32m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i16mf2_i32m1(...) __riscv_vwredsum_vs_i16mf2_i32m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i16m1_i32m1(...) __riscv_vwredsum_vs_i16m1_i32m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i16m2_i32m1(...) __riscv_vwredsum_vs_i16m2_i32m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i16m4_i32m1(...) __riscv_vwredsum_vs_i16m4_i32m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i16m8_i32m1(...) __riscv_vwredsum_vs_i16m8_i32m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i32mf2_i64m1(...) __riscv_vwredsum_vs_i32mf2_i64m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i32m1_i64m1(...) __riscv_vwredsum_vs_i32m1_i64m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i32m2_i64m1(...) __riscv_vwredsum_vs_i32m2_i64m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i32m4_i64m1(...) __riscv_vwredsum_vs_i32m4_i64m1_tu(__VA_ARGS__)
+#define vwredsum_vs_i32m8_i64m1(...) __riscv_vwredsum_vs_i32m8_i64m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u8mf8_u16m1(...) __riscv_vwredsumu_vs_u8mf8_u16m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u8mf4_u16m1(...) __riscv_vwredsumu_vs_u8mf4_u16m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u8mf2_u16m1(...) __riscv_vwredsumu_vs_u8mf2_u16m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u8m1_u16m1(...) __riscv_vwredsumu_vs_u8m1_u16m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u8m2_u16m1(...) __riscv_vwredsumu_vs_u8m2_u16m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u8m4_u16m1(...) __riscv_vwredsumu_vs_u8m4_u16m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u8m8_u16m1(...) __riscv_vwredsumu_vs_u8m8_u16m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u16mf4_u32m1(...) __riscv_vwredsumu_vs_u16mf4_u32m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u16mf2_u32m1(...) __riscv_vwredsumu_vs_u16mf2_u32m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u16m1_u32m1(...) __riscv_vwredsumu_vs_u16m1_u32m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u16m2_u32m1(...) __riscv_vwredsumu_vs_u16m2_u32m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u16m4_u32m1(...) __riscv_vwredsumu_vs_u16m4_u32m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u16m8_u32m1(...) __riscv_vwredsumu_vs_u16m8_u32m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u32mf2_u64m1(...) __riscv_vwredsumu_vs_u32mf2_u64m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u32m1_u64m1(...) __riscv_vwredsumu_vs_u32m1_u64m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u32m2_u64m1(...) __riscv_vwredsumu_vs_u32m2_u64m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u32m4_u64m1(...) __riscv_vwredsumu_vs_u32m4_u64m1_tu(__VA_ARGS__)
+#define vwredsumu_vs_u32m8_u64m1(...) __riscv_vwredsumu_vs_u32m8_u64m1_tu(__VA_ARGS__)
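+// masked functions: legacy vwredsum[u]*_m names forward their arguments unchanged to the __riscv_vwredsum[u]*_tum intrinsics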
+#define vwredsum_vs_i8mf8_i16m1_m(...) __riscv_vwredsum_vs_i8mf8_i16m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i8mf4_i16m1_m(...) __riscv_vwredsum_vs_i8mf4_i16m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i8mf2_i16m1_m(...) __riscv_vwredsum_vs_i8mf2_i16m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i8m1_i16m1_m(...) __riscv_vwredsum_vs_i8m1_i16m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i8m2_i16m1_m(...) __riscv_vwredsum_vs_i8m2_i16m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i8m4_i16m1_m(...) __riscv_vwredsum_vs_i8m4_i16m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i8m8_i16m1_m(...) __riscv_vwredsum_vs_i8m8_i16m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i16mf4_i32m1_m(...) __riscv_vwredsum_vs_i16mf4_i32m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i16mf2_i32m1_m(...) __riscv_vwredsum_vs_i16mf2_i32m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i16m1_i32m1_m(...) __riscv_vwredsum_vs_i16m1_i32m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i16m2_i32m1_m(...) __riscv_vwredsum_vs_i16m2_i32m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i16m4_i32m1_m(...) __riscv_vwredsum_vs_i16m4_i32m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i16m8_i32m1_m(...) __riscv_vwredsum_vs_i16m8_i32m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i32mf2_i64m1_m(...) __riscv_vwredsum_vs_i32mf2_i64m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i32m1_i64m1_m(...) __riscv_vwredsum_vs_i32m1_i64m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i32m2_i64m1_m(...) __riscv_vwredsum_vs_i32m2_i64m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i32m4_i64m1_m(...) __riscv_vwredsum_vs_i32m4_i64m1_tum(__VA_ARGS__)
+#define vwredsum_vs_i32m8_i64m1_m(...) __riscv_vwredsum_vs_i32m8_i64m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u8mf8_u16m1_m(...) __riscv_vwredsumu_vs_u8mf8_u16m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u8mf4_u16m1_m(...) __riscv_vwredsumu_vs_u8mf4_u16m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u8mf2_u16m1_m(...) __riscv_vwredsumu_vs_u8mf2_u16m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u8m1_u16m1_m(...) __riscv_vwredsumu_vs_u8m1_u16m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u8m2_u16m1_m(...) __riscv_vwredsumu_vs_u8m2_u16m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u8m4_u16m1_m(...) __riscv_vwredsumu_vs_u8m4_u16m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u8m8_u16m1_m(...) __riscv_vwredsumu_vs_u8m8_u16m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u16mf4_u32m1_m(...) __riscv_vwredsumu_vs_u16mf4_u32m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u16mf2_u32m1_m(...) __riscv_vwredsumu_vs_u16mf2_u32m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u16m1_u32m1_m(...) __riscv_vwredsumu_vs_u16m1_u32m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u16m2_u32m1_m(...) __riscv_vwredsumu_vs_u16m2_u32m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u16m4_u32m1_m(...) __riscv_vwredsumu_vs_u16m4_u32m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u16m8_u32m1_m(...) __riscv_vwredsumu_vs_u16m8_u32m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u32mf2_u64m1_m(...) __riscv_vwredsumu_vs_u32mf2_u64m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u32m1_u64m1_m(...) __riscv_vwredsumu_vs_u32m1_u64m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u32m2_u64m1_m(...) __riscv_vwredsumu_vs_u32m2_u64m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u32m4_u64m1_m(...) __riscv_vwredsumu_vs_u32m4_u64m1_tum(__VA_ARGS__)
+#define vwredsumu_vs_u32m8_u64m1_m(...) __riscv_vwredsumu_vs_u32m8_u64m1_tum(__VA_ARGS__)
+#define vfredosum_vs_f16mf4_f16m1(...) __riscv_vfredosum_vs_f16mf4_f16m1_tu(__VA_ARGS__)
+#define vfredosum_vs_f16mf2_f16m1(...) __riscv_vfredosum_vs_f16mf2_f16m1_tu(__VA_ARGS__)
+#define vfredosum_vs_f16m1_f16m1(...) __riscv_vfredosum_vs_f16m1_f16m1_tu(__VA_ARGS__)
+#define vfredosum_vs_f16m2_f16m1(...) __riscv_vfredosum_vs_f16m2_f16m1_tu(__VA_ARGS__)
+#define vfredosum_vs_f16m4_f16m1(...) __riscv_vfredosum_vs_f16m4_f16m1_tu(__VA_ARGS__)
+#define vfredosum_vs_f16m8_f16m1(...) __riscv_vfredosum_vs_f16m8_f16m1_tu(__VA_ARGS__)
+#define vfredosum_vs_f32mf2_f32m1(...) __riscv_vfredosum_vs_f32mf2_f32m1_tu(__VA_ARGS__)
+#define vfredosum_vs_f32m1_f32m1(...) __riscv_vfredosum_vs_f32m1_f32m1_tu(__VA_ARGS__)
+#define vfredosum_vs_f32m2_f32m1(...) __riscv_vfredosum_vs_f32m2_f32m1_tu(__VA_ARGS__)
+#define vfredosum_vs_f32m4_f32m1(...) __riscv_vfredosum_vs_f32m4_f32m1_tu(__VA_ARGS__)
+#define vfredosum_vs_f32m8_f32m1(...) __riscv_vfredosum_vs_f32m8_f32m1_tu(__VA_ARGS__)
+#define vfredosum_vs_f64m1_f64m1(...) __riscv_vfredosum_vs_f64m1_f64m1_tu(__VA_ARGS__)
+#define vfredosum_vs_f64m2_f64m1(...) __riscv_vfredosum_vs_f64m2_f64m1_tu(__VA_ARGS__)
+#define vfredosum_vs_f64m4_f64m1(...) __riscv_vfredosum_vs_f64m4_f64m1_tu(__VA_ARGS__)
+#define vfredosum_vs_f64m8_f64m1(...) __riscv_vfredosum_vs_f64m8_f64m1_tu(__VA_ARGS__)
+#define vfredusum_vs_f16mf4_f16m1(...) __riscv_vfredusum_vs_f16mf4_f16m1_tu(__VA_ARGS__)
+#define vfredusum_vs_f16mf2_f16m1(...) __riscv_vfredusum_vs_f16mf2_f16m1_tu(__VA_ARGS__)
+#define vfredusum_vs_f16m1_f16m1(...) __riscv_vfredusum_vs_f16m1_f16m1_tu(__VA_ARGS__)
+#define vfredusum_vs_f16m2_f16m1(...) __riscv_vfredusum_vs_f16m2_f16m1_tu(__VA_ARGS__)
+#define vfredusum_vs_f16m4_f16m1(...) __riscv_vfredusum_vs_f16m4_f16m1_tu(__VA_ARGS__)
+#define vfredusum_vs_f16m8_f16m1(...) __riscv_vfredusum_vs_f16m8_f16m1_tu(__VA_ARGS__)
+#define vfredusum_vs_f32mf2_f32m1(...) __riscv_vfredusum_vs_f32mf2_f32m1_tu(__VA_ARGS__)
+#define vfredusum_vs_f32m1_f32m1(...) __riscv_vfredusum_vs_f32m1_f32m1_tu(__VA_ARGS__)
+#define vfredusum_vs_f32m2_f32m1(...) __riscv_vfredusum_vs_f32m2_f32m1_tu(__VA_ARGS__)
+#define vfredusum_vs_f32m4_f32m1(...) __riscv_vfredusum_vs_f32m4_f32m1_tu(__VA_ARGS__)
+#define vfredusum_vs_f32m8_f32m1(...) __riscv_vfredusum_vs_f32m8_f32m1_tu(__VA_ARGS__)
+#define vfredusum_vs_f64m1_f64m1(...) __riscv_vfredusum_vs_f64m1_f64m1_tu(__VA_ARGS__)
+#define vfredusum_vs_f64m2_f64m1(...) __riscv_vfredusum_vs_f64m2_f64m1_tu(__VA_ARGS__)
+#define vfredusum_vs_f64m4_f64m1(...) __riscv_vfredusum_vs_f64m4_f64m1_tu(__VA_ARGS__)
+#define vfredusum_vs_f64m8_f64m1(...) __riscv_vfredusum_vs_f64m8_f64m1_tu(__VA_ARGS__)
+#define vfredmax_vs_f16mf4_f16m1(...) __riscv_vfredmax_vs_f16mf4_f16m1_tu(__VA_ARGS__) // vfredmax_vs_* aliases: un-prefixed names forward all arguments to the __riscv_-prefixed *_tu intrinsics
+#define vfredmax_vs_f16mf2_f16m1(...) __riscv_vfredmax_vs_f16mf2_f16m1_tu(__VA_ARGS__)
+#define vfredmax_vs_f16m1_f16m1(...) __riscv_vfredmax_vs_f16m1_f16m1_tu(__VA_ARGS__)
+#define vfredmax_vs_f16m2_f16m1(...) __riscv_vfredmax_vs_f16m2_f16m1_tu(__VA_ARGS__)
+#define vfredmax_vs_f16m4_f16m1(...) __riscv_vfredmax_vs_f16m4_f16m1_tu(__VA_ARGS__)
+#define vfredmax_vs_f16m8_f16m1(...) __riscv_vfredmax_vs_f16m8_f16m1_tu(__VA_ARGS__)
+#define vfredmax_vs_f32mf2_f32m1(...) __riscv_vfredmax_vs_f32mf2_f32m1_tu(__VA_ARGS__)
+#define vfredmax_vs_f32m1_f32m1(...) __riscv_vfredmax_vs_f32m1_f32m1_tu(__VA_ARGS__)
+#define vfredmax_vs_f32m2_f32m1(...) __riscv_vfredmax_vs_f32m2_f32m1_tu(__VA_ARGS__)
+#define vfredmax_vs_f32m4_f32m1(...) __riscv_vfredmax_vs_f32m4_f32m1_tu(__VA_ARGS__)
+#define vfredmax_vs_f32m8_f32m1(...) __riscv_vfredmax_vs_f32m8_f32m1_tu(__VA_ARGS__)
+#define vfredmax_vs_f64m1_f64m1(...) __riscv_vfredmax_vs_f64m1_f64m1_tu(__VA_ARGS__)
+#define vfredmax_vs_f64m2_f64m1(...) __riscv_vfredmax_vs_f64m2_f64m1_tu(__VA_ARGS__)
+#define vfredmax_vs_f64m4_f64m1(...) __riscv_vfredmax_vs_f64m4_f64m1_tu(__VA_ARGS__)
+#define vfredmax_vs_f64m8_f64m1(...) __riscv_vfredmax_vs_f64m8_f64m1_tu(__VA_ARGS__)
+#define vfredmin_vs_f16mf4_f16m1(...) __riscv_vfredmin_vs_f16mf4_f16m1_tu(__VA_ARGS__) // vfredmin_vs_* aliases: un-prefixed names forward all arguments to the __riscv_-prefixed *_tu intrinsics
+#define vfredmin_vs_f16mf2_f16m1(...) __riscv_vfredmin_vs_f16mf2_f16m1_tu(__VA_ARGS__)
+#define vfredmin_vs_f16m1_f16m1(...) __riscv_vfredmin_vs_f16m1_f16m1_tu(__VA_ARGS__)
+#define vfredmin_vs_f16m2_f16m1(...) __riscv_vfredmin_vs_f16m2_f16m1_tu(__VA_ARGS__)
+#define vfredmin_vs_f16m4_f16m1(...) __riscv_vfredmin_vs_f16m4_f16m1_tu(__VA_ARGS__)
+#define vfredmin_vs_f16m8_f16m1(...) __riscv_vfredmin_vs_f16m8_f16m1_tu(__VA_ARGS__)
+#define vfredmin_vs_f32mf2_f32m1(...) __riscv_vfredmin_vs_f32mf2_f32m1_tu(__VA_ARGS__)
+#define vfredmin_vs_f32m1_f32m1(...) __riscv_vfredmin_vs_f32m1_f32m1_tu(__VA_ARGS__)
+#define vfredmin_vs_f32m2_f32m1(...) __riscv_vfredmin_vs_f32m2_f32m1_tu(__VA_ARGS__)
+#define vfredmin_vs_f32m4_f32m1(...) __riscv_vfredmin_vs_f32m4_f32m1_tu(__VA_ARGS__)
+#define vfredmin_vs_f32m8_f32m1(...) __riscv_vfredmin_vs_f32m8_f32m1_tu(__VA_ARGS__)
+#define vfredmin_vs_f64m1_f64m1(...) __riscv_vfredmin_vs_f64m1_f64m1_tu(__VA_ARGS__)
+#define vfredmin_vs_f64m2_f64m1(...) __riscv_vfredmin_vs_f64m2_f64m1_tu(__VA_ARGS__)
+#define vfredmin_vs_f64m4_f64m1(...) __riscv_vfredmin_vs_f64m4_f64m1_tu(__VA_ARGS__)
+#define vfredmin_vs_f64m8_f64m1(...) __riscv_vfredmin_vs_f64m8_f64m1_tu(__VA_ARGS__)
+// masked functions: the *_m aliases of vfredosum/vfredusum/vfredmax/vfredmin forward all arguments to the __riscv_-prefixed *_tum intrinsics
+#define vfredosum_vs_f16mf4_f16m1_m(...) __riscv_vfredosum_vs_f16mf4_f16m1_tum(__VA_ARGS__)
+#define vfredosum_vs_f16mf2_f16m1_m(...) __riscv_vfredosum_vs_f16mf2_f16m1_tum(__VA_ARGS__)
+#define vfredosum_vs_f16m1_f16m1_m(...) __riscv_vfredosum_vs_f16m1_f16m1_tum(__VA_ARGS__)
+#define vfredosum_vs_f16m2_f16m1_m(...) __riscv_vfredosum_vs_f16m2_f16m1_tum(__VA_ARGS__)
+#define vfredosum_vs_f16m4_f16m1_m(...) __riscv_vfredosum_vs_f16m4_f16m1_tum(__VA_ARGS__)
+#define vfredosum_vs_f16m8_f16m1_m(...) __riscv_vfredosum_vs_f16m8_f16m1_tum(__VA_ARGS__)
+#define vfredosum_vs_f32mf2_f32m1_m(...) __riscv_vfredosum_vs_f32mf2_f32m1_tum(__VA_ARGS__)
+#define vfredosum_vs_f32m1_f32m1_m(...) __riscv_vfredosum_vs_f32m1_f32m1_tum(__VA_ARGS__)
+#define vfredosum_vs_f32m2_f32m1_m(...) __riscv_vfredosum_vs_f32m2_f32m1_tum(__VA_ARGS__)
+#define vfredosum_vs_f32m4_f32m1_m(...) __riscv_vfredosum_vs_f32m4_f32m1_tum(__VA_ARGS__)
+#define vfredosum_vs_f32m8_f32m1_m(...) __riscv_vfredosum_vs_f32m8_f32m1_tum(__VA_ARGS__)
+#define vfredosum_vs_f64m1_f64m1_m(...) __riscv_vfredosum_vs_f64m1_f64m1_tum(__VA_ARGS__)
+#define vfredosum_vs_f64m2_f64m1_m(...) __riscv_vfredosum_vs_f64m2_f64m1_tum(__VA_ARGS__)
+#define vfredosum_vs_f64m4_f64m1_m(...) __riscv_vfredosum_vs_f64m4_f64m1_tum(__VA_ARGS__)
+#define vfredosum_vs_f64m8_f64m1_m(...) __riscv_vfredosum_vs_f64m8_f64m1_tum(__VA_ARGS__)
+#define vfredusum_vs_f16mf4_f16m1_m(...) __riscv_vfredusum_vs_f16mf4_f16m1_tum(__VA_ARGS__)
+#define vfredusum_vs_f16mf2_f16m1_m(...) __riscv_vfredusum_vs_f16mf2_f16m1_tum(__VA_ARGS__)
+#define vfredusum_vs_f16m1_f16m1_m(...) __riscv_vfredusum_vs_f16m1_f16m1_tum(__VA_ARGS__)
+#define vfredusum_vs_f16m2_f16m1_m(...) __riscv_vfredusum_vs_f16m2_f16m1_tum(__VA_ARGS__)
+#define vfredusum_vs_f16m4_f16m1_m(...) __riscv_vfredusum_vs_f16m4_f16m1_tum(__VA_ARGS__)
+#define vfredusum_vs_f16m8_f16m1_m(...) __riscv_vfredusum_vs_f16m8_f16m1_tum(__VA_ARGS__)
+#define vfredusum_vs_f32mf2_f32m1_m(...) __riscv_vfredusum_vs_f32mf2_f32m1_tum(__VA_ARGS__)
+#define vfredusum_vs_f32m1_f32m1_m(...) __riscv_vfredusum_vs_f32m1_f32m1_tum(__VA_ARGS__)
+#define vfredusum_vs_f32m2_f32m1_m(...) __riscv_vfredusum_vs_f32m2_f32m1_tum(__VA_ARGS__)
+#define vfredusum_vs_f32m4_f32m1_m(...) __riscv_vfredusum_vs_f32m4_f32m1_tum(__VA_ARGS__)
+#define vfredusum_vs_f32m8_f32m1_m(...) __riscv_vfredusum_vs_f32m8_f32m1_tum(__VA_ARGS__)
+#define vfredusum_vs_f64m1_f64m1_m(...) __riscv_vfredusum_vs_f64m1_f64m1_tum(__VA_ARGS__)
+#define vfredusum_vs_f64m2_f64m1_m(...) __riscv_vfredusum_vs_f64m2_f64m1_tum(__VA_ARGS__)
+#define vfredusum_vs_f64m4_f64m1_m(...) __riscv_vfredusum_vs_f64m4_f64m1_tum(__VA_ARGS__)
+#define vfredusum_vs_f64m8_f64m1_m(...) __riscv_vfredusum_vs_f64m8_f64m1_tum(__VA_ARGS__)
+#define vfredmax_vs_f16mf4_f16m1_m(...) __riscv_vfredmax_vs_f16mf4_f16m1_tum(__VA_ARGS__)
+#define vfredmax_vs_f16mf2_f16m1_m(...) __riscv_vfredmax_vs_f16mf2_f16m1_tum(__VA_ARGS__)
+#define vfredmax_vs_f16m1_f16m1_m(...) __riscv_vfredmax_vs_f16m1_f16m1_tum(__VA_ARGS__)
+#define vfredmax_vs_f16m2_f16m1_m(...) __riscv_vfredmax_vs_f16m2_f16m1_tum(__VA_ARGS__)
+#define vfredmax_vs_f16m4_f16m1_m(...) __riscv_vfredmax_vs_f16m4_f16m1_tum(__VA_ARGS__)
+#define vfredmax_vs_f16m8_f16m1_m(...) __riscv_vfredmax_vs_f16m8_f16m1_tum(__VA_ARGS__)
+#define vfredmax_vs_f32mf2_f32m1_m(...) __riscv_vfredmax_vs_f32mf2_f32m1_tum(__VA_ARGS__)
+#define vfredmax_vs_f32m1_f32m1_m(...) __riscv_vfredmax_vs_f32m1_f32m1_tum(__VA_ARGS__)
+#define vfredmax_vs_f32m2_f32m1_m(...) __riscv_vfredmax_vs_f32m2_f32m1_tum(__VA_ARGS__)
+#define vfredmax_vs_f32m4_f32m1_m(...) __riscv_vfredmax_vs_f32m4_f32m1_tum(__VA_ARGS__)
+#define vfredmax_vs_f32m8_f32m1_m(...) __riscv_vfredmax_vs_f32m8_f32m1_tum(__VA_ARGS__)
+#define vfredmax_vs_f64m1_f64m1_m(...) __riscv_vfredmax_vs_f64m1_f64m1_tum(__VA_ARGS__)
+#define vfredmax_vs_f64m2_f64m1_m(...) __riscv_vfredmax_vs_f64m2_f64m1_tum(__VA_ARGS__)
+#define vfredmax_vs_f64m4_f64m1_m(...) __riscv_vfredmax_vs_f64m4_f64m1_tum(__VA_ARGS__)
+#define vfredmax_vs_f64m8_f64m1_m(...) __riscv_vfredmax_vs_f64m8_f64m1_tum(__VA_ARGS__)
+#define vfredmin_vs_f16mf4_f16m1_m(...) __riscv_vfredmin_vs_f16mf4_f16m1_tum(__VA_ARGS__)
+#define vfredmin_vs_f16mf2_f16m1_m(...) __riscv_vfredmin_vs_f16mf2_f16m1_tum(__VA_ARGS__)
+#define vfredmin_vs_f16m1_f16m1_m(...) __riscv_vfredmin_vs_f16m1_f16m1_tum(__VA_ARGS__)
+#define vfredmin_vs_f16m2_f16m1_m(...) __riscv_vfredmin_vs_f16m2_f16m1_tum(__VA_ARGS__)
+#define vfredmin_vs_f16m4_f16m1_m(...) __riscv_vfredmin_vs_f16m4_f16m1_tum(__VA_ARGS__)
+#define vfredmin_vs_f16m8_f16m1_m(...) __riscv_vfredmin_vs_f16m8_f16m1_tum(__VA_ARGS__)
+#define vfredmin_vs_f32mf2_f32m1_m(...) __riscv_vfredmin_vs_f32mf2_f32m1_tum(__VA_ARGS__)
+#define vfredmin_vs_f32m1_f32m1_m(...) __riscv_vfredmin_vs_f32m1_f32m1_tum(__VA_ARGS__)
+#define vfredmin_vs_f32m2_f32m1_m(...) __riscv_vfredmin_vs_f32m2_f32m1_tum(__VA_ARGS__)
+#define vfredmin_vs_f32m4_f32m1_m(...) __riscv_vfredmin_vs_f32m4_f32m1_tum(__VA_ARGS__)
+#define vfredmin_vs_f32m8_f32m1_m(...) __riscv_vfredmin_vs_f32m8_f32m1_tum(__VA_ARGS__)
+#define vfredmin_vs_f64m1_f64m1_m(...) __riscv_vfredmin_vs_f64m1_f64m1_tum(__VA_ARGS__)
+#define vfredmin_vs_f64m2_f64m1_m(...) __riscv_vfredmin_vs_f64m2_f64m1_tum(__VA_ARGS__)
+#define vfredmin_vs_f64m4_f64m1_m(...) __riscv_vfredmin_vs_f64m4_f64m1_tum(__VA_ARGS__)
+#define vfredmin_vs_f64m8_f64m1_m(...) __riscv_vfredmin_vs_f64m8_f64m1_tum(__VA_ARGS__)
+#define vfwredosum_vs_f16mf4_f32m1(...) __riscv_vfwredosum_vs_f16mf4_f32m1_tu(__VA_ARGS__) // vfwredosum_vs_* / vfwredusum_vs_* aliases: un-prefixed names forward all arguments to the __riscv_-prefixed *_tu intrinsics
+#define vfwredosum_vs_f16mf2_f32m1(...) __riscv_vfwredosum_vs_f16mf2_f32m1_tu(__VA_ARGS__)
+#define vfwredosum_vs_f16m1_f32m1(...) __riscv_vfwredosum_vs_f16m1_f32m1_tu(__VA_ARGS__)
+#define vfwredosum_vs_f16m2_f32m1(...) __riscv_vfwredosum_vs_f16m2_f32m1_tu(__VA_ARGS__)
+#define vfwredosum_vs_f16m4_f32m1(...) __riscv_vfwredosum_vs_f16m4_f32m1_tu(__VA_ARGS__)
+#define vfwredosum_vs_f16m8_f32m1(...) __riscv_vfwredosum_vs_f16m8_f32m1_tu(__VA_ARGS__)
+#define vfwredosum_vs_f32mf2_f64m1(...) __riscv_vfwredosum_vs_f32mf2_f64m1_tu(__VA_ARGS__)
+#define vfwredosum_vs_f32m1_f64m1(...) __riscv_vfwredosum_vs_f32m1_f64m1_tu(__VA_ARGS__)
+#define vfwredosum_vs_f32m2_f64m1(...) __riscv_vfwredosum_vs_f32m2_f64m1_tu(__VA_ARGS__)
+#define vfwredosum_vs_f32m4_f64m1(...) __riscv_vfwredosum_vs_f32m4_f64m1_tu(__VA_ARGS__)
+#define vfwredosum_vs_f32m8_f64m1(...) __riscv_vfwredosum_vs_f32m8_f64m1_tu(__VA_ARGS__)
+#define vfwredusum_vs_f16mf4_f32m1(...) __riscv_vfwredusum_vs_f16mf4_f32m1_tu(__VA_ARGS__)
+#define vfwredusum_vs_f16mf2_f32m1(...) __riscv_vfwredusum_vs_f16mf2_f32m1_tu(__VA_ARGS__)
+#define vfwredusum_vs_f16m1_f32m1(...) __riscv_vfwredusum_vs_f16m1_f32m1_tu(__VA_ARGS__)
+#define vfwredusum_vs_f16m2_f32m1(...) __riscv_vfwredusum_vs_f16m2_f32m1_tu(__VA_ARGS__)
+#define vfwredusum_vs_f16m4_f32m1(...) __riscv_vfwredusum_vs_f16m4_f32m1_tu(__VA_ARGS__)
+#define vfwredusum_vs_f16m8_f32m1(...) __riscv_vfwredusum_vs_f16m8_f32m1_tu(__VA_ARGS__)
+#define vfwredusum_vs_f32mf2_f64m1(...) __riscv_vfwredusum_vs_f32mf2_f64m1_tu(__VA_ARGS__)
+#define vfwredusum_vs_f32m1_f64m1(...) __riscv_vfwredusum_vs_f32m1_f64m1_tu(__VA_ARGS__)
+#define vfwredusum_vs_f32m2_f64m1(...) __riscv_vfwredusum_vs_f32m2_f64m1_tu(__VA_ARGS__)
+#define vfwredusum_vs_f32m4_f64m1(...) __riscv_vfwredusum_vs_f32m4_f64m1_tu(__VA_ARGS__)
+#define vfwredusum_vs_f32m8_f64m1(...) __riscv_vfwredusum_vs_f32m8_f64m1_tu(__VA_ARGS__)
+// masked functions: the *_m aliases of vfwredosum/vfwredusum forward all arguments to the __riscv_-prefixed *_tum intrinsics
+#define vfwredosum_vs_f16mf4_f32m1_m(...) __riscv_vfwredosum_vs_f16mf4_f32m1_tum(__VA_ARGS__)
+#define vfwredosum_vs_f16mf2_f32m1_m(...) __riscv_vfwredosum_vs_f16mf2_f32m1_tum(__VA_ARGS__)
+#define vfwredosum_vs_f16m1_f32m1_m(...) __riscv_vfwredosum_vs_f16m1_f32m1_tum(__VA_ARGS__)
+#define vfwredosum_vs_f16m2_f32m1_m(...) __riscv_vfwredosum_vs_f16m2_f32m1_tum(__VA_ARGS__)
+#define vfwredosum_vs_f16m4_f32m1_m(...) __riscv_vfwredosum_vs_f16m4_f32m1_tum(__VA_ARGS__)
+#define vfwredosum_vs_f16m8_f32m1_m(...) __riscv_vfwredosum_vs_f16m8_f32m1_tum(__VA_ARGS__)
+#define vfwredosum_vs_f32mf2_f64m1_m(...) __riscv_vfwredosum_vs_f32mf2_f64m1_tum(__VA_ARGS__)
+#define vfwredosum_vs_f32m1_f64m1_m(...) __riscv_vfwredosum_vs_f32m1_f64m1_tum(__VA_ARGS__)
+#define vfwredosum_vs_f32m2_f64m1_m(...) __riscv_vfwredosum_vs_f32m2_f64m1_tum(__VA_ARGS__)
+#define vfwredosum_vs_f32m4_f64m1_m(...) __riscv_vfwredosum_vs_f32m4_f64m1_tum(__VA_ARGS__)
+#define vfwredosum_vs_f32m8_f64m1_m(...) __riscv_vfwredosum_vs_f32m8_f64m1_tum(__VA_ARGS__)
+#define vfwredusum_vs_f16mf4_f32m1_m(...) __riscv_vfwredusum_vs_f16mf4_f32m1_tum(__VA_ARGS__)
+#define vfwredusum_vs_f16mf2_f32m1_m(...) __riscv_vfwredusum_vs_f16mf2_f32m1_tum(__VA_ARGS__)
+#define vfwredusum_vs_f16m1_f32m1_m(...) __riscv_vfwredusum_vs_f16m1_f32m1_tum(__VA_ARGS__)
+#define vfwredusum_vs_f16m2_f32m1_m(...) __riscv_vfwredusum_vs_f16m2_f32m1_tum(__VA_ARGS__)
+#define vfwredusum_vs_f16m4_f32m1_m(...) __riscv_vfwredusum_vs_f16m4_f32m1_tum(__VA_ARGS__)
+#define vfwredusum_vs_f16m8_f32m1_m(...) __riscv_vfwredusum_vs_f16m8_f32m1_tum(__VA_ARGS__)
+#define vfwredusum_vs_f32mf2_f64m1_m(...) __riscv_vfwredusum_vs_f32mf2_f64m1_tum(__VA_ARGS__)
+#define vfwredusum_vs_f32m1_f64m1_m(...) __riscv_vfwredusum_vs_f32m1_f64m1_tum(__VA_ARGS__)
+#define vfwredusum_vs_f32m2_f64m1_m(...) __riscv_vfwredusum_vs_f32m2_f64m1_tum(__VA_ARGS__)
+#define vfwredusum_vs_f32m4_f64m1_m(...) __riscv_vfwredusum_vs_f32m4_f64m1_tum(__VA_ARGS__)
+#define vfwredusum_vs_f32m8_f64m1_m(...) __riscv_vfwredusum_vs_f32m8_f64m1_tum(__VA_ARGS__)
+#define vlm_v_b1(...) __riscv_vlm_v_b1(__VA_ARGS__) // vlm_v_b* / vsm_v_b* aliases: same name under the __riscv_ prefix, no suffix added
+#define vlm_v_b2(...) __riscv_vlm_v_b2(__VA_ARGS__)
+#define vlm_v_b4(...) __riscv_vlm_v_b4(__VA_ARGS__)
+#define vlm_v_b8(...) __riscv_vlm_v_b8(__VA_ARGS__)
+#define vlm_v_b16(...) __riscv_vlm_v_b16(__VA_ARGS__)
+#define vlm_v_b32(...) __riscv_vlm_v_b32(__VA_ARGS__)
+#define vlm_v_b64(...) __riscv_vlm_v_b64(__VA_ARGS__)
+#define vsm_v_b1(...) __riscv_vsm_v_b1(__VA_ARGS__)
+#define vsm_v_b2(...) __riscv_vsm_v_b2(__VA_ARGS__)
+#define vsm_v_b4(...) __riscv_vsm_v_b4(__VA_ARGS__)
+#define vsm_v_b8(...) __riscv_vsm_v_b8(__VA_ARGS__)
+#define vsm_v_b16(...) __riscv_vsm_v_b16(__VA_ARGS__)
+#define vsm_v_b32(...) __riscv_vsm_v_b32(__VA_ARGS__)
+#define vsm_v_b64(...) __riscv_vsm_v_b64(__VA_ARGS__)
+#define vmand_mm_b1(...) __riscv_vmand_mm_b1(__VA_ARGS__) // *_mm_b* aliases (vmand, vmnand, vmandn, vmxor, vmor, vmnor, vmorn, vmxnor): same name under the __riscv_ prefix, no suffix added
+#define vmand_mm_b2(...) __riscv_vmand_mm_b2(__VA_ARGS__)
+#define vmand_mm_b4(...) __riscv_vmand_mm_b4(__VA_ARGS__)
+#define vmand_mm_b8(...) __riscv_vmand_mm_b8(__VA_ARGS__)
+#define vmand_mm_b16(...) __riscv_vmand_mm_b16(__VA_ARGS__)
+#define vmand_mm_b32(...) __riscv_vmand_mm_b32(__VA_ARGS__)
+#define vmand_mm_b64(...) __riscv_vmand_mm_b64(__VA_ARGS__)
+#define vmnand_mm_b1(...) __riscv_vmnand_mm_b1(__VA_ARGS__)
+#define vmnand_mm_b2(...) __riscv_vmnand_mm_b2(__VA_ARGS__)
+#define vmnand_mm_b4(...) __riscv_vmnand_mm_b4(__VA_ARGS__)
+#define vmnand_mm_b8(...) __riscv_vmnand_mm_b8(__VA_ARGS__)
+#define vmnand_mm_b16(...) __riscv_vmnand_mm_b16(__VA_ARGS__)
+#define vmnand_mm_b32(...) __riscv_vmnand_mm_b32(__VA_ARGS__)
+#define vmnand_mm_b64(...) __riscv_vmnand_mm_b64(__VA_ARGS__)
+#define vmandn_mm_b1(...) __riscv_vmandn_mm_b1(__VA_ARGS__)
+#define vmandn_mm_b2(...) __riscv_vmandn_mm_b2(__VA_ARGS__)
+#define vmandn_mm_b4(...) __riscv_vmandn_mm_b4(__VA_ARGS__)
+#define vmandn_mm_b8(...) __riscv_vmandn_mm_b8(__VA_ARGS__)
+#define vmandn_mm_b16(...) __riscv_vmandn_mm_b16(__VA_ARGS__)
+#define vmandn_mm_b32(...) __riscv_vmandn_mm_b32(__VA_ARGS__)
+#define vmandn_mm_b64(...) __riscv_vmandn_mm_b64(__VA_ARGS__)
+#define vmxor_mm_b1(...) __riscv_vmxor_mm_b1(__VA_ARGS__)
+#define vmxor_mm_b2(...) __riscv_vmxor_mm_b2(__VA_ARGS__)
+#define vmxor_mm_b4(...) __riscv_vmxor_mm_b4(__VA_ARGS__)
+#define vmxor_mm_b8(...) __riscv_vmxor_mm_b8(__VA_ARGS__)
+#define vmxor_mm_b16(...) __riscv_vmxor_mm_b16(__VA_ARGS__)
+#define vmxor_mm_b32(...) __riscv_vmxor_mm_b32(__VA_ARGS__)
+#define vmxor_mm_b64(...) __riscv_vmxor_mm_b64(__VA_ARGS__)
+#define vmor_mm_b1(...) __riscv_vmor_mm_b1(__VA_ARGS__)
+#define vmor_mm_b2(...) __riscv_vmor_mm_b2(__VA_ARGS__)
+#define vmor_mm_b4(...) __riscv_vmor_mm_b4(__VA_ARGS__)
+#define vmor_mm_b8(...) __riscv_vmor_mm_b8(__VA_ARGS__)
+#define vmor_mm_b16(...) __riscv_vmor_mm_b16(__VA_ARGS__)
+#define vmor_mm_b32(...) __riscv_vmor_mm_b32(__VA_ARGS__)
+#define vmor_mm_b64(...) __riscv_vmor_mm_b64(__VA_ARGS__)
+#define vmnor_mm_b1(...) __riscv_vmnor_mm_b1(__VA_ARGS__)
+#define vmnor_mm_b2(...) __riscv_vmnor_mm_b2(__VA_ARGS__)
+#define vmnor_mm_b4(...) __riscv_vmnor_mm_b4(__VA_ARGS__)
+#define vmnor_mm_b8(...) __riscv_vmnor_mm_b8(__VA_ARGS__)
+#define vmnor_mm_b16(...) __riscv_vmnor_mm_b16(__VA_ARGS__)
+#define vmnor_mm_b32(...) __riscv_vmnor_mm_b32(__VA_ARGS__)
+#define vmnor_mm_b64(...) __riscv_vmnor_mm_b64(__VA_ARGS__)
+#define vmorn_mm_b1(...) __riscv_vmorn_mm_b1(__VA_ARGS__)
+#define vmorn_mm_b2(...) __riscv_vmorn_mm_b2(__VA_ARGS__)
+#define vmorn_mm_b4(...) __riscv_vmorn_mm_b4(__VA_ARGS__)
+#define vmorn_mm_b8(...) __riscv_vmorn_mm_b8(__VA_ARGS__)
+#define vmorn_mm_b16(...) __riscv_vmorn_mm_b16(__VA_ARGS__)
+#define vmorn_mm_b32(...) __riscv_vmorn_mm_b32(__VA_ARGS__)
+#define vmorn_mm_b64(...) __riscv_vmorn_mm_b64(__VA_ARGS__)
+#define vmxnor_mm_b1(...) __riscv_vmxnor_mm_b1(__VA_ARGS__)
+#define vmxnor_mm_b2(...) __riscv_vmxnor_mm_b2(__VA_ARGS__)
+#define vmxnor_mm_b4(...) __riscv_vmxnor_mm_b4(__VA_ARGS__)
+#define vmxnor_mm_b8(...) __riscv_vmxnor_mm_b8(__VA_ARGS__)
+#define vmxnor_mm_b16(...) __riscv_vmxnor_mm_b16(__VA_ARGS__)
+#define vmxnor_mm_b32(...) __riscv_vmxnor_mm_b32(__VA_ARGS__)
+#define vmxnor_mm_b64(...) __riscv_vmxnor_mm_b64(__VA_ARGS__)
+#define vmmv_m_b1(...) __riscv_vmmv_m_b1(__VA_ARGS__) // vmmv/vmclr/vmset/vmnot *_m_b* aliases: same name under the __riscv_ prefix, no suffix added
+#define vmmv_m_b2(...) __riscv_vmmv_m_b2(__VA_ARGS__)
+#define vmmv_m_b4(...) __riscv_vmmv_m_b4(__VA_ARGS__)
+#define vmmv_m_b8(...) __riscv_vmmv_m_b8(__VA_ARGS__)
+#define vmmv_m_b16(...) __riscv_vmmv_m_b16(__VA_ARGS__)
+#define vmmv_m_b32(...) __riscv_vmmv_m_b32(__VA_ARGS__)
+#define vmmv_m_b64(...) __riscv_vmmv_m_b64(__VA_ARGS__)
+#define vmclr_m_b1(...) __riscv_vmclr_m_b1(__VA_ARGS__)
+#define vmclr_m_b2(...) __riscv_vmclr_m_b2(__VA_ARGS__)
+#define vmclr_m_b4(...) __riscv_vmclr_m_b4(__VA_ARGS__)
+#define vmclr_m_b8(...) __riscv_vmclr_m_b8(__VA_ARGS__)
+#define vmclr_m_b16(...) __riscv_vmclr_m_b16(__VA_ARGS__)
+#define vmclr_m_b32(...) __riscv_vmclr_m_b32(__VA_ARGS__)
+#define vmclr_m_b64(...) __riscv_vmclr_m_b64(__VA_ARGS__)
+#define vmset_m_b1(...) __riscv_vmset_m_b1(__VA_ARGS__)
+#define vmset_m_b2(...) __riscv_vmset_m_b2(__VA_ARGS__)
+#define vmset_m_b4(...) __riscv_vmset_m_b4(__VA_ARGS__)
+#define vmset_m_b8(...) __riscv_vmset_m_b8(__VA_ARGS__)
+#define vmset_m_b16(...) __riscv_vmset_m_b16(__VA_ARGS__)
+#define vmset_m_b32(...) __riscv_vmset_m_b32(__VA_ARGS__)
+#define vmset_m_b64(...) __riscv_vmset_m_b64(__VA_ARGS__)
+#define vmnot_m_b1(...) __riscv_vmnot_m_b1(__VA_ARGS__)
+#define vmnot_m_b2(...) __riscv_vmnot_m_b2(__VA_ARGS__)
+#define vmnot_m_b4(...) __riscv_vmnot_m_b4(__VA_ARGS__)
+#define vmnot_m_b8(...) __riscv_vmnot_m_b8(__VA_ARGS__)
+#define vmnot_m_b16(...) __riscv_vmnot_m_b16(__VA_ARGS__)
+#define vmnot_m_b32(...) __riscv_vmnot_m_b32(__VA_ARGS__)
+#define vmnot_m_b64(...) __riscv_vmnot_m_b64(__VA_ARGS__)
+#define vcpop_m_b1(...) __riscv_vcpop_m_b1(__VA_ARGS__) // vcpop_m_b* aliases: same name under the __riscv_ prefix
+#define vcpop_m_b2(...) __riscv_vcpop_m_b2(__VA_ARGS__)
+#define vcpop_m_b4(...) __riscv_vcpop_m_b4(__VA_ARGS__)
+#define vcpop_m_b8(...) __riscv_vcpop_m_b8(__VA_ARGS__)
+#define vcpop_m_b16(...) __riscv_vcpop_m_b16(__VA_ARGS__)
+#define vcpop_m_b32(...) __riscv_vcpop_m_b32(__VA_ARGS__)
+#define vcpop_m_b64(...) __riscv_vcpop_m_b64(__VA_ARGS__)
+// masked functions: the vcpop *_m aliases keep the _m suffix on the __riscv_-prefixed name (no suffix substitution)
+#define vcpop_m_b1_m(...) __riscv_vcpop_m_b1_m(__VA_ARGS__)
+#define vcpop_m_b2_m(...) __riscv_vcpop_m_b2_m(__VA_ARGS__)
+#define vcpop_m_b4_m(...) __riscv_vcpop_m_b4_m(__VA_ARGS__)
+#define vcpop_m_b8_m(...) __riscv_vcpop_m_b8_m(__VA_ARGS__)
+#define vcpop_m_b16_m(...) __riscv_vcpop_m_b16_m(__VA_ARGS__)
+#define vcpop_m_b32_m(...) __riscv_vcpop_m_b32_m(__VA_ARGS__)
+#define vcpop_m_b64_m(...) __riscv_vcpop_m_b64_m(__VA_ARGS__)
+#define vfirst_m_b1(...) __riscv_vfirst_m_b1(__VA_ARGS__) // vfirst_m_b* aliases: same name under the __riscv_ prefix
+#define vfirst_m_b2(...) __riscv_vfirst_m_b2(__VA_ARGS__)
+#define vfirst_m_b4(...) __riscv_vfirst_m_b4(__VA_ARGS__)
+#define vfirst_m_b8(...) __riscv_vfirst_m_b8(__VA_ARGS__)
+#define vfirst_m_b16(...) __riscv_vfirst_m_b16(__VA_ARGS__)
+#define vfirst_m_b32(...) __riscv_vfirst_m_b32(__VA_ARGS__)
+#define vfirst_m_b64(...) __riscv_vfirst_m_b64(__VA_ARGS__)
+// masked functions: the vfirst *_m aliases keep the _m suffix on the __riscv_-prefixed name (no suffix substitution)
+#define vfirst_m_b1_m(...) __riscv_vfirst_m_b1_m(__VA_ARGS__)
+#define vfirst_m_b2_m(...) __riscv_vfirst_m_b2_m(__VA_ARGS__)
+#define vfirst_m_b4_m(...) __riscv_vfirst_m_b4_m(__VA_ARGS__)
+#define vfirst_m_b8_m(...) __riscv_vfirst_m_b8_m(__VA_ARGS__)
+#define vfirst_m_b16_m(...) __riscv_vfirst_m_b16_m(__VA_ARGS__)
+#define vfirst_m_b32_m(...) __riscv_vfirst_m_b32_m(__VA_ARGS__)
+#define vfirst_m_b64_m(...) __riscv_vfirst_m_b64_m(__VA_ARGS__)
+#define vmsbf_m_b1(...) __riscv_vmsbf_m_b1(__VA_ARGS__) // vmsbf_m_b* aliases: same name under the __riscv_ prefix
+#define vmsbf_m_b2(...) __riscv_vmsbf_m_b2(__VA_ARGS__)
+#define vmsbf_m_b4(...) __riscv_vmsbf_m_b4(__VA_ARGS__)
+#define vmsbf_m_b8(...) __riscv_vmsbf_m_b8(__VA_ARGS__)
+#define vmsbf_m_b16(...) __riscv_vmsbf_m_b16(__VA_ARGS__)
+#define vmsbf_m_b32(...) __riscv_vmsbf_m_b32(__VA_ARGS__)
+#define vmsbf_m_b64(...) __riscv_vmsbf_m_b64(__VA_ARGS__)
+// masked functions: the vmsbf *_m aliases map to the __riscv_-prefixed *_mu names (the _m suffix is replaced by _mu)
+#define vmsbf_m_b1_m(...) __riscv_vmsbf_m_b1_mu(__VA_ARGS__)
+#define vmsbf_m_b2_m(...) __riscv_vmsbf_m_b2_mu(__VA_ARGS__)
+#define vmsbf_m_b4_m(...) __riscv_vmsbf_m_b4_mu(__VA_ARGS__)
+#define vmsbf_m_b8_m(...) __riscv_vmsbf_m_b8_mu(__VA_ARGS__)
+#define vmsbf_m_b16_m(...) __riscv_vmsbf_m_b16_mu(__VA_ARGS__)
+#define vmsbf_m_b32_m(...) __riscv_vmsbf_m_b32_mu(__VA_ARGS__)
+#define vmsbf_m_b64_m(...) __riscv_vmsbf_m_b64_mu(__VA_ARGS__)
+#define vmsif_m_b1(...) __riscv_vmsif_m_b1(__VA_ARGS__) // vmsif_m_b* aliases: same name under the __riscv_ prefix
+#define vmsif_m_b2(...) __riscv_vmsif_m_b2(__VA_ARGS__)
+#define vmsif_m_b4(...) __riscv_vmsif_m_b4(__VA_ARGS__)
+#define vmsif_m_b8(...) __riscv_vmsif_m_b8(__VA_ARGS__)
+#define vmsif_m_b16(...) __riscv_vmsif_m_b16(__VA_ARGS__)
+#define vmsif_m_b32(...) __riscv_vmsif_m_b32(__VA_ARGS__)
+#define vmsif_m_b64(...) __riscv_vmsif_m_b64(__VA_ARGS__)
+// masked functions: the vmsif *_m aliases map to the __riscv_-prefixed *_mu names (the _m suffix is replaced by _mu)
+#define vmsif_m_b1_m(...) __riscv_vmsif_m_b1_mu(__VA_ARGS__)
+#define vmsif_m_b2_m(...) __riscv_vmsif_m_b2_mu(__VA_ARGS__)
+#define vmsif_m_b4_m(...) __riscv_vmsif_m_b4_mu(__VA_ARGS__)
+#define vmsif_m_b8_m(...) __riscv_vmsif_m_b8_mu(__VA_ARGS__)
+#define vmsif_m_b16_m(...) __riscv_vmsif_m_b16_mu(__VA_ARGS__)
+#define vmsif_m_b32_m(...) __riscv_vmsif_m_b32_mu(__VA_ARGS__)
+#define vmsif_m_b64_m(...) __riscv_vmsif_m_b64_mu(__VA_ARGS__)
+#define vmsof_m_b1(...) __riscv_vmsof_m_b1(__VA_ARGS__) // vmsof_m_b* aliases: same name under the __riscv_ prefix
+#define vmsof_m_b2(...) __riscv_vmsof_m_b2(__VA_ARGS__)
+#define vmsof_m_b4(...) __riscv_vmsof_m_b4(__VA_ARGS__)
+#define vmsof_m_b8(...) __riscv_vmsof_m_b8(__VA_ARGS__)
+#define vmsof_m_b16(...) __riscv_vmsof_m_b16(__VA_ARGS__)
+#define vmsof_m_b32(...) __riscv_vmsof_m_b32(__VA_ARGS__)
+#define vmsof_m_b64(...) __riscv_vmsof_m_b64(__VA_ARGS__)
+// masked functions: the vmsof *_m aliases map to the __riscv_-prefixed *_mu names (the _m suffix is replaced by _mu)
+#define vmsof_m_b1_m(...) __riscv_vmsof_m_b1_mu(__VA_ARGS__)
+#define vmsof_m_b2_m(...) __riscv_vmsof_m_b2_mu(__VA_ARGS__)
+#define vmsof_m_b4_m(...) __riscv_vmsof_m_b4_mu(__VA_ARGS__)
+#define vmsof_m_b8_m(...) __riscv_vmsof_m_b8_mu(__VA_ARGS__)
+#define vmsof_m_b16_m(...) __riscv_vmsof_m_b16_mu(__VA_ARGS__)
+#define vmsof_m_b32_m(...) __riscv_vmsof_m_b32_mu(__VA_ARGS__)
+#define vmsof_m_b64_m(...) __riscv_vmsof_m_b64_mu(__VA_ARGS__)
+#define viota_m_u8mf8(...) __riscv_viota_m_u8mf8(__VA_ARGS__) // viota_m_u* aliases: same name under the __riscv_ prefix
+#define viota_m_u8mf4(...) __riscv_viota_m_u8mf4(__VA_ARGS__)
+#define viota_m_u8mf2(...) __riscv_viota_m_u8mf2(__VA_ARGS__)
+#define viota_m_u8m1(...) __riscv_viota_m_u8m1(__VA_ARGS__)
+#define viota_m_u8m2(...) __riscv_viota_m_u8m2(__VA_ARGS__)
+#define viota_m_u8m4(...) __riscv_viota_m_u8m4(__VA_ARGS__)
+#define viota_m_u8m8(...) __riscv_viota_m_u8m8(__VA_ARGS__)
+#define viota_m_u16mf4(...) __riscv_viota_m_u16mf4(__VA_ARGS__)
+#define viota_m_u16mf2(...) __riscv_viota_m_u16mf2(__VA_ARGS__)
+#define viota_m_u16m1(...) __riscv_viota_m_u16m1(__VA_ARGS__)
+#define viota_m_u16m2(...) __riscv_viota_m_u16m2(__VA_ARGS__)
+#define viota_m_u16m4(...) __riscv_viota_m_u16m4(__VA_ARGS__)
+#define viota_m_u16m8(...) __riscv_viota_m_u16m8(__VA_ARGS__)
+#define viota_m_u32mf2(...) __riscv_viota_m_u32mf2(__VA_ARGS__)
+#define viota_m_u32m1(...) __riscv_viota_m_u32m1(__VA_ARGS__)
+#define viota_m_u32m2(...) __riscv_viota_m_u32m2(__VA_ARGS__)
+#define viota_m_u32m4(...) __riscv_viota_m_u32m4(__VA_ARGS__)
+#define viota_m_u32m8(...) __riscv_viota_m_u32m8(__VA_ARGS__)
+#define viota_m_u64m1(...) __riscv_viota_m_u64m1(__VA_ARGS__)
+#define viota_m_u64m2(...) __riscv_viota_m_u64m2(__VA_ARGS__)
+#define viota_m_u64m4(...) __riscv_viota_m_u64m4(__VA_ARGS__)
+#define viota_m_u64m8(...) __riscv_viota_m_u64m8(__VA_ARGS__)
+// masked functions: the viota *_m aliases map to the __riscv_-prefixed *_tumu names (the _m suffix is replaced by _tumu)
+#define viota_m_u8mf8_m(...) __riscv_viota_m_u8mf8_tumu(__VA_ARGS__)
+#define viota_m_u8mf4_m(...) __riscv_viota_m_u8mf4_tumu(__VA_ARGS__)
+#define viota_m_u8mf2_m(...) __riscv_viota_m_u8mf2_tumu(__VA_ARGS__)
+#define viota_m_u8m1_m(...) __riscv_viota_m_u8m1_tumu(__VA_ARGS__)
+#define viota_m_u8m2_m(...) __riscv_viota_m_u8m2_tumu(__VA_ARGS__)
+#define viota_m_u8m4_m(...) __riscv_viota_m_u8m4_tumu(__VA_ARGS__)
+#define viota_m_u8m8_m(...) __riscv_viota_m_u8m8_tumu(__VA_ARGS__)
+#define viota_m_u16mf4_m(...) __riscv_viota_m_u16mf4_tumu(__VA_ARGS__)
+#define viota_m_u16mf2_m(...) __riscv_viota_m_u16mf2_tumu(__VA_ARGS__)
+#define viota_m_u16m1_m(...) __riscv_viota_m_u16m1_tumu(__VA_ARGS__)
+#define viota_m_u16m2_m(...) __riscv_viota_m_u16m2_tumu(__VA_ARGS__)
+#define viota_m_u16m4_m(...) __riscv_viota_m_u16m4_tumu(__VA_ARGS__)
+#define viota_m_u16m8_m(...) __riscv_viota_m_u16m8_tumu(__VA_ARGS__)
+#define viota_m_u32mf2_m(...) __riscv_viota_m_u32mf2_tumu(__VA_ARGS__)
+#define viota_m_u32m1_m(...) __riscv_viota_m_u32m1_tumu(__VA_ARGS__)
+#define viota_m_u32m2_m(...) __riscv_viota_m_u32m2_tumu(__VA_ARGS__)
+#define viota_m_u32m4_m(...) __riscv_viota_m_u32m4_tumu(__VA_ARGS__)
+#define viota_m_u32m8_m(...) __riscv_viota_m_u32m8_tumu(__VA_ARGS__)
+#define viota_m_u64m1_m(...) __riscv_viota_m_u64m1_tumu(__VA_ARGS__)
+#define viota_m_u64m2_m(...) __riscv_viota_m_u64m2_tumu(__VA_ARGS__)
+#define viota_m_u64m4_m(...) __riscv_viota_m_u64m4_tumu(__VA_ARGS__)
+#define viota_m_u64m8_m(...) __riscv_viota_m_u64m8_tumu(__VA_ARGS__)
+#define vid_v_u8mf8(...) __riscv_vid_v_u8mf8(__VA_ARGS__) // vid_v_u* aliases: same name under the __riscv_ prefix
+#define vid_v_u8mf4(...) __riscv_vid_v_u8mf4(__VA_ARGS__)
+#define vid_v_u8mf2(...) __riscv_vid_v_u8mf2(__VA_ARGS__)
+#define vid_v_u8m1(...) __riscv_vid_v_u8m1(__VA_ARGS__)
+#define vid_v_u8m2(...) __riscv_vid_v_u8m2(__VA_ARGS__)
+#define vid_v_u8m4(...) __riscv_vid_v_u8m4(__VA_ARGS__)
+#define vid_v_u8m8(...) __riscv_vid_v_u8m8(__VA_ARGS__)
+#define vid_v_u16mf4(...) __riscv_vid_v_u16mf4(__VA_ARGS__)
+#define vid_v_u16mf2(...) __riscv_vid_v_u16mf2(__VA_ARGS__)
+#define vid_v_u16m1(...) __riscv_vid_v_u16m1(__VA_ARGS__)
+#define vid_v_u16m2(...) __riscv_vid_v_u16m2(__VA_ARGS__)
+#define vid_v_u16m4(...) __riscv_vid_v_u16m4(__VA_ARGS__)
+#define vid_v_u16m8(...) __riscv_vid_v_u16m8(__VA_ARGS__)
+#define vid_v_u32mf2(...) __riscv_vid_v_u32mf2(__VA_ARGS__)
+#define vid_v_u32m1(...) __riscv_vid_v_u32m1(__VA_ARGS__)
+#define vid_v_u32m2(...) __riscv_vid_v_u32m2(__VA_ARGS__)
+#define vid_v_u32m4(...) __riscv_vid_v_u32m4(__VA_ARGS__)
+#define vid_v_u32m8(...) __riscv_vid_v_u32m8(__VA_ARGS__)
+#define vid_v_u64m1(...) __riscv_vid_v_u64m1(__VA_ARGS__)
+#define vid_v_u64m2(...) __riscv_vid_v_u64m2(__VA_ARGS__)
+#define vid_v_u64m4(...) __riscv_vid_v_u64m4(__VA_ARGS__)
+#define vid_v_u64m8(...) __riscv_vid_v_u64m8(__VA_ARGS__)
+// masked functions: the vid *_m aliases map to the __riscv_-prefixed *_tumu names (the _m suffix is replaced by _tumu)
+#define vid_v_u8mf8_m(...) __riscv_vid_v_u8mf8_tumu(__VA_ARGS__)
+#define vid_v_u8mf4_m(...) __riscv_vid_v_u8mf4_tumu(__VA_ARGS__)
+#define vid_v_u8mf2_m(...) __riscv_vid_v_u8mf2_tumu(__VA_ARGS__)
+#define vid_v_u8m1_m(...) __riscv_vid_v_u8m1_tumu(__VA_ARGS__)
+#define vid_v_u8m2_m(...) __riscv_vid_v_u8m2_tumu(__VA_ARGS__)
+#define vid_v_u8m4_m(...) __riscv_vid_v_u8m4_tumu(__VA_ARGS__)
+#define vid_v_u8m8_m(...) __riscv_vid_v_u8m8_tumu(__VA_ARGS__)
+#define vid_v_u16mf4_m(...) __riscv_vid_v_u16mf4_tumu(__VA_ARGS__)
+#define vid_v_u16mf2_m(...) __riscv_vid_v_u16mf2_tumu(__VA_ARGS__)
+#define vid_v_u16m1_m(...) __riscv_vid_v_u16m1_tumu(__VA_ARGS__)
+#define vid_v_u16m2_m(...) __riscv_vid_v_u16m2_tumu(__VA_ARGS__)
+#define vid_v_u16m4_m(...) __riscv_vid_v_u16m4_tumu(__VA_ARGS__)
+#define vid_v_u16m8_m(...) __riscv_vid_v_u16m8_tumu(__VA_ARGS__)
+#define vid_v_u32mf2_m(...) __riscv_vid_v_u32mf2_tumu(__VA_ARGS__)
+#define vid_v_u32m1_m(...) __riscv_vid_v_u32m1_tumu(__VA_ARGS__)
+#define vid_v_u32m2_m(...) __riscv_vid_v_u32m2_tumu(__VA_ARGS__)
+#define vid_v_u32m4_m(...) __riscv_vid_v_u32m4_tumu(__VA_ARGS__)
+#define vid_v_u32m8_m(...) __riscv_vid_v_u32m8_tumu(__VA_ARGS__)
+#define vid_v_u64m1_m(...) __riscv_vid_v_u64m1_tumu(__VA_ARGS__)
+#define vid_v_u64m2_m(...) __riscv_vid_v_u64m2_tumu(__VA_ARGS__)
+#define vid_v_u64m4_m(...) __riscv_vid_v_u64m4_tumu(__VA_ARGS__)
+#define vid_v_u64m8_m(...) __riscv_vid_v_u64m8_tumu(__VA_ARGS__)
+#define vfmv_f_s_f16mf4_f16(...) __riscv_vfmv_f_s_f16mf4_f16(__VA_ARGS__) // vfmv aliases, paired per type: vfmv_f_s_* keeps its name under the __riscv_ prefix, vfmv_s_f_* maps to the *_tu name
+#define vfmv_s_f_f16mf4(...) __riscv_vfmv_s_f_f16mf4_tu(__VA_ARGS__)
+#define vfmv_f_s_f16mf2_f16(...) __riscv_vfmv_f_s_f16mf2_f16(__VA_ARGS__)
+#define vfmv_s_f_f16mf2(...) __riscv_vfmv_s_f_f16mf2_tu(__VA_ARGS__)
+#define vfmv_f_s_f16m1_f16(...) __riscv_vfmv_f_s_f16m1_f16(__VA_ARGS__)
+#define vfmv_s_f_f16m1(...) __riscv_vfmv_s_f_f16m1_tu(__VA_ARGS__)
+#define vfmv_f_s_f16m2_f16(...) __riscv_vfmv_f_s_f16m2_f16(__VA_ARGS__)
+#define vfmv_s_f_f16m2(...) __riscv_vfmv_s_f_f16m2_tu(__VA_ARGS__)
+#define vfmv_f_s_f16m4_f16(...) __riscv_vfmv_f_s_f16m4_f16(__VA_ARGS__)
+#define vfmv_s_f_f16m4(...) __riscv_vfmv_s_f_f16m4_tu(__VA_ARGS__)
+#define vfmv_f_s_f16m8_f16(...) __riscv_vfmv_f_s_f16m8_f16(__VA_ARGS__)
+#define vfmv_s_f_f16m8(...) __riscv_vfmv_s_f_f16m8_tu(__VA_ARGS__)
+#define vfmv_f_s_f32mf2_f32(...) __riscv_vfmv_f_s_f32mf2_f32(__VA_ARGS__)
+#define vfmv_s_f_f32mf2(...) __riscv_vfmv_s_f_f32mf2_tu(__VA_ARGS__)
+#define vfmv_f_s_f32m1_f32(...) __riscv_vfmv_f_s_f32m1_f32(__VA_ARGS__)
+#define vfmv_s_f_f32m1(...) __riscv_vfmv_s_f_f32m1_tu(__VA_ARGS__)
+#define vfmv_f_s_f32m2_f32(...) __riscv_vfmv_f_s_f32m2_f32(__VA_ARGS__)
+#define vfmv_s_f_f32m2(...) __riscv_vfmv_s_f_f32m2_tu(__VA_ARGS__)
+#define vfmv_f_s_f32m4_f32(...) __riscv_vfmv_f_s_f32m4_f32(__VA_ARGS__)
+#define vfmv_s_f_f32m4(...) __riscv_vfmv_s_f_f32m4_tu(__VA_ARGS__)
+#define vfmv_f_s_f32m8_f32(...) __riscv_vfmv_f_s_f32m8_f32(__VA_ARGS__)
+#define vfmv_s_f_f32m8(...) __riscv_vfmv_s_f_f32m8_tu(__VA_ARGS__)
+#define vfmv_f_s_f64m1_f64(...) __riscv_vfmv_f_s_f64m1_f64(__VA_ARGS__)
+#define vfmv_s_f_f64m1(...) __riscv_vfmv_s_f_f64m1_tu(__VA_ARGS__)
+#define vfmv_f_s_f64m2_f64(...) __riscv_vfmv_f_s_f64m2_f64(__VA_ARGS__)
+#define vfmv_s_f_f64m2(...) __riscv_vfmv_s_f_f64m2_tu(__VA_ARGS__)
+#define vfmv_f_s_f64m4_f64(...) __riscv_vfmv_f_s_f64m4_f64(__VA_ARGS__)
+#define vfmv_s_f_f64m4(...) __riscv_vfmv_s_f_f64m4_tu(__VA_ARGS__)
+#define vfmv_f_s_f64m8_f64(...) __riscv_vfmv_f_s_f64m8_f64(__VA_ARGS__)
+#define vfmv_s_f_f64m8(...) __riscv_vfmv_s_f_f64m8_tu(__VA_ARGS__)
+#define vmv_x_s_i8mf8_i8(...) __riscv_vmv_x_s_i8mf8_i8(__VA_ARGS__) // vmv aliases for the i8..i64 types, paired per type: vmv_x_s_* keeps its name under the __riscv_ prefix, vmv_s_x_* maps to the *_tu name
+#define vmv_s_x_i8mf8(...) __riscv_vmv_s_x_i8mf8_tu(__VA_ARGS__)
+#define vmv_x_s_i8mf4_i8(...) __riscv_vmv_x_s_i8mf4_i8(__VA_ARGS__)
+#define vmv_s_x_i8mf4(...) __riscv_vmv_s_x_i8mf4_tu(__VA_ARGS__)
+#define vmv_x_s_i8mf2_i8(...) __riscv_vmv_x_s_i8mf2_i8(__VA_ARGS__)
+#define vmv_s_x_i8mf2(...) __riscv_vmv_s_x_i8mf2_tu(__VA_ARGS__)
+#define vmv_x_s_i8m1_i8(...) __riscv_vmv_x_s_i8m1_i8(__VA_ARGS__)
+#define vmv_s_x_i8m1(...) __riscv_vmv_s_x_i8m1_tu(__VA_ARGS__)
+#define vmv_x_s_i8m2_i8(...) __riscv_vmv_x_s_i8m2_i8(__VA_ARGS__)
+#define vmv_s_x_i8m2(...) __riscv_vmv_s_x_i8m2_tu(__VA_ARGS__)
+#define vmv_x_s_i8m4_i8(...) __riscv_vmv_x_s_i8m4_i8(__VA_ARGS__)
+#define vmv_s_x_i8m4(...) __riscv_vmv_s_x_i8m4_tu(__VA_ARGS__)
+#define vmv_x_s_i8m8_i8(...) __riscv_vmv_x_s_i8m8_i8(__VA_ARGS__)
+#define vmv_s_x_i8m8(...) __riscv_vmv_s_x_i8m8_tu(__VA_ARGS__)
+#define vmv_x_s_i16mf4_i16(...) __riscv_vmv_x_s_i16mf4_i16(__VA_ARGS__)
+#define vmv_s_x_i16mf4(...) __riscv_vmv_s_x_i16mf4_tu(__VA_ARGS__)
+#define vmv_x_s_i16mf2_i16(...) __riscv_vmv_x_s_i16mf2_i16(__VA_ARGS__)
+#define vmv_s_x_i16mf2(...) __riscv_vmv_s_x_i16mf2_tu(__VA_ARGS__)
+#define vmv_x_s_i16m1_i16(...) __riscv_vmv_x_s_i16m1_i16(__VA_ARGS__)
+#define vmv_s_x_i16m1(...) __riscv_vmv_s_x_i16m1_tu(__VA_ARGS__)
+#define vmv_x_s_i16m2_i16(...) __riscv_vmv_x_s_i16m2_i16(__VA_ARGS__)
+#define vmv_s_x_i16m2(...) __riscv_vmv_s_x_i16m2_tu(__VA_ARGS__)
+#define vmv_x_s_i16m4_i16(...) __riscv_vmv_x_s_i16m4_i16(__VA_ARGS__)
+#define vmv_s_x_i16m4(...) __riscv_vmv_s_x_i16m4_tu(__VA_ARGS__)
+#define vmv_x_s_i16m8_i16(...) __riscv_vmv_x_s_i16m8_i16(__VA_ARGS__)
+#define vmv_s_x_i16m8(...) __riscv_vmv_s_x_i16m8_tu(__VA_ARGS__)
+#define vmv_x_s_i32mf2_i32(...) __riscv_vmv_x_s_i32mf2_i32(__VA_ARGS__)
+#define vmv_s_x_i32mf2(...) __riscv_vmv_s_x_i32mf2_tu(__VA_ARGS__)
+#define vmv_x_s_i32m1_i32(...) __riscv_vmv_x_s_i32m1_i32(__VA_ARGS__)
+#define vmv_s_x_i32m1(...) __riscv_vmv_s_x_i32m1_tu(__VA_ARGS__)
+#define vmv_x_s_i32m2_i32(...) __riscv_vmv_x_s_i32m2_i32(__VA_ARGS__)
+#define vmv_s_x_i32m2(...) __riscv_vmv_s_x_i32m2_tu(__VA_ARGS__)
+#define vmv_x_s_i32m4_i32(...) __riscv_vmv_x_s_i32m4_i32(__VA_ARGS__)
+#define vmv_s_x_i32m4(...) __riscv_vmv_s_x_i32m4_tu(__VA_ARGS__)
+#define vmv_x_s_i32m8_i32(...) __riscv_vmv_x_s_i32m8_i32(__VA_ARGS__)
+#define vmv_s_x_i32m8(...) __riscv_vmv_s_x_i32m8_tu(__VA_ARGS__)
+#define vmv_x_s_i64m1_i64(...) __riscv_vmv_x_s_i64m1_i64(__VA_ARGS__)
+#define vmv_s_x_i64m1(...) __riscv_vmv_s_x_i64m1_tu(__VA_ARGS__)
+#define vmv_x_s_i64m2_i64(...) __riscv_vmv_x_s_i64m2_i64(__VA_ARGS__)
+#define vmv_s_x_i64m2(...) __riscv_vmv_s_x_i64m2_tu(__VA_ARGS__)
+#define vmv_x_s_i64m4_i64(...) __riscv_vmv_x_s_i64m4_i64(__VA_ARGS__)
+#define vmv_s_x_i64m4(...) __riscv_vmv_s_x_i64m4_tu(__VA_ARGS__)
+#define vmv_x_s_i64m8_i64(...) __riscv_vmv_x_s_i64m8_i64(__VA_ARGS__)
+#define vmv_s_x_i64m8(...) __riscv_vmv_s_x_i64m8_tu(__VA_ARGS__)
+#define vmv_x_s_u8mf8_u8(...) __riscv_vmv_x_s_u8mf8_u8(__VA_ARGS__)
+#define vmv_s_x_u8mf8(...) __riscv_vmv_s_x_u8mf8_tu(__VA_ARGS__)
+#define vmv_x_s_u8mf4_u8(...) __riscv_vmv_x_s_u8mf4_u8(__VA_ARGS__)
+#define vmv_s_x_u8mf4(...) __riscv_vmv_s_x_u8mf4_tu(__VA_ARGS__)
+#define vmv_x_s_u8mf2_u8(...) __riscv_vmv_x_s_u8mf2_u8(__VA_ARGS__)
+#define vmv_s_x_u8mf2(...) __riscv_vmv_s_x_u8mf2_tu(__VA_ARGS__)
+#define vmv_x_s_u8m1_u8(...) __riscv_vmv_x_s_u8m1_u8(__VA_ARGS__)
+#define vmv_s_x_u8m1(...) __riscv_vmv_s_x_u8m1_tu(__VA_ARGS__)
+#define vmv_x_s_u8m2_u8(...) __riscv_vmv_x_s_u8m2_u8(__VA_ARGS__)
+#define vmv_s_x_u8m2(...) __riscv_vmv_s_x_u8m2_tu(__VA_ARGS__)
+#define vmv_x_s_u8m4_u8(...) __riscv_vmv_x_s_u8m4_u8(__VA_ARGS__)
+#define vmv_s_x_u8m4(...) __riscv_vmv_s_x_u8m4_tu(__VA_ARGS__)
+#define vmv_x_s_u8m8_u8(...) __riscv_vmv_x_s_u8m8_u8(__VA_ARGS__)
+#define vmv_s_x_u8m8(...) __riscv_vmv_s_x_u8m8_tu(__VA_ARGS__)
+#define vmv_x_s_u16mf4_u16(...) __riscv_vmv_x_s_u16mf4_u16(__VA_ARGS__)
+#define vmv_s_x_u16mf4(...) __riscv_vmv_s_x_u16mf4_tu(__VA_ARGS__)
+#define vmv_x_s_u16mf2_u16(...) __riscv_vmv_x_s_u16mf2_u16(__VA_ARGS__)
+#define vmv_s_x_u16mf2(...) __riscv_vmv_s_x_u16mf2_tu(__VA_ARGS__)
+#define vmv_x_s_u16m1_u16(...) __riscv_vmv_x_s_u16m1_u16(__VA_ARGS__)
+#define vmv_s_x_u16m1(...) __riscv_vmv_s_x_u16m1_tu(__VA_ARGS__)
+#define vmv_x_s_u16m2_u16(...) __riscv_vmv_x_s_u16m2_u16(__VA_ARGS__)
+#define vmv_s_x_u16m2(...) __riscv_vmv_s_x_u16m2_tu(__VA_ARGS__)
+#define vmv_x_s_u16m4_u16(...) __riscv_vmv_x_s_u16m4_u16(__VA_ARGS__)
+#define vmv_s_x_u16m4(...) __riscv_vmv_s_x_u16m4_tu(__VA_ARGS__)
+#define vmv_x_s_u16m8_u16(...) __riscv_vmv_x_s_u16m8_u16(__VA_ARGS__)
+#define vmv_s_x_u16m8(...) __riscv_vmv_s_x_u16m8_tu(__VA_ARGS__)
+#define vmv_x_s_u32mf2_u32(...) __riscv_vmv_x_s_u32mf2_u32(__VA_ARGS__)
+#define vmv_s_x_u32mf2(...) __riscv_vmv_s_x_u32mf2_tu(__VA_ARGS__)
+#define vmv_x_s_u32m1_u32(...) __riscv_vmv_x_s_u32m1_u32(__VA_ARGS__)
+#define vmv_s_x_u32m1(...) __riscv_vmv_s_x_u32m1_tu(__VA_ARGS__)
+#define vmv_x_s_u32m2_u32(...) __riscv_vmv_x_s_u32m2_u32(__VA_ARGS__)
+#define vmv_s_x_u32m2(...) __riscv_vmv_s_x_u32m2_tu(__VA_ARGS__)
+#define vmv_x_s_u32m4_u32(...) __riscv_vmv_x_s_u32m4_u32(__VA_ARGS__)
+#define vmv_s_x_u32m4(...) __riscv_vmv_s_x_u32m4_tu(__VA_ARGS__)
+#define vmv_x_s_u32m8_u32(...) __riscv_vmv_x_s_u32m8_u32(__VA_ARGS__)
+#define vmv_s_x_u32m8(...) __riscv_vmv_s_x_u32m8_tu(__VA_ARGS__)
+#define vmv_x_s_u64m1_u64(...) __riscv_vmv_x_s_u64m1_u64(__VA_ARGS__)
+#define vmv_s_x_u64m1(...) __riscv_vmv_s_x_u64m1_tu(__VA_ARGS__)
+#define vmv_x_s_u64m2_u64(...) __riscv_vmv_x_s_u64m2_u64(__VA_ARGS__)
+#define vmv_s_x_u64m2(...) __riscv_vmv_s_x_u64m2_tu(__VA_ARGS__)
+#define vmv_x_s_u64m4_u64(...) __riscv_vmv_x_s_u64m4_u64(__VA_ARGS__)
+#define vmv_s_x_u64m4(...) __riscv_vmv_s_x_u64m4_tu(__VA_ARGS__)
+#define vmv_x_s_u64m8_u64(...) __riscv_vmv_x_s_u64m8_u64(__VA_ARGS__)
+#define vmv_s_x_u64m8(...) __riscv_vmv_s_x_u64m8_tu(__VA_ARGS__)
+#define vslideup_vx_f16mf4(...) __riscv_vslideup_vx_f16mf4_tu(__VA_ARGS__) // vslideup_vx_*: unprefixed alias -> __riscv_ intrinsic with a _tu suffix; arguments are forwarded unchanged
+#define vslideup_vx_f16mf2(...) __riscv_vslideup_vx_f16mf2_tu(__VA_ARGS__)
+#define vslideup_vx_f16m1(...) __riscv_vslideup_vx_f16m1_tu(__VA_ARGS__)
+#define vslideup_vx_f16m2(...) __riscv_vslideup_vx_f16m2_tu(__VA_ARGS__)
+#define vslideup_vx_f16m4(...) __riscv_vslideup_vx_f16m4_tu(__VA_ARGS__)
+#define vslideup_vx_f16m8(...) __riscv_vslideup_vx_f16m8_tu(__VA_ARGS__)
+#define vslideup_vx_f32mf2(...) __riscv_vslideup_vx_f32mf2_tu(__VA_ARGS__)
+#define vslideup_vx_f32m1(...) __riscv_vslideup_vx_f32m1_tu(__VA_ARGS__)
+#define vslideup_vx_f32m2(...) __riscv_vslideup_vx_f32m2_tu(__VA_ARGS__)
+#define vslideup_vx_f32m4(...) __riscv_vslideup_vx_f32m4_tu(__VA_ARGS__)
+#define vslideup_vx_f32m8(...) __riscv_vslideup_vx_f32m8_tu(__VA_ARGS__)
+#define vslideup_vx_f64m1(...) __riscv_vslideup_vx_f64m1_tu(__VA_ARGS__)
+#define vslideup_vx_f64m2(...) __riscv_vslideup_vx_f64m2_tu(__VA_ARGS__)
+#define vslideup_vx_f64m4(...) __riscv_vslideup_vx_f64m4_tu(__VA_ARGS__)
+#define vslideup_vx_f64m8(...) __riscv_vslideup_vx_f64m8_tu(__VA_ARGS__)
+#define vslideup_vx_i8mf8(...) __riscv_vslideup_vx_i8mf8_tu(__VA_ARGS__)
+#define vslideup_vx_i8mf4(...) __riscv_vslideup_vx_i8mf4_tu(__VA_ARGS__)
+#define vslideup_vx_i8mf2(...) __riscv_vslideup_vx_i8mf2_tu(__VA_ARGS__)
+#define vslideup_vx_i8m1(...) __riscv_vslideup_vx_i8m1_tu(__VA_ARGS__)
+#define vslideup_vx_i8m2(...) __riscv_vslideup_vx_i8m2_tu(__VA_ARGS__)
+#define vslideup_vx_i8m4(...) __riscv_vslideup_vx_i8m4_tu(__VA_ARGS__)
+#define vslideup_vx_i8m8(...) __riscv_vslideup_vx_i8m8_tu(__VA_ARGS__)
+#define vslideup_vx_i16mf4(...) __riscv_vslideup_vx_i16mf4_tu(__VA_ARGS__)
+#define vslideup_vx_i16mf2(...) __riscv_vslideup_vx_i16mf2_tu(__VA_ARGS__)
+#define vslideup_vx_i16m1(...) __riscv_vslideup_vx_i16m1_tu(__VA_ARGS__)
+#define vslideup_vx_i16m2(...) __riscv_vslideup_vx_i16m2_tu(__VA_ARGS__)
+#define vslideup_vx_i16m4(...) __riscv_vslideup_vx_i16m4_tu(__VA_ARGS__)
+#define vslideup_vx_i16m8(...) __riscv_vslideup_vx_i16m8_tu(__VA_ARGS__)
+#define vslideup_vx_i32mf2(...) __riscv_vslideup_vx_i32mf2_tu(__VA_ARGS__)
+#define vslideup_vx_i32m1(...) __riscv_vslideup_vx_i32m1_tu(__VA_ARGS__)
+#define vslideup_vx_i32m2(...) __riscv_vslideup_vx_i32m2_tu(__VA_ARGS__)
+#define vslideup_vx_i32m4(...) __riscv_vslideup_vx_i32m4_tu(__VA_ARGS__)
+#define vslideup_vx_i32m8(...) __riscv_vslideup_vx_i32m8_tu(__VA_ARGS__)
+#define vslideup_vx_i64m1(...) __riscv_vslideup_vx_i64m1_tu(__VA_ARGS__)
+#define vslideup_vx_i64m2(...) __riscv_vslideup_vx_i64m2_tu(__VA_ARGS__)
+#define vslideup_vx_i64m4(...) __riscv_vslideup_vx_i64m4_tu(__VA_ARGS__)
+#define vslideup_vx_i64m8(...) __riscv_vslideup_vx_i64m8_tu(__VA_ARGS__)
+#define vslideup_vx_u8mf8(...) __riscv_vslideup_vx_u8mf8_tu(__VA_ARGS__)
+#define vslideup_vx_u8mf4(...) __riscv_vslideup_vx_u8mf4_tu(__VA_ARGS__)
+#define vslideup_vx_u8mf2(...) __riscv_vslideup_vx_u8mf2_tu(__VA_ARGS__)
+#define vslideup_vx_u8m1(...) __riscv_vslideup_vx_u8m1_tu(__VA_ARGS__)
+#define vslideup_vx_u8m2(...) __riscv_vslideup_vx_u8m2_tu(__VA_ARGS__)
+#define vslideup_vx_u8m4(...) __riscv_vslideup_vx_u8m4_tu(__VA_ARGS__)
+#define vslideup_vx_u8m8(...) __riscv_vslideup_vx_u8m8_tu(__VA_ARGS__)
+#define vslideup_vx_u16mf4(...) __riscv_vslideup_vx_u16mf4_tu(__VA_ARGS__)
+#define vslideup_vx_u16mf2(...) __riscv_vslideup_vx_u16mf2_tu(__VA_ARGS__)
+#define vslideup_vx_u16m1(...) __riscv_vslideup_vx_u16m1_tu(__VA_ARGS__)
+#define vslideup_vx_u16m2(...) __riscv_vslideup_vx_u16m2_tu(__VA_ARGS__)
+#define vslideup_vx_u16m4(...) __riscv_vslideup_vx_u16m4_tu(__VA_ARGS__)
+#define vslideup_vx_u16m8(...) __riscv_vslideup_vx_u16m8_tu(__VA_ARGS__)
+#define vslideup_vx_u32mf2(...) __riscv_vslideup_vx_u32mf2_tu(__VA_ARGS__)
+#define vslideup_vx_u32m1(...) __riscv_vslideup_vx_u32m1_tu(__VA_ARGS__)
+#define vslideup_vx_u32m2(...) __riscv_vslideup_vx_u32m2_tu(__VA_ARGS__)
+#define vslideup_vx_u32m4(...) __riscv_vslideup_vx_u32m4_tu(__VA_ARGS__)
+#define vslideup_vx_u32m8(...) __riscv_vslideup_vx_u32m8_tu(__VA_ARGS__)
+#define vslideup_vx_u64m1(...) __riscv_vslideup_vx_u64m1_tu(__VA_ARGS__)
+#define vslideup_vx_u64m2(...) __riscv_vslideup_vx_u64m2_tu(__VA_ARGS__)
+#define vslideup_vx_u64m4(...) __riscv_vslideup_vx_u64m4_tu(__VA_ARGS__)
+#define vslideup_vx_u64m8(...) __riscv_vslideup_vx_u64m8_tu(__VA_ARGS__)
+// masked functions: each vslideup_vx_*_m alias forwards its arguments to the matching __riscv_vslideup_vx_*_tumu intrinsic
+#define vslideup_vx_f16mf4_m(...) __riscv_vslideup_vx_f16mf4_tumu(__VA_ARGS__)
+#define vslideup_vx_f16mf2_m(...) __riscv_vslideup_vx_f16mf2_tumu(__VA_ARGS__)
+#define vslideup_vx_f16m1_m(...) __riscv_vslideup_vx_f16m1_tumu(__VA_ARGS__)
+#define vslideup_vx_f16m2_m(...) __riscv_vslideup_vx_f16m2_tumu(__VA_ARGS__)
+#define vslideup_vx_f16m4_m(...) __riscv_vslideup_vx_f16m4_tumu(__VA_ARGS__)
+#define vslideup_vx_f16m8_m(...) __riscv_vslideup_vx_f16m8_tumu(__VA_ARGS__)
+#define vslideup_vx_f32mf2_m(...) __riscv_vslideup_vx_f32mf2_tumu(__VA_ARGS__)
+#define vslideup_vx_f32m1_m(...) __riscv_vslideup_vx_f32m1_tumu(__VA_ARGS__)
+#define vslideup_vx_f32m2_m(...) __riscv_vslideup_vx_f32m2_tumu(__VA_ARGS__)
+#define vslideup_vx_f32m4_m(...) __riscv_vslideup_vx_f32m4_tumu(__VA_ARGS__)
+#define vslideup_vx_f32m8_m(...) __riscv_vslideup_vx_f32m8_tumu(__VA_ARGS__)
+#define vslideup_vx_f64m1_m(...) __riscv_vslideup_vx_f64m1_tumu(__VA_ARGS__)
+#define vslideup_vx_f64m2_m(...) __riscv_vslideup_vx_f64m2_tumu(__VA_ARGS__)
+#define vslideup_vx_f64m4_m(...) __riscv_vslideup_vx_f64m4_tumu(__VA_ARGS__)
+#define vslideup_vx_f64m8_m(...) __riscv_vslideup_vx_f64m8_tumu(__VA_ARGS__)
+#define vslideup_vx_i8mf8_m(...) __riscv_vslideup_vx_i8mf8_tumu(__VA_ARGS__)
+#define vslideup_vx_i8mf4_m(...) __riscv_vslideup_vx_i8mf4_tumu(__VA_ARGS__)
+#define vslideup_vx_i8mf2_m(...) __riscv_vslideup_vx_i8mf2_tumu(__VA_ARGS__)
+#define vslideup_vx_i8m1_m(...) __riscv_vslideup_vx_i8m1_tumu(__VA_ARGS__)
+#define vslideup_vx_i8m2_m(...) __riscv_vslideup_vx_i8m2_tumu(__VA_ARGS__)
+#define vslideup_vx_i8m4_m(...) __riscv_vslideup_vx_i8m4_tumu(__VA_ARGS__)
+#define vslideup_vx_i8m8_m(...) __riscv_vslideup_vx_i8m8_tumu(__VA_ARGS__)
+#define vslideup_vx_i16mf4_m(...) __riscv_vslideup_vx_i16mf4_tumu(__VA_ARGS__)
+#define vslideup_vx_i16mf2_m(...) __riscv_vslideup_vx_i16mf2_tumu(__VA_ARGS__)
+#define vslideup_vx_i16m1_m(...) __riscv_vslideup_vx_i16m1_tumu(__VA_ARGS__)
+#define vslideup_vx_i16m2_m(...) __riscv_vslideup_vx_i16m2_tumu(__VA_ARGS__)
+#define vslideup_vx_i16m4_m(...) __riscv_vslideup_vx_i16m4_tumu(__VA_ARGS__)
+#define vslideup_vx_i16m8_m(...) __riscv_vslideup_vx_i16m8_tumu(__VA_ARGS__)
+#define vslideup_vx_i32mf2_m(...) __riscv_vslideup_vx_i32mf2_tumu(__VA_ARGS__)
+#define vslideup_vx_i32m1_m(...) __riscv_vslideup_vx_i32m1_tumu(__VA_ARGS__)
+#define vslideup_vx_i32m2_m(...) __riscv_vslideup_vx_i32m2_tumu(__VA_ARGS__)
+#define vslideup_vx_i32m4_m(...) __riscv_vslideup_vx_i32m4_tumu(__VA_ARGS__)
+#define vslideup_vx_i32m8_m(...) __riscv_vslideup_vx_i32m8_tumu(__VA_ARGS__)
+#define vslideup_vx_i64m1_m(...) __riscv_vslideup_vx_i64m1_tumu(__VA_ARGS__)
+#define vslideup_vx_i64m2_m(...) __riscv_vslideup_vx_i64m2_tumu(__VA_ARGS__)
+#define vslideup_vx_i64m4_m(...) __riscv_vslideup_vx_i64m4_tumu(__VA_ARGS__)
+#define vslideup_vx_i64m8_m(...) __riscv_vslideup_vx_i64m8_tumu(__VA_ARGS__)
+#define vslideup_vx_u8mf8_m(...) __riscv_vslideup_vx_u8mf8_tumu(__VA_ARGS__)
+#define vslideup_vx_u8mf4_m(...) __riscv_vslideup_vx_u8mf4_tumu(__VA_ARGS__)
+#define vslideup_vx_u8mf2_m(...) __riscv_vslideup_vx_u8mf2_tumu(__VA_ARGS__)
+#define vslideup_vx_u8m1_m(...) __riscv_vslideup_vx_u8m1_tumu(__VA_ARGS__)
+#define vslideup_vx_u8m2_m(...) __riscv_vslideup_vx_u8m2_tumu(__VA_ARGS__)
+#define vslideup_vx_u8m4_m(...) __riscv_vslideup_vx_u8m4_tumu(__VA_ARGS__)
+#define vslideup_vx_u8m8_m(...) __riscv_vslideup_vx_u8m8_tumu(__VA_ARGS__)
+#define vslideup_vx_u16mf4_m(...) __riscv_vslideup_vx_u16mf4_tumu(__VA_ARGS__)
+#define vslideup_vx_u16mf2_m(...) __riscv_vslideup_vx_u16mf2_tumu(__VA_ARGS__)
+#define vslideup_vx_u16m1_m(...) __riscv_vslideup_vx_u16m1_tumu(__VA_ARGS__)
+#define vslideup_vx_u16m2_m(...) __riscv_vslideup_vx_u16m2_tumu(__VA_ARGS__)
+#define vslideup_vx_u16m4_m(...) __riscv_vslideup_vx_u16m4_tumu(__VA_ARGS__)
+#define vslideup_vx_u16m8_m(...) __riscv_vslideup_vx_u16m8_tumu(__VA_ARGS__)
+#define vslideup_vx_u32mf2_m(...) __riscv_vslideup_vx_u32mf2_tumu(__VA_ARGS__)
+#define vslideup_vx_u32m1_m(...) __riscv_vslideup_vx_u32m1_tumu(__VA_ARGS__)
+#define vslideup_vx_u32m2_m(...) __riscv_vslideup_vx_u32m2_tumu(__VA_ARGS__)
+#define vslideup_vx_u32m4_m(...) __riscv_vslideup_vx_u32m4_tumu(__VA_ARGS__)
+#define vslideup_vx_u32m8_m(...) __riscv_vslideup_vx_u32m8_tumu(__VA_ARGS__)
+#define vslideup_vx_u64m1_m(...) __riscv_vslideup_vx_u64m1_tumu(__VA_ARGS__)
+#define vslideup_vx_u64m2_m(...) __riscv_vslideup_vx_u64m2_tumu(__VA_ARGS__)
+#define vslideup_vx_u64m4_m(...) __riscv_vslideup_vx_u64m4_tumu(__VA_ARGS__)
+#define vslideup_vx_u64m8_m(...) __riscv_vslideup_vx_u64m8_tumu(__VA_ARGS__)
+#define vslidedown_vx_f16mf4(...) __riscv_vslidedown_vx_f16mf4_tu(__VA_ARGS__) // vslidedown_vx_*: unprefixed alias -> __riscv_ intrinsic with a _tu suffix; arguments are forwarded unchanged
+#define vslidedown_vx_f16mf2(...) __riscv_vslidedown_vx_f16mf2_tu(__VA_ARGS__)
+#define vslidedown_vx_f16m1(...) __riscv_vslidedown_vx_f16m1_tu(__VA_ARGS__)
+#define vslidedown_vx_f16m2(...) __riscv_vslidedown_vx_f16m2_tu(__VA_ARGS__)
+#define vslidedown_vx_f16m4(...) __riscv_vslidedown_vx_f16m4_tu(__VA_ARGS__)
+#define vslidedown_vx_f16m8(...) __riscv_vslidedown_vx_f16m8_tu(__VA_ARGS__)
+#define vslidedown_vx_f32mf2(...) __riscv_vslidedown_vx_f32mf2_tu(__VA_ARGS__)
+#define vslidedown_vx_f32m1(...) __riscv_vslidedown_vx_f32m1_tu(__VA_ARGS__)
+#define vslidedown_vx_f32m2(...) __riscv_vslidedown_vx_f32m2_tu(__VA_ARGS__)
+#define vslidedown_vx_f32m4(...) __riscv_vslidedown_vx_f32m4_tu(__VA_ARGS__)
+#define vslidedown_vx_f32m8(...) __riscv_vslidedown_vx_f32m8_tu(__VA_ARGS__)
+#define vslidedown_vx_f64m1(...) __riscv_vslidedown_vx_f64m1_tu(__VA_ARGS__)
+#define vslidedown_vx_f64m2(...) __riscv_vslidedown_vx_f64m2_tu(__VA_ARGS__)
+#define vslidedown_vx_f64m4(...) __riscv_vslidedown_vx_f64m4_tu(__VA_ARGS__)
+#define vslidedown_vx_f64m8(...) __riscv_vslidedown_vx_f64m8_tu(__VA_ARGS__)
+#define vslidedown_vx_i8mf8(...) __riscv_vslidedown_vx_i8mf8_tu(__VA_ARGS__)
+#define vslidedown_vx_i8mf4(...) __riscv_vslidedown_vx_i8mf4_tu(__VA_ARGS__)
+#define vslidedown_vx_i8mf2(...) __riscv_vslidedown_vx_i8mf2_tu(__VA_ARGS__)
+#define vslidedown_vx_i8m1(...) __riscv_vslidedown_vx_i8m1_tu(__VA_ARGS__)
+#define vslidedown_vx_i8m2(...) __riscv_vslidedown_vx_i8m2_tu(__VA_ARGS__)
+#define vslidedown_vx_i8m4(...) __riscv_vslidedown_vx_i8m4_tu(__VA_ARGS__)
+#define vslidedown_vx_i8m8(...) __riscv_vslidedown_vx_i8m8_tu(__VA_ARGS__)
+#define vslidedown_vx_i16mf4(...) __riscv_vslidedown_vx_i16mf4_tu(__VA_ARGS__)
+#define vslidedown_vx_i16mf2(...) __riscv_vslidedown_vx_i16mf2_tu(__VA_ARGS__)
+#define vslidedown_vx_i16m1(...) __riscv_vslidedown_vx_i16m1_tu(__VA_ARGS__)
+#define vslidedown_vx_i16m2(...) __riscv_vslidedown_vx_i16m2_tu(__VA_ARGS__)
+#define vslidedown_vx_i16m4(...) __riscv_vslidedown_vx_i16m4_tu(__VA_ARGS__)
+#define vslidedown_vx_i16m8(...) __riscv_vslidedown_vx_i16m8_tu(__VA_ARGS__)
+#define vslidedown_vx_i32mf2(...) __riscv_vslidedown_vx_i32mf2_tu(__VA_ARGS__)
+#define vslidedown_vx_i32m1(...) __riscv_vslidedown_vx_i32m1_tu(__VA_ARGS__)
+#define vslidedown_vx_i32m2(...) __riscv_vslidedown_vx_i32m2_tu(__VA_ARGS__)
+#define vslidedown_vx_i32m4(...) __riscv_vslidedown_vx_i32m4_tu(__VA_ARGS__)
+#define vslidedown_vx_i32m8(...) __riscv_vslidedown_vx_i32m8_tu(__VA_ARGS__)
+#define vslidedown_vx_i64m1(...) __riscv_vslidedown_vx_i64m1_tu(__VA_ARGS__)
+#define vslidedown_vx_i64m2(...) __riscv_vslidedown_vx_i64m2_tu(__VA_ARGS__)
+#define vslidedown_vx_i64m4(...) __riscv_vslidedown_vx_i64m4_tu(__VA_ARGS__)
+#define vslidedown_vx_i64m8(...) __riscv_vslidedown_vx_i64m8_tu(__VA_ARGS__)
+#define vslidedown_vx_u8mf8(...) __riscv_vslidedown_vx_u8mf8_tu(__VA_ARGS__)
+#define vslidedown_vx_u8mf4(...) __riscv_vslidedown_vx_u8mf4_tu(__VA_ARGS__)
+#define vslidedown_vx_u8mf2(...) __riscv_vslidedown_vx_u8mf2_tu(__VA_ARGS__)
+#define vslidedown_vx_u8m1(...) __riscv_vslidedown_vx_u8m1_tu(__VA_ARGS__)
+#define vslidedown_vx_u8m2(...) __riscv_vslidedown_vx_u8m2_tu(__VA_ARGS__)
+#define vslidedown_vx_u8m4(...) __riscv_vslidedown_vx_u8m4_tu(__VA_ARGS__)
+#define vslidedown_vx_u8m8(...) __riscv_vslidedown_vx_u8m8_tu(__VA_ARGS__)
+#define vslidedown_vx_u16mf4(...) __riscv_vslidedown_vx_u16mf4_tu(__VA_ARGS__)
+#define vslidedown_vx_u16mf2(...) __riscv_vslidedown_vx_u16mf2_tu(__VA_ARGS__)
+#define vslidedown_vx_u16m1(...) __riscv_vslidedown_vx_u16m1_tu(__VA_ARGS__)
+#define vslidedown_vx_u16m2(...) __riscv_vslidedown_vx_u16m2_tu(__VA_ARGS__)
+#define vslidedown_vx_u16m4(...) __riscv_vslidedown_vx_u16m4_tu(__VA_ARGS__)
+#define vslidedown_vx_u16m8(...) __riscv_vslidedown_vx_u16m8_tu(__VA_ARGS__)
+#define vslidedown_vx_u32mf2(...) __riscv_vslidedown_vx_u32mf2_tu(__VA_ARGS__)
+#define vslidedown_vx_u32m1(...) __riscv_vslidedown_vx_u32m1_tu(__VA_ARGS__)
+#define vslidedown_vx_u32m2(...) __riscv_vslidedown_vx_u32m2_tu(__VA_ARGS__)
+#define vslidedown_vx_u32m4(...) __riscv_vslidedown_vx_u32m4_tu(__VA_ARGS__)
+#define vslidedown_vx_u32m8(...) __riscv_vslidedown_vx_u32m8_tu(__VA_ARGS__)
+#define vslidedown_vx_u64m1(...) __riscv_vslidedown_vx_u64m1_tu(__VA_ARGS__)
+#define vslidedown_vx_u64m2(...) __riscv_vslidedown_vx_u64m2_tu(__VA_ARGS__)
+#define vslidedown_vx_u64m4(...) __riscv_vslidedown_vx_u64m4_tu(__VA_ARGS__)
+#define vslidedown_vx_u64m8(...) __riscv_vslidedown_vx_u64m8_tu(__VA_ARGS__)
+// masked functions: each vslidedown_vx_*_m alias forwards its arguments to the matching __riscv_vslidedown_vx_*_tumu intrinsic
+#define vslidedown_vx_f16mf4_m(...) __riscv_vslidedown_vx_f16mf4_tumu(__VA_ARGS__)
+#define vslidedown_vx_f16mf2_m(...) __riscv_vslidedown_vx_f16mf2_tumu(__VA_ARGS__)
+#define vslidedown_vx_f16m1_m(...) __riscv_vslidedown_vx_f16m1_tumu(__VA_ARGS__)
+#define vslidedown_vx_f16m2_m(...) __riscv_vslidedown_vx_f16m2_tumu(__VA_ARGS__)
+#define vslidedown_vx_f16m4_m(...) __riscv_vslidedown_vx_f16m4_tumu(__VA_ARGS__)
+#define vslidedown_vx_f16m8_m(...) __riscv_vslidedown_vx_f16m8_tumu(__VA_ARGS__)
+#define vslidedown_vx_f32mf2_m(...) __riscv_vslidedown_vx_f32mf2_tumu(__VA_ARGS__)
+#define vslidedown_vx_f32m1_m(...) __riscv_vslidedown_vx_f32m1_tumu(__VA_ARGS__)
+#define vslidedown_vx_f32m2_m(...) __riscv_vslidedown_vx_f32m2_tumu(__VA_ARGS__)
+#define vslidedown_vx_f32m4_m(...) __riscv_vslidedown_vx_f32m4_tumu(__VA_ARGS__)
+#define vslidedown_vx_f32m8_m(...) __riscv_vslidedown_vx_f32m8_tumu(__VA_ARGS__)
+#define vslidedown_vx_f64m1_m(...) __riscv_vslidedown_vx_f64m1_tumu(__VA_ARGS__)
+#define vslidedown_vx_f64m2_m(...) __riscv_vslidedown_vx_f64m2_tumu(__VA_ARGS__)
+#define vslidedown_vx_f64m4_m(...) __riscv_vslidedown_vx_f64m4_tumu(__VA_ARGS__)
+#define vslidedown_vx_f64m8_m(...) __riscv_vslidedown_vx_f64m8_tumu(__VA_ARGS__)
+#define vslidedown_vx_i8mf8_m(...) __riscv_vslidedown_vx_i8mf8_tumu(__VA_ARGS__)
+#define vslidedown_vx_i8mf4_m(...) __riscv_vslidedown_vx_i8mf4_tumu(__VA_ARGS__)
+#define vslidedown_vx_i8mf2_m(...) __riscv_vslidedown_vx_i8mf2_tumu(__VA_ARGS__)
+#define vslidedown_vx_i8m1_m(...) __riscv_vslidedown_vx_i8m1_tumu(__VA_ARGS__)
+#define vslidedown_vx_i8m2_m(...) __riscv_vslidedown_vx_i8m2_tumu(__VA_ARGS__)
+#define vslidedown_vx_i8m4_m(...) __riscv_vslidedown_vx_i8m4_tumu(__VA_ARGS__)
+#define vslidedown_vx_i8m8_m(...) __riscv_vslidedown_vx_i8m8_tumu(__VA_ARGS__)
+#define vslidedown_vx_i16mf4_m(...) __riscv_vslidedown_vx_i16mf4_tumu(__VA_ARGS__)
+#define vslidedown_vx_i16mf2_m(...) __riscv_vslidedown_vx_i16mf2_tumu(__VA_ARGS__)
+#define vslidedown_vx_i16m1_m(...) __riscv_vslidedown_vx_i16m1_tumu(__VA_ARGS__)
+#define vslidedown_vx_i16m2_m(...) __riscv_vslidedown_vx_i16m2_tumu(__VA_ARGS__)
+#define vslidedown_vx_i16m4_m(...) __riscv_vslidedown_vx_i16m4_tumu(__VA_ARGS__)
+#define vslidedown_vx_i16m8_m(...) __riscv_vslidedown_vx_i16m8_tumu(__VA_ARGS__)
+#define vslidedown_vx_i32mf2_m(...) __riscv_vslidedown_vx_i32mf2_tumu(__VA_ARGS__)
+#define vslidedown_vx_i32m1_m(...) __riscv_vslidedown_vx_i32m1_tumu(__VA_ARGS__)
+#define vslidedown_vx_i32m2_m(...) __riscv_vslidedown_vx_i32m2_tumu(__VA_ARGS__)
+#define vslidedown_vx_i32m4_m(...) __riscv_vslidedown_vx_i32m4_tumu(__VA_ARGS__)
+#define vslidedown_vx_i32m8_m(...) __riscv_vslidedown_vx_i32m8_tumu(__VA_ARGS__)
+#define vslidedown_vx_i64m1_m(...) __riscv_vslidedown_vx_i64m1_tumu(__VA_ARGS__)
+#define vslidedown_vx_i64m2_m(...) __riscv_vslidedown_vx_i64m2_tumu(__VA_ARGS__)
+#define vslidedown_vx_i64m4_m(...) __riscv_vslidedown_vx_i64m4_tumu(__VA_ARGS__)
+#define vslidedown_vx_i64m8_m(...) __riscv_vslidedown_vx_i64m8_tumu(__VA_ARGS__)
+#define vslidedown_vx_u8mf8_m(...) __riscv_vslidedown_vx_u8mf8_tumu(__VA_ARGS__)
+#define vslidedown_vx_u8mf4_m(...) __riscv_vslidedown_vx_u8mf4_tumu(__VA_ARGS__)
+#define vslidedown_vx_u8mf2_m(...) __riscv_vslidedown_vx_u8mf2_tumu(__VA_ARGS__)
+#define vslidedown_vx_u8m1_m(...) __riscv_vslidedown_vx_u8m1_tumu(__VA_ARGS__)
+#define vslidedown_vx_u8m2_m(...) __riscv_vslidedown_vx_u8m2_tumu(__VA_ARGS__)
+#define vslidedown_vx_u8m4_m(...) __riscv_vslidedown_vx_u8m4_tumu(__VA_ARGS__)
+#define vslidedown_vx_u8m8_m(...) __riscv_vslidedown_vx_u8m8_tumu(__VA_ARGS__)
+#define vslidedown_vx_u16mf4_m(...) __riscv_vslidedown_vx_u16mf4_tumu(__VA_ARGS__)
+#define vslidedown_vx_u16mf2_m(...) __riscv_vslidedown_vx_u16mf2_tumu(__VA_ARGS__)
+#define vslidedown_vx_u16m1_m(...) __riscv_vslidedown_vx_u16m1_tumu(__VA_ARGS__)
+#define vslidedown_vx_u16m2_m(...) __riscv_vslidedown_vx_u16m2_tumu(__VA_ARGS__)
+#define vslidedown_vx_u16m4_m(...) __riscv_vslidedown_vx_u16m4_tumu(__VA_ARGS__)
+#define vslidedown_vx_u16m8_m(...) __riscv_vslidedown_vx_u16m8_tumu(__VA_ARGS__)
+#define vslidedown_vx_u32mf2_m(...) __riscv_vslidedown_vx_u32mf2_tumu(__VA_ARGS__)
+#define vslidedown_vx_u32m1_m(...) __riscv_vslidedown_vx_u32m1_tumu(__VA_ARGS__)
+#define vslidedown_vx_u32m2_m(...) __riscv_vslidedown_vx_u32m2_tumu(__VA_ARGS__)
+#define vslidedown_vx_u32m4_m(...) __riscv_vslidedown_vx_u32m4_tumu(__VA_ARGS__)
+#define vslidedown_vx_u32m8_m(...) __riscv_vslidedown_vx_u32m8_tumu(__VA_ARGS__)
+#define vslidedown_vx_u64m1_m(...) __riscv_vslidedown_vx_u64m1_tumu(__VA_ARGS__)
+#define vslidedown_vx_u64m2_m(...) __riscv_vslidedown_vx_u64m2_tumu(__VA_ARGS__)
+#define vslidedown_vx_u64m4_m(...) __riscv_vslidedown_vx_u64m4_tumu(__VA_ARGS__)
+#define vslidedown_vx_u64m8_m(...) __riscv_vslidedown_vx_u64m8_tumu(__VA_ARGS__)
+#define vfslide1up_vf_f16mf4(...) __riscv_vfslide1up_vf_f16mf4(__VA_ARGS__) // vfslide1up_vf_* / vfslide1down_vf_*: unprefixed alias -> __riscv_ intrinsic of the same name; no _tu suffix is appended here, unlike the vslideup/vslidedown aliases
+#define vfslide1up_vf_f16mf2(...) __riscv_vfslide1up_vf_f16mf2(__VA_ARGS__)
+#define vfslide1up_vf_f16m1(...) __riscv_vfslide1up_vf_f16m1(__VA_ARGS__)
+#define vfslide1up_vf_f16m2(...) __riscv_vfslide1up_vf_f16m2(__VA_ARGS__)
+#define vfslide1up_vf_f16m4(...) __riscv_vfslide1up_vf_f16m4(__VA_ARGS__)
+#define vfslide1up_vf_f16m8(...) __riscv_vfslide1up_vf_f16m8(__VA_ARGS__)
+#define vfslide1up_vf_f32mf2(...) __riscv_vfslide1up_vf_f32mf2(__VA_ARGS__)
+#define vfslide1up_vf_f32m1(...) __riscv_vfslide1up_vf_f32m1(__VA_ARGS__)
+#define vfslide1up_vf_f32m2(...) __riscv_vfslide1up_vf_f32m2(__VA_ARGS__)
+#define vfslide1up_vf_f32m4(...) __riscv_vfslide1up_vf_f32m4(__VA_ARGS__)
+#define vfslide1up_vf_f32m8(...) __riscv_vfslide1up_vf_f32m8(__VA_ARGS__)
+#define vfslide1up_vf_f64m1(...) __riscv_vfslide1up_vf_f64m1(__VA_ARGS__)
+#define vfslide1up_vf_f64m2(...) __riscv_vfslide1up_vf_f64m2(__VA_ARGS__)
+#define vfslide1up_vf_f64m4(...) __riscv_vfslide1up_vf_f64m4(__VA_ARGS__)
+#define vfslide1up_vf_f64m8(...) __riscv_vfslide1up_vf_f64m8(__VA_ARGS__)
+#define vfslide1down_vf_f16mf4(...) __riscv_vfslide1down_vf_f16mf4(__VA_ARGS__)
+#define vfslide1down_vf_f16mf2(...) __riscv_vfslide1down_vf_f16mf2(__VA_ARGS__)
+#define vfslide1down_vf_f16m1(...) __riscv_vfslide1down_vf_f16m1(__VA_ARGS__)
+#define vfslide1down_vf_f16m2(...) __riscv_vfslide1down_vf_f16m2(__VA_ARGS__)
+#define vfslide1down_vf_f16m4(...) __riscv_vfslide1down_vf_f16m4(__VA_ARGS__)
+#define vfslide1down_vf_f16m8(...) __riscv_vfslide1down_vf_f16m8(__VA_ARGS__)
+#define vfslide1down_vf_f32mf2(...) __riscv_vfslide1down_vf_f32mf2(__VA_ARGS__)
+#define vfslide1down_vf_f32m1(...) __riscv_vfslide1down_vf_f32m1(__VA_ARGS__)
+#define vfslide1down_vf_f32m2(...) __riscv_vfslide1down_vf_f32m2(__VA_ARGS__)
+#define vfslide1down_vf_f32m4(...) __riscv_vfslide1down_vf_f32m4(__VA_ARGS__)
+#define vfslide1down_vf_f32m8(...) __riscv_vfslide1down_vf_f32m8(__VA_ARGS__)
+#define vfslide1down_vf_f64m1(...) __riscv_vfslide1down_vf_f64m1(__VA_ARGS__)
+#define vfslide1down_vf_f64m2(...) __riscv_vfslide1down_vf_f64m2(__VA_ARGS__)
+#define vfslide1down_vf_f64m4(...) __riscv_vfslide1down_vf_f64m4(__VA_ARGS__)
+#define vfslide1down_vf_f64m8(...) __riscv_vfslide1down_vf_f64m8(__VA_ARGS__)
+#define vslide1up_vx_i8mf8(...) __riscv_vslide1up_vx_i8mf8(__VA_ARGS__) // vslide1up_vx_* / vslide1down_vx_* (signed, then unsigned): unprefixed alias -> __riscv_ intrinsic of the same name, no suffix appended
+#define vslide1up_vx_i8mf4(...) __riscv_vslide1up_vx_i8mf4(__VA_ARGS__)
+#define vslide1up_vx_i8mf2(...) __riscv_vslide1up_vx_i8mf2(__VA_ARGS__)
+#define vslide1up_vx_i8m1(...) __riscv_vslide1up_vx_i8m1(__VA_ARGS__)
+#define vslide1up_vx_i8m2(...) __riscv_vslide1up_vx_i8m2(__VA_ARGS__)
+#define vslide1up_vx_i8m4(...) __riscv_vslide1up_vx_i8m4(__VA_ARGS__)
+#define vslide1up_vx_i8m8(...) __riscv_vslide1up_vx_i8m8(__VA_ARGS__)
+#define vslide1up_vx_i16mf4(...) __riscv_vslide1up_vx_i16mf4(__VA_ARGS__)
+#define vslide1up_vx_i16mf2(...) __riscv_vslide1up_vx_i16mf2(__VA_ARGS__)
+#define vslide1up_vx_i16m1(...) __riscv_vslide1up_vx_i16m1(__VA_ARGS__)
+#define vslide1up_vx_i16m2(...) __riscv_vslide1up_vx_i16m2(__VA_ARGS__)
+#define vslide1up_vx_i16m4(...) __riscv_vslide1up_vx_i16m4(__VA_ARGS__)
+#define vslide1up_vx_i16m8(...) __riscv_vslide1up_vx_i16m8(__VA_ARGS__)
+#define vslide1up_vx_i32mf2(...) __riscv_vslide1up_vx_i32mf2(__VA_ARGS__)
+#define vslide1up_vx_i32m1(...) __riscv_vslide1up_vx_i32m1(__VA_ARGS__)
+#define vslide1up_vx_i32m2(...) __riscv_vslide1up_vx_i32m2(__VA_ARGS__)
+#define vslide1up_vx_i32m4(...) __riscv_vslide1up_vx_i32m4(__VA_ARGS__)
+#define vslide1up_vx_i32m8(...) __riscv_vslide1up_vx_i32m8(__VA_ARGS__)
+#define vslide1up_vx_i64m1(...) __riscv_vslide1up_vx_i64m1(__VA_ARGS__)
+#define vslide1up_vx_i64m2(...) __riscv_vslide1up_vx_i64m2(__VA_ARGS__)
+#define vslide1up_vx_i64m4(...) __riscv_vslide1up_vx_i64m4(__VA_ARGS__)
+#define vslide1up_vx_i64m8(...) __riscv_vslide1up_vx_i64m8(__VA_ARGS__)
+#define vslide1down_vx_i8mf8(...) __riscv_vslide1down_vx_i8mf8(__VA_ARGS__)
+#define vslide1down_vx_i8mf4(...) __riscv_vslide1down_vx_i8mf4(__VA_ARGS__)
+#define vslide1down_vx_i8mf2(...) __riscv_vslide1down_vx_i8mf2(__VA_ARGS__)
+#define vslide1down_vx_i8m1(...) __riscv_vslide1down_vx_i8m1(__VA_ARGS__)
+#define vslide1down_vx_i8m2(...) __riscv_vslide1down_vx_i8m2(__VA_ARGS__)
+#define vslide1down_vx_i8m4(...) __riscv_vslide1down_vx_i8m4(__VA_ARGS__)
+#define vslide1down_vx_i8m8(...) __riscv_vslide1down_vx_i8m8(__VA_ARGS__)
+#define vslide1down_vx_i16mf4(...) __riscv_vslide1down_vx_i16mf4(__VA_ARGS__)
+#define vslide1down_vx_i16mf2(...) __riscv_vslide1down_vx_i16mf2(__VA_ARGS__)
+#define vslide1down_vx_i16m1(...) __riscv_vslide1down_vx_i16m1(__VA_ARGS__)
+#define vslide1down_vx_i16m2(...) __riscv_vslide1down_vx_i16m2(__VA_ARGS__)
+#define vslide1down_vx_i16m4(...) __riscv_vslide1down_vx_i16m4(__VA_ARGS__)
+#define vslide1down_vx_i16m8(...) __riscv_vslide1down_vx_i16m8(__VA_ARGS__)
+#define vslide1down_vx_i32mf2(...) __riscv_vslide1down_vx_i32mf2(__VA_ARGS__)
+#define vslide1down_vx_i32m1(...) __riscv_vslide1down_vx_i32m1(__VA_ARGS__)
+#define vslide1down_vx_i32m2(...) __riscv_vslide1down_vx_i32m2(__VA_ARGS__)
+#define vslide1down_vx_i32m4(...) __riscv_vslide1down_vx_i32m4(__VA_ARGS__)
+#define vslide1down_vx_i32m8(...) __riscv_vslide1down_vx_i32m8(__VA_ARGS__)
+#define vslide1down_vx_i64m1(...) __riscv_vslide1down_vx_i64m1(__VA_ARGS__)
+#define vslide1down_vx_i64m2(...) __riscv_vslide1down_vx_i64m2(__VA_ARGS__)
+#define vslide1down_vx_i64m4(...) __riscv_vslide1down_vx_i64m4(__VA_ARGS__)
+#define vslide1down_vx_i64m8(...) __riscv_vslide1down_vx_i64m8(__VA_ARGS__)
+#define vslide1up_vx_u8mf8(...) __riscv_vslide1up_vx_u8mf8(__VA_ARGS__)
+#define vslide1up_vx_u8mf4(...) __riscv_vslide1up_vx_u8mf4(__VA_ARGS__)
+#define vslide1up_vx_u8mf2(...) __riscv_vslide1up_vx_u8mf2(__VA_ARGS__)
+#define vslide1up_vx_u8m1(...) __riscv_vslide1up_vx_u8m1(__VA_ARGS__)
+#define vslide1up_vx_u8m2(...) __riscv_vslide1up_vx_u8m2(__VA_ARGS__)
+#define vslide1up_vx_u8m4(...) __riscv_vslide1up_vx_u8m4(__VA_ARGS__)
+#define vslide1up_vx_u8m8(...) __riscv_vslide1up_vx_u8m8(__VA_ARGS__)
+#define vslide1up_vx_u16mf4(...) __riscv_vslide1up_vx_u16mf4(__VA_ARGS__)
+#define vslide1up_vx_u16mf2(...) __riscv_vslide1up_vx_u16mf2(__VA_ARGS__)
+#define vslide1up_vx_u16m1(...) __riscv_vslide1up_vx_u16m1(__VA_ARGS__)
+#define vslide1up_vx_u16m2(...) __riscv_vslide1up_vx_u16m2(__VA_ARGS__)
+#define vslide1up_vx_u16m4(...) __riscv_vslide1up_vx_u16m4(__VA_ARGS__)
+#define vslide1up_vx_u16m8(...) __riscv_vslide1up_vx_u16m8(__VA_ARGS__)
+#define vslide1up_vx_u32mf2(...) __riscv_vslide1up_vx_u32mf2(__VA_ARGS__)
+#define vslide1up_vx_u32m1(...) __riscv_vslide1up_vx_u32m1(__VA_ARGS__)
+#define vslide1up_vx_u32m2(...) __riscv_vslide1up_vx_u32m2(__VA_ARGS__)
+#define vslide1up_vx_u32m4(...) __riscv_vslide1up_vx_u32m4(__VA_ARGS__)
+#define vslide1up_vx_u32m8(...) __riscv_vslide1up_vx_u32m8(__VA_ARGS__)
+#define vslide1up_vx_u64m1(...) __riscv_vslide1up_vx_u64m1(__VA_ARGS__)
+#define vslide1up_vx_u64m2(...) __riscv_vslide1up_vx_u64m2(__VA_ARGS__)
+#define vslide1up_vx_u64m4(...) __riscv_vslide1up_vx_u64m4(__VA_ARGS__)
+#define vslide1up_vx_u64m8(...) __riscv_vslide1up_vx_u64m8(__VA_ARGS__)
+#define vslide1down_vx_u8mf8(...) __riscv_vslide1down_vx_u8mf8(__VA_ARGS__)
+#define vslide1down_vx_u8mf4(...) __riscv_vslide1down_vx_u8mf4(__VA_ARGS__)
+#define vslide1down_vx_u8mf2(...) __riscv_vslide1down_vx_u8mf2(__VA_ARGS__)
+#define vslide1down_vx_u8m1(...) __riscv_vslide1down_vx_u8m1(__VA_ARGS__)
+#define vslide1down_vx_u8m2(...) __riscv_vslide1down_vx_u8m2(__VA_ARGS__)
+#define vslide1down_vx_u8m4(...) __riscv_vslide1down_vx_u8m4(__VA_ARGS__)
+#define vslide1down_vx_u8m8(...) __riscv_vslide1down_vx_u8m8(__VA_ARGS__)
+#define vslide1down_vx_u16mf4(...) __riscv_vslide1down_vx_u16mf4(__VA_ARGS__)
+#define vslide1down_vx_u16mf2(...) __riscv_vslide1down_vx_u16mf2(__VA_ARGS__)
+#define vslide1down_vx_u16m1(...) __riscv_vslide1down_vx_u16m1(__VA_ARGS__)
+#define vslide1down_vx_u16m2(...) __riscv_vslide1down_vx_u16m2(__VA_ARGS__)
+#define vslide1down_vx_u16m4(...) __riscv_vslide1down_vx_u16m4(__VA_ARGS__)
+#define vslide1down_vx_u16m8(...) __riscv_vslide1down_vx_u16m8(__VA_ARGS__)
+#define vslide1down_vx_u32mf2(...) __riscv_vslide1down_vx_u32mf2(__VA_ARGS__)
+#define vslide1down_vx_u32m1(...) __riscv_vslide1down_vx_u32m1(__VA_ARGS__)
+#define vslide1down_vx_u32m2(...) __riscv_vslide1down_vx_u32m2(__VA_ARGS__)
+#define vslide1down_vx_u32m4(...) __riscv_vslide1down_vx_u32m4(__VA_ARGS__)
+#define vslide1down_vx_u32m8(...) __riscv_vslide1down_vx_u32m8(__VA_ARGS__)
+#define vslide1down_vx_u64m1(...) __riscv_vslide1down_vx_u64m1(__VA_ARGS__)
+#define vslide1down_vx_u64m2(...) __riscv_vslide1down_vx_u64m2(__VA_ARGS__)
+#define vslide1down_vx_u64m4(...) __riscv_vslide1down_vx_u64m4(__VA_ARGS__)
+#define vslide1down_vx_u64m8(...) __riscv_vslide1down_vx_u64m8(__VA_ARGS__)
+// masked functions: _m aliases of the slide1 macros above; each forwards its arguments to the matching __riscv_*_tumu intrinsic
+#define vfslide1up_vf_f16mf4_m(...) __riscv_vfslide1up_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfslide1up_vf_f16mf2_m(...) __riscv_vfslide1up_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfslide1up_vf_f16m1_m(...) __riscv_vfslide1up_vf_f16m1_tumu(__VA_ARGS__)
+#define vfslide1up_vf_f16m2_m(...) __riscv_vfslide1up_vf_f16m2_tumu(__VA_ARGS__)
+#define vfslide1up_vf_f16m4_m(...) __riscv_vfslide1up_vf_f16m4_tumu(__VA_ARGS__)
+#define vfslide1up_vf_f16m8_m(...) __riscv_vfslide1up_vf_f16m8_tumu(__VA_ARGS__)
+#define vfslide1up_vf_f32mf2_m(...) __riscv_vfslide1up_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfslide1up_vf_f32m1_m(...) __riscv_vfslide1up_vf_f32m1_tumu(__VA_ARGS__)
+#define vfslide1up_vf_f32m2_m(...) __riscv_vfslide1up_vf_f32m2_tumu(__VA_ARGS__)
+#define vfslide1up_vf_f32m4_m(...) __riscv_vfslide1up_vf_f32m4_tumu(__VA_ARGS__)
+#define vfslide1up_vf_f32m8_m(...) __riscv_vfslide1up_vf_f32m8_tumu(__VA_ARGS__)
+#define vfslide1up_vf_f64m1_m(...) __riscv_vfslide1up_vf_f64m1_tumu(__VA_ARGS__)
+#define vfslide1up_vf_f64m2_m(...) __riscv_vfslide1up_vf_f64m2_tumu(__VA_ARGS__)
+#define vfslide1up_vf_f64m4_m(...) __riscv_vfslide1up_vf_f64m4_tumu(__VA_ARGS__)
+#define vfslide1up_vf_f64m8_m(...) __riscv_vfslide1up_vf_f64m8_tumu(__VA_ARGS__)
+#define vfslide1down_vf_f16mf4_m(...) __riscv_vfslide1down_vf_f16mf4_tumu(__VA_ARGS__)
+#define vfslide1down_vf_f16mf2_m(...) __riscv_vfslide1down_vf_f16mf2_tumu(__VA_ARGS__)
+#define vfslide1down_vf_f16m1_m(...) __riscv_vfslide1down_vf_f16m1_tumu(__VA_ARGS__)
+#define vfslide1down_vf_f16m2_m(...) __riscv_vfslide1down_vf_f16m2_tumu(__VA_ARGS__)
+#define vfslide1down_vf_f16m4_m(...) __riscv_vfslide1down_vf_f16m4_tumu(__VA_ARGS__)
+#define vfslide1down_vf_f16m8_m(...) __riscv_vfslide1down_vf_f16m8_tumu(__VA_ARGS__)
+#define vfslide1down_vf_f32mf2_m(...) __riscv_vfslide1down_vf_f32mf2_tumu(__VA_ARGS__)
+#define vfslide1down_vf_f32m1_m(...) __riscv_vfslide1down_vf_f32m1_tumu(__VA_ARGS__)
+#define vfslide1down_vf_f32m2_m(...) __riscv_vfslide1down_vf_f32m2_tumu(__VA_ARGS__)
+#define vfslide1down_vf_f32m4_m(...) __riscv_vfslide1down_vf_f32m4_tumu(__VA_ARGS__)
+#define vfslide1down_vf_f32m8_m(...) __riscv_vfslide1down_vf_f32m8_tumu(__VA_ARGS__)
+#define vfslide1down_vf_f64m1_m(...) __riscv_vfslide1down_vf_f64m1_tumu(__VA_ARGS__)
+#define vfslide1down_vf_f64m2_m(...) __riscv_vfslide1down_vf_f64m2_tumu(__VA_ARGS__)
+#define vfslide1down_vf_f64m4_m(...) __riscv_vfslide1down_vf_f64m4_tumu(__VA_ARGS__)
+#define vfslide1down_vf_f64m8_m(...) __riscv_vfslide1down_vf_f64m8_tumu(__VA_ARGS__)
+#define vslide1up_vx_i8mf8_m(...) __riscv_vslide1up_vx_i8mf8_tumu(__VA_ARGS__)
+#define vslide1up_vx_i8mf4_m(...) __riscv_vslide1up_vx_i8mf4_tumu(__VA_ARGS__)
+#define vslide1up_vx_i8mf2_m(...) __riscv_vslide1up_vx_i8mf2_tumu(__VA_ARGS__)
+#define vslide1up_vx_i8m1_m(...) __riscv_vslide1up_vx_i8m1_tumu(__VA_ARGS__)
+#define vslide1up_vx_i8m2_m(...) __riscv_vslide1up_vx_i8m2_tumu(__VA_ARGS__)
+#define vslide1up_vx_i8m4_m(...) __riscv_vslide1up_vx_i8m4_tumu(__VA_ARGS__)
+#define vslide1up_vx_i8m8_m(...) __riscv_vslide1up_vx_i8m8_tumu(__VA_ARGS__)
+#define vslide1up_vx_i16mf4_m(...) __riscv_vslide1up_vx_i16mf4_tumu(__VA_ARGS__)
+#define vslide1up_vx_i16mf2_m(...) __riscv_vslide1up_vx_i16mf2_tumu(__VA_ARGS__)
+#define vslide1up_vx_i16m1_m(...) __riscv_vslide1up_vx_i16m1_tumu(__VA_ARGS__)
+#define vslide1up_vx_i16m2_m(...) __riscv_vslide1up_vx_i16m2_tumu(__VA_ARGS__)
+#define vslide1up_vx_i16m4_m(...) __riscv_vslide1up_vx_i16m4_tumu(__VA_ARGS__)
+#define vslide1up_vx_i16m8_m(...) __riscv_vslide1up_vx_i16m8_tumu(__VA_ARGS__)
+#define vslide1up_vx_i32mf2_m(...) __riscv_vslide1up_vx_i32mf2_tumu(__VA_ARGS__)
+#define vslide1up_vx_i32m1_m(...) __riscv_vslide1up_vx_i32m1_tumu(__VA_ARGS__)
+#define vslide1up_vx_i32m2_m(...) __riscv_vslide1up_vx_i32m2_tumu(__VA_ARGS__)
+#define vslide1up_vx_i32m4_m(...) __riscv_vslide1up_vx_i32m4_tumu(__VA_ARGS__)
+#define vslide1up_vx_i32m8_m(...) __riscv_vslide1up_vx_i32m8_tumu(__VA_ARGS__)
+#define vslide1up_vx_i64m1_m(...) __riscv_vslide1up_vx_i64m1_tumu(__VA_ARGS__)
+#define vslide1up_vx_i64m2_m(...) __riscv_vslide1up_vx_i64m2_tumu(__VA_ARGS__)
+#define vslide1up_vx_i64m4_m(...) __riscv_vslide1up_vx_i64m4_tumu(__VA_ARGS__)
+#define vslide1up_vx_i64m8_m(...) __riscv_vslide1up_vx_i64m8_tumu(__VA_ARGS__)
+#define vslide1down_vx_i8mf8_m(...) __riscv_vslide1down_vx_i8mf8_tumu(__VA_ARGS__)
+#define vslide1down_vx_i8mf4_m(...) __riscv_vslide1down_vx_i8mf4_tumu(__VA_ARGS__)
+#define vslide1down_vx_i8mf2_m(...) __riscv_vslide1down_vx_i8mf2_tumu(__VA_ARGS__)
+#define vslide1down_vx_i8m1_m(...) __riscv_vslide1down_vx_i8m1_tumu(__VA_ARGS__)
+#define vslide1down_vx_i8m2_m(...) __riscv_vslide1down_vx_i8m2_tumu(__VA_ARGS__)
+#define vslide1down_vx_i8m4_m(...) __riscv_vslide1down_vx_i8m4_tumu(__VA_ARGS__)
+#define vslide1down_vx_i8m8_m(...) __riscv_vslide1down_vx_i8m8_tumu(__VA_ARGS__)
+#define vslide1down_vx_i16mf4_m(...) __riscv_vslide1down_vx_i16mf4_tumu(__VA_ARGS__)
+#define vslide1down_vx_i16mf2_m(...) __riscv_vslide1down_vx_i16mf2_tumu(__VA_ARGS__)
+#define vslide1down_vx_i16m1_m(...) __riscv_vslide1down_vx_i16m1_tumu(__VA_ARGS__)
+#define vslide1down_vx_i16m2_m(...) __riscv_vslide1down_vx_i16m2_tumu(__VA_ARGS__)
+#define vslide1down_vx_i16m4_m(...) __riscv_vslide1down_vx_i16m4_tumu(__VA_ARGS__)
+#define vslide1down_vx_i16m8_m(...) __riscv_vslide1down_vx_i16m8_tumu(__VA_ARGS__)
+#define vslide1down_vx_i32mf2_m(...) __riscv_vslide1down_vx_i32mf2_tumu(__VA_ARGS__)
+#define vslide1down_vx_i32m1_m(...) __riscv_vslide1down_vx_i32m1_tumu(__VA_ARGS__)
+#define vslide1down_vx_i32m2_m(...) __riscv_vslide1down_vx_i32m2_tumu(__VA_ARGS__)
+#define vslide1down_vx_i32m4_m(...) __riscv_vslide1down_vx_i32m4_tumu(__VA_ARGS__)
+#define vslide1down_vx_i32m8_m(...) __riscv_vslide1down_vx_i32m8_tumu(__VA_ARGS__)
+#define vslide1down_vx_i64m1_m(...) __riscv_vslide1down_vx_i64m1_tumu(__VA_ARGS__)
+#define vslide1down_vx_i64m2_m(...) __riscv_vslide1down_vx_i64m2_tumu(__VA_ARGS__)
+#define vslide1down_vx_i64m4_m(...) __riscv_vslide1down_vx_i64m4_tumu(__VA_ARGS__)
+#define vslide1down_vx_i64m8_m(...) __riscv_vslide1down_vx_i64m8_tumu(__VA_ARGS__)
+#define vslide1up_vx_u8mf8_m(...) __riscv_vslide1up_vx_u8mf8_tumu(__VA_ARGS__)
+#define vslide1up_vx_u8mf4_m(...) __riscv_vslide1up_vx_u8mf4_tumu(__VA_ARGS__)
+#define vslide1up_vx_u8mf2_m(...) __riscv_vslide1up_vx_u8mf2_tumu(__VA_ARGS__)
+#define vslide1up_vx_u8m1_m(...) __riscv_vslide1up_vx_u8m1_tumu(__VA_ARGS__)
+#define vslide1up_vx_u8m2_m(...) __riscv_vslide1up_vx_u8m2_tumu(__VA_ARGS__)
+#define vslide1up_vx_u8m4_m(...) __riscv_vslide1up_vx_u8m4_tumu(__VA_ARGS__)
+#define vslide1up_vx_u8m8_m(...) __riscv_vslide1up_vx_u8m8_tumu(__VA_ARGS__)
+#define vslide1up_vx_u16mf4_m(...) __riscv_vslide1up_vx_u16mf4_tumu(__VA_ARGS__)
+#define vslide1up_vx_u16mf2_m(...) __riscv_vslide1up_vx_u16mf2_tumu(__VA_ARGS__)
+#define vslide1up_vx_u16m1_m(...) __riscv_vslide1up_vx_u16m1_tumu(__VA_ARGS__)
+#define vslide1up_vx_u16m2_m(...) __riscv_vslide1up_vx_u16m2_tumu(__VA_ARGS__)
+#define vslide1up_vx_u16m4_m(...) __riscv_vslide1up_vx_u16m4_tumu(__VA_ARGS__)
+#define vslide1up_vx_u16m8_m(...) __riscv_vslide1up_vx_u16m8_tumu(__VA_ARGS__)
+#define vslide1up_vx_u32mf2_m(...) __riscv_vslide1up_vx_u32mf2_tumu(__VA_ARGS__)
+#define vslide1up_vx_u32m1_m(...) __riscv_vslide1up_vx_u32m1_tumu(__VA_ARGS__)
+#define vslide1up_vx_u32m2_m(...) __riscv_vslide1up_vx_u32m2_tumu(__VA_ARGS__)
+#define vslide1up_vx_u32m4_m(...) __riscv_vslide1up_vx_u32m4_tumu(__VA_ARGS__)
+#define vslide1up_vx_u32m8_m(...) __riscv_vslide1up_vx_u32m8_tumu(__VA_ARGS__)
+#define vslide1up_vx_u64m1_m(...) __riscv_vslide1up_vx_u64m1_tumu(__VA_ARGS__)
+#define vslide1up_vx_u64m2_m(...) __riscv_vslide1up_vx_u64m2_tumu(__VA_ARGS__)
+#define vslide1up_vx_u64m4_m(...) __riscv_vslide1up_vx_u64m4_tumu(__VA_ARGS__)
+#define vslide1up_vx_u64m8_m(...) __riscv_vslide1up_vx_u64m8_tumu(__VA_ARGS__)
+#define vslide1down_vx_u8mf8_m(...) __riscv_vslide1down_vx_u8mf8_tumu(__VA_ARGS__)
+#define vslide1down_vx_u8mf4_m(...) __riscv_vslide1down_vx_u8mf4_tumu(__VA_ARGS__)
+#define vslide1down_vx_u8mf2_m(...) __riscv_vslide1down_vx_u8mf2_tumu(__VA_ARGS__)
+#define vslide1down_vx_u8m1_m(...) __riscv_vslide1down_vx_u8m1_tumu(__VA_ARGS__)
+#define vslide1down_vx_u8m2_m(...) __riscv_vslide1down_vx_u8m2_tumu(__VA_ARGS__)
+#define vslide1down_vx_u8m4_m(...) __riscv_vslide1down_vx_u8m4_tumu(__VA_ARGS__)
+#define vslide1down_vx_u8m8_m(...) __riscv_vslide1down_vx_u8m8_tumu(__VA_ARGS__)
+#define vslide1down_vx_u16mf4_m(...) __riscv_vslide1down_vx_u16mf4_tumu(__VA_ARGS__)
+#define vslide1down_vx_u16mf2_m(...) __riscv_vslide1down_vx_u16mf2_tumu(__VA_ARGS__)
+#define vslide1down_vx_u16m1_m(...) __riscv_vslide1down_vx_u16m1_tumu(__VA_ARGS__)
+#define vslide1down_vx_u16m2_m(...) __riscv_vslide1down_vx_u16m2_tumu(__VA_ARGS__)
+#define vslide1down_vx_u16m4_m(...) __riscv_vslide1down_vx_u16m4_tumu(__VA_ARGS__)
+#define vslide1down_vx_u16m8_m(...) __riscv_vslide1down_vx_u16m8_tumu(__VA_ARGS__)
+#define vslide1down_vx_u32mf2_m(...) __riscv_vslide1down_vx_u32mf2_tumu(__VA_ARGS__)
+#define vslide1down_vx_u32m1_m(...) __riscv_vslide1down_vx_u32m1_tumu(__VA_ARGS__)
+#define vslide1down_vx_u32m2_m(...) __riscv_vslide1down_vx_u32m2_tumu(__VA_ARGS__)
+#define vslide1down_vx_u32m4_m(...) __riscv_vslide1down_vx_u32m4_tumu(__VA_ARGS__)
+#define vslide1down_vx_u32m8_m(...) __riscv_vslide1down_vx_u32m8_tumu(__VA_ARGS__)
+#define vslide1down_vx_u64m1_m(...) __riscv_vslide1down_vx_u64m1_tumu(__VA_ARGS__)
+#define vslide1down_vx_u64m2_m(...) __riscv_vslide1down_vx_u64m2_tumu(__VA_ARGS__)
+#define vslide1down_vx_u64m4_m(...) __riscv_vslide1down_vx_u64m4_tumu(__VA_ARGS__)
+#define vslide1down_vx_u64m8_m(...) __riscv_vslide1down_vx_u64m8_tumu(__VA_ARGS__)
+#define vrgather_vv_f16mf4(...) __riscv_vrgather_vv_f16mf4(__VA_ARGS__)
+#define vrgather_vx_f16mf4(...) __riscv_vrgather_vx_f16mf4(__VA_ARGS__)
+#define vrgather_vv_f16mf2(...) __riscv_vrgather_vv_f16mf2(__VA_ARGS__)
+#define vrgather_vx_f16mf2(...) __riscv_vrgather_vx_f16mf2(__VA_ARGS__)
+#define vrgather_vv_f16m1(...) __riscv_vrgather_vv_f16m1(__VA_ARGS__)
+#define vrgather_vx_f16m1(...) __riscv_vrgather_vx_f16m1(__VA_ARGS__)
+#define vrgather_vv_f16m2(...) __riscv_vrgather_vv_f16m2(__VA_ARGS__)
+#define vrgather_vx_f16m2(...) __riscv_vrgather_vx_f16m2(__VA_ARGS__)
+#define vrgather_vv_f16m4(...) __riscv_vrgather_vv_f16m4(__VA_ARGS__)
+#define vrgather_vx_f16m4(...) __riscv_vrgather_vx_f16m4(__VA_ARGS__)
+#define vrgather_vv_f16m8(...) __riscv_vrgather_vv_f16m8(__VA_ARGS__)
+#define vrgather_vx_f16m8(...) __riscv_vrgather_vx_f16m8(__VA_ARGS__)
+#define vrgather_vv_f32mf2(...) __riscv_vrgather_vv_f32mf2(__VA_ARGS__)
+#define vrgather_vx_f32mf2(...) __riscv_vrgather_vx_f32mf2(__VA_ARGS__)
+#define vrgather_vv_f32m1(...) __riscv_vrgather_vv_f32m1(__VA_ARGS__)
+#define vrgather_vx_f32m1(...) __riscv_vrgather_vx_f32m1(__VA_ARGS__)
+#define vrgather_vv_f32m2(...) __riscv_vrgather_vv_f32m2(__VA_ARGS__)
+#define vrgather_vx_f32m2(...) __riscv_vrgather_vx_f32m2(__VA_ARGS__)
+#define vrgather_vv_f32m4(...) __riscv_vrgather_vv_f32m4(__VA_ARGS__)
+#define vrgather_vx_f32m4(...) __riscv_vrgather_vx_f32m4(__VA_ARGS__)
+#define vrgather_vv_f32m8(...) __riscv_vrgather_vv_f32m8(__VA_ARGS__)
+#define vrgather_vx_f32m8(...) __riscv_vrgather_vx_f32m8(__VA_ARGS__)
+#define vrgather_vv_f64m1(...) __riscv_vrgather_vv_f64m1(__VA_ARGS__)
+#define vrgather_vx_f64m1(...) __riscv_vrgather_vx_f64m1(__VA_ARGS__)
+#define vrgather_vv_f64m2(...) __riscv_vrgather_vv_f64m2(__VA_ARGS__)
+#define vrgather_vx_f64m2(...) __riscv_vrgather_vx_f64m2(__VA_ARGS__)
+#define vrgather_vv_f64m4(...) __riscv_vrgather_vv_f64m4(__VA_ARGS__)
+#define vrgather_vx_f64m4(...) __riscv_vrgather_vx_f64m4(__VA_ARGS__)
+#define vrgather_vv_f64m8(...) __riscv_vrgather_vv_f64m8(__VA_ARGS__)
+#define vrgather_vx_f64m8(...) __riscv_vrgather_vx_f64m8(__VA_ARGS__)
+#define vrgatherei16_vv_f16mf4(...) __riscv_vrgatherei16_vv_f16mf4(__VA_ARGS__)
+#define vrgatherei16_vv_f16mf2(...) __riscv_vrgatherei16_vv_f16mf2(__VA_ARGS__)
+#define vrgatherei16_vv_f16m1(...) __riscv_vrgatherei16_vv_f16m1(__VA_ARGS__)
+#define vrgatherei16_vv_f16m2(...) __riscv_vrgatherei16_vv_f16m2(__VA_ARGS__)
+#define vrgatherei16_vv_f16m4(...) __riscv_vrgatherei16_vv_f16m4(__VA_ARGS__)
+#define vrgatherei16_vv_f16m8(...) __riscv_vrgatherei16_vv_f16m8(__VA_ARGS__)
+#define vrgatherei16_vv_f32mf2(...) __riscv_vrgatherei16_vv_f32mf2(__VA_ARGS__)
+#define vrgatherei16_vv_f32m1(...) __riscv_vrgatherei16_vv_f32m1(__VA_ARGS__)
+#define vrgatherei16_vv_f32m2(...) __riscv_vrgatherei16_vv_f32m2(__VA_ARGS__)
+#define vrgatherei16_vv_f32m4(...) __riscv_vrgatherei16_vv_f32m4(__VA_ARGS__)
+#define vrgatherei16_vv_f32m8(...) __riscv_vrgatherei16_vv_f32m8(__VA_ARGS__)
+#define vrgatherei16_vv_f64m1(...) __riscv_vrgatherei16_vv_f64m1(__VA_ARGS__)
+#define vrgatherei16_vv_f64m2(...) __riscv_vrgatherei16_vv_f64m2(__VA_ARGS__)
+#define vrgatherei16_vv_f64m4(...) __riscv_vrgatherei16_vv_f64m4(__VA_ARGS__)
+#define vrgatherei16_vv_f64m8(...) __riscv_vrgatherei16_vv_f64m8(__VA_ARGS__)
+#define vrgather_vv_i8mf8(...) __riscv_vrgather_vv_i8mf8(__VA_ARGS__)
+#define vrgather_vx_i8mf8(...) __riscv_vrgather_vx_i8mf8(__VA_ARGS__)
+#define vrgather_vv_i8mf4(...) __riscv_vrgather_vv_i8mf4(__VA_ARGS__)
+#define vrgather_vx_i8mf4(...) __riscv_vrgather_vx_i8mf4(__VA_ARGS__)
+#define vrgather_vv_i8mf2(...) __riscv_vrgather_vv_i8mf2(__VA_ARGS__)
+#define vrgather_vx_i8mf2(...) __riscv_vrgather_vx_i8mf2(__VA_ARGS__)
+#define vrgather_vv_i8m1(...) __riscv_vrgather_vv_i8m1(__VA_ARGS__)
+#define vrgather_vx_i8m1(...) __riscv_vrgather_vx_i8m1(__VA_ARGS__)
+#define vrgather_vv_i8m2(...) __riscv_vrgather_vv_i8m2(__VA_ARGS__)
+#define vrgather_vx_i8m2(...) __riscv_vrgather_vx_i8m2(__VA_ARGS__)
+#define vrgather_vv_i8m4(...) __riscv_vrgather_vv_i8m4(__VA_ARGS__)
+#define vrgather_vx_i8m4(...) __riscv_vrgather_vx_i8m4(__VA_ARGS__)
+#define vrgather_vv_i8m8(...) __riscv_vrgather_vv_i8m8(__VA_ARGS__)
+#define vrgather_vx_i8m8(...) __riscv_vrgather_vx_i8m8(__VA_ARGS__)
+#define vrgather_vv_i16mf4(...) __riscv_vrgather_vv_i16mf4(__VA_ARGS__)
+#define vrgather_vx_i16mf4(...) __riscv_vrgather_vx_i16mf4(__VA_ARGS__)
+#define vrgather_vv_i16mf2(...) __riscv_vrgather_vv_i16mf2(__VA_ARGS__)
+#define vrgather_vx_i16mf2(...) __riscv_vrgather_vx_i16mf2(__VA_ARGS__)
+#define vrgather_vv_i16m1(...) __riscv_vrgather_vv_i16m1(__VA_ARGS__)
+#define vrgather_vx_i16m1(...) __riscv_vrgather_vx_i16m1(__VA_ARGS__)
+#define vrgather_vv_i16m2(...) __riscv_vrgather_vv_i16m2(__VA_ARGS__)
+#define vrgather_vx_i16m2(...) __riscv_vrgather_vx_i16m2(__VA_ARGS__)
+#define vrgather_vv_i16m4(...) __riscv_vrgather_vv_i16m4(__VA_ARGS__)
+#define vrgather_vx_i16m4(...) __riscv_vrgather_vx_i16m4(__VA_ARGS__)
+#define vrgather_vv_i16m8(...) __riscv_vrgather_vv_i16m8(__VA_ARGS__)
+#define vrgather_vx_i16m8(...) __riscv_vrgather_vx_i16m8(__VA_ARGS__)
+#define vrgather_vv_i32mf2(...) __riscv_vrgather_vv_i32mf2(__VA_ARGS__)
+#define vrgather_vx_i32mf2(...) __riscv_vrgather_vx_i32mf2(__VA_ARGS__)
+#define vrgather_vv_i32m1(...) __riscv_vrgather_vv_i32m1(__VA_ARGS__)
+#define vrgather_vx_i32m1(...) __riscv_vrgather_vx_i32m1(__VA_ARGS__)
+#define vrgather_vv_i32m2(...) __riscv_vrgather_vv_i32m2(__VA_ARGS__)
+#define vrgather_vx_i32m2(...) __riscv_vrgather_vx_i32m2(__VA_ARGS__)
+#define vrgather_vv_i32m4(...) __riscv_vrgather_vv_i32m4(__VA_ARGS__)
+#define vrgather_vx_i32m4(...) __riscv_vrgather_vx_i32m4(__VA_ARGS__)
+#define vrgather_vv_i32m8(...) __riscv_vrgather_vv_i32m8(__VA_ARGS__)
+#define vrgather_vx_i32m8(...) __riscv_vrgather_vx_i32m8(__VA_ARGS__)
+#define vrgather_vv_i64m1(...) __riscv_vrgather_vv_i64m1(__VA_ARGS__)
+#define vrgather_vx_i64m1(...) __riscv_vrgather_vx_i64m1(__VA_ARGS__)
+#define vrgather_vv_i64m2(...) __riscv_vrgather_vv_i64m2(__VA_ARGS__)
+#define vrgather_vx_i64m2(...) __riscv_vrgather_vx_i64m2(__VA_ARGS__)
+#define vrgather_vv_i64m4(...) __riscv_vrgather_vv_i64m4(__VA_ARGS__)
+#define vrgather_vx_i64m4(...) __riscv_vrgather_vx_i64m4(__VA_ARGS__)
+#define vrgather_vv_i64m8(...) __riscv_vrgather_vv_i64m8(__VA_ARGS__)
+#define vrgather_vx_i64m8(...) __riscv_vrgather_vx_i64m8(__VA_ARGS__)
+#define vrgatherei16_vv_i8mf8(...) __riscv_vrgatherei16_vv_i8mf8(__VA_ARGS__)
+#define vrgatherei16_vv_i8mf4(...) __riscv_vrgatherei16_vv_i8mf4(__VA_ARGS__)
+#define vrgatherei16_vv_i8mf2(...) __riscv_vrgatherei16_vv_i8mf2(__VA_ARGS__)
+#define vrgatherei16_vv_i8m1(...) __riscv_vrgatherei16_vv_i8m1(__VA_ARGS__)
+#define vrgatherei16_vv_i8m2(...) __riscv_vrgatherei16_vv_i8m2(__VA_ARGS__)
+#define vrgatherei16_vv_i8m4(...) __riscv_vrgatherei16_vv_i8m4(__VA_ARGS__)
+#define vrgatherei16_vv_i16mf4(...) __riscv_vrgatherei16_vv_i16mf4(__VA_ARGS__)
+#define vrgatherei16_vv_i16mf2(...) __riscv_vrgatherei16_vv_i16mf2(__VA_ARGS__)
+#define vrgatherei16_vv_i16m1(...) __riscv_vrgatherei16_vv_i16m1(__VA_ARGS__)
+#define vrgatherei16_vv_i16m2(...) __riscv_vrgatherei16_vv_i16m2(__VA_ARGS__)
+#define vrgatherei16_vv_i16m4(...) __riscv_vrgatherei16_vv_i16m4(__VA_ARGS__)
+#define vrgatherei16_vv_i16m8(...) __riscv_vrgatherei16_vv_i16m8(__VA_ARGS__)
+#define vrgatherei16_vv_i32mf2(...) __riscv_vrgatherei16_vv_i32mf2(__VA_ARGS__)
+#define vrgatherei16_vv_i32m1(...) __riscv_vrgatherei16_vv_i32m1(__VA_ARGS__)
+#define vrgatherei16_vv_i32m2(...) __riscv_vrgatherei16_vv_i32m2(__VA_ARGS__)
+#define vrgatherei16_vv_i32m4(...) __riscv_vrgatherei16_vv_i32m4(__VA_ARGS__)
+#define vrgatherei16_vv_i32m8(...) __riscv_vrgatherei16_vv_i32m8(__VA_ARGS__)
+#define vrgatherei16_vv_i64m1(...) __riscv_vrgatherei16_vv_i64m1(__VA_ARGS__)
+#define vrgatherei16_vv_i64m2(...) __riscv_vrgatherei16_vv_i64m2(__VA_ARGS__)
+#define vrgatherei16_vv_i64m4(...) __riscv_vrgatherei16_vv_i64m4(__VA_ARGS__)
+#define vrgatherei16_vv_i64m8(...) __riscv_vrgatherei16_vv_i64m8(__VA_ARGS__)
+#define vrgather_vv_u8mf8(...) __riscv_vrgather_vv_u8mf8(__VA_ARGS__)
+#define vrgather_vx_u8mf8(...) __riscv_vrgather_vx_u8mf8(__VA_ARGS__)
+#define vrgather_vv_u8mf4(...) __riscv_vrgather_vv_u8mf4(__VA_ARGS__)
+#define vrgather_vx_u8mf4(...) __riscv_vrgather_vx_u8mf4(__VA_ARGS__)
+#define vrgather_vv_u8mf2(...) __riscv_vrgather_vv_u8mf2(__VA_ARGS__)
+#define vrgather_vx_u8mf2(...) __riscv_vrgather_vx_u8mf2(__VA_ARGS__)
+#define vrgather_vv_u8m1(...) __riscv_vrgather_vv_u8m1(__VA_ARGS__)
+#define vrgather_vx_u8m1(...) __riscv_vrgather_vx_u8m1(__VA_ARGS__)
+#define vrgather_vv_u8m2(...) __riscv_vrgather_vv_u8m2(__VA_ARGS__)
+#define vrgather_vx_u8m2(...) __riscv_vrgather_vx_u8m2(__VA_ARGS__)
+#define vrgather_vv_u8m4(...) __riscv_vrgather_vv_u8m4(__VA_ARGS__)
+#define vrgather_vx_u8m4(...) __riscv_vrgather_vx_u8m4(__VA_ARGS__)
+#define vrgather_vv_u8m8(...) __riscv_vrgather_vv_u8m8(__VA_ARGS__)
+#define vrgather_vx_u8m8(...) __riscv_vrgather_vx_u8m8(__VA_ARGS__)
+#define vrgather_vv_u16mf4(...) __riscv_vrgather_vv_u16mf4(__VA_ARGS__)
+#define vrgather_vx_u16mf4(...) __riscv_vrgather_vx_u16mf4(__VA_ARGS__)
+#define vrgather_vv_u16mf2(...) __riscv_vrgather_vv_u16mf2(__VA_ARGS__)
+#define vrgather_vx_u16mf2(...) __riscv_vrgather_vx_u16mf2(__VA_ARGS__)
+#define vrgather_vv_u16m1(...) __riscv_vrgather_vv_u16m1(__VA_ARGS__)
+#define vrgather_vx_u16m1(...) __riscv_vrgather_vx_u16m1(__VA_ARGS__)
+#define vrgather_vv_u16m2(...) __riscv_vrgather_vv_u16m2(__VA_ARGS__)
+#define vrgather_vx_u16m2(...) __riscv_vrgather_vx_u16m2(__VA_ARGS__)
+#define vrgather_vv_u16m4(...) __riscv_vrgather_vv_u16m4(__VA_ARGS__)
+#define vrgather_vx_u16m4(...) __riscv_vrgather_vx_u16m4(__VA_ARGS__)
+#define vrgather_vv_u16m8(...) __riscv_vrgather_vv_u16m8(__VA_ARGS__)
+#define vrgather_vx_u16m8(...) __riscv_vrgather_vx_u16m8(__VA_ARGS__)
+#define vrgather_vv_u32mf2(...) __riscv_vrgather_vv_u32mf2(__VA_ARGS__)
+#define vrgather_vx_u32mf2(...) __riscv_vrgather_vx_u32mf2(__VA_ARGS__)
+#define vrgather_vv_u32m1(...) __riscv_vrgather_vv_u32m1(__VA_ARGS__)
+#define vrgather_vx_u32m1(...) __riscv_vrgather_vx_u32m1(__VA_ARGS__)
+#define vrgather_vv_u32m2(...) __riscv_vrgather_vv_u32m2(__VA_ARGS__)
+#define vrgather_vx_u32m2(...) __riscv_vrgather_vx_u32m2(__VA_ARGS__)
+#define vrgather_vv_u32m4(...) __riscv_vrgather_vv_u32m4(__VA_ARGS__)
+#define vrgather_vx_u32m4(...) __riscv_vrgather_vx_u32m4(__VA_ARGS__)
+#define vrgather_vv_u32m8(...) __riscv_vrgather_vv_u32m8(__VA_ARGS__)
+#define vrgather_vx_u32m8(...) __riscv_vrgather_vx_u32m8(__VA_ARGS__)
+#define vrgather_vv_u64m1(...) __riscv_vrgather_vv_u64m1(__VA_ARGS__)
+#define vrgather_vx_u64m1(...) __riscv_vrgather_vx_u64m1(__VA_ARGS__)
+#define vrgather_vv_u64m2(...) __riscv_vrgather_vv_u64m2(__VA_ARGS__)
+#define vrgather_vx_u64m2(...) __riscv_vrgather_vx_u64m2(__VA_ARGS__)
+#define vrgather_vv_u64m4(...) __riscv_vrgather_vv_u64m4(__VA_ARGS__)
+#define vrgather_vx_u64m4(...) __riscv_vrgather_vx_u64m4(__VA_ARGS__)
+#define vrgather_vv_u64m8(...) __riscv_vrgather_vv_u64m8(__VA_ARGS__)
+#define vrgather_vx_u64m8(...) __riscv_vrgather_vx_u64m8(__VA_ARGS__)
+#define vrgatherei16_vv_u8mf8(...) __riscv_vrgatherei16_vv_u8mf8(__VA_ARGS__)
+#define vrgatherei16_vv_u8mf4(...) __riscv_vrgatherei16_vv_u8mf4(__VA_ARGS__)
+#define vrgatherei16_vv_u8mf2(...) __riscv_vrgatherei16_vv_u8mf2(__VA_ARGS__)
+#define vrgatherei16_vv_u8m1(...) __riscv_vrgatherei16_vv_u8m1(__VA_ARGS__)
+#define vrgatherei16_vv_u8m2(...) __riscv_vrgatherei16_vv_u8m2(__VA_ARGS__)
+#define vrgatherei16_vv_u8m4(...) __riscv_vrgatherei16_vv_u8m4(__VA_ARGS__)
+#define vrgatherei16_vv_u16mf4(...) __riscv_vrgatherei16_vv_u16mf4(__VA_ARGS__)
+#define vrgatherei16_vv_u16mf2(...) __riscv_vrgatherei16_vv_u16mf2(__VA_ARGS__)
+#define vrgatherei16_vv_u16m1(...) __riscv_vrgatherei16_vv_u16m1(__VA_ARGS__)
+#define vrgatherei16_vv_u16m2(...) __riscv_vrgatherei16_vv_u16m2(__VA_ARGS__)
+#define vrgatherei16_vv_u16m4(...) __riscv_vrgatherei16_vv_u16m4(__VA_ARGS__)
+#define vrgatherei16_vv_u16m8(...) __riscv_vrgatherei16_vv_u16m8(__VA_ARGS__)
+#define vrgatherei16_vv_u32mf2(...) __riscv_vrgatherei16_vv_u32mf2(__VA_ARGS__)
+#define vrgatherei16_vv_u32m1(...) __riscv_vrgatherei16_vv_u32m1(__VA_ARGS__)
+#define vrgatherei16_vv_u32m2(...) __riscv_vrgatherei16_vv_u32m2(__VA_ARGS__)
+#define vrgatherei16_vv_u32m4(...) __riscv_vrgatherei16_vv_u32m4(__VA_ARGS__)
+#define vrgatherei16_vv_u32m8(...) __riscv_vrgatherei16_vv_u32m8(__VA_ARGS__)
+#define vrgatherei16_vv_u64m1(...) __riscv_vrgatherei16_vv_u64m1(__VA_ARGS__)
+#define vrgatherei16_vv_u64m2(...) __riscv_vrgatherei16_vv_u64m2(__VA_ARGS__)
+#define vrgatherei16_vv_u64m4(...) __riscv_vrgatherei16_vv_u64m4(__VA_ARGS__)
+#define vrgatherei16_vv_u64m8(...) __riscv_vrgatherei16_vv_u64m8(__VA_ARGS__)
+// masked functions: each legacy *_m name forwards to the matching __riscv_*_tumu intrinsic
+#define vrgather_vv_f16mf4_m(...) __riscv_vrgather_vv_f16mf4_tumu(__VA_ARGS__)
+#define vrgather_vx_f16mf4_m(...) __riscv_vrgather_vx_f16mf4_tumu(__VA_ARGS__)
+#define vrgather_vv_f16mf2_m(...) __riscv_vrgather_vv_f16mf2_tumu(__VA_ARGS__)
+#define vrgather_vx_f16mf2_m(...) __riscv_vrgather_vx_f16mf2_tumu(__VA_ARGS__)
+#define vrgather_vv_f16m1_m(...) __riscv_vrgather_vv_f16m1_tumu(__VA_ARGS__)
+#define vrgather_vx_f16m1_m(...) __riscv_vrgather_vx_f16m1_tumu(__VA_ARGS__)
+#define vrgather_vv_f16m2_m(...) __riscv_vrgather_vv_f16m2_tumu(__VA_ARGS__)
+#define vrgather_vx_f16m2_m(...) __riscv_vrgather_vx_f16m2_tumu(__VA_ARGS__)
+#define vrgather_vv_f16m4_m(...) __riscv_vrgather_vv_f16m4_tumu(__VA_ARGS__)
+#define vrgather_vx_f16m4_m(...) __riscv_vrgather_vx_f16m4_tumu(__VA_ARGS__)
+#define vrgather_vv_f16m8_m(...) __riscv_vrgather_vv_f16m8_tumu(__VA_ARGS__)
+#define vrgather_vx_f16m8_m(...) __riscv_vrgather_vx_f16m8_tumu(__VA_ARGS__)
+#define vrgather_vv_f32mf2_m(...) __riscv_vrgather_vv_f32mf2_tumu(__VA_ARGS__)
+#define vrgather_vx_f32mf2_m(...) __riscv_vrgather_vx_f32mf2_tumu(__VA_ARGS__)
+#define vrgather_vv_f32m1_m(...) __riscv_vrgather_vv_f32m1_tumu(__VA_ARGS__)
+#define vrgather_vx_f32m1_m(...) __riscv_vrgather_vx_f32m1_tumu(__VA_ARGS__)
+#define vrgather_vv_f32m2_m(...) __riscv_vrgather_vv_f32m2_tumu(__VA_ARGS__)
+#define vrgather_vx_f32m2_m(...) __riscv_vrgather_vx_f32m2_tumu(__VA_ARGS__)
+#define vrgather_vv_f32m4_m(...) __riscv_vrgather_vv_f32m4_tumu(__VA_ARGS__)
+#define vrgather_vx_f32m4_m(...) __riscv_vrgather_vx_f32m4_tumu(__VA_ARGS__)
+#define vrgather_vv_f32m8_m(...) __riscv_vrgather_vv_f32m8_tumu(__VA_ARGS__)
+#define vrgather_vx_f32m8_m(...) __riscv_vrgather_vx_f32m8_tumu(__VA_ARGS__)
+#define vrgather_vv_f64m1_m(...) __riscv_vrgather_vv_f64m1_tumu(__VA_ARGS__)
+#define vrgather_vx_f64m1_m(...) __riscv_vrgather_vx_f64m1_tumu(__VA_ARGS__)
+#define vrgather_vv_f64m2_m(...) __riscv_vrgather_vv_f64m2_tumu(__VA_ARGS__)
+#define vrgather_vx_f64m2_m(...) __riscv_vrgather_vx_f64m2_tumu(__VA_ARGS__)
+#define vrgather_vv_f64m4_m(...) __riscv_vrgather_vv_f64m4_tumu(__VA_ARGS__)
+#define vrgather_vx_f64m4_m(...) __riscv_vrgather_vx_f64m4_tumu(__VA_ARGS__)
+#define vrgather_vv_f64m8_m(...) __riscv_vrgather_vv_f64m8_tumu(__VA_ARGS__)
+#define vrgather_vx_f64m8_m(...) __riscv_vrgather_vx_f64m8_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_f16mf4_m(...) __riscv_vrgatherei16_vv_f16mf4_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_f16mf2_m(...) __riscv_vrgatherei16_vv_f16mf2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_f16m1_m(...) __riscv_vrgatherei16_vv_f16m1_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_f16m2_m(...) __riscv_vrgatherei16_vv_f16m2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_f16m4_m(...) __riscv_vrgatherei16_vv_f16m4_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_f16m8_m(...) __riscv_vrgatherei16_vv_f16m8_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_f32mf2_m(...) __riscv_vrgatherei16_vv_f32mf2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_f32m1_m(...) __riscv_vrgatherei16_vv_f32m1_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_f32m2_m(...) __riscv_vrgatherei16_vv_f32m2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_f32m4_m(...) __riscv_vrgatherei16_vv_f32m4_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_f32m8_m(...) __riscv_vrgatherei16_vv_f32m8_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_f64m1_m(...) __riscv_vrgatherei16_vv_f64m1_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_f64m2_m(...) __riscv_vrgatherei16_vv_f64m2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_f64m4_m(...) __riscv_vrgatherei16_vv_f64m4_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_f64m8_m(...) __riscv_vrgatherei16_vv_f64m8_tumu(__VA_ARGS__)
+#define vrgather_vv_i8mf8_m(...) __riscv_vrgather_vv_i8mf8_tumu(__VA_ARGS__)
+#define vrgather_vx_i8mf8_m(...) __riscv_vrgather_vx_i8mf8_tumu(__VA_ARGS__)
+#define vrgather_vv_i8mf4_m(...) __riscv_vrgather_vv_i8mf4_tumu(__VA_ARGS__)
+#define vrgather_vx_i8mf4_m(...) __riscv_vrgather_vx_i8mf4_tumu(__VA_ARGS__)
+#define vrgather_vv_i8mf2_m(...) __riscv_vrgather_vv_i8mf2_tumu(__VA_ARGS__)
+#define vrgather_vx_i8mf2_m(...) __riscv_vrgather_vx_i8mf2_tumu(__VA_ARGS__)
+#define vrgather_vv_i8m1_m(...) __riscv_vrgather_vv_i8m1_tumu(__VA_ARGS__)
+#define vrgather_vx_i8m1_m(...) __riscv_vrgather_vx_i8m1_tumu(__VA_ARGS__)
+#define vrgather_vv_i8m2_m(...) __riscv_vrgather_vv_i8m2_tumu(__VA_ARGS__)
+#define vrgather_vx_i8m2_m(...) __riscv_vrgather_vx_i8m2_tumu(__VA_ARGS__)
+#define vrgather_vv_i8m4_m(...) __riscv_vrgather_vv_i8m4_tumu(__VA_ARGS__)
+#define vrgather_vx_i8m4_m(...) __riscv_vrgather_vx_i8m4_tumu(__VA_ARGS__)
+#define vrgather_vv_i8m8_m(...) __riscv_vrgather_vv_i8m8_tumu(__VA_ARGS__)
+#define vrgather_vx_i8m8_m(...) __riscv_vrgather_vx_i8m8_tumu(__VA_ARGS__)
+#define vrgather_vv_i16mf4_m(...) __riscv_vrgather_vv_i16mf4_tumu(__VA_ARGS__)
+#define vrgather_vx_i16mf4_m(...) __riscv_vrgather_vx_i16mf4_tumu(__VA_ARGS__)
+#define vrgather_vv_i16mf2_m(...) __riscv_vrgather_vv_i16mf2_tumu(__VA_ARGS__)
+#define vrgather_vx_i16mf2_m(...) __riscv_vrgather_vx_i16mf2_tumu(__VA_ARGS__)
+#define vrgather_vv_i16m1_m(...) __riscv_vrgather_vv_i16m1_tumu(__VA_ARGS__)
+#define vrgather_vx_i16m1_m(...) __riscv_vrgather_vx_i16m1_tumu(__VA_ARGS__)
+#define vrgather_vv_i16m2_m(...) __riscv_vrgather_vv_i16m2_tumu(__VA_ARGS__)
+#define vrgather_vx_i16m2_m(...) __riscv_vrgather_vx_i16m2_tumu(__VA_ARGS__)
+#define vrgather_vv_i16m4_m(...) __riscv_vrgather_vv_i16m4_tumu(__VA_ARGS__)
+#define vrgather_vx_i16m4_m(...) __riscv_vrgather_vx_i16m4_tumu(__VA_ARGS__)
+#define vrgather_vv_i16m8_m(...) __riscv_vrgather_vv_i16m8_tumu(__VA_ARGS__)
+#define vrgather_vx_i16m8_m(...) __riscv_vrgather_vx_i16m8_tumu(__VA_ARGS__)
+#define vrgather_vv_i32mf2_m(...) __riscv_vrgather_vv_i32mf2_tumu(__VA_ARGS__)
+#define vrgather_vx_i32mf2_m(...) __riscv_vrgather_vx_i32mf2_tumu(__VA_ARGS__)
+#define vrgather_vv_i32m1_m(...) __riscv_vrgather_vv_i32m1_tumu(__VA_ARGS__)
+#define vrgather_vx_i32m1_m(...) __riscv_vrgather_vx_i32m1_tumu(__VA_ARGS__)
+#define vrgather_vv_i32m2_m(...) __riscv_vrgather_vv_i32m2_tumu(__VA_ARGS__)
+#define vrgather_vx_i32m2_m(...) __riscv_vrgather_vx_i32m2_tumu(__VA_ARGS__)
+#define vrgather_vv_i32m4_m(...) __riscv_vrgather_vv_i32m4_tumu(__VA_ARGS__)
+#define vrgather_vx_i32m4_m(...) __riscv_vrgather_vx_i32m4_tumu(__VA_ARGS__)
+#define vrgather_vv_i32m8_m(...) __riscv_vrgather_vv_i32m8_tumu(__VA_ARGS__)
+#define vrgather_vx_i32m8_m(...) __riscv_vrgather_vx_i32m8_tumu(__VA_ARGS__)
+#define vrgather_vv_i64m1_m(...) __riscv_vrgather_vv_i64m1_tumu(__VA_ARGS__)
+#define vrgather_vx_i64m1_m(...) __riscv_vrgather_vx_i64m1_tumu(__VA_ARGS__)
+#define vrgather_vv_i64m2_m(...) __riscv_vrgather_vv_i64m2_tumu(__VA_ARGS__)
+#define vrgather_vx_i64m2_m(...) __riscv_vrgather_vx_i64m2_tumu(__VA_ARGS__)
+#define vrgather_vv_i64m4_m(...) __riscv_vrgather_vv_i64m4_tumu(__VA_ARGS__)
+#define vrgather_vx_i64m4_m(...) __riscv_vrgather_vx_i64m4_tumu(__VA_ARGS__)
+#define vrgather_vv_i64m8_m(...) __riscv_vrgather_vv_i64m8_tumu(__VA_ARGS__)
+#define vrgather_vx_i64m8_m(...) __riscv_vrgather_vx_i64m8_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i8mf8_m(...) __riscv_vrgatherei16_vv_i8mf8_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i8mf4_m(...) __riscv_vrgatherei16_vv_i8mf4_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i8mf2_m(...) __riscv_vrgatherei16_vv_i8mf2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i8m1_m(...) __riscv_vrgatherei16_vv_i8m1_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i8m2_m(...) __riscv_vrgatherei16_vv_i8m2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i8m4_m(...) __riscv_vrgatherei16_vv_i8m4_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i16mf4_m(...) __riscv_vrgatherei16_vv_i16mf4_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i16mf2_m(...) __riscv_vrgatherei16_vv_i16mf2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i16m1_m(...) __riscv_vrgatherei16_vv_i16m1_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i16m2_m(...) __riscv_vrgatherei16_vv_i16m2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i16m4_m(...) __riscv_vrgatherei16_vv_i16m4_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i16m8_m(...) __riscv_vrgatherei16_vv_i16m8_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i32mf2_m(...) __riscv_vrgatherei16_vv_i32mf2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i32m1_m(...) __riscv_vrgatherei16_vv_i32m1_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i32m2_m(...) __riscv_vrgatherei16_vv_i32m2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i32m4_m(...) __riscv_vrgatherei16_vv_i32m4_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i32m8_m(...) __riscv_vrgatherei16_vv_i32m8_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i64m1_m(...) __riscv_vrgatherei16_vv_i64m1_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i64m2_m(...) __riscv_vrgatherei16_vv_i64m2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i64m4_m(...) __riscv_vrgatherei16_vv_i64m4_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_i64m8_m(...) __riscv_vrgatherei16_vv_i64m8_tumu(__VA_ARGS__)
+#define vrgather_vv_u8mf8_m(...) __riscv_vrgather_vv_u8mf8_tumu(__VA_ARGS__) // Masked vrgather (vv/vx), unsigned element types: each legacy *_m name forwards its arguments unchanged to the __riscv_-prefixed *_tumu intrinsic.
+#define vrgather_vx_u8mf8_m(...) __riscv_vrgather_vx_u8mf8_tumu(__VA_ARGS__)
+#define vrgather_vv_u8mf4_m(...) __riscv_vrgather_vv_u8mf4_tumu(__VA_ARGS__)
+#define vrgather_vx_u8mf4_m(...) __riscv_vrgather_vx_u8mf4_tumu(__VA_ARGS__)
+#define vrgather_vv_u8mf2_m(...) __riscv_vrgather_vv_u8mf2_tumu(__VA_ARGS__)
+#define vrgather_vx_u8mf2_m(...) __riscv_vrgather_vx_u8mf2_tumu(__VA_ARGS__)
+#define vrgather_vv_u8m1_m(...) __riscv_vrgather_vv_u8m1_tumu(__VA_ARGS__)
+#define vrgather_vx_u8m1_m(...) __riscv_vrgather_vx_u8m1_tumu(__VA_ARGS__)
+#define vrgather_vv_u8m2_m(...) __riscv_vrgather_vv_u8m2_tumu(__VA_ARGS__)
+#define vrgather_vx_u8m2_m(...) __riscv_vrgather_vx_u8m2_tumu(__VA_ARGS__)
+#define vrgather_vv_u8m4_m(...) __riscv_vrgather_vv_u8m4_tumu(__VA_ARGS__)
+#define vrgather_vx_u8m4_m(...) __riscv_vrgather_vx_u8m4_tumu(__VA_ARGS__)
+#define vrgather_vv_u8m8_m(...) __riscv_vrgather_vv_u8m8_tumu(__VA_ARGS__)
+#define vrgather_vx_u8m8_m(...) __riscv_vrgather_vx_u8m8_tumu(__VA_ARGS__)
+#define vrgather_vv_u16mf4_m(...) __riscv_vrgather_vv_u16mf4_tumu(__VA_ARGS__)
+#define vrgather_vx_u16mf4_m(...) __riscv_vrgather_vx_u16mf4_tumu(__VA_ARGS__)
+#define vrgather_vv_u16mf2_m(...) __riscv_vrgather_vv_u16mf2_tumu(__VA_ARGS__)
+#define vrgather_vx_u16mf2_m(...) __riscv_vrgather_vx_u16mf2_tumu(__VA_ARGS__)
+#define vrgather_vv_u16m1_m(...) __riscv_vrgather_vv_u16m1_tumu(__VA_ARGS__)
+#define vrgather_vx_u16m1_m(...) __riscv_vrgather_vx_u16m1_tumu(__VA_ARGS__)
+#define vrgather_vv_u16m2_m(...) __riscv_vrgather_vv_u16m2_tumu(__VA_ARGS__)
+#define vrgather_vx_u16m2_m(...) __riscv_vrgather_vx_u16m2_tumu(__VA_ARGS__)
+#define vrgather_vv_u16m4_m(...) __riscv_vrgather_vv_u16m4_tumu(__VA_ARGS__)
+#define vrgather_vx_u16m4_m(...) __riscv_vrgather_vx_u16m4_tumu(__VA_ARGS__)
+#define vrgather_vv_u16m8_m(...) __riscv_vrgather_vv_u16m8_tumu(__VA_ARGS__)
+#define vrgather_vx_u16m8_m(...) __riscv_vrgather_vx_u16m8_tumu(__VA_ARGS__)
+#define vrgather_vv_u32mf2_m(...) __riscv_vrgather_vv_u32mf2_tumu(__VA_ARGS__)
+#define vrgather_vx_u32mf2_m(...) __riscv_vrgather_vx_u32mf2_tumu(__VA_ARGS__)
+#define vrgather_vv_u32m1_m(...) __riscv_vrgather_vv_u32m1_tumu(__VA_ARGS__)
+#define vrgather_vx_u32m1_m(...) __riscv_vrgather_vx_u32m1_tumu(__VA_ARGS__)
+#define vrgather_vv_u32m2_m(...) __riscv_vrgather_vv_u32m2_tumu(__VA_ARGS__)
+#define vrgather_vx_u32m2_m(...) __riscv_vrgather_vx_u32m2_tumu(__VA_ARGS__)
+#define vrgather_vv_u32m4_m(...) __riscv_vrgather_vv_u32m4_tumu(__VA_ARGS__)
+#define vrgather_vx_u32m4_m(...) __riscv_vrgather_vx_u32m4_tumu(__VA_ARGS__)
+#define vrgather_vv_u32m8_m(...) __riscv_vrgather_vv_u32m8_tumu(__VA_ARGS__)
+#define vrgather_vx_u32m8_m(...) __riscv_vrgather_vx_u32m8_tumu(__VA_ARGS__)
+#define vrgather_vv_u64m1_m(...) __riscv_vrgather_vv_u64m1_tumu(__VA_ARGS__)
+#define vrgather_vx_u64m1_m(...) __riscv_vrgather_vx_u64m1_tumu(__VA_ARGS__)
+#define vrgather_vv_u64m2_m(...) __riscv_vrgather_vv_u64m2_tumu(__VA_ARGS__)
+#define vrgather_vx_u64m2_m(...) __riscv_vrgather_vx_u64m2_tumu(__VA_ARGS__)
+#define vrgather_vv_u64m4_m(...) __riscv_vrgather_vv_u64m4_tumu(__VA_ARGS__)
+#define vrgather_vx_u64m4_m(...) __riscv_vrgather_vx_u64m4_tumu(__VA_ARGS__)
+#define vrgather_vv_u64m8_m(...) __riscv_vrgather_vv_u64m8_tumu(__VA_ARGS__)
+#define vrgather_vx_u64m8_m(...) __riscv_vrgather_vx_u64m8_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u8mf8_m(...) __riscv_vrgatherei16_vv_u8mf8_tumu(__VA_ARGS__) // Masked vrgatherei16 (vv), unsigned element types: each legacy *_m name forwards its arguments unchanged to the __riscv_-prefixed *_tumu intrinsic.
+#define vrgatherei16_vv_u8mf4_m(...) __riscv_vrgatherei16_vv_u8mf4_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u8mf2_m(...) __riscv_vrgatherei16_vv_u8mf2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u8m1_m(...) __riscv_vrgatherei16_vv_u8m1_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u8m2_m(...) __riscv_vrgatherei16_vv_u8m2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u8m4_m(...) __riscv_vrgatherei16_vv_u8m4_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u16mf4_m(...) __riscv_vrgatherei16_vv_u16mf4_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u16mf2_m(...) __riscv_vrgatherei16_vv_u16mf2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u16m1_m(...) __riscv_vrgatherei16_vv_u16m1_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u16m2_m(...) __riscv_vrgatherei16_vv_u16m2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u16m4_m(...) __riscv_vrgatherei16_vv_u16m4_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u16m8_m(...) __riscv_vrgatherei16_vv_u16m8_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u32mf2_m(...) __riscv_vrgatherei16_vv_u32mf2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u32m1_m(...) __riscv_vrgatherei16_vv_u32m1_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u32m2_m(...) __riscv_vrgatherei16_vv_u32m2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u32m4_m(...) __riscv_vrgatherei16_vv_u32m4_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u32m8_m(...) __riscv_vrgatherei16_vv_u32m8_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u64m1_m(...) __riscv_vrgatherei16_vv_u64m1_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u64m2_m(...) __riscv_vrgatherei16_vv_u64m2_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u64m4_m(...) __riscv_vrgatherei16_vv_u64m4_tumu(__VA_ARGS__)
+#define vrgatherei16_vv_u64m8_m(...) __riscv_vrgatherei16_vv_u64m8_tumu(__VA_ARGS__)
+#define vcompress_vm_f16mf4(mask, dest, src, vl) __riscv_vcompress_vm_f16mf4_tu((dest), (src), (mask), (vl)) // vcompress: the legacy argument order (mask, dest, src, vl) is rearranged to (dest, src, mask, vl) for the __riscv_-prefixed *_tu intrinsic.
+#define vcompress_vm_f16mf2(mask, dest, src, vl) __riscv_vcompress_vm_f16mf2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_f16m1(mask, dest, src, vl) __riscv_vcompress_vm_f16m1_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_f16m2(mask, dest, src, vl) __riscv_vcompress_vm_f16m2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_f16m4(mask, dest, src, vl) __riscv_vcompress_vm_f16m4_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_f16m8(mask, dest, src, vl) __riscv_vcompress_vm_f16m8_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_f32mf2(mask, dest, src, vl) __riscv_vcompress_vm_f32mf2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_f32m1(mask, dest, src, vl) __riscv_vcompress_vm_f32m1_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_f32m2(mask, dest, src, vl) __riscv_vcompress_vm_f32m2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_f32m4(mask, dest, src, vl) __riscv_vcompress_vm_f32m4_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_f32m8(mask, dest, src, vl) __riscv_vcompress_vm_f32m8_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_f64m1(mask, dest, src, vl) __riscv_vcompress_vm_f64m1_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_f64m2(mask, dest, src, vl) __riscv_vcompress_vm_f64m2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_f64m4(mask, dest, src, vl) __riscv_vcompress_vm_f64m4_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_f64m8(mask, dest, src, vl) __riscv_vcompress_vm_f64m8_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i8mf8(mask, dest, src, vl) __riscv_vcompress_vm_i8mf8_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i8mf4(mask, dest, src, vl) __riscv_vcompress_vm_i8mf4_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i8mf2(mask, dest, src, vl) __riscv_vcompress_vm_i8mf2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i8m1(mask, dest, src, vl) __riscv_vcompress_vm_i8m1_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i8m2(mask, dest, src, vl) __riscv_vcompress_vm_i8m2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i8m4(mask, dest, src, vl) __riscv_vcompress_vm_i8m4_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i8m8(mask, dest, src, vl) __riscv_vcompress_vm_i8m8_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i16mf4(mask, dest, src, vl) __riscv_vcompress_vm_i16mf4_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i16mf2(mask, dest, src, vl) __riscv_vcompress_vm_i16mf2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i16m1(mask, dest, src, vl) __riscv_vcompress_vm_i16m1_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i16m2(mask, dest, src, vl) __riscv_vcompress_vm_i16m2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i16m4(mask, dest, src, vl) __riscv_vcompress_vm_i16m4_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i16m8(mask, dest, src, vl) __riscv_vcompress_vm_i16m8_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i32mf2(mask, dest, src, vl) __riscv_vcompress_vm_i32mf2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i32m1(mask, dest, src, vl) __riscv_vcompress_vm_i32m1_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i32m2(mask, dest, src, vl) __riscv_vcompress_vm_i32m2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i32m4(mask, dest, src, vl) __riscv_vcompress_vm_i32m4_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i32m8(mask, dest, src, vl) __riscv_vcompress_vm_i32m8_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i64m1(mask, dest, src, vl) __riscv_vcompress_vm_i64m1_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i64m2(mask, dest, src, vl) __riscv_vcompress_vm_i64m2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i64m4(mask, dest, src, vl) __riscv_vcompress_vm_i64m4_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_i64m8(mask, dest, src, vl) __riscv_vcompress_vm_i64m8_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u8mf8(mask, dest, src, vl) __riscv_vcompress_vm_u8mf8_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u8mf4(mask, dest, src, vl) __riscv_vcompress_vm_u8mf4_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u8mf2(mask, dest, src, vl) __riscv_vcompress_vm_u8mf2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u8m1(mask, dest, src, vl) __riscv_vcompress_vm_u8m1_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u8m2(mask, dest, src, vl) __riscv_vcompress_vm_u8m2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u8m4(mask, dest, src, vl) __riscv_vcompress_vm_u8m4_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u8m8(mask, dest, src, vl) __riscv_vcompress_vm_u8m8_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u16mf4(mask, dest, src, vl) __riscv_vcompress_vm_u16mf4_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u16mf2(mask, dest, src, vl) __riscv_vcompress_vm_u16mf2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u16m1(mask, dest, src, vl) __riscv_vcompress_vm_u16m1_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u16m2(mask, dest, src, vl) __riscv_vcompress_vm_u16m2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u16m4(mask, dest, src, vl) __riscv_vcompress_vm_u16m4_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u16m8(mask, dest, src, vl) __riscv_vcompress_vm_u16m8_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u32mf2(mask, dest, src, vl) __riscv_vcompress_vm_u32mf2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u32m1(mask, dest, src, vl) __riscv_vcompress_vm_u32m1_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u32m2(mask, dest, src, vl) __riscv_vcompress_vm_u32m2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u32m4(mask, dest, src, vl) __riscv_vcompress_vm_u32m4_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u32m8(mask, dest, src, vl) __riscv_vcompress_vm_u32m8_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u64m1(mask, dest, src, vl) __riscv_vcompress_vm_u64m1_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u64m2(mask, dest, src, vl) __riscv_vcompress_vm_u64m2_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u64m4(mask, dest, src, vl) __riscv_vcompress_vm_u64m4_tu((dest), (src), (mask), (vl))
+#define vcompress_vm_u64m8(mask, dest, src, vl) __riscv_vcompress_vm_u64m8_tu((dest), (src), (mask), (vl))
+// Reinterpret between different types (signed int / unsigned int / float) under the same SEW/LMUL; each legacy name forwards its arguments unchanged to the __riscv_-prefixed intrinsic of the same name.
+#define vreinterpret_v_i8mf8_u8mf8(...) __riscv_vreinterpret_v_i8mf8_u8mf8(__VA_ARGS__)
+#define vreinterpret_v_i8mf4_u8mf4(...) __riscv_vreinterpret_v_i8mf4_u8mf4(__VA_ARGS__)
+#define vreinterpret_v_i8mf2_u8mf2(...) __riscv_vreinterpret_v_i8mf2_u8mf2(__VA_ARGS__)
+#define vreinterpret_v_i8m1_u8m1(...) __riscv_vreinterpret_v_i8m1_u8m1(__VA_ARGS__)
+#define vreinterpret_v_i8m2_u8m2(...) __riscv_vreinterpret_v_i8m2_u8m2(__VA_ARGS__)
+#define vreinterpret_v_i8m4_u8m4(...) __riscv_vreinterpret_v_i8m4_u8m4(__VA_ARGS__)
+#define vreinterpret_v_i8m8_u8m8(...) __riscv_vreinterpret_v_i8m8_u8m8(__VA_ARGS__)
+#define vreinterpret_v_u8mf8_i8mf8(...) __riscv_vreinterpret_v_u8mf8_i8mf8(__VA_ARGS__)
+#define vreinterpret_v_u8mf4_i8mf4(...) __riscv_vreinterpret_v_u8mf4_i8mf4(__VA_ARGS__)
+#define vreinterpret_v_u8mf2_i8mf2(...) __riscv_vreinterpret_v_u8mf2_i8mf2(__VA_ARGS__)
+#define vreinterpret_v_u8m1_i8m1(...) __riscv_vreinterpret_v_u8m1_i8m1(__VA_ARGS__)
+#define vreinterpret_v_u8m2_i8m2(...) __riscv_vreinterpret_v_u8m2_i8m2(__VA_ARGS__)
+#define vreinterpret_v_u8m4_i8m4(...) __riscv_vreinterpret_v_u8m4_i8m4(__VA_ARGS__)
+#define vreinterpret_v_u8m8_i8m8(...) __riscv_vreinterpret_v_u8m8_i8m8(__VA_ARGS__)
+#define vreinterpret_v_i16mf4_f16mf4(...) __riscv_vreinterpret_v_i16mf4_f16mf4(__VA_ARGS__)
+#define vreinterpret_v_i16mf2_f16mf2(...) __riscv_vreinterpret_v_i16mf2_f16mf2(__VA_ARGS__)
+#define vreinterpret_v_i16m1_f16m1(...) __riscv_vreinterpret_v_i16m1_f16m1(__VA_ARGS__)
+#define vreinterpret_v_i16m2_f16m2(...) __riscv_vreinterpret_v_i16m2_f16m2(__VA_ARGS__)
+#define vreinterpret_v_i16m4_f16m4(...) __riscv_vreinterpret_v_i16m4_f16m4(__VA_ARGS__)
+#define vreinterpret_v_i16m8_f16m8(...) __riscv_vreinterpret_v_i16m8_f16m8(__VA_ARGS__)
+#define vreinterpret_v_u16mf4_f16mf4(...) __riscv_vreinterpret_v_u16mf4_f16mf4(__VA_ARGS__)
+#define vreinterpret_v_u16mf2_f16mf2(...) __riscv_vreinterpret_v_u16mf2_f16mf2(__VA_ARGS__)
+#define vreinterpret_v_u16m1_f16m1(...) __riscv_vreinterpret_v_u16m1_f16m1(__VA_ARGS__)
+#define vreinterpret_v_u16m2_f16m2(...) __riscv_vreinterpret_v_u16m2_f16m2(__VA_ARGS__)
+#define vreinterpret_v_u16m4_f16m4(...) __riscv_vreinterpret_v_u16m4_f16m4(__VA_ARGS__)
+#define vreinterpret_v_u16m8_f16m8(...) __riscv_vreinterpret_v_u16m8_f16m8(__VA_ARGS__)
+#define vreinterpret_v_i16mf4_u16mf4(...) __riscv_vreinterpret_v_i16mf4_u16mf4(__VA_ARGS__)
+#define vreinterpret_v_i16mf2_u16mf2(...) __riscv_vreinterpret_v_i16mf2_u16mf2(__VA_ARGS__)
+#define vreinterpret_v_i16m1_u16m1(...) __riscv_vreinterpret_v_i16m1_u16m1(__VA_ARGS__)
+#define vreinterpret_v_i16m2_u16m2(...) __riscv_vreinterpret_v_i16m2_u16m2(__VA_ARGS__)
+#define vreinterpret_v_i16m4_u16m4(...) __riscv_vreinterpret_v_i16m4_u16m4(__VA_ARGS__)
+#define vreinterpret_v_i16m8_u16m8(...) __riscv_vreinterpret_v_i16m8_u16m8(__VA_ARGS__)
+#define vreinterpret_v_u16mf4_i16mf4(...) __riscv_vreinterpret_v_u16mf4_i16mf4(__VA_ARGS__)
+#define vreinterpret_v_u16mf2_i16mf2(...) __riscv_vreinterpret_v_u16mf2_i16mf2(__VA_ARGS__)
+#define vreinterpret_v_u16m1_i16m1(...) __riscv_vreinterpret_v_u16m1_i16m1(__VA_ARGS__)
+#define vreinterpret_v_u16m2_i16m2(...) __riscv_vreinterpret_v_u16m2_i16m2(__VA_ARGS__)
+#define vreinterpret_v_u16m4_i16m4(...) __riscv_vreinterpret_v_u16m4_i16m4(__VA_ARGS__)
+#define vreinterpret_v_u16m8_i16m8(...) __riscv_vreinterpret_v_u16m8_i16m8(__VA_ARGS__)
+#define vreinterpret_v_f16mf4_i16mf4(...) __riscv_vreinterpret_v_f16mf4_i16mf4(__VA_ARGS__)
+#define vreinterpret_v_f16mf2_i16mf2(...) __riscv_vreinterpret_v_f16mf2_i16mf2(__VA_ARGS__)
+#define vreinterpret_v_f16m1_i16m1(...) __riscv_vreinterpret_v_f16m1_i16m1(__VA_ARGS__)
+#define vreinterpret_v_f16m2_i16m2(...) __riscv_vreinterpret_v_f16m2_i16m2(__VA_ARGS__)
+#define vreinterpret_v_f16m4_i16m4(...) __riscv_vreinterpret_v_f16m4_i16m4(__VA_ARGS__)
+#define vreinterpret_v_f16m8_i16m8(...) __riscv_vreinterpret_v_f16m8_i16m8(__VA_ARGS__)
+#define vreinterpret_v_f16mf4_u16mf4(...) __riscv_vreinterpret_v_f16mf4_u16mf4(__VA_ARGS__)
+#define vreinterpret_v_f16mf2_u16mf2(...) __riscv_vreinterpret_v_f16mf2_u16mf2(__VA_ARGS__)
+#define vreinterpret_v_f16m1_u16m1(...) __riscv_vreinterpret_v_f16m1_u16m1(__VA_ARGS__)
+#define vreinterpret_v_f16m2_u16m2(...) __riscv_vreinterpret_v_f16m2_u16m2(__VA_ARGS__)
+#define vreinterpret_v_f16m4_u16m4(...) __riscv_vreinterpret_v_f16m4_u16m4(__VA_ARGS__)
+#define vreinterpret_v_f16m8_u16m8(...) __riscv_vreinterpret_v_f16m8_u16m8(__VA_ARGS__)
+#define vreinterpret_v_i32mf2_f32mf2(...) __riscv_vreinterpret_v_i32mf2_f32mf2(__VA_ARGS__)
+#define vreinterpret_v_i32m1_f32m1(...) __riscv_vreinterpret_v_i32m1_f32m1(__VA_ARGS__)
+#define vreinterpret_v_i32m2_f32m2(...) __riscv_vreinterpret_v_i32m2_f32m2(__VA_ARGS__)
+#define vreinterpret_v_i32m4_f32m4(...) __riscv_vreinterpret_v_i32m4_f32m4(__VA_ARGS__)
+#define vreinterpret_v_i32m8_f32m8(...) __riscv_vreinterpret_v_i32m8_f32m8(__VA_ARGS__)
+#define vreinterpret_v_u32mf2_f32mf2(...) __riscv_vreinterpret_v_u32mf2_f32mf2(__VA_ARGS__)
+#define vreinterpret_v_u32m1_f32m1(...) __riscv_vreinterpret_v_u32m1_f32m1(__VA_ARGS__)
+#define vreinterpret_v_u32m2_f32m2(...) __riscv_vreinterpret_v_u32m2_f32m2(__VA_ARGS__)
+#define vreinterpret_v_u32m4_f32m4(...) __riscv_vreinterpret_v_u32m4_f32m4(__VA_ARGS__)
+#define vreinterpret_v_u32m8_f32m8(...) __riscv_vreinterpret_v_u32m8_f32m8(__VA_ARGS__)
+#define vreinterpret_v_i32mf2_u32mf2(...) __riscv_vreinterpret_v_i32mf2_u32mf2(__VA_ARGS__)
+#define vreinterpret_v_i32m1_u32m1(...) __riscv_vreinterpret_v_i32m1_u32m1(__VA_ARGS__)
+#define vreinterpret_v_i32m2_u32m2(...) __riscv_vreinterpret_v_i32m2_u32m2(__VA_ARGS__)
+#define vreinterpret_v_i32m4_u32m4(...) __riscv_vreinterpret_v_i32m4_u32m4(__VA_ARGS__)
+#define vreinterpret_v_i32m8_u32m8(...) __riscv_vreinterpret_v_i32m8_u32m8(__VA_ARGS__)
+#define vreinterpret_v_u32mf2_i32mf2(...) __riscv_vreinterpret_v_u32mf2_i32mf2(__VA_ARGS__)
+#define vreinterpret_v_u32m1_i32m1(...) __riscv_vreinterpret_v_u32m1_i32m1(__VA_ARGS__)
+#define vreinterpret_v_u32m2_i32m2(...) __riscv_vreinterpret_v_u32m2_i32m2(__VA_ARGS__)
+#define vreinterpret_v_u32m4_i32m4(...) __riscv_vreinterpret_v_u32m4_i32m4(__VA_ARGS__)
+#define vreinterpret_v_u32m8_i32m8(...) __riscv_vreinterpret_v_u32m8_i32m8(__VA_ARGS__)
+#define vreinterpret_v_f32mf2_i32mf2(...) __riscv_vreinterpret_v_f32mf2_i32mf2(__VA_ARGS__)
+#define vreinterpret_v_f32m1_i32m1(...) __riscv_vreinterpret_v_f32m1_i32m1(__VA_ARGS__)
+#define vreinterpret_v_f32m2_i32m2(...) __riscv_vreinterpret_v_f32m2_i32m2(__VA_ARGS__)
+#define vreinterpret_v_f32m4_i32m4(...) __riscv_vreinterpret_v_f32m4_i32m4(__VA_ARGS__)
+#define vreinterpret_v_f32m8_i32m8(...) __riscv_vreinterpret_v_f32m8_i32m8(__VA_ARGS__)
+#define vreinterpret_v_f32mf2_u32mf2(...) __riscv_vreinterpret_v_f32mf2_u32mf2(__VA_ARGS__)
+#define vreinterpret_v_f32m1_u32m1(...) __riscv_vreinterpret_v_f32m1_u32m1(__VA_ARGS__)
+#define vreinterpret_v_f32m2_u32m2(...) __riscv_vreinterpret_v_f32m2_u32m2(__VA_ARGS__)
+#define vreinterpret_v_f32m4_u32m4(...) __riscv_vreinterpret_v_f32m4_u32m4(__VA_ARGS__)
+#define vreinterpret_v_f32m8_u32m8(...) __riscv_vreinterpret_v_f32m8_u32m8(__VA_ARGS__)
+#define vreinterpret_v_i64m1_f64m1(...) __riscv_vreinterpret_v_i64m1_f64m1(__VA_ARGS__)
+#define vreinterpret_v_i64m2_f64m2(...) __riscv_vreinterpret_v_i64m2_f64m2(__VA_ARGS__)
+#define vreinterpret_v_i64m4_f64m4(...) __riscv_vreinterpret_v_i64m4_f64m4(__VA_ARGS__)
+#define vreinterpret_v_i64m8_f64m8(...) __riscv_vreinterpret_v_i64m8_f64m8(__VA_ARGS__)
+#define vreinterpret_v_u64m1_f64m1(...) __riscv_vreinterpret_v_u64m1_f64m1(__VA_ARGS__)
+#define vreinterpret_v_u64m2_f64m2(...) __riscv_vreinterpret_v_u64m2_f64m2(__VA_ARGS__)
+#define vreinterpret_v_u64m4_f64m4(...) __riscv_vreinterpret_v_u64m4_f64m4(__VA_ARGS__)
+#define vreinterpret_v_u64m8_f64m8(...) __riscv_vreinterpret_v_u64m8_f64m8(__VA_ARGS__)
+#define vreinterpret_v_i64m1_u64m1(...) __riscv_vreinterpret_v_i64m1_u64m1(__VA_ARGS__)
+#define vreinterpret_v_i64m2_u64m2(...) __riscv_vreinterpret_v_i64m2_u64m2(__VA_ARGS__)
+#define vreinterpret_v_i64m4_u64m4(...) __riscv_vreinterpret_v_i64m4_u64m4(__VA_ARGS__)
+#define vreinterpret_v_i64m8_u64m8(...) __riscv_vreinterpret_v_i64m8_u64m8(__VA_ARGS__)
+#define vreinterpret_v_u64m1_i64m1(...) __riscv_vreinterpret_v_u64m1_i64m1(__VA_ARGS__)
+#define vreinterpret_v_u64m2_i64m2(...) __riscv_vreinterpret_v_u64m2_i64m2(__VA_ARGS__)
+#define vreinterpret_v_u64m4_i64m4(...) __riscv_vreinterpret_v_u64m4_i64m4(__VA_ARGS__)
+#define vreinterpret_v_u64m8_i64m8(...) __riscv_vreinterpret_v_u64m8_i64m8(__VA_ARGS__)
+#define vreinterpret_v_f64m1_i64m1(...) __riscv_vreinterpret_v_f64m1_i64m1(__VA_ARGS__)
+#define vreinterpret_v_f64m2_i64m2(...) __riscv_vreinterpret_v_f64m2_i64m2(__VA_ARGS__)
+#define vreinterpret_v_f64m4_i64m4(...) __riscv_vreinterpret_v_f64m4_i64m4(__VA_ARGS__)
+#define vreinterpret_v_f64m8_i64m8(...) __riscv_vreinterpret_v_f64m8_i64m8(__VA_ARGS__)
+#define vreinterpret_v_f64m1_u64m1(...) __riscv_vreinterpret_v_f64m1_u64m1(__VA_ARGS__)
+#define vreinterpret_v_f64m2_u64m2(...) __riscv_vreinterpret_v_f64m2_u64m2(__VA_ARGS__)
+#define vreinterpret_v_f64m4_u64m4(...) __riscv_vreinterpret_v_f64m4_u64m4(__VA_ARGS__)
+#define vreinterpret_v_f64m8_u64m8(...) __riscv_vreinterpret_v_f64m8_u64m8(__VA_ARGS__)
+// Reinterpret between different SEW under the same LMUL (integer types only; signedness is kept); each legacy name forwards its arguments unchanged to the __riscv_-prefixed intrinsic of the same name.
+#define vreinterpret_v_i8mf4_i16mf4(...) __riscv_vreinterpret_v_i8mf4_i16mf4(__VA_ARGS__)
+#define vreinterpret_v_i8mf2_i16mf2(...) __riscv_vreinterpret_v_i8mf2_i16mf2(__VA_ARGS__)
+#define vreinterpret_v_i8m1_i16m1(...) __riscv_vreinterpret_v_i8m1_i16m1(__VA_ARGS__)
+#define vreinterpret_v_i8m2_i16m2(...) __riscv_vreinterpret_v_i8m2_i16m2(__VA_ARGS__)
+#define vreinterpret_v_i8m4_i16m4(...) __riscv_vreinterpret_v_i8m4_i16m4(__VA_ARGS__)
+#define vreinterpret_v_i8m8_i16m8(...) __riscv_vreinterpret_v_i8m8_i16m8(__VA_ARGS__)
+#define vreinterpret_v_u8mf4_u16mf4(...) __riscv_vreinterpret_v_u8mf4_u16mf4(__VA_ARGS__)
+#define vreinterpret_v_u8mf2_u16mf2(...) __riscv_vreinterpret_v_u8mf2_u16mf2(__VA_ARGS__)
+#define vreinterpret_v_u8m1_u16m1(...) __riscv_vreinterpret_v_u8m1_u16m1(__VA_ARGS__)
+#define vreinterpret_v_u8m2_u16m2(...) __riscv_vreinterpret_v_u8m2_u16m2(__VA_ARGS__)
+#define vreinterpret_v_u8m4_u16m4(...) __riscv_vreinterpret_v_u8m4_u16m4(__VA_ARGS__)
+#define vreinterpret_v_u8m8_u16m8(...) __riscv_vreinterpret_v_u8m8_u16m8(__VA_ARGS__)
+#define vreinterpret_v_i8mf2_i32mf2(...) __riscv_vreinterpret_v_i8mf2_i32mf2(__VA_ARGS__)
+#define vreinterpret_v_i8m1_i32m1(...) __riscv_vreinterpret_v_i8m1_i32m1(__VA_ARGS__)
+#define vreinterpret_v_i8m2_i32m2(...) __riscv_vreinterpret_v_i8m2_i32m2(__VA_ARGS__)
+#define vreinterpret_v_i8m4_i32m4(...) __riscv_vreinterpret_v_i8m4_i32m4(__VA_ARGS__)
+#define vreinterpret_v_i8m8_i32m8(...) __riscv_vreinterpret_v_i8m8_i32m8(__VA_ARGS__)
+#define vreinterpret_v_u8mf2_u32mf2(...) __riscv_vreinterpret_v_u8mf2_u32mf2(__VA_ARGS__)
+#define vreinterpret_v_u8m1_u32m1(...) __riscv_vreinterpret_v_u8m1_u32m1(__VA_ARGS__)
+#define vreinterpret_v_u8m2_u32m2(...) __riscv_vreinterpret_v_u8m2_u32m2(__VA_ARGS__)
+#define vreinterpret_v_u8m4_u32m4(...) __riscv_vreinterpret_v_u8m4_u32m4(__VA_ARGS__)
+#define vreinterpret_v_u8m8_u32m8(...) __riscv_vreinterpret_v_u8m8_u32m8(__VA_ARGS__)
+#define vreinterpret_v_i8m1_i64m1(...) __riscv_vreinterpret_v_i8m1_i64m1(__VA_ARGS__)
+#define vreinterpret_v_i8m2_i64m2(...) __riscv_vreinterpret_v_i8m2_i64m2(__VA_ARGS__)
+#define vreinterpret_v_i8m4_i64m4(...) __riscv_vreinterpret_v_i8m4_i64m4(__VA_ARGS__)
+#define vreinterpret_v_i8m8_i64m8(...) __riscv_vreinterpret_v_i8m8_i64m8(__VA_ARGS__)
+#define vreinterpret_v_u8m1_u64m1(...) __riscv_vreinterpret_v_u8m1_u64m1(__VA_ARGS__)
+#define vreinterpret_v_u8m2_u64m2(...) __riscv_vreinterpret_v_u8m2_u64m2(__VA_ARGS__)
+#define vreinterpret_v_u8m4_u64m4(...) __riscv_vreinterpret_v_u8m4_u64m4(__VA_ARGS__)
+#define vreinterpret_v_u8m8_u64m8(...) __riscv_vreinterpret_v_u8m8_u64m8(__VA_ARGS__)
+#define vreinterpret_v_i16mf4_i8mf4(...) __riscv_vreinterpret_v_i16mf4_i8mf4(__VA_ARGS__)
+#define vreinterpret_v_i16mf2_i8mf2(...) __riscv_vreinterpret_v_i16mf2_i8mf2(__VA_ARGS__)
+#define vreinterpret_v_i16m1_i8m1(...) __riscv_vreinterpret_v_i16m1_i8m1(__VA_ARGS__)
+#define vreinterpret_v_i16m2_i8m2(...) __riscv_vreinterpret_v_i16m2_i8m2(__VA_ARGS__)
+#define vreinterpret_v_i16m4_i8m4(...) __riscv_vreinterpret_v_i16m4_i8m4(__VA_ARGS__)
+#define vreinterpret_v_i16m8_i8m8(...) __riscv_vreinterpret_v_i16m8_i8m8(__VA_ARGS__)
+#define vreinterpret_v_u16mf4_u8mf4(...) __riscv_vreinterpret_v_u16mf4_u8mf4(__VA_ARGS__)
+#define vreinterpret_v_u16mf2_u8mf2(...) __riscv_vreinterpret_v_u16mf2_u8mf2(__VA_ARGS__)
+#define vreinterpret_v_u16m1_u8m1(...) __riscv_vreinterpret_v_u16m1_u8m1(__VA_ARGS__)
+#define vreinterpret_v_u16m2_u8m2(...) __riscv_vreinterpret_v_u16m2_u8m2(__VA_ARGS__)
+#define vreinterpret_v_u16m4_u8m4(...) __riscv_vreinterpret_v_u16m4_u8m4(__VA_ARGS__)
+#define vreinterpret_v_u16m8_u8m8(...) __riscv_vreinterpret_v_u16m8_u8m8(__VA_ARGS__)
+#define vreinterpret_v_i16mf2_i32mf2(...) __riscv_vreinterpret_v_i16mf2_i32mf2(__VA_ARGS__)
+#define vreinterpret_v_i16m1_i32m1(...) __riscv_vreinterpret_v_i16m1_i32m1(__VA_ARGS__)
+#define vreinterpret_v_i16m2_i32m2(...) __riscv_vreinterpret_v_i16m2_i32m2(__VA_ARGS__)
+#define vreinterpret_v_i16m4_i32m4(...) __riscv_vreinterpret_v_i16m4_i32m4(__VA_ARGS__)
+#define vreinterpret_v_i16m8_i32m8(...) __riscv_vreinterpret_v_i16m8_i32m8(__VA_ARGS__)
+#define vreinterpret_v_u16mf2_u32mf2(...) __riscv_vreinterpret_v_u16mf2_u32mf2(__VA_ARGS__)
+#define vreinterpret_v_u16m1_u32m1(...) __riscv_vreinterpret_v_u16m1_u32m1(__VA_ARGS__)
+#define vreinterpret_v_u16m2_u32m2(...) __riscv_vreinterpret_v_u16m2_u32m2(__VA_ARGS__)
+#define vreinterpret_v_u16m4_u32m4(...) __riscv_vreinterpret_v_u16m4_u32m4(__VA_ARGS__)
+#define vreinterpret_v_u16m8_u32m8(...) __riscv_vreinterpret_v_u16m8_u32m8(__VA_ARGS__)
+#define vreinterpret_v_i16m1_i64m1(...) __riscv_vreinterpret_v_i16m1_i64m1(__VA_ARGS__)
+#define vreinterpret_v_i16m2_i64m2(...) __riscv_vreinterpret_v_i16m2_i64m2(__VA_ARGS__)
+#define vreinterpret_v_i16m4_i64m4(...) __riscv_vreinterpret_v_i16m4_i64m4(__VA_ARGS__)
+#define vreinterpret_v_i16m8_i64m8(...) __riscv_vreinterpret_v_i16m8_i64m8(__VA_ARGS__)
+#define vreinterpret_v_u16m1_u64m1(...) __riscv_vreinterpret_v_u16m1_u64m1(__VA_ARGS__)
+#define vreinterpret_v_u16m2_u64m2(...) __riscv_vreinterpret_v_u16m2_u64m2(__VA_ARGS__)
+#define vreinterpret_v_u16m4_u64m4(...) __riscv_vreinterpret_v_u16m4_u64m4(__VA_ARGS__)
+#define vreinterpret_v_u16m8_u64m8(...) __riscv_vreinterpret_v_u16m8_u64m8(__VA_ARGS__)
+#define vreinterpret_v_i32mf2_i8mf2(...) __riscv_vreinterpret_v_i32mf2_i8mf2(__VA_ARGS__)
+#define vreinterpret_v_i32m1_i8m1(...) __riscv_vreinterpret_v_i32m1_i8m1(__VA_ARGS__)
+#define vreinterpret_v_i32m2_i8m2(...) __riscv_vreinterpret_v_i32m2_i8m2(__VA_ARGS__)
+#define vreinterpret_v_i32m4_i8m4(...) __riscv_vreinterpret_v_i32m4_i8m4(__VA_ARGS__)
+#define vreinterpret_v_i32m8_i8m8(...) __riscv_vreinterpret_v_i32m8_i8m8(__VA_ARGS__)
+#define vreinterpret_v_u32mf2_u8mf2(...) __riscv_vreinterpret_v_u32mf2_u8mf2(__VA_ARGS__)
+#define vreinterpret_v_u32m1_u8m1(...) __riscv_vreinterpret_v_u32m1_u8m1(__VA_ARGS__)
+#define vreinterpret_v_u32m2_u8m2(...) __riscv_vreinterpret_v_u32m2_u8m2(__VA_ARGS__)
+#define vreinterpret_v_u32m4_u8m4(...) __riscv_vreinterpret_v_u32m4_u8m4(__VA_ARGS__)
+#define vreinterpret_v_u32m8_u8m8(...) __riscv_vreinterpret_v_u32m8_u8m8(__VA_ARGS__)
+#define vreinterpret_v_i32mf2_i16mf2(...) __riscv_vreinterpret_v_i32mf2_i16mf2(__VA_ARGS__)
+#define vreinterpret_v_i32m1_i16m1(...) __riscv_vreinterpret_v_i32m1_i16m1(__VA_ARGS__)
+#define vreinterpret_v_i32m2_i16m2(...) __riscv_vreinterpret_v_i32m2_i16m2(__VA_ARGS__)
+#define vreinterpret_v_i32m4_i16m4(...) __riscv_vreinterpret_v_i32m4_i16m4(__VA_ARGS__)
+#define vreinterpret_v_i32m8_i16m8(...) __riscv_vreinterpret_v_i32m8_i16m8(__VA_ARGS__)
+#define vreinterpret_v_u32mf2_u16mf2(...) __riscv_vreinterpret_v_u32mf2_u16mf2(__VA_ARGS__)
+#define vreinterpret_v_u32m1_u16m1(...) __riscv_vreinterpret_v_u32m1_u16m1(__VA_ARGS__)
+#define vreinterpret_v_u32m2_u16m2(...) __riscv_vreinterpret_v_u32m2_u16m2(__VA_ARGS__)
+#define vreinterpret_v_u32m4_u16m4(...) __riscv_vreinterpret_v_u32m4_u16m4(__VA_ARGS__)
+#define vreinterpret_v_u32m8_u16m8(...) __riscv_vreinterpret_v_u32m8_u16m8(__VA_ARGS__)
+#define vreinterpret_v_i32m1_i64m1(...) __riscv_vreinterpret_v_i32m1_i64m1(__VA_ARGS__)
+#define vreinterpret_v_i32m2_i64m2(...) __riscv_vreinterpret_v_i32m2_i64m2(__VA_ARGS__)
+#define vreinterpret_v_i32m4_i64m4(...) __riscv_vreinterpret_v_i32m4_i64m4(__VA_ARGS__)
+#define vreinterpret_v_i32m8_i64m8(...) __riscv_vreinterpret_v_i32m8_i64m8(__VA_ARGS__)
+#define vreinterpret_v_u32m1_u64m1(...) __riscv_vreinterpret_v_u32m1_u64m1(__VA_ARGS__)
+#define vreinterpret_v_u32m2_u64m2(...) __riscv_vreinterpret_v_u32m2_u64m2(__VA_ARGS__)
+#define vreinterpret_v_u32m4_u64m4(...) __riscv_vreinterpret_v_u32m4_u64m4(__VA_ARGS__)
+#define vreinterpret_v_u32m8_u64m8(...) __riscv_vreinterpret_v_u32m8_u64m8(__VA_ARGS__)
+#define vreinterpret_v_i64m1_i8m1(...) __riscv_vreinterpret_v_i64m1_i8m1(__VA_ARGS__)
+#define vreinterpret_v_i64m2_i8m2(...) __riscv_vreinterpret_v_i64m2_i8m2(__VA_ARGS__)
+#define vreinterpret_v_i64m4_i8m4(...) __riscv_vreinterpret_v_i64m4_i8m4(__VA_ARGS__)
+#define vreinterpret_v_i64m8_i8m8(...) __riscv_vreinterpret_v_i64m8_i8m8(__VA_ARGS__)
+#define vreinterpret_v_u64m1_u8m1(...) __riscv_vreinterpret_v_u64m1_u8m1(__VA_ARGS__)
+#define vreinterpret_v_u64m2_u8m2(...) __riscv_vreinterpret_v_u64m2_u8m2(__VA_ARGS__)
+#define vreinterpret_v_u64m4_u8m4(...) __riscv_vreinterpret_v_u64m4_u8m4(__VA_ARGS__)
+#define vreinterpret_v_u64m8_u8m8(...) __riscv_vreinterpret_v_u64m8_u8m8(__VA_ARGS__)
+#define vreinterpret_v_i64m1_i16m1(...) __riscv_vreinterpret_v_i64m1_i16m1(__VA_ARGS__)
+#define vreinterpret_v_i64m2_i16m2(...) __riscv_vreinterpret_v_i64m2_i16m2(__VA_ARGS__)
+#define vreinterpret_v_i64m4_i16m4(...) __riscv_vreinterpret_v_i64m4_i16m4(__VA_ARGS__)
+#define vreinterpret_v_i64m8_i16m8(...) __riscv_vreinterpret_v_i64m8_i16m8(__VA_ARGS__)
+#define vreinterpret_v_u64m1_u16m1(...) __riscv_vreinterpret_v_u64m1_u16m1(__VA_ARGS__)
+#define vreinterpret_v_u64m2_u16m2(...) __riscv_vreinterpret_v_u64m2_u16m2(__VA_ARGS__)
+#define vreinterpret_v_u64m4_u16m4(...) __riscv_vreinterpret_v_u64m4_u16m4(__VA_ARGS__)
+#define vreinterpret_v_u64m8_u16m8(...) __riscv_vreinterpret_v_u64m8_u16m8(__VA_ARGS__)
+#define vreinterpret_v_i64m1_i32m1(...) __riscv_vreinterpret_v_i64m1_i32m1(__VA_ARGS__)
+#define vreinterpret_v_i64m2_i32m2(...) __riscv_vreinterpret_v_i64m2_i32m2(__VA_ARGS__)
+#define vreinterpret_v_i64m4_i32m4(...) __riscv_vreinterpret_v_i64m4_i32m4(__VA_ARGS__)
+#define vreinterpret_v_i64m8_i32m8(...) __riscv_vreinterpret_v_i64m8_i32m8(__VA_ARGS__)
+#define vreinterpret_v_u64m1_u32m1(...) __riscv_vreinterpret_v_u64m1_u32m1(__VA_ARGS__)
+#define vreinterpret_v_u64m2_u32m2(...) __riscv_vreinterpret_v_u64m2_u32m2(__VA_ARGS__)
+#define vreinterpret_v_u64m4_u32m4(...) __riscv_vreinterpret_v_u64m4_u32m4(__VA_ARGS__)
+#define vreinterpret_v_u64m8_u32m8(...) __riscv_vreinterpret_v_u64m8_u32m8(__VA_ARGS__)
+#define vlmul_ext_v_f16mf4_f16mf2(...) __riscv_vlmul_ext_v_f16mf4_f16mf2(__VA_ARGS__) // vlmul_ext: each legacy name forwards its arguments unchanged to the __riscv_-prefixed intrinsic of the same name.
+#define vlmul_ext_v_f16mf4_f16m1(...) __riscv_vlmul_ext_v_f16mf4_f16m1(__VA_ARGS__)
+#define vlmul_ext_v_f16mf4_f16m2(...) __riscv_vlmul_ext_v_f16mf4_f16m2(__VA_ARGS__)
+#define vlmul_ext_v_f16mf4_f16m4(...) __riscv_vlmul_ext_v_f16mf4_f16m4(__VA_ARGS__)
+#define vlmul_ext_v_f16mf4_f16m8(...) __riscv_vlmul_ext_v_f16mf4_f16m8(__VA_ARGS__)
+#define vlmul_ext_v_f16mf2_f16m1(...) __riscv_vlmul_ext_v_f16mf2_f16m1(__VA_ARGS__)
+#define vlmul_ext_v_f16mf2_f16m2(...) __riscv_vlmul_ext_v_f16mf2_f16m2(__VA_ARGS__)
+#define vlmul_ext_v_f16mf2_f16m4(...) __riscv_vlmul_ext_v_f16mf2_f16m4(__VA_ARGS__)
+#define vlmul_ext_v_f16mf2_f16m8(...) __riscv_vlmul_ext_v_f16mf2_f16m8(__VA_ARGS__)
+#define vlmul_ext_v_f16m1_f16m2(...) __riscv_vlmul_ext_v_f16m1_f16m2(__VA_ARGS__)
+#define vlmul_ext_v_f16m1_f16m4(...) __riscv_vlmul_ext_v_f16m1_f16m4(__VA_ARGS__)
+#define vlmul_ext_v_f16m1_f16m8(...) __riscv_vlmul_ext_v_f16m1_f16m8(__VA_ARGS__)
+#define vlmul_ext_v_f16m2_f16m4(...) __riscv_vlmul_ext_v_f16m2_f16m4(__VA_ARGS__)
+#define vlmul_ext_v_f16m2_f16m8(...) __riscv_vlmul_ext_v_f16m2_f16m8(__VA_ARGS__)
+#define vlmul_ext_v_f16m4_f16m8(...) __riscv_vlmul_ext_v_f16m4_f16m8(__VA_ARGS__)
+#define vlmul_ext_v_f32mf2_f32m1(...) __riscv_vlmul_ext_v_f32mf2_f32m1(__VA_ARGS__)
+#define vlmul_ext_v_f32mf2_f32m2(...) __riscv_vlmul_ext_v_f32mf2_f32m2(__VA_ARGS__)
+#define vlmul_ext_v_f32mf2_f32m4(...) __riscv_vlmul_ext_v_f32mf2_f32m4(__VA_ARGS__)
+#define vlmul_ext_v_f32mf2_f32m8(...) __riscv_vlmul_ext_v_f32mf2_f32m8(__VA_ARGS__)
+#define vlmul_ext_v_f32m1_f32m2(...) __riscv_vlmul_ext_v_f32m1_f32m2(__VA_ARGS__)
+#define vlmul_ext_v_f32m1_f32m4(...) __riscv_vlmul_ext_v_f32m1_f32m4(__VA_ARGS__)
+#define vlmul_ext_v_f32m1_f32m8(...) __riscv_vlmul_ext_v_f32m1_f32m8(__VA_ARGS__)
+#define vlmul_ext_v_f32m2_f32m4(...) __riscv_vlmul_ext_v_f32m2_f32m4(__VA_ARGS__)
+#define vlmul_ext_v_f32m2_f32m8(...) __riscv_vlmul_ext_v_f32m2_f32m8(__VA_ARGS__)
+#define vlmul_ext_v_f32m4_f32m8(...) __riscv_vlmul_ext_v_f32m4_f32m8(__VA_ARGS__)
+#define vlmul_ext_v_f64m1_f64m2(...) __riscv_vlmul_ext_v_f64m1_f64m2(__VA_ARGS__)
+#define vlmul_ext_v_f64m1_f64m4(...) __riscv_vlmul_ext_v_f64m1_f64m4(__VA_ARGS__)
+#define vlmul_ext_v_f64m1_f64m8(...) __riscv_vlmul_ext_v_f64m1_f64m8(__VA_ARGS__)
+#define vlmul_ext_v_f64m2_f64m4(...) __riscv_vlmul_ext_v_f64m2_f64m4(__VA_ARGS__)
+#define vlmul_ext_v_f64m2_f64m8(...) __riscv_vlmul_ext_v_f64m2_f64m8(__VA_ARGS__)
+#define vlmul_ext_v_f64m4_f64m8(...) __riscv_vlmul_ext_v_f64m4_f64m8(__VA_ARGS__)
+#define vlmul_ext_v_i8mf8_i8mf4(...) __riscv_vlmul_ext_v_i8mf8_i8mf4(__VA_ARGS__)
+#define vlmul_ext_v_i8mf8_i8mf2(...) __riscv_vlmul_ext_v_i8mf8_i8mf2(__VA_ARGS__)
+#define vlmul_ext_v_i8mf8_i8m1(...) __riscv_vlmul_ext_v_i8mf8_i8m1(__VA_ARGS__)
+#define vlmul_ext_v_i8mf8_i8m2(...) __riscv_vlmul_ext_v_i8mf8_i8m2(__VA_ARGS__)
+#define vlmul_ext_v_i8mf8_i8m4(...) __riscv_vlmul_ext_v_i8mf8_i8m4(__VA_ARGS__)
+#define vlmul_ext_v_i8mf8_i8m8(...) __riscv_vlmul_ext_v_i8mf8_i8m8(__VA_ARGS__)
+#define vlmul_ext_v_i8mf4_i8mf2(...) __riscv_vlmul_ext_v_i8mf4_i8mf2(__VA_ARGS__)
+#define vlmul_ext_v_i8mf4_i8m1(...) __riscv_vlmul_ext_v_i8mf4_i8m1(__VA_ARGS__)
+#define vlmul_ext_v_i8mf4_i8m2(...) __riscv_vlmul_ext_v_i8mf4_i8m2(__VA_ARGS__)
+#define vlmul_ext_v_i8mf4_i8m4(...) __riscv_vlmul_ext_v_i8mf4_i8m4(__VA_ARGS__)
+#define vlmul_ext_v_i8mf4_i8m8(...) __riscv_vlmul_ext_v_i8mf4_i8m8(__VA_ARGS__)
+#define vlmul_ext_v_i8mf2_i8m1(...) __riscv_vlmul_ext_v_i8mf2_i8m1(__VA_ARGS__)
+#define vlmul_ext_v_i8mf2_i8m2(...) __riscv_vlmul_ext_v_i8mf2_i8m2(__VA_ARGS__)
+#define vlmul_ext_v_i8mf2_i8m4(...) __riscv_vlmul_ext_v_i8mf2_i8m4(__VA_ARGS__)
+#define vlmul_ext_v_i8mf2_i8m8(...) __riscv_vlmul_ext_v_i8mf2_i8m8(__VA_ARGS__)
+#define vlmul_ext_v_i8m1_i8m2(...) __riscv_vlmul_ext_v_i8m1_i8m2(__VA_ARGS__)
+#define vlmul_ext_v_i8m1_i8m4(...) __riscv_vlmul_ext_v_i8m1_i8m4(__VA_ARGS__)
+#define vlmul_ext_v_i8m1_i8m8(...) __riscv_vlmul_ext_v_i8m1_i8m8(__VA_ARGS__)
+#define vlmul_ext_v_i8m2_i8m4(...) __riscv_vlmul_ext_v_i8m2_i8m4(__VA_ARGS__)
+#define vlmul_ext_v_i8m2_i8m8(...) __riscv_vlmul_ext_v_i8m2_i8m8(__VA_ARGS__)
+#define vlmul_ext_v_i8m4_i8m8(...) __riscv_vlmul_ext_v_i8m4_i8m8(__VA_ARGS__)
+#define vlmul_ext_v_i16mf4_i16mf2(...) __riscv_vlmul_ext_v_i16mf4_i16mf2(__VA_ARGS__)
+#define vlmul_ext_v_i16mf4_i16m1(...) __riscv_vlmul_ext_v_i16mf4_i16m1(__VA_ARGS__)
+#define vlmul_ext_v_i16mf4_i16m2(...) __riscv_vlmul_ext_v_i16mf4_i16m2(__VA_ARGS__)
+#define vlmul_ext_v_i16mf4_i16m4(...) __riscv_vlmul_ext_v_i16mf4_i16m4(__VA_ARGS__)
+#define vlmul_ext_v_i16mf4_i16m8(...) __riscv_vlmul_ext_v_i16mf4_i16m8(__VA_ARGS__)
+#define vlmul_ext_v_i16mf2_i16m1(...) __riscv_vlmul_ext_v_i16mf2_i16m1(__VA_ARGS__)
+#define vlmul_ext_v_i16mf2_i16m2(...) __riscv_vlmul_ext_v_i16mf2_i16m2(__VA_ARGS__)
+#define vlmul_ext_v_i16mf2_i16m4(...) __riscv_vlmul_ext_v_i16mf2_i16m4(__VA_ARGS__)
+#define vlmul_ext_v_i16mf2_i16m8(...) __riscv_vlmul_ext_v_i16mf2_i16m8(__VA_ARGS__)
+#define vlmul_ext_v_i16m1_i16m2(...) __riscv_vlmul_ext_v_i16m1_i16m2(__VA_ARGS__)
+#define vlmul_ext_v_i16m1_i16m4(...) __riscv_vlmul_ext_v_i16m1_i16m4(__VA_ARGS__)
+#define vlmul_ext_v_i16m1_i16m8(...) __riscv_vlmul_ext_v_i16m1_i16m8(__VA_ARGS__)
+#define vlmul_ext_v_i16m2_i16m4(...) __riscv_vlmul_ext_v_i16m2_i16m4(__VA_ARGS__)
+#define vlmul_ext_v_i16m2_i16m8(...) __riscv_vlmul_ext_v_i16m2_i16m8(__VA_ARGS__)
+#define vlmul_ext_v_i16m4_i16m8(...) __riscv_vlmul_ext_v_i16m4_i16m8(__VA_ARGS__)
+#define vlmul_ext_v_i32mf2_i32m1(...) __riscv_vlmul_ext_v_i32mf2_i32m1(__VA_ARGS__)
+#define vlmul_ext_v_i32mf2_i32m2(...) __riscv_vlmul_ext_v_i32mf2_i32m2(__VA_ARGS__)
+#define vlmul_ext_v_i32mf2_i32m4(...) __riscv_vlmul_ext_v_i32mf2_i32m4(__VA_ARGS__)
+#define vlmul_ext_v_i32mf2_i32m8(...) __riscv_vlmul_ext_v_i32mf2_i32m8(__VA_ARGS__)
+#define vlmul_ext_v_i32m1_i32m2(...) __riscv_vlmul_ext_v_i32m1_i32m2(__VA_ARGS__)
+#define vlmul_ext_v_i32m1_i32m4(...) __riscv_vlmul_ext_v_i32m1_i32m4(__VA_ARGS__)
+#define vlmul_ext_v_i32m1_i32m8(...) __riscv_vlmul_ext_v_i32m1_i32m8(__VA_ARGS__)
+#define vlmul_ext_v_i32m2_i32m4(...) __riscv_vlmul_ext_v_i32m2_i32m4(__VA_ARGS__)
+#define vlmul_ext_v_i32m2_i32m8(...) __riscv_vlmul_ext_v_i32m2_i32m8(__VA_ARGS__)
+#define vlmul_ext_v_i32m4_i32m8(...) __riscv_vlmul_ext_v_i32m4_i32m8(__VA_ARGS__)
+#define vlmul_ext_v_i64m1_i64m2(...) __riscv_vlmul_ext_v_i64m1_i64m2(__VA_ARGS__)
+#define vlmul_ext_v_i64m1_i64m4(...) __riscv_vlmul_ext_v_i64m1_i64m4(__VA_ARGS__)
+#define vlmul_ext_v_i64m1_i64m8(...) __riscv_vlmul_ext_v_i64m1_i64m8(__VA_ARGS__)
+#define vlmul_ext_v_i64m2_i64m4(...) __riscv_vlmul_ext_v_i64m2_i64m4(__VA_ARGS__)
+#define vlmul_ext_v_i64m2_i64m8(...) __riscv_vlmul_ext_v_i64m2_i64m8(__VA_ARGS__)
+#define vlmul_ext_v_i64m4_i64m8(...) __riscv_vlmul_ext_v_i64m4_i64m8(__VA_ARGS__)
+#define vlmul_ext_v_u8mf8_u8mf4(...) __riscv_vlmul_ext_v_u8mf8_u8mf4(__VA_ARGS__)
+#define vlmul_ext_v_u8mf8_u8mf2(...) __riscv_vlmul_ext_v_u8mf8_u8mf2(__VA_ARGS__)
+#define vlmul_ext_v_u8mf8_u8m1(...) __riscv_vlmul_ext_v_u8mf8_u8m1(__VA_ARGS__)
+#define vlmul_ext_v_u8mf8_u8m2(...) __riscv_vlmul_ext_v_u8mf8_u8m2(__VA_ARGS__)
+#define vlmul_ext_v_u8mf8_u8m4(...) __riscv_vlmul_ext_v_u8mf8_u8m4(__VA_ARGS__)
+#define vlmul_ext_v_u8mf8_u8m8(...) __riscv_vlmul_ext_v_u8mf8_u8m8(__VA_ARGS__)
+#define vlmul_ext_v_u8mf4_u8mf2(...) __riscv_vlmul_ext_v_u8mf4_u8mf2(__VA_ARGS__)
+#define vlmul_ext_v_u8mf4_u8m1(...) __riscv_vlmul_ext_v_u8mf4_u8m1(__VA_ARGS__)
+#define vlmul_ext_v_u8mf4_u8m2(...) __riscv_vlmul_ext_v_u8mf4_u8m2(__VA_ARGS__)
+#define vlmul_ext_v_u8mf4_u8m4(...) __riscv_vlmul_ext_v_u8mf4_u8m4(__VA_ARGS__)
+#define vlmul_ext_v_u8mf4_u8m8(...) __riscv_vlmul_ext_v_u8mf4_u8m8(__VA_ARGS__)
+#define vlmul_ext_v_u8mf2_u8m1(...) __riscv_vlmul_ext_v_u8mf2_u8m1(__VA_ARGS__)
+#define vlmul_ext_v_u8mf2_u8m2(...) __riscv_vlmul_ext_v_u8mf2_u8m2(__VA_ARGS__)
+#define vlmul_ext_v_u8mf2_u8m4(...) __riscv_vlmul_ext_v_u8mf2_u8m4(__VA_ARGS__)
+#define vlmul_ext_v_u8mf2_u8m8(...) __riscv_vlmul_ext_v_u8mf2_u8m8(__VA_ARGS__)
+#define vlmul_ext_v_u8m1_u8m2(...) __riscv_vlmul_ext_v_u8m1_u8m2(__VA_ARGS__)
+#define vlmul_ext_v_u8m1_u8m4(...) __riscv_vlmul_ext_v_u8m1_u8m4(__VA_ARGS__)
+#define vlmul_ext_v_u8m1_u8m8(...) __riscv_vlmul_ext_v_u8m1_u8m8(__VA_ARGS__)
+#define vlmul_ext_v_u8m2_u8m4(...) __riscv_vlmul_ext_v_u8m2_u8m4(__VA_ARGS__)
+#define vlmul_ext_v_u8m2_u8m8(...) __riscv_vlmul_ext_v_u8m2_u8m8(__VA_ARGS__)
+#define vlmul_ext_v_u8m4_u8m8(...) __riscv_vlmul_ext_v_u8m4_u8m8(__VA_ARGS__)
+#define vlmul_ext_v_u16mf4_u16mf2(...) __riscv_vlmul_ext_v_u16mf4_u16mf2(__VA_ARGS__)
+#define vlmul_ext_v_u16mf4_u16m1(...) __riscv_vlmul_ext_v_u16mf4_u16m1(__VA_ARGS__)
+#define vlmul_ext_v_u16mf4_u16m2(...) __riscv_vlmul_ext_v_u16mf4_u16m2(__VA_ARGS__)
+#define vlmul_ext_v_u16mf4_u16m4(...) __riscv_vlmul_ext_v_u16mf4_u16m4(__VA_ARGS__)
+#define vlmul_ext_v_u16mf4_u16m8(...) __riscv_vlmul_ext_v_u16mf4_u16m8(__VA_ARGS__)
+#define vlmul_ext_v_u16mf2_u16m1(...) __riscv_vlmul_ext_v_u16mf2_u16m1(__VA_ARGS__)
+#define vlmul_ext_v_u16mf2_u16m2(...) __riscv_vlmul_ext_v_u16mf2_u16m2(__VA_ARGS__)
+#define vlmul_ext_v_u16mf2_u16m4(...) __riscv_vlmul_ext_v_u16mf2_u16m4(__VA_ARGS__)
+#define vlmul_ext_v_u16mf2_u16m8(...) __riscv_vlmul_ext_v_u16mf2_u16m8(__VA_ARGS__)
+#define vlmul_ext_v_u16m1_u16m2(...) __riscv_vlmul_ext_v_u16m1_u16m2(__VA_ARGS__)
+#define vlmul_ext_v_u16m1_u16m4(...) __riscv_vlmul_ext_v_u16m1_u16m4(__VA_ARGS__)
+#define vlmul_ext_v_u16m1_u16m8(...) __riscv_vlmul_ext_v_u16m1_u16m8(__VA_ARGS__)
+#define vlmul_ext_v_u16m2_u16m4(...) __riscv_vlmul_ext_v_u16m2_u16m4(__VA_ARGS__)
+#define vlmul_ext_v_u16m2_u16m8(...) __riscv_vlmul_ext_v_u16m2_u16m8(__VA_ARGS__)
+#define vlmul_ext_v_u16m4_u16m8(...) __riscv_vlmul_ext_v_u16m4_u16m8(__VA_ARGS__)
+#define vlmul_ext_v_u32mf2_u32m1(...) __riscv_vlmul_ext_v_u32mf2_u32m1(__VA_ARGS__)
+#define vlmul_ext_v_u32mf2_u32m2(...) __riscv_vlmul_ext_v_u32mf2_u32m2(__VA_ARGS__)
+#define vlmul_ext_v_u32mf2_u32m4(...) __riscv_vlmul_ext_v_u32mf2_u32m4(__VA_ARGS__)
+#define vlmul_ext_v_u32mf2_u32m8(...) __riscv_vlmul_ext_v_u32mf2_u32m8(__VA_ARGS__)
+#define vlmul_ext_v_u32m1_u32m2(...) __riscv_vlmul_ext_v_u32m1_u32m2(__VA_ARGS__)
+#define vlmul_ext_v_u32m1_u32m4(...) __riscv_vlmul_ext_v_u32m1_u32m4(__VA_ARGS__)
+#define vlmul_ext_v_u32m1_u32m8(...) __riscv_vlmul_ext_v_u32m1_u32m8(__VA_ARGS__)
+#define vlmul_ext_v_u32m2_u32m4(...) __riscv_vlmul_ext_v_u32m2_u32m4(__VA_ARGS__)
+#define vlmul_ext_v_u32m2_u32m8(...) __riscv_vlmul_ext_v_u32m2_u32m8(__VA_ARGS__)
+#define vlmul_ext_v_u32m4_u32m8(...) __riscv_vlmul_ext_v_u32m4_u32m8(__VA_ARGS__)
+#define vlmul_ext_v_u64m1_u64m2(...) __riscv_vlmul_ext_v_u64m1_u64m2(__VA_ARGS__)
+#define vlmul_ext_v_u64m1_u64m4(...) __riscv_vlmul_ext_v_u64m1_u64m4(__VA_ARGS__)
+#define vlmul_ext_v_u64m1_u64m8(...) __riscv_vlmul_ext_v_u64m1_u64m8(__VA_ARGS__)
+#define vlmul_ext_v_u64m2_u64m4(...) __riscv_vlmul_ext_v_u64m2_u64m4(__VA_ARGS__)
+#define vlmul_ext_v_u64m2_u64m8(...) __riscv_vlmul_ext_v_u64m2_u64m8(__VA_ARGS__)
+#define vlmul_ext_v_u64m4_u64m8(...) __riscv_vlmul_ext_v_u64m4_u64m8(__VA_ARGS__)
+#define vlmul_trunc_v_f16mf2_f16mf4(...) __riscv_vlmul_trunc_v_f16mf2_f16mf4(__VA_ARGS__)
+#define vlmul_trunc_v_f16m1_f16mf4(...) __riscv_vlmul_trunc_v_f16m1_f16mf4(__VA_ARGS__)
+#define vlmul_trunc_v_f16m1_f16mf2(...) __riscv_vlmul_trunc_v_f16m1_f16mf2(__VA_ARGS__)
+#define vlmul_trunc_v_f16m2_f16mf4(...) __riscv_vlmul_trunc_v_f16m2_f16mf4(__VA_ARGS__)
+#define vlmul_trunc_v_f16m2_f16mf2(...) __riscv_vlmul_trunc_v_f16m2_f16mf2(__VA_ARGS__)
+#define vlmul_trunc_v_f16m2_f16m1(...) __riscv_vlmul_trunc_v_f16m2_f16m1(__VA_ARGS__)
+#define vlmul_trunc_v_f16m4_f16mf4(...) __riscv_vlmul_trunc_v_f16m4_f16mf4(__VA_ARGS__)
+#define vlmul_trunc_v_f16m4_f16mf2(...) __riscv_vlmul_trunc_v_f16m4_f16mf2(__VA_ARGS__)
+#define vlmul_trunc_v_f16m4_f16m1(...) __riscv_vlmul_trunc_v_f16m4_f16m1(__VA_ARGS__)
+#define vlmul_trunc_v_f16m4_f16m2(...) __riscv_vlmul_trunc_v_f16m4_f16m2(__VA_ARGS__)
+#define vlmul_trunc_v_f16m8_f16mf4(...) __riscv_vlmul_trunc_v_f16m8_f16mf4(__VA_ARGS__)
+#define vlmul_trunc_v_f16m8_f16mf2(...) __riscv_vlmul_trunc_v_f16m8_f16mf2(__VA_ARGS__)
+#define vlmul_trunc_v_f16m8_f16m1(...) __riscv_vlmul_trunc_v_f16m8_f16m1(__VA_ARGS__)
+#define vlmul_trunc_v_f16m8_f16m2(...) __riscv_vlmul_trunc_v_f16m8_f16m2(__VA_ARGS__)
+#define vlmul_trunc_v_f16m8_f16m4(...) __riscv_vlmul_trunc_v_f16m8_f16m4(__VA_ARGS__)
+#define vlmul_trunc_v_f32m1_f32mf2(...) __riscv_vlmul_trunc_v_f32m1_f32mf2(__VA_ARGS__)
+#define vlmul_trunc_v_f32m2_f32mf2(...) __riscv_vlmul_trunc_v_f32m2_f32mf2(__VA_ARGS__)
+#define vlmul_trunc_v_f32m2_f32m1(...) __riscv_vlmul_trunc_v_f32m2_f32m1(__VA_ARGS__)
+#define vlmul_trunc_v_f32m4_f32mf2(...) __riscv_vlmul_trunc_v_f32m4_f32mf2(__VA_ARGS__)
+#define vlmul_trunc_v_f32m4_f32m1(...) __riscv_vlmul_trunc_v_f32m4_f32m1(__VA_ARGS__)
+#define vlmul_trunc_v_f32m4_f32m2(...) __riscv_vlmul_trunc_v_f32m4_f32m2(__VA_ARGS__)
+#define vlmul_trunc_v_f32m8_f32mf2(...) __riscv_vlmul_trunc_v_f32m8_f32mf2(__VA_ARGS__)
+#define vlmul_trunc_v_f32m8_f32m1(...) __riscv_vlmul_trunc_v_f32m8_f32m1(__VA_ARGS__)
+#define vlmul_trunc_v_f32m8_f32m2(...) __riscv_vlmul_trunc_v_f32m8_f32m2(__VA_ARGS__)
+#define vlmul_trunc_v_f32m8_f32m4(...) __riscv_vlmul_trunc_v_f32m8_f32m4(__VA_ARGS__)
+#define vlmul_trunc_v_f64m2_f64m1(...) __riscv_vlmul_trunc_v_f64m2_f64m1(__VA_ARGS__)
+#define vlmul_trunc_v_f64m4_f64m1(...) __riscv_vlmul_trunc_v_f64m4_f64m1(__VA_ARGS__)
+#define vlmul_trunc_v_f64m4_f64m2(...) __riscv_vlmul_trunc_v_f64m4_f64m2(__VA_ARGS__)
+#define vlmul_trunc_v_f64m8_f64m1(...) __riscv_vlmul_trunc_v_f64m8_f64m1(__VA_ARGS__)
+#define vlmul_trunc_v_f64m8_f64m2(...) __riscv_vlmul_trunc_v_f64m8_f64m2(__VA_ARGS__)
+#define vlmul_trunc_v_f64m8_f64m4(...) __riscv_vlmul_trunc_v_f64m8_f64m4(__VA_ARGS__)
+#define vlmul_trunc_v_i8mf4_i8mf8(...) __riscv_vlmul_trunc_v_i8mf4_i8mf8(__VA_ARGS__)
+#define vlmul_trunc_v_i8mf2_i8mf8(...) __riscv_vlmul_trunc_v_i8mf2_i8mf8(__VA_ARGS__)
+#define vlmul_trunc_v_i8mf2_i8mf4(...) __riscv_vlmul_trunc_v_i8mf2_i8mf4(__VA_ARGS__)
+#define vlmul_trunc_v_i8m1_i8mf8(...) __riscv_vlmul_trunc_v_i8m1_i8mf8(__VA_ARGS__)
+#define vlmul_trunc_v_i8m1_i8mf4(...) __riscv_vlmul_trunc_v_i8m1_i8mf4(__VA_ARGS__)
+#define vlmul_trunc_v_i8m1_i8mf2(...) __riscv_vlmul_trunc_v_i8m1_i8mf2(__VA_ARGS__)
+#define vlmul_trunc_v_i8m2_i8mf8(...) __riscv_vlmul_trunc_v_i8m2_i8mf8(__VA_ARGS__)
+#define vlmul_trunc_v_i8m2_i8mf4(...) __riscv_vlmul_trunc_v_i8m2_i8mf4(__VA_ARGS__)
+#define vlmul_trunc_v_i8m2_i8mf2(...) __riscv_vlmul_trunc_v_i8m2_i8mf2(__VA_ARGS__)
+#define vlmul_trunc_v_i8m2_i8m1(...) __riscv_vlmul_trunc_v_i8m2_i8m1(__VA_ARGS__)
+#define vlmul_trunc_v_i8m4_i8mf8(...) __riscv_vlmul_trunc_v_i8m4_i8mf8(__VA_ARGS__)
+#define vlmul_trunc_v_i8m4_i8mf4(...) __riscv_vlmul_trunc_v_i8m4_i8mf4(__VA_ARGS__)
+#define vlmul_trunc_v_i8m4_i8mf2(...) __riscv_vlmul_trunc_v_i8m4_i8mf2(__VA_ARGS__)
+#define vlmul_trunc_v_i8m4_i8m1(...) __riscv_vlmul_trunc_v_i8m4_i8m1(__VA_ARGS__)
+#define vlmul_trunc_v_i8m4_i8m2(...) __riscv_vlmul_trunc_v_i8m4_i8m2(__VA_ARGS__)
+#define vlmul_trunc_v_i8m8_i8mf8(...) __riscv_vlmul_trunc_v_i8m8_i8mf8(__VA_ARGS__)
+#define vlmul_trunc_v_i8m8_i8mf4(...) __riscv_vlmul_trunc_v_i8m8_i8mf4(__VA_ARGS__)
+#define vlmul_trunc_v_i8m8_i8mf2(...) __riscv_vlmul_trunc_v_i8m8_i8mf2(__VA_ARGS__)
+#define vlmul_trunc_v_i8m8_i8m1(...) __riscv_vlmul_trunc_v_i8m8_i8m1(__VA_ARGS__)
+#define vlmul_trunc_v_i8m8_i8m2(...) __riscv_vlmul_trunc_v_i8m8_i8m2(__VA_ARGS__)
+#define vlmul_trunc_v_i8m8_i8m4(...) __riscv_vlmul_trunc_v_i8m8_i8m4(__VA_ARGS__)
+#define vlmul_trunc_v_i16mf2_i16mf4(...) __riscv_vlmul_trunc_v_i16mf2_i16mf4(__VA_ARGS__)
+#define vlmul_trunc_v_i16m1_i16mf4(...) __riscv_vlmul_trunc_v_i16m1_i16mf4(__VA_ARGS__)
+#define vlmul_trunc_v_i16m1_i16mf2(...) __riscv_vlmul_trunc_v_i16m1_i16mf2(__VA_ARGS__)
+#define vlmul_trunc_v_i16m2_i16mf4(...) __riscv_vlmul_trunc_v_i16m2_i16mf4(__VA_ARGS__)
+#define vlmul_trunc_v_i16m2_i16mf2(...) __riscv_vlmul_trunc_v_i16m2_i16mf2(__VA_ARGS__)
+#define vlmul_trunc_v_i16m2_i16m1(...) __riscv_vlmul_trunc_v_i16m2_i16m1(__VA_ARGS__)
+#define vlmul_trunc_v_i16m4_i16mf4(...) __riscv_vlmul_trunc_v_i16m4_i16mf4(__VA_ARGS__)
+#define vlmul_trunc_v_i16m4_i16mf2(...) __riscv_vlmul_trunc_v_i16m4_i16mf2(__VA_ARGS__)
+#define vlmul_trunc_v_i16m4_i16m1(...) __riscv_vlmul_trunc_v_i16m4_i16m1(__VA_ARGS__)
+#define vlmul_trunc_v_i16m4_i16m2(...) __riscv_vlmul_trunc_v_i16m4_i16m2(__VA_ARGS__)
+#define vlmul_trunc_v_i16m8_i16mf4(...) __riscv_vlmul_trunc_v_i16m8_i16mf4(__VA_ARGS__)
+#define vlmul_trunc_v_i16m8_i16mf2(...) __riscv_vlmul_trunc_v_i16m8_i16mf2(__VA_ARGS__)
+#define vlmul_trunc_v_i16m8_i16m1(...) __riscv_vlmul_trunc_v_i16m8_i16m1(__VA_ARGS__)
+#define vlmul_trunc_v_i16m8_i16m2(...) __riscv_vlmul_trunc_v_i16m8_i16m2(__VA_ARGS__)
+#define vlmul_trunc_v_i16m8_i16m4(...) __riscv_vlmul_trunc_v_i16m8_i16m4(__VA_ARGS__)
+#define vlmul_trunc_v_i32m1_i32mf2(...) __riscv_vlmul_trunc_v_i32m1_i32mf2(__VA_ARGS__)
+#define vlmul_trunc_v_i32m2_i32mf2(...) __riscv_vlmul_trunc_v_i32m2_i32mf2(__VA_ARGS__)
+#define vlmul_trunc_v_i32m2_i32m1(...) __riscv_vlmul_trunc_v_i32m2_i32m1(__VA_ARGS__)
+#define vlmul_trunc_v_i32m4_i32mf2(...) __riscv_vlmul_trunc_v_i32m4_i32mf2(__VA_ARGS__)
+#define vlmul_trunc_v_i32m4_i32m1(...) __riscv_vlmul_trunc_v_i32m4_i32m1(__VA_ARGS__)
+#define vlmul_trunc_v_i32m4_i32m2(...) __riscv_vlmul_trunc_v_i32m4_i32m2(__VA_ARGS__)
+#define vlmul_trunc_v_i32m8_i32mf2(...) __riscv_vlmul_trunc_v_i32m8_i32mf2(__VA_ARGS__)
+#define vlmul_trunc_v_i32m8_i32m1(...) __riscv_vlmul_trunc_v_i32m8_i32m1(__VA_ARGS__)
+#define vlmul_trunc_v_i32m8_i32m2(...) __riscv_vlmul_trunc_v_i32m8_i32m2(__VA_ARGS__)
+#define vlmul_trunc_v_i32m8_i32m4(...) __riscv_vlmul_trunc_v_i32m8_i32m4(__VA_ARGS__)
+#define vlmul_trunc_v_i64m2_i64m1(...) __riscv_vlmul_trunc_v_i64m2_i64m1(__VA_ARGS__)
+#define vlmul_trunc_v_i64m4_i64m1(...) __riscv_vlmul_trunc_v_i64m4_i64m1(__VA_ARGS__)
+#define vlmul_trunc_v_i64m4_i64m2(...) __riscv_vlmul_trunc_v_i64m4_i64m2(__VA_ARGS__)
+#define vlmul_trunc_v_i64m8_i64m1(...) __riscv_vlmul_trunc_v_i64m8_i64m1(__VA_ARGS__)
+#define vlmul_trunc_v_i64m8_i64m2(...) __riscv_vlmul_trunc_v_i64m8_i64m2(__VA_ARGS__)
+#define vlmul_trunc_v_i64m8_i64m4(...) __riscv_vlmul_trunc_v_i64m8_i64m4(__VA_ARGS__)
+#define vlmul_trunc_v_u8mf4_u8mf8(...) __riscv_vlmul_trunc_v_u8mf4_u8mf8(__VA_ARGS__)
+#define vlmul_trunc_v_u8mf2_u8mf8(...) __riscv_vlmul_trunc_v_u8mf2_u8mf8(__VA_ARGS__)
+#define vlmul_trunc_v_u8mf2_u8mf4(...) __riscv_vlmul_trunc_v_u8mf2_u8mf4(__VA_ARGS__)
+#define vlmul_trunc_v_u8m1_u8mf8(...) __riscv_vlmul_trunc_v_u8m1_u8mf8(__VA_ARGS__)
+#define vlmul_trunc_v_u8m1_u8mf4(...) __riscv_vlmul_trunc_v_u8m1_u8mf4(__VA_ARGS__)
+#define vlmul_trunc_v_u8m1_u8mf2(...) __riscv_vlmul_trunc_v_u8m1_u8mf2(__VA_ARGS__)
+#define vlmul_trunc_v_u8m2_u8mf8(...) __riscv_vlmul_trunc_v_u8m2_u8mf8(__VA_ARGS__)
+#define vlmul_trunc_v_u8m2_u8mf4(...) __riscv_vlmul_trunc_v_u8m2_u8mf4(__VA_ARGS__)
+#define vlmul_trunc_v_u8m2_u8mf2(...) __riscv_vlmul_trunc_v_u8m2_u8mf2(__VA_ARGS__)
+#define vlmul_trunc_v_u8m2_u8m1(...) __riscv_vlmul_trunc_v_u8m2_u8m1(__VA_ARGS__)
+#define vlmul_trunc_v_u8m4_u8mf8(...) __riscv_vlmul_trunc_v_u8m4_u8mf8(__VA_ARGS__)
+#define vlmul_trunc_v_u8m4_u8mf4(...) __riscv_vlmul_trunc_v_u8m4_u8mf4(__VA_ARGS__)
+#define vlmul_trunc_v_u8m4_u8mf2(...) __riscv_vlmul_trunc_v_u8m4_u8mf2(__VA_ARGS__)
+#define vlmul_trunc_v_u8m4_u8m1(...) __riscv_vlmul_trunc_v_u8m4_u8m1(__VA_ARGS__)
+#define vlmul_trunc_v_u8m4_u8m2(...) __riscv_vlmul_trunc_v_u8m4_u8m2(__VA_ARGS__)
+#define vlmul_trunc_v_u8m8_u8mf8(...) __riscv_vlmul_trunc_v_u8m8_u8mf8(__VA_ARGS__)
+#define vlmul_trunc_v_u8m8_u8mf4(...) __riscv_vlmul_trunc_v_u8m8_u8mf4(__VA_ARGS__)
+#define vlmul_trunc_v_u8m8_u8mf2(...) __riscv_vlmul_trunc_v_u8m8_u8mf2(__VA_ARGS__)
+#define vlmul_trunc_v_u8m8_u8m1(...) __riscv_vlmul_trunc_v_u8m8_u8m1(__VA_ARGS__)
+#define vlmul_trunc_v_u8m8_u8m2(...) __riscv_vlmul_trunc_v_u8m8_u8m2(__VA_ARGS__)
+#define vlmul_trunc_v_u8m8_u8m4(...) __riscv_vlmul_trunc_v_u8m8_u8m4(__VA_ARGS__)
+#define vlmul_trunc_v_u16mf2_u16mf4(...) __riscv_vlmul_trunc_v_u16mf2_u16mf4(__VA_ARGS__)
+#define vlmul_trunc_v_u16m1_u16mf4(...) __riscv_vlmul_trunc_v_u16m1_u16mf4(__VA_ARGS__)
+#define vlmul_trunc_v_u16m1_u16mf2(...) __riscv_vlmul_trunc_v_u16m1_u16mf2(__VA_ARGS__)
+#define vlmul_trunc_v_u16m2_u16mf4(...) __riscv_vlmul_trunc_v_u16m2_u16mf4(__VA_ARGS__)
+#define vlmul_trunc_v_u16m2_u16mf2(...) __riscv_vlmul_trunc_v_u16m2_u16mf2(__VA_ARGS__)
+#define vlmul_trunc_v_u16m2_u16m1(...) __riscv_vlmul_trunc_v_u16m2_u16m1(__VA_ARGS__)
+#define vlmul_trunc_v_u16m4_u16mf4(...) __riscv_vlmul_trunc_v_u16m4_u16mf4(__VA_ARGS__)
+#define vlmul_trunc_v_u16m4_u16mf2(...) __riscv_vlmul_trunc_v_u16m4_u16mf2(__VA_ARGS__)
+#define vlmul_trunc_v_u16m4_u16m1(...) __riscv_vlmul_trunc_v_u16m4_u16m1(__VA_ARGS__)
+#define vlmul_trunc_v_u16m4_u16m2(...) __riscv_vlmul_trunc_v_u16m4_u16m2(__VA_ARGS__)
+#define vlmul_trunc_v_u16m8_u16mf4(...) __riscv_vlmul_trunc_v_u16m8_u16mf4(__VA_ARGS__)
+#define vlmul_trunc_v_u16m8_u16mf2(...) __riscv_vlmul_trunc_v_u16m8_u16mf2(__VA_ARGS__)
+#define vlmul_trunc_v_u16m8_u16m1(...) __riscv_vlmul_trunc_v_u16m8_u16m1(__VA_ARGS__)
+#define vlmul_trunc_v_u16m8_u16m2(...) __riscv_vlmul_trunc_v_u16m8_u16m2(__VA_ARGS__)
+#define vlmul_trunc_v_u16m8_u16m4(...) __riscv_vlmul_trunc_v_u16m8_u16m4(__VA_ARGS__)
+#define vlmul_trunc_v_u32m1_u32mf2(...) __riscv_vlmul_trunc_v_u32m1_u32mf2(__VA_ARGS__)
+#define vlmul_trunc_v_u32m2_u32mf2(...) __riscv_vlmul_trunc_v_u32m2_u32mf2(__VA_ARGS__)
+#define vlmul_trunc_v_u32m2_u32m1(...) __riscv_vlmul_trunc_v_u32m2_u32m1(__VA_ARGS__)
+#define vlmul_trunc_v_u32m4_u32mf2(...) __riscv_vlmul_trunc_v_u32m4_u32mf2(__VA_ARGS__)
+#define vlmul_trunc_v_u32m4_u32m1(...) __riscv_vlmul_trunc_v_u32m4_u32m1(__VA_ARGS__)
+#define vlmul_trunc_v_u32m4_u32m2(...) __riscv_vlmul_trunc_v_u32m4_u32m2(__VA_ARGS__)
+#define vlmul_trunc_v_u32m8_u32mf2(...) __riscv_vlmul_trunc_v_u32m8_u32mf2(__VA_ARGS__)
+#define vlmul_trunc_v_u32m8_u32m1(...) __riscv_vlmul_trunc_v_u32m8_u32m1(__VA_ARGS__)
+#define vlmul_trunc_v_u32m8_u32m2(...) __riscv_vlmul_trunc_v_u32m8_u32m2(__VA_ARGS__)
+#define vlmul_trunc_v_u32m8_u32m4(...) __riscv_vlmul_trunc_v_u32m8_u32m4(__VA_ARGS__)
+#define vlmul_trunc_v_u64m2_u64m1(...) __riscv_vlmul_trunc_v_u64m2_u64m1(__VA_ARGS__)
+#define vlmul_trunc_v_u64m4_u64m1(...) __riscv_vlmul_trunc_v_u64m4_u64m1(__VA_ARGS__)
+#define vlmul_trunc_v_u64m4_u64m2(...) __riscv_vlmul_trunc_v_u64m4_u64m2(__VA_ARGS__)
+#define vlmul_trunc_v_u64m8_u64m1(...) __riscv_vlmul_trunc_v_u64m8_u64m1(__VA_ARGS__)
+#define vlmul_trunc_v_u64m8_u64m2(...) __riscv_vlmul_trunc_v_u64m8_u64m2(__VA_ARGS__)
+#define vlmul_trunc_v_u64m8_u64m4(...) __riscv_vlmul_trunc_v_u64m8_u64m4(__VA_ARGS__)
+#define vundefined_f16mf4(...) __riscv_vundefined_f16mf4(__VA_ARGS__)
+#define vundefined_f16mf2(...) __riscv_vundefined_f16mf2(__VA_ARGS__)
+#define vundefined_f16m1(...) __riscv_vundefined_f16m1(__VA_ARGS__)
+#define vundefined_f16m2(...) __riscv_vundefined_f16m2(__VA_ARGS__)
+#define vundefined_f16m4(...) __riscv_vundefined_f16m4(__VA_ARGS__)
+#define vundefined_f16m8(...) __riscv_vundefined_f16m8(__VA_ARGS__)
+#define vundefined_f32mf2(...) __riscv_vundefined_f32mf2(__VA_ARGS__)
+#define vundefined_f32m1(...) __riscv_vundefined_f32m1(__VA_ARGS__)
+#define vundefined_f32m2(...) __riscv_vundefined_f32m2(__VA_ARGS__)
+#define vundefined_f32m4(...) __riscv_vundefined_f32m4(__VA_ARGS__)
+#define vundefined_f32m8(...) __riscv_vundefined_f32m8(__VA_ARGS__)
+#define vundefined_f64m1(...) __riscv_vundefined_f64m1(__VA_ARGS__)
+#define vundefined_f64m2(...) __riscv_vundefined_f64m2(__VA_ARGS__)
+#define vundefined_f64m4(...) __riscv_vundefined_f64m4(__VA_ARGS__)
+#define vundefined_f64m8(...) __riscv_vundefined_f64m8(__VA_ARGS__)
+#define vundefined_i8mf8(...) __riscv_vundefined_i8mf8(__VA_ARGS__)
+#define vundefined_i8mf4(...) __riscv_vundefined_i8mf4(__VA_ARGS__)
+#define vundefined_i8mf2(...) __riscv_vundefined_i8mf2(__VA_ARGS__)
+#define vundefined_i8m1(...) __riscv_vundefined_i8m1(__VA_ARGS__)
+#define vundefined_i8m2(...) __riscv_vundefined_i8m2(__VA_ARGS__)
+#define vundefined_i8m4(...) __riscv_vundefined_i8m4(__VA_ARGS__)
+#define vundefined_i8m8(...) __riscv_vundefined_i8m8(__VA_ARGS__)
+#define vundefined_i16mf4(...) __riscv_vundefined_i16mf4(__VA_ARGS__)
+#define vundefined_i16mf2(...) __riscv_vundefined_i16mf2(__VA_ARGS__)
+#define vundefined_i16m1(...) __riscv_vundefined_i16m1(__VA_ARGS__)
+#define vundefined_i16m2(...) __riscv_vundefined_i16m2(__VA_ARGS__)
+#define vundefined_i16m4(...) __riscv_vundefined_i16m4(__VA_ARGS__)
+#define vundefined_i16m8(...) __riscv_vundefined_i16m8(__VA_ARGS__)
+#define vundefined_i32mf2(...) __riscv_vundefined_i32mf2(__VA_ARGS__)
+#define vundefined_i32m1(...) __riscv_vundefined_i32m1(__VA_ARGS__)
+#define vundefined_i32m2(...) __riscv_vundefined_i32m2(__VA_ARGS__)
+#define vundefined_i32m4(...) __riscv_vundefined_i32m4(__VA_ARGS__)
+#define vundefined_i32m8(...) __riscv_vundefined_i32m8(__VA_ARGS__)
+#define vundefined_i64m1(...) __riscv_vundefined_i64m1(__VA_ARGS__)
+#define vundefined_i64m2(...) __riscv_vundefined_i64m2(__VA_ARGS__)
+#define vundefined_i64m4(...) __riscv_vundefined_i64m4(__VA_ARGS__)
+#define vundefined_i64m8(...) __riscv_vundefined_i64m8(__VA_ARGS__)
+#define vundefined_u8mf8(...) __riscv_vundefined_u8mf8(__VA_ARGS__)
+#define vundefined_u8mf4(...) __riscv_vundefined_u8mf4(__VA_ARGS__)
+#define vundefined_u8mf2(...) __riscv_vundefined_u8mf2(__VA_ARGS__)
+#define vundefined_u8m1(...) __riscv_vundefined_u8m1(__VA_ARGS__)
+#define vundefined_u8m2(...) __riscv_vundefined_u8m2(__VA_ARGS__)
+#define vundefined_u8m4(...) __riscv_vundefined_u8m4(__VA_ARGS__)
+#define vundefined_u8m8(...) __riscv_vundefined_u8m8(__VA_ARGS__)
+#define vundefined_u16mf4(...) __riscv_vundefined_u16mf4(__VA_ARGS__)
+#define vundefined_u16mf2(...) __riscv_vundefined_u16mf2(__VA_ARGS__)
+#define vundefined_u16m1(...) __riscv_vundefined_u16m1(__VA_ARGS__)
+#define vundefined_u16m2(...) __riscv_vundefined_u16m2(__VA_ARGS__)
+#define vundefined_u16m4(...) __riscv_vundefined_u16m4(__VA_ARGS__)
+#define vundefined_u16m8(...) __riscv_vundefined_u16m8(__VA_ARGS__)
+#define vundefined_u32mf2(...) __riscv_vundefined_u32mf2(__VA_ARGS__)
+#define vundefined_u32m1(...) __riscv_vundefined_u32m1(__VA_ARGS__)
+#define vundefined_u32m2(...) __riscv_vundefined_u32m2(__VA_ARGS__)
+#define vundefined_u32m4(...) __riscv_vundefined_u32m4(__VA_ARGS__)
+#define vundefined_u32m8(...) __riscv_vundefined_u32m8(__VA_ARGS__)
+#define vundefined_u64m1(...) __riscv_vundefined_u64m1(__VA_ARGS__)
+#define vundefined_u64m2(...) __riscv_vundefined_u64m2(__VA_ARGS__)
+#define vundefined_u64m4(...) __riscv_vundefined_u64m4(__VA_ARGS__)
+#define vundefined_u64m8(...) __riscv_vundefined_u64m8(__VA_ARGS__)
+#define vset_v_f16m1_f16m2(...) __riscv_vset_v_f16m1_f16m2(__VA_ARGS__)
+#define vset_v_f16m1_f16m4(...) __riscv_vset_v_f16m1_f16m4(__VA_ARGS__)
+#define vset_v_f16m2_f16m4(...) __riscv_vset_v_f16m2_f16m4(__VA_ARGS__)
+#define vset_v_f16m1_f16m8(...) __riscv_vset_v_f16m1_f16m8(__VA_ARGS__)
+#define vset_v_f16m2_f16m8(...) __riscv_vset_v_f16m2_f16m8(__VA_ARGS__)
+#define vset_v_f16m4_f16m8(...) __riscv_vset_v_f16m4_f16m8(__VA_ARGS__)
+#define vset_v_f32m1_f32m2(...) __riscv_vset_v_f32m1_f32m2(__VA_ARGS__)
+#define vset_v_f32m1_f32m4(...) __riscv_vset_v_f32m1_f32m4(__VA_ARGS__)
+#define vset_v_f32m2_f32m4(...) __riscv_vset_v_f32m2_f32m4(__VA_ARGS__)
+#define vset_v_f32m1_f32m8(...) __riscv_vset_v_f32m1_f32m8(__VA_ARGS__)
+#define vset_v_f32m2_f32m8(...) __riscv_vset_v_f32m2_f32m8(__VA_ARGS__)
+#define vset_v_f32m4_f32m8(...) __riscv_vset_v_f32m4_f32m8(__VA_ARGS__)
+#define vset_v_f64m1_f64m2(...) __riscv_vset_v_f64m1_f64m2(__VA_ARGS__)
+#define vset_v_f64m1_f64m4(...) __riscv_vset_v_f64m1_f64m4(__VA_ARGS__)
+#define vset_v_f64m2_f64m4(...) __riscv_vset_v_f64m2_f64m4(__VA_ARGS__)
+#define vset_v_f64m1_f64m8(...) __riscv_vset_v_f64m1_f64m8(__VA_ARGS__)
+#define vset_v_f64m2_f64m8(...) __riscv_vset_v_f64m2_f64m8(__VA_ARGS__)
+#define vset_v_f64m4_f64m8(...) __riscv_vset_v_f64m4_f64m8(__VA_ARGS__)
+#define vset_v_i8m1_i8m2(...) __riscv_vset_v_i8m1_i8m2(__VA_ARGS__)
+#define vset_v_i8m1_i8m4(...) __riscv_vset_v_i8m1_i8m4(__VA_ARGS__)
+#define vset_v_i8m2_i8m4(...) __riscv_vset_v_i8m2_i8m4(__VA_ARGS__)
+#define vset_v_i8m1_i8m8(...) __riscv_vset_v_i8m1_i8m8(__VA_ARGS__)
+#define vset_v_i8m2_i8m8(...) __riscv_vset_v_i8m2_i8m8(__VA_ARGS__)
+#define vset_v_i8m4_i8m8(...) __riscv_vset_v_i8m4_i8m8(__VA_ARGS__)
+#define vset_v_i16m1_i16m2(...) __riscv_vset_v_i16m1_i16m2(__VA_ARGS__)
+#define vset_v_i16m1_i16m4(...) __riscv_vset_v_i16m1_i16m4(__VA_ARGS__)
+#define vset_v_i16m2_i16m4(...) __riscv_vset_v_i16m2_i16m4(__VA_ARGS__)
+#define vset_v_i16m1_i16m8(...) __riscv_vset_v_i16m1_i16m8(__VA_ARGS__)
+#define vset_v_i16m2_i16m8(...) __riscv_vset_v_i16m2_i16m8(__VA_ARGS__)
+#define vset_v_i16m4_i16m8(...) __riscv_vset_v_i16m4_i16m8(__VA_ARGS__)
+#define vset_v_i32m1_i32m2(...) __riscv_vset_v_i32m1_i32m2(__VA_ARGS__)
+#define vset_v_i32m1_i32m4(...) __riscv_vset_v_i32m1_i32m4(__VA_ARGS__)
+#define vset_v_i32m2_i32m4(...) __riscv_vset_v_i32m2_i32m4(__VA_ARGS__)
+#define vset_v_i32m1_i32m8(...) __riscv_vset_v_i32m1_i32m8(__VA_ARGS__)
+#define vset_v_i32m2_i32m8(...) __riscv_vset_v_i32m2_i32m8(__VA_ARGS__)
+#define vset_v_i32m4_i32m8(...) __riscv_vset_v_i32m4_i32m8(__VA_ARGS__)
+#define vset_v_i64m1_i64m2(...) __riscv_vset_v_i64m1_i64m2(__VA_ARGS__)
+#define vset_v_i64m1_i64m4(...) __riscv_vset_v_i64m1_i64m4(__VA_ARGS__)
+#define vset_v_i64m2_i64m4(...) __riscv_vset_v_i64m2_i64m4(__VA_ARGS__)
+#define vset_v_i64m1_i64m8(...) __riscv_vset_v_i64m1_i64m8(__VA_ARGS__)
+#define vset_v_i64m2_i64m8(...) __riscv_vset_v_i64m2_i64m8(__VA_ARGS__)
+#define vset_v_i64m4_i64m8(...) __riscv_vset_v_i64m4_i64m8(__VA_ARGS__)
+#define vset_v_u8m1_u8m2(...) __riscv_vset_v_u8m1_u8m2(__VA_ARGS__)
+#define vset_v_u8m1_u8m4(...) __riscv_vset_v_u8m1_u8m4(__VA_ARGS__)
+#define vset_v_u8m2_u8m4(...) __riscv_vset_v_u8m2_u8m4(__VA_ARGS__)
+#define vset_v_u8m1_u8m8(...) __riscv_vset_v_u8m1_u8m8(__VA_ARGS__)
+#define vset_v_u8m2_u8m8(...) __riscv_vset_v_u8m2_u8m8(__VA_ARGS__)
+#define vset_v_u8m4_u8m8(...) __riscv_vset_v_u8m4_u8m8(__VA_ARGS__)
+#define vset_v_u16m1_u16m2(...) __riscv_vset_v_u16m1_u16m2(__VA_ARGS__)
+#define vset_v_u16m1_u16m4(...) __riscv_vset_v_u16m1_u16m4(__VA_ARGS__)
+#define vset_v_u16m2_u16m4(...) __riscv_vset_v_u16m2_u16m4(__VA_ARGS__)
+#define vset_v_u16m1_u16m8(...) __riscv_vset_v_u16m1_u16m8(__VA_ARGS__)
+#define vset_v_u16m2_u16m8(...) __riscv_vset_v_u16m2_u16m8(__VA_ARGS__)
+#define vset_v_u16m4_u16m8(...) __riscv_vset_v_u16m4_u16m8(__VA_ARGS__)
+#define vset_v_u32m1_u32m2(...) __riscv_vset_v_u32m1_u32m2(__VA_ARGS__)
+#define vset_v_u32m1_u32m4(...) __riscv_vset_v_u32m1_u32m4(__VA_ARGS__)
+#define vset_v_u32m2_u32m4(...) __riscv_vset_v_u32m2_u32m4(__VA_ARGS__)
+#define vset_v_u32m1_u32m8(...) __riscv_vset_v_u32m1_u32m8(__VA_ARGS__)
+#define vset_v_u32m2_u32m8(...) __riscv_vset_v_u32m2_u32m8(__VA_ARGS__)
+#define vset_v_u32m4_u32m8(...) __riscv_vset_v_u32m4_u32m8(__VA_ARGS__)
+#define vset_v_u64m1_u64m2(...) __riscv_vset_v_u64m1_u64m2(__VA_ARGS__)
+#define vset_v_u64m1_u64m4(...) __riscv_vset_v_u64m1_u64m4(__VA_ARGS__)
+#define vset_v_u64m2_u64m4(...) __riscv_vset_v_u64m2_u64m4(__VA_ARGS__)
+#define vset_v_u64m1_u64m8(...) __riscv_vset_v_u64m1_u64m8(__VA_ARGS__)
+#define vset_v_u64m2_u64m8(...) __riscv_vset_v_u64m2_u64m8(__VA_ARGS__)
+#define vset_v_u64m4_u64m8(...) __riscv_vset_v_u64m4_u64m8(__VA_ARGS__)
+#define vget_v_f16m2_f16m1(...) __riscv_vget_v_f16m2_f16m1(__VA_ARGS__)
+#define vget_v_f16m4_f16m1(...) __riscv_vget_v_f16m4_f16m1(__VA_ARGS__)
+#define vget_v_f16m8_f16m1(...) __riscv_vget_v_f16m8_f16m1(__VA_ARGS__)
+#define vget_v_f16m4_f16m2(...) __riscv_vget_v_f16m4_f16m2(__VA_ARGS__)
+#define vget_v_f16m8_f16m2(...) __riscv_vget_v_f16m8_f16m2(__VA_ARGS__)
+#define vget_v_f16m8_f16m4(...) __riscv_vget_v_f16m8_f16m4(__VA_ARGS__)
+#define vget_v_f32m2_f32m1(...) __riscv_vget_v_f32m2_f32m1(__VA_ARGS__)
+#define vget_v_f32m4_f32m1(...) __riscv_vget_v_f32m4_f32m1(__VA_ARGS__)
+#define vget_v_f32m8_f32m1(...) __riscv_vget_v_f32m8_f32m1(__VA_ARGS__)
+#define vget_v_f32m4_f32m2(...) __riscv_vget_v_f32m4_f32m2(__VA_ARGS__)
+#define vget_v_f32m8_f32m2(...) __riscv_vget_v_f32m8_f32m2(__VA_ARGS__)
+#define vget_v_f32m8_f32m4(...) __riscv_vget_v_f32m8_f32m4(__VA_ARGS__)
+#define vget_v_f64m2_f64m1(...) __riscv_vget_v_f64m2_f64m1(__VA_ARGS__)
+#define vget_v_f64m4_f64m1(...) __riscv_vget_v_f64m4_f64m1(__VA_ARGS__)
+#define vget_v_f64m8_f64m1(...) __riscv_vget_v_f64m8_f64m1(__VA_ARGS__)
+#define vget_v_f64m4_f64m2(...) __riscv_vget_v_f64m4_f64m2(__VA_ARGS__)
+#define vget_v_f64m8_f64m2(...) __riscv_vget_v_f64m8_f64m2(__VA_ARGS__)
+#define vget_v_f64m8_f64m4(...) __riscv_vget_v_f64m8_f64m4(__VA_ARGS__)
+#define vget_v_i8m2_i8m1(...) __riscv_vget_v_i8m2_i8m1(__VA_ARGS__)
+#define vget_v_i8m4_i8m1(...) __riscv_vget_v_i8m4_i8m1(__VA_ARGS__)
+#define vget_v_i8m8_i8m1(...) __riscv_vget_v_i8m8_i8m1(__VA_ARGS__)
+#define vget_v_i8m4_i8m2(...) __riscv_vget_v_i8m4_i8m2(__VA_ARGS__)
+#define vget_v_i8m8_i8m2(...) __riscv_vget_v_i8m8_i8m2(__VA_ARGS__)
+#define vget_v_i8m8_i8m4(...) __riscv_vget_v_i8m8_i8m4(__VA_ARGS__)
+#define vget_v_i16m2_i16m1(...) __riscv_vget_v_i16m2_i16m1(__VA_ARGS__)
+#define vget_v_i16m4_i16m1(...) __riscv_vget_v_i16m4_i16m1(__VA_ARGS__)
+#define vget_v_i16m8_i16m1(...) __riscv_vget_v_i16m8_i16m1(__VA_ARGS__)
+#define vget_v_i16m4_i16m2(...) __riscv_vget_v_i16m4_i16m2(__VA_ARGS__)
+#define vget_v_i16m8_i16m2(...) __riscv_vget_v_i16m8_i16m2(__VA_ARGS__)
+#define vget_v_i16m8_i16m4(...) __riscv_vget_v_i16m8_i16m4(__VA_ARGS__)
+#define vget_v_i32m2_i32m1(...) __riscv_vget_v_i32m2_i32m1(__VA_ARGS__)
+#define vget_v_i32m4_i32m1(...) __riscv_vget_v_i32m4_i32m1(__VA_ARGS__)
+#define vget_v_i32m8_i32m1(...) __riscv_vget_v_i32m8_i32m1(__VA_ARGS__)
+#define vget_v_i32m4_i32m2(...) __riscv_vget_v_i32m4_i32m2(__VA_ARGS__)
+#define vget_v_i32m8_i32m2(...) __riscv_vget_v_i32m8_i32m2(__VA_ARGS__)
+#define vget_v_i32m8_i32m4(...) __riscv_vget_v_i32m8_i32m4(__VA_ARGS__)
+#define vget_v_i64m2_i64m1(...) __riscv_vget_v_i64m2_i64m1(__VA_ARGS__)
+#define vget_v_i64m4_i64m1(...) __riscv_vget_v_i64m4_i64m1(__VA_ARGS__)
+#define vget_v_i64m8_i64m1(...) __riscv_vget_v_i64m8_i64m1(__VA_ARGS__)
+#define vget_v_i64m4_i64m2(...) __riscv_vget_v_i64m4_i64m2(__VA_ARGS__)
+#define vget_v_i64m8_i64m2(...) __riscv_vget_v_i64m8_i64m2(__VA_ARGS__)
+#define vget_v_i64m8_i64m4(...) __riscv_vget_v_i64m8_i64m4(__VA_ARGS__)
+#define vget_v_u8m2_u8m1(...) __riscv_vget_v_u8m2_u8m1(__VA_ARGS__)
+#define vget_v_u8m4_u8m1(...) __riscv_vget_v_u8m4_u8m1(__VA_ARGS__)
+#define vget_v_u8m8_u8m1(...) __riscv_vget_v_u8m8_u8m1(__VA_ARGS__)
+#define vget_v_u8m4_u8m2(...) __riscv_vget_v_u8m4_u8m2(__VA_ARGS__)
+#define vget_v_u8m8_u8m2(...) __riscv_vget_v_u8m8_u8m2(__VA_ARGS__)
+#define vget_v_u8m8_u8m4(...) __riscv_vget_v_u8m8_u8m4(__VA_ARGS__)
+#define vget_v_u16m2_u16m1(...) __riscv_vget_v_u16m2_u16m1(__VA_ARGS__)
+#define vget_v_u16m4_u16m1(...) __riscv_vget_v_u16m4_u16m1(__VA_ARGS__)
+#define vget_v_u16m8_u16m1(...) __riscv_vget_v_u16m8_u16m1(__VA_ARGS__)
+#define vget_v_u16m4_u16m2(...) __riscv_vget_v_u16m4_u16m2(__VA_ARGS__)
+#define vget_v_u16m8_u16m2(...) __riscv_vget_v_u16m8_u16m2(__VA_ARGS__)
+#define vget_v_u16m8_u16m4(...) __riscv_vget_v_u16m8_u16m4(__VA_ARGS__)
+#define vget_v_u32m2_u32m1(...) __riscv_vget_v_u32m2_u32m1(__VA_ARGS__)
+#define vget_v_u32m4_u32m1(...) __riscv_vget_v_u32m4_u32m1(__VA_ARGS__)
+#define vget_v_u32m8_u32m1(...) __riscv_vget_v_u32m8_u32m1(__VA_ARGS__)
+#define vget_v_u32m4_u32m2(...) __riscv_vget_v_u32m4_u32m2(__VA_ARGS__)
+#define vget_v_u32m8_u32m2(...) __riscv_vget_v_u32m8_u32m2(__VA_ARGS__)
+#define vget_v_u32m8_u32m4(...) __riscv_vget_v_u32m8_u32m4(__VA_ARGS__)
+#define vget_v_u64m2_u64m1(...) __riscv_vget_v_u64m2_u64m1(__VA_ARGS__)
+#define vget_v_u64m4_u64m1(...) __riscv_vget_v_u64m4_u64m1(__VA_ARGS__)
+#define vget_v_u64m8_u64m1(...) __riscv_vget_v_u64m8_u64m1(__VA_ARGS__)
+#define vget_v_u64m4_u64m2(...) __riscv_vget_v_u64m4_u64m2(__VA_ARGS__)
+#define vget_v_u64m8_u64m2(...) __riscv_vget_v_u64m8_u64m2(__VA_ARGS__)
+#define vget_v_u64m8_u64m4(...) __riscv_vget_v_u64m8_u64m4(__VA_ARGS__)
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_010_compat_overloaded-non-policy.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_010_compat_overloaded-non-policy.hpp
new file mode 100644
index 000000000000..12a34d20d439
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_010_compat_overloaded-non-policy.hpp
@@ -0,0 +1,768 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copied from
+// https://github.com/riscv-non-isa/rvv-intrinsic-doc/tree/master/auto-generated/rvv-v0p10-compatible-headers
+
+#ifndef __RVV_0P10_COMPATIBLE_HEADERS_OVERLOADED_NON_POLICY_H
+#define __RVV_0P10_COMPATIBLE_HEADERS_OVERLOADED_NON_POLICY_H
+
+
+// _GET_OVERRIDE expands to its 21st argument. Each wrapper passes __VA_ARGS__
+// plus a 20-entry table, so a call with N arguments selects table entry 21 - N.
+// 20 is the maximum argument count, reached by segment loads with NFIELD (NF) = 8:
+// 8 vector register pointers, 1 mask, 8 undisturbed-policy passthroughs, base, byte index, vl.
+#define _GET_OVERRIDE(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13,\
+_14, _15, _16, _17, _18, _19, _20, NAME, ...) NAME
+
+
+#if __has_include ("riscv_vector.h") // probe first so a missing header is not a hard error here; NOTE(review): probe uses the quote form, the include below uses the angle form
+#include <riscv_vector.h>
+#endif
+#ifndef __RISCV_VECTOR_H // NOTE(review): presumably the include guard of the toolchain's riscv_vector.h - confirm
+#include_next <riscv_vector.h> // GNU extension: resumes the header search after the directory of the current file
+#endif
+
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// vmerge: takes the mask first and forwards to __riscv_vmerge with the mask moved after the two operands.
+#define vmerge(mask, op1, op2, vl) __riscv_vmerge((op1), (op2), (mask), (vl))
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// vfmerge: takes the mask first and forwards to __riscv_vfmerge with the mask moved after the two operands.
+#define vfmerge(mask, op1, op2, vl) __riscv_vfmerge((op1), (op2), (mask), (vl))
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// masked functions
+// vcompress: takes the mask first and forwards to the __riscv_vcompress_tu variant as (dest, src, mask, vl).
+#define vcompress(mask, dest, src, vl) __riscv_vcompress_tu((dest), (src), (mask), (vl))
+// Reinterpret between different type under the same SEW/LMUL
+// Reinterpret between different SEW under the same LMUL
+#define vse16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse16, __riscv_vse16, 2, 1)(__VA_ARGS__) // vse<w>: calls with 3 or 4 arguments both select __riscv_vse<w>; other counts pick a placeholder number.
+#define vse32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse32, __riscv_vse32, 2, 1)(__VA_ARGS__)
+#define vse64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse64, __riscv_vse64, 2, 1)(__VA_ARGS__)
+#define vse8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vse8, __riscv_vse8, 2, 1)(__VA_ARGS__)
+#define vsse16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse16, __riscv_vsse16, 3, 2, 1)(__VA_ARGS__) // vsse<w>: calls with 4 or 5 arguments both select __riscv_vsse<w>; other counts pick a placeholder number.
+#define vsse32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse32, __riscv_vsse32, 3, 2, 1)(__VA_ARGS__)
+#define vsse64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse64, __riscv_vsse64, 3, 2, 1)(__VA_ARGS__)
+#define vsse8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsse8, __riscv_vsse8, 3, 2, 1)(__VA_ARGS__)
+#define vloxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei8_tumu, 4, __riscv_vloxei8, 2, 1)(__VA_ARGS__) // vloxei<w>: 3 arguments select __riscv_vloxei<w>, 5 arguments select __riscv_vloxei<w>_tumu.
+#define vloxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei16_tumu, 4, __riscv_vloxei16, 2, 1)(__VA_ARGS__)
+#define vloxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei32_tumu, 4, __riscv_vloxei32, 2, 1)(__VA_ARGS__)
+#define vloxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vloxei64_tumu, 4, __riscv_vloxei64, 2, 1)(__VA_ARGS__)
+#define vluxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei8_tumu, 4, __riscv_vluxei8, 2, 1)(__VA_ARGS__) // vluxei<w>: 3 arguments select __riscv_vluxei<w>, 5 arguments select __riscv_vluxei<w>_tumu.
+#define vluxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei16_tumu, 4, __riscv_vluxei16, 2, 1)(__VA_ARGS__)
+#define vluxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei32_tumu, 4, __riscv_vluxei32, 2, 1)(__VA_ARGS__)
+#define vluxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vluxei64_tumu, 4, __riscv_vluxei64, 2, 1)(__VA_ARGS__)
+#define vsoxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei8, __riscv_vsoxei8, 3, 2, 1)(__VA_ARGS__) // vsoxei<w>: calls with 4 or 5 arguments both select __riscv_vsoxei<w>; other counts pick a placeholder number.
+#define vsoxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei16, __riscv_vsoxei16, 3, 2, 1)(__VA_ARGS__)
+#define vsoxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei32, __riscv_vsoxei32, 3, 2, 1)(__VA_ARGS__)
+#define vsoxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsoxei64, __riscv_vsoxei64, 3, 2, 1)(__VA_ARGS__)
+#define vsuxei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei8, __riscv_vsuxei8, 3, 2, 1)(__VA_ARGS__) // vsuxei<w>: calls with 4 or 5 arguments both select __riscv_vsuxei<w>; other counts pick a placeholder number.
+#define vsuxei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei16, __riscv_vsuxei16, 3, 2, 1)(__VA_ARGS__)
+#define vsuxei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei32, __riscv_vsuxei32, 3, 2, 1)(__VA_ARGS__)
+#define vsuxei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsuxei64, __riscv_vsuxei64, 3, 2, 1)(__VA_ARGS__)
+#define vsseg2e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e16, __riscv_vsseg2e16, 3, 2, 1)(__VA_ARGS__) // vsseg<k>e16 (k = 2..8): calls with k+2 or k+3 arguments both select __riscv_vsseg<k>e16.
+#define vsseg3e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e16, __riscv_vsseg3e16, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg4e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e16, __riscv_vsseg4e16, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg5e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e16, __riscv_vsseg5e16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg6e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e16, __riscv_vsseg6e16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg7e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e16, __riscv_vsseg7e16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg8e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e16, __riscv_vsseg8e16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg2e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e32, __riscv_vsseg2e32, 3, 2, 1)(__VA_ARGS__) // vsseg<k>e32 (k = 2..8): calls with k+2 or k+3 arguments both select __riscv_vsseg<k>e32.
+#define vsseg3e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e32, __riscv_vsseg3e32, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg4e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e32, __riscv_vsseg4e32, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg5e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e32, __riscv_vsseg5e32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg6e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e32, __riscv_vsseg6e32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg7e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e32, __riscv_vsseg7e32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg8e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e32, __riscv_vsseg8e32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg2e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e64, __riscv_vsseg2e64, 3, 2, 1)(__VA_ARGS__) // vsseg<k>e64 (k = 2..8): calls with k+2 or k+3 arguments both select __riscv_vsseg<k>e64.
+#define vsseg3e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e64, __riscv_vsseg3e64, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg4e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e64, __riscv_vsseg4e64, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg5e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e64, __riscv_vsseg5e64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg6e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e64, __riscv_vsseg6e64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg7e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e64, __riscv_vsseg7e64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg8e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e64, __riscv_vsseg8e64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg2e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsseg2e8, __riscv_vsseg2e8, 3, 2, 1)(__VA_ARGS__) // vsseg<k>e8 (k = 2..8): calls with k+2 or k+3 arguments both select __riscv_vsseg<k>e8.
+#define vsseg3e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsseg3e8, __riscv_vsseg3e8, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg4e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsseg4e8, __riscv_vsseg4e8, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg5e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsseg5e8, __riscv_vsseg5e8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg6e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsseg6e8, __riscv_vsseg6e8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg7e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsseg7e8, __riscv_vsseg7e8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsseg8e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsseg8e8, __riscv_vsseg8e8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg2e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e16, __riscv_vssseg2e16, 4, 3, 2, 1)(__VA_ARGS__) // vssseg<k>e16 (k = 2..8): calls with k+3 or k+4 arguments both select __riscv_vssseg<k>e16.
+#define vssseg3e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e16, __riscv_vssseg3e16, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg4e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e16, __riscv_vssseg4e16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg5e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e16, __riscv_vssseg5e16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg6e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e16, __riscv_vssseg6e16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg7e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e16, __riscv_vssseg7e16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg8e16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e16, __riscv_vssseg8e16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg2e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e32, __riscv_vssseg2e32, 4, 3, 2, 1)(__VA_ARGS__) // vssseg<k>e32 (k = 2..8): calls with k+3 or k+4 arguments both select __riscv_vssseg<k>e32.
+#define vssseg3e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e32, __riscv_vssseg3e32, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg4e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e32, __riscv_vssseg4e32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg5e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e32, __riscv_vssseg5e32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg6e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e32, __riscv_vssseg6e32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg7e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e32, __riscv_vssseg7e32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg8e32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e32, __riscv_vssseg8e32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg2e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e64, __riscv_vssseg2e64, 4, 3, 2, 1)(__VA_ARGS__) // vssseg<k>e64 (k = 2..8): calls with k+3 or k+4 arguments both select __riscv_vssseg<k>e64.
+#define vssseg3e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e64, __riscv_vssseg3e64, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg4e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e64, __riscv_vssseg4e64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg5e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e64, __riscv_vssseg5e64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg6e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e64, __riscv_vssseg6e64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg7e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e64, __riscv_vssseg7e64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg8e64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e64, __riscv_vssseg8e64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg2e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vssseg2e8, __riscv_vssseg2e8, 4, 3, 2, 1)(__VA_ARGS__) // vssseg<k>e8 (k = 2..8): calls with k+3 or k+4 arguments both select __riscv_vssseg<k>e8.
+#define vssseg3e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vssseg3e8, __riscv_vssseg3e8, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg4e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vssseg4e8, __riscv_vssseg4e8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg5e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vssseg5e8, __riscv_vssseg5e8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg6e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vssseg6e8, __riscv_vssseg6e8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg7e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vssseg7e8, __riscv_vssseg7e8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vssseg8e8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vssseg8e8, __riscv_vssseg8e8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei8_tumu, 7, 6, __riscv_vloxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__) // vloxseg<k>ei8 (k = 2..8): k+3 arguments select __riscv_vloxseg<k>ei8, 2k+4 arguments select the _tumu variant.
+#define vloxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei8_tumu, 9, 8, 7, __riscv_vloxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei8_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei8_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei8_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei8_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei8_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei16_tumu, 7, 6, __riscv_vloxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__) // vloxseg<k>ei16 (k = 2..8): k+3 arguments select __riscv_vloxseg<k>ei16, 2k+4 arguments select the _tumu variant.
+#define vloxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei16_tumu, 9, 8, 7, __riscv_vloxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei16_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei16_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei16_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei16_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei16_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei32_tumu, 7, 6, __riscv_vloxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__) // vloxseg<k>ei32 (k = 2..8): k+3 arguments select __riscv_vloxseg<k>ei32, 2k+4 arguments select the _tumu variant.
+#define vloxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei32_tumu, 9, 8, 7, __riscv_vloxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei32_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei32_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei32_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei32_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei32_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vloxseg2ei64_tumu, 7, 6, __riscv_vloxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__) // vloxseg<k>ei64 (k = 2..8): k+3 arguments select __riscv_vloxseg<k>ei64, 2k+4 arguments select the _tumu variant.
+#define vloxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg3ei64_tumu, 9, 8, 7, __riscv_vloxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vloxseg4ei64_tumu, 11, 10, 9, 8, __riscv_vloxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vloxseg5ei64_tumu, 13, 12, 11, 10, 9, __riscv_vloxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vloxseg6ei64_tumu, 15, 14, 13, 12, 11, 10, __riscv_vloxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vloxseg7ei64_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vloxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vloxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vloxseg8ei64_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vloxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei8_tumu, 7, 6, __riscv_vluxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__) // vluxseg<k>ei8 (k = 2..8): k+3 arguments select __riscv_vluxseg<k>ei8, 2k+4 arguments select the _tumu variant.
+#define vluxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei8_tumu, 9, 8, 7, __riscv_vluxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei8_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei8_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei8_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei8_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei8_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei16_tumu, 7, 6, __riscv_vluxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__) // vluxseg<k>ei16 (k = 2..8): k+3 arguments select __riscv_vluxseg<k>ei16, 2k+4 arguments select the _tumu variant.
+#define vluxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei16_tumu, 9, 8, 7, __riscv_vluxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei16_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei16_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei16_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei16_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei16_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei32_tumu, 7, 6, __riscv_vluxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__) // vluxseg<k>ei32 (k = 2..8): k+3 arguments select __riscv_vluxseg<k>ei32, 2k+4 arguments select the _tumu variant.
+#define vluxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei32_tumu, 9, 8, 7, __riscv_vluxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei32_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei32_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei32_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei32_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei32_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vluxseg2ei64_tumu, 7, 6, __riscv_vluxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__) // vluxseg<k>ei64 (k = 2..8): k+3 arguments select __riscv_vluxseg<k>ei64, 2k+4 arguments select the _tumu variant.
+#define vluxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg3ei64_tumu, 9, 8, 7, __riscv_vluxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vluxseg4ei64_tumu, 11, 10, 9, 8, __riscv_vluxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, __riscv_vluxseg5ei64_tumu, 13, 12, 11, 10, 9, __riscv_vluxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, __riscv_vluxseg6ei64_tumu, 15, 14, 13, 12, 11, 10, __riscv_vluxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, __riscv_vluxseg7ei64_tumu, 17, 16, 15, 14, 13, 12, 11, __riscv_vluxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vluxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, __riscv_vluxseg8ei64_tumu, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vluxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei8, __riscv_vsoxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__) // vsoxseg<k>ei8 (k = 2..8): calls with k+3 or k+4 arguments both select __riscv_vsoxseg<k>ei8.
+#define vsoxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei8, __riscv_vsoxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei8, __riscv_vsoxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei8, __riscv_vsoxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei8, __riscv_vsoxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei8, __riscv_vsoxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei8, __riscv_vsoxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei16, __riscv_vsoxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__) // vsoxseg<k>ei16 (k = 2..8): calls with k+3 or k+4 arguments both select __riscv_vsoxseg<k>ei16.
+#define vsoxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei16, __riscv_vsoxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei16, __riscv_vsoxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei16, __riscv_vsoxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei16, __riscv_vsoxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei16, __riscv_vsoxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei16, __riscv_vsoxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei32, __riscv_vsoxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__) // vsoxseg<k>ei32 (k = 2..8): calls with k+3 or k+4 arguments both select __riscv_vsoxseg<k>ei32.
+#define vsoxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei32, __riscv_vsoxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei32, __riscv_vsoxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei32, __riscv_vsoxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei32, __riscv_vsoxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei32, __riscv_vsoxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei32, __riscv_vsoxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsoxseg2ei64, __riscv_vsoxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__) // vsoxseg<k>ei64 (k = 2..8): calls with k+3 or k+4 arguments both select __riscv_vsoxseg<k>ei64.
+#define vsoxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsoxseg3ei64, __riscv_vsoxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsoxseg4ei64, __riscv_vsoxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsoxseg5ei64, __riscv_vsoxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsoxseg6ei64, __riscv_vsoxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsoxseg7ei64, __riscv_vsoxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsoxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsoxseg8ei64, __riscv_vsoxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg2ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei8, __riscv_vsuxseg2ei8, 4, 3, 2, 1)(__VA_ARGS__) // vsuxseg<k>ei8 (k = 2..8): calls with k+3 or k+4 arguments both select __riscv_vsuxseg<k>ei8.
+#define vsuxseg3ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei8, __riscv_vsuxseg3ei8, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg4ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei8, __riscv_vsuxseg4ei8, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg5ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei8, __riscv_vsuxseg5ei8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg6ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei8, __riscv_vsuxseg6ei8, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg7ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei8, __riscv_vsuxseg7ei8, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg8ei8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei8, __riscv_vsuxseg8ei8, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg2ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei16, __riscv_vsuxseg2ei16, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg3ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei16, __riscv_vsuxseg3ei16, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg4ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei16, __riscv_vsuxseg4ei16, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg5ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei16, __riscv_vsuxseg5ei16, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg6ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei16, __riscv_vsuxseg6ei16, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg7ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei16, __riscv_vsuxseg7ei16, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg8ei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei16, __riscv_vsuxseg8ei16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg2ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei32, __riscv_vsuxseg2ei32, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg3ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei32, __riscv_vsuxseg3ei32, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg4ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei32, __riscv_vsuxseg4ei32, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg5ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei32, __riscv_vsuxseg5ei32, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg6ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei32, __riscv_vsuxseg6ei32, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg7ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei32, __riscv_vsuxseg7ei32, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg8ei32(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei32, __riscv_vsuxseg8ei32, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg2ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, __riscv_vsuxseg2ei64, __riscv_vsuxseg2ei64, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg3ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, __riscv_vsuxseg3ei64, __riscv_vsuxseg3ei64, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg4ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, __riscv_vsuxseg4ei64, __riscv_vsuxseg4ei64, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg5ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, __riscv_vsuxseg5ei64, __riscv_vsuxseg5ei64, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg6ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, __riscv_vsuxseg6ei64, __riscv_vsuxseg6ei64, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg7ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, __riscv_vsuxseg7ei64, __riscv_vsuxseg7ei64, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vsuxseg8ei64(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, __riscv_vsuxseg8ei64, __riscv_vsuxseg8ei64, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)(__VA_ARGS__)
+#define vadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vadd_tumu, 4, __riscv_vadd, 2, 1)(__VA_ARGS__) // vadd ... vzext_vf8: each list carries two targets, the plain __riscv_ intrinsic and its _tumu variant two slots earlier. NOTE(review): _GET_OVERRIDE is defined outside this chunk - presumably it picks a slot by argument count; confirm.
+#define vsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsub_tumu, 4, __riscv_vsub, 2, 1)(__VA_ARGS__)
+#define vrsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrsub_tumu, 4, __riscv_vrsub, 2, 1)(__VA_ARGS__)
+#define vneg(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vneg_tumu, 3, __riscv_vneg, 1)(__VA_ARGS__) // targets sit one slot later here (slots "4" and "2") than in the macros above (slots "5" and "3")
+#define vwadd_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_vv_tumu, 4, __riscv_vwadd_vv, 2, 1)(__VA_ARGS__)
+#define vwadd_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_vx_tumu, 4, __riscv_vwadd_vx, 2, 1)(__VA_ARGS__)
+#define vwadd_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_wv_tumu, 4, __riscv_vwadd_wv, 2, 1)(__VA_ARGS__)
+#define vwadd_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwadd_wx_tumu, 4, __riscv_vwadd_wx, 2, 1)(__VA_ARGS__)
+#define vwsub_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_vv_tumu, 4, __riscv_vwsub_vv, 2, 1)(__VA_ARGS__)
+#define vwsub_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_vx_tumu, 4, __riscv_vwsub_vx, 2, 1)(__VA_ARGS__)
+#define vwsub_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_wv_tumu, 4, __riscv_vwsub_wv, 2, 1)(__VA_ARGS__)
+#define vwsub_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsub_wx_tumu, 4, __riscv_vwsub_wx, 2, 1)(__VA_ARGS__)
+#define vwaddu_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_vv_tumu, 4, __riscv_vwaddu_vv, 2, 1)(__VA_ARGS__)
+#define vwaddu_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_vx_tumu, 4, __riscv_vwaddu_vx, 2, 1)(__VA_ARGS__)
+#define vwaddu_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_wv_tumu, 4, __riscv_vwaddu_wv, 2, 1)(__VA_ARGS__)
+#define vwaddu_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwaddu_wx_tumu, 4, __riscv_vwaddu_wx, 2, 1)(__VA_ARGS__)
+#define vwsubu_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_vv_tumu, 4, __riscv_vwsubu_vv, 2, 1)(__VA_ARGS__)
+#define vwsubu_vx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_vx_tumu, 4, __riscv_vwsubu_vx, 2, 1)(__VA_ARGS__)
+#define vwsubu_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_wv_tumu, 4, __riscv_vwsubu_wv, 2, 1)(__VA_ARGS__)
+#define vwsubu_wx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwsubu_wx_tumu, 4, __riscv_vwsubu_wx, 2, 1)(__VA_ARGS__)
+#define vsext_vf2(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vsext_vf2_tumu, 3, __riscv_vsext_vf2, 1)(__VA_ARGS__)
+#define vsext_vf4(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vsext_vf4_tumu, 3, __riscv_vsext_vf4, 1)(__VA_ARGS__)
+#define vsext_vf8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vsext_vf8_tumu, 3, __riscv_vsext_vf8, 1)(__VA_ARGS__)
+#define vzext_vf2(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vzext_vf2_tumu, 3, __riscv_vzext_vf2, 1)(__VA_ARGS__)
+#define vzext_vf4(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vzext_vf4_tumu, 3, __riscv_vzext_vf4, 1)(__VA_ARGS__)
+#define vzext_vf8(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vzext_vf8_tumu, 3, __riscv_vzext_vf8, 1)(__VA_ARGS__)
+#define vadc(...) __riscv_vadc(__VA_ARGS__) // vadc/vsbc/vmadc/vmsbc: arguments are forwarded unchanged to the same-named __riscv_ intrinsic; no _GET_OVERRIDE dispatch is involved.
+#define vsbc(...) __riscv_vsbc(__VA_ARGS__)
+#define vmadc(...) __riscv_vmadc(__VA_ARGS__)
+#define vmsbc(...) __riscv_vmsbc(__VA_ARGS__)
+#define vand(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vand_tumu, 4, __riscv_vand, 2, 1)(__VA_ARGS__) // vand ... vnsrl: each list carries two targets, the plain __riscv_ intrinsic and its _tumu variant two slots earlier. NOTE(review): _GET_OVERRIDE is defined outside this chunk - presumably it picks a slot by argument count; confirm.
+#define vor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vor_tumu, 4, __riscv_vor, 2, 1)(__VA_ARGS__)
+#define vxor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vxor_tumu, 4, __riscv_vxor, 2, 1)(__VA_ARGS__)
+#define vnot(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vnot_tumu, 3, __riscv_vnot, 1)(__VA_ARGS__) // targets sit one slot later here (slots "4" and "2") than in the neighbouring macros (slots "5" and "3")
+#define vsll(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsll_tumu, 4, __riscv_vsll, 2, 1)(__VA_ARGS__)
+#define vsra(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsra_tumu, 4, __riscv_vsra, 2, 1)(__VA_ARGS__)
+#define vsrl(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsrl_tumu, 4, __riscv_vsrl, 2, 1)(__VA_ARGS__)
+#define vnsra(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnsra_tumu, 4, __riscv_vnsra, 2, 1)(__VA_ARGS__)
+#define vnsrl(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnsrl_tumu, 4, __riscv_vnsrl, 2, 1)(__VA_ARGS__)
+#define vmseq(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmseq_mu, 4, __riscv_vmseq, 2, 1)(__VA_ARGS__) // vmseq ... vmsgeu: each list carries two targets, the plain __riscv_ intrinsic and its _mu variant (not _tumu) two slots earlier. NOTE(review): _GET_OVERRIDE is defined outside this chunk - presumably it picks a slot by argument count; confirm.
+#define vmsne(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsne_mu, 4, __riscv_vmsne, 2, 1)(__VA_ARGS__)
+#define vmslt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmslt_mu, 4, __riscv_vmslt, 2, 1)(__VA_ARGS__)
+#define vmsle(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsle_mu, 4, __riscv_vmsle, 2, 1)(__VA_ARGS__)
+#define vmsgt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsgt_mu, 4, __riscv_vmsgt, 2, 1)(__VA_ARGS__)
+#define vmsge(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsge_mu, 4, __riscv_vmsge, 2, 1)(__VA_ARGS__)
+#define vmsltu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsltu_mu, 4, __riscv_vmsltu, 2, 1)(__VA_ARGS__)
+#define vmsleu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsleu_mu, 4, __riscv_vmsleu, 2, 1)(__VA_ARGS__)
+#define vmsgtu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsgtu_mu, 4, __riscv_vmsgtu, 2, 1)(__VA_ARGS__)
+#define vmsgeu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmsgeu_mu, 4, __riscv_vmsgeu, 2, 1)(__VA_ARGS__)
+#define vmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmin_tumu, 4, __riscv_vmin, 2, 1)(__VA_ARGS__) // vmin ... vwmulu: each list carries two targets, the plain __riscv_ intrinsic and its _tumu variant two slots earlier. NOTE(review): _GET_OVERRIDE is defined outside this chunk - presumably it picks a slot by argument count; confirm.
+#define vmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmax_tumu, 4, __riscv_vmax, 2, 1)(__VA_ARGS__)
+#define vminu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vminu_tumu, 4, __riscv_vminu, 2, 1)(__VA_ARGS__)
+#define vmaxu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmaxu_tumu, 4, __riscv_vmaxu, 2, 1)(__VA_ARGS__)
+#define vmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmul_tumu, 4, __riscv_vmul, 2, 1)(__VA_ARGS__)
+#define vmulh(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmulh_tumu, 4, __riscv_vmulh, 2, 1)(__VA_ARGS__)
+#define vmulhsu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmulhsu_tumu, 4, __riscv_vmulhsu, 2, 1)(__VA_ARGS__)
+#define vmulhu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmulhu_tumu, 4, __riscv_vmulhu, 2, 1)(__VA_ARGS__)
+#define vdiv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vdiv_tumu, 4, __riscv_vdiv, 2, 1)(__VA_ARGS__)
+#define vrem(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrem_tumu, 4, __riscv_vrem, 2, 1)(__VA_ARGS__)
+#define vdivu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vdivu_tumu, 4, __riscv_vdivu, 2, 1)(__VA_ARGS__)
+#define vremu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vremu_tumu, 4, __riscv_vremu, 2, 1)(__VA_ARGS__)
+#define vwmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmul_tumu, 4, __riscv_vwmul, 2, 1)(__VA_ARGS__)
+#define vwmulsu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmulsu_tumu, 4, __riscv_vwmulsu, 2, 1)(__VA_ARGS__)
+#define vwmulu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmulu_tumu, 4, __riscv_vwmulu, 2, 1)(__VA_ARGS__)
+#define vmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmacc_tumu, __riscv_vmacc_tu, 3, 2, 1)(__VA_ARGS__) // vmacc ... vwmaccu: the two targets are the _tumu and _tu variants in adjacent slots; no plain __riscv_ name appears in these lists. NOTE(review): _GET_OVERRIDE is defined outside this chunk - presumably it picks a slot by argument count; confirm.
+#define vnmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnmsac_tumu, __riscv_vnmsac_tu, 3, 2, 1)(__VA_ARGS__)
+#define vmadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmadd_tumu, __riscv_vmadd_tu, 3, 2, 1)(__VA_ARGS__)
+#define vnmsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnmsub_tumu, __riscv_vnmsub_tu, 3, 2, 1)(__VA_ARGS__)
+#define vwmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmacc_tumu, __riscv_vwmacc_tu, 3, 2, 1)(__VA_ARGS__)
+#define vwmaccsu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmaccsu_tumu, __riscv_vwmaccsu_tu, 3, 2, 1)(__VA_ARGS__)
+#define vwmaccus(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmaccus_tumu, __riscv_vwmaccus_tu, 3, 2, 1)(__VA_ARGS__)
+#define vwmaccu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwmaccu_tumu, __riscv_vwmaccu_tu, 3, 2, 1)(__VA_ARGS__)
+#define vmv_v(...) __riscv_vmv_v(__VA_ARGS__) // vmv_v: arguments are forwarded unchanged to __riscv_vmv_v; no _GET_OVERRIDE dispatch is involved.
+#define vsadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsadd_tumu, 4, __riscv_vsadd, 2, 1)(__VA_ARGS__) // vsadd ... vnclipu: each list carries two targets, the plain __riscv_ intrinsic and (vsmul excepted) its _tumu variant two slots earlier. NOTE(review): _GET_OVERRIDE is defined outside this chunk - presumably it picks a slot by argument count; confirm.
+#define vssub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssub_tumu, 4, __riscv_vssub, 2, 1)(__VA_ARGS__)
+#define vsaddu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsaddu_tumu, 4, __riscv_vsaddu, 2, 1)(__VA_ARGS__)
+#define vssubu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssubu_tumu, 4, __riscv_vssubu, 2, 1)(__VA_ARGS__)
+#define vaadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vaadd_tumu, 4, __riscv_vaadd, 2, 1)(__VA_ARGS__)
+#define vasub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vasub_tumu, 4, __riscv_vasub, 2, 1)(__VA_ARGS__)
+#define vaaddu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vaaddu_tumu, 4, __riscv_vaaddu, 2, 1)(__VA_ARGS__)
+#define vasubu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vasubu_tumu, 4, __riscv_vasubu, 2, 1)(__VA_ARGS__)
+#define vsmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vsmul_mu, 4, __riscv_vsmul, 2, 1)(__VA_ARGS__) // NOTE(review): targets the _mu variant, whereas every other macro from vsadd to vnclipu targets _tumu - confirm this is intentional.
+#define vssra(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssra_tumu, 4, __riscv_vssra, 2, 1)(__VA_ARGS__)
+#define vssrl(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vssrl_tumu, 4, __riscv_vssrl, 2, 1)(__VA_ARGS__)
+#define vnclip(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnclip_tumu, 4, __riscv_vnclip, 2, 1)(__VA_ARGS__)
+#define vnclipu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vnclipu_tumu, 4, __riscv_vnclipu, 2, 1)(__VA_ARGS__)
+#define vfadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfadd_tumu, 4, __riscv_vfadd, 2, 1)(__VA_ARGS__) // vfadd ... vfwmul: each list carries two targets, the plain __riscv_ intrinsic and its _tumu variant two slots earlier. NOTE(review): _GET_OVERRIDE is defined outside this chunk - presumably it picks a slot by argument count; confirm.
+#define vfsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsub_tumu, 4, __riscv_vfsub, 2, 1)(__VA_ARGS__)
+#define vfrsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfrsub_tumu, 4, __riscv_vfrsub, 2, 1)(__VA_ARGS__)
+#define vfneg(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfneg_tumu, 3, __riscv_vfneg, 1)(__VA_ARGS__) // targets sit one slot later here (slots "4" and "2") than in the neighbouring macros (slots "5" and "3")
+#define vfwadd_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_vv_tumu, 4, __riscv_vfwadd_vv, 2, 1)(__VA_ARGS__)
+#define vfwadd_vf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_vf_tumu, 4, __riscv_vfwadd_vf, 2, 1)(__VA_ARGS__)
+#define vfwadd_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_wv_tumu, 4, __riscv_vfwadd_wv, 2, 1)(__VA_ARGS__)
+#define vfwadd_wf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwadd_wf_tumu, 4, __riscv_vfwadd_wf, 2, 1)(__VA_ARGS__)
+#define vfwsub_vv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_vv_tumu, 4, __riscv_vfwsub_vv, 2, 1)(__VA_ARGS__)
+#define vfwsub_vf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_vf_tumu, 4, __riscv_vfwsub_vf, 2, 1)(__VA_ARGS__)
+#define vfwsub_wv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_wv_tumu, 4, __riscv_vfwsub_wv, 2, 1)(__VA_ARGS__)
+#define vfwsub_wf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwsub_wf_tumu, 4, __riscv_vfwsub_wf, 2, 1)(__VA_ARGS__)
+#define vfmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmul_tumu, 4, __riscv_vfmul, 2, 1)(__VA_ARGS__)
+#define vfdiv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfdiv_tumu, 4, __riscv_vfdiv, 2, 1)(__VA_ARGS__)
+#define vfrdiv(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfrdiv_tumu, 4, __riscv_vfrdiv, 2, 1)(__VA_ARGS__)
+#define vfwmul(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwmul_tumu, 4, __riscv_vfwmul, 2, 1)(__VA_ARGS__)
+#define vfmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmacc_tumu, __riscv_vfmacc_tu, 3, 2, 1)(__VA_ARGS__) // vfmacc ... vfwnmsac: the two targets are the _tumu and _tu variants in adjacent slots; no plain __riscv_ name appears in these lists. NOTE(review): _GET_OVERRIDE is defined outside this chunk - presumably it picks a slot by argument count; confirm.
+#define vfnmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmacc_tumu, __riscv_vfnmacc_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmsac_tumu, __riscv_vfmsac_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfnmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmsac_tumu, __riscv_vfnmsac_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfmadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmadd_tumu, __riscv_vfmadd_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfnmadd(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmadd_tumu, __riscv_vfnmadd_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfmsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmsub_tumu, __riscv_vfmsub_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfnmsub(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfnmsub_tumu, __riscv_vfnmsub_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfwmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwmacc_tumu, __riscv_vfwmacc_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfwnmacc(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwnmacc_tumu, __riscv_vfwnmacc_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfwmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwmsac_tumu, __riscv_vfwmsac_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfwnmsac(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwnmsac_tumu, __riscv_vfwnmsac_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfsqrt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfsqrt_tumu, 3, __riscv_vfsqrt, 1)(__VA_ARGS__) // vfsqrt ... vfabs: each list carries two targets, the plain __riscv_ intrinsic and its _tumu variant two slots earlier; vfsqrt/vfrsqrt7/vfrec7/vfabs use slots "4" and "2", the rest use "5" and "3". NOTE(review): _GET_OVERRIDE is defined outside this chunk - presumably it picks a slot by argument count; confirm.
+#define vfrsqrt7(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfrsqrt7_tumu, 3, __riscv_vfrsqrt7, 1)(__VA_ARGS__)
+#define vfrec7(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfrec7_tumu, 3, __riscv_vfrec7, 1)(__VA_ARGS__)
+#define vfmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmin_tumu, 4, __riscv_vfmin, 2, 1)(__VA_ARGS__)
+#define vfmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfmax_tumu, 4, __riscv_vfmax, 2, 1)(__VA_ARGS__)
+#define vfsgnj(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsgnj_tumu, 4, __riscv_vfsgnj, 2, 1)(__VA_ARGS__)
+#define vfsgnjn(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsgnjn_tumu, 4, __riscv_vfsgnjn, 2, 1)(__VA_ARGS__)
+#define vfsgnjx(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfsgnjx_tumu, 4, __riscv_vfsgnjx, 2, 1)(__VA_ARGS__)
+#define vfabs(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfabs_tumu, 3, __riscv_vfabs, 1)(__VA_ARGS__)
+#define vmfeq(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfeq_mu, 4, __riscv_vmfeq, 2, 1)(__VA_ARGS__) // vmfeq ... vmfge: each list carries two targets, the plain __riscv_ intrinsic and its _mu variant (not _tumu) two slots earlier. NOTE(review): _GET_OVERRIDE is defined outside this chunk - presumably it picks a slot by argument count; confirm.
+#define vmfne(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfne_mu, 4, __riscv_vmfne, 2, 1)(__VA_ARGS__)
+#define vmflt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmflt_mu, 4, __riscv_vmflt, 2, 1)(__VA_ARGS__)
+#define vmfle(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfle_mu, 4, __riscv_vmfle, 2, 1)(__VA_ARGS__)
+#define vmfgt(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfgt_mu, 4, __riscv_vmfgt, 2, 1)(__VA_ARGS__)
+#define vmfge(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vmfge_mu, 4, __riscv_vmfge, 2, 1)(__VA_ARGS__)
+#define vfclass(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfclass_tumu, 3, __riscv_vfclass, 1)(__VA_ARGS__) // vfclass ... vfncvt_rod_f: each list carries two targets, the plain __riscv_ intrinsic in slot "2" and its _tumu variant in slot "4". NOTE(review): _GET_OVERRIDE is defined outside this chunk - presumably it picks a slot by argument count; confirm.
+#define vfcvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_x_tumu, 3, __riscv_vfcvt_x, 1)(__VA_ARGS__)
+#define vfcvt_rtz_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_rtz_x_tumu, 3, __riscv_vfcvt_rtz_x, 1)(__VA_ARGS__)
+#define vfcvt_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_xu_tumu, 3, __riscv_vfcvt_xu, 1)(__VA_ARGS__)
+#define vfcvt_rtz_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_rtz_xu_tumu, 3, __riscv_vfcvt_rtz_xu, 1)(__VA_ARGS__)
+#define vfcvt_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfcvt_f_tumu, 3, __riscv_vfcvt_f, 1)(__VA_ARGS__)
+#define vwcvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vwcvt_x_tumu, 3, __riscv_vwcvt_x, 1)(__VA_ARGS__)
+#define vwcvtu_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vwcvtu_x_tumu, 3, __riscv_vwcvtu_x, 1)(__VA_ARGS__)
+#define vfwcvt_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_f_tumu, 3, __riscv_vfwcvt_f, 1)(__VA_ARGS__)
+#define vfwcvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_x_tumu, 3, __riscv_vfwcvt_x, 1)(__VA_ARGS__)
+#define vfwcvt_rtz_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_rtz_x_tumu, 3, __riscv_vfwcvt_rtz_x, 1)(__VA_ARGS__)
+#define vfwcvt_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_xu_tumu, 3, __riscv_vfwcvt_xu, 1)(__VA_ARGS__)
+#define vfwcvt_rtz_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfwcvt_rtz_xu_tumu, 3, __riscv_vfwcvt_rtz_xu, 1)(__VA_ARGS__)
+#define vfncvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_x_tumu, 3, __riscv_vfncvt_x, 1)(__VA_ARGS__)
+#define vfncvt_rtz_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_rtz_x_tumu, 3, __riscv_vfncvt_rtz_x, 1)(__VA_ARGS__)
+#define vncvt_x(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vncvt_x_tumu, 3, __riscv_vncvt_x, 1)(__VA_ARGS__)
+#define vfncvt_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_xu_tumu, 3, __riscv_vfncvt_xu, 1)(__VA_ARGS__)
+#define vfncvt_rtz_xu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_rtz_xu_tumu, 3, __riscv_vfncvt_rtz_xu, 1)(__VA_ARGS__)
+#define vfncvt_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_f_tumu, 3, __riscv_vfncvt_f, 1)(__VA_ARGS__)
+#define vfncvt_rod_f(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vfncvt_rod_f_tumu, 3, __riscv_vfncvt_rod_f, 1)(__VA_ARGS__)
+#define vredsum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredsum_tum, __riscv_vredsum_tu, 3, 2, 1)(__VA_ARGS__) // vredsum ... vfwredusum: the two targets are the _tum and _tu variants in adjacent slots; no plain __riscv_ name appears in these lists. NOTE(review): _GET_OVERRIDE is defined outside this chunk - presumably it picks a slot by argument count; confirm.
+#define vredmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredmax_tum, __riscv_vredmax_tu, 3, 2, 1)(__VA_ARGS__)
+#define vredmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredmin_tum, __riscv_vredmin_tu, 3, 2, 1)(__VA_ARGS__)
+#define vredand(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredand_tum, __riscv_vredand_tu, 3, 2, 1)(__VA_ARGS__)
+#define vredor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredor_tum, __riscv_vredor_tu, 3, 2, 1)(__VA_ARGS__)
+#define vredxor(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredxor_tum, __riscv_vredxor_tu, 3, 2, 1)(__VA_ARGS__)
+#define vredmaxu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredmaxu_tum, __riscv_vredmaxu_tu, 3, 2, 1)(__VA_ARGS__)
+#define vredminu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vredminu_tum, __riscv_vredminu_tu, 3, 2, 1)(__VA_ARGS__)
+#define vwredsum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwredsum_tum, __riscv_vwredsum_tu, 3, 2, 1)(__VA_ARGS__)
+#define vwredsumu(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vwredsumu_tum, __riscv_vwredsumu_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfredosum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredosum_tum, __riscv_vfredosum_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfredusum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredusum_tum, __riscv_vfredusum_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfredmax(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredmax_tum, __riscv_vfredmax_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfredmin(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfredmin_tum, __riscv_vfredmin_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfwredosum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwredosum_tum, __riscv_vfwredosum_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfwredusum(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfwredusum_tum, __riscv_vfwredusum_tu, 3, 2, 1)(__VA_ARGS__)
+#define vsm(...) __riscv_vsm(__VA_ARGS__) // vsm ... vmnot: arguments are forwarded unchanged to the same-named __riscv_ intrinsic; no _GET_OVERRIDE dispatch is involved.
+#define vmand(...) __riscv_vmand(__VA_ARGS__)
+#define vmnand(...) __riscv_vmnand(__VA_ARGS__)
+#define vmandn(...) __riscv_vmandn(__VA_ARGS__)
+#define vmxor(...) __riscv_vmxor(__VA_ARGS__)
+#define vmor(...) __riscv_vmor(__VA_ARGS__)
+#define vmnor(...) __riscv_vmnor(__VA_ARGS__)
+#define vmorn(...) __riscv_vmorn(__VA_ARGS__)
+#define vmxnor(...) __riscv_vmxnor(__VA_ARGS__)
+#define vmmv(...) __riscv_vmmv(__VA_ARGS__)
+#define vmnot(...) __riscv_vmnot(__VA_ARGS__)
+#define vcpop(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, __riscv_vcpop, __riscv_vcpop, 1)(__VA_ARGS__) // vcpop/vfirst: the same __riscv_ intrinsic fills both named slots ("3" and "2"). NOTE(review): _GET_OVERRIDE is defined outside this chunk - presumably it picks a slot by argument count; confirm.
+#define vfirst(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, __riscv_vfirst, __riscv_vfirst, 1)(__VA_ARGS__)
+#define vmsbf(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vmsbf_mu, 3, __riscv_vmsbf, 1)(__VA_ARGS__) // vmsbf/vmsif/vmsof: the plain __riscv_ intrinsic sits in slot "2" and its _mu variant in slot "4".
+#define vmsif(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vmsif_mu, 3, __riscv_vmsif, 1)(__VA_ARGS__)
+#define vmsof(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, __riscv_vmsof_mu, 3, __riscv_vmsof, 1)(__VA_ARGS__)
+#define vfmv_f(...) __riscv_vfmv_f(__VA_ARGS__) // vfmv_f and vmv_x forward unchanged to the same-named __riscv_ intrinsic.
+#define vfmv_s(...) __riscv_vfmv_s_tu(__VA_ARGS__) // vfmv_s and vmv_s forward to the _tu variant rather than the plain name.
+#define vmv_x(...) __riscv_vmv_x(__VA_ARGS__)
+#define vmv_s(...) __riscv_vmv_s_tu(__VA_ARGS__)
+#define vslideup(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslideup_tumu, __riscv_vslideup_tu, 3, 2, 1)(__VA_ARGS__) // vslideup/vslidedown: the two targets are the _tumu and _tu variants in adjacent slots ("5" and "4"). NOTE(review): _GET_OVERRIDE is defined outside this chunk - presumably it picks a slot by argument count; confirm.
+#define vslidedown(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslidedown_tumu, __riscv_vslidedown_tu, 3, 2, 1)(__VA_ARGS__)
+#define vfslide1up(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfslide1up_tumu, 4, __riscv_vfslide1up, 2, 1)(__VA_ARGS__) // vfslide1up ... vrgatherei16: the plain __riscv_ intrinsic sits in slot "3" and its _tumu variant in slot "5".
+#define vfslide1down(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vfslide1down_tumu, 4, __riscv_vfslide1down, 2, 1)(__VA_ARGS__)
+#define vslide1up(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslide1up_tumu, 4, __riscv_vslide1up, 2, 1)(__VA_ARGS__)
+#define vslide1down(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vslide1down_tumu, 4, __riscv_vslide1down, 2, 1)(__VA_ARGS__)
+#define vrgather(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrgather_tumu, 4, __riscv_vrgather, 2, 1)(__VA_ARGS__)
+#define vrgatherei16(...) _GET_OVERRIDE(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, __riscv_vrgatherei16_tumu, 4, __riscv_vrgatherei16, 2, 1)(__VA_ARGS__)
+#define vreinterpret_u8mf8(...) __riscv_vreinterpret_u8mf8(__VA_ARGS__)
+#define vreinterpret_u8mf4(...) __riscv_vreinterpret_u8mf4(__VA_ARGS__)
+#define vreinterpret_u8mf2(...) __riscv_vreinterpret_u8mf2(__VA_ARGS__)
+#define vreinterpret_u8m1(...) __riscv_vreinterpret_u8m1(__VA_ARGS__)
+#define vreinterpret_u8m2(...) __riscv_vreinterpret_u8m2(__VA_ARGS__)
+#define vreinterpret_u8m4(...) __riscv_vreinterpret_u8m4(__VA_ARGS__)
+#define vreinterpret_u8m8(...) __riscv_vreinterpret_u8m8(__VA_ARGS__)
+#define vreinterpret_i8mf8(...) __riscv_vreinterpret_i8mf8(__VA_ARGS__)
+#define vreinterpret_i8mf4(...) __riscv_vreinterpret_i8mf4(__VA_ARGS__)
+#define vreinterpret_i8mf2(...) __riscv_vreinterpret_i8mf2(__VA_ARGS__)
+#define vreinterpret_i8m1(...) __riscv_vreinterpret_i8m1(__VA_ARGS__)
+#define vreinterpret_i8m2(...) __riscv_vreinterpret_i8m2(__VA_ARGS__)
+#define vreinterpret_i8m4(...) __riscv_vreinterpret_i8m4(__VA_ARGS__)
+#define vreinterpret_i8m8(...) __riscv_vreinterpret_i8m8(__VA_ARGS__)
+#define vreinterpret_f16mf4(...) __riscv_vreinterpret_f16mf4(__VA_ARGS__)
+#define vreinterpret_f16mf2(...) __riscv_vreinterpret_f16mf2(__VA_ARGS__)
+#define vreinterpret_f16m1(...) __riscv_vreinterpret_f16m1(__VA_ARGS__)
+#define vreinterpret_f16m2(...) __riscv_vreinterpret_f16m2(__VA_ARGS__)
+#define vreinterpret_f16m4(...) __riscv_vreinterpret_f16m4(__VA_ARGS__)
+#define vreinterpret_f16m8(...) __riscv_vreinterpret_f16m8(__VA_ARGS__)
+#define vreinterpret_u16mf4(...) __riscv_vreinterpret_u16mf4(__VA_ARGS__)
+#define vreinterpret_u16mf2(...) __riscv_vreinterpret_u16mf2(__VA_ARGS__)
+#define vreinterpret_u16m1(...) __riscv_vreinterpret_u16m1(__VA_ARGS__)
+#define vreinterpret_u16m2(...) __riscv_vreinterpret_u16m2(__VA_ARGS__)
+#define vreinterpret_u16m4(...) __riscv_vreinterpret_u16m4(__VA_ARGS__)
+#define vreinterpret_u16m8(...) __riscv_vreinterpret_u16m8(__VA_ARGS__)
+#define vreinterpret_i16mf4(...) __riscv_vreinterpret_i16mf4(__VA_ARGS__)
+#define vreinterpret_i16mf2(...) __riscv_vreinterpret_i16mf2(__VA_ARGS__)
+#define vreinterpret_i16m1(...) __riscv_vreinterpret_i16m1(__VA_ARGS__)
+#define vreinterpret_i16m2(...) __riscv_vreinterpret_i16m2(__VA_ARGS__)
+#define vreinterpret_i16m4(...) __riscv_vreinterpret_i16m4(__VA_ARGS__)
+#define vreinterpret_i16m8(...) __riscv_vreinterpret_i16m8(__VA_ARGS__)
+#define vreinterpret_f32mf2(...) __riscv_vreinterpret_f32mf2(__VA_ARGS__)
+#define vreinterpret_f32m1(...) __riscv_vreinterpret_f32m1(__VA_ARGS__)
+#define vreinterpret_f32m2(...) __riscv_vreinterpret_f32m2(__VA_ARGS__)
+#define vreinterpret_f32m4(...) __riscv_vreinterpret_f32m4(__VA_ARGS__)
+#define vreinterpret_f32m8(...) __riscv_vreinterpret_f32m8(__VA_ARGS__)
+#define vreinterpret_u32mf2(...) __riscv_vreinterpret_u32mf2(__VA_ARGS__)
+#define vreinterpret_u32m1(...) __riscv_vreinterpret_u32m1(__VA_ARGS__)
+#define vreinterpret_u32m2(...) __riscv_vreinterpret_u32m2(__VA_ARGS__)
+#define vreinterpret_u32m4(...) __riscv_vreinterpret_u32m4(__VA_ARGS__)
+#define vreinterpret_u32m8(...) __riscv_vreinterpret_u32m8(__VA_ARGS__)
+#define vreinterpret_i32mf2(...) __riscv_vreinterpret_i32mf2(__VA_ARGS__)
+#define vreinterpret_i32m1(...) __riscv_vreinterpret_i32m1(__VA_ARGS__)
+#define vreinterpret_i32m2(...) __riscv_vreinterpret_i32m2(__VA_ARGS__)
+#define vreinterpret_i32m4(...) __riscv_vreinterpret_i32m4(__VA_ARGS__)
+#define vreinterpret_i32m8(...) __riscv_vreinterpret_i32m8(__VA_ARGS__)
+#define vreinterpret_f64m1(...) __riscv_vreinterpret_f64m1(__VA_ARGS__)
+#define vreinterpret_f64m2(...) __riscv_vreinterpret_f64m2(__VA_ARGS__)
+#define vreinterpret_f64m4(...) __riscv_vreinterpret_f64m4(__VA_ARGS__)
+#define vreinterpret_f64m8(...) __riscv_vreinterpret_f64m8(__VA_ARGS__)
+#define vreinterpret_u64m1(...) __riscv_vreinterpret_u64m1(__VA_ARGS__)
+#define vreinterpret_u64m2(...) __riscv_vreinterpret_u64m2(__VA_ARGS__)
+#define vreinterpret_u64m4(...) __riscv_vreinterpret_u64m4(__VA_ARGS__)
+#define vreinterpret_u64m8(...) __riscv_vreinterpret_u64m8(__VA_ARGS__)
+#define vreinterpret_i64m1(...) __riscv_vreinterpret_i64m1(__VA_ARGS__)
+#define vreinterpret_i64m2(...) __riscv_vreinterpret_i64m2(__VA_ARGS__)
+#define vreinterpret_i64m4(...) __riscv_vreinterpret_i64m4(__VA_ARGS__)
+#define vreinterpret_i64m8(...) __riscv_vreinterpret_i64m8(__VA_ARGS__)
+#define vlmul_ext_f16mf2(...) __riscv_vlmul_ext_f16mf2(__VA_ARGS__)
+#define vlmul_ext_f16m1(...) __riscv_vlmul_ext_f16m1(__VA_ARGS__)
+#define vlmul_ext_f16m2(...) __riscv_vlmul_ext_f16m2(__VA_ARGS__)
+#define vlmul_ext_f16m4(...) __riscv_vlmul_ext_f16m4(__VA_ARGS__)
+#define vlmul_ext_f16m8(...) __riscv_vlmul_ext_f16m8(__VA_ARGS__)
+#define vlmul_ext_f32m1(...) __riscv_vlmul_ext_f32m1(__VA_ARGS__)
+#define vlmul_ext_f32m2(...) __riscv_vlmul_ext_f32m2(__VA_ARGS__)
+#define vlmul_ext_f32m4(...) __riscv_vlmul_ext_f32m4(__VA_ARGS__)
+#define vlmul_ext_f32m8(...) __riscv_vlmul_ext_f32m8(__VA_ARGS__)
+#define vlmul_ext_f64m2(...) __riscv_vlmul_ext_f64m2(__VA_ARGS__)
+#define vlmul_ext_f64m4(...) __riscv_vlmul_ext_f64m4(__VA_ARGS__)
+#define vlmul_ext_f64m8(...) __riscv_vlmul_ext_f64m8(__VA_ARGS__)
+#define vlmul_ext_i8mf4(...) __riscv_vlmul_ext_i8mf4(__VA_ARGS__)
+#define vlmul_ext_i8mf2(...) __riscv_vlmul_ext_i8mf2(__VA_ARGS__)
+#define vlmul_ext_i8m1(...) __riscv_vlmul_ext_i8m1(__VA_ARGS__)
+#define vlmul_ext_i8m2(...) __riscv_vlmul_ext_i8m2(__VA_ARGS__)
+#define vlmul_ext_i8m4(...) __riscv_vlmul_ext_i8m4(__VA_ARGS__)
+#define vlmul_ext_i8m8(...) __riscv_vlmul_ext_i8m8(__VA_ARGS__)
+#define vlmul_ext_i16mf2(...) __riscv_vlmul_ext_i16mf2(__VA_ARGS__)
+#define vlmul_ext_i16m1(...) __riscv_vlmul_ext_i16m1(__VA_ARGS__)
+#define vlmul_ext_i16m2(...) __riscv_vlmul_ext_i16m2(__VA_ARGS__)
+#define vlmul_ext_i16m4(...) __riscv_vlmul_ext_i16m4(__VA_ARGS__)
+#define vlmul_ext_i16m8(...) __riscv_vlmul_ext_i16m8(__VA_ARGS__)
+#define vlmul_ext_i32m1(...) __riscv_vlmul_ext_i32m1(__VA_ARGS__)
+#define vlmul_ext_i32m2(...) __riscv_vlmul_ext_i32m2(__VA_ARGS__)
+#define vlmul_ext_i32m4(...) __riscv_vlmul_ext_i32m4(__VA_ARGS__)
+#define vlmul_ext_i32m8(...) __riscv_vlmul_ext_i32m8(__VA_ARGS__)
+#define vlmul_ext_i64m2(...) __riscv_vlmul_ext_i64m2(__VA_ARGS__)
+#define vlmul_ext_i64m4(...) __riscv_vlmul_ext_i64m4(__VA_ARGS__)
+#define vlmul_ext_i64m8(...) __riscv_vlmul_ext_i64m8(__VA_ARGS__)
+#define vlmul_ext_u8mf4(...) __riscv_vlmul_ext_u8mf4(__VA_ARGS__)
+#define vlmul_ext_u8mf2(...) __riscv_vlmul_ext_u8mf2(__VA_ARGS__)
+#define vlmul_ext_u8m1(...) __riscv_vlmul_ext_u8m1(__VA_ARGS__)
+#define vlmul_ext_u8m2(...) __riscv_vlmul_ext_u8m2(__VA_ARGS__)
+#define vlmul_ext_u8m4(...) __riscv_vlmul_ext_u8m4(__VA_ARGS__)
+#define vlmul_ext_u8m8(...) __riscv_vlmul_ext_u8m8(__VA_ARGS__)
+#define vlmul_ext_u16mf2(...) __riscv_vlmul_ext_u16mf2(__VA_ARGS__)
+#define vlmul_ext_u16m1(...) __riscv_vlmul_ext_u16m1(__VA_ARGS__)
+#define vlmul_ext_u16m2(...) __riscv_vlmul_ext_u16m2(__VA_ARGS__)
+#define vlmul_ext_u16m4(...) __riscv_vlmul_ext_u16m4(__VA_ARGS__)
+#define vlmul_ext_u16m8(...) __riscv_vlmul_ext_u16m8(__VA_ARGS__)
+#define vlmul_ext_u32m1(...) __riscv_vlmul_ext_u32m1(__VA_ARGS__)
+#define vlmul_ext_u32m2(...) __riscv_vlmul_ext_u32m2(__VA_ARGS__)
+#define vlmul_ext_u32m4(...) __riscv_vlmul_ext_u32m4(__VA_ARGS__)
+#define vlmul_ext_u32m8(...) __riscv_vlmul_ext_u32m8(__VA_ARGS__)
+#define vlmul_ext_u64m2(...) __riscv_vlmul_ext_u64m2(__VA_ARGS__)
+#define vlmul_ext_u64m4(...) __riscv_vlmul_ext_u64m4(__VA_ARGS__)
+#define vlmul_ext_u64m8(...) __riscv_vlmul_ext_u64m8(__VA_ARGS__)
+#define vlmul_trunc_f16mf4(...) __riscv_vlmul_trunc_f16mf4(__VA_ARGS__)
+#define vlmul_trunc_f16mf2(...) __riscv_vlmul_trunc_f16mf2(__VA_ARGS__)
+#define vlmul_trunc_f16m1(...) __riscv_vlmul_trunc_f16m1(__VA_ARGS__)
+#define vlmul_trunc_f16m2(...) __riscv_vlmul_trunc_f16m2(__VA_ARGS__)
+#define vlmul_trunc_f16m4(...) __riscv_vlmul_trunc_f16m4(__VA_ARGS__)
+#define vlmul_trunc_f32mf2(...) __riscv_vlmul_trunc_f32mf2(__VA_ARGS__)
+#define vlmul_trunc_f32m1(...) __riscv_vlmul_trunc_f32m1(__VA_ARGS__)
+#define vlmul_trunc_f32m2(...) __riscv_vlmul_trunc_f32m2(__VA_ARGS__)
+#define vlmul_trunc_f32m4(...) __riscv_vlmul_trunc_f32m4(__VA_ARGS__)
+#define vlmul_trunc_f64m1(...) __riscv_vlmul_trunc_f64m1(__VA_ARGS__)
+#define vlmul_trunc_f64m2(...) __riscv_vlmul_trunc_f64m2(__VA_ARGS__)
+#define vlmul_trunc_f64m4(...) __riscv_vlmul_trunc_f64m4(__VA_ARGS__)
+#define vlmul_trunc_i8mf8(...) __riscv_vlmul_trunc_i8mf8(__VA_ARGS__)
+#define vlmul_trunc_i8mf4(...) __riscv_vlmul_trunc_i8mf4(__VA_ARGS__)
+#define vlmul_trunc_i8mf2(...) __riscv_vlmul_trunc_i8mf2(__VA_ARGS__)
+#define vlmul_trunc_i8m1(...) __riscv_vlmul_trunc_i8m1(__VA_ARGS__)
+#define vlmul_trunc_i8m2(...) __riscv_vlmul_trunc_i8m2(__VA_ARGS__)
+#define vlmul_trunc_i8m4(...) __riscv_vlmul_trunc_i8m4(__VA_ARGS__)
+#define vlmul_trunc_i16mf4(...) __riscv_vlmul_trunc_i16mf4(__VA_ARGS__)
+#define vlmul_trunc_i16mf2(...) __riscv_vlmul_trunc_i16mf2(__VA_ARGS__)
+#define vlmul_trunc_i16m1(...) __riscv_vlmul_trunc_i16m1(__VA_ARGS__)
+#define vlmul_trunc_i16m2(...) __riscv_vlmul_trunc_i16m2(__VA_ARGS__)
+#define vlmul_trunc_i16m4(...) __riscv_vlmul_trunc_i16m4(__VA_ARGS__)
+#define vlmul_trunc_i32mf2(...) __riscv_vlmul_trunc_i32mf2(__VA_ARGS__)
+#define vlmul_trunc_i32m1(...) __riscv_vlmul_trunc_i32m1(__VA_ARGS__)
+#define vlmul_trunc_i32m2(...) __riscv_vlmul_trunc_i32m2(__VA_ARGS__)
+#define vlmul_trunc_i32m4(...) __riscv_vlmul_trunc_i32m4(__VA_ARGS__)
+#define vlmul_trunc_i64m1(...) __riscv_vlmul_trunc_i64m1(__VA_ARGS__)
+#define vlmul_trunc_i64m2(...) __riscv_vlmul_trunc_i64m2(__VA_ARGS__)
+#define vlmul_trunc_i64m4(...) __riscv_vlmul_trunc_i64m4(__VA_ARGS__)
+#define vlmul_trunc_u8mf8(...) __riscv_vlmul_trunc_u8mf8(__VA_ARGS__)
+#define vlmul_trunc_u8mf4(...) __riscv_vlmul_trunc_u8mf4(__VA_ARGS__)
+#define vlmul_trunc_u8mf2(...) __riscv_vlmul_trunc_u8mf2(__VA_ARGS__)
+#define vlmul_trunc_u8m1(...) __riscv_vlmul_trunc_u8m1(__VA_ARGS__)
+#define vlmul_trunc_u8m2(...) __riscv_vlmul_trunc_u8m2(__VA_ARGS__)
+#define vlmul_trunc_u8m4(...) __riscv_vlmul_trunc_u8m4(__VA_ARGS__)
+#define vlmul_trunc_u16mf4(...) __riscv_vlmul_trunc_u16mf4(__VA_ARGS__)
+#define vlmul_trunc_u16mf2(...) __riscv_vlmul_trunc_u16mf2(__VA_ARGS__)
+#define vlmul_trunc_u16m1(...) __riscv_vlmul_trunc_u16m1(__VA_ARGS__)
+#define vlmul_trunc_u16m2(...) __riscv_vlmul_trunc_u16m2(__VA_ARGS__)
+#define vlmul_trunc_u16m4(...) __riscv_vlmul_trunc_u16m4(__VA_ARGS__)
+#define vlmul_trunc_u32mf2(...) __riscv_vlmul_trunc_u32mf2(__VA_ARGS__)
+#define vlmul_trunc_u32m1(...) __riscv_vlmul_trunc_u32m1(__VA_ARGS__)
+#define vlmul_trunc_u32m2(...) __riscv_vlmul_trunc_u32m2(__VA_ARGS__)
+#define vlmul_trunc_u32m4(...) __riscv_vlmul_trunc_u32m4(__VA_ARGS__)
+#define vlmul_trunc_u64m1(...) __riscv_vlmul_trunc_u64m1(__VA_ARGS__)
+#define vlmul_trunc_u64m2(...) __riscv_vlmul_trunc_u64m2(__VA_ARGS__)
+#define vlmul_trunc_u64m4(...) __riscv_vlmul_trunc_u64m4(__VA_ARGS__)
+#define vset(...) __riscv_vset(__VA_ARGS__)
+#define vget_f16m1(...) __riscv_vget_f16m1(__VA_ARGS__)
+#define vget_f16m2(...) __riscv_vget_f16m2(__VA_ARGS__)
+#define vget_f16m4(...) __riscv_vget_f16m4(__VA_ARGS__)
+#define vget_f32m1(...) __riscv_vget_f32m1(__VA_ARGS__)
+#define vget_f32m2(...) __riscv_vget_f32m2(__VA_ARGS__)
+#define vget_f32m4(...) __riscv_vget_f32m4(__VA_ARGS__)
+#define vget_f64m1(...) __riscv_vget_f64m1(__VA_ARGS__)
+#define vget_f64m2(...) __riscv_vget_f64m2(__VA_ARGS__)
+#define vget_f64m4(...) __riscv_vget_f64m4(__VA_ARGS__)
+#define vget_i8m1(...) __riscv_vget_i8m1(__VA_ARGS__)
+#define vget_i8m2(...) __riscv_vget_i8m2(__VA_ARGS__)
+#define vget_i8m4(...) __riscv_vget_i8m4(__VA_ARGS__)
+#define vget_i16m1(...) __riscv_vget_i16m1(__VA_ARGS__)
+#define vget_i16m2(...) __riscv_vget_i16m2(__VA_ARGS__)
+#define vget_i16m4(...) __riscv_vget_i16m4(__VA_ARGS__)
+#define vget_i32m1(...) __riscv_vget_i32m1(__VA_ARGS__)
+#define vget_i32m2(...) __riscv_vget_i32m2(__VA_ARGS__)
+#define vget_i32m4(...) __riscv_vget_i32m4(__VA_ARGS__)
+#define vget_i64m1(...) __riscv_vget_i64m1(__VA_ARGS__)
+#define vget_i64m2(...) __riscv_vget_i64m2(__VA_ARGS__)
+#define vget_i64m4(...) __riscv_vget_i64m4(__VA_ARGS__)
+#define vget_u8m1(...) __riscv_vget_u8m1(__VA_ARGS__)
+#define vget_u8m2(...) __riscv_vget_u8m2(__VA_ARGS__)
+#define vget_u8m4(...) __riscv_vget_u8m4(__VA_ARGS__)
+#define vget_u16m1(...) __riscv_vget_u16m1(__VA_ARGS__)
+#define vget_u16m2(...) __riscv_vget_u16m2(__VA_ARGS__)
+#define vget_u16m4(...) __riscv_vget_u16m4(__VA_ARGS__)
+#define vget_u32m1(...) __riscv_vget_u32m1(__VA_ARGS__)
+#define vget_u32m2(...) __riscv_vget_u32m2(__VA_ARGS__)
+#define vget_u32m4(...) __riscv_vget_u32m4(__VA_ARGS__)
+#define vget_u64m1(...) __riscv_vget_u64m1(__VA_ARGS__)
+#define vget_u64m2(...) __riscv_vget_u64m2(__VA_ARGS__)
+#define vget_u64m4(...) __riscv_vget_u64m4(__VA_ARGS__)
+#define vle16(...) __riscv_vle16_tumu(__VA_ARGS__)
+#define vle32(...) __riscv_vle32_tumu(__VA_ARGS__)
+#define vle64(...) __riscv_vle64_tumu(__VA_ARGS__)
+#define vle8(...) __riscv_vle8_tumu(__VA_ARGS__)
+#define vlse16(...) __riscv_vlse16_tumu(__VA_ARGS__)
+#define vlse32(...) __riscv_vlse32_tumu(__VA_ARGS__)
+#define vlse64(...) __riscv_vlse64_tumu(__VA_ARGS__)
+#define vlse8(...) __riscv_vlse8_tumu(__VA_ARGS__)
+#define vle16ff(...) __riscv_vle16ff_tumu(__VA_ARGS__)
+#define vle32ff(...) __riscv_vle32ff_tumu(__VA_ARGS__)
+#define vle64ff(...) __riscv_vle64ff_tumu(__VA_ARGS__)
+#define vle8ff(...) __riscv_vle8ff_tumu(__VA_ARGS__)
+#define vlseg2e16(...) __riscv_vlseg2e16_tumu(__VA_ARGS__)
+#define vlseg3e16(...) __riscv_vlseg3e16_tumu(__VA_ARGS__)
+#define vlseg4e16(...) __riscv_vlseg4e16_tumu(__VA_ARGS__)
+#define vlseg5e16(...) __riscv_vlseg5e16_tumu(__VA_ARGS__)
+#define vlseg6e16(...) __riscv_vlseg6e16_tumu(__VA_ARGS__)
+#define vlseg7e16(...) __riscv_vlseg7e16_tumu(__VA_ARGS__)
+#define vlseg8e16(...) __riscv_vlseg8e16_tumu(__VA_ARGS__)
+#define vlseg2e32(...) __riscv_vlseg2e32_tumu(__VA_ARGS__)
+#define vlseg3e32(...) __riscv_vlseg3e32_tumu(__VA_ARGS__)
+#define vlseg4e32(...) __riscv_vlseg4e32_tumu(__VA_ARGS__)
+#define vlseg5e32(...) __riscv_vlseg5e32_tumu(__VA_ARGS__)
+#define vlseg6e32(...) __riscv_vlseg6e32_tumu(__VA_ARGS__)
+#define vlseg7e32(...) __riscv_vlseg7e32_tumu(__VA_ARGS__)
+#define vlseg8e32(...) __riscv_vlseg8e32_tumu(__VA_ARGS__)
+#define vlseg2e64(...) __riscv_vlseg2e64_tumu(__VA_ARGS__)
+#define vlseg3e64(...) __riscv_vlseg3e64_tumu(__VA_ARGS__)
+#define vlseg4e64(...) __riscv_vlseg4e64_tumu(__VA_ARGS__)
+#define vlseg5e64(...) __riscv_vlseg5e64_tumu(__VA_ARGS__)
+#define vlseg6e64(...) __riscv_vlseg6e64_tumu(__VA_ARGS__)
+#define vlseg7e64(...) __riscv_vlseg7e64_tumu(__VA_ARGS__)
+#define vlseg8e64(...) __riscv_vlseg8e64_tumu(__VA_ARGS__)
+#define vlseg2e16ff(...) __riscv_vlseg2e16ff_tumu(__VA_ARGS__)
+#define vlseg3e16ff(...) __riscv_vlseg3e16ff_tumu(__VA_ARGS__)
+#define vlseg4e16ff(...) __riscv_vlseg4e16ff_tumu(__VA_ARGS__)
+#define vlseg5e16ff(...) __riscv_vlseg5e16ff_tumu(__VA_ARGS__)
+#define vlseg6e16ff(...) __riscv_vlseg6e16ff_tumu(__VA_ARGS__)
+#define vlseg7e16ff(...) __riscv_vlseg7e16ff_tumu(__VA_ARGS__)
+#define vlseg8e16ff(...) __riscv_vlseg8e16ff_tumu(__VA_ARGS__)
+#define vlseg2e32ff(...) __riscv_vlseg2e32ff_tumu(__VA_ARGS__)
+#define vlseg3e32ff(...) __riscv_vlseg3e32ff_tumu(__VA_ARGS__)
+#define vlseg4e32ff(...) __riscv_vlseg4e32ff_tumu(__VA_ARGS__)
+#define vlseg5e32ff(...) __riscv_vlseg5e32ff_tumu(__VA_ARGS__)
+#define vlseg6e32ff(...) __riscv_vlseg6e32ff_tumu(__VA_ARGS__)
+#define vlseg7e32ff(...) __riscv_vlseg7e32ff_tumu(__VA_ARGS__)
+#define vlseg8e32ff(...) __riscv_vlseg8e32ff_tumu(__VA_ARGS__)
+#define vlseg2e64ff(...) __riscv_vlseg2e64ff_tumu(__VA_ARGS__)
+#define vlseg3e64ff(...) __riscv_vlseg3e64ff_tumu(__VA_ARGS__)
+#define vlseg4e64ff(...) __riscv_vlseg4e64ff_tumu(__VA_ARGS__)
+#define vlseg5e64ff(...) __riscv_vlseg5e64ff_tumu(__VA_ARGS__)
+#define vlseg6e64ff(...) __riscv_vlseg6e64ff_tumu(__VA_ARGS__)
+#define vlseg7e64ff(...) __riscv_vlseg7e64ff_tumu(__VA_ARGS__)
+#define vlseg8e64ff(...) __riscv_vlseg8e64ff_tumu(__VA_ARGS__)
+#define vlseg2e8(...) __riscv_vlseg2e8_tumu(__VA_ARGS__)
+#define vlseg3e8(...) __riscv_vlseg3e8_tumu(__VA_ARGS__)
+#define vlseg4e8(...) __riscv_vlseg4e8_tumu(__VA_ARGS__)
+#define vlseg5e8(...) __riscv_vlseg5e8_tumu(__VA_ARGS__)
+#define vlseg6e8(...) __riscv_vlseg6e8_tumu(__VA_ARGS__)
+#define vlseg7e8(...) __riscv_vlseg7e8_tumu(__VA_ARGS__)
+#define vlseg8e8(...) __riscv_vlseg8e8_tumu(__VA_ARGS__)
+#define vlseg2e8ff(...) __riscv_vlseg2e8ff_tumu(__VA_ARGS__)
+#define vlseg3e8ff(...) __riscv_vlseg3e8ff_tumu(__VA_ARGS__)
+#define vlseg4e8ff(...) __riscv_vlseg4e8ff_tumu(__VA_ARGS__)
+#define vlseg5e8ff(...) __riscv_vlseg5e8ff_tumu(__VA_ARGS__)
+#define vlseg6e8ff(...) __riscv_vlseg6e8ff_tumu(__VA_ARGS__)
+#define vlseg7e8ff(...) __riscv_vlseg7e8ff_tumu(__VA_ARGS__)
+#define vlseg8e8ff(...) __riscv_vlseg8e8ff_tumu(__VA_ARGS__)
+#define vlsseg2e16(...) __riscv_vlsseg2e16_tumu(__VA_ARGS__)
+#define vlsseg3e16(...) __riscv_vlsseg3e16_tumu(__VA_ARGS__)
+#define vlsseg4e16(...) __riscv_vlsseg4e16_tumu(__VA_ARGS__)
+#define vlsseg5e16(...) __riscv_vlsseg5e16_tumu(__VA_ARGS__)
+#define vlsseg6e16(...) __riscv_vlsseg6e16_tumu(__VA_ARGS__)
+#define vlsseg7e16(...) __riscv_vlsseg7e16_tumu(__VA_ARGS__)
+#define vlsseg8e16(...) __riscv_vlsseg8e16_tumu(__VA_ARGS__)
+#define vlsseg2e32(...) __riscv_vlsseg2e32_tumu(__VA_ARGS__)
+#define vlsseg3e32(...) __riscv_vlsseg3e32_tumu(__VA_ARGS__)
+#define vlsseg4e32(...) __riscv_vlsseg4e32_tumu(__VA_ARGS__)
+#define vlsseg5e32(...) __riscv_vlsseg5e32_tumu(__VA_ARGS__)
+#define vlsseg6e32(...) __riscv_vlsseg6e32_tumu(__VA_ARGS__)
+#define vlsseg7e32(...) __riscv_vlsseg7e32_tumu(__VA_ARGS__)
+#define vlsseg8e32(...) __riscv_vlsseg8e32_tumu(__VA_ARGS__)
+#define vlsseg2e64(...) __riscv_vlsseg2e64_tumu(__VA_ARGS__)
+#define vlsseg3e64(...) __riscv_vlsseg3e64_tumu(__VA_ARGS__)
+#define vlsseg4e64(...) __riscv_vlsseg4e64_tumu(__VA_ARGS__)
+#define vlsseg5e64(...) __riscv_vlsseg5e64_tumu(__VA_ARGS__)
+#define vlsseg6e64(...) __riscv_vlsseg6e64_tumu(__VA_ARGS__)
+#define vlsseg7e64(...) __riscv_vlsseg7e64_tumu(__VA_ARGS__)
+#define vlsseg8e64(...) __riscv_vlsseg8e64_tumu(__VA_ARGS__)
+#define vlsseg2e8(...) __riscv_vlsseg2e8_tumu(__VA_ARGS__)
+#define vlsseg3e8(...) __riscv_vlsseg3e8_tumu(__VA_ARGS__)
+#define vlsseg4e8(...) __riscv_vlsseg4e8_tumu(__VA_ARGS__)
+#define vlsseg5e8(...) __riscv_vlsseg5e8_tumu(__VA_ARGS__)
+#define vlsseg6e8(...) __riscv_vlsseg6e8_tumu(__VA_ARGS__)
+#define vlsseg7e8(...) __riscv_vlsseg7e8_tumu(__VA_ARGS__)
+#define vlsseg8e8(...) __riscv_vlsseg8e8_tumu(__VA_ARGS__)
+#define viota(...) __riscv_viota_tumu(__VA_ARGS__)
+#define vid(...) __riscv_vid_tumu(__VA_ARGS__)
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_011_compat.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_011_compat.hpp
new file mode 100644
index 000000000000..da5e0fdd5754
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_011_compat.hpp
@@ -0,0 +1,33 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// 0.11 -> 0.12 compatibility: the wrappers below add _RVV_IMPLICIT_VXRM to calls written with the 0.11 argument lists
+
+#ifndef _RVV_IMPLICIT_VXRM
+#define _RVV_IMPLICIT_VXRM __RISCV_VXRM_RNU
+#endif
+
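+// NOTE: masked should go first to avoid extra substitution (3 arg -> 4 arg -> 5 arg). NOTE(review): the preprocessor does not overload macros by argument count, so the 3-argument definitions below appear to replace these 4-argument ones - confirm that masked calls are not needed.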
+
+// masked: 4-argument calls, _RVV_IMPLICIT_VXRM is inserted before the last argument
+#define __riscv_vaadd(_1, _2, _3, _4) __riscv_vaadd(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+#define __riscv_vasub(_1, _2, _3, _4) __riscv_vasub(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+#define __riscv_vaaddu(_1, _2, _3, _4) __riscv_vaaddu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+#define __riscv_vasubu(_1, _2, _3, _4) __riscv_vasubu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+#define __riscv_vsmul(_1, _2, _3, _4) __riscv_vsmul(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+#define __riscv_vssra(_1, _2, _3, _4) __riscv_vssra(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+#define __riscv_vssrl(_1, _2, _3, _4) __riscv_vssrl(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+#define __riscv_vnclip(_1, _2, _3, _4) __riscv_vnclip(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+#define __riscv_vnclipu(_1, _2, _3, _4) __riscv_vnclipu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+
+// unmasked: 3-argument calls, _RVV_IMPLICIT_VXRM is inserted before the last argument
+#define __riscv_vaadd(_1, _2, _3) __riscv_vaadd(_1, _2, _RVV_IMPLICIT_VXRM, _3)
+#define __riscv_vasub(_1, _2, _3) __riscv_vasub(_1, _2, _RVV_IMPLICIT_VXRM, _3)
+#define __riscv_vaaddu(_1, _2, _3) __riscv_vaaddu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
+#define __riscv_vasubu(_1, _2, _3) __riscv_vasubu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
+#define __riscv_vsmul(_1, _2, _3) __riscv_vsmul(_1, _2, _RVV_IMPLICIT_VXRM, _3)
+#define __riscv_vssra(_1, _2, _3) __riscv_vssra(_1, _2, _RVV_IMPLICIT_VXRM, _3)
+#define __riscv_vssrl(_1, _2, _3) __riscv_vssrl(_1, _2, _RVV_IMPLICIT_VXRM, _3)
+#define __riscv_vnclip(_1, _2, _3) __riscv_vnclip(_1, _2, _RVV_IMPLICIT_VXRM, _3)
+#define __riscv_vnclipu(_1, _2, _3) __riscv_vnclipu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp
new file mode 100644
index 000000000000..2a323069fd9a
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp
@@ -0,0 +1,213 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP
+#define OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP
+
+// This file requires VTraits to be defined for vector types (the wrappers below use VTraits<REG>::lane_type)
+
+#define OPENCV_HAL_IMPL_RVV_FUN_AND(REG, SUF) \
+inline static REG vand(const REG & op1, const REG & op2, size_t vl) \
+{ \
+    return vand_vv_##SUF(op1, op2, vl); \
+}
+
+OPENCV_HAL_IMPL_RVV_FUN_AND(vint8m1_t, i8m1)
+OPENCV_HAL_IMPL_RVV_FUN_AND(vuint8m1_t, u8m1)
+OPENCV_HAL_IMPL_RVV_FUN_AND(vint16m1_t, i16m1)
+OPENCV_HAL_IMPL_RVV_FUN_AND(vuint16m1_t, u16m1)
+OPENCV_HAL_IMPL_RVV_FUN_AND(vint32m1_t, i32m1)
+OPENCV_HAL_IMPL_RVV_FUN_AND(vuint32m1_t, u32m1)
+OPENCV_HAL_IMPL_RVV_FUN_AND(vint64m1_t, i64m1)
+OPENCV_HAL_IMPL_RVV_FUN_AND(vuint64m1_t, u64m1)
+
+#define OPENCV_HAL_IMPL_RVV_FUN_LOXEI(REG, SUF, INDX, ISUF) \
+inline static REG vloxe##ISUF(const VTraits<REG>::lane_type *base, INDX bindex, size_t vl) \
+{ \
+    return vloxe##ISUF##_v_##SUF(base, bindex, vl); \
+}
+
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m1_t, i8m1, vuint8m1_t, i8)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m2_t, i8m2, vuint8m2_t, i8)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m4_t, i8m4, vuint8m4_t, i8)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m8_t, i8m8, vuint8m8_t, i8)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m1_t, i8m1, vuint32m4_t, i32)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint8m2_t, i8m2, vuint32m8_t, i32)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint16m1_t, i16m1, vuint32m2_t, i32)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m1_t, i32m1, vuint32m1_t, i32)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m2_t, i32m2, vuint32m2_t, i32)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m4_t, i32m4, vuint32m4_t, i32)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint32m8_t, i32m8, vuint32m8_t, i32)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vint64m1_t, i64m1, vuint32mf2_t, i32)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m1_t, u8m1, vuint8m1_t, i8)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m2_t, u8m2, vuint8m2_t, i8)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m4_t, u8m4, vuint8m4_t, i8)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m8_t, u8m8, vuint8m8_t, i8)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat32m1_t, f32m1, vuint32m1_t, i32)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint32m1_t, u32m1, vuint32m1_t, i32)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat64m1_t, f64m1, vuint32mf2_t, i32)
+#endif
+
+#define OPENCV_HAL_IMPL_RVV_FUN_MUL(REG, SUF) \
+inline static REG##m1_t vmul(const REG##m1_t & op1, const REG##m1_t & op2, size_t vl) \
+{ \
+    return vmul_vv_##SUF##m1(op1, op2, vl); \
+} \
+inline static REG##m1_t vmul(const REG##m1_t & op1, VTraits<REG##m1_t>::lane_type op2, size_t vl) \
+{ \
+    return vmul_vx_##SUF##m1(op1, op2, vl); \
+} \
+inline static REG##m2_t vmul(const REG##m2_t & op1, const REG##m2_t & op2, size_t vl) \
+{ \
+    return vmul_vv_##SUF##m2(op1, op2, vl); \
+} \
+inline static REG##m2_t vmul(const REG##m2_t & op1, VTraits<REG##m2_t>::lane_type op2, size_t vl) \
+{ \
+    return vmul_vx_##SUF##m2(op1, op2, vl); \
+} \
+inline static REG##m4_t vmul(const REG##m4_t & op1, const REG##m4_t & op2, size_t vl) \
+{ \
+    return vmul_vv_##SUF##m4(op1, op2, vl); \
+} \
+inline static REG##m4_t vmul(const REG##m4_t & op1, VTraits<REG##m4_t>::lane_type op2, size_t vl) \
+{ \
+    return vmul_vx_##SUF##m4(op1, op2, vl); \
+} \
+inline static REG##m8_t vmul(const REG##m8_t & op1, const REG##m8_t & op2, size_t vl) \
+{ \
+    return vmul_vv_##SUF##m8(op1, op2, vl); \
+} \
+inline static REG##m8_t vmul(const REG##m8_t & op1, VTraits<REG##m8_t>::lane_type op2, size_t vl) \
+{ \
+    return vmul_vx_##SUF##m8(op1, op2, vl); \
+}
+
+OPENCV_HAL_IMPL_RVV_FUN_MUL(vint8, i8)
+OPENCV_HAL_IMPL_RVV_FUN_MUL(vuint8, u8)
+OPENCV_HAL_IMPL_RVV_FUN_MUL(vint16, i16)
+OPENCV_HAL_IMPL_RVV_FUN_MUL(vuint16, u16)
+OPENCV_HAL_IMPL_RVV_FUN_MUL(vint32, i32)
+OPENCV_HAL_IMPL_RVV_FUN_MUL(vuint32, u32)
+
+#define OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(REG1, SUF1, REG2, SUF2) \
+inline static REG1##m1_t vreinterpret_##SUF1##m1(const REG2##m1_t & src) \
+{\
+    return vreinterpret_v_##SUF2##m1_##SUF1##m1(src); \
+} \
+inline static REG1##m2_t vreinterpret_##SUF1##m2(const REG2##m2_t & src) \
+{\
+    return vreinterpret_v_##SUF2##m2_##SUF1##m2(src); \
+} \
+inline static REG1##m4_t vreinterpret_##SUF1##m4(const REG2##m4_t & src) \
+{\
+    return vreinterpret_v_##SUF2##m4_##SUF1##m4(src); \
+} \
+inline static REG1##m8_t vreinterpret_##SUF1##m8(const REG2##m8_t & src) \
+{\
+    return vreinterpret_v_##SUF2##m8_##SUF1##m8(src); \
+}
+
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint8, i8, vuint8, u8)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint16, i16, vuint16, u16)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint32, i32, vuint32, u32)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vfloat32, f32, vuint32, u32)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vfloat32, f32, vint32, i32)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vfloat32, f32)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vint32, i32, vfloat32, f32)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vint8, i8)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vuint16, u16)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vuint32, u32)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint8, u8, vuint64, u64)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vint16, i16)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vuint8, u8)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vuint32, u32)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint16, u16, vuint64, u64)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vint32, i32)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vuint8, u8)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vuint16, u16)
+OPENCV_HAL_IMPL_RVV_FUN_REINTERPRET(vuint32, u32, vuint64, u64)
+
+#define OPENCV_HAL_IMPL_RVV_FUN_STORE(REG, SUF, SZ) \
+inline static void vse##SZ(VTraits<REG>::lane_type *base, REG value, size_t vl) \
+{ \
+    return vse##SZ##_v_##SUF##m1(base, value, vl); \
+}
+
+OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint8, u8, 8)
+OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int8, i8, 8)
+OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint16, u16, 16)
+OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int16, i16, 16)
+OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint32, u32, 32)
+OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int32, i32, 32)
+OPENCV_HAL_IMPL_RVV_FUN_STORE(v_uint64, u64, 64)
+OPENCV_HAL_IMPL_RVV_FUN_STORE(v_int64, i64, 64)
+OPENCV_HAL_IMPL_RVV_FUN_STORE(v_float32, f32, 32)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_FUN_STORE(v_float64, f64, 64)
+#endif
+
+#define OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(REG, SUF) \
+inline static VTraits<REG>::lane_type vmv_x(const REG & reg) \
+{\
+    return vmv_x_s_##SUF##m1_##SUF(reg); \
+}
+#define OPENCV_HAL_IMPL_RVV_FUN_EXTRACT_F(REG, SUF) \
+inline static VTraits<REG>::lane_type vfmv_f(const REG & reg) \
+{\
+    return vfmv_f_s_##SUF##m1_##SUF(reg); \
+}
+
+OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint8, u8)
+OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int8, i8)
+OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint16, u16)
+OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int16, i16)
+OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint32, u32)
+OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int32, i32)
+OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_uint64, u64)
+OPENCV_HAL_IMPL_RVV_FUN_EXTRACT(v_int64, i64)
+OPENCV_HAL_IMPL_RVV_FUN_EXTRACT_F(v_float32, f32)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_FUN_EXTRACT_F(v_float64, f64)
+#endif
+
+#define OPENCV_HAL_IMPL_RVV_FUN_SLIDE(REG, SUF) \
+inline static REG vslidedown(const REG & dst, const REG & src, size_t offset, size_t vl) \
+{ \
+    return vslidedown_vx_##SUF##m1(dst, src, offset, vl); \
+} \
+inline static REG vslideup(const REG & dst, const REG & src, size_t offset, size_t vl) \
+{ \
+    return vslideup_vx_##SUF##m1(dst, src, offset, vl); \
+}
+
+OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint8, u8)
+OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int8, i8)
+OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint16, u16)
+OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int16, i16)
+OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint32, u32)
+OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int32, i32)
+OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_float32, f32)
+OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_uint64, u64)
+OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_int64, i64)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_FUN_SLIDE(v_float64, f64)
+#endif
+
+inline static vuint32mf2_t vmul(const vuint32mf2_t & op1, uint32_t op2, size_t vl)
+{
+    return vmul_vx_u32mf2(op1, op2, vl);
+}
+
+inline static vuint32mf2_t vreinterpret_u32mf2(const vint32mf2_t& val)
+{
+    return vreinterpret_v_i32mf2_u32mf2(val);
+}
+
+inline static vuint32mf2_t vreinterpret_u32mf2(const vuint16mf2_t& val)
+{
+    return vreinterpret_v_u16mf2_u32mf2(val);
+}
+
+#endif //OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_scalable.hpp
new file mode 100644
index 000000000000..0159e4325a3a
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_rvv_scalable.hpp
@@ -0,0 +1,2182 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// The original implementation is contributed by HAN Liutong.
+// Copyright (C) 2022, Institute of Software, Chinese Academy of Sciences.
+
+#ifndef OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
+#define OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
+
+#include <opencv2/core/check.hpp>
+
+// RVV intrinsics have been renamed in version 0.11, so we need to include
+// compatibility headers:
+// https://github.com/riscv-non-isa/rvv-intrinsic-doc/tree/master/auto-generated/rvv-v0p10-compatible-headers
+#if defined(__riscv_v_intrinsic) &&  __riscv_v_intrinsic>10999
+#include "intrin_rvv_010_compat_non-policy.hpp"
+#include "intrin_rvv_010_compat_overloaded-non-policy.hpp"
+#endif
+
+#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>11999
+#include "intrin_rvv_011_compat.hpp"
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+// FIXIT: eliminate massive warnings from templates
+// GCC from 'rvv-next': riscv64-unknown-linux-gnu-g++ (g42df3464463) 12.0.1 20220505 (prerelease)
+// doesn't work: #pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+
+#ifndef CV_RVV_MAX_VLEN // may be predefined by the build; 1024 is only the fallback
+#define CV_RVV_MAX_VLEN 1024 // numerator of VTraits<>::max_nlanes (divided by the lane width in bits)
+#endif
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#define CV_SIMD_SCALABLE 1
+#define CV_SIMD_SCALABLE_64F 1
+
+using v_uint8 = vuint8m1_t;
+using v_int8 = vint8m1_t;
+using v_uint16 = vuint16m1_t;
+using v_int16 = vint16m1_t;
+using v_uint32 = vuint32m1_t;
+using v_int32 = vint32m1_t;
+using v_uint64 = vuint64m1_t;
+using v_int64 = vint64m1_t;
+
+using v_float32 = vfloat32m1_t;
+#if CV_SIMD_SCALABLE_64F
+using v_float64 = vfloat64m1_t;
+#endif
+
+using uchar = unsigned char;
+using schar = signed char;
+using ushort = unsigned short;
+using uint = unsigned int;
+using uint64 = unsigned long int;
+using int64 = long int;
+
+static const int __cv_rvv_e8m1_nlanes = vsetvlmax_e8m1(); // one constant per (element width, LMUL) pair, each taken from vsetvlmax_*
+static const int __cv_rvv_e16m1_nlanes = vsetvlmax_e16m1();
+static const int __cv_rvv_e32m1_nlanes = vsetvlmax_e32m1();
+static const int __cv_rvv_e64m1_nlanes = vsetvlmax_e64m1();
+static const int __cv_rvv_e8m2_nlanes = vsetvlmax_e8m2();
+static const int __cv_rvv_e16m2_nlanes = vsetvlmax_e16m2();
+static const int __cv_rvv_e32m2_nlanes = vsetvlmax_e32m2();
+static const int __cv_rvv_e64m2_nlanes = vsetvlmax_e64m2();
+static const int __cv_rvv_e8m4_nlanes = vsetvlmax_e8m4();
+static const int __cv_rvv_e16m4_nlanes = vsetvlmax_e16m4();
+static const int __cv_rvv_e32m4_nlanes = vsetvlmax_e32m4();
+static const int __cv_rvv_e64m4_nlanes = vsetvlmax_e64m4();
+static const int __cv_rvv_e8m8_nlanes = vsetvlmax_e8m8();
+static const int __cv_rvv_e16m8_nlanes = vsetvlmax_e16m8();
+static const int __cv_rvv_e32m8_nlanes = vsetvlmax_e32m8();
+static const int __cv_rvv_e64m8_nlanes = vsetvlmax_e64m8(); // these names are what VTraits<>::vlanes() returns via token pasting
+
+template <class T>
+struct VTraits; // primary template is declared only; the macro below generates the specializations
+
+#define OPENCV_HAL_IMPL_RVV_TRAITS(REG, TYP, SUF, SZ) /* REG: vector type, TYP: lane type, SUF: e<bits>m<lmul>, SZ: lane width in bits */ \
+template <> \
+struct VTraits<REG> \
+{ \
+    static inline int vlanes() { return __cv_rvv_##SUF##_nlanes; } /* lane count read from the matching nlanes constant */ \
+    using lane_type = TYP; \
+    static const int max_nlanes = CV_RVV_MAX_VLEN/SZ; /* compile-time bound; the formula does not depend on LMUL */ \
+};
+
+OPENCV_HAL_IMPL_RVV_TRAITS(vint8m1_t, int8_t, e8m1, 8)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint8m2_t, int8_t, e8m2, 8)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint8m4_t, int8_t, e8m4, 8)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint8m8_t, int8_t, e8m8, 8)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint8m1_t, uint8_t, e8m1, 8)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint8m2_t, uint8_t, e8m2, 8)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint8m4_t, uint8_t, e8m4, 8)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint8m8_t, uint8_t, e8m8, 8)
+
+OPENCV_HAL_IMPL_RVV_TRAITS(vint16m1_t, int16_t, e16m1, 16)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint16m2_t, int16_t, e16m2, 16)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint16m4_t, int16_t, e16m4, 16)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint16m8_t, int16_t, e16m8, 16)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint16m1_t, uint16_t, e16m1, 16)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint16m2_t, uint16_t, e16m2, 16)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint16m4_t, uint16_t, e16m4, 16)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint16m8_t, uint16_t, e16m8, 16)
+
+OPENCV_HAL_IMPL_RVV_TRAITS(vint32m1_t, int32_t, e32m1, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint32m2_t, int32_t, e32m2, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint32m4_t, int32_t, e32m4, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint32m8_t, int32_t, e32m8, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint32m1_t, uint32_t, e32m1, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint32m2_t, uint32_t, e32m2, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint32m4_t, uint32_t, e32m4, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint32m8_t, uint32_t, e32m8, 32)
+
+OPENCV_HAL_IMPL_RVV_TRAITS(vint64m1_t, int64_t, e64m1, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint64m2_t, int64_t, e64m2, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint64m4_t, int64_t, e64m4, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint64m8_t, int64_t, e64m8, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m1_t, uint64_t, e64m1, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m2_t, uint64_t, e64m2, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m4_t, uint64_t, e64m4, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m8_t, uint64_t, e64m8, 64)
+
+OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m1_t, float, e32m1, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m2_t, float, e32m2, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m4_t, float, e32m4, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m8_t, float, e32m8, 32)
+
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_TRAITS(vfloat64m1_t, double, e64m1, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vfloat64m2_t, double, e64m2, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vfloat64m4_t, double, e64m4, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vfloat64m8_t, double, e64m8, 64)
+#endif
+
+
+// LLVM/Clang defines "overloaded intrinsics" e.g. 'vand(op1, op2)'
+// GCC does not have these functions, so we need to implement them manually
+// We implement only selected subset required to build current state of the code
+// Included inside namespace cv::
+#ifndef __riscv_v_intrinsic_overloading
+#include "intrin_rvv_compat_overloaded.hpp"
+#endif // __riscv_v_intrinsic_overloading
+
+
+//////////// get0 ////////////
+#define OPENCV_HAL_IMPL_RVV_GRT0_INT(_Tpvec, _Tp) /* Defines v_get0 for integer vector v_<_Tpvec>, returning _Tp. */ \
+inline _Tp v_get0(const v_##_Tpvec& v) \
+{ \
+    return vmv_x(v); /* overloaded vmv_x; presumably yields element 0 - confirm against the RVV intrinsic spec */ \
+}
+
+OPENCV_HAL_IMPL_RVV_GRT0_INT(uint8, uchar)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(int8, schar)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(uint16, ushort)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(int16, short)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(uint32, unsigned)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(int32, int)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(uint64, uint64)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(int64, int64)
+
+inline float v_get0(const v_float32& v) // float32 overload of v_get0; uses vfmv_f instead of the integer vmv_x
+{
+    return vfmv_f(v);
+}
+#if CV_SIMD_SCALABLE_64F
+inline double v_get0(const v_float64& v) // float64 overload, compiled only when 64-bit float lanes are enabled
+{
+    return vfmv_f(v);
+}
+#endif
+
+//////////// Initial ////////////
+
+#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) /* suffix1: OpenCV function suffix, suffix2: RVV type suffix */ \
+inline v_##_Tpvec v_setzero_##suffix1() \
+{ \
+    return vmv_v_x_##suffix2##m1(0, vl); /* broadcast 0 over vl lanes */ \
+} \
+inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
+{ \
+    return vmv_v_x_##suffix2##m1(v, vl); /* broadcast the scalar v over vl lanes */ \
+}
+
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8, uchar, u8, u8, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8, schar, s8, i8, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16, ushort, u16, u16, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16, short, s16, i16, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32, uint, u32, u32, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32, int, s32, i32, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64, uint64, u64, u64, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64, int64, s64, i64, VTraits<v_int64>::vlanes())
+
+#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) /* Floating-point v_setzero_/v_setall_; one suffix serves both names. */ \
+inline v_##_Tpv v_setzero_##suffix() \
+{ \
+    return vfmv_v_f_##suffix##m1(0, vl); /* broadcast 0 over vl lanes */ \
+} \
+inline v_##_Tpv v_setall_##suffix(_Tp v) \
+{ \
+    return vfmv_v_f_##suffix##m1(v, vl); /* broadcast the scalar v over vl lanes */ \
+}
+
+OPENCV_HAL_IMPL_RVV_INIT_FP(float32, float, f32, VTraits<v_float32>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_INIT_FP(float64, double, f64, VTraits<v_float64>::vlanes())
+#endif
+
+//////////// Reinterpret ////////////
+#define OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(_Tpvec1, suffix1) /* Identity case: v_reinterpret_as_<suffix1> on its own type. */ \
+inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec1& v) \
+{ \
+    return v; /* source and target types are the same, so the input is returned unchanged */\
+}
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint8, u8)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint16, u16)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint32, u32)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint64, u64)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int8, s8)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int16, s16)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int32, s32)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int64, s64)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float32, f32)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float64, f64)
+#endif
+// TODO: can be simplified by using overloaded RV intrinsic
+#define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) /* suffixN: OpenCV name, nsuffixN: RVV type */ \
+inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
+{ \
+    return v_##_Tpvec1(vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v)); /* direction 2 -> 1 in a single vreinterpret call */\
+} \
+inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
+{ \
+    return v_##_Tpvec2(vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v)); /* direction 1 -> 2 in a single vreinterpret call */\
+}
+
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, int8, u8, s8, u8, i8)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, int16, u16, s16, u16, i16)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, int32, u32, s32, u32, i32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, float32, u32, f32, u32, f32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, float32, s32, f32, i32, f32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, int64, u64, s64, u64, i64)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, float64, u64, f64, u64, f64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64, float64, s64, f64, i64, f64)
+#endif
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint16, u8, u16, u8, u16)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint32, u8, u32, u8, u32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint64, u8, u64, u8, u64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint32, u16, u32, u16, u32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint64, u16, u64, u16, u64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, uint64, u32, u64, u32, u64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int16, s8, s16, i8, i16)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int32, s8, s32, i8, i32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int64, s8, s64, i8, i64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int32, s16, s32, i16, i32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int64, s16, s64, i16, i64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, int64, s32, s64, i32, i64)
+
+
+#define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) /* kind AND width differ */ \
+inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
+{ \
+    return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v)); /* inner: change kind at width2; outer: change width */\
+} \
+inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
+{ \
+    return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v)); /* inner: change width; outer: change kind at width2 */\
+}
+
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int16, u8, s16, u, i, 8, 16)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int32, u8, s32, u, i, 8, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int64, u8, s64, u, i, 8, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int8, u16, s8, u, i, 16, 8)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int32, u16, s32, u, i, 16, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int64, u16, s64, u, i, 16, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int8, u32, s8, u, i, 32, 8)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int16, u32, s16, u, i, 32, 16)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int64, u32, s64, u, i, 32, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int8, u64, s8, u, i, 64, 8)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int16, u64, s16, u, i, 64, 16)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int32, u64, s32, u, i, 64, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float32, u8, f32, u, f, 8, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float32, u16, f32, u, f, 16, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, float32, u64, f32, u, f, 64, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float32, s8, f32, i, f, 8, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float32, s16, f32, i, f, 16, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64, float32, s64, f32, i, f, 64, 32)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float64, u8, f64, u, f, 8, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float64, u16, f64, u, f, 16, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, float64, u32, f64, u, f, 32, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float64, s8, f64, i, f, 8, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float64, s16, f64, i, f, 16, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float64, s32, f64, i, f, 32, 64)
+// Three times reinterpret
+inline v_float32 v_reinterpret_as_f32(const v_float64& v) // chain of three vreinterpret calls: f64 -> u64 -> u32 -> f32
+{
+    return vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v)));
+}
+
+inline v_float64 v_reinterpret_as_f64(const v_float32& v) // reverse chain: f32 -> u32 -> u64 -> f64
+{
+    return vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v)));
+}
+#endif
+
+//////////// Extract //////////////
+
+#define OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(_Tpvec, _Tp, suffix, vl) /* v_extract / v_extract_n for integer vectors */ \
+template <int s = 0> /* s only supplies the default for the runtime argument i */ \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \
+{ \
+    return vslideup(vslidedown(v_setzero_##suffix(), a, i, vl), b, VTraits<_Tpvec>::vlanes() - i, vl); /* a slid down by i, then b slid up by (vlanes - i) on top */ \
+} \
+template<int s = 0> inline _Tp v_extract_n(_Tpvec v, int i = s) \
+{ \
+    return vmv_x(vslidedown(v_setzero_##suffix(), v, i, vl)); /* slide lane i down by i positions, then read the scalar with vmv_x */ \
+}
+
+
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint8, uchar, u8, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int8, schar, s8, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint16, ushort, u16, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int16, short, s16, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint32, unsigned int, u32, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int32, int, s32, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint64, uint64, u64, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int64, int64, s64, VTraits<v_int64>::vlanes())
+
+#define OPENCV_HAL_IMPL_RVV_EXTRACT_FP(_Tpvec, _Tp, suffix, vl) /* floating-point variant; differs only in using vfmv_f for the scalar read */ \
+template <int s = 0> /* s only supplies the default for the runtime argument i */ \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \
+{ \
+    return vslideup(vslidedown(v_setzero_##suffix(), a, i, vl), b, VTraits<_Tpvec>::vlanes() - i, vl); /* a slid down by i, then b slid up by (vlanes - i) on top */ \
+} \
+template<int s = 0> inline _Tp v_extract_n(_Tpvec v, int i = s) \
+{ \
+    return vfmv_f(vslidedown(v_setzero_##suffix(), v, i, vl)); /* slide lane i down by i positions, then read the scalar with vfmv_f */ \
+}
+
+OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32, float, f32, VTraits<v_float32>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64, double, f64, VTraits<v_float64>::vlanes())
+#endif
+
+#define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, vl) /* Defines v_extract_highest for _Tpvec. */ \
+inline _Tp v_extract_highest(_Tpvec v) \
+{ \
+    return v_extract_n(v, vl-1); /* last lane index is vl-1; passed as the runtime argument of v_extract_n */ \
+}
+
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_int8, schar, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint16, ushort, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_int16, short, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32, int, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64, int64, VTraits<v_int64>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32, float, VTraits<v_float32>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64, double, VTraits<v_float64>::vlanes())
+#endif
+
+
+////////////// Load/Store //////////////
+#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) /* hvl/vl: half/full lane count; _nTpvec is not referenced below */ \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ \
+    return vle##width##_v_##suffix##m1(ptr, vl); \
+} \
+inline _Tpvec v_load_aligned(const _Tp* ptr) /* same load as v_load; no alignment-specific path */ \
+{ \
+    return vle##width##_v_##suffix##m1(ptr, vl); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) /* the store mode is accepted but ignored */ \
+{ \
+    vse##width##_v_##suffix##m1(ptr, a, vl); \
+} \
+inline _Tpvec v_load_low(const _Tp* ptr) /* loads only hvl elements */ \
+{ \
+    return vle##width##_v_##suffix##m1(ptr, hvl); \
+} \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+    return vslideup(vle##width##_v_##suffix##m1(ptr0, hvl), vle##width##_v_##suffix##m1(ptr1, hvl), hvl, vl); /* ptr1 data slid up by hvl over ptr0 data */ \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width(ptr, a, vl); /* overloaded store intrinsic, unlike the explicitly typed one used above */ \
+} \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) /* identical body to v_store */ \
+{ \
+    vse##width(ptr, a, vl); \
+} \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) /* identical body to v_store */ \
+{ \
+    vse##width(ptr, a, vl); \
+} \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) /* stores only hvl elements */ \
+{ \
+    vse##width(ptr, a, hvl); \
+} \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); /* slide a down by hvl into a zero vector built with vmv, then store hvl elements */ \
+} \
+template<typename... Targs> \
+_Tpvec v_load_##suffix(Targs... nScalars) \
+{ \
+    return v_load({nScalars...}); /* needs a v_load overload accepting a braced list; none is defined in this macro */ \
+}
+
+
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8, vuint8m1_t, uchar, VTraits<v_uint8>::vlanes() / 2, VTraits<v_uint8>::vlanes(), 8, u8, vmv_v_x_u8m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8, vint8m1_t, schar, VTraits<v_int8>::vlanes() / 2, VTraits<v_int8>::vlanes(), 8, i8, vmv_v_x_i8m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16, vuint16m1_t, ushort, VTraits<v_uint16>::vlanes() / 2, VTraits<v_uint16>::vlanes(), 16, u16, vmv_v_x_u16m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16, vint16m1_t, short, VTraits<v_int16>::vlanes() / 2, VTraits<v_int16>::vlanes(), 16, i16, vmv_v_x_i16m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32, vuint32m1_t, unsigned int, VTraits<v_uint32>::vlanes() / 2, VTraits<v_uint32>::vlanes(), 32, u32, vmv_v_x_u32m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32, vint32m1_t, int, VTraits<v_int32>::vlanes() / 2, VTraits<v_int32>::vlanes(), 32, i32, vmv_v_x_i32m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64, vuint64m1_t, uint64, VTraits<v_uint64>::vlanes() / 2, VTraits<v_uint64>::vlanes(), 64, u64, vmv_v_x_u64m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64, vint64m1_t, int64, VTraits<v_int64>::vlanes() / 2, VTraits<v_int64>::vlanes(), 64, i64, vmv_v_x_i64m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m1_t, float, VTraits<v_float32>::vlanes() /2 , VTraits<v_float32>::vlanes(), 32, f32, vfmv_v_f_f32m1)
+
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m1_t, double, VTraits<v_float64>::vlanes() / 2, VTraits<v_float64>::vlanes(), 64, f64, vfmv_v_f_f64m1)
+#endif
+
+////////////// Lookup table access ////////////////////
+#define OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix) /* suffix: LMUL of the 32-bit index vector (e.g. m4 for 8-bit lanes) */ \
+inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
+{ \
+    auto vidx = vmul(vreinterpret_u32##suffix(vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); /* element index -> byte offset */ \
+    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); /* indexed load from tab with the 32-bit offsets */ \
+}
+OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4)
+OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m2)
+OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m1)
+OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, mf2)
+OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2)
+#endif
+
+#define OPENCV_HAL_IMPL_RVV_LUT_PAIRS(_Tpvec, _Tp, suffix1, suffix2, v_trunc) /* builds the index list idx[k], idx[k]+1 and loads those table entries */ \
+inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
+{ \
+    auto v0 = vle32_v_u32##suffix1((const unsigned*)idx, VTraits<_Tpvec>::vlanes()/2); /* vlanes/2 base indices; cast keeps const */ \
+    auto v1 = vadd(v0, 1, VTraits<_Tpvec>::vlanes()/2); /* base index + 1 */ \
+    auto w0 = vwcvtu_x(v0, VTraits<_Tpvec>::vlanes()/2); /* widen so that, viewed as u32 again, each index is followed by a gap */ \
+    auto w1 = vwcvtu_x(v1, VTraits<_Tpvec>::vlanes()/2); \
+    auto sh1 = vslide1up(v_trunc(vreinterpret_u32##suffix2(w1)),0, VTraits<_Tpvec>::vlanes()); /* move the +1 indices one slot up, into the gaps */ \
+    auto vid = vor(sh1, v_trunc(vreinterpret_u32##suffix2(w0)), VTraits<_Tpvec>::vlanes()); /* merge: base and +1 indices interleaved */ \
+    auto vidx = vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); /* element index -> byte offset */ \
+    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
+}
+OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int8, schar, m2, m4, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int16, short, m1, m2, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int32, int, mf2, m1, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float32, float, mf2, m1, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int64, int64_t, mf2, m1, vlmul_trunc_u32mf2)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float64, double, mf2, m1, vlmul_trunc_u32mf2)
+#endif
+
+
+#define OPENCV_HAL_IMPL_RVV_LUT_QUADS(_Tpvec, _Tp, suffix0, suffix1, suffix2, v_trunc) /* builds idx[k], idx[k]+1, idx[k]+2, idx[k]+3 and loads those entries */ \
+inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
+{ \
+    auto v0 = vle32_v_u32##suffix0((const unsigned*)idx, VTraits<_Tpvec>::vlanes()/4); /* vlanes/4 base indices; cast keeps const */ \
+    auto v1 = vadd(v0, 1, VTraits<_Tpvec>::vlanes()/4); \
+    auto v2 = vadd(v0, 2, VTraits<_Tpvec>::vlanes()/4); \
+    auto v3 = vadd(v0, 3, VTraits<_Tpvec>::vlanes()/4); \
+    auto w0 = vwcvtu_x(v0, VTraits<_Tpvec>::vlanes()/4); /* first widening: leaves a gap after every index */ \
+    auto w1 = vwcvtu_x(v1, VTraits<_Tpvec>::vlanes()/4); \
+    auto w2 = vwcvtu_x(v2, VTraits<_Tpvec>::vlanes()/4); \
+    auto w3 = vwcvtu_x(v3, VTraits<_Tpvec>::vlanes()/4); \
+    auto sh2 = vslide1up(vreinterpret_u32##suffix1(w2),0, VTraits<_Tpvec>::vlanes()/2); /* +2 indices moved into the gaps of the +0 vector */ \
+    auto sh3 = vslide1up(vreinterpret_u32##suffix1(w3),0, VTraits<_Tpvec>::vlanes()/2); /* +3 indices moved into the gaps of the +1 vector */ \
+    auto vid0 = vor(sh2, vreinterpret_u32##suffix1(w0), VTraits<_Tpvec>::vlanes()/2); /* interleaved +0, +2 */ \
+    auto vid1 = vor(sh3, vreinterpret_u32##suffix1(w1), VTraits<_Tpvec>::vlanes()/2); /* interleaved +1, +3 */ \
+    auto wid0 = vwcvtu_x(v_trunc(vid0), VTraits<_Tpvec>::vlanes()/2); /* second widening, same gap trick one level up */ \
+    auto wid1 = vwcvtu_x(v_trunc(vid1), VTraits<_Tpvec>::vlanes()/2); \
+    auto shwid1 = vslide1up(vreinterpret_u32##suffix2(wid1),0, VTraits<_Tpvec>::vlanes()); \
+    auto vid = vor(shwid1, vreinterpret_u32##suffix2(wid0), VTraits<_Tpvec>::vlanes()); /* final order: +0, +1, +2, +3 per base index */ \
+    auto vidx = vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); /* element index -> byte offset */ \
+    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
+}
+OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int8, schar, m1, m2, m4, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int16, short, mf2 , m1, m2, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int32, int, mf2, m1, m1, vlmul_trunc_u32mf2)
+OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float32, float, mf2, m1, m1, vlmul_trunc_u32mf2)
+
+#define OPENCV_HAL_IMPL_RVV_LUT_VEC(_Tpvec, _Tp) /* v_lut taking the indices as a v_int32 vector instead of a pointer */ \
+inline _Tpvec v_lut(const _Tp* tab, const v_int32& vidx) \
+{ \
+    v_uint32 vidx_ = vmul(vreinterpret_u32m1(vidx), sizeof(_Tp), VTraits<v_int32>::vlanes()); /* element index -> byte offset */ \
+    return vloxei32(tab, vidx_, VTraits<_Tpvec>::vlanes()); /* indexed load from tab with the 32-bit offsets */ \
+}
+OPENCV_HAL_IMPL_RVV_LUT_VEC(v_float32, float)
+OPENCV_HAL_IMPL_RVV_LUT_VEC(v_int32, int)
+OPENCV_HAL_IMPL_RVV_LUT_VEC(v_uint32, unsigned)
+
+#if CV_SIMD_SCALABLE_64F
+inline v_float64 v_lut(const double* tab, const v_int32& vidx) // double table; indices are truncated to u32mf2 before scaling
+{
+    vuint32mf2_t vidx_ = vmul(vlmul_trunc_u32mf2(vreinterpret_u32m1(vidx)), sizeof(double), VTraits<v_float64>::vlanes()); // element index -> byte offset
+    return vloxei32(tab, vidx_, VTraits<v_float64>::vlanes());
+}
+#endif
+
+
+inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); } // unsigned tables go through the signed loaders
+inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); }
+inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); }
+inline v_uint16 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short*)tab, idx)); }
+inline v_uint16 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short*)tab, idx)); }
+inline v_uint16 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short*)tab, idx)); }
+inline v_uint32 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int*)tab, idx)); }
+inline v_uint32 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int*)tab, idx)); }
+inline v_uint32 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int*)tab, idx)); }
+inline v_uint64 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
+inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } // all casts in this group keep the const qualifier
+
+////////////// Pack boolean ////////////////////
+inline v_uint8 v_pack_b(const v_uint16& a, const v_uint16& b) // packs two u16 vectors into one u8 vector
+{
+    return vnsrl(vset(vlmul_ext_v_u16m1_u16m2(a),1,b), 0, VTraits<v_uint8>::vlanes()); // a and b placed in one m2 group, then one narrowing shift by 0
+}
+
+inline v_uint8 v_pack_b(const v_uint32& a, const v_uint32& b,
+                           const v_uint32& c, const v_uint32& d) // four u32 vectors -> one u8 vector
+{
+
+    return vnsrl(vnsrl(vset(vset(vset(vlmul_ext_u32m4(a),1,b),2,c),3,d), 0, VTraits<v_uint8>::vlanes()), 0, VTraits<v_uint8>::vlanes()); // m4 group, narrowed twice (32 -> 16 -> 8)
+}
+
+inline v_uint8 v_pack_b(const v_uint64& a, const v_uint64& b, const v_uint64& c,
+                           const v_uint64& d, const v_uint64& e, const v_uint64& f,
+                           const v_uint64& g, const v_uint64& h) // eight u64 vectors -> one u8 vector
+{
+    return vnsrl(vnsrl(vnsrl(
+        vset(vset(vset(vset(vset(vset(vset(vlmul_ext_u64m8(a),
+        1,b),2,c),3,d),4,e),5,f),6,g),7,h),
+        0, VTraits<v_uint8>::vlanes()), 0, VTraits<v_uint8>::vlanes()), 0, VTraits<v_uint8>::vlanes()); // m8 group, narrowed three times (64 -> 32 -> 16 -> 8)
+}
+
+////////////// Arithmetics //////////////
+#define OPENCV_HAL_IMPL_RVV_BIN_OP(_Tpvec, ocv_intrin, rvv_intrin) /* Defines v_<ocv_intrin>(a, b) as a direct call to rvv_intrin. */ \
+inline _Tpvec v_##ocv_intrin(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return rvv_intrin(a, b, VTraits<_Tpvec>::vlanes()); /* always operates on the full lane count */ \
+}
+
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, add, vsaddu)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, sub, vssubu)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, add, vsadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, sub, vssub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add, vsaddu)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub, vssubu)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add, vsadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub, vssub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, add, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, sub, vsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, mul, vmul)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, add, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, sub, vsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, mul, vmul)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, add, vfadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, sub, vfsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, mul, vfmul)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, div, vfdiv)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, add, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, sub, vsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, add, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, sub, vsub)
+
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, add, vfadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, sub, vfsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, mul, vfmul)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, div, vfdiv)
+#endif
+
+#define OPENCV_HAL_IMPL_RVV_BIN_MADD(_Tpvec, rvv_add) /* variadic v_add(f1, f2, more...) */ \
+template<typename... Args> \
+inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
+    return v_add(rvv_add(f1, f2, VTraits<_Tpvec>::vlanes()), vf...); /* combine the first two operands, then recurse on the rest */ \
+}
+#define OPENCV_HAL_IMPL_RVV_BIN_MMUL(_Tpvec, rvv_mul) /* variadic v_mul(f1, f2, more...) */ \
+template<typename... Args> \
+inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
+    return v_mul(rvv_mul(f1, f2, VTraits<_Tpvec>::vlanes()), vf...); /* combine the first two operands, then recurse on the rest */ \
+}
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint8, vsaddu)
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int8, vsadd)
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint16, vsaddu)
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int16, vsadd)
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint32, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int32, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float32, vfadd)
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint64, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int64, vadd)
+
+OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_uint32, vmul)
+OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_int32, vmul)
+OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float32, vfmul)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float64, vfadd)
+OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float64, vfmul)
+#endif
+
+#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _TpwvecM2, suffix, wmul) /* _TpwvecM2: the m2 type that receives the wmul result */ \
+inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
+{ \
+    _TpwvecM2 temp = wmul(a, b, VTraits<_Tpvec>::vlanes()); /* all products land in one m2 register group */ \
+    c = vget_##suffix##m1(temp, 0); /* first m1 part of the group */ \
+    d = vget_##suffix##m1(temp, 1); /* second m1 part of the group */ \
+}
+
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8, v_uint16, vuint16m2_t, u16, vwmulu)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8, v_int16, vint16m2_t, i16, vwmul)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16, v_uint32, vuint32m2_t, u32, vwmulu)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16, v_int32, vint32m2_t, i32, vwmul)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32, v_uint64, vuint64m2_t, u64, vwmulu)
+
+inline v_int16 v_mul_hi(const v_int16& a, const v_int16& b) // signed overload, forwards to vmulh
+{
+    return vmulh(a, b, VTraits<v_int16>::vlanes());
+}
+inline v_uint16 v_mul_hi(const v_uint16& a, const v_uint16& b) // unsigned overload, forwards to vmulhu
+{
+    return vmulhu(a, b, VTraits<v_uint16>::vlanes());
+}
+
+////////////// Arithmetics (wrap)//////////////
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, add_wrap, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, add_wrap, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add_wrap, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add_wrap, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, sub_wrap, vsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, sub_wrap, vsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub_wrap, vsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub_wrap, vsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, mul_wrap, vmul)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, mul_wrap, vmul)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, mul_wrap, vmul)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, mul_wrap, vmul)
+
+//////// Saturating Multiply ////////
+#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _clip, _wmul) /* v_mul for 8/16-bit lanes: widening multiply, then narrowing clip */ \
+inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _clip(_wmul(a, b, VTraits<_Tpvec>::vlanes()), 0, VTraits<_Tpvec>::vlanes()); /* clip with shift 0 brings the wide product back to lane width */ \
+} \
+template<typename... Args> \
+inline _Tpvec v_mul(const _Tpvec& a1, const _Tpvec& a2, const Args&... va) { \
+    return v_mul(_clip(_wmul(a1, a2, VTraits<_Tpvec>::vlanes()), 0, VTraits<_Tpvec>::vlanes()), va...); /* clip after every pairwise product, then recurse */ \
+}
+
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8, vnclipu, vwmulu)
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8, vnclip, vwmul)
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16, vnclipu, vwmulu)
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16, vnclip, vwmul)
+
+////////////// Bitwise logic //////////////
+
+#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, vl) /* v_and / v_or / v_xor / v_not for one integer vector type */ \
+inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return vand(a, b, vl); /* each function forwards to the overloaded intrinsic of the same operation */ \
+} \
+inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return vor(a, b, vl); \
+} \
+inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return vxor(a, b, vl); \
+} \
+inline _Tpvec v_not (const _Tpvec& a) \
+{ \
+    return vnot(a, vl); /* the only unary operation in this group */ \
+}
+
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64, VTraits<v_int64>::vlanes())
+
+#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(intrin) /* float32 overload: runs the int32 overload on the reinterpreted bits */ \
+inline v_float32 intrin (const v_float32& a, const v_float32& b) \
+{ \
+    return vreinterpret_f32m1(intrin(vreinterpret_i32m1(a), vreinterpret_i32m1(b))); \
+}
+OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_and)
+OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_or)
+OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_xor)
+
+inline v_float32 v_not (const v_float32& a)
+{ // Bitwise NOT of float32 lanes: reinterpret as int32, invert with the integer v_not, reinterpret back.
+    return vreinterpret_f32m1(v_not(vreinterpret_i32m1(a)));
+}
+
+#if CV_SIMD_SCALABLE_64F
+#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(intrin) /* float64 overload: runs the int64 overload on the reinterpreted bits */ \
+inline v_float64 intrin (const v_float64& a, const v_float64& b) \
+{ \
+    return vreinterpret_f64m1(intrin(vreinterpret_i64m1(a), vreinterpret_i64m1(b))); \
+}
+OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_and)
+OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_or)
+OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_xor)
+
+inline v_float64 v_not (const v_float64& a)
+{ // Bitwise NOT of float64 lanes: reinterpret as int64, invert with the integer v_not, reinterpret back.
+    return vreinterpret_f64m1(v_not(vreinterpret_i64m1(a)));
+}
+#endif
+
+
+////////////// Bitwise shifts //////////////
+/*  Usage
+1. v_shl<N>(vec);
+2. v_shl(vec, N); // instead of vec << N, when N is non-constant.
+*/
+
+#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, vl) /* v_shl / v_shr for unsigned lanes, built on vsll / vsrl */ \
+template<int s = 0> inline _Tpvec v_shl(const _Tpvec& a, int n = s) /* n defaults to the template argument s */ \
+{ \
+    return _Tpvec(vsll(a, uint8_t(n), vl)); /* the shift count is narrowed to uint8_t before the call */ \
+} \
+template<int s = 0> inline _Tpvec v_shr(const _Tpvec& a, int n = s) \
+{ \
+    return _Tpvec(vsrl(a, uint8_t(n), vl)); /* vsrl: logical right shift */ \
+}
+
+#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, vl) /* v_shl / v_shr for signed lanes, built on vsll / vsra */ \
+template<int s = 0> inline _Tpvec v_shl(const _Tpvec& a, int n = s) /* n defaults to the template argument s */ \
+{ \
+    return _Tpvec(vsll(a, uint8_t(n), vl)); /* the shift count is narrowed to uint8_t before the call */ \
+} \
+template<int s = 0> inline _Tpvec v_shr(const _Tpvec& a, int n = s) \
+{ \
+    return _Tpvec(vsra(a, uint8_t(n), vl)); /* vsra: arithmetic right shift, unlike the unsigned variant's vsrl */ \
+}
+
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16, VTraits<v_uint16>::vlanes()) // only 16/32/64-bit lanes are instantiated here
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64, VTraits<v_int64>::vlanes())
+
+////////////// Comparison //////////////
+#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix) /* v_<op>(a, b): the compare result is returned as a vector */ \
+inline _Tpvec v_##op(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    size_t VLEN = VTraits<_Tpvec>::vlanes(); \
+    uint64_t ones = -1; /* all 64 bits set */ \
+    return vmerge(intrin(a, b, VLEN), vmv_v_x_##suffix##m1(0, VLEN), ones, VLEN); /* merges a zero vector with the scalar ones under the compare mask */ \
+}
+
+#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix) /* v_<op>(a, b) for floating-point vectors */ \
+inline _Tpvec v_##op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    size_t VLEN = VTraits<_Tpvec>::vlanes(); \
+    union { uint64_t u; VTraits<_Tpvec>::lane_type d; } ones; /* lane value read back from an all-ones bit pattern */ \
+    ones.u = -1; \
+    auto diff = intrin(a, b, VLEN); /* result of the compare intrinsic; used as the merge mask below */ \
+    auto z = vfmv_v_f_##suffix##m1(0, VLEN); \
+    auto res = vfmerge(diff, z, ones.d, VLEN); /* merges the zero vector z with the scalar ones.d under that mask */ \
+    return _Tpvec(res); \
+} //TODO
+
+#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix) /* six compares for an unsigned type: eq ne lt gt le ge */ \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, vmseq, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, vmsne, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, lt, vmsltu, suffix) /* ordering compares use the ...u intrinsics */ \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, gt, vmsgtu, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, le, vmsleu, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ge, vmsgeu, suffix)
+
+#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix) /* six compares for a signed type: eq ne lt gt le ge */ \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, vmseq, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, vmsne, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, lt, vmslt, suffix) /* ordering compares use the intrinsics without the u suffix */ \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, gt, vmsgt, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, le, vmsle, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ge, vmsge, suffix)
+
+#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix) /* six compares for a floating-point type, on the vmf* intrinsics */ \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, eq, vmfeq, suffix) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ne, vmfne, suffix) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, lt, vmflt, suffix) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, gt, vmfgt, suffix) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, le, vmfle, suffix) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ge, vmfge, suffix)
+
+
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8, u8) // second argument is the vmv_v_x_ / vfmv_v_f_ type suffix for the lane type
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16, u16)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32, u32)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64, u64)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8, i8)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16, i16)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32, i32)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64, i64)
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32, f32)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64, f64) // float64 compares exist only when CV_SIMD_SCALABLE_64F is set
+#endif
+
+inline v_float32 v_not_nan(const v_float32& a) // compares a with itself; a NaN lane never equals itself
+{ return v_eq(a, a); }
+
+#if CV_SIMD_SCALABLE_64F
+inline v_float64 v_not_nan(const v_float64& a) // float64 counterpart, same self-comparison
+{ return v_eq(a, a); }
+#endif
+
+////////////// Min/Max //////////////
+
+#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) /* func(a, b) forwards straight to intrin(a, b, vl) */ \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return intrin(a, b, vl); \
+}
+
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_min, vminu, VTraits<v_uint8>::vlanes()) // unsigned types use vminu / vmaxu
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_max, vmaxu, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_min, vmin, VTraits<v_int8>::vlanes()) // signed types use vmin / vmax
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_max, vmax, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_min, vminu, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_max, vmaxu, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_min, vmin, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_max, vmax, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_min, vminu, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_max, vmaxu, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_min, vmin, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_max, vmax, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_min, vfmin, VTraits<v_float32>::vlanes()) // floating point uses vfmin / vfmax
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_max, vfmax, VTraits<v_float32>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_min, vfmin, VTraits<v_float64>::vlanes())
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_max, vfmax, VTraits<v_float64>::vlanes())
+#endif
+
+////////////// Transpose4x4 //////////////
+#define OPENCV_HAL_IMPL_RVV_ZIP4(_Tpvec, _wTpvec, suffix, convert2u, convert) /* interleaves a0 and a1 into b0 / b1 */ \
+inline void v_zip4(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) { \
+    int vl = 4; /* hard-coded: only four 32-bit lanes are processed */ \
+    _wTpvec temp = vreinterpret_##suffix##m2(convert2u( \
+        vor(vzext_vf2(convert(a0), vl), /* a0 zero-extended into 64-bit slots */ \
+            vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(convert(a1), vl)), 0, vl*2)), /* a1 likewise, then slid up one 32-bit slot */ \
+            vl))); \
+    b0 = vget_##suffix##m1(temp, 0); /* first m1 part of the interleaved m2 value */ \
+    b1 = vget_##suffix##m1(vrgather(temp, vadd(vid_v_u32m2(vl), 4, vl)/*{4,5,6,7} */, vl) ,0); \
+}
+
+OPENCV_HAL_IMPL_RVV_ZIP4(v_uint32, vuint32m2_t, u32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_ZIP4(v_int32, vint32m2_t, i32, vreinterpret_u32m2, vreinterpret_u32m1)
+OPENCV_HAL_IMPL_RVV_ZIP4(v_float32, vfloat32m2_t, f32, vreinterpret_u32m2, vreinterpret_u32m1)
+
+#if 0
+// These are v_zip4 and v_transpose4x4 for scalable VLEN; they cost more instructions than the current 128-bit-only version.
+inline void v_zip4(const v_float32& a0, const v_float32& a1, v_float32& b0, v_float32& b1) {
+    vuint64m1_t vid1 = vid_v_u64m1(VTraits<vuint64m1_t>::vlanes());
+    vuint16m1_t t1 = vreinterpret_u16m1(vid1);
+    vuint16m1_t t2 = vslide1up(t1, 0, VTraits<vuint16m1_t>::vlanes());
+    vuint16m1_t t3 = vslide1up(t2, 0, VTraits<vuint16m1_t>::vlanes());
+    vuint16m1_t t4 = vslide1up(t3, 0, VTraits<vuint16m1_t>::vlanes());
+    t1 = vor(
+        vor(t1, t2, VTraits<vuint16m1_t>::vlanes()),
+        vor(t3, t4, VTraits<vuint16m1_t>::vlanes()),
+        VTraits<vuint16m1_t>::vlanes()
+    );
+    vuint32m2_t vidx0 = vwmulu(t1, 4, VTraits<vuint32m1_t>::vlanes());
+    vidx0 = vadd(vidx0, vid_v_u32m2(VTraits<vuint32m1_t>::vlanes()), VTraits<vuint32m1_t>::vlanes());
+    vuint32m2_t vidx1 = vadd(vidx0, 4, VTraits<vuint32m1_t>::vlanes());
+    vfloat32m2_t temp = vreinterpret_f32m2(vreinterpret_u32m2(
+        vor(vzext_vf2(vreinterpret_u32m1(a0), VTraits<vuint16m1_t>::vlanes()),
+            vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(a1), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vfloat32m1_t>::vlanes()*2)),
+            VTraits<vfloat32m1_t>::vlanes())));
+    b0 = vlmul_trunc_f32m1(vrgather(temp, vidx0, VTraits<vuint16m1_t>::vlanes()));
+    b1 = vlmul_trunc_f32m1(vrgather(temp, vidx1, VTraits<vuint16m1_t>::vlanes()));
+}
+
+inline void v_transpose4x4(const v_float32& a0, const v_float32& a1, const v_float32& a2, const v_float32& a3,\
+                            v_float32& b0, v_float32& b1, v_float32& b2, v_float32& b3) { \
+    vuint64m2_t vid1 = vid_v_u64m2(VTraits<vuint32m1_t>::vlanes());
+    vuint16m2_t t1 = vreinterpret_u16m2(vid1);
+    vuint16m2_t t2 = vslide1up(t1, 0, VTraits<vuint8m1_t>::vlanes());
+    vuint16m2_t t3 = vslide1up(t2, 0, VTraits<vuint8m1_t>::vlanes());
+    vuint16m2_t t4 = vslide1up(t3, 0, VTraits<vuint8m1_t>::vlanes());
+    t1 = vor(
+        vor(t1, t2, VTraits<vuint8m1_t>::vlanes()),
+        vor(t3, t4, VTraits<vuint8m1_t>::vlanes()),
+        VTraits<vuint8m1_t>::vlanes()
+    );
+    vuint16m2_t vidx0 = vmul(t1, 12, VTraits<vuint8m1_t>::vlanes());
+    vidx0 = vadd(vidx0, vid_v_u16m2(VTraits<vuint8m1_t>::vlanes()), VTraits<vuint8m1_t>::vlanes());
+    vuint16m2_t vidx1 = vadd(vidx0, 4, VTraits<vuint8m1_t>::vlanes());
+    vuint16m2_t vidx2 = vadd(vidx0, 8, VTraits<vuint8m1_t>::vlanes());
+    vuint16m2_t vidx3 = vadd(vidx0, 12, VTraits<vuint8m1_t>::vlanes());
+    vuint32m2_t tempA = vreinterpret_u32m2( \
+        vor(vzext_vf2(vreinterpret_u32m1(a0), VTraits<vuint16m1_t>::vlanes()), \
+            vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(a2), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vuint16m1_t>::vlanes())), \
+            VTraits<vuint32m1_t>::vlanes())); \
+    vuint32m2_t tempB = vreinterpret_u32m2( \
+        vor(vzext_vf2(vreinterpret_u32m1(a1), VTraits<vuint16m1_t>::vlanes()), \
+            vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(a3), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vuint16m1_t>::vlanes())), \
+            VTraits<vuint32m1_t>::vlanes())); \
+    vfloat32m4_t temp = vreinterpret_f32m4(vreinterpret_u32m4( \
+        vor(vzext_vf2(tempA, VTraits<vuint8m1_t>::vlanes()), \
+            vreinterpret_u64m4(vslide1up(vreinterpret_u32m4(vzext_vf2(tempB, VTraits<vuint8m1_t>::vlanes())), 0, VTraits<vuint8m1_t>::vlanes())), \
+            VTraits<vuint16m1_t>::vlanes()))); \
+    b0 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx0, VTraits<vuint8m1_t>::vlanes()));
+    b1 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx1, VTraits<vuint8m1_t>::vlanes()));
+    b2 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx2, VTraits<vuint8m1_t>::vlanes()));
+    b3 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx3, VTraits<vuint8m1_t>::vlanes()));
+}
+#endif
+
+#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, suffix) /* built from four v_zip4 calls; suffix is not used in the body */ \
+inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, const _Tpvec& a2, const _Tpvec& a3, _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) { \
+    _Tpvec t0,t1,t2,t3; \
+    v_zip4(a0, a2, t0, t2); /* first pass pairs inputs 0/2 and 1/3 */ \
+    v_zip4(a1, a3, t1, t3); \
+    v_zip4(t0, t1, b0, b1); /* second pass zips the intermediates into the outputs */ \
+    v_zip4(t2, t3, b2, b3); \
+}
+
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_uint32, u32)
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_int32, i32)
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_float32, f32)
+
+////////////// Reduce //////////////
+
+#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, wsuffix, vl, red) /* _wTpvec is not used in the body */ \
+inline scalartype v_reduce_sum(const _Tpvec& a)  \
+{ \
+    _nwTpvec zero = vmv_v_x_##wsuffix##m1(0, vl); /* passed as the last vector operand of the reduction */ \
+    _nwTpvec res = vmv_v_x_##wsuffix##m1(0, vl); /* passed as the first (destination) operand */ \
+    res = v##red(res, a, zero, vl); \
+    return (scalartype)v_get0(res); /* the sum is read from element 0 and cast to scalartype */ \
+}
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8, v_uint16, vuint16m1_t, unsigned, u16, VTraits<v_uint8>::vlanes(), wredsumu) // 8/16/32-bit inputs use the widening wredsum[u]
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8, v_int16, vint16m1_t, int, i16, VTraits<v_int8>::vlanes(), wredsum)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16, v_uint32, vuint32m1_t, unsigned, u32, VTraits<v_uint16>::vlanes(), wredsumu)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16, v_int32, vint32m1_t, int, i32, VTraits<v_int16>::vlanes(), wredsum)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32, v_uint64, vuint64m1_t, unsigned, u64, VTraits<v_uint32>::vlanes(), wredsumu) // accumulated in u64, returned as unsigned
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32, v_int64, vint64m1_t, int, i64, VTraits<v_int32>::vlanes(), wredsum) // accumulated in i64, returned as int
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64, v_uint64, vuint64m1_t, uint64, u64, VTraits<v_uint64>::vlanes(), redsum) // 64-bit inputs use the non-widening redsum
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64, v_int64, vint64m1_t, int64, i64, VTraits<v_int64>::vlanes(), redsum)
+
+
+#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(_Tpvec, _wTpvec, _nwTpvec, scalartype, wsuffix, vl) /* _wTpvec is not used in the body */ \
+inline scalartype v_reduce_sum(const _Tpvec& a)  \
+{ \
+    _nwTpvec zero = vfmv_v_f_##wsuffix##m1(0, vl); /* passed as the last vector operand of the reduction */ \
+    _nwTpvec res = vfmv_v_f_##wsuffix##m1(0, vl); /* passed as the first (destination) operand */ \
+    res = vfredosum(res, a, zero, vl); /* vfredosum: the ordered floating-point sum reduction */ \
+    return (scalartype)v_get0(res); /* the sum is read from element 0 */ \
+}
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32, v_float32, vfloat32m1_t, float, f32, VTraits<v_float32>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64, v_float64, vfloat64m1_t, double, f64, VTraits<v_float64>::vlanes()) // double: keeps the float64 sum at full precision
+#endif
+
+#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) /* v_reduce_<func>; suffix is not used in the body */ \
+inline scalartype v_reduce_##func(const _Tpvec& a)  \
+{ \
+    _Tpvec res = _Tpvec(v##red(a, a, a, vl)); /* a is passed as all three vector operands of the reduction */ \
+    return (scalartype)v_get0(res); /* the result is read from element 0 */ \
+}
+
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, min, uchar, u8, VTraits<v_uint8>::vlanes(), redminu) // v_reduce_min: redminu / redmin / fredmin
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, min, schar, i8, VTraits<v_int8>::vlanes(), redmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, min, ushort, u16, VTraits<v_uint16>::vlanes(), redminu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, min, short, i16, VTraits<v_int16>::vlanes(), redmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, min, unsigned, u32, VTraits<v_uint32>::vlanes(), redminu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, min, int, i32, VTraits<v_int32>::vlanes(), redmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, min, float, f32, VTraits<v_float32>::vlanes(), fredmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, max, uchar, u8, VTraits<v_uint8>::vlanes(), redmaxu) // v_reduce_max: redmaxu / redmax / fredmax
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, max, schar, i8, VTraits<v_int8>::vlanes(), redmax)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, max, ushort, u16, VTraits<v_uint16>::vlanes(), redmaxu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, max, short, i16, VTraits<v_int16>::vlanes(), redmax)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, max, unsigned, u32, VTraits<v_uint32>::vlanes(), redmaxu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, max, int, i32, VTraits<v_int32>::vlanes(), redmax)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, max, float, f32, VTraits<v_float32>::vlanes(), fredmax)
+
+inline v_float32 v_reduce_sum4(const v_float32& a, const v_float32& b,
+                                 const v_float32& c, const v_float32& d)
+{ // Per group of four lanes the result holds { sum(a), sum(b), sum(c), sum(d) } of that group.
+    // Group id per 16-bit slot: 0000 1111 2222 3333 .... (u64 lane ids viewed as u16, smeared over 4 slots)
+    vuint64m2_t vid1 = vid_v_u64m2(VTraits<vuint32m1_t>::vlanes());
+    vuint16m2_t t1 = vreinterpret_u16m2(vid1);
+    vuint16m2_t t2 = vslide1up(t1, 0, VTraits<vuint8m1_t>::vlanes());
+    vuint16m2_t t3 = vslide1up(t2, 0, VTraits<vuint8m1_t>::vlanes());
+    vuint16m2_t t4 = vslide1up(t3, 0, VTraits<vuint8m1_t>::vlanes());
+    t1 = vor(
+        vor(t1, t2, VTraits<vuint8m1_t>::vlanes()),
+        vor(t3, t4, VTraits<vuint8m1_t>::vlanes()),
+        VTraits<vuint8m1_t>::vlanes()
+    );
+
+    // Gather indices for the 4x4 transpose: vidx0 = group * 12 + lane id, vidx1..3 are offset by 4, 8 and 12
+    vuint16m2_t vidx0 = vmul(t1, 12, VTraits<vuint8m1_t>::vlanes());
+    vidx0 = vadd(vidx0, vid_v_u16m2(VTraits<vuint8m1_t>::vlanes()), VTraits<vuint8m1_t>::vlanes());
+    vuint16m2_t vidx1 = vadd(vidx0, 4, VTraits<vuint8m1_t>::vlanes());
+    vuint16m2_t vidx2 = vadd(vidx0, 8, VTraits<vuint8m1_t>::vlanes());
+    vuint16m2_t vidx3 = vadd(vidx0, 12, VTraits<vuint8m1_t>::vlanes());
+
+    // Zip: interleave a with c and b with d, then interleave the two results (zero-extend, slide up one slot, OR)
+    vuint32m2_t tempA = vreinterpret_u32m2(
+        vor(vzext_vf2(vreinterpret_u32m1(a), VTraits<vuint16m1_t>::vlanes()),
+            vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(c), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vuint16m1_t>::vlanes())),
+            VTraits<vuint32m1_t>::vlanes()));
+    vuint32m2_t tempB = vreinterpret_u32m2(
+        vor(vzext_vf2(vreinterpret_u32m1(b), VTraits<vuint16m1_t>::vlanes()),
+            vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(d), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vuint16m1_t>::vlanes())),
+            VTraits<vuint32m1_t>::vlanes()));
+    vfloat32m4_t temp = vreinterpret_f32m4(vreinterpret_u32m4(
+        vor(vzext_vf2(tempA, VTraits<vuint8m1_t>::vlanes()),
+            vreinterpret_u64m4(vslide1up(vreinterpret_u32m4(vzext_vf2(tempB, VTraits<vuint8m1_t>::vlanes())), 0, VTraits<vuint8m1_t>::vlanes())),
+            VTraits<vuint16m1_t>::vlanes())));
+
+    // Transpose: each gather picks one element of a, b, c and d per group of four lanes
+    vfloat32m1_t b0 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx0, VTraits<vuint8m1_t>::vlanes()));
+    vfloat32m1_t b1 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx1, VTraits<vuint8m1_t>::vlanes()));
+    vfloat32m1_t b2 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx2, VTraits<vuint8m1_t>::vlanes()));
+    vfloat32m1_t b3 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx3, VTraits<vuint8m1_t>::vlanes()));
+
+    // Vector add: (b0 + b1) + (b2 + b3)
+    v_float32 res = vfadd(
+        vfadd(b0, b1, VTraits<vfloat32m1_t>::vlanes()),
+        vfadd(b2, b3, VTraits<vfloat32m1_t>::vlanes()),
+        VTraits<vfloat32m1_t>::vlanes()
+    );
+    return res;
+}
+
+////////////// Square-Root //////////////
+
+inline v_float32 v_sqrt(const v_float32& x)
+{ // Square root of every float32 lane via the vfsqrt intrinsic.
+    return vfsqrt(x, VTraits<v_float32>::vlanes());
+}
+
+inline v_float32 v_invsqrt(const v_float32& x)
+{
+    // Reciprocal square root computed as a full division: broadcast 1.0f divided by v_sqrt(x).
+    return v_div(v_setall_f32(1.0f), v_sqrt(x));
+}
+
+#if CV_SIMD_SCALABLE_64F
+inline v_float64 v_sqrt(const v_float64& x)
+{ // Square root of every float64 lane via the vfsqrt intrinsic.
+    return vfsqrt(x, VTraits<v_float64>::vlanes());
+}
+
+inline v_float64 v_invsqrt(const v_float64& x)
+{ // Reciprocal square root computed as a full division: 1 / v_sqrt(x).
+    v_float64 one = v_setall_f64(1.0); // double literal, matching the float64 lane type
+    return v_div(one, v_sqrt(x));
+}
+#endif
+
+inline v_float32 v_magnitude(const v_float32& a, const v_float32& b)
+{
+    // sqrt(a*a + b*b): a*a from vfmul is the accumulator onto which vfmacc adds b*b.
+    return v_sqrt(vfmacc(vfmul(a, a, VTraits<v_float32>::vlanes()), b, b, VTraits<v_float32>::vlanes()));
+}
+
+inline v_float32 v_sqr_magnitude(const v_float32& a, const v_float32& b)
+{ // a*a + b*b per lane: a*a from vfmul is the accumulator onto which vfmacc adds b*b.
+    return v_float32(vfmacc(vfmul(a, a, VTraits<v_float32>::vlanes()), b, b, VTraits<v_float32>::vlanes()));
+}
+
+#if CV_SIMD_SCALABLE_64F
+inline v_float64 v_magnitude(const v_float64& a, const v_float64& b)
+{
+    // sqrt(a*a + b*b): a*a from vfmul is the accumulator onto which vfmacc adds b*b.
+    return v_sqrt(vfmacc(vfmul(a, a, VTraits<v_float64>::vlanes()), b, b, VTraits<v_float64>::vlanes()));
+}
+
+inline v_float64 v_sqr_magnitude(const v_float64& a, const v_float64& b)
+{ // a*a + b*b per lane: a*a from vfmul is the accumulator onto which vfmacc adds b*b.
+    return vfmacc(vfmul(a, a, VTraits<v_float64>::vlanes()), b, b, VTraits<v_float64>::vlanes());
+}
+#endif
+
+////////////// Multiply-Add //////////////
+
+inline v_float32 v_fma(const v_float32& a, const v_float32& b, const v_float32& c)
+{ // a * b + c per lane; c is passed as the accumulator operand of vfmacc.
+    return vfmacc(c, a, b, VTraits<v_float32>::vlanes());
+}
+inline v_int32 v_fma(const v_int32& a, const v_int32& b, const v_int32& c)
+{ // Integer a * b + c via vmacc; the lane count comes from v_int32, the type actually operated on.
+    return vmacc(c, a, b, VTraits<v_int32>::vlanes());
+}
+
+inline v_float32 v_muladd(const v_float32& a, const v_float32& b, const v_float32& c)
+{ // Plain alias: forwards to v_fma with the same arguments.
+    return v_fma(a, b, c);
+}
+
+inline v_int32 v_muladd(const v_int32& a, const v_int32& b, const v_int32& c)
+{ // Plain alias: forwards to the int32 v_fma.
+    return v_fma(a, b, c);
+}
+
+#if CV_SIMD_SCALABLE_64F
+inline v_float64 v_fma(const v_float64& a, const v_float64& b, const v_float64& c)
+{ // a * b + c per lane; uses the explicitly typed vfmacc_vv_f64m1 with c as the accumulator operand.
+    return vfmacc_vv_f64m1(c, a, b, VTraits<v_float64>::vlanes());
+}
+
+inline v_float64 v_muladd(const v_float64& a, const v_float64& b, const v_float64& c)
+{ // Plain alias: forwards to the float64 v_fma.
+    return v_fma(a, b, c);
+}
+#endif
+
+////////////// Check all/any //////////////
+
+#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, vl) /* both checks count the lanes that compare less than 0 */ \
+inline bool v_check_all(const _Tpvec& a) \
+{ \
+    return (int)vcpop(vmslt(a, 0, vl), vl) == vl; /* true when the count equals the lane count */ \
+} \
+inline bool v_check_any(const _Tpvec& a) \
+{ \
+    return (int)vcpop(vmslt(a, 0, vl), vl) != 0; /* true when at least one lane is counted */ \
+}
+
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int8, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int16, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int32, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int64, VTraits<v_int64>::vlanes())
+
+
+inline bool v_check_all(const v_uint8& a) // unsigned and float overloads reinterpret to the signed type of equal width
+{ return v_check_all(v_reinterpret_as_s8(a)); }
+inline bool v_check_any(const v_uint8& a)
+{ return v_check_any(v_reinterpret_as_s8(a)); }
+
+inline bool v_check_all(const v_uint16& a)
+{ return v_check_all(v_reinterpret_as_s16(a)); }
+inline bool v_check_any(const v_uint16& a)
+{ return v_check_any(v_reinterpret_as_s16(a)); }
+
+inline bool v_check_all(const v_uint32& a)
+{ return v_check_all(v_reinterpret_as_s32(a)); }
+inline bool v_check_any(const v_uint32& a)
+{ return v_check_any(v_reinterpret_as_s32(a)); }
+
+inline bool v_check_all(const v_float32& a) // float32 lanes are checked through their int32 bit pattern
+{ return v_check_all(v_reinterpret_as_s32(a)); }
+inline bool v_check_any(const v_float32& a)
+{ return v_check_any(v_reinterpret_as_s32(a)); }
+
+inline bool v_check_all(const v_uint64& a)
+{ return v_check_all(v_reinterpret_as_s64(a)); }
+inline bool v_check_any(const v_uint64& a)
+{ return v_check_any(v_reinterpret_as_s64(a)); }
+
+#if CV_SIMD_SCALABLE_64F
+inline bool v_check_all(const v_float64& a) // float64 lanes are checked through their int64 bit pattern
+{ return v_check_all(v_reinterpret_as_s64(a)); }
+inline bool v_check_any(const v_float64& a)
+{ return v_check_any(v_reinterpret_as_s64(a)); }
+#endif
+
+////////////// abs //////////////
+
+#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) /* v_<abs>(a, b) = v_sub(v_max(a, b), v_min(a, b)), same type in and out */ \
+inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return v_sub(v_max(a, b), v_min(a, b)); \
+}
+
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8, absdiff)
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16, absdiff)
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32, absdiff)
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32, absdiff)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64, absdiff)
+#endif
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8, absdiffs) // signed 8/16-bit types get v_absdiffs, which returns the signed type
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16, absdiffs)
+
+#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, width) /* v_absdiff for signed input, unsigned result type _rTpvec */ \
+inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
+{ /* widening subtract max - min, reinterpreted as unsigned, then narrowed back with vnclipu (shift 0) */ \
+    return vnclipu(vreinterpret_u##width##m2(vwsub_vv(v_max(a, b), v_min(a, b), VTraits<_Tpvec>::vlanes())), 0, VTraits<_Tpvec>::vlanes()); \
+}
+
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8, v_uint8, 16) // third argument: bit width of the widened intermediate
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16, v_uint16, 32)
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32, v_uint32, 64)
+
+#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) /* v_abs(a) is v_absdiff(a, 0); _Tprvec is the result type */ \
+inline _Tprvec v_abs(const _Tpvec& a) \
+{ \
+    return v_absdiff(a, v_setzero_##suffix()); \
+}
+
+OPENCV_HAL_IMPL_RVV_ABS(v_uint8, v_int8, s8) // signed integer input, unsigned result
+OPENCV_HAL_IMPL_RVV_ABS(v_uint16, v_int16, s16)
+OPENCV_HAL_IMPL_RVV_ABS(v_uint32, v_int32, s32)
+OPENCV_HAL_IMPL_RVV_ABS(v_float32, v_float32, f32) // floating point keeps its own type
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_ABS(v_float64, v_float64, f64)
+#endif
+
+
+#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) /* v_reduce_sad(a, b) = v_reduce_sum(v_absdiff(a, b)) */ \
+inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return v_reduce_sum(v_absdiff(a, b)); \
+}
+
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8, unsigned) // every integer variant returns unsigned
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32, float)
+
+////////////// Select //////////////
+
+#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, vl) /* v_select(mask, a, b): lane-wise choice between a and b */ \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return vmerge(vmsne(mask, 0, vl), b, a, vl); /* the predicate is mask != 0; b and a are the two merge inputs */ \
+}
+
+OPENCV_HAL_IMPL_RVV_SELECT(v_uint8, VTraits<v_uint8>::vlanes()) // integer types up to 32 bits; float overloads follow separately
+OPENCV_HAL_IMPL_RVV_SELECT(v_uint16, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_SELECT(v_uint32, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_SELECT(v_int8, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_SELECT(v_int16, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_SELECT(v_int32, VTraits<v_int32>::vlanes())
+
+inline v_float32 v_select(const v_float32& mask, const v_float32& a, const v_float32& b)
+{ // Lane-wise choice between a and b; the predicate is the floating-point compare mask != 0 (vmfne).
+    return vmerge(vmfne(mask, 0, VTraits<v_float32>::vlanes()), b, a, VTraits<v_float32>::vlanes());
+}
+
+#if CV_SIMD_SCALABLE_64F
+inline v_float64 v_select(const v_float64& mask, const v_float64& a, const v_float64& b)
+{ // Lane-wise choice between a and b; the predicate is the floating-point compare mask != 0 (vmfne).
+    return vmerge(vmfne(mask, 0, VTraits<v_float64>::vlanes()), b, a, VTraits<v_float64>::vlanes());
+}
+#endif
+
+////////////// Rotate shift //////////////
+
+#define OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(_Tpvec, suffix, vl) /* lane rotations by n built on vslidedown / vslideup */ \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
+{ \
+    return vslidedown(vmv_v_x_##suffix##m1(0, vl), a, n, vl); /* slide a down by n; the destination operand is an all-zero vector */ \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{ \
+    return vslideup(vmv_v_x_##suffix##m1(0, vl), a, n, vl); /* slide a up by n over an all-zero vector */ \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) /* n == 0 is specialised to return a unchanged */ \
+{ return a; } \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
+{ /* a slid down by n, then b slid up by (lanes - n) on top of it */ \
+    return vslideup(vslidedown(vmv_v_x_##suffix##m1(0, vl), a, n, vl), b, VTraits<_Tpvec>::vlanes() - n, vl); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
+{ /* b slid down by (lanes - n), then a slid up by n on top of it */ \
+    return vslideup(vslidedown(vmv_v_x_##suffix##m1(0, vl), b, VTraits<_Tpvec>::vlanes() - n, vl), a, n, vl); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ CV_UNUSED(b); return a; }
+
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint8, u8, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int8, i8, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint16, u16, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int16, i16,  VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint32, u32, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int32, i32, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint64, u64, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int64, i64, VTraits<v_int64>::vlanes())
+
+#define OPENCV_HAL_IMPL_RVV_ROTATE_FP(_Tpvec, suffix, vl) /* same as the integer version, with vfmv_v_f zero vectors */ \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
+{ \
+    return vslidedown(vfmv_v_f_##suffix##m1(0, vl), a, n, vl); /* slide a down by n; the destination operand is an all-zero vector */ \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{ \
+    return vslideup(vfmv_v_f_##suffix##m1(0, vl), a, n, vl); /* slide a up by n over an all-zero vector */ \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) /* n == 0 is specialised to return a unchanged */ \
+{ return a; } \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
+{ /* a slid down by n, then b slid up by (lanes - n) on top of it */ \
+    return vslideup(vslidedown(vfmv_v_f_##suffix##m1(0, vl), a, n, vl), b, VTraits<_Tpvec>::vlanes() - n, vl); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
+{ /* b slid down by (lanes - n), then a slid up by n on top of it */ \
+    return vslideup(vslidedown(vfmv_v_f_##suffix##m1(0, vl), b, VTraits<_Tpvec>::vlanes() - n, vl), a, n, vl); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ CV_UNUSED(b); return a; }
+
+OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32, f32, VTraits<v_float32>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64, f64,  VTraits<v_float64>::vlanes())
+#endif
+
+////////////// Convert to float //////////////
+inline v_float32 v_cvt_f32(const v_int32& a)
+{ // int32 lanes to float32 lanes, same lane count (vfcvt_f_x_v_f32m1).
+    return vfcvt_f_x_v_f32m1(a, VTraits<v_float32>::vlanes());
+}
+
+#if CV_SIMD_SCALABLE_64F
+inline v_float32 v_cvt_f32(const v_float64& a)
+{ // Narrowing convert: a is extended to an m2 group and vfncvt_f runs with vl = the float64 lane count.
+    return vfncvt_f(vlmul_ext_f64m2(a), VTraits<v_float64>::vlanes());
+}
+
+inline v_float32 v_cvt_f32(const v_float64& a, const v_float64& b)
+{ // a is extended to an m2 group, b is stored at index 1 of that group, and the whole group is narrowed.
+    return vfncvt_f(vset(vlmul_ext_f64m2(a),1,b), VTraits<v_float32>::vlanes());
+}
+
+inline v_float64 v_cvt_f64(const v_int32& a)
+{ // Widening int32 -> float64 convert yields an m2 group; this returns its part at index 0.
+    return vget_f64m1(vfwcvt_f(a, VTraits<v_int32>::vlanes()), 0);
+}
+
+inline v_float64 v_cvt_f64_high(const v_int32& a)
+{ // Same widening convert; this returns the part at index 1.
+    return vget_f64m1(vfwcvt_f(a, VTraits<v_int32>::vlanes()), 1);
+}
+
+inline v_float64 v_cvt_f64(const v_float32& a)
+{ // Widening float32 -> float64 convert yields an m2 group; this returns its part at index 0.
+    return vget_f64m1(vfwcvt_f(a, VTraits<v_float32>::vlanes()), 0);
+}
+
+inline v_float64 v_cvt_f64_high(const v_float32& a)
+{ // Same widening convert; this returns the part at index 1.
+    return vget_f64m1(vfwcvt_f(a, VTraits<v_float32>::vlanes()), 1);
+}
+
+inline v_float64 v_cvt_f64(const v_int64& a)
+{ // int64 lanes to float64 lanes, same lane count (vfcvt_f).
+    return vfcvt_f(a, VTraits<v_int64>::vlanes());
+}
+#endif
+
+//////////// Broadcast //////////////
+
+#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) /* broadcasts one extracted lane to all lanes via v_setall_<suffix> */ \
+template<int s = 0> inline _Tpvec v_broadcast_element(_Tpvec v, int i = s) /* i defaults to the template argument s */ \
+{ \
+    return v_setall_##suffix(v_extract_n(v, i)); \
+} \
+inline _Tpvec v_broadcast_highest(_Tpvec v) \
+{ \
+    return v_setall_##suffix(v_extract_n(v, VTraits<_Tpvec>::vlanes()-1)); /* uses the lane at index lanes - 1 */ \
+}
+
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32, u32)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32, s32)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32, f32)
+
+
+////////////// Reverse //////////////
+#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, width) /* v_reverse: gathers the lanes of a through a reversed index vector */ \
+inline _Tpvec v_reverse(const _Tpvec& a)  \
+{ \
+    vuint##width##m1_t vidx = vrsub(vid_v_u##width##m1(VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()-1, VTraits<_Tpvec>::vlanes()); /* index i becomes (lanes - 1) - i */ \
+    return vrgather(a, vidx, VTraits<_Tpvec>::vlanes()); \
+}
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8, 8) // second argument: lane width in bits, selects the index vector type
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int8, 8)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16, 16)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int16, 16)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32, 32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int32, 32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_float32, 32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64, 64)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int64, 64)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_REVERSE(v_float64, 64)
+#endif
+
+//////////// Value reordering ////////////
+
+#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tp, _Tpwvec, _Tpwvec_m2, _Tpvec, width, suffix, suffix2, cvt) /* widening helpers for one type pair */ \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+    _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); /* widening convert of the whole vector into an m2 group */ \
+    b0 = vget_##suffix##m1(temp, 0); \
+    b1 = vget_##suffix##m1(temp, 1); \
+} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) \
+{ \
+    _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \
+    return vget_##suffix##m1(temp, 0); /* part at index 0 of the widened group */ \
+} \
+inline _Tpwvec v_expand_high(const _Tpvec& a) \
+{ \
+    _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \
+    return vget_##suffix##m1(temp, 1); /* part at index 1 of the widened group */ \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ /* loads through the mf2 load intrinsic and widens the loaded value with cvt */ \
+    return cvt(vle##width##_v_##suffix2##mf2(ptr, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \
+}
+
+OPENCV_HAL_IMPL_RVV_EXPAND(uchar, v_uint16, vuint16m2_t, v_uint8, 8, u16, u8, vwcvtu_x)
+OPENCV_HAL_IMPL_RVV_EXPAND(schar, v_int16, vint16m2_t, v_int8, 8, i16, i8, vwcvt_x)
+OPENCV_HAL_IMPL_RVV_EXPAND(ushort, v_uint32, vuint32m2_t, v_uint16, 16, u32, u16, vwcvtu_x)
+OPENCV_HAL_IMPL_RVV_EXPAND(short, v_int32, vint32m2_t, v_int16, 16, i32, i16, vwcvt_x)
+OPENCV_HAL_IMPL_RVV_EXPAND(uint, v_uint64, vuint64m2_t, v_uint32, 32, u64, u32, vwcvtu_x)
+OPENCV_HAL_IMPL_RVV_EXPAND(int, v_int64, vint64m2_t, v_int32, 32, i64, i32, vwcvt_x)
+
+inline v_uint32 v_load_expand_q(const uchar* ptr) {
+    const auto lanes = VTraits<v_uint32>::vlanes();  // one source byte is read per 32-bit result lane
+    return vwcvtu_x(vwcvtu_x(vle8_v_u8mf4(ptr, lanes), lanes), lanes);  // widen twice: u8 -> u16 -> u32
+}
+
+inline v_int32 v_load_expand_q(const schar* ptr) {
+    const auto lanes = VTraits<v_int32>::vlanes();  // one source byte is read per 32-bit result lane
+    return vwcvt_x(vwcvt_x(vle8_v_i8mf4(ptr, lanes), lanes), lanes);  // widen twice: i8 -> i16 -> i32
+}
+
+#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, hwidth, hsuffix, suffix, rshr, shr) \
+inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
+{ \
+    return shr(vset(vlmul_ext_##suffix##m2(a), 1, b), 0, VTraits<_Tpvec>::vlanes()); \
+} \
+inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
+{ \
+    vse##hwidth##_v_##hsuffix##mf2(ptr, shr(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \
+} \
+template<int n = 0> inline \
+_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b, int N = n) \
+{ \
+    return rshr(vset(vlmul_ext_##suffix##m2(a), 1, b), N, VTraits<_Tpvec>::vlanes()); \
+} \
+template<int n = 0> inline \
+void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a, int N = n) \
+{ \
+    vse##hwidth##_v_##hsuffix##mf2(ptr, rshr(a, N, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \
+}
+
+OPENCV_HAL_IMPL_RVV_PACK(v_uint8, uchar, v_uint16, 8, u8, u16, vnclipu, vnclipu)
+OPENCV_HAL_IMPL_RVV_PACK(v_int8, schar, v_int16, 8,  i8, i16, vnclip, vnclip)
+OPENCV_HAL_IMPL_RVV_PACK(v_uint16, ushort, v_uint32, 16, u16, u32, vnclipu, vnclipu)
+OPENCV_HAL_IMPL_RVV_PACK(v_int16, short, v_int32, 16, i16, i32, vnclip, vnclip)
+OPENCV_HAL_IMPL_RVV_PACK(v_uint32, unsigned, v_uint64, 32, u32, u64, vnclipu, vnsrl)
+OPENCV_HAL_IMPL_RVV_PACK(v_int32, int, v_int64, 32, i32, i64, vnclip, vnsra)
+
+#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, cast, hvl, vl) \
+inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
+{ \
+    return vnclipu(cast(vmax(vset(vlmul_ext_##suffix##m2(a), 1, b), 0, vl)), 0, vl); \
+} \
+inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
+{ \
+    vse##hwidth##_v_##hsuffix##mf2(ptr, vnclipu(vreinterpret_u##width##m1(vmax(a, 0, vl)), 0, vl), hvl); \
+} \
+template<int N = 0> inline \
+_Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b, int n = N) \
+{ \
+    return vnclipu(cast(vmax(vset(vlmul_ext_##suffix##m2(a), 1, b), 0, vl)), n, vl); \
+} \
+template<int N = 0> inline \
+void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a, int n = N) \
+{ \
+    vse##hwidth##_v_##hsuffix##mf2(ptr, vnclipu(vreinterpret_u##width##m1(vmax(a, 0, vl)), n, vl), hvl); \
+}
+
+OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8, uchar, v_int16, short, 8, 16, u8, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2, VTraits<v_int16>::vlanes(), VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16, ushort, v_int32, int, 16, 32, u16, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2, VTraits<v_int32>::vlanes(), VTraits<v_uint16>::vlanes())
+
+
+/* void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
+  a0 = {A1 A2 A3 A4}
+  a1 = {B1 B2 B3 B4}
+---------------
+  {A1 B1 A2 B2} and {A3 B3 A4 B4}
+*/
+
+#define OPENCV_HAL_IMPL_RVV_ZIP(_Tpvec, _wTpvec, suffix, width, width2, convert2um2, convert2um1) \
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) { \
+    _wTpvec temp = vreinterpret_##suffix##m2(convert2um2( \
+        vor(vzext_vf2(convert2um1(a0), VTraits<_Tpvec>::vlanes()*2), \
+            vreinterpret_u##width2##m2(vslide1up(vreinterpret_u##width##m2(vzext_vf2(convert2um1(a1), VTraits<_Tpvec>::vlanes()*2)), 0, VTraits<_Tpvec>::vlanes()*2)), \
+            VTraits<_Tpvec>::vlanes()))); \
+    b0 = vget_##suffix##m1(temp, 0); \
+    b1 = vget_##suffix##m1(temp, 1); \
+}
+OPENCV_HAL_IMPL_RVV_ZIP(v_uint8, vuint8m2_t, u8, 8, 16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_ZIP(v_int8, vint8m2_t, i8, 8, 16, vreinterpret_u8m2, vreinterpret_u8m1)
+OPENCV_HAL_IMPL_RVV_ZIP(v_uint16, vuint16m2_t, u16, 16, 32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_ZIP(v_int16, vint16m2_t, i16, 16, 32, vreinterpret_u16m2, vreinterpret_u16m1)
+OPENCV_HAL_IMPL_RVV_ZIP(v_uint32, vuint32m2_t, u32, 32, 64, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_ZIP(v_int32, vint32m2_t, i32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
+OPENCV_HAL_IMPL_RVV_ZIP(v_float32, vfloat32m2_t, f32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
+
+#if CV_SIMD_SCALABLE_64F
+inline void v_zip(const v_float64& a0, const v_float64& a1, v_float64& b0, v_float64& b1) {
+    vuint16mf4_t idx0 = vid_v_u16mf4(VTraits<v_float64>::vlanes());  // 0, 1, 2, ...
+    vuint16mf4_t idx1 = vadd(idx0, VTraits<v_float64>::vlanes(), VTraits<v_float64>::vlanes());  // vlanes, vlanes+1, ...
+    vuint16mf2_t idx = vreinterpret_u16mf2((  // interleave as 16-bit values: idx0[0], idx1[0], idx0[1], idx1[1], ...
+        vor(vzext_vf2(idx0, VTraits<v_float64>::vlanes()),
+            vreinterpret_u32mf2(vslide1up(vreinterpret_u16mf2(vzext_vf2(idx1, VTraits<v_float64>::vlanes())), 0, VTraits<v_uint32>::vlanes())),
+            VTraits<v_uint32>::vlanes())));
+#if 0
+    vfloat64m2_t temp = __riscv_vcreate_v_f64m1_f64m2(a0, a1);
+#else // TODO: clean up when RVV Intrinsic is frozen.
+    vfloat64m2_t temp = vlmul_ext_f64m2(a0);  // temp = a0 in the low register ...
+    temp = vset(temp, 1, a1);                 // ... and a1 in the high register
+#endif
+    temp = vrgatherei16(temp, idx, VTraits<v_float64>::vlanes()*2);  // temp[i] = temp[idx[i]]
+    b0 = vget_f64m1(temp, 0);  // first half of the gathered result
+    b1 = vget_f64m1(temp, 1);  // second half of the gathered result
+}
+#endif
+
+#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, width) \
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return vslideup(a, b, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes());\
+} \
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return vslideup( \
+            vslidedown(a, a, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes()), \
+            vslidedown(b, b, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes()), \
+            VTraits<_Tpvec>::vlanes()/2, \
+            VTraits<_Tpvec>::vlanes()); \
+} \
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
+{ \
+    c = v_combine_low(a, b); \
+    d = v_combine_high(a, b); \
+}
+
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint8, 8)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_int8, 8)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint16, 16)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_int16, 16)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint32, 32)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_int32, 32)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_float32, 32)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_float64, 64)
+#endif
+
+#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, suffix, width, hwidth, vl) \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
+{ \
+    a = vlse##width##_v_##suffix##m1(ptr  , sizeof(_Tp)*2, VTraits<v_##_Tpvec>::vlanes()); \
+    b = vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*2, VTraits<v_##_Tpvec>::vlanes()); \
+}\
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
+{ \
+    a = vlse##width##_v_##suffix##m1(ptr  , sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
+    b = vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
+    c = vlse##width##_v_##suffix##m1(ptr+2, sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
+                                v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    \
+    a = vlse##width##_v_##suffix##m1(ptr  , sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
+    b = vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
+    c = vlse##width##_v_##suffix##m1(ptr+2, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
+    d = vlse##width##_v_##suffix##m1(ptr+3, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    vsse##width(ptr, sizeof(_Tp)*2, a, VTraits<v_##_Tpvec>::vlanes()); \
+    vsse##width(ptr+1, sizeof(_Tp)*2, b, VTraits<v_##_Tpvec>::vlanes()); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    vsse##width(ptr, sizeof(_Tp)*3, a, VTraits<v_##_Tpvec>::vlanes()); \
+    vsse##width(ptr+1, sizeof(_Tp)*3, b, VTraits<v_##_Tpvec>::vlanes()); \
+    vsse##width(ptr+2, sizeof(_Tp)*3, c, VTraits<v_##_Tpvec>::vlanes()); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
+{ \
+    vsse##width(ptr, sizeof(_Tp)*4, a, VTraits<v_##_Tpvec>::vlanes()); \
+    vsse##width(ptr+1, sizeof(_Tp)*4, b, VTraits<v_##_Tpvec>::vlanes()); \
+    vsse##width(ptr+2, sizeof(_Tp)*4, c, VTraits<v_##_Tpvec>::vlanes()); \
+    vsse##width(ptr+3, sizeof(_Tp)*4, d, VTraits<v_##_Tpvec>::vlanes()); \
+}
+
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8, uchar, u8, 8, 4, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8, schar, i8, 8, 4, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16, ushort, u16, 16, 8, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16, short, i16, 16, 8, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32, unsigned, u32, 32, 16, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32, int, i32, 32, 16, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32, float, f32, 32, 16, VTraits<v_float32>::vlanes())
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64, uint64, u64, 64, 32, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64, int64, i64, 64, 32, VTraits<v_int64>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64, double, f64, 64, 32, VTraits<v_float64>::vlanes())
+#endif
+
+static const uint64_t idx_interleave_pairs[] = {  // 128 byte-sized gather indices; each group of 8 is base + {0,2,1,3,4,6,5,7}
+    0x0705060403010200, 0x0f0d0e0c0b090a08, 0x1715161413111210, 0x1f1d1e1c1b191a18,
+    0x2725262423212220, 0x2f2d2e2c2b292a28, 0x3735363433313230, 0x3f3d3e3c3b393a38,
+    0x4745464443414240, 0x4f4d4e4c4b494a48, 0x5755565453515250, 0x5f5d5e5c5b595a58,
+    0x6765666463616260, 0x6f6d6e6c6b696a68, 0x7775767473717270, 0x7f7d7e7c7b797a78};
+
+static const uint64_t idx_interleave_quads[] = {  // 128 byte-sized gather indices; each group of 8 is base + {0,4,1,5,2,6,3,7}
+    0x0703060205010400, 0x0f0b0e0a0d090c08, 0x1713161215111410, 0x1f1b1e1a1d191c18,
+    0x2723262225212420, 0x2f2b2e2a2d292c28, 0x3733363235313430, 0x3f3b3e3a3d393c38,
+    0x4743464245414440, 0x4f4b4e4a4d494c48, 0x5753565255515450, 0x5f5b5e5a5d595c58,
+    0x6763666265616460, 0x6f6b6e6a6d696c68, 0x7773767275717470, 0x7f7b7e7a7d797c78};
+
+#define OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(_Tpvec, func) /* 8-bit lanes: the byte index table idx_interleave_<func> is used as-is */ \
+inline _Tpvec v_interleave_##func(const _Tpvec& vec) { \
+    CV_CheckLE(VTraits<_Tpvec>::vlanes(), VTraits<_Tpvec>::max_nlanes, "RVV implementation only supports VLEN in the range [128, 1024]"); \
+    vuint8m1_t vidx = vundefined_u8m1();\
+    vidx = vreinterpret_u8m1(vle64_v_u64m1(idx_interleave_##func, 16)); /* load the 16 x 64-bit table entries, viewed as 8-bit indices */ \
+    return vrgather(vec, vidx, VTraits<v_uint8>::vlanes()); /* result[i] = vec[vidx[i]] */ \
+}
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_uint8, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_int8, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_uint8, quads)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_int8, quads)
+
+#define OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(_Tpvec, width, vzext_vfx, func) /* wider lanes: the byte index table is zero-extended to `width`-bit indices first */ \
+inline _Tpvec v_interleave_##func(const _Tpvec& vec) { \
+    CV_CheckLE(VTraits<_Tpvec>::vlanes(), VTraits<_Tpvec>::max_nlanes, "RVV implementation only supports VLEN in the range [128, 1024]"); \
+    vuint##width##m1_t vidx = vundefined_u##width##m1();\
+    vidx = vget_u##width##m1(vzext_vfx(vreinterpret_u8m1(vle64_v_u64m1(idx_interleave_##func, 16)), VTraits<v_uint8>::vlanes()), 0); /* widen the 8-bit indices, keep the first m1 part */ \
+    return vrgather(vec, vidx, VTraits<_Tpvec>::vlanes()); /* result[i] = vec[vidx[i]] */ \
+}
+
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint16, 16, vzext_vf2, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int16, 16, vzext_vf2, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint32, 32, vzext_vf4, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int32, 32, vzext_vf4, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_float32, 32, vzext_vf4, pairs)
+
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint16, 16, vzext_vf2, quads)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int16, 16, vzext_vf2, quads)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint32, 32, vzext_vf4, quads)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int32, 32, vzext_vf4, quads)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_float32, 32, vzext_vf4, quads)
+
+//////////// PopCount //////////
+static const unsigned char popCountTable[256] =
+{
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
+};
+#define OPENCV_HAL_IMPL_RVV_HADD(_Tpvec, _Tpvec2, _Tm2, width, width2, suffix, add) /* v_hadd(a): result[i] = a[2i] + a[2i+1], combined with `add` */ \
+static inline _Tpvec2 v_hadd(_Tpvec a) { \
+    vuint##width2##m1_t oneX2 = vmv_v_x_u##width2##m1(1, VTraits<v_uint##width2>::vlanes()); /* every `width2`-bit lane = 1 */ \
+    vuint##width##m1_t one = vreinterpret_u##width##m1(oneX2); /* seen as `width`-bit lanes: {1,0,1,0,...} */ \
+    _Tm2 res = add(a, vslide1down(a, 0, VTraits<v_uint##width>::vlanes()), VTraits<v_uint##width>::vlanes()); /* res[i] = a[i] + a[i+1] */ \
+    return vget_##suffix##m1(vcompress(vmseq(one, 1, VTraits<v_uint##width>::vlanes()), res, res, VTraits<v_uint##width>::vlanes()), 0); /* keep the even-indexed sums */ \
+}
+OPENCV_HAL_IMPL_RVV_HADD(v_uint8, v_uint16, vuint16m2_t, 8, 16, u16, vwaddu_vv)
+OPENCV_HAL_IMPL_RVV_HADD(v_uint16, v_uint32, vuint32m2_t, 16, 32, u32, vwaddu_vv)
+OPENCV_HAL_IMPL_RVV_HADD(v_uint32, v_uint64, vuint64m2_t, 32, 64, u64, vwaddu_vv)
+OPENCV_HAL_IMPL_RVV_HADD(v_int8, v_int16, vint16m2_t, 8, 16, i16, vwadd_vv)
+OPENCV_HAL_IMPL_RVV_HADD(v_int16, v_int32, vint32m2_t, 16, 32, i32, vwadd_vv)
+OPENCV_HAL_IMPL_RVV_HADD(v_int32, v_int64, vint64m2_t, 32, 64, i64, vwadd_vv)
+
+OPENCV_HAL_IMPL_RVV_HADD(vint32m2_t, v_int32, vint32m2_t, 16, 32, i32, vadd)
+OPENCV_HAL_IMPL_RVV_HADD(vint64m2_t, v_int64, vint64m2_t, 32, 64, i64, vadd)
+
+inline v_uint8 v_popcount(const v_uint8& a)
+{
+    return vloxei8(popCountTable, a, VTraits<v_uint8>::vlanes());  // indexed load: lane i = popCountTable[a[i]]
+}
+inline v_uint16 v_popcount(const v_uint16& a) {
+    v_uint8 per_byte = v_popcount(vreinterpret_u8m1(a));  // bits set in each byte of a
+    return v_hadd(per_byte);  // add adjacent byte counts -> one count per 16-bit lane
+}
+inline v_uint32 v_popcount(const v_uint32& a) {
+    v_uint16 per_half = v_hadd(v_popcount(vreinterpret_u8m1(a)));  // per-byte counts summed into 16-bit lanes
+    return v_hadd(per_half);  // sum adjacent pairs again -> one count per 32-bit lane
+}
+inline v_uint64 v_popcount(const v_uint64& a) {
+    v_uint32 per_word = v_hadd(v_hadd(v_popcount(vreinterpret_u8m1(a))));  // per-byte counts summed up to 32-bit lanes
+    return v_hadd(per_word);  // final pairwise sum -> one count per 64-bit lane
+}
+
+inline v_uint8 v_popcount(const v_int8& a)
+{
+    return v_popcount(v_reinterpret_as_u8(a));  // count bits of the raw pattern, not of |a| (e.g. -1 has 8 set bits)
+}
+inline v_uint16 v_popcount(const v_int16& a)
+{
+    return v_popcount(v_reinterpret_as_u16(a));  // count bits of the raw pattern, not of |a| (e.g. -1 has 16 set bits)
+}
+inline v_uint32 v_popcount(const v_int32& a)
+{
+    return v_popcount(v_reinterpret_as_u32(a));  // count bits of the raw pattern, not of |a| (e.g. -1 has 32 set bits)
+}
+inline v_uint64 v_popcount(const v_int64& a)
+{
+    // Count the bits of the raw 64-bit pattern (not of |a|), same as the unsigned overload.
+    return v_popcount(v_reinterpret_as_u64(a));
+}
+
+
+//////////// SignMask ////////////
+#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec) /* v_signmask: bit i = (lane i < 0), low 32 lanes only; v_scan_forward: index of first negative lane */ \
+inline int v_signmask(const _Tpvec& a) \
+{ \
+    uint8_t ans[16] = {0}; /* vsm stores one bit per lane: up to 128 lanes (8-bit lanes, VLEN 1024) = 16 bytes */ \
+    vsm(ans, vmslt(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \
+    return (int)((ans[0] | (ans[1] << 8) | (ans[2] << 16) | ((unsigned)ans[3] << 24)) & (VTraits<_Tpvec>::vlanes() < 32 ? (1u << VTraits<_Tpvec>::vlanes()) - 1u : ~0u)); /* assemble low 32 mask bits bytewise; drop bits past the last lane */ \
+} \
+inline int v_scan_forward(const _Tpvec& a) \
+{ \
+    return (int)vfirst(vmslt(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \
+}
+
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int8)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int16)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int32)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int64)
+
+inline int64 v_signmask(const v_uint8& a)
+{ return v_signmask(v_reinterpret_as_s8(a)); }
+inline int64 v_signmask(const v_uint16& a)
+{ return v_signmask(v_reinterpret_as_s16(a)); }
+inline int v_signmask(const v_uint32& a)
+{ return v_signmask(v_reinterpret_as_s32(a)); }
+inline int v_signmask(const v_float32& a)
+{ return v_signmask(v_reinterpret_as_s32(a)); }
+inline int v_signmask(const v_uint64& a)
+{ return v_signmask(v_reinterpret_as_s64(a)); }
+#if CV_SIMD_SCALABLE_64F
+inline int v_signmask(const v_float64& a)
+{ return v_signmask(v_reinterpret_as_s64(a)); }
+#endif
+
+//////////// Scan forward ////////////
+inline int v_scan_forward(const v_uint8& a)
+{ return v_scan_forward(v_reinterpret_as_s8(a)); }
+inline int v_scan_forward(const v_uint16& a)
+{ return v_scan_forward(v_reinterpret_as_s16(a)); }
+inline int v_scan_forward(const v_uint32& a)
+{ return v_scan_forward(v_reinterpret_as_s32(a)); }
+inline int v_scan_forward(const v_float32& a)
+{ return v_scan_forward(v_reinterpret_as_s32(a)); }
+inline int v_scan_forward(const v_uint64& a)
+{ return v_scan_forward(v_reinterpret_as_s64(a)); }
+#if CV_SIMD_SCALABLE_64F
+inline int v_scan_forward(const v_float64& a)
+{ return v_scan_forward(v_reinterpret_as_s64(a)); }
+#endif
+
+//////////// Pack triplets ////////////
+// {A0, A1, A2, A3, B0, B1, B2, B3, C0 ...} --> {A0, A1, A2, B0, B1, B2, C0 ...}
+// mask: {0,0,0,1, ...} -> {T,T,T,F, ...}
+#define OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(_Tpvec, v_trunc) /* v_trunc narrows the byte mask so it matches _Tpvec's lane count */ \
+inline _Tpvec v_pack_triplets(const _Tpvec& vec) { \
+    size_t vl = __cv_rvv_e8m1_nlanes; \
+    vuint32m1_t one = vmv_v_x_u32m1(1, __cv_rvv_e32m1_nlanes); \
+    vuint8m1_t zero = vmv_v_x_u8m1(0, vl); \
+    vuint8m1_t mask = vreinterpret_u8m1(one); /* bytes {1,0,0,0, 1,0,0,0, ...} */ \
+    return vcompress(vmseq(v_trunc(vslideup(zero, mask, 3, vl)), 0, vl), vec, vec, VTraits<_Tpvec>::vlanes()); /* slide to {0,0,0,1,...}; keep lanes whose byte is 0 */ \
+}
+
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint8, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int8, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint16, vlmul_trunc_u8mf2)
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int16, vlmul_trunc_u8mf2)
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint32, vlmul_trunc_u8mf4)
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int32, vlmul_trunc_u8mf4)
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32, vlmul_trunc_u8mf4)
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint64, vlmul_trunc_u8mf8)
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int64, vlmul_trunc_u8mf8)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float64, vlmul_trunc_u8mf8)
+#endif
+
+
+////// FP16 support ///////
+
+#if defined(__riscv_zfh) && __riscv_zfh
+inline v_float32 v_load_expand(const hfloat* ptr) {
+    const auto lanes = VTraits<v_float32>::vlanes();  // one half-float is read per 32-bit result lane
+    return vfwcvt_f(vle16_v_f16mf2((_Float16*)ptr, lanes), lanes);  // widen f16 -> f32
+}
+
+inline void v_pack_store(hfloat* ptr, const v_float32& v) {
+    const auto lanes = VTraits<v_float32>::vlanes();
+    vse16_v_f16mf2((_Float16*)ptr, vfncvt_f_f_w_f16mf2(v, lanes), lanes);  // narrow f32 -> f16, then store
+}
+#else
+inline v_float32 v_load_expand(const hfloat* ptr) {
+    float widened[32];  // scratch for the converted lanes (presumably sized for the 1024-bit VLEN limit -- confirm)
+    const auto count = VTraits<v_float32>::vlanes();
+    for (int k = 0; k < count; ++k) widened[k] = (float)ptr[k];  // scalar half -> float conversion, lane by lane
+    return v_load(widened);
+}
+
+inline void v_pack_store(hfloat* ptr, const v_float32& v) {
+    float lanes_f32[32];  // scratch copy of v (presumably sized for the 1024-bit VLEN limit -- confirm)
+    v_store(lanes_f32, v);
+    const auto count = VTraits<v_float32>::vlanes();
+    for (int k = 0; k < count; ++k) ptr[k] = hfloat(lanes_f32[k]);  // scalar float -> half conversion, lane by lane
+}
+#endif
+////////////// Rounding //////////////
+inline v_int32 v_round(const v_float32& a)
+{
+    // return vfcvt_x(vfadd(a, 1e-6, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
+    return vfcvt_x(a, VTraits<v_float32>::vlanes());
+}
+
+inline v_int32 v_floor(const v_float32& a)
+{
+    return vfcvt_x(vfsub(a, 0.5f - 1e-5, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());  // floor emulated as convert(a - 0.49999); NOTE(review): assumes the conversion rounds to nearest and looks inexact when frac(a) > 0.99999 -- confirm
+    // return vfcvt_x(a, VTraits<v_float32>::vlanes());
+}
+
+inline v_int32 v_ceil(const v_float32& a)
+{
+    return vfcvt_x(vfadd(a, 0.5f - 1e-5, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());  // ceil emulated as convert(a + 0.49999); NOTE(review): assumes the conversion rounds to nearest and looks inexact when 0 < frac(a) < 0.00001 -- confirm
+}
+
+inline v_int32 v_trunc(const v_float32& a)
+{
+    return vfcvt_rtz_x(a, VTraits<v_float32>::vlanes());
+}
+#if CV_SIMD_SCALABLE_64F
+inline v_int32 v_round(const v_float64& a)
+{
+    return vfncvt_x(vlmul_ext_f64m2(a), VTraits<v_float32>::vlanes());
+}
+
+inline v_int32 v_round(const v_float64& a, const v_float64& b)
+{
+    // return vfncvt_x(vset(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits<v_float64>::vlanes())), 1, b), VTraits<v_float32>::vlanes());
+    // Fix https://github.com/opencv/opencv/issues/24746
+    return vfncvt_x(vset(vlmul_ext_f64m2(a), 1, b), VTraits<v_float32>::vlanes());
+}
+
+inline v_int32 v_floor(const v_float64& a)
+{
+    return vfncvt_x(vlmul_ext_f64m2(vfsub(a, 0.5f - 1e-6, VTraits<v_float64>::vlanes())), VTraits<v_float32>::vlanes());  // floor emulated as narrowing convert(a - 0.499999); NOTE(review): same rounding-to-nearest assumption as the f32 version -- confirm
+}
+
+inline v_int32 v_ceil(const v_float64& a)
+{
+    return vfncvt_x(vlmul_ext_f64m2(vfadd(a, 0.5f - 1e-6, VTraits<v_float64>::vlanes())), VTraits<v_float32>::vlanes());  // ceil emulated as narrowing convert(a + 0.499999); NOTE(review): same rounding-to-nearest assumption as the f32 version -- confirm
+}
+
+inline v_int32 v_trunc(const v_float64& a)
+{
+    return vfncvt_rtz_x(vlmul_ext_f64m2(a), VTraits<v_float32>::vlanes());
+}
+#endif
+
+//////// Dot Product ////////
+
+// 16 >> 32
+inline v_int32 v_dotprod(const v_int16& a, const v_int16& b)
+{
+    // The widening multiply yields 32-bit products; v_hadd then sums each adjacent pair.
+    return v_hadd(vwmul(a, b, VTraits<v_int16>::vlanes()));
+}
+
+inline v_int32 v_dotprod(const v_int16& a, const v_int16& b, const v_int32& c)
+{
+    vint32m2_t products = vwmul(a, b, VTraits<v_int16>::vlanes());  // 32-bit products a[i] * b[i]
+    return vadd(v_hadd(products), c, VTraits<v_int32>::vlanes());   // pairwise sums plus the accumulator c
+}
+
+// 32 >> 64
+inline v_int64 v_dotprod(const v_int32& a, const v_int32& b)
+{
+    const auto lanes = VTraits<v_int32>::vlanes();
+    // 64-bit ones viewed as 32-bit lanes give {1,0,1,0,...}; comparing with 1 selects the even positions.
+    vbool32_t even = vmseq(vreinterpret_u32m1(vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes())), 1, VTraits<v_uint32>::vlanes());
+    vint64m2_t prod = vwmul(a, b, lanes);           // prod[i] = a[i] * b[i], widened to 64 bits
+    vint64m2_t next = vslide1down(prod, 0, lanes);  // next[i] = prod[i+1], 0 shifted into the last lane
+    vint64m2_t sums = vadd(prod, next, lanes);      // sums[i] = prod[i] + prod[i+1]
+    sums = vcompress(even, sums, sums, lanes);      // pack the even-indexed sums to the front
+    return vlmul_trunc_i64m1(sums);
+}
+inline v_int64 v_dotprod(const v_int32& a, const v_int32& b, const v_int64& c)
+{
+    const auto lanes = VTraits<v_int32>::vlanes();
+    // 64-bit ones viewed as 32-bit lanes give {1,0,1,0,...}; comparing with 1 selects the even positions.
+    vbool32_t even = vmseq(vreinterpret_u32m1(vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes())), 1, VTraits<v_uint32>::vlanes());
+    vint64m2_t prod = vwmul(a, b, lanes);           // prod[i] = a[i] * b[i], widened to 64 bits
+    vint64m2_t next = vslide1down(prod, 0, lanes);  // next[i] = prod[i+1], 0 shifted into the last lane
+    vint64m2_t sums = vadd(prod, next, lanes);      // sums[i] = prod[i] + prod[i+1]
+    sums = vcompress(even, sums, sums, lanes);      // pack the even-indexed sums to the front
+    return vadd(vlmul_trunc_i64m1(sums), c, VTraits<v_int64>::vlanes());  // add the accumulator c
+}
+
+// 8 >> 32
+inline v_uint32 v_dotprod_expand(const v_uint8& a, const v_uint8& b)
+{
+    const auto lanes = VTraits<v_uint8>::vlanes();
+    // 32-bit ones viewed as bytes give {1,0,0,0,...}; comparing with 1 selects every fourth position.
+    vbool8_t every4th = vmseq(vreinterpret_u8m1(vmv_v_x_u32m1(1, VTraits<v_uint32>::vlanes())), 1, lanes);
+    vuint16m2_t p0 = vwmulu(a, b, lanes);        // p0[i] = a[i] * b[i]
+    vuint16m2_t p1 = vslide1down(p0, 0, lanes);  // p1[i] = p0[i+1]
+    vuint16m2_t p2 = vslide1down(p1, 0, lanes);  // p2[i] = p0[i+2]
+    vuint16m2_t p3 = vslide1down(p2, 0, lanes);  // p3[i] = p0[i+3]
+    vuint32m4_t quad = vadd(vwaddu_vv(p2, p3, lanes), vwaddu_vv(p0, p1, lanes), lanes);  // quad[i] = p0[i] + ... + p0[i+3]
+    quad = vcompress(every4th, quad, quad, lanes);  // keep the sums that start at a multiple of 4
+    return vlmul_trunc_u32m1(quad);
+}
+
+inline v_uint32 v_dotprod_expand(const v_uint8& a, const v_uint8& b,
+                                  const v_uint32& c)
+{
+    const auto lanes = VTraits<v_uint8>::vlanes();
+    // 32-bit ones viewed as bytes give {1,0,0,0,...}; comparing with 1 selects every fourth position.
+    vbool8_t every4th = vmseq(vreinterpret_u8m1(vmv_v_x_u32m1(1, VTraits<v_uint32>::vlanes())), 1, lanes);
+    vuint16m2_t p0 = vwmulu(a, b, lanes);        // p0[i] = a[i] * b[i]
+    vuint16m2_t p1 = vslide1down(p0, 0, lanes);  // p1[i] = p0[i+1]
+    vuint16m2_t p2 = vslide1down(p1, 0, lanes);  // p2[i] = p0[i+2]
+    vuint16m2_t p3 = vslide1down(p2, 0, lanes);  // p3[i] = p0[i+3]
+    vuint32m4_t quad = vadd(vwaddu_vv(p2, p3, lanes), vwaddu_vv(p0, p1, lanes), lanes);  // quad[i] = p0[i] + ... + p0[i+3]
+    quad = vcompress(every4th, quad, quad, lanes);  // keep the sums that start at a multiple of 4
+    return vadd(vlmul_trunc_u32m1(quad), c, VTraits<v_uint32>::vlanes());  // accumulate; vl is the 32-bit lane count of the result
+}
+
+inline v_int32 v_dotprod_expand(const v_int8& a, const v_int8& b)
+{
+    const auto lanes = VTraits<v_int8>::vlanes();
+    // 32-bit ones viewed as bytes give {1,0,0,0,...}; comparing with 1 selects every fourth position.
+    vbool8_t every4th = vmseq(vreinterpret_u8m1(vmv_v_x_u32m1(1, VTraits<v_uint32>::vlanes())), 1, VTraits<v_uint8>::vlanes());
+    vint16m2_t p0 = vwmul(a, b, lanes);         // p0[i] = a[i] * b[i]
+    vint16m2_t p1 = vslide1down(p0, 0, lanes);  // p1[i] = p0[i+1]
+    vint16m2_t p2 = vslide1down(p1, 0, lanes);  // p2[i] = p0[i+2]
+    vint16m2_t p3 = vslide1down(p2, 0, lanes);  // p3[i] = p0[i+3]
+    vint32m4_t quad = vadd(vwadd_vv(p2, p3, lanes), vwadd_vv(p0, p1, lanes), lanes);  // quad[i] = p0[i] + ... + p0[i+3]
+    quad = vcompress(every4th, quad, quad, lanes);  // keep the sums that start at a multiple of 4
+    return vlmul_trunc_i32m1(quad);
+}
+
+inline v_int32 v_dotprod_expand(const v_int8& a, const v_int8& b,
+                                  const v_int32& c)
+{
+    const auto lanes = VTraits<v_int8>::vlanes();
+    // 32-bit ones viewed as bytes give {1,0,0,0,...}; comparing with 1 selects every fourth position.
+    vbool8_t every4th = vmseq(vreinterpret_u8m1(vmv_v_x_u32m1(1, VTraits<v_uint32>::vlanes())), 1, VTraits<v_uint8>::vlanes());
+    vint16m2_t p0 = vwmul(a, b, lanes);         // p0[i] = a[i] * b[i]
+    vint16m2_t p1 = vslide1down(p0, 0, lanes);  // p1[i] = p0[i+1]
+    vint16m2_t p2 = vslide1down(p1, 0, lanes);  // p2[i] = p0[i+2]
+    vint16m2_t p3 = vslide1down(p2, 0, lanes);  // p3[i] = p0[i+3]
+    vint32m4_t quad = vadd(vwadd_vv(p2, p3, lanes), vwadd_vv(p0, p1, lanes), lanes);  // quad[i] = p0[i] + ... + p0[i+3]
+    quad = vcompress(every4th, quad, quad, lanes);  // keep the sums that start at a multiple of 4
+    return vadd(vlmul_trunc_i32m1(quad), c, VTraits<v_int32>::vlanes());  // accumulate; vl is the 32-bit lane count of the result
+}
+
+
+// 16 >> 64
+inline v_uint64 v_dotprod_expand(const v_uint16& a, const v_uint16& b)
+{
+    const auto lanes = VTraits<v_uint16>::vlanes();
+    // 64-bit ones viewed as 16-bit lanes give {1,0,0,0,...}; comparing with 1 selects every fourth position.
+    vbool16_t every4th = vmseq(vreinterpret_u16m1(vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes())), 1, lanes);
+    vuint32m2_t p0 = vwmulu(a, b, lanes);        // p0[i] = a[i] * b[i]
+    vuint32m2_t p1 = vslide1down(p0, 0, lanes);  // p1[i] = p0[i+1]
+    vuint32m2_t p2 = vslide1down(p1, 0, lanes);  // p2[i] = p0[i+2]
+    vuint32m2_t p3 = vslide1down(p2, 0, lanes);  // p3[i] = p0[i+3]
+    vuint64m4_t quad = vadd(vwaddu_vv(p2, p3, lanes), vwaddu_vv(p0, p1, lanes), lanes);  // quad[i] = p0[i] + ... + p0[i+3]
+    quad = vcompress(every4th, quad, quad, lanes);  // keep the sums that start at a multiple of 4
+    return vlmul_trunc_u64m1(quad);
+}
+inline v_uint64 v_dotprod_expand(const v_uint16& a, const v_uint16& b, const v_uint64& c)
+{
+    const auto lanes = VTraits<v_uint16>::vlanes();
+    // 64-bit ones viewed as 16-bit lanes give {1,0,0,0,...}; comparing with 1 selects every fourth position.
+    vbool16_t every4th = vmseq(vreinterpret_u16m1(vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes())), 1, lanes);
+    vuint32m2_t p0 = vwmulu(a, b, lanes);        // p0[i] = a[i] * b[i]
+    vuint32m2_t p1 = vslide1down(p0, 0, lanes);  // p1[i] = p0[i+1]
+    vuint32m2_t p2 = vslide1down(p1, 0, lanes);  // p2[i] = p0[i+2]
+    vuint32m2_t p3 = vslide1down(p2, 0, lanes);  // p3[i] = p0[i+3]
+    vuint64m4_t quad = vadd(vwaddu_vv(p2, p3, lanes), vwaddu_vv(p0, p1, lanes), lanes);  // quad[i] = p0[i] + ... + p0[i+3]
+    quad = vcompress(every4th, quad, quad, lanes);  // keep the sums that start at a multiple of 4
+    return vadd(vlmul_trunc_u64m1(quad), c, VTraits<v_uint64>::vlanes());  // accumulate; vl is the 64-bit lane count of the result
+}
+
+inline v_int64 v_dotprod_expand(const v_int16& a, const v_int16& b)
+{
+    const auto lanes = VTraits<v_int16>::vlanes();
+    // 64-bit ones viewed as 16-bit lanes give {1,0,0,0,...}; comparing with 1 selects every fourth position.
+    vbool16_t every4th = vmseq(vreinterpret_u16m1(vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes())), 1, VTraits<v_uint16>::vlanes());
+    vint32m2_t p0 = vwmul(a, b, lanes);         // p0[i] = a[i] * b[i]
+    vint32m2_t p1 = vslide1down(p0, 0, lanes);  // p1[i] = p0[i+1]
+    vint32m2_t p2 = vslide1down(p1, 0, lanes);  // p2[i] = p0[i+2]
+    vint32m2_t p3 = vslide1down(p2, 0, lanes);  // p3[i] = p0[i+3]
+    vint64m4_t quad = vadd(vwadd_vv(p2, p3, lanes), vwadd_vv(p0, p1, lanes), lanes);  // quad[i] = p0[i] + ... + p0[i+3]
+    quad = vcompress(every4th, quad, quad, lanes);  // keep the sums that start at a multiple of 4
+    return vlmul_trunc_i64m1(quad);
+}
+inline v_int64 v_dotprod_expand(const v_int16& a, const v_int16& b,
+                                  const v_int64& c)
+{
+    const auto lanes = VTraits<v_int16>::vlanes();
+    // 64-bit ones viewed as 16-bit lanes give {1,0,0,0,...}; comparing with 1 selects every fourth position.
+    vbool16_t every4th = vmseq(vreinterpret_u16m1(vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes())), 1, VTraits<v_uint16>::vlanes());
+    vint32m2_t p0 = vwmul(a, b, lanes);         // p0[i] = a[i] * b[i]
+    vint32m2_t p1 = vslide1down(p0, 0, lanes);  // p1[i] = p0[i+1]
+    vint32m2_t p2 = vslide1down(p1, 0, lanes);  // p2[i] = p0[i+2]
+    vint32m2_t p3 = vslide1down(p2, 0, lanes);  // p3[i] = p0[i+3]
+    vint64m4_t quad = vadd(vwadd_vv(p2, p3, lanes), vwadd_vv(p0, p1, lanes), lanes);  // quad[i] = p0[i] + ... + p0[i+3]
+    quad = vcompress(every4th, quad, quad, lanes);  // keep the sums that start at a multiple of 4
+    return vadd(vlmul_trunc_i64m1(quad), c, VTraits<v_int64>::vlanes());  // accumulate; vl is the 64-bit lane count of the result
+}
+
+// 32 >> 64f
+#if CV_SIMD_SCALABLE_64F
+inline v_float64 v_dotprod_expand(const v_int32& a, const v_int32& b)
+{ return v_cvt_f64(v_dotprod(a, b)); }
+inline v_float64 v_dotprod_expand(const v_int32& a,   const v_int32& b,
+                                    const v_float64& c)
+{ return v_add(v_dotprod_expand(a, b) , c); }
+#endif
+
+//////// Fast Dot Product ////////
+// 16 >> 32
+inline v_int32 v_dotprod_fast(const v_int16& a, const v_int16& b)
+{
+    v_int32 zero = v_setzero_s32();
+    return vredsum(zero, vwmul(a, b, VTraits<v_int16>::vlanes()), zero,  VTraits<v_int16>::vlanes());
+}
+inline v_int32 v_dotprod_fast(const v_int16& a, const v_int16& b, const v_int32& c)
+{
+    v_int32 zero = v_setzero_s32();
+    return vredsum(zero, vwmul(a, b, VTraits<v_int16>::vlanes()), vredsum(zero, c, zero, VTraits<v_int32>::vlanes()),  VTraits<v_int16>::vlanes());
+}
+
+// 32 >> 64
+inline v_int64 v_dotprod_fast(const v_int32& a, const v_int32& b)
+{
+    v_int64 zero = v_setzero_s64();
+    return vredsum(zero, vwmul(a, b, VTraits<v_int32>::vlanes()), zero,  VTraits<v_int32>::vlanes());
+}
+inline v_int64 v_dotprod_fast(const v_int32& a, const v_int32& b, const v_int64& c)
+{
+    v_int64 zero = v_setzero_s64();
+    return vadd(vredsum(zero, vwmul(a, b, VTraits<v_int32>::vlanes()), zero,  VTraits<v_int32>::vlanes()) , vredsum(zero, c, zero, VTraits<v_int64>::vlanes()), VTraits<v_int64>::vlanes());
+}
+
+
+// 8 >> 32
+inline v_uint32 v_dotprod_expand_fast(const v_uint8& a, const v_uint8& b)
+{
+    v_uint32 zero = v_setzero_u32();
+    return vwredsumu(zero, vwmulu(a, b, VTraits<v_uint8>::vlanes()), zero,  VTraits<v_uint8>::vlanes());
+}
+inline v_uint32 v_dotprod_expand_fast(const v_uint8& a, const v_uint8& b, const v_uint32& c)
+{
+    v_uint32 zero = v_setzero_u32();
+    return vadd(vwredsumu(zero, vwmulu(a, b, VTraits<v_uint8>::vlanes()), zero,  VTraits<v_uint8>::vlanes()) , vredsum(zero, c, zero, VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes());
+}
+inline v_int32 v_dotprod_expand_fast(const v_int8& a, const v_int8& b)
+{
+    v_int32 zero = v_setzero_s32();
+    return vwredsum(zero, vwmul(a, b, VTraits<v_int8>::vlanes()), zero,  VTraits<v_int8>::vlanes());
+}
+inline v_int32 v_dotprod_expand_fast(const v_int8& a, const v_int8& b, const v_int32& c)
+{
+    v_int32 zero = v_setzero_s32();
+    return vadd(vwredsum(zero, vwmul(a, b, VTraits<v_int8>::vlanes()), zero,  VTraits<v_int8>::vlanes()) , vredsum(zero, c, zero, VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes());
+}
+
+// 16 >> 64
+inline v_uint64 v_dotprod_expand_fast(const v_uint16& a, const v_uint16& b)
+{
+    v_uint64 zero = v_setzero_u64();
+    return vwredsumu(zero, vwmulu(a, b, VTraits<v_uint16>::vlanes()), zero,  VTraits<v_uint16>::vlanes());
+}
+inline v_uint64 v_dotprod_expand_fast(const v_uint16& a, const v_uint16& b, const v_uint64& c)
+{
+    v_uint64 zero = v_setzero_u64();
+    return vadd(vwredsumu(zero, vwmulu(a, b, VTraits<v_uint16>::vlanes()), zero,  VTraits<v_uint16>::vlanes()), vredsum(zero, c, zero, VTraits<v_uint64>::vlanes()), VTraits<v_uint64>::vlanes());
+}
+inline v_int64 v_dotprod_expand_fast(const v_int16& a, const v_int16& b)
+{
+    v_int64 zero = v_setzero_s64();
+    return vwredsum(zero, vwmul(a, b, VTraits<v_int16>::vlanes()), zero,  VTraits<v_int16>::vlanes());
+}
+inline v_int64 v_dotprod_expand_fast(const v_int16& a, const v_int16& b, const v_int64& c)
+{
+    v_int64 zero = v_setzero_s64();
+    return vadd(vwredsum(zero, vwmul(a, b, VTraits<v_int16>::vlanes()), zero,  VTraits<v_int16>::vlanes()), vredsum(zero, c, zero, VTraits<v_int64>::vlanes()), VTraits<v_int64>::vlanes());
+}
+
+// 32 >> 64f
+#if CV_SIMD_SCALABLE_64F
+inline v_float64 v_dotprod_expand_fast(const v_int32& a, const v_int32& b)
+{ return v_cvt_f64(v_dotprod_fast(a, b)); }
+inline v_float64 v_dotprod_expand_fast(const v_int32& a, const v_int32& b, const v_float64& c)
+{ return v_add(v_dotprod_expand_fast(a, b) , c); }
+#endif
+
+// TODO: only 128 bit now (just lanes 0-3 of v are used).
+inline v_float32 v_matmul(const v_float32& v, const v_float32& m0,
+                            const v_float32& m1, const v_float32& m2,
+                            const v_float32& m3)
+{
+    const auto lanes = VTraits<v_float32>::vlanes();
+    vfloat32m1_t acc = vfmul_vf_f32m1(m0, v_extract_n(v, 0), lanes);  // acc  = v[0] * m0
+    acc = vfmacc_vf_f32m1(acc, v_extract_n(v, 1), m1, lanes);         // acc += v[1] * m1
+    acc = vfmacc_vf_f32m1(acc, v_extract_n(v, 2), m2, lanes);         // acc += v[2] * m2
+    acc = vfmacc_vf_f32m1(acc, v_extract_n(v, 3), m3, lanes);         // acc += v[3] * m3
+    return acc;
+}
+
+// TODO: only 128 bit now (just lanes 0-2 of v are used).
+inline v_float32 v_matmuladd(const v_float32& v, const v_float32& m0,
+                               const v_float32& m1, const v_float32& m2,
+                               const v_float32& a) {
+    const auto lanes = VTraits<v_float32>::vlanes();
+    vfloat32m1_t acc = vfmul_vf_f32m1(m0, v_extract_n(v,0), lanes);  // acc  = v[0] * m0
+    acc = vfmacc_vf_f32m1(acc, v_extract_n(v,1), m1, lanes);         // acc += v[1] * m1
+    acc = vfmacc_vf_f32m1(acc, v_extract_n(v,2), m2, lanes);         // acc += v[2] * m2
+    return vfadd(acc, a, lanes);                                     // plus the additive term a
+}
+
+inline void v_cleanup() {}  // empty body: this backend performs no work here
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+} //namespace cv
+
+#endif //OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_sse.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_sse.hpp
new file mode 100644
index 000000000000..68b5a67bbc00
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_sse.hpp
@@ -0,0 +1,3468 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HAL_SSE_HPP
+#define OPENCV_HAL_SSE_HPP
+
+#include <algorithm>
+#include "opencv2/core/utility.hpp"
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+#define CV_SIMD128_FP16 0  // no native operations with FP16 type.
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+//
+// Compilation troubleshooting:
+// - MSVC: error C2719: 'a': formal parameter with requested alignment of 16 won't be aligned
+//   Change the by-value parameter declaration to a const reference:
+//   -v_int32x4 a
+//   +const v_int32x4& a
+//
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+///////// Types ////////////
+
+struct v_uint8x16
+{
+    typedef uchar lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 16 };
+    // Wrapper around one __m128i viewed as 16 uchar lanes; the default constructor leaves val unset.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint8x16() {}
+    explicit v_uint8x16(__m128i v) : val(v) {}
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+        : val(_mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
+                            (char)v4, (char)v5, (char)v6, (char)v7,
+                            (char)v8, (char)v9, (char)v10, (char)v11,
+                            (char)v12, (char)v13, (char)v14, (char)v15))
+    {}
+
+    // Returns lane 0: the low byte of the register's low 32 bits.
+    uchar get0() const
+    {
+        const int low32 = _mm_cvtsi128_si32(val);
+        return (uchar)low32;
+    }
+    __m128i val;
+};
+
+struct v_int8x16
+{
+    typedef schar lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 16 };
+    // Wrapper around one __m128i viewed as 16 schar lanes; the default constructor leaves val unset.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int8x16() {}
+    explicit v_int8x16(__m128i v) : val(v) {}
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+        : val(_mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
+                            (char)v4, (char)v5, (char)v6, (char)v7,
+                            (char)v8, (char)v9, (char)v10, (char)v11,
+                            (char)v12, (char)v13, (char)v14, (char)v15))
+    {}
+
+    // Returns lane 0: the low byte of the register's low 32 bits.
+    schar get0() const
+    {
+        const int low32 = _mm_cvtsi128_si32(val);
+        return (schar)low32;
+    }
+    __m128i val;
+};
+
+struct v_uint16x8
+{
+    typedef ushort lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 8 };
+    // Wrapper around one __m128i viewed as 8 ushort lanes; the default constructor leaves val unset.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint16x8() {}
+    explicit v_uint16x8(__m128i v) : val(v) {}
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+        : val(_mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
+                             (short)v4, (short)v5, (short)v6, (short)v7))
+    {}
+
+    // Returns lane 0: the low 16 bits of the register's low 32 bits.
+    ushort get0() const
+    {
+        const int low32 = _mm_cvtsi128_si32(val);
+        return (ushort)low32;
+    }
+    __m128i val;
+};
+
+struct v_int16x8
+{
+    typedef short lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 8 };
+    // Wrapper around one __m128i viewed as 8 short lanes; the default constructor leaves val unset.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int16x8() {}
+    explicit v_int16x8(__m128i v) : val(v) {}
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+        : val(_mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
+                             (short)v4, (short)v5, (short)v6, (short)v7))
+    {}
+
+    // Returns lane 0: the low 16 bits of the register's low 32 bits.
+    short get0() const
+    {
+        const int low32 = _mm_cvtsi128_si32(val);
+        return (short)low32;
+    }
+    __m128i val;
+};
+
+struct v_uint32x4
+{
+    typedef unsigned lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 4 };
+    // Wrapper around one __m128i viewed as 4 unsigned lanes; the default constructor leaves val unset.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint32x4() {}
+    explicit v_uint32x4(__m128i v) : val(v) {}
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+        : val(_mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3))
+    {}
+
+    // Returns lane 0: the register's low 32 bits.
+    unsigned get0() const
+    {
+        const int low32 = _mm_cvtsi128_si32(val);
+        return (unsigned)low32;
+    }
+    __m128i val;
+};
+
+struct v_int32x4
+{
+    typedef int lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 4 };
+    // Wrapper around one __m128i viewed as 4 int lanes; the default constructor leaves val unset.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int32x4() {}
+    explicit v_int32x4(__m128i v) : val(v) {}
+    v_int32x4(int v0, int v1, int v2, int v3)
+        : val(_mm_setr_epi32(v0, v1, v2, v3))
+    {}
+
+    // Returns lane 0: the register's low 32 bits.
+    int get0() const
+    {
+        const int low32 = _mm_cvtsi128_si32(val);
+        return low32;
+    }
+    __m128i val;
+};
+
+struct v_float32x4
+{
+    typedef float lane_type;
+    typedef __m128 vector_type;
+    enum { nlanes = 4 };
+    // Wrapper around one __m128 viewed as 4 float lanes; the default constructor leaves val unset.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_float32x4() {}
+    explicit v_float32x4(__m128 v) : val(v) {}
+    v_float32x4(float v0, float v1, float v2, float v3)
+        : val(_mm_setr_ps(v0, v1, v2, v3))
+    {}
+
+    // Returns lane 0: the lowest float of the register.
+    float get0() const
+    {
+        const float first = _mm_cvtss_f32(val);
+        return first;
+    }
+    __m128 val;
+};
+
+struct v_uint64x2
+{
+    typedef uint64 lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 2 };
+    // Wrapper around one __m128i viewed as 2 uint64 lanes; the default constructor leaves val unset.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint64x2() {}
+    explicit v_uint64x2(__m128i v) : val(v) {}
+    v_uint64x2(uint64 v0, uint64 v1)
+    {
+#if defined(_MSC_VER) && _MSC_VER >= 1920/*MSVS 2019*/ && defined(_M_X64) && !defined(__clang__)
+        val = _mm_setr_epi64x((int64_t)v0, (int64_t)v1);
+#elif defined(__GNUC__)
+        val = _mm_setr_epi64((__m64)v0, (__m64)v1);
+#else // generic path: build each 64-bit lane from its low and high 32-bit halves
+        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
+#endif
+    }
+
+    uint64 get0() const
+    {
+    #if defined(__x86_64__) || defined(_M_X64)
+        return (uint64)_mm_cvtsi128_si64(val);
+    #else // not an x86-64 build: combine the two 32-bit halves of lane 0
+        const int lo = _mm_cvtsi128_si32(val);
+        const int hi = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
+        return (unsigned)lo | ((uint64)(unsigned)hi << 32);
+    #endif
+    }
+
+    __m128i val;
+};
+
+struct v_int64x2
+{
+    typedef int64 lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 2 };
+    // Wrapper around one __m128i viewed as 2 int64 lanes; the default constructor leaves val unset.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int64x2() {}
+    explicit v_int64x2(__m128i v) : val(v) {}
+    v_int64x2(int64 v0, int64 v1)
+    {
+#if defined(_MSC_VER) && _MSC_VER >= 1920/*MSVS 2019*/ && defined(_M_X64) && !defined(__clang__)
+        val = _mm_setr_epi64x((int64_t)v0, (int64_t)v1);
+#elif defined(__GNUC__)
+        val = _mm_setr_epi64((__m64)v0, (__m64)v1);
+#else // generic path: build each 64-bit lane from its low and high 32-bit halves
+        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
+#endif
+    }
+
+    int64 get0() const
+    {
+    #if defined(__x86_64__) || defined(_M_X64)
+        return _mm_cvtsi128_si64(val);
+    #else // not an x86-64 build: combine the two 32-bit halves of lane 0
+        const int lo = _mm_cvtsi128_si32(val);
+        const int hi = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
+        return (int64)((unsigned)lo | ((uint64)(unsigned)hi << 32));
+    #endif
+    }
+
+    __m128i val;
+};
+
+struct v_float64x2
+{
+    typedef double lane_type;
+    typedef __m128d vector_type;
+    enum { nlanes = 2 };
+    // Wrapper around one __m128d viewed as 2 double lanes; the default constructor leaves val unset.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_float64x2() {}
+    explicit v_float64x2(__m128d v) : val(v) {}
+    v_float64x2(double v0, double v1)
+        : val(_mm_setr_pd(v0, v1))
+    {}
+
+    // Returns lane 0: the lowest double of the register.
+    double get0() const
+    {
+        const double first = _mm_cvtsd_f64(val);
+        return first;
+    }
+    __m128d val;
+};
+
+namespace hal_sse_internal // reinterpret helpers operating on the raw SSE register types
+{
+    template <typename to_sse_type, typename from_sse_type>
+    to_sse_type v_sse_reinterpret_as(const from_sse_type& val);
+    // Only declared above; each supported (to, from) pair gets an explicit specialization from this macro.
+#define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) \
+    template<> inline \
+    to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \
+    { return sse_cast_intrin(a); }
+    // All nine combinations of __m128i / __m128 / __m128d; same-type pairs use OPENCV_HAL_NOP.
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP)
+}
+
+#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
+inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
+inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
+template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
+{ return _Tpvec(cast(a.val)); }
+// The macro above emits v_setzero_<suffix>, v_setall_<suffix> and a templated v_reinterpret_as_<suffix> per type.
+OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, schar, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, schar, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)
+
+inline v_uint64x2 v_setzero_u64() { const __m128i zero = _mm_setzero_si128(); return v_uint64x2(zero); }
+inline v_int64x2 v_setzero_s64() { const __m128i zero = _mm_setzero_si128(); return v_int64x2(zero); }
+inline v_uint64x2 v_setall_u64(uint64 val) { v_uint64x2 both(val, val); return both; } // val in both lanes
+inline v_int64x2 v_setall_s64(int64 val) { v_int64x2 both(val, val); return both; } // val in both lanes
+
+template<typename _Tpvec> inline
+v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); } // a.val is handed to the __m128i constructor as is
+template<typename _Tpvec> inline
+v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); } // a.val is handed to the __m128i constructor as is
+inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
+{ return v_float32x4(_mm_castsi128_ps(a.val)); } // register cast, integer -> float view
+inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
+{ return v_float32x4(_mm_castsi128_ps(a.val)); } // register cast, integer -> float view
+inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
+{ return v_float64x2(_mm_castsi128_pd(a.val)); } // register cast, integer -> double view
+inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
+{ return v_float64x2(_mm_castsi128_pd(a.val)); } // register cast, integer -> double view
+
+#define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
+inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
+{ return _Tpvec(_mm_castps_si128(a.val)); } \
+inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
+{ return _Tpvec(_mm_castpd_si128(a.val)); }
+// Non-template overloads taking the float/double vectors, emitted for every integer vector type below.
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
+
+inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; } // identity
+inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; } // identity
+inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); } // double register viewed as floats
+inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); } // float register viewed as doubles
+
+//////////////// PACK ///////////////
+inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
+{
+    const __m128i cap = _mm_set1_epi16(255); // x - subs_epu16(x, cap) == min(x, cap), so packus only sees values <= 255
+    const __m128i lo = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, cap)), hi = _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, cap));
+    return v_uint8x16(_mm_packus_epi16(lo, hi));
+}
+
+inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
+{
+    // clamp every lane to 255 via x - subs_epu16(x, 255), narrow to bytes, write the low 8 bytes to ptr
+    const __m128i clamped = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, _mm_set1_epi16(255)));
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(clamped, clamped));
+}
+
+inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
+{ const __m128i narrowed = _mm_packus_epi16(a.val, b.val); return v_uint8x16(narrowed); } // signed 16-bit -> unsigned 8-bit, saturating
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
+{ const __m128i narrowed = _mm_packus_epi16(a.val, a.val); _mm_storel_epi64((__m128i*)ptr, narrowed); } // writes 8 bytes
+
+template<int n> inline
+v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
+{
+    // Rounding right shift; n > 0 is assumed, so the shifted values fit the signed 16-bit input of packus.
+    const __m128i half = _mm_set1_epi16((short)(1 << (n-1)));
+    const __m128i lo = _mm_srli_epi16(_mm_adds_epu16(a.val, half), n), hi = _mm_srli_epi16(_mm_adds_epu16(b.val, half), n);
+    return v_uint8x16(_mm_packus_epi16(lo, hi));
+}
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
+{
+    // add 2^(n-1) with unsigned saturation, shift right by n, narrow to bytes and write 8 of them to ptr
+    const __m128i shifted = _mm_srli_epi16(_mm_adds_epu16(a.val, _mm_set1_epi16((short)(1 << (n-1)))), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(shifted, shifted));
+}
+
+template<int n> inline
+v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
+{
+    const __m128i half = _mm_set1_epi16((short)(1 << (n-1))); // rounding term for the arithmetic shift by n
+    const __m128i lo = _mm_srai_epi16(_mm_adds_epi16(a.val, half), n), hi = _mm_srai_epi16(_mm_adds_epi16(b.val, half), n);
+    return v_uint8x16(_mm_packus_epi16(lo, hi));
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
+{
+    // add 2^(n-1) with signed saturation, shift right arithmetically by n, narrow with unsigned saturation
+    const __m128i shifted = _mm_srai_epi16(_mm_adds_epi16(a.val, _mm_set1_epi16((short)(1 << (n-1)))), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(shifted, shifted));
+}
+
+inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
+{ const __m128i narrowed = _mm_packs_epi16(a.val, b.val); return v_int8x16(narrowed); } // signed saturation to 8 bits
+
+inline void v_pack_store(schar* ptr, const v_int16x8& a)
+{ const __m128i narrowed = _mm_packs_epi16(a.val, a.val); _mm_storel_epi64((__m128i*)ptr, narrowed); } // writes 8 bytes
+
+template<int n> inline
+v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
+{
+    // rounding arithmetic right shift by n (n > 0 assumed), followed by signed saturation to 8 bits
+    const __m128i half = _mm_set1_epi16((short)(1 << (n-1)));
+    const __m128i lo = _mm_srai_epi16(_mm_adds_epi16(a.val, half), n);
+    return v_int8x16(_mm_packs_epi16(lo, _mm_srai_epi16(_mm_adds_epi16(b.val, half), n)));
+}
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
+{
+    // add 2^(n-1) with signed saturation, shift right arithmetically by n (n > 0 assumed),
+    // narrow with signed saturation and write the eight resulting bytes to ptr
+    const __m128i shifted = _mm_srai_epi16(_mm_adds_epi16(a.val, _mm_set1_epi16((short)(1 << (n-1)))), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(shifted, shifted));
+}
+
+
+// Per-byte select "mask ? a : b". NOTE(review): mask bytes are presumably all-ones or all-zeros - confirm with callers.
+inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
+{
+#if !CV_SSE4_1
+    return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask)); // b ^ ((a ^ b) & mask)
+#else
+    return _mm_blendv_epi8(b, a, mask);
+#endif
+}
+
+inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
+{ const __m128i narrowed = _v128_packs_epu32(a.val, b.val); return v_uint16x8(narrowed); } // helper is defined outside this section
+
+inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
+{
+    // lanes with the top bit set become 65535; the -32768 bias lets the signed pack saturate, then it is undone
+    const __m128i too_big = _mm_cmpgt_epi32(_mm_setzero_si128(), a.val);
+    const __m128i biased = _mm_sub_epi32(v_select_si128(too_big, _mm_set1_epi32(65535), a.val), _mm_set1_epi32(32768));
+    _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(_mm_packs_epi32(biased, biased), _mm_set1_epi16(-32768)));
+}
+
+template<int n> inline
+v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
+{
+    const __m128i half = _mm_set1_epi32(1 << (n-1)), bias = _mm_set1_epi32(32768); // bias lets the signed pack saturate to [0, 65535]
+    const __m128i lo = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, half), n), bias);
+    const __m128i hi = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, half), n), bias);
+    return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(lo, hi), _mm_set1_epi16(-32768)));
+}
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
+{
+    // round and shift, bias by -32768 for the signed pack, then undo the bias in the 16-bit lanes
+    const __m128i half = _mm_set1_epi32(1 << (n-1)), bias = _mm_set1_epi32(32768);
+    const __m128i biased = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, half), n), bias);
+    _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(_mm_packs_epi32(biased, biased), _mm_set1_epi16(-32768)));
+}
+
+inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
+{
+#if CV_SSE4_1
+    return v_uint16x8(_mm_packus_epi32(a.val, b.val));
+#else
+    // Fallback without packus_epi32: non-positive lanes are forced to zero, then the
+    // values are biased by -32768 so that the signed pack saturates to the ushort
+    // range, and the bias is added back in the 16-bit lanes.
+    const __m128i zero = _mm_set1_epi32(0), bias = _mm_set1_epi32(32768);
+    const __m128i lo = _mm_sub_epi32(_mm_and_si128(a.val, _mm_cmpgt_epi32(a.val, zero)), bias);
+    const __m128i hi = _mm_sub_epi32(_mm_and_si128(b.val, _mm_cmpgt_epi32(b.val, zero)), bias);
+    const __m128i packed = _mm_packs_epi32(lo, hi);
+    return v_uint16x8(_mm_sub_epi16(packed, _mm_set1_epi16(-32768)));
+#endif
+}
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
+{ // stores the four lanes of a as ushort with unsigned saturation (8 bytes written to ptr)
+#if CV_SSE4_1
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a.val, a.val));
+#else
+    // clamp negative lanes to zero first: a.val - 32768 would wrap around for lanes below INT_MIN + 32768 and store 65535
+    __m128i a1 = _mm_sub_epi32(_mm_and_si128(a.val, _mm_cmpgt_epi32(a.val, _mm_setzero_si128())), _mm_set1_epi32(32768));
+    __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
+    _mm_storel_epi64((__m128i*)ptr, r);
+#endif
+}
+
+template<int n> inline
+v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
+{
+#if CV_SSE4_1
+    const __m128i half = _mm_set1_epi32(1 << (n - 1));
+    const __m128i lo = _mm_srai_epi32(_mm_add_epi32(a.val, half), n), hi = _mm_srai_epi32(_mm_add_epi32(b.val, half), n);
+    return v_uint16x8(_mm_packus_epi32(lo, hi));
+#else
+    // Fallback: bias by -32768 so that the signed pack saturates to [0, 65535],
+    // then remove the bias again in the 16-bit lanes (a fills the low half, b the high half).
+    const __m128i half = _mm_set1_epi32(1 << (n-1)), bias = _mm_set1_epi32(32768);
+    const __m128i lo = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, half), n), bias);
+    const __m128i hi = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, half), n), bias);
+    return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(lo, hi), _mm_set1_epi16(-32768)));
+#endif
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
+{
+#if CV_SSE4_1
+    const __m128i half = _mm_set1_epi32(1 << (n - 1));
+    const __m128i shifted = _mm_srai_epi32(_mm_add_epi32(a.val, half), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(shifted, shifted));
+#else
+    // fallback: bias by -32768 so the signed pack saturates to [0, 65535], then remove the bias
+    const __m128i half = _mm_set1_epi32(1 << (n-1)), bias = _mm_set1_epi32(32768);
+    const __m128i biased = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, half), n), bias);
+    _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(_mm_packs_epi32(biased, biased), _mm_set1_epi16(-32768)));
+#endif
+}
+
+inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
+{ const __m128i narrowed = _mm_packs_epi32(a.val, b.val); return v_int16x8(narrowed); } // signed saturation to 16 bits
+
+// Narrows a to 16 bits with signed saturation and writes the four results (8 bytes) to ptr;
+// both halves of the packed register hold a's lanes, only the low half is stored.
+inline void v_pack_store(short* ptr, const v_int32x4& a)
+{ const __m128i narrowed = _mm_packs_epi32(a.val, a.val); _mm_storel_epi64((__m128i*)ptr, narrowed); }
+
+template<int n> inline
+v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
+{
+    const __m128i half = _mm_set1_epi32(1 << (n-1)); // rounding term for the arithmetic shift by n
+    const __m128i lo = _mm_srai_epi32(_mm_add_epi32(a.val, half), n), hi = _mm_srai_epi32(_mm_add_epi32(b.val, half), n);
+    return v_int16x8(_mm_packs_epi32(lo, hi));
+}
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x4& a)
+{
+    // add 2^(n-1), shift right arithmetically by n, narrow with signed saturation, store four shorts
+    const __m128i shifted = _mm_srai_epi32(_mm_add_epi32(a.val, _mm_set1_epi32(1 << (n-1))), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(shifted, shifted));
+}
+
+
+// Keeps the low 32 bits of every 64-bit lane (no saturation): result = [a0 a1 b0 b1].
+inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
+{
+    const __m128i lane0 = _mm_unpacklo_epi32(a.val, b.val); // a0.lo b0.lo a0.hi b0.hi
+    const __m128i lane1 = _mm_unpackhi_epi32(a.val, b.val); // a1.lo b1.lo a1.hi b1.hi
+    return v_uint32x4(_mm_unpacklo_epi32(lane0, lane1));    // a0.lo a1.lo b0.lo b1.lo
+}
+
+inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
+{
+    // dwords 0 and 2 (the low halves of both 64-bit lanes) are moved into the low 8 bytes and stored
+    _mm_storel_epi64((__m128i*)ptr, _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0)));
+}
+
+// Keeps the low 32 bits of every 64-bit lane (no saturation): result = [a0 a1 b0 b1].
+inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
+{
+    const __m128i lane0 = _mm_unpacklo_epi32(a.val, b.val); // a0.lo b0.lo a0.hi b0.hi
+    const __m128i lane1 = _mm_unpackhi_epi32(a.val, b.val); // a1.lo b1.lo a1.hi b1.hi
+    return v_int32x4(_mm_unpacklo_epi32(lane0, lane1));     // a0.lo a1.lo b0.lo b1.lo
+}
+
+inline void v_pack_store(int* ptr, const v_int64x2& a)
+{
+    // dwords 0 and 2 (the low halves of both 64-bit lanes) are moved into the low 8 bytes and stored
+    _mm_storel_epi64((__m128i*)ptr, _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0)));
+}
+
+template<int n> inline
+v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
+{
+    // rounding logical right shift of every 64-bit lane by n, then keep the low 32 bits: [a0 a1 b0 b1]
+    const uint64 half = (uint64)1 << (n-1);
+    const __m128i half2 = v_uint64x2(half, half).val;
+    const __m128i sa = _mm_srli_epi64(_mm_add_epi64(a.val, half2), n);
+    const __m128i sb = _mm_srli_epi64(_mm_add_epi64(b.val, half2), n);
+    const __m128i lane0 = _mm_unpacklo_epi32(sa, sb), lane1 = _mm_unpackhi_epi32(sa, sb);
+    return v_uint32x4(_mm_unpacklo_epi32(lane0, lane1));
+}
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
+{
+    // rounding logical right shift of both 64-bit lanes by n; their low 32 bits are written to ptr
+    const uint64 half = (uint64)1 << (n-1);
+    const __m128i half2 = v_uint64x2(half, half).val;
+    const __m128i shifted = _mm_srli_epi64(_mm_add_epi64(a.val, half2), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_shuffle_epi32(shifted, _MM_SHUFFLE(0, 2, 2, 0)));
+}
+
+inline __m128i v_sign_epi64(__m128i a)
+{   // per 64-bit lane sign mask: srai fills every dword with its own sign, the shuffle copies dwords 1 and 3 over each lane
+    return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // before shuffle: x m0 | x m1; after: m0 m0 | m1 m1
+}
+
+inline __m128i v_srai_epi64(__m128i a, int imm)
+{
+    const __m128i sign = v_sign_epi64(a); // mask derived from the sign of each 64-bit lane
+    return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, sign), imm), sign); // flip, logical shift, flip back
+}
+
+template<int n> inline
+v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
+{
+    // rounding arithmetic right shift of every 64-bit lane by n, then keep the low 32 bits: [a0 a1 b0 b1]
+    const int64 half = (int64)1 << (n-1);
+    const __m128i half2 = v_int64x2(half, half).val;
+    const __m128i sa = v_srai_epi64(_mm_add_epi64(a.val, half2), n);
+    const __m128i sb = v_srai_epi64(_mm_add_epi64(b.val, half2), n);
+    const __m128i lane0 = _mm_unpacklo_epi32(sa, sb), lane1 = _mm_unpackhi_epi32(sa, sb);
+    return v_int32x4(_mm_unpacklo_epi32(lane0, lane1));
+}
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x2& a)
+{
+    // rounding arithmetic right shift of both 64-bit lanes by n; their low 32 bits are written to ptr
+    const int64 half = (int64)1 << (n-1);
+    const __m128i half2 = v_int64x2(half, half).val;
+    const __m128i shifted = v_srai_epi64(_mm_add_epi64(a.val, half2), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_shuffle_epi32(shifted, _MM_SHUFFLE(0, 2, 2, 0)));
+}
+
+// pack boolean: narrows 16-bit lanes to 8-bit lanes with a signed-saturating pack
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{
+    // signed saturation maps 0 to 0 and 0xFFFF (-1) to 0xFF; inputs are presumably all-ones/all-zeros masks
+    return v_uint8x16(_mm_packs_epi16(a.val, b.val));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    // two rounds of signed-saturating packs: 32 -> 16 -> 8 bits per lane, in the order a, b, c, d
+    const __m128i lo16 = _mm_packs_epi32(a.val, b.val), hi16 = _mm_packs_epi32(c.val, d.val);
+    return v_uint8x16(_mm_packs_epi16(lo16, hi16));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    const __m128i p_ab = _mm_packs_epi32(a.val, b.val);
+    const __m128i p_cd = _mm_packs_epi32(c.val, d.val);
+    const __m128i p_ef = _mm_packs_epi32(e.val, f.val);
+    const __m128i p_gh = _mm_packs_epi32(g.val, h.val);
+    // second epi32 round: for all-ones/all-zeros inputs each former 64-bit lane is now one 32-bit lane
+    const __m128i p_abcd = _mm_packs_epi32(p_ab, p_cd);
+    const __m128i p_efgh = _mm_packs_epi32(p_ef, p_gh);
+    return v_uint8x16(_mm_packs_epi16(p_abcd, p_efgh));
+}
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{
+    const __m128 t0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val); // v[0] broadcast * m0
+    const __m128 t1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
+    const __m128 t2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
+    const __m128 t3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);
+    // summed as (t0 + t1) + (t2 + t3); the grouping matters for float rounding
+    return v_float32x4(_mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3)));
+}
+
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a)
+{
+    const __m128 t0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val); // v[0] broadcast * m0
+    const __m128 t1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
+    const __m128 t2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
+    // summed as (t0 + t1) + (t2 + a); v[3] is not used, a takes the place of the fourth term
+    return v_float32x4(_mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, a.val)));
+}
+
+#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return _Tpvec(intrin(a.val, b.val)); \
+    } \
+    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+    { \
+        a.val = intrin(a.val, b.val); \
+        return a; \
+    }
+// Emits "op" and "op=" per type. 8/16-bit integer +/- map to saturating intrinsics (adds/subs), 32/64-bit ones to add/sub.
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
+
+// Saturating multiply for 8- and 16-bit lanes: widen the products with v_mul_expand(), then narrow them with v_pack().
+#define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec)             \
+    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
+    {                                                            \
+        _Tpwvec c, d;                                            \
+        v_mul_expand(a, b, c, d);                                \
+        return v_pack(c, d);                                     \
+    }                                                            \
+    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
+    { a = a * b; return a; }
+// _Tpwvec is the vector type whose lanes are twice as wide as those of _Tpvec.
+OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8)
+OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16,  v_int16x8)
+OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint16x8, v_uint32x4)
+OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int16x8,  v_int32x4)
+
+//  Multiply and expand
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
+                         v_uint16x8& c, v_uint16x8& d)
+{
+    v_uint16x8 wa0, wa1, wb0, wb1; // both operands widened to 16-bit lanes by v_expand()
+    v_expand(b, wb0, wb1);
+    v_expand(a, wa0, wa1);
+    c = v_mul_wrap(wa0, wb0);
+    d = v_mul_wrap(wa1, wb1);
+}
+
+inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
+                         v_int16x8& c, v_int16x8& d)
+{
+    v_int16x8 wa0, wa1, wb0, wb1; // both operands widened to 16-bit lanes by v_expand()
+    v_expand(b, wb0, wb1);
+    v_expand(a, wa0, wa1);
+    c = v_mul_wrap(wa0, wb0);
+    d = v_mul_wrap(wa1, wb1);
+}
+
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
+                         v_int32x4& c, v_int32x4& d)
+{
+    const __m128i lo16 = _mm_mullo_epi16(a.val, b.val); // low 16 bits of every product
+    const __m128i hi16 = _mm_mulhi_epi16(a.val, b.val); // high 16 bits (signed multiply)
+    d.val = _mm_unpackhi_epi16(lo16, hi16);             // products of lanes 4..7
+    c.val = _mm_unpacklo_epi16(lo16, hi16);             // products of lanes 0..3
+}
+
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
+                         v_uint32x4& c, v_uint32x4& d)
+{
+    const __m128i lo16 = _mm_mullo_epi16(a.val, b.val); // low 16 bits of every product
+    const __m128i hi16 = _mm_mulhi_epu16(a.val, b.val); // high 16 bits (unsigned multiply)
+    d.val = _mm_unpackhi_epi16(lo16, hi16);             // products of lanes 4..7
+    c.val = _mm_unpacklo_epi16(lo16, hi16);             // products of lanes 0..3
+}
+
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d)
+{
+    const __m128i even = _mm_mul_epu32(a.val, b.val); // 64-bit products of lanes 0 and 2
+    const __m128i odd = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); // lanes 1 and 3
+    d.val = _mm_unpackhi_epi64(even, odd); // products of lanes 2 and 3
+    c.val = _mm_unpacklo_epi64(even, odd); // products of lanes 0 and 1
+}
+
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { const __m128i hi = _mm_mulhi_epi16(a.val, b.val); return v_int16x8(hi); }
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { const __m128i hi = _mm_mulhi_epu16(a.val, b.val); return v_uint16x8(hi); }
+
+//////// Dot Product ////////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{ const __m128i pair_sums = _mm_madd_epi16(a.val, b.val); return v_int32x4(pair_sums); } // sums of adjacent 16-bit products
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{ const v_int32x4 prod = v_dotprod(a, b); return prod + c; }
+
+// 32 >> 64
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
+{
+#if CV_SSE4_1
+    const __m128i prod02 = _mm_mul_epi32(a.val, b.val); // signed 64-bit products of lanes 0 and 2
+    const __m128i prod13 = _mm_mul_epi32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); // lanes 1 and 3
+    return v_int64x2(_mm_add_epi64(prod02, prod13));
+#else
+    // Without SSE4.1 the unsigned multiply is used and its result corrected to the signed product
+    // (from: Agner Fog(veclib) and H S Warren: Hacker's delight, 2003, p. 132)
+    const __m128i uprod02 = _mm_mul_epu32(a.val, b.val);
+    const __m128i uprod13 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
+    const __m128i neg_a = _mm_srai_epi32(a.val, 31);
+    const __m128i neg_b = _mm_srai_epi32(b.val, 31);
+    // per lane: (b if a < 0 else 0) + (a if b < 0 else 0); times 2^32 this is the excess of the unsigned product
+    const __m128i corr = _mm_add_epi32(_mm_and_si128(b.val, neg_a), _mm_and_si128(a.val, neg_b));
+    // place each lane's correction in the upper dword of the matching 64-bit product
+    const __m128i corr02 = _mm_slli_epi64(corr, 32);
+    const __m128i corr13 = _mm_and_si128(corr, _mm_set_epi32(-1, 0, -1, 0));
+    // signed product = unsigned product - correction; then add the even and odd halves
+    const __m128i sprod02 = _mm_sub_epi64(uprod02, corr02);
+    return v_int64x2(_mm_add_epi64(sprod02, _mm_sub_epi64(uprod13, corr13)));
+#endif
+}
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{ return v_dotprod(a, b) + c; }
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{   // zero-extend the even and the odd bytes to 16 bits, then let _mm_madd_epi16 multiply and pair-sum them
+    const __m128i a_odd = _mm_srli_epi16(a.val, 8);
+    const __m128i b_odd = _mm_srli_epi16(b.val, 8);
+    const __m128i a_even = _mm_srli_epi16(_mm_slli_si128(a.val, 1), 8);
+    const __m128i b_even = _mm_srli_epi16(_mm_slli_si128(b.val, 1), 8);
+    const __m128i sum_odd = _mm_madd_epi16(a_odd, b_odd);
+    const __m128i sum_even = _mm_madd_epi16(a_even, b_even);
+    return v_uint32x4(_mm_add_epi32(sum_even, sum_odd));
+}
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{ const v_uint32x4 ab = v_dotprod_expand(a, b); return ab + c; }   // same, accumulated onto c
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
+{   // even/odd byte split using arithmetic shifts, so every byte is sign-extended to 16 bits
+    const __m128i a_odd = _mm_srai_epi16(a.val, 8);
+    const __m128i b_odd = _mm_srai_epi16(b.val, 8);
+    const __m128i a_even = _mm_srai_epi16(_mm_slli_si128(a.val, 1), 8);
+    const __m128i b_even = _mm_srai_epi16(_mm_slli_si128(b.val, 1), 8);
+    const __m128i sum_odd = _mm_madd_epi16(a_odd, b_odd);
+    const __m128i sum_even = _mm_madd_epi16(a_even, b_even);
+    return v_int32x4(_mm_add_epi32(sum_even, sum_odd));
+}
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{ const v_int32x4 ab = v_dotprod_expand(a, b); return ab + c; }   // same, accumulated onto c
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{
+    v_uint32x4 prod_lo, prod_hi;            // 32-bit products written by v_mul_expand
+    v_mul_expand(a, b, prod_lo, prod_hi);
+
+    v_uint64x2 lo0, lo1, hi0, hi1;          // the same products widened to 64 bits
+    v_expand(prod_lo, lo0, lo1);
+    v_expand(prod_hi, hi0, hi1);
+
+    const v_uint64x2 lo_sum = lo0 + lo1;
+    const v_uint64x2 hi_sum = hi0 + hi1;
+    const __m128i first = _mm_unpacklo_epi64(lo_sum.val, hi_sum.val);
+    const __m128i second = _mm_unpackhi_epi64(lo_sum.val, hi_sum.val);
+    return v_uint64x2(_mm_add_epi64(first, second));   // element 0 + element 1 of each partial sum
+}
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{ const v_uint64x2 ab = v_dotprod_expand(a, b); return ab + c; }   // same, accumulated onto c
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
+{
+    const v_int32x4 pair_sums = v_dotprod(a, b);
+    v_int64x2 wide0, wide1;                       // the 32-bit pair sums widened to 64 bits
+    v_expand(pair_sums, wide0, wide1);
+    const __m128i first = _mm_unpacklo_epi64(wide0.val, wide1.val);
+    const __m128i second = _mm_unpackhi_epi64(wide0.val, wide1.val);
+    // element 0 + element 1 of each widened vector
+    return v_int64x2(_mm_add_epi64(first, second));
+}
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{ const v_int64x2 ab = v_dotprod_expand(a, b); return ab + c; }   // same, accumulated onto c
+
+// 32 >> 64f
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{
+#if CV_SSE4_1
+    return v_cvt_f64(v_dotprod(a, b));   // integer dot product first, converted to double afterwards
+#else
+    const v_float64x2 prod_lo = v_cvt_f64(a) * v_cvt_f64(b);
+    const v_float64x2 prod_hi = v_cvt_f64_high(a) * v_cvt_f64_high(b);
+    // add the two products held in each vector
+    const __m128d first = _mm_unpacklo_pd(prod_lo.val, prod_hi.val);
+    const __m128d second = _mm_unpackhi_pd(prod_lo.val, prod_hi.val);
+    const __m128d total = _mm_add_pd(first, second);
+    return v_float64x2(total);
+#endif
+}
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ const v_float64x2 ab = v_dotprod_expand(a, b); return ab + c; }   // same, accumulated onto c
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{ const v_int32x4 ab = v_dotprod(a, b); return ab; }   // simply forwards to v_dotprod
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{ return v_dotprod(a, b, c); }   // the three-argument v_dotprod is v_dotprod(a, b) + c
+
+// 32 >> 64 (v_int32x4 inputs, v_int64x2 result)
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{ const v_int64x2 ab = v_dotprod(a, b); return ab; }   // simply forwards to v_dotprod
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{ return v_dotprod(a, b, c); }   // the three-argument v_dotprod is v_dotprod(a, b) + c
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{   // expand both halves and pair-sum the products; no even/odd byte split is performed here
+    const __m128i a_lo = v_expand_low(a).val;
+    const __m128i b_lo = v_expand_low(b).val;
+    const __m128i a_hi = v_expand_high(a).val;
+    const __m128i b_hi = v_expand_high(b).val;
+    const __m128i sum_lo = _mm_madd_epi16(a_lo, b_lo);
+    const __m128i sum_hi = _mm_madd_epi16(a_hi, b_hi);
+    return v_uint32x4(_mm_add_epi32(sum_lo, sum_hi));
+}
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{ const v_uint32x4 ab = v_dotprod_expand_fast(a, b); return ab + c; }   // same, accumulated onto c
+
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{
+#if CV_SSE4_1
+    const __m128i a_lo = _mm_cvtepi8_epi16(a.val);   // low 8 bytes widened to 16 bits
+    const __m128i b_lo = _mm_cvtepi8_epi16(b.val);
+    const __m128i a_hi = v_expand_high(a).val;
+    const __m128i b_hi = v_expand_high(b).val;
+    const __m128i sum_lo = _mm_madd_epi16(a_lo, b_lo);
+    const __m128i sum_hi = _mm_madd_epi16(a_hi, b_hi);
+    return v_int32x4(_mm_add_epi32(sum_lo, sum_hi));
+#else
+    return v_dotprod_expand(a, b);   // without SSE4.1 the regular variant is used
+#endif
+}
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{ const v_int32x4 ab = v_dotprod_expand_fast(a, b); return ab + c; }   // same, accumulated onto c
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{
+    v_uint32x4 prod_lo, prod_hi;            // 32-bit products written by v_mul_expand
+    v_mul_expand(a, b, prod_lo, prod_hi);
+
+    v_uint64x2 lo0, lo1, hi0, hi1;          // the same products widened to 64 bits
+    v_expand(prod_lo, lo0, lo1);
+    v_expand(prod_hi, hi0, hi1);
+
+    // plain element-wise total of the four widened vectors (no unpack step)
+    return (lo0 + lo1) + (hi0 + hi1);
+}
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{ const v_uint64x2 ab = v_dotprod_expand_fast(a, b); return ab + c; }   // same, accumulated onto c
+
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    const v_int32x4 pair_sums = v_dotprod(a, b);
+    v_int64x2 wide0, wide1;
+    v_expand(pair_sums, wide0, wide1);   // widen the 32-bit pair sums to 64 bits
+    return wide0 + wide1;
+}
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{ const v_int64x2 ab = v_dotprod_expand_fast(a, b); return ab + c; }   // same, accumulated onto c
+
+// 32 >> 64f
+v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c);   // forward declaration; used by the two functions below
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{ const v_float64x2 prod_hi = v_cvt_f64_high(a) * v_cvt_f64_high(b); return v_fma(v_cvt_f64(a), v_cvt_f64(b), prod_hi); }
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ const v_float64x2 hi_plus_c = v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c); return v_fma(v_cvt_f64(a), v_cvt_f64(b), hi_plus_c); }
+
+#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) /* bitwise &, |, ^ and ~ for _Tpvec */ \
+    OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
+    OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
+    OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
+    inline _Tpvec operator ~ (const _Tpvec& a) /* complement = XOR with not_const */ \
+    { \
+        return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
+    }
+
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))   // integer types: si128 intrinsics, all-ones mask for ~
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))   // float types: the same all-ones bits, cast to the float register type
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
+
+inline v_float32x4 v_sqrt(const v_float32x4& x)
+{ const __m128 root = _mm_sqrt_ps(x.val); return v_float32x4(root); }
+
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{   // _mm_rsqrt_ps estimate y refined once: y * (1.5 - (y * y) * (0.5 * x))
+    const __m128 half_x = _mm_mul_ps(x.val, _mm_set1_ps(0.5f));
+    const __m128 y = _mm_rsqrt_ps(x.val);
+    const __m128 y_sq = _mm_mul_ps(y, y);
+    const __m128 corr = _mm_sub_ps(_mm_set1_ps(1.5f), _mm_mul_ps(y_sq, half_x));
+    const __m128 refined = _mm_mul_ps(y, corr);
+    return v_float32x4(refined);
+}
+
+inline v_float64x2 v_sqrt(const v_float64x2& x)
+{ const __m128d root = _mm_sqrt_pd(x.val); return v_float64x2(root); }
+
+inline v_float64x2 v_invsqrt(const v_float64x2& x)
+{   // 1.0 divided by the packed square root
+    const __m128d root = _mm_sqrt_pd(x.val);
+    return v_float64x2(_mm_div_pd(_mm_set1_pd(1.), root));
+}
+
+#define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
+inline _Tpuvec v_abs(const _Tpsvec& x) /* combines x with (0 - x) through the chosen min/max intrinsic */ \
+{ const __m128i negated = _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val); return _Tpuvec(_mm_##func##_ep##suffix(x.val, negated)); }
+
+OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8)     // unsigned byte min of x and -x
+OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16)   // signed 16-bit max of x and -x
+inline v_uint32x4 v_abs(const v_int32x4& x)
+{   // (x ^ m) + s, where m is all ones and s is 1 in the negative lanes (both are 0 elsewhere)
+    const __m128i neg_mask = _mm_srai_epi32(x.val, 31);
+    const __m128i neg_one = _mm_srli_epi32(x.val, 31);
+    return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, neg_mask), neg_one));
+}
+inline v_float32x4 v_abs(const v_float32x4& x)
+{ const __m128 magnitude_bits = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); return v_float32x4(_mm_and_ps(x.val, magnitude_bits)); }
+inline v_float64x2 v_abs(const v_float64x2& x)
+{   // all ones shifted right by one bit per 64-bit lane: every bit except the top one
+    const __m128d magnitude_bits = _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1));
+    return v_float64x2(_mm_and_pd(x.val, magnitude_bits));
+}
+
+// TODO: exp, log, sin, cos
+
+#define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) /* defines func(a, b) as a thin wrapper over intrin(a.val, b.val) */ \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)   // v_min / v_max overloads that map onto one intrinsic each
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
+
+inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
+{
+#if CV_SSE4_1
+    return v_int8x16(_mm_min_epi8(a.val, b.val));
+#else
+    const __m128i bias = _mm_set1_epi8((char)-128);   // flipping the sign bit maps signed order onto unsigned order
+    const __m128i biased_min = _mm_min_epu8(_mm_xor_si128(a.val, bias), _mm_xor_si128(b.val, bias));
+    return v_int8x16(_mm_xor_si128(bias, biased_min));
+#endif
+}
+inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
+{
+#if CV_SSE4_1
+    return v_int8x16(_mm_max_epi8(a.val, b.val));
+#else
+    const __m128i bias = _mm_set1_epi8((char)-128);   // flipping the sign bit maps signed order onto unsigned order
+    const __m128i biased_max = _mm_max_epu8(_mm_xor_si128(a.val, bias), _mm_xor_si128(b.val, bias));
+    return v_int8x16(_mm_xor_si128(bias, biased_max));
+#endif
+}
+inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
+{
+#if CV_SSE4_1
+    return v_uint16x8(_mm_min_epu16(a.val, b.val));
+#else   // a - sat(a - b): the saturating difference is 0 wherever a does not exceed b
+    const __m128i a_over_b = _mm_subs_epu16(a.val, b.val); return v_uint16x8(_mm_subs_epu16(a.val, a_over_b));
+#endif
+}
+inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
+{
+#if CV_SSE4_1
+    return v_uint16x8(_mm_max_epu16(a.val, b.val));
+#else   // sat(a - b) + b: yields a where a exceeds b and b elsewhere
+    const __m128i a_over_b = _mm_subs_epu16(a.val, b.val); return v_uint16x8(_mm_adds_epu16(a_over_b, b.val));
+#endif
+}
+inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
+{
+#if CV_SSE4_1
+    return v_uint32x4(_mm_min_epu32(a.val, b.val));
+#else
+    const __m128i sign_flip = _mm_set1_epi32((int)0x80000000);   // lets the signed compare order unsigned values
+    const __m128i a_gt_b = _mm_cmpgt_epi32(_mm_xor_si128(a.val, sign_flip), _mm_xor_si128(b.val, sign_flip));
+    return v_uint32x4(v_select_si128(a_gt_b, b.val, a.val));
+#endif
+}
+inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
+{
+#if CV_SSE4_1
+    return v_uint32x4(_mm_max_epu32(a.val, b.val));
+#else
+    const __m128i sign_flip = _mm_set1_epi32((int)0x80000000);   // lets the signed compare order unsigned values
+    const __m128i a_gt_b = _mm_cmpgt_epi32(_mm_xor_si128(a.val, sign_flip), _mm_xor_si128(b.val, sign_flip));
+    return v_uint32x4(v_select_si128(a_gt_b, a.val, b.val));
+#endif
+}
+inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
+{
+#if CV_SSE4_1
+    return v_int32x4(_mm_min_epi32(a.val, b.val));
+#else   // SSE2 fallback: compare, then select through the v_select_si128 helper
+    const __m128i a_gt_b = _mm_cmpgt_epi32(a.val, b.val); return v_int32x4(v_select_si128(a_gt_b, b.val, a.val));
+#endif
+}
+inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
+{
+#if CV_SSE4_1
+    return v_int32x4(_mm_max_epi32(a.val, b.val));
+#else   // SSE2 fallback: compare, then select through the v_select_si128 helper
+    const __m128i a_gt_b = _mm_cmpgt_epi32(a.val, b.val); return v_int32x4(v_select_si128(a_gt_b, a.val, b.val));
+#endif
+}
+
+#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
+inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
+{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
+inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
+{ /* inverted equality mask */ \
+    const __m128i equal = _mm_cmpeq_##suffix(a.val, b.val); \
+    return _Tpuvec(_mm_xor_si128(equal, _mm_set1_epi32(-1))); \
+} \
+inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
+{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
+inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
+{ /* inverted equality mask */ \
+    const __m128i equal = _mm_cmpeq_##suffix(a.val, b.val); \
+    return _Tpsvec(_mm_xor_si128(equal, _mm_set1_epi32(-1))); \
+} \
+inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
+{ /* XOR with sbit flips the sign bit so the signed compare orders unsigned lanes */ \
+    const __m128i sign_flip = _mm_set1_##suffix(sbit); \
+    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, sign_flip), _mm_xor_si128(a.val, sign_flip))); \
+} \
+inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
+{ /* same sign-bit flip, operands in natural order */ \
+    const __m128i sign_flip = _mm_set1_##suffix(sbit); \
+    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, sign_flip), _mm_xor_si128(b.val, sign_flip))); \
+} \
+inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
+{ /* a <= b is computed as NOT (a > b) */ \
+    const __m128i sign_flip = _mm_set1_##suffix(sbit); \
+    const __m128i a_biased = _mm_xor_si128(a.val, sign_flip); \
+    const __m128i b_biased = _mm_xor_si128(b.val, sign_flip); \
+    return _Tpuvec(_mm_xor_si128(_mm_cmpgt_##suffix(a_biased, b_biased), _mm_set1_epi32(-1))); \
+} \
+inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
+{ /* a >= b is computed as NOT (b > a) */ \
+    const __m128i sign_flip = _mm_set1_##suffix(sbit); \
+    const __m128i a_biased = _mm_xor_si128(a.val, sign_flip); \
+    const __m128i b_biased = _mm_xor_si128(b.val, sign_flip); \
+    return _Tpuvec(_mm_xor_si128(_mm_cmpgt_##suffix(b_biased, a_biased), _mm_set1_epi32(-1))); \
+} \
+inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
+{ /* a < b is b > a */ \
+    return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
+} \
+inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
+{ /* direct signed compare */ \
+    return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
+} \
+inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
+{ /* NOT (a > b) */ \
+    const __m128i a_gt_b = _mm_cmpgt_##suffix(a.val, b.val); \
+    return _Tpsvec(_mm_xor_si128(a_gt_b, _mm_set1_epi32(-1))); \
+} \
+inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
+{ /* NOT (b > a) */ \
+    const __m128i b_gt_a = _mm_cmpgt_##suffix(b.val, a.val); \
+    return _Tpsvec(_mm_xor_si128(b_gt_a, _mm_set1_epi32(-1))); \
+}
+
+OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)         // sbit = sign bit of an 8-bit lane
+OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)     // sbit = sign bit of a 16-bit lane
+OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)   // sbit = sign bit of a 32-bit lane
+
+#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) /* comparison operators; each wraps the matching _mm_cmp* intrinsic result in _Tpvec */ \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) /* uses _mm_cmpneq directly, no mask inversion */ \
+{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)   // single precision
+OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)   // double precision
+
+#if CV_SSE4_1   // native 64-bit equality compare
+#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ const __m128i eq64 = _mm_cmpeq_epi64(a.val, b.val); return _Tpvec(eq64); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return ~(a == b); }
+#else   // fallback: a 64-bit lane is equal only when both of its 32-bit halves are equal
+#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ const __m128i eq32 = _mm_cmpeq_epi32(a.val, b.val); /* per 32-bit half; next line swaps the halves of each 64-bit lane */ \
+  const __m128i eq32_swapped = _mm_shuffle_epi32(eq32, _MM_SHUFFLE(2, 3, 0, 1)); return _Tpvec(_mm_and_si128(eq32, eq32_swapped)); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return ~(a == b); }
+#endif
+
+OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2)
+OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2)
+
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ const __m128 ordered = _mm_cmpord_ps(a.val, a.val); return v_float32x4(ordered); }    // a is compared with itself by the "ordered" predicate
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ const __m128d ordered = _mm_cmpord_pd(a.val, a.val); return v_float64x2(ordered); }   // a is compared with itself by the "ordered" predicate
+
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)      // v_add_wrap: plain (non-saturating) _mm_add_*
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)      // v_sub_wrap: plain (non-saturating) _mm_sub_*
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_mul_wrap, _mm_mullo_epi16)   // v_mul_wrap: low 16 bits of each product
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_mul_wrap, _mm_mullo_epi16)
+
+inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
+{   // even and odd bytes are multiplied in separate 16-bit products and merged through _v128_blendv_epi8
+    const __m128i prod_even = _mm_mullo_epi16(a.val, b.val);                      // low byte of each lane = even-byte product
+    const __m128i a_odd = _mm_srai_epi16(a.val, 8);
+    const __m128i b_odd = _mm_srai_epi16(b.val, 8);
+    const __m128i prod_odd = _mm_slli_epi16(_mm_mullo_epi16(a_odd, b_odd), 8);   // shifted back into the odd byte
+    const __m128i odd_bytes = _mm_set1_epi32(0xFF00FF00);
+    return v_uint8x16(_v128_blendv_epi8(prod_even, prod_odd, odd_bytes));
+}
+inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
+{
+    const v_uint8x16 prod = v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b));   // reuses the unsigned overload on the same bits
+    return v_reinterpret_as_s8(prod);
+}
+
+/** Absolute difference **/
+
+inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
+{ const v_uint8x16 a_over_b = a - b, b_over_a = b - a; return v_add_wrap(a_over_b, b_over_a); }   // NOTE(review): assumes operator- saturates at 0 for this type - confirm
+inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
+{ const v_uint16x8 a_over_b = a - b, b_over_a = b - a; return v_add_wrap(a_over_b, b_over_a); }   // NOTE(review): assumes operator- saturates at 0 for this type - confirm
+inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
+{ const v_uint32x4 hi = v_max(a, b), lo = v_min(a, b); return hi - lo; }   // larger minus smaller
+
+inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
+{   // conditional negate: (d ^ m) - m is -d where m is all ones and d where m is zero
+    const v_int8x16 lt_mask = a < b;
+    const v_int8x16 diff = v_sub_wrap(a, b);
+    return v_reinterpret_as_u8(v_sub_wrap(diff ^ lt_mask, lt_mask));
+}
+inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
+{   // wrapping max - min, reinterpreted as unsigned
+    const v_int16x8 diff = v_sub_wrap(v_max(a, b), v_min(a, b)); return v_reinterpret_as_u16(diff);
+}
+inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
+{   // conditional negate: (d ^ m) - m is -d where m is all ones and d where m is zero
+    const v_int32x4 lt_mask = a < b;
+    const v_int32x4 diff = a - b;
+    return v_reinterpret_as_u32((diff ^ lt_mask) - lt_mask);
+}
+
+/** Saturating absolute difference **/
+inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
+{   // same conditional-negate pattern as v_absdiff, but built on operator- and with a signed result
+    const v_int8x16 lt_mask = a < b;
+    const v_int8x16 diff = a - b;
+    return (diff ^ lt_mask) - lt_mask;
+}
+inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
+{ const v_int16x8 hi = v_max(a, b), lo = v_min(a, b); return hi - lo; }   // larger minus smaller
+
+
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{   // integer multiply-add built from operator* followed by operator+
+    return (a * b) + c;
+}
+
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{   // alias of v_fma
+    return v_fma(a, b, c);
+}
+
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+#if CV_FMA3   // fused multiply-add intrinsic
+    return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val));
+#else         // separate multiply and add
+    const __m128 prod = _mm_mul_ps(a.val, b.val); return v_float32x4(_mm_add_ps(prod, c.val));
+#endif
+}
+
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+#if CV_FMA3   // fused multiply-add intrinsic
+    return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val));
+#else         // separate multiply and add
+    const __m128d prod = _mm_mul_pd(a.val, b.val); return v_float64x2(_mm_add_pd(prod, c.val));
+#endif
+}
+
+#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
+inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
+{ /* subtract, then AND the difference with absmask_vec */ \
+    const _Tpreg diff = _mm_sub_##suffix(a.val, b.val); \
+    return _Tpvec(_mm_and_##suffix(diff, _mm_castsi128_##suffix(absmask_vec))); \
+} \
+inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ /* square root of v_fma(a, a, b*b) */ \
+    const _Tpvec sum_sq = v_fma(a, a, b*b); \
+    return _Tpvec(_mm_sqrt_##suffix(sum_sq.val)); \
+} \
+inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ /* v_magnitude without the square root */ \
+    return v_fma(a, a, b*b); \
+} \
+inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+{ /* alias of v_fma */ \
+    return v_fma(a, b, c); \
+}
+
+OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))           // mask: every bit but the top one of each 32-bit lane
+OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))   // mask: every bit but the top one of each 64-bit lane
+
+#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) /* shift operators (count as argument) and v_shl / v_shr (count as template parameter) */ \
+inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
+{ \
+    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
+{ \
+    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
+{ \
+    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); /* unsigned type: logical right shift */ \
+} \
+inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
+{ \
+    return _Tpsvec(srai(a.val, imm)); /* signed type: right shift supplied through the srai argument */ \
+} \
+template<int imm> \
+inline _Tpuvec v_shl(const _Tpuvec& a) \
+{ \
+    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpsvec v_shl(const _Tpsvec& a) \
+{ \
+    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpuvec v_shr(const _Tpuvec& a) \
+{ \
+    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); /* unsigned type: logical right shift */ \
+} \
+template<int imm> \
+inline _Tpsvec v_shr(const _Tpsvec& a) \
+{ \
+    return _Tpsvec(srai(a.val, imm)); /* signed type: right shift supplied through the srai argument */ \
+}
+
+OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
+OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
+OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)   // 64-bit lanes pass the v_srai_epi64 helper instead of an _mm_srai_* intrinsic
+
+namespace hal_sse_internal
+{
+    template <int imm,
+        bool is_invalid = ((imm < 0) || (imm > 16)),
+        bool is_first = (imm == 0),
+        bool is_half = (imm == 8),
+        bool is_second = (imm == 16),
+        bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
+    class v_sse_palignr_u8_class;   // primary template: the defaulted bool flags classify imm and select one specialization below
+
+    template <int imm>
+    class v_sse_palignr_u8_class<imm, true, false, false, false, false>;   // out-of-range imm: declared but never defined
+
+    template <int imm>
+    class v_sse_palignr_u8_class<imm, false, true, false, false, false>   // imm == 0
+    {
+    public:
+        inline __m128i operator()(const __m128i& a, const __m128i&) const
+        {
+            return a;   // no shift: the result is a unchanged
+        }
+    };
+
+    template <int imm>
+    class v_sse_palignr_u8_class<imm, false, false, true, false, false>   // imm == 8
+    {
+    public:
+        inline __m128i operator()(const __m128i& a, const __m128i& b) const
+        {
+            return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b);   // upper 8 bytes of a followed by lower 8 bytes of b
+        }
+    };
+
+    template <int imm>
+    class v_sse_palignr_u8_class<imm, false, false, false, true, false>   // imm == 16
+    {
+    public:
+        inline __m128i operator()(const __m128i&, const __m128i& b) const
+        {
+            return b;   // full-register shift: the result is b unchanged
+        }
+    };
+
+    template <int imm>
+    class v_sse_palignr_u8_class<imm, false, false, false, false, true>   // every other imm in 1..15
+    {
+#if CV_SSSE3
+    public:
+        inline __m128i operator()(const __m128i& a, const __m128i& b) const
+        {
+            return _mm_alignr_epi8(b, a, imm);   // note the operand order: b first, a second
+        }
+#else
+    public:
+        inline __m128i operator()(const __m128i& a, const __m128i& b) const
+        {
+            enum { imm2 = (sizeof(__m128i) - imm) };   // complementary byte count (16 - imm)
+            return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2));   // a shifted down by imm bytes, OR-ed with b shifted up by imm2 bytes
+        }
+#endif
+    };
+
+    template <int imm>
+    inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b)   // entry point: instantiates the class selected by imm and calls it
+    {
+        CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8.");
+        return v_sse_palignr_u8_class<imm>()(a, b);
+    }
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec &a)
+{
+    using namespace hal_sse_internal;
+    enum { byte_shift = (imm * sizeof(typename _Tpvec::lane_type)) };   // imm lanes expressed in bytes
+    const __m128i raw = v_sse_reinterpret_as<__m128i>(a.val);
+    const __m128i shifted = _mm_srli_si128(raw, byte_shift);   // whole-register byte shift to the right
+    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(shifted));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec &a)
+{
+    using namespace hal_sse_internal;
+    enum { byte_shift = (imm * sizeof(typename _Tpvec::lane_type)) };   // imm lanes expressed in bytes
+    const __m128i raw = v_sse_reinterpret_as<__m128i>(a.val);
+    const __m128i shifted = _mm_slli_si128(raw, byte_shift);   // whole-register byte shift to the left
+    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(shifted));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
+{
+    using namespace hal_sse_internal;
+    enum { byte_shift = (imm * sizeof(typename _Tpvec::lane_type)) };   // imm lanes expressed in bytes
+    const __m128i a_raw = v_sse_reinterpret_as<__m128i>(a.val);
+    const __m128i b_raw = v_sse_reinterpret_as<__m128i>(b.val);
+    const __m128i joined = v_sse_palignr_u8<byte_shift>(a_raw, b_raw);
+    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(joined));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
+{
+    using namespace hal_sse_internal;
+    enum { byte_shift = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };   // complementary shift, in bytes
+    const __m128i a_raw = v_sse_reinterpret_as<__m128i>(a.val);
+    const __m128i b_raw = v_sse_reinterpret_as<__m128i>(b.val);
+    const __m128i joined = v_sse_palignr_u8<byte_shift>(b_raw, a_raw);   // operands swapped relative to v_rotate_right
+    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(joined));
+}
+
+#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) /* load/store functions between _Tp memory and an integer vector _Tpvec */ \
+inline _Tpvec v_load(const _Tp* ptr) /* unaligned 128-bit load */ \
+{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) /* aligned 128-bit load */ \
+{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
+inline _Tpvec v_load_low(const _Tp* ptr) /* 64-bit load into the lower half */ \
+{ return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) /* lower half from ptr0, upper half from ptr1 */ \
+{ \
+    return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
+                                     _mm_loadl_epi64((const __m128i*)ptr1))); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) /* unaligned store */ \
+{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) /* aligned store */ \
+{ _mm_store_si128((__m128i*)ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) /* aligned store through _mm_stream_si128 */ \
+{ _mm_stream_si128((__m128i*)ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) /* store variant chosen at run time by mode */ \
+{ \
+    if( mode == hal::STORE_UNALIGNED ) \
+        _mm_storeu_si128((__m128i*)ptr, a.val); \
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
+        _mm_stream_si128((__m128i*)ptr, a.val); \
+    else /* any other mode: aligned store */ \
+        _mm_store_si128((__m128i*)ptr, a.val); \
+} \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) /* stores the lower 64 bits */ \
+{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) /* stores the upper 64 bits (moved down with unpackhi first) */ \
+{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }
+
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)
+
+#define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) /* load/store functions between _Tp memory and a float vector _Tpvec */ \
+inline _Tpvec v_load(const _Tp* ptr) /* unaligned load */ \
+{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) /* aligned load */ \
+{ return _Tpvec(_mm_load_##suffix(ptr)); } \
+inline _Tpvec v_load_low(const _Tp* ptr) /* 64-bit integer load into the lower half, cast to the float register type */ \
+{ return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) /* lower half from ptr0, upper half from ptr1 */ \
+{ \
+    return _Tpvec(_mm_castsi128_##suffix( \
+        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
+                           _mm_loadl_epi64((const __m128i*)ptr1)))); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) /* unaligned store */ \
+{ _mm_storeu_##suffix(ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) /* aligned store */ \
+{ _mm_store_##suffix(ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) /* aligned store through _mm_stream_* */ \
+{ _mm_stream_##suffix(ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) /* store variant chosen at run time by mode */ \
+{ \
+    if( mode == hal::STORE_UNALIGNED ) \
+        _mm_storeu_##suffix(ptr, a.val); \
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
+        _mm_stream_##suffix(ptr, a.val); \
+    else /* any other mode: aligned store */ \
+        _mm_store_##suffix(ptr, a.val); \
+} \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) /* stores the lower 64 bits via the integer view of the register */ \
+{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) /* stores the upper 64 bits */ \
+{ \
+    __m128i a1 = _mm_cast##suffix##_si128(a.val); \
+    _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
+}
+
+OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
+
+inline unsigned v_reduce_sum(const v_uint8x16& a)
+{   // _mm_sad_epu8 against zero gives one byte sum per 64-bit half; the two halves are then added
+    const __m128i half_sums = _mm_sad_epu8(a.val, _mm_setzero_si128());
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half_sums, _mm_unpackhi_epi64(half_sums, half_sums)));
+}
+inline int v_reduce_sum(const v_int8x16& a)
+{   // flip the sign bit (adds 128 to every byte), sum as unsigned, then remove 16 * 128 = 2048
+    const __m128i biased = _mm_xor_si128(a.val, _mm_set1_epi8((schar)-128));
+    const __m128i half_sums = _mm_sad_epu8(biased, _mm_setzero_si128());
+    return _mm_cvtsi128_si32(_mm_add_epi32(half_sums, _mm_unpackhi_epi64(half_sums, half_sums))) - 2048;
+}
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) \
+inline schar v_reduce_##func(const v_int8x16& a) \
+{ /* the sign-bit flip lets the unsigned byte intrinsic order signed values */ \
+    const __m128i sign_flip = _mm_set1_epi8((schar)-128); \
+    __m128i acc = _mm_xor_si128(a.val, sign_flip); \
+    acc = _mm_##func##_epu8(acc, _mm_srli_si128(acc,8)); /* 16 candidates -> 8 */ \
+    acc = _mm_##func##_epu8(acc, _mm_srli_si128(acc,4)); /* 8 -> 4 */ \
+    acc = _mm_##func##_epu8(acc, _mm_srli_si128(acc,2)); /* 4 -> 2 */ \
+    acc = _mm_##func##_epu8(acc, _mm_srli_si128(acc,1)); /* 2 -> 1, result in byte 0 */ \
+    const int low = _mm_cvtsi128_si32(acc); \
+    return (schar)low ^ (schar)-128; \
+} \
+inline uchar v_reduce_##func(const v_uint8x16& a) \
+{ /* same halving scheme, no bias needed */ \
+    __m128i acc = a.val; \
+    acc = _mm_##func##_epu8(acc, _mm_srli_si128(acc,8)); \
+    acc = _mm_##func##_epu8(acc, _mm_srli_si128(acc,4)); \
+    acc = _mm_##func##_epu8(acc, _mm_srli_si128(acc,2)); \
+    acc = _mm_##func##_epu8(acc, _mm_srli_si128(acc,1)); \
+    return (uchar)_mm_cvtsi128_si32(acc); \
+}
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(max)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(min)
+
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
+inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
+{ /* halve the candidate set three times; the result ends up in lane 0 */ \
+    __m128i acc = a.val; \
+    acc = _mm_##func##_##suffix(acc, _mm_srli_si128(acc,8)); \
+    acc = _mm_##func##_##suffix(acc, _mm_srli_si128(acc,4)); \
+    acc = _mm_##func##_##suffix(acc, _mm_srli_si128(acc,2)); \
+    return (scalartype)_mm_cvtsi128_si32(acc); \
+} \
+inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
+{ /* XOR with sbit before and after, so the signed intrinsic orders unsigned values */ \
+    const __m128i sign_flip = _mm_set1_epi16(sbit); \
+    __m128i acc = _mm_xor_si128(a.val, sign_flip); \
+    acc = _mm_##func##_##suffix(acc, _mm_srli_si128(acc,8)); \
+    acc = _mm_##func##_##suffix(acc, _mm_srli_si128(acc,4)); \
+    acc = _mm_##func##_##suffix(acc, _mm_srli_si128(acc,2)); \
+    const int low = _mm_cvtsi128_si32(acc); \
+    return (unsigned scalartype)(low ^  sbit); \
+}
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
+
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) /* defines v_reduce_sum for a 4-lane vector */ \
+inline scalartype v_reduce_sum(const _Tpvec& a) \
+{ \
+    regtype val = a.val; \
+    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); /* lanes 0,1 += lanes 2,3 (byte shift done on the integer view) */ \
+    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); /* lane 0 += lane 1 */ \
+    return (scalartype)_mm_cvt##extract(val); /* extract lane 0 */ \
+}
+
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) /* defines v_reduce_<func> for a 4-lane vector using a scalar binary op */ \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    scalartype CV_DECL_ALIGNED(16) buf[4]; \
+    v_store_aligned(buf, a); /* spill the four lanes to an aligned stack buffer */ \
+    scalartype s0 = scalar_func(buf[0], buf[1]); \
+    scalartype s1 = scalar_func(buf[2], buf[3]); \
+    return scalar_func(s0, s1); /* combine the two pairwise results */ \
+}
+
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32) // v_reduce_sum(v_uint32x4)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32) // v_reduce_sum(v_int32x4)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32) // v_reduce_sum(v_float32x4); casts give the byte shift an integer view
+
+inline int v_reduce_sum(const v_int16x8& a) // sums the 16-bit lanes after widening both halves to 32 bits
+{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+inline unsigned v_reduce_sum(const v_uint16x8& a) // unsigned counterpart: widen, add halves, reduce the 32-bit vector
+{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+
+inline uint64 v_reduce_sum(const v_uint64x2& a)
+{   // Adds the two 64-bit lanes of a.
+    uint64 CV_DECL_ALIGNED(32) lanes[2];
+    v_store_aligned(lanes, a); // spill both lanes to the aligned buffer
+    return lanes[0] + lanes[1];
+}
+inline int64 v_reduce_sum(const v_int64x2& a)
+{   // Adds the two signed 64-bit lanes of a.
+    int64 CV_DECL_ALIGNED(32) lanes[2];
+    v_store_aligned(lanes, a); // spill both lanes to the aligned buffer
+    return lanes[0] + lanes[1];
+}
+inline double v_reduce_sum(const v_float64x2& a)
+{   // Adds the two double lanes of a (lane 0 + lane 1).
+    double CV_DECL_ALIGNED(32) lanes[2];
+    v_store_aligned(lanes, a); // spill both lanes to the aligned buffer
+    return lanes[0] + lanes[1];
+}
+
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, // result lanes: {sum(a), sum(b), sum(c), sum(d)}
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+#if CV_SSE3 // horizontal-add path
+    __m128 ab = _mm_hadd_ps(a.val, b.val); // a0+a1 a2+a3 b0+b1 b2+b3
+    __m128 cd = _mm_hadd_ps(c.val, d.val); // c0+c1 c2+c3 d0+d1 d2+d3
+    return v_float32x4(_mm_hadd_ps(ab, cd));
+#else // no SSE3: unpack/add network; note the additions pair lanes as (0+2)+(1+3) rather than (0+1)+(2+3)
+    __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val)); // a0+a2 c0+c2 a1+a3 c1+c3
+    __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val)); // b0+b2 d0+d2 b1+b3 d1+d3
+    return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
+#endif
+}
+
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max) // v_reduce_max(v_uint32x4)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min) // v_reduce_min(v_uint32x4)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max) // v_reduce_max(v_int32x4)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min) // v_reduce_min(v_int32x4)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max) // v_reduce_max(v_float32x4)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min) // v_reduce_min(v_float32x4)
+
+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{   // Sum of absolute differences over all 16 unsigned byte lanes.
+    __m128i half = _mm_sad_epu8(a.val, b.val); // one partial sum per 64-bit half
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))); // add the upper partial sum to the lower one
+}
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{   // Sum of absolute differences over 16 signed bytes. Flipping the sign bit maps -128..127 onto 0..255 in order; an additive 0x7f bias would wrap -128 to 255.
+    __m128i half = _mm_set1_epi8((schar)-128);
+    half = _mm_sad_epu8(_mm_xor_si128(a.val, half), _mm_xor_si128(b.val, half)); // unsigned SAD on the biased lanes: one partial sum per 64-bit half
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))); // add the upper partial sum to the lower one
+}
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{   // Sum of absolute differences over the eight 16-bit lanes, accumulated in 32-bit lanes.
+    v_uint32x4 lo_part, hi_part;
+    v_expand(v_absdiff(a, b), lo_part, hi_part); // widen the per-lane differences to 32 bits
+    return v_reduce_sum(lo_part + hi_part);
+}
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{   // Signed 16-bit overload: the differences from v_absdiff are widened to 32 bits and summed.
+    v_uint32x4 lo_part, hi_part;
+    v_expand(v_absdiff(a, b), lo_part, hi_part); // widen the per-lane differences to 32 bits
+    return v_reduce_sum(lo_part + hi_part);
+}
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{   // Sum over the four lanes of v_absdiff(a, b).
+    return v_reduce_sum(v_absdiff(a, b));
+}
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{   // Signed 32-bit overload: same reduction of v_absdiff(a, b), returned as unsigned.
+    return v_reduce_sum(v_absdiff(a, b));
+}
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{   // Float overload: sum of the four lane-wise absolute differences.
+    return v_reduce_sum(v_absdiff(a, b));
+}
+
+inline v_uint8x16 v_popcount(const v_uint8x16& a)
+{   // Per-byte population count: pairwise bit sums are widened from 1-bit to 2-, 4- and finally 8-bit fields.
+    __m128i m1 = _mm_set1_epi32(0x55555555); // selects every other bit
+    __m128i m2 = _mm_set1_epi32(0x33333333); // selects the low 2 bits of each 4-bit group
+    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f); // selects the low nibble of each byte
+    __m128i p = a.val;
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); // counts per 2-bit field
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); // counts per nibble
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); // counts per byte (0..8)
+    return v_uint8x16(p);
+}
+inline v_uint16x8 v_popcount(const v_uint16x8& a)
+{   // Population count per 16-bit lane, built from the per-byte counts.
+    v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
+    p += v_rotate_right<1>(p); // add to each byte the count rotated in from one byte away
+    return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff); // keep only the low byte of each 16-bit lane
+}
+inline v_uint32x4 v_popcount(const v_uint32x4& a)
+{   // Population count per 32-bit lane, built from the per-byte counts.
+    v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
+    p += v_rotate_right<1>(p); // first fold: combine counts one byte apart
+    p += v_rotate_right<2>(p); // second fold: combine counts two bytes apart
+    return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff); // keep only the low byte of each 32-bit lane
+}
+inline v_uint64x2 v_popcount(const v_uint64x2& a)
+{   // Population count per 64-bit lane: SAD against zero sums the eight per-byte counts of each half.
+    return v_uint64x2(_mm_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm_setzero_si128()));
+}
+inline v_uint8x16 v_popcount(const v_int8x16& a) // signed overloads reinterpret the bits as unsigned and reuse the unsigned implementation
+{ return v_popcount(v_reinterpret_as_u8(a)); }
+inline v_uint16x8 v_popcount(const v_int16x8& a) // 16-bit signed lanes
+{ return v_popcount(v_reinterpret_as_u16(a)); }
+inline v_uint32x4 v_popcount(const v_int32x4& a) // 32-bit signed lanes
+{ return v_popcount(v_reinterpret_as_u32(a)); }
+inline v_uint64x2 v_popcount(const v_int64x2& a) // 64-bit signed lanes
+{ return v_popcount(v_reinterpret_as_u64(a)); }
+
+#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, cast_op, allmask) /* sign-bit queries built on _mm_movemask_<suffix>; allmask is the value with every lane bit set */ \
+inline int v_signmask(const _Tpvec& a)   { return _mm_movemask_##suffix(cast_op(a.val)); } /* one bit per lane, taken from the lane's top bit */ \
+inline bool v_check_all(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) == allmask; } /* true when every lane has its top bit set */ \
+inline bool v_check_any(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) != 0; }
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, 65535) // 16 lanes -> 16 mask bits
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, 65535)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, ps, _mm_castsi128_ps, 15) // 4 lanes, read through the float view
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, ps, _mm_castsi128_ps, 15)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint64x2, pd, _mm_castsi128_pd, 3) // 2 lanes, read through the double view
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int64x2, pd, _mm_castsi128_pd, 3)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, 15)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, 3)
+
+#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(_Tpvec) /* 16-bit lanes have no movemask of their own, so the byte movemask is used */ \
+inline int v_signmask(const _Tpvec& a) { return _mm_movemask_epi8(_mm_packs_epi16(a.val, a.val)) & 255; } /* saturating pack keeps each lane's sign; low 8 mask bits = 8 lanes */ \
+inline bool v_check_all(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) == 0xaaaa; } /* 0xaaaa selects the high byte of every 16-bit lane */ \
+inline bool v_check_any(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) != 0; }
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_uint16x8)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_int16x8)
+
+inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } // trailing-zero count of the byte sign mask = index of the first byte with its top bit set
+inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } // wider lanes: divide the byte index by the lane size in bytes
+inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+
+#if CV_SSE4_1 // v_select(mask, a, b) via the SSE4.1 blend instructions
+#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) /* picks a where the mask selects, b elsewhere */ \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(cast_ret(_mm_blendv_##suffix(cast(b.val), cast(a.val), cast(mask.val)))); \
+}
+
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8) // 16-bit lanes reuse the byte blend
+OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, _mm_castps_si128, _mm_castsi128_ps, ps) // 32-bit integer lanes go through the float blend
+OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
+// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, TBD, TBD, pd)
+// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, TBD, TBD, ps)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP, ps)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, OPENCV_HAL_NOP, OPENCV_HAL_NOP, pd)
+
+#else // CV_SSE4_1
+
+#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) /* bitwise select: b ^ ((b ^ a) & mask) yields a where mask bits are 1, b where they are 0 */ \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
+}
+
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
+// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
+// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
+#endif
+
+/* Expand: widen each lane to the next wider lane type */
+#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin)    \
+    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+    { /* b0 gets intrin(a), b1 gets intrin##_high(a) */             \
+        b0.val = intrin(a.val);                                     \
+        b1.val = __CV_CAT(intrin, _high)(a.val);                    \
+    }                                                               \
+    inline _Tpwvec v_expand_low(const _Tpvec& a)                    \
+    { return _Tpwvec(intrin(a.val)); }                              \
+    inline _Tpwvec v_expand_high(const _Tpvec& a)                   \
+    { return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); }             \
+    inline _Tpwvec v_load_expand(const _Tp* ptr)                    \
+    { /* loads 64 bits from ptr and widens them */                  \
+        __m128i a = _mm_loadl_epi64((const __m128i*)ptr);           \
+        return _Tpwvec(intrin(a));                                  \
+    }
+
+OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8,  uchar,    _v128_cvtepu8_epi16)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_int8x16,  v_int16x8,   schar,    _v128_cvtepi8_epi16)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4,  ushort,   _v128_cvtepu16_epi32)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_int16x8,  v_int32x4,   short,    _v128_cvtepi16_epi32)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_uint32x4, v_uint64x2,  unsigned, _v128_cvtepu32_epi64)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_int32x4,  v_int64x2,   int,      _v128_cvtepi32_epi64)
+
+#define OPENCV_HAL_IMPL_SSE_EXPAND_Q(_Tpvec, _Tp, intrin)          \
+    inline _Tpvec v_load_expand_q(const _Tp* ptr)                  \
+    { /* loads 4 bytes from ptr and widens them with intrin */     \
+        typedef int CV_DECL_ALIGNED(1) unaligned_int; /* alignment-1 int type: ptr need not be 4-byte aligned */ \
+        __m128i a = _mm_cvtsi32_si128(*(const unaligned_int*)ptr); \
+        return _Tpvec(intrin(a));                                  \
+    }
+
+OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_uint32x4, uchar, _v128_cvtepu8_epi32) // uchar -> 32-bit lanes
+OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_int32x4,  schar, _v128_cvtepi8_epi32) // schar -> 32-bit lanes
+
+#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) /* defines v_zip, v_combine_low, v_combine_high and v_recombine for _Tpvec */ \
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
+{ /* b0 interleaves the low halves of a0 and a1, b1 the high halves */ \
+    b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
+    b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
+} \
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
+{ /* result = low 64 bits of a followed by low 64 bits of b */ \
+    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
+    return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
+} \
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
+{ /* result = high 64 bits of a followed by high 64 bits of b */ \
+    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
+    return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
+} \
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
+{ /* c = both low halves, d = both high halves */ \
+    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
+    c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
+    d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
+}
+
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) // casts give the 64-bit unpacks an integer view
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
+
+inline v_uint8x16 v_reverse(const v_uint8x16 &a)
+{ // Returns a with its 16 byte lanes in reverse order.
+#if CV_SSSE3
+    const __m128i perm = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); // plain local constant: avoids a guarded static initialization in this inline function
+    return v_uint8x16(_mm_shuffle_epi8(a.val, perm));
+#else // no byte shuffle available: go through memory
+    uchar CV_DECL_ALIGNED(32) d[16];
+    v_store_aligned(d, a);
+    return v_uint8x16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8], d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]);
+#endif
+}
+
+inline v_int8x16 v_reverse(const v_int8x16 &a) // signed overload: reverses through the unsigned 8-bit implementation
+{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
+
+inline v_uint16x8 v_reverse(const v_uint16x8 &a)
+{ // Returns a with its eight 16-bit lanes in reverse order.
+#if CV_SSSE3
+    const __m128i perm = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); // byte pairs stay in order inside each lane; plain local avoids a guarded static initialization
+    return v_uint16x8(_mm_shuffle_epi8(a.val, perm));
+#else
+    __m128i r = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3)); // reverse the four 32-bit words
+    r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(2, 3, 0, 1)); // swap the two 16-bit lanes inside each low word
+    r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(2, 3, 0, 1)); // same for the high words
+    return v_uint16x8(r);
+#endif
+}
+
+inline v_int16x8 v_reverse(const v_int16x8 &a) // signed overload: reverses through the unsigned 16-bit implementation
+{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
+
+inline v_uint32x4 v_reverse(const v_uint32x4 &a)
+{   // Reverses the four 32-bit lanes: result lane i takes source lane 3-i.
+    return v_uint32x4(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
+}
+
+inline v_int32x4 v_reverse(const v_int32x4 &a) // signed overload: reverses through the unsigned 32-bit implementation
+{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_float32x4 v_reverse(const v_float32x4 &a) // float overload: lanes are moved as raw 32-bit patterns
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_uint64x2 v_reverse(const v_uint64x2 &a)
+{   // Swaps the two 64-bit lanes by exchanging the upper and lower pairs of 32-bit words.
+    return v_uint64x2(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(1, 0, 3, 2)));
+}
+
+inline v_int64x2 v_reverse(const v_int64x2 &a) // signed overload: reverses through the unsigned 64-bit implementation
+{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
+
+inline v_float64x2 v_reverse(const v_float64x2 &a) // double overload: lanes are moved as raw 64-bit patterns
+{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
+
+template<int s, typename _Tpvec> // s: compile-time lane offset forwarded to v_rotate_right
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
+{   // Thin alias: forwards to the two-operand v_rotate_right<s>(a, b).
+    return v_rotate_right<s>(a, b);
+}
+
+inline v_int32x4 v_round(const v_float32x4& a) // float -> int32 per lane using the rounding conversion (_mm_cvtps_epi32)
+{ return v_int32x4(_mm_cvtps_epi32(a.val)); }
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{   // Floor per lane: round first, then step down by one wherever rounding went above the input.
+    __m128i a1 = _mm_cvtps_epi32(a.val);
+    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val)); // all-ones (-1) where rounded > a
+    return v_int32x4(_mm_add_epi32(a1, mask)); // adding -1 subtracts one in those lanes
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{   // Ceil per lane: round first, then step up by one wherever rounding went below the input.
+    __m128i a1 = _mm_cvtps_epi32(a.val);
+    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1))); // all-ones (-1) where a > rounded
+    return v_int32x4(_mm_sub_epi32(a1, mask)); // subtracting -1 adds one in those lanes
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a) // float -> int32 per lane with truncation (_mm_cvttps_epi32)
+{ return v_int32x4(_mm_cvttps_epi32(a.val)); }
+
+inline v_int32x4 v_round(const v_float64x2& a) // two doubles -> the two low int32 lanes of the result (_mm_cvtpd_epi32)
+{ return v_int32x4(_mm_cvtpd_epi32(a.val)); }
+
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{   // Rounds four doubles into one vector: lanes 0-1 come from a, lanes 2-3 from b.
+    __m128i ai = _mm_cvtpd_epi32(a.val), bi = _mm_cvtpd_epi32(b.val);
+    return v_int32x4(_mm_unpacklo_epi64(ai, bi)); // join the low 64 bits of each conversion
+}
+
+inline v_int32x4 v_floor(const v_float64x2& a)
+{   // Floor of two doubles into the two low int32 lanes: round, then subtract one where rounding went above the input.
+    __m128i a1 = _mm_cvtpd_epi32(a.val);
+    __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val)); // 64-bit wide masks, one per double
+    mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
+    return v_int32x4(_mm_add_epi32(a1, mask)); // adding -1 subtracts one in the flagged lanes
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a)
+{   // Ceil of two doubles into the two low int32 lanes: round, then add one where rounding went below the input.
+    __m128i a1 = _mm_cvtpd_epi32(a.val);
+    __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1))); // 64-bit wide masks, one per double
+    mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
+    return v_int32x4(_mm_sub_epi32(a1, mask)); // subtracting -1 adds one in the flagged lanes
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a) // two doubles -> the two low int32 lanes with truncation (_mm_cvttpd_epi32)
+{ return v_int32x4(_mm_cvttpd_epi32(a.val)); }
+
+#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) /* defines v_transpose4x4: rows a0..a3 in, columns b0..b3 out */ \
+inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
+                           const _Tpvec& a2, const _Tpvec& a3, \
+                           _Tpvec& b0, _Tpvec& b1, \
+                           _Tpvec& b2, _Tpvec& b3) \
+{ \
+    __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); /* a0[0] a1[0] a0[1] a1[1] */ \
+    __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); /* a2[0] a3[0] a2[1] a3[1] */ \
+    __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); /* a0[2] a1[2] a0[3] a1[3] */ \
+    __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); /* a2[2] a3[2] a2[3] a3[3] */ \
+\
+    b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); /* column 0 */ \
+    b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); /* column 1 */ \
+    b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); /* column 2 */ \
+    b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); /* column 3 */ \
+}
+
+OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
+
+// load deinterleave
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
+{ // Reads 32 bytes of 2-channel interleaved data: a gets the even-indexed bytes, b the odd-indexed ones.
+    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+
+    __m128i t10 = _mm_unpacklo_epi8(t00, t01); // round 1 of 4: each round interleaves the two 16-byte halves of the sequence
+    __m128i t11 = _mm_unpackhi_epi8(t00, t01);
+
+    __m128i t20 = _mm_unpacklo_epi8(t10, t11); // round 2
+    __m128i t21 = _mm_unpackhi_epi8(t10, t11);
+
+    __m128i t30 = _mm_unpacklo_epi8(t20, t21); // round 3
+    __m128i t31 = _mm_unpackhi_epi8(t20, t21);
+
+    a.val = _mm_unpacklo_epi8(t30, t31); // round 4: bytes 0,2,4,...,30
+    b.val = _mm_unpackhi_epi8(t30, t31); // bytes 1,3,5,...,31
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
+{ // Reads 48 bytes of 3-channel interleaved data (a0 b0 c0 a1 b1 c1 ...) and splits them into a, b and c.
+#if CV_SSE4_1 // blend every third byte from the three loads, then shuffle into element order
+    const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); // selects byte positions 2, 5, 8, 11, 14
+    const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); // selects byte positions 1, 4, 7, 10, 13
+    __m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+    __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
+    __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1); // all 16 bytes of the first channel, out of order
+    __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
+    __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
+    const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13); // permutations that restore element order
+    const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
+    const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
+    a0 = _mm_shuffle_epi8(a0, sh_b);
+    b0 = _mm_shuffle_epi8(b0, sh_g);
+    c0 = _mm_shuffle_epi8(c0, sh_r);
+    a.val = a0;
+    b.val = b0;
+    c.val = c0;
+#elif CV_SSSE3 // group each load by channel with a byte shuffle, then stitch the groups together with alignr
+    const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
+    const __m128i m1 = _mm_alignr_epi8(m0, m0, 11); // m0 rotated by 11 bytes, for the second load
+    const __m128i m2 = _mm_alignr_epi8(m0, m0, 6); // m0 rotated by 6 bytes, for the third load
+
+    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
+
+    __m128i s0 = _mm_shuffle_epi8(t0, m0);
+    __m128i s1 = _mm_shuffle_epi8(t1, m1);
+    __m128i s2 = _mm_shuffle_epi8(t2, m2);
+
+    t0 = _mm_alignr_epi8(s1, _mm_slli_si128(s0, 10), 5);
+    a.val = _mm_alignr_epi8(s2, t0, 5);
+
+    t1 = _mm_alignr_epi8(_mm_srli_si128(s1, 5), _mm_slli_si128(s0, 5), 6);
+    b.val = _mm_alignr_epi8(_mm_srli_si128(s2, 5), t1, 5);
+
+    t2 = _mm_alignr_epi8(_mm_srli_si128(s2, 10), s1, 11);
+    c.val = _mm_alignr_epi8(t2, s0, 11);
+#else // SSE2 only: four rounds of a three-register byte unpack network
+    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));
+
+    __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
+    __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
+    __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));
+
+    __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
+    __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
+    __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));
+
+    __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
+    __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
+    __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));
+
+    a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
+    b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
+    c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
+#endif
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
+{ // Reads 64 bytes of 4-channel interleaved data and splits them into a, b, c and d with byte unpacks.
+    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
+    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
+    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
+    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...
+
+    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
+    __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
+    __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
+    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
+
+    u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
+    u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
+    u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
+    u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...
+
+    v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
+    v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
+    v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
+    v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
+
+    a.val = _mm_unpacklo_epi8(v0, v1); // final interleave of even- and odd-indexed elements per channel
+    b.val = _mm_unpackhi_epi8(v0, v1);
+    c.val = _mm_unpacklo_epi8(v2, v3);
+    d.val = _mm_unpackhi_epi8(v2, v3);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
+{ // Reads 16 ushort values of 2-channel interleaved data: a gets the even-indexed elements, b the odd-indexed ones.
+    __m128i v0 = _mm_loadu_si128((const __m128i*)(ptr));     // a0 b0 a1 b1 a2 b2 a3 b3 (cast keeps ptr's const qualifier)
+    __m128i v1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
+
+    __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
+    __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
+    __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
+    __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
+
+    a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
+    b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
+{ // Reads 24 ushort values of 3-channel interleaved data (a0 b0 c0 a1 ...) and splits them into a, b and c.
+#if CV_SSE4_1 // blend every third 16-bit lane from the three loads, then shuffle into element order
+    __m128i v0 = _mm_loadu_si128((const __m128i*)(ptr)); // casts keep ptr's const qualifier, matching the fallback path below
+    __m128i v1 = _mm_loadu_si128((const __m128i*)(ptr + 8));
+    __m128i v2 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+    __m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24); // 0x92 picks lanes 1,4,7; 0x24 picks lanes 2,5
+    __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
+    __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
+
+    const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); // byte-pair permutations that restore element order
+    const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
+    const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+    a0 = _mm_shuffle_epi8(a0, sh_a);
+    b0 = _mm_shuffle_epi8(b0, sh_b);
+    c0 = _mm_shuffle_epi8(c0, sh_c);
+
+    a.val = a0;
+    b.val = b0;
+    c.val = c0;
+#else // SSE2 only: three rounds of a three-register 16-bit unpack network
+    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
+    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+
+    __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
+    __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
+    __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));
+
+    __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
+    __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
+    __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));
+
+    a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
+    b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
+    c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
+#endif
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
+{ // Reads 32 ushort values of 4-channel interleaved data and splits them into a, b, c and d with 16-bit unpacks.
+    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1
+    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ...
+    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
+    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...
+
+    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
+    __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
+    __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
+    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...
+
+    u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
+    u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
+    u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
+    u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...
+
+    a.val = _mm_unpacklo_epi16(u0, u1); // final interleave of even- and odd-indexed elements per channel
+    b.val = _mm_unpackhi_epi16(u0, u1);
+    c.val = _mm_unpacklo_epi16(u2, u3);
+    d.val = _mm_unpackhi_epi16(u2, u3);
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
+{ // Reads 8 unsigned values of 2-channel interleaved data: a gets the even-indexed elements, b the odd-indexed ones.
+    __m128i v0 = _mm_loadu_si128((const __m128i*)(ptr));     // a0 b0 a1 b1 (cast keeps ptr's const qualifier)
+    __m128i v1 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // a2 b2 a3 b3
+
+    __m128i v2 = _mm_unpacklo_epi32(v0, v1); // a0 a2 b0 b2
+    __m128i v3 = _mm_unpackhi_epi32(v0, v1); // a1 a3 b1 b3
+
+    a.val = _mm_unpacklo_epi32(v2, v3); // a0 a1 a2 a3
+    b.val = _mm_unpackhi_epi32(v2, v3); // b0 b1 b2 b3
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
+{ // Reads 12 unsigned values of 3-channel interleaved data and splits them into a, b and c.
+    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 a1
+    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1 c1 a2 b2
+    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // c2 a3 b3 c3
+
+    __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01)); // a0 a2 b0 b2
+    __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02); // c0 c2 a1 a3
+    __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02)); // b1 b3 c1 c3
+
+    a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11)); // a0 a1 a2 a3
+    b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12); // b0 b1 b2 b3
+    c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12)); // c0 c1 c2 c3
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
+{ // Reads 16 unsigned values of 4-channel interleaved data; a 4x4 transpose of the four loads yields a, b, c and d.
+    v_uint32x4 s0(_mm_loadu_si128((const __m128i*)ptr));        // a0 b0 c0 d0
+    v_uint32x4 s1(_mm_loadu_si128((const __m128i*)(ptr + 4)));  // a1 b1 c1 d1
+    v_uint32x4 s2(_mm_loadu_si128((const __m128i*)(ptr + 8)));  // a2 b2 c2 d2
+    v_uint32x4 s3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
+
+    v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
+}
+
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
+{ // Reads 8 floats of 2-channel interleaved data: a gets the even-indexed elements, b the odd-indexed ones.
+    __m128 u0 = _mm_loadu_ps(ptr);       // a0 b0 a1 b1
+    __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
+
+    a.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a1 a2 a3
+    b.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(3, 1, 3, 1)); // b0 b1 b2 b3
+}
+
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
+{ // Reads 12 floats of 3-channel interleaved data and splits them into a, b and c with two-source shuffles.
+    __m128 t0 = _mm_loadu_ps(ptr + 0); // a0 b0 c0 a1
+    __m128 t1 = _mm_loadu_ps(ptr + 4); // b1 c1 a2 b2
+    __m128 t2 = _mm_loadu_ps(ptr + 8); // c2 a3 b3 c3
+
+    __m128 at12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 1, 0, 2)); // a2 b1 a3 c2
+    a.val = _mm_shuffle_ps(t0, at12, _MM_SHUFFLE(2, 0, 3, 0)); // a0 a1 a2 a3
+
+    __m128 bt01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 0, 0, 1)); // b0 a0 b1 b1
+    __m128 bt12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 2, 0, 3)); // b2 b1 b3 c2
+    b.val = _mm_shuffle_ps(bt01, bt12, _MM_SHUFFLE(2, 0, 2, 0)); // b0 b1 b2 b3
+
+    __m128 ct01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 1, 0, 2)); // c0 a0 c1 b1
+    c.val = _mm_shuffle_ps(ct01, t2, _MM_SHUFFLE(3, 0, 2, 0)); // c0 c1 c2 c3
+}
+
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
+{ // Reads 16 floats of 4-channel interleaved data and splits them into a, b, c and d (a 4x4 transpose via unpacks).
+    __m128 t0 = _mm_loadu_ps(ptr +  0); // a0 b0 c0 d0
+    __m128 t1 = _mm_loadu_ps(ptr +  4); // a1 b1 c1 d1
+    __m128 t2 = _mm_loadu_ps(ptr +  8); // a2 b2 c2 d2
+    __m128 t3 = _mm_loadu_ps(ptr + 12); // a3 b3 c3 d3
+    __m128 t02lo = _mm_unpacklo_ps(t0, t2); // a0 a2 b0 b2
+    __m128 t13lo = _mm_unpacklo_ps(t1, t3); // a1 a3 b1 b3
+    __m128 t02hi = _mm_unpackhi_ps(t0, t2); // c0 c2 d0 d2
+    __m128 t13hi = _mm_unpackhi_ps(t1, t3); // c1 c3 d1 d3
+    a.val = _mm_unpacklo_ps(t02lo, t13lo); // a0 a1 a2 a3
+    b.val = _mm_unpackhi_ps(t02lo, t13lo); // b0 b1 b2 b3
+    c.val = _mm_unpacklo_ps(t02hi, t13hi); // c0 c1 c2 c3
+    d.val = _mm_unpackhi_ps(t02hi, t13hi); // d0 d1 d2 d3
+}
+
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
+{ // Reads 4 uint64 values of 2-channel interleaved data: a = {ptr[0], ptr[2]}, b = {ptr[1], ptr[3]}.
+    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0
+    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // a1 b1
+
+    a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
+    b = v_uint64x2(_mm_unpackhi_epi64(t0, t1));
+}
+
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
+{ // Reads 6 uint64 values of 3-channel interleaved data and splits them into a, b and c.
+    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0, b0
+    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0, a1
+    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1, c1
+
+    t1 = _mm_shuffle_epi32(t1, 0x4e); // a1, c0
+
+    a = v_uint64x2(_mm_unpacklo_epi64(t0, t1)); // a0, a1
+    b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2)); // b0, b1
+    c = v_uint64x2(_mm_unpackhi_epi64(t1, t2)); // c0, c1
+}
+
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, // reads 8 uint64 values of 4-channel interleaved data
+                                v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
+{
+    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0
+    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0 d0
+    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // a1 b1
+    __m128i t3 = _mm_loadu_si128((const __m128i*)(ptr + 6)); // c1 d1
+
+    a = v_uint64x2(_mm_unpacklo_epi64(t0, t2)); // a0 a1
+    b = v_uint64x2(_mm_unpackhi_epi64(t0, t2)); // b0 b1
+    c = v_uint64x2(_mm_unpacklo_epi64(t1, t3)); // c0 c1
+    d = v_uint64x2(_mm_unpackhi_epi64(t1, t3)); // d0 d1
+}
+
+// store interleave
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, // writes 32 bytes: a0 b0 a1 b1 ... a15 b15
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128i v0 = _mm_unpacklo_epi8(a.val, b.val); // a0 b0 ... a7 b7
+    __m128i v1 = _mm_unpackhi_epi8(a.val, b.val); // a8 b8 ... a15 b15
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // streaming (non-temporal) stores
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 16), v1);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // aligned stores
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 16), v1);
+    }
+    else // any other mode: unaligned stores
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 16), v1);
+    }
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                const v_uint8x16& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
+{ // Writes 48 bytes to ptr as a0 b0 c0 a1 b1 c1 ...; three build-time code paths produce v0..v2, mode selects the store.
+#if CV_SSE4_1
+    const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); // pshufb pattern for a
+    const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10); // pshufb pattern for b
+    const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15); // pshufb pattern for c
+    __m128i a0 = _mm_shuffle_epi8(a.val, sh_a); // pre-permute each channel so plain byte blends can assemble the outputs
+    __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
+    __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
+
+    const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); // selects byte positions 2, 5, 8, 11, 14
+    const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); // selects byte positions 1, 4, 7, 10, 13
+    __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0); // a0 b0 c0 a1 b1 c1 ... a5
+    __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0); // b5 c5 a6 b6 c6 ...
+    __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0); // remaining 16 bytes of the interleaved stream
+#elif CV_SSSE3
+    const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5); // pshufb permutation applied to t0
+    const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10); // pshufb permutation applied to t1
+    const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15); // pshufb permutation applied to t2
+
+    __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5); // gather source bytes for v0 via byte shifts + palignr
+    t0 = _mm_alignr_epi8(c.val, t0, 5);
+    __m128i v0 = _mm_shuffle_epi8(t0, m0);
+
+    __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6); // gather source bytes for v1
+    t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
+    __m128i v1 = _mm_shuffle_epi8(t1, m1);
+
+    __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11); // gather source bytes for v2
+    t2 = _mm_alignr_epi8(t2, a.val, 11);
+    __m128i v2 = _mm_shuffle_epi8(t2, m2);
+#else
+    __m128i z = _mm_setzero_si128(); // fallback path using only SSE2 intrinsics; z supplies zero padding bytes
+    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val); // a0 b0 a1 b1 ... a7 b7
+    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val); // a8 b8 ... a15 b15
+    __m128i c0 = _mm_unpacklo_epi8(c.val, z); // c0 0 c1 0 ... c7 0
+    __m128i c1 = _mm_unpackhi_epi8(c.val, z); // c8 0 ... c15 0
+
+    __m128i p00 = _mm_unpacklo_epi16(ab0, c0); // a0 b0 c0 0 a1 b1 c1 0 ... (4-byte groups with one padding byte)
+    __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
+    __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
+    __m128i p03 = _mm_unpackhi_epi16(ab1, c1);
+
+    __m128i p10 = _mm_unpacklo_epi32(p00, p01); // NOTE(review): the unpack/shift steps below appear to regroup the data so the padding bytes can be dropped
+    __m128i p11 = _mm_unpackhi_epi32(p00, p01);
+    __m128i p12 = _mm_unpacklo_epi32(p02, p03);
+    __m128i p13 = _mm_unpackhi_epi32(p02, p03);
+
+    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
+    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
+    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
+    __m128i p23 = _mm_unpackhi_epi64(p12, p13);
+
+    p20 = _mm_slli_si128(p20, 1); // whole-register shift left by one byte
+    p22 = _mm_slli_si128(p22, 1);
+
+    __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8); // per-64-bit-lane shifts by 8 bits (one byte)
+    __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
+    __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
+    __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);
+
+    __m128i p40 = _mm_unpacklo_epi64(p30, p31);
+    __m128i p41 = _mm_unpackhi_epi64(p30, p31);
+    __m128i p42 = _mm_unpacklo_epi64(p32, p33);
+    __m128i p43 = _mm_unpackhi_epi64(p32, p33);
+
+    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10)); // stitch neighbouring registers into the three 16-byte outputs
+    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
+    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));
+#endif
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal stores; require a 16-byte aligned address
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 16), v1);
+        _mm_stream_si128((__m128i*)(ptr + 32), v2);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // regular stores; require a 16-byte aligned address
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 16), v1);
+        _mm_store_si128((__m128i*)(ptr + 32), v2);
+    }
+    else // every other mode value: unaligned stores
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 16), v1);
+        _mm_storeu_si128((__m128i*)(ptr + 32), v2);
+    }
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                const v_uint8x16& c, const v_uint8x16& d,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{ // Writes 64 bytes to ptr as a0 b0 c0 d0 a1 b1 c1 d1 ...; mode selects the store instruction used.
+    // a0 a1 a2 a3 ....
+    // b0 b1 b2 b3 ....
+    // c0 c1 c2 c3 ....
+    // d0 d1 d2 d3 ....
+    __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
+    __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
+    __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
+    __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...
+
+    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
+    __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
+    __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
+    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal stores; require a 16-byte aligned address
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 16), v1);
+        _mm_stream_si128((__m128i*)(ptr + 32), v2);
+        _mm_stream_si128((__m128i*)(ptr + 48), v3);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // regular stores; require a 16-byte aligned address
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 16), v1);
+        _mm_store_si128((__m128i*)(ptr + 32), v2);
+        _mm_store_si128((__m128i*)(ptr + 48), v3);
+    }
+    else // every other mode value: unaligned stores
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 16), v1);
+        _mm_storeu_si128((__m128i*)(ptr + 32), v2);
+        _mm_storeu_si128((__m128i*)(ptr + 48), v3);
+    }
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{ // Writes 16 ushort values to ptr as a0 b0 a1 b1 ... a7 b7; mode selects the store instruction used.
+    __m128i v0 = _mm_unpacklo_epi16(a.val, b.val); // a0 b0 a1 b1 a2 b2 a3 b3
+    __m128i v1 = _mm_unpackhi_epi16(a.val, b.val); // a4 b4 a5 b5 a6 b6 a7 b7
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal stores; require a 16-byte aligned address
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 8), v1); // ptr is ushort*, so +8 elements = +16 bytes
+    }
+    else if( mode == hal::STORE_ALIGNED ) // regular stores; require a 16-byte aligned address
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 8), v1);
+    }
+    else // every other mode value: unaligned stores
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 8), v1);
+    }
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
+                                const v_uint16x8& b, const v_uint16x8& c,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{ // Writes 24 ushort values to ptr as a0 b0 c0 a1 b1 c1 ...; two build-time code paths produce v0..v2, mode selects the store.
+#if CV_SSE4_1
+    const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); // word order 0 3 6 1 4 7 2 5
+    const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5); // word order 5 0 3 6 1 4 7 2
+    const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); // word order 2 5 0 3 6 1 4 7
+    __m128i a0 = _mm_shuffle_epi8(a.val, sh_a); // pre-permute each channel so 16-bit blends can assemble the outputs
+    __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
+    __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
+
+    __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24); // 0x92 picks words 1,4,7; 0x24 picks words 2,5 -> a0 b0 c0 a1 b1 c1 a2 b2
+    __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24); // next 8 words of the interleaved stream
+    __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24); // last 8 words of the interleaved stream
+#else
+    __m128i z = _mm_setzero_si128(); // fallback path using only SSE2 intrinsics; z supplies zero padding words
+    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val); // a0 b0 a1 b1 a2 b2 a3 b3
+    __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val); // a4 b4 ... a7 b7
+    __m128i c0 = _mm_unpacklo_epi16(c.val, z); // c0 0 c1 0 c2 0 c3 0
+    __m128i c1 = _mm_unpackhi_epi16(c.val, z); // c4 0 ... c7 0
+
+    __m128i p10 = _mm_unpacklo_epi32(ab0, c0); // a0 b0 c0 0 a1 b1 c1 0 (4-word groups with one padding word)
+    __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
+    __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
+    __m128i p13 = _mm_unpackhi_epi32(ab1, c1);
+
+    __m128i p20 = _mm_unpacklo_epi64(p10, p11); // NOTE(review): the unpack/shift steps below appear to regroup the data so the padding words can be dropped
+    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
+    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
+    __m128i p23 = _mm_unpackhi_epi64(p12, p13);
+
+    p20 = _mm_slli_si128(p20, 2); // whole-register shift left by two bytes (one word)
+    p22 = _mm_slli_si128(p22, 2);
+
+    __m128i p30 = _mm_unpacklo_epi64(p20, p21);
+    __m128i p31 = _mm_unpackhi_epi64(p20, p21);
+    __m128i p32 = _mm_unpacklo_epi64(p22, p23);
+    __m128i p33 = _mm_unpackhi_epi64(p22, p23);
+
+    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10)); // stitch neighbouring registers into the three 16-byte outputs
+    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
+    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));
+#endif
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal stores; require a 16-byte aligned address
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 8), v1);
+        _mm_stream_si128((__m128i*)(ptr + 16), v2);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // regular stores; require a 16-byte aligned address
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 8), v1);
+        _mm_store_si128((__m128i*)(ptr + 16), v2);
+    }
+    else // every other mode value: unaligned stores
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 8), v1);
+        _mm_storeu_si128((__m128i*)(ptr + 16), v2);
+    }
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                                const v_uint16x8& c, const v_uint16x8& d,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{ // Writes 32 ushort values to ptr as a0 b0 c0 d0 a1 b1 c1 d1 ...; mode selects the store instruction used.
+    // a0 a1 a2 a3 ....
+    // b0 b1 b2 b3 ....
+    // c0 c1 c2 c3 ....
+    // d0 d1 d2 d3 ....
+    __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
+    __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
+    __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
+    __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...
+
+    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
+    __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
+    __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
+    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal stores; require a 16-byte aligned address
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 8), v1);
+        _mm_stream_si128((__m128i*)(ptr + 16), v2);
+        _mm_stream_si128((__m128i*)(ptr + 24), v3);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // regular stores; require a 16-byte aligned address
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 8), v1);
+        _mm_store_si128((__m128i*)(ptr + 16), v2);
+        _mm_store_si128((__m128i*)(ptr + 24), v3);
+    }
+    else // every other mode value: unaligned stores
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 8), v1);
+        _mm_storeu_si128((__m128i*)(ptr + 16), v2);
+        _mm_storeu_si128((__m128i*)(ptr + 24), v3);
+    }
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{ // Writes 8 unsigned values to ptr as a0 b0 a1 b1 a2 b2 a3 b3; mode selects the store instruction used.
+    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 b0 a1 b1
+    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // a2 b2 a3 b3
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal stores; require a 16-byte aligned address
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 4), v1); // ptr is unsigned*, so +4 elements = +16 bytes
+    }
+    else if( mode == hal::STORE_ALIGNED ) // regular stores; require a 16-byte aligned address
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 4), v1);
+    }
+    else // every other mode value: unaligned stores
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 4), v1);
+    }
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                                const v_uint32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
+{ // Writes 12 unsigned values to ptr as a0 b0 c0 a1 b1 c1 ...; mode selects the store instruction used.
+    v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3; // z acts as an all-zero fourth channel
+    v_transpose4x4(a, b, c, z, u0, u1, u2, u3); // presumably u_i = {a_i, b_i, c_i, 0}; v_transpose4x4 is defined outside this view
+
+    __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12)); // a0 b0 c0 | a1 (relies on u0's last lane being zero)
+    __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8)); // b1 c1 | a2 b2
+    __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4)); // c2 | a3 b3 c3
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal stores; require a 16-byte aligned address
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 4), v1);
+        _mm_stream_si128((__m128i*)(ptr + 8), v2);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // regular stores; require a 16-byte aligned address
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 4), v1);
+        _mm_store_si128((__m128i*)(ptr + 8), v2);
+    }
+    else // every other mode value: unaligned stores
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 4), v1);
+        _mm_storeu_si128((__m128i*)(ptr + 8), v2);
+    }
+}
+
+inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                               const v_uint32x4& c, const v_uint32x4& d,
+                               hal::StoreMode mode = hal::STORE_UNALIGNED)
+{ // Writes 16 unsigned values to ptr, interleaving a, b, c and d; mode selects the store instruction used.
+    v_uint32x4 v0, v1, v2, v3;
+    v_transpose4x4(a, b, c, d, v0, v1, v2, v3); // presumably v_i = {a_i, b_i, c_i, d_i}; v_transpose4x4 is defined outside this view
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal stores; require a 16-byte aligned address
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0.val);
+        _mm_stream_si128((__m128i*)(ptr + 4), v1.val);
+        _mm_stream_si128((__m128i*)(ptr + 8), v2.val);
+        _mm_stream_si128((__m128i*)(ptr + 12), v3.val);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // regular stores; require a 16-byte aligned address
+    {
+        _mm_store_si128((__m128i*)(ptr), v0.val);
+        _mm_store_si128((__m128i*)(ptr + 4), v1.val);
+        _mm_store_si128((__m128i*)(ptr + 8), v2.val);
+        _mm_store_si128((__m128i*)(ptr + 12), v3.val);
+    }
+    else // every other mode value: unaligned stores
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0.val);
+        _mm_storeu_si128((__m128i*)(ptr + 4), v1.val);
+        _mm_storeu_si128((__m128i*)(ptr + 8), v2.val);
+        _mm_storeu_si128((__m128i*)(ptr + 12), v3.val);
+    }
+}
+
+// 2-channel, float only
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+                               hal::StoreMode mode = hal::STORE_UNALIGNED)
+{ // Writes 8 floats to ptr as a0 b0 a1 b1 a2 b2 a3 b3; mode selects the store instruction used.
+    __m128 v0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
+    __m128 v1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal stores; _mm_stream_ps requires a 16-byte aligned address
+    {
+        _mm_stream_ps(ptr, v0);
+        _mm_stream_ps(ptr + 4, v1);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // regular stores; _mm_store_ps requires a 16-byte aligned address
+    {
+        _mm_store_ps(ptr, v0);
+        _mm_store_ps(ptr + 4, v1);
+    }
+    else // every other mode value: unaligned stores
+    {
+        _mm_storeu_ps(ptr, v0);
+        _mm_storeu_ps(ptr + 4, v1);
+    }
+}
+
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+                               const v_float32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
+{ // Writes 12 floats to ptr as a0 b0 c0 a1 b1 c1 ...; mode selects the store instruction used.
+    __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0)); // a0 a0 b0 b0
+    __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0)); // c0 c0 a1 a1
+    __m128 v0 = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 b0 c0 a1
+    __m128 u2 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(1, 1, 1, 1)); // b1 b1 c1 c1
+    __m128 u3 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(2, 2, 2, 2)); // a2 a2 b2 b2
+    __m128 v1 = _mm_shuffle_ps(u2, u3, _MM_SHUFFLE(2, 0, 2, 0)); // b1 c1 a2 b2
+    __m128 u4 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(3, 3, 2, 2)); // c2 c2 a3 a3
+    __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3)); // b3 b3 c3 c3
+    __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0)); // c2 a3 b3 c3
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal stores; require a 16-byte aligned address
+    {
+        _mm_stream_ps(ptr, v0);
+        _mm_stream_ps(ptr + 4, v1);
+        _mm_stream_ps(ptr + 8, v2);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // regular stores; require a 16-byte aligned address
+    {
+        _mm_store_ps(ptr, v0);
+        _mm_store_ps(ptr + 4, v1);
+        _mm_store_ps(ptr + 8, v2);
+    }
+    else // every other mode value: unaligned stores
+    {
+        _mm_storeu_ps(ptr, v0);
+        _mm_storeu_ps(ptr + 4, v1);
+        _mm_storeu_ps(ptr + 8, v2);
+    }
+}
+
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+                               const v_float32x4& c, const v_float32x4& d,
+                               hal::StoreMode mode = hal::STORE_UNALIGNED)
+{ // Writes 16 floats to ptr as a0 b0 c0 d0 a1 b1 c1 d1 ...; mode selects the store instruction used.
+    __m128 u0 = _mm_unpacklo_ps(a.val, c.val); // a0 c0 a1 c1
+    __m128 u1 = _mm_unpacklo_ps(b.val, d.val); // b0 d0 b1 d1
+    __m128 u2 = _mm_unpackhi_ps(a.val, c.val); // a2 c2 a3 c3
+    __m128 u3 = _mm_unpackhi_ps(b.val, d.val); // b2 d2 b3 d3
+    __m128 v0 = _mm_unpacklo_ps(u0, u1); // a0 b0 c0 d0
+    __m128 v2 = _mm_unpacklo_ps(u2, u3); // a2 b2 c2 d2
+    __m128 v1 = _mm_unpackhi_ps(u0, u1); // a1 b1 c1 d1
+    __m128 v3 = _mm_unpackhi_ps(u2, u3); // a3 b3 c3 d3
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal stores; require a 16-byte aligned address
+    {
+        _mm_stream_ps(ptr, v0);
+        _mm_stream_ps(ptr + 4, v1);
+        _mm_stream_ps(ptr + 8, v2);
+        _mm_stream_ps(ptr + 12, v3);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // regular stores; require a 16-byte aligned address
+    {
+        _mm_store_ps(ptr, v0);
+        _mm_store_ps(ptr + 4, v1);
+        _mm_store_ps(ptr + 8, v2);
+        _mm_store_ps(ptr + 12, v3);
+    }
+    else // every other mode value: unaligned stores
+    {
+        _mm_storeu_ps(ptr, v0);
+        _mm_storeu_ps(ptr + 4, v1);
+        _mm_storeu_ps(ptr + 8, v2);
+        _mm_storeu_ps(ptr + 12, v3);
+    }
+}
+
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               hal::StoreMode mode = hal::STORE_UNALIGNED)
+{ // Writes 4 uint64 values to ptr as a0 b0 a1 b1; mode selects the store instruction used.
+    __m128i v0 = _mm_unpacklo_epi64(a.val, b.val); // a0 b0
+    __m128i v1 = _mm_unpackhi_epi64(a.val, b.val); // a1 b1
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal stores; require a 16-byte aligned address
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 2), v1); // ptr is uint64*, so +2 elements = +16 bytes
+    }
+    else if( mode == hal::STORE_ALIGNED ) // regular stores; require a 16-byte aligned address
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 2), v1);
+    }
+    else // every other mode value: unaligned stores
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 2), v1);
+    }
+}
+
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               const v_uint64x2& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
+{ // Writes 6 uint64 values to ptr as a0 b0 c0 a1 b1 c1; mode selects the store instruction used.
+    __m128i v0 = _mm_unpacklo_epi64(a.val, b.val); // a0 b0
+    __m128i v1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val)); // c0 a1 (inner unpackhi duplicates a1 into both halves)
+    __m128i v2 = _mm_unpackhi_epi64(b.val, c.val); // b1 c1
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal stores; require a 16-byte aligned address
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 2), v1);
+        _mm_stream_si128((__m128i*)(ptr + 4), v2);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // regular stores; require a 16-byte aligned address
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 2), v1);
+        _mm_store_si128((__m128i*)(ptr + 4), v2);
+    }
+    else // every other mode value: unaligned stores
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 2), v1);
+        _mm_storeu_si128((__m128i*)(ptr + 4), v2);
+    }
+}
+
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               const v_uint64x2& c, const v_uint64x2& d,
+                               hal::StoreMode mode = hal::STORE_UNALIGNED)
+{ // Writes 8 uint64 values to ptr as a0 b0 c0 d0 a1 b1 c1 d1; mode selects the store instruction used.
+    __m128i v0 = _mm_unpacklo_epi64(a.val, b.val); // a0 b0
+    __m128i v1 = _mm_unpacklo_epi64(c.val, d.val); // c0 d0
+    __m128i v2 = _mm_unpackhi_epi64(a.val, b.val); // a1 b1
+    __m128i v3 = _mm_unpackhi_epi64(c.val, d.val); // c1 d1
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // non-temporal stores; require a 16-byte aligned address
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 2), v1);
+        _mm_stream_si128((__m128i*)(ptr + 4), v2);
+        _mm_stream_si128((__m128i*)(ptr + 6), v3);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // regular stores; require a 16-byte aligned address
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 2), v1);
+        _mm_store_si128((__m128i*)(ptr + 4), v2);
+        _mm_store_si128((__m128i*)(ptr + 6), v3);
+    }
+    else // every other mode value: unaligned stores
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 2), v1);
+        _mm_storeu_si128((__m128i*)(ptr + 4), v2);
+        _mm_storeu_si128((__m128i*)(ptr + 6), v3);
+    }
+}
+
+#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) /* generates 2/3/4-channel load/store overloads for _Tp0 by forwarding to the _Tp1 overloads */ \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
+{ /* 2-channel load: reinterpret the pointer as _Tp1, load, then reinterpret the vectors back to suffix0 */ \
+    _Tpvec1 a1, b1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
+{ /* 3-channel load, same forwarding scheme */ \
+    _Tpvec1 a1, b1, c1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
+{ /* 4-channel load, same forwarding scheme */ \
+    _Tpvec1 a1, b1, c1, d1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+    d0 = v_reinterpret_as_##suffix0(d1); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
+{ /* 2-channel store: reinterpret the inputs as suffix1 vectors and forward, passing mode through */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, mode);      \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
+{ /* 3-channel store, same forwarding scheme */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode);  \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                const _Tpvec0& c0, const _Tpvec0& d0, \
+                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
+{ /* 4-channel store, same forwarding scheme */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
+}
+
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8) // schar overloads forward to the uchar ones
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16) // short overloads forward to the ushort ones
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32) // int overloads forward to the unsigned ones
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64) // int64 overloads forward to the uint64 ones
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64) // double overloads forward to the uint64 ones (same 64-bit lane layout)
+
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{ // Converts the four signed 32-bit integer lanes of a to single-precision floats.
+    return v_float32x4(_mm_cvtepi32_ps(a.val));
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{ // Converts the two doubles of a to floats in the two low lanes; _mm_cvtpd_ps zeroes the two high lanes.
+    return v_float32x4(_mm_cvtpd_ps(a.val));
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{ // Converts a and b to floats and packs them as {a0, a1, b0, b1}.
+    return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val))); // movelh: low half of first arg, then low half of second
+}
+
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{ // Converts the two low signed 32-bit integer lanes of a to doubles.
+    return v_float64x2(_mm_cvtepi32_pd(a.val));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{ // Converts the two high signed 32-bit integer lanes of a to doubles.
+    return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val,8))); // 8-byte right shift moves lanes 2 and 3 into lanes 0 and 1
+}
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{ // Converts the two low float lanes of a to doubles.
+    return v_float64x2(_mm_cvtps_pd(a.val));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{ // Converts the two high float lanes of a to doubles.
+    return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val))); // movehl copies the high two floats into the low two lanes
+}
+
+// from (Mysticial and wim) https://stackoverflow.com/q/41144668
+inline v_float64x2 v_cvt_f64(const v_int64x2& v)
+{ // Converts both 64-bit integer lanes of v to double by building double bit patterns from the 32-bit halves and magic constants.
+    // constants encoded as floating-point
+    __m128i magic_i_hi32 = _mm_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
+    __m128i magic_i_all  = _mm_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
+    __m128d magic_d_all  = _mm_castsi128_pd(magic_i_all);
+    // Blend the 32 lowest significant bits of v with magic_i_lo
+#if CV_SSE4_1
+    __m128i magic_i_lo   = _mm_set1_epi64x(0x4330000000000000); // 2^52
+    __m128i v_lo         = _mm_blend_epi16(v.val, magic_i_lo, 0xcc); // 0xcc takes the upper 32 bits of each 64-bit lane from magic_i_lo
+#else
+    __m128i magic_i_lo   = _mm_set1_epi32(0x43300000); // 2^52
+    __m128i v_lo         = _mm_unpacklo_epi32(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(0, 0, 2, 0)), magic_i_lo); // SSE2 equivalent: {lo32(v0), 0x43300000, lo32(v1), 0x43300000}
+#endif
+    // Extract the 32 most significant bits of v
+    __m128i v_hi         = _mm_srli_epi64(v.val, 32);
+    // Flip the msb of v_hi and blend with 0x45300000
+            v_hi         = _mm_xor_si128(v_hi, magic_i_hi32);
+    // Compute in double precision
+    __m128d v_hi_dbl     = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all);
+    // (v_hi - magic_d_all) + v_lo  Do not assume associativity of floating point addition
+    __m128d result       = _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo));
+    return v_float64x2(result);
+}
+
+////////////// Lookup table access ////////////////////
+
+inline v_int8x16 v_lut(const schar* tab, const int* idx)
+{ // Gathers 16 bytes: lane i = tab[idx[i]] for i = 0..15.
+#if defined(_MSC_VER) // MSVC path builds the vector with a single SSE2 set intrinsic
+    return v_int8x16(_mm_setr_epi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
+                                   tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
+#else // other compilers: build two 64-bit __m64 halves and join them, low half first
+    return v_int8x16(_mm_setr_epi64(
+                        _mm_setr_pi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]]),
+                        _mm_setr_pi8(tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]])
+                    ));
+#endif
+}
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
+{ // Gathers 8 byte pairs: for i = 0..7 a 16-bit value is read at tab + idx[i] (no alignment check is made here).
+#if defined(_MSC_VER) // MSVC path builds the vector with a single SSE2 set intrinsic
+    return v_int8x16(_mm_setr_epi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]),
+                                    *(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])));
+#else // other compilers: build two 64-bit __m64 halves and join them, low half first
+    return v_int8x16(_mm_setr_epi64(
+                        _mm_setr_pi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3])),
+                        _mm_setr_pi16(*(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7]))
+                    ));
+#endif
+}
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
+{ // Gathers 4 byte quads: for i = 0..3 a 32-bit value is read at tab + idx[i] (no alignment check is made here).
+#if defined(_MSC_VER) // MSVC path builds the vector with a single SSE2 set intrinsic
+    return v_int8x16(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
+                                    *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
+#else // other compilers: build two 64-bit __m64 halves and join them, low half first
+    return v_int8x16(_mm_setr_epi64(
+                        _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
+                        _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
+                    ));
+#endif
+}
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); } // uchar overload: forwards to the schar version and reinterprets the result
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); } // uchar overload: forwards to the schar version and reinterprets the result
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); } // uchar overload: forwards to the schar version and reinterprets the result
+
+inline v_int16x8 v_lut(const short* tab, const int* idx)
+{ // Gathers 8 shorts: lane i = tab[idx[i]] for i = 0..7.
+#if defined(_MSC_VER) // MSVC path builds the vector with a single SSE2 set intrinsic
+    return v_int16x8(_mm_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
+                                    tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
+#else // other compilers: build two 64-bit __m64 halves and join them, low half first
+    return v_int16x8(_mm_setr_epi64(
+                        _mm_setr_pi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]),
+                        _mm_setr_pi16(tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])
+                    ));
+#endif
+}
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
+{ // Gathers 4 pairs of shorts: for i = 0..3 a 32-bit value is read at tab + idx[i] (no alignment check is made here).
+#if defined(_MSC_VER) // MSVC path builds the vector with a single SSE2 set intrinsic
+    return v_int16x8(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
+                                    *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
+#else // other compilers: build two 64-bit __m64 halves and join them, low half first
+    return v_int16x8(_mm_setr_epi64(
+                        _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
+                        _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
+                    ));
+#endif
+}
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
+{ // Gathers 2 quads of shorts: a 64-bit value is read at tab + idx[0] (low half) and at tab + idx[1] (high half).
+    return v_int16x8(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0]))); // _mm_set_epi64x takes the high element first
+}
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); } // ushort overload: forwards to the short version and reinterprets the result
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); } // ushort overload: forwards to the short version and reinterprets the result
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); } // ushort overload: forwards to the short version and reinterprets the result
+
+inline v_int32x4 v_lut(const int* tab, const int* idx)
+{ // Gathers 4 ints: lane i = tab[idx[i]] for i = 0..3.
+#if defined(_MSC_VER) // MSVC path builds the vector with a single SSE2 set intrinsic
+    return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]],
+                                    tab[idx[2]], tab[idx[3]]));
+#else // other compilers: build two 64-bit __m64 halves and join them, low half first
+    return v_int32x4(_mm_setr_epi64(
+                        _mm_setr_pi32(tab[idx[0]], tab[idx[1]]),
+                        _mm_setr_pi32(tab[idx[2]], tab[idx[3]])
+                    ));
+#endif
+}
+inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
+{ // Gathers 2 pairs of ints: a 64-bit value is read at tab + idx[0] (low half) and at tab + idx[1] (high half).
+    return v_int32x4(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0]))); // _mm_set_epi64x takes the high element first
+}
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
+{ // Loads 4 consecutive ints starting at tab + idx[0]; only idx[0] is read.
+    return v_int32x4(_mm_loadu_si128((const __m128i*)(tab + idx[0]))); // unaligned 128-bit load
+}
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); } // unsigned overload: forwards to the int version and reinterprets the result
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); } // unsigned overload: forwards to the int version and reinterprets the result
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); } // unsigned overload: forwards to the int version and reinterprets the result
+
+inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
+{ // Gathers 2 int64 values: lane 0 = tab[idx[0]], lane 1 = tab[idx[1]].
+    return v_int64x2(_mm_set_epi64x(tab[idx[1]], tab[idx[0]])); // _mm_set_epi64x takes the high element first
+}
+inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
+{ // Loads 2 consecutive int64 values starting at tab + idx[0]; only idx[0] is read.
+    return v_int64x2(_mm_loadu_si128((const __m128i*)(tab + idx[0]))); // unaligned 128-bit load
+}
+inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } // uint64 overload: forwards to the int64 version and reinterprets the result
+inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } // uint64 overload: forwards to the int64 version and reinterprets the result
+
+inline v_float32x4 v_lut(const float* tab, const int* idx)
+{ // Gathers 4 floats: lane i = tab[idx[i]] for i = 0..3.
+    return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
+}
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); } // float overload: forwards to the int version (same 32-bit lane size) and reinterprets the result
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); } // float overload: forwards to the int version (same 32-bit lane size) and reinterprets the result
+
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{ // Gathers 2 doubles: lane 0 = tab[idx[0]], lane 1 = tab[idx[1]].
+    return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
+}
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_loadu_si128((const __m128i*)(tab + idx[0])))); } // loads 2 consecutive doubles at tab + idx[0] via an unaligned integer load and a bit-cast
+
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{ // Gathers 4 ints using indices held in a vector: lane i = tab[idxvec[i]].
+    int CV_DECL_ALIGNED(32) idx[4]; // aligned scratch array so the indices can be read as scalars
+    v_store_aligned(idx, idxvec); // spill the index vector to memory
+    return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
+}
+
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) // unsigned table, indices held in a vector
+{
+    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec)); // delegate to the int overload and reinterpret the bits
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+    int CV_DECL_ALIGNED(32) lane[4]; // scalar copy of the four vector indices
+    v_store_aligned(lane, idxvec);
+    return v_float32x4(_mm_set_ps(tab[lane[3]], tab[lane[2]], tab[lane[1]], tab[lane[0]])); // set_ps lists the highest lane first
+}
+
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    int lane[2]; // only the two low indices of idxvec are used
+    v_store_low(lane, idxvec);
+    return v_float64x2(_mm_set_pd(tab[lane[1]], tab[lane[0]])); // set_pd lists the high lane first
+}
+
+// loads pairs from the table and deinterleaves them, e.g. returns:
+//   x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
+//   y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
+// note that the indices are float's indices, not the float-pair indices.
+// in theory, this function can be used to implement bilinear interpolation,
+// when idxvec are the offsets within the image.
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+    int CV_DECL_ALIGNED(32) idx[4];
+    v_store_aligned(idx, idxvec);
+    __m128 z = _mm_setzero_ps();
+    __m128 xy01 = _mm_loadl_pi(z, (const __m64*)(tab + idx[0])); // low half  <- (x0, y0); cast keeps the table const
+    __m128 xy23 = _mm_loadl_pi(z, (const __m64*)(tab + idx[2])); // low half  <- (x2, y2)
+    xy01 = _mm_loadh_pi(xy01, (const __m64*)(tab + idx[1]));     // high half <- (x1, y1)
+    xy23 = _mm_loadh_pi(xy23, (const __m64*)(tab + idx[3]));     // high half <- (x3, y3)
+    __m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23); // (x0, x2, y0, y2)
+    __m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23); // (x1, x3, y1, y3)
+    x = v_float32x4(_mm_unpacklo_ps(xxyy02, xxyy13)); // (x0, x1, x2, x3)
+    y = v_float32x4(_mm_unpackhi_ps(xxyy02, xxyy13)); // (y0, y1, y2, y3)
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    int lane[2]; // only the two low indices of idxvec are used
+    v_store_low(lane, idxvec);
+    const __m128d pair0 = _mm_loadu_pd(tab + lane[0]); // (tab[i0], tab[i0+1])
+    const __m128d pair1 = _mm_loadu_pd(tab + lane[1]); // (tab[i1], tab[i1+1])
+    x = v_float64x2(_mm_unpacklo_pd(pair0, pair1));    // first element of each pair
+    y = v_float64x2(_mm_unpackhi_pd(pair0, pair1));    // second element of each pair
+}
+
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) // reorders every group of 4 bytes from (0,1,2,3) to (0,2,1,3)
+{
+#if CV_SSSE3
+    return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200))); // each mask byte is the source byte index
+#else
+    __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
+    a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0));
+    a = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
+    return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a))); // SSE2-only path: byte-interleave the low and high halves of the regrouped vector
+#endif
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } // unsigned: same shuffle on the reinterpreted bits
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec) // reorders every group of 8 bytes to (0,4,1,5,2,6,3,7)
+{
+#if CV_SSSE3
+    return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400))); // each mask byte is the source byte index
+#else
+    __m128i a = _mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)); // 32-bit lanes reordered to 0,2,1,3
+    return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a))); // byte-interleave the low half with the high half
+#endif
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } // unsigned: same shuffle on the reinterpreted bits
+
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) // reorders every group of 4 16-bit lanes from (0,1,2,3) to (0,2,1,3)
+{
+#if CV_SSSE3
+    return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100))); // byte-level mask, two bytes per 16-bit lane
+#else
+    __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0)); // low four lanes
+    return v_int16x8(_mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0))); // high four lanes
+#endif
+}
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } // unsigned: same shuffle on the reinterpreted bits
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec) // reorders the 8 lanes to (0,4,1,5,2,6,3,7)
+{
+#if CV_SSSE3
+    return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100))); // byte-level mask, two bytes per 16-bit lane
+#else
+    return v_int16x8(_mm_unpacklo_epi16(vec.val, _mm_unpackhi_epi64(vec.val, vec.val))); // interleave the low four lanes with the high four lanes
+#endif
+}
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } // unsigned: same shuffle on the reinterpreted bits
+
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec) // swaps the two middle lanes
+{
+    return v_int32x4(_mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0))); // result lanes = source lanes 0, 2, 1, 3
+}
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // unsigned: via the int32 overload
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // float: via the int32 overload
+
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec) // drops source bytes 3, 7 and 11 so bytes 0-2, 4-6, 8-10, 12-14 become contiguous
+{
+#if CV_SSSE3
+    return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100))); // 0xff mask bytes zero the top three result bytes
+#else
+    __m128i mask = _mm_set1_epi64x(0x00000000FFFFFFFF);
+    __m128i a = _mm_srli_si128(_mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8)))), 1);
+    return v_int8x16(_mm_srli_si128(_mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 1, 0, 3)), 2)); // NOTE(review): SSE2-only sequence; trailing filler bytes may differ from the SSSE3 path - confirm if callers read them
+#endif
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } // unsigned: same packing on the reinterpreted bits
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec) // drops source lane 3 so lanes 0-2 and 4-6 become contiguous
+{
+#if CV_SSSE3
+    return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100))); // 0xffff mask bytes zero the last result lane
+#else
+    return v_int16x8(_mm_srli_si128(_mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(2, 1, 0, 3)), 2)); // rotate lane 3 to the front, then shift it out
+#endif
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } // unsigned: same packing on the reinterpreted bits
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } // 4 lanes: the first three are already contiguous, so this is the identity
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } // identity, as above
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } // identity, as above
+
+template<int i>
+inline uchar v_extract_n(const v_uint8x16& v) // returns lane i; i is a compile-time index
+{
+#if CV_SSE4_1
+    return (uchar)_mm_extract_epi8(v.val, i);
+#else
+    return v_rotate_right<i>(v).get0(); // no SSE4.1: presumably moves lane i into lane 0, then reads lane 0
+#endif
+}
+
+template<int i>
+inline schar v_extract_n(const v_int8x16& v) // signed variant
+{
+    return (schar)v_extract_n<i>(v_reinterpret_as_u8(v)); // extract as unsigned, then convert back to schar
+}
+
+template<int i>
+inline ushort v_extract_n(const v_uint16x8& v) // returns lane i; i is a compile-time index
+{
+    return (ushort)_mm_extract_epi16(v.val, i); // no fallback branch: used regardless of CV_SSE4_1
+}
+
+template<int i>
+inline short v_extract_n(const v_int16x8& v) // signed variant
+{
+    return (short)v_extract_n<i>(v_reinterpret_as_u16(v)); // extract as unsigned, then convert back to short
+}
+
+template<int i>
+inline uint v_extract_n(const v_uint32x4& v) // returns lane i; i is a compile-time index
+{
+#if CV_SSE4_1
+    return (uint)_mm_extract_epi32(v.val, i);
+#else
+    return v_rotate_right<i>(v).get0(); // no SSE4.1: presumably moves lane i into lane 0, then reads lane 0
+#endif
+}
+
+template<int i>
+inline int v_extract_n(const v_int32x4& v) // signed variant
+{
+    return (int)v_extract_n<i>(v_reinterpret_as_u32(v)); // extract as unsigned, then convert back to int
+}
+
+template<int i>
+inline uint64 v_extract_n(const v_uint64x2& v) // returns lane i; i is a compile-time index
+{
+#ifdef CV__SIMD_NATIVE_mm_extract_epi64 // set in intrin_sse_em.hpp when _mm_extract_epi64 is usable
+    return (uint64)_v128_extract_epi64<i>(v.val);
+#else
+    return v_rotate_right<i>(v).get0(); // otherwise: presumably moves lane i into lane 0, then reads lane 0
+#endif
+}
+
+template<int i>
+inline int64 v_extract_n(const v_int64x2& v) // signed variant
+{
+    return (int64)v_extract_n<i>(v_reinterpret_as_u64(v)); // extract as unsigned, then convert back to int64
+}
+
+template<int i>
+inline float v_extract_n(const v_float32x4& v) // returns float lane i
+{
+    union { uint iv; float fv; } d; // union used to reinterpret the 32 extracted bits as a float
+    d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
+    return d.fv;
+}
+
+template<int i>
+inline double v_extract_n(const v_float64x2& v) // returns double lane i
+{
+    union { uint64 iv; double dv; } d; // union used to reinterpret the 64 extracted bits as a double
+    d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
+    return d.dv;
+}
+
+template<int i>
+inline v_int32x4 v_broadcast_element(const v_int32x4& v) // copies lane i into all four lanes
+{
+    return v_int32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i))); // every selector picks source lane i
+}
+
+template<int i>
+inline v_uint32x4 v_broadcast_element(const v_uint32x4& v) // copies lane i into all four lanes
+{
+    return v_uint32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i))); // every selector picks source lane i
+}
+
+template<int i>
+inline v_float32x4 v_broadcast_element(const v_float32x4& v) // copies lane i into all four lanes
+{
+    return v_float32x4(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE((char)i,(char)i,(char)i,(char)i))); // NOTE(review): the (char) casts look like a compiler-warning workaround - confirm
+}
+
+////////////// FP16 support ///////////////////////////
+
+inline v_float32x4 v_load_expand(const hfloat* ptr) // converts the 4 half-precision values at ptr to float32
+{
+#if CV_FP16
+    return v_float32x4(_mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)ptr))); // cvtph_ps only uses the low 64 bits, so read just those 8 bytes
+#else
+    const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000); // delta re-biases the exponent (half bias 15 -> float bias 127)
+    const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000);
+    const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000)); // 2^-14 as a float
+    __m128i bits = _mm_unpacklo_epi16(z, _mm_loadl_epi64((const __m128i*)ptr)); // h << 16
+    __m128i e = _mm_and_si128(bits, maxexp), sign = _mm_and_si128(bits, signmask);
+    __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_xor_si128(bits, sign), 3), delta); // ((h & 0x7fff) << 13) + delta
+    __m128i zt = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_add_epi32(t, _mm_set1_epi32(1 << 23))), deltaf)); // result used when the half exponent field is zero
+
+    t = _mm_add_epi32(t, _mm_and_si128(delta, _mm_cmpeq_epi32(maxexp, e))); // all-ones half exponent (Inf/NaN): raise to the float Inf/NaN exponent
+    __m128i zmask = _mm_cmpeq_epi32(e, z);
+    __m128i ft = v_select_si128(zmask, zt, t);
+    return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign))); // put the sign bit back
+#endif
+}
+
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v) // converts 4 floats to half precision and stores 8 bytes at ptr
+{
+#if CV_FP16
+    __m128i fp16_value = _mm_cvtps_ph(v.val, 0);
+    _mm_storel_epi64((__m128i*)ptr, fp16_value);
+#else
+    const __m128i signmask = _mm_set1_epi32(0x80000000);
+    const __m128i rval = _mm_set1_epi32(0x3f000000); // bit pattern of 0.5f
+
+    __m128i t = _mm_castps_si128(v.val);
+    __m128i sign = _mm_srai_epi32(_mm_and_si128(t, signmask), 16); // sign moved to bit 15; the arithmetic shift also fills bits 16..31
+    t = _mm_andnot_si128(signmask, t); // raw bits of |v|
+
+    __m128i finitemask = _mm_cmpgt_epi32(_mm_set1_epi32(0x47800000), t); // |v| < 65536.0f
+    __m128i isnan = _mm_cmpgt_epi32(t, _mm_set1_epi32(0x7f800000));
+    __m128i naninf = v_select_si128(isnan, _mm_set1_epi32(0x7e00), _mm_set1_epi32(0x7c00)); // half NaN or half Inf
+    __m128i tinymask = _mm_cmpgt_epi32(_mm_set1_epi32(0x38800000), t); // |v| < 2^-14
+    __m128i tt = _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(t), _mm_castsi128_ps(rval)));
+    tt = _mm_sub_epi32(tt, rval); // result for the tiny range
+    __m128i odd = _mm_and_si128(_mm_srli_epi32(t, 13), _mm_set1_epi32(1)); // lowest mantissa bit that is kept
+    __m128i nt = _mm_add_epi32(t, _mm_set1_epi32(0xc8000fff)); // exponent re-bias plus rounding term
+    nt = _mm_srli_epi32(_mm_add_epi32(nt, odd), 13);
+    t = v_select_si128(tinymask, tt, nt);
+    t = v_select_si128(finitemask, t, naninf);
+    t = _mm_or_si128(t, sign);
+    t = _mm_packs_epi32(t, t); // signed saturating pack to 16 bits
+    _mm_storel_epi64((__m128i*)ptr, t);
+#endif
+}
+
+inline void v_cleanup() {} // no-op in this backend
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_sse_em.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_sse_em.hpp
new file mode 100644
index 000000000000..6fb088161a5d
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_sse_em.hpp
@@ -0,0 +1,180 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_HAL_INTRIN_SSE_EM_HPP
+#define OPENCV_HAL_INTRIN_SSE_EM_HPP
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#define OPENCV_HAL_SSE_WRAP_1(fun, tp) \
+    inline tp _v128_##fun(const tp& a) \
+    { return _mm_##fun(a); } // unary wrapper: _v128_<fun>(a) forwards to _mm_<fun>(a); pass the bare name without prefix
+
+#define OPENCV_HAL_SSE_WRAP_2(fun, tp) \
+    inline tp _v128_##fun(const tp& a, const tp& b) \
+    { return _mm_##fun(a, b); } // binary wrapper: _v128_<fun>(a, b) forwards to _mm_<fun>(a, b)
+
+#define OPENCV_HAL_SSE_WRAP_3(fun, tp) \
+    inline tp _v128_##fun(const tp& a, const tp& b, const tp& c) \
+    { return _mm_##fun(a, b, c); } // ternary wrapper: _v128_<fun>(a, b, c) forwards to _mm_<fun>(a, b, c)
+
+///////////////////////////// XOP /////////////////////////////
+
+// [todo] define CV_XOP; until then the emulation below is always selected
+#if 1 // !CV_XOP
+inline __m128i _v128_comgt_epu32(const __m128i& a, const __m128i& b) // per-lane unsigned 32-bit a > b
+{
+    const __m128i delta = _mm_set1_epi32((int)0x80000000); // flipping the top bit maps unsigned order onto signed order
+    return _mm_cmpgt_epi32(_mm_xor_si128(a, delta), _mm_xor_si128(b, delta));
+}
+// wrapping XOP: the macro prepends _v128_ / _mm_ itself, so pass the bare intrinsic name
+#else
+OPENCV_HAL_SSE_WRAP_2(comgt_epu32, __m128i)
+#endif // !CV_XOP
+
+///////////////////////////// SSE4.1 /////////////////////////////
+
+#if !CV_SSE4_1
+
+/** Swizzle **/
+inline __m128i _v128_blendv_epi8(const __m128i& a, const __m128i& b, const __m128i& mask) // bitwise select: b where mask bits are 1, a elsewhere
+{ return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(b, a), mask)); } // a ^ ((b ^ a) & mask); selects per bit, so it presumably expects all-ones/all-zero lanes - confirm with callers
+
+/** Convert **/
+// 8 >> 16
+inline __m128i _v128_cvtepu8_epi16(const __m128i& a)
+{
+    // interleaving the low 8 bytes with zero bytes widens them to 16 bits without sign extension
+    return _mm_unpacklo_epi8(a, _mm_setzero_si128());
+}
+inline __m128i _v128_cvtepi8_epi16(const __m128i& a) // sign-extends the low 8 bytes to 16 bits
+{ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); } // duplicate each byte into a 16-bit lane, then arithmetic shift right by 8
+// 8 >> 32
+inline __m128i _v128_cvtepu8_epi32(const __m128i& a)
+{
+    const __m128i as16 = _mm_unpacklo_epi8(a, _mm_setzero_si128()); // 8 -> 16 bits, zero-filled
+    return _mm_unpacklo_epi16(as16, _mm_setzero_si128());           // 16 -> 32 bits, zero-filled
+}
+inline __m128i _v128_cvtepi8_epi32(const __m128i& a)
+{
+    const __m128i twice = _mm_unpacklo_epi8(a, a);            // every low byte duplicated
+    const __m128i fourfold = _mm_unpacklo_epi8(twice, twice); // each of the 4 lowest bytes now fills one 32-bit lane
+    return _mm_srai_epi32(fourfold, 24);                      // arithmetic shift leaves the sign-extended byte
+}
+// 16 >> 32
+inline __m128i _v128_cvtepu16_epi32(const __m128i& a)
+{
+    // interleaving the low four 16-bit lanes with zeros widens them to 32 bits without sign extension
+    return _mm_unpacklo_epi16(a, _mm_setzero_si128());
+}
+inline __m128i _v128_cvtepi16_epi32(const __m128i& a) // sign-extends the low four 16-bit lanes to 32 bits
+{ return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); } // duplicate each lane into 32 bits, then arithmetic shift right by 16
+// 32 >> 64
+inline __m128i _v128_cvtepu32_epi64(const __m128i& a)
+{
+    // interleaving the low two 32-bit lanes with zeros widens them to 64 bits without sign extension
+    return _mm_unpacklo_epi32(a, _mm_setzero_si128());
+}
+inline __m128i _v128_cvtepi32_epi64(const __m128i& a) // sign-extends the low two 32-bit lanes to 64 bits
+{ return _mm_unpacklo_epi32(a, _mm_srai_epi32(a, 31)); } // the >> 31 copy supplies the upper 32 bits (all sign bits)
+
+/** Arithmetic **/
+inline __m128i _v128_mullo_epi32(const __m128i& a, const __m128i& b)
+{
+    const __m128i even = _mm_mul_epu32(a, b); // 64-bit products of lanes 0 and 2
+    const __m128i odd = _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32)); // 64-bit products of lanes 1 and 3
+    const __m128i lo01 = _mm_unpacklo_epi32(even, odd); // low words of products 0 and 1 come first
+    const __m128i lo23 = _mm_unpackhi_epi32(even, odd); // low words of products 2 and 3 come first
+    return _mm_unpacklo_epi64(lo01, lo23); // keep only the low 32 bits of each product
+}
+
+/** Math **/
+inline __m128i _v128_min_epu32(const __m128i& a, const __m128i& b) // per-lane unsigned 32-bit minimum
+{ return _v128_blendv_epi8(a, b, _v128_comgt_epu32(a, b)); } // take b where a > b (unsigned), otherwise keep a
+
+// wrapping SSE4.1: when CV_SSE4_1 is set, each _v128_<name> forwards to the native _mm_<name>
+#else
+OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi16, __m128i)
+OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi16, __m128i)
+OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi32, __m128i)
+OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi32, __m128i)
+OPENCV_HAL_SSE_WRAP_1(cvtepu16_epi32, __m128i)
+OPENCV_HAL_SSE_WRAP_1(cvtepi16_epi32, __m128i)
+OPENCV_HAL_SSE_WRAP_1(cvtepu32_epi64, __m128i)
+OPENCV_HAL_SSE_WRAP_1(cvtepi32_epi64, __m128i)
+OPENCV_HAL_SSE_WRAP_2(min_epu32, __m128i)
+OPENCV_HAL_SSE_WRAP_2(mullo_epi32, __m128i)
+OPENCV_HAL_SSE_WRAP_3(blendv_epi8, __m128i)
+#endif // !CV_SSE4_1
+
+///////////////////////////// Revolutionary /////////////////////////////
+
+/** Convert **/
+// 16 << 8
+inline __m128i _v128_cvtepu8_epi16_high(const __m128i& a)
+{
+    // interleaving the high 8 bytes with zero bytes widens them to 16 bits without sign extension
+    return _mm_unpackhi_epi8(a, _mm_setzero_si128());
+}
+inline __m128i _v128_cvtepi8_epi16_high(const __m128i& a) // sign-extends the high 8 bytes to 16 bits
+{ return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); } // duplicate each byte into a 16-bit lane, then arithmetic shift right by 8
+// 32 << 16
+inline __m128i _v128_cvtepu16_epi32_high(const __m128i& a)
+{
+    // interleaving the high four 16-bit lanes with zeros widens them to 32 bits without sign extension
+    return _mm_unpackhi_epi16(a, _mm_setzero_si128());
+}
+inline __m128i _v128_cvtepi16_epi32_high(const __m128i& a) // sign-extends the high four 16-bit lanes to 32 bits
+{ return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); } // duplicate each lane into 32 bits, then arithmetic shift right by 16
+// 64 << 32
+inline __m128i _v128_cvtepu32_epi64_high(const __m128i& a)
+{
+    // interleaving the high two 32-bit lanes with zeros widens them to 64 bits without sign extension
+    return _mm_unpackhi_epi32(a, _mm_setzero_si128());
+}
+inline __m128i _v128_cvtepi32_epi64_high(const __m128i& a) // sign-extends the high two 32-bit lanes to 64 bits
+{ return _mm_unpackhi_epi32(a, _mm_srai_epi32(a, 31)); } // the >> 31 copy supplies the upper 32 bits (all sign bits)
+
+/** Miscellaneous **/
+inline __m128i _v128_packs_epu32(const __m128i& a, const __m128i& b) // packs unsigned 32-bit lanes of a and b into 16-bit lanes, clamping at 65535
+{
+    const __m128i m = _mm_set1_epi32(65535);
+    __m128i am = _v128_min_epu32(a, m); // clamp first so the pack below never sees a value above 65535
+    __m128i bm = _v128_min_epu32(b, m);
+#if CV_SSE4_1
+    return _mm_packus_epi32(am, bm);
+#else
+    const __m128i d = _mm_set1_epi32(32768), nd = _mm_set1_epi16(-32768);
+    am = _mm_sub_epi32(am, d); // shift [0, 65535] down to [-32768, 32767] so the signed pack does not saturate
+    bm = _mm_sub_epi32(bm, d);
+    am = _mm_packs_epi32(am, bm);
+    return _mm_sub_epi16(am, nd); // subtracting -32768 undoes the shift (16-bit wrap-around)
+#endif
+}
+
+template<int i>
+inline int64 _v128_extract_epi64(const __m128i& a) // returns 64-bit lane i of a
+{
+#if defined(CV__SIMD_HAVE_mm_extract_epi64) || (CV_SSE4_1 && (defined(__x86_64__)/*GCC*/ || defined(_M_X64)/*MSVC*/))
+#define CV__SIMD_NATIVE_mm_extract_epi64 1 // preprocessor directive: visible to all code after this point, not scoped to the function
+    return _mm_extract_epi64(a, i);
+#else
+    CV_DECL_ALIGNED(16) int64 tmp[2]; // fallback: spill the vector to an aligned buffer and index it
+    _mm_store_si128((__m128i*)tmp, a);
+    return tmp[i];
+#endif
+}
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+} // cv::
+
+#endif // OPENCV_HAL_INTRIN_SSE_EM_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_vsx.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_vsx.hpp
new file mode 100644
index 000000000000..e66563bede26
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_vsx.hpp
@@ -0,0 +1,1608 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_HAL_VSX_HPP
+#define OPENCV_HAL_VSX_HPP
+
+#include <algorithm>
+#include "opencv2/core/utility.hpp"
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+///////// Types ////////////
+
+struct v_uint8x16 // 16 uchar lanes stored in a VSX vec_uchar16
+{
+    typedef uchar lane_type;
+    enum { nlanes = 16 };
+    vec_uchar16 val;
+
+    explicit v_uint8x16(const vec_uchar16& v) : val(v)
+    {}
+    v_uint8x16() // val is left uninitialized
+    {}
+    v_uint8x16(vec_bchar16 v) : val(vec_uchar16_c(v)) // non-explicit conversion from vec_bchar16
+    {}
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+        : val(vec_uchar16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
+    {}
+
+    static inline v_uint8x16 zero() { return v_uint8x16(vec_uchar16_z); }
+
+    uchar get0() const // value of lane 0
+    { return vec_extract(val, 0); }
+};
+
+struct v_int8x16 // 16 schar lanes stored in a VSX vec_char16
+{
+    typedef schar lane_type;
+    enum { nlanes = 16 };
+    vec_char16 val;
+
+    explicit v_int8x16(const vec_char16& v) : val(v)
+    {}
+    v_int8x16() // val is left uninitialized
+    {}
+    v_int8x16(vec_bchar16 v) : val(vec_char16_c(v)) // non-explicit conversion from vec_bchar16
+    {}
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+        : val(vec_char16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
+    {}
+
+    static inline v_int8x16 zero() { return v_int8x16(vec_char16_z); }
+
+    schar get0() const // value of lane 0
+    { return vec_extract(val, 0); }
+};
+
+struct v_uint16x8 // 8 ushort lanes stored in a VSX vec_ushort8
+{
+    typedef ushort lane_type;
+    enum { nlanes = 8 };
+    vec_ushort8 val;
+
+    explicit v_uint16x8(const vec_ushort8& v) : val(v)
+    {}
+    v_uint16x8() // val is left uninitialized
+    {}
+    v_uint16x8(vec_bshort8 v) : val(vec_ushort8_c(v)) // non-explicit conversion from vec_bshort8
+    {}
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+        : val(vec_ushort8_set(v0, v1, v2, v3, v4, v5, v6, v7))
+    {}
+
+    static inline v_uint16x8 zero() { return v_uint16x8(vec_ushort8_z); }
+
+    ushort get0() const // value of lane 0
+    { return vec_extract(val, 0); }
+};
+
+struct v_int16x8 // 8 short lanes stored in a VSX vec_short8
+{
+    typedef short lane_type;
+    enum { nlanes = 8 };
+    vec_short8 val;
+
+    explicit v_int16x8(const vec_short8& v) : val(v)
+    {}
+    v_int16x8() // val is left uninitialized
+    {}
+    v_int16x8(vec_bshort8 v) : val(vec_short8_c(v)) // non-explicit conversion from vec_bshort8
+    {}
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+        : val(vec_short8_set(v0, v1, v2, v3, v4, v5, v6, v7))
+    {}
+
+    static inline v_int16x8 zero() { return v_int16x8(vec_short8_z); }
+
+    short get0() const // value of lane 0
+    { return vec_extract(val, 0); }
+};
+
+struct v_uint32x4 // 4 unsigned lanes stored in a VSX vec_uint4
+{
+    typedef unsigned lane_type;
+    enum { nlanes = 4 };
+    vec_uint4 val;
+
+    explicit v_uint32x4(const vec_uint4& v) : val(v)
+    {}
+    v_uint32x4() // val is left uninitialized
+    {}
+    v_uint32x4(vec_bint4 v) : val(vec_uint4_c(v)) // non-explicit conversion from vec_bint4
+    {}
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) : val(vec_uint4_set(v0, v1, v2, v3))
+    {}
+
+    static inline v_uint32x4 zero() { return v_uint32x4(vec_uint4_z); }
+
+    uint get0() const // value of lane 0
+    { return vec_extract(val, 0); }
+};
+
+struct v_int32x4 // 4 int lanes stored in a VSX vec_int4
+{
+    typedef int lane_type;
+    enum { nlanes = 4 };
+    vec_int4 val;
+
+    explicit v_int32x4(const vec_int4& v) : val(v)
+    {}
+    v_int32x4() // val is left uninitialized
+    {}
+    v_int32x4(vec_bint4 v) : val(vec_int4_c(v)) // non-explicit conversion from vec_bint4
+    {}
+    v_int32x4(int v0, int v1, int v2, int v3) : val(vec_int4_set(v0, v1, v2, v3))
+    {}
+
+    static inline v_int32x4 zero() { return v_int32x4(vec_int4_z); }
+
+    int get0() const // value of lane 0
+    { return vec_extract(val, 0); }
+};
+
+struct v_float32x4 // 4 float lanes stored in a VSX vec_float4
+{
+    typedef float lane_type;
+    enum { nlanes = 4 };
+    vec_float4 val;
+
+    explicit v_float32x4(const vec_float4& v) : val(v)
+    {}
+    v_float32x4() // val is left uninitialized
+    {}
+    v_float32x4(vec_bint4 v) : val(vec_float4_c(v)) // non-explicit conversion from vec_bint4
+    {}
+    v_float32x4(float v0, float v1, float v2, float v3) : val(vec_float4_set(v0, v1, v2, v3))
+    {}
+
+    static inline v_float32x4 zero() { return v_float32x4(vec_float4_z); }
+
+    float get0() const // value of lane 0
+    { return vec_extract(val, 0); }
+};
+
+struct v_uint64x2 // 2 uint64 lanes stored in a VSX vec_udword2
+{
+    typedef uint64 lane_type;
+    enum { nlanes = 2 };
+    vec_udword2 val;
+
+    explicit v_uint64x2(const vec_udword2& v) : val(v)
+    {}
+    v_uint64x2() // val is left uninitialized
+    {}
+    v_uint64x2(vec_bdword2 v) : val(vec_udword2_c(v)) // non-explicit conversion from vec_bdword2
+    {}
+    v_uint64x2(uint64 v0, uint64 v1) : val(vec_udword2_set(v0, v1))
+    {}
+
+    static inline v_uint64x2 zero() { return v_uint64x2(vec_udword2_z); }
+
+    uint64 get0() const // value of lane 0
+    { return vec_extract(val, 0); }
+};
+
+struct v_int64x2 // 2 int64 lanes stored in a VSX vec_dword2
+{
+    typedef int64 lane_type;
+    enum { nlanes = 2 };
+    vec_dword2 val;
+
+    explicit v_int64x2(const vec_dword2& v) : val(v)
+    {}
+    v_int64x2() // val is left uninitialized
+    {}
+    v_int64x2(vec_bdword2 v) : val(vec_dword2_c(v)) // non-explicit conversion from vec_bdword2
+    {}
+    v_int64x2(int64 v0, int64 v1) : val(vec_dword2_set(v0, v1))
+    {}
+
+    static inline v_int64x2 zero() { return v_int64x2(vec_dword2_z); }
+
+    int64 get0() const // value of lane 0
+    { return vec_extract(val, 0); }
+};
+
+struct v_float64x2 // 2 double lanes stored in a VSX vec_double2
+{
+    typedef double lane_type;
+    enum { nlanes = 2 };
+    vec_double2 val;
+
+    explicit v_float64x2(const vec_double2& v) : val(v)
+    {}
+    v_float64x2() // val is left uninitialized
+    {}
+    v_float64x2(vec_bdword2 v) : val(vec_double2_c(v)) // non-explicit conversion from vec_bdword2
+    {}
+    v_float64x2(double v0, double v1) : val(vec_double2_set(v0, v1))
+    {}
+
+    static inline v_float64x2 zero() { return v_float64x2(vec_double2_z); }
+
+    double get0() const // value of lane 0
+    { return vec_extract(val, 0); }
+};
+
+#define OPENCV_HAL_IMPL_VSX_EXTRACT_N(_Tpvec, _Tp) \
+template<int i> inline _Tp v_extract_n(VSX_UNUSED(_Tpvec v)) { return vec_extract(v.val, i); } // generates v_extract_n<i>: returns lane i via vec_extract; the vector is taken by value
+
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint8x16, uchar)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int8x16, schar)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint16x8, ushort)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int16x8, short)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint32x4, uint)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int32x4, int)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint64x2, uint64)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int64x2, int64)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float32x4, float)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float64x2, double)
+
+//////////////// Load and store operations ///////////////
+
+/*
+ * clang-5 aborts while parsing "vec_xxx_c" when it appears inside
+ * a function template that is defined by a preprocessor macro.
+ *
+ * Writing the conversion as an explicit cast (the "cast" macro argument below) lets clang-5 compile it.
+*/
+#define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast)                        \
+inline _Tpvec v_setzero_##suffix() { return _Tpvec(vec_splats((_Tp)0)); }             \
+inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v));}          \
+template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a)  \
+{ return _Tpvec((cast)a.val); } // generates v_setzero_<suffix>, v_setall_<suffix> and v_reinterpret_as_<suffix> for one vector type
+
+OPENCV_HAL_IMPL_VSX_INITVEC(v_uint8x16, uchar, u8, vec_uchar16)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_int8x16, schar, s8, vec_char16)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_uint16x8, ushort, u16, vec_ushort8)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_int16x8, short, s16, vec_short8)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_uint32x4, uint, u32, vec_uint4)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_int32x4, int, s32, vec_int4)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_uint64x2, uint64, u64, vec_udword2)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_int64x2, int64, s64, vec_dword2)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_float32x4, float, f32, vec_float4)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_float64x2, double, f64, vec_double2)
+
+#define OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, ld, ld_a, st, st_a)    \
+inline _Tpvec v_load(const _Tp* ptr)                                        \
+{ return _Tpvec(ld(0, ptr)); }                                              \
+inline _Tpvec v_load_aligned(VSX_UNUSED(const _Tp* ptr))                    \
+{ return _Tpvec(ld_a(0, ptr)); }                                            \
+inline _Tpvec v_load_low(const _Tp* ptr)                                    \
+{ return _Tpvec(vec_ld_l8(ptr)); }                                          \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1)               \
+{ return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); }          \
+inline void v_store(_Tp* ptr, const _Tpvec& a)                              \
+{ st(a.val, 0, ptr); }                                                      \
+inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a)          \
+{ st_a(a.val, 0, ptr); }                                                    \
+inline void v_store_aligned_nocache(VSX_UNUSED(_Tp* ptr), const _Tpvec& a)  \
+{ st_a(a.val, 0, ptr); }                                                    \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)         \
+{ if(mode == hal::STORE_UNALIGNED) st(a.val, 0, ptr); else st_a(a.val, 0, ptr); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a)                          \
+{ vec_st_l8(a.val, ptr); }                                                  \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a)                         \
+{ vec_st_h8(a.val, ptr); } // ld/st serve the unaligned entry points, ld_a/st_a the aligned ones; v_store_aligned_nocache is identical to v_store_aligned here
+
+// working around a gcc bug for aligned ld/st:
+// if the runtime check for vec_ld/st fails we fall back to unaligned ld/st
+// https://github.com/opencv/opencv/issues/13211
+#ifdef CV_COMPILER_VSX_BROKEN_ALIGNED
+    #define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
+    OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vsx_ld, vsx_st, vsx_st)
+#else
+    #define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
+    OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vec_ld, vsx_st, vec_st)
+#endif
+
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint8x16,  uchar)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int8x16,   schar)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint16x8,  ushort)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int16x8,   short)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint32x4,  uint)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int32x4,   int)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_float32x4, float)
+
+OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_float64x2, double, vsx_ld,  vsx_ld,  vsx_st,  vsx_st) // 64-bit lane types: the same vsx_* accessor is passed for aligned and unaligned use
+OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_uint64x2,  uint64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
+OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_int64x2,    int64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
+
+//////////////// Value reordering ///////////////
+
+/* deinterleaving loads and interleaving stores for 2, 3 and 4 channels */
+#define OPENCV_HAL_IMPL_VSX_INTERLEAVE(_Tp, _Tpvec)                          \
+inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b)        \
+{ vec_ld_deinterleave(ptr, a.val, b.val);}                                   \
+inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a,                   \
+                                _Tpvec& b, _Tpvec& c)                        \
+{ vec_ld_deinterleave(ptr, a.val, b.val, c.val); }                           \
+inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b,        \
+                                                _Tpvec& c, _Tpvec& d)        \
+{ vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); }                    \
+inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b,   \
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ vec_st_interleave(a.val, b.val, ptr); }                                    \
+inline void v_store_interleave(_Tp* ptr, const _Tpvec& a,                    \
+                               const _Tpvec& b, const _Tpvec& c,             \
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ vec_st_interleave(a.val, b.val, c.val, ptr); }                             \
+inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b,   \
+                                         const _Tpvec& c, const _Tpvec& d,   \
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ vec_st_interleave(a.val, b.val, c.val, d.val, ptr); } // the StoreMode parameter is unnamed and ignored by all three store overloads
+
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(schar, v_int8x16)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(ushort, v_uint16x8)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(short, v_int16x8)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint, v_uint32x4)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(int, v_int32x4)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(float, v_float32x4)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(double, v_float64x2)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(int64, v_int64x2)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint64, v_uint64x2)
+
+/* Expand: widen each lane to the next larger lane type */
+#define OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh)  \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1)   \
+{                                                                 \
+    b0.val = fh(a.val);                                           \
+    b1.val = fl(a.val);                                           \
+}                                                                 \
+inline _Tpwvec v_expand_low(const _Tpvec& a)                      \
+{ return _Tpwvec(fh(a.val)); }                                    \
+inline _Tpwvec v_expand_high(const _Tpvec& a)                     \
+{ return _Tpwvec(fl(a.val)); }                                    \
+inline _Tpwvec v_load_expand(const _Tp* ptr)                      \
+{ return _Tpwvec(fh(vec_ld_l8(ptr))); } // note: the "low" results use fh and the "high" results use fl - presumably the element-order convention of the vec_unpack* helpers; confirm
+
+OPENCV_HAL_IMPL_VSX_EXPAND(v_uint8x16, v_uint16x8, uchar, vec_unpacklu, vec_unpackhu)
+OPENCV_HAL_IMPL_VSX_EXPAND(v_int8x16, v_int16x8, schar, vec_unpackl, vec_unpackh)
+OPENCV_HAL_IMPL_VSX_EXPAND(v_uint16x8, v_uint32x4, ushort, vec_unpacklu, vec_unpackhu)
+OPENCV_HAL_IMPL_VSX_EXPAND(v_int16x8, v_int32x4, short, vec_unpackl, vec_unpackh)
+OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpackhu)
+OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh)
+
+/* Load a 4-byte value and zero-extend it into the second dword; the contents of the first dword are don't-care. */
+#if !defined(CV_COMPILER_VSX_BROKEN_ASM)
+    #define _LXSIWZX(out, ptr, T) __asm__ ("lxsiwzx %x0, 0, %1\r\n" : "=wa"(out) : "r" (ptr) : "memory");
+#else
+    /* Compiler-agnostic fallback for toolchains flagged with CV_COMPILER_VSX_BROKEN_ASM; it introduces an unneeded splat on the critical path. */
+    #define _LXSIWZX(out, ptr, T) out = (T)vec_udword2_sp(*(uint32_t*)(ptr));
+#endif
+
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{
+    // Zero-extend each byte by the extra 24 bits with one permute instead of unpacking. Usually faster in small kernels.
+    // Note that the loaded value is zero extended, so the upper 4 bytes are zeroed.
+    vec_uchar16 pmu = {8, 12, 12, 12, 9, 12, 12, 12, 10, 12, 12, 12, 11, 12, 12, 12}; // per 32-bit lane: one of bytes 8..11, then byte 12 three times as filler
+    vec_uchar16 out;
+
+    _LXSIWZX(out, ptr, vec_uchar16); // reads 4 bytes from ptr
+    out = vec_perm(out, out, pmu);
+    return v_uint32x4((vec_uint4)out);
+}
+
+inline v_int32x4 v_load_expand_q(const schar* ptr) // loads 4 signed bytes and widens them to 4 int lanes
+{
+    vec_char16 out;
+    vec_short8 outs;
+    vec_int4 outw;
+
+    _LXSIWZX(out, ptr, vec_char16); // reads 4 bytes from ptr
+    outs = vec_unpackl(out);  // 8 -> 16 bits; presumably sign-extending, as this is the signed unpack variant - confirm
+    outw = vec_unpackh(outs); // 16 -> 32 bits
+    return v_int32x4(outw);
+}
+
+/* pack: defines v_<pack>, v_<pack>_store and their rounding-shift forms v_rshr_<pack><n>; pkfnc narrows, and the rshr forms first add 1 << (n-1) with addfnc and shift right by n with sfnc */
+#define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack)    \
+inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b)                                          \
+{                                                                                                   \
+    return _Tpvec(pkfnc(a.val, b.val));                                                             \
+}                                                                                                   \
+inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a)                                            \
+{                                                                                                   \
+    vec_st_l8(pkfnc(a.val, a.val), ptr);                                                            \
+}                                                                                                   \
+template<int n>                                                                                     \
+inline _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b)                                     \
+{                                                                                                   \
+    const __vector _Tpvn vn = vec_splats((_Tpvn)n);                                                 \
+    const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1)));                         \
+    return _Tpvec(pkfnc(sfnc(addfnc(a.val, delta), vn), sfnc(addfnc(b.val, delta), vn)));           \
+}                                                                                                   \
+template<int n>                                                                                     \
+inline void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a)                                       \
+{                                                                                                   \
+    const __vector _Tpvn vn = vec_splats((_Tpvn)n);                                                 \
+    const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1)));                         \
+    vec_st_l8(pkfnc(sfnc(addfnc(a.val, delta), vn), delta), ptr);                                   \
+}
+
+OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_uint16x8, unsigned short, unsigned short,
+                         vec_sr, vec_packs, vec_adds, pack)  // v_uint16x8 -> v_uint8x16
+OPENCV_HAL_IMPL_VSX_PACK(v_int8x16, schar, v_int16x8, unsigned short, short,
+                         vec_sra, vec_packs, vec_adds, pack)  // v_int16x8 -> v_int8x16
+
+OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_uint32x4, unsigned int, unsigned int,
+                         vec_sr, vec_packs, vec_add, pack)  // v_uint32x4 -> v_uint16x8
+OPENCV_HAL_IMPL_VSX_PACK(v_int16x8, short, v_int32x4, unsigned int, int,
+                         vec_sra, vec_packs, vec_add, pack)  // v_int32x4 -> v_int16x8
+
+OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_uint64x2, unsigned long long, unsigned long long,
+                         vec_sr, vec_pack, vec_add, pack)  // v_uint64x2 -> v_uint32x4, uses vec_pack rather than vec_packs
+OPENCV_HAL_IMPL_VSX_PACK(v_int32x4, int, v_int64x2, unsigned long long, long long,
+                         vec_sra, vec_pack, vec_add, pack)  // v_int64x2 -> v_int32x4, uses vec_pack rather than vec_packs
+
+OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_int16x8, unsigned short, short,
+                         vec_sra, vec_packsu, vec_adds, pack_u)  // v_pack_u: signed v_int16x8 -> v_uint8x16
+OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int,
+                         vec_sra, vec_packsu, vec_add, pack_u)  // v_pack_u: signed v_int32x4 -> v_uint16x8
+// The following variant is not implemented on other platforms, so it is left disabled here:
+//OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long,
+//                         vec_sra, vec_packsu, vec_add, pack_u)
+
+// Pack boolean masks: two vectors of 16-bit lanes become one vector of 8-bit lanes.
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{
+    // Uses vec_pack here, not the vec_packs that v_pack is built on.
+    return v_uint8x16(vec_pack(a.val, b.val));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    const vec_ushort8 ab16 = vec_pack(a.val, b.val);  // a, b as 16-bit lanes
+    const vec_ushort8 cd16 = vec_pack(c.val, d.val);  // c, d as 16-bit lanes
+    return v_uint8x16(vec_pack(ab16, cd16));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    const vec_uint4 ab32 = vec_pack(a.val, b.val);  // first stage: 64-bit lanes -> 32-bit lanes
+    const vec_uint4 cd32 = vec_pack(c.val, d.val);
+    const vec_uint4 ef32 = vec_pack(e.val, f.val);
+    const vec_uint4 gh32 = vec_pack(g.val, h.val);
+
+    const vec_ushort8 abcd16 = vec_pack(ab32, cd32);  // second stage: 32-bit lanes -> 16-bit lanes
+    const vec_ushort8 efgh16 = vec_pack(ef32, gh32);
+    return v_uint8x16(vec_pack(abcd16, efgh16));  // final stage: 16-bit lanes -> 8-bit lanes
+}
+
+/* Recombine: interleave or regroup two vectors of the same type through the vec_merge* helpers */
+template <typename _Tpvec>
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
+{
+    b0.val = vec_mergeh(a0.val, a1.val);  // note: b0 is stored before a0/a1 are read again for b1
+    b1.val = vec_mergel(a0.val, a1.val);
+}
+// v_combine_high(a, b): returns vec_mergesql(a, b).
+template <typename _Tpvec>
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)
+{ return _Tpvec(vec_mergesql(a.val, b.val)); }
+// v_combine_low(a, b): returns vec_mergesqh(a, b).
+template <typename _Tpvec>
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)
+{ return _Tpvec(vec_mergesqh(a.val, b.val)); }
+// v_recombine(a, b, c, d): c gets the vec_mergesqh result, d the vec_mergesql result.
+template <typename _Tpvec>
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d)
+{
+    c.val = vec_mergesqh(a.val, b.val);  // c is stored before a/b are read again for d
+    d.val = vec_mergesql(a.val, b.val);
+}
+
+////////// Arithmetic, bitwise and comparison operations /////////
+
+/* Element-wise binary and unary operations */
+/** Arithmetics: OPENCV_HAL_IMPL_VSX_BIN_OP(op, type, intrin) defines operator op and operator op= for type by applying intrin to the .val members **/
+#define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin)       \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(intrin(a.val, b.val)); }                         \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)   \
+{ a.val = intrin(a.val, b.val); return a; }
+// 8- and 16-bit lanes map + and - to vec_adds / vec_subs; wider and floating-point lanes use vec_add / vec_sub.
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16,  vec_adds)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float32x4, vec_mul)
+OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul)
+OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add)  // 64-bit integer lanes get + and - only in this list
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)
+
+// saturating multiply: operator* widens the products with v_mul_expand and narrows them back with v_pack; operator*= forwards to operator*
+#define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec)             \
+    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
+    {                                                            \
+        _Tpwvec c, d;                                            \
+        v_mul_expand(a, b, c, d);                                \
+        return v_pack(c, d);                                     \
+    }                                                            \
+    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
+    { a = a * b; return a; }
+// Instantiated for the 8- and 16-bit lane types, each paired with its double-width vector type.
+OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16,  v_int16x8)
+OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8)
+OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int16x8,  v_int32x4)
+OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint16x8, v_uint32x4)
+
+template<typename Tvec, typename Twvec>
+inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d)
+{
+    const Twvec prod_e(vec_mule(a.val, b.val));  // widened products from vec_mule
+    const Twvec prod_o(vec_mulo(a.val, b.val));  // widened products from vec_mulo
+    v_zip(prod_e, prod_o, c, d);                 // interleave both sets into c and d
+}
+
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{
+    const vec_int4 wide_e = vec_mule(a.val, b.val);  // 32-bit products from vec_mule
+    const vec_int4 wide_o = vec_mulo(a.val, b.val);  // 32-bit products from vec_mulo
+    static const vec_uchar16 half_sel = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
+    return v_int16x8(vec_perm(vec_short8_c(wide_e), vec_short8_c(wide_o), half_sel));  // bytes 2-3 of every product
+}
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{
+    const vec_uint4 wide_e = vec_mule(a.val, b.val);  // unsigned overload, same scheme as above
+    const vec_uint4 wide_o = vec_mulo(a.val, b.val);
+    static const vec_uchar16 half_sel = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
+    return v_uint16x8(vec_perm(vec_ushort8_c(wide_e), vec_ushort8_c(wide_o), half_sel));
+}
+
+/** Non-saturating arithmetics: OPENCV_HAL_IMPL_VSX_BIN_FUNC(func, intrin) defines a template func(a, b) that applies intrin to the .val members of any vector type **/
+#define OPENCV_HAL_IMPL_VSX_BIN_FUNC(func, intrin)    \
+template<typename _Tpvec>                             \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b)  \
+{ return _Tpvec(intrin(a.val, b.val)); }
+// The *_wrap functions use vec_add / vec_sub / vec_mul, not the vec_adds / vec_subs behind the 8/16-bit operators.
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul)
+
+/** Bitwise shifts: operator<< and operator>> take a run-time count, v_shl<imm> and v_shr<imm> a compile-time one; the count is splatted with splfunc and the right shift uses shr **/
+#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc)   \
+inline _Tpvec operator << (const _Tpvec& a, int imm)         \
+{ return _Tpvec(vec_sl(a.val, splfunc(imm))); }              \
+inline _Tpvec operator >> (const _Tpvec& a, int imm)         \
+{ return _Tpvec(shr(a.val, splfunc(imm))); }                 \
+template<int imm> inline _Tpvec v_shl(const _Tpvec& a)       \
+{ return _Tpvec(vec_sl(a.val, splfunc(imm))); }              \
+template<int imm> inline _Tpvec v_shr(const _Tpvec& a)       \
+{ return _Tpvec(shr(a.val, splfunc(imm))); }
+// Unsigned lane types shift right with vec_sr.
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_sr, vec_uchar16_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_sr, vec_ushort8_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_sr, vec_uint4_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_sr, vec_udword2_sp)
+// algebraic right shift: signed lane types use vec_sra; the shift count is still splatted as an unsigned vector
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_sra, vec_uchar16_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_sra, vec_ushort8_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_sra, vec_uint4_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)
+
+/** Bitwise logic: defines &, | and ^ (plus their compound forms, through OPENCV_HAL_IMPL_VSX_BIN_OP) and unary ~ via vec_not **/
+#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec)    \
+OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and)  \
+OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or)   \
+OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor)  \
+inline _Tpvec operator ~ (const _Tpvec& a)      \
+{ return _Tpvec(vec_not(a.val)); }
+// Instantiated for every integer and floating-point vector type.
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int8x16)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint16x8)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int16x8)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint32x4)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int32x4)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint64x2)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int64x2)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float32x4)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float64x2)
+
+/** Bitwise select: v_select(mask, a, b) forwards to vec_sel(b, a, mask) after converting mask to the matching bool vector type with `cast` **/
+#define OPENCV_HAL_IMPL_VSX_SELECT(_Tpvec, cast)                             \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(vec_sel(b.val, a.val, cast(mask.val))); }
+// Not instantiated for the 64-bit integer vector types in this list.
+OPENCV_HAL_IMPL_VSX_SELECT(v_uint8x16, vec_bchar16_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_int8x16, vec_bchar16_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_uint16x8, vec_bshort8_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_int16x8, vec_bshort8_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_uint32x4, vec_bint4_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_int32x4, vec_bint4_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_float32x4, vec_bint4_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c)
+
+/** Comparison: each operator wraps the matching vec_cmp* result in _Tpvec, so the result has the same vector type as the operands **/
+#define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec)                 \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b)   \
+{ return _Tpvec(vec_cmpeq(a.val, b.val)); }                    \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)   \
+{ return _Tpvec(vec_cmpne(a.val, b.val)); }                    \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b)    \
+{ return _Tpvec(vec_cmplt(a.val, b.val)); }                    \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b)    \
+{ return _Tpvec(vec_cmpgt(a.val, b.val)); }                    \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b)   \
+{ return _Tpvec(vec_cmple(a.val, b.val)); }                    \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b)   \
+{ return _Tpvec(vec_cmpge(a.val, b.val)); }
+// Despite the INT in its name, the macro is instantiated for the floating-point vector types as well.
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int8x16)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint16x8)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int16x8)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint32x4)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int32x4)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float32x4)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float64x2)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint64x2)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
+
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ return v_float32x4(vec_cmpeq(a.val, a.val)); }  // compares a with itself: a NaN lane is the only one that fails a == a
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ return v_float64x2(vec_cmpeq(a.val, a.val)); }  // double-precision overload, same self-comparison
+
+/** min/max: template v_min / v_max for any vector type, forwarding to vec_min / vec_max **/
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
+
+/** Rotate: single-register v_rotate_left<imm> / v_rotate_right<imm>; wd is imm lanes in bytes, 16 bytes or more returns zero, otherwise shf receives wd << 3 splatted across a byte vector **/
+#define OPENCV_IMPL_VSX_ROTATE(_Tpvec, suffix, shf, cast)                       \
+template<int imm>                                                               \
+inline _Tpvec v_rotate_##suffix(const _Tpvec& a)                                \
+{                                                                               \
+    const int wd = imm * sizeof(typename _Tpvec::lane_type);                    \
+    if (wd > 15)                                                                \
+        return _Tpvec::zero();                                                  \
+    return _Tpvec((cast)shf(vec_uchar16_c(a.val), vec_uchar16_sp(wd << 3)));    \
+}
+// OPENCV_IMPL_VSX_ROTATE_LR emits both directions: left through vec_slo, right through vec_sro.
+#define OPENCV_IMPL_VSX_ROTATE_LR(_Tpvec, cast)     \
+OPENCV_IMPL_VSX_ROTATE(_Tpvec, left, vec_slo, cast) \
+OPENCV_IMPL_VSX_ROTATE(_Tpvec, right, vec_sro, cast)
+// The second argument is the raw vector type the shifted bytes are cast back to.
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint8x16, vec_uchar16)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int8x16,  vec_char16)
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8,  vec_short8)
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4,  vec_int4)
+OPENCV_IMPL_VSX_ROTATE_LR(v_float32x4, vec_float4)
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2,  vec_dword2)
+OPENCV_IMPL_VSX_ROTATE_LR(v_float64x2, vec_double2)
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
+{
+    enum { CV_SHIFT = 16 - imm * (sizeof(typename _Tpvec::lane_type)) };  // byte offset handed to vec_sld for a rotate by imm lanes
+    if (CV_SHIFT == 16)  // imm == 0: a is returned untouched
+        return a;
+#ifdef __IBMCPP__
+    return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT & 15));  // this compiler gets the offset masked to 0..15
+#else
+    return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT));
+#endif
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
+{
+    enum { CV_SHIFT = imm * (sizeof(typename _Tpvec::lane_type)) };  // rotate distance in bytes
+    if (CV_SHIFT == 16)  // a whole register: the result is exactly b
+        return b;
+    return _Tpvec(vec_sld(a.val, b.val, CV_SHIFT));  // otherwise a and b are combined by vec_sld at that byte offset
+}
+// Two-register rotate for 64-bit lanes: imm == 1 combines rg1 and rg2 with vec_permi(.., 2); imm == 0 returns a and any other imm returns b.
+#define OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, suffix, rg1, rg2)   \
+template<int imm>                                                 \
+inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
+{                                                                 \
+    if (imm == 1)                                                 \
+        return _Tpvec(vec_permi(rg1.val, rg2.val, 2));            \
+    return imm ? b : a;                                           \
+}
+// The left and right variants differ only in the order the two registers are handed to vec_permi.
+#define OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(_Tpvec)    \
+OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, left,  b, a)  \
+OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, right, a, b)
+// Instantiated for the three vector types with 64-bit lanes.
+OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_float64x2)
+OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_uint64x2)
+OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2)
+
+/* Reverse: lane order is flipped with one byte permute; signed and float overloads reuse the unsigned ones */
+inline v_uint8x16 v_reverse(const v_uint8x16 &a)
+{
+    static const vec_uchar16 flip_u8 = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+    const vec_uchar16 bytes = (vec_uchar16)a.val;
+    return v_uint8x16(vec_perm(bytes, bytes, flip_u8));
+}
+
+inline v_int8x16 v_reverse(const v_int8x16 &a)
+{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
+
+inline v_uint16x8 v_reverse(const v_uint16x8 &a)
+{
+    static const vec_uchar16 flip_u16 = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
+    const vec_uchar16 bytes = (vec_uchar16)a.val;
+    return v_reinterpret_as_u16(v_uint8x16(vec_perm(bytes, bytes, flip_u16)));
+}
+
+inline v_int16x8 v_reverse(const v_int16x8 &a)
+{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
+
+inline v_uint32x4 v_reverse(const v_uint32x4 &a)
+{
+    static const vec_uchar16 flip_u32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+    const vec_uchar16 bytes = (vec_uchar16)a.val;
+    return v_reinterpret_as_u32(v_uint8x16(vec_perm(bytes, bytes, flip_u32)));
+}
+
+inline v_int32x4 v_reverse(const v_int32x4 &a)
+{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_float32x4 v_reverse(const v_float32x4 &a)
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_uint64x2 v_reverse(const v_uint64x2 &a)
+{
+    static const vec_uchar16 flip_u64 = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
+    const vec_uchar16 bytes = (vec_uchar16)a.val;
+    return v_reinterpret_as_u64(v_uint8x16(vec_perm(bytes, bytes, flip_u64)));
+}
+
+inline v_int64x2 v_reverse(const v_int64x2 &a)
+{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
+
+inline v_float64x2 v_reverse(const v_float64x2 &a)
+{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
+
+/* Extract: v_extract<s>(a, b) is an alias for the two-register v_rotate_right<s>(a, b) */
+template<int s, typename _Tpvec>
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
+{ return v_rotate_right<s>(a, b); }
+
+////////// Reduce and mask /////////
+
+/** Reduce: horizontal sums of 8- and 16-bit lanes, built on vec_sum4s and vec_sums **/
+inline uint v_reduce_sum(const v_uint8x16& a)
+{
+    const vec_uint4 no_bias = vec_uint4_z;
+    const vec_uint4 word_sums = vec_sum4s(a.val, no_bias);  // partial sums, one per 32-bit word
+    return (uint)vec_extract(vec_sums(vec_int4_c(word_sums), vec_int4_c(no_bias)), 3);
+}
+inline int v_reduce_sum(const v_int8x16& a)
+{
+    const vec_int4 no_bias = vec_int4_z;
+    const vec_int4 word_sums = vec_sum4s(a.val, no_bias);  // partial sums, one per 32-bit word
+    return (int)vec_extract(vec_sums(word_sums, no_bias), 3);
+}
+inline int v_reduce_sum(const v_int16x8& a)
+{
+    const vec_int4 no_bias = vec_int4_z;  // zero accumulator for both summing steps
+    return saturate_cast<int>(vec_extract(vec_sums(vec_sum4s(a.val, no_bias), no_bias), 3));
+}
+inline uint v_reduce_sum(const v_uint16x8& a)
+{
+    const vec_int4 folded = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));  // vec_adds of the register with itself rotated by 8 bytes, then widened
+    return saturate_cast<uint>(vec_extract(vec_sums(folded, vec_int4_z), 3));
+}
+// Reduction over four 32-bit lanes: apply func with the register rotated by 8 bytes, then by 4 bytes, and read lane 0.
+#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
+inline scalartype v_reduce_##suffix(const _Tpvec& a)                               \
+{                                                                                  \
+    const _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8));                      \
+    return vec_extract(func(rs, vec_sld(rs, rs, 4)), 0);                           \
+}
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, sum, vec_add)  // defines v_reduce_sum / v_reduce_max / v_reduce_min per type
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, min, vec_min)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, sum, vec_add)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, min, vec_min)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)
+
+inline uint64 v_reduce_sum(const v_uint64x2& a)
+{
+    return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);  // add the register to its vec_permi(.., 3) rearrangement, read lane 0
+}
+inline int64 v_reduce_sum(const v_int64x2& a)
+{
+    return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);  // signed overload, same expression
+}
+inline double v_reduce_sum(const v_float64x2& a)
+{
+    return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);  // double overload, same expression
+}
+// Reduction over eight 16-bit lanes: apply func with the register rotated by 8, 4 and then 2 bytes, and read lane 0.
+#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(_Tpvec, _Tpvec2, scalartype, suffix, func) \
+inline scalartype v_reduce_##suffix(const _Tpvec& a)                               \
+{                                                                                  \
+    _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8));                            \
+    rs = func(rs, vec_sld(rs, rs, 4));                                             \
+    return vec_extract(func(rs, vec_sld(rs, rs, 2)), 0);                           \
+}
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, max, vec_max)  // only max and min are generated for 16-bit lanes
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, min, vec_min)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, min, vec_min)
+// Reduction over sixteen 8-bit lanes: apply func with the register rotated by 8, 4, 2 and then 1 byte, and read lane 0.
+#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(_Tpvec, _Tpvec2, scalartype, suffix, func) \
+inline scalartype v_reduce_##suffix(const _Tpvec& a)                               \
+{                                                                                  \
+    _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8));                            \
+    rs = func(rs, vec_sld(rs, rs, 4));                                             \
+    rs = func(rs, vec_sld(rs, rs, 2));                                             \
+    return vec_extract(func(rs, vec_sld(rs, rs, 1)), 0);                           \
+}
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, max, vec_max)  // only max and min are generated for 8-bit lanes
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, min, vec_min)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, min, vec_min)
+
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+    // a and c are reduced together in one register, b and d in another; the last merge pairs them up.
+    const vec_float4 ac_pairs = vec_add(vec_mergel(a.val, c.val), vec_mergeh(a.val, c.val));
+    const vec_float4 ac_sums = vec_add(ac_pairs, vec_sld(ac_pairs, ac_pairs, 8));
+    const vec_float4 bd_pairs = vec_add(vec_mergel(b.val, d.val), vec_mergeh(b.val, d.val));
+    const vec_float4 bd_sums = vec_add(bd_pairs, vec_sld(bd_pairs, bd_pairs, 8));
+    return v_float32x4(vec_mergeh(ac_sums, bd_sums));
+}
+
+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{
+    const vec_uint4 no_bias = vec_uint4_z;
+    const vec_uint4 word_sums = vec_sum4s(vec_absd(a.val, b.val), no_bias);  // |a - b| summed per 32-bit word
+    return (unsigned)vec_extract(vec_sums(vec_int4_c(word_sums), vec_int4_c(no_bias)), 3);
+}
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{
+    const vec_int4 no_bias = vec_int4_z;
+    const vec_char16 abs_diff = vec_abss(vec_subs(a.val, b.val));
+    const vec_int4 word_sums = vec_sum4s(abs_diff, no_bias);
+    return (unsigned)vec_extract(vec_sums(word_sums, no_bias), 3);
+}
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{
+    const vec_ushort8 abs_diff = vec_absd(a.val, b.val);
+    VSX_UNUSED(vec_int4) total = vec_sums(vec_int4_c(vec_unpackhu(abs_diff)) + vec_int4_c(vec_unpacklu(abs_diff)), vec_int4_z);
+    return (unsigned)vec_extract(total, 3);
+}
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{
+    const vec_int4 no_bias = vec_int4_z;
+    const vec_short8 abs_diff = vec_abss(vec_subs(a.val, b.val));
+    const vec_int4 word_sums = vec_sum4s(abs_diff, no_bias);
+    return (unsigned)vec_extract(vec_sums(word_sums, no_bias), 3);
+}
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{
+    const vec_uint4 abs_diff = vec_absd(a.val, b.val);
+    const vec_uint4 half_sums = vec_add(abs_diff, vec_sld(abs_diff, abs_diff, 8));  // fold by 8 bytes, then by 4
+    return vec_extract(vec_add(half_sums, vec_sld(half_sums, half_sums, 4)), 0);
+}
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{
+    const vec_int4 abs_diff = vec_abss(vec_sub(a.val, b.val));
+    return (unsigned)vec_extract(vec_sums(abs_diff, vec_int4_z), 3);
+}
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{
+    const vec_float4 abs_diff = vec_abs(vec_sub(a.val, b.val));
+    const vec_float4 half_sums = vec_add(abs_diff, vec_sld(abs_diff, abs_diff, 8));  // fold by 8 bytes, then by 4
+    return vec_extract(vec_add(half_sums, vec_sld(half_sums, half_sums, 4)), 0);
+}
+
+/** Popcount: every overload forwards to vec_popcntu; signed inputs return the unsigned vector type of the same lane width **/
+inline v_uint8x16 v_popcount(const v_uint8x16& a)
+{ return v_uint8x16(vec_popcntu(a.val)); }
+inline v_uint8x16 v_popcount(const v_int8x16& a)
+{ return v_uint8x16(vec_popcntu(a.val)); }
+inline v_uint16x8 v_popcount(const v_uint16x8& a)
+{ return v_uint16x8(vec_popcntu(a.val)); }
+inline v_uint16x8 v_popcount(const v_int16x8& a)
+{ return v_uint16x8(vec_popcntu(a.val)); }
+inline v_uint32x4 v_popcount(const v_uint32x4& a)
+{ return v_uint32x4(vec_popcntu(a.val)); }
+inline v_uint32x4 v_popcount(const v_int32x4& a)
+{ return v_uint32x4(vec_popcntu(a.val)); }
+inline v_uint64x2 v_popcount(const v_uint64x2& a)
+{ return v_uint64x2(vec_popcntu(a.val)); }
+inline v_uint64x2 v_popcount(const v_int64x2& a)
+{ return v_uint64x2(vec_popcntu(a.val)); }
+
+/** Mask: v_signmask gathers bits picked by a vec_vbpermq selector into an int; other lane types reuse these overloads **/
+inline int v_signmask(const v_uint8x16& a)
+{
+    static const vec_uchar16 bit_sel = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0};
+    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, bit_sel), 2);
+}
+inline int v_signmask(const v_int8x16& a)
+{ return v_signmask(v_reinterpret_as_u8(a)); }
+
+inline int v_signmask(const v_int16x8& a)
+{
+    static const vec_uchar16 bit_sel = {112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
+    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, bit_sel), 2);
+}
+inline int v_signmask(const v_uint16x8& a)
+{ return v_signmask(v_reinterpret_as_s16(a)); }
+
+inline int v_signmask(const v_int32x4& a)
+{
+    static const vec_uchar16 bit_sel = {96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
+    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, bit_sel), 2);
+}
+inline int v_signmask(const v_uint32x4& a)
+{ return v_signmask(v_reinterpret_as_s32(a)); }
+inline int v_signmask(const v_float32x4& a)
+{ return v_signmask(v_reinterpret_as_s32(a)); }
+
+inline int v_signmask(const v_int64x2& a)
+{
+    VSX_UNUSED(const vec_dword2) top_bits = vec_sr(a.val, vec_udword2_sp(63));  // each lane shifted right by 63
+    return (int)vec_extract(top_bits, 0) | ((int)vec_extract(top_bits, 1) << 1);
+}
+inline int v_signmask(const v_uint64x2& a)
+{ return v_signmask(v_reinterpret_as_s64(a)); }
+inline int v_signmask(const v_float64x2& a)
+{ return v_signmask(v_reinterpret_as_s64(a)); }
+// v_scan_forward: number of trailing zero bits in v_signmask(a), computed with trailingZeros32; one overload per vector type.
+inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
+// v_check_all: true when every lane compares below zero (vec_all_lt); unsigned and float inputs are first reinterpreted as signed lanes of the same width.
+template<typename _Tpvec>
+inline bool v_check_all(const _Tpvec& a)
+{ return vec_all_lt(a.val, _Tpvec::zero().val); }
+inline bool v_check_all(const v_uint8x16& a)
+{ return v_check_all(v_reinterpret_as_s8(a)); }
+inline bool v_check_all(const v_uint16x8& a)
+{ return v_check_all(v_reinterpret_as_s16(a)); }
+inline bool v_check_all(const v_uint32x4& a)
+{ return v_check_all(v_reinterpret_as_s32(a)); }
+inline bool v_check_all(const v_uint64x2& a)
+{ return v_check_all(v_reinterpret_as_s64(a)); }
+inline bool v_check_all(const v_float32x4& a)
+{ return v_check_all(v_reinterpret_as_s32(a)); }
+inline bool v_check_all(const v_float64x2& a)
+{ return v_check_all(v_reinterpret_as_s64(a)); }
+// v_check_any: true when at least one lane compares below zero (vec_any_lt); unsigned and float inputs are first reinterpreted as signed lanes of the same width.
+template<typename _Tpvec>
+inline bool v_check_any(const _Tpvec& a)
+{ return vec_any_lt(a.val, _Tpvec::zero().val); }
+inline bool v_check_any(const v_uint8x16& a)
+{ return v_check_any(v_reinterpret_as_s8(a)); }
+inline bool v_check_any(const v_uint16x8& a)
+{ return v_check_any(v_reinterpret_as_s16(a)); }
+inline bool v_check_any(const v_uint32x4& a)
+{ return v_check_any(v_reinterpret_as_s32(a)); }
+inline bool v_check_any(const v_uint64x2& a)
+{ return v_check_any(v_reinterpret_as_s64(a)); }
+inline bool v_check_any(const v_float32x4& a)
+{ return v_check_any(v_reinterpret_as_s32(a)); }
+inline bool v_check_any(const v_float64x2& a)
+{ return v_check_any(v_reinterpret_as_s64(a)); }
+
+////////// Other math /////////
+
+/** Some frequent operations: v_sqrt forwards to vec_sqrt and v_invsqrt to vec_rsqrt, for float and double vectors **/
+inline v_float32x4 v_sqrt(const v_float32x4& x)
+{ return v_float32x4(vec_sqrt(x.val)); }
+inline v_float64x2 v_sqrt(const v_float64x2& x)
+{ return v_float64x2(vec_sqrt(x.val)); }
+// v_invsqrt: thin wrappers over vec_rsqrt.
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{ return v_float32x4(vec_rsqrt(x.val)); }
+inline v_float64x2 v_invsqrt(const v_float64x2& x)
+{ return v_float64x2(vec_rsqrt(x.val)); }
+// Multiply-add helpers built on vec_madd: v_magnitude = sqrt(a*a + b*b), v_sqr_magnitude = a*a + b*b, v_fma and v_muladd = a*b + c.
+#define OPENCV_HAL_IMPL_VSX_MULADD(_Tpvec)                                  \
+inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)                 \
+{ return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \
+inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b)             \
+{ return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); }           \
+inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)      \
+{ return _Tpvec(vec_madd(a.val, b.val, c.val)); }                           \
+inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)   \
+{ return _Tpvec(vec_madd(a.val, b.val, c.val)); }
+// Instantiated for the two floating-point vector types.
+OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
+OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)
+// Integer v_muladd: a multiply followed by an add through the v_int32x4 operators, not vec_madd.
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{ return a * b + c; }
+
+// TODO: exp, log, sin, cos
+
+/** Absolute values: vec_abs per lane; signed integer inputs are returned as the unsigned vector type of the same lane width **/
+inline v_uint8x16 v_abs(const v_int8x16& x)
+{ return v_uint8x16(vec_uchar16_c(vec_abs(x.val))); }
+// 16-bit lanes.
+inline v_uint16x8 v_abs(const v_int16x8& x)
+{ return v_uint16x8(vec_ushort8_c(vec_abs(x.val))); }
+// 32-bit lanes.
+inline v_uint32x4 v_abs(const v_int32x4& x)
+{ return v_uint32x4(vec_uint4_c(vec_abs(x.val))); }
+// Floating-point overloads keep their own type.
+inline v_float32x4 v_abs(const v_float32x4& x)
+{ return v_float32x4(vec_abs(x.val)); }
+
+inline v_float64x2 v_abs(const v_float64x2& x)
+{ return v_float64x2(vec_abs(x.val)); }
+
+/** Absolute difference: the template below forwards to vec_absd; signed and float inputs have the explicit overloads that follow **/
+// unsigned
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_absdiff, vec_absd)
+// Signed integers: max(a, b) - min(a, b), reinterpreted as the unsigned type of the same lane width.
+inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
+{ return v_reinterpret_as_u8(v_sub_wrap(v_max(a, b), v_min(a, b))); }
+inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
+{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
+inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
+{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); }
+// Floating point: v_abs of the plain difference.
+inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
+{ return v_abs(a - b); }
+inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
+{ return v_abs(a - b); }
+
+/** Absolute difference for signed integers: vec_abss of vec_subs, returned in the signed input type **/
+inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
+{ return v_int8x16(vec_abss(vec_subs(a.val, b.val))); }
+inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
+{ return v_int16x8(vec_abss(vec_subs(a.val, b.val))); }
+
+////////// Conversions /////////
+
+/** Rounding: float lanes to int32 (single-vector f64 overloads merge the result with a zero vector) **/
+inline v_int32x4 v_round(const v_float32x4& a)
+{ const vec_float4 rounded = vec_rint(a.val); return v_int32x4(vec_cts(rounded)); }
+
+inline v_int32x4 v_round(const v_float64x2& a)
+{ const vec_int4 conv = vec_ctso(vec_rint(a.val)); return v_int32x4(vec_mergesqo(conv, vec_int4_z)); }
+
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{ const vec_int4 conv_a = vec_ctso(vec_rint(a.val)), conv_b = vec_ctso(vec_rint(b.val)); return v_int32x4(vec_mergesqo(conv_a, conv_b)); }
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{ const vec_float4 floored = vec_floor(a.val); return v_int32x4(vec_cts(floored)); }
+
+inline v_int32x4 v_floor(const v_float64x2& a)
+{ const vec_int4 conv = vec_ctso(vec_floor(a.val)); return v_int32x4(vec_mergesqo(conv, vec_int4_z)); }
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{ const vec_float4 ceiled = vec_ceil(a.val); return v_int32x4(vec_cts(ceiled)); }
+
+inline v_int32x4 v_ceil(const v_float64x2& a)
+{ const vec_int4 conv = vec_ctso(vec_ceil(a.val)); return v_int32x4(vec_mergesqo(conv, vec_int4_z)); }
+
+inline v_int32x4 v_trunc(const v_float32x4& a)
+{ const vec_int4 conv = vec_cts(a.val); return v_int32x4(conv); }
+
+inline v_int32x4 v_trunc(const v_float64x2& a)
+{ const vec_int4 conv = vec_ctso(a.val); return v_int32x4(vec_mergesqo(conv, vec_int4_z)); }
+
+/** To float: v_cvt_f32 produces float32 lanes, v_cvt_f64 / v_cvt_f64_high produce float64 lanes **/
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{ const vec_float4 conv = vec_ctf(a.val); return v_float32x4(conv); }
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{ const vec_float4 conv = vec_cvfo(a.val); return v_float32x4(vec_mergesqo(conv, vec_float4_z)); }
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{ const vec_float4 conv_a = vec_cvfo(a.val), conv_b = vec_cvfo(b.val); return v_float32x4(vec_mergesqo(conv_a, conv_b)); }
+
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{ const vec_int4 dup_h = vec_mergeh(a.val, a.val); return v_float64x2(vec_ctdo(dup_h)); }
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{ const vec_int4 dup_l = vec_mergel(a.val, a.val); return v_float64x2(vec_ctdo(dup_l)); }
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{ const vec_float4 dup_h = vec_mergeh(a.val, a.val); return v_float64x2(vec_cvfo(dup_h)); }
+
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{ const vec_float4 dup_l = vec_mergel(a.val, a.val); return v_float64x2(vec_cvfo(dup_l)); }
+
+inline v_float64x2 v_cvt_f64(const v_int64x2& a)
+{ const vec_double2 conv = vec_ctd(a.val); return v_float64x2(conv); }
+
+////////////// Lookup table access ////////////////////
+
+inline v_int8x16 v_lut(const schar* tab, const int* idx) // gather: lane i = tab[idx[i]] for i = 0..15
+{
+    return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]],
+                     tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]);
+}
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) // reads idx[0..7]; each index supplies 2 consecutive bytes via a short-typed load
+{
+    return v_reinterpret_as_s8(v_int16x8(*(const short*)(tab+idx[0]), *(const short*)(tab+idx[1]), *(const short*)(tab+idx[2]), *(const short*)(tab+idx[3]),
+                                       *(const short*)(tab+idx[4]), *(const short*)(tab+idx[5]), *(const short*)(tab+idx[6]), *(const short*)(tab+idx[7])));
+}
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) // reads idx[0..3]; each index supplies 4 consecutive bytes via an int-typed load
+{
+    return v_reinterpret_as_s8(v_int32x4(*(const int*)(tab+idx[0]), *(const int*)(tab+idx[1]), *(const int*)(tab+idx[2]), *(const int*)(tab+idx[3])));
+}
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); } // unsigned variants reuse the signed gathers
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); }
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); }
+
+inline v_int16x8 v_lut(const short* tab, const int* idx) // gather: lane i = tab[idx[i]] for i = 0..7
+{
+    return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]);
+}
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) // reads idx[0..3]; each index supplies 2 consecutive shorts via an int-typed load
+{
+    return v_reinterpret_as_s16(v_int32x4(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
+}
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx) // reads idx[0..1]; each index supplies 4 consecutive shorts via an int64-typed load
+{
+    return v_reinterpret_as_s16(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1])));
+}
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short*)tab, idx)); } // unsigned variants reuse the signed gathers
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short*)tab, idx)); }
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short*)tab, idx)); }
+
+inline v_int32x4 v_lut(const int* tab, const int* idx) // gather: lane i = tab[idx[i]] for i = 0..3
+{
+    return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
+}
+inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) // reads idx[0..1]; each index supplies 2 consecutive ints via an int64-typed load
+{
+    return v_reinterpret_as_s32(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1])));
+}
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx) // reads only idx[0]; one vector load starting at tab + idx[0]
+{
+    return v_int32x4(vsx_ld(0, tab + idx[0]));
+}
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int*)tab, idx)); } // unsigned variants reuse the signed gathers
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int*)tab, idx)); }
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int*)tab, idx)); }
+
+inline v_int64x2 v_lut(const int64_t* tab, const int* idx) // gather: lane i = tab[idx[i]] for i = 0..1
+{
+    return v_int64x2(tab[idx[0]], tab[idx[1]]);
+}
+inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) // reads only idx[0]; one vector load starting at tab + idx[0]
+{
+    return v_int64x2(vsx_ld2(0, tab + idx[0]));
+}
+inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } // unsigned variants reuse the signed gathers
+inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
+
+inline v_float32x4 v_lut(const float* tab, const int* idx) // gather: lane i = tab[idx[i]] for i = 0..3
+{
+    return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
+}
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int*)tab, idx)); } // reuses the int pair gather on the same bytes
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_load(tab + *idx); } // reads only idx[0]
+
+inline v_float64x2 v_lut(const double* tab, const int* idx) // gather: lane i = tab[idx[i]] for i = 0..1
+{
+    return v_float64x2(tab[idx[0]], tab[idx[1]]);
+}
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_load(tab + *idx); } // reads only idx[0]
+
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+    // Pull each index lane out of the vector, then gather with scalar loads.
+    const int i0 = vec_extract(idxvec.val, 0);
+    const int i1 = vec_extract(idxvec.val, 1);
+    const int i2 = vec_extract(idxvec.val, 2);
+    const int i3 = vec_extract(idxvec.val, 3);
+    const int e0 = tab[i0], e1 = tab[i1], e2 = tab[i2], e3 = tab[i3];
+    return v_int32x4(e0, e1, e2, e3);
+}
+
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
+{
+    // Pull each index lane out of the vector, then gather with scalar loads.
+    const int i0 = vec_extract(idxvec.val, 0);
+    const int i1 = vec_extract(idxvec.val, 1);
+    const int i2 = vec_extract(idxvec.val, 2);
+    const int i3 = vec_extract(idxvec.val, 3);
+    const unsigned e0 = tab[i0], e1 = tab[i1], e2 = tab[i2], e3 = tab[i3];
+    return v_uint32x4(e0, e1, e2, e3);
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+    // Pull each index lane out of the vector, then gather with scalar loads.
+    const int i0 = vec_extract(idxvec.val, 0);
+    const int i1 = vec_extract(idxvec.val, 1);
+    const int i2 = vec_extract(idxvec.val, 2);
+    const int i3 = vec_extract(idxvec.val, 3);
+    const float e0 = tab[i0], e1 = tab[i1], e2 = tab[i2], e3 = tab[i3];
+    return v_float32x4(e0, e1, e2, e3);
+}
+
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    // Only index lanes 0 and 1 are read; they select the two double lanes.
+    const int i0 = vec_extract(idxvec.val, 0);
+    const int i1 = vec_extract(idxvec.val, 1);
+    const double e0 = tab[i0], e1 = tab[i1];
+    return v_float64x2(e0, e1);
+}
+
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+    const vec_float4 pair0 = vec_ld_l8(tab + vec_extract(idxvec.val, 0)); // (x, y) pair selected by index lane 0
+    const vec_float4 pair1 = vec_ld_l8(tab + vec_extract(idxvec.val, 1));
+    const vec_float4 pair2 = vec_ld_l8(tab + vec_extract(idxvec.val, 2));
+    const vec_float4 pair3 = vec_ld_l8(tab + vec_extract(idxvec.val, 3));
+    const vec_float4 pairs02 = vec_mergeh(pair0, pair2); // x0, x2, y0, y2
+    const vec_float4 pairs13 = vec_mergeh(pair1, pair3); // x1, x3, y1, y3
+    x.val = vec_mergeh(pairs02, pairs13);
+    y.val = vec_mergel(pairs02, pairs13);
+}
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    const vec_double2 pair0 = vsx_ld(vec_extract(idxvec.val, 0), tab); // (x, y) pair selected by index lane 0
+    const vec_double2 pair1 = vsx_ld(vec_extract(idxvec.val, 1), tab);
+    x.val = vec_mergeh(pair0, pair1);
+    y.val = vec_mergel(pair0, pair1);
+}
+
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
+{
+    static const vec_uchar16 byte_order = {0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 14, 13, 15}; // per 4 bytes: 0,2,1,3
+    return v_int8x16(vec_perm(vec.val, vec.val, byte_order));
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
+{ const v_int8x16 as_signed = v_reinterpret_as_s8(vec); return v_reinterpret_as_u8(v_interleave_pairs(as_signed)); }
+
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
+{
+    static const vec_uchar16 byte_order = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15}; // per 8 bytes: 0,4,1,5,2,6,3,7
+    return v_int8x16(vec_perm(vec.val, vec.val, byte_order));
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
+{ const v_int8x16 as_signed = v_reinterpret_as_s8(vec); return v_reinterpret_as_u8(v_interleave_quads(as_signed)); }
+
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
+{
+    static const vec_uchar16 byte_order = {0,1, 4,5, 2,3, 6,7, 8,9, 12,13, 10,11, 14,15}; // 16-bit elements: 0,2,1,3, 4,6,5,7
+    return v_int16x8(vec_perm(vec.val, vec.val, byte_order));
+}
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec)
+{ const v_int16x8 as_signed = v_reinterpret_as_s16(vec); return v_reinterpret_as_u16(v_interleave_pairs(as_signed)); }
+
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
+{
+    static const vec_uchar16 byte_order = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; // 16-bit elements: 0,4,1,5,2,6,3,7
+    return v_int16x8(vec_perm(vec.val, vec.val, byte_order));
+}
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec)
+{ const v_int16x8 as_signed = v_reinterpret_as_s16(vec); return v_reinterpret_as_u16(v_interleave_quads(as_signed)); }
+
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
+{
+    static const vec_uchar16 byte_order = {0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15}; // 32-bit elements: 0,2,1,3
+    return v_int32x4(vec_perm(vec.val, vec.val, byte_order));
+}
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec)
+{ const v_int32x4 as_signed = v_reinterpret_as_s32(vec); return v_reinterpret_as_u32(v_interleave_pairs(as_signed)); }
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
+{ const v_int32x4 as_int = v_reinterpret_as_s32(vec); return v_reinterpret_as_f32(v_interleave_pairs(as_int)); }
+
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{
+    static const vec_uchar16 byte_order = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 15, 15, 15}; // skips bytes 3, 7, 11; tail repeats byte 15
+    return v_int8x16(vec_perm(vec.val, vec.val, byte_order));
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
+{ const v_int8x16 as_signed = v_reinterpret_as_s8(vec); return v_reinterpret_as_u8(v_pack_triplets(as_signed)); }
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{
+    static const vec_uchar16 byte_order = {0,1, 2,3, 4,5, 8,9, 10,11, 12,13, 14,15, 14,15}; // 16-bit elements: 0,1,2,4,5,6,7,7
+    return v_int16x8(vec_perm(vec.val, vec.val, byte_order));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
+{ const v_int16x8 as_signed = v_reinterpret_as_s16(vec); return v_reinterpret_as_u16(v_pack_triplets(as_signed)); }
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec)
+{ return vec; } // 4-lane vectors are returned unchanged
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec)
+{ return vec; } // 4-lane vectors are returned unchanged
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec)
+{ return vec; } // 4-lane vectors are returned unchanged
+
+/////// FP16 support ////////
+
+inline v_float32x4 v_load_expand(const hfloat* ptr) // loads half-float values from ptr and widens them to a float32 vector
+{
+    vec_ushort8 vf16 = vec_ld_l8((const ushort*)ptr);
+#if CV_VSX3 && defined(vec_extract_fp_from_shorth) // path 1: compiler-provided conversion intrinsic
+    return v_float32x4(vec_extract_fp_from_shorth(vf16));
+#elif CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM) // path 2: inline asm using the xvcvhpsp instruction
+    vec_float4 vf32;
+    __asm__ __volatile__ ("xvcvhpsp %x0,%x1" : "=wa" (vf32) : "wa" (vec_mergeh(vf16, vf16)));
+    return v_float32x4(vf32);
+#else // path 3: fallback built from integer vector operations on the half-float bit fields
+    const vec_int4 z = vec_int4_z, delta = vec_int4_sp(0x38000000);
+    const vec_int4 signmask = vec_int4_sp(0x80000000);
+    const vec_int4 maxexp = vec_int4_sp(0x7c000000);
+    const vec_float4 deltaf = vec_float4_c(vec_int4_sp(0x38800000));
+
+    vec_int4 bits = vec_int4_c(vec_mergeh(vec_short8_c(z), vec_short8_c(vf16))); // pair each 16-bit value with a zero halfword to form 32-bit lanes
+    vec_int4 e = vec_and(bits, maxexp), sign = vec_and(bits, signmask);
+    vec_int4 t = vec_add(vec_sr(vec_xor(bits, sign), vec_uint4_sp(3)), delta); // ((h & 0x7fff) << 13) + delta
+    vec_int4 zt = vec_int4_c(vec_sub(vec_float4_c(vec_add(t, vec_int4_sp(1 << 23))), deltaf)); // alternative result, selected below where the exponent field is zero
+
+    t = vec_add(t, vec_and(delta, vec_cmpeq(maxexp, e))); // add delta once more where the exponent field is all ones
+    vec_bint4 zmask = vec_cmpeq(e, z);
+    vec_int4 ft = vec_sel(t, zt, zmask);
+    return v_float32x4(vec_float4_c(vec_or(ft, sign))); // put the sign bit back
+#endif
+}
+
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v) // narrows the float32 lanes of v to half floats and stores them at ptr
+{
+// fixme: Is there any builtin op or intrinsic that cover "xvcvsphp"?
+#if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM) // path 1: inline asm using the xvcvsphp instruction
+    vec_ushort8 vf16;
+    __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wa" (v.val));
+    vec_st_l8(vec_mergesqe(vf16, vf16), ptr);
+#else // path 2: fallback built from integer vector operations on the float32 bit patterns
+    const vec_int4 signmask = vec_int4_sp(0x80000000);
+    const vec_int4 rval = vec_int4_sp(0x3f000000);
+
+    vec_int4 t = vec_int4_c(v.val);
+    vec_int4 sign = vec_sra(vec_and(t, signmask), vec_uint4_sp(16)); // sign bit moved down by 16 positions
+    t = vec_and(vec_nor(signmask, signmask), t); // clear the sign bit
+
+    vec_bint4 finitemask = vec_cmpgt(vec_int4_sp(0x47800000), t); // magnitude bits below 0x47800000
+    vec_bint4 isnan = vec_cmpgt(t, vec_int4_sp(0x7f800000)); // magnitude bits above 0x7f800000
+    vec_int4 naninf = vec_sel(vec_int4_sp(0x7c00), vec_int4_sp(0x7e00), isnan); // 0x7e00 where isnan, otherwise 0x7c00
+    vec_bint4 tinymask = vec_cmpgt(vec_int4_sp(0x38800000), t); // magnitude bits below 0x38800000
+    vec_int4 tt = vec_int4_c(vec_add(vec_float4_c(t), vec_float4_c(rval)));
+    tt = vec_sub(tt, rval);
+    vec_int4 odd = vec_and(vec_sr(t, vec_uint4_sp(13)), vec_int4_sp(1)); // bit 13 of t
+    vec_int4 nt = vec_add(t, vec_int4_sp(0xc8000fff));
+    nt = vec_sr(vec_add(nt, odd), vec_uint4_sp(13));
+    t = vec_sel(nt, tt, tinymask); // tt where tinymask is set, nt elsewhere
+    t = vec_sel(naninf, t, finitemask); // keep t where finitemask is set, naninf elsewhere
+    t = vec_or(t, sign);
+    vec_st_l8(vec_packs(t, t), ptr);
+#endif
+}
+
+inline void v_cleanup() {} // no-op: the body is empty
+
+
+/** Reinterpret **/
+/** These are defined further up, together with the load and store operations **/
+
+////////// Matrix operations /////////
+
+//////// Dot Product ////////
+// 16 >> 32 (vec_msum multiply-sum, optionally seeded with accumulator c)
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{ const vec_int4 sums = vec_msum(a.val, b.val, vec_int4_z); return v_int32x4(sums); }
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{ const vec_int4 sums = vec_msum(a.val, b.val, c.val); return v_int32x4(sums); }
+
+// 32 >> 64 (even-lane products plus odd-lane products)
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
+{
+    const vec_dword2 prod_even = vec_mule(a.val, b.val);
+    const vec_dword2 prod_odd = vec_mulo(a.val, b.val);
+    return v_int64x2(vec_add(prod_even, prod_odd));
+}
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{ const v_int64x2 ab = v_dotprod(a, b); return ab + c; }
+
+// 8 >> 32 (unsigned uses vec_msum directly; signed widens to 16 bit first)
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{ const vec_uint4 sums = vec_msum(a.val, b.val, c.val); return v_uint32x4(sums); }
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{ const vec_uint4 sums = vec_msum(a.val, b.val, vec_uint4_z); return v_uint32x4(sums); }
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
+{
+    const vec_ushort8 shift8 = vec_ushort8_sp(8);
+    const vec_short8 a_even = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), shift8); // even
+    const vec_short8 a_odd = vec_sra((vec_short8)a.val, shift8); // odd
+    const vec_short8 b_even = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), shift8);
+    const vec_short8 b_odd = vec_sra((vec_short8)b.val, shift8);
+    return v_int32x4(vec_msum(a_even, b_even, vec_msum(a_odd, b_odd, vec_int4_z)));
+}
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{
+    const vec_ushort8 shift8 = vec_ushort8_sp(8);
+    const vec_short8 a_even = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), shift8); // even
+    const vec_short8 a_odd = vec_sra((vec_short8)a.val, shift8); // odd
+    const vec_short8 b_even = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), shift8);
+    const vec_short8 b_odd = vec_sra((vec_short8)b.val, shift8);
+    return v_int32x4(vec_msum(a_even, b_even, vec_msum(a_odd, b_odd, c.val)));
+}
+
+// 16 >> 64 (32-bit products are widened to 64 bit before summing)
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{
+    const vec_uint4 zeros = vec_uint4_z;
+    const vec_uint4 prod_even = vec_mule(a.val, b.val);
+    const vec_uint4 prod_odd  = vec_mulo(a.val, b.val);
+    const vec_udword2 even_e = (vec_udword2)vec_mergee(prod_even, zeros);
+    const vec_udword2 even_o = (vec_udword2)vec_mergeo(prod_even, zeros);
+    const vec_udword2 odd_e = (vec_udword2)vec_mergee(prod_odd, zeros);
+    const vec_udword2 odd_o = (vec_udword2)vec_mergeo(prod_odd, zeros);
+    const vec_udword2 sum_e = vec_add(even_e, odd_e);
+    const vec_udword2 sum_o = vec_add(even_o, odd_o);
+    return v_uint64x2(vec_add(sum_e, sum_o));
+}
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{ const v_uint64x2 ab = v_dotprod_expand(a, b); return ab + c; }
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
+{
+    const v_int32x4 sums32 = v_dotprod(a, b);
+    v_int64x2 part0, part1;
+    v_expand(sums32, part0, part1);
+    return v_int64x2(vec_add(vec_mergeh(part0.val, part1.val), vec_mergel(part0.val, part1.val)));
+}
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{ const v_int64x2 ab = v_dotprod_expand(a, b); return ab + c; }
+
+// 32 >> 64f (integer dot product first, converted to double afterwards)
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{ const v_int64x2 sums = v_dotprod(a, b); return v_cvt_f64(sums); }
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ const v_float64x2 ab = v_dotprod_expand(a, b); return ab + c; }
+
+//////// Fast Dot Product (several overloads simply forward to the regular versions) ////////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{ return v_dotprod(a, b); } // forwards to the regular implementation
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{ const v_int32x4 ab(vec_msum(a.val, b.val, vec_int4_z)); return ab + c; }
+// 32 >> 64
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod(a, b); } // forwards to the regular implementation
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{ return v_dotprod(a, b, c); } // forwards to the regular implementation
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{ return v_dotprod_expand(a, b); } // forwards to the regular implementation
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{ const v_uint32x4 ab(vec_msum(a.val, b.val, vec_uint4_z)); return ab + c; }
+
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{
+    const vec_short8 a_h = vec_unpackh(a.val);
+    const vec_short8 a_l = vec_unpackl(a.val);
+    const vec_short8 b_h = vec_unpackh(b.val);
+    const vec_short8 b_l = vec_unpackl(b.val);
+    return v_int32x4(vec_msum(a_h, b_h, vec_msum(a_l, b_l, vec_int4_z)));
+}
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{ const v_int32x4 ab = v_dotprod_expand_fast(a, b); return ab + c; }
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_dotprod_expand(a, b); } // forwards to the regular implementation
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{ return v_dotprod_expand(a, b, c); } // forwards to the regular implementation
+
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    const v_int32x4 sums32 = v_dotprod(a, b);
+    v_int64x2 part0, part1;
+    v_expand(sums32, part0, part1);
+    return part0 + part1;
+}
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{ const v_int64x2 ab = v_dotprod_expand_fast(a, b); return ab + c; }
+
+// 32 >> 64f
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod_expand(a, b); } // forwards to the regular implementation
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_dotprod_expand(a, b, c); } // forwards to the regular implementation
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{
+    const vec_float4 s0 = vec_splat(v.val, 0); // lane 0 of v in every lane
+    const vec_float4 s1 = vec_splat(v.val, 1);
+    const vec_float4 s2 = vec_splat(v.val, 2);
+    VSX_UNUSED(const vec_float4) s3 = vec_splat(v.val, 3);
+    return v_float32x4(vec_madd(s0, m0.val, vec_madd(s1, m1.val, vec_madd(s2, m2.val, vec_mul(s3, m3.val)))));
+}
+
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a)
+{
+    const vec_float4 s0 = vec_splat(v.val, 0); // lane 0 of v in every lane
+    const vec_float4 s1 = vec_splat(v.val, 1);
+    const vec_float4 s2 = vec_splat(v.val, 2); // lane 3 of v is not used; a is added instead
+    return v_float32x4(vec_madd(s0, m0.val, vec_madd(s1, m1.val, vec_madd(s2, m2.val, a.val))));
+}
+
+#define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2)                        \
+inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,                   \
+                           const _Tpvec& a2, const _Tpvec& a3,                   \
+                           _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3)       \
+{   /* 4x4 transpose: rows a0..a3 in, rows b0..b3 out. Every input is read */    \
+    const _Tpvec2 hi02 = vec_mergeh(a0.val, a2.val); /* before any output is */  \
+    const _Tpvec2 hi13 = vec_mergeh(a1.val, a3.val); /* written, so b* may   */  \
+    const _Tpvec2 lo02 = vec_mergel(a0.val, a2.val); /* alias a* (in-place). */  \
+    const _Tpvec2 lo13 = vec_mergel(a1.val, a3.val);                             \
+    b0.val = vec_mergeh(hi02, hi13);                                             \
+    b1.val = vec_mergel(hi02, hi13);                                             \
+    b2.val = vec_mergeh(lo02, lo13);                                             \
+    b3.val = vec_mergel(lo02, lo13);                                             \
+}
+OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_uint32x4, vec_uint4)
+OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_int32x4, vec_int4)
+OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_float32x4, vec_float4)
+
+template<int i, typename Tvec> // i: compile-time lane index; Tvec: a vector wrapper exposing .val
+inline Tvec v_broadcast_element(const Tvec& v) // returns a vector built from vec_splat(v.val, i)
+{ return Tvec(vec_splat(v.val, i)); }
+
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+
+#endif // OPENCV_HAL_VSX_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_wasm.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_wasm.hpp
new file mode 100644
index 000000000000..5d470d94192c
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/intrin_wasm.hpp
@@ -0,0 +1,2783 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_HAL_INTRIN_WASM_HPP
+#define OPENCV_HAL_INTRIN_WASM_HPP
+
+#include <limits>
+#include <cstring>
+#include <algorithm>
+#include <emscripten/version.h>
+#include "opencv2/core/saturate.hpp"
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 0 // Every f64 implementation currently uses the fallback path, so this is disabled.
+#define CV_SIMD128_FP16 0
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) < (1038046) // Emscripten older than 1.38.46
+// Map the intrinsic names used in this file onto their pre-rename spellings; see https://github.com/emscripten-core/emscripten/pull/9440 (https://github.com/emscripten-core/emscripten/commit/755d5b46cb84d0aa120c10981b11d05646c29673)
+#define wasm_i32x4_trunc_saturate_f32x4 wasm_trunc_saturate_i32x4_f32x4
+#define wasm_u32x4_trunc_saturate_f32x4 wasm_trunc_saturate_u32x4_f32x4
+#define wasm_i64x2_trunc_saturate_f64x2 wasm_trunc_saturate_i64x2_f64x2
+#define wasm_u64x2_trunc_saturate_f64x2 wasm_trunc_saturate_u64x2_f64x2
+#define wasm_f32x4_convert_i32x4 wasm_convert_f32x4_i32x4
+#define wasm_f32x4_convert_u32x4 wasm_convert_f32x4_u32x4
+#define wasm_f64x2_convert_i64x2 wasm_convert_f64x2_i64x2
+#define wasm_f64x2_convert_u64x2 wasm_convert_f64x2_u64x2
+#endif // COMPATIBILITY: <1.38.46
+
+///////// Types ///////////
+
+struct v_uint8x16
+{
+    typedef uchar lane_type;       // scalar type of one lane
+    typedef v128_t vector_type;    // underlying WASM SIMD value type
+    enum { nlanes = 16 };          // number of lanes
+
+    v_uint8x16() {}                // leaves val uninitialized
+    explicit v_uint8x16(v128_t raw) : val(raw) {}
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+            uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {   // stage the lanes in memory (v0 first), then load them as one vector
+        const uchar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = wasm_v128_load(lanes);
+    }
+
+    uchar get0() const             // value of lane 0
+    {
+        return (uchar)wasm_i8x16_extract_lane(val, 0);
+    }
+
+    v128_t val;
+};
+
+struct v_int8x16
+{
+    typedef schar lane_type;       // scalar type of one lane
+    typedef v128_t vector_type;    // underlying WASM SIMD value type
+    enum { nlanes = 16 };          // number of lanes
+
+    v_int8x16() {}                 // leaves val uninitialized
+    explicit v_int8x16(v128_t raw) : val(raw) {}
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+            schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {   // stage the lanes in memory (v0 first), then load them as one vector
+        const schar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = wasm_v128_load(lanes);
+    }
+
+    schar get0() const             // value of lane 0
+    {
+        return wasm_i8x16_extract_lane(val, 0);
+    }
+
+    v128_t val;
+};
+
+struct v_uint16x8
+{
+    typedef ushort lane_type;      // scalar type of one lane
+    typedef v128_t vector_type;    // underlying WASM SIMD value type
+    enum { nlanes = 8 };           // number of lanes
+
+    v_uint16x8() {}                // leaves val uninitialized
+    explicit v_uint16x8(v128_t raw) : val(raw) {}
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {   // stage the lanes in memory (v0 first), then load them as one vector
+        const ushort lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = wasm_v128_load(lanes);
+    }
+
+    ushort get0() const            // value of lane 0
+    {
+        return (ushort)wasm_i16x8_extract_lane(val, 0);    // wasm_u16x8_extract_lane() unimplemented yet
+    }
+
+    v128_t val;
+};
+
+struct v_int16x8
+{
+    typedef short lane_type;       // scalar type of one lane
+    typedef v128_t vector_type;    // underlying WASM SIMD value type
+    enum { nlanes = 8 };           // number of lanes
+
+    v_int16x8() {}                 // leaves val uninitialized
+    explicit v_int16x8(v128_t raw) : val(raw) {}
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {   // stage the lanes in memory (v0 first), then load them as one vector
+        const short lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = wasm_v128_load(lanes);
+    }
+
+    short get0() const             // value of lane 0
+    {
+        return wasm_i16x8_extract_lane(val, 0);
+    }
+
+    v128_t val;
+};
+
+struct v_uint32x4
+{
+    typedef unsigned lane_type;    // scalar type of one lane
+    typedef v128_t vector_type;    // underlying WASM SIMD value type
+    enum { nlanes = 4 };           // number of lanes
+
+    v_uint32x4() {}                // leaves val uninitialized
+    explicit v_uint32x4(v128_t raw) : val(raw) {}
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    {   // stage the lanes in memory (v0 first), then load them as one vector
+        const unsigned lanes[] = {v0, v1, v2, v3};
+        val = wasm_v128_load(lanes);
+    }
+
+    unsigned get0() const          // value of lane 0
+    {
+        return (unsigned)wasm_i32x4_extract_lane(val, 0);
+    }
+
+    v128_t val;
+};
+
+struct v_int32x4
+{
+    typedef int lane_type;         // scalar type of one lane
+    typedef v128_t vector_type;    // underlying WASM SIMD value type
+    enum { nlanes = 4 };           // number of lanes
+
+    v_int32x4() {}                 // leaves val uninitialized
+    explicit v_int32x4(v128_t raw) : val(raw) {}
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {   // stage the lanes in memory (v0 first), then load them as one vector
+        const int lanes[] = {v0, v1, v2, v3};
+        val = wasm_v128_load(lanes);
+    }
+
+    int get0() const               // value of lane 0
+    {
+        return wasm_i32x4_extract_lane(val, 0);
+    }
+
+    v128_t val;
+};
+
+struct v_float32x4
+{
+    typedef float lane_type;       // scalar type of one lane
+    typedef v128_t vector_type;    // underlying WASM SIMD value type
+    enum { nlanes = 4 };           // number of lanes
+
+    v_float32x4() {}               // leaves val uninitialized
+    explicit v_float32x4(v128_t raw) : val(raw) {}
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {   // stage the lanes in memory (v0 first), then load them as one vector
+        const float lanes[] = {v0, v1, v2, v3};
+        val = wasm_v128_load(lanes);
+    }
+
+    float get0() const             // value of lane 0
+    {
+        return wasm_f32x4_extract_lane(val, 0);
+    }
+
+    v128_t val;
+};
+
+struct v_uint64x2
+{
+    typedef uint64 lane_type;      // scalar type of one lane
+    typedef v128_t vector_type;    // underlying WASM SIMD value type
+    enum { nlanes = 2 };           // number of lanes
+
+    v_uint64x2() {}                // leaves val uninitialized
+    explicit v_uint64x2(v128_t raw) : val(raw) {}
+    v_uint64x2(uint64 v0, uint64 v1)
+    {   // stage the lanes in memory (v0 first), then load them as one vector
+        const uint64 lanes[] = {v0, v1};
+        val = wasm_v128_load(lanes);
+    }
+
+    uint64 get0() const            // value of lane 0
+    {
+        return (uint64)wasm_i64x2_extract_lane(val, 0);
+    }
+
+    v128_t val;
+};
+
+struct v_int64x2
+{
+    typedef int64 lane_type;       // scalar type of one lane
+    typedef v128_t vector_type;    // underlying WASM SIMD value type
+    enum { nlanes = 2 };           // number of lanes
+
+    v_int64x2() {}                 // leaves val uninitialized
+    explicit v_int64x2(v128_t raw) : val(raw) {}
+    v_int64x2(int64 v0, int64 v1)
+    {   // stage the lanes in memory (v0 first), then load them as one vector
+        const int64 lanes[] = {v0, v1};
+        val = wasm_v128_load(lanes);
+    }
+
+    int64 get0() const             // value of lane 0
+    {
+        return wasm_i64x2_extract_lane(val, 0);
+    }
+
+    v128_t val;
+};
+
+struct v_float64x2
+{
+    typedef double lane_type;      // scalar type of one lane
+    typedef v128_t vector_type;    // underlying WASM SIMD value type
+    enum { nlanes = 2 };           // number of lanes
+
+    v_float64x2() {}               // leaves val uninitialized
+    explicit v_float64x2(v128_t raw) : val(raw) {}
+    v_float64x2(double v0, double v1)
+    {   // stage the lanes in memory (v0 first), then load them as one vector
+        const double lanes[] = {v0, v1};
+        val = wasm_v128_load(lanes);
+    }
+
+    double get0() const            // value of lane 0
+    {
+        return wasm_f64x2_extract_lane(val, 0);
+    }
+
+    v128_t val;
+};
+
+namespace // unnamed namespace: helpers below have internal linkage
+{
+#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \
+inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; } // writes x as ft, reads the same storage back as tt
+OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
+OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
+OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
+OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
+OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
+OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
+OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)
+
+static const unsigned char popCountTable[] = // popCountTable[b] = number of set bits in byte value b (256 entries)
+{
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
+};
+}  // namespace
+
+static v128_t wasm_unpacklo_i8x16(v128_t a, v128_t b) { // bytes a0,b0,a1,b1,...,a7,b7 (shuffle indices 16..31 select from b)
+    return wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23);
+}
+
+static v128_t wasm_unpacklo_i16x8(v128_t a, v128_t b) { // interleaves the four 16-bit elements in bytes 0..7 of a and b
+    return wasm_v8x16_shuffle(a, b, 0,1,16,17,2,3,18,19,4,5,20,21,6,7,22,23);
+}
+
+static v128_t wasm_unpacklo_i32x4(v128_t a, v128_t b) { // interleaves the two 32-bit elements in bytes 0..7 of a and b
+    return wasm_v8x16_shuffle(a, b, 0,1,2,3,16,17,18,19,4,5,6,7,20,21,22,23);
+}
+
+static v128_t wasm_unpacklo_i64x2(v128_t a, v128_t b) { // bytes 0..7 of a followed by bytes 0..7 of b
+    return wasm_v8x16_shuffle(a, b, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
+}
+
+static v128_t wasm_unpackhi_i8x16(v128_t a, v128_t b) { // bytes a8,b8,a9,b9,...,a15,b15
+    return wasm_v8x16_shuffle(a, b, 8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31);
+}
+
+static v128_t wasm_unpackhi_i16x8(v128_t a, v128_t b) { // interleaves the four 16-bit elements in bytes 8..15 of a and b
+    return wasm_v8x16_shuffle(a, b, 8,9,24,25,10,11,26,27,12,13,28,29,14,15,30,31);
+}
+
+static v128_t wasm_unpackhi_i32x4(v128_t a, v128_t b) { // interleaves the two 32-bit elements in bytes 8..15 of a and b
+    return wasm_v8x16_shuffle(a, b, 8,9,10,11,24,25,26,27,12,13,14,15,28,29,30,31);
+}
+
+static v128_t wasm_unpackhi_i64x2(v128_t a, v128_t b) { // bytes 8..15 of a followed by bytes 8..15 of b
+    return wasm_v8x16_shuffle(a, b, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
+}
+
+/** Convert **/
+// 8 >> 16
+inline v128_t v128_cvtu8x16_i16x8(const v128_t& a)
+{
+    const v128_t z = wasm_i8x16_splat(0);
+    return wasm_unpacklo_i8x16(a, z);
+}
+inline v128_t v128_cvti8x16_i16x8(const v128_t& a)
+{ return wasm_i16x8_shr(wasm_unpacklo_i8x16(a, a), 8); }
+// 8 >> 32
+inline v128_t v128_cvtu8x16_i32x4(const v128_t& a)
+{
+    const v128_t zero = wasm_i8x16_splat(0);  // supplies the upper bytes at both widening steps (8 -> 16 -> 32)
+    return wasm_unpacklo_i16x8(wasm_unpacklo_i8x16(a, zero), zero);
+}
+inline v128_t v128_cvti8x16_i32x4(const v128_t& a)
+{
+    const v128_t twice = wasm_unpacklo_i8x16(a, a);         // each low byte now appears 2x
+    const v128_t fourfold = wasm_unpacklo_i8x16(twice, twice);  // each of the low four bytes fills a 32-bit lane
+    return wasm_i32x4_shr(fourfold, 24);                    // keep the top byte, sign-extended
+}
+// 16 >> 32
+inline v128_t v128_cvtu16x8_i32x4(const v128_t& a)
+{
+    // Zero-extend the low four 16-bit lanes by interleaving them with zeros.
+    return wasm_unpacklo_i16x8(a, wasm_i8x16_splat(0));
+}
+inline v128_t v128_cvti16x8_i32x4(const v128_t& a)  // sign-extend the low four 16-bit lanes: duplicate into 32-bit lanes, then shift right by 16
+{ return wasm_i32x4_shr(wasm_unpacklo_i16x8(a, a), 16); }
+// 32 >> 64
+inline v128_t v128_cvtu32x4_i64x2(const v128_t& a)
+{
+    // Zero-extend the low two 32-bit lanes by interleaving them with zeros.
+    return wasm_unpacklo_i32x4(a, wasm_i8x16_splat(0));
+}
+inline v128_t v128_cvti32x4_i64x2(const v128_t& a)  // sign-extend the low two 32-bit lanes: (a >> 31) supplies the sign word for each upper half
+{ return wasm_unpacklo_i32x4(a, wasm_i32x4_shr(a, 31)); }
+
+// 8 >> 16 (upper half of the input lanes)
+inline v128_t v128_cvtu8x16_i16x8_high(const v128_t& a)
+{
+    // Zero-extend the high eight bytes: each one is paired with a zero upper byte.
+    return wasm_unpackhi_i8x16(a, wasm_i8x16_splat(0));
+}
+inline v128_t v128_cvti8x16_i16x8_high(const v128_t& a)  // sign-extend the high eight bytes: duplicate each byte into a 16-bit lane, then shift right by 8
+{ return wasm_i16x8_shr(wasm_unpackhi_i8x16(a, a), 8); }
+// 16 >> 32 (upper half of the input lanes)
+inline v128_t v128_cvtu16x8_i32x4_high(const v128_t& a)
+{
+    // Zero-extend the high four 16-bit lanes by interleaving them with zeros.
+    return wasm_unpackhi_i16x8(a, wasm_i8x16_splat(0));
+}
+inline v128_t v128_cvti16x8_i32x4_high(const v128_t& a)  // sign-extend the high four 16-bit lanes: duplicate into 32-bit lanes, then shift right by 16
+{ return wasm_i32x4_shr(wasm_unpackhi_i16x8(a, a), 16); }
+// 32 >> 64 (upper half of the input lanes)
+inline v128_t v128_cvtu32x4_i64x2_high(const v128_t& a)
+{
+    // Zero-extend the high two 32-bit lanes by interleaving them with zeros.
+    return wasm_unpackhi_i32x4(a, wasm_i8x16_splat(0));
+}
+inline v128_t v128_cvti32x4_i64x2_high(const v128_t& a)  // sign-extend the high two 32-bit lanes: (a >> 31) supplies the sign word for each upper half
+{ return wasm_unpackhi_i32x4(a, wasm_i32x4_shr(a, 31)); }
+
+#define OPENCV_HAL_IMPL_WASM_INITVEC(_Tpvec, _Tp, suffix, zsuffix, _Tps) /* emits v_setzero_##suffix, v_setall_##suffix and v_reinterpret_as_##suffix for _Tpvec; zsuffix picks the wasm splat intrinsic, _Tps is the scalar type cast to before splatting */ \
+inline _Tpvec v_setzero_##suffix() { return _Tpvec(wasm_##zsuffix##_splat((_Tps)0)); } \
+inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(wasm_##zsuffix##_splat((_Tps)v)); } \
+template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
+{ return _Tpvec(a.val); }
+
+OPENCV_HAL_IMPL_WASM_INITVEC(v_uint8x16, uchar, u8, i8x16, schar)  // unsigned integer types reuse the signed splat intrinsic of the same lane width
+OPENCV_HAL_IMPL_WASM_INITVEC(v_int8x16, schar, s8, i8x16, schar)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_uint16x8, ushort, u16, i16x8, short)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_int16x8, short, s16, i16x8, short)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_uint32x4, unsigned, u32, i32x4, int)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_int32x4, int, s32, i32x4, int)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_float32x4, float, f32, f32x4, float)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_uint64x2, uint64, u64, i64x2, int64)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_int64x2, int64, s64, i64x2, int64)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_float64x2, double, f64, f64x2, double)
+
+//////////////// PACK ///////////////
+inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)  // saturating u16 -> u8 narrow; a fills lanes 0-7, b fills lanes 8-15
+{
+    const v128_t limit = wasm_i16x8_splat(255);
+    const v128_t a_sat = wasm_v128_bitselect(limit, a.val, wasm_u16x8_gt(a.val, limit));
+    const v128_t b_sat = wasm_v128_bitselect(limit, b.val, wasm_u16x8_gt(b.val, limit));
+    return v_uint8x16(wasm_v8x16_shuffle(a_sat, b_sat, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)  // saturating i16 -> i8 narrow; a fills lanes 0-7, b fills lanes 8-15
+{
+    const v128_t upper = wasm_i16x8_splat(127);
+    const v128_t lower = wasm_i16x8_splat(-128);
+    const v128_t a_top = wasm_v128_bitselect(upper, a.val, wasm_i16x8_gt(a.val, upper));
+    const v128_t b_top = wasm_v128_bitselect(upper, b.val, wasm_i16x8_gt(b.val, upper));
+    const v128_t a_sat = wasm_v128_bitselect(lower, a_top, wasm_i16x8_lt(a_top, lower));
+    const v128_t b_sat = wasm_v128_bitselect(lower, b_top, wasm_i16x8_lt(b_top, lower));
+    return v_int8x16(wasm_v8x16_shuffle(a_sat, b_sat, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)  // saturating u32 -> u16 narrow; a fills lanes 0-3, b fills lanes 4-7
+{
+    const v128_t limit = wasm_i32x4_splat(65535);
+    const v128_t a_sat = wasm_v128_bitselect(limit, a.val, wasm_u32x4_gt(a.val, limit));
+    const v128_t b_sat = wasm_v128_bitselect(limit, b.val, wasm_u32x4_gt(b.val, limit));
+    return v_uint16x8(wasm_v8x16_shuffle(a_sat, b_sat, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)  // saturating i32 -> i16 narrow; a fills lanes 0-3, b fills lanes 4-7
+{
+    const v128_t upper = wasm_i32x4_splat(32767);
+    const v128_t lower = wasm_i32x4_splat(-32768);
+    const v128_t a_top = wasm_v128_bitselect(upper, a.val, wasm_i32x4_gt(a.val, upper));
+    const v128_t b_top = wasm_v128_bitselect(upper, b.val, wasm_i32x4_gt(b.val, upper));
+    const v128_t a_sat = wasm_v128_bitselect(lower, a_top, wasm_i32x4_lt(a_top, lower));
+    const v128_t b_sat = wasm_v128_bitselect(lower, b_top, wasm_i32x4_lt(b_top, lower));
+    return v_int16x8(wasm_v8x16_shuffle(a_sat, b_sat, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)  // truncating (no clamp) narrow: keeps the low 32 bits of each 64-bit lane, a first then b
+{
+    return v_uint32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
+}
+inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)  // truncating (no clamp) narrow: keeps the low 32 bits of each 64-bit lane, a first then b
+{
+    return v_int32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
+}
+inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)  // clamp signed i16 lanes to [0, 255] and narrow to u8; a fills lanes 0-7, b fills lanes 8-15
+{
+    const v128_t upper = wasm_i16x8_splat(255);
+    const v128_t lower = wasm_i16x8_splat(0);
+    const v128_t a_top = wasm_v128_bitselect(upper, a.val, wasm_i16x8_gt(a.val, upper));
+    const v128_t b_top = wasm_v128_bitselect(upper, b.val, wasm_i16x8_gt(b.val, upper));
+    const v128_t a_sat = wasm_v128_bitselect(lower, a_top, wasm_i16x8_lt(a_top, lower));
+    const v128_t b_sat = wasm_v128_bitselect(lower, b_top, wasm_i16x8_lt(b_top, lower));
+    return v_uint8x16(wasm_v8x16_shuffle(a_sat, b_sat, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)  // clamp signed i32 lanes to [0, 65535] and narrow to u16; a fills lanes 0-3, b fills lanes 4-7
+{
+    const v128_t upper = wasm_i32x4_splat(65535);
+    const v128_t lower = wasm_i32x4_splat(0);
+    const v128_t a_top = wasm_v128_bitselect(upper, a.val, wasm_i32x4_gt(a.val, upper));
+    const v128_t b_top = wasm_v128_bitselect(upper, b.val, wasm_i32x4_gt(b.val, upper));
+    const v128_t a_sat = wasm_v128_bitselect(lower, a_top, wasm_i32x4_lt(a_top, lower));
+    const v128_t b_sat = wasm_v128_bitselect(lower, b_top, wasm_i32x4_lt(b_top, lower));
+    return v_uint16x8(wasm_v8x16_shuffle(a_sat, b_sat, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+
+template<int n>
+inline v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)  // rounding shift right by n, then saturating u16 -> u8 narrow; a fills lanes 0-7, b fills lanes 8-15
+{
+    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));  // rounding bias: half of the divisor 2^n
+    v128_t a1 = wasm_u16x8_shr(wasm_u16x8_add_saturate(a.val, delta), n);  // saturating add: inputs near 65535 must not wrap to a small value
+    v128_t b1 = wasm_u16x8_shr(wasm_u16x8_add_saturate(b.val, delta), n);
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u16x8_gt(b1, maxval));
+    return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+template<int n>
+inline v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)  // rounding arithmetic shift right by n, then saturating i16 -> i8 narrow; a fills lanes 0-7, b fills lanes 8-15
+{
+    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));  // rounding bias: half of the divisor 2^n
+    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add_saturate(a.val, delta), n);  // saturating add: inputs near 32767 must not flip sign
+    v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add_saturate(b.val, delta), n);
+    v128_t maxval = wasm_i16x8_splat(127);
+    v128_t minval = wasm_i16x8_splat(-128);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));  // a1 < minval implies a2 == a1, so testing a1 is equivalent
+    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
+    return v_int8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+template<int n>
+inline v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)  // rounding shift right by n, then saturating u32 -> u16 narrow; a fills lanes 0-3, b fills lanes 4-7
+{
+    const v128_t bias = wasm_i32x4_splat(((int)1 << (n-1)));
+    const v128_t a_shifted = wasm_u32x4_shr(wasm_i32x4_add(a.val, bias), n);
+    const v128_t b_shifted = wasm_u32x4_shr(wasm_i32x4_add(b.val, bias), n);
+    const v128_t limit = wasm_i32x4_splat(65535);
+    const v128_t a_sat = wasm_v128_bitselect(limit, a_shifted, wasm_u32x4_gt(a_shifted, limit));
+    const v128_t b_sat = wasm_v128_bitselect(limit, b_shifted, wasm_u32x4_gt(b_shifted, limit));
+    return v_uint16x8(wasm_v8x16_shuffle(a_sat, b_sat, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+template<int n>
+inline v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)  // rounding arithmetic shift right by n, then saturating i32 -> i16 narrow; a fills lanes 0-3, b fills lanes 4-7
+{
+    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));  // rounding bias: half of the divisor 2^n
+    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
+    v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
+    v128_t maxval = wasm_i32x4_splat(32767);
+    v128_t minval = wasm_i32x4_splat(-32768);  // must be a 32-bit splat: a 16-bit splat reads as 0x80008000 per lane and breaks the lower clamp
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));  // a1 < minval implies a2 == a1, so testing a1 is equivalent
+    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
+    return v_int16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+template<int n>
+inline v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)  // rounding logical shift right by n, then truncate each 64-bit lane to its low 32 bits (no clamp)
+{
+    const v128_t bias = wasm_i64x2_splat(((int64)1 << (n-1)));
+    const v128_t a_shifted = wasm_u64x2_shr(wasm_i64x2_add(a.val, bias), n);
+    const v128_t b_shifted = wasm_u64x2_shr(wasm_i64x2_add(b.val, bias), n);
+    return v_uint32x4(wasm_v8x16_shuffle(a_shifted, b_shifted, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
+}
+template<int n>
+inline v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)  // rounding arithmetic shift right by n, then truncate each 64-bit lane to its low 32 bits (no clamp)
+{
+    const v128_t bias = wasm_i64x2_splat(((int64)1 << (n-1)));
+    const v128_t a_shifted = wasm_i64x2_shr(wasm_i64x2_add(a.val, bias), n);
+    const v128_t b_shifted = wasm_i64x2_shr(wasm_i64x2_add(b.val, bias), n);
+    return v_int32x4(wasm_v8x16_shuffle(a_shifted, b_shifted, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
+}
+template<int n>
+inline v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)  // rounding arithmetic shift right by n, then clamp to [0, 255] and narrow to u8; a fills lanes 0-7, b fills lanes 8-15
+{
+    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));  // rounding bias: half of the divisor 2^n
+    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add_saturate(a.val, delta), n);  // saturating add: inputs near 32767 must not flip sign
+    v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add_saturate(b.val, delta), n);
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t minval = wasm_i16x8_splat(0);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));  // a1 < minval implies a2 == a1, so testing a1 is equivalent
+    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
+    return v_uint8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+template<int n>
+inline v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)  // rounding arithmetic shift right by n, then clamp to [0, 65535] and narrow to u16; a fills lanes 0-3, b fills lanes 4-7
+{
+    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));  // rounding bias: half of the divisor 2^n
+    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
+    v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
+    v128_t maxval = wasm_i32x4_splat(65535);
+    v128_t minval = wasm_i32x4_splat(0);  // splat at the lane width used by the comparisons below
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));  // a1 < minval implies a2 == a1, so testing a1 is equivalent
+    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
+    return v_uint16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+
+inline void v_pack_store(uchar* ptr, const v_uint16x8& a)  // saturating u16 -> u8 narrow of a; writes exactly 8 bytes to ptr
+{
+    const v128_t limit = wasm_i16x8_splat(255);
+    const v128_t clamped = wasm_v128_bitselect(limit, a.val, wasm_u16x8_gt(a.val, limit));
+    const v128_t packed = wasm_v8x16_shuffle(clamped, clamped, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    uchar staging[16];  // full-vector scratch; only the first half is copied out
+    wasm_v128_store(staging, packed);
+    for (int k = 0; k < 8; ++k) {
+        ptr[k] = staging[k];
+    }
+}
+inline void v_pack_store(schar* ptr, const v_int16x8& a)  // saturating i16 -> i8 narrow of a; writes exactly 8 bytes to ptr
+{
+    const v128_t upper = wasm_i16x8_splat(127);
+    const v128_t lower = wasm_i16x8_splat(-128);
+    const v128_t topped = wasm_v128_bitselect(upper, a.val, wasm_i16x8_gt(a.val, upper));
+    const v128_t clamped = wasm_v128_bitselect(lower, topped, wasm_i16x8_lt(topped, lower));
+    const v128_t packed = wasm_v8x16_shuffle(clamped, clamped, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    schar staging[16];  // full-vector scratch; only the first half is copied out
+    wasm_v128_store(staging, packed);
+    for (int k = 0; k < 8; ++k) {
+        ptr[k] = staging[k];
+    }
+}
+inline void v_pack_store(ushort* ptr, const v_uint32x4& a)  // saturating u32 -> u16 narrow of a; writes exactly 4 elements to ptr
+{
+    const v128_t limit = wasm_i32x4_splat(65535);
+    const v128_t clamped = wasm_v128_bitselect(limit, a.val, wasm_u32x4_gt(a.val, limit));
+    const v128_t packed = wasm_v8x16_shuffle(clamped, clamped, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    ushort staging[8];  // full-vector scratch; only the first half is copied out
+    wasm_v128_store(staging, packed);
+    for (int k = 0; k < 4; ++k) {
+        ptr[k] = staging[k];
+    }
+}
+inline void v_pack_store(short* ptr, const v_int32x4& a)  // saturating i32 -> i16 narrow of a; writes exactly 4 elements to ptr
+{
+    const v128_t upper = wasm_i32x4_splat(32767);
+    const v128_t lower = wasm_i32x4_splat(-32768);
+    const v128_t topped = wasm_v128_bitselect(upper, a.val, wasm_i32x4_gt(a.val, upper));
+    const v128_t clamped = wasm_v128_bitselect(lower, topped, wasm_i32x4_lt(topped, lower));
+    const v128_t packed = wasm_v8x16_shuffle(clamped, clamped, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    short staging[8];  // full-vector scratch; only the first half is copied out
+    wasm_v128_store(staging, packed);
+    for (int k = 0; k < 4; ++k) {
+        ptr[k] = staging[k];
+    }
+}
+inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)  // truncating u64 -> u32 narrow of a (no clamp); writes exactly 2 elements to ptr
+{
+    const v128_t packed = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
+    unsigned staging[4];  // full-vector scratch; only the first half is copied out
+    wasm_v128_store(staging, packed);
+    for (int k = 0; k < 2; ++k) {
+        ptr[k] = staging[k];
+    }
+}
+inline void v_pack_store(int* ptr, const v_int64x2& a)  // truncating i64 -> i32 narrow of a (no clamp); writes exactly 2 elements to ptr
+{
+    const v128_t packed = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
+    int staging[4];  // full-vector scratch; only the first half is copied out
+    wasm_v128_store(staging, packed);
+    for (int k = 0; k < 2; ++k) {
+        ptr[k] = staging[k];
+    }
+}
+inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)  // clamp signed i16 lanes to [0, 255], narrow to u8; writes exactly 8 bytes to ptr
+{
+    const v128_t upper = wasm_i16x8_splat(255);
+    const v128_t lower = wasm_i16x8_splat(0);
+    const v128_t topped = wasm_v128_bitselect(upper, a.val, wasm_i16x8_gt(a.val, upper));
+    const v128_t clamped = wasm_v128_bitselect(lower, topped, wasm_i16x8_lt(topped, lower));
+    const v128_t packed = wasm_v8x16_shuffle(clamped, clamped, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    uchar staging[16];  // full-vector scratch; only the first half is copied out
+    wasm_v128_store(staging, packed);
+    for (int k = 0; k < 8; ++k) {
+        ptr[k] = staging[k];
+    }
+}
+inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)  // clamp signed i32 lanes to [0, 65535], narrow to u16; writes exactly 4 elements to ptr
+{
+    const v128_t upper = wasm_i32x4_splat(65535);
+    const v128_t lower = wasm_i32x4_splat(0);
+    const v128_t topped = wasm_v128_bitselect(upper, a.val, wasm_i32x4_gt(a.val, upper));
+    const v128_t clamped = wasm_v128_bitselect(lower, topped, wasm_i32x4_lt(topped, lower));
+    const v128_t packed = wasm_v8x16_shuffle(clamped, clamped, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    ushort staging[8];  // full-vector scratch; only the first half is copied out
+    wasm_v128_store(staging, packed);
+    for (int k = 0; k < 4; ++k) {
+        ptr[k] = staging[k];
+    }
+}
+
+template<int n>
+inline void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)  // rounding shift right by n, saturating u16 -> u8 narrow; writes exactly 8 bytes to ptr
+{
+    v128_t delta = wasm_i16x8_splat((short)(1 << (n-1)));  // rounding bias: half of the divisor 2^n
+    v128_t a1 = wasm_u16x8_shr(wasm_u16x8_add_saturate(a.val, delta), n);  // saturating add: inputs near 65535 must not wrap to a small value
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
+    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    uchar t_ptr[16];  // full-vector scratch; only the first half is copied out
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<8; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_store(schar* ptr, const v_int16x8& a)  // rounding arithmetic shift right by n, saturating i16 -> i8 narrow; writes exactly 8 bytes to ptr
+{
+    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));  // rounding bias: half of the divisor 2^n
+    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add_saturate(a.val, delta), n);  // saturating add: inputs near 32767 must not flip sign
+    v128_t maxval = wasm_i16x8_splat(127);
+    v128_t minval = wasm_i16x8_splat(-128);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));  // a1 < minval implies a2 == a1, so testing a1 is equivalent
+    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    schar t_ptr[16];  // full-vector scratch; only the first half is copied out
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<8; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)  // rounding shift right by n, saturating u32 -> u16 narrow; writes exactly 4 elements to ptr
+{
+    const v128_t bias = wasm_i32x4_splat(((int)1 << (n-1)));
+    const v128_t shifted = wasm_u32x4_shr(wasm_i32x4_add(a.val, bias), n);
+    const v128_t limit = wasm_i32x4_splat(65535);
+    const v128_t clamped = wasm_v128_bitselect(limit, shifted, wasm_u32x4_gt(shifted, limit));
+    const v128_t packed = wasm_v8x16_shuffle(clamped, clamped, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    ushort staging[8];  // full-vector scratch; only the first half is copied out
+    wasm_v128_store(staging, packed);
+    for (int k = 0; k < 4; ++k) {
+        ptr[k] = staging[k];
+    }
+}
+template<int n>
+inline void v_rshr_pack_store(short* ptr, const v_int32x4& a)  // rounding arithmetic shift right by n, saturating i32 -> i16 narrow; writes exactly 4 elements to ptr
+{
+    const v128_t bias = wasm_i32x4_splat(((int)1 << (n-1)));
+    const v128_t shifted = wasm_i32x4_shr(wasm_i32x4_add(a.val, bias), n);
+    const v128_t upper = wasm_i32x4_splat(32767);
+    const v128_t lower = wasm_i32x4_splat(-32768);
+    const v128_t topped = wasm_v128_bitselect(upper, shifted, wasm_i32x4_gt(shifted, upper));
+    const v128_t clamped = wasm_v128_bitselect(lower, topped, wasm_i32x4_lt(shifted, lower));
+    const v128_t packed = wasm_v8x16_shuffle(clamped, clamped, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    short staging[8];  // full-vector scratch; only the first half is copied out
+    wasm_v128_store(staging, packed);
+    for (int k = 0; k < 4; ++k) {
+        ptr[k] = staging[k];
+    }
+}
+template<int n>
+inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)  // rounding logical shift right by n, truncating u64 -> u32 narrow; writes exactly 2 elements to ptr
+{
+    const v128_t bias = wasm_i64x2_splat(((int64)1 << (n-1)));
+    const v128_t shifted = wasm_u64x2_shr(wasm_i64x2_add(a.val, bias), n);
+    const v128_t packed = wasm_v8x16_shuffle(shifted, shifted, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
+    unsigned staging[4];  // full-vector scratch; only the first half is copied out
+    wasm_v128_store(staging, packed);
+    for (int k = 0; k < 2; ++k) {
+        ptr[k] = staging[k];
+    }
+}
+template<int n>
+inline void v_rshr_pack_store(int* ptr, const v_int64x2& a)  // rounding arithmetic shift right by n, truncating i64 -> i32 narrow; writes exactly 2 elements to ptr
+{
+    const v128_t bias = wasm_i64x2_splat(((int64)1 << (n-1)));
+    const v128_t shifted = wasm_i64x2_shr(wasm_i64x2_add(a.val, bias), n);
+    const v128_t packed = wasm_v8x16_shuffle(shifted, shifted, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
+    int staging[4];  // full-vector scratch; only the first half is copied out
+    wasm_v128_store(staging, packed);
+    for (int k = 0; k < 2; ++k) {
+        ptr[k] = staging[k];
+    }
+}
+template<int n>
+inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)  // rounding arithmetic shift right by n, clamp to [0, 255], narrow to u8; writes exactly 8 bytes to ptr
+{
+    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));  // rounding bias: half of the divisor 2^n
+    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add_saturate(a.val, delta), n);  // saturating add: inputs near 32767 must not flip sign
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t minval = wasm_i16x8_splat(0);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));  // a1 < minval implies a2 == a1, so testing a1 is equivalent
+    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    uchar t_ptr[16];  // full-vector scratch; only the first half is copied out
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<8; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)  // rounding arithmetic shift right by n, clamp to [0, 65535], narrow to u16; writes exactly 4 elements to ptr
+{
+    const v128_t bias = wasm_i32x4_splat(((int)1 << (n-1)));
+    const v128_t shifted = wasm_i32x4_shr(wasm_i32x4_add(a.val, bias), n);
+    const v128_t upper = wasm_i32x4_splat(65535);
+    const v128_t lower = wasm_i32x4_splat(0);
+    const v128_t topped = wasm_v128_bitselect(upper, shifted, wasm_i32x4_gt(shifted, upper));
+    const v128_t clamped = wasm_v128_bitselect(lower, topped, wasm_i32x4_lt(shifted, lower));
+    const v128_t packed = wasm_v8x16_shuffle(clamped, clamped, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    ushort staging[8];  // full-vector scratch; only the first half is copied out
+    wasm_v128_store(staging, packed);
+    for (int k = 0; k < 4; ++k) {
+        ptr[k] = staging[k];
+    }
+}
+
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)  // clamp 16-bit lanes to 255 and keep each low byte; a fills lanes 0-7, b fills lanes 8-15
+{
+    const v128_t limit = wasm_i16x8_splat(255);
+    const v128_t a_sat = wasm_v128_bitselect(limit, a.val, wasm_u16x8_gt(a.val, limit));
+    const v128_t b_sat = wasm_v128_bitselect(limit, b.val, wasm_u16x8_gt(b.val, limit));
+    return v_uint8x16(wasm_v8x16_shuffle(a_sat, b_sat, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)  // clamp 32-bit lanes to 255, keep each low byte; output order is a, b, c, d
+{
+    const v128_t limit = wasm_i32x4_splat(255);
+    const v128_t a_sat = wasm_v128_bitselect(limit, a.val, wasm_u32x4_gt(a.val, limit));
+    const v128_t b_sat = wasm_v128_bitselect(limit, b.val, wasm_u32x4_gt(b.val, limit));
+    const v128_t c_sat = wasm_v128_bitselect(limit, c.val, wasm_u32x4_gt(c.val, limit));
+    const v128_t d_sat = wasm_v128_bitselect(limit, d.val, wasm_u32x4_gt(d.val, limit));
+    const v128_t first8 = wasm_v8x16_shuffle(a_sat, b_sat, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
+    const v128_t last8 = wasm_v8x16_shuffle(c_sat, d_sat, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
+    return v_uint8x16(wasm_v8x16_shuffle(first8, last8, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)  // clamp 64-bit lanes to 255, keep each low byte; output order is a..h, two bytes per input
+{
+    v128_t maxval = wasm_i64x2_splat(255);  // splat per 64-bit lane so the u64 comparisons below really test against 255
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, ((__u64x2)(a.val) > (__u64x2)maxval));
+    v128_t b1 = wasm_v128_bitselect(maxval, b.val, ((__u64x2)(b.val) > (__u64x2)maxval));
+    v128_t c1 = wasm_v128_bitselect(maxval, c.val, ((__u64x2)(c.val) > (__u64x2)maxval));
+    v128_t d1 = wasm_v128_bitselect(maxval, d.val, ((__u64x2)(d.val) > (__u64x2)maxval));
+    v128_t e1 = wasm_v128_bitselect(maxval, e.val, ((__u64x2)(e.val) > (__u64x2)maxval));
+    v128_t f1 = wasm_v128_bitselect(maxval, f.val, ((__u64x2)(f.val) > (__u64x2)maxval));
+    v128_t g1 = wasm_v128_bitselect(maxval, g.val, ((__u64x2)(g.val) > (__u64x2)maxval));
+    v128_t h1 = wasm_v128_bitselect(maxval, h.val, ((__u64x2)(h.val) > (__u64x2)maxval));
+    v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);  // low byte of every 64-bit lane
+    v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
+    v128_t ef = wasm_v8x16_shuffle(e1, f1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
+    v128_t gh = wasm_v8x16_shuffle(g1, h1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
+    v128_t abcd = wasm_v8x16_shuffle(ab, cd, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
+    v128_t efgh = wasm_v8x16_shuffle(ef, gh, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
+    return v_uint8x16(wasm_v8x16_shuffle(abcd, efgh, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
+}
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{
+    // Broadcast every component of v across a full vector.
+    const v128_t x = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0));
+    const v128_t y = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1));
+    const v128_t z = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2));
+    const v128_t w = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 3));
+    // Result is v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3, summed pairwise.
+    const v128_t xy = wasm_f32x4_add(wasm_f32x4_mul(x, m0.val), wasm_f32x4_mul(y, m1.val));
+    const v128_t zw = wasm_f32x4_add(wasm_f32x4_mul(z, m2.val), wasm_f32x4_mul(w, m3.val));
+
+    return v_float32x4(wasm_f32x4_add(xy, zw));
+}
+
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a)
+{
+    const v128_t x = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0));
+    const v128_t y = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1));
+    const v128_t z = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2));
+    // Result is v[0]*m0 + v[1]*m1 + v[2]*m2 + a; lane 3 of v is not read.
+    const v128_t xy = wasm_f32x4_add(wasm_f32x4_mul(x, m0.val), wasm_f32x4_mul(y, m1.val));
+    const v128_t za = wasm_f32x4_add(wasm_f32x4_mul(z, m2.val), a.val);
+
+    return v_float32x4(wasm_f32x4_add(xy, za));
+}
+
+#define OPENCV_HAL_IMPL_WASM_BIN_OP(bin_op, _Tpvec, intrin) /* defines operator bin_op and its compound-assignment form for _Tpvec, both forwarding to intrin(a.val, b.val) */ \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+} \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a.val = intrin(a.val, b.val); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint8x16, wasm_u8x16_add_saturate)  // 8- and 16-bit + and - map to the saturating intrinsics; 32/64-bit and float types use the plain ones
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint8x16, wasm_u8x16_sub_saturate)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int8x16, wasm_i8x16_add_saturate)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int8x16, wasm_i8x16_sub_saturate)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint16x8, wasm_u16x8_add_saturate)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint16x8, wasm_u16x8_sub_saturate)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int16x8, wasm_i16x8_add_saturate)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int16x8, wasm_i16x8_sub_saturate)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint32x4, wasm_i32x4_add)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint32x4, wasm_i32x4_sub)
+OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_uint32x4, wasm_i32x4_mul)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int32x4, wasm_i32x4_add)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int32x4, wasm_i32x4_sub)
+OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_int32x4, wasm_i32x4_mul)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float32x4, wasm_f32x4_add)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float32x4, wasm_f32x4_sub)
+OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float32x4, wasm_f32x4_mul)
+OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float32x4, wasm_f32x4_div)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint64x2, wasm_i64x2_add)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint64x2, wasm_i64x2_sub)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int64x2, wasm_i64x2_add)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int64x2, wasm_i64x2_sub)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float64x2, wasm_f64x2_add)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float64x2, wasm_f64x2_sub)
+OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float64x2, wasm_f64x2_mul)
+OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float64x2, wasm_f64x2_div)
+
+// Saturating multiply for 8-bit and 16-bit vectors: widen via v_mul_expand into two _Tpwvec halves, then narrow back with v_pack.
+#define OPENCV_HAL_IMPL_WASM_MUL_SAT(_Tpvec, _Tpwvec)        \
+inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
+{                                                            \
+    _Tpwvec c, d;                                            \
+    v_mul_expand(a, b, c, d);                                \
+    return v_pack(c, d);                                     \
+}                                                            \
+inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
+{ a = a * b; return a; }
+
+OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint8x16, v_uint16x8)  // second argument is the double-width vector type used for the intermediate products
+OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int8x16,  v_int16x8)
+OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint16x8, v_uint32x4)
+OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int16x8,  v_int32x4)
+
+//  Multiply and expand
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
+                         v_uint16x8& c, v_uint16x8& d)  // widen both inputs with v_expand, multiply the matching halves into c and d
+{
+    v_uint16x8 a_first, a_second, b_first, b_second;
+    v_expand(a, a_first, a_second);
+    v_expand(b, b_first, b_second);
+    c = v_mul_wrap(a_first, b_first);
+    d = v_mul_wrap(a_second, b_second);
+}
+
+inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
+                         v_int16x8& c, v_int16x8& d)  // widen both inputs with v_expand, multiply the matching halves into c and d
+{
+    v_int16x8 a_first, a_second, b_first, b_second;
+    v_expand(a, a_first, a_second);
+    v_expand(b, b_first, b_second);
+    c = v_mul_wrap(a_first, b_first);
+    d = v_mul_wrap(a_second, b_second);
+}
+
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
+                         v_int32x4& c, v_int32x4& d)  // widen both inputs with v_expand, multiply the matching halves as 32-bit lanes into c and d
+{
+    v_int32x4 a_first, a_second, b_first, b_second;
+    v_expand(a, a_first, a_second);
+    v_expand(b, b_first, b_second);
+    c.val = wasm_i32x4_mul(a_first.val, b_first.val);
+    d.val = wasm_i32x4_mul(a_second.val, b_second.val);
+}
+
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
+                         v_uint32x4& c, v_uint32x4& d)  // widen both inputs with v_expand, multiply the matching halves as 32-bit lanes into c and d
+{
+    v_uint32x4 a_first, a_second, b_first, b_second;
+    v_expand(a, a_first, a_second);
+    v_expand(b, b_first, b_second);
+    c.val = wasm_i32x4_mul(a_first.val, b_first.val);
+    d.val = wasm_i32x4_mul(a_second.val, b_second.val);
+}
+
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d)  // widen both inputs with v_expand, multiply the matching halves as 64-bit lanes into c and d
+{
+    v_uint64x2 a_first, a_second, b_first, b_second;
+    v_expand(a, a_first, a_second);
+    v_expand(b, b_first, b_second);
+    c.val = ((__u64x2)(a_first.val) * (__u64x2)(b_first.val));
+    d.val = ((__u64x2)(a_second.val) * (__u64x2)(b_second.val));
+}
+
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)  // keeps bytes 2-3 (the upper 16 bits) of every 32-bit product
+{
+    v_int32x4 a_first, a_second, b_first, b_second;
+    v_expand(a, a_first, a_second);
+    v_expand(b, b_first, b_second);
+    const v128_t prod_first = wasm_i32x4_mul(a_first.val, b_first.val);
+    const v128_t prod_second = wasm_i32x4_mul(a_second.val, b_second.val);
+    return v_int16x8(wasm_v8x16_shuffle(prod_first, prod_second, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
+}
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)  // keeps bytes 2-3 (the upper 16 bits) of every 32-bit product
+{
+    v_uint32x4 a_first, a_second, b_first, b_second;
+    v_expand(a, a_first, a_second);
+    v_expand(b, b_first, b_second);
+    const v128_t prod_first = wasm_i32x4_mul(a_first.val, b_first.val);
+    const v128_t prod_second = wasm_i32x4_mul(a_second.val, b_second.val);
+    return v_uint16x8(wasm_v8x16_shuffle(prod_first, prod_second, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
+}
+
+//////// Dot Product ////////
+
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)  // out[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1], computed in 32-bit lanes
+{
+    const v128_t a_even = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);  // low 16 bits of each 32-bit lane, sign-extended
+    const v128_t a_odd = wasm_i32x4_shr(a.val, 16);                       // high 16 bits of each 32-bit lane, sign-extended
+    const v128_t b_even = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
+    const v128_t b_odd = wasm_i32x4_shr(b.val, 16);
+    const v128_t even_prod = wasm_i32x4_mul(a_even, b_even);
+    const v128_t odd_prod = wasm_i32x4_mul(a_odd, b_odd);
+    return v_int32x4(wasm_i32x4_add(even_prod, odd_prod));
+}
+
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)  // two-argument v_dotprod plus the accumulator c
+{ return v_dotprod(a, b) + c; }
+
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)  // out[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1], computed in 64-bit lanes
+{
+    const v128_t a_even = wasm_i64x2_shr(wasm_i64x2_shl(a.val, 32), 32);  // low 32 bits of each 64-bit lane, sign-extended
+    const v128_t a_odd = wasm_i64x2_shr(a.val, 32);                       // high 32 bits of each 64-bit lane, sign-extended
+    const v128_t b_even = wasm_i64x2_shr(wasm_i64x2_shl(b.val, 32), 32);
+    const v128_t b_odd = wasm_i64x2_shr(b.val, 32);
+    const v128_t even_prod = (v128_t)((__i64x2)a_even * (__i64x2)b_even);
+    const v128_t odd_prod = (v128_t)((__i64x2)a_odd * (__i64x2)b_odd);
+    return v_int64x2(wasm_i64x2_add(even_prod, odd_prod));
+}
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)  // two-argument v_dotprod plus the accumulator c
+{
+    return v_dotprod(a, b) + c;
+}
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)  // each 32-bit output lane sums the products of four consecutive byte pairs
+{
+    const v128_t a_even = wasm_u16x8_shr(wasm_i16x8_shl(a.val, 8), 8);  // even-indexed bytes, zero-extended to 16 bits
+    const v128_t a_odd = wasm_u16x8_shr(a.val, 8);                      // odd-indexed bytes, zero-extended to 16 bits
+    const v128_t b_even = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
+    const v128_t b_odd = wasm_u16x8_shr(b.val, 8);
+    return v_uint32x4((
+        v_dotprod(v_int16x8(a_even), v_int16x8(b_even)) +
+        v_dotprod(v_int16x8(a_odd), v_int16x8(b_odd))).val
+    );
+}
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)  // two-argument v_dotprod_expand plus the accumulator c
+{ return v_dotprod_expand(a, b) + c; }
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)  // each 32-bit output lane sums the products of four consecutive byte pairs
+{
+    const v128_t a_even = wasm_i16x8_shr(wasm_i16x8_shl(a.val, 8), 8);  // even-indexed bytes, sign-extended to 16 bits
+    const v128_t a_odd = wasm_i16x8_shr(a.val, 8);                      // odd-indexed bytes, sign-extended to 16 bits
+    const v128_t b_even = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
+    const v128_t b_odd = wasm_i16x8_shr(b.val, 8);
+    return v_int32x4(
+        v_dotprod(v_int16x8(a_even), v_int16x8(b_even)) +
+        v_dotprod(v_int16x8(a_odd), v_int16x8(b_odd))
+    );
+}
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)  // two-argument v_dotprod_expand plus the accumulator c
+{ return v_dotprod_expand(a, b) + c; }
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)  // each 64-bit output lane sums the products of four consecutive 16-bit pairs
+{
+    const v128_t a_even = wasm_u32x4_shr(wasm_i32x4_shl(a.val, 16), 16);  // even-indexed 16-bit lanes, zero-extended to 32 bits
+    const v128_t a_odd = wasm_u32x4_shr(a.val, 16);                       // odd-indexed 16-bit lanes, zero-extended to 32 bits
+    const v128_t b_even = wasm_u32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
+    const v128_t b_odd = wasm_u32x4_shr(b.val, 16);
+    return v_uint64x2((
+        v_dotprod(v_int32x4(a_even), v_int32x4(b_even)) +
+        v_dotprod(v_int32x4(a_odd), v_int32x4(b_odd))).val
+    );
+}
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)  // two-argument v_dotprod_expand plus the accumulator c
+{ return v_dotprod_expand(a, b) + c; }
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)  // each 64-bit output lane sums the products of four consecutive 16-bit pairs
+{
+    const v128_t a_even = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);  // even-indexed 16-bit lanes, sign-extended to 32 bits
+    const v128_t a_odd = wasm_i32x4_shr(a.val, 16);                       // odd-indexed 16-bit lanes, sign-extended to 32 bits
+    const v128_t b_even = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
+    const v128_t b_odd = wasm_i32x4_shr(b.val, 16);
+    return v_int64x2((
+        v_dotprod(v_int32x4(a_even), v_int32x4(b_even)) +
+        v_dotprod(v_int32x4(a_odd), v_int32x4(b_odd)))
+    );
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)  // two-argument v_dotprod_expand plus the accumulator c
+{ return v_dotprod_expand(a, b) + c; }
+
+// 32 >> 64f
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)  // 64-bit integer v_dotprod, then converted to double with v_cvt_f64
+{ return v_cvt_f64(v_dotprod(a, b)); }
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)  // same, with the accumulator c added in floating point
+{ return v_dotprod_expand(a, b) + c; }
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)  // on this backend the fast variant simply forwards to v_dotprod
+{ return v_dotprod(a, b); }
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)  // forwards to the accumulating v_dotprod
+{ return v_dotprod(a, b, c); }
+
+// 32 >> 64
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod(a, b); }
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{ return v_dotprod(a, b, c); }
+
+// 8 >> 32 (every *_expand_fast overload in this section forwards to v_dotprod_expand)
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{ return v_dotprod_expand(a, b); }
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{ return v_dotprod_expand(a, b, c); } // accumulating form
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{ return v_dotprod_expand(a, b); }
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{ return v_dotprod_expand(a, b, c); } // accumulating form
+
+// 16 >> 64 (forwarders, unsigned then signed)
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_dotprod_expand(a, b); }
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{ return v_dotprod_expand(a, b, c); } // accumulating form
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{ return v_dotprod_expand(a, b); }
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{ return v_dotprod_expand(a, b, c); } // accumulating form
+
+// 32 >> 64f (forwarders returning double lanes)
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod_expand(a, b); }
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_dotprod_expand(a, b, c); } // accumulating form
+
+#define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) /* bitwise &, |, ^ and ~ for one vector type */ \
+OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) /* BIN_OP is defined outside this chunk */ \
+OPENCV_HAL_IMPL_WASM_BIN_OP(|, _Tpvec, wasm_v128_or) \
+OPENCV_HAL_IMPL_WASM_BIN_OP(^, _Tpvec, wasm_v128_xor) \
+inline _Tpvec operator ~ (const _Tpvec& a) \
+{ \
+    return _Tpvec(wasm_v128_not(a.val)); /* complement of all 128 bits, lane type is irrelevant */ \
+}
+
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint8x16)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int8x16)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint16x8)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int16x8)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint32x4)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int32x4)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint64x2)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int64x2)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float32x4) // float vectors get the same raw-bit operators
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float64x2)
+
+inline v_float32x4 v_sqrt(const v_float32x4& x)
+{
+    return v_float32x4(wasm_f32x4_sqrt(x.val)); // square root of each of the four float lanes
+}
+
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{
+    const v128_t root = wasm_f32x4_sqrt(x.val);   // sqrt(x) in every lane
+    return v_float32x4(wasm_f32x4_div(wasm_f32x4_splat(1.0), root));   // 1 / sqrt(x) via a real divide
+}
+
+inline v_float64x2 v_sqrt(const v_float64x2& x)
+{
+    return v_float64x2(wasm_f64x2_sqrt(x.val)); // square root of each of the two double lanes
+}
+
+inline v_float64x2 v_invsqrt(const v_float64x2& x)
+{
+    const v128_t root = wasm_f64x2_sqrt(x.val);   // sqrt(x) in every lane
+    return v_float64x2(wasm_f64x2_div(wasm_f64x2_splat(1.0), root));   // 1 / sqrt(x) via a real divide
+}
+
+#define OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(_Tpuvec, _Tpsvec, suffix, zsuffix, shiftWidth) /* v_abs: signed vector in, unsigned vector out */ \
+inline _Tpuvec v_abs(const _Tpsvec& x) \
+{ \
+    v128_t s = wasm_##suffix##_shr(x.val, shiftWidth); /* unsigned shift by (bits-1): 1 in negative lanes, else 0 */ \
+    v128_t f = wasm_##zsuffix##_shr(x.val, shiftWidth); /* signed shift by (bits-1): all ones in negative lanes, else 0 */ \
+    return _Tpuvec(wasm_##zsuffix##_add(wasm_v128_xor(x.val, f), s)); /* (x ^ f) + s: two's-complement negation only where x < 0 */ \
+}
+
+OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint8x16, v_int8x16, u8x16, i8x16, 7)
+OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint16x8, v_int16x8, u16x8, i16x8, 15)
+OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint32x4, v_int32x4, u32x4, i32x4, 31)
+
+inline v_float32x4 v_abs(const v_float32x4& x)
+{ return v_float32x4(wasm_f32x4_abs(x.val)); } // lane-wise |x| through the f32x4 abs intrinsic
+inline v_float64x2 v_abs(const v_float64x2& x)
+{
+    return v_float64x2(wasm_f64x2_abs(x.val)); // lane-wise |x| through the f64x2 abs intrinsic
+}
+
+// TODO: exp, log, sin, cos
+
+#define OPENCV_HAL_IMPL_WASM_BIN_FUNC(_Tpvec, func, intrin) /* defines func(a, b) as a direct call of a two-operand intrinsic */ \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_min, wasm_f32x4_min) // float min/max map straight onto intrinsics
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_max, wasm_f32x4_max)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_min, wasm_f64x2_min)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_max, wasm_f64x2_max)
+
+#define OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(_Tpvec, suffix) /* signed integer v_min / v_max built from compare + select */ \
+inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(wasm_v128_bitselect(b.val, a.val, wasm_##suffix##_gt(a.val, b.val))); /* take b where a > b, otherwise a */ \
+} \
+inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(wasm_v128_bitselect(a.val, b.val, wasm_##suffix##_gt(a.val, b.val))); /* take a where a > b, otherwise b */ \
+}
+
+OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int8x16, i8x16)
+OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int16x8, i16x8)
+OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int32x4, i32x4)
+
+#define OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(_Tpvec, suffix, deltaNum) /* unsigned v_min / v_max using the signed compare */ \
+inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    v128_t delta = wasm_##suffix##_splat(deltaNum); /* deltaNum is the lane's top bit at every instantiation */ \
+    v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); /* flipping the top bit maps unsigned order onto signed order */ \
+    return _Tpvec(wasm_v128_bitselect(b.val, a.val, mask)); /* b where a > b, otherwise a */ \
+} \
+inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    v128_t delta = wasm_##suffix##_splat(deltaNum); \
+    v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); /* same biased comparison as in v_min */ \
+    return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask)); /* a where a > b, otherwise b */ \
+}
+
+OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint8x16, i8x16, (schar)0x80)
+OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint16x8, i16x8, (short)0x8000)
+OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint32x4, i32x4, (int)0x80000000)
+
+#define OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(_Tpvec, suffix, esuffix) /* suffix selects the ordered compares, esuffix the equality compares */ \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(wasm_##esuffix##_eq(a.val, b.val)); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(wasm_##esuffix##_ne(a.val, b.val)); } \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(wasm_##suffix##_lt(a.val, b.val)); } \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(wasm_##suffix##_gt(a.val, b.val)); } \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(wasm_##suffix##_le(a.val, b.val)); } \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(wasm_##suffix##_ge(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint8x16, u8x16, i8x16) // unsigned types: u* ordering, i* equality
+OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int8x16, i8x16, i8x16)
+OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint16x8, u16x8, i16x8)
+OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int16x8, i16x8, i16x8)
+OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint32x4, u32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int32x4, i32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float32x4, f32x4, f32x4) // float types: floating-point compares throughout
+OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float64x2, f64x2, f64x2)
+
+#define OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(_Tpvec, cast) /* 64-bit integer ==/!= done on the bits; an f64 compare would treat 0 and 0x8000000000000000 as equal and NaN patterns as unequal to themselves. 'cast' is kept only for source compatibility */ \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) /* a 64-bit lane is all ones iff both of its 32-bit halves compare equal */ \
+{ v128_t e = wasm_i32x4_eq(a.val, b.val); return _Tpvec(wasm_v128_and(e, wasm_v8x16_shuffle(e, e, 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11))); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) /* bitwise complement of the equality mask above */ \
+{ v128_t e = wasm_i32x4_eq(a.val, b.val); return _Tpvec(wasm_v128_not(wasm_v128_and(e, wasm_v8x16_shuffle(e, e, 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11)))); }
+
+OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
+OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)
+
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{
+    // NaN is the only value unequal to itself, so (a == a) is all ones for every non-NaN lane.
+    // The former "(bits & 0x7fffffff) < 0x7f800000" test also cleared the mask for +/-infinity.
+    return v_float32x4(wasm_f32x4_eq(a.val, a.val));
+}
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{
+    // NaN is the only value unequal to itself, so (a == a) is all ones for every non-NaN lane.
+    // The former "(bits & abs-mask) < 0x7ff0000000000000" test also cleared the mask for +/-infinity.
+    return v_float64x2(wasm_f64x2_eq(a.val, a.val));
+}
+
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_add_wrap, wasm_i8x16_add) // wrap-around add: the i8x16/i16x8 add intrinsic serves signed and unsigned alike
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_add_wrap, wasm_i8x16_add)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_add_wrap, wasm_i16x8_add)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_add_wrap, wasm_i16x8_add)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_sub_wrap, wasm_i8x16_sub) // wrap-around subtract, same sharing of intrinsics
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_sub_wrap, wasm_i8x16_sub)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_sub_wrap, wasm_i16x8_sub)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_sub_wrap, wasm_i16x8_sub)
+#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) >= (1039012)
+// details: https://github.com/opencv/opencv/issues/18097 ( https://github.com/emscripten-core/emscripten/issues/12018 )
+// 1.39.12: https://github.com/emscripten-core/emscripten/commit/cd801d0f110facfd694212a3c8b2ed2ffcd630e2
+inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
+{
+    uchar lhs[16], rhs[16];                             // scalar path: spill both operands to memory
+    wasm_v128_store(lhs, a.val);
+    wasm_v128_store(rhs, b.val);
+    for (int lane = 0; lane < 16; lane++)
+        lhs[lane] = (uchar)(lhs[lane] * rhs[lane]);     // keep only the low 8 bits of each product
+    return v_uint8x16(wasm_v128_load(lhs));
+}
+inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
+{
+    schar lhs[16], rhs[16];                             // scalar path: spill both operands to memory
+    wasm_v128_store(lhs, a.val);
+    wasm_v128_store(rhs, b.val);
+    for (int lane = 0; lane < 16; lane++)
+        lhs[lane] = (schar)(lhs[lane] * rhs[lane]);     // keep only the low 8 bits of each product
+    return v_int8x16(wasm_v128_load(lhs));
+}
+#else
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_mul_wrap, wasm_i8x16_mul) // older toolchains: use the 8-bit multiply intrinsic
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_mul_wrap, wasm_i8x16_mul)
+#endif
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_mul_wrap, wasm_i16x8_mul) // 16-bit lanes always use the intrinsic
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_mul_wrap, wasm_i16x8_mul)
+
+
+/** Absolute difference **/
+
+inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
+{ const v_uint8x16 a_minus_b = a - b, b_minus_a = b - a; return v_add_wrap(a_minus_b, b_minus_a); } // operator- is defined outside this chunk; presumably saturating, so one term is 0
+inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
+{ const v_uint16x8 a_minus_b = a - b, b_minus_a = b - a; return v_add_wrap(a_minus_b, b_minus_a); } // same scheme for 16-bit lanes
+inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
+{ const v_uint32x4 larger = v_max(a, b), smaller = v_min(a, b); return larger - smaller; } // max - min never goes below zero
+
+inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
+{
+    const v_int8x16 diff = v_sub_wrap(a, b);   // wrapping a - b
+    const v_int8x16 neg = a < b;               // all ones in lanes where the difference must be negated
+    return v_reinterpret_as_u8(v_sub_wrap(diff ^ neg, neg));   // (d ^ m) - m negates exactly those lanes
+}
+inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
+{   // Wrapping (max - min); the bits are then read as an unsigned value.
+    return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));
+}
+inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
+{
+    const v_int32x4 diff = a - b;              // a - b through the v_int32x4 operator- (defined outside this chunk)
+    const v_int32x4 neg = a < b;               // all ones in lanes where the difference must be negated
+    return v_reinterpret_as_u32((diff ^ neg) - neg);           // (d ^ m) - m negates exactly those lanes
+}
+
+/** Saturating absolute difference **/
+inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
+{
+    const v_int8x16 diff = a - b;   // v_int8x16 operator- (defined outside this chunk; presumably saturating)
+    const v_int8x16 neg = a < b;    // all ones in lanes where a < b
+    return (diff ^ neg) - neg;      // conditional negate of those lanes, result stays a signed vector
+}
+inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
+{ const v_int16x8 larger = v_max(a, b), smaller = v_min(a, b); return larger - smaller; } // max - min through the v_int16x8 operator-
+
+
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return a * b + c; // written as a separate multiply and add on the vector operators
+}
+
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_fma(a, b, c); // integer muladd is just an alias of v_fma
+}
+
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    return a * b + c; // operator* then operator+; no fused intrinsic is used here
+}
+
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    return a * b + c; // operator* then operator+; no fused intrinsic is used here
+}
+
+inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
+{
+    const v128_t diff = wasm_f32x4_sub(a.val, b.val);                            // a - b per lane
+    return v_float32x4(wasm_v128_and(diff, wasm_i32x4_splat(0x7fffffff)));       // clear the sign bit -> |a - b|
+}
+inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
+{
+    const v128_t diff = wasm_f64x2_sub(a.val, b.val);                            // a - b per lane
+    return v_float64x2(wasm_v128_and(diff, wasm_u64x2_shr(wasm_i32x4_splat(-1), 1)));   // all-ones >> 1 keeps everything but the sign bit
+}
+
+#define OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(_Tpvec, suffix) /* v_magnitude, v_sqr_magnitude and v_muladd for one float vector type */ \
+inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
+    v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
+    return _Tpvec(wasm_##suffix##_sqrt(wasm_##suffix##_add(a_Square, b_Square))); /* sqrt(a*a + b*b) per lane */ \
+} \
+inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
+    v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
+    return _Tpvec(wasm_##suffix##_add(a_Square, b_Square)); /* a*a + b*b per lane, no square root */ \
+} \
+inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+{ \
+    return _Tpvec(wasm_##suffix##_add(wasm_##suffix##_mul(a.val, b.val), c.val)); /* separate multiply then add */ \
+}
+
+OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float32x4, f32x4)
+OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float64x2, f64x2)
+
+#define OPENCV_HAL_IMPL_WASM_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, ssuffix) /* shifts for an unsigned/signed pair; suffix is the i* family, ssuffix the u* family */ \
+inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
+{ \
+    return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); /* left shift uses the same intrinsic for both signednesses */ \
+} \
+inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
+{ \
+    return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
+} \
+inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
+{ \
+    return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); /* unsigned right shift: u* intrinsic */ \
+} \
+inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
+{ \
+    return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); /* signed right shift: i* intrinsic */ \
+} \
+template<int imm> \
+inline _Tpuvec v_shl(const _Tpuvec& a) /* compile-time shift amount variants of the four operators above */ \
+{ \
+    return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpsvec v_shl(const _Tpsvec& a) \
+{ \
+    return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpuvec v_shr(const _Tpuvec& a) \
+{ \
+    return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpsvec v_shr(const _Tpsvec& a) \
+{ \
+    return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
+}
+
+OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint8x16, v_int8x16, i8x16, u8x16)
+OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint16x8, v_int16x8, i16x8, u16x8)
+OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint32x4, v_int32x4, i32x4, u32x4)
+OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint64x2, v_int64x2, i64x2, u64x2)
+
+namespace hal_wasm_internal
+{
+    template <int imm,
+        bool is_invalid = ((imm < 0) || (imm > 16)),
+        bool is_first = (imm == 0),
+        bool is_second = (imm == 16),
+        bool is_other = (((imm > 0) && (imm < 16)))>
+    class v_wasm_palignr_u8_class;   // dispatches on imm through the four defaulted flags
+    // Out-of-range imm: declared but never defined, so any use fails to compile.
+    template <int imm>
+    class v_wasm_palignr_u8_class<imm, true, false, false, false>;
+    // imm == 0: the result is exactly the first operand.
+    template <int imm>
+    class v_wasm_palignr_u8_class<imm, false, true, false, false>
+    {
+    public:
+        inline v128_t operator()(const v128_t& a, const v128_t&) const
+        {
+            return a;
+        }
+    };
+    // imm == 16: the result is exactly the second operand.
+    template <int imm>
+    class v_wasm_palignr_u8_class<imm, false, false, true, false>
+    {
+    public:
+        inline v128_t operator()(const v128_t&, const v128_t& b) const
+        {
+            return b;
+        }
+    };
+    // 0 < imm < 16: bytes imm .. imm+15 of the 32-byte sequence formed by a followed by b.
+    template <int imm>
+    class v_wasm_palignr_u8_class<imm, false, false, false, true>
+    {
+    public:
+        inline v128_t operator()(const v128_t& a, const v128_t& b) const
+        {
+            // Shuffle indices 0..15 pick bytes of a, 16..31 pick bytes of b.
+            return wasm_v8x16_shuffle(a, b,
+                                      imm, imm+1, imm+2, imm+3,
+                                      imm+4, imm+5, imm+6, imm+7,
+                                      imm+8, imm+9, imm+10, imm+11,
+                                      imm+12, imm+13, imm+14, imm+15);
+        }
+    };
+    // Entry point: byte-wise extraction from (a, b) at a compile-time offset in [0, 16].
+    template <int imm>
+    inline v128_t v_wasm_palignr_u8(const v128_t& a, const v128_t& b)
+    {
+        CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_wasm_palignr_u8.");
+        return v_wasm_palignr_u8_class<imm>()(a, b);
+    }
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec &a)
+{
+    // The lane count is turned into a byte offset for the byte-wise helper.
+    enum { byte_shift = (imm * sizeof(typename _Tpvec::lane_type)) };
+    const v128_t zeros = wasm_i8x16_splat(0);   // vacated high lanes are filled with zero
+    return _Tpvec(hal_wasm_internal::v_wasm_palignr_u8<byte_shift>(a.val, zeros));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec &a)
+{
+    // Moving left by imm lanes == reading (zeros, a) starting (nlanes - imm) lanes in.
+    enum { byte_shift = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
+    const v128_t zeros = wasm_i8x16_splat(0);   // vacated low lanes are filled with zero
+    return _Tpvec(hal_wasm_internal::v_wasm_palignr_u8<byte_shift>(zeros, a.val));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
+{
+    // Lanes shifted out of a are refilled from the low lanes of b.
+    enum { byte_shift = (imm * sizeof(typename _Tpvec::lane_type)) };
+    return _Tpvec(hal_wasm_internal::v_wasm_palignr_u8<byte_shift>(a.val, b.val));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
+{
+    // Low lanes of the result come from the high lanes of b, the rest from a.
+    enum { byte_shift = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
+    return _Tpvec(hal_wasm_internal::v_wasm_palignr_u8<byte_shift>(b.val, a.val));
+}
+
+#define OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(_Tpvec, _Tp) /* load/store family for one vector type and its element type */ \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(wasm_v128_load(ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) /* same intrinsic as v_load; no separate aligned path here */ \
+{ return _Tpvec(wasm_v128_load(ptr)); } \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ \
+    _Tp tmp[_Tpvec::nlanes] = {0}; /* upper half of the register stays zero */ \
+    for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
+        tmp[i] = ptr[i]; /* only nlanes/2 elements are read from ptr */ \
+    } \
+    return _Tpvec(wasm_v128_load(tmp)); \
+} \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+    _Tp tmp[_Tpvec::nlanes]; /* every element is written by the loop below */ \
+    for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
+        tmp[i] = ptr0[i]; /* low half from ptr0 */ \
+        tmp[i+_Tpvec::nlanes/2] = ptr1[i]; /* high half from ptr1 */ \
+    } \
+    return _Tpvec(wasm_v128_load(tmp)); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ wasm_v128_store(ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) /* all store flavours below use the same plain store */ \
+{ wasm_v128_store(ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ wasm_v128_store(ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) /* the mode argument is ignored */ \
+{ \
+    wasm_v128_store(ptr, a.val); \
+} \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); /* spill the whole register, then copy out half of it */ \
+    for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
+        ptr[i] = a_[i]; \
+} \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
+        ptr[i] = a_[i + (_Tpvec::nlanes / 2)]; /* upper lanes go to ptr[0 .. nlanes/2) */ \
+}
+
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint8x16, uchar)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int8x16, schar)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint16x8, ushort)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int16x8, short)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint32x4, unsigned)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int32x4, int)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint64x2, uint64)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int64x2, int64)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float32x4, float) // despite the _INT_ in the name, float types use it too
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float64x2, double)
+
+
+/** Reverse **/
+inline v_uint8x16 v_reverse(const v_uint8x16 &a)
+{ return v_uint8x16(wasm_v8x16_shuffle(a.val, a.val, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); } // byte order fully mirrored
+
+inline v_int8x16 v_reverse(const v_int8x16 &a)
+{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } // reuses the unsigned overload
+
+inline v_uint16x8 v_reverse(const v_uint16x8 &a)
+{ return v_uint16x8(wasm_v8x16_shuffle(a.val, a.val, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); } // 2-byte groups reversed, bytes inside a group keep their order
+
+inline v_int16x8 v_reverse(const v_int16x8 &a)
+{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } // reuses the unsigned overload
+
+inline v_uint32x4 v_reverse(const v_uint32x4 &a)
+{ return v_uint32x4(wasm_v8x16_shuffle(a.val, a.val, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)); } // 4-byte groups reversed
+
+inline v_int32x4 v_reverse(const v_int32x4 &a)
+{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } // reuses the unsigned overload
+
+inline v_float32x4 v_reverse(const v_float32x4 &a)
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } // floats are reversed as raw 32-bit lanes
+
+inline v_uint64x2 v_reverse(const v_uint64x2 &a)
+{ return v_uint64x2(wasm_v8x16_shuffle(a.val, a.val, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); } // the two 8-byte halves swapped
+
+inline v_int64x2 v_reverse(const v_int64x2 &a)
+{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } // reuses the unsigned overload
+
+inline v_float64x2 v_reverse(const v_float64x2 &a)
+{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } // doubles are reversed as raw 64-bit lanes
+
+
+#define OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) /* horizontal sum of four 32-bit lanes */ \
+inline scalartype v_reduce_sum(const _Tpvec& a) \
+{ \
+    regtype val = a.val; \
+    val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); /* add the copy with 64-bit halves swapped */ \
+    val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3)); /* add the copy moved by one 32-bit lane */ \
+    return (scalartype)wasm_##esuffix##_extract_lane(val, 0); /* lane 0 now holds the sum of all four inputs */ \
+}
+
+OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_uint32x4, unsigned, v128_t, i32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_int32x4, int, v128_t, i32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_float32x4, float, v128_t, f32x4, f32x4)
+
+// To do: Optimize v_reduce_sum with wasm intrin.
+//        Now use fallback implementation as there is no widening op in wasm intrin.
+
+#define OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(_Tpvec, scalartype) /* scalar-loop horizontal sum for 8- and 16-bit lanes */ \
+inline scalartype v_reduce_sum(const _Tpvec& a) \
+{ \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); /* spill the register so the lanes can be read one by one */ \
+    scalartype c = a_[0]; /* accumulate in scalartype, which is wider than the lane type at every instantiation */ \
+    for (int i = 1; i < _Tpvec::nlanes; i++) \
+        c += a_[i]; \
+    return c; \
+}
+
+OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint8x16, unsigned)
+OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int8x16, int)
+OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint16x8, unsigned)
+OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int16x8, int)
+
+
+#define OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) /* horizontal sum of two 64-bit lanes */ \
+inline scalartype v_reduce_sum(const _Tpvec& a) \
+{ \
+    regtype val = a.val; \
+    val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); /* add the copy with the two lanes swapped */ \
+    return (scalartype)wasm_##esuffix##_extract_lane(val, 0); /* lane 0 holds lane0 + lane1 */ \
+}
+OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_uint64x2, uint64, v128_t, i64x2, i64x2)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_int64x2, int64,  v128_t, i64x2, i64x2)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_float64x2, double,  v128_t, f64x2,f64x2)
+
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{   // Two interleave-and-add rounds; wasm_unpack{lo,hi}_i32x4 are helpers defined outside this chunk (presumably 32-bit interleaves).
+    const v128_t sum_ac = wasm_f32x4_add(wasm_unpacklo_i32x4(a.val, c.val), wasm_unpackhi_i32x4(a.val, c.val));
+    const v128_t sum_bd = wasm_f32x4_add(wasm_unpacklo_i32x4(b.val, d.val), wasm_unpackhi_i32x4(b.val, d.val));
+    return v_float32x4(wasm_f32x4_add(wasm_unpacklo_i32x4(sum_ac, sum_bd), wasm_unpackhi_i32x4(sum_ac, sum_bd)));
+}
+
+#define OPENCV_HAL_IMPL_WASM_REDUCE_OP(_Tpvec, scalartype, func, scalar_func) /* defines v_reduce_<func> as a scalar fold over the lanes */ \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    scalartype buf[_Tpvec::nlanes]; \
+    v_store(buf, a); /* spill the register to a lane array */ \
+    scalartype tmp = buf[0]; \
+    for (int i=1; i<_Tpvec::nlanes; ++i) { \
+        tmp = scalar_func(tmp, buf[i]); /* fold with std::max or std::min */ \
+    } \
+    return tmp; \
+}
+
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, max, std::max)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, min, std::min)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, max, std::max)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, min, std::min)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, max, std::max)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, min, std::min)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, max, std::max)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, min, std::min)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, max, std::max)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, min, std::min)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, max, std::max)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, min, std::min)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, max, std::max)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, min, std::min)
+
+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{
+    v_uint16x8 lo16, hi16;
+    v_expand(v_absdiff(a, b), lo16, hi16);   // |a - b| widened from 8-bit to 16-bit lanes
+    v_uint32x4 q0, q1, q2, q3;
+    v_expand(lo16, q0, q1);                  // ... and again to 32-bit lanes
+    v_expand(hi16, q2, q3);
+    return v_reduce_sum(q0+q1+q2+q3);
+}
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{
+    v_uint16x8 lo16, hi16;
+    v_expand(v_absdiff(a, b), lo16, hi16);   // v_absdiff of signed inputs already yields an unsigned vector
+    v_uint32x4 q0, q1, q2, q3;
+    v_expand(lo16, q0, q1);                  // widen to 32-bit lanes before summing
+    v_expand(hi16, q2, q3);
+    return v_reduce_sum(q0+q1+q2+q3);
+}
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{
+    v_uint32x4 lo32, hi32;
+    v_expand(v_absdiff(a, b), lo32, hi32);   // |a - b| widened from 16-bit to 32-bit lanes
+    return v_reduce_sum(lo32 + hi32);
+}
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{
+    v_uint32x4 lo32, hi32;
+    v_expand(v_absdiff(a, b), lo32, hi32);   // v_absdiff of signed inputs already yields an unsigned vector
+    return v_reduce_sum(lo32 + hi32);
+}
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{
+    return v_reduce_sum(v_absdiff(a, b)); // no widening step for 32-bit lanes: the sum is taken in unsigned
+}
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{
+    return v_reduce_sum(v_absdiff(a, b)); // v_absdiff(v_int32x4) returns v_uint32x4
+}
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{
+    return v_reduce_sum(v_absdiff(a, b)); // float sum of |a - b| over the four lanes
+}
+
+inline v_uint8x16 v_popcount(const v_uint8x16& a)
+{
+    // Bit count per byte: sum adjacent 1-bit, then 2-bit, then 4-bit fields.
+    const v128_t pair_mask   = wasm_i32x4_splat(0x55555555);
+    const v128_t nibble_mask = wasm_i32x4_splat(0x33333333);
+    const v128_t byte_mask   = wasm_i32x4_splat(0x0f0f0f0f);
+    v128_t cnt = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(a.val, 1), pair_mask), wasm_v128_and(a.val, pair_mask));
+    cnt = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(cnt, 2), nibble_mask), wasm_v128_and(cnt, nibble_mask));
+    cnt = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(cnt, 4), byte_mask), wasm_v128_and(cnt, byte_mask));
+    return v_uint8x16(cnt);
+}
+inline v_uint16x8 v_popcount(const v_uint16x8& a)
+{
+    v_uint8x16 counts = v_popcount(v_reinterpret_as_u8(a));   // bit count of every byte
+    counts += v_rotate_right<1>(counts);                      // byte i += byte i+1, so each low byte holds its lane's total
+    return v_reinterpret_as_u16(counts) & v_setall_u16(0x00ff);
+}
+inline v_uint32x4 v_popcount(const v_uint32x4& a)
+{
+    v_uint8x16 counts = v_popcount(v_reinterpret_as_u8(a));   // bit count of every byte
+    counts += v_rotate_right<1>(counts);                      // byte i += byte i+1
+    counts += v_rotate_right<2>(counts);                      // byte i += byte i+2, so each lowest byte holds its lane's total
+    return v_reinterpret_as_u32(counts) & v_setall_u32(0x000000ff);
+}
+inline v_uint64x2 v_popcount(const v_uint64x2& a)
+{
+    uint64 src[2], totals[2] = { 0 };                 // totals[k] accumulates the bit count of lane k
+    wasm_v128_store(src, a.val);
+    for (int k = 0; k < 16; k++)                      // one table lookup per byte; bytes 0..7 -> lane 0, 8..15 -> lane 1
+        totals[k / 8] += popCountTable[((uint8_t*)src)[k]];
+    return v_uint64x2(wasm_v128_load(totals));
+}
+inline v_uint8x16 v_popcount(const v_int8x16& a)
+{ return v_popcount(v_reinterpret_as_u8(a)); } // signed inputs are counted through the unsigned overload
+inline v_uint16x8 v_popcount(const v_int16x8& a)
+{ return v_popcount(v_reinterpret_as_u16(a)); } // signed inputs are counted through the unsigned overload
+inline v_uint32x4 v_popcount(const v_int32x4& a)
+{ return v_popcount(v_reinterpret_as_u32(a)); } // signed inputs are counted through the unsigned overload
+inline v_uint64x2 v_popcount(const v_int64x2& a)
+{ return v_popcount(v_reinterpret_as_u64(a)); } // signed inputs are counted through the unsigned overload
+
+#define OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(_Tpvec, suffix, scalarType) /* sign-bit queries; suffix must name the integer lane family of the same width */ \
+inline int v_signmask(const _Tpvec& a) \
+{ \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    int mask = 0; \
+    for (int i = 0; i < _Tpvec::nlanes; i++) \
+        mask |= (reinterpret_int(a_[i]) < 0) << i; /* bit i = sign bit of lane i */ \
+    return mask; \
+} \
+inline bool v_check_all(const _Tpvec& a) /* arithmetic shift by (bits-1) smears the sign bit, so it is tested on the raw bits like v_signmask; a float '< 0' compare would miss all-ones (NaN) mask lanes and -0.0 */ \
+{ return wasm_i8x16_all_true(wasm_##suffix##_shr(a.val, (int)sizeof(_Tpvec::lane_type) * 8 - 1)); } \
+inline bool v_check_any(const _Tpvec& a) /* same sign smear; true when at least one lane has its sign bit set */ \
+{ return wasm_i8x16_any_true(wasm_##suffix##_shr(a.val, (int)sizeof(_Tpvec::lane_type) * 8 - 1)); }
+
+OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint8x16, i8x16, schar)
+OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int8x16, i8x16, schar)
+OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint16x8, i16x8, short)
+OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int16x8, i16x8, short)
+OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint32x4, i32x4, int)
+OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int32x4, i32x4, int)
+OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float32x4, i32x4, float)
+OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float64x2, i64x2, double) // integer family here as well, matching v_float32x4
+
+#define OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(_Tpvec, suffix, esuffix) /* sign checks for 64-bit lanes done with 32-bit compares */ \
+inline bool v_check_all(const _Tpvec& a) \
+{ \
+    v128_t masked = v_reinterpret_as_##esuffix(a).val; \
+    masked = wasm_i32x4_replace_lane(masked, 0, -1); /* force the low halves negative so only the high halves decide */ \
+    masked = wasm_i32x4_replace_lane(masked, 2, -1); \
+    return wasm_i8x16_all_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \
+} \
+inline bool v_check_any(const _Tpvec& a) \
+{ \
+    v128_t masked = v_reinterpret_as_##esuffix(a).val; \
+    masked = wasm_i32x4_replace_lane(masked, 0, 0); /* force the low halves non-negative so they cannot trigger */ \
+    masked = wasm_i32x4_replace_lane(masked, 2, 0); \
+    return wasm_i8x16_any_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \
+}
+
+OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_int64x2, i32x4, s32)
+OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_uint64x2, i32x4, u32)
+
+
+inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } // trailing-zero count of the per-byte sign mask
+inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } // byte position / 2 = 16-bit lane index
+inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } // byte position / 4 = 32-bit lane index
+inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } // byte position / 8 = 64-bit lane index
+inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+
+#define OPENCV_HAL_IMPL_WASM_SELECT(_Tpvec) /* v_select(mask, a, b): bitwise choice between a and b */ \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask.val)); /* bits of a where mask is 1, bits of b where it is 0 */ \
+}
+
+OPENCV_HAL_IMPL_WASM_SELECT(v_uint8x16)
+OPENCV_HAL_IMPL_WASM_SELECT(v_int8x16)
+OPENCV_HAL_IMPL_WASM_SELECT(v_uint16x8)
+OPENCV_HAL_IMPL_WASM_SELECT(v_int16x8)
+OPENCV_HAL_IMPL_WASM_SELECT(v_uint32x4)
+OPENCV_HAL_IMPL_WASM_SELECT(v_int32x4)
+OPENCV_HAL_IMPL_WASM_SELECT(v_uint64x2)
+OPENCV_HAL_IMPL_WASM_SELECT(v_int64x2)
+OPENCV_HAL_IMPL_WASM_SELECT(v_float32x4)
+OPENCV_HAL_IMPL_WASM_SELECT(v_float64x2)
+
+#define OPENCV_HAL_IMPL_WASM_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin)  /* widening helpers: intrin widens the low half, intrin##_high the high half */ \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1)      \
+{                                                                    \
+    b0.val = intrin(a.val);                                          \
+    b1.val = __CV_CAT(intrin, _high)(a.val);                         \
+}                                                                    \
+inline _Tpwvec v_expand_low(const _Tpvec& a)                         \
+{ return _Tpwvec(intrin(a.val)); }                                   \
+inline _Tpwvec v_expand_high(const _Tpvec& a)                        \
+{ return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); }                  \
+inline _Tpwvec v_load_expand(const _Tp* ptr)                         \
+{                                                                    \
+    v128_t a = v_load_low(ptr).val;  /* read only the nlanes/2 elements that are widened; a full 16-byte load ran past them */ \
+    return _Tpwvec(intrin(a));                                       \
+}
+
+OPENCV_HAL_IMPL_WASM_EXPAND(v_uint8x16, v_uint16x8, uchar, v128_cvtu8x16_i16x8)
+OPENCV_HAL_IMPL_WASM_EXPAND(v_int8x16,  v_int16x8,  schar, v128_cvti8x16_i16x8)
+OPENCV_HAL_IMPL_WASM_EXPAND(v_uint16x8, v_uint32x4, ushort, v128_cvtu16x8_i32x4)
+OPENCV_HAL_IMPL_WASM_EXPAND(v_int16x8,  v_int32x4,  short, v128_cvti16x8_i32x4)
+OPENCV_HAL_IMPL_WASM_EXPAND(v_uint32x4, v_uint64x2, unsigned, v128_cvtu32x4_i64x2)
+OPENCV_HAL_IMPL_WASM_EXPAND(v_int32x4,  v_int64x2,  int, v128_cvti32x4_i64x2)
+
+#define OPENCV_HAL_IMPL_WASM_EXPAND_Q(_Tpvec, _Tp, intrin) /* load at ptr, then convert through intrin into _Tpvec */ \
+inline _Tpvec v_load_expand_q(const _Tp* ptr)               \
+{ /* NOTE(review): loads a full 128-bit vector; presumably only the first quarter is consumed by intrin -- confirm */ \
+    v128_t a = wasm_v128_load(ptr);                         \
+    return _Tpvec(intrin(a));                               \
+}
+// 8-bit source elements (uchar / schar) delivered in 32-bit lane vectors.
+OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_uint32x4, uchar, v128_cvtu8x16_i32x4)
+OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_int32x4, schar, v128_cvti8x16_i32x4)
+
+#define OPENCV_HAL_IMPL_WASM_UNPACKS(_Tpvec, suffix) /* zip, combine and recombine built on the wasm_unpacklo_ / wasm_unpackhi_ helpers */ \
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
+{ /* NOTE(review): b0 is written before a0/a1 are read again for b1, so an output that aliases an input changes the result */ \
+    b0.val = wasm_unpacklo_##suffix(a0.val, a1.val); \
+    b1.val = wasm_unpackhi_##suffix(a0.val, a1.val); \
+} \
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
+{ /* always uses the i64x2 unpack, independent of suffix */ \
+    return _Tpvec(wasm_unpacklo_i64x2(a.val, b.val)); \
+} \
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
+{ /* i64x2 unpack, high variant */ \
+    return _Tpvec(wasm_unpackhi_i64x2(a.val, b.val)); \
+} \
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
+{ /* c matches v_combine_low(a, b), d matches v_combine_high(a, b); c is written first, so the v_zip aliasing caveat applies */ \
+    c.val = wasm_unpacklo_i64x2(a.val, b.val); \
+    d.val = wasm_unpackhi_i64x2(a.val, b.val); \
+}
+// Float vectors are instantiated with the integer suffix of the same lane width.
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint8x16, i8x16)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_int8x16, i8x16)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint16x8, i16x8)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_int16x8, i16x8)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_int32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_float32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_float64x2, i64x2)
+
+// Generic v_extract: forwards to the two-operand v_rotate_right<s>
+// overload with the same arguments and returns its result.
+template<int s, typename _Tpvec>
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
+{ return v_rotate_right<s>(a, b); }
+
+inline v_int32x4 v_round(const v_float32x4& a)
+{   // Round to nearest, ties away from zero: the 0.5 bias takes the sign of a (a plain +0.5 turned -1.2 into 0), then truncate.
+    v128_t h = wasm_v128_or(wasm_f32x4_splat(0.5f), wasm_v128_and(a.val, wasm_f32x4_splat(-0.0f)));
+    return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(wasm_f32x4_add(a.val, h)));
+}
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{   // Truncate toward zero, then add the compare mask (all ones, i.e. -1) in lanes where a lies below the truncated value.
+    const v128_t truncated = wasm_i32x4_trunc_saturate_f32x4(a.val);
+    const v128_t below = wasm_f32x4_lt(a.val, wasm_f32x4_convert_i32x4(truncated));
+    return v_int32x4(wasm_i32x4_add(truncated, below));
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{   // Truncate toward zero, then subtract the compare mask (all ones, i.e. -1) in lanes where a lies above the truncated value.
+    const v128_t truncated = wasm_i32x4_trunc_saturate_f32x4(a.val);
+    const v128_t above = wasm_f32x4_gt(a.val, wasm_f32x4_convert_i32x4(truncated));
+    return v_int32x4(wasm_i32x4_sub(truncated, above));
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a) // saturating float -> int32 truncation of every lane
+{ const v128_t truncated = wasm_i32x4_trunc_saturate_f32x4(a.val); return v_int32x4(truncated); }
+
+// Generates func(v_float64x2) -> v_int32x4 for the double-precision
+// rounding family. There is no vector path: both doubles are stored to
+// memory, converted one at a time with the scalar cfunc, and reloaded.
+// Result lanes 2 and 3 are set to zero.
+#define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc) \
+inline v_int32x4 func(const v_float64x2& a) \
+{ \
+    double src[2]; \
+    wasm_v128_store(src, a.val); \
+    const int dst[4] = { cfunc(src[0]), cfunc(src[1]), 0, 0 }; \
+    return v_int32x4(wasm_v128_load(dst)); \
+}
+
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_round, cvRound)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_floor, cvFloor)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_ceil, cvCeil)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_trunc, int)
+
+// Rounds the four doubles a0, a1, b0, b1 with the scalar cvRound and
+// packs the results, in that order, into a single v_int32x4. The values
+// travel through memory; no vector conversion is used.
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{
+    double lo[2], hi[2];
+    wasm_v128_store(lo, a.val);
+    wasm_v128_store(hi, b.val);
+    const int packed[4] = { cvRound(lo[0]), cvRound(lo[1]),
+                            cvRound(hi[0]), cvRound(hi[1]) };
+    return v_int32x4(wasm_v128_load(packed));
+}
+
+#define OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(_Tpvec, suffix) /* treats a0..a3 as rows of a 4x4 matrix and writes its columns to b0..b3 */ \
+inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
+                           const _Tpvec& a2, const _Tpvec& a3, \
+                           _Tpvec& b0, _Tpvec& b1, \
+                           _Tpvec& b2, _Tpvec& b3) \
+{ /* stage 1: lane-wise unpack of row pairs (a0,a1) and (a2,a3); every input is read before any output is written */ \
+    v128_t t0 = wasm_unpacklo_##suffix(a0.val, a1.val); \
+    v128_t t1 = wasm_unpacklo_##suffix(a2.val, a3.val); \
+    v128_t t2 = wasm_unpackhi_##suffix(a0.val, a1.val); \
+    v128_t t3 = wasm_unpackhi_##suffix(a2.val, a3.val); \
+    /* stage 2: 64-bit unpack of the stage-1 temporaries yields the output vectors */ \
+    b0.val = wasm_unpacklo_i64x2(t0, t1); \
+    b1.val = wasm_unpackhi_i64x2(t0, t1); \
+    b2.val = wasm_unpacklo_i64x2(t2, t3); \
+    b3.val = wasm_unpackhi_i64x2(t2, t3); \
+}
+// Instantiated for the three vector types with four 32-bit lanes.
+OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_uint32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_int32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_float32x4, i32x4)
+
+// load deinterleave
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
+{   // Reads 32 bytes laid out a b a b ...; a receives the even-offset bytes, b the odd-offset bytes.
+    v128_t t00 = wasm_v128_load(ptr);
+    v128_t t01 = wasm_v128_load(ptr + 16);
+    // shuffle indices 0-15 select from t00, 16-31 from t01
+    a.val = wasm_v8x16_shuffle(t00, t01, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30);
+    b.val = wasm_v8x16_shuffle(t00, t01, 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31);
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
+{   // Splits 48 bytes laid out a b c a b c ... into three 16-lane vectors.
+    v128_t t00 = wasm_v128_load(ptr);
+    v128_t t01 = wasm_v128_load(ptr + 16);
+    v128_t t02 = wasm_v128_load(ptr + 32);
+    // Pass 1: collect each channel's bytes from the first 32 input bytes into the leading lanes; the trailing indices are placeholders.
+    v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,3,6,9,12,15,18,21,24,27,30,1,2,4,5,7);
+    v128_t t11 = wasm_v8x16_shuffle(t00, t01, 1,4,7,10,13,16,19,22,25,28,31,0,2,3,5,6);
+    v128_t t12 = wasm_v8x16_shuffle(t00, t01, 2,5,8,11,14,17,20,23,26,29,0,1,3,4,6,7);
+    // Pass 2: keep the leading lanes and overwrite the placeholders with the channel bytes held in t02 (indices 16-31).
+    a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29);
+    b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30);
+    c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31);
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
+{   // Splits 64 bytes laid out a b c d a b c d ... into four 16-lane vectors.
+    v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
+    v128_t u1 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ...
+    v128_t u2 = wasm_v128_load(ptr + 32); // a8 b8 c8 d8 ...
+    v128_t u3 = wasm_v128_load(ptr + 48); // a12 b12 c12 d12 ...
+    // Pass 1: v0/v1 each hold eight a bytes followed by eight b bytes; v2/v3 hold c bytes followed by d bytes.
+    v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29);
+    v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29);
+    v128_t v2 = wasm_v8x16_shuffle(u0, u1, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31);
+    v128_t v3 = wasm_v8x16_shuffle(u2, u3, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31);
+    // Pass 2: join the matching 8-byte halves of each pair into one full channel vector.
+    a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
+    b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
+    c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
+    d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
+{   // Splits 16 ushort values laid out a b a b ... into two 8-lane vectors.
+    v128_t v0 = wasm_v128_load(ptr);     // a0 b0 a1 b1 a2 b2 a3 b3
+    v128_t v1 = wasm_v128_load(ptr + 8); // a4 b4 a5 b5 a6 b6 a7 b7
+    // byte-level shuffle: each 16-bit element is moved as a pair of adjacent byte indices
+    a.val = wasm_v8x16_shuffle(v0, v1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29); // a0 a1 a2 a3 a4 a5 a6 a7
+    b.val = wasm_v8x16_shuffle(v0, v1, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31); // b0 b1 b2 b3 b4 b5 b6 b7
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
+{   // Splits 24 ushort values laid out a b c a b c ... into three 8-lane vectors.
+    v128_t t00 = wasm_v128_load(ptr);        // a0 b0 c0 a1 b1 c1 a2 b2
+    v128_t t01 = wasm_v128_load(ptr + 8);    // c2 a3 b3 c3 a4 b4 c4 a5
+    v128_t t02 = wasm_v128_load(ptr + 16);  // b5 c5 a6 b6 c6 a7 b7 c7
+    // Pass 1: collect each channel's elements from t00/t01 into the leading lanes; the trailing indices are placeholders.
+    v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,6,7,12,13,18,19,24,25,30,31,2,3,4,5);
+    v128_t t11 = wasm_v8x16_shuffle(t00, t01, 2,3,8,9,14,15,20,21,26,27,0,1,4,5,6,7);
+    v128_t t12 = wasm_v8x16_shuffle(t00, t01, 4,5,10,11,16,17,22,23,28,29,0,1,2,3,6,7);
+    // Pass 2: overwrite the placeholders with the remaining elements from t02 (indices 16-31).
+    a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,26,27);
+    b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,16,17,22,23,28,29);
+    c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,18,19,24,25,30,31);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
+{   // Splits 32 ushort values laid out a b c d a b c d ... into four 8-lane vectors.
+    v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1
+    v128_t u1 = wasm_v128_load(ptr + 8); // a2 b2 c2 d2 ...
+    v128_t u2 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ...
+    v128_t u3 = wasm_v128_load(ptr + 24); // a6 b6 c6 d6 ...
+    // Pass 1: group the channels in pairs, four elements of one channel followed by four of the next.
+    v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a0 a1 a2 a3 b0 b1 b2 b3
+    v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a4 a5 a6 a7 b4 b5 b6 b7
+    v128_t v2 = wasm_v8x16_shuffle(u0, u1, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c0 c1 c2 c3 d0 d1 d2 d3
+    v128_t v3 = wasm_v8x16_shuffle(u2, u3, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c4 c5 c6 c7 d4 d5 d6 d7
+    // Pass 2: join the matching 8-byte halves into one full channel vector each.
+    a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
+    b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
+    c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
+    d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
+{   // Splits 8 unsigned values laid out a b a b ... into two 4-lane vectors.
+    v128_t v0 = wasm_v128_load(ptr);     // a0 b0 a1 b1
+    v128_t v1 = wasm_v128_load(ptr + 4); // a2 b2 a3 b3
+    // byte-level shuffle: each 32-bit element is moved as four adjacent byte indices
+    a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3
+    b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
+{   // Splits 12 unsigned values laid out a b c a b c ... into three 4-lane vectors.
+    v128_t t00 = wasm_v128_load(ptr);        // a0 b0 c0 a1
+    v128_t t01 = wasm_v128_load(ptr + 4);     // b1 c1 a2 b2
+    v128_t t02 = wasm_v128_load(ptr + 8);    // c2 a3 b3 c3
+    // Pass 1: collect each channel's elements from t00/t01 into the leading lanes; the trailing indices are placeholders.
+    v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7);
+    v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
+    v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
+    // Pass 2: overwrite the placeholders with the remaining elements from t02 (indices 16-31).
+    a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
+    b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
+    c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
+{   // Each 128-bit load holds one a b c d group; transposing the four groups separates the channels.
+    const v_uint32x4 grp0(wasm_v128_load(ptr));      // a0 b0 c0 d0
+    const v_uint32x4 grp1(wasm_v128_load(ptr + 4));  // a1 b1 c1 d1
+    const v_uint32x4 grp2(wasm_v128_load(ptr + 8));  // a2 b2 c2 d2
+    const v_uint32x4 grp3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3
+    // rows are groups on input and channels on output
+    v_transpose4x4(grp0, grp1, grp2, grp3, a, b, c, d);
+}
+
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
+{   // Splits 8 floats laid out a b a b ... into two 4-lane vectors.
+    v128_t v0 = wasm_v128_load(ptr);       // a0 b0 a1 b1
+    v128_t v1 = wasm_v128_load((ptr + 4)); // a2 b2 a3 b3
+    // byte-level shuffle: each 32-bit element is moved as four adjacent byte indices
+    a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3
+    b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3
+}
+
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
+{   // Splits 12 floats laid out a b c a b c ... into three 4-lane vectors.
+    v128_t t00 = wasm_v128_load(ptr);        // a0 b0 c0 a1
+    v128_t t01 = wasm_v128_load(ptr + 4);     // b1 c1 a2 b2
+    v128_t t02 = wasm_v128_load(ptr + 8);    // c2 a3 b3 c3
+    // Pass 1: collect each channel's elements from t00/t01 into the leading lanes; the trailing indices are placeholders.
+    v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7);
+    v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
+    v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
+    // Pass 2: overwrite the placeholders with the remaining elements from t02 (indices 16-31).
+    a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
+    b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
+    c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
+}
+
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
+{   // Each 128-bit load holds one a b c d group; transposing the four groups separates the channels.
+    const v_float32x4 grp0(wasm_v128_load(ptr));      // a0 b0 c0 d0
+    const v_float32x4 grp1(wasm_v128_load(ptr + 4));  // a1 b1 c1 d1
+    const v_float32x4 grp2(wasm_v128_load(ptr + 8));  // a2 b2 c2 d2
+    const v_float32x4 grp3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3
+    // rows are groups on input and channels on output
+    v_transpose4x4(grp0, grp1, grp2, grp3, a, b, c, d);
+}
+
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
+{   // Input order a0 b0 a1 b1: a collects the low 64-bit halves of the two loads, b the high halves.
+    const v128_t pair0 = wasm_v128_load(ptr);      // a0 b0
+    const v128_t pair1 = wasm_v128_load(ptr + 2);  // a1 b1
+    // both loads complete before either output is written
+    a.val = wasm_unpacklo_i64x2(pair0, pair1);
+    b.val = wasm_unpackhi_i64x2(pair0, pair1);
+}
+
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
+{   // Splits 6 uint64 values laid out a b c a b c into three 2-lane vectors.
+    v128_t t0 = wasm_v128_load(ptr);     // a0, b0
+    v128_t t1 = wasm_v128_load(ptr + 2); // c0, a1
+    v128_t t2 = wasm_v128_load(ptr + 4); // b1, c1
+    // each output takes one 8-byte half from each of two loads (indices 0-15: first operand, 16-31: second)
+    a.val = wasm_v8x16_shuffle(t0, t1, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
+    b.val = wasm_v8x16_shuffle(t0, t2, 8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23);
+    c.val = wasm_v8x16_shuffle(t1, t2, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
+}
+
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
+                                v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
+{   // Input order a0 b0 c0 d0 a1 b1 c1 d1; loads two elements apart belong to different groups of the same channel pair.
+    const v128_t ab0 = wasm_v128_load(ptr);     // a0 b0
+    const v128_t cd0 = wasm_v128_load(ptr + 2); // c0 d0
+    const v128_t ab1 = wasm_v128_load(ptr + 4); // a1 b1
+    const v128_t cd1 = wasm_v128_load(ptr + 6); // c1 d1
+    // low halves give the first channel of each pair, high halves the second
+    a.val = wasm_unpacklo_i64x2(ab0, ab1);
+    b.val = wasm_unpackhi_i64x2(ab0, ab1);
+    c.val = wasm_unpacklo_i64x2(cd0, cd1);
+    d.val = wasm_unpackhi_i64x2(cd0, cd1);
+}
+
+// store interleave
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{   // Writes 32 bytes as a0 b0 a1 b1 ...; the store mode argument is ignored.
+    const v128_t lo_pairs = wasm_unpacklo_i8x16(a.val, b.val);
+    const v128_t hi_pairs = wasm_unpackhi_i8x16(a.val, b.val);
+    // pairs from the low halves come first, pairs from the high halves 16 bytes later
+    wasm_v128_store(ptr, lo_pairs);
+    wasm_v128_store(ptr + 16, hi_pairs);
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                const v_uint8x16& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{   // Writes 48 bytes as a0 b0 c0 a1 b1 c1 ...; the store mode argument is ignored.
+    v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5);
+    v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 21,0,6,22,0,7,23,0,8,24,0,9,25,0,10,26);
+    v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0);
+    // Pass 1 (above) places the a and b bytes and leaves a placeholder (index 0) in every c slot; pass 2 (below) drops the c bytes into those slots.
+    v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15);
+    v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15);
+    v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31);
+    // three consecutive 16-byte stores
+    wasm_v128_store(ptr, t10);
+    wasm_v128_store(ptr + 16, t11);
+    wasm_v128_store(ptr + 32, t12);
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                const v_uint8x16& c, const v_uint8x16& d,
+                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    // Two rounds of byte interleaving turn the four planar inputs
+    //   a0 a1 ... / b0 b1 ... / c0 c1 ... / d0 d1 ...
+    // into a0 b0 c0 d0 a1 b1 c1 d1 ...; 64 bytes are written and the
+    // store mode argument is ignored.
+    const v128_t ac_lo = wasm_unpacklo_i8x16(a.val, c.val); // a0 c0 a1 c1 ...
+    const v128_t ac_hi = wasm_unpackhi_i8x16(a.val, c.val); // a8 c8 a9 c9 ...
+    const v128_t bd_lo = wasm_unpacklo_i8x16(b.val, d.val); // b0 d0 b1 d1 ...
+    const v128_t bd_hi = wasm_unpackhi_i8x16(b.val, d.val); // b8 d8 b9 d9 ...
+
+    const v128_t q0 = wasm_unpacklo_i8x16(ac_lo, bd_lo); // a0 b0 c0 d0 ...
+    const v128_t q1 = wasm_unpackhi_i8x16(ac_lo, bd_lo); // a4 b4 c4 d4 ...
+    const v128_t q2 = wasm_unpacklo_i8x16(ac_hi, bd_hi); // a8 b8 c8 d8 ...
+    const v128_t q3 = wasm_unpackhi_i8x16(ac_hi, bd_hi); // a12 b12 c12 d12 ...
+
+    wasm_v128_store(ptr, q0);
+    wasm_v128_store(ptr + 16, q1);
+    wasm_v128_store(ptr + 32, q2);
+    wasm_v128_store(ptr + 48, q3);
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{   // Writes 16 ushort values as a0 b0 a1 b1 ...; the store mode argument is ignored.
+    const v128_t lo_pairs = wasm_unpacklo_i16x8(a.val, b.val);
+    const v128_t hi_pairs = wasm_unpackhi_i16x8(a.val, b.val);
+    // pairs from the low halves come first, pairs from the high halves 8 elements later
+    wasm_v128_store(ptr, lo_pairs);
+    wasm_v128_store(ptr + 8, hi_pairs);
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
+                                const v_uint16x8& b, const v_uint16x8& c,
+                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{   // Writes 24 ushort values as a0 b0 c0 a1 b1 c1 ...; the store mode argument is ignored.
+    v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,16,17,0,0,2,3,18,19,0,0,4,5,20,21);
+    v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 0,0,6,7,22,23,0,0,8,9,24,25,0,0,10,11);
+    v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 26,27,0,0,12,13,28,29,0,0,14,15,30,31,0,0);
+    // Pass 1 (above) places the a and b elements and leaves placeholder bytes (0,0) in every c slot; pass 2 (below) fills those slots from c.
+    v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,16,17,6,7,8,9,18,19,12,13,14,15);
+    v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 20,21,2,3,4,5,22,23,8,9,10,11,24,25,14,15);
+    v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 0,1,26,27,4,5,6,7,28,29,10,11,12,13,30,31);
+    // three consecutive 16-byte stores
+    wasm_v128_store(ptr, t10);
+    wasm_v128_store(ptr + 8, t11);
+    wasm_v128_store(ptr + 16, t12);
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                                const v_uint16x8& c, const v_uint16x8& d,
+                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    // Two rounds of 16-bit interleaving turn the four planar inputs
+    //   a0 a1 ... / b0 b1 ... / c0 c1 ... / d0 d1 ...
+    // into a0 b0 c0 d0 a1 b1 c1 d1 ...; 32 ushort values are written and
+    // the store mode argument is ignored.
+    const v128_t ac_lo = wasm_unpacklo_i16x8(a.val, c.val); // a0 c0 a1 c1 ...
+    const v128_t ac_hi = wasm_unpackhi_i16x8(a.val, c.val); // a4 c4 a5 c5 ...
+    const v128_t bd_lo = wasm_unpacklo_i16x8(b.val, d.val); // b0 d0 b1 d1 ...
+    const v128_t bd_hi = wasm_unpackhi_i16x8(b.val, d.val); // b4 d4 b5 d5 ...
+
+    const v128_t q0 = wasm_unpacklo_i16x8(ac_lo, bd_lo); // a0 b0 c0 d0 ...
+    const v128_t q1 = wasm_unpackhi_i16x8(ac_lo, bd_lo); // a2 b2 c2 d2 ...
+    const v128_t q2 = wasm_unpacklo_i16x8(ac_hi, bd_hi); // a4 b4 c4 d4 ...
+    const v128_t q3 = wasm_unpackhi_i16x8(ac_hi, bd_hi); // a6 b6 c6 d6 ...
+
+    wasm_v128_store(ptr, q0);
+    wasm_v128_store(ptr + 8, q1);
+    wasm_v128_store(ptr + 16, q2);
+    wasm_v128_store(ptr + 24, q3);
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{   // Writes 8 unsigned values as a0 b0 a1 b1 ...; the store mode argument is ignored.
+    const v128_t lo_pairs = wasm_unpacklo_i32x4(a.val, b.val);
+    const v128_t hi_pairs = wasm_unpackhi_i32x4(a.val, b.val);
+    // pairs from the low halves come first, pairs from the high halves 4 elements later
+    wasm_v128_store(ptr, lo_pairs);
+    wasm_v128_store(ptr + 4, hi_pairs);
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                                const v_uint32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{   // Writes 12 unsigned values as a0 b0 c0 a1 b1 c1 ...; the store mode argument is ignored.
+    v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7);
+    v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27);
+    v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0);
+    // Pass 1 (above) places the a and b elements and leaves placeholder bytes (0,0,0,0) in every c slot; pass 2 (below) fills those slots from c.
+    v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15);
+    v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15);
+    v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31);
+    // three consecutive 16-byte stores
+    wasm_v128_store(ptr, t10);
+    wasm_v128_store(ptr + 4, t11);
+    wasm_v128_store(ptr + 8, t12);
+}
+
+inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                               const v_uint32x4& c, const v_uint32x4& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{   // Transposes the four channel vectors into four a b c d groups and stores them back to back; the store mode argument is ignored.
+    v_uint32x4 grp0, grp1, grp2, grp3;
+    v_transpose4x4(a, b, c, d, grp0, grp1, grp2, grp3);
+    // four elements per group, sixteen elements in total
+    wasm_v128_store(ptr, grp0.val);
+    wasm_v128_store(ptr + 4, grp1.val);
+    wasm_v128_store(ptr + 8, grp2.val);
+    wasm_v128_store(ptr + 12, grp3.val);
+}
+
+// 2-channel, float only
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{   // Writes 8 floats as a0 b0 a1 b1 ... using the 32-bit integer unpack; the store mode argument is ignored.
+    const v128_t lo_pairs = wasm_unpacklo_i32x4(a.val, b.val);
+    const v128_t hi_pairs = wasm_unpackhi_i32x4(a.val, b.val);
+    // pairs from the low halves come first, pairs from the high halves 4 elements later
+    wasm_v128_store(ptr, lo_pairs);
+    wasm_v128_store(ptr + 4, hi_pairs);
+}
+
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+                               const v_float32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{   // Writes 12 floats as a0 b0 c0 a1 b1 c1 ...; the store mode argument is ignored.
+    v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7);
+    v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27);
+    v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0);
+    // Pass 1 (above) places the a and b elements and leaves placeholder bytes (0,0,0,0) in every c slot; pass 2 (below) fills those slots from c.
+    v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15);
+    v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15);
+    v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31);
+    // three consecutive 16-byte stores
+    wasm_v128_store(ptr, t10);
+    wasm_v128_store(ptr + 4, t11);
+    wasm_v128_store(ptr + 8, t12);
+}
+
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+                               const v_float32x4& c, const v_float32x4& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{   // Transposes the four channel vectors into four a b c d groups and stores them back to back; the store mode argument is ignored.
+    v_float32x4 grp0, grp1, grp2, grp3;
+    v_transpose4x4(a, b, c, d, grp0, grp1, grp2, grp3);
+    // four elements per group, sixteen elements in total
+    wasm_v128_store(ptr, grp0.val);
+    wasm_v128_store(ptr + 4, grp1.val);
+    wasm_v128_store(ptr + 8, grp2.val);
+    wasm_v128_store(ptr + 12, grp3.val);
+}
+
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{   // Writes 4 uint64 values as a0 b0 a1 b1; the store mode argument is ignored.
+    const v128_t first_pair = wasm_unpacklo_i64x2(a.val, b.val);
+    const v128_t second_pair = wasm_unpackhi_i64x2(a.val, b.val);
+    // low halves first, high halves two elements later
+    wasm_v128_store(ptr, first_pair);
+    wasm_v128_store(ptr + 2, second_pair);
+}
+
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               const v_uint64x2& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{   // Writes 6 uint64 values as a0 b0 c0 a1 b1 c1; the store mode argument is ignored.
+    v128_t v0 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
+    v128_t v1 = wasm_v8x16_shuffle(a.val, c.val, 16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15);
+    v128_t v2 = wasm_v8x16_shuffle(b.val, c.val, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
+    // v0 = low halves of a and b, v1 = low half of c then high half of a, v2 = high halves of b and c
+    wasm_v128_store(ptr, v0);
+    wasm_v128_store(ptr + 2, v1);
+    wasm_v128_store(ptr + 4, v2);
+}
+
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               const v_uint64x2& c, const v_uint64x2& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{   // Writes 8 uint64 values as a0 b0 c0 d0 a1 b1 c1 d1; the store mode argument is ignored.
+    const v128_t ab_lo = wasm_unpacklo_i64x2(a.val, b.val);
+    const v128_t cd_lo = wasm_unpacklo_i64x2(c.val, d.val);
+    const v128_t ab_hi = wasm_unpackhi_i64x2(a.val, b.val);
+    const v128_t cd_hi = wasm_unpackhi_i64x2(c.val, d.val);
+    // first group from the low halves, second group from the high halves
+    wasm_v128_store(ptr, ab_lo);
+    wasm_v128_store(ptr + 2, cd_lo);
+    wasm_v128_store(ptr + 4, ab_hi);
+    wasm_v128_store(ptr + 6, cd_hi);
+}
+
+#define OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) /* forwards the _Tpvec0/_Tp0 overloads to the existing _Tpvec1/_Tp1 ones */ \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
+{ /* 2 channels: load through the _Tp1 overload, then reinterpret each result as _Tpvec0 */ \
+    _Tpvec1 a1, b1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
+{ /* 3 channels, same scheme */ \
+    _Tpvec1 a1, b1, c1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
+{ /* 4 channels, same scheme */ \
+    _Tpvec1 a1, b1, c1, d1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+    d0 = v_reinterpret_as_##suffix0(d1); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
+{ /* 2 channels: reinterpret the inputs as _Tpvec1 and store through the _Tp1 overload, passing mode along */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, mode);      \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
+{ /* 3 channels, same scheme */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode);  \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                const _Tpvec0& c0, const _Tpvec0& d0, \
+                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
+{ /* 4 channels, same scheme */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
+}
+// Signed integer and double vectors reuse the unsigned implementation of the same lane width.
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
+
+// Converts the four int32 lanes of a to float32 using
+// wasm_f32x4_convert_i32x4.
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{ return v_float32x4(wasm_f32x4_convert_i32x4(a.val)); }
+
+// Narrows the two doubles of a to float in lanes 0 and 1; lanes 2 and 3
+// are zero. The conversion goes through memory with scalar casts.
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{
+    double wide[2];
+    wasm_v128_store(wide, a.val);
+    const float narrow[4] = {
+        (float)(wide[0]), (float)(wide[1]), 0.f, 0.f
+    };
+    return v_float32x4(wasm_v128_load(narrow));
+}
+
+// Narrows four doubles to float: a fills lanes 0 and 1, b fills lanes 2
+// and 3. The conversion goes through memory with scalar casts.
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{
+    double lo[2], hi[2];
+    wasm_v128_store(lo, a.val);
+    wasm_v128_store(hi, b.val);
+    const float narrow[4] = {
+        (float)(lo[0]), (float)(lo[1]), (float)(hi[0]), (float)(hi[1])
+    };
+    return v_float32x4(wasm_v128_load(narrow));
+}
+
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{   // Converts int32 lanes 0 and 1 of a to the two double lanes of the result.
+#ifdef __wasm_unimplemented_simd128__
+    // vector path: v128_cvti32x4_i64x2 (presumably widening the low two lanes to i64), then i64 -> f64
+    const v128_t widened = v128_cvti32x4_i64x2(a.val);
+    return v_float64x2(wasm_f64x2_convert_i64x2(widened));
+#else
+    // scalar path: spill all four ints, cast the first two, reload
+    int ints[4];
+    wasm_v128_store(ints, a.val);
+    const double dbl[2] = { (double)(ints[0]), (double)(ints[1]) };
+    return v_float64x2(wasm_v128_load(dbl));
+#endif
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{   // Converts int32 lanes 2 and 3 of a to the two double lanes of the result.
+#ifdef __wasm_unimplemented_simd128__
+    // vector path: v128_cvti32x4_i64x2_high (presumably widening the upper two lanes to i64), then i64 -> f64
+    const v128_t widened = v128_cvti32x4_i64x2_high(a.val);
+    return v_float64x2(wasm_f64x2_convert_i64x2(widened));
+#else
+    // scalar path: spill all four ints, cast the last two, reload
+    int ints[4];
+    wasm_v128_store(ints, a.val);
+    const double dbl[2] = { (double)(ints[2]), (double)(ints[3]) };
+    return v_float64x2(wasm_v128_load(dbl));
+#endif
+}
+
+// Widens float lanes 0 and 1 of a to double through a scalar round trip
+// in memory.
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{
+    float flt[4];
+    wasm_v128_store(flt, a.val);
+    const double dbl[2] = { (double)(flt[0]), (double)(flt[1]) };
+    return v_float64x2(wasm_v128_load(dbl));
+}
+
+// Widens float lanes 2 and 3 of a to double through a scalar round trip
+// in memory.
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{
+    float flt[4];
+    wasm_v128_store(flt, a.val);
+    const double dbl[2] = { (double)(flt[2]), (double)(flt[3]) };
+    return v_float64x2(wasm_v128_load(dbl));
+}
+
+inline v_float64x2 v_cvt_f64(const v_int64x2& a)
+{   // Converts both int64 lanes of a to double: vector conversion when available, scalar casts through memory otherwise.
+#ifdef __wasm_unimplemented_simd128__
+    const v128_t converted = wasm_f64x2_convert_i64x2(a.val);
+    return v_float64x2(converted);
+#else
+    // scalar path: spill, cast each value, reload
+    int64 ints[2];
+    wasm_v128_store(ints, a.val);
+    const double dbl[2] = { (double)(ints[0]), (double)(ints[1]) };
+    return v_float64x2(wasm_v128_load(dbl));
+#endif
+}
+
+////////////// Lookup table access ////////////////////
+
+inline v_int8x16 v_lut(const schar* tab, const int* idx)
+{   // Gather: lane i is tab[idx[i]] for i = 0..15; idx must supply 16 entries.
+    return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
+                     tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]);
+}
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
+{   // Gathers 8 pairs: lanes 2k and 2k+1 are tab[idx[k]] and tab[idx[k]+1] for k = 0..7.
+    return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1], tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1],
+                     tab[idx[4]], tab[idx[4]+1], tab[idx[5]], tab[idx[5]+1], tab[idx[6]], tab[idx[6]+1], tab[idx[7]], tab[idx[7]+1]);
+}
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
+{   // Gathers 4 quads: lanes 4k..4k+3 are tab[idx[k]] .. tab[idx[k]+3] for k = 0..3.
+    return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3], tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3],
+                     tab[idx[2]], tab[idx[2]+1], tab[idx[2]+2], tab[idx[2]+3], tab[idx[3]], tab[idx[3]+1], tab[idx[3]+2], tab[idx[3]+3]);
+}
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut(reinterpret_cast<const schar*>(tab), idx)); } // uchar forms reuse the schar gathers
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs(reinterpret_cast<const schar*>(tab), idx)); }
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads(reinterpret_cast<const schar*>(tab), idx)); }
+
+inline v_int16x8 v_lut(const short* tab, const int* idx)
+{   // Gather: lane i is tab[idx[i]] for i = 0..7; idx must supply 8 entries.
+    return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
+                     tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]);
+}
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
+{   // Gathers 4 pairs: lanes 2k and 2k+1 are tab[idx[k]] and tab[idx[k]+1] for k = 0..3.
+    return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1],
+                     tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1]);
+}
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
+{   // Gathers 2 quads: lanes 4k..4k+3 are tab[idx[k]] .. tab[idx[k]+3] for k = 0..1.
+    return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3],
+                     tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3]);
+}
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut(reinterpret_cast<const short*>(tab), idx)); } // ushort forms reuse the short gathers
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs(reinterpret_cast<const short*>(tab), idx)); }
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads(reinterpret_cast<const short*>(tab), idx)); }
+
+inline v_int32x4 v_lut(const int* tab, const int* idx)
+{   // Gather: lane i is tab[idx[i]] for i = 0..3.
+    const int e0 = tab[idx[0]], e1 = tab[idx[1]], e2 = tab[idx[2]], e3 = tab[idx[3]];
+    return v_int32x4(e0, e1, e2, e3);
+}
+inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
+{   // Gathers 2 pairs: tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1].
+    const int first = idx[0], second = idx[1];
+    return v_int32x4(tab[first], tab[first + 1], tab[second], tab[second + 1]);
+}
+// Loads the four consecutive ints starting at tab[idx[0]] as one vector;
+// no other idx entry is read.
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
+{ return v_int32x4(wasm_v128_load(&tab[idx[0]])); }
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut(reinterpret_cast<const int*>(tab), idx)); } // unsigned forms reuse the int gathers
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs(reinterpret_cast<const int*>(tab), idx)); }
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads(reinterpret_cast<const int*>(tab), idx)); }
+
+// Gathers tab[idx[0]] and tab[idx[1]] into the two 64-bit lanes of the
+// result.
+inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
+{ return v_int64x2(tab[idx[0]], tab[idx[1]]); }
+// Loads the two consecutive 64-bit values starting at tab[idx[0]] as one
+// vector; idx[1] is not read.
+inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
+{ return v_int64x2(wasm_v128_load(&tab[idx[0]])); }
+inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
+inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
+
+inline v_float32x4 v_lut(const float* tab, const int* idx)
+{   // Gathers tab[idx[0]] .. tab[idx[3]] into lanes 0..3.
+    return v_float32x4(*(tab + idx[0]), *(tab + idx[1]), *(tab + idx[2]), *(tab + idx[3]));
+}
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs(reinterpret_cast<const int*>(tab), idx)); } // pairs/quads are copied as raw 32-bit lanes through the int overloads
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads(reinterpret_cast<const int*>(tab), idx)); }
+
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{   // Gathers tab[idx[0]] (lane 0) and tab[idx[1]] (lane 1).
+    return v_float64x2(*(tab + idx[0]), *(tab + idx[1]));
+}
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
+{   // Single 128-bit load of the two consecutive doubles starting at tab[idx[0]].
+    return v_float64x2(wasm_v128_load(&tab[idx[0]]));
+}
+
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{   // Gathers tab[i] for each of the four indices held in the lanes of idxvec.
+    const v128_t lanes = idxvec.val;
+    const int i0 = wasm_i32x4_extract_lane(lanes, 0), i1 = wasm_i32x4_extract_lane(lanes, 1);
+    const int i2 = wasm_i32x4_extract_lane(lanes, 2), i3 = wasm_i32x4_extract_lane(lanes, 3);
+    return v_int32x4(tab[i0], tab[i1], tab[i2], tab[i3]);
+}
+
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
+{   // Unsigned overload: signed lookup, bits reinterpreted.
+    return v_reinterpret_as_u32(v_lut(reinterpret_cast<const int*>(tab), idxvec));
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{   // Gathers tab[i] for each of the four indices held in the lanes of idxvec.
+    const v128_t lanes = idxvec.val;
+    const int i0 = wasm_i32x4_extract_lane(lanes, 0), i1 = wasm_i32x4_extract_lane(lanes, 1);
+    const int i2 = wasm_i32x4_extract_lane(lanes, 2), i3 = wasm_i32x4_extract_lane(lanes, 3);
+    return v_float32x4(tab[i0], tab[i1], tab[i2], tab[i3]);
+}
+
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{   // Uses only lanes 0 and 1 of idxvec.
+    const int i0 = wasm_i32x4_extract_lane(idxvec.val, 0), i1 = wasm_i32x4_extract_lane(idxvec.val, 1);
+    return v_float64x2(tab[i0], tab[i1]);
+}
+
+// Loads pairs of adjacent floats from the table and de-interleaves them, i.e. returns:
+//   x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
+//   y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
+// Note that the indices address individual floats, not float pairs.
+// In theory, this function can be used to implement bilinear interpolation
+// when idxvec holds the offsets within the image.
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+    const v128_t lanes = idxvec.val;
+    // pK points at the first float of the K-th pair.
+    const float* const p0 = tab + wasm_i32x4_extract_lane(lanes, 0);
+    const float* const p1 = tab + wasm_i32x4_extract_lane(lanes, 1);
+    const float* const p2 = tab + wasm_i32x4_extract_lane(lanes, 2);
+    const float* const p3 = tab + wasm_i32x4_extract_lane(lanes, 3);
+    x = v_float32x4(p0[0], p1[0], p2[0], p3[0]);
+    y = v_float32x4(p0[1], p1[1], p2[1], p3[1]);
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{   // Loads the pairs starting at tab[idxvec[0]] and tab[idxvec[1]]; x receives their first elements, y their second.
+    v128_t xy0 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 0));
+    v128_t xy1 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 1));
+    x.val = wasm_unpacklo_i64x2(xy0, xy1);  // low halves:  (tab[i0],   tab[i1])
+    y.val = wasm_unpackhi_i64x2(xy0, xy1);  // high halves: (tab[i0+1], tab[i1+1]); unpacklo here would just duplicate x
+}
+
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
+{   // Byte order within every group of four becomes 0,2,1,3 (the two middle bytes swap).
+    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15));
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } // unsigned: same shuffle on the reinterpreted bits
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
+{   // Byte order within every group of eight becomes 0,4,1,5,2,6,3,7.
+    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15));
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } // unsigned: same shuffle on the reinterpreted bits
+
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
+{   // 16-bit element order within each half becomes 0,2,1,3 (byte indices below address two bytes per element).
+    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15));
+}
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } // unsigned: same shuffle on the reinterpreted bits
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
+{   // 16-bit element order becomes 0,4,1,5,2,6,3,7.
+    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15));
+}
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } // unsigned: same shuffle on the reinterpreted bits
+
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
+{   // 32-bit lane order becomes 0,2,1,3 (byte indices below address four bytes per lane).
+    return v_int32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
+}
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // unsigned: same shuffle on the reinterpreted bits
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
+{   // Same lane order (0,2,1,3) as the integer overload, applied directly to the float vector.
+    return v_float32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
+}
+
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{   // Packs bytes 0-2 of every group of four into the first 12 bytes; the last four bytes use index 16 (byte 0 of the second operand, i.e. vec again).
+    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,4,5,6,8,9,10,12,13,14,16,16,16,16));
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } // unsigned: same shuffle on the reinterpreted bits
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{   // 16-bit element order becomes 0,1,2,4,5,6,7,3: both triplets are packed to the front.
+    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,4,5,8,9,10,11,12,13,14,15,6,7));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } // unsigned: same shuffle on the reinterpreted bits
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } // 32-bit overloads return the input unchanged
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
+
+template<int i, typename _Tp>
+inline typename _Tp::lane_type v_extract_n(const _Tp& a)
+{   // Returns lane 0 of v_rotate_right<i>(a); presumably this is lane i of a - confirm against v_rotate_right.
+    return v_rotate_right<i>(a).get0();
+}
+
+template<int i>
+inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
+{   // Fills every lane with the scalar that v_extract_n<i> returns for a.
+    return v_setall_u32(v_extract_n<i>(a));
+}
+template<int i>
+inline v_int32x4 v_broadcast_element(const v_int32x4& a)
+{   // Signed variant of the overload above.
+    return v_setall_s32(v_extract_n<i>(a));
+}
+template<int i>
+inline v_float32x4 v_broadcast_element(const v_float32x4& a)
+{   // Float variant of the overload above.
+    return v_setall_f32(v_extract_n<i>(a));
+}
+
+
+////////////// FP16 support ///////////////////////////
+
+inline v_float32x4 v_load_expand(const hfloat* ptr)
+{   // Converts ptr[0..3] to float one by one into a temporary array, then loads that array as one vector.
+    float widened[4];
+    for (int k = 0; k != 4; ++k)
+        widened[k] = ptr[k];
+    return v_float32x4(wasm_v128_load(widened));
+}
+
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
+{   // Converts the four float lanes of v to hfloat and writes them to ptr[0..3].
+    float v_[4];  // must be float: the 128-bit store below writes four 32-bit lanes (16 bytes), not doubles
+    wasm_v128_store(v_, v.val);
+    ptr[0] = hfloat(v_[0]);
+    ptr[1] = hfloat(v_[1]);
+    ptr[2] = hfloat(v_[2]);
+    ptr[3] = hfloat(v_[3]);
+}
+
+inline void v_cleanup() {}  // intentionally empty: this implementation performs no cleanup work
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/msa_macros.h b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/msa_macros.h
new file mode 100644
index 000000000000..fad8c5adda25
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/msa_macros.h
@@ -0,0 +1,1558 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_HAL_MSA_MACROS_H
+#define OPENCV_CORE_HAL_MSA_MACROS_H
+
+#ifdef __mips_msa
+#include "msa.h"
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* 64-bit (8-byte, 8-byte-aligned) vector types: 8x8-bit, 4x16-bit, 2x32-bit and 1x64-bit integer lanes (signed/unsigned), plus 2xfloat and 1xdouble. */
+typedef signed char v8i8 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned char v8u8 __attribute__ ((vector_size(8), aligned(8)));
+typedef short v4i16 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned short v4u16 __attribute__ ((vector_size(8), aligned(8)));
+typedef int v2i32 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned int v2u32 __attribute__ ((vector_size(8), aligned(8)));
+typedef long long v1i64 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned long long v1u64 __attribute__ ((vector_size(8), aligned(8)));
+typedef float v2f32 __attribute__ ((vector_size(8), aligned(8)));
+typedef double v1f64 __attribute__ ((vector_size(8), aligned(8)));
+
+
+/* Load a 64-bit vector from the memory address __a by dereferencing it as a pointer to the matching 64-bit vector type. */
+#define msa_ld1_s8(__a)  (*((v8i8*)(__a)))
+#define msa_ld1_s16(__a) (*((v4i16*)(__a)))
+#define msa_ld1_s32(__a) (*((v2i32*)(__a)))
+#define msa_ld1_s64(__a) (*((v1i64*)(__a)))
+#define msa_ld1_u8(__a)  (*((v8u8*)(__a)))
+#define msa_ld1_u16(__a) (*((v4u16*)(__a)))
+#define msa_ld1_u32(__a) (*((v2u32*)(__a)))
+#define msa_ld1_u64(__a) (*((v1u64*)(__a)))
+#define msa_ld1_f32(__a) (*((v2f32*)(__a)))
+#define msa_ld1_f64(__a) (*((v1f64*)(__a)))
+
+/* Load a 128-bit vector from the memory address __a (__builtin_msa_ld_* with offset 0); the result is cast to the requested vector type. */
+#define msa_ld1q_s8(__a)  ((v16i8)__builtin_msa_ld_b(__a, 0))
+#define msa_ld1q_s16(__a) ((v8i16)__builtin_msa_ld_h(__a, 0))
+#define msa_ld1q_s32(__a) ((v4i32)__builtin_msa_ld_w(__a, 0))
+#define msa_ld1q_s64(__a) ((v2i64)__builtin_msa_ld_d(__a, 0))
+#define msa_ld1q_u8(__a)  ((v16u8)__builtin_msa_ld_b(__a, 0))
+#define msa_ld1q_u16(__a) ((v8u16)__builtin_msa_ld_h(__a, 0))
+#define msa_ld1q_u32(__a) ((v4u32)__builtin_msa_ld_w(__a, 0))
+#define msa_ld1q_u64(__a) ((v2u64)__builtin_msa_ld_d(__a, 0))
+#define msa_ld1q_f32(__a) ((v4f32)__builtin_msa_ld_w(__a, 0))
+#define msa_ld1q_f64(__a) ((v2f64)__builtin_msa_ld_d(__a, 0))
+
+/* Store the 64-bit vector __b to the memory address __a (plain assignment through a pointer to the matching vector type). */
+#define msa_st1_s8(__a, __b)  (*((v8i8*)(__a)) = __b)
+#define msa_st1_s16(__a, __b) (*((v4i16*)(__a)) = __b)
+#define msa_st1_s32(__a, __b) (*((v2i32*)(__a)) = __b)
+#define msa_st1_s64(__a, __b) (*((v1i64*)(__a)) = __b)
+#define msa_st1_u8(__a, __b)  (*((v8u8*)(__a)) = __b)
+#define msa_st1_u16(__a, __b) (*((v4u16*)(__a)) = __b)
+#define msa_st1_u32(__a, __b) (*((v2u32*)(__a)) = __b)
+#define msa_st1_u64(__a, __b) (*((v1u64*)(__a)) = __b)
+#define msa_st1_f32(__a, __b) (*((v2f32*)(__a)) = __b)
+#define msa_st1_f64(__a, __b) (*((v1f64*)(__a)) = __b)
+
+/* Store the 128-bit vector __b to the memory address __a (__builtin_msa_st_* with offset 0). */
+#define msa_st1q_s8(__a, __b)  (__builtin_msa_st_b((v16i8)(__b), __a, 0))
+#define msa_st1q_s16(__a, __b) (__builtin_msa_st_h((v8i16)(__b), __a, 0))
+#define msa_st1q_s32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
+#define msa_st1q_s64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
+#define msa_st1q_u8(__a, __b)  (__builtin_msa_st_b((v16i8)(__b), __a, 0))
+#define msa_st1q_u16(__a, __b) (__builtin_msa_st_h((v8i16)(__b), __a, 0))
+#define msa_st1q_u32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
+#define msa_st1q_u64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
+#define msa_st1q_f32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
+#define msa_st1q_f64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
+
+/* Store the single element with index __c of vector __b to the memory address __a (msa_st1_lane_*: 64-bit vectors, msa_st1q_lane_*: 128-bit vectors). */
+#define msa_st1_lane_s8(__a, __b, __c)   (*((int8_t*)(__a)) = __b[__c])
+#define msa_st1_lane_s16(__a, __b, __c)  (*((int16_t*)(__a)) = __b[__c])
+#define msa_st1_lane_s32(__a, __b, __c)  (*((int32_t*)(__a)) = __b[__c])
+#define msa_st1_lane_s64(__a, __b, __c)  (*((int64_t*)(__a)) = __b[__c])
+#define msa_st1_lane_u8(__a, __b, __c)   (*((uint8_t*)(__a)) = __b[__c])
+#define msa_st1_lane_u16(__a, __b, __c)  (*((uint16_t*)(__a)) = __b[__c])
+#define msa_st1_lane_u32(__a, __b, __c)  (*((uint32_t*)(__a)) = __b[__c])
+#define msa_st1_lane_u64(__a, __b, __c)  (*((uint64_t*)(__a)) = __b[__c])
+#define msa_st1_lane_f32(__a, __b, __c)  (*((float*)(__a)) = __b[__c])
+#define msa_st1_lane_f64(__a, __b, __c)  (*((double*)(__a)) = __b[__c])
+#define msa_st1q_lane_s8(__a, __b, __c)  (*((int8_t*)(__a)) = (int8_t)__builtin_msa_copy_s_b(__b, __c))
+#define msa_st1q_lane_s16(__a, __b, __c) (*((int16_t*)(__a)) = (int16_t)__builtin_msa_copy_s_h(__b, __c))
+#define msa_st1q_lane_s32(__a, __b, __c) (*((int32_t*)(__a)) = __builtin_msa_copy_s_w(__b, __c))
+#define msa_st1q_lane_s64(__a, __b, __c) (*((int64_t*)(__a)) = __builtin_msa_copy_s_d(__b, __c))
+#define msa_st1q_lane_u8(__a, __b, __c)  (*((uint8_t*)(__a)) = (uint8_t)__builtin_msa_copy_u_b((v16i8)(__b), __c))
+#define msa_st1q_lane_u16(__a, __b, __c) (*((uint16_t*)(__a)) = (uint16_t)__builtin_msa_copy_u_h((v8i16)(__b), __c))
+#define msa_st1q_lane_u32(__a, __b, __c) (*((uint32_t*)(__a)) = __builtin_msa_copy_u_w((v4i32)(__b), __c))
+#define msa_st1q_lane_u64(__a, __b, __c) (*((uint64_t*)(__a)) = __builtin_msa_copy_u_d((v2i64)(__b), __c))
+#define msa_st1q_lane_f32(__a, __b, __c) (*((float*)(__a)) = __b[__c])
+#define msa_st1q_lane_f64(__a, __b, __c) (*((double*)(__a)) = __b[__c])
+
+/* Duplicate the scalar __a into every element of a 64-bit vector; the 8/16-bit variants fill a 128-bit vector and keep doubleword 0 of it. */
+#define msa_dup_n_s8(__a)  ((v8i8)__builtin_msa_copy_s_d((v2i64)__builtin_msa_fill_b((int32_t)(__a)), 0))
+#define msa_dup_n_s16(__a) ((v4i16)__builtin_msa_copy_s_d((v2i64)__builtin_msa_fill_h((int32_t)(__a)), 0))
+#define msa_dup_n_s32(__a) ((v2i32){__a, __a})
+#define msa_dup_n_s64(__a) ((v1i64){__a})
+#define msa_dup_n_u8(__a)  ((v8u8)__builtin_msa_copy_u_d((v2i64)__builtin_msa_fill_b((int32_t)(__a)), 0))
+#define msa_dup_n_u16(__a) ((v4u16)__builtin_msa_copy_u_d((v2i64)__builtin_msa_fill_h((int32_t)(__a)), 0))
+#define msa_dup_n_u32(__a) ((v2u32){__a, __a})
+#define msa_dup_n_u64(__a) ((v1u64){__a})
+#define msa_dup_n_f32(__a) ((v2f32){__a, __a})
+#define msa_dup_n_f64(__a) ((v1f64){__a})
+
+/* Duplicate into every element of a 128-bit vector: msa_dupq_n_* replicate the scalar __a, msa_dupq_lane_* splat element __b of the vector __a. */
+#define msa_dupq_n_s8(__a)  (__builtin_msa_fill_b((int32_t)(__a)))
+#define msa_dupq_n_s16(__a) (__builtin_msa_fill_h((int32_t)(__a)))
+#define msa_dupq_n_s32(__a) (__builtin_msa_fill_w((int32_t)(__a)))
+#define msa_dupq_n_s64(__a) (__builtin_msa_fill_d((int64_t)(__a)))
+#define msa_dupq_n_u8(__a)  ((v16u8)__builtin_msa_fill_b((int32_t)(__a)))
+#define msa_dupq_n_u16(__a) ((v8u16)__builtin_msa_fill_h((int32_t)(__a)))
+#define msa_dupq_n_u32(__a) ((v4u32)__builtin_msa_fill_w((int32_t)(__a)))
+#define msa_dupq_n_u64(__a) ((v2u64)__builtin_msa_fill_d((int64_t)(__a)))
+#define msa_dupq_n_f32(__a) ((v4f32){__a, __a, __a, __a})
+#define msa_dupq_n_f64(__a) ((v2f64){__a, __a})
+#define msa_dupq_lane_s8(__a, __b)  (__builtin_msa_splat_b(__a, __b))
+#define msa_dupq_lane_s16(__a, __b) (__builtin_msa_splat_h(__a, __b))
+#define msa_dupq_lane_s32(__a, __b) (__builtin_msa_splat_w(__a, __b))
+#define msa_dupq_lane_s64(__a, __b) (__builtin_msa_splat_d(__a, __b))
+#define msa_dupq_lane_u8(__a, __b)  ((v16u8)__builtin_msa_splat_b((v16i8)(__a), __b))
+#define msa_dupq_lane_u16(__a, __b) ((v8u16)__builtin_msa_splat_h((v8i16)(__a), __b))
+#define msa_dupq_lane_u32(__a, __b) ((v4u32)__builtin_msa_splat_w((v4i32)(__a), __b))
+#define msa_dupq_lane_u64(__a, __b) ((v2u64)__builtin_msa_splat_d((v2i64)(__a), __b))
+
+/* Create a 64-bit vector from the scalar __a: __a is converted to uint64_t and those 64 bits are cast to the vector type. */
+#define msa_create_s8(__a)  ((v8i8)((uint64_t)(__a)))
+#define msa_create_s16(__a) ((v4i16)((uint64_t)(__a)))
+#define msa_create_s32(__a) ((v2i32)((uint64_t)(__a)))
+#define msa_create_s64(__a) ((v1i64)((uint64_t)(__a)))
+#define msa_create_u8(__a)  ((v8u8)((uint64_t)(__a)))
+#define msa_create_u16(__a) ((v4u16)((uint64_t)(__a)))
+#define msa_create_u32(__a) ((v2u32)((uint64_t)(__a)))
+#define msa_create_u64(__a) ((v1u64)((uint64_t)(__a)))
+#define msa_create_f32(__a) ((v2f32)((uint64_t)(__a)))
+#define msa_create_f64(__a) ((v1f64)((uint64_t)(__a)))
+
+/* Sign-extend (signed types) or zero-extend (unsigned types) each element of a 64-bit vector to twice its width and place the results in a 128-bit vector. */
+/* Transform v8i8 to v8i16 */
+#define msa_movl_s8(__a) \
+((v8i16){(__a)[0], (__a)[1], (__a)[2], (__a)[3], \
+         (__a)[4], (__a)[5], (__a)[6], (__a)[7]})
+
+/* Transform v8u8 to v8u16 */
+#define msa_movl_u8(__a) \
+((v8u16){(__a)[0], (__a)[1], (__a)[2], (__a)[3], \
+         (__a)[4], (__a)[5], (__a)[6], (__a)[7]})
+
+/* Transform v4i16 to v4i32 */
+#define msa_movl_s16(__a) ((v4i32){(__a)[0], (__a)[1], (__a)[2], (__a)[3]})
+
+/* Transform v2i32 to v2i64 */
+#define msa_movl_s32(__a) ((v2i64){(__a)[0], (__a)[1]})
+
+/* Transform v4u16 to v4u32 */
+#define msa_movl_u16(__a) ((v4u32){(__a)[0], (__a)[1], (__a)[2], (__a)[3]})
+
+/* Transform v2u32 to v2u64 */
+#define msa_movl_u32(__a) ((v2u64){(__a)[0], (__a)[1]})
+
+/* Copy the least significant half of each element of a 128-bit vector into the corresponding element of a 64-bit vector (pckev against a zero vector, then doubleword 0 of the result). */
+#define msa_movn_s16(__a) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)(__a)); \
+  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_movn_s32(__a) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \
+  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_movn_s64(__a) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \
+  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_movn_u16(__a) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)(__a)); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_movn_u32(__a) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_movn_u64(__a) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+/* qmovn: saturating narrow - each element of the 128-bit vector __a is first saturated (sat_s / sat_u), then its low half is packed into a 64-bit vector. */
+#define msa_qmovn_s16(__a) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_s_h((v8i16)(__a), 7)); \
+  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_qmovn_s32(__a) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_s_w((v4i32)(__a), 15)); \
+  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_qmovn_s64(__a) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_s_d((v2i64)(__a), 31)); \
+  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_qmovn_u16(__a) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)(__a), 7)); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_qmovn_u32(__a) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)(__a), 15)); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_qmovn_u64(__a) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)(__a), 31)); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+/* qmovun: saturating narrow of signed input to unsigned output - elements are clamped to >= 0 (max_s against zero), saturated as unsigned, then packed into a 64-bit vector. */
+#define msa_qmovun_s16(__a) \
+({ \
+  v8i16 __d = __builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \
+  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qmovun_s32(__a) \
+({ \
+  v4i32 __d = __builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \
+  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qmovun_s64(__a) \
+({ \
+  v2i64 __d = __builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)); \
+  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+/* Right-shift each element of a 128-bit vector by the immediate __b (srai for signed, srli for unsigned variants) and place the narrowed results in a 64-bit vector. */
+#define msa_shrn_n_s16(__a, __b) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__b))); \
+  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_s32(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srai_w((v4i32)(__a), (int)(__b))); \
+  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_s64(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srai_d((v2i64)(__a), (int)(__b))); \
+  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_u16(__a, __b) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srli_h((v8i16)(__a), (int)(__b))); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_u32(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srli_w((v4i32)(__a), (int)(__b))); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_u64(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srli_d((v2i64)(__a), (int)(__b))); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+/* Rounding right shift of each element of a 128-bit vector by the immediate __b (srari for signed, srlri for unsigned variants); the narrowed results are placed in a 64-bit vector. */
+#define msa_rshrn_n_s16(__a, __b) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srari_h((v8i16)(__a), (int)(__b))); \
+  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_s32(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srari_w((v4i32)(__a), (int)(__b))); \
+  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_s64(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srari_d((v2i64)(__a), (int)(__b))); \
+  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_u16(__a, __b) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srlri_h((v8i16)(__a), (int)(__b))); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_u32(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srlri_w((v4i32)(__a), (int)(__b))); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_u64(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srlri_d((v2i64)(__a), (int)(__b))); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+/* Rounding right-shift each element of a 128-bit vector by the immediate __b, saturate the results and place them in a 64-bit vector. */
+#define msa_qrshrn_n_s16(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__a), (int)(__b)), 7); \
+  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__d); \
+  (v8i8)__builtin_msa_copy_s_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_s32(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__a), (int)(__b)), 15); \
+  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__d); \
+  (v4i16)__builtin_msa_copy_s_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_s64(__a, __b) \
+({ \
+  v2i64 __d = __builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__a), (int)(__b)), 31); \
+  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__d); \
+  (v2i32)__builtin_msa_copy_s_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_u16(__a, __b) \
+({ \
+  v8u16 __d = __builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__a), (int)(__b)), 7); \
+  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__d); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_u32(__a, __b) \
+({ \
+  v4u32 __d = __builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__a), (int)(__b)), 15); \
+  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__d); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_u64(__a, __b) \
+({ \
+  v2u64 __d = __builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__a), (int)(__b)), 31); \
+  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__d); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+/* Rounding right-shift each element of a 128-bit vector by the immediate __b, saturate the results and place them in a 64-bit vector.
+   Input is signed and output is unsigned: elements are clamped to >= 0 (max_s against zero) before the shift. */
+#define msa_qrshrun_n_s16(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)), (int)(__b)); \
+  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrun_n_s32(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)), (int)(__b)); \
+  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrun_n_s64(__a, __b) \
+({ \
+  v2i64 __d = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)), (int)(__b)); \
+  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+/* pack: keep the low half of every element of __a and __b (pckev) and combine them into one 128-bit vector; __a is the second pckev operand, the one msa_movn_* reads back from doubleword 0. */
+#define msa_pack_s16(__a, __b) (__builtin_msa_pckev_b((v16i8)(__b), (v16i8)(__a)))
+#define msa_pack_s32(__a, __b) (__builtin_msa_pckev_h((v8i16)(__b), (v8i16)(__a)))
+#define msa_pack_s64(__a, __b) (__builtin_msa_pckev_w((v4i32)(__b), (v4i32)(__a)))
+#define msa_pack_u16(__a, __b) ((v16u8)__builtin_msa_pckev_b((v16i8)(__b), (v16i8)(__a)))
+#define msa_pack_u32(__a, __b) ((v8u16)__builtin_msa_pckev_h((v8i16)(__b), (v8i16)(__a)))
+#define msa_pack_u64(__a, __b) ((v4u32)__builtin_msa_pckev_w((v4i32)(__b), (v4i32)(__a)))
+
+/* qpack: saturate every element of __a and __b (sat_s / sat_u), then keep the low halves and combine both inputs into one 128-bit vector (pckev). */
+#define msa_qpack_s16(__a, __b) \
+(__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_s_h((v8i16)(__b), 7), (v16i8)__builtin_msa_sat_s_h((v8i16)(__a), 7)))
+#define msa_qpack_s32(__a, __b) \
+(__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_s_w((v4i32)(__b), 15), (v8i16)__builtin_msa_sat_s_w((v4i32)(__a), 15)))
+#define msa_qpack_s64(__a, __b) \
+(__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_s_d((v2i64)(__b), 31), (v4i32)__builtin_msa_sat_s_d((v2i64)(__a), 31)))
+#define msa_qpack_u16(__a, __b) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)(__b), 7), (v16i8)__builtin_msa_sat_u_h((v8u16)(__a), 7)))
+#define msa_qpack_u32(__a, __b) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)(__b), 15), (v8i16)__builtin_msa_sat_u_w((v4u32)(__a), 15)))
+#define msa_qpack_u64(__a, __b) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)(__b), 31), (v4i32)__builtin_msa_sat_u_d((v2u64)(__a), 31)))
+
+/* qpacku: signed input, unsigned output - elements of __a and __b are clamped to >= 0 (max_s against zero), saturated as unsigned, then packed into one 128-bit vector (pckev). */
+#define msa_qpacku_s16(__a, __b) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__b))), 7), \
+                              (v16i8)__builtin_msa_sat_u_h((v8u16)(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a))), 7)))
+#define msa_qpacku_s32(__a, __b) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__b))), 15), \
+                              (v8i16)__builtin_msa_sat_u_w((v4u32)(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a))), 15)))
+#define msa_qpacku_s64(__a, __b) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__b))), 31), \
+                              (v4i32)__builtin_msa_sat_u_d((v2u64)(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a))), 31)))
+
+/* packr: right-shift every element of __a and __b by the immediate __c (srai for signed, srli for unsigned variants), then pack the low halves into one 128-bit vector (pckev). */
+#define msa_packr_s16(__a, __b, __c) \
+(__builtin_msa_pckev_b((v16i8)__builtin_msa_srai_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__c))))
+#define msa_packr_s32(__a, __b, __c) \
+(__builtin_msa_pckev_h((v8i16)__builtin_msa_srai_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srai_w((v4i32)(__a), (int)(__c))))
+#define msa_packr_s64(__a, __b, __c) \
+(__builtin_msa_pckev_w((v4i32)__builtin_msa_srai_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srai_d((v2i64)(__a), (int)(__c))))
+#define msa_packr_u16(__a, __b, __c) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_srli_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srli_h((v8i16)(__a), (int)(__c))))
+#define msa_packr_u32(__a, __b, __c) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_srli_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srli_w((v4i32)(__a), (int)(__c))))
+#define msa_packr_u64(__a, __b, __c) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_srli_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srli_d((v2i64)(__a), (int)(__c))))
+
+/* rpackr: rounding right-shift every element of __a and __b by the immediate __c (srari for signed, srlri for unsigned variants), then pack the low halves into one 128-bit vector (pckev). */
+#define msa_rpackr_s16(__a, __b, __c) \
+(__builtin_msa_pckev_b((v16i8)__builtin_msa_srari_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srari_h((v8i16)(__a), (int)(__c))))
+#define msa_rpackr_s32(__a, __b, __c) \
+(__builtin_msa_pckev_h((v8i16)__builtin_msa_srari_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srari_w((v4i32)(__a), (int)(__c))))
+#define msa_rpackr_s64(__a, __b, __c) \
+(__builtin_msa_pckev_w((v4i32)__builtin_msa_srari_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srari_d((v2i64)(__a), (int)(__c))))
+#define msa_rpackr_u16(__a, __b, __c) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_srlri_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srlri_h((v8i16)(__a), (int)(__c))))
+#define msa_rpackr_u32(__a, __b, __c) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_srlri_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srlri_w((v4i32)(__a), (int)(__c))))
+#define msa_rpackr_u64(__a, __b, __c) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_srlri_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srlri_d((v2i64)(__a), (int)(__c))))
+
+/* qrpackr: rounding right-shift every element of __a and __b by the immediate __c, saturate the results, then pack the low halves into one 128-bit vector (pckev). */
+#define msa_qrpackr_s16(__a, __b, __c) \
+(__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__b), (int)(__c)), 7), \
+                       (v16i8)__builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__a), (int)(__c)), 7)))
+#define msa_qrpackr_s32(__a, __b, __c) \
+(__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__b), (int)(__c)), 15), \
+                       (v8i16)__builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__a), (int)(__c)), 15)))
+#define msa_qrpackr_s64(__a, __b, __c) \
+(__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__b), (int)(__c)), 31), \
+                       (v4i32)__builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__a), (int)(__c)), 31)))
+#define msa_qrpackr_u16(__a, __b, __c) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__b), (int)(__c)), 7), \
+                              (v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__a), (int)(__c)), 7)))
+#define msa_qrpackr_u32(__a, __b, __c) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__b), (int)(__c)), 15), \
+                              (v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__a), (int)(__c)), 15)))
+#define msa_qrpackr_u64(__a, __b, __c) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__b), (int)(__c)), 31), \
+                              (v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__a), (int)(__c)), 31)))
+
+/* qrpackru: clamp negative lanes to 0, rounding shift right by n, saturate to the unsigned half width, pack even lanes */
+#define msa_qrpackru_s16(a, b, n) \
+({ \
+  v8i16 _qa = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(a)), (int)(n)); \
+  v8i16 _qb = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(b)), (int)(n)); \
+  (v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)_qb, 7), (v16i8)__builtin_msa_sat_u_h((v8u16)_qa, 7)); \
+})
+
+#define msa_qrpackru_s32(a, b, n) \
+({ \
+  v4i32 _qa = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(a)), (int)(n)); \
+  v4i32 _qb = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(b)), (int)(n)); \
+  (v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)_qb, 15), (v8i16)__builtin_msa_sat_u_w((v4u32)_qa, 15)); \
+})
+
+#define msa_qrpackru_s64(a, b, n) \
+({ \
+  v2i64 _qa = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(a)), (int)(n)); \
+  v2i64 _qb = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(b)), (int)(n)); \
+  (v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)_qb, 31), (v4i32)__builtin_msa_sat_u_d((v2u64)_qa, 31)); \
+})
+
+/* Element-wise minimum: each lane of the result is the smaller of the matching lanes of a and b. */
+#define msa_minq_s8(a, b)   (__builtin_msa_min_s_b((a), (b)))
+#define msa_minq_s16(a, b)  (__builtin_msa_min_s_h((a), (b)))
+#define msa_minq_s32(a, b)  (__builtin_msa_min_s_w((a), (b)))
+#define msa_minq_s64(a, b)  (__builtin_msa_min_s_d((a), (b)))
+#define msa_minq_u8(a, b)   ((v16u8)__builtin_msa_min_u_b((a), (b)))
+#define msa_minq_u16(a, b)  ((v8u16)__builtin_msa_min_u_h((a), (b)))
+#define msa_minq_u32(a, b)  ((v4u32)__builtin_msa_min_u_w((a), (b)))
+#define msa_minq_u64(a, b)  ((v2u64)__builtin_msa_min_u_d((a), (b)))
+#define msa_minq_f32(a, b)  (__builtin_msa_fmin_w((a), (b)))
+#define msa_minq_f64(a, b)  (__builtin_msa_fmin_d((a), (b)))
+
+/* Element-wise maximum: each lane of the result is the larger of the matching lanes of a and b. */
+#define msa_maxq_s8(a, b)   (__builtin_msa_max_s_b((a), (b)))
+#define msa_maxq_s16(a, b)  (__builtin_msa_max_s_h((a), (b)))
+#define msa_maxq_s32(a, b)  (__builtin_msa_max_s_w((a), (b)))
+#define msa_maxq_s64(a, b)  (__builtin_msa_max_s_d((a), (b)))
+#define msa_maxq_u8(a, b)   ((v16u8)__builtin_msa_max_u_b((a), (b)))
+#define msa_maxq_u16(a, b)  ((v8u16)__builtin_msa_max_u_h((a), (b)))
+#define msa_maxq_u32(a, b)  ((v4u32)__builtin_msa_max_u_w((a), (b)))
+#define msa_maxq_u64(a, b)  ((v2u64)__builtin_msa_max_u_d((a), (b)))
+#define msa_maxq_f32(a, b)  (__builtin_msa_fmax_w((a), (b)))
+#define msa_maxq_f64(a, b)  (__builtin_msa_fmax_d((a), (b)))
+
+/* Vector type reinterpretation: cast vec to the vector type T. */
+#define MSA_TPV_REINTERPRET(T, vec) ((T)(vec))
+
+/* Widening horizontal add: odd elements of a plus even elements of b, each sum stored in a double-width lane. */
+/* v8i16 msa_hadd_s16(v16i8 a, v16i8 b) */
+#define msa_hadd_s16(a, b)  (__builtin_msa_hadd_s_h((v16i8)(a), (v16i8)(b)))
+/* v4i32 msa_hadd_s32(v8i16 a, v8i16 b) */
+#define msa_hadd_s32(a, b)  (__builtin_msa_hadd_s_w((v8i16)(a), (v8i16)(b)))
+/* v2i64 msa_hadd_s64(v4i32 a, v4i32 b) */
+#define msa_hadd_s64(a, b)  (__builtin_msa_hadd_s_d((v4i32)(a), (v4i32)(b)))
+
+/* Pack even: even elements of a go to the left half of the result, even elements of b to the right half. */
+#define msa_pckev_s8(a, b)   (__builtin_msa_pckev_b((v16i8)(a), (v16i8)(b)))
+#define msa_pckev_s16(a, b)  (__builtin_msa_pckev_h((v8i16)(a), (v8i16)(b)))
+#define msa_pckev_s32(a, b)  (__builtin_msa_pckev_w((v4i32)(a), (v4i32)(b)))
+#define msa_pckev_s64(a, b)  (__builtin_msa_pckev_d((v2i64)(a), (v2i64)(b)))
+
+/* Copy odd elements in __a to the left half and odd elements in __b to the right half and return the result vector. */
+#define msa_pckod_s8(__a, __b)  (__builtin_msa_pckod_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_pckod_s16(__a, __b) (__builtin_msa_pckod_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_pckod_s32(__a, __b) (__builtin_msa_pckod_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_pckod_s64(__a, __b) (__builtin_msa_pckod_d((v2i64)(__a), (v2i64)(__b)))
+
+#ifdef _MIPSEB  /* _MIPSEB build: mirror the lane index inside its range */
+#define LANE_IMM0_1(lane)   (0x1 - ((lane) & 0x1))
+#define LANE_IMM0_3(lane)   (0x3 - ((lane) & 0x3))
+#define LANE_IMM0_7(lane)   (0x7 - ((lane) & 0x7))
+#define LANE_IMM0_15(lane)  (0xF - ((lane) & 0xF))
+#else           /* otherwise: only wrap the lane index into its range */
+#define LANE_IMM0_1(lane)   ((lane) & 0x1)
+#define LANE_IMM0_3(lane)   ((lane) & 0x3)
+#define LANE_IMM0_7(lane)   ((lane) & 0x7)
+#define LANE_IMM0_15(lane)  ((lane) & 0xF)
+#endif
+
+#define msa_get_lane_u8(__a, __b)        ((uint8_t)(__a)[LANE_IMM0_7(__b)])   /* non-q getters: plain vector subscript, lane index wrapped by LANE_IMM0_* */
+#define msa_get_lane_s8(__a, __b)        ((int8_t)(__a)[LANE_IMM0_7(__b)])
+#define msa_get_lane_u16(__a, __b)       ((uint16_t)(__a)[LANE_IMM0_3(__b)])
+#define msa_get_lane_s16(__a, __b)       ((int16_t)(__a)[LANE_IMM0_3(__b)])
+#define msa_get_lane_u32(__a, __b)       ((uint32_t)(__a)[LANE_IMM0_1(__b)])
+#define msa_get_lane_s32(__a, __b)       ((int32_t)(__a)[LANE_IMM0_1(__b)])
+#define msa_get_lane_f32(__a, __b)       ((float)(__a)[LANE_IMM0_1(__b)])     /* same lane mask as the u32/s32 getters; LANE_IMM0_3 belongs to the q variant */
+#define msa_get_lane_s64(__a, __b)       ((int64_t)(__a)[LANE_IMM0_1(__b)])   /* NOTE(review): presumably a one-lane vector; LANE_IMM0_1 yields index 1 for lane 0 under _MIPSEB - confirm (also u64/f64) */
+#define msa_get_lane_u64(__a, __b)       ((uint64_t)(__a)[LANE_IMM0_1(__b)])
+#define msa_get_lane_f64(__a, __b)       ((double)(__a)[LANE_IMM0_1(__b)])
+#define msa_getq_lane_u8(__a, imm0_15)   ((uint8_t)__builtin_msa_copy_u_b((v16i8)(__a), imm0_15))  /* q getters: copy one element of a full-width vector */
+#define msa_getq_lane_s8(__a, imm0_15)   ((int8_t)__builtin_msa_copy_s_b(__a, imm0_15))
+#define msa_getq_lane_u16(__a, imm0_7)   ((uint16_t)__builtin_msa_copy_u_h((v8i16)(__a), imm0_7))
+#define msa_getq_lane_s16(__a, imm0_7)   ((int16_t)__builtin_msa_copy_s_h(__a, imm0_7))
+#define msa_getq_lane_u32(__a, imm0_3)   __builtin_msa_copy_u_w((v4i32)(__a), imm0_3)
+#define msa_getq_lane_s32                __builtin_msa_copy_s_w
+#define msa_getq_lane_f32(__a, __b)      ((float)(__a)[LANE_IMM0_3(__b)])
+#define msa_getq_lane_f64(__a, __b)      ((double)(__a)[LANE_IMM0_1(__b)])
+#if (__mips == 64)
+#define msa_getq_lane_u64(__a, imm0_1)   __builtin_msa_copy_u_d((v2i64)(__a), imm0_1)
+#define msa_getq_lane_s64                __builtin_msa_copy_s_d
+#else  /* no copy_{u,s}_d builtin is used here: fall back to vector subscripting */
+#define msa_getq_lane_u64(__a, imm0_1)   ((uint64_t)(__a)[LANE_IMM0_1(imm0_1)])
+#define msa_getq_lane_s64(__a, imm0_1)   ((int64_t)(__a)[LANE_IMM0_1(imm0_1)])
+#endif
+
+/* combine: build a vector of type T from the contents of x followed by the contents of y */
+#if (__mips == 64)
+#define __COMBINE_64_64(T, x, y)  ((T)((v2u64){((v1u64)(x))[0], ((v1u64)(y))[0]}))
+#else
+#define __COMBINE_64_64(T, x, y)  \
+    ((T)((v4u32){((v2u32)(x))[0], ((v2u32)(x))[1], ((v2u32)(y))[0], ((v2u32)(y))[1]}))
+#endif
+
+/* v16i8 msa_combine_s8(v8i8 a, v8i8 b) */
+#define msa_combine_s8(a, b)   __COMBINE_64_64(v16i8, a, b)
+
+/* v8i16 msa_combine_s16(v4i16 a, v4i16 b) */
+#define msa_combine_s16(a, b)  __COMBINE_64_64(v8i16, a, b)
+
+/* v4i32 msa_combine_s32(v2i32 a, v2i32 b) */
+#define msa_combine_s32(a, b)  __COMBINE_64_64(v4i32, a, b)
+
+/* v2i64 msa_combine_s64(v1i64 a, v1i64 b) */
+#define msa_combine_s64(a, b)  __COMBINE_64_64(v2i64, a, b)
+
+/* v4f32 msa_combine_f32(v2f32 a, v2f32 b) */
+#define msa_combine_f32(a, b)  __COMBINE_64_64(v4f32, a, b)
+
+/* v16u8 msa_combine_u8(v8u8 a, v8u8 b) */
+#define msa_combine_u8(a, b)   __COMBINE_64_64(v16u8, a, b)
+
+/* v8u16 msa_combine_u16(v4u16 a, v4u16 b) */
+#define msa_combine_u16(a, b)  __COMBINE_64_64(v8u16, a, b)
+
+/* v4u32 msa_combine_u32(v2u32 a, v2u32 b) */
+#define msa_combine_u32(a, b)  __COMBINE_64_64(v4u32, a, b)
+
+/* v2u64 msa_combine_u64(v1u64 a, v1u64 b) */
+#define msa_combine_u64(a, b)  __COMBINE_64_64(v2u64, a, b)
+
+/* v2f64 msa_combine_f64(v1f64 a, v1f64 b) */
+#define msa_combine_f64(a, b)  __COMBINE_64_64(v2f64, a, b)
+
+/* get_low, get_high: take 64-bit element 0 (low) or 1 (high) of q and cast it to vector type T */
+#if (__mips == 64)
+#define __GET_LOW(T, q)   ((T)((v1u64)(__builtin_msa_copy_u_d((v2i64)(q), 0))))
+#define __GET_HIGH(T, q)  ((T)((v1u64)(__builtin_msa_copy_u_d((v2i64)(q), 1))))
+#else
+#define __GET_LOW(T, q)   ((T)(((v2u64)(q))[0]))
+#define __GET_HIGH(T, q)  ((T)(((v2u64)(q))[1]))
+#endif
+
+/* v8i8 msa_get_low_s8(v16i8 q) */
+#define msa_get_low_s8(q)    __GET_LOW(v8i8, q)
+
+/* v4i16 msa_get_low_s16(v8i16 q) */
+#define msa_get_low_s16(q)   __GET_LOW(v4i16, q)
+
+/* v2i32 msa_get_low_s32(v4i32 q) */
+#define msa_get_low_s32(q)   __GET_LOW(v2i32, q)
+
+/* v1i64 msa_get_low_s64(v2i64 q) */
+#define msa_get_low_s64(q)   __GET_LOW(v1i64, q)
+
+/* v8u8 msa_get_low_u8(v16u8 q) */
+#define msa_get_low_u8(q)    __GET_LOW(v8u8, q)
+
+/* v4u16 msa_get_low_u16(v8u16 q) */
+#define msa_get_low_u16(q)   __GET_LOW(v4u16, q)
+
+/* v2u32 msa_get_low_u32(v4u32 q) */
+#define msa_get_low_u32(q)   __GET_LOW(v2u32, q)
+
+/* v1u64 msa_get_low_u64(v2u64 q) */
+#define msa_get_low_u64(q)   __GET_LOW(v1u64, q)
+
+/* v2f32 msa_get_low_f32(v4f32 q) */
+#define msa_get_low_f32(q)   __GET_LOW(v2f32, q)
+
+/* v1f64 msa_get_low_f64(v2f64 q) */
+#define msa_get_low_f64(q)   __GET_LOW(v1f64, q)
+
+/* v8i8 msa_get_high_s8(v16i8 q) */
+#define msa_get_high_s8(q)   __GET_HIGH(v8i8, q)
+
+/* v4i16 msa_get_high_s16(v8i16 q) */
+#define msa_get_high_s16(q)  __GET_HIGH(v4i16, q)
+
+/* v2i32 msa_get_high_s32(v4i32 q) */
+#define msa_get_high_s32(q)  __GET_HIGH(v2i32, q)
+
+/* v1i64 msa_get_high_s64(v2i64 q) */
+#define msa_get_high_s64(q)  __GET_HIGH(v1i64, q)
+
+/* v8u8 msa_get_high_u8(v16u8 q) */
+#define msa_get_high_u8(q)   __GET_HIGH(v8u8, q)
+
+/* v4u16 msa_get_high_u16(v8u16 q) */
+#define msa_get_high_u16(q)  __GET_HIGH(v4u16, q)
+
+/* v2u32 msa_get_high_u32(v4u32 q) */
+#define msa_get_high_u32(q)  __GET_HIGH(v2u32, q)
+
+/* v1u64 msa_get_high_u64(v2u64 q) */
+#define msa_get_high_u64(q)  __GET_HIGH(v1u64, q)
+
+/* v2f32 msa_get_high_f32(v4f32 q) */
+#define msa_get_high_f32(q)  __GET_HIGH(v2f32, q)
+
+/* v1f64 msa_get_high_f64(v2f64 q) */
+#define msa_get_high_f64(q)  __GET_HIGH(v1f64, q)
+
+/* Multiply by one lane: r[i] = v[i] * s[lane] */
+/* v4f32 msa_mulq_lane_f32(v4f32 v, v4f32 s, const int lane) */
+#define msa_mulq_lane_f32(v, s, lane)  ((v) * msa_getq_lane_f32(s, lane))
+
+/* Multiply-accumulate by one lane: r[i] = acc[i] + v[i] * s[lane] */
+/* v4f32 msa_mlaq_lane_f32(v4f32 acc, v4f32 v, v4f32 s, const int lane) */
+#define msa_mlaq_lane_f32(acc, v, s, lane)  ((acc) + ((v) * msa_getq_lane_f32(s, lane)))
+
+/* uint16_t msa_sum_u16(v8u16 v): sum of all lanes, cast to uint16_t */
+#define msa_sum_u16(v)                                      \
+({                                                          \
+  /* pairwise widening add: u16 lanes -> u32 lanes */       \
+  v4u32 _w32 = __builtin_msa_hadd_u_w(v, v);                \
+  /* pairwise widening add: u32 lanes -> u64 lanes */       \
+  v2u64 _d64 = __builtin_msa_hadd_u_d(_w32, _w32);          \
+  (uint16_t)(_d64[0] + _d64[1]);                            \
+})
+
+/* int32_t msa_sum_s16(v8i16 __a): sum of all lanes; the result is cast to int32_t, not int16_t */
+#define msa_sum_s16(__a)                                    \
+({                                                          \
+  /* pairwise widening add: s16 lanes -> s32 lanes */       \
+  v4i32 _w32 = __builtin_msa_hadd_s_w(__a, __a);            \
+  /* pairwise widening add: s32 lanes -> s64 lanes */       \
+  v2i64 _d64 = __builtin_msa_hadd_s_d(_w32, _w32);          \
+  (int32_t)(_d64[0] + _d64[1]);                             \
+})
+
+
+/* uint32_t msa_sum_u32(v4u32 v): sum of all lanes, cast to uint32_t */
+#define msa_sum_u32(v)                                      \
+({                                                          \
+  /* pairwise widening add: u32 lanes -> u64 lanes */       \
+  v2u64 _d64 = __builtin_msa_hadd_u_d(v, v);                \
+  (uint32_t)(_d64[0] + _d64[1]);                            \
+})
+
+/* int64_t msa_sum_s32(v4i32 __a): sum of all lanes; the result is cast to int64_t, not int32_t */
+#define msa_sum_s32(__a)                                    \
+({                                                          \
+  /* pairwise widening add: s32 lanes -> s64 lanes */       \
+  v2i64 _d64 = __builtin_msa_hadd_s_d(__a, __a);            \
+  (int64_t)(_d64[0] + _d64[1]);                             \
+})
+
+/* uint8_t msa_sum_u8(v16u8 v): sum of all lanes, cast to uint8_t */
+#define msa_sum_u8(v)                                       \
+({                                                          \
+  /* NOTE(review): the sum is truncated to 8 bits here, while msa_sum_s8 keeps 16 - confirm intent */ \
+  v8u16 _h16 = __builtin_msa_hadd_u_h(v, v);                \
+  /* second pairwise widening add: u16 lanes -> u32 lanes */ \
+  v4u32 _w32 = __builtin_msa_hadd_u_w(_h16, _h16);          \
+  (uint8_t)msa_sum_u32(_w32);                               \
+})
+
+/* int16_t msa_sum_s8(v16i8 __a): sum of all lanes; the result is cast to int16_t, not int8_t */
+#define msa_sum_s8(__a)                                     \
+({                                                          \
+  /* pairwise widening add: s8 lanes -> s16 lanes */        \
+  v8i16 _h16 = __builtin_msa_hadd_s_h(__a, __a);            \
+  /* pairwise widening add: s16 lanes -> s32 lanes */       \
+  v4i32 _w32 = __builtin_msa_hadd_s_w(_h16, _h16);          \
+  (int16_t)msa_sum_s32(_w32);                               \
+})
+
+/* float msa_sum_f32(v4f32 v): lanes 0..3 added left to right */
+#define msa_sum_f32(v)  ((v)[0] + (v)[1] + (v)[2] + (v)[3])
+
+/* v8u16 msa_paddlq_u8(v16u8 v): widening add of adjacent lane pairs */
+#define msa_paddlq_u8(v)   (__builtin_msa_hadd_u_h(v, v))
+
+/* v8i16 msa_paddlq_s8(v16i8 v) */
+#define msa_paddlq_s8(v)   (__builtin_msa_hadd_s_h(v, v))
+
+/* v4u32 msa_paddlq_u16(v8u16 v) */
+#define msa_paddlq_u16(v)  (__builtin_msa_hadd_u_w(v, v))
+
+/* v4i32 msa_paddlq_s16(v8i16 v) */
+#define msa_paddlq_s16(v)  (__builtin_msa_hadd_s_w(v, v))
+
+/* v2u64 msa_paddlq_u32(v4u32 v) */
+#define msa_paddlq_u32(v)  (__builtin_msa_hadd_u_d(v, v))
+
+/* v2i64 msa_paddlq_s32(v4i32 v) */
+#define msa_paddlq_s32(v)  (__builtin_msa_hadd_s_d(v, v))
+
+/* Widening initializer lists: every lane of x is cast to the wider element type; x is parenthesised so expression arguments subscript correctly. */
+#define V8U8_2_V8U16(x)   {(uint16_t)(x)[0], (uint16_t)(x)[1], (uint16_t)(x)[2], (uint16_t)(x)[3], (uint16_t)(x)[4], (uint16_t)(x)[5], (uint16_t)(x)[6], (uint16_t)(x)[7]}
+#define V8U8_2_V8I16(x)   {(int16_t)(x)[0], (int16_t)(x)[1], (int16_t)(x)[2], (int16_t)(x)[3], \
+                           (int16_t)(x)[4], (int16_t)(x)[5], (int16_t)(x)[6], (int16_t)(x)[7]}
+#define V8I8_2_V8I16(x)   {(int16_t)(x)[0], (int16_t)(x)[1], (int16_t)(x)[2], (int16_t)(x)[3], \
+                           (int16_t)(x)[4], (int16_t)(x)[5], (int16_t)(x)[6], (int16_t)(x)[7]}
+#define V4U16_2_V4U32(x)  {(uint32_t)(x)[0], (uint32_t)(x)[1], (uint32_t)(x)[2], (uint32_t)(x)[3]}
+#define V4U16_2_V4I32(x)  {(int32_t)(x)[0], (int32_t)(x)[1], (int32_t)(x)[2], (int32_t)(x)[3]}
+#define V4I16_2_V4I32(x)  {(int32_t)(x)[0], (int32_t)(x)[1], (int32_t)(x)[2], (int32_t)(x)[3]}
+#define V2U32_2_V2U64(x)  {(uint64_t)(x)[0], (uint64_t)(x)[1]}
+#define V2U32_2_V2I64(x)  {(int64_t)(x)[0], (int64_t)(x)[1]}
+
+/* v8u16 msa_mull_u8(v8u8 a, v8u8 b): widen both operands, then multiply lane by lane */
+#define msa_mull_u8(a, b)   ((v8u16)__builtin_msa_mulv_h((v8i16)V8U8_2_V8I16(a), (v8i16)V8U8_2_V8I16(b)))
+
+/* v8i16 msa_mull_s8(v8i8 a, v8i8 b) */
+#define msa_mull_s8(a, b)   (__builtin_msa_mulv_h((v8i16)V8I8_2_V8I16(a), (v8i16)V8I8_2_V8I16(b)))
+
+/* v4u32 msa_mull_u16(v4u16 a, v4u16 b) */
+#define msa_mull_u16(a, b)  ((v4u32)__builtin_msa_mulv_w((v4i32)V4U16_2_V4I32(a), (v4i32)V4U16_2_V4I32(b)))
+
+/* v4i32 msa_mull_s16(v4i16 a, v4i16 b) */
+#define msa_mull_s16(a, b)  (__builtin_msa_mulv_w((v4i32)V4I16_2_V4I32(a), (v4i32)V4I16_2_V4I32(b)))
+
+/* v2u64 msa_mull_u32(v2u32 a, v2u32 b) */
+#define msa_mull_u32(a, b)  ((v2u64)__builtin_msa_mulv_d((v2i64)V2U32_2_V2I64(a), (v2i64)V2U32_2_V2I64(b)))
+
+/* Bitwise AND via __builtin_msa_and_v; operands are viewed as v16u8 and the result is cast back. */
+#define msa_andq_u8(x, y)   ((v16u8)__builtin_msa_and_v((v16u8)(x), (v16u8)(y)))
+#define msa_andq_s8(x, y)   ((v16i8)__builtin_msa_and_v((v16u8)(x), (v16u8)(y)))
+#define msa_andq_u16(x, y)  ((v8u16)__builtin_msa_and_v((v16u8)(x), (v16u8)(y)))
+#define msa_andq_s16(x, y)  ((v8i16)__builtin_msa_and_v((v16u8)(x), (v16u8)(y)))
+#define msa_andq_u32(x, y)  ((v4u32)__builtin_msa_and_v((v16u8)(x), (v16u8)(y)))
+#define msa_andq_s32(x, y)  ((v4i32)__builtin_msa_and_v((v16u8)(x), (v16u8)(y)))
+#define msa_andq_u64(x, y)  ((v2u64)__builtin_msa_and_v((v16u8)(x), (v16u8)(y)))
+#define msa_andq_s64(x, y)  ((v2i64)__builtin_msa_and_v((v16u8)(x), (v16u8)(y)))
+
+/* Bitwise OR via __builtin_msa_or_v; operands are viewed as v16u8 and the result is cast back. */
+#define msa_orrq_u8(x, y)   ((v16u8)__builtin_msa_or_v((v16u8)(x), (v16u8)(y)))
+#define msa_orrq_s8(x, y)   ((v16i8)__builtin_msa_or_v((v16u8)(x), (v16u8)(y)))
+#define msa_orrq_u16(x, y)  ((v8u16)__builtin_msa_or_v((v16u8)(x), (v16u8)(y)))
+#define msa_orrq_s16(x, y)  ((v8i16)__builtin_msa_or_v((v16u8)(x), (v16u8)(y)))
+#define msa_orrq_u32(x, y)  ((v4u32)__builtin_msa_or_v((v16u8)(x), (v16u8)(y)))
+#define msa_orrq_s32(x, y)  ((v4i32)__builtin_msa_or_v((v16u8)(x), (v16u8)(y)))
+#define msa_orrq_u64(x, y)  ((v2u64)__builtin_msa_or_v((v16u8)(x), (v16u8)(y)))
+#define msa_orrq_s64(x, y)  ((v2i64)__builtin_msa_or_v((v16u8)(x), (v16u8)(y)))
+
+/* Bitwise XOR via __builtin_msa_xor_v; operands are viewed as v16u8 and the result is cast back. */
+#define msa_eorq_u8(x, y)   ((v16u8)__builtin_msa_xor_v((v16u8)(x), (v16u8)(y)))
+#define msa_eorq_s8(x, y)   ((v16i8)__builtin_msa_xor_v((v16u8)(x), (v16u8)(y)))
+#define msa_eorq_u16(x, y)  ((v8u16)__builtin_msa_xor_v((v16u8)(x), (v16u8)(y)))
+#define msa_eorq_s16(x, y)  ((v8i16)__builtin_msa_xor_v((v16u8)(x), (v16u8)(y)))
+#define msa_eorq_u32(x, y)  ((v4u32)__builtin_msa_xor_v((v16u8)(x), (v16u8)(y)))
+#define msa_eorq_s32(x, y)  ((v4i32)__builtin_msa_xor_v((v16u8)(x), (v16u8)(y)))
+#define msa_eorq_u64(x, y)  ((v2u64)__builtin_msa_xor_v((v16u8)(x), (v16u8)(y)))
+#define msa_eorq_s64(x, y)  ((v2i64)__builtin_msa_xor_v((v16u8)(x), (v16u8)(y)))
+
+/* Bitwise NOT: XOR every byte with 0xFF through __builtin_msa_xori_b, then cast back to the lane type. */
+#define msa_mvnq_u8(x)   ((v16u8)__builtin_msa_xori_b((v16u8)(x), 0xFF))
+#define msa_mvnq_s8(x)   ((v16i8)__builtin_msa_xori_b((v16u8)(x), 0xFF))
+#define msa_mvnq_u16(x)  ((v8u16)__builtin_msa_xori_b((v16u8)(x), 0xFF))
+#define msa_mvnq_s16(x)  ((v8i16)__builtin_msa_xori_b((v16u8)(x), 0xFF))
+#define msa_mvnq_u32(x)  ((v4u32)__builtin_msa_xori_b((v16u8)(x), 0xFF))
+#define msa_mvnq_s32(x)  ((v4i32)__builtin_msa_xori_b((v16u8)(x), 0xFF))
+#define msa_mvnq_u64(x)  ((v2u64)__builtin_msa_xori_b((v16u8)(x), 0xFF))
+#define msa_mvnq_s64(x)  ((v2i64)__builtin_msa_xori_b((v16u8)(x), 0xFF))
+
+/* Compare equal: r[i] = (lhs[i] == rhs[i]) ? all ones : all zeros */
+#define msa_ceqq_u8(lhs, rhs)   ((v16u8)__builtin_msa_ceq_b((v16i8)(lhs), (v16i8)(rhs)))
+#define msa_ceqq_s8(lhs, rhs)   ((v16u8)__builtin_msa_ceq_b((v16i8)(lhs), (v16i8)(rhs)))
+#define msa_ceqq_u16(lhs, rhs)  ((v8u16)__builtin_msa_ceq_h((v8i16)(lhs), (v8i16)(rhs)))
+#define msa_ceqq_s16(lhs, rhs)  ((v8u16)__builtin_msa_ceq_h((v8i16)(lhs), (v8i16)(rhs)))
+#define msa_ceqq_u32(lhs, rhs)  ((v4u32)__builtin_msa_ceq_w((v4i32)(lhs), (v4i32)(rhs)))
+#define msa_ceqq_s32(lhs, rhs)  ((v4u32)__builtin_msa_ceq_w((v4i32)(lhs), (v4i32)(rhs)))
+#define msa_ceqq_f32(lhs, rhs)  ((v4u32)__builtin_msa_fceq_w((v4f32)(lhs), (v4f32)(rhs)))
+#define msa_ceqq_u64(lhs, rhs)  ((v2u64)__builtin_msa_ceq_d((v2i64)(lhs), (v2i64)(rhs)))
+#define msa_ceqq_s64(lhs, rhs)  ((v2u64)__builtin_msa_ceq_d((v2i64)(lhs), (v2i64)(rhs)))
+#define msa_ceqq_f64(lhs, rhs)  ((v2u64)__builtin_msa_fceq_d((v2f64)(lhs), (v2f64)(rhs)))
+
+/* Compare less-than: r[i] = (lhs[i] < rhs[i]) ? all ones : all zeros */
+#define msa_cltq_u8(lhs, rhs)   ((v16u8)__builtin_msa_clt_u_b((v16u8)(lhs), (v16u8)(rhs)))
+#define msa_cltq_s8(lhs, rhs)   ((v16u8)__builtin_msa_clt_s_b((v16i8)(lhs), (v16i8)(rhs)))
+#define msa_cltq_u16(lhs, rhs)  ((v8u16)__builtin_msa_clt_u_h((v8u16)(lhs), (v8u16)(rhs)))
+#define msa_cltq_s16(lhs, rhs)  ((v8u16)__builtin_msa_clt_s_h((v8i16)(lhs), (v8i16)(rhs)))
+#define msa_cltq_u32(lhs, rhs)  ((v4u32)__builtin_msa_clt_u_w((v4u32)(lhs), (v4u32)(rhs)))
+#define msa_cltq_s32(lhs, rhs)  ((v4u32)__builtin_msa_clt_s_w((v4i32)(lhs), (v4i32)(rhs)))
+#define msa_cltq_f32(lhs, rhs)  ((v4u32)__builtin_msa_fclt_w((v4f32)(lhs), (v4f32)(rhs)))
+#define msa_cltq_u64(lhs, rhs)  ((v2u64)__builtin_msa_clt_u_d((v2u64)(lhs), (v2u64)(rhs)))
+#define msa_cltq_s64(lhs, rhs)  ((v2u64)__builtin_msa_clt_s_d((v2i64)(lhs), (v2i64)(rhs)))
+#define msa_cltq_f64(lhs, rhs)  ((v2u64)__builtin_msa_fclt_d((v2f64)(lhs), (v2f64)(rhs)))
+
+/* Compare greater-than: r[i] = (lhs[i] > rhs[i]) ? all ones : all zeros; implemented as clt with swapped operands */
+#define msa_cgtq_u8(lhs, rhs)   ((v16u8)__builtin_msa_clt_u_b((v16u8)(rhs), (v16u8)(lhs)))
+#define msa_cgtq_s8(lhs, rhs)   ((v16u8)__builtin_msa_clt_s_b((v16i8)(rhs), (v16i8)(lhs)))
+#define msa_cgtq_u16(lhs, rhs)  ((v8u16)__builtin_msa_clt_u_h((v8u16)(rhs), (v8u16)(lhs)))
+#define msa_cgtq_s16(lhs, rhs)  ((v8u16)__builtin_msa_clt_s_h((v8i16)(rhs), (v8i16)(lhs)))
+#define msa_cgtq_u32(lhs, rhs)  ((v4u32)__builtin_msa_clt_u_w((v4u32)(rhs), (v4u32)(lhs)))
+#define msa_cgtq_s32(lhs, rhs)  ((v4u32)__builtin_msa_clt_s_w((v4i32)(rhs), (v4i32)(lhs)))
+#define msa_cgtq_f32(lhs, rhs)  ((v4u32)__builtin_msa_fclt_w((v4f32)(rhs), (v4f32)(lhs)))
+#define msa_cgtq_u64(lhs, rhs)  ((v2u64)__builtin_msa_clt_u_d((v2u64)(rhs), (v2u64)(lhs)))
+#define msa_cgtq_s64(lhs, rhs)  ((v2u64)__builtin_msa_clt_s_d((v2i64)(rhs), (v2i64)(lhs)))
+#define msa_cgtq_f64(lhs, rhs)  ((v2u64)__builtin_msa_fclt_d((v2f64)(rhs), (v2f64)(lhs)))
+
+/* Compare less-or-equal: r[i] = (lhs[i] <= rhs[i]) ? all ones : all zeros */
+#define msa_cleq_u8(lhs, rhs)   ((v16u8)__builtin_msa_cle_u_b((v16u8)(lhs), (v16u8)(rhs)))
+#define msa_cleq_s8(lhs, rhs)   ((v16u8)__builtin_msa_cle_s_b((v16i8)(lhs), (v16i8)(rhs)))
+#define msa_cleq_u16(lhs, rhs)  ((v8u16)__builtin_msa_cle_u_h((v8u16)(lhs), (v8u16)(rhs)))
+#define msa_cleq_s16(lhs, rhs)  ((v8u16)__builtin_msa_cle_s_h((v8i16)(lhs), (v8i16)(rhs)))
+#define msa_cleq_u32(lhs, rhs)  ((v4u32)__builtin_msa_cle_u_w((v4u32)(lhs), (v4u32)(rhs)))
+#define msa_cleq_s32(lhs, rhs)  ((v4u32)__builtin_msa_cle_s_w((v4i32)(lhs), (v4i32)(rhs)))
+#define msa_cleq_f32(lhs, rhs)  ((v4u32)__builtin_msa_fcle_w((v4f32)(lhs), (v4f32)(rhs)))
+#define msa_cleq_u64(lhs, rhs)  ((v2u64)__builtin_msa_cle_u_d((v2u64)(lhs), (v2u64)(rhs)))
+#define msa_cleq_s64(lhs, rhs)  ((v2u64)__builtin_msa_cle_s_d((v2i64)(lhs), (v2i64)(rhs)))
+#define msa_cleq_f64(lhs, rhs)  ((v2u64)__builtin_msa_fcle_d((v2f64)(lhs), (v2f64)(rhs)))
+
+/* Compare greater-or-equal: r[i] = (lhs[i] >= rhs[i]) ? all ones : all zeros; implemented as cle with swapped operands */
+#define msa_cgeq_u8(lhs, rhs)   ((v16u8)__builtin_msa_cle_u_b((v16u8)(rhs), (v16u8)(lhs)))
+#define msa_cgeq_s8(lhs, rhs)   ((v16u8)__builtin_msa_cle_s_b((v16i8)(rhs), (v16i8)(lhs)))
+#define msa_cgeq_u16(lhs, rhs)  ((v8u16)__builtin_msa_cle_u_h((v8u16)(rhs), (v8u16)(lhs)))
+#define msa_cgeq_s16(lhs, rhs)  ((v8u16)__builtin_msa_cle_s_h((v8i16)(rhs), (v8i16)(lhs)))
+#define msa_cgeq_u32(lhs, rhs)  ((v4u32)__builtin_msa_cle_u_w((v4u32)(rhs), (v4u32)(lhs)))
+#define msa_cgeq_s32(lhs, rhs)  ((v4u32)__builtin_msa_cle_s_w((v4i32)(rhs), (v4i32)(lhs)))
+#define msa_cgeq_f32(lhs, rhs)  ((v4u32)__builtin_msa_fcle_w((v4f32)(rhs), (v4f32)(lhs)))
+#define msa_cgeq_u64(lhs, rhs)  ((v2u64)__builtin_msa_cle_u_d((v2u64)(rhs), (v2u64)(lhs)))
+#define msa_cgeq_s64(lhs, rhs)  ((v2u64)__builtin_msa_cle_s_d((v2i64)(rhs), (v2i64)(lhs)))
+#define msa_cgeq_f64(lhs, rhs)  ((v2u64)__builtin_msa_fcle_d((v2f64)(rhs), (v2f64)(lhs)))
+
+/* Shift left logical by a per-lane count: r[i] = v[i] << cnt[i] */
+#define msa_shlq_u8(v, cnt)   ((v16u8)__builtin_msa_sll_b((v16i8)(v), (v16i8)(cnt)))
+#define msa_shlq_s8(v, cnt)   ((v16i8)__builtin_msa_sll_b((v16i8)(v), (v16i8)(cnt)))
+#define msa_shlq_u16(v, cnt)  ((v8u16)__builtin_msa_sll_h((v8i16)(v), (v8i16)(cnt)))
+#define msa_shlq_s16(v, cnt)  ((v8i16)__builtin_msa_sll_h((v8i16)(v), (v8i16)(cnt)))
+#define msa_shlq_u32(v, cnt)  ((v4u32)__builtin_msa_sll_w((v4i32)(v), (v4i32)(cnt)))
+#define msa_shlq_s32(v, cnt)  ((v4i32)__builtin_msa_sll_w((v4i32)(v), (v4i32)(cnt)))
+#define msa_shlq_u64(v, cnt)  ((v2u64)__builtin_msa_sll_d((v2i64)(v), (v2i64)(cnt)))
+#define msa_shlq_s64(v, cnt)  ((v2i64)__builtin_msa_sll_d((v2i64)(v), (v2i64)(cnt)))
+
+/* Shift left logical by an immediate: r[i] = v[i] << imm */
+#define msa_shlq_n_u8(v, imm)   ((v16u8)__builtin_msa_slli_b((v16i8)(v), imm))
+#define msa_shlq_n_s8(v, imm)   ((v16i8)__builtin_msa_slli_b((v16i8)(v), imm))
+#define msa_shlq_n_u16(v, imm)  ((v8u16)__builtin_msa_slli_h((v8i16)(v), imm))
+#define msa_shlq_n_s16(v, imm)  ((v8i16)__builtin_msa_slli_h((v8i16)(v), imm))
+#define msa_shlq_n_u32(v, imm)  ((v4u32)__builtin_msa_slli_w((v4i32)(v), imm))
+#define msa_shlq_n_s32(v, imm)  ((v4i32)__builtin_msa_slli_w((v4i32)(v), imm))
+#define msa_shlq_n_u64(v, imm)  ((v2u64)__builtin_msa_slli_d((v2i64)(v), imm))
+#define msa_shlq_n_s64(v, imm)  ((v2i64)__builtin_msa_slli_d((v2i64)(v), imm))
+
+/* Shift right by a per-lane count: r[i] = v[i] >> cnt[i] (logical for unsigned, arithmetic for signed) */
+#define msa_shrq_u8(v, cnt)   ((v16u8)__builtin_msa_srl_b((v16i8)(v), (v16i8)(cnt)))
+#define msa_shrq_s8(v, cnt)   ((v16i8)__builtin_msa_sra_b((v16i8)(v), (v16i8)(cnt)))
+#define msa_shrq_u16(v, cnt)  ((v8u16)__builtin_msa_srl_h((v8i16)(v), (v8i16)(cnt)))
+#define msa_shrq_s16(v, cnt)  ((v8i16)__builtin_msa_sra_h((v8i16)(v), (v8i16)(cnt)))
+#define msa_shrq_u32(v, cnt)  ((v4u32)__builtin_msa_srl_w((v4i32)(v), (v4i32)(cnt)))
+#define msa_shrq_s32(v, cnt)  ((v4i32)__builtin_msa_sra_w((v4i32)(v), (v4i32)(cnt)))
+#define msa_shrq_u64(v, cnt)  ((v2u64)__builtin_msa_srl_d((v2i64)(v), (v2i64)(cnt)))
+#define msa_shrq_s64(v, cnt)  ((v2i64)__builtin_msa_sra_d((v2i64)(v), (v2i64)(cnt)))
+
+/* Shift right by an immediate: r[i] = v[i] >> imm (logical for unsigned, arithmetic for signed) */
+#define msa_shrq_n_u8(v, imm)   ((v16u8)__builtin_msa_srli_b((v16i8)(v), imm))
+#define msa_shrq_n_s8(v, imm)   ((v16i8)__builtin_msa_srai_b((v16i8)(v), imm))
+#define msa_shrq_n_u16(v, imm)  ((v8u16)__builtin_msa_srli_h((v8i16)(v), imm))
+#define msa_shrq_n_s16(v, imm)  ((v8i16)__builtin_msa_srai_h((v8i16)(v), imm))
+#define msa_shrq_n_u32(v, imm)  ((v4u32)__builtin_msa_srli_w((v4i32)(v), imm))
+#define msa_shrq_n_s32(v, imm)  ((v4i32)__builtin_msa_srai_w((v4i32)(v), imm))
+#define msa_shrq_n_u64(v, imm)  ((v2u64)__builtin_msa_srli_d((v2i64)(v), imm))
+#define msa_shrq_n_s64(v, imm)  ((v2i64)__builtin_msa_srai_d((v2i64)(v), imm))
+
+/* Rounding shift right by an immediate: r[i] = v[i] >> imm, rounded (srlri for unsigned, srari for signed) */
+#define msa_rshrq_n_u8(v, imm)   ((v16u8)__builtin_msa_srlri_b((v16i8)(v), imm))
+#define msa_rshrq_n_s8(v, imm)   ((v16i8)__builtin_msa_srari_b((v16i8)(v), imm))
+#define msa_rshrq_n_u16(v, imm)  ((v8u16)__builtin_msa_srlri_h((v8i16)(v), imm))
+#define msa_rshrq_n_s16(v, imm)  ((v8i16)__builtin_msa_srari_h((v8i16)(v), imm))
+#define msa_rshrq_n_u32(v, imm)  ((v4u32)__builtin_msa_srlri_w((v4i32)(v), imm))
+#define msa_rshrq_n_s32(v, imm)  ((v4i32)__builtin_msa_srari_w((v4i32)(v), imm))
+#define msa_rshrq_n_u64(v, imm)  ((v2u64)__builtin_msa_srlri_d((v2i64)(v), imm))
+#define msa_rshrq_n_s64(v, imm)  ((v2i64)__builtin_msa_srari_d((v2i64)(v), imm))
+
+/* Vector rounding arithmetic shift right by per-lane counts, qrshr -> ri = ai >> bi (rounded); uses the builtin directly like every other macro here */
+#define msa_qrshrq_s32(a, b)  ((v4i32)__builtin_msa_srar_w((v4i32)(a), (v4i32)(b)))
+
+/* Aliases that give the MSA builtins the msa_<op>q_<type> naming style used by intrin_msa.hpp. */
+#define msa_qaddq_u8          __builtin_msa_adds_u_b  /* qaddq: saturating add (adds_*) */
+#define msa_qaddq_s8          __builtin_msa_adds_s_b
+#define msa_qaddq_u16         __builtin_msa_adds_u_h
+#define msa_qaddq_s16         __builtin_msa_adds_s_h
+#define msa_qaddq_u32         __builtin_msa_adds_u_w
+#define msa_qaddq_s32         __builtin_msa_adds_s_w
+#define msa_qaddq_u64         __builtin_msa_adds_u_d
+#define msa_qaddq_s64         __builtin_msa_adds_s_d
+#define msa_addq_u8(a, b)     ((v16u8)__builtin_msa_addv_b((v16i8)(a), (v16i8)(b)))  /* addq: plain add (addv_*); unsigned forms cast through the signed vector type */
+#define msa_addq_s8           __builtin_msa_addv_b
+#define msa_addq_u16(a, b)    ((v8u16)__builtin_msa_addv_h((v8i16)(a), (v8i16)(b)))
+#define msa_addq_s16          __builtin_msa_addv_h
+#define msa_addq_u32(a, b)    ((v4u32)__builtin_msa_addv_w((v4i32)(a), (v4i32)(b)))
+#define msa_addq_s32          __builtin_msa_addv_w
+#define msa_addq_f32          __builtin_msa_fadd_w
+#define msa_addq_u64(a, b)    ((v2u64)__builtin_msa_addv_d((v2i64)(a), (v2i64)(b)))
+#define msa_addq_s64          __builtin_msa_addv_d
+#define msa_addq_f64          __builtin_msa_fadd_d
+#define msa_qsubq_u8          __builtin_msa_subs_u_b  /* qsubq: saturating subtract (subs_*), 8- and 16-bit lanes only */
+#define msa_qsubq_s8          __builtin_msa_subs_s_b
+#define msa_qsubq_u16         __builtin_msa_subs_u_h
+#define msa_qsubq_s16         __builtin_msa_subs_s_h
+#define msa_subq_u8(a, b)     ((v16u8)__builtin_msa_subv_b((v16i8)(a), (v16i8)(b)))  /* subq: plain subtract (subv_*) */
+#define msa_subq_s8           __builtin_msa_subv_b
+#define msa_subq_u16(a, b)    ((v8u16)__builtin_msa_subv_h((v8i16)(a), (v8i16)(b)))
+#define msa_subq_s16          __builtin_msa_subv_h
+#define msa_subq_u32(a, b)    ((v4u32)__builtin_msa_subv_w((v4i32)(a), (v4i32)(b)))
+#define msa_subq_s32          __builtin_msa_subv_w
+#define msa_subq_f32          __builtin_msa_fsub_w
+#define msa_subq_u64(a, b)    ((v2u64)__builtin_msa_subv_d((v2i64)(a), (v2i64)(b)))
+#define msa_subq_s64          __builtin_msa_subv_d
+#define msa_subq_f64          __builtin_msa_fsub_d
+#define msa_mulq_u8(a, b)     ((v16u8)__builtin_msa_mulv_b((v16i8)(a), (v16i8)(b)))  /* mulq: lane-wise multiply (mulv_*), result cast to the lane type */
+#define msa_mulq_s8(a, b)     ((v16i8)__builtin_msa_mulv_b((v16i8)(a), (v16i8)(b)))
+#define msa_mulq_u16(a, b)    ((v8u16)__builtin_msa_mulv_h((v8i16)(a), (v8i16)(b)))
+#define msa_mulq_s16(a, b)    ((v8i16)__builtin_msa_mulv_h((v8i16)(a), (v8i16)(b)))
+#define msa_mulq_u32(a, b)    ((v4u32)__builtin_msa_mulv_w((v4i32)(a), (v4i32)(b)))
+#define msa_mulq_s32(a, b)    ((v4i32)__builtin_msa_mulv_w((v4i32)(a), (v4i32)(b)))
+#define msa_mulq_u64(a, b)    ((v2u64)__builtin_msa_mulv_d((v2i64)(a), (v2i64)(b)))
+#define msa_mulq_s64(a, b)    ((v2i64)__builtin_msa_mulv_d((v2i64)(a), (v2i64)(b)))
+#define msa_mulq_f32          __builtin_msa_fmul_w
+#define msa_mulq_f64          __builtin_msa_fmul_d
+#define msa_divq_f32          __builtin_msa_fdiv_w  /* divq: floating-point divide only */
+#define msa_divq_f64          __builtin_msa_fdiv_d
+#define msa_dotp_s_h          __builtin_msa_dotp_s_h  /* dotp / dpadd: direct renames of the dot-product builtins */
+#define msa_dotp_s_w          __builtin_msa_dotp_s_w
+#define msa_dotp_s_d          __builtin_msa_dotp_s_d
+#define msa_dotp_u_h          __builtin_msa_dotp_u_h
+#define msa_dotp_u_w          __builtin_msa_dotp_u_w
+#define msa_dotp_u_d          __builtin_msa_dotp_u_d
+#define msa_dpadd_s_h         __builtin_msa_dpadd_s_h
+#define msa_dpadd_s_w         __builtin_msa_dpadd_s_w
+#define msa_dpadd_s_d         __builtin_msa_dpadd_s_d
+#define msa_dpadd_u_h         __builtin_msa_dpadd_u_h
+#define msa_dpadd_u_w         __builtin_msa_dpadd_u_w
+#define msa_dpadd_u_d         __builtin_msa_dpadd_u_d
+
+#define ILVRL_B2(T, a, b, out_r, out_l) do {  /* byte interleave: ilvr result -> out_r, ilvl result -> out_l */ \
+      out_r = (T)__builtin_msa_ilvr_b((v16i8)(a), (v16i8)(b));      \
+      out_l = (T)__builtin_msa_ilvl_b((v16i8)(a), (v16i8)(b));      \
+    } while (0)
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
+#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
+
+#define ILVRL_H2(T, a, b, out_r, out_l) do {  /* halfword interleave: ilvr result -> out_r, ilvl result -> out_l */ \
+      out_r = (T)__builtin_msa_ilvr_h((v8i16)(a), (v8i16)(b));      \
+      out_l = (T)__builtin_msa_ilvl_h((v8i16)(a), (v8i16)(b));      \
+    } while (0)
+#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
+#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
+#define ILVRL_H2_UH(...) ILVRL_H2(v8u16, __VA_ARGS__)
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
+#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
+#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__)
+
+#define ILVRL_W2(T, a, b, out_r, out_l) do {  /* word interleave: ilvr result -> out_r, ilvl result -> out_l */ \
+      out_r = (T)__builtin_msa_ilvr_w((v4i32)(a), (v4i32)(b));      \
+      out_l = (T)__builtin_msa_ilvl_w((v4i32)(a), (v4i32)(b));      \
+    } while (0)
+#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+#define ILVRL_W2_UW(...) ILVRL_W2(v4u32, __VA_ARGS__)
+
+/* absq, qabsq (r = |a|;): integer forms apply add_a / adds_a against a zero vector, float forms clear bit 31 / 63 via bclri */
+#define msa_absq_s8(a)        __builtin_msa_add_a_b(a, __builtin_msa_fill_b(0))
+#define msa_absq_s16(a)       __builtin_msa_add_a_h(a, __builtin_msa_fill_h(0))
+#define msa_absq_s32(a)       __builtin_msa_add_a_w(a, __builtin_msa_fill_w(0))
+#define msa_absq_s64(a)       __builtin_msa_add_a_d(a, __builtin_msa_fill_d(0))
+#define msa_absq_f32(a)       ((v4f32)__builtin_msa_bclri_w((v4u32)(a), 31))
+#define msa_absq_f64(a)       ((v2f64)__builtin_msa_bclri_d((v2u64)(a), 63))
+#define msa_qabsq_s8(a)       __builtin_msa_adds_a_b(a, __builtin_msa_fill_b(0))
+#define msa_qabsq_s16(a)      __builtin_msa_adds_a_h(a, __builtin_msa_fill_h(0))
+#define msa_qabsq_s32(a)      __builtin_msa_adds_a_w(a, __builtin_msa_fill_w(0))
+#define msa_qabsq_s64(a)      __builtin_msa_adds_a_d(a, __builtin_msa_fill_d(0))
+
+/* abdq, qabdq (r = |a - b|;): integer abdq aliases the asub builtins; float abdq and qabdq are a subtract wrapped in msa_absq / msa_qabsq */
+#define msa_abdq_u8           __builtin_msa_asub_u_b
+#define msa_abdq_s8           __builtin_msa_asub_s_b
+#define msa_abdq_u16          __builtin_msa_asub_u_h
+#define msa_abdq_s16          __builtin_msa_asub_s_h
+#define msa_abdq_u32          __builtin_msa_asub_u_w
+#define msa_abdq_s32          __builtin_msa_asub_s_w
+#define msa_abdq_u64          __builtin_msa_asub_u_d
+#define msa_abdq_s64          __builtin_msa_asub_s_d
+#define msa_abdq_f32(a, b)    msa_absq_f32(__builtin_msa_fsub_w(a, b))
+#define msa_abdq_f64(a, b)    msa_absq_f64(__builtin_msa_fsub_d(a, b))
+#define msa_qabdq_s8(a, b)    msa_qabsq_s8(__builtin_msa_subs_s_b(a, b))
+#define msa_qabdq_s16(a, b)   msa_qabsq_s16(__builtin_msa_subs_s_h(a, b))
+#define msa_qabdq_s32(a, b)   msa_qabsq_s32(__builtin_msa_subs_s_w(a, b))
+#define msa_qabdq_s64(a, b)   msa_qabsq_s64(__builtin_msa_subs_s_d(a, b))
+
+/* sqrtq, rsqrtq (direct aliases for the fsqrt / frsqrt builtins, f32 and f64 lanes) */
+#define msa_sqrtq_f32         __builtin_msa_fsqrt_w
+#define msa_sqrtq_f64         __builtin_msa_fsqrt_d
+#define msa_rsqrtq_f32        __builtin_msa_frsqrt_w
+#define msa_rsqrtq_f64        __builtin_msa_frsqrt_d
+
+
+/* mlaq: r = a + b * c; (multiply-accumulate on v4i32, emitted as a single maddv.w via inline asm) */
+__extension__ extern __inline v4i32
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+msa_mlaq_s32(v4i32 __a, v4i32 __b, v4i32 __c)
+{
+  __asm__ volatile("maddv.w %w[__a], %w[__b], %w[__c]\n"
+               // Output: __a is a read-write ("+") operand, accumulator in and result out
+               : [__a] "+f"(__a)
+               // Inputs: the two multiplicands, read-only
+               : [__b] "f"(__b), [__c] "f"(__c));
+  return __a;
+}
+
+__extension__ extern __inline v2i64 /* mlaq for v2i64: r = a + b * c, emitted as maddv.d */
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+msa_mlaq_s64(v2i64 __a, v2i64 __b, v2i64 __c)
+{
+  __asm__ volatile("maddv.d %w[__a], %w[__b], %w[__c]\n"
+               // Output: __a is a read-write ("+") operand, accumulator in and result out
+               : [__a] "+f"(__a)
+               // Inputs: the two multiplicands, read-only
+               : [__b] "f"(__b), [__c] "f"(__c));
+  return __a;
+}
+
+__extension__ extern __inline v4f32 /* mlaq for v4f32: r = a + b * c, emitted as fmadd.w */
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+msa_mlaq_f32(v4f32 __a, v4f32 __b, v4f32 __c)
+{
+  __asm__ volatile("fmadd.w %w[__a], %w[__b], %w[__c]\n"
+               // Output: __a is a read-write ("+") operand, accumulator in and result out
+               : [__a] "+f"(__a)
+               // Inputs: the two multiplicands, read-only
+               : [__b] "f"(__b), [__c] "f"(__c));
+  return __a;
+}
+
+__extension__ extern __inline v2f64 /* mlaq for v2f64: r = a + b * c, emitted as fmadd.d */
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+msa_mlaq_f64(v2f64 __a, v2f64 __b, v2f64 __c)
+{
+  __asm__ volatile("fmadd.d %w[__a], %w[__b], %w[__c]\n"
+               // Output: __a is a read-write ("+") operand, accumulator in and result out
+               : [__a] "+f"(__a)
+               // Inputs: the two multiplicands, read-only
+               : [__b] "f"(__b), [__c] "f"(__c));
+  return __a;
+}
+
+/* cntq (direct aliases for the pcnt builtins, one per lane width) */
+#define msa_cntq_s8           __builtin_msa_pcnt_b
+#define msa_cntq_s16          __builtin_msa_pcnt_h
+#define msa_cntq_s32          __builtin_msa_pcnt_w
+#define msa_cntq_s64          __builtin_msa_pcnt_d
+
+/* bslq (a: mask; r = b where the mask is 0, r = c where the mask is 1; alias for the bsel_v builtin) */
+#define msa_bslq_u8           __builtin_msa_bsel_v
+
+/* ilvrq, ilvlq (lane order below holds for EL only; for arguments (a, b): ilvrq: b0, a0, b1, a1; ilvlq: b2, a2, b3, a3;) */
+#define msa_ilvrq_s8          __builtin_msa_ilvr_b
+#define msa_ilvrq_s16         __builtin_msa_ilvr_h
+#define msa_ilvrq_s32         __builtin_msa_ilvr_w
+#define msa_ilvrq_s64         __builtin_msa_ilvr_d
+#define msa_ilvlq_s8          __builtin_msa_ilvl_b
+#define msa_ilvlq_s16         __builtin_msa_ilvl_h
+#define msa_ilvlq_s32         __builtin_msa_ilvl_w
+#define msa_ilvlq_s64         __builtin_msa_ilvl_d
+
+/* ilvevq, ilvodq (for arguments (a, b): ilvevq: b0, a0, b2, a2; ilvodq: b1, a1, b3, a3; aliases for the ilvev / ilvod builtins) */
+#define msa_ilvevq_s8         __builtin_msa_ilvev_b
+#define msa_ilvevq_s16        __builtin_msa_ilvev_h
+#define msa_ilvevq_s32        __builtin_msa_ilvev_w
+#define msa_ilvevq_s64        __builtin_msa_ilvev_d
+#define msa_ilvodq_s8         __builtin_msa_ilvod_b
+#define msa_ilvodq_s16        __builtin_msa_ilvod_h
+#define msa_ilvodq_s32        __builtin_msa_ilvod_w
+#define msa_ilvodq_s64        __builtin_msa_ilvod_d
+
+/* extq (r = one vector of elements taken from the concatenation of a and b, starting at index c; done as a vshf whose index vector is a constant ramp offset by c) */
+#ifdef _MIPSEB /* this branch subtracts c from the ramp and passes (a, b) to vshf */
+#define msa_extq_s8(a, b, c)  \
+(__builtin_msa_vshf_b(__builtin_msa_subv_b((v16i8)((v2i64){0x1716151413121110, 0x1F1E1D1C1B1A1918}), __builtin_msa_fill_b(c)), a, b))
+#define msa_extq_s16(a, b, c) \
+(__builtin_msa_vshf_h(__builtin_msa_subv_h((v8i16)((v2i64){0x000B000A00090008, 0x000F000E000D000C}), __builtin_msa_fill_h(c)), a, b))
+#define msa_extq_s32(a, b, c) \
+(__builtin_msa_vshf_w(__builtin_msa_subv_w((v4i32)((v2i64){0x0000000500000004, 0x0000000700000006}), __builtin_msa_fill_w(c)), a, b))
+#define msa_extq_s64(a, b, c) \
+(__builtin_msa_vshf_d(__builtin_msa_subv_d((v2i64){0x0000000000000002, 0x0000000000000003}, __builtin_msa_fill_d(c)), a, b))
+#else /* this branch adds c to a 0-based ramp and passes (b, a) to vshf */
+#define msa_extq_s8(a, b, c)  \
+(__builtin_msa_vshf_b(__builtin_msa_addv_b((v16i8)((v2i64){0x0706050403020100, 0x0F0E0D0C0B0A0908}), __builtin_msa_fill_b(c)), b, a))
+#define msa_extq_s16(a, b, c) \
+(__builtin_msa_vshf_h(__builtin_msa_addv_h((v8i16)((v2i64){0x0003000200010000, 0x0007000600050004}), __builtin_msa_fill_h(c)), b, a))
+#define msa_extq_s32(a, b, c) \
+(__builtin_msa_vshf_w(__builtin_msa_addv_w((v4i32)((v2i64){0x0000000100000000, 0x0000000300000002}), __builtin_msa_fill_w(c)), b, a))
+#define msa_extq_s64(a, b, c) \
+(__builtin_msa_vshf_d(__builtin_msa_addv_d((v2i64){0x0000000000000000, 0x0000000000000001}, __builtin_msa_fill_d(c)), b, a))
+#endif /* _MIPSEB */
+
+/* cvttruncq, cvttintq, cvtrintq (direct aliases for the ftrunc / ftint / frint builtins; the suffix names destination then source type) */
+#define msa_cvttruncq_u32_f32 __builtin_msa_ftrunc_u_w
+#define msa_cvttruncq_s32_f32 __builtin_msa_ftrunc_s_w
+#define msa_cvttruncq_u64_f64 __builtin_msa_ftrunc_u_d
+#define msa_cvttruncq_s64_f64 __builtin_msa_ftrunc_s_d
+#define msa_cvttintq_u32_f32  __builtin_msa_ftint_u_w
+#define msa_cvttintq_s32_f32  __builtin_msa_ftint_s_w
+#define msa_cvttintq_u64_f64  __builtin_msa_ftint_u_d
+#define msa_cvttintq_s64_f64  __builtin_msa_ftint_s_d
+#define msa_cvtrintq_f32      __builtin_msa_frint_w
+#define msa_cvtrintq_f64      __builtin_msa_frint_d
+
+/* cvtfintq, cvtfq (direct aliases for the ffint / fexdo / fexupr / fexupl builtins; the suffix names destination then source type) */
+#define msa_cvtfintq_f32_u32  __builtin_msa_ffint_u_w
+#define msa_cvtfintq_f32_s32  __builtin_msa_ffint_s_w
+#define msa_cvtfintq_f64_u64  __builtin_msa_ffint_u_d
+#define msa_cvtfintq_f64_s64  __builtin_msa_ffint_s_d
+#define msa_cvtfq_f32_f64     __builtin_msa_fexdo_w
+#define msa_cvtflq_f64_f32    __builtin_msa_fexupr_d
+#define msa_cvtfhq_f64_f32    __builtin_msa_fexupl_d
+
+#define msa_addl_u8(a, b)     ((v8u16)__builtin_msa_addv_h((v8i16)V8U8_2_V8I16(a), (v8i16)V8U8_2_V8I16(b))) /* addl/subl: operands go through the V*_2_V* conversion macros (defined outside this chunk) before a full-width addv/subv */
+#define msa_addl_s8(a, b)     (__builtin_msa_addv_h((v8i16)V8I8_2_V8I16(a), (v8i16)V8I8_2_V8I16(b)))
+#define msa_addl_u16(a, b)    ((v4u32)__builtin_msa_addv_w((v4i32)V4U16_2_V4I32(a), (v4i32)V4U16_2_V4I32(b)))
+#define msa_addl_s16(a, b)    (__builtin_msa_addv_w((v4i32)V4I16_2_V4I32(a), (v4i32)V4I16_2_V4I32(b)))
+#define msa_subl_s16(a, b)    (__builtin_msa_subv_w((v4i32)V4I16_2_V4I32(a), (v4i32)V4I16_2_V4I32(b)))
+#define msa_recpeq_f32        __builtin_msa_frcp_w /* alias for the frcp builtin */
+#define msa_recpsq_f32(a, b)  (__builtin_msa_fsub_w(msa_dupq_n_f32(2.0f), __builtin_msa_fmul_w(a, b))) /* r = 2.0f - a * b */
+
+#define MSA_INTERLEAVED_IMPL_LOAD2_STORE2(_Tp, _Tpv, _Tpvs, suffix, df, nlanes) /* generates msa_ld2q_<suffix> and msa_st2q_<suffix>; _Tpvs is the signed vector type the builtins take, df the builtin width suffix */ \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld2q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b) \
+{ /* loads two vectors (ptr, ptr + nlanes); pckev result goes to *a, pckod result to *b */ \
+  _Tpv v0 = msa_ld1q_##suffix(ptr); \
+  _Tpv v1 = msa_ld1q_##suffix(ptr + nlanes); \
+  *a = (_Tpv)__builtin_msa_pckev_##df((_Tpvs)v1, (_Tpvs)v0); \
+  *b = (_Tpv)__builtin_msa_pckod_##df((_Tpvs)v1, (_Tpvs)v0); \
+} \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st2q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b) \
+{ /* stores ilvr(b, a) at ptr and ilvl(b, a) at ptr + nlanes */ \
+  msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_##df((_Tpvs)b, (_Tpvs)a)); \
+  msa_st1q_##suffix(ptr + nlanes, (_Tpv)__builtin_msa_ilvl_##df((_Tpvs)b, (_Tpvs)a)); \
+}
+
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint8_t, v16u8, v16i8, u8, b, 16)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int8_t, v16i8, v16i8, s8, b, 16)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint16_t, v8u16, v8i16, u16, h, 8)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int16_t, v8i16, v8i16, s16, h, 8)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint32_t, v4u32, v4i32, u32, w, 4)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int32_t, v4i32, v4i32, s32, w, 4)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(float, v4f32, v4i32, f32, w, 4)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint64_t, v2u64, v2i64, u64, d, 2)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int64_t, v2i64, v2i64, s64, d, 2)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(double, v2f64, v2i64, f64, d, 2)
+
+#ifdef _MIPSEB /* msa_ld3q for 8-bit lanes; the two branches differ only in shuffle masks and vshf operand order */
+#define MSA_INTERLEAVED_IMPL_LOAD3_8(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ /* loads three vectors; each output is built by one vshf_b over v0/v1, then a second vshf_b merging in v2 */ \
+  _Tpv v0 = msa_ld1q_##suffix(ptr); \
+  _Tpv v1 = msa_ld1q_##suffix(ptr + 16); \
+  _Tpv v2 = msa_ld1q_##suffix(ptr + 32); \
+  _Tpvs v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0704011F1F1F1F1F, 0x1F1C191613100D0A}), (_Tpvs)v0, (_Tpvs)v1); \
+  *a = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x1716150E0B080502, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \
+  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0603001F1F1F1F1F, 0x1E1B1815120F0C09}), (_Tpvs)v0, (_Tpvs)v1); \
+  *b = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x1716150D0A070401, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \
+  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x05021F1F1F1F1F1F, 0x1D1A1714110E0B08}), (_Tpvs)v0, (_Tpvs)v1); \
+  *c = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x17160F0C09060300, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \
+}
+#else /* !_MIPSEB */
+#define MSA_INTERLEAVED_IMPL_LOAD3_8(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ /* same two-stage scheme; the first-stage masks step through indices by 3, starting at 0, 1 and 2 for *a, *b and *c */ \
+  _Tpv v0 = msa_ld1q_##suffix(ptr); \
+  _Tpv v1 = msa_ld1q_##suffix(ptr + 16); \
+  _Tpv v2 = msa_ld1q_##suffix(ptr + 32); \
+  _Tpvs v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x15120F0C09060300, 0x00000000001E1B18}), (_Tpvs)v1, (_Tpvs)v0); \
+  *a = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1D1A1714110A0908}), (_Tpvs)v2, v3); \
+  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1613100D0A070401, 0x00000000001F1C19}), (_Tpvs)v1, (_Tpvs)v0); \
+  *b = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1E1B1815120A0908}), (_Tpvs)v2, v3); \
+  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1714110E0B080502, 0x0000000000001D1A}), (_Tpvs)v1, (_Tpvs)v0); \
+  *c = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1F1C191613100908}), (_Tpvs)v2, v3); \
+}
+#endif /* _MIPSEB */
+
+MSA_INTERLEAVED_IMPL_LOAD3_8(uint8_t, v16u8, v16i8, u8)
+MSA_INTERLEAVED_IMPL_LOAD3_8(int8_t, v16i8, v16i8, s8)
+
+#ifdef _MIPSEB /* msa_ld3q for 16-bit lanes; the two branches differ only in shuffle masks */
+#define MSA_INTERLEAVED_IMPL_LOAD3_16(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ /* loads three vectors; each output is built by one vshf_h over v1/v0, then a second vshf_h merging in v2 */ \
+  _Tpv v0 = msa_ld1q_##suffix(ptr); \
+  _Tpv v1 = msa_ld1q_##suffix(ptr + 8); \
+  _Tpv v2 = msa_ld1q_##suffix(ptr + 16); \
+  _Tpvs v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00030000000F000F, 0x000F000C00090006}), (_Tpvs)v1, (_Tpvs)v0); \
+  *a = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000A00050002, 0x000F000E000D000C}), (_Tpvs)v2, v3); \
+  v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0002000F000F000F, 0x000E000B00080005}), (_Tpvs)v1, (_Tpvs)v0); \
+  *b = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000700040001, 0x000F000E000D000C}), (_Tpvs)v2, v3); \
+  v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000F000F000F, 0x000D000A00070004}), (_Tpvs)v1, (_Tpvs)v0); \
+  *c = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000600030000, 0x000F000E000D000C}), (_Tpvs)v2, v3); \
+}
+#else /* !_MIPSEB */
+#define MSA_INTERLEAVED_IMPL_LOAD3_16(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ /* same two-stage scheme; the first-stage masks step through indices by 3, starting at 0, 1 and 2 for *a, *b and *c */ \
+  _Tpv v0 = msa_ld1q_##suffix(ptr); \
+  _Tpv v1 = msa_ld1q_##suffix(ptr + 8); \
+  _Tpv v2 = msa_ld1q_##suffix(ptr + 16); \
+  _Tpvs v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0009000600030000, 0x00000000000F000C}), (_Tpvs)v1, (_Tpvs)v0); \
+  *a = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000D000A00050004}), (_Tpvs)v2, v3); \
+  v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000A000700040001, 0x000000000000000D}), (_Tpvs)v1, (_Tpvs)v0); \
+  *b = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000E000B00080004}), (_Tpvs)v2, v3); \
+  v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000800050002, 0x000000000000000E}), (_Tpvs)v1, (_Tpvs)v0); \
+  *c = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000F000C00090004}), (_Tpvs)v2, v3); \
+}
+#endif /* _MIPSEB */
+
+MSA_INTERLEAVED_IMPL_LOAD3_16(uint16_t, v8u16, v8i16, u16)
+MSA_INTERLEAVED_IMPL_LOAD3_16(int16_t, v8i16, v8i16, s16)
+
+#define MSA_INTERLEAVED_IMPL_LOAD3_32(_Tp, _Tpv, _Tpvs, suffix) /* msa_ld3q for 32-bit lanes; uses only ilvr_w / ilvl_d, no shuffle masks and no _MIPSEB split */ \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ /* with the EL lane order of ilvr/ilvl: *a = ptr[0,3,6,9], *b = ptr[1,4,7,10], *c = ptr[2,5,8,11] */ \
+  _Tpv v00 = msa_ld1q_##suffix(ptr); \
+  _Tpv v01 = msa_ld1q_##suffix(ptr + 4); \
+  _Tpv v02 = msa_ld1q_##suffix(ptr + 8); \
+  _Tpvs v10 = __builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v01, (v2i64)v01), (_Tpvs)v00); \
+  _Tpvs v11 = __builtin_msa_ilvr_w((_Tpvs)v02, (_Tpvs)__builtin_msa_ilvl_d((v2i64)v00, (v2i64)v00)); \
+  _Tpvs v12 = __builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v02, (v2i64)v02), (_Tpvs)v01); \
+  *a = (_Tpv)__builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v11, (v2i64)v11), v10); \
+  *b = (_Tpv)__builtin_msa_ilvr_w(v12, (_Tpvs)__builtin_msa_ilvl_d((v2i64)v10, (v2i64)v10)); \
+  *c = (_Tpv)__builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v12, (v2i64)v12), v11); \
+}
+
+MSA_INTERLEAVED_IMPL_LOAD3_32(uint32_t, v4u32, v4i32, u32)
+MSA_INTERLEAVED_IMPL_LOAD3_32(int32_t, v4i32, v4i32, s32)
+MSA_INTERLEAVED_IMPL_LOAD3_32(float, v4f32, v4i32, f32)
+
+#define MSA_INTERLEAVED_IMPL_LOAD3_64(_Tp, _Tpv, suffix) /* msa_ld3q for 64-bit lanes, done with six scalar copies instead of vector builtins */ \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ /* writes through _Tp* views of the outputs: a = {ptr[0], ptr[3]}, b = {ptr[1], ptr[4]}, c = {ptr[2], ptr[5]} */ \
+  *((_Tp*)a) = *ptr;           *((_Tp*)b) = *(ptr + 1);     *((_Tp*)c) = *(ptr + 2);     \
+  *((_Tp*)a + 1) = *(ptr + 3); *((_Tp*)b + 1) = *(ptr + 4); *((_Tp*)c + 1) = *(ptr + 5); \
+}
+
+MSA_INTERLEAVED_IMPL_LOAD3_64(uint64_t, v2u64, u64)
+MSA_INTERLEAVED_IMPL_LOAD3_64(int64_t, v2i64, s64)
+MSA_INTERLEAVED_IMPL_LOAD3_64(double, v2f64, f64)
+
+#ifdef _MIPSEB /* msa_st3q for 8-bit lanes; the two branches differ only in shuffle masks */
+#define MSA_INTERLEAVED_IMPL_STORE3_8(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ /* for each of the three stored vectors: one vshf_b merges a and b, a second vshf_b merges c into that result */ \
+  _Tpvs v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0F0E0D0C0B1F1F1F, 0x1F1E1D1C1B1A1F1F}), (_Tpvs)b, (_Tpvs)a); \
+  _Tpvs v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0D1C140C1B130B1A, 0x1F170F1E160E1D15}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0A09080706051F1F, 0x19181716151F1F1F}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1D14071C13061B12, 0x170A1F16091E1508}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x04030201001F1F1F, 0x14131211101F1F1F}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x15021C14011B1300, 0x051F17041E16031D}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 32, (_Tpv)v1); \
+}
+#else /* !_MIPSEB */
+#define MSA_INTERLEAVED_IMPL_STORE3_8(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ /* same two-stage scheme as the _MIPSEB branch, with different masks; stores at ptr, ptr + 16, ptr + 32 */ \
+  _Tpvs v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000050403020100, 0x0000001413121110}), (_Tpvs)b, (_Tpvs)a); \
+  _Tpvs v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0A02110901100800, 0x05140C04130B0312}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000000A09080706, 0x00001A1918171615}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x170A011609001508, 0x0D04190C03180B02}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000000F0E0D0C0B, 0x0000001F1E1D1C1B}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x021C09011B08001A, 0x1F0C041E0B031D0A}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 32, (_Tpv)v1); \
+}
+#endif /* _MIPSEB */
+
+MSA_INTERLEAVED_IMPL_STORE3_8(uint8_t, v16u8, v16i8, u8)
+MSA_INTERLEAVED_IMPL_STORE3_8(int8_t, v16i8, v16i8, s8)
+
+#ifdef _MIPSEB /* msa_st3q for 16-bit lanes; the two branches differ only in shuffle masks */
+#define MSA_INTERLEAVED_IMPL_STORE3_16(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ /* for each of the three stored vectors: one vshf_h merges a and b, a second vshf_h merges c into that result */ \
+  _Tpvs v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000700060005000F, 0x000F000E000D000F}), (_Tpvs)b, (_Tpvs)a); \
+  _Tpvs v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000A0006000D0009, 0x000F000B0007000E}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00040003000F000F, 0x000C000B000A000F}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000E000A0003000D, 0x0005000F000B0004}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000200010000000F, 0x00090008000F000F}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000E00090000, 0x000B0002000F000A}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
+}
+#else /* !_MIPSEB */
+#define MSA_INTERLEAVED_IMPL_STORE3_16(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ /* same two-stage scheme as the _MIPSEB branch, with different masks; stores at ptr, ptr + 8, ptr + 16 */ \
+  _Tpvs v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000200010000, 0x0000000A00090008}), (_Tpvs)b, (_Tpvs)a); \
+  _Tpvs v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000800040000, 0x0006000200090005}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000500040003, 0x00000000000C000B}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B00040000000A, 0x0002000C00050001}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000000070006, 0x0000000F000E000D}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00050000000D0004, 0x000F00060001000E}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
+}
+#endif /* _MIPSEB */
+
+MSA_INTERLEAVED_IMPL_STORE3_16(uint16_t, v8u16, v8i16, u16)
+MSA_INTERLEAVED_IMPL_STORE3_16(int16_t, v8i16, v8i16, s16)
+
+#ifdef _MIPSEB /* msa_st3q for 32-bit lanes; the two branches differ only in shuffle masks */
+#define MSA_INTERLEAVED_IMPL_STORE3_32(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ /* for each of the three stored vectors: one vshf_w merges a and b, a second vshf_w merges c into that result */ \
+  _Tpvs v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000300000007, 0x0000000700000006}), (_Tpvs)b, (_Tpvs)a); \
+  _Tpvs v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000300000006, 0x0000000700000005}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000200000001, 0x0000000500000007}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000700000004, 0x0000000500000002}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 4, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000007, 0x0000000400000007}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000500000000, 0x0000000100000007}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
+}
+#else /* !_MIPSEB */
+#define MSA_INTERLEAVED_IMPL_STORE3_32(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ /* same two-stage scheme as the _MIPSEB branch, with different masks; stores at ptr, ptr + 4, ptr + 8 */ \
+  _Tpvs v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000100000000, 0x0000000000000004}), (_Tpvs)b, (_Tpvs)a); \
+  _Tpvs v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000200000000, 0x0000000100000004}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000002, 0x0000000600000005}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000500000002, 0x0000000300000000}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 4, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000003, 0x0000000000000007}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000006, 0x0000000700000002}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
+}
+#endif /* _MIPSEB */
+
+MSA_INTERLEAVED_IMPL_STORE3_32(uint32_t, v4u32, v4i32, u32)
+MSA_INTERLEAVED_IMPL_STORE3_32(int32_t, v4i32, v4i32, s32)
+MSA_INTERLEAVED_IMPL_STORE3_32(float, v4f32, v4i32, f32)
+
+#define MSA_INTERLEAVED_IMPL_STORE3_64(_Tp, _Tpv, suffix) /* msa_st3q for 64-bit lanes, done with six scalar stores instead of vector builtins */ \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ /* memory order written: a[0], b[0], c[0], a[1], b[1], c[1] */ \
+  *ptr = a[0];       *(ptr + 1) = b[0]; *(ptr + 2) = c[0]; \
+  *(ptr + 3) = a[1]; *(ptr + 4) = b[1]; *(ptr + 5) = c[1]; \
+}
+
+MSA_INTERLEAVED_IMPL_STORE3_64(uint64_t, v2u64, u64)
+MSA_INTERLEAVED_IMPL_STORE3_64(int64_t, v2i64, s64)
+MSA_INTERLEAVED_IMPL_STORE3_64(double, v2f64, f64)
+
+#define MSA_INTERLEAVED_IMPL_LOAD4_STORE4(_Tp, _Tpv, _Tpvs, suffix, df, nlanes) /* generates msa_ld4q_<suffix> and msa_st4q_<suffix>; _Tpvs is the signed vector type the builtins take, df the builtin width suffix */ \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld4q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c, _Tpv* d) \
+{ /* loads four vectors, then applies two rounds of pckev/pckod: t0/t1 are pckev results, t2/t3 pckod results, and each is packed once more */ \
+  _Tpv v0 = msa_ld1q_##suffix(ptr); \
+  _Tpv v1 = msa_ld1q_##suffix(ptr + nlanes); \
+  _Tpv v2 = msa_ld1q_##suffix(ptr + nlanes * 2); \
+  _Tpv v3 = msa_ld1q_##suffix(ptr + nlanes * 3); \
+  _Tpvs t0 = __builtin_msa_pckev_##df((_Tpvs)v1, (_Tpvs)v0); \
+  _Tpvs t1 = __builtin_msa_pckev_##df((_Tpvs)v3, (_Tpvs)v2); \
+  _Tpvs t2 = __builtin_msa_pckod_##df((_Tpvs)v1, (_Tpvs)v0); \
+  _Tpvs t3 = __builtin_msa_pckod_##df((_Tpvs)v3, (_Tpvs)v2); \
+  *a = (_Tpv)__builtin_msa_pckev_##df(t1, t0); \
+  *b = (_Tpv)__builtin_msa_pckev_##df(t3, t2); \
+  *c = (_Tpv)__builtin_msa_pckod_##df(t1, t0); \
+  *d = (_Tpv)__builtin_msa_pckod_##df(t3, t2); \
+} \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st4q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c, const _Tpv d) \
+{ /* two rounds of ilvr/ilvl: first a with c and b with d, then those results with each other; four vectors are stored */ \
+  _Tpvs v0 = __builtin_msa_ilvr_##df((_Tpvs)c, (_Tpvs)a); \
+  _Tpvs v1 = __builtin_msa_ilvr_##df((_Tpvs)d, (_Tpvs)b); \
+  _Tpvs v2 = __builtin_msa_ilvl_##df((_Tpvs)c, (_Tpvs)a); \
+  _Tpvs v3 = __builtin_msa_ilvl_##df((_Tpvs)d, (_Tpvs)b); \
+  msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_##df(v1, v0)); \
+  msa_st1q_##suffix(ptr + nlanes, (_Tpv)__builtin_msa_ilvl_##df(v1, v0)); \
+  msa_st1q_##suffix(ptr + 2 * nlanes, (_Tpv)__builtin_msa_ilvr_##df(v3, v2)); \
+  msa_st1q_##suffix(ptr + 3 * nlanes, (_Tpv)__builtin_msa_ilvl_##df(v3, v2)); \
+}
+
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint8_t, v16u8, v16i8, u8, b, 16)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int8_t, v16i8, v16i8, s8, b, 16)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint16_t, v8u16, v8i16, u16, h, 8)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int16_t, v8i16, v8i16, s16, h, 8)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint32_t, v4u32, v4i32, u32, w, 4)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int32_t, v4i32, v4i32, s32, w, 4)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(float, v4f32, v4i32, f32, w, 4)
+
+#define MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(_Tp, _Tpv, _Tpvs, suffix) /* 64-bit-lane msa_ld4q / msa_st4q: with two lanes per vector a single ilvr_d / ilvl_d step is enough */ \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld4q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c, _Tpv* d) \
+{ /* loads four vectors (ptr, +2, +4, +6); *a and *b combine v0 with v2, *c and *d combine v1 with v3 */ \
+  _Tpv v0 = msa_ld1q_##suffix(ptr); \
+  _Tpv v1 = msa_ld1q_##suffix(ptr + 2); \
+  _Tpv v2 = msa_ld1q_##suffix(ptr + 4); \
+  _Tpv v3 = msa_ld1q_##suffix(ptr + 6); \
+  *a = (_Tpv)__builtin_msa_ilvr_d((_Tpvs)v2, (_Tpvs)v0); \
+  *b = (_Tpv)__builtin_msa_ilvl_d((_Tpvs)v2, (_Tpvs)v0); \
+  *c = (_Tpv)__builtin_msa_ilvr_d((_Tpvs)v3, (_Tpvs)v1); \
+  *d = (_Tpv)__builtin_msa_ilvl_d((_Tpvs)v3, (_Tpvs)v1); \
+} \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st4q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c, const _Tpv d) \
+{ /* stores ilvr_d(b, a), ilvr_d(d, c), ilvl_d(b, a), ilvl_d(d, c) at ptr, ptr + 2, ptr + 4, ptr + 6 */ \
+  msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_d((_Tpvs)b, (_Tpvs)a)); \
+  msa_st1q_##suffix(ptr + 2, (_Tpv)__builtin_msa_ilvr_d((_Tpvs)d, (_Tpvs)c)); \
+  msa_st1q_##suffix(ptr + 4, (_Tpv)__builtin_msa_ilvl_d((_Tpvs)b, (_Tpvs)a)); \
+  msa_st1q_##suffix(ptr + 6, (_Tpv)__builtin_msa_ilvl_d((_Tpvs)d, (_Tpvs)c)); \
+}
+
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(uint64_t, v2u64, v2i64, u64)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(int64_t, v2i64, v2i64, s64)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(double, v2f64, v2i64, f64)
+
+__extension__ extern __inline v8i16 /* qdmulhq_n: multiplies every lane of a by the scalar b, doubles, and narrows back to 16 bits */
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+msa_qdmulhq_n_s16(v8i16 a, int16_t b)
+{
+  v8i16 a_lo, a_hi; /* a interleaved with zero lanes, split into two halves */
+  ILVRL_H2_SH(a, msa_dupq_n_s16(0), a_lo, a_hi);
+  return msa_packr_s32(msa_shlq_n_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_dupq_n_s32(b)), 1), /* each half: paddlq -> multiply by b -> shift left by 1 */
+                       msa_shlq_n_s32(msa_mulq_s32(msa_paddlq_s16(a_hi), msa_dupq_n_s32(b)), 1), 16); /* NOTE(review): msa_packr_s32 is defined outside this chunk; presumably it narrows with a right shift by 16 - confirm */
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /*__mips_msa*/
+#endif /* OPENCV_CORE_MSA_MACROS_H */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/simd_utils.impl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/simd_utils.impl.hpp
new file mode 100644
index 000000000000..0a1ab2c52307
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/hal/simd_utils.impl.hpp
@@ -0,0 +1,186 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// This header is not standalone. Don't include directly, use "intrin.hpp" instead.
+#ifdef OPENCV_HAL_INTRIN_HPP  // defined in intrin.hpp
+
+
+#if CV_SIMD128 || CV_SIMD128_CPP
+
+template<typename _T> struct Type2Vec128_Traits;  // maps a scalar type to a vector type; primary template is declared only, so unmapped types have no ::vec_type
+#define CV_INTRIN_DEF_TYPE2VEC128_TRAITS(type_, vec_type_) \
+    template<> struct Type2Vec128_Traits<type_> \
+    { \
+        typedef vec_type_ vec_type; \
+    }
+// One explicit specialization per supported scalar type.
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(uchar, v_uint8x16);
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(schar, v_int8x16);
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(ushort, v_uint16x8);
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(short, v_int16x8);
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(unsigned, v_uint32x4);
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(int, v_int32x4);
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(float, v_float32x4);
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(uint64, v_uint64x2);
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(int64, v_int64x2);
+#if CV_SIMD128_64F  // the double mapping exists only when CV_SIMD128_64F is set
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(double, v_float64x2);
+#endif
+
+template<typename _T> static inline
+typename Type2Vec128_Traits<_T>::vec_type v_setall(const _T& a);  // generic front end: declared here, defined only through the specializations below
+// Each specialization forwards to the matching type-suffixed v_setall_* function.
+template<> inline Type2Vec128_Traits< uchar>::vec_type v_setall< uchar>(const  uchar& a) { return v_setall_u8(a); }
+template<> inline Type2Vec128_Traits< schar>::vec_type v_setall< schar>(const  schar& a) { return v_setall_s8(a); }
+template<> inline Type2Vec128_Traits<ushort>::vec_type v_setall<ushort>(const ushort& a) { return v_setall_u16(a); }
+template<> inline Type2Vec128_Traits< short>::vec_type v_setall< short>(const  short& a) { return v_setall_s16(a); }
+template<> inline Type2Vec128_Traits<  uint>::vec_type v_setall<  uint>(const   uint& a) { return v_setall_u32(a); }
+template<> inline Type2Vec128_Traits<   int>::vec_type v_setall<   int>(const    int& a) { return v_setall_s32(a); }
+template<> inline Type2Vec128_Traits<uint64>::vec_type v_setall<uint64>(const uint64& a) { return v_setall_u64(a); }
+template<> inline Type2Vec128_Traits< int64>::vec_type v_setall< int64>(const  int64& a) { return v_setall_s64(a); }
+template<> inline Type2Vec128_Traits< float>::vec_type v_setall< float>(const  float& a) { return v_setall_f32(a); }
+#if CV_SIMD128_64F  // double overload only when CV_SIMD128_64F is set
+template<> inline Type2Vec128_Traits<double>::vec_type v_setall<double>(const double& a) { return v_setall_f64(a); }
+#endif
+
+#endif  // SIMD128
+
+
+#if CV_SIMD256
+
+template<typename _T> struct Type2Vec256_Traits;  // maps a scalar type to a vector type; primary template is declared only, so unmapped types have no ::vec_type
+#define CV_INTRIN_DEF_TYPE2VEC256_TRAITS(type_, vec_type_) \
+    template<> struct Type2Vec256_Traits<type_> \
+    { \
+        typedef vec_type_ vec_type; \
+    }
+// One explicit specialization per supported scalar type.
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(uchar, v_uint8x32);
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(schar, v_int8x32);
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(ushort, v_uint16x16);
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(short, v_int16x16);
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(unsigned, v_uint32x8);
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(int, v_int32x8);
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(float, v_float32x8);
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(uint64, v_uint64x4);
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(int64, v_int64x4);
+#if CV_SIMD256_64F  // the double mapping exists only when CV_SIMD256_64F is set
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(double, v_float64x4);
+#endif
+
+template<typename _T> static inline
+typename Type2Vec256_Traits<_T>::vec_type v256_setall(const _T& a);  // generic front end: declared here, defined only through the specializations below
+// Each specialization forwards to the matching type-suffixed v256_setall_* function.
+template<> inline Type2Vec256_Traits< uchar>::vec_type v256_setall< uchar>(const  uchar& a) { return v256_setall_u8(a); }
+template<> inline Type2Vec256_Traits< schar>::vec_type v256_setall< schar>(const  schar& a) { return v256_setall_s8(a); }
+template<> inline Type2Vec256_Traits<ushort>::vec_type v256_setall<ushort>(const ushort& a) { return v256_setall_u16(a); }
+template<> inline Type2Vec256_Traits< short>::vec_type v256_setall< short>(const  short& a) { return v256_setall_s16(a); }
+template<> inline Type2Vec256_Traits<  uint>::vec_type v256_setall<  uint>(const   uint& a) { return v256_setall_u32(a); }
+template<> inline Type2Vec256_Traits<   int>::vec_type v256_setall<   int>(const    int& a) { return v256_setall_s32(a); }
+template<> inline Type2Vec256_Traits<uint64>::vec_type v256_setall<uint64>(const uint64& a) { return v256_setall_u64(a); }
+template<> inline Type2Vec256_Traits< int64>::vec_type v256_setall< int64>(const  int64& a) { return v256_setall_s64(a); }
+template<> inline Type2Vec256_Traits< float>::vec_type v256_setall< float>(const  float& a) { return v256_setall_f32(a); }
+#if CV_SIMD256_64F  // double overload only when CV_SIMD256_64F is set
+template<> inline Type2Vec256_Traits<double>::vec_type v256_setall<double>(const double& a) { return v256_setall_f64(a); }
+#endif
+
+#endif  // SIMD256
+
+
+#if CV_SIMD512
+
+template<typename _T> struct Type2Vec512_Traits;  // maps a scalar type to a vector type; primary template is declared only, so unmapped types have no ::vec_type
+#define CV_INTRIN_DEF_TYPE2VEC512_TRAITS(type_, vec_type_) \
+    template<> struct Type2Vec512_Traits<type_> \
+    { \
+        typedef vec_type_ vec_type; \
+    }
+// One explicit specialization per supported scalar type.
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(uchar, v_uint8x64);
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(schar, v_int8x64);
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(ushort, v_uint16x32);
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(short, v_int16x32);
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(unsigned, v_uint32x16);
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(int, v_int32x16);
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(float, v_float32x16);
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(uint64, v_uint64x8);
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(int64, v_int64x8);
+#if CV_SIMD512_64F  // the double mapping exists only when CV_SIMD512_64F is set
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(double, v_float64x8);
+#endif
+
+template<typename _T> static inline
+typename Type2Vec512_Traits<_T>::vec_type v512_setall(const _T& a);  // generic front end: declared here, defined only through the specializations below
+// Each specialization forwards to the matching type-suffixed v512_setall_* function.
+template<> inline Type2Vec512_Traits< uchar>::vec_type v512_setall< uchar>(const  uchar& a) { return v512_setall_u8(a); }
+template<> inline Type2Vec512_Traits< schar>::vec_type v512_setall< schar>(const  schar& a) { return v512_setall_s8(a); }
+template<> inline Type2Vec512_Traits<ushort>::vec_type v512_setall<ushort>(const ushort& a) { return v512_setall_u16(a); }
+template<> inline Type2Vec512_Traits< short>::vec_type v512_setall< short>(const  short& a) { return v512_setall_s16(a); }
+template<> inline Type2Vec512_Traits<  uint>::vec_type v512_setall<  uint>(const   uint& a) { return v512_setall_u32(a); }
+template<> inline Type2Vec512_Traits<   int>::vec_type v512_setall<   int>(const    int& a) { return v512_setall_s32(a); }
+template<> inline Type2Vec512_Traits<uint64>::vec_type v512_setall<uint64>(const uint64& a) { return v512_setall_u64(a); }
+template<> inline Type2Vec512_Traits< int64>::vec_type v512_setall< int64>(const  int64& a) { return v512_setall_s64(a); }
+template<> inline Type2Vec512_Traits< float>::vec_type v512_setall< float>(const  float& a) { return v512_setall_f32(a); }
+#if CV_SIMD512_64F  // double overload only when CV_SIMD512_64F is set
+template<> inline Type2Vec512_Traits<double>::vec_type v512_setall<double>(const double& a) { return v512_setall_f64(a); }
+#endif
+
+#endif  // SIMD512
+
+#if CV_SIMD_SCALABLE
+template<typename _T> struct Type2Vec_Traits;  // scalable-backend counterpart of the fixed-width traits; primary template is declared only
+#define CV_INTRIN_DEF_TYPE2VEC_TRAITS(type_, vec_type_) \
+    template<> struct Type2Vec_Traits<type_> \
+    { \
+        typedef vec_type_ vec_type; \
+    }
+// One explicit specialization per supported scalar type; the vector type names carry no lane count.
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(uchar, v_uint8);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(schar, v_int8);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(ushort, v_uint16);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(short, v_int16);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(unsigned, v_uint32);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(int, v_int32);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(float, v_float32);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(uint64, v_uint64);
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(int64, v_int64);
+#if CV_SIMD_SCALABLE_64F  // the double mapping exists only when CV_SIMD_SCALABLE_64F is set
+CV_INTRIN_DEF_TYPE2VEC_TRAITS(double, v_float64);
+#endif
+template<typename _T> static inline
+typename Type2Vec_Traits<_T>::vec_type v_setall(const _T& a);  // generic front end for the scalable backend: declared here, defined only through the specializations below
+// Each specialization forwards to the matching type-suffixed v_setall_* function.
+template<> inline Type2Vec_Traits< uchar>::vec_type v_setall< uchar>(const  uchar& a) { return v_setall_u8(a); }
+template<> inline Type2Vec_Traits< schar>::vec_type v_setall< schar>(const  schar& a) { return v_setall_s8(a); }
+template<> inline Type2Vec_Traits<ushort>::vec_type v_setall<ushort>(const ushort& a) { return v_setall_u16(a); }
+template<> inline Type2Vec_Traits< short>::vec_type v_setall< short>(const  short& a) { return v_setall_s16(a); }
+template<> inline Type2Vec_Traits<  uint>::vec_type v_setall<  uint>(const   uint& a) { return v_setall_u32(a); }
+template<> inline Type2Vec_Traits<   int>::vec_type v_setall<   int>(const    int& a) { return v_setall_s32(a); }
+template<> inline Type2Vec_Traits<uint64>::vec_type v_setall<uint64>(const uint64& a) { return v_setall_u64(a); }
+template<> inline Type2Vec_Traits< int64>::vec_type v_setall< int64>(const  int64& a) { return v_setall_s64(a); }
+template<> inline Type2Vec_Traits< float>::vec_type v_setall< float>(const  float& a) { return v_setall_f32(a); }
+#if CV_SIMD_SCALABLE_64F  // double overload only when CV_SIMD_SCALABLE_64F is set
+template<> inline Type2Vec_Traits<double>::vec_type v_setall<double>(const double& a) { return v_setall_f64(a); }
+#endif
+#endif
+
+
+#if CV_SIMD_SCALABLE  // vx_setall: one name that forwards to the setall of the selected width; the scalable backend is tested first
+template<typename _T> static inline
+typename Type2Vec_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); }
+#elif CV_SIMD_WIDTH == 16  // width 16 -> 128 traits and v_setall
+template<typename _T> static inline
+typename Type2Vec128_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); }
+#elif CV_SIMD_WIDTH == 32  // width 32 -> 256 traits and v256_setall
+template<typename _T> static inline
+typename Type2Vec256_Traits<_T>::vec_type vx_setall(const _T& a) { return v256_setall(a); }
+#elif CV_SIMD_WIDTH == 64  // width 64 -> 512 traits and v512_setall
+template<typename _T> static inline
+typename Type2Vec512_Traits<_T>::vec_type vx_setall(const _T& a) { return v512_setall(a); }
+#else  // no branch matched: stop the build rather than leave vx_setall undefined
+#error "Build configuration error, unsupported CV_SIMD_WIDTH"
+#endif
+
+
+#endif  // OPENCV_HAL_INTRIN_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/mat.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/mat.hpp
new file mode 100644
index 000000000000..2bfb0966c2a2
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/mat.hpp
@@ -0,0 +1,3797 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_MAT_HPP
+#define OPENCV_CORE_MAT_HPP
+
+#ifndef __cplusplus
+#  error mat.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core/matx.hpp"
+#include "opencv2/core/types.hpp"
+
+#include "opencv2/core/bufferpool.hpp"
+
+#include <array>
+#include <type_traits>
+
+namespace cv
+{
+
+//! @addtogroup core_basic
+//! @{
+
+enum AccessFlag { ACCESS_READ=1<<24, ACCESS_WRITE=1<<25,  // bits 24 and 25
+    ACCESS_RW=3<<24, ACCESS_MASK=ACCESS_RW, ACCESS_FAST=1<<26 };  // ACCESS_RW == ACCESS_READ | ACCESS_WRITE; ACCESS_FAST is bit 26
+CV_ENUM_FLAGS(AccessFlag)  // macro defined elsewhere; presumably supplies the flag operators for this enum
+__CV_ENUM_FLAGS_BITWISE_AND(AccessFlag, int, AccessFlag)
+
+CV__DEBUG_NS_BEGIN
+
+class CV_EXPORTS _OutputArray;
+
+//////////////////////// Input/Output Array Arguments /////////////////////////////////
+
+/** @brief This is the proxy class for passing read-only input arrays into OpenCV functions.
+
+It is defined as:
+@code
+    typedef const _InputArray& InputArray;
+@endcode
+where _InputArray is a class that can be constructed from `Mat`, `Mat_<T>`, `Matx<T, m, n>`,
+`std::vector<T>`, `std::vector<std::vector<T> >`, `std::vector<Mat>`, `std::vector<Mat_<T> >`,
+`UMat`, `std::vector<UMat>` or `double`. It can also be constructed from a matrix expression.
+
+Since this is mostly an implementation-level class, and its interface may change in future versions, we
+do not describe it in detail. There are a few key things, though, that should be kept in mind:
+
+-   When you see in the reference manual or in OpenCV source code a function that takes
+    InputArray, it means that you can actually pass `Mat`, `Matx`, `vector<T>` etc. (see above the
+    complete list).
+-   Optional input arguments: If some of the input arrays may be empty, pass cv::noArray() (or
+    simply cv::Mat() as you probably did before).
+-   The class is designed solely for passing parameters. That is, normally you *should not*
+    declare class members, local and global variables of this type.
+-   If you want to design your own function or a class method that can operate on arrays of
+    multiple types, you can use InputArray (or OutputArray) for the respective parameters. Inside
+    a function you should use _InputArray::getMat() method to construct a matrix header for the
+    array (without copying data). _InputArray::kind() can be used to distinguish Mat from
+    `vector<>` etc., but normally it is not needed.
+
+Here is how you can use a function that takes InputArray :
+@code
+    std::vector<Point2f> vec;
+    // points on a circle
+    for( int i = 0; i < 30; i++ )
+        vec.push_back(Point2f((float)(100 + 30*cos(i*CV_PI*2/5)),
+                              (float)(100 - 30*sin(i*CV_PI*2/5))));
+    cv::transform(vec, vec, cv::Matx23f(0.707, -0.707, 10, 0.707, 0.707, 20));
+@endcode
+That is, we form an STL vector containing points, and apply in-place affine transformation to the
+vector using the 2x3 matrix created inline as `Matx<float, 2, 3>` instance.
+
+Here is how such a function can be implemented (for simplicity, we implement a very specific case of
+it, according to the assertion statement inside) :
+@code
+    void myAffineTransform(InputArray _src, OutputArray _dst, InputArray _m)
+    {
+        // get Mat headers for input arrays. This is O(1) operation,
+        // unless _src and/or _m are matrix expressions.
+        Mat src = _src.getMat(), m = _m.getMat();
+        CV_Assert( src.type() == CV_32FC2 && m.type() == CV_32F && m.size() == Size(3, 2) );
+
+        // [re]create the output array so that it has the proper size and type.
+        // In case of Mat it calls Mat::create, in case of STL vector it calls vector::resize.
+        _dst.create(src.size(), src.type());
+        Mat dst = _dst.getMat();
+
+        for( int i = 0; i < src.rows; i++ )
+            for( int j = 0; j < src.cols; j++ )
+            {
+                Point2f pt = src.at<Point2f>(i, j);
+                dst.at<Point2f>(i, j) = Point2f(m.at<float>(0, 0)*pt.x +
+                                                m.at<float>(0, 1)*pt.y +
+                                                m.at<float>(0, 2),
+                                                m.at<float>(1, 0)*pt.x +
+                                                m.at<float>(1, 1)*pt.y +
+                                                m.at<float>(1, 2));
+            }
+    }
+@endcode
+There is another related type, InputArrayOfArrays, which is currently defined as a synonym for
+InputArray:
+@code
+    typedef InputArray InputArrayOfArrays;
+@endcode
+It denotes function arguments that are either vectors of vectors or vectors of matrices. A separate
+synonym is needed to generate Python/Java etc. wrappers properly. At the function implementation
+level their use is similar, but _InputArray::getMat(idx) should be used to get header for the
+idx-th component of the outer vector and _InputArray::size().area() should be used to find the
+number of components (vectors/matrices) of the outer vector.
+
+In general, type support is limited to cv::Mat types. Other types are forbidden.
+But in some cases we need to support passing of custom non-general Mat types, like arrays of cv::KeyPoint, cv::DMatch, etc.
+This data is not intended to be interpreted as an image data, or processed somehow like regular cv::Mat.
+To pass such custom type use rawIn() / rawOut() / rawInOut() wrappers.
+Custom type is wrapped as Mat-compatible `CV_8UC<N>` values (N = sizeof(T), N <= CV_CN_MAX).
+ */
+class CV_EXPORTS _InputArray
+{
+public:
+    enum KindFlag {
+        KIND_SHIFT = 16,  // the kind codes below are shifted left by this many bits
+        FIXED_TYPE = 0x8000 << KIND_SHIFT,  // bit 31
+        FIXED_SIZE = 0x4000 << KIND_SHIFT,  // bit 30
+        KIND_MASK = 31 << KIND_SHIFT,  // five bits (16..20) that hold the kind code
+
+        NONE              = 0 << KIND_SHIFT,
+        MAT               = 1 << KIND_SHIFT,
+        MATX              = 2 << KIND_SHIFT,
+        STD_VECTOR        = 3 << KIND_SHIFT,
+        STD_VECTOR_VECTOR = 4 << KIND_SHIFT,
+        STD_VECTOR_MAT    = 5 << KIND_SHIFT,
+#if OPENCV_ABI_COMPATIBILITY < 500
+        EXPR              = 6 << KIND_SHIFT,  //!< removed: https://github.com/opencv/opencv/pull/17046
+#endif
+        OPENGL_BUFFER     = 7 << KIND_SHIFT,
+        CUDA_HOST_MEM     = 8 << KIND_SHIFT,
+        CUDA_GPU_MAT      = 9 << KIND_SHIFT,
+        UMAT              =10 << KIND_SHIFT,
+        STD_VECTOR_UMAT   =11 << KIND_SHIFT,
+        STD_BOOL_VECTOR   =12 << KIND_SHIFT,
+        STD_VECTOR_CUDA_GPU_MAT = 13 << KIND_SHIFT,
+#if OPENCV_ABI_COMPATIBILITY < 500
+        STD_ARRAY         =14 << KIND_SHIFT,  //!< removed: https://github.com/opencv/opencv/issues/18897
+#endif
+        STD_ARRAY_MAT     =15 << KIND_SHIFT
+    };
+
+    _InputArray();
+    _InputArray(int _flags, void* _obj);
+    _InputArray(const Mat& m);
+    _InputArray(const MatExpr& expr);
+    _InputArray(const std::vector<Mat>& vec);
+    template<typename _Tp> _InputArray(const Mat_<_Tp>& m);
+    template<typename _Tp> _InputArray(const std::vector<_Tp>& vec);
+    _InputArray(const std::vector<bool>& vec);
+    template<typename _Tp> _InputArray(const std::vector<std::vector<_Tp> >& vec);
+    _InputArray(const std::vector<std::vector<bool> >&) = delete;  // not supported
+    template<typename _Tp> _InputArray(const std::vector<Mat_<_Tp> >& vec);
+    template<typename _Tp> _InputArray(const _Tp* vec, int n);
+    template<typename _Tp, int m, int n> _InputArray(const Matx<_Tp, m, n>& matx);
+    _InputArray(const double& val);
+    _InputArray(const cuda::GpuMat& d_mat);
+    _InputArray(const std::vector<cuda::GpuMat>& d_mat_array);
+    _InputArray(const ogl::Buffer& buf);
+    _InputArray(const cuda::HostMem& cuda_mem);
+    template<typename _Tp> _InputArray(const cudev::GpuMat_<_Tp>& m);
+    _InputArray(const UMat& um);
+    _InputArray(const std::vector<UMat>& umv);
+
+    template<typename _Tp, std::size_t _Nm> _InputArray(const std::array<_Tp, _Nm>& arr);
+    template<std::size_t _Nm> _InputArray(const std::array<Mat, _Nm>& arr);
+
+    template<typename _Tp> static _InputArray rawIn(const std::vector<_Tp>& vec);  // static factories for the custom-type wrapping described in the class docs
+    template<typename _Tp, std::size_t _Nm> static _InputArray rawIn(const std::array<_Tp, _Nm>& arr);
+
+    Mat getMat(int idx=-1) const;
+    Mat getMat_(int idx=-1) const;
+    UMat getUMat(int idx=-1) const;
+    void getMatVector(std::vector<Mat>& mv) const;
+    void getUMatVector(std::vector<UMat>& umv) const;
+    void getGpuMatVector(std::vector<cuda::GpuMat>& gpumv) const;
+    cuda::GpuMat getGpuMat() const;
+    ogl::Buffer getOGlBuffer() const;
+
+    int getFlags() const;
+    void* getObj() const;
+    Size getSz() const;
+
+    _InputArray::KindFlag kind() const;
+    int dims(int i=-1) const;
+    int cols(int i=-1) const;
+    int rows(int i=-1) const;
+    Size size(int i=-1) const;
+    int sizend(int* sz, int i=-1) const;
+    bool sameSize(const _InputArray& arr) const;
+    size_t total(int i=-1) const;
+    int type(int i=-1) const;
+    int depth(int i=-1) const;
+    int channels(int i=-1) const;
+    bool isContinuous(int i=-1) const;
+    bool isSubmatrix(int i=-1) const;
+    bool empty() const;
+    void copyTo(const _OutputArray& arr) const;
+    void copyTo(const _OutputArray& arr, const _InputArray & mask) const;
+    size_t offset(int i=-1) const;
+    size_t step(int i=-1) const;
+    bool isMat() const;
+    bool isUMat() const;
+    bool isMatVector() const;
+    bool isUMatVector() const;
+    bool isMatx() const;
+    bool isVector() const;
+    bool isGpuMat() const;
+    bool isGpuMatVector() const;
+    ~_InputArray();
+
+protected:
+    int flags;  // presumably the KindFlag / FIXED_* bits combined with type information - confirm in the inline definitions
+    void* obj;  // untyped pointer to the wrapped object; the proxy stores only this pointer
+    Size sz;
+
+    void init(int _flags, const void* _obj);
+    void init(int _flags, const void* _obj, Size _sz);
+};
+CV_ENUM_FLAGS(_InputArray::KindFlag)
+__CV_ENUM_FLAGS_BITWISE_AND(_InputArray::KindFlag, int, _InputArray::KindFlag)
+
+/** @brief This type is very similar to InputArray except that it is used for input/output and output function
+parameters.
+
+Just like with InputArray, OpenCV users should not care about OutputArray, they just pass `Mat`,
+`vector<T>` etc. to the functions. The same limitation as for `InputArray`: *Do not explicitly
+create OutputArray instances* applies here too.
+
+If you want to make your function polymorphic (i.e. accept different arrays as output parameters),
+it is also not very difficult. Take the sample above as the reference. Note that
+_OutputArray::create() needs to be called before _OutputArray::getMat(). This way you guarantee
+that the output array is properly allocated.
+
+Optional output parameters. If you do not need certain output array to be computed and returned to
+you, pass cv::noArray(), just like you would in the case of optional input array. At the
+implementation level, use _OutputArray::needed() to check if certain output array needs to be
+computed or not.
+
+There are several related typedefs that are used to assist automatic Python/Java/... wrapper
+generators:
+@code
+    typedef OutputArray OutputArrayOfArrays;
+    typedef const _InputOutputArray& InputOutputArray;
+    typedef InputOutputArray InputOutputArrayOfArrays;
+@endcode
+ */
+class CV_EXPORTS _OutputArray : public _InputArray
+{
+public:
+    enum DepthMask
+    {
+        DEPTH_MASK_8U = 1 << CV_8U,  // one bit per depth code
+        DEPTH_MASK_8S = 1 << CV_8S,
+        DEPTH_MASK_16U = 1 << CV_16U,
+        DEPTH_MASK_16S = 1 << CV_16S,
+        DEPTH_MASK_32S = 1 << CV_32S,
+        DEPTH_MASK_32F = 1 << CV_32F,
+        DEPTH_MASK_64F = 1 << CV_64F,
+        DEPTH_MASK_16F = 1 << CV_16F,
+        DEPTH_MASK_ALL = (DEPTH_MASK_64F<<1)-1,  // every bit up to and including DEPTH_MASK_64F
+        DEPTH_MASK_ALL_BUT_8S = DEPTH_MASK_ALL & ~DEPTH_MASK_8S,
+        DEPTH_MASK_ALL_16F = (DEPTH_MASK_16F<<1)-1,  // every bit up to and including DEPTH_MASK_16F
+        DEPTH_MASK_FLT = DEPTH_MASK_32F + DEPTH_MASK_64F  // the two masks summed
+    };
+
+    _OutputArray();
+    _OutputArray(int _flags, void* _obj);
+    _OutputArray(Mat& m);
+    _OutputArray(std::vector<Mat>& vec);
+    _OutputArray(cuda::GpuMat& d_mat);
+    _OutputArray(std::vector<cuda::GpuMat>& d_mat);
+    _OutputArray(ogl::Buffer& buf);
+    _OutputArray(cuda::HostMem& cuda_mem);
+    template<typename _Tp> _OutputArray(cudev::GpuMat_<_Tp>& m);
+    template<typename _Tp> _OutputArray(std::vector<_Tp>& vec);
+    _OutputArray(std::vector<bool>& vec) = delete;  // not supported
+    template<typename _Tp> _OutputArray(std::vector<std::vector<_Tp> >& vec);
+    _OutputArray(std::vector<std::vector<bool> >&) = delete;  // not supported
+    template<typename _Tp> _OutputArray(std::vector<Mat_<_Tp> >& vec);
+    template<typename _Tp> _OutputArray(Mat_<_Tp>& m);
+    template<typename _Tp> _OutputArray(_Tp* vec, int n);
+    template<typename _Tp, int m, int n> _OutputArray(Matx<_Tp, m, n>& matx);
+    _OutputArray(UMat& m);
+    _OutputArray(std::vector<UMat>& vec);
+    // const-reference overloads of the constructors above
+    _OutputArray(const Mat& m);
+    _OutputArray(const std::vector<Mat>& vec);
+    _OutputArray(const cuda::GpuMat& d_mat);
+    _OutputArray(const std::vector<cuda::GpuMat>& d_mat);
+    _OutputArray(const ogl::Buffer& buf);
+    _OutputArray(const cuda::HostMem& cuda_mem);
+    template<typename _Tp> _OutputArray(const cudev::GpuMat_<_Tp>& m);
+    template<typename _Tp> _OutputArray(const std::vector<_Tp>& vec);
+    template<typename _Tp> _OutputArray(const std::vector<std::vector<_Tp> >& vec);
+    template<typename _Tp> _OutputArray(const std::vector<Mat_<_Tp> >& vec);
+    template<typename _Tp> _OutputArray(const Mat_<_Tp>& m);
+    template<typename _Tp> _OutputArray(const _Tp* vec, int n);
+    template<typename _Tp, int m, int n> _OutputArray(const Matx<_Tp, m, n>& matx);
+    _OutputArray(const UMat& m);
+    _OutputArray(const std::vector<UMat>& vec);
+
+    template<typename _Tp, std::size_t _Nm> _OutputArray(std::array<_Tp, _Nm>& arr);
+    template<typename _Tp, std::size_t _Nm> _OutputArray(const std::array<_Tp, _Nm>& arr);
+    template<std::size_t _Nm> _OutputArray(std::array<Mat, _Nm>& arr);
+    template<std::size_t _Nm> _OutputArray(const std::array<Mat, _Nm>& arr);
+
+    template<typename _Tp> static _OutputArray rawOut(std::vector<_Tp>& vec);  // static factories, counterparts of _InputArray::rawIn
+    template<typename _Tp, std::size_t _Nm> static _OutputArray rawOut(std::array<_Tp, _Nm>& arr);
+
+    bool fixedSize() const;
+    bool fixedType() const;
+    bool needed() const;  // per the class docs: tells whether this (optional) output has to be computed
+    Mat& getMatRef(int i=-1) const;
+    UMat& getUMatRef(int i=-1) const;
+    cuda::GpuMat& getGpuMatRef() const;
+    std::vector<cuda::GpuMat>& getGpuMatVecRef() const;
+    ogl::Buffer& getOGlBufferRef() const;
+    cuda::HostMem& getHostMemRef() const;
+    void create(Size sz, int type, int i=-1, bool allowTransposed=false, _OutputArray::DepthMask fixedDepthMask=static_cast<_OutputArray::DepthMask>(0)) const;
+    void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, _OutputArray::DepthMask fixedDepthMask=static_cast<_OutputArray::DepthMask>(0)) const;
+    void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, _OutputArray::DepthMask fixedDepthMask=static_cast<_OutputArray::DepthMask>(0)) const;
+    void createSameSize(const _InputArray& arr, int mtype) const;
+    void release() const;
+    void clear() const;
+    void setTo(const _InputArray& value, const _InputArray & mask = _InputArray()) const;
+
+    void assign(const UMat& u) const;
+    void assign(const Mat& m) const;
+
+    void assign(const std::vector<UMat>& v) const;
+    void assign(const std::vector<Mat>& v) const;
+
+    void move(UMat& u) const;
+    void move(Mat& m) const;
+};
+
+
+class CV_EXPORTS _InputOutputArray : public _OutputArray  // proxy for in/out parameters; adds constructors and rawInOut factories on top of _OutputArray
+{
+public:
+    _InputOutputArray();
+    _InputOutputArray(int _flags, void* _obj);
+    _InputOutputArray(Mat& m);
+    _InputOutputArray(std::vector<Mat>& vec);
+    _InputOutputArray(cuda::GpuMat& d_mat);
+    _InputOutputArray(ogl::Buffer& buf);
+    _InputOutputArray(cuda::HostMem& cuda_mem);
+    template<typename _Tp> _InputOutputArray(cudev::GpuMat_<_Tp>& m);
+    template<typename _Tp> _InputOutputArray(std::vector<_Tp>& vec);
+    _InputOutputArray(std::vector<bool>& vec) = delete;  // not supported
+    template<typename _Tp> _InputOutputArray(std::vector<std::vector<_Tp> >& vec);
+    template<typename _Tp> _InputOutputArray(std::vector<Mat_<_Tp> >& vec);
+    template<typename _Tp> _InputOutputArray(Mat_<_Tp>& m);
+    template<typename _Tp> _InputOutputArray(_Tp* vec, int n);
+    template<typename _Tp, int m, int n> _InputOutputArray(Matx<_Tp, m, n>& matx);
+    _InputOutputArray(UMat& m);
+    _InputOutputArray(std::vector<UMat>& vec);
+
+    _InputOutputArray(const Mat& m);  // const-reference overloads start here
+    _InputOutputArray(const std::vector<Mat>& vec);
+    _InputOutputArray(const cuda::GpuMat& d_mat);
+    _InputOutputArray(const std::vector<cuda::GpuMat>& d_mat);
+    _InputOutputArray(const ogl::Buffer& buf);
+    _InputOutputArray(const cuda::HostMem& cuda_mem);
+    template<typename _Tp> _InputOutputArray(const cudev::GpuMat_<_Tp>& m);
+    template<typename _Tp> _InputOutputArray(const std::vector<_Tp>& vec);
+    template<typename _Tp> _InputOutputArray(const std::vector<std::vector<_Tp> >& vec);
+    template<typename _Tp> _InputOutputArray(const std::vector<Mat_<_Tp> >& vec);
+    template<typename _Tp> _InputOutputArray(const Mat_<_Tp>& m);
+    template<typename _Tp> _InputOutputArray(const _Tp* vec, int n);
+    template<typename _Tp, int m, int n> _InputOutputArray(const Matx<_Tp, m, n>& matx);
+    _InputOutputArray(const UMat& m);
+    _InputOutputArray(const std::vector<UMat>& vec);
+
+    template<typename _Tp, std::size_t _Nm> _InputOutputArray(std::array<_Tp, _Nm>& arr);
+    template<typename _Tp, std::size_t _Nm> _InputOutputArray(const std::array<_Tp, _Nm>& arr);
+    template<std::size_t _Nm> _InputOutputArray(std::array<Mat, _Nm>& arr);
+    template<std::size_t _Nm> _InputOutputArray(const std::array<Mat, _Nm>& arr);
+
+    template<typename _Tp> static _InputOutputArray rawInOut(std::vector<_Tp>& vec);  // static factory, counterpart of rawIn / rawOut
+    template<typename _Tp, std::size_t _Nm> static _InputOutputArray rawInOut(std::array<_Tp, _Nm>& arr);  // static like the vector overload, so _InputOutputArray::rawInOut(arr) is callable without an instance
+
+};
+
+/** Helper to wrap custom types; returns an _InputArray proxy. @see InputArray */
+template<typename _Tp> static inline _InputArray rawIn(_Tp& v);
+/** Helper to wrap custom types; returns an _OutputArray proxy. @see InputArray */
+template<typename _Tp> static inline _OutputArray rawOut(_Tp& v);
+/** Helper to wrap custom types; returns an _InputOutputArray proxy. @see InputArray */
+template<typename _Tp> static inline _InputOutputArray rawInOut(_Tp& v);
+
+CV__DEBUG_NS_END
+
+typedef const _InputArray& InputArray;  // the proxy classes are passed by const reference
+typedef InputArray InputArrayOfArrays;  // synonym; per the _InputArray docs it exists for the Python/Java wrapper generators
+typedef const _OutputArray& OutputArray;
+typedef OutputArray OutputArrayOfArrays;
+typedef const _InputOutputArray& InputOutputArray;
+typedef InputOutputArray InputOutputArrayOfArrays;
+
+CV_EXPORTS InputOutputArray noArray();  // value to pass for optional array arguments that are not supplied (see the _InputArray / _OutputArray docs)
+
+/////////////////////////////////// MatAllocator //////////////////////////////////////
+
+/** @brief  Usage flags for allocator
+
+ @warning  All flags except `USAGE_DEFAULT` are experimental.
+
+ @warning  For the OpenCL allocator, `USAGE_ALLOCATE_SHARED_MEMORY` depends on
+ OpenCV's optional, experimental integration with OpenCL SVM. To enable this
+ integration, build OpenCV using the `WITH_OPENCL_SVM=ON` CMake option and, at
+ runtime, call `cv::ocl::Context::getDefault().setUseSVM(true);` or similar
+ code. Note that SVM is incompatible with OpenCL 1.x.
+*/
+enum UMatUsageFlags
+{
+    USAGE_DEFAULT = 0,  // no allocation-policy bit set
+
+    // buffer allocation policy is platform and usage specific
+    USAGE_ALLOCATE_HOST_MEMORY = 1 << 0,
+    USAGE_ALLOCATE_DEVICE_MEMORY = 1 << 1,
+    USAGE_ALLOCATE_SHARED_MEMORY = 1 << 2, // It is not equal to: USAGE_ALLOCATE_HOST_MEMORY | USAGE_ALLOCATE_DEVICE_MEMORY
+
+    __UMAT_USAGE_FLAGS_32BIT = 0x7fffffff // Binary compatibility hint: largest positive 32-bit value, presumably there to pin the enum's size
+};
+
+struct CV_EXPORTS UMatData;
+
+/** @brief  Custom array allocator
+*/
+class CV_EXPORTS MatAllocator
+{
+public:
+    MatAllocator() {}
+    virtual ~MatAllocator() {}  // virtual: the class is meant to be derived from
+
+    // let's comment it out for now to detect and fix all the uses of the allocator
+    //virtual void allocate(int dims, const int* sizes, int type, int*& refcount,
+    //                      uchar*& datastart, uchar*& data, size_t* step) = 0;
+    //virtual void deallocate(int* refcount, uchar* datastart, uchar* data) = 0;
+    virtual UMatData* allocate(int dims, const int* sizes, int type,
+                               void* data, size_t* step, AccessFlag flags, UMatUsageFlags usageFlags) const = 0;  // pure virtual
+    virtual bool allocate(UMatData* data, AccessFlag accessflags, UMatUsageFlags usageFlags) const = 0;  // pure virtual
+    virtual void deallocate(UMatData* data) const = 0;  // pure virtual
+    virtual void map(UMatData* data, AccessFlag accessflags) const;  // from here on: virtual but not pure, so overriding is optional
+    virtual void unmap(UMatData* data) const;
+    virtual void download(UMatData* data, void* dst, int dims, const size_t sz[],
+                          const size_t srcofs[], const size_t srcstep[],
+                          const size_t dststep[]) const;
+    virtual void upload(UMatData* data, const void* src, int dims, const size_t sz[],
+                        const size_t dstofs[], const size_t dststep[],
+                        const size_t srcstep[]) const;
+    virtual void copy(UMatData* srcdata, UMatData* dstdata, int dims, const size_t sz[],
+                      const size_t srcofs[], const size_t srcstep[],
+                      const size_t dstofs[], const size_t dststep[], bool sync) const;
+
+    // default implementation returns DummyBufferPoolController
+    virtual BufferPoolController* getBufferPoolController(const char* id = NULL) const;
+};
+
+
+//////////////////////////////// MatCommaInitializer //////////////////////////////////
+
+/** @brief  Comma-separated Matrix Initializer
+
+ The class instances are usually not created explicitly.
+ Instead, they are created by the "matrix << firstValue" operator.
+
+ The sample below initializes a 2x2 rotation matrix:
+
+ \code
+ double angle = 30, a = cos(angle*CV_PI/180), b = sin(angle*CV_PI/180);
+ Mat R = (Mat_<double>(2,2) << a, -b, b, a);
+ \endcode
+*/
+template<typename _Tp> class MatCommaInitializer_
+{
+public:
+    //! the constructor, created by "matrix << firstValue" operator, where matrix is cv::Mat (the parameter itself is a Mat_<_Tp>*)
+    MatCommaInitializer_(Mat_<_Tp>* _m);
+    //! the operator that takes the next value and puts it into the matrix
+    template<typename T2> MatCommaInitializer_<_Tp>& operator , (T2 v);
+    //! another form of conversion operator: converts the initializer to Mat_<_Tp>
+    operator Mat_<_Tp>() const;
+protected:
+    MatIterator_<_Tp> it; // NOTE(review): presumably the position where operator , stores the next value — confirm in the inline definitions
+};
+
+
+/////////////////////////////////////// Mat ///////////////////////////////////////////
+
+// note that umatdata might be allocated together
+// with the matrix data, not as a separate object.
+// NOTE(review): this comment used to say the struct has no constructor or destructor and should be
+// explicitly initialized using init(); the struct below declares both and no init() member.
+struct CV_EXPORTS UMatData
+{
+    enum MemoryFlag { COPY_ON_MAP=1, HOST_COPY_OBSOLETE=2,
+        DEVICE_COPY_OBSOLETE=4, TEMP_UMAT=8, TEMP_COPIED_UMAT=24, // 24 == 16 | TEMP_UMAT, so it includes the TEMP_UMAT bit
+        USER_ALLOCATED=32, DEVICE_MEM_MAPPED=64,
+        ASYNC_CLEANUP=128
+    };
+    UMatData(const MatAllocator* allocator);
+    ~UMatData();
+
+    // provide atomic access to the structure
+    void lock();
+    void unlock();
+
+    bool hostCopyObsolete() const; // NOTE(review): the queries and mark*() setters below presumably test/set the matching MemoryFlag bits in `flags` — confirm in the inline definitions
+    bool deviceCopyObsolete() const;
+    bool deviceMemMapped() const;
+    bool copyOnMap() const;
+    bool tempUMat() const;
+    bool tempCopiedUMat() const;
+    void markHostCopyObsolete(bool flag);
+    void markDeviceCopyObsolete(bool flag);
+    void markDeviceMemMapped(bool flag);
+
+    const MatAllocator* prevAllocator;
+    const MatAllocator* currAllocator;
+    int urefcount;
+    int refcount;
+    uchar* data;
+    uchar* origdata;
+    size_t size;
+
+    UMatData::MemoryFlag flags;
+    void* handle;
+    void* userdata;
+    int allocatorFlags_;
+    int mapcount;
+    UMatData* originalUMatData;
+    std::shared_ptr<void> allocatorContext;
+};
+CV_ENUM_FLAGS(UMatData::MemoryFlag)
+
+
+struct CV_EXPORTS MatSize // wraps a raw int* (member p); no destructor is declared, so the struct itself never frees p
+{
+    explicit MatSize(int* _p) CV_NOEXCEPT;
+    int dims() const CV_NOEXCEPT;
+    Size operator()() const;
+    const int& operator[](int i) const;
+    int& operator[](int i);
+    operator const int*() const CV_NOEXCEPT;  // TODO OpenCV 4.0: drop this (NOTE(review): still present in this post-4.0 header)
+    bool operator == (const MatSize& sz) const CV_NOEXCEPT;
+    bool operator != (const MatSize& sz) const CV_NOEXCEPT;
+
+    int* p;
+};
+
+struct CV_EXPORTS MatStep
+{
+    MatStep() CV_NOEXCEPT;
+    explicit MatStep(size_t s) CV_NOEXCEPT;
+    const size_t& operator[](int i) const CV_NOEXCEPT;
+    size_t& operator[](int i) CV_NOEXCEPT;
+    operator size_t() const;
+    MatStep& operator = (size_t s); // assignment from a single size_t value
+
+    size_t* p;
+    size_t buf[2]; // NOTE(review): presumably inline storage that p refers to in the common case — confirm
+protected:
+    MatStep& operator = (const MatStep&); // copy-assignment is protected, so code outside the class cannot assign one MatStep to another
+};
+
+/** @example samples/cpp/cout_mat.cpp
+An example demonstrating the serial out capabilities of cv::Mat
+*/
+
+ /** @brief n-dimensional dense array class \anchor CVMat_Details
+
+The class Mat represents an n-dimensional dense numerical single-channel or multi-channel array. It
+can be used to store real or complex-valued vectors and matrices, grayscale or color images, voxel
+volumes, vector fields, point clouds, tensors, histograms (though, very high-dimensional histograms
+may be better stored in a SparseMat ). The data layout of the array `M` is defined by the array
+`M.step[]`, so that the address of element \f$(i_0,...,i_{M.dims-1})\f$, where \f$0\leq i_k<M.size[k]\f$, is
+computed as:
+\f[addr(M_{i_0,...,i_{M.dims-1}}) = M.data + M.step[0]*i_0 + M.step[1]*i_1 + ... + M.step[M.dims-1]*i_{M.dims-1}\f]
+In case of a 2-dimensional array, the above formula is reduced to:
+\f[addr(M_{i,j}) = M.data + M.step[0]*i + M.step[1]*j\f]
+Note that `M.step[i] >= M.step[i+1]` (in fact, `M.step[i] >= M.step[i+1]*M.size[i+1]` ). This means
+that 2-dimensional matrices are stored row-by-row, 3-dimensional matrices are stored plane-by-plane,
+and so on. M.step[M.dims-1] is minimal and always equal to the element size M.elemSize() .
+
+So, the data layout in Mat is compatible with the majority of dense array types from the standard
+toolkits and SDKs, such as Numpy (ndarray), Win32 (independent device bitmaps), and others,
+that is, with any array that uses *steps* (or *strides*) to compute the position of a pixel.
+Due to this compatibility, it is possible to make a Mat header for user-allocated data and process
+it in-place using OpenCV functions.
+
+There are many different ways to create a Mat object. The most popular options are listed below:
+
+- Use the create(nrows, ncols, type) method or the similar Mat(nrows, ncols, type[, fillValue])
+constructor. A new array of the specified size and type is allocated. type has the same meaning as
+in the cvCreateMat method. For example, CV_8UC1 means a 8-bit single-channel array, CV_32FC2
+means a 2-channel (complex) floating-point array, and so on.
+@code
+    // make a 7x7 complex matrix filled with 1+3j.
+    Mat M(7,7,CV_32FC2,Scalar(1,3));
+    // and now turn M to a 100x60 15-channel 8-bit matrix.
+    // The old content will be deallocated
+    M.create(100,60,CV_8UC(15));
+@endcode
+As noted in the introduction to this chapter, create() allocates a new array only when the shape
+or type of the current array is different from the specified ones.
+
+- Create a multi-dimensional array:
+@code
+    // create a 100x100x100 8-bit array
+    int sz[] = {100, 100, 100};
+    Mat bigCube(3, sz, CV_8U, Scalar::all(0));
+@endcode
+If you pass the number of dimensions =1 to the Mat constructor, the created array will still be
+2-dimensional with the number of columns set to 1. So, Mat::dims is always \>= 2 (can also be 0
+when the array is empty).
+
+- Use a copy constructor or assignment operator where there can be an array or expression on the
+right side (see below). As noted in the introduction, the array assignment is an O(1) operation
+because it only copies the header and increases the reference counter. The Mat::clone() method can
+be used to get a full (deep) copy of the array when you need it.
+
+- Construct a header for a part of another array. It can be a single row, single column, several
+rows, several columns, rectangular region in the array (called a *minor* in algebra) or a
+diagonal. Such operations are also O(1) because the new header references the same data. You can
+actually modify a part of the array using this feature, for example:
+@code
+    // add the 5-th row, multiplied by 3 to the 3rd row
+    M.row(3) = M.row(3) + M.row(5)*3;
+    // now copy the 7-th column to the 1-st column
+    // M.col(1) = M.col(7); // this will not work
+    Mat M1 = M.col(1);
+    M.col(7).copyTo(M1);
+    // create a new 320x240 image
+    Mat img(Size(320,240),CV_8UC3);
+    // select a ROI
+    Mat roi(img, Rect(10,10,100,100));
+    // fill the ROI with (0,255,0) (which is green in RGB space);
+    // the original 320x240 image will be modified
+    roi = Scalar(0,255,0);
+@endcode
+Due to the additional datastart and dataend members, it is possible to compute a relative
+sub-array position in the main *container* array using locateROI():
+@code
+    Mat A = Mat::eye(10, 10, CV_32S);
+    // extracts A columns, 1 (inclusive) to 3 (exclusive).
+    Mat B = A(Range::all(), Range(1, 3));
+    // extracts B rows, 5 (inclusive) to 9 (exclusive).
+    // that is, C \~ A(Range(5, 9), Range(1, 3))
+    Mat C = B(Range(5, 9), Range::all());
+    Size size; Point ofs;
+    C.locateROI(size, ofs);
+    // size will be (width=10,height=10) and the ofs will be (x=1, y=5)
+@endcode
+As in case of whole matrices, if you need a deep copy, use the `clone()` method of the extracted
+sub-matrices.
+
+- Make a header for user-allocated data. It can be useful to do the following:
+    -# Process "foreign" data using OpenCV (for example, when you implement a DirectShow\* filter or
+    a processing module for gstreamer, and so on). For example:
+    @code
+        Mat process_video_frame(const unsigned char* pixels,
+                                int width, int height, int step)
+        {
+            // wrap input buffer
+            Mat img(height, width, CV_8UC3, (unsigned char*)pixels, step);
+
+            Mat result;
+            GaussianBlur(img, result, Size(7, 7), 1.5, 1.5);
+
+            return result;
+        }
+    @endcode
+    -# Quickly initialize small matrices and/or get a super-fast element access.
+    @code
+        double m[3][3] = {{a, b, c}, {d, e, f}, {g, h, i}};
+        Mat M = Mat(3, 3, CV_64F, m).inv();
+    @endcode
+    .
+
+- Use MATLAB-style array initializers, zeros(), ones(), eye(), for example:
+@code
+    // create a double-precision identity matrix and add it to M.
+    M += Mat::eye(M.rows, M.cols, CV_64F);
+@endcode
+
+- Use a comma-separated initializer:
+@code
+    // create a 3x3 double-precision identity matrix
+    Mat M = (Mat_<double>(3,3) << 1, 0, 0, 0, 1, 0, 0, 0, 1);
+@endcode
+With this approach, you first call a constructor of the Mat_ class with the proper parameters, and
+then you just put the `<<` operator followed by comma-separated values that can be constants,
+variables, expressions, and so on. Also, note the extra parentheses required to avoid compilation
+errors.
+
+Once the array is created, it is automatically managed via a reference-counting mechanism. If the
+array header is built on top of user-allocated data, you should handle the data by yourself. The
+array data is deallocated when no one points to it. If you want to release the data pointed to by an
+array header before the array destructor is called, use Mat::release().
+
+The next important thing to learn about the array class is element access. This manual already
+described how to compute an address of each array element. Normally, you are not required to use the
+formula directly in the code. If you know the array element type (which can be retrieved using the
+method Mat::type() ), you can access the element \f$M_{ij}\f$ of a 2-dimensional array as:
+@code
+    M.at<double>(i,j) += 1.f;
+@endcode
+assuming that `M` is a double-precision floating-point array. There are several variants of the method
+at for a different number of dimensions.
+
+If you need to process a whole row of a 2D array, the most efficient way is to get the pointer to
+the row first, and then just use the plain C operator [] :
+@code
+    // compute sum of positive matrix elements
+    // (assuming that M is a double-precision matrix)
+    double sum=0;
+    for(int i = 0; i < M.rows; i++)
+    {
+        const double* Mi = M.ptr<double>(i);
+        for(int j = 0; j < M.cols; j++)
+            sum += std::max(Mi[j], 0.);
+    }
+@endcode
+Some operations, like the one above, do not actually depend on the array shape. They just process
+elements of an array one by one (or elements from multiple arrays that have the same coordinates,
+for example, array addition). Such operations are called *element-wise*. It makes sense to check
+whether all the input/output arrays are continuous, namely, have no gaps at the end of each row. If
+yes, process them as a long single row:
+@code
+    // compute the sum of positive matrix elements, optimized variant
+    double sum=0;
+    int cols = M.cols, rows = M.rows;
+    if(M.isContinuous())
+    {
+        cols *= rows;
+        rows = 1;
+    }
+    for(int i = 0; i < rows; i++)
+    {
+        const double* Mi = M.ptr<double>(i);
+        for(int j = 0; j < cols; j++)
+            sum += std::max(Mi[j], 0.);
+    }
+@endcode
+In case of the continuous matrix, the outer loop body is executed just once. So, the overhead is
+smaller, which is especially noticeable in case of small matrices.
+
+Finally, there are STL-style iterators that are smart enough to skip gaps between successive rows:
+@code
+    // compute sum of positive matrix elements, iterator-based variant
+    double sum=0;
+    MatConstIterator_<double> it = M.begin<double>(), it_end = M.end<double>();
+    for(; it != it_end; ++it)
+        sum += std::max(*it, 0.);
+@endcode
+The matrix iterators are random-access iterators, so they can be passed to any STL algorithm,
+including std::sort().
+
+@note For matrix expressions and arithmetic, see MatExpr
+*/
+class CV_EXPORTS Mat
+{
+public:
+    /**
+    These are various constructors that form a matrix. As noted in the AutomaticAllocation, often
+    the default constructor is enough, and the proper matrix will be allocated by an OpenCV function.
+    The constructed matrix can further be assigned to another matrix or matrix expression or can be
+    allocated with Mat::create . In the former case, the old content is de-referenced.
+     */
+    Mat() CV_NOEXCEPT;
+
+    /** @overload
+    @param rows Number of rows in a 2D array.
+    @param cols Number of columns in a 2D array.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    */
+    Mat(int rows, int cols, int type);
+
+    /** @overload
+    @param size 2D array size: Size(cols, rows) . In the Size() constructor, the number of rows and the
+    number of columns go in the reverse order.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+      */
+    Mat(Size size, int type);
+
+    /** @overload
+    @param rows Number of rows in a 2D array.
+    @param cols Number of columns in a 2D array.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param s An optional value to initialize each matrix element with. To set all the matrix elements to
+    the particular value after the construction, use the assignment operator
+    Mat::operator=(const Scalar& value) .
+    */
+    Mat(int rows, int cols, int type, const Scalar& s);
+
+    /** @overload
+    @param size 2D array size: Size(cols, rows) . In the Size() constructor, the number of rows and the
+    number of columns go in the reverse order.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param s An optional value to initialize each matrix element with. To set all the matrix elements to
+    the particular value after the construction, use the assignment operator
+    Mat::operator=(const Scalar& value) .
+      */
+    Mat(Size size, int type, const Scalar& s);
+
+    /** @overload
+    @param ndims Array dimensionality.
+    @param sizes Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    */
+    Mat(int ndims, const int* sizes, int type);
+
+    /** @overload
+    @param sizes Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    */
+    Mat(const std::vector<int>& sizes, int type);
+
+    /** @overload
+    @param ndims Array dimensionality.
+    @param sizes Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param s An optional value to initialize each matrix element with. To set all the matrix elements to
+    the particular value after the construction, use the assignment operator
+    Mat::operator=(const Scalar& value) .
+    */
+    Mat(int ndims, const int* sizes, int type, const Scalar& s);
+
+    /** @overload
+    @param sizes Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param s An optional value to initialize each matrix element with. To set all the matrix elements to
+    the particular value after the construction, use the assignment operator
+    Mat::operator=(const Scalar& value) .
+    */
+    Mat(const std::vector<int>& sizes, int type, const Scalar& s);
+
+
+    /** @overload
+    @param m Array that (as a whole or partly) is assigned to the constructed matrix. No data is copied
+    by these constructors. Instead, the header pointing to m data or its sub-array is constructed and
+    associated with it. The reference counter, if any, is incremented. So, when you modify the matrix
+    formed using such a constructor, you also modify the corresponding elements of m . If you want to
+    have an independent copy of the sub-array, use Mat::clone() .
+    */
+    Mat(const Mat& m);
+
+    /** @overload
+    @param rows Number of rows in a 2D array.
+    @param cols Number of columns in a 2D array.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param data Pointer to the user data. Matrix constructors that take data and step parameters do not
+    allocate matrix data. Instead, they just initialize the matrix header that points to the specified
+    data, which means that no data is copied. This operation is very efficient and can be used to
+    process external data using OpenCV functions. The external data is not automatically deallocated, so
+    you should take care of it.
+    @param step Number of bytes each matrix row occupies. The value should include the padding bytes at
+    the end of each row, if any. If the parameter is missing (set to AUTO_STEP ), no padding is assumed
+    and the actual step is calculated as cols*elemSize(). See Mat::elemSize.
+    */
+    Mat(int rows, int cols, int type, void* data, size_t step=AUTO_STEP);
+
+    /** @overload
+    @param size 2D array size: Size(cols, rows) . In the Size() constructor, the number of rows and the
+    number of columns go in the reverse order.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param data Pointer to the user data. Matrix constructors that take data and step parameters do not
+    allocate matrix data. Instead, they just initialize the matrix header that points to the specified
+    data, which means that no data is copied. This operation is very efficient and can be used to
+    process external data using OpenCV functions. The external data is not automatically deallocated, so
+    you should take care of it.
+    @param step Number of bytes each matrix row occupies. The value should include the padding bytes at
+    the end of each row, if any. If the parameter is missing (set to AUTO_STEP ), no padding is assumed
+    and the actual step is calculated as cols*elemSize(). See Mat::elemSize.
+    */
+    Mat(Size size, int type, void* data, size_t step=AUTO_STEP);
+
+    /** @overload
+    @param ndims Array dimensionality.
+    @param sizes Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param data Pointer to the user data. Matrix constructors that take data and step parameters do not
+    allocate matrix data. Instead, they just initialize the matrix header that points to the specified
+    data, which means that no data is copied. This operation is very efficient and can be used to
+    process external data using OpenCV functions. The external data is not automatically deallocated, so
+    you should take care of it.
+    @param steps Array of ndims-1 steps in case of a multi-dimensional array (the last step is always
+    set to the element size). If not specified, the matrix is assumed to be continuous.
+    */
+    Mat(int ndims, const int* sizes, int type, void* data, const size_t* steps=0);
+
+    /** @overload
+    @param sizes Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param data Pointer to the user data. Matrix constructors that take data and step parameters do not
+    allocate matrix data. Instead, they just initialize the matrix header that points to the specified
+    data, which means that no data is copied. This operation is very efficient and can be used to
+    process external data using OpenCV functions. The external data is not automatically deallocated, so
+    you should take care of it.
+    @param steps Array of ndims-1 steps in case of a multi-dimensional array (the last step is always
+    set to the element size). If not specified, the matrix is assumed to be continuous.
+    */
+    Mat(const std::vector<int>& sizes, int type, void* data, const size_t* steps=0);
+
+    /** @overload
+    @param m Array that (as a whole or partly) is assigned to the constructed matrix. No data is copied
+    by these constructors. Instead, the header pointing to m data or its sub-array is constructed and
+    associated with it. The reference counter, if any, is incremented. So, when you modify the matrix
+    formed using such a constructor, you also modify the corresponding elements of m . If you want to
+    have an independent copy of the sub-array, use Mat::clone() .
+    @param rowRange Range of the m rows to take. As usual, the range start is inclusive and the range
+    end is exclusive. Use Range::all() to take all the rows.
+    @param colRange Range of the m columns to take. Use Range::all() to take all the columns.
+    */
+    Mat(const Mat& m, const Range& rowRange, const Range& colRange=Range::all());
+
+    /** @overload
+    @param m Array that (as a whole or partly) is assigned to the constructed matrix. No data is copied
+    by these constructors. Instead, the header pointing to m data or its sub-array is constructed and
+    associated with it. The reference counter, if any, is incremented. So, when you modify the matrix
+    formed using such a constructor, you also modify the corresponding elements of m . If you want to
+    have an independent copy of the sub-array, use Mat::clone() .
+    @param roi Region of interest.
+    */
+    Mat(const Mat& m, const Rect& roi);
+
+    /** @overload
+    @param m Array that (as a whole or partly) is assigned to the constructed matrix. No data is copied
+    by these constructors. Instead, the header pointing to m data or its sub-array is constructed and
+    associated with it. The reference counter, if any, is incremented. So, when you modify the matrix
+    formed using such a constructor, you also modify the corresponding elements of m . If you want to
+    have an independent copy of the sub-array, use Mat::clone() .
+    @param ranges Array of selected ranges of m along each dimensionality.
+    */
+    Mat(const Mat& m, const Range* ranges);
+
+    /** @overload
+    @param m Array that (as a whole or partly) is assigned to the constructed matrix. No data is copied
+    by these constructors. Instead, the header pointing to m data or its sub-array is constructed and
+    associated with it. The reference counter, if any, is incremented. So, when you modify the matrix
+    formed using such a constructor, you also modify the corresponding elements of m . If you want to
+    have an independent copy of the sub-array, use Mat::clone() .
+    @param ranges Array of selected ranges of m along each dimensionality.
+    */
+    Mat(const Mat& m, const std::vector<Range>& ranges);
+
+    /** @overload
+    @param vec STL vector whose elements form the matrix. The matrix has a single column and the number
+    of rows equal to the number of vector elements. Type of the matrix matches the type of vector
+    elements. The constructor can handle arbitrary types, for which there is a properly declared
+    DataType . This means that the vector elements must be primitive numbers or uni-type numerical
+    tuples of numbers. Mixed-type structures are not supported. The corresponding constructor is
+    explicit. Since STL vectors are not automatically converted to Mat instances, you should write
+    Mat(vec) explicitly. Unless you copy the data into the matrix ( copyData=true ), no new elements
+    will be added to the vector because it can potentially yield vector data reallocation, and, thus,
+    the matrix data pointer will be invalid.
+    @param copyData Flag to specify whether the underlying data of the STL vector should be copied
+    to (true) or shared with (false) the newly constructed matrix. When the data is copied, the
+    allocated buffer is managed using Mat reference counting mechanism. While the data is shared,
+    the reference counter is NULL, and you should not deallocate the data until the matrix is
+    destructed.
+    */
+    template<typename _Tp> explicit Mat(const std::vector<_Tp>& vec, bool copyData=false);
+
+    /** @overload
+    */
+    template<typename _Tp, typename = typename std::enable_if<std::is_arithmetic<_Tp>::value>::type>
+    explicit Mat(const std::initializer_list<_Tp> list);
+
+    /** @overload
+    */
+    template<typename _Tp> explicit Mat(const std::initializer_list<int> sizes, const std::initializer_list<_Tp> list);
+
+    /** @overload
+    */
+    template<typename _Tp, size_t _Nm> explicit Mat(const std::array<_Tp, _Nm>& arr, bool copyData=false);
+
+    /** @overload
+    */
+    template<typename _Tp, int n> explicit Mat(const Vec<_Tp, n>& vec, bool copyData=true);
+
+    /** @overload
+    */
+    template<typename _Tp, int m, int n> explicit Mat(const Matx<_Tp, m, n>& mtx, bool copyData=true);
+
+    /** @overload
+    */
+    template<typename _Tp> explicit Mat(const Point_<_Tp>& pt, bool copyData=true);
+
+    /** @overload
+    */
+    template<typename _Tp> explicit Mat(const Point3_<_Tp>& pt, bool copyData=true);
+
+    /** @overload
+    */
+    template<typename _Tp> explicit Mat(const MatCommaInitializer_<_Tp>& commaInitializer);
+
+    //! download data from GpuMat
+    explicit Mat(const cuda::GpuMat& m);
+
+    //! destructor - calls release()
+    ~Mat();
+
+    /** @brief assignment operators
+
+    These are available assignment operators. Since they all are very different, make sure to read the
+    operator parameters description.
+    @param m Assigned, right-hand-side matrix. Matrix assignment is an O(1) operation. This means that
+    no data is copied but the data is shared and the reference counter, if any, is incremented. Before
+    assigning new data, the old data is de-referenced via Mat::release .
+     */
+    Mat& operator = (const Mat& m);
+
+    /** @overload
+    @param expr Assigned matrix expression object. As opposite to the first form of the assignment
+    operation, the second form can reuse already allocated matrix if it has the right size and type to
+    fit the matrix expression result. It is automatically handled by the real function that the matrix
+    expressions is expanded to. For example, C=A+B is expanded to add(A, B, C), and add takes care of
+    automatic C reallocation.
+    */
+    Mat& operator = (const MatExpr& expr);
+
+    //! retrieve UMat from Mat
+    UMat getUMat(AccessFlag accessFlags, UMatUsageFlags usageFlags = USAGE_DEFAULT) const;
+
+    /** @brief Creates a matrix header for the specified matrix row.
+
+    The method makes a new header for the specified matrix row and returns it. This is an O(1)
+    operation, regardless of the matrix size. The underlying data of the new matrix is shared with the
+    original matrix. Here is the example of one of the classical basic matrix processing operations,
+    axpy, used by LU and many other algorithms:
+    @code
+        inline void matrix_axpy(Mat& A, int i, int j, double alpha)
+        {
+            A.row(i) += A.row(j)*alpha;
+        }
+    @endcode
+    @note In the current implementation, the following code does not work as expected:
+    @code
+        Mat A;
+        ...
+        A.row(i) = A.row(j); // will not work
+    @endcode
+    This happens because A.row(i) forms a temporary header that is further assigned to another header.
+    Remember that each of these operations is O(1), that is, no data is copied. Thus, the above
+    assignment is not true if you may have expected the j-th row to be copied to the i-th row. To
+    achieve that, you should either turn this simple assignment into an expression or use the
+    Mat::copyTo method:
+    @code
+        Mat A;
+        ...
+        // works, but looks a bit obscure.
+        A.row(i) = A.row(j) + 0;
+        // this is a bit longer, but the recommended method.
+        A.row(j).copyTo(A.row(i));
+    @endcode
+    @param y A 0-based row index.
+     */
+    Mat row(int y) const;
+
+    /** @brief Creates a matrix header for the specified matrix column.
+
+    The method makes a new header for the specified matrix column and returns it. This is an O(1)
+    operation, regardless of the matrix size. The underlying data of the new matrix is shared with the
+    original matrix. See also the Mat::row description.
+    @param x A 0-based column index.
+     */
+    Mat col(int x) const;
+
+    /** @brief Creates a matrix header for the specified row span.
+
+    The method makes a new header for the specified row span of the matrix. Similarly to Mat::row and
+    Mat::col , this is an O(1) operation.
+    @param startrow An inclusive 0-based start index of the row span.
+    @param endrow An exclusive 0-based ending index of the row span.
+     */
+    Mat rowRange(int startrow, int endrow) const;
+
+    /** @overload
+    @param r Range structure containing both the start and the end indices.
+    */
+    Mat rowRange(const Range& r) const;
+
+    /** @brief Creates a matrix header for the specified column span.
+
+    The method makes a new header for the specified column span of the matrix. Similarly to Mat::row and
+    Mat::col , this is an O(1) operation.
+    @param startcol An inclusive 0-based start index of the column span.
+    @param endcol An exclusive 0-based ending index of the column span.
+     */
+    Mat colRange(int startcol, int endcol) const;
+
+    /** @overload
+    @param r Range structure containing both the start and the end indices.
+    */
+    Mat colRange(const Range& r) const;
+
+    /** @brief Extracts a diagonal from a matrix
+
+    The method makes a new header for the specified matrix diagonal. The new matrix is represented as a
+    single-column matrix. Similarly to Mat::row and Mat::col, this is an O(1) operation.
+    @param d index of the diagonal, with the following values:
+    - `d=0` is the main diagonal.
+    - `d<0` is a diagonal from the lower half. For example, d=-1 means the diagonal is set
+      immediately below the main one.
+    - `d>0` is a diagonal from the upper half. For example, d=1 means the diagonal is set
+      immediately above the main one.
+    For example:
+    @code
+        Mat m = (Mat_<int>(3,3) <<
+                    1,2,3,
+                    4,5,6,
+                    7,8,9);
+        Mat d0 = m.diag(0);
+        Mat d1 = m.diag(1);
+        Mat d_1 = m.diag(-1);
+    @endcode
+    The resulting matrices are
+    @code
+     d0 =
+       [1;
+        5;
+        9]
+     d1 =
+       [2;
+        6]
+     d_1 =
+       [4;
+        8]
+    @endcode
+     */
+    Mat diag(int d=0) const;
+
+    /** @brief creates a diagonal matrix
+
+    The method creates a square diagonal matrix from specified main diagonal.
+    @param d One-dimensional matrix that represents the main diagonal.
+     */
+    CV_NODISCARD_STD static Mat diag(const Mat& d);
+
+    /** @brief Creates a full copy of the array and the underlying data.
+
+    The method creates a full copy of the array. The original step[] is not taken into account. So, the
+    array copy is a continuous array occupying total()*elemSize() bytes.
+     */
+    CV_NODISCARD_STD Mat clone() const;
+
+    /** @brief Copies the matrix to another one.
+
+    The method copies the matrix data to another matrix. Before copying the data, the method invokes :
+    @code
+        m.create(this->size(), this->type());
+    @endcode
+    so that the destination matrix is reallocated if needed. While m.copyTo(m); works flawlessly, the
+    function does not handle the case of a partial overlap between the source and the destination
+    matrices.
+
+    When the operation mask is specified, if the Mat::create call shown above reallocates the matrix,
+    the newly allocated matrix is initialized with all zeros before copying the data.
+    @param m Destination matrix. If it does not have a proper size or type before the operation, it is
+    reallocated.
+     */
+    void copyTo( OutputArray m ) const;
+
+    /** @overload
+    @param m Destination matrix. If it does not have a proper size or type before the operation, it is
+    reallocated.
+    @param mask Operation mask of the same size as \*this. Its non-zero elements indicate which matrix
+    elements need to be copied. The mask has to be of type CV_8U and can have 1 or multiple channels.
+    */
+    void copyTo( OutputArray m, InputArray mask ) const;
+
+    /** @brief Converts an array to another data type with optional scaling.
+
+    The method converts source pixel values to the target data type. saturate_cast\<\> is applied at
+    the end to avoid possible overflows:
+
+    \f[m(x,y) = saturate \_ cast<rType>( \alpha (*this)(x,y) +  \beta )\f]
+    @param m output matrix; if it does not have a proper size or type before the operation, it is
+    reallocated.
+    @param rtype desired output matrix type or, rather, the depth since the number of channels is the
+    same as the input has; if rtype is negative, the output matrix will have the same type as the input.
+    @param alpha optional scale factor.
+    @param beta optional delta added to the scaled values.
+     */
+    void convertTo( OutputArray m, int rtype, double alpha=1, double beta=0 ) const;
+
+    /** @brief Provides a functional form of convertTo.
+
+    This is an internally used method called by the @ref MatrixExpressions engine.
+    @param m Destination array.
+    @param type Desired destination array depth (or -1 if it should be the same as the source type).
+     */
+    void assignTo( Mat& m, int type=-1 ) const;
+
+    /** @brief Sets all or some of the array elements to the specified value.
+    @param s Assigned scalar converted to the actual array type.
+    */
+    Mat& operator = (const Scalar& s);
+
+    /** @brief Sets all or some of the array elements to the specified value.
+
+    This is an advanced variant of the Mat::operator=(const Scalar& s) operator.
+    @param value Assigned scalar converted to the actual array type.
+    @param mask Operation mask of the same size as \*this. Its non-zero elements indicate which matrix
+    elements need to be set. The mask has to be of type CV_8U and can have 1 or multiple channels.
+     */
+    Mat& setTo(InputArray value, InputArray mask=noArray());
+
+    /** @brief Changes the shape and/or the number of channels of a 2D matrix without copying the data.
+
+    The method makes a new matrix header for \*this elements. The new matrix may have a different size
+    and/or different number of channels. Any combination is possible if:
+    -   No extra elements are included into the new matrix and no elements are excluded. Consequently,
+        the product rows\*cols\*channels() must stay the same after the transformation.
+    -   No data is copied. That is, this is an O(1) operation. Consequently, if you change the number of
+        rows, or the operation changes the indices of elements row in some other way, the matrix must be
+        continuous. See Mat::isContinuous .
+
+    For example, if there is a set of 3D points stored as an STL vector, and you want to represent the
+    points as a 3xN matrix, do the following:
+    @code
+        std::vector<Point3f> vec;
+        ...
+        Mat pointMat = Mat(vec). // convert vector to Mat, O(1) operation
+                          reshape(1). // make Nx3 1-channel matrix out of Nx1 3-channel.
+                                      // Also, an O(1) operation
+                             t(); // finally, transpose the Nx3 matrix.
+                                  // This involves copying all the elements
+    @endcode
+    3-channel 2x2 matrix reshaped to 1-channel 4x3 matrix, each column has values from one of original channels:
+    @code
+    Mat m(Size(2, 2), CV_8UC3, Scalar(1, 2, 3));
+    vector<int> new_shape {4, 3};
+    m = m.reshape(1, new_shape);
+    @endcode
+    or:
+    @code
+    Mat m(Size(2, 2), CV_8UC3, Scalar(1, 2, 3));
+    const int new_shape[] = {4, 3};
+    m = m.reshape(1, 2, new_shape);
+    @endcode
+    @param cn New number of channels. If the parameter is 0, the number of channels remains the same.
+    @param rows New number of rows. If the parameter is 0, the number of rows remains the same.
+     */
+    Mat reshape(int cn, int rows=0) const;
+
+    /** @overload
+     * @param cn New number of channels. If the parameter is 0, the number of channels remains the same.
+     * @param newndims New number of dimensions.
+     * @param newsz Array with new matrix size by all dimensions. If some sizes are zero,
+     * the original sizes in those dimensions are presumed.
+     */
+    Mat reshape(int cn, int newndims, const int* newsz) const;
+
+    /** @overload
+     * @param cn New number of channels. If the parameter is 0, the number of channels remains the same.
+     * @param newshape Vector with new matrix size by all dimensions. If some sizes are zero,
+     * the original sizes in those dimensions are presumed.
+     */
+    Mat reshape(int cn, const std::vector<int>& newshape) const;
+
+    /** @brief Transposes a matrix.
+
+    The method performs matrix transposition by means of matrix expressions. It does not perform the
+    actual transposition but returns a temporary matrix transposition object that can be further used as
+    a part of more complex matrix expressions or can be assigned to a matrix:
+    @code
+        Mat A1 = A + Mat::eye(A.size(), A.type())*lambda;
+        Mat C = A1.t()*A1; // compute (A + lambda*I)^t * (A + lambda*I)
+    @endcode
+     */
+    MatExpr t() const;
+
+    /** @brief Inverts a matrix.
+
+    The method performs a matrix inversion by means of matrix expressions. This means that a temporary
+    matrix inversion object is returned by the method and can be used further as a part of more complex
+    matrix expressions or can be assigned to a matrix.
+    @param method Matrix inversion method. One of cv::DecompTypes
+     */
+    MatExpr inv(int method=DECOMP_LU) const;
+
+    /** @brief Performs an element-wise multiplication or division of the two matrices.
+
+    The method returns a temporary object encoding per-element array multiplication, with optional
+    scale. Note that this is not a matrix multiplication that corresponds to a simpler "\*" operator.
+
+    Example:
+    @code
+        Mat C = A.mul(5/B); // equivalent to divide(A, B, C, 5)
+    @endcode
+    @param m Another array of the same type and the same size as \*this, or a matrix expression.
+    @param scale Optional scale factor.
+     */
+    MatExpr mul(InputArray m, double scale=1) const;
+
+    /** @brief Computes a cross-product of two 3-element vectors.
+
+    The method computes a cross-product of two 3-element vectors. The vectors must be 3-element
+    floating-point vectors of the same shape and size. The result is another 3-element vector of the
+    same shape and type as operands.
+    @param m Another cross-product operand.
+     */
+    Mat cross(InputArray m) const;
+
+    /** @brief Computes a dot-product of two vectors.
+
+    The method computes a dot-product of two matrices. If the matrices are not single-column or
+    single-row vectors, the top-to-bottom left-to-right scan ordering is used to treat them as 1D
+    vectors. The vectors must have the same size and type. If the matrices have more than one channel,
+    the dot products from all the channels are summed together.
+    @param m another dot-product operand.
+     */
+    double dot(InputArray m) const;
+
+    /** @brief Returns a zero array of the specified size and type.
+
+    The method returns a Matlab-style zero array initializer. It can be used to quickly form a constant
+    array as a function parameter, part of a matrix expression, or as a matrix initializer:
+    @code
+        Mat A;
+        A = Mat::zeros(3, 3, CV_32F);
+    @endcode
+    In the example above, a new matrix is allocated only if A is not a 3x3 floating-point matrix.
+    Otherwise, the existing matrix A is filled with zeros.
+    @param rows Number of rows.
+    @param cols Number of columns.
+    @param type Created matrix type.
+     */
+    CV_NODISCARD_STD static MatExpr zeros(int rows, int cols, int type);
+
+    /** @overload
+    @param size Alternative to the matrix size specification Size(cols, rows) .
+    @param type Created matrix type.
+    */
+    CV_NODISCARD_STD static MatExpr zeros(Size size, int type);
+
+    /** @overload
+    @param ndims Array dimensionality.
+    @param sz Array of integers specifying the array shape.
+    @param type Created matrix type.
+    */
+    CV_NODISCARD_STD static MatExpr zeros(int ndims, const int* sz, int type);
+
+    /** @brief Returns an array of all 1's of the specified size and type.
+
+    The method returns a Matlab-style 1's array initializer, similarly to Mat::zeros. Note that using
+    this method you can initialize an array with an arbitrary value, using the following Matlab idiom:
+    @code
+        Mat A = Mat::ones(100, 100, CV_8U)*3; // make 100x100 matrix filled with 3.
+    @endcode
+    The above operation does not form a 100x100 matrix of 1's and then multiply it by 3. Instead, it
+    just remembers the scale factor (3 in this case) and uses it when actually invoking the matrix
+    initializer.
+    @note In case of multi-channels type, only the first channel will be initialized with 1's, the
+    others will be set to 0's.
+    @param rows Number of rows.
+    @param cols Number of columns.
+    @param type Created matrix type.
+     */
+    CV_NODISCARD_STD static MatExpr ones(int rows, int cols, int type);
+
+    /** @overload
+    @param size Alternative to the matrix size specification Size(cols, rows) .
+    @param type Created matrix type.
+    */
+    CV_NODISCARD_STD static MatExpr ones(Size size, int type);
+
+    /** @overload
+    @param ndims Array dimensionality.
+    @param sz Array of integers specifying the array shape.
+    @param type Created matrix type.
+    */
+    CV_NODISCARD_STD static MatExpr ones(int ndims, const int* sz, int type);
+
+    /** @brief Returns an identity matrix of the specified size and type.
+
+    The method returns a Matlab-style identity matrix initializer, similarly to Mat::zeros. Similarly to
+    Mat::ones, you can use a scale operation to create a scaled identity matrix efficiently:
+    @code
+        // make a 4x4 diagonal matrix with 0.1's on the diagonal.
+        Mat A = Mat::eye(4, 4, CV_32F)*0.1;
+    @endcode
+    @note In case of multi-channels type, identity matrix will be initialized only for the first channel,
+    the others will be set to 0's
+    @param rows Number of rows.
+    @param cols Number of columns.
+    @param type Created matrix type.
+     */
+    CV_NODISCARD_STD static MatExpr eye(int rows, int cols, int type);
+
+    /** @overload
+    @param size Alternative matrix size specification as Size(cols, rows) .
+    @param type Created matrix type.
+    */
+    CV_NODISCARD_STD static MatExpr eye(Size size, int type);
+
+    /** @brief Allocates new array data if needed.
+
+    This is one of the key Mat methods. Most new-style OpenCV functions and methods that produce arrays
+    call this method for each output array. The method uses the following algorithm:
+
+    -# If the current array shape and the type match the new ones, return immediately. Otherwise,
+       de-reference the previous data by calling Mat::release.
+    -# Initialize the new header.
+    -# Allocate the new data of total()\*elemSize() bytes.
+    -# Allocate the new, associated with the data, reference counter and set it to 1.
+
+    Such a scheme makes the memory management robust and efficient at the same time and helps avoid
+    extra typing for you. This means that usually there is no need to explicitly allocate output arrays.
+    That is, instead of writing:
+    @code
+        Mat color;
+        ...
+        Mat gray(color.rows, color.cols, color.depth());
+        cvtColor(color, gray, COLOR_BGR2GRAY);
+    @endcode
+    you can simply write:
+    @code
+        Mat color;
+        ...
+        Mat gray;
+        cvtColor(color, gray, COLOR_BGR2GRAY);
+    @endcode
+    because cvtColor, as well as the most of OpenCV functions, calls Mat::create() for the output array
+    internally.
+    @param rows New number of rows.
+    @param cols New number of columns.
+    @param type New matrix type.
+     */
+    void create(int rows, int cols, int type);
+
+    /** @overload
+    @param size Alternative new matrix size specification: Size(cols, rows)
+    @param type New matrix type.
+    */
+    void create(Size size, int type);
+
+    /** @overload
+    @param ndims New array dimensionality.
+    @param sizes Array of integers specifying a new array shape.
+    @param type New matrix type.
+    */
+    void create(int ndims, const int* sizes, int type);
+
+    /** @overload
+    @param sizes Array of integers specifying a new array shape.
+    @param type New matrix type.
+    */
+    void create(const std::vector<int>& sizes, int type);
+
+    /** @brief Increments the reference counter.
+
+    The method increments the reference counter associated with the matrix data. If the matrix header
+    points to an external data set (see Mat::Mat ), the reference counter is NULL, and the method has no
+    effect in this case. Normally, to avoid memory leaks, the method should not be called explicitly. It
+    is called implicitly by the matrix assignment operator. The reference counter increment is an atomic
+    operation on the platforms that support it. Thus, it is safe to operate on the same matrices
+    asynchronously in different threads.
+     */
+    void addref();
+
+    /** @brief Decrements the reference counter and deallocates the matrix if needed.
+
+    The method decrements the reference counter associated with the matrix data. When the reference
+    counter reaches 0, the matrix data is deallocated and the data and the reference counter pointers
+    are set to NULL's. If the matrix header points to an external data set (see Mat::Mat ), the
+    reference counter is NULL, and the method has no effect in this case.
+
+    This method can be called manually to force the matrix data deallocation. But since this method is
+    automatically called in the destructor, or by any other method that changes the data pointer, it is
+    usually not needed. The reference counter decrement and check for 0 is an atomic operation on the
+    platforms that support it. Thus, it is safe to operate on the same matrices asynchronously in
+    different threads.
+     */
+    void release();
+
+    //! internal use function, consider using the 'release' method instead; deallocates the matrix data
+    void deallocate();
+    //! internal use function; properly re-allocates _size, _step arrays
+    void copySize(const Mat& m);
+
+    /** @brief Reserves space for the certain number of rows.
+
+    The method reserves space for sz rows. If the matrix already has enough space to store sz rows,
+    nothing happens. If the matrix is reallocated, the first Mat::rows rows are preserved. The method
+    emulates the corresponding method of the STL vector class.
+    @param sz Number of rows.
+     */
+    void reserve(size_t sz);
+
+    /** @brief Reserves space for the certain number of bytes.
+
+    The method reserves space for sz bytes. If the matrix already has enough space to store sz bytes,
+    nothing happens. If matrix has to be reallocated its previous content could be lost.
+    @param sz Number of bytes.
+    */
+    void reserveBuffer(size_t sz);
+
+    /** @brief Changes the number of matrix rows.
+
+    The methods change the number of matrix rows. If the matrix is reallocated, the first
+    min(Mat::rows, sz) rows are preserved. The methods emulate the corresponding methods of the STL
+    vector class.
+    @param sz New number of rows.
+     */
+    void resize(size_t sz);
+
+    /** @overload
+    @param sz New number of rows.
+    @param s Value assigned to the newly added elements.
+     */
+    void resize(size_t sz, const Scalar& s);
+
+    //! internal function
+    void push_back_(const void* elem);
+
+    /** @brief Adds elements to the bottom of the matrix.
+
+    The methods add one or more elements to the bottom of the matrix. They emulate the corresponding
+    method of the STL vector class. When elem is Mat , its type and the number of columns must be the
+    same as in the container matrix.
+    @param elem Added element(s).
+     */
+    template<typename _Tp> void push_back(const _Tp& elem);
+
+    /** @overload
+    @param elem Added element(s).
+    */
+    template<typename _Tp> void push_back(const Mat_<_Tp>& elem);
+
+    /** @overload
+    @param elem Added element(s).
+    */
+    template<typename _Tp> void push_back(const std::vector<_Tp>& elem);
+
+    /** @overload
+    @param m Added line(s).
+    */
+    void push_back(const Mat& m);
+
+    /** @brief Removes elements from the bottom of the matrix.
+
+    The method removes one or more rows from the bottom of the matrix.
+    @param nelems Number of removed rows. If it is greater than the total number of rows, an exception
+    is thrown.
+     */
+    void pop_back(size_t nelems=1);
+
+    /** @brief Locates the matrix header within a parent matrix.
+
+    After you extracted a submatrix from a matrix using Mat::row, Mat::col, Mat::rowRange,
+    Mat::colRange, and others, the resultant submatrix points just to the part of the original big
+    matrix. However, each submatrix contains information (represented by datastart and dataend
+    fields) that helps reconstruct the original matrix size and the position of the extracted
+    submatrix within the original matrix. The method locateROI does exactly that.
+    @param wholeSize Output parameter that contains the size of the whole matrix containing *this*
+    as a part.
+    @param ofs Output parameter that contains an offset of *this* inside the whole matrix.
+     */
+    void locateROI( Size& wholeSize, Point& ofs ) const;
+
+    /** @brief Adjusts a submatrix size and position within the parent matrix.
+
+    The method is complementary to Mat::locateROI . The typical use of these functions is to determine
+    the submatrix position within the parent matrix and then shift the position somehow. Typically, it
+    can be required for filtering operations when pixels outside of the ROI should be taken into
+    account. When all the method parameters are positive, the ROI needs to grow in all directions by the
+    specified amount, for example:
+    @code
+        A.adjustROI(2, 2, 2, 2);
+    @endcode
+    In this example, the matrix size is increased by 4 elements in each dimension. The matrix is shifted
+    by 2 elements to the left and 2 elements up, which brings in all the necessary pixels for the
+    filtering with the 5x5 kernel.
+
+    adjustROI forces the adjusted ROI to be inside of the parent matrix, that is, the boundaries of the
+    adjusted ROI are constrained by the boundaries of the parent matrix. For example, if the submatrix A is
+    located in the first row of a parent matrix and you called A.adjustROI(2, 2, 2, 2) then A will not
+    be increased in the upward direction.
+
+    The function is used internally by the OpenCV filtering functions, like filter2D , morphological
+    operations, and so on.
+    @param dtop Shift of the top submatrix boundary upwards.
+    @param dbottom Shift of the bottom submatrix boundary downwards.
+    @param dleft Shift of the left submatrix boundary to the left.
+    @param dright Shift of the right submatrix boundary to the right.
+    @sa copyMakeBorder
+     */
+    Mat& adjustROI( int dtop, int dbottom, int dleft, int dright );
+
+    /** @brief Extracts a rectangular submatrix.
+
+    The operators make a new header for the specified sub-array of \*this . They are the most
+    generalized forms of Mat::row, Mat::col, Mat::rowRange, and Mat::colRange . For example,
+    `A(Range(0, 10), Range::all())` is equivalent to `A.rowRange(0, 10)`. Similarly to all of the above,
+    the operators are O(1) operations, that is, no matrix data is copied.
+    @param rowRange Start and end row of the extracted submatrix. The upper boundary is not included. To
+    select all the rows, use Range::all().
+    @param colRange Start and end column of the extracted submatrix. The upper boundary is not included.
+    To select all the columns, use Range::all().
+     */
+    Mat operator()( Range rowRange, Range colRange ) const;
+
+    /** @overload
+    @param roi Extracted submatrix specified as a rectangle.
+    */
+    Mat operator()( const Rect& roi ) const;
+
+    /** @overload
+    @param ranges Array of selected ranges along each array dimension.
+    */
+    Mat operator()( const Range* ranges ) const;
+
+    /** @overload
+    @param ranges Array of selected ranges along each array dimension.
+    */
+    Mat operator()(const std::vector<Range>& ranges) const;
+
+    template<typename _Tp> operator std::vector<_Tp>() const;
+    template<typename _Tp, int n> operator Vec<_Tp, n>() const;
+    template<typename _Tp, int m, int n> operator Matx<_Tp, m, n>() const;
+
+    template<typename _Tp, std::size_t _Nm> operator std::array<_Tp, _Nm>() const;
+
+    /** @brief Reports whether the matrix is continuous or not.
+
+    The method returns true if the matrix elements are stored continuously without gaps at the end of
+    each row. Otherwise, it returns false. Obviously, 1x1 or 1xN matrices are always continuous.
+    Matrices created with Mat::create are always continuous. But if you extract a part of the matrix
+    using Mat::col, Mat::diag, and so on, or construct a matrix header for externally allocated data,
+    such matrices may no longer have this property.
+
+    The continuity flag is stored as a bit in the Mat::flags field and is computed automatically when
+    you construct a matrix header. Thus, the continuity check is a very fast operation, though
+    theoretically it could be done as follows:
+    @code
+        // alternative implementation of Mat::isContinuous()
+        bool myCheckMatContinuity(const Mat& m)
+        {
+            //return (m.flags & Mat::CONTINUOUS_FLAG) != 0;
+            return m.rows == 1 || m.step == m.cols*m.elemSize();
+        }
+    @endcode
+    The method is used in quite a few of OpenCV functions. The point is that element-wise operations
+    (such as arithmetic and logical operations, math functions, alpha blending, color space
+    transformations, and others) do not depend on the image geometry. Thus, if all the input and output
+    arrays are continuous, the functions can process them as very long single-row vectors. The example
+    below illustrates how an alpha-blending function can be implemented:
+    @code
+        template<typename T>
+        void alphaBlendRGBA(const Mat& src1, const Mat& src2, Mat& dst)
+        {
+            const float alpha_scale = (float)std::numeric_limits<T>::max(),
+                        inv_scale = 1.f/alpha_scale;
+
+            CV_Assert( src1.type() == src2.type() &&
+                       src1.type() == CV_MAKETYPE(traits::Depth<T>::value, 4) &&
+                       src1.size() == src2.size());
+            Size size = src1.size();
+            dst.create(size, src1.type());
+
+            // here is the idiom: check the arrays for continuity and,
+            // if this is the case,
+            // treat the arrays as 1D vectors
+            if( src1.isContinuous() && src2.isContinuous() && dst.isContinuous() )
+            {
+                size.width *= size.height;
+                size.height = 1;
+            }
+            size.width *= 4;
+
+            for( int i = 0; i < size.height; i++ )
+            {
+                // when the arrays are continuous,
+                // the outer loop is executed only once
+                const T* ptr1 = src1.ptr<T>(i);
+                const T* ptr2 = src2.ptr<T>(i);
+                T* dptr = dst.ptr<T>(i);
+
+                for( int j = 0; j < size.width; j += 4 )
+                {
+                    float alpha = ptr1[j+3]*inv_scale, beta = ptr2[j+3]*inv_scale;
+                    dptr[j] = saturate_cast<T>(ptr1[j]*alpha + ptr2[j]*beta);
+                    dptr[j+1] = saturate_cast<T>(ptr1[j+1]*alpha + ptr2[j+1]*beta);
+                    dptr[j+2] = saturate_cast<T>(ptr1[j+2]*alpha + ptr2[j+2]*beta);
+                    dptr[j+3] = saturate_cast<T>((1 - (1-alpha)*(1-beta))*alpha_scale);
+                }
+            }
+        }
+    @endcode
+    This approach, while being very simple, can boost the performance of a simple element-operation by
+    10-20 percent, especially if the image is rather small and the operation is quite simple.
+
+    Another OpenCV idiom in this function is the call of Mat::create for the destination array, which
+    allocates the destination array unless it already has the proper size and type. And while the newly
+    allocated arrays are always continuous, you still need to check the destination array because
+    Mat::create does not always allocate a new matrix.
+     */
+    bool isContinuous() const;
+
+    //! returns true if the matrix is a submatrix of another matrix
+    bool isSubmatrix() const;
+
+    /** @brief Returns the matrix element size in bytes.
+
+    The method returns the matrix element size in bytes. For example, if the matrix type is CV_16SC3 ,
+    the method returns 3\*sizeof(short) or 6.
+     */
+    size_t elemSize() const;
+
+    /** @brief Returns the size of each matrix element channel in bytes.
+
+    The method returns the matrix element channel size in bytes, that is, it ignores the number of
+    channels. For example, if the matrix type is CV_16SC3 , the method returns sizeof(short) or 2.
+     */
+    size_t elemSize1() const;
+
+    /** @brief Returns the type of a matrix element.
+
+    The method returns a matrix element type. This is an identifier compatible with the CvMat type
+    system, like CV_16SC3 or 16-bit signed 3-channel array, and so on.
+     */
+    int type() const;
+
+    /** @brief Returns the depth of a matrix element.
+
+    The method returns the identifier of the matrix element depth (the type of each individual channel).
+    For example, for a 16-bit signed element array, the method returns CV_16S . A complete list of
+    matrix types contains the following values:
+    -   CV_8U - 8-bit unsigned integers ( 0..255 )
+    -   CV_8S - 8-bit signed integers ( -128..127 )
+    -   CV_16U - 16-bit unsigned integers ( 0..65535 )
+    -   CV_16S - 16-bit signed integers ( -32768..32767 )
+    -   CV_32S - 32-bit signed integers ( -2147483648..2147483647 )
+    -   CV_32F - 32-bit floating-point numbers ( -FLT_MAX..FLT_MAX, INF, NAN )
+    -   CV_64F - 64-bit floating-point numbers ( -DBL_MAX..DBL_MAX, INF, NAN )
+     */
+    int depth() const;
+
+    /** @brief Returns the number of matrix channels.
+
+    The method returns the number of matrix channels.
+     */
+    int channels() const;
+
+    /** @brief Returns a normalized step.
+
+    The method returns a matrix step divided by Mat::elemSize1() . It can be useful to quickly access an
+    arbitrary matrix element.
+     */
+    size_t step1(int i=0) const;
+
+    /** @brief Returns true if the array has no elements.
+
+    The method returns true if Mat::total() is 0 or if Mat::data is NULL. Because of pop_back() and
+    resize() methods `M.total() == 0` does not imply that `M.data == NULL`.
+     */
+    bool empty() const;
+
+    /** @brief Returns the total number of array elements.
+
+    The method returns the number of array elements (a number of pixels if the array represents an
+    image).
+     */
+    size_t total() const;
+
+    /** @brief Returns the total number of array elements.
+
+     The method returns the number of elements within a certain sub-array slice with startDim <= dim < endDim
+     */
+    size_t total(int startDim, int endDim=INT_MAX) const;
+
+    /**
+     * @param elemChannels Number of channels or number of columns the matrix should have.
+     *                     For a 2-D matrix, when the matrix has only 1 column, then it should have
+     *                     elemChannels channels; When the matrix has only 1 channel,
+     *                     then it should have elemChannels columns.
+     *                     For a 3-D matrix, it should have only one channel. Furthermore,
+     *                     if the number of planes is not one, then the number of rows
+     *                     within every plane has to be 1; if the number of rows within
+     *                     every plane is not 1, then the number of planes has to be 1.
+     * @param depth The depth the matrix should have. Set it to -1 when any depth is fine.
+     * @param requireContinuous Set it to true to require the matrix to be continuous
+     * @return -1 if the requirement is not satisfied.
+     *         Otherwise, it returns the number of elements in the matrix. Note
+     *         that an element may have multiple channels.
+     *
+     * The following code demonstrates its usage for a 2-d matrix:
+     * @snippet snippets/core_mat_checkVector.cpp example-2d
+     *
+     * The following code demonstrates its usage for a 3-d matrix:
+     * @snippet snippets/core_mat_checkVector.cpp example-3d
+     */
+    int checkVector(int elemChannels, int depth=-1, bool requireContinuous=true) const;
+
+    /** @brief Returns a pointer to the specified matrix row.
+
+    The methods return `uchar*` or typed pointer to the specified matrix row. See the sample in
+    Mat::isContinuous to know how to use these methods.
+    @param i0 A 0-based row index.
+     */
+    uchar* ptr(int i0=0);
+    /** @overload */
+    const uchar* ptr(int i0=0) const;
+
+    /** @overload
+    @param row Index along the dimension 0
+    @param col Index along the dimension 1
+    */
+    uchar* ptr(int row, int col);
+    /** @overload
+    @param row Index along the dimension 0
+    @param col Index along the dimension 1
+    */
+    const uchar* ptr(int row, int col) const;
+
+    /** @overload */
+    uchar* ptr(int i0, int i1, int i2);
+    /** @overload */
+    const uchar* ptr(int i0, int i1, int i2) const;
+
+    /** @overload */
+    uchar* ptr(const int* idx);
+    /** @overload */
+    const uchar* ptr(const int* idx) const;
+    /** @overload */
+    template<int n> uchar* ptr(const Vec<int, n>& idx);
+    /** @overload */
+    template<int n> const uchar* ptr(const Vec<int, n>& idx) const;
+
+    /** @overload */
+    template<typename _Tp> _Tp* ptr(int i0=0);
+    /** @overload */
+    template<typename _Tp> const _Tp* ptr(int i0=0) const;
+    /** @overload
+    @param row Index along the dimension 0
+    @param col Index along the dimension 1
+    */
+    template<typename _Tp> _Tp* ptr(int row, int col);
+    /** @overload
+    @param row Index along the dimension 0
+    @param col Index along the dimension 1
+    */
+    template<typename _Tp> const _Tp* ptr(int row, int col) const;
+    /** @overload */
+    template<typename _Tp> _Tp* ptr(int i0, int i1, int i2);
+    /** @overload */
+    template<typename _Tp> const _Tp* ptr(int i0, int i1, int i2) const;
+    /** @overload */
+    template<typename _Tp> _Tp* ptr(const int* idx);
+    /** @overload */
+    template<typename _Tp> const _Tp* ptr(const int* idx) const;
+    /** @overload */
+    template<typename _Tp, int n> _Tp* ptr(const Vec<int, n>& idx);
+    /** @overload */
+    template<typename _Tp, int n> const _Tp* ptr(const Vec<int, n>& idx) const;
+
+    /** @brief Returns a reference to the specified array element.
+
+    The template methods return a reference to the specified array element. For the sake of higher
+    performance, the index range checks are only performed in the Debug configuration.
+
+    Note that the variants with a single index (i) can be used to access elements of single-row or
+    single-column 2-dimensional arrays. That is, if, for example, A is a 1 x N floating-point matrix and
+    B is an M x 1 integer matrix, you can simply write `A.at<float>(k+4)` and `B.at<int>(2*i+1)`
+    instead of `A.at<float>(0,k+4)` and `B.at<int>(2*i+1,0)`, respectively.
+
+    The example below initializes a Hilbert matrix:
+    @code
+        Mat H(100, 100, CV_64F);
+        for(int i = 0; i < H.rows; i++)
+            for(int j = 0; j < H.cols; j++)
+                H.at<double>(i,j)=1./(i+j+1);
+    @endcode
+
+    Keep in mind that the size identifier used in the at operator cannot be chosen at random. It depends
+    on the image from which you are trying to retrieve the data. The list below gives better insight into this:
+     - If matrix is of type `CV_8U` then use `Mat.at<uchar>(y,x)`.
+     - If matrix is of type `CV_8S` then use `Mat.at<schar>(y,x)`.
+     - If matrix is of type `CV_16U` then use `Mat.at<ushort>(y,x)`.
+     - If matrix is of type `CV_16S` then use `Mat.at<short>(y,x)`.
+     - If matrix is of type `CV_32S`  then use `Mat.at<int>(y,x)`.
+     - If matrix is of type `CV_32F`  then use `Mat.at<float>(y,x)`.
+     - If matrix is of type `CV_64F` then use `Mat.at<double>(y,x)`.
+
+    @param i0 Index along the dimension 0
+     */
+    template<typename _Tp> _Tp& at(int i0=0);
+    /** @overload
+    @param i0 Index along the dimension 0
+    */
+    template<typename _Tp> const _Tp& at(int i0=0) const;
+    /** @overload
+    @param row Index along the dimension 0
+    @param col Index along the dimension 1
+    */
+    template<typename _Tp> _Tp& at(int row, int col);
+    /** @overload
+    @param row Index along the dimension 0
+    @param col Index along the dimension 1
+    */
+    template<typename _Tp> const _Tp& at(int row, int col) const;
+
+    /** @overload
+    @param i0 Index along the dimension 0
+    @param i1 Index along the dimension 1
+    @param i2 Index along the dimension 2
+    */
+    template<typename _Tp> _Tp& at(int i0, int i1, int i2);
+    /** @overload
+    @param i0 Index along the dimension 0
+    @param i1 Index along the dimension 1
+    @param i2 Index along the dimension 2
+    */
+    template<typename _Tp> const _Tp& at(int i0, int i1, int i2) const;
+
+    /** @overload
+    @param idx Array of Mat::dims indices.
+    */
+    template<typename _Tp> _Tp& at(const int* idx);
+    /** @overload
+    @param idx Array of Mat::dims indices.
+    */
+    template<typename _Tp> const _Tp& at(const int* idx) const;
+
+    /** @overload */
+    template<typename _Tp, int n> _Tp& at(const Vec<int, n>& idx);
+    /** @overload */
+    template<typename _Tp, int n> const _Tp& at(const Vec<int, n>& idx) const;
+
+    /** @overload
+    special versions for 2D arrays (especially convenient for referencing image pixels)
+    @param pt Element position specified as Point(j,i) .
+    */
+    template<typename _Tp> _Tp& at(Point pt);
+    /** @overload
+    special versions for 2D arrays (especially convenient for referencing image pixels)
+    @param pt Element position specified as Point(j,i) .
+    */
+    template<typename _Tp> const _Tp& at(Point pt) const;
+
+    /** @brief Returns the matrix iterator and sets it to the first matrix element.
+
+    The methods return the matrix read-only or read-write iterators. The use of matrix iterators is very
+    similar to the use of bi-directional STL iterators. In the example below, the alpha blending
+    function is rewritten using the matrix iterators:
+    @code
+        template<typename T>
+        void alphaBlendRGBA(const Mat& src1, const Mat& src2, Mat& dst)
+        {
+            typedef Vec<T, 4> VT;
+
+            const float alpha_scale = (float)std::numeric_limits<T>::max(),
+                        inv_scale = 1.f/alpha_scale;
+
+            CV_Assert( src1.type() == src2.type() &&
+                       src1.type() == traits::Type<VT>::value &&
+                       src1.size() == src2.size());
+            Size size = src1.size();
+            dst.create(size, src1.type());
+
+            MatConstIterator_<VT> it1 = src1.begin<VT>(), it1_end = src1.end<VT>();
+            MatConstIterator_<VT> it2 = src2.begin<VT>();
+            MatIterator_<VT> dst_it = dst.begin<VT>();
+
+            for( ; it1 != it1_end; ++it1, ++it2, ++dst_it )
+            {
+                VT pix1 = *it1, pix2 = *it2;
+                float alpha = pix1[3]*inv_scale, beta = pix2[3]*inv_scale;
+                *dst_it = VT(saturate_cast<T>(pix1[0]*alpha + pix2[0]*beta),
+                             saturate_cast<T>(pix1[1]*alpha + pix2[1]*beta),
+                             saturate_cast<T>(pix1[2]*alpha + pix2[2]*beta),
+                             saturate_cast<T>((1 - (1-alpha)*(1-beta))*alpha_scale));
+            }
+        }
+    @endcode
+     */
+    template<typename _Tp> MatIterator_<_Tp> begin();
+    template<typename _Tp> MatConstIterator_<_Tp> begin() const;
+
+    /** @brief Same as begin() but for inverse traversal
+     */
+    template<typename _Tp> std::reverse_iterator<MatIterator_<_Tp>> rbegin();
+    template<typename _Tp> std::reverse_iterator<MatConstIterator_<_Tp>> rbegin() const;
+
+    /** @brief Returns the matrix iterator and sets it to the after-last matrix element.
+
+    The methods return the matrix read-only or read-write iterators, set to the point following the last
+    matrix element.
+     */
+    template<typename _Tp> MatIterator_<_Tp> end();
+    template<typename _Tp> MatConstIterator_<_Tp> end() const;
+
+    /** @brief Same as end() but for inverse traversal
+     */
+    template<typename _Tp> std::reverse_iterator< MatIterator_<_Tp>> rend();
+    template<typename _Tp> std::reverse_iterator< MatConstIterator_<_Tp>> rend() const;
+
+
+    /** @brief Runs the given functor over all matrix elements in parallel.
+
+    The operation passed as argument has to be a function pointer, a function object or a lambda(C++11).
+
+    Example 1. All of the operations below put 0xFF into the first channel of all matrix elements:
+    @code
+        Mat image(1920, 1080, CV_8UC3);
+        typedef cv::Point3_<uint8_t> Pixel;
+
+        // first. raw pointer access.
+        for (int r = 0; r < image.rows; ++r) {
+            Pixel* ptr = image.ptr<Pixel>(r, 0);
+            const Pixel* ptr_end = ptr + image.cols;
+            for (; ptr != ptr_end; ++ptr) {
+                ptr->x = 255;
+            }
+        }
+
+        // Using MatIterator. (Simple, but there is iterator overhead)
+        for (Pixel &p : cv::Mat_<Pixel>(image)) {
+            p.x = 255;
+        }
+
+        // Parallel execution with function object.
+        struct Operator {
+            void operator ()(Pixel &pixel, const int * position) {
+                pixel.x = 255;
+            }
+        };
+        image.forEach<Pixel>(Operator());
+
+        // Parallel execution using C++11 lambda.
+        image.forEach<Pixel>([](Pixel &p, const int * position) -> void {
+            p.x = 255;
+        });
+    @endcode
+    Example 2. Using the pixel's position:
+    @code
+        // Create a 3D matrix (255 x 255 x 255) of uint8_t triplets
+        // and initialize every element with the value of its own position,
+        // i.e. the pixel at (x,y,z) = (1,2,3) gets (b,g,r) = (1,2,3).
+
+        int sizes[] = { 255, 255, 255 };
+        typedef cv::Point3_<uint8_t> Pixel;
+
+        Mat_<Pixel> image = Mat::zeros(3, sizes, CV_8UC3);
+
+        image.forEach<Pixel>([](Pixel& pixel, const int position[]) -> void {
+            pixel.x = position[0];
+            pixel.y = position[1];
+            pixel.z = position[2];
+        });
+    @endcode
+     */
+    template<typename _Tp, typename Functor> void forEach(const Functor& operation);
+    /** @overload */
+    template<typename _Tp, typename Functor> void forEach(const Functor& operation) const;
+
+    Mat(Mat&& m);
+    Mat& operator = (Mat&& m);
+
+    enum { MAGIC_VAL  = 0x42FF0000, AUTO_STEP = 0, CONTINUOUS_FLAG = CV_MAT_CONT_FLAG, SUBMATRIX_FLAG = CV_SUBMAT_FLAG };
+    enum { MAGIC_MASK = 0xFFFF0000, TYPE_MASK = 0x00000FFF, DEPTH_MASK = 7 };
+
+    /*! includes several bit-fields:
+         - the magic signature
+         - continuity flag
+         - depth
+         - number of channels
+     */
+    int flags;
+    //! the matrix dimensionality, >= 2
+    int dims;
+    //! the number of rows and columns or (-1, -1) when the matrix has more than 2 dimensions
+    int rows, cols;
+    //! pointer to the data
+    uchar* data;
+
+    //! helper fields used in locateROI and adjustROI
+    const uchar* datastart;
+    const uchar* dataend;
+    const uchar* datalimit;
+
+    //! custom allocator
+    MatAllocator* allocator;
+    //! and the standard allocator
+    static MatAllocator* getStdAllocator();
+    static MatAllocator* getDefaultAllocator();
+    static void setDefaultAllocator(MatAllocator* allocator);
+
+    //! internal use method: updates the continuity flag
+    void updateContinuityFlag();
+
+    //! interaction with UMat
+    UMatData* u;
+
+    MatSize size;
+    MatStep step;
+
+protected:
+    template<typename _Tp, typename Functor> void forEach_impl(const Functor& operation);
+};
+
+
+///////////////////////////////// Mat_<_Tp> ////////////////////////////////////
+
+/** @brief Template matrix class derived from Mat
+
+@code{.cpp}
+    template<typename _Tp> class Mat_ : public Mat
+    {
+    public:
+        // ... some specific methods
+        //         and
+        // no new extra fields
+    };
+@endcode
+The class `Mat_<_Tp>` is a *thin* template wrapper on top of the Mat class. It does not have any
+extra data fields. Neither this class nor Mat has any virtual methods. Thus, references or pointers to
+these two classes can be freely but carefully converted one to another. For example:
+@code{.cpp}
+    // create a 100x100 8-bit matrix
+    Mat M(100,100,CV_8U);
+    // this compiles fine; no data conversion is done.
+    Mat_<float>& M1 = (Mat_<float>&)M;
+    // the program is likely to crash at the statement below
+    M1(99,99) = 1.f;
+@endcode
+While Mat is sufficient in most cases, Mat_ can be more convenient if you use a lot of element
+access operations and if you know matrix type at the compilation time. Note that
+`Mat::at(int y,int x)` and `Mat_::operator()(int y,int x)` do absolutely the same
+and run at the same speed, but the latter is certainly shorter:
+@code{.cpp}
+    Mat_<double> M(20,20);
+    for(int i = 0; i < M.rows; i++)
+        for(int j = 0; j < M.cols; j++)
+            M(i,j) = 1./(i+j+1);
+    Mat E, V;
+    eigen(M,E,V);
+    cout << E.at<double>(0,0)/E.at<double>(M.rows-1,0);
+@endcode
+To use Mat_ for multi-channel images/matrices, pass Vec as a Mat_ parameter:
+@code{.cpp}
+    // allocate a 320x240 color image and fill it with green (in RGB space)
+    Mat_<Vec3b> img(240, 320, Vec3b(0,255,0));
+    // now draw a diagonal white line
+    for(int i = 0; i < 100; i++)
+        img(i,i)=Vec3b(255,255,255);
+    // and now scramble the 2nd (red) channel of each pixel
+    for(int i = 0; i < img.rows; i++)
+        for(int j = 0; j < img.cols; j++)
+            img(i,j)[2] ^= (uchar)(i ^ j);
+@endcode
+Mat_ is fully compatible with the C++11 range-based for loop. For example, such a loop
+can be used to safely apply a look-up table:
+@code{.cpp}
+void applyTable(Mat_<uchar>& I, const uchar* const table)
+{
+    for(auto& pixel : I)
+    {
+        pixel = table[pixel];
+    }
+}
+@endcode
+ */
+template<typename _Tp> class Mat_ : public Mat
+{
+public:
+    typedef _Tp value_type;
+    typedef typename DataType<_Tp>::channel_type channel_type;
+    typedef MatIterator_<_Tp> iterator;
+    typedef MatConstIterator_<_Tp> const_iterator;
+
+    //! default constructor
+    Mat_() CV_NOEXCEPT;
+    //! equivalent to Mat(_rows, _cols, DataType<_Tp>::type)
+    Mat_(int _rows, int _cols);
+    //! constructor that sets each matrix element to the specified value
+    Mat_(int _rows, int _cols, const _Tp& value);
+    //! equivalent to Mat(_size, DataType<_Tp>::type)
+    explicit Mat_(Size _size);
+    //! constructor that sets each matrix element to the specified value
+    Mat_(Size _size, const _Tp& value);
+    //! n-dim array constructor
+    Mat_(int _ndims, const int* _sizes);
+    //! n-dim array constructor that sets each matrix element to the specified value
+    Mat_(int _ndims, const int* _sizes, const _Tp& value);
+    //! copy/conversion constructor. If m is of a different type, it is converted
+    Mat_(const Mat& m);
+    //! copy constructor
+    Mat_(const Mat_& m);
+    //! constructs a matrix on top of user-allocated data. _step is in bytes (not elements), regardless of the type
+    Mat_(int _rows, int _cols, _Tp* _data, size_t _step=AUTO_STEP);
+    //! constructs n-dim matrix on top of user-allocated data. _steps are in bytes (not elements), regardless of the type
+    Mat_(int _ndims, const int* _sizes, _Tp* _data, const size_t* _steps=0);
+    //! selects a submatrix
+    Mat_(const Mat_& m, const Range& rowRange, const Range& colRange=Range::all());
+    //! selects a submatrix
+    Mat_(const Mat_& m, const Rect& roi);
+    //! selects a submatrix, n-dim version
+    Mat_(const Mat_& m, const Range* ranges);
+    //! selects a submatrix, n-dim version
+    Mat_(const Mat_& m, const std::vector<Range>& ranges);
+    //! from a matrix expression
+    explicit Mat_(const MatExpr& e);
+    //! makes a matrix out of Vec, std::vector, Point_ or Point3_. The matrix will have a single column. NB: copyData defaults differ per overload
+    explicit Mat_(const std::vector<_Tp>& vec, bool copyData=false);
+    template<int n> explicit Mat_(const Vec<typename DataType<_Tp>::channel_type, n>& vec, bool copyData=true);
+    template<int m, int n> explicit Mat_(const Matx<typename DataType<_Tp>::channel_type, m, n>& mtx, bool copyData=true);
+    explicit Mat_(const Point_<typename DataType<_Tp>::channel_type>& pt, bool copyData=true);
+    explicit Mat_(const Point3_<typename DataType<_Tp>::channel_type>& pt, bool copyData=true);
+    explicit Mat_(const MatCommaInitializer_<_Tp>& commaInitializer);
+
+    Mat_(std::initializer_list<_Tp> values);
+    explicit Mat_(const std::initializer_list<int> sizes, const std::initializer_list<_Tp> values);
+
+    template <std::size_t _Nm> explicit Mat_(const std::array<_Tp, _Nm>& arr, bool copyData=false);
+
+    Mat_& operator = (const Mat& m);
+    Mat_& operator = (const Mat_& m);
+    //! sets all the elements to s
+    Mat_& operator = (const _Tp& s);
+    //! assigns a matrix expression
+    Mat_& operator = (const MatExpr& e);
+
+    //! iterators; they are smart enough to skip gaps at the end of rows
+    iterator begin();
+    iterator end();
+    const_iterator begin() const;
+    const_iterator end() const;
+
+    //! reverse iterators
+    std::reverse_iterator<iterator> rbegin();
+    std::reverse_iterator<iterator> rend();
+    std::reverse_iterator<const_iterator> rbegin() const;
+    std::reverse_iterator<const_iterator> rend() const;
+
+    //! template methods for operation over all matrix elements.
+    // the operations take care of skipping gaps at the end of rows (if any)
+    template<typename Functor> void forEach(const Functor& operation);
+    template<typename Functor> void forEach(const Functor& operation) const;
+
+    //! equivalent to Mat::create(_rows, _cols, DataType<_Tp>::type)
+    void create(int _rows, int _cols);
+    //! equivalent to Mat::create(_size, DataType<_Tp>::type)
+    void create(Size _size);
+    //! equivalent to Mat::create(_ndims, _sizes, DataType<_Tp>::type)
+    void create(int _ndims, const int* _sizes);
+    //! equivalent to Mat::release()
+    void release();
+    //! cross-product
+    Mat_ cross(const Mat_& m) const;
+    //! data type conversion
+    template<typename T2> operator Mat_<T2>() const;
+    //! overridden forms of Mat::row() etc.
+    Mat_ row(int y) const;
+    Mat_ col(int x) const;
+    Mat_ diag(int d=0) const;
+    CV_NODISCARD_STD Mat_ clone() const;
+
+    //! overridden forms of Mat::elemSize() etc.
+    size_t elemSize() const;
+    size_t elemSize1() const;
+    int type() const;
+    int depth() const;
+    int channels() const;
+    size_t step1(int i=0) const;
+    //! returns step()/sizeof(_Tp)
+    size_t stepT(int i=0) const;
+
+    //! overridden forms of Mat::zeros() etc. Data type is omitted, of course
+    CV_NODISCARD_STD static MatExpr zeros(int rows, int cols);
+    CV_NODISCARD_STD static MatExpr zeros(Size size);
+    CV_NODISCARD_STD static MatExpr zeros(int _ndims, const int* _sizes);
+    CV_NODISCARD_STD static MatExpr ones(int rows, int cols);
+    CV_NODISCARD_STD static MatExpr ones(Size size);
+    CV_NODISCARD_STD static MatExpr ones(int _ndims, const int* _sizes);
+    CV_NODISCARD_STD static MatExpr eye(int rows, int cols);
+    CV_NODISCARD_STD static MatExpr eye(Size size);
+
+    //! some more overridden methods
+    Mat_& adjustROI( int dtop, int dbottom, int dleft, int dright );
+    Mat_ operator()( const Range& rowRange, const Range& colRange ) const;
+    Mat_ operator()( const Rect& roi ) const;
+    Mat_ operator()( const Range* ranges ) const;
+    Mat_ operator()(const std::vector<Range>& ranges) const;
+
+    //! more convenient forms of row and element access operators
+    _Tp* operator [](int y);
+    const _Tp* operator [](int y) const;
+
+    //! returns reference to the specified element
+    _Tp& operator ()(const int* idx);
+    //! returns read-only reference to the specified element
+    const _Tp& operator ()(const int* idx) const;
+
+    //! returns reference to the specified element
+    template<int n> _Tp& operator ()(const Vec<int, n>& idx);
+    //! returns read-only reference to the specified element
+    template<int n> const _Tp& operator ()(const Vec<int, n>& idx) const;
+
+    //! returns reference to the specified element (1D case)
+    _Tp& operator ()(int idx0);
+    //! returns read-only reference to the specified element (1D case)
+    const _Tp& operator ()(int idx0) const;
+    //! returns reference to the specified element (2D case)
+    _Tp& operator ()(int row, int col);
+    //! returns read-only reference to the specified element (2D case)
+    const _Tp& operator ()(int row, int col) const;
+    //! returns reference to the specified element (3D case)
+    _Tp& operator ()(int idx0, int idx1, int idx2);
+    //! returns read-only reference to the specified element (3D case)
+    const _Tp& operator ()(int idx0, int idx1, int idx2) const;
+
+    _Tp& operator ()(Point pt);
+    const _Tp& operator ()(Point pt) const;
+
+    //! conversion to vector.
+    operator std::vector<_Tp>() const;
+
+    //! conversion to array.
+    template<std::size_t _Nm> operator std::array<_Tp, _Nm>() const;
+
+    //! conversion to Vec
+    template<int n> operator Vec<typename DataType<_Tp>::channel_type, n>() const;
+    //! conversion to Matx
+    template<int m, int n> operator Matx<typename DataType<_Tp>::channel_type, m, n>() const;
+
+    Mat_(Mat_&& m);
+    Mat_& operator = (Mat_&& m);
+
+    Mat_(Mat&& m);
+    Mat_& operator = (Mat&& m);
+
+    Mat_(MatExpr&& e);
+};
+
+using Mat1b = Mat_<uchar>;   // suffix 'b': uchar elements; 2b/3b/4b wrap Vec2b/Vec3b/Vec4b
+using Mat2b = Mat_<Vec2b>;
+using Mat3b = Mat_<Vec3b>;
+using Mat4b = Mat_<Vec4b>;
+
+using Mat1s = Mat_<short>;   // suffix 's': short elements
+using Mat2s = Mat_<Vec2s>;
+using Mat3s = Mat_<Vec3s>;
+using Mat4s = Mat_<Vec4s>;
+
+using Mat1w = Mat_<ushort>;  // suffix 'w': ushort elements
+using Mat2w = Mat_<Vec2w>;
+using Mat3w = Mat_<Vec3w>;
+using Mat4w = Mat_<Vec4w>;
+
+using Mat1i = Mat_<int>;     // suffix 'i': int elements
+using Mat2i = Mat_<Vec2i>;
+using Mat3i = Mat_<Vec3i>;
+using Mat4i = Mat_<Vec4i>;
+
+using Mat1f = Mat_<float>;   // suffix 'f': float elements
+using Mat2f = Mat_<Vec2f>;
+using Mat3f = Mat_<Vec3f>;
+using Mat4f = Mat_<Vec4f>;
+
+using Mat1d = Mat_<double>;  // suffix 'd': double elements
+using Mat2d = Mat_<Vec2d>;
+using Mat3d = Mat_<Vec3d>;
+using Mat4d = Mat_<Vec4d>;
+
+/** @todo document */
+class CV_EXPORTS UMat
+{
+public:
+    //! default constructor
+    UMat(UMatUsageFlags usageFlags = USAGE_DEFAULT) CV_NOEXCEPT;
+    //! constructs 2D matrix of the specified size and type
+    // (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
+    UMat(int rows, int cols, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    UMat(Size size, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    //! constructs 2D matrix and fills it with the specified value _s.
+    UMat(int rows, int cols, int type, const Scalar& s, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    UMat(Size size, int type, const Scalar& s, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+
+    //! constructs n-dimensional matrix
+    UMat(int ndims, const int* sizes, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    UMat(int ndims, const int* sizes, int type, const Scalar& s, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+
+    //! copy constructor
+    UMat(const UMat& m);
+
+    //! creates a matrix header for a part of the bigger matrix
+    UMat(const UMat& m, const Range& rowRange, const Range& colRange=Range::all());
+    UMat(const UMat& m, const Rect& roi);
+    UMat(const UMat& m, const Range* ranges);
+    UMat(const UMat& m, const std::vector<Range>& ranges);
+
+    // FIXIT copyData=false is not implemented, drop this in favor of cv::Mat (OpenCV 5.0)
+    //! builds matrix from std::vector with or without copying the data
+    template<typename _Tp> explicit UMat(const std::vector<_Tp>& vec, bool copyData=false);
+
+    //! destructor - calls release()
+    ~UMat();
+    //! assignment operators
+    UMat& operator = (const UMat& m);
+
+    Mat getMat(AccessFlag flags) const;
+
+    //! returns a new matrix header for the specified row
+    UMat row(int y) const;
+    //! returns a new matrix header for the specified column
+    UMat col(int x) const;
+    //! ... for the specified row span
+    UMat rowRange(int startrow, int endrow) const;
+    UMat rowRange(const Range& r) const;
+    //! ... for the specified column span
+    UMat colRange(int startcol, int endcol) const;
+    UMat colRange(const Range& r) const;
+    //! ... for the specified diagonal
+    //! (d=0 - the main diagonal,
+    //!  >0 - a diagonal from the upper half,
+    //!  <0 - a diagonal from the lower half)
+    UMat diag(int d=0) const;
+    //! constructs a square diagonal matrix which main diagonal is vector "d"
+    CV_NODISCARD_STD static UMat diag(const UMat& d, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat diag(const UMat& d) { return diag(d, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+
+    //! returns deep copy of the matrix, i.e. the data is copied
+    CV_NODISCARD_STD UMat clone() const;
+    //! copies the matrix content to "m".
+    // It calls m.create(this->size(), this->type()).
+    void copyTo( OutputArray m ) const;
+    //! copies those matrix elements to "m" that are marked with non-zero mask elements.
+    void copyTo( OutputArray m, InputArray mask ) const;
+    //! converts matrix to another datatype with optional scaling. See cvConvertScale.
+    void convertTo( OutputArray m, int rtype, double alpha=1, double beta=0 ) const;
+
+    void assignTo( UMat& m, int type=-1 ) const;
+
+    //! sets every matrix element to s
+    UMat& operator = (const Scalar& s);
+    //! sets some of the matrix elements to s, according to the mask
+    UMat& setTo(InputArray value, InputArray mask=noArray());
+    //! creates alternative matrix header for the same data, with different
+    // number of channels and/or different number of rows. see cvReshape.
+    UMat reshape(int cn, int rows=0) const;
+    UMat reshape(int cn, int newndims, const int* newsz) const;
+
+    //! matrix transposition by means of matrix expressions
+    UMat t() const;
+    //! matrix inversion by means of matrix expressions
+    UMat inv(int method=DECOMP_LU) const;
+    //! per-element matrix multiplication by means of matrix expressions
+    UMat mul(InputArray m, double scale=1) const;
+
+    //! computes dot-product
+    double dot(InputArray m) const;
+
+    //! Matlab-style matrix initialization
+    CV_NODISCARD_STD static UMat zeros(int rows, int cols, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat zeros(Size size, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat zeros(int ndims, const int* sz, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat zeros(int rows, int cols, int type) { return zeros(rows, cols, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    CV_NODISCARD_STD static UMat zeros(Size size, int type) { return zeros(size, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    CV_NODISCARD_STD static UMat zeros(int ndims, const int* sz, int type) { return zeros(ndims, sz, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    CV_NODISCARD_STD static UMat ones(int rows, int cols, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat ones(Size size, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat ones(int ndims, const int* sz, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat ones(int rows, int cols, int type) { return ones(rows, cols, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    CV_NODISCARD_STD static UMat ones(Size size, int type) { return ones(size, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    CV_NODISCARD_STD static UMat ones(int ndims, const int* sz, int type) { return ones(ndims, sz, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    CV_NODISCARD_STD static UMat eye(int rows, int cols, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat eye(Size size, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat eye(int rows, int cols, int type) { return eye(rows, cols, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    CV_NODISCARD_STD static UMat eye(Size size, int type) { return eye(size, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+
+    //! allocates new matrix data unless the matrix already has specified size and type.
+    // previous data is unreferenced if needed.
+    void create(int rows, int cols, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    void create(Size size, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    void create(int ndims, const int* sizes, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    void create(const std::vector<int>& sizes, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+
+    //! increases the reference counter; use with care to avoid memleaks
+    void addref();
+    //! decreases reference counter;
+    // deallocates the data when reference counter reaches 0.
+    void release();
+
+    //! deallocates the matrix data
+    void deallocate();
+    //! internal use function; properly re-allocates _size, _step arrays
+    void copySize(const UMat& m);
+
+    //! locates matrix header within a parent matrix. See below
+    void locateROI( Size& wholeSize, Point& ofs ) const;
+    //! moves/resizes the current matrix ROI inside the parent matrix.
+    UMat& adjustROI( int dtop, int dbottom, int dleft, int dright );
+    //! extracts a rectangular sub-matrix
+    // (this is a generalized form of row, rowRange etc.)
+    UMat operator()( Range rowRange, Range colRange ) const;
+    UMat operator()( const Rect& roi ) const;
+    UMat operator()( const Range* ranges ) const;
+    UMat operator()(const std::vector<Range>& ranges) const;
+
+    //! returns true iff the matrix data is continuous
+    // (i.e. when there are no gaps between successive rows).
+    // similar to CV_IS_MAT_CONT(cvmat->type)
+    bool isContinuous() const;
+
+    //! returns true if the matrix is a submatrix of another matrix
+    bool isSubmatrix() const;
+
+    //! returns element size in bytes,
+    // similar to CV_ELEM_SIZE(cvmat->type)
+    size_t elemSize() const;
+    //! returns the size of element channel in bytes.
+    size_t elemSize1() const;
+    //! returns element type, similar to CV_MAT_TYPE(cvmat->type)
+    int type() const;
+    //! returns element type, similar to CV_MAT_DEPTH(cvmat->type)
+    int depth() const;
+    //! returns the number of channels, similar to CV_MAT_CN(cvmat->type)
+    int channels() const;
+    //! returns step/elemSize1()
+    size_t step1(int i=0) const;
+    //! returns true if matrix data is NULL
+    bool empty() const;
+    //! returns the total number of matrix elements
+    size_t total() const;
+
+    //! returns N if the matrix is 1-channel (N x ptdim) or ptdim-channel (1 x N) or (N x 1); negative number otherwise
+    int checkVector(int elemChannels, int depth=-1, bool requireContinuous=true) const;
+
+    UMat(UMat&& m);
+    UMat& operator = (UMat&& m);
+
+    /*! Returns the OpenCL buffer handle on which the UMat operates.
+        The UMat instance should be kept alive while the handle is in use to prevent the buffer from being
+        returned to the OpenCV buffer pool.
+     */
+    void* handle(AccessFlag accessFlags) const;
+    void ndoffset(size_t* ofs) const;
+
+    enum { MAGIC_VAL  = 0x42FF0000, AUTO_STEP = 0, CONTINUOUS_FLAG = CV_MAT_CONT_FLAG, SUBMATRIX_FLAG = CV_SUBMAT_FLAG };
+    enum { MAGIC_MASK = 0xFFFF0000, TYPE_MASK = 0x00000FFF, DEPTH_MASK = 7 };
+
+    /*! includes several bit-fields:
+         - the magic signature
+         - continuity flag
+         - depth
+         - number of channels
+     */
+    int flags;
+
+    //! the matrix dimensionality, >= 2
+    int dims;
+
+    //! number of rows in the matrix; -1 when the matrix has more than 2 dimensions
+    int rows;
+
+    //! number of columns in the matrix; -1 when the matrix has more than 2 dimensions
+    int cols;
+
+    //! custom allocator
+    MatAllocator* allocator;
+
+    //! usage flags for allocator; it is recommended not to set this directly, but during construct/create/getUMat instead
+    UMatUsageFlags usageFlags;
+
+    //! and the standard allocator
+    static MatAllocator* getStdAllocator();
+
+    //! internal use method: updates the continuity flag
+    void updateContinuityFlag();
+
+    //! black-box container of UMat data
+    UMatData* u;
+
+    //! offset of the submatrix (or 0)
+    size_t offset;
+
+    //! dimensional size of the matrix; accessible in various formats
+    MatSize size;
+
+    //! number of bytes each matrix element/row/plane/dimension occupies
+    MatStep step;
+
+protected:
+};
+
+
+/////////////////////////// multi-dimensional sparse matrix //////////////////////////
+
+/** @brief The class SparseMat represents multi-dimensional sparse numerical arrays.
+
+Such a sparse array can store elements of any type that Mat can store. *Sparse* means that only
+non-zero elements are stored (though, as a result of operations on a sparse matrix, some of its
+stored elements can actually become 0. It is up to you to detect such elements and delete them
+using SparseMat::erase ). The non-zero elements are stored in a hash table that grows when it is
+filled so that the search time is O(1) on average (regardless of whether the element is there or not).
+Elements can be accessed using the following methods:
+-   Query operations (SparseMat::ptr and the higher-level SparseMat::ref, SparseMat::value and
+    SparseMat::find), for example:
+    @code
+        const int dims = 5;
+        int size[5] = {10, 10, 10, 10, 10};
+        SparseMat sparse_mat(dims, size, CV_32F);
+        for(int i = 0; i < 1000; i++)
+        {
+            int idx[dims];
+            for(int k = 0; k < dims; k++)
+                idx[k] = rand() % size[k];
+            sparse_mat.ref<float>(idx) += 1.f;
+        }
+        cout << "nnz = " << sparse_mat.nzcount() << endl;
+    @endcode
+-   Sparse matrix iterators. They are similar to MatIterator but different from NAryMatIterator.
+    That is, the iteration loop is familiar to STL users:
+    @code
+        // prints elements of a sparse floating-point matrix
+        // and the sum of elements.
+        SparseMatConstIterator_<float>
+            it = sparse_mat.begin<float>(),
+            it_end = sparse_mat.end<float>();
+        double s = 0;
+        int dims = sparse_mat.dims();
+        for(; it != it_end; ++it)
+        {
+            // print element indices and the element value
+            const SparseMat::Node* n = it.node();
+            printf("(");
+            for(int i = 0; i < dims; i++)
+                printf("%d%s", n->idx[i], i < dims-1 ? ", " : ")");
+            printf(": %g\n", it.value<float>());
+            s += *it;
+        }
+        printf("Element sum is %g\n", s);
+    @endcode
+    If you run this loop, you will notice that elements are not enumerated in a logical order
+    (lexicographical, and so on). They come in the same order as they are stored in the hash table
+    (semi-randomly). You may collect pointers to the nodes and sort them to get the proper ordering.
+    Note, however, that pointers to the nodes may become invalid when you add more elements to the
+    matrix. This may happen due to possible buffer reallocation.
+-   Combination of the above 2 methods when you need to process 2 or more sparse matrices
+    simultaneously. For example, this is how you can compute unnormalized cross-correlation of the 2
+    floating-point sparse matrices:
+    @code
+        double cross_corr(const SparseMat& a, const SparseMat& b)
+        {
+            const SparseMat *_a = &a, *_b = &b;
+            // if b contains fewer elements than a,
+            // it is faster to iterate through b
+            if(_a->nzcount() > _b->nzcount())
+                std::swap(_a, _b);
+            SparseMatConstIterator_<float> it = _a->begin<float>(),
+                                           it_end = _a->end<float>();
+            double ccorr = 0;
+            for(; it != it_end; ++it)
+            {
+                // take the next element from the first matrix
+                float avalue = *it;
+                const SparseMat::Node* anode = it.node();
+                // and try to find an element with the same index in the second matrix.
+                // since the hash value depends only on the element index,
+                // reuse the hash value stored in the node
+                float bvalue = _b->value<float>(anode->idx,&anode->hashval);
+                ccorr += avalue*bvalue;
+            }
+            return ccorr;
+        }
+    @endcode
+ */
+class CV_EXPORTS SparseMat
+{
+public:
+    typedef SparseMatIterator iterator;
+    typedef SparseMatConstIterator const_iterator;
+
+    enum { MAGIC_VAL=0x42FD0000, MAX_DIM=32, HASH_SCALE=0x5bd1e995, HASH_BIT=0x80000000 };
+
+    //! the sparse matrix header
+    struct CV_EXPORTS Hdr
+    {
+        Hdr(int _dims, const int* _sizes, int _type);
+        void clear();
+        int refcount;
+        int dims;
+        int valueOffset;
+        size_t nodeSize;
+        size_t nodeCount;
+        size_t freeList;
+        std::vector<uchar> pool;
+        std::vector<size_t> hashtab;
+        int size[MAX_DIM];
+    };
+
+    //! sparse matrix node - element of a hash table
+    struct CV_EXPORTS Node
+    {
+        //! hash value
+        size_t hashval;
+        //! index of the next node in the same hash table entry
+        size_t next;
+        //! index of the matrix element
+        int idx[MAX_DIM];
+    };
+
+    /** @brief Various SparseMat constructors.
+     */
+    SparseMat();
+
+    /** @overload
+    @param dims Array dimensionality.
+    @param _sizes Sparse matrix size in each dimension.
+    @param _type Sparse matrix data type.
+    */
+    SparseMat(int dims, const int* _sizes, int _type);
+
+    /** @overload
+    @param m Source sparse matrix for the copy constructor. (Dense matrices are handled by the
+    explicit SparseMat(const Mat&) overload.)
+    */
+    SparseMat(const SparseMat& m);
+
+    /** @overload
+    @param m Source dense matrix (ocvMat); it will be converted
+    to sparse representation.
+    */
+    explicit SparseMat(const Mat& m);
+
+    //! the destructor
+    ~SparseMat();
+
+    //! assignment operator. This is an O(1) operation, i.e. no data is copied
+    SparseMat& operator = (const SparseMat& m);
+    //! equivalent to the corresponding constructor
+    SparseMat& operator = (const Mat& m);
+
+    //! creates a full copy of the matrix
+    CV_NODISCARD_STD SparseMat clone() const;
+
+    //! copies all the data to the destination matrix. All the previous content of m is erased
+    void copyTo( SparseMat& m ) const;
+    //! converts sparse matrix to dense matrix.
+    void copyTo( Mat& m ) const;
+    //! multiplies all the matrix elements by the specified scale factor alpha and converts the results to the specified data type
+    void convertTo( SparseMat& m, int rtype, double alpha=1 ) const;
+    //! converts sparse matrix to dense n-dim matrix with optional type conversion and scaling.
+    /*!
+        @param [out] m - output matrix; if it does not have a proper size or type before the operation,
+            it is reallocated
+        @param [in] rtype - desired output matrix type or, rather, the depth since the number of channels
+            is the same as the input has; if rtype is negative, the output matrix will have the
+            same type as the input.
+        @param [in] alpha - optional scale factor
+        @param [in] beta - optional delta added to the scaled values
+    */
+    void convertTo( Mat& m, int rtype, double alpha=1, double beta=0 ) const;
+
+    // not used now
+    void assignTo( SparseMat& m, int type=-1 ) const;
+
+    //! reallocates sparse matrix.
+    /*!
+        If the matrix already had the proper size and type,
+        it is simply cleared with clear(), otherwise,
+        the old matrix is released (using release()) and the new one is allocated.
+    */
+    void create(int dims, const int* _sizes, int _type);
+    //! sets all the sparse matrix elements to 0, which means clearing the hash table.
+    void clear();
+    //! manually increments the reference counter to the header.
+    void addref();
+    //! decrements the header reference counter. When the counter reaches 0, the header and all the underlying data are deallocated.
+    void release();
+
+    // (disabled) converts sparse matrix to the old-style representation; all the elements are copied.
+    //operator CvSparseMat*() const;
+    //! returns the size of each element in bytes (not including the overhead - the space occupied by SparseMat::Node elements)
+    size_t elemSize() const;
+    //! returns elemSize()/channels()
+    size_t elemSize1() const;
+
+    //! returns type of sparse matrix elements
+    int type() const;
+    //! returns the depth of sparse matrix elements
+    int depth() const;
+    //! returns the number of channels
+    int channels() const;
+
+    //! returns the array of sizes, or NULL if the matrix is not allocated
+    const int* size() const;
+    //! returns the size of i-th matrix dimension (or 0)
+    int size(int i) const;
+    //! returns the matrix dimensionality
+    int dims() const;
+    //! returns the number of non-zero elements (=the number of hash table nodes)
+    size_t nzcount() const;
+
+    //! computes the element hash value (1D case)
+    size_t hash(int i0) const;
+    //! computes the element hash value (2D case)
+    size_t hash(int i0, int i1) const;
+    //! computes the element hash value (3D case)
+    size_t hash(int i0, int i1, int i2) const;
+    //! computes the element hash value (nD case)
+    size_t hash(const int* idx) const;
+
+    //!@{
+    /*!
+     specialized variants for 1D, 2D, 3D cases and the generic one for the n-D case.
+     return a pointer to the matrix element.
+      - if the element is there (it's non-zero), the pointer to it is returned
+      - if it's not there and createMissing=false, NULL pointer is returned
+      - if it's not there and createMissing=true, then the new element
+        is created and initialized with 0. Pointer to it is returned
+      - if the optional hashval pointer is not NULL, the element hash value is
+        not computed, but *hashval is taken instead.
+    */
+    //! returns pointer to the specified element (1D case)
+    uchar* ptr(int i0, bool createMissing, size_t* hashval=0);
+    //! returns pointer to the specified element (2D case)
+    uchar* ptr(int i0, int i1, bool createMissing, size_t* hashval=0);
+    //! returns pointer to the specified element (3D case)
+    uchar* ptr(int i0, int i1, int i2, bool createMissing, size_t* hashval=0);
+    //! returns pointer to the specified element (nD case)
+    uchar* ptr(const int* idx, bool createMissing, size_t* hashval=0);
+    //!@}
+
+    //!@{
+    /*!
+     return read-write reference to the specified sparse matrix element.
+
+     `ref<_Tp>(i0,...[,hashval])` is equivalent to `*(_Tp*)ptr(i0,...,true[,hashval])`.
+     The methods always return a valid reference.
+     If the element did not exist, it is created and initialized with 0.
+    */
+    //! returns reference to the specified element (1D case)
+    template<typename _Tp> _Tp& ref(int i0, size_t* hashval=0);
+    //! returns reference to the specified element (2D case)
+    template<typename _Tp> _Tp& ref(int i0, int i1, size_t* hashval=0);
+    //! returns reference to the specified element (3D case)
+    template<typename _Tp> _Tp& ref(int i0, int i1, int i2, size_t* hashval=0);
+    //! returns reference to the specified element (nD case)
+    template<typename _Tp> _Tp& ref(const int* idx, size_t* hashval=0);
+    //!@}
+
+    //!@{
+    /*!
+     return value of the specified sparse matrix element.
+
+     `value<_Tp>(i0,...[,hashval])` is equivalent to
+     @code
+     { const _Tp* p = find<_Tp>(i0,...[,hashval]); return p ? *p : _Tp(); }
+     @endcode
+
+     That is, if the element did not exist, the methods return 0.
+     */
+    //! returns value of the specified element (1D case)
+    template<typename _Tp> _Tp value(int i0, size_t* hashval=0) const;
+    //! returns value of the specified element (2D case)
+    template<typename _Tp> _Tp value(int i0, int i1, size_t* hashval=0) const;
+    //! returns value of the specified element (3D case)
+    template<typename _Tp> _Tp value(int i0, int i1, int i2, size_t* hashval=0) const;
+    //! returns value of the specified element (nD case)
+    template<typename _Tp> _Tp value(const int* idx, size_t* hashval=0) const;
+    //!@}
+
+    //!@{
+    /*!
+     Return pointer to the specified sparse matrix element if it exists
+
+     `find<_Tp>(i0,...[,hashval])` is equivalent to `(const _Tp*)ptr(i0,...,false[,hashval])`.
+
+     If the specified element does not exist, the methods return NULL.
+    */
+    //! returns pointer to the specified element (1D case)
+    template<typename _Tp> const _Tp* find(int i0, size_t* hashval=0) const;
+    //! returns pointer to the specified element (2D case)
+    template<typename _Tp> const _Tp* find(int i0, int i1, size_t* hashval=0) const;
+    //! returns pointer to the specified element (3D case)
+    template<typename _Tp> const _Tp* find(int i0, int i1, int i2, size_t* hashval=0) const;
+    //! returns pointer to the specified element (nD case)
+    template<typename _Tp> const _Tp* find(const int* idx, size_t* hashval=0) const;
+    //!@}
+
+    //! erases the specified element (2D case)
+    void erase(int i0, int i1, size_t* hashval=0);
+    //! erases the specified element (3D case)
+    void erase(int i0, int i1, int i2, size_t* hashval=0);
+    //! erases the specified element (nD case)
+    void erase(const int* idx, size_t* hashval=0);
+
+    //!@{
+    /*!
+       return the sparse matrix iterator pointing to the first sparse matrix element
+    */
+    //! returns the sparse matrix iterator at the matrix beginning
+    SparseMatIterator begin();
+    //! returns the typed sparse matrix iterator at the matrix beginning
+    template<typename _Tp> SparseMatIterator_<_Tp> begin();
+    //! returns the read-only sparse matrix iterator at the matrix beginning
+    SparseMatConstIterator begin() const;
+    //! returns the typed read-only sparse matrix iterator at the matrix beginning
+    template<typename _Tp> SparseMatConstIterator_<_Tp> begin() const;
+    //!@}
+    /*!
+       return the sparse matrix iterator pointing to the element following the last sparse matrix element
+    */
+    //! returns the sparse matrix iterator at the matrix end
+    SparseMatIterator end();
+    //! returns the read-only sparse matrix iterator at the matrix end
+    SparseMatConstIterator end() const;
+    //! returns the typed sparse matrix iterator at the matrix end
+    template<typename _Tp> SparseMatIterator_<_Tp> end();
+    //! returns the typed read-only sparse matrix iterator at the matrix end
+    template<typename _Tp> SparseMatConstIterator_<_Tp> end() const;
+
+    //! returns the value stored in the sparse matrix node
+    template<typename _Tp> _Tp& value(Node* n);
+    //! returns the value stored in the sparse matrix node (read-only)
+    template<typename _Tp> const _Tp& value(const Node* n) const;
+
+    ////////////// some internal-use methods ///////////////
+    Node* node(size_t nidx);
+    const Node* node(size_t nidx) const;
+
+    uchar* newNode(const int* idx, size_t hashval);
+    void removeNode(size_t hidx, size_t nidx, size_t previdx);
+    void resizeHashTab(size_t newsize);
+
+    int flags;
+    Hdr* hdr; //!< the reference-counted sparse matrix header (see Hdr::refcount, addref(), release())
+};
+
+
+
+///////////////////////////////// SparseMat_<_Tp> ////////////////////////////////////
+
+/** @brief Template sparse n-dimensional array class derived from SparseMat
+
+SparseMat_ is a thin wrapper on top of SparseMat created in the same way as Mat_ . It simplifies
+the notation of some operations:
+@code
+    int sz[] = {10, 20, 30};
+    SparseMat_<double> M(3, sz);
+    ...
+    M.ref(1, 2, 3) = M(4, 5, 6) + M(7, 8, 9);
+@endcode
+ */
+template<typename _Tp> class SparseMat_ : public SparseMat
+{
+public:
+    typedef SparseMatIterator_<_Tp> iterator;
+    typedef SparseMatConstIterator_<_Tp> const_iterator;
+
+    //! the default constructor
+    SparseMat_();
+    //! the full constructor equivalent to SparseMat(dims, _sizes, DataType<_Tp>::type)
+    SparseMat_(int dims, const int* _sizes);
+    //! the copy constructor. If DataType<_Tp>::type != m.type(), the m elements are converted
+    SparseMat_(const SparseMat& m);
+    //! the copy constructor. This is an O(1) operation - no data is copied
+    SparseMat_(const SparseMat_& m);
+    //! converts dense matrix to the sparse form
+    SparseMat_(const Mat& m);
+    // (disabled) converts the old-style sparse matrix to the C++ class. All the elements are copied
+    //SparseMat_(const CvSparseMat* m);
+    //! the assignment operator. If DataType<_Tp>::type != m.type(), the m elements are converted
+    SparseMat_& operator = (const SparseMat& m);
+    //! the assignment operator. This is an O(1) operation - no data is copied
+    SparseMat_& operator = (const SparseMat_& m);
+    //! converts dense matrix to the sparse form
+    SparseMat_& operator = (const Mat& m);
+
+    //! makes a full copy of the matrix. All the elements are duplicated
+    CV_NODISCARD_STD SparseMat_ clone() const;
+    //! equivalent to cv::SparseMat::create(dims, _sizes, DataType<_Tp>::type)
+    void create(int dims, const int* _sizes);
+    // (disabled) converts sparse matrix to the old-style CvSparseMat. All the elements are copied
+    //operator CvSparseMat*() const;
+
+    //! returns type of the matrix elements
+    int type() const;
+    //! returns depth of the matrix elements
+    int depth() const;
+    //! returns the number of channels in each matrix element
+    int channels() const;
+
+    //! equivalent to SparseMat::ref<_Tp>(i0, hashval)
+    _Tp& ref(int i0, size_t* hashval=0);
+    //! equivalent to SparseMat::ref<_Tp>(i0, i1, hashval)
+    _Tp& ref(int i0, int i1, size_t* hashval=0);
+    //! equivalent to SparseMat::ref<_Tp>(i0, i1, i2, hashval)
+    _Tp& ref(int i0, int i1, int i2, size_t* hashval=0);
+    //! equivalent to SparseMat::ref<_Tp>(idx, hashval)
+    _Tp& ref(const int* idx, size_t* hashval=0);
+
+    //! equivalent to SparseMat::value<_Tp>(i0, hashval)
+    _Tp operator()(int i0, size_t* hashval=0) const;
+    //! equivalent to SparseMat::value<_Tp>(i0, i1, hashval)
+    _Tp operator()(int i0, int i1, size_t* hashval=0) const;
+    //! equivalent to SparseMat::value<_Tp>(i0, i1, i2, hashval)
+    _Tp operator()(int i0, int i1, int i2, size_t* hashval=0) const;
+    //! equivalent to SparseMat::value<_Tp>(idx, hashval)
+    _Tp operator()(const int* idx, size_t* hashval=0) const;
+
+    //! returns sparse matrix iterator pointing to the first sparse matrix element
+    SparseMatIterator_<_Tp> begin();
+    //! returns read-only sparse matrix iterator pointing to the first sparse matrix element
+    SparseMatConstIterator_<_Tp> begin() const;
+    //! returns sparse matrix iterator pointing to the element following the last sparse matrix element
+    SparseMatIterator_<_Tp> end();
+    //! returns read-only sparse matrix iterator pointing to the element following the last sparse matrix element
+    SparseMatConstIterator_<_Tp> end() const;
+};
+
+
+
+////////////////////////////////// MatConstIterator //////////////////////////////////
+
+class CV_EXPORTS MatConstIterator
+{
+public:
+    typedef uchar* value_type;
+    typedef ptrdiff_t difference_type;
+    typedef const uchar** pointer;
+    typedef uchar* reference;
+
+    typedef std::random_access_iterator_tag iterator_category;
+
+    //! default constructor
+    MatConstIterator();
+    //! constructor that sets the iterator to the beginning of the matrix
+    MatConstIterator(const Mat* _m);
+    //! constructor that sets the iterator to the specified element (row, column) of the matrix
+    MatConstIterator(const Mat* _m, int _row, int _col=0);
+    //! constructor that sets the iterator to the specified element (given as a Point) of the matrix
+    MatConstIterator(const Mat* _m, Point _pt);
+    //! constructor that sets the iterator to the specified element (given as an index array) of the matrix
+    MatConstIterator(const Mat* _m, const int* _idx);
+    //! copy constructor
+    MatConstIterator(const MatConstIterator& it);
+
+    //! copy assignment operator
+    MatConstIterator& operator = (const MatConstIterator& it);
+    //! returns the current matrix element
+    const uchar* operator *() const;
+    //! returns the i-th matrix element, relative to the current
+    const uchar* operator [](ptrdiff_t i) const;
+
+    //! shifts the iterator forward by the specified number of elements
+    MatConstIterator& operator += (ptrdiff_t ofs);
+    //! shifts the iterator backward by the specified number of elements
+    MatConstIterator& operator -= (ptrdiff_t ofs);
+    //! decrements the iterator (prefix form)
+    MatConstIterator& operator --();
+    //! decrements the iterator (postfix form)
+    MatConstIterator operator --(int);
+    //! increments the iterator (prefix form)
+    MatConstIterator& operator ++();
+    //! increments the iterator (postfix form)
+    MatConstIterator operator ++(int);
+    //! returns the current iterator position as a Point
+    Point pos() const;
+    //! returns the current iterator position through the _idx output array
+    void pos(int* _idx) const;
+
+    ptrdiff_t lpos() const;
+    void seek(ptrdiff_t ofs, bool relative = false);
+    void seek(const int* _idx, bool relative = false);
+
+    const Mat* m;
+    size_t elemSize;
+    const uchar* ptr;
+    const uchar* sliceStart;
+    const uchar* sliceEnd;
+};
+
+
+
+////////////////////////////////// MatConstIterator_ /////////////////////////////////
+
+/** @brief Matrix read-only iterator
+ */
+template<typename _Tp>
+class MatConstIterator_ : public MatConstIterator
+{
+public:
+    typedef _Tp value_type;
+    typedef ptrdiff_t difference_type;
+    typedef const _Tp* pointer;
+    typedef const _Tp& reference;
+
+    typedef std::random_access_iterator_tag iterator_category;
+
+    //! default constructor
+    MatConstIterator_();
+    //! constructor that sets the iterator to the beginning of the matrix
+    MatConstIterator_(const Mat_<_Tp>* _m);
+    //! constructor that sets the iterator to the specified element (row, column) of the matrix
+    MatConstIterator_(const Mat_<_Tp>* _m, int _row, int _col=0);
+    //! constructor that sets the iterator to the specified element (given as a Point) of the matrix
+    MatConstIterator_(const Mat_<_Tp>* _m, Point _pt);
+    //! constructor that sets the iterator to the specified element (given as an index array) of the matrix
+    MatConstIterator_(const Mat_<_Tp>* _m, const int* _idx);
+    //! copy constructor
+    MatConstIterator_(const MatConstIterator_& it);
+
+    //! copy assignment operator
+    MatConstIterator_& operator = (const MatConstIterator_& it);
+    //! returns the current matrix element
+    const _Tp& operator *() const;
+    //! returns the i-th matrix element, relative to the current
+    const _Tp& operator [](ptrdiff_t i) const;
+
+    //! shifts the iterator forward by the specified number of elements
+    MatConstIterator_& operator += (ptrdiff_t ofs);
+    //! shifts the iterator backward by the specified number of elements
+    MatConstIterator_& operator -= (ptrdiff_t ofs);
+    //! decrements the iterator (prefix form)
+    MatConstIterator_& operator --();
+    //! decrements the iterator (postfix form)
+    MatConstIterator_ operator --(int);
+    //! increments the iterator (prefix form)
+    MatConstIterator_& operator ++();
+    //! increments the iterator (postfix form)
+    MatConstIterator_ operator ++(int);
+    //! returns the current iterator position as a Point
+    Point pos() const;
+};
+
+
+
+//////////////////////////////////// MatIterator_ ////////////////////////////////////
+
+/** @brief Matrix read-write iterator
+*/
+template<typename _Tp>
+class MatIterator_ : public MatConstIterator_<_Tp>
+{
+public:
+    typedef _Tp* pointer;
+    typedef _Tp& reference;
+
+    typedef std::random_access_iterator_tag iterator_category;
+
+    //! the default constructor
+    MatIterator_();
+    //! constructor that sets the iterator to the beginning of the matrix
+    MatIterator_(Mat_<_Tp>* _m);
+    //! constructor that sets the iterator to the specified element (row, column) of the matrix
+    MatIterator_(Mat_<_Tp>* _m, int _row, int _col=0);
+    //! constructor that sets the iterator to the specified element (given as a Point) of the matrix
+    MatIterator_(Mat_<_Tp>* _m, Point _pt);
+    //! constructor that sets the iterator to the specified element (given as an index array) of the matrix
+    MatIterator_(Mat_<_Tp>* _m, const int* _idx);
+    //! copy constructor
+    MatIterator_(const MatIterator_& it);
+    //! copy assignment operator
+    MatIterator_& operator = (const MatIterator_<_Tp>& it );
+
+    //! returns a read-write reference to the current matrix element
+    _Tp& operator *() const;
+    //! returns a read-write reference to the i-th matrix element, relative to the current
+    _Tp& operator [](ptrdiff_t i) const;
+
+    //! shifts the iterator forward by the specified number of elements
+    MatIterator_& operator += (ptrdiff_t ofs);
+    //! shifts the iterator backward by the specified number of elements
+    MatIterator_& operator -= (ptrdiff_t ofs);
+    //! decrements the iterator (prefix form)
+    MatIterator_& operator --();
+    //! decrements the iterator (postfix form)
+    MatIterator_ operator --(int);
+    //! increments the iterator (prefix form)
+    MatIterator_& operator ++();
+    //! increments the iterator (postfix form)
+    MatIterator_ operator ++(int);
+};
+
+
+
+/////////////////////////////// SparseMatConstIterator ///////////////////////////////
+
+/**  @brief Read-Only Sparse Matrix Iterator.
+
+ Here is how to use the iterator to compute the sum of floating-point sparse matrix elements:
+
+ \code
+ SparseMatConstIterator it = m.begin(), it_end = m.end();
+ double s = 0;
+ CV_Assert( m.type() == CV_32F );
+ for( ; it != it_end; ++it )
+    s += it.value<float>();
+ \endcode
+*/
+class CV_EXPORTS SparseMatConstIterator
+{
+public:
+    //! the default constructor
+    SparseMatConstIterator();
+    //! the full constructor setting the iterator to the first sparse matrix element
+    SparseMatConstIterator(const SparseMat* _m);
+    //! the copy constructor
+    SparseMatConstIterator(const SparseMatConstIterator& it);
+
+    //! the assignment operator
+    SparseMatConstIterator& operator = (const SparseMatConstIterator& it);
+
+    //! template method returning the current matrix element
+    template<typename _Tp> const _Tp& value() const;
+    //! returns the current node of the sparse matrix. it.node()->idx is the current element index
+    const SparseMat::Node* node() const;
+
+    //! moves iterator to the previous element (prefix form)
+    SparseMatConstIterator& operator --();
+    //! moves iterator to the previous element (postfix form)
+    SparseMatConstIterator operator --(int);
+    //! moves iterator to the next element (prefix form)
+    SparseMatConstIterator& operator ++();
+    //! moves iterator to the next element (postfix form)
+    SparseMatConstIterator operator ++(int);
+
+    //! moves iterator to the element after the last element
+    void seekEnd();
+
+    const SparseMat* m;
+    size_t hashidx;
+    uchar* ptr;
+};
+
+
+
+////////////////////////////////// SparseMatIterator /////////////////////////////////
+
+/** @brief  Read-write Sparse Matrix Iterator
+
+ The class is similar to cv::SparseMatConstIterator,
+ but can be used for in-place modification of the matrix elements.
+*/
+class CV_EXPORTS SparseMatIterator : public SparseMatConstIterator
+{
+public:
+    //! the default constructor
+    SparseMatIterator();
+    //! the full constructor setting the iterator to the first sparse matrix element
+    SparseMatIterator(SparseMat* _m);
+    //! the full constructor setting the iterator to the specified sparse matrix element
+    SparseMatIterator(SparseMat* _m, const int* idx);
+    //! the copy constructor
+    SparseMatIterator(const SparseMatIterator& it);
+
+    //! the assignment operator
+    SparseMatIterator& operator = (const SparseMatIterator& it);
+    //! returns read-write reference to the current sparse matrix element
+    template<typename _Tp> _Tp& value() const;
+    //! returns pointer to the current sparse matrix node. it.node()->idx is the index of the current element (do not modify it!)
+    SparseMat::Node* node() const;
+
+    //! moves iterator to the next element (prefix form)
+    SparseMatIterator& operator ++();
+    //! moves iterator to the next element (postfix form)
+    SparseMatIterator operator ++(int);
+};
+
+
+
+/////////////////////////////// SparseMatConstIterator_ //////////////////////////////
+
+/** @brief  Template Read-Only Sparse Matrix Iterator Class.
+
+ This class is derived from SparseMatConstIterator and
+ introduces a more convenient operator *() for accessing the current element.
+*/
+template<typename _Tp> class SparseMatConstIterator_ : public SparseMatConstIterator
+{
+public:
+
+    typedef std::forward_iterator_tag iterator_category;
+
+    //! the default constructor
+    SparseMatConstIterator_();
+    //! the full constructors setting the iterator to the first sparse matrix element
+    SparseMatConstIterator_(const SparseMat_<_Tp>* _m);
+    SparseMatConstIterator_(const SparseMat* _m);
+    //! the copy constructor
+    SparseMatConstIterator_(const SparseMatConstIterator_& it);
+
+    //! the assignment operator
+    SparseMatConstIterator_& operator = (const SparseMatConstIterator_& it);
+    //! the element access operator
+    const _Tp& operator *() const;
+
+    //! moves iterator to the next element (prefix form)
+    SparseMatConstIterator_& operator ++();
+    //! moves iterator to the next element (postfix form)
+    SparseMatConstIterator_ operator ++(int);
+};
+
+
+
+///////////////////////////////// SparseMatIterator_ /////////////////////////////////
+
+/** @brief  Template Read-Write Sparse Matrix Iterator Class.
+
+ This class is derived from cv::SparseMatConstIterator_ and
+ introduces a more convenient operator *() for accessing the current element.
+*/
+template<typename _Tp> class SparseMatIterator_ : public SparseMatConstIterator_<_Tp>
+{
+public:
+
+    typedef std::forward_iterator_tag iterator_category;
+
+    //! the default constructor
+    SparseMatIterator_();
+    //! the full constructors setting the iterator to the first sparse matrix element
+    SparseMatIterator_(SparseMat_<_Tp>* _m);
+    SparseMatIterator_(SparseMat* _m);
+    //! the copy constructor
+    SparseMatIterator_(const SparseMatIterator_& it);
+
+    //! the assignment operator
+    SparseMatIterator_& operator = (const SparseMatIterator_& it);
+    //! returns the reference to the current element
+    _Tp& operator *() const;
+
+    //! moves the iterator to the next element (prefix form)
+    SparseMatIterator_& operator ++();
+    //! moves the iterator to the next element (postfix form)
+    SparseMatIterator_ operator ++(int);
+};
+
+
+
+/////////////////////////////////// NAryMatIterator //////////////////////////////////
+
+/** @brief n-ary multi-dimensional array iterator.
+
+Use the class to implement unary, binary, and, generally, n-ary element-wise operations on
+multi-dimensional arrays. Some of the arguments of an n-ary function may be continuous arrays, some
+may not be. It is possible to use conventional MatIterator 's for each array but incrementing all of
+the iterators after each small operation may incur a big overhead. In this case consider using
+NAryMatIterator to iterate through several matrices simultaneously as long as they have the same
+geometry (dimensionality and all the dimension sizes are the same). On each iteration `it.planes[0]`,
+`it.planes[1]`,... will be the slices of the corresponding matrices.
+
+The example below illustrates how you can compute a normalized and thresholded 3D color histogram:
+@code
+    void computeNormalizedColorHist(const Mat& image, Mat& hist, int N, double minProb)
+    {
+        const int histSize[] = {N, N, N};
+
+        // make sure that the histogram has a proper size and type
+        hist.create(3, histSize, CV_32F);
+
+        // and clear it
+        hist = Scalar(0);
+
+        // the loop below assumes that the image
+        // is an 8-bit 3-channel one. check it.
+        CV_Assert(image.type() == CV_8UC3);
+        MatConstIterator_<Vec3b> it = image.begin<Vec3b>(),
+                                 it_end = image.end<Vec3b>();
+        for( ; it != it_end; ++it )
+        {
+            const Vec3b& pix = *it;
+            hist.at<float>(pix[0]*N/256, pix[1]*N/256, pix[2]*N/256) += 1.f;
+        }
+
+        minProb *= image.rows*image.cols;
+
+        // initialize iterator (the style is different from STL).
+        // after initialization the iterator will contain
+        // the number of slices or planes the iterator will go through.
+        // it simultaneously increments iterators for several matrices
+        // supplied as a null-terminated list of pointers
+        const Mat* arrays[] = {&hist, 0};
+        Mat planes[1];
+        NAryMatIterator itNAry(arrays, planes, 1);
+        double s = 0;
+        // iterate through the matrix. on each iteration
+        // itNAry.planes[i] (of type Mat) will be set to the current plane
+        // of the i-th n-dim matrix passed to the iterator constructor.
+        for(size_t p = 0; p < itNAry.nplanes; p++, ++itNAry)
+        {
+            threshold(itNAry.planes[0], itNAry.planes[0], minProb, 0, THRESH_TOZERO);
+            s += sum(itNAry.planes[0])[0];
+        }
+
+        s = 1./s;
+        itNAry = NAryMatIterator(arrays, planes, 1);
+        for(size_t p = 0; p < itNAry.nplanes; p++, ++itNAry)
+            itNAry.planes[0] *= s;
+    }
+@endcode
+ */
+class CV_EXPORTS NAryMatIterator
+{
+public:
+    //! the default constructor
+    NAryMatIterator();
+    //! the full constructor taking an arbitrary number of n-dim matrices and an array of data pointers
+    NAryMatIterator(const Mat** arrays, uchar** ptrs, int narrays=-1);
+    //! the full constructor taking an arbitrary number of n-dim matrices and an array of planes
+    NAryMatIterator(const Mat** arrays, Mat* planes, int narrays=-1);
+    //! the separate iterator initialization method (accepts both the planes and the data pointers)
+    void init(const Mat** arrays, Mat* planes, uchar** ptrs, int narrays=-1);
+
+    //! proceeds to the next plane of every iterated matrix (prefix increment operator)
+    NAryMatIterator& operator ++();
+    //! proceeds to the next plane of every iterated matrix (postfix increment operator)
+    NAryMatIterator operator ++(int);
+
+    //! the iterated arrays
+    const Mat** arrays;
+    //! the current planes
+    Mat* planes;
+    //! data pointers
+    uchar** ptrs;
+    //! the number of arrays
+    int narrays;
+    //! the number of hyper-planes that the iterator steps through (unsigned: use a size_t loop counter)
+    size_t nplanes;
+    //! the size of each segment (in elements)
+    size_t size;
+protected:
+    int iterdepth;
+    size_t idx;
+};
+
+
+
+///////////////////////////////// Matrix Expressions /////////////////////////////////
+
+class CV_EXPORTS MatOp // polymorphic operation interface referenced by MatExpr::op; only assign() is pure virtual
+{
+public:
+    MatOp();
+    virtual ~MatOp();
+
+    virtual bool elementWise(const MatExpr& expr) const;
+    virtual void assign(const MatExpr& expr, Mat& m, int type=-1) const = 0; // the one member every subclass must implement
+    virtual void roi(const MatExpr& expr, const Range& rowRange,
+                     const Range& colRange, MatExpr& res) const;
+    virtual void diag(const MatExpr& expr, int d, MatExpr& res) const;
+    virtual void augAssignAdd(const MatExpr& expr, Mat& m) const;
+    virtual void augAssignSubtract(const MatExpr& expr, Mat& m) const;
+    virtual void augAssignMultiply(const MatExpr& expr, Mat& m) const;
+    virtual void augAssignDivide(const MatExpr& expr, Mat& m) const;
+    virtual void augAssignAnd(const MatExpr& expr, Mat& m) const;
+    virtual void augAssignOr(const MatExpr& expr, Mat& m) const;
+    virtual void augAssignXor(const MatExpr& expr, Mat& m) const;
+
+    virtual void add(const MatExpr& expr1, const MatExpr& expr2, MatExpr& res) const;
+    virtual void add(const MatExpr& expr1, const Scalar& s, MatExpr& res) const;
+
+    virtual void subtract(const MatExpr& expr1, const MatExpr& expr2, MatExpr& res) const;
+    virtual void subtract(const Scalar& s, const MatExpr& expr, MatExpr& res) const; // NOTE(review): scalar comes first, presumably s - expr; confirm
+
+    virtual void multiply(const MatExpr& expr1, const MatExpr& expr2, MatExpr& res, double scale=1) const;
+    virtual void multiply(const MatExpr& expr1, double s, MatExpr& res) const;
+
+    virtual void divide(const MatExpr& expr1, const MatExpr& expr2, MatExpr& res, double scale=1) const;
+    virtual void divide(double s, const MatExpr& expr, MatExpr& res) const; // NOTE(review): scalar comes first, presumably s / expr; confirm
+
+    virtual void abs(const MatExpr& expr, MatExpr& res) const;
+
+    virtual void transpose(const MatExpr& expr, MatExpr& res) const;
+    virtual void matmul(const MatExpr& expr1, const MatExpr& expr2, MatExpr& res) const;
+    virtual void invert(const MatExpr& expr, int method, MatExpr& res) const;
+
+    virtual Size size(const MatExpr& expr) const;
+    virtual int type(const MatExpr& expr) const;
+};
+
+/** @brief Matrix expression representation
+@anchor MatrixExpressions
+This is a list of implemented matrix operations that can be combined in arbitrarily complex
+expressions (here A, B stand for matrices ( Mat ), s for a scalar ( Scalar ), alpha for a
+real-valued scalar ( double )):
+-   Addition, subtraction, negation: `A+B`, `A-B`, `A+s`, `A-s`, `s+A`, `s-A`, `-A`
+-   Scaling: `A*alpha`
+-   Per-element multiplication and division: `A.mul(B)`, `A/B`, `alpha/A`
+-   Matrix multiplication: `A*B`
+-   Transposition: `A.t()` (means A<sup>T</sup>)
+-   Matrix inversion and pseudo-inversion, solving linear systems and least-squares problems:
+    `A.inv([method]) (~ A<sup>-1</sup>)`,   `A.inv([method])*B (~ X: AX=B)`
+-   Comparison: `A cmpop B`, `A cmpop alpha`, `alpha cmpop A`, where *cmpop* is one of
+    `>`, `>=`, `==`, `!=`, `<=`, `<`. The result of comparison is an 8-bit single channel mask whose
+    elements are set to 255 (if the particular element or pair of elements satisfy the condition) or
+    0.
+-   Bitwise logical operations: `A logicop B`, `A logicop s`, `s logicop A`, `~A`, where *logicop* is one of
+    `&`, `|`, `^`.
+-   Element-wise minimum and maximum: `min(A, B)`, `min(A, alpha)`, `max(A, B)`, `max(A, alpha)`
+-   Element-wise absolute value: `abs(A)`
+-   Cross-product, dot-product: `A.cross(B)`, `A.dot(B)`
+-   Any function of matrix or matrices and scalars that returns a matrix or a scalar, such as norm,
+    mean, sum, countNonZero, trace, determinant, repeat, and others.
+-   Matrix initializers ( Mat::eye(), Mat::zeros(), Mat::ones() ), matrix comma-separated
+    initializers, matrix constructors and operators that extract sub-matrices (see Mat description).
+-   Mat_<destination_type>() constructors to cast the result to the proper type.
+@note Comma-separated initializers and probably some other operations may require additional
+explicit Mat() or Mat_<T>() constructor calls to resolve a possible ambiguity.
+
+Here are some examples of matrix expressions:
+@code
+    // compute pseudo-inverse of A, equivalent to A.inv(DECOMP_SVD)
+    SVD svd(A);
+    Mat pinvA = svd.vt.t()*Mat::diag(1./svd.w)*svd.u.t();
+
+    // compute the new vector of parameters in the Levenberg-Marquardt algorithm
+    x -= (A.t()*A + lambda*Mat::eye(A.cols,A.cols,A.type())).inv(DECOMP_CHOLESKY)*(A.t()*err);
+
+    // sharpen image using "unsharp mask" algorithm
+    Mat blurred; double sigma = 1, threshold = 5, amount = 1;
+    GaussianBlur(img, blurred, Size(), sigma, sigma);
+    Mat lowContrastMask = abs(img - blurred) < threshold;
+    Mat sharpened = img*(1+amount) + blurred*(-amount);
+    img.copyTo(sharpened, lowContrastMask);
+@endcode
+*/
+class CV_EXPORTS MatExpr
+{
+public:
+    MatExpr();
+    explicit MatExpr(const Mat& m);
+
+    MatExpr(const MatOp* _op, int _flags, const Mat& _a = Mat(), const Mat& _b = Mat(),
+            const Mat& _c = Mat(), double _alpha = 1, double _beta = 1, const Scalar& _s = Scalar());
+
+    operator Mat() const;
+    template<typename _Tp> operator Mat_<_Tp>() const;
+
+    Size size() const;
+    int type() const;
+
+    MatExpr row(int y) const;
+    MatExpr col(int x) const;
+    MatExpr diag(int d = 0) const;
+    MatExpr operator()( const Range& rowRange, const Range& colRange ) const;
+    MatExpr operator()( const Rect& roi ) const;
+
+    MatExpr t() const;
+    MatExpr inv(int method = DECOMP_LU) const;
+    MatExpr mul(const MatExpr& e, double scale=1) const;
+    MatExpr mul(const Mat& m, double scale=1) const;
+
+    Mat cross(const Mat& m) const;
+    double dot(const Mat& m) const;
+
+    void swap(MatExpr& b);
+
+    const MatOp* op;
+    int flags;
+
+    Mat a, b, c;
+    double alpha, beta;
+    Scalar s;
+};
+
+//! @} core_basic
+
+//! @relates cv::MatExpr
+//! @{
+CV_EXPORTS MatExpr operator + (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator + (const Mat& a, const Scalar& s);
+CV_EXPORTS MatExpr operator + (const Scalar& s, const Mat& a);
+CV_EXPORTS MatExpr operator + (const MatExpr& e, const Mat& m);
+CV_EXPORTS MatExpr operator + (const Mat& m, const MatExpr& e);
+CV_EXPORTS MatExpr operator + (const MatExpr& e, const Scalar& s);
+CV_EXPORTS MatExpr operator + (const Scalar& s, const MatExpr& e);
+CV_EXPORTS MatExpr operator + (const MatExpr& e1, const MatExpr& e2);
+template<typename _Tp, int m, int n> static inline // Matx overloads in this group wrap the Matx operand in a Mat and reuse the Mat/Mat operator
+MatExpr operator + (const Mat& a, const Matx<_Tp, m, n>& b) { return a + Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator + (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) + b; }
+
+CV_EXPORTS MatExpr operator - (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator - (const Mat& a, const Scalar& s);
+CV_EXPORTS MatExpr operator - (const Scalar& s, const Mat& a);
+CV_EXPORTS MatExpr operator - (const MatExpr& e, const Mat& m);
+CV_EXPORTS MatExpr operator - (const Mat& m, const MatExpr& e);
+CV_EXPORTS MatExpr operator - (const MatExpr& e, const Scalar& s);
+CV_EXPORTS MatExpr operator - (const Scalar& s, const MatExpr& e);
+CV_EXPORTS MatExpr operator - (const MatExpr& e1, const MatExpr& e2);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator - (const Mat& a, const Matx<_Tp, m, n>& b) { return a - Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator - (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) - b; }
+
+CV_EXPORTS MatExpr operator - (const Mat& m); // unary minus
+CV_EXPORTS MatExpr operator - (const MatExpr& e); // unary minus
+
+CV_EXPORTS MatExpr operator * (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator * (const Mat& a, double s);
+CV_EXPORTS MatExpr operator * (double s, const Mat& a);
+CV_EXPORTS MatExpr operator * (const MatExpr& e, const Mat& m);
+CV_EXPORTS MatExpr operator * (const Mat& m, const MatExpr& e);
+CV_EXPORTS MatExpr operator * (const MatExpr& e, double s);
+CV_EXPORTS MatExpr operator * (double s, const MatExpr& e);
+CV_EXPORTS MatExpr operator * (const MatExpr& e1, const MatExpr& e2);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator * (const Mat& a, const Matx<_Tp, m, n>& b) { return a * Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator * (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) * b; }
+
+CV_EXPORTS MatExpr operator / (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator / (const Mat& a, double s);
+CV_EXPORTS MatExpr operator / (double s, const Mat& a);
+CV_EXPORTS MatExpr operator / (const MatExpr& e, const Mat& m);
+CV_EXPORTS MatExpr operator / (const Mat& m, const MatExpr& e);
+CV_EXPORTS MatExpr operator / (const MatExpr& e, double s);
+CV_EXPORTS MatExpr operator / (double s, const MatExpr& e);
+CV_EXPORTS MatExpr operator / (const MatExpr& e1, const MatExpr& e2);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator / (const Mat& a, const Matx<_Tp, m, n>& b) { return a / Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator / (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) / b; }
+
+CV_EXPORTS MatExpr operator < (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator < (const Mat& a, double s);
+CV_EXPORTS MatExpr operator < (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline // Matx overloads in this group wrap the Matx operand in a Mat and reuse the Mat/Mat comparison
+MatExpr operator < (const Mat& a, const Matx<_Tp, m, n>& b) { return a < Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator < (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) < b; }
+
+CV_EXPORTS MatExpr operator <= (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator <= (const Mat& a, double s);
+CV_EXPORTS MatExpr operator <= (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator <= (const Mat& a, const Matx<_Tp, m, n>& b) { return a <= Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator <= (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) <= b; }
+
+CV_EXPORTS MatExpr operator == (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator == (const Mat& a, double s);
+CV_EXPORTS MatExpr operator == (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator == (const Mat& a, const Matx<_Tp, m, n>& b) { return a == Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator == (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) == b; }
+
+CV_EXPORTS MatExpr operator != (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator != (const Mat& a, double s);
+CV_EXPORTS MatExpr operator != (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator != (const Mat& a, const Matx<_Tp, m, n>& b) { return a != Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator != (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) != b; }
+
+CV_EXPORTS MatExpr operator >= (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator >= (const Mat& a, double s);
+CV_EXPORTS MatExpr operator >= (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator >= (const Mat& a, const Matx<_Tp, m, n>& b) { return a >= Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator >= (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) >= b; }
+
+CV_EXPORTS MatExpr operator > (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator > (const Mat& a, double s);
+CV_EXPORTS MatExpr operator > (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator > (const Mat& a, const Matx<_Tp, m, n>& b) { return a > Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator > (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) > b; }
+
+CV_EXPORTS MatExpr operator & (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator & (const Mat& a, const Scalar& s);
+CV_EXPORTS MatExpr operator & (const Scalar& s, const Mat& a);
+template<typename _Tp, int m, int n> static inline // Matx overloads in this group wrap the Matx operand in a Mat and reuse the Mat/Mat operator
+MatExpr operator & (const Mat& a, const Matx<_Tp, m, n>& b) { return a & Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator & (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) & b; }
+
+CV_EXPORTS MatExpr operator | (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator | (const Mat& a, const Scalar& s);
+CV_EXPORTS MatExpr operator | (const Scalar& s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator | (const Mat& a, const Matx<_Tp, m, n>& b) { return a | Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator | (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) | b; }
+
+CV_EXPORTS MatExpr operator ^ (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator ^ (const Mat& a, const Scalar& s);
+CV_EXPORTS MatExpr operator ^ (const Scalar& s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator ^ (const Mat& a, const Matx<_Tp, m, n>& b) { return a ^ Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator ^ (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) ^ b; }
+
+CV_EXPORTS MatExpr operator ~(const Mat& m); // unary; no Matx or MatExpr overload is declared here
+
+CV_EXPORTS MatExpr min(const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr min(const Mat& a, double s);
+CV_EXPORTS MatExpr min(double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline // Matx overloads wrap the Matx operand in a Mat and call the Mat/Mat overload
+MatExpr min (const Mat& a, const Matx<_Tp, m, n>& b) { return min(a, Mat(b)); }
+template<typename _Tp, int m, int n> static inline
+MatExpr min (const Matx<_Tp, m, n>& a, const Mat& b) { return min(Mat(a), b); }
+
+CV_EXPORTS MatExpr max(const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr max(const Mat& a, double s);
+CV_EXPORTS MatExpr max(double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline // Matx overloads wrap the Matx operand in a Mat and call the Mat/Mat overload
+MatExpr max (const Mat& a, const Matx<_Tp, m, n>& b) { return max(a, Mat(b)); }
+template<typename _Tp, int m, int n> static inline
+MatExpr max (const Matx<_Tp, m, n>& a, const Mat& b) { return max(Mat(a), b); }
+
+/** @brief Calculates the absolute value of each matrix element.
+
+abs is a meta-function that is expanded to one of the absdiff or convertScaleAbs forms:
+- C = abs(A-B) is equivalent to `absdiff(A, B, C)`
+- C = abs(A) is equivalent to `absdiff(A, Scalar::all(0), C)`
+- C = `Mat_<Vec<uchar,n> >(abs(A*alpha + beta))` is equivalent to `convertScaleAbs(A, C, alpha,
+beta)`
+
+The output matrix has the same size and the same type as the input one except for the last case,
+where C has depth CV_8U.
+@param m matrix.
+@sa @ref MatrixExpressions, absdiff, convertScaleAbs
+ */
+CV_EXPORTS MatExpr abs(const Mat& m);
+/** @overload
+@param e matrix expression.
+*/
+CV_EXPORTS MatExpr abs(const MatExpr& e);
+//! @} relates cv::MatExpr
+
+} // cv
+
+#include "opencv2/core/mat.inl.hpp"
+
+#endif // OPENCV_CORE_MAT_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/mat.inl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/mat.inl.hpp
new file mode 100644
index 000000000000..f0eed783a595
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/mat.inl.hpp
@@ -0,0 +1,3422 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_MATRIX_OPERATIONS_HPP
+#define OPENCV_CORE_MATRIX_OPERATIONS_HPP
+
+#ifndef __cplusplus
+#  error mat.inl.hpp header must be compiled as C++
+#endif
+
+#ifdef _MSC_VER
+#pragma warning( push )
+#pragma warning( disable: 4127 5054 )
+#endif
+
+#if defined(CV_SKIP_DISABLE_CLANG_ENUM_WARNINGS)
+  // nothing
+#elif defined(CV_FORCE_DISABLE_CLANG_ENUM_WARNINGS)
+  #define CV_DISABLE_CLANG_ENUM_WARNINGS
+#elif defined(__clang__) && defined(__has_warning)
+  #if __has_warning("-Wdeprecated-enum-enum-conversion") && __has_warning("-Wdeprecated-anon-enum-enum-conversion")
+    #define CV_DISABLE_CLANG_ENUM_WARNINGS
+  #endif
+#endif
+#ifdef CV_DISABLE_CLANG_ENUM_WARNINGS
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-enum-enum-conversion"
+#pragma clang diagnostic ignored "-Wdeprecated-anon-enum-enum-conversion"
+#endif
+
+namespace cv
+{
+CV__DEBUG_NS_BEGIN
+
+
+//! @cond IGNORED
+
+////////////////////////// Custom (raw) type wrapper //////////////////////////
+
+template<typename _Tp> static inline
+int rawType()
+{
+    // A raw element is described as sizeof(_Tp) channels of CV_8U, so its byte size must fit CV_CN_MAX.
+    CV_StaticAssert(sizeof(_Tp) <= CV_CN_MAX, "sizeof(_Tp) is too large");
+    return static_cast<int>(CV_MAKETYPE(CV_8U, static_cast<int>(sizeof(_Tp))));
+}
+
+//////////////////////// Input/Output Arrays ////////////////////////
+
+inline void _InputArray::init(int _flags, const void* _obj) // records flags and the (de-const'ed) object address; sz is not touched
+{ obj = const_cast<void*>(_obj); flags = _flags; }
+
+inline void _InputArray::init(int _flags, const void* _obj, Size _sz) // same, and also records the size
+{ sz = _sz; obj = const_cast<void*>(_obj); flags = _flags; }
+
+inline void* _InputArray::getObj() const { return obj; }
+inline int _InputArray::getFlags() const { return flags; }
+inline Size _InputArray::getSz() const { return sz; }
+
+inline _InputArray::_InputArray() { init(0 + NONE, 0); } // empty wrapper: kind NONE, null object pointer
+inline _InputArray::_InputArray(int _flags, void* _obj) { init(_flags, _obj); } // caller supplies the flags verbatim
+inline _InputArray::_InputArray(const Mat& m) { init(MAT+ACCESS_READ, &m); } // only the address of m is recorded
+inline _InputArray::_InputArray(const std::vector<Mat>& vec) { init(STD_VECTOR_MAT+ACCESS_READ, &vec); }
+inline _InputArray::_InputArray(const UMat& m) { init(UMAT+ACCESS_READ, &m); }
+inline _InputArray::_InputArray(const std::vector<UMat>& vec) { init(STD_VECTOR_UMAT+ACCESS_READ, &vec); }
+
+template<typename _Tp> inline
+_InputArray::_InputArray(const std::vector<_Tp>& vec)
+{ init(FIXED_TYPE + STD_VECTOR + traits::Type<_Tp>::value + ACCESS_READ, &vec); }
+
+template<typename _Tp, std::size_t _Nm> inline // std::array is described as a fixed-size MATX over arr.data()
+_InputArray::_InputArray(const std::array<_Tp, _Nm>& arr)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_READ, arr.data(), Size(1, _Nm)); }
+
+template<std::size_t _Nm> inline
+_InputArray::_InputArray(const std::array<Mat, _Nm>& arr)
+{ init(STD_ARRAY_MAT + ACCESS_READ, arr.data(), Size(1, _Nm)); }
+
+inline
+_InputArray::_InputArray(const std::vector<bool>& vec)
+{ init(FIXED_TYPE + STD_BOOL_VECTOR + traits::Type<bool>::value + ACCESS_READ, &vec); }
+
+template<typename _Tp> inline
+_InputArray::_InputArray(const std::vector<std::vector<_Tp> >& vec)
+{ init(FIXED_TYPE + STD_VECTOR_VECTOR + traits::Type<_Tp>::value + ACCESS_READ, &vec); }
+
+template<typename _Tp> inline
+_InputArray::_InputArray(const std::vector<Mat_<_Tp> >& vec)
+{ init(FIXED_TYPE + STD_VECTOR_MAT + traits::Type<_Tp>::value + ACCESS_READ, &vec); }
+
+template<typename _Tp, int m, int n> inline
+_InputArray::_InputArray(const Matx<_Tp, m, n>& mtx)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_READ, &mtx, Size(n, m)); }
+
+template<typename _Tp> inline // raw buffer of n elements, described as a fixed-size MATX with Size(n, 1)
+_InputArray::_InputArray(const _Tp* vec, int n)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_READ, vec, Size(n, 1)); }
+
+template<typename _Tp> inline
+_InputArray::_InputArray(const Mat_<_Tp>& m)
+{ init(FIXED_TYPE + MAT + traits::Type<_Tp>::value + ACCESS_READ, &m); }
+
+inline _InputArray::_InputArray(const double& val) // a single double is described as a CV_64F MATX with Size(1,1)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + CV_64F + ACCESS_READ, &val, Size(1,1)); }
+
+inline _InputArray::_InputArray(const cuda::GpuMat& d_mat)
+{ init(CUDA_GPU_MAT + ACCESS_READ, &d_mat); }
+
+inline _InputArray::_InputArray(const std::vector<cuda::GpuMat>& d_mat)
+{	init(STD_VECTOR_CUDA_GPU_MAT + ACCESS_READ, &d_mat);}
+
+inline _InputArray::_InputArray(const ogl::Buffer& buf)
+{ init(OPENGL_BUFFER + ACCESS_READ, &buf); }
+
+inline _InputArray::_InputArray(const cuda::HostMem& cuda_mem)
+{ init(CUDA_HOST_MEM + ACCESS_READ, &cuda_mem); }
+
+template<typename _Tp> inline
+_InputArray _InputArray::rawIn(const std::vector<_Tp>& vec)
+{
+    // The element type is described through rawType<_Tp>() (byte size only), not traits::Type<_Tp>.
+    _InputArray raw;
+    raw.init(_InputArray::FIXED_TYPE + _InputArray::STD_VECTOR + rawType<_Tp>() + ACCESS_READ, &vec);
+    return raw;
+}
+
+template<typename _Tp, std::size_t _Nm> inline
+_InputArray _InputArray::rawIn(const std::array<_Tp, _Nm>& arr)
+{
+    // The array is described as a fixed-size MATX over arr.data().
+    // NOTE(review): unlike the vector overload this uses traits::Type<_Tp>, not rawType<_Tp>() - confirm intended.
+    _InputArray raw;
+    raw.init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_READ, arr.data(), Size(1, _Nm));
+    return raw;
+}
+
+inline _InputArray::~_InputArray() {} // empty: the destructor releases nothing
+
+inline Mat _InputArray::getMat(int i) const
+{
+    if( kind() != MAT || i >= 0 )
+        return getMat_(i); // general path: any other kind, or a non-negative index was requested
+    return *static_cast<const Mat*>(obj); // kind is MAT and i < 0: return the Mat that obj points to
+}
+
+inline bool _InputArray::isMat() const { return kind() == MAT; }
+inline bool _InputArray::isUMat() const { return kind() == UMAT; }
+inline bool _InputArray::isMatVector() const { return kind() == STD_VECTOR_MAT; }
+inline bool _InputArray::isUMatVector() const { return kind() == STD_VECTOR_UMAT; }
+inline bool _InputArray::isMatx() const { return kind() == MATX; }
+inline bool _InputArray::isVector() const
+{ const auto k = kind(); // STD_VECTOR and STD_BOOL_VECTOR always qualify; a MATX only when width or height is <= 1
+  return k == STD_VECTOR || k == STD_BOOL_VECTOR || (k == MATX && (sz.width <= 1 || sz.height <= 1)); }
+inline bool _InputArray::isGpuMat() const { return kind() == CUDA_GPU_MAT; }
+inline bool _InputArray::isGpuMatVector() const { return kind() == STD_VECTOR_CUDA_GPU_MAT; }
+
+////////////////////////////////////////////////////////////////////////////////////////
+
+inline _OutputArray::_OutputArray() { init(NONE + ACCESS_WRITE, 0); }
+inline _OutputArray::_OutputArray(int _flags, void* _obj) { init(_flags + ACCESS_WRITE, _obj); }
+inline _OutputArray::_OutputArray(Mat& m) { init(MAT+ACCESS_WRITE, &m); }
+inline _OutputArray::_OutputArray(std::vector<Mat>& vec) { init(STD_VECTOR_MAT + ACCESS_WRITE, &vec); }
+inline _OutputArray::_OutputArray(UMat& m) { init(UMAT + ACCESS_WRITE, &m); }
+inline _OutputArray::_OutputArray(std::vector<UMat>& vec) { init(STD_VECTOR_UMAT + ACCESS_WRITE, &vec); }
+
+template<typename _Tp> inline
+_OutputArray::_OutputArray(std::vector<_Tp>& vec)
+{ init(FIXED_TYPE + STD_VECTOR + traits::Type<_Tp>::value + ACCESS_WRITE, &vec); }
+
+template<typename _Tp, std::size_t _Nm> inline
+_OutputArray::_OutputArray(std::array<_Tp, _Nm>& arr)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
+
+template<std::size_t _Nm> inline
+_OutputArray::_OutputArray(std::array<Mat, _Nm>& arr)
+{ init(STD_ARRAY_MAT + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
+
+template<typename _Tp> inline
+_OutputArray::_OutputArray(std::vector<std::vector<_Tp> >& vec)
+{ init(FIXED_TYPE + STD_VECTOR_VECTOR + traits::Type<_Tp>::value + ACCESS_WRITE, &vec); }
+
+template<typename _Tp> inline
+_OutputArray::_OutputArray(std::vector<Mat_<_Tp> >& vec)
+{ init(FIXED_TYPE + STD_VECTOR_MAT + traits::Type<_Tp>::value + ACCESS_WRITE, &vec); }
+
+template<typename _Tp> inline
+_OutputArray::_OutputArray(Mat_<_Tp>& m)
+{ init(FIXED_TYPE + MAT + traits::Type<_Tp>::value + ACCESS_WRITE, &m); }
+
+template<typename _Tp, int m, int n> inline
+_OutputArray::_OutputArray(Matx<_Tp, m, n>& mtx)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, &mtx, Size(n, m)); }
+
+template<typename _Tp> inline
+_OutputArray::_OutputArray(_Tp* vec, int n)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, vec, Size(n, 1)); }
+
+template<typename _Tp> inline
+_OutputArray::_OutputArray(const std::vector<_Tp>& vec)
+{ init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR + traits::Type<_Tp>::value + ACCESS_WRITE, &vec); }
+
+template<typename _Tp, std::size_t _Nm> inline
+_OutputArray::_OutputArray(const std::array<_Tp, _Nm>& arr)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
+
+template<std::size_t _Nm> inline
+_OutputArray::_OutputArray(const std::array<Mat, _Nm>& arr)
+{ init(FIXED_SIZE + STD_ARRAY_MAT + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
+
+template<typename _Tp> inline
+_OutputArray::_OutputArray(const std::vector<std::vector<_Tp> >& vec)
+{ init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR_VECTOR + traits::Type<_Tp>::value + ACCESS_WRITE, &vec); }
+
+template<typename _Tp> inline
+_OutputArray::_OutputArray(const std::vector<Mat_<_Tp> >& vec)
+{ init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR_MAT + traits::Type<_Tp>::value + ACCESS_WRITE, &vec); }
+
+template<typename _Tp> inline
+_OutputArray::_OutputArray(const Mat_<_Tp>& m)
+{ init(FIXED_TYPE + FIXED_SIZE + MAT + traits::Type<_Tp>::value + ACCESS_WRITE, &m); }
+
+template<typename _Tp, int m, int n> inline
+_OutputArray::_OutputArray(const Matx<_Tp, m, n>& mtx)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, &mtx, Size(n, m)); }
+
+template<typename _Tp> inline
+_OutputArray::_OutputArray(const _Tp* vec, int n)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, vec, Size(n, 1)); }
+
+inline _OutputArray::_OutputArray(cuda::GpuMat& d_mat)
+{ init(CUDA_GPU_MAT + ACCESS_WRITE, &d_mat); }
+
+inline _OutputArray::_OutputArray(std::vector<cuda::GpuMat>& d_mat)
+{	init(STD_VECTOR_CUDA_GPU_MAT + ACCESS_WRITE, &d_mat);}
+
+// Mutable ogl::Buffer: OPENGL_BUFFER kind with write access.
+inline _OutputArray::_OutputArray(ogl::Buffer& buf) { init(OPENGL_BUFFER + ACCESS_WRITE, &buf); }
+
+// Mutable cuda::HostMem: CUDA_HOST_MEM kind with write access.
+inline _OutputArray::_OutputArray(cuda::HostMem& cuda_mem) { init(CUDA_HOST_MEM + ACCESS_WRITE, &cuda_mem); }
+
+// Const Mat: MAT kind flagged FIXED_TYPE + FIXED_SIZE.
+inline _OutputArray::_OutputArray(const Mat& m) { init(FIXED_TYPE + FIXED_SIZE + MAT + ACCESS_WRITE, &m); }
+
+// Const vector<Mat>: STD_VECTOR_MAT kind flagged FIXED_SIZE only.
+inline _OutputArray::_OutputArray(const std::vector<Mat>& vec) { init(FIXED_SIZE + STD_VECTOR_MAT + ACCESS_WRITE, &vec); }
+
+// Const UMat: UMAT kind flagged FIXED_TYPE + FIXED_SIZE.
+inline _OutputArray::_OutputArray(const UMat& m) { init(FIXED_TYPE + FIXED_SIZE + UMAT + ACCESS_WRITE, &m); }
+
+// Const vector<UMat>: STD_VECTOR_UMAT kind flagged FIXED_SIZE only.
+inline _OutputArray::_OutputArray(const std::vector<UMat>& vec) { init(FIXED_SIZE + STD_VECTOR_UMAT + ACCESS_WRITE, &vec); }
+
+// Const cuda::GpuMat: CUDA_GPU_MAT kind flagged FIXED_TYPE + FIXED_SIZE.
+inline _OutputArray::_OutputArray(const cuda::GpuMat& d_mat) { init(FIXED_TYPE + FIXED_SIZE + CUDA_GPU_MAT + ACCESS_WRITE, &d_mat); }
+
+
+// Const ogl::Buffer: OPENGL_BUFFER kind flagged FIXED_TYPE + FIXED_SIZE.
+inline _OutputArray::_OutputArray(const ogl::Buffer& buf) { init(FIXED_TYPE + FIXED_SIZE + OPENGL_BUFFER + ACCESS_WRITE, &buf); }
+
+// Const cuda::HostMem: CUDA_HOST_MEM kind flagged FIXED_TYPE + FIXED_SIZE.
+inline _OutputArray::_OutputArray(const cuda::HostMem& cuda_mem) { init(FIXED_TYPE + FIXED_SIZE + CUDA_HOST_MEM + ACCESS_WRITE, &cuda_mem); }
+
+// Builds an output wrapper for vec by filling flags/obj directly; the element type code comes
+// from rawType<_Tp>() and the kind is FIXED_TYPE + STD_VECTOR with write access.
+template<typename _Tp> inline _OutputArray _OutputArray::rawOut(std::vector<_Tp>& vec) {
+    _OutputArray wrapped;
+    wrapped.obj = (void*)&vec;
+    wrapped.flags = _InputArray::FIXED_TYPE + _InputArray::STD_VECTOR + rawType<_Tp>() + ACCESS_WRITE;
+    return wrapped;
+}
+
+// Output wrapper over arr.data(): fixed-type, fixed-size MATX kind with sz = Size(1, _Nm).
+template<typename _Tp, std::size_t _Nm> inline _OutputArray _OutputArray::rawOut(std::array<_Tp, _Nm>& arr)
+{
+    _OutputArray wrapped;
+    wrapped.sz = Size(1, _Nm);
+    wrapped.obj = (void*)arr.data();
+    wrapped.flags = FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE;
+    return wrapped;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////
+
+inline _InputOutputArray::_InputOutputArray() { init(0+ACCESS_RW, 0); } // empty wrapper: only ACCESS_RW set, null object
+inline _InputOutputArray::_InputOutputArray(int _flags, void* _obj) { init(_flags+ACCESS_RW, _obj); } // caller-supplied flags plus ACCESS_RW
+inline _InputOutputArray::_InputOutputArray(Mat& m) { init(MAT+ACCESS_RW, &m); } // mutable Mat: MAT kind, no FIXED_* flags
+inline _InputOutputArray::_InputOutputArray(std::vector<Mat>& vec) { init(STD_VECTOR_MAT+ACCESS_RW, &vec); } // mutable vector<Mat>: STD_VECTOR_MAT kind
+inline _InputOutputArray::_InputOutputArray(UMat& m) { init(UMAT+ACCESS_RW, &m); } // mutable UMat: UMAT kind, no FIXED_* flags
+inline _InputOutputArray::_InputOutputArray(std::vector<UMat>& vec) { init(STD_VECTOR_UMAT+ACCESS_RW, &vec); } // mutable vector<UMat>: STD_VECTOR_UMAT kind
+
+// Mutable std::vector: STD_VECTOR kind flagged FIXED_TYPE (no FIXED_SIZE) with read/write access.
+template<typename _Tp> inline _InputOutputArray::_InputOutputArray(std::vector<_Tp>& vec)
+{ init(FIXED_TYPE + STD_VECTOR + traits::Type<_Tp>::value + ACCESS_RW, &vec); }
+
+// Mutable std::array: exposes arr.data() as a fixed MATX kind with extent Size(1, _Nm).
+template<typename _Tp, std::size_t _Nm> inline _InputOutputArray::_InputOutputArray(std::array<_Tp, _Nm>& arr)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, arr.data(), Size(1, _Nm)); }
+
+// Mutable std::array<Mat>: STD_ARRAY_MAT kind without FIXED_* flags, extent Size(1, _Nm).
+template<std::size_t _Nm> inline _InputOutputArray::_InputOutputArray(std::array<Mat, _Nm>& arr)
+{ init(STD_ARRAY_MAT + ACCESS_RW, arr.data(), Size(1, _Nm)); }
+
+// Mutable vector-of-vectors: STD_VECTOR_VECTOR kind flagged FIXED_TYPE with read/write access.
+template<typename _Tp> inline _InputOutputArray::_InputOutputArray(std::vector<std::vector<_Tp> >& vec)
+{ init(FIXED_TYPE + STD_VECTOR_VECTOR + traits::Type<_Tp>::value + ACCESS_RW, &vec); }
+
+// Mutable vector<Mat_<_Tp>>: STD_VECTOR_MAT kind carrying _Tp's type code, flagged FIXED_TYPE.
+template<typename _Tp> inline _InputOutputArray::_InputOutputArray(std::vector<Mat_<_Tp> >& vec)
+{ init(FIXED_TYPE + STD_VECTOR_MAT + traits::Type<_Tp>::value + ACCESS_RW, &vec); }
+
+// Mutable Mat_<_Tp>: MAT kind carrying _Tp's type code, flagged FIXED_TYPE.
+template<typename _Tp> inline _InputOutputArray::_InputOutputArray(Mat_<_Tp>& m)
+{ init(FIXED_TYPE + MAT + traits::Type<_Tp>::value + ACCESS_RW, &m); }
+
+// Mutable Matx: fixed MATX kind over &mtx with extent Size(n, m).
+template<typename _Tp, int m, int n> inline _InputOutputArray::_InputOutputArray(Matx<_Tp, m, n>& mtx)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, &mtx, Size(n, m)); }
+
+// Mutable raw buffer of n elements: fixed MATX kind with extent Size(n, 1).
+template<typename _Tp> inline _InputOutputArray::_InputOutputArray(_Tp* vec, int n)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, vec, Size(n, 1)); }
+
+// Const std::vector overload: STD_VECTOR kind flagged FIXED_TYPE + FIXED_SIZE with read/write access.
+template<typename _Tp> inline _InputOutputArray::_InputOutputArray(const std::vector<_Tp>& vec)
+{ init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR + traits::Type<_Tp>::value + ACCESS_RW, &vec); }
+
+// Const std::array overload: exposes arr.data() as a fixed MATX kind with extent Size(1, _Nm).
+template<typename _Tp, std::size_t _Nm> inline _InputOutputArray::_InputOutputArray(const std::array<_Tp, _Nm>& arr)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, arr.data(), Size(1, _Nm)); }
+
+// Const std::array<Mat> overload: FIXED_SIZE STD_ARRAY_MAT kind, extent Size(1, _Nm).
+template<std::size_t _Nm> inline _InputOutputArray::_InputOutputArray(const std::array<Mat, _Nm>& arr)
+{ init(FIXED_SIZE + STD_ARRAY_MAT + ACCESS_RW, arr.data(), Size(1, _Nm)); }
+
+// Const vector-of-vectors overload: STD_VECTOR_VECTOR kind flagged FIXED_TYPE + FIXED_SIZE.
+template<typename _Tp> inline _InputOutputArray::_InputOutputArray(const std::vector<std::vector<_Tp> >& vec)
+{ init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR_VECTOR + traits::Type<_Tp>::value + ACCESS_RW, &vec); }
+
+// Const vector<Mat_<_Tp>> overload: STD_VECTOR_MAT kind carrying _Tp's type code, fixed type and size.
+template<typename _Tp> inline _InputOutputArray::_InputOutputArray(const std::vector<Mat_<_Tp> >& vec)
+{ init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR_MAT + traits::Type<_Tp>::value + ACCESS_RW, &vec); }
+
+// Const Mat_<_Tp> overload: MAT kind carrying _Tp's type code, flagged FIXED_TYPE + FIXED_SIZE.
+template<typename _Tp> inline _InputOutputArray::_InputOutputArray(const Mat_<_Tp>& m)
+{ init(FIXED_TYPE + FIXED_SIZE + MAT + traits::Type<_Tp>::value + ACCESS_RW, &m); }
+
+// Const Matx overload: fixed MATX kind over &mtx with extent Size(n, m).
+template<typename _Tp, int m, int n> inline _InputOutputArray::_InputOutputArray(const Matx<_Tp, m, n>& mtx)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, &mtx, Size(n, m)); }
+
+// Const raw-pointer overload: n elements as a fixed MATX kind with extent Size(n, 1).
+template<typename _Tp> inline _InputOutputArray::_InputOutputArray(const _Tp* vec, int n)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, vec, Size(n, 1)); }
+
+// Mutable cuda::GpuMat: CUDA_GPU_MAT kind with read/write access and no FIXED_* flags.
+inline _InputOutputArray::_InputOutputArray(cuda::GpuMat& d_mat) { init(CUDA_GPU_MAT + ACCESS_RW, &d_mat); }
+
+// Mutable ogl::Buffer: OPENGL_BUFFER kind with read/write access.
+inline _InputOutputArray::_InputOutputArray(ogl::Buffer& buf) { init(OPENGL_BUFFER + ACCESS_RW, &buf); }
+
+// Mutable cuda::HostMem: CUDA_HOST_MEM kind with read/write access.
+inline _InputOutputArray::_InputOutputArray(cuda::HostMem& cuda_mem) { init(CUDA_HOST_MEM + ACCESS_RW, &cuda_mem); }
+
+// Const Mat: MAT kind flagged FIXED_TYPE + FIXED_SIZE.
+inline _InputOutputArray::_InputOutputArray(const Mat& m) { init(FIXED_TYPE + FIXED_SIZE + MAT + ACCESS_RW, &m); }
+
+// Const vector<Mat>: STD_VECTOR_MAT kind flagged FIXED_SIZE only.
+inline _InputOutputArray::_InputOutputArray(const std::vector<Mat>& vec) { init(FIXED_SIZE + STD_VECTOR_MAT + ACCESS_RW, &vec); }
+
+// Const UMat: UMAT kind flagged FIXED_TYPE + FIXED_SIZE.
+inline _InputOutputArray::_InputOutputArray(const UMat& m) { init(FIXED_TYPE + FIXED_SIZE + UMAT + ACCESS_RW, &m); }
+
+// Const vector<UMat>: STD_VECTOR_UMAT kind flagged FIXED_SIZE only.
+inline _InputOutputArray::_InputOutputArray(const std::vector<UMat>& vec) { init(FIXED_SIZE + STD_VECTOR_UMAT + ACCESS_RW, &vec); }
+
+// Const cuda::GpuMat: CUDA_GPU_MAT kind flagged FIXED_TYPE + FIXED_SIZE.
+inline _InputOutputArray::_InputOutputArray(const cuda::GpuMat& d_mat) { init(FIXED_TYPE + FIXED_SIZE + CUDA_GPU_MAT + ACCESS_RW, &d_mat); }
+
+// Const vector of cuda::GpuMat: STD_VECTOR_CUDA_GPU_MAT kind flagged FIXED_TYPE + FIXED_SIZE.
+inline _InputOutputArray::_InputOutputArray(const std::vector<cuda::GpuMat>& d_mat) { init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR_CUDA_GPU_MAT + ACCESS_RW, &d_mat); }
+
+// Explicit specialization for a mutable vector of cuda::GpuMat. NOTE(review): sets FIXED_TYPE + FIXED_SIZE although the argument is non-const - confirm intended.
+template<> inline _InputOutputArray::_InputOutputArray(std::vector<cuda::GpuMat>& d_mat) { init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR_CUDA_GPU_MAT + ACCESS_RW, &d_mat); }
+
+// Const ogl::Buffer: OPENGL_BUFFER kind flagged FIXED_TYPE + FIXED_SIZE.
+inline _InputOutputArray::_InputOutputArray(const ogl::Buffer& buf) { init(FIXED_TYPE + FIXED_SIZE + OPENGL_BUFFER + ACCESS_RW, &buf); }
+
+// Const cuda::HostMem: CUDA_HOST_MEM kind flagged FIXED_TYPE + FIXED_SIZE.
+inline _InputOutputArray::_InputOutputArray(const cuda::HostMem& cuda_mem) { init(FIXED_TYPE + FIXED_SIZE + CUDA_HOST_MEM + ACCESS_RW, &cuda_mem); }
+
+// Builds an in/out wrapper for vec by filling flags/obj directly; the element type code comes
+// from rawType<_Tp>() and the kind is FIXED_TYPE + STD_VECTOR with read/write access.
+template<typename _Tp> inline _InputOutputArray _InputOutputArray::rawInOut(std::vector<_Tp>& vec) {
+    _InputOutputArray wrapped;
+    wrapped.obj = (void*)&vec;
+    wrapped.flags = _InputArray::FIXED_TYPE + _InputArray::STD_VECTOR + rawType<_Tp>() + ACCESS_RW;
+    return wrapped;
+}
+
+// In/out wrapper over arr.data(): fixed-type, fixed-size MATX kind with sz = Size(1, _Nm).
+template<typename _Tp, std::size_t _Nm> inline _InputOutputArray _InputOutputArray::rawInOut(std::array<_Tp, _Nm>& arr)
+{
+    _InputOutputArray wrapped;
+    wrapped.sz = Size(1, _Nm);
+    wrapped.obj = (void*)arr.data();
+    wrapped.flags = FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW;
+    return wrapped;
+}
+
+
+template<typename _Tp> static inline _InputArray rawIn(_Tp& v) { return _InputArray::rawIn(v); } // free-function shorthand for _InputArray::rawIn
+template<typename _Tp> static inline _OutputArray rawOut(_Tp& v) { return _OutputArray::rawOut(v); } // free-function shorthand for _OutputArray::rawOut
+template<typename _Tp> static inline _InputOutputArray rawInOut(_Tp& v) { return _InputOutputArray::rawInOut(v); } // free-function shorthand for _InputOutputArray::rawInOut
+
+CV__DEBUG_NS_END
+
+//////////////////////////////////////////// Mat //////////////////////////////////////////
+
+// vec.size() x 1 matrix over a std::vector: points at the vector's elements unless copyData is set.
+template<typename _Tp> inline Mat::Mat(const std::vector<_Tp>& vec, bool copyData)
+    : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows((int)vec.size()),
+      cols(1), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
+{
+    if( vec.empty() )
+        return;
+    if( copyData )
+    {
+        Mat((int)vec.size(), 1, traits::Type<_Tp>::value, (uchar*)&vec[0]).copyTo(*this);
+        return;
+    }
+    step[0] = step[1] = sizeof(_Tp);
+    datastart = data = (uchar*)&vec[0];
+    datalimit = dataend = datastart + rows * step[0];
+}
+
+// Copies a non-empty initializer list into a list.size() x 1 matrix; an empty list fails CV_Assert.
+template<typename _Tp, typename> inline Mat::Mat(const std::initializer_list<_Tp> list)
+    : Mat()
+{
+    CV_Assert(list.size() != 0);
+    Mat((int)list.size(), 1, traits::Type<_Tp>::value, (uchar*)list.begin()).copyTo(*this);
+}
+
+// Copies list into a matrix shaped by sizes; the product of all sizes must equal list.size().
+template<typename _Tp> inline Mat::Mat(const std::initializer_list<int> sizes, const std::initializer_list<_Tp> list)
+    : Mat()
+{
+    size_t size_total = 1;
+    for( const int extent : sizes )
+        size_total *= extent;
+    CV_Assert(list.size() != 0);
+    CV_Assert(size_total == list.size());
+    Mat((int)sizes.size(), (int*)sizes.begin(), traits::Type<_Tp>::value, (uchar*)list.begin()).copyTo(*this);
+}
+
+// arr.size() x 1 matrix over a std::array: points at arr.data() unless copyData is set.
+template<typename _Tp, std::size_t _Nm> inline Mat::Mat(const std::array<_Tp, _Nm>& arr, bool copyData)
+    : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows((int)arr.size()),
+      cols(1), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
+{
+    if( arr.empty() )
+        return;
+    if( copyData )
+    {
+        Mat((int)arr.size(), 1, traits::Type<_Tp>::value, (uchar*)arr.data()).copyTo(*this);
+        return;
+    }
+    step[0] = step[1] = sizeof(_Tp);
+    datastart = data = (uchar*)arr.data();
+    datalimit = dataend = datastart + rows * step[0];
+}
+
+// n x 1 matrix over a Vec: points at vec.val unless copyData is set, in which case the values are copied.
+template<typename _Tp, int n> inline Mat::Mat(const Vec<_Tp, n>& vec, bool copyData)
+    : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(n), cols(1), data(0),
+      datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
+{
+    if( copyData )
+    {
+        Mat(n, 1, traits::Type<_Tp>::value, (void*)vec.val).copyTo(*this);
+        return;
+    }
+    step[0] = step[1] = sizeof(_Tp);
+    datastart = data = (uchar*)vec.val;
+    datalimit = dataend = datastart + rows * step[0];
+}
+
+
+// m x n matrix over a Matx: points at M.val with a row step of cols * sizeof(_Tp) unless copyData is set.
+template<typename _Tp, int m, int n> inline Mat::Mat(const Matx<_Tp,m,n>& M, bool copyData)
+    : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(m), cols(n), data(0),
+      datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
+{
+    if( copyData )
+    {
+        Mat(m, n, traits::Type<_Tp>::value, (uchar*)M.val).copyTo(*this);
+        return;
+    }
+    step[0] = cols * sizeof(_Tp);
+    step[1] = sizeof(_Tp);
+    datastart = data = (uchar*)M.val;
+    datalimit = dataend = datastart + rows * step[0];
+}
+
+// 2 x 1 matrix for a Point_: the non-copy path points data at &pt.x, the copy path
+// allocates a 2x1 matrix via create() and stores pt.x and pt.y in it.
+template<typename _Tp> inline Mat::Mat(const Point_<_Tp>& pt, bool copyData)
+    : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(2), cols(1), data(0),
+      datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
+{
+    if( copyData )
+    {
+        create(2, 1, traits::Type<_Tp>::value);
+        _Tp* const dst = (_Tp*)data;
+        dst[0] = pt.x;
+        dst[1] = pt.y;
+        return;
+    }
+    step[0] = step[1] = sizeof(_Tp);
+    datastart = data = (uchar*)&pt.x;
+    datalimit = dataend = datastart + rows * step[0];
+}
+
+// 3 x 1 matrix for a Point3_: the non-copy path points data at &pt.x, the copy path
+// allocates a 3x1 matrix via create() and stores pt.x, pt.y and pt.z in it.
+template<typename _Tp> inline Mat::Mat(const Point3_<_Tp>& pt, bool copyData)
+    : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(3), cols(1), data(0),
+      datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
+{
+    if( copyData )
+    {
+        create(3, 1, traits::Type<_Tp>::value);
+        _Tp* const dst = (_Tp*)data;
+        dst[0] = pt.x;
+        dst[1] = pt.y;
+        dst[2] = pt.z;
+        return;
+    }
+    step[0] = step[1] = sizeof(_Tp);
+    datastart = data = (uchar*)&pt.x;
+    datalimit = dataend = datastart + rows * step[0];
+}
+
+// Builds the matrix from a comma-initializer by assigning its Mat_<_Tp> conversion; datalimit is zero-initialised here too.
+template<typename _Tp> inline Mat::Mat(const MatCommaInitializer_<_Tp>& commaInitializer)
+    : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(0), rows(0), cols(0), data(0),
+      datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows)
+{
+    *this = commaInitializer.operator Mat_<_Tp>();
+}
+
+// Sub-matrix selecting row y (Range(y, y + 1)) and every column.
+inline Mat Mat::row(int y) const
+{
+    return Mat(*this, Range(y, y + 1), Range::all());
+}
+
+// Sub-matrix selecting column x (Range(x, x + 1)) and every row.
+inline Mat Mat::col(int x) const
+{
+    return Mat(*this, Range::all(), Range(x, x + 1));
+}
+
+// Sub-matrix built from Range(startrow, endrow) over every column.
+inline Mat Mat::rowRange(int startrow, int endrow) const
+{
+    return Mat(*this, Range(startrow, endrow), Range::all());
+}
+
+// Sub-matrix built from the row range r over every column.
+inline Mat Mat::rowRange(const Range& r) const
+{
+    return Mat(*this, r, Range::all());
+}
+
+// Sub-matrix built from Range(startcol, endcol) over every row.
+inline Mat Mat::colRange(int startcol, int endcol) const
+{
+    return Mat(*this, Range::all(), Range(startcol, endcol));
+}
+
+// Sub-matrix built from the column range r over every row.
+inline Mat Mat::colRange(const Range& r) const
+{
+    return Mat(*this, Range::all(), r);
+}
+
+// Sub-matrix for the given row and column ranges; forwards to the Mat(Mat, Range, Range) constructor.
+inline Mat Mat::operator()( Range _rowRange, Range _colRange ) const
+{
+    return Mat(*this, _rowRange, _colRange);
+}
+
+// Sub-matrix for the rectangle roi; forwards to the Mat(Mat, Rect) constructor.
+inline Mat Mat::operator()( const Rect& roi ) const
+{
+    return Mat(*this, roi);
+}
+
+// Sub-matrix for an array of per-dimension ranges; forwards to the Mat(Mat, const Range*) constructor.
+inline Mat Mat::operator()(const Range* ranges) const
+{
+    return Mat(*this, ranges);
+}
+
+// Sub-matrix for a vector of per-dimension ranges; forwards to the matching Mat constructor.
+inline Mat Mat::operator()(const std::vector<Range>& ranges) const
+{
+    return Mat(*this, ranges);
+}
+
+// True when CONTINUOUS_FLAG is set in flags.
+inline bool Mat::isContinuous() const
+{
+    return (flags & CONTINUOUS_FLAG) != 0;
+}
+
+// True when SUBMATRIX_FLAG is set in flags.
+inline bool Mat::isSubmatrix() const
+{
+    return (flags & SUBMATRIX_FLAG) != 0;
+}
+
+// Element size taken from the last step entry (step.p[dims - 1]); CV_DbgAssert rejects a zero result.
+inline size_t Mat::elemSize() const
+{
+    const size_t res = dims > 0 ? step.p[dims - 1] : 0;
+    CV_DbgAssert(res != 0);
+    return res;
+}
+
+// Result of CV_ELEM_SIZE1 applied to flags.
+inline size_t Mat::elemSize1() const
+{
+    return CV_ELEM_SIZE1(flags);
+}
+
+// Type code extracted from flags with CV_MAT_TYPE.
+inline int Mat::type() const
+{
+    return CV_MAT_TYPE(flags);
+}
+
+// Depth code extracted from flags with CV_MAT_DEPTH.
+inline int Mat::depth() const
+{
+    return CV_MAT_DEPTH(flags);
+}
+
+// Channel count extracted from flags with CV_MAT_CN.
+inline int Mat::channels() const
+{
+    return CV_MAT_CN(flags);
+}
+
+// Address of row y: data advanced by step.p[0] * y bytes; y == 0 is accepted even without data.
+inline uchar* Mat::ptr(int y)
+{
+    CV_DbgAssert( y == 0 || (data && dims >= 1 && (unsigned)y < (unsigned)size.p[0]) );
+    return data + step.p[0] * y;
+}
+
+// Const overload: address of row y, i.e. data advanced by step.p[0] * y bytes.
+inline const uchar* Mat::ptr(int y) const
+{
+    CV_DbgAssert( y == 0 || (data && dims >= 1 && (unsigned)y < (unsigned)size.p[0]) );
+    return data + step.p[0] * y;
+}
+
+// Typed overload: row y's address (data + step.p[0] * y) cast to _Tp*.
+template<typename _Tp> inline _Tp* Mat::ptr(int y)
+{
+    CV_DbgAssert( y == 0 || (data && dims >= 1 && (unsigned)y < (unsigned)size.p[0]) );
+    return (_Tp*)(data + step.p[0] * y);
+}
+
+// Typed const overload: row y's address (data + step.p[0] * y) cast to const _Tp*.
+template<typename _Tp> inline const _Tp* Mat::ptr(int y) const
+{
+    CV_DbgAssert( y == 0 || (data && dims >= 1 && (unsigned)y < (unsigned)size.p[0]) );
+    return (const _Tp*)(data + step.p[0] * y);
+}
+
+// Address of element (i0, i1): byte offset i0 * step.p[0] + i1 * step.p[1] from data.
+inline uchar* Mat::ptr(int i0, int i1)
+{
+    CV_DbgAssert(dims >= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    return data + i0 * step.p[0] + i1 * step.p[1];
+}
+
+// Const overload: address of element (i0, i1) at byte offset i0 * step.p[0] + i1 * step.p[1].
+inline const uchar* Mat::ptr(int i0, int i1) const
+{
+    CV_DbgAssert(dims >= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    return data + i0 * step.p[0] + i1 * step.p[1];
+}
+
+// Typed overload: address of element (i0, i1) cast to _Tp*.
+template<typename _Tp> inline _Tp* Mat::ptr(int i0, int i1)
+{
+    CV_DbgAssert(dims >= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    return (_Tp*)(data + i0 * step.p[0] + i1 * step.p[1]);
+}
+
+// Typed const overload: address of element (i0, i1) cast to const _Tp*.
+template<typename _Tp> inline const _Tp* Mat::ptr(int i0, int i1) const
+{
+    CV_DbgAssert(dims >= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    return (const _Tp*)(data + i0 * step.p[0] + i1 * step.p[1]);
+}
+
+// Address of element (i0, i1, i2): each index is scaled by the matching step.p[] entry and added to data.
+inline uchar* Mat::ptr(int i0, int i1, int i2)
+{
+    CV_DbgAssert(dims >= 3);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    CV_DbgAssert((unsigned)i2 < (unsigned)size.p[2]);
+    return data + i0 * step.p[0] + i1 * step.p[1] + i2 * step.p[2];
+}
+
+// Const overload: address of element (i0, i1, i2) computed from the three step.p[] entries.
+inline const uchar* Mat::ptr(int i0, int i1, int i2) const
+{
+    CV_DbgAssert(dims >= 3);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    CV_DbgAssert((unsigned)i2 < (unsigned)size.p[2]);
+    return data + i0 * step.p[0] + i1 * step.p[1] + i2 * step.p[2];
+}
+
+// Typed overload: address of element (i0, i1, i2) cast to _Tp*.
+template<typename _Tp> inline _Tp* Mat::ptr(int i0, int i1, int i2)
+{
+    CV_DbgAssert(dims >= 3);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    CV_DbgAssert((unsigned)i2 < (unsigned)size.p[2]);
+    return (_Tp*)(data + i0 * step.p[0] + i1 * step.p[1] + i2 * step.p[2]);
+}
+
+// Typed const overload: address of element (i0, i1, i2) cast to const _Tp*.
+template<typename _Tp> inline const _Tp* Mat::ptr(int i0, int i1, int i2) const
+{
+    CV_DbgAssert(dims >= 3);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    CV_DbgAssert((unsigned)i2 < (unsigned)size.p[2]);
+    return (const _Tp*)(data + i0 * step.p[0] + i1 * step.p[1] + i2 * step.p[2]);
+}
+
+// Element address for an index array: walks all dims entries, adding idx[i] * step.p[i] per dimension.
+inline uchar* Mat::ptr(const int* idx)
+{
+    const int d = dims;
+    uchar* p = data;
+    CV_DbgAssert( d >= 1 && p );
+    for( int i = 0; i < d; ++i )
+    {
+        CV_DbgAssert( (unsigned)idx[i] < (unsigned)size.p[i] );
+        p += idx[i] * step.p[i];
+    }
+    return p;
+}
+
+// Const overload: element address for an index array, adding idx[i] * step.p[i] per dimension.
+inline const uchar* Mat::ptr(const int* idx) const
+{
+    const int d = dims;
+    uchar* p = data;
+    CV_DbgAssert( d >= 1 && p );
+    for( int i = 0; i < d; ++i )
+    {
+        CV_DbgAssert( (unsigned)idx[i] < (unsigned)size.p[i] );
+        p += idx[i] * step.p[i];
+    }
+    return p;
+}
+
+// Typed overload: element address for an index array, cast to _Tp*.
+template<typename _Tp> inline _Tp* Mat::ptr(const int* idx)
+{
+    const int d = dims;
+    uchar* p = data;
+    CV_DbgAssert( d >= 1 && p );
+    for( int i = 0; i < d; ++i )
+    {
+        CV_DbgAssert( (unsigned)idx[i] < (unsigned)size.p[i] );
+        p += idx[i] * step.p[i];
+    }
+    return (_Tp*)p;
+}
+
+// Typed const overload: element address for an index array, cast to const _Tp*.
+template<typename _Tp> inline const _Tp* Mat::ptr(const int* idx) const
+{
+    const int d = dims;
+    uchar* p = data;
+    CV_DbgAssert( d >= 1 && p );
+    for( int i = 0; i < d; ++i )
+    {
+        CV_DbgAssert( (unsigned)idx[i] < (unsigned)size.p[i] );
+        p += idx[i] * step.p[i];
+    }
+    return (const _Tp*)p;
+}
+
+// Vec-index overload: forwards idx.val to the pointer-based ptr().
+template<int n> inline uchar* Mat::ptr(const Vec<int, n>& idx)
+{
+    return Mat::ptr(idx.val);
+}
+
+// Const Vec-index overload: forwards idx.val to the pointer-based ptr().
+template<int n> inline const uchar* Mat::ptr(const Vec<int, n>& idx) const
+{
+    return Mat::ptr(idx.val);
+}
+
+// Typed Vec-index overload: checks elemSize() against sizeof(_Tp), then forwards idx.val to ptr<_Tp>().
+template<typename _Tp, int n> inline _Tp* Mat::ptr(const Vec<int, n>& idx)
+{
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    return Mat::ptr<_Tp>(idx.val);
+}
+
+// Typed const Vec-index overload: checks elemSize() against sizeof(_Tp), then forwards to ptr<_Tp>().
+template<typename _Tp, int n> inline const _Tp* Mat::ptr(const Vec<int, n>& idx) const
+{
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    return Mat::ptr<_Tp>(idx.val);
+}
+
+
+// Reference to element (i0, i1) of a matrix with at most 2 dims: row i0 is located via step.p[0], then indexed by i1.
+template<typename _Tp> inline _Tp& Mat::at(int i0, int i1)
+{
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)(i1 * DataType<_Tp>::channels) < (unsigned)(size.p[1] * channels()));
+    CV_DbgAssert(CV_ELEM_SIZE1(traits::Depth<_Tp>::value) == elemSize1());
+    return ((_Tp*)(data + step.p[0] * i0))[i1];
+}
+
+// Const overload: reference to element (i0, i1); row i0 is located via step.p[0], then indexed by i1.
+template<typename _Tp> inline const _Tp& Mat::at(int i0, int i1) const
+{
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)(i1 * DataType<_Tp>::channels) < (unsigned)(size.p[1] * channels()));
+    CV_DbgAssert(CV_ELEM_SIZE1(traits::Depth<_Tp>::value) == elemSize1());
+    return ((const _Tp*)(data + step.p[0] * i0))[i1];
+}
+
+// Point overload: pt.y selects the row (via step.p[0]) and pt.x the element within it.
+template<typename _Tp> inline _Tp& Mat::at(Point pt)
+{
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)pt.y < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)(pt.x * DataType<_Tp>::channels) < (unsigned)(size.p[1] * channels()));
+    CV_DbgAssert(CV_ELEM_SIZE1(traits::Depth<_Tp>::value) == elemSize1());
+    return ((_Tp*)(data + step.p[0] * pt.y))[pt.x];
+}
+
+// Const Point overload: pt.y selects the row (via step.p[0]) and pt.x the element within it.
+template<typename _Tp> inline const _Tp& Mat::at(Point pt) const
+{
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)pt.y < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)(pt.x * DataType<_Tp>::channels) < (unsigned)(size.p[1] * channels()));
+    CV_DbgAssert(CV_ELEM_SIZE1(traits::Depth<_Tp>::value) == elemSize1());
+    return ((const _Tp*)(data + step.p[0] * pt.y))[pt.x];
+}
+
+// Linear-index access for a matrix with at most 2 dims; i0 counts elements across size.p[0] * size.p[1].
+template<typename _Tp> inline _Tp& Mat::at(int i0)
+{
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)(size.p[0] * size.p[1]));
+    CV_DbgAssert(elemSize() == sizeof(_Tp));
+    if( isContinuous() || size.p[0] == 1 )
+        return ((_Tp*)data)[i0];                   // continuous data or a single row: index directly
+    if( size.p[1] == 1 )
+        return *(_Tp*)(data + step.p[0] * i0);     // single column: i0 is the row number
+    const int row = i0 / cols, col = i0 - row * cols;
+    return ((_Tp*)(data + step.p[0] * row))[col];
+}
+
+// Const linear-index access for a matrix with at most 2 dims; i0 counts elements across size.p[0] * size.p[1].
+template<typename _Tp> inline const _Tp& Mat::at(int i0) const
+{
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)(size.p[0] * size.p[1]));
+    CV_DbgAssert(elemSize() == sizeof(_Tp));
+    if( isContinuous() || size.p[0] == 1 )
+        return ((const _Tp*)data)[i0];                 // continuous data or a single row: index directly
+    if( size.p[1] == 1 )
+        return *(const _Tp*)(data + step.p[0] * i0);   // single column: i0 is the row number
+    const int row = i0 / cols, col = i0 - row * cols;
+    return ((const _Tp*)(data + step.p[0] * row))[col];
+}
+
+// Reference to element (i0, i1, i2), located through ptr(i0, i1, i2).
+template<typename _Tp> inline _Tp& Mat::at(int i0, int i1, int i2)
+{
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    return *(_Tp*)ptr(i0, i1, i2);
+}
+
+// Const reference to element (i0, i1, i2), located through ptr(i0, i1, i2).
+template<typename _Tp> inline const _Tp& Mat::at(int i0, int i1, int i2) const
+{
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    return *(const _Tp*)ptr(i0, i1, i2);
+}
+
+// Reference to the element addressed by the index array idx, located through ptr(idx).
+template<typename _Tp> inline _Tp& Mat::at(const int* idx)
+{
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    return *(_Tp*)ptr(idx);
+}
+
+// Const reference to the element addressed by the index array idx, located through ptr(idx).
+template<typename _Tp> inline const _Tp& Mat::at(const int* idx) const
+{
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    return *(const _Tp*)ptr(idx);
+}
+
+// Reference to the element addressed by a Vec of indices, located through ptr(idx.val).
+template<typename _Tp, int n> inline _Tp& Mat::at(const Vec<int, n>& idx)
+{
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    return *(_Tp*)ptr(idx.val);
+}
+
+// Const reference to the element addressed by a Vec of indices, located through ptr(idx.val).
+template<typename _Tp, int n> inline const _Tp& Mat::at(const Vec<int, n>& idx) const
+{
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    return *(const _Tp*)ptr(idx.val);
+}
+
+// Const iterator constructed over this matrix; an empty matrix yields a default-constructed iterator.
+template<typename _Tp> inline MatConstIterator_<_Tp> Mat::begin() const
+{
+    if( empty() )
+        return MatConstIterator_<_Tp>();
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    return MatConstIterator_<_Tp>((const Mat_<_Tp>*)this);
+}
+
+// Const reverse iterator wrapping an iterator advanced by total(); default-constructed for an empty matrix.
+template<typename _Tp> inline std::reverse_iterator<MatConstIterator_<_Tp>> Mat::rbegin() const
+{
+    if( empty() )
+        return std::reverse_iterator<MatConstIterator_<_Tp>>();
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    MatConstIterator_<_Tp> last((const Mat_<_Tp>*)this);
+    last += total();
+    return std::reverse_iterator<MatConstIterator_<_Tp>> (last);
+}
+
+// Const iterator advanced by total() from the start; default-constructed for an empty matrix.
+template<typename _Tp> inline MatConstIterator_<_Tp> Mat::end() const
+{
+    if( empty() )
+        return MatConstIterator_<_Tp>();
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    MatConstIterator_<_Tp> last((const Mat_<_Tp>*)this);
+    last += total();
+    return last;
+}
+
+// Const reverse iterator wrapping the start iterator; the MatConstIterator_ is now constructed explicitly.
+template<typename _Tp> inline std::reverse_iterator<MatConstIterator_<_Tp>> Mat::rend() const
+{
+    if( empty() )
+        return std::reverse_iterator<MatConstIterator_<_Tp>>();
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    return std::reverse_iterator<MatConstIterator_<_Tp>>(MatConstIterator_<_Tp>((const Mat_<_Tp>*)this));
+}
+
+// Mutable iterator constructed over this matrix; an empty matrix yields a default-constructed iterator.
+template<typename _Tp> inline MatIterator_<_Tp> Mat::begin()
+{
+    if( empty() )
+        return MatIterator_<_Tp>();
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    return MatIterator_<_Tp>((Mat_<_Tp>*)this);
+}
+
+// Mutable reverse iterator wrapping an iterator advanced by total(); default-constructed for an empty matrix.
+template<typename _Tp> inline std::reverse_iterator<MatIterator_<_Tp>> Mat::rbegin()
+{
+    if( empty() )
+        return std::reverse_iterator<MatIterator_<_Tp>>();
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    MatIterator_<_Tp> last((Mat_<_Tp>*)this);
+    last += total();
+    return std::reverse_iterator<MatIterator_<_Tp>>(last);
+}
+
+// Mutable iterator advanced by total() from the start; default-constructed for an empty matrix.
+template<typename _Tp> inline MatIterator_<_Tp> Mat::end()
+{
+    if( empty() )
+        return MatIterator_<_Tp>();
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    MatIterator_<_Tp> last((Mat_<_Tp>*)this);
+    last += total();
+    return last;
+}
+
+// Mutable reverse iterator wrapping the start iterator; default-constructed for an empty matrix.
+template<typename _Tp> inline std::reverse_iterator<MatIterator_<_Tp>> Mat::rend()
+{
+    if( empty() )
+        return std::reverse_iterator<MatIterator_<_Tp>>();
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    return std::reverse_iterator<MatIterator_<_Tp>>(MatIterator_<_Tp>((Mat_<_Tp>*)this));
+}
+
+// Hands the functor to forEach_impl<_Tp>, which is defined elsewhere.
+template<typename _Tp, typename Functor> inline void Mat::forEach(const Functor& operation) {
+    this->forEach_impl<_Tp>(operation);
+}
+
+// Const overload: casts away const on this and reuses the non-const forEach.
+template<typename _Tp, typename Functor> inline void Mat::forEach(const Functor& operation) const {
+    // the non-const overload does the work
+    (const_cast<Mat*>(this))->forEach<_Tp>(operation);
+}
+
+// Conversion to std::vector: copies the matrix into a fresh vector via copyTo.
+template<typename _Tp> inline Mat::operator std::vector<_Tp>() const
+{
+    std::vector<_Tp> result;
+    copyTo(result);
+    return result;
+}
+
+// Conversion to std::array: copies the matrix into a local array via copyTo.
+template<typename _Tp, std::size_t _Nm> inline Mat::operator std::array<_Tp, _Nm>() const
+{
+    std::array<_Tp, _Nm> result;
+    copyTo(result);
+    return result;
+}
+
+// Conversion to Vec<_Tp, n>: requires a single-channel row or column with exactly n elements.
+template<typename _Tp, int n> inline Mat::operator Vec<_Tp, n>() const
+{
+    CV_Assert( data && dims <= 2 && (rows == 1 || cols == 1) &&
+               rows + cols - 1 == n && channels() == 1 );
+    // Continuous data of the matching type is read in place; otherwise convertTo fills the result.
+    if( isContinuous() && type() == traits::Type<_Tp>::value )
+        return Vec<_Tp, n>((_Tp*)data);
+    Vec<_Tp, n> result;
+    Mat view(rows, cols, traits::Type<_Tp>::value, result.val);
+    convertTo(view, view.type());
+    return result;
+}
+
+// Conversion to Matx<_Tp, m, n>: requires a single-channel m x n matrix.
+template<typename _Tp, int m, int n> inline Mat::operator Matx<_Tp, m, n>() const
+{
+    CV_Assert( data && dims <= 2 && rows == m && cols == n && channels() == 1 );
+    // Continuous data of the matching type is read in place; otherwise convertTo fills the result.
+    if( isContinuous() && type() == traits::Type<_Tp>::value )
+        return Matx<_Tp, m, n>((_Tp*)data);
+    Matx<_Tp, m, n> result;
+    Mat view(rows, cols, traits::Type<_Tp>::value, result.val);
+    convertTo(view, view.type());
+    return result;
+}
+
+// Appends one element as a new row; a matrix that already has data must be single-column and of type _Tp.
+template<typename _Tp> inline void Mat::push_back(const _Tp& elem)
+{
+    if( !data )
+    {   // no buffer yet: become a cloned 1x1 matrix holding elem
+        *this = Mat(1, 1, traits::Type<_Tp>::value, (void*)&elem).clone();
+        return;
+    }
+    CV_Assert(traits::Type<_Tp>::value == type() && cols == 1
+              /* && dims == 2 (cols == 1 implies dims == 2) */);
+    const uchar* const grown_end = dataend + step[0];
+    if( isSubmatrix() || !isContinuous() || grown_end > datalimit )
+        push_back_(&elem);   // cannot write in place: defer to push_back_ (defined elsewhere)
+    else
+    {
+        *(_Tp*)(data + (size.p[0]++) * step.p[0]) = elem;
+        dataend = grown_end;
+    }
+}
+
+// Mat_ overload: forwards to the push_back overload taking a plain Mat.
+template<typename _Tp> inline void Mat::push_back(const Mat_<_Tp>& m)
+{
+    push_back((const Mat&)m);
+}
+
+// MatExpr specialization: converts the expression to a Mat, then appends that.
+template<> inline void Mat::push_back(const MatExpr& expr)
+{
+    push_back(static_cast<Mat>(expr));
+}
+
+
+// std::vector overload: wraps v in a Mat and appends that.
+template<typename _Tp> inline void Mat::push_back(const std::vector<_Tp>& v)
+{
+    push_back(Mat(v));
+}
+
+
+///////////////////////////// MatSize ////////////////////////////
+
+// Stores the pointer to the size array; nothing is copied.
+inline MatSize::MatSize(int* _p) CV_NOEXCEPT
+    : p(_p) {}
+
+// Reads the int stored immediately before the size array (p[-1]) as the dimension count.
+inline int MatSize::dims() const CV_NOEXCEPT
+{
+    return p[-1];
+}
+
+// Size built as Size(p[1], p[0]); CV_DbgAssert requires at most 2 dimensions.
+inline Size MatSize::operator()() const
+{
+    CV_DbgAssert(dims() <= 2);
+    return Size(p[1], p[0]);
+}
+
+// Const access to the i-th size entry; the lower-bound check is compiled only under __OPENCV_BUILD.
+inline const int& MatSize::operator[](int i) const
+{
+    CV_DbgAssert(i < dims());
+#ifdef __OPENCV_BUILD
+    CV_DbgAssert(i >= 0);
+#endif
+    return p[i];
+}
+
+// Mutable access to the i-th size entry; the lower-bound check is compiled only under __OPENCV_BUILD.
+inline int& MatSize::operator[](int i)
+{
+    CV_DbgAssert(i < dims());
+#ifdef __OPENCV_BUILD
+    CV_DbgAssert(i >= 0);
+#endif
+    return p[i];
+}
+
+// Conversion to the underlying size-array pointer.
+inline MatSize::operator const int*() const CV_NOEXCEPT
+{
+    return p;
+}
+
+// Inequality is the negation of operator== (defined elsewhere).
+inline bool MatSize::operator != (const MatSize& sz) const CV_NOEXCEPT
+{
+    return !(*this == sz);
+}
+
+
+
+///////////////////////////// MatStep ////////////////////////////
+
+// Points p at the inline buffer and zeroes both of its entries.
+inline MatStep::MatStep() CV_NOEXCEPT
+{
+    p = buf; buf[0] = buf[1] = 0;
+}
+
+// Points p at the inline buffer, stores s in the first entry and zeroes the second.
+inline MatStep::MatStep(size_t s) CV_NOEXCEPT
+{
+    p = buf; buf[0] = s; buf[1] = 0;
+}
+
+// Const access to the i-th step entry through p; no bounds check is performed.
+inline const size_t& MatStep::operator[](int i) const CV_NOEXCEPT
+{
+    return p[i];
+}
+
+// Mutable access to the i-th step entry through p; no bounds check is performed.
+inline size_t& MatStep::operator[](int i) CV_NOEXCEPT
+{
+    return p[i];
+}
+
+// Returns buf[0]; CV_DbgAssert requires that p still points at the inline buffer.
+inline MatStep::operator size_t() const {
+    CV_DbgAssert( p == buf );
+    return buf[0];
+}
+
+// Stores s in buf[0]; CV_DbgAssert requires that p still points at the inline buffer.
+inline MatStep& MatStep::operator = (size_t s) {
+    CV_DbgAssert( p == buf );
+    buf[0] = s;
+    return *this;
+}
+
+
+
+////////////////////////////// Mat_<_Tp> ////////////////////////////
+
+// Default constructor: empty Mat whose type bits in flags are replaced by _Tp's type code.
+template<typename _Tp> inline Mat_<_Tp>::Mat_() CV_NOEXCEPT
+    : Mat()
+{
+    flags = (flags & ~CV_MAT_TYPE_MASK) + traits::Type<_Tp>::value;
+}
+
+// _rows x _cols matrix of _Tp's type; forwards to the Mat(rows, cols, type) constructor.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(int _rows, int _cols)
+    : Mat(_rows, _cols, traits::Type<_Tp>::value)
+{
+}
+
+// _rows x _cols matrix of _Tp's type, then assigned from value.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(int _rows, int _cols, const _Tp& value)
+    : Mat(_rows, _cols, traits::Type<_Tp>::value)
+{
+    *this = value;
+}
+
+// Matrix of _sz.height rows by _sz.width columns with _Tp's type.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(Size _sz)
+    : Mat(_sz.height, _sz.width, traits::Type<_Tp>::value)
+{}
+
+// Matrix of _sz.height rows by _sz.width columns with _Tp's type, then assigned from value.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(Size _sz, const _Tp& value)
+    : Mat(_sz.height, _sz.width, traits::Type<_Tp>::value)
+{
+    *this = value;
+}
+
+// Matrix with _dims dimensions sized by _sz and _Tp's type.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(int _dims, const int* _sz)
+    : Mat(_dims, _sz, traits::Type<_Tp>::value)
+{}
+
+// Matrix with _dims dimensions sized by _sz; Scalar(_s) is passed on to the Mat constructor.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(int _dims, const int* _sz, const _Tp& _s)
+    : Mat(_dims, _sz, traits::Type<_Tp>::value, Scalar(_s))
+{}
+
+// Matrix with _dims dimensions over caller-provided _data and optional _steps; all forwarded to Mat.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(int _dims, const int* _sz, _Tp* _data, const size_t* _steps)
+    : Mat(_dims, _sz, traits::Type<_Tp>::value, _data, _steps)
+{}
+
+// Sub-matrix of m selected by an array of ranges; forwards to the Mat(Mat, const Range*) constructor.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(const Mat_<_Tp>& m, const Range* ranges)
+    : Mat(m, ranges)
+{}
+
+// Sub-matrix of m selected by a vector of ranges; forwards to the matching Mat constructor.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(const Mat_<_Tp>& m, const std::vector<Range>& ranges)
+    : Mat(m, ranges)
+{}
+
+// From an untyped Mat: sets _Tp's type code in flags first, then assigns m through Mat_::operator=.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(const Mat& m)
+    : Mat()
+{
+    flags = (flags & ~CV_MAT_TYPE_MASK) + traits::Type<_Tp>::value;
+    *this = m;
+}
+
+// Copy constructor: forwards to Mat's copy constructor.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(const Mat_& m)
+    : Mat(m)
+{}
+
+// _rows x _cols matrix over caller-provided _data with row step steps; all forwarded to Mat.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(int _rows, int _cols, _Tp* _data, size_t steps)
+    : Mat(_rows, _cols, traits::Type<_Tp>::value, _data, steps)
+{}
+
+// Sub-matrix of m for the given row and column ranges; forwards to Mat(Mat, Range, Range).
+template<typename _Tp> inline Mat_<_Tp>::Mat_(const Mat_& m, const Range& _rowRange, const Range& _colRange)
+    : Mat(m, _rowRange, _colRange)
+{}
+
+// Sub-matrix of m for the rectangle roi; forwards to Mat(Mat, Rect).
+template<typename _Tp> inline Mat_<_Tp>::Mat_(const Mat_& m, const Rect& roi)
+    : Mat(m, roi)
+{}
+
+// (n / channels) x 1 matrix over the Vec's storage; n must be a multiple of channels; cloned when copyData is set.
+template<typename _Tp> template<int n> inline Mat_<_Tp>::Mat_(const Vec<typename DataType<_Tp>::channel_type, n>& vec, bool copyData)
+    : Mat(n / DataType<_Tp>::channels, 1, traits::Type<_Tp>::value, (void*)&vec)
+{
+    CV_Assert(n%DataType<_Tp>::channels == 0);
+    if( copyData )
+        *this = clone();
+}
+
+// m x (n / channels) matrix over the Matx's storage; n must be a multiple of channels; cloned when copyData is set.
+template<typename _Tp> template<int m, int n> inline Mat_<_Tp>::Mat_(const Matx<typename DataType<_Tp>::channel_type, m, n>& M, bool copyData)
+    : Mat(m, n / DataType<_Tp>::channels, traits::Type<_Tp>::value, (void*)&M)
+{
+    CV_Assert(n % DataType<_Tp>::channels == 0);
+    if( copyData )
+        *this = clone();
+}
+
+// Wraps the storage of a 2D point as a (2/channels) x 1 matrix of _Tp; with copyData it is replaced by clone().
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(const Point_<typename DataType<_Tp>::channel_type>& pt, bool copyData)
+    : Mat(2 / DataType<_Tp>::channels, 1, traits::Type<_Tp>::value, (void*)&pt)
+{
+    CV_Assert(2 % DataType<_Tp>::channels == 0); // the 2 coordinates must form whole _Tp elements
+    if( copyData ) { *this = clone(); }
+}
+
+// Wraps the storage of a 3D point as a (3/channels) x 1 matrix of _Tp; with copyData it is replaced by clone().
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(const Point3_<typename DataType<_Tp>::channel_type>& pt, bool copyData)
+    : Mat(3 / DataType<_Tp>::channels, 1, traits::Type<_Tp>::value, (void*)&pt)
+{
+    CV_Assert(3 % DataType<_Tp>::channels == 0); // the 3 coordinates must form whole _Tp elements
+    if( copyData ) { *this = clone(); }
+}
+
+// From a comma initializer: forwarded unchanged to the matching Mat constructor.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(const MatCommaInitializer_<_Tp>& commaInitializer)
+    : Mat(commaInitializer)
+{}
+
+// From std::vector (copyData is forwarded to Mat).
+template<typename _Tp> inline Mat_<_Tp>::Mat_(const std::vector<_Tp>& vec, bool copyData)
+    : Mat(vec, copyData)
+{}
+
+// From a flat initializer list of elements.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(std::initializer_list<_Tp> list)
+    : Mat(list)
+{}
+
+// From explicit sizes plus an initializer list of elements.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(const std::initializer_list<int> sizes, std::initializer_list<_Tp> list)
+    : Mat(sizes, list)
+{}
+
+// From std::array (copyData is forwarded to Mat).
+template<typename _Tp> template<std::size_t _Nm> inline Mat_<_Tp>::Mat_(const std::array<_Tp, _Nm>& arr, bool copyData)
+    : Mat(arr, copyData)
+{}
+
+// Assigns an untyped Mat:
+//  - empty source      -> release();
+//  - same type         -> plain Mat assignment;
+//  - same depth only   -> reassign from m.reshape() to _Tp's channel count;
+//  - otherwise         -> convertTo() this matrix's type (channel counts must already agree).
+template<typename _Tp> inline
+Mat_<_Tp>& Mat_<_Tp>::operator = (const Mat& m)
+{
+    if( m.empty() )
+        release();
+    else if( m.type() == traits::Type<_Tp>::value )
+        Mat::operator = (m);
+    else if( m.depth() == traits::Depth<_Tp>::value )
+        *this = m.reshape(DataType<_Tp>::channels, m.dims, 0);
+    else
+    {
+        CV_Assert(DataType<_Tp>::channels == m.channels() || m.empty());
+        m.convertTo(*this, type());
+    }
+    return *this;
+}
+
+// Copy assignment: defers to Mat::operator= and returns *this.
+template<typename _Tp> inline Mat_<_Tp>& Mat_<_Tp>::operator = (const Mat_& m)
+{
+    Mat::operator=(static_cast<const Mat&>(m));
+    return *this;
+}
+
+// Scalar assignment: views `s` as DataType<_Tp>::vec_type, wraps it in a Scalar and passes that to Mat::operator=.
+template<typename _Tp> inline Mat_<_Tp>& Mat_<_Tp>::operator = (const _Tp& s)
+{
+    using vec_t = typename DataType<_Tp>::vec_type;
+    Mat::operator=(Scalar((const vec_t&)s));
+    return *this;
+}
+
+// create(rows, cols): calls Mat::create with the element type pinned to _Tp.
+template<typename _Tp> inline void Mat_<_Tp>::create(int _rows, int _cols)
+{
+    Mat::create(_rows, _cols, traits::Type<_Tp>::value);
+}
+
+// create(Size): same, with the shape given as a Size.
+template<typename _Tp> inline void Mat_<_Tp>::create(Size _sz)
+{
+    Mat::create(_sz, traits::Type<_Tp>::value);
+}
+
+// create(dims, sizes): same, for an N-dimensional shape.
+template<typename _Tp> inline void Mat_<_Tp>::create(int _dims, const int* _sz)
+{
+    Mat::create(_dims, _sz, traits::Type<_Tp>::value);
+}
+
+// Calls Mat::release(), then re-stamps _Tp's type code into flags.
+template<typename _Tp> inline void Mat_<_Tp>::release()
+{
+    Mat::release();
+    flags = traits::Type<_Tp>::value + (flags & ~CV_MAT_TYPE_MASK);
+}
+
+// Typed wrapper: the result of Mat::cross(m) re-wrapped as Mat_.
+template<typename _Tp> inline Mat_<_Tp> Mat_<_Tp>::cross(const Mat_& m) const
+{
+    return Mat_(Mat::cross(m));
+}
+
+// Conversion to another element type: builds Mat_<T2> from this matrix viewed as a plain Mat.
+template<typename _Tp> template<typename T2> inline Mat_<_Tp>::operator Mat_<T2>() const
+{
+    return Mat_<T2>(static_cast<const Mat&>(*this));
+}
+
+// Row `y` as a sub-matrix: rows [y, y+1), all columns.
+template<typename _Tp> inline Mat_<_Tp> Mat_<_Tp>::row(int y) const
+{
+    return Mat_<_Tp>(*this, Range(y, y + 1), Range::all());
+}
+
+// Column `x` as a sub-matrix: all rows, columns [x, x+1).
+template<typename _Tp> inline Mat_<_Tp> Mat_<_Tp>::col(int x) const
+{
+    return Mat_<_Tp>(*this, Range::all(), Range(x, x + 1));
+}
+
+// Result of Mat::diag(d), re-wrapped as Mat_.
+template<typename _Tp> inline Mat_<_Tp> Mat_<_Tp>::diag(int d) const
+{
+    return Mat_<_Tp>(Mat::diag(d));
+}
+
+// Result of Mat::clone(), re-wrapped as Mat_.
+template<typename _Tp> inline Mat_<_Tp> Mat_<_Tp>::clone() const
+{
+    return Mat_<_Tp>(Mat::clone());
+}
+
+// Element size, taken from sizeof(_Tp); debug builds cross-check it against Mat::elemSize().
+template<typename _Tp> inline size_t Mat_<_Tp>::elemSize() const
+{
+    CV_DbgAssert( Mat::elemSize() == sizeof(_Tp) );
+    return sizeof(_Tp);
+}
+
+// Size of one channel of an element: sizeof(_Tp) divided by the channel count; debug-checked against Mat::elemSize1().
+template<typename _Tp> inline size_t Mat_<_Tp>::elemSize1() const
+{
+    CV_DbgAssert( Mat::elemSize1() == sizeof(_Tp) / DataType<_Tp>::channels );
+    return sizeof(_Tp) / DataType<_Tp>::channels;
+}
+
+// Type code of _Tp, known at compile time; debug builds verify that the Mat header agrees.
+template<typename _Tp> inline int Mat_<_Tp>::type() const
+{
+    CV_DbgAssert( Mat::type() == traits::Type<_Tp>::value );
+    return traits::Type<_Tp>::value;
+}
+
+// Depth code of _Tp, known at compile time; debug builds verify that the Mat header agrees.
+template<typename _Tp> inline int Mat_<_Tp>::depth() const
+{
+    CV_DbgAssert( Mat::depth() == traits::Depth<_Tp>::value );
+    return traits::Depth<_Tp>::value;
+}
+
+// Channel count of _Tp, known at compile time; debug builds verify that the Mat header agrees.
+template<typename _Tp> inline int Mat_<_Tp>::channels() const
+{
+    CV_DbgAssert( Mat::channels() == DataType<_Tp>::channels );
+    return DataType<_Tp>::channels;
+}
+
+// step.p[i] divided by the element size (no range check on `i`).
+template<typename _Tp> inline size_t Mat_<_Tp>::stepT(int i) const
+{
+    return step.p[i] / elemSize();
+}
+
+// step.p[i] divided by the single-channel element size (no range check on `i`).
+template<typename _Tp> inline size_t Mat_<_Tp>::step1(int i) const
+{
+    return step.p[i] / elemSize1();
+}
+
+// Forwards to Mat::adjustROI and returns its result re-typed as Mat_<_Tp>&.
+template<typename _Tp> inline Mat_<_Tp>& Mat_<_Tp>::adjustROI( int dtop, int dbottom, int dleft, int dright )
+{
+    return static_cast<Mat_<_Tp>&>(Mat::adjustROI(dtop, dbottom, dleft, dright));
+}
+
+// Sub-matrix by row range and column range: builds a Mat_ with the matching range constructor.
+template<typename _Tp> inline Mat_<_Tp> Mat_<_Tp>::operator()( const Range& _rowRange, const Range& _colRange ) const
+{
+    return Mat_(*this, _rowRange, _colRange);
+}
+
+// Sub-matrix by rectangle.
+template<typename _Tp> inline Mat_<_Tp> Mat_<_Tp>::operator()( const Rect& roi ) const
+{
+    return Mat_(*this, roi);
+}
+
+// Sub-array by an array of ranges.
+template<typename _Tp> inline Mat_<_Tp> Mat_<_Tp>::operator()( const Range* ranges ) const
+{
+    return Mat_(*this, ranges);
+}
+
+// Sub-array by a vector of ranges.
+template<typename _Tp> inline Mat_<_Tp> Mat_<_Tp>::operator()(const std::vector<Range>& ranges) const
+{
+    return Mat_(*this, ranges);
+}
+
+// Typed pointer to the start of row `y` (data + y * step.p[0]); the index is range-checked in debug builds only.
+template<typename _Tp> inline _Tp* Mat_<_Tp>::operator [](int y)
+{
+    CV_DbgAssert( 0 <= y && y < size.p[0] );
+    return reinterpret_cast<_Tp*>(data + y*step.p[0]);
+}
+
+// Read-only variant of the row accessor above.
+template<typename _Tp> inline const _Tp* Mat_<_Tp>::operator [](int y) const
+{
+    CV_DbgAssert( 0 <= y && y < size.p[0] );
+    return reinterpret_cast<const _Tp*>(data + y*step.p[0]);
+}
+
+// Element at row i0, column i1. All validity checks are CV_DbgAssert, so they run in debug builds only.
+template<typename _Tp> inline _Tp& Mat_<_Tp>::operator ()(int i0, int i1)
+{
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    CV_DbgAssert(type() == traits::Type<_Tp>::value);
+    return reinterpret_cast<_Tp*>(data + step.p[0] * i0)[i1];
+}
+
+// Read-only variant of the (row, column) accessor above.
+template<typename _Tp> inline const _Tp& Mat_<_Tp>::operator ()(int i0, int i1) const
+{
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    CV_DbgAssert(type() == traits::Type<_Tp>::value);
+    return reinterpret_cast<const _Tp*>(data + step.p[0] * i0)[i1];
+}
+
+// Element at pt: pt.y selects the row, pt.x the column. All validity checks run in debug builds only.
+template<typename _Tp> inline _Tp& Mat_<_Tp>::operator ()(Point pt)
+{
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)pt.y < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)pt.x < (unsigned)size.p[1]);
+    CV_DbgAssert(type() == traits::Type<_Tp>::value);
+    return reinterpret_cast<_Tp*>(data + step.p[0] * pt.y)[pt.x];
+}
+
+// Read-only variant of the Point accessor above.
+template<typename _Tp> inline const _Tp& Mat_<_Tp>::operator ()(Point pt) const
+{
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)pt.y < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)pt.x < (unsigned)size.p[1]);
+    CV_DbgAssert(type() == traits::Type<_Tp>::value);
+    return reinterpret_cast<const _Tp*>(data + step.p[0] * pt.y)[pt.x];
+}
+
+// Element addressed by an index array; forwarded to Mat::at<_Tp>.
+template<typename _Tp> inline _Tp& Mat_<_Tp>::operator ()(const int* idx)
+{
+    return Mat::at<_Tp>(idx);
+}
+
+// Read-only variant of the index-array accessor.
+template<typename _Tp> inline const _Tp& Mat_<_Tp>::operator ()(const int* idx) const
+{
+    return Mat::at<_Tp>(idx);
+}
+
+// Element addressed by a Vec<int, n> index; forwarded to Mat::at<_Tp>.
+template<typename _Tp> template<int n> inline _Tp& Mat_<_Tp>::operator ()(const Vec<int, n>& idx)
+{
+    return Mat::at<_Tp>(idx);
+}
+
+// Read-only variant of the Vec-index accessor.
+template<typename _Tp> template<int n> inline const _Tp& Mat_<_Tp>::operator ()(const Vec<int, n>& idx) const
+{
+    return Mat::at<_Tp>(idx);
+}
+
+// Element addressed by a single index; forwarded to at<_Tp>(i0).
+template<typename _Tp> inline _Tp& Mat_<_Tp>::operator ()(int i0)
+{
+    return this->at<_Tp>(i0);
+}
+
+// Read-only variant of the single-index accessor.
+template<typename _Tp> inline const _Tp& Mat_<_Tp>::operator ()(int i0) const
+{
+    return this->at<_Tp>(i0);
+}
+
+// Element addressed by three indices; forwarded to at<_Tp>(i0, i1, i2).
+template<typename _Tp> inline _Tp& Mat_<_Tp>::operator ()(int i0, int i1, int i2)
+{
+    return this->at<_Tp>(i0, i1, i2);
+}
+
+// Read-only variant of the three-index accessor.
+template<typename _Tp> inline const _Tp& Mat_<_Tp>::operator ()(int i0, int i1, int i2) const
+{
+    return this->at<_Tp>(i0, i1, i2);
+}
+
+// Conversion to std::vector: copyTo() into a local vector, which is then returned.
+template<typename _Tp> inline Mat_<_Tp>::operator std::vector<_Tp>() const
+{
+    std::vector<_Tp> out;
+    copyTo(out);
+    return out;
+}
+
+// Conversion to std::array: copyTo() into a local array, which is then returned.
+template<typename _Tp> template<std::size_t _Nm> inline Mat_<_Tp>::operator std::array<_Tp, _Nm>() const
+{
+    std::array<_Tp, _Nm> out;
+    copyTo(out);
+    return out;
+}
+
+// Conversion to Vec: n scalars must form whole _Tp elements; the work is done by Mat's own Vec conversion operator.
+template<typename _Tp> template<int n> inline
+Mat_<_Tp>::operator Vec<typename DataType<_Tp>::channel_type, n>() const
+{
+    CV_Assert(n % DataType<_Tp>::channels == 0);
+#if defined _MSC_VER
+    const Mat* asMat = static_cast<const Mat*>(this); // workaround for MSVS <= 2012 compiler bugs (but GCC 4.6 dislikes this workaround)
+    return asMat->operator Vec<typename DataType<_Tp>::channel_type, n>();
+#else
+    return this->Mat::operator Vec<typename DataType<_Tp>::channel_type, n>();
+#endif
+}
+
+// Conversion to Matx: n scalars per row must form whole _Tp elements; the work is done by Mat's own Matx conversion operator.
+template<typename _Tp> template<int m, int n> inline
+Mat_<_Tp>::operator Matx<typename DataType<_Tp>::channel_type, m, n>() const
+{
+    CV_Assert(n % DataType<_Tp>::channels == 0);
+#if defined _MSC_VER
+    const Mat* asMat = static_cast<const Mat*>(this); // workaround for MSVS <= 2012 compiler bugs (but GCC 4.6 dislikes this workaround)
+    Matx<typename DataType<_Tp>::channel_type, m, n> result = asMat->operator Matx<typename DataType<_Tp>::channel_type, m, n>();
+    return result;
+#else
+    Matx<typename DataType<_Tp>::channel_type, m, n> result = this->Mat::operator Matx<typename DataType<_Tp>::channel_type, m, n>();
+    return result;
+#endif
+}
+
+// Read-only iterator from Mat::begin<_Tp>().
+template<typename _Tp> inline MatConstIterator_<_Tp> Mat_<_Tp>::begin() const
+{
+    return Mat::begin<_Tp>();
+}
+
+// Read-only reverse iterator from Mat::rbegin<_Tp>().
+template<typename _Tp> inline std::reverse_iterator<MatConstIterator_<_Tp>> Mat_<_Tp>::rbegin() const
+{
+    return Mat::rbegin<_Tp>();
+}
+
+// Read-only iterator from Mat::end<_Tp>().
+template<typename _Tp> inline MatConstIterator_<_Tp> Mat_<_Tp>::end() const
+{
+    return Mat::end<_Tp>();
+}
+
+// Read-only reverse iterator from Mat::rend<_Tp>().
+template<typename _Tp> inline std::reverse_iterator<MatConstIterator_<_Tp>> Mat_<_Tp>::rend() const
+{
+    return Mat::rend<_Tp>();
+}
+
+// Mutable iterator from Mat::begin<_Tp>().
+template<typename _Tp> inline MatIterator_<_Tp> Mat_<_Tp>::begin()
+{
+    return Mat::begin<_Tp>();
+}
+
+// Mutable reverse iterator from Mat::rbegin<_Tp>().
+template<typename _Tp> inline std::reverse_iterator<MatIterator_<_Tp>> Mat_<_Tp>::rbegin()
+{
+    return Mat::rbegin<_Tp>();
+}
+
+// Mutable iterator from Mat::end<_Tp>().
+template<typename _Tp> inline MatIterator_<_Tp> Mat_<_Tp>::end()
+{
+    return Mat::end<_Tp>();
+}
+
+// Mutable reverse iterator from Mat::rend<_Tp>().
+template<typename _Tp> inline std::reverse_iterator<MatIterator_<_Tp>> Mat_<_Tp>::rend()
+{
+    return Mat::rend<_Tp>();
+}
+
+// Passes `operation` to Mat::forEach with the element type fixed to _Tp.
+template<typename _Tp> template<typename Functor> inline void Mat_<_Tp>::forEach(const Functor& operation) {
+    Mat::forEach<_Tp, Functor>(operation);
+}
+
+// Const overload: same forwarding, invoked on a const matrix.
+template<typename _Tp> template<typename Functor> inline void Mat_<_Tp>::forEach(const Functor& operation) const {
+    Mat::forEach<_Tp, Functor>(operation);
+}
+
+// Move constructor: hands `m` to Mat's move constructor.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(Mat_&& m)
+    : Mat(std::move(m))
+{
+}
+
+// Move assignment: hands `m` to Mat's move assignment and returns *this.
+template<typename _Tp> inline Mat_<_Tp>& Mat_<_Tp>::operator = (Mat_&& m)
+{
+    Mat::operator = (std::move(m));
+    return *this;
+}
+
+// Builds a typed matrix from an rvalue Mat; the real work happens in operator=(Mat&&).
+template<typename _Tp> inline Mat_<_Tp>::Mat_(Mat&& m)
+    : Mat()
+{
+    flags = traits::Type<_Tp>::value + (flags & ~CV_MAT_TYPE_MASK); // stamp _Tp's type code into the empty header
+    *this = std::move(m);
+}
+
+// Move-assigns an untyped Mat, mirroring operator=(const Mat&):
+//  empty source -> release(); same type -> move m's header in; same depth only -> adopt
+//  m.reshape() to _Tp's channel count; otherwise -> convertTo() this matrix's type.
+// The channel check is CV_Assert (not CV_DbgAssert) so that release builds reject a channel
+// mismatch on this path exactly as the copy-assignment overload does, instead of proceeding
+// to convertTo() unchecked.
+template<typename _Tp> inline Mat_<_Tp>& Mat_<_Tp>::operator = (Mat&& m)
+{
+    if( m.empty() )
+        release();
+    else if( traits::Type<_Tp>::value == m.type() )
+        Mat::operator = (std::move(m));
+    else if( traits::Depth<_Tp>::value == m.depth() )
+        Mat::operator = (m.reshape(DataType<_Tp>::channels, m.dims, 0));
+    else
+    {
+        CV_Assert(DataType<_Tp>::channels == m.channels());
+        m.convertTo(*this, type());
+    }
+    return *this;
+}
+
+// Builds a typed matrix from a matrix expression: the expression is turned into a Mat, which is then assigned.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(MatExpr&& e)
+    : Mat()
+{
+    flags = traits::Type<_Tp>::value + (flags & ~CV_MAT_TYPE_MASK); // stamp _Tp's type code into the empty header
+    *this = Mat(e);
+}
+
+
+///////////////////////////// SparseMat /////////////////////////////
+
+// Returns a new SparseMat that has been filled by copyTo().
+inline SparseMat SparseMat::clone() const
+{
+    SparseMat copy;
+    copyTo(copy);
+    return copy;
+}
+
+// Element size, decoded from flags by CV_ELEM_SIZE.
+inline size_t SparseMat::elemSize() const
+{
+    return CV_ELEM_SIZE(flags);
+}
+
+// Single-channel element size, decoded from flags by CV_ELEM_SIZE1.
+inline size_t SparseMat::elemSize1() const
+{
+    return CV_ELEM_SIZE1(flags);
+}
+
+// Type code, decoded from flags by CV_MAT_TYPE.
+inline int SparseMat::type() const
+{
+    return CV_MAT_TYPE(flags);
+}
+
+// Depth code, decoded from flags by CV_MAT_DEPTH.
+inline int SparseMat::depth() const
+{
+    return CV_MAT_DEPTH(flags);
+}
+
+// Channel count, decoded from flags by CV_MAT_CN.
+inline int SparseMat::channels() const
+{
+    return CV_MAT_CN(flags);
+}
+
+// Pointer to the header's size array, or 0 when the matrix has no header.
+inline const int* SparseMat::size() const
+{
+    if( !hdr ) return 0;
+    return hdr->size;
+}
+
+// Size along dimension `i`, or 0 when the matrix has no header; debug builds assert i < dims.
+inline
+int SparseMat::size(int i) const
+{
+    if( !hdr )
+        return 0;
+    CV_DbgAssert((unsigned)i < (unsigned)hdr->dims);
+    return hdr->size[i];
+}
+
+// hdr->dims, or 0 when the matrix has no header.
+inline int SparseMat::dims() const
+{
+    return !hdr ? 0 : hdr->dims;
+}
+
+// hdr->nodeCount, or 0 when the matrix has no header.
+inline size_t SparseMat::nzcount() const
+{
+    return !hdr ? 0 : hdr->nodeCount;
+}
+
+// ref<_Tp>(i0): reference to the element ptr() returns; ptr()'s boolean flag is true (presumably "create if missing" - confirm).
+template<typename _Tp> inline _Tp& SparseMat::ref(int i0, size_t* hashval)
+{
+    return *reinterpret_cast<_Tp*>(this->ptr(i0, true, hashval));
+}
+
+// Two-index variant.
+template<typename _Tp> inline _Tp& SparseMat::ref(int i0, int i1, size_t* hashval)
+{
+    return *reinterpret_cast<_Tp*>(this->ptr(i0, i1, true, hashval));
+}
+
+// Three-index variant.
+template<typename _Tp> inline _Tp& SparseMat::ref(int i0, int i1, int i2, size_t* hashval)
+{
+    return *reinterpret_cast<_Tp*>(this->ptr(i0, i1, i2, true, hashval));
+}
+
+// Index-array variant.
+template<typename _Tp> inline _Tp& SparseMat::ref(const int* idx, size_t* hashval)
+{
+    return *reinterpret_cast<_Tp*>(this->ptr(idx, true, hashval));
+}
+
+// value<_Tp>(i0): copy of the element ptr() finds (boolean flag false), or _Tp() when ptr() returns null.
+template<typename _Tp> inline _Tp SparseMat::value(int i0, size_t* hashval) const
+{
+    const _Tp* elem = (const _Tp*)const_cast<SparseMat*>(this)->ptr(i0, false, hashval);
+    return elem ? *elem : _Tp();
+}
+
+// Two-index variant.
+template<typename _Tp> inline _Tp SparseMat::value(int i0, int i1, size_t* hashval) const
+{
+    const _Tp* elem = (const _Tp*)const_cast<SparseMat*>(this)->ptr(i0, i1, false, hashval);
+    return elem ? *elem : _Tp();
+}
+
+// Three-index variant.
+template<typename _Tp> inline _Tp SparseMat::value(int i0, int i1, int i2, size_t* hashval) const
+{
+    const _Tp* elem = (const _Tp*)const_cast<SparseMat*>(this)->ptr(i0, i1, i2, false, hashval);
+    return elem ? *elem : _Tp();
+}
+
+// Index-array variant.
+template<typename _Tp> inline _Tp SparseMat::value(const int* idx, size_t* hashval) const
+{
+    const _Tp* elem = (const _Tp*)const_cast<SparseMat*>(this)->ptr(idx, false, hashval);
+    return elem ? *elem : _Tp();
+}
+
+// find<_Tp>(i0): typed read-only pointer to whatever ptr() returns with its boolean flag set to false.
+template<typename _Tp> inline const _Tp* SparseMat::find(int i0, size_t* hashval) const
+{
+    return (const _Tp*)const_cast<SparseMat*>(this)->ptr(i0, false, hashval);
+}
+
+// Two-index variant.
+template<typename _Tp> inline const _Tp* SparseMat::find(int i0, int i1, size_t* hashval) const
+{
+    return (const _Tp*)const_cast<SparseMat*>(this)->ptr(i0, i1, false, hashval);
+}
+
+// Three-index variant.
+template<typename _Tp> inline const _Tp* SparseMat::find(int i0, int i1, int i2, size_t* hashval) const
+{
+    return (const _Tp*)const_cast<SparseMat*>(this)->ptr(i0, i1, i2, false, hashval);
+}
+
+// Index-array variant.
+template<typename _Tp> inline const _Tp* SparseMat::find(const int* idx, size_t* hashval) const
+{
+    return (const _Tp*)const_cast<SparseMat*>(this)->ptr(idx, false, hashval);
+}
+
+// Value stored in node `n`: located hdr->valueOffset bytes past the node's address. hdr is not null-checked.
+template<typename _Tp> inline _Tp& SparseMat::value(Node* n)
+{
+    return *reinterpret_cast<_Tp*>(reinterpret_cast<uchar*>(n) + hdr->valueOffset);
+}
+
+// Read-only variant of the node value accessor above.
+template<typename _Tp> inline const _Tp& SparseMat::value(const Node* n) const
+{
+    return *reinterpret_cast<const _Tp*>(reinterpret_cast<const uchar*>(n) + hdr->valueOffset);
+}
+
+// Node located at offset `nidx` inside hdr->pool. Neither hdr nor nidx is checked.
+inline SparseMat::Node* SparseMat::node(size_t nidx)
+{
+    return static_cast<Node*>(static_cast<void*>(&hdr->pool[nidx]));
+}
+
+// Read-only variant of the node accessor above.
+inline const SparseMat::Node* SparseMat::node(size_t nidx) const
+{
+    return static_cast<const Node*>(static_cast<const void*>(&hdr->pool[nidx]));
+}
+
+// Untyped mutable iterator constructed on this matrix.
+inline SparseMatIterator SparseMat::begin()
+{
+    return SparseMatIterator(this);
+}
+
+// Untyped read-only iterator constructed on this matrix.
+inline SparseMatConstIterator SparseMat::begin() const
+{
+    return SparseMatConstIterator(this);
+}
+
+// Untyped mutable end iterator: constructed on this matrix, then moved with seekEnd().
+inline SparseMatIterator SparseMat::end()
+{
+    SparseMatIterator last(this);
+    last.seekEnd();
+    return last;
+}
+
+// Untyped read-only end iterator: constructed on this matrix, then moved with seekEnd().
+inline SparseMatConstIterator SparseMat::end() const
+{
+    SparseMatConstIterator last(this);
+    last.seekEnd();
+    return last;
+}
+
+// Typed mutable iterator constructed on this matrix.
+template<typename _Tp> inline SparseMatIterator_<_Tp> SparseMat::begin()
+{
+    return SparseMatIterator_<_Tp>(this);
+}
+
+// Typed read-only iterator constructed on this matrix.
+template<typename _Tp> inline SparseMatConstIterator_<_Tp> SparseMat::begin() const
+{
+    return SparseMatConstIterator_<_Tp>(this);
+}
+
+// Typed mutable end iterator: constructed on this matrix, then moved with seekEnd().
+template<typename _Tp> inline SparseMatIterator_<_Tp> SparseMat::end()
+{
+    SparseMatIterator_<_Tp> last(this);
+    last.seekEnd();
+    return last;
+}
+
+// Typed read-only end iterator: constructed on this matrix, then moved with seekEnd().
+template<typename _Tp> inline SparseMatConstIterator_<_Tp> SparseMat::end() const
+{
+    SparseMatConstIterator_<_Tp> last(this);
+    last.seekEnd();
+    return last;
+}
+
+
+
+///////////////////////////// SparseMat_ ////////////////////////////
+
+// Default constructor: only sets flags, to MAGIC_VAL plus _Tp's type code.
+template<typename _Tp> inline SparseMat_<_Tp>::SparseMat_()
+{
+    flags = traits::Type<_Tp>::value + MAGIC_VAL;
+}
+
+// Shape constructor: forwards (_dims, _sizes) to SparseMat with the element type fixed to _Tp.
+template<typename _Tp> inline SparseMat_<_Tp>::SparseMat_(int _dims, const int* _sizes)
+    : SparseMat(_dims, _sizes, traits::Type<_Tp>::value)
+{}
+
+// From an untyped SparseMat: typed assignment when the type already matches, convertTo() into *this otherwise.
+template<typename _Tp> inline SparseMat_<_Tp>::SparseMat_(const SparseMat& m)
+{
+    if( m.type() != traits::Type<_Tp>::value )
+        m.convertTo(*this, traits::Type<_Tp>::value);
+    else
+        *this = static_cast<const SparseMat_<_Tp>&>(m);
+}
+
+// Copy constructor: adopts m's flags and header pointer and, when a header exists, increments its refcount.
+template<typename _Tp> inline SparseMat_<_Tp>::SparseMat_(const SparseMat_<_Tp>& m)
+{
+    flags = m.flags;
+    hdr = m.hdr;
+    if( hdr )
+        CV_XADD(&hdr->refcount, 1);
+}
+
+// From a dense Mat: builds an untyped SparseMat from it, then assigns that to *this.
+template<typename _Tp> inline SparseMat_<_Tp>::SparseMat_(const Mat& m)
+{
+    const SparseMat sparse(m);
+    *this = sparse;
+}
+
+// Copy assignment: takes a reference on m's header first, then releases the current one and adopts m's flags/header.
+template<typename _Tp> inline SparseMat_<_Tp>& SparseMat_<_Tp>::operator = (const SparseMat_<_Tp>& m)
+{
+    if( this == &m )
+        return *this;
+    if( m.hdr )
+        CV_XADD(&m.hdr->refcount, 1);
+    release();
+    flags = m.flags;
+    hdr = m.hdr;
+    return *this;
+}
+
+// Assignment from an untyped SparseMat: typed assignment when the type matches, convertTo() into *this otherwise.
+template<typename _Tp> inline SparseMat_<_Tp>& SparseMat_<_Tp>::operator = (const SparseMat& m)
+{
+    if( m.type() == traits::Type<_Tp>::value )
+        return (*this = static_cast<const SparseMat_<_Tp>&>(m));
+    m.convertTo(*this, traits::Type<_Tp>::value);
+    return *this;
+}
+
+// Assignment from a dense Mat: goes through a temporary SparseMat built from it.
+template<typename _Tp> inline SparseMat_<_Tp>& SparseMat_<_Tp>::operator = (const Mat& m)
+{
+    return (*this = SparseMat(m));
+}
+
+// Returns a new SparseMat_ that has been filled by copyTo().
+template<typename _Tp> inline SparseMat_<_Tp> SparseMat_<_Tp>::clone() const
+{
+    SparseMat_<_Tp> copy;
+    this->copyTo(copy);
+    return copy;
+}
+
+// Calls SparseMat::create with the element type pinned to _Tp.
+template<typename _Tp> inline void SparseMat_<_Tp>::create(int _dims, const int* _sizes)
+{
+    SparseMat::create(_dims, _sizes, traits::Type<_Tp>::value);
+}
+
+// Type code of _Tp; a compile-time constant, flags is not consulted.
+template<typename _Tp> inline int SparseMat_<_Tp>::type() const
+{
+    return traits::Type<_Tp>::value;
+}
+
+// Depth code of _Tp; a compile-time constant, flags is not consulted.
+template<typename _Tp> inline int SparseMat_<_Tp>::depth() const
+{
+    return traits::Depth<_Tp>::value;
+}
+
+// Channel count of _Tp; a compile-time constant, flags is not consulted.
+template<typename _Tp> inline int SparseMat_<_Tp>::channels() const
+{
+    return DataType<_Tp>::channels;
+}
+
+// 1-index element reference; forwards to SparseMat::ref<_Tp>.
+template<typename _Tp> inline _Tp& SparseMat_<_Tp>::ref(int i0, size_t* hashval)
+{
+    return SparseMat::ref<_Tp>(i0, hashval);
+}
+
+// 1-index element value; forwards to SparseMat::value<_Tp>.
+template<typename _Tp> inline _Tp SparseMat_<_Tp>::operator()(int i0, size_t* hashval) const
+{
+    return SparseMat::value<_Tp>(i0, hashval);
+}
+
+// 2-index element reference; forwards to SparseMat::ref<_Tp>.
+template<typename _Tp> inline _Tp& SparseMat_<_Tp>::ref(int i0, int i1, size_t* hashval)
+{
+    return SparseMat::ref<_Tp>(i0, i1, hashval);
+}
+
+// 2-index element value; forwards to SparseMat::value<_Tp>.
+template<typename _Tp> inline _Tp SparseMat_<_Tp>::operator()(int i0, int i1, size_t* hashval) const
+{
+    return SparseMat::value<_Tp>(i0, i1, hashval);
+}
+
+// 3-index element reference; forwards to SparseMat::ref<_Tp>.
+template<typename _Tp> inline _Tp& SparseMat_<_Tp>::ref(int i0, int i1, int i2, size_t* hashval)
+{
+    return SparseMat::ref<_Tp>(i0, i1, i2, hashval);
+}
+
+// 3-index element value; forwards to SparseMat::value<_Tp>.
+template<typename _Tp> inline _Tp SparseMat_<_Tp>::operator()(int i0, int i1, int i2, size_t* hashval) const
+{
+    return SparseMat::value<_Tp>(i0, i1, i2, hashval);
+}
+
+// Index-array element reference; forwards to SparseMat::ref<_Tp>.
+template<typename _Tp> inline _Tp& SparseMat_<_Tp>::ref(const int* idx, size_t* hashval)
+{
+    return SparseMat::ref<_Tp>(idx, hashval);
+}
+
+// Index-array element value; forwards to SparseMat::value<_Tp>.
+template<typename _Tp> inline _Tp SparseMat_<_Tp>::operator()(const int* idx, size_t* hashval) const
+{
+    return SparseMat::value<_Tp>(idx, hashval);
+}
+
+// Typed mutable iterator constructed on this matrix.
+template<typename _Tp> inline SparseMatIterator_<_Tp> SparseMat_<_Tp>::begin()
+{
+    return SparseMatIterator_<_Tp>(this);
+}
+
+// Typed read-only iterator constructed on this matrix.
+template<typename _Tp> inline SparseMatConstIterator_<_Tp> SparseMat_<_Tp>::begin() const
+{
+    return SparseMatConstIterator_<_Tp>(this);
+}
+
+// Typed mutable end iterator: constructed on this matrix, then moved with seekEnd().
+template<typename _Tp> inline SparseMatIterator_<_Tp> SparseMat_<_Tp>::end()
+{
+    SparseMatIterator_<_Tp> last(this);
+    last.seekEnd();
+    return last;
+}
+
+// Typed read-only end iterator: constructed on this matrix, then moved with seekEnd().
+template<typename _Tp> inline SparseMatConstIterator_<_Tp> SparseMat_<_Tp>::end() const
+{
+    SparseMatConstIterator_<_Tp> last(this);
+    last.seekEnd();
+    return last;
+}
+
+
+
+////////////////////////// MatConstIterator /////////////////////////
+
+// Default constructor: detached iterator (null matrix, null pointers, zero element size).
+inline MatConstIterator::MatConstIterator()
+    : m(0), elemSize(0), ptr(0), sliceStart(0), sliceEnd(0)
+{}
+
+// Iterator at the start of *_m. A null _m is no longer dereferenced: it yields the same state as the default constructor.
+inline MatConstIterator::MatConstIterator(const Mat* _m)
+    : m(_m), elemSize(_m ? _m->elemSize() : 0), ptr(0), sliceStart(0), sliceEnd(0)
+{
+    if( m && m->isContinuous() )
+    {
+        CV_Assert(!m->empty());
+        sliceStart = m->ptr();
+        sliceEnd = sliceStart + m->total()*elemSize;
+    }
+    if( m ) seek((const int*)0); // null matrix: nothing to seek in, stay detached
+}
+
+// Iterator at (_row, _col); requires a non-null matrix with at most 2 dimensions.
+inline MatConstIterator::MatConstIterator(const Mat* _m, int _row, int _col)
+    : m(_m), elemSize(_m ? _m->elemSize() : 0), ptr(0), sliceStart(0), sliceEnd(0) // null-safe, so CV_Assert below can report a null _m
+{
+    CV_Assert(m && m->dims <= 2);
+    if( m->isContinuous() )
+    {
+        CV_Assert(!m->empty());
+        sliceStart = m->ptr();
+        sliceEnd = sliceStart + m->total()*elemSize;
+    }
+    int idx[] = {_row, _col};
+    seek(idx);
+}
+
+// Iterator at row _pt.y, column _pt.x; requires a non-null matrix with at most 2 dimensions.
+inline MatConstIterator::MatConstIterator(const Mat* _m, Point _pt)
+    : m(_m), elemSize(_m ? _m->elemSize() : 0), ptr(0), sliceStart(0), sliceEnd(0) // null-safe, so CV_Assert below can report a null _m
+{
+    CV_Assert(m && m->dims <= 2);
+    if( m->isContinuous() )
+    {
+        CV_Assert(!m->empty());
+        sliceStart = m->ptr();
+        sliceEnd = sliceStart + m->total()*elemSize;
+    }
+    int idx[] = {_pt.y, _pt.x};
+    seek(idx);
+}
+
+// Copy constructor: member-wise copy of the five iterator fields.
+inline MatConstIterator::MatConstIterator(const MatConstIterator& it)
+    : m(it.m), elemSize(it.elemSize), ptr(it.ptr), sliceStart(it.sliceStart), sliceEnd(it.sliceEnd)
+{}
+
+// Copy assignment: member-wise copy of the five iterator fields.
+inline MatConstIterator& MatConstIterator::operator = (const MatConstIterator& it )
+{
+    m = it.m; elemSize = it.elemSize;
+    ptr = it.ptr; sliceStart = it.sliceStart; sliceEnd = it.sliceEnd;
+    return *this;
+}
+
+// Dereference: returns the current raw element pointer.
+inline const uchar* MatConstIterator::operator *() const
+{
+    return ptr;
+}
+
+// Moves by `ofs` elements. Fast path: bump ptr inside the current slice; if that leaves the slice, undo and let seek() reposition.
+inline MatConstIterator& MatConstIterator::operator += (ptrdiff_t ofs)
+{
+    if( !m || ofs == 0 ) return *this;
+    const ptrdiff_t byteOfs = ofs*elemSize;
+    ptr += byteOfs;
+    if( sliceEnd <= ptr || ptr < sliceStart )
+    {
+        ptr -= byteOfs;
+        seek(ofs, true);
+    }
+    return *this;
+}
+
+// Moves back by `ofs` elements; implemented as += with the negated offset.
+inline MatConstIterator& MatConstIterator::operator -= (ptrdiff_t ofs)
+{
+    return (*this += -ofs);
+}
+
+// Pre-decrement: step ptr back one element; if that leaves the current slice, undo the step and
+// reposition with seek(-1, true). A detached iterator (null m) is left untouched.
+inline MatConstIterator& MatConstIterator::operator --()
+{
+    if( !m ) return *this;
+    ptr -= elemSize;
+    if( ptr < sliceStart ) { ptr += elemSize; seek(-1, true); }
+    return *this;
+}
+
+// Post-decrement: returns the previous value; the move itself goes through operator+=(-1).
+inline
+MatConstIterator MatConstIterator::operator --(int)
+{
+    MatConstIterator prev(*this);
+    *this += -1;
+    return prev;
+}
+
+// Pre-increment: step ptr forward one element; if that reaches the slice end, undo the step and
+// reposition with seek(1, true). A detached iterator (null m) is left untouched.
+inline MatConstIterator& MatConstIterator::operator ++()
+{
+    if( !m ) return *this;
+    ptr += elemSize;
+    if( ptr >= sliceEnd ) { ptr -= elemSize; seek(1, true); }
+    return *this;
+}
+
+// Post-increment: returns the previous value; the move itself goes through operator+=(1).
+inline MatConstIterator MatConstIterator::operator ++(int)
+{
+    MatConstIterator prev(*this);
+    *this += 1;
+    return prev;
+}
+
+
+
+// Equal when both iterators refer to the same matrix and the same element pointer.
+static inline bool operator == (const MatConstIterator& a, const MatConstIterator& b)
+{
+    return a.ptr == b.ptr && a.m == b.m;
+}
+
+// Negation of operator==.
+static inline bool operator != (const MatConstIterator& a, const MatConstIterator& b)
+{
+    return !(a == b);
+}
+
+// Ordering compares the raw element pointers only; the matrices are not compared.
+static inline bool operator < (const MatConstIterator& a, const MatConstIterator& b)
+{
+    return a.ptr < b.ptr;
+}
+
+// See operator<: raw element pointers only.
+static inline bool operator > (const MatConstIterator& a, const MatConstIterator& b)
+{
+    return a.ptr > b.ptr;
+}
+
+// See operator<: raw element pointers only.
+static inline bool operator <= (const MatConstIterator& a, const MatConstIterator& b)
+{
+    return a.ptr <= b.ptr;
+}
+
+// See operator<: raw element pointers only.
+static inline bool operator >= (const MatConstIterator& a, const MatConstIterator& b)
+{
+    return a.ptr >= b.ptr;
+}
+
+// Distance b - a in elements; iterators over different matrices yield the sentinel (size_t)(-1) >> 1.
+static inline ptrdiff_t operator - (const MatConstIterator& b, const MatConstIterator& a)
+{
+    if( b.m != a.m )
+        return ((size_t)(-1) >> 1);
+    if( b.sliceEnd == a.sliceEnd ) // same slice end: plain pointer distance divided by the element size
+        return (b.ptr - a.ptr)/static_cast<ptrdiff_t>(b.elemSize);
+    // otherwise compare the positions reported by lpos()
+    return b.lpos() - a.lpos();
+}
+
+// a + ofs: copy of `a` advanced by `ofs` elements.
+static inline MatConstIterator operator + (const MatConstIterator& a, ptrdiff_t ofs)
+{
+    MatConstIterator moved(a);
+    return moved += ofs;
+}
+
+// ofs + a: same as a + ofs.
+static inline MatConstIterator operator + (ptrdiff_t ofs, const MatConstIterator& a)
+{
+    MatConstIterator moved(a);
+    return moved += ofs;
+}
+
+// a - ofs: copy of `a` moved back by `ofs` elements.
+static inline MatConstIterator operator - (const MatConstIterator& a, ptrdiff_t ofs)
+{
+    MatConstIterator moved(a);
+    return moved += -ofs;
+}
+
+
+// Raw pointer to the element `i` positions away: dereferences a temporary copy moved by `i`.
+inline const uchar* MatConstIterator::operator [](ptrdiff_t i) const
+{
+    return *(*this + i);
+}
+
+
+
+///////////////////////// MatConstIterator_ /////////////////////////
+
+// Default constructor: nothing beyond the base-class default.
+template<typename _Tp> inline MatConstIterator_<_Tp>::MatConstIterator_()
+{}
+
+// From a typed matrix: forwards to MatConstIterator(const Mat*).
+template<typename _Tp> inline MatConstIterator_<_Tp>::MatConstIterator_(const Mat_<_Tp>* _m)
+    : MatConstIterator(_m)
+{}
+
+// From a typed matrix and a (row, column) position.
+template<typename _Tp> inline MatConstIterator_<_Tp>::MatConstIterator_(const Mat_<_Tp>* _m, int _row, int _col)
+    : MatConstIterator(_m, _row, _col)
+{}
+
+// From a typed matrix and a Point position.
+template<typename _Tp> inline MatConstIterator_<_Tp>::MatConstIterator_(const Mat_<_Tp>* _m, Point _pt)
+    : MatConstIterator(_m, _pt)
+{}
+
+// Copy constructor: forwards to the base-class copy constructor.
+template<typename _Tp> inline MatConstIterator_<_Tp>::MatConstIterator_(const MatConstIterator_& it)
+    : MatConstIterator(it)
+{}
+
+// Copy assignment: defers to the base-class assignment and returns *this.
+template<typename _Tp> inline MatConstIterator_<_Tp>& MatConstIterator_<_Tp>::operator = (const MatConstIterator_& it )
+{
+    MatConstIterator::operator = (it);
+    return *this;
+}
+
+// Dereference: the current element, read through the raw pointer as const _Tp.
+template<typename _Tp> inline const _Tp& MatConstIterator_<_Tp>::operator *() const
+{
+    return *reinterpret_cast<const _Tp*>(this->ptr);
+}
+
+// Advances by `ofs` elements using the base-class implementation; returns the typed iterator.
+template<typename _Tp> inline MatConstIterator_<_Tp>& MatConstIterator_<_Tp>::operator += (ptrdiff_t ofs)
+{
+    MatConstIterator::operator += (ofs);
+    return *this;
+}
+
+// Moves back by `ofs` elements; implemented as += with the negated offset.
+template<typename _Tp> inline MatConstIterator_<_Tp>& MatConstIterator_<_Tp>::operator -= (ptrdiff_t ofs)
+{
+    return (*this += -ofs);
+}
+
+// Pre-decrement: base-class step, typed return.
+template<typename _Tp> inline MatConstIterator_<_Tp>& MatConstIterator_<_Tp>::operator --()
+{
+    MatConstIterator::operator --();
+    return *this;
+}
+
+// Post-decrement: returns the iterator's previous value.
+template<typename _Tp> inline MatConstIterator_<_Tp> MatConstIterator_<_Tp>::operator --(int)
+{
+    MatConstIterator_ prev(*this);
+    MatConstIterator::operator --();
+    return prev;
+}
+
+// Pre-increment: base-class step, typed return.
+template<typename _Tp> inline MatConstIterator_<_Tp>& MatConstIterator_<_Tp>::operator ++()
+{
+    MatConstIterator::operator ++();
+    return *this;
+}
+
+// Post-increment: returns the iterator's previous value.
+template<typename _Tp> inline MatConstIterator_<_Tp> MatConstIterator_<_Tp>::operator ++(int)
+{
+    MatConstIterator_ prev(*this);
+    MatConstIterator::operator ++();
+    return prev;
+}
+
+
+// Current position as Point(x = column, y = row); a detached iterator yields Point(). Debug builds assert dims <= 2.
+template<typename _Tp> inline
+Point MatConstIterator_<_Tp>::pos() const
+{
+    if( !m )
+        return Point();
+    CV_DbgAssert( m->dims <= 2 );
+    if( !m->isContinuous() )
+    {
+        // Non-continuous: work in bytes, using the row step to find the row and sizeof(_Tp) for the column.
+        ptrdiff_t byteOfs = (uchar*)ptr - m->data;
+        int row = (int)(byteOfs / m->step);
+        int col = (int)((byteOfs - row * m->step)/sizeof(_Tp));
+        return Point(col, row);
+    }
+    // Continuous: the element offset from the data start splits directly by the column count.
+    ptrdiff_t elemOfs = (const _Tp*)ptr - (const _Tp*)m->data;
+    int row = (int)(elemOfs / m->cols);
+    int col = (int)(elemOfs - (ptrdiff_t)row * m->cols);
+    return Point(col, row);
+}
+
+
+// Equal when both iterators refer to the same matrix and the same element pointer.
+template<typename _Tp> static inline bool operator == (const MatConstIterator_<_Tp>& a, const MatConstIterator_<_Tp>& b)
+{
+    return a.ptr == b.ptr && a.m == b.m;
+}
+
+// Different when either the matrix or the element pointer differs.
+template<typename _Tp> static inline bool operator != (const MatConstIterator_<_Tp>& a, const MatConstIterator_<_Tp>& b)
+{
+    return a.ptr != b.ptr || a.m != b.m;
+}
+
+// a + ofs: copy of `a` advanced by `ofs` elements. The copy is created as MatConstIterator_ itself, so no
+// plain MatConstIterator object is ever cast to the derived type (the previous code did that by reference).
+template<typename _Tp> static inline MatConstIterator_<_Tp> operator + (const MatConstIterator_<_Tp>& a, ptrdiff_t ofs)
+{
+    return MatConstIterator_<_Tp>(a) += ofs;
+}
+
+// ofs + a: same as a + ofs.
+template<typename _Tp> static inline
+MatConstIterator_<_Tp> operator + (ptrdiff_t ofs, const MatConstIterator_<_Tp>& a)
+{
+    return MatConstIterator_<_Tp>(a) += ofs;
+}
+
+// a - ofs: copy of `a` moved back by `ofs` elements.
+template<typename _Tp> static inline
+MatConstIterator_<_Tp> operator - (const MatConstIterator_<_Tp>& a, ptrdiff_t ofs)
+{
+    return MatConstIterator_<_Tp>(a) -= ofs;
+}
+
+// Element `i` positions away: the base-class operator[] supplies the raw pointer, read here as const _Tp.
+template<typename _Tp> inline const _Tp& MatConstIterator_<_Tp>::operator [](ptrdiff_t i) const
+{
+    return *reinterpret_cast<const _Tp*>(MatConstIterator::operator [](i));
+}
+
+
+
+//////////////////////////// MatIterator_ ///////////////////////////
+
+// Default constructor: forwards to the MatConstIterator_ default constructor.
+template<typename _Tp> inline MatIterator_<_Tp>::MatIterator_()
+    : MatConstIterator_<_Tp>()
+{}
+
+// From a typed matrix.
+template<typename _Tp> inline MatIterator_<_Tp>::MatIterator_(Mat_<_Tp>* _m)
+    : MatConstIterator_<_Tp>(_m)
+{}
+
+// From a typed matrix and a (row, column) position.
+template<typename _Tp> inline MatIterator_<_Tp>::MatIterator_(Mat_<_Tp>* _m, int _row, int _col)
+    : MatConstIterator_<_Tp>(_m, _row, _col)
+{}
+
+// From a typed matrix and a Point position.
+template<typename _Tp> inline MatIterator_<_Tp>::MatIterator_(Mat_<_Tp>* _m, Point _pt)
+    : MatConstIterator_<_Tp>(_m, _pt)
+{}
+
+// From a typed matrix and an index array. NOTE(review): no MatConstIterator_(m, const int*) is defined nearby - confirm it exists.
+template<typename _Tp> inline MatIterator_<_Tp>::MatIterator_(Mat_<_Tp>* _m, const int* _idx)
+    : MatConstIterator_<_Tp>(_m, _idx)
+{}
+
+// Copy constructor: forwards to the MatConstIterator_ copy constructor.
+template<typename _Tp> inline MatIterator_<_Tp>::MatIterator_(const MatIterator_& it)
+    : MatConstIterator_<_Tp>(it)
+{}
+
+// Copy assignment: defers to MatConstIterator's assignment and returns *this.
+template<typename _Tp> inline MatIterator_<_Tp>& MatIterator_<_Tp>::operator = (const MatIterator_<_Tp>& it )
+{
+    MatConstIterator::operator = (it);
+    return *this;
+}
+
+// Dereference: mutable reference to the current element (the constness of the stored pointer is cast away).
+template<typename _Tp> inline _Tp& MatIterator_<_Tp>::operator *() const
+{
+    return *const_cast<_Tp*>(reinterpret_cast<const _Tp*>(this->ptr));
+}
+
+// Advances by `ofs` elements using MatConstIterator's implementation; returns the typed iterator.
+template<typename _Tp> inline MatIterator_<_Tp>& MatIterator_<_Tp>::operator += (ptrdiff_t ofs)
+{
+    MatConstIterator::operator += (ofs);
+    return *this;
+}
+
+// Moves back by `ofs` elements: MatConstIterator's += with the negated offset.
+template<typename _Tp> inline MatIterator_<_Tp>& MatIterator_<_Tp>::operator -= (ptrdiff_t ofs)
+{
+    MatConstIterator::operator += (-ofs);
+    return *this;
+}
+
+// Pre-decrement: base-class step, typed return.
+template<typename _Tp> inline MatIterator_<_Tp>& MatIterator_<_Tp>::operator --()
+{
+    MatConstIterator::operator --();
+    return *this;
+}
+
+// Post-decrement: returns the iterator's previous value.
+template<typename _Tp> inline MatIterator_<_Tp> MatIterator_<_Tp>::operator --(int)
+{
+    MatIterator_ prev(*this);
+    MatConstIterator::operator --();
+    return prev;
+}
+
+// Pre-increment: base-class step, typed return.
+template<typename _Tp> inline MatIterator_<_Tp>& MatIterator_<_Tp>::operator ++()
+{
+    MatConstIterator::operator ++();
+    return *this;
+}
+
+// Post-increment: returns the iterator's previous value.
+template<typename _Tp> inline MatIterator_<_Tp> MatIterator_<_Tp>::operator ++(int)
+{
+    MatIterator_ prev(*this);
+    MatConstIterator::operator ++();
+    return prev;
+}
+
+// Mutable reference to the element `i` positions away: dereferences a temporary iterator moved by `i`.
+template<typename _Tp> inline _Tp& MatIterator_<_Tp>::operator [](ptrdiff_t i) const
+{
+    return *(*this + i);
+}
+
+
+// Equal when both iterators refer to the same matrix and the same element pointer.
+template<typename _Tp> static inline bool operator == (const MatIterator_<_Tp>& a, const MatIterator_<_Tp>& b)
+{
+    return a.ptr == b.ptr && a.m == b.m;
+}
+
+// Different when either the matrix or the element pointer differs.
+template<typename _Tp> static inline bool operator != (const MatIterator_<_Tp>& a, const MatIterator_<_Tp>& b)
+{
+    return a.ptr != b.ptr || a.m != b.m;
+}
+
+//! Copy of a moved by +ofs (a real MatIterator_ copy; no base-to-derived reference cast).
+template<typename _Tp> static inline MatIterator_<_Tp> operator + (const MatIterator_<_Tp>& a, ptrdiff_t ofs)
+{
+    MatIterator_<_Tp> t(a);
+    return t += ofs;
+}
+
+//! Commuted form of iterator + offset; same result as a + ofs.
+template<typename _Tp> static inline MatIterator_<_Tp> operator + (ptrdiff_t ofs, const MatIterator_<_Tp>& a)
+{
+    MatIterator_<_Tp> t(a);
+    return t += ofs;
+}
+
+//! Copy of a moved by -ofs (a real MatIterator_ copy; no base-to-derived reference cast).
+template<typename _Tp> static inline MatIterator_<_Tp> operator - (const MatIterator_<_Tp>& a, ptrdiff_t ofs)
+{
+    MatIterator_<_Tp> t(a);
+    return t -= ofs;
+}
+
+
+
+/////////////////////// SparseMatConstIterator //////////////////////
+
+//! Default state: m, hashidx and ptr are all zero.
+inline SparseMatConstIterator::SparseMatConstIterator()
+    : m(0), hashidx(0), ptr(0)
+{}
+
+//! Copies m, hashidx and ptr from it.
+inline SparseMatConstIterator::SparseMatConstIterator(const SparseMatConstIterator& it)
+    : m(it.m), hashidx(it.hashidx), ptr(it.ptr)
+{}
+
+//! Copies m, hashidx and ptr from it; self-assignment leaves the object untouched.
+inline SparseMatConstIterator& SparseMatConstIterator::operator = (const SparseMatConstIterator& it)
+{
+    if( this == &it )
+        return *this;
+    m = it.m;
+    hashidx = it.hashidx;
+    ptr = it.ptr;
+    return *this;
+}
+
+//! Read-only reference to the current element, with ptr viewed as const _Tp*.
+template<typename _Tp> inline const _Tp& SparseMatConstIterator::value() const
+{
+    return ((const _Tp*)ptr)[0];
+}
+
+inline const SparseMat::Node* SparseMatConstIterator::node() const
+{
+    if( !ptr || !m || !m->hdr ) return 0;  // null ptr, m or m->hdr: there is no node to return
+    return (const SparseMat::Node*)(const void*)(ptr - m->hdr->valueOffset);
+}
+
+//! Post-increment; returns the iterator value from before the step.
+inline SparseMatConstIterator SparseMatConstIterator::operator ++(int)
+{
+    SparseMatConstIterator previous(*this);
+    operator ++();
+    return previous;
+}
+
+//! Sets hashidx to the hash table size and ptr to null.
+//! Does nothing when m or m->hdr is null.
+inline void SparseMatConstIterator::seekEnd()
+{
+    if( !m || !m->hdr )
+        return;
+    hashidx = m->hdr->hashtab.size();
+    ptr = 0;
+}
+
+
+//! Equal when both iterators have the same m and the same ptr.
+static inline bool operator == (const SparseMatConstIterator& it1, const SparseMatConstIterator& it2)
+{
+    return it1.ptr == it2.ptr && it1.m == it2.m;
+}
+
+//! Exact negation of operator == above.
+static inline bool operator != (const SparseMatConstIterator& it1, const SparseMatConstIterator& it2)
+{
+    return it1.m != it2.m || it1.ptr != it2.ptr;
+}
+
+
+
+///////////////////////// SparseMatIterator /////////////////////////
+
+//! Default constructor; the SparseMatConstIterator base is default-constructed.
+inline SparseMatIterator::SparseMatIterator()
+{}
+
+//! Forwards _m to the SparseMatConstIterator constructor.
+inline SparseMatIterator::SparseMatIterator(SparseMat* _m)
+    : SparseMatConstIterator(_m)
+{}
+
+//! Copy constructor; copy-constructs the SparseMatConstIterator base from it.
+inline SparseMatIterator::SparseMatIterator(const SparseMatIterator& it)
+    : SparseMatConstIterator(it)
+{}
+
+//! Copy-assignment; forwards to SparseMatConstIterator's assignment operator.
+inline SparseMatIterator& SparseMatIterator::operator = (const SparseMatIterator& it)
+{
+    SparseMatConstIterator::operator = (it);
+    return *this;
+}
+
+//! Mutable reference to the current element, with ptr viewed as _Tp*.
+template<typename _Tp> inline _Tp& SparseMatIterator::value() const
+{
+    return ((_Tp*)ptr)[0];
+}
+
+inline SparseMat::Node* SparseMatIterator::node() const
+{
+    const SparseMat::Node* const found = SparseMatConstIterator::node();
+    return (SparseMat::Node*)found;  // same pointer as the base-class node(), constness dropped
+}
+
+//! Pre-increment; advancing is done by SparseMatConstIterator::operator ++.
+inline SparseMatIterator& SparseMatIterator::operator ++()
+{
+    ++static_cast<SparseMatConstIterator&>(*this);
+    return *this;
+}
+
+//! Post-increment; returns the iterator value from before the step.
+inline SparseMatIterator SparseMatIterator::operator ++(int)
+{
+    SparseMatIterator previous(*this);
+    SparseMatConstIterator::operator ++();
+    return previous;
+}
+
+
+
+////////////////////// SparseMatConstIterator_ //////////////////////
+
+//! Default constructor; the SparseMatConstIterator base is default-constructed.
+template<typename _Tp> inline SparseMatConstIterator_<_Tp>::SparseMatConstIterator_()
+{}
+
+//! Typed-matrix overload; forwards _m to the SparseMatConstIterator constructor.
+template<typename _Tp> inline
+SparseMatConstIterator_<_Tp>::SparseMatConstIterator_(const SparseMat_<_Tp>* _m) : SparseMatConstIterator(_m)
+{}
+
+//! Untyped-matrix overload; additionally asserts that _m->type() equals the type id of _Tp.
+template<typename _Tp> inline SparseMatConstIterator_<_Tp>::SparseMatConstIterator_(const SparseMat* _m)
+    : SparseMatConstIterator(_m)
+{
+    CV_Assert( _m->type() == traits::Type<_Tp>::value );
+}
+
+//! Copy constructor; copy-constructs the SparseMatConstIterator base from it.
+template<typename _Tp> inline
+SparseMatConstIterator_<_Tp>::SparseMatConstIterator_(const SparseMatConstIterator_<_Tp>& it) : SparseMatConstIterator(it)
+{}
+
+//! Copy-assignment through the base-class operator; no reinterpret_cast of this or it needed.
+template<typename _Tp> inline
+SparseMatConstIterator_<_Tp>& SparseMatConstIterator_<_Tp>::operator = (const SparseMatConstIterator_<_Tp>& it)
+{
+    SparseMatConstIterator::operator = (it);
+    return *this;
+}
+
+//! Read-only reference to the current element, with ptr viewed as const _Tp*.
+template<typename _Tp> inline const _Tp& SparseMatConstIterator_<_Tp>::operator *() const
+{
+    return ((const _Tp*)this->ptr)[0];
+}
+
+//! Pre-increment; advancing is done by SparseMatConstIterator::operator ++.
+template<typename _Tp> inline SparseMatConstIterator_<_Tp>& SparseMatConstIterator_<_Tp>::operator ++()
+{
+    ++static_cast<SparseMatConstIterator&>(*this);
+    return *this;
+}
+
+//! Post-increment; returns the iterator value from before the step.
+template<typename _Tp> inline SparseMatConstIterator_<_Tp> SparseMatConstIterator_<_Tp>::operator ++(int)
+{
+    SparseMatConstIterator_<_Tp> previous(*this);
+    ++static_cast<SparseMatConstIterator&>(*this);
+    return previous;
+}
+
+
+
+///////////////////////// SparseMatIterator_ ////////////////////////
+
+//! Default constructor; the SparseMatConstIterator_ base is default-constructed.
+template<typename _Tp> inline SparseMatIterator_<_Tp>::SparseMatIterator_()
+{}
+
+//! Typed-matrix overload; forwards _m to the SparseMatConstIterator_ constructor.
+template<typename _Tp> inline
+SparseMatIterator_<_Tp>::SparseMatIterator_(SparseMat_<_Tp>* _m) : SparseMatConstIterator_<_Tp>(_m)
+{}
+
+//! Untyped-matrix overload; forwards _m to the SparseMatConstIterator_ constructor.
+template<typename _Tp> inline
+SparseMatIterator_<_Tp>::SparseMatIterator_(SparseMat* _m) : SparseMatConstIterator_<_Tp>(_m)
+{}
+
+//! Copy constructor; copy-constructs the SparseMatConstIterator_ base from it.
+template<typename _Tp> inline
+SparseMatIterator_<_Tp>::SparseMatIterator_(const SparseMatIterator_<_Tp>& it) : SparseMatConstIterator_<_Tp>(it)
+{}
+
+//! Copy-assignment through the base-class operator; no reinterpret_cast of this or it needed.
+template<typename _Tp> inline
+SparseMatIterator_<_Tp>& SparseMatIterator_<_Tp>::operator = (const SparseMatIterator_<_Tp>& it)
+{
+    SparseMatConstIterator::operator = (it);
+    return *this;
+}
+
+//! Mutable reference to the current element, with ptr viewed as _Tp*.
+template<typename _Tp> inline _Tp& SparseMatIterator_<_Tp>::operator *() const
+{
+    return ((_Tp*)this->ptr)[0];
+}
+
+//! Pre-increment; advancing is done by SparseMatConstIterator::operator ++.
+template<typename _Tp> inline SparseMatIterator_<_Tp>& SparseMatIterator_<_Tp>::operator ++()
+{
+    ++static_cast<SparseMatConstIterator&>(*this);
+    return *this;
+}
+
+//! Post-increment; returns the iterator value from before the step.
+template<typename _Tp> inline SparseMatIterator_<_Tp> SparseMatIterator_<_Tp>::operator ++(int)
+{
+    SparseMatIterator_<_Tp> previous(*this);
+    ++static_cast<SparseMatConstIterator&>(*this);
+    return previous;
+}
+
+
+
+//////////////////////// MatCommaInitializer_ ///////////////////////
+
+//! Starts the initializer with its iterator member constructed from _m.
+template<typename _Tp> inline MatCommaInitializer_<_Tp>::MatCommaInitializer_(Mat_<_Tp>* _m)
+    : it(_m)
+{}
+
+//! Writes v, converted to _Tp, through the iterator and then advances the iterator by one.
+template<typename _Tp> template<typename T2> inline MatCommaInitializer_<_Tp>& MatCommaInitializer_<_Tp>::operator , (T2 v)
+{
+    CV_DbgAssert( this->it < ((const Mat_<_Tp>*)this->it.m)->end() );
+    (*this->it) = _Tp(v);
+    ++(this->it);
+    return *this;
+}
+
+//! Conversion to Mat_; CV_DbgAssert checks that the iterator equals end() of the matrix.
+template<typename _Tp> inline MatCommaInitializer_<_Tp>::operator Mat_<_Tp>() const
+{
+    CV_DbgAssert( this->it == ((const Mat_<_Tp>*)this->it.m)->end() );
+    return Mat_<_Tp>(*this->it.m);
+}
+
+
+//! Creates an initializer for m (constness cast away) and feeds it val as the first value.
+template<typename _Tp, typename T2> static inline MatCommaInitializer_<_Tp> operator << (const Mat_<_Tp>& m, T2 val)
+{
+    MatCommaInitializer_<_Tp> init((Mat_<_Tp>*)&m);
+    return init.operator , (val);
+}
+
+
+
+///////////////////////// Matrix Expressions ////////////////////////
+
+//! Evaluates e into *this by calling assign() on e's operation object.
+inline Mat& Mat::operator = (const MatExpr& e)
+{
+    e.op->assign(e, *this);
+    return *this;
+}
+
+//! Constructs from expression e; the type id of _Tp is passed as assign()'s third argument.
+template<typename _Tp> inline Mat_<_Tp>::Mat_(const MatExpr& e)
+{
+    e.op->assign(e, *this, traits::Type<_Tp>::value);
+}
+
+//! Assigns expression e; the type id of _Tp is passed as assign()'s third argument.
+template<typename _Tp> inline Mat_<_Tp>& Mat_<_Tp>::operator = (const MatExpr& e)
+{
+    e.op->assign(e, *this, traits::Type<_Tp>::value);
+    return *this;
+}
+
+//! Typed wrapper: Mat::zeros(rows, cols, type) with the type id taken from _Tp.
+template<typename _Tp> inline MatExpr Mat_<_Tp>::zeros(int rows, int cols)
+{
+    return Mat::zeros(rows, cols, traits::Type<_Tp>::value);
+}
+
+//! Typed wrapper: Mat::zeros(sz, type) with the type id taken from _Tp.
+template<typename _Tp> inline MatExpr Mat_<_Tp>::zeros(Size sz)
+{
+    return Mat::zeros(sz, traits::Type<_Tp>::value);
+}
+
+//! Typed wrapper: Mat::ones(rows, cols, type) with the type id taken from _Tp.
+template<typename _Tp> inline MatExpr Mat_<_Tp>::ones(int rows, int cols)
+{
+    return Mat::ones(rows, cols, traits::Type<_Tp>::value);
+}
+
+//! Typed wrapper: Mat::ones(sz, type) with the type id taken from _Tp.
+template<typename _Tp> inline MatExpr Mat_<_Tp>::ones(Size sz)
+{
+    return Mat::ones(sz, traits::Type<_Tp>::value);
+}
+
+//! Typed wrapper: Mat::eye(rows, cols, type) with the type id taken from _Tp.
+template<typename _Tp> inline MatExpr Mat_<_Tp>::eye(int rows, int cols)
+{
+    return Mat::eye(rows, cols, traits::Type<_Tp>::value);
+}
+
+//! Typed wrapper: Mat::eye(sz, type) with the type id taken from _Tp.
+template<typename _Tp> inline MatExpr Mat_<_Tp>::eye(Size sz)
+{
+    return Mat::eye(sz, traits::Type<_Tp>::value);
+}
+
+//! Empty expression: null op, zero flags and scalars, default-constructed matrices.
+inline MatExpr::MatExpr()
+    : op(0), flags(0), a(Mat()), b(Mat()), c(Mat()), alpha(0), beta(0), s()
+{}
+
+//! Member-wise constructor: stores every argument in the member of the same name.
+inline MatExpr::MatExpr(const MatOp* _op, int _flags, const Mat& _a, const Mat& _b,
+                        const Mat& _c, double _alpha, double _beta, const Scalar& _s)
+    : op(_op), flags(_flags), a(_a), b(_b), c(_c), alpha(_alpha), beta(_beta), s(_s)
+{}
+
+//! Conversion to Mat: lets op assign this expression into a fresh matrix and returns it.
+inline MatExpr::operator Mat() const
+{
+    Mat result;
+    op->assign(*this, result);
+    return result;
+}
+
+//! Conversion to Mat_: same as above, passing the type id of _Tp as third argument.
+template<typename _Tp> inline MatExpr::operator Mat_<_Tp>() const
+{
+    Mat_<_Tp> result;
+    op->assign(*this, result, traits::Type<_Tp>::value);
+    return result;
+}
+
+
+//! Mat_ overload; forwards to cv::min with both operands viewed as const Mat&.
+template<typename _Tp> static inline MatExpr min(const Mat_<_Tp>& a, const Mat_<_Tp>& b)
+{
+    return cv::min(static_cast<const Mat&>(a), static_cast<const Mat&>(b));
+}
+
+//! Mat_ overload; forwards to cv::min(const Mat&, double).
+template<typename _Tp> static inline MatExpr min(const Mat_<_Tp>& a, double s)
+{
+    return cv::min(static_cast<const Mat&>(a), s);
+}
+
+//! Scalar-first overload; the arguments are passed on in (matrix, scalar) order.
+template<typename _Tp> static inline MatExpr min(double s, const Mat_<_Tp>& a)
+{
+    return cv::min(static_cast<const Mat&>(a), s);
+}
+
+//! Mat_ overload; forwards to cv::max with both operands viewed as const Mat&.
+template<typename _Tp> static inline MatExpr max(const Mat_<_Tp>& a, const Mat_<_Tp>& b)
+{
+    return cv::max(static_cast<const Mat&>(a), static_cast<const Mat&>(b));
+}
+
+//! Mat_ overload; forwards to cv::max(const Mat&, double).
+template<typename _Tp> static inline MatExpr max(const Mat_<_Tp>& a, double s)
+{
+    return cv::max(static_cast<const Mat&>(a), s);
+}
+
+//! Scalar-first overload; the arguments are passed on in (matrix, scalar) order.
+template<typename _Tp> static inline MatExpr max(double s, const Mat_<_Tp>& a)
+{
+    return cv::max(static_cast<const Mat&>(a), s);
+}
+
+//! Mat_ overload; forwards to cv::abs with m viewed as const Mat&.
+template<typename _Tp> static inline MatExpr abs(const Mat_<_Tp>& m)
+{
+    return cv::abs(static_cast<const Mat&>(m));
+}
+
+
+//! a += b for an expression b; the update is performed by b.op->augAssignAdd.
+static inline Mat& operator += (Mat& a, const MatExpr& b)
+{
+    b.op->augAssignAdd(b, a);
+    return a;
+}
+
+//! Overload for a const-qualified Mat; constness is cast away before the update.
+static inline const Mat& operator += (const Mat& a, const MatExpr& b)
+{
+    b.op->augAssignAdd(b, const_cast<Mat&>(a));
+    return a;
+}
+
+//! Mat_ overload of a += b; returns a with its static type preserved.
+template<typename _Tp> static inline Mat_<_Tp>& operator += (Mat_<_Tp>& a, const MatExpr& b)
+{
+    b.op->augAssignAdd(b, a);
+    return a;
+}
+
+//! Overload for a const-qualified Mat_; constness is cast away before the update.
+template<typename _Tp> static inline const Mat_<_Tp>& operator += (const Mat_<_Tp>& a, const MatExpr& b)
+{
+    b.op->augAssignAdd(b, static_cast<Mat&>(const_cast<Mat_<_Tp>&>(a)));
+    return a;
+}
+
+//! a -= b for an expression b; the update is performed by b.op->augAssignSubtract.
+static inline Mat& operator -= (Mat& a, const MatExpr& b)
+{
+    b.op->augAssignSubtract(b, a);
+    return a;
+}
+
+//! Overload for a const-qualified Mat; constness is cast away before the update.
+static inline const Mat& operator -= (const Mat& a, const MatExpr& b)
+{
+    b.op->augAssignSubtract(b, const_cast<Mat&>(a));
+    return a;
+}
+
+//! Mat_ overload of a -= b; returns a with its static type preserved.
+template<typename _Tp> static inline Mat_<_Tp>& operator -= (Mat_<_Tp>& a, const MatExpr& b)
+{
+    b.op->augAssignSubtract(b, a);
+    return a;
+}
+
+//! Overload for a const-qualified Mat_; constness is cast away before the update.
+template<typename _Tp> static inline const Mat_<_Tp>& operator -= (const Mat_<_Tp>& a, const MatExpr& b)
+{
+    b.op->augAssignSubtract(b, static_cast<Mat&>(const_cast<Mat_<_Tp>&>(a)));
+    return a;
+}
+
+//! a *= b for an expression b; the update is performed by b.op->augAssignMultiply.
+static inline Mat& operator *= (Mat& a, const MatExpr& b)
+{
+    b.op->augAssignMultiply(b, a);
+    return a;
+}
+
+//! Overload for a const-qualified Mat; constness is cast away before the update.
+static inline const Mat& operator *= (const Mat& a, const MatExpr& b)
+{
+    b.op->augAssignMultiply(b, const_cast<Mat&>(a));
+    return a;
+}
+
+//! Mat_ overload of a *= b; returns a with its static type preserved.
+template<typename _Tp> static inline Mat_<_Tp>& operator *= (Mat_<_Tp>& a, const MatExpr& b)
+{
+    b.op->augAssignMultiply(b, a);
+    return a;
+}
+
+//! Overload for a const-qualified Mat_; constness is cast away before the update.
+template<typename _Tp> static inline const Mat_<_Tp>& operator *= (const Mat_<_Tp>& a, const MatExpr& b)
+{
+    b.op->augAssignMultiply(b, static_cast<Mat&>(const_cast<Mat_<_Tp>&>(a)));
+    return a;
+}
+
+//! a /= b for an expression b; the update is performed by b.op->augAssignDivide.
+static inline Mat& operator /= (Mat& a, const MatExpr& b)
+{
+    b.op->augAssignDivide(b, a);
+    return a;
+}
+
+//! Overload for a const-qualified Mat; constness is cast away before the update.
+static inline const Mat& operator /= (const Mat& a, const MatExpr& b)
+{
+    b.op->augAssignDivide(b, const_cast<Mat&>(a));
+    return a;
+}
+
+//! Mat_ overload of a /= b; returns a with its static type preserved.
+template<typename _Tp> static inline Mat_<_Tp>& operator /= (Mat_<_Tp>& a, const MatExpr& b)
+{
+    b.op->augAssignDivide(b, a);
+    return a;
+}
+
+//! Overload for a const-qualified Mat_; constness is cast away before the update.
+template<typename _Tp> static inline const Mat_<_Tp>& operator /= (const Mat_<_Tp>& a, const MatExpr& b)
+{
+    b.op->augAssignDivide(b, static_cast<Mat&>(const_cast<Mat_<_Tp>&>(a)));
+    return a;
+}
+
+
+//////////////////////////////// UMat ////////////////////////////////
+
+template<typename _Tp> inline
+UMat::UMat(const std::vector<_Tp>& vec, bool copyData) // header: vec.size() rows, 1 column, type id of _Tp
+: flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows((int)vec.size()),
+cols(1), allocator(0), usageFlags(USAGE_DEFAULT), u(0), offset(0), size(&rows)
+{
+    if(vec.empty())
+        return; // nothing to copy; only the members initialized above are set
+    if( !copyData )
+    {
+        // !!!TODO!!! the copyData == false path is not implemented and always raises an error
+        CV_Error(Error::StsNotImplemented, "");
+    }
+    else
+        Mat((int)vec.size(), 1, traits::Type<_Tp>::value, (uchar*)&vec[0]).copyTo(*this); // temporary Mat header over vec's buffer, copied into *this
+}
+
+inline UMat UMat::row(int y) const
+{
+    const Range picked(y, y + 1);  // row range passed on: Range(y, y + 1)
+    return UMat(*this, picked, Range::all());
+}
+
+inline UMat UMat::col(int x) const
+{
+    const Range picked(x, x + 1);  // column range passed on: Range(x, x + 1)
+    return UMat(*this, Range::all(), picked);
+}
+
+inline UMat UMat::rowRange(int startrow, int endrow) const
+{
+    const Range picked(startrow, endrow);  // combined with Range::all() for the columns
+    return UMat(*this, picked, Range::all());
+}
+
+//! Same as the two-int overload, with the row range supplied as a Range object.
+inline UMat UMat::rowRange(const Range& r) const
+{
+    return UMat(*this, r, Range::all());
+}
+
+inline UMat UMat::colRange(int startcol, int endcol) const
+{
+    const Range picked(startcol, endcol);  // combined with Range::all() for the rows
+    return UMat(*this, Range::all(), picked);
+}
+
+//! Same as the two-int overload, with the column range supplied as a Range object.
+inline UMat UMat::colRange(const Range& r) const
+{
+    return UMat(*this, Range::all(), r);
+}
+
+//! Forwards to the UMat(const UMat&, Range, Range) constructor.
+inline UMat UMat::operator()( Range _rowRange, Range _colRange ) const
+{
+    return UMat(*this, _rowRange, _colRange);
+}
+
+//! Forwards to the UMat constructor taking a Rect region.
+inline UMat UMat::operator()( const Rect& roi ) const
+{
+    return UMat(*this, roi);
+}
+
+//! Forwards to the UMat constructor taking a pointer to Range objects.
+inline UMat UMat::operator()(const Range* ranges) const
+{
+    return UMat(*this, ranges);
+}
+
+//! Forwards to the UMat constructor taking a vector of Range objects.
+inline UMat UMat::operator()(const std::vector<Range>& ranges) const
+{
+    return UMat(*this, ranges);
+}
+
+//! True when CONTINUOUS_FLAG is set in flags.
+inline bool UMat::isContinuous() const
+{
+    return 0 != (flags & CONTINUOUS_FLAG);
+}
+
+//! True when SUBMATRIX_FLAG is set in flags.
+inline bool UMat::isSubmatrix() const
+{
+    return 0 != (flags & SUBMATRIX_FLAG);
+}
+
+//! Returns step.p[dims - 1], or 0 when dims <= 0 (a zero result trips CV_DbgAssert).
+inline size_t UMat::elemSize() const
+{
+    const size_t res = (dims <= 0) ? 0 : step.p[dims - 1];
+    CV_DbgAssert(res != 0);
+    return res;
+}
+
+//! CV_ELEM_SIZE1 applied to flags.
+inline size_t UMat::elemSize1() const
+{
+    return CV_ELEM_SIZE1(flags);
+}
+
+//! CV_MAT_TYPE applied to flags.
+inline int UMat::type() const
+{
+    return CV_MAT_TYPE(flags);
+}
+
+//! CV_MAT_DEPTH applied to flags.
+inline int UMat::depth() const
+{
+    return CV_MAT_DEPTH(flags);
+}
+
+//! CV_MAT_CN applied to flags.
+inline int UMat::channels() const
+{
+    return CV_MAT_CN(flags);
+}
+
+inline size_t UMat::step1(int i) const
+{
+    const size_t unit = elemSize1();  // divisor: step.p[i] is expressed in multiples of this
+    return step.p[i] / unit;
+}
+
+
+inline bool UMatData::hostCopyObsolete() const { return (flags & HOST_COPY_OBSOLETE) != 0; } // true if any HOST_COPY_OBSOLETE bit is set
+inline bool UMatData::deviceCopyObsolete() const { return (flags & DEVICE_COPY_OBSOLETE) != 0; } // true if any DEVICE_COPY_OBSOLETE bit is set
+inline bool UMatData::deviceMemMapped() const { return (flags & DEVICE_MEM_MAPPED) != 0; } // true if any DEVICE_MEM_MAPPED bit is set
+inline bool UMatData::copyOnMap() const { return (flags & COPY_ON_MAP) != 0; } // true if any COPY_ON_MAP bit is set
+inline bool UMatData::tempUMat() const { return (flags & TEMP_UMAT) != 0; } // true if any TEMP_UMAT bit is set
+inline bool UMatData::tempCopiedUMat() const { return (flags & TEMP_COPIED_UMAT) == TEMP_COPIED_UMAT; } // unlike the tests above, requires every bit of the mask
+
+inline void UMatData::markDeviceMemMapped(bool flag)
+{
+    if(!flag)
+        flags &= ~DEVICE_MEM_MAPPED;  // false: clear the bit(s)
+    else
+        flags |= DEVICE_MEM_MAPPED;   // true: set the bit(s)
+}
+
+inline void UMatData::markHostCopyObsolete(bool flag)
+{
+    if(!flag)
+        flags &= ~HOST_COPY_OBSOLETE;  // false: clear the bit(s)
+    else
+        flags |= HOST_COPY_OBSOLETE;   // true: set the bit(s)
+}
+inline void UMatData::markDeviceCopyObsolete(bool flag)
+{
+    if(!flag)
+        flags &= ~DEVICE_COPY_OBSOLETE;  // false: clear the bit(s)
+    else
+        flags |= DEVICE_COPY_OBSOLETE;   // true: set the bit(s)
+}
+
+//! @endcond
+
+//! Free-function swap for MatExpr; forwards to the member MatExpr::swap.
+static inline void swap(MatExpr& a, MatExpr& b) { a.swap(b); }
+
+} //cv
+
+#ifdef _MSC_VER
+#pragma warning( pop )
+#endif
+
+#ifdef CV_DISABLE_CLANG_ENUM_WARNINGS
+#undef CV_DISABLE_CLANG_ENUM_WARNINGS
+#pragma clang diagnostic pop
+#endif
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/matx.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/matx.hpp
new file mode 100644
index 000000000000..ad13797da3a8
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/matx.hpp
@@ -0,0 +1,544 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_MATX_HPP
+#define OPENCV_CORE_MATX_HPP
+
+#ifndef __cplusplus
+#  error matx.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/base.hpp"
+#include "opencv2/core/traits.hpp"
+#include "opencv2/core/saturate.hpp"
+
+#include <initializer_list>
+
+namespace cv
+{
+
+//! @addtogroup core_basic
+//! @{
+
+//! @cond IGNORED
+// FIXIT Remove this (especially CV_EXPORTS modifier)
+struct CV_EXPORTS Matx_AddOp { Matx_AddOp() {} Matx_AddOp(const Matx_AddOp&) {} }; // empty tag: selects Matx(a, b, Matx_AddOp)
+struct CV_EXPORTS Matx_SubOp { Matx_SubOp() {} Matx_SubOp(const Matx_SubOp&) {} }; // empty tag: selects Matx(a, b, Matx_SubOp)
+struct CV_EXPORTS Matx_ScaleOp { Matx_ScaleOp() {} Matx_ScaleOp(const Matx_ScaleOp&) {} }; // empty tag: selects Matx(a, alpha, Matx_ScaleOp)
+struct CV_EXPORTS Matx_MulOp { Matx_MulOp() {} Matx_MulOp(const Matx_MulOp&) {} }; // empty tag: selects Matx(a, b, Matx_MulOp)
+struct CV_EXPORTS Matx_DivOp { Matx_DivOp() {} Matx_DivOp(const Matx_DivOp&) {} }; // empty tag: selects Matx(a, b, Matx_DivOp)
+struct CV_EXPORTS Matx_MatMulOp { Matx_MatMulOp() {} Matx_MatMulOp(const Matx_MatMulOp&) {} }; // empty tag: selects Matx(a, b, Matx_MatMulOp)
+struct CV_EXPORTS Matx_TOp { Matx_TOp() {} Matx_TOp(const Matx_TOp&) {} }; // empty tag: selects Matx(a, Matx_TOp)
+//! @endcond
+
+////////////////////////////// Small Matrix ///////////////////////////
+
+/** @brief Template class for small matrices whose type and size are known at compilation time
+
+If you need a more flexible type, use Mat . The elements of the matrix M are accessible using the
+M(i,j) notation. Most of the common matrix operations (see also @ref MatrixExpressions ) are
+available. To do an operation on Matx that is not implemented, you can easily convert the matrix to
+Mat and backwards:
+@code{.cpp}
+    Matx33f m(1, 2, 3,
+              4, 5, 6,
+              7, 8, 9);
+    cout << sum(Mat(m*m.t())) << endl;
+@endcode
+Besides the plain constructor, which takes a list of elements, Matx can be initialized from a C-array:
+@code{.cpp}
+    float values[] = { 1, 2, 3};
+    Matx31f m(values);
+@endcode
+If C++11 features are available, std::initializer_list can also be used to initialize Matx:
+@code{.cpp}
+    Matx31f m = { 1, 2, 3};
+@endcode
+ */
+template<typename _Tp, int m, int n> class Matx
+{
+public:
+    enum {
+           rows     = m, // first template dimension
+           cols     = n, // second template dimension
+           channels = rows*cols, // total number of elements, same as the length of val
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           depth    = traits::Type<_Tp>::value,
+           type     = CV_MAKETYPE(depth, channels),
+#endif
+           shortdim = (m < n ? m : n) // the smaller of the two dimensions
+         };
+
+    typedef _Tp                           value_type;
+    typedef Matx<_Tp, m, n>               mat_type;
+    typedef Matx<_Tp, shortdim, 1> diag_type; // shortdim x 1 column; the type used by both diag() overloads
+
+    //! default constructor
+    Matx();
+
+    explicit Matx(_Tp v0); //!< 1x1 matrix
+    Matx(_Tp v0, _Tp v1); //!< 1x2 or 2x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2); //!< 1x3 or 3x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3); //!< 1x4, 2x2 or 4x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4); //!< 1x5 or 5x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5); //!< 1x6, 2x3, 3x2 or 6x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6); //!< 1x7 or 7x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7); //!< 1x8, 2x4, 4x2 or 8x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8); //!< 1x9, 3x3 or 9x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9); //!< 1x10, 2x5, 5x2 or 10x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3,
+         _Tp v4, _Tp v5, _Tp v6, _Tp v7,
+         _Tp v8, _Tp v9, _Tp v10, _Tp v11); //!< 1x12, 2x6, 3x4, 4x3, 6x2 or 12x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3,
+         _Tp v4, _Tp v5, _Tp v6, _Tp v7,
+         _Tp v8, _Tp v9, _Tp v10, _Tp v11,
+         _Tp v12, _Tp v13); //!< 1x14, 2x7, 7x2 or 14x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3,
+         _Tp v4, _Tp v5, _Tp v6, _Tp v7,
+         _Tp v8, _Tp v9, _Tp v10, _Tp v11,
+         _Tp v12, _Tp v13, _Tp v14, _Tp v15); //!< 1x16, 4x4 or 16x1 matrix
+    explicit Matx(const _Tp* vals); //!< initialize from a plain array
+
+    Matx(std::initializer_list<_Tp>); //!< initialize from an initializer list
+
+    CV_NODISCARD_STD static Matx all(_Tp alpha);
+    CV_NODISCARD_STD static Matx zeros();
+    CV_NODISCARD_STD static Matx ones();
+    CV_NODISCARD_STD static Matx eye();
+    CV_NODISCARD_STD static Matx diag(const diag_type& d);
+    /** @brief Generates uniformly distributed random numbers
+    @param a Range boundary.
+    @param b The other range boundary (boundaries don't have to be ordered, the lower boundary is inclusive,
+    the upper one is exclusive).
+     */
+    CV_NODISCARD_STD static Matx randu(_Tp a, _Tp b);
+    /** @brief Generates normally distributed random numbers
+    @param a Mean value.
+    @param b Standard deviation.
+     */
+    CV_NODISCARD_STD static Matx randn(_Tp a, _Tp b);
+
+    //! dot product computed with the default precision
+    _Tp dot(const Matx<_Tp, m, n>& v) const;
+
+    //! dot product computed in double-precision arithmetic
+    double ddot(const Matx<_Tp, m, n>& v) const;
+
+    //! conversion to another data type
+    template<typename T2> operator Matx<T2, m, n>() const;
+
+    //! change the matrix shape
+    template<int m1, int n1> Matx<_Tp, m1, n1> reshape() const;
+
+    //! extract part of the matrix
+    template<int m1, int n1> Matx<_Tp, m1, n1> get_minor(int base_row, int base_col) const;
+
+    //! extract the matrix row
+    Matx<_Tp, 1, n> row(int i) const;
+
+    //! extract the matrix column
+    Matx<_Tp, m, 1> col(int i) const;
+
+    //! extract the matrix diagonal
+    diag_type diag() const;
+
+    //! transpose the matrix
+    Matx<_Tp, n, m> t() const;
+
+    //! invert the matrix (the result type is n x m; p_is_ok is optional and defaults to NULL)
+    Matx<_Tp, n, m> inv(int method=DECOMP_LU, bool *p_is_ok = NULL) const;
+
+    //! solve linear system (the Vec overload below has no default for its method argument)
+    template<int l> Matx<_Tp, n, l> solve(const Matx<_Tp, m, l>& rhs, int flags=DECOMP_LU) const;
+    Vec<_Tp, n> solve(const Vec<_Tp, m>& rhs, int method) const;
+
+    //! multiply two matrices element-wise
+    Matx<_Tp, m, n> mul(const Matx<_Tp, m, n>& a) const;
+
+    //! divide two matrices element-wise
+    Matx<_Tp, m, n> div(const Matx<_Tp, m, n>& a) const;
+
+    //! element access
+    const _Tp& operator ()(int row, int col) const;
+    _Tp& operator ()(int row, int col);
+
+    //! 1D element access
+    const _Tp& operator ()(int i) const;
+    _Tp& operator ()(int i);
+    //! constructors selected by the operation tag passed as the last argument
+    Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_AddOp);
+    Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_SubOp);
+    template<typename _T2> Matx(const Matx<_Tp, m, n>& a, _T2 alpha, Matx_ScaleOp);
+    Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_MulOp);
+    Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_DivOp);
+    template<int l> Matx(const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b, Matx_MatMulOp);
+    Matx(const Matx<_Tp, n, m>& a, Matx_TOp);
+
+    _Tp val[m*n]; ///< matrix elements
+};
+
+typedef Matx<float, 1, 2> Matx12f; // alias scheme: Matx<rows><cols><f|d>, f = float, d = double
+typedef Matx<double, 1, 2> Matx12d;
+typedef Matx<float, 1, 3> Matx13f;
+typedef Matx<double, 1, 3> Matx13d;
+typedef Matx<float, 1, 4> Matx14f;
+typedef Matx<double, 1, 4> Matx14d;
+typedef Matx<float, 1, 6> Matx16f;
+typedef Matx<double, 1, 6> Matx16d;
+
+typedef Matx<float, 2, 1> Matx21f; // single-column shapes (n x 1)
+typedef Matx<double, 2, 1> Matx21d;
+typedef Matx<float, 3, 1> Matx31f;
+typedef Matx<double, 3, 1> Matx31d;
+typedef Matx<float, 4, 1> Matx41f;
+typedef Matx<double, 4, 1> Matx41d;
+typedef Matx<float, 6, 1> Matx61f;
+typedef Matx<double, 6, 1> Matx61d;
+
+typedef Matx<float, 2, 2> Matx22f; // shapes with 2 or 3 rows and columns
+typedef Matx<double, 2, 2> Matx22d;
+typedef Matx<float, 2, 3> Matx23f;
+typedef Matx<double, 2, 3> Matx23d;
+typedef Matx<float, 3, 2> Matx32f;
+typedef Matx<double, 3, 2> Matx32d;
+
+typedef Matx<float, 3, 3> Matx33f;
+typedef Matx<double, 3, 3> Matx33d;
+
+typedef Matx<float, 3, 4> Matx34f; // 3x4 and its transposed shape 4x3
+typedef Matx<double, 3, 4> Matx34d;
+typedef Matx<float, 4, 3> Matx43f;
+typedef Matx<double, 4, 3> Matx43d;
+
+typedef Matx<float, 4, 4> Matx44f; // larger square shapes
+typedef Matx<double, 4, 4> Matx44d;
+typedef Matx<float, 6, 6> Matx66f;
+typedef Matx<double, 6, 6> Matx66d;
+
+template<typename _Tp, int m> static inline
+double determinant(const Matx<_Tp, m, m>& a); // declared for square (m x m) matrices only
+
+template<typename _Tp, int m, int n> static inline
+double trace(const Matx<_Tp, m, n>& a); // declared for any m x n shape; returns double
+
+template<typename _Tp, int m, int n> static inline
+double norm(const Matx<_Tp, m, n>& M); // overload without an explicit norm type
+
+template<typename _Tp, int m, int n> static inline
+double norm(const Matx<_Tp, m, n>& M, int normType); // overload with the norm type chosen by the caller
+
+template<typename _Tp1, typename _Tp2, int m, int n> static inline
+Matx<_Tp1, m, n>& operator += (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b); // element types of a and b may differ
+
+template<typename _Tp1, typename _Tp2, int m, int n> static inline
+Matx<_Tp1, m, n>& operator -= (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b); // element types of a and b may differ
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator + (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b); // both operands share element type and shape
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b); // both operands share element type and shape
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, int alpha); // scalar overloads of *= exist for int, float and double
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, float alpha);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, double alpha);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, int alpha); // matrix * scalar, for int, float and double
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, float alpha);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, double alpha);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (int alpha, const Matx<_Tp, m, n>& a); // scalar * matrix, for int, float and double
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (float alpha, const Matx<_Tp, m, n>& a);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (double alpha, const Matx<_Tp, m, n>& a);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, float alpha); // division is declared for float and double only (no int overload)
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, double alpha);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, float alpha);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, double alpha);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a); // unary minus
+
+template<typename _Tp, int m, int n, int l> static inline
+Matx<_Tp, m, n> operator * (const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b); // (m x l) * (l x n) gives m x n
+
+template<typename _Tp, int m, int n> static inline
+Vec<_Tp, m> operator * (const Matx<_Tp, m, n>& a, const Vec<_Tp, n>& b); // (m x n) * n-element Vec gives an m-element Vec
+
+template<typename _Tp, int m, int n> static inline
+bool operator == (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b); // same element type and shape on both sides
+
+template<typename _Tp, int m, int n> static inline
+bool operator != (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b); // same element type and shape on both sides
+
+
+/////////////////////// Vec (used as element of multi-channel images) ////////////////////
+
+/** @brief Template class for short numerical vectors, a partial case of Matx
+
+This template class represents short numerical vectors (of 1, 2, 3, 4 ... elements) on which you
+can perform basic arithmetical operations, access individual elements using [] operator etc. The
+vectors are allocated on the stack, as opposed to std::valarray, std::vector, cv::Mat etc., whose
+elements are dynamically allocated on the heap.
+
+The template takes 2 parameters:
+@tparam _Tp element type
+@tparam cn the number of elements
+
+In addition to the universal notation like Vec<float, 3>, you can use shorter aliases
+for the most popular specialized variants of Vec, e.g. Vec3f ~ Vec<float, 3>.
+
+It is possible to convert Vec\<T,2\> to/from Point_, Vec\<T,3\> to/from Point3_ , and Vec\<T,4\>
+to CvScalar or Scalar_. Use operator[] to access the elements of Vec.
+
+All the expected vector operations are also implemented:
+-   v1 = v2 + v3
+-   v1 = v2 - v3
+-   v1 = v2 \* scale
+-   v1 = scale \* v2
+-   v1 = -v2
+-   v1 += v2 and other augmenting operations
+-   v1 == v2, v1 != v2
+-   norm(v1) (euclidean norm)
+The Vec class is commonly used to describe pixel types of multi-channel arrays. See Mat for details.
+*/
+template<typename _Tp, int cn> class Vec : public Matx<_Tp, cn, 1>
+{
+public:
+    typedef _Tp value_type;
+    enum {
+           channels = cn, // number of elements, taken from the template argument
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           depth    = Matx<_Tp, cn, 1>::depth,
+           type     = CV_MAKETYPE(depth, channels),
+#endif
+           _dummy_enum_finalizer = 0 // closes the list without a trailing comma, with or without the #ifdef entries
+         };
+
+    //! default constructor
+    Vec();
+
+    Vec(_Tp v0); //!< 1-element vector constructor
+    Vec(_Tp v0, _Tp v1); //!< 2-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2); //!< 3-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3); //!< 4-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4); //!< 5-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5); //!< 6-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6); //!< 7-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7); //!< 8-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8); //!< 9-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9); //!< 10-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13); //!< 14-element vector constructor
+    explicit Vec(const _Tp* values); //!< construct from a pointer to element values
+
+    Vec(std::initializer_list<_Tp>); //!< construct from an initializer list
+
+    Vec(const Vec<_Tp, cn>& v); //!< copy constructor
+
+    static Vec all(_Tp alpha);
+    static Vec ones();
+    static Vec randn(_Tp a, _Tp b);
+    static Vec randu(_Tp a, _Tp b);
+    static Vec zeros();
+    static Vec diag(_Tp alpha) = delete; // a static diag is declared but unusable on Vec
+    static Vec eye() = delete; // a static eye is declared but unusable on Vec
+
+    //! per-element multiplication
+    Vec mul(const Vec<_Tp, cn>& v) const;
+
+    //! conjugation (makes sense for complex numbers and quaternions)
+    Vec conj() const;
+
+    /*!
+      cross product of the two 3D vectors.
+
+      For other dimensionalities an exception is raised
+    */
+    Vec cross(const Vec& v) const;
+    //! conversion to another data type
+    template<typename T2> operator Vec<T2, cn>() const;
+
+    /*! element access */
+    const _Tp& operator [](int i) const;
+    _Tp& operator[](int i);
+    const _Tp& operator ()(int i) const;
+    _Tp& operator ()(int i);
+
+    Vec<_Tp, cn>& operator=(const Vec<_Tp, cn>& rhs) = default;
+    //! constructors selected by the operation tag passed as the last argument
+    Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_AddOp);
+    Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_SubOp);
+    template<typename _T2> Vec(const Matx<_Tp, cn, 1>& a, _T2 alpha, Matx_ScaleOp);
+};
+
+/** @name Shorter aliases for the most popular specializations of Vec<T,n>
+  @{
+*/
+typedef Vec<uchar, 2> Vec2b;
+typedef Vec<uchar, 3> Vec3b;
+typedef Vec<uchar, 4> Vec4b;
+
+typedef Vec<short, 2> Vec2s;
+typedef Vec<short, 3> Vec3s;
+typedef Vec<short, 4> Vec4s;
+
+typedef Vec<ushort, 2> Vec2w;
+typedef Vec<ushort, 3> Vec3w;
+typedef Vec<ushort, 4> Vec4w;
+
+typedef Vec<int, 2> Vec2i;
+typedef Vec<int, 3> Vec3i;
+typedef Vec<int, 4> Vec4i;
+typedef Vec<int, 6> Vec6i;
+typedef Vec<int, 8> Vec8i;
+
+typedef Vec<float, 2> Vec2f;
+typedef Vec<float, 3> Vec3f;
+typedef Vec<float, 4> Vec4f;
+typedef Vec<float, 6> Vec6f;
+
+typedef Vec<double, 2> Vec2d;
+typedef Vec<double, 3> Vec3d;
+typedef Vec<double, 4> Vec4d;
+typedef Vec<double, 6> Vec6d;
+/** @} */
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn> normalize(const Vec<_Tp, cn>& v);
+
+template<typename _Tp1, typename _Tp2, int cn> static inline
+Vec<_Tp1, cn>& operator += (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b);
+
+template<typename _Tp1, typename _Tp2, int cn> static inline
+Vec<_Tp1, cn>& operator -= (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator + (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, int alpha);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, float alpha);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, double alpha);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, int alpha);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, float alpha);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, double alpha);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, int alpha);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (int alpha, const Vec<_Tp, cn>& a);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, float alpha);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (float alpha, const Vec<_Tp, cn>& a);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, double alpha);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (double alpha, const Vec<_Tp, cn>& a);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, int alpha);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, float alpha);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, double alpha);
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a);
+
+template<typename _Tp> inline
+Vec<_Tp, 4> operator * (const Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2);
+
+template<typename _Tp> inline
+Vec<_Tp, 4>& operator *= (Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2);
+
+//! @} core_basic
+
+} // cv
+
+#include "opencv2/core/matx.inl.hpp"
+
+#endif // OPENCV_CORE_MATX_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/matx.inl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/matx.inl.hpp
new file mode 100644
index 000000000000..faa3e749d62f
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/matx.inl.hpp
@@ -0,0 +1,1115 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_MATX_INL_HPP
+#define OPENCV_CORE_MATX_INL_HPP
+
+#ifndef __cplusplus
+#  error matx.inl.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core/matx.hpp"
+
+namespace cv
+{
+
+//==============================================================================
+// Helpers
+
+namespace internal
+{
+
+template<typename _Tp, int m> struct Matx_DetOp // generic determinant functor for m x m matrices (sizes 1, 2 and 3 are specialized below)
+{
+    double operator ()(const Matx<_Tp, m, m>& a) const
+    {
+        Matx<_Tp, m, m> temp = a; // LU is handed temp.val, so the input a itself is never modified
+        double p = LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0); // NOTE(review): LU is declared elsewhere; presumably it factorizes temp in place and returns 0 for a singular matrix - confirm
+        if( p == 0 )
+            return p;
+        for( int i = 0; i < m; i++ )
+            p *= temp(i, i); // multiply in each diagonal entry of the processed copy
+        return p;
+    }
+};
+
+template<typename _Tp> struct Matx_DetOp<_Tp, 1> // 1x1 case: the determinant is the single element
+{
+    double operator ()(const Matx<_Tp, 1, 1>& a) const
+    {
+        return a(0,0);
+    }
+};
+
+template<typename _Tp> struct Matx_DetOp<_Tp, 2> // 2x2 case: closed form a00*a11 - a01*a10
+{
+    double operator ()(const Matx<_Tp, 2, 2>& a) const
+    {
+        return a(0,0)*a(1,1) - a(0,1)*a(1,0); // evaluated in the element type (after the usual promotions); only the result becomes double
+    }
+};
+
+template<typename _Tp> struct Matx_DetOp<_Tp, 3> // 3x3 case: closed form, cofactor expansion along row 0
+{
+    double operator ()(const Matx<_Tp, 3, 3>& a) const // the arithmetic runs in the element type (after the usual promotions); only the result becomes double
+    {
+        return a(0,0)*(a(1,1)*a(2,2) - a(2,1)*a(1,2)) -
+            a(0,1)*(a(1,0)*a(2,2) - a(2,0)*a(1,2)) +
+            a(0,2)*(a(1,0)*a(2,1) - a(2,0)*a(1,1));
+    }
+};
+
+template<typename _Tp> Vec<_Tp, 2> inline conjugate(const Vec<_Tp, 2>& v) // 2-element conjugate: keeps element 0, negates element 1
+{
+    return Vec<_Tp, 2>(v[0], -v[1]);
+}
+
+template<typename _Tp> Vec<_Tp, 4> inline conjugate(const Vec<_Tp, 4>& v) // 4-element conjugate: keeps element 0, negates elements 1..3
+{
+    return Vec<_Tp, 4>(v[0], -v[1], -v[2], -v[3]);
+}
+
+} // internal::
+
+
+//==============================================================================
+// Matx
+
+template<typename _Tp, int m, int n> class DataType< Matx<_Tp, m, n> > // DataType specialization describing fixed-size matrices
+{
+public:
+    typedef Matx<_Tp, m, n>                               value_type;
+    typedef Matx<typename DataType<_Tp>::work_type, m, n> work_type; // same shape, element type replaced by the element's own work_type
+    typedef _Tp                                           channel_type;
+    typedef value_type                                    vec_type;
+
+    enum { generic_type = 0,
+           channels     = m * n, // every matrix element counts as one channel
+           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8) // element format code plus (channels - 1) shifted left by 8 bits
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+};
+
+
+namespace traits {
+template<typename _Tp, int m, int n>
+struct Depth< Matx<_Tp, m, n> > { enum { value = Depth<_Tp>::value }; };
+template<typename _Tp, int m, int n>
+struct Type< Matx<_Tp, m, n> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, n*m) }; };
+} // namespace
+
+
+//! @brief  Comma-separated Matrix Initializer: helper behind the "mtx << v0, v1, ..." element-filling syntax
+template<typename _Tp, int m, int n> class MatxCommaInitializer
+{
+public:
+    MatxCommaInitializer(Matx<_Tp, m, n>* _mtx); //!< stores the target pointer and starts writing at index 0
+    template<typename T2> MatxCommaInitializer<_Tp, m, n>& operator , (T2 val); //!< stores val (via saturate_cast) at the current index and advances
+    Matx<_Tp, m, n> operator *() const; //!< returns a copy of the target; debug-asserts that all m*n elements were written
+
+    Matx<_Tp, m, n>* dst; //!< matrix being filled
+    int idx; //!< index into dst->val of the next element to write
+};
+
+template<typename _Tp, typename _T2, int m, int n> static inline
+MatxCommaInitializer<_Tp, m, n> operator << (const Matx<_Tp, m, n>& mtx, _T2 val) // starts a comma-initializer sequence on mtx with val as the first element
+{
+    MatxCommaInitializer<_Tp, m, n> commaInitializer((Matx<_Tp, m, n>*)&mtx); // NOTE: the cast drops const, so the following writes land in the caller's mtx
+    return (commaInitializer, val); // calls the overloaded operator , to store val, then returns a copy of the initializer
+}
+
+template<typename _Tp, int m, int n> inline
+MatxCommaInitializer<_Tp, m, n>::MatxCommaInitializer(Matx<_Tp, m, n>* _mtx)
+    : dst(_mtx), idx(0)
+{}
+
+template<typename _Tp, int m, int n> template<typename _T2> inline
+MatxCommaInitializer<_Tp, m, n>& MatxCommaInitializer<_Tp, m, n>::operator , (_T2 value) // appends one value; returns *this so further ", value" terms chain
+{
+    CV_DbgAssert( idx < m*n ); // there must still be room for one more element
+    dst->val[idx++] = saturate_cast<_Tp>(value); // convert to the element type, then advance the write position
+    return *this;
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n> MatxCommaInitializer<_Tp, m, n>::operator *() const
+{
+    CV_DbgAssert( idx == n*m );
+    return *dst;
+}
+
+////////////////////////////////// Matx Implementation ///////////////////////////////////
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx() // default constructor: every element becomes _Tp(0)
+{
+    for(int i = 0; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0) // stores v0 as the first element and zero-fills all remaining ones
+{
+    val[0] = v0;
+    for(int i = 1; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1)
+{
+    CV_StaticAssert(channels >= 2, "Matx should have at least 2 elements.");
+    val[0] = v0; val[1] = v1;
+    for(int i = 2; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2)
+{
+    CV_StaticAssert(channels >= 3, "Matx should have at least 3 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2;
+    for(int i = 3; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3)
+{
+    CV_StaticAssert(channels >= 4, "Matx should have at least 4 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    for(int i = 4; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4)
+{
+    CV_StaticAssert(channels >= 5, "Matx should have at least 5 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; val[4] = v4;
+    for(int i = 5; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5)
+{
+    CV_StaticAssert(channels >= 6, "Matx should have at least 6 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5;
+    for(int i = 6; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6)
+{
+    CV_StaticAssert(channels >= 7, "Matx should have at least 7 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6;
+    for(int i = 7; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7)
+{
+    CV_StaticAssert(channels >= 8, "Matx should have at least 8 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    for(int i = 8; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8)
+{
+    CV_StaticAssert(channels >= 9, "Matx should have at least 9 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    val[8] = v8;
+    for(int i = 9; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9)
+{
+    CV_StaticAssert(channels >= 10, "Matx should have at least 10 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    val[8] = v8; val[9] = v9;
+    for(int i = 10; i < channels; i++) val[i] = _Tp(0);
+}
+
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11)
+{
+    CV_StaticAssert(channels >= 12, "Matx should have at least 12 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11;
+    for(int i = 12; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13)
+{
+    CV_StaticAssert(channels >= 14, "Matx should have at least 14 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11;
+    val[12] = v12; val[13] = v13;
+    for (int i = 14; i < channels; i++) val[i] = _Tp(0);
+}
+
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13, _Tp v14, _Tp v15)
+{
+    CV_StaticAssert(channels >= 16, "Matx should have at least 16 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11;
+    val[12] = v12; val[13] = v13; val[14] = v14; val[15] = v15;
+    for(int i = 16; i < channels; i++) val[i] = _Tp(0);
+}
+
+// Suppress MSVC warning C4702 (unreachable code), reported for this constructor when building with Ninja
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(push)
+#pragma warning(disable: 4702)
+#endif
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(const _Tp* values) // copies `channels` consecutive elements starting at values; the pointer itself is not checked
+{
+    for( int i = 0; i < channels; i++ ) val[i] = values[i];
+}
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(pop)
+#endif
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(std::initializer_list<_Tp> list) // fills val with the list elements in list order
+{
+    CV_DbgAssert(list.size() == channels); // NOTE(review): presumably a debug-only check; if so, a size mismatch goes unnoticed in release builds - confirm
+    int i = 0;
+    for(const auto& elem : list)
+    {
+        val[i++] = elem; // no bound check of its own: relies entirely on the assertion above
+    }
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n> Matx<_Tp, m, n>::all(_Tp alpha) // returns a matrix whose every element equals alpha
+{
+    Matx<_Tp, m, n> M;
+    for( int i = 0; i < m*n; i++ ) M.val[i] = alpha;
+    return M;
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n> Matx<_Tp,m,n>::zeros()
+{
+    return all(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n> Matx<_Tp,m,n>::ones()
+{
+    return all(1);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n> Matx<_Tp,m,n>::eye() // matrix with ones on the main diagonal and zeros elsewhere
+{
+    Matx<_Tp,m,n> M; // the default constructor zero-fills, so off-diagonal entries stay 0
+    for(int i = 0; i < shortdim; i++) // shortdim comes from the Matx class declaration; presumably min(m, n) - confirm
+        M(i,i) = 1;
+    return M;
+}
+
+template<typename _Tp, int m, int n> inline
+_Tp Matx<_Tp, m, n>::dot(const Matx<_Tp, m, n>& M) const // sum of element-wise products, accumulated in the element type _Tp
+{
+    _Tp s = 0;
+    for( int i = 0; i < channels; i++ ) s += val[i]*M.val[i];
+    return s;
+}
+
+template<typename _Tp, int m, int n> inline
+double Matx<_Tp, m, n>::ddot(const Matx<_Tp, m, n>& M) const // same sum as dot(), but accumulated in double
+{
+    double s = 0;
+    for( int i = 0; i < channels; i++ ) s += (double)val[i]*M.val[i]; // left factor is cast first, so each product is formed in double
+    return s;
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n> Matx<_Tp,m,n>::diag(const typename Matx<_Tp,m,n>::diag_type& d)
+{
+    Matx<_Tp,m,n> M;
+    for(int i = 0; i < shortdim; i++)
+        M(i,i) = d(i, 0);
+    return M;
+}
+
+template<typename _Tp, int m, int n> template<typename T2>
+inline Matx<_Tp, m, n>::operator Matx<T2, m, n>() const // element-wise conversion to a same-shape matrix of T2
+{
+    Matx<T2, m, n> M;
+    for( int i = 0; i < m*n; i++ ) M.val[i] = saturate_cast<T2>(val[i]);
+    return M;
+}
+
+template<typename _Tp, int m, int n> template<int m1, int n1> inline
+Matx<_Tp, m1, n1> Matx<_Tp, m, n>::reshape() const // same elements in the same storage order, returned as an m1 x n1 matrix
+{
+    CV_StaticAssert(m1*n1 == m*n, "Input and destination matrices must have the same number of elements");
+    return (const Matx<_Tp, m1, n1>&)*this; // reinterprets this object as the target shape; the by-value return then copies it
+}
+
+template<typename _Tp, int m, int n>
+template<int m1, int n1> inline
+Matx<_Tp, m1, n1> Matx<_Tp, m, n>::get_minor(int base_row, int base_col) const // copies the m1 x n1 sub-block whose top-left corner is (base_row, base_col)
+{
+    CV_DbgAssert(0 <= base_row && base_row+m1 <= m && 0 <= base_col && base_col+n1 <= n); // the whole sub-block must lie inside this matrix
+    Matx<_Tp, m1, n1> s;
+    for( int di = 0; di < m1; di++ )
+        for( int dj = 0; dj < n1; dj++ )
+            s(di, dj) = (*this)(base_row+di, base_col+dj);
+    return s;
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, 1, n> Matx<_Tp, m, n>::row(int i) const // returns a copy of row i as a 1 x n matrix
+{
+    CV_DbgAssert((unsigned)i < (unsigned)m); // the unsigned compare rejects negative i as well as i >= m
+    return Matx<_Tp, 1, n>(&val[i*n]); // a row is n consecutive elements starting at val[i*n]; copied by the pointer constructor
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, 1> Matx<_Tp, m, n>::col(int j) const // returns a copy of column j as an m x 1 matrix
+{
+    CV_DbgAssert((unsigned)j < (unsigned)n); // the unsigned compare rejects negative j as well as j >= n
+    Matx<_Tp, m, 1> v;
+    for( int i = 0; i < m; i++ )
+        v.val[i] = val[i*n + j]; // column elements sit n apart in the row-by-row storage
+    return v;
+}
+
+template<typename _Tp, int m, int n> inline
+typename Matx<_Tp, m, n>::diag_type Matx<_Tp, m, n>::diag() const
+{
+    diag_type d;
+    for( int i = 0; i < shortdim; i++ )
+        d.val[i] = val[i*n + i];
+    return d;
+}
+
+template<typename _Tp, int m, int n> inline
+const _Tp& Matx<_Tp, m, n>::operator()(int row_idx, int col_idx) const // read-only access to element (row_idx, col_idx)
+{
+    CV_DbgAssert( (unsigned)row_idx < (unsigned)m && (unsigned)col_idx < (unsigned)n ); // unsigned compares also reject negative indices
+    return this->val[row_idx*n + col_idx]; // rows are stored one after another, n elements each
+}
+
+template<typename _Tp, int m, int n> inline
+_Tp& Matx<_Tp, m, n>::operator ()(int row_idx, int col_idx)
+{
+    CV_DbgAssert( (unsigned)row_idx < (unsigned)m && (unsigned)col_idx < (unsigned)n );
+    return val[row_idx*n + col_idx];
+}
+
+template<typename _Tp, int m, int n> inline
+const _Tp& Matx<_Tp, m, n>::operator ()(int i) const // read-only single-index access, available only for row or column matrices
+{
+    CV_StaticAssert(m == 1 || n == 1, "Single index indexation requires matrix to be a column or a row");
+    CV_DbgAssert( (unsigned)i < (unsigned)(m+n-1) ); // with m == 1 or n == 1, m+n-1 equals the element count; negative i is rejected too
+    return val[i];
+}
+
+template<typename _Tp, int m, int n> inline
+_Tp& Matx<_Tp, m, n>::operator ()(int i)
+{
+    CV_StaticAssert(m == 1 || n == 1, "Single index indexation requires matrix to be a column or a row");
+    CV_DbgAssert( (unsigned)i < (unsigned)(m+n-1) );
+    return val[i];
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_AddOp)
+{
+    for( int i = 0; i < channels; i++ )
+        val[i] = saturate_cast<_Tp>(a.val[i] + b.val[i]);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_SubOp)
+{
+    for( int i = 0; i < channels; i++ )
+        val[i] = saturate_cast<_Tp>(a.val[i] - b.val[i]);
+}
+
+template<typename _Tp, int m, int n> template<typename _T2> inline
+Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, _T2 alpha, Matx_ScaleOp)
+{
+    for( int i = 0; i < channels; i++ )
+        val[i] = saturate_cast<_Tp>(a.val[i] * alpha);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_MulOp)
+{
+    for( int i = 0; i < channels; i++ )
+        val[i] = saturate_cast<_Tp>(a.val[i] * b.val[i]);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_DivOp) // element-wise quotient a / b; the unnamed tag only selects this overload
+{
+    for( int i = 0; i < channels; i++ )
+        val[i] = saturate_cast<_Tp>(a.val[i] / b.val[i]); // no guard against zero elements in b
+}
+
+template<typename _Tp, int m, int n> template<int l> inline
+Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b, Matx_MatMulOp) // matrix product: (m x l) * (l x n) -> (m x n)
+{
+    for( int i = 0; i < m; i++ )
+        for( int j = 0; j < n; j++ )
+        {
+            _Tp s = 0; // accumulated directly in _Tp, without saturate_cast
+            for( int k = 0; k < l; k++ )
+                s += a(i, k) * b(k, j);
+            val[i*n + j] = s;
+        }
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(const Matx<_Tp, n, m>& a, Matx_TOp) // transpose constructor: builds the m x n transpose of the n x m matrix a
+{
+    for( int i = 0; i < m; i++ )
+        for( int j = 0; j < n; j++ )
+            val[i*n + j] = a(j, i); // element (i, j) of the result is element (j, i) of a
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n> Matx<_Tp, m, n>::mul(const Matx<_Tp, m, n>& a) const
+{
+    return Matx<_Tp, m, n>(*this, a, Matx_MulOp());
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n> Matx<_Tp, m, n>::div(const Matx<_Tp, m, n>& a) const
+{
+    return Matx<_Tp, m, n>(*this, a, Matx_DivOp());
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, n, m> Matx<_Tp, m, n>::t() const
+{
+    return Matx<_Tp, n, m>(*this, Matx_TOp());
+}
+
+template<typename _Tp, int m, int n> inline
+Vec<_Tp, n> Matx<_Tp, m, n>::solve(const Vec<_Tp, m>& rhs, int method) const // Vec convenience overload built on the Matx-argument solve
+{
+    Matx<_Tp, n, 1> x = solve((const Matx<_Tp, m, 1>&)(rhs), method); // rhs is passed as its m x 1 Matx base; presumably picks the Matx overload declared in the class - confirm
+    return (Vec<_Tp, n>&)(x); // reinterprets the n x 1 result as a Vec; the by-value return copies it
+}
+
+template<typename _Tp, int m> static inline
+double determinant(const Matx<_Tp, m, m>& a)
+{
+    return cv::internal::Matx_DetOp<_Tp, m>()(a);
+}
+
+template<typename _Tp, int m, int n> static inline
+double trace(const Matx<_Tp, m, n>& a) // sum of the main-diagonal elements
+{
+    _Tp s = 0; // accumulates in _Tp; only the final sum is converted to double
+    for( int i = 0; i < std::min(m, n); i++ ) // a non-square matrix contributes min(m, n) diagonal entries
+        s += a(i,i);
+    return s;
+}
+
+template<typename _Tp, int m, int n> static inline
+double norm(const Matx<_Tp, m, n>& M)
+{
+    return std::sqrt(normL2Sqr<_Tp, double>(M.val, m*n));
+}
+
+template<typename _Tp, int m, int n> static inline
+double norm(const Matx<_Tp, m, n>& M, int normType) // norm over all m*n elements, selected by normType
+{
+    switch(normType) {
+    case NORM_INF:
+        return (double)normInf<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n);
+    case NORM_L1:
+        return (double)normL1<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n);
+    case NORM_L2SQR:
+        return (double)normL2Sqr<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n);
+    default: // any unrecognized normType is treated like NORM_L2
+    case NORM_L2:
+        return std::sqrt((double)normL2Sqr<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n)); // square root of the squared-L2 value
+    }
+}
+
+template<typename _Tp1, typename _Tp2, int m, int n> static inline
+Matx<_Tp1, m, n>& operator += (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b)
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = saturate_cast<_Tp1>(a.val[i] + b.val[i]);
+    return a;
+}
+
+template<typename _Tp1, typename _Tp2, int m, int n> static inline
+Matx<_Tp1, m, n>& operator -= (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b)
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = saturate_cast<_Tp1>(a.val[i] - b.val[i]);
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator + (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
+{
+    return Matx<_Tp, m, n>(a, b, Matx_AddOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
+{
+    return Matx<_Tp, m, n>(a, b, Matx_SubOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, int alpha)
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = saturate_cast<_Tp>(a.val[i] * alpha);
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, float alpha)
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = saturate_cast<_Tp>(a.val[i] * alpha);
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, double alpha)
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = saturate_cast<_Tp>(a.val[i] * alpha);
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, int alpha)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, float alpha)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, double alpha)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (int alpha, const Matx<_Tp, m, n>& a)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (float alpha, const Matx<_Tp, m, n>& a)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (double alpha, const Matx<_Tp, m, n>& a)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, float alpha) // divides every element of a by alpha in place and returns a
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = a.val[i] / alpha; // NOTE(review): plain implicit conversion back to _Tp, whereas operator *= uses saturate_cast<_Tp> - confirm the difference is intended
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, double alpha) // divides every element of a by alpha in place and returns a
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = a.val[i] / alpha; // NOTE(review): plain implicit conversion back to _Tp, whereas operator *= uses saturate_cast<_Tp> - confirm the difference is intended
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, float alpha) // returns a new matrix holding a divided by alpha
+{
+    return Matx<_Tp, m, n>(a, 1.f/alpha, Matx_ScaleOp()); // implemented as scaling by the reciprocal 1.f/alpha, not as a per-element division
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, double alpha)
+{
+    return Matx<_Tp, m, n>(a, 1./alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a)
+{
+    return Matx<_Tp, m, n>(a, -1, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n, int l> static inline
+Matx<_Tp, m, n> operator * (const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b)
+{
+    return Matx<_Tp, m, n>(a, b, Matx_MatMulOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Vec<_Tp, m> operator * (const Matx<_Tp, m, n>& a, const Vec<_Tp, n>& b) // matrix-vector product: (m x n) * n-vector -> m-vector
+{
+    Matx<_Tp, m, 1> c(a, b, Matx_MatMulOp()); // b takes part as its n x 1 Matx base
+    return (const Vec<_Tp, m>&)(c); // reinterprets the m x 1 product as a Vec; the by-value return copies it
+}
+
+template<typename _Tp, int m, int n> static inline
+bool operator == (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b) // exact element-wise equality (no tolerance)
+{
+    for( int i = 0; i < m*n; i++ )
+        if( a.val[i] != b.val[i] ) return false; // stop at the first differing element
+    return true;
+}
+
+template<typename _Tp, int m, int n> static inline
+bool operator != (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
+{
+    return !(a == b);
+}
+
+//==============================================================================
+// Vec
+
+template<typename _Tp, int cn> class DataType< Vec<_Tp, cn> > // DataType specialization describing fixed-size vectors
+{
+public:
+    typedef Vec<_Tp, cn>                               value_type;
+    typedef Vec<typename DataType<_Tp>::work_type, cn> work_type; // same length, element type replaced by the element's own work_type
+    typedef _Tp                                        channel_type;
+    typedef value_type                                 vec_type;
+
+    enum { generic_type = 0,
+           channels     = cn, // one channel per vector element
+           fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8), // element format code plus (channels - 1) shifted left by 8 bits
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           depth        = DataType<channel_type>::depth,
+           type         = CV_MAKETYPE(depth, channels),
+#endif
+           _dummy_enum_finalizer = 0 // lets every enumerator above end with a comma whether or not the #ifdef block is compiled
+         };
+};
+
+namespace traits {
+template<typename _Tp, int cn>
+struct Depth< Vec<_Tp, cn> > { enum { value = Depth<_Tp>::value }; };
+template<typename _Tp, int cn>
+struct Type< Vec<_Tp, cn> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, cn) }; };
+} // namespace
+
+/** @brief  Comma-separated Vec Initializer
+*/
+template<typename _Tp, int m> class VecCommaInitializer : public MatxCommaInitializer<_Tp, m, 1>
+{
+public:
+    VecCommaInitializer(Vec<_Tp, m>* _vec);
+    template<typename T2> VecCommaInitializer<_Tp, m>& operator , (T2 val);
+    Vec<_Tp, m> operator *() const;
+};
+
+template<typename _Tp, typename _T2, int cn> static inline
+VecCommaInitializer<_Tp, cn> operator << (const Vec<_Tp, cn>& vec, _T2 val) // starts a comma-initializer sequence on vec with val as the first element
+{
+    VecCommaInitializer<_Tp, cn> commaInitializer((Vec<_Tp, cn>*)&vec); // NOTE: the cast drops const, so the following writes land in the caller's vec
+    return (commaInitializer, val); // calls the overloaded operator , to store val, then returns a copy of the initializer
+}
+
+template<typename _Tp, int cn> inline
+VecCommaInitializer<_Tp, cn>::VecCommaInitializer(Vec<_Tp, cn>* _vec)
+    : MatxCommaInitializer<_Tp, cn, 1>(_vec)
+{}
+
+template<typename _Tp, int cn> template<typename _T2> inline
+VecCommaInitializer<_Tp, cn>& VecCommaInitializer<_Tp, cn>::operator , (_T2 value)
+{
+    CV_DbgAssert( this->idx < cn );
+    this->dst->val[this->idx++] = saturate_cast<_Tp>(value);
+    return *this;
+}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn> VecCommaInitializer<_Tp, cn>::operator *() const
+{
+    CV_DbgAssert( this->idx == cn );
+    return *this->dst;
+}
+
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec() {} // nothing to do here: the Matx base default constructor zero-fills the elements
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0)
+    : Matx<_Tp, cn, 1>(v0) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1)
+    : Matx<_Tp, cn, 1>(v0, v1) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2)
+    : Matx<_Tp, cn, 1>(v0, v1, v2) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(const _Tp* values)
+    : Matx<_Tp, cn, 1>(values) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(std::initializer_list<_Tp> list)
+    : Matx<_Tp, cn, 1>(list) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(const Vec<_Tp, cn>& m) // copy constructor
+    : Matx<_Tp, cn, 1>(m.val) {} // copies element by element through the Matx pointer constructor
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_AddOp op)
+    : Matx<_Tp, cn, 1>(a, b, op) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_SubOp op)
+    : Matx<_Tp, cn, 1>(a, b, op) {}
+
+template<typename _Tp, int cn> template<typename _T2> inline
+Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, _T2 alpha, Matx_ScaleOp op)
+    : Matx<_Tp, cn, 1>(a, alpha, op) {}
+
+template<typename _Tp, int cn> inline // builds a Vec whose cn elements all equal alpha
+Vec<_Tp, cn> Vec<_Tp, cn>::all(_Tp alpha)
+{
+    Vec<_Tp, cn> filled;
+    for( int k = 0; k < cn; ++k ) filled.val[k] = alpha;
+    return filled;
+}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn> Vec<_Tp, cn>::ones()
+{
+    return Vec::all(1);
+}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn> Vec<_Tp, cn>::zeros()
+{
+    return Vec::all(0);
+}
+
+template<typename _Tp, int cn> inline // element-wise product of *this and v, each element saturated to _Tp
+Vec<_Tp, cn> Vec<_Tp, cn>::mul(const Vec<_Tp, cn>& v) const
+{
+    Vec<_Tp, cn> product;
+    for( int k = 0; k < cn; ++k ) product.val[k] = saturate_cast<_Tp>(this->val[k] * v.val[k]);
+    return product;
+}
+
+template<> inline
+Vec<float, 2> Vec<float, 2>::conj() const
+{
+    return cv::internal::conjugate(*this);
+}
+
+template<> inline
+Vec<double, 2> Vec<double, 2>::conj() const
+{
+    return cv::internal::conjugate(*this);
+}
+
+template<> inline
+Vec<float, 4> Vec<float, 4>::conj() const
+{
+    return cv::internal::conjugate(*this);
+}
+
+template<> inline
+Vec<double, 4> Vec<double, 4>::conj() const
+{
+    return cv::internal::conjugate(*this);
+}
+
+template<typename _Tp, int cn> inline // generic cn == 3 cross product, saturated to _Tp; float/double use the specializations below
+Vec<_Tp, cn> Vec<_Tp, cn>::cross(const Vec<_Tp, cn>& v) const
+{
+    CV_StaticAssert(cn == 3, "for arbitrary-size vector there is no cross-product defined"); // rejects every other size at compile time
+    return Vec<_Tp, cn>(saturate_cast<_Tp>(this->val[1]*v.val[2] - this->val[2]*v.val[1]), saturate_cast<_Tp>(this->val[2]*v.val[0] - this->val[0]*v.val[2]), saturate_cast<_Tp>(this->val[0]*v.val[1] - this->val[1]*v.val[0])); // same component formulas as the float/double specializations
+}
+
+template<> inline
+Vec<float, 3> Vec<float, 3>::cross(const Vec<float, 3>& v) const
+{
+    return Vec<float,3>(this->val[1]*v.val[2] - this->val[2]*v.val[1],
+                     this->val[2]*v.val[0] - this->val[0]*v.val[2],
+                     this->val[0]*v.val[1] - this->val[1]*v.val[0]);
+}
+
+template<> inline
+Vec<double, 3> Vec<double, 3>::cross(const Vec<double, 3>& v) const
+{
+    return Vec<double,3>(this->val[1]*v.val[2] - this->val[2]*v.val[1],
+                     this->val[2]*v.val[0] - this->val[0]*v.val[2],
+                     this->val[0]*v.val[1] - this->val[1]*v.val[0]);
+}
+
+template<typename _Tp, int cn> template<typename T2> inline // converts to Vec<T2, cn>, saturating each element to T2
+Vec<_Tp, cn>::operator Vec<T2, cn>() const
+{
+    Vec<T2, cn> converted;
+    for( int k = 0; k < cn; ++k ) converted.val[k] = saturate_cast<T2>(this->val[k]);
+    return converted;
+}
+
+template<typename _Tp, int cn> inline
+const _Tp& Vec<_Tp, cn>::operator [](int i) const
+{
+    CV_DbgAssert( (unsigned)i < (unsigned)cn );
+    return this->val[i];
+}
+
+template<typename _Tp, int cn> inline
+_Tp& Vec<_Tp, cn>::operator [](int i)
+{
+    CV_DbgAssert( (unsigned)i < (unsigned)cn );
+    return this->val[i];
+}
+
+template<typename _Tp, int cn> inline
+const _Tp& Vec<_Tp, cn>::operator ()(int i) const
+{
+    CV_DbgAssert( (unsigned)i < (unsigned)cn );
+    return this->val[i];
+}
+
+template<typename _Tp, int cn> inline
+_Tp& Vec<_Tp, cn>::operator ()(int i)
+{
+    CV_DbgAssert( (unsigned)i < (unsigned)cn );
+    return this->val[i];
+}
+
+template<typename _Tp, int cn> inline // returns v scaled by the reciprocal of norm(v)
+Vec<_Tp, cn> normalize(const Vec<_Tp, cn>& v)
+{
+    double nv = norm(v); // result of norm(v), held as double
+    return v * (nv ? 1./nv : 0.); // a zero nv scales by 0 instead of dividing by zero
+}
+
+template<typename _Tp1, typename _Tp2, int cn> static inline // in-place element-wise add; each sum is saturated to _Tp1
+Vec<_Tp1, cn>& operator += (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b)
+{
+    for( int k = 0; k < cn; ++k )
+        a.val[k] = saturate_cast<_Tp1>(a.val[k] + b.val[k]);
+    return a; // the modified left operand
+}
+
+template<typename _Tp1, typename _Tp2, int cn> static inline // in-place element-wise subtract; each difference is saturated to _Tp1
+Vec<_Tp1, cn>& operator -= (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b)
+{
+    for( int k = 0; k < cn; ++k )
+        a.val[k] = saturate_cast<_Tp1>(a.val[k] - b.val[k]);
+    return a; // the modified left operand
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator + (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b)
+{
+    return Vec<_Tp, cn>(a, b, Matx_AddOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b)
+{
+    return Vec<_Tp, cn>(a, b, Matx_SubOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, int alpha)
+{
+    for( int i = 0; i < cn; i++ )
+        a[i] = saturate_cast<_Tp>(a[i]*alpha);
+    return a;
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, float alpha)
+{
+    for( int i = 0; i < cn; i++ )
+        a[i] = saturate_cast<_Tp>(a[i]*alpha);
+    return a;
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, double alpha)
+{
+    for( int i = 0; i < cn; i++ )
+        a[i] = saturate_cast<_Tp>(a[i]*alpha);
+    return a;
+}
+
+template<typename _Tp, int cn> static inline // in-place division of every element by an int
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, int alpha)
+{
+    double ialpha = 1./alpha; // reciprocal taken once, in double; alpha == 0 is not checked here
+    for( int i = 0; i < cn; i++ )
+        a[i] = saturate_cast<_Tp>(a[i]*ialpha); // divide by multiplying with the reciprocal, then saturate to _Tp
+    return a;
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, float alpha)
+{
+    float ialpha = 1.f/alpha;
+    for( int i = 0; i < cn; i++ )
+        a[i] = saturate_cast<_Tp>(a[i]*ialpha);
+    return a;
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, double alpha)
+{
+    double ialpha = 1./alpha;
+    for( int i = 0; i < cn; i++ )
+        a[i] = saturate_cast<_Tp>(a[i]*ialpha);
+    return a;
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, int alpha)
+{
+    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (int alpha, const Vec<_Tp, cn>& a)
+{
+    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, float alpha)
+{
+    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (float alpha, const Vec<_Tp, cn>& a)
+{
+    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, double alpha)
+{
+    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (double alpha, const Vec<_Tp, cn>& a)
+{
+    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, int alpha)
+{
+    return Vec<_Tp, cn>(a, 1./alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, float alpha)
+{
+    return Vec<_Tp, cn>(a, 1.f/alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, double alpha)
+{
+    return Vec<_Tp, cn>(a, 1./alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline // unary minus: negates every element, saturating to _Tp
+Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a)
+{
+    Vec<_Tp, cn> negated;
+    for( int k = 0; k < cn; ++k ) negated.val[k] = saturate_cast<_Tp>(-a.val[k]);
+    return negated;
+}
+
+template<typename _Tp> inline Vec<_Tp, 4> operator * (const Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2) // Hamilton (quaternion) product; element 0 is the scalar part
+{
+    return Vec<_Tp, 4>(saturate_cast<_Tp>(v1[0]*v2[0] - v1[1]*v2[1] - v1[2]*v2[2] - v1[3]*v2[3]), // scalar component
+                       saturate_cast<_Tp>(v1[0]*v2[1] + v1[1]*v2[0] + v1[2]*v2[3] - v1[3]*v2[2]), // 1st imaginary component
+                       saturate_cast<_Tp>(v1[0]*v2[2] - v1[1]*v2[3] + v1[2]*v2[0] + v1[3]*v2[1]), // 2nd imaginary component
+                       saturate_cast<_Tp>(v1[0]*v2[3] + v1[1]*v2[2] - v1[2]*v2[1] + v1[3]*v2[0])); // 3rd imaginary component
+}
+
+template<typename _Tp> inline Vec<_Tp, 4>& operator *= (Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2) // in-place form of v1 * v2 for 4-element vectors
+{
+    v1 = v1 * v2; // NOTE(review): presumably resolves to the 4-element operator * defined just above, i.e. not an element-wise product - confirm
+    return v1;
+}
+
+} // cv::
+
+#endif // OPENCV_CORE_MATX_INL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/neon_utils.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/neon_utils.hpp
new file mode 100644
index 000000000000..573ba99ec356
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/neon_utils.hpp
@@ -0,0 +1,128 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HAL_NEON_UTILS_HPP
+#define OPENCV_HAL_NEON_UTILS_HPP
+
+#include "opencv2/core/cvdef.h"
+
+//! @addtogroup core_utils_neon
+//! @{
+
+#if CV_NEON
+
+inline int32x2_t cv_vrnd_s32_f32(float32x2_t v) // adds 0.5 carrying the sign of v to each lane, then converts to int32
+{
+    const int32x2_t v_sign = vreinterpret_s32_u32(vdup_n_u32(0x80000000u)), // sign-bit mask, built without shifting into the sign bit
+        v_05 = vreinterpret_s32_f32(vdup_n_f32(0.5f)); // bit pattern of 0.5f; plain const locals, no function-local static state
+
+    int32x2_t v_addition = vorr_s32(v_05, vand_s32(v_sign, vreinterpret_s32_f32(v))); // 0.5f with the sign bit of v
+    return vcvt_s32_f32(vadd_f32(v, vreinterpret_f32_s32(v_addition)));
+}
+
+inline int32x4_t cv_vrndq_s32_f32(float32x4_t v) // 4-lane variant: adds 0.5 carrying the sign of v to each lane, then converts to int32
+{
+    const int32x4_t v_sign = vreinterpretq_s32_u32(vdupq_n_u32(0x80000000u)), // sign-bit mask, built without shifting into the sign bit
+        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f)); // bit pattern of 0.5f; plain const locals, no function-local static state
+
+    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(v))); // 0.5f with the sign bit of v
+    return vcvtq_s32_f32(vaddq_f32(v, vreinterpretq_f32_s32(v_addition)));
+}
+
+inline uint32x2_t cv_vrnd_u32_f32(float32x2_t v) // adds 0.5 to each lane, then converts to uint32
+{
+    const float32x2_t v_05 = vdup_n_f32(0.5f); // plain const local: a constant needs no function-local static
+    return vcvt_u32_f32(vadd_f32(v, v_05));
+}
+
+inline uint32x4_t cv_vrndq_u32_f32(float32x4_t v) // 4-lane variant: adds 0.5 to each lane, then converts to uint32
+{
+    const float32x4_t v_05 = vdupq_n_f32(0.5f); // plain const local: a constant needs no function-local static
+    return vcvtq_u32_f32(vaddq_f32(v, v_05));
+}
+
+inline float32x4_t cv_vrecpq_f32(float32x4_t val) // NOTE(review): presumably an approximate per-lane 1/val (ARM vrecpe/vrecps pair) - confirm against ARM docs
+{
+    float32x4_t reciprocal = vrecpeq_f32(val); // starting value from vrecpeq_f32
+    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); // 1st correction: multiply by the vrecpsq_f32(val, reciprocal) factor
+    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); // 2nd, identical correction on the updated value
+    return reciprocal;
+}
+
+inline float32x2_t cv_vrecp_f32(float32x2_t val) // 2-lane counterpart of cv_vrecpq_f32; NOTE(review): presumably an approximate 1/val - confirm
+{
+    float32x2_t reciprocal = vrecpe_f32(val); // starting value from vrecpe_f32
+    reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal); // 1st correction: multiply by the vrecps_f32(val, reciprocal) factor
+    reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal); // 2nd, identical correction on the updated value
+    return reciprocal;
+}
+
+inline float32x4_t cv_vrsqrtq_f32(float32x4_t val) // NOTE(review): presumably an approximate per-lane 1/sqrt(val) (ARM vrsqrte/vrsqrts pair) - confirm
+{
+    float32x4_t e = vrsqrteq_f32(val); // starting value from vrsqrteq_f32
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e); // 1st correction: multiply e by the vrsqrtsq_f32(e*e, val) factor
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e); // 2nd, identical correction on the updated e
+    return e;
+}
+
+inline float32x2_t cv_vrsqrt_f32(float32x2_t val) // 2-lane counterpart of cv_vrsqrtq_f32; NOTE(review): presumably an approximate 1/sqrt(val) - confirm
+{
+    float32x2_t e = vrsqrte_f32(val); // starting value from vrsqrte_f32
+    e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e); // 1st correction: multiply e by the vrsqrts_f32(e*e, val) factor
+    e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e); // 2nd, identical correction on the updated e
+    return e;
+}
+
+inline float32x4_t cv_vsqrtq_f32(float32x4_t val) // composes the two 4-lane helpers: cv_vrecpq_f32 applied to cv_vrsqrtq_f32(val)
+{
+    return cv_vrecpq_f32(cv_vrsqrtq_f32(val)); // no dedicated square-root intrinsic is used here
+}
+
+inline float32x2_t cv_vsqrt_f32(float32x2_t val) // composes the two 2-lane helpers: cv_vrecp_f32 applied to cv_vrsqrt_f32(val)
+{
+    return cv_vrecp_f32(cv_vrsqrt_f32(val)); // no dedicated square-root intrinsic is used here
+}
+
+#endif
+
+//! @}
+
+#endif // OPENCV_HAL_NEON_UTILS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/ocl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/ocl.hpp
new file mode 100644
index 000000000000..891fd678b74f
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/ocl.hpp
@@ -0,0 +1,923 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_OPENCL_HPP
+#define OPENCV_OPENCL_HPP
+
+#include "opencv2/core.hpp"
+#include <typeinfo>
+#include <typeindex>
+
+namespace cv { namespace ocl {
+
+//! @addtogroup core_opencl
+//! @{
+
+CV_EXPORTS_W bool haveOpenCL();
+CV_EXPORTS_W bool useOpenCL();
+CV_EXPORTS_W bool haveAmdBlas();
+CV_EXPORTS_W bool haveAmdFft();
+CV_EXPORTS_W void setUseOpenCL(bool flag);
+CV_EXPORTS_W void finish();
+
+CV_EXPORTS bool haveSVM();
+
+class CV_EXPORTS Context;
+class CV_EXPORTS_W_SIMPLE Device;
+class CV_EXPORTS Kernel;
+class CV_EXPORTS Program;
+class CV_EXPORTS ProgramSource;
+class CV_EXPORTS Queue;
+class CV_EXPORTS PlatformInfo;
+class CV_EXPORTS Image2D;
+
+class CV_EXPORTS_W_SIMPLE Device
+{
+public:
+    CV_WRAP Device() CV_NOEXCEPT;
+    explicit Device(void* d);
+    Device(const Device& d);
+    Device& operator = (const Device& d);
+    Device(Device&& d) CV_NOEXCEPT;
+    Device& operator = (Device&& d) CV_NOEXCEPT;
+    CV_WRAP ~Device();
+
+    void set(void* d);
+
+    enum
+    {
+        TYPE_DEFAULT     = (1 << 0),
+        TYPE_CPU         = (1 << 1),
+        TYPE_GPU         = (1 << 2),
+        TYPE_ACCELERATOR = (1 << 3),
+        TYPE_DGPU        = TYPE_GPU + (1 << 16),
+        TYPE_IGPU        = TYPE_GPU + (1 << 17),
+        TYPE_ALL         = 0xFFFFFFFF
+    };
+
+    CV_WRAP String name() const;
+    CV_WRAP String extensions() const;
+    CV_WRAP bool isExtensionSupported(const String& extensionName) const;
+    CV_WRAP String version() const;
+    CV_WRAP String vendorName() const;
+    CV_WRAP String OpenCL_C_Version() const;
+    CV_WRAP String OpenCLVersion() const;
+    CV_WRAP int deviceVersionMajor() const;
+    CV_WRAP int deviceVersionMinor() const;
+    CV_WRAP String driverVersion() const;
+    void* ptr() const;
+
+    CV_WRAP int type() const;
+
+    CV_WRAP int addressBits() const;
+    CV_WRAP bool available() const;
+    CV_WRAP bool compilerAvailable() const;
+    CV_WRAP bool linkerAvailable() const;
+
+    enum
+    {
+        FP_DENORM=(1 << 0),
+        FP_INF_NAN=(1 << 1),
+        FP_ROUND_TO_NEAREST=(1 << 2),
+        FP_ROUND_TO_ZERO=(1 << 3),
+        FP_ROUND_TO_INF=(1 << 4),
+        FP_FMA=(1 << 5),
+        FP_SOFT_FLOAT=(1 << 6),
+        FP_CORRECTLY_ROUNDED_DIVIDE_SQRT=(1 << 7)
+    };
+    CV_WRAP int doubleFPConfig() const;
+    CV_WRAP int singleFPConfig() const;
+    CV_WRAP int halfFPConfig() const;
+
+    /// true if 'cl_khr_fp64' extension is available
+    CV_WRAP bool hasFP64() const;
+    /// true if 'cl_khr_fp16' extension is available
+    CV_WRAP bool hasFP16() const;
+
+    CV_WRAP bool endianLittle() const;
+    CV_WRAP bool errorCorrectionSupport() const;
+
+    enum
+    {
+        EXEC_KERNEL=(1 << 0),
+        EXEC_NATIVE_KERNEL=(1 << 1)
+    };
+    CV_WRAP int executionCapabilities() const;
+
+    CV_WRAP size_t globalMemCacheSize() const;
+
+    enum
+    {
+        NO_CACHE=0,
+        READ_ONLY_CACHE=1,
+        READ_WRITE_CACHE=2
+    };
+    CV_WRAP int globalMemCacheType() const;
+    CV_WRAP int globalMemCacheLineSize() const;
+    CV_WRAP size_t globalMemSize() const;
+
+    CV_WRAP size_t localMemSize() const;
+    enum
+    {
+        NO_LOCAL_MEM=0,
+        LOCAL_IS_LOCAL=1,
+        LOCAL_IS_GLOBAL=2
+    };
+    CV_WRAP int localMemType() const;
+    CV_WRAP bool hostUnifiedMemory() const;
+
+    CV_WRAP bool imageSupport() const;
+
+    CV_WRAP bool imageFromBufferSupport() const;
+    uint imagePitchAlignment() const;
+    uint imageBaseAddressAlignment() const;
+
+    /// deprecated, use isExtensionSupported() method (probably with "cl_khr_subgroups" value)
+    CV_WRAP bool intelSubgroupsSupport() const;
+
+    CV_WRAP size_t image2DMaxWidth() const;
+    CV_WRAP size_t image2DMaxHeight() const;
+
+    CV_WRAP size_t image3DMaxWidth() const;
+    CV_WRAP size_t image3DMaxHeight() const;
+    CV_WRAP size_t image3DMaxDepth() const;
+
+    CV_WRAP size_t imageMaxBufferSize() const;
+    CV_WRAP size_t imageMaxArraySize() const;
+
+    enum
+    {
+        UNKNOWN_VENDOR=0,
+        VENDOR_AMD=1,
+        VENDOR_INTEL=2,
+        VENDOR_NVIDIA=3
+    };
+    CV_WRAP int vendorID() const;
+    // FIXIT
+    // dev.isAMD() doesn't work for OpenCL CPU devices from AMD OpenCL platform.
+    // This method should use platform name instead of vendor name.
+    // After fix restore code in arithm.cpp: ocl_compare()
+    CV_WRAP inline bool isAMD() const { return vendorID() == VENDOR_AMD; }
+    CV_WRAP inline bool isIntel() const { return vendorID() == VENDOR_INTEL; }
+    CV_WRAP inline bool isNVidia() const { return vendorID() == VENDOR_NVIDIA; }
+
+    CV_WRAP int maxClockFrequency() const;
+    CV_WRAP int maxComputeUnits() const;
+    CV_WRAP int maxConstantArgs() const;
+    CV_WRAP size_t maxConstantBufferSize() const;
+
+    CV_WRAP size_t maxMemAllocSize() const;
+    CV_WRAP size_t maxParameterSize() const;
+
+    CV_WRAP int maxReadImageArgs() const;
+    CV_WRAP int maxWriteImageArgs() const;
+    CV_WRAP int maxSamplers() const;
+
+    CV_WRAP size_t maxWorkGroupSize() const;
+    CV_WRAP int maxWorkItemDims() const;
+    void maxWorkItemSizes(size_t*) const;
+
+    CV_WRAP int memBaseAddrAlign() const;
+
+    CV_WRAP int nativeVectorWidthChar() const;
+    CV_WRAP int nativeVectorWidthShort() const;
+    CV_WRAP int nativeVectorWidthInt() const;
+    CV_WRAP int nativeVectorWidthLong() const;
+    CV_WRAP int nativeVectorWidthFloat() const;
+    CV_WRAP int nativeVectorWidthDouble() const;
+    CV_WRAP int nativeVectorWidthHalf() const;
+
+    CV_WRAP int preferredVectorWidthChar() const;
+    CV_WRAP int preferredVectorWidthShort() const;
+    CV_WRAP int preferredVectorWidthInt() const;
+    CV_WRAP int preferredVectorWidthLong() const;
+    CV_WRAP int preferredVectorWidthFloat() const;
+    CV_WRAP int preferredVectorWidthDouble() const;
+    CV_WRAP int preferredVectorWidthHalf() const;
+
+    CV_WRAP size_t printfBufferSize() const;
+    CV_WRAP size_t profilingTimerResolution() const;
+
+    CV_WRAP static const Device& getDefault();
+
+    /**
+     * @param d OpenCL handle (cl_device_id). clRetainDevice() is called on success.
+     *
+     * @note Ownership of the passed device is transferred to OpenCV on success.
+     * The caller should additionally call `clRetainDevice` on it if it intends
+     * to continue using the device.
+     */
+    static Device fromHandle(void* d);
+
+    struct Impl;
+    inline Impl* getImpl() const { return (Impl*)p; }
+    inline bool empty() const { return !p; }
+protected:
+    Impl* p;
+};
+
+
+class CV_EXPORTS Context
+{
+public:
+    Context() CV_NOEXCEPT;
+    explicit Context(int dtype);  //!< @deprecated
+    ~Context();
+    Context(const Context& c);
+    Context& operator= (const Context& c);
+    Context(Context&& c) CV_NOEXCEPT;
+    Context& operator = (Context&& c) CV_NOEXCEPT;
+
+    /** @deprecated */
+    bool create();
+    /** @deprecated */
+    bool create(int dtype);
+
+    size_t ndevices() const;
+    Device& device(size_t idx) const;
+    Program getProg(const ProgramSource& prog,
+                    const String& buildopt, String& errmsg);
+    void unloadProg(Program& prog);
+
+
+    /** Get thread-local OpenCL context (initialize if necessary) */
+#if 0  // OpenCV 5.0
+    static Context& getDefault();
+#else
+    static Context& getDefault(bool initialize = true);
+#endif
+
+    /** @returns cl_context value */
+    void* ptr() const;
+
+    /**
+     * @brief Get OpenCL context property specified on context creation
+     * @param propertyId Property id (CL_CONTEXT_* as defined in cl_context_properties type)
+     * @returns Property value if property was specified on clCreateContext, or NULL if context created without the property
+     */
+    void* getOpenCLContextProperty(int propertyId) const;
+
+    bool useSVM() const;
+    void setUseSVM(bool enabled);
+
+    /**
+     * @param context OpenCL handle (cl_context). clRetainContext() is called on success
+     */
+    static Context fromHandle(void* context);
+    static Context fromDevice(const ocl::Device& device);
+    static Context create(const std::string& configuration);
+
+    void release();
+
+    class CV_EXPORTS UserContext {
+    public:
+        virtual ~UserContext();
+    };
+    template <typename T>
+    inline void setUserContext(const std::shared_ptr<T>& userContext) {
+        setUserContext(typeid(T), userContext);
+    }
+    template <typename T>
+    inline std::shared_ptr<T> getUserContext() {
+        return std::dynamic_pointer_cast<T>(getUserContext(typeid(T)));
+    }
+    void setUserContext(std::type_index typeId, const std::shared_ptr<UserContext>& userContext);
+    std::shared_ptr<UserContext> getUserContext(std::type_index typeId);
+
+    struct Impl;
+    inline Impl* getImpl() const { return (Impl*)p; }
+    inline bool empty() const { return !p; }
+// TODO OpenCV 5.0
+//protected:
+    Impl* p;
+};
+
+/** @deprecated Handle class holding a single pointer to an implementation object. */
+class CV_EXPORTS Platform
+{
+public:
+    Platform() CV_NOEXCEPT;
+    ~Platform();
+    Platform(const Platform& p);
+    Platform& operator = (const Platform& p);
+    Platform(Platform&& p) CV_NOEXCEPT;
+    Platform& operator = (Platform&& p) CV_NOEXCEPT;
+
+    void* ptr() const; // NOTE(review): presumably the underlying native platform handle - confirm in the implementation
+
+    /** @deprecated */
+    static Platform& getDefault();
+
+    struct Impl; // only forward-declared in this header
+    inline Impl* getImpl() const { return (Impl*)p; } // direct access to the implementation pointer
+    inline bool empty() const { return !p; } // true when no implementation object is attached
+protected:
+    Impl* p; // implementation pointer; null means empty()
+};
+
+/** @brief Attaches an OpenCL context to OpenCV
+@note
+  OpenCV checks that an available OpenCL platform is named platformName, then assigns the context to
+  OpenCV and calls the `clRetainContext` function. The deviceID device is used as the target device and
+  a new command queue is created.
+@param platformName name of the OpenCL platform to attach; this string is used to check whether the platform is available to OpenCV at runtime
+@param platformID ID of the platform the attached context was created for
+@param context OpenCL context to be attached to OpenCV
+@param deviceID ID of the device; it must be created from the attached context
+*/
+CV_EXPORTS void attachContext(const String& platformName, void* platformID, void* context, void* deviceID);
+
+/** @brief Convert an OpenCL buffer to a UMat
+@note
+  The OpenCL buffer (cl_mem_buffer) should contain 2D image data compatible with OpenCV. The memory
+  content is not copied from `clBuffer` to the UMat. Instead, the buffer handle is assigned to the UMat and
+  `clRetainMemObject` is called.
+@param cl_mem_buffer source clBuffer handle
+@param step number of bytes in a single row
+@param rows number of rows
+@param cols number of columns
+@param type OpenCV type of the image
+@param dst destination UMat
+*/
+CV_EXPORTS void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst);
+
+/** @brief Convert an OpenCL image2d_t to a UMat
+@note
+  The OpenCL `image2d_t` (cl_mem_image) should be compatible with OpenCV UMat formats. The memory content
+  is copied from the image to the UMat with the `clEnqueueCopyImageToBuffer` function.
+@param cl_mem_image source image2d_t handle
+@param dst destination UMat
+*/
+CV_EXPORTS void convertFromImage(void* cl_mem_image, UMat& dst);
+
+// TODO Move to internal header
+/// @deprecated
+void initializeContextFromHandle(Context& ctx, void* platform, void* context, void* device);
+
+class CV_EXPORTS Queue // handle class holding a single pointer to an implementation object
+{
+public:
+    Queue() CV_NOEXCEPT;
+    explicit Queue(const Context& c, const Device& d=Device());
+    ~Queue();
+    Queue(const Queue& q);
+    Queue& operator = (const Queue& q);
+    Queue(Queue&& q) CV_NOEXCEPT;
+    Queue& operator = (Queue&& q) CV_NOEXCEPT;
+
+    bool create(const Context& c=Context(), const Device& d=Device()); // both arguments default to default-constructed objects
+    void finish();
+    void* ptr() const; // NOTE(review): presumably the underlying native queue handle - confirm in the implementation
+    static Queue& getDefault();
+
+    /// @brief Returns an OpenCL command queue with profiling mode enabled
+    const Queue& getProfilingQueue() const;
+
+    struct Impl; friend struct Impl; // Impl is only forward-declared in this header
+    inline Impl* getImpl() const { return p; } // direct access to the implementation pointer
+    inline bool empty() const { return !p; } // true when no implementation object is attached
+protected:
+    Impl* p; // implementation pointer; null means empty()
+};
+
+
+class CV_EXPORTS KernelArg
+{
+public:
+    enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8, PTR_ONLY = 16, NO_SIZE=256 };
+    KernelArg(int _flags, UMat* _m, int wscale=1, int iwscale=1, const void* _obj=0, size_t _sz=0);
+    KernelArg() CV_NOEXCEPT;
+
+    static KernelArg Local(size_t localMemSize)
+    { return KernelArg(LOCAL, 0, 1, 1, 0, localMemSize); }
+    static KernelArg PtrWriteOnly(const UMat& m)
+    { return KernelArg(PTR_ONLY+WRITE_ONLY, (UMat*)&m); }
+    static KernelArg PtrReadOnly(const UMat& m)
+    { return KernelArg(PTR_ONLY+READ_ONLY, (UMat*)&m); }
+    static KernelArg PtrReadWrite(const UMat& m)
+    { return KernelArg(PTR_ONLY+READ_WRITE, (UMat*)&m); }
+    static KernelArg ReadWrite(const UMat& m, int wscale=1, int iwscale=1)
+    { return KernelArg(READ_WRITE, (UMat*)&m, wscale, iwscale); }
+    static KernelArg ReadWriteNoSize(const UMat& m, int wscale=1, int iwscale=1)
+    { return KernelArg(READ_WRITE+NO_SIZE, (UMat*)&m, wscale, iwscale); }
+    static KernelArg ReadOnly(const UMat& m, int wscale=1, int iwscale=1)
+    { return KernelArg(READ_ONLY, (UMat*)&m, wscale, iwscale); }
+    static KernelArg WriteOnly(const UMat& m, int wscale=1, int iwscale=1)
+    { return KernelArg(WRITE_ONLY, (UMat*)&m, wscale, iwscale); }
+    static KernelArg ReadOnlyNoSize(const UMat& m, int wscale=1, int iwscale=1)
+    { return KernelArg(READ_ONLY+NO_SIZE, (UMat*)&m, wscale, iwscale); }
+    static KernelArg WriteOnlyNoSize(const UMat& m, int wscale=1, int iwscale=1)
+    { return KernelArg(WRITE_ONLY+NO_SIZE, (UMat*)&m, wscale, iwscale); }
+    static KernelArg Constant(const Mat& m);
+    template<typename _Tp> static KernelArg Constant(const _Tp* arr, size_t n)
+    { return KernelArg(CONSTANT, 0, 1, 1, (void*)arr, n); }
+
+    int flags;
+    UMat* m;
+    const void* obj;
+    size_t sz;
+    int wscale, iwscale;
+};
+
+
+class CV_EXPORTS Kernel
+{
+public:
+    Kernel() CV_NOEXCEPT;
+    Kernel(const char* kname, const Program& prog);
+    Kernel(const char* kname, const ProgramSource& prog,
+           const String& buildopts = String(), String* errmsg=0);
+    ~Kernel();
+    Kernel(const Kernel& k);
+    Kernel& operator = (const Kernel& k);
+    Kernel(Kernel&& k) CV_NOEXCEPT;
+    Kernel& operator = (Kernel&& k) CV_NOEXCEPT;
+
+    bool empty() const;
+    bool create(const char* kname, const Program& prog);
+    bool create(const char* kname, const ProgramSource& prog,
+                const String& buildopts, String* errmsg=0);
+
+    int set(int i, const void* value, size_t sz);
+    int set(int i, const Image2D& image2D);
+    int set(int i, const UMat& m);
+    int set(int i, const KernelArg& arg);
+    template<typename _Tp> int set(int i, const _Tp& value)
+    { return set(i, &value, sizeof(value)); }
+
+
+protected:
+    template<typename _Tp0> inline
+    int set_args_(int i, const _Tp0& a0) { return set(i, a0); }
+    template<typename _Tp0, typename... _Tps> inline
+    int set_args_(int i, const _Tp0& a0, const _Tps&... rest_args) { i = set(i, a0); return set_args_(i, rest_args...); }
+public:
+    /** @brief Set up OpenCL kernel arguments.
+    Prefer this over calling the set(i, ...) methods directly.
+    @code
+    bool ok = kernel
+        .args(
+            srcUMat, dstUMat,
+            (float)some_float_param
+        ).run(ndims, globalSize, localSize);
+    if (!ok) return false;
+    @endcode
+    */
+    template<typename... _Tps> inline
+    Kernel& args(const _Tps&... kernel_args) { set_args_(0, kernel_args...); return *this; }
+
+    /** @brief Run the OpenCL kernel (globalsize value may be adjusted)
+
+    @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
+    @param globalsize work items for each dimension. It is not the final globalsize passed to
+      OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding
+      value in localsize. If localsize is NULL, it will still be adjusted depending on dims. The
+      adjusted values are greater than or equal to the original values.
+    @param localsize work-group size for each dimension.
+    @param sync specify whether to wait for OpenCL computation to finish before return.
+    @param q command queue
+
+    @note Use run_() if your kernel code doesn't support adjusted globalsize.
+    */
+    bool run(int dims, size_t globalsize[],
+             size_t localsize[], bool sync, const Queue& q=Queue());
+
+    /** @brief Run the OpenCL kernel
+     *
+     * @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
+     * @param globalsize work items for each dimension. This value is passed to OpenCL without changes.
+     * @param localsize work-group size for each dimension.
+     * @param sync specify whether to wait for OpenCL computation to finish before return.
+     * @param q command queue
+     */
+    bool run_(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q=Queue());
+
+    bool runTask(bool sync, const Queue& q=Queue());
+
+    /** @brief Similar to synchronized run_() call with returning of kernel execution time
+     *
+     * Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE)
+     * @return Execution time in nanoseconds or negative number on error
+     */
+    int64 runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q=Queue());
+
+    size_t workGroupSize() const;
+    size_t preferedWorkGroupSizeMultiple() const;
+    bool compileWorkGroupSize(size_t wsz[]) const;
+    size_t localMemSize() const;
+
+    void* ptr() const;
+    struct Impl;
+
+protected:
+    Impl* p;
+};
+
+class CV_EXPORTS Program // wrapper whose entire state is the opaque implementation pointer `p`
+{
+public:
+    Program() CV_NOEXCEPT;
+    Program(const ProgramSource& src,
+            const String& buildflags, String& errmsg);
+    Program(const Program& prog);
+    Program& operator = (const Program& prog);
+    Program(Program&& prog) CV_NOEXCEPT;
+    Program& operator = (Program&& prog) CV_NOEXCEPT;
+    ~Program();
+    // NOTE(review): presumably create() builds `src` with `buildflags` and reports problems via `errmsg` - confirm.
+    bool create(const ProgramSource& src,
+                const String& buildflags, String& errmsg);
+    // NOTE(review): presumably the raw OpenCL program handle - confirm against the implementation.
+    void* ptr() const;
+
+    /**
+     * @brief Query device-specific program binary.
+     *
+     * Returns RAW OpenCL executable binary without additional attachments.
+     *
+     * @sa ProgramSource::fromBinary
+     *
+     * @param[out] binary output buffer
+     */
+    void getBinary(std::vector<char>& binary) const;
+
+    struct Impl; friend struct Impl;
+    inline Impl* getImpl() const { return (Impl*)p; } // exposes the implementation pointer as-is
+    inline bool empty() const { return !p; } // true while no implementation object is attached
+protected:
+    Impl* p; // implementation object; null means empty()
+public:
+#ifndef OPENCV_REMOVE_DEPRECATED_API
+    // TODO Remove this
+    CV_DEPRECATED bool read(const String& buf, const String& buildflags); // removed, use ProgramSource instead
+    CV_DEPRECATED bool write(String& buf) const; // removed, use getBinary() method instead (RAW OpenCL binary)
+    CV_DEPRECATED const ProgramSource& source() const; // implementation removed
+    CV_DEPRECATED String getPrefix() const; // deprecated, implementation replaced
+    CV_DEPRECATED static String getPrefix(const String& buildflags); // deprecated, implementation replaced
+#endif
+};
+
+
+class CV_EXPORTS ProgramSource // describes a program (source text, binary or SPIR); state sits behind `p`
+{
+public:
+    typedef uint64 hash_t; // deprecated
+
+    ProgramSource() CV_NOEXCEPT;
+    explicit ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash);
+    explicit ProgramSource(const String& prog); // deprecated
+    explicit ProgramSource(const char* prog); // deprecated
+    ~ProgramSource();
+    ProgramSource(const ProgramSource& prog);
+    ProgramSource& operator = (const ProgramSource& prog);
+    ProgramSource(ProgramSource&& prog) CV_NOEXCEPT;
+    ProgramSource& operator = (ProgramSource&& prog) CV_NOEXCEPT;
+
+    const String& source() const; // deprecated
+    hash_t hash() const; // deprecated
+
+    // The factories below take a caller-owned buffer; see the lifetime requirement in each description.
+    /** @brief Describe OpenCL program binary.
+     * Do not call clCreateProgramWithBinary() and/or clBuildProgram().
+     *
+     * Caller should guarantee binary buffer lifetime greater than ProgramSource object (and any of its copies).
+     *
+     * This kind of binary is not portable between platforms in general - it is specific to OpenCL vendor / device / driver version.
+     *
+     * @param module name of program owner module
+     * @param name unique name of program (module+name is used as key for OpenCL program caching)
+     * @param binary buffer address. See buffer lifetime requirement in description.
+     * @param size buffer size
+     * @param buildOptions additional program-related build options passed to clBuildProgram()
+     * @return created ProgramSource object
+     */
+    static ProgramSource fromBinary(const String& module, const String& name,
+            const unsigned char* binary, const size_t size,
+            const cv::String& buildOptions = cv::String());
+
+    /** @brief Describe OpenCL program in SPIR format.
+     * Do not call clCreateProgramWithBinary() and/or clBuildProgram().
+     *
+     * Supports SPIR 1.2 by default (pass '-spir-std=X.Y' in buildOptions to override this behavior)
+     *
+     * Caller should guarantee binary buffer lifetime greater than ProgramSource object (and any of its copies).
+     *
+     * Programs in this format are portable between OpenCL implementations with 'khr_spir' extension:
+     * https://www.khronos.org/registry/OpenCL/sdk/2.0/docs/man/xhtml/cl_khr_spir.html
+     * (but they are not portable between different platforms: 32-bit / 64-bit)
+     *
+     * Note: these programs can't support vendor specific extensions, like 'cl_intel_subgroups'.
+     *
+     * @param module name of program owner module
+     * @param name unique name of program (module+name is used as key for OpenCL program caching)
+     * @param binary buffer address. See buffer lifetime requirement in description.
+     * @param size buffer size
+     * @param buildOptions additional program-related build options passed to clBuildProgram()
+     *        (these options are added automatically: '-x spir' and '-spir-std=1.2')
+     * @return created ProgramSource object.
+     */
+    static ProgramSource fromSPIR(const String& module, const String& name,
+            const unsigned char* binary, const size_t size,
+            const cv::String& buildOptions = cv::String());
+
+    //OpenCL 2.1+ only
+    //static Program fromSPIRV(const String& module, const String& name,
+    //        const unsigned char* binary, const size_t size,
+    //        const cv::String& buildOptions = cv::String());
+
+    struct Impl; friend struct Impl;
+    inline Impl* getImpl() const { return (Impl*)p; } // exposes the implementation pointer as-is
+    inline bool empty() const { return !p; } // true while no implementation object is attached
+protected:
+    Impl* p; // implementation object; null means empty()
+};
+
+class CV_EXPORTS PlatformInfo // read-only queries about one OpenCL platform; state sits behind `p`
+{
+public:
+    PlatformInfo() CV_NOEXCEPT;
+    /**
+     * @param id pointer to a cl_platform_id (cl_platform_id*)
+     */
+    explicit PlatformInfo(void* id);
+    ~PlatformInfo();
+
+    PlatformInfo(const PlatformInfo& i);
+    PlatformInfo& operator =(const PlatformInfo& i);
+    PlatformInfo(PlatformInfo&& i) CV_NOEXCEPT;
+    PlatformInfo& operator = (PlatformInfo&& i) CV_NOEXCEPT;
+
+    String name() const;
+    String vendor() const;
+
+    /// See CL_PLATFORM_VERSION
+    String version() const;
+    int versionMajor() const;
+    int versionMinor() const;
+    // NOTE(review): `d` presumably is a zero-based index below deviceNumber() - confirm against the implementation.
+    int deviceNumber() const;
+    void getDevice(Device& device, int d) const;
+
+    struct Impl;
+    bool empty() const { return !p; } // true while no implementation object is attached
+protected:
+    Impl* p; // implementation object; null means empty()
+};
+
+CV_EXPORTS CV_DEPRECATED const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf);
+CV_EXPORTS const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf, size_t buf_size);
+CV_EXPORTS const char* typeToStr(int t);
+CV_EXPORTS const char* memopTypeToStr(int t);
+CV_EXPORTS const char* vecopTypeToStr(int t);
+CV_EXPORTS const char* getOpenCLErrorString(int errorCode);
+CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1, const char * name = NULL);
+CV_EXPORTS void getPlatfomsInfo(std::vector<PlatformInfo>& platform_info);
+
+
+enum OclVectorStrategy
+{
+    // each matrix uses its own vector width
+    OCL_VECTOR_OWN = 0,
+    // all matrices use the maximal vector width found among them
+    // (useful for cases when matrices have different data types)
+    OCL_VECTOR_MAX = 1,
+
+    // default strategy
+    OCL_VECTOR_DEFAULT = OCL_VECTOR_OWN
+};
+
+CV_EXPORTS int predictOptimalVectorWidth(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(),
+                                         InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),
+                                         InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray(),
+                                         OclVectorStrategy strat = OCL_VECTOR_DEFAULT);
+
+CV_EXPORTS int checkOptimalVectorWidth(const int *vectorWidths,
+                                       InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(),
+                                       InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),
+                                       InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray(),
+                                       OclVectorStrategy strat = OCL_VECTOR_DEFAULT);
+
+// with OCL_VECTOR_MAX strategy
+CV_EXPORTS int predictOptimalVectorWidthMax(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(),
+                                            InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),
+                                            InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray());
+
+CV_EXPORTS void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m);
+
+class CV_EXPORTS Image2D // image object built from a UMat; state sits behind the opaque pointer `p`
+{
+public:
+    Image2D() CV_NOEXCEPT;
+
+    /**
+    @param src UMat object from which to get image properties and data
+    @param norm flag to enable the use of normalized channel data types
+    @param alias flag indicating that the image should alias the src UMat. If true, changes to the
+        image or src will be reflected in both objects.
+    */
+    explicit Image2D(const UMat &src, bool norm = false, bool alias = false);
+    Image2D(const Image2D & i);
+    ~Image2D();
+
+    Image2D & operator = (const Image2D & i);
+    Image2D(Image2D &&) CV_NOEXCEPT;
+    Image2D &operator=(Image2D &&) CV_NOEXCEPT;
+
+    /** Indicates if creating an aliased image should succeed.
+    Depends on the underlying platform and the dimensions of the UMat.
+    */
+    static bool canCreateAlias(const UMat &u);
+
+    /** Indicates if the image format is supported.
+    */
+    static bool isFormatSupported(int depth, int cn, bool norm);
+    // NOTE(review): presumably the raw OpenCL image handle - confirm against the implementation.
+    void* ptr() const;
+protected:
+    struct Impl;
+    Impl* p; // opaque implementation object
+};
+
+class CV_EXPORTS Timer // measures a duration that durationNS() reports in nanoseconds
+{
+public:
+    Timer(const Queue& q);
+    ~Timer();
+    void start();
+    void stop();
+
+    uint64 durationNS() const; ///< duration in nanoseconds
+
+protected:
+    struct Impl;
+    Impl* const p; // const pointer: cannot be reseated after construction
+
+private:
+    Timer(const Timer&) = delete; // non-copyable: rejected at compile time rather than at link time
+    Timer& operator=(const Timer&) = delete; // non-copyable
+};
+
+CV_EXPORTS MatAllocator* getOpenCLAllocator();
+
+
+class CV_EXPORTS_W OpenCLExecutionContext // copyable handle; copies share one Impl through std::shared_ptr
+{
+public:
+    OpenCLExecutionContext() = default;
+    ~OpenCLExecutionContext() = default;
+
+    OpenCLExecutionContext(const OpenCLExecutionContext&) = default;
+    OpenCLExecutionContext(OpenCLExecutionContext&&) = default;
+
+    OpenCLExecutionContext& operator=(const OpenCLExecutionContext&) = default;
+    OpenCLExecutionContext& operator=(OpenCLExecutionContext&&) = default;
+
+    /** Get associated ocl::Context */
+    Context& getContext() const;
+    /** Get the single default associated ocl::Device */
+    Device& getDevice() const;
+    /** Get the single ocl::Queue that is associated with the ocl::Context and
+     *  the single default ocl::Device
+     */
+    Queue& getQueue() const;
+
+    bool useOpenCL() const;
+    void setUseOpenCL(bool flag);
+
+    /** Get OpenCL execution context of current thread.
+     *
+     * Initialize OpenCL execution context if it is empty
+     * - create new
+     * - reuse context of the main thread (threadID = 0)
+     */
+    static OpenCLExecutionContext& getCurrent();
+
+    /** Get OpenCL execution context of current thread (can be empty) */
+    static OpenCLExecutionContext& getCurrentRef();
+
+    /** Bind this OpenCL execution context to current thread.
+     *
+     * Context can't be empty.
+     *
+     * @note clFinish is not called for queue of previous execution context
+     */
+    void bind() const;
+
+    /** Creates new execution context with same OpenCV context and device
+     *
+     * @param q OpenCL queue
+     */
+    OpenCLExecutionContext cloneWithNewQueue(const ocl::Queue& q) const;
+    /** @overload */
+    OpenCLExecutionContext cloneWithNewQueue() const;
+
+    /** @brief Creates OpenCL execution context
+     * OpenCV will check if available OpenCL platform has platformName name,
+     * then assign context to OpenCV.
+     * The deviceID device will be used as target device and a new command queue will be created.
+     *
+     * @note On success, ownership of one reference of the context and device is taken.
+     * The caller should additionally call `clRetainContext` and/or `clRetainDevice`
+     * to increase the reference count if it wishes to continue using them.
+     *
+     * @param platformName name of OpenCL platform to attach, this string is used to check if platform is available to OpenCV at runtime
+     * @param platformID ID of platform attached context was created for (cl_platform_id)
+     * @param context OpenCL context to be attached to OpenCV (cl_context)
+     * @param deviceID OpenCL device (cl_device_id)
+     */
+    static OpenCLExecutionContext create(const std::string& platformName, void* platformID, void* context, void* deviceID);
+
+    /** @brief Creates OpenCL execution context
+     *
+     * @param context non-empty OpenCL context
+     * @param device non-empty OpenCL device (must be a part of context)
+     * @param queue non-empty OpenCL queue for provided context and device
+     */
+    static OpenCLExecutionContext create(const Context& context, const Device& device, const ocl::Queue& queue);
+    /** @overload */
+    static OpenCLExecutionContext create(const Context& context, const Device& device);
+
+    struct Impl;
+    inline bool empty() const { return !p; } // true while no Impl is attached (e.g. default-constructed)
+    void release(); // NOTE(review): presumably drops this handle's reference to the shared Impl - confirm
+protected:
+    std::shared_ptr<Impl> p; // shared implementation; null means empty()
+};
+
+class OpenCLExecutionContextScope // binds a context for the scope's lifetime, then re-binds the saved one
+{
+    OpenCLExecutionContext ctx_; // copy of the context that was current on entry (may be empty)
+public:
+    inline OpenCLExecutionContextScope(const OpenCLExecutionContext& ctx)
+    {
+        CV_Assert(!ctx.empty()); // an empty context must not be bound
+        ctx_ = OpenCLExecutionContext::getCurrentRef();
+        ctx.bind();
+    }
+    // On scope exit re-bind the saved context; nothing is done when none was saved.
+    inline ~OpenCLExecutionContextScope()
+    {
+        if (ctx_.empty())
+            return;
+
+        ctx_.bind();
+    }
+};
+
+#ifdef __OPENCV_BUILD
+namespace internal {
+// OCL_FORCE_CHECK(c): true when isOpenCLForced() returns true, otherwise the value of (c).
+CV_EXPORTS bool isOpenCLForced();
+#define OCL_FORCE_CHECK(condition) (cv::ocl::internal::isOpenCLForced() || (condition))
+// OCL_PERFORMANCE_CHECK(c): true when isPerformanceCheckBypassed() returns true, otherwise the value of (c).
+CV_EXPORTS bool isPerformanceCheckBypassed();
+#define OCL_PERFORMANCE_CHECK(condition) (cv::ocl::internal::isPerformanceCheckBypassed() || (condition))
+
+CV_EXPORTS bool isCLBuffer(UMat& u);
+
+} // namespace internal
+#endif
+
+//! @}
+
+}}
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/ocl_genbase.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/ocl_genbase.hpp
new file mode 100644
index 000000000000..5334cf1f4f2d
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/ocl_genbase.hpp
@@ -0,0 +1,69 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_OPENCL_GENBASE_HPP
+#define OPENCV_OPENCL_GENBASE_HPP
+
+//! @cond IGNORED
+
+namespace cv {
+namespace ocl {
+
+class ProgramSource;
+
+namespace internal {
+
+struct CV_EXPORTS ProgramEntry // plain record: four C strings plus a ProgramSource pointer
+{
+    const char* module;
+    const char* name;
+    const char* programCode;
+    const char* programHash;
+    ProgramSource* pProgramSource; // NOTE(review): presumably caches the object returned by the conversion below - confirm
+
+    operator ProgramSource& () const; // implicit conversion: an entry can be passed where a ProgramSource& is expected
+};
+
+} } } // namespace
+
+//! @endcond
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/ocl_defs.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/ocl_defs.hpp
new file mode 100644
index 000000000000..14df750fc7be
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/ocl_defs.hpp
@@ -0,0 +1,82 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#ifndef OPENCV_CORE_OPENCL_DEFS_HPP
+#define OPENCV_CORE_OPENCL_DEFS_HPP
+
+#include "opencv2/core/utility.hpp"
+#include "cvconfig.h"
+
+namespace cv { namespace ocl {
+#ifdef HAVE_OPENCL
+/// Call is similar to useOpenCL() but doesn't try to load OpenCL runtime or create OpenCL context
+CV_EXPORTS bool isOpenCLActivated();
+#else
+static inline bool isOpenCLActivated() { return false; } // built without HAVE_OPENCL: always reports "not activated"
+#endif
+}} // namespace
+
+
+//#define CV_OPENCL_RUN_ASSERT
+
+#ifdef HAVE_OPENCL
+// CV_OCL_RUN_(condition, func, ...): if OpenCL is activated and `condition` holds, evaluate `func`; on success return __VA_ARGS__ from the caller.
+#ifdef CV_OPENCL_RUN_VERBOSE // variant that also prints which implementation is running
+#define CV_OCL_RUN_(condition, func, ...)                                   \
+    {                                                                       \
+        if (cv::ocl::isOpenCLActivated() && (condition) && (func))          \
+        {                                                                   \
+            printf("%s: OpenCL implementation is running\n", CV_Func);      \
+            fflush(stdout);                                                 \
+            CV_IMPL_ADD(CV_IMPL_OCL);                                       \
+            return __VA_ARGS__;                                             \
+        }                                                                   \
+        else                                                                \
+        {                                                                   \
+            printf("%s: Plain implementation is running\n", CV_Func);       \
+            fflush(stdout);                                                 \
+        }                                                                   \
+    }
+#elif defined CV_OPENCL_RUN_ASSERT // variant that raises CV_Error when `func` fails instead of falling through
+#define CV_OCL_RUN_(condition, func, ...)                                   \
+    {                                                                       \
+        if (cv::ocl::isOpenCLActivated() && (condition))                    \
+        {                                                                   \
+            if(func)                                                        \
+            {                                                               \
+                CV_IMPL_ADD(CV_IMPL_OCL);                                   \
+            }                                                               \
+            else                                                            \
+            {                                                               \
+                CV_Error(cv::Error::StsAssert, #func);                      \
+            }                                                               \
+            return __VA_ARGS__;                                             \
+        }                                                                   \
+    }
+#else // default variant: a cv::Exception thrown during the attempt is swallowed and the caller falls through
+#define CV_OCL_RUN_(condition, func, ...)                                   \
+try \
+{ \
+    if (cv::ocl::isOpenCLActivated() && (condition) && (func))              \
+    {                                                                       \
+        CV_IMPL_ADD(CV_IMPL_OCL);                                           \
+        return __VA_ARGS__;                                                 \
+    } \
+} \
+catch (const cv::Exception& e) \
+{ \
+    CV_UNUSED(e); /* TODO: Add some logging here */ \
+}
+#endif
+
+#else // built without HAVE_OPENCL: the macro expands to nothing
+#define CV_OCL_RUN_(condition, func, ...)
+#endif
+
+#define CV_OCL_RUN(condition, func) CV_OCL_RUN_(condition, func)
+
+#endif // OPENCV_CORE_OPENCL_DEFS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/opencl_info.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/opencl_info.hpp
new file mode 100644
index 000000000000..845efba9fc1d
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/opencl_info.hpp
@@ -0,0 +1,213 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <iostream>
+#include <sstream>
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/ocl.hpp>
+
+#ifndef DUMP_CONFIG_PROPERTY
+#define DUMP_CONFIG_PROPERTY(...)
+#endif
+
+#ifndef DUMP_MESSAGE_STDOUT
+#define DUMP_MESSAGE_STDOUT(...) do { std::cout << __VA_ARGS__ << std::endl; } while (false)
+#endif
+
+namespace cv {
+
+namespace {
+static std::string bytesToStringRepr(size_t value)
+{
+    // Formats a byte count as "<g> GB <m> MB <k> KB <b> B" (1024-based), omitting zero-valued units.
+    // A count of zero yields "0 B"; previously the empty result was indexed at size() - 1 (undefined behaviour).
+    const size_t b = value % 1024;
+    value /= 1024;
+    const size_t kb = value % 1024;
+    value /= 1024;
+    const size_t mb = value % 1024;
+    value /= 1024;
+    const size_t gb = value;
+
+    std::ostringstream stream;
+
+    if (gb > 0)
+        stream << gb << " GB ";
+    if (mb > 0)
+        stream << mb << " MB ";
+    if (kb > 0)
+        stream << kb << " KB ";
+    if (b > 0 || (gb == 0 && mb == 0 && kb == 0))
+        stream << b << " B";
+
+    std::string s = stream.str();
+    // Drop the separator left behind when the last printed unit is not bytes.
+    if (!s.empty() && s[s.size() - 1] == ' ')
+        s = s.substr(0, s.size() - 1);
+    return s;
+}
+
+static String getDeviceTypeString(const cv::ocl::Device& device)
+{
+    // Maps a device to a short label: "CPU", "iGPU" or "dGPU" for GPUs, otherwise "unknown".
+    const char* label = "unknown";
+    if (device.type() == cv::ocl::Device::TYPE_CPU)
+    {
+        label = "CPU";
+    }
+    else if (device.type() == cv::ocl::Device::TYPE_GPU)
+    {
+        // GPUs reporting host-unified memory get the "iGPU" label, all others "dGPU".
+        label = device.hostUnifiedMemory() ? "iGPU" : "dGPU";
+    }
+
+    return label;
+}
+} // namespace
+
+static void dumpOpenCLInformation()
+{
+    using namespace cv::ocl;
+    // Prints every platform with its devices, then the properties of the default device.
+    try
+    {
+        if (!haveOpenCL() || !useOpenCL())
+        {
+            DUMP_MESSAGE_STDOUT("OpenCL is disabled");
+            DUMP_CONFIG_PROPERTY("cv_ocl", "disabled");
+            return;
+        }
+
+        std::vector<PlatformInfo> platforms;
+        cv::ocl::getPlatfomsInfo(platforms);
+        if (platforms.empty())
+        {
+            DUMP_MESSAGE_STDOUT("OpenCL is not available");
+            DUMP_CONFIG_PROPERTY("cv_ocl", "not available");
+            return;
+        }
+
+        DUMP_MESSAGE_STDOUT("OpenCL Platforms: ");
+        for (size_t i = 0; i < platforms.size(); i++)
+        {
+            const PlatformInfo* platform = &platforms[i];
+            DUMP_MESSAGE_STDOUT("    " << platform->name());
+            Device current_device;
+            for (int j = 0; j < platform->deviceNumber(); j++)
+            {
+                platform->getDevice(current_device, j);
+                String deviceTypeStr = getDeviceTypeString(current_device);
+                DUMP_MESSAGE_STDOUT( "        " << deviceTypeStr << ": " << current_device.name() << " (" << current_device.version() << ")");
+                DUMP_CONFIG_PROPERTY( cv::format("cv_ocl_platform_%d_device_%d", (int)i, j ),
+                    cv::format("(Platform=%s)(Type=%s)(Name=%s)(Version=%s)",
+                    platform->name().c_str(), deviceTypeStr.c_str(), current_device.name().c_str(), current_device.version().c_str()) );
+            }
+        }
+        const Device& device = Device::getDefault();
+        if (!device.available())
+            CV_Error(Error::OpenCLInitError, "OpenCL device is not available");
+
+        DUMP_MESSAGE_STDOUT("Current OpenCL device: ");
+
+        String deviceTypeStr = getDeviceTypeString(device);
+        DUMP_MESSAGE_STDOUT("    Type = " << deviceTypeStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_deviceType", deviceTypeStr);
+
+        DUMP_MESSAGE_STDOUT("    Name = " << device.name());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_deviceName", device.name());
+
+        DUMP_MESSAGE_STDOUT("    Version = " << device.version());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_deviceVersion", device.version());
+
+        DUMP_MESSAGE_STDOUT("    Driver version = " << device.driverVersion());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_driverVersion", device.driverVersion());
+
+        DUMP_MESSAGE_STDOUT("    Address bits = " << device.addressBits());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_addressBits", device.addressBits());
+
+        DUMP_MESSAGE_STDOUT("    Compute units = " << device.maxComputeUnits());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_maxComputeUnits", device.maxComputeUnits());
+
+        DUMP_MESSAGE_STDOUT("    Max work group size = " << device.maxWorkGroupSize());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_maxWorkGroupSize", device.maxWorkGroupSize());
+
+        std::string localMemorySizeStr = bytesToStringRepr(device.localMemSize());
+        DUMP_MESSAGE_STDOUT("    Local memory size = " << localMemorySizeStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_localMemSize", device.localMemSize());
+
+        std::string maxMemAllocSizeStr = bytesToStringRepr(device.maxMemAllocSize());
+        DUMP_MESSAGE_STDOUT("    Max memory allocation size = " << maxMemAllocSizeStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_maxMemAllocSize", device.maxMemAllocSize());
+
+        const char* doubleSupportStr = device.hasFP64() ? "Yes" : "No";
+        DUMP_MESSAGE_STDOUT("    Double support = " << doubleSupportStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveDoubleSupport", device.hasFP64());
+
+        const char* halfSupportStr = device.hasFP16() ? "Yes" : "No";
+        DUMP_MESSAGE_STDOUT("    Half support = " << halfSupportStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveHalfSupport", device.hasFP16());
+
+        const char* isUnifiedMemoryStr = device.hostUnifiedMemory() ? "Yes" : "No";
+        DUMP_MESSAGE_STDOUT("    Host unified memory = " << isUnifiedMemoryStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_hostUnifiedMemory", device.hostUnifiedMemory());
+
+        DUMP_MESSAGE_STDOUT("    Device extensions:");
+        String extensionsStr = device.extensions();
+        size_t pos = 0; // the list is split on single spaces; empty tokens are skipped
+        while (pos < extensionsStr.size())
+        {
+            size_t pos2 = extensionsStr.find(' ', pos);
+            if (pos2 == String::npos)
+                pos2 = extensionsStr.size();
+            if (pos2 > pos)
+            {
+                String extensionName = extensionsStr.substr(pos, pos2 - pos);
+                DUMP_MESSAGE_STDOUT("        " << extensionName);
+            }
+            pos = pos2 + 1;
+        }
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_extensions", extensionsStr);
+
+        const char* haveAmdBlasStr = haveAmdBlas() ? "Yes" : "No";
+        DUMP_MESSAGE_STDOUT("    Has AMD Blas = " << haveAmdBlasStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_AmdBlas", haveAmdBlas());
+
+        const char* haveAmdFftStr = haveAmdFft() ? "Yes" : "No";
+        DUMP_MESSAGE_STDOUT("    Has AMD Fft = " << haveAmdFftStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_AmdFft", haveAmdFft());
+
+
+        DUMP_MESSAGE_STDOUT("    Preferred vector width char = " << device.preferredVectorWidthChar());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_preferredVectorWidthChar", device.preferredVectorWidthChar());
+
+        DUMP_MESSAGE_STDOUT("    Preferred vector width short = " << device.preferredVectorWidthShort());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_preferredVectorWidthShort", device.preferredVectorWidthShort());
+
+        DUMP_MESSAGE_STDOUT("    Preferred vector width int = " << device.preferredVectorWidthInt());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_preferredVectorWidthInt", device.preferredVectorWidthInt());
+
+        DUMP_MESSAGE_STDOUT("    Preferred vector width long = " << device.preferredVectorWidthLong());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_preferredVectorWidthLong", device.preferredVectorWidthLong());
+
+        DUMP_MESSAGE_STDOUT("    Preferred vector width float = " << device.preferredVectorWidthFloat());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_preferredVectorWidthFloat", device.preferredVectorWidthFloat());
+
+        DUMP_MESSAGE_STDOUT("    Preferred vector width double = " << device.preferredVectorWidthDouble());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_preferredVectorWidthDouble", device.preferredVectorWidthDouble());
+
+        DUMP_MESSAGE_STDOUT("    Preferred vector width half = " << device.preferredVectorWidthHalf());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_preferredVectorWidthHalf", device.preferredVectorWidthHalf());
+    }
+    catch (...) // best-effort dump: any exception, including the CV_Error above, is reported as "not available"
+    {
+        DUMP_MESSAGE_STDOUT("Exception. Can't dump OpenCL info");
+        DUMP_MESSAGE_STDOUT("OpenCL device not available");
+        DUMP_CONFIG_PROPERTY("cv_ocl", "not available");
+    }
+}
+#undef DUMP_MESSAGE_STDOUT
+#undef DUMP_CONFIG_PROPERTY
+
+} // namespace
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/opencl_svm.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/opencl_svm.hpp
new file mode 100644
index 000000000000..7453082a67b3
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/opencl_svm.hpp
@@ -0,0 +1,81 @@
+/* See LICENSE file in the root OpenCV directory */
+
+#ifndef OPENCV_CORE_OPENCL_SVM_HPP
+#define OPENCV_CORE_OPENCL_SVM_HPP
+
+//
+// Internal usage only (binary compatibility is not guaranteed)
+//
+#ifndef __OPENCV_BUILD
+#error Internal header file
+#endif
+
+#if defined(HAVE_OPENCL) && defined(HAVE_OPENCL_SVM)
+#include "runtime/opencl_core.hpp"
+#include "runtime/opencl_svm_20.hpp"
+#include "runtime/opencl_svm_hsa_extension.hpp"
+
+namespace cv { namespace ocl { namespace svm {
+
+struct SVMCapabilities // Bitmask of SVM capability flags stored in a plain int (value_).
+{
+    enum Value // Each flag occupies its own bit, so several flags can be combined in one int.
+    {
+        SVM_COARSE_GRAIN_BUFFER = (1 << 0),
+        SVM_FINE_GRAIN_BUFFER = (1 << 1),
+        SVM_FINE_GRAIN_SYSTEM = (1 << 2),
+        SVM_ATOMICS = (1 << 3),
+    };
+    int value_; // combination of Value flags; 0 means no flag is set
+    // The constructor is not explicit, so int converts implicitly to SVMCapabilities and back.
+    SVMCapabilities(int capabilities = 0) : value_(capabilities) { }
+    operator int() const { return value_; }
+    // Queries: isNoSVMSupport() is true only when value_ == 0; each other query tests one flag bit.
+    inline bool isNoSVMSupport() const { return value_ == 0; }
+    inline bool isSupportCoarseGrainBuffer() const { return (value_ & SVM_COARSE_GRAIN_BUFFER) != 0; }
+    inline bool isSupportFineGrainBuffer() const { return (value_ & SVM_FINE_GRAIN_BUFFER) != 0; }
+    inline bool isSupportFineGrainSystem() const { return (value_ & SVM_FINE_GRAIN_SYSTEM) != 0; }
+    inline bool isSupportAtomics() const { return (value_ & SVM_ATOMICS) != 0; }
+};
+
+CV_EXPORTS const SVMCapabilities getSVMCapabilitites(const ocl::Context& context); // NOTE: the name is spelled "Capabilitites" (sic) in this declaration; callers must match it
+
+struct SVMFunctions // Table of SVM entry points (fn_* members; their *_fn types are declared elsewhere).
+{
+    clSVMAllocAMD_fn fn_clSVMAlloc;
+    clSVMFreeAMD_fn fn_clSVMFree;
+    clSetKernelArgSVMPointerAMD_fn fn_clSetKernelArgSVMPointer;
+    //clSetKernelExecInfoAMD_fn fn_clSetKernelExecInfo;
+    //clEnqueueSVMFreeAMD_fn fn_clEnqueueSVMFree;
+    clEnqueueSVMMemcpyAMD_fn fn_clEnqueueSVMMemcpy;
+    clEnqueueSVMMemFillAMD_fn fn_clEnqueueSVMMemFill;
+    clEnqueueSVMMapAMD_fn fn_clEnqueueSVMMap;
+    clEnqueueSVMUnmapAMD_fn fn_clEnqueueSVMUnmap;
+    // Default state: every member is NULL, so isValid() returns false until all of them are assigned.
+    inline SVMFunctions()
+        : fn_clSVMAlloc(NULL), fn_clSVMFree(NULL),
+          fn_clSetKernelArgSVMPointer(NULL), /*fn_clSetKernelExecInfo(NULL),*/
+          /*fn_clEnqueueSVMFree(NULL),*/ fn_clEnqueueSVMMemcpy(NULL), fn_clEnqueueSVMMemFill(NULL),
+          fn_clEnqueueSVMMap(NULL), fn_clEnqueueSVMUnmap(NULL)
+    {
+        // nothing to do: all members are NULL-initialized in the initializer list above
+    }
+    // Returns true only when all seven active members are non-NULL; the commented-out ones are not checked.
+    inline bool isValid() const
+    {
+        return fn_clSVMAlloc != NULL && fn_clSVMFree && fn_clSetKernelArgSVMPointer &&
+                /*fn_clSetKernelExecInfo && fn_clEnqueueSVMFree &&*/ fn_clEnqueueSVMMemcpy &&
+                fn_clEnqueueSVMMemFill && fn_clEnqueueSVMMap && fn_clEnqueueSVMUnmap;
+    }
+};
+
+// Requirement: the SVMFunctions object's lifetime must not be shorter than the context's lifetime.
+CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context);
+
+CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags);
+
+}}} //namespace cv::ocl::svm
+#endif
+
+#endif // OPENCV_CORE_OPENCL_SVM_HPP
+/* End of file. */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_clblas.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_clblas.hpp
new file mode 100644
index 000000000000..2749927bea65
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_clblas.hpp
@@ -0,0 +1,602 @@
+//
+// AUTOGENERATED, DO NOT EDIT
+//
+#ifndef OPENCV_CORE_OCL_RUNTIME_CLAMDBLAS_HPP
+#error "Invalid usage"
+#endif
+
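+// generated by parser_clblas.py: rename every clBLAS identifier to "<name>_" so that <clBLAS.h>, included below, is preprocessed with the renamed identifiers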
+#define clblasCaxpy clblasCaxpy_
+#define clblasCcopy clblasCcopy_
+#define clblasCdotc clblasCdotc_
+#define clblasCdotu clblasCdotu_
+#define clblasCgbmv clblasCgbmv_
+#define clblasCgemm clblasCgemm_
+#define clblasCgemv clblasCgemv_
+#define clblasCgerc clblasCgerc_
+#define clblasCgeru clblasCgeru_
+#define clblasChbmv clblasChbmv_
+#define clblasChemm clblasChemm_
+#define clblasChemv clblasChemv_
+#define clblasCher clblasCher_
+#define clblasCher2 clblasCher2_
+#define clblasCher2k clblasCher2k_
+#define clblasCherk clblasCherk_
+#define clblasChpmv clblasChpmv_
+#define clblasChpr clblasChpr_
+#define clblasChpr2 clblasChpr2_
+#define clblasCrotg clblasCrotg_
+#define clblasCscal clblasCscal_
+#define clblasCsrot clblasCsrot_
+#define clblasCsscal clblasCsscal_
+#define clblasCswap clblasCswap_
+#define clblasCsymm clblasCsymm_
+#define clblasCsyr2k clblasCsyr2k_
+#define clblasCsyrk clblasCsyrk_
+#define clblasCtbmv clblasCtbmv_
+#define clblasCtbsv clblasCtbsv_
+#define clblasCtpmv clblasCtpmv_
+#define clblasCtpsv clblasCtpsv_
+#define clblasCtrmm clblasCtrmm_
+#define clblasCtrmv clblasCtrmv_
+#define clblasCtrsm clblasCtrsm_
+#define clblasCtrsv clblasCtrsv_
+#define clblasDasum clblasDasum_
+#define clblasDaxpy clblasDaxpy_
+#define clblasDcopy clblasDcopy_
+#define clblasDdot clblasDdot_
+#define clblasDgbmv clblasDgbmv_
+#define clblasDgemm clblasDgemm_
+#define clblasDgemv clblasDgemv_
+#define clblasDger clblasDger_
+#define clblasDnrm2 clblasDnrm2_
+#define clblasDrot clblasDrot_
+#define clblasDrotg clblasDrotg_
+#define clblasDrotm clblasDrotm_
+#define clblasDrotmg clblasDrotmg_
+#define clblasDsbmv clblasDsbmv_
+#define clblasDscal clblasDscal_
+#define clblasDspmv clblasDspmv_
+#define clblasDspr clblasDspr_
+#define clblasDspr2 clblasDspr2_
+#define clblasDswap clblasDswap_
+#define clblasDsymm clblasDsymm_
+#define clblasDsymv clblasDsymv_
+#define clblasDsyr clblasDsyr_
+#define clblasDsyr2 clblasDsyr2_
+#define clblasDsyr2k clblasDsyr2k_
+#define clblasDsyrk clblasDsyrk_
+#define clblasDtbmv clblasDtbmv_
+#define clblasDtbsv clblasDtbsv_
+#define clblasDtpmv clblasDtpmv_
+#define clblasDtpsv clblasDtpsv_
+#define clblasDtrmm clblasDtrmm_
+#define clblasDtrmv clblasDtrmv_
+#define clblasDtrsm clblasDtrsm_
+#define clblasDtrsv clblasDtrsv_
+#define clblasDzasum clblasDzasum_
+#define clblasDznrm2 clblasDznrm2_
+#define clblasGetVersion clblasGetVersion_
+#define clblasSasum clblasSasum_
+#define clblasSaxpy clblasSaxpy_
+#define clblasScasum clblasScasum_
+#define clblasScnrm2 clblasScnrm2_
+#define clblasScopy clblasScopy_
+#define clblasSdot clblasSdot_
+#define clblasSetup clblasSetup_
+#define clblasSgbmv clblasSgbmv_
+#define clblasSgemm clblasSgemm_
+#define clblasSgemv clblasSgemv_
+#define clblasSger clblasSger_
+#define clblasSnrm2 clblasSnrm2_
+#define clblasSrot clblasSrot_
+#define clblasSrotg clblasSrotg_
+#define clblasSrotm clblasSrotm_
+#define clblasSrotmg clblasSrotmg_
+#define clblasSsbmv clblasSsbmv_
+#define clblasSscal clblasSscal_
+#define clblasSspmv clblasSspmv_
+#define clblasSspr clblasSspr_
+#define clblasSspr2 clblasSspr2_
+#define clblasSswap clblasSswap_
+#define clblasSsymm clblasSsymm_
+#define clblasSsymv clblasSsymv_
+#define clblasSsyr clblasSsyr_
+#define clblasSsyr2 clblasSsyr2_
+#define clblasSsyr2k clblasSsyr2k_
+#define clblasSsyrk clblasSsyrk_
+#define clblasStbmv clblasStbmv_
+#define clblasStbsv clblasStbsv_
+#define clblasStpmv clblasStpmv_
+#define clblasStpsv clblasStpsv_
+#define clblasStrmm clblasStrmm_
+#define clblasStrmv clblasStrmv_
+#define clblasStrsm clblasStrsm_
+#define clblasStrsv clblasStrsv_
+#define clblasTeardown clblasTeardown_
+#define clblasZaxpy clblasZaxpy_
+#define clblasZcopy clblasZcopy_
+#define clblasZdotc clblasZdotc_
+#define clblasZdotu clblasZdotu_
+#define clblasZdrot clblasZdrot_
+#define clblasZdscal clblasZdscal_
+#define clblasZgbmv clblasZgbmv_
+#define clblasZgemm clblasZgemm_
+#define clblasZgemv clblasZgemv_
+#define clblasZgerc clblasZgerc_
+#define clblasZgeru clblasZgeru_
+#define clblasZhbmv clblasZhbmv_
+#define clblasZhemm clblasZhemm_
+#define clblasZhemv clblasZhemv_
+#define clblasZher clblasZher_
+#define clblasZher2 clblasZher2_
+#define clblasZher2k clblasZher2k_
+#define clblasZherk clblasZherk_
+#define clblasZhpmv clblasZhpmv_
+#define clblasZhpr clblasZhpr_
+#define clblasZhpr2 clblasZhpr2_
+#define clblasZrotg clblasZrotg_
+#define clblasZscal clblasZscal_
+#define clblasZswap clblasZswap_
+#define clblasZsymm clblasZsymm_
+#define clblasZsyr2k clblasZsyr2k_
+#define clblasZsyrk clblasZsyrk_
+#define clblasZtbmv clblasZtbmv_
+#define clblasZtbsv clblasZtbsv_
+#define clblasZtpmv clblasZtpmv_
+#define clblasZtpsv clblasZtpsv_
+#define clblasZtrmm clblasZtrmm_
+#define clblasZtrmv clblasZtrmv_
+#define clblasZtrsm clblasZtrsm_
+#define clblasZtrsv clblasZtrsv_
+#define clblasiCamax clblasiCamax_
+#define clblasiDamax clblasiDamax_
+#define clblasiSamax clblasiSamax_
+#define clblasiZamax clblasiZamax_
+
+#include <clBLAS.h>
+
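+// generated by parser_clblas.py: drop the "<name>_" renames above; only entries with an active #define are redirected to a "<name>_pfn" identifier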
+#undef clblasCaxpy
+//#define clblasCaxpy clblasCaxpy_pfn
+#undef clblasCcopy
+//#define clblasCcopy clblasCcopy_pfn
+#undef clblasCdotc
+//#define clblasCdotc clblasCdotc_pfn
+#undef clblasCdotu
+//#define clblasCdotu clblasCdotu_pfn
+#undef clblasCgbmv
+//#define clblasCgbmv clblasCgbmv_pfn
+#undef clblasCgemm
+#define clblasCgemm clblasCgemm_pfn
+#undef clblasCgemv
+//#define clblasCgemv clblasCgemv_pfn
+#undef clblasCgerc
+//#define clblasCgerc clblasCgerc_pfn
+#undef clblasCgeru
+//#define clblasCgeru clblasCgeru_pfn
+#undef clblasChbmv
+//#define clblasChbmv clblasChbmv_pfn
+#undef clblasChemm
+//#define clblasChemm clblasChemm_pfn
+#undef clblasChemv
+//#define clblasChemv clblasChemv_pfn
+#undef clblasCher
+//#define clblasCher clblasCher_pfn
+#undef clblasCher2
+//#define clblasCher2 clblasCher2_pfn
+#undef clblasCher2k
+//#define clblasCher2k clblasCher2k_pfn
+#undef clblasCherk
+//#define clblasCherk clblasCherk_pfn
+#undef clblasChpmv
+//#define clblasChpmv clblasChpmv_pfn
+#undef clblasChpr
+//#define clblasChpr clblasChpr_pfn
+#undef clblasChpr2
+//#define clblasChpr2 clblasChpr2_pfn
+#undef clblasCrotg
+//#define clblasCrotg clblasCrotg_pfn
+#undef clblasCscal
+//#define clblasCscal clblasCscal_pfn
+#undef clblasCsrot
+//#define clblasCsrot clblasCsrot_pfn
+#undef clblasCsscal
+//#define clblasCsscal clblasCsscal_pfn
+#undef clblasCswap
+//#define clblasCswap clblasCswap_pfn
+#undef clblasCsymm
+//#define clblasCsymm clblasCsymm_pfn
+#undef clblasCsyr2k
+//#define clblasCsyr2k clblasCsyr2k_pfn
+#undef clblasCsyrk
+//#define clblasCsyrk clblasCsyrk_pfn
+#undef clblasCtbmv
+//#define clblasCtbmv clblasCtbmv_pfn
+#undef clblasCtbsv
+//#define clblasCtbsv clblasCtbsv_pfn
+#undef clblasCtpmv
+//#define clblasCtpmv clblasCtpmv_pfn
+#undef clblasCtpsv
+//#define clblasCtpsv clblasCtpsv_pfn
+#undef clblasCtrmm
+//#define clblasCtrmm clblasCtrmm_pfn
+#undef clblasCtrmv
+//#define clblasCtrmv clblasCtrmv_pfn
+#undef clblasCtrsm
+//#define clblasCtrsm clblasCtrsm_pfn
+#undef clblasCtrsv
+//#define clblasCtrsv clblasCtrsv_pfn
+#undef clblasDasum
+//#define clblasDasum clblasDasum_pfn
+#undef clblasDaxpy
+//#define clblasDaxpy clblasDaxpy_pfn
+#undef clblasDcopy
+//#define clblasDcopy clblasDcopy_pfn
+#undef clblasDdot
+//#define clblasDdot clblasDdot_pfn
+#undef clblasDgbmv
+//#define clblasDgbmv clblasDgbmv_pfn
+#undef clblasDgemm
+#define clblasDgemm clblasDgemm_pfn
+#undef clblasDgemv
+//#define clblasDgemv clblasDgemv_pfn
+#undef clblasDger
+//#define clblasDger clblasDger_pfn
+#undef clblasDnrm2
+//#define clblasDnrm2 clblasDnrm2_pfn
+#undef clblasDrot
+//#define clblasDrot clblasDrot_pfn
+#undef clblasDrotg
+//#define clblasDrotg clblasDrotg_pfn
+#undef clblasDrotm
+//#define clblasDrotm clblasDrotm_pfn
+#undef clblasDrotmg
+//#define clblasDrotmg clblasDrotmg_pfn
+#undef clblasDsbmv
+//#define clblasDsbmv clblasDsbmv_pfn
+#undef clblasDscal
+//#define clblasDscal clblasDscal_pfn
+#undef clblasDspmv
+//#define clblasDspmv clblasDspmv_pfn
+#undef clblasDspr
+//#define clblasDspr clblasDspr_pfn
+#undef clblasDspr2
+//#define clblasDspr2 clblasDspr2_pfn
+#undef clblasDswap
+//#define clblasDswap clblasDswap_pfn
+#undef clblasDsymm
+//#define clblasDsymm clblasDsymm_pfn
+#undef clblasDsymv
+//#define clblasDsymv clblasDsymv_pfn
+#undef clblasDsyr
+//#define clblasDsyr clblasDsyr_pfn
+#undef clblasDsyr2
+//#define clblasDsyr2 clblasDsyr2_pfn
+#undef clblasDsyr2k
+//#define clblasDsyr2k clblasDsyr2k_pfn
+#undef clblasDsyrk
+//#define clblasDsyrk clblasDsyrk_pfn
+#undef clblasDtbmv
+//#define clblasDtbmv clblasDtbmv_pfn
+#undef clblasDtbsv
+//#define clblasDtbsv clblasDtbsv_pfn
+#undef clblasDtpmv
+//#define clblasDtpmv clblasDtpmv_pfn
+#undef clblasDtpsv
+//#define clblasDtpsv clblasDtpsv_pfn
+#undef clblasDtrmm
+//#define clblasDtrmm clblasDtrmm_pfn
+#undef clblasDtrmv
+//#define clblasDtrmv clblasDtrmv_pfn
+#undef clblasDtrsm
+//#define clblasDtrsm clblasDtrsm_pfn
+#undef clblasDtrsv
+//#define clblasDtrsv clblasDtrsv_pfn
+#undef clblasDzasum
+//#define clblasDzasum clblasDzasum_pfn
+#undef clblasDznrm2
+//#define clblasDznrm2 clblasDznrm2_pfn
+#undef clblasGetVersion
+//#define clblasGetVersion clblasGetVersion_pfn
+#undef clblasSasum
+//#define clblasSasum clblasSasum_pfn
+#undef clblasSaxpy
+//#define clblasSaxpy clblasSaxpy_pfn
+#undef clblasScasum
+//#define clblasScasum clblasScasum_pfn
+#undef clblasScnrm2
+//#define clblasScnrm2 clblasScnrm2_pfn
+#undef clblasScopy
+//#define clblasScopy clblasScopy_pfn
+#undef clblasSdot
+//#define clblasSdot clblasSdot_pfn
+#undef clblasSetup
+#define clblasSetup clblasSetup_pfn
+#undef clblasSgbmv
+//#define clblasSgbmv clblasSgbmv_pfn
+#undef clblasSgemm
+#define clblasSgemm clblasSgemm_pfn
+#undef clblasSgemv
+//#define clblasSgemv clblasSgemv_pfn
+#undef clblasSger
+//#define clblasSger clblasSger_pfn
+#undef clblasSnrm2
+//#define clblasSnrm2 clblasSnrm2_pfn
+#undef clblasSrot
+//#define clblasSrot clblasSrot_pfn
+#undef clblasSrotg
+//#define clblasSrotg clblasSrotg_pfn
+#undef clblasSrotm
+//#define clblasSrotm clblasSrotm_pfn
+#undef clblasSrotmg
+//#define clblasSrotmg clblasSrotmg_pfn
+#undef clblasSsbmv
+//#define clblasSsbmv clblasSsbmv_pfn
+#undef clblasSscal
+//#define clblasSscal clblasSscal_pfn
+#undef clblasSspmv
+//#define clblasSspmv clblasSspmv_pfn
+#undef clblasSspr
+//#define clblasSspr clblasSspr_pfn
+#undef clblasSspr2
+//#define clblasSspr2 clblasSspr2_pfn
+#undef clblasSswap
+//#define clblasSswap clblasSswap_pfn
+#undef clblasSsymm
+//#define clblasSsymm clblasSsymm_pfn
+#undef clblasSsymv
+//#define clblasSsymv clblasSsymv_pfn
+#undef clblasSsyr
+//#define clblasSsyr clblasSsyr_pfn
+#undef clblasSsyr2
+//#define clblasSsyr2 clblasSsyr2_pfn
+#undef clblasSsyr2k
+//#define clblasSsyr2k clblasSsyr2k_pfn
+#undef clblasSsyrk
+//#define clblasSsyrk clblasSsyrk_pfn
+#undef clblasStbmv
+//#define clblasStbmv clblasStbmv_pfn
+#undef clblasStbsv
+//#define clblasStbsv clblasStbsv_pfn
+#undef clblasStpmv
+//#define clblasStpmv clblasStpmv_pfn
+#undef clblasStpsv
+//#define clblasStpsv clblasStpsv_pfn
+#undef clblasStrmm
+//#define clblasStrmm clblasStrmm_pfn
+#undef clblasStrmv
+//#define clblasStrmv clblasStrmv_pfn
+#undef clblasStrsm
+//#define clblasStrsm clblasStrsm_pfn
+#undef clblasStrsv
+//#define clblasStrsv clblasStrsv_pfn
+#undef clblasTeardown
+#define clblasTeardown clblasTeardown_pfn
+#undef clblasZaxpy
+//#define clblasZaxpy clblasZaxpy_pfn
+#undef clblasZcopy
+//#define clblasZcopy clblasZcopy_pfn
+#undef clblasZdotc
+//#define clblasZdotc clblasZdotc_pfn
+#undef clblasZdotu
+//#define clblasZdotu clblasZdotu_pfn
+#undef clblasZdrot
+//#define clblasZdrot clblasZdrot_pfn
+#undef clblasZdscal
+//#define clblasZdscal clblasZdscal_pfn
+#undef clblasZgbmv
+//#define clblasZgbmv clblasZgbmv_pfn
+#undef clblasZgemm
+#define clblasZgemm clblasZgemm_pfn
+#undef clblasZgemv
+//#define clblasZgemv clblasZgemv_pfn
+#undef clblasZgerc
+//#define clblasZgerc clblasZgerc_pfn
+#undef clblasZgeru
+//#define clblasZgeru clblasZgeru_pfn
+#undef clblasZhbmv
+//#define clblasZhbmv clblasZhbmv_pfn
+#undef clblasZhemm
+//#define clblasZhemm clblasZhemm_pfn
+#undef clblasZhemv
+//#define clblasZhemv clblasZhemv_pfn
+#undef clblasZher
+//#define clblasZher clblasZher_pfn
+#undef clblasZher2
+//#define clblasZher2 clblasZher2_pfn
+#undef clblasZher2k
+//#define clblasZher2k clblasZher2k_pfn
+#undef clblasZherk
+//#define clblasZherk clblasZherk_pfn
+#undef clblasZhpmv
+//#define clblasZhpmv clblasZhpmv_pfn
+#undef clblasZhpr
+//#define clblasZhpr clblasZhpr_pfn
+#undef clblasZhpr2
+//#define clblasZhpr2 clblasZhpr2_pfn
+#undef clblasZrotg
+//#define clblasZrotg clblasZrotg_pfn
+#undef clblasZscal
+//#define clblasZscal clblasZscal_pfn
+#undef clblasZswap
+//#define clblasZswap clblasZswap_pfn
+#undef clblasZsymm
+//#define clblasZsymm clblasZsymm_pfn
+#undef clblasZsyr2k
+//#define clblasZsyr2k clblasZsyr2k_pfn
+#undef clblasZsyrk
+//#define clblasZsyrk clblasZsyrk_pfn
+#undef clblasZtbmv
+//#define clblasZtbmv clblasZtbmv_pfn
+#undef clblasZtbsv
+//#define clblasZtbsv clblasZtbsv_pfn
+#undef clblasZtpmv
+//#define clblasZtpmv clblasZtpmv_pfn
+#undef clblasZtpsv
+//#define clblasZtpsv clblasZtpsv_pfn
+#undef clblasZtrmm
+//#define clblasZtrmm clblasZtrmm_pfn
+#undef clblasZtrmv
+//#define clblasZtrmv clblasZtrmv_pfn
+#undef clblasZtrsm
+//#define clblasZtrsm clblasZtrsm_pfn
+#undef clblasZtrsv
+//#define clblasZtrsv clblasZtrsv_pfn
+#undef clblasiCamax
+//#define clblasiCamax clblasiCamax_pfn
+#undef clblasiDamax
+//#define clblasiDamax clblasiDamax_pfn
+#undef clblasiSamax
+//#define clblasiSamax clblasiSamax_pfn
+#undef clblasiZamax
+//#define clblasiZamax clblasiZamax_pfn
+
+// generated by parser_clblas.py
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCaxpy)(size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCcopy)(size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCdotc)(size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCdotu)(size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCgbmv)(clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+extern CL_RUNTIME_EXPORT clblasStatus (*clblasCgemm)(clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCgemv)(clblasOrder order, clblasTranspose transA, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, FloatComplex beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCgerc)(clblasOrder order, size_t M, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCgeru)(clblasOrder order, size_t M, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasChbmv)(clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasChemm)(clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasChemv)(clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, FloatComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCher)(clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCher2)(clblasOrder order, clblasUplo uplo, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCher2k)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCherk)(clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const cl_mem A, size_t offa, size_t lda, float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasChpmv)(clblasOrder order, clblasUplo uplo, size_t N, cl_float2 alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasChpr)(clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasChpr2)(clblasOrder order, clblasUplo uplo, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCrotg)(cl_mem CA, size_t offCA, cl_mem CB, size_t offCB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCscal)(size_t N, cl_float2 alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCsrot)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_float C, cl_float S, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCsscal)(size_t N, cl_float alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCswap)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCsymm)(clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCsyr2k)(clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCsyrk)(clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCtbmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCtbsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCtpmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCtpsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCtrmm)(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCtrmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCtrsm)(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCtrsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDasum)(size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDaxpy)(size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDcopy)(size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDdot)(size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDgbmv)(clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+extern CL_RUNTIME_EXPORT clblasStatus (*clblasDgemm)(clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDgemv)(clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDger)(clblasOrder order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDnrm2)(size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDrot)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_double C, cl_double S, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDrotg)(cl_mem DA, size_t offDA, cl_mem DB, size_t offDB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDrotm)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, const cl_mem DPARAM, size_t offDparam, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDrotmg)(cl_mem DD1, size_t offDD1, cl_mem DD2, size_t offDD2, cl_mem DX1, size_t offDX1, const cl_mem DY1, size_t offDY1, cl_mem DPARAM, size_t offDparam, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDsbmv)(clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDscal)(size_t N, cl_double alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDspmv)(clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDspr)(clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDspr2)(clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDswap)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDsymm)(clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDsymv)(clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDsyr)(clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDsyr2)(clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDsyr2k)(clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDsyrk)(clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDtbmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDtbsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDtpmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDtpsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDtrmm)(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDtrmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDtrsm)(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDtrsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDzasum)(size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDznrm2)(size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasGetVersion)(cl_uint* major, cl_uint* minor, cl_uint* patch);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSasum)(size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSaxpy)(size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasScasum)(size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasScnrm2)(size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasScopy)(size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSdot)(size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+extern CL_RUNTIME_EXPORT clblasStatus (*clblasSetup)(); // Exported pointer to a zero-argument function returning clblasStatus; declared only (extern), defined elsewhere. NOTE(review): presumably the clBLAS library initialisation entry point that pairs with clblasTeardown - confirm against the clBLAS reference.
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSgbmv)(clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+extern CL_RUNTIME_EXPORT clblasStatus (*clblasSgemm)(clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); // Exported function pointer, declared only (extern); the definition is not in this header. Same parameter list as clblasDgemm but with cl_float alpha/beta scalars. NOTE(review): presumably the clBLAS single-precision GEMM entry point - confirm against the clBLAS reference.
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSgemv)(clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSger)(clblasOrder order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSnrm2)(size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSrot)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_float C, cl_float S, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSrotg)(cl_mem SA, size_t offSA, cl_mem SB, size_t offSB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSrotm)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, const cl_mem SPARAM, size_t offSparam, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSrotmg)(cl_mem SD1, size_t offSD1, cl_mem SD2, size_t offSD2, cl_mem SX1, size_t offSX1, const cl_mem SY1, size_t offSY1, cl_mem SPARAM, size_t offSparam, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSsbmv)(clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSscal)(size_t N, cl_float alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSspmv)(clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSspr)(clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSspr2)(clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSswap)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSsymm)(clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSsymv)(clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSsyr)(clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSsyr2)(clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSsyr2k)(clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSsyrk)(clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasStbmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasStbsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasStpmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasStpsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasStrmm)(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasStrmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasStrsm)(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasStrsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+extern CL_RUNTIME_EXPORT void (*clblasTeardown)(); // Exported pointer to a zero-argument function returning void (no status is reported); declared only (extern), defined elsewhere. NOTE(review): presumably the clBLAS shutdown entry point that pairs with clblasSetup - confirm against the clBLAS reference.
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZaxpy)(size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZcopy)(size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZdotc)(size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZdotu)(size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZdrot)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_double C, cl_double S, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZdscal)(size_t N, cl_double alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZgbmv)(clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+extern CL_RUNTIME_EXPORT clblasStatus (*clblasZgemm)(clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); // Exported function pointer, declared only (extern); the definition is not in this header. Same parameter list as clblasDgemm but with DoubleComplex alpha/beta scalars. NOTE(review): presumably the clBLAS double-precision complex GEMM entry point - confirm against the clBLAS reference.
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZgemv)(clblasOrder order, clblasTranspose transA, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, DoubleComplex beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZgerc)(clblasOrder order, size_t M, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZgeru)(clblasOrder order, size_t M, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZhbmv)(clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZhemm)(clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZhemv)(clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, DoubleComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZher)(clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZher2)(clblasOrder order, clblasUplo uplo, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZher2k)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZherk)(clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const cl_mem A, size_t offa, size_t lda, double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZhpmv)(clblasOrder order, clblasUplo uplo, size_t N, cl_double2 alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZhpr)(clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZhpr2)(clblasOrder order, clblasUplo uplo, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZrotg)(cl_mem CA, size_t offCA, cl_mem CB, size_t offCB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZscal)(size_t N, cl_double2 alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZswap)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZsymm)(clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZsyr2k)(clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZsyrk)(clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZtbmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZtbsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZtpmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZtpsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZtrmm)(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZtrmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZtrsm)(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZtrsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasiCamax)(size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasiDamax)(size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasiSamax)(size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasiZamax)(size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_clfft.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_clfft.hpp
new file mode 100644
index 000000000000..dff3b406a611
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_clfft.hpp
@@ -0,0 +1,146 @@
+//
+// AUTOGENERATED, DO NOT EDIT
+// The guard below stops compilation with "Invalid usage" unless OPENCV_CORE_OCL_RUNTIME_CLAMDFFT_HPP is already defined (presumably by the wrapper header that includes this file -- confirm).
+#ifndef OPENCV_CORE_OCL_RUNTIME_CLAMDFFT_HPP
+#error "Invalid usage"
+#endif
+
+// generated by parser_clfft.py -- while <clFFT.h> is preprocessed below, every clfftXxx name listed here expands to clfftXxx_ (trailing underscore)
+#define clfftBakePlan clfftBakePlan_
+#define clfftCopyPlan clfftCopyPlan_
+#define clfftCreateDefaultPlan clfftCreateDefaultPlan_
+#define clfftDestroyPlan clfftDestroyPlan_
+#define clfftEnqueueTransform clfftEnqueueTransform_
+#define clfftGetLayout clfftGetLayout_
+#define clfftGetPlanBatchSize clfftGetPlanBatchSize_
+#define clfftGetPlanContext clfftGetPlanContext_
+#define clfftGetPlanDim clfftGetPlanDim_
+#define clfftGetPlanDistance clfftGetPlanDistance_
+#define clfftGetPlanInStride clfftGetPlanInStride_
+#define clfftGetPlanLength clfftGetPlanLength_
+#define clfftGetPlanOutStride clfftGetPlanOutStride_
+#define clfftGetPlanPrecision clfftGetPlanPrecision_
+#define clfftGetPlanScale clfftGetPlanScale_
+#define clfftGetPlanTransposeResult clfftGetPlanTransposeResult_
+#define clfftGetResultLocation clfftGetResultLocation_
+#define clfftGetTmpBufSize clfftGetTmpBufSize_
+#define clfftGetVersion clfftGetVersion_
+#define clfftSetLayout clfftSetLayout_
+#define clfftSetPlanBatchSize clfftSetPlanBatchSize_
+#define clfftSetPlanCallback clfftSetPlanCallback_
+#define clfftSetPlanDim clfftSetPlanDim_
+#define clfftSetPlanDistance clfftSetPlanDistance_
+#define clfftSetPlanInStride clfftSetPlanInStride_
+#define clfftSetPlanLength clfftSetPlanLength_
+#define clfftSetPlanOutStride clfftSetPlanOutStride_
+#define clfftSetPlanPrecision clfftSetPlanPrecision_
+#define clfftSetPlanScale clfftSetPlanScale_
+#define clfftSetPlanTransposeResult clfftSetPlanTransposeResult_
+#define clfftSetResultLocation clfftSetResultLocation_
+#define clfftSetup clfftSetup_
+#define clfftTeardown clfftTeardown_
+
+#include <clFFT.h> // any listed name appearing in this header is seen by the compiler with the trailing '_' because of the macros above
+
+// generated by parser_clfft.py -- drops the temporary NAME_ aliases; names with an active #define now expand to NAME_pfn, names whose #define is commented out are left without any macro
+#undef clfftBakePlan
+#define clfftBakePlan clfftBakePlan_pfn
+#undef clfftCopyPlan
+//#define clfftCopyPlan clfftCopyPlan_pfn
+#undef clfftCreateDefaultPlan
+#define clfftCreateDefaultPlan clfftCreateDefaultPlan_pfn
+#undef clfftDestroyPlan
+#define clfftDestroyPlan clfftDestroyPlan_pfn
+#undef clfftEnqueueTransform
+#define clfftEnqueueTransform clfftEnqueueTransform_pfn
+#undef clfftGetLayout
+//#define clfftGetLayout clfftGetLayout_pfn
+#undef clfftGetPlanBatchSize
+//#define clfftGetPlanBatchSize clfftGetPlanBatchSize_pfn
+#undef clfftGetPlanContext
+//#define clfftGetPlanContext clfftGetPlanContext_pfn
+#undef clfftGetPlanDim
+//#define clfftGetPlanDim clfftGetPlanDim_pfn
+#undef clfftGetPlanDistance
+//#define clfftGetPlanDistance clfftGetPlanDistance_pfn
+#undef clfftGetPlanInStride
+//#define clfftGetPlanInStride clfftGetPlanInStride_pfn
+#undef clfftGetPlanLength
+//#define clfftGetPlanLength clfftGetPlanLength_pfn
+#undef clfftGetPlanOutStride
+//#define clfftGetPlanOutStride clfftGetPlanOutStride_pfn
+#undef clfftGetPlanPrecision
+//#define clfftGetPlanPrecision clfftGetPlanPrecision_pfn
+#undef clfftGetPlanScale
+//#define clfftGetPlanScale clfftGetPlanScale_pfn
+#undef clfftGetPlanTransposeResult
+//#define clfftGetPlanTransposeResult clfftGetPlanTransposeResult_pfn
+#undef clfftGetResultLocation
+//#define clfftGetResultLocation clfftGetResultLocation_pfn
+#undef clfftGetTmpBufSize
+#define clfftGetTmpBufSize clfftGetTmpBufSize_pfn
+#undef clfftGetVersion
+#define clfftGetVersion clfftGetVersion_pfn
+#undef clfftSetLayout
+#define clfftSetLayout clfftSetLayout_pfn
+#undef clfftSetPlanBatchSize
+#define clfftSetPlanBatchSize clfftSetPlanBatchSize_pfn
+#undef clfftSetPlanCallback
+//#define clfftSetPlanCallback clfftSetPlanCallback_pfn
+#undef clfftSetPlanDim
+//#define clfftSetPlanDim clfftSetPlanDim_pfn
+#undef clfftSetPlanDistance
+#define clfftSetPlanDistance clfftSetPlanDistance_pfn
+#undef clfftSetPlanInStride
+#define clfftSetPlanInStride clfftSetPlanInStride_pfn
+#undef clfftSetPlanLength
+//#define clfftSetPlanLength clfftSetPlanLength_pfn
+#undef clfftSetPlanOutStride
+#define clfftSetPlanOutStride clfftSetPlanOutStride_pfn
+#undef clfftSetPlanPrecision
+#define clfftSetPlanPrecision clfftSetPlanPrecision_pfn
+#undef clfftSetPlanScale
+#define clfftSetPlanScale clfftSetPlanScale_pfn
+#undef clfftSetPlanTransposeResult
+//#define clfftSetPlanTransposeResult clfftSetPlanTransposeResult_pfn
+#undef clfftSetResultLocation
+#define clfftSetResultLocation clfftSetResultLocation_pfn
+#undef clfftSetup
+#define clfftSetup clfftSetup_pfn
+#undef clfftTeardown
+#define clfftTeardown clfftTeardown_pfn
+
+// generated by parser_clfft.py -- extern function-pointer declarations; each active name expands to NAME_pfn via the macros earlier in this file. Commented-out entries match the names left without a _pfn macro (NOTE(review): presumably entry points OpenCV does not call -- confirm against parser_clfft.py).
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftBakePlan)(clfftPlanHandle plHandle, cl_uint numQueues, cl_command_queue* commQueueFFT, void (CL_CALLBACK* pfn_notify) (clfftPlanHandle plHandle, void* user_data), void* user_data);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftCopyPlan)(clfftPlanHandle* out_plHandle, cl_context new_context, clfftPlanHandle in_plHandle);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftCreateDefaultPlan)(clfftPlanHandle* plHandle, cl_context context, const clfftDim dim, const size_t* clLengths);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftDestroyPlan)(clfftPlanHandle* plHandle);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftEnqueueTransform)(clfftPlanHandle plHandle, clfftDirection dir, cl_uint numQueuesAndEvents, cl_command_queue* commQueues, cl_uint numWaitEvents, const cl_event* waitEvents, cl_event* outEvents, cl_mem* inputBuffers, cl_mem* outputBuffers, cl_mem tmpBuffer);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetLayout)(const clfftPlanHandle plHandle, clfftLayout* iLayout, clfftLayout* oLayout);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanBatchSize)(const clfftPlanHandle plHandle, size_t* batchSize);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanContext)(const clfftPlanHandle plHandle, cl_context* context);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanDim)(const clfftPlanHandle plHandle, clfftDim* dim, cl_uint* size);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanDistance)(const clfftPlanHandle plHandle, size_t* iDist, size_t* oDist);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanInStride)(const clfftPlanHandle plHandle, const clfftDim dim, size_t* clStrides);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanLength)(const clfftPlanHandle plHandle, const clfftDim dim, size_t* clLengths);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanOutStride)(const clfftPlanHandle plHandle, const clfftDim dim, size_t* clStrides);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanPrecision)(const clfftPlanHandle plHandle, clfftPrecision* precision);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanScale)(const clfftPlanHandle plHandle, clfftDirection dir, cl_float* scale);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanTransposeResult)(const clfftPlanHandle plHandle, clfftResultTransposed* transposed);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetResultLocation)(const clfftPlanHandle plHandle, clfftResultLocation* placeness);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetTmpBufSize)(const clfftPlanHandle plHandle, size_t* buffersize);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetVersion)(cl_uint* major, cl_uint* minor, cl_uint* patch);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetLayout)(clfftPlanHandle plHandle, clfftLayout iLayout, clfftLayout oLayout);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanBatchSize)(clfftPlanHandle plHandle, size_t batchSize);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanCallback)(clfftPlanHandle plHandle, const char* funcName, const char* funcString, int localMemSize, clfftCallbackType callbackType, cl_mem* userdata, int numUserdataBuffers);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanDim)(clfftPlanHandle plHandle, const clfftDim dim);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanDistance)(clfftPlanHandle plHandle, size_t iDist, size_t oDist);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanInStride)(clfftPlanHandle plHandle, const clfftDim dim, size_t* clStrides);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanLength)(clfftPlanHandle plHandle, const clfftDim dim, const size_t* clLengths);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanOutStride)(clfftPlanHandle plHandle, const clfftDim dim, size_t* clStrides);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanPrecision)(clfftPlanHandle plHandle, clfftPrecision precision);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanScale)(clfftPlanHandle plHandle, clfftDirection dir, cl_float scale);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanTransposeResult)(clfftPlanHandle plHandle, clfftResultTransposed transposed);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetResultLocation)(clfftPlanHandle plHandle, clfftResultLocation placeness);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetup)(const clfftSetupData* setupData);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftTeardown)();
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_core.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_core.hpp
new file mode 100644
index 000000000000..28618a1f3a6a
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_core.hpp
@@ -0,0 +1,371 @@
+//
+// AUTOGENERATED, DO NOT EDIT
+// The guard below stops compilation with "Invalid usage" unless OPENCV_CORE_OCL_RUNTIME_OPENCL_CORE_HPP is already defined (presumably by the wrapper header that includes this file -- confirm).
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_CORE_HPP
+#error "Invalid usage"
+#endif
+
+// generated by parser_cl.py -- while the OpenCL header is preprocessed below (<OpenCL/cl.h> when __APPLE__ is defined, <CL/cl.h> otherwise), every clXxx name listed here expands to clXxx_ (trailing underscore)
+#define clBuildProgram clBuildProgram_
+#define clCompileProgram clCompileProgram_
+#define clCreateBuffer clCreateBuffer_
+#define clCreateCommandQueue clCreateCommandQueue_
+#define clCreateContext clCreateContext_
+#define clCreateContextFromType clCreateContextFromType_
+#define clCreateImage clCreateImage_
+#define clCreateImage2D clCreateImage2D_
+#define clCreateImage3D clCreateImage3D_
+#define clCreateKernel clCreateKernel_
+#define clCreateKernelsInProgram clCreateKernelsInProgram_
+#define clCreateProgramWithBinary clCreateProgramWithBinary_
+#define clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels_
+#define clCreateProgramWithSource clCreateProgramWithSource_
+#define clCreateSampler clCreateSampler_
+#define clCreateSubBuffer clCreateSubBuffer_
+#define clCreateSubDevices clCreateSubDevices_
+#define clCreateUserEvent clCreateUserEvent_
+#define clEnqueueBarrier clEnqueueBarrier_
+#define clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList_
+#define clEnqueueCopyBuffer clEnqueueCopyBuffer_
+#define clEnqueueCopyBufferRect clEnqueueCopyBufferRect_
+#define clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage_
+#define clEnqueueCopyImage clEnqueueCopyImage_
+#define clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer_
+#define clEnqueueFillBuffer clEnqueueFillBuffer_
+#define clEnqueueFillImage clEnqueueFillImage_
+#define clEnqueueMapBuffer clEnqueueMapBuffer_
+#define clEnqueueMapImage clEnqueueMapImage_
+#define clEnqueueMarker clEnqueueMarker_
+#define clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList_
+#define clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects_
+#define clEnqueueNDRangeKernel clEnqueueNDRangeKernel_
+#define clEnqueueNativeKernel clEnqueueNativeKernel_
+#define clEnqueueReadBuffer clEnqueueReadBuffer_
+#define clEnqueueReadBufferRect clEnqueueReadBufferRect_
+#define clEnqueueReadImage clEnqueueReadImage_
+#define clEnqueueTask clEnqueueTask_
+#define clEnqueueUnmapMemObject clEnqueueUnmapMemObject_
+#define clEnqueueWaitForEvents clEnqueueWaitForEvents_
+#define clEnqueueWriteBuffer clEnqueueWriteBuffer_
+#define clEnqueueWriteBufferRect clEnqueueWriteBufferRect_
+#define clEnqueueWriteImage clEnqueueWriteImage_
+#define clFinish clFinish_
+#define clFlush clFlush_
+#define clGetCommandQueueInfo clGetCommandQueueInfo_
+#define clGetContextInfo clGetContextInfo_
+#define clGetDeviceIDs clGetDeviceIDs_
+#define clGetDeviceInfo clGetDeviceInfo_
+#define clGetEventInfo clGetEventInfo_
+#define clGetEventProfilingInfo clGetEventProfilingInfo_
+#define clGetExtensionFunctionAddress clGetExtensionFunctionAddress_
+#define clGetExtensionFunctionAddressForPlatform clGetExtensionFunctionAddressForPlatform_
+#define clGetImageInfo clGetImageInfo_
+#define clGetKernelArgInfo clGetKernelArgInfo_
+#define clGetKernelInfo clGetKernelInfo_
+#define clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo_
+#define clGetMemObjectInfo clGetMemObjectInfo_
+#define clGetPlatformIDs clGetPlatformIDs_
+#define clGetPlatformInfo clGetPlatformInfo_
+#define clGetProgramBuildInfo clGetProgramBuildInfo_
+#define clGetProgramInfo clGetProgramInfo_
+#define clGetSamplerInfo clGetSamplerInfo_
+#define clGetSupportedImageFormats clGetSupportedImageFormats_
+#define clLinkProgram clLinkProgram_
+#define clReleaseCommandQueue clReleaseCommandQueue_
+#define clReleaseContext clReleaseContext_
+#define clReleaseDevice clReleaseDevice_
+#define clReleaseEvent clReleaseEvent_
+#define clReleaseKernel clReleaseKernel_
+#define clReleaseMemObject clReleaseMemObject_
+#define clReleaseProgram clReleaseProgram_
+#define clReleaseSampler clReleaseSampler_
+#define clRetainCommandQueue clRetainCommandQueue_
+#define clRetainContext clRetainContext_
+#define clRetainDevice clRetainDevice_
+#define clRetainEvent clRetainEvent_
+#define clRetainKernel clRetainKernel_
+#define clRetainMemObject clRetainMemObject_
+#define clRetainProgram clRetainProgram_
+#define clRetainSampler clRetainSampler_
+#define clSetEventCallback clSetEventCallback_
+#define clSetKernelArg clSetKernelArg_
+#define clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback_
+#define clSetUserEventStatus clSetUserEventStatus_
+#define clUnloadCompiler clUnloadCompiler_
+#define clUnloadPlatformCompiler clUnloadPlatformCompiler_
+#define clWaitForEvents clWaitForEvents_
+
+#if defined __APPLE__
+#define CL_SILENCE_DEPRECATION
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+// generated by parser_cl.py -- drops the temporary NAME_ aliases and maps every clXxx name in this table to clXxx_pfn instead
+#undef clBuildProgram
+#define clBuildProgram clBuildProgram_pfn
+#undef clCompileProgram
+#define clCompileProgram clCompileProgram_pfn
+#undef clCreateBuffer
+#define clCreateBuffer clCreateBuffer_pfn
+#undef clCreateCommandQueue
+#define clCreateCommandQueue clCreateCommandQueue_pfn
+#undef clCreateContext
+#define clCreateContext clCreateContext_pfn
+#undef clCreateContextFromType
+#define clCreateContextFromType clCreateContextFromType_pfn
+#undef clCreateImage
+#define clCreateImage clCreateImage_pfn
+#undef clCreateImage2D
+#define clCreateImage2D clCreateImage2D_pfn
+#undef clCreateImage3D
+#define clCreateImage3D clCreateImage3D_pfn
+#undef clCreateKernel
+#define clCreateKernel clCreateKernel_pfn
+#undef clCreateKernelsInProgram
+#define clCreateKernelsInProgram clCreateKernelsInProgram_pfn
+#undef clCreateProgramWithBinary
+#define clCreateProgramWithBinary clCreateProgramWithBinary_pfn
+#undef clCreateProgramWithBuiltInKernels
+#define clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels_pfn
+#undef clCreateProgramWithSource
+#define clCreateProgramWithSource clCreateProgramWithSource_pfn
+#undef clCreateSampler
+#define clCreateSampler clCreateSampler_pfn
+#undef clCreateSubBuffer
+#define clCreateSubBuffer clCreateSubBuffer_pfn
+#undef clCreateSubDevices
+#define clCreateSubDevices clCreateSubDevices_pfn
+#undef clCreateUserEvent
+#define clCreateUserEvent clCreateUserEvent_pfn
+#undef clEnqueueBarrier
+#define clEnqueueBarrier clEnqueueBarrier_pfn
+#undef clEnqueueBarrierWithWaitList
+#define clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList_pfn
+#undef clEnqueueCopyBuffer
+#define clEnqueueCopyBuffer clEnqueueCopyBuffer_pfn
+#undef clEnqueueCopyBufferRect
+#define clEnqueueCopyBufferRect clEnqueueCopyBufferRect_pfn
+#undef clEnqueueCopyBufferToImage
+#define clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage_pfn
+#undef clEnqueueCopyImage
+#define clEnqueueCopyImage clEnqueueCopyImage_pfn
+#undef clEnqueueCopyImageToBuffer
+#define clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer_pfn
+#undef clEnqueueFillBuffer
+#define clEnqueueFillBuffer clEnqueueFillBuffer_pfn
+#undef clEnqueueFillImage
+#define clEnqueueFillImage clEnqueueFillImage_pfn
+#undef clEnqueueMapBuffer
+#define clEnqueueMapBuffer clEnqueueMapBuffer_pfn
+#undef clEnqueueMapImage
+#define clEnqueueMapImage clEnqueueMapImage_pfn
+#undef clEnqueueMarker
+#define clEnqueueMarker clEnqueueMarker_pfn
+#undef clEnqueueMarkerWithWaitList
+#define clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList_pfn
+#undef clEnqueueMigrateMemObjects
+#define clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects_pfn
+#undef clEnqueueNDRangeKernel
+#define clEnqueueNDRangeKernel clEnqueueNDRangeKernel_pfn
+#undef clEnqueueNativeKernel
+#define clEnqueueNativeKernel clEnqueueNativeKernel_pfn
+#undef clEnqueueReadBuffer
+#define clEnqueueReadBuffer clEnqueueReadBuffer_pfn
+#undef clEnqueueReadBufferRect
+#define clEnqueueReadBufferRect clEnqueueReadBufferRect_pfn
+#undef clEnqueueReadImage
+#define clEnqueueReadImage clEnqueueReadImage_pfn
+#undef clEnqueueTask
+#define clEnqueueTask clEnqueueTask_pfn
+#undef clEnqueueUnmapMemObject
+#define clEnqueueUnmapMemObject clEnqueueUnmapMemObject_pfn
+#undef clEnqueueWaitForEvents
+#define clEnqueueWaitForEvents clEnqueueWaitForEvents_pfn
+#undef clEnqueueWriteBuffer
+#define clEnqueueWriteBuffer clEnqueueWriteBuffer_pfn
+#undef clEnqueueWriteBufferRect
+#define clEnqueueWriteBufferRect clEnqueueWriteBufferRect_pfn
+#undef clEnqueueWriteImage
+#define clEnqueueWriteImage clEnqueueWriteImage_pfn
+#undef clFinish
+#define clFinish clFinish_pfn
+#undef clFlush
+#define clFlush clFlush_pfn
+#undef clGetCommandQueueInfo
+#define clGetCommandQueueInfo clGetCommandQueueInfo_pfn
+#undef clGetContextInfo
+#define clGetContextInfo clGetContextInfo_pfn
+#undef clGetDeviceIDs
+#define clGetDeviceIDs clGetDeviceIDs_pfn
+#undef clGetDeviceInfo
+#define clGetDeviceInfo clGetDeviceInfo_pfn
+#undef clGetEventInfo
+#define clGetEventInfo clGetEventInfo_pfn
+#undef clGetEventProfilingInfo
+#define clGetEventProfilingInfo clGetEventProfilingInfo_pfn
+#undef clGetExtensionFunctionAddress
+#define clGetExtensionFunctionAddress clGetExtensionFunctionAddress_pfn
+#undef clGetExtensionFunctionAddressForPlatform
+#define clGetExtensionFunctionAddressForPlatform clGetExtensionFunctionAddressForPlatform_pfn
+#undef clGetImageInfo
+#define clGetImageInfo clGetImageInfo_pfn
+#undef clGetKernelArgInfo
+#define clGetKernelArgInfo clGetKernelArgInfo_pfn
+#undef clGetKernelInfo
+#define clGetKernelInfo clGetKernelInfo_pfn
+#undef clGetKernelWorkGroupInfo
+#define clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo_pfn
+#undef clGetMemObjectInfo
+#define clGetMemObjectInfo clGetMemObjectInfo_pfn
+#undef clGetPlatformIDs
+#define clGetPlatformIDs clGetPlatformIDs_pfn
+#undef clGetPlatformInfo
+#define clGetPlatformInfo clGetPlatformInfo_pfn
+#undef clGetProgramBuildInfo
+#define clGetProgramBuildInfo clGetProgramBuildInfo_pfn
+#undef clGetProgramInfo
+#define clGetProgramInfo clGetProgramInfo_pfn
+#undef clGetSamplerInfo
+#define clGetSamplerInfo clGetSamplerInfo_pfn
+#undef clGetSupportedImageFormats
+#define clGetSupportedImageFormats clGetSupportedImageFormats_pfn
+#undef clLinkProgram
+#define clLinkProgram clLinkProgram_pfn
+#undef clReleaseCommandQueue
+#define clReleaseCommandQueue clReleaseCommandQueue_pfn
+#undef clReleaseContext
+#define clReleaseContext clReleaseContext_pfn
+#undef clReleaseDevice
+#define clReleaseDevice clReleaseDevice_pfn
+#undef clReleaseEvent
+#define clReleaseEvent clReleaseEvent_pfn
+#undef clReleaseKernel
+#define clReleaseKernel clReleaseKernel_pfn
+#undef clReleaseMemObject
+#define clReleaseMemObject clReleaseMemObject_pfn
+#undef clReleaseProgram
+#define clReleaseProgram clReleaseProgram_pfn
+#undef clReleaseSampler
+#define clReleaseSampler clReleaseSampler_pfn
+#undef clRetainCommandQueue
+#define clRetainCommandQueue clRetainCommandQueue_pfn
+#undef clRetainContext
+#define clRetainContext clRetainContext_pfn
+#undef clRetainDevice
+#define clRetainDevice clRetainDevice_pfn
+#undef clRetainEvent
+#define clRetainEvent clRetainEvent_pfn
+#undef clRetainKernel
+#define clRetainKernel clRetainKernel_pfn
+#undef clRetainMemObject
+#define clRetainMemObject clRetainMemObject_pfn
+#undef clRetainProgram
+#define clRetainProgram clRetainProgram_pfn
+#undef clRetainSampler
+#define clRetainSampler clRetainSampler_pfn
+#undef clSetEventCallback
+#define clSetEventCallback clSetEventCallback_pfn
+#undef clSetKernelArg
+#define clSetKernelArg clSetKernelArg_pfn
+#undef clSetMemObjectDestructorCallback
+#define clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback_pfn
+#undef clSetUserEventStatus
+#define clSetUserEventStatus clSetUserEventStatus_pfn
+#undef clUnloadCompiler
+#define clUnloadCompiler clUnloadCompiler_pfn
+#undef clUnloadPlatformCompiler
+#define clUnloadPlatformCompiler clUnloadPlatformCompiler_pfn
+#undef clWaitForEvents
+#define clWaitForEvents clWaitForEvents_pfn
+
+// generated by parser_cl.py
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clBuildProgram)(cl_program, cl_uint, const cl_device_id*, const char*, void (CL_CALLBACK*) (cl_program, void*), void*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clCompileProgram)(cl_program, cl_uint, const cl_device_id*, const char*, cl_uint, const cl_program*, const char**, void (CL_CALLBACK*) (cl_program, void*), void*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateBuffer)(cl_context, cl_mem_flags, size_t, void*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_command_queue (CL_API_CALL*clCreateCommandQueue)(cl_context, cl_device_id, cl_command_queue_properties, cl_int*);
+extern CL_RUNTIME_EXPORT cl_context (CL_API_CALL*clCreateContext)(const cl_context_properties*, cl_uint, const cl_device_id*, void (CL_CALLBACK*) (const char*, const void*, size_t, void*), void*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_context (CL_API_CALL*clCreateContextFromType)(const cl_context_properties*, cl_device_type, void (CL_CALLBACK*) (const char*, const void*, size_t, void*), void*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateImage)(cl_context, cl_mem_flags, const cl_image_format*, const cl_image_desc*, void*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateImage2D)(cl_context, cl_mem_flags, const cl_image_format*, size_t, size_t, size_t, void*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateImage3D)(cl_context, cl_mem_flags, const cl_image_format*, size_t, size_t, size_t, size_t, size_t, void*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_kernel (CL_API_CALL*clCreateKernel)(cl_program, const char*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clCreateKernelsInProgram)(cl_program, cl_uint, cl_kernel*, cl_uint*);
+extern CL_RUNTIME_EXPORT cl_program (CL_API_CALL*clCreateProgramWithBinary)(cl_context, cl_uint, const cl_device_id*, const size_t*, const unsigned char**, cl_int*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_program (CL_API_CALL*clCreateProgramWithBuiltInKernels)(cl_context, cl_uint, const cl_device_id*, const char*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_program (CL_API_CALL*clCreateProgramWithSource)(cl_context, cl_uint, const char**, const size_t*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_sampler (CL_API_CALL*clCreateSampler)(cl_context, cl_bool, cl_addressing_mode, cl_filter_mode, cl_int*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateSubBuffer)(cl_mem, cl_mem_flags, cl_buffer_create_type, const void*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clCreateSubDevices)(cl_device_id, const cl_device_partition_property*, cl_uint, cl_device_id*, cl_uint*);
+extern CL_RUNTIME_EXPORT cl_event (CL_API_CALL*clCreateUserEvent)(cl_context, cl_int*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueBarrier)(cl_command_queue);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueBarrierWithWaitList)(cl_command_queue, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueCopyBuffer)(cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueCopyBufferRect)(cl_command_queue, cl_mem, cl_mem, const size_t*, const size_t*, const size_t*, size_t, size_t, size_t, size_t, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueCopyBufferToImage)(cl_command_queue, cl_mem, cl_mem, size_t, const size_t*, const size_t*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueCopyImage)(cl_command_queue, cl_mem, cl_mem, const size_t*, const size_t*, const size_t*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueCopyImageToBuffer)(cl_command_queue, cl_mem, cl_mem, const size_t*, const size_t*, size_t, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueFillBuffer)(cl_command_queue, cl_mem, const void*, size_t, size_t, size_t, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueFillImage)(cl_command_queue, cl_mem, const void*, const size_t*, const size_t*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT void* (CL_API_CALL*clEnqueueMapBuffer)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event*, cl_event*, cl_int*);
+extern CL_RUNTIME_EXPORT void* (CL_API_CALL*clEnqueueMapImage)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, const size_t*, const size_t*, size_t*, size_t*, cl_uint, const cl_event*, cl_event*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueMarker)(cl_command_queue, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueMarkerWithWaitList)(cl_command_queue, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueMigrateMemObjects)(cl_command_queue, cl_uint, const cl_mem*, cl_mem_migration_flags, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueNativeKernel)(cl_command_queue, void (CL_CALLBACK*) (void*), void*, size_t, cl_uint, const cl_mem*, const void**, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueReadBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueReadBufferRect)(cl_command_queue, cl_mem, cl_bool, const size_t*, const size_t*, const size_t*, size_t, size_t, size_t, size_t, void*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueReadImage)(cl_command_queue, cl_mem, cl_bool, const size_t*, const size_t*, size_t, size_t, void*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueTask)(cl_command_queue, cl_kernel, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueUnmapMemObject)(cl_command_queue, cl_mem, void*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueWaitForEvents)(cl_command_queue, cl_uint, const cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueWriteBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueWriteBufferRect)(cl_command_queue, cl_mem, cl_bool, const size_t*, const size_t*, const size_t*, size_t, size_t, size_t, size_t, const void*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueWriteImage)(cl_command_queue, cl_mem, cl_bool, const size_t*, const size_t*, size_t, size_t, const void*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clFinish)(cl_command_queue);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clFlush)(cl_command_queue);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetCommandQueueInfo)(cl_command_queue, cl_command_queue_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetContextInfo)(cl_context, cl_context_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetDeviceIDs)(cl_platform_id, cl_device_type, cl_uint, cl_device_id*, cl_uint*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetDeviceInfo)(cl_device_id, cl_device_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetEventInfo)(cl_event, cl_event_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetEventProfilingInfo)(cl_event, cl_profiling_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT void* (CL_API_CALL*clGetExtensionFunctionAddress)(const char*);
+extern CL_RUNTIME_EXPORT void* (CL_API_CALL*clGetExtensionFunctionAddressForPlatform)(cl_platform_id, const char*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetImageInfo)(cl_mem, cl_image_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetKernelArgInfo)(cl_kernel, cl_uint, cl_kernel_arg_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetKernelInfo)(cl_kernel, cl_kernel_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetKernelWorkGroupInfo)(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetMemObjectInfo)(cl_mem, cl_mem_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetPlatformIDs)(cl_uint, cl_platform_id*, cl_uint*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetPlatformInfo)(cl_platform_id, cl_platform_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetProgramBuildInfo)(cl_program, cl_device_id, cl_program_build_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetProgramInfo)(cl_program, cl_program_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetSamplerInfo)(cl_sampler, cl_sampler_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetSupportedImageFormats)(cl_context, cl_mem_flags, cl_mem_object_type, cl_uint, cl_image_format*, cl_uint*);
+extern CL_RUNTIME_EXPORT cl_program (CL_API_CALL*clLinkProgram)(cl_context, cl_uint, const cl_device_id*, const char*, cl_uint, const cl_program*, void (CL_CALLBACK*) (cl_program, void*), void*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clReleaseCommandQueue)(cl_command_queue);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clReleaseContext)(cl_context);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clReleaseDevice)(cl_device_id);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clReleaseEvent)(cl_event);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clReleaseKernel)(cl_kernel);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clReleaseMemObject)(cl_mem);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clReleaseProgram)(cl_program);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clReleaseSampler)(cl_sampler);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clRetainCommandQueue)(cl_command_queue);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clRetainContext)(cl_context);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clRetainDevice)(cl_device_id);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clRetainEvent)(cl_event);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clRetainKernel)(cl_kernel);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clRetainMemObject)(cl_mem);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clRetainProgram)(cl_program);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clRetainSampler)(cl_sampler);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clSetEventCallback)(cl_event, cl_int, void (CL_CALLBACK*) (cl_event, cl_int, void*), void*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clSetKernelArg)(cl_kernel, cl_uint, size_t, const void*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clSetMemObjectDestructorCallback)(cl_mem, void (CL_CALLBACK*) (cl_mem, void*), void*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clSetUserEventStatus)(cl_event, cl_int);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clUnloadCompiler)();
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clUnloadPlatformCompiler)(cl_platform_id);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clWaitForEvents)(cl_uint, const cl_event*);
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_core_wrappers.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_core_wrappers.hpp
new file mode 100644
index 000000000000..216b22b8a85c
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_core_wrappers.hpp
@@ -0,0 +1,272 @@
+//
+// AUTOGENERATED, DO NOT EDIT
+//
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_WRAPPERS_HPP
+#error "Invalid usage"
+#endif
+
+// generated by parser_cl.py
+#undef clBuildProgram
+#define clBuildProgram clBuildProgram_fn
+inline cl_int clBuildProgram(cl_program p0, cl_uint p1, const cl_device_id* p2, const char* p3, void (CL_CALLBACK*p4) (cl_program, void*), void* p5) { return clBuildProgram_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clCompileProgram
+#define clCompileProgram clCompileProgram_fn
+inline cl_int clCompileProgram(cl_program p0, cl_uint p1, const cl_device_id* p2, const char* p3, cl_uint p4, const cl_program* p5, const char** p6, void (CL_CALLBACK*p7) (cl_program, void*), void* p8) { return clCompileProgram_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clCreateBuffer
+#define clCreateBuffer clCreateBuffer_fn
+inline cl_mem clCreateBuffer(cl_context p0, cl_mem_flags p1, size_t p2, void* p3, cl_int* p4) { return clCreateBuffer_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateCommandQueue
+#define clCreateCommandQueue clCreateCommandQueue_fn
+inline cl_command_queue clCreateCommandQueue(cl_context p0, cl_device_id p1, cl_command_queue_properties p2, cl_int* p3) { return clCreateCommandQueue_pfn(p0, p1, p2, p3); }
+#undef clCreateContext
+#define clCreateContext clCreateContext_fn
+inline cl_context clCreateContext(const cl_context_properties* p0, cl_uint p1, const cl_device_id* p2, void (CL_CALLBACK*p3) (const char*, const void*, size_t, void*), void* p4, cl_int* p5) { return clCreateContext_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clCreateContextFromType
+#define clCreateContextFromType clCreateContextFromType_fn
+inline cl_context clCreateContextFromType(const cl_context_properties* p0, cl_device_type p1, void (CL_CALLBACK*p2) (const char*, const void*, size_t, void*), void* p3, cl_int* p4) { return clCreateContextFromType_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateImage
+#define clCreateImage clCreateImage_fn
+inline cl_mem clCreateImage(cl_context p0, cl_mem_flags p1, const cl_image_format* p2, const cl_image_desc* p3, void* p4, cl_int* p5) { return clCreateImage_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clCreateImage2D
+#define clCreateImage2D clCreateImage2D_fn
+inline cl_mem clCreateImage2D(cl_context p0, cl_mem_flags p1, const cl_image_format* p2, size_t p3, size_t p4, size_t p5, void* p6, cl_int* p7) { return clCreateImage2D_pfn(p0, p1, p2, p3, p4, p5, p6, p7); }
+#undef clCreateImage3D
+#define clCreateImage3D clCreateImage3D_fn
+inline cl_mem clCreateImage3D(cl_context p0, cl_mem_flags p1, const cl_image_format* p2, size_t p3, size_t p4, size_t p5, size_t p6, size_t p7, void* p8, cl_int* p9) { return clCreateImage3D_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9); }
+#undef clCreateKernel
+#define clCreateKernel clCreateKernel_fn
+inline cl_kernel clCreateKernel(cl_program p0, const char* p1, cl_int* p2) { return clCreateKernel_pfn(p0, p1, p2); }
+#undef clCreateKernelsInProgram
+#define clCreateKernelsInProgram clCreateKernelsInProgram_fn
+inline cl_int clCreateKernelsInProgram(cl_program p0, cl_uint p1, cl_kernel* p2, cl_uint* p3) { return clCreateKernelsInProgram_pfn(p0, p1, p2, p3); }
+#undef clCreateProgramWithBinary
+#define clCreateProgramWithBinary clCreateProgramWithBinary_fn
+inline cl_program clCreateProgramWithBinary(cl_context p0, cl_uint p1, const cl_device_id* p2, const size_t* p3, const unsigned char** p4, cl_int* p5, cl_int* p6) { return clCreateProgramWithBinary_pfn(p0, p1, p2, p3, p4, p5, p6); }
+#undef clCreateProgramWithBuiltInKernels
+#define clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels_fn
+inline cl_program clCreateProgramWithBuiltInKernels(cl_context p0, cl_uint p1, const cl_device_id* p2, const char* p3, cl_int* p4) { return clCreateProgramWithBuiltInKernels_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateProgramWithSource
+#define clCreateProgramWithSource clCreateProgramWithSource_fn
+inline cl_program clCreateProgramWithSource(cl_context p0, cl_uint p1, const char** p2, const size_t* p3, cl_int* p4) { return clCreateProgramWithSource_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateSampler
+#define clCreateSampler clCreateSampler_fn
+inline cl_sampler clCreateSampler(cl_context p0, cl_bool p1, cl_addressing_mode p2, cl_filter_mode p3, cl_int* p4) { return clCreateSampler_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateSubBuffer
+#define clCreateSubBuffer clCreateSubBuffer_fn
+inline cl_mem clCreateSubBuffer(cl_mem p0, cl_mem_flags p1, cl_buffer_create_type p2, const void* p3, cl_int* p4) { return clCreateSubBuffer_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateSubDevices
+#define clCreateSubDevices clCreateSubDevices_fn
+inline cl_int clCreateSubDevices(cl_device_id p0, const cl_device_partition_property* p1, cl_uint p2, cl_device_id* p3, cl_uint* p4) { return clCreateSubDevices_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateUserEvent
+#define clCreateUserEvent clCreateUserEvent_fn
+inline cl_event clCreateUserEvent(cl_context p0, cl_int* p1) { return clCreateUserEvent_pfn(p0, p1); }
+#undef clEnqueueBarrier
+#define clEnqueueBarrier clEnqueueBarrier_fn
+inline cl_int clEnqueueBarrier(cl_command_queue p0) { return clEnqueueBarrier_pfn(p0); }
+#undef clEnqueueBarrierWithWaitList
+#define clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList_fn
+inline cl_int clEnqueueBarrierWithWaitList(cl_command_queue p0, cl_uint p1, const cl_event* p2, cl_event* p3) { return clEnqueueBarrierWithWaitList_pfn(p0, p1, p2, p3); }
+#undef clEnqueueCopyBuffer
+#define clEnqueueCopyBuffer clEnqueueCopyBuffer_fn
+inline cl_int clEnqueueCopyBuffer(cl_command_queue p0, cl_mem p1, cl_mem p2, size_t p3, size_t p4, size_t p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueCopyBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueCopyBufferRect
+#define clEnqueueCopyBufferRect clEnqueueCopyBufferRect_fn
+inline cl_int clEnqueueCopyBufferRect(cl_command_queue p0, cl_mem p1, cl_mem p2, const size_t* p3, const size_t* p4, const size_t* p5, size_t p6, size_t p7, size_t p8, size_t p9, cl_uint p10, const cl_event* p11, cl_event* p12) { return clEnqueueCopyBufferRect_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12); }
+#undef clEnqueueCopyBufferToImage
+#define clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage_fn
+inline cl_int clEnqueueCopyBufferToImage(cl_command_queue p0, cl_mem p1, cl_mem p2, size_t p3, const size_t* p4, const size_t* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueCopyBufferToImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueCopyImage
+#define clEnqueueCopyImage clEnqueueCopyImage_fn
+inline cl_int clEnqueueCopyImage(cl_command_queue p0, cl_mem p1, cl_mem p2, const size_t* p3, const size_t* p4, const size_t* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueCopyImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueCopyImageToBuffer
+#define clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer_fn
+inline cl_int clEnqueueCopyImageToBuffer(cl_command_queue p0, cl_mem p1, cl_mem p2, const size_t* p3, const size_t* p4, size_t p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueCopyImageToBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueFillBuffer
+#define clEnqueueFillBuffer clEnqueueFillBuffer_fn
+inline cl_int clEnqueueFillBuffer(cl_command_queue p0, cl_mem p1, const void* p2, size_t p3, size_t p4, size_t p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueFillBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueFillImage
+#define clEnqueueFillImage clEnqueueFillImage_fn
+inline cl_int clEnqueueFillImage(cl_command_queue p0, cl_mem p1, const void* p2, const size_t* p3, const size_t* p4, cl_uint p5, const cl_event* p6, cl_event* p7) { return clEnqueueFillImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7); }
+#undef clEnqueueMapBuffer
+#define clEnqueueMapBuffer clEnqueueMapBuffer_fn
+inline void* clEnqueueMapBuffer(cl_command_queue p0, cl_mem p1, cl_bool p2, cl_map_flags p3, size_t p4, size_t p5, cl_uint p6, const cl_event* p7, cl_event* p8, cl_int* p9) { return clEnqueueMapBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9); }
+#undef clEnqueueMapImage
+#define clEnqueueMapImage clEnqueueMapImage_fn
+inline void* clEnqueueMapImage(cl_command_queue p0, cl_mem p1, cl_bool p2, cl_map_flags p3, const size_t* p4, const size_t* p5, size_t* p6, size_t* p7, cl_uint p8, const cl_event* p9, cl_event* p10, cl_int* p11) { return clEnqueueMapImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11); }
+#undef clEnqueueMarker
+#define clEnqueueMarker clEnqueueMarker_fn
+inline cl_int clEnqueueMarker(cl_command_queue p0, cl_event* p1) { return clEnqueueMarker_pfn(p0, p1); }
+#undef clEnqueueMarkerWithWaitList
+#define clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList_fn
+inline cl_int clEnqueueMarkerWithWaitList(cl_command_queue p0, cl_uint p1, const cl_event* p2, cl_event* p3) { return clEnqueueMarkerWithWaitList_pfn(p0, p1, p2, p3); }
+#undef clEnqueueMigrateMemObjects
+#define clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects_fn
+inline cl_int clEnqueueMigrateMemObjects(cl_command_queue p0, cl_uint p1, const cl_mem* p2, cl_mem_migration_flags p3, cl_uint p4, const cl_event* p5, cl_event* p6) { return clEnqueueMigrateMemObjects_pfn(p0, p1, p2, p3, p4, p5, p6); }
+#undef clEnqueueNDRangeKernel
+#define clEnqueueNDRangeKernel clEnqueueNDRangeKernel_fn
+inline cl_int clEnqueueNDRangeKernel(cl_command_queue p0, cl_kernel p1, cl_uint p2, const size_t* p3, const size_t* p4, const size_t* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueNDRangeKernel_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueNativeKernel
+#define clEnqueueNativeKernel clEnqueueNativeKernel_fn
+inline cl_int clEnqueueNativeKernel(cl_command_queue p0, void (CL_CALLBACK*p1) (void*), void* p2, size_t p3, cl_uint p4, const cl_mem* p5, const void** p6, cl_uint p7, const cl_event* p8, cl_event* p9) { return clEnqueueNativeKernel_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9); }
+#undef clEnqueueReadBuffer
+#define clEnqueueReadBuffer clEnqueueReadBuffer_fn
+inline cl_int clEnqueueReadBuffer(cl_command_queue p0, cl_mem p1, cl_bool p2, size_t p3, size_t p4, void* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueReadBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueReadBufferRect
+#define clEnqueueReadBufferRect clEnqueueReadBufferRect_fn
+inline cl_int clEnqueueReadBufferRect(cl_command_queue p0, cl_mem p1, cl_bool p2, const size_t* p3, const size_t* p4, const size_t* p5, size_t p6, size_t p7, size_t p8, size_t p9, void* p10, cl_uint p11, const cl_event* p12, cl_event* p13) { return clEnqueueReadBufferRect_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13); }
+#undef clEnqueueReadImage
+#define clEnqueueReadImage clEnqueueReadImage_fn
+inline cl_int clEnqueueReadImage(cl_command_queue p0, cl_mem p1, cl_bool p2, const size_t* p3, const size_t* p4, size_t p5, size_t p6, void* p7, cl_uint p8, const cl_event* p9, cl_event* p10) { return clEnqueueReadImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10); }
+#undef clEnqueueTask
+#define clEnqueueTask clEnqueueTask_fn
+inline cl_int clEnqueueTask(cl_command_queue p0, cl_kernel p1, cl_uint p2, const cl_event* p3, cl_event* p4) { return clEnqueueTask_pfn(p0, p1, p2, p3, p4); }
+#undef clEnqueueUnmapMemObject
+#define clEnqueueUnmapMemObject clEnqueueUnmapMemObject_fn
+inline cl_int clEnqueueUnmapMemObject(cl_command_queue p0, cl_mem p1, void* p2, cl_uint p3, const cl_event* p4, cl_event* p5) { return clEnqueueUnmapMemObject_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clEnqueueWaitForEvents
+#define clEnqueueWaitForEvents clEnqueueWaitForEvents_fn
+inline cl_int clEnqueueWaitForEvents(cl_command_queue p0, cl_uint p1, const cl_event* p2) { return clEnqueueWaitForEvents_pfn(p0, p1, p2); }
+#undef clEnqueueWriteBuffer
+#define clEnqueueWriteBuffer clEnqueueWriteBuffer_fn
+inline cl_int clEnqueueWriteBuffer(cl_command_queue p0, cl_mem p1, cl_bool p2, size_t p3, size_t p4, const void* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueWriteBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueWriteBufferRect
+#define clEnqueueWriteBufferRect clEnqueueWriteBufferRect_fn
+inline cl_int clEnqueueWriteBufferRect(cl_command_queue p0, cl_mem p1, cl_bool p2, const size_t* p3, const size_t* p4, const size_t* p5, size_t p6, size_t p7, size_t p8, size_t p9, const void* p10, cl_uint p11, const cl_event* p12, cl_event* p13) { return clEnqueueWriteBufferRect_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13); }
+#undef clEnqueueWriteImage
+#define clEnqueueWriteImage clEnqueueWriteImage_fn
+inline cl_int clEnqueueWriteImage(cl_command_queue p0, cl_mem p1, cl_bool p2, const size_t* p3, const size_t* p4, size_t p5, size_t p6, const void* p7, cl_uint p8, const cl_event* p9, cl_event* p10) { return clEnqueueWriteImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10); }
+#undef clFinish
+#define clFinish clFinish_fn
+inline cl_int clFinish(cl_command_queue p0) { return clFinish_pfn(p0); }
+#undef clFlush
+#define clFlush clFlush_fn
+inline cl_int clFlush(cl_command_queue p0) { return clFlush_pfn(p0); }
+#undef clGetCommandQueueInfo
+#define clGetCommandQueueInfo clGetCommandQueueInfo_fn
+inline cl_int clGetCommandQueueInfo(cl_command_queue p0, cl_command_queue_info p1, size_t p2, void* p3, size_t* p4) { return clGetCommandQueueInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetContextInfo
+#define clGetContextInfo clGetContextInfo_fn
+inline cl_int clGetContextInfo(cl_context p0, cl_context_info p1, size_t p2, void* p3, size_t* p4) { return clGetContextInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetDeviceIDs
+#define clGetDeviceIDs clGetDeviceIDs_fn
+inline cl_int clGetDeviceIDs(cl_platform_id p0, cl_device_type p1, cl_uint p2, cl_device_id* p3, cl_uint* p4) { return clGetDeviceIDs_pfn(p0, p1, p2, p3, p4); }
+#undef clGetDeviceInfo
+#define clGetDeviceInfo clGetDeviceInfo_fn
+inline cl_int clGetDeviceInfo(cl_device_id p0, cl_device_info p1, size_t p2, void* p3, size_t* p4) { return clGetDeviceInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetEventInfo
+#define clGetEventInfo clGetEventInfo_fn
+inline cl_int clGetEventInfo(cl_event p0, cl_event_info p1, size_t p2, void* p3, size_t* p4) { return clGetEventInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetEventProfilingInfo
+#define clGetEventProfilingInfo clGetEventProfilingInfo_fn
+inline cl_int clGetEventProfilingInfo(cl_event p0, cl_profiling_info p1, size_t p2, void* p3, size_t* p4) { return clGetEventProfilingInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetExtensionFunctionAddress
+#define clGetExtensionFunctionAddress clGetExtensionFunctionAddress_fn
+inline void* clGetExtensionFunctionAddress(const char* p0) { return clGetExtensionFunctionAddress_pfn(p0); }
+#undef clGetExtensionFunctionAddressForPlatform
+#define clGetExtensionFunctionAddressForPlatform clGetExtensionFunctionAddressForPlatform_fn
+inline void* clGetExtensionFunctionAddressForPlatform(cl_platform_id p0, const char* p1) { return clGetExtensionFunctionAddressForPlatform_pfn(p0, p1); }
+#undef clGetImageInfo
+#define clGetImageInfo clGetImageInfo_fn
+inline cl_int clGetImageInfo(cl_mem p0, cl_image_info p1, size_t p2, void* p3, size_t* p4) { return clGetImageInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetKernelArgInfo
+#define clGetKernelArgInfo clGetKernelArgInfo_fn
+inline cl_int clGetKernelArgInfo(cl_kernel p0, cl_uint p1, cl_kernel_arg_info p2, size_t p3, void* p4, size_t* p5) { return clGetKernelArgInfo_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clGetKernelInfo
+#define clGetKernelInfo clGetKernelInfo_fn
+inline cl_int clGetKernelInfo(cl_kernel p0, cl_kernel_info p1, size_t p2, void* p3, size_t* p4) { return clGetKernelInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetKernelWorkGroupInfo
+#define clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo_fn
+inline cl_int clGetKernelWorkGroupInfo(cl_kernel p0, cl_device_id p1, cl_kernel_work_group_info p2, size_t p3, void* p4, size_t* p5) { return clGetKernelWorkGroupInfo_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clGetMemObjectInfo
+#define clGetMemObjectInfo clGetMemObjectInfo_fn
+inline cl_int clGetMemObjectInfo(cl_mem p0, cl_mem_info p1, size_t p2, void* p3, size_t* p4) { return clGetMemObjectInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetPlatformIDs
+#define clGetPlatformIDs clGetPlatformIDs_fn
+inline cl_int clGetPlatformIDs(cl_uint p0, cl_platform_id* p1, cl_uint* p2) { return clGetPlatformIDs_pfn(p0, p1, p2); }
+#undef clGetPlatformInfo
+#define clGetPlatformInfo clGetPlatformInfo_fn
+inline cl_int clGetPlatformInfo(cl_platform_id p0, cl_platform_info p1, size_t p2, void* p3, size_t* p4) { return clGetPlatformInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetProgramBuildInfo
+#define clGetProgramBuildInfo clGetProgramBuildInfo_fn
+inline cl_int clGetProgramBuildInfo(cl_program p0, cl_device_id p1, cl_program_build_info p2, size_t p3, void* p4, size_t* p5) { return clGetProgramBuildInfo_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clGetProgramInfo
+#define clGetProgramInfo clGetProgramInfo_fn
+inline cl_int clGetProgramInfo(cl_program p0, cl_program_info p1, size_t p2, void* p3, size_t* p4) { return clGetProgramInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetSamplerInfo
+#define clGetSamplerInfo clGetSamplerInfo_fn
+inline cl_int clGetSamplerInfo(cl_sampler p0, cl_sampler_info p1, size_t p2, void* p3, size_t* p4) { return clGetSamplerInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetSupportedImageFormats
+#define clGetSupportedImageFormats clGetSupportedImageFormats_fn
+inline cl_int clGetSupportedImageFormats(cl_context p0, cl_mem_flags p1, cl_mem_object_type p2, cl_uint p3, cl_image_format* p4, cl_uint* p5) { return clGetSupportedImageFormats_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clLinkProgram
+#define clLinkProgram clLinkProgram_fn
+inline cl_program clLinkProgram(cl_context p0, cl_uint p1, const cl_device_id* p2, const char* p3, cl_uint p4, const cl_program* p5, void (CL_CALLBACK*p6) (cl_program, void*), void* p7, cl_int* p8) { return clLinkProgram_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clReleaseCommandQueue
+#define clReleaseCommandQueue clReleaseCommandQueue_fn
+inline cl_int clReleaseCommandQueue(cl_command_queue p0) { return clReleaseCommandQueue_pfn(p0); }
+#undef clReleaseContext
+#define clReleaseContext clReleaseContext_fn
+inline cl_int clReleaseContext(cl_context p0) { return clReleaseContext_pfn(p0); }
+#undef clReleaseDevice
+#define clReleaseDevice clReleaseDevice_fn
+inline cl_int clReleaseDevice(cl_device_id p0) { return clReleaseDevice_pfn(p0); }
+#undef clReleaseEvent
+#define clReleaseEvent clReleaseEvent_fn
+inline cl_int clReleaseEvent(cl_event p0) { return clReleaseEvent_pfn(p0); }
+#undef clReleaseKernel
+#define clReleaseKernel clReleaseKernel_fn
+inline cl_int clReleaseKernel(cl_kernel p0) { return clReleaseKernel_pfn(p0); }
+#undef clReleaseMemObject
+#define clReleaseMemObject clReleaseMemObject_fn
+inline cl_int clReleaseMemObject(cl_mem p0) { return clReleaseMemObject_pfn(p0); }
+#undef clReleaseProgram
+#define clReleaseProgram clReleaseProgram_fn
+inline cl_int clReleaseProgram(cl_program p0) { return clReleaseProgram_pfn(p0); }
+#undef clReleaseSampler
+#define clReleaseSampler clReleaseSampler_fn
+inline cl_int clReleaseSampler(cl_sampler p0) { return clReleaseSampler_pfn(p0); }
+#undef clRetainCommandQueue
+#define clRetainCommandQueue clRetainCommandQueue_fn
+inline cl_int clRetainCommandQueue(cl_command_queue p0) { return clRetainCommandQueue_pfn(p0); }
+#undef clRetainContext
+#define clRetainContext clRetainContext_fn
+inline cl_int clRetainContext(cl_context p0) { return clRetainContext_pfn(p0); }
+#undef clRetainDevice
+#define clRetainDevice clRetainDevice_fn
+inline cl_int clRetainDevice(cl_device_id p0) { return clRetainDevice_pfn(p0); }
+#undef clRetainEvent
+#define clRetainEvent clRetainEvent_fn
+inline cl_int clRetainEvent(cl_event p0) { return clRetainEvent_pfn(p0); }
+#undef clRetainKernel
+#define clRetainKernel clRetainKernel_fn
+inline cl_int clRetainKernel(cl_kernel p0) { return clRetainKernel_pfn(p0); }
+#undef clRetainMemObject
+#define clRetainMemObject clRetainMemObject_fn
+inline cl_int clRetainMemObject(cl_mem p0) { return clRetainMemObject_pfn(p0); }
+#undef clRetainProgram
+#define clRetainProgram clRetainProgram_fn
+inline cl_int clRetainProgram(cl_program p0) { return clRetainProgram_pfn(p0); }
+#undef clRetainSampler
+#define clRetainSampler clRetainSampler_fn
+inline cl_int clRetainSampler(cl_sampler p0) { return clRetainSampler_pfn(p0); }
+#undef clSetEventCallback
+#define clSetEventCallback clSetEventCallback_fn
+inline cl_int clSetEventCallback(cl_event p0, cl_int p1, void (CL_CALLBACK*p2) (cl_event, cl_int, void*), void* p3) { return clSetEventCallback_pfn(p0, p1, p2, p3); }
+#undef clSetKernelArg
+#define clSetKernelArg clSetKernelArg_fn
+inline cl_int clSetKernelArg(cl_kernel p0, cl_uint p1, size_t p2, const void* p3) { return clSetKernelArg_pfn(p0, p1, p2, p3); }
+#undef clSetMemObjectDestructorCallback
+#define clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback_fn
+inline cl_int clSetMemObjectDestructorCallback(cl_mem p0, void (CL_CALLBACK*p1) (cl_mem, void*), void* p2) { return clSetMemObjectDestructorCallback_pfn(p0, p1, p2); }
+#undef clSetUserEventStatus
+#define clSetUserEventStatus clSetUserEventStatus_fn
+inline cl_int clSetUserEventStatus(cl_event p0, cl_int p1) { return clSetUserEventStatus_pfn(p0, p1); }
+#undef clUnloadCompiler
+#define clUnloadCompiler clUnloadCompiler_fn
+inline cl_int clUnloadCompiler() { return clUnloadCompiler_pfn(); }
+#undef clUnloadPlatformCompiler
+#define clUnloadPlatformCompiler clUnloadPlatformCompiler_fn
+inline cl_int clUnloadPlatformCompiler(cl_platform_id p0) { return clUnloadPlatformCompiler_pfn(p0); }
+#undef clWaitForEvents
+#define clWaitForEvents clWaitForEvents_fn
+inline cl_int clWaitForEvents(cl_uint p0, const cl_event* p1) { return clWaitForEvents_pfn(p0, p1); }
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl.hpp
new file mode 100644
index 000000000000..0b12aed6c6ce
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl.hpp
@@ -0,0 +1,62 @@
+//
+// AUTOGENERATED, DO NOT EDIT
+//
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_HPP
+#error "Invalid usage"
+#endif
+
+// generated by parser_cl.py
+#define clCreateFromGLBuffer clCreateFromGLBuffer_
+#define clCreateFromGLRenderbuffer clCreateFromGLRenderbuffer_
+#define clCreateFromGLTexture clCreateFromGLTexture_
+#define clCreateFromGLTexture2D clCreateFromGLTexture2D_
+#define clCreateFromGLTexture3D clCreateFromGLTexture3D_
+#define clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects_
+#define clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects_
+#define clGetGLContextInfoKHR clGetGLContextInfoKHR_
+#define clGetGLObjectInfo clGetGLObjectInfo_
+#define clGetGLTextureInfo clGetGLTextureInfo_
+
+#if defined __APPLE__
+#include <OpenCL/cl_gl.h>
+#else
+#include <CL/cl_gl.h>
+#endif
+
+// generated by parser_cl.py
+#undef clCreateFromGLBuffer
+#define clCreateFromGLBuffer clCreateFromGLBuffer_pfn
+#undef clCreateFromGLRenderbuffer
+#define clCreateFromGLRenderbuffer clCreateFromGLRenderbuffer_pfn
+#undef clCreateFromGLTexture
+#define clCreateFromGLTexture clCreateFromGLTexture_pfn
+#undef clCreateFromGLTexture2D
+#define clCreateFromGLTexture2D clCreateFromGLTexture2D_pfn
+#undef clCreateFromGLTexture3D
+#define clCreateFromGLTexture3D clCreateFromGLTexture3D_pfn
+#undef clEnqueueAcquireGLObjects
+#define clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects_pfn
+#undef clEnqueueReleaseGLObjects
+#define clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects_pfn
+#undef clGetGLContextInfoKHR
+#define clGetGLContextInfoKHR clGetGLContextInfoKHR_pfn
+#undef clGetGLObjectInfo
+#define clGetGLObjectInfo clGetGLObjectInfo_pfn
+#undef clGetGLTextureInfo
+#define clGetGLTextureInfo clGetGLTextureInfo_pfn
+
+#ifdef cl_khr_gl_sharing
+
+// generated by parser_cl.py
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateFromGLBuffer)(cl_context, cl_mem_flags, cl_GLuint, int*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateFromGLRenderbuffer)(cl_context, cl_mem_flags, cl_GLuint, cl_int*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateFromGLTexture)(cl_context, cl_mem_flags, cl_GLenum, cl_GLint, cl_GLuint, cl_int*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateFromGLTexture2D)(cl_context, cl_mem_flags, cl_GLenum, cl_GLint, cl_GLuint, cl_int*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateFromGLTexture3D)(cl_context, cl_mem_flags, cl_GLenum, cl_GLint, cl_GLuint, cl_int*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueAcquireGLObjects)(cl_command_queue, cl_uint, const cl_mem*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueReleaseGLObjects)(cl_command_queue, cl_uint, const cl_mem*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetGLContextInfoKHR)(const cl_context_properties*, cl_gl_context_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetGLObjectInfo)(cl_mem, cl_gl_object_type*, cl_GLuint*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetGLTextureInfo)(cl_mem, cl_gl_texture_info, size_t, void*, size_t*);
+
+#endif // cl_khr_gl_sharing
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl_wrappers.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl_wrappers.hpp
new file mode 100644
index 000000000000..12f342b2e4be
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl_wrappers.hpp
@@ -0,0 +1,42 @@
+//
+// AUTOGENERATED, DO NOT EDIT
+//
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_WRAPPERS_HPP
+#error "Invalid usage"
+#endif
+
+#ifdef cl_khr_gl_sharing
+// generated by parser_cl.py -- for each entry point the public name is re-#define'd to <name>_fn, and the
+// inline function that follows (which therefore is <name>_fn) forwards all of its arguments to <name>_pfn.
+#undef clCreateFromGLBuffer
+#define clCreateFromGLBuffer clCreateFromGLBuffer_fn
+inline cl_mem clCreateFromGLBuffer(cl_context p0, cl_mem_flags p1, cl_GLuint p2, int* p3) { return clCreateFromGLBuffer_pfn(p0, p1, p2, p3); }
+#undef clCreateFromGLRenderbuffer
+#define clCreateFromGLRenderbuffer clCreateFromGLRenderbuffer_fn
+inline cl_mem clCreateFromGLRenderbuffer(cl_context p0, cl_mem_flags p1, cl_GLuint p2, cl_int* p3) { return clCreateFromGLRenderbuffer_pfn(p0, p1, p2, p3); }
+#undef clCreateFromGLTexture
+#define clCreateFromGLTexture clCreateFromGLTexture_fn
+inline cl_mem clCreateFromGLTexture(cl_context p0, cl_mem_flags p1, cl_GLenum p2, cl_GLint p3, cl_GLuint p4, cl_int* p5) { return clCreateFromGLTexture_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clCreateFromGLTexture2D
+#define clCreateFromGLTexture2D clCreateFromGLTexture2D_fn
+inline cl_mem clCreateFromGLTexture2D(cl_context p0, cl_mem_flags p1, cl_GLenum p2, cl_GLint p3, cl_GLuint p4, cl_int* p5) { return clCreateFromGLTexture2D_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clCreateFromGLTexture3D
+#define clCreateFromGLTexture3D clCreateFromGLTexture3D_fn
+inline cl_mem clCreateFromGLTexture3D(cl_context p0, cl_mem_flags p1, cl_GLenum p2, cl_GLint p3, cl_GLuint p4, cl_int* p5) { return clCreateFromGLTexture3D_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clEnqueueAcquireGLObjects
+#define clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects_fn
+inline cl_int clEnqueueAcquireGLObjects(cl_command_queue p0, cl_uint p1, const cl_mem* p2, cl_uint p3, const cl_event* p4, cl_event* p5) { return clEnqueueAcquireGLObjects_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clEnqueueReleaseGLObjects
+#define clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects_fn
+inline cl_int clEnqueueReleaseGLObjects(cl_command_queue p0, cl_uint p1, const cl_mem* p2, cl_uint p3, const cl_event* p4, cl_event* p5) { return clEnqueueReleaseGLObjects_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clGetGLContextInfoKHR
+#define clGetGLContextInfoKHR clGetGLContextInfoKHR_fn
+inline cl_int clGetGLContextInfoKHR(const cl_context_properties* p0, cl_gl_context_info p1, size_t p2, void* p3, size_t* p4) { return clGetGLContextInfoKHR_pfn(p0, p1, p2, p3, p4); }
+#undef clGetGLObjectInfo
+#define clGetGLObjectInfo clGetGLObjectInfo_fn
+inline cl_int clGetGLObjectInfo(cl_mem p0, cl_gl_object_type* p1, cl_GLuint* p2) { return clGetGLObjectInfo_pfn(p0, p1, p2); }
+#undef clGetGLTextureInfo
+#define clGetGLTextureInfo clGetGLTextureInfo_fn
+inline cl_int clGetGLTextureInfo(cl_mem p0, cl_gl_texture_info p1, size_t p2, void* p3, size_t* p4) { return clGetGLTextureInfo_pfn(p0, p1, p2, p3, p4); }
+
+#endif // cl_khr_gl_sharing
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_clblas.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_clblas.hpp
new file mode 100644
index 000000000000..ccddf8f76c19
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_clblas.hpp
@@ -0,0 +1,53 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_CLAMDBLAS_HPP
+#define OPENCV_CORE_OCL_RUNTIME_CLAMDBLAS_HPP
+
+#ifdef HAVE_CLAMDBLAS
+
+#include "opencl_core.hpp"
+
+#include "autogenerated/opencl_clblas.hpp"
+
+#endif // HAVE_CLAMDBLAS
+
+#endif // OPENCV_CORE_OCL_RUNTIME_CLAMDBLAS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_clfft.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_clfft.hpp
new file mode 100644
index 000000000000..7f4af5e60b7e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_clfft.hpp
@@ -0,0 +1,53 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_CLAMDFFT_HPP
+#define OPENCV_CORE_OCL_RUNTIME_CLAMDFFT_HPP
+
+#ifdef HAVE_CLAMDFFT
+
+#include "opencl_core.hpp"
+
+#include "autogenerated/opencl_clfft.hpp"
+
+#endif // HAVE_CLAMDFFT
+
+#endif // OPENCV_CORE_OCL_RUNTIME_CLAMDFFT_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_core.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_core.hpp
new file mode 100644
index 000000000000..0404b3177a58
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_core.hpp
@@ -0,0 +1,84 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_CORE_HPP
+#define OPENCV_CORE_OCL_RUNTIME_OPENCL_CORE_HPP
+
+#ifdef HAVE_OPENCL
+// CL_RUNTIME_EXPORT: __declspec(dllimport) for shared builds on _WIN32/WINCE (except __OPENCV_BUILD with OPENCV_MODULE_IS_PART_OF_WORLD), else empty.
+#ifndef CL_RUNTIME_EXPORT
+#if (defined(BUILD_SHARED_LIBS) || defined(OPENCV_CORE_SHARED)) && (defined _WIN32 || defined WINCE) && \
+    !(defined(__OPENCV_BUILD) && defined(OPENCV_MODULE_IS_PART_OF_WORLD))
+#define CL_RUNTIME_EXPORT __declspec(dllimport)
+#else
+#define CL_RUNTIME_EXPORT
+#endif
+#endif
+// With HAVE_OPENCL_SVM the SVM API names get a trailing '_' before the autogenerated header is included; opencl_svm_20.hpp #undef's them again.
+#ifdef HAVE_OPENCL_SVM
+#define clSVMAlloc clSVMAlloc_
+#define clSVMFree clSVMFree_
+#define clSetKernelArgSVMPointer clSetKernelArgSVMPointer_
+#define clSetKernelExecInfo clSetKernelExecInfo_
+#define clEnqueueSVMFree clEnqueueSVMFree_
+#define clEnqueueSVMMemcpy clEnqueueSVMMemcpy_
+#define clEnqueueSVMMemFill clEnqueueSVMMemFill_
+#define clEnqueueSVMMap clEnqueueSVMMap_
+#define clEnqueueSVMUnmap clEnqueueSVMUnmap_
+#endif
+
+#include "autogenerated/opencl_core.hpp"
+// Fallback values for constants that the headers included above may not have defined.
+#ifndef CL_DEVICE_DOUBLE_FP_CONFIG
+#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
+#endif
+
+#ifndef CL_DEVICE_HALF_FP_CONFIG
+#define CL_DEVICE_HALF_FP_CONFIG 0x1033
+#endif
+// Only when CL_VERSION_1_2 is not defined: CV_REQUIRE_OPENCL_1_2_ERROR expands to a CV_Error(OpenCLApiCallError) call.
+#ifndef CL_VERSION_1_2
+#define CV_REQUIRE_OPENCL_1_2_ERROR CV_Error(cv::Error::OpenCLApiCallError, "OpenCV compiled without OpenCL v1.2 support, so we can't use functionality from OpenCL v1.2")
+#endif
+
+#endif // HAVE_OPENCL
+
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_CORE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_core_wrappers.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_core_wrappers.hpp
new file mode 100644
index 000000000000..38fcae9952f2
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_core_wrappers.hpp
@@ -0,0 +1,47 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_WRAPPERS_HPP
+#define OPENCV_CORE_OCL_RUNTIME_OPENCL_WRAPPERS_HPP
+
+#include "autogenerated/opencl_core_wrappers.hpp"
+
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_WRAPPERS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_gl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_gl.hpp
new file mode 100644
index 000000000000..659c7d805814
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_gl.hpp
@@ -0,0 +1,53 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_HPP
+#define OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_HPP
+
+#if defined HAVE_OPENCL && defined HAVE_OPENGL
+
+#include "opencl_core.hpp"
+
+#include "autogenerated/opencl_gl.hpp"
+
+#endif // defined HAVE_OPENCL && defined HAVE_OPENGL
+
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_gl_wrappers.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_gl_wrappers.hpp
new file mode 100644
index 000000000000..9700004cae5a
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_gl_wrappers.hpp
@@ -0,0 +1,47 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_WRAPPERS_HPP
+#define OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_WRAPPERS_HPP
+
+#include "autogenerated/opencl_gl_wrappers.hpp"
+
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_WRAPPERS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_svm_20.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_svm_20.hpp
new file mode 100644
index 000000000000..9636b19b027f
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_svm_20.hpp
@@ -0,0 +1,48 @@
+/* See LICENSE file in the root OpenCV directory */
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_2_0_HPP
+#define OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_2_0_HPP
+
+#if defined(HAVE_OPENCL_SVM)
+#include "opencl_core.hpp"
+
+#include "opencl_svm_definitions.hpp"
+// Map the SVM API names to <name>_pfn, so the extern declarations further down declare the *_pfn function-pointer variables.
+#undef clSVMAlloc
+#define clSVMAlloc clSVMAlloc_pfn
+#undef clSVMFree
+#define clSVMFree clSVMFree_pfn
+#undef clSetKernelArgSVMPointer
+#define clSetKernelArgSVMPointer clSetKernelArgSVMPointer_pfn
+#undef clSetKernelExecInfo
+//#define clSetKernelExecInfo clSetKernelExecInfo_pfn
+#undef clEnqueueSVMFree
+//#define clEnqueueSVMFree clEnqueueSVMFree_pfn
+#undef clEnqueueSVMMemcpy
+#define clEnqueueSVMMemcpy clEnqueueSVMMemcpy_pfn
+#undef clEnqueueSVMMemFill
+#define clEnqueueSVMMemFill clEnqueueSVMMemFill_pfn
+#undef clEnqueueSVMMap
+#define clEnqueueSVMMap clEnqueueSVMMap_pfn
+#undef clEnqueueSVMUnmap
+#define clEnqueueSVMUnmap clEnqueueSVMUnmap_pfn
+// clSetKernelExecInfo and clEnqueueSVMFree are only #undef'd above: their #define and extern lines are commented out.
+extern CL_RUNTIME_EXPORT void* (CL_API_CALL *clSVMAlloc)(cl_context context, cl_svm_mem_flags flags, size_t size, unsigned int alignment);
+extern CL_RUNTIME_EXPORT void (CL_API_CALL *clSVMFree)(cl_context context, void* svm_pointer);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL *clSetKernelArgSVMPointer)(cl_kernel kernel, cl_uint arg_index, const void* arg_value);
+//extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL *clSetKernelExecInfo)(cl_kernel kernel, cl_kernel_exec_info param_name, size_t param_value_size, const void* param_value);
+//extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL *clEnqueueSVMFree)(cl_command_queue command_queue, cl_uint num_svm_pointers, void* svm_pointers[],
+//        void (CL_CALLBACK *pfn_free_func)(cl_command_queue queue, cl_uint num_svm_pointers, void* svm_pointers[], void* user_data), void* user_data,
+//        cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL *clEnqueueSVMMemcpy)(cl_command_queue command_queue, cl_bool blocking_copy, void* dst_ptr, const void* src_ptr, size_t size,
+        cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL *clEnqueueSVMMemFill)(cl_command_queue command_queue, void* svm_ptr, const void* pattern, size_t pattern_size, size_t size,
+        cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL *clEnqueueSVMMap)(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags map_flags, void* svm_ptr, size_t size,
+        cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL *clEnqueueSVMUnmap)(cl_command_queue command_queue, void* svm_ptr,
+        cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event);
+
+#endif // HAVE_OPENCL_SVM
+
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_2_0_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_svm_definitions.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_svm_definitions.hpp
new file mode 100644
index 000000000000..97c927b44d8c
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_svm_definitions.hpp
@@ -0,0 +1,42 @@
+/* See LICENSE file in the root OpenCV directory */
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_DEFINITIONS_HPP
+#define OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_DEFINITIONS_HPP
+
+#if defined(HAVE_OPENCL_SVM)
+#if defined(CL_VERSION_2_0)
+
+// CL_VERSION_2_0 is defined: the OpenCL headers are expected to provide the SVM types and constants, so nothing is defined here.
+
+#else
+
+typedef cl_bitfield cl_device_svm_capabilities;
+typedef cl_bitfield cl_svm_mem_flags;
+typedef cl_uint     cl_kernel_exec_info;
+
+// Fallback SVM constants for OpenCL headers that do not define CL_VERSION_2_0.
+// The values equal the corresponding *_AMD constants in opencl_svm_hsa_extension.hpp.
+// TODO (from upstream): confirm these against the released OpenCL 2.0 headers.
+
+#ifndef CL_DEVICE_SVM_CAPABILITIES
+#define CL_DEVICE_SVM_CAPABILITIES 0x1053
+
+#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER             (1 << 0)
+#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER               (1 << 1)
+#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM               (1 << 2)
+#define CL_DEVICE_SVM_ATOMICS                         (1 << 3)
+#endif
+
+#ifndef CL_MEM_SVM_FINE_GRAIN_BUFFER
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10)
+#endif
+
+#ifndef CL_MEM_SVM_ATOMICS
+#define CL_MEM_SVM_ATOMICS (1 << 11)
+#endif
+
+
+#endif // CL_VERSION_2_0
+#endif // HAVE_OPENCL_SVM
+
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_DEFINITIONS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp
new file mode 100644
index 000000000000..497bc3de7205
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp
@@ -0,0 +1,166 @@
+/* See LICENSE file in the root OpenCV directory */
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_HSA_EXTENSION_HPP
+#define OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_HSA_EXTENSION_HPP
+
+#if defined(HAVE_OPENCL_SVM)
+#include "opencl_core.hpp"
+// The declarations below (up to the matching #endif) are skipped when CL_DEVICE_SVM_CAPABILITIES_AMD is already defined.
+#ifndef CL_DEVICE_SVM_CAPABILITIES_AMD
+//
+//  Part of the file is an extract from the cl_ext.h file from AMD APP SDK package.
+//  Below is the original copyright.
+//
+/*******************************************************************************
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/*******************************************
+ * Shared Virtual Memory (SVM) extension
+ *******************************************/
+typedef cl_bitfield                      cl_device_svm_capabilities_amd;
+typedef cl_bitfield                      cl_svm_mem_flags_amd;
+typedef cl_uint                          cl_kernel_exec_info_amd;
+
+/* cl_device_info */
+#define CL_DEVICE_SVM_CAPABILITIES_AMD                     0x1053
+#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT_AMD  0x1054
+
+/* cl_device_svm_capabilities_amd */
+#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD             (1 << 0)
+#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_AMD               (1 << 1)
+#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_AMD               (1 << 2)
+#define CL_DEVICE_SVM_ATOMICS_AMD                         (1 << 3)
+
+/* cl_svm_mem_flags_amd */
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER_AMD                  (1 << 10)
+#define CL_MEM_SVM_ATOMICS_AMD                            (1 << 11)
+
+/* cl_mem_info */
+#define CL_MEM_USES_SVM_POINTER_AMD                       0x1109
+
+/* cl_kernel_exec_info_amd */
+#define CL_KERNEL_EXEC_INFO_SVM_PTRS_AMD                  0x11B6
+#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_AMD     0x11B7
+
+/* cl_command_type */
+#define CL_COMMAND_SVM_FREE_AMD                           0x1209
+#define CL_COMMAND_SVM_MEMCPY_AMD                         0x120A
+#define CL_COMMAND_SVM_MEMFILL_AMD                        0x120B
+#define CL_COMMAND_SVM_MAP_AMD                            0x120C
+#define CL_COMMAND_SVM_UNMAP_AMD                          0x120D
+/* Function-pointer typedefs (<name>AMD_fn), one per AMD SVM entry point; parameter names are kept as comments. */
+typedef CL_API_ENTRY void*
+(CL_API_CALL * clSVMAllocAMD_fn)(
+    cl_context            /* context */,
+    cl_svm_mem_flags_amd  /* flags */,
+    size_t                /* size */,
+    unsigned int          /* alignment */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY void
+(CL_API_CALL * clSVMFreeAMD_fn)(
+    cl_context  /* context */,
+    void*       /* svm_pointer */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clEnqueueSVMFreeAMD_fn)(
+    cl_command_queue /* command_queue */,
+    cl_uint          /* num_svm_pointers */,
+    void**           /* svm_pointers */,
+    void (CL_CALLBACK *)( /*pfn_free_func*/
+        cl_command_queue /* queue */,
+        cl_uint          /* num_svm_pointers */,
+        void**           /* svm_pointers */,
+        void*            /* user_data */),
+    void*             /* user_data */,
+    cl_uint           /* num_events_in_wait_list */,
+    const cl_event*   /* event_wait_list */,
+    cl_event*         /* event */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clEnqueueSVMMemcpyAMD_fn)(
+    cl_command_queue /* command_queue */,
+    cl_bool          /* blocking_copy */,
+    void*            /* dst_ptr */,
+    const void*      /* src_ptr */,
+    size_t           /* size */,
+    cl_uint          /* num_events_in_wait_list */,
+    const cl_event*  /* event_wait_list */,
+    cl_event*        /* event */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clEnqueueSVMMemFillAMD_fn)(
+    cl_command_queue /* command_queue */,
+    void*            /* svm_ptr */,
+    const void*      /* pattern */,
+    size_t           /* pattern_size */,
+    size_t           /* size */,
+    cl_uint          /* num_events_in_wait_list */,
+    const cl_event*  /* event_wait_list */,
+    cl_event*        /* event */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clEnqueueSVMMapAMD_fn)(
+    cl_command_queue /* command_queue */,
+    cl_bool          /* blocking_map */,
+    cl_map_flags     /* map_flags */,
+    void*            /* svm_ptr */,
+    size_t           /* size */,
+    cl_uint          /* num_events_in_wait_list */,
+    const cl_event*  /* event_wait_list */,
+    cl_event*        /* event */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clEnqueueSVMUnmapAMD_fn)(
+    cl_command_queue /* command_queue */,
+    void*            /* svm_ptr */,
+    cl_uint          /* num_events_in_wait_list */,
+    const cl_event*  /* event_wait_list */,
+    cl_event*        /* event */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clSetKernelArgSVMPointerAMD_fn)(
+    cl_kernel     /* kernel */,
+    cl_uint       /* arg_index */,
+    const void *  /* arg_value */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clSetKernelExecInfoAMD_fn)(
+     cl_kernel                /* kernel */,
+     cl_kernel_exec_info_amd  /* param_name */,
+     size_t                   /* param_value_size */,
+     const void *             /* param_value */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+#endif // CL_DEVICE_SVM_CAPABILITIES_AMD
+
+#endif // HAVE_OPENCL_SVM
+
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_HSA_EXTENSION_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/opengl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/opengl.hpp
new file mode 100644
index 000000000000..fceb85bd0665
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/opengl.hpp
@@ -0,0 +1,733 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_OPENGL_HPP
+#define OPENCV_CORE_OPENGL_HPP
+
+#ifndef __cplusplus
+#  error opengl.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core.hpp"
+#include "ocl.hpp"
+
+namespace cv { namespace ogl {
+
+/** @addtogroup core_opengl
+This section describes OpenGL interoperability.
+
+To enable OpenGL support, configure OpenCV using CMake with WITH_OPENGL=ON . Currently OpenGL is
+supported only with WIN32, GTK and Qt backends on Windows and Linux (MacOS and Android are not
+supported). For GTK backend gtkglext-1.0 library is required.
+
+To use OpenGL functionality you should first create an OpenGL context (window or frame buffer). You can
+do this with the namedWindow function or with another OpenGL toolkit (GLUT, for example).
+*/
+//! @{
+
+/////////////////// OpenGL Objects ///////////////////
+
+/** @brief Smart pointer for OpenGL buffer object with reference counting.
+
+Buffer Objects are OpenGL objects that store an array of unformatted memory allocated by the OpenGL
+context. These can be used to store vertex data, pixel data retrieved from images or the
+framebuffer, and a variety of other things.
+
+ogl::Buffer has an interface similar to the Mat interface and represents 2D array memory.
+
+ogl::Buffer supports memory transfers between host and device and also can be mapped to CUDA memory.
+ */
+class CV_EXPORTS Buffer
+{
+public:
+    /** @brief The target defines how you intend to use the buffer object.
+    */
+    enum Target
+    {
+        ARRAY_BUFFER         = 0x8892, //!< The buffer will be used as a source for vertex data
+        ELEMENT_ARRAY_BUFFER = 0x8893, //!< The buffer will be used for indices (in glDrawElements, for example)
+        PIXEL_PACK_BUFFER    = 0x88EB, //!< The buffer will be used for reading from OpenGL textures
+        PIXEL_UNPACK_BUFFER  = 0x88EC  //!< The buffer will be used for writing to OpenGL textures
+    };
+
+    enum Access //!< Access policy passed to mapHost()
+    {
+        READ_ONLY  = 0x88B8, //!< The mapped data store may only be read from
+        WRITE_ONLY = 0x88B9, //!< The mapped data store may only be written to
+        READ_WRITE = 0x88BA  //!< The mapped data store may be both read from and written to
+    };
+
+    /** @brief The constructors.
+
+    Creates empty ogl::Buffer object, creates ogl::Buffer object from an existing buffer ( abufId
+    parameter), allocates memory for ogl::Buffer object or copies from host/device memory.
+     */
+    Buffer();
+
+    /** @overload
+    @param arows Number of rows in a 2D array.
+    @param acols Number of columns in a 2D array.
+    @param atype Array type ( CV_8UC1, ..., CV_64FC4 ). See Mat for details.
+    @param abufId Buffer object name.
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    Buffer(int arows, int acols, int atype, unsigned int abufId, bool autoRelease = false);
+
+    /** @overload
+    @param asize 2D array size.
+    @param atype Array type ( CV_8UC1, ..., CV_64FC4 ). See Mat for details.
+    @param abufId Buffer object name.
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    Buffer(Size asize, int atype, unsigned int abufId, bool autoRelease = false);
+
+    /** @overload
+    @param arows Number of rows in a 2D array.
+    @param acols Number of columns in a 2D array.
+    @param atype Array type ( CV_8UC1, ..., CV_64FC4 ). See Mat for details.
+    @param target Buffer usage. See cv::ogl::Buffer::Target .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    Buffer(int arows, int acols, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
+
+    /** @overload
+    @param asize 2D array size.
+    @param atype Array type ( CV_8UC1, ..., CV_64FC4 ). See Mat for details.
+    @param target Buffer usage. See cv::ogl::Buffer::Target .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    Buffer(Size asize, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
+
+    /** @overload
+    @param arr Input array (host or device memory, it can be Mat , cuda::GpuMat or std::vector ).
+    @param target Buffer usage. See cv::ogl::Buffer::Target .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    explicit Buffer(InputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false);
+
+    /** @brief Allocates memory for ogl::Buffer object.
+
+    @param arows Number of rows in a 2D array.
+    @param acols Number of columns in a 2D array.
+    @param atype Array type ( CV_8UC1, ..., CV_64FC4 ). See Mat for details.
+    @param target Buffer usage. See cv::ogl::Buffer::Target .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+     */
+    void create(int arows, int acols, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
+
+    /** @overload
+    @param asize 2D array size.
+    @param atype Array type ( CV_8UC1, ..., CV_64FC4 ). See Mat for details.
+    @param target Buffer usage. See cv::ogl::Buffer::Target .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    void create(Size asize, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
+
+    /** @brief Decrements the reference counter and destroys the buffer object if needed.
+
+    The function will call setAutoRelease(true) .
+     */
+    void release();
+
+    /** @brief Sets auto release mode.
+
+    The lifetime of the OpenGL object is tied to the lifetime of the context. If OpenGL context was
+    bound to a window it could be released at any time (user can close a window). If object's destructor
+    is called after destruction of the context it will cause an error. Thus ogl::Buffer doesn't destroy
+    OpenGL object in destructor by default (all OpenGL resources will be released with OpenGL context).
+    This function can force ogl::Buffer destructor to destroy OpenGL object.
+    @param flag Auto release mode (if true, release will be called in object's destructor).
+     */
+    void setAutoRelease(bool flag);
+
+    /** @brief Copies from host/device memory to OpenGL buffer.
+    @param arr Input array (host or device memory, it can be Mat , cuda::GpuMat or std::vector ).
+    @param target Buffer usage. See cv::ogl::Buffer::Target .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+     */
+    void copyFrom(InputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false);
+
+    /** @overload */
+    void copyFrom(InputArray arr, cuda::Stream& stream, Target target = ARRAY_BUFFER, bool autoRelease = false);
+
+    /** @brief Copies from OpenGL buffer to host/device memory or another OpenGL buffer object.
+
+    @param arr Destination array (host or device memory, can be Mat , cuda::GpuMat , std::vector or
+    ogl::Buffer ).
+     */
+    void copyTo(OutputArray arr) const;
+
+    /** @overload */
+    void copyTo(OutputArray arr, cuda::Stream& stream) const;
+
+    /** @brief Creates a full copy of the buffer object and the underlying data.
+
+    @param target Buffer usage for destination buffer.
+    @param autoRelease Auto release mode for destination buffer.
+     */
+    Buffer clone(Target target = ARRAY_BUFFER, bool autoRelease = false) const;
+
+    /** @brief Binds OpenGL buffer to the specified buffer binding point.
+
+    @param target Binding point. See cv::ogl::Buffer::Target .
+     */
+    void bind(Target target) const;
+
+    /** @brief Unbind any buffers from the specified binding point.
+
+    @param target Binding point. See cv::ogl::Buffer::Target .
+     */
+    static void unbind(Target target);
+
+    /** @brief Maps OpenGL buffer to host memory.
+
+    mapHost maps to the client's address space the entire data store of the buffer object. The data can
+    then be directly read and/or written relative to the returned pointer, depending on the specified
+    access policy.
+
+    A mapped data store must be unmapped with ogl::Buffer::unmapHost before its buffer object is used.
+
+    This operation can lead to memory transfers between host and device.
+
+    Only one buffer object can be mapped at a time.
+    @param access Access policy, indicating whether it will be possible to read from, write to, or both
+    read from and write to the buffer object's mapped data store. The symbolic constant must be
+    ogl::Buffer::READ_ONLY , ogl::Buffer::WRITE_ONLY or ogl::Buffer::READ_WRITE .
+     */
+    Mat mapHost(Access access);
+
+    /** @brief Unmaps OpenGL buffer.
+    */
+    void unmapHost();
+
+    //! map to device memory (blocking)
+    cuda::GpuMat mapDevice();
+    void unmapDevice(); //!< counterpart of mapDevice() without a stream argument
+
+    /** @brief Maps OpenGL buffer to CUDA device memory.
+
+    This operation doesn't copy data. Several buffer objects can be mapped to CUDA memory at a time.
+
+    A mapped data store must be unmapped with ogl::Buffer::unmapDevice before its buffer object is used.
+     */
+    cuda::GpuMat mapDevice(cuda::Stream& stream);
+
+    /** @brief Unmaps OpenGL buffer.
+    */
+    void unmapDevice(cuda::Stream& stream);
+
+    int rows() const; //!< returns the stored row count
+    int cols() const; //!< returns the stored column count
+    Size size() const; //!< returns Size(cols, rows)
+    bool empty() const; //!< true when the row count or the column count is zero
+
+    int type() const; //!< returns the stored array type
+    int depth() const; //!< CV_MAT_DEPTH of the stored type
+    int channels() const; //!< CV_MAT_CN of the stored type
+    int elemSize() const; //!< CV_ELEM_SIZE of the stored type
+    int elemSize1() const; //!< CV_ELEM_SIZE1 of the stored type
+
+    //! get OpenGL object id
+    unsigned int bufId() const;
+
+    class Impl; //!< implementation type; only forward-declared in this header
+
+private:
+    Ptr<Impl> impl_; //!< handle to the implementation object
+    int rows_; //!< value returned by rows()
+    int cols_; //!< value returned by cols()
+    int type_; //!< value returned by type()
+};
+
+/** @brief Smart pointer for OpenGL 2D texture memory with reference counting.
+ */
+class CV_EXPORTS Texture2D
+{
+public:
+    /** @brief An Image Format describes the way that the images in Textures store their data.
+    */
+    enum Format
+    {
+        NONE            = 0,
+        DEPTH_COMPONENT = 0x1902, //!< Depth
+        RGB             = 0x1907, //!< Red, Green, Blue
+        RGBA            = 0x1908  //!< Red, Green, Blue, Alpha
+    };
+
+    /** @brief The constructors.
+
+    Creates empty ogl::Texture2D object, allocates memory for ogl::Texture2D object or copies from
+    host/device memory.
+     */
+    Texture2D();
+
+    /** @overload Takes a texture object id ( atexId ) in addition to the dimensions and format. */
+    Texture2D(int arows, int acols, Format aformat, unsigned int atexId, bool autoRelease = false);
+
+    /** @overload Takes a texture object id ( atexId ) in addition to the size and format. */
+    Texture2D(Size asize, Format aformat, unsigned int atexId, bool autoRelease = false);
+
+    /** @overload
+    @param arows Number of rows.
+    @param acols Number of columns.
+    @param aformat Image format. See cv::ogl::Texture2D::Format .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    Texture2D(int arows, int acols, Format aformat, bool autoRelease = false);
+
+    /** @overload
+    @param asize 2D array size.
+    @param aformat Image format. See cv::ogl::Texture2D::Format .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    Texture2D(Size asize, Format aformat, bool autoRelease = false);
+
+    /** @overload
+    @param arr Input array (host or device memory, it can be Mat , cuda::GpuMat or ogl::Buffer ).
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    explicit Texture2D(InputArray arr, bool autoRelease = false);
+
+    /** @brief Allocates memory for ogl::Texture2D object.
+
+    @param arows Number of rows.
+    @param acols Number of columns.
+    @param aformat Image format. See cv::ogl::Texture2D::Format .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+     */
+    void create(int arows, int acols, Format aformat, bool autoRelease = false);
+    /** @overload
+    @param asize 2D array size.
+    @param aformat Image format. See cv::ogl::Texture2D::Format .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    void create(Size asize, Format aformat, bool autoRelease = false);
+
+    /** @brief Decrements the reference counter and destroys the texture object if needed.
+
+    The function will call setAutoRelease(true) .
+     */
+    void release();
+
+    /** @brief Sets auto release mode.
+
+    @param flag Auto release mode (if true, release will be called in object's destructor).
+
+    The lifetime of the OpenGL object is tied to the lifetime of the context. If OpenGL context was
+    bound to a window it could be released at any time (user can close a window). If object's destructor
+    is called after destruction of the context it will cause an error. Thus ogl::Texture2D doesn't
+    destroy OpenGL object in destructor by default (all OpenGL resources will be released with OpenGL
+    context). This function can force ogl::Texture2D destructor to destroy OpenGL object.
+     */
+    void setAutoRelease(bool flag);
+
+    /** @brief Copies from host/device memory to OpenGL texture.
+
+    @param arr Input array (host or device memory, it can be Mat , cuda::GpuMat or ogl::Buffer ).
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+     */
+    void copyFrom(InputArray arr, bool autoRelease = false);
+
+    /** @brief Copies from OpenGL texture to host/device memory or another OpenGL texture object.
+
+    @param arr Destination array (host or device memory, can be Mat , cuda::GpuMat , ogl::Buffer or
+    ogl::Texture2D ).
+    @param ddepth Destination depth.
+    @param autoRelease Auto release mode for destination buffer (if arr is OpenGL buffer or texture).
+     */
+    void copyTo(OutputArray arr, int ddepth = CV_32F, bool autoRelease = false) const;
+
+    /** @brief Binds texture to current active texture unit for GL_TEXTURE_2D target.
+    */
+    void bind() const;
+
+    int rows() const; //!< returns the stored row count
+    int cols() const; //!< returns the stored column count
+    Size size() const; //!< returns Size(cols, rows)
+    bool empty() const; //!< true when the row count or the column count is zero
+
+    Format format() const; //!< returns the stored image format
+
+    //! get OpenGL object id
+    unsigned int texId() const;
+
+    class Impl; //!< implementation type; only forward-declared in this header
+
+private:
+    Ptr<Impl> impl_; //!< handle to the implementation object
+    int rows_; //!< value returned by rows()
+    int cols_; //!< value returned by cols()
+    Format format_; //!< value returned by format()
+};
+
+/** @brief Wrapper for OpenGL Client-Side Vertex arrays.
+
+ogl::Arrays stores vertex data in ogl::Buffer objects.
+ */
+class CV_EXPORTS Arrays
+{
+public:
+    /** @brief Default constructor
+     */
+    Arrays();
+
+    /** @brief Sets an array of vertex coordinates.
+    @param vertex array with vertex coordinates, can be both host and device memory.
+    */
+    void setVertexArray(InputArray vertex);
+
+    /** @brief Resets vertex coordinates.
+    */
+    void resetVertexArray();
+
+    /** @brief Sets an array of vertex colors.
+    @param color array with vertex colors, can be both host and device memory.
+     */
+    void setColorArray(InputArray color);
+
+    /** @brief Resets vertex colors.
+    */
+    void resetColorArray();
+
+    /** @brief Sets an array of vertex normals.
+    @param normal array with vertex normals, can be both host and device memory.
+     */
+    void setNormalArray(InputArray normal);
+
+    /** @brief Resets vertex normals.
+    */
+    void resetNormalArray();
+
+    /** @brief Sets an array of vertex texture coordinates.
+    @param texCoord array with vertex texture coordinates, can be both host and device memory.
+     */
+    void setTexCoordArray(InputArray texCoord);
+
+    /** @brief Resets vertex texture coordinates.
+    */
+    void resetTexCoordArray();
+
+    /** @brief Releases all inner buffers.
+    */
+    void release();
+
+    /** @brief Sets auto release mode for all inner buffers.
+    @param flag Auto release mode.
+     */
+    void setAutoRelease(bool flag);
+
+    /** @brief Binds all vertex arrays.
+    */
+    void bind() const;
+
+    /** @brief Returns the vertex count.
+    */
+    int size() const;
+    bool empty() const; //!< true when size() is zero
+
+private:
+    int size_; //!< value returned by size(); the default constructor sets it to 0
+    Buffer vertex_; //!< presumably filled by setVertexArray() - implementation is not in this header
+    Buffer color_; //!< presumably filled by setColorArray() - implementation is not in this header
+    Buffer normal_; //!< presumably filled by setNormalArray() - implementation is not in this header
+    Buffer texCoord_; //!< presumably filled by setTexCoordArray() - implementation is not in this header
+};
+
+/////////////////// Render Functions ///////////////////
+
+//! Render modes accepted by the `mode` argument of the cv::ogl::render() overloads that take Arrays
+enum RenderModes {
+    POINTS         = 0x0000,
+    LINES          = 0x0001,
+    LINE_LOOP      = 0x0002,
+    LINE_STRIP     = 0x0003,
+    TRIANGLES      = 0x0004,
+    TRIANGLE_STRIP = 0x0005,
+    TRIANGLE_FAN   = 0x0006,
+    QUADS          = 0x0007,
+    QUAD_STRIP     = 0x0008,
+    POLYGON        = 0x0009
+};
+
+/** @brief Render OpenGL texture or primitives.
+@param tex Texture to draw.
+@param wndRect Region of window, where to draw a texture (normalized coordinates).
+@param texRect Region of texture to draw (normalized coordinates).
+ */
+CV_EXPORTS void render(const Texture2D& tex,
+    Rect_<double> wndRect = Rect_<double>(0.0, 0.0, 1.0, 1.0),
+    Rect_<double> texRect = Rect_<double>(0.0, 0.0, 1.0, 1.0));
+
+/** @overload
+@param arr Array of primitive vertices.
+@param mode Render mode. One of cv::ogl::RenderModes
+@param color Color for all vertices. Will be used if arr doesn't contain color array.
+*/
+CV_EXPORTS void render(const Arrays& arr, int mode = POINTS, Scalar color = Scalar::all(255));
+
+/** @overload
+@param arr Array of primitive vertices.
+@param indices Array of vertex indices (host or device memory).
+@param mode Render mode. One of cv::ogl::RenderModes
+@param color Color for all vertices. Will be used if arr doesn't contain color array.
+*/
+CV_EXPORTS void render(const Arrays& arr, InputArray indices, int mode = POINTS, Scalar color = Scalar::all(255));
+
+/////////////////// CL-GL Interoperability Functions ///////////////////
+
+namespace ocl {
+using namespace cv::ocl; // lets cv::ocl names such as Context be used unqualified in this namespace
+
+// TODO static functions in the Context class
+/** @brief Creates OpenCL context from GL.
+@return Returns a reference to the OpenCL Context
+ */
+CV_EXPORTS Context& initializeContextFromGL();
+
+} // namespace cv::ogl::ocl
+
+/** @brief Converts InputArray to Texture2D object.
+@param src     - source InputArray.
+@param texture - destination Texture2D object.
+ */
+CV_EXPORTS void convertToGLTexture2D(InputArray src, Texture2D& texture);
+
+/** @brief Converts Texture2D object to OutputArray.
+@param texture - source Texture2D object.
+@param dst     - destination OutputArray.
+ */
+CV_EXPORTS void convertFromGLTexture2D(const Texture2D& texture, OutputArray dst);
+
+/** @brief Maps Buffer object to process on CL side (convert to UMat).
+
+Function creates a CL buffer from a GL one, and then constructs a UMat that can be used
+to process buffer data with OpenCV functions. Note that in the current implementation a
+UMat constructed this way doesn't own the corresponding GL buffer object, so it is
+the user's responsibility to close down the CL/GL buffer relationship by explicitly
+calling the unmapGLBuffer() function.
+@param buffer      - source Buffer object.
+@param accessFlags - data access flags (ACCESS_READ|ACCESS_WRITE).
+@return Returns UMat object
+ */
+CV_EXPORTS UMat mapGLBuffer(const Buffer& buffer, AccessFlag accessFlags = ACCESS_READ | ACCESS_WRITE);
+
+/** @brief Unmaps Buffer object (releases UMat, previously mapped from Buffer).
+
+Function must be called explicitly by the user for each UMat previously constructed
+by a call to the mapGLBuffer() function.
+@param u           - source UMat, created by mapGLBuffer().
+ */
+CV_EXPORTS void unmapGLBuffer(UMat& u);
+
+//! @}
+}} // namespace cv::ogl
+
+namespace cv { namespace cuda {
+
+/** @brief Sets a CUDA device and initializes it for the current thread with OpenGL interoperability.
+
+This function should be explicitly called after OpenGL context creation and before any CUDA calls.
+@param device System index of a CUDA device starting with 0.
+@ingroup core_opengl
+ */
+CV_EXPORTS void setGlDevice(int device = 0);
+
+}} // namespace cv::cuda
+
+//! @cond IGNORED
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+inline
+cv::ogl::Buffer::Buffer(int arows, int acols, int atype, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0)
+{
+    create(arows, acols, atype, target, autoRelease); // members start at zero; the rest is delegated to create()
+}
+
+inline
+cv::ogl::Buffer::Buffer(Size asize, int atype, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0)
+{
+    create(asize, atype, target, autoRelease); // members start at zero; the rest is delegated to create()
+}
+
+inline
+void cv::ogl::Buffer::create(Size asize, int atype, Target target, bool autoRelease)
+{
+    create(asize.height, asize.width, atype, target, autoRelease); // height is passed as rows, width as columns
+}
+
+inline
+int cv::ogl::Buffer::rows() const
+{
+    return rows_; // plain accessor
+}
+
+inline
+int cv::ogl::Buffer::cols() const
+{
+    return cols_; // plain accessor
+}
+
+inline
+cv::Size cv::ogl::Buffer::size() const
+{
+    return Size(cols_, rows_); // columns first, rows second
+}
+
+inline
+bool cv::ogl::Buffer::empty() const
+{
+    return rows_ == 0 || cols_ == 0; // empty as soon as either dimension is zero
+}
+
+inline
+int cv::ogl::Buffer::type() const
+{
+    return type_; // plain accessor
+}
+
+inline
+int cv::ogl::Buffer::depth() const
+{
+    return CV_MAT_DEPTH(type_); // derived from the stored type via the CV_MAT_DEPTH macro
+}
+
+inline
+int cv::ogl::Buffer::channels() const
+{
+    return CV_MAT_CN(type_); // derived from the stored type via the CV_MAT_CN macro
+}
+
+inline
+int cv::ogl::Buffer::elemSize() const
+{
+    return CV_ELEM_SIZE(type_); // derived from the stored type via the CV_ELEM_SIZE macro
+}
+
+inline
+int cv::ogl::Buffer::elemSize1() const
+{
+    return CV_ELEM_SIZE1(type_); // derived from the stored type via the CV_ELEM_SIZE1 macro
+}
+
+///////
+
+inline
+cv::ogl::Texture2D::Texture2D(int arows, int acols, Format aformat, bool autoRelease) : rows_(0), cols_(0), format_(NONE)
+{
+    create(arows, acols, aformat, autoRelease); // members start at zero/NONE; the rest is delegated to create()
+}
+
+inline
+cv::ogl::Texture2D::Texture2D(Size asize, Format aformat, bool autoRelease) : rows_(0), cols_(0), format_(NONE)
+{
+    create(asize, aformat, autoRelease); // members start at zero/NONE; the rest is delegated to create()
+}
+
+inline
+void cv::ogl::Texture2D::create(Size asize, Format aformat, bool autoRelease)
+{
+    create(asize.height, asize.width, aformat, autoRelease); // height is passed as rows, width as columns
+}
+
+inline
+int cv::ogl::Texture2D::rows() const
+{
+    return rows_; // plain accessor
+}
+
+inline
+int cv::ogl::Texture2D::cols() const
+{
+    return cols_; // plain accessor
+}
+
+inline
+cv::Size cv::ogl::Texture2D::size() const
+{
+    return Size(cols_, rows_); // columns first, rows second
+}
+
+inline
+bool cv::ogl::Texture2D::empty() const
+{
+    return rows_ == 0 || cols_ == 0; // empty as soon as either dimension is zero
+}
+
+inline
+cv::ogl::Texture2D::Format cv::ogl::Texture2D::format() const
+{
+    return format_; // plain accessor
+}
+
+///////
+
+// Silence MSVC warning C4702 (unreachable code) around this constructor; the original note says it shows up when using Ninja
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(push)
+#pragma warning(disable: 4702)
+#endif
+inline
+cv::ogl::Arrays::Arrays() : size_(0)
+{
+}
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(pop)
+#endif
+
+inline
+int cv::ogl::Arrays::size() const
+{
+    return size_; // plain accessor
+}
+
+inline
+bool cv::ogl::Arrays::empty() const
+{
+    return size_ == 0; // empty when size_ is zero
+}
+
+//! @endcond
+
+#endif /* OPENCV_CORE_OPENGL_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/operations.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/operations.hpp
new file mode 100644
index 000000000000..ab7d4297822b
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/operations.hpp
@@ -0,0 +1,612 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_OPERATIONS_HPP
+#define OPENCV_CORE_OPERATIONS_HPP
+
+#ifndef __cplusplus
+#  error operations.hpp header must be compiled as C++
+#endif
+
+#include <cstdio>
+
+#if defined(__GNUC__) || defined(__clang__) // at least GCC 3.1+, clang 3.5+
+#  if defined(__MINGW_PRINTF_FORMAT)  // https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/.
+#    define CV_FORMAT_PRINTF(string_idx, first_to_check) __attribute__ ((format (__MINGW_PRINTF_FORMAT, string_idx, first_to_check)))
+#  else // no __MINGW_PRINTF_FORMAT: use the plain printf archetype
+#    define CV_FORMAT_PRINTF(string_idx, first_to_check) __attribute__ ((format (printf, string_idx, first_to_check)))
+#  endif
+#else // neither __GNUC__ nor __clang__: the macro expands to nothing
+#  define CV_FORMAT_PRINTF(A, B)
+#endif
+
+namespace cv
+{
+//! @cond IGNORED
+
+
+////////////////////////////// Matx methods depending on core API /////////////////////////////
+
+namespace internal
+{
+
+template<typename _Tp, int m, int n> struct Matx_FastInvOp // primary template; square sizes have their own specializations
+{
+    bool operator()(const Matx<_Tp, m, n>& a, Matx<_Tp, n, m>& b, int method) const
+    {
+        return invert(a, b, method) != 0; // delegate to invert(); success means a non-zero return value
+    }
+};
+
+template<typename _Tp, int m> struct Matx_FastInvOp<_Tp, m, m> // square m x m specialization
+{
+    bool operator()(const Matx<_Tp, m, m>& a, Matx<_Tp, m, m>& b, int method) const
+    {
+        if (method == DECOMP_LU || method == DECOMP_CHOLESKY)
+        {
+            Matx<_Tp, m, m> temp = a; // mutable copy of the const input, handed to Cholesky()/LU()
+
+            // b is assumed to be all zeros on input, so setting its diagonal to 1 makes it an identity matrix
+            for (int i = 0; i < m; i++)
+                b(i, i) = (_Tp)1;
+
+            if (method == DECOMP_CHOLESKY)
+                return Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);
+
+            return LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
+        }
+        else
+        {
+            return invert(a, b, method) != 0; // every other method goes through invert()
+        }
+    }
+};
+
+template<typename _Tp> struct Matx_FastInvOp<_Tp, 2, 2> // closed-form 2x2 inverse; the method argument is ignored
+{
+    bool operator()(const Matx<_Tp, 2, 2>& a, Matx<_Tp, 2, 2>& b, int /*method*/) const
+    {
+        _Tp d = (_Tp)determinant(a);
+        if (d == 0) // exact comparison: only a determinant of exactly zero is rejected
+            return false;
+        d = 1/d; // from here on d holds the reciprocal of the determinant
+        b(1,1) = a(0,0)*d; // diagonal entries swap places
+        b(0,0) = a(1,1)*d;
+        b(0,1) = -a(0,1)*d; // off-diagonal entries are negated
+        b(1,0) = -a(1,0)*d;
+        return true;
+    }
+};
+
+template<typename _Tp> struct Matx_FastInvOp<_Tp, 3, 3> // closed-form 3x3 inverse; the method argument is ignored
+{
+    bool operator()(const Matx<_Tp, 3, 3>& a, Matx<_Tp, 3, 3>& b, int /*method*/) const
+    {
+        _Tp d = (_Tp)determinant(a);
+        if (d == 0) // exact comparison: only a determinant of exactly zero is rejected
+            return false;
+        d = 1/d; // from here on d holds the reciprocal of the determinant
+        b(0,0) = (a(1,1) * a(2,2) - a(1,2) * a(2,1)) * d; // every entry: a 2x2 determinant of a's entries times 1/det
+        b(0,1) = (a(0,2) * a(2,1) - a(0,1) * a(2,2)) * d;
+        b(0,2) = (a(0,1) * a(1,2) - a(0,2) * a(1,1)) * d;
+
+        b(1,0) = (a(1,2) * a(2,0) - a(1,0) * a(2,2)) * d;
+        b(1,1) = (a(0,0) * a(2,2) - a(0,2) * a(2,0)) * d;
+        b(1,2) = (a(0,2) * a(1,0) - a(0,0) * a(1,2)) * d;
+
+        b(2,0) = (a(1,0) * a(2,1) - a(1,1) * a(2,0)) * d;
+        b(2,1) = (a(0,1) * a(2,0) - a(0,0) * a(2,1)) * d;
+        b(2,2) = (a(0,0) * a(1,1) - a(0,1) * a(1,0)) * d;
+        return true;
+    }
+};
+
+
+template<typename _Tp, int m, int l, int n> struct Matx_FastSolveOp // primary template; square systems have their own specializations
+{
+    bool operator()(const Matx<_Tp, m, l>& a, const Matx<_Tp, m, n>& b,
+                    Matx<_Tp, l, n>& x, int method) const
+    {
+        return cv::solve(a, b, x, method); // delegate to cv::solve() and pass its result through
+    }
+};
+
+template<typename _Tp, int m, int n> struct Matx_FastSolveOp<_Tp, m, m, n> // square m x m system with n right-hand-side columns
+{
+    bool operator()(const Matx<_Tp, m, m>& a, const Matx<_Tp, m, n>& b,
+                    Matx<_Tp, m, n>& x, int method) const
+    {
+        if (method == DECOMP_LU || method == DECOMP_CHOLESKY)
+        {
+            Matx<_Tp, m, m> temp = a; // mutable copy of the const input, handed to Cholesky()/LU()
+            x = b; // x starts as the right-hand side; presumably overwritten with the solution - confirm against Cholesky()/LU()
+            if( method == DECOMP_CHOLESKY )
+                return Cholesky(temp.val, m*sizeof(_Tp), m, x.val, n*sizeof(_Tp), n);
+
+            return LU(temp.val, m*sizeof(_Tp), m, x.val, n*sizeof(_Tp), n) != 0;
+        }
+        else
+        {
+            return cv::solve(a, b, x, method); // every other method goes through cv::solve()
+        }
+    }
+};
+
+template<typename _Tp> struct Matx_FastSolveOp<_Tp, 2, 2, 1> // closed-form 2x2 solve (Cramer's rule); the method argument is ignored
+{
+    bool operator()(const Matx<_Tp, 2, 2>& a, const Matx<_Tp, 2, 1>& b,
+                    Matx<_Tp, 2, 1>& x, int) const
+    {
+        _Tp d = (_Tp)determinant(a);
+        if (d == 0) // exact comparison: only a determinant of exactly zero is rejected
+            return false;
+        d = 1/d; // from here on d holds the reciprocal of the determinant
+        x(0) = (b(0)*a(1,1) - b(1)*a(0,1))*d; // determinant of a with column 0 replaced by b, over det(a)
+        x(1) = (b(1)*a(0,0) - b(0)*a(1,0))*d; // determinant of a with column 1 replaced by b, over det(a)
+        return true;
+    }
+};
+
+template<typename _Tp> struct Matx_FastSolveOp<_Tp, 3, 3, 1> // closed-form 3x3 solve (Cramer's rule); the method argument is ignored
+{
+    bool operator()(const Matx<_Tp, 3, 3>& a, const Matx<_Tp, 3, 1>& b,
+                    Matx<_Tp, 3, 1>& x, int) const
+    {
+        _Tp d = (_Tp)determinant(a);
+        if (d == 0) // exact comparison: only a determinant of exactly zero is rejected
+            return false;
+        d = 1/d; // from here on d holds the reciprocal of the determinant
+        x(0) = d*(b(0)*(a(1,1)*a(2,2) - a(1,2)*a(2,1)) - // determinant of a with column 0 replaced by b
+                a(0,1)*(b(1)*a(2,2) - a(1,2)*b(2)) +
+                a(0,2)*(b(1)*a(2,1) - a(1,1)*b(2)));
+
+        x(1) = d*(a(0,0)*(b(1)*a(2,2) - a(1,2)*b(2)) - // determinant of a with column 1 replaced by b
+                b(0)*(a(1,0)*a(2,2) - a(1,2)*a(2,0)) +
+                a(0,2)*(a(1,0)*b(2) - b(1)*a(2,0)));
+
+        x(2) = d*(a(0,0)*(a(1,1)*b(2) - b(1)*a(2,1)) - // determinant of a with column 2 replaced by b
+                a(0,1)*(a(1,0)*b(2) - b(1)*a(2,0)) +
+                b(0)*(a(1,0)*a(2,1) - a(1,1)*a(2,0)));
+        return true;
+    }
+};
+
+} // internal
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n> Matx<_Tp,m,n>::randu(_Tp a, _Tp b)
+{
+    Matx<_Tp,m,n> M;
+    cv::randu(M, Scalar(a), Scalar(b));
+    return M;
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n> Matx<_Tp,m,n>::randn(_Tp a, _Tp b)
+{
+    Matx<_Tp,m,n> M;
+    cv::randn(M, Scalar(a), Scalar(b));
+    return M;
+}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn> Vec<_Tp, cn>::randu(_Tp a, _Tp b)
+{
+    Vec<_Tp,cn> V;
+    cv::randu(V, Scalar(a), Scalar(b));
+    return V;
+}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn> Vec<_Tp, cn>::randn(_Tp a, _Tp b)
+{
+    Vec<_Tp,cn> V;
+    cv::randn(V, Scalar(a), Scalar(b));
+    return V;
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, n, m> Matx<_Tp, m, n>::inv(int method, bool *p_is_ok /*= NULL*/) const // inverts *this through Matx_FastInvOp; result is n x m
+{
+    Matx<_Tp, n, m> b;
+    bool ok = cv::internal::Matx_FastInvOp<_Tp, m, n>()(*this, b, method);
+    if (p_is_ok) *p_is_ok = ok; // the success flag is reported only when the caller supplied a pointer
+    return ok ? b : Matx<_Tp, n, m>::zeros(); // on failure the caller receives Matx::zeros(), never the contents of b
+}
+
+template<typename _Tp, int m, int n> template<int l> inline
+Matx<_Tp, n, l> Matx<_Tp, m, n>::solve(const Matx<_Tp, m, l>& rhs, int method) const // solves with *this as the matrix and rhs as right-hand side
+{
+    Matx<_Tp, n, l> x;
+    bool ok = cv::internal::Matx_FastSolveOp<_Tp, m, n, l>()(*this, rhs, x, method); // shape-specific functor chosen at compile time
+    return ok ? x : Matx<_Tp, n, l>::zeros(); // failure is not reported separately: the caller receives Matx::zeros()
+}
+
+
+
+////////////////////////// Augmenting algebraic & logical operations //////////////////////////
+
+#define CV_MAT_AUG_OPERATOR1(op, cvop, A, B) \
+    static inline A& operator op (A& a, const B& b) { cvop; return a; }
+
+#define CV_MAT_AUG_OPERATOR(op, cvop, A, B)   \
+    CV_MAT_AUG_OPERATOR1(op, cvop, A, B)      \
+    CV_MAT_AUG_OPERATOR1(op, cvop, const A, B)
+
+#define CV_MAT_AUG_OPERATOR_T(op, cvop, A, B)                   \
+    template<typename _Tp> CV_MAT_AUG_OPERATOR1(op, cvop, A, B) \
+    template<typename _Tp> CV_MAT_AUG_OPERATOR1(op, cvop, const A, B)
+
+#define CV_MAT_AUG_OPERATOR_TN(op, cvop, A)                                \
+    template<typename _Tp, int m, int n> static inline A& operator op (A& a, const Matx<_Tp,m,n>& b) { cvop; return a; } \
+    template<typename _Tp, int m, int n> static inline const A& operator op (const A& a, const Matx<_Tp,m,n>& b) { cvop; return a; }
+
+CV_MAT_AUG_OPERATOR  (+=, cv::add(a, b, (const Mat&)a), Mat, Mat)
+CV_MAT_AUG_OPERATOR  (+=, cv::add(a, b, (const Mat&)a), Mat, Scalar)
+CV_MAT_AUG_OPERATOR_T(+=, cv::add(a, b, (const Mat&)a), Mat_<_Tp>, Mat)
+CV_MAT_AUG_OPERATOR_T(+=, cv::add(a, b, (const Mat&)a), Mat_<_Tp>, Scalar)
+CV_MAT_AUG_OPERATOR_T(+=, cv::add(a, b, (const Mat&)a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(+=, cv::add(a, Mat(b), (const Mat&)a), Mat)
+CV_MAT_AUG_OPERATOR_TN(+=, cv::add(a, Mat(b), (const Mat&)a), Mat_<_Tp>)
+
+CV_MAT_AUG_OPERATOR  (-=, cv::subtract(a, b, (const Mat&)a), Mat, Mat)
+CV_MAT_AUG_OPERATOR  (-=, cv::subtract(a, b, (const Mat&)a), Mat, Scalar)
+CV_MAT_AUG_OPERATOR_T(-=, cv::subtract(a, b, (const Mat&)a), Mat_<_Tp>, Mat)
+CV_MAT_AUG_OPERATOR_T(-=, cv::subtract(a, b, (const Mat&)a), Mat_<_Tp>, Scalar)
+CV_MAT_AUG_OPERATOR_T(-=, cv::subtract(a, b, (const Mat&)a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(-=, cv::subtract(a, Mat(b), (const Mat&)a), Mat)
+CV_MAT_AUG_OPERATOR_TN(-=, cv::subtract(a, Mat(b), (const Mat&)a), Mat_<_Tp>)
+
+CV_MAT_AUG_OPERATOR  (*=, cv::gemm(a, b, 1, Mat(), 0, a, 0), Mat, Mat)
+CV_MAT_AUG_OPERATOR_T(*=, cv::gemm(a, b, 1, Mat(), 0, a, 0), Mat_<_Tp>, Mat)
+CV_MAT_AUG_OPERATOR_T(*=, cv::gemm(a, b, 1, Mat(), 0, a, 0), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR  (*=, a.convertTo(a, -1, b), Mat, double)
+CV_MAT_AUG_OPERATOR_T(*=, a.convertTo(a, -1, b), Mat_<_Tp>, double)
+CV_MAT_AUG_OPERATOR_TN(*=, cv::gemm(a, Mat(b), 1, Mat(), 0, a, 0), Mat)
+CV_MAT_AUG_OPERATOR_TN(*=, cv::gemm(a, Mat(b), 1, Mat(), 0, a, 0), Mat_<_Tp>)
+
+CV_MAT_AUG_OPERATOR  (/=, cv::divide(a, b, (const Mat&)a), Mat, Mat)
+CV_MAT_AUG_OPERATOR_T(/=, cv::divide(a, b, (const Mat&)a), Mat_<_Tp>, Mat)
+CV_MAT_AUG_OPERATOR_T(/=, cv::divide(a, b, (const Mat&)a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR  (/=, a.convertTo((Mat&)a, -1, 1./b), Mat, double)
+CV_MAT_AUG_OPERATOR_T(/=, a.convertTo((Mat&)a, -1, 1./b), Mat_<_Tp>, double)
+CV_MAT_AUG_OPERATOR_TN(/=, cv::divide(a, Mat(b), (const Mat&)a), Mat)
+CV_MAT_AUG_OPERATOR_TN(/=, cv::divide(a, Mat(b), (const Mat&)a), Mat_<_Tp>)
+
+CV_MAT_AUG_OPERATOR  (&=, cv::bitwise_and(a, b, (const Mat&)a), Mat, Mat)
+CV_MAT_AUG_OPERATOR  (&=, cv::bitwise_and(a, b, (const Mat&)a), Mat, Scalar)
+CV_MAT_AUG_OPERATOR_T(&=, cv::bitwise_and(a, b, (const Mat&)a), Mat_<_Tp>, Mat)
+CV_MAT_AUG_OPERATOR_T(&=, cv::bitwise_and(a, b, (const Mat&)a), Mat_<_Tp>, Scalar)
+CV_MAT_AUG_OPERATOR_T(&=, cv::bitwise_and(a, b, (const Mat&)a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(&=, cv::bitwise_and(a, Mat(b), (const Mat&)a), Mat)
+CV_MAT_AUG_OPERATOR_TN(&=, cv::bitwise_and(a, Mat(b), (const Mat&)a), Mat_<_Tp>)
+
+CV_MAT_AUG_OPERATOR  (|=, cv::bitwise_or(a, b, (const Mat&)a), Mat, Mat)
+CV_MAT_AUG_OPERATOR  (|=, cv::bitwise_or(a, b, (const Mat&)a), Mat, Scalar)
+CV_MAT_AUG_OPERATOR_T(|=, cv::bitwise_or(a, b, (const Mat&)a), Mat_<_Tp>, Mat)
+CV_MAT_AUG_OPERATOR_T(|=, cv::bitwise_or(a, b, (const Mat&)a), Mat_<_Tp>, Scalar)
+CV_MAT_AUG_OPERATOR_T(|=, cv::bitwise_or(a, b, (const Mat&)a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(|=, cv::bitwise_or(a, Mat(b), (const Mat&)a), Mat)
+CV_MAT_AUG_OPERATOR_TN(|=, cv::bitwise_or(a, Mat(b), (const Mat&)a), Mat_<_Tp>)
+
+CV_MAT_AUG_OPERATOR  (^=, cv::bitwise_xor(a, b, (const Mat&)a), Mat, Mat)
+CV_MAT_AUG_OPERATOR  (^=, cv::bitwise_xor(a, b, (const Mat&)a), Mat, Scalar)
+CV_MAT_AUG_OPERATOR_T(^=, cv::bitwise_xor(a, b, (const Mat&)a), Mat_<_Tp>, Mat)
+CV_MAT_AUG_OPERATOR_T(^=, cv::bitwise_xor(a, b, (const Mat&)a), Mat_<_Tp>, Scalar)
+CV_MAT_AUG_OPERATOR_T(^=, cv::bitwise_xor(a, b, (const Mat&)a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(^=, cv::bitwise_xor(a, Mat(b), (const Mat&)a), Mat)
+CV_MAT_AUG_OPERATOR_TN(^=, cv::bitwise_xor(a, Mat(b), (const Mat&)a), Mat_<_Tp>)
+
+#undef CV_MAT_AUG_OPERATOR_TN
+#undef CV_MAT_AUG_OPERATOR_T
+#undef CV_MAT_AUG_OPERATOR
+#undef CV_MAT_AUG_OPERATOR1
+
+
+
+///////////////////////////////////////////// SVD /////////////////////////////////////////////
+
+inline SVD::SVD() {}
+inline SVD::SVD( InputArray m, int flags ) { operator ()(m, flags); }
+inline void SVD::solveZ( InputArray m, OutputArray _dst ) // writes the last row of the SVD's vt factor of m into _dst
+{
+    Mat mtx = m.getMat();
+    SVD svd(mtx, (mtx.rows >= mtx.cols ? 0 : SVD::FULL_UV)); // FULL_UV is requested only when there are fewer rows than columns
+    _dst.create(svd.vt.cols, 1, svd.vt.type()); // output: one column with vt.cols entries, same type as vt
+    Mat dst = _dst.getMat();
+    svd.vt.row(svd.vt.rows-1).reshape(1,svd.vt.cols).copyTo(dst); // last row of vt, reshaped to vt.cols rows, is copied out
+}
+
+template<typename _Tp, int m, int n, int nm> inline void // fixed-size (Matx) front end for SVD::compute with w, u and vt outputs
+    SVD::compute( const Matx<_Tp, m, n>& a, Matx<_Tp, nm, 1>& w, Matx<_Tp, m, nm>& u, Matx<_Tp, n, nm>& vt )
+{
+    CV_StaticAssert( nm == MIN(m, n), "Invalid size of output vector."); // nm must equal min(m, n); rejected at compile time otherwise
+    Mat _a(a, false), _u(u, false), _w(w, false), _vt(vt, false); // Mat wrappers; presumably 'false' means no data copy - the assert below relies on that
+    SVD::compute(_a, _w, _u, _vt);
+    CV_Assert(_w.data == (uchar*)&w.val[0] && _u.data == (uchar*)&u.val[0] && _vt.data == (uchar*)&vt.val[0]); // outputs must still point at the Matx storage
+}
+
+template<typename _Tp, int m, int n, int nm> inline void // fixed-size (Matx) front end for SVD::compute with only the w output
+SVD::compute( const Matx<_Tp, m, n>& a, Matx<_Tp, nm, 1>& w )
+{
+    CV_StaticAssert( nm == MIN(m, n), "Invalid size of output vector."); // nm must equal min(m, n); rejected at compile time otherwise
+    Mat _a(a, false), _w(w, false); // Mat wrappers; presumably 'false' means no data copy - the assert below relies on that
+    SVD::compute(_a, _w);
+    CV_Assert(_w.data == (uchar*)&w.val[0]); // the output must still point at the Matx storage
+}
+
+template<typename _Tp, int m, int n, int nm, int nb> inline void // fixed-size (Matx) front end for SVD::backSubst
+SVD::backSubst( const Matx<_Tp, nm, 1>& w, const Matx<_Tp, m, nm>& u,
+                const Matx<_Tp, n, nm>& vt, const Matx<_Tp, m, nb>& rhs,
+                Matx<_Tp, n, nb>& dst )
+{
+    CV_StaticAssert( nm == MIN(m, n), "Invalid size of output vector."); // nm must equal min(m, n); rejected at compile time otherwise
+    Mat _u(u, false), _w(w, false), _vt(vt, false), _rhs(rhs, false), _dst(dst, false); // Mat wrappers; presumably 'false' means no data copy
+    SVD::backSubst(_w, _u, _vt, _rhs, _dst);
+    CV_Assert(_dst.data == (uchar*)&dst.val[0]); // the result must still point at the caller's Matx storage
+}
+
+
+
+/////////////////////////////////// Multiply-with-Carry RNG ///////////////////////////////////
+
+inline RNG::RNG()              { state = 0xffffffff; } // default state
+inline RNG::RNG(uint64 _state) { state = _state ? _state : 0xffffffff; } // a zero seed falls back to the default state
+
+inline RNG::operator uchar()    { return (uchar)next(); } // the integer conversions below each take one draw and cast it
+inline RNG::operator schar()    { return (schar)next(); }
+inline RNG::operator ushort()   { return (ushort)next(); }
+inline RNG::operator short()    { return (short)next(); }
+inline RNG::operator int()      { return (int)next(); }
+inline RNG::operator unsigned() { return next(); }
+inline RNG::operator float()    { return next()*2.3283064365386962890625e-10f; } // the constant is 2^-32
+inline RNG::operator double()   { unsigned t = next(); return (((uint64)t << 32) | next()) * 5.4210108624275221700372640043497e-20; } // two draws form 64 bits; the constant is 2^-64
+
+inline unsigned RNG::operator ()(unsigned N) { return (unsigned)uniform(0,N); } // forwards to uniform(int, int), so N is converted to int
+inline unsigned RNG::operator ()()           { return next(); }
+
+inline int    RNG::uniform(int a, int b)       { return a == b ? a : (int)(next() % (b - a) + a); } // a == b is special-cased so the modulus is never zero
+inline float  RNG::uniform(float a, float b)   { return ((float)*this)*(b - a) + a; } // scales the float conversion above into the a..b span
+inline double RNG::uniform(double a, double b) { return ((double)*this)*(b - a) + a; } // scales the double conversion above into the a..b span
+
+inline bool RNG::operator ==(const RNG& other) const { return state == other.state; } // equality compares the 64-bit state only
+
+inline unsigned RNG::next()
+{
+    state = (uint64)(unsigned)state* /*CV_RNG_COEFF*/ 4164903690U + (unsigned)(state >> 32); // low 32 bits times the coefficient, plus the high 32 bits as carry
+    return (unsigned)state; // the draw is the low 32 bits of the updated state
+}
+
+//! returns the next uniformly-distributed random number of the specified type
+template<typename _Tp> static inline _Tp randu()
+{
+  return (_Tp)theRNG();
+}
+
+
+///////////////////////////////// Formatted output of cv::Mat /////////////////////////////////
+
+static inline
+Ptr<Formatted> format(InputArray mtx, Formatter::FormatType fmt)
+{
+    return Formatter::get(fmt)->format(mtx.getMat());
+}
+
+static inline
+int print(Ptr<Formatted> fmtd, FILE* stream = stdout) // writes every chunk produced by fmtd to stream
+{
+    int written = 0;
+    fmtd->reset(); // restart the formatter before iterating its chunks
+    for(const char* str = fmtd->next(); str; str = fmtd->next()) // next() returning null ends the sequence
+        written += fputs(str, stream); // NOTE(review): accumulates fputs' return values; the C standard only promises non-negative on success, so this is not a character count
+
+    return written;
+}
+
+static inline
+int print(const Mat& mtx, FILE* stream = stdout)
+{
+    return print(Formatter::get()->format(mtx), stream);
+}
+
+static inline
+int print(const UMat& mtx, FILE* stream = stdout)
+{
+    return print(Formatter::get()->format(mtx.getMat(ACCESS_READ)), stream);
+}
+
+template<typename _Tp> static inline
+int print(const std::vector<Point_<_Tp> >& vec, FILE* stream = stdout)
+{
+    return print(Formatter::get()->format(Mat(vec)), stream);
+}
+
+template<typename _Tp> static inline
+int print(const std::vector<Point3_<_Tp> >& vec, FILE* stream = stdout)
+{
+    return print(Formatter::get()->format(Mat(vec)), stream);
+}
+
+template<typename _Tp, int m, int n> static inline
+int print(const Matx<_Tp, m, n>& matx, FILE* stream = stdout)
+{
+    return print(Formatter::get()->format(cv::Mat(matx)), stream);
+}
+
+//! @endcond
+
+///////////////////////////////// Formatted string generation /////////////////////////////////
+
+/** @brief Returns a text string formatted using the printf-like expression.
+
+The function acts like sprintf but forms and returns an STL string. It can be used to form an error
+message in the Exception constructor.
+@param fmt printf-compatible formatting specifiers.
+
+**Note**:
+|Type|Specifier|
+|-|-|
+|`const char*`|`%s`|
+|`char`|`%c`|
+|`float` / `double`|`%f`,`%g`|
+|`int`, `long`, `long long`|`%d`, `%ld`, `%lld`|
+|`unsigned`, `unsigned long`, `unsigned long long`|`%u`, `%lu`, `%llu`|
+|`uint64` -> `uintmax_t`, `int64` -> `intmax_t`|`%ju`, `%jd`|
+|`size_t`|`%zu`|
+@ingroup core_utils
+ */
+CV_EXPORTS String format(const char* fmt, ...) CV_FORMAT_PRINTF(1, 2);
+
+/****************************************************************************************\
+*                                  Auxiliary algorithms                                  *
+\****************************************************************************************/
+
+/** @brief Splits an element set into equivalency classes.
+
+The generic function partition implements an \f$O(N^2)\f$ algorithm for splitting a set of \f$N\f$ elements
+into one or more equivalency classes, as described in
+<http://en.wikipedia.org/wiki/Disjoint-set_data_structure> . The function returns the number of
+equivalency classes.
+@param _vec Set of elements stored as a vector.
+@param labels Output vector of labels. It contains as many elements as vec. Each label labels[i] is
+a 0-based cluster index of `vec[i]`.
+@param predicate Equivalence predicate (pointer to a boolean function of two arguments or an
+instance of the class that has the method bool operator()(const _Tp& a, const _Tp& b) ). The
+predicate returns true when the elements are certainly in the same class, and returns false if they
+may or may not be in the same class.
+@ingroup core_cluster
+*/
+template<typename _Tp, class _EqPredicate> int
+partition( const std::vector<_Tp>& _vec, std::vector<int>& labels,
+          _EqPredicate predicate=_EqPredicate())
+{
+    int i, j, N = (int)_vec.size();
+    const _Tp* vec = _vec.empty() ? 0 : &_vec[0]; // empty input: never index element 0 (undefined behaviour)
+
+    const int PARENT=0;
+    const int RANK=1;
+
+    std::vector<int> _nodes(N*2);
+    int (*nodes)[2] = (int(*)[2])(_nodes.empty() ? 0 : &_nodes[0]); // view the flat buffer as N {parent, rank} pairs; null when N == 0
+
+    // The first O(N) pass: create N single-vertex trees
+    for(i = 0; i < N; i++)
+    {
+        nodes[i][PARENT]=-1;
+        nodes[i][RANK] = 0;
+    }
+
+    // The main O(N^2) pass: merge connected components
+    for( i = 0; i < N; i++ )
+    {
+        int root = i;
+
+        // find root
+        while( nodes[root][PARENT] >= 0 )
+            root = nodes[root][PARENT];
+
+        for( j = 0; j < N; j++ )
+        {
+            if( i == j || !predicate(vec[i], vec[j]))
+                continue;
+            int root2 = j;
+
+            while( nodes[root2][PARENT] >= 0 )
+                root2 = nodes[root2][PARENT];
+
+            if( root2 != root )
+            {
+                // unite both trees
+                int rank = nodes[root][RANK], rank2 = nodes[root2][RANK];
+                if( rank > rank2 )
+                    nodes[root2][PARENT] = root;
+                else
+                {
+                    nodes[root][PARENT] = root2;
+                    nodes[root2][RANK] += rank == rank2; // equal ranks: the surviving root's rank grows by one
+                    root = root2;
+                }
+                CV_Assert( nodes[root][PARENT] < 0 );
+
+                int k = j, parent;
+
+                // compress the path from element j to the merged root
+                while( (parent = nodes[k][PARENT]) >= 0 )
+                {
+                    nodes[k][PARENT] = root;
+                    k = parent;
+                }
+
+                // compress the path from element i to the merged root
+                k = i;
+                while( (parent = nodes[k][PARENT]) >= 0 )
+                {
+                    nodes[k][PARENT] = root;
+                    k = parent;
+                }
+            }
+        }
+    }
+
+    // Final O(N) pass: enumerate classes
+    labels.resize(N);
+    int nclasses = 0;
+
+    for( i = 0; i < N; i++ )
+    {
+        int root = i;
+        while( nodes[root][PARENT] >= 0 )
+            root = nodes[root][PARENT];
+        // re-use the rank as the class label (stored bitwise-negated, so a negative rank means "already labelled")
+        if( nodes[root][RANK] >= 0 )
+            nodes[root][RANK] = ~nclasses++;
+        labels[i] = ~nodes[root][RANK];
+    }
+
+    return nclasses;
+}
+
+} // cv
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/optim.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/optim.hpp
new file mode 100644
index 000000000000..59fe978c2600
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/optim.hpp
@@ -0,0 +1,307 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_OPTIM_HPP
+#define OPENCV_OPTIM_HPP
+
+#include "opencv2/core.hpp"
+
+namespace cv
+{
+
+/** @addtogroup core_optim
+The algorithms in this section minimize or maximize function value within specified constraints or
+without any constraints.
+@{
+*/
+
+/** @brief Basic interface for all solvers
+ */
+class CV_EXPORTS MinProblemSolver : public Algorithm
+{
+public:
+    /** @brief Represents function being optimized
+     */
+    class CV_EXPORTS Function
+    {
+    public:
+        virtual ~Function() {}
+        virtual int getDims() const = 0;
+        virtual double getGradientEps() const;
+        virtual double calc(const double* x) const = 0;
+        virtual void getGradient(const double* x,double* grad);
+    };
+
+    /** @brief Getter for the optimized function.
+
+    The optimized function is represented by the Function interface, which requires derived classes to
+    implement the calc(const double*) and getDims() methods to evaluate the function.
+
+    @return Smart-pointer to an object that implements Function interface - it represents the
+    function that is being optimized. It can be empty, if no function was given so far.
+     */
+    virtual Ptr<Function> getFunction() const = 0;
+
+    /** @brief Setter for the optimized function.
+
+    *It should be called at least once before the call to* minimize(), as default value is not usable.
+
+    @param f The new function to optimize.
+     */
+    virtual void setFunction(const Ptr<Function>& f) = 0;
+
+    /** @brief Getter for the previously set terminal criteria for this algorithm.
+
+    @return Deep copy of the terminal criteria used at the moment.
+     */
+    virtual TermCriteria getTermCriteria() const = 0;
+
+    /** @brief Set terminal criteria for solver.
+
+    This method *is not necessary* to be called before the first call to minimize(), as the default
+    value is sensible.
+
+    Algorithm stops when the number of function evaluations done exceeds termcrit.maxCount, when
+    the function values at the vertices of simplex are within termcrit.epsilon range or simplex
+    becomes so small that it can be enclosed in a box with termcrit.epsilon sides, whichever comes
+    first.
+    @param termcrit Terminal criteria to be used, represented as cv::TermCriteria structure.
+     */
+    virtual void setTermCriteria(const TermCriteria& termcrit) = 0;
+
+    /** @brief actually runs the algorithm and performs the minimization.
+
+    The sole input parameter determines the centroid of the starting simplex (roughly, it tells
+    where to start), all the others (terminal criteria, initial step, function to be minimized) are
+    supposed to be set via the setters before the call to this method or the default values (not
+    always sensible) will be used.
+
+    @param x The initial point, that will become a centroid of an initial simplex. After the algorithm
+    will terminate, it will be set to the point where the algorithm stops, the point of possible
+    minimum.
+    @return The value of a function at the point found.
+     */
+    virtual double minimize(InputOutputArray x) = 0;
+};
+
+/** @brief This class is used to perform the non-linear non-constrained minimization of a function,
+
+defined on an `n`-dimensional Euclidean space, using the **Nelder-Mead method**, also known as
+**downhill simplex method**. The basic idea about the method can be obtained from
+<http://en.wikipedia.org/wiki/Nelder-Mead_method>.
+
+It should be noted that this method, although deterministic, is rather a heuristic and therefore
+may converge to a local minimum, not necessarily a global one. It is an iterative optimization technique,
+which at each step uses information about the values of a function evaluated only at `n+1`
+points, arranged as a *simplex* in `n`-dimensional space (hence the second name of the method). At
+each step a new point is chosen to evaluate the function at, the obtained value is compared with previous
+ones and, based on this information, the simplex changes its shape, slowly moving to the local minimum.
+Thus this method uses *only* function values to make decisions, in contrast to, say, the Nonlinear
+Conjugate Gradient method (which is also implemented in optim).
+
+Algorithm stops when the number of function evaluations done exceeds termcrit.maxCount, when the
+function values at the vertices of simplex are within termcrit.epsilon range or simplex becomes so
+small that it can be enclosed in a box with termcrit.epsilon sides, whichever comes first, for some
+user-defined positive integer termcrit.maxCount and positive floating-point termcrit.epsilon.
+
+@note DownhillSolver is a derivative of the abstract interface
+cv::MinProblemSolver, which in turn is derived from the Algorithm interface and is used to
+encapsulate the functionality, common to all non-linear optimization algorithms in the optim
+module.
+
+@note term criteria should meet following condition:
+@code
+    termcrit.type == (TermCriteria::MAX_ITER + TermCriteria::EPS) && termcrit.epsilon > 0 && termcrit.maxCount > 0
+@endcode
+ */
+class CV_EXPORTS DownhillSolver : public MinProblemSolver
+{
+public:
+    /** @brief Returns the initial step that will be used in downhill simplex algorithm.
+
+    @param step Initial step that will be used in algorithm. Note, that although corresponding setter
+    accepts column-vectors as well as row-vectors, this method will return a row-vector.
+    @see DownhillSolver::setInitStep
+     */
+    virtual void getInitStep(OutputArray step) const=0;
+
+    /** @brief Sets the initial step that will be used in downhill simplex algorithm.
+
+    Step, together with initial point (given in DownhillSolver::minimize) are two `n`-dimensional
+    vectors that are used to determine the shape of initial simplex. Roughly said, initial point
+    determines the position of a simplex (it will become simplex's centroid), while step determines the
+    spread (size in each dimension) of a simplex. To be more precise, if \f$s,x_0\in\mathbb{R}^n\f$ are
+    the initial step and initial point respectively, the vertices of a simplex will be:
+    \f$v_0:=x_0-\frac{1}{2} s\f$ and \f$v_i:=x_0+s_i\f$ for \f$i=1,2,\dots,n\f$ where \f$s_i\f$ denotes
+    the projection of the initial step onto the *i*-th coordinate (the result of the projection is treated as the
+    vector given by \f$s_i:=e_i\cdot\left<e_i\cdot s\right>\f$, where \f$e_i\f$ form the canonical basis)
+
+    @param step Initial step that will be used in algorithm. Roughly said, it determines the spread
+    (size in each dimension) of an initial simplex.
+     */
+    virtual void setInitStep(InputArray step)=0;
+
+    /** @brief This function returns the reference to the ready-to-use DownhillSolver object.
+
+    All the parameters are optional, so this procedure can be called even without parameters at
+    all. In this case, the default values will be used. As the default values for terminal criteria are
+    the only sensible ones, MinProblemSolver::setFunction() and DownhillSolver::setInitStep()
+    should be called upon the obtained object, if the respective parameters were not given to
+    create(). Otherwise, the two ways (give parameters to create() or leave them out
+    and call MinProblemSolver::setFunction() and DownhillSolver::setInitStep()) are absolutely
+    equivalent (and will raise the same errors in the same way, should invalid input be detected).
+    @param f Pointer to the function that will be minimized, similarly to the one you submit via
+    MinProblemSolver::setFunction.
+    @param initStep Initial step, that will be used to construct the initial simplex, similarly to the one
+    you submit via DownhillSolver::setInitStep.
+    @param termcrit Terminal criteria to the algorithm, similarly to the one you submit via
+    MinProblemSolver::setTermCriteria.
+     */
+    static Ptr<DownhillSolver> create(const Ptr<MinProblemSolver::Function>& f=Ptr<MinProblemSolver::Function>(),
+                                      InputArray initStep=Mat_<double>(1,1,0.0),
+                                      TermCriteria termcrit=TermCriteria(TermCriteria::MAX_ITER+TermCriteria::EPS,5000,0.000001));
+};
+
+/** @brief This class is used to perform the non-linear non-constrained minimization of a function
+with known gradient,
+
+defined on an *n*-dimensional Euclidean space, using the **Nonlinear Conjugate Gradient method**.
+The implementation was done based on the beautifully clear explanatory article [An Introduction to
+the Conjugate Gradient Method Without the Agonizing
+Pain](http://www.cs.cmu.edu/~quake-papers/painless-conjugate-gradient.pdf) by Jonathan Richard
+Shewchuk. The method can be seen as an adaptation of a standard Conjugate Gradient method (see, for
+example <http://en.wikipedia.org/wiki/Conjugate_gradient_method>) for numerically solving the
+systems of linear equations.
+
+It should be noted that this method, although deterministic, is rather a heuristic method and
+therefore may converge to a local minimum, not necessarily a global one. What is even more disastrous,
+most of its behaviour is ruled by the gradient, therefore it essentially cannot distinguish between
+local minima and maxima. Therefore, if it starts sufficiently near to a local maximum, it may
+converge to it. Another obvious restriction is that it should be possible to compute the gradient of
+the function at any point, thus it is preferable to have an analytic expression for the gradient, and
+the computational burden should be borne by the user.
+
+The latter responsibility is accomplished via the getGradient method of the
+MinProblemSolver::Function interface (which represents the function being optimized). This method takes
+a point in *n*-dimensional space (the first argument represents the array of coordinates of that
+point) and computes the gradient there (it should be stored in the second argument as an array).
+
+@note class ConjGradSolver thus does not add any new methods to the basic MinProblemSolver interface.
+
+@note term criteria should meet following condition:
+@code
+    termcrit.type == (TermCriteria::MAX_ITER + TermCriteria::EPS) && termcrit.epsilon > 0 && termcrit.maxCount > 0
+    // or
+    termcrit.type == TermCriteria::MAX_ITER && termcrit.maxCount > 0
+@endcode
+ */
+class CV_EXPORTS ConjGradSolver : public MinProblemSolver
+{
+public:
+    /** @brief This function returns the reference to the ready-to-use ConjGradSolver object.
+
+    All the parameters are optional, so this procedure can be called even without parameters at
+    all. In this case, the default values will be used. As default value for terminal criteria are
+    the only sensible ones, MinProblemSolver::setFunction() should be called upon the obtained
+    object, if the function was not given to create(). Otherwise, the two ways (submit it to
+    create() or miss it out and call the MinProblemSolver::setFunction()) are absolutely equivalent
+    (and will drop the same errors in the same way, should invalid input be detected).
+    @param f Pointer to the function that will be minimized, similarly to the one you submit via
+    MinProblemSolver::setFunction.
+    @param termcrit Terminal criteria to the algorithm, similarly to the one you submit via
+    MinProblemSolver::setTermCriteria.
+    */
+    static Ptr<ConjGradSolver> create(const Ptr<MinProblemSolver::Function>& f=Ptr<ConjGradSolver::Function>(),
+                                      TermCriteria termcrit=TermCriteria(TermCriteria::MAX_ITER+TermCriteria::EPS,5000,0.000001));
+};
+
+//! return codes for cv::solveLP() function
+enum SolveLPResult
+{
+    SOLVELP_LOST   = -3, //!< problem is feasible, but solver lost solution due to floating-point arithmetic errors
+    SOLVELP_UNBOUNDED    = -2, //!< problem is unbounded (target function can achieve arbitrary high values)
+    SOLVELP_UNFEASIBLE    = -1, //!< problem is unfeasible (there are no points that satisfy all the constraints imposed)
+    SOLVELP_SINGLE    = 0, //!< there is only one maximum for target function
+    SOLVELP_MULTI    = 1 //!< there are multiple maxima for target function - the arbitrary one is returned
+};
+
+/** @brief Solve given (non-integer) linear programming problem using the Simplex Algorithm (Simplex Method).
+
+What we mean here by "linear programming problem" (or LP problem, for short) can be formulated as:
+
+\f[\mbox{Maximize } c\cdot x\\
+ \mbox{Subject to:}\\
+ Ax\leq b\\
+ x\geq 0\f]
+
+Where \f$c\f$ is fixed `1`-by-`n` row-vector, \f$A\f$ is fixed `m`-by-`n` matrix, \f$b\f$ is fixed `m`-by-`1`
+column vector and \f$x\f$ is an arbitrary `n`-by-`1` column vector, which satisfies the constraints.
+
+Simplex algorithm is one of many algorithms that are designed to handle this sort of problems
+efficiently. Although it is not optimal in theoretical sense (there exist algorithms that can solve
+any problem written as above in polynomial time, while simplex method degenerates to exponential
+time for some special cases), it is well-studied, easy to implement and is shown to work well for
+real-life purposes.
+
+The particular implementation is taken almost verbatim from **Introduction to Algorithms, third
+edition** by T. H. Cormen, C. E. Leiserson, R. L. Rivest and Clifford Stein. In particular, the
+Bland's rule <http://en.wikipedia.org/wiki/Bland%27s_rule> is used to prevent cycling.
+
+@param Func This row-vector corresponds to \f$c\f$ in the LP problem formulation (see above). It should
+contain 32- or 64-bit floating point numbers. As a convenience, column-vector may be also submitted,
+in the latter case it is understood to correspond to \f$c^T\f$.
+@param Constr `m`-by-`n+1` matrix, whose rightmost column corresponds to \f$b\f$ in formulation above
+and the remaining to \f$A\f$. It should contain 32- or 64-bit floating point numbers.
+@param z The solution will be returned here as a column-vector - it corresponds to \f$x\f$ in the
+formulation above. It will contain 64-bit floating point numbers.
+@param constr_eps allowed numeric disparity for constraints
+@return One of cv::SolveLPResult
+ */
+CV_EXPORTS_W int solveLP(InputArray Func, InputArray Constr, OutputArray z, double constr_eps);
+
+/** @overload */
+CV_EXPORTS_W int solveLP(InputArray Func, InputArray Constr, OutputArray z);
+
+//! @}
+
+}// cv
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/ovx.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/ovx.hpp
new file mode 100644
index 000000000000..8bb7d5491172
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/ovx.hpp
@@ -0,0 +1,28 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2016, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+// OpenVX related definitions and declarations
+
+#pragma once
+#ifndef OPENCV_OVX_HPP
+#define OPENCV_OVX_HPP
+
+#include "cvdef.h"
+
+namespace cv
+{
+/// @brief Check if use of OpenVX is possible. @return bool answering that question.
+CV_EXPORTS_W bool haveOpenVX();
+
+/// @brief Check if use of OpenVX is enabled. @return bool answering that question.
+CV_EXPORTS_W bool useOpenVX();
+
+/// @brief Enable/disable use of OpenVX. @param flag presumably true enables, false disables - confirm.
+CV_EXPORTS_W void setUseOpenVX(bool flag);
+} // namespace cv
+
+#endif // OPENCV_OVX_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/parallel/backend/parallel_for.openmp.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/parallel/backend/parallel_for.openmp.hpp
new file mode 100644
index 000000000000..b172cac34d3f
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/parallel/backend/parallel_for.openmp.hpp
@@ -0,0 +1,72 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_PARALLEL_FOR_OPENMP_HPP
+#define OPENCV_CORE_PARALLEL_FOR_OPENMP_HPP
+
+#include "opencv2/core/parallel/parallel_backend.hpp"
+
+#if !defined(_OPENMP) && !defined(OPENCV_SKIP_OPENMP_PRESENSE_CHECK)
+#error "This file must be compiled with enabled OpenMP"
+#endif
+
+#include <omp.h>
+
+namespace cv { namespace parallel { namespace openmp {
+
+/** parallel_for backend that dispatches tasks through OpenMP.
+ *
+ * @sa setParallelForBackend
+ * @ingroup core_parallel_backend
+ */
+class ParallelForBackend : public ParallelForAPI
+{
+protected:
+    int numThreads;     //!< requested thread count; values <= 0 select numThreadsMax
+    int numThreadsMax;  //!< omp_get_max_threads() sampled at construction
+public:
+    ParallelForBackend()
+        : numThreads(0)
+        , numThreadsMax(omp_get_max_threads())
+    {
+    }
+
+    virtual ~ParallelForBackend() {}
+
+    /** Calls body_callback(i, i + 1, callback_data) once per task index i in [0, tasks). */
+    virtual void parallel_for(int tasks, FN_parallel_for_body_cb_t body_callback, void* callback_data) CV_OVERRIDE
+    {
+        const int teamSize = (numThreads > 0) ? numThreads : numThreadsMax;
+#pragma omp parallel for schedule(dynamic) num_threads(teamSize)
+        for (int taskIdx = 0; taskIdx < tasks; ++taskIdx)
+            body_callback(taskIdx, taskIdx + 1, callback_data);
+    }
+
+    virtual int getThreadNum() const CV_OVERRIDE
+    {
+        return omp_get_thread_num();
+    }
+
+    virtual int getNumThreads() const CV_OVERRIDE
+    {
+        if (numThreads > 0)
+            return numThreads;
+        return numThreadsMax;
+    }
+
+    /** Stores the new thread count and returns the previously stored one. */
+    virtual int setNumThreads(int nThreads) CV_OVERRIDE
+    {
+        // parallel_for() consults numThreads on every call, so storing the value is all that is needed
+        const int previous = numThreads;
+        numThreads = nThreads;
+        return previous;
+    }
+
+    const char* getName() const CV_OVERRIDE { return "openmp"; }
+};
+
+}}}  // namespace
+
+#endif  // OPENCV_CORE_PARALLEL_FOR_OPENMP_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp
new file mode 100644
index 000000000000..04b0c4c6cb59
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp
@@ -0,0 +1,153 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_PARALLEL_FOR_TBB_HPP
+#define OPENCV_CORE_PARALLEL_FOR_TBB_HPP
+
+#include "opencv2/core/parallel/parallel_backend.hpp"
+#include <opencv2/core/utils/logger.hpp>
+
+#ifndef TBB_SUPPRESS_DEPRECATED_MESSAGES  // suppress warning
+#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1
+#endif
+#include "tbb/tbb.h"
+#if !defined(TBB_INTERFACE_VERSION)
+#error "Unknows/unsupported TBB version"
+#endif
+
+#if TBB_INTERFACE_VERSION >= 8000
+#include "tbb/task_arena.h"
+#endif
+
+namespace cv { namespace parallel { namespace tbb {
+
+using namespace ::tbb;
+
+#if TBB_INTERFACE_VERSION >= 8000  // task_arena path; tbb/task_arena.h is included above under the same condition
+static tbb::task_arena& getArena()  // `static` in a header: every including translation unit gets its own arena
+{
+    static tbb::task_arena tbbArena(tbb::task_arena::automatic);  // constructed on the first call
+    return tbbArena;
+}
+#else  // older TBB interface: use task_scheduler_init instead of an arena
+static tbb::task_scheduler_init& getScheduler()  // same per-translation-unit caveat as getArena()
+{
+    static tbb::task_scheduler_init tbbScheduler(tbb::task_scheduler_init::deferred);  // presumably stays inactive until initialize() - confirm
+    return tbbScheduler;
+}
+#endif  // TBB_INTERFACE_VERSION >= 8000
+
+/** TBB parallel_for API implementation
+ *
+ * @sa setParallelForBackend
+ * @ingroup core_parallel_backend
+ */
+class ParallelForBackend : public ParallelForAPI
+{
+protected:
+    int numThreads;     //!< value last passed to setNumThreads(); 0 initially
+    int numThreadsMax;  //!< not read anywhere in this class; zero-initialized so it is never indeterminate
+public:
+    ParallelForBackend()
+        : numThreads(0), numThreadsMax(0)  // numThreadsMax used to be left uninitialized
+    {
+        CV_LOG_INFO(NULL, "Initializing TBB parallel backend: TBB_INTERFACE_VERSION=" << TBB_INTERFACE_VERSION);
+#if TBB_INTERFACE_VERSION >= 8000
+        (void)getArena();
+#else
+        (void)getScheduler();
+#endif
+    }
+
+    virtual ~ParallelForBackend() {}
+    /** Functor handed to TBB. Holds a reference to the caller's callback pointer, so it must not outlive that caller. */
+    class CallbackProxy
+    {
+        const FN_parallel_for_body_cb_t& callback;
+        void* const callback_data;
+        const int tasks;
+    public:
+        inline CallbackProxy(int tasks_, FN_parallel_for_body_cb_t& callback_, void* callback_data_)
+            : callback(callback_), callback_data(callback_data_), tasks(tasks_)
+        {
+            // nothing
+        }
+        /** Range body: forwards (range.begin(), range.end(), callback_data) to the user callback. */
+        void operator()(const tbb::blocked_range<int>& range) const
+        {
+            this->callback(range.begin(), range.end(), callback_data);
+        }
+        /** Entry point: runs tbb::parallel_for over blocked_range(0, tasks) with *this as the range body. */
+        void operator()() const
+        {
+            tbb::parallel_for(tbb::blocked_range<int>(0, tasks), *this);
+        }
+    };
+    /** Runs body_callback over the range 0..tasks, inside the arena when TBB_INTERFACE_VERSION >= 8000. */
+    virtual void parallel_for(int tasks, FN_parallel_for_body_cb_t body_callback, void* callback_data) CV_OVERRIDE
+    {
+        CallbackProxy task(tasks, body_callback, callback_data);
+#if TBB_INTERFACE_VERSION >= 8000
+        getArena().execute(task);
+#else
+        task();
+#endif
+    }
+    /** @return TBB's current_thread_index(); always 0 when TBB_INTERFACE_VERSION < 8000. */
+    virtual int getThreadNum() const CV_OVERRIDE
+    {
+#if TBB_INTERFACE_VERSION >= 9100
+        return tbb::this_task_arena::current_thread_index();
+#elif TBB_INTERFACE_VERSION >= 8000
+        return tbb::task_arena::current_thread_index();
+#else
+        return 0;
+#endif
+    }
+    /** @return arena max_concurrency() (>= 9100); otherwise numThreads or TBB's default_num_threads(). */
+    virtual int getNumThreads() const CV_OVERRIDE
+    {
+#if TBB_INTERFACE_VERSION >= 9100
+        return getArena().max_concurrency();
+#elif TBB_INTERFACE_VERSION >= 8000
+        return numThreads > 0
+               ? numThreads
+               : tbb::task_scheduler_init::default_num_threads();
+#else
+        return getScheduler().is_active()
+               ? numThreads
+               : tbb::task_scheduler_init::default_num_threads();
+#endif
+    }
+    /** Stores nThreads, terminates an active arena/scheduler, re-initializes it if nThreads > 0. @return previous value. */
+    virtual int setNumThreads(int nThreads) CV_OVERRIDE
+    {
+        int oldNumThreads = numThreads;
+        numThreads = nThreads;
+
+#if TBB_INTERFACE_VERSION >= 8000
+        auto& tbbArena = getArena();
+        if (tbbArena.is_active())
+            tbbArena.terminate();
+        if (numThreads > 0)
+            tbbArena.initialize(numThreads);
+#else
+        auto& tbbScheduler = getScheduler();
+        if (tbbScheduler.is_active())
+            tbbScheduler.terminate();
+        if (numThreads > 0)
+            tbbScheduler.initialize(numThreads);
+#endif
+        return oldNumThreads;
+    }
+
+    const char* getName() const CV_OVERRIDE
+    {
+        return "tbb";
+    }
+};
+
+}}}  // namespace
+
+#endif  // OPENCV_CORE_PARALLEL_FOR_TBB_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/parallel/parallel_backend.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/parallel/parallel_backend.hpp
new file mode 100644
index 000000000000..c3e8333c1cc8
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/parallel/parallel_backend.hpp
@@ -0,0 +1,90 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_PARALLEL_BACKEND_HPP
+#define OPENCV_CORE_PARALLEL_BACKEND_HPP
+
+#include "opencv2/core/cvdef.h"
+#include <memory>
+
+namespace cv { namespace parallel {
+#ifndef CV_API_CALL
+#define CV_API_CALL
+#endif
+
+/** @addtogroup core_parallel_backend
+ * @{
+ * API below is provided to resolve problem of CPU resource over-subscription by multiple thread pools from different multi-threading frameworks.
+ * This is a common problem when the threading framework OpenCV was compiled with differs from the one used by the user's application.
+ *
+ * Applications can replace OpenCV `parallel_for()` backend with own implementation (to reuse Application's thread pool).
+ *
+ *
+ * ### Backend API usage examples
+ *
+ * #### Intel TBB
+ *
+ * - include header with simple implementation of TBB backend:
+ *   @snippet parallel_backend/example-tbb.cpp tbb_include
+ * - execute backend replacement code:
+ *   @snippet parallel_backend/example-tbb.cpp tbb_backend
+ * - configuration of compiler/linker options is responsibility of Application's scripts
+ *
+ * #### OpenMP
+ *
+ * - include header with simple implementation of OpenMP backend:
+ *   @snippet parallel_backend/example-openmp.cpp openmp_include
+ * - execute backend replacement code:
+ *   @snippet parallel_backend/example-openmp.cpp openmp_backend
+ * - Configuration of compiler/linker options is responsibility of Application's scripts
+ *
+ *
+ * ### Plugins support
+ *
+ * Runtime configuration options:
+ * - change backend priority: `OPENCV_PARALLEL_PRIORITY_<backend>=9999`
+ * - disable backend: `OPENCV_PARALLEL_PRIORITY_<backend>=0`
+ * - specify list of backends with high priority (>100000): `OPENCV_PARALLEL_PRIORITY_LIST=TBB,OPENMP`. Unknown backends are registered as new plugins.
+ *
+ */
+
+/** Interface for parallel_for backends implementations
+ *
+ * @sa setParallelForBackend
+ */
+class CV_EXPORTS ParallelForAPI
+{
+public:
+    virtual ~ParallelForAPI();
+    /** Body callback: receives a task range and the opaque data pointer (the bundled OpenMP backend passes (i, i + 1)). */
+    typedef void (CV_API_CALL *FN_parallel_for_body_cb_t)(int start, int end, void* data);
+    /** Execute `tasks` tasks through body_callback; the bundled backends forward callback_data to it unchanged. */
+    virtual void parallel_for(int tasks, FN_parallel_for_body_cb_t body_callback, void* callback_data) = 0;
+    /** @return backend-defined index of the calling thread. */
+    virtual int getThreadNum() const = 0;
+    /** @return thread count reported by the backend (exact meaning is backend-defined). */
+    virtual int getNumThreads() const = 0;
+    /** Request a thread count. @return backend-defined; the bundled OpenMP/TBB backends return the previous value. */
+    virtual int setNumThreads(int nThreads) = 0;
+    /** @return backend name; the bundled backends return "openmp" and "tbb". */
+    virtual const char* getName() const = 0;
+};
+
+/** @brief Replace OpenCV `parallel_for()` backend with the application's own implementation.
+ *
+ * @param api backend implementation to use, passed as a shared pointer.
+ * @param propagateNumThreads NOTE(review): presumably carries the current thread-count setting over to the new backend - confirm.
+ *
+ * @note This call is not thread-safe. Consider calling this function from the `main()` before any other OpenCV processing functions (and without any other created threads).
+ */
+CV_EXPORTS void setParallelForBackend(const std::shared_ptr<ParallelForAPI>& api, bool propagateNumThreads = true);
+
+/** @brief Change OpenCV parallel_for backend
+ * @param backendName name of the backend to switch to (NOTE(review): presumably names such as TBB or OPENMP - confirm).
+ * @note This call is not thread-safe. Consider calling this function from the `main()` before any other OpenCV processing functions (and without any other created threads).
+ */
+CV_EXPORTS_W bool setParallelForBackend(const std::string& backendName, bool propagateNumThreads = true);
+
+//! @}
+}}  // namespace
+#endif  // OPENCV_CORE_PARALLEL_BACKEND_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/persistence.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/persistence.hpp
new file mode 100644
index 000000000000..9c4f33fb1457
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/persistence.hpp
@@ -0,0 +1,1310 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_PERSISTENCE_HPP
+#define OPENCV_CORE_PERSISTENCE_HPP
+
+#ifndef CV_DOXYGEN
+/// Define to support persistence legacy formats
+#define CV__LEGACY_PERSISTENCE
+#endif
+
+#ifndef __cplusplus
+#  error persistence.hpp header must be compiled as C++
+#endif
+
+//! @addtogroup core_c
+//! @{
+
+/** @brief "black box" representation of the file storage associated with a file on disk.
+
+Several functions that are described below take CvFileStorage\* as inputs and allow the user to
+save or to load hierarchical collections that consist of scalar values, standard CXCore objects
+(such as matrices, sequences, graphs), and user-defined objects.
+
+OpenCV can read and write data in XML (<http://www.w3c.org/XML>), YAML (<http://www.yaml.org>) or
+JSON (<http://www.json.org/>) formats. Below is an example of 3x3 floating-point identity matrix A,
+stored in XML and YAML files
+using CXCore functions:
+XML:
+@code{.xml}
+    <?xml version="1.0"?>
+    <opencv_storage>
+    <A type_id="opencv-matrix">
+      <rows>3</rows>
+      <cols>3</cols>
+      <dt>f</dt>
+      <data>1. 0. 0. 0. 1. 0. 0. 0. 1.</data>
+    </A>
+    </opencv_storage>
+@endcode
+YAML:
+@code{.yaml}
+    %YAML:1.0
+    A: !!opencv-matrix
+      rows: 3
+      cols: 3
+      dt: f
+      data: [ 1., 0., 0., 0., 1., 0., 0., 0., 1.]
+@endcode
+As it can be seen from the examples, XML uses nested tags to represent hierarchy, while YAML uses
+indentation for that purpose (similar to the Python programming language).
+
+The same functions can read and write data in both formats; the particular format is determined by
+the extension of the opened file, ".xml" for XML files, ".yml" or ".yaml" for YAML and ".json" for
+JSON.
+ */
+
+//! @} core_c
+
+#include "opencv2/core/types.hpp"
+#include "opencv2/core/mat.hpp"
+
+namespace cv {
+
+/** @addtogroup core_xml
+
+XML/YAML/JSON file storages.     {#xml_storage}
+=======================
+Writing to a file storage.
+--------------------------
+You can store and then restore various OpenCV data structures to/from XML (<http://www.w3c.org/XML>),
+YAML (<http://www.yaml.org>) or JSON (<http://www.json.org/>) formats. Also, it is possible to store
+and load arbitrarily complex data structures, which include OpenCV data structures, as well as
+primitive data types (integer and floating-point numbers and text strings) as their elements.
+
+Use the following procedure to write something to XML, YAML or JSON:
+-# Create new FileStorage and open it for writing. It can be done with a single call to
+FileStorage::FileStorage constructor that takes a filename, or you can use the default constructor
+and then call FileStorage::open. Format of the file (XML, YAML or JSON) is determined from the filename
+extension (".xml", ".yml"/".yaml" and ".json", respectively)
+-# Write all the data you want using the streaming operator `<<`, just like in the case of STL
+streams.
+-# Close the file using FileStorage::release. FileStorage destructor also closes the file.
+
+Here is an example:
+@code
+    #include "opencv2/core.hpp"
+    #include <time.h>
+
+    using namespace cv;
+
+    int main(int, char** argv)
+    {
+        FileStorage fs("test.yml", FileStorage::WRITE);
+
+        fs << "frameCount" << 5;
+        time_t rawtime; time(&rawtime);
+        fs << "calibrationDate" << asctime(localtime(&rawtime));
+        Mat cameraMatrix = (Mat_<double>(3,3) << 1000, 0, 320, 0, 1000, 240, 0, 0, 1);
+        Mat distCoeffs = (Mat_<double>(5,1) << 0.1, 0.01, -0.001, 0, 0);
+        fs << "cameraMatrix" << cameraMatrix << "distCoeffs" << distCoeffs;
+        fs << "features" << "[";
+        for( int i = 0; i < 3; i++ )
+        {
+            int x = rand() % 640;
+            int y = rand() % 480;
+            uchar lbp = rand() % 256;
+
+            fs << "{:" << "x" << x << "y" << y << "lbp" << "[:";
+            for( int j = 0; j < 8; j++ )
+                fs << ((lbp >> j) & 1);
+            fs << "]" << "}";
+        }
+        fs << "]";
+        fs.release();
+        return 0;
+    }
+@endcode
+The sample above stores to YML an integer, a text string (calibration date), 2 matrices, and a custom
+structure "feature", which includes feature coordinates and LBP (local binary pattern) value. Here
+is output of the sample:
+@code{.yaml}
+%YAML:1.0
+frameCount: 5
+calibrationDate: "Fri Jun 17 14:09:29 2011\n"
+cameraMatrix: !!opencv-matrix
+   rows: 3
+   cols: 3
+   dt: d
+   data: [ 1000., 0., 320., 0., 1000., 240., 0., 0., 1. ]
+distCoeffs: !!opencv-matrix
+   rows: 5
+   cols: 1
+   dt: d
+   data: [ 1.0000000000000001e-01, 1.0000000000000000e-02,
+       -1.0000000000000000e-03, 0., 0. ]
+features:
+   - { x:167, y:49, lbp:[ 1, 0, 0, 1, 1, 0, 1, 1 ] }
+   - { x:298, y:130, lbp:[ 0, 0, 0, 1, 0, 0, 1, 1 ] }
+   - { x:344, y:158, lbp:[ 1, 1, 0, 0, 0, 0, 1, 0 ] }
+@endcode
+
+As an exercise, you can replace ".yml" with ".xml" or ".json" in the sample above and see what the
+corresponding XML or JSON file looks like.
+
+Several things can be noted by looking at the sample code and the output:
+
+-   The produced YAML (and XML/JSON) consists of heterogeneous collections that can be nested. There are
+    2 types of collections: named collections (mappings) and unnamed collections (sequences). In mappings
+    each element has a name and is accessed by name. This is similar to structures and std::map in
+    C/C++ and dictionaries in Python. In sequences elements do not have names, they are accessed by
+    indices. This is similar to arrays and std::vector in C/C++ and lists, tuples in Python.
+    "Heterogeneous" means that elements of each single collection can have different types.
+
+    Top-level collection in YAML/XML/JSON is a mapping. Each matrix is stored as a mapping, and the matrix
+    elements are stored as a sequence. Then, there is a sequence of features, where each feature is
+    represented as a mapping, and the lbp value as a nested sequence.
+
+-   When you write to a mapping (a structure), you write element name followed by its value. When you
+    write to a sequence, you simply write the elements one by one. OpenCV data structures (such as
+    cv::Mat) are written in absolutely the same way as simple C data structures - using `<<`
+    operator.
+
+-   To write a mapping, you first write the special string `{` to the storage, then write the
+    elements as pairs (`fs << <element_name> << <element_value>`) and then write the closing
+    `}`.
+
+-   To write a sequence, you first write the special string `[`, then write the elements, then
+    write the closing `]`.
+
+-   In YAML/JSON (but not XML), mappings and sequences can be written in a compact Python-like inline
+    form. In the sample above matrix elements, as well as each feature, including its lbp value, is
+    stored in such inline form. To store a mapping/sequence in a compact form, put `:` after the
+    opening character, e.g. use `{:` instead of `{` and `[:` instead of `[`. When the
+    data is written to XML, those extra `:` are ignored.
+
+Reading data from a file storage.
+---------------------------------
+To read the previously written XML, YAML or JSON file, do the following:
+-#  Open the file storage using FileStorage::FileStorage constructor or FileStorage::open method.
+    In the current implementation the whole file is parsed and the whole representation of file
+    storage is built in memory as a hierarchy of file nodes (see FileNode)
+
+-#  Read the data you are interested in. Use FileStorage::operator [], FileNode::operator []
+    and/or FileNodeIterator.
+
+-#  Close the storage using FileStorage::release.
+
+Here is how to read the file created by the code sample above:
+@code
+    FileStorage fs2("test.yml", FileStorage::READ);
+
+    // first method: use (type) operator on FileNode.
+    int frameCount = (int)fs2["frameCount"];
+
+    String date;
+    // second method: use FileNode::operator >>
+    fs2["calibrationDate"] >> date;
+
+    Mat cameraMatrix2, distCoeffs2;
+    fs2["cameraMatrix"] >> cameraMatrix2;
+    fs2["distCoeffs"] >> distCoeffs2;
+
+    cout << "frameCount: " << frameCount << endl
+         << "calibration date: " << date << endl
+         << "camera matrix: " << cameraMatrix2 << endl
+         << "distortion coeffs: " << distCoeffs2 << endl;
+
+    FileNode features = fs2["features"];
+    FileNodeIterator it = features.begin(), it_end = features.end();
+    int idx = 0;
+    std::vector<uchar> lbpval;
+
+    // iterate through a sequence using FileNodeIterator
+    for( ; it != it_end; ++it, idx++ )
+    {
+        cout << "feature #" << idx << ": ";
+        cout << "x=" << (int)(*it)["x"] << ", y=" << (int)(*it)["y"] << ", lbp: (";
+        // you can also easily read numerical arrays using FileNode >> std::vector operator.
+        (*it)["lbp"] >> lbpval;
+        for( int i = 0; i < (int)lbpval.size(); i++ )
+            cout << " " << (int)lbpval[i];
+        cout << ")" << endl;
+    }
+    fs2.release();
+@endcode
+
+Format specification    {#format_spec}
+--------------------
+`([count]{u|c|w|s|i|f|d})`... where the characters correspond to fundamental C++ types:
+-   `u` 8-bit unsigned number
+-   `c` 8-bit signed number
+-   `w` 16-bit unsigned number
+-   `s` 16-bit signed number
+-   `i` 32-bit signed number
+-   `f` single precision floating-point number
+-   `d` double precision floating-point number
+-   `r` pointer, 32 lower bits of which are written as a signed integer. The type can be used to
+    store structures with links between the elements.
+
+`count` is the optional counter of values of a given type. For example, `2if` means that each array
+element is a structure of 2 integers, followed by a single-precision floating-point number. The
+equivalent notations of the above specification are `iif`, `2i1f` and so forth. Other examples: `u`
+means that the array consists of bytes, and `2d` means the array consists of pairs of doubles.
+
+@see @ref samples/cpp/filestorage.cpp
+*/
+
+//! @{
+
+/** @example samples/cpp/filestorage.cpp
+A complete example using the FileStorage interface
+*/
+
+////////////////////////// XML & YAML I/O //////////////////////////
+
+class CV_EXPORTS FileNode;
+class CV_EXPORTS FileNodeIterator;
+
+/** @brief XML/YAML/JSON file storage class that encapsulates all the information necessary for writing or
+reading data to/from a file.
+ */
+class CV_EXPORTS_W FileStorage
+{
+public:
+    //! file storage mode
+    enum Mode
+    {
+        READ        = 0, //!< value, open the file for reading
+        WRITE       = 1, //!< value, open the file for writing
+        APPEND      = 2, //!< value, open the file for appending
+        MEMORY      = 4, /**< flag, read data from source or write data to the internal buffer (which is
+                              returned by FileStorage::releaseAndGetString) */
+        FORMAT_MASK = (7<<3), //!< mask for format flags
+        FORMAT_AUTO = 0,      //!< flag, auto format
+        FORMAT_XML  = (1<<3), //!< flag, XML format
+        FORMAT_YAML = (2<<3), //!< flag, YAML format
+        FORMAT_JSON = (3<<3), //!< flag, JSON format
+
+        BASE64      = 64,     //!< flag, write rawdata in Base64 by default. (consider using WRITE_BASE64)
+        WRITE_BASE64 = BASE64 | WRITE, //!< flag, enable both WRITE and BASE64
+    };
+    enum State
+    {
+        UNDEFINED      = 0,
+        VALUE_EXPECTED = 1,
+        NAME_EXPECTED  = 2,
+        INSIDE_MAP     = 4
+    };
+
+    /** @brief The constructors.
+
+     The full constructor opens the file. Alternatively you can use the default constructor and then
+     call FileStorage::open.
+     */
+    CV_WRAP FileStorage();
+
+    /** @overload
+     @copydoc open()
+     */
+    CV_WRAP FileStorage(const String& filename, int flags, const String& encoding=String());
+
+    //! the destructor. calls release()
+    virtual ~FileStorage();
+
+    /** @brief Opens a file.
+
+     See description of parameters in FileStorage::FileStorage. The method calls FileStorage::release
+     before opening the file.
+     @param filename Name of the file to open or the text string to read the data from.
+     Extension of the file (.xml, .yml/.yaml or .json) determines its format (XML, YAML or JSON
+     respectively). Also you can append .gz to work with compressed files, for example myHugeMatrix.xml.gz. If both
+     FileStorage::WRITE and FileStorage::MEMORY flags are specified, source is used just to specify
+     the output file format (e.g. mydata.xml, .yml etc.). A file name can also contain parameters.
+     You can use this format, "*?base64" (e.g. "file.json?base64" (case sensitive)), as an alternative to
+     FileStorage::BASE64 flag.
+     @param flags Mode of operation. One of FileStorage::Mode
+     @param encoding Encoding of the file. Note that UTF-16 XML encoding is not supported currently and
+     you should use 8-bit encoding instead of it.
+     */
+    CV_WRAP virtual bool open(const String& filename, int flags, const String& encoding=String());
+
+    /** @brief Checks whether the file is opened.
+
+     @returns true if the object is associated with the current file and false otherwise. It is a
+     good practice to call this method after you tried to open a file.
+     */
+    CV_WRAP virtual bool isOpened() const;
+
+    /** @brief Closes the file and releases all the memory buffers.
+
+     Call this method after all I/O operations with the storage are finished.
+     */
+    CV_WRAP virtual void release();
+
+    /** @brief Closes the file, releases all the memory buffers and returns a string.
+
+     Call this method after all I/O operations with the storage are finished. If the storage was
+     opened for writing to memory (FileStorage::WRITE with FileStorage::MEMORY), the returned string
+     presumably holds the internal buffer (see FileStorage::MEMORY) - TODO confirm. */
+    CV_WRAP virtual String releaseAndGetString();
+
+    /** @brief Returns the first element of the top-level mapping.
+     @returns The first element of the top-level mapping.
+     */
+    CV_WRAP FileNode getFirstTopLevelNode() const;
+
+    /** @brief Returns the top-level mapping
+     @param streamidx Zero-based index of the stream. In most cases there is only one stream in the file.
+     However, YAML supports multiple streams and so there can be several.
+     @returns The top-level mapping.
+     */
+    CV_WRAP FileNode root(int streamidx=0) const;
+
+    /** @brief Returns the specified element of the top-level mapping.
+     @param nodename Name of the file node.
+     @returns Node with the given name.
+     */
+    FileNode operator[](const String& nodename) const;
+
+    /** @overload */
+    CV_WRAP_AS(getNode) FileNode operator[](const char* nodename) const;
+
+    /**
+     * @brief Simplified writing API to use with bindings.
+     * @param name Name of the written object. When writing to sequences (a.k.a. "arrays"), pass an empty string.
+     * @param val Value of the written object.
+     */
+    CV_WRAP void write(const String& name, int val);
+    /// @overload
+    CV_WRAP void write(const String& name, double val);
+    /// @overload
+    CV_WRAP void write(const String& name, const String& val);
+    /// @overload
+    CV_WRAP void write(const String& name, const Mat& val);
+    /// @overload
+    CV_WRAP void write(const String& name, const std::vector<String>& val);
+
+    /** @brief Writes multiple numbers.
+
+     Writes one or more numbers of the specified format to the currently written structure. Usually it is
+     more convenient to use operator `<<` instead of this method.
+     @param fmt Specification of each array element, see @ref format_spec "format specification"
+     @param vec Pointer to the written array.
+     @param len Number of the uchar elements to write.
+     */
+    void writeRaw( const String& fmt, const void* vec, size_t len );
+
+    /** @brief Writes a comment.
+
+     The function writes a comment into file storage. The comments are skipped when the storage is read.
+     @param comment The written comment, single-line or multi-line
+     @param append If true, the function tries to put the comment at the end of current line.
+     Else if the comment is multi-line, or if it does not fit at the end of the current
+     line, the comment starts a new line.
+     */
+    CV_WRAP void writeComment(const String& comment, bool append = false);
+
+    /** @brief Starts to write a nested structure (sequence or a mapping).
+    @param name name of the structure. When writing to sequences (a.k.a. "arrays"), pass an empty string.
+    @param flags type of the structure (FileNode::MAP or FileNode::SEQ (both with optional FileNode::FLOW)).
+    @param typeName optional name of the type you store. The effect of setting this depends on the storage format.
+    I.e. if the format has a specification for storing type information, this parameter is used.
+    */
+    CV_WRAP void startWriteStruct(const String& name, int flags, const String& typeName=String());
+
+    /** @brief Finishes writing nested structure (should pair startWriteStruct())
+    */
+    CV_WRAP void endWriteStruct();
+
+    /** @brief Returns the normalized object name for the specified name of a file.
+    @param filename Name of a file
+    @returns The normalized object name.
+     */
+    static String getDefaultObjectName(const String& filename);
+
+    /** @brief Returns the current format.
+     * @returns The current format, see FileStorage::Mode
+     */
+    CV_WRAP int getFormat() const;
+
+    int state; //!< NOTE(review): presumably holds FileStorage::State values - confirm
+    std::string elname; //!< NOTE(review): looks like the pending element name while writing - confirm
+
+    class Impl; //!< implementation class, only forward-declared in this header
+    Ptr<Impl> p; //!< handle to the implementation object
+};
+
+/** @brief File Storage Node class.
+
+The node is used to store each and every element of the file storage opened for reading. When
+XML/YAML file is read, it is first parsed and stored in the memory as a hierarchical collection of
+nodes. Each node can be a "leaf" that contains a single number or a string, or be a collection of
+other nodes. There can be named collections (mappings) where each element has a name and it is
+accessed by a name, and ordered collections (sequences) where elements do not have names but rather
+accessed by index. Type of the file node can be determined using FileNode::type method.
+
+Note that file nodes are only used for navigating file storages opened for reading. When a file
+storage is opened for writing, no data is stored in memory after it is written.
+ */
+class CV_EXPORTS_W_SIMPLE FileNode
+{
+public:
+    //! type of the file storage node
+    enum
+    {
+        NONE      = 0, //!< empty node
+        INT       = 1, //!< an integer
+        REAL      = 2, //!< floating-point number
+        FLOAT     = REAL, //!< synonym for REAL
+        STR       = 3, //!< text string in UTF-8 encoding
+        STRING    = STR, //!< synonym for STR
+        SEQ       = 4, //!< sequence
+        MAP       = 5, //!< mapping
+        TYPE_MASK = 7, //!< mask covering the low three bits, i.e. the type values NONE..MAP listed above
+
+        FLOW      = 8,  //!< compact representation of a sequence or mapping. Used only by YAML writer
+        UNIFORM   = 8,  //!< if set, means that all the collection elements are numbers of the same type (real's or int's).
+        //!< UNIFORM is used only when reading FileStorage; FLOW is used only when writing. So they share the same bit
+        EMPTY     = 16, //!< empty structure (sequence or mapping)
+        NAMED     = 32  //!< the node has a name (i.e. it is an element of a mapping).
+    };
+    /** @brief The constructors.
+
+     These constructors are used to create a default file node, construct it from obsolete structures or
+     from another file node.
+     */
+    CV_WRAP FileNode();
+
+    /** @overload
+     @param fs Pointer to the file storage structure.
+     @param blockIdx Index of the memory block where the file node is stored
+     @param ofs Offset in bytes from the beginning of the serialized storage
+
+     @deprecated
+     */
+    FileNode(const FileStorage* fs, size_t blockIdx, size_t ofs);
+
+    /** @overload
+     @param node File node to be used as initialization for the created file node.
+     */
+    FileNode(const FileNode& node);
+
+    FileNode& operator=(const FileNode& node);
+
+    /** @brief Returns element of a mapping node or a sequence node.
+     @param nodename Name of an element in the mapping node.
+     @returns Returns the element with the given identifier.
+     */
+    FileNode operator[](const String& nodename) const;
+
+    /** @overload
+     @param nodename Name of an element in the mapping node.
+     */
+    CV_WRAP_AS(getNode) FileNode operator[](const char* nodename) const;
+
+    /** @overload
+     @param i Index of an element in the sequence node.
+     */
+    CV_WRAP_AS(at) FileNode operator[](int i) const;
+
+    /** @brief Returns keys of a mapping node.
+     @returns Keys of a mapping node.
+     */
+    CV_WRAP std::vector<String> keys() const;
+
+    /** @brief Returns type of the node.
+     @returns Type of the node. See the unnamed type enum at the top of this class.
+     */
+    CV_WRAP int type() const;
+
+    //! returns true if the node is empty
+    CV_WRAP bool empty() const;
+    //! returns true if the node is a "none" object
+    CV_WRAP bool isNone() const;
+    //! returns true if the node is a sequence
+    CV_WRAP bool isSeq() const;
+    //! returns true if the node is a mapping
+    CV_WRAP bool isMap() const;
+    //! returns true if the node is an integer
+    CV_WRAP bool isInt() const;
+    //! returns true if the node is a floating-point number
+    CV_WRAP bool isReal() const;
+    //! returns true if the node is a text string
+    CV_WRAP bool isString() const;
+    //! returns true if the node has a name
+    CV_WRAP bool isNamed() const;
+    //! returns the node name or an empty string if the node is nameless
+    CV_WRAP std::string name() const;
+    //! returns the number of elements in the node, if it is a sequence or mapping, or 1 otherwise.
+    CV_WRAP size_t size() const;
+    //! returns raw size of the FileNode in bytes
+    CV_WRAP size_t rawSize() const;
+    //! returns the node content as an integer. If the node stores a floating-point number, it is rounded.
+    operator int() const;
+    //! returns the node content as float
+    operator float() const;
+    //! returns the node content as double
+    operator double() const;
+    //! returns the node content as text string (forwards to string())
+    inline operator std::string() const { return this->string(); }
+
+    static bool isMap(int flags);
+    static bool isSeq(int flags);
+    static bool isCollection(int flags);
+    static bool isEmptyCollection(int flags);
+    static bool isFlow(int flags);
+
+    uchar* ptr();
+    const uchar* ptr() const;
+
+    //! returns iterator pointing to the first node element
+    FileNodeIterator begin() const;
+    //! returns iterator pointing to the element following the last node element
+    FileNodeIterator end() const;
+
+    /** @brief Reads node elements to the buffer with the specified format.
+
+    Usually it is more convenient to use operator `>>` instead of this method.
+    @param fmt Specification of each array element. See @ref format_spec "format specification"
+    @param vec Pointer to the destination array.
+    @param len Number of bytes to read (buffer size limit). If it is greater than the number of
+               remaining elements then all of them will be read.
+     */
+    void readRaw( const String& fmt, void* vec, size_t len ) const;
+
+    /** Internal method used when reading FileStorage.
+     Sets the type (int, real or string) and value of the previously created node.
+     */
+    void setValue( int type, const void* value, int len=-1 );
+
+    //! Simplified reading API to use with bindings.
+    CV_WRAP double real() const;
+    //! Simplified reading API to use with bindings.
+    CV_WRAP std::string string() const;
+    //! Simplified reading API to use with bindings.
+    CV_WRAP Mat mat() const;
+
+    //protected: (the specifier is commented out, so everything below is still public)
+    FileNode(FileStorage::Impl* fs, size_t blockIdx, size_t ofs);
+
+    FileStorage::Impl* fs;
+    size_t blockIdx;
+    size_t ofs;
+};
+
+
+/** @brief used to iterate through sequences and mappings.
+
+ A standard STL notation, with node.begin(), node.end() denoting the beginning and the end of a
+ sequence, stored in node. See the data reading sample in the beginning of the section.
+ */
+class CV_EXPORTS FileNodeIterator
+{
+public:
+    /** @brief The constructors.
+
+     These constructors are used to create a default iterator, set it to a specific element in a file node
+     or construct it from another iterator.
+     */
+    FileNodeIterator();
+
+    /** @overload
+     @param node File node - the collection to iterate over;
+        it can be a scalar (equivalent to 1-element collection) or "none" (equivalent to empty collection).
+     @param seekEnd - true if iterator needs to be set after the last element of the node;
+        that is:
+            * node.begin() => FileNodeIterator(node, false)
+            * node.end() => FileNodeIterator(node, true)
+     */
+    FileNodeIterator(const FileNode& node, bool seekEnd);
+
+    /** @overload
+     @param it Iterator to be used as initialization for the created iterator.
+     */
+    FileNodeIterator(const FileNodeIterator& it);
+
+    FileNodeIterator& operator=(const FileNodeIterator& it);
+
+    //! returns the currently observed element
+    FileNode operator *() const;
+
+    //! moves iterator to the next node (prefix form)
+    FileNodeIterator& operator ++ ();
+    //! moves iterator to the next node (postfix form)
+    FileNodeIterator operator ++ (int);
+    //! moves iterator forward by the specified offset (possibly negative)
+    FileNodeIterator& operator += (int ofs);
+
+    /** @brief Reads node elements to the buffer with the specified format.
+
+    Usually it is more convenient to use operator `>>` instead of this method.
+    @param fmt Specification of each array element. See @ref format_spec "format specification"
+    @param vec Pointer to the destination array.
+    @param len Number of bytes to read (buffer size limit). If it is greater than the number of
+               remaining elements then all of them will be read.
+     */
+    FileNodeIterator& readRaw( const String& fmt, void* vec,
+                               size_t len=(size_t)INT_MAX );
+
+    //! returns the number of remaining (not read yet) elements
+    size_t remaining() const;
+
+    bool equalTo(const FileNodeIterator& it) const; // NOTE(review): presumably backs the free operator== / operator!= - confirm in the .cpp
+
+protected:
+    FileStorage::Impl* fs;
+    size_t blockIdx;
+    size_t ofs;
+    size_t blockSize;
+    size_t nodeNElems;
+    size_t idx;
+};
+
+//! @} core_xml
+
+/////////////////// XML & YAML I/O implementation //////////////////
+
+CV_EXPORTS void write( FileStorage& fs, const String& name, int value );
+CV_EXPORTS void write( FileStorage& fs, const String& name, float value );
+CV_EXPORTS void write( FileStorage& fs, const String& name, double value );
+CV_EXPORTS void write( FileStorage& fs, const String& name, const String& value );
+CV_EXPORTS void write( FileStorage& fs, const String& name, const Mat& value );
+CV_EXPORTS void write( FileStorage& fs, const String& name, const SparseMat& value );
+#ifdef CV__LEGACY_PERSISTENCE
+CV_EXPORTS void write( FileStorage& fs, const String& name, const std::vector<KeyPoint>& value);
+CV_EXPORTS void write( FileStorage& fs, const String& name, const std::vector<DMatch>& value);
+#endif
+
+CV_EXPORTS void writeScalar( FileStorage& fs, int value );
+CV_EXPORTS void writeScalar( FileStorage& fs, float value );
+CV_EXPORTS void writeScalar( FileStorage& fs, double value );
+CV_EXPORTS void writeScalar( FileStorage& fs, const String& value );
+
+CV_EXPORTS void read(const FileNode& node, int& value, int default_value);
+CV_EXPORTS void read(const FileNode& node, float& value, float default_value);
+CV_EXPORTS void read(const FileNode& node, double& value, double default_value);
+CV_EXPORTS void read(const FileNode& node, std::string& value, const std::string& default_value);
+CV_EXPORTS void read(const FileNode& node, Mat& mat, const Mat& default_mat = Mat() );
+CV_EXPORTS void read(const FileNode& node, SparseMat& mat, const SparseMat& default_mat = SparseMat() );
+#ifdef CV__LEGACY_PERSISTENCE
+CV_EXPORTS void read(const FileNode& node, std::vector<KeyPoint>& keypoints);
+CV_EXPORTS void read(const FileNode& node, std::vector<DMatch>& matches);
+#endif
+CV_EXPORTS void read(const FileNode& node, KeyPoint& value, const KeyPoint& default_value);
+CV_EXPORTS void read(const FileNode& node, DMatch& value, const DMatch& default_value);
+
+template<typename _Tp> static inline void read(const FileNode& node, Point_<_Tp>& value, const Point_<_Tp>& default_value)
+{
+    std::vector<_Tp> xy; FileNodeIterator cursor = node.begin(); cursor >> xy; // read the node's elements
+    value = xy.size() == 2 ? Point_<_Tp>(saturate_cast<_Tp>(xy[0]), saturate_cast<_Tp>(xy[1])) : default_value; // exactly two values, else default
+}
+
+template<typename _Tp> static inline void read(const FileNode& node, Point3_<_Tp>& value, const Point3_<_Tp>& default_value)
+{
+    std::vector<_Tp> xyz; FileNodeIterator cursor = node.begin(); cursor >> xyz; // read the node's elements
+    value = xyz.size() == 3 ? Point3_<_Tp>(saturate_cast<_Tp>(xyz[0]), saturate_cast<_Tp>(xyz[1]),
+                                           saturate_cast<_Tp>(xyz[2])) : default_value; // exactly three values, else default
+}
+
+template<typename _Tp> static inline void read(const FileNode& node, Size_<_Tp>& value, const Size_<_Tp>& default_value)
+{
+    std::vector<_Tp> dims; FileNodeIterator cursor = node.begin(); cursor >> dims; // read the node's elements
+    value = dims.size() == 2 ? Size_<_Tp>(saturate_cast<_Tp>(dims[0]), saturate_cast<_Tp>(dims[1])) : default_value; // exactly two values, else default
+}
+
+template<typename _Tp> static inline void read(const FileNode& node, Complex<_Tp>& value, const Complex<_Tp>& default_value)
+{
+    std::vector<_Tp> parts; FileNodeIterator cursor = node.begin(); cursor >> parts; // read the node's elements
+    value = parts.size() == 2 ? Complex<_Tp>(saturate_cast<_Tp>(parts[0]), saturate_cast<_Tp>(parts[1])) : default_value; // exactly two values, else default
+}
+
+template<typename _Tp> static inline void read(const FileNode& node, Rect_<_Tp>& value, const Rect_<_Tp>& default_value)
+{
+    std::vector<_Tp> fields; FileNodeIterator cursor = node.begin(); cursor >> fields; // read the node's elements
+    value = fields.size() == 4 ? Rect_<_Tp>(saturate_cast<_Tp>(fields[0]), saturate_cast<_Tp>(fields[1]),
+                                            saturate_cast<_Tp>(fields[2]), saturate_cast<_Tp>(fields[3])) : default_value;
+}
+
+template<typename _Tp, int cn> static inline void read(const FileNode& node, Vec<_Tp, cn>& value, const Vec<_Tp, cn>& default_value)
+{
+    std::vector<_Tp> elems; FileNodeIterator cursor = node.begin(); cursor >> elems; // read the node's elements
+    value = elems.size() == cn ? Vec<_Tp, cn>(&elems[0]) : default_value; // exactly cn values, else default
+}
+
+template<typename _Tp, int m, int n> static inline void read(const FileNode& node, Matx<_Tp, m, n>& value, const Matx<_Tp, m, n>& default_matx = Matx<_Tp, m, n>())
+{
+    Mat decoded;
+    read(node, decoded); // decode through the Mat overload of read()
+    // Nothing decoded -> use the default; otherwise convert to the fixed-size type.
+    if (!decoded.empty())
+        value = Matx<_Tp, m, n>(decoded);
+    else
+        value = default_matx;
+}
+
+template<typename _Tp> static inline void read(const FileNode& node, Scalar_<_Tp>& value, const Scalar_<_Tp>& default_value)
+{
+    std::vector<_Tp> comps; FileNodeIterator cursor = node.begin(); cursor >> comps; // read the node's elements
+    value = comps.size() == 4 ? Scalar_<_Tp>(saturate_cast<_Tp>(comps[0]), saturate_cast<_Tp>(comps[1]),
+                                             saturate_cast<_Tp>(comps[2]), saturate_cast<_Tp>(comps[3])) : default_value;
+}
+
+static inline void read(const FileNode& node, Range& value, const Range& default_value)
+{
+    // A Range goes through the Point2i overload: x <-> start, y <-> end.
+    Point2i bounds(value.start, value.end); const Point2i fallback(default_value.start, default_value.end);
+    read(node, bounds, fallback); value.start = bounds.x; value.end = bounds.y;
+}
+
+/** @brief Writes string to a file storage.
+ */
+CV_EXPORTS FileStorage& operator << (FileStorage& fs, const String& str);
+
+//! @cond IGNORED
+
+namespace internal
+{
+    class CV_EXPORTS WriteStructContext // scoped helper instantiated by the named write() overloads in this header
+    {
+    public:
+        WriteStructContext(FileStorage& _fs, const String& name, int flags, const String& typeName = String()); // NOTE(review): presumably calls _fs.startWriteStruct(name, flags, typeName) - confirm in the .cpp
+        ~WriteStructContext(); // NOTE(review): presumably calls endWriteStruct() on the same storage - confirm in the .cpp
+    private:
+        FileStorage* fs;
+    };
+
+    template<typename _Tp, int numflag> class VecWriterProxy
+    {
+    public:
+        VecWriterProxy( FileStorage* _fs ) : fs(_fs) {}
+        void operator()(const std::vector<_Tp>& vec) const
+        {
+            // Generic path: emit the elements one at a time through the unnamed write() overloads.
+            for (typename std::vector<_Tp>::const_iterator elem = vec.begin(); elem != vec.end(); ++elem)
+                write(*fs, *elem);
+        }
+    private:
+        FileStorage* fs;
+    };
+
+    template<typename _Tp> class VecWriterProxy<_Tp, 1>
+    {
+    public:
+        VecWriterProxy( FileStorage* _fs ) : fs(_fs) {}
+        void operator()(const std::vector<_Tp>& vec) const // raw path: the whole vector is written with one writeRaw() call
+        {
+            int _fmt = traits::SafeFmt<_Tp>::fmt; CV_Assert((_fmt >> 8) < 9); // '1' + (_fmt >> 8) must stay a digit '1'..'9'; VecReaderProxy<_Tp, 1> asserts the same
+            char fmt[] = { (char)((_fmt >> 8) + '1'), (char)_fmt, '\0' };
+            fs->writeRaw(fmt, !vec.empty() ? (uchar*)&vec[0] : 0, vec.size() * sizeof(_Tp));
+        }
+    private:
+        FileStorage* fs;
+    };
+
+    template<typename _Tp, int numflag> class VecReaderProxy
+    {
+    public:
+        VecReaderProxy( FileNodeIterator* _it ) : it(_it) {}
+        void operator()(std::vector<_Tp>& vec, size_t count) const
+        {
+            const size_t total = std::min(count, it->remaining()); // never read more than what is left
+            vec.resize(total);
+            for (size_t k = 0; k != total; ++k, ++(*it))
+                read(**it, vec[k], _Tp()); // element-wise decode; _Tp() is the per-element default
+        }
+    private:
+        FileNodeIterator* it;
+    };
+
+    template<typename _Tp> class VecReaderProxy<_Tp, 1>
+    {
+    public:
+        VecReaderProxy( FileNodeIterator* _it ) : it(_it) {}
+        void operator()(std::vector<_Tp>& vec, size_t count) const
+        {
+            const size_t itemsLeft = it->remaining();
+            const size_t channels = DataType<_Tp>::channels;
+            const int packedFmt = traits::SafeFmt<_Tp>::fmt;
+            CV_Assert((packedFmt >> 8) < 9); // the count character built below must stay a single digit
+            char fmt[] = { (char)((packedFmt >> 8)+'1'), (char)packedFmt, '\0' };
+            CV_Assert((itemsLeft % channels) == 0); // only whole _Tp elements can be read
+            const size_t elemsLeft = itemsLeft / channels;
+            count = std::min(count, elemsLeft);
+            vec.resize(count);
+            it->readRaw(fmt, !vec.empty() ? (uchar*)&vec[0] : 0, count*sizeof(_Tp));
+        }
+    private:
+        FileNodeIterator* it;
+    };
+
+} // internal
+
+//! @endcond
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const _Tp& value)
+{
+    write(fs, String(), value); // unnamed write: forward to the named overload with an empty name
+}
+
+template<> inline
+void write( FileStorage& fs, const int& value )
+{
+    writeScalar(fs, value); // specialization: int goes straight to writeScalar()
+}
+
+template<> inline
+void write( FileStorage& fs, const float& value )
+{
+    writeScalar(fs, value); // specialization: float goes straight to writeScalar()
+}
+
+template<> inline
+void write( FileStorage& fs, const double& value )
+{
+    writeScalar(fs, value); // specialization: double goes straight to writeScalar()
+}
+
+template<> inline
+void write( FileStorage& fs, const String& value )
+{
+    writeScalar(fs, value); // specialization: String goes straight to writeScalar()
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const Point_<_Tp>& pt )
+{
+    write(fs, pt.x); // field order on disk: x, y
+    write(fs, pt.y);
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const Point3_<_Tp>& pt )
+{
+    write(fs, pt.x); // field order on disk: x, y, z
+    write(fs, pt.y);
+    write(fs, pt.z);
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const Size_<_Tp>& sz )
+{
+    write(fs, sz.width); // field order on disk: width, height
+    write(fs, sz.height);
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const Complex<_Tp>& c )
+{
+    write(fs, c.re); // field order on disk: re, im
+    write(fs, c.im);
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const Rect_<_Tp>& r )
+{
+    write(fs, r.x); // field order on disk: x, y, width, height
+    write(fs, r.y);
+    write(fs, r.width);
+    write(fs, r.height);
+}
+
+template<typename _Tp, int cn> static inline
+void write(FileStorage& fs, const Vec<_Tp, cn>& v )
+{
+    for(int k = 0; k != cn; ++k) // emit the cn components in index order
+        write(fs, v.val[k]);
+}
+
+template<typename _Tp, int m, int n> static inline
+void write(FileStorage& fs, const Matx<_Tp, m, n>& x )
+{
+    write(fs, Mat(x)); // write as a Mat class (mirrors read(), which decodes a Mat and converts it to Matx)
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const Scalar_<_Tp>& s )
+{
+    // A Scalar_ is written as its four components, val[0] through val[3],
+    // in index order.
+    for(int k = 0; k < 4; ++k)
+        write(fs, s.val[k]);
+}
+
+static inline
+void write(FileStorage& fs, const Range& r )
+{
+    write(fs, r.start); // field order on disk: start, end
+    write(fs, r.end);
+}
+
+template<typename _Tp> static inline
+void write( FileStorage& fs, const std::vector<_Tp>& vec )
+{
+    // SafeFmt<_Tp>::fmt != 0 selects the raw VecWriterProxy<_Tp, 1>; otherwise the element-wise proxy is used.
+    cv::internal::VecWriterProxy<_Tp, traits::SafeFmt<_Tp>::fmt != 0>(&fs)(vec);
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const String& name, const Point_<_Tp>& pt )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW); // scoped struct context: flow-style sequence
+    write(fs, pt); // unnamed overload emits the fields
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const String& name, const Point3_<_Tp>& pt )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW); // scoped struct context: flow-style sequence
+    write(fs, pt); // unnamed overload emits the fields
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const String& name, const Size_<_Tp>& sz )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW); // scoped struct context: flow-style sequence
+    write(fs, sz); // unnamed overload emits the fields
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const String& name, const Complex<_Tp>& c )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW); // scoped struct context: flow-style sequence
+    write(fs, c); // unnamed overload emits the fields
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const String& name, const Rect_<_Tp>& r )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW); // scoped struct context: flow-style sequence
+    write(fs, r); // unnamed overload emits the fields
+}
+
+template<typename _Tp, int cn> static inline
+void write(FileStorage& fs, const String& name, const Vec<_Tp, cn>& v )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW); // scoped struct context: flow-style sequence
+    write(fs, v); // unnamed overload emits the cn components
+}
+
+template<typename _Tp, int m, int n> static inline
+void write(FileStorage& fs, const String& name, const Matx<_Tp, m, n>& x )
+{
+    write(fs, name, Mat(x)); // write as a Mat class (no struct context is opened here)
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const String& name, const Scalar_<_Tp>& s )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW); // scoped struct context: flow-style sequence
+    write(fs, s); // unnamed overload emits the four components
+}
+
+static inline
+void write(FileStorage& fs, const String& name, const Range& r )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW); // scoped struct context: flow-style sequence
+    write(fs, r); // unnamed overload emits start, end
+}
+
+static inline
+void write(FileStorage& fs, const String& name, const KeyPoint& kpt)
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW); // scoped struct context: flow-style sequence
+    write(fs, kpt.pt.x); // field order must match operator>>(const FileNode&, KeyPoint&) in this header
+    write(fs, kpt.pt.y);
+    write(fs, kpt.size);
+    write(fs, kpt.angle);
+    write(fs, kpt.response);
+    write(fs, kpt.octave);
+    write(fs, kpt.class_id);
+}
+
+static inline
+void write(FileStorage& fs, const String& name, const DMatch& m)
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW); // scoped struct context: flow-style sequence
+    write(fs, m.queryIdx); // field order must match operator>>(const FileNode&, DMatch&) in this header
+    write(fs, m.trainIdx);
+    write(fs, m.imgIdx);
+    write(fs, m.distance);
+}
+
+template<typename _Tp, typename std::enable_if< std::is_enum<_Tp>::value >::type* = nullptr>
+static inline void write( FileStorage& fs, const String& name, const _Tp& val )
+{
+    write(fs, name, static_cast<int>(val)); // enums (overload enabled only for enum types) are stored as plain ints
+}
+
+template<typename _Tp> static inline
+void write( FileStorage& fs, const String& name, const std::vector<_Tp>& vec )
+{
+    const int structFlags = FileNode::SEQ+(traits::SafeFmt<_Tp>::fmt != 0 ? FileNode::FLOW : 0); // FLOW only when SafeFmt<_Tp>::fmt is non-zero
+    cv::internal::WriteStructContext ws(fs, name, structFlags); write(fs, vec); // ws stays alive until the payload has been written
+}
+
+template<typename _Tp> static inline
+void write( FileStorage& fs, const String& name, const std::vector< std::vector<_Tp> >& vec )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ); // outer sequence carries the element name
+    for(size_t i = 0; i < vec.size(); i++)
+    {   // Elements of a sequence are unnamed, so every inner vector is opened with an empty name instead of `name`.
+        cv::internal::WriteStructContext ws_(fs, String(), FileNode::SEQ+(traits::SafeFmt<_Tp>::fmt != 0 ? FileNode::FLOW : 0));
+        write(fs, vec[i]);
+    }
+}
+
+#ifdef CV__LEGACY_PERSISTENCE
+// This code is not needed anymore, but it is preserved here to keep source compatibility
+// Implementation is similar to templates instantiations
+static inline void write(FileStorage& fs, const KeyPoint& kpt) { write(fs, String(), kpt); }
+static inline void write(FileStorage& fs, const DMatch& m) { write(fs, String(), m); }
+static inline void write(FileStorage& fs, const std::vector<KeyPoint>& vec)
+{
+    // Explicitly pick the element-wise writer (numflag == 0).
+    cv::internal::VecWriterProxy<KeyPoint, 0>(&fs)(vec);
+}
+static inline void write(FileStorage& fs, const std::vector<DMatch>& vec)
+{
+    // Same as the KeyPoint overload: explicitly pick the element-wise
+    // writer (numflag == 0).
+    cv::internal::VecWriterProxy<DMatch, 0>(&fs)(vec);
+}
+#endif
+
+
+static inline
+void read(const FileNode& node, bool& value, bool default_value)
+{
+    int asInt; // bool travels through the int overload
+    read(node, asInt, static_cast<int>(default_value));
+    value = (0 != asInt);
+}
+
+static inline
+void read(const FileNode& node, uchar& value, uchar default_value)
+{
+    int widened; // read as int, then saturate down to uchar
+    read(node, widened, static_cast<int>(default_value));
+    value = saturate_cast<uchar>(widened);
+}
+
+static inline
+void read(const FileNode& node, schar& value, schar default_value)
+{
+    int widened; // read as int, then saturate down to schar
+    read(node, widened, static_cast<int>(default_value));
+    value = saturate_cast<schar>(widened);
+}
+
+static inline
+void read(const FileNode& node, ushort& value, ushort default_value)
+{
+    int widened; // read as int, then saturate down to ushort
+    read(node, widened, static_cast<int>(default_value));
+    value = saturate_cast<ushort>(widened);
+}
+
+static inline
+void read(const FileNode& node, short& value, short default_value)
+{
+    int widened; // read as int, then saturate down to short
+    read(node, widened, static_cast<int>(default_value));
+    value = saturate_cast<short>(widened);
+}
+
+template<typename _Tp> static inline
+void read( FileNodeIterator& it, std::vector<_Tp>& vec, size_t maxCount = (size_t)INT_MAX )
+{
+    // SafeFmt<_Tp>::fmt != 0 selects the raw VecReaderProxy<_Tp, 1>; otherwise the element-wise proxy is used.
+    cv::internal::VecReaderProxy<_Tp, traits::SafeFmt<_Tp>::fmt != 0>(&it)(vec, maxCount);
+}
+
+template<typename _Tp, typename std::enable_if< std::is_enum<_Tp>::value >::type* = nullptr>
+static inline void read(const FileNode& node, _Tp& value, const _Tp& default_value = static_cast<_Tp>(0))
+{
+    // Enum values travel as plain ints: read one, then cast it back.
+    int asInt; read(node, asInt, static_cast<int>(default_value));
+    value = static_cast<_Tp>(asInt);
+}
+
+template<typename _Tp> static inline
+void read( const FileNode& node, std::vector<_Tp>& vec, const std::vector<_Tp>& default_value = std::vector<_Tp>() )
+{
+    if(node.empty())
+    {   // empty node: hand back the caller's default
+        vec = default_value;
+        return;
+    }
+    FileNodeIterator first = node.begin();
+    read( first, vec );
+}
+
+static inline
+void read( const FileNode& node, std::vector<KeyPoint>& vec, const std::vector<KeyPoint>& default_value )
+{
+    if(!node.empty())
+        read(node, vec); // non-empty node: defer to the two-argument read()
+    else
+        vec = default_value;
+}
+
+static inline
+void read( const FileNode& node, std::vector<DMatch>& vec, const std::vector<DMatch>& default_value )
+{
+    if(!node.empty())
+        read(node, vec); // non-empty node: defer to the two-argument read()
+    else
+        vec = default_value;
+}
+
+/** @brief Writes data to a file storage (no-op if the storage is not opened; raises an error if a name is still expected).
+ */
+template<typename _Tp> static inline
+FileStorage& operator << (FileStorage& fs, const _Tp& value)
+{
+    if( !fs.isOpened() )
+        return fs; // writes to a storage that is not open are silently ignored
+    if( fs.state == FileStorage::NAME_EXPECTED + FileStorage::INSIDE_MAP )
+        CV_Error( Error::StsError, "No element name has been given" );
+    write( fs, fs.elname, value ); // NOTE(review): the named write() overloads take `const String&`, so (assuming String is std::string) `name` aliases fs.elname
+    if( fs.state & FileStorage::INSIDE_MAP )
+        fs.state = FileStorage::NAME_EXPECTED + FileStorage::INSIDE_MAP; // inside a map, the next item must again be a name
+    return fs;
+}
+
+/** @brief Writes a C string to a file storage (the string is converted to String and forwarded).
+ */
+static inline
+FileStorage& operator << (FileStorage& fs, const char* str)
+{
+    return (fs << String(str));
+}
+
+/** @brief Writes a mutable C string to a file storage (the string is converted to String and forwarded).
+ */
+static inline
+FileStorage& operator << (FileStorage& fs, char* value)
+{
+    return (fs << String(value));
+}
+
+/** @brief Reads one element from the iterator's current node and advances the iterator.
+ */
+template<typename _Tp> static inline
+FileNodeIterator& operator >> (FileNodeIterator& it, _Tp& value)
+{
+    read( *it, value, _Tp()); // a value-initialized _Tp is passed as the default
+    return ++it;
+}
+
+/** @brief Reads elements from the iterator into a vector (at most INT_MAX elements are requested).
+ */
+template<typename _Tp> static inline
+FileNodeIterator& operator >> (FileNodeIterator& it, std::vector<_Tp>& vec)
+{
+    cv::internal::VecReaderProxy<_Tp, traits::SafeFmt<_Tp>::fmt != 0> r(&it); // raw reader when SafeFmt<_Tp>::fmt != 0, element-wise otherwise
+    r(vec, (size_t)INT_MAX);
+    return it;
+}
+
+/** @brief Reads data from a file node; a value-initialized _Tp is passed as the default.
+ */
+template<typename _Tp> static inline
+void operator >> (const FileNode& n, _Tp& value)
+{
+    read( n, value, _Tp());
+}
+
+/** @brief Reads the elements of a file node into a vector, starting at n.begin().
+ */
+template<typename _Tp> static inline
+void operator >> (const FileNode& n, std::vector<_Tp>& vec)
+{
+    FileNodeIterator it = n.begin();
+    it >> vec;
+}
+
+/** @brief Reads a KeyPoint from a file storage node.
+*/
+//It needs special handling because it contains two types of fields, int & float.
+static inline
+void operator >> (const FileNode& n, KeyPoint& kpt)
+{
+    FileNodeIterator it = n.begin();
+    it >> kpt.pt.x >> kpt.pt.y >> kpt.size >> kpt.angle >> kpt.response >> kpt.octave >> kpt.class_id; // same field order as write(fs, name, KeyPoint)
+}
+
+#ifdef CV__LEGACY_PERSISTENCE
+static inline
+void operator >> (const FileNode& n, std::vector<KeyPoint>& vec)
+{
+    read(n, vec); // forwards to the two-argument read() for std::vector<KeyPoint>
+}
+static inline
+void operator >> (const FileNode& n, std::vector<DMatch>& vec)
+{
+    read(n, vec); // forwards to the two-argument read() for std::vector<DMatch>
+}
+#endif
+
+/** @brief Reads a DMatch from a file storage node.
+*/
+//It needs special handling because it contains two types of fields, int & float.
+static inline
+void operator >> (const FileNode& n, DMatch& m)
+{
+    FileNodeIterator it = n.begin();
+    it >> m.queryIdx >> m.trainIdx >> m.imgIdx >> m.distance; // same field order as write(fs, name, DMatch)
+}
+
+CV_EXPORTS bool operator == (const FileNodeIterator& it1, const FileNodeIterator& it2);
+CV_EXPORTS bool operator != (const FileNodeIterator& it1, const FileNodeIterator& it2);
+
+static inline
+ptrdiff_t operator - (const FileNodeIterator& it1, const FileNodeIterator& it2)
+{
+    return static_cast<ptrdiff_t>(it2.remaining() - it1.remaining()); // difference of the remaining-element counts, it2's minus it1's
+}
+
+static inline
+bool operator < (const FileNodeIterator& it1, const FileNodeIterator& it2)
+{
+    return it2.remaining() < it1.remaining(); // it1 orders first when it has more elements left
+}
+
+} // cv
+
+#endif // OPENCV_CORE_PERSISTENCE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/quaternion.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/quaternion.hpp
new file mode 100644
index 000000000000..9e3e44332f60
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/quaternion.hpp
@@ -0,0 +1,1696 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2020, Huawei Technologies Co., Ltd. All rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: Liangqian Kong <chargerKong@126.com>
+//         Longbu Wang <riskiest@gmail.com>
+#ifndef OPENCV_CORE_QUATERNION_HPP
+#define OPENCV_CORE_QUATERNION_HPP
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/utils/logger.hpp>
+#include <iostream>
+namespace cv
+{
+//! @addtogroup core
+//! @{
+
+//! Flag stating whether input quaternions may be assumed to be unit quaternions.
+enum QuatAssumeType
+{
+    /**
+     * This flag is specified by default.
+     * With this flag, the input quaternions are not assumed to be unit quaternions.
+     * This guarantees the correctness of the calculations,
+     * although the calculation will be slower than with the flag QUAT_ASSUME_UNIT.
+     */
+    QUAT_ASSUME_NOT_UNIT,
+    /**
+     * With this flag, the input quaternions are assumed to be unit quaternions, which
+     * saves some computations. However, if this flag is specified for a quaternion that is
+     * not a unit quaternion, the correctness of the result is not guaranteed.
+     */
+    QUAT_ASSUME_UNIT
+};
+
+class QuatEnum // Holder class that scopes the EulerAnglesType enumeration.
+{
+public:
+    /** @brief Enum of Euler angles type.
+     *
+     * Without considering the possibility of using two different conventions for the definition of the rotation axes,
+     * there exist twelve possible sequences of rotation axes, divided into two groups:
+     * - Proper Euler angles (Z-X-Z, X-Y-X, Y-Z-Y, Z-Y-Z, X-Z-X, Y-X-Y)
+     * - Tait–Bryan angles (X-Y-Z, Y-Z-X, Z-X-Y, X-Z-Y, Z-Y-X, Y-X-Z).
+     *
+     * The three elemental rotations may be [extrinsic](https://en.wikipedia.org/wiki/Euler_angles#Definition_by_extrinsic_rotations)
+     * (rotations about the axes *xyz* of the original coordinate system, which is assumed to remain motionless),
+     * or [intrinsic](https://en.wikipedia.org/wiki/Euler_angles#Definition_by_intrinsic_rotations) (rotations about the axes of the rotating coordinate system *XYZ*, solidary with the moving body, which changes its orientation after each elemental rotation).
+     *
+     *
+     * Extrinsic and intrinsic rotations are relevant.
+     *
+     * The definition of the Euler angles is as follows:
+     * - \f$\theta_1 \f$ represents the first rotation angle,
+     * - \f$\theta_2 \f$ represents the second rotation angle,
+     * - \f$\theta_3 \f$ represents the third rotation angle.
+     *
+     * For intrinsic rotations in the order of X-Y-Z, the rotation matrix R can be calculated by:\f[R =X(\theta_1) Y(\theta_2) Z(\theta_3) \f]
+     * For extrinsic rotations in the order of X-Y-Z, the rotation matrix R can be calculated by:\f[R =Z({\theta_3}) Y({\theta_2}) X({\theta_1})\f]
+     * where
+     * \f[X({\theta_1})={\begin{bmatrix}1&0&0\\0&\cos {\theta_1} &-\sin {\theta_1} \\0&\sin {\theta_1} &\cos {\theta_1} \\\end{bmatrix}},
+     * Y({\theta_2})={\begin{bmatrix}\cos \theta_{2}&0&\sin \theta_{2}\\0&1 &0 \\ -\sin \theta_2& 0&\cos \theta_{2} \\\end{bmatrix}},
+     * Z({\theta_3})={\begin{bmatrix}\cos\theta_{3} &-\sin \theta_3&0\\\sin \theta_3 &\cos \theta_3 &0\\0&0&1\\\end{bmatrix}}.
+     * \f]
+     *
+     * The function is designed according to this set of conventions:
+     * - [Right handed](https://en.wikipedia.org/wiki/Right_hand_rule) reference frames are adopted, and the [right hand rule](https://en.wikipedia.org/wiki/Right_hand_rule) is used to determine the sign of angles.
+     * - Each matrix is meant to represent an [active rotation](https://en.wikipedia.org/wiki/Active_and_passive_transformation) (the composing and composed matrices
+     * are supposed to act on the coordinates of vectors defined in the initial fixed reference frame and give as a result the coordinates of a rotated vector defined in the same reference frame).
+     * - For \f$\theta_1\f$ and \f$\theta_3\f$, the valid range is (−π, π].
+     *
+     *   For \f$\theta_2\f$, the valid range is [−π/2, π/2] or [0, π].
+     *
+     *   For Tait–Bryan angles, the valid range of \f$\theta_2\f$ is [−π/2, π/2]. When transforming a quaternion to Euler angles, the solution of Euler angles is unique under the condition \f$ \theta_2 \in (−π/2, π/2)\f$ .
+     *   If \f$\theta_2 = −π/2 \f$ or \f$ \theta_2 = π/2\f$, there are infinitely many solutions. The common name for this situation is gimbal lock.
+     *   For Proper Euler angles, the valid range of \f$\theta_2\f$ is [0, π]. The solutions of Euler angles are unique under the condition \f$ \theta_2 \in (0, π)\f$ . If \f$\theta_2 =0 \f$ or \f$\theta_2 =π \f$,
+     *   there are infinitely many solutions and gimbal lock will occur.
+     */
+    enum EulerAnglesType
+    {
+        INT_XYZ, ///< Intrinsic rotations with the Euler angles type X-Y-Z
+        INT_XZY, ///< Intrinsic rotations with the Euler angles type X-Z-Y
+        INT_YXZ, ///< Intrinsic rotations with the Euler angles type Y-X-Z
+        INT_YZX, ///< Intrinsic rotations with the Euler angles type Y-Z-X
+        INT_ZXY, ///< Intrinsic rotations with the Euler angles type Z-X-Y
+        INT_ZYX, ///< Intrinsic rotations with the Euler angles type Z-Y-X
+        INT_XYX, ///< Intrinsic rotations with the Euler angles type X-Y-X
+        INT_XZX, ///< Intrinsic rotations with the Euler angles type X-Z-X
+        INT_YXY, ///< Intrinsic rotations with the Euler angles type Y-X-Y
+        INT_YZY, ///< Intrinsic rotations with the Euler angles type Y-Z-Y
+        INT_ZXZ, ///< Intrinsic rotations with the Euler angles type Z-X-Z
+        INT_ZYZ, ///< Intrinsic rotations with the Euler angles type Z-Y-Z
+
+        EXT_XYZ, ///< Extrinsic rotations with the Euler angles type X-Y-Z
+        EXT_XZY, ///< Extrinsic rotations with the Euler angles type X-Z-Y
+        EXT_YXZ, ///< Extrinsic rotations with the Euler angles type Y-X-Z
+        EXT_YZX, ///< Extrinsic rotations with the Euler angles type Y-Z-X
+        EXT_ZXY, ///< Extrinsic rotations with the Euler angles type Z-X-Y
+        EXT_ZYX, ///< Extrinsic rotations with the Euler angles type Z-Y-X
+        EXT_XYX, ///< Extrinsic rotations with the Euler angles type X-Y-X
+        EXT_XZX, ///< Extrinsic rotations with the Euler angles type X-Z-X
+        EXT_YXY, ///< Extrinsic rotations with the Euler angles type Y-X-Y
+        EXT_YZY,  ///< Extrinsic rotations with the Euler angles type Y-Z-Y
+        EXT_ZXZ, ///< Extrinsic rotations with the Euler angles type Z-X-Z
+        EXT_ZYZ, ///< Extrinsic rotations with the Euler angles type Z-Y-Z
+        #ifndef CV_DOXYGEN
+            EULER_ANGLES_MAX_VALUE
+        #endif
+    };
+
+};
+
+template <typename _Tp> class Quat; // forward declaration; the class template is defined below
+template <typename _Tp> std::ostream& operator<<(std::ostream&, const Quat<_Tp>&); // stream output for Quat; declared only at this point
+
+/**
+ * Quaternion is a number system that extends the complex numbers. It can be expressed as a
+ * rotation in three-dimensional space.
+ * A quaternion is generally represented in the form:
+ *      \f[q = w + x\boldsymbol{i} + y\boldsymbol{j} + z\boldsymbol{k}\f]
+ *      \f[q = [w, x, y, z]\f]
+ *      \f[q = [w, \boldsymbol{v}] \f]
+ *      \f[q = ||q||[\cos\psi, u_x\sin\psi,u_y\sin\psi,  u_z\sin\psi].\f]
+ *      \f[q = ||q||[\cos\psi, \boldsymbol{u}\sin\psi]\f]
+ * where \f$\psi = \frac{\theta}{2}\f$, \f$\theta\f$ represents rotation angle,
+ * \f$\boldsymbol{u} = [u_x, u_y, u_z]\f$ represents normalized rotation axis,
+ * and \f$||q||\f$ represents the norm of \f$q\f$.
+ *
+ * A unit quaternion usually represents a rotation and has the form:
+ *      \f[q = [\cos\psi, u_x\sin\psi,u_y\sin\psi,  u_z\sin\psi].\f]
+ *
+ * To create a quaternion representing the rotation around the axis \f$\boldsymbol{u}\f$
+ * with angle \f$\theta\f$, you can use
+ * ```
+ * using namespace cv;
+ * double angle = CV_PI;
+ * Vec3d axis = {0, 0, 1};
+ * Quatd q = Quatd::createFromAngleAxis(angle, axis);
+ * ```
+ *
+ * You can simply use four same type number to create a quaternion
+ * ```
+ * Quatd q(1, 2, 3, 4);
+ * ```
+ * Or use a Vec4d or Vec4f vector.
+ * ```
+ * Vec4d vec{1, 2, 3, 4};
+ * Quatd q(vec);
+ * ```
+ *
+ * ```
+ * Vec4f vec{1, 2, 3, 4};
+ * Quatf q(vec);
+ * ```
+ *
+ * If you already have a 3x3 rotation matrix R, then you can use
+ * ```
+ * Quatd q = Quatd::createFromRotMat(R);
+ * ```
+ *
+ * If you already have a rotation vector rvec which has the form of `angle * axis`, then you can use
+ * ```
+ * Quatd q = Quatd::createFromRvec(rvec);
+ * ```
+ *
+ * To extract the rotation matrix from quaternion, see toRotMat3x3()
+ *
+ * To extract the Vec4d or Vec4f, see toVec()
+ *
+ * To extract the rotation vector, see toRotVec()
+ *
+ * If two quaternions \f$q_0, q_1\f$ need to be interpolated, you can use nlerp(), slerp() or spline()
+ * ```
+ * Quatd::nlerp(q0, q1, t)
+ *
+ * Quatd::slerp(q0, q1, t)
+ *
+ * Quatd::spline(q0, q0, q1, q1, t)
+ * ```
+ * spline can smoothly connect rotations of  multiple quaternions
+ *
+ * Three ways to get an element in Quaternion
+ * ```
+ * Quatf q(1,2,3,4);
+ * std::cout << q.w << std::endl; // w=1, x=2, y=3, z=4
+ * std::cout << q[0] << std::endl; // q[0]=1, q[1]=2, q[2]=3, q[3]=4
+ * std::cout << q.at(0) << std::endl;
+ * ```
+ */
+template <typename _Tp>
+class Quat
+{
+    static_assert(std::is_floating_point<_Tp>::value, "Quaternion only make sense with type of float or double");
+    using value_type = _Tp;
+public:
+    static constexpr _Tp CV_QUAT_EPS = (_Tp)1.e-6;
+    static constexpr _Tp CV_QUAT_CONVERT_THRESHOLD = (_Tp)1.e-6;
+
+    Quat();
+
+    /**
+     * @brief From Vec4d or Vec4f.
+     */
+    explicit Quat(const Vec<_Tp, 4> &coeff);
+
+    /**
+     * @brief from four numbers.
+     */
+    Quat(_Tp w, _Tp x, _Tp y, _Tp z);
+
+    /**
+     * @brief from an angle, axis. Axis will be normalized in this function. And
+     * it generates
+     * \f[q = [\cos\psi, u_x\sin\psi,u_y\sin\psi,  u_z\sin\psi].\f]
+     * where \f$\psi = \frac{\theta}{2}\f$, \f$\theta\f$ is the rotation angle.
+     */
+    static Quat<_Tp> createFromAngleAxis(const _Tp angle, const Vec<_Tp, 3> &axis);
+
+    /**
+     * @brief from a 3x3 rotation matrix.
+     */
+    static Quat<_Tp> createFromRotMat(InputArray R);
+
+    /**
+     * @brief from a rotation vector
+     * \f$r\f$ has the form \f$\theta \cdot \boldsymbol{u}\f$, where \f$\theta\f$
+     * represents rotation angle and \f$\boldsymbol{u}\f$ represents normalized rotation axis.
+     *
+     * Angle and axis could be easily derived as:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * \theta &= ||r||\\
+     * \boldsymbol{u} &= \frac{r}{\theta}
+     * \end{split}
+     * \end{equation}
+     * \f]
+     * Then a quaternion can be calculated by
+     *  \f[q = [\cos\psi, \boldsymbol{u}\sin\psi]\f]
+     *  where \f$\psi = \theta / 2 \f$
+     */
+    static Quat<_Tp> createFromRvec(InputArray rvec);
+
+     /**
+     * @brief
+     * from Euler angles
+     *
+     * A quaternion can be generated from Euler angles by combining the quaternion representations of the Euler rotations.
+     *
+     * For example, if we use intrinsic rotations in the order of X-Y-Z,\f$\theta_1 \f$ is rotation around the X-axis, \f$\theta_2 \f$ is rotation around the Y-axis,
+     * \f$\theta_3 \f$ is rotation around the Z-axis. The final quaternion q can be calculated by
+     *
+     * \f[ {q} = q_{X, \theta_1}  q_{Y, \theta_2} q_{Z, \theta_3}\f]
+     * where \f$ q_{X, \theta_1} \f$ is created from @ref createFromXRot,  \f$ q_{Y, \theta_2} \f$ is created from @ref createFromYRot,
+     *  \f$ q_{Z, \theta_3} \f$ is created from @ref createFromZRot.
+     * @param angles the Euler angles in a vector of length 3
+     * @param eulerAnglesType the Euler angles convention type
+     */
+    static Quat<_Tp> createFromEulerAngles(const Vec<_Tp, 3> &angles, QuatEnum::EulerAnglesType eulerAnglesType);
+
+    /**
+     * @brief get a quaternion from a rotation about the Y-axis by \f$\theta\f$ .
+     * \f[q = \cos(\theta/2)+0 i+ sin(\theta/2) j +0k \f]
+     */
+    static Quat<_Tp> createFromYRot(const _Tp theta);
+
+    /**
+     * @brief get a quaternion from a rotation about the X-axis by \f$\theta\f$ .
+     * \f[q = \cos(\theta/2)+sin(\theta/2) i +0 j +0 k \f]
+     */
+    static Quat<_Tp> createFromXRot(const _Tp theta);
+
+    /**
+     * @brief get a quaternion from a rotation about the Z-axis by \f$\theta\f$.
+     * \f[q = \cos(\theta/2)+0 i +0 j +sin(\theta/2) k \f]
+     */
+    static Quat<_Tp> createFromZRot(const _Tp theta);
+
+    /**
+     * @brief a way to get element.
+     * @param index over a range [0, 3].
+     *
+     * A quaternion q
+     *
+     * q.at(0) is equivalent to q.w,
+     *
+     * q.at(1) is equivalent to q.x,
+     *
+     * q.at(2) is equivalent to q.y,
+     *
+     * q.at(3) is equivalent to q.z.
+     */
+    _Tp at(size_t index) const;
+
+    /**
+     * @brief return the conjugate of this quaternion.
+     * \f[q.conjugate() = (w, -x, -y, -z).\f]
+     */
+    Quat<_Tp> conjugate() const;
+
+    /**
+     *
+     * @brief return the value of exponential value.
+     * \f[\exp(q) = e^w (\cos||\boldsymbol{v}||+ \frac{v}{||\boldsymbol{v}||}\sin||\boldsymbol{v}||)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     *
+     * For example:
+     * ```
+     * Quatd q{1,2,3,4};
+     * cout << exp(q) << endl;
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> exp(const Quat<T> &q);
+
+    /**
+     * @brief return the value of exponential value.
+     * \f[\exp(q) = e^w (\cos||\boldsymbol{v}||+ \frac{v}{||\boldsymbol{v}||}\sin||\boldsymbol{v}||)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     *
+     * For example
+     * ```
+     * Quatd q{1,2,3,4};
+     * cout << q.exp() << endl;
+     * ```
+     */
+    Quat<_Tp> exp() const;
+
+    /**
+     * @brief return the value of logarithm function.
+     * \f[\ln(q) = \ln||q|| + \frac{\boldsymbol{v}}{||\boldsymbol{v}||}\arccos\frac{w}{||q||}.\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, q assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatd q1{1,2,3,4};
+     * cout << log(q1) << endl;
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> log(const Quat<T> &q, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief return the value of logarithm function.
+     *  \f[\ln(q) = \ln||q|| + \frac{\boldsymbol{v}}{||\boldsymbol{v}||}\arccos\frac{w}{||q||}\f].
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.log();
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * Quatd q1(1,2,3,4);
+     * q1.normalize().log(assumeUnit);
+     * ```
+     */
+    Quat<_Tp> log(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the value of power function with index \f$x\f$.
+     * \f[q^x = ||q||^x(\cos(x\psi) + \boldsymbol{u}\sin(x\psi)), \quad \psi = \frac{\theta}{2}.\f]
+     * @param q a quaternion.
+     * @param x index of exponentiation.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, quaternion q assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * power(q, 2.0);
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * double angle = CV_PI;
+     * Vec3d axis{0, 0, 1};
+     * Quatd q1 = Quatd::createFromAngleAxis(angle, axis); //generate a unit quat by axis and angle
+     * power(q1, 2.0, assumeUnit);//This assumeUnit means q1 is a unit quaternion.
+     * ```
+     * @note the type of the index should be the same as the quaternion.
+     */
+    template <typename T>
+    friend Quat<T> power(const Quat<T> &q, const T x, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief return the value of power function with index \f$x\f$.
+     * \f[q^x = ||q||^x(\cos(x\psi) + \boldsymbol{u}\sin(x\psi)), \quad \psi = \frac{\theta}{2}.\f]
+     * @param x index of exponentiation.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.power(2.0);
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * double angle = CV_PI;
+     * Vec3d axis{0, 0, 1};
+     * Quatd q1 = Quatd::createFromAngleAxis(angle, axis); //generate a unit quat by axis and angle
+     * q1.power(2.0, assumeUnit); //This assumeUnit means q1 is a unit quaternion
+     * ```
+     */
+    Quat<_Tp> power(const _Tp x, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return \f$\sqrt{q}\f$.
+     * @param q a quaternion.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, quaternion q assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatf q(1,2,3,4);
+     * sqrt(q);
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * q = {1,0,0,0};
+     * sqrt(q, assumeUnit); //This assumeUnit means q is a unit quaternion.
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> sqrt(const Quat<T> &q, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief return \f$\sqrt{q}\f$.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatf q(1,2,3,4);
+     * q.sqrt();
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * q = {1,0,0,0};
+     * q.sqrt(assumeUnit); //This assumeUnit means q is a unit quaternion
+     * ```
+     */
+    Quat<_Tp> sqrt(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the value of power function with quaternion \f$q\f$.
+     * \f[p^q = e^{q\ln(p)}.\f]
+     * @param p base quaternion of power function.
+     * @param q index quaternion of power function.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, quaternion \f$p\f$ assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatd p(1,2,3,4);
+     * Quatd q(5,6,7,8);
+     * power(p, q);
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * p = p.normalize();
+     * power(p, q, assumeUnit); //This assumeUnit means p is a unit quaternion
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> power(const Quat<T> &p, const Quat<T> &q, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief return the value of power function with quaternion \f$q\f$.
+     * \f[p^q = e^{q\ln(p)}.\f]
+     * @param q index quaternion of power function.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatd p(1,2,3,4);
+     * Quatd q(5,6,7,8);
+     * p.power(q);
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * p = p.normalize();
+     * p.power(q, assumeUnit); //This assumeUnit means p is a unit quaternion
+     * ```
+     */
+    Quat<_Tp> power(const Quat<_Tp> &q, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the crossProduct between \f$p = (a, b, c, d) = (a, \boldsymbol{u})\f$ and \f$q = (w, x, y, z) = (w, \boldsymbol{v})\f$.
+     * \f[p \times q = \frac{pq- qp}{2}\f]
+     * \f[p \times q = \boldsymbol{u} \times \boldsymbol{v}\f]
+     * \f[p \times q = (cz-dy)i + (dx-bz)j + (by-xc)k \f]
+     *
+     * For example
+     * ```
+     * Quatd q{1,2,3,4};
+     * Quatd p{5,6,7,8};
+     * crossProduct(p, q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> crossProduct(const Quat<T> &p, const Quat<T> &q);
+
+    /**
+     * @brief return the crossProduct between \f$p = (a, b, c, d) = (a, \boldsymbol{u})\f$ and \f$q = (w, x, y, z) = (w, \boldsymbol{v})\f$.
+     * \f[p \times q = \frac{pq- qp}{2}.\f]
+     * \f[p \times q = \boldsymbol{u} \times \boldsymbol{v}.\f]
+     * \f[p \times q = (cz-dy)i + (dx-bz)j + (by-xc)k. \f]
+     *
+     * For example
+     * ```
+     * Quatd q{1,2,3,4};
+     * Quatd p{5,6,7,8};
+     * p.crossProduct(q)
+     * ```
+     */
+    Quat<_Tp> crossProduct(const Quat<_Tp> &q) const;
+
+    /**
+     * @brief return the norm of quaternion.
+     * \f[||q|| = \sqrt{w^2 + x^2 + y^2 + z^2}.\f]
+     */
+    _Tp norm() const;
+
+    /**
+     * @brief return a normalized \f$p\f$.
+     * \f[p = \frac{q}{||q||}\f]
+     * where \f$p\f$ satisfies \f$(p.x)^2 + (p.y)^2 + (p.z)^2 + (p.w)^2 = 1.\f$
+     */
+    Quat<_Tp> normalize() const;
+
+    /**
+     * @brief return \f$q^{-1}\f$ which is an inverse of \f$q\f$
+     * which satisfies \f$q * q^{-1} = 1\f$.
+     * @param q a quaternion.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, quaternion q assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * inv(q);
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * q = q.normalize();
+     * inv(q, assumeUnit);//This assumeUnit means q is a unit quaternion
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> inv(const Quat<T> &q, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief return \f$q^{-1}\f$ which is an inverse of \f$q\f$
+     * satisfying \f$q * q^{-1} = 1\f$.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, quaternion q assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.inv();
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * q = q.normalize();
+     * q.inv(assumeUnit);  //assumeUnit means q is a unit quaternion
+     * ```
+     */
+    Quat<_Tp> inv(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return sinh value of quaternion q, sinh could be calculated as:
+     * \f[\sinh(p) = \sinh(w)\cos(||\boldsymbol{v}||) + \cosh(w)\frac{v}{||\boldsymbol{v}||}\sin||\boldsymbol{v}||\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * sinh(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> sinh(const Quat<T> &q);
+
+    /**
+     * @brief return sinh value of this quaternion, sinh could be calculated as:
+     * \f$\sinh(p) = \sinh(w)\cos(||\boldsymbol{v}||) + \cosh(w)\frac{v}{||\boldsymbol{v}||}\sin||\boldsymbol{v}||\f$
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.sinh();
+     * ```
+     */
+    Quat<_Tp> sinh() const;
+
+    /**
+     * @brief return cosh value of quaternion q, cosh could be calculated as:
+     * \f[\cosh(p) = \cosh(w) * \cos(||\boldsymbol{v}||) + \sinh(w)\frac{\boldsymbol{v}}{||\boldsymbol{v}||}\sin(||\boldsymbol{v}||)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * cosh(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> cosh(const Quat<T> &q);
+
+    /**
+     * @brief return cosh value of this quaternion, cosh could be calculated as:
+     * \f[\cosh(p) = \cosh(w) * \cos(||\boldsymbol{v}||) + \sinh(w)\frac{\boldsymbol{v}}{||\boldsymbol{v}||}sin(||\boldsymbol{v}||)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.cosh();
+     * ```
+     */
+    Quat<_Tp> cosh() const;
+
+    /**
+     * @brief return tanh value of quaternion q, tanh could be calculated as:
+     * \f[ \tanh(q) = \frac{\sinh(q)}{\cosh(q)}.\f]
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * tanh(q);
+     * ```
+     * @sa sinh, cosh
+     */
+    template <typename T>
+    friend Quat<T> tanh(const Quat<T> &q);
+
+    /**
+     * @brief return tanh value of this quaternion, tanh could be calculated as:
+     * \f[ \tanh(q) = \frac{\sinh(q)}{\cosh(q)}.\f]
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.tanh();
+     * ```
+     * @sa sinh, cosh
+     */
+    Quat<_Tp> tanh() const;
+
+    /**
+     * @brief return sin value of quaternion q, sin could be calculated as:
+     * \f[\sin(p) = \sin(w) * \cosh(||\boldsymbol{v}||) + \cos(w)\frac{\boldsymbol{v}}{||\boldsymbol{v}||}\sinh(||\boldsymbol{v}||)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * sin(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> sin(const Quat<T> &q);
+
+    /**
+     * @brief return sin value of this quaternion, sin could be calculated as:
+     * \f[\sin(p) = \sin(w) * \cosh(||\boldsymbol{v}||) + \cos(w)\frac{\boldsymbol{v}}{||\boldsymbol{v}||}\sinh(||\boldsymbol{v}||)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.sin();
+     * ```
+     */
+    Quat<_Tp> sin() const;
+
+    /**
+     * @brief return cos value of quaternion q, cos could be calculated as:
+     * \f[\cos(p) = \cos(w) * \cosh(||\boldsymbol{v}||) - \sin(w)\frac{\boldsymbol{v}}{||\boldsymbol{v}||}\sinh(||\boldsymbol{v}||)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * cos(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> cos(const Quat<T> &q);
+
+    /**
+     * @brief return cos value of this quaternion, cos could be calculated as:
+     * \f[\cos(p) = \cos(w) * \cosh(||\boldsymbol{v}||) - \sin(w)\frac{\boldsymbol{v}}{||\boldsymbol{v}||}\sinh(||\boldsymbol{v}||)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.cos();
+     * ```
+     */
+    Quat<_Tp> cos() const;
+
+    /**
+     * @brief return tan value of quaternion q, tan could be calculated as:
+     * \f[\tan(q) = \frac{\sin(q)}{\cos(q)}.\f]
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * tan(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> tan(const Quat<T> &q);
+
+    /**
+     * @brief return tan value of this quaternion, tan could be calculated as:
+     * \f[\tan(q) = \frac{\sin(q)}{\cos(q)}.\f]
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.tan();
+     * ```
+     */
+    Quat<_Tp> tan() const;
+
+    /**
+     * @brief return arcsin value of quaternion q, arcsin could be calculated as:
+     * \f[\arcsin(q) = -\frac{\boldsymbol{v}}{||\boldsymbol{v}||}arcsinh(q\frac{\boldsymbol{v}}{||\boldsymbol{v}||})\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * asin(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> asin(const Quat<T> &q);
+
+    /**
+     * @brief return arcsin value of this quaternion, arcsin could be calculated as:
+     * \f[\arcsin(q) = -\frac{\boldsymbol{v}}{||\boldsymbol{v}||}arcsinh(q\frac{\boldsymbol{v}}{||\boldsymbol{v}||})\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.asin();
+     * ```
+     */
+    Quat<_Tp> asin() const;
+
+    /**
+     * @brief return arccos value of quaternion q, arccos could be calculated as:
+     * \f[\arccos(q) = -\frac{\boldsymbol{v}}{||\boldsymbol{v}||}arccosh(q)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * acos(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> acos(const Quat<T> &q);
+
+    /**
+     * @brief return arccos value of this quaternion, arccos could be calculated as:
+     * \f[\arccos(q) = -\frac{\boldsymbol{v}}{||\boldsymbol{v}||}arccosh(q)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.acos();
+     * ```
+     */
+    Quat<_Tp> acos() const;
+
+    /**
+     * @brief return arctan value of quaternion q, arctan could be calculated as:
+     * \f[\arctan(q) = -\frac{\boldsymbol{v}}{||\boldsymbol{v}||}arctanh(q\frac{\boldsymbol{v}}{||\boldsymbol{v}||})\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * atan(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> atan(const Quat<T> &q);
+
+    /**
+     * @brief return arctan value of this quaternion, arctan could be calculated as:
+     * \f[\arctan(q) = -\frac{\boldsymbol{v}}{||\boldsymbol{v}||}arctanh(q\frac{\boldsymbol{v}}{||\boldsymbol{v}||})\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.atan();
+     * ```
+     */
+    Quat<_Tp> atan() const;
+
+    /**
+     * @brief return arcsinh value of quaternion q, arcsinh could be calculated as:
+     * \f[arcsinh(q) = \ln(q + \sqrt{q^2 + 1})\f].
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * asinh(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> asinh(const Quat<T> &q);
+
+    /**
+     * @brief return arcsinh value of this quaternion, arcsinh could be calculated as:
+     * \f[arcsinh(q) = \ln(q + \sqrt{q^2 + 1})\f].
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.asinh();
+     * ```
+     */
+    Quat<_Tp> asinh() const;
+
+    /**
+     * @brief return arccosh value of quaternion q, arccosh could be calculated as:
+     * \f[arccosh(q) = \ln(q + \sqrt{q^2 - 1})\f].
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * acosh(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> acosh(const Quat<T> &q);
+
+    /**
+     * @brief return arccosh value of this quaternion, arccosh could be calculated as:
+     * \f[arccosh(q) = \ln(q + \sqrt{q^2 - 1})\f].
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.acosh();
+     * ```
+     */
+    Quat<_Tp> acosh() const;
+
+    /**
+     * @brief return arctanh value of quaternion q, arctanh could be calculated as:
+     * \f[arctanh(q) = \frac{\ln(q + 1) - \ln(1 - q)}{2}\f].
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * atanh(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> atanh(const Quat<T> &q);
+
+    /**
+     * @brief return the arctanh (inverse hyperbolic tangent) of this quaternion, which can be calculated as:
+     * \f[arctanh(q) = \frac{\ln(q + 1) - \ln(1 - q)}{2}.\f]
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.atanh();
+     * ```
+     */
+    Quat<_Tp> atanh() const;
+
+    /**
+     * @brief return true if this quaternion is a unit quaternion.
+     * @param eps tolerance used when checking normalization. The eps can be defined as
+     *
+     * \f[eps = |1 - dotValue|\f] where \f[dotValue = (this.w^2 + this.x^2 + this.y^2 + this.z^2).\f]
+     * This function considers the quaternion normalized when dotValue lies within the range \f$[1-eps, 1+eps]\f$.
+     */
+    bool isNormal(_Tp eps=CV_QUAT_EPS) const;
+
+    /**
+     * @brief throw an error if this quaternion is not a unit quaternion.
+     * @param eps tolerance used when checking normalization.
+     * @sa isNormal
+     */
+    void assertNormal(_Tp eps=CV_QUAT_EPS) const;
+
+    /**
+     * @brief transform a quaternion to a 3x3 rotation matrix.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion is assumed to be a unit quaternion and
+     * this function will save some computations. Otherwise, this function will normalize this
+     * quaternion first and then do the transformation.
+     *
+     * @note Matrix A which is to be rotated should have the form
+     * \f[\begin{bmatrix}
+     * x_0& x_1& x_2&...&x_n\\
+     * y_0& y_1& y_2&...&y_n\\
+     * z_0& z_1& z_2&...&z_n
+     * \end{bmatrix}\f]
+     * where the same subscript represents a point. The shape of A is assumed to be [3, n].
+     * The points matrix A can be rotated by toRotMat3x3() * A.
+     * The result has 3 rows and n columns too.
+     *
+     * For example
+     * ```
+     * double angle = CV_PI;
+     * Vec3d axis{0,0,1};
+     * Quatd q_unit = Quatd::createFromAngleAxis(angle, axis); //a quaternion could also be obtained by interpolating two or more quaternions.
+     *
+     * //assume there are two points (1,0,0) and (1,0,1) to be rotated
+     * Mat pointsA = (Mat_<double>(2, 3) << 1,0,0,1,0,1);
+     * //change the shape
+     * pointsA = pointsA.t();
+     * // rotate 180 degrees around the z axis
+     * Mat new_point = q_unit.toRotMat3x3() * pointsA;
+     * // print two points
+     * cout << new_point << endl;
+     * ```
+     */
+    Matx<_Tp, 3, 3> toRotMat3x3(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief transform a quaternion to a 4x4 rotation matrix.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion is assumed to be a unit quaternion and
+     * this function will save some computations. Otherwise, this function will normalize this
+     * quaternion first and then do the transformation.
+     *
+     * The operation is similar to toRotMat3x3
+     * except that the points matrix should have the form
+     * \f[\begin{bmatrix}
+     * x_0& x_1& x_2&...&x_n\\
+     * y_0& y_1& y_2&...&y_n\\
+     * z_0& z_1& z_2&...&z_n\\
+     * 0&0&0&...&0
+     * \end{bmatrix}\f]
+     *
+     * @sa toRotMat3x3
+     */
+
+    Matx<_Tp, 4, 4> toRotMat4x4(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief transform this quaternion to a Vec<_Tp, 4>.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.toVec();
+     * ```
+     */
+    Vec<_Tp, 4> toVec() const;
+
+    /**
+     * @brief transform this quaternion to a rotation vector.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion is assumed to be a unit quaternion and
+     * this function will save some computations.
+     * Rotation vector rVec is defined as:
+     * \f[ rVec = [\theta v_x, \theta v_y, \theta v_z]\f]
+     * where \f$\theta\f$ represents the rotation angle, and \f$\boldsymbol{v}\f$ represents the normalized rotation axis.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.toRotVec();
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * q.normalize().toRotVec(assumeUnit); //answer is the same as q.toRotVec().
+     * ```
+     */
+    Vec<_Tp, 3> toRotVec(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief get the angle of this quaternion, i.e. the rotation angle.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion is assumed to be a unit quaternion and
+     * this function will save some computations.
+     * \f[\psi = 2 *arccos(\frac{w}{||q||})\f]
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.getAngle();
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * q.normalize().getAngle(assumeUnit);//same as q.getAngle().
+     * ```
+     * @note It always returns a value in the range \f$[0, 2\pi]\f$.
+     */
+    _Tp getAngle(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief get the axis of this quaternion; it returns a vector of length 3.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion is assumed to be a unit quaternion and
+     * this function will save some computations.
+     *
+     * the unit axis \f$\boldsymbol{u}\f$ is defined by
+     * \f[\begin{equation}
+     *    \begin{split}
+     *      \boldsymbol{v}
+     *      &= \boldsymbol{u} ||\boldsymbol{v}||\\
+     *      &= \boldsymbol{u}||q||\sin(\frac{\theta}{2})
+     *    \end{split}
+     *    \end{equation}\f]
+     *  where \f$v=[x, y ,z]\f$ and \f$\theta\f$ represents the rotation angle.
+     *
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.getAxis();
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * q.normalize().getAxis(assumeUnit);//same as q.getAxis()
+     * ```
+     */
+    Vec<_Tp, 3> getAxis(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the dot product between quaternion \f$q\f$ and this quaternion.
+     *
+     * dot(p, q) is a good metric of how close the quaternions are.
+     * Indeed, consider the unit quaternion difference \f$p^{-1} * q\f$, its real part is dot(p, q).
+     * At the same time its real part is equal to \f$\cos(\beta/2)\f$ where \f$\beta\f$ is
+     * the angle of rotation between p and q.
+     * Therefore, the closer dot(p, q) is to 1,
+     * the smaller the rotation between them.
+     * \f[p \cdot q = p.w \cdot q.w + p.x \cdot q.x + p.y \cdot q.y + p.z \cdot q.z\f]
+     * @param q the other quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * Quatd p(5,6,7,8);
+     * p.dot(q);
+     * ```
+     */
+    _Tp dot(Quat<_Tp> q) const;
+
+    /**
+     * @brief To calculate the interpolation from \f$q_0\f$ to \f$q_1\f$ by Linear Interpolation(Lerp).
+     * For two quaternions, this interpolation curve can be displayed as:
+     * \f[Lerp(q_0, q_1, t) = (1 - t)q_0 + tq_1.\f]
+     * Obviously, the lerp will interpolate along a straight line if we think of \f$q_0\f$ and \f$q_1\f$ as vectors
+     * in a two-dimensional space. When \f$t = 0\f$, it returns \f$q_0\f$ and when \f$t= 1\f$, it returns \f$q_1\f$.
+     * \f$t\f$ should normally be in the range \f$[0, 1]\f$.
+     * @param q0 a quaternion used in linear interpolation.
+     * @param q1 a quaternion used in linear interpolation.
+     * @param t percent of vector \f$\overrightarrow{q_0q_1}\f$ over a range [0, 1].
+     * @note the result is in general not a unit quaternion.
+     */
+    static Quat<_Tp> lerp(const Quat<_Tp> &q0, const Quat &q1, const _Tp t);
+
+    /**
+     * @brief To calculate the interpolation from \f$q_0\f$ to \f$q_1\f$ by Normalized Linear Interpolation(Nlerp).
+     * It returns the normalized result of Linear Interpolation(Lerp).
+     * \f[ Nlerp(q_0, q_1, t) = \frac{(1 - t)q_0 + tq_1}{||(1 - t)q_0 + tq_1||}.\f]
+     * The interpolation will always choose the shortest path but constant speed is not guaranteed.
+     * @param q0 a quaternion used in normalized linear interpolation.
+     * @param q1 a quaternion used in normalized linear interpolation.
+     * @param t percent of vector \f$\overrightarrow{q_0q_1}\f$ over a range [0, 1].
+     * @param assumeUnit if QUAT_ASSUME_UNIT, all input quaternions are assumed to be unit quaternions. Otherwise, all input
+     * quaternions will be normalized inside the function.
+     * @sa lerp
+     */
+    static Quat<_Tp> nlerp(const Quat<_Tp> &q0, const Quat &q1, const _Tp t, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+    /**
+     @brief To calculate the interpolation between \f$q_0\f$ and \f$q_1\f$ by Spherical Linear
+     Interpolation(Slerp), which can be defined as:
+    \f[ Slerp(q_0, q_1, t) = \frac{\sin((1-t)\theta)}{\sin(\theta)}q_0 + \frac{\sin(t\theta)}{\sin(\theta)}q_1\f]
+    where \f$\theta\f$ can be calculated as:
+    \f[\theta=cos^{-1}(q_0\cdot q_1)\f]
+    since both quaternions have unit norm.
+    @param q0 a quaternion used in Slerp.
+    @param q1 a quaternion used in Slerp.
+    @param t percent of angle between \f$q_0\f$ and \f$q_1\f$ over a range [0, 1].
+    @param assumeUnit if QUAT_ASSUME_UNIT, all input quaternions are assumed to be unit quaternions. Otherwise, all input
+    quaternions will be normalized inside the function.
+    @param directChange if true, the interpolation will choose the nearest path.
+    @note If the interpolation angle is small, the error between Nlerp and Slerp is not so large. To improve efficiency and
+    avoid zero division error, we use Nlerp instead of Slerp in that case.
+    */
+    static Quat<_Tp> slerp(const Quat<_Tp> &q0, const Quat &q1, const _Tp t, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT, bool directChange=true);
+
+    /**
+     * @brief To calculate the interpolation between \f$q_0\f$,\f$q_1\f$,\f$q_2\f$,\f$q_3\f$  by Spherical and quadrangle(Squad). This could be defined as:
+     * \f[Squad(q_i, s_i, s_{i+1}, q_{i+1}, t) = Slerp(Slerp(q_i, q_{i+1}, t), Slerp(s_i, s_{i+1}, t), 2t(1-t))\f]
+     * where
+     * \f[s_i = q_i\exp(-\frac{\log(q^*_iq_{i+1}) + \log(q^*_iq_{i-1})}{4})\f]
+     *
+     * The Squad expression is analogous to the \f$B\acute{e}zier\f$ curve, but involves spherical linear
+     * interpolation instead of simple linear interpolation. Each \f$s_i\f$ needs to be calculated from three
+     * quaternions.
+     *
+     * @param q0 the first quaternion.
+     * @param s0 the second quaternion.
+     * @param s1 the third quaternion.
+     * @param q1 the fourth quaternion.
+     * @param t interpolation parameter of quadratic and linear interpolation over a range \f$[0, 1]\f$.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, all input quaternions are assumed to be unit quaternions. Otherwise, all input
+     * quaternions will be normalized inside the function.
+     * @param directChange if true, squad will find the nearest path to interpolate.
+     * @sa interPoint, spline
+     */
+    static Quat<_Tp> squad(const Quat<_Tp> &q0, const Quat<_Tp> &s0,
+                            const Quat<_Tp> &s1, const Quat<_Tp> &q1,
+                            const _Tp t, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT,
+                            bool directChange=true);
+
+    /**
+     * @brief This is a partial calculation of squad.
+     * To calculate the intermediate quaternion \f$s_i\f$ between each three quaternions
+     * \f[s_i = q_i\exp(-\frac{\log(q^*_iq_{i+1}) + \log(q^*_iq_{i-1})}{4}).\f]
+     * @param q0 the first quaternion.
+     * @param q1 the second quaternion.
+     * @param q2 the third quaternion.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, all input quaternions are assumed to be unit quaternions. Otherwise, all input
+     * quaternions will be normalized inside the function.
+     * @sa squad
+     */
+    static Quat<_Tp> interPoint(const Quat<_Tp> &q0, const Quat<_Tp> &q1,
+                                 const Quat<_Tp> &q2, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+    /**
+     * @brief to calculate a quaternion which is the result of a \f$C^1\f$ continuous
+     * spline curve constructed by squad at the ratio t. Here, the interpolation values are
+     * between \f$q_1\f$ and \f$q_2\f$. \f$q_0\f$ and \f$q_3\f$ are used to ensure the \f$C^1\f$
+     * continuity. If t = 0, it returns \f$q_1\f$, if t = 1, it returns \f$q_2\f$.
+     * @param q0 the first input quaternion to ensure \f$C^1\f$ continuity.
+     * @param q1 the second input quaternion.
+     * @param q2 the third input quaternion.
+     * @param q3 the fourth input quaternion, used in the same way as \f$q_0\f$.
+     * @param t ratio over a range [0, 1].
+     * @param assumeUnit if QUAT_ASSUME_UNIT, \f$q_0, q_1, q_2, q_3\f$ are assumed to be unit quaternions. Otherwise, all input
+     * quaternions will be normalized inside the function.
+     *
+     * For example:
+     *
+     * If there are three double quaternions \f$v_0, v_1, v_2\f$ waiting to be interpolated.
+     *
+     * Interpolation between \f$v_0\f$ and \f$v_1\f$ with a ratio \f$t_0\f$ could be calculated as
+     * ```
+     * Quatd::spline(v0, v0, v1, v2, t0);
+     * ```
+     * Interpolation between \f$v_1\f$ and \f$v_2\f$ with a ratio \f$t_0\f$ could be calculated as
+     * ```
+     * Quatd::spline(v0, v1, v2, v2, t0);
+     * ```
+     * @sa squad, slerp
+     */
+    static Quat<_Tp> spline(const Quat<_Tp> &q0, const Quat<_Tp> &q1,
+                            const Quat<_Tp> &q2, const Quat<_Tp> &q3,
+                            const _Tp t, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+    /**
+     * @brief Return the opposite quaternion \f$-p\f$,
+     * which satisfies \f$p + (-p) = 0.\f$
+     *
+     * For example
+     * ```
+     * Quatd q{1, 2, 3, 4};
+     * std::cout << -q << std::endl; // [-1, -2, -3, -4]
+     * ```
+     */
+    Quat<_Tp> operator-() const;
+
+    /**
+     * @brief return true if two quaternions p and q are nearly equal, i.e. when the absolute
+     * value of the difference between each \f$p_i\f$ and \f$q_i\f$ is less than CV_QUAT_EPS.
+     */
+    bool operator==(const Quat<_Tp>&) const;
+
+    /**
+     * @brief Addition operator of two quaternions p and q.
+     * It returns a new quaternion in which each value is the sum of \f$p_i\f$ and \f$q_i\f$.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * std::cout << p + q << std::endl; //[6, 8, 10, 12]
+     * ```
+     */
+    Quat<_Tp> operator+(const Quat<_Tp>&) const;
+
+    /**
+     * @brief Addition assignment operator of two quaternions p and q.
+     * It adds the right operand to the left operand and assigns the result to the left operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * p += q; // equivalent to p = p + q
+     * std::cout << p << std::endl; //[6, 8, 10, 12]
+     *
+     * ```
+     */
+    Quat<_Tp>& operator+=(const Quat<_Tp>&);
+
+    /**
+     * @brief Subtraction operator of two quaternions p and q.
+     * It returns a new quaternion in which each value is the sum of \f$p_i\f$ and \f$-q_i\f$.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * std::cout << p - q << std::endl; //[-4, -4, -4, -4]
+     * ```
+     */
+    Quat<_Tp> operator-(const Quat<_Tp>&) const;
+
+    /**
+     * @brief Subtraction assignment operator of two quaternions p and q.
+     * It subtracts the right operand from the left operand and assigns the result to the left operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * p -= q; // equivalent to p = p - q
+     * std::cout << p << std::endl; //[-4, -4, -4, -4]
+     *
+     * ```
+     */
+    Quat<_Tp>& operator-=(const Quat<_Tp>&);
+
+    /**
+     * @brief Multiplication assignment operator of two quaternions q and p.
+     * It multiplies the left operand by the right operand and assigns the result to the left operand.
+     *
+     * Rule of quaternion multiplication:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * q &= [p_0, \boldsymbol{u}]*[q_0, \boldsymbol{v}]\\
+     * &=[p_0q_0 - \boldsymbol{u}\cdot \boldsymbol{v}, p_0\boldsymbol{v} + q_0\boldsymbol{u}+ \boldsymbol{u}\times \boldsymbol{v}].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     * where \f$\cdot\f$ means dot product and \f$\times \f$ means cross product.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * p *= q; // equivalent to p = p * q
+     * std::cout << p << std::endl; //[-60, 12, 30, 24]
+     * ```
+     */
+    Quat<_Tp>& operator*=(const Quat<_Tp>&);
+
+    /**
+     * @brief Multiplication assignment operator of a quaternion and a scalar.
+     * It multiplies the left operand by the right operand and assigns the result to the left operand.
+     *
+     * Rule of quaternion multiplication with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * s &= [w, x, y, z] * s\\
+     * &=[w * s, x * s, y * s, z * s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double s = 2.0;
+     * p *= s; // equivalent to p = p * s
+     * std::cout << p << std::endl; //[2.0, 4.0, 6.0, 8.0]
+     * ```
+     * @note the type of the scalar should be the same as the element type of the quaternion.
+     */
+    Quat<_Tp>& operator*=(const _Tp s);
+
+    /**
+     * @brief Multiplication operator of two quaternions q and p.
+     * Multiplies the values on either side of the operator and returns the product.
+     *
+     * Rule of quaternion multiplication:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * q &= [p_0, \boldsymbol{u}]*[q_0, \boldsymbol{v}]\\
+     * &=[p_0q_0 - \boldsymbol{u}\cdot \boldsymbol{v}, p_0\boldsymbol{v} + q_0\boldsymbol{u}+ \boldsymbol{u}\times \boldsymbol{v}].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     * where \f$\cdot\f$ means dot product and \f$\times \f$ means cross product.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * std::cout << p * q << std::endl; //[-60, 12, 30, 24]
+     * ```
+     */
+    Quat<_Tp> operator*(const Quat<_Tp>&) const;
+
+    /**
+     * @brief Division operator of a quaternion and a scalar.
+     * It divides the left operand by the right operand and returns the result; the left operand is not modified.
+     *
+     * Rule of quaternion division with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / s &= [w, x, y, z] / s\\
+     * &=[w/s, x/s, y/s, z/s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double s = 2.0;
+     * Quatd r = p / s; // p itself is left unchanged
+     * std::cout << r << std::endl; //[0.5, 1, 1.5, 2]
+     * ```
+     * @note the type of the scalar should be the same as the element type of this quaternion.
+     */
+    Quat<_Tp> operator/(const _Tp s) const;
+
+    /**
+     * @brief Division operator of two quaternions p and q.
+     * Divides the left hand operand by the right hand operand.
+     *
+     * Rule of quaternion division with a quaternion:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / q &= p * q.inv()\\
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * std::cout << p / q << std::endl; // equivalent to p * q.inv()
+     * ```
+     */
+    Quat<_Tp> operator/(const Quat<_Tp>&) const;
+
+    /**
+     * @brief Division assignment operator of a quaternion and a scalar.
+     * It divides the left operand by the right operand and assigns the result to the left operand.
+     *
+     * Rule of quaternion division with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / s &= [w, x, y, z] / s\\
+     * &=[w / s, x / s, y / s, z / s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double s = 2.0;
+     * p /= s; // equivalent to p = p / s
+     * std::cout << p << std::endl; //[0.5, 1.0, 1.5, 2.0]
+     * ```
+     * @note the type of the scalar should be the same as the element type of the quaternion.
+     */
+    Quat<_Tp>& operator/=(const _Tp s);
+
+    /**
+     * @brief Division assignment operator of two quaternions p and q.
+     * It divides the left operand by the right operand and assigns the result to the left operand.
+     *
+     * Rule of quaternion division with a quaternion:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / q&= p * q.inv()\\
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * p /= q; // equivalent to p = p * q.inv()
+     * std::cout << p << std::endl;
+     * ```
+     */
+    Quat<_Tp>& operator/=(const Quat<_Tp>&);
+
+    _Tp& operator[](std::size_t n); // mutable indexed access; NOTE(review): presumably n in [0, 3] selects w, x, y, z - confirm against the definition
+
+    const _Tp& operator[](std::size_t n) const; // read-only overload of the indexed access above
+
+    /**
+     * @brief Subtraction operator of a scalar and a quaternion.
+     * Subtracts the right hand operand from the left hand operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double scalar = 2.0;
+     * std::cout << scalar - p << std::endl; //[1.0, -2, -3, -4]
+     * ```
+     * @note the type of the scalar should be the same as the element type of the quaternion.
+     */
+    template <typename T>
+    friend Quat<T> cv::operator-(const T s, const Quat<T>&);
+
+    /**
+     * @brief Subtraction operator of a quaternion and a scalar.
+     * Subtracts the right hand operand from the left hand operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double scalar = 2.0;
+     * std::cout << p - scalar << std::endl; //[-1.0, 2, 3, 4]
+     * ```
+     * @note the type of the scalar should be the same as the element type of the quaternion.
+     */
+    template <typename T>
+    friend Quat<T> cv::operator-(const Quat<T>&, const T s);
+
+    /**
+     * @brief Addition operator of a scalar and a quaternion.
+     * Adds the right hand operand to the left hand operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double scalar = 2.0;
+     * std::cout << scalar + p << std::endl; //[3.0, 2, 3, 4]
+     * ```
+     * @note the type of the scalar should be the same as the element type of the quaternion.
+     */
+    template <typename T>
+    friend Quat<T> cv::operator+(const T s, const Quat<T>&);
+
+    /**
+     * @brief Addition operator of a quaternion and a scalar.
+     * Adds the right hand operand to the left hand operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double scalar = 2.0;
+     * std::cout << p + scalar << std::endl; //[3.0, 2, 3, 4]
+     * ```
+     * @note the type of the scalar should be the same as the element type of the quaternion.
+     */
+    template <typename T>
+    friend Quat<T> cv::operator+(const Quat<T>&, const T s);
+
+    /**
+     * @brief Multiplication operator of a scalar and a quaternion.
+     * It multiplies the quaternion by the scalar and returns the result as a new quaternion.
+     *
+     * Rule of quaternion multiplication with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * s &= [w, x, y, z] * s\\
+     * &=[w * s, x * s, y * s, z * s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double s = 2.0;
+     * std::cout << s * p << std::endl; //[2.0, 4.0, 6.0, 8.0]
+     * ```
+     * @note the type of the scalar should be the same as the element type of the quaternion.
+     */
+    template <typename T>
+    friend Quat<T> cv::operator*(const T s, const Quat<T>&);
+
+    /**
+     * @brief Multiplication operator of a quaternion and a scalar.
+     * It multiplies the quaternion by the scalar and returns the result as a new quaternion.
+     *
+     * Rule of quaternion multiplication with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * s &= [w, x, y, z] * s\\
+     * &=[w * s, x * s, y * s, z * s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double s = 2.0;
+     * std::cout << p * s << std::endl; //[2.0, 4.0, 6.0, 8.0]
+     * ```
+     * @note the type of the scalar should be the same as the element type of the quaternion.
+     */
+    template <typename T>
+    friend Quat<T> cv::operator*(const Quat<T>&, const T s);
+
+    template <typename S> // stream insertion for quaternions, as used by the std::cout examples in this class
+    friend std::ostream& cv::operator<<(std::ostream&, const Quat<S>&);
+
+    /**
+     * @brief Transform a quaternion q to Euler angles.
+     *
+     *
+     * When transforming a quaternion \f$q = w + x\boldsymbol{i} + y\boldsymbol{j} + z\boldsymbol{k}\f$ to Euler angles, rotation matrix M can be calculated by:
+     * \f[ \begin{aligned} {M} &={\begin{bmatrix}1-2(y^{2}+z^{2})&2(xy-zw)&2(xz+yw)\\2(xy+zw)&1-2(x^{2}+z^{2})&2(yz-xw)\\2(xz-yw)&2(yz+xw)&1-2(x^{2}+y^{2})\end{bmatrix}}\end{aligned}.\f]
+     * On the other hand, the rotation matrix can be obtained from Euler angles.
+     * Using intrinsic rotations with Euler angles type XYZ as an example,
+     * \f$\theta_1 \f$, \f$\theta_2 \f$, \f$\theta_3 \f$ are three angles for Euler angles, the rotation matrix R can be calculated by:\f[R =X(\theta_1)Y(\theta_2)Z(\theta_3)
+     * ={\begin{bmatrix}\cos\theta_{2}\cos\theta_{3}&-\cos\theta_{2}\sin\theta_{3}&\sin\theta_{2}\\\cos\theta_{1}\sin\theta_{3}+\cos\theta_{3}\sin\theta_{1}\sin\theta_{2}&\cos\theta_{1}\cos\theta_{3}-\sin\theta_{1}\sin\theta_{2}\sin\theta_{3}&-\cos\theta_{2}\sin\theta_{1}\\\sin\theta_{1}\sin\theta_{3}-\cos\theta_{1}\cos\theta_{3}\sin\theta_{2}&\cos\theta_{3}\sin\theta_{1}+\cos\theta_{1}\sin\theta_{2}\sin\theta_{3}&\cos\theta_{1}\cos\theta_{2}\end{bmatrix}}\f]
+     * Rotation matrix M and R are equal. As long as \f$ s_{2} \neq 1 \f$, by comparing each element of two matrices ,the solution is\f$\begin{cases} \theta_1 = \arctan2(-m_{23},m_{33})\\\theta_2 = arcsin(m_{13}) \\\theta_3 = \arctan2(-m_{12},m_{11}) \end{cases}\f$.
+     *
+     * When \f$ s_{2}=1\f$ or \f$ s_{2}=-1\f$, the gimbal lock occurs. The function will prompt "WARNING: Gimbal Lock will occur. Euler angles is non-unique. For intrinsic rotations, we set the third angle to 0, and for external rotation, we set the first angle to 0.".
+     *
+     * When \f$ s_{2}=1\f$ ,
+     * The rotation matrix R is \f$R = {\begin{bmatrix}0&0&1\\\sin(\theta_1+\theta_3)&\cos(\theta_1+\theta_3)&0\\-\cos(\theta_1+\theta_3)&\sin(\theta_1+\theta_3)&0\end{bmatrix}}\f$.
+     *
+     * The number of solutions is infinite with the condition \f$\begin{cases} \theta_1+\theta_3 = \arctan2(m_{21},m_{22})\\ \theta_2=\pi/2 \end{cases}\ \f$.
+     *
+     * We set \f$ \theta_3 = 0\f$, the solution is \f$\begin{cases} \theta_1=\arctan2(m_{21},m_{22})\\ \theta_2=\pi/2\\ \theta_3=0 \end{cases}\f$.
+     *
+     * When \f$ s_{2}=-1\f$,
+     * The rotation matrix R is \f$X_{1}Y_{2}Z_{3}={\begin{bmatrix}0&0&-1\\-\sin(\theta_1-\theta_3)&\cos(\theta_1-\theta_3)&0\\\cos(\theta_1-\theta_3)&\sin(\theta_1-\theta_3)&0\end{bmatrix}}\f$.
+     *
+     * The number of solutions is infinite with the condition \f$\begin{cases} \theta_1-\theta_3 = \arctan2(m_{32},m_{22})\\ \theta_2=-\pi/2 \end{cases}\ \f$.
+     *
+     * We set \f$ \theta_3 = 0\f$, the solution is \f$ \begin{cases}\theta_1=\arctan2(m_{32},m_{22}) \\ \theta_2=-\pi/2\\  \theta_3=0\end{cases}\f$.
+     *
+     * Since \f$ sin \theta\in [-1,1] \f$ and \f$ cos \theta \in [-1,1] \f$, the unnormalized quaternion will cause computational troubles. For this reason, this function will normalize the quaternion at first and @ref QuatAssumeType is not needed.
+     *
+     * When the gimbal lock occurs, we set \f$\theta_3 = 0\f$ for intrinsic rotations or \f$\theta_1 = 0\f$ for extrinsic rotations.
+     *
+     * As a result, for every Euler angles type, we can get solution as shown in the following table.
+     * EulerAnglesType  | Ordinary | \f$\theta_2 = π/2\f$ | \f$\theta_2 = -π/2\f$
+     * ------------- | -------------| -------------| -------------
+     * INT_XYZ|\f$ \theta_1 = \arctan2(-m_{23},m_{33})\\\theta_2 = \arcsin(m_{13}) \\\theta_3= \arctan2(-m_{12},m_{11}) \f$|\f$ \theta_1=\arctan2(m_{21},m_{22})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(m_{32},m_{22})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$
+     * INT_XZY|\f$ \theta_1 = \arctan2(m_{32},m_{22})\\\theta_2 = -\arcsin(m_{12}) \\\theta_3= \arctan2(m_{13},m_{11}) \f$|\f$ \theta_1=\arctan2(m_{31},m_{33})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(-m_{23},m_{33})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$
+     * INT_YXZ|\f$ \theta_1 = \arctan2(m_{13},m_{33})\\\theta_2 = -\arcsin(m_{23}) \\\theta_3= \arctan2(m_{21},m_{22}) \f$|\f$ \theta_1=\arctan2(m_{12},m_{11})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(-m_{12},m_{11})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$
+     * INT_YZX|\f$ \theta_1 = \arctan2(-m_{31},m_{11})\\\theta_2 = \arcsin(m_{21}) \\\theta_3= \arctan2(-m_{23},m_{22}) \f$|\f$ \theta_1=\arctan2(m_{13},m_{33})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(m_{13},m_{12})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$
+     * INT_ZXY|\f$ \theta_1 = \arctan2(-m_{12},m_{22})\\\theta_2 = \arcsin(m_{32}) \\\theta_3= \arctan2(-m_{31},m_{33}) \f$|\f$ \theta_1=\arctan2(m_{21},m_{11})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(m_{21},m_{11})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$
+     * INT_ZYX|\f$ \theta_1 = \arctan2(m_{21},m_{11})\\\theta_2 = \arcsin(-m_{31}) \\\theta_3= \arctan2(m_{32},m_{33}) \f$|\f$ \theta_1=\arctan2(m_{23},m_{22})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(-m_{12},m_{22})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$
+     * EXT_XYZ|\f$ \theta_1 = \arctan2(m_{32},m_{33})\\\theta_2 = \arcsin(-m_{31}) \\\ \theta_3 = \arctan2(m_{21},m_{11})\f$|\f$ \theta_1= 0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{23},m_{22}) \f$|\f$ \theta_1=0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(-m_{12},m_{22}) \f$
+     * EXT_XZY|\f$ \theta_1 = \arctan2(-m_{23},m_{22})\\\theta_2 = \arcsin(m_{21}) \\\theta_3=  \arctan2(-m_{31},m_{11})\f$|\f$ \theta_1= 0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{13},m_{33}) \f$|\f$ \theta_1=0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(m_{13},m_{12}) \f$
+     * EXT_YXZ|\f$ \theta_1 = \arctan2(-m_{31},m_{33}) \\\theta_2 = \arcsin(m_{32}) \\\theta_3= \arctan2(-m_{12},m_{22})\f$|\f$ \theta_1= 0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{21},m_{11}) \f$|\f$ \theta_1=0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(m_{21},m_{11}) \f$
+     * EXT_YZX|\f$ \theta_1 = \arctan2(m_{13},m_{11})\\\theta_2 = -\arcsin(m_{12}) \\\theta_3= \arctan2(m_{32},m_{22})\f$|\f$ \theta_1= 0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{31},m_{33}) \f$|\f$ \theta_1=0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(-m_{23},m_{33}) \f$
+     * EXT_ZXY|\f$ \theta_1 = \arctan2(m_{21},m_{22})\\\theta_2 = -\arcsin(m_{23}) \\\theta_3= \arctan2(m_{13},m_{33})\f$|\f$ \theta_1= 0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{12},m_{11}) \f$|\f$ \theta_1= 0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(-m_{12},m_{11}) \f$
+     * EXT_ZYX|\f$ \theta_1 = \arctan2(-m_{12},m_{11})\\\theta_2 = \arcsin(m_{13}) \\\theta_3= \arctan2(-m_{23},m_{33})\f$|\f$ \theta_1=0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{21},m_{22}) \f$|\f$ \theta_1=0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(m_{32},m_{22}) \f$
+     *
+     *  EulerAnglesType  | Ordinary | \f$\theta_2 = 0\f$ | \f$\theta_2 = π\f$
+     * ------------- | -------------| -------------| -------------
+     * INT_XYX| \f$ \theta_1 = \arctan2(m_{21},-m_{31})\\\theta_2 =\arccos(m_{11}) \\\theta_3 = \arctan2(m_{12},m_{13}) \f$| \f$ \theta_1=\arctan2(m_{32},m_{33})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(m_{23},m_{22})\\ \theta_2=\pi\\ \theta_3=0 \f$
+     * INT_XZX| \f$ \theta_1 = \arctan2(m_{31},m_{21})\\\theta_2 = \arccos(m_{11}) \\\theta_3 = \arctan2(m_{13},-m_{12}) \f$| \f$ \theta_1=\arctan2(m_{32},m_{33})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(-m_{32},m_{33})\\ \theta_2=\pi\\ \theta_3=0 \f$
+     * INT_YXY| \f$ \theta_1 = \arctan2(m_{12},m_{32})\\\theta_2 = \arccos(m_{22}) \\\theta_3 = \arctan2(m_{21},-m_{23}) \f$| \f$ \theta_1=\arctan2(m_{13},m_{11})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(-m_{31},m_{11})\\ \theta_2=\pi\\ \theta_3=0 \f$
+     * INT_YZY| \f$ \theta_1 = \arctan2(m_{32},-m_{12})\\\theta_2 = \arccos(m_{22}) \\\theta_3 =\arctan2(m_{23},m_{21}) \f$| \f$ \theta_1=\arctan2(m_{13},m_{11})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(m_{13},-m_{11})\\ \theta_2=\pi\\ \theta_3=0 \f$
+     * INT_ZXZ| \f$ \theta_1 = \arctan2(-m_{13},m_{23})\\\theta_2 = \arccos(m_{33}) \\\theta_3 =\arctan2(m_{31},m_{32}) \f$| \f$ \theta_1=\arctan2(m_{21},m_{22})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(m_{21},m_{11})\\ \theta_2=\pi\\ \theta_3=0 \f$
+     * INT_ZYZ| \f$ \theta_1 = \arctan2(m_{23},m_{13})\\\theta_2 = \arccos(m_{33}) \\\theta_3 = \arctan2(m_{32},-m_{31}) \f$| \f$ \theta_1=\arctan2(m_{21},m_{11})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(m_{21},m_{11})\\ \theta_2=\pi\\ \theta_3=0 \f$
+     * EXT_XYX| \f$ \theta_1 = \arctan2(m_{12},m_{13}) \\\theta_2 = \arccos(m_{11}) \\\theta_3 = \arctan2(m_{21},-m_{31})\f$| \f$ \theta_1=0\\ \theta_2=0\\ \theta_3=\arctan2(m_{32},m_{33}) \f$| \f$ \theta_1= 0\\ \theta_2=\pi\\ \theta_3= \arctan2(m_{23},m_{22}) \f$
+     * EXT_XZX| \f$ \theta_1 = \arctan2(m_{13},-m_{12})\\\theta_2 = \arccos(m_{11}) \\\theta_3 = \arctan2(m_{31},m_{21})\f$| \f$ \theta_1= 0\\ \theta_2=0\\ \theta_3=\arctan2(m_{32},m_{33}) \f$| \f$ \theta_1= 0\\ \theta_2=\pi\\ \theta_3=\arctan2(-m_{32},m_{33}) \f$
+     * EXT_YXY| \f$ \theta_1 = \arctan2(m_{21},-m_{23})\\\theta_2 = \arccos(m_{22}) \\\theta_3 = \arctan2(m_{12},m_{32}) \f$| \f$ \theta_1= 0\\ \theta_2=0\\ \theta_3=\arctan2(m_{13},m_{11}) \f$| \f$ \theta_1= 0\\ \theta_2=\pi\\ \theta_3=\arctan2(-m_{31},m_{11}) \f$
+     * EXT_YZY| \f$ \theta_1 = \arctan2(m_{23},m_{21}) \\\theta_2 = \arccos(m_{22}) \\\theta_3 = \arctan2(m_{32},-m_{12}) \f$| \f$ \theta_1= 0\\ \theta_2=0\\ \theta_3=\arctan2(m_{13},m_{11}) \f$| \f$ \theta_1=0\\ \theta_2=\pi\\ \theta_3=\arctan2(m_{13},-m_{11}) \f$
+     * EXT_ZXZ| \f$ \theta_1 = \arctan2(m_{31},m_{32}) \\\theta_2 = \arccos(m_{33}) \\\theta_3 = \arctan2(-m_{13},m_{23})\f$| \f$ \theta_1=0\\ \theta_2=0\\ \theta_3=\arctan2(m_{21},m_{22}) \f$| \f$ \theta_1= 0\\ \theta_2=\pi\\ \theta_3=\arctan2(m_{21},m_{11}) \f$
+     * EXT_ZYZ| \f$ \theta_1 = \arctan2(m_{32},-m_{31})\\\theta_2 = \arccos(m_{33}) \\\theta_3 = \arctan2(m_{23},m_{13}) \f$| \f$ \theta_1=0\\ \theta_2=0\\ \theta_3=\arctan2(m_{21},m_{11}) \f$| \f$ \theta_1= 0\\ \theta_2=\pi\\ \theta_3=\arctan2(m_{21},m_{11}) \f$
+     *
+     * @param eulerAnglesType the Euler angles type used for the conversion
+     */
+
+    Vec<_Tp, 3> toEulerAngles(QuatEnum::EulerAnglesType eulerAnglesType);
+
+    _Tp w, x, y, z;
+
+};
+
+template <typename T>
+Quat<T> inv(const Quat<T> &q, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);  // forwards to q.inv(assumeUnit)
+
+template <typename T>
+Quat<T> sinh(const Quat<T> &q);  // forwards to q.sinh()
+
+template <typename T>
+Quat<T> cosh(const Quat<T> &q);  // forwards to q.cosh()
+
+template <typename T>
+Quat<T> tanh(const Quat<T> &q);  // forwards to q.tanh()
+
+template <typename T>
+Quat<T> sin(const Quat<T> &q);  // forwards to q.sin()
+
+template <typename T>
+Quat<T> cos(const Quat<T> &q);  // forwards to q.cos()
+
+template <typename T>
+Quat<T> tan(const Quat<T> &q);  // forwards to q.tan()
+
+template <typename T>
+Quat<T> asinh(const Quat<T> &q);  // forwards to q.asinh()
+
+template <typename T>
+Quat<T> acosh(const Quat<T> &q);  // forwards to q.acosh()
+
+template <typename T>
+Quat<T> atanh(const Quat<T> &q);  // forwards to q.atanh()
+
+template <typename T>
+Quat<T> asin(const Quat<T> &q);  // forwards to q.asin()
+
+template <typename T>
+Quat<T> acos(const Quat<T> &q);  // forwards to q.acos()
+
+template <typename T>
+Quat<T> atan(const Quat<T> &q);  // forwards to q.atan()
+
+template <typename T>
+Quat<T> power(const Quat<T> &q, const Quat<T> &p, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);  // first argument raised to the second: q.power(p, assumeUnit)
+
+template <typename T>
+Quat<T> exp(const Quat<T> &q);  // forwards to q.exp()
+
+template <typename T>
+Quat<T> log(const Quat<T> &q, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);  // forwards to q.log(assumeUnit)
+
+template <typename T>
+Quat<T> power(const Quat<T>& q, const T x, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);  // scalar exponent: q.power(x, assumeUnit)
+
+template <typename T>
+Quat<T> crossProduct(const Quat<T> &p, const Quat<T> &q);  // forwards to p.crossProduct(q)
+
+template <typename S>
+Quat<S> sqrt(const Quat<S> &q, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);  // forwards to q.sqrt(assumeUnit)
+
+template <typename T>
+Quat<T> operator*(const T, const Quat<T>&);  // scalar * quaternion: scales all four coefficients
+
+template <typename T>
+Quat<T> operator*(const Quat<T>&, const T);  // quaternion * scalar: scales all four coefficients
+
+template <typename S>
+std::ostream& operator<<(std::ostream&, const Quat<S>&);  // writes "Quat " followed by the (w, x, y, z) vector
+
+using Quatd = Quat<double>;  // double-precision quaternion
+using Quatf = Quat<float>;   // single-precision quaternion
+
+//! @} core
+}
+
+#include "opencv2/core/quaternion.inl.hpp"
+
+#endif /* OPENCV_CORE_QUATERNION_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/quaternion.inl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/quaternion.inl.hpp
new file mode 100644
index 000000000000..4204806a823e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/quaternion.inl.hpp
@@ -0,0 +1,1063 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2020, Huawei Technologies Co., Ltd. All rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: Liangqian Kong <chargerKong@126.com>
+//         Longbu Wang <riskiest@gmail.com>
+
+#ifndef OPENCV_CORE_QUATERNION_INL_HPP
+#define OPENCV_CORE_QUATERNION_INL_HPP
+
+#ifndef OPENCV_CORE_QUATERNION_HPP
+#error This is not a standalone header. Include quaternion.hpp instead.
+#endif
+
+//@cond IGNORE
+///////////////////////////////////////////////////////////////////////////////////////
+//Implementation
+namespace cv {
+
+// Default constructor: the zero quaternion (w = x = y = z = 0).
+template <typename T> Quat<T>::Quat() : w(0), x(0), y(0), z(0) {}
+
+// Builds a quaternion from a 4-vector whose elements are taken in (w, x, y, z) order.
+template <typename T> Quat<T>::Quat(const Vec<T, 4> &coeff) : w(coeff[0]), x(coeff[1]), y(coeff[2]), z(coeff[3]) {}
+
+// Builds a quaternion directly from its four scalar coefficients.
+template <typename T> Quat<T>::Quat(const T qw, const T qx, const T qy, const T qz) : w(qw), x(qx), y(qy), z(qz) {}
+
+// Creates the quaternion describing a rotation of `angle` about `axis`.
+// `angle` is in radians (it is passed to std::cos / std::sin after halving).
+// `axis` need not be unit length, but a (near-)zero axis raises StsBadArg.
+template <typename T>
+Quat<T> Quat<T>::createFromAngleAxis(const T angle, const Vec<T, 3> &axis)
+{
+    const T axisLen = std::sqrt(axis.dot(axis));
+    if (axisLen < CV_QUAT_EPS)
+    {
+        CV_Error(Error::StsBadArg, "this quaternion does not represent a rotation");
+    }
+
+    // q = (cos(angle/2), sin(angle/2) * axis / |axis|)
+    const T halfAngle = angle * T(0.5);
+    const T scale = std::sin(halfAngle) / axisLen;
+    return Quat<T>(std::cos(halfAngle),
+                   scale * axis[0], scale * axis[1], scale * axis[2]);
+}
+
+template <typename T>
+Quat<T> Quat<T>::createFromRotMat(InputArray _R)
+{   // Converts a 3x3 rotation matrix to a quaternion.
+    CV_CheckTypeEQ(_R.type(), cv::traits::Type<T>::value, "");  // element type must match T
+    if (_R.rows() != 3 || _R.cols() != 3)
+    {
+        CV_Error(Error::StsBadArg, "Cannot convert matrix to quaternion: rotation matrix should be a 3x3 matrix");
+    }
+    Matx<T, 3, 3> R;
+    _R.copyTo(R);
+    // Four cases follow. In each one S is 4x the magnitude of the component set directly from it.
+    T S, w, x, y, z;
+    T trace = R(0, 0) + R(1, 1) + R(2, 2);
+    if (trace > 0)
+    {   // positive trace: w is set from S, x/y/z from off-diagonal differences
+        S = std::sqrt(trace + 1) * T(2);
+        x = (R(1, 2) - R(2, 1)) / S;
+        y = (R(2, 0) - R(0, 2)) / S;
+        z = (R(0, 1) - R(1, 0)) / S;
+        w = -T(0.25) * S;  // NOTE(review): negative w here; q and -q give the same toRotMat3x3() result
+    }
+    else if (R(0, 0) > R(1, 1) && R(0, 0) > R(2, 2))
+    {
+        // R(0, 0) is the largest diagonal entry: x is set from S
+        S = std::sqrt(T(1.0) + R(0, 0) - R(1, 1) - R(2, 2)) * T(2);
+        x = -T(0.25) * S;  // NOTE(review): this branch also returns the negated-sign representative
+        y = -(R(1, 0) + R(0, 1)) / S;
+        z = -(R(0, 2) + R(2, 0)) / S;
+        w = (R(1, 2) - R(2, 1)) / S;
+    }
+    else if (R(1, 1) > R(2, 2))
+    {   // R(1, 1) is the largest diagonal entry: y is set from S
+        S = std::sqrt(T(1.0) - R(0, 0) + R(1, 1) - R(2, 2)) * T(2);
+        x = (R(0, 1) + R(1, 0)) / S;
+        y = T(0.25) * S;
+        z = (R(1, 2) + R(2, 1)) / S;
+        w = (R(0, 2) - R(2, 0)) / S;
+    }
+    else
+    {   // otherwise z is set from S
+        S = std::sqrt(T(1.0) - R(0, 0) - R(1, 1) + R(2, 2)) * T(2);
+        x = (R(0, 2) + R(2, 0)) / S;
+        y = (R(1, 2) + R(2, 1)) / S;
+        z = T(0.25) * S;
+        w = -(R(0, 1) - R(1, 0)) / S;
+    }
+    return Quat<T> (w, x, y, z);
+}
+
+// Converts a rotation vector (unit axis scaled by the rotation angle) to a quaternion.
+template <typename T>
+Quat<T> Quat<T>::createFromRvec(InputArray _rvec)
+{
+    const bool isColumn = _rvec.cols() == 1 && _rvec.rows() == 3;
+    const bool isRow = _rvec.cols() == 3 && _rvec.rows() == 1;
+    if (!isColumn && !isRow)
+        CV_Error(Error::StsBadArg, "Cannot convert rotation vector to quaternion: The length of rotation vector should be 3");
+
+    Vec<T, 3> rotVec;
+    _rvec.copyTo(rotVec);
+    const T theta = std::sqrt(rotVec.dot(rotVec));  // rotation angle = |rvec|
+    // A (near-)zero vector carries no rotation: return the identity quaternion.
+    return abs(theta) < CV_QUAT_EPS ? Quat<T>(1, 0, 0, 0) : createFromAngleAxis(theta, rotVec / theta);
+}
+
+// Unary minus: negates all four coefficients.
+template <typename T>
+inline Quat<T> Quat<T>::operator-() const {
+    return Quat<T>(-w, -x, -y, -z);
+}
+
+
+template <typename T>
+inline bool Quat<T>::operator==(const Quat<T> &q) const {  // approximate: per-coefficient tolerance CV_QUAT_EPS
+    return abs(w - q.w) < CV_QUAT_EPS && abs(x - q.x) < CV_QUAT_EPS &&
+           abs(y - q.y) < CV_QUAT_EPS && abs(z - q.z) < CV_QUAT_EPS;
+}
+
+// Component-wise sum of two quaternions.
+template <typename T>
+inline Quat<T> Quat<T>::operator+(const Quat<T> &q1) const {
+    return Quat<T>(w + q1.w, x + q1.x, y + q1.y, z + q1.z);
+}
+
+// Scalar + quaternion: the scalar is added to the real part w only.
+template <typename T>
+inline Quat<T> operator+(const T a, const Quat<T>& q) {
+    return Quat<T>(q.w + a, q.x, q.y, q.z);
+}
+
+// Quaternion + scalar: the scalar is added to the real part w only.
+template <typename T>
+inline Quat<T> operator+(const Quat<T>& q, const T a) {
+    return Quat<T>(q.w + a, q.x, q.y, q.z);
+}
+
+// Scalar - quaternion: (a - w, -x, -y, -z).
+template <typename T>
+inline Quat<T> operator-(const T a, const Quat<T>& q) {
+    return Quat<T>(a - q.w, -q.x, -q.y, -q.z);
+}
+
+// Quaternion - scalar: the scalar is subtracted from the real part w only.
+template <typename T>
+inline Quat<T> operator-(const Quat<T>& q, const T a) {
+    return Quat<T>(q.w - a, q.x, q.y, q.z);
+}
+
+// Component-wise difference of two quaternions.
+template <typename T>
+inline Quat<T> Quat<T>::operator-(const Quat<T> &q1) const {
+    return Quat<T>(w - q1.w, x - q1.x, y - q1.y, z - q1.z);
+}
+
+// In-place component-wise addition; returns *this.
+template <typename T>
+inline Quat<T>& Quat<T>::operator+=(const Quat<T> &q1) {
+    w += q1.w;
+    x += q1.x;
+    y += q1.y;
+    z += q1.z;
+    return *this;
+}
+
+// In-place component-wise subtraction; returns *this.
+template <typename T>
+inline Quat<T>& Quat<T>::operator-=(const Quat<T> &q1) {
+    w -= q1.w;
+    x -= q1.x;
+    y -= q1.y;
+    z -= q1.z;
+    return *this;
+}
+
+// Quaternion product, delegated to operator* on Vec<T, 4> (defined outside this file).
+template <typename T>
+inline Quat<T> Quat<T>::operator*(const Quat<T> &q1) const {
+    const Vec<T, 4> lhs{w, x, y, z};
+    const Vec<T, 4> rhs{q1.w, q1.x, q1.y, q1.z};
+    return Quat<T>(lhs * rhs);
+}
+
+
+// Quaternion * scalar: scales every coefficient of q1 by a.
+template <typename T>
+Quat<T> operator*(const Quat<T> &q1, const T a) {
+    return Quat<T>(a * q1.w, a * q1.x, a * q1.y, a * q1.z);
+}
+
+// Scalar * quaternion: scales every coefficient of q1 by a.
+template <typename T>
+Quat<T> operator*(const T a, const Quat<T> &q1) {
+    return Quat<T>(a * q1.w, a * q1.x, a * q1.y, a * q1.z);
+}
+
+// In-place Hamilton product: *this = *this * q1.
+// All four results are computed into temporaries before any member is
+// overwritten, because each one depends on all four current coefficients.
+template <typename T>
+inline Quat<T>& Quat<T>::operator*=(const Quat<T> &q1)
+{
+    const T newW = w * q1.w - x * q1.x - y * q1.y - z * q1.z;
+    const T newX = x * q1.w + w * q1.x + y * q1.z - z * q1.y;
+    const T newY = y * q1.w + w * q1.y + z * q1.x - x * q1.z;
+    const T newZ = z * q1.w + w * q1.z + x * q1.y - y * q1.x;
+    w = newW;  x = newX;
+    y = newY;  z = newZ;
+    return *this;
+}
+
+// In-place right division: *this = *this * q1.inv().
+// The quotient is built in a temporary first and then copied back.
+template <typename T>
+inline Quat<T>& Quat<T>::operator/=(const Quat<T> &q1)
+{
+    const Quat<T> quotient(*this * q1.inv());
+    w = quotient.w;  x = quotient.x;
+    y = quotient.y;  z = quotient.z;
+    return *this;
+}
+// In-place scaling of all four coefficients by the scalar q1; returns *this.
+template <typename T>
+Quat<T>& Quat<T>::operator*=(const T q1) {
+    w *= q1;
+    x *= q1;
+    y *= q1;
+    z *= q1;
+    return *this;
+}
+
+// In-place division by a scalar, done as multiplication by the reciprocal.
+template <typename T>
+inline Quat<T>& Quat<T>::operator/=(const T a) {
+    // T(1.0) keeps the reciprocal in T, matching operator/(const T); the bare
+    // literal 1.0 forced a double division followed by an implicit narrowing.
+    const T a_inv = T(1.0) / a;
+    w *= a_inv;  x *= a_inv;
+    y *= a_inv;  z *= a_inv;
+    return *this;
+}
+
+// Division by a scalar, done as multiplication by the reciprocal 1/a.
+template <typename T>
+inline Quat<T> Quat<T>::operator/(const T a) const {
+    const T reciprocal = T(1.0) / a;
+    return Quat<T>(w * reciprocal, x * reciprocal, y * reciprocal, z * reciprocal);
+}
+
+// Right division: multiplies by the inverse of q.
+template <typename T>
+inline Quat<T> Quat<T>::operator/(const Quat<T> &q) const {
+    return *this * q.inv();
+}
+
+// Read-only indexed access in (w, x, y, z) order.
+// Indices 0..3 are valid; anything else raises StsOutOfRange.
+template <typename T>
+inline const T& Quat<T>::operator[](std::size_t n) const
+{
+    if (n == 0)
+        return w;
+    if (n == 1)
+        return x;
+    if (n == 2)
+        return y;
+    if (n == 3)
+        return z;
+    // No return follows: the function relies on CV_Error not returning normally.
+    CV_Error(Error::StsOutOfRange, "subscript exceeds the index range");
+}
+
+// Mutable indexed access in (w, x, y, z) order.
+// Indices 0..3 are valid; anything else raises StsOutOfRange.
+template <typename T>
+inline T& Quat<T>::operator[](std::size_t n)
+{
+    if (n == 0)
+        return w;
+    if (n == 1)
+        return x;
+    if (n == 2)
+        return y;
+    if (n == 3)
+        return z;
+    // No return follows: the function relies on CV_Error not returning normally.
+    CV_Error(Error::StsOutOfRange, "subscript exceeds the index range");
+}
+
+template <typename T>
+std::ostream & operator<<(std::ostream &os, const Quat<T> &q) {  // writes "Quat " then the coefficients as a Vec<T, 4>
+    const Vec<T, 4> coeffs{q.w, q.x, q.y, q.z};
+    os << "Quat " << coeffs;
+    return os;
+}
+
+// By-value element access; same indexing and range check as operator[].
+template <typename T>
+inline T Quat<T>::at(size_t index) const {
+    return (*this)[index];
+}
+
+// Conjugate: keeps w and negates the vector part (x, y, z).
+template <typename T>
+inline Quat<T> Quat<T>::conjugate() const {
+    return Quat<T>(w, -x, -y, -z);
+}
+
+// Euclidean norm: square root of the quaternion's dot product with itself.
+template <typename T>
+inline T Quat<T>::norm() const {
+    return std::sqrt(dot(*this));
+}
+
+// Free-function form of Quat<T>::exp().
+template <typename T>
+Quat<T> exp(const Quat<T> &q) {
+    return q.exp();
+}
+
+// exp(q) = e^w * (cos|v|, (sin|v| / |v|) v) with v = (x, y, z); sin|v|/|v| is taken as 1 when |v| ~ 0.
+template <typename T>
+Quat<T> Quat<T>::exp() const {
+    const Vec<T, 3> vec{x, y, z};
+    const T len = std::sqrt(vec.dot(vec));
+    const T sinc = len < CV_QUAT_EPS ? 1 : std::sin(len) / len;
+    return std::exp(w) * Quat<T>(std::cos(len), vec[0] * sinc, vec[1] * sinc, vec[2] * sinc);
+}
+
+// Free-function form of Quat<T>::log().
+template <typename T>
+Quat<T> log(const Quat<T> &q, QuatAssumeType assumeUnit) {
+    return q.log(assumeUnit);
+}
+
+// Quaternion logarithm: (ln|q|, acos(w / |q|) * v / |v|) with v = (x, y, z).
+// With assumeUnit set, |q| is taken as 1, so the real part is 0 and no norm check is made.
+template <typename T>
+Quat<T> Quat<T>::log(QuatAssumeType assumeUnit) const
+{
+    const Vec<T, 3> vec{x, y, z};
+    const T vecLen = std::sqrt(vec.dot(vec));
+    if (assumeUnit)
+    {
+        const T scale = vecLen < CV_QUAT_EPS ? 1 : std::acos(w) / vecLen;
+        return Quat<T>(0, vec[0] * scale, vec[1] * scale, vec[2] * scale);
+    }
+    const T len = norm();
+    if (len < CV_QUAT_EPS)
+        CV_Error(Error::StsBadArg, "Cannot apply this quaternion to log function: undefined");
+    const T scale = vecLen < CV_QUAT_EPS ? 1 : std::acos(w / len) / vecLen;
+    return Quat<T>(std::log(len), vec[0] * scale, vec[1] * scale, vec[2] * scale);
+}
+
+// Free-function form of Quat<T>::power() with a scalar exponent.
+template <typename T>
+inline Quat<T> power(const Quat<T> &q1, const T alpha, QuatAssumeType assumeUnit) {
+    return q1.power(alpha, assumeUnit);
+}
+
+// Raises the quaternion to a real power: q^alpha = |q|^alpha * (cos(alpha*theta/2), sin(alpha*theta/2) * axis).
+// A (near-)real quaternion has no usable axis; it is returned as |q|^alpha * q/|q|. The previous
+// |q|^alpha * q carried an extra factor |q| (e.g. the square root of (4,0,0,0) came out as (8,0,0,0)).
+template <typename T>
+inline Quat<T> Quat<T>::power(const T alpha, QuatAssumeType assumeUnit) const
+{
+    if (x * x + y * y + z * z > CV_QUAT_EPS)
+    {
+        const T angle = getAngle(assumeUnit);
+        const Vec<T, 3> axis = getAxis(assumeUnit);
+        const Quat<T> rotated = createFromAngleAxis(alpha * angle, axis);
+        return assumeUnit ? rotated : std::pow(norm(), alpha) * rotated;
+    }
+    // The sign of a negative real part is kept as before (no axis is invented for it), and
+    // quaternions whose norm is below CV_QUAT_EPS keep the previous formula to avoid dividing by ~0.
+    const T len = norm();
+    return len < CV_QUAT_EPS ? std::pow(len, alpha) * (*this) : (std::pow(len, alpha) / len) * (*this);
+}
+
+
+// Free-function form of Quat<T>::sqrt().
+template <typename T>
+inline Quat<T> sqrt(const Quat<T> &q, QuatAssumeType assumeUnit) {
+    return q.sqrt(assumeUnit);
+}
+
+// Square root, expressed as the power 1/2.
+template <typename T>
+inline Quat<T> Quat<T>::sqrt(QuatAssumeType assumeUnit) const {
+    return power(T(0.5), assumeUnit);
+}
+
+
+// Free-function form of Quat<T>::power() with a quaternion exponent: p raised to q.
+template <typename T>
+inline Quat<T> power(const Quat<T> &p, const Quat<T> &q, QuatAssumeType assumeUnit) {
+    return p.power(q, assumeUnit);
+}
+
+
+// Quaternion exponent: exp(q * log(*this)); assumeUnit is passed to log().
+template <typename T>
+inline Quat<T> Quat<T>::power(const Quat<T> &q, QuatAssumeType assumeUnit) const {
+    return cv::exp(q * log(assumeUnit));
+}
+
+// Four-dimensional dot product of the coefficient tuples.
+template <typename T>
+inline T Quat<T>::dot(Quat<T> q1) const {
+    return w * q1.w + x * q1.x + y * q1.y + z * q1.z;
+}
+
+
+// Free-function form of Quat<T>::crossProduct().
+template <typename T>
+inline Quat<T> crossProduct(const Quat<T> &p, const Quat<T> &q) {
+    return p.crossProduct(q);
+}
+
+
+// 3-D cross product of the two vector parts, returned with a zero real part.
+template <typename T>
+inline Quat<T> Quat<T>::crossProduct(const Quat<T> &q) const {
+    return Quat<T> (0, y * q.z - z * q.y, z * q.x - x * q.z, x * q.y - q.x * y);
+}
+
+// Returns this quaternion scaled to unit norm.
+// Raises StsBadArg when the norm is below CV_QUAT_EPS.
+template <typename T>
+inline Quat<T> Quat<T>::normalize() const
+{
+    const T len = norm();
+    if (len < CV_QUAT_EPS)
+        CV_Error(Error::StsBadArg, "Cannot normalize this quaternion: the norm is too small.");
+    return Quat<T>(w / len, x / len, y / len, z / len) ;
+}
+
+// Free-function form of Quat<T>::inv().
+template <typename T>
+inline Quat<T> inv(const Quat<T> &q, QuatAssumeType assumeUnit) {
+    return q.inv(assumeUnit);
+}
+
+
+// Multiplicative inverse: conjugate / |q|^2 (just the conjugate when assumeUnit is set).
+// Raises StsBadArg when the squared norm is below CV_QUAT_EPS.
+template <typename T>
+inline Quat<T> Quat<T>::inv(QuatAssumeType assumeUnit) const
+{
+    if (assumeUnit)
+        return conjugate();
+
+    const T sqNorm = dot(*this);
+    if (sqNorm < CV_QUAT_EPS)
+        CV_Error(Error::StsBadArg, "This quaternion do not have inverse quaternion");
+
+    return conjugate() / sqNorm;
+}
+
+// Free-function form of Quat<T>::sinh().
+template <typename T>
+inline Quat<T> sinh(const Quat<T> &q) {
+    return q.sinh();
+}
+
+
+// sinh(w + v) = sinh(w) cos|v| + cosh(w) (sin|v| / |v|) v; sin|v|/|v| -> 1 as |v| -> 0, so k tends to cosh(w), not 1.
+template <typename T>
+inline Quat<T> Quat<T>::sinh() const {
+    const Vec<T, 3> v{x, y ,z};
+    const T vNorm = std::sqrt(v.dot(v));
+    const T k = vNorm < CV_QUAT_EPS ? std::cosh(w) : std::cosh(w) * std::sin(vNorm) / vNorm;
+    return Quat<T>(std::sinh(w) * std::cos(vNorm), v[0] * k, v[1] * k, v[2] * k);
+}
+
+
+// Free-function form of Quat<T>::cosh().
+template <typename T>
+inline Quat<T> cosh(const Quat<T> &q) {
+    return q.cosh();
+}
+
+
+// cosh(w + v) = cosh(w) cos|v| + sinh(w) (sin|v| / |v|) v; sin|v|/|v| -> 1 as |v| -> 0, so k tends to sinh(w), not 1.
+template <typename T>
+inline Quat<T> Quat<T>::cosh() const {
+    const Vec<T, 3> v{x, y ,z};
+    const T vNorm = std::sqrt(v.dot(v));
+    const T k = vNorm < CV_QUAT_EPS ? std::sinh(w) : std::sinh(w) * std::sin(vNorm) / vNorm;
+    return Quat<T>(std::cosh(w) * std::cos(vNorm), v[0] * k, v[1] * k, v[2] * k);
+}
+
+// Free-function form of Quat<T>::tanh().
+template <typename T>
+inline Quat<T> tanh(const Quat<T> &q) {
+    return q.tanh();
+}
+
+// tanh(q) = sinh(q) * cosh(q)^-1.
+template <typename T>
+inline Quat<T> Quat<T>::tanh() const {
+    return sinh() * cosh().inv();
+}
+
+
+// Free-function form of Quat<T>::sin().
+template <typename T>
+inline Quat<T> sin(const Quat<T> &q) {
+    return q.sin();
+}
+
+
+// sin(w + v) = sin(w) cosh|v| + cos(w) (sinh|v| / |v|) v; sinh|v|/|v| -> 1 as |v| -> 0, so k tends to cos(w), not 1.
+template <typename T>
+inline Quat<T> Quat<T>::sin() const {
+    const Vec<T, 3> v{x, y ,z};
+    const T vNorm = std::sqrt(v.dot(v));
+    const T k = vNorm < CV_QUAT_EPS ? std::cos(w) : std::cos(w) * std::sinh(vNorm) / vNorm;
+    return Quat<T>(std::sin(w) * std::cosh(vNorm), v[0] * k, v[1] * k, v[2] * k);
+}
+
+// Free-function form of Quat<T>::cos().
+template <typename T>
+inline Quat<T> cos(const Quat<T> &q) {
+    return q.cos();
+}
+
+// cos(w + v) = cos(w) cosh|v| - sin(w) (sinh|v| / |v|) v; sinh|v|/|v| -> 1 as |v| -> 0, so k tends to sin(w), not 1.
+template <typename T>
+inline Quat<T> Quat<T>::cos() const {
+    const Vec<T, 3> v{x, y ,z};
+    const T vNorm = std::sqrt(v.dot(v));
+    const T k = vNorm < CV_QUAT_EPS ? std::sin(w) : std::sin(w) * std::sinh(vNorm) / vNorm;
+    return Quat<T>(std::cos(w) * std::cosh(vNorm), -v[0] * k, -v[1] * k, -v[2] * k);
+}
+
+// Free-function form of Quat<T>::tan().
+template <typename T>
+inline Quat<T> tan(const Quat<T> &q) {
+    return q.tan();
+}
+
+// tan(q) = sin(q) * cos(q)^-1.
+template <typename T>
+inline Quat<T> Quat<T>::tan() const {
+    return sin() * cos().inv();
+}
+
+// Free-function form of Quat<T>::asinh().
+template <typename T>
+inline Quat<T> asinh(const Quat<T> &q) {
+    return q.asinh();
+}
+
+// asinh(q) = log(q + (q*q + 1)^(1/2)); the exponent is T(0.5) so cv::power<T> also deduces for Quat<float>.
+template <typename T>
+inline Quat<T> Quat<T>::asinh() const {
+    return cv::log(*this + cv::power(*this * *this + Quat<T>(1, 0, 0, 0), T(0.5)));
+}
+
+// Free-function form of Quat<T>::acosh().
+template <typename T>
+inline Quat<T> acosh(const Quat<T> &q) {
+    return q.acosh();
+}
+
+// acosh(q) = log(q + (q*q - 1)^(1/2)); the exponent is T(0.5) so cv::power<T> also deduces for Quat<float>.
+template <typename T>
+inline Quat<T> Quat<T>::acosh() const {
+    return cv::log(*this + cv::power(*this * *this - Quat<T>(1,0,0,0), T(0.5)));
+}
+
+// Free-function form of Quat<T>::atanh().
+template <typename T>
+inline Quat<T> atanh(const Quat<T> &q) {
+    return q.atanh();
+}
+
+// atanh(q) = (log(1 + q) - log(1 - q)) / 2; the factor is T(0.5) so operator*(T, Quat<T>) also deduces for Quat<float>.
+template <typename T>
+inline Quat<T> Quat<T>::atanh() const {
+    const Quat<T> ident(1, 0, 0, 0);
+    const Quat<T> c1 = (ident + *this).log();
+    const Quat<T> c2 = (ident - *this).log();
+    return T(0.5) * (c1 - c2);
+}
+
+// Free-function form of Quat<T>::asin().
+template <typename T>
+inline Quat<T> asin(const Quat<T> &q) {
+    return q.asin();
+}
+
+// asin(q) = -u * asinh(q * u), u = v/|v| for the pure vector part v (v is left unscaled when |v| < CV_QUAT_EPS).
+template <typename T>
+inline Quat<T> Quat<T>::asin() const {
+    const Quat<T> vec(0, x, y, z);
+    const T len = vec.norm();
+    const T divisor = len < CV_QUAT_EPS ? 1 : len;
+    return -vec / divisor * (*this * vec / divisor).asinh();
+}
+
+// Free-function form of Quat<T>::acos().
+template <typename T>
+inline Quat<T> acos(const Quat<T> &q) {
+    return q.acos();
+}
+
+// acos(q) = -u * acosh(q), u = v/|v| for the pure vector part v (v is left unscaled when |v| < CV_QUAT_EPS).
+template <typename T>
+inline Quat<T> Quat<T>::acos() const {
+    const Quat<T> vec(0, x, y, z);
+    const T len = vec.norm();
+    const T divisor = len < CV_QUAT_EPS ? 1 : len;
+    return -vec / divisor * acosh();
+}
+
+// Free-function form of Quat<T>::atan().
+template <typename T>
+inline Quat<T> atan(const Quat<T> &q) {
+    return q.atan();
+}
+
+// atan(q) = -u * atanh(q * u), u = v/|v| for the pure vector part v (v is left unscaled when |v| < CV_QUAT_EPS).
+template <typename T>
+inline Quat<T> Quat<T>::atan() const {
+    const Quat<T> vec(0, x, y, z);
+    const T len = vec.norm();
+    const T divisor = len < CV_QUAT_EPS ? 1 : len;
+    return -vec / divisor * (*this * vec / divisor).atanh();
+}
+
+// Rotation angle encoded by the quaternion: 2 * acos(w / |q|).
+// With assumeUnit set, |q| is taken as 1 and the norm check is skipped.
+template <typename T>
+inline T Quat<T>::getAngle(QuatAssumeType assumeUnit) const
+{
+    if (assumeUnit)
+        return 2 * std::acos(w);
+
+    const T len = norm();
+    if (len < CV_QUAT_EPS)
+        CV_Error(Error::StsBadArg, "This quaternion does not represent a rotation");
+    return 2 * std::acos(w / len);
+}
+
+// Rotation axis: the vector part divided by sin(angle / 2), and also by |q|
+// unless assumeUnit is set. No guard against sin(angle / 2) == 0 is applied here.
+template <typename T>
+inline Vec<T, 3> Quat<T>::getAxis(QuatAssumeType assumeUnit) const
+{
+    const T halfAngleSin = std::sin(getAngle(assumeUnit) * 0.5);
+    const Vec<T, 3> vec{x, y, z};
+
+    const T divisor = assumeUnit ? halfAngleSin : norm() * halfAngleSin;
+    return vec / divisor;
+}
+
+// Converts the quaternion to a 4x4 matrix: the rotation sits in the upper-left 3x3 block,
+// the last row and column are (0, 0, 0, 1). Normalizes first unless assumeUnit is set.
+template <typename T>
+Matx<T, 4, 4> Quat<T>::toRotMat4x4(QuatAssumeType assumeUnit) const
+{
+    const Quat<T> unit = assumeUnit ? *this : normalize();
+    const T a = unit.w;
+    const T b = unit.x;
+    const T c = unit.y;
+    const T d = unit.z;
+
+    // Row-major initializer list.
+    Matx<T, 4, 4> R{
+        1 - 2 * (c * c + d * d), 2 * (b * c - a * d)    , 2 * (b * d + a * c)    , 0,
+        2 * (b * c + a * d)    , 1 - 2 * (b * b + d * d), 2 * (c * d - a * b)    , 0,
+        2 * (b * d - a * c)    , 2 * (c * d + a * b)    , 1 - 2 * (b * b + c * c), 0,
+        0                      , 0                      , 0                      , 1,
+    };
+    return R;
+}
+
+// Converts the quaternion to a 3x3 rotation matrix.
+// The quaternion is normalized first unless assumeUnit is set.
+template <typename T>
+Matx<T, 3, 3> Quat<T>::toRotMat3x3(QuatAssumeType assumeUnit) const
+{
+    const Quat<T> unit = assumeUnit ? *this : normalize();
+    const T a = unit.w;
+    const T b = unit.x;
+    const T c = unit.y;
+    const T d = unit.z;
+
+    // Row-major initializer list.
+    Matx<T, 3, 3> R{
+        1 - 2 * (c * c + d * d), 2 * (b * c - a * d)    , 2 * (b * d + a * c),
+        2 * (b * c + a * d)    , 1 - 2 * (b * b + d * d), 2 * (c * d - a * b),
+        2 * (b * d - a * c)    , 2 * (c * d + a * b)    , 1 - 2 * (b * b + c * c)
+    };
+    return R;
+}
+
+// Rotation vector: the rotation axis scaled by the rotation angle.
+template <typename T>
+Vec<T, 3> Quat<T>::toRotVec(QuatAssumeType assumeUnit) const {
+    const T theta = getAngle(assumeUnit);
+    const Vec<T, 3> direction = getAxis(assumeUnit);
+    return theta * direction;
+}
+
+// Returns the coefficients as a 4-vector in (w, x, y, z) order.
+template <typename T>
+Vec<T, 4> Quat<T>::toVec() const {
+    return Vec<T, 4>{w, x, y, z};
+}
+
+// Plain linear interpolation (1 - t) * q0 + t * q1; the result is not renormalized.
+template <typename T>
+Quat<T> Quat<T>::lerp(const Quat<T> &q0, const Quat<T> &q1, const T t) {
+    return (1 - t) * q0 + t * q1;
+}
+
+// Spherical linear interpolation between q0 and q1 at parameter t; inputs are normalized
+// unless assumeUnit is set. When |dot(q0, q1)| > 0.995 it falls back to nlerp(). With
+// directChange set, q0 is negated when the dot product is negative.
+template <typename T>
+Quat<T> Quat<T>::slerp(const Quat<T> &q0, const Quat<T> &q1, const T t, QuatAssumeType assumeUnit, bool directChange)
+{
+    Quat<T> v0(q0);
+    Quat<T> v1(q1);
+    if (!assumeUnit)
+    {
+        v0 = v0.normalize();
+        v1 = v1.normalize();
+    }
+    T cosTheta = v0.dot(v1);
+    constexpr T DOT_THRESHOLD = 0.995;
+    if (std::abs(cosTheta) > DOT_THRESHOLD)
+        return nlerp(v0, v1, t, QUAT_ASSUME_UNIT);
+    if (directChange && cosTheta < 0)
+    {
+        v0 = -v0;
+        cosTheta = -cosTheta;
+    }
+    const T sinTheta = std::sqrt(1 - cosTheta * cosTheta);
+    const T angle = std::atan2(sinTheta, cosTheta);  // std:: picks the overload for T, like the other math calls here
+    return (std::sin((1 - t) * angle) / (sinTheta) * v0 + std::sin(t * angle) / (sinTheta) * v1).normalize();
+}
+
+
+// Normalized linear interpolation: q0 is negated first when dot(q0, q1) < 0, both inputs
+// are normalized unless assumeUnit is set, and the blended result is renormalized.
+template <typename T>
+inline Quat<T> Quat<T>::nlerp(const Quat<T> &q0, const Quat<T> &q1, const T t, QuatAssumeType assumeUnit)
+{
+    Quat<T> from(q0);
+    Quat<T> to(q1);
+    if (to.dot(from) < 0)
+        from = -from;
+    if (!assumeUnit)
+    {
+        from = from.normalize();
+        to = to.normalize();
+    }
+    return ((1 - t) * from + t * to).normalize();
+}
+
+
+// True when the norm lies strictly inside (1 - eps, 1 + eps).
+// The norm is held in a double local regardless of T, so the comparisons
+// against 1 - eps and 1 + eps are carried out in double.
+template <typename T>
+inline bool Quat<T>::isNormal(T eps) const
+{
+    const double len = norm();
+    return (len > 1 - eps) && (len < 1 + eps);
+}
+
+// Raises StsBadArg unless isNormal(eps) holds.
+template <typename T>
+inline void Quat<T>::assertNormal(T eps) const {
+    if (!isNormal(eps))
+        CV_Error(Error::StsBadArg, "Quaternion should be normalized");
+}
+
+
+// Spherical quadrangle interpolation:
+//   slerp(slerp(q0, q3, t), slerp(q1, q2, t), 2t(1 - t)).
+// The four inputs are normalized up front unless assumeUnit is set.
+template <typename T>
+inline Quat<T> Quat<T>::squad(const Quat<T> &q0, const Quat<T> &q1,
+                            const Quat<T> &q2, const Quat<T> &q3,
+                            const T t, QuatAssumeType assumeUnit,
+                            bool directChange)
+{
+    const Quat<T> v0 = assumeUnit ? q0 : q0.normalize();
+    const Quat<T> v1 = assumeUnit ? q1 : q1.normalize();
+    const Quat<T> v2 = assumeUnit ? q2 : q2.normalize();
+    const Quat<T> v3 = assumeUnit ? q3 : q3.normalize();
+
+    const Quat<T> outer = slerp(v0, v3, t, assumeUnit, directChange);
+    const Quat<T> inner = slerp(v1, v2, t, assumeUnit, directChange);
+    // assumeUnit and directChange are passed unchanged to all three slerp calls.
+    return slerp(outer, inner, 2 * t * (1 - t), assumeUnit, directChange);
+}
+
+// Control quaternion used by spline():
+//   q1 * exp(-(log(q1^* q0) + log(q1^* q2)) / 4), where q1^* is the conjugate of q1.
+template <typename T>
+Quat<T> Quat<T>::interPoint(const Quat<T> &q0, const Quat<T> &q1,
+                            const Quat<T> &q2, QuatAssumeType assumeUnit)
+{
+    const Quat<T> v0 = assumeUnit ? q0 : q0.normalize();
+    const Quat<T> v1 = assumeUnit ? q1 : q1.normalize();
+    const Quat<T> v2 = assumeUnit ? q2 : q2.normalize();
+    const Quat<T> v1Conj = v1.conjugate();
+    const Quat<T> logSum = cv::log(v1Conj * v0, assumeUnit) + (cv::log(v1Conj * v2, assumeUnit));
+    return v1 * cv::exp(-logSum / 4);
+}
+
+// Interpolates between q1 and q2 at parameter t. q0 and q3 are the neighbouring keys:
+// they only enter through the control quaternions computed by interPoint(), which
+// are then handed to squad() together with q1 and q2.
+template <typename T>
+Quat<T> Quat<T>::spline(const Quat<T> &q0, const Quat<T> &q1, const Quat<T> &q2, const Quat<T> &q3, const T t, QuatAssumeType assumeUnit)
+{
+    // Fixed-size array: four keys never need a heap-allocated std::vector.
+    Quat<T> key[4] = {q0, q1, q2, q3};
+    if (!assumeUnit)
+    {
+        for (Quat<T> &k : key)
+            k = k.normalize();
+    }
+    // Negate keys where needed so that each consecutive pair has a non-negative dot product.
+    for (size_t i = 0; i < 3; ++i)
+    {
+        if (key[i].dot(key[i + 1]) < 0)
+            key[i + 1] = -key[i + 1];
+    }
+    // The keys were normalized above (or declared unit by the caller), hence QUAT_ASSUME_UNIT.
+    const Quat<T> s1 = interPoint(key[0], key[1], key[2], QUAT_ASSUME_UNIT);
+    const Quat<T> s2 = interPoint(key[1], key[2], key[3], QUAT_ASSUME_UNIT);
+    // NOTE(review): the last argument is squad's bool directChange; QUAT_ASSUME_NOT_UNIT presumably converts to false.
+    return squad(key[1], s1, s2, key[2], t, assumeUnit, QUAT_ASSUME_NOT_UNIT);
+}
+
+namespace detail {
+
+// Maps axis index 0/1/2 to Quat<T>::createFromXRot/YRot/ZRot; any other index trips CV_Assert.
+template <typename T> static
+Quat<T> createFromAxisRot(int axis, const T theta)
+{
+    switch (axis) {
+    case 0: return Quat<T>::createFromXRot(theta);
+    case 1: return Quat<T>::createFromYRot(theta);
+    case 2: return Quat<T>::createFromZRot(theta);
+    default: break;
+    }
+    CV_Assert(0);
+}
+
+// True for every enum value that precedes QuatEnum::EXT_XYZ.
+inline bool isIntAngleType(QuatEnum::EulerAnglesType eulerAnglesType) { return eulerAnglesType < QuatEnum::EXT_XYZ; }
+
+// True for enum values 6..11 and 18..23. NOTE(review): per the axis table in createFromEulerAngles these look like the repeated-axis (X-Y-X style) types -- confirm the name against the call sites.
+inline bool isTaitBryan(QuatEnum::EulerAnglesType eulerAnglesType) {
+    const int group = eulerAnglesType / 6;
+    return group == 1 || group == 3;
+}
+}  // namespace detail
+
+// Rotation by theta about the Y axis: (cos(theta/2), 0, sin(theta/2), 0).
+template <typename T>
+Quat<T> Quat<T>::createFromYRot(const T theta) {
+    return Quat<T>{std::cos(theta * 0.5f), 0, std::sin(theta * 0.5f), 0};
+}
+
+// Rotation by theta about the X axis: (cos(theta/2), sin(theta/2), 0, 0).
+template <typename T> Quat<T> Quat<T>::createFromXRot(const T theta) {
+    return Quat<T>{std::cos(theta * 0.5f), std::sin(theta * 0.5f), 0, 0};
+}
+
+// Rotation by theta about the Z axis: (cos(theta/2), 0, 0, sin(theta/2)).
+template <typename T> Quat<T> Quat<T>::createFromZRot(const T theta) {
+    return Quat<T>{std::cos(theta * 0.5f), 0, 0, std::sin(theta * 0.5f)};
+}
+
+
+// Composes a quaternion from three Euler angles. Each angle becomes an elementary
+// rotation about the axis (0 = X, 1 = Y, 2 = Z) that rotationAxis[] lists for
+// eulerAnglesType; the three rotations are then multiplied together.
+template <typename T>
+Quat<T> Quat<T>::createFromEulerAngles(const Vec<T, 3> &angles, QuatEnum::EulerAnglesType eulerAnglesType) {
+    CV_Assert(eulerAnglesType < QuatEnum::EulerAnglesType::EULER_ANGLES_MAX_VALUE);
+    static const int rotationAxis[24][3] = {
+        {0, 1, 2}, ///< Intrinsic rotations with the Euler angles type X-Y-Z
+        {0, 2, 1}, ///< Intrinsic rotations with the Euler angles type X-Z-Y
+        {1, 0, 2}, ///< Intrinsic rotations with the Euler angles type Y-X-Z
+        {1, 2, 0}, ///< Intrinsic rotations with the Euler angles type Y-Z-X
+        {2, 0, 1}, ///< Intrinsic rotations with the Euler angles type Z-X-Y
+        {2, 1, 0}, ///< Intrinsic rotations with the Euler angles type Z-Y-X
+        {0, 1, 0}, ///< Intrinsic rotations with the Euler angles type X-Y-X
+        {0, 2, 0}, ///< Intrinsic rotations with the Euler angles type X-Z-X
+        {1, 0, 1}, ///< Intrinsic rotations with the Euler angles type Y-X-Y
+        {1, 2, 1}, ///< Intrinsic rotations with the Euler angles type Y-Z-Y
+        {2, 0, 2}, ///< Intrinsic rotations with the Euler angles type Z-X-Z
+        {2, 1, 2}, ///< Intrinsic rotations with the Euler angles type Z-Y-Z
+        {0, 1, 2}, ///< Extrinsic rotations with the Euler angles type X-Y-Z
+        {0, 2, 1}, ///< Extrinsic rotations with the Euler angles type X-Z-Y
+        {1, 0, 2}, ///< Extrinsic rotations with the Euler angles type Y-X-Z
+        {1, 2, 0}, ///< Extrinsic rotations with the Euler angles type Y-Z-X
+        {2, 0, 1}, ///< Extrinsic rotations with the Euler angles type Z-X-Y
+        {2, 1, 0}, ///< Extrinsic rotations with the Euler angles type Z-Y-X
+        {0, 1, 0}, ///< Extrinsic rotations with the Euler angles type X-Y-X
+        {0, 2, 0}, ///< Extrinsic rotations with the Euler angles type X-Z-X
+        {1, 0, 1}, ///< Extrinsic rotations with the Euler angles type Y-X-Y
+        {1, 2, 1}, ///< Extrinsic rotations with the Euler angles type Y-Z-Y
+        {2, 0, 2}, ///< Extrinsic rotations with the Euler angles type Z-X-Z
+        {2, 1, 2}  ///< Extrinsic rotations with the Euler angles type Z-Y-Z
+    };
+    const int *axes = rotationAxis[eulerAnglesType];
+    const Quat<T> q1 = detail::createFromAxisRot(axes[0], angles(0));
+    const Quat<T> q2 = detail::createFromAxisRot(axes[1], angles(1));
+    const Quat<T> q3 = detail::createFromAxisRot(axes[2], angles(2));
+
+    // Intrinsic types multiply in the listed order; every other (extrinsic)
+    // type multiplies in the reverse order.
+    return detail::isIntAngleType(eulerAnglesType) ? q1 * q2 * q3 : q3 * q2 * q1;
+}
+
+// Converts this quaternion to three Euler angles in the requested convention, going through its 3x3 rotation matrix.
+// Near gimbal lock (rotationR[0] within CV_QUAT_CONVERT_THRESHOLD of +1 or -1) a warning is logged and one angle is set to 0.
+// rotationR_ row layout: [0] lock test value, [1..3] / [4..6] operands for the +1 / -1 lock case, [7..11] general case.
+template <typename T>
+Vec<T, 3> Quat<T>::toEulerAngles(QuatEnum::EulerAnglesType eulerAnglesType){
+    CV_Assert(eulerAnglesType < QuatEnum::EulerAnglesType::EULER_ANGLES_MAX_VALUE);
+    Matx33d R = toRotMat3x3();
+    enum {
+        C_ZERO,
+        C_PI,
+        C_PI_2,
+        N_CONSTANTS,
+        R_0_0 = N_CONSTANTS, R_0_1, R_0_2,  // R_i_j selects R.val[3*i + j]; ids below N_CONSTANTS select constants_
+        R_1_0, R_1_1, R_1_2,
+        R_2_0, R_2_1, R_2_2
+    };
+    static const T constants_[N_CONSTANTS] = {
+        0,  // C_ZERO
+        (T)CV_PI,  // C_PI
+        (T)(CV_PI * 0.5)  // C_PI_2, -C_PI_2
+    };
+    static const int rotationR_[24][12] = {
+        {+R_0_2,    +R_1_0, +R_1_1, C_PI_2,     +R_2_1, +R_1_1, -C_PI_2,    -R_1_2, +R_2_2,    +R_0_2,    -R_0_1, +R_0_0},  // INT_XYZ
+        {+R_0_1,    -R_1_2, +R_2_2, -C_PI_2,    +R_2_0, +R_2_2, C_PI_2,     +R_2_1, +R_1_1,    -R_0_1,    +R_0_2, +R_0_0},  // INT_XZY
+        {+R_1_2,    -R_0_1, +R_0_0, -C_PI_2,    +R_0_1, +R_0_0, C_PI_2,     +R_0_2, +R_2_2,    -R_1_2,    +R_1_0, +R_1_1},  // INT_YXZ
+        {+R_1_0,    +R_0_2, +R_2_2, C_PI_2,     +R_0_2, +R_0_1, -C_PI_2,    -R_2_0, +R_0_0,    +R_1_0,    -R_1_2, +R_1_1},  // INT_YZX
+        {+R_2_1,    +R_1_0, +R_0_0, C_PI_2,     +R_1_0, +R_0_0, -C_PI_2,    -R_0_1, +R_1_1,    +R_2_1,    -R_2_0, +R_2_2},  // INT_ZXY
+        {+R_2_0,    -R_0_1, +R_1_1, -C_PI_2,    +R_1_2, +R_1_1, C_PI_2,     +R_1_0, +R_0_0,    -R_2_0,    +R_2_1, +R_2_2},  // INT_ZYX
+        {+R_0_0,    +R_2_1, +R_2_2, C_ZERO,     +R_1_2, +R_1_1, C_PI,       +R_1_0, -R_2_0,    +R_0_0,    +R_0_1, +R_0_2},  // INT_XYX
+        {+R_0_0,    +R_2_1, +R_2_2, C_ZERO,     -R_2_1, +R_2_2, C_PI,       +R_2_0, +R_1_0,    +R_0_0,    +R_0_2, -R_0_1},  // INT_XZX
+        {+R_1_1,    +R_0_2, +R_0_0, C_ZERO,     -R_2_0, +R_0_0, C_PI,       +R_0_1, +R_2_1,    +R_1_1,    +R_1_0, -R_1_2},  // INT_YXY
+        {+R_1_1,    +R_0_2, +R_0_0, C_ZERO,     +R_0_2, -R_0_0, C_PI,       +R_2_1, -R_0_1,    +R_1_1,    +R_1_2, +R_1_0},  // INT_YZY
+        {+R_2_2,    +R_1_0, +R_1_1, C_ZERO,     +R_1_0, +R_0_0, C_PI,       +R_0_2, -R_1_2,    +R_2_2,    +R_2_0, +R_2_1},  // INT_ZXZ
+        {+R_2_2,    +R_1_0, +R_0_0, C_ZERO,     +R_1_0, +R_0_0, C_PI,       +R_1_2, +R_0_2,    +R_2_2,    +R_2_1, -R_2_0},  // INT_ZYZ
+
+        {+R_2_0,    -C_PI_2, -R_0_1, +R_1_1,    C_PI_2,  +R_1_2, +R_1_1,    +R_2_1, +R_2_2,    -R_2_0,    +R_1_0, +R_0_0},  // EXT_XYZ
+        {+R_1_0,    C_PI_2,  +R_0_2, +R_2_2,    -C_PI_2, +R_0_2, +R_0_1,    -R_1_2, +R_1_1,    +R_1_0,    -R_2_0, +R_0_0},  // EXT_XZY
+        {+R_2_1,    C_PI_2,  +R_1_0, +R_0_0,    -C_PI_2, +R_1_0, +R_0_0,    -R_2_0, +R_2_2,    +R_2_1,    -R_0_1, +R_1_1},  // EXT_YXZ
+        {+R_0_1,    -C_PI_2, -R_1_2, +R_2_2,    C_PI_2,  +R_2_0, +R_2_2,    +R_0_2, +R_0_0,    -R_0_1,    +R_2_1, +R_1_1},  // EXT_YZX
+        {+R_1_2,    -C_PI_2, -R_0_1, +R_0_0,    C_PI_2,  +R_0_1, +R_0_0,    +R_1_0, +R_1_1,    -R_1_2,    +R_0_2, +R_2_2},  // EXT_ZXY
+        {+R_0_2,    C_PI_2,  +R_1_0, +R_1_1,    -C_PI_2, +R_2_1, +R_1_1,    -R_0_1, +R_0_0,    +R_0_2,    -R_1_2, +R_2_2},  // EXT_ZYX
+        {+R_0_0,    C_ZERO,  +R_2_1, +R_2_2,    C_PI,    +R_1_2, +R_1_1,    +R_0_1, +R_0_2,    +R_0_0,    +R_1_0, -R_2_0},  // EXT_XYX
+        {+R_0_0,    C_ZERO,  +R_2_1, +R_2_2,    C_PI,    +R_2_1, +R_2_2,    +R_0_2, -R_0_1,    +R_0_0,    +R_2_0, +R_1_0},  // EXT_XZX
+        {+R_1_1,    C_ZERO,  +R_0_2, +R_0_0,    C_PI,    -R_2_0, +R_0_0,    +R_1_0, -R_1_2,    +R_1_1,    +R_0_1, +R_2_1},  // EXT_YXY
+        {+R_1_1,    C_ZERO,  +R_0_2, +R_0_0,    C_PI,    +R_0_2, -R_0_0,    +R_1_2, +R_1_0,    +R_1_1,    +R_2_1, -R_0_1},  // EXT_YZY
+        {+R_2_2,    C_ZERO,  +R_1_0, +R_1_1,    C_PI,    +R_1_0, +R_0_0,    +R_2_0, +R_2_1,    +R_2_2,    +R_0_2, -R_1_2},  // EXT_ZXZ
+        {+R_2_2,    C_ZERO,  +R_1_0, +R_0_0,    C_PI,    +R_1_0, +R_0_0,    +R_2_1, -R_2_0,    +R_2_2,    +R_1_2, +R_0_2},  // EXT_ZYZ
+    };
+    T rotationR[12];
+    // Resolve the selected row: |id| picks a constant or an element of R, and a negative id negates the value.
+    for (int i = 0; i < 12; i++)
+    {
+        int id = rotationR_[eulerAnglesType][i];
+        unsigned idx = std::abs(id);
+        T value = 0.0f;
+        if (idx < N_CONSTANTS)
+        {
+            value = constants_[idx];
+        }
+        else
+        {
+            unsigned r_idx = idx - N_CONSTANTS;
+            CV_DbgAssert(r_idx < 9);
+            value = R.val[r_idx];
+        }
+        if (id < 0)
+            value = -value;
+        rotationR[i] = value;
+    }
+    Vec<T, 3> angles;
+    if (detail::isIntAngleType(eulerAnglesType))
+    {
+        if (abs(rotationR[0] - 1) < CV_QUAT_CONVERT_THRESHOLD)
+        {
+            CV_LOG_WARNING(NULL,"Gimbal Lock occurs. Euler angles are non-unique, we set the third angle to 0");
+            angles = {std::atan2(rotationR[1], rotationR[2]), rotationR[3], 0};
+            return angles;
+        }
+        else if(abs(rotationR[0] + 1) < CV_QUAT_CONVERT_THRESHOLD)
+        {
+            CV_LOG_WARNING(NULL,"Gimbal Lock occurs. Euler angles are non-unique, we set the third angle to 0");
+            angles = {std::atan2(rotationR[4], rotationR[5]), rotationR[6], 0};
+            return angles;
+        }
+    }
+    else // !detail::isIntAngleType(eulerAnglesType)
+    {
+        if (abs(rotationR[0] - 1) < CV_QUAT_CONVERT_THRESHOLD)
+        {
+            CV_LOG_WARNING(NULL,"Gimbal Lock occurs. Euler angles are non-unique, we set the first angle to 0");
+            angles = {0, rotationR[1], std::atan2(rotationR[2], rotationR[3])};
+            return angles;
+        }
+        else if (abs(rotationR[0] + 1) < CV_QUAT_CONVERT_THRESHOLD)
+        {
+            CV_LOG_WARNING(NULL,"Gimbal Lock occurs. Euler angles are non-unique, we set the first angle to 0");
+            angles = {0, rotationR[4], std::atan2(rotationR[5], rotationR[6])};
+            return angles;
+        }
+    }
+
+    angles(0) = std::atan2(rotationR[7], rotationR[8]);
+    angles(1) = detail::isTaitBryan(eulerAnglesType) ? std::acos(rotationR[9]) : std::asin(rotationR[9]);
+    angles(2) = std::atan2(rotationR[10], rotationR[11]);
+    return angles;
+}
+
+}  // namespace
+//! @endcond
+
+#endif /*OPENCV_CORE_QUATERNION_INL_HPP*/
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/saturate.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/saturate.hpp
new file mode 100644
index 000000000000..18ffd1c7af2c
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/saturate.hpp
@@ -0,0 +1,180 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_SATURATE_HPP
+#define OPENCV_CORE_SATURATE_HPP
+
+#include "opencv2/core/cvdef.h"
+#include <climits>
+#include "opencv2/core/fast_math.hpp"
+
+namespace cv
+{
+
+//! @addtogroup core_utils
+//! @{
+
+/////////////// saturate_cast (used in image & signal processing) ///////////////////
+
+/** @brief Template function for accurate conversion from one primitive type to another.
+
+ The function saturate_cast resembles the standard C++ cast operations, such as static_cast\<T\>()
+ and others. It performs an efficient and accurate conversion from one primitive type to another
+ (see the introduction chapter). saturate in the name means that when the input value v is out of the
+ range of the target type, the result is not formed just by taking low bits of the input, but instead
+ the value is clipped. For example:
+ @code
+ uchar a = saturate_cast<uchar>(-100); // a = 0 (UCHAR_MIN)
+ short b = saturate_cast<short>(33333.33333); // b = 32767 (SHRT_MAX)
+ @endcode
+ Such clipping is done when the target type is unsigned char , signed char , unsigned short or
+ signed short . For 32-bit integers, no clipping is done.
+
+ When the parameter is a floating-point value and the target type is an integer (8-, 16- or 32-bit),
+ the floating-point value is first rounded to the nearest integer and then clipped if needed (when
+ the target type is 8- or 16-bit).
+
+ @param v Function parameter.
+ @sa add, subtract, multiply, divide, Mat::convertTo
+ */
+template<typename _Tp> static inline _Tp saturate_cast(uchar v)    { return _Tp(v); } // generic fallback: plain _Tp(v) conversion, no clipping
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(schar v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(ushort v)   { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(short v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(unsigned v) { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(int v)      { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(float v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(double v)   { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(int64 v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(uint64 v)   { return _Tp(v); }
+
+template<> inline uchar saturate_cast<uchar>(schar v)        { return v < 0 ? (uchar)0 : (uchar)v; }  // negatives clamp to 0
+template<> inline uchar saturate_cast<uchar>(ushort v)       { return v > UCHAR_MAX ? (uchar)UCHAR_MAX : (uchar)v; }
+template<> inline uchar saturate_cast<uchar>(int v)          { return (uchar)(v < 0 ? 0 : v > UCHAR_MAX ? UCHAR_MAX : v); }
+template<> inline uchar saturate_cast<uchar>(short v)        { return saturate_cast<uchar>(static_cast<int>(v)); }
+template<> inline uchar saturate_cast<uchar>(unsigned v)     { return v > (unsigned)UCHAR_MAX ? (uchar)UCHAR_MAX : (uchar)v; }
+template<> inline uchar saturate_cast<uchar>(float v)        { return saturate_cast<uchar>(cvRound(v)); }  // round, then clamp as int
+template<> inline uchar saturate_cast<uchar>(double v)       { return saturate_cast<uchar>(cvRound(v)); }
+template<> inline uchar saturate_cast<uchar>(int64 v)        { return (uchar)(v < 0 ? 0 : v > UCHAR_MAX ? UCHAR_MAX : v); }
+template<> inline uchar saturate_cast<uchar>(uint64 v)       { return v > (uint64)UCHAR_MAX ? (uchar)UCHAR_MAX : (uchar)v; }
+
+template<> inline schar saturate_cast<schar>(uchar v)        { return (schar)std::min((int)v, SCHAR_MAX); }
+template<> inline schar saturate_cast<schar>(ushort v)       { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); }
+template<> inline schar saturate_cast<schar>(int v)          { return (schar)(v < SCHAR_MIN ? SCHAR_MIN : v > SCHAR_MAX ? SCHAR_MAX : v); }  // explicit range test: v - SCHAR_MIN would overflow (UB) for v near INT_MAX
+template<> inline schar saturate_cast<schar>(short v)        { return saturate_cast<schar>((int)v); }
+template<> inline schar saturate_cast<schar>(unsigned v)     { return (schar)std::min(v, (unsigned)SCHAR_MAX); }
+template<> inline schar saturate_cast<schar>(float v)        { int iv = cvRound(v); return saturate_cast<schar>(iv); }
+template<> inline schar saturate_cast<schar>(double v)       { int iv = cvRound(v); return saturate_cast<schar>(iv); }
+template<> inline schar saturate_cast<schar>(int64 v)        { return (schar)(v < SCHAR_MIN ? SCHAR_MIN : v > SCHAR_MAX ? SCHAR_MAX : v); }  // same explicit range test, no 64-bit signed overflow
+template<> inline schar saturate_cast<schar>(uint64 v)       { return (schar)std::min(v, (uint64)SCHAR_MAX); }
+
+template<> inline ushort saturate_cast<ushort>(schar v)      { return v < 0 ? (ushort)0 : (ushort)v; }  // negatives clamp to 0
+template<> inline ushort saturate_cast<ushort>(short v)      { return v < 0 ? (ushort)0 : (ushort)v; }
+template<> inline ushort saturate_cast<ushort>(int v)        { return (ushort)(v < 0 ? 0 : v > USHRT_MAX ? USHRT_MAX : v); }
+template<> inline ushort saturate_cast<ushort>(unsigned v)   { return v > (unsigned)USHRT_MAX ? (ushort)USHRT_MAX : (ushort)v; }
+template<> inline ushort saturate_cast<ushort>(float v)      { return saturate_cast<ushort>(cvRound(v)); }  // round, then clamp as int
+template<> inline ushort saturate_cast<ushort>(double v)     { return saturate_cast<ushort>(cvRound(v)); }
+template<> inline ushort saturate_cast<ushort>(int64 v)      { return (ushort)(v < 0 ? 0 : v > USHRT_MAX ? USHRT_MAX : v); }
+template<> inline ushort saturate_cast<ushort>(uint64 v)     { return v > (uint64)USHRT_MAX ? (ushort)USHRT_MAX : (ushort)v; }
+
+template<> inline short saturate_cast<short>(ushort v)       { return (short)std::min((int)v, SHRT_MAX); }
+template<> inline short saturate_cast<short>(int v)          { return (short)(v < SHRT_MIN ? SHRT_MIN : v > SHRT_MAX ? SHRT_MAX : v); }  // explicit range test: v - SHRT_MIN would overflow (UB) for v near INT_MAX
+template<> inline short saturate_cast<short>(unsigned v)     { return (short)std::min(v, (unsigned)SHRT_MAX); }
+template<> inline short saturate_cast<short>(float v)        { int iv = cvRound(v); return saturate_cast<short>(iv); }
+template<> inline short saturate_cast<short>(double v)       { int iv = cvRound(v); return saturate_cast<short>(iv); }
+template<> inline short saturate_cast<short>(int64 v)        { return (short)(v < SHRT_MIN ? SHRT_MIN : v > SHRT_MAX ? SHRT_MAX : v); }  // same explicit range test, no 64-bit signed overflow
+template<> inline short saturate_cast<short>(uint64 v)       { return (short)std::min(v, (uint64)SHRT_MAX); }
+
+template<> inline int saturate_cast<int>(unsigned v)         { return (int)std::min(v, (unsigned)INT_MAX); }
+template<> inline int saturate_cast<int>(int64 v)            { return (int)(v < INT_MIN ? INT_MIN : v > INT_MAX ? INT_MAX : v); }  // explicit range test: v - INT_MIN would overflow (UB) for v near the int64 maximum
+template<> inline int saturate_cast<int>(uint64 v)           { return (int)std::min(v, (uint64)INT_MAX); }
+template<> inline int saturate_cast<int>(float v)            { return cvRound(v); }  // rounding only; no extra clamp is applied here
+template<> inline int saturate_cast<int>(double v)           { return cvRound(v); }
+
+template<> inline unsigned saturate_cast<unsigned>(schar v)  { return v < 0 ? 0u : (unsigned)v; }
+template<> inline unsigned saturate_cast<unsigned>(short v)  { return v < 0 ? 0u : (unsigned)v; }
+template<> inline unsigned saturate_cast<unsigned>(int v)    { return v < 0 ? 0u : (unsigned)v; }
+template<> inline unsigned saturate_cast<unsigned>(int64 v)  { return (unsigned)(v < 0 ? 0 : v > (int64)UINT_MAX ? (int64)UINT_MAX : v); }
+template<> inline unsigned saturate_cast<unsigned>(uint64 v) { return v > (uint64)UINT_MAX ? UINT_MAX : (unsigned)v; }
+// negative rounded values are deliberately not clipped here, so that e.g. -1 becomes 0xffffffff
+template<> inline unsigned saturate_cast<unsigned>(float v)  { return (unsigned)cvRound(v); }
+template<> inline unsigned saturate_cast<unsigned>(double v) { return (unsigned)cvRound(v); }
+
+template<> inline uint64 saturate_cast<uint64>(schar v)      { return v < 0 ? (uint64)0 : (uint64)v; }  // negatives clamp to 0
+template<> inline uint64 saturate_cast<uint64>(short v)      { return v < 0 ? (uint64)0 : (uint64)v; }
+template<> inline uint64 saturate_cast<uint64>(int v)        { return v < 0 ? (uint64)0 : (uint64)v; }
+template<> inline uint64 saturate_cast<uint64>(int64 v)      { return v < 0 ? (uint64)0 : (uint64)v; }
+
+template<> inline int64 saturate_cast<int64>(uint64 v)       { return v > (uint64)LLONG_MAX ? (int64)LLONG_MAX : (int64)v; }  // clamp to LLONG_MAX
+
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(hfloat v) { return saturate_cast<_Tp>(static_cast<float>(v)); }
+
+// A LUT could in principle serve the 8u/8s->16f conversions,
+// but with hardware FP32->FP16 conversion available, going through float is preferable
+template<> inline hfloat saturate_cast<hfloat>(uchar v)   { return hfloat(static_cast<float>(v)); }
+template<> inline hfloat saturate_cast<hfloat>(schar v)   { return hfloat(static_cast<float>(v)); }
+template<> inline hfloat saturate_cast<hfloat>(ushort v)  { return hfloat(static_cast<float>(v)); }
+template<> inline hfloat saturate_cast<hfloat>(short v)   { return hfloat(static_cast<float>(v)); }
+template<> inline hfloat saturate_cast<hfloat>(unsigned v){ return hfloat(static_cast<float>(v)); }
+template<> inline hfloat saturate_cast<hfloat>(int v)     { return hfloat(static_cast<float>(v)); }
+template<> inline hfloat saturate_cast<hfloat>(uint64 v)  { return hfloat(static_cast<float>(v)); }
+template<> inline hfloat saturate_cast<hfloat>(int64 v)   { return hfloat(static_cast<float>(v)); }
+template<> inline hfloat saturate_cast<hfloat>(float v)   { return hfloat(v); }
+template<> inline hfloat saturate_cast<hfloat>(double v)  { return hfloat(static_cast<float>(v)); }
+
+//! @}
+
+} // cv
+
+#endif // OPENCV_CORE_SATURATE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/simd_intrinsics.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/simd_intrinsics.hpp
new file mode 100644
index 000000000000..2658d92c8995
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/simd_intrinsics.hpp
@@ -0,0 +1,87 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_SIMD_INTRINSICS_HPP
+#define OPENCV_CORE_SIMD_INTRINSICS_HPP
+
+/**
+Helper header to support SIMD intrinsics (universal intrinsics) in user code.
+Intrinsics documentation: https://docs.opencv.org/4.x/df/d91/group__core__hal__intrin.html
+
+
+Checks of target CPU instruction set based on compiler definitions don't work well enough.
+More reliable solutions require utilization of configuration systems (like CMake).
+
+So, probably you need to specify your own configuration.
+
+You can do that via CMake in this way:
+    add_definitions(/DOPENCV_SIMD_CONFIG_HEADER=opencv_simd_config_custom.hpp)
+or
+    add_definitions(/DOPENCV_SIMD_CONFIG_INCLUDE_DIR=1)
+
+Additionally you may need to add include directory to your files:
+    include_directories("${CMAKE_CURRENT_LIST_DIR}/opencv_config_${MYTARGET}")
+
+These files can be pre-generated for target configurations of your application
+or generated by CMake on the fly (use CMAKE_BINARY_DIR for that).
+
+Notes:
+- H/W capability checks are still the responsibility of your application
+- runtime dispatching is not covered by this helper header
+*/
+
+#ifdef __OPENCV_BUILD
+#error "Use core/hal/intrin.hpp during OpenCV build"
+#endif // __OPENCV_BUILD
+
+#ifdef OPENCV_HAL_INTRIN_HPP
+#error "core/simd_intrinsics.hpp must be included before core/hal/intrin.hpp"
+#endif // OPENCV_HAL_INTRIN_HPP
+
+#include "opencv2/core/cvdef.h"
+
+#ifdef OPENCV_SIMD_CONFIG_HEADER
+#include CVAUX_STR(OPENCV_SIMD_CONFIG_HEADER)
+#elif defined(OPENCV_SIMD_CONFIG_INCLUDE_DIR)
+#include "opencv_simd_config.hpp"  // corresponding directory should be added via -I compiler parameter
+#else  // no custom config header given: derive the CV_* flags from compiler-defined macros
+
+#if (!defined(CV_AVX_512F) || !CV_AVX_512F) && (defined(__AVX512__) || defined(__AVX512F__))
+#  include <immintrin.h>
+#  undef CV_AVX_512F
+#  define CV_AVX_512F 1
+#  ifndef OPENCV_SIMD_DONT_ASSUME_SKX  // Skylake-X with AVX-512F/CD/BW/DQ/VL
+#    undef CV_AVX512_SKX
+#    define CV_AVX512_SKX 1
+#    undef CV_AVX_512CD
+#    define CV_AVX_512CD 1
+#    undef CV_AVX_512BW
+#    define CV_AVX_512BW 1
+#    undef CV_AVX_512DQ
+#    define CV_AVX_512DQ 1
+#    undef CV_AVX_512VL
+#    define CV_AVX_512VL 1
+#  endif // OPENCV_SIMD_DONT_ASSUME_SKX
+#endif // AVX512
+
+// GCC/Clang: -mavx2
+// MSVC: /arch:AVX2
+#if defined __AVX2__
+#  include <immintrin.h>
+#  undef CV_AVX2
+#  define CV_AVX2 1
+#  if defined __F16C__
+#    undef CV_FP16
+#    define CV_FP16 1
+#  endif // __F16C__
+#endif // __AVX2__
+
+#endif // OPENCV_SIMD_CONFIG_HEADER / OPENCV_SIMD_CONFIG_INCLUDE_DIR / compiler-macro detection
+
+// SSE / NEON / VSX is handled by cv_cpu_dispatch.h compatibility block
+#include "cv_cpu_dispatch.h"
+
+#include "hal/intrin.hpp"
+
+#endif // OPENCV_CORE_SIMD_INTRINSICS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/softfloat.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/softfloat.hpp
new file mode 100644
index 000000000000..485e15c47326
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/softfloat.hpp
@@ -0,0 +1,514 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// This file is based on files from package issued with the following license:
+
+/*============================================================================
+
+This C header file is part of the SoftFloat IEEE Floating-Point Arithmetic
+Package, Release 3c, by John R. Hauser.
+
+Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the
+University of California.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions, and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions, and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ 3. Neither the name of the University nor the names of its contributors may
+    be used to endorse or promote products derived from this software without
+    specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
+DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=============================================================================*/
+
+#pragma once
+#ifndef softfloat_h
+#define softfloat_h 1
+
+#include "cvdef.h"
+
+namespace cv
+{
+
+/** @addtogroup core_utils_softfloat
+
+  [SoftFloat](http://www.jhauser.us/arithmetic/SoftFloat.html) is a software implementation
+  of floating-point calculations according to IEEE 754 standard.
+  All calculations are done in integers, that's why they are machine-independent and bit-exact.
+  This library can be useful in accuracy-critical parts like look-up tables generation, tests, etc.
+  OpenCV contains a subset of SoftFloat partially rewritten to C++.
+
+  ### Types
+
+  There are two basic types: @ref softfloat and @ref softdouble.
+  These types are binary compatible with float and double types respectively
+  and support conversions to/from them.
+  Other types from original SoftFloat library like fp16 or fp128 were thrown away
+  as well as quiet/signaling NaN support, on-the-fly rounding mode switch
+  and exception flags (though exceptions can be implemented in the future).
+
+  ### Operations
+
+  Both types support the following:
+  - Construction from signed and unsigned 32-bit and 64-bit integers,
+  float/double or raw binary representation
+  - Conversions between each other, to float or double and to int
+  using @ref cvRound, @ref cvTrunc, @ref cvFloor, @ref cvCeil or a bunch of
+  saturate_cast functions
+  - Add, subtract, multiply, divide, remainder, square root, FMA with absolute precision
+  - Comparison operations
+  - Explicit sign, exponent and significand manipulation through get/set methods,
+ number state indicators (isInf, isNan, isSubnormal)
+  - Type-specific constants like eps, minimum/maximum value, best pi approximation, etc.
+  - min(), max(), abs(), exp(), log() and pow() functions
+
+*/
+//! @{
+
+struct softfloat;
+struct softdouble;
+
+struct CV_EXPORTS softfloat
+{
+public:
+    /** @brief Default constructor: raw bits are set to 0 */
+    softfloat() { v = 0; }
+    /** @brief Copy constructor */
+    softfloat( const softfloat& c) { v = c.v; }
+    /** @brief Copy assignment operator */
+    softfloat& operator=( const softfloat& c )
+    {
+        if(&c != this) v = c.v;
+        return *this;
+    }
+    /** @brief Construct from raw
+
+    Builds new value from raw binary representation
+    */
+    static const softfloat fromRaw( const uint32_t a ) { softfloat x; x.v = a; return x; }
+
+    /** @brief Construct from integer */
+    explicit softfloat( const uint32_t );
+    explicit softfloat( const uint64_t );
+    explicit softfloat( const int32_t );
+    explicit softfloat( const int64_t );
+
+#ifdef CV_INT32_T_IS_LONG_INT
+    // for platforms with int32_t = long int
+    explicit softfloat( const int a ) { *this = softfloat(static_cast<int32_t>(a)); }
+#endif
+
+    /** @brief Construct from float (the value is written to Cv32suf::f and its bits read back from Cv32suf::u) */
+    explicit softfloat( const float a ) { Cv32suf s; s.f = a; v = s.u; }
+
+    /** @brief Type casts  */
+    operator softdouble() const;
+    operator float() const { Cv32suf s; s.u = v; return s.f; }
+
+    /** @brief Basic arithmetics */
+    softfloat operator + (const softfloat&) const;
+    softfloat operator - (const softfloat&) const;
+    softfloat operator * (const softfloat&) const;
+    softfloat operator / (const softfloat&) const;
+    softfloat operator - () const { softfloat x; x.v = v ^ (1U << 31); return x; } // unary minus: toggles bit 31 only
+
+    /** @brief Remainder operator
+
+    A quote from original SoftFloat manual:
+
+    > The IEEE Standard remainder operation computes the value
+    > a - n * b, where n is the integer closest to a / b.
+    > If a / b is exactly halfway between two integers, n is the even integer
+    > closest to a / b. The IEEE Standard’s remainder operation is always exact and so requires no rounding.
+    > Depending on the relative magnitudes of the operands, the remainder functions
+    > can take considerably longer to execute than the other SoftFloat functions.
+    > This is an inherent characteristic of the remainder operation itself and is not a flaw
+    > in the SoftFloat implementation.
+    */
+    softfloat operator % (const softfloat&) const;
+
+    softfloat& operator += (const softfloat& a) { *this = *this + a; return *this; }
+    softfloat& operator -= (const softfloat& a) { *this = *this - a; return *this; }
+    softfloat& operator *= (const softfloat& a) { *this = *this * a; return *this; }
+    softfloat& operator /= (const softfloat& a) { *this = *this / a; return *this; }
+    softfloat& operator %= (const softfloat& a) { *this = *this % a; return *this; }
+
+    /** @brief Comparison operations
+
+     - Any operation with NaN produces false
+       + The only exception is when x is NaN: x != y for any y.
+     - Positive and negative zeros are equal
+    */
+    bool operator == ( const softfloat& ) const;
+    bool operator != ( const softfloat& ) const;
+    bool operator >  ( const softfloat& ) const;
+    bool operator >= ( const softfloat& ) const;
+    bool operator <  ( const softfloat& ) const;
+    bool operator <= ( const softfloat& ) const;
+
+    /** @brief NaN state indicator */
+    inline bool isNaN() const { return (v & 0x7fffffff)  > 0x7f800000; }
+    /** @brief Inf state indicator */
+    inline bool isInf() const { return (v & 0x7fffffff) == 0x7f800000; }
+    /** @brief Subnormal number indicator: true when the 8 exponent bits are all zero (a raw value of 0 also matches) */
+    inline bool isSubnormal() const { return ((v >> 23) & 0xFF) == 0; }
+
+    /** @brief Get sign bit */
+    inline bool getSign() const { return (v >> 31) != 0; }
+    /** @brief Construct a copy with new sign bit */
+    inline softfloat setSign(bool sign) const { softfloat x; x.v = (v & ((1U << 31) - 1)) | ((uint32_t)sign << 31); return x; }
+    /** @brief Get 0-based exponent (stored exponent field minus 127) */
+    inline int getExp() const { return ((v >> 23) & 0xFF) - 127; }
+    /** @brief Construct a copy with new 0-based exponent (e + 127 is masked to 8 bits before being stored) */
+    inline softfloat setExp(int e) const { softfloat x; x.v = (v & 0x807fffff) | (((e + 127) & 0xFF) << 23 ); return x; }
+
+    /** @brief Get a fraction part
+
+    Returns a number 1 <= x < 2 with the same significand
+    */
+    inline softfloat getFrac() const
+    {
+        uint_fast32_t vv = (v & 0x007fffff) | (127 << 23);
+        return softfloat::fromRaw(vv);
+    }
+    /** @brief Construct a copy with provided significand
+
+    Constructs a copy of a number with significand taken from parameter
+    */
+    inline softfloat setFrac(const softfloat& s) const
+    {
+        softfloat x;
+        x.v = (v & 0xff800000) | (s.v & 0x007fffff);
+        return x;
+    }
+
+    /** @brief Zero constant */
+    static softfloat zero() { return softfloat::fromRaw( 0 ); }
+    /** @brief Positive infinity constant */
+    static softfloat  inf() { return softfloat::fromRaw( 0xFF << 23 ); }
+    /** @brief Default NaN constant */
+    static softfloat  nan() { return softfloat::fromRaw( 0x7fffffff ); }
+    /** @brief One constant */
+    static softfloat  one() { return softfloat::fromRaw(  127 << 23 ); }
+    /** @brief Smallest normalized value */
+    static softfloat  min() { return softfloat::fromRaw( 0x01 << 23 ); }
+    /** @brief Difference between 1 and next representable value */
+    static softfloat  eps() { return softfloat::fromRaw( (127 - 23) << 23 ); }
+    /** @brief Biggest finite value */
+    static softfloat  max() { return softfloat::fromRaw( (0xFF << 23) - 1 ); }
+    /** @brief Correct pi approximation */
+    static softfloat   pi() { return softfloat::fromRaw( 0x40490fdb ); }
+
+    uint32_t v; // raw 32-bit representation; set directly by fromRaw()
+};
+
+/*----------------------------------------------------------------------------
+*----------------------------------------------------------------------------*/
+
+struct CV_EXPORTS softdouble
+{
+public:
+    /** @brief Default constructor */
+    softdouble() : v(0) { }
+    /** @brief Copy constructor */
+    softdouble( const softdouble& c) { v = c.v; }
+    /** @brief Assign constructor */
+    softdouble& operator=( const softdouble& c )
+    {
+        if(&c != this) v = c.v;
+        return *this;
+    }
+    /** @brief Construct from raw
+
+    Builds new value from raw binary representation
+    */
+    static softdouble fromRaw( const uint64_t a ) { softdouble x; x.v = a; return x; }
+
+    /** @brief Construct from integer */
+    explicit softdouble( const uint32_t );
+    explicit softdouble( const uint64_t );
+    explicit softdouble( const  int32_t );
+    explicit softdouble( const  int64_t );
+
+#ifdef CV_INT32_T_IS_LONG_INT
+    // for platforms with int32_t = long int
+    explicit softdouble( const int a ) { *this = softdouble(static_cast<int32_t>(a)); }
+#endif
+
+    /** @brief Construct from double */
+    explicit softdouble( const double a ) { Cv64suf s; s.f = a; v = s.u; }
+
+    /** @brief Type casts  */
+    operator softfloat() const;
+    operator double() const { Cv64suf s; s.u = v; return s.f; }
+
+    /** @brief Basic arithmetics */
+    softdouble operator + (const softdouble&) const;
+    softdouble operator - (const softdouble&) const;
+    softdouble operator * (const softdouble&) const;
+    softdouble operator / (const softdouble&) const;
+    softdouble operator - () const { softdouble x; x.v = v ^ (1ULL << 63); return x; }
+
+    /** @brief Remainder operator (IEEE-style remainder: the quotient is rounded to nearest, not truncated)
+
+    A quote from original SoftFloat manual:
+
+    > The IEEE Standard remainder operation computes the value
+    > a - n * b, where n is the integer closest to a / b.
+    > If a / b is exactly halfway between two integers, n is the even integer
+    > closest to a / b. The IEEE Standard’s remainder operation is always exact and so requires no rounding.
+    > Depending on the relative magnitudes of the operands, the remainder functions
+    > can take considerably longer to execute than the other SoftFloat functions.
+    > This is an inherent characteristic of the remainder operation itself and is not a flaw
+    > in the SoftFloat implementation.
+    */
+    softdouble operator % (const softdouble&) const;
+
+    softdouble& operator += (const softdouble& a) { *this = *this + a; return *this; } // compound forms reuse the matching binary operator
+    softdouble& operator -= (const softdouble& a) { *this = *this - a; return *this; }
+    softdouble& operator *= (const softdouble& a) { *this = *this * a; return *this; }
+    softdouble& operator /= (const softdouble& a) { *this = *this / a; return *this; }
+    softdouble& operator %= (const softdouble& a) { *this = *this % a; return *this; } // same remainder semantics as operator %
+
+    /** @brief Comparison operations
+
+     - Any comparison with a NaN operand produces false
+       + The only exception is when x is NaN: x != y for any y.
+     - Positive and negative zeros are equal
+    */
+    bool operator == ( const softdouble& ) const;
+    bool operator != ( const softdouble& ) const;
+    bool operator >  ( const softdouble& ) const;
+    bool operator >= ( const softdouble& ) const;
+    bool operator <  ( const softdouble& ) const;
+    bool operator <= ( const softdouble& ) const;
+
+    /** @brief NaN state indicator (sign ignored: exponent field all ones and a non-zero fraction) */
+    inline bool isNaN() const { return (v & 0x7fffffffffffffff)  > 0x7ff0000000000000; }
+    /** @brief Inf state indicator (sign ignored: exponent field all ones and a zero fraction, so true for +Inf and -Inf) */
+    inline bool isInf() const { return (v & 0x7fffffffffffffff) == 0x7ff0000000000000; }
+    /** @brief Subnormal number indicator; only tests for an all-zero exponent field, so +0 and -0 are reported as subnormal too */
+    inline bool isSubnormal() const { return ((v >> 52) & 0x7FF) == 0; }
+
+    /** @brief Get sign bit (true when bit 63 is set, which includes -0 and NaNs with the sign bit set) */
+    inline bool getSign() const { return (v >> 63) != 0; }
+    /** @brief Construct a copy with new sign bit (true sets bit 63); the low 63 bits are kept and *this is not modified */
+    softdouble setSign(bool sign) const { softdouble x; x.v = (v & ((1ULL << 63) - 1)) | ((uint_fast64_t)(sign) << 63); return x; }
+    /** @brief Get 0-based exponent: the 11-bit exponent field minus the bias 1023 (so -1023 for zero/subnormals and 1024 for Inf/NaN) */
+    inline int getExp() const { return ((v >> 52) & 0x7FF) - 1023; }
+    /** @brief Construct a copy with new 0-based exponent; e + 1023 is masked to 11 bits, so out-of-range values wrap rather than saturate */
+    inline softdouble setExp(int e) const
+    {
+        softdouble x;
+        x.v = (v & 0x800FFFFFFFFFFFFF) | ((uint_fast64_t)((e + 1023) & 0x7FF) << 52); // keep sign + fraction, replace the exponent field
+        return x;
+    }
+
+    /** @brief Get a fraction part
+
+    Returns a number 1 <= x < 2 with the same significand (the sign bit is dropped and the exponent field is forced to the bias 1023)
+    */
+    inline softdouble getFrac() const
+    {
+        uint_fast64_t vv = (v & 0x000FFFFFFFFFFFFF) | ((uint_fast64_t)(1023) << 52); // low 52 fraction bits + exponent field of 1.0
+        return softdouble::fromRaw(vv);
+    }
+    /** @brief Construct a copy with provided significand
+
+    Constructs a copy of a number with significand taken from parameter: sign and exponent come from *this, the low 52 bits from s
+    */
+    inline softdouble setFrac(const softdouble& s) const
+    {
+        softdouble x;
+        x.v = (v & 0xFFF0000000000000) | (s.v & 0x000FFFFFFFFFFFFF); // sign and exponent of s are ignored
+        return x;
+    }
+
+    /** @brief Zero constant (+0, raw bits 0) */
+    static softdouble zero() { return softdouble::fromRaw( 0 ); }
+    /** @brief Positive infinity constant (exponent field 0x7FF, zero fraction) */
+    static softdouble  inf() { return softdouble::fromRaw( (uint_fast64_t)(0x7FF) << 52 ); }
+    /** @brief Default NaN constant (raw 0x7FFFFFFFFFFFFFFF: sign clear, every exponent and fraction bit set) */
+    static softdouble  nan() { return softdouble::fromRaw( CV_BIG_INT(0x7FFFFFFFFFFFFFFF) ); }
+    /** @brief One constant (exponent field equal to the bias 1023, zero fraction) */
+    static softdouble  one() { return softdouble::fromRaw( (uint_fast64_t)( 1023) << 52 ); }
+    /** @brief Smallest normalized value (exponent field 1, zero fraction, i.e. 2^-1022) */
+    static softdouble  min() { return softdouble::fromRaw( (uint_fast64_t)( 0x01) << 52 ); }
+    /** @brief Difference between 1 and next representable value (2^-52) */
+    static softdouble  eps() { return softdouble::fromRaw( (uint_fast64_t)( 1023 - 52 ) << 52 ); }
+    /** @brief Biggest finite value (raw 0x7FEFFFFFFFFFFFFF, one below the infinity bit pattern) */
+    static softdouble  max() { return softdouble::fromRaw( ((uint_fast64_t)(0x7FF) << 52) - 1 ); }
+    /** @brief Pi approximation at double precision (raw 0x400921FB54442D18) */
+    static softdouble   pi() { return softdouble::fromRaw( CV_BIG_INT(0x400921FB54442D18) ); }
+
+    uint64_t v; // raw bit pattern: sign in bit 63, exponent field in bits 62..52, fraction in bits 51..0
+};
+
+/*----------------------------------------------------------------------------
+*----------------------------------------------------------------------------*/
+
+/** @brief Fused Multiplication and Addition
+
+Computes (a*b)+c with single rounding, i.e. the product is not rounded on its own before the addition
+*/
+CV_EXPORTS softfloat  mulAdd( const softfloat&  a, const softfloat&  b, const softfloat & c);
+CV_EXPORTS softdouble mulAdd( const softdouble& a, const softdouble& b, const softdouble& c);
+
+/** @brief Square root (one overload per soft type; declarations only, the result has the type of the argument) */
+CV_EXPORTS softfloat  sqrt( const softfloat&  a );
+CV_EXPORTS softdouble sqrt( const softdouble& a );
+}
+
+/*----------------------------------------------------------------------------
+| Ported from OpenCV and added for usability
+*----------------------------------------------------------------------------*/
+
+/** @brief Truncates number to integer with minimum magnitude (i.e. rounds toward zero) */
+CV_EXPORTS int cvTrunc(const cv::softfloat&  a);
+CV_EXPORTS int cvTrunc(const cv::softdouble& a);
+
+/** @brief Rounds a number to the nearest integer (NOTE(review): "nearest even" presumably means ties go to the even integer - confirm in the implementation) */
+CV_EXPORTS int cvRound(const cv::softfloat&  a);
+CV_EXPORTS int cvRound(const cv::softdouble& a);
+
+/** @brief Rounds a number to the nearest long long integer (64-bit result; presumably the same tie-breaking as cvRound; declared for softdouble only) */
+CV_EXPORTS int64_t cvRound64(const cv::softdouble& a);
+
+/** @brief Rounds a number down to integer */
+CV_EXPORTS int cvFloor(const cv::softfloat&  a);
+CV_EXPORTS int cvFloor(const cv::softdouble& a);
+
+/** @brief Rounds number up to integer */
+CV_EXPORTS int  cvCeil(const cv::softfloat&  a);
+CV_EXPORTS int  cvCeil(const cv::softdouble& a);
+
+namespace cv
+{
+/** @brief Saturate casts; the generic versions simply convert with _Tp(a), integer targets get rounding specializations */
+template<typename _Tp> static inline _Tp saturate_cast(softfloat  a) { return _Tp(a); }
+template<typename _Tp> static inline _Tp saturate_cast(softdouble a) { return _Tp(a); }
+// 8- and 16-bit targets: round with cvRound(), then clamp the int result to the range of the destination type.
+template<> inline uchar saturate_cast<uchar>(softfloat  a) { return (uchar)std::max(std::min(cvRound(a), (int)UCHAR_MAX), 0); }
+template<> inline uchar saturate_cast<uchar>(softdouble a) { return (uchar)std::max(std::min(cvRound(a), (int)UCHAR_MAX), 0); }
+
+template<> inline schar saturate_cast<schar>(softfloat  a) { return (schar)std::min(std::max(cvRound(a), (int)SCHAR_MIN), (int)SCHAR_MAX); }
+template<> inline schar saturate_cast<schar>(softdouble a) { return (schar)std::min(std::max(cvRound(a), (int)SCHAR_MIN), (int)SCHAR_MAX); }
+
+template<> inline ushort saturate_cast<ushort>(softfloat  a) { return (ushort)std::max(std::min(cvRound(a), (int)USHRT_MAX), 0); }
+template<> inline ushort saturate_cast<ushort>(softdouble a) { return (ushort)std::max(std::min(cvRound(a), (int)USHRT_MAX), 0); }
+
+template<> inline short saturate_cast<short>(softfloat  a) { return (short)std::min(std::max(cvRound(a), (int)SHRT_MIN), (int)SHRT_MAX); }
+template<> inline short saturate_cast<short>(softdouble a) { return (short)std::min(std::max(cvRound(a), (int)SHRT_MIN), (int)SHRT_MAX); }
+// int and int64_t targets: rounding only, no extra clamping; the softfloat -> int64_t version goes through cvRound(), which returns int.
+template<> inline int saturate_cast<int>(softfloat  a) { return cvRound(a); }
+template<> inline int saturate_cast<int>(softdouble a) { return cvRound(a); }
+
+template<> inline int64_t saturate_cast<int64_t>(softfloat  a) { return cvRound(a); }
+template<> inline int64_t saturate_cast<int64_t>(softdouble a) { return cvRound64(a); }
+
+/** @brief Saturate cast to unsigned integer and unsigned long long integer
+We intentionally do not clip negative numbers, to make -1 become 0xffffffff etc. (the signed cvRound()/cvRound64() result is converted as is)
+*/
+template<> inline unsigned saturate_cast<unsigned>(softfloat  a) { return cvRound(a); }
+template<> inline unsigned saturate_cast<unsigned>(softdouble a) { return cvRound(a); }
+
+template<> inline uint64_t saturate_cast<uint64_t>(softfloat  a) { return cvRound(a); }
+template<> inline uint64_t saturate_cast<uint64_t>(softdouble a) { return cvRound64(a); }
+
+/** @brief Min and Max functions; whenever a > b is false (equal values, or a NaN operand per the comparison rules) min returns a and max returns b */
+inline softfloat  min(const softfloat&  a, const softfloat&  b) { return (a > b) ? b : a; }
+inline softdouble min(const softdouble& a, const softdouble& b) { return (a > b) ? b : a; }
+
+inline softfloat  max(const softfloat&  a, const softfloat&  b) { return (a > b) ? a : b; }
+inline softdouble max(const softdouble& a, const softdouble& b) { return (a > b) ? a : b; }
+
+/** @brief Absolute value: clears only the top bit of the raw value (bit 31 for softfloat, bit 63 for softdouble), all other bits are kept */
+inline softfloat  abs( softfloat  a) { softfloat  x; x.v = a.v & ((1U   << 31) - 1); return x; }
+inline softdouble abs( softdouble a) { softdouble x; x.v = a.v & ((1ULL << 63) - 1); return x; }
+
+/** @brief Exponential function
+
+Special cases:
+- exp(NaN) is NaN
+- exp(-Inf) == 0
+- exp(+Inf) == +Inf
+*/
+CV_EXPORTS softfloat  exp( const softfloat&  a);
+CV_EXPORTS softdouble exp( const softdouble& a);
+
+/** @brief Natural logarithm
+
+Special cases:
+- log(NaN) and log(x) for any x < 0 are NaN
+- log(0) == -Inf
+*/
+CV_EXPORTS softfloat  log( const softfloat&  a );
+CV_EXPORTS softdouble log( const softdouble& a );
+
+/** @brief Raising to the power
+
+Special cases:
+- x**NaN is NaN for any x
+- ( |x| == 1 )**Inf is NaN
+- ( |x|  > 1 )**+Inf or ( |x| < 1 )**-Inf is +Inf
+- ( |x|  > 1 )**-Inf or ( |x| < 1 )**+Inf is 0
+- x ** 0 == 1 for any x
+- x ** 1 == x for any x
+- NaN ** y is NaN for any other y
+- Inf**(y < 0) == 0
+- Inf ** y is +Inf for any other y
+- (x < 0)**y is NaN for any other y if x can't be correctly rounded to integer
+- 0 ** 0 == 1
+- 0 ** (y < 0) is +Inf
+- 0 ** (y > 0) is 0
+*/
+CV_EXPORTS softfloat  pow( const softfloat&  a, const softfloat&  b);
+CV_EXPORTS softdouble pow( const softdouble& a, const softdouble& b);
+
+/** @brief Cube root (only a softfloat overload is declared in this part of the header)
+
+Special cases:
+- cbrt(NaN) is NaN
+- cbrt(+/-Inf) is +/-Inf
+*/
+CV_EXPORTS softfloat cbrt( const softfloat& a );
+
+/** @brief Sine (only a softdouble overload is declared in this part of the header)
+
+Special cases:
+- sin(Inf) or sin(NaN) is NaN
+- sin(x) == x when sin(x) is close to zero
+*/
+CV_EXPORTS softdouble sin( const softdouble& a );
+
+/** @brief Cosine (only a softdouble overload is declared in this part of the header)
+
+Special cases:
+- cos(Inf) or cos(NaN) is NaN
+- cos(x) == +/- 1 when cos(x) is close to +/- 1
+*/
+CV_EXPORTS softdouble cos( const softdouble& a );
+
+//! @} core_utils_softfloat
+
+} // cv::
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/sse_utils.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/sse_utils.hpp
new file mode 100644
index 000000000000..0906583ea433
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/sse_utils.hpp
@@ -0,0 +1,652 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_SSE_UTILS_HPP
+#define OPENCV_CORE_SSE_UTILS_HPP
+
+#ifndef __cplusplus
+#  error sse_utils.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core/cvdef.h"
+
+//! @addtogroup core_utils_sse
+//! @{
+
+#if CV_SSE2
+// In-place 2-channel byte deinterleave: if the four registers (in parameter order) hold 64 consecutive bytes, v_r0:v_r1 receive the even bytes and v_g0:v_g1 the odd ones.
+inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{   // Five identical unpack rounds; each one pairs register i with register i + 2 of the previous round.
+    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);
+
+    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);
+
+    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
+    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
+    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
+    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);
+    // The fifth round stores its results back through the reference parameters.
+    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
+    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
+    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
+    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
+}
+// In-place 3-channel byte deinterleave: if the six registers (in parameter order) hold 96 consecutive interleaved bytes, v_r0:v_r1 / v_g0:v_g1 / v_b0:v_b1 receive channels 0 / 1 / 2.
+inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
+                                  __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
+{   // Five identical unpack rounds; each one pairs register i with register i + 3 of the previous round.
+    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
+    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
+    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);
+
+    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
+    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
+    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
+    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
+    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);
+
+    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
+    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
+    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
+    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
+    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
+    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);
+    // The fifth round stores its results back through the reference parameters.
+    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
+    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
+    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
+    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
+    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
+    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
+}
+// In-place 4-channel byte deinterleave: if the eight registers (in parameter order) hold 128 consecutive interleaved bytes, the r/g/b/a register pairs receive channels 0/1/2/3.
+inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
+                                  __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
+{   // Five identical unpack rounds; each one pairs register i with register i + 4 of the previous round.
+    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
+    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
+    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
+    __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
+    __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);
+
+    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
+    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
+    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
+    __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
+    __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
+    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
+    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
+    __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
+    __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);
+
+    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
+    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
+    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
+    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
+    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
+    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
+    __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
+    __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);
+    // The fifth round stores its results back through the reference parameters.
+    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
+    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
+    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
+    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
+    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
+    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
+    v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
+    v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
+}
+// In-place 2-channel byte interleave (inverse of the 4-register _mm_deinterleave_epi8): planar v_r0:v_r1 / v_g0:v_g1 become 64 interleaved bytes laid out over the registers in parameter order.
+inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{
+    __m128i v_mask = _mm_set1_epi16(0x00ff); // keeps the low byte of every 16-bit lane
+    // Five pack rounds: and(x, mask) picks the even bytes, srli(x, 8) the odd ones; lanes are <= 255, so packus never actually saturates.
+    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
+    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
+
+    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
+    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
+    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
+    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
+
+    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
+    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
+
+    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
+    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
+    // The fifth round stores its results back through the reference parameters.
+    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
+    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
+}
+// In-place 3-channel byte interleave (inverse of the 6-register _mm_deinterleave_epi8): planar r/g/b register pairs become 96 interleaved bytes laid out over the registers in parameter order.
+inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
+                                __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
+{
+    __m128i v_mask = _mm_set1_epi16(0x00ff); // keeps the low byte of every 16-bit lane
+    // Five pack rounds: and(x, mask) picks the even bytes, srli(x, 8) the odd ones; lanes are <= 255, so packus never actually saturates.
+    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
+    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
+    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
+    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
+
+    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
+    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
+    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
+    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
+    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
+    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
+
+    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
+    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
+    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
+    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
+
+    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
+    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
+    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
+    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
+    // The fifth round stores its results back through the reference parameters.
+    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
+    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
+    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
+    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
+}
+// In-place 4-channel byte interleave (inverse of the 8-register _mm_deinterleave_epi8): planar r/g/b/a register pairs become 128 interleaved bytes laid out over the registers in parameter order.
+inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
+                                __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
+{
+    __m128i v_mask = _mm_set1_epi16(0x00ff); // keeps the low byte of every 16-bit lane
+    // Five pack rounds: and(x, mask) picks the even bytes, srli(x, 8) the odd ones; lanes are <= 255, so packus never actually saturates.
+    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
+    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
+    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
+    __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
+    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
+    __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));
+
+    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
+    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
+    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
+    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
+    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
+    __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
+    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
+    __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));
+
+    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
+    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
+    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
+    __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
+    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
+    __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));
+
+    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
+    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
+    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
+    __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
+    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
+    __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));
+    // The fifth round stores its results back through the reference parameters.
+    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
+    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
+    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
+    v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
+    v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
+    v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
+}
+// In-place 2-channel deinterleave of 16-bit elements: if the four registers (in parameter order) hold 32 consecutive elements, v_r0:v_r1 receive the even ones and v_g0:v_g1 the odd ones.
+inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{   // Four identical unpack rounds; each one pairs register i with register i + 2 of the previous round.
+    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);
+
+    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);
+    // The fourth round stores its results back through the reference parameters.
+    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
+    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
+    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
+    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
+}
+// In-place 3-channel deinterleave of 16-bit elements: if the six registers (in parameter order) hold 48 consecutive interleaved elements, v_r0:v_r1 / v_g0:v_g1 / v_b0:v_b1 receive channels 0 / 1 / 2.
+inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
+                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
+{   // Four identical unpack rounds; each one pairs register i with register i + 3 of the previous round.
+    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
+    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
+    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);
+
+    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
+    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
+    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
+    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
+    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);
+    // The fourth round stores its results back through the reference parameters.
+    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
+    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
+    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
+    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
+    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
+    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
+}
+
+inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
+                                   __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
+{   // In-place 4-channel split of 64 16-bit values read in argument order (v_r0 ... v_a1).
+    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
+    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
+    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
+    __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
+    __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);
+    // Stage 2: same zip pattern (register i with register i+4) applied to the stage-1 results.
+    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
+    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
+    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
+    __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
+    __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);
+    // Stage 3: same zip pattern again.
+    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
+    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
+    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
+    __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
+    __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);
+    // Stage 4 writes back: v_r* get input indices 0,4,8,..., v_g* 1,5,9,..., v_b* 2,6,10,..., v_a* 3,7,11,...
+    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
+    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
+    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
+    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
+    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
+    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
+    v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
+    v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
+}
+
+#if CV_SSE4_1
+
+inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{   // In-place 2-channel merge: v_r0/v_r1 and v_g0/v_g1 hold one channel each; output in argument order alternates r,g.
+    __m128i v_mask = _mm_set1_epi32(0x0000ffff); // keeps the low 16 bits of every 32-bit lane
+    // mask+pack gathers the even 16-bit lanes of a register pair, shift+pack the odd lanes; inputs to the pack fit in 16 bits.
+    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
+    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
+    // Stage 2: same even/odd split applied to the previous stage's chunks (0,1) and (2,3).
+    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
+    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
+    // Stage 3: same even/odd split again.
+    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
+    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
+    // Stage 4 writes the result back into the arguments (note the r0, g0, r1, g1 assignment order).
+    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
+    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
+}
+
+inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
+                                 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
+{   // In-place 3-channel merge: each register pair holds one channel; output in argument order cycles r,g,b.
+    __m128i v_mask = _mm_set1_epi32(0x0000ffff); // keeps the low 16 bits of every 32-bit lane
+    // mask+pack gathers the even 16-bit lanes of a register pair, shift+pack the odd lanes; inputs to the pack fit in 16 bits.
+    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
+    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
+    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
+    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
+    // Stage 2: same even/odd split applied to the previous stage's chunks (0,1), (2,3) and (4,5).
+    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
+    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
+    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
+    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
+    // Stage 3: same even/odd split again.
+    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
+    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
+    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
+    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
+    // Stage 4 writes the result back into the arguments (note the r0, g1, r1, b0, g0, b1 assignment order).
+    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
+    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
+    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
+    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
+}
+
+inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
+                                 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
+{   // In-place 4-channel merge: each register pair holds one channel; output in argument order cycles r,g,b,a.
+    __m128i v_mask = _mm_set1_epi32(0x0000ffff); // keeps the low 16 bits of every 32-bit lane
+    // mask+pack gathers the even 16-bit lanes of a register pair, shift+pack the odd lanes; inputs to the pack fit in 16 bits.
+    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
+    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
+    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
+    __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
+    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
+    __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));
+    // Stage 2: same even/odd split applied to the previous stage's chunks (0,1), (2,3), (4,5) and (6,7).
+    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
+    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
+    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
+    __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
+    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
+    __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));
+    // Stage 3: same even/odd split again.
+    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
+    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
+    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
+    __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
+    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
+    __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));
+    // Stage 4 writes the result back into the arguments (note the r0, b0, r1, b1, g0, a0, g1, a1 assignment order).
+    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
+    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
+    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
+    v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
+    v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
+    v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
+}
+
+#endif // CV_SSE4_1
+
+inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
+{   // In-place 2-channel split of 16 floats read in argument order (v_r0, v_r1, v_g0, v_g1).
+    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
+    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
+    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
+    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);
+    // Stage 2: same zip pattern (register i with register i+2) applied to the stage-1 results.
+    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
+    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
+    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
+    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);
+    // Stage 3 writes back: v_r0/v_r1 get the even-indexed inputs, v_g0/v_g1 the odd-indexed ones.
+    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
+    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
+    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
+    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
+}
+
+inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
+                                __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
+{   // In-place 3-channel split of 24 floats read in argument order (v_r0 ... v_b1).
+    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
+    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
+    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
+    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
+    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
+    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);
+    // Stage 2: same zip pattern (register i with register i+3) applied to the stage-1 results.
+    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
+    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
+    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
+    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
+    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
+    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);
+    // Stage 3 writes back: v_r* get input indices 0,3,6,..., v_g* get 1,4,7,..., v_b* get 2,5,8,...
+    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
+    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
+    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
+    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
+    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
+    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
+}
+
+inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
+                                __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
+{   // In-place 4-channel split of 32 floats read in argument order (v_r0 ... v_a1).
+    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
+    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
+    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
+    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
+    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
+    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
+    __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
+    __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);
+    // Stage 2: same zip pattern (register i with register i+4) applied to the stage-1 results.
+    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
+    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
+    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
+    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
+    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
+    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
+    __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
+    __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);
+    // Stage 3 writes back: v_r* get input indices 0,4,8,..., v_g* 1,5,9,..., v_b* 2,6,10,..., v_a* 3,7,11,...
+    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
+    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
+    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
+    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
+    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
+    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
+    v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
+    v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
+}
+
+inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
+{   // In-place 2-channel merge: v_r0/v_r1 and v_g0/v_g1 hold one channel each; output in argument order alternates r,g.
+    enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) }; // mask_lo picks lanes 0,2 of each operand, mask_hi lanes 1,3
+    // Stage 1: split every channel pair into its even lanes (mask_lo) and odd lanes (mask_hi).
+    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
+    __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
+    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
+    __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
+    // Stage 2: same even/odd split applied to the previous stage's chunks (0,1) and (2,3).
+    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
+    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
+    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
+    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
+    // Stage 3 writes the result back into the arguments (note the r0, g0, r1, g1 assignment order).
+    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
+    v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
+    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
+    v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
+}
+
+inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
+                              __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
+{   // In-place 3-channel merge: each register pair holds one channel; output in argument order cycles r,g,b.
+    enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) }; // mask_lo picks lanes 0,2 of each operand, mask_hi lanes 1,3
+    // Stage 1: split every channel pair into its even lanes (mask_lo) and odd lanes (mask_hi).
+    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
+    __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
+    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
+    __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
+    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
+    __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
+    // Stage 2: same even/odd split applied to the previous stage's chunks (0,1), (2,3) and (4,5).
+    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
+    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
+    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
+    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
+    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
+    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
+    // Stage 3 writes the result back into the arguments (note the r0, g1, r1, b0, g0, b1 assignment order).
+    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
+    v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
+    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
+    v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
+    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
+    v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
+}
+
+inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
+                              __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
+{   // In-place 4-channel merge: each register pair holds one channel; output in argument order cycles r,g,b,a.
+    enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) }; // mask_lo picks lanes 0,2 of each operand, mask_hi lanes 1,3
+    // Stage 1: split every channel pair into its even lanes (mask_lo) and odd lanes (mask_hi).
+    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
+    __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
+    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
+    __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
+    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
+    __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
+    __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
+    __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);
+    // Stage 2: same even/odd split applied to the previous stage's chunks (0,1), (2,3), (4,5) and (6,7).
+    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
+    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
+    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
+    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
+    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
+    __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
+    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
+    __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);
+    // Stage 3 writes the result back into the arguments (note the r0, b0, r1, b1, g0, a0, g1, a1 assignment order).
+    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
+    v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
+    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
+    v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
+    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
+    v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
+    v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
+    v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
+}
+
+#endif // CV_SSE2
+
+//! @}
+
+#endif //OPENCV_CORE_SSE_UTILS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/traits.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/traits.hpp
new file mode 100644
index 000000000000..522519389bfb
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/traits.hpp
@@ -0,0 +1,417 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_TRAITS_HPP
+#define OPENCV_CORE_TRAITS_HPP
+
+#include "opencv2/core/cvdef.h"
+
+namespace cv
+{
+
+//#define OPENCV_TRAITS_ENABLE_DEPRECATED
+
+//! @addtogroup core_basic
+//! @{
+
+/** @brief Template "trait" class for OpenCV primitive data types.
+
+@note Deprecated. This is replaced by "single purpose" traits: traits::Type and traits::Depth
+
+A primitive OpenCV data type is one of unsigned char, bool, signed char, unsigned short, signed
+short, int, float, double, or a tuple of values of one of these types, where all the values in the
+tuple have the same type. Any primitive type from the list can be defined by an identifier in the
+form CV_\<bit-depth\>{U|S|F}C(\<number_of_channels\>), for example: uchar \~ CV_8UC1, 3-element
+floating-point tuple \~ CV_32FC3, and so on. A universal OpenCV structure that is able to store a
+single instance of such a primitive data type is Vec. Multiple instances of such a type can be
+stored in a std::vector, Mat, Mat_, SparseMat, SparseMat_, or any other container that is able to
+store Vec instances.
+
+The DataType class is basically used to provide a description of such primitive data types without
+adding any fields or methods to the corresponding classes (and it is actually impossible to add
+anything to primitive C/C++ data types). This technique is known in C++ as class traits. It is not
+DataType itself that is used but its specialized versions, such as:
+@code
+    template<> class DataType<uchar>
+    {
+        typedef uchar value_type;
+        typedef int work_type;
+        typedef uchar channel_type;
+        enum { depth = CV_8U, channels = 1, fmt='u', type = CV_8U };
+    };
+    ...
+    template<typename _Tp> class DataType<std::complex<_Tp> >
+    {
+        typedef std::complex<_Tp> value_type;
+        typedef std::complex<_Tp> work_type;
+        typedef _Tp channel_type;
+        // DataDepth is another helper trait class
+        enum { depth = DataDepth<_Tp>::value, channels=2,
+            fmt=(channels-1)*256+DataDepth<_Tp>::fmt,
+            type=CV_MAKETYPE(depth, channels) };
+    };
+    ...
+@endcode
+The main purpose of this class is to convert compilation-time type information to an
+OpenCV-compatible data type identifier, for example:
+@code
+    // allocates a 30x40 floating-point matrix
+    Mat A(30, 40, DataType<float>::type);
+
+    Mat B = Mat_<std::complex<double> >(3, 3);
+    // the statement below will print 6, 2 , that is depth == CV_64F, channels == 2
+    cout << B.depth() << ", " << B.channels() << endl;
+@endcode
+So, such traits are used to tell OpenCV which data type you are working with, even if such a type is
+not native to OpenCV. For example, the matrix B initialization above is compiled because OpenCV
+defines the proper specialized template class DataType\<complex\<_Tp\> \> . This mechanism is also
+useful (and used in OpenCV this way) for generic algorithms implementations.
+
+@note Default values were dropped to stop confusing developers about the use of unsupported types (see #7599)
+*/
+template<typename _Tp> class DataType // primary template: has no members unless OPENCV_TRAITS_ENABLE_DEPRECATED is defined
+{
+public:
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED // legacy defaults for types that have no dedicated specialization
+    typedef _Tp         value_type;
+    typedef value_type  work_type;
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 1, // specializations in this header set generic_type = 0
+           depth        = -1, // no valid depth for an unspecialized type
+           channels     = 1,
+           fmt          = 0,
+           type = CV_MAKETYPE(depth, channels)
+         };
+#endif
+};
+
+template<> class DataType<bool> // bool is described as a single-channel CV_8U element
+{
+public:
+    typedef bool        value_type;
+    typedef int         work_type;
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_8U,
+           channels     = 1,
+           fmt          = (int)'u', // same format character as the uchar specialization
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
+template<> class DataType<uchar> // uchar is described as a single-channel CV_8U element
+{
+public:
+    typedef uchar       value_type;
+    typedef int         work_type;
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_8U,
+           channels     = 1,
+           fmt          = (int)'u', // format character for this depth
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
+template<> class DataType<schar> // schar is described as a single-channel CV_8S element
+{
+public:
+    typedef schar       value_type;
+    typedef int         work_type;
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_8S,
+           channels     = 1,
+           fmt          = (int)'c', // format character for this depth
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
+template<> class DataType<char> // plain char is described exactly like schar (single-channel CV_8S)
+{
+public:
+    typedef schar       value_type; // note: schar, not char
+    typedef int         work_type;
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_8S,
+           channels     = 1,
+           fmt          = (int)'c', // same format character as the schar specialization
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
+template<> class DataType<ushort> // ushort is described as a single-channel CV_16U element
+{
+public:
+    typedef ushort      value_type;
+    typedef int         work_type;
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_16U,
+           channels     = 1,
+           fmt          = (int)'w', // format character for this depth
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
+template<> class DataType<short> // short is described as a single-channel CV_16S element
+{
+public:
+    typedef short       value_type;
+    typedef int         work_type;
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_16S,
+           channels     = 1,
+           fmt          = (int)'s', // format character for this depth
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
+template<> class DataType<int> // int is described as a single-channel CV_32S element
+{
+public:
+    typedef int         value_type;
+    typedef value_type  work_type; // work_type is int itself here
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_32S,
+           channels     = 1,
+           fmt          = (int)'i', // format character for this depth
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
+template<> class DataType<float> // float is described as a single-channel CV_32F element
+{
+public:
+    typedef float       value_type;
+    typedef value_type  work_type; // work_type is float itself here
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_32F,
+           channels     = 1,
+           fmt          = (int)'f', // format character for this depth
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
+template<> class DataType<double> // double is described as a single-channel CV_64F element
+{
+public:
+    typedef double      value_type;
+    typedef value_type  work_type; // work_type is double itself here
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_64F,
+           channels     = 1,
+           fmt          = (int)'d', // format character for this depth
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
+template<> class DataType<hfloat> // hfloat is described as a single-channel CV_16F element
+{
+public:
+    typedef hfloat   value_type;
+    typedef float       work_type; // work_type is float, not hfloat
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_16F,
+           channels     = 1,
+           fmt          = (int)'h', // format character for this depth
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
+
+/** @brief A helper class for cv::DataType
+
+The class is specialized for each fundamental numerical data type supported by OpenCV. It provides
+DataDepth<T>::value constant.
+*/
+template<typename _Tp> class DataDepth // thin forwarder to DataType<_Tp>; fails to compile if DataType<_Tp> lacks depth/fmt
+{
+public:
+    enum
+    {
+        value = DataType<_Tp>::depth, // depth constant of _Tp
+        fmt   = DataType<_Tp>::fmt    // format character of _Tp
+    };
+};
+
+
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+
+template<int _depth> class TypeDepth // maps a depth constant back to its element type; primary template is empty by default
+{
+#ifdef OPENCV_TRAITS_ENABLE_LEGACY_DEFAULTS
+    enum { depth = CV_USRTYPE1 };
+    typedef void value_type;
+#endif
+};
+// NOTE(review): every TypeDepth below uses `class` with no access specifier, so depth/value_type are private — confirm intended.
+template<> class TypeDepth<CV_8U>
+{
+    enum { depth = CV_8U };
+    typedef uchar value_type;
+};
+
+template<> class TypeDepth<CV_8S>
+{
+    enum { depth = CV_8S };
+    typedef schar value_type;
+};
+
+template<> class TypeDepth<CV_16U>
+{
+    enum { depth = CV_16U };
+    typedef ushort value_type;
+};
+
+template<> class TypeDepth<CV_16S>
+{
+    enum { depth = CV_16S };
+    typedef short value_type;
+};
+
+template<> class TypeDepth<CV_32S>
+{
+    enum { depth = CV_32S };
+    typedef int value_type;
+};
+
+template<> class TypeDepth<CV_32F>
+{
+    enum { depth = CV_32F };
+    typedef float value_type;
+};
+
+template<> class TypeDepth<CV_64F>
+{
+    enum { depth = CV_64F };
+    typedef double value_type;
+};
+
+template<> class TypeDepth<CV_16F>
+{
+    enum { depth = CV_16F };
+    typedef hfloat value_type;
+};
+
+#endif
+
+//! @}
+
+namespace traits {
+
+namespace internal { // compile-time detection of a member named X in a class type T
+#define CV_CREATE_MEMBER_CHECK(X) \
+template<typename T> class CheckMember_##X { \
+    struct Fallback { int X; }; \
+    struct Derived : T, Fallback { }; \
+    template<typename U, U> struct Check; \
+    typedef char CV_NO[1]; \
+    typedef char CV_YES[2]; \
+    template<typename U> static CV_NO & func(Check<int Fallback::*, &U::X> *); \
+    template<typename U> static CV_YES & func(...); \
+public: \
+    typedef CheckMember_##X type; \
+    enum { value = sizeof(func<Derived>(0)) == sizeof(CV_YES) }; \
+};
+// If T has its own X, &Derived::X is ambiguous, the first func overload drops out, and value is 1; otherwise value is 0.
+CV_CREATE_MEMBER_CHECK(fmt) // defines CheckMember_fmt<T>
+CV_CREATE_MEMBER_CHECK(type) // defines CheckMember_type<T>
+
+} // namespace internal
+
+
+template<typename T>
+struct Depth // value is DataType<T>::depth; a compile error if DataType<T> has no depth member
+{ enum { value = DataType<T>::depth }; };
+
+template<typename T>
+struct Type // value is DataType<T>::type; a compile error if DataType<T> has no type member
+{ enum { value = DataType<T>::type }; };
+
+/** Similar to traits::Type<T> but has value = -1 in case of unknown type (instead of compiler error) */
+template<typename T, bool available = internal::CheckMember_type< DataType<T> >::value >
+struct SafeType {}; // primary template; `available` defaults to whether DataType<T> has a member named type
+
+template<typename T>
+struct SafeType<T, false> // DataType<T> has no `type` member
+{ enum { value = -1 }; };
+
+template<typename T>
+struct SafeType<T, true> // DataType<T> has a `type` member: forward Type<T>::value
+{ enum { value = Type<T>::value }; };
+
+
+template<typename T, bool available = internal::CheckMember_fmt< DataType<T> >::value >
+struct SafeFmt {}; // primary template; `available` defaults to whether DataType<T> has a member named fmt
+
+template<typename T>
+struct SafeFmt<T, false> // DataType<T> has no `fmt` member
+{ enum { fmt = 0 }; };
+
+template<typename T>
+struct SafeFmt<T, true> // DataType<T> has a `fmt` member: forward it
+{ enum { fmt = DataType<T>::fmt }; };
+
+
+} // namespace
+
+} // cv
+
+#endif // OPENCV_CORE_TRAITS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/types.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/types.hpp
new file mode 100644
index 000000000000..8e56d5dd93a7
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/types.hpp
@@ -0,0 +1,2463 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_TYPES_HPP
+#define OPENCV_CORE_TYPES_HPP
+
+#ifndef __cplusplus
+#  error types.hpp header must be compiled as C++
+#endif
+
+#include <climits>
+#include <cfloat>
+#include <vector>
+#include <limits>
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/cvstd.hpp"
+#include "opencv2/core/matx.hpp"
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4459)  // declaration of '...' hides global declaration
+#endif
+
+namespace cv
+{
+
+//! @addtogroup core_basic
+//! @{
+
+//////////////////////////////// Complex //////////////////////////////
+
+/** @brief  A complex number class.
+
+  The template class is similar to and compatible with std::complex; however, it provides slightly
+  more convenient access to the real and imaginary parts through simple field access, as opposed
+  to std::complex::real() and std::complex::imag().
+*/
+template<typename _Tp> class Complex
+{
+public:
+
+    //! default constructor
+    Complex();
+    Complex( _Tp _re, _Tp _im = 0 ); //!< constructs from a real part and an optional imaginary part (default 0)
+
+    //! conversion to another data type
+    template<typename T2> operator Complex<T2>() const;
+    //! returns the complex conjugate
+    Complex conj() const;
+
+    _Tp re, im; ///< the real and the imaginary parts
+};
+
+typedef Complex<float> Complexf;
+typedef Complex<double> Complexd;
+
+template<typename _Tp> class DataType< Complex<_Tp> >
+{
+public:
+    typedef Complex<_Tp> value_type;
+    typedef value_type   work_type;
+    typedef _Tp          channel_type;
+
+    enum { generic_type = 0,
+           channels     = 2,
+           fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+    };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<typename _Tp>
+struct Depth< Complex<_Tp> > { enum { value = Depth<_Tp>::value }; };
+template<typename _Tp>
+struct Type< Complex<_Tp> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, 2) }; };
+} // namespace
+
+
+//////////////////////////////// Point_ ////////////////////////////////
+
+/** @brief Template class for 2D points specified by its coordinates `x` and `y`.
+
+An instance of the class is interchangeable with C structures, CvPoint and CvPoint2D32f . There is
+also a cast operator to convert point coordinates to the specified type. The conversion from
+floating-point coordinates to integer coordinates is done by rounding. Commonly, the conversion
+uses this operation for each of the coordinates. Besides the class members listed in the
+declaration above, the following operations on points are implemented:
+@code
+    pt1 = pt2 + pt3;
+    pt1 = pt2 - pt3;
+    pt1 = pt2 * a;
+    pt1 = a * pt2;
+    pt1 = pt2 / a;
+    pt1 += pt2;
+    pt1 -= pt2;
+    pt1 *= a;
+    pt1 /= a;
+    double value = norm(pt); // L2 norm
+    pt1 == pt2;
+    pt1 != pt2;
+@endcode
+For your convenience, the following type aliases are defined:
+@code
+    typedef Point_<int> Point2i;
+    typedef Point2i Point;
+    typedef Point_<float> Point2f;
+    typedef Point_<double> Point2d;
+@endcode
+Example:
+@code
+    Point2f a(0.3f, 0.f), b(0.f, 0.4f);
+    Point pt = (a + b)*10.f;
+    cout << pt.x << ", " << pt.y << endl;
+@endcode
+*/
+template<typename _Tp> class Point_
+{
+public:
+    typedef _Tp value_type;
+
+    //! default constructor
+    Point_();
+    Point_(_Tp _x, _Tp _y);
+#if (defined(__GNUC__) && __GNUC__ < 5) && !defined(__clang__)  // GCC 4.x bug. Details: https://github.com/opencv/opencv/pull/20837
+    Point_(const Point_& pt);
+    Point_(Point_&& pt) CV_NOEXCEPT = default;
+#elif OPENCV_ABI_COMPATIBILITY < 500
+    Point_(const Point_& pt) = default;
+    Point_(Point_&& pt) CV_NOEXCEPT = default;
+#endif
+    Point_(const Size_<_Tp>& sz);
+    Point_(const Vec<_Tp, 2>& v);
+
+#if (defined(__GNUC__) && __GNUC__ < 5) && !defined(__clang__)  // GCC 4.x bug. Details: https://github.com/opencv/opencv/pull/20837
+    Point_& operator = (const Point_& pt);
+    Point_& operator = (Point_&& pt) CV_NOEXCEPT = default;
+#elif OPENCV_ABI_COMPATIBILITY < 500
+    Point_& operator = (const Point_& pt) = default;
+    Point_& operator = (Point_&& pt) CV_NOEXCEPT = default;
+#endif
+    //! conversion to another data type
+    template<typename _Tp2> operator Point_<_Tp2>() const;
+
+    //! conversion to cv::Vec<>
+    operator Vec<_Tp, 2>() const;
+
+    //! dot product
+    _Tp dot(const Point_& pt) const;
+    //! dot product computed in double-precision arithmetic
+    double ddot(const Point_& pt) const;
+    //! cross product, returned as a double
+    double cross(const Point_& pt) const;
+    //! checks whether the point is inside the specified rectangle
+    bool inside(const Rect_<_Tp>& r) const;
+    _Tp x; //!< x coordinate of the point
+    _Tp y; //!< y coordinate of the point
+};
+
+typedef Point_<int> Point2i;
+typedef Point_<int64> Point2l;
+typedef Point_<float> Point2f;
+typedef Point_<double> Point2d;
+typedef Point2i Point;
+
+template<typename _Tp> class DataType< Point_<_Tp> >
+{
+public:
+    typedef Point_<_Tp>                               value_type;
+    typedef Point_<typename DataType<_Tp>::work_type> work_type;
+    typedef _Tp                                       channel_type;
+
+    enum { generic_type = 0,
+           channels     = 2,
+           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<typename _Tp>
+struct Depth< Point_<_Tp> > { enum { value = Depth<_Tp>::value }; };
+template<typename _Tp>
+struct Type< Point_<_Tp> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, 2) }; };
+} // namespace
+
+
+//////////////////////////////// Point3_ ////////////////////////////////
+
+/** @brief Template class for 3D points specified by its coordinates `x`, `y` and `z`.
+
+An instance of the class is interchangeable with the C structure CvPoint3D32f . Similarly to
+Point_ , the coordinates of 3D points can be converted to another type. The vector arithmetic and
+comparison operations are also supported.
+
+The following Point3_\<\> aliases are available:
+@code
+    typedef Point3_<int> Point3i;
+    typedef Point3_<float> Point3f;
+    typedef Point3_<double> Point3d;
+@endcode
+@see cv::Point3i, cv::Point3f and cv::Point3d
+*/
+template<typename _Tp> class Point3_
+{
+public:
+    typedef _Tp value_type;
+
+    //! default constructor
+    Point3_();
+    Point3_(_Tp _x, _Tp _y, _Tp _z);
+#if OPENCV_ABI_COMPATIBILITY < 500
+    Point3_(const Point3_& pt) = default;
+    Point3_(Point3_&& pt) CV_NOEXCEPT = default;
+#endif
+    explicit Point3_(const Point_<_Tp>& pt);
+    Point3_(const Vec<_Tp, 3>& v);
+
+#if OPENCV_ABI_COMPATIBILITY < 500
+    Point3_& operator = (const Point3_& pt) = default;
+    Point3_& operator = (Point3_&& pt) CV_NOEXCEPT = default;
+#endif
+    //! conversion to another data type
+    template<typename _Tp2> operator Point3_<_Tp2>() const;
+    //! conversion to cv::Vec<>
+    operator Vec<_Tp, 3>() const;
+
+    //! dot product
+    _Tp dot(const Point3_& pt) const;
+    //! dot product computed in double-precision arithmetics
+    double ddot(const Point3_& pt) const;
+    //! cross product of the 2 3D points
+    Point3_ cross(const Point3_& pt) const;
+    _Tp x; //!< x coordinate of the 3D point
+    _Tp y; //!< y coordinate of the 3D point
+    _Tp z; //!< z coordinate of the 3D point
+};
+
+typedef Point3_<int> Point3i;
+typedef Point3_<float> Point3f;
+typedef Point3_<double> Point3d;
+
+template<typename _Tp> class DataType< Point3_<_Tp> >
+{
+public:
+    typedef Point3_<_Tp>                               value_type;
+    typedef Point3_<typename DataType<_Tp>::work_type> work_type;
+    typedef _Tp                                        channel_type;
+
+    enum { generic_type = 0,
+           channels     = 3,
+           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<typename _Tp>
+struct Depth< Point3_<_Tp> > { enum { value = Depth<_Tp>::value }; };
+template<typename _Tp>
+struct Type< Point3_<_Tp> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, 3) }; };
+} // namespace
+
+//////////////////////////////// Size_ ////////////////////////////////
+
+/** @brief Template class for specifying the size of an image or rectangle.
+
+The class includes two members called width and height. The structure can be converted to and from
+the old OpenCV structures CvSize and CvSize2D32f . The same set of arithmetic and comparison
+operations as for Point_ is available.
+
+OpenCV defines the following Size_\<\> aliases:
+@code
+    typedef Size_<int> Size2i;
+    typedef Size2i Size;
+    typedef Size_<float> Size2f;
+@endcode
+*/
+template<typename _Tp> class Size_
+{
+public:
+    typedef _Tp value_type;
+
+    //! default constructor
+    Size_();
+    Size_(_Tp _width, _Tp _height);
+#if OPENCV_ABI_COMPATIBILITY < 500
+    Size_(const Size_& sz) = default;
+    Size_(Size_&& sz) CV_NOEXCEPT = default;
+#endif
+    Size_(const Point_<_Tp>& pt);
+
+#if OPENCV_ABI_COMPATIBILITY < 500
+    Size_& operator = (const Size_& sz) = default;
+    Size_& operator = (Size_&& sz) CV_NOEXCEPT = default;
+#endif
+    //! the area (width*height)
+    _Tp area() const;
+    //! aspect ratio (width/height)
+    double aspectRatio() const;
+    //! true if empty
+    bool empty() const;
+
+    //! conversion to another data type
+    template<typename _Tp2> operator Size_<_Tp2>() const;
+
+    _Tp width; //!< the width
+    _Tp height; //!< the height
+};
+
+typedef Size_<int> Size2i;
+typedef Size_<int64> Size2l;
+typedef Size_<float> Size2f;
+typedef Size_<double> Size2d;
+typedef Size2i Size;
+
+template<typename _Tp> class DataType< Size_<_Tp> >
+{
+public:
+    typedef Size_<_Tp>                               value_type;
+    typedef Size_<typename DataType<_Tp>::work_type> work_type;
+    typedef _Tp                                      channel_type;
+
+    enum { generic_type = 0,
+           channels     = 2,
+           fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<typename _Tp>
+struct Depth< Size_<_Tp> > { enum { value = Depth<_Tp>::value }; };
+template<typename _Tp>
+struct Type< Size_<_Tp> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, 2) }; };
+} // namespace
+
+//////////////////////////////// Rect_ ////////////////////////////////
+
+/** @brief Template class for 2D rectangles
+
+described by the following parameters:
+-   Coordinates of the top-left corner. This is a default interpretation of Rect_::x and Rect_::y
+    in OpenCV. Though, in your algorithms you may count x and y from the bottom-left corner.
+-   Rectangle width and height.
+
+OpenCV typically assumes that the top and left boundary of the rectangle are inclusive, while the
+right and bottom boundaries are not. For example, the method Rect_::contains returns true if
+
+\f[x  \leq pt.x < x+width,
+      y  \leq pt.y < y+height\f]
+
+Virtually every loop over an image ROI in OpenCV (where ROI is specified by Rect_\<int\> ) is
+implemented as:
+@code
+    for(int y = roi.y; y < roi.y + roi.height; y++)
+        for(int x = roi.x; x < roi.x + roi.width; x++)
+        {
+            // ...
+        }
+@endcode
+In addition to the class members, the following operations on rectangles are implemented:
+-   \f$\texttt{rect} = \texttt{rect} \pm \texttt{point}\f$ (shifting a rectangle by a certain offset)
+-   \f$\texttt{rect} = \texttt{rect} \pm \texttt{size}\f$ (expanding or shrinking a rectangle by a
+    certain amount)
+-   rect += point, rect -= point, rect += size, rect -= size (augmenting operations)
+-   rect = rect1 & rect2 (rectangle intersection)
+-   rect = rect1 | rect2 (minimum area rectangle containing rect1 and rect2 )
+-   rect &= rect1, rect |= rect1 (and the corresponding augmenting operations)
+-   rect == rect1, rect != rect1 (rectangle comparison)
+
+This is an example how the partial ordering on rectangles can be established (rect1 \f$\subseteq\f$
+rect2):
+@code
+    template<typename _Tp> inline bool
+    operator <= (const Rect_<_Tp>& r1, const Rect_<_Tp>& r2)
+    {
+        return (r1 & r2) == r1;
+    }
+@endcode
+For your convenience, the Rect_\<\> alias is available: cv::Rect
+*/
+template<typename _Tp> class Rect_
+{
+public:
+    typedef _Tp value_type;
+
+    //! default constructor
+    Rect_();
+    Rect_(_Tp _x, _Tp _y, _Tp _width, _Tp _height);
+#if OPENCV_ABI_COMPATIBILITY < 500
+    Rect_(const Rect_& r) = default;
+    Rect_(Rect_&& r) CV_NOEXCEPT = default;
+#endif
+    Rect_(const Point_<_Tp>& org, const Size_<_Tp>& sz);
+    Rect_(const Point_<_Tp>& pt1, const Point_<_Tp>& pt2);
+
+#if OPENCV_ABI_COMPATIBILITY < 500
+    Rect_& operator = (const Rect_& r) = default;
+    Rect_& operator = (Rect_&& r) CV_NOEXCEPT = default;
+#endif
+    //! the top-left corner
+    Point_<_Tp> tl() const;
+    //! the bottom-right corner
+    Point_<_Tp> br() const;
+
+    //! size (width, height) of the rectangle
+    Size_<_Tp> size() const;
+    //! area (width*height) of the rectangle
+    _Tp area() const;
+    //! true if empty
+    bool empty() const;
+
+    //! conversion to another data type
+    template<typename _Tp2> operator Rect_<_Tp2>() const;
+
+    //! checks whether the rectangle contains the point
+    bool contains(const Point_<_Tp>& pt) const;
+
+    _Tp x; //!< x coordinate of the top-left corner
+    _Tp y; //!< y coordinate of the top-left corner
+    _Tp width; //!< width of the rectangle
+    _Tp height; //!< height of the rectangle
+};
+
+typedef Rect_<int> Rect2i;
+typedef Rect_<float> Rect2f;
+typedef Rect_<double> Rect2d;
+typedef Rect2i Rect;
+
+template<typename _Tp> class DataType< Rect_<_Tp> >
+{
+public:
+    typedef Rect_<_Tp>                               value_type;
+    typedef Rect_<typename DataType<_Tp>::work_type> work_type;
+    typedef _Tp                                      channel_type;
+
+    enum { generic_type = 0,
+           channels     = 4,
+           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<typename _Tp>
+struct Depth< Rect_<_Tp> > { enum { value = Depth<_Tp>::value }; };
+template<typename _Tp>
+struct Type< Rect_<_Tp> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, 4) }; };
+} // namespace
+
+///////////////////////////// RotatedRect /////////////////////////////
+
+/** @brief The class represents rotated (i.e. not up-right) rectangles on a plane.
+
+Each rectangle is specified by the center point (mass center), length of each side (represented by
+#Size2f structure) and the rotation angle in degrees.
+
+The sample below demonstrates how to use RotatedRect:
+@snippet snippets/core_various.cpp RotatedRect_demo
+![image](pics/rotatedrect.png)
+
+@sa CamShift, fitEllipse, minAreaRect, CvBox2D
+*/
+class CV_EXPORTS_W_SIMPLE RotatedRect
+{
+public:
+    //! default constructor
+    CV_WRAP RotatedRect();
+    /** full constructor
+    @param center The rectangle mass center.
+    @param size Width and height of the rectangle.
+    @param angle The rotation angle in a clockwise direction. When the angle is 0, 90, 180, 270 etc.,
+    the rectangle becomes an up-right rectangle.
+    */
+    CV_WRAP RotatedRect(const Point2f& center, const Size2f& size, float angle);
+    /**
+    Any 3 end points of the RotatedRect. They must be given in order (either clockwise or
+    anticlockwise).
+     */
+    CV_WRAP RotatedRect(const Point2f& point1, const Point2f& point2, const Point2f& point3);
+
+    /** returns 4 vertices of the rotated rectangle
+    @param pts The points array for storing rectangle vertices. The order is _bottomLeft_, _topLeft_, _topRight_, _bottomRight_.
+    @note _Bottom_, _Top_, _Left_ and _Right_ sides refer to the original rectangle (angle is 0),
+    so after 180 degree rotation _bottomLeft_ point will be located at the top right corner of the
+    rectangle.
+    */
+    void points(Point2f pts[]) const;
+
+    CV_WRAP void points(CV_OUT std::vector<Point2f>& pts) const; //!< overload taking a std::vector as the output
+
+    //! returns the minimal up-right integer rectangle containing the rotated rectangle
+    CV_WRAP Rect boundingRect() const;
+    //! returns the minimal (exact) floating point rectangle containing the rotated rectangle, not intended for use with images
+    CV_WRAP Rect2f boundingRect2f() const;
+    //! the rectangle mass center
+    CV_PROP_RW Point2f center;
+    //! width and height of the rectangle
+    CV_PROP_RW Size2f size;
+    //! the rotation angle in degrees. When the angle is 0, 90, 180, 270 etc., the rectangle becomes an up-right rectangle.
+    CV_PROP_RW float angle;
+};
+
+template<> class DataType< RotatedRect >
+{
+public:
+    typedef RotatedRect  value_type;
+    typedef value_type   work_type;
+    typedef float        channel_type;
+
+    enum { generic_type = 0,
+           channels     = (int)sizeof(value_type)/sizeof(channel_type), // 5
+           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<>
+struct Depth< RotatedRect > { enum { value = Depth<float>::value }; };
+template<>
+struct Type< RotatedRect > { enum { value = CV_MAKETYPE(Depth<float>::value, (int)sizeof(RotatedRect)/sizeof(float)) }; };
+} // namespace
+
+
+//////////////////////////////// Range /////////////////////////////////
+
+/** @brief Template class specifying a continuous subsequence (slice) of a sequence.
+
+The class is used to specify a row or a column span in a matrix ( Mat ) and for many other purposes.
+Range(a,b) is basically the same as a:b in Matlab or a..b in Python. As in Python, start is an
+inclusive left boundary of the range and end is an exclusive right boundary of the range. Such a
+half-opened interval is usually denoted as \f$[start,end)\f$ .
+
+The static method Range::all() returns a special variable that means "the whole sequence" or "the
+whole range", just like " : " in Matlab or " ... " in Python. All the methods and functions in
+OpenCV that take Range support this special Range::all() value. But, of course, in case of your own
+custom processing, you will probably have to check and handle it explicitly:
+@code
+    void my_function(..., const Range& r, ....)
+    {
+        if(r == Range::all()) {
+            // process all the data
+        }
+        else {
+            // process [r.start, r.end)
+        }
+    }
+@endcode
+*/
+class CV_EXPORTS Range
+{
+public:
+    Range();
+    Range(int _start, int _end); //!< range with the given boundaries; presumably stored in start/end (definition not in this header chunk)
+    int size() const;
+    bool empty() const;
+    static Range all(); //!< special value meaning "the whole sequence"
+
+    int start, end; //!< inclusive left boundary and exclusive right boundary of the range
+};
+
+template<> class DataType<Range>
+{
+public:
+    typedef Range      value_type;
+    typedef value_type work_type;
+    typedef int        channel_type;
+
+    enum { generic_type = 0,
+           channels     = 2,
+           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<>
+struct Depth< Range > { enum { value = Depth<int>::value }; };
+template<>
+struct Type< Range > { enum { value = CV_MAKETYPE(Depth<int>::value, 2) }; };
+} // namespace
+
+
+//////////////////////////////// Scalar_ ///////////////////////////////
+
+/** @brief Template class for a 4-element vector derived from Vec.
+
+Being derived from Vec\<_Tp, 4\> , Scalar\_ and Scalar can be used just as typical 4-element
+vectors. In addition, they can be converted to/from CvScalar . The type Scalar is widely used in
+OpenCV to pass pixel values.
+*/
+template<typename _Tp> class Scalar_ : public Vec<_Tp, 4>
+{
+public:
+    //! default constructor
+    Scalar_();
+    Scalar_(_Tp v0, _Tp v1, _Tp v2=0, _Tp v3=0);
+    Scalar_(_Tp v0);
+
+    Scalar_(const Scalar_& s);
+    Scalar_(Scalar_&& s) CV_NOEXCEPT;
+
+    Scalar_& operator=(const Scalar_& s);
+    Scalar_& operator=(Scalar_&& s) CV_NOEXCEPT;
+
+    template<typename _Tp2, int cn>
+    Scalar_(const Vec<_Tp2, cn>& v);
+
+    //! returns a scalar with all elements set to v0
+    static Scalar_<_Tp> all(_Tp v0);
+
+    //! conversion to another data type
+    template<typename T2> operator Scalar_<T2>() const;
+
+    //! per-element product
+    Scalar_<_Tp> mul(const Scalar_<_Tp>& a, double scale=1 ) const;
+
+    //! returns (v0, -v1, -v2, -v3)
+    Scalar_<_Tp> conj() const;
+
+    //! returns true iff v1 == v2 == v3 == 0
+    bool isReal() const;
+};
+
+typedef Scalar_<double> Scalar;
+
+template<typename _Tp> class DataType< Scalar_<_Tp> >
+{
+public:
+    typedef Scalar_<_Tp>                               value_type;
+    typedef Scalar_<typename DataType<_Tp>::work_type> work_type;
+    typedef _Tp                                        channel_type;
+
+    enum { generic_type = 0,
+           channels     = 4,
+           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<typename _Tp>
+struct Depth< Scalar_<_Tp> > { enum { value = Depth<_Tp>::value }; };
+template<typename _Tp>
+struct Type< Scalar_<_Tp> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, 4) }; };
+} // namespace
+
+
+/////////////////////////////// KeyPoint ////////////////////////////////
+
+/** @brief Data structure for salient point detectors.
+
+The class instance stores a keypoint, i.e. a point feature found by one of many available keypoint
+detectors, such as Harris corner detector, #FAST, %StarDetector, %SURF, %SIFT etc.
+
+The keypoint is characterized by the 2D position, scale (proportional to the diameter of the
+neighborhood that needs to be taken into account), orientation and some other parameters. The
+keypoint neighborhood is then analyzed by another algorithm that builds a descriptor (usually
+represented as a feature vector). The keypoints representing the same object in different images
+can then be matched using %KDTree or another method.
+*/
+class CV_EXPORTS_W_SIMPLE KeyPoint
+{
+public:
+    //! the default constructor
+    CV_WRAP KeyPoint();
+    /**
+    @param pt x & y coordinates of the keypoint
+    @param size keypoint diameter
+    @param angle keypoint orientation
+    @param response keypoint detector response on the keypoint (that is, strength of the keypoint)
+    @param octave pyramid octave in which the keypoint has been detected
+    @param class_id object id
+     */
+    KeyPoint(Point2f pt, float size, float angle=-1, float response=0, int octave=0, int class_id=-1);
+    /**
+    @param x x-coordinate of the keypoint
+    @param y y-coordinate of the keypoint
+    @param size keypoint diameter
+    @param angle keypoint orientation
+    @param response keypoint detector response on the keypoint (that is, strength of the keypoint)
+    @param octave pyramid octave in which the keypoint has been detected
+    @param class_id object id
+     */
+    CV_WRAP KeyPoint(float x, float y, float size, float angle=-1, float response=0, int octave=0, int class_id=-1);
+
+    size_t hash() const;
+
+    /**
+    This method converts vector of keypoints to vector of points or the reverse, where each keypoint is
+    assigned the same size and the same orientation.
+
+    @param keypoints Keypoints obtained from any feature detection algorithm like SIFT/SURF/ORB
+    @param points2f Array of (x,y) coordinates of each keypoint
+    @param keypointIndexes Array of indexes of keypoints to be converted to points. (Acts like a mask to
+    convert only specified keypoints)
+    */
+    CV_WRAP static void convert(const std::vector<KeyPoint>& keypoints,
+                                CV_OUT std::vector<Point2f>& points2f,
+                                const std::vector<int>& keypointIndexes=std::vector<int>());
+    /** @overload
+    @param points2f Array of (x,y) coordinates of each keypoint
+    @param keypoints Keypoints obtained from any feature detection algorithm like SIFT/SURF/ORB
+    @param size keypoint diameter
+    @param response keypoint detector response on the keypoint (that is, strength of the keypoint)
+    @param octave pyramid octave in which the keypoint has been detected
+    @param class_id object id
+    */
+    CV_WRAP static void convert(const std::vector<Point2f>& points2f,
+                                CV_OUT std::vector<KeyPoint>& keypoints,
+                                float size=1, float response=1, int octave=0, int class_id=-1);
+
+    /**
+    This method computes overlap for pair of keypoints. Overlap is the ratio between area of keypoint
+    regions' intersection and area of keypoint regions' union (considering keypoint region as circle).
+    If they don't overlap, we get zero. If they coincide at same location with same size, we get 1.
+    @param kp1 First keypoint
+    @param kp2 Second keypoint
+    */
+    CV_WRAP static float overlap(const KeyPoint& kp1, const KeyPoint& kp2);
+
+    CV_PROP_RW Point2f pt; //!< coordinates of the keypoint
+    CV_PROP_RW float size; //!< diameter of the meaningful keypoint neighborhood
+    CV_PROP_RW float angle; //!< computed orientation of the keypoint (-1 if not applicable);
+                            //!< it's in [0,360) degrees and measured relative to
+                            //!< image coordinate system, i.e. clockwise.
+    CV_PROP_RW float response; //!< the response by which the strongest keypoints have been selected. Can be used for further sorting or subsampling
+    CV_PROP_RW int octave; //!< octave (pyramid layer) from which the keypoint has been extracted
+    CV_PROP_RW int class_id; //!< object class (if the keypoints need to be clustered by an object they belong to)
+};
+
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+template<> class DataType<KeyPoint>
+{
+public:
+    typedef KeyPoint      value_type;
+    typedef float         work_type;
+    typedef float         channel_type;
+
+    enum { generic_type = 0,
+           depth        = DataType<channel_type>::depth,
+           channels     = (int)(sizeof(value_type)/sizeof(channel_type)), // 7
+           fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8),
+           type         = CV_MAKETYPE(depth, channels)
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+#endif
+
+
+//////////////////////////////// DMatch /////////////////////////////////
+
+/** @brief Class for matching keypoint descriptors
+
+query descriptor index, train descriptor index, train image index, and distance between
+descriptors.
+*/
+class CV_EXPORTS_W_SIMPLE DMatch
+{
+public:
+    CV_WRAP DMatch();
+    CV_WRAP DMatch(int _queryIdx, int _trainIdx, float _distance);
+    CV_WRAP DMatch(int _queryIdx, int _trainIdx, int _imgIdx, float _distance);
+
+    CV_PROP_RW int queryIdx; //!< query descriptor index
+    CV_PROP_RW int trainIdx; //!< train descriptor index
+    CV_PROP_RW int imgIdx;   //!< train image index
+
+    CV_PROP_RW float distance; //!< distance between descriptors
+
+    // less is better
+    bool operator<(const DMatch &m) const;
+};
+
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+template<> class DataType<DMatch>
+{
+public:
+    typedef DMatch      value_type;
+    typedef int         work_type;
+    typedef int         channel_type;
+
+    enum { generic_type = 0,
+           depth        = DataType<channel_type>::depth,
+           channels     = (int)(sizeof(value_type)/sizeof(channel_type)), // 4
+           fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8),
+           type         = CV_MAKETYPE(depth, channels)
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+#endif
+
+
+///////////////////////////// TermCriteria //////////////////////////////
+
+/** @brief The class defining termination criteria for iterative algorithms.
+
+You can initialize it by default constructor and then override any parameters, or the structure may
+be fully initialized using the advanced variant of the constructor.
+*/
+class CV_EXPORTS TermCriteria
+{
+public:
+    /**
+      Criteria type, can be one of: COUNT, EPS or COUNT + EPS
+    */
+    enum Type
+    {
+        COUNT=1, //!< the maximum number of iterations or elements to compute
+        MAX_ITER=COUNT, //!< ditto
+        EPS=2 //!< the desired accuracy or change in parameters at which the iterative algorithm stops
+    };
+
+    //! default constructor
+    TermCriteria();
+    /**
+    @param type The type of termination criteria, one of TermCriteria::Type
+    @param maxCount The maximum number of iterations or elements to compute.
+    @param epsilon The desired accuracy or change in parameters at which the iterative algorithm stops.
+    */
+    TermCriteria(int type, int maxCount, double epsilon);
+
+    inline bool isValid() const //!< true if at least one of the requested criteria is usable
+    {
+        const bool isCount = (type & COUNT) && maxCount > 0; // COUNT bit set and a positive iteration limit
+        const bool isEps = (type & EPS) && !cvIsNaN(epsilon); // EPS bit set and epsilon is not NaN
+        return isCount || isEps;
+    }
+
+    int type; //!< the type of termination criteria: COUNT, EPS or COUNT + EPS
+    int maxCount; //!< the maximum number of iterations/elements
+    double epsilon; //!< the desired accuracy
+};
+
+
+//! @} core_basic
+
+///////////////////////// raster image moments //////////////////////////
+
+//! @addtogroup imgproc_shape
+//! @{
+
+/** @brief struct returned by cv::moments
+
+The spatial moments \f$\texttt{Moments::m}_{ji}\f$ are computed as:
+
+\f[\texttt{m} _{ji}= \sum _{x,y}  \left ( \texttt{array} (x,y)  \cdot x^j  \cdot y^i \right )\f]
+
+The central moments \f$\texttt{Moments::mu}_{ji}\f$ are computed as:
+
+\f[\texttt{mu} _{ji}= \sum _{x,y}  \left ( \texttt{array} (x,y)  \cdot (x -  \bar{x} )^j  \cdot (y -  \bar{y} )^i \right )\f]
+
+where \f$(\bar{x}, \bar{y})\f$ is the mass center:
+
+\f[\bar{x} = \frac{\texttt{m}_{10}}{\texttt{m}_{00}} , \; \bar{y} = \frac{\texttt{m}_{01}}{\texttt{m}_{00}}\f]
+
+The normalized central moments \f$\texttt{Moments::nu}_{ji}\f$ are computed as:
+
+\f[\texttt{nu} _{ji}= \frac{\texttt{mu}_{ji}}{\texttt{m}_{00}^{(i+j)/2+1}} .\f]
+
+@note
+\f$\texttt{mu}_{00}=\texttt{m}_{00}\f$, \f$\texttt{nu}_{00}=1\f$,
+\f$\texttt{nu}_{10}=\texttt{nu}_{01}=\texttt{mu}_{10}=\texttt{mu}_{01}=0\f$, hence the values are not
+stored.
+
+The moments of a contour are defined in the same way but computed using the Green's formula (see
+<http://en.wikipedia.org/wiki/Green_theorem>). So, due to a limited raster resolution, the moments
+computed for a contour are slightly different from the moments computed for the same rasterized
+contour.
+
+@note
+Since the contour moments are computed using Green formula, you may get seemingly odd results for
+contours with self-intersections, e.g. a zero area (m00) for butterfly-shaped contours.
+ */
+class CV_EXPORTS_W_MAP Moments
+{
+public:
+    //! the default constructor
+    Moments();
+    //! the full constructor, taking one value per spatial moment (m00 ... m03)
+    Moments(double m00, double m10, double m01, double m20, double m11,
+            double m02, double m30, double m21, double m12, double m03 );
+    ////! the conversion from CvMoments (disabled)
+    //Moments( const CvMoments& moments );
+    ////! the conversion to CvMoments (disabled)
+    //operator CvMoments() const;
+
+    //! @name spatial moments
+    //! @{
+    CV_PROP_RW double  m00, m10, m01, m20, m11, m02, m30, m21, m12, m03;
+    //! @}
+
+    //! @name central moments
+    //! @{
+    CV_PROP_RW double  mu20, mu11, mu02, mu30, mu21, mu12, mu03;
+    //! @}
+
+    //! @name central normalized moments
+    //! @{
+    CV_PROP_RW double  nu20, nu11, nu02, nu30, nu21, nu12, nu03;
+    //! @}
+};
+
+template<> class DataType<Moments> // traits describing Moments as a vector of double channels
+{
+public:
+    typedef Moments     value_type;
+    typedef double      work_type;
+    typedef double      channel_type;
+
+    enum { generic_type = 0,
+           channels     = (int)(sizeof(value_type)/sizeof(channel_type)), // 24 (10 spatial + 7 central + 7 normalized)
+           fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth // only with the deprecated traits enabled
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<> // Moments reports the same depth as double
+struct Depth< Moments > { enum { value = Depth<double>::value }; };
+template<> // type = depth of double combined with sizeof(Moments)/sizeof(double) channels
+struct Type< Moments > { enum { value = CV_MAKETYPE(Depth<double>::value, (int)(sizeof(Moments)/sizeof(double))) }; };
+} // namespace traits
+
+//! @} imgproc_shape
+
+//! @cond IGNORED
+
+/////////////////////////////////////////////////////////////////////////
+///////////////////////////// Implementation ////////////////////////////
+/////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////// Complex ////////////////////////////////
+
+template<typename _Tp> inline
+Complex<_Tp>::Complex()
+    : re(0), im(0) {} // default: both parts are zero
+
+template<typename _Tp> inline
+Complex<_Tp>::Complex( _Tp _re, _Tp _im )
+    : re(_re), im(_im) {} // stores the given real and imaginary parts
+
+template<typename _Tp> template<typename T2> inline
+Complex<_Tp>::operator Complex<T2>() const
+{   // each part goes through saturate_cast to the target element type
+    return Complex<T2>(saturate_cast<T2>(this->re), saturate_cast<T2>(this->im));
+}
+
+template<typename _Tp> inline
+Complex<_Tp> Complex<_Tp>::conj() const
+{   // complex conjugate: same real part, negated imaginary part
+    return Complex<_Tp>(this->re, -this->im);
+}
+
+
+template<typename _Tp> static inline
+bool operator == (const Complex<_Tp>& lhs, const Complex<_Tp>& rhs)
+{   // equal only when both the real and the imaginary parts match
+    return (lhs.re == rhs.re) && (lhs.im == rhs.im);
+}
+
+template<typename _Tp> static inline
+bool operator != (const Complex<_Tp>& lhs, const Complex<_Tp>& rhs)
+{   // different as soon as either part differs
+    return (lhs.re != rhs.re) || (lhs.im != rhs.im);
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator + (const Complex<_Tp>& lhs, const Complex<_Tp>& rhs)
+{   // part-wise sum
+    return Complex<_Tp>( lhs.re + rhs.re, lhs.im + rhs.im );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp>& operator += (Complex<_Tp>& lhs, const Complex<_Tp>& rhs)
+{   // accumulates rhs into lhs in place
+    lhs.re += rhs.re; lhs.im += rhs.im;
+    return lhs;
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator - (const Complex<_Tp>& lhs, const Complex<_Tp>& rhs)
+{   // part-wise difference
+    return Complex<_Tp>( lhs.re - rhs.re, lhs.im - rhs.im );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp>& operator -= (Complex<_Tp>& lhs, const Complex<_Tp>& rhs)
+{   // subtracts rhs from lhs in place
+    lhs.re -= rhs.re; lhs.im -= rhs.im;
+    return lhs;
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator - (const Complex<_Tp>& z)
+{   // negates both parts
+    return Complex<_Tp>(-z.re, -z.im);
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator * (const Complex<_Tp>& lhs, const Complex<_Tp>& rhs)
+{   // (a + bi)(c + di) = (ac - bd) + (ad + bc)i
+    return Complex<_Tp>( lhs.re*rhs.re - lhs.im*rhs.im, lhs.re*rhs.im + lhs.im*rhs.re );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator * (const Complex<_Tp>& z, _Tp s)
+{   // a real scalar scales both parts
+    return Complex<_Tp>( z.re*s, z.im*s );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator * (_Tp s, const Complex<_Tp>& z)
+{   // same product with the scalar written first
+    return Complex<_Tp>( z.re*s, z.im*s );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator + (const Complex<_Tp>& z, _Tp s)
+{   // a real scalar only shifts the real part
+    return Complex<_Tp>( z.re + s, z.im );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator - (const Complex<_Tp>& z, _Tp s)
+{ return Complex<_Tp>( z.re - s, z.im ); }
+
+template<typename _Tp> static inline
+Complex<_Tp> operator + (_Tp s, const Complex<_Tp>& z)
+{   // same sum with the scalar written first
+    return Complex<_Tp>( z.re + s, z.im );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator - (_Tp s, const Complex<_Tp>& z)
+{   // s - z: the real part becomes s - re, the imaginary part is negated
+    return Complex<_Tp>( s - z.re, -z.im );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp>& operator += (Complex<_Tp>& z, _Tp s)
+{   // only the real part changes
+    z.re += s; return z;
+}
+
+template<typename _Tp> static inline
+Complex<_Tp>& operator -= (Complex<_Tp>& z, _Tp s)
+{   // only the real part changes
+    z.re -= s; return z;
+}
+
+template<typename _Tp> static inline
+Complex<_Tp>& operator *= (Complex<_Tp>& z, _Tp s)
+{   // both parts are scaled
+    z.re *= s; z.im *= s; return z;
+}
+
+template<typename _Tp> static inline
+double abs(const Complex<_Tp>& z)
+{   // modulus sqrt(re^2 + im^2), accumulated in double
+    return std::sqrt( static_cast<double>(z.re)*z.re + static_cast<double>(z.im)*z.im);
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator / (const Complex<_Tp>& num, const Complex<_Tp>& den)
+{   // num * conj(den) scaled by 1 / |den|^2
+    double invNormSqr = 1./((double)den.re*den.re + (double)den.im*den.im);
+    return Complex<_Tp>( (_Tp)((num.re*den.re + num.im*den.im)*invNormSqr),
+                        (_Tp)((-num.re*den.im + num.im*den.re)*invNormSqr) );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp>& operator /= (Complex<_Tp>& num, const Complex<_Tp>& den)
+{   // delegates to the binary operator / and stores the quotient back
+    num = num / den;
+    return num;
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator / (const Complex<_Tp>& z, _Tp s)
+{   // multiplies both parts by the reciprocal of s
+    _Tp inv = (_Tp)1/s;
+    return Complex<_Tp>( z.re*inv, z.im*inv );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator / (_Tp s, const Complex<_Tp>& z)
+{   // builds a Complex from the scalar and reuses complex division
+    return Complex<_Tp>(s)/z;
+}
+
+template<typename _Tp> static inline
+Complex<_Tp>& operator /= (Complex<_Tp>& a, _Tp b)
+{   // divides a in place by the real scalar b; a must be mutable for the update to compile
+    _Tp t = (_Tp)1/b; // multiply by the reciprocal instead of dividing each part
+    a.re *= t; a.im *= t; return a;
+}
+
+
+
+//////////////////////////////// 2D Point ///////////////////////////////
+
+template<typename _Tp> inline
+Point_<_Tp>::Point_()
+    : x(0), y(0) {} // default: the origin
+
+template<typename _Tp> inline
+Point_<_Tp>::Point_(_Tp _x, _Tp _y)
+    : x(_x), y(_y) {} // from explicit coordinates
+
+#if (defined(__GNUC__) && __GNUC__ < 5) && !defined(__clang__)  // GCC 4.x bug. Details: https://github.com/opencv/opencv/pull/20837
+template<typename _Tp> inline
+Point_<_Tp>::Point_(const Point_& pt)
+    : x(pt.x), y(pt.y) {} // hand-written copy constructor, compiled only for GCC 4.x
+#endif
+
+template<typename _Tp> inline
+Point_<_Tp>::Point_(const Size_<_Tp>& sz)
+    : x(sz.width), y(sz.height) {} // width -> x, height -> y
+
+template<typename _Tp> inline
+Point_<_Tp>::Point_(const Vec<_Tp,2>& v)
+    : x(v[0]), y(v[1]) {} // element 0 -> x, element 1 -> y
+
+#if (defined(__GNUC__) && __GNUC__ < 5) && !defined(__clang__)  // GCC 4.x bug. Details: https://github.com/opencv/opencv/pull/20837
+template<typename _Tp> inline
+Point_<_Tp>& Point_<_Tp>::operator = (const Point_& pt)
+{   // hand-written copy assignment, compiled only for GCC 4.x
+    x = pt.x; y = pt.y;
+    return *this;
+}
+#endif
+
+template<typename _Tp> template<typename _Tp2> inline
+Point_<_Tp>::operator Point_<_Tp2>() const
+{   // converts each coordinate with saturate_cast
+    return Point_<_Tp2>(saturate_cast<_Tp2>(this->x), saturate_cast<_Tp2>(this->y));
+}
+
+template<typename _Tp> inline
+Point_<_Tp>::operator Vec<_Tp, 2>() const
+{   // packs (x, y) into a two-element Vec
+    return Vec<_Tp, 2>(this->x, this->y);
+}
+
+template<typename _Tp> inline
+_Tp Point_<_Tp>::dot(const Point_& rhs) const
+{   // dot product evaluated in _Tp arithmetic, then saturate_cast to _Tp
+    return saturate_cast<_Tp>(this->x*rhs.x + this->y*rhs.y);
+}
+
+template<typename _Tp> inline
+double Point_<_Tp>::ddot(const Point_& rhs) const
+{   // dot product with every operand converted to double first
+    return static_cast<double>(x)*static_cast<double>(rhs.x) + static_cast<double>(y)*static_cast<double>(rhs.y);
+}
+
+template<typename _Tp> inline
+double Point_<_Tp>::cross(const Point_& rhs) const
+{   // 2D cross product x*rhs.y - y*rhs.x, evaluated in double
+    return static_cast<double>(x)*rhs.y - static_cast<double>(y)*rhs.x;
+}
+
+template<typename _Tp> inline bool
+Point_<_Tp>::inside( const Rect_<_Tp>& rect ) const
+{   // forwards to Rect_::contains
+    return rect.contains(*this);
+}
+
+
+template<typename _Tp> static inline
+Point_<_Tp>& operator += (Point_<_Tp>& pt, const Point_<_Tp>& delta)
+{   // in-place coordinate-wise addition
+    pt.x += delta.x;
+    pt.y += delta.y;
+    return pt;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp>& operator -= (Point_<_Tp>& pt, const Point_<_Tp>& delta)
+{   // in-place coordinate-wise subtraction
+    pt.x -= delta.x;
+    pt.y -= delta.y;
+    return pt;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp>& operator *= (Point_<_Tp>& pt, int scale)
+{   // scales each coordinate, saturating back to _Tp
+    pt.x = saturate_cast<_Tp>(pt.x * scale);
+    pt.y = saturate_cast<_Tp>(pt.y * scale);
+    return pt;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp>& operator *= (Point_<_Tp>& pt, float scale)
+{   // float overload of the in-place scaling
+    pt.x = saturate_cast<_Tp>(pt.x * scale);
+    pt.y = saturate_cast<_Tp>(pt.y * scale);
+    return pt;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp>& operator *= (Point_<_Tp>& pt, double scale)
+{   // double overload of the in-place scaling
+    pt.x = saturate_cast<_Tp>(pt.x * scale);
+    pt.y = saturate_cast<_Tp>(pt.y * scale);
+    return pt;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp>& operator /= (Point_<_Tp>& pt, int divisor)
+{   // divides each coordinate, saturating back to _Tp
+    pt.x = saturate_cast<_Tp>(pt.x / divisor);
+    pt.y = saturate_cast<_Tp>(pt.y / divisor);
+    return pt;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp>& operator /= (Point_<_Tp>& pt, float divisor)
+{   // float overload of the in-place division
+    pt.x = saturate_cast<_Tp>(pt.x / divisor);
+    pt.y = saturate_cast<_Tp>(pt.y / divisor);
+    return pt;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp>& operator /= (Point_<_Tp>& pt, double divisor)
+{   // double overload of the in-place division
+    pt.x = saturate_cast<_Tp>(pt.x / divisor);
+    pt.y = saturate_cast<_Tp>(pt.y / divisor);
+    return pt;
+}
+
+template<typename _Tp> static inline
+double norm(const Point_<_Tp>& p)
+{   // Euclidean length sqrt(x^2 + y^2), accumulated in double
+    return std::sqrt(static_cast<double>(p.x)*p.x + static_cast<double>(p.y)*p.y);
+}
+
+template<typename _Tp> static inline
+bool operator == (const Point_<_Tp>& lhs, const Point_<_Tp>& rhs)
+{   // equal only when both coordinates match
+    return (lhs.x == rhs.x) && (lhs.y == rhs.y);
+}
+
+template<typename _Tp> static inline
+bool operator != (const Point_<_Tp>& lhs, const Point_<_Tp>& rhs)
+{   // different as soon as either coordinate differs
+    return (lhs.x != rhs.x) || (lhs.y != rhs.y);
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator + (const Point_<_Tp>& lhs, const Point_<_Tp>& rhs)
+{   // coordinate-wise sum, each coordinate passed through saturate_cast
+    return Point_<_Tp>( saturate_cast<_Tp>(lhs.x + rhs.x), saturate_cast<_Tp>(lhs.y + rhs.y) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator - (const Point_<_Tp>& lhs, const Point_<_Tp>& rhs)
+{   // coordinate-wise difference, each coordinate passed through saturate_cast
+    return Point_<_Tp>( saturate_cast<_Tp>(lhs.x - rhs.x), saturate_cast<_Tp>(lhs.y - rhs.y) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator - (const Point_<_Tp>& pt)
+{   // negates both coordinates
+    return Point_<_Tp>( saturate_cast<_Tp>(-pt.x), saturate_cast<_Tp>(-pt.y) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator * (const Point_<_Tp>& pt, int k)
+{   // point times int factor
+    return Point_<_Tp>( saturate_cast<_Tp>(pt.x*k), saturate_cast<_Tp>(pt.y*k) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator * (int k, const Point_<_Tp>& pt)
+{   // int factor times point
+    return Point_<_Tp>( saturate_cast<_Tp>(pt.x*k), saturate_cast<_Tp>(pt.y*k) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator * (const Point_<_Tp>& pt, float k)
+{   // point times float factor
+    return Point_<_Tp>( saturate_cast<_Tp>(pt.x*k), saturate_cast<_Tp>(pt.y*k) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator * (float k, const Point_<_Tp>& pt)
+{   // float factor times point
+    return Point_<_Tp>( saturate_cast<_Tp>(pt.x*k), saturate_cast<_Tp>(pt.y*k) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator * (const Point_<_Tp>& pt, double k)
+{   // point times double factor
+    return Point_<_Tp>( saturate_cast<_Tp>(pt.x*k), saturate_cast<_Tp>(pt.y*k) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator * (double k, const Point_<_Tp>& pt)
+{   // double factor times point
+    return Point_<_Tp>( saturate_cast<_Tp>(pt.x*k), saturate_cast<_Tp>(pt.y*k) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator * (const Matx<_Tp, 2, 2>& m, const Point_<_Tp>& pt)
+{   // 2x2 matrix applied to the point taken as the column vector (x, y)
+    Matx<_Tp, 2, 1> product = m * Vec<_Tp,2>(pt.x, pt.y);
+    return Point_<_Tp>(product.val[0], product.val[1]);
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator * (const Matx<_Tp, 3, 3>& m, const Point_<_Tp>& pt)
+{   // 3x3 matrix applied to (x, y, 1); all three result components are returned
+    Matx<_Tp, 3, 1> product = m * Vec<_Tp,3>(pt.x, pt.y, 1);
+    return Point3_<_Tp>(product.val[0], product.val[1], product.val[2]);
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator / (const Point_<_Tp>& pt, int divisor)
+{   // divides a copy via the matching operator /=
+    Point_<_Tp> quotient(pt);
+    quotient /= divisor;
+    return quotient;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator / (const Point_<_Tp>& pt, float divisor)
+{   // float overload, same copy-then-divide approach
+    Point_<_Tp> quotient(pt);
+    quotient /= divisor;
+    return quotient;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator / (const Point_<_Tp>& pt, double divisor)
+{   // double overload, same copy-then-divide approach
+    Point_<_Tp> quotient(pt);
+    quotient /= divisor;
+    return quotient;
+}
+
+
+template<typename _AccTp> static inline _AccTp normL2Sqr(const Point_<int>& pt); // squared L2 norm, accumulated in _AccTp
+template<typename _AccTp> static inline _AccTp normL2Sqr(const Point_<int64>& pt);
+template<typename _AccTp> static inline _AccTp normL2Sqr(const Point_<float>& pt);
+template<typename _AccTp> static inline _AccTp normL2Sqr(const Point_<double>& pt);
+
+template<> inline int normL2Sqr<int>(const Point_<int>& pt) { return pt.dot(pt); } // same-type accumulators use dot()
+template<> inline int64 normL2Sqr<int64>(const Point_<int64>& pt) { return pt.dot(pt); }
+template<> inline float normL2Sqr<float>(const Point_<float>& pt) { return pt.dot(pt); }
+template<> inline double normL2Sqr<double>(const Point_<int>& pt) { return pt.ddot(pt); } // ddot: accumulate in double so x*x + y*y cannot overflow int
+
+template<> inline double normL2Sqr<double>(const Point_<float>& pt) { return pt.ddot(pt); } // double accumulators use ddot()
+template<> inline double normL2Sqr<double>(const Point_<double>& pt) { return pt.ddot(pt); }
+
+
+
+//////////////////////////////// 3D Point ///////////////////////////////
+
+template<typename _Tp> inline
+Point3_<_Tp>::Point3_()
+    : x(0), y(0), z(0) {} // default: the origin
+
+template<typename _Tp> inline
+Point3_<_Tp>::Point3_(_Tp _x, _Tp _y, _Tp _z)
+    : x(_x), y(_y), z(_z) {} // from explicit coordinates
+
+template<typename _Tp> inline
+Point3_<_Tp>::Point3_(const Point_<_Tp>& pt)
+    : x(pt.x), y(pt.y), z(_Tp()) {} // from a 2D point; z is value-initialized
+
+template<typename _Tp> inline
+Point3_<_Tp>::Point3_(const Vec<_Tp, 3>& v)
+    : x(v[0]), y(v[1]), z(v[2]) {} // elements 0, 1, 2 -> x, y, z
+
+template<typename _Tp> template<typename _Tp2> inline
+Point3_<_Tp>::operator Point3_<_Tp2>() const
+{   // converts each coordinate with saturate_cast
+    return Point3_<_Tp2>(saturate_cast<_Tp2>(this->x), saturate_cast<_Tp2>(this->y), saturate_cast<_Tp2>(this->z));
+}
+
+template<typename _Tp> inline
+Point3_<_Tp>::operator Vec<_Tp, 3>() const
+{   // packs (x, y, z) into a three-element Vec
+    return Vec<_Tp, 3>(this->x, this->y, this->z);
+}
+
+template<typename _Tp> inline
+_Tp Point3_<_Tp>::dot(const Point3_& rhs) const
+{   // dot product evaluated in _Tp arithmetic, then saturate_cast to _Tp
+    return saturate_cast<_Tp>(this->x*rhs.x + this->y*rhs.y + this->z*rhs.z);
+}
+
+template<typename _Tp> inline
+double Point3_<_Tp>::ddot(const Point3_& rhs) const
+{   // dot product with this point's coordinates converted to double first
+    return static_cast<double>(x)*rhs.x + static_cast<double>(y)*rhs.y + static_cast<double>(z)*rhs.z;
+}
+
+template<typename _Tp> inline
+Point3_<_Tp> Point3_<_Tp>::cross(const Point3_<_Tp>& rhs) const
+{   // 3D cross product (this x rhs)
+    return Point3_<_Tp>(this->y*rhs.z - this->z*rhs.y, this->z*rhs.x - this->x*rhs.z, this->x*rhs.y - this->y*rhs.x);
+}
+
+
+template<typename _Tp> static inline
+Point3_<_Tp>& operator += (Point3_<_Tp>& pt, const Point3_<_Tp>& delta)
+{   // in-place coordinate-wise addition
+    pt.x += delta.x;
+    pt.y += delta.y;
+    pt.z += delta.z;
+    return pt;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp>& operator -= (Point3_<_Tp>& pt, const Point3_<_Tp>& delta)
+{   // in-place coordinate-wise subtraction
+    pt.x -= delta.x;
+    pt.y -= delta.y;
+    pt.z -= delta.z;
+    return pt;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp>& operator *= (Point3_<_Tp>& pt, int scale)
+{   // scales each coordinate, saturating back to _Tp
+    pt.x = saturate_cast<_Tp>(pt.x * scale);
+    pt.y = saturate_cast<_Tp>(pt.y * scale);
+    pt.z = saturate_cast<_Tp>(pt.z * scale);
+    return pt;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp>& operator *= (Point3_<_Tp>& pt, float scale)
+{   // float overload of the in-place scaling
+    pt.x = saturate_cast<_Tp>(pt.x * scale);
+    pt.y = saturate_cast<_Tp>(pt.y * scale);
+    pt.z = saturate_cast<_Tp>(pt.z * scale);
+    return pt;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp>& operator *= (Point3_<_Tp>& pt, double scale)
+{   // double overload of the in-place scaling
+    pt.x = saturate_cast<_Tp>(pt.x * scale);
+    pt.y = saturate_cast<_Tp>(pt.y * scale);
+    pt.z = saturate_cast<_Tp>(pt.z * scale);
+    return pt;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp>& operator /= (Point3_<_Tp>& pt, int divisor)
+{   // divides each coordinate, saturating back to _Tp
+    pt.x = saturate_cast<_Tp>(pt.x / divisor);
+    pt.y = saturate_cast<_Tp>(pt.y / divisor);
+    pt.z = saturate_cast<_Tp>(pt.z / divisor);
+    return pt;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp>& operator /= (Point3_<_Tp>& pt, float divisor)
+{   // float overload of the in-place division
+    pt.x = saturate_cast<_Tp>(pt.x / divisor);
+    pt.y = saturate_cast<_Tp>(pt.y / divisor);
+    pt.z = saturate_cast<_Tp>(pt.z / divisor);
+    return pt;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp>& operator /= (Point3_<_Tp>& pt, double divisor)
+{   // double overload of the in-place division
+    pt.x = saturate_cast<_Tp>(pt.x / divisor);
+    pt.y = saturate_cast<_Tp>(pt.y / divisor);
+    pt.z = saturate_cast<_Tp>(pt.z / divisor);
+    return pt;
+}
+
+template<typename _Tp> static inline
+double norm(const Point3_<_Tp>& p)
+{   // Euclidean length sqrt(x^2 + y^2 + z^2), accumulated in double
+    return std::sqrt(static_cast<double>(p.x)*p.x + static_cast<double>(p.y)*p.y + static_cast<double>(p.z)*p.z);
+}
+
+template<typename _Tp> static inline
+bool operator == (const Point3_<_Tp>& lhs, const Point3_<_Tp>& rhs)
+{   // equal only when all three coordinates match
+    return (lhs.x == rhs.x) && (lhs.y == rhs.y) && (lhs.z == rhs.z);
+}
+
+template<typename _Tp> static inline
+bool operator != (const Point3_<_Tp>& lhs, const Point3_<_Tp>& rhs)
+{   // different as soon as any coordinate differs
+    return (lhs.x != rhs.x) || (lhs.y != rhs.y) || (lhs.z != rhs.z);
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator + (const Point3_<_Tp>& lhs, const Point3_<_Tp>& rhs)
+{   // coordinate-wise sum, each coordinate passed through saturate_cast
+    return Point3_<_Tp>( saturate_cast<_Tp>(lhs.x + rhs.x), saturate_cast<_Tp>(lhs.y + rhs.y), saturate_cast<_Tp>(lhs.z + rhs.z));
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator - (const Point3_<_Tp>& lhs, const Point3_<_Tp>& rhs)
+{   // coordinate-wise difference, each coordinate passed through saturate_cast
+    return Point3_<_Tp>( saturate_cast<_Tp>(lhs.x - rhs.x), saturate_cast<_Tp>(lhs.y - rhs.y), saturate_cast<_Tp>(lhs.z - rhs.z));
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator - (const Point3_<_Tp>& pt)
+{   // negates all three coordinates
+    return Point3_<_Tp>( saturate_cast<_Tp>(-pt.x), saturate_cast<_Tp>(-pt.y), saturate_cast<_Tp>(-pt.z) );
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator * (const Point3_<_Tp>& pt, int k)
+{   // point times int factor
+    return Point3_<_Tp>( saturate_cast<_Tp>(pt.x*k), saturate_cast<_Tp>(pt.y*k), saturate_cast<_Tp>(pt.z*k) );
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator * (int k, const Point3_<_Tp>& pt)
+{   // int factor times point
+    return Point3_<_Tp>( saturate_cast<_Tp>(pt.x * k), saturate_cast<_Tp>(pt.y * k), saturate_cast<_Tp>(pt.z * k) );
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator * (const Point3_<_Tp>& pt, float k)
+{   // point times float factor
+    return Point3_<_Tp>( saturate_cast<_Tp>(pt.x * k), saturate_cast<_Tp>(pt.y * k), saturate_cast<_Tp>(pt.z * k) );
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator * (float k, const Point3_<_Tp>& pt)
+{   // float factor times point
+    return Point3_<_Tp>( saturate_cast<_Tp>(pt.x * k), saturate_cast<_Tp>(pt.y * k), saturate_cast<_Tp>(pt.z * k) );
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator * (const Point3_<_Tp>& pt, double k)
+{   // point times double factor
+    return Point3_<_Tp>( saturate_cast<_Tp>(pt.x * k), saturate_cast<_Tp>(pt.y * k), saturate_cast<_Tp>(pt.z * k) );
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator * (double k, const Point3_<_Tp>& pt)
+{   // double factor times point
+    return Point3_<_Tp>( saturate_cast<_Tp>(pt.x * k), saturate_cast<_Tp>(pt.y * k), saturate_cast<_Tp>(pt.z * k) );
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator * (const Matx<_Tp, 3, 3>& m, const Point3_<_Tp>& pt)
+{   // 3x3 matrix applied to the point taken as the column vector (x, y, z)
+    Matx<_Tp, 3, 1> product = m * Vec<_Tp,3>(pt.x, pt.y, pt.z);
+    return Point3_<_Tp>(product.val[0], product.val[1], product.val[2]);
+}
+
+template<typename _Tp> static inline
+Matx<_Tp, 4, 1> operator * (const Matx<_Tp, 4, 4>& m, const Point3_<_Tp>& pt)
+{   // 4x4 matrix applied to (x, y, z, 1); the full 4x1 product is returned
+    return m * Matx<_Tp, 4, 1>(pt.x, pt.y, pt.z, 1);
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator / (const Point3_<_Tp>& pt, int divisor)
+{   // divides a copy via the matching operator /=
+    Point3_<_Tp> quotient(pt);
+    quotient /= divisor;
+    return quotient;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator / (const Point3_<_Tp>& pt, float divisor)
+{   // float overload, same copy-then-divide approach
+    Point3_<_Tp> quotient(pt);
+    quotient /= divisor;
+    return quotient;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator / (const Point3_<_Tp>& pt, double divisor)
+{   // double overload, same copy-then-divide approach
+    Point3_<_Tp> quotient(pt);
+    quotient /= divisor;
+    return quotient;
+}
+
+
+
+////////////////////////////////// Size /////////////////////////////////
+
+template<typename _Tp> inline
+Size_<_Tp>::Size_()
+    : width(0), height(0) {} // default: zero width and height
+
+template<typename _Tp> inline
+Size_<_Tp>::Size_(_Tp _width, _Tp _height)
+    : width(_width), height(_height) {} // from explicit dimensions
+
+template<typename _Tp> inline
+Size_<_Tp>::Size_(const Point_<_Tp>& pt)
+    : width(pt.x), height(pt.y) {} // x -> width, y -> height
+
+template<typename _Tp> template<typename _Tp2> inline
+Size_<_Tp>::operator Size_<_Tp2>() const
+{   // converts each dimension with saturate_cast
+    return Size_<_Tp2>(saturate_cast<_Tp2>(this->width), saturate_cast<_Tp2>(this->height));
+}
+
+template<typename _Tp> inline
+_Tp Size_<_Tp>::area() const
+{   // returns width * height, computed in _Tp
+    const _Tp result = width * height;
+    CV_DbgAssert(!std::numeric_limits<_Tp>::is_integer // the check only applies to integer _Tp
+        || width == 0 || result / width == height); // make sure the result fits in the return value
+    return result;
+}
+
+template<typename _Tp> inline
+double Size_<_Tp>::aspectRatio() const
+{   // width / height, with the height converted to double before dividing
+    return this->width / static_cast<double>(this->height);
+}
+
+template<typename _Tp> inline
+bool Size_<_Tp>::empty() const
+{   // empty when either dimension is zero or negative
+    return (this->width <= 0) || (this->height <= 0);
+}
+
+
+template<typename _Tp> static inline
+Size_<_Tp>& operator *= (Size_<_Tp>& sz, _Tp factor)
+{   // scales both dimensions in place
+    sz.width *= factor;
+    sz.height *= factor;
+    return sz;
+}
+
+template<typename _Tp> static inline
+Size_<_Tp> operator * (const Size_<_Tp>& sz, _Tp factor)
+{   // scales a copy via operator *=
+    Size_<_Tp> scaled(sz);
+    scaled *= factor;
+    return scaled;
+}
+
+template<typename _Tp> static inline
+Size_<_Tp>& operator /= (Size_<_Tp>& sz, _Tp divisor)
+{   // divides both dimensions in place
+    sz.width /= divisor;
+    sz.height /= divisor;
+    return sz;
+}
+
+template<typename _Tp> static inline
+Size_<_Tp> operator / (const Size_<_Tp>& sz, _Tp divisor)
+{   // divides a copy via operator /=
+    Size_<_Tp> quotient(sz);
+    quotient /= divisor;
+    return quotient;
+}
+
+template<typename _Tp> static inline
+Size_<_Tp>& operator += (Size_<_Tp>& lhs, const Size_<_Tp>& rhs)
+{   // grows lhs by rhs in place
+    lhs.width += rhs.width;
+    lhs.height += rhs.height;
+    return lhs;
+}
+
+template<typename _Tp> static inline
+Size_<_Tp> operator + (const Size_<_Tp>& lhs, const Size_<_Tp>& rhs)
+{   // adds into a copy via operator +=
+    Size_<_Tp> sum(lhs);
+    sum += rhs;
+    return sum;
+}
+
+template<typename _Tp> static inline
+Size_<_Tp>& operator -= (Size_<_Tp>& lhs, const Size_<_Tp>& rhs)
+{   // shrinks lhs by rhs in place
+    lhs.width -= rhs.width;
+    lhs.height -= rhs.height;
+    return lhs;
+}
+
+template<typename _Tp> static inline
+Size_<_Tp> operator - (const Size_<_Tp>& lhs, const Size_<_Tp>& rhs)
+{   // subtracts from a copy via operator -=
+    Size_<_Tp> diff(lhs);
+    diff -= rhs;
+    return diff;
+}
+
+template<typename _Tp> static inline
+bool operator == (const Size_<_Tp>& lhs, const Size_<_Tp>& rhs)
+{   // equal only when both dimensions match
+    return (lhs.width == rhs.width) && (lhs.height == rhs.height);
+}
+
+template<typename _Tp> static inline
+bool operator != (const Size_<_Tp>& lhs, const Size_<_Tp>& rhs)
+{   // defined as the negation of operator ==
+    return !(lhs == rhs);
+}
+
+
+
+////////////////////////////////// Rect /////////////////////////////////
+
+template<typename _Tp> inline
+Rect_<_Tp>::Rect_()
+    : x(0), y(0), width(0), height(0) {} // default: all four fields are zero
+
+template<typename _Tp> inline
+Rect_<_Tp>::Rect_(_Tp _x, _Tp _y, _Tp _width, _Tp _height)
+    : x(_x), y(_y), width(_width), height(_height) {} // from explicit position and dimensions
+
+template<typename _Tp> inline
+Rect_<_Tp>::Rect_(const Point_<_Tp>& org, const Size_<_Tp>& sz)
+    : x(org.x), y(org.y), width(sz.width), height(sz.height) {} // position from org, dimensions from sz
+
+template<typename _Tp> inline
+Rect_<_Tp>::Rect_(const Point_<_Tp>& cornerA, const Point_<_Tp>& cornerB)
+{   // the corners may come in any order: position is the per-axis minimum, extent is max - min
+    this->x = std::min(cornerA.x, cornerB.x);
+    this->width = std::max(cornerA.x, cornerB.x) - this->x;
+    this->y = std::min(cornerA.y, cornerB.y);
+    this->height = std::max(cornerA.y, cornerB.y) - this->y;
+}
+
+template<typename _Tp> inline
+Point_<_Tp> Rect_<_Tp>::tl() const
+{   // the corner stored directly as (x, y)
+    return Point_<_Tp>(this->x, this->y);
+}
+
+template<typename _Tp> inline
+Point_<_Tp> Rect_<_Tp>::br() const
+{   // the opposite corner: (x + width, y + height)
+    return Point_<_Tp>(this->x + this->width, this->y + this->height);
+}
+
+template<typename _Tp> inline
+Size_<_Tp> Rect_<_Tp>::size() const
+{   // dimensions only, position dropped
+    return Size_<_Tp>(this->width, this->height);
+}
+
+template<typename _Tp> inline
+_Tp Rect_<_Tp>::area() const
+{   // returns width * height, computed in _Tp
+    const _Tp result = width * height;
+    CV_DbgAssert(!std::numeric_limits<_Tp>::is_integer // the check only applies to integer _Tp
+        || width == 0 || result / width == height); // make sure the result fits in the return value
+    return result;
+}
+
+template<typename _Tp> inline
+bool Rect_<_Tp>::empty() const
+{   // empty when either dimension is zero or negative
+    return (this->width <= 0) || (this->height <= 0);
+}
+
+template<typename _Tp> template<typename _Tp2> inline
+Rect_<_Tp>::operator Rect_<_Tp2>() const
+{   // converts all four fields with saturate_cast
+    return Rect_<_Tp2>(saturate_cast<_Tp2>(this->x), saturate_cast<_Tp2>(this->y), saturate_cast<_Tp2>(this->width), saturate_cast<_Tp2>(this->height));
+}
+
+template<typename _Tp> inline
+bool Rect_<_Tp>::contains(const Point_<_Tp>& p) const
+{   // lower bounds are inclusive, upper bounds (x + width, y + height) are exclusive
+    return x <= p.x && p.x < x + width && y <= p.y && p.y < y + height;
+}
+
+
+template<typename _Tp> static inline
+Rect_<_Tp>& operator += ( Rect_<_Tp>& rect, const Point_<_Tp>& shift )
+{   // moves the rectangle; dimensions are untouched
+    rect.x += shift.x;
+    rect.y += shift.y;
+    return rect;
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp>& operator -= ( Rect_<_Tp>& rect, const Point_<_Tp>& shift )
+{   // moves the rectangle the opposite way; dimensions are untouched
+    rect.x -= shift.x;
+    rect.y -= shift.y;
+    return rect;
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp>& operator += ( Rect_<_Tp>& rect, const Size_<_Tp>& grow )
+{   // enlarges the rectangle; position is untouched
+    rect.width += grow.width;
+    rect.height += grow.height;
+    return rect;
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp>& operator -= ( Rect_<_Tp>& rect, const Size_<_Tp>& shrink )
+{   // shrinks the rectangle; debug builds assert neither dimension goes negative
+    const _Tp width = rect.width - shrink.width;
+    const _Tp height = rect.height - shrink.height;
+    CV_DbgAssert(width >= 0 && height >= 0);
+    rect.width = width;
+    rect.height = height;
+    return rect;
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp>& operator &= ( Rect_<_Tp>& a, const Rect_<_Tp>& b )
+{   // in-place intersection; an empty operand or an empty result leaves a default-constructed rectangle in a
+    if (a.empty() || b.empty()) {
+        a = Rect_<_Tp>(); // reset in the operands' own element type
+        return a;
+    }
+    const Rect_<_Tp>& Rx_min = (a.x < b.x) ? a : b;
+    const Rect_<_Tp>& Rx_max = (a.x < b.x) ? b : a;
+    const Rect_<_Tp>& Ry_min = (a.y < b.y) ? a : b;
+    const Rect_<_Tp>& Ry_max = (a.y < b.y) ? b : a;
+    // Looking at the formula below, we will compute Rx_min.width - (Rx_max.x - Rx_min.x)
+    // but we want to avoid overflows. Rx_min.width >= 0 and (Rx_max.x - Rx_min.x) >= 0
+    // by definition so the difference does not overflow. The only thing that can overflow
+    // is (Rx_max.x - Rx_min.x). And it can only overflow if Rx_min.x < 0.
+    // Let us first deal with the following case.
+    if ((Rx_min.x < 0 && Rx_min.x + Rx_min.width < Rx_max.x) ||
+        (Ry_min.y < 0 && Ry_min.y + Ry_min.height < Ry_max.y)) {
+        a = Rect_<_Tp>(); // the rectangles are disjoint on at least one axis
+        return a;
+    }
+    // We now know that either Rx_min.x >= 0, or
+    // Rx_min.x < 0 && Rx_min.x + Rx_min.width >= Rx_max.x and therefore
+    // Rx_min.width >= (Rx_max.x - Rx_min.x) which means (Rx_max.x - Rx_min.x)
+    // is inferior to a valid int and therefore does not overflow.
+    a.width = std::min(Rx_min.width - (Rx_max.x - Rx_min.x), Rx_max.width);
+    a.height = std::min(Ry_min.height - (Ry_max.y - Ry_min.y), Ry_max.height);
+    a.x = Rx_max.x; // position is written last because the references above may alias a
+    a.y = Ry_max.y;
+    if (a.empty())
+        a = Rect_<_Tp>(); // normalize an empty intersection
+    return a;
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp>& operator |= ( Rect_<_Tp>& acc, const Rect_<_Tp>& other )
+{   // in-place bounding rectangle of both operands; an empty operand is ignored
+    if (acc.empty()) {
+        acc = other;
+    }
+    else if (!other.empty()) {
+        _Tp left = std::min(acc.x, other.x);
+        _Tp top = std::min(acc.y, other.y);
+        acc.width = std::max(acc.x + acc.width, other.x + other.width) - left;
+        acc.height = std::max(acc.y + acc.height, other.y + other.height) - top;
+        acc.x = left;
+        acc.y = top;
+    }
+    return acc;
+}
+
+template<typename _Tp> static inline
+bool operator == (const Rect_<_Tp>& lhs, const Rect_<_Tp>& rhs)
+{   // equal only when position and dimensions all match
+    return (lhs.x == rhs.x) && (lhs.y == rhs.y) && (lhs.width == rhs.width) && (lhs.height == rhs.height);
+}
+
+template<typename _Tp> static inline
+bool operator != (const Rect_<_Tp>& lhs, const Rect_<_Tp>& rhs)
+{   // different as soon as any of the four fields differs
+    return (lhs.x != rhs.x) || (lhs.y != rhs.y) || (lhs.width != rhs.width) || (lhs.height != rhs.height);
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp> operator + (const Rect_<_Tp>& rect, const Point_<_Tp>& shift)
+{   // shifted copy; dimensions are kept
+    return Rect_<_Tp>( rect.x + shift.x, rect.y + shift.y, rect.width, rect.height );
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp> operator - (const Rect_<_Tp>& rect, const Point_<_Tp>& shift)
+{   // copy shifted the opposite way; dimensions are kept
+    return Rect_<_Tp>( rect.x - shift.x, rect.y - shift.y, rect.width, rect.height );
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp> operator + (const Rect_<_Tp>& rect, const Size_<_Tp>& grow)
+{   // enlarged copy; position is kept
+    return Rect_<_Tp>( rect.x, rect.y, rect.width + grow.width, rect.height + grow.height );
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp> operator - (const Rect_<_Tp>& rect, const Size_<_Tp>& shrink)
+{   // shrunken copy; debug builds assert neither dimension goes negative
+    const _Tp width = rect.width - shrink.width;
+    const _Tp height = rect.height - shrink.height;
+    CV_DbgAssert(width >= 0 && height >= 0);
+    return Rect_<_Tp>( rect.x, rect.y, width, height );
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp> operator & (const Rect_<_Tp>& lhs, const Rect_<_Tp>& rhs)
+{   // intersection computed on a copy via operator &=
+    Rect_<_Tp> result = lhs;
+    return result &= rhs;
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp> operator | (const Rect_<_Tp>& lhs, const Rect_<_Tp>& rhs)
+{   // bounding rectangle computed on a copy via operator |=
+    Rect_<_Tp> result = lhs;
+    return result |= rhs;
+}
+
+/**
+ * @brief measure dissimilarity between two sample sets
+ *
+ * computes the complement of the Jaccard Index as described in <https://en.wikipedia.org/wiki/Jaccard_index>.
+ * For rectangles this reduces to computing the intersection over the union.
+ */
+template<typename _Tp> static inline
+double jaccardDistance(const Rect_<_Tp>& a, const Rect_<_Tp>& b) {
+    _Tp areaA = a.area();
+    _Tp areaB = b.area();
+
+    if ((areaA + areaB) <= std::numeric_limits<_Tp>::epsilon()) {
+        // combined area is at most epsilon: treat the index as 1, so the distance is 0
+        return 0.0;
+    }
+
+    double areaBoth = (a & b).area();
+    // distance = 1 - intersection / union, with union = A + B - intersection
+    return 1.0 - areaBoth / (areaA + areaB - areaBoth);
+}
+
+/** @brief Computes the area of the intersection of two rectangles
+ *
+ * mainly useful for language bindings
+ * @param a First rectangle
+ * @param b Second rectangle
+ * @return the area of the intersection, i.e. the area of (a & b)
+ */
+CV_EXPORTS_W inline double rectangleIntersectionArea(const Rect2d& a, const Rect2d& b) { return (a & b).area(); }
+
+////////////////////////////// RotatedRect //////////////////////////////
+
+inline RotatedRect::RotatedRect()
+    // value-initialise center and size; the angle starts at zero
+    : center(), size(), angle(0) {}
+
+inline RotatedRect::RotatedRect(const Point2f& _center, const Size2f& _size, float _angle)
+    // member-wise copy of the three arguments
+    : center(_center), size(_size), angle(_angle) {}
+
+///////////////////////////////// Range /////////////////////////////////
+
+inline Range::Range()
+    // both bounds zero: forward to the two-argument constructor
+    : Range(0, 0) {}
+
+inline Range::Range(int _start, int _end)
+    // stores the bounds exactly as given; no ordering check is performed
+    : start(_start), end(_end) {}
+
+inline
+int Range::size() const
+{   // plain difference of the two bounds
+    const int length = end - start; return length;
+}
+
+inline
+bool Range::empty() const
+{   // true when both bounds coincide
+    return !(start != end);
+}
+
+inline
+Range Range::all()
+{   // spans from INT_MIN to INT_MAX
+    Range whole(INT_MIN, INT_MAX); return whole;
+}
+
+
+static inline
+bool operator == (const Range& r1, const Range& r2)
+{   // equal only when neither bound differs
+    return !(r1.start != r2.start || r1.end != r2.end);
+}
+
+static inline
+bool operator != (const Range& r1, const Range& r2)
+{   // defined as the negation of operator ==
+    const bool same = (r1 == r2); return !same;
+}
+
+static inline
+bool operator !(const Range& r)
+{   // true for a range whose bounds coincide
+    return !(r.start != r.end);
+}
+
+static inline
+Range operator & (const Range& r1, const Range& r2)
+{
+    const int lo = std::max(r1.start, r2.start);
+    const int hi = std::min(r1.end, r2.end);
+    return Range(lo, std::max(hi, lo));  // end is clamped so it never falls below start
+}
+
+static inline
+Range& operator &= (Range& r1, const Range& r2)
+{
+    // intersect in place and hand the updated range back
+    return r1 = r1 & r2;
+}
+
+static inline
+Range operator + (const Range& r1, int delta)
+{   // shift both bounds by delta
+    Range shifted(r1); shifted.start += delta; shifted.end += delta; return shifted;
+}
+
+static inline
+Range operator + (int delta, const Range& r1)
+{   // same shift with the operands written the other way round
+    Range shifted(r1); shifted.start += delta; shifted.end += delta; return shifted;
+}
+
+static inline
+Range operator - (const Range& r1, int delta)
+{   // subtracting is adding the negated offset
+    const int negated = -delta; return r1 + negated;
+}
+
+
+
+///////////////////////////////// Scalar ////////////////////////////////
+
+template<typename _Tp> inline
+Scalar_<_Tp>::Scalar_()
+{   // every component starts at zero
+    for( int i = 3; i >= 0; i-- ) this->val[i] = 0;
+}
+
+template<typename _Tp> inline
+Scalar_<_Tp>::Scalar_(_Tp v0, _Tp v1, _Tp v2, _Tp v3)
+{
+    // stage the arguments in an array so they can be stored with one loop
+    const _Tp init[4] = { v0, v1, v2, v3 };
+    for( int i = 0; i < 4; i++ )
+        this->val[i] = init[i];
+}
+
+template<typename _Tp> inline
+Scalar_<_Tp>::Scalar_(const Scalar_<_Tp>& s)
+    : Vec<_Tp, 4>(s) {}  // copying is delegated to the Vec<_Tp, 4> base
+
+template<typename _Tp> inline
+Scalar_<_Tp>::Scalar_(Scalar_<_Tp>&& s) CV_NOEXCEPT {
+    // No mem-initializer: the base subobject is default-constructed first,
+    // then the four components are moved over one at a time.
+    for( int i = 0; i < 4; i++ )
+        this->val[i] = std::move(s.val[i]);
+}
+
+template<typename _Tp> inline
+Scalar_<_Tp>& Scalar_<_Tp>::operator=(const Scalar_<_Tp>& s) {
+    // copy the four components one by one
+    for( int i = 0; i < 4; i++ )
+        this->val[i] = s.val[i];
+
+    return *this;
+}
+
+template<typename _Tp> inline
+Scalar_<_Tp>& Scalar_<_Tp>::operator=(Scalar_<_Tp>&& s) CV_NOEXCEPT {
+    // move the four components one by one
+    for( int i = 0; i < 4; i++ )
+        this->val[i] = std::move(s.val[i]);
+
+    return *this;
+}
+
+template<typename _Tp> template<typename _Tp2, int cn> inline
+Scalar_<_Tp>::Scalar_(const Vec<_Tp2, cn>& v)
+{
+    // convert as many elements as the source provides (at most 4), then zero-fill the rest
+    const int copied = cn < 4 ? cn : 4;
+    int i = 0;
+    while( i < copied ) { this->val[i] = cv::saturate_cast<_Tp>(v.val[i]); i++; }
+    while( i < 4 ) { this->val[i] = 0; i++; }
+}
+
+template<typename _Tp> inline
+Scalar_<_Tp>::Scalar_(_Tp v0)
+{
+    for( int i = 1; i < 4; i++ ) this->val[i] = 0;  // trailing components are cleared
+    this->val[0] = v0;
+}
+
+template<typename _Tp> inline
+Scalar_<_Tp> Scalar_<_Tp>::all(_Tp v0)
+{   // replicate v0 into all four components
+    Scalar_<_Tp> filled(v0, v0, v0, v0); return filled;
+}
+
+
+template<typename _Tp> inline
+Scalar_<_Tp> Scalar_<_Tp>::mul(const Scalar_<_Tp>& a, double scale ) const
+{
+    Scalar_<_Tp> product;  // per-element product, scaled, then saturated back to _Tp
+    for( int i = 0; i < 4; i++ )
+        product.val[i] = saturate_cast<_Tp>(this->val[i] * a.val[i] * scale);
+    return product;
+}
+
+template<typename _Tp> inline
+Scalar_<_Tp> Scalar_<_Tp>::conj() const
+{
+    Scalar_<_Tp> flipped;  // val[0] is kept, the other three components change sign
+    flipped.val[0] = saturate_cast<_Tp>(this->val[0]);
+    for( int i = 1; i < 4; i++ ) flipped.val[i] = saturate_cast<_Tp>(-this->val[i]);
+    return flipped;
+}
+
+template<typename _Tp> inline
+bool Scalar_<_Tp>::isReal() const
+{   // real means components 1..3 all compare equal to zero
+    return !(this->val[1] != 0 || this->val[2] != 0 || this->val[3] != 0);
+}
+
+
+template<typename _Tp> template<typename T2> inline
+Scalar_<_Tp>::operator Scalar_<T2>() const
+{
+    Scalar_<T2> converted;  // each component is saturated into the target type T2
+    for( int i = 0; i < 4; i++ )
+        converted.val[i] = saturate_cast<T2>(this->val[i]);
+    return converted;
+}
+
+
+template<typename _Tp> static inline
+Scalar_<_Tp>& operator += (Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
+{
+    // add b into a, one component at a time
+    for( int i = 0; i < 4; i++ )
+        a.val[i] += b.val[i];
+
+    return a;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp>& operator -= (Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
+{
+    // subtract b from a, one component at a time
+    for( int i = 0; i < 4; i++ )
+        a.val[i] -= b.val[i];
+
+    return a;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp>& operator *= ( Scalar_<_Tp>& a, _Tp v )
+{
+    // scale every component of a by the factor v
+    for( int i = 0; i < 4; i++ )
+        a.val[i] *= v;
+
+    return a;
+}
+
+template<typename _Tp> static inline
+bool operator == ( const Scalar_<_Tp>& a, const Scalar_<_Tp>& b )
+{
+    for( int i = 0; i < 4; i++ ) { if( !(a.val[i] == b.val[i]) ) return false; }
+    return true;  // all four components compared equal
+}
+
+template<typename _Tp> static inline
+bool operator != ( const Scalar_<_Tp>& a, const Scalar_<_Tp>& b )
+{
+    for( int i = 0; i < 4; i++ ) { if( a.val[i] != b.val[i] ) return true; }
+    return false;  // no component differed
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator + (const Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
+{
+    Scalar_<_Tp> sum;  // component-wise sum, converted implicitly (no saturate_cast here)
+    for( int i = 0; i < 4; i++ )
+        sum.val[i] = a.val[i] + b.val[i];
+    return sum;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator - (const Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
+{
+    Scalar_<_Tp> diff;  // component-wise difference, saturated back to _Tp
+    for( int i = 0; i < 4; i++ )
+        diff.val[i] = saturate_cast<_Tp>(a.val[i] - b.val[i]);
+    return diff;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator * (const Scalar_<_Tp>& a, _Tp alpha)
+{
+    Scalar_<_Tp> scaled;  // every component multiplied by alpha
+    for( int i = 0; i < 4; i++ )
+        scaled.val[i] = a.val[i] * alpha;
+    return scaled;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator * (_Tp alpha, const Scalar_<_Tp>& a)
+{   // factor written first: evaluate as a * alpha
+    const Scalar_<_Tp> scaled = a * alpha; return scaled;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator - (const Scalar_<_Tp>& a)
+{
+    Scalar_<_Tp> negated;  // every component negated, then saturated back to _Tp
+    for( int i = 0; i < 4; i++ )
+        negated.val[i] = saturate_cast<_Tp>(-a.val[i]);
+    return negated;
+}
+
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator * (const Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
+{   // Hamilton (quaternion) product: component 0 is the real part, 1..3 the imaginary parts
+    return Scalar_<_Tp>(saturate_cast<_Tp>(a[0]*b[0] - a[1]*b[1] - a[2]*b[2] - a[3]*b[3]),
+                        saturate_cast<_Tp>(a[0]*b[1] + a[1]*b[0] + a[2]*b[3] - a[3]*b[2]),
+                        saturate_cast<_Tp>(a[0]*b[2] - a[1]*b[3] + a[2]*b[0] + a[3]*b[1]),
+                        saturate_cast<_Tp>(a[0]*b[3] + a[1]*b[2] - a[2]*b[1] + a[3]*b[0]));
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp>& operator *= (Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
+{
+    // multiply in place (Scalar-by-Scalar product) and hand the updated scalar back
+    return a = a * b;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator / (const Scalar_<_Tp>& a, _Tp alpha)
+{
+    Scalar_<_Tp> quotient;  // every component divided by alpha
+    for( int i = 0; i < 4; i++ )
+        quotient.val[i] = a.val[i] / alpha;
+    return quotient;
+}
+
+template<typename _Tp> static inline
+Scalar_<float> operator / (const Scalar_<float>& a, float alpha)
+{   // NOTE(review): _Tp appears in no parameter, so it cannot be deduced -- `a / alpha` never selects this overload; confirm intent
+    float s = 1 / alpha;
+    return Scalar_<float>(a.val[0] * s, a.val[1] * s, a.val[2] * s, a.val[3] * s);
+}
+
+template<typename _Tp> static inline
+Scalar_<double> operator / (const Scalar_<double>& a, double alpha)
+{   // NOTE(review): same non-deducible _Tp as the float variant; multiplies by the reciprocal instead of dividing
+    double s = 1 / alpha;
+    return Scalar_<double>(a.val[0] * s, a.val[1] * s, a.val[2] * s, a.val[3] * s);
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp>& operator /= (Scalar_<_Tp>& a, _Tp alpha)
+{
+    // divide in place and hand the updated scalar back
+    return a = a / alpha;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator / (_Tp a, const Scalar_<_Tp>& b)
+{
+    // conj(b) scaled by a over the sum of the squared components of b
+    const _Tp s = a / (b[0]*b[0] + b[1]*b[1] + b[2]*b[2] + b[3]*b[3]); return b.conj() * s;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator / (const Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
+{   // a / b is a multiplied by the reciprocal of b
+    const Scalar_<_Tp> reciprocal = (_Tp)1 / b; return a * reciprocal;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp>& operator /= (Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
+{
+    // divide in place (Scalar-by-Scalar) and hand the updated scalar back
+    return a = a / b;
+}
+
+template<typename _Tp> static inline
+Scalar operator * (const Matx<_Tp, 4, 4>& a, const Scalar& b)
+{   // 4x4 matrix times b (as a 4x1 column), computed in double
+    Matx<double, 4, 1> c((Matx<double, 4, 4>)a, b, Matx_MatMulOp());
+    return Scalar(c.val[0], c.val[1], c.val[2], c.val[3]);  // copy out: c is not a Scalar object, so it must not be reinterpret_cast to one
+}
+
+template<> inline
+Scalar operator * (const Matx<double, 4, 4>& a, const Scalar& b)
+{   // double specialization: no conversion of the matrix is needed
+    Matx<double, 4, 1> c(a, b, Matx_MatMulOp());
+    return Scalar(c.val[0], c.val[1], c.val[2], c.val[3]);  // copy out: c is not a Scalar object, so it must not be reinterpret_cast to one
+}
+
+
+
+//////////////////////////////// KeyPoint ///////////////////////////////
+
+inline KeyPoint::KeyPoint()
+    // angle and class_id default to -1, every other field to zero
+    : pt(0,0), size(0), angle(-1), response(0), octave(0), class_id(-1) {}
+
+inline KeyPoint::KeyPoint(Point2f _pt, float _size, float _angle, float _response, int _octave, int _class_id)
+    // member-wise copy of the arguments
+    : pt(_pt), size(_size), angle(_angle), response(_response), octave(_octave), class_id(_class_id) {}
+
+inline KeyPoint::KeyPoint(float x, float y, float _size, float _angle, float _response, int _octave, int _class_id)
+    // same fields, with the location given as separate coordinates
+    : pt(x, y), size(_size), angle(_angle), response(_response), octave(_octave), class_id(_class_id) {}
+
+
+
+///////////////////////////////// DMatch ////////////////////////////////
+
+inline DMatch::DMatch()
+    // all three indices are -1 and the distance is FLT_MAX
+    : queryIdx(-1), trainIdx(-1), imgIdx(-1), distance(FLT_MAX) {}
+
+inline DMatch::DMatch(int _queryIdx, int _trainIdx, float _distance)
+    // no image index supplied: forward to the four-argument form with imgIdx = -1
+    : DMatch(_queryIdx, _trainIdx, -1, _distance) {}
+
+inline DMatch::DMatch(int _queryIdx, int _trainIdx, int _imgIdx, float _distance)
+    // member-wise copy of the arguments
+    : queryIdx(_queryIdx), trainIdx(_trainIdx), imgIdx(_imgIdx), distance(_distance) {}
+
+inline
+bool DMatch::operator < (const DMatch &m) const
+{   // ordering looks at the distance field only
+    return m.distance > distance;
+}
+
+
+
+////////////////////////////// TermCriteria /////////////////////////////
+
+inline TermCriteria::TermCriteria()
+    // everything zeroed: forward to the three-argument constructor
+    : TermCriteria(0, 0, 0) {}
+
+inline TermCriteria::TermCriteria(int _type, int _maxCount, double _epsilon)
+    // member-wise copy of the arguments
+    : type(_type), maxCount(_maxCount), epsilon(_epsilon) {}
+
+//! @endcond
+
+} // cv
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif //OPENCV_CORE_TYPES_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/types_c.h b/3rdparty/opencv/opencv410/build/include/opencv2/core/types_c.h
new file mode 100644
index 000000000000..02d4a4f68058
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/types_c.h
@@ -0,0 +1,2110 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_TYPES_H
+#define OPENCV_CORE_TYPES_H
+
+#ifdef CV__ENABLE_C_API_CTORS  // invalid C API ctors (must be removed)
+#if defined(_WIN32) && !defined(CV__SKIP_MESSAGE_MALFORMED_C_API_CTORS)
+#error "C API ctors don't work on Win32: https://github.com/opencv/opencv/issues/15990"
+#endif
+#endif
+
+//#define CV__VALIDATE_UNUNITIALIZED_VARS 1  // C++11 & GCC only
+
+#ifdef __cplusplus
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+#define CV_STRUCT_INITIALIZER {0,}
+#else
+#if defined(__GNUC__) && __GNUC__ == 4  // GCC 4.x warns on "= {}" initialization, fixed in GCC 5.0
+#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+#endif
+#define CV_STRUCT_INITIALIZER {}
+#endif
+
+#else
+#define CV_STRUCT_INITIALIZER {0}
+#endif
+
+
+#ifdef HAVE_IPL
+#  ifndef __IPL_H__
+#    if defined _WIN32
+#      include <ipl.h>
+#    else
+#      include <ipl/ipl.h>
+#    endif
+#  endif
+#elif defined __IPL_H__
+#  define HAVE_IPL
+#endif
+
+#include "opencv2/core/cvdef.h"
+
+#ifndef SKIP_INCLUDES
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <float.h>
+#endif // SKIP_INCLUDES
+
+
+
+#ifndef CV_DEFAULT
+#  ifdef __cplusplus
+#    define CV_DEFAULT(val) = val
+#  else
+#    define CV_DEFAULT(val)
+#  endif
+#endif
+
+#ifndef CV_EXTERN_C_FUNCPTR
+#  ifdef __cplusplus
+#    define CV_EXTERN_C_FUNCPTR(x) extern "C" { typedef x; }
+#  else
+#    define CV_EXTERN_C_FUNCPTR(x) typedef x
+#  endif
+#endif
+
+#ifndef CVAPI
+#  define CVAPI(rettype) CV_EXTERN_C CV_EXPORTS rettype CV_CDECL
+#endif
+
+#ifndef CV_IMPL
+#  define CV_IMPL CV_EXTERN_C
+#endif
+
+#ifdef __cplusplus
+#  include "opencv2/core.hpp"
+#endif
+
+/** @addtogroup core_c
+    @{
+*/
+
+/** @brief This is the "metatype" used *only* as a function parameter.
+
+It denotes that the function accepts arrays of multiple types, such as IplImage*, CvMat* or even
+CvSeq* sometimes. The particular array type is determined at runtime by analyzing the first 4
+bytes of the header. In C++ interface the role of CvArr is played by InputArray and OutputArray.
+ */
+typedef void CvArr;
+
+typedef int CVStatus;
+
+/** @see cv::Error::Code */
+enum {
+ CV_StsOk=                       0,  /**< everything is ok                */
+ CV_StsBackTrace=               -1,  /**< pseudo error for back trace     */
+ CV_StsError=                   -2,  /**< unknown /unspecified error      */
+ CV_StsInternal=                -3,  /**< internal error (bad state)      */
+ CV_StsNoMem=                   -4,  /**< insufficient memory             */
+ CV_StsBadArg=                  -5,  /**< function arg/param is bad       */
+ CV_StsBadFunc=                 -6,  /**< unsupported function            */
+ CV_StsNoConv=                  -7,  /**< iter. didn't converge           */
+ CV_StsAutoTrace=               -8,  /**< tracing                         */
+ CV_HeaderIsNull=               -9,  /**< image header is NULL            */
+ CV_BadImageSize=              -10,  /**< image size is invalid           */
+ CV_BadOffset=                 -11,  /**< offset is invalid               */
+ CV_BadDataPtr=                -12,  /**/
+ CV_BadStep=                   -13,  /**< image step is wrong, this may happen for a non-continuous matrix */
+ CV_BadModelOrChSeq=           -14,  /**/
+ CV_BadNumChannels=            -15,  /**< bad number of channels, for example, some functions accept only single channel matrices */
+ CV_BadNumChannel1U=           -16,  /**/
+ CV_BadDepth=                  -17,  /**< input image depth is not supported by the function */
+ CV_BadAlphaChannel=           -18,  /**/
+ CV_BadOrder=                  -19,  /**< number of dimensions is out of range */
+ CV_BadOrigin=                 -20,  /**< incorrect input origin               */
+ CV_BadAlign=                  -21,  /**< incorrect input align                */
+ CV_BadCallBack=               -22,  /**/
+ CV_BadTileSize=               -23,  /**/
+ CV_BadCOI=                    -24,  /**< input COI is not supported           */
+ CV_BadROISize=                -25,  /**< incorrect input roi                  */
+ CV_MaskIsTiled=               -26,  /**/
+ CV_StsNullPtr=                -27,  /**< null pointer */
+ CV_StsVecLengthErr=           -28,  /**< incorrect vector length */
+ CV_StsFilterStructContentErr= -29,  /**< incorrect filter structure content */
+ CV_StsKernelStructContentErr= -30,  /**< incorrect transform kernel content */
+ CV_StsFilterOffsetErr=        -31,  /**< incorrect filter offset value */
+ CV_StsBadSize=                -201, /**< the input/output structure size is incorrect  */
+ CV_StsDivByZero=              -202, /**< division by zero */
+ CV_StsInplaceNotSupported=    -203, /**< in-place operation is not supported */
+ CV_StsObjectNotFound=         -204, /**< request can't be completed */
+ CV_StsUnmatchedFormats=       -205, /**< formats of input/output arrays differ */
+ CV_StsBadFlag=                -206, /**< flag is wrong or not supported */
+ CV_StsBadPoint=               -207, /**< bad CvPoint */
+ CV_StsBadMask=                -208, /**< bad format of mask (neither 8uC1 nor 8sC1)*/
+ CV_StsUnmatchedSizes=         -209, /**< sizes of input/output structures do not match */
+ CV_StsUnsupportedFormat=      -210, /**< the data format/type is not supported by the function*/
+ CV_StsOutOfRange=             -211, /**< some of parameters are out of range */
+ CV_StsParseError=             -212, /**< invalid syntax/structure of the parsed file */
+ CV_StsNotImplemented=         -213, /**< the requested function/feature is not implemented */
+ CV_StsBadMemBlock=            -214, /**< an allocated block has been corrupted */
+ CV_StsAssert=                 -215, /**< assertion failed   */
+ CV_GpuNotSupported=           -216, /**< no CUDA support    */
+ CV_GpuApiCallError=           -217, /**< GPU API call error */
+ CV_OpenGlNotSupported=        -218, /**< no OpenGL support  */
+ CV_OpenGlApiCallError=        -219, /**< OpenGL API call error */
+ CV_OpenCLApiCallError=        -220, /**< OpenCL API call error */
+ CV_OpenCLDoubleNotSupported=  -221,
+ CV_OpenCLInitError=           -222, /**< OpenCL initialization error */
+ CV_OpenCLNoAMDBlasFft=        -223
+};
+
+/****************************************************************************************\
+*                             Common macros and inline functions                         *
+\****************************************************************************************/
+
+/** absolute value without jumps */
+#ifndef __cplusplus
+#  define  CV_IABS(a)     (((a) ^ ((a) < 0 ? -1 : 0)) - ((a) < 0 ? -1 : 0))
+#else
+#  define  CV_IABS(a)     abs(a)
+#endif
+
+
+#define cvInvSqrt(value) ((float)(1./sqrt(value)))
+#define cvSqrt(value)  ((float)sqrt(value))
+
+
+/*************** Random number generation *******************/
+
+typedef uint64 CvRNG;
+
+#define CV_RNG_COEFF 4164903690U
+
+/** @brief Initializes a random number generator state.
+
+The function initializes a random number generator and returns the state. A pointer to the state
+can then be passed to the cvRandInt, cvRandReal and cvRandArr functions. In the current
+implementation a multiply-with-carry generator is used.
+@param seed 64-bit value used to initiate a random sequence; a seed of 0 is replaced by -1
+@sa the C++ class RNG replaced CvRNG.
+ */
+CV_INLINE CvRNG cvRNG( int64 seed CV_DEFAULT(-1))
+{
+    const uint64 state = (seed != 0) ? (uint64)seed : (uint64)(int64)-1;  /* zero maps to all ones */
+    return state;
+}
+
+/** @brief Returns a 32-bit unsigned integer and updates RNG.
+
+The function returns a uniformly-distributed random 32-bit unsigned integer and updates the RNG
+state. It is similar to the rand() function from the C runtime library, except that this function
+always generates a 32-bit random number, regardless of the platform.
+@param rng CvRNG state initialized by cvRNG.
+ */
+CV_INLINE unsigned cvRandInt( CvRNG* rng )
+{
+    const uint64 state = *rng;
+    const uint64 next = (uint64)(unsigned)state*CV_RNG_COEFF + (state >> 32);  /* low word * coeff + high word */
+    *rng = next;
+    return (unsigned)next;
+}
+
+/** @brief Returns a floating-point random number and updates RNG.
+
+The function returns a uniformly-distributed random floating-point number between 0 and 1 (1 is not
+included). It scales one cvRandInt draw by 2^-32.
+@param rng RNG state initialized by cvRNG
+ */
+CV_INLINE double cvRandReal( CvRNG* rng )
+{
+    const unsigned bits = cvRandInt(rng); return bits*2.3283064365386962890625e-10 /* 2^-32 */;
+}
+
+/****************************************************************************************\
+*                                  Image type (IplImage)                                 *
+\****************************************************************************************/
+
+#ifndef HAVE_IPL
+
+/*
+ * The following definitions (until #endif)
+ * is an extract from IPL headers.
+ * Copyright (c) 1995 Intel Corporation.
+ */
+#define IPL_DEPTH_SIGN 0x80000000
+
+#define IPL_DEPTH_1U     1
+#define IPL_DEPTH_8U     8
+#define IPL_DEPTH_16U   16
+#define IPL_DEPTH_32F   32
+
+#define IPL_DEPTH_8S  (IPL_DEPTH_SIGN| 8)
+#define IPL_DEPTH_16S (IPL_DEPTH_SIGN|16)
+#define IPL_DEPTH_32S (IPL_DEPTH_SIGN|32)
+
+#define IPL_DATA_ORDER_PIXEL  0
+#define IPL_DATA_ORDER_PLANE  1
+
+#define IPL_ORIGIN_TL 0
+#define IPL_ORIGIN_BL 1
+
+#define IPL_ALIGN_4BYTES   4
+#define IPL_ALIGN_8BYTES   8
+#define IPL_ALIGN_16BYTES 16
+#define IPL_ALIGN_32BYTES 32
+
+#define IPL_ALIGN_DWORD   IPL_ALIGN_4BYTES
+#define IPL_ALIGN_QWORD   IPL_ALIGN_8BYTES
+
+#define IPL_BORDER_CONSTANT   0
+#define IPL_BORDER_REPLICATE  1
+#define IPL_BORDER_REFLECT    2
+#define IPL_BORDER_WRAP       3
+
+#ifdef __cplusplus
+typedef struct _IplImage IplImage;
+CV_EXPORTS _IplImage cvIplImage(const cv::Mat& m);
+#endif
+
+/** The IplImage is taken from the Intel Image Processing Library, in which the format is native. OpenCV
+only supports a subset of possible IplImage formats, as outlined in the parameter list above.
+
+In addition to the above restrictions, OpenCV handles ROIs differently. OpenCV functions require
+that the image size or ROI size of all source and destination images match exactly. On the other
+hand, the Intel Image Processing Library processes the area of intersection between the source and
+destination images (or ROIs), allowing them to vary independently.
+*/
+typedef struct
+_IplImage
+{
+    int  nSize;             /**< sizeof(IplImage) */
+    int  ID;                /**< version (=0)*/
+    int  nChannels;         /**< Most of OpenCV functions support 1,2,3 or 4 channels */
+    int  alphaChannel;      /**< Ignored by OpenCV */
+    int  depth;             /**< Pixel depth in bits: IPL_DEPTH_8U, IPL_DEPTH_8S, IPL_DEPTH_16S,
+                               IPL_DEPTH_32S, IPL_DEPTH_32F and IPL_DEPTH_64F are supported.  */
+    char colorModel[4];     /**< Ignored by OpenCV */
+    char channelSeq[4];     /**< ditto */
+    int  dataOrder;         /**< 0 - interleaved color channels, 1 - separate color channels.
+                               cvCreateImage can only create interleaved images */
+    int  origin;            /**< 0 - top-left origin,
+                               1 - bottom-left origin (Windows bitmaps style).  */
+    int  align;             /**< Alignment of image rows (4 or 8).
+                               OpenCV ignores it and uses widthStep instead.    */
+    int  width;             /**< Image width in pixels.                           */
+    int  height;            /**< Image height in pixels.                          */
+    struct _IplROI *roi;    /**< Image ROI. If NULL, the whole image is selected. */
+    struct _IplImage *maskROI;      /**< Must be NULL. */
+    void  *imageId;                 /**< "           " */
+    struct _IplTileInfo *tileInfo;  /**< "           " */
+    int  imageSize;         /**< Image data size in bytes
+                               (==image->height*image->widthStep
+                               in case of interleaved data)*/
+    char *imageData;        /**< Pointer to aligned image data.         */
+    int  widthStep;         /**< Size of aligned image row in bytes.    */
+    int  BorderMode[4];     /**< Ignored by OpenCV.                     */
+    int  BorderConst[4];    /**< Ditto.                                 */
+    char *imageDataOrigin;  /**< Pointer to very origin of image data
+                               (not necessarily aligned) -
+                               needed for correct deallocation */
+
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    _IplImage()
+    {
+        memset(this, 0, sizeof(*this));  // valid for POD structure
+        nSize = sizeof(IplImage);
+    }
+    _IplImage(const cv::Mat& m) { *this = cvIplImage(m); }
+#endif
+}
+IplImage;
+
+CV_INLINE IplImage cvIplImage()
+{   /* returns a header with nSize already set to sizeof(IplImage) */
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    return _IplImage();
+#else
+    IplImage header = CV_STRUCT_INITIALIZER; header.nSize = sizeof(IplImage); return header;
+#endif
+}
+
+typedef struct _IplTileInfo IplTileInfo;
+
+typedef struct _IplROI
+{
+    int  coi; /**< 0 - no COI (all channels are selected), 1 - 0th channel is selected ...*/
+    int  xOffset;
+    int  yOffset;
+    int  width;
+    int  height;
+}
+IplROI;
+
+typedef struct _IplConvKernel
+{
+    int  nCols;
+    int  nRows;
+    int  anchorX;
+    int  anchorY;
+    int *values;
+    int  nShiftR;
+}
+IplConvKernel;
+
+typedef struct _IplConvKernelFP
+{
+    int  nCols;
+    int  nRows;
+    int  anchorX;
+    int  anchorY;
+    float *values;
+}
+IplConvKernelFP;
+
+#define IPL_IMAGE_HEADER 1
+#define IPL_IMAGE_DATA   2
+#define IPL_IMAGE_ROI    4
+
+#endif/*HAVE_IPL*/
+
+/** extra border mode */
+#define IPL_BORDER_REFLECT_101    4
+#define IPL_BORDER_TRANSPARENT    5
+
+#define IPL_IMAGE_MAGIC_VAL  ((int)sizeof(IplImage))
+#define CV_TYPE_NAME_IMAGE "opencv-image"
+
+#define CV_IS_IMAGE_HDR(img) \
+    ((img) != NULL && ((const IplImage*)(img))->nSize == sizeof(IplImage))
+
+#define CV_IS_IMAGE(img) /* valid header AND non-NULL imageData; (img) is parenthesised so any pointer expression is safe */ \
+    (CV_IS_IMAGE_HDR(img) && ((const IplImage*)(img))->imageData != NULL)
+
+/** for storing double-precision
+   floating point data in IplImage's */
+#define IPL_DEPTH_64F  64
+
+/** get reference to pixel at (col,row),
+   for multi-channel images (col) should be multiplied by number of channels */
+#define CV_IMAGE_ELEM( image, elemtype, row, col )       \
+    (((elemtype*)((image)->imageData + (image)->widthStep*(row)))[(col)])
+
+/****************************************************************************************\
+*                                  Matrix type (CvMat)                                   *
+\****************************************************************************************/
+
+#define CV_AUTO_STEP  0x7fffffff
+#define CV_WHOLE_ARR  cvSlice( 0, 0x3fffffff )
+
+#define CV_MAGIC_MASK       0xFFFF0000
+#define CV_MAT_MAGIC_VAL    0x42420000
+#define CV_TYPE_NAME_MAT    "opencv-matrix"
+
+#ifdef __cplusplus
+typedef struct CvMat CvMat;
+CV_INLINE CvMat cvMat(const cv::Mat& m);
+#endif
+
+/** Matrix elements are stored row by row. Element (i, j) (i - 0-based row index, j - 0-based column
+index) of a matrix can be retrieved or modified using CV_MAT_ELEM macro:
+
+    uchar pixval = CV_MAT_ELEM(grayimg, uchar, i, j);
+    CV_MAT_ELEM(cameraMatrix, float, 0, 2) = image.width*0.5f;
+
+To access multiple-channel matrices, you can use
+CV_MAT_ELEM(matrix, type, i, j\*nchannels + channel_idx).
+
+@deprecated CvMat is now obsolete; consider using Mat instead.
+ */
+typedef struct CvMat
+{
+    int type;
+    int step;
+
+    /* for internal use only */
+    int* refcount;
+    int hdr_refcount;
+
+    union
+    {
+        uchar* ptr;
+        short* s;
+        int* i;
+        float* fl;
+        double* db;
+    } data;
+
+#ifdef __cplusplus
+    union
+    {
+        int rows;
+        int height;
+    };
+
+    union
+    {
+        int cols;
+        int width;
+    };
+#else
+    int rows;
+    int cols;
+#endif
+
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvMat() {}
+    CvMat(const cv::Mat& m) { *this = cvMat(m); }
+#endif
+}
+CvMat;
+
+
+#define CV_IS_MAT_HDR(mat) \
+    ((mat) != NULL && \
+    (((const CvMat*)(mat))->type & CV_MAGIC_MASK) == CV_MAT_MAGIC_VAL && \
+    ((const CvMat*)(mat))->cols > 0 && ((const CvMat*)(mat))->rows > 0)
+
+#define CV_IS_MAT_HDR_Z(mat) \
+    ((mat) != NULL && \
+    (((const CvMat*)(mat))->type & CV_MAGIC_MASK) == CV_MAT_MAGIC_VAL && \
+    ((const CvMat*)(mat))->cols >= 0 && ((const CvMat*)(mat))->rows >= 0)
+
+#define CV_IS_MAT(mat) \
+    (CV_IS_MAT_HDR(mat) && ((const CvMat*)(mat))->data.ptr != NULL)
+
+#define CV_IS_MASK_ARR(mat) \
+    (((mat)->type & (CV_MAT_TYPE_MASK & ~CV_8SC1)) == 0)
+
+#define CV_ARE_TYPES_EQ(mat1, mat2) \
+    ((((mat1)->type ^ (mat2)->type) & CV_MAT_TYPE_MASK) == 0)
+
+#define CV_ARE_CNS_EQ(mat1, mat2) \
+    ((((mat1)->type ^ (mat2)->type) & CV_MAT_CN_MASK) == 0)
+
+#define CV_ARE_DEPTHS_EQ(mat1, mat2) \
+    ((((mat1)->type ^ (mat2)->type) & CV_MAT_DEPTH_MASK) == 0)
+
+#define CV_ARE_SIZES_EQ(mat1, mat2) \
+    ((mat1)->rows == (mat2)->rows && (mat1)->cols == (mat2)->cols)
+
+#define CV_IS_MAT_CONST(mat)  \
+    (((mat)->rows|(mat)->cols) == 1)
+
+#define IPL2CV_DEPTH(depth) \
+    ((((CV_8U)+(CV_16U<<4)+(CV_32F<<8)+(CV_64F<<16)+(CV_8S<<20)+ \
+    (CV_16S<<24)+(CV_32S<<28)) >> ((((depth) & 0xF0) >> 2) + \
+    (((depth) & IPL_DEPTH_SIGN) ? 20 : 0))) & 15)
+
+/** Inline constructor. No data is allocated internally!!!
+ * (Use together with cvCreateData, or use cvCreateMat instead to
+ * get a matrix with allocated data):
+ */
+CV_INLINE CvMat cvMat( int rows, int cols, int type, void* data CV_DEFAULT(NULL))
+{
+    CvMat header;
+
+    assert( (unsigned)CV_MAT_DEPTH(type) <= CV_64F );
+    type = CV_MAT_TYPE(type);
+    header.refcount = NULL;
+    header.hdr_refcount = 0;
+    header.data.ptr = (uchar*)data;
+    header.rows = rows;
+    header.cols = cols;
+    header.step = cols*CV_ELEM_SIZE(type);
+    header.type = CV_MAT_MAGIC_VAL | CV_MAT_CONT_FLAG | type;
+
+    return header;
+}
+
+#ifdef __cplusplus
+
+CV_INLINE CvMat cvMat(const cv::Mat& m)
+{
+    /* Wraps an at most 2-D cv::Mat in a CvMat header that points at m.data. */
+    CV_DbgAssert(m.dims <= 2);
+    CvMat hdr = cvMat(m.rows, m.dims == 1 ? 1 : m.cols, m.type(), m.data);
+    hdr.step = (int)m.step[0];
+    hdr.type = (m.flags & cv::Mat::CONTINUOUS_FLAG) | (hdr.type & ~cv::Mat::CONTINUOUS_FLAG);
+    return hdr;
+}
+CV_INLINE CvMat cvMat()
+{
+#if !defined(CV__ENABLE_C_API_CTORS)
+    CvMat self = CV_STRUCT_INITIALIZER; return self;
+#else
+    return CvMat();
+#endif
+}
+CV_INLINE CvMat cvMat(const CvMat& m)
+{
+#if !defined(CV__ENABLE_C_API_CTORS)
+    CvMat self = CV_STRUCT_INITIALIZER; memcpy(&self, &m, sizeof(self)); return self;
+#else
+    return CvMat(m);
+#endif
+}
+
+#endif // __cplusplus
+
+
+#define CV_MAT_ELEM_PTR_FAST( mat, row, col, pix_size )  \
+    (assert( (unsigned)(row) < (unsigned)(mat).rows &&   \
+             (unsigned)(col) < (unsigned)(mat).cols ),   \
+     (mat).data.ptr + (size_t)(mat).step*(row) + (pix_size)*(col))
+
+#define CV_MAT_ELEM_PTR( mat, row, col )                 \
+    CV_MAT_ELEM_PTR_FAST( mat, row, col, CV_ELEM_SIZE((mat).type) )
+
+#define CV_MAT_ELEM( mat, elemtype, row, col )           \
+    (*(elemtype*)CV_MAT_ELEM_PTR_FAST( mat, row, col, sizeof(elemtype)))
+
+/** @brief Returns the particular element of single-channel floating-point matrix.
+
+The function is a fast replacement for cvGetReal2D in the case of single-channel floating-point
+matrices. It is faster because it is inline, it does fewer checks for array type and array element
+type, and it checks for the row and column ranges only in debug mode.
+@param mat Input matrix
+@param row The zero-based index of row
+@param col The zero-based index of column
+ */
+CV_INLINE  double  cvmGet( const CvMat* mat, int row, int col )
+{
+    const int elem_type = CV_MAT_TYPE(mat->type);
+    uchar* row_start;
+
+    assert( (unsigned)row < (unsigned)mat->rows &&
+            (unsigned)col < (unsigned)mat->cols );
+    row_start = mat->data.ptr + (size_t)mat->step*row;   /* first byte of the requested row */
+
+    if( elem_type != CV_32FC1 )
+    {
+        assert( elem_type == CV_64FC1 );   /* anything that is not 32F must be 64F */
+        return ((double*)(void*)row_start)[col];
+    }
+    return ((float*)(void*)row_start)[col];
+}
+
+/** @brief Sets a specific element of a single-channel floating-point matrix.
+
+The function is a fast replacement for cvSetReal2D in the case of single-channel floating-point
+matrices. It is faster because it is inline, it does fewer checks for array type and array element
+type, and it checks for the row and column ranges only in debug mode.
+@param mat The matrix
+@param row The zero-based index of row
+@param col The zero-based index of column
+@param value The new value of the matrix element
+ */
+CV_INLINE  void  cvmSet( CvMat* mat, int row, int col, double value )
+{
+    const int elem_type = CV_MAT_TYPE(mat->type);
+    uchar* row_start;
+    assert( (unsigned)row < (unsigned)mat->rows &&
+            (unsigned)col < (unsigned)mat->cols );
+    row_start = mat->data.ptr + (size_t)mat->step*row;   /* first byte of the requested row */
+    if( elem_type == CV_32FC1 )
+        ((float*)(void*)row_start)[col] = (float)value;   /* narrowed to the stored element type */
+    else
+    {
+        assert( elem_type == CV_64FC1 );
+        ((double*)(void*)row_start)[col] = value;
+    }
+}
+
+
+CV_INLINE int cvIplDepth( int type )
+{
+    const int depth = CV_MAT_DEPTH(type);
+    const int is_signed_int = (depth == CV_8S || depth == CV_16S || depth == CV_32S);
+    return (CV_ELEM_SIZE1(depth)*8) | (is_signed_int ? IPL_DEPTH_SIGN : 0);   /* size term OR'ed with the sign flag */
+}
+
+
+/****************************************************************************************\
+*                       Multi-dimensional dense array (CvMatND)                          *
+\****************************************************************************************/
+
+#define CV_MATND_MAGIC_VAL    0x42430000
+#define CV_TYPE_NAME_MATND    "opencv-nd-matrix"
+
+#ifdef __cplusplus
+typedef struct CvMatND CvMatND;
+CV_EXPORTS CvMatND cvMatND(const cv::Mat& m);
+#endif
+
+/**
+  @deprecated consider using cv::Mat instead
+  */
+typedef struct
+CvMatND
+{
+    int type;
+    int dims;
+
+    int* refcount;
+    int hdr_refcount;
+
+    union
+    {
+        uchar* ptr;
+        float* fl;
+        double* db;
+        int* i;
+        short* s;
+    } data;
+
+    struct
+    {
+        int size;
+        int step;
+    }
+    dim[CV_MAX_DIM];
+
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvMatND() {}
+    CvMatND(const cv::Mat& m) { *this = cvMatND(m); }
+#endif
+}
+CvMatND;
+
+
+CV_INLINE CvMatND cvMatND()
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvMatND self = CV_STRUCT_INITIALIZER; return self;
+#else
+    return CvMatND();
+#endif
+}
+
+#define CV_IS_MATND_HDR(mat) \
+    ((mat) != NULL && (((const CvMatND*)(mat))->type & CV_MAGIC_MASK) == CV_MATND_MAGIC_VAL)
+
+#define CV_IS_MATND(mat) \
+    (CV_IS_MATND_HDR(mat) && ((const CvMatND*)(mat))->data.ptr != NULL)
+
+
+/****************************************************************************************\
+*                      Multi-dimensional sparse array (CvSparseMat)                      *
+\****************************************************************************************/
+
+#define CV_SPARSE_MAT_MAGIC_VAL    0x42440000
+#define CV_TYPE_NAME_SPARSE_MAT    "opencv-sparse-matrix"
+
+struct CvSet;
+
+typedef struct CvSparseMat
+{
+    int type;             /**< Holds CV_SPARSE_MAT_MAGIC_VAL in the bits selected by CV_MAGIC_MASK (see CV_IS_SPARSE_MAT_HDR). */
+    int dims;             /**< Presumably the number of dimensions in use -- confirm against the implementation. */
+    int* refcount;
+    int hdr_refcount;
+
+    struct CvSet* heap;   /**< Presumably the CvSet that stores the nodes -- confirm against the implementation. */
+    void** hashtable;
+    int hashsize;
+    int valoffset;        /**< Byte offset from the start of a node to its value (see CV_NODE_VAL). */
+    int idxoffset;        /**< Byte offset from the start of a node to its int index data (see CV_NODE_IDX). */
+    int size[CV_MAX_DIM];
+
+#ifdef __cplusplus
+    CV_EXPORTS void copyToSparseMat(cv::SparseMat& m) const;
+#endif
+}
+CvSparseMat;
+
+#ifdef __cplusplus
+CV_EXPORTS CvSparseMat* cvCreateSparseMat(const cv::SparseMat& m);
+#endif
+
+#define CV_IS_SPARSE_MAT_HDR(mat) \
+    ((mat) != NULL && \
+    (((const CvSparseMat*)(mat))->type & CV_MAGIC_MASK) == CV_SPARSE_MAT_MAGIC_VAL)
+
+#define CV_IS_SPARSE_MAT(mat) \
+    CV_IS_SPARSE_MAT_HDR(mat)
+
+/**************** iteration through a sparse array *****************/
+
+typedef struct CvSparseNode
+{
+    unsigned hashval;
+    struct CvSparseNode* next;
+}
+CvSparseNode;
+
+typedef struct CvSparseMatIterator
+{
+    CvSparseMat* mat;
+    CvSparseNode* node;
+    int curidx;
+}
+CvSparseMatIterator;
+
+#define CV_NODE_VAL(mat,node)   ((void*)((uchar*)(node) + (mat)->valoffset))
+#define CV_NODE_IDX(mat,node)   ((int*)((uchar*)(node) + (mat)->idxoffset))
+
+/****************************************************************************************\
+*                                         Histogram                                      *
+\****************************************************************************************/
+
+typedef int CvHistType;
+
+#define CV_HIST_MAGIC_VAL     0x42450000
+#define CV_HIST_UNIFORM_FLAG  (1 << 10)
+
+/** indicates whether bin ranges are set already or not */
+#define CV_HIST_RANGES_FLAG   (1 << 11)
+
+#define CV_HIST_ARRAY         0
+#define CV_HIST_SPARSE        1
+#define CV_HIST_TREE          CV_HIST_SPARSE
+
+/** should be used as a parameter only,
+   it turns to CV_HIST_UNIFORM_FLAG of hist->type */
+#define CV_HIST_UNIFORM       1
+
+typedef struct CvHistogram
+{
+    int     type;                   /**< CV_HIST_MAGIC_VAL signature plus CV_HIST_UNIFORM_FLAG / CV_HIST_RANGES_FLAG bits. */
+    CvArr*  bins;                   /**< Bin storage; CV_IS_SPARSE_HIST tests whether it is a sparse matrix. */
+    float   thresh[CV_MAX_DIM][2];  /**< For uniform histograms.                      */
+    float** thresh2;                /**< For non-uniform histograms.                  */
+    CvMatND mat;                    /**< Embedded matrix header for array histograms. */
+}
+CvHistogram;
+
+#define CV_IS_HIST( hist ) \
+    ((hist) != NULL  && \
+     (((CvHistogram*)(hist))->type & CV_MAGIC_MASK) == CV_HIST_MAGIC_VAL && \
+     (hist)->bins != NULL)
+
+#define CV_IS_UNIFORM_HIST( hist ) \
+    (((hist)->type & CV_HIST_UNIFORM_FLAG) != 0)
+
+#define CV_IS_SPARSE_HIST( hist ) \
+    CV_IS_SPARSE_MAT((hist)->bins)
+
+#define CV_HIST_HAS_RANGES( hist ) \
+    (((hist)->type & CV_HIST_RANGES_FLAG) != 0)
+
+/****************************************************************************************\
+*                      Other supplementary data type definitions                         *
+\****************************************************************************************/
+
+/*************************************** CvRect *****************************************/
+/** @sa Rect_ */
+typedef struct CvRect
+{
+    int x;
+    int y;
+    int width;
+    int height;
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvRect() __attribute__(( warning("Non-initialized variable") )) {};
+    template<typename _Tp> CvRect(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 4);
+        x = y = width = height = 0;
+        if (list.size() == 4)
+        {
+            x = list.begin()[0]; y = list.begin()[1]; width = list.begin()[2]; height = list.begin()[3];
+        }
+    };
+#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvRect(int _x = 0, int _y = 0, int w = 0, int h = 0): x(_x), y(_y), width(w), height(h) {}
+    template<typename _Tp>
+    CvRect(const cv::Rect_<_Tp>& r): x(cv::saturate_cast<int>(r.x)), y(cv::saturate_cast<int>(r.y)), width(cv::saturate_cast<int>(r.width)), height(cv::saturate_cast<int>(r.height)) {}
+#endif
+#ifdef __cplusplus
+    template<typename _Tp>
+    operator cv::Rect_<_Tp>() const { return cv::Rect_<_Tp>((_Tp)x, (_Tp)y, (_Tp)width, (_Tp)height); }
+#endif
+}
+CvRect;
+
+/** Creates a CvRect from x, y, width and height. */
+CV_INLINE  CvRect  cvRect( int x, int y, int width, int height )
+{
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvRect rect(x, y , width, height);
+#else
+    CvRect rect = {x, y, width, height};
+#endif
+    return rect;
+}
+#ifdef __cplusplus
+CV_INLINE CvRect cvRect(const cv::Rect& rc) { return cvRect(rc.x, rc.y, rc.width, rc.height); }
+#endif
+
+CV_INLINE  IplROI  cvRectToROI( CvRect rect, int coi )
+{
+    IplROI region;   /* receives the rectangle geometry and the coi value */
+
+    region.coi = coi;
+    region.xOffset = rect.x;
+    region.yOffset = rect.y;
+    region.width = rect.width;
+    region.height = rect.height;
+    return region;
+}
+
+
+CV_INLINE  CvRect  cvROIToRect( IplROI roi )
+{
+    return cvRect( roi.xOffset, roi.yOffset, roi.width, roi.height );
+}
+
+/*********************************** CvTermCriteria *************************************/
+
+#define CV_TERMCRIT_ITER    1
+#define CV_TERMCRIT_NUMBER  CV_TERMCRIT_ITER
+#define CV_TERMCRIT_EPS     2
+
+/** @sa TermCriteria
+ */
+typedef struct CvTermCriteria
+{
+    int    type;  /**< may be combination of
+                     CV_TERMCRIT_ITER
+                     CV_TERMCRIT_EPS */
+    int    max_iter;
+    double epsilon;
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvTermCriteria(int _type = 0, int _iter = 0, double _eps = 0) : type(_type), max_iter(_iter), epsilon(_eps)  {}
+    CvTermCriteria(const cv::TermCriteria& t) : type(t.type), max_iter(t.maxCount), epsilon(t.epsilon)  {}
+#endif
+#ifdef __cplusplus
+    operator cv::TermCriteria() const { return cv::TermCriteria(type, max_iter, epsilon); }
+#endif
+}
+CvTermCriteria;
+
+CV_INLINE  CvTermCriteria  cvTermCriteria( int type, int max_iter, double epsilon )
+{   /* Packs the type flags, the iteration limit and epsilon into a CvTermCriteria. */
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvTermCriteria t = { type, max_iter, epsilon };  /* field is double: no (float) narrowing, matches the ctor branch */
+#else
+    CvTermCriteria t(type, max_iter, epsilon);
+#endif
+    return t;
+}
+#ifdef __cplusplus
+CV_INLINE CvTermCriteria cvTermCriteria(const cv::TermCriteria& t) { return cvTermCriteria(t.type, t.maxCount, t.epsilon); }
+#endif
+
+
+/******************************* CvPoint and variants ***********************************/
+
+typedef struct CvPoint
+{
+    int x;
+    int y;
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvPoint() __attribute__(( warning("Non-initialized variable") )) {}
+    template<typename _Tp> CvPoint(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 2);
+        x = y = 0;
+        if (list.size() == 2)
+        {
+            x = list.begin()[0]; y = list.begin()[1];
+        }
+    };
+#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvPoint(int _x = 0, int _y = 0): x(_x), y(_y) {}
+    template<typename _Tp>
+    CvPoint(const cv::Point_<_Tp>& pt): x((int)pt.x), y((int)pt.y) {}
+#endif
+#ifdef __cplusplus
+    template<typename _Tp>
+    operator cv::Point_<_Tp>() const { return cv::Point_<_Tp>(cv::saturate_cast<_Tp>(x), cv::saturate_cast<_Tp>(y)); }
+#endif
+}
+CvPoint;
+
+/** Creates a CvPoint from its x and y coordinates. */
+CV_INLINE  CvPoint  cvPoint( int x, int y )
+{
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvPoint pt(x, y);
+#else
+    CvPoint pt = {x, y};
+#endif
+    return pt;
+}
+#ifdef __cplusplus
+CV_INLINE CvPoint cvPoint(const cv::Point& pt) { return cvPoint(pt.x, pt.y); }
+#endif
+
+typedef struct CvPoint2D32f
+{
+    float x;
+    float y;
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvPoint2D32f() __attribute__(( warning("Non-initialized variable") )) {}
+    template<typename _Tp> CvPoint2D32f(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 2);
+        x = y = 0;
+        if (list.size() == 2)
+        {
+            x = list.begin()[0]; y = list.begin()[1];
+        }
+    };
+#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvPoint2D32f(float _x = 0, float _y = 0): x(_x), y(_y) {}
+    template<typename _Tp>
+    CvPoint2D32f(const cv::Point_<_Tp>& pt): x((float)pt.x), y((float)pt.y) {}
+#endif
+#ifdef __cplusplus
+    template<typename _Tp>
+    operator cv::Point_<_Tp>() const { return cv::Point_<_Tp>(cv::saturate_cast<_Tp>(x), cv::saturate_cast<_Tp>(y)); }
+#endif
+}
+CvPoint2D32f;
+
+/** constructs CvPoint2D32f structure. */
+CV_INLINE  CvPoint2D32f  cvPoint2D32f( double x, double y )
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvPoint2D32f p = { (float)x, (float)y };
+#else
+    CvPoint2D32f p((float)x, (float)y);
+#endif
+    return p;
+}
+
+#ifdef __cplusplus
+template<typename _Tp>
+CvPoint2D32f cvPoint2D32f(const cv::Point_<_Tp>& pt)
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvPoint2D32f p = { (float)pt.x, (float)pt.y };
+#else
+    CvPoint2D32f p((float)pt.x, (float)pt.y);
+#endif
+    return p;
+}
+#endif
+
+/** converts CvPoint to CvPoint2D32f. */
+CV_INLINE  CvPoint2D32f  cvPointTo32f( CvPoint point )
+{
+    return cvPoint2D32f( (float)point.x, (float)point.y );
+}
+
+/** Rounds both coordinates with cvRound to turn a CvPoint2D32f into a CvPoint. */
+CV_INLINE  CvPoint  cvPointFrom32f( CvPoint2D32f point )
+{
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvPoint rounded(cvRound(point.x), cvRound(point.y));
+#else
+    CvPoint rounded = { cvRound(point.x), cvRound(point.y) };
+#endif
+    return rounded;
+}
+
+
+typedef struct CvPoint3D32f
+{
+    float x;
+    float y;
+    float z;
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvPoint3D32f() __attribute__(( warning("Non-initialized variable") )) {}
+    template<typename _Tp> CvPoint3D32f(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 3);
+        x = y = z = 0;
+        if (list.size() == 3)
+        {
+            x = list.begin()[0]; y = list.begin()[1]; z = list.begin()[2];
+        }
+    };
+#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvPoint3D32f(float _x = 0, float _y = 0, float _z = 0): x(_x), y(_y), z(_z) {}
+    template<typename _Tp>
+    CvPoint3D32f(const cv::Point3_<_Tp>& pt): x((float)pt.x), y((float)pt.y), z((float)pt.z) {}
+#endif
+#ifdef __cplusplus
+    template<typename _Tp>
+    operator cv::Point3_<_Tp>() const { return cv::Point3_<_Tp>(cv::saturate_cast<_Tp>(x), cv::saturate_cast<_Tp>(y), cv::saturate_cast<_Tp>(z)); }
+#endif
+}
+CvPoint3D32f;
+
+/** constructs CvPoint3D32f structure. */
+CV_INLINE  CvPoint3D32f  cvPoint3D32f( double x, double y, double z )
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvPoint3D32f p = { (float)x, (float)y, (float)z };
+#else
+    CvPoint3D32f p((float)x, (float)y, (float)z);
+#endif
+    return p;
+}
+
+#ifdef __cplusplus
+template<typename _Tp>
+CvPoint3D32f cvPoint3D32f(const cv::Point3_<_Tp>& pt)
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvPoint3D32f p  = { (float)pt.x, (float)pt.y, (float)pt.z };
+#else
+    CvPoint3D32f p((float)pt.x, (float)pt.y, (float)pt.z);
+#endif
+    return p;
+}
+#endif
+
+
+typedef struct CvPoint2D64f
+{
+    double x;
+    double y;
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvPoint2D64f() __attribute__(( warning("Non-initialized variable") )) {}
+    template<typename _Tp> CvPoint2D64f(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 2);
+        x = y = 0;
+        if (list.size() == 2)
+        {
+            x = list.begin()[0]; y = list.begin()[1];
+        }
+    };
+#endif
+}
+CvPoint2D64f;
+
+/** constructs CvPoint2D64f structure.*/
+CV_INLINE  CvPoint2D64f  cvPoint2D64f( double x, double y )
+{
+    CvPoint2D64f p = { x, y };
+    return p;
+}
+
+
+typedef struct CvPoint3D64f
+{
+    double x;
+    double y;
+    double z;
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvPoint3D64f() __attribute__(( warning("Non-initialized variable") )) {}
+    template<typename _Tp> CvPoint3D64f(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 3);
+        x = y = z = 0;
+        if (list.size() == 3)
+        {
+            x = list.begin()[0]; y = list.begin()[1]; z = list.begin()[2];
+        }
+    };
+#endif
+}
+CvPoint3D64f;
+
+/** constructs CvPoint3D64f structure. */
+CV_INLINE  CvPoint3D64f  cvPoint3D64f( double x, double y, double z )
+{
+    CvPoint3D64f p = { x, y, z };
+    return p;
+}
+
+
+/******************************** CvSize's & CvBox **************************************/
+
+typedef struct CvSize
+{
+    int width;
+    int height;
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvSize() __attribute__(( warning("Non-initialized variable") )) {}
+    template<typename _Tp> CvSize(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 2);
+        width = 0; height = 0;
+        if (list.size() == 2)
+        {
+            width = list.begin()[0]; height = list.begin()[1];
+        }
+    };
+#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvSize(int w = 0, int h = 0): width(w), height(h) {}
+    template<typename _Tp>
+    CvSize(const cv::Size_<_Tp>& sz): width(cv::saturate_cast<int>(sz.width)), height(cv::saturate_cast<int>(sz.height)) {}
+#endif
+#ifdef __cplusplus
+    template<typename _Tp>
+    operator cv::Size_<_Tp>() const { return cv::Size_<_Tp>(cv::saturate_cast<_Tp>(width), cv::saturate_cast<_Tp>(height)); }
+#endif
+}
+CvSize;
+
+/** Creates a CvSize from a width and a height. */
+CV_INLINE  CvSize  cvSize( int width, int height )
+{
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvSize dims(width, height);
+#else
+    CvSize dims = { width, height };
+#endif
+    return dims;
+}
+
+#ifdef __cplusplus
+CV_INLINE CvSize cvSize(const cv::Size& sz)
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvSize s = { sz.width, sz.height };
+#else
+    CvSize s(sz.width, sz.height);
+#endif
+    return s;
+}
+#endif
+
+typedef struct CvSize2D32f
+{
+    float width;
+    float height;
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvSize2D32f() __attribute__(( warning("Non-initialized variable") )) {}
+    template<typename _Tp> CvSize2D32f(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 2);
+        width = 0; height = 0;
+        if (list.size() == 2)
+        {
+            width = list.begin()[0]; height = list.begin()[1];
+        }
+    };
+#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvSize2D32f(float w = 0, float h = 0): width(w), height(h) {}
+    template<typename _Tp>
+    CvSize2D32f(const cv::Size_<_Tp>& sz): width(cv::saturate_cast<float>(sz.width)), height(cv::saturate_cast<float>(sz.height)) {}
+#endif
+#ifdef __cplusplus
+    template<typename _Tp>
+    operator cv::Size_<_Tp>() const { return cv::Size_<_Tp>(cv::saturate_cast<_Tp>(width), cv::saturate_cast<_Tp>(height)); }
+#endif
+}
+CvSize2D32f;
+
+/** constructs CvSize2D32f structure. */
+CV_INLINE  CvSize2D32f  cvSize2D32f( double width, double height )
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvSize2D32f s = { (float)width, (float)height };
+#else
+    CvSize2D32f s((float)width, (float)height);
+#endif
+    return s;
+}
+#ifdef __cplusplus
+template<typename _Tp>
+CvSize2D32f cvSize2D32f(const cv::Size_<_Tp>& sz)
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvSize2D32f s = { (float)sz.width, (float)sz.height };
+#else
+    CvSize2D32f s((float)sz.width, (float)sz.height);
+#endif
+    return s;
+}
+#endif
+
+/** @sa RotatedRect
+ */
+typedef struct CvBox2D
+{
+    CvPoint2D32f center;  /**< Center of the box.                          */
+    CvSize2D32f  size;    /**< Box width and length.                       */
+    float angle;          /**< Angle between the horizontal axis           */
+                          /**< and the first side (i.e. length) in degrees */
+
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvBox2D(CvPoint2D32f c = CvPoint2D32f(), CvSize2D32f s = CvSize2D32f(), float a = 0) : center(c), size(s), angle(a) {}
+    CvBox2D(const cv::RotatedRect& rr) : center(rr.center), size(rr.size), angle(rr.angle) {}
+#endif
+#ifdef __cplusplus
+    operator cv::RotatedRect() const { return cv::RotatedRect(center, size, angle); }
+#endif
+}
+CvBox2D;
+
+
+#ifdef __cplusplus
+CV_INLINE CvBox2D cvBox2D(CvPoint2D32f c = CvPoint2D32f(), CvSize2D32f s = CvSize2D32f(), float a = 0)
+{
+    CvBox2D box;   /* populated member by member from the arguments */
+    box.angle = a;
+    box.size = s;
+    box.center = c;
+    return box;
+}
+CV_INLINE CvBox2D cvBox2D(const cv::RotatedRect& rr)
+{
+    CvBox2D box;   /* populated member by member from the rotated rectangle */
+    box.angle = rr.angle;
+    box.size = cvSize2D32f(rr.size);
+    box.center = cvPoint2D32f(rr.center);
+    return box;
+}
+#endif
+
+
+/** Line iterator state: */
+typedef struct CvLineIterator
+{
+    /** Pointer to the current point: */
+    uchar* ptr;
+
+    /* Bresenham algorithm state: */
+    int  err;
+    int  plus_delta;
+    int  minus_delta;
+    int  plus_step;
+    int  minus_step;
+}
+CvLineIterator;
+
+
+
+/************************************* CvSlice ******************************************/
+#define CV_WHOLE_SEQ_END_INDEX 0x3fffffff
+#define CV_WHOLE_SEQ  cvSlice(0, CV_WHOLE_SEQ_END_INDEX)
+
+typedef struct CvSlice
+{
+    int  start_index, end_index;
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvSlice() __attribute__(( warning("Non-initialized variable") )) {}
+    template<typename _Tp> CvSlice(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 2);
+        start_index = end_index = 0;
+        if (list.size() == 2)
+        {
+            start_index = list.begin()[0]; end_index = list.begin()[1];
+        }
+    };
+#endif
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) && !defined(__CUDACC__)
+    CvSlice(int start = 0, int end = 0) : start_index(start), end_index(end) {}
+    CvSlice(const cv::Range& r) { *this = (r.start != INT_MIN && r.end != INT_MAX) ? CvSlice(r.start, r.end) : CvSlice(0, CV_WHOLE_SEQ_END_INDEX); }
+    operator cv::Range() const { return (start_index == 0 && end_index == CV_WHOLE_SEQ_END_INDEX ) ? cv::Range::all() : cv::Range(start_index, end_index); }
+#endif
+}
+CvSlice;
+
+CV_INLINE  CvSlice  cvSlice( int start, int end )
+{
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) && !defined(__CUDACC__)
+    CvSlice range(start, end);
+#else
+    CvSlice range = { start, end };   /* start_index, end_index */
+#endif
+    return range;
+}
+
+#if defined(__cplusplus)
+CV_INLINE  CvSlice  cvSlice(const cv::Range& r)
+{
+    const bool bounded = (r.start != INT_MIN && r.end != INT_MAX);   /* otherwise select the whole sequence */
+    return bounded ? cvSlice(r.start, r.end) : cvSlice(0, CV_WHOLE_SEQ_END_INDEX);
+}
+#endif
+
+
+/************************************* CvScalar *****************************************/
+/** @sa Scalar_
+ */
+typedef struct CvScalar
+{
+    double val[4];
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvScalar() __attribute__(( warning("Non-initialized variable") )) {}
+    CvScalar(const std::initializer_list<double> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 4);
+        val[0] = val[1] = val[2] = val[3] = 0;
+        if (list.size() == 4)
+        {
+            val[0] = list.begin()[0]; val[1] = list.begin()[1]; val[2] = list.begin()[2]; val[3] = list.begin()[3];
+        }
+    };
+#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvScalar() {}
+    CvScalar(double d0, double d1 = 0, double d2 = 0, double d3 = 0) { val[0] = d0; val[1] = d1; val[2] = d2; val[3] = d3; }
+    template<typename _Tp>
+    CvScalar(const cv::Scalar_<_Tp>& s) { val[0] = s.val[0]; val[1] = s.val[1]; val[2] = s.val[2]; val[3] = s.val[3]; }
+    template<typename _Tp, int cn>
+    CvScalar(const cv::Vec<_Tp, cn>& v)
+    {
+        int i;
+        for( i = 0; i < (cn < 4 ? cn : 4); i++ ) val[i] = v.val[i];
+        for( ; i < 4; i++ ) val[i] = 0;
+    }
+#endif
+#ifdef __cplusplus
+    template<typename _Tp>
+    operator cv::Scalar_<_Tp>() const { return cv::Scalar_<_Tp>(cv::saturate_cast<_Tp>(val[0]), cv::saturate_cast<_Tp>(val[1]), cv::saturate_cast<_Tp>(val[2]), cv::saturate_cast<_Tp>(val[3])); }
+#endif
+}
+CvScalar;
+
+CV_INLINE  CvScalar  cvScalar( double val0, double val1 CV_DEFAULT(0),
+                               double val2 CV_DEFAULT(0), double val3 CV_DEFAULT(0))
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvScalar scalar = CV_STRUCT_INITIALIZER;
+#else
+    CvScalar scalar;
+#endif
+    scalar.val[0] = val0; scalar.val[1] = val1;
+    scalar.val[2] = val2; scalar.val[3] = val3;
+    return scalar;
+}
+
+#ifdef __cplusplus
+CV_INLINE CvScalar cvScalar()
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvScalar scalar = CV_STRUCT_INITIALIZER;
+#else
+    CvScalar scalar;
+#endif
+    scalar.val[0] = scalar.val[1] = scalar.val[2] = scalar.val[3] = 0;
+    return scalar;
+}
+CV_INLINE CvScalar cvScalar(const cv::Scalar& s)
+{
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvScalar copy;
+#else
+    CvScalar copy = CV_STRUCT_INITIALIZER;
+#endif
+    /* Element-wise copy of the four channels. */
+    for( int ch = 0; ch < 4; ch++ )
+        copy.val[ch] = s.val[ch];
+
+    return copy;
+}
+#endif
+
+CV_INLINE  CvScalar  cvRealScalar( double val0 )
+{
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvScalar real;
+#else
+    CvScalar real = CV_STRUCT_INITIALIZER;
+#endif
+    real.val[3] = real.val[2] = real.val[1] = 0;   /* only channel 0 carries the value */
+    real.val[0] = val0;
+    return real;
+}
+
+CV_INLINE  CvScalar  cvScalarAll( double val0123 )
+{
+    int ch;
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvScalar filled;
+#else
+    CvScalar filled = CV_STRUCT_INITIALIZER;
+#endif
+    /* Broadcast the same value into all four channels. */
+    for( ch = 0; ch < 4; ch++ )
+        filled.val[ch] = val0123;
+    return filled;
+}
+
+/****************************************************************************************\
+*                                   Dynamic Data structures                              *
+\****************************************************************************************/
+
+/******************************** Memory storage ****************************************/
+
+typedef struct CvMemBlock
+{
+    struct CvMemBlock*  prev;
+    struct CvMemBlock*  next;
+}
+CvMemBlock;
+
+#define CV_STORAGE_MAGIC_VAL    0x42890000
+
+typedef struct CvMemStorage
+{
+    int signature;
+    CvMemBlock* bottom;           /**< First allocated block.                   */
+    CvMemBlock* top;              /**< Current memory block - top of the stack. */
+    struct  CvMemStorage* parent; /**< We get new blocks from parent as needed. */
+    int block_size;               /**< Block size.                              */
+    int free_space;               /**< Remaining free space in current block.   */
+}
+CvMemStorage;
+
+#define CV_IS_STORAGE(storage)  \
+    ((storage) != NULL &&       \
+    (((CvMemStorage*)(storage))->signature & CV_MAGIC_MASK) == CV_STORAGE_MAGIC_VAL)
+
+
+typedef struct CvMemStoragePos
+{
+    CvMemBlock* top;
+    int free_space;
+}
+CvMemStoragePos;
+
+
+/*********************************** Sequence *******************************************/
+
+typedef struct CvSeqBlock
+{
+    struct CvSeqBlock*  prev; /**< Previous sequence block.                   */
+    struct CvSeqBlock*  next; /**< Next sequence block.                       */
+    int    start_index;       /**< Index of the first element in the block +  */
+                              /**< sequence->first->start_index.              */
+    int    count;             /**< Number of elements in the block.           */
+    schar* data;              /**< Pointer to the first element of the block. */
+}
+CvSeqBlock;
+
+
+#define CV_TREE_NODE_FIELDS(node_type)                               \
+    int       flags;             /**< Miscellaneous flags.     */      \
+    int       header_size;       /**< Size of sequence header. */      \
+    struct    node_type* h_prev; /**< Previous sequence.       */      \
+    struct    node_type* h_next; /**< Next sequence.           */      \
+    struct    node_type* v_prev; /**< 2nd previous sequence.   */      \
+    struct    node_type* v_next  /**< 2nd next sequence.       */
+
+/**
+   Read/Write sequence.
+   Elements can be dynamically inserted to or deleted from the sequence.
+*/
+#define CV_SEQUENCE_FIELDS()                                              \
+    CV_TREE_NODE_FIELDS(CvSeq);                                           \
+    int       total;          /**< Total number of elements.            */  \
+    int       elem_size;      /**< Size of sequence element in bytes.   */  \
+    schar*    block_max;      /**< Maximal bound of the last block.     */  \
+    schar*    ptr;            /**< Current write pointer.               */  \
+    int       delta_elems;    /**< Grow seq this many at a time.        */  \
+    CvMemStorage* storage;    /**< Where the seq is stored.             */  \
+    CvSeqBlock* free_blocks;  /**< Free blocks list.                    */  \
+    CvSeqBlock* first;        /**< Pointer to the first sequence block. */
+
+typedef struct CvSeq
+{
+    CV_SEQUENCE_FIELDS()
+}
+CvSeq;
+
+#define CV_TYPE_NAME_SEQ             "opencv-sequence"
+#define CV_TYPE_NAME_SEQ_TREE        "opencv-sequence-tree"
+
+/*************************************** Set ********************************************/
+/** @brief Set
+  Order is not preserved. There can be gaps between sequence elements.
+  After the element has been inserted it stays in the same place all the time.
+  The MSB(most-significant or sign bit) of the first field (flags) is 0 iff the element exists.
+*/
+#define CV_SET_ELEM_FIELDS(elem_type)   \
+    int  flags;                         \
+    struct elem_type* next_free;
+
+typedef struct CvSetElem
+{
+    CV_SET_ELEM_FIELDS(CvSetElem)
+}
+CvSetElem;
+
+#define CV_SET_FIELDS()      \
+    CV_SEQUENCE_FIELDS()     \
+    CvSetElem* free_elems;   \
+    int active_count;
+
+typedef struct CvSet
+{
+    CV_SET_FIELDS()
+}
+CvSet;
+
+
+#define CV_SET_ELEM_IDX_MASK   ((1 << 26) - 1)
+#define CV_SET_ELEM_FREE_FLAG  (1 << (sizeof(int)*8-1))
+
+/** Checks whether the element pointed by ptr belongs to a set or not */
+#define CV_IS_SET_ELEM( ptr )  (((CvSetElem*)(ptr))->flags >= 0)
+
+/************************************* Graph ********************************************/
+
+/** @name Graph
+
+We represent a graph as a set of vertices. Vertices contain their adjacency lists (more exactly,
+pointers to first incoming or outgoing edge (or 0 if isolated vertex)). Edges are stored in
+another set. There is a singly-linked list of incoming/outgoing edges for each vertex.
+
+Each edge consists of:
+
+- Two pointers to the starting and ending vertices (vtx[0] and vtx[1] respectively).
+
+    A graph may be oriented or not. In the latter case, edges between vertex i to vertex j are not
+distinguished during search operations.
+
+- Two pointers to next edges for the starting and ending vertices, where next[0] points to the
+next edge in the vtx[0] adjacency list and next[1] points to the next edge in the vtx[1]
+adjacency list.
+
+@see CvGraphEdge, CvGraphVtx, CvGraphVtx2D, CvGraph
+@{
+*/
+#define CV_GRAPH_EDGE_FIELDS()      \
+    int flags;                      \
+    float weight;                   \
+    struct CvGraphEdge* next[2];    \
+    struct CvGraphVtx* vtx[2];
+
+
+#define CV_GRAPH_VERTEX_FIELDS()    \
+    int flags;                      \
+    struct CvGraphEdge* first;
+
+
+typedef struct CvGraphEdge
+{
+    CV_GRAPH_EDGE_FIELDS()
+}
+CvGraphEdge;
+
+typedef struct CvGraphVtx
+{
+    CV_GRAPH_VERTEX_FIELDS()
+}
+CvGraphVtx;
+
+typedef struct CvGraphVtx2D
+{
+    CV_GRAPH_VERTEX_FIELDS()
+    CvPoint2D32f* ptr;
+}
+CvGraphVtx2D;
+
+/**
+   Graph is "derived" from the set (this is a set of vertices)
+   and includes another set (edges)
+*/
+#define  CV_GRAPH_FIELDS()   \
+    CV_SET_FIELDS()          \
+    CvSet* edges;
+
+typedef struct CvGraph
+{
+    CV_GRAPH_FIELDS()
+}
+CvGraph;
+
+#define CV_TYPE_NAME_GRAPH "opencv-graph"
+
+/** @} */
+
+/*********************************** Chain/Contour *************************************/
+
+typedef struct CvChain
+{
+    CV_SEQUENCE_FIELDS()
+    CvPoint  origin;
+}
+CvChain;
+
+#define CV_CONTOUR_FIELDS()  \
+    CV_SEQUENCE_FIELDS()     \
+    CvRect rect;             \
+    int color;               \
+    int reserved[3];
+
+typedef struct CvContour
+{
+    CV_CONTOUR_FIELDS()
+}
+CvContour;
+
+typedef CvContour CvPoint2DSeq;
+
+/****************************************************************************************\
+*                                    Sequence types                                      *
+\****************************************************************************************/
+
+#define CV_SEQ_MAGIC_VAL             0x42990000
+
+#define CV_IS_SEQ(seq) \
+    ((seq) != NULL && (((CvSeq*)(seq))->flags & CV_MAGIC_MASK) == CV_SEQ_MAGIC_VAL)
+
+#define CV_SET_MAGIC_VAL             0x42980000
+#define CV_IS_SET(set) \
+    ((set) != NULL && (((CvSeq*)(set))->flags & CV_MAGIC_MASK) == CV_SET_MAGIC_VAL)
+
+#define CV_SEQ_ELTYPE_BITS           12
+#define CV_SEQ_ELTYPE_MASK           ((1 << CV_SEQ_ELTYPE_BITS) - 1)
+
+#define CV_SEQ_ELTYPE_POINT          CV_32SC2  /**< (x,y) */
+#define CV_SEQ_ELTYPE_CODE           CV_8UC1   /**< freeman code: 0..7 */
+#define CV_SEQ_ELTYPE_GENERIC        0
+#define CV_SEQ_ELTYPE_PTR            CV_MAKE_TYPE(CV_8U, 8 /*sizeof(void*)*/)
+#define CV_SEQ_ELTYPE_PPOINT         CV_SEQ_ELTYPE_PTR  /**< &(x,y) */
+#define CV_SEQ_ELTYPE_INDEX          CV_32SC1  /**< #(x,y) */
+#define CV_SEQ_ELTYPE_GRAPH_EDGE     0  /**< &next_o, &next_d, &vtx_o, &vtx_d */
+#define CV_SEQ_ELTYPE_GRAPH_VERTEX   0  /**< first_edge, &(x,y) */
+#define CV_SEQ_ELTYPE_TRIAN_ATR      0  /**< vertex of the binary tree   */
+#define CV_SEQ_ELTYPE_CONNECTED_COMP 0  /**< connected component  */
+#define CV_SEQ_ELTYPE_POINT3D        CV_32FC3  /**< (x,y,z)  */
+
+#define CV_SEQ_KIND_BITS        2
+#define CV_SEQ_KIND_MASK        (((1 << CV_SEQ_KIND_BITS) - 1)<<CV_SEQ_ELTYPE_BITS)
+
+/** types of sequences */
+#define CV_SEQ_KIND_GENERIC     (0 << CV_SEQ_ELTYPE_BITS)
+#define CV_SEQ_KIND_CURVE       (1 << CV_SEQ_ELTYPE_BITS)
+#define CV_SEQ_KIND_BIN_TREE    (2 << CV_SEQ_ELTYPE_BITS)
+
+/** types of sparse sequences (sets) */
+#define CV_SEQ_KIND_GRAPH       (1 << CV_SEQ_ELTYPE_BITS)
+#define CV_SEQ_KIND_SUBDIV2D    (2 << CV_SEQ_ELTYPE_BITS)
+
+#define CV_SEQ_FLAG_SHIFT       (CV_SEQ_KIND_BITS + CV_SEQ_ELTYPE_BITS)
+
+/** flags for curves */
+#define CV_SEQ_FLAG_CLOSED     (1 << CV_SEQ_FLAG_SHIFT)
+#define CV_SEQ_FLAG_SIMPLE     (0 << CV_SEQ_FLAG_SHIFT)  /**< evaluates to 0: OR-ing it in sets no bit */
+#define CV_SEQ_FLAG_CONVEX     (0 << CV_SEQ_FLAG_SHIFT)  /**< evaluates to 0: OR-ing it in sets no bit */
+#define CV_SEQ_FLAG_HOLE       (2 << CV_SEQ_FLAG_SHIFT)
+
+/** flags for graphs */
+#define CV_GRAPH_FLAG_ORIENTED (1 << CV_SEQ_FLAG_SHIFT)
+
+#define CV_GRAPH               CV_SEQ_KIND_GRAPH
+#define CV_ORIENTED_GRAPH      (CV_SEQ_KIND_GRAPH|CV_GRAPH_FLAG_ORIENTED)
+
+/** point sets */
+#define CV_SEQ_POINT_SET       (CV_SEQ_KIND_GENERIC| CV_SEQ_ELTYPE_POINT)
+#define CV_SEQ_POINT3D_SET     (CV_SEQ_KIND_GENERIC| CV_SEQ_ELTYPE_POINT3D)
+#define CV_SEQ_POLYLINE        (CV_SEQ_KIND_CURVE  | CV_SEQ_ELTYPE_POINT)
+#define CV_SEQ_POLYGON         (CV_SEQ_FLAG_CLOSED | CV_SEQ_POLYLINE )
+#define CV_SEQ_CONTOUR         CV_SEQ_POLYGON
+#define CV_SEQ_SIMPLE_POLYGON  (CV_SEQ_FLAG_SIMPLE | CV_SEQ_POLYGON  )
+
+/** chain-coded curves */
+#define CV_SEQ_CHAIN           (CV_SEQ_KIND_CURVE  | CV_SEQ_ELTYPE_CODE)
+#define CV_SEQ_CHAIN_CONTOUR   (CV_SEQ_FLAG_CLOSED | CV_SEQ_CHAIN)
+
+/** binary tree for the contour */
+#define CV_SEQ_POLYGON_TREE    (CV_SEQ_KIND_BIN_TREE  | CV_SEQ_ELTYPE_TRIAN_ATR)
+
+/** sequence of the connected components */
+#define CV_SEQ_CONNECTED_COMP  (CV_SEQ_KIND_GENERIC  | CV_SEQ_ELTYPE_CONNECTED_COMP)
+
+/** sequence of the integer numbers */
+#define CV_SEQ_INDEX           (CV_SEQ_KIND_GENERIC  | CV_SEQ_ELTYPE_INDEX)
+
+#define CV_SEQ_ELTYPE( seq )   ((seq)->flags & CV_SEQ_ELTYPE_MASK)
+#define CV_SEQ_KIND( seq )     ((seq)->flags & CV_SEQ_KIND_MASK )
+
+/** flag checking */
+#define CV_IS_SEQ_INDEX( seq )      ((CV_SEQ_ELTYPE(seq) == CV_SEQ_ELTYPE_INDEX) && \
+                                     (CV_SEQ_KIND(seq) == CV_SEQ_KIND_GENERIC))
+
+#define CV_IS_SEQ_CURVE( seq )      (CV_SEQ_KIND(seq) == CV_SEQ_KIND_CURVE)
+#define CV_IS_SEQ_CLOSED( seq )     (((seq)->flags & CV_SEQ_FLAG_CLOSED) != 0)
+#define CV_IS_SEQ_CONVEX( seq )     0  /**< constant: always false, <seq> is not inspected */
+#define CV_IS_SEQ_HOLE( seq )       (((seq)->flags & CV_SEQ_FLAG_HOLE) != 0)
+#define CV_IS_SEQ_SIMPLE( seq )     1  /**< constant: always true, <seq> is not inspected */
+
+/** type checking macros */
+#define CV_IS_SEQ_POINT_SET( seq ) \
+    ((CV_SEQ_ELTYPE(seq) == CV_32SC2 || CV_SEQ_ELTYPE(seq) == CV_32FC2))
+
+#define CV_IS_SEQ_POINT_SUBSET( seq ) \
+    (CV_IS_SEQ_INDEX( seq ) || CV_SEQ_ELTYPE(seq) == CV_SEQ_ELTYPE_PPOINT)
+
+#define CV_IS_SEQ_POLYLINE( seq )   \
+    (CV_SEQ_KIND(seq) == CV_SEQ_KIND_CURVE && CV_IS_SEQ_POINT_SET(seq))
+
+#define CV_IS_SEQ_POLYGON( seq )   \
+    (CV_IS_SEQ_POLYLINE(seq) && CV_IS_SEQ_CLOSED(seq))
+
+#define CV_IS_SEQ_CHAIN( seq )   \
+    (CV_SEQ_KIND(seq) == CV_SEQ_KIND_CURVE && (seq)->elem_size == 1)
+
+#define CV_IS_SEQ_CONTOUR( seq )   \
+    (CV_IS_SEQ_CLOSED(seq) && (CV_IS_SEQ_POLYLINE(seq) || CV_IS_SEQ_CHAIN(seq)))
+
+#define CV_IS_SEQ_CHAIN_CONTOUR( seq ) \
+    (CV_IS_SEQ_CHAIN( seq ) && CV_IS_SEQ_CLOSED( seq ))
+
+#define CV_IS_SEQ_POLYGON_TREE( seq ) \
+    (CV_SEQ_ELTYPE (seq) ==  CV_SEQ_ELTYPE_TRIAN_ATR &&    \
+    CV_SEQ_KIND( seq ) ==  CV_SEQ_KIND_BIN_TREE )
+
+#define CV_IS_GRAPH( seq )    \
+    (CV_IS_SET(seq) && CV_SEQ_KIND((CvSet*)(seq)) == CV_SEQ_KIND_GRAPH)
+
+#define CV_IS_GRAPH_ORIENTED( seq )   \
+    (((seq)->flags & CV_GRAPH_FLAG_ORIENTED) != 0)
+
+#define CV_IS_SUBDIV2D( seq )  \
+    (CV_IS_SET(seq) && CV_SEQ_KIND((CvSet*)(seq)) == CV_SEQ_KIND_SUBDIV2D)
+
+/****************************************************************************************/
+/*                            Sequence writer & reader                                  */
+/****************************************************************************************/
+
+#define CV_SEQ_WRITER_FIELDS()                                     \
+    int          header_size;                                      \
+    CvSeq*       seq;        /**< the sequence written */            \
+    CvSeqBlock*  block;      /**< current block */                   \
+    schar*       ptr;        /**< pointer to free space */           \
+    schar*       block_min;  /**< pointer to the beginning of block*/\
+    schar*       block_max;  /**< pointer to the end of block */
+
+typedef struct CvSeqWriter
+{
+    CV_SEQ_WRITER_FIELDS()
+}
+CvSeqWriter;
+
+
+#define CV_SEQ_READER_FIELDS()                                      \
+    int          header_size;                                       \
+    CvSeq*       seq;        /**< sequence being read */              \
+    CvSeqBlock*  block;      /**< current block */                    \
+    schar*       ptr;        /**< pointer to next element to read */  \
+    schar*       block_min;  /**< pointer to the beginning of block */\
+    schar*       block_max;  /**< pointer to the end of block */      \
+    int          delta_index;/**< = seq->first->start_index   */      \
+    schar*       prev_elem;  /**< pointer to previous element */
+
+typedef struct CvSeqReader
+{
+    CV_SEQ_READER_FIELDS()
+}
+CvSeqReader;
+
+/****************************************************************************************/
+/*                                Operations on sequences                               */
+/****************************************************************************************/
+
+#define  CV_SEQ_ELEM( seq, elem_type, index )                    \
+/** Yields an elem_type* to element <index>: read directly from the first block when <index> falls inside it, otherwise looked up with cvGetSeqElem(). The assert gives some guarantee that the <seq> parameter is valid. <seq> and <index> are expanded more than once, so avoid arguments with side effects. */  \
+(   assert(sizeof((seq)->first[0]) == sizeof(CvSeqBlock) &&      \
+    (seq)->elem_size == sizeof(elem_type)),                      \
+    (elem_type*)((seq)->first && (unsigned)(index) <             \
+    (unsigned)((seq)->first->count) ?                            \
+    (seq)->first->data + (index) * sizeof(elem_type) :           \
+    cvGetSeqElem( (CvSeq*)(seq), (index) )))
+#define CV_GET_SEQ_ELEM( elem_type, seq, index ) CV_SEQ_ELEM( (seq), elem_type, (index) )
+
+/** Add element to sequence: copies (writer).seq->elem_size bytes from elem_ptr to (writer).ptr and advances it; cvCreateSeqBlock() is called first when (writer).ptr has reached (writer).block_max. */
+#define CV_WRITE_SEQ_ELEM_VAR( elem_ptr, writer )     \
+{                                                     \
+    if( (writer).ptr >= (writer).block_max )          \
+    {                                                 \
+        cvCreateSeqBlock( &(writer));                 \
+    }                                                 \
+    memcpy((writer).ptr, (elem_ptr), (writer).seq->elem_size);\
+    (writer).ptr += (writer).seq->elem_size;          \
+}
+
+#define CV_WRITE_SEQ_ELEM( elem, writer )  /* copies sizeof(elem) bytes of <elem> to (writer).ptr and advances it; <elem> must be an lvalue since its address is taken */ \
+{                                                     \
+    assert( (writer).seq->elem_size == sizeof(elem)); \
+    if( (writer).ptr >= (writer).block_max )          \
+    {                                                 \
+        cvCreateSeqBlock( &(writer));                 \
+    }                                                 \
+    assert( (writer).ptr <= (writer).block_max - sizeof(elem));\
+    memcpy((writer).ptr, &(elem), sizeof(elem));      \
+    (writer).ptr += sizeof(elem);                     \
+}
+
+
+/** Move reader position forward: */
+#define CV_NEXT_SEQ_ELEM( elem_size, reader )                 \
+{                                                             \
+    if( ((reader).ptr += (elem_size)) >= (reader).block_max ) \
+    {                                                         \
+        cvChangeSeqBlock( &(reader), 1 );                     \
+    }                                                         \
+}
+
+
+/** Move reader position backward: */
+#define CV_PREV_SEQ_ELEM( elem_size, reader )                \
+{                                                            \
+    if( ((reader).ptr -= (elem_size)) < (reader).block_min ) \
+    {                                                        \
+        cvChangeSeqBlock( &(reader), -1 );                   \
+    }                                                        \
+}
+
+/** Read element and move read position forward: */
+#define CV_READ_SEQ_ELEM( elem, reader )                       \
+{                                                              \
+    assert( (reader).seq->elem_size == sizeof(elem));          \
+    memcpy( &(elem), (reader).ptr, sizeof((elem)));            \
+    CV_NEXT_SEQ_ELEM( sizeof(elem), reader )                   \
+}
+
+/** Read element and move read position backward: */
+#define CV_REV_READ_SEQ_ELEM( elem, reader )                     \
+{                                                                \
+    assert( (reader).seq->elem_size == sizeof(elem));            \
+    memcpy(&(elem), (reader).ptr, sizeof((elem)));               \
+    CV_PREV_SEQ_ELEM( sizeof(elem), reader )                     \
+}
+
+
+#define CV_READ_CHAIN_POINT( _pt, reader )                              \
+{                                                                       \
+    (_pt) = (reader).pt;                                                \
+    if( (reader).ptr )                                                  \
+    {                                                                   \
+        CV_READ_SEQ_ELEM( (reader).code, (reader));                     \
+        assert( ((reader).code & ~7) == 0 );                            \
+        (reader).pt.x += (reader).deltas[(int)(reader).code][0];        \
+        (reader).pt.y += (reader).deltas[(int)(reader).code][1];        \
+    }                                                                   \
+}
+
+#define CV_CURRENT_POINT( reader )  (*((CvPoint*)((reader).ptr)))
+#define CV_PREV_POINT( reader )     (*((CvPoint*)((reader).prev_elem)))
+
+#define CV_READ_EDGE( pt1, pt2, reader )  /* pt1 = point at (reader).prev_elem, pt2 = point at (reader).ptr; then prev_elem is set to ptr and the reader advances by sizeof(CvPoint) */ \
+{                                                      \
+    assert( sizeof(pt1) == sizeof(CvPoint) &&          \
+            sizeof(pt2) == sizeof(CvPoint) &&          \
+            (reader).seq->elem_size == sizeof(CvPoint));\
+    (pt1) = CV_PREV_POINT( reader );                   \
+    (pt2) = CV_CURRENT_POINT( reader );                \
+    (reader).prev_elem = (reader).ptr;                 \
+    CV_NEXT_SEQ_ELEM( sizeof(CvPoint), (reader));      \
+}
+
+/************ Graph macros ************/
+
+/** Return next graph edge for given vertex: */
+#define  CV_NEXT_GRAPH_EDGE( edge, vertex )                              \
+     (assert((edge)->vtx[0] == (vertex) || (edge)->vtx[1] == (vertex)),  \
+      (edge)->next[(edge)->vtx[1] == (vertex)])
+
+
+
+/****************************************************************************************\
+*             Data structures for persistence (a.k.a serialization) functionality        *
+\****************************************************************************************/
+
+#if 0
+
+/** "black box" file storage */
+typedef struct CvFileStorage CvFileStorage;
+
+/** Storage flags: */
+#define CV_STORAGE_READ          0
+#define CV_STORAGE_WRITE         1
+#define CV_STORAGE_WRITE_TEXT    CV_STORAGE_WRITE
+#define CV_STORAGE_WRITE_BINARY  CV_STORAGE_WRITE
+#define CV_STORAGE_APPEND        2
+#define CV_STORAGE_MEMORY        4
+#define CV_STORAGE_FORMAT_MASK   (7<<3)
+#define CV_STORAGE_FORMAT_AUTO   0
+#define CV_STORAGE_FORMAT_XML    8
+#define CV_STORAGE_FORMAT_YAML  16
+#define CV_STORAGE_FORMAT_JSON  24
+#define CV_STORAGE_BASE64       64
+#define CV_STORAGE_WRITE_BASE64  (CV_STORAGE_BASE64 | CV_STORAGE_WRITE)
+
+/** @brief List of attributes. :
+
+In the current implementation, attributes are used to pass extra parameters when writing user
+objects (see cvWrite). XML attributes inside tags are not supported, aside from the object type
+specification (type_id attribute).
+@see cvAttrList, cvAttrValue
+ */
+typedef struct CvAttrList
+{
+    const char** attr;         /**< NULL-terminated array of (attribute_name,attribute_value) pairs. */
+    struct CvAttrList* next;   /**< Pointer to next chunk of the attributes list.                    */
+}
+CvAttrList;
+
+/** initializes CvAttrList structure */
+CV_INLINE CvAttrList cvAttrList( const char** attr CV_DEFAULT(NULL),
+                                 CvAttrList* next CV_DEFAULT(NULL) )
+{
+    CvAttrList l;
+    l.attr = attr;
+    l.next = next;
+
+    return l;
+}
+
+struct CvTypeInfo;
+
+#define CV_NODE_NONE        0
+#define CV_NODE_INT         1
+#define CV_NODE_INTEGER     CV_NODE_INT
+#define CV_NODE_REAL        2
+#define CV_NODE_FLOAT       CV_NODE_REAL
+#define CV_NODE_STR         3
+#define CV_NODE_STRING      CV_NODE_STR
+#define CV_NODE_REF         4 /**< not used */
+#define CV_NODE_SEQ         5
+#define CV_NODE_MAP         6
+#define CV_NODE_TYPE_MASK   7
+
+#define CV_NODE_TYPE(flags)  ((flags) & CV_NODE_TYPE_MASK)
+
+/** file node flags */
+#define CV_NODE_FLOW        8 /**<Used only for writing structures in YAML format. */
+#define CV_NODE_USER        16
+#define CV_NODE_EMPTY       32
+#define CV_NODE_NAMED       64
+
+#define CV_NODE_IS_INT(flags)        (CV_NODE_TYPE(flags) == CV_NODE_INT)
+#define CV_NODE_IS_REAL(flags)       (CV_NODE_TYPE(flags) == CV_NODE_REAL)
+#define CV_NODE_IS_STRING(flags)     (CV_NODE_TYPE(flags) == CV_NODE_STRING)
+#define CV_NODE_IS_SEQ(flags)        (CV_NODE_TYPE(flags) == CV_NODE_SEQ)
+#define CV_NODE_IS_MAP(flags)        (CV_NODE_TYPE(flags) == CV_NODE_MAP)
+#define CV_NODE_IS_COLLECTION(flags) (CV_NODE_TYPE(flags) >= CV_NODE_SEQ)
+#define CV_NODE_IS_FLOW(flags)       (((flags) & CV_NODE_FLOW) != 0)
+#define CV_NODE_IS_EMPTY(flags)      (((flags) & CV_NODE_EMPTY) != 0)
+#define CV_NODE_IS_USER(flags)       (((flags) & CV_NODE_USER) != 0)
+#define CV_NODE_HAS_NAME(flags)      (((flags) & CV_NODE_NAMED) != 0)
+
+#define CV_NODE_SEQ_SIMPLE 256
+#define CV_NODE_SEQ_IS_SIMPLE(seq) (((seq)->flags & CV_NODE_SEQ_SIMPLE) != 0)
+
+typedef struct CvString
+{
+    int len;
+    char* ptr;
+}
+CvString;
+
+/** All the keys (names) of elements in the read file storage
+   are stored in the hash to speed up the lookup operations: */
+typedef struct CvStringHashNode
+{
+    unsigned hashval;
+    CvString str;
+    struct CvStringHashNode* next;
+}
+CvStringHashNode;
+
+typedef struct CvGenericHash CvFileNodeHash;
+
+/** Basic element of the file storage - scalar or collection: */
+typedef struct CvFileNode
+{
+    int tag;
+    struct CvTypeInfo* info; /**< type information
+            (only for user-defined object, for others it is 0) */
+    union
+    {
+        double f; /**< scalar floating-point number */
+        int i;    /**< scalar integer number */
+        CvString str; /**< text string */
+        CvSeq* seq; /**< sequence (ordered collection of file nodes) */
+        CvFileNodeHash* map; /**< map (collection of named file nodes) */
+    } data;
+}
+CvFileNode;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+typedef int (CV_CDECL *CvIsInstanceFunc)( const void* struct_ptr );
+typedef void (CV_CDECL *CvReleaseFunc)( void** struct_dblptr );
+typedef void* (CV_CDECL *CvReadFunc)( CvFileStorage* storage, CvFileNode* node );
+typedef void (CV_CDECL *CvWriteFunc)( CvFileStorage* storage, const char* name,
+                                      const void* struct_ptr, CvAttrList attributes );
+typedef void* (CV_CDECL *CvCloneFunc)( const void* struct_ptr );
+#ifdef __cplusplus
+}
+#endif
+
+/** @brief Type information
+
+The structure contains information about one of the standard or user-defined types. Instances of the
+type may or may not contain a pointer to the corresponding CvTypeInfo structure. In any case, there
+is a way to find the type info structure for a given object using the cvTypeOf function.
+Alternatively, type info can be found by type name using cvFindType, which is used when an object
+is read from file storage. The user can register a new type with cvRegisterType that adds the type
+information structure into the beginning of the type list. Thus, it is possible to create
+specialized types from generic standard types and override the basic methods.
+ */
+typedef struct CvTypeInfo
+{
+    int flags; /**< not used */
+    int header_size; /**< sizeof(CvTypeInfo) */
+    struct CvTypeInfo* prev; /**< previous registered type in the list */
+    struct CvTypeInfo* next; /**< next registered type in the list */
+    const char* type_name; /**< type name, written to file storage */
+    CvIsInstanceFunc is_instance; /**< checks if the passed object belongs to the type */
+    CvReleaseFunc release; /**< releases object (memory etc.) */
+    CvReadFunc read; /**< reads object from file storage */
+    CvWriteFunc write; /**< writes object to file storage */
+    CvCloneFunc clone; /**< creates a copy of the object */
+}
+CvTypeInfo;
+#endif
+
+/** @} */
+
+#endif /*OPENCV_CORE_TYPES_H*/
+
+/* End of file. */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/utility.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/utility.hpp
new file mode 100644
index 000000000000..e491352bcf6c
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/utility.hpp
@@ -0,0 +1,1231 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_UTILITY_H
+#define OPENCV_CORE_UTILITY_H
+
+#ifndef __cplusplus
+#  error utility.hpp header must be compiled as C++
+#endif
+
+#if defined(check)
+#  warning Detected Apple 'check' macro definition, it can cause build conflicts. Please, include this header before any Apple headers.
+#endif
+
+#include "opencv2/core.hpp"
+#include <ostream>
+
+#include <functional>
+
+#if !defined(_M_CEE)
+#include <mutex>  // std::mutex, std::lock_guard
+#endif
+
+namespace cv
+{
+
+//! @addtogroup core_utils
+//! @{
+
+/** @brief  Automatically Allocated Buffer Class
+
+ The class is used for temporary buffers in functions and methods.
+ If a temporary buffer is usually small (a few K's of memory),
+ but its size depends on the parameters, it makes sense to create a small
+ fixed-size array on stack and use it if it's large enough. If the required buffer size
+ is larger than the fixed size, another buffer of sufficient size is allocated dynamically
+ and released after the processing. Therefore, in typical cases, when the buffer size is small,
+ there is no overhead associated with malloc()/free().
+ At the same time, there is no limit on the size of processed data.
+
+ This is what AutoBuffer does. The template takes 2 parameters - type of the buffer elements and
+ the number of stack-allocated elements. Here is how the class is used:
+
+ \code
+ void my_func(const cv::Mat& m)
+ {
+    cv::AutoBuffer<float> buf(1000); // create automatic buffer containing 1000 floats
+
+    buf.allocate(m.rows); // if m.rows <= 1000, the pre-allocated buffer is used,
+                          // otherwise the buffer of "m.rows" floats will be allocated
+                          // dynamically and deallocated in cv::AutoBuffer destructor
+    ...
+ }
+ \endcode
+*/
+#ifdef OPENCV_ENABLE_MEMORY_SANITIZER
+template<typename _Tp, size_t fixed_size = 0> class AutoBuffer
+#else
+template<typename _Tp, size_t fixed_size = 1024/sizeof(_Tp)+8> class AutoBuffer
+#endif
+{
+public:
+    typedef _Tp value_type;
+
+    //! the default constructor
+    AutoBuffer();
+    //! constructor taking the real buffer size
+    explicit AutoBuffer(size_t _size);
+
+    //! the copy constructor
+    AutoBuffer(const AutoBuffer<_Tp, fixed_size>& buf);
+    //! the assignment operator
+    AutoBuffer<_Tp, fixed_size>& operator = (const AutoBuffer<_Tp, fixed_size>& buf);
+
+    //! destructor. calls deallocate()
+    ~AutoBuffer();
+
+    //! allocates the new buffer of size _size. if the _size is small enough, stack-allocated buffer is used
+    void allocate(size_t _size);
+    //! deallocates the buffer if it was dynamically allocated
+    void deallocate();
+    //! resizes the buffer and preserves the content
+    void resize(size_t _size);
+    //! returns the current buffer size
+    size_t size() const;
+    //! returns pointer to the real buffer, stack-allocated or heap-allocated
+    inline _Tp* data() { return ptr; }
+    //! returns read-only pointer to the real buffer, stack-allocated or heap-allocated
+    inline const _Tp* data() const { return ptr; }
+
+#if !defined(OPENCV_DISABLE_DEPRECATED_COMPATIBILITY) // deprecated implicit conversions; use .data() calls instead
+    //! returns pointer to the real buffer, stack-allocated or heap-allocated
+    operator _Tp* () { return ptr; }
+    //! returns read-only pointer to the real buffer, stack-allocated or heap-allocated
+    operator const _Tp* () const { return ptr; }
+#else
+    //! returns a reference to the element at specified location. No bounds checking is performed in Release builds.
+    inline _Tp& operator[] (size_t i) { CV_DbgCheckLT(i, sz, "out of range"); return ptr[i]; }
+    //! returns a reference to the element at specified location. No bounds checking is performed in Release builds.
+    inline const _Tp& operator[] (size_t i) const { CV_DbgCheckLT(i, sz, "out of range"); return ptr[i]; }
+#endif
+
+protected:
+    //! pointer to the real buffer, can point to buf if the buffer is small enough
+    _Tp* ptr;
+    //! size of the real buffer
+    size_t sz;
+    //! pre-allocated buffer. At least 1 element to conform to C++ standard requirements (zero-length arrays are not allowed)
+    _Tp buf[(fixed_size > 0) ? fixed_size : 1];
+};
+
+/**  @brief Sets/resets the break-on-error mode.
+
+When the break-on-error mode is set, the default error handler issues a hardware exception, which
+can make debugging more convenient.
+
+\return the previous state
+ */
+CV_EXPORTS bool setBreakOnError(bool flag);
+
+extern "C" typedef int (*ErrorCallback)( int status, const char* func_name,
+                                       const char* err_msg, const char* file_name,
+                                       int line, void* userdata );
+
+
+/** @brief Sets the new error handler and the optional user data.
+
+  The function sets the new error handler, called from cv::error().
+
+  \param errCallback the new error handler. If NULL, the default error handler is used.
+  \param userdata the optional user data pointer, passed to the callback.
+  \param prevUserdata the optional output parameter where the previous user data pointer is stored
+
+  \return the previous error handler
+*/
+CV_EXPORTS ErrorCallback redirectError( ErrorCallback errCallback, void* userdata=0, void** prevUserdata=0);
+
+CV_EXPORTS String tempfile( const char* suffix = 0);
+CV_EXPORTS void glob(String pattern, std::vector<String>& result, bool recursive = false);
+
+/** @brief OpenCV will try to set the number of threads for subsequent parallel regions.
+
+If threads == 1, OpenCV will disable threading optimizations and run all its functions
+sequentially. Passing threads \< 0 will reset threads number to system default.
+The function is not thread-safe. It must not be called in parallel region or concurrent threads.
+
+OpenCV will try to run its functions with the specified number of threads, but the behaviour
+depends on the framework:
+-   `TBB` - User-defined parallel constructions will run with the same threads number, if
+    another is not specified. If later on user creates his own scheduler, OpenCV will use it.
+-   `OpenMP` - No special defined behaviour.
+-   `Concurrency` - If threads == 1, OpenCV will disable threading optimizations and run its
+    functions sequentially.
+-   `GCD` - Supports only values \<= 0.
+-   `C=` - No special defined behaviour.
+@param nthreads Number of threads used by OpenCV.
+@sa getNumThreads, getThreadNum
+ */
+CV_EXPORTS_W void setNumThreads(int nthreads);
+
+/** @brief Returns the number of threads used by OpenCV for parallel regions.
+
+Always returns 1 if OpenCV is built without threading support.
+
+The exact meaning of return value depends on the threading framework used by OpenCV library:
+- `TBB` - The number of threads, that OpenCV will try to use for parallel regions. If there is
+  any tbb::thread_scheduler_init in user code conflicting with OpenCV, then function returns
+  default number of threads used by TBB library.
+- `OpenMP` - An upper bound on the number of threads that could be used to form a new team.
+- `Concurrency` - The number of threads, that OpenCV will try to use for parallel regions.
+- `GCD` - Unsupported; returns the GCD thread pool limit (512) for compatibility.
+- `C=` - The number of threads, that OpenCV will try to use for parallel regions, if before
+  called setNumThreads with threads \> 0, otherwise returns the number of logical CPUs,
+  available for the process.
+@sa setNumThreads, getThreadNum
+ */
+CV_EXPORTS_W int getNumThreads();
+
+/** @brief Returns the index of the currently executed thread within the current parallel region. Always
+returns 0 if called outside of parallel region.
+
+@deprecated The current implementation doesn't correspond to this documentation.
+
+The exact meaning of the return value depends on the threading framework used by OpenCV library:
+- `TBB` - Unsupported with current 4.1 TBB release. Maybe will be supported in future.
+- `OpenMP` - The thread number, within the current team, of the calling thread.
+- `Concurrency` - An ID for the virtual processor that the current context is executing on (0
+  for master thread and unique number for others, but not necessarily 1,2,3,...).
+- `GCD` - System calling thread's ID. Never returns 0 inside parallel region.
+- `C=` - The index of the current parallel task.
+@sa setNumThreads, getNumThreads
+ */
+CV_EXPORTS_W int getThreadNum();
+
+/** @brief Returns full configuration time cmake output.
+
+Returned value is raw cmake output including version control system revision, compiler version,
+compiler flags, enabled modules and third party libraries, etc. Output format depends on target
+architecture.
+ */
+CV_EXPORTS_W const String& getBuildInformation();
+
+/** @brief Returns library version string
+
+For example "3.4.1-dev".
+
+@sa getMajorVersion, getMinorVersion, getRevisionVersion
+*/
+CV_EXPORTS_W String getVersionString();
+
+/** @brief Returns major library version */
+CV_EXPORTS_W int getVersionMajor();
+
+/** @brief Returns minor library version */
+CV_EXPORTS_W int getVersionMinor();
+
+/** @brief Returns revision field of the library version */
+CV_EXPORTS_W int getVersionRevision();
+
+/** @brief Returns the number of ticks.
+
+The function returns the number of ticks after the certain event (for example, when the machine was
+turned on). It can be used to initialize RNG or to measure a function execution time by reading the
+tick count before and after the function call.
+@sa getTickFrequency, TickMeter
+ */
+CV_EXPORTS_W int64 getTickCount();
+
+/** @brief Returns the number of ticks per second.
+
+The function returns the number of ticks per second. That is, the following code computes the
+execution time in seconds:
+@code
+    double t = (double)getTickCount();
+    // do something ...
+    t = ((double)getTickCount() - t)/getTickFrequency();
+@endcode
+@sa getTickCount, TickMeter
+ */
+CV_EXPORTS_W double getTickFrequency();
+
+/** @brief A class to measure elapsed time.
+
+The class computes elapsed time by counting ticks and dividing by the number of ticks per second. That is, the following code computes the
+execution time in seconds:
+@snippet snippets/core_various.cpp TickMeter_total
+
+It is also possible to compute the average time over multiple runs:
+@snippet snippets/core_various.cpp TickMeter_average
+
+@sa getTickCount, getTickFrequency
+*/
+class CV_EXPORTS_W TickMeter
+{
+public:
+    //! creates a meter with zeroed counter, tick sum and start mark.
+    CV_WRAP TickMeter()
+    {
+        reset();
+    }
+
+    //! remembers the current tick count as the start of the next interval.
+    CV_WRAP void start()
+    {
+        startTime = cv::getTickCount();
+    }
+
+    //! closes the running interval and accumulates it; does nothing when no start mark is set.
+    CV_WRAP void stop()
+    {
+        const int64 stopTicks = cv::getTickCount();
+        if (startTime == 0)
+            return;
+        sumTime += (stopTicks - startTime);
+        ++counter;
+        startTime = 0;
+    }
+
+    //! returns the sum of ticks over all completed intervals.
+    CV_WRAP int64 getTimeTicks() const
+    {
+        return sumTime;
+    }
+
+    //! returns the accumulated time in microseconds.
+    CV_WRAP double getTimeMicro() const
+    {
+        return 1e3 * getTimeMilli();
+    }
+
+    //! returns the accumulated time in milliseconds.
+    CV_WRAP double getTimeMilli() const
+    {
+        return 1e3 * getTimeSec();
+    }
+
+    //! returns the accumulated time in seconds.
+    CV_WRAP double getTimeSec()   const
+    {
+        return static_cast<double>(sumTime) / getTickFrequency();
+    }
+
+    //! returns the number of completed start()/stop() intervals.
+    CV_WRAP int64 getCounter() const
+    {
+        return counter;
+    }
+
+    //! returns the average number of intervals per second (FPS); 0 when almost no time has accumulated.
+    CV_WRAP double getFPS() const
+    {
+        const double elapsedSec = getTimeSec();
+        const bool tooShort = elapsedSec < DBL_EPSILON;
+        // avoid dividing by a (near-)zero duration
+        return tooShort ? 0. : counter / elapsedSec;
+    }
+
+    //! returns the average interval length in seconds; 0 when no interval has completed.
+    CV_WRAP double getAvgTimeSec() const
+    {
+        const bool hasSamples = counter > 0;
+        // getTimeSec() is only evaluated when there is something to average
+        return hasSamples ? getTimeSec() / counter : 0.;
+    }
+
+    //! returns the average interval length in milliseconds.
+    CV_WRAP double getAvgTimeMilli() const
+    {
+        return 1e3 * getAvgTimeSec();
+    }
+
+    //! clears the counter, the tick sum and the start mark.
+    CV_WRAP void reset()
+    {
+        counter = 0;
+        sumTime = 0;
+        startTime = 0;
+    }
+
+private:
+    int64 counter;   //!< number of completed intervals
+    int64 sumTime;   //!< ticks accumulated over completed intervals
+    int64 startTime; //!< tick count taken by the last start(); 0 means no interval is open
+};
+
+/** @brief output operator
+@code
+TickMeter tm;
+tm.start();
+// do something ...
+tm.stop();
+std::cout << tm;
+@endcode
+*/
+
+static inline std::ostream& operator << (std::ostream& out, const TickMeter& tm)
+{
+    out << tm.getTimeSec(); // accumulated time in seconds ...
+    return out << "sec";    // ... immediately followed by the unit suffix
+}
+
+/** @brief Returns the number of CPU ticks.
+
+The function returns the current number of CPU ticks on some architectures (such as x86, x64,
+PowerPC). On other platforms the function is equivalent to getTickCount. It can also be used for
+very accurate time measurements, as well as for RNG initialization. Note that in case of multi-CPU
+systems a thread, from which getCPUTickCount is called, can be suspended and resumed at another CPU
+with its own counter. So, theoretically (and practically) the subsequent calls to the function do
+not necessarily return monotonically increasing values. Also, since a modern CPU varies the CPU
+frequency depending on the load, the number of CPU clocks spent in some code cannot be directly
+converted to time units. Therefore, getTickCount is generally a preferable solution for measuring
+execution time.
+ */
+CV_EXPORTS_W int64 getCPUTickCount();
+
+/** @brief Returns true if the specified feature is supported by the host hardware.
+
+The function returns true if the host hardware supports the specified feature. When user calls
+setUseOptimized(false), the subsequent calls to checkHardwareSupport() will return false until
+setUseOptimized(true) is called. This way user can dynamically switch on and off the optimized code
+in OpenCV.
+@param feature The feature of interest, one of cv::CpuFeatures
+ */
+CV_EXPORTS_W bool checkHardwareSupport(int feature);
+
+/** @brief Returns feature name by ID
+
+Returns empty string if feature is not defined
+*/
+CV_EXPORTS_W String getHardwareFeatureName(int feature);
+
+/** @brief Returns list of CPU features enabled during compilation.
+
+Returned value is a string containing space separated list of CPU features with following markers:
+
+- no markers - baseline features
+- prefix `*` - features enabled in dispatcher
+- suffix `?` - features enabled but not available in HW
+
+Example: `SSE SSE2 SSE3 *SSE4.1 *SSE4.2 *FP16 *AVX *AVX2 *AVX512-SKX?`
+*/
+CV_EXPORTS_W std::string getCPUFeaturesLine();
+
+/** @brief Returns the number of logical CPUs available for the process.
+ */
+CV_EXPORTS_W int getNumberOfCPUs();
+
+
+/** @brief Aligns a pointer to the specified number of bytes.
+
+The function returns the aligned pointer of the same type as the input pointer:
+\f[\texttt{(_Tp*)(((size_t)ptr + n-1) & -n)}\f]
+@param ptr Pointer to align.
+@param n Alignment size that must be a power of two.
+ */
+template<typename _Tp> static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp))
+{
+    CV_DbgAssert((n & (n - 1)) == 0); // the mask trick below requires a power-of-2 alignment
+    return reinterpret_cast<_Tp*>((reinterpret_cast<size_t>(ptr) + n-1) & -n); // bump past the boundary, then clear the low bits
+}
+
+/** @brief Aligns a buffer size to the specified number of bytes.
+
+The function returns the minimum number that is greater than or equal to sz and is divisible by n :
+\f[\texttt{(sz + n-1) & -n}\f]
+@param sz Buffer size to align.
+@param n Alignment size that must be a power of two.
+ */
+static inline size_t alignSize(size_t sz, int n)
+{
+    CV_DbgAssert((n & (n - 1)) == 0); // the mask trick below requires a power-of-2 alignment
+    return (sz + n-1) & static_cast<size_t>(-n); // round up: add n-1, then clear the low bits
+}
+
+/** @brief Integer division with the result rounded up.
+
+Use this function instead of `ceil((float)a / b)` expressions.
+
+@sa alignSize
+*/
+static inline int divUp(int a, unsigned int b)
+{
+    CV_DbgAssert(a >= 0);
+    return static_cast<int>((static_cast<unsigned int>(a) + b - 1) / b); // unsigned arithmetic, narrowed back to int
+}
+/** @overload */
+static inline size_t divUp(size_t a, unsigned int b)
+{
+    return (a + b - 1) / static_cast<size_t>(b); // smallest q such that q * b >= a (for b > 0, barring wrap-around)
+}
+
+/** @brief Round first value up to the nearest multiple of second value.
+
+Use this function instead of `ceil((float)a / b) * b` expressions.
+
+@sa divUp
+*/
+static inline int roundUp(int a, unsigned int b)
+{
+    CV_DbgAssert(a >= 0);
+    return (a + b - 1) / b * b; // truncating division drops the remainder, multiplying restores the multiple
+}
+/** @overload */
+static inline size_t roundUp(size_t a, unsigned int b)
+{
+    return (a + b - 1) / b * b; // truncating division drops the remainder, multiplying restores the multiple
+}
+
+/** @brief Alignment check of passed values
+
+Usage: `isAligned<sizeof(int)>(...)`
+
+@note Alignment(N) must be a power of 2 (2**k, 2^k)
+*/
+template<int N, typename T> static inline
+bool isAligned(const T& data)
+{
+    CV_StaticAssert((N & (N - 1)) == 0, "");  // N must be a power of 2 for the mask test below
+    return !(((size_t)data) & (N - 1));       // aligned exactly when no bit below N is set
+}
+/** @overload */
+template<int N> static inline
+bool isAligned(const void* p1)
+{
+    return isAligned<N, size_t>(reinterpret_cast<size_t>(p1)); // test the address as an integer
+}
+/** @overload */
+template<int N> static inline
+bool isAligned(const void* p1, const void* p2)
+{
+    return isAligned<N, size_t>(reinterpret_cast<size_t>(p1) | reinterpret_cast<size_t>(p2)); // OR-ed addresses: a low bit set in any pointer fails the test
+}
+/** @overload */
+template<int N> static inline
+bool isAligned(const void* p1, const void* p2, const void* p3)
+{
+    return isAligned<N, size_t>(reinterpret_cast<size_t>(p1) | reinterpret_cast<size_t>(p2) | reinterpret_cast<size_t>(p3)); // OR-ed addresses: a low bit set in any pointer fails the test
+}
+/** @overload */
+template<int N> static inline
+bool isAligned(const void* p1, const void* p2, const void* p3, const void* p4)
+{
+    return isAligned<N, size_t>(reinterpret_cast<size_t>(p1) | reinterpret_cast<size_t>(p2) | reinterpret_cast<size_t>(p3) | reinterpret_cast<size_t>(p4)); // OR-ed addresses: a low bit set in any pointer fails the test
+}
+
+/** @brief Enables or disables the optimized code.
+
+The function can be used to dynamically turn on and off optimized dispatched code (code that uses SSE4.2, AVX/AVX2,
+and other instructions on the platforms that support it). It sets a global flag that is further
+checked by OpenCV functions. Since the flag is not checked in the inner OpenCV loops, it is only
+safe to call the function on the very top level in your application where you can be sure that no
+other OpenCV function is currently executed.
+
+By default, the optimized code is enabled unless you disable it in CMake. The current status can be
+retrieved using useOptimized.
+@param onoff The boolean flag specifying whether the optimized code should be used (onoff=true)
+or not (onoff=false).
+ */
+CV_EXPORTS_W void setUseOptimized(bool onoff);
+
+/** @brief Returns the status of optimized code usage.
+
+The function returns true if the optimized code is enabled. Otherwise, it returns false.
+ */
+CV_EXPORTS_W bool useOptimized();
+
+static inline size_t getElemSize(int type) { return static_cast<size_t>(CV_ELEM_SIZE(type)); } // CV_ELEM_SIZE result widened to size_t
+
+/////////////////////////////// Parallel Primitives //////////////////////////////////
+
+/** @brief Base class for parallel data processors
+
+@ingroup core_parallel
+*/
+class CV_EXPORTS ParallelLoopBody
+{
+public:
+    virtual ~ParallelLoopBody(); // virtual: derived bodies may be destroyed through a base pointer/reference
+    virtual void operator() (const Range& range) const = 0; // work for one sub-range; must be provided by the derived class
+};
+
+/** @brief Parallel data processor
+
+@ingroup core_parallel
+*/
+CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body, double nstripes=-1.);
+
+//! @ingroup core_parallel
+class ParallelLoopBodyLambdaWrapper : public ParallelLoopBody
+{
+    //! callable invoked for every sub-range; stored by value inside the wrapper
+    std::function<void(const Range&)> m_functor;
+
+public:
+    //! wraps @p functor (a copy of it is kept in the wrapper)
+    inline ParallelLoopBodyLambdaWrapper(std::function<void(const Range&)> functor)
+        : m_functor(functor)
+    {
+    }
+
+    //! forwards the sub-range to the stored callable
+    virtual void operator() (const cv::Range& range) const CV_OVERRIDE
+    {
+        m_functor(range);
+    }
+};
+
+//! @ingroup core_parallel
+static inline void parallel_for_(const Range& range, std::function<void(const Range&)> functor, double nstripes=-1.)
+{
+    const ParallelLoopBodyLambdaWrapper body(functor); // adapt the callable to the ParallelLoopBody interface
+    parallel_for_(range, body, nstripes);              // resolves to the ParallelLoopBody overload
+}
+
+
+/////////////////////////////// forEach method of cv::Mat ////////////////////////////
+template<typename _Tp, typename Functor> inline
+void Mat::forEach_impl(const Functor& operation) {
+    if (false) { // never executed: exists only so the compiler checks the functor signature
+        operation(*reinterpret_cast<_Tp*>(0), reinterpret_cast<int*>(0));
+        // If compilation fails on this line,
+        // please check that your functor signature is
+        //     (_Tp&, const int*)   <- multi-dimensional
+        //  or (_Tp&, void*)        <- in case you don't need current idx.
+    }
+
+    CV_Assert(!empty());
+    CV_Assert(this->total() / this->size[this->dims - 1] <= INT_MAX);
+    const int LINES = static_cast<int>(this->total() / this->size[this->dims - 1]); // number of runs along the last dimension
+
+    class PixelOperationWrapper :public ParallelLoopBody
+    {
+    public:
+        PixelOperationWrapper(Mat_<_Tp>* const frame, const Functor& _operation)
+            : mat(frame), op(_operation) {}
+        virtual ~PixelOperationWrapper(){}
+        // ! Overloaded virtual operator:
+        // converts a range call into one row call per line of the range.
+        virtual void operator()(const Range &range) const CV_OVERRIDE
+        {
+            const int DIMS = mat->dims;
+            const int COLS = mat->size[DIMS - 1];
+            if (DIMS <= 2) {
+                for (int row = range.start; row < range.end; ++row) {
+                    this->rowCall2(row, COLS);
+                }
+            } else {
+                std::vector<int> idx(DIMS); /// idx is modified in this->rowCall
+                idx[DIMS - 2] = range.start - 1; // one before the first line: the loop increments before use
+
+                for (int line_num = range.start; line_num < range.end; ++line_num) {
+                    idx[DIMS - 2]++;
+                    for (int i = DIMS - 2; i >= 0; --i) {
+                        if (idx[i] >= mat->size[i]) { // NOTE(review): for i == 0 the next line would touch idx[-1]; presumably unreachable because range is bounded by LINES - confirm
+                            idx[i - 1] += idx[i] / mat->size[i];
+                            idx[i] %= mat->size[i];
+                            continue; // carry-over into the next more significant dimension
+                        }
+                        else {
+                            break;
+                        }
+                    }
+                    this->rowCall(&idx[0], COLS, DIMS);
+                }
+            }
+        }
+    private:
+        Mat_<_Tp>* const mat;
+        const Functor op; // copy of the user functor, taken in the constructor
+        // ! Calls the functor for each element of the row addressed by idx (last index runs 0..COLS-1).
+        inline void rowCall(int* const idx, const int COLS, const int DIMS) const {
+            int &col = idx[DIMS - 1];
+            col = 0;
+            _Tp* pixel = &(mat->template at<_Tp>(idx));
+
+            while (col < COLS) {
+                op(*pixel, const_cast<const int*>(idx));
+                pixel++; col++;
+            }
+            col = 0; // leave the last index reset for the next call
+        }
+        // ! Calls the functor for each element of this row. 2d mat special version.
+        inline void rowCall2(const int row, const int COLS) const {
+            union Index{
+                int body[2];
+                operator const int*() const {
+                    return reinterpret_cast<const int*>(this);
+                }
+                int& operator[](const int i) {
+                    return body[i];
+                }
+            } idx = {{row, 0}};
+            // Special union is needed to avoid
+            // "error: array subscript is above array bounds [-Werror=array-bounds]"
+            // when the functor `op` is called and accesses e.g. idx[3].
+
+            _Tp* pixel = &(mat->template at<_Tp>(idx));
+            const _Tp* const pixel_end = pixel + COLS;
+            while(pixel < pixel_end) {
+                op(*pixel++, static_cast<const int*>(idx));
+                idx[1]++; // advance the column index passed to the functor
+            }
+        }
+        PixelOperationWrapper& operator=(const PixelOperationWrapper &) {
+            CV_Assert(false);
+            // We can not remove this implementation because Visual Studio warning C4822.
+            return *this;
+        }
+    };
+
+    parallel_for_(cv::Range(0, LINES), PixelOperationWrapper(reinterpret_cast<Mat_<_Tp>*>(this), operation));
+}
+
+/////////////////////////// Synchronization Primitives ///////////////////////////////
+
+#if !defined(_M_CEE) // NOTE(review): _M_CEE is presumably the MSVC C++/CLI (/clr) macro - confirm
+#ifndef OPENCV_DISABLE_THREAD_SUPPORT
+typedef std::recursive_mutex Mutex; // threads enabled: cv::Mutex is the standard recursive mutex
+typedef std::lock_guard<cv::Mutex> AutoLock; // scoped lock over cv::Mutex
+#else // OPENCV_DISABLE_THREAD_SUPPORT
+// Custom (failing) replacement for `std::recursive_mutex`: both operations raise StsNotImplemented.
+struct Mutex {
+    void lock(){
+        CV_Error(cv::Error::StsNotImplemented,
+                 "cv::Mutex is disabled by OPENCV_DISABLE_THREAD_SUPPORT=ON");
+    }
+    void unlock(){
+        CV_Error(cv::Error::StsNotImplemented,
+                 "cv::Mutex is disabled by OPENCV_DISABLE_THREAD_SUPPORT=ON");
+    }
+};
+// Stub for cv::AutoLock when threads are disabled: it never touches the mutex.
+struct AutoLock {
+    AutoLock(Mutex &) { } // deliberately does not call lock(), so constructing it does not raise the error above
+};
+#endif // OPENCV_DISABLE_THREAD_SUPPORT
+#endif // !defined(_M_CEE)
+
+
+/** @brief Designed for command line parsing
+
+The sample below demonstrates how to use CommandLineParser:
+@code
+    CommandLineParser parser(argc, argv, keys);
+    parser.about("Application name v1.0.0");
+
+    if (parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+
+    int N = parser.get<int>("N");
+    double fps = parser.get<double>("fps");
+    String path = parser.get<String>("path");
+
+    use_time_stamp = parser.has("timestamp");
+
+    String img1 = parser.get<String>(0);
+    String img2 = parser.get<String>(1);
+
+    int repeat = parser.get<int>(2);
+
+    if (!parser.check())
+    {
+        parser.printErrors();
+        return 0;
+    }
+@endcode
+
+### Keys syntax
+
+The keys parameter is a string containing several blocks, each one is enclosed in curly braces and
+describes one argument. Each argument contains three parts separated by the `|` symbol:
+
+-# argument names is a list of option synonyms separated by standard space characters ' ' (to mark argument as positional, prefix it with the `@` symbol)
+-# default value will be used if the argument was not provided (can be empty)
+-# help message (can be empty)
+
+For example:
+
+@code{.cpp}
+    const String keys =
+        "{help h usage ? |      | print this message   }"
+        "{@image1        |      | image1 for compare   }"
+        "{@image2        |<none>| image2 for compare   }"
+        "{@repeat        |1     | number               }"
+        "{path           |.     | path to file         }"
+        "{fps            | -1.0 | fps for output video }"
+        "{N count        |100   | count of objects     }"
+        "{ts timestamp   |      | use time stamp       }"
+        ;
+}
+@endcode
+
+Note that there are no default values for `help` and `timestamp` so we can check their presence using the `has()` method.
+Arguments with default values are considered to be always present. Use the `get()` method in these cases to check their
+actual value instead.
+Note that whitespace characters other than standard spaces are considered part of the string.
+Additionally, leading and trailing standard spaces around the help messages are ignored.
+
+String keys like `get<String>("@image1")` return the empty string `""` by default - even with an empty default value.
+Use the special `<none>` default value to enforce that the returned string must not be empty. (like in `get<String>("@image2")`)
+
+### Usage
+
+For the described keys:
+
+@code{.sh}
+    # Good call (3 positional parameters: image1, image2 and repeat; N is 200, ts is true)
+    $ ./app -N=200 1.png 2.jpg 19 -ts
+
+    # Bad call
+    $ ./app -fps=aaa
+    ERRORS:
+    Parameter 'fps': can not convert: [aaa] to [double]
+@endcode
+ */
+class CV_EXPORTS CommandLineParser
+{
+public:
+
+    /** @brief Constructor
+
+    Initializes command line parser object
+
+    @param argc number of command line arguments (from main())
+    @param argv array of command line arguments (from main())
+    @param keys string describing acceptable command line parameters (see class description for syntax)
+    */
+    CommandLineParser(int argc, const char* const argv[], const String& keys);
+
+    /** @brief Copy constructor */
+    CommandLineParser(const CommandLineParser& parser);
+
+    /** @brief Assignment operator */
+    CommandLineParser& operator = (const CommandLineParser& parser);
+
+    /** @brief Destructor */
+    ~CommandLineParser();
+
+    /** @brief Returns application path
+
+    This method returns the path to the executable from the command line (`argv[0]`).
+
+    For example, if the application has been started with such a command:
+    @code{.sh}
+    $ ./bin/my-executable
+    @endcode
+    this method will return `./bin`.
+    */
+    String getPathToApplication() const;
+
+    /** @brief Access arguments by name
+
+    Returns the argument converted to the selected type. If the argument is not known or cannot be
+    converted to the selected type, the error flag is set (can be checked with @ref check).
+
+    For example, define:
+    @code{.cpp}
+    String keys = "{N count||}";
+    @endcode
+
+    Call:
+    @code{.sh}
+    $ ./my-app -N=20
+    # or
+    $ ./my-app --count=20
+    @endcode
+
+    Access:
+    @code{.cpp}
+    int N = parser.get<int>("N");
+    @endcode
+
+    @param name name of the argument
+    @param space_delete remove spaces from the left and right of the string
+    @tparam T the argument will be converted to this type if possible
+
+    @note You can access positional arguments by their `@`-prefixed name:
+    @code{.cpp}
+    parser.get<String>("@image");
+    @endcode
+     */
+    template <typename T>
+    T get(const String& name, bool space_delete = true) const
+    {
+        T val = T(); // value-initialized result, handed to getByName() as the destination
+        getByName(name, space_delete, ParamType<T>::type, (void*)&val); // type-erased lookup writes through the pointer
+        return val;
+    }
+
+    /** @brief Access positional arguments by index
+
+    Returns the argument converted to the selected type. Indexes are counted from zero.
+
+    For example, define:
+    @code{.cpp}
+    String keys = "{@arg1||}{@arg2||}"
+    @endcode
+
+    Call:
+    @code{.sh}
+    ./my-app abc qwe
+    @endcode
+
+    Access arguments:
+    @code{.cpp}
+    String val_1 = parser.get<String>(0); // returns "abc", arg1
+    String val_2 = parser.get<String>(1); // returns "qwe", arg2
+    @endcode
+
+    @param index index of the argument
+    @param space_delete remove spaces from the left and right of the string
+    @tparam T the argument will be converted to this type if possible
+     */
+    template <typename T>
+    T get(int index, bool space_delete = true) const
+    {
+        T val = T(); // value-initialized result, handed to getByIndex() as the destination
+        getByIndex(index, space_delete, ParamType<T>::type, (void*)&val); // type-erased lookup writes through the pointer
+        return val;
+    }
+
+    /** @brief Check if field was provided in the command line
+
+    @param name argument name to check
+    */
+    bool has(const String& name) const;
+
+    /** @brief Check for parsing errors
+
+    Returns false if an error occurred while accessing the parameters (bad conversion, missing arguments,
+    etc.). Call @ref printErrors to print the list of error messages.
+     */
+    bool check() const;
+
+    /** @brief Set the about message
+
+    The about message will be shown when @ref printMessage is called, right before the arguments table.
+     */
+    void about(const String& message);
+
+    /** @brief Print help message
+
+    This method will print the standard help message containing the about message and the arguments description.
+
+    @sa about
+    */
+    void printMessage() const;
+
+    /** @brief Print the list of errors that occurred
+
+    @sa check
+    */
+    void printErrors() const;
+
+protected:
+    void getByName(const String& name, bool space_delete, Param type, void* dst) const; // type-erased back end of get<T>(name)
+    void getByIndex(int index, bool space_delete, Param type, void* dst) const; // type-erased back end of get<T>(index)
+
+    struct Impl; // only forward-declared here; the definition is not part of this header
+    Impl* impl;
+};
+
+//! @} core_utils
+
+//! @cond IGNORED
+
+/////////////////////////////// AutoBuffer implementation ////////////////////////////////////////
+
+template<typename _Tp, size_t fixed_size> inline
+AutoBuffer<_Tp, fixed_size>::AutoBuffer()
+{
+    sz = fixed_size; // report the full size of the built-in buffer
+    ptr = buf;       // no heap block yet: point at the built-in buffer
+}
+
+template<typename _Tp, size_t fixed_size> inline
+AutoBuffer<_Tp, fixed_size>::AutoBuffer(size_t _size)
+{
+    sz = fixed_size;  // start in the same state as the default constructor ...
+    ptr = buf;
+    this->allocate(_size); // ... then let allocate() pick built-in or heap storage
+}
+
+template<typename _Tp, size_t fixed_size> inline
+AutoBuffer<_Tp, fixed_size>::AutoBuffer(const AutoBuffer<_Tp, fixed_size>& abuf )
+{
+    sz = fixed_size;
+    ptr = buf;
+    this->allocate(abuf.size()); // afterwards sz equals abuf.size()
+    for( size_t k = 0; k != sz; ++k ) // element-wise copy of the source contents
+        ptr[k] = abuf.ptr[k];
+}
+
+template<typename _Tp, size_t fixed_size> inline AutoBuffer<_Tp, fixed_size>&
+AutoBuffer<_Tp, fixed_size>::operator = (const AutoBuffer<_Tp, fixed_size>& abuf)
+{
+    if( this == &abuf )
+        return *this; // self-assignment: nothing to copy
+    // drop the current heap block (if any), then size this buffer like the source
+    deallocate();
+    allocate(abuf.size());
+    for( size_t k = 0; k != sz; ++k )
+        ptr[k] = abuf.ptr[k];
+    return *this;
+}
+
+template<typename _Tp, size_t fixed_size> inline
+AutoBuffer<_Tp, fixed_size>::~AutoBuffer()
+{ this->deallocate(); } // releases the heap block, if one was allocated
+
+template<typename _Tp, size_t fixed_size> inline void
+AutoBuffer<_Tp, fixed_size>::allocate(size_t _size)
+{
+    if(_size > sz)
+    {
+        // growing: release any heap block first (this puts ptr/sz back on the built-in buffer)
+        deallocate();
+        sz = _size;
+        if(_size > fixed_size)
+            ptr = new _Tp[_size];
+        return;
+    }
+    // not growing: keep the current storage and only record the new length
+    sz = _size;
+}
+
+template<typename _Tp, size_t fixed_size> inline void
+AutoBuffer<_Tp, fixed_size>::deallocate()
+{
+    if( ptr == buf )
+        return; // already on the built-in buffer: nothing to free, sz is left as it is
+    delete[] ptr;
+    // fall back to the built-in buffer and its full size
+    ptr = buf;
+    sz = fixed_size;
+}
+
+template<typename _Tp, size_t fixed_size> inline void
+AutoBuffer<_Tp, fixed_size>::resize(size_t _size) // changes the length while keeping the existing elements
+{
+    if(_size <= sz) // not growing: storage is kept, only the recorded length changes
+    {
+        sz = _size;
+        return;
+    }
+    size_t i, prevsize = sz, minsize = MIN(prevsize, _size);
+    _Tp* prevptr = ptr;
+    // growing: use a heap block when the built-in buffer is too small, otherwise the built-in buffer
+    ptr = _size > fixed_size ? new _Tp[_size] : buf;
+    sz = _size;
+
+    if( ptr != prevptr ) // storage moved: carry the old elements over
+        for( i = 0; i < minsize; i++ )
+            ptr[i] = prevptr[i];
+    for( i = prevsize; i < _size; i++ ) // value-initialize the newly added tail
+        ptr[i] = _Tp();
+
+    if( prevptr != buf ) // the previous storage was a heap block: release it
+        delete[] prevptr;
+}
+
+template<typename _Tp, size_t fixed_size> inline size_t
+AutoBuffer<_Tp, fixed_size>::size() const
+{ return this->sz; } // current length as last set by the constructors / allocate() / resize() / deallocate()
+
+//! @endcond
+
+
+// Basic Node class for tree building
+template<class OBJECT>
+class CV_EXPORTS Node
+{
+public:
+    //! Creates a detached node (no parent, no children) with a default-initialized payload.
+    Node() : m_pParent(0)
+    {
+    }
+    //! Creates a detached node holding a copy of @p payload.
+    Node(OBJECT& payload) : m_payload(payload), m_pParent(0)
+    {
+    }
+    //! Deletes all children, then removes this node from its parent's child list.
+    ~Node()
+    {
+        removeChilds();
+        if (!m_pParent)
+            return;
+        const int selfIdx = m_pParent->findChild(this);
+        if (selfIdx >= 0)
+            m_pParent->m_childs.erase(m_pParent->m_childs.begin() + selfIdx);
+    }
+
+    //! Returns the first direct child whose payload compares equal to @p payload,
+    //! or a null pointer when no child matches.
+    Node<OBJECT>* findChild(OBJECT& payload) const
+    {
+        for (Node<OBJECT>* child : m_childs)
+            if (child->m_payload == payload)
+                return child;
+        return nullptr;
+    }
+
+    //! Returns the position of @p pNode among the direct children, or -1 when it is not a child.
+    int findChild(Node<OBJECT> *pNode) const
+    {
+        const size_t childCount = m_childs.size();
+        for (size_t pos = 0; pos != childCount; ++pos)
+            if (m_childs[pos] == pNode)
+                return (int)pos;
+        return -1;
+    }
+
+    //! Appends @p pNode as the last child; a null pointer is ignored.
+    void addChild(Node<OBJECT> *pNode)
+    {
+        if(pNode == 0)
+            return;
+        CV_Assert(pNode->m_pParent == 0);
+        pNode->m_pParent = this; // from now on removeChilds() of this node deletes pNode
+        m_childs.push_back(pNode);
+    }
+
+    void removeChilds() // deletes every direct child, then empties the child list
+    {
+        for (Node<OBJECT>* child : m_childs)
+        {
+            child->m_pParent = 0; // avoid excessive parent vector trimming
+            delete child;
+        }
+        m_childs.clear();
+    }
+
+    int getDepth() // number of ancestors above this node (0 when it has no parent)
+    {
+        int depth = 0;
+        for (Node *ancestor = m_pParent; ancestor; ancestor = ancestor->m_pParent)
+            ++depth;
+        return depth;
+    }
+
+public:
+    OBJECT                     m_payload; //!< user data stored in the node
+    Node<OBJECT>*              m_pParent; //!< parent node, 0 when detached
+    std::vector<Node<OBJECT>*> m_childs;  //!< direct children; deleted by removeChilds()
+};
+
+
+namespace samples {
+
+//! @addtogroup core_utils_samples
+// This section describes utility functions for OpenCV samples.
+//
+// @note Implementation of these utilities is not thread-safe.
+//
+//! @{
+
+/** @brief Try to find requested data file
+
+Search directories:
+
+1. Directories passed via `addSamplesDataSearchPath()`
+2. OPENCV_SAMPLES_DATA_PATH_HINT environment variable
+3. OPENCV_SAMPLES_DATA_PATH environment variable
+   If parameter value is not empty and nothing is found then stop searching.
+4. Detects build/install path based on:
+   a. current working directory (CWD)
+   b. and/or binary module location (opencv_core/opencv_world, doesn't work with static linkage)
+5. Scan `<source>/{,data,samples/data}` directories if build directory is detected or the current directory is in source tree.
+6. Scan `<install>/share/OpenCV` directory if install directory is detected.
+
+@see cv::utils::findDataFile
+
+@param relative_path Relative path to data file
+@param required Specify "file not found" handling.
+       If true, function prints information message and raises cv::Exception.
+       If false, function returns empty result
+@param silentMode Disables messages
+@return Returns path (absolute or relative to the current directory) or empty string if file is not found
+*/
+CV_EXPORTS_W cv::String findFile(const cv::String& relative_path, bool required = true, bool silentMode = false);
+
+CV_EXPORTS_W cv::String findFileOrKeep(const cv::String& relative_path, bool silentMode = false);
+
+inline cv::String findFileOrKeep(const cv::String& relative_path, bool silentMode)
+{
+    // Look the file up with required=false.
+    const cv::String located = findFile(relative_path, false, silentMode);
+    // Hand back the caller's path unchanged when the lookup result is empty.
+    return located.empty() ? relative_path : located;
+}
+
+/** @brief Override search data path by adding new search location
+
+Use this only to override default behavior.
+Passed paths are used in LIFO order.
+
+@param path Path to used samples data
+*/
+CV_EXPORTS_W void addSamplesDataSearchPath(const cv::String& path);
+
+/** @brief Append samples search data sub directory
+
+Typical usage is to add an OpenCV module name (`<opencv_contrib>/modules/<name>/samples/data` -> `<name>/samples/data` + `modules/<name>/samples/data`).
+Passed subdirectories are used in LIFO order.
+
+@param subdir samples data sub directory
+*/
+CV_EXPORTS_W void addSamplesDataSearchSubDirectory(const cv::String& subdir);
+
+//! @}
+} // namespace samples
+
+namespace utils {
+
+CV_EXPORTS int getThreadID();
+
+} // namespace
+
+} //namespace cv
+
+#ifdef CV_COLLECT_IMPL_DATA
+#include "opencv2/core/utils/instrumentation.hpp"
+#else
+/// Collect implementation data on OpenCV function call. Requires ENABLE_IMPL_COLLECTION build option.
+#define CV_IMPL_ADD(impl)
+#endif
+
+#endif //OPENCV_CORE_UTILITY_H
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/allocator_stats.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/allocator_stats.hpp
new file mode 100644
index 000000000000..79e933820aea
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/allocator_stats.hpp
@@ -0,0 +1,29 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_ALLOCATOR_STATS_HPP
+#define OPENCV_CORE_ALLOCATOR_STATS_HPP
+
+#include "../cvdef.h"
+
+namespace cv { namespace utils {
+
+class AllocatorStatisticsInterface
+{
+protected:
+    AllocatorStatisticsInterface() {} // protected: only derived classes can create instances
+    virtual ~AllocatorStatisticsInterface() {} // protected: instances cannot be deleted through this interface from outside
+public:
+    virtual uint64_t getCurrentUsage() const = 0; // NOTE(review): units are not stated in this header (presumably bytes) - confirm
+    virtual uint64_t getTotalUsage() const = 0;
+    virtual uint64_t getNumberOfAllocations() const = 0;
+    virtual uint64_t getPeakUsage() const = 0;
+
+    /** set peak usage = current usage */
+    virtual void resetPeakUsage() = 0;
+};
+
+}} // namespace
+
+#endif // OPENCV_CORE_ALLOCATOR_STATS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/allocator_stats.impl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/allocator_stats.impl.hpp
new file mode 100644
index 000000000000..bbc6cf89799a
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/allocator_stats.impl.hpp
@@ -0,0 +1,106 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_ALLOCATOR_STATS_IMPL_HPP
+#define OPENCV_CORE_ALLOCATOR_STATS_IMPL_HPP
+
+#include "./allocator_stats.hpp"
+
+//#define OPENCV_DISABLE_ALLOCATOR_STATS
+
+#include <atomic>
+
+#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE
+#if defined(__GNUC__) && (\
+        (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ == 4) || \
+        (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) && !defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8)) \
+    )
+#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE int
+#endif
+#endif
+
+#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE
+#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE long long
+#endif
+
+namespace cv { namespace utils {
+
+#ifdef CV__ALLOCATOR_STATS_LOG
+namespace {
+#endif
+
+class AllocatorStatistics : public AllocatorStatisticsInterface  // counter-backed implementation of the statistics interface
+{
+#ifdef OPENCV_DISABLE_ALLOCATOR_STATS
+    // Statistics compiled out: every query reports 0 and both hooks do nothing.
+public:
+    AllocatorStatistics() {}
+    ~AllocatorStatistics() CV_OVERRIDE {}
+
+    uint64_t getCurrentUsage() const CV_OVERRIDE { return 0; }
+    uint64_t getTotalUsage() const CV_OVERRIDE { return 0; }
+    uint64_t getNumberOfAllocations() const CV_OVERRIDE { return 0; }
+    uint64_t getPeakUsage() const CV_OVERRIDE { return 0; }
+
+    /** set peak usage = current usage */
+    void resetPeakUsage() CV_OVERRIDE {}
+
+    void onAllocate(size_t /*sz*/) {}
+    void onFree(size_t /*sz*/) {}
+
+#else
+    // Statistics enabled: the four counters are atomics of OPENCV_ALLOCATOR_STATS_COUNTER_TYPE.
+protected:
+    typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t;
+    std::atomic<counter_t> curr, total, total_allocs, peak;
+public:
+    AllocatorStatistics() : curr(0), total(0), total_allocs(0), peak(0) {}  // explicit zero: before C++20 a default-constructed std::atomic is uninitialized
+    ~AllocatorStatistics() CV_OVERRIDE {}
+    // Queries: each one is an independent atomic load, so the four values are not a consistent snapshot of each other.
+    uint64_t getCurrentUsage() const CV_OVERRIDE { return (uint64_t)curr.load(); }
+    uint64_t getTotalUsage() const CV_OVERRIDE { return (uint64_t)total.load(); }
+    uint64_t getNumberOfAllocations() const CV_OVERRIDE { return (uint64_t)total_allocs.load(); }
+    uint64_t getPeakUsage() const CV_OVERRIDE { return (uint64_t)peak.load(); }
+
+    /** set peak usage = current usage */
+    void resetPeakUsage() CV_OVERRIDE { peak.store(curr.load()); }
+
+    // Controller interface: hooks that update the counters
+    void onAllocate(size_t sz)  // records one allocation of size sz
+    {
+#ifdef CV__ALLOCATOR_STATS_LOG
+        CV__ALLOCATOR_STATS_LOG(cv::format("allocate: %lld (curr=%lld)", (long long int)sz, (long long int)curr.load()));
+#endif
+        // fetch_add returns the previous value, so sz is added again to get the usage after this allocation
+        counter_t new_curr = curr.fetch_add((counter_t)sz) + (counter_t)sz;
+
+        // peak = std::max((uint64_t)peak, new_curr);
+        auto prev_peak = peak.load();
+        while (prev_peak < new_curr)  // a failed exchange reloads prev_peak, so the loop also ends once a larger peak is observed
+        {
+            if (peak.compare_exchange_weak(prev_peak, new_curr))
+                break;
+        }
+        // end of peak = max(...)
+
+        total += (counter_t)sz;
+        total_allocs++;
+    }
+    void onFree(size_t sz)  // records one release of size sz
+    {
+#ifdef CV__ALLOCATOR_STATS_LOG
+        CV__ALLOCATOR_STATS_LOG(cv::format("free: %lld (curr=%lld)", (long long int)sz, (long long int)curr.load()));
+#endif
+        curr -= (counter_t)sz;  // only curr changes; peak, total and total_allocs are not modified
+    }
+#endif // OPENCV_DISABLE_ALLOCATOR_STATS
+};
+
+#ifdef CV__ALLOCATOR_STATS_LOG
+} // namespace
+#endif
+
+}} // namespace
+
+#endif // OPENCV_CORE_ALLOCATOR_STATS_IMPL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/filesystem.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/filesystem.hpp
new file mode 100644
index 000000000000..8619ae4d1a75
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/filesystem.hpp
@@ -0,0 +1,82 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_UTILS_FILESYSTEM_HPP
+#define OPENCV_UTILS_FILESYSTEM_HPP
+
+namespace cv { namespace utils { namespace fs {
+
+
+CV_EXPORTS bool exists(const cv::String& path);
+CV_EXPORTS bool isDirectory(const cv::String& path);
+
+CV_EXPORTS void remove_all(const cv::String& path);
+
+
+CV_EXPORTS cv::String getcwd();
+
+/** @brief Converts the given path to a canonical absolute path
+ * Symlinks are processed if there is support for them on running platform.
+ *
+ * @param path input path. Target file/directory should exist.
+ */
+CV_EXPORTS cv::String canonical(const cv::String& path);
+
+/** Join path components */
+CV_EXPORTS cv::String join(const cv::String& base, const cv::String& path);
+
+/** Get parent directory */
+CV_EXPORTS cv::String getParent(const cv::String &path);
+CV_EXPORTS std::wstring getParent(const std::wstring& path);
+
+/**
+ * Generate a list of all files that match the globbing pattern.
+ *
+ * Result entries are prefixed by base directory path.
+ *
+ * @param directory base directory
+ * @param pattern filter pattern (based on '*'/'?' symbols). Use empty string to disable filtering and return all results
+ * @param[out] result result of globbing.
+ * @param recursive scan nested directories too
+ * @param includeDirectories include directories into results list
+ */
+CV_EXPORTS void glob(const cv::String& directory, const cv::String& pattern,
+        CV_OUT std::vector<cv::String>& result,
+        bool recursive = false, bool includeDirectories = false);
+
+/**
+ * Generate a list of all files that match the globbing pattern.
+ *
+ * @param directory base directory
+ * @param pattern filter pattern (based on '*'/'?' symbols). Use empty string to disable filtering and return all results
+ * @param[out] result globbing result with relative paths from base directory
+ * @param recursive scan nested directories too
+ * @param includeDirectories include directories into results list
+ */
+CV_EXPORTS void glob_relative(const cv::String& directory, const cv::String& pattern,
+        CV_OUT std::vector<cv::String>& result,
+        bool recursive = false, bool includeDirectories = false);
+
+
+CV_EXPORTS bool createDirectory(const cv::String& path);
+CV_EXPORTS bool createDirectories(const cv::String& path);
+
+#if defined(__OPENCV_BUILD) || defined(BUILD_PLUGIN)
+// TODO
+//CV_EXPORTS cv::String getTempDirectory();
+
+/**
+ * @brief Returns directory to store OpenCV cache files
+ * Create sub-directory in common OpenCV cache directory if it doesn't exist.
+ * @param sub_directory_name name of sub-directory. NULL or "" value asks to return root cache directory.
+ * @param configuration_name optional name of configuration parameter name which overrides default behavior.
+ * @return Path to cache directory. Returns empty string if cache directories support is not available. Returns "disabled" if cache disabled by user.
+ */
+CV_EXPORTS cv::String getCacheDirectory(const char* sub_directory_name, const char* configuration_name = NULL);
+
+#endif
+
+}}} // namespace
+
+#endif // OPENCV_UTILS_FILESYSTEM_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/fp_control_utils.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/fp_control_utils.hpp
new file mode 100644
index 000000000000..930bc5d36770
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/fp_control_utils.hpp
@@ -0,0 +1,69 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_FP_CONTROL_UTILS_HPP
+#define OPENCV_CORE_FP_CONTROL_UTILS_HPP
+
+namespace cv {
+
+namespace details {
+
+struct FPDenormalsModeState
+{
+    uint32_t reserved[16];  // 64-bytes
+};  // FPDenormalsModeState
+
+CV_EXPORTS void setFPDenormalsIgnoreHint(bool ignore, CV_OUT FPDenormalsModeState& state);
+CV_EXPORTS int saveFPDenormalsState(CV_OUT FPDenormalsModeState& state);
+CV_EXPORTS bool restoreFPDenormalsState(const FPDenormalsModeState& state);
+
+class FPDenormalsIgnoreHintScope  // scope guard: the destructor calls restoreFPDenormalsState(saved_state)
+{
+public:
+    inline explicit FPDenormalsIgnoreHintScope(bool ignore = true)
+    {
+        details::setFPDenormalsIgnoreHint(ignore, saved_state);  // saved_state is the CV_OUT argument; presumably receives the previous mode - confirm in the implementation
+    }
+    // Variant taking an explicit state: saves the current state first, then applies the given one.
+    inline explicit FPDenormalsIgnoreHintScope(const FPDenormalsModeState& state)
+    {
+        details::saveFPDenormalsState(saved_state);
+        details::restoreFPDenormalsState(state);
+    }
+    // NOTE(review): copy operations are not deleted, so a copied scope would restore saved_state a second time.
+    inline ~FPDenormalsIgnoreHintScope()
+    {
+        details::restoreFPDenormalsState(saved_state);  // the bool result is ignored
+    }
+
+protected:
+    FPDenormalsModeState saved_state;  // state handed back to restoreFPDenormalsState() on destruction
+};  // FPDenormalsIgnoreHintScope
+
+class FPDenormalsIgnoreHintScopeNOOP  // do-nothing stand-in with the same constructor signatures as FPDenormalsIgnoreHintScope
+{
+public:
+    inline FPDenormalsIgnoreHintScopeNOOP(bool ignore = true) { CV_UNUSED(ignore); }  // argument is ignored; unlike the real scope this constructor is not explicit
+    inline FPDenormalsIgnoreHintScopeNOOP(const FPDenormalsModeState& state) { CV_UNUSED(state); }  // argument is ignored; not explicit either
+    inline ~FPDenormalsIgnoreHintScopeNOOP() { }  // nothing to restore
+};  // FPDenormalsIgnoreHintScopeNOOP
+
+}  // namespace details
+
+
+// Should depend on target compilation architecture only
+// Note: previously added archs should NOT be removed to preserve ABI compatibility
+#if defined(OPENCV_SUPPORTS_FP_DENORMALS_HINT)
+  // preserve configuration overloading through ports
+#elif defined(__i386__) || defined(__x86_64__) || defined(_M_X64) || defined(_X86_)
+typedef details::FPDenormalsIgnoreHintScope FPDenormalsIgnoreHintScope;
+#define OPENCV_SUPPORTS_FP_DENORMALS_HINT 1
+#else
+#define OPENCV_SUPPORTS_FP_DENORMALS_HINT 0
+typedef details::FPDenormalsIgnoreHintScopeNOOP FPDenormalsIgnoreHintScope;
+#endif
+
+}  // namespace cv
+
+#endif // OPENCV_CORE_FP_CONTROL_UTILS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/instrumentation.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/instrumentation.hpp
new file mode 100644
index 000000000000..363986708050
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/instrumentation.hpp
@@ -0,0 +1,125 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_UTILS_INSTR_HPP
+#define OPENCV_UTILS_INSTR_HPP
+
+#include <opencv2/core/utility.hpp>
+#include <opencv2/core/utils/tls.hpp>
+
+namespace cv {
+
+//! @addtogroup core_utils
+//! @{
+
+#ifdef CV_COLLECT_IMPL_DATA
+CV_EXPORTS void setImpl(int flags); // set implementation flags and reset storage arrays
+CV_EXPORTS void addImpl(int flag, const char* func = 0); // add implementation and function name to storage arrays
+// Get stored implementation flags and functions names arrays
+// Each implementation entry corresponds to a function name entry, so you can find which implementation was executed in which function
+CV_EXPORTS int getImpl(std::vector<int> &impl, std::vector<String> &funName);
+
+CV_EXPORTS bool useCollection(); // return implementation collection state
+CV_EXPORTS void setUseCollection(bool flag); // set implementation collection state
+
+#define CV_IMPL_PLAIN  0x01 // native CPU OpenCV implementation
+#define CV_IMPL_OCL    0x02 // OpenCL implementation
+#define CV_IMPL_IPP    0x04 // IPP implementation
+#define CV_IMPL_MT     0x10 // multithreaded implementation
+
+#undef CV_IMPL_ADD
+#define CV_IMPL_ADD(impl)                                                   \
+    if(cv::useCollection())                                                 \
+    {                                                                       \
+        cv::addImpl(impl, CV_Func);                                         \
+    }
+#endif
+
+// Instrumentation external interface
+namespace instr
+{
+
+#if !defined OPENCV_ABI_CHECK
+
+enum TYPE
+{
+    TYPE_GENERAL = 0,   // OpenCV API function, e.g. exported function
+    TYPE_MARKER,        // Information marker
+    TYPE_WRAPPER,       // Wrapper function for implementation
+    TYPE_FUN,           // Simple function call
+};
+
+enum IMPL
+{
+    IMPL_PLAIN = 0,
+    IMPL_IPP,
+    IMPL_OPENCL,
+};
+
+struct NodeDataTls
+{
+    NodeDataTls()
+    {
+        m_ticksTotal = 0;
+    }
+    uint64      m_ticksTotal;
+};
+
+class CV_EXPORTS NodeData  // payload of one instrumentation tree node (see the InstrNode typedef below this class)
+{
+public:
+    NodeData(const char* funName = 0, const char* fileName = NULL, int lineNum = 0, void* retAddress = NULL, bool alwaysExpand = false, cv::instr::TYPE instrType = TYPE_GENERAL, cv::instr::IMPL implType = IMPL_PLAIN);
+    NodeData(NodeData &ref);  // NOTE: copy constructor takes a non-const reference, so const objects and temporaries cannot be copied
+    ~NodeData();
+    NodeData& operator=(const NodeData&);
+    // Identification of the instrumented call site
+    cv::String          m_funName;
+    cv::instr::TYPE     m_instrType;
+    cv::instr::IMPL     m_implType;
+    const char*         m_fileName;      // raw pointer, not owned by this class as far as this header shows
+    int                 m_lineNum;
+    void*               m_retAddress;
+    bool                m_alwaysExpand;
+    bool                m_funError;
+    // Accumulated measurements; volatile is not a synchronization primitive (see "No synchronization" below)
+    volatile int         m_counter;
+    volatile uint64      m_ticksTotal;
+    TLSDataAccumulator<NodeDataTls> m_tls;
+    int                  m_threads;
+
+    // No synchronization
+    double getTotalMs()   const { return ((double)m_ticksTotal / cv::getTickFrequency()) * 1000; }  // ticks -> milliseconds
+    double getMeanMs()    const { return (((double)m_ticksTotal/m_counter) / cv::getTickFrequency()) * 1000; }  // NOTE(review): no check for m_counter == 0; the floating-point division then gives inf or NaN
+};
+bool operator==(const NodeData& lhs, const NodeData& rhs);
+
+typedef Node<NodeData> InstrNode;
+
+CV_EXPORTS InstrNode* getTrace();
+
+#endif // !defined OPENCV_ABI_CHECK
+
+
+CV_EXPORTS bool       useInstrumentation();
+CV_EXPORTS void       setUseInstrumentation(bool flag);
+CV_EXPORTS void       resetTrace();
+
+enum FLAGS
+{
+    FLAGS_NONE              = 0,
+    FLAGS_MAPPING           = 0x01,
+    FLAGS_EXPAND_SAME_NAMES = 0x02,
+};
+
+CV_EXPORTS void       setFlags(FLAGS modeFlags);
+static inline void    setFlags(int modeFlags) { setFlags((FLAGS)modeFlags); }
+CV_EXPORTS FLAGS      getFlags();
+
+} // namespace instr
+
+//! @}
+
+} // namespace
+
+#endif // OPENCV_UTILS_INSTR_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/logger.defines.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/logger.defines.hpp
new file mode 100644
index 000000000000..7d73f02b6695
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/logger.defines.hpp
@@ -0,0 +1,42 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_LOGGER_DEFINES_HPP
+#define OPENCV_LOGGER_DEFINES_HPP
+
+//! @addtogroup core_logging
+//! @{
+
+// Supported logging levels and their semantic
+#define CV_LOG_LEVEL_SILENT 0          //!< for using in setLogLevel() call
+#define CV_LOG_LEVEL_FATAL 1           //!< Fatal (critical) error (unrecoverable internal error)
+#define CV_LOG_LEVEL_ERROR 2           //!< Error message
+#define CV_LOG_LEVEL_WARN 3            //!< Warning message
+#define CV_LOG_LEVEL_INFO 4            //!< Info message
+#define CV_LOG_LEVEL_DEBUG 5           //!< Debug message. Disabled in the "Release" build.
+#define CV_LOG_LEVEL_VERBOSE 6         //!< Verbose (trace) messages. Requires verbosity level. Disabled in the "Release" build.
+
+namespace cv {
+namespace utils {
+namespace logging {
+
+//! Supported logging levels and their semantic
+enum LogLevel {
+    LOG_LEVEL_SILENT = 0,              //!< for using in setLogLevel() call
+    LOG_LEVEL_FATAL = 1,               //!< Fatal (critical) error (unrecoverable internal error)
+    LOG_LEVEL_ERROR = 2,               //!< Error message
+    LOG_LEVEL_WARNING = 3,             //!< Warning message
+    LOG_LEVEL_INFO = 4,                //!< Info message
+    LOG_LEVEL_DEBUG = 5,               //!< Debug message. Disabled in the "Release" build.
+    LOG_LEVEL_VERBOSE = 6,             //!< Verbose (trace) messages. Requires verbosity level. Disabled in the "Release" build.
+#ifndef CV_DOXYGEN
+    ENUM_LOG_LEVEL_FORCE_INT = INT_MAX // sentinel hidden from Doxygen builds; INT_MAX must be declared by an earlier include (logger.hpp pulls in <limits.h>)
+#endif
+};
+
+}}} // namespace
+
+//! @}
+
+#endif // OPENCV_LOGGER_DEFINES_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/logger.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/logger.hpp
new file mode 100644
index 000000000000..accb860ada8e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/logger.hpp
@@ -0,0 +1,218 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_LOGGER_HPP
+#define OPENCV_LOGGER_HPP
+
+#include <iostream>
+#include <sstream>
+#include <limits.h> // INT_MAX
+
+#include "logger.defines.hpp"
+#include "logtag.hpp"
+
+namespace cv {
+namespace utils {
+namespace logging {
+
+//! @addtogroup core_logging
+//! @{
+
+/** Set global logging level
+@return previous logging level
+*/
+CV_EXPORTS LogLevel setLogLevel(LogLevel logLevel);
+/** Get global logging level */
+CV_EXPORTS LogLevel getLogLevel();
+
+CV_EXPORTS void registerLogTag(cv::utils::logging::LogTag* plogtag);
+
+CV_EXPORTS void setLogTagLevel(const char* tag, cv::utils::logging::LogLevel level);
+
+CV_EXPORTS cv::utils::logging::LogLevel getLogTagLevel(const char* tag);
+
+namespace internal {
+
+/** Get global log tag */
+CV_EXPORTS cv::utils::logging::LogTag* getGlobalLogTag();
+
+/** Write log message */
+CV_EXPORTS void writeLogMessage(LogLevel logLevel, const char* message);
+
+/** Write log message */
+CV_EXPORTS void writeLogMessageEx(LogLevel logLevel, const char* tag, const char* file, int line, const char* func, const char* message);
+
+} // namespace
+
+struct LogTagAuto  // LogTag that registers itself on construction
+    : public LogTag
+{
+    inline LogTagAuto(const char* _name, LogLevel _level)
+        : LogTag(_name, _level)
+    {
+        registerLogTag(this);  // hands out a pointer to this object; nothing in this struct unregisters it on destruction
+    }
+};
+
+/**
+ * \def CV_LOG_STRIP_LEVEL
+ *
+ * Define CV_LOG_STRIP_LEVEL=CV_LOG_LEVEL_[DEBUG|INFO|WARN|ERROR|FATAL|SILENT] to compile out messages at that logging level and at any more verbose level
+ */
+#ifndef CV_LOG_STRIP_LEVEL
+# if defined NDEBUG
+#   define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG
+# else
+#   define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_VERBOSE
+# endif
+#endif
+
+#define CV_LOGTAG_PTR_CAST(expr) static_cast<const cv::utils::logging::LogTag*>(expr)
+
+// CV_LOGTAG_EXPAND_NAME is intended to be re-defined (undef and then define again)
+// to allow logging users to use a shorter name argument when calling
+// CV_LOG_WITH_TAG or its related macros such as CV_LOG_INFO.
+//
+// This macro is intended to modify the tag argument as a string (token), via
+// preprocessor token pasting or metaprogramming techniques. A typical usage
+// is to apply a prefix, such as
+// ...... #define CV_LOGTAG_EXPAND_NAME(tag) cv_logtag_##tag
+//
+// It is permitted to re-define to a hard-coded expression, ignoring the tag.
+// This would work identically like the CV_LOGTAG_FALLBACK macro.
+//
+// Important: When the logging macro is called with tag being NULL, a user-defined
+// CV_LOGTAG_EXPAND_NAME may expand it into cv_logtag_0, cv_logtag_NULL, or
+// cv_logtag_nullptr. Use with care. Also be mindful of C++ symbol redefinitions.
+//
+// If there is significant amount of logging code with tag being NULL, it is
+// recommended to use (re-define) CV_LOGTAG_FALLBACK to inject locally a default
+// tag at the beginning of a compilation unit, to minimize lines of code changes.
+//
+#define CV_LOGTAG_EXPAND_NAME(tag) tag
+
+// CV_LOGTAG_FALLBACK is intended to be re-defined (undef and then define again)
+// by any other compilation units to provide a log tag when the logging statement
+// does not specify one. The macro needs to expand into a C++ expression that can
+// be static_cast into (cv::utils::logging::LogTag*). Null (nullptr) is permitted.
+#define CV_LOGTAG_FALLBACK nullptr
+
+// CV_LOGTAG_GLOBAL is the tag used when a log tag is not specified in the logging
+// statement nor the compilation unit. The macro needs to expand into a C++
+// expression that can be static_cast into (cv::utils::logging::LogTag*). Must be
+// non-null. Do not re-define.
+#define CV_LOGTAG_GLOBAL cv::utils::logging::internal::getGlobalLogTag()
+
+#define CV_LOG_WITH_TAG(tag, msgLevel, extra_check0, extra_check1, ...) \
+    for(;;) { \
+        extra_check0; \
+        const auto cv_temp_msglevel = (cv::utils::logging::LogLevel)(msgLevel); \
+        if (cv_temp_msglevel >= (CV_LOG_STRIP_LEVEL)) break; \
+        auto cv_temp_logtagptr = CV_LOGTAG_PTR_CAST(CV_LOGTAG_EXPAND_NAME(tag)); \
+        if (!cv_temp_logtagptr) cv_temp_logtagptr = CV_LOGTAG_PTR_CAST(CV_LOGTAG_FALLBACK); \
+        if (!cv_temp_logtagptr) cv_temp_logtagptr = CV_LOGTAG_PTR_CAST(CV_LOGTAG_GLOBAL); \
+        if (cv_temp_logtagptr && (cv_temp_msglevel > cv_temp_logtagptr->level)) break; \
+        extra_check1; \
+        std::stringstream cv_temp_logstream; \
+        cv_temp_logstream << __VA_ARGS__; \
+        cv::utils::logging::internal::writeLogMessageEx( \
+            cv_temp_msglevel, \
+            (cv_temp_logtagptr ? cv_temp_logtagptr->name : nullptr), \
+            __FILE__, \
+            __LINE__, \
+            CV_Func, \
+            cv_temp_logstream.str().c_str()); \
+        break; \
+    }
+
+#define CV_LOG_FATAL(tag, ...)   CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_FATAL, , , __VA_ARGS__)
+#define CV_LOG_ERROR(tag, ...)   CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_ERROR, , , __VA_ARGS__)
+#define CV_LOG_WARNING(tag, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_WARNING, , , __VA_ARGS__)
+#define CV_LOG_INFO(tag, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_INFO, , , __VA_ARGS__)
+#define CV_LOG_DEBUG(tag, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_DEBUG, , , __VA_ARGS__)
+#define CV_LOG_VERBOSE(tag, v, ...) CV_LOG_WITH_TAG(tag, (cv::utils::logging::LOG_LEVEL_VERBOSE + (int)(v)), , , __VA_ARGS__)
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_INFO
+#undef CV_LOG_INFO
+#define CV_LOG_INFO(tag, ...)
+#endif
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_DEBUG
+#undef CV_LOG_DEBUG
+#define CV_LOG_DEBUG(tag, ...)
+#endif
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_VERBOSE
+#undef CV_LOG_VERBOSE
+#define CV_LOG_VERBOSE(tag, v, ...)
+#endif
+
+//! @cond IGNORED
+#define CV__LOG_ONCE_CHECK_PRE \
+    static bool _cv_log_once_ ## __LINE__ = false; \
+    if (_cv_log_once_ ## __LINE__) break;
+
+#define CV__LOG_ONCE_CHECK_POST \
+    _cv_log_once_ ## __LINE__ = true;
+
+#define CV__LOG_IF_CHECK(logging_cond) \
+    if (!(logging_cond)) break;
+
+//! @endcond
+
+
+// CV_LOG_ONCE_XXX macros
+
+#define CV_LOG_ONCE_ERROR(tag, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_ERROR, CV__LOG_ONCE_CHECK_PRE, CV__LOG_ONCE_CHECK_POST, __VA_ARGS__)
+#define CV_LOG_ONCE_WARNING(tag, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_WARNING, CV__LOG_ONCE_CHECK_PRE, CV__LOG_ONCE_CHECK_POST, __VA_ARGS__)
+#define CV_LOG_ONCE_INFO(tag, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_INFO, CV__LOG_ONCE_CHECK_PRE, CV__LOG_ONCE_CHECK_POST, __VA_ARGS__)
+#define CV_LOG_ONCE_DEBUG(tag, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_DEBUG, CV__LOG_ONCE_CHECK_PRE, CV__LOG_ONCE_CHECK_POST, __VA_ARGS__)
+#define CV_LOG_ONCE_VERBOSE(tag, v, ...) CV_LOG_WITH_TAG(tag, (cv::utils::logging::LOG_LEVEL_VERBOSE + (int)(v)), CV__LOG_ONCE_CHECK_PRE, CV__LOG_ONCE_CHECK_POST, __VA_ARGS__)
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_INFO
+#undef CV_LOG_ONCE_INFO
+#define CV_LOG_ONCE_INFO(tag, ...)
+#endif
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_DEBUG
+#undef CV_LOG_ONCE_DEBUG
+#define CV_LOG_ONCE_DEBUG(tag, ...)
+#endif
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_VERBOSE
+#undef CV_LOG_ONCE_VERBOSE
+#define CV_LOG_ONCE_VERBOSE(tag, v, ...)
+#endif
+
+
+// CV_LOG_IF_XXX macros
+
+#define CV_LOG_IF_FATAL(tag, logging_cond, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_FATAL, , CV__LOG_IF_CHECK(logging_cond), __VA_ARGS__)
+#define CV_LOG_IF_ERROR(tag, logging_cond, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_ERROR, , CV__LOG_IF_CHECK(logging_cond), __VA_ARGS__)
+#define CV_LOG_IF_WARNING(tag, logging_cond, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_WARNING, , CV__LOG_IF_CHECK(logging_cond), __VA_ARGS__)
+#define CV_LOG_IF_INFO(tag, logging_cond, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_INFO, , CV__LOG_IF_CHECK(logging_cond), __VA_ARGS__)
+#define CV_LOG_IF_DEBUG(tag, logging_cond, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_DEBUG, , CV__LOG_IF_CHECK(logging_cond), __VA_ARGS__)
+#define CV_LOG_IF_VERBOSE(tag, v, logging_cond, ...) CV_LOG_WITH_TAG(tag, (cv::utils::logging::LOG_LEVEL_VERBOSE + (int)(v)), , CV__LOG_IF_CHECK(logging_cond), __VA_ARGS__)
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_INFO
+#undef CV_LOG_IF_INFO
+#define CV_LOG_IF_INFO(tag, logging_cond, ...)
+#endif
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_DEBUG
+#undef CV_LOG_IF_DEBUG
+#define CV_LOG_IF_DEBUG(tag, logging_cond, ...)
+#endif
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_VERBOSE
+#undef CV_LOG_IF_VERBOSE
+#define CV_LOG_IF_VERBOSE(tag, v, logging_cond, ...)
+#endif
+
+
+//! @}
+
+}}} // namespace
+
+#endif // OPENCV_LOGGER_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/logtag.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/logtag.hpp
new file mode 100644
index 000000000000..4089720767de
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/logtag.hpp
@@ -0,0 +1,28 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_LOGTAG_HPP
+#define OPENCV_CORE_LOGTAG_HPP
+
+#include "opencv2/core/cvstd.hpp"
+#include "logger.defines.hpp"
+
+namespace cv {
+namespace utils {
+namespace logging {
+
+struct LogTag  // a tag name paired with the log level configured for that tag
+{
+    const char* name;   // stored as given: the pointer is kept, the string is not copied
+    LogLevel level;     // compared against the message level by CV_LOG_WITH_TAG in logger.hpp
+
+    inline LogTag(const char* _name, LogLevel _level)
+        : name(_name)
+        , level(_level)
+    {}
+};
+
+}}}
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/tls.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/tls.hpp
new file mode 100644
index 000000000000..124caebc85b2
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/tls.hpp
@@ -0,0 +1,235 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_UTILS_TLS_HPP
+#define OPENCV_UTILS_TLS_HPP
+
+#ifndef OPENCV_CORE_UTILITY_H
+#error "tls.hpp must be included after opencv2/core/utility.hpp or opencv2/core.hpp"
+#endif
+
+namespace cv {
+
+//! @addtogroup core_utils
+//! @{
+
+namespace details { class TlsStorage; }
+
+/** TLS container base implementation
+ *
+ * Don't use directly.
+ *
+ * @sa TLSData, TLSDataAccumulator templates
+ */
+class CV_EXPORTS TLSDataContainer
+{
+protected:
+    TLSDataContainer();
+    virtual ~TLSDataContainer();
+
+    /// @deprecated use detachData() instead
+    void  gatherData(std::vector<void*> &data) const;
+    /// get TLS data and detach all data from threads (similar to cleanup() call)
+    void  detachData(std::vector<void*>& data);
+
+    void* getData() const;  // untyped accessor; TLSData<T>::get() casts the result to T*
+    void  release();        // called from the destructors of TLSData<T> and TLSDataAccumulator<T>
+
+protected:
+    virtual void* createDataInstance() const = 0;             // allocation hook; TLSData<T> implements it as `new T`
+    virtual void  deleteDataInstance(void* pData) const = 0;  // release hook; TLSData<T> implements it as `delete (T*)pData`
+
+private:
+    int key_;
+
+    friend class cv::details::TlsStorage;  // core/src/system.cpp
+
+public:
+    void cleanup(); //!< Release created TLS data container objects. It is similar to release() call, but it keeps TLS container valid.
+
+private:
+    // Disable copy/assign (noncopyable pattern)
+    TLSDataContainer(TLSDataContainer &) = delete;
+    TLSDataContainer& operator =(const TLSDataContainer &) = delete;
+};
+
+
+/** @brief Simple TLS data class
+ *
+ * @sa TLSDataAccumulator
+ */
+template <typename T>
+class TLSData : protected TLSDataContainer  // protected base: container methods are reachable only through the wrappers below
+{
+public:
+    inline TLSData() {}
+    inline ~TLSData() { release(); }
+    // Typed accessors over TLSDataContainer::getData()
+    inline T* get() const   { return (T*)getData(); }  //!< Get data associated with key
+    inline T& getRef() const { T* ptr = (T*)getData(); CV_DbgAssert(ptr); return *ptr; }  //!< Same as get(), but returns a reference; the pointer is checked with CV_DbgAssert only
+
+    /// Release associated thread data
+    inline void cleanup()
+    {
+        TLSDataContainer::cleanup();
+    }
+
+protected:
+    /// Wrapper to allocate data by template
+    virtual void* createDataInstance() const CV_OVERRIDE { return new T; }  // T is default-initialized
+    /// Wrapper to release data by template
+    virtual void  deleteDataInstance(void* pData) const CV_OVERRIDE { delete (T*)pData; }
+};
+
+
+/// TLS data accumulator with gathering methods
+template <typename T>
+class TLSDataAccumulator : public TLSData<T>
+{
+    mutable cv::Mutex mutex;                            // guards dataFromTerminatedThreads and detachedData
+    mutable std::vector<T*> dataFromTerminatedThreads;  // instances handed to deleteDataInstance() while cleanupMode was false
+    std::vector<T*> detachedData;                       // instances collected by detachData(); freed by _cleanupDetachedData()
+    bool cleanupMode;                                   // true => deleteDataInstance() really deletes instead of parking the pointer
+public:
+    TLSDataAccumulator() : cleanupMode(false) {}
+    ~TLSDataAccumulator()
+    {
+        release();
+    }
+
+    /** @brief Get data from all threads
+     * @deprecated replaced by detachData()
+     *
+     * Lifetime of vector data is valid until next detachData()/cleanup()/release() calls
+     *
+     * @param[out] data result buffer (should be empty)
+     */
+    void gather(std::vector<T*> &data) const
+    {
+        CV_Assert(cleanupMode == false);  // state is not valid
+        CV_Assert(data.empty());
+        {
+            std::vector<void*> &dataVoid = reinterpret_cast<std::vector<void*>&>(data);  // NOTE(review): relies on vector<T*> and vector<void*> having the same layout
+            TLSDataContainer::gatherData(dataVoid);
+        }
+        {
+            AutoLock lock(mutex);
+            data.reserve(data.size() + dataFromTerminatedThreads.size());
+            for (typename std::vector<T*>::const_iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+            {
+                data.push_back((T*)*i);  // pointers are copied; the parked list keeps them as well
+            }
+        }
+    }
+
+    /** @brief Get and detach data from all threads
+     *
+     * Call cleanupDetachedData() when returned vector is not needed anymore.
+     *
+     * @return Vector with associated data. Content is preserved (including lifetime of attached data pointers) until next detachData()/cleanupDetachedData()/cleanup()/release() calls
+     */
+    std::vector<T*>& detachData()
+    {
+        CV_Assert(cleanupMode == false);  // state is not valid
+        std::vector<void*> dataVoid;
+        {
+            TLSDataContainer::detachData(dataVoid);
+        }
+        {
+            AutoLock lock(mutex);
+            detachedData.reserve(dataVoid.size() + dataFromTerminatedThreads.size());  // entries are appended: detachedData is not cleared first, and its current size is not part of this reserve
+            for (typename std::vector<T*>::const_iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+            {
+                detachedData.push_back((T*)*i);
+            }
+            dataFromTerminatedThreads.clear();  // parked pointers now live in detachedData only
+            for (typename std::vector<void*>::const_iterator i = dataVoid.begin(); i != dataVoid.end(); ++i)
+            {
+                detachedData.push_back((T*)(void*)*i);
+            }
+        }
+        dataVoid.clear();
+        return detachedData;
+    }
+
+    /// Release associated thread data returned by detachData() call
+    void cleanupDetachedData()
+    {
+        AutoLock lock(mutex);
+        cleanupMode = true;  // switched under the lock here, unlike cleanup() and release()
+        _cleanupDetachedData();
+        cleanupMode = false;
+    }
+
+    /// Release associated thread data
+    void cleanup()
+    {
+        cleanupMode = true;  // NOTE(review): written before the mutex is taken, while deleteDataInstance() reads it without the lock
+        TLSDataContainer::cleanup();
+
+        AutoLock lock(mutex);
+        _cleanupDetachedData();
+        _cleanupTerminatedData();
+        cleanupMode = false;
+    }
+
+    /// Release associated thread data and free TLS key
+    void release()
+    {
+        cleanupMode = true;  // never reset to false in this function (also written without holding the mutex)
+        TLSDataContainer::release();
+        {
+            AutoLock lock(mutex);
+            _cleanupDetachedData();
+            _cleanupTerminatedData();
+        }
+    }
+
+protected:
+    // synchronized (every caller in this class holds `mutex` and has set cleanupMode to true)
+    void _cleanupDetachedData()
+    {
+        for (typename std::vector<T*>::iterator i = detachedData.begin(); i != detachedData.end(); ++i)
+        {
+            deleteDataInstance((T*)*i);
+        }
+        detachedData.clear();
+    }
+
+    // synchronized (every caller in this class holds `mutex` and has set cleanupMode to true)
+    void _cleanupTerminatedData()
+    {
+        for (typename std::vector<T*>::iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+        {
+            deleteDataInstance((T*)*i);
+        }
+        dataFromTerminatedThreads.clear();
+    }
+
+protected:
+    virtual void* createDataInstance() const CV_OVERRIDE
+    {
+        // Note: we can collect all allocated data here, but this would require raced mutex locks
+        return new T;
+    }
+    virtual void  deleteDataInstance(void* pData) const CV_OVERRIDE
+    {
+        if (cleanupMode)
+        {
+            delete (T*)pData;  // explicit cleanup in progress: free the instance now
+        }
+        else
+        {
+            AutoLock lock(mutex);
+            dataFromTerminatedThreads.push_back((T*)pData);  // keep the instance so gather()/detachData() can still return it
+        }
+    }
+};
+
+
+//! @}
+
+} // namespace
+
+#endif // OPENCV_UTILS_TLS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/trace.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/trace.hpp
new file mode 100644
index 000000000000..ea43bbeea105
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/utils/trace.hpp
@@ -0,0 +1,252 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_TRACE_HPP
+#define OPENCV_TRACE_HPP
+
+#include <opencv2/core/cvdef.h>
+
+namespace cv {
+namespace utils {
+namespace trace {
+
+//! @addtogroup core_logging
+//! @{
+
+//! Macro to trace function
+#define CV_TRACE_FUNCTION()
+
+#define CV_TRACE_FUNCTION_SKIP_NESTED()
+
+//! Trace code scope.
+//! @note Dynamic names are not supported in this macro (on stack or heap). Use string literals here only, like "initialize".
+#define CV_TRACE_REGION(name_as_static_string_literal)
+//! Mark the currently opened region as completed and open a new one
+//! @note Dynamic names are not supported in this macro (on stack or heap). Use string literals here only, like "step1".
+#define CV_TRACE_REGION_NEXT(name_as_static_string_literal)
+
+//! Macro to trace argument value
+#define CV_TRACE_ARG(arg_id)
+
+//! Macro to trace argument value (expanded version)
+#define CV_TRACE_ARG_VALUE(arg_id, arg_name, value)
+
+//! @cond IGNORED
+#define CV_TRACE_NS cv::utils::trace
+
+#if !defined(OPENCV_DISABLE_TRACE) && defined(__EMSCRIPTEN__)
+#define OPENCV_DISABLE_TRACE 1
+#endif
+
+namespace details {
+
+#ifndef __OPENCV_TRACE
+# if defined __OPENCV_BUILD && !defined __OPENCV_TESTS && !defined __OPENCV_APPS
+#   define __OPENCV_TRACE 1
+# else
+#   define __OPENCV_TRACE 0
+# endif
+#endif
+
+#ifndef CV_TRACE_FILENAME
+# define CV_TRACE_FILENAME __FILE__
+#endif
+
+#ifndef CV__TRACE_FUNCTION
+# if defined _MSC_VER
+#   define CV__TRACE_FUNCTION __FUNCSIG__
+# elif defined __GNUC__
+#   define CV__TRACE_FUNCTION __PRETTY_FUNCTION__
+# else
+#   define CV__TRACE_FUNCTION "<unknown>"
+# endif
+#endif
+
+//! Thread-local instance (usually allocated on stack); the destructor closes the region via destroy()
+class CV_EXPORTS Region
+{
+public:
+    struct LocationExtraData; // opaque here; only pointers to it are used in this header
+    struct LocationStaticStorage // field order must match the initializer in CV__TRACE_DEFINE_LOCATION_
+    {
+        LocationExtraData** ppExtra;   ///< implementation specific data
+        const char* name;              ///< region name (function name or other custom name)
+        const char* filename;          ///< source code filename
+        int line;                      ///< source code line
+        int flags;                     ///< flags (implementation code path: Plain, IPP, OpenCL)
+    };
+
+    Region(const LocationStaticStorage& location);
+    inline ~Region()
+    {
+        if (implFlags != 0) // ignored regions (implFlags == 0) skip destroy()
+            destroy();
+        CV_DbgAssert(implFlags == 0); // debug builds check that destroy() cleared both fields
+        CV_DbgAssert(pImpl == NULL);
+    }
+
+    class Impl;
+    Impl* pImpl; // NULL if current region is not active
+    int implFlags; // see RegionFlag (not declared in this header), 0 if region is ignored
+
+    bool isActive() const { return pImpl != NULL; }
+
+    void destroy();
+private:
+    Region(const Region&); // disabled
+    Region& operator= (const Region&); // disabled
+};
+
+//! Region flags stored in Region::LocationStaticStorage::flags
+enum RegionLocationFlag {
+    REGION_FLAG_FUNCTION = (1 << 0),             ///< region is function (=1) / nested named region (=0)
+    REGION_FLAG_APP_CODE = (1 << 1),             ///< region is Application code (=1) / OpenCV library code (=0)
+    REGION_FLAG_SKIP_NESTED = (1 << 2),          ///< avoid processing of nested regions
+
+    REGION_FLAG_IMPL_IPP = (1 << 16),            ///< region is part of IPP code path
+    REGION_FLAG_IMPL_OPENCL = (2 << 16),         ///< region is part of OpenCL code path
+    REGION_FLAG_IMPL_OPENVX = (3 << 16),         ///< region is part of OpenVX code path
+
+    REGION_FLAG_IMPL_MASK = (15 << 16),          ///< 4-bit field (bits 16..19) holding one of the REGION_FLAG_IMPL_* values
+
+    REGION_FLAG_REGION_FORCE = (1 << 30),
+    REGION_FLAG_REGION_NEXT = (1 << 31),         ///< close previous region (see #CV_TRACE_REGION_NEXT macro); this is the sign bit of a 32-bit int
+
+    ENUM_REGION_FLAG_FORCE_INT = INT_MAX         ///< presumably forces the enum to span the full int range - confirm
+};
+
+struct CV_EXPORTS TraceArg { // static descriptor of one traced argument; aggregate-initialized by CV__TRACE_DEFINE_ARG_
+public:
+    struct ExtraData;    // opaque here; only pointers to it are used in this header
+    ExtraData** ppExtra; ///< address of the static ExtraData* slot emitted next to this descriptor
+    const char* name;    ///< argument name (CV__TRACE_ARG passes the stringized identifier)
+    int flags;           ///< CV__TRACE_ARG_VALUE passes 0
+};
+/** @brief Add meta information to current region (function)
+ * See CV_TRACE_ARG macro
+ * @param arg argument information structure (global static cache)
+ * @param value argument value (for strings this can be a dynamically allocated string; static allocation is not required)
+ */
+CV_EXPORTS void traceArg(const TraceArg& arg, const char* value);
+//! @overload
+CV_EXPORTS void traceArg(const TraceArg& arg, int value);
+//! @overload
+CV_EXPORTS void traceArg(const TraceArg& arg, int64 value);
+//! @overload
+CV_EXPORTS void traceArg(const TraceArg& arg, double value);
+// Names of the per-call-site static variables: fixed prefix + loc_id + __LINE__
+#define CV__TRACE_LOCATION_VARNAME(loc_id) CVAUX_CONCAT(CVAUX_CONCAT(__cv_trace_location_, loc_id), __LINE__)
+#define CV__TRACE_LOCATION_EXTRA_VARNAME(loc_id) CVAUX_CONCAT(CVAUX_CONCAT(__cv_trace_location_extra_, loc_id) , __LINE__)
+// Emits a static LocationExtraData* slot (initially 0) and the static LocationStaticStorage that points at it
+#define CV__TRACE_DEFINE_LOCATION_(loc_id, name, flags) \
+    static CV_TRACE_NS::details::Region::LocationExtraData* CV__TRACE_LOCATION_EXTRA_VARNAME(loc_id) = 0; \
+    static const CV_TRACE_NS::details::Region::LocationStaticStorage \
+        CV__TRACE_LOCATION_VARNAME(loc_id) = { &(CV__TRACE_LOCATION_EXTRA_VARNAME(loc_id)), name, CV_TRACE_FILENAME, __LINE__, flags};
+// Same as above with loc_id "fn" and REGION_FLAG_FUNCTION OR-ed into flags
+#define CV__TRACE_DEFINE_LOCATION_FN(name, flags) CV__TRACE_DEFINE_LOCATION_(fn, name, ((flags) | CV_TRACE_NS::details::REGION_FLAG_FUNCTION))
+
+// Function-level regions: each macro emits the location statics plus a Region named __region_fn, so use at most one per scope
+#define CV__TRACE_OPENCV_FUNCTION() \
+    CV__TRACE_DEFINE_LOCATION_FN(CV__TRACE_FUNCTION, 0); \
+    const CV_TRACE_NS::details::Region __region_fn(CV__TRACE_LOCATION_VARNAME(fn));
+
+#define CV__TRACE_OPENCV_FUNCTION_NAME(name) \
+    CV__TRACE_DEFINE_LOCATION_FN(name, 0); \
+    const CV_TRACE_NS::details::Region __region_fn(CV__TRACE_LOCATION_VARNAME(fn));
+// Application-code variants: identical, but flagged with REGION_FLAG_APP_CODE
+#define CV__TRACE_APP_FUNCTION() \
+    CV__TRACE_DEFINE_LOCATION_FN(CV__TRACE_FUNCTION, CV_TRACE_NS::details::REGION_FLAG_APP_CODE); \
+    const CV_TRACE_NS::details::Region __region_fn(CV__TRACE_LOCATION_VARNAME(fn));
+
+#define CV__TRACE_APP_FUNCTION_NAME(name) \
+    CV__TRACE_DEFINE_LOCATION_FN(name, CV_TRACE_NS::details::REGION_FLAG_APP_CODE); \
+    const CV_TRACE_NS::details::Region __region_fn(CV__TRACE_LOCATION_VARNAME(fn));
+
+// Variants that additionally set REGION_FLAG_SKIP_NESTED
+#define CV__TRACE_OPENCV_FUNCTION_SKIP_NESTED() \
+    CV__TRACE_DEFINE_LOCATION_FN(CV__TRACE_FUNCTION, CV_TRACE_NS::details::REGION_FLAG_SKIP_NESTED); \
+    const CV_TRACE_NS::details::Region __region_fn(CV__TRACE_LOCATION_VARNAME(fn));
+
+#define CV__TRACE_OPENCV_FUNCTION_NAME_SKIP_NESTED(name) \
+    CV__TRACE_DEFINE_LOCATION_FN(name, CV_TRACE_NS::details::REGION_FLAG_SKIP_NESTED); \
+    const CV_TRACE_NS::details::Region __region_fn(CV__TRACE_LOCATION_VARNAME(fn));
+
+#define CV__TRACE_APP_FUNCTION_SKIP_NESTED() \
+    CV__TRACE_DEFINE_LOCATION_FN(CV__TRACE_FUNCTION, CV_TRACE_NS::details::REGION_FLAG_SKIP_NESTED | CV_TRACE_NS::details::REGION_FLAG_APP_CODE); \
+    const CV_TRACE_NS::details::Region __region_fn(CV__TRACE_LOCATION_VARNAME(fn));
+
+// Named nested region: the Region variable name embeds __LINE__, so one such macro per source line
+#define CV__TRACE_REGION_(name_as_static_string_literal, flags) \
+    CV__TRACE_DEFINE_LOCATION_(region, name_as_static_string_literal, flags); \
+    CV_TRACE_NS::details::Region CVAUX_CONCAT(__region_, __LINE__)(CV__TRACE_LOCATION_VARNAME(region));
+
+#define CV__TRACE_REGION(name_as_static_string_literal) CV__TRACE_REGION_(name_as_static_string_literal, 0)
+#define CV__TRACE_REGION_NEXT(name_as_static_string_literal) CV__TRACE_REGION_(name_as_static_string_literal, CV_TRACE_NS::details::REGION_FLAG_REGION_NEXT)
+// Names of the per-call-site static variables describing a traced argument: prefix + arg_id + __LINE__
+#define CV__TRACE_ARG_VARNAME(arg_id) CVAUX_CONCAT(__cv_trace_arg_ ## arg_id, __LINE__)
+#define CV__TRACE_ARG_EXTRA_VARNAME(arg_id) CVAUX_CONCAT(__cv_trace_arg_extra_ ## arg_id, __LINE__)
+// Emits a static ExtraData* slot (initially 0) and the static TraceArg descriptor that points at it
+#define CV__TRACE_DEFINE_ARG_(arg_id, name, flags) \
+    static CV_TRACE_NS::details::TraceArg::ExtraData* CV__TRACE_ARG_EXTRA_VARNAME(arg_id) = 0; \
+    static const CV_TRACE_NS::details::TraceArg \
+        CV__TRACE_ARG_VARNAME(arg_id) = { &(CV__TRACE_ARG_EXTRA_VARNAME(arg_id)), name, flags };
+// Expands to several statements (descriptor definition + traceArg call); it is not a single expression
+#define CV__TRACE_ARG_VALUE(arg_id, arg_name, value) \
+        CV__TRACE_DEFINE_ARG_(arg_id, arg_name, 0); \
+        CV_TRACE_NS::details::traceArg((CV__TRACE_ARG_VARNAME(arg_id)), value);
+// Forwards to the public CV_TRACE_ARG_VALUE (single underscore), whatever that is defined as at the point of use
+#define CV__TRACE_ARG(arg_id) CV_TRACE_ARG_VALUE(arg_id, #arg_id, (arg_id))
+
+} // namespace
+
+#ifndef OPENCV_DISABLE_TRACE // tracing enabled: replace the empty placeholder macros defined at the top of this header
+#undef CV_TRACE_FUNCTION
+#undef CV_TRACE_FUNCTION_SKIP_NESTED
+#if __OPENCV_TRACE // defaults to 1 only when building the OpenCV library itself (not its tests/apps)
+#define CV_TRACE_FUNCTION CV__TRACE_OPENCV_FUNCTION
+#define CV_TRACE_FUNCTION_SKIP_NESTED CV__TRACE_OPENCV_FUNCTION_SKIP_NESTED
+#else // otherwise regions are tagged with REGION_FLAG_APP_CODE
+#define CV_TRACE_FUNCTION CV__TRACE_APP_FUNCTION
+#define CV_TRACE_FUNCTION_SKIP_NESTED CV__TRACE_APP_FUNCTION_SKIP_NESTED
+#endif // __OPENCV_TRACE
+
+#undef CV_TRACE_REGION
+#define CV_TRACE_REGION CV__TRACE_REGION
+
+#undef CV_TRACE_REGION_NEXT
+#define CV_TRACE_REGION_NEXT CV__TRACE_REGION_NEXT
+// The argument macros reference __region_fn, so a Region created by CV_TRACE_FUNCTION*() must be in scope where they are used
+#undef CV_TRACE_ARG_VALUE
+#define CV_TRACE_ARG_VALUE(arg_id, arg_name, value) \
+        if (__region_fn.isActive()) \
+        { \
+            CV__TRACE_ARG_VALUE(arg_id, arg_name, value); \
+        }
+
+#undef CV_TRACE_ARG
+#define CV_TRACE_ARG CV__TRACE_ARG
+
+#endif // OPENCV_DISABLE_TRACE
+
+#ifdef OPENCV_TRACE_VERBOSE // verbose variants alias the regular trace macros
+#define CV_TRACE_FUNCTION_VERBOSE CV_TRACE_FUNCTION
+#define CV_TRACE_REGION_VERBOSE CV_TRACE_REGION
+#define CV_TRACE_REGION_NEXT_VERBOSE CV_TRACE_REGION_NEXT
+#define CV_TRACE_ARG_VALUE_VERBOSE CV_TRACE_ARG_VALUE
+#define CV_TRACE_ARG_VERBOSE CV_TRACE_ARG
+#else // !OPENCV_TRACE_VERBOSE: verbose variants swallow their arguments and expand to nothing
+#define CV_TRACE_FUNCTION_VERBOSE(...)
+#define CV_TRACE_REGION_VERBOSE(...)
+#define CV_TRACE_REGION_NEXT_VERBOSE(...)
+#define CV_TRACE_ARG_VALUE_VERBOSE(...)
+#define CV_TRACE_ARG_VERBOSE(...)
+#endif // OPENCV_TRACE_VERBOSE
+
+//! @endcond
+
+//! @}
+
+}}} // namespace
+
+#endif // OPENCV_TRACE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/va_intel.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/va_intel.hpp
new file mode 100644
index 000000000000..b37ce75135da
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/va_intel.hpp
@@ -0,0 +1,75 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2015, Itseez, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#ifndef OPENCV_CORE_VA_INTEL_HPP
+#define OPENCV_CORE_VA_INTEL_HPP
+
+#ifndef __cplusplus
+#  error va_intel.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core.hpp"
+#include "ocl.hpp"
+
+#if defined(HAVE_VA)
+# include "va/va.h"
+#else  // HAVE_VA
+# if !defined(_VA_H_) // presumably va.h's include guard, i.e. the user has not included va.h - confirm
+    typedef void* VADisplay;          // stand-in so the declarations below compile without the libva headers
+    typedef unsigned int VASurfaceID; // stand-in, see above
+# endif // !_VA_H_
+#endif // HAVE_VA
+
+namespace cv { namespace va_intel {
+
+/** @addtogroup core_va_intel
+This section describes Intel VA-API/OpenCL (CL-VA) interoperability.
+
+To enable basic VA interoperability, build OpenCV with libva library integration enabled: `-DWITH_VA=ON` (the corresponding dev package should be installed).
+
+To enable advanced CL-VA interoperability support on Intel HW, enable option: `-DWITH_VA_INTEL=ON` (OpenCL integration should be enabled, which is the default setting). A special runtime environment must be set up in order to use this feature: a correct combination of [libva](https://github.com/intel/libva), [OpenCL runtime](https://github.com/intel/compute-runtime) and [media driver](https://github.com/intel/media-driver) should be installed.
+
+Check the usage example for details: samples/va_intel/va_intel_interop.cpp
+*/
+//! @{
+
+/////////////////// CL-VA Interoperability Functions ///////////////////
+
+namespace ocl {
+using namespace cv::ocl; // makes cv::ocl names such as Context (used below) visible unqualified
+
+// TODO static functions in the Context class
+/** @brief Creates OpenCL context from VA.
+@param display    - VADisplay for which CL interop should be established.
+@param tryInterop - if true, try to set up interoperability; if false, set up for slow copy.
+@return Reference to the OpenCL Context.
+ */
+CV_EXPORTS Context& initializeContextFromVA(VADisplay display, bool tryInterop = true);
+
+} // namespace cv::va_intel::ocl
+
+/** @brief Converts InputArray to VASurfaceID object.
+@param display - VADisplay object.
+@param src     - source InputArray.
+@param surface - destination VASurfaceID object.
+@param size    - size of image represented by VASurfaceID object.
+ */
+CV_EXPORTS void convertToVASurface(VADisplay display, InputArray src, VASurfaceID surface, Size size);
+
+/** @brief Converts VASurfaceID object to OutputArray.
+@param display - VADisplay object.
+@param surface - source VASurfaceID object.
+@param size    - size of image represented by VASurfaceID object.
+@param dst     - destination OutputArray.
+ */
+CV_EXPORTS void convertFromVASurface(VADisplay display, VASurfaceID surface, Size size, OutputArray dst);
+
+//! @}
+
+}} // namespace cv::va_intel
+
+#endif /* OPENCV_CORE_VA_INTEL_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/version.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/version.hpp
new file mode 100644
index 000000000000..80e3dc32bcde
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/version.hpp
@@ -0,0 +1,26 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_VERSION_HPP
+#define OPENCV_VERSION_HPP
+
+#define CV_VERSION_MAJOR    4
+#define CV_VERSION_MINOR    10
+#define CV_VERSION_REVISION 0
+#define CV_VERSION_STATUS   ""  // suffix appended verbatim to CV_VERSION (empty here)
+
+#define CVAUX_STR_EXP(__A)  #__A                // stringizes the argument as written (no macro expansion)
+#define CVAUX_STR(__A)      CVAUX_STR_EXP(__A)  // expands the argument first, then stringizes it
+
+#define CVAUX_STRW_EXP(__A)  L ## #__A          // wide-string counterpart of CVAUX_STR_EXP
+#define CVAUX_STRW(__A)      CVAUX_STRW_EXP(__A)
+
+#define CV_VERSION          CVAUX_STR(CV_VERSION_MAJOR) "." CVAUX_STR(CV_VERSION_MINOR) "." CVAUX_STR(CV_VERSION_REVISION) CV_VERSION_STATUS // "4.10.0" with the values above
+
+/* old-style version constants (aliases of the macros above) */
+#define CV_MAJOR_VERSION    CV_VERSION_MAJOR
+#define CV_MINOR_VERSION    CV_VERSION_MINOR
+#define CV_SUBMINOR_VERSION CV_VERSION_REVISION
+
+#endif // OPENCV_VERSION_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/core/vsx_utils.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/core/vsx_utils.hpp
new file mode 100644
index 000000000000..79a1074d59ff
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/core/vsx_utils.hpp
@@ -0,0 +1,1047 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_HAL_VSX_UTILS_HPP
+#define OPENCV_HAL_VSX_UTILS_HPP
+
+#include "opencv2/core/cvdef.h"
+
+#ifndef SKIP_INCLUDES
+#   include <assert.h>
+#endif
+
+//! @addtogroup core_utils_vsx
+//! @{
+#if CV_VSX
+
+#define __VSX_S16__(c, v) (c){v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v} // splat: v is repeated (and evaluated) once per lane
+#define __VSX_S8__(c, v)  (c){v, v, v, v, v, v, v, v}
+#define __VSX_S4__(c, v)  (c){v, v, v, v}
+#define __VSX_S2__(c, v)  (c){v, v}
+// Per vector type: _set(...) builds from lane values, _sp(c) splats scalar c, _c(v) casts v, _z is the all-zero vector
+typedef __vector unsigned char vec_uchar16;
+#define vec_uchar16_set(...) (vec_uchar16){__VA_ARGS__}
+#define vec_uchar16_sp(c)    (__VSX_S16__(vec_uchar16, (unsigned char)(c))) // (c) parenthesized so the cast covers a whole expression argument
+#define vec_uchar16_c(v)     ((vec_uchar16)(v))
+#define vec_uchar16_z        vec_uchar16_sp(0)
+
+typedef __vector signed char vec_char16;
+#define vec_char16_set(...) (vec_char16){__VA_ARGS__}
+#define vec_char16_sp(c)    (__VSX_S16__(vec_char16, (signed char)(c)))
+#define vec_char16_c(v)     ((vec_char16)(v))
+#define vec_char16_z        vec_char16_sp(0)
+
+typedef __vector unsigned short vec_ushort8;
+#define vec_ushort8_set(...) (vec_ushort8){__VA_ARGS__}
+#define vec_ushort8_sp(c)    (__VSX_S8__(vec_ushort8, (unsigned short)(c)))
+#define vec_ushort8_c(v)     ((vec_ushort8)(v))
+#define vec_ushort8_z        vec_ushort8_sp(0)
+
+typedef __vector signed short vec_short8;
+#define vec_short8_set(...) (vec_short8){__VA_ARGS__}
+#define vec_short8_sp(c)    (__VSX_S8__(vec_short8, (signed short)(c)))
+#define vec_short8_c(v)     ((vec_short8)(v))
+#define vec_short8_z        vec_short8_sp(0)
+
+typedef __vector unsigned int vec_uint4;
+#define vec_uint4_set(...) (vec_uint4){__VA_ARGS__}
+#define vec_uint4_sp(c)    (__VSX_S4__(vec_uint4, (unsigned int)(c)))
+#define vec_uint4_c(v)     ((vec_uint4)(v))
+#define vec_uint4_z        vec_uint4_sp(0)
+
+typedef __vector signed int vec_int4;
+#define vec_int4_set(...)  (vec_int4){__VA_ARGS__}
+#define vec_int4_sp(c)     (__VSX_S4__(vec_int4, (signed int)(c)))
+#define vec_int4_c(v)      ((vec_int4)(v))
+#define vec_int4_z         vec_int4_sp(0)
+
+typedef __vector float vec_float4;
+#define vec_float4_set(...)  (vec_float4){__VA_ARGS__}
+#define vec_float4_sp(c)     (__VSX_S4__(vec_float4, c))
+#define vec_float4_c(v)      ((vec_float4)(v))
+#define vec_float4_z         vec_float4_sp(0)
+
+typedef __vector unsigned long long vec_udword2;
+#define vec_udword2_set(...) (vec_udword2){__VA_ARGS__}
+#define vec_udword2_sp(c)    (__VSX_S2__(vec_udword2, (unsigned long long)(c)))
+#define vec_udword2_c(v)     ((vec_udword2)(v))
+#define vec_udword2_z        vec_udword2_sp(0)
+
+typedef __vector signed long long vec_dword2;
+#define vec_dword2_set(...) (vec_dword2){__VA_ARGS__}
+#define vec_dword2_sp(c)    (__VSX_S2__(vec_dword2, (signed long long)(c)))
+#define vec_dword2_c(v)     ((vec_dword2)(v))
+#define vec_dword2_z        vec_dword2_sp(0)
+
+typedef  __vector double vec_double2;
+#define vec_double2_set(...) (vec_double2){__VA_ARGS__}
+#define vec_double2_c(v)     ((vec_double2)(v))
+#define vec_double2_sp(c)    (__VSX_S2__(vec_double2, c))
+#define vec_double2_z        vec_double2_sp(0)
+// Boolean vector types are provided as macros (not typedefs) and only get the _set and _c helpers
+#define vec_bchar16           __vector __bool char
+#define vec_bchar16_set(...) (vec_bchar16){__VA_ARGS__}
+#define vec_bchar16_c(v)     ((vec_bchar16)(v))
+
+#define vec_bshort8           __vector __bool short
+#define vec_bshort8_set(...) (vec_bshort8){__VA_ARGS__}
+#define vec_bshort8_c(v)     ((vec_bshort8)(v))
+
+#define vec_bint4             __vector __bool int
+#define vec_bint4_set(...)   (vec_bint4){__VA_ARGS__}
+#define vec_bint4_c(v)       ((vec_bint4)(v))
+
+#define vec_bdword2            __vector __bool long long
+#define vec_bdword2_set(...)  (vec_bdword2){__VA_ARGS__}
+#define vec_bdword2_c(v)      ((vec_bdword2)(v))
+// Declaration prefix for every helper below: extern inline, return type tp, always_inline attribute
+#define VSX_FINLINE(tp) extern inline tp __attribute__((always_inline))
+// Defines fnm(const rg&) returning rt that simply forwards to fn2
+#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2)   \
+VSX_FINLINE(rt) fnm(const rg& a) { return fn2(a); }
+// Two-argument counterpart; both arguments share the type rg
+#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2)   \
+VSX_FINLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
+
+/*
+ * GCC VSX compatibility
+**/
+#if defined(__GNUG__) && !defined(__clang__)
+
+// inline asm helpers: VSX_IMPL_1RG emits "opc %x0,%x1" with "wa" (VSX register) constraints
+#define VSX_IMPL_1RG(rt, rg, opc, fnm) \
+VSX_FINLINE(rt) fnm(const rg& a)       \
+{ rt rs; __asm__ __volatile__(#opc" %x0,%x1" : "=wa" (rs) : "wa" (a)); return rs; }
+// VSX_IMPL_1VRG: same shape, but with "v" (AltiVec register) constraints and plain %0,%1 operands
+#define VSX_IMPL_1VRG(rt, rg, opc, fnm) \
+VSX_FINLINE(rt) fnm(const rg& a)        \
+{ rt rs; __asm__ __volatile__(#opc" %0,%1" : "=v" (rs) : "v" (a)); return rs; }
+// VSX_IMPL_2VRG_F: two "v" inputs; the caller passes the complete asm template string as fopc
+#define VSX_IMPL_2VRG_F(rt, rg, fopc, fnm)     \
+VSX_FINLINE(rt) fnm(const rg& a, const rg& b)  \
+{ rt rs; __asm__ __volatile__(fopc : "=v" (rs) : "v" (a), "v" (b)); return rs; }
+// VSX_IMPL_2VRG: builds the template "opc %0,%1,%2" from a bare mnemonic
+#define VSX_IMPL_2VRG(rt, rg, opc, fnm) VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%1,%2", fnm)
+
+#if __GNUG__ < 8
+
+    // Support for int4 -> dword2 expanding multiply was added in GCC 8.
+    #ifdef vec_mule
+        #undef vec_mule
+    #endif
+    #ifdef vec_mulo
+        #undef vec_mulo
+    #endif
+    // 8-bit and 16-bit inputs: forward to the GCC builtins
+    VSX_REDIRECT_2RG(vec_ushort8,  vec_uchar16,  vec_mule, __builtin_vec_mule)
+    VSX_REDIRECT_2RG(vec_short8,  vec_char16,  vec_mule, __builtin_vec_mule)
+    VSX_REDIRECT_2RG(vec_int4,  vec_short8,  vec_mule, __builtin_vec_mule)
+    VSX_REDIRECT_2RG(vec_uint4,  vec_ushort8,  vec_mule, __builtin_vec_mule)
+    VSX_REDIRECT_2RG(vec_ushort8,  vec_uchar16,  vec_mulo, __builtin_vec_mulo)
+    VSX_REDIRECT_2RG(vec_short8,  vec_char16,  vec_mulo, __builtin_vec_mulo)
+    VSX_REDIRECT_2RG(vec_int4,  vec_short8,  vec_mulo, __builtin_vec_mulo)
+    VSX_REDIRECT_2RG(vec_uint4,  vec_ushort8,  vec_mulo, __builtin_vec_mulo)
+
+    // dword2 support arrived in ISA 2.07 and GCC 8+. NOTE(review): vec_mule uses vmulo* and vec_mulo uses vmule* - presumably little-endian lane numbering; confirm
+    VSX_IMPL_2VRG(vec_dword2,  vec_int4,  vmulosw, vec_mule)
+    VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmulouw, vec_mule)
+    VSX_IMPL_2VRG(vec_dword2,  vec_int4,  vmulesw, vec_mulo)
+    VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmuleuw, vec_mulo)
+
+#endif
+
+#if __GNUG__ < 7
+// Up to GCC 6, vec_mul only supports single/double precision and long long
+#   ifdef vec_mul
+#       undef vec_mul
+#   endif
+/*
+ * There is no direct instruction for 8-bit or 16-bit multiplication in ISA 2.07;
+ * XLC implements it with the instructions "multiply even", "multiply odd" and "permute", as done here.
+**/
+#   define VSX_IMPL_MULH(Tvec, cperm)                                        \
+    VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b)                  \
+    {                                                                        \
+        static const vec_uchar16 ev_od = {cperm};                            \
+        return vec_perm((Tvec)vec_mule(a, b), (Tvec)vec_mulo(a, b), ev_od);  \
+    }
+    #define VSX_IMPL_MULH_P16 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
+    VSX_IMPL_MULH(vec_char16,  VSX_IMPL_MULH_P16)
+    VSX_IMPL_MULH(vec_uchar16, VSX_IMPL_MULH_P16)
+    #define VSX_IMPL_MULH_P8 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29
+    VSX_IMPL_MULH(vec_short8,  VSX_IMPL_MULH_P8)
+    VSX_IMPL_MULH(vec_ushort8, VSX_IMPL_MULH_P8)
+    // vmuluwm is used here for both signed and unsigned 32-bit integers
+    VSX_IMPL_2VRG(vec_int4,  vec_int4,  vmuluwm, vec_mul)
+    VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
+    // redirect to the GCC builtin vec_mul, since it already supports single/double precision and long long
+    VSX_REDIRECT_2RG(vec_float4,  vec_float4,  vec_mul, __builtin_vec_mul)
+    VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul)
+    VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mul, __builtin_vec_mul)
+    VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mul, __builtin_vec_mul)
+#endif // __GNUG__ < 7
+
+#if __GNUG__ < 6
+/*
+ * The ISA 2.07 instruction "compare greater than or equal" only supports single
+ * and double precision.
+ * For integers, do what XLC and newer GCC do: a >= b is computed as NOT(b > a) using "greater than" and NOR.
+**/
+#   ifdef vec_cmpge
+#       undef vec_cmpge
+#   endif
+#   ifdef vec_cmple
+#       undef vec_cmple
+#   endif
+#   define vec_cmple(a, b) vec_cmpge(b, a)
+#   define VSX_IMPL_CMPGE(rt, rg, opc, fnm) \
+    VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%2,%1\n\t xxlnor %x0,%x0,%x0", fnm)
+
+    VSX_IMPL_CMPGE(vec_bchar16, vec_char16,  vcmpgtsb, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bchar16, vec_uchar16, vcmpgtub, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bshort8, vec_short8,  vcmpgtsh, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bshort8, vec_ushort8, vcmpgtuh, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bint4,   vec_int4,    vcmpgtsw, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bint4,   vec_uint4,   vcmpgtuw, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bdword2, vec_dword2,  vcmpgtsd, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, vec_cmpge)
+
+// redirect to the GCC builtin cmpge, since it already supports single and double precision
+    VSX_REDIRECT_2RG(vec_bint4,   vec_float4,  vec_cmpge, __builtin_vec_cmpge)
+    VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge)
+
+// up to gcc5 vec_nor doesn't support bool long long: generic redirect plus an overload that casts through vec_dword2
+#   undef vec_nor
+    template<typename T>
+    VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor)
+
+    VSX_FINLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b)
+    { return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); }
+
+// vec_packs doesn't support double words in gcc4 and old versions of gcc5 (the asm templates below swap the operands: %2,%1)
+#   undef vec_packs
+    VSX_REDIRECT_2RG(vec_char16,  vec_short8,  vec_packs, __builtin_vec_packs)
+    VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
+    VSX_REDIRECT_2RG(vec_short8,  vec_int4,    vec_packs, __builtin_vec_packs)
+    VSX_REDIRECT_2RG(vec_ushort8, vec_uint4,   vec_packs, __builtin_vec_packs)
+
+    VSX_IMPL_2VRG_F(vec_int4,  vec_dword2,  "vpksdss %0,%2,%1", vec_packs)
+    VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
+#endif // __GNUG__ < 6
+
+#if __GNUG__ < 5
+// vec_xxpermdi in gcc4 lacks little-endian support (as in clang): swap the inputs and remap the 2-bit selector c
+#   define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ (((c) & 1) << 1 | (c) >> 1)))
+// vec_vbpermq is likewise replaced, here with inline asm wrappers
+#   undef vec_vbpermq
+    VSX_IMPL_2VRG(vec_udword2, vec_uchar16, vbpermq, vec_vbpermq)
+    VSX_IMPL_2VRG(vec_dword2,  vec_char16, vbpermq, vec_vbpermq)
+#else
+#   define vec_permi vec_xxpermdi
+#endif // __GNUG__ < 5
+
+// shift left double by word immediate
+#ifndef vec_sldw
+#   define vec_sldw __builtin_vsx_xxsldwi
+#endif
+
+// vector population count; the result is always the unsigned vector type, also for signed inputs
+VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcntu)
+VSX_IMPL_1VRG(vec_uchar16, vec_char16,  vpopcntb, vec_popcntu)
+VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcntu)
+VSX_IMPL_1VRG(vec_ushort8, vec_short8,  vpopcnth, vec_popcntu)
+VSX_IMPL_1VRG(vec_uint4,   vec_uint4,   vpopcntw, vec_popcntu)
+VSX_IMPL_1VRG(vec_uint4,   vec_int4,    vpopcntw, vec_popcntu)
+VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcntu)
+VSX_IMPL_1VRG(vec_udword2, vec_dword2,  vpopcntd, vec_popcntu)
+
+// converts between single and double-precision (the "o" suffix marks conversions between 4-lane and 2-lane vectors)
+VSX_REDIRECT_1RG(vec_float4,  vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
+VSX_REDIRECT_1RG(vec_double2, vec_float4,  vec_cvfo, __builtin_vsx_xvcvspdp)
+
+// converts word and doubleword to double-precision
+#undef vec_ctd
+VSX_IMPL_1RG(vec_double2, vec_int4,    xvcvsxwdp, vec_ctdo)
+VSX_IMPL_1RG(vec_double2, vec_uint4,   xvcvuxwdp, vec_ctdo)
+VSX_IMPL_1RG(vec_double2, vec_dword2,  xvcvsxddp, vec_ctd)
+VSX_IMPL_1RG(vec_double2, vec_udword2, xvcvuxddp, vec_ctd)
+
+// converts word and doubleword to single-precision
+#undef vec_ctf
+VSX_IMPL_1RG(vec_float4, vec_int4,    xvcvsxwsp, vec_ctf)
+VSX_IMPL_1RG(vec_float4, vec_uint4,   xvcvuxwsp, vec_ctf)
+VSX_IMPL_1RG(vec_float4, vec_dword2,  xvcvsxdsp, vec_ctfo)
+VSX_IMPL_1RG(vec_float4, vec_udword2, xvcvuxdsp, vec_ctfo)
+
+// converts single and double precision to signed word
+#undef vec_cts
+VSX_IMPL_1RG(vec_int4,  vec_double2, xvcvdpsxws, vec_ctso)
+VSX_IMPL_1RG(vec_int4,  vec_float4,  xvcvspsxws, vec_cts)
+
+// converts single and double precision to unsigned word
+#undef vec_ctu
+VSX_IMPL_1RG(vec_uint4, vec_double2, xvcvdpuxws, vec_ctuo)
+VSX_IMPL_1RG(vec_uint4, vec_float4,  xvcvspuxws, vec_ctu)
+
+// converts single and double precision to signed doubleword
+#undef vec_ctsl
+VSX_IMPL_1RG(vec_dword2, vec_double2, xvcvdpsxds, vec_ctsl)
+VSX_IMPL_1RG(vec_dword2, vec_float4,  xvcvspsxds, vec_ctslo)
+
+// converts single and double precision to unsigned doubleword
+#undef vec_ctul
+VSX_IMPL_1RG(vec_udword2, vec_double2, xvcvdpuxds, vec_ctul)
+VSX_IMPL_1RG(vec_udword2, vec_float4,  xvcvspuxds, vec_ctulo)
+
+// just in case if GCC doesn't define it
+#ifndef vec_xl
+#   define vec_xl vec_vsx_ld
+#   define vec_xst vec_vsx_st
+#endif
+
+#endif // GCC VSX compatibility
+
+/*
+ * CLANG VSX compatibility
+**/
+#if defined(__clang__) && !defined(__IBMCPP__)
+
+/*
+ * CLANG doesn't support %x<n> in the inline asm template which fixes register number
+ * when using any of the register constraints wa, wd, wf
+ *
+ * For more explanation checkout PowerPC and IBM RS6000 in https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
+ * Also there's already an open bug https://bugs.llvm.org/show_bug.cgi?id=31837
+ *
+ * So we're not able to use inline asm and only use built-in functions that CLANG supports
+ * and use __builtin_convertvector if clang missing any of vector conversions built-in functions
+ *
+ * todo: clang asm template bug is fixed, need to reconsider the current workarounds.
+*/
+
+// convert vector helper: defines fnm(const rg&) returning rt via __builtin_convertvector
+#define VSX_IMPL_CONVERT(rt, rg, fnm) \
+VSX_FINLINE(rt) fnm(const rg& a) { return __builtin_convertvector(a, rt); }
+
+#ifndef vec_permi
+#if __clang_major__ < 5
+// implement vec_permi in a dirty way: one merge combination per selector value (any c other than 0..2 takes the default branch)
+#   define VSX_IMPL_CLANG_4_PERMI(Tvec)                                                 \
+    VSX_FINLINE(Tvec) vec_permi(const Tvec& a, const Tvec& b, unsigned const char c)    \
+    {                                                                                   \
+        switch (c)                                                                      \
+        {                                                                               \
+        case 0:                                                                         \
+            return vec_mergeh(a, b);                                                    \
+        case 1:                                                                         \
+            return vec_mergel(vec_mergeh(a, a), b);                                     \
+        case 2:                                                                         \
+            return vec_mergeh(vec_mergel(a, a), b);                                     \
+        default:                                                                        \
+            return vec_mergel(a, b);                                                    \
+        }                                                                               \
+    }
+    VSX_IMPL_CLANG_4_PERMI(vec_udword2)
+    VSX_IMPL_CLANG_4_PERMI(vec_dword2)
+    VSX_IMPL_CLANG_4_PERMI(vec_double2)
+
+// vec_xxsldwi is missing in clang 4: emulate the word shift with a byte shift (4 bytes per word)
+#   define vec_xxsldwi(a, b, c) vec_sld(a, b, (c) * 4)
+#else
+// vec_xxpermdi lacks little-endian support (NOTE(review): the original comment says "clang 4", but this branch is clang >= 5 - confirm)
+#   define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ (((c) & 1) << 1 | (c) >> 1)))
+#endif // __clang_major__ < 5
+#endif
+
+// shift left double by word immediate
+#ifndef vec_sldw
+#   define vec_sldw vec_xxsldwi
+#endif
+
+#if __clang_major__ < 13
+// Implement vec_rsqrt as 1 / sqrt(a), since clang only supports vec_rsqrte
+#ifndef vec_rsqrt
+    VSX_FINLINE(vec_float4) vec_rsqrt(const vec_float4& a)
+    { return vec_div(vec_float4_sp(1), vec_sqrt(a)); }
+
+    VSX_FINLINE(vec_double2) vec_rsqrt(const vec_double2& a)
+    { return vec_div(vec_double2_sp(1), vec_sqrt(a)); }
+#endif
+
+// vec_promote is missing support for doubleword: lane (b & 1) receives a, the other lane stays zero
+VSX_FINLINE(vec_dword2) vec_promote(long long a, int b)
+{
+    vec_dword2 ret = vec_dword2_z;
+    ret[b & 1] = a; // b & 1 keeps the lane index within the two available lanes
+    return ret;
+}
+
+VSX_FINLINE(vec_udword2) vec_promote(unsigned long long a, int b)
+{
+    vec_udword2 ret = vec_udword2_z;
+    ret[b & 1] = a; // b & 1 keeps the lane index within the two available lanes
+    return ret;
+}
+#endif // __clang_major__ < 13
+
+// vec_popcnt should return unsigned, but clang disagrees (as gcc does in vec_vpopcnt): cast the result for signed inputs
+#define VSX_IMPL_POPCNTU(Tvec, Tvec2, ucast)   \
+VSX_FINLINE(Tvec) vec_popcntu(const Tvec2& a)  \
+{ return ucast(vec_popcnt(a)); }
+VSX_IMPL_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c)
+VSX_IMPL_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c)
+VSX_IMPL_POPCNTU(vec_uint4,   vec_int4,   vec_uint4_c)
+VSX_IMPL_POPCNTU(vec_udword2, vec_dword2, vec_udword2_c)
+// redirect unsigned types (no cast needed; the macro expands to a complete function definition, so no ';' follows)
+VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
+VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
+VSX_REDIRECT_1RG(vec_uint4,   vec_uint4,   vec_popcntu, vec_popcnt)
+VSX_REDIRECT_1RG(vec_udword2, vec_udword2, vec_popcntu, vec_popcnt)
+
+// converts between single and double precision
+VSX_REDIRECT_1RG(vec_float4,  vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
+VSX_REDIRECT_1RG(vec_double2, vec_float4,  vec_cvfo, __builtin_vsx_xvcvspdp)
+
+// converts word and doubleword to double-precision
+#ifdef vec_ctd
+#   undef vec_ctd
+#endif
+VSX_REDIRECT_1RG(vec_double2, vec_int4,  vec_ctdo, __builtin_vsx_xvcvsxwdp)
+VSX_REDIRECT_1RG(vec_double2, vec_uint4, vec_ctdo, __builtin_vsx_xvcvuxwdp)
+
+VSX_IMPL_CONVERT(vec_double2, vec_dword2,  vec_ctd)
+VSX_IMPL_CONVERT(vec_double2, vec_udword2, vec_ctd)
+
+// converts word and doubleword to single-precision
+#if __clang_major__ > 4
+#   undef vec_ctf
+#endif
+VSX_IMPL_CONVERT(vec_float4, vec_int4,    vec_ctf)
+VSX_IMPL_CONVERT(vec_float4, vec_uint4,   vec_ctf)
+VSX_REDIRECT_1RG(vec_float4, vec_dword2,  vec_ctfo, __builtin_vsx_xvcvsxdsp)
+VSX_REDIRECT_1RG(vec_float4, vec_udword2, vec_ctfo, __builtin_vsx_xvcvuxdsp)
+
+// converts single and double precision to signed word
+#if __clang_major__ > 4
+#   undef vec_cts
+#endif
+VSX_REDIRECT_1RG(vec_int4,  vec_double2, vec_ctso, __builtin_vsx_xvcvdpsxws)
+VSX_IMPL_CONVERT(vec_int4,  vec_float4,  vec_cts)
+
+// converts single and double precision to unsigned word
+#if __clang_major__ > 4
+#   undef vec_ctu
+#endif
+VSX_REDIRECT_1RG(vec_uint4, vec_double2, vec_ctuo, __builtin_vsx_xvcvdpuxws)
+VSX_IMPL_CONVERT(vec_uint4, vec_float4,  vec_ctu)
+
+// converts single and double precision to signed doubleword
+#ifdef vec_ctsl
+#   undef vec_ctsl
+#endif
+VSX_IMPL_CONVERT(vec_dword2, vec_double2, vec_ctsl)
+// __builtin_convertvector cannot do this conversion (xvcvspsxds is missing), so go through vec_cvfo and then vec_ctsl
+VSX_FINLINE(vec_dword2) vec_ctslo(const vec_float4& a)
+{ return vec_ctsl(vec_cvfo(a)); }
+
+// converts single and double precision to unsigned doubleword
+#ifdef vec_ctul
+#   undef vec_ctul
+#endif
+VSX_IMPL_CONVERT(vec_udword2, vec_double2, vec_ctul)
+// __builtin_convertvector cannot do this conversion (xvcvspuxds is missing), so go through vec_cvfo and then vec_ctul
+VSX_FINLINE(vec_udword2) vec_ctulo(const vec_float4& a)
+{ return vec_ctul(vec_cvfo(a)); }
+
+#endif // CLANG VSX compatibility
+
+/*
+ * Common GCC, CLANG compatibility
+**/
+#if defined(__GNUG__) && !defined(__IBMCPP__)
+
+#ifdef vec_cvf
+#   undef vec_cvf
+#endif
+
+#define VSX_IMPL_CONV_EVEN_4_2(rt, rg, fnm, fn2) \
+VSX_FINLINE(rt) fnm(const rg& a)                 \
+{ return fn2(vec_sldw(a, a, 1)); }
+
+VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_float4, vec_cvf,  vec_cvfo)
+VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_int4,   vec_ctd,  vec_ctdo)
+VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_uint4,  vec_ctd,  vec_ctdo)
+
+VSX_IMPL_CONV_EVEN_4_2(vec_dword2,  vec_float4, vec_ctsl, vec_ctslo)
+VSX_IMPL_CONV_EVEN_4_2(vec_udword2, vec_float4, vec_ctul, vec_ctulo)
+
+#define VSX_IMPL_CONV_EVEN_2_4(rt, rg, fnm, fn2) \
+VSX_FINLINE(rt) fnm(const rg& a)                 \
+{                                                \
+    rt v4 = fn2(a);                              \
+    return vec_sldw(v4, v4, 3);                  \
+}
+
+VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_double2, vec_cvf, vec_cvfo)
+VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_dword2,  vec_ctf, vec_ctfo)
+VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_udword2, vec_ctf, vec_ctfo)
+
+VSX_IMPL_CONV_EVEN_2_4(vec_int4,   vec_double2, vec_cts, vec_ctso)
+VSX_IMPL_CONV_EVEN_2_4(vec_uint4,  vec_double2, vec_ctu, vec_ctuo)
+
+// Only for Eigen!
+/*
+ * Changing the behavior of the conversion intrinsics for gcc affects Eigen,
+ * so the old two-argument forms are defined again here, for gcc and clang only.
+*/
+#if !defined(__clang__) || __clang_major__ > 4
+    // the second arg is asserted to be 0 and otherwise ignored, since Eigen only truncates toward zero
+#   define VSX_IMPL_CONV_2VARIANT(rt, rg, fnm, fn2)     \
+    VSX_FINLINE(rt) fnm(const rg& a, int only_truncate) \
+    {                                                   \
+        assert(only_truncate == 0);                     \
+        CV_UNUSED(only_truncate);                       \
+        return fn2(a);                                  \
+    }
+    VSX_IMPL_CONV_2VARIANT(vec_int4,   vec_float4,  vec_cts, vec_cts)
+    VSX_IMPL_CONV_2VARIANT(vec_uint4,  vec_float4,  vec_ctu, vec_ctu)
+    VSX_IMPL_CONV_2VARIANT(vec_float4, vec_int4,    vec_ctf, vec_ctf)
+    VSX_IMPL_CONV_2VARIANT(vec_float4, vec_uint4,   vec_ctf, vec_ctf)
+    // define vec_cts for converting double precision to signed doubleword,
+    // which isn't compatible with xlc, but that's okay since Eigen only uses it for gcc
+    VSX_IMPL_CONV_2VARIANT(vec_dword2, vec_double2, vec_cts, vec_ctsl)
+#endif // Eigen
+
+#endif // Common GCC, CLANG compatibility
+
+/*
+ * XLC VSX compatibility
+**/
+#if defined(__IBMCPP__)
+
+// vector population count
+#define vec_popcntu vec_popcnt
+
+// overload and redirect with setting second arg to zero
+// since we only support conversions without the second arg
+#define VSX_IMPL_OVERLOAD_Z2(rt, rg, fnm) \
+VSX_FINLINE(rt) fnm(const rg& a) { return fnm(a, 0); }
+
+VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_int4,    vec_ctd)
+VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_uint4,   vec_ctd)
+VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_dword2,  vec_ctd)
+VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_udword2, vec_ctd)
+
+VSX_IMPL_OVERLOAD_Z2(vec_float4,  vec_int4,    vec_ctf)
+VSX_IMPL_OVERLOAD_Z2(vec_float4,  vec_uint4,   vec_ctf)
+VSX_IMPL_OVERLOAD_Z2(vec_float4,  vec_dword2,  vec_ctf)
+VSX_IMPL_OVERLOAD_Z2(vec_float4,  vec_udword2, vec_ctf)
+
+VSX_IMPL_OVERLOAD_Z2(vec_int4,    vec_double2, vec_cts)
+VSX_IMPL_OVERLOAD_Z2(vec_int4,    vec_float4,  vec_cts)
+
+VSX_IMPL_OVERLOAD_Z2(vec_uint4,   vec_double2, vec_ctu)
+VSX_IMPL_OVERLOAD_Z2(vec_uint4,   vec_float4,  vec_ctu)
+
+VSX_IMPL_OVERLOAD_Z2(vec_dword2,  vec_double2, vec_ctsl)
+VSX_IMPL_OVERLOAD_Z2(vec_dword2,  vec_float4,  vec_ctsl)
+
+VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_double2, vec_ctul)
+VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_float4,  vec_ctul)
+
+// fixme: implement conversions of odd-numbered elements in a dirty way
+// since xlc doesn't support VSX registers operand in inline asm.
+#define VSX_IMPL_CONV_ODD_4_2(rt, rg, fnm, fn2) \
+VSX_FINLINE(rt) fnm(const rg& a) { return fn2(vec_sldw(a, a, 3)); }
+
+VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_float4, vec_cvfo,  vec_cvf)
+VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_int4,   vec_ctdo,  vec_ctd)
+VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_uint4,  vec_ctdo,  vec_ctd)
+
+VSX_IMPL_CONV_ODD_4_2(vec_dword2,  vec_float4, vec_ctslo, vec_ctsl)
+VSX_IMPL_CONV_ODD_4_2(vec_udword2, vec_float4, vec_ctulo, vec_ctul)
+
+#define VSX_IMPL_CONV_ODD_2_4(rt, rg, fnm, fn2)  \
+VSX_FINLINE(rt) fnm(const rg& a)                 \
+{                                                \
+    rt v4 = fn2(a);                              \
+    return vec_sldw(v4, v4, 1);                  \
+}
+
+VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_double2, vec_cvfo, vec_cvf)
+VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_dword2,  vec_ctfo, vec_ctf)
+VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_udword2, vec_ctfo, vec_ctf)
+
+VSX_IMPL_CONV_ODD_2_4(vec_int4,   vec_double2, vec_ctso, vec_cts)
+VSX_IMPL_CONV_ODD_2_4(vec_uint4,  vec_double2, vec_ctuo, vec_ctu)
+
+#endif // XLC VSX compatibility
+
+// silence the GCC warning raised by -Wunused-but-set-variable in rare cases
+#if defined(__GNUG__) && !defined(__clang__)
+#   define VSX_UNUSED(Tvec) Tvec __attribute__((__unused__))
+#else // CLANG, XLC
+#   define VSX_UNUSED(Tvec) Tvec
+#endif
+
+// gcc resolves the long int cast on its own; with XLC and CLANG the call is ambiguous, so cast to (unsigned) long long explicitly
+#if defined(__clang__) || defined(__IBMCPP__)
+    VSX_FINLINE(vec_udword2) vec_splats(uint64 v)
+    { return vec_splats((unsigned long long) v); }
+
+    VSX_FINLINE(vec_dword2) vec_splats(int64 v)
+    { return vec_splats((long long) v); }
+
+    VSX_FINLINE(vec_udword2) vec_promote(uint64 a, int b)
+    { return vec_promote((unsigned long long) a, b); }
+
+    VSX_FINLINE(vec_dword2) vec_promote(int64 a, int b)
+    { return vec_promote((long long) a, b); }
+#endif
+
+/*
+ * implement vsx_ld(offset, pointer), vsx_st(vector, offset, pointer)
+ * load and store where the offset counts elements: it is scaled by sizeof(*pointer)
+ *
+ * implement vsx_ldf(offset, pointer), vsx_stf(vector, offset, pointer)
+ * load and store where the offset is passed through unscaled, as a fixed size in bytes
+ *
+ * Note: in clang, vec_xl and vec_xst fail on unaligned addresses,
+ * so vec_vsx_ld and vec_vsx_st are used instead
+*/
+
+#if defined(__clang__) && !defined(__IBMCPP__)
+#   define vsx_ldf  vec_vsx_ld
+#   define vsx_stf  vec_vsx_st
+#else // GCC , XLC
+#   define vsx_ldf  vec_xl
+#   define vsx_stf  vec_xst
+#endif
+
+#define VSX_OFFSET(o, p) ((o) * sizeof(*(p)))
+#define vsx_ld(o, p) vsx_ldf(VSX_OFFSET(o, p), p)
+#define vsx_st(v, o, p) vsx_stf(v, VSX_OFFSET(o, p), p)
+
+/*
+ * implement vsx_ld2(offset, pointer), vsx_st2(vector, offset, pointer) to load and store double words
+ * In GCC, vec_xl and vec_xst map to vec_vsx_ld and vec_vsx_st, which don't support long long,
+ * and in CLANG vec_vsx_ld, vec_vsx_st are used because vec_xl, vec_xst fail on unaligned addresses
+ *
+ * In XLC, vec_xl and vec_xst fail to cast int64 (long int) to long long
+*/
+#if (defined(__GNUG__) || defined(__clang__)) && !defined(__IBMCPP__)
+    VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
+    { return vec_udword2_c(vsx_ldf(VSX_OFFSET(o, p), (unsigned int*)p)); }
+
+    VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64* p)
+    { return vec_dword2_c(vsx_ldf(VSX_OFFSET(o, p), (int*)p)); }
+
+    VSX_FINLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
+    { vsx_stf(vec_uint4_c(vec), VSX_OFFSET(o, p), (unsigned int*)p); }
+
+    VSX_FINLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
+    { vsx_stf(vec_int4_c(vec), VSX_OFFSET(o, p), (int*)p); }
+#else // XLC
+    VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
+    { return vsx_ldf(VSX_OFFSET(o, p), (unsigned long long*)p); }
+
+    VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64* p)
+    { return vsx_ldf(VSX_OFFSET(o, p), (long long*)p); }
+
+    VSX_FINLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
+    { vsx_stf(vec, VSX_OFFSET(o, p), (unsigned long long*)p); }
+
+    VSX_FINLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
+    { vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); }
+#endif
+
+// Store the lower 8 bytes: doubleword element 0 of v is written through p as a uint64
+#define vec_st_l8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 0)
+
+// Store the higher 8 bytes: doubleword element 1 of v is written through p as a uint64
+#define vec_st_h8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 1)
+
+// Load 64 bits of integer data into the lower part: read a uint64 through p and vec_promote it into element 0
+#define VSX_IMPL_LOAD_L8(Tvec, Tp)                  \
+VSX_FINLINE(Tvec) vec_ld_l8(const Tp *p)            \
+{ return ((Tvec)vec_promote(*((uint64*)p), 0)); }
+
+VSX_IMPL_LOAD_L8(vec_uchar16, uchar)
+VSX_IMPL_LOAD_L8(vec_char16,  schar)
+VSX_IMPL_LOAD_L8(vec_ushort8, ushort)
+VSX_IMPL_LOAD_L8(vec_short8,  short)
+VSX_IMPL_LOAD_L8(vec_uint4,   uint)
+VSX_IMPL_LOAD_L8(vec_int4,    int)
+VSX_IMPL_LOAD_L8(vec_float4,  float)
+VSX_IMPL_LOAD_L8(vec_udword2, uint64)
+VSX_IMPL_LOAD_L8(vec_dword2,  int64)
+VSX_IMPL_LOAD_L8(vec_double2, double)
+
+// logical not, as vec_nor(a, a); note that the argument is expanded twice
+#define vec_not(a) vec_nor(a, a)
+
+// fallbacks for intrinsics that may be unavailable (see the guards below)
+// not equal: vec_not(vec_cmpeq(a, b)) when vec_cmpne is not already defined
+#ifndef vec_cmpne
+#   define vec_cmpne(a, b) vec_not(vec_cmpeq(a, b))
+#endif
+
+// absolute difference: max(a, b) - min(a, b) when _ARCH_PWR9 is not defined
+#ifndef _ARCH_PWR9
+#   undef vec_absd
+#   define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b))
+#endif
+
+/*
+ * Implement vec_unpacklu and vec_unpackhu (by merging the input with the `zero` vector)
+ * since vec_unpackl, vec_unpackh only support signed integers
+**/
+#define VSX_IMPL_UNPACKU(rt, rg, zero)      \
+VSX_FINLINE(rt) vec_unpacklu(const rg& a)   \
+{ return (rt)(vec_mergel(a, zero)); }       \
+VSX_FINLINE(rt) vec_unpackhu(const rg& a)   \
+{ return (rt)(vec_mergeh(a, zero));  }
+
+VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z)
+VSX_IMPL_UNPACKU(vec_uint4,   vec_ushort8, vec_ushort8_z)
+VSX_IMPL_UNPACKU(vec_udword2, vec_uint4,   vec_uint4_z)
+
+/*
+ * Implement vec_mergesqe and vec_mergesqo
+ * Merge, in sequence, the even-indexed (vec_mergesqe) or odd-indexed (vec_mergesqo) elements of two vectors
+*/
+#define VSX_IMPL_PERM(rt, fnm, ...)            \
+VSX_FINLINE(rt) fnm(const rt& a, const rt& b)  \
+{ static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }
+
+// 16 lanes: one byte index per element
+#define perm16_mergesqe 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+#define perm16_mergesqo 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+VSX_IMPL_PERM(vec_uchar16, vec_mergesqe, perm16_mergesqe)
+VSX_IMPL_PERM(vec_uchar16, vec_mergesqo, perm16_mergesqo)
+VSX_IMPL_PERM(vec_char16,  vec_mergesqe, perm16_mergesqe)
+VSX_IMPL_PERM(vec_char16,  vec_mergesqo, perm16_mergesqo)
+// 8 lanes: two byte indices per element
+#define perm8_mergesqe 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+#define perm8_mergesqo 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
+VSX_IMPL_PERM(vec_ushort8, vec_mergesqe, perm8_mergesqe)
+VSX_IMPL_PERM(vec_ushort8, vec_mergesqo, perm8_mergesqo)
+VSX_IMPL_PERM(vec_short8,  vec_mergesqe, perm8_mergesqe)
+VSX_IMPL_PERM(vec_short8,  vec_mergesqo, perm8_mergesqo)
+// 4 lanes: four byte indices per element
+#define perm4_mergesqe 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+#define perm4_mergesqo 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+VSX_IMPL_PERM(vec_uint4,  vec_mergesqe, perm4_mergesqe)
+VSX_IMPL_PERM(vec_uint4,  vec_mergesqo, perm4_mergesqo)
+VSX_IMPL_PERM(vec_int4,   vec_mergesqe, perm4_mergesqe)
+VSX_IMPL_PERM(vec_int4,   vec_mergesqo, perm4_mergesqo)
+VSX_IMPL_PERM(vec_float4, vec_mergesqe, perm4_mergesqe)
+VSX_IMPL_PERM(vec_float4, vec_mergesqo, perm4_mergesqo)
+// 2 lanes: redirected to vec_mergeh / vec_mergel
+VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqe, vec_mergeh)
+VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqo, vec_mergel)
+VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqe, vec_mergeh)
+VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqo, vec_mergel)
+VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqe, vec_mergeh)
+VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqo, vec_mergel)
+
+/*
+ * Implement vec_mergesqh and vec_mergesql
+ * Merge, in sequence, the most and least significant halves of two vectors (via vec_mergeh / vec_mergel on vec_udword2 casts)
+*/
+#define VSX_IMPL_MERGESQHL(Tvec)                                    \
+VSX_FINLINE(Tvec) vec_mergesqh(const Tvec& a, const Tvec& b)        \
+{ return (Tvec)vec_mergeh(vec_udword2_c(a), vec_udword2_c(b)); }    \
+VSX_FINLINE(Tvec) vec_mergesql(const Tvec& a, const Tvec& b)        \
+{ return (Tvec)vec_mergel(vec_udword2_c(a), vec_udword2_c(b)); }
+VSX_IMPL_MERGESQHL(vec_uchar16)
+VSX_IMPL_MERGESQHL(vec_char16)
+VSX_IMPL_MERGESQHL(vec_ushort8)
+VSX_IMPL_MERGESQHL(vec_short8)
+VSX_IMPL_MERGESQHL(vec_uint4)
+VSX_IMPL_MERGESQHL(vec_int4)
+VSX_IMPL_MERGESQHL(vec_float4)
+VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqh, vec_mergeh)
+VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesql, vec_mergel)
+VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqh, vec_mergeh)
+VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesql, vec_mergel)
+VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqh, vec_mergeh)
+VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesql, vec_mergel)
+
+
+// 2 and 4 channels interleave for all types except 2 lanes; results are stored with vsx_stf at byte offsets 0, 16, 32, 48
+#define VSX_IMPL_ST_INTERLEAVE(Tp, Tvec)                                    \
+VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr)  \
+{                                                                           \
+    vsx_stf(vec_mergeh(a, b), 0, ptr);                                      \
+    vsx_stf(vec_mergel(a, b), 16, ptr);                                     \
+}                                                                           \
+VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,           \
+                                     const Tvec& c, const Tvec& d, Tp* ptr) \
+{                                                                           \
+    Tvec ac = vec_mergeh(a, c);                                             \
+    Tvec bd = vec_mergeh(b, d);                                             \
+    vsx_stf(vec_mergeh(ac, bd), 0, ptr);                                    \
+    vsx_stf(vec_mergel(ac, bd), 16, ptr);                                   \
+    ac = vec_mergel(a, c);                                                  \
+    bd = vec_mergel(b, d);                                                  \
+    vsx_stf(vec_mergeh(ac, bd), 32, ptr);                                   \
+    vsx_stf(vec_mergel(ac, bd), 48, ptr);                                   \
+}
+VSX_IMPL_ST_INTERLEAVE(uchar,  vec_uchar16)
+VSX_IMPL_ST_INTERLEAVE(schar,  vec_char16)
+VSX_IMPL_ST_INTERLEAVE(ushort, vec_ushort8)
+VSX_IMPL_ST_INTERLEAVE(short,  vec_short8)
+VSX_IMPL_ST_INTERLEAVE(uint,   vec_uint4)
+VSX_IMPL_ST_INTERLEAVE(int,    vec_int4)
+VSX_IMPL_ST_INTERLEAVE(float,  vec_float4)
+
+// 2 and 4 channels deinterleave for 16 lanes (uchar / schar), built from vec_mergesqe / vec_mergesqo
+#define VSX_IMPL_ST_DINTERLEAVE_8(Tp, Tvec)                                 \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)      \
+{                                                                           \
+    Tvec v0 = vsx_ld(0, ptr);                                               \
+    Tvec v1 = vsx_ld(16, ptr);                                              \
+    a = vec_mergesqe(v0, v1);                                               \
+    b = vec_mergesqo(v0, v1);                                               \
+}                                                                           \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,      \
+                                       Tvec& c, Tvec& d)                    \
+{                                                                           \
+    Tvec v0 = vsx_ld(0, ptr);                                               \
+    Tvec v1 = vsx_ld(16, ptr);                                              \
+    Tvec v2 = vsx_ld(32, ptr);                                              \
+    Tvec v3 = vsx_ld(48, ptr);                                              \
+    Tvec m0 = vec_mergesqe(v0, v1);                                         \
+    Tvec m1 = vec_mergesqe(v2, v3);                                         \
+    a = vec_mergesqe(m0, m1);                                               \
+    c = vec_mergesqo(m0, m1);                                               \
+    m0 = vec_mergesqo(v0, v1);                                              \
+    m1 = vec_mergesqo(v2, v3);                                              \
+    b = vec_mergesqe(m0, m1);                                               \
+    d = vec_mergesqo(m0, m1);                                               \
+}
+VSX_IMPL_ST_DINTERLEAVE_8(uchar, vec_uchar16)
+VSX_IMPL_ST_DINTERLEAVE_8(schar, vec_char16)
+
+// 2 and 4 channels deinterleave for 8 lanes (ushort / short); vsx_ld offsets here count elements, not bytes
+#define VSX_IMPL_ST_DINTERLEAVE_16(Tp, Tvec)                                \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)      \
+{                                                                           \
+    Tvec v0 = vsx_ld(0, ptr);                                               \
+    Tvec v1 = vsx_ld(8, ptr);                                               \
+    a = vec_mergesqe(v0, v1);                                               \
+    b = vec_mergesqo(v0, v1);                                               \
+}                                                                           \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,      \
+                                       Tvec& c, Tvec& d)                    \
+{                                                                           \
+    Tvec v0 = vsx_ld(0, ptr);                                               \
+    Tvec v1 = vsx_ld(8, ptr);                                               \
+    Tvec m0 = vec_mergeh(v0, v1);                                           \
+    Tvec m1 = vec_mergel(v0, v1);                                           \
+    Tvec ab0 = vec_mergeh(m0, m1);                                          \
+    Tvec cd0 = vec_mergel(m0, m1);                                          \
+    v0 = vsx_ld(16, ptr);                                                   \
+    v1 = vsx_ld(24, ptr);                                                   \
+    m0 = vec_mergeh(v0, v1);                                                \
+    m1 = vec_mergel(v0, v1);                                                \
+    Tvec ab1 = vec_mergeh(m0, m1);                                          \
+    Tvec cd1 = vec_mergel(m0, m1);                                          \
+    a = vec_mergesqh(ab0, ab1);                                             \
+    b = vec_mergesql(ab0, ab1);                                             \
+    c = vec_mergesqh(cd0, cd1);                                             \
+    d = vec_mergesql(cd0, cd1);                                             \
+}
+VSX_IMPL_ST_DINTERLEAVE_16(ushort, vec_ushort8)
+VSX_IMPL_ST_DINTERLEAVE_16(short,  vec_short8)
+
+// 2 and 4 channels deinterleave for 4 lanes (uint / int / float), built from vec_mergeh / vec_mergel
+#define VSX_IMPL_ST_DINTERLEAVE_32(Tp, Tvec)                                \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)      \
+{                                                                           \
+    a = vsx_ld(0, ptr);                                                     \
+    b = vsx_ld(4, ptr);                                                     \
+    Tvec m0 = vec_mergeh(a, b);                                             \
+    Tvec m1 = vec_mergel(a, b);                                             \
+    a = vec_mergeh(m0, m1);                                                 \
+    b = vec_mergel(m0, m1);                                                 \
+}                                                                           \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,      \
+                                       Tvec& c, Tvec& d)                    \
+{                                                                           \
+    Tvec v0 = vsx_ld(0, ptr);                                               \
+    Tvec v1 = vsx_ld(4, ptr);                                               \
+    Tvec v2 = vsx_ld(8, ptr);                                               \
+    Tvec v3 = vsx_ld(12, ptr);                                              \
+    Tvec m0 = vec_mergeh(v0, v2);                                           \
+    Tvec m1 = vec_mergeh(v1, v3);                                           \
+    a = vec_mergeh(m0, m1);                                                 \
+    b = vec_mergel(m0, m1);                                                 \
+    m0 = vec_mergel(v0, v2);                                                \
+    m1 = vec_mergel(v1, v3);                                                \
+    c = vec_mergeh(m0, m1);                                                 \
+    d = vec_mergel(m0, m1);                                                 \
+}
+VSX_IMPL_ST_DINTERLEAVE_32(uint,  vec_uint4)
+VSX_IMPL_ST_DINTERLEAVE_32(int,   vec_int4)
+VSX_IMPL_ST_DINTERLEAVE_32(float, vec_float4)
+
+// 2 and 4 channels interleave and deinterleave for 2 lanes (int64 / uint64 / double); ld_func / st_func name the load and store helpers to call
+#define VSX_IMPL_ST_D_INTERLEAVE_64(Tp, Tvec, ld_func, st_func)             \
+VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr)  \
+{                                                                           \
+    st_func(vec_mergeh(a, b), 0, ptr);                                      \
+    st_func(vec_mergel(a, b), 2, ptr);                                      \
+}                                                                           \
+VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,           \
+                                     const Tvec& c, const Tvec& d, Tp* ptr) \
+{                                                                           \
+    st_func(vec_mergeh(a, b), 0, ptr);                                      \
+    st_func(vec_mergeh(c, d), 2, ptr);                                      \
+    st_func(vec_mergel(a, b), 4, ptr);                                      \
+    st_func(vec_mergel(c, d), 6, ptr);                                      \
+}                                                                           \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)      \
+{                                                                           \
+    Tvec m0 = ld_func(0, ptr);                                              \
+    Tvec m1 = ld_func(2, ptr);                                              \
+    a = vec_mergeh(m0, m1);                                                 \
+    b = vec_mergel(m0, m1);                                                 \
+}                                                                           \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,      \
+                                       Tvec& c, Tvec& d)                    \
+{                                                                           \
+    Tvec v0 = ld_func(0, ptr);                                              \
+    Tvec v1 = ld_func(2, ptr);                                              \
+    Tvec v2 = ld_func(4, ptr);                                              \
+    Tvec v3 = ld_func(6, ptr);                                              \
+    a = vec_mergeh(v0, v2);                                                 \
+    b = vec_mergel(v0, v2);                                                 \
+    c = vec_mergeh(v1, v3);                                                 \
+    d = vec_mergel(v1, v3);                                                 \
+}
+VSX_IMPL_ST_D_INTERLEAVE_64(int64,  vec_dword2,  vsx_ld2, vsx_st2)
+VSX_IMPL_ST_D_INTERLEAVE_64(uint64, vec_udword2, vsx_ld2, vsx_st2)
+VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld,  vsx_st)
+
+/* 3 channels: interleave and deinterleave for 16, 8, 4 and 2 lanes */
+#define VSX_IMPL_ST_INTERLEAVE_3CH_16(Tp, Tvec)                                                   \
+VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                 \
+                                     const Tvec& c, Tp* ptr)                                      \
+{                                                                                                 \
+    static const vec_uchar16 a12 = {0, 16, 0, 1, 17, 0, 2, 18, 0, 3, 19, 0, 4, 20, 0, 5};         \
+    static const vec_uchar16 a123 = {0, 1, 16, 3, 4, 17, 6, 7, 18, 9, 10, 19, 12, 13, 20, 15};    \
+    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr);                                       \
+    static const vec_uchar16 b12 = {21, 0, 6, 22, 0, 7, 23, 0, 8, 24, 0, 9, 25, 0, 10, 26};       \
+    static const vec_uchar16 b123 = {0, 21, 2, 3, 22, 5, 6, 23, 8, 9, 24, 11, 12, 25, 14, 15};    \
+    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 16, ptr);                                      \
+    static const vec_uchar16 c12 = {0, 11, 27, 0, 12, 28, 0, 13, 29, 0, 14, 30, 0, 15, 31, 0};    \
+    static const vec_uchar16 c123 = {26, 1, 2, 27, 4, 5, 28, 7, 8, 29, 10, 11, 30, 13, 14, 31};   \
+    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 32, ptr);                                      \
+}                                                                                                 \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                   \
+{                                                                                                 \
+    Tvec v1 = vsx_ld(0, ptr);                                                                     \
+    Tvec v2 = vsx_ld(16, ptr);                                                                    \
+    Tvec v3 = vsx_ld(32, ptr);                                                                    \
+    static const vec_uchar16 a12_perm = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0};  \
+    static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29};  \
+    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm);                                      \
+    static const vec_uchar16 b12_perm = {1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0, 0, 0}; \
+    static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30};  \
+    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm);                                      \
+    static const vec_uchar16 c12_perm = {2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0, 0, 0};  \
+    static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31};  \
+    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm);                                      \
+}
+VSX_IMPL_ST_INTERLEAVE_3CH_16(uchar, vec_uchar16)
+VSX_IMPL_ST_INTERLEAVE_3CH_16(schar, vec_char16)
+
+#define VSX_IMPL_ST_INTERLEAVE_3CH_8(Tp, Tvec)                                                    \
+VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                 \
+                                     const Tvec& c, Tp* ptr)                                      \
+{                                                                                                 \
+    static const vec_uchar16 a12 = {0, 1, 16, 17, 0, 0, 2, 3, 18, 19, 0, 0, 4, 5, 20, 21};        \
+    static const vec_uchar16 a123 = {0, 1, 2, 3, 16, 17, 6, 7, 8, 9, 18, 19, 12, 13, 14, 15};     \
+    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr);                                       \
+    static const vec_uchar16 b12 = {0, 0, 6, 7, 22, 23, 0, 0, 8, 9, 24, 25, 0, 0, 10, 11};        \
+    static const vec_uchar16 b123 = {20, 21, 2, 3, 4, 5, 22, 23, 8, 9, 10, 11, 24, 25, 14, 15};   \
+    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 8, ptr);                                       \
+    static const vec_uchar16 c12 = {26, 27, 0, 0, 12, 13, 28, 29, 0, 0, 14, 15, 30, 31, 0, 0};    \
+    static const vec_uchar16 c123 = {0, 1, 26, 27, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 30, 31};   \
+    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 16, ptr);                                      \
+}                                                                                                 \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                   \
+{                                                                                                 \
+    Tvec v1 = vsx_ld(0, ptr);                                                                     \
+    Tvec v2 = vsx_ld(8, ptr);                                                                     \
+    Tvec v3 = vsx_ld(16, ptr);                                                                    \
+    static const vec_uchar16 a12_perm = {0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31, 0, 0, 0, 0}; \
+    static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 26, 27};  \
+    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm);                                      \
+    static const vec_uchar16 b12_perm = {2, 3, 8, 9, 14, 15, 20, 21, 26, 27, 0, 0, 0, 0, 0, 0};   \
+    static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 22, 23, 28, 29};  \
+    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm);                                      \
+    static const vec_uchar16 c12_perm = {4, 5, 10, 11, 16, 17, 22, 23, 28, 29, 0, 0, 0, 0, 0, 0}; \
+    static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 24, 25, 30, 31};  \
+    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm);                                      \
+}
+VSX_IMPL_ST_INTERLEAVE_3CH_8(ushort, vec_ushort8)
+VSX_IMPL_ST_INTERLEAVE_3CH_8(short,  vec_short8)
+
+#define VSX_IMPL_ST_INTERLEAVE_3CH_4(Tp, Tvec)                                                     \
+VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                  \
+                                     const Tvec& c, Tp* ptr)                                       \
+{                                                                                                  \
+    Tvec hbc = vec_mergeh(b, c);                                                                   \
+    static const vec_uchar16 ahbc = {0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 4, 5, 6, 7};      \
+    vsx_st(vec_perm(a, hbc, ahbc), 0, ptr);                                                        \
+    Tvec lab = vec_mergel(a, b);                                                                   \
+    vsx_st(vec_sld(lab, hbc, 8), 4, ptr);                                                          \
+    static const vec_uchar16 clab = {8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, 12, 13, 14, 15};\
+    vsx_st(vec_perm(c, lab, clab), 8, ptr);                                                        \
+}                                                                                                  \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                    \
+{                                                                                                  \
+    Tvec v1 = vsx_ld(0, ptr);                                                                      \
+    Tvec v2 = vsx_ld(4, ptr);                                                                      \
+    Tvec v3 = vsx_ld(8, ptr);                                                                      \
+    static const vec_uchar16 flp = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31};   \
+    a = vec_perm(v1, vec_sld(v3, v2, 8), flp);                                                     \
+    static const vec_uchar16 flp2 = {28, 29, 30, 31, 0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19};  \
+    b = vec_perm(v2, vec_sld(v1, v3, 8), flp2);                                                    \
+    c = vec_perm(vec_sld(v2, v1, 8), v3, flp);                                                     \
+}
+VSX_IMPL_ST_INTERLEAVE_3CH_4(uint,  vec_uint4)
+VSX_IMPL_ST_INTERLEAVE_3CH_4(int,   vec_int4)
+VSX_IMPL_ST_INTERLEAVE_3CH_4(float, vec_float4)
+
+#define VSX_IMPL_ST_INTERLEAVE_3CH_2(Tp, Tvec, ld_func, st_func)     \
+VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,    \
+                                     const Tvec& c, Tp* ptr)         \
+{                                                                    \
+    st_func(vec_mergeh(a, b), 0, ptr);                               \
+    st_func(vec_permi(c, a, 1), 2, ptr);                             \
+    st_func(vec_mergel(b, c), 4, ptr);                               \
+}                                                                    \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a,        \
+                                       Tvec& b, Tvec& c)             \
+{                                                                    \
+    Tvec v1 = ld_func(0, ptr);                                       \
+    Tvec v2 = ld_func(2, ptr);                                       \
+    Tvec v3 = ld_func(4, ptr);                                       \
+    a = vec_permi(v1, v2, 1);                                        \
+    b = vec_permi(v1, v3, 2);                                        \
+    c = vec_permi(v2, v3, 1);                                        \
+}
+VSX_IMPL_ST_INTERLEAVE_3CH_2(int64,  vec_dword2,  vsx_ld2, vsx_st2)
+VSX_IMPL_ST_INTERLEAVE_3CH_2(uint64, vec_udword2, vsx_ld2, vsx_st2)
+VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld,  vsx_st)
+
+#endif // CV_VSX
+
+//! @}
+
+#endif // OPENCV_HAL_VSX_UTILS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/cvconfig.h b/3rdparty/opencv/opencv410/build/include/opencv2/cvconfig.h
new file mode 100644
index 000000000000..355adec96b85
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/cvconfig.h
@@ -0,0 +1,152 @@
+#ifndef OPENCV_CVCONFIG_H_INCLUDED
+#define OPENCV_CVCONFIG_H_INCLUDED
+
+/* OpenCV compiled as static or dynamic libs */
+#define BUILD_SHARED_LIBS
+
+/* OpenCV intrinsics optimized code */
+#define CV_ENABLE_INTRINSICS
+
+/* OpenCV additional optimized code */
+/* #undef CV_DISABLE_OPTIMIZATION */
+
+/* Compile for 'real' NVIDIA GPU architectures */
+#define CUDA_ARCH_BIN ""
+
+/* NVIDIA GPU features are used */
+#define CUDA_ARCH_FEATURES ""
+
+/* Compile for 'virtual' NVIDIA PTX architectures */
+#define CUDA_ARCH_PTX ""
+
+/* AMD's Basic Linear Algebra Subprograms Library*/
+/* #undef HAVE_CLAMDBLAS */
+
+/* AMD's OpenCL Fast Fourier Transform Library*/
+/* #undef HAVE_CLAMDFFT */
+
+/* Clp support */
+/* #undef HAVE_CLP */
+
+/* NVIDIA CUDA Runtime API*/
+/* #undef HAVE_CUDA */
+
+/* NVIDIA CUDA Basic Linear Algebra Subprograms (BLAS) API*/
+/* #undef HAVE_CUBLAS */
+
+/* NVIDIA CUDA Deep Neural Network (cuDNN) API*/
+/* #undef HAVE_CUDNN */
+
+/* NVIDIA CUDA Fast Fourier Transform (FFT) API*/
+/* #undef HAVE_CUFFT */
+
+/* DirectX */
+#define HAVE_DIRECTX
+#define HAVE_DIRECTX_NV12
+#define HAVE_D3D11
+#define HAVE_D3D10
+#define HAVE_D3D9
+
+/* Eigen Matrix & Linear Algebra Library */
+/* #undef HAVE_EIGEN */
+
+/* Geospatial Data Abstraction Library */
+/* #undef HAVE_GDAL */
+
+/* Halide support */
+/* #undef HAVE_HALIDE */
+
+/* Vulkan support */
+/* #undef HAVE_VULKAN */
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Intel Integrated Performance Primitives */
+#define HAVE_IPP
+#define HAVE_IPP_ICV
+#define HAVE_IPP_IW
+#define HAVE_IPP_IW_LL
+
+/* JPEG-2000 codec */
+#define HAVE_OPENJPEG
+/* #undef HAVE_JASPER */
+
+/* AVIF codec */
+/* #undef HAVE_AVIF */
+
+/* IJG JPEG codec */
+#define HAVE_JPEG
+
+/* GDCM DICOM codec */
+/* #undef HAVE_GDCM */
+
+/* NVIDIA Video Decoding API*/
+/* #undef HAVE_NVCUVID */
+/* #undef HAVE_NVCUVID_HEADER */
+/* #undef HAVE_DYNLINK_NVCUVID_HEADER */
+
+/* NVIDIA Video Encoding API*/
+/* #undef HAVE_NVCUVENC */
+
+/* OpenCL Support */
+#define HAVE_OPENCL
+/* #undef HAVE_OPENCL_STATIC */
+/* #undef HAVE_OPENCL_SVM */
+
+/* NVIDIA OpenCL D3D Extensions support */
+#define HAVE_OPENCL_D3D11_NV
+
+/* OpenEXR codec */
+#define HAVE_OPENEXR
+
+/* OpenGL support*/
+/* #undef HAVE_OPENGL */
+
+/* PNG codec */
+#define HAVE_PNG
+
+/* PNG codec (libspng) */
+/* #undef HAVE_SPNG */
+
+/* Posix threads (pthreads) */
+/* #undef HAVE_PTHREAD */
+
+/* parallel_for with pthreads */
+/* #undef HAVE_PTHREADS_PF */
+
+/* Intel Threading Building Blocks */
+/* #undef HAVE_TBB */
+
+/* Ste||ar Group High Performance ParallelX */
+/* #undef HAVE_HPX */
+
+/* TIFF codec */
+#define HAVE_TIFF
+
+/* Define if your processor stores words with the most significant byte
+   first (like Motorola and SPARC, unlike Intel and VAX). */
+/* #undef WORDS_BIGENDIAN */
+
+/* VA library (libva) */
+/* #undef HAVE_VA */
+
+/* Intel VA-API/OpenCL */
+/* #undef HAVE_VA_INTEL */
+
+/* Lapack */
+/* #undef HAVE_LAPACK */
+
+/* Library was compiled with functions instrumentation */
+/* #undef ENABLE_INSTRUMENTATION */
+
+/* OpenVX */
+/* #undef HAVE_OPENVX */
+
+/* OpenCV trace utilities */
+#define OPENCV_TRACE
+
+/* Library QR-code decoding */
+/* #undef HAVE_QUIRC */
+
+#endif // OPENCV_CVCONFIG_H_INCLUDED
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/dnn.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/dnn.hpp
new file mode 100644
index 000000000000..97f2fe3ffd8c
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/dnn.hpp
@@ -0,0 +1,78 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_DNN_HPP
+#define OPENCV_DNN_HPP
+
+// This is an umbrella header to include into your project.
+// We are free to change headers layout in dnn subfolder, so please include
+// this header for future compatibility
+
+
+/** @defgroup dnn Deep Neural Network module
+  @{
+    This module contains:
+        - API for new layers creation, layers are building bricks of neural networks;
+        - set of built-in most-useful Layers;
+        - API to construct and modify comprehensive neural networks from layers;
+        - functionality for loading serialized networks models from different frameworks.
+
+    Functionality of this module is designed only for forward pass computations (i.e. network testing).
+    Network training is, in principle, not supported.
+  @}
+*/
+/** @example samples/dnn/classification.cpp
+Check @ref tutorial_dnn_googlenet "the corresponding tutorial" for more details
+*/
+/** @example samples/dnn/colorization.cpp
+*/
+/** @example samples/dnn/object_detection.cpp
+Check @ref tutorial_dnn_yolo "the corresponding tutorial" for more details
+*/
+/** @example samples/dnn/openpose.cpp
+*/
+/** @example samples/dnn/segmentation.cpp
+*/
+/** @example samples/dnn/text_detection.cpp
+*/
+#include <opencv2/dnn/dnn.hpp>
+
+#endif /* OPENCV_DNN_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/dnn/all_layers.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/all_layers.hpp
new file mode 100644
index 000000000000..3301f20fde70
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/all_layers.hpp
@@ -0,0 +1,1196 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_DNN_DNN_ALL_LAYERS_HPP
+#define OPENCV_DNN_DNN_ALL_LAYERS_HPP
+#include <opencv2/dnn.hpp>
+
+namespace cv {
+namespace dnn {
+CV__DNN_INLINE_NS_BEGIN
+//! @addtogroup dnn
+//! @{
+
+/** @defgroup dnnLayerList Partial List of Implemented Layers
+  @{
+  This subsection of dnn module contains information about built-in layers and their descriptions.
+
+  Classes listed here, in fact, provide a C++ API for creating instances of built-in layers.
+  In addition to this way of instantiating layers, there is a more common factory API (see @ref dnnLayerFactory), which allows creating layers dynamically (by name) and registering new ones.
+  You can use both APIs, but the factory API is less convenient for native C++ programming and is basically designed for use inside importers (see @ref readNetFromCaffe(), @ref readNetFromTorch(), @ref readNetFromTensorflow()).
+
+  Built-in layers partially reproduce functionality of corresponding Caffe and Torch7 layers.
+  In particular, the following layers and Caffe importer were tested to reproduce <a href="http://caffe.berkeleyvision.org/tutorial/layers.html">Caffe</a> functionality:
+  - Convolution
+  - Deconvolution
+  - Pooling
+  - InnerProduct
+  - TanH, ReLU, Sigmoid, BNLL, Power, AbsVal
+  - Softmax
+  - Reshape, Flatten, Slice, Split
+  - LRN
+  - MVN
+  - Dropout (since it does nothing on forward pass -))
+*/
+
+    class CV_EXPORTS BlankLayer : public Layer
+    {
+    public:
+        static Ptr<Layer> create(const LayerParams &params);
+    };
+
+    /**
+     * Constant layer produces the same data blob on every forward pass.
+     */
+    class CV_EXPORTS ConstLayer : public Layer
+    {
+    public:
+        static Ptr<Layer> create(const LayerParams &params);
+    };
+
+    //! LSTM recurrent layer
+    class CV_EXPORTS LSTMLayer : public Layer
+    {
+    public:
+        /** Creates instance of LSTM layer */
+        static Ptr<LSTMLayer> create(const LayerParams& params);
+
+        /** @deprecated Use LayerParams::blobs instead.
+        @brief Set trained weights for LSTM layer.
+
+        LSTM behavior on each step is defined by current input, previous output, previous cell state and learned weights.
+
+        Let @f$x_t@f$ be current input, @f$h_t@f$ be current output, @f$c_t@f$ be current state.
+        Then the current output and current cell state are computed as follows:
+        @f{eqnarray*}{
+        h_t &= o_t \odot tanh(c_t),               \\
+        c_t &= f_t \odot c_{t-1} + i_t \odot g_t, \\
+        @f}
+        where @f$\odot@f$ is per-element multiply operation and @f$i_t, f_t, o_t, g_t@f$ are internal gates that are computed using learned weights.
+
+        Gates are computed as follows:
+        @f{eqnarray*}{
+        i_t &= sigmoid&(W_{xi} x_t + W_{hi} h_{t-1} + b_i), \\
+        f_t &= sigmoid&(W_{xf} x_t + W_{hf} h_{t-1} + b_f), \\
+        o_t &= sigmoid&(W_{xo} x_t + W_{ho} h_{t-1} + b_o), \\
+        g_t &= tanh   &(W_{xg} x_t + W_{hg} h_{t-1} + b_g), \\
+        @f}
+        where @f$W_{x?}@f$, @f$W_{h?}@f$ and @f$b_{?}@f$ are learned weights represented as matrices:
+        @f$W_{x?} \in R^{N_h \times N_x}@f$, @f$W_{h?} \in R^{N_h \times N_h}@f$, @f$b_? \in R^{N_h}@f$.
+
+        For simplicity and performance purposes we use @f$ W_x = [W_{xi}; W_{xf}; W_{xo}, W_{xg}] @f$
+        (i.e. @f$W_x@f$ is vertical concatenation of @f$ W_{x?} @f$), @f$ W_x \in R^{4N_h \times N_x} @f$.
+        The same for @f$ W_h = [W_{hi}; W_{hf}; W_{ho}, W_{hg}], W_h \in R^{4N_h \times N_h} @f$
+        and for @f$ b = [b_i; b_f, b_o, b_g]@f$, @f$b \in R^{4N_h} @f$.
+
+        @param Wh is matrix defining how previous output is transformed to internal gates (i.e. according to above mentioned notation is @f$ W_h @f$)
+        @param Wx is matrix defining how current input is transformed to internal gates (i.e. according to above mentioned notation is @f$ W_x @f$)
+        @param b  is bias vector (i.e. according to above mentioned notation is @f$ b @f$)
+        */
+        CV_DEPRECATED virtual void setWeights(const Mat &Wh, const Mat &Wx, const Mat &b) = 0;
+
+        /** @brief Specifies shape of output blob which will be [[`T`], `N`] + @p outTailShape.
+          * @details If this parameter is empty or unset then @p outTailShape = [`Wh`.size(0)] will be used,
+          * where `Wh` is parameter from setWeights().
+          */
+        virtual void setOutShape(const MatShape &outTailShape = MatShape()) = 0;
+
+        /** @deprecated Use flag `use_timestamp_dim` in LayerParams.
+          * @brief Specifies whether to interpret the first dimension of the input blob as the timestamp dimension or as the sample dimension.
+          *
+          * If flag is set to true then shape of input blob will be interpreted as [`T`, `N`, `[data dims]`] where `T` specifies number of timestamps, `N` is number of independent streams.
+          * In this case each forward() call will iterate through `T` timestamps and update layer's state `T` times.
+          *
+          * If flag is set to false then shape of input blob will be interpreted as [`N`, `[data dims]`].
+          * In this case each forward() call will make one iteration and produce one timestamp with shape [`N`, `[out dims]`].
+          */
+        CV_DEPRECATED virtual void setUseTimstampsDim(bool use = true) = 0;
+
+        /** @deprecated Use flag `produce_cell_output` in LayerParams.
+         * @brief If this flag is set to true then layer will produce @f$ c_t @f$ as second output.
+         * @details Shape of the second output is the same as first output.
+         */
+        CV_DEPRECATED virtual void setProduceCellOutput(bool produce = false) = 0;
+
+        /* In the common case it uses a single input with @f$x_t@f$ values to compute output(s) @f$h_t@f$ (and @f$c_t@f$).
+         * @param input should contain packed values @f$x_t@f$
+         * @param output contains computed outputs: @f$h_t@f$ (and @f$c_t@f$ if setProduceCellOutput() flag was set to true).
+         *
+         * If setUseTimstampsDim() is set to true then @p input[0] should have at least two dimensions with the following shape: [`T`, `N`, `[data dims]`],
+         * where `T` specifies number of timestamps, `N` is number of independent streams (i.e. @f$ x_{t_0 + t}^{stream} @f$ is stored inside @p input[0][t, stream, ...]).
+         *
+         * If setUseTimstampsDim() is set to false then @p input[0] should contain single timestamp, its shape should have the form [`N`, `[data dims]`] with at least one dimension.
+         * (i.e. @f$ x_{t}^{stream} @f$ is stored inside @p input[0][stream, ...]).
+        */
+
+        int inputNameToIndex(String inputName) CV_OVERRIDE;
+        int outputNameToIndex(const String& outputName) CV_OVERRIDE;
+    };
+
+    /** @brief GRU recurrent one-layer
+     *
+     * Accepts input sequence and computes the final hidden state for each element in the batch.
+     *
+     * - input[0] containing the features of the input sequence.
+     * input[0] should have shape [`T`, `N`, `data_dims`] where `T` is sequence length, `N` is batch size, `data_dims` is input size
+     * - output would have shape [`T`, `N`, `D` * `hidden_size`] where `D = 2` if layer is bidirectional otherwise `D = 1`
+     *
+     * Depends on the following attributes:
+     * - hidden_size - Number of neurons in the hidden layer
+     * - direction - RNN could be bidirectional or forward
+     *
+     * The final hidden state @f$ h_t @f$ is computed by the following formulas:
+     *
+     @f{eqnarray*}{
+     r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
+     z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
+     n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)}+ b_{hn})) \\
+     h_t = (1 - z_t) \odot n_t + z_t \odot h_{(t-1)} \\
+     @f}
+     * Where @f$x_t@f$ is current input, @f$h_{(t-1)}@f$ is previous or initial hidden state.
+     *
+     * @f$W_{i?}@f$, @f$W_{h?}@f$, @f$b_{i?}@f$ and @f$b_{h?}@f$ are learned weights represented as matrices:
+     * @f$W_{i?} \in R^{N_h \times N_x}@f$, @f$W_{h?} \in R^{N_h \times N_h}@f$, @f$b_{i?}, b_{h?} \in R^{N_h}@f$.
+     *
+     * @f$\odot@f$ is per-element multiply operation.
+    */
+    class CV_EXPORTS GRULayer : public Layer
+    {
+    public:
+        /** Creates instance of GRU layer */
+        static Ptr<GRULayer> create(const LayerParams& params);
+    };
+
+    /** @brief Classical recurrent layer
+
+    Accepts two inputs @f$x_t@f$ and @f$h_{t-1}@f$ and computes two outputs @f$o_t@f$ and @f$h_t@f$.
+
+    - input: should contain packed input @f$x_t@f$.
+    - output: should contain output @f$o_t@f$ (and @f$h_t@f$ if setProduceHiddenOutput() is set to true).
+
+    input[0] should have shape [`T`, `N`, `data_dims`] where `T` and `N` are the number of timestamps and number of independent samples of @f$x_t@f$ respectively.
+
+    output[0] will have shape [`T`, `N`, @f$N_o@f$], where @f$N_o@f$ is number of rows in @f$ W_{ho} @f$ matrix.
+
+    If setProduceHiddenOutput() is set to true then @p output[1] will contain a Mat with shape [`T`, `N`, @f$N_h@f$], where @f$N_h@f$ is number of rows in @f$ W_{hh} @f$ matrix.
+    */
+    class CV_EXPORTS RNNLayer : public Layer
+    {
+    public:
+        /** Creates instance of RNNLayer */
+        static Ptr<RNNLayer> create(const LayerParams& params);
+
+        /** Sets up learned weights.
+
+        Recurrent-layer behavior on each step is defined by current input @f$ x_t @f$, previous state @f$ h_{t-1} @f$ and learned weights as follows:
+        @f{eqnarray*}{
+        h_t &= tanh&(W_{hh} h_{t-1} + W_{xh} x_t + b_h),  \\
+        o_t &= tanh&(W_{ho} h_t + b_o),
+        @f}
+
+        @param Wxh is @f$ W_{xh} @f$ matrix
+        @param bh  is @f$ b_{h}  @f$ vector
+        @param Whh is @f$ W_{hh} @f$ matrix
+        @param Who is @f$ W_{ho} @f$ matrix
+        @param bo  is @f$ b_{o}  @f$ vector
+        */
+        virtual void setWeights(const Mat &Wxh, const Mat &bh, const Mat &Whh, const Mat &Who, const Mat &bo) = 0;
+
+        /** @brief If this flag is set to true then layer will produce @f$ h_t @f$ as second output.
+         * @details Shape of the second output is the same as first output.
+         */
+        virtual void setProduceHiddenOutput(bool produce = false) = 0;
+
+    };
+
+    /** @brief This function performs array summation based
+    * on the Einstein summation convention. The function
+    * allows for concise expressions of various mathematical
+    * operations using subscripts.
+    *
+    * By default, the labels are placed in alphabetical
+    * order at the end of the output.
+    * For example:
+    * if `c = einsum("i,j", a, b)`, then `c[i,j] == a[i]*b[j]`.
+    * However, if `c = einsum("j,i", a, b)`, then `c[i,j] = a[j]*b[i]`.
+    * Alternatively, you can control the output order or prevent
+    * an axis from being summed/force an axis to be summed
+    * by providing indices for the output.
+    * For example:
+    * `diag(a)`         -> `einsum("ii->i", a)`
+    * `sum(a, axis=0)`  -> `einsum("i...->", a)`
+    * Subscripts at the beginning and end may be specified
+    * by putting an ellipsis "..." in the middle.
+    * For instance, the function `einsum("i...i", a)` takes
+    * the diagonal of the first and last dimensions of
+    * the operand, and `einsum("ij...,jk...->ik...")` performs
+    * the matrix product using the first two indices
+    * of each operand instead of the last two.
+    * When there is only one operand, no axes being summed,
+    *  and no output parameter, this function returns
+    * a view into the operand instead of creating a copy.
+     */
+    class CV_EXPORTS EinsumLayer : public Layer
+    {
+    public:
+        static Ptr<EinsumLayer> create(const LayerParams& params);
+    };
+
+    class CV_EXPORTS BaseConvolutionLayer : public Layer
+    {
+    public:
+        CV_DEPRECATED_EXTERNAL Size kernel, stride, pad, dilation, adjustPad;
+        std::vector<size_t> adjust_pads;
+        std::vector<size_t> kernel_size, strides, dilations;
+        std::vector<size_t> pads_begin, pads_end;
+        String padMode;
+        int numOutput;
+    };
+
+    class CV_EXPORTS ConvolutionLayer : public BaseConvolutionLayer
+    {
+    public:
+        static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
+        bool fusedActivation = false;
+        bool fusedAdd = false;
+        bool useWinograd = true; // Flag whether to use Winograd to speed up 3x3 convolution.
+    };
+
+    class CV_EXPORTS ConvolutionLayerInt8 : public BaseConvolutionLayer
+    {
+    public:
+        int input_zp, output_zp;
+        float input_sc, output_sc;
+
+        // quantization type flag. The perChannel default is true, that means it contains the parameters
+        // of per-Channel quantization. Otherwise, that means this layer contains per-Tensor quantized parameters.
+        bool per_channel;
+        bool useWinograd = false; // Flag whether to use Winograd to speed up 3x3 convolution.
+        static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
+    };
+
+    class CV_EXPORTS DeconvolutionLayer : public BaseConvolutionLayer
+    {
+    public:
+        static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
+    };
+
+    class CV_EXPORTS LRNLayer : public Layer
+    {
+    public:
+        int type;
+
+        int size;
+        float alpha, beta, bias;
+        bool normBySize;
+
+        static Ptr<LRNLayer> create(const LayerParams& params);
+    };
+
+
+    /** @brief ArgMax/ArgMin layer
+     * @note returns indices as floats, which means the supported range is [-2^24; 2^24]
+     */
+    class CV_EXPORTS ArgLayer : public Layer
+    {
+    public:
+        static Ptr<ArgLayer> create(const LayerParams& params);
+    };
+
+    /** @brief Gather layer
+     */
+    class CV_EXPORTS GatherLayer : public Layer
+    {
+    public:
+        static Ptr<GatherLayer> create(const LayerParams& params);
+    };
+
+    /** @brief GatherElements layer
+    * GatherElements takes two inputs data and indices of the same rank r >= 1 and an optional attribute axis and works such that:
+    *   output[i][j][k] = data[index[i][j][k]][j][k] if axis = 0 and r = 3
+    *   output[i][j][k] = data[i][index[i][j][k]][k] if axis = 1 and r = 3
+    *   output[i][j][k] = data[i][j][index[i][j][k]] if axis = 2 and r = 3
+    *
+    * Gather, on the other hand, takes a data tensor of rank r >= 1, and indices tensor of rank q, and works such that:
+    *   it gathers the entries along axis dimension of the input data indexed by indices and concatenates them in an output tensor of rank q + (r - 1)
+    *   e.g. If axis = 0, let k = indices[i_{0}, ..., i_{q-1}] then output[i_{0}, ..., i_{q-1}, j_{0}, ..., j_{r-2}] = input[k , j_{0}, ..., j_{r-2}].
+     **/
+    class CV_EXPORTS GatherElementsLayer : public Layer
+    {
+    public:
+        static Ptr<GatherElementsLayer> create(const LayerParams& params);
+    };
+
+    class CV_EXPORTS PoolingLayer : public Layer
+    {
+    public:
+        int type;
+        std::vector<size_t> kernel_size, strides;
+        std::vector<size_t> pads_begin, pads_end;
+        bool globalPooling; //!< Flag is true if at least one of the axes is global pooled.
+        std::vector<bool> isGlobalPooling;
+        bool computeMaxIdx;
+        String padMode;
+        bool ceilMode;
+        // If true for average pooling with padding, divide every output region
+        // by the whole kernel area. Otherwise exclude zero-padded values and divide
+        // by the number of real values.
+        bool avePoolPaddedArea;
+        // ROIPooling parameters.
+        Size pooledSize;
+        float spatialScale;
+        // PSROIPooling parameters.
+        int psRoiOutChannels;
+
+        static Ptr<PoolingLayer> create(const LayerParams& params);
+    };
+
+    class CV_EXPORTS PoolingLayerInt8 : public PoolingLayer
+    {
+    public:
+        int input_zp, output_zp;
+        float input_sc, output_sc;
+        static Ptr<PoolingLayerInt8> create(const LayerParams& params);
+    };
+
+    class CV_EXPORTS ReduceLayer : public Layer
+    {
+    public:
+        static Ptr<ReduceLayer> create(const LayerParams& params);
+    };
+
+    class CV_EXPORTS SoftmaxLayer : public Layer
+    {
+    public:
+        bool logSoftMax;
+
+        static Ptr<SoftmaxLayer> create(const LayerParams& params);
+    };
+
+    class CV_EXPORTS SoftmaxLayerInt8 : public SoftmaxLayer
+    {
+    public:
+        float output_sc;
+        int output_zp;
+        static Ptr<SoftmaxLayerInt8> create(const LayerParams& params);
+    };
+
+    /**
+     * `InnerProduct`, `MatMul` and `Gemm` operations are all implemented by Fully Connected Layer.
+     * Parameter `is_matmul` is used to distinguish `MatMul` and `Gemm` from `InnerProduct`.
+     */
+    class CV_EXPORTS InnerProductLayer : public Layer
+    {
+    public:
+        int axis;
+        static Ptr<InnerProductLayer> create(const LayerParams& params);
+    };
+
+    class CV_EXPORTS InnerProductLayerInt8 : public InnerProductLayer
+    {
+    public:
+        int input_zp, output_zp;
+        float input_sc, output_sc;
+
+        // quantization type flag. The perChannel default is true, that means it contains the parameters
+        // of per-Channel quantization. Otherwise, that means this layer contains per-Tensor quantized parameters.
+        bool per_channel;
+        static Ptr<InnerProductLayerInt8> create(const LayerParams& params);
+    };
+
+    class CV_EXPORTS MVNLayer : public Layer
+    {
+    public:
+        float eps;
+        bool normVariance, acrossChannels;
+
+        static Ptr<MVNLayer> create(const LayerParams& params);
+    };
+
+    /* Reshaping */
+
+    class CV_EXPORTS ReshapeLayer : public Layer
+    {
+    public:
+        MatShape newShapeDesc;
+        Range newShapeRange;
+
+        static Ptr<ReshapeLayer> create(const LayerParams& params);
+    };
+
+    class CV_EXPORTS FlattenLayer : public Layer
+    {
+    public:
+        static Ptr<FlattenLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS QuantizeLayer : public Layer
+    {
+    public:
+        std::vector<float> scales;
+        std::vector<int> zeropoints;
+        static Ptr<QuantizeLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS DequantizeLayer : public Layer
+    {
+    public:
+        std::vector<float> scales;
+        std::vector<int> zeropoints;
+        static Ptr<DequantizeLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS RequantizeLayer : public Layer
+    {
+    public:
+        float scale, shift;
+        static Ptr<RequantizeLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS ConcatLayer : public Layer
+    {
+    public:
+        int axis;
+        /**
+         * @brief Add zero padding in case of concatenation of blobs with different
+         * spatial sizes.
+         *
+         * Details: https://github.com/torch/nn/blob/master/doc/containers.md#depthconcat
+         */
+        bool padding;
+        int paddingValue;
+
+        static Ptr<ConcatLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS SplitLayer : public Layer
+    {
+    public:
+        int outputsCount; //!< Number of copies that will be produced (is ignored when negative).
+
+        static Ptr<SplitLayer> create(const LayerParams &params);
+    };
+
+    /**
+     * Slice layer has several modes:
+     * 1. Caffe mode
+     * @param[in] axis Axis of split operation
+     * @param[in] slice_point Array of split points
+     *
+     * Number of output blobs equals to number of split points plus one. The
+     * first blob is a slice on input from 0 to @p slice_point[0] - 1 by @p axis,
+     * the second output blob is a slice of input from @p slice_point[0] to
+     * @p slice_point[1] - 1 by @p axis and the last output blob is a slice of
+     * input from @p slice_point[-1] up to the end of @p axis size.
+     *
+     * 2. TensorFlow mode
+     * @param begin Vector of start indices
+     * @param size Vector of sizes
+     *
+     * More convenient numpy-like slice. One and only output blob
+     * is a slice `input[begin[0]:begin[0]+size[0], begin[1]:begin[1]+size[1], ...]`
+     *
+     * 3. Torch mode
+     * @param axis Axis of split operation
+     *
+     * Split input blob on the equal parts by @p axis.
+     */
+    class CV_EXPORTS SliceLayer : public Layer
+    {
+    public:
+        /**
+         * @brief Vector of slice ranges.
+         *
+         * The first dimension equals number of output blobs.
+         * Inner vector has slice ranges for the first number of input dimensions.
+         */
+        std::vector<std::vector<Range> > sliceRanges;
+        std::vector<std::vector<int> > sliceSteps;
+        int axis;
+        int num_split;
+
+        static Ptr<SliceLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS PermuteLayer : public Layer
+    {
+    public:
+        static Ptr<PermuteLayer> create(const LayerParams& params);
+    };
+
+    /**
+     * Permute channels of 4-dimensional input blob.
+     * @param group Number of groups to split input channels and pick in turns
+     *              into output blob.
+     *
+     * \f[ groupSize = \frac{number\ of\ channels}{group} \f]
+     * \f[ output(n, c, h, w) = input(n, groupSize \times (c \% group) + \lfloor \frac{c}{group} \rfloor, h, w) \f]
+     * Read more at https://arxiv.org/pdf/1707.01083.pdf
+     */
+    class CV_EXPORTS ShuffleChannelLayer : public Layer
+    {
+    public:
+        static Ptr<Layer> create(const LayerParams& params);
+
+        int group;
+    };
+
+    /**
+     * @brief Adds extra values for specific axes.
+     * @param paddings Vector of paddings in format
+     *                 @code
+     *                 [ pad_before, pad_after,  // [0]th dimension
+     *                   pad_before, pad_after,  // [1]st dimension
+     *                   ...
+     *                   pad_before, pad_after ] // [n]th dimension
+     *                 @endcode
+     *                 that represents number of padded values at every dimension
+     *                 starting from the first one. The rest of dimensions won't
+     *                 be padded.
+     * @param value Value to be padded. Defaults to zero.
+     * @param type Padding type: 'constant', 'reflect'
+     * @param input_dims Torch's parameter. If @p input_dims is not equal to the
+     *                   actual input dimensionality then the `[0]th` dimension
+     *                   is considered as a batch dimension and @p paddings are shifted
+     *                   by one dimension. Defaults to `-1` that means padding
+     *                   corresponding to @p paddings.
+     */
+    class CV_EXPORTS PaddingLayer : public Layer
+    {
+    public:
+        static Ptr<PaddingLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<PaddingLayer> out (declaration only).
+    };
+
+    /* Activations: ActivationLayer is the common base of the activation classes declared after it. */
+    class CV_EXPORTS ActivationLayer : public Layer
+    {
+    public:
+        virtual void forwardSlice(const float* src, float* dst, int len,
+                                  size_t outPlaneSize, int cn0, int cn1) const {} // float overload; the default body is empty (does nothing).
+        virtual void forwardSlice(const int* src, const int* lut, int* dst, int len,
+                                  size_t outPlaneSize, int cn0, int cn1) const {} // int overload with an extra `lut` pointer; default body is empty.
+        virtual void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len,
+                                  size_t outPlaneSize, int cn0, int cn1) const {} // int8_t overload with an extra `lut` pointer; default body is empty.
+    };
+
+    class CV_EXPORTS ReLULayer : public ActivationLayer // ActivationLayer subclass with one public parameter field.
+    {
+    public:
+        float negativeSlope; // NOTE(review): presumably the slope applied to negative inputs - confirm in the implementation.
+
+        static Ptr<ReLULayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<ReLULayer> out (declaration only).
+    };
+
+    class CV_EXPORTS ReLU6Layer : public ActivationLayer // ActivationLayer subclass with two public parameter fields.
+    {
+    public:
+        float minValue, maxValue; // NOTE(review): presumably the lower/upper clamp bounds - confirm in the implementation.
+
+        static Ptr<ReLU6Layer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<ReLU6Layer> out (declaration only).
+    };
+
+    class CV_EXPORTS ChannelsPReLULayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<Layer> create(const LayerParams& params); // Static factory; note it returns the base Ptr<Layer>, not Ptr<ChannelsPReLULayer>.
+    };
+
+    class CV_EXPORTS ELULayer : public ActivationLayer // ActivationLayer subclass with one public parameter field.
+    {
+    public:
+        float alpha; // Public parameter field; its role in the formula is defined by the implementation, not by this header.
+
+        static Ptr<ELULayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<ELULayer> out (declaration only).
+    };
+
+    class CV_EXPORTS TanHLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<TanHLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<TanHLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS SwishLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<SwishLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<SwishLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS MishLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<MishLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<MishLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS SigmoidLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<SigmoidLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<SigmoidLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS BNLLLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<BNLLLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<BNLLLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS AbsLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<AbsLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<AbsLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS PowerLayer : public ActivationLayer // ActivationLayer subclass with three public parameter fields.
+    {
+    public:
+        float power, scale, shift; // Public parameter fields; how they combine is defined by the implementation, not by this header.
+
+        static Ptr<PowerLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<PowerLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS ExpLayer : public ActivationLayer // ActivationLayer subclass with three public parameter fields.
+    {
+    public:
+        float base, scale, shift; // Public parameter fields; how they combine is defined by the implementation, not by this header.
+
+        static Ptr<ExpLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<ExpLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS CeilLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<CeilLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<CeilLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS FloorLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<FloorLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<FloorLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS LogLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<LogLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<LogLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS RoundLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<RoundLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<RoundLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS SqrtLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<SqrtLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<SqrtLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS NotLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<NotLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<NotLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS AcosLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<AcosLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<AcosLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS AcoshLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<AcoshLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<AcoshLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS AsinLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<AsinLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<AsinLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS AsinhLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<AsinhLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<AsinhLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS AtanLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<AtanLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<AtanLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS AtanhLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<AtanhLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<AtanhLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS CosLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<CosLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<CosLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS CoshLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<CoshLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<CoshLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS ErfLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<ErfLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<ErfLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS HardSwishLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<HardSwishLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<HardSwishLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS SinLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<SinLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<SinLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS SinhLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<SinhLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<SinhLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS SoftplusLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<SoftplusLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<SoftplusLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS SoftsignLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<SoftsignLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<SoftsignLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS TanLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<TanLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<TanLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS CeluLayer : public ActivationLayer // ActivationLayer subclass with one public parameter field.
+    {
+    public:
+        float alpha; // Public parameter field; its role in the formula is defined by the implementation, not by this header.
+
+        static Ptr<CeluLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<CeluLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS HardSigmoidLayer : public ActivationLayer // ActivationLayer subclass with two public parameter fields.
+    {
+    public:
+        float alpha; // Public parameter field; its role in the formula is defined by the implementation.
+        float beta; // Public parameter field; its role in the formula is defined by the implementation.
+
+        static Ptr<HardSigmoidLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<HardSigmoidLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS SeluLayer : public ActivationLayer // ActivationLayer subclass with two public parameter fields.
+    {
+    public:
+        float alpha; // Public parameter field; its role in the formula is defined by the implementation.
+        float gamma; // Public parameter field; its role in the formula is defined by the implementation.
+
+        static Ptr<SeluLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<SeluLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS GeluLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<GeluLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<GeluLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS GeluApproximationLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<GeluApproximationLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<GeluApproximationLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS ThresholdedReluLayer : public ActivationLayer // ActivationLayer subclass with one public parameter field.
+    {
+    public:
+        float alpha; // NOTE(review): presumably the threshold value - confirm in the implementation.
+
+        static Ptr<ThresholdedReluLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<ThresholdedReluLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS ActivationLayerInt8 : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<ActivationLayerInt8> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<ActivationLayerInt8> out (declaration only).
+    };
+
+    class CV_EXPORTS SignLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<SignLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<SignLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS ShrinkLayer : public ActivationLayer // ActivationLayer subclass with two public parameter fields.
+    {
+    public:
+        float bias; // Public parameter field; its role in the formula is defined by the implementation.
+        float lambd; // Public parameter field; its role in the formula is defined by the implementation.
+        static Ptr<ShrinkLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<ShrinkLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS ReciprocalLayer : public ActivationLayer // ActivationLayer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<ReciprocalLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<ReciprocalLayer> out (declaration only).
+    };
+
+    /* Layers used in semantic segmentation */
+
+    class CV_EXPORTS CropLayer : public Layer // Layer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<Layer> create(const LayerParams &params); // Static factory; note it returns the base Ptr<Layer>, not Ptr<CropLayer>.
+    };
+
+    /** @brief Element-wise operation on inputs
+
+    Extra optional parameters:
+    - "operation" as string. Values are "sum" (default), "prod", "max", "div", "min"
+    - "coeff" as float array. Specifies weights of inputs for the SUM operation
+    - "output_channels_mode" as string. Values are "same" (default, all inputs must have the same layout), "input_0", "input_0_truncate", "max_input_channels"
+    */
+    class CV_EXPORTS EltwiseLayer : public Layer
+    {
+    public:
+        static Ptr<EltwiseLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<EltwiseLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS EltwiseLayerInt8 : public Layer // Derives from Layer directly (not from EltwiseLayer); create() is the only member declared here.
+    {
+    public:
+        static Ptr<EltwiseLayerInt8> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<EltwiseLayerInt8> out (declaration only).
+    };
+
+    class CV_EXPORTS NaryEltwiseLayer : public Layer // Layer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<NaryEltwiseLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<NaryEltwiseLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS BatchNormLayer : public ActivationLayer // Note: derives from ActivationLayer, so it inherits the forwardSlice() overloads.
+    {
+    public:
+        bool hasWeights, hasBias; // Public flags. NOTE(review): presumably indicate whether learned scale/bias blobs are present - confirm.
+        float epsilon; // Public parameter field. NOTE(review): presumably the variance stabiliser - confirm in the implementation.
+
+        static Ptr<BatchNormLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<BatchNormLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS BatchNormLayerInt8 : public BatchNormLayer // Extends BatchNormLayer with four extra public fields.
+    {
+    public:
+        float input_sc, output_sc; // NOTE(review): presumably input/output quantization scales - confirm in the implementation.
+        int input_zp, output_zp; // NOTE(review): presumably input/output quantization zero points - confirm in the implementation.
+        static Ptr<BatchNormLayerInt8> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<BatchNormLayerInt8> out (declaration only).
+    };
+
+    class CV_EXPORTS MaxUnpoolLayer : public Layer // Layer subclass with three public Size fields.
+    {
+    public:
+        Size poolKernel; // Public Size field. NOTE(review): presumably the pooling kernel size - confirm.
+        Size poolPad; // Public Size field. NOTE(review): presumably the pooling padding - confirm.
+        Size poolStride; // Public Size field. NOTE(review): presumably the pooling stride - confirm.
+
+        static Ptr<MaxUnpoolLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<MaxUnpoolLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS ScaleLayer : public Layer // Layer subclass with three public fields; base class of ScaleLayerInt8.
+    {
+    public:
+        bool hasBias; // Public flag. NOTE(review): presumably whether a bias term is applied - confirm.
+        int axis; // Public field. NOTE(review): presumably the first axis the scale applies to - confirm.
+        String mode; // Public string field; accepted values are not listed in this header.
+
+        static Ptr<ScaleLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<ScaleLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS ScaleLayerInt8 : public ScaleLayer // Extends ScaleLayer with two extra public fields.
+    {
+    public:
+        float output_sc; // NOTE(review): presumably the output quantization scale - confirm in the implementation.
+        int output_zp; // NOTE(review): presumably the output quantization zero point - confirm in the implementation.
+        static Ptr<ScaleLayerInt8> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<ScaleLayerInt8> out (declaration only).
+    };
+
+    class CV_EXPORTS ShiftLayer : public Layer // Layer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<Layer> create(const LayerParams& params); // Static factory; note it returns the base Ptr<Layer>, not Ptr<ShiftLayer>.
+    };
+
+    class CV_EXPORTS ShiftLayerInt8 : public Layer // Derives from Layer directly (not from ShiftLayer); create() is the only member declared here.
+    {
+    public:
+        static Ptr<Layer> create(const LayerParams& params); // Static factory; note it returns the base Ptr<Layer>, not Ptr<ShiftLayerInt8>.
+    };
+
+    class CV_EXPORTS CompareLayer : public Layer // Layer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<Layer> create(const LayerParams& params); // Static factory; note it returns the base Ptr<Layer>, not Ptr<CompareLayer>.
+    };
+
+    class CV_EXPORTS DataAugmentationLayer : public Layer // Layer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<DataAugmentationLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<DataAugmentationLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS CorrelationLayer : public Layer // Layer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<CorrelationLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<CorrelationLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS AccumLayer : public Layer // Layer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<AccumLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<AccumLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS FlowWarpLayer : public Layer // Layer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<FlowWarpLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<FlowWarpLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS PriorBoxLayer : public Layer // Layer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<PriorBoxLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<PriorBoxLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS ReorgLayer : public Layer // Layer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<ReorgLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<ReorgLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS RegionLayer : public Layer // Layer subclass with one public parameter field.
+    {
+    public:
+        float nmsThreshold; // NOTE(review): presumably the non-maximum-suppression threshold - confirm in the implementation.
+
+        static Ptr<RegionLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<RegionLayer> out (declaration only).
+    };
+
+    /**
+     * @brief Detection output layer.
+     *
+     * The layer output size is: @f$ (1 \times 1 \times N \times 7) @f$
+     *    where N is the [keep_top_k] parameter multiplied by the batch size. Each row is:
+     *    [image_id, label, confidence, xmin, ymin, xmax, ymax]
+     *    where image_id is the index of the input image in the batch.
+     */
+    class CV_EXPORTS DetectionOutputLayer : public Layer
+    {
+    public:
+        static Ptr<DetectionOutputLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<DetectionOutputLayer> out (declaration only).
+    };
+
+    /**
+     * @brief \f$ L_p \f$ - normalization layer.
+     * @param p Normalization factor. The most common `p = 1` for \f$ L_1 \f$ -
+     *          normalization or `p = 2` for \f$ L_2 \f$ - normalization or a custom one.
+     * @param eps Parameter \f$ \epsilon \f$ to prevent a division by zero.
+     * @param across_spatial If true, normalize an input across all non-batch dimensions.
+     *                       Otherwise normalize every channel separately.
+     *
+     * Across spatial:
+     * @f[
+     * norm = \sqrt[p]{\epsilon + \sum_{x, y, c} |src(x, y, c)|^p } \\
+     * dst(x, y, c) = \frac{ src(x, y, c) }{norm}
+     * @f]
+     *
+     * Channel wise normalization:
+     * @f[
+     * norm(c) = \sqrt[p]{\epsilon + \sum_{x, y} |src(x, y, c)|^p } \\
+     * dst(x, y, c) = \frac{ src(x, y, c) }{norm(c)}
+     * @f]
+     *
+     * Where `x, y` - spatial coordinates, `c` - channel.
+     *
+     * Every sample in the batch is normalized separately. Optionally,
+     * output is scaled by the trained parameters.
+     */
+    class CV_EXPORTS NormalizeBBoxLayer : public Layer
+    {
+    public:
+        float pnorm, epsilon; // Public fields. NOTE(review): presumably the `p` and `eps` parameters documented above - confirm.
+        CV_DEPRECATED_EXTERNAL bool acrossSpatial; // Marked CV_DEPRECATED_EXTERNAL; name matches the `across_spatial` parameter above.
+
+        static Ptr<NormalizeBBoxLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<NormalizeBBoxLayer> out (declaration only).
+    };
+
+    /**
+     * @brief Resize input 4-dimensional blob by nearest neighbor or bilinear strategy.
+     *
+     * The layer is used to support TensorFlow's resize_nearest_neighbor and resize_bilinear ops.
+     */
+    class CV_EXPORTS ResizeLayer : public Layer
+    {
+    public:
+        static Ptr<ResizeLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<ResizeLayer> out (declaration only).
+    };
+
+    /**
+     * @brief Bilinear resize layer from https://github.com/cdmh/deeplab-public-ver2
+     *
+     * It differs from @ref ResizeLayer in how the output shape and resize scales are computed.
+     */
+    class CV_EXPORTS InterpLayer : public Layer
+    {
+    public:
+        static Ptr<Layer> create(const LayerParams& params); // Static factory; note it returns the base Ptr<Layer>, not Ptr<InterpLayer>.
+    };
+
+    class CV_EXPORTS ProposalLayer : public Layer // Layer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<ProposalLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<ProposalLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS CropAndResizeLayer : public Layer // Layer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<Layer> create(const LayerParams& params); // Static factory; note it returns the base Ptr<Layer>, not Ptr<CropAndResizeLayer>.
+    };
+
+    class CV_EXPORTS CumSumLayer : public Layer // Layer subclass with two public int fields.
+    {
+    public:
+        int exclusive; // Declared as int, not bool. NOTE(review): presumably a 0/1 flag - confirm in the implementation.
+        int reverse; // Declared as int, not bool. NOTE(review): presumably a 0/1 flag - confirm in the implementation.
+
+        static Ptr<CumSumLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<CumSumLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS ScatterLayer : public Layer // Layer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<ScatterLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<ScatterLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS ScatterNDLayer : public Layer // Layer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<ScatterNDLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<ScatterNDLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS TileLayer : public Layer // Layer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<TileLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<TileLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS LayerNormLayer : public Layer // Layer subclass with three public fields, one of them deprecated.
+    {
+    public:
+        CV_DEPRECATED_EXTERNAL bool hasBias; // Deprecated, preserved for compatibility
+        int axis; // Public field. NOTE(review): presumably the first normalized axis - confirm in the implementation.
+        float epsilon; // Public field. NOTE(review): presumably the variance stabiliser - confirm in the implementation.
+
+        static Ptr<LayerNormLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<LayerNormLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS GemmLayer : public Layer { // Layer subclass with four public parameter fields.
+    public:
+        bool trans_a; // Public flag. NOTE(review): presumably "transpose the first operand" - confirm.
+        bool trans_b; // Public flag. NOTE(review): presumably "transpose the second operand" - confirm.
+        float alpha; // Public field. NOTE(review): presumably the scale of the matrix product - confirm.
+        float beta; // Public field. NOTE(review): presumably the scale of the additive term - confirm.
+
+        static Ptr<GemmLayer> create(const LayerParams& params); // Static factory: LayerParams in, Ptr<GemmLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS MatMulLayer : public Layer { // Layer subclass; create() is the only member declared here.
+     public:
+        static Ptr<MatMulLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<MatMulLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS ExpandLayer : public Layer // Layer subclass; create() is the only member declared here.
+    {
+    public:
+        static Ptr<ExpandLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<ExpandLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS InstanceNormLayer : public Layer { // Layer subclass with one public parameter field.
+    public:
+        float epsilon; // Public field. NOTE(review): presumably the variance stabiliser - confirm in the implementation.
+
+        static Ptr<InstanceNormLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<InstanceNormLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS AttentionLayer : public Layer { // Layer subclass; create() is the only member declared here.
+     public:
+        static Ptr<AttentionLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<AttentionLayer> out (declaration only).
+    };
+
+    class CV_EXPORTS GroupNormLayer : public Layer { // Layer subclass; create() is the only member declared here.
+    public:
+        static Ptr<GroupNormLayer> create(const LayerParams &params); // Static factory: LayerParams in, Ptr<GroupNormLayer> out (declaration only).
+    };
+
+//! @}
+//! @}
+CV__DNN_INLINE_NS_END
+}
+}
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/dnn/dict.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/dict.hpp
new file mode 100644
index 000000000000..059ce9b28ef5
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/dict.hpp
@@ -0,0 +1,160 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <opencv2/core.hpp>
+#include <map>
+#include <ostream>
+
+#include <opencv2/dnn/dnn.hpp>
+
+#ifndef OPENCV_DNN_DNN_DICT_HPP
+#define OPENCV_DNN_DNN_DICT_HPP
+
+namespace cv {
+namespace dnn {
+CV__DNN_INLINE_NS_BEGIN
+//! @addtogroup dnn
+//! @{
+
+/** @brief This struct stores the scalar value (or array) of one of the following types: double, cv::String or int64.
+ *  @todo Maybe int64 is useless because double type exactly stores at least 2^52 integers.
+ */
+struct CV_EXPORTS_W DictValue
+{
+    DictValue(const DictValue &r); //!< Copy constructor (declaration only).
+    explicit DictValue(bool i)           : type(Param::INT), pi(new AutoBuffer<int64,1>) { (*pi)[0] = i ? 1 : 0; }       //!< Constructs integer scalar (1 for true, 0 for false)
+    explicit DictValue(int64 i = 0)      : type(Param::INT), pi(new AutoBuffer<int64,1>) { (*pi)[0] = i; }       //!< Constructs integer scalar; also serves as the default constructor (value 0)
+    CV_WRAP explicit DictValue(int i)    : type(Param::INT), pi(new AutoBuffer<int64,1>) { (*pi)[0] = i; }       //!< Constructs integer scalar
+    explicit DictValue(unsigned p)       : type(Param::INT), pi(new AutoBuffer<int64,1>) { (*pi)[0] = p; }       //!< Constructs integer scalar
+    CV_WRAP explicit DictValue(double p)         : type(Param::REAL), pd(new AutoBuffer<double,1>) { (*pd)[0] = p; }     //!< Constructs floating point scalar
+    CV_WRAP explicit DictValue(const String &s)  : type(Param::STRING), ps(new AutoBuffer<String,1>) { (*ps)[0] = s; }   //!< Constructs string scalar
+    explicit DictValue(const char *s)            : type(Param::STRING), ps(new AutoBuffer<String,1>) { (*ps)[0] = s; }   //!< @overload
+
+    template<typename TypeIter>
+    static DictValue arrayInt(TypeIter begin, int size);    //!< Constructs integer array
+    template<typename TypeIter>
+    static DictValue arrayReal(TypeIter begin, int size);   //!< Constructs floating point array
+    template<typename TypeIter>
+    static DictValue arrayString(TypeIter begin, int size); //!< Constructs array of strings
+
+    template<typename T>
+    T get(int idx = -1) const; //!< Tries to convert the array element with the specified index to the requested type and returns it.
+
+    int size() const; // Declaration only. NOTE(review): presumably the number of stored elements - confirm in the implementation.
+
+    CV_WRAP bool isInt() const;
+    CV_WRAP bool isString() const;
+    CV_WRAP bool isReal() const;
+
+    CV_WRAP int getIntValue(int idx = -1) const; // Note: returns int although integers are stored as int64.
+    CV_WRAP double getRealValue(int idx = -1) const;
+    CV_WRAP String getStringValue(int idx = -1) const;
+
+    DictValue &operator=(const DictValue &r);
+
+    friend std::ostream &operator<<(std::ostream &stream, const DictValue &dictv);
+
+    ~DictValue();
+
+private:
+
+    Param type; // Type tag; every inline constructor sets it together with the matching union pointer below.
+
+    union
+    {
+        AutoBuffer<int64, 1> *pi; // Set by the Param::INT constructors.
+        AutoBuffer<double, 1> *pd; // Set by the Param::REAL constructor.
+        AutoBuffer<String, 1> *ps; // Set by the Param::STRING constructors.
+        void *pv; // Untyped alias of the same pointer; set by the private constructor below.
+    };
+
+    DictValue(Param _type, void *_p) : type(_type), pv(_p) {} // Private: stores the given tag and raw buffer pointer as-is.
+    void release(); // Declaration only.
+};
+
+/** @brief This class implements a name-value dictionary; values are instances of DictValue. */
+class CV_EXPORTS Dict
+{
+    typedef std::map<String, DictValue> _Dict; // Underlying container type: map keyed by String.
+    _Dict dict; // Private storage for all key-value pairs.
+
+public:
+
+    //! Checks the presence of the @p key in the dictionary.
+    bool has(const String &key) const;
+
+    //! If the @p key is in the dictionary, returns a pointer to its value; otherwise returns NULL.
+    DictValue *ptr(const String &key);
+
+    /** @overload */
+    const DictValue *ptr(const String &key) const;
+
+    //! If the @p key is in the dictionary, returns its value; otherwise an error is generated.
+    const DictValue &get(const String &key) const;
+
+    /** @overload */
+    template <typename T>
+    T get(const String &key) const;
+
+    //! If the @p key is in the dictionary, returns its value; otherwise returns @p defaultValue.
+    template <typename T>
+    T get(const String &key, const T &defaultValue) const;
+
+    //! Sets a new @p value for the @p key, or adds a new key-value pair to the dictionary.
+    template<typename T>
+    const T &set(const String &key, const T &value);
+
+    //! Erases the @p key from the dictionary.
+    void erase(const String &key);
+
+    friend std::ostream &operator<<(std::ostream &stream, const Dict &dict);
+
+    std::map<String, DictValue>::const_iterator begin() const; // Const iterator over the underlying map type (declaration only).
+
+    std::map<String, DictValue>::const_iterator end() const; // Const iterator over the underlying map type (declaration only).
+};
+
+//! @}
+CV__DNN_INLINE_NS_END
+}
+}
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/dnn/dnn.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/dnn.hpp
new file mode 100644
index 000000000000..fd9129010475
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/dnn.hpp
@@ -0,0 +1,1946 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_DNN_DNN_HPP
+#define OPENCV_DNN_DNN_HPP
+
+#include <vector>
+#include <opencv2/core.hpp>
+#include "opencv2/core/async.hpp"
+
+#include "../dnn/version.hpp"
+
+#include <opencv2/dnn/dict.hpp>
+
+namespace cv {
+namespace dnn {
+
+namespace accessor {
+class DnnNetAccessor;  // forward declaration
+}
+
+CV__DNN_INLINE_NS_BEGIN
+//! @addtogroup dnn
+//! @{
+
+    typedef std::vector<int> MatShape; // Alias for a vector of int; used below as the `shape` argument of BackendWrapper.
+
+    /**
+     * @brief Enum of computation backends supported by layers.
+     * @see Net::setPreferableBackend
+     */
+    enum Backend
+    {
+        //! DNN_BACKEND_DEFAULT is equal to OPENCV_DNN_BACKEND_DEFAULT, which can be defined using CMake or a configuration parameter
+        DNN_BACKEND_DEFAULT = 0,
+        DNN_BACKEND_HALIDE,
+        DNN_BACKEND_INFERENCE_ENGINE,            //!< Intel OpenVINO computational backend
+                                                 //!< @note Tutorial how to build OpenCV with OpenVINO: @ref tutorial_dnn_openvino
+        DNN_BACKEND_OPENCV,
+        DNN_BACKEND_VKCOM,
+        DNN_BACKEND_CUDA,
+        DNN_BACKEND_WEBNN,
+        DNN_BACKEND_TIMVX,
+        DNN_BACKEND_CANN,
+#if defined(__OPENCV_BUILD) || defined(BUILD_PLUGIN)
+#if !defined(OPENCV_BINDING_PARSER)
+        DNN_BACKEND_INFERENCE_ENGINE_NGRAPH = 1000000,     // internal - use DNN_BACKEND_INFERENCE_ENGINE + setInferenceEngineBackendType(); numbering restarts at 1000000
+        DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019,      // internal - use DNN_BACKEND_INFERENCE_ENGINE + setInferenceEngineBackendType()
+#endif // !defined(OPENCV_BINDING_PARSER)
+#endif // defined(__OPENCV_BUILD) || defined(BUILD_PLUGIN)
+    };
+
+    /**
+     * @brief Enum of target devices for computations.
+     * @see Net::setPreferableTarget
+     */
+    enum Target
+    {
+        DNN_TARGET_CPU = 0,
+        DNN_TARGET_OPENCL,
+        DNN_TARGET_OPENCL_FP16,
+        DNN_TARGET_MYRIAD,
+        DNN_TARGET_VULKAN,
+        DNN_TARGET_FPGA,  //!< FPGA device with CPU fallbacks using Inference Engine's Heterogeneous plugin.
+        DNN_TARGET_CUDA,
+        DNN_TARGET_CUDA_FP16,
+        DNN_TARGET_HDDL,
+        DNN_TARGET_NPU,
+        DNN_TARGET_CPU_FP16, // Only the ARM platform is supported. Low-precision computing that accelerates model inference.
+    };
+
+    /**
+     * @brief Enum of data layouts for model inference.
+     * @see Image2BlobParams
+     */
+    enum DataLayout
+    {
+        DNN_LAYOUT_UNKNOWN = 0,
+        DNN_LAYOUT_ND = 1,        //!< OpenCV data layout for 2D data.
+        DNN_LAYOUT_NCHW = 2,      //!< OpenCV data layout for 4D data.
+        DNN_LAYOUT_NCDHW = 3,      //!< OpenCV data layout for 5D data.
+        DNN_LAYOUT_NHWC = 4,      //!< Tensorflow-like data layout for 4D data.
+        DNN_LAYOUT_NDHWC = 5,      //!< Tensorflow-like data layout for 5D data.
+        DNN_LAYOUT_PLANAR = 6,     //!< Tensorflow-like data layout; it should only be used during TF or TFLite model parsing.
+    };
+
+    CV_EXPORTS std::vector< std::pair<Backend, Target> > getAvailableBackends(); // Returns a list of (Backend, Target) pairs; declaration only, not wrapped for bindings (CV_EXPORTS, not CV_EXPORTS_W).
+    CV_EXPORTS_W std::vector<Target> getAvailableTargets(dnn::Backend be); // Returns a list of Target values for the given backend @p be; declaration only.
+
+    /**
+     * @brief Enables detailed logging of the DNN model loading with CV DNN API.
+     * @param[in] isDiagnosticsMode Indicates whether diagnostic mode should be set.
+     *
+     * Diagnostic mode provides detailed logging of the model loading stage to explore
+     * potential problems (e.g. a layer type that is not implemented).
+     *
+     * @note In diagnostic mode a series of assertions will be skipped, which can lead to
+     * an application crash.
+     */
+    CV_EXPORTS void enableModelDiagnostics(bool isDiagnosticsMode);
+
+    /** @brief This class provides all data needed to initialize a layer.
+     *
+     * It includes a dictionary with scalar params (which can be read by using the Dict interface),
+     * blob params #blobs and optional meta information: #name and #type of the layer instance.
+    */
+    class CV_EXPORTS LayerParams : public Dict
+    {
+    public:
+        //TODO: Add ability to name blob params
+        std::vector<Mat> blobs; //!< List of learned parameters stored as blobs.
+
+        String name; //!< Name of the layer instance (optional, can be used for internal purposes).
+        String type; //!< Type name which was used for creating the layer by the layer factory (optional).
+    };
+
+   /**
+    * @brief Classes derived from this one encapsulate functions of certain backends.
+    */
+    class BackendNode
+    {
+    public:
+        explicit BackendNode(int backendId);
+
+        virtual ~BackendNode(); //!< Virtual destructor to allow polymorphic use.
+
+        int backendId; //!< Backend identifier.
+    };
+
+    /**
+     * @brief Classes derived from this one wrap cv::Mat for different backends and targets.
+     */
+    class BackendWrapper
+    {
+    public:
+        BackendWrapper(int backendId, int targetId);
+
+        /**
+         * @brief Wrap cv::Mat for specific backend and target.
+         * @param[in] targetId Target identifier.
+         * @param[in] m cv::Mat for wrapping.
+         *
+         * Make a CPU->GPU data transfer if it's required for the target.
+         */
+        BackendWrapper(int targetId, const cv::Mat& m);
+
+        /**
+         * @brief Make wrapper for reused cv::Mat.
+         * @param[in] base Wrapper of cv::Mat that will be reused.
+         * @param[in] shape Specific shape.
+         *
+         * Initialize wrapper from another one. It'll wrap the same host CPU
+         * memory and mustn't allocate memory on the device (i.e. GPU). It might
+         * have a different shape. Use it when CPU memory is reused, so that the
+         * associated memory on the device is reused too.
+         */
+        BackendWrapper(const Ptr<BackendWrapper>& base, const MatShape& shape);
+
+        virtual ~BackendWrapper(); //!< Virtual destructor to allow polymorphic use.
+
+        /**
+         * @brief Transfer data to CPU host memory.
+         */
+        virtual void copyToHost() = 0;
+
+        /**
+         * @brief Indicate that the actual data is on the CPU.
+         */
+        virtual void setHostDirty() = 0;
+
+        int backendId;  //!< Backend identifier.
+        int targetId;   //!< Target identifier.
+    };
+
+    class CV_EXPORTS ActivationLayer;
+
+    /** @brief This interface class allows building new Layers, which are the building blocks of networks.
+     *
+     * Each class derived from Layer must implement the forward() method to compute outputs.
+     * Also, before using the new layer in networks you must register it by using one of the @ref dnnLayerFactory "LayerFactory" macros.
+     */
+    class CV_EXPORTS_W Layer : public Algorithm
+    {
+    public:
+
+        //! List of learned parameters; they must be stored here so that they can be read by using Net::getParam().
+        CV_PROP_RW std::vector<Mat> blobs;
+
+        /** @brief Computes and sets internal parameters according to inputs, outputs and blobs.
+         *  @deprecated Use Layer::finalize(InputArrayOfArrays, OutputArrayOfArrays) instead
+         *  @param[in]  input  vector of already allocated input blobs
+         *  @param[out] output vector of already allocated output blobs
+         *
+         * This method is called after the network has allocated all memory for input and output blobs
+         * and before inference.
+         */
+        CV_DEPRECATED_EXTERNAL
+        virtual void finalize(const std::vector<Mat*> &input, std::vector<Mat> &output);
+
+        /** @brief Computes and sets internal parameters according to inputs, outputs and blobs.
+         *  @param[in]  inputs  vector of already allocated input blobs
+         *  @param[out] outputs vector of already allocated output blobs
+         *
+         * This method is called after the network has allocated all memory for input and output blobs
+         * and before inference.
+         */
+        CV_WRAP virtual void finalize(InputArrayOfArrays inputs, OutputArrayOfArrays outputs);
+
+        /** @brief Given the @p input blobs, computes the output @p blobs.
+         *  @deprecated Use Layer::forward(InputArrayOfArrays, OutputArrayOfArrays, OutputArrayOfArrays) instead
+         *  @param[in]  input  the input blobs.
+         *  @param[out] output allocated output blobs, which will store results of the computation.
+         *  @param[out] internals allocated internal blobs
+         */
+        CV_DEPRECATED_EXTERNAL
+        virtual void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals);
+
+        /** @brief Given the @p input blobs, computes the output @p blobs.
+         *  @param[in]  inputs  the input blobs.
+         *  @param[out] outputs allocated output blobs, which will store results of the computation.
+         *  @param[out] internals allocated internal blobs
+         */
+        virtual void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals);
+
+        /** @brief Tries to quantize the given layer and compute the quantization parameters required for a fixed point implementation.
+         *  @param[in] scales input and output scales.
+         *  @param[in] zeropoints input and output zeropoints.
+         *  @param[out] params Quantized parameters required for a fixed point implementation of that layer.
+         *  @returns True if the layer can be quantized.
+         */
+        virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                                 const std::vector<std::vector<int> > &zeropoints, LayerParams& params);
+
+        /** @brief Given the @p input blobs, computes the output @p blobs.
+         *  @param[in]  inputs  the input blobs.
+         *  @param[out] outputs allocated output blobs, which will store results of the computation.
+         *  @param[out] internals allocated internal blobs
+         */
+        void forward_fallback(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals);
+
+        /** @brief
+         * @overload
+         * @deprecated Use Layer::finalize(InputArrayOfArrays, OutputArrayOfArrays) instead
+         */
+        CV_DEPRECATED_EXTERNAL
+        void finalize(const std::vector<Mat> &inputs, CV_OUT std::vector<Mat> &outputs);
+
+        /** @brief
+         * @overload
+         * @deprecated Use Layer::finalize(InputArrayOfArrays, OutputArrayOfArrays) instead
+         */
+        CV_DEPRECATED std::vector<Mat> finalize(const std::vector<Mat> &inputs);
+
+        /** @brief Allocates layer and computes output.
+         *  @deprecated This method will be removed in a future release.
+         */
+        CV_DEPRECATED CV_WRAP void run(const std::vector<Mat> &inputs, CV_OUT std::vector<Mat> &outputs,
+                                       CV_IN_OUT std::vector<Mat> &internals);
+
+        /** @brief Returns the index of the input blob in the input array.
+         *  @param inputName label of input blob
+         *
+         * Each layer input and output can be labeled to easily identify them using "%<layer_name%>[.output_name]" notation.
+         * This method maps the label of an input blob to its index in the input vector.
+         */
+        virtual int inputNameToIndex(String inputName);  // FIXIT const
+        /** @brief Returns the index of the output blob in the output array.
+         *  @see inputNameToIndex()
+         */
+        CV_WRAP virtual int outputNameToIndex(const String& outputName);  // FIXIT const
+
+        /**
+         * @brief Ask the layer if it supports a specific backend for doing computations.
+         * @param[in] backendId computation backend identifier.
+         * @see Backend
+         */
+        virtual bool supportBackend(int backendId);  // FIXIT const
+
+        /**
+         * @brief Returns Halide backend node.
+         * @param[in] inputs Input Halide buffers.
+         * @see BackendNode, BackendWrapper
+         *
+         * Input buffers should be exactly the same ones that will be used in forward invocations.
+         * Although we could use Halide::ImageParam based on input shape only,
+         * it helps prevent some memory management issues (if something is wrong,
+         * Halide tests will fail).
+         */
+        virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs);
+
+        virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs, const std::vector<Ptr<BackendNode> >& nodes);
+
+        virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs, std::vector<Ptr<BackendWrapper> > &outputs);
+
+        virtual Ptr<BackendNode> initWebnn(const std::vector<Ptr<BackendWrapper> > &inputs, const std::vector<Ptr<BackendNode> >& nodes);
+
+        /**
+         * @brief Returns a CUDA backend node
+         *
+         * @param   context  void pointer to CSLContext object
+         * @param   inputs   layer inputs
+         * @param   outputs  layer outputs
+         */
+        virtual Ptr<BackendNode> initCUDA(
+            void *context,
+            const std::vector<Ptr<BackendWrapper>>& inputs,
+            const std::vector<Ptr<BackendWrapper>>& outputs
+        );
+
+        /**
+         * @brief Returns a TimVX backend node
+         *
+         * @param   timVxInfo  void pointer to CSLContext object (NOTE(review): same wording as initCUDA(); confirm the pointee type)
+         * @param   inputsWrapper   layer inputs
+         * @param   outputsWrapper  layer outputs
+         * @param   isLast if the node is the last one of the TimVX Graph.
+         */
+        virtual Ptr<BackendNode> initTimVX(void* timVxInfo,
+                                           const std::vector<Ptr<BackendWrapper> > &inputsWrapper,
+                                           const std::vector<Ptr<BackendWrapper> > &outputsWrapper,
+                                           bool isLast);
+
+        /**
+         * @brief Returns a CANN backend node
+         *
+         * @param   inputs   input tensors of CANN operator
+         * @param   outputs  output tensors of CANN operator
+         * @param   nodes           nodes of input tensors
+         */
+        virtual Ptr<BackendNode> initCann(const std::vector<Ptr<BackendWrapper> > &inputs,
+                                          const std::vector<Ptr<BackendWrapper> > &outputs,
+                                          const std::vector<Ptr<BackendNode> >& nodes);
+
+       /**
+        * @brief Automatic Halide scheduling based on layer hyper-parameters.
+        * @param[in] node Backend node with Halide functions.
+        * @param[in] inputs Blobs that will be used in forward invocations.
+        * @param[in] outputs Blobs that will be used in forward invocations.
+        * @param[in] targetId Target identifier
+        * @see BackendNode, Target
+        *
+        * Layers don't use their own Halide::Func members because layer fusing
+        * may have been applied. In that case the fused function should be scheduled.
+        */
+        virtual void applyHalideScheduler(Ptr<BackendNode>& node,
+                                          const std::vector<Mat*> &inputs,
+                                          const std::vector<Mat> &outputs,
+                                          int targetId) const;
+
+        /**
+         * @brief Implement layers fusing.
+         * @param[in] node Backend node of bottom layer.
+         * @see BackendNode
+         *
+         * Relevant for graph-based backends. If the layer is attached successfully,
+         * returns a non-empty cv::Ptr to a node of the same backend.
+         * Fuse only over the last function.
+         */
+        virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node);
+
+        /**
+         * @brief Tries to attach the subsequent activation layer to this layer, i.e. do the layer fusion in a partial case.
+         * @param[in] layer The subsequent activation layer.
+         *
+         * Returns true if the activation layer has been attached successfully.
+         */
+        virtual bool setActivation(const Ptr<ActivationLayer>& layer);
+
+        /**
+         * @brief Try to fuse the current layer with the next one
+         * @param[in] top Next layer to be fused.
+         * @returns True if fusion was performed.
+         */
+        virtual bool tryFuse(Ptr<Layer>& top);
+
+        /**
+         * @brief Returns parameters of layers with channel-wise multiplication and addition.
+         * @param[out] scale Channel-wise multipliers. Total number of values should
+         *                   be equal to number of channels.
+         * @param[out] shift Channel-wise offsets. Total number of values should
+         *                   be equal to number of channels.
+         *
+         * Some layers can fuse their transformations with further layers,
+         * for example convolution + batch normalization. This way the base layer
+         * uses weights from the layer after it, and the fused layer is skipped.
+         * By default, @p scale and @p shift are empty, which means the layer has no
+         * element-wise multiplications or additions.
+         */
+        virtual void getScaleShift(Mat& scale, Mat& shift) const;
+
+        /**
+         * @brief Returns scale and zeropoint of layers
+         * @param[out] scale Output scale
+         * @param[out] zeropoint Output zeropoint
+         *
+         * By default, @p scale is 1 and @p zeropoint is 0.
+         */
+        virtual void getScaleZeropoint(float& scale, int& zeropoint) const;
+
+
+        /**
+         * @brief "Detaches" all the layers attached to a particular layer.
+         */
+        virtual void unsetAttached();
+
+        virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                                     const int requiredOutputs,
+                                     std::vector<MatShape> &outputs,
+                                     std::vector<MatShape> &internals) const;
+
+        virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
+                               const std::vector<MatShape> &outputs) const {CV_UNUSED(inputs); CV_UNUSED(outputs); return 0;} //!< Default implementation ignores its arguments and returns 0.
+
+        virtual bool updateMemoryShapes(const std::vector<MatShape> &inputs);
+
+        CV_PROP String name; //!< Name of the layer instance, can be used for logging or other internal purposes.
+        CV_PROP String type; //!< Type name which was used for creating layer by layer factory.
+        CV_PROP int preferableTarget; //!< Preferred target for layer forwarding.
+
+        Layer();
+        explicit Layer(const LayerParams &params);      //!< Initializes only #name, #type and #blobs fields.
+        void setParamsFrom(const LayerParams &params);  //!< Initializes only #name, #type and #blobs fields.
+        virtual ~Layer();
+    };
+
+    /** @brief This class allows to create and manipulate comprehensive artificial neural networks.
+     *
+     * A neural network is represented as a directed acyclic graph (DAG), where vertices are Layer instances,
+     * and edges specify relationships between layers inputs and outputs.
+     *
+     * Each network layer has unique integer id and unique string name inside its network.
+     * LayerId can store either layer name or layer id.
+     *
+     * This class supports reference counting of its instances, i. e. copies point to the same instance.
+     */
+    class CV_EXPORTS_W_SIMPLE Net
+    {
+    public:
+
+        CV_WRAP Net();  //!< Default constructor.
+        CV_WRAP ~Net(); //!< Destructor frees the net only if there aren't references to the net anymore.
+
+        /** @brief Create a network from Intel's Model Optimizer intermediate representation (IR).
+         *  @param[in] xml XML configuration file with network's topology.
+         *  @param[in] bin Binary file with trained weights.
+         *  Networks imported from Intel's Model Optimizer are launched in Intel's Inference Engine
+         *  backend.
+         */
+        CV_WRAP static Net readFromModelOptimizer(CV_WRAP_FILE_PATH const String& xml, CV_WRAP_FILE_PATH const String& bin);
+
+        /** @brief Create a network from Intel's Model Optimizer in-memory buffers with intermediate representation (IR).
+         *  @param[in] bufferModelConfig buffer with model's configuration.
+         *  @param[in] bufferWeights buffer with model's trained weights.
+         *  @returns Net object.
+         */
+        CV_WRAP static
+        Net readFromModelOptimizer(const std::vector<uchar>& bufferModelConfig, const std::vector<uchar>& bufferWeights);
+
+        /** @brief Create a network from Intel's Model Optimizer in-memory buffers with intermediate representation (IR).
+         *  @param[in] bufferModelConfigPtr buffer pointer of model's configuration.
+         *  @param[in] bufferModelConfigSize buffer size of model's configuration.
+         *  @param[in] bufferWeightsPtr buffer pointer of model's trained weights.
+         *  @param[in] bufferWeightsSize buffer size of model's trained weights.
+         *  @returns Net object.
+         */
+        static
+        Net readFromModelOptimizer(const uchar* bufferModelConfigPtr, size_t bufferModelConfigSize,
+                                            const uchar* bufferWeightsPtr, size_t bufferWeightsSize);
+
+        /** Returns true if there are no layers in the network. */
+        CV_WRAP bool empty() const;
+
+        /** @brief Dump net to String
+         *  @returns String with structure, hyperparameters, backend, target and fusion
+         *  Call method after setInput(). To see correct backend, target and fusion run after forward().
+         */
+        CV_WRAP String dump();
+        /** @brief Dump net structure, hyperparameters, backend, target and fusion to dot file
+         *  @param path   path to output file with .dot extension
+         *  @see dump()
+         */
+        CV_WRAP void dumpToFile(CV_WRAP_FILE_PATH const String& path);
+        /** @brief Dump net structure, hyperparameters, backend, target and fusion to pbtxt file
+         *  @param path   path to output file with .pbtxt extension
+         *
+         *  Use Netron (https://netron.app) to open the target file to visualize the model.
+         *  Call method after setInput(). To see correct backend, target and fusion run after forward().
+        */
+        CV_WRAP void dumpToPbtxt(CV_WRAP_FILE_PATH const String& path);
+
+        /** @brief Adds new layer to the net.
+         *  @param name   unique name of the adding layer.
+         *  @param type   typename of the adding layer (type must be registered in LayerRegister).
+         *  @param dtype  datatype of output blobs.
+         *  @param params parameters which will be used to initialize the creating layer.
+         *  @returns unique identifier of created layer, or -1 on failure.
+         */
+        int addLayer(const String &name, const String &type, const int &dtype, LayerParams &params);
+
+        /** @overload Datatype of output blobs set to default CV_32F */
+        int addLayer(const String &name, const String &type, LayerParams &params);
+
+        /** @brief Adds new layer and connects its first input to the first output of previously added layer.
+         *  @see addLayer()
+         */
+        int addLayerToPrev(const String &name, const String &type, const int &dtype, LayerParams &params);
+
+        /** @overload */
+        int addLayerToPrev(const String &name, const String &type, LayerParams &params);
+
+        /** @brief Converts string name of the layer to the integer identifier.
+         *  @returns id of the layer, or -1 if the layer wasn't found.
+         */
+        CV_WRAP int getLayerId(const String &layer) const;
+
+        CV_WRAP std::vector<String> getLayerNames() const;
+
+        /** @brief Container for strings and integers.
+         *
+         * @deprecated Use getLayerId() with int result.
+         */
+        typedef DictValue LayerId;
+
+        /** @brief Returns pointer to layer with specified id or name which the network use. */
+        CV_WRAP Ptr<Layer> getLayer(int layerId) const;
+        /** @overload
+         *  @deprecated Use int getLayerId(const String &layer)
+         */
+        CV_WRAP inline Ptr<Layer> getLayer(const String& layerName) const { return getLayer(getLayerId(layerName)); }
+        /** @overload
+         *  @deprecated to be removed
+         */
+        CV_WRAP Ptr<Layer> getLayer(const LayerId& layerId) const;
+
+        /** @brief Returns pointers to input layers of specific layer. */
+        std::vector<Ptr<Layer> > getLayerInputs(int layerId) const; // FIXIT: CV_WRAP
+
+        /** @brief Connects output of the first layer to input of the second layer.
+         *  @param outPin descriptor of the first layer output.
+         *  @param inpPin descriptor of the second layer input.
+         *
+         * Descriptors have the following template <DFN>&lt;layer_name&gt;[.input_number]</DFN>:
+         * - the first part of the template <DFN>layer_name</DFN> is string name of the added layer.
+         *   If this part is empty then the network input pseudo layer will be used;
+         * - the second optional part of the template <DFN>input_number</DFN>
+         *   is either the number of the layer input or its label.
+         *   If this part is omitted then the first layer input will be used.
+         *
+         *  @see setNetInputs(), Layer::inputNameToIndex(), Layer::outputNameToIndex()
+         */
+        CV_WRAP void connect(String outPin, String inpPin);
+
+        /** @brief Connects #@p outNum output of the first layer to #@p inNum input of the second layer.
+         *  @param outLayerId identifier of the first layer
+         *  @param outNum number of the first layer output
+         *  @param inpLayerId identifier of the second layer
+         *  @param inpNum number of the second layer input
+         */
+        void connect(int outLayerId, int outNum, int inpLayerId, int inpNum);
+
+        /** @brief Registers network output with name
+         *
+         *  Function may create additional 'Identity' layer.
+         *
+         *  @param outputName identifier of the output
+         *  @param layerId identifier of the second layer
+         *  @param outputPort number of the second layer input
+         *
+         *  @returns index of bound layer (the same as layerId or newly created)
+         */
+        int registerOutput(const std::string& outputName, int layerId, int outputPort);
+
+        /** @brief Sets outputs names of the network input pseudo layer.
+         *
+         * Each net always has its own special network input pseudo layer with id=0.
+         * This layer stores the user blobs only and doesn't perform any computations.
+         * In fact, this layer provides the only way to pass user data into the network.
+         * As any other layer, this layer can label its outputs and this function provides an easy way to do this.
+         */
+        CV_WRAP void setInputsNames(const std::vector<String> &inputBlobNames);
+
+        /** @brief Specify shape of network input.
+         */
+        CV_WRAP void setInputShape(const String &inputName, const MatShape& shape);
+
+        /** @brief Runs forward pass to compute output of layer with name @p outputName.
+         *  @param outputName name for layer which output is needed to get
+         *  @return blob for first output of specified layer.
+         *  @details By default runs forward pass for the whole network.
+         */
+        CV_WRAP Mat forward(const String& outputName = String());
+
+        /** @brief Runs forward pass to compute output of layer with name @p outputName.
+         *  @param outputName name for layer which output is needed to get
+         *  @details By default runs forward pass for the whole network.
+         *
+         *  This is an asynchronous version of forward(const String&).
+         *  dnn::DNN_BACKEND_INFERENCE_ENGINE backend is required.
+         */
+        CV_WRAP AsyncArray forwardAsync(const String& outputName = String());
+
+        /** @brief Runs forward pass to compute output of layer with name @p outputName.
+         *  @param outputBlobs contains all output blobs for specified layer.
+         *  @param outputName name for layer which output is needed to get
+         *  @details If @p outputName is empty, runs forward pass for the whole network.
+         */
+        CV_WRAP void forward(OutputArrayOfArrays outputBlobs, const String& outputName = String());
+
+        /** @brief Runs forward pass to compute outputs of layers listed in @p outBlobNames.
+         *  @param outputBlobs contains blobs for first outputs of specified layers.
+         *  @param outBlobNames names for layers which outputs are needed to get
+         */
+        CV_WRAP void forward(OutputArrayOfArrays outputBlobs,
+                             const std::vector<String>& outBlobNames);
+
+        /** @brief Runs forward pass to compute outputs of layers listed in @p outBlobNames.
+         *  @param outputBlobs contains all output blobs for each layer specified in @p outBlobNames.
+         *  @param outBlobNames names for layers which outputs are needed to get
+         */
+        CV_WRAP_AS(forwardAndRetrieve) void forward(CV_OUT std::vector<std::vector<Mat> >& outputBlobs,
+                                                    const std::vector<String>& outBlobNames);
+
+        /** @brief Returns a quantized Net from a floating-point Net.
+         *  @param calibData Calibration data to compute the quantization parameters.
+         *  @param inputsDtype Datatype of quantized net's inputs. Can be CV_32F or CV_8S.
+         *  @param outputsDtype Datatype of quantized net's outputs. Can be CV_32F or CV_8S.
+         *  @param perChannel Quantization granularity of quantized Net. The default is true, that means quantize model
+         *  in per-channel way (channel-wise). Set it false to quantize model in per-tensor way (or tensor-wise).
+         */
+        CV_WRAP Net quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype, bool perChannel=true);
+
+        /** @brief Returns input scale and zeropoint for a quantized Net.
+         *  @param scales output parameter for returning input scales.
+         *  @param zeropoints output parameter for returning input zeropoints.
+         */
+        CV_WRAP void getInputDetails(CV_OUT std::vector<float>& scales, CV_OUT std::vector<int>& zeropoints) const;
+
+        /** @brief Returns output scale and zeropoint for a quantized Net.
+         *  @param scales output parameter for returning output scales.
+         *  @param zeropoints output parameter for returning output zeropoints.
+         */
+        CV_WRAP void getOutputDetails(CV_OUT std::vector<float>& scales, CV_OUT std::vector<int>& zeropoints) const;
+
+        /**
+         * @brief Compile Halide layers.
+         * @param[in] scheduler Path to YAML file with scheduling directives.
+         * @see setPreferableBackend
+         *
+         * Schedule layers that support Halide backend. Then compile them for
+         * specific target. For layers that are not represented in the scheduling file
+         * or if no manual scheduling used at all, automatic scheduling will be applied.
+         */
+        CV_WRAP void setHalideScheduler(const String& scheduler);
+
+        /**
+         * @brief Ask network to use specific computation backend where it is supported.
+         * @param[in] backendId backend identifier.
+         * @see Backend
+         */
+        CV_WRAP void setPreferableBackend(int backendId);
+
+        /**
+         * @brief Ask network to make computations on specific target device.
+         * @param[in] targetId target identifier.
+         * @see Target
+         *
+         * List of supported combinations backend / target:
+         * |                        | DNN_BACKEND_OPENCV | DNN_BACKEND_INFERENCE_ENGINE | DNN_BACKEND_HALIDE |  DNN_BACKEND_CUDA |
+         * |------------------------|--------------------|------------------------------|--------------------|-------------------|
+         * | DNN_TARGET_CPU         |                  + |                            + |                  + |                   |
+         * | DNN_TARGET_OPENCL      |                  + |                            + |                  + |                   |
+         * | DNN_TARGET_OPENCL_FP16 |                  + |                            + |                    |                   |
+         * | DNN_TARGET_MYRIAD      |                    |                            + |                    |                   |
+         * | DNN_TARGET_FPGA        |                    |                            + |                    |                   |
+         * | DNN_TARGET_CUDA        |                    |                              |                    |                 + |
+         * | DNN_TARGET_CUDA_FP16   |                    |                              |                    |                 + |
+         * | DNN_TARGET_HDDL        |                    |                            + |                    |                   |
+         */
+        CV_WRAP void setPreferableTarget(int targetId);
+
+        /** @brief Sets the new input value for the network
+         *  @param blob        A new blob. Should have CV_32F or CV_8U depth.
+         *  @param name        A name of input layer.
+         *  @param scalefactor An optional normalization scale.
+         *  @param mean        An optional mean subtraction values.
+         *  @see connect(String, String) to know format of the descriptor.
+         *
+         *  If scale or mean values are specified, a final input blob is computed
+         *  as:
+         * \f[input(n,c,h,w) = scalefactor \times (blob(n,c,h,w) - mean_c)\f]
+         */
+        CV_WRAP void setInput(InputArray blob, const String& name = "",
+                              double scalefactor = 1.0, const Scalar& mean = Scalar());
+
+        /** @brief Sets the new value for the learned param of the layer.
+         *  @param layer name or id of the layer.
+         *  @param numParam index of the layer parameter in the Layer::blobs array.
+         *  @param blob the new value.
+         *  @see Layer::blobs
+         *  @note If shape of the new blob differs from the previous shape,
+         *  then the following forward pass may fail.
+        */
+        CV_WRAP void setParam(int layer, int numParam, const Mat &blob);
+        CV_WRAP inline void setParam(const String& layerName, int numParam, const Mat &blob) { return setParam(getLayerId(layerName), numParam, blob); }
+
+        /** @brief Returns parameter blob of the layer.
+         *  @param layer name or id of the layer.
+         *  @param numParam index of the layer parameter in the Layer::blobs array.
+         *  @see Layer::blobs
+         */
+        CV_WRAP Mat getParam(int layer, int numParam = 0) const;
+        CV_WRAP inline Mat getParam(const String& layerName, int numParam = 0) const { return getParam(getLayerId(layerName), numParam); }
+
+        /** @brief Returns indexes of layers with unconnected outputs.
+         *
+         * FIXIT: Rework API to registerOutput() approach, deprecate this call
+         */
+        CV_WRAP std::vector<int> getUnconnectedOutLayers() const;
+
+        /** @brief Returns names of layers with unconnected outputs.
+         *
+         * FIXIT: Rework API to registerOutput() approach, deprecate this call
+         */
+        CV_WRAP std::vector<String> getUnconnectedOutLayersNames() const;
+
+        /** @brief Returns input and output shapes for all layers in loaded model;
+         *  preliminary inferencing isn't necessary.
+         *  @param netInputShapes shapes for all input blobs in net input layer.
+         *  @param layersIds output parameter for layer IDs.
+         *  @param inLayersShapes output parameter for input layers shapes;
+         * order is the same as in layersIds
+         *  @param outLayersShapes output parameter for output layers shapes;
+         * order is the same as in layersIds
+         */
+        CV_WRAP void getLayersShapes(const std::vector<MatShape>& netInputShapes,
+                                     CV_OUT std::vector<int>& layersIds,
+                                     CV_OUT std::vector<std::vector<MatShape> >& inLayersShapes,
+                                     CV_OUT std::vector<std::vector<MatShape> >& outLayersShapes) const;
+
+        /** @overload */
+        CV_WRAP void getLayersShapes(const MatShape& netInputShape,
+                                     CV_OUT std::vector<int>& layersIds,
+                                     CV_OUT std::vector<std::vector<MatShape> >& inLayersShapes,
+                                     CV_OUT std::vector<std::vector<MatShape> >& outLayersShapes) const;
+
+        /** @brief Returns input and output shapes for layer with specified
+         * id in loaded model; preliminary inferencing isn't necessary.
+         *  @param netInputShape shape input blob in net input layer.
+         *  @param layerId id for layer.
+         *  @param inLayerShapes output parameter for input layers shapes;
+         * order is the same as in layersIds
+         *  @param outLayerShapes output parameter for output layers shapes;
+         * order is the same as in layersIds
+         */
+        void getLayerShapes(const MatShape& netInputShape,
+                                    const int layerId,
+                                    CV_OUT std::vector<MatShape>& inLayerShapes,
+                                    CV_OUT std::vector<MatShape>& outLayerShapes) const; // FIXIT: CV_WRAP
+
+        /** @overload */
+        void getLayerShapes(const std::vector<MatShape>& netInputShapes,
+                                    const int layerId,
+                                    CV_OUT std::vector<MatShape>& inLayerShapes,
+                                    CV_OUT std::vector<MatShape>& outLayerShapes) const; // FIXIT: CV_WRAP
+
+        /** @brief Computes FLOP for whole loaded model with specified input shapes.
+         * @param netInputShapes vector of shapes for all net inputs.
+         * @returns computed FLOP.
+         */
+        CV_WRAP int64 getFLOPS(const std::vector<MatShape>& netInputShapes) const;
+        /** @overload */
+        CV_WRAP int64 getFLOPS(const MatShape& netInputShape) const;
+        /** @overload */
+        CV_WRAP int64 getFLOPS(const int layerId,
+                               const std::vector<MatShape>& netInputShapes) const;
+        /** @overload */
+        CV_WRAP int64 getFLOPS(const int layerId,
+                               const MatShape& netInputShape) const;
+
+        /** @brief Returns list of types for layer used in model.
+         * @param layersTypes output parameter for returning types.
+         */
+        CV_WRAP void getLayerTypes(CV_OUT std::vector<String>& layersTypes) const;
+
+        /** @brief Returns count of layers of specified type.
+         * @param layerType type.
+         * @returns count of layers
+         */
+        CV_WRAP int getLayersCount(const String& layerType) const;
+
+        /** @brief Computes the number of bytes required to store
+         * all weights and intermediate blobs for the model.
+         * @param netInputShapes vector of shapes for all net inputs.
+         * @param weights output parameter to store the resulting number of bytes for weights.
+         * @param blobs output parameter to store the resulting number of bytes for intermediate blobs.
+         */
+        void getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
+                                          CV_OUT size_t& weights, CV_OUT size_t& blobs) const; // FIXIT: CV_WRAP
+        /** @overload */
+        CV_WRAP void getMemoryConsumption(const MatShape& netInputShape,
+                                          CV_OUT size_t& weights, CV_OUT size_t& blobs) const;
+        /** @overload */
+        CV_WRAP void getMemoryConsumption(const int layerId,
+                                          const std::vector<MatShape>& netInputShapes,
+                                          CV_OUT size_t& weights, CV_OUT size_t& blobs) const;
+        /** @overload */
+        CV_WRAP void getMemoryConsumption(const int layerId,
+                                          const MatShape& netInputShape,
+                                          CV_OUT size_t& weights, CV_OUT size_t& blobs) const;
+
+        /** @brief Computes the number of bytes required to store
+         * all weights and intermediate blobs for each layer.
+         * @param netInputShapes vector of shapes for all net inputs.
+         * @param layerIds output vector to save layer IDs.
+         * @param weights output parameter to store the resulting number of bytes for weights.
+         * @param blobs output parameter to store the resulting number of bytes for intermediate blobs.
+         */
+        void getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
+                                          CV_OUT std::vector<int>& layerIds,
+                                          CV_OUT std::vector<size_t>& weights,
+                                          CV_OUT std::vector<size_t>& blobs) const; // FIXIT: CV_WRAP
+        /** @overload */
+        void getMemoryConsumption(const MatShape& netInputShape,
+                                          CV_OUT std::vector<int>& layerIds,
+                                          CV_OUT std::vector<size_t>& weights,
+                                          CV_OUT std::vector<size_t>& blobs) const; // FIXIT: CV_WRAP
+
+        /** @brief Enables or disables layer fusion in the network.
+         * @param fusion true to enable the fusion, false to disable. The fusion is enabled by default.
+         */
+        CV_WRAP void enableFusion(bool fusion);
+
+        /** @brief Enables or disables the Winograd compute branch. The Winograd compute branch can speed up
+         * 3x3 Convolution at a small loss of accuracy.
+         * @param useWinograd true to enable the Winograd compute branch. The default is true.
+         */
+        CV_WRAP void enableWinograd(bool useWinograd);
+
+        /** @brief Returns overall time for inference and timings (in ticks) for layers.
+         *
+         * Indexes in the returned vector correspond to layer ids. Some layers can be fused with others;
+         * in that case a zero tick count is returned for the skipped layers. Supported by DNN_BACKEND_OPENCV on DNN_TARGET_CPU only.
+         *
+         * @param[out] timings vector for tick timings for all layers.
+         * @return overall ticks for model inference.
+         */
+        CV_WRAP int64 getPerfProfile(CV_OUT std::vector<double>& timings);
+
+
+        struct Impl;
+        inline Impl* getImpl() const { return impl.get(); } //!< Returns the raw pointer held by @c impl (result of impl.get()).
+        inline Impl& getImplRef() const { CV_DbgAssert(impl); return *impl.get(); } //!< Dereferences @c impl after checking it with CV_DbgAssert.
+        friend class accessor::DnnNetAccessor;
+    protected:
+        Ptr<Impl> impl;
+    };
+
+    /** @brief Reads a network model stored in <a href="https://pjreddie.com/darknet/">Darknet</a> model files.
+    *  @param cfgFile      path to the .cfg file with text description of the network architecture.
+    *  @param darknetModel path to the .weights file with learned network.
+    *  @returns Network object that is ready to do forward; throws an exception in failure cases.
+    */
+    CV_EXPORTS_W Net readNetFromDarknet(CV_WRAP_FILE_PATH const String &cfgFile, CV_WRAP_FILE_PATH const String &darknetModel = String());
+
+    /** @brief Reads a network model stored in <a href="https://pjreddie.com/darknet/">Darknet</a> model files.
+     *  @param bufferCfg   A buffer containing the content of the .cfg file with text description of the network architecture.
+     *  @param bufferModel A buffer containing the content of the .weights file with learned network.
+     *  @returns Net object.
+     */
+    CV_EXPORTS_W Net readNetFromDarknet(const std::vector<uchar>& bufferCfg,
+                                        const std::vector<uchar>& bufferModel = std::vector<uchar>());
+
+    /** @brief Reads a network model stored in <a href="https://pjreddie.com/darknet/">Darknet</a> model files.
+     *  @param bufferCfg   A buffer containing the content of the .cfg file with text description of the network architecture.
+     *  @param lenCfg      Number of bytes to read from bufferCfg
+     *  @param bufferModel A buffer containing the content of the .weights file with learned network.
+     *  @param lenModel    Number of bytes to read from bufferModel
+     *  @returns Net object.
+     */
+    CV_EXPORTS Net readNetFromDarknet(const char *bufferCfg, size_t lenCfg,
+                                      const char *bufferModel = NULL, size_t lenModel = 0);
+
+    /** @brief Reads a network model stored in <a href="http://caffe.berkeleyvision.org">Caffe</a> framework's format.
+      * @param prototxt   path to the .prototxt file with text description of the network architecture.
+      * @param caffeModel path to the .caffemodel file with learned network.
+      * @returns Net object.
+      */
+    CV_EXPORTS_W Net readNetFromCaffe(CV_WRAP_FILE_PATH const String &prototxt, CV_WRAP_FILE_PATH const String &caffeModel = String());
+
+    /** @brief Reads a network model stored in Caffe model in memory.
+      * @param bufferProto buffer containing the content of the .prototxt file
+      * @param bufferModel buffer containing the content of the .caffemodel file
+      * @returns Net object.
+      */
+    CV_EXPORTS_W Net readNetFromCaffe(const std::vector<uchar>& bufferProto,
+                                      const std::vector<uchar>& bufferModel = std::vector<uchar>());
+
+    /** @brief Reads a network model stored in Caffe model in memory.
+      * @details This is an overloaded member function, provided for convenience.
+      * It differs from the above function only in what argument(s) it accepts.
+      * @param bufferProto buffer containing the content of the .prototxt file
+      * @param lenProto length of bufferProto
+      * @param bufferModel buffer containing the content of the .caffemodel file
+      * @param lenModel length of bufferModel
+      * @returns Net object.
+      */
+    CV_EXPORTS Net readNetFromCaffe(const char *bufferProto, size_t lenProto,
+                                    const char *bufferModel = NULL, size_t lenModel = 0);
+
+    /** @brief Reads a network model stored in <a href="https://www.tensorflow.org/">TensorFlow</a> framework's format.
+      * @param model  path to the .pb file with binary protobuf description of the network architecture
+      * @param config path to the .pbtxt file that contains text graph definition in protobuf format.
+      *               The resulting Net object is built from the text graph using weights from the
+      *               binary one, which makes it more flexible.
+      * @returns Net object.
+      */
+    CV_EXPORTS_W Net readNetFromTensorflow(CV_WRAP_FILE_PATH const String &model, CV_WRAP_FILE_PATH const String &config = String());
+
+    /** @brief Reads a network model stored in <a href="https://www.tensorflow.org/">TensorFlow</a> framework's format.
+      * @param bufferModel buffer containing the content of the pb file
+      * @param bufferConfig buffer containing the content of the pbtxt file
+      * @returns Net object.
+      */
+    CV_EXPORTS_W Net readNetFromTensorflow(const std::vector<uchar>& bufferModel,
+                                           const std::vector<uchar>& bufferConfig = std::vector<uchar>());
+
+    /** @brief Reads a network model stored in <a href="https://www.tensorflow.org/">TensorFlow</a> framework's format.
+      * @details This is an overloaded member function, provided for convenience.
+      * It differs from the above function only in what argument(s) it accepts.
+      * @param bufferModel buffer containing the content of the pb file
+      * @param lenModel length of bufferModel
+      * @param bufferConfig buffer containing the content of the pbtxt file
+      * @param lenConfig length of bufferConfig
+      */
+    CV_EXPORTS Net readNetFromTensorflow(const char *bufferModel, size_t lenModel,
+                                         const char *bufferConfig = NULL, size_t lenConfig = 0);
+
+    /** @brief Reads a network model stored in <a href="https://www.tensorflow.org/lite">TFLite</a> framework's format.
+      * @param model  path to the .tflite file with binary flatbuffers description of the network architecture
+      * @returns Net object.
+      */
+    CV_EXPORTS_W Net readNetFromTFLite(CV_WRAP_FILE_PATH const String &model);
+
+    /** @brief Reads a network model stored in <a href="https://www.tensorflow.org/lite">TFLite</a> framework's format.
+      * @param bufferModel buffer containing the content of the tflite file
+      * @returns Net object.
+      */
+    CV_EXPORTS_W Net readNetFromTFLite(const std::vector<uchar>& bufferModel);
+
+    /** @brief Reads a network model stored in <a href="https://www.tensorflow.org/lite">TFLite</a> framework's format.
+      * @details This is an overloaded member function, provided for convenience.
+      * It differs from the above function only in what argument(s) it accepts.
+      * @param bufferModel buffer containing the content of the tflite file
+      * @param lenModel length of bufferModel
+      */
+    CV_EXPORTS Net readNetFromTFLite(const char *bufferModel, size_t lenModel);
+
+    /**
+     *  @brief Reads a network model stored in <a href="http://torch.ch">Torch7</a> framework's format.
+     *  @param model    path to the file, dumped from Torch by using torch.save() function.
+     *  @param isBinary specifies whether the network was serialized in ascii mode or binary.
+     *  @param evaluate specifies testing phase of network. If true, it's similar to evaluate() method in Torch.
+     *  @returns Net object.
+     *
+     *  @note Ascii mode of the Torch serializer is preferable, because binary mode extensively uses the `long` type of C language,
+     *  which has various bit-lengths on different systems.
+     *
+     * The loaded file must contain a serialized <a href="https://github.com/torch/nn/blob/master/doc/module.md">nn.Module</a> object
+     * with the network to import. Try to eliminate custom objects from the serialized data to avoid importing errors.
+     *
+     * List of supported layers (i.e. object instances derived from Torch nn.Module class):
+     * - nn.Sequential
+     * - nn.Parallel
+     * - nn.Concat
+     * - nn.Linear
+     * - nn.SpatialConvolution
+     * - nn.SpatialMaxPooling, nn.SpatialAveragePooling
+     * - nn.ReLU, nn.TanH, nn.Sigmoid
+     * - nn.Reshape
+     * - nn.SoftMax, nn.LogSoftMax
+     *
+     * Also some equivalents of these classes from cunn, cudnn, and fbcunn may be successfully imported.
+     */
+     CV_EXPORTS_W Net readNetFromTorch(CV_WRAP_FILE_PATH const String &model, bool isBinary = true, bool evaluate = true);
+
+     /**
+      * @brief Read deep learning network represented in one of the supported formats.
+      * @param[in] model Binary file containing trained weights. The following file
+      *                  extensions are expected for models from different frameworks:
+      *                  * `*.caffemodel` (Caffe, http://caffe.berkeleyvision.org/)
+      *                  * `*.pb` (TensorFlow, https://www.tensorflow.org/)
+      *                  * `*.t7` | `*.net` (Torch, http://torch.ch/)
+      *                  * `*.weights` (Darknet, https://pjreddie.com/darknet/)
+      *                  * `*.bin` | `*.onnx` (OpenVINO, https://software.intel.com/openvino-toolkit)
+      *                  * `*.onnx` (ONNX, https://onnx.ai/)
+      * @param[in] config Text file containing network configuration. It could be a
+      *                   file with the following extensions:
+      *                  * `*.prototxt` (Caffe, http://caffe.berkeleyvision.org/)
+      *                  * `*.pbtxt` (TensorFlow, https://www.tensorflow.org/)
+      *                  * `*.cfg` (Darknet, https://pjreddie.com/darknet/)
+      *                  * `*.xml` (OpenVINO, https://software.intel.com/openvino-toolkit)
+      * @param[in] framework Explicit framework name tag to determine a format.
+      * @returns Net object.
+      *
+      * This function automatically detects the origin framework of the trained model
+      * and calls an appropriate function such as @ref readNetFromCaffe, @ref readNetFromTensorflow,
+      * @ref readNetFromTorch or @ref readNetFromDarknet. The order of the @p model and @p config
+      * arguments does not matter.
+      */
+     CV_EXPORTS_W Net readNet(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "", const String& framework = "");
+
+     /**
+      * @brief Read deep learning network represented in one of the supported formats.
+      * @details This is an overloaded member function, provided for convenience.
+      *          It differs from the above function only in what argument(s) it accepts.
+      * @param[in] framework    Name of origin framework.
+      * @param[in] bufferModel  A buffer with the content of the binary file with weights
+      * @param[in] bufferConfig A buffer with the content of the text file containing network configuration.
+      * @returns Net object.
+      */
+     CV_EXPORTS_W Net readNet(const String& framework, const std::vector<uchar>& bufferModel,
+                              const std::vector<uchar>& bufferConfig = std::vector<uchar>());
+
+    /** @brief Loads a blob which was serialized as a torch.Tensor object of the Torch7 framework.
+     *  @warning This function has the same limitations as readNetFromTorch().
+     */
+    CV_EXPORTS_W Mat readTorchBlob(const String &filename, bool isBinary = true);
+
+    /** @brief Load a network from Intel's Model Optimizer intermediate representation.
+     *  @param[in] xml XML configuration file with network's topology.
+     *  @param[in] bin Binary file with trained weights.
+     *  @returns Net object.
+     *  Networks imported from Intel's Model Optimizer are launched in Intel's Inference Engine
+     *  backend.
+     */
+    CV_EXPORTS_W
+    Net readNetFromModelOptimizer(CV_WRAP_FILE_PATH const String &xml, CV_WRAP_FILE_PATH const String &bin = "");
+
+    /** @brief Load a network from Intel's Model Optimizer intermediate representation.
+     *  @param[in] bufferModelConfig Buffer contains XML configuration with network's topology.
+     *  @param[in] bufferWeights Buffer contains binary data with trained weights.
+     *  @returns Net object.
+     *  Networks imported from Intel's Model Optimizer are launched in Intel's Inference Engine
+     *  backend.
+     */
+    CV_EXPORTS_W
+    Net readNetFromModelOptimizer(const std::vector<uchar>& bufferModelConfig, const std::vector<uchar>& bufferWeights);
+
+    /** @brief Load a network from Intel's Model Optimizer intermediate representation.
+     *  @param[in] bufferModelConfigPtr Pointer to buffer which contains XML configuration with network's topology.
+     *  @param[in] bufferModelConfigSize Binary size of XML configuration data.
+     *  @param[in] bufferWeightsPtr Pointer to buffer which contains binary data with trained weights.
+     *  @param[in] bufferWeightsSize Binary size of trained weights data.
+     *  @returns Net object.
+     *  Networks imported from Intel's Model Optimizer are launched in Intel's Inference Engine
+     *  backend.
+     */
+    CV_EXPORTS
+    Net readNetFromModelOptimizer(const uchar* bufferModelConfigPtr, size_t bufferModelConfigSize,
+                                           const uchar* bufferWeightsPtr, size_t bufferWeightsSize);
+
+    /** @brief Reads a network model stored in <a href="https://onnx.ai/">ONNX</a> format.
+     *  @param onnxFile path to the .onnx file with text description of the network architecture.
+     *  @returns Network object that is ready to do forward; throws an exception in failure cases.
+     */
+    CV_EXPORTS_W Net readNetFromONNX(CV_WRAP_FILE_PATH const String &onnxFile);
+
+    /** @brief Reads a network model from an <a href="https://onnx.ai/">ONNX</a>
+     *         in-memory buffer.
+     *  @param buffer memory address of the first byte of the buffer.
+     *  @param sizeBuffer size of the buffer.
+     *  @returns Network object that is ready to do forward; throws an exception
+     *        in failure cases.
+     */
+    CV_EXPORTS Net readNetFromONNX(const char* buffer, size_t sizeBuffer);
+
+    /** @brief Reads a network model from an <a href="https://onnx.ai/">ONNX</a>
+     *         in-memory buffer.
+     *  @param buffer in-memory buffer that stores the ONNX model bytes.
+     *  @returns Network object that is ready to do forward; throws an exception
+     *        in failure cases.
+     */
+    CV_EXPORTS_W Net readNetFromONNX(const std::vector<uchar>& buffer);
+
+    /** @brief Creates blob from .pb file.
+     *  @param path path to the .pb file with input tensor.
+     *  @returns Mat.
+     */
+    CV_EXPORTS_W Mat readTensorFromONNX(CV_WRAP_FILE_PATH const String& path);
+
+    /** @brief Creates 4-dimensional blob from image. Optionally resizes and crops @p image from center,
+     *  subtracts @p mean values, scales values by @p scalefactor, swaps Blue and Red channels.
+     *  @param image input image (with 1-, 3- or 4-channels).
+     *  @param scalefactor multiplier for @p image values.
+     *  @param size spatial size for output image
+     *  @param mean scalar with mean values which are subtracted from channels. Values are intended
+     *  to be in (mean-R, mean-G, mean-B) order if @p image has BGR ordering and @p swapRB is true.
+     *  @param swapRB flag which indicates that swapping the first and last channels
+     *  in a 3-channel image is necessary.
+     *  @param crop flag which indicates whether image will be cropped after resize or not
+     *  @param ddepth Depth of output blob. Choose CV_32F or CV_8U.
+     *  @details if @p crop is true, input image is resized so one side after resize is equal to corresponding
+     *  dimension in @p size and another one is equal or larger. Then, crop from the center is performed.
+     *  If @p crop is false, direct resize without cropping and preserving aspect ratio is performed.
+     *  @returns 4-dimensional Mat with NCHW dimensions order.
+     *
+     * @note
+     * The order and usage of `scalefactor` and `mean` are (input - mean) * scalefactor.
+     */
+    CV_EXPORTS_W Mat blobFromImage(InputArray image, double scalefactor=1.0, const Size& size = Size(),
+                                   const Scalar& mean = Scalar(), bool swapRB=false, bool crop=false,
+                                   int ddepth=CV_32F);
+
+    /** @brief Creates 4-dimensional blob from image.
+     *  @details This is an overloaded member function, provided for convenience.
+     *           It differs from the above function only in what argument(s) it accepts.
+     */
+    CV_EXPORTS void blobFromImage(InputArray image, OutputArray blob, double scalefactor=1.0,
+                                  const Size& size = Size(), const Scalar& mean = Scalar(),
+                                  bool swapRB=false, bool crop=false, int ddepth=CV_32F);
+
+
+    /** @brief Creates 4-dimensional blob from series of images. Optionally resizes and
+     *  crops @p images from center, subtracts @p mean values, scales values by @p scalefactor,
+     *  swaps Blue and Red channels.
+     *  @param images input images (all with 1-, 3- or 4-channels).
+     *  @param size spatial size for output image
+     *  @param mean scalar with mean values which are subtracted from channels. Values are intended
+     *  to be in (mean-R, mean-G, mean-B) order if @p images have BGR ordering and @p swapRB is true.
+     *  @param scalefactor multiplier for @p images values.
+     *  @param swapRB flag which indicates that swapping the first and last channels
+     *  in a 3-channel image is necessary.
+     *  @param crop flag which indicates whether image will be cropped after resize or not
+     *  @param ddepth Depth of output blob. Choose CV_32F or CV_8U.
+     *  @details if @p crop is true, input image is resized so one side after resize is equal to corresponding
+     *  dimension in @p size and another one is equal or larger. Then, crop from the center is performed.
+     *  If @p crop is false, direct resize without cropping and preserving aspect ratio is performed.
+     *  @returns 4-dimensional Mat with NCHW dimensions order.
+     *
+     * @note
+     * The order and usage of `scalefactor` and `mean` are (input - mean) * scalefactor.
+     */
+    CV_EXPORTS_W Mat blobFromImages(InputArrayOfArrays images, double scalefactor=1.0,
+                                    Size size = Size(), const Scalar& mean = Scalar(), bool swapRB=false, bool crop=false,
+                                    int ddepth=CV_32F);
+
+    /** @brief Creates 4-dimensional blob from series of images.
+     *  @details This is an overloaded member function, provided for convenience.
+     *           It differs from the above function only in what argument(s) it accepts.
+     */
+    CV_EXPORTS void blobFromImages(InputArrayOfArrays images, OutputArray blob,
+                                   double scalefactor=1.0, Size size = Size(),
+                                   const Scalar& mean = Scalar(), bool swapRB=false, bool crop=false,
+                                   int ddepth=CV_32F);
+
+    /**
+     * @brief Enum of image processing mode.
+     * To facilitate the specialized pre-processing requirements of the dnn model.
+     * For example, the `letter box` often used in the Yolo series of models.
+     * @see Image2BlobParams
+     */
+    enum ImagePaddingMode
+    {
+        DNN_PMODE_NULL = 0,        //!< Default. Resize to required input size without extra processing.
+        DNN_PMODE_CROP_CENTER = 1, //!< Image will be cropped after resize.
+        DNN_PMODE_LETTERBOX = 2,   //!< Resize image to the desired size while preserving the aspect ratio of original image.
+    };
+
+    /** @brief Processing params of image to blob.
+     *
+     * It includes all possible image processing operations and corresponding parameters.
+     *
+     * @see blobFromImageWithParams
+     *
+     * @note
+     * The order and usage of `scalefactor` and `mean` are (input - mean) * scalefactor.
+     * The order and usage of `scalefactor`, `size`, `mean`, `swapRB`, and `ddepth` are consistent
+     * with those of the function @ref blobFromImage.
+     */
+    struct CV_EXPORTS_W_SIMPLE Image2BlobParams
+    {
+        CV_WRAP Image2BlobParams();
+        CV_WRAP Image2BlobParams(const Scalar& scalefactor, const Size& size = Size(), const Scalar& mean = Scalar(),
+                            bool swapRB = false, int ddepth = CV_32F, DataLayout datalayout = DNN_LAYOUT_NCHW,
+                            ImagePaddingMode mode = DNN_PMODE_NULL, Scalar borderValue = 0.0);
+
+        CV_PROP_RW Scalar scalefactor; //!< Multiplier for input image values.
+        CV_PROP_RW Size size;    //!< Spatial size for output image.
+        CV_PROP_RW Scalar mean;  //!< Scalar with mean values which are subtracted from channels.
+        CV_PROP_RW bool swapRB;  //!< Flag which indicates whether to swap the first and last channels.
+        CV_PROP_RW int ddepth;   //!< Depth of output blob. Choose CV_32F or CV_8U.
+        CV_PROP_RW DataLayout datalayout; //!< Order of output dimensions. Choose DNN_LAYOUT_NCHW or DNN_LAYOUT_NHWC.
+        CV_PROP_RW ImagePaddingMode paddingmode;   //!< Image padding mode. @see ImagePaddingMode.
+        CV_PROP_RW Scalar borderValue;   //!< Value used in padding mode for padding.
+
+        /** @brief Get rectangle coordinates in original image system from rectangle in blob coordinates.
+         *  @param rBlob rect in blob coordinates.
+         *  @param size original input image size.
+         *  @returns rectangle in original image coordinates.
+         */
+        CV_WRAP Rect blobRectToImageRect(const Rect &rBlob, const Size &size);
+
+        /** @brief Get rectangles' coordinates in original image system from rectangles in blob coordinates.
+         *  @param rBlob rects in blob coordinates.
+         *  @param rImg resulting rects in image coordinates.
+         *  @param size original input image size.
+         */
+        CV_WRAP void blobRectsToImageRects(const std::vector<Rect> &rBlob, CV_OUT std::vector<Rect>& rImg, const Size& size);
+    };
+
+    /** @brief Creates 4-dimensional blob from image with given params.
+     *
+     *  @details This function is an extension of @ref blobFromImage to meet more image preprocess needs.
+     *  Given an input image and preprocessing parameters, the function outputs the blob.
+     *
+     *  @param image input image (all with 1-, 3- or 4-channels).
+     *  @param param struct of Image2BlobParams, contains all parameters needed by processing of image to blob.
+     *  @return 4-dimensional Mat.
+     */
+    CV_EXPORTS_W Mat blobFromImageWithParams(InputArray image, const Image2BlobParams& param = Image2BlobParams());
+
+    /** @overload */
+    CV_EXPORTS_W void blobFromImageWithParams(InputArray image, OutputArray blob, const Image2BlobParams& param = Image2BlobParams());
+
+    /** @brief Creates 4-dimensional blob from series of images with given params.
+     *
+     *  @details This function is an extension of @ref blobFromImages to meet more image preprocess needs.
+     *  Given input images and preprocessing parameters, the function outputs the blob.
+     *
+     *  @param images input images (all with 1-, 3- or 4-channels).
+     *  @param param struct of Image2BlobParams, contains all parameters needed by processing of image to blob.
+     *  @returns 4-dimensional Mat.
+     */
+    CV_EXPORTS_W Mat blobFromImagesWithParams(InputArrayOfArrays images, const Image2BlobParams& param = Image2BlobParams());
+
+    /** @overload */
+    CV_EXPORTS_W void blobFromImagesWithParams(InputArrayOfArrays images, OutputArray blob, const Image2BlobParams& param = Image2BlobParams());
+
+    /** @brief Parse a 4D blob and output the images it contains as 2D arrays through a simpler data structure
+     *  (std::vector<cv::Mat>).
+     *  @param[in] blob_ 4 dimensional array (images, channels, height, width) in floating point precision (CV_32F) from
+     *  which you would like to extract the images.
+     *  @param[out] images_ array of 2D Mat containing the images extracted from the blob in floating point precision
+     *  (CV_32F). They are neither normalized nor mean-added. The number of returned images equals the first dimension
+     *  of the blob (batch size). Every image has a number of channels equal to the second dimension of the blob (depth).
+     */
+    CV_EXPORTS_W void imagesFromBlob(const cv::Mat& blob_, OutputArrayOfArrays images_);
+
+    /** @brief Convert all weights of Caffe network to half precision floating point.
+     * @param src Path to the original model from Caffe framework containing single
+     *            precision floating point weights (usually has `.caffemodel` extension).
+     * @param dst Path to destination model with updated weights.
+     * @param layersTypes Set of layer types whose parameters will be converted.
+     *                    By default, converts only Convolutional and Fully-Connected layers'
+     *                    weights.
+     *
+     * @note The shrunk model has no original float32 weights so it can't be used
+     *       in the original Caffe framework anymore. However the structure of data
+     *       is taken from NVidia's Caffe fork: https://github.com/NVIDIA/caffe.
+     *       So the resulting model may be used there.
+     */
+    CV_EXPORTS_W void shrinkCaffeModel(CV_WRAP_FILE_PATH const String& src, CV_WRAP_FILE_PATH const String& dst,
+                                       const std::vector<String>& layersTypes = std::vector<String>());
+
+    /** @brief Create a text representation for a binary network stored in protocol buffer format.
+     *  @param[in] model  A path to binary network.
+     *  @param[in] output A path to output text file to be created.
+     *
+     *  @note To reduce output file size, trained weights are not included.
+     */
+    CV_EXPORTS_W void writeTextGraph(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& output);
+
+    /** @brief Performs non maximum suppression given boxes and corresponding scores.
+
+     * @param bboxes a set of bounding boxes to apply NMS.
+     * @param scores a set of corresponding confidences.
+     * @param score_threshold a threshold used to filter boxes by score.
+     * @param nms_threshold a threshold used in non maximum suppression.
+     * @param indices the kept indices of bboxes after NMS.
+     * @param eta a coefficient in adaptive threshold formula: \f$nms\_threshold_{i+1}=eta\cdot nms\_threshold_i\f$.
+     * @param top_k if `>0`, keep at most @p top_k picked indices.
+     */
+    CV_EXPORTS void NMSBoxes(const std::vector<Rect>& bboxes, const std::vector<float>& scores,
+                               const float score_threshold, const float nms_threshold,
+                               CV_OUT std::vector<int>& indices,
+                               const float eta = 1.f, const int top_k = 0);
+
+    CV_EXPORTS_W void NMSBoxes(const std::vector<Rect2d>& bboxes, const std::vector<float>& scores,
+                               const float score_threshold, const float nms_threshold,
+                               CV_OUT std::vector<int>& indices,
+                               const float eta = 1.f, const int top_k = 0);
+
+    CV_EXPORTS_AS(NMSBoxesRotated) void NMSBoxes(const std::vector<RotatedRect>& bboxes, const std::vector<float>& scores,
+                             const float score_threshold, const float nms_threshold,
+                             CV_OUT std::vector<int>& indices,
+                             const float eta = 1.f, const int top_k = 0);
+
+    /** @brief Performs batched non maximum suppression on given boxes and corresponding scores across different classes.
+     *
+     * @param bboxes a set of bounding boxes to apply NMS.
+     * @param scores a set of corresponding confidences.
+     * @param class_ids a set of corresponding class ids. Ids are integers and usually start from 0.
+     * @param score_threshold a threshold used to filter boxes by score.
+     * @param nms_threshold a threshold used in non maximum suppression.
+     * @param[out] indices the kept indices of bboxes after NMS.
+     * @param eta a coefficient in adaptive threshold formula: \f$nms\_threshold_{i+1}=eta\cdot nms\_threshold_i\f$.
+     * @param top_k if `>0`, keep at most @p top_k picked indices.
+     */
+    CV_EXPORTS void NMSBoxesBatched(const std::vector<Rect>& bboxes, const std::vector<float>& scores, const std::vector<int>& class_ids,
+                                    const float score_threshold, const float nms_threshold,
+                                    CV_OUT std::vector<int>& indices,
+                                    const float eta = 1.f, const int top_k = 0);
+    /** @overload */
+    CV_EXPORTS_W void NMSBoxesBatched(const std::vector<Rect2d>& bboxes, const std::vector<float>& scores, const std::vector<int>& class_ids,
+                                      const float score_threshold, const float nms_threshold,
+                                      CV_OUT std::vector<int>& indices,
+                                      const float eta = 1.f, const int top_k = 0);
+
+    /**
+     * @brief Selects the Soft NMS method applied by softNMSBoxes (see its @p method parameter).
+     * @see softNMSBoxes
+     */
+    enum class SoftNMSMethod
+    {
+        SOFTNMS_LINEAR = 1,   //!< linear method
+        SOFTNMS_GAUSSIAN = 2  //!< Gaussian method; the default argument of softNMSBoxes
+    };
+
+    /** @brief Performs soft non maximum suppression given boxes and corresponding scores.
+     * Reference: https://arxiv.org/abs/1704.04503
+     * @param bboxes a set of bounding boxes to apply Soft NMS.
+     * @param scores a set of corresponding confidences.
+     * @param[out] updated_scores a set of corresponding updated confidences.
+     * @param score_threshold a threshold used to filter boxes by score.
+     * @param nms_threshold a threshold used in non maximum suppression.
+     * @param[out] indices the kept indices of bboxes after NMS.
+     * @param top_k keep at most @p top_k picked indices.
+     * @param sigma parameter of Gaussian weighting.
+     * @param method Soft NMS method to apply: Gaussian (the default) or linear.
+     * @see SoftNMSMethod
+     */
+    CV_EXPORTS_W void softNMSBoxes(const std::vector<Rect>& bboxes,
+                                   const std::vector<float>& scores,
+                                   CV_OUT std::vector<float>& updated_scores,
+                                   const float score_threshold,
+                                   const float nms_threshold,
+                                   CV_OUT std::vector<int>& indices,
+                                   size_t top_k = 0,
+                                   const float sigma = 0.5,
+                                   SoftNMSMethod method = SoftNMSMethod::SOFTNMS_GAUSSIAN);
+
+
+     /** @brief This class represents a high-level API for neural networks.
+      *
+      * Model allows setting the preprocessing parameters for the input image.
+      * Model creates a net from a file with trained weights and config,
+      * sets the preprocessing input and runs a forward pass.
+      */
+     class CV_EXPORTS_W_SIMPLE Model
+     {
+     public:
+         CV_DEPRECATED_EXTERNAL  // avoid using in C++ code, will be moved to "protected" (need to fix bindings first)
+         Model();
+
+         Model(const Model&) = default;
+         Model(Model&&) = default;
+         Model& operator=(const Model&) = default;
+         Model& operator=(Model&&) = default;
+
+         /**
+          * @brief Create model from deep learning network represented in one of the supported formats.
+          * The order of the @p model and @p config arguments does not matter.
+          * @param[in] model Binary file that contains the trained weights.
+          * @param[in] config Text file that contains the network configuration.
+          */
+         CV_WRAP Model(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "");
+
+         /**
+          * @brief Create model from deep learning network.
+          * @param[in] network Net object.
+          */
+         CV_WRAP Model(const Net& network);
+
+         /** @brief Set input size for frame.
+          *  @param[in] size New input size.
+          *  @note If the shape of the new blob is less than 0, then the frame size does not change.
+         */
+         CV_WRAP Model& setInputSize(const Size& size);
+
+         /** @overload
+         *  @param[in] width New input width.
+         *  @param[in] height New input height.
+         */
+         CV_WRAP inline
+         Model& setInputSize(int width, int height) { return setInputSize(Size(width, height)); }
+
+         /** @brief Set mean value for frame.
+          *  @param[in] mean Scalar with mean values which are subtracted from channels.
+         */
+         CV_WRAP Model& setInputMean(const Scalar& mean);
+
+         /** @brief Set scalefactor value for frame.
+          *  @param[in] scale Multiplier for frame values.
+         */
+         CV_WRAP Model& setInputScale(const Scalar& scale);
+
+         /** @brief Set flag crop for frame.
+          *  @param[in] crop Flag which indicates whether image will be cropped after resize or not.
+         */
+         CV_WRAP Model& setInputCrop(bool crop);
+
+         /** @brief Set flag swapRB for frame.
+          *  @param[in] swapRB Flag which indicates whether to swap the first and last channels.
+         */
+         CV_WRAP Model& setInputSwapRB(bool swapRB);
+
+         /** @brief Set output names for frame.
+          *  @param[in] outNames Names for output layers.
+         */
+         CV_WRAP Model& setOutputNames(const std::vector<String>& outNames);
+
+         /** @brief Set preprocessing parameters for frame.
+         *  @param[in] scale Multiplier for frame values.
+         *  @param[in] size New input size.
+         *  @param[in] mean Scalar with mean values which are subtracted from channels.
+         *  @param[in] swapRB Flag which indicates whether to swap the first and last channels.
+         *  @param[in] crop Flag which indicates whether image will be cropped after resize or not.
+         *  blob(n, c, y, x) = scale * ( resize( frame(y, x, c) ) - mean(c) )
+         */
+         CV_WRAP void setInputParams(double scale = 1.0, const Size& size = Size(),
+                                     const Scalar& mean = Scalar(), bool swapRB = false, bool crop = false);
+
+         /** @brief Given the input @p frame, create input blob, run net and return the output blobs in @p outs.
+          *  @param[in]  frame  The input image.
+          *  @param[out] outs Allocated output blobs, which will store results of the computation.
+          */
+         CV_WRAP void predict(InputArray frame, OutputArrayOfArrays outs) const;
+
+
+         // ============================== Net proxy methods ==============================
+         // Never expose methods with network implementation details, like:
+         // - addLayer, addLayerToPrev, connect, setInputsNames, setInputShape, setParam, getParam
+         // - getLayer*, getUnconnectedOutLayers, getUnconnectedOutLayersNames, getLayersShapes
+         // - forward* methods, setInput
+
+         /// @sa Net::setPreferableBackend
+         CV_WRAP Model& setPreferableBackend(dnn::Backend backendId);
+         /// @sa Net::setPreferableTarget
+         CV_WRAP Model& setPreferableTarget(dnn::Target targetId);
+
+         /// @sa Net::enableWinograd
+         CV_WRAP Model& enableWinograd(bool useWinograd);
+
+         CV_DEPRECATED_EXTERNAL
+         operator Net&() const { return getNetwork_(); }  // implicit conversion; forwards to getNetwork_()
+
+     //protected: - internal/tests usage only
+         Net& getNetwork_() const;
+         inline Net& getNetwork_() { return const_cast<const Model*>(this)->getNetwork_(); }  // non-const overload forwards to the const one
+
+         struct Impl;
+         inline Impl* getImpl() const { return impl.get(); }  // raw pointer held by impl; no check is done here
+         inline Impl& getImplRef() const { CV_DbgAssert(impl); return *impl.get(); }  // debug-asserts impl before dereferencing it
+     protected:
+         Ptr<Impl> impl;
+     };
+
+     /** @brief This class represents a high-level API for classification models.
+      *
+      * ClassificationModel allows setting the preprocessing parameters for the input image.
+      * ClassificationModel creates a net from a file with trained weights and config,
+      * sets the preprocessing input, runs a forward pass and returns the top-1 prediction.
+      */
+     class CV_EXPORTS_W_SIMPLE ClassificationModel : public Model
+     {
+     public:
+         CV_DEPRECATED_EXTERNAL  // avoid using in C++ code, will be moved to "protected" (need to fix bindings first)
+         ClassificationModel();
+
+         /**
+          * @brief Create classification model from network represented in one of the supported formats.
+          * The order of the @p model and @p config arguments does not matter.
+          * @param[in] model Binary file that contains the trained weights.
+          * @param[in] config Text file that contains the network configuration.
+          */
+          CV_WRAP ClassificationModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "");
+
+         /**
+          * @brief Create model from deep learning network.
+          * @param[in] network Net object.
+          */
+         CV_WRAP ClassificationModel(const Net& network);
+
+         /**
+          * @brief Set enable/disable softmax post processing option.
+          *
+          * If this option is true, softmax is applied after forward inference within the classify() function
+          * to convert the confidences range to [0.0-1.0].
+          * This function allows you to toggle this behavior.
+          * Set it to true when the model does not contain a softmax layer.
+          * @param[in] enable Set enable softmax post processing within the classify() function.
+          */
+         CV_WRAP ClassificationModel& setEnableSoftmaxPostProcessing(bool enable);
+
+         /**
+          * @brief Get enable/disable softmax post processing option.
+          *
+          * This option defaults to false, softmax post processing is not applied within the classify() function.
+          */
+         CV_WRAP bool getEnableSoftmaxPostProcessing() const;
+
+         /** @brief Given the input @p frame, create input blob, run net and return top-1 prediction.
+          *  @param[in]  frame  The input image.
+          */
+         std::pair<int, float> classify(InputArray frame);
+
+         /** @overload */
+         CV_WRAP void classify(InputArray frame, CV_OUT int& classId, CV_OUT float& conf);
+     };
+
+     /** @brief This class represents a high-level API for keypoints models.
+      *
+      * KeypointsModel allows setting the preprocessing parameters for the input image.
+      * KeypointsModel creates a net from a file with trained weights and config,
+      * sets the preprocessing input, runs a forward pass and returns the x and y coordinates of each detected keypoint.
+      */
+     class CV_EXPORTS_W_SIMPLE KeypointsModel: public Model
+     {
+     public:
+         /**
+          * @brief Create keypoints model from network represented in one of the supported formats.
+          * The order of the @p model and @p config arguments does not matter.
+          * @param[in] model Binary file that contains the trained weights.
+          * @param[in] config Text file that contains the network configuration.
+          */
+          CV_WRAP KeypointsModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "");
+
+         /**
+          * @brief Create model from deep learning network.
+          * @param[in] network Net object.
+          */
+         CV_WRAP KeypointsModel(const Net& network);
+
+         /** @brief Given the input @p frame, create input blob, run net and return the detected keypoints.
+          *  @param[in]  frame  The input image.
+          *  @param[in]  thresh minimum confidence threshold to select a keypoint
+          *  @returns a vector holding the x and y coordinates of each detected keypoint
+          *
+          */
+         CV_WRAP std::vector<Point2f> estimate(InputArray frame, float thresh=0.5);
+     };
+
+     /** @brief This class represents a high-level API for segmentation models.
+      *
+      * SegmentationModel allows setting the preprocessing parameters for the input image.
+      * SegmentationModel creates a net from a file with trained weights and config,
+      * sets the preprocessing input, runs a forward pass and returns the class prediction for each pixel.
+      */
+     class CV_EXPORTS_W_SIMPLE SegmentationModel: public Model
+     {
+     public:
+         /**
+          * @brief Create segmentation model from network represented in one of the supported formats.
+          * The order of the @p model and @p config arguments does not matter.
+          * @param[in] model Binary file that contains the trained weights.
+          * @param[in] config Text file that contains the network configuration.
+          */
+          CV_WRAP SegmentationModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "");
+
+         /**
+          * @brief Create model from deep learning network.
+          * @param[in] network Net object.
+          */
+         CV_WRAP SegmentationModel(const Net& network);
+
+         /** @brief Given the input @p frame, create input blob, run net and store the prediction in @p mask.
+          *  @param[in]  frame  The input image.
+          *  @param[out] mask Allocated class prediction for each pixel
+          */
+         CV_WRAP void segment(InputArray frame, OutputArray mask);
+     };
+
+     /** @brief This class represents a high-level API for object detection networks.
+      *
+      * DetectionModel allows setting the preprocessing parameters for the input image.
+      * DetectionModel creates a net from a file with trained weights and config,
+      * sets the preprocessing input, runs a forward pass and returns the resulting detections.
+      * For DetectionModel SSD, Faster R-CNN, YOLO topologies are supported.
+      */
+     class CV_EXPORTS_W_SIMPLE DetectionModel : public Model
+     {
+     public:
+         /**
+          * @brief Create detection model from network represented in one of the supported formats.
+          * The order of the @p model and @p config arguments does not matter.
+          * @param[in] model Binary file that contains the trained weights.
+          * @param[in] config Text file that contains the network configuration.
+          */
+         CV_WRAP DetectionModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "");
+
+         /**
+          * @brief Create model from deep learning network.
+          * @param[in] network Net object.
+          */
+         CV_WRAP DetectionModel(const Net& network);
+
+         CV_DEPRECATED_EXTERNAL  // avoid using in C++ code (need to fix bindings first)
+         DetectionModel();
+
+         /**
+          * @brief nmsAcrossClasses defaults to false,
+          * such that when non max suppression is used during the detect() function, it will do so per-class.
+          * This function allows you to toggle this behaviour.
+          * @param[in] value The new value for nmsAcrossClasses
+          */
+         CV_WRAP DetectionModel& setNmsAcrossClasses(bool value);
+
+         /**
+          * @brief Getter for nmsAcrossClasses. This variable defaults to false,
+          * such that when non max suppression is used during the detect() function, it will do so only per-class.
+          */
+         CV_WRAP bool getNmsAcrossClasses();
+
+         /** @brief Given the input @p frame, create input blob, run net and return result detections.
+          *  @param[in]  frame  The input image.
+          *  @param[out] classIds Class indexes in result detection.
+          *  @param[out] confidences A set of corresponding confidences.
+          *  @param[out] boxes A set of bounding boxes.
+          *  @param[in] confThreshold A threshold used to filter boxes by confidences.
+          *  @param[in] nmsThreshold A threshold used in non maximum suppression.
+          */
+         CV_WRAP void detect(InputArray frame, CV_OUT std::vector<int>& classIds,
+                             CV_OUT std::vector<float>& confidences, CV_OUT std::vector<Rect>& boxes,
+                             float confThreshold = 0.5f, float nmsThreshold = 0.0f);
+     };
+
+
+/** @brief This class represents a high-level API for text recognition networks.
+ *
+ * TextRecognitionModel allows setting the preprocessing parameters for the input image.
+ * TextRecognitionModel creates a net from a file with trained weights and config,
+ * sets the preprocessing input, runs a forward pass and returns the recognition result.
+ * For TextRecognitionModel, CRNN-CTC is supported.
+ */
+class CV_EXPORTS_W_SIMPLE TextRecognitionModel : public Model
+{
+public:
+    CV_DEPRECATED_EXTERNAL  // avoid using in C++ code, will be moved to "protected" (need to fix bindings first)
+    TextRecognitionModel();
+
+    /**
+     * @brief Create Text Recognition model from deep learning network
+     * Call setDecodeType() and setVocabulary() after constructor to initialize the decoding method
+     * @param[in] network Net object
+     */
+    CV_WRAP TextRecognitionModel(const Net& network);
+
+    /**
+     * @brief Create text recognition model from network represented in one of the supported formats
+     * Call setDecodeType() and setVocabulary() after constructor to initialize the decoding method
+     * @param[in] model Binary file that contains the trained weights
+     * @param[in] config Text file that contains the network configuration
+     */
+    CV_WRAP inline
+    TextRecognitionModel(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "")
+        : TextRecognitionModel(readNet(model, config)) { /* delegates to the Net-based constructor */ }
+
+    /**
+     * @brief Set the decoding method of translating the network output into string
+     * @param[in] decodeType The decoding method of translating the network output into string, currently supported type:
+     *    - `"CTC-greedy"` greedy decoding for the output of CTC-based methods
+     *    - `"CTC-prefix-beam-search"` Prefix beam search decoding for the output of CTC-based methods
+     */
+    CV_WRAP
+    TextRecognitionModel& setDecodeType(const std::string& decodeType);
+
+    /**
+     * @brief Get the decoding method
+     * @return the decoding method
+     */
+    CV_WRAP
+    const std::string& getDecodeType() const;
+
+    /**
+     * @brief Set the decoding method options for `"CTC-prefix-beam-search"` decode usage
+     * @param[in] beamSize Beam size for search
+     * @param[in] vocPruneSize Parameter to optimize big vocabulary search,
+     * only take top @p vocPruneSize tokens in each search step; @p vocPruneSize <= 0 disables this pruning.
+     */
+    CV_WRAP
+    TextRecognitionModel& setDecodeOptsCTCPrefixBeamSearch(int beamSize, int vocPruneSize = 0);
+
+    /**
+     * @brief Set the vocabulary for recognition.
+     * @param[in] vocabulary the associated vocabulary of the network.
+     */
+    CV_WRAP
+    TextRecognitionModel& setVocabulary(const std::vector<std::string>& vocabulary);
+
+    /**
+     * @brief Get the vocabulary for recognition.
+     * @return vocabulary the associated vocabulary
+     */
+    CV_WRAP
+    const std::vector<std::string>& getVocabulary() const;
+
+    /**
+     * @brief Given the input @p frame, create input blob, run net and return recognition result
+     * @param[in] frame The input image
+     * @return The text recognition result
+     */
+    CV_WRAP
+    std::string recognize(InputArray frame) const;
+
+    /**
+     * @brief Given the input @p frame, create input blob, run net and return recognition result
+     * @param[in] frame The input image
+     * @param[in] roiRects List of text detection regions of interest (cv::Rect, CV_32SC4). ROIs are cropped as the network inputs
+     * @param[out] results A set of text recognition results.
+     */
+    CV_WRAP
+    void recognize(InputArray frame, InputArrayOfArrays roiRects, CV_OUT std::vector<std::string>& results) const;
+};
+
+
+/** @brief Base class for text detection networks
+ */
+class CV_EXPORTS_W_SIMPLE TextDetectionModel : public Model
+{
+protected:
+    CV_DEPRECATED_EXTERNAL  // avoid using in C++ code; the constructor is already "protected" in this class
+    TextDetectionModel();
+
+public:
+
+    /** @brief Performs detection
+     *
+     * Given the input @p frame, prepare network input, run network inference, post-process network output and return result detections.
+     *
+     * Each result is quadrangle's 4 points in this order:
+     * - bottom-left
+     * - top-left
+     * - top-right
+     * - bottom-right
+     *
+     * Use cv::getPerspectiveTransform function to retrieve image region without perspective transformations.
+     *
+     * @note If DL model doesn't support that kind of output then result may be derived from detectTextRectangles() output.
+     *
+     * @param[in] frame The input image
+     * @param[out] detections array with detections' quadrangles (4 points per result)
+     * @param[out] confidences array with detection confidences
+     */
+    CV_WRAP
+    void detect(
+            InputArray frame,
+            CV_OUT std::vector< std::vector<Point> >& detections,
+            CV_OUT std::vector<float>& confidences
+    ) const;
+
+    /** @overload */
+    CV_WRAP
+    void detect(
+            InputArray frame,
+            CV_OUT std::vector< std::vector<Point> >& detections
+    ) const;
+
+    /** @brief Performs detection
+     *
+     * Given the input @p frame, prepare network input, run network inference, post-process network output and return result detections.
+     *
+     * Each result is a rotated rectangle.
+     *
+     * @note Result may be inaccurate in case of strong perspective transformations.
+     *
+     * @param[in] frame the input image
+     * @param[out] detections array with detections' RotatedRect results
+     * @param[out] confidences array with detection confidences
+     */
+    CV_WRAP
+    void detectTextRectangles(
+            InputArray frame,
+            CV_OUT std::vector<cv::RotatedRect>& detections,
+            CV_OUT std::vector<float>& confidences
+    ) const;
+
+    /** @overload */
+    CV_WRAP
+    void detectTextRectangles(
+            InputArray frame,
+            CV_OUT std::vector<cv::RotatedRect>& detections
+    ) const;
+};
+
+/** @brief This class represents a high-level API for text detection DL networks compatible with the EAST model.
+ *
+ * Configurable parameters:
+ * - (float) confThreshold - used to filter boxes by confidences, default: 0.5f
+ * - (float) nmsThreshold - used in non maximum suppression, default: 0.0f
+ */
+class CV_EXPORTS_W_SIMPLE TextDetectionModel_EAST : public TextDetectionModel
+{
+public:
+    CV_DEPRECATED_EXTERNAL  // avoid using in C++ code, will be moved to "protected" (need to fix bindings first)
+    TextDetectionModel_EAST();
+
+    /**
+     * @brief Create text detection algorithm from deep learning network
+     * @param[in] network Net object
+     */
+    CV_WRAP TextDetectionModel_EAST(const Net& network);
+
+    /**
+     * @brief Create text detection model from network represented in one of the supported formats.
+     * The order of the @p model and @p config arguments does not matter.
+     * @param[in] model Binary file that contains the trained weights.
+     * @param[in] config Text file that contains the network configuration.
+     */
+    CV_WRAP inline
+    TextDetectionModel_EAST(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "")
+        : TextDetectionModel_EAST(readNet(model, config)) { /* delegates to the Net-based constructor */ }
+
+    /**
+     * @brief Set the detection confidence threshold
+     * @param[in] confThreshold A threshold used to filter boxes by confidences
+     */
+    CV_WRAP
+    TextDetectionModel_EAST& setConfidenceThreshold(float confThreshold);
+
+    /**
+     * @brief Get the detection confidence threshold
+     */
+    CV_WRAP
+    float getConfidenceThreshold() const;
+
+    /**
+     * @brief Set the detection NMS filter threshold
+     * @param[in] nmsThreshold A threshold used in non maximum suppression
+     */
+    CV_WRAP
+    TextDetectionModel_EAST& setNMSThreshold(float nmsThreshold);
+
+    /**
+     * @brief Get the detection NMS filter threshold
+     */
+    CV_WRAP
+    float getNMSThreshold() const;
+};
+
+/** @brief This class represents a high-level API for text detection DL networks compatible with the DB model.
+ *
+ * Related publications: @cite liao2020real
+ * Paper: https://arxiv.org/abs/1911.08947
+ * For more information about the hyper-parameters setting, please refer to https://github.com/MhLiao/DB
+ *
+ * Configurable parameters:
+ * - (float) binaryThreshold - The threshold of the binary map. It is usually set to 0.3.
+ * - (float) polygonThreshold - The threshold of text polygons. It is usually set to 0.5, 0.6, and 0.7. Default is 0.5f
+ * - (double) unclipRatio - The unclip ratio of the detected text region, which determines the output size. It is usually set to 2.0.
+ * - (int) maxCandidates - The max number of the output results.
+ */
+class CV_EXPORTS_W_SIMPLE TextDetectionModel_DB : public TextDetectionModel
+{
+public:
+    CV_DEPRECATED_EXTERNAL  // avoid using in C++ code, will be moved to "protected" (need to fix bindings first)
+    TextDetectionModel_DB();
+
+    /**
+     * @brief Create text detection algorithm from deep learning network.
+     * @param[in] network Net object.
+     */
+    CV_WRAP TextDetectionModel_DB(const Net& network);
+
+    /**
+     * @brief Create text detection model from network represented in one of the supported formats.
+     * The order of the @p model and @p config arguments does not matter.
+     * @param[in] model Binary file that contains the trained weights.
+     * @param[in] config Text file that contains the network configuration.
+     */
+    CV_WRAP inline
+    TextDetectionModel_DB(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "")
+        : TextDetectionModel_DB(readNet(model, config)) { /* delegates to the Net-based constructor */ }
+
+    CV_WRAP TextDetectionModel_DB& setBinaryThreshold(float binaryThreshold);  //!< set the threshold of the binary map
+    CV_WRAP float getBinaryThreshold() const;  //!< get the threshold of the binary map
+
+    CV_WRAP TextDetectionModel_DB& setPolygonThreshold(float polygonThreshold);  //!< set the threshold of text polygons
+    CV_WRAP float getPolygonThreshold() const;  //!< get the threshold of text polygons
+
+    CV_WRAP TextDetectionModel_DB& setUnclipRatio(double unclipRatio);  //!< set the unclip ratio of the detected text region
+    CV_WRAP double getUnclipRatio() const;  //!< get the unclip ratio of the detected text region
+
+    CV_WRAP TextDetectionModel_DB& setMaxCandidates(int maxCandidates);  //!< set the max number of the output results
+    CV_WRAP int getMaxCandidates() const;  //!< get the max number of the output results
+};
+
+//! @}
+CV__DNN_INLINE_NS_END
+}
+}
+
+#include <opencv2/dnn/layer.hpp>
+#include <opencv2/dnn/dnn.inl.hpp>
+
+/// @deprecated Include this header directly from application. Automatic inclusion will be removed
+#include <opencv2/dnn/utils/inference_engine.hpp>
+
+#endif  /* OPENCV_DNN_DNN_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/dnn/dnn.inl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/dnn.inl.hpp
new file mode 100644
index 000000000000..8312a418f3de
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/dnn.inl.hpp
@@ -0,0 +1,412 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_DNN_DNN_INL_HPP
+#define OPENCV_DNN_DNN_INL_HPP
+
+#include <opencv2/dnn.hpp>
+
+namespace cv {
+namespace dnn {
+CV__DNN_INLINE_NS_BEGIN
+
+template<typename TypeIter>
+DictValue DictValue::arrayInt(TypeIter begin, int size)  // builds an INT DictValue from `size` elements read through `begin`
+{
+    DictValue res(Param::INT, new AutoBuffer<int64, 1>(size));  // new int64 buffer constructed with `size`
+    for (int j = 0; j < size; begin++, j++)
+        (*res.pi)[j] = *begin;  // element-wise copy; each value is converted to int64 on assignment
+    return res;
+}
+
+template<typename TypeIter>
+DictValue DictValue::arrayReal(TypeIter begin, int size)  // builds a REAL DictValue from `size` elements read through `begin`
+{
+    DictValue res(Param::REAL, new AutoBuffer<double, 1>(size));  // new double buffer constructed with `size`
+    for (int j = 0; j < size; begin++, j++)
+        (*res.pd)[j] = *begin;  // element-wise copy; each value is converted to double on assignment
+    return res;
+}
+
+template<typename TypeIter>
+DictValue DictValue::arrayString(TypeIter begin, int size)  // builds a STRING DictValue from `size` elements read through `begin`
+{
+    DictValue res(Param::STRING, new AutoBuffer<String, 1>(size));  // new String buffer constructed with `size`
+    for (int j = 0; j < size; begin++, j++)
+        (*res.ps)[j] = *begin;  // element-wise copy into the String buffer
+    return res;
+}
+
+template<>
+inline DictValue DictValue::get<DictValue>(int idx) const  // returns a copy of the whole value
+{
+    CV_Assert(idx == -1);  // only the whole value can be requested; any explicit index fails the assertion
+    return *this;
+}
+
+template<>
+inline int64 DictValue::get<int64>(int idx) const  // reads element idx as int64, converting from REAL or STRING storage if needed
+{
+    CV_Assert((idx == -1 && size() == 1) || (idx >= 0 && idx < size()));  // idx == -1 is accepted only when size() == 1
+    idx = (idx == -1) ? 0 : idx;  // -1 maps to element 0
+
+    if (type == Param::INT)
+    {
+        return (*pi)[idx];
+    }
+    else if (type == Param::REAL)
+    {
+        double doubleValue = (*pd)[idx];
+
+        double fracpart, intpart;
+        fracpart = std::modf(doubleValue, &intpart);
+        CV_Assert(fracpart == 0.0);  // a real with a non-zero fractional part cannot be read as an integer
+
+        return (int64)doubleValue;
+    }
+    else if (type == Param::STRING)
+    {
+        return std::atoi((*ps)[idx].c_str());  // NOTE(review): atoi parses to int, so the result is limited to int range despite the int64 return type
+    }
+    else
+    {
+        CV_Assert(isInt() || isReal() || isString());  // presumably always fails here since no supported type matched -- confirm against isInt()/isReal()
+        return 0;
+    }
+}
+
+template<>
+inline int DictValue::get<int>(int idx) const  // int view of get<int64>()
+{
+    return (int)get<int64>(idx);  // narrowing cast of the int64 result; no range check is done here
+}
+
+inline int DictValue::getIntValue(int idx) const  // non-template spelling of the int read
+{
+    return (int)get<int64>(idx);  // narrowing cast of the int64 result; no range check is done here
+}
+
+template<>
+inline unsigned DictValue::get<unsigned>(int idx) const  // unsigned view of get<int64>()
+{
+    return (unsigned)get<int64>(idx);  // plain cast of the int64 result; no sign or range check is done here
+}
+
+template<>
+inline bool DictValue::get<bool>(int idx) const  // boolean view of get<int64>()
+{
+    return (get<int64>(idx) != 0);  // any non-zero integer value reads as true
+}
+
+template<>
+inline double DictValue::get<double>(int idx) const  // reads element idx as double, converting from INT or STRING storage if needed
+{
+    CV_Assert((idx == -1 && size() == 1) || (idx >= 0 && idx < size()));  // idx == -1 is accepted only when size() == 1
+    idx = (idx == -1) ? 0 : idx;  // -1 maps to element 0
+
+    if (type == Param::REAL)
+    {
+        return (*pd)[idx];
+    }
+    else if (type == Param::INT)
+    {
+        return (double)(*pi)[idx];  // int64 element converted to double
+    }
+    else if (type == Param::STRING)
+    {
+        return std::atof((*ps)[idx].c_str());  // string element parsed with atof
+    }
+    else
+    {
+        CV_Assert(isReal() || isInt() || isString());  // presumably always fails here since no supported type matched -- confirm against isInt()/isReal()
+        return 0;
+    }
+}
+
+inline double DictValue::getRealValue(int idx) const  // non-template spelling of get<double>()
+{
+    return get<double>(idx);
+}
+
+template<>
+inline float DictValue::get<float>(int idx) const  // float view of get<double>()
+{
+    return (float)get<double>(idx);  // narrowing cast of the double result
+}
+
+template<>
+inline String DictValue::get<String>(int idx) const
+{
+    CV_Assert(isString());  // no conversion from INT/REAL is attempted here
+    CV_Assert((idx == -1 && ps->size() == 1) || (idx >= 0 && idx < (int)ps->size()));
+    return (*ps)[(idx == -1) ? 0 : idx];  // -1 maps to the first (only) element
+}
+
+
+inline String DictValue::getStringValue(int idx) const
+{
+    return get<String>(idx);  // forwards to the String accessor unchanged
+}
+
+inline void DictValue::release()
+{
+    switch (type)  // the type tag decides which buffer pointer gets deleted
+    {
+    case Param::INT:
+        delete pi;  // the pointer itself is not reset afterwards
+        break;
+    case Param::STRING:
+        delete ps;
+        break;
+    case Param::REAL:
+        delete pd;
+        break;
+    case Param::BOOLEAN:  // the remaining tags are listed without any action
+    case Param::MAT:
+    case Param::MAT_VECTOR:
+    case Param::ALGORITHM:
+    case Param::FLOAT:
+    case Param::UNSIGNED_INT:
+    case Param::UINT64:
+    case Param::UCHAR:
+    case Param::SCALAR:
+        break; // unhandled
+    }
+}
+
+inline DictValue::~DictValue()
+{
+    release();  // frees whichever buffer the type tag selects
+}
+
+// Copy assignment: the new buffer is allocated before the old one is released.
+inline DictValue & DictValue::operator=(const DictValue &r)
+{
+    if (this != &r)
+    {
+        if (r.type == Param::INT)
+        {
+            AutoBuffer<int64, 1> *copy = new AutoBuffer<int64, 1>(*r.pi);
+            release();
+            pi = copy;
+        }
+        else if (r.type == Param::STRING)
+        {
+            AutoBuffer<String, 1> *copy = new AutoBuffer<String, 1>(*r.ps);
+            release();
+            ps = copy;
+        }
+        else if (r.type == Param::REAL)
+        {
+            AutoBuffer<double, 1> *copy = new AutoBuffer<double, 1>(*r.pd);
+            release();
+            pd = copy;
+        }
+        // release() above dispatches on the old tag, so it is updated last.
+        type = r.type;
+    }
+    return *this;
+}
+
+inline DictValue::DictValue(const DictValue &r)
+    : pv(NULL)  // start from a null pointer
+{
+    type = r.type;
+
+    if (r.type == Param::INT)
+        pi = new AutoBuffer<int64, 1>(*r.pi);   // separate copy of r's integer buffer
+    else if (r.type == Param::STRING)
+        ps = new AutoBuffer<String, 1>(*r.ps);  // separate copy of r's string buffer
+    else if (r.type == Param::REAL)
+        pd = new AutoBuffer<double, 1>(*r.pd);  // separate copy of r's double buffer
+}
+
+inline bool DictValue::isString() const
+{
+    return (type == Param::STRING);  // true only for the STRING tag
+}
+
+inline bool DictValue::isInt() const
+{
+    return (type == Param::INT);  // true only for the INT tag
+}
+
+inline bool DictValue::isReal() const
+{
+    return (type == Param::REAL || type == Param::INT);  // note: INT also counts as real
+}
+
+inline int DictValue::size() const
+{
+    switch (type)  // element count of the buffer selected by the type tag
+    {
+    case Param::INT:
+        return (int)pi->size();
+    case Param::STRING:
+        return (int)ps->size();
+    case Param::REAL:
+        return (int)pd->size();
+    case Param::BOOLEAN:  // the remaining tags fall through to the error below
+    case Param::MAT:
+    case Param::MAT_VECTOR:
+    case Param::ALGORITHM:
+    case Param::FLOAT:
+    case Param::UNSIGNED_INT:
+    case Param::UINT64:
+    case Param::UCHAR:
+    case Param::SCALAR:
+        break; // unhandled
+    }
+    CV_Error_(Error::StsInternal, ("Unhandled type (%d)", static_cast<int>(type)));  // reached only for unhandled tags
+}
+
+// Writes the elements separated by ", ". An empty value writes nothing; the
+// unguarded get<>(0) used to fail its index assertion in that case.
+inline std::ostream &operator<<(std::ostream &stream, const DictValue &dictv)
+{
+    // Type is tested first: size() raises an error for any other type.
+    if (!(dictv.isReal() || dictv.isString()) || dictv.size() == 0)
+        return stream;
+    const int last = dictv.size() - 1;
+
+    for (int i = 0; i <= last; i++)
+    {
+        if (dictv.isInt())
+            stream << dictv.get<int64>(i);
+        else if (dictv.isReal())
+            stream << dictv.get<double>(i);
+        else if (i < last)  // strings before the last one are quoted
+            stream << "\"" << dictv.get<String>(i) << "\"";
+        else                // the last (or only) string is written bare
+            stream << dictv.get<String>(i);
+        if (i < last)
+            stream << ", ";
+    }
+
+    return stream;
+}
+
+/////////////////////////////////////////////////////////////////
+
+inline bool Dict::has(const String &key) const
+{
+    return dict.find(key) != dict.end();  // true when an entry exists for key
+}
+
+inline DictValue *Dict::ptr(const String &key)
+{
+    _Dict::iterator found = dict.find(key);
+    return (found != dict.end()) ? &found->second : NULL;  // NULL when key is absent
+}
+
+inline const DictValue *Dict::ptr(const String &key) const
+{
+    _Dict::const_iterator found = dict.find(key);
+    return (found != dict.end()) ? &found->second : NULL;  // NULL when key is absent
+}
+
+inline const DictValue &Dict::get(const String &key) const
+{
+    const _Dict::const_iterator entry = dict.find(key);
+    if (dict.end() == entry)  // unlike ptr(), a missing key is an error here
+        CV_Error(Error::StsObjectNotFound, "Required argument \"" + key + "\" not found into dictionary");
+    return entry->second;
+}
+
+template <typename T>
+inline T Dict::get(const String &key) const
+{
+    return this->get(key).get<T>();  // key lookup first, then conversion of the stored value to T
+}
+
+template <typename T>
+inline T Dict::get(const String &key, const T &defaultValue) const
+{
+    // Fall back to the caller's default when the key is not stored.
+    const _Dict::const_iterator entry = dict.find(key);
+    if (entry == dict.end())
+        return defaultValue;
+
+    return entry->second.get<T>();
+}
+
+template<typename T>
+inline const T &Dict::set(const String &key, const T &value)
+{
+    // Overwrite an existing entry in place, otherwise add a new one.
+    const _Dict::iterator slot = dict.find(key);
+    if (slot == dict.end())
+        dict.insert(std::make_pair(key, DictValue(value)));
+    else
+        slot->second = DictValue(value);
+
+    return value;  // the caller's argument is handed straight back
+}
+
+inline void Dict::erase(const String &key)
+{
+    dict.erase(key);  // erase by key on the underlying map
+}
+
+inline std::ostream &operator<<(std::ostream &stream, const Dict &dict)
+{
+    // One "key : value" line per entry, in the map's iteration order.
+    const Dict::_Dict &entries = dict.dict;
+    for (Dict::_Dict::const_iterator it = entries.begin(); it != entries.end(); ++it)
+        stream << it->first << " : " << it->second << "\n";
+    return stream;
+}
+
+inline std::map<String, DictValue>::const_iterator Dict::begin() const
+{
+    return dict.begin();  // read-only iterator to the first stored entry
+}
+
+inline std::map<String, DictValue>::const_iterator Dict::end() const
+{
+    return dict.end();  // read-only past-the-end iterator
+}
+
+CV__DNN_INLINE_NS_END
+}
+}
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/dnn/layer.details.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/layer.details.hpp
new file mode 100644
index 000000000000..1133da562e2e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/layer.details.hpp
@@ -0,0 +1,78 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+#ifndef OPENCV_DNN_LAYER_DETAILS_HPP
+#define OPENCV_DNN_LAYER_DETAILS_HPP
+
+#include <opencv2/dnn/layer.hpp>
+
+namespace cv {
+namespace dnn {
+CV__DNN_INLINE_NS_BEGIN
+
+/** @brief Registers a layer constructor at runtime.
+*   @param type type name of the layer, written as a bare token (the macro stringifies it).
+*   @param constructorFunc pointer to a function of type LayerFactory::Constructor, which creates the layer.
+*   @details This macro must be placed inside function code.
+*/
+#define CV_DNN_REGISTER_LAYER_FUNC(type, constructorFunc) \
+    cv::dnn::LayerFactory::registerLayer(#type, constructorFunc);
+
+/** @brief Registers a layer class at runtime.
+ *  @param type type name of the layer, written as a bare token (the macro stringifies it).
+ *  @param class C++ class, derived from Layer; it is instantiated through its static create(params).
+ *  @details This macro must be placed inside function code.
+ */
+#define CV_DNN_REGISTER_LAYER_CLASS(type, class) \
+    cv::dnn::LayerFactory::registerLayer(#type, cv::dnn::details::_layerDynamicRegisterer<class>);
+
+/** @brief Registers a layer constructor at module load time.
+*   @param type type name of the layer, written as a bare token (stringified and pasted into an identifier).
+*   @param constructorFunc pointer to a function of type LayerFactory::Constructor, which creates the layer.
+*   @details This macro must be placed outside function code.
+*/
+#define CV_DNN_REGISTER_LAYER_FUNC_STATIC(type, constructorFunc) \
+static cv::dnn::details::_LayerStaticRegisterer __LayerStaticRegisterer_##type(#type, constructorFunc);
+
+/** @brief Registers a layer class at module load time.
+ *  @param type type name of the layer, written as a bare token (stringified and pasted into identifiers).
+ *  @param class C++ class, derived from Layer; it is instantiated with `new class(params)`.
+ *  @details This macro must be placed outside function code.
+ */
+#define CV_DNN_REGISTER_LAYER_CLASS_STATIC(type, class)                         \
+Ptr<Layer> __LayerStaticRegisterer_func_##type(LayerParams &params) \
+    { return Ptr<Layer>(new class(params)); }                       \
+static cv::dnn::details::_LayerStaticRegisterer __LayerStaticRegisterer_##type(#type, __LayerStaticRegisterer_func_##type);
+
+namespace details {
+
+template<typename LayerClass>
+Ptr<Layer> _layerDynamicRegisterer(LayerParams &params)
+{
+    return Ptr<Layer>(LayerClass::create(params));  // adapts LayerClass::create to a Ptr<Layer>-returning function
+}
+
+// Registers a layer type when constructed and unregisters it when destroyed;
+// used by the *_STATIC macros to register layers at module load time.
+class _LayerStaticRegisterer
+{
+    String type;  // name remembered for the matching unregisterLayer() call
+public:
+    _LayerStaticRegisterer(const String &layerType, LayerFactory::Constructor layerConstructor)
+        : type(layerType)
+    {
+        LayerFactory::registerLayer(layerType, layerConstructor);
+    }
+
+    ~_LayerStaticRegisterer()
+    {
+        LayerFactory::unregisterLayer(type);
+    }
+};
+
+} // namespace
+CV__DNN_INLINE_NS_END
+}} // namespace
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/dnn/layer.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/layer.hpp
new file mode 100644
index 000000000000..a4d167564d90
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/layer.hpp
@@ -0,0 +1,88 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_DNN_LAYER_HPP
+#define OPENCV_DNN_LAYER_HPP
+#include <opencv2/dnn.hpp>
+
+namespace cv {
+namespace dnn {
+CV__DNN_INLINE_NS_BEGIN
+//! @addtogroup dnn
+//! @{
+//!
+//! @defgroup dnnLayerFactory Utilities for New Layers Registration
+//! @{
+
+/** @brief %Layer factory: registers layer constructors and creates layer instances by type name. */
+class CV_EXPORTS LayerFactory
+{
+public:
+
+    //! Signature of the constructor function that each layer type supplies to the factory.
+    typedef Ptr<Layer>(*Constructor)(LayerParams &params);
+
+    //! Registers the layer class with typename @p type and specified @p constructor. Thread-safe.
+    static void registerLayer(const String &type, Constructor constructor);
+
+    //! Unregisters registered layer with specified type name. Thread-safe.
+    static void unregisterLayer(const String &type);
+
+    //! Checks whether a layer with type name @p type is registered.
+    static bool isLayerRegistered(const std::string& type);
+
+    /** @brief Creates instance of registered layer.
+     *  @param type type name of the layer to create.
+     *  @param params parameters which will be used for layer initialization.
+     *  @note Thread-safe.
+     */
+    static Ptr<Layer> createLayerInstance(const String &type, LayerParams& params);
+
+private:
+    LayerFactory();  // private: the public interface consists of static members only
+};
+
+//! @}
+//! @}
+CV__DNN_INLINE_NS_END
+}
+}
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/dnn/shape_utils.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/shape_utils.hpp
new file mode 100644
index 000000000000..6f4c0d57a923
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/shape_utils.hpp
@@ -0,0 +1,290 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_DNN_DNN_SHAPE_UTILS_HPP
+#define OPENCV_DNN_DNN_SHAPE_UTILS_HPP
+
+#include <opencv2/dnn/dnn.hpp>
+#include <opencv2/core/types_c.h>  // CV_MAX_DIM
+#include <iostream>
+#include <ostream>
+#include <sstream>
+
+namespace cv {
+namespace dnn {
+CV__DNN_INLINE_NS_BEGIN
+
+//Slicing
+
+struct _Range : public cv::Range
+{
+    _Range(const Range &r) : cv::Range(r) {}  // implicit conversion from an existing Range
+    _Range(int start_, int size_ = 1) : cv::Range(start_, start_ + size_) {}  // start plus length: [start_, start_ + size_)
+};
+
+static inline Mat slice(const Mat &m, const _Range &r0)
+{
+    Range sel[CV_MAX_DIM];
+    sel[0] = r0;  // axis 0 is restricted to r0 ...
+    for (int axis = 1; axis < m.dims; ++axis)
+        sel[axis] = Range::all();  // ... every other axis is kept whole
+    return m(sel);
+}
+
+static inline Mat slice(const Mat &m, const _Range &r0, const _Range &r1)
+{
+    CV_Assert(m.dims >= 2);
+    Range sel[CV_MAX_DIM];
+    sel[0] = r0;  // the two leading axes take the given ranges ...
+    sel[1] = r1;
+    for (int axis = 2; axis < m.dims; ++axis)
+        sel[axis] = Range::all();  // ... trailing axes are kept whole
+    return m(sel);
+}
+
+static inline Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2)
+{
+    CV_Assert(m.dims >= 3);
+    Range sel[CV_MAX_DIM];
+    sel[0] = r0;  // the three leading axes take the given ranges ...
+    sel[1] = r1;
+    sel[2] = r2;
+    for (int axis = 3; axis < m.dims; ++axis)
+        sel[axis] = Range::all();  // ... trailing axes are kept whole
+    return m(sel);
+}
+
+static inline Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2, const _Range &r3)
+{
+    CV_Assert(m.dims >= 4);
+    Range sel[CV_MAX_DIM];
+    sel[0] = r0;  // the four leading axes take the given ranges ...
+    sel[1] = r1;
+    sel[2] = r2;
+    sel[3] = r3;
+    for (int axis = 4; axis < m.dims; ++axis)
+        sel[axis] = Range::all();  // ... trailing axes are kept whole
+    return m(sel);
+}
+
+static inline Mat getPlane(const Mat &m, int n, int cn)
+{
+    CV_Assert(m.dims > 2);
+    int sz[CV_MAX_DIM];
+    for(int i = 2; i < m.dims; i++)
+    {
+        sz[i-2] = m.size.p[i];  // sizes of the axes after the first two
+    }
+    return Mat(m.dims - 2, sz, m.type(), (void*)m.ptr<float>(n, cn));  // Mat built over m's data starting at index (n, cn)
+}
+
+static inline MatShape shape(const int* dims, const int n)
+{
+    MatShape result;
+    result.assign(dims, dims + n);  // copies dims[0 .. n)
+    return result;
+}
+
+static inline MatShape shape(const Mat& mat)
+{
+    return shape(mat.size.p, mat.dims);  // one entry per axis of mat
+}
+
+static inline MatShape shape(const MatSize& sz)
+{
+    return shape(sz.p, sz.dims());  // one entry per axis described by sz
+}
+
+static inline MatShape shape(const UMat& mat)
+{
+    return shape(mat.size.p, mat.dims);  // one entry per axis of mat
+}
+
+#if 0  // issues with MatExpr wrapped into InputArray
+static inline
+MatShape shape(InputArray input)
+{
+    int sz[CV_MAX_DIM];
+    int ndims = input.sizend(sz);
+    return shape(sz, ndims);
+}
+#endif
+
+static inline bool is_neg(int i) { return i < 0; }  // predicate: true for negative values
+
+static inline MatShape shape(int a0, int a1=-1, int a2=-1, int a3=-1)
+{
+    int dims[] = {a0, a1, a2, a3};
+    MatShape s = shape(dims, 4);
+    s.erase(std::remove_if(s.begin(), s.end(), is_neg), s.end());  // drops every negative entry, including the -1 defaults
+    return s;
+}
+
+// Product of the extents in [start, end); -1 stands for "from the first axis"
+// and "through the last axis" respectively. An empty shape yields 0.
+static inline int total(const MatShape& shape, int start = -1, int end = -1)
+{
+    if (shape.empty())
+        return 0;
+
+    const int dims = (int)shape.size();
+    start = (start == -1) ? 0 : start;
+    end = (end == -1) ? dims : end;
+
+    CV_CheckLE(0, start, "");
+    CV_CheckLE(start, end, "");
+    CV_CheckLE(end, dims, "");
+
+    int product = 1;
+    int axis = start;
+    while (axis < end)
+        product *= shape[axis++];
+    return product;
+}
+
+// TODO: rename to countDimsElements()
+static inline int total(const Mat& mat, int start = -1, int end = -1)
+{
+    if (mat.empty())
+        return 0;  // an empty matrix counts as zero elements
+
+    int dims = mat.dims;
+
+    if (start == -1) start = 0;   // -1 means "from the first axis"
+    if (end == -1) end = dims;    // -1 means "through the last axis"
+
+    CV_CheckLE(0, start, "");
+    CV_CheckLE(start, end, "");
+    CV_CheckLE(end, dims, "");
+
+    int elems = 1;
+    for (int i = start; i < end; i++)
+    {
+        elems *= mat.size[i];  // product of the extents of axes [start, end)
+    }
+    return elems;
+}
+
+static inline MatShape concat(const MatShape& a, const MatShape& b)
+{
+    // Start from a copy of a, then append every extent of b after it.
+    MatShape joined(a);
+    joined.insert(joined.end(), b.begin(), b.end());
+    return joined;
+}
+
+template<typename _Tp>
+static inline std::string toString(const std::vector<_Tp>& shape, const String& name = "")
+{
+    std::ostringstream out;
+    if (!name.empty())
+        out << name << ' ';
+    out << '[';  // every element is preceded by a space, e.g. "[ 1 2 3 ]"
+    for (typename std::vector<_Tp>::const_iterator it = shape.begin(); it != shape.end(); ++it)
+        out << ' ' << *it;
+    out << " ]";
+    return out.str();
+}
+
+template<typename _Tp>
+static inline void print(const std::vector<_Tp>& shape, const String& name = "")
+{
+    std::cout << toString(shape, name) << std::endl;  // one line on stdout; std::endl also flushes
+}
+template<typename _Tp>
+static inline std::ostream& operator<<(std::ostream &out, const std::vector<_Tp>& shape)
+{
+    out << toString(shape);  // same bracketed format as toString(), without a name prefix
+    return out;
+}
+
+/// @brief Converts axis from `[-dims; dims)` (similar to Python's slice notation) to `[0; dims)` range.
+static inline
+int normalize_axis(int axis, int dims)
+{
+    CV_Check(axis, axis >= -dims && axis < dims, "");  // reject out-of-range input first
+    axis = (axis < 0) ? (dims + axis) : axis;          // negative axes count back from dims
+    CV_DbgCheck(axis, axis >= 0 && axis < dims, "");   // re-check of the converted value
+    return axis;
+}
+
+static inline
+int normalize_axis(int axis, const MatShape& shape)
+{
+    return normalize_axis(axis, (int)shape.size());  // number of axes is taken from the shape's length
+}
+
+static inline
+Range normalize_axis_range(const Range& r, int axisSize)
+{
+    if (r == Range::all())
+        return Range(0, axisSize);  // "all" expands to the full [0, axisSize)
+    CV_CheckGE(r.start, 0, "");
+    Range clamped(r.start,  // end > 0: capped at axisSize; end <= 0: axisSize + end + 1 (so -1 maps to axisSize)
+                  r.end > 0 ? std::min(r.end, axisSize) : axisSize + r.end + 1);
+    CV_DbgCheckGE(clamped.start, 0, "");
+    CV_CheckLT(clamped.start, clamped.end, "");  // an empty result range is rejected
+    CV_CheckLE(clamped.end, axisSize, "");
+    return clamped;
+}
+
+/// @brief Tells whether every extent of inputShape in [startPos, endPos) is 1.
+static inline
+bool isAllOnes(const MatShape &inputShape, int startPos, int endPos)
+{
+    CV_Assert(!inputShape.empty());
+    CV_CheckGE((int) inputShape.size(), startPos, "");
+    CV_CheckGE(startPos, 0, "");
+    CV_CheckLE(startPos, endPos, "");
+    CV_CheckLE((size_t)endPos, inputShape.size(), "");
+
+    // Signed index: both bounds are ints already validated as non-negative, so
+    // this avoids the signed/unsigned comparison the size_t index produced.
+    for (int i = startPos; i < endPos; i++)
+        if (inputShape[i] != 1)
+            return false;
+    return true;
+}
+
+CV__DNN_INLINE_NS_END
+}
+}
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/dnn/utils/debug_utils.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/utils/debug_utils.hpp
new file mode 100644
index 000000000000..71dd3ab8d670
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/utils/debug_utils.hpp
@@ -0,0 +1,24 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_UTILS_DEBUG_UTILS_HPP
+#define OPENCV_DNN_UTILS_DEBUG_UTILS_HPP
+
+#include "../dnn.hpp"
+
+namespace cv { namespace dnn {
+CV__DNN_INLINE_NS_BEGIN
+
+/**
+ * @brief Skip model import after diagnostic run in readNet() functions.
+ * @param[in] skip Indicates whether to skip the import.
+ *
+ * This is an internal OpenCV function not intended for users.
+ */
+CV_EXPORTS void skipModelImport(bool skip);
+
+CV__DNN_INLINE_NS_END
+}} // namespace
+
+#endif // OPENCV_DNN_UTILS_DEBUG_UTILS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/dnn/utils/inference_engine.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/utils/inference_engine.hpp
new file mode 100644
index 000000000000..b81806ed5a1a
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/utils/inference_engine.hpp
@@ -0,0 +1,82 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2019, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#ifndef OPENCV_DNN_UTILS_INF_ENGINE_HPP
+#define OPENCV_DNN_UTILS_INF_ENGINE_HPP
+
+#include "../dnn.hpp"
+
+namespace cv { namespace dnn {
+CV__DNN_INLINE_NS_BEGIN
+
+
+/* Values for 'OPENCV_DNN_BACKEND_INFERENCE_ENGINE_TYPE' parameter */
+/// @deprecated
+#define CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API     "NN_BUILDER"
+/// @deprecated
+#define CV_DNN_BACKEND_INFERENCE_ENGINE_NGRAPH             "NGRAPH"
+
+/** @brief Returns Inference Engine internal backend API.
+ *
+ * See values of `CV_DNN_BACKEND_INFERENCE_ENGINE_*` macros.
+ *
+ * `OPENCV_DNN_BACKEND_INFERENCE_ENGINE_TYPE` runtime parameter (environment variable) is ignored since 4.6.0.
+ *
+ * @deprecated
+ */
+CV_EXPORTS_W cv::String getInferenceEngineBackendType();
+
+/** @brief Specify Inference Engine internal backend API.
+ *
+ * See values of `CV_DNN_BACKEND_INFERENCE_ENGINE_*` macros.
+ *
+ * @returns previous value of internal backend API
+ *
+ * @deprecated
+ */
+CV_EXPORTS_W cv::String setInferenceEngineBackendType(const cv::String& newBackendType);
+
+
+/** @brief Release a Myriad device (bound by OpenCV).
+ *
+ * A single Myriad device cannot be shared across multiple processes that use
+ * Inference Engine's Myriad plugin.
+ */
+CV_EXPORTS_W void resetMyriadDevice();
+
+
+/* Values for 'OPENCV_DNN_IE_VPU_TYPE' parameter */
+#define CV_DNN_INFERENCE_ENGINE_VPU_TYPE_UNSPECIFIED ""
+/// Intel(R) Movidius(TM) Neural Compute Stick, NCS (USB 03e7:2150), Myriad2 (https://software.intel.com/en-us/movidius-ncs)
+#define CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_2 "Myriad2"
+/// Intel(R) Neural Compute Stick 2, NCS2 (USB 03e7:2485), MyriadX (https://software.intel.com/ru-ru/neural-compute-stick)
+#define CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X "MyriadX"
+#define CV_DNN_INFERENCE_ENGINE_CPU_TYPE_ARM_COMPUTE "ARM_COMPUTE"
+#define CV_DNN_INFERENCE_ENGINE_CPU_TYPE_X86         "X86"
+
+
+/** @brief Returns Inference Engine VPU type.
+ *
+ * See values of `CV_DNN_INFERENCE_ENGINE_VPU_TYPE_*` macros.
+ */
+CV_EXPORTS_W cv::String getInferenceEngineVPUType();
+
+/** @brief Returns Inference Engine CPU type.
+ *
+ * Specify OpenVINO plugin: CPU or ARM.
+ */
+CV_EXPORTS_W cv::String getInferenceEngineCPUType();
+
+/** @brief Release a HDDL plugin.
+ */
+CV_EXPORTS_W void releaseHDDLPlugin();
+
+
+CV__DNN_INLINE_NS_END
+}} // namespace
+
+#endif // OPENCV_DNN_UTILS_INF_ENGINE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/dnn/version.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/version.hpp
new file mode 100644
index 000000000000..f83d90dab410
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/dnn/version.hpp
@@ -0,0 +1,21 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_VERSION_HPP
+#define OPENCV_DNN_VERSION_HPP
+
+/// Use with major OpenCV version only.
+#define OPENCV_DNN_API_VERSION 20240521
+
+#if !defined CV_DOXYGEN && !defined CV_STATIC_ANALYSIS && !defined CV_DNN_DONT_ADD_INLINE_NS
+#define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION)
+#define CV__DNN_INLINE_NS_BEGIN namespace CV__DNN_INLINE_NS {
+#define CV__DNN_INLINE_NS_END }
+namespace cv { namespace dnn { namespace CV__DNN_INLINE_NS { } using namespace CV__DNN_INLINE_NS; }}
+#else
+#define CV__DNN_INLINE_NS_BEGIN
+#define CV__DNN_INLINE_NS_END
+#endif
+
+#endif  // OPENCV_DNN_VERSION_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/features2d.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/features2d.hpp
new file mode 100644
index 000000000000..b4c4dde7121f
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/features2d.hpp
@@ -0,0 +1,1602 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_FEATURES_2D_HPP
+#define OPENCV_FEATURES_2D_HPP
+
+#include "opencv2/opencv_modules.hpp"
+#include "opencv2/core.hpp"
+
+#ifdef HAVE_OPENCV_FLANN
+#include "opencv2/flann/miniflann.hpp"
+#endif
+
+/**
+  @defgroup features2d 2D Features Framework
+  @{
+    @defgroup features2d_main Feature Detection and Description
+    @defgroup features2d_match Descriptor Matchers
+
+    Matchers of keypoint descriptors in OpenCV have wrappers with a common interface that enables
+    you to easily switch between different algorithms solving the same problem. This section is
+    devoted to matching descriptors that are represented as vectors in a multidimensional space.
+    All objects that implement vector descriptor matchers inherit the DescriptorMatcher interface.
+
+    @defgroup features2d_draw Drawing Function of Keypoints and Matches
+    @defgroup features2d_category Object Categorization
+
+    This section describes approaches based on local 2D features and used to categorize objects.
+
+    @defgroup feature2d_hal Hardware Acceleration Layer
+    @{
+        @defgroup features2d_hal_interface Interface
+    @}
+  @}
+ */
+
+namespace cv
+{
+
+//! @addtogroup features2d_main
+//! @{
+
+// //! writes vector of keypoints to the file storage
+// CV_EXPORTS void write(FileStorage& fs, const String& name, const std::vector<KeyPoint>& keypoints);
+// //! reads vector of keypoints from the specified file storage node
+// CV_EXPORTS void read(const FileNode& node, CV_OUT std::vector<KeyPoint>& keypoints);
+
+/** @brief A class that filters a vector of keypoints.
+
+ Because it is currently difficult to provide a convenient interface for all usage scenarios of the
+ keypoints filter class, it only has the few static methods that have been needed so far.
+ */
+class CV_EXPORTS KeyPointsFilter
+{
+public:
+    KeyPointsFilter(){}
+
+    /*
+     * Remove keypoints within borderSize pixels of an image edge.
+     */
+    static void runByImageBorder( std::vector<KeyPoint>& keypoints, Size imageSize, int borderSize );
+    /*
+     * Remove keypoints of sizes out of range.
+     */
+    static void runByKeypointSize( std::vector<KeyPoint>& keypoints, float minSize,
+                                   float maxSize=FLT_MAX );
+    /*
+     * Remove keypoints from some image by mask for pixels of this image.
+     */
+    static void runByPixelsMask( std::vector<KeyPoint>& keypoints, const Mat& mask );
+    /*
+     * Remove objects from some image and a vector of points by mask for pixels of this image
+     */
+    static void runByPixelsMask2VectorPoint(std::vector<KeyPoint> &keypoints, std::vector<std::vector<Point> > &removeFrom, const Mat &mask);
+    /*
+     * Remove duplicated keypoints.
+     */
+    static void removeDuplicated( std::vector<KeyPoint>& keypoints );
+    /*
+     * Remove duplicated keypoints and sort the remaining keypoints
+     */
+    static void removeDuplicatedSorted( std::vector<KeyPoint>& keypoints );
+
+    /*
+     * Retain the specified number of the best keypoints (according to the response)
+     */
+    static void retainBest( std::vector<KeyPoint>& keypoints, int npoints );
+};
+
+
+/************************************ Base Classes ************************************/
+
+/** @brief Abstract base class for 2D image feature detectors and descriptor extractors
+*/
+#ifdef __EMSCRIPTEN__
+class CV_EXPORTS_W Feature2D : public Algorithm
+#else
+class CV_EXPORTS_W Feature2D : public virtual Algorithm
+#endif
+{
+public:
+    virtual ~Feature2D();
+
+    /** @brief Detects keypoints in an image (first variant) or image set (second variant).
+
+    @param image Image.
+    @param keypoints The detected keypoints. In the second variant of the method keypoints[i] is a set
+    of keypoints detected in images[i] .
+    @param mask Mask specifying where to look for keypoints (optional). It must be a 8-bit integer
+    matrix with non-zero values in the region of interest.
+     */
+    CV_WRAP virtual void detect( InputArray image,
+                                 CV_OUT std::vector<KeyPoint>& keypoints,
+                                 InputArray mask=noArray() );
+
+    /** @overload
+    @param images Image set.
+    @param keypoints The detected keypoints. In the second variant of the method keypoints[i] is a set
+    of keypoints detected in images[i] .
+    @param masks Masks for each input image specifying where to look for keypoints (optional).
+    masks[i] is a mask for images[i].
+    */
+    CV_WRAP virtual void detect( InputArrayOfArrays images,
+                         CV_OUT std::vector<std::vector<KeyPoint> >& keypoints,
+                         InputArrayOfArrays masks=noArray() );
+
+    /** @brief Computes the descriptors for a set of keypoints detected in an image (first variant) or image set
+    (second variant).
+
+    @param image Image.
+    @param keypoints Input collection of keypoints. Keypoints for which a descriptor cannot be
+    computed are removed. Sometimes new keypoints can be added, for example: SIFT duplicates keypoint
+    with several dominant orientations (for each orientation).
+    @param descriptors Computed descriptors. In the second variant of the method descriptors[i] are
+    descriptors computed for keypoints[i]. Row j of descriptors (or descriptors[i]) is the
+    descriptor for the j-th keypoint.
+     */
+    CV_WRAP virtual void compute( InputArray image,
+                                  CV_OUT CV_IN_OUT std::vector<KeyPoint>& keypoints,
+                                  OutputArray descriptors );
+
+    /** @overload
+
+    @param images Image set.
+    @param keypoints Input collection of keypoints. Keypoints for which a descriptor cannot be
+    computed are removed. Sometimes new keypoints can be added, for example: SIFT duplicates keypoint
+    with several dominant orientations (for each orientation).
+    @param descriptors Computed descriptors. In the second variant of the method descriptors[i] are
+    descriptors computed for keypoints[i]. Row j of descriptors (or descriptors[i]) is the
+    descriptor for the j-th keypoint.
+    */
+    CV_WRAP virtual void compute( InputArrayOfArrays images,
+                          CV_OUT CV_IN_OUT std::vector<std::vector<KeyPoint> >& keypoints,
+                          OutputArrayOfArrays descriptors );
+
+    /** Detects keypoints and computes the descriptors */
+    CV_WRAP virtual void detectAndCompute( InputArray image, InputArray mask,
+                                           CV_OUT std::vector<KeyPoint>& keypoints,
+                                           OutputArray descriptors,
+                                           bool useProvidedKeypoints=false );
+
+    CV_WRAP virtual int descriptorSize() const;
+    CV_WRAP virtual int descriptorType() const;
+    CV_WRAP virtual int defaultNorm() const;
+
+    CV_WRAP void write( const String& fileName ) const;
+
+    CV_WRAP void read( const String& fileName );
+
+    virtual void write( FileStorage&) const CV_OVERRIDE;
+
+    // see corresponding cv::Algorithm method
+    CV_WRAP virtual void read( const FileNode&) CV_OVERRIDE;
+
+    //! Return true if detector object is empty
+    CV_WRAP virtual bool empty() const CV_OVERRIDE;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+
+    // see corresponding cv::Algorithm method
+    CV_WRAP inline void write(FileStorage& fs, const String& name) const { Algorithm::write(fs, name); }
+#if CV_VERSION_MAJOR < 5
+    inline void write(const Ptr<FileStorage>& fs, const String& name) const { CV_Assert(fs); Algorithm::write(*fs, name); }
+#endif
+};
+
+/** Feature detectors in OpenCV have wrappers with a common interface that enables you to easily switch
+between different algorithms solving the same problem. All objects that implement keypoint detectors
+inherit the FeatureDetector interface. */
+typedef Feature2D FeatureDetector;
+
+/** Extractors of keypoint descriptors in OpenCV have wrappers with a common interface that enables you
+to easily switch between different algorithms solving the same problem. This section is devoted to
+computing descriptors represented as vectors in a multidimensional space. All objects that implement
+the vector descriptor extractors inherit the DescriptorExtractor interface.
+ */
+typedef Feature2D DescriptorExtractor;
+
+
+/** @brief Class for implementing the wrapper which makes detectors and extractors to be affine invariant,
+described as ASIFT in @cite YM11 .
+*/
+class CV_EXPORTS_W AffineFeature : public Feature2D
+{
+public:
+    /**
+    @param backend The detector/extractor you want to use as backend.
+    @param maxTilt The highest power index of tilt factor. 5 is used in the paper as tilt sampling range n.
+    @param minTilt The lowest power index of tilt factor. 0 is used in the paper.
+    @param tiltStep Tilt sampling step \f$\delta_t\f$ in Algorithm 1 in the paper.
+    @param rotateStepBase Rotation sampling step factor b in Algorithm 1 in the paper.
+    */
+    CV_WRAP static Ptr<AffineFeature> create(const Ptr<Feature2D>& backend,
+        int maxTilt = 5, int minTilt = 0, float tiltStep = 1.4142135623730951f, float rotateStepBase = 72);
+
+    CV_WRAP virtual void setViewParams(const std::vector<float>& tilts, const std::vector<float>& rolls) = 0;
+    CV_WRAP virtual void getViewParams(std::vector<float>& tilts, std::vector<float>& rolls) const = 0;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+typedef AffineFeature AffineFeatureDetector;
+typedef AffineFeature AffineDescriptorExtractor;
+
+
+/** @brief Class for extracting keypoints and computing descriptors using the Scale Invariant Feature Transform
+(SIFT) algorithm by D. Lowe @cite Lowe04 .
+*/
+class CV_EXPORTS_W SIFT : public Feature2D
+{
+public:
+    /**
+    @param nfeatures The number of best features to retain. The features are ranked by their scores
+    (measured in SIFT algorithm as the local contrast)
+
+    @param nOctaveLayers The number of layers in each octave. 3 is the value used in D. Lowe paper. The
+    number of octaves is computed automatically from the image resolution.
+
+    @param contrastThreshold The contrast threshold used to filter out weak features in semi-uniform
+    (low-contrast) regions. The larger the threshold, the less features are produced by the detector.
+
+    @note The contrast threshold will be divided by nOctaveLayers when the filtering is applied. When
+    nOctaveLayers is set to default and if you want to use the value used in D. Lowe paper, 0.03, set
+    this argument to 0.09.
+
+    @param edgeThreshold The threshold used to filter out edge-like features. Note that its meaning
+    is different from the contrastThreshold, i.e. the larger the edgeThreshold, the less features are
+    filtered out (more features are retained).
+
+    @param sigma The sigma of the Gaussian applied to the input image at the octave \#0. If your image
+    is captured with a weak camera with soft lenses, you might want to reduce the number.
+
+    @param enable_precise_upscale Whether to enable precise upscaling in the scale pyramid, which maps
+    index \f$\texttt{x}\f$ to \f$\texttt{2x}\f$. This prevents localization bias. The option
+    is disabled by default.
+    */
+    CV_WRAP static Ptr<SIFT> create(int nfeatures = 0, int nOctaveLayers = 3,
+        double contrastThreshold = 0.04, double edgeThreshold = 10,
+        double sigma = 1.6, bool enable_precise_upscale = false);
+
+    /** @brief Create SIFT with specified descriptorType.
+    @param nfeatures The number of best features to retain. The features are ranked by their scores
+    (measured in SIFT algorithm as the local contrast)
+
+    @param nOctaveLayers The number of layers in each octave. 3 is the value used in D. Lowe paper. The
+    number of octaves is computed automatically from the image resolution.
+
+    @param contrastThreshold The contrast threshold used to filter out weak features in semi-uniform
+    (low-contrast) regions. The larger the threshold, the less features are produced by the detector.
+
+    @note The contrast threshold will be divided by nOctaveLayers when the filtering is applied. When
+    nOctaveLayers is set to default and if you want to use the value used in D. Lowe paper, 0.03, set
+    this argument to 0.09.
+
+    @param edgeThreshold The threshold used to filter out edge-like features. Note that its meaning
+    is different from the contrastThreshold, i.e. the larger the edgeThreshold, the less features are
+    filtered out (more features are retained).
+
+    @param sigma The sigma of the Gaussian applied to the input image at the octave \#0. If your image
+    is captured with a weak camera with soft lenses, you might want to reduce the number.
+
+    @param descriptorType The type of descriptors. Only CV_32F and CV_8U are supported.
+
+    @param enable_precise_upscale Whether to enable precise upscaling in the scale pyramid, which maps
+    index \f$\texttt{x}\f$ to \f$\texttt{2x}\f$. This prevents localization bias. The option
+    is disabled by default.
+    */
+    CV_WRAP static Ptr<SIFT> create(int nfeatures, int nOctaveLayers,
+        double contrastThreshold, double edgeThreshold,
+        double sigma, int descriptorType, bool enable_precise_upscale = false);
+
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+
+    CV_WRAP virtual void setNFeatures(int maxFeatures) = 0;
+    CV_WRAP virtual int getNFeatures() const = 0;
+
+    CV_WRAP virtual void setNOctaveLayers(int nOctaveLayers) = 0;
+    CV_WRAP virtual int getNOctaveLayers() const = 0;
+
+    CV_WRAP virtual void setContrastThreshold(double contrastThreshold) = 0;
+    CV_WRAP virtual double getContrastThreshold() const = 0;
+
+    CV_WRAP virtual void setEdgeThreshold(double edgeThreshold) = 0;
+    CV_WRAP virtual double getEdgeThreshold() const = 0;
+
+    CV_WRAP virtual void setSigma(double sigma) = 0;
+    CV_WRAP virtual double getSigma() const = 0;
+};
+
+typedef SIFT SiftFeatureDetector;
+typedef SIFT SiftDescriptorExtractor;
+
+
+/** @brief Class implementing the BRISK keypoint detector and descriptor extractor, described in @cite LCS11 .
+ */
+class CV_EXPORTS_W BRISK : public Feature2D
+{
+public:
+    /** @brief The BRISK constructor
+
+    @param thresh AGAST detection threshold score.
+    @param octaves detection octaves. Use 0 to do single scale.
+    @param patternScale apply this scale to the pattern used for sampling the neighbourhood of a
+    keypoint.
+     */
+    CV_WRAP static Ptr<BRISK> create(int thresh=30, int octaves=3, float patternScale=1.0f);
+
+    /** @brief The BRISK constructor for a custom pattern
+
+    @param radiusList defines the radii (in pixels) where the samples around a keypoint are taken (for
+    keypoint scale 1).
+    @param numberList defines the number of sampling points on the sampling circle. Must be the same
+    size as radiusList.
+    @param dMax threshold for the short pairings used for descriptor formation (in pixels for keypoint
+    scale 1).
+    @param dMin threshold for the long pairings used for orientation determination (in pixels for
+    keypoint scale 1).
+    @param indexChange index remapping of the bits. */
+    CV_WRAP static Ptr<BRISK> create(const std::vector<float> &radiusList, const std::vector<int> &numberList,
+        float dMax=5.85f, float dMin=8.2f, const std::vector<int>& indexChange=std::vector<int>());
+
+    /** @brief The BRISK constructor for a custom pattern, detection threshold and octaves
+
+    @param thresh AGAST detection threshold score.
+    @param octaves detection octaves. Use 0 to do single scale.
+    @param radiusList defines the radii (in pixels) where the samples around a keypoint are taken (for
+    keypoint scale 1).
+    @param numberList defines the number of sampling points on the sampling circle. Must be the same
+    size as radiusList.
+    @param dMax threshold for the short pairings used for descriptor formation (in pixels for keypoint
+    scale 1).
+    @param dMin threshold for the long pairings used for orientation determination (in pixels for
+    keypoint scale 1).
+    @param indexChange index remapping of the bits. */
+    CV_WRAP static Ptr<BRISK> create(int thresh, int octaves, const std::vector<float> &radiusList,
+        const std::vector<int> &numberList, float dMax=5.85f, float dMin=8.2f,
+        const std::vector<int>& indexChange=std::vector<int>());
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+
+    /** @brief Set detection threshold.
+    @param threshold AGAST detection threshold score.
+    */
+    CV_WRAP virtual void setThreshold(int threshold) = 0;
+    CV_WRAP virtual int getThreshold() const = 0;
+
+    /** @brief Set detection octaves.
+    @param octaves detection octaves. Use 0 to do single scale.
+    */
+    CV_WRAP virtual void setOctaves(int octaves) = 0;
+    CV_WRAP virtual int getOctaves() const = 0;
+    /** @brief Set detection patternScale.
+    @param patternScale apply this scale to the pattern used for sampling the neighbourhood of a
+    keypoint.
+    */
+    CV_WRAP virtual void setPatternScale(float patternScale) = 0;
+    CV_WRAP virtual float getPatternScale() const = 0;
+};
+
+/** @brief Class implementing the ORB (*oriented BRIEF*) keypoint detector and descriptor extractor
+
+described in @cite RRKB11 . The algorithm uses FAST in pyramids to detect stable keypoints, selects
+the strongest features using FAST or Harris response, finds their orientation using first-order
+moments and computes the descriptors using BRIEF (where the coordinates of random point pairs (or
+k-tuples) are rotated according to the measured orientation).
+ */
+class CV_EXPORTS_W ORB : public Feature2D
+{
+public:
+    enum ScoreType { HARRIS_SCORE=0, FAST_SCORE=1 };
+    static const int kBytes = 32;
+
+    /** @brief The ORB constructor
+
+    @param nfeatures The maximum number of features to retain.
+    @param scaleFactor Pyramid decimation ratio, greater than 1. scaleFactor==2 means the classical
+    pyramid, where each next level has 4x less pixels than the previous, but such a big scale factor
+    will degrade feature matching scores dramatically. On the other hand, too close to 1 scale factor
+    will mean that to cover certain scale range you will need more pyramid levels and so the speed
+    will suffer.
+    @param nlevels The number of pyramid levels. The smallest level will have linear size equal to
+    input_image_linear_size/pow(scaleFactor, nlevels - firstLevel).
+    @param edgeThreshold This is size of the border where the features are not detected. It should
+    roughly match the patchSize parameter.
+    @param firstLevel The level of pyramid to put source image to. Previous layers are filled
+    with upscaled source image.
+    @param WTA_K The number of points that produce each element of the oriented BRIEF descriptor. The
+    default value 2 means the BRIEF where we take a random point pair and compare their brightnesses,
+    so we get 0/1 response. Other possible values are 3 and 4. For example, 3 means that we take 3
+    random points (of course, those point coordinates are random, but they are generated from the
+    pre-defined seed, so each element of BRIEF descriptor is computed deterministically from the pixel
+    rectangle), find point of maximum brightness and output index of the winner (0, 1 or 2). Such
+    output will occupy 2 bits, and therefore it will need a special variant of Hamming distance,
+    denoted as NORM_HAMMING2 (2 bits per bin). When WTA_K=4, we take 4 random points to compute each
+    bin (that will also occupy 2 bits with possible values 0, 1, 2 or 3).
+    @param scoreType The default HARRIS_SCORE means that Harris algorithm is used to rank features
+    (the score is written to KeyPoint::response and is used to retain best nfeatures features);
+    FAST_SCORE is alternative value of the parameter that produces slightly less stable keypoints,
+    but it is a little faster to compute.
+    @param patchSize size of the patch used by the oriented BRIEF descriptor. Of course, on smaller
+    pyramid layers the perceived image area covered by a feature will be larger.
+    @param fastThreshold the fast threshold
+     */
+    CV_WRAP static Ptr<ORB> create(int nfeatures=500, float scaleFactor=1.2f, int nlevels=8, int edgeThreshold=31,
+        int firstLevel=0, int WTA_K=2, ORB::ScoreType scoreType=ORB::HARRIS_SCORE, int patchSize=31, int fastThreshold=20);
+
+    CV_WRAP virtual void setMaxFeatures(int maxFeatures) = 0;
+    CV_WRAP virtual int getMaxFeatures() const = 0;
+
+    CV_WRAP virtual void setScaleFactor(double scaleFactor) = 0;
+    CV_WRAP virtual double getScaleFactor() const = 0;
+
+    CV_WRAP virtual void setNLevels(int nlevels) = 0;
+    CV_WRAP virtual int getNLevels() const = 0;
+
+    CV_WRAP virtual void setEdgeThreshold(int edgeThreshold) = 0;
+    CV_WRAP virtual int getEdgeThreshold() const = 0;
+
+    CV_WRAP virtual void setFirstLevel(int firstLevel) = 0;
+    CV_WRAP virtual int getFirstLevel() const = 0;
+
+    CV_WRAP virtual void setWTA_K(int wta_k) = 0;
+    CV_WRAP virtual int getWTA_K() const = 0;
+
+    CV_WRAP virtual void setScoreType(ORB::ScoreType scoreType) = 0;
+    CV_WRAP virtual ORB::ScoreType getScoreType() const = 0;
+
+    CV_WRAP virtual void setPatchSize(int patchSize) = 0;
+    CV_WRAP virtual int getPatchSize() const = 0;
+
+    CV_WRAP virtual void setFastThreshold(int fastThreshold) = 0;
+    CV_WRAP virtual int getFastThreshold() const = 0;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+/** @brief Maximally stable extremal region extractor
+
+The class encapsulates all the parameters of the %MSER extraction algorithm (see [wiki
+article](http://en.wikipedia.org/wiki/Maximally_stable_extremal_regions)).
+
+- there are two different implementations of %MSER: one for grey images, one for color images
+
+- the grey image algorithm is taken from: @cite nister2008linear ;  the paper claims to be faster
+than the union-find method; it actually gets 1.5~2m/s on my centrino L7200 1.2GHz laptop.
+
+- the color image algorithm is taken from: @cite forssen2007maximally ; it should be much slower
+than grey image method ( 3~4 times )
+
+- (Python) A complete example showing the use of the %MSER detector can be found at samples/python/mser.py
+*/
+class CV_EXPORTS_W MSER : public Feature2D
+{
+public:
+    /** @brief Full constructor for %MSER detector
+
+    @param delta it compares \f$(size_{i}-size_{i-delta})/size_{i-delta}\f$
+    @param min_area prune areas that are smaller than min_area
+    @param max_area prune areas that are bigger than max_area
+    @param max_variation prune areas that have a similar size to their children
+    @param min_diversity for color image, trace back to cut off mser with diversity less than min_diversity
+    @param max_evolution  for color image, the evolution steps
+    @param area_threshold for color image, the area threshold to cause re-initialize
+    @param min_margin for color image, ignore too small margin
+    @param edge_blur_size for color image, the aperture size for edge blur
+     */
+    CV_WRAP static Ptr<MSER> create( int delta=5, int min_area=60, int max_area=14400,
+          double max_variation=0.25, double min_diversity=.2,
+          int max_evolution=200, double area_threshold=1.01,
+          double min_margin=0.003, int edge_blur_size=5 );
+
+    /** @brief Detect %MSER regions
+
+    @param image input image (8UC1, 8UC3 or 8UC4, must be greater than or equal to 3x3)
+    @param msers resulting list of point sets
+    @param bboxes resulting bounding boxes
+    */
+    CV_WRAP virtual void detectRegions( InputArray image,
+                                        CV_OUT std::vector<std::vector<Point> >& msers,
+                                        CV_OUT std::vector<Rect>& bboxes ) = 0;
+
+    CV_WRAP virtual void setDelta(int delta) = 0;
+    CV_WRAP virtual int getDelta() const = 0;
+
+    CV_WRAP virtual void setMinArea(int minArea) = 0;
+    CV_WRAP virtual int getMinArea() const = 0;
+
+    CV_WRAP virtual void setMaxArea(int maxArea) = 0;
+    CV_WRAP virtual int getMaxArea() const = 0;
+
+    CV_WRAP virtual void setMaxVariation(double maxVariation) = 0;
+    CV_WRAP virtual double getMaxVariation() const = 0;
+
+    CV_WRAP virtual void setMinDiversity(double minDiversity) = 0;
+    CV_WRAP virtual double getMinDiversity() const = 0;
+
+    CV_WRAP virtual void setMaxEvolution(int maxEvolution) = 0;
+    CV_WRAP virtual int getMaxEvolution() const = 0;
+
+    CV_WRAP virtual void setAreaThreshold(double areaThreshold) = 0;
+    CV_WRAP virtual double getAreaThreshold() const = 0;
+
+    CV_WRAP virtual void setMinMargin(double min_margin) = 0;
+    CV_WRAP virtual double getMinMargin() const = 0;
+
+    CV_WRAP virtual void setEdgeBlurSize(int edge_blur_size) = 0;
+    CV_WRAP virtual int getEdgeBlurSize() const = 0;
+
+    CV_WRAP virtual void setPass2Only(bool f) = 0;
+    CV_WRAP virtual bool getPass2Only() const = 0;
+
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+
+/** @brief Wrapping class for feature detection using the FAST method.
+ */
+class CV_EXPORTS_W FastFeatureDetector : public Feature2D
+{
+public:
+    enum DetectorType
+    {
+        TYPE_5_8 = 0, TYPE_7_12 = 1, TYPE_9_16 = 2
+    };
+    enum
+    {
+        THRESHOLD = 10000, NONMAX_SUPPRESSION=10001, FAST_N=10002
+    };
+
+
+    CV_WRAP static Ptr<FastFeatureDetector> create( int threshold=10,
+                                                    bool nonmaxSuppression=true,
+                                                    FastFeatureDetector::DetectorType type=FastFeatureDetector::TYPE_9_16 );
+
+    CV_WRAP virtual void setThreshold(int threshold) = 0;
+    CV_WRAP virtual int getThreshold() const = 0;
+
+    CV_WRAP virtual void setNonmaxSuppression(bool f) = 0;
+    CV_WRAP virtual bool getNonmaxSuppression() const = 0;
+
+    CV_WRAP virtual void setType(FastFeatureDetector::DetectorType type) = 0;
+    CV_WRAP virtual FastFeatureDetector::DetectorType getType() const = 0;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+/** @overload */
+CV_EXPORTS void FAST( InputArray image, CV_OUT std::vector<KeyPoint>& keypoints,
+                      int threshold, bool nonmaxSuppression=true );
+
+/** @brief Detects corners using the FAST algorithm
+
+@param image grayscale image where keypoints (corners) are detected.
+@param keypoints keypoints detected on the image.
+@param threshold threshold on difference between intensity of the central pixel and pixels of a
+circle around this pixel.
+@param nonmaxSuppression if true, non-maximum suppression is applied to detected corners
+(keypoints).
+@param type one of the three neighborhoods as defined in the paper:
+FastFeatureDetector::TYPE_9_16, FastFeatureDetector::TYPE_7_12,
+FastFeatureDetector::TYPE_5_8
+
+Detects corners using the FAST algorithm by @cite Rosten06 .
+
+@note In Python API, types are given as cv.FAST_FEATURE_DETECTOR_TYPE_5_8,
+cv.FAST_FEATURE_DETECTOR_TYPE_7_12 and cv.FAST_FEATURE_DETECTOR_TYPE_9_16. For corner
+detection, use cv.FAST.detect() method.
+ */
+CV_EXPORTS void FAST( InputArray image, CV_OUT std::vector<KeyPoint>& keypoints,
+                      int threshold, bool nonmaxSuppression, FastFeatureDetector::DetectorType type );
+
+
+/** @brief Wrapping class for feature detection using the AGAST method.
+ */
+class CV_EXPORTS_W AgastFeatureDetector : public Feature2D
+{
+public:
+    enum DetectorType
+    {
+        AGAST_5_8 = 0, AGAST_7_12d = 1, AGAST_7_12s = 2, OAST_9_16 = 3,
+    };
+
+    enum
+    {
+        THRESHOLD = 10000, NONMAX_SUPPRESSION = 10001,
+    };
+
+    CV_WRAP static Ptr<AgastFeatureDetector> create( int threshold=10,
+                                                     bool nonmaxSuppression=true,
+                                                     AgastFeatureDetector::DetectorType type = AgastFeatureDetector::OAST_9_16);
+
+    CV_WRAP virtual void setThreshold(int threshold) = 0;
+    CV_WRAP virtual int getThreshold() const = 0;
+
+    CV_WRAP virtual void setNonmaxSuppression(bool f) = 0;
+    CV_WRAP virtual bool getNonmaxSuppression() const = 0;
+
+    CV_WRAP virtual void setType(AgastFeatureDetector::DetectorType type) = 0;
+    CV_WRAP virtual AgastFeatureDetector::DetectorType getType() const = 0;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+/** @overload */
+CV_EXPORTS void AGAST( InputArray image, CV_OUT std::vector<KeyPoint>& keypoints,
+                      int threshold, bool nonmaxSuppression=true );
+
+/** @brief Detects corners using the AGAST algorithm
+
+@param image grayscale image where keypoints (corners) are detected.
+@param keypoints keypoints detected on the image.
+@param threshold threshold on difference between intensity of the central pixel and pixels of a
+circle around this pixel.
+@param nonmaxSuppression if true, non-maximum suppression is applied to detected corners
+(keypoints).
+@param type one of the four neighborhoods as defined in the paper:
+AgastFeatureDetector::AGAST_5_8, AgastFeatureDetector::AGAST_7_12d,
+AgastFeatureDetector::AGAST_7_12s, AgastFeatureDetector::OAST_9_16
+
+For non-Intel platforms, there is a tree optimised variant of AGAST with same numerical results.
+The 32-bit binary tree tables were generated automatically from original code using perl script.
+The perl script and examples of tree generation are placed in features2d/doc folder.
+Detects corners using the AGAST algorithm by @cite mair2010_agast .
+@note In this overload nonmaxSuppression and type have no default values; all five arguments are required.
+ */
+CV_EXPORTS void AGAST( InputArray image, CV_OUT std::vector<KeyPoint>& keypoints,
+                      int threshold, bool nonmaxSuppression, AgastFeatureDetector::DetectorType type );
+
+/** @brief Wrapping class for feature detection using the goodFeaturesToTrack function.
+ */
+class CV_EXPORTS_W GFTTDetector : public Feature2D
+{
+public:
+    CV_WRAP static Ptr<GFTTDetector> create( int maxCorners=1000, double qualityLevel=0.01, double minDistance=1,
+                                             int blockSize=3, bool useHarrisDetector=false, double k=0.04 );
+    CV_WRAP static Ptr<GFTTDetector> create( int maxCorners, double qualityLevel, double minDistance,
+                                             int blockSize, int gradiantSize, bool useHarrisDetector=false, double k=0.04 ); // NOTE(review): "gradiantSize" here vs. "gradient" in setGradientSize/getGradientSize
+    CV_WRAP virtual void setMaxFeatures(int maxFeatures) = 0;
+    CV_WRAP virtual int getMaxFeatures() const = 0;
+
+    CV_WRAP virtual void setQualityLevel(double qlevel) = 0;
+    CV_WRAP virtual double getQualityLevel() const = 0;
+
+    CV_WRAP virtual void setMinDistance(double minDistance) = 0;
+    CV_WRAP virtual double getMinDistance() const = 0;
+
+    CV_WRAP virtual void setBlockSize(int blockSize) = 0;
+    CV_WRAP virtual int getBlockSize() const = 0;
+
+    CV_WRAP virtual void setGradientSize(int gradientSize_) = 0;
+    CV_WRAP virtual int getGradientSize() = 0; // NOTE(review): the only getter in this class that is not const
+
+    CV_WRAP virtual void setHarrisDetector(bool val) = 0;
+    CV_WRAP virtual bool getHarrisDetector() const = 0;
+
+    CV_WRAP virtual void setK(double k) = 0;
+    CV_WRAP virtual double getK() const = 0;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+/** @brief Class for extracting blobs from an image.
+
+The class implements a simple algorithm for extracting blobs from an image:
+
+1.  Convert the source image to binary images by applying thresholding with several thresholds from
+    minThreshold (inclusive) to maxThreshold (exclusive) with distance thresholdStep between
+    neighboring thresholds.
+2.  Extract connected components from every binary image by findContours and calculate their
+    centers.
+3.  Group centers from several binary images by their coordinates. Close centers form one group that
+    corresponds to one blob, which is controlled by the minDistBetweenBlobs parameter.
+4.  From the groups, estimate final centers of blobs and their radii and return as locations and
+    sizes of keypoints.
+
+This class performs several filtrations of returned blobs. You should set filterBy\* to true/false
+to turn on/off corresponding filtration. Available filtrations:
+
+-   **By color**. This filter compares the intensity of a binary image at the center of a blob to
+blobColor. If they differ, the blob is filtered out. Use blobColor = 0 to extract dark blobs
+and blobColor = 255 to extract light blobs.
+-   **By area**. Extracted blobs have an area between minArea (inclusive) and maxArea (exclusive).
+-   **By circularity**. Extracted blobs have circularity
+(\f$\frac{4*\pi*Area}{perimeter * perimeter}\f$) between minCircularity (inclusive) and
+maxCircularity (exclusive).
+-   **By ratio of the minimum inertia to maximum inertia**. Extracted blobs have this ratio
+between minInertiaRatio (inclusive) and maxInertiaRatio (exclusive).
+-   **By convexity**. Extracted blobs have convexity (area / area of blob convex hull) between
+minConvexity (inclusive) and maxConvexity (exclusive).
+
+Default values of parameters are tuned to extract dark circular blobs.
+ */
+class CV_EXPORTS_W SimpleBlobDetector : public Feature2D
+{
+public:
+  struct CV_EXPORTS_W_SIMPLE Params
+  {
+      CV_WRAP Params();
+      CV_PROP_RW float thresholdStep; // distance between neighboring thresholds
+      CV_PROP_RW float minThreshold; // first threshold (inclusive)
+      CV_PROP_RW float maxThreshold; // end of the threshold range (exclusive)
+      CV_PROP_RW size_t minRepeatability;
+      CV_PROP_RW float minDistBetweenBlobs; // controls which close centers are grouped into one blob
+
+      CV_PROP_RW bool filterByColor;
+      CV_PROP_RW uchar blobColor; // 0 extracts dark blobs, 255 extracts light blobs
+
+      CV_PROP_RW bool filterByArea;
+      CV_PROP_RW float minArea, maxArea; // min inclusive, max exclusive
+
+      CV_PROP_RW bool filterByCircularity;
+      CV_PROP_RW float minCircularity, maxCircularity; // min inclusive, max exclusive
+
+      CV_PROP_RW bool filterByInertia;
+      CV_PROP_RW float minInertiaRatio, maxInertiaRatio; // min inclusive, max exclusive
+
+      CV_PROP_RW bool filterByConvexity;
+      CV_PROP_RW float minConvexity, maxConvexity; // min inclusive, max exclusive
+
+      CV_PROP_RW bool collectContours; // NOTE(review): presumably enables getBlobContours() output - confirm
+
+      void read( const FileNode& fn ); // not CV_WRAP, unlike the constructor
+      void write( FileStorage& fs ) const; // not CV_WRAP, unlike the constructor
+  };
+
+  CV_WRAP static Ptr<SimpleBlobDetector>
+    create(const SimpleBlobDetector::Params &parameters = SimpleBlobDetector::Params());
+
+  CV_WRAP virtual void setParams(const SimpleBlobDetector::Params& params ) = 0;
+  CV_WRAP virtual SimpleBlobDetector::Params getParams() const = 0;
+
+  CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+  CV_WRAP virtual const std::vector<std::vector<cv::Point> >& getBlobContours() const; // virtual but not pure, unlike setParams/getParams
+};
+
+
+/** @brief Class implementing the KAZE keypoint detector and descriptor extractor, described in @cite ABD12 .
+
+@note AKAZE descriptor can only be used with KAZE or AKAZE keypoints. [ABD12] KAZE Features. Pablo
+F. Alcantarilla, Adrien Bartoli and Andrew J. Davison. In European Conference on Computer Vision
+(ECCV), Florence, Italy, October 2012.
+*/
+class CV_EXPORTS_W KAZE : public Feature2D
+{
+public:
+    enum DiffusivityType // also the diffusivity parameter type of AKAZE::create
+    {
+        DIFF_PM_G1 = 0,
+        DIFF_PM_G2 = 1, // default used by create()
+        DIFF_WEICKERT = 2,
+        DIFF_CHARBONNIER = 3
+    };
+
+    /** @brief The KAZE constructor
+
+    @param extended Set to enable extraction of extended (128-byte) descriptor.
+    @param upright Set to enable use of upright descriptors (non rotation-invariant).
+    @param threshold Detector response threshold to accept point
+    @param nOctaves Maximum octave evolution of the image
+    @param nOctaveLayers Default number of sublevels per scale level
+    @param diffusivity Diffusivity type. DIFF_PM_G1, DIFF_PM_G2, DIFF_WEICKERT or
+    DIFF_CHARBONNIER
+     */
+    CV_WRAP static Ptr<KAZE> create(bool extended=false, bool upright=false,
+                                    float threshold = 0.001f, // NOTE(review): float here, double in setThreshold/getThreshold
+                                    int nOctaves = 4, int nOctaveLayers = 4,
+                                    KAZE::DiffusivityType diffusivity = KAZE::DIFF_PM_G2);
+
+    CV_WRAP virtual void setExtended(bool extended) = 0;
+    CV_WRAP virtual bool getExtended() const = 0;
+
+    CV_WRAP virtual void setUpright(bool upright) = 0;
+    CV_WRAP virtual bool getUpright() const = 0;
+
+    CV_WRAP virtual void setThreshold(double threshold) = 0;
+    CV_WRAP virtual double getThreshold() const = 0;
+
+    CV_WRAP virtual void setNOctaves(int octaves) = 0;
+    CV_WRAP virtual int getNOctaves() const = 0;
+
+    CV_WRAP virtual void setNOctaveLayers(int octaveLayers) = 0;
+    CV_WRAP virtual int getNOctaveLayers() const = 0;
+
+    CV_WRAP virtual void setDiffusivity(KAZE::DiffusivityType diff) = 0;
+    CV_WRAP virtual KAZE::DiffusivityType getDiffusivity() const = 0;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+/** @brief Class implementing the AKAZE keypoint detector and descriptor extractor, described in @cite ANB13.
+
+@details AKAZE descriptors can only be used with KAZE or AKAZE keypoints. This class is thread-safe.
+
+@note When you need descriptors use Feature2D::detectAndCompute, which
+provides better performance. When using Feature2D::detect followed by
+Feature2D::compute scale space pyramid is computed twice.
+
+@note AKAZE implements T-API. When image is passed as UMat some parts of the algorithm
+will use OpenCL.
+
+@note [ANB13] Fast Explicit Diffusion for Accelerated Features in Nonlinear
+Scale Spaces. Pablo F. Alcantarilla, Jesús Nuevo and Adrien Bartoli. In
+British Machine Vision Conference (BMVC), Bristol, UK, September 2013.
+
+*/
+class CV_EXPORTS_W AKAZE : public Feature2D
+{
+public:
+    // AKAZE descriptor type (numbering starts at 2; values 0 and 1 are not defined here)
+    enum DescriptorType
+    {
+        DESCRIPTOR_KAZE_UPRIGHT = 2, ///< Upright descriptors, not invariant to rotation
+        DESCRIPTOR_KAZE = 3,
+        DESCRIPTOR_MLDB_UPRIGHT = 4, ///< Upright descriptors, not invariant to rotation
+        DESCRIPTOR_MLDB = 5 ///< Default descriptor type used by create()
+    };
+
+    /** @brief The AKAZE constructor
+
+    @param descriptor_type Type of the extracted descriptor: DESCRIPTOR_KAZE,
+    DESCRIPTOR_KAZE_UPRIGHT, DESCRIPTOR_MLDB or DESCRIPTOR_MLDB_UPRIGHT.
+    @param descriptor_size Size of the descriptor in bits. 0 -\> Full size
+    @param descriptor_channels Number of channels in the descriptor (1, 2, 3)
+    @param threshold Detector response threshold to accept point
+    @param nOctaves Maximum octave evolution of the image
+    @param nOctaveLayers Default number of sublevels per scale level
+    @param diffusivity Diffusivity type. DIFF_PM_G1, DIFF_PM_G2, DIFF_WEICKERT or
+    DIFF_CHARBONNIER
+    @param max_points Maximum amount of returned points. In case if image contains
+    more features, then the features with highest response are returned.
+    Negative value means no limitation.
+     */
+    CV_WRAP static Ptr<AKAZE> create(AKAZE::DescriptorType descriptor_type = AKAZE::DESCRIPTOR_MLDB,
+                                     int descriptor_size = 0, int descriptor_channels = 3,
+                                     float threshold = 0.001f, int nOctaves = 4, // NOTE(review): threshold is float here, double in setThreshold/getThreshold
+                                     int nOctaveLayers = 4, KAZE::DiffusivityType diffusivity = KAZE::DIFF_PM_G2,
+                                     int max_points = -1);
+
+    CV_WRAP virtual void setDescriptorType(AKAZE::DescriptorType dtype) = 0;
+    CV_WRAP virtual AKAZE::DescriptorType getDescriptorType() const = 0;
+
+    CV_WRAP virtual void setDescriptorSize(int dsize) = 0;
+    CV_WRAP virtual int getDescriptorSize() const = 0;
+
+    CV_WRAP virtual void setDescriptorChannels(int dch) = 0;
+    CV_WRAP virtual int getDescriptorChannels() const = 0;
+
+    CV_WRAP virtual void setThreshold(double threshold) = 0;
+    CV_WRAP virtual double getThreshold() const = 0;
+
+    CV_WRAP virtual void setNOctaves(int octaves) = 0;
+    CV_WRAP virtual int getNOctaves() const = 0;
+
+    CV_WRAP virtual void setNOctaveLayers(int octaveLayers) = 0;
+    CV_WRAP virtual int getNOctaveLayers() const = 0;
+
+    CV_WRAP virtual void setDiffusivity(KAZE::DiffusivityType diff) = 0; // diffusivity enum is shared with KAZE
+    CV_WRAP virtual KAZE::DiffusivityType getDiffusivity() const = 0;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+
+    CV_WRAP virtual void setMaxPoints(int max_points) = 0; // see max_points in create(): negative means no limitation
+    CV_WRAP virtual int getMaxPoints() const = 0;
+};
+
+
+/****************************************************************************************\
+*                                      Distance                                          *
+\****************************************************************************************/
+
+template<typename T> // Accumulator<T>::Type is the ResultType used by the SL2/L2/L1 functors below
+struct CV_EXPORTS Accumulator
+{
+    typedef T Type; // primary template: accumulate in T itself
+};
+// Specializations: the listed char/short element types accumulate in float.
+template<> struct Accumulator<unsigned char>  { typedef float Type; };
+template<> struct Accumulator<unsigned short> { typedef float Type; };
+template<> struct Accumulator<char>   { typedef float Type; }; // NOTE(review): plain char only; signed char is a distinct type with no specialization
+template<> struct Accumulator<short>  { typedef float Type; };
+
+/*
+ * Squared Euclidean distance functor: returns normL2Sqr over `size` elements of a and b
+ */
+template<class T>
+struct CV_EXPORTS SL2 // NOTE(review): carries CV_EXPORTS while the sibling L2 and L1 functors do not
+{
+    static const NormTypes normType = NORM_L2SQR; // norm tag associated with this functor
+    typedef T ValueType; // element type of the input arrays
+    typedef typename Accumulator<T>::Type ResultType; // distance type selected through Accumulator<T>
+
+    ResultType operator()( const T* a, const T* b, int size ) const
+    {
+        return normL2Sqr<ValueType, ResultType>(a, b, size);
+    }
+};
+
+/*
+ * Euclidean distance functor: square root of normL2Sqr over `size` elements of a and b
+ */
+template<class T>
+struct L2
+{
+    static const NormTypes normType = NORM_L2; // norm tag associated with this functor
+    typedef T ValueType; // element type of the input arrays
+    typedef typename Accumulator<T>::Type ResultType; // distance type selected through Accumulator<T>
+
+    ResultType operator()( const T* a, const T* b, int size ) const
+    {
+        return (ResultType)std::sqrt((double)normL2Sqr<ValueType, ResultType>(a, b, size)); // sqrt is taken in double, then cast back to ResultType
+    }
+};
+
+/*
+ * Manhattan distance (city block distance) functor: returns normL1 over `size` elements of a and b
+ */
+template<class T>
+struct L1
+{
+    static const NormTypes normType = NORM_L1; // norm tag associated with this functor
+    typedef T ValueType; // element type of the input arrays
+    typedef typename Accumulator<T>::Type ResultType; // distance type selected through Accumulator<T>
+
+    ResultType operator()( const T* a, const T* b, int size ) const
+    {
+        return normL1<ValueType, ResultType>(a, b, size);
+    }
+};
+
+//! @} features2d_main
+
+/****************************************************************************************\
+*                                  DescriptorMatcher                                     *
+\****************************************************************************************/
+
+//! @addtogroup features2d_match
+//! @{
+
+/** @brief Abstract base class for matching keypoint descriptors.
+
+It has two groups of match methods: for matching descriptors of an image with another image or with
+an image set.
+ */
+class CV_EXPORTS_W DescriptorMatcher : public Algorithm
+{
+public:
+   enum MatcherType // accepted by create( const DescriptorMatcher::MatcherType& )
+    {
+        FLANNBASED            = 1,
+        BRUTEFORCE            = 2,
+        BRUTEFORCE_L1         = 3,
+        BRUTEFORCE_HAMMING    = 4,
+        BRUTEFORCE_HAMMINGLUT = 5,
+        BRUTEFORCE_SL2        = 6
+    };
+
+    virtual ~DescriptorMatcher();
+
+    /** @brief Adds descriptors to train a CPU (trainDescCollection) or GPU (utrainDescCollection) descriptor
+    collection.
+
+    If the collection is not empty, the new descriptors are added to existing train descriptors.
+
+    @param descriptors Descriptors to add. Each descriptors[i] is a set of descriptors from the same
+    train image.
+     */
+    CV_WRAP virtual void add( InputArrayOfArrays descriptors );
+
+    /** @brief Returns a constant link to the train descriptor collection trainDescCollection .
+     */
+    CV_WRAP const std::vector<Mat>& getTrainDescriptors() const;
+
+    /** @brief Clears the train descriptor collections.
+     */
+    CV_WRAP virtual void clear() CV_OVERRIDE;
+
+    /** @brief Returns true if there are no train descriptors in both collections.
+     */
+    CV_WRAP virtual bool empty() const CV_OVERRIDE;
+
+    /** @brief Returns true if the descriptor matcher supports masking permissible matches.
+     */
+    CV_WRAP virtual bool isMaskSupported() const = 0;
+
+    /** @brief Trains a descriptor matcher
+
+    Trains a descriptor matcher (for example, the flann index). In all methods to match, the method
+    train() is run every time before matching. Some descriptor matchers (for example, BruteForceMatcher)
+    have an empty implementation of this method. Other matchers really train their inner structures (for
+    example, FlannBasedMatcher trains flann::Index ).
+     */
+    CV_WRAP virtual void train();
+
+    /** @brief Finds the best match for each descriptor from a query set.
+
+    @param queryDescriptors Query set of descriptors.
+    @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
+    collection stored in the class object.
+    @param matches Matches. If a query descriptor is masked out in mask , no match is added for this
+    descriptor. So, matches size may be smaller than the query descriptors count.
+    @param mask Mask specifying permissible matches between an input query and train matrices of
+    descriptors.
+
+    In the first variant of this method, the train descriptors are passed as an input argument. In the
+    second variant of the method, train descriptors collection that was set by DescriptorMatcher::add is
+    used. Optional mask (or masks) can be passed to specify which query and training descriptors can be
+    matched. Namely, queryDescriptors[i] can be matched with trainDescriptors[j] only if
+    mask.at\<uchar\>(i,j) is non-zero.
+     */
+    CV_WRAP void match( InputArray queryDescriptors, InputArray trainDescriptors,
+                CV_OUT std::vector<DMatch>& matches, InputArray mask=noArray() ) const;
+
+    /** @brief Finds the k best matches for each descriptor from a query set.
+
+    @param queryDescriptors Query set of descriptors.
+    @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
+    collection stored in the class object.
+    @param mask Mask specifying permissible matches between an input query and train matrices of
+    descriptors.
+    @param matches Matches. Each matches[i] is k or less matches for the same query descriptor.
+    @param k Count of best matches found per each query descriptor or less if a query descriptor has
+    less than k possible matches in total.
+    @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
+    false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
+    the matches vector does not contain matches for fully masked-out query descriptors.
+
+    These extended variants of DescriptorMatcher::match methods find several best matches for each query
+    descriptor. The matches are returned in the distance increasing order. See DescriptorMatcher::match
+    for the details about query and train descriptors.
+     */
+    CV_WRAP void knnMatch( InputArray queryDescriptors, InputArray trainDescriptors,
+                   CV_OUT std::vector<std::vector<DMatch> >& matches, int k,
+                   InputArray mask=noArray(), bool compactResult=false ) const;
+
+    /** @brief For each query descriptor, finds the training descriptors not farther than the specified distance.
+
+    @param queryDescriptors Query set of descriptors.
+    @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
+    collection stored in the class object.
+    @param matches Found matches.
+    @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
+    false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
+    the matches vector does not contain matches for fully masked-out query descriptors.
+    @param maxDistance Threshold for the distance between matched descriptors. Distance means here
+    metric distance (e.g. Hamming distance), not the distance between coordinates (which is measured
+    in Pixels)!
+    @param mask Mask specifying permissible matches between an input query and train matrices of
+    descriptors.
+
+    For each query descriptor, the methods find such training descriptors that the distance between the
+    query descriptor and the training descriptor is equal or smaller than maxDistance. Found matches are
+    returned in the distance increasing order.
+     */
+    CV_WRAP void radiusMatch( InputArray queryDescriptors, InputArray trainDescriptors,
+                      CV_OUT std::vector<std::vector<DMatch> >& matches, float maxDistance,
+                      InputArray mask=noArray(), bool compactResult=false ) const;
+
+    /** @overload
+    @param queryDescriptors Query set of descriptors.
+    @param matches Matches. If a query descriptor is masked out in mask , no match is added for this
+    descriptor. So, matches size may be smaller than the query descriptors count.
+    @param masks Set of masks. Each masks[i] specifies permissible matches between the input query
+    descriptors and stored train descriptors from the i-th image trainDescCollection[i].
+    */
+    CV_WRAP void match( InputArray queryDescriptors, CV_OUT std::vector<DMatch>& matches,
+                        InputArrayOfArrays masks=noArray() );
+    /** @overload
+    @param queryDescriptors Query set of descriptors.
+    @param matches Matches. Each matches[i] is k or less matches for the same query descriptor.
+    @param k Count of best matches found per each query descriptor or less if a query descriptor has
+    less than k possible matches in total.
+    @param masks Set of masks. Each masks[i] specifies permissible matches between the input query
+    descriptors and stored train descriptors from the i-th image trainDescCollection[i].
+    @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
+    false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
+    the matches vector does not contain matches for fully masked-out query descriptors.
+    */
+    CV_WRAP void knnMatch( InputArray queryDescriptors, CV_OUT std::vector<std::vector<DMatch> >& matches, int k,
+                           InputArrayOfArrays masks=noArray(), bool compactResult=false );
+    /** @overload
+    @param queryDescriptors Query set of descriptors.
+    @param matches Found matches.
+    @param maxDistance Threshold for the distance between matched descriptors. Distance means here
+    metric distance (e.g. Hamming distance), not the distance between coordinates (which is measured
+    in Pixels)!
+    @param masks Set of masks. Each masks[i] specifies permissible matches between the input query
+    descriptors and stored train descriptors from the i-th image trainDescCollection[i].
+    @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
+    false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
+    the matches vector does not contain matches for fully masked-out query descriptors.
+    */
+    CV_WRAP void radiusMatch( InputArray queryDescriptors, CV_OUT std::vector<std::vector<DMatch> >& matches, float maxDistance,
+                      InputArrayOfArrays masks=noArray(), bool compactResult=false );
+
+
+    CV_WRAP void write( const String& fileName ) const // convenience overload: saves the matcher to the file fileName
+    {
+        FileStorage fs(fileName, FileStorage::WRITE); // open the file for writing
+        write(fs); // delegate to write( FileStorage& )
+    }
+
+    CV_WRAP void read( const String& fileName ) // convenience overload: loads the matcher from the file fileName
+    {
+        FileStorage fs(fileName, FileStorage::READ); // open the file for reading
+        read(fs.root()); // delegate to read( const FileNode& ) with the storage's root node
+    }
+    // Reads matcher object from a file node
+    // see corresponding cv::Algorithm method
+    CV_WRAP virtual void read( const FileNode& ) CV_OVERRIDE;
+    // Writes matcher object to a file storage
+    virtual void write( FileStorage& ) const CV_OVERRIDE;
+
+    /** @brief Clones the matcher.
+
+    @param emptyTrainData If emptyTrainData is false, the method creates a deep copy of the object,
+    that is, copies both parameters and train data. If emptyTrainData is true, the method creates an
+    object copy with the current parameters but with empty train data.
+     */
+    CV_WRAP CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const = 0;
+
+    /** @brief Creates a descriptor matcher of a given type with the default parameters (using default
+    constructor).
+
+    @param descriptorMatcherType Descriptor matcher type. Now the following matcher types are
+    supported:
+    -   `BruteForce` (it uses L2 )
+    -   `BruteForce-L1`
+    -   `BruteForce-Hamming`
+    -   `BruteForce-Hamming(2)`
+    -   `FlannBased`
+     */
+    CV_WRAP static Ptr<DescriptorMatcher> create( const String& descriptorMatcherType );
+
+    CV_WRAP static Ptr<DescriptorMatcher> create( const DescriptorMatcher::MatcherType& matcherType ); // same factory, selected by a MatcherType value instead of a string
+
+
+    // see corresponding cv::Algorithm method
+    CV_WRAP inline void write(FileStorage& fs, const String& name) const { Algorithm::write(fs, name); }
+#if CV_VERSION_MAJOR < 5
+    inline void write(const Ptr<FileStorage>& fs, const String& name) const { CV_Assert(fs); Algorithm::write(*fs, name); } // asserts fs is non-null before dereferencing it
+#endif
+
+protected:
+    /**
+     * Class to work with descriptors from several images as with one merged matrix.
+     * It is used e.g. in FlannBasedMatcher.
+     */
+    class CV_EXPORTS DescriptorCollection
+    {
+    public:
+        DescriptorCollection();
+        DescriptorCollection( const DescriptorCollection& collection );
+        virtual ~DescriptorCollection();
+
+        // Vector of matrices "descriptors" will be merged to one matrix "mergedDescriptors" here.
+        void set( const std::vector<Mat>& descriptors );
+        virtual void clear();
+
+        const Mat& getDescriptors() const;
+        Mat getDescriptor( int imgIdx, int localDescIdx ) const; // lookup by (image index, index within that image)
+        Mat getDescriptor( int globalDescIdx ) const; // lookup by a single global index
+        void getLocalIdx( int globalDescIdx, int& imgIdx, int& localDescIdx ) const; // outputs imgIdx/localDescIdx for a global index
+
+        int size() const;
+
+    protected:
+        Mat mergedDescriptors;
+        std::vector<int> startIdxs; // NOTE(review): presumably the first global index of each image's descriptors - confirm
+    };
+
+    //! In fact the matching is implemented only by the following two methods. These methods suppose
+    //! that the class object has been trained already. Public match methods call these methods
+    //! after calling train().
+    virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false ) = 0;
+    virtual void radiusMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false ) = 0;
+
+    static bool isPossibleMatch( InputArray mask, int queryIdx, int trainIdx );
+    static bool isMaskedOut( InputArrayOfArrays masks, int queryIdx );
+
+    CV_NODISCARD_STD static Mat clone_op( Mat m ) { return m.clone(); } // helper that simply returns m.clone()
+    void checkMasks( InputArrayOfArrays masks, int queryDescriptorsCount ) const;
+
+    //! Collection of descriptors from train images.
+    std::vector<Mat> trainDescCollection;
+    std::vector<UMat> utrainDescCollection; // UMat counterpart of trainDescCollection
+};
+
+/** @brief Brute-force descriptor matcher.
+
+For each descriptor in the first set, this matcher finds the closest descriptor in the second set
+by trying each one. This descriptor matcher supports masking permissible matches of descriptor
+sets.
+ */
+class CV_EXPORTS_W BFMatcher : public DescriptorMatcher
+{
+public:
+    /** @brief Brute-force matcher constructor (obsolete). Please use BFMatcher.create()
+     *
+     *
+    */
+    CV_WRAP BFMatcher( int normType=NORM_L2, bool crossCheck=false );
+
+    virtual ~BFMatcher() {}
+
+    virtual bool isMaskSupported() const CV_OVERRIDE { return true; } // this matcher always reports mask support
+
+    /** @brief Brute-force matcher create method.
+    @param normType One of NORM_L1, NORM_L2, NORM_HAMMING, NORM_HAMMING2. L1 and L2 norms are
+    preferable choices for SIFT and SURF descriptors, NORM_HAMMING should be used with ORB, BRISK and
+    BRIEF, NORM_HAMMING2 should be used with ORB when WTA_K==3 or 4 (see ORB::ORB constructor
+    description).
+    @param crossCheck If it is false, this will be the default BFMatcher behaviour when it finds the k
+    nearest neighbors for each query descriptor. If crossCheck==true, then the knnMatch() method with
+    k=1 will only return pairs (i,j) such that for i-th query descriptor the j-th descriptor in the
+    matcher's collection is the nearest and vice versa, i.e. the BFMatcher will only return consistent
+    pairs. Such technique usually produces best results with minimal number of outliers when there are
+    enough matches. This is alternative to the ratio test, used by D. Lowe in SIFT paper.
+     */
+    CV_WRAP static Ptr<BFMatcher> create( int normType=NORM_L2, bool crossCheck=false ) ;
+
+    CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
+protected:
+    virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false ) CV_OVERRIDE;
+    virtual void radiusMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false ) CV_OVERRIDE;
+
+    int normType; // NOTE(review): presumably the normType given to the constructor/create - confirm
+    bool crossCheck; // NOTE(review): presumably the crossCheck given to the constructor/create - confirm
+};
+
+#if defined(HAVE_OPENCV_FLANN) || defined(CV_DOXYGEN)
+
+/** @brief Flann-based descriptor matcher.
+
+This matcher trains cv::flann::Index on a train descriptor collection and calls its nearest search
+methods to find the best matches. So, this matcher may be faster when matching a large train
+collection than the brute force matcher. FlannBasedMatcher does not support masking permissible
+matches of descriptor sets because flann::Index does not support this.
+ */
+class CV_EXPORTS_W FlannBasedMatcher : public DescriptorMatcher
+{
+public:
+    CV_WRAP FlannBasedMatcher( const Ptr<flann::IndexParams>& indexParams=makePtr<flann::KDTreeIndexParams>(), // defaults to KD-tree index parameters
+                       const Ptr<flann::SearchParams>& searchParams=makePtr<flann::SearchParams>() );
+
+    virtual void add( InputArrayOfArrays descriptors ) CV_OVERRIDE;
+    virtual void clear() CV_OVERRIDE;
+
+    // Reads matcher object from a file node
+    virtual void read( const FileNode& ) CV_OVERRIDE;
+    // Writes matcher object to a file storage
+    virtual void write( FileStorage& ) const CV_OVERRIDE;
+
+    virtual void train() CV_OVERRIDE;
+    virtual bool isMaskSupported() const CV_OVERRIDE; // the class description states masking is not supported by this matcher
+
+    CV_WRAP static Ptr<FlannBasedMatcher> create();
+
+    CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
+protected:
+    static void convertToDMatches( const DescriptorCollection& descriptors,
+                                   const Mat& indices, const Mat& distances,
+                                   std::vector<std::vector<DMatch> >& matches );
+
+    virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false ) CV_OVERRIDE;
+    virtual void radiusMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false ) CV_OVERRIDE;
+
+    Ptr<flann::IndexParams> indexParams;
+    Ptr<flann::SearchParams> searchParams;
+    Ptr<flann::Index> flannIndex;
+
+    DescriptorCollection mergedDescriptors; // DescriptorMatcher::DescriptorCollection: descriptors of several images as one merged matrix
+    int addedDescCount;
+};
+
+#endif
+
+//! @} features2d_match
+
+/****************************************************************************************\
+*                                   Drawing functions                                    *
+\****************************************************************************************/
+
+//! @addtogroup features2d_draw
+//! @{
+
+enum struct DrawMatchesFlags
+{
+  DEFAULT = 0, //!< Output image matrix will be created (Mat::create),
+               //!< i.e. existing memory of output image may be reused.
+               //!< Two source images, matches and single keypoints will be drawn.
+               //!< For each keypoint only the center point will be drawn (without
+               //!< the circle around keypoint with keypoint size and orientation).
+  DRAW_OVER_OUTIMG = 1, //!< Output image matrix will not be created (Mat::create).
+                        //!< Matches will be drawn on existing content of output image.
+  NOT_DRAW_SINGLE_POINTS = 2, //!< Single keypoints will not be drawn.
+  DRAW_RICH_KEYPOINTS = 4 //!< For each keypoint the circle around keypoint with keypoint size and
+                          //!< orientation will be drawn.
+};
+CV_ENUM_FLAGS(DrawMatchesFlags)
+
+/** @brief Draws keypoints.
+
+@param image Source image.
+@param keypoints Keypoints from the source image.
+@param outImage Output image. Its content depends on the flags value defining what is drawn in the
+output image. See DrawMatchesFlags for the possible flag bit values.
+@param color Color of keypoints.
+@param flags Flags setting drawing features. Possible flags bit values are defined by
+DrawMatchesFlags. See also drawMatches.
+
+@note
+For Python API, flags are modified as cv.DRAW_MATCHES_FLAGS_DEFAULT,
+cv.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS, cv.DRAW_MATCHES_FLAGS_DRAW_OVER_OUTIMG,
+cv.DRAW_MATCHES_FLAGS_NOT_DRAW_SINGLE_POINTS
+ */
+CV_EXPORTS_W void drawKeypoints( InputArray image, const std::vector<KeyPoint>& keypoints, InputOutputArray outImage,
+                               const Scalar& color=Scalar::all(-1), DrawMatchesFlags flags=DrawMatchesFlags::DEFAULT );
+
+/** @brief Draws the found matches of keypoints from two images.
+
+@param img1 First source image.
+@param keypoints1 Keypoints from the first source image.
+@param img2 Second source image.
+@param keypoints2 Keypoints from the second source image.
+@param matches1to2 Matches from the first image to the second one, which means that keypoints1[i]
+has a corresponding point in keypoints2[matches[i]] .
+@param outImg Output image. Its content depends on the flags value defining what is drawn in the
+output image. See DrawMatchesFlags for the possible flag bit values.
+@param matchColor Color of matches (lines and connected keypoints). If matchColor==Scalar::all(-1)
+, the color is generated randomly.
+@param singlePointColor Color of single keypoints (circles), i.e. keypoints that do not
+have a match. If singlePointColor==Scalar::all(-1) , the color is generated randomly.
+@param matchesMask Mask determining which matches are drawn. If the mask is empty, all matches are
+drawn.
+@param flags Flags setting drawing features. Possible flags bit values are defined by
+DrawMatchesFlags.
+
+This function draws matches of keypoints from two images in the output image. A match is a line
+connecting two keypoints (circles). See cv::DrawMatchesFlags.
+ */
+CV_EXPORTS_W void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
+                             InputArray img2, const std::vector<KeyPoint>& keypoints2,
+                             const std::vector<DMatch>& matches1to2, InputOutputArray outImg,
+                             const Scalar& matchColor=Scalar::all(-1), const Scalar& singlePointColor=Scalar::all(-1),
+                             const std::vector<char>& matchesMask=std::vector<char>(), DrawMatchesFlags flags=DrawMatchesFlags::DEFAULT );
+
+/** @overload Takes an additional matchesThickness argument, placed before matchColor. */
+CV_EXPORTS_W void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
+                             InputArray img2, const std::vector<KeyPoint>& keypoints2,
+                             const std::vector<DMatch>& matches1to2, InputOutputArray outImg,
+                             const int matchesThickness, const Scalar& matchColor=Scalar::all(-1),
+                             const Scalar& singlePointColor=Scalar::all(-1), const std::vector<char>& matchesMask=std::vector<char>(),
+                             DrawMatchesFlags flags=DrawMatchesFlags::DEFAULT );
+/** @overload Takes one list of matches per entry (std::vector<std::vector<DMatch> >) and a matching nested mask; declared with CV_EXPORTS_AS(drawMatchesKnn). */
+CV_EXPORTS_AS(drawMatchesKnn) void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
+                             InputArray img2, const std::vector<KeyPoint>& keypoints2,
+                             const std::vector<std::vector<DMatch> >& matches1to2, InputOutputArray outImg,
+                             const Scalar& matchColor=Scalar::all(-1), const Scalar& singlePointColor=Scalar::all(-1),
+                             const std::vector<std::vector<char> >& matchesMask=std::vector<std::vector<char> >(), DrawMatchesFlags flags=DrawMatchesFlags::DEFAULT );
+
+//! @} features2d_draw
+
+/****************************************************************************************\
+*   Functions to evaluate the feature detectors and [generic] descriptor extractors      *
+\****************************************************************************************/
+
+//! @addtogroup features2d_main
+//! @{
+
+CV_EXPORTS void evaluateFeatureDetector( const Mat& img1, const Mat& img2, const Mat& H1to2,
+                                         std::vector<KeyPoint>* keypoints1, std::vector<KeyPoint>* keypoints2,
+                                         float& repeatability, int& correspCount, // non-const references — NOTE(review): presumably results written by the call; confirm in the implementation
+                                         const Ptr<FeatureDetector>& fdetector=Ptr<FeatureDetector>() ); // detector argument defaults to an empty Ptr
+
+CV_EXPORTS void computeRecallPrecisionCurve( const std::vector<std::vector<DMatch> >& matches1to2,
+                                             const std::vector<std::vector<uchar> >& correctMatches1to2Mask,
+                                             std::vector<Point2f>& recallPrecisionCurve ); // non-const reference — NOTE(review): presumably the computed curve; confirm
+
+CV_EXPORTS float getRecall( const std::vector<Point2f>& recallPrecisionCurve, float l_precision ); // NOTE(review): lookup semantics are not visible in this header
+CV_EXPORTS int getNearestPoint( const std::vector<Point2f>& recallPrecisionCurve, float l_precision ); // returns an int — NOTE(review): presumably an index into the curve; confirm
+
+//! @}
+
+/****************************************************************************************\
+*                                     Bag of visual words                                *
+\****************************************************************************************/
+
+//! @addtogroup features2d_category
+//! @{
+
+/** @brief Abstract base class for training the *bag of visual words* vocabulary from a set of descriptors.
+
+For details, see, for example, *Visual Categorization with Bags of Keypoints* by Gabriella Csurka,
+Christopher R. Dance, Lixin Fan, Jutta Willamowski, Cedric Bray, 2004. :
+ */
+class CV_EXPORTS_W BOWTrainer
+{
+public:
+    BOWTrainer();
+    virtual ~BOWTrainer();
+
+    /** @brief Adds descriptors to a training set.
+
+    @param descriptors Descriptors to add to a training set. Each row of the descriptors matrix is a
+    descriptor.
+
+    The training set is clustered using the cluster method to construct the vocabulary.
+     */
+    CV_WRAP void add( const Mat& descriptors );
+
+    /** @brief Returns a training set of descriptors.
+    */
+    CV_WRAP const std::vector<Mat>& getDescriptors() const;
+
+    /** @brief Returns the count of all descriptors stored in the training set.
+    */
+    CV_WRAP int descriptorsCount() const;
+
+    CV_WRAP virtual void clear(); // NOTE(review): presumably empties the training set; the implementation is not visible here
+
+    /** @overload */
+    CV_WRAP virtual Mat cluster() const = 0;
+
+    /** @brief Clusters train descriptors.
+
+    @param descriptors Descriptors to cluster. Each row of the descriptors matrix is a descriptor.
+    Descriptors are not added to the inner train descriptor set.
+
+    The vocabulary consists of cluster centers. So, this method returns the vocabulary. In the first
+    variant of the method (no arguments), train descriptors stored in the object are clustered. In the
+    second variant (this one), the input descriptors are clustered.
+     */
+    CV_WRAP virtual Mat cluster( const Mat& descriptors ) const = 0;
+
+protected:
+    std::vector<Mat> descriptors; // NOTE(review): presumably the training set returned by getDescriptors(); confirm
+    int size;                     // NOTE(review): presumably the total reported by descriptorsCount(); confirm
+};
+
+/** @brief kmeans -based class to train visual vocabulary using the *bag of visual words* approach. :
+ */
+class CV_EXPORTS_W BOWKMeansTrainer : public BOWTrainer
+{
+public:
+    /** @brief The constructor.
+    Defaults: termcrit=TermCriteria(), attempts=3, flags=KMEANS_PP_CENTERS.
+    @see cv::kmeans
+    */
+    CV_WRAP BOWKMeansTrainer( int clusterCount, const TermCriteria& termcrit=TermCriteria(),
+                      int attempts=3, int flags=KMEANS_PP_CENTERS );
+    virtual ~BOWKMeansTrainer();
+
+    // Overrides of both BOWTrainer::cluster variants; each returns the trained vocabulary (i.e. cluster centers).
+    CV_WRAP virtual Mat cluster() const CV_OVERRIDE;
+    CV_WRAP virtual Mat cluster( const Mat& descriptors ) const CV_OVERRIDE;
+
+protected:
+    // Same names as the constructor parameters — NOTE(review): presumably stored from them for the cv::kmeans call; confirm.
+    int clusterCount;
+    TermCriteria termcrit;
+    int attempts;
+    int flags;
+};
+
+/** @brief Class to compute an image descriptor using the *bag of visual words*.
+
+Such a computation consists of the following steps:
+
+1.  Compute descriptors for a given image and its keypoints set.
+2.  Find the nearest visual words from the vocabulary for each keypoint descriptor.
+3.  Compute the bag-of-words image descriptor as a normalized histogram of vocabulary words
+encountered in the image. The i-th bin of the histogram is the frequency of the i-th word of the
+vocabulary in the given image.
+ */
+class CV_EXPORTS_W BOWImgDescriptorExtractor
+{
+public:
+    /** @brief The constructor.
+
+    @param dextractor Descriptor extractor that is used to compute descriptors for an input image and
+    its keypoints.
+    @param dmatcher Descriptor matcher that is used to find the nearest word of the trained vocabulary
+    for each keypoint descriptor of the image.
+     */
+    CV_WRAP BOWImgDescriptorExtractor( const Ptr<Feature2D>& dextractor,
+                                       const Ptr<DescriptorMatcher>& dmatcher );
+    /** @overload Takes only the descriptor matcher. */
+    BOWImgDescriptorExtractor( const Ptr<DescriptorMatcher>& dmatcher );
+    virtual ~BOWImgDescriptorExtractor();
+
+    /** @brief Sets a visual vocabulary.
+
+    @param vocabulary Vocabulary (can be trained using the inheritor of BOWTrainer ). Each row of the
+    vocabulary is a visual word (cluster center).
+     */
+    CV_WRAP void setVocabulary( const Mat& vocabulary );
+
+    /** @brief Returns the set vocabulary.
+    */
+    CV_WRAP const Mat& getVocabulary() const;
+
+    /** @brief Computes an image descriptor using the set visual vocabulary.
+
+    @param image Image, for which the descriptor is computed.
+    @param keypoints Keypoints detected in the input image.
+    @param imgDescriptor Computed output image descriptor.
+    @param pointIdxsOfClusters Indices of keypoints that belong to the cluster. This means that
+    pointIdxsOfClusters[i] are keypoint indices that belong to the i -th cluster (word of vocabulary).
+    Returned only if the pointer is non-zero (it defaults to 0).
+    @param descriptors Descriptors of the image keypoints; returned only if the pointer is non-zero (it defaults to 0).
+     */
+    void compute( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray imgDescriptor,
+                  std::vector<std::vector<int> >* pointIdxsOfClusters=0, Mat* descriptors=0 );
+    /** @overload
+    @param keypointDescriptors Computed descriptors to match with vocabulary.
+    @param imgDescriptor Computed output image descriptor.
+    @param pointIdxsOfClusters Indices of keypoints that belong to the cluster. This means that
+    pointIdxsOfClusters[i] are keypoint indices that belong to the i -th cluster (word of vocabulary).
+    Returned only if the pointer is non-zero (it defaults to 0).
+    */
+    void compute( InputArray keypointDescriptors, OutputArray imgDescriptor,
+                  std::vector<std::vector<int> >* pointIdxsOfClusters=0 );
+    // compute() is not constant because DescriptorMatcher::match is not constant
+
+    CV_WRAP_AS(compute) void compute2( const Mat& image, std::vector<KeyPoint>& keypoints, CV_OUT Mat& imgDescriptor )
+    { compute(image,keypoints,imgDescriptor); } // forwards to compute(), leaving the optional pointer arguments at their default of 0
+
+    /** @brief Returns an image descriptor size if the vocabulary is set. Otherwise, it returns 0.
+    */
+    CV_WRAP int descriptorSize() const;
+
+    /** @brief Returns an image descriptor type.
+     */
+    CV_WRAP int descriptorType() const;
+
+protected:
+    Mat vocabulary;
+    Ptr<DescriptorExtractor> dextractor;
+    Ptr<DescriptorMatcher> dmatcher;
+};
+
+//! @} features2d_category
+
+} /* namespace cv */
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/features2d/features2d.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/features2d/features2d.hpp
new file mode 100644
index 000000000000..e81df0ad08c1
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/features2d/features2d.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/features2d.hpp"
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/features2d/hal/interface.h b/3rdparty/opencv/opencv410/build/include/opencv2/features2d/hal/interface.h
new file mode 100644
index 000000000000..bc3b084264a4
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/features2d/hal/interface.h
@@ -0,0 +1,33 @@
+#ifndef OPENCV_FEATURE2D_HAL_INTERFACE_H
+#define OPENCV_FEATURE2D_HAL_INTERFACE_H
+
+#include "opencv2/core/cvdef.h"
+//! @addtogroup features2d_hal_interface
+//! @{
+
+//! @name Fast feature detector types
+//! @sa cv::FastFeatureDetector
+//! @{
+#define CV_HAL_TYPE_5_8  0
+#define CV_HAL_TYPE_7_12 1
+#define CV_HAL_TYPE_9_16 2
+//! @}
+
+//! @name Key point
+//! @sa cv::KeyPoint
+//! @{
+struct CV_EXPORTS cvhalKeyPoint // plain record of five floats and two ints; see cv::KeyPoint (referenced above) — NOTE(review): presumably mirrors its fields; confirm
+{
+    float x;
+    float y;
+    float size;
+    float angle;
+    float response;
+    int octave;
+    int class_id;
+};
+//! @}
+
+//! @}
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/flann.hpp
new file mode 100644
index 000000000000..90ee59e0b869
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann.hpp
@@ -0,0 +1,629 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_FLANN_HPP
+#define OPENCV_FLANN_HPP
+
+#include "opencv2/core.hpp"
+#include "opencv2/flann/miniflann.hpp"
+#include "opencv2/flann/flann_base.hpp"
+
+/**
+@defgroup flann Clustering and Search in Multi-Dimensional Spaces
+
+This section documents OpenCV's interface to the FLANN library. FLANN (Fast Library for Approximate
+Nearest Neighbors) is a library that contains a collection of algorithms optimized for fast nearest
+neighbor search in large datasets and for high dimensional features. More information about FLANN
+can be found in @cite Muja2009 .
+*/
+
+namespace cvflann
+{
+    CV_EXPORTS flann_distance_t flann_distance_type(); // queried by FLANN_DISTANCE_CHECK in this header, which warns when the result is not FLANN_DIST_L2
+    CV_DEPRECATED CV_EXPORTS void set_distance_type(flann_distance_t distance_type, int order); // deprecated; the FLANN_DISTANCE_CHECK warning text says changing the distance this way no longer works as expected
+}
+
+
+namespace cv
+{
+namespace flann
+{
+
+
+//! @addtogroup flann
+//! @{
+
+template <typename T> struct CvType {}; // primary template has no type(): CvType<T>::type() only compiles for the element types specialized below
+template <> struct CvType<unsigned char> { static int type() { return CV_8U; } };
+template <> struct CvType<char> { static int type() { return CV_8S; } };
+template <> struct CvType<unsigned short> { static int type() { return CV_16U; } };
+template <> struct CvType<short> { static int type() { return CV_16S; } };
+template <> struct CvType<int> { static int type() { return CV_32S; } };
+template <> struct CvType<float> { static int type() { return CV_32F; } };
+template <> struct CvType<double> { static int type() { return CV_64F; } }; // GenericIndex compares these constants against Mat::type() in its CV_Assert checks
+
+
+// bring the flann parameters into this namespace
+using ::cvflann::get_param;
+using ::cvflann::print_params;
+
+// bring the flann distances into this namespace
+using ::cvflann::L2_Simple;
+using ::cvflann::L2;
+using ::cvflann::L1;
+using ::cvflann::MinkowskiDistance;
+using ::cvflann::MaxDistance;
+using ::cvflann::HammingLUT;
+using ::cvflann::Hamming;
+using ::cvflann::Hamming2;
+using ::cvflann::DNAmmingLUT;
+using ::cvflann::DNAmming2;
+using ::cvflann::HistIntersectionDistance;
+using ::cvflann::HellingerDistance;
+using ::cvflann::ChiSquareDistance;
+using ::cvflann::KL_Divergence;
+
+
+/** @brief The FLANN nearest neighbor index class. This class is templated with the type of elements for which
+the index is built.
+
+`Distance` functor specifies the metric to be used to calculate the distance between two points.
+There are several `Distance` functors that are readily available:
+
+cv::flann::L2_Simple - Squared Euclidean distance functor.
+This is the simpler, unrolled version. This is preferable for very low dimensionality data (eg 3D points)
+
+cv::flann::L2 - Squared Euclidean distance functor, optimized version.
+
+cv::flann::L1 - Manhattan distance functor, optimized version.
+
+cv::flann::MinkowskiDistance -  The Minkowski distance functor.
+This is highly optimised with loop unrolling.
+The computation of squared root at the end is omitted for efficiency.
+
+cv::flann::MaxDistance - The max distance functor. It computes the
+maximum distance between two vectors. This distance is not a valid kdtree distance, it's not
+dimensionwise additive.
+
+cv::flann::HammingLUT -  %Hamming distance functor. It counts the bit
+differences between two strings using a lookup table implementation.
+
+cv::flann::Hamming - %Hamming distance functor. Population count is
+performed using library calls, if available. Lookup table implementation is used as a fallback.
+
+cv::flann::Hamming2 - %Hamming distance functor. Population count is
+implemented in 12 arithmetic operations (one of which is multiplication).
+
+cv::flann::DNAmmingLUT -  %Adaptation of the Hamming distance functor to DNA comparison.
+As the four bases A, C, G, T of the DNA (or A, G, C, U for RNA) can be coded on 2 bits,
+it counts the bits pairs differences between two sequences using a lookup table implementation.
+
+cv::flann::DNAmming2 - %Adaptation of the Hamming distance functor to DNA comparison.
+Bases differences count are vectorised thanks to arithmetic operations using standard
+registers (AVX2 and AVX-512 should come in a near future).
+
+cv::flann::HistIntersectionDistance - The histogram
+intersection distance functor.
+
+cv::flann::HellingerDistance - The Hellinger distance functor.
+
+cv::flann::ChiSquareDistance - The chi-square distance functor.
+
+cv::flann::KL_Divergence - The Kullback-Leibler divergence functor.
+
+Although the provided implementations cover a vast range of cases, it is also possible to use
+a custom implementation. The distance functor is a class whose `operator()` computes the distance
+between two features. If the distance is also a kd-tree compatible distance, it should also provide an
+`accum_dist()` method that computes the distance between individual feature dimensions.
+
+In addition to `operator()` and `accum_dist()`, a distance functor should also define the
+`ElementType` and the `ResultType` as the types of the elements it operates on and the type of the
+result it computes. If a distance functor can be used as a kd-tree distance (meaning that the full
+distance between a pair of features can be accumulated from the partial distances between the
+individual dimensions) a typedef `is_kdtree_distance` should be present inside the distance functor.
+If the distance is not a kd-tree distance, but it's a distance in a vector space (the individual
+dimensions of the elements it operates on can be accessed independently) a typedef
+`is_vector_space_distance` should be defined inside the functor. If neither typedef is defined, the
+distance is assumed to be a metric distance and will only be used with indexes operating on
+generic metric distances.
+ */
+template <typename Distance>
+class GenericIndex
+{
+public:
+        typedef typename Distance::ElementType ElementType;
+        typedef typename Distance::ResultType DistanceType;
+
+        /** @brief Constructs a nearest neighbor search index for a given dataset.
+
+        @param features Matrix containing the features (points) to index. The size of the matrix is
+        num_features x feature_dimensionality and the data type of the elements in the matrix must
+        coincide with the type of the index.
+        @param params Structure containing the index parameters. The type of index that will be
+        constructed depends on the type of this parameter. See the description.
+        @param distance Distance functor instance to use; defaults to a default-constructed Distance.
+
+        The method constructs a fast search structure from a set of features using the specified algorithm
+        with specified parameters, as defined by params. params is a reference to one of the following class
+        IndexParams descendants:
+
+        - **LinearIndexParams** When passing an object of this type, the index will perform a linear,
+        brute-force search. :
+        @code
+        struct LinearIndexParams : public IndexParams
+        {
+        };
+        @endcode
+        - **KDTreeIndexParams** When passing an object of this type the index constructed will consist of
+        a set of randomized kd-trees which will be searched in parallel. :
+        @code
+        struct KDTreeIndexParams : public IndexParams
+        {
+            KDTreeIndexParams( int trees = 4 );
+        };
+        @endcode
+        - **HierarchicalClusteringIndexParams** When passing an object of this type the index constructed
+        will be a hierarchical tree of clusters, dividing each set of points into n clusters whose centers
+        are picked among the points without further refinement of their position.
+        This algorithm fits both floating, integer and binary vectors. :
+        @code
+        struct HierarchicalClusteringIndexParams : public IndexParams
+        {
+            HierarchicalClusteringIndexParams(
+                int branching = 32,
+                flann_centers_init_t centers_init = CENTERS_RANDOM,
+                int trees = 4,
+                int leaf_size = 100);
+
+        };
+        @endcode
+        - **KMeansIndexParams** When passing an object of this type the index constructed will be a
+        hierarchical k-means tree (one tree by default), dividing each set of points into n clusters
+        whose barycenters are refined iteratively.
+        Note that this algorithm has been extended to the support of binary vectors as an alternative
+        to LSH when knn search speed is the criterion. It will also outperform LSH when processing
+        directly (i.e. without the use of MCA/PCA) datasets whose points share mostly the same values
+        for most of the dimensions. It is recommended to set more than one tree with binary data. :
+        @code
+        struct KMeansIndexParams : public IndexParams
+        {
+            KMeansIndexParams(
+                int branching = 32,
+                int iterations = 11,
+                flann_centers_init_t centers_init = CENTERS_RANDOM,
+                float cb_index = 0.2,
+                int trees = 1);
+        };
+        @endcode
+        - **CompositeIndexParams** When using a parameters object of this type the index created
+        combines the randomized kd-trees and the hierarchical k-means tree. :
+        @code
+        struct CompositeIndexParams : public IndexParams
+        {
+            CompositeIndexParams(
+                int trees = 4,
+                int branching = 32,
+                int iterations = 11,
+                flann_centers_init_t centers_init = CENTERS_RANDOM,
+                float cb_index = 0.2 );
+        };
+        @endcode
+        - **LshIndexParams** When using a parameters object of this type the index created uses
+        multi-probe LSH (by Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity Search
+        by Qin Lv, William Josephson, Zhe Wang, Moses Charikar, Kai Li., Proceedings of the 33rd
+        International Conference on Very Large Data Bases (VLDB). Vienna, Austria. September 2007).
+        This algorithm is designed for binary vectors. :
+        @code
+        struct LshIndexParams : public IndexParams
+        {
+            LshIndexParams(
+                int table_number,
+                int key_size,
+                int multi_probe_level );
+        };
+        @endcode
+        - **AutotunedIndexParams** When passing an object of this type the index created is
+        automatically tuned to offer the best performance, by choosing the optimal index type
+        (randomized kd-trees, hierarchical kmeans, linear) and parameters for the dataset provided. :
+        @code
+        struct AutotunedIndexParams : public IndexParams
+        {
+            AutotunedIndexParams(
+                float target_precision = 0.9,
+                float build_weight = 0.01,
+                float memory_weight = 0,
+                float sample_fraction = 0.1 );
+        };
+        @endcode
+        - **SavedIndexParams** This object type is used for loading a previously saved index from the
+        disk. :
+        @code
+        struct SavedIndexParams : public IndexParams
+        {
+            SavedIndexParams( String filename );
+        };
+        @endcode
+         */
+        GenericIndex(const Mat& features, const ::cvflann::IndexParams& params, Distance distance = Distance());
+        GenericIndex(const GenericIndex&) = delete; // non-copyable: a copy would share nnIndex and the destructor would delete it twice
+        ~GenericIndex();
+        GenericIndex& operator=(const GenericIndex&) = delete; // non-assignable for the same reason
+        /** @brief Performs a K-nearest neighbor search for a given query point using the index.
+
+        @param query The query point
+        @param indices Vector that will contain the indices of the K-nearest neighbors found. It must have
+        at least knn size.
+        @param dists Vector that will contain the distances to the K-nearest neighbors found. It must have
+        at least knn size.
+        @param knn Number of nearest neighbors to search for.
+        @param params SearchParams
+         */
+        void knnSearch(const std::vector<ElementType>& query, std::vector<int>& indices,
+                       std::vector<DistanceType>& dists, int knn, const ::cvflann::SearchParams& params);
+        void knnSearch(const Mat& queries, Mat& indices, Mat& dists, int knn, const ::cvflann::SearchParams& params);
+
+        /** @brief Performs a radius nearest neighbor search for a given query point using the index.
+
+        @param query The query point.
+        @param indices Vector that will contain the indices of the nearest neighbors found.
+        @param dists Vector that will contain the distances to the nearest neighbors found. It has the same
+        number of elements as indices.
+        @param radius The search radius.
+        @param params SearchParams
+
+        This function returns the number of nearest neighbors found.
+        */
+        int radiusSearch(const std::vector<ElementType>& query, std::vector<int>& indices,
+                         std::vector<DistanceType>& dists, DistanceType radius, const ::cvflann::SearchParams& params);
+        int radiusSearch(const Mat& query, Mat& indices, Mat& dists,
+                         DistanceType radius, const ::cvflann::SearchParams& params);
+
+        void save(String filename) { nnIndex->save(filename); }
+
+        int veclen() const { return nnIndex->veclen(); }
+
+        int size() const { return (int)nnIndex->size(); }
+
+        ::cvflann::IndexParams getParameters() { return nnIndex->getParameters(); }
+
+        CV_DEPRECATED const ::cvflann::IndexParams* getIndexParameters() { return nnIndex->getIndexParameters(); }
+
+private:
+        ::cvflann::Index<Distance>* nnIndex; // owning pointer: allocated in the constructor, deleted in the destructor
+        Mat _dataset;                        // Mat header copied from the constructor argument; the index is built over its data
+};
+
+//! @cond IGNORED
+// Prints a warning via printf when ::cvflann::flann_distance_type() is not FLANN_DIST_L2. Expands to a bare if-block, so it is used without a trailing ';'.
+#define FLANN_DISTANCE_CHECK \
+    if ( ::cvflann::flann_distance_type() != cvflann::FLANN_DIST_L2) { \
+        printf("[WARNING] You are using cv::flann::Index (or cv::flann::GenericIndex) and have also changed "\
+        "the distance using cvflann::set_distance_type. This is no longer working as expected "\
+        "(cv::flann::Index always uses L2). You should create the index templated on the distance, "\
+        "for example for L1 distance use: GenericIndex< L1<float> > \n"); \
+    }
+
+
+template <typename Distance>
+GenericIndex<Distance>::GenericIndex(const Mat& dataset, const ::cvflann::IndexParams& params, Distance distance)
+: _dataset(dataset)
+{
+    CV_Assert(dataset.type() == CvType<ElementType>::type());
+    CV_Assert(dataset.isContinuous());
+    ::cvflann::Matrix<ElementType> m_dataset((ElementType*)_dataset.ptr<ElementType>(0), _dataset.rows, _dataset.cols); // wraps the data of the stored _dataset header
+
+    nnIndex = new ::cvflann::Index<Distance>(m_dataset, params, distance);
+    FLANN_DISTANCE_CHECK
+    // The destructor does not run when a constructor throws, so release nnIndex here if building fails.
+    try { nnIndex->buildIndex(); }
+    catch (...) { delete nnIndex; throw; }
+}
+
+template <typename Distance>
+GenericIndex<Distance>::~GenericIndex()
+{
+    delete nnIndex; // releases the index object allocated with new in the constructor
+}
+
+template <typename Distance>
+void GenericIndex<Distance>::knnSearch(const std::vector<ElementType>& query, std::vector<int>& indices, std::vector<DistanceType>& dists, int knn, const ::cvflann::SearchParams& searchParams)
+{
+    // Wrap each vector as a 1 x size() matrix. data() replaces &v[0], which is undefined behaviour on an empty vector.
+    ::cvflann::Matrix<ElementType> m_query(const_cast<ElementType*>(query.data()), 1, query.size());
+    ::cvflann::Matrix<int> m_indices(indices.data(), 1, indices.size());
+    ::cvflann::Matrix<DistanceType> m_dists(dists.data(), 1, dists.size());
+    FLANN_DISTANCE_CHECK
+
+    nnIndex->knnSearch(m_query,m_indices,m_dists,knn,searchParams);
+}
+
+
+template <typename Distance>
+void GenericIndex<Distance>::knnSearch(const Mat& queries, Mat& indices, Mat& dists, int knn, const ::cvflann::SearchParams& searchParams)
+{
+    CV_Assert(queries.type() == CvType<ElementType>::type());
+    CV_Assert(queries.isContinuous());
+    ::cvflann::Matrix<ElementType> queryView(const_cast<ElementType*>(queries.ptr<ElementType>(0)), queries.rows, queries.cols);
+    // indices must be a continuous CV_32S matrix
+    CV_Assert(indices.type() == CV_32S);
+    CV_Assert(indices.isContinuous());
+    ::cvflann::Matrix<int> indexView(indices.ptr<int>(0), indices.rows, indices.cols);
+    // dists must be a continuous matrix of the functor's result type
+    CV_Assert(dists.type() == CvType<DistanceType>::type());
+    CV_Assert(dists.isContinuous());
+    ::cvflann::Matrix<DistanceType> distView(dists.ptr<DistanceType>(0), dists.rows, dists.cols);
+
+    FLANN_DISTANCE_CHECK
+    // hand the three wrapped matrices to the underlying index
+    nnIndex->knnSearch(queryView, indexView, distView, knn, searchParams);
+}
+
+template <typename Distance>
+int GenericIndex<Distance>::radiusSearch(const std::vector<ElementType>& query, std::vector<int>& indices, std::vector<DistanceType>& dists, DistanceType radius, const ::cvflann::SearchParams& searchParams)
+{
+    // Single-query radius search: each vector is passed to FLANN as a 1 x size() matrix; returns whatever the underlying index returns.
+    // data() stays well-defined for an empty vector (unlike &v[0]); the const_cast mirrors the cast the query pointer always needed.
+    ::cvflann::Matrix<ElementType> m_query(const_cast<ElementType*>(query.data()), 1, query.size());
+    ::cvflann::Matrix<int> m_indices(indices.data(), 1, indices.size());
+    ::cvflann::Matrix<DistanceType> m_dists(dists.data(), 1, dists.size());
+    FLANN_DISTANCE_CHECK
+    return nnIndex->radiusSearch(m_query,m_indices,m_dists,radius,searchParams);
+}
+
+template <typename Distance>
+int GenericIndex<Distance>::radiusSearch(const Mat& query, Mat& indices, Mat& dists, DistanceType radius, const ::cvflann::SearchParams& searchParams)
+{
+    // Validate the query matrix, then wrap its buffer in a FLANN matrix header.
+    CV_Assert(query.type() == CvType<ElementType>::type());
+    CV_Assert(query.isContinuous());
+    ::cvflann::Matrix<ElementType> queryView(const_cast<ElementType*>(query.ptr<ElementType>(0)), query.rows, query.cols);
+    // The index output must be a continuous CV_32S matrix.
+    CV_Assert(indices.type() == CV_32S);
+    CV_Assert(indices.isContinuous());
+    ::cvflann::Matrix<int> indexView(indices.ptr<int>(0), indices.rows, indices.cols);
+    // The distance output must be continuous and hold DistanceType elements.
+    CV_Assert(dists.type() == CvType<DistanceType>::type());
+    CV_Assert(dists.isContinuous());
+    ::cvflann::Matrix<DistanceType> distView(dists.ptr<DistanceType>(0), dists.rows, dists.cols);
+
+    FLANN_DISTANCE_CHECK
+    return nnIndex->radiusSearch(queryView, indexView, distView, radius, searchParams);
+}
+
+/**
+ * @deprecated Use GenericIndex class instead
+ */
+template <typename T>
+class Index_
+{
+public:
+    typedef typename L2<T>::ElementType ElementType;
+    typedef typename L2<T>::ResultType DistanceType;
+    // Builds an L2 or L1 index according to ::cvflann::flann_distance_type(); the index is freed again if buildIndex() throws.
+    CV_DEPRECATED Index_(const Mat& dataset, const ::cvflann::IndexParams& params)
+    {
+        printf("[WARNING] The cv::flann::Index_<T> class is deperecated, use cv::flann::GenericIndex<Distance> instead\n");
+
+        CV_Assert(dataset.type() == CvType<ElementType>::type());
+        CV_Assert(dataset.isContinuous());
+        ::cvflann::Matrix<ElementType> m_dataset((ElementType*)dataset.ptr<ElementType>(0), dataset.rows, dataset.cols);
+
+        if ( ::cvflann::flann_distance_type() == cvflann::FLANN_DIST_L2 ) {
+            nnIndex_L1 = NULL;
+            nnIndex_L2 = new ::cvflann::Index< L2<ElementType> >(m_dataset, params);
+        }
+        else if ( ::cvflann::flann_distance_type() == cvflann::FLANN_DIST_L1 ) {
+            nnIndex_L1 = new ::cvflann::Index< L1<ElementType> >(m_dataset, params);
+            nnIndex_L2 = NULL;
+        }
+        else {
+            printf("[ERROR] cv::flann::Index_<T> only provides backwards compatibility for the L1 and L2 distances. "
+                   "For other distance types you must use cv::flann::GenericIndex<Distance>\n");
+            CV_Assert(0);
+        }
+        try { if (nnIndex_L1) nnIndex_L1->buildIndex(); if (nnIndex_L2) nnIndex_L2->buildIndex(); }
+        catch (...) { delete nnIndex_L1; delete nnIndex_L2; throw; }
+    }
+    CV_DEPRECATED ~Index_()
+    {
+        delete nnIndex_L1; // delete on a NULL pointer is a no-op, so no guard is needed
+        delete nnIndex_L2;
+    }
+    // Vector overloads wrap each vector as a 1 x size() matrix; data() is valid even when a vector is empty (&v[0] is not).
+    CV_DEPRECATED void knnSearch(const std::vector<ElementType>& query, std::vector<int>& indices, std::vector<DistanceType>& dists, int knn, const ::cvflann::SearchParams& searchParams)
+    {
+        ::cvflann::Matrix<ElementType> m_query(const_cast<ElementType*>(query.data()), 1, query.size());
+        ::cvflann::Matrix<int> m_indices(indices.data(), 1, indices.size());
+        ::cvflann::Matrix<DistanceType> m_dists(dists.data(), 1, dists.size());
+
+        if (nnIndex_L1) nnIndex_L1->knnSearch(m_query,m_indices,m_dists,knn,searchParams);
+        if (nnIndex_L2) nnIndex_L2->knnSearch(m_query,m_indices,m_dists,knn,searchParams);
+    }
+    CV_DEPRECATED void knnSearch(const Mat& queries, Mat& indices, Mat& dists, int knn, const ::cvflann::SearchParams& searchParams)
+    {
+        CV_Assert(queries.type() == CvType<ElementType>::type());
+        CV_Assert(queries.isContinuous());
+        ::cvflann::Matrix<ElementType> m_queries((ElementType*)queries.ptr<ElementType>(0), queries.rows, queries.cols);
+
+        CV_Assert(indices.type() == CV_32S);
+        CV_Assert(indices.isContinuous());
+        ::cvflann::Matrix<int> m_indices((int*)indices.ptr<int>(0), indices.rows, indices.cols);
+
+        CV_Assert(dists.type() == CvType<DistanceType>::type());
+        CV_Assert(dists.isContinuous());
+        ::cvflann::Matrix<DistanceType> m_dists((DistanceType*)dists.ptr<DistanceType>(0), dists.rows, dists.cols);
+
+        if (nnIndex_L1) nnIndex_L1->knnSearch(m_queries,m_indices,m_dists,knn, searchParams);
+        if (nnIndex_L2) nnIndex_L2->knnSearch(m_queries,m_indices,m_dists,knn, searchParams);
+    }
+    // Value-returning members below: exactly one index exists after construction, so L2 is the unconditional fallback (no path falls off the end).
+    CV_DEPRECATED int radiusSearch(const std::vector<ElementType>& query, std::vector<int>& indices, std::vector<DistanceType>& dists, DistanceType radius, const ::cvflann::SearchParams& searchParams)
+    {
+        ::cvflann::Matrix<ElementType> m_query(const_cast<ElementType*>(query.data()), 1, query.size());
+        ::cvflann::Matrix<int> m_indices(indices.data(), 1, indices.size());
+        ::cvflann::Matrix<DistanceType> m_dists(dists.data(), 1, dists.size());
+
+        if (nnIndex_L1) return nnIndex_L1->radiusSearch(m_query,m_indices,m_dists,radius,searchParams);
+        return nnIndex_L2->radiusSearch(m_query,m_indices,m_dists,radius,searchParams);
+    }
+
+    CV_DEPRECATED int radiusSearch(const Mat& query, Mat& indices, Mat& dists, DistanceType radius, const ::cvflann::SearchParams& searchParams)
+    {
+        CV_Assert(query.type() == CvType<ElementType>::type());
+        CV_Assert(query.isContinuous());
+        ::cvflann::Matrix<ElementType> m_query((ElementType*)query.ptr<ElementType>(0), query.rows, query.cols);
+
+        CV_Assert(indices.type() == CV_32S);
+        CV_Assert(indices.isContinuous());
+        ::cvflann::Matrix<int> m_indices((int*)indices.ptr<int>(0), indices.rows, indices.cols);
+
+        CV_Assert(dists.type() == CvType<DistanceType>::type());
+        CV_Assert(dists.isContinuous());
+        ::cvflann::Matrix<DistanceType> m_dists((DistanceType*)dists.ptr<DistanceType>(0), dists.rows, dists.cols);
+
+        if (nnIndex_L1) return nnIndex_L1->radiusSearch(m_query,m_indices,m_dists,radius,searchParams);
+        return nnIndex_L2->radiusSearch(m_query,m_indices,m_dists,radius,searchParams);
+    }
+
+    CV_DEPRECATED void save(String filename)
+    {
+        if (nnIndex_L1) nnIndex_L1->save(filename);
+        if (nnIndex_L2) nnIndex_L2->save(filename);
+    }
+
+    CV_DEPRECATED int veclen() const
+    {
+        if (nnIndex_L1) return nnIndex_L1->veclen();
+        return nnIndex_L2->veclen();
+    }
+
+    CV_DEPRECATED int size() const
+    {
+        if (nnIndex_L1) return nnIndex_L1->size();
+        return nnIndex_L2->size();
+    }
+
+    CV_DEPRECATED ::cvflann::IndexParams getParameters()
+    {
+        // Forwarded to whichever index was built by the constructor.
+        if (nnIndex_L1) return nnIndex_L1->getParameters();
+        return nnIndex_L2->getParameters();
+    }
+
+    CV_DEPRECATED const ::cvflann::IndexParams* getIndexParameters()
+    {
+        if (nnIndex_L1) return nnIndex_L1->getIndexParameters();
+        return nnIndex_L2->getIndexParameters();
+    }
+
+private:
+    // providing backwards compatibility for L2 and L1 distances (most common)
+    ::cvflann::Index< L2<ElementType> >* nnIndex_L2;
+    ::cvflann::Index< L1<ElementType> >* nnIndex_L1;
+};
+
+//! @endcond
+
+/** @brief Clusters features using hierarchical k-means algorithm.
+
+@param features The points to be clustered. The matrix must have elements of type
+Distance::ElementType.
+@param centers The centers of the clusters obtained. The matrix must have type
+Distance::CentersType. The number of rows in this matrix represents the number of clusters desired,
+however, because of the way the cut in the hierarchical tree is chosen, the number of clusters
+computed will be the highest number of the form (branching-1)\*k+1 that's lower than the number of
+clusters desired, where branching is the tree's branching factor (see description of the
+KMeansIndexParams).
+@param params Parameters used in the construction of the hierarchical k-means tree.
+@param d Distance to be used for clustering.
+
+The method clusters the given feature vectors by constructing a hierarchical k-means tree and
+choosing a cut in the tree that minimizes the cluster's variance. It returns the number of clusters
+found.
+ */
+template <typename Distance>
+int hierarchicalClustering(const Mat& features, Mat& centers, const ::cvflann::KMeansIndexParams& params,
+                           Distance d = Distance())
+{
+    typedef typename Distance::ElementType ElementType;
+    typedef typename Distance::CentersType CentersType;
+
+    // Validate the input points, then wrap their buffer in a FLANN matrix header.
+    CV_Assert(features.type() == CvType<ElementType>::type());
+    CV_Assert(features.isContinuous());
+    ::cvflann::Matrix<ElementType> featureView(const_cast<ElementType*>(features.ptr<ElementType>(0)), features.rows, features.cols);
+    // Same checks and wrapping for the matrix that receives the cluster centers.
+    CV_Assert(centers.type() == CvType<CentersType>::type());
+    CV_Assert(centers.isContinuous());
+    ::cvflann::Matrix<CentersType> centerView(centers.ptr<CentersType>(0), centers.rows, centers.cols);
+
+    return ::cvflann::hierarchicalClustering<Distance>(featureView, centerView, params, d);
+}
+
+//! @cond IGNORED
+
+template <typename ELEM_TYPE, typename DIST_TYPE>
+CV_DEPRECATED int hierarchicalClustering(const Mat& features, Mat& centers, const ::cvflann::KMeansIndexParams& params)
+{
+    printf("[WARNING] cv::flann::hierarchicalClustering<ELEM_TYPE,DIST_TYPE> is deprecated, use "
+        "cv::flann::hierarchicalClustering<Distance> instead\n");
+
+    // Legacy entry point: forward to the Distance-templated overload that matches the
+    // global FLANN distance setting. Only L2 and L1 are handled here.
+    if ( ::cvflann::flann_distance_type() == cvflann::FLANN_DIST_L2 )
+        return hierarchicalClustering< L2<ELEM_TYPE> >(features, centers, params);
+    if ( ::cvflann::flann_distance_type() == cvflann::FLANN_DIST_L1 )
+        return hierarchicalClustering< L1<ELEM_TYPE> >(features, centers, params);
+
+    // Any other distance type is reported and then rejected by the assertion.
+    printf("[ERROR] cv::flann::hierarchicalClustering<ELEM_TYPE,DIST_TYPE> only provides backwards "
+    "compatibility for the L1 and L2 distances. "
+    "For other distance types you must use cv::flann::hierarchicalClustering<Distance>\n");
+    CV_Assert(0);
+}
+
+//! @endcond
+
+//! @} flann
+
+} } // namespace cv::flann
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/all_indices.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/all_indices.h
new file mode 100644
index 000000000000..03877ab6adca
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/all_indices.h
@@ -0,0 +1,162 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+
+#ifndef OPENCV_FLANN_ALL_INDICES_H_
+#define OPENCV_FLANN_ALL_INDICES_H_
+
+//! @cond IGNORED
+
+#include "general.h"
+
+#include "nn_index.h"
+#include "kdtree_index.h"
+#include "kdtree_single_index.h"
+#include "kmeans_index.h"
+#include "composite_index.h"
+#include "linear_index.h"
+#include "hierarchical_clustering_index.h"
+#include "lsh_index.h"
+#include "autotuned_index.h"
+
+
+namespace cvflann
+{
+
+template<typename KDTreeCapability, typename VectorSpace, typename Distance>
+struct index_creator // primary template: accepts every algorithm listed in the switch below
+{
+    static NNIndex<Distance>* create(const Matrix<typename Distance::ElementType>& dataset, const IndexParams& params, const Distance& distance)
+    {
+        flann_algorithm_t index_type = get_param<flann_algorithm_t>(params, "algorithm");
+        // Allocate (with new) the index class that matches the "algorithm" entry of params.
+        NNIndex<Distance>* nnIndex;
+        switch (index_type) {
+        case FLANN_INDEX_LINEAR:
+            nnIndex = new LinearIndex<Distance>(dataset, params, distance);
+            break;
+        case FLANN_INDEX_KDTREE_SINGLE:
+            nnIndex = new KDTreeSingleIndex<Distance>(dataset, params, distance);
+            break;
+        case FLANN_INDEX_KDTREE:
+            nnIndex = new KDTreeIndex<Distance>(dataset, params, distance);
+            break;
+        case FLANN_INDEX_KMEANS:
+            nnIndex = new KMeansIndex<Distance>(dataset, params, distance);
+            break;
+        case FLANN_INDEX_COMPOSITE:
+            nnIndex = new CompositeIndex<Distance>(dataset, params, distance);
+            break;
+        case FLANN_INDEX_AUTOTUNED:
+            nnIndex = new AutotunedIndex<Distance>(dataset, params, distance);
+            break;
+        case FLANN_INDEX_HIERARCHICAL:
+            nnIndex = new HierarchicalClusteringIndex<Distance>(dataset, params, distance);
+            break;
+        case FLANN_INDEX_LSH:
+            nnIndex = new LshIndex<Distance>(dataset, params, distance);
+            break;
+        default:
+            FLANN_THROW(cv::Error::StsBadArg, "Unknown index type");
+        }
+        // NOTE(review): nnIndex is left unset on the default path, so FLANN_THROW presumably never returns - confirm.
+        return nnIndex;
+    }
+};
+
+template<typename VectorSpace, typename Distance>
+struct index_creator<False,VectorSpace,Distance> // first argument False: the k-d tree, composite and autotuned cases are not offered
+{
+    static NNIndex<Distance>* create(const Matrix<typename Distance::ElementType>& dataset, const IndexParams& params, const Distance& distance)
+    {
+        flann_algorithm_t index_type = get_param<flann_algorithm_t>(params, "algorithm");
+        // Allocate (with new) the index class that matches the "algorithm" entry of params.
+        NNIndex<Distance>* nnIndex;
+        switch (index_type) {
+        case FLANN_INDEX_LINEAR:
+            nnIndex = new LinearIndex<Distance>(dataset, params, distance);
+            break;
+        case FLANN_INDEX_KMEANS:
+            nnIndex = new KMeansIndex<Distance>(dataset, params, distance);
+            break;
+        case FLANN_INDEX_HIERARCHICAL:
+            nnIndex = new HierarchicalClusteringIndex<Distance>(dataset, params, distance);
+            break;
+        case FLANN_INDEX_LSH:
+            nnIndex = new LshIndex<Distance>(dataset, params, distance);
+            break;
+        default:
+            FLANN_THROW(cv::Error::StsBadArg, "Unknown index type");
+        }
+        // NOTE(review): nnIndex is left unset on the default path, so FLANN_THROW presumably never returns - confirm.
+        return nnIndex;
+    }
+};
+
+template<typename Distance>
+struct index_creator<False,False,Distance> // both leading arguments False: offers the same four cases as the <False,VectorSpace,Distance> form
+{
+    static NNIndex<Distance>* create(const Matrix<typename Distance::ElementType>& dataset, const IndexParams& params, const Distance& distance)
+    {
+        flann_algorithm_t index_type = get_param<flann_algorithm_t>(params, "algorithm");
+        // Allocate (with new) the index class that matches the "algorithm" entry of params.
+        NNIndex<Distance>* nnIndex;
+        switch (index_type) {
+        case FLANN_INDEX_LINEAR:
+            nnIndex = new LinearIndex<Distance>(dataset, params, distance);
+            break;
+        case FLANN_INDEX_KMEANS:
+            nnIndex = new KMeansIndex<Distance>(dataset, params, distance);
+            break;
+        case FLANN_INDEX_HIERARCHICAL:
+            nnIndex = new HierarchicalClusteringIndex<Distance>(dataset, params, distance);
+            break;
+        case FLANN_INDEX_LSH:
+            nnIndex = new LshIndex<Distance>(dataset, params, distance);
+            break;
+        default:
+            FLANN_THROW(cv::Error::StsBadArg, "Unknown index type");
+        }
+        // NOTE(review): nnIndex is left unset on the default path, so FLANN_THROW presumably never returns - confirm.
+        return nnIndex;
+    }
+};
+
+template<typename Distance>
+NNIndex<Distance>* create_index_by_type(const Matrix<typename Distance::ElementType>& dataset, const IndexParams& params, const Distance& distance)
+{
+    // Pick the index_creator variant from the two capability typedefs published by Distance.
+    typedef index_creator<typename Distance::is_kdtree_distance, typename Distance::is_vector_space_distance, Distance> creator_type;
+    return creator_type::create(dataset, params, distance);
+}
+
+}
+
+//! @endcond
+
+#endif /* OPENCV_FLANN_ALL_INDICES_H_ */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/allocator.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/allocator.h
new file mode 100644
index 000000000000..d5870a018161
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/allocator.h
@@ -0,0 +1,196 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_ALLOCATOR_H_
+#define OPENCV_FLANN_ALLOCATOR_H_
+
+//! @cond IGNORED
+
+#include <stdlib.h>
+#include <stdio.h>
+
+
+namespace cvflann
+{
+
+/**
+ * Allocates (using C's malloc) a generic type T.
+ *
+ * Params:
+ *     count = number of instances to allocate.
+ * Returns: pointer (of type T*) to memory buffer
+ */
+template <typename T>
+T* allocate(size_t count = 1)
+{
+    // Raw malloc'ed storage for `count` objects of type T; no constructor is run on it.
+    return static_cast<T*>(::malloc(count * sizeof(T)));
+}
+
+
+/**
+ * Pooled storage allocator
+ *
+ * The following routines allow for the efficient allocation of storage in
+ * small chunks from a specified pool.  Rather than allowing each structure
+ * to be freed individually, an entire pool of storage is freed at once.
+ * This method has two advantages over just using malloc() and free().  First,
+ * it is far more efficient for allocating small objects, as there is
+ * no overhead for remembering all the information needed to free each
+ * object or consolidating fragmented memory.  Second, the decision about
+ * how long to keep an object is made at the time of allocation, and there
+ * is no need to track down all the objects to free them.
+ *
+ */
+
+const size_t WORDSIZE = 16;     /* Allocation granularity in bytes. Must be a power of 2. */
+const size_t BLOCKSIZE = 8192;  /* Default minimum number of bytes requested from malloc() at a time. */
+
+class PooledAllocator
+{
+    /* Every request is rounded up to a multiple of WORDSIZE, and storage is
+        obtained from malloc() in blocks of at least `blocksize` bytes.  */
+    /* The first pointer-sized word of each block links to the previous block, */
+    /* which is the chain the destructor walks to free the whole pool. */
+
+
+    int     remaining;  /* Number of bytes left in current block of storage. */
+    void*   base;     /* Pointer to base of current block of storage. */
+    void*   loc;      /* Current location in block to next allocate memory. */
+    int     blocksize;  /* Minimum size in bytes of each block requested from malloc(). */
+
+
+public:
+    int     usedMemory;    /* Total bytes handed out so far, counted after rounding. */
+    int     wastedMemory;  /* Bytes left unused at the end of blocks that were replaced. */
+
+    /**
+        Default constructor. Initializes a new, empty pool; blockSize is the minimum size of each malloc'ed block.
+     */
+    PooledAllocator(int blockSize = BLOCKSIZE)
+    {
+        blocksize = blockSize;
+        remaining = 0;
+        base = NULL;
+        loc = NULL;
+
+        usedMemory = 0;
+        wastedMemory = 0;
+    }
+
+    /**
+     * Destructor. Frees all the memory allocated in this pool.
+     */
+    ~PooledAllocator()
+    {
+        void* prev;
+
+        while (base != NULL) {
+            prev = *((void**) base); /* Get pointer to prev block. */
+            ::free(base);
+            base = prev;
+        }
+    }
+
+    /**
+     * Returns a pointer to a piece of new memory of the given size in bytes
+     * allocated from the pool, or NULL (after printing to stderr) if malloc() fails.
+     */
+    void* allocateMemory(int size)
+    {
+        int blockSize;
+
+        /* Round size up to a multiple of wordsize.  The following expression
+            only works for WORDSIZE that is a power of 2, by masking last bits of
+            incremented size to zero.
+         */
+        size = (size + (WORDSIZE - 1)) & ~(WORDSIZE - 1);
+
+        /* Check whether a new block must be allocated.  Note that the first word
+            of a block is reserved for a pointer to the previous block.
+         */
+        if (size > remaining) {
+
+            /* Honour the block size chosen at construction; grow it for oversized requests. */
+            const int needed = size + (int)sizeof(void*) + (int)(WORDSIZE-1);
+            blockSize = (needed > blocksize) ? needed : blocksize;
+
+            // use the standard C malloc to allocate memory
+            void* m = ::malloc(blockSize);
+            if (!m) {
+                fprintf(stderr,"Failed to allocate memory.\n");
+                return NULL;
+            }
+
+            /* The tail of the old block is abandoned only once the new block exists. */
+            wastedMemory += remaining;
+            /* Fill first word of new block with pointer to previous block. */
+            ((void**) m)[0] = base;
+            base = m;
+
+            int shift = 0;
+            //int shift = (WORDSIZE - ( (((size_t)m) + sizeof(void*)) & (WORDSIZE-1))) & (WORDSIZE-1);
+
+            remaining = blockSize - sizeof(void*) - shift;
+            loc = ((char*)m + sizeof(void*) + shift);
+        }
+        void* rloc = loc;
+        loc = (char*)loc + size;
+        remaining -= size;
+
+        usedMemory += size;
+
+        return rloc;
+    }
+
+    /**
+     * Allocates (using this pool) a generic type T.
+     *
+     * Params:
+     *     count = number of instances to allocate.
+     * Returns: pointer (of type T*) to memory buffer
+     */
+    template <typename T>
+    T* allocate(size_t count = 1)
+    {
+        T* mem = (T*) this->allocateMemory((int)(sizeof(T)*count));
+        return mem;
+    }
+
+private:
+    PooledAllocator(const PooledAllocator &); // copy disabled
+    PooledAllocator& operator=(const PooledAllocator &); // assign disabled
+};
+
+}
+
+//! @endcond
+
+#endif //OPENCV_FLANN_ALLOCATOR_H_
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/any.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/any.h
new file mode 100644
index 000000000000..2228bd1cfcbf
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/any.h
@@ -0,0 +1,355 @@
+#ifndef OPENCV_FLANN_ANY_H_
+#define OPENCV_FLANN_ANY_H_
+/*
+ * (C) Copyright Christopher Diggins 2005-2011
+ * (C) Copyright Pablo Aguilar 2005
+ * (C) Copyright Kevlin Henney 2001
+ *
+ * Distributed under the Boost Software License, Version 1.0. (See
+ * accompanying file LICENSE_1_0.txt or copy at
+ * http://www.boost.org/LICENSE_1_0.txt
+ *
+ * Adapted for FLANN by Marius Muja
+ */
+
+//! @cond IGNORED
+
+#include "defines.h"
+#include <stdexcept>
+#include <ostream>
+#include <typeinfo>
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/utility.hpp"
+
+namespace cvflann
+{
+
+namespace anyimpl
+{
+
+struct bad_any_cast : public std::exception
+{
+    bad_any_cast() = default; // what() then returns the default text message_ is initialised with
+    // Builds a message that names the source and destination types of the failed cast.
+    bad_any_cast(const char* src, const char* dst)
+        : message_(cv::format("cvflann::bad_any_cast(from %s to %s)", src, dst)) {}
+
+    // Returns the stored message; the pointer refers to message_ and so lives no longer than this object.
+    const char* what() const noexcept override
+    {
+        return message_.c_str();
+    }
+
+private:
+    std::string message_{"cvflann::bad_any_cast"};
+};
+
+#ifndef CV_THROW_IF_TYPE_MISMATCH /* may be pre-defined to override; throws bad_any_cast when the two std::type_info objects differ */
+#define CV_THROW_IF_TYPE_MISMATCH(src_type_info, dst_type_info) \
+    do { if ((src_type_info) != (dst_type_info)) /* do/while(0): expands to one statement, safe before an else */ \
+        throw cvflann::anyimpl::bad_any_cast((src_type_info).name(), \
+                                             (dst_type_info).name()); } while (0)
+#endif
+
+// Tag type with no members; SinglePolicy<empty_any> is the policy an `any`
+// is given while it holds no value.
+struct empty_any {};
+
+// Streaming the tag writes a fixed placeholder and hands the stream back.
+inline std::ostream& operator <<(std::ostream& out, const empty_any&)
+{
+    return out << "[empty_any]";
+}
+
+struct base_any_policy // type-erased operations on the void* slot owned by an `any`
+{
+    virtual void static_delete(void** x) = 0;                        // release whatever the slot *x holds
+    virtual void copy_from_value(void const* src, void** dest) = 0;  // store a copy of the value at src into the slot
+    virtual void clone(void* const* src, void** dest) = 0;           // copy the value of one slot into another slot
+    virtual void move(void* const* src, void** dest) = 0;            // overwrite dest's value from src; not called by `any` in this header
+    virtual void* get_value(void** src) = 0;                         // address of the stored value
+    virtual const void* get_value(void* const * src) = 0;            // const overload of the above
+    virtual ::size_t get_size() = 0;                                 // size of the stored type
+    virtual const std::type_info& type() = 0;                        // type_info of the stored type
+    virtual void print(std::ostream& out, void* const* src) = 0;     // stream the stored value to out
+    virtual ~base_any_policy() {}
+};
+
+template<typename T>
+struct typed_base_any_policy : base_any_policy // supplies the two members that depend only on T
+{
+    virtual ::size_t get_size() CV_OVERRIDE { return sizeof(T); }
+    virtual const std::type_info& type() CV_OVERRIDE { return typeid(T); }
+    // The storage-related members stay pure virtual; small_any_policy and big_any_policy implement them.
+};
+
+template<typename T>
+struct small_any_policy CV_FINAL : typed_base_any_policy<T> // keeps the T value inside the void* slot itself
+{
+    virtual void static_delete(void**) CV_OVERRIDE { } // intentionally empty: this policy never allocates
+    virtual void copy_from_value(void const* src, void** dest) CV_OVERRIDE
+    {
+        new (dest) T(* reinterpret_cast<T const*>(src)); // placement-new into the slot; NOTE(review): assumes sizeof(T) <= sizeof(void*) - confirm
+    }
+    virtual void clone(void* const* src, void** dest) CV_OVERRIDE { *dest = *src; } // copies the slot contents as-is
+    virtual void move(void* const* src, void** dest) CV_OVERRIDE { *dest = *src; }  // same operation as clone for in-slot values
+    virtual void* get_value(void** src) CV_OVERRIDE { return reinterpret_cast<void*>(src); } // the value lives at the slot's own address
+    virtual const void* get_value(void* const * src) CV_OVERRIDE { return reinterpret_cast<const void*>(src); }
+    virtual void print(std::ostream& out, void* const* src) CV_OVERRIDE { out << *reinterpret_cast<T const*>(src); }
+};
+
+template<typename T>
+struct big_any_policy CV_FINAL : typed_base_any_policy<T> // the slot holds a T* to a heap-allocated copy of the value
+{
+    virtual void static_delete(void** x) CV_OVERRIDE
+    {
+        if (* x) delete (* reinterpret_cast<T**>(x));
+        *x = NULL; // leave the slot empty so that a second call is harmless
+    }
+    virtual void copy_from_value(void const* src, void** dest) CV_OVERRIDE
+    {
+        *dest = new T(*reinterpret_cast<T const*>(src)); // src points directly at a T
+    }
+    virtual void clone(void* const* src, void** dest) CV_OVERRIDE
+    {
+        *dest = new T(**reinterpret_cast<T* const*>(src)); // src is another slot, i.e. a pointer to a T*
+    }
+    virtual void move(void* const* src, void** dest) CV_OVERRIDE
+    {
+        // Plain copy-assignment into the existing object. It must not be destroyed first: assigning to a destroyed T is undefined.
+        **reinterpret_cast<T**>(dest) = **reinterpret_cast<T* const*>(src);
+    }
+    virtual void* get_value(void** src) CV_OVERRIDE { return *src; }
+    virtual const void* get_value(void* const * src) CV_OVERRIDE { return *src; }
+    virtual void print(std::ostream& out, void* const* src) CV_OVERRIDE { out << *reinterpret_cast<T const*>(*src); }
+};
+
+template<> inline void big_any_policy<flann_centers_init_t>::print(std::ostream& out, void* const* src)
+{
+    out << static_cast<int>(*static_cast<flann_centers_init_t const*>(*src)); // enum is streamed as its integer value
+}
+
+template<> inline void big_any_policy<flann_algorithm_t>::print(std::ostream& out, void* const* src)
+{
+    out << static_cast<int>(*static_cast<flann_algorithm_t const*>(*src)); // enum is streamed as its integer value
+}
+
+template<> inline void big_any_policy<cv::String>::print(std::ostream& out, void* const* src)
+{
+    out << static_cast<cv::String const*>(*src)->c_str(); // streamed through its C string
+}
+
+template<typename T>
+struct choose_policy // default: any type without a specialization uses big_any_policy
+{
+    typedef big_any_policy<T> type;
+};
+
+template<typename T>
+struct choose_policy<T*> // pointer types use small_any_policy
+{
+    typedef small_any_policy<T*> type;
+};
+
+struct any; // forward declaration, needed only to name `any` in the specialization below
+
+/// Choosing the policy for an any type is illegal, but should never happen.
+/// This is designed to throw a compiler error.
+template<>
+struct choose_policy<any>
+{
+    typedef void type;
+};
+
+/// Specializations for small types: each TYPE listed below is mapped to small_any_policy; unlisted types keep the primary template.
+#define SMALL_POLICY(TYPE) \
+    template<> \
+    struct choose_policy<TYPE> { typedef small_any_policy<TYPE> type; \
+    }
+
+SMALL_POLICY(signed char);
+SMALL_POLICY(unsigned char);
+SMALL_POLICY(signed short);
+SMALL_POLICY(unsigned short);
+SMALL_POLICY(signed int);
+SMALL_POLICY(unsigned int);
+SMALL_POLICY(signed long);
+SMALL_POLICY(unsigned long);
+SMALL_POLICY(float);
+SMALL_POLICY(bool);
+
+#undef SMALL_POLICY // helper macro is local to this header
+
+template <typename T>
+class SinglePolicy // static-only accessor for the policy object of type T
+{
+    SinglePolicy();                                     // declared private and never defined in this header
+    SinglePolicy(const SinglePolicy& other);            // likewise
+    SinglePolicy& operator=(const SinglePolicy& other); // likewise
+
+public:
+    static base_any_policy* get_policy();
+};
+
+/// This function will return a different policy for each type.
+template <typename T>
+inline base_any_policy* SinglePolicy<T>::get_policy()
+{
+    static typename choose_policy<T>::type policy; // function-local static: one policy object per T
+    return &policy;
+}
+
+} // namespace anyimpl
+
+struct any
+{
+private:
+    anyimpl::base_any_policy* policy; // operations for the currently stored type; set by every constructor, assign() and reset()
+    void* object;                     // slot handed to the policy: a pointer to the value, or the value itself
+
+public:
+    /// Initializing constructor.
+    template <typename T>
+    any(const T& x)
+        : policy(anyimpl::SinglePolicy<anyimpl::empty_any>::get_policy()), object(NULL)
+    {
+        assign(x);
+    }
+
+    /// Empty constructor.
+    any()
+        : policy(anyimpl::SinglePolicy<anyimpl::empty_any>::get_policy()), object(NULL)
+    { }
+
+    /// Special initializing constructor for string literals.
+    any(const char* x)
+        : policy(anyimpl::SinglePolicy<anyimpl::empty_any>::get_policy()), object(NULL)
+    {
+        assign(x);
+    }
+
+    /// Copy constructor.
+    any(const any& x)
+        : policy(anyimpl::SinglePolicy<anyimpl::empty_any>::get_policy()), object(NULL)
+    {
+        assign(x);
+    }
+
+    /// Destructor.
+    ~any()
+    {
+        policy->static_delete(&object);
+    }
+
+    /// Assignment function from another any.
+    any& assign(const any& x)
+    {
+        if (this == &x) return *this; // self-assignment: reset() below would destroy the value being copied
+        reset();
+        policy = x.policy;
+        policy->clone(&x.object, &object);
+        return *this;
+    }
+
+    /// Assignment function.
+    template <typename T>
+    any& assign(const T& x)
+    {
+        reset();
+        policy = anyimpl::SinglePolicy<T>::get_policy();
+        policy->copy_from_value(&x, &object);
+        return *this;
+    }
+
+    /// Assignment operator.
+    template<typename T>
+    any& operator=(const T& x)
+    {
+        return assign(x);
+    }
+
+    /// Assignment operator. Template-based version above doesn't work as expected. We need regular assignment operator here.
+    any& operator=(const any& x)
+    {
+        return assign(x);
+    }
+
+    /// Assignment operator, specialized for literal strings: their types (e.g. const char [6]) don't work with the template version.
+    any& operator=(const char* x)
+    {
+        return assign(x);
+    }
+
+    /// Utility functions
+    any& swap(any& x)
+    {
+        std::swap(policy, x.policy);
+        std::swap(object, x.object);
+        return *this;
+    }
+
+    /// Cast operator. You can only cast to the original type.
+    template<typename T>
+    T& cast()
+    {
+        CV_THROW_IF_TYPE_MISMATCH(policy->type(), typeid(T));
+        T* r = reinterpret_cast<T*>(policy->get_value(&object));
+        return *r;
+    }
+
+    /// Cast operator. You can only cast to the original type.
+    template<typename T>
+    const T& cast() const
+    {
+        CV_THROW_IF_TYPE_MISMATCH(policy->type(), typeid(T));
+        const T* r = reinterpret_cast<const T*>(policy->get_value(&object));
+        return *r;
+    }
+
+    /// Returns true if the any contains no value.
+    bool empty() const
+    {
+        return policy->type() == typeid(anyimpl::empty_any);
+    }
+
+    /// Frees any allocated memory, and sets the value to NULL.
+    void reset()
+    {
+        policy->static_delete(&object);
+        object = NULL; // small-policy values leave their bits in the slot; the empty policy must never see them as a pointer
+        policy = anyimpl::SinglePolicy<anyimpl::empty_any>::get_policy();
+    }
+
+    /// Returns true if the two types are the same.
+    bool compatible(const any& x) const
+    {
+        return policy->type() == x.policy->type();
+    }
+
+    /// Returns true if the stored value has exactly type T.
+    template<typename T>
+    bool has_type() const
+    {
+        return policy->type() == typeid(T);
+    }
+
+    const std::type_info& type() const // type_info of the stored value (that of anyimpl::empty_any when empty)
+    {
+        return policy->type();
+    }
+
+    friend std::ostream& operator <<(std::ostream& out, const any& any_val);
+};
+
+inline std::ostream& operator <<(std::ostream& out, const any& any_val) // declared a friend of `any` to reach its private members
+{
+    any_val.policy->print(out,&any_val.object); // formatting is delegated to the policy of the stored type
+    return out;
+}
+
+}
+
+//! @endcond
+
+#endif // OPENCV_FLANN_ANY_H_
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/autotuned_index.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/autotuned_index.h
new file mode 100644
index 000000000000..d90f739aff24
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/autotuned_index.h
@@ -0,0 +1,594 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+#ifndef OPENCV_FLANN_AUTOTUNED_INDEX_H_
+#define OPENCV_FLANN_AUTOTUNED_INDEX_H_
+
+//! @cond IGNORED
+
+#include <sstream>
+
+#include "nn_index.h"
+#include "ground_truth.h"
+#include "index_testing.h"
+#include "sampling.h"
+#include "kdtree_index.h"
+#include "kdtree_single_index.h"
+#include "kmeans_index.h"
+#include "composite_index.h"
+#include "linear_index.h"
+#include "logger.h"
+
+namespace cvflann
+{
+
+// Forward declaration only; no definition is visible in this header.
+// AutotunedIndex calls it to obtain the concrete index object for a set of
+// parameters (presumably selected by the "algorithm" entry -- confirm against
+// the definition).
+template<typename Distance>
+NNIndex<Distance>* create_index_by_type(const Matrix<typename Distance::ElementType>& dataset, const IndexParams& params, const Distance& distance);
+
+
+/**
+ * Index parameters for the AutotunedIndex. Each constructor argument is stored
+ * in the parameter dictionary under the key that carries the argument's name.
+ */
+struct AutotunedIndexParams : public IndexParams
+{
+    AutotunedIndexParams(float target_precision = 0.8, float build_weight = 0.01, float memory_weight = 0, float sample_fraction = 0.1)
+    {
+        IndexParams& self = *this;
+
+        self["algorithm"] = FLANN_INDEX_AUTOTUNED;
+        self["target_precision"] = target_precision;   // precision desired (used for autotuning, -1 otherwise)
+        self["build_weight"] = build_weight;           // build tree time weighting factor
+        self["memory_weight"] = memory_weight;         // index memory weighting factor
+        self["sample_fraction"] = sample_fraction;     // what fraction of the dataset to use for autotuning
+    }
+};
+
+
+template <typename Distance>
+class AutotunedIndex : public NNIndex<Distance>
+{
+public:
+    typedef typename Distance::ElementType ElementType;
+    typedef typename Distance::ResultType DistanceType;
+
+    /**
+     * Stores the dataset and distance functor and reads the four tuning
+     * settings from params, falling back to 0.8 / 0.01 / 0 / 0.1 when a key
+     * is absent. No index is created here; bestIndex_ stays NULL until
+     * buildIndex() or loadIndex() is called.
+     */
+    AutotunedIndex(const Matrix<ElementType>& inputData, const IndexParams& params = AutotunedIndexParams(), Distance d = Distance()) :
+        bestIndex_(NULL),
+        speedup_(0),
+        dataset_(inputData),
+        target_precision_(get_param(params, "target_precision",0.8f)),
+        build_weight_(get_param(params,"build_weight", 0.01f)),
+        memory_weight_(get_param(params, "memory_weight", 0.0f)),
+        sample_fraction_(get_param(params,"sample_fraction", 0.1f)),
+        distance_(d)
+    {
+    }
+
+    // Copy constructor and copy assignment are declared but no definition
+    // appears in this header.
+    // NOTE(review): looks like the pre-C++11 "declare, don't define" idiom to
+    // keep an object that owns bestIndex_ from being copied -- confirm.
+    AutotunedIndex(const AutotunedIndex&);
+    AutotunedIndex& operator=(const AutotunedIndex&);
+
+    /// Releases the underlying index, if one was created.
+    virtual ~AutotunedIndex()
+    {
+        // delete on a null pointer does nothing, so no explicit check is needed
+        delete bestIndex_;
+        bestIndex_ = NULL;
+    }
+
+    /**
+     *  Builds the index: estimates the best build parameters, creates and
+     *  builds the corresponding index, then estimates the search parameters
+     *  and the speedup over linear search. Both parameter sets are reported
+     *  through Logger::info.
+     *
+     *  An index left over from an earlier buildIndex()/loadIndex() call is
+     *  released before the new one is created, so rebuilding does not leak.
+     */
+    virtual void buildIndex() CV_OVERRIDE
+    {
+        std::ostringstream stream;
+        bestParams_ = estimateBuildParams();
+        print_params(bestParams_, stream);
+        Logger::info("----------------------------------------------------\n");
+        Logger::info("Autotuned parameters:\n");
+        Logger::info("%s", stream.str().c_str());
+        Logger::info("----------------------------------------------------\n");
+
+        // Free the previous index before the pointer is overwritten. It is
+        // reset to NULL in between so that a throwing factory cannot leave
+        // the destructor with a dangling pointer.
+        delete bestIndex_;
+        bestIndex_ = NULL;
+        bestIndex_ = create_index_by_type(dataset_, bestParams_, distance_);
+        bestIndex_->buildIndex();
+        speedup_ = estimateSearchParams(bestSearchParams_);
+        // reuse the same string stream for the second report
+        stream.str(std::string());
+        print_params(bestSearchParams_, stream);
+        Logger::info("----------------------------------------------------\n");
+        Logger::info("Search parameters:\n");
+        Logger::info("%s", stream.str().c_str());
+        Logger::info("----------------------------------------------------\n");
+    }
+
+    /**
+     *  Writes the index to a stream: first the type of the underlying index,
+     *  then that index's own data, and finally the tuned "checks" value.
+     */
+    virtual void saveIndex(FILE* stream) CV_OVERRIDE
+    {
+        const int underlyingType = (int)bestIndex_->getType();
+        save_value(stream, underlyingType);
+
+        bestIndex_->saveIndex(stream);
+
+        const int tunedChecks = get_param<int>(bestSearchParams_, "checks");
+        save_value(stream, tunedChecks);
+    }
+
+    /**
+     *  Loads the index from a stream, reading back what saveIndex() wrote:
+     *  the underlying index type, that index's data and the "checks" value.
+     *
+     *  An index already held by this object is released first, so loading
+     *  into a built (or previously loaded) index does not leak it.
+     */
+    virtual void loadIndex(FILE* stream) CV_OVERRIDE
+    {
+        int index_type;
+
+        load_value(stream, index_type);
+        IndexParams params;
+        params["algorithm"] = (flann_algorithm_t)index_type;
+
+        // Free the previous index before the pointer is overwritten. It is
+        // reset to NULL in between so that a throwing factory cannot leave
+        // the destructor with a dangling pointer.
+        delete bestIndex_;
+        bestIndex_ = NULL;
+        bestIndex_ = create_index_by_type<Distance>(dataset_, params, distance_);
+        bestIndex_->loadIndex(stream);
+        int checks;
+        load_value(stream, checks);
+        bestSearchParams_["checks"] = checks;
+    }
+
+    /**
+     *      Searches for nearest neighbors through the underlying index.
+     *      When searchParams carries no "checks" entry, or carries
+     *      FLANN_CHECKS_AUTOTUNED, the search parameters estimated during
+     *      autotuning are used instead of the caller's.
+     */
+    virtual void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
+    {
+        const int requestedChecks = get_param<int>(searchParams,"checks",FLANN_CHECKS_AUTOTUNED);
+        const bool useTuned = (requestedChecks == FLANN_CHECKS_AUTOTUNED);
+        const SearchParams& effectiveParams = useTuned ? bestSearchParams_ : searchParams;
+
+        bestIndex_->findNeighbors(result, vec, effectiveParams);
+    }
+
+
+    /// Returns the parameters reported by the underlying index (not the
+    /// autotuning settings). Dereferences bestIndex_, which is NULL until
+    /// buildIndex() or loadIndex() has created an index.
+    IndexParams getParameters() const CV_OVERRIDE
+    {
+        return bestIndex_->getParameters();
+    }
+
+    /// Returns a copy of the search parameters filled in by buildIndex()
+    /// (via estimateSearchParams) or by loadIndex().
+    SearchParams getSearchParameters() const
+    {
+        return bestSearchParams_;
+    }
+
+    /// Returns the speedup stored by buildIndex(); it is 0 until then
+    /// (the constructor initialises it to 0 and loadIndex() does not set it).
+    float getSpeedup() const
+    {
+        return speedup_;
+    }
+
+
+    /**
+     *      Number of features in this index.
+     *      Forwarded from the underlying index, so bestIndex_ must already
+     *      have been created by buildIndex() or loadIndex().
+     */
+    virtual size_t size() const CV_OVERRIDE
+    {
+        return bestIndex_->size();
+    }
+
+    /**
+     *  The length of each vector in this index.
+     *  Forwarded from the underlying index, so bestIndex_ must already
+     *  have been created by buildIndex() or loadIndex().
+     */
+    virtual size_t veclen() const CV_OVERRIDE
+    {
+        return bestIndex_->veclen();
+    }
+
+    /**
+     * The amount of memory (in bytes) this index uses.
+     * This is the figure reported by the underlying index; bestIndex_ must
+     * already have been created by buildIndex() or loadIndex().
+     */
+    virtual int usedMemory() const CV_OVERRIDE
+    {
+        return bestIndex_->usedMemory();
+    }
+
+    /**
+     * Algorithm identifier. Always FLANN_INDEX_AUTOTUNED, regardless of which
+     * concrete index type the autotuning selected for bestIndex_.
+     */
+    virtual flann_algorithm_t getType() const CV_OVERRIDE
+    {
+        return FLANN_INDEX_AUTOTUNED;
+    }
+
+private:
+
+    /**
+     * Cost figures measured for one candidate index configuration.
+     *
+     * All numeric fields start at zero so that copying a CostData (for
+     * example into the std::vector of candidates) never reads an
+     * uninitialised float; totalCost in particular is not assigned by any of
+     * the evaluate_* / optimize* / estimate* methods of this class.
+     */
+    struct CostData
+    {
+        CostData() : searchTimeCost(0), buildTimeCost(0), memoryCost(0), totalCost(0) {}
+
+        float searchTimeCost;   // measured search time
+        float buildTimeCost;    // measured index build time
+        float memoryCost;       // (index memory + dataset memory) / dataset memory
+        float totalCost;        // not written by the code in this class
+        IndexParams params;     // configuration the figures belong to
+    };
+
+    /**
+     * Measures a k-means tree built with cost.params on the sampled dataset.
+     * Fills cost.buildTimeCost, cost.searchTimeCost and cost.memoryCost;
+     * cost.totalCost and cost.params are left untouched.
+     */
+    void evaluate_kmeans(CostData& cost)
+    {
+        StartStopTimer t;
+        int checks;     // output of test_index_precision; not used afterwards
+        const int nn = 1;
+
+        Logger::info("KMeansTree using params: max_iterations=%d, branching=%d\n",
+                     get_param<int>(cost.params,"iterations"),
+                     get_param<int>(cost.params,"branching"));
+        KMeansIndex<Distance> kmeans(sampledDataset_, cost.params, distance_);
+        // measure index build time
+        t.start();
+        kmeans.buildIndex();
+        t.stop();
+        float buildTime = (float)t.value;
+
+        // measure search time
+        float searchTime = test_index_precision(kmeans, sampledDataset_, testDataset_, gt_matches_, target_precision_, checks, distance_, nn);
+
+        // Memory cost is expressed relative to the size of the sampled data.
+        // NOTE(review): the element size is taken as sizeof(float) even though
+        // the dataset holds ElementType values -- confirm this is intended.
+        float datasetMemory = float(sampledDataset_.rows * sampledDataset_.cols * sizeof(float));
+        cost.memoryCost = (kmeans.usedMemory() + datasetMemory) / datasetMemory;
+        cost.searchTimeCost = searchTime;
+        cost.buildTimeCost = buildTime;
+        Logger::info("KMeansTree buildTime=%g, searchTime=%g, build_weight=%g\n", buildTime, searchTime, build_weight_);
+    }
+
+
+    /**
+     * Measures a kd-tree index built with cost.params on the sampled dataset.
+     * Fills cost.buildTimeCost, cost.searchTimeCost and cost.memoryCost;
+     * cost.totalCost and cost.params are left untouched.
+     */
+    void evaluate_kdtree(CostData& cost)
+    {
+        StartStopTimer t;
+        int checks;     // output of test_index_precision; not used afterwards
+        const int nn = 1;
+
+        Logger::info("KDTree using params: trees=%d\n", get_param<int>(cost.params,"trees"));
+        KDTreeIndex<Distance> kdtree(sampledDataset_, cost.params, distance_);
+
+        // measure index build time
+        t.start();
+        kdtree.buildIndex();
+        t.stop();
+        float buildTime = (float)t.value;
+
+        //measure search time
+        float searchTime = test_index_precision(kdtree, sampledDataset_, testDataset_, gt_matches_, target_precision_, checks, distance_, nn);
+
+        // Memory cost is expressed relative to the size of the sampled data.
+        // NOTE(review): the element size is taken as sizeof(float) even though
+        // the dataset holds ElementType values -- confirm this is intended.
+        float datasetMemory = float(sampledDataset_.rows * sampledDataset_.cols * sizeof(float));
+        cost.memoryCost = (kdtree.usedMemory() + datasetMemory) / datasetMemory;
+        cost.searchTimeCost = searchTime;
+        cost.buildTimeCost = buildTime;
+        Logger::info("KDTree buildTime=%g, searchTime=%g\n", buildTime, searchTime);
+    }
+
+
+    //    struct KMeansSimpleDownhillFunctor {
+    //
+    //        Autotune& autotuner;
+    //        KMeansSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {}
+    //
+    //        float operator()(int* params) {
+    //
+    //            float maxFloat = numeric_limits<float>::max();
+    //
+    //            if (params[0]<2) return maxFloat;
+    //            if (params[1]<0) return maxFloat;
+    //
+    //            CostData c;
+    //            c.params["algorithm"] = KMEANS;
+    //            c.params["centers-init"] = CENTERS_RANDOM;
+    //            c.params["branching"] = params[0];
+    //            c.params["max-iterations"] = params[1];
+    //
+    //            autotuner.evaluate_kmeans(c);
+    //
+    //            return c.timeCost;
+    //
+    //        }
+    //    };
+    //
+    //    struct KDTreeSimpleDownhillFunctor {
+    //
+    //        Autotune& autotuner;
+    //        KDTreeSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {}
+    //
+    //        float operator()(int* params) {
+    //            float maxFloat = numeric_limits<float>::max();
+    //
+    //            if (params[0]<1) return maxFloat;
+    //
+    //            CostData c;
+    //            c.params["algorithm"] = KDTREE;
+    //            c.params["trees"] = params[0];
+    //
+    //            autotuner.evaluate_kdtree(c);
+    //
+    //            return c.timeCost;
+    //
+    //        }
+    //    };
+
+
+
+    /**
+     * Evaluates a k-means tree for every (iterations, branching) pair taken
+     * from the two tables below and appends one CostData entry per pair to
+     * costs, in iterations-major order.
+     */
+    void optimizeKMeans(std::vector<CostData>& costs)
+    {
+        Logger::info("KMEANS, Step 1: Exploring parameter space\n");
+
+        // candidate values; the explored grid is their cartesian product
+        int maxIterations[] = { 1, 5, 10, 15 };
+        int branchingFactors[] = { 16, 32, 64, 128, 256 };
+        const size_t iterationChoices = FLANN_ARRAY_LEN(maxIterations);
+        const size_t branchingChoices = FLANN_ARRAY_LEN(branchingFactors);
+
+        // make room for the whole grid up front
+        costs.reserve(costs.size() + iterationChoices * branchingChoices);
+
+        for (size_t it = 0; it < iterationChoices; ++it) {
+            for (size_t br = 0; br < branchingChoices; ++br) {
+                CostData candidate;
+                candidate.params["algorithm"] = FLANN_INDEX_KMEANS;
+                candidate.params["centers_init"] = FLANN_CENTERS_RANDOM;
+                candidate.params["iterations"] = maxIterations[it];
+                candidate.params["branching"] = branchingFactors[br];
+
+                evaluate_kmeans(candidate);
+                costs.push_back(candidate);
+            }
+        }
+
+        //         Logger::info("KMEANS, Step 2: simplex-downhill optimization\n");
+        //
+        //         const int n = 2;
+        //         // choose initial simplex points as the best parameters so far
+        //         int kmeansNMPoints[n*(n+1)];
+        //         float kmeansVals[n+1];
+        //         for (int i=0;i<n+1;++i) {
+        //             kmeansNMPoints[i*n] = (int)kmeansCosts[i].params["branching"];
+        //             kmeansNMPoints[i*n+1] = (int)kmeansCosts[i].params["max-iterations"];
+        //             kmeansVals[i] = kmeansCosts[i].timeCost;
+        //         }
+        //         KMeansSimpleDownhillFunctor kmeans_cost_func(*this);
+        //         // run optimization
+        //         optimizeSimplexDownhill(kmeansNMPoints,n,kmeans_cost_func,kmeansVals);
+        //         // store results
+        //         for (int i=0;i<n+1;++i) {
+        //             kmeansCosts[i].params["branching"] = kmeansNMPoints[i*2];
+        //             kmeansCosts[i].params["max-iterations"] = kmeansNMPoints[i*2+1];
+        //             kmeansCosts[i].timeCost = kmeansVals[i];
+        //         }
+    }
+
+
+    /**
+     * Evaluates a kd-tree index for every tree count in the table below and
+     * appends one CostData entry per count to costs.
+     */
+    void optimizeKDTree(std::vector<CostData>& costs)
+    {
+        Logger::info("KD-TREE, Step 1: Exploring parameter space\n");
+
+        // numbers of randomized trees to try
+        int testTrees[] = { 1, 4, 8, 16, 32 };
+        const size_t treeChoices = FLANN_ARRAY_LEN(testTrees);
+
+        for (size_t t = 0; t < treeChoices; ++t) {
+            CostData candidate;
+            candidate.params["algorithm"] = FLANN_INDEX_KDTREE;
+            candidate.params["trees"] = testTrees[t];
+
+            evaluate_kdtree(candidate);
+            costs.push_back(candidate);
+        }
+
+        //         Logger::info("KD-TREE, Step 2: simplex-downhill optimization\n");
+        //
+        //         const int n = 1;
+        //         // choose initial simplex points as the best parameters so far
+        //         int kdtreeNMPoints[n*(n+1)];
+        //         float kdtreeVals[n+1];
+        //         for (int i=0;i<n+1;++i) {
+        //             kdtreeNMPoints[i] = (int)kdtreeCosts[i].params["trees"];
+        //             kdtreeVals[i] = kdtreeCosts[i].timeCost;
+        //         }
+        //         KDTreeSimpleDownhillFunctor kdtree_cost_func(*this);
+        //         // run optimization
+        //         optimizeSimplexDownhill(kdtreeNMPoints,n,kdtree_cost_func,kdtreeVals);
+        //         // store results
+        //         for (int i=0;i<n+1;++i) {
+        //             kdtreeCosts[i].params["trees"] = kdtreeNMPoints[i];
+        //             kdtreeCosts[i].timeCost = kdtreeVals[i];
+        //         }
+    }
+
+    /**
+     *  Chooses the best nearest-neighbor algorithm and estimates the optimal
+     *  parameters to use when building the index (for a given precision).
+     *  Returns a dictionary with the optimal parameters.
+     *
+     *  The candidates are linear search plus every configuration tried by
+     *  optimizeKMeans() and optimizeKDTree(), all measured on a random sample
+     *  of the dataset. If the test sample would hold fewer than 10 points,
+     *  LinearIndexParams() is returned without measuring anything.
+     */
+    IndexParams estimateBuildParams()
+    {
+        std::vector<CostData> costs;
+
+        int sampleSize = int(sample_fraction_ * dataset_.rows);
+        int testSampleSize = std::min(sampleSize / 10, 1000);
+
+        // dataset_.rows is a size_t; convert it explicitly so the argument
+        // matches the %d conversion of the printf-style format string.
+        Logger::info("Entering autotuning, dataset size: %d, sampleSize: %d, testSampleSize: %d, target precision: %g\n", (int)dataset_.rows, sampleSize, testSampleSize, target_precision_);
+
+        // For a very small dataset, it makes no sense to build any fancy index, just
+        // use linear search
+        if (testSampleSize < 10) {
+            Logger::info("Choosing linear, dataset too small\n");
+            return LinearIndexParams();
+        }
+
+        // We use a fraction of the original dataset to speedup the autotune algorithm
+        sampledDataset_ = random_sample(dataset_, sampleSize);
+        // We use a cross-validation approach, first we sample a testset from the dataset
+        testDataset_ = random_sample(sampledDataset_, testSampleSize, true);
+
+        // We compute the ground truth using linear search
+        Logger::info("Computing ground truth... \n");
+        gt_matches_ = Matrix<int>(new int[testDataset_.rows], testDataset_.rows, 1);
+        StartStopTimer t;
+        t.start();
+        compute_ground_truth<Distance>(sampledDataset_, testDataset_, gt_matches_, 0, distance_);
+        t.stop();
+
+        // The timed ground-truth pass doubles as the cost of linear search.
+        CostData linear_cost;
+        linear_cost.searchTimeCost = (float)t.value;
+        linear_cost.buildTimeCost = 0;
+        linear_cost.memoryCost = 0;
+        linear_cost.params["algorithm"] = FLANN_INDEX_LINEAR;
+
+        costs.push_back(linear_cost);
+
+        // Start parameter autotune process
+        Logger::info("Autotuning parameters...\n");
+
+        optimizeKMeans(costs);
+        optimizeKDTree(costs);
+
+        // Smallest weighted time over all candidates; used as the normaliser below.
+        float bestTimeCost = costs[0].searchTimeCost;
+        for (size_t i = 0; i < costs.size(); ++i) {
+            float timeCost = costs[i].buildTimeCost * build_weight_ + costs[i].searchTimeCost;
+            if (timeCost < bestTimeCost) {
+                bestTimeCost = timeCost;
+            }
+        }
+
+        IndexParams bestParams = costs[0].params;
+        // Costs are normalised by bestTimeCost, so the comparison (and the
+        // division it needs) is only done when that divisor is positive;
+        // otherwise the linear search parameters in costs[0] are kept.
+        if (bestTimeCost > 0) {
+            float bestCost = costs[0].searchTimeCost / bestTimeCost;
+            for (size_t i = 0; i < costs.size(); ++i) {
+                float crtCost = (costs[i].buildTimeCost * build_weight_ + costs[i].searchTimeCost) / bestTimeCost +
+                                memory_weight_ * costs[i].memoryCost;
+                if (crtCost < bestCost) {
+                    bestCost = crtCost;
+                    bestParams = costs[i].params;
+                }
+            }
+        }
+
+        // The sample buffers are only needed while tuning.
+        delete[] gt_matches_.data;
+        delete[] testDataset_.data;
+        delete[] sampledDataset_.data;
+
+        return bestParams;
+    }
+
+
+
+    /**
+     *  Estimates the search time parameters needed to get the desired precision.
+     *  Precondition: the index is built
+     *  Postcondition: the searchParams will have the optimum params set, also the speedup obtained over linear search.
+     *
+     *  Returns the speedup (linear search time divided by index search time).
+     *  When the dataset has fewer than 10 rows no sample is drawn: 0 is
+     *  returned and searchParams is left unchanged.
+     */
+    float estimateSearchParams(SearchParams& searchParams)
+    {
+        const int nn = 1;
+        const size_t SAMPLE_COUNT = 1000;
+
+        CV_Assert(bestIndex_ != NULL && "Requires a valid index"); // must have a valid index
+
+        float speedup = 0;
+
+        // test with a tenth of the dataset, capped at SAMPLE_COUNT points
+        int samples = (int)std::min(dataset_.rows / 10, SAMPLE_COUNT);
+        if (samples > 0) {
+            Matrix<ElementType> testDataset = random_sample(dataset_, samples);
+
+            Logger::info("Computing ground truth\n");
+
+            // we need to compute the ground truth first
+            Matrix<int> gt_matches(new int[testDataset.rows], testDataset.rows, 1);
+            StartStopTimer t;
+            t.start();
+            compute_ground_truth<Distance>(dataset_, testDataset, gt_matches, 1, distance_);
+            t.stop();
+            // the timed ground-truth pass serves as the linear search baseline
+            float linear = (float)t.value;
+
+            int checks;
+            Logger::info("Estimating number of checks\n");
+
+            float searchTime;
+            float cb_index;
+            if (bestIndex_->getType() == FLANN_INDEX_KMEANS) {
+                Logger::info("KMeans algorithm, estimating cluster border factor\n");
+                KMeansIndex<Distance>* kmeans = (KMeansIndex<Distance>*)bestIndex_;
+                // -1 marks "nothing measured yet"
+                float bestSearchTime = -1;
+                float best_cb_index = -1;
+                int best_checks = -1;
+                // try cb_index from 0 in steps of 0.2 while it is below 1.1,
+                // keeping the setting with the lowest search time
+                for (cb_index = 0; cb_index < 1.1f; cb_index += 0.2f) {
+                    kmeans->set_cb_index(cb_index);
+                    searchTime = test_index_precision(*kmeans, dataset_, testDataset, gt_matches, target_precision_, checks, distance_, nn, 1);
+                    if ((searchTime < bestSearchTime) || (bestSearchTime == -1)) {
+                        bestSearchTime = searchTime;
+                        best_cb_index = cb_index;
+                        best_checks = checks;
+                    }
+                }
+                searchTime = bestSearchTime;
+                cb_index = best_cb_index;
+                checks = best_checks;
+
+                // leave the index configured with the winning value and
+                // record it in the build parameters as well
+                kmeans->set_cb_index(best_cb_index);
+                Logger::info("Optimum cb_index: %g\n", cb_index);
+                bestParams_["cb_index"] = cb_index;
+            }
+            else {
+                searchTime = test_index_precision(*bestIndex_, dataset_, testDataset, gt_matches, target_precision_, checks, distance_, nn, 1);
+            }
+
+            Logger::info("Required number of checks: %d \n", checks);
+            searchParams["checks"] = checks;
+
+            speedup = linear / searchTime;
+
+            // the temporary buffers are released with delete[] once measured
+            delete[] gt_matches.data;
+            delete[] testDataset.data;
+        }
+
+        return speedup;
+    }
+
+private:
+    /** Underlying index; NULL until buildIndex()/loadIndex(), deleted in the destructor */
+    NNIndex<Distance>* bestIndex_;
+
+    /** Build parameters chosen by estimateBuildParams() */
+    IndexParams bestParams_;
+    /** Search parameters filled by estimateSearchParams() or loadIndex() */
+    SearchParams bestSearchParams_;
+
+    /** Scratch data used while tuning; their buffers are freed at the end of estimateBuildParams() */
+    Matrix<ElementType> sampledDataset_;
+    Matrix<ElementType> testDataset_;
+    Matrix<int> gt_matches_;
+
+    /** Speedup over linear search, as returned by estimateSearchParams() */
+    float speedup_;
+
+    /**
+     * The dataset used by this index
+     */
+    const Matrix<ElementType> dataset_;
+
+    /**
+     * Index parameters
+     */
+    float target_precision_;
+    float build_weight_;
+    float memory_weight_;
+    float sample_fraction_;
+
+    /** Distance functor handed to every index and measurement helper */
+    Distance distance_;
+
+
+};
+}
+
+//! @endcond
+
+#endif /* OPENCV_FLANN_AUTOTUNED_INDEX_H_ */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/composite_index.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/composite_index.h
new file mode 100644
index 000000000000..37a6223f882d
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/composite_index.h
@@ -0,0 +1,196 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_COMPOSITE_INDEX_H_
+#define OPENCV_FLANN_COMPOSITE_INDEX_H_
+
+//! @cond IGNORED
+
+#include "nn_index.h"
+#include "kdtree_index.h"
+#include "kmeans_index.h"
+
+namespace cvflann
+{
+
+/**
+ * Index parameters for the CompositeIndex.
+ *
+ * The "algorithm" entry is FLANN_INDEX_COMPOSITE, the same value
+ * CompositeIndex::getType() reports, so that code selecting an index from
+ * these parameters gets the composite index rather than a plain k-means tree.
+ */
+struct CompositeIndexParams : public IndexParams
+{
+    CompositeIndexParams(int trees = 4, int branching = 32, int iterations = 11,
+                         flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2 )
+    {
+        (*this)["algorithm"] = FLANN_INDEX_COMPOSITE;
+        // number of randomized trees to use (for kdtree)
+        (*this)["trees"] = trees;
+        // branching factor
+        (*this)["branching"] = branching;
+        // max iterations to perform in one kmeans clustering (kmeans tree)
+        (*this)["iterations"] = iterations;
+        // algorithm used for picking the initial cluster centers for kmeans tree
+        (*this)["centers_init"] = centers_init;
+        // cluster boundary index. Used when searching the kmeans tree
+        (*this)["cb_index"] = cb_index;
+    }
+};
+
+
+/**
+ * This index builds a kd-tree index and a k-means index and performs nearest
+ * neighbour search on both indexes. This gives a slight boost in search performance
+ * as some of the neighbours that are missed by one index are found by the other.
+ */
+template <typename Distance>
+class CompositeIndex : public NNIndex<Distance>
+{
+public:
+    typedef typename Distance::ElementType ElementType;
+    typedef typename Distance::ResultType DistanceType;
+
+    /**
+     * Index constructor
+     * @param inputData dataset containing the points to index
+     * @param params Index parameters
+     * @param d Distance functor
+     *
+     * Both sub-indexes are created here from the same data and parameters.
+     * If creating the k-means index throws, the kd-tree index allocated just
+     * before is deleted again: the destructor does not run for an object
+     * whose constructor failed, so it would otherwise be leaked.
+     */
+    CompositeIndex(const Matrix<ElementType>& inputData, const IndexParams& params = CompositeIndexParams(),
+                   Distance d = Distance()) : index_params_(params)
+    {
+        kdtree_index_ = new KDTreeIndex<Distance>(inputData, params, d);
+        try {
+            kmeans_index_ = new KMeansIndex<Distance>(inputData, params, d);
+        }
+        catch (...) {
+            delete kdtree_index_;
+            throw;
+        }
+    }
+
+    // Copy constructor and copy assignment are declared but no definition
+    // appears in this header.
+    // NOTE(review): looks like the pre-C++11 "declare, don't define" idiom to
+    // keep an object that owns two raw index pointers from being copied -- confirm.
+    CompositeIndex(const CompositeIndex&);
+    CompositeIndex& operator=(const CompositeIndex&);
+
+    /// Deletes the two sub-indexes that the constructor allocated.
+    virtual ~CompositeIndex()
+    {
+        delete kdtree_index_;
+        delete kmeans_index_;
+    }
+
+    /**
+     * @return The index type, always FLANN_INDEX_COMPOSITE
+     */
+    flann_algorithm_t getType() const CV_OVERRIDE
+    {
+        return FLANN_INDEX_COMPOSITE;
+    }
+
+    /**
+     * @return Size of the index, as reported by the kd-tree sub-index
+     * (both sub-indexes are constructed from the same input data)
+     */
+    size_t size() const CV_OVERRIDE
+    {
+        return kdtree_index_->size();
+    }
+
+    /**
+     * \returns The dimensionality of the features in this index, as reported
+     * by the kd-tree sub-index.
+     */
+    size_t veclen() const CV_OVERRIDE
+    {
+        return kdtree_index_->veclen();
+    }
+
+    /**
+     * \returns The amount of memory (in bytes) used by the index, i.e. the
+     * sum of what the k-means and kd-tree sub-indexes report.
+     */
+    int usedMemory() const CV_OVERRIDE
+    {
+        const int kmeansBytes = kmeans_index_->usedMemory();
+        const int kdtreeBytes = kdtree_index_->usedMemory();
+        return kmeansBytes + kdtreeBytes;
+    }
+
+    /**
+     * \brief Builds the index
+     *
+     * Builds the k-means tree first and the kd-tree second, logging a
+     * message before each step.
+     */
+    void buildIndex() CV_OVERRIDE
+    {
+        Logger::info("Building kmeans tree...\n");
+        kmeans_index_->buildIndex();
+        Logger::info("Building kdtree tree...\n");
+        kdtree_index_->buildIndex();
+    }
+
+    /**
+     * \brief Saves the index to a stream
+     * \param stream The stream to save the index to
+     *
+     * The k-means index is written first, then the kd-tree index;
+     * loadIndex() reads them back in the same order.
+     */
+    void saveIndex(FILE* stream) CV_OVERRIDE
+    {
+        kmeans_index_->saveIndex(stream);
+        kdtree_index_->saveIndex(stream);
+    }
+
+    /**
+     * \brief Loads the index from a stream
+     * \param stream The stream from which the index is loaded
+     *
+     * Reads the k-means index first, then the kd-tree index, matching the
+     * order in which saveIndex() writes them.
+     */
+    void loadIndex(FILE* stream) CV_OVERRIDE
+    {
+        kmeans_index_->loadIndex(stream);
+        kdtree_index_->loadIndex(stream);
+    }
+
+    /**
+     * \returns The index parameters, i.e. a copy of the params object that
+     * was passed to the constructor
+     */
+    IndexParams getParameters() const CV_OVERRIDE
+    {
+        return index_params_;
+    }
+
+    /**
+     * \brief Method that searches for nearest-neighbours
+     *
+     * Runs the query on the k-means index and then on the kd-tree index,
+     * passing the same result set and search parameters to both.
+     */
+    void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
+    {
+        kmeans_index_->findNeighbors(result, vec, searchParams);
+        kdtree_index_->findNeighbors(result, vec, searchParams);
+    }
+
+private:
+    /** The k-means index (allocated in the constructor, deleted in the destructor) */
+    KMeansIndex<Distance>* kmeans_index_;
+
+    /** The kd-tree index (allocated in the constructor, deleted in the destructor) */
+    KDTreeIndex<Distance>* kdtree_index_;
+
+    /** The index parameters, copied from the constructor argument */
+    const IndexParams index_params_;
+};
+
+}
+
+//! @endcond
+
+#endif //OPENCV_FLANN_COMPOSITE_INDEX_H_
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/config.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/config.h
new file mode 100644
index 000000000000..c9342c00c3d7
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/config.h
@@ -0,0 +1,42 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2011  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2011  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+
+#ifndef OPENCV_FLANN_CONFIG_H_
+#define OPENCV_FLANN_CONFIG_H_
+
+//! @cond IGNORED
+
+// Drop any FLANN_VERSION_ defined earlier so the definition below always wins.
+#ifdef FLANN_VERSION_
+#undef FLANN_VERSION_
+#endif
+#define FLANN_VERSION_ "1.6.10"
+
+//! @endcond
+
+#endif /* OPENCV_FLANN_CONFIG_H_ */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/defines.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/defines.h
new file mode 100644
index 000000000000..8ab83293ffa8
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/defines.h
@@ -0,0 +1,169 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2011  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2011  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+
+#ifndef OPENCV_FLANN_DEFINES_H_
+#define OPENCV_FLANN_DEFINES_H_
+
+//! @cond IGNORED
+
+#include "config.h"
+
+#ifdef FLANN_EXPORT
+#undef FLANN_EXPORT
+#endif
+#ifdef _WIN32
+/* Win32 DLL linkage: dllexport when FLANN_EXPORTS is defined, empty when FLANN_STATIC is defined, dllimport otherwise */
+ #ifdef FLANN_EXPORTS
+  #define FLANN_EXPORT __declspec(dllexport)
+ #elif defined(FLANN_STATIC)
+  #define FLANN_EXPORT
+ #else
+  #define FLANN_EXPORT __declspec(dllimport)
+ #endif
+#else
+/* non-Windows targets need no decoration: FLANN_EXPORT expands to nothing */
+ #define FLANN_EXPORT
+#endif
+
+
+#undef FLANN_PLATFORM_32_BIT // both platform macros are cleared; exactly one is defined again below
+#undef FLANN_PLATFORM_64_BIT
+#if defined __amd64__ || defined __x86_64__ || defined _WIN64 || defined _M_X64
+#define FLANN_PLATFORM_64_BIT
+#else // NOTE(review): a target defining none of the four macros above is classified as 32-bit, whatever its real word size
+#define FLANN_PLATFORM_32_BIT
+#endif
+
+
+#undef FLANN_ARRAY_LEN // redefined unconditionally on the next line
+#define FLANN_ARRAY_LEN(a) (sizeof(a)/sizeof(a[0])) // element count of an array; NOTE(review): a pointer argument gives sizeof(pointer)/sizeof(element), not a length
+
+namespace cvflann {
+
+/* Nearest neighbour index algorithms. Each deprecated short name below equals its FLANN_INDEX_* counterpart. */
+enum flann_algorithm_t
+{
+    FLANN_INDEX_LINEAR = 0,
+    FLANN_INDEX_KDTREE = 1,
+    FLANN_INDEX_KMEANS = 2,
+    FLANN_INDEX_COMPOSITE = 3,
+    FLANN_INDEX_KDTREE_SINGLE = 4,
+    FLANN_INDEX_HIERARCHICAL = 5, // no deprecated short alias
+    FLANN_INDEX_LSH = 6, // no deprecated short alias
+    FLANN_INDEX_SAVED = 254,
+    FLANN_INDEX_AUTOTUNED = 255,
+
+    // deprecated constants, should use the FLANN_INDEX_* ones instead
+    LINEAR = 0,
+    KDTREE = 1,
+    KMEANS = 2,
+    COMPOSITE = 3,
+    KDTREE_SINGLE = 4,
+    SAVED = 254,
+    AUTOTUNED = 255
+};
+
+
+/* Identifiers for the available center-initialisation methods. */
+enum flann_centers_init_t
+{
+    FLANN_CENTERS_RANDOM = 0,
+    FLANN_CENTERS_GONZALES = 1,
+    FLANN_CENTERS_KMEANSPP = 2,
+    FLANN_CENTERS_GROUPWISE = 3, // no deprecated short alias
+
+    // deprecated constants, should use the FLANN_CENTERS_* ones instead
+    CENTERS_RANDOM = 0,
+    CENTERS_GONZALES = 1,
+    CENTERS_KMEANSPP = 2
+};
+/* Log levels, numbered from 0 (FLANN_LOG_NONE) up to 4 (FLANN_LOG_INFO). */
+enum flann_log_level_t
+{
+    FLANN_LOG_NONE = 0,
+    FLANN_LOG_FATAL = 1,
+    FLANN_LOG_ERROR = 2,
+    FLANN_LOG_WARN = 3,
+    FLANN_LOG_INFO = 4
+};
+/* Distance type identifiers. Several enumerators are synonyms that share one value (e.g. EUCLIDEAN and L2 are both 1). */
+enum flann_distance_t
+{
+    FLANN_DIST_EUCLIDEAN = 1,
+    FLANN_DIST_L2 = 1,
+    FLANN_DIST_MANHATTAN = 2,
+    FLANN_DIST_L1 = 2,
+    FLANN_DIST_MINKOWSKI = 3,
+    FLANN_DIST_MAX   = 4,
+    FLANN_DIST_HIST_INTERSECT   = 5,
+    FLANN_DIST_HELLINGER = 6,
+    FLANN_DIST_CHI_SQUARE = 7,
+    FLANN_DIST_CS         = 7,
+    FLANN_DIST_KULLBACK_LEIBLER  = 8,
+    FLANN_DIST_KL                = 8,
+    FLANN_DIST_HAMMING          = 9, // no deprecated short alias
+    FLANN_DIST_DNAMMING          = 10, // no deprecated short alias
+
+    // deprecated constants, should use the FLANN_DIST_* ones instead
+    EUCLIDEAN = 1,
+    MANHATTAN = 2,
+    MINKOWSKI = 3,
+    MAX_DIST   = 4,
+    HIST_INTERSECT   = 5,
+    HELLINGER = 6,
+    CS         = 7,
+    KL         = 8,
+    KULLBACK_LEIBLER  = 8
+};
+/* Element datatype tags: signed integers 0-3, unsigned integers 4-7, floating point 8-9. */
+enum flann_datatype_t
+{
+    FLANN_INT8 = 0,
+    FLANN_INT16 = 1,
+    FLANN_INT32 = 2,
+    FLANN_INT64 = 3,
+    FLANN_UINT8 = 4,
+    FLANN_UINT16 = 5,
+    FLANN_UINT32 = 6,
+    FLANN_UINT64 = 7,
+    FLANN_FLOAT32 = 8,
+    FLANN_FLOAT64 = 9
+};
+/* Negative sentinel values; presumably passed where a "checks" count is expected - TODO confirm against the search-parameter code. */
+enum
+{
+    FLANN_CHECKS_UNLIMITED = -1,
+    FLANN_CHECKS_AUTOTUNED = -2
+};
+
+}
+
+//! @endcond
+
+#endif /* OPENCV_FLANN_DEFINES_H_ */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/dist.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/dist.h
new file mode 100644
index 000000000000..3029ebb5ef8e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/dist.h
@@ -0,0 +1,1292 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_DIST_H_
+#define OPENCV_FLANN_DIST_H_
+
+//! @cond IGNORED
+
+#include <cmath>
+#include <cstdlib>
+#include <string.h>
+#ifdef _MSC_VER
+typedef unsigned __int32 uint32_t;
+typedef unsigned __int64 uint64_t;
+#else
+#include <stdint.h>
+#endif
+
+#include "defines.h"
+
+#if defined _WIN32 && (defined(_M_ARM) || defined(_M_ARM64))
+# include <Intrin.h>
+#endif
+
+#if defined(__ARM_NEON) && !defined(__CUDACC__)
+# include "arm_neon.h"
+#endif
+
+namespace cvflann
+{
+/* Absolute value helper: the generic version negates x when x<0; int, float and double forward to ::abs, fabsf and fabs. */
+template<typename T>
+inline T abs(T x) { return (x<0) ? -x : x; }
+
+template<>
+inline int abs<int>(int x) { return ::abs(x); }
+
+template<>
+inline float abs<float>(float x) { return fabsf(x); }
+
+template<>
+inline double abs<double>(double x) { return fabs(x); }
+
+/* round<TargetType>(float): the generic version is a plain static_cast; the integer specialisations add 0.5f before casting. */
+template<typename TargetType>
+inline TargetType round(float x) { return static_cast<TargetType>(x); }
+
+template<>
+inline unsigned int round<unsigned int>(float x) { return static_cast<unsigned int>(x + 0.5f); }
+
+template<>
+inline unsigned short round<unsigned short>(float x) { return static_cast<unsigned short>(x + 0.5f); }
+
+template<>
+inline unsigned char round<unsigned char>(float x) { return static_cast<unsigned char>(x + 0.5f); }
+
+template<>
+inline long long round<long long>(float x) { return static_cast<long long>(x + 0.5f); } // NOTE(review): no (x<0) correction here, unlike int/short/char; e.g. -1.6f becomes -1.1f and casts to -1
+
+template<>
+inline long round<long>(float x) { return static_cast<long>(x + 0.5f); } // NOTE(review): same as long long - negative inputs get no correction
+
+template<>
+inline int round<int>(float x) { return static_cast<int>(x + 0.5f) - (x<0); } // NOTE(review): for -0.5 < x < 0 the cast gives 0 and 1 is still subtracted, so the result is -1
+
+template<>
+inline short round<short>(float x) { return static_cast<short>(x + 0.5f) - (x<0); } // same negative handling as round<int>
+
+template<>
+inline char round<char>(float x) { return static_cast<char>(x + 0.5f) - (x<0); } // same negative handling as round<int>
+
+/* round<TargetType>(double): the generic version is a plain static_cast; the integer specialisations add 0.5 before casting. */
+template<typename TargetType>
+inline TargetType round(double x) { return static_cast<TargetType>(x); }
+
+template<>
+inline unsigned int round<unsigned int>(double x) { return static_cast<unsigned int>(x + 0.5); }
+
+template<>
+inline unsigned short round<unsigned short>(double x) { return static_cast<unsigned short>(x + 0.5); }
+
+template<>
+inline unsigned char round<unsigned char>(double x) { return static_cast<unsigned char>(x + 0.5); }
+
+template<>
+inline long long round<long long>(double x) { return static_cast<long long>(x + 0.5); } // NOTE(review): no (x<0) correction here, unlike int/short/char; e.g. -1.6 becomes -1.1 and casts to -1
+
+template<>
+inline long round<long>(double x) { return static_cast<long>(x + 0.5); } // NOTE(review): same as long long - negative inputs get no correction
+
+template<>
+inline int round<int>(double x) { return static_cast<int>(x + 0.5) - (x<0); } // NOTE(review): for -0.5 < x < 0 the cast gives 0 and 1 is still subtracted, so the result is -1
+
+template<>
+inline short round<short>(double x) { return static_cast<short>(x + 0.5) - (x<0); } // same negative handling as round<int>
+
+template<>
+inline char round<char>(double x) { return static_cast<char>(x + 0.5) - (x<0); } // same negative handling as round<int>
+
+/* Maps an element type to the type used for accumulation: float for char, short, int and their unsigned forms, T itself otherwise. */
+template<typename T>
+struct Accumulator { typedef T Type; };
+template<>
+struct Accumulator<unsigned char>  { typedef float Type; };
+template<>
+struct Accumulator<unsigned short> { typedef float Type; };
+template<>
+struct Accumulator<unsigned int> { typedef float Type; };
+template<>
+struct Accumulator<char>   { typedef float Type; };
+template<>
+struct Accumulator<short>  { typedef float Type; };
+template<>
+struct Accumulator<int> { typedef float Type; };
+/* Boolean tag classes (val is true / false). Any macros named True or False are undefined first so the class names can be declared. */
+#undef True
+#undef False
+
+class True
+{
+public:
+    static const bool val = true;
+};
+
+class False
+{
+public:
+    static const bool val = false;
+};
+
+
+/*
+ * This is a "zero iterator". It basically behaves like a zero filled
+ * array to all algorithms that use arrays as iterators (STL style).
+ * It's useful when there's a need to compute the distance between a
+ * feature and the origin, and it allows for better compiler optimisation
+ * than using a zero-filled array.
+ */
+template <typename T>
+struct ZeroIterator
+{
+    // Dereferencing always yields 0.
+    T operator*()
+    {
+        return 0;
+    }
+    // Indexing always yields 0; the index is ignored.
+    T operator[](int)
+    {
+        return 0;
+    }
+    // Pre-increment does nothing and returns this iterator.
+    const ZeroIterator<T>& operator ++()
+    {
+        return *this;
+    }
+    // Post-increment does nothing and returns a copy of this iterator.
+    ZeroIterator<T> operator ++(int)
+    {
+        return *this;
+    }
+    // Advancing by any amount does nothing.
+    ZeroIterator<T>& operator+=(int)
+    {
+        return *this;
+    }
+
+};
+
+
+
+/**
+ * Squared Euclidean distance functor.
+ *
+ * This is the simpler version: a plain loop handling one element per
+ * iteration. Preferable for very low dimensionality data (eg 3D points)
+ */
+template<class T>
+struct L2_Simple
+{
+    typedef True is_kdtree_distance;
+    typedef True is_vector_space_distance;
+
+    typedef T ElementType;
+    typedef typename Accumulator<T>::Type ResultType;
+    typedef ResultType CentersType;
+    // Sum of squared element differences over `size` elements; the fourth argument is accepted but unused.
+    template <typename Iterator1, typename Iterator2>
+    ResultType operator()(Iterator1 a, Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const
+    {
+        ResultType result = ResultType();
+        ResultType diff;
+        for(size_t i = 0; i < size; ++i ) {
+            diff = (ResultType)(*a++ - *b++);
+            result += diff*diff;
+        }
+        return result;
+    }
+    // Squared difference of a single coordinate pair; the int argument is ignored.
+    template <typename U, typename V>
+    inline ResultType accum_dist(const U& a, const V& b, int) const
+    {
+        return (a-b)*(a-b);
+    }
+};
+
+
+
+/**
+ * Squared Euclidean distance functor, optimized version
+ */
+template<class T>
+struct L2
+{
+    typedef True is_kdtree_distance;
+    typedef True is_vector_space_distance;
+
+    typedef T ElementType;
+    typedef typename Accumulator<T>::Type ResultType;
+    typedef ResultType CentersType;
+
+    /**
+     *  Compute the squared Euclidean distance between two vectors.
+     *
+     *  The main loop is unrolled to handle four elements per iteration.
+     *  When worst_dist > 0 the partial sum is returned as soon as it exceeds worst_dist.
+     *
+     *  The computation of the square root at the end is omitted for
+     *  efficiency.
+     */
+    template <typename Iterator1, typename Iterator2>
+    ResultType operator()(Iterator1 a, Iterator2 b, size_t size, ResultType worst_dist = -1) const
+    {
+        ResultType result = ResultType();
+        ResultType diff0, diff1, diff2, diff3;
+        Iterator1 last = a + size;
+        Iterator1 lastgroup = last - 3; // the unrolled loop runs while at least four elements remain
+
+        /* Process 4 items with each loop for efficiency. */
+        while (a < lastgroup) {
+            diff0 = (ResultType)(a[0] - b[0]);
+            diff1 = (ResultType)(a[1] - b[1]);
+            diff2 = (ResultType)(a[2] - b[2]);
+            diff3 = (ResultType)(a[3] - b[3]);
+            result += diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3;
+            a += 4;
+            b += 4;
+
+            if ((worst_dist>0)&&(result>worst_dist)) { // early exit with the partial sum
+                return result;
+            }
+        }
+        /* Process the last 0-3 elements.  Not needed for standard vector lengths. */
+        while (a < last) {
+            diff0 = (ResultType)(*a++ - *b++);
+            result += diff0 * diff0;
+        }
+        return result;
+    }
+
+    /**
+     *  Partial euclidean distance, using just one dimension. This is used by the
+     *  kd-tree when computing partial distances while traversing the tree.
+     *
+     *  The square root is omitted for efficiency; the int argument is ignored.
+     */
+    template <typename U, typename V>
+    inline ResultType accum_dist(const U& a, const V& b, int) const
+    {
+        return (a-b)*(a-b);
+    }
+};
+
+
+/*
+ * Manhattan distance functor, optimized version
+ */
+template<class T>
+struct L1
+{
+    typedef True is_kdtree_distance;
+    typedef True is_vector_space_distance;
+
+    typedef T ElementType;
+    typedef typename Accumulator<T>::Type ResultType;
+    typedef ResultType CentersType;
+
+    /**
+     *  Compute the Manhattan (L_1) distance between two vectors.
+     *
+     *  The main loop is unrolled to handle four elements per iteration.
+     *  When worst_dist > 0 the partial sum is returned as soon as it exceeds worst_dist.
+     */
+    template <typename Iterator1, typename Iterator2>
+    ResultType operator()(Iterator1 a, Iterator2 b, size_t size, ResultType worst_dist = -1) const
+    {
+        ResultType result = ResultType();
+        ResultType diff0, diff1, diff2, diff3;
+        Iterator1 last = a + size;
+        Iterator1 lastgroup = last - 3; // the unrolled loop runs while at least four elements remain
+
+        /* Process 4 items with each loop for efficiency. */
+        while (a < lastgroup) {
+            diff0 = (ResultType)abs(a[0] - b[0]);
+            diff1 = (ResultType)abs(a[1] - b[1]);
+            diff2 = (ResultType)abs(a[2] - b[2]);
+            diff3 = (ResultType)abs(a[3] - b[3]);
+            result += diff0 + diff1 + diff2 + diff3;
+            a += 4;
+            b += 4;
+
+            if ((worst_dist>0)&&(result>worst_dist)) { // early exit with the partial sum
+                return result;
+            }
+        }
+        /* Process the last 0-3 elements.  Not needed for standard vector lengths. */
+        while (a < last) {
+            diff0 = (ResultType)abs(*a++ - *b++);
+            result += diff0;
+        }
+        return result;
+    }
+
+    /**
+     * Partial distance, used by the kd-tree: |a-b| for one coordinate pair (the int argument is ignored).
+     */
+    template <typename U, typename V>
+    inline ResultType accum_dist(const U& a, const V& b, int) const
+    {
+        return abs(a-b);
+    }
+};
+
+
+/* Minkowski distance functor: accumulates |a[i]-b[i]| raised to `order`; the exponent is fixed at construction. */
+template<class T>
+struct MinkowskiDistance
+{
+    typedef True is_kdtree_distance;
+    typedef True is_vector_space_distance;
+
+    typedef T ElementType;
+    typedef typename Accumulator<T>::Type ResultType;
+    typedef ResultType CentersType;
+
+    int order; // exponent applied to every absolute difference
+
+    MinkowskiDistance(int order_) : order(order_) {}
+
+    /**
+     *  Compute the Minkowski (L_p) distance between two vectors.
+     *
+     *  The main loop is unrolled to handle four elements per iteration.
+     *  When worst_dist > 0 the partial sum is returned as soon as it exceeds worst_dist.
+     *
+     *  No root is taken at the end: the value returned is the plain
+     *  sum of the powered absolute differences.
+     */
+    template <typename Iterator1, typename Iterator2>
+    ResultType operator()(Iterator1 a, Iterator2 b, size_t size, ResultType worst_dist = -1) const
+    {
+        ResultType result = ResultType();
+        ResultType diff0, diff1, diff2, diff3;
+        Iterator1 last = a + size;
+        Iterator1 lastgroup = last - 3; // the unrolled loop runs while at least four elements remain
+
+        /* Process 4 items with each loop for efficiency. */
+        while (a < lastgroup) {
+            diff0 = (ResultType)abs(a[0] - b[0]);
+            diff1 = (ResultType)abs(a[1] - b[1]);
+            diff2 = (ResultType)abs(a[2] - b[2]);
+            diff3 = (ResultType)abs(a[3] - b[3]);
+            result += pow(diff0,order) + pow(diff1,order) + pow(diff2,order) + pow(diff3,order);
+            a += 4;
+            b += 4;
+
+            if ((worst_dist>0)&&(result>worst_dist)) { // early exit with the partial sum
+                return result;
+            }
+        }
+        /* Process the last 0-3 elements.  Not needed for standard vector lengths. */
+        while (a < last) {
+            diff0 = (ResultType)abs(*a++ - *b++);
+            result += pow(diff0,order);
+        }
+        return result;
+    }
+
+    /**
+     * Partial distance, used by the kd-tree: |a-b| raised to `order` (the int argument is ignored).
+     */
+    template <typename U, typename V>
+    inline ResultType accum_dist(const U& a, const V& b, int) const
+    {
+        return pow(static_cast<ResultType>(abs(a-b)),order);
+    }
+};
+
+
+/* L-infinity distance functor: the largest absolute element difference between two vectors. */
+template<class T>
+struct MaxDistance
+{
+    typedef False is_kdtree_distance;
+    typedef True is_vector_space_distance;
+
+    typedef T ElementType;
+    typedef typename Accumulator<T>::Type ResultType;
+    typedef ResultType CentersType;
+
+    /**
+     *  Compute the max distance (L_infinity) between two vectors.
+     *
+     *  This distance is not a valid kdtree distance, it's not dimensionwise additive.
+     */
+    template <typename Iterator1, typename Iterator2>
+    ResultType operator()(Iterator1 a, Iterator2 b, size_t size, ResultType worst_dist = -1) const
+    {
+        ResultType result = ResultType();
+        ResultType diff0, diff1, diff2, diff3;
+        Iterator1 last = a + size;
+        Iterator1 lastgroup = last - 3; // the unrolled loop runs while at least four elements remain
+
+        /* Process 4 items with each loop for efficiency. */
+        while (a < lastgroup) {
+            diff0 = abs(a[0] - b[0]);
+            diff1 = abs(a[1] - b[1]);
+            diff2 = abs(a[2] - b[2]);
+            diff3 = abs(a[3] - b[3]);
+            if (diff0>result) {result = diff0; }
+            if (diff1>result) {result = diff1; }
+            if (diff2>result) {result = diff2; }
+            if (diff3>result) {result = diff3; }
+            a += 4;
+            b += 4;
+
+            if ((worst_dist>0)&&(result>worst_dist)) { // early exit with the running maximum
+                return result;
+            }
+        }
+        /* Process the last 0-3 elements.  Not needed for standard vector lengths. */
+        while (a < last) {
+            diff0 = abs(*a++ - *b++);
+            result = (diff0>result) ? diff0 : result;
+        }
+        return result;
+    }
+
+    /* This distance functor is not dimension-wise additive, which
+     * makes it an invalid kd-tree distance, so it does not implement the accum_dist method */
+
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Hamming distance functor - counts the bit differences between two strings - useful for the Brief descriptor
+ * (bit count of A XOR B, looked up per byte in a 256-entry table)
+ */
+struct HammingLUT
+{
+    typedef False is_kdtree_distance;
+    typedef False is_vector_space_distance;
+
+    typedef unsigned char ElementType;
+    typedef int ResultType;
+    typedef ElementType CentersType;
+
+    /** Counts the set bits in a ^ b over `size` bytes; b is reinterpreted as a byte pointer.
+     */
+    template<typename Iterator2>
+    ResultType operator()(const unsigned char* a, const Iterator2 b, size_t size) const
+    {
+        static const uchar popCountTable[] = // NOTE(review): uchar is not declared by this header's own includes; it must come from the including code
+        {
+            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+            3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+        };
+        ResultType result = 0;
+        const unsigned char* b2 = reinterpret_cast<const unsigned char*> (b);
+        for (size_t i = 0; i < size; i++) {
+            result += popCountTable[a[i] ^ b2[i]];
+        }
+        return result;
+    }
+
+    // Overload for a zero second operand: counts the set bits of a alone (b is unused).
+    ResultType operator()(const unsigned char* a, const ZeroIterator<unsigned char> b, size_t size) const
+    {
+        (void)b;
+        static const uchar popCountTable[] =
+        {
+            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+            3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+        };
+        ResultType result = 0;
+        for (size_t i = 0; i < size; i++) {
+            result += popCountTable[a[i]];
+        }
+        return result;
+    }
+};
+
+/**
+ * Hamming distance functor (pop count between two binary vectors, i.e. xor them and count the number of bits set)
+ * That code was taken from brief.cpp in OpenCV
+ */
+template<class T>
+struct Hamming
+{
+    typedef False is_kdtree_distance;
+    typedef False is_vector_space_distance;
+
+    // Three build-time paths are used below: ARM NEON, GCC-compatible builtin popcount, or the HammingLUT fallback.
+    typedef T ElementType;
+    typedef int ResultType;
+    typedef ElementType CentersType;
+    // Bit count of a XOR b over `size` bytes; both iterators are reinterpreted as raw pointers and the 4th argument is unused.
+    template<typename Iterator1, typename Iterator2>
+    ResultType operator()(const Iterator1 a, const Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const
+    {
+        ResultType result = 0;
+#if defined(__ARM_NEON) && !defined(__CUDACC__)
+        {
+            const unsigned char* a2 = reinterpret_cast<const unsigned char*> (a);
+            const unsigned char* b2 = reinterpret_cast<const unsigned char*> (b);
+            uint32x4_t bits = vmovq_n_u32(0);
+            for (size_t i = 0; i < size; i += 16) { // NOTE(review): every step loads 16 bytes, so a size that is not a multiple of 16 reads past the end
+                uint8x16_t A_vec = vld1q_u8 (a2 + i);
+                uint8x16_t B_vec = vld1q_u8 (b2 + i);
+                uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
+                uint8x16_t bitsSet = vcntq_u8 (AxorB);
+                uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
+                uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
+                bits = vaddq_u32(bits, bitSet4);
+            }
+            uint64x2_t bitSet2 = vpaddlq_u32 (bits);
+            result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
+            result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
+        }
+#elif defined(__GNUC__)
+        {
+            //for portability just use unsigned long long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
+            typedef unsigned long long pop_t;
+            const size_t modulo = size % sizeof(pop_t);
+            const pop_t* a2 = reinterpret_cast<const pop_t*> (a);
+            const pop_t* b2 = reinterpret_cast<const pop_t*> (b);
+            const pop_t* a2_end = a2 + (size / sizeof(pop_t));
+
+            for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2));
+
+            if (modulo) {
+                //in the case where size is not divisible by sizeof(pop_t)
+                //copy the remaining bytes into zero-initialised words so the unused bits stay 0
+                pop_t a_final = 0, b_final = 0;
+                memcpy(&a_final, a2, modulo);
+                memcpy(&b_final, b2, modulo);
+                result += __builtin_popcountll(a_final ^ b_final);
+            }
+        }
+#else // NO NEON and NOT GNUC
+        HammingLUT lut;
+        result = lut(reinterpret_cast<const unsigned char*> (a),
+                     reinterpret_cast<const unsigned char*> (b), size);
+#endif
+        return result;
+    }
+
+    // Overload for a zero second operand: bit count of a alone over `size` bytes (b is unused except in the LUT fallback).
+    template<typename Iterator1>
+    ResultType operator()(const Iterator1 a, ZeroIterator<unsigned char> b, size_t size, ResultType /*worst_dist*/ = -1) const
+    {
+        (void)b;
+        ResultType result = 0;
+#if defined(__ARM_NEON) && !defined(__CUDACC__)
+        {
+            const unsigned char* a2 = reinterpret_cast<const unsigned char*> (a);
+            uint32x4_t bits = vmovq_n_u32(0);
+            for (size_t i = 0; i < size; i += 16) { // NOTE(review): every step loads 16 bytes, so a size that is not a multiple of 16 reads past the end
+                uint8x16_t A_vec = vld1q_u8 (a2 + i);
+                uint8x16_t bitsSet = vcntq_u8 (A_vec);
+                uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
+                uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
+                bits = vaddq_u32(bits, bitSet4);
+            }
+            uint64x2_t bitSet2 = vpaddlq_u32 (bits);
+            result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
+            result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
+        }
+#elif defined(__GNUC__)
+        {
+            //for portability just use unsigned long long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
+            typedef unsigned long long pop_t;
+            const size_t modulo = size % sizeof(pop_t);
+            const pop_t* a2 = reinterpret_cast<const pop_t*> (a);
+            const pop_t* a2_end = a2 + (size / sizeof(pop_t));
+
+            for (; a2 != a2_end; ++a2) result += __builtin_popcountll(*a2);
+
+            if (modulo) {
+                //in the case where size is not divisible by sizeof(pop_t)
+                //copy the remaining bytes into a zero-initialised word so the unused bits stay 0
+                pop_t a_final = 0;
+                memcpy(&a_final, a2, modulo);
+                result += __builtin_popcountll(a_final);
+            }
+        }
+#else // NO NEON and NOT GNUC
+        HammingLUT lut;
+        result = lut(reinterpret_cast<const unsigned char*> (a), b, size);
+#endif
+        return result;
+    }
+};
+/* Hamming distance functor using arithmetic popcounts on 64-bit words (FLANN_PLATFORM_64_BIT) or 32-bit words otherwise. */
+template<typename T>
+struct Hamming2
+{
+    typedef False is_kdtree_distance;
+    typedef False is_vector_space_distance;
+
+    typedef T ElementType;
+    typedef int ResultType;
+    typedef ElementType CentersType;
+
+    /** This is popcount_3() from:
+     * http://en.wikipedia.org/wiki/Hamming_weight */
+    unsigned int popcnt32(uint32_t n) const
+    {
+        n -= ((n >> 1) & 0x55555555);
+        n = (n & 0x33333333) + ((n >> 2) & 0x33333333);
+        return (((n + (n >> 4))& 0xF0F0F0F)* 0x1010101) >> 24;
+    }
+
+#ifdef FLANN_PLATFORM_64_BIT
+    unsigned int popcnt64(uint64_t n) const // 64-bit counterpart of popcnt32, with the masks widened
+    {
+        n -= ((n >> 1) & 0x5555555555555555);
+        n = (n & 0x3333333333333333) + ((n >> 2) & 0x3333333333333333);
+        return (((n + (n >> 4))& 0x0f0f0f0f0f0f0f0f)* 0x0101010101010101) >> 56;
+    }
+#endif
+    // Bit count of a XOR b; `size` is in bytes and is asserted (debug only) to be a multiple of long_word_size_.
+    template <typename Iterator1, typename Iterator2>
+    ResultType operator()(const Iterator1 a, const Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const
+    {
+        CV_DbgAssert(!(size % long_word_size_) && "vectors size must be multiple of long words size (i.e. 8)");
+        // NOTE(review): the message says 8, but long_word_size_ is sizeof(uint32_t) on the non-64-bit path.
+#ifdef FLANN_PLATFORM_64_BIT
+        const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
+        const uint64_t* pb = reinterpret_cast<const uint64_t*>(b);
+        ResultType result = 0;
+        size /= long_word_size_; // from here on `size` counts words, not bytes
+        for(size_t i = 0; i < size; ++i ) {
+            result += popcnt64(*pa ^ *pb);
+            ++pa;
+            ++pb;
+        }
+#else
+        const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
+        const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
+        ResultType result = 0;
+        size /= long_word_size_; // from here on `size` counts words, not bytes
+        for(size_t i = 0; i < size; ++i ) {
+            result += popcnt32(*pa ^ *pb);
+            ++pa;
+            ++pb;
+        }
+#endif
+        return result;
+    }
+
+    // Overload for a zero second operand: bit count of a alone (b is unused).
+    template <typename Iterator1>
+    ResultType operator()(const Iterator1 a, ZeroIterator<unsigned char> b, size_t size, ResultType /*worst_dist*/ = -1) const
+    {
+        CV_DbgAssert(!(size % long_word_size_) && "vectors size must be multiple of long words size (i.e. 8)");
+
+        (void)b;
+#ifdef FLANN_PLATFORM_64_BIT
+        const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
+        ResultType result = 0;
+        size /= long_word_size_; // from here on `size` counts words, not bytes
+        for(size_t i = 0; i < size; ++i ) {
+            result += popcnt64(*pa);
+            ++pa;
+        }
+#else
+        const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
+        ResultType result = 0;
+        size /= long_word_size_; // from here on `size` counts words, not bytes
+        for(size_t i = 0; i < size; ++i ) {
+            result += popcnt32(*pa);
+            ++pa;
+        }
+#endif
+        return result;
+    }
+
+private:
+#ifdef FLANN_PLATFORM_64_BIT
+    static const size_t long_word_size_ = sizeof(uint64_t)/sizeof(unsigned char); // bytes per processed word
+#else
+    static const size_t long_word_size_ = sizeof(uint32_t)/sizeof(unsigned char); // bytes per processed word
+#endif
+};
+
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/* Table-driven distance functor: each table entry is the number of non-zero 2-bit groups in the byte used as index. */
+struct DNAmmingLUT
+{
+    typedef False is_kdtree_distance;
+    typedef False is_vector_space_distance;
+
+    typedef unsigned char ElementType;
+    typedef int ResultType;
+    typedef ElementType CentersType;
+
+    /** Counts the non-zero 2-bit groups in a ^ b over `size` bytes; b is reinterpreted as a byte pointer.
+     */
+    template<typename Iterator2>
+    ResultType operator()(const unsigned char* a, const Iterator2 b, size_t size) const
+    {
+        static const uchar popCountTable[] = // NOTE(review): uchar is not declared by this header's own includes; it must come from the including code
+        {
+            0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+            1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+            1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+            2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+            1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+            2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+            1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+            2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
+        };
+        ResultType result = 0;
+        const unsigned char* b2 = reinterpret_cast<const unsigned char*> (b);
+        for (size_t i = 0; i < size; i++) {
+            result += popCountTable[a[i] ^ b2[i]];
+        }
+        return result;
+    }
+
+    // Overload for a zero second operand: sums the table entries for the bytes of a alone (b is unused).
+    ResultType operator()(const unsigned char* a, const ZeroIterator<unsigned char> b, size_t size) const
+    {
+        (void)b;
+        static const uchar popCountTable[] =
+        {
+            0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+            1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+            1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+            2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+            1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+            2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+            1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+            2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
+        };
+        ResultType result = 0;
+        for (size_t i = 0; i < size; i++) {
+            result += popCountTable[a[i]];
+        }
+        return result;
+    }
+};
+
+
+template<typename T>
+struct DNAmming2
+{
+    typedef False is_kdtree_distance;
+    typedef False is_vector_space_distance;
+
+    typedef T ElementType;
+    typedef int ResultType;
+    typedef ElementType CentersType;
+
+    /** Variant of popcount_3() from http://en.wikipedia.org/wiki/Hamming_weight :
+     * the first step ORs the two bits of each pair, so every non-zero 2-bit group counts once. */
+    unsigned int popcnt32(uint32_t n) const
+    {
+        n = ((n >> 1) | n) & 0x55555555; // low bit of each pair = (high bit | low bit)
+        n = (n & 0x33333333) + ((n >> 2) & 0x33333333); // add neighbouring pair flags
+        return (((n + (n >> 4))& 0x0F0F0F0F)* 0x01010101) >> 24; // per-byte sums, folded into the top byte
+    }
+
+#ifdef FLANN_PLATFORM_64_BIT
+    unsigned int popcnt64(uint64_t n) const // same steps as popcnt32, on a 64-bit word
+    {
+        n = ((n >> 1) | n) & 0x5555555555555555;
+        n = (n & 0x3333333333333333) + ((n >> 2) & 0x3333333333333333);
+        return (((n + (n >> 4))& 0x0f0f0f0f0f0f0f0f)* 0x0101010101010101) >> 56;
+    }
+#endif
+    /** Counts differing 2-bit groups of a and b, read word by word; size is in bytes, a multiple of long_word_size_ (8 in the 64-bit build, else 4). */
+    template <typename Iterator1, typename Iterator2>
+    ResultType operator()(const Iterator1 a, const Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const
+    {
+        CV_DbgAssert(!(size % long_word_size_) && "vectors size must be multiple of long words size (i.e. 8)");
+
+#ifdef FLANN_PLATFORM_64_BIT
+        const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
+        const uint64_t* pb = reinterpret_cast<const uint64_t*>(b);
+        ResultType result = 0;
+        size /= long_word_size_; // from a byte count to a 64-bit word count
+        for(size_t i = 0; i < size; ++i ) {
+            result += popcnt64(*pa ^ *pb);
+            ++pa;
+            ++pb;
+        }
+#else
+        const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
+        const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
+        ResultType result = 0;
+        size /= long_word_size_; // from a byte count to a 32-bit word count
+        for(size_t i = 0; i < size; ++i ) {
+            result += popcnt32(*pa ^ *pb);
+            ++pa;
+            ++pb;
+        }
+#endif
+        return result;
+    }
+
+    /** ZeroIterator overload: b is ignored, so only the non-zero 2-bit groups of a are counted. */
+    template <typename Iterator1>
+    ResultType operator()(const Iterator1 a, ZeroIterator<unsigned char> b, size_t size, ResultType /*worst_dist*/ = -1) const
+    {
+        CV_DbgAssert(!(size % long_word_size_) && "vectors size must be multiple of long words size (i.e. 8)");
+
+        (void)b;
+#ifdef FLANN_PLATFORM_64_BIT
+        const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
+        ResultType result = 0;
+        size /= long_word_size_; // from a byte count to a 64-bit word count
+        for(size_t i = 0; i < size; ++i ) {
+            result += popcnt64(*pa);
+            ++pa;
+        }
+#else
+        const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
+        ResultType result = 0;
+        size /= long_word_size_; // from a byte count to a 32-bit word count
+        for(size_t i = 0; i < size; ++i ) {
+            result += popcnt32(*pa);
+            ++pa;
+        }
+#endif
+        return result;
+    }
+
+private:
+#ifdef FLANN_PLATFORM_64_BIT
+    static const size_t long_word_size_= sizeof(uint64_t)/sizeof(unsigned char); // bytes per processed word
+#else
+    static const size_t long_word_size_= sizeof(uint32_t)/sizeof(unsigned char); // bytes per processed word
+#endif
+};
+
+
+
+template<class T>
+struct HistIntersectionDistance
+{
+    typedef True is_kdtree_distance;
+    typedef True is_vector_space_distance;
+
+    typedef T ElementType;
+    typedef typename Accumulator<T>::Type ResultType;
+    typedef ResultType CentersType;
+
+    /**
+     *  Compute the histogram intersection distance: the sum of the element-wise minima min(a[i], b[i])
+     */
+    template <typename Iterator1, typename Iterator2>
+    ResultType operator()(Iterator1 a, Iterator2 b, size_t size, ResultType worst_dist = -1) const
+    {
+        ResultType result = ResultType();
+        ResultType min0, min1, min2, min3;
+        Iterator1 last = a + size;
+        Iterator1 lastgroup = last - 3; // a full group of 4 starts strictly before this position
+
+        /* Process 4 items with each loop for efficiency. */
+        while (a < lastgroup) {
+            min0 = (ResultType)(a[0] < b[0] ? a[0] : b[0]);
+            min1 = (ResultType)(a[1] < b[1] ? a[1] : b[1]);
+            min2 = (ResultType)(a[2] < b[2] ? a[2] : b[2]);
+            min3 = (ResultType)(a[3] < b[3] ? a[3] : b[3]);
+            result += min0 + min1 + min2 + min3;
+            a += 4;
+            b += 4;
+            if ((worst_dist>0)&&(result>worst_dist)) { // early exit; only active for a positive worst_dist
+                return result;
+            }
+        }
+        /* Process last 0-3 pixels.  Not needed for standard vector lengths. */
+        while (a < last) {
+            min0 = (ResultType)(*a < *b ? *a : *b);
+            result += min0;
+            ++a;
+            ++b;
+        }
+        return result;
+    }
+
+    /**
+     * Partial distance, used by the kd-tree: the smaller of a and b (the int argument is unused).
+     */
+    template <typename U, typename V>
+    inline ResultType accum_dist(const U& a, const V& b, int) const
+    {
+        return a<b ? a : b;
+    }
+};
+
+
+
+template<class T>
+struct HellingerDistance
+{
+    typedef True is_kdtree_distance;
+    typedef True is_vector_space_distance;
+
+    typedef T ElementType;
+    typedef typename Accumulator<T>::Type ResultType;
+    typedef ResultType CentersType;
+
+    /**
+     *  Compute the Hellinger distance as the sum of (sqrt(a[i]) - sqrt(b[i]))^2; no final square root is taken
+     */
+    template <typename Iterator1, typename Iterator2>
+    ResultType operator()(Iterator1 a, Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const
+    {
+        ResultType result = ResultType();
+        ResultType diff0, diff1, diff2, diff3;
+        Iterator1 last = a + size;
+        Iterator1 lastgroup = last - 3; // a full group of 4 starts strictly before this position
+
+        /* Process 4 items with each loop for efficiency. */
+        while (a < lastgroup) {
+            diff0 = sqrt(static_cast<ResultType>(a[0])) - sqrt(static_cast<ResultType>(b[0]));
+            diff1 = sqrt(static_cast<ResultType>(a[1])) - sqrt(static_cast<ResultType>(b[1]));
+            diff2 = sqrt(static_cast<ResultType>(a[2])) - sqrt(static_cast<ResultType>(b[2]));
+            diff3 = sqrt(static_cast<ResultType>(a[3])) - sqrt(static_cast<ResultType>(b[3]));
+            result += diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3;
+            a += 4;
+            b += 4;
+        }
+        while (a < last) { // remaining 0-3 items
+            diff0 = sqrt(static_cast<ResultType>(*a++)) - sqrt(static_cast<ResultType>(*b++));
+            result += diff0 * diff0;
+        }
+        return result;
+    }
+
+    /**
+     * Partial distance, used by the kd-tree: (sqrt(a) - sqrt(b))^2 (the int argument is unused).
+     */
+    template <typename U, typename V>
+    inline ResultType accum_dist(const U& a, const V& b, int) const
+    {
+        ResultType diff = sqrt(static_cast<ResultType>(a)) - sqrt(static_cast<ResultType>(b));
+        return diff * diff;
+    }
+};
+
+
+template<class T>
+struct ChiSquareDistance
+{
+    typedef True is_kdtree_distance;
+    typedef True is_vector_space_distance;
+
+    typedef T ElementType;
+    typedef typename Accumulator<T>::Type ResultType;
+    typedef ResultType CentersType;
+
+    /**
+     *  Compute the chi-square distance: sum of (a[i]-b[i])^2 / (a[i]+b[i]) over elements whose sum is positive
+     */
+    template <typename Iterator1, typename Iterator2>
+    ResultType operator()(Iterator1 a, Iterator2 b, size_t size, ResultType worst_dist = -1) const
+    {
+        ResultType result = ResultType();
+        ResultType sum, diff;
+        Iterator1 last = a + size;
+
+        while (a < last) {
+            sum = (ResultType)(*a + *b);
+            if (sum>0) { // skips the term (and the division) when the sum is not positive
+                diff = (ResultType)(*a - *b);
+                result += diff*diff/sum;
+            }
+            ++a;
+            ++b;
+
+            if ((worst_dist>0)&&(result>worst_dist)) { // early exit; only active for a positive worst_dist
+                return result;
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Partial distance, used by the kd-tree: one term of the sum above (the int argument is unused).
+     */
+    template <typename U, typename V>
+    inline ResultType accum_dist(const U& a, const V& b, int) const
+    {
+        ResultType result = ResultType();
+        ResultType sum, diff;
+
+        sum = (ResultType)(a+b);
+        if (sum>0) { // result stays at its initial value when the sum is not positive
+            diff = (ResultType)(a-b);
+            result = diff*diff/sum;
+        }
+        return result;
+    }
+};
+
+
+template<class T>
+struct KL_Divergence
+{
+    typedef True is_kdtree_distance;
+    typedef True is_vector_space_distance;
+
+    typedef T ElementType;
+    typedef typename Accumulator<T>::Type ResultType;
+    typedef ResultType CentersType;
+
+    /**
+     *  Compute the Kullback-Leibler divergence: sum of a[i] * log(a[i] / b[i]) over pairs with a positive ratio
+     */
+    template <typename Iterator1, typename Iterator2>
+    ResultType operator()(Iterator1 a, Iterator2 b, size_t size, ResultType worst_dist = -1) const
+    {
+        ResultType result = ResultType();
+        Iterator1 last = a + size;
+
+        while (a < last) {
+            if ( *a != 0 && *b != 0 ) { // a zero on either side contributes nothing (and avoids dividing by 0)
+                ResultType ratio = static_cast<ResultType>(*a) / static_cast<ResultType>(*b); // not *a / *b: integral T would truncate
+                if (ratio>0) {
+                    result += *a * log(ratio);
+                }
+            }
+            ++a;
+            ++b;
+
+            if ((worst_dist>0)&&(result>worst_dist)) { // early exit; only active for a positive worst_dist
+                return result;
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Partial distance, used by the kd-tree: one term of the sum above (the int argument is unused).
+     */
+    template <typename U, typename V>
+    inline ResultType accum_dist(const U& a, const V& b, int) const
+    {
+        ResultType result = ResultType();
+        if( a != 0 && b != 0 ) {
+            ResultType ratio = static_cast<ResultType>(a) / static_cast<ResultType>(b); // divide in ResultType, see operator()
+            if (ratio>0) {
+                result = a * log(ratio);
+            }
+        }
+        return result;
+    }
+};
+
+
+/*
+ * Some distance functors already return squared values (e.g. L2) and some do not
+ * (e.g. Hamming). KMeans++ for instance needs squared distances, so the following
+ * templates map a distance value to its squared form: the generic case squares it,
+ * the specializations below return it unchanged.
+ */
+template <typename Distance, typename ElementType>
+struct squareDistance
+{
+    typedef typename Distance::ResultType ResultType;
+    ResultType operator()( ResultType dist ) { return dist*dist; } // generic case: square the value
+};
+
+
+template <typename ElementType>
+struct squareDistance<L2_Simple<ElementType>, ElementType>
+{
+    typedef typename L2_Simple<ElementType>::ResultType ResultType;
+    ResultType operator()( ResultType dist ) { return dist; } // returned unchanged
+};
+
+template <typename ElementType>
+struct squareDistance<L2<ElementType>, ElementType>
+{
+    typedef typename L2<ElementType>::ResultType ResultType;
+    ResultType operator()( ResultType dist ) { return dist; } // returned unchanged
+};
+
+
+template <typename ElementType>
+struct squareDistance<MinkowskiDistance<ElementType>, ElementType>
+{
+    typedef typename MinkowskiDistance<ElementType>::ResultType ResultType;
+    ResultType operator()( ResultType dist ) { return dist; } // NOTE(review): treated as squared; presumably exact only for order 2 - confirm
+};
+
+template <typename ElementType>
+struct squareDistance<HellingerDistance<ElementType>, ElementType>
+{
+    typedef typename HellingerDistance<ElementType>::ResultType ResultType;
+    ResultType operator()( ResultType dist ) { return dist; } // returned unchanged
+};
+
+template <typename ElementType>
+struct squareDistance<ChiSquareDistance<ElementType>, ElementType>
+{
+    typedef typename ChiSquareDistance<ElementType>::ResultType ResultType;
+    ResultType operator()( ResultType dist ) { return dist; } // returned unchanged
+};
+
+
+template <typename Distance>
+typename Distance::ResultType ensureSquareDistance( typename Distance::ResultType dist )
+{
+    // Pick the squareDistance functor matching this distance type and apply it to the value.
+    typedef squareDistance<Distance, typename Distance::ElementType> Squarer;
+    Squarer toSquared;
+    return toSquared( dist );
+}
+
+
+/*
+ * ...a template telling whether a distance functor returns squared values (false unless specialized below)
+ */
+
+template <typename Distance, typename ElementType>
+struct isSquareDist
+{
+    bool operator()() { return false; } // generic case: not squared
+};
+
+
+template <typename ElementType>
+struct isSquareDist<L2_Simple<ElementType>, ElementType>
+{
+    bool operator()() { return true; }
+};
+
+template <typename ElementType>
+struct isSquareDist<L2<ElementType>, ElementType>
+{
+    bool operator()() { return true; }
+};
+
+
+template <typename ElementType>
+struct isSquareDist<MinkowskiDistance<ElementType>, ElementType>
+{
+    bool operator()() { return true; } // NOTE(review): presumably exact only for order 2 - confirm
+};
+
+template <typename ElementType>
+struct isSquareDist<HellingerDistance<ElementType>, ElementType>
+{
+    bool operator()() { return true; }
+};
+
+template <typename ElementType>
+struct isSquareDist<ChiSquareDistance<ElementType>, ElementType>
+{
+    bool operator()() { return true; }
+};
+
+
+template <typename Distance>
+bool isSquareDistance()
+{
+    // Ask the isSquareDist functor matching this distance type.
+    typedef isSquareDist<Distance, typename Distance::ElementType> Probe;
+    Probe squaredProbe;
+    return squaredProbe();
+}
+
+/*
+ * ...and a template giving the plain (non-squared) distance without wasted work: the generic
+ * case returns the value unchanged, while the specializations below take its square root.
+ * Going through sqrt(ensureSquareDistance) instead would compute sqrt(dist*dist) for L1, for instance.
+ */
+template <typename Distance, typename ElementType>
+struct simpleDistance
+{
+    typedef typename Distance::ResultType ResultType;
+    ResultType operator()( ResultType dist ) { return dist; } // generic case: returned unchanged
+};
+
+
+template <typename ElementType>
+struct simpleDistance<L2_Simple<ElementType>, ElementType>
+{
+    typedef typename L2_Simple<ElementType>::ResultType ResultType;
+    ResultType operator()( ResultType dist ) { return sqrt(dist); }
+};
+
+template <typename ElementType>
+struct simpleDistance<L2<ElementType>, ElementType>
+{
+    typedef typename L2<ElementType>::ResultType ResultType;
+    ResultType operator()( ResultType dist ) { return sqrt(dist); }
+};
+
+
+template <typename ElementType>
+struct simpleDistance<MinkowskiDistance<ElementType>, ElementType>
+{
+    typedef typename MinkowskiDistance<ElementType>::ResultType ResultType;
+    ResultType operator()( ResultType dist ) { return sqrt(dist); } // NOTE(review): square root presumably exact only for order 2 - confirm
+};
+
+template <typename ElementType>
+struct simpleDistance<HellingerDistance<ElementType>, ElementType>
+{
+    typedef typename HellingerDistance<ElementType>::ResultType ResultType;
+    ResultType operator()( ResultType dist ) { return sqrt(dist); }
+};
+
+template <typename ElementType>
+struct simpleDistance<ChiSquareDistance<ElementType>, ElementType>
+{
+    typedef typename ChiSquareDistance<ElementType>::ResultType ResultType;
+    ResultType operator()( ResultType dist ) { return sqrt(dist); }
+};
+
+
+template <typename Distance>
+typename Distance::ResultType ensureSimpleDistance( typename Distance::ResultType dist )
+{
+    // Pick the simpleDistance functor matching this distance type and apply it to the value.
+    typedef simpleDistance<Distance, typename Distance::ElementType> Simplifier;
+    Simplifier toSimple;
+    return toSimple( dist );
+}
+
+}
+
+//! @endcond
+
+#endif //OPENCV_FLANN_DIST_H_
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/dummy.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/dummy.h
new file mode 100644
index 000000000000..c176f2e4ef0a
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/dummy.h
@@ -0,0 +1,16 @@
+
+#ifndef OPENCV_FLANN_DUMMY_H_
+#define OPENCV_FLANN_DUMMY_H_
+
+//! @cond IGNORED
+
+namespace cvflann
+{
+
+CV_DEPRECATED inline void dummyfunc() {} // deprecated no-op: the body is empty
+
+}
+
+//! @endcond
+
+#endif  /* OPENCV_FLANN_DUMMY_H_ */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/dynamic_bitset.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/dynamic_bitset.h
new file mode 100644
index 000000000000..676cb0b71ebd
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/dynamic_bitset.h
@@ -0,0 +1,160 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+/***********************************************************************
+ * Author: Vincent Rabaud
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_DYNAMIC_BITSET_H_
+#define OPENCV_FLANN_DYNAMIC_BITSET_H_
+
+//! @cond IGNORED
+
+#ifndef FLANN_USE_BOOST
+#  define FLANN_USE_BOOST 0
+#endif
+//#define FLANN_USE_BOOST 1
+#if FLANN_USE_BOOST
+#include <boost/dynamic_bitset.hpp>
+typedef boost::dynamic_bitset<> DynamicBitset;
+#else
+
+#include <limits.h>
+
+#include "dist.h"
+
+namespace cvflann {
+
+/** Minimal stand-in for boost::dynamic_bitset, so that boost is not required.
+ * Unlike the boost class it performs no bound checks, and it can zero a whole
+ * storage cell at once (see reset_block) for speed.
+ */
+class DynamicBitset
+{
+public:
+    /** Creates a bitset holding no bits. */
+    DynamicBitset() : size_(0) {}
+
+    /** Creates a bitset of sz bits, all cleared (the only constructor used by this code).
+     * @param sz the size of the bitset (in bits)
+     */
+    DynamicBitset(size_t sz)
+    {
+        resize(sz);
+        reset();
+    }
+
+    /** Clears every bit (same effect as reset()). */
+    void clear()
+    {
+        reset();
+    }
+
+    /** @brief tells whether the underlying storage holds no cell at all
+     * @return true if the bitset is empty
+     */
+    bool empty() const
+    {
+        return bitset_.empty();
+    }
+
+    /** Clears every bit. */
+    void reset()
+    {
+        for (std::vector<size_t>::iterator cell = bitset_.begin(); cell != bitset_.end(); ++cell) {
+            *cell = 0;
+        }
+    }
+
+    /** @brief clears the bit at position index */
+    void reset(size_t index)
+    {
+        size_t& cell = bitset_[index / cell_bit_size_];
+        cell &= ~mask(index);
+    }
+
+    /** @brief zeroes the whole storage cell containing bit index
+     * Cheaper than clearing bits one by one when the caller is resetting a set of
+     * bits so that the whole bitset ends up 0: neighbouring bits being cleared
+     * as well does not matter in that case.
+     */
+    void reset_block(size_t index)
+    {
+        bitset_[index / cell_bit_size_] = 0;
+    }
+
+    /** Resizes the bitset so that it contains at least sz bits. */
+    void resize(size_t sz)
+    {
+        size_ = sz;
+        bitset_.resize(sz / cell_bit_size_ + 1);
+    }
+
+    /** Sets a bit to 1.
+     * @param index the index of the bit to set to 1
+     */
+    void set(size_t index)
+    {
+        size_t& cell = bitset_[index / cell_bit_size_];
+        cell |= mask(index);
+    }
+
+    /** @return the number of bits the bitset was sized for */
+    size_t size() const
+    {
+        return size_;
+    }
+
+    /** Checks whether a bit is set.
+     * @param index the index of the bit to check
+     * @return true if the bit is set
+     */
+    bool test(size_t index) const
+    {
+        const size_t cell = bitset_[index / cell_bit_size_];
+        return (cell & mask(index)) != 0;
+    }
+
+private:
+    /** Single-bit mask selecting bit index inside its storage cell. */
+    static size_t mask(size_t index) { return size_t(1) << (index % cell_bit_size_); }
+
+    std::vector<size_t> bitset_;
+    size_t size_;
+    static const unsigned int cell_bit_size_ = CHAR_BIT * sizeof(size_t);
+};
+
+} // namespace cvflann
+
+#endif
+
+//! @endcond
+
+#endif // OPENCV_FLANN_DYNAMIC_BITSET_H_
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/flann.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/flann/flann.hpp
new file mode 100644
index 000000000000..227683f97973
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/flann.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/flann.hpp"
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/flann_base.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/flann/flann_base.hpp
new file mode 100644
index 000000000000..af0b380bbf0f
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/flann_base.hpp
@@ -0,0 +1,312 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_BASE_HPP_
+#define OPENCV_FLANN_BASE_HPP_
+
+//! @cond IGNORED
+
+#include <vector>
+#include <cstdio>
+
+#include "general.h"
+#include "matrix.h"
+#include "params.h"
+#include "saving.h"
+
+#include "all_indices.h"
+
+namespace cvflann
+{
+/** Scope guard for a C stream: closes it on destruction. Non-copyable. */
+class FILEScopeGuard {
+public:
+    explicit FILEScopeGuard(FILE* file) : file_(file) {}
+
+    /** Closes the stream; a NULL stream is skipped because fclose(NULL) is undefined behaviour. */
+    ~FILEScopeGuard() { if (file_ != NULL) fclose(file_); }
+
+private:
+    // A copy would close the same stream twice: copy operations are declared but never defined.
+    FILEScopeGuard(const FILEScopeGuard&);
+    FILEScopeGuard& operator=(const FILEScopeGuard&);
+    FILE* file_;
+};
+
+
+/**
+ * Sets the log level used for all flann functions.
+ * @param level Verbosity level; a negative value leaves the current level unchanged
+ */
+inline void log_verbosity(int level)
+{
+    if (level < 0)
+        return;
+    Logger::setLevel(level);
+}
+
+/** (Deprecated) Index parameters selecting an index previously saved to a file:
+ *  stores FLANN_INDEX_SAVED under "algorithm" and the given path under "filename". */
+struct SavedIndexParams : public IndexParams
+{
+    SavedIndexParams(cv::String filename)
+    {
+        IndexParams& self = *this;
+        self["algorithm"] = FLANN_INDEX_SAVED;
+        self["filename"] = filename;
+    }
+};
+
+/** Loads the index saved in `filename` for `dataset`; returns NULL if the file cannot be opened, throws on a header mismatch. */
+template<typename Distance>
+NNIndex<Distance>* load_saved_index(const Matrix<typename Distance::ElementType>& dataset, const cv::String& filename, Distance distance)
+{
+    typedef typename Distance::ElementType ElementType;
+    FILE* fin = fopen(filename.c_str(), "rb");
+    if (fin == NULL) {
+        return NULL;
+    }
+    FILEScopeGuard fscgd(fin);  // closes fin on every exit path below, including throws
+
+    IndexHeader header = load_header(fin);
+    if (header.data_type != Datatype<ElementType>::type()) {
+        FLANN_THROW(cv::Error::StsError, "Datatype of saved index is different than of the one to be created.");
+    }
+    if ((size_t(header.rows) != dataset.rows)||(size_t(header.cols) != dataset.cols)) {
+        FLANN_THROW(cv::Error::StsError, "The index saved belongs to a different dataset");
+    }
+
+    IndexParams params;
+    params["algorithm"] = header.index_type;
+    NNIndex<Distance>* nnIndex = create_index_by_type<Distance>(dataset, params, distance);
+    try { nnIndex->loadIndex(fin); }
+    catch (...) { delete nnIndex; throw; }  // do not leak the freshly created index on a failed load
+    return nnIndex;
+}
+
+
+/** Front-end index: creates (or loads from file) the concrete NNIndex selected by the
+ *  "algorithm" parameter, owns it, and forwards every NNIndex call to it. */
+template<typename Distance>
+class Index : public NNIndex<Distance>
+{
+public:
+    typedef typename Distance::ElementType ElementType;
+    typedef typename Distance::ResultType DistanceType;
+
+    /**
+     * Creates the underlying index, or loads it from the "filename" parameter when "algorithm" is
+     * FLANN_INDEX_SAVED (then buildIndex() does nothing). Throws if that file cannot be opened.
+     */
+    Index(const Matrix<ElementType>& features, const IndexParams& params, Distance distance = Distance() )
+        : nnIndex_(NULL), loaded_(false), index_params_(params)
+    {
+        flann_algorithm_t index_type = get_param<flann_algorithm_t>(params,"algorithm");
+        if (index_type == FLANN_INDEX_SAVED) {
+            nnIndex_ = load_saved_index<Distance>(features, get_param<cv::String>(params,"filename"), distance);
+            if (nnIndex_ == NULL) {  // load_saved_index() returns NULL when fopen() fails
+                FLANN_THROW(cv::Error::StsError, "Cannot open saved index file");
+            }
+            loaded_ = true;
+        }
+        else {
+            nnIndex_ = create_index_by_type<Distance>(features, params, distance);
+        }
+    }
+
+    ~Index()
+    {
+        delete nnIndex_;
+    }
+
+    /** Builds the index, unless it was loaded from a file. */
+    void buildIndex() CV_OVERRIDE
+    {
+        if (!loaded_) {
+            nnIndex_->buildIndex();
+        }
+    }
+
+    /**
+     * Saves the index (header first, then the index data) to the given file.
+     * Throws if the file cannot be opened; the file is closed even if writing throws.
+     */
+    void save(cv::String filename)
+    {
+        FILE* fout = fopen(filename.c_str(), "wb");
+        if (fout == NULL) {
+            FLANN_THROW(cv::Error::StsError, "Cannot open file");
+        }
+        FILEScopeGuard guard(fout);  // replaces the trailing fclose(): also runs when saving throws
+        save_header(fout, *nnIndex_);
+        saveIndex(fout);
+    }
+
+    /**
+     * \brief Saves the index to a stream
+     * \param stream The stream to save the index to
+     */
+    virtual void saveIndex(FILE* stream) CV_OVERRIDE
+    {
+        nnIndex_->saveIndex(stream);
+    }
+
+    /**
+     * \brief Loads the index from a stream
+     * \param stream The stream from which the index is loaded
+     */
+    virtual void loadIndex(FILE* stream) CV_OVERRIDE
+    {
+        nnIndex_->loadIndex(stream);
+    }
+
+    /** \returns The dimensionality (vector length) of the features in this index. */
+    size_t veclen() const CV_OVERRIDE
+    {
+        return nnIndex_->veclen();
+    }
+
+    /** \returns number of features in this index. */
+    size_t size() const CV_OVERRIDE
+    {
+        return nnIndex_->size();
+    }
+
+    /** \returns The index type (kdtree, kmeans,...) */
+    flann_algorithm_t getType() const CV_OVERRIDE
+    {
+        return nnIndex_->getType();
+    }
+
+    /** \returns The amount of memory (in bytes) used by the index. */
+    virtual int usedMemory() const CV_OVERRIDE
+    {
+        return nnIndex_->usedMemory();
+    }
+
+    /**
+     * \returns The index parameters, as reported by the underlying index
+     */
+    IndexParams getParameters() const CV_OVERRIDE
+    {
+        return nnIndex_->getParameters();
+    }
+
+    /**
+     * \brief Perform k-nearest neighbor search
+     * \param[in] queries The query points for which to find the nearest neighbors
+     * \param[out] indices The indices of the nearest neighbors found
+     * \param[out] dists Distances to the nearest neighbors found
+     * \param[in] knn Number of nearest neighbors to return
+     * \param[in] params Search parameters
+     */
+    void knnSearch(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, int knn, const SearchParams& params) CV_OVERRIDE
+    {
+        nnIndex_->knnSearch(queries, indices, dists, knn, params);
+    }
+
+    /**
+     * \brief Perform radius search
+     * \param[in] query The query point
+     * \param[out] indices The indices of the neighbors found within the given radius
+     * \param[out] dists The distances to the nearest neighbors found
+     * \param[in] radius The radius used for search
+     * \param[in] params Search parameters
+     * \returns Number of neighbors found
+     */
+    int radiusSearch(const Matrix<ElementType>& query, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) CV_OVERRIDE
+    {
+        return nnIndex_->radiusSearch(query, indices, dists, radius, params);
+    }
+
+    /**
+     * \brief Method that searches for nearest-neighbours
+     */
+    void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
+    {
+        nnIndex_->findNeighbors(result, vec, searchParams);
+    }
+
+    /**
+     * \brief Returns actual index
+     */
+    CV_DEPRECATED NNIndex<Distance>* getIndex()
+    {
+        return nnIndex_;
+    }
+
+    /**
+     * \brief Returns the index parameters that were passed to the constructor.
+     * \deprecated use getParameters() instead.
+     */
+    CV_DEPRECATED  const IndexParams* getIndexParameters()
+    {
+        return &index_params_;
+    }
+
+private:
+    /** Pointer to actual index class */
+    NNIndex<Distance>* nnIndex_;
+    /** Indicates if the index was loaded from a file */
+    bool loaded_;
+    /** Parameters passed to the index */
+    IndexParams index_params_;
+
+    Index(const Index &); // copy disabled
+    Index& operator=(const Index &); // assign disabled
+};
+
+/**
+ * Clusters the given points hierarchically (by building a k-means index over them) and
+ * then takes a cut in the clustering tree to return a flat clustering.
+ * @param[in] points Points to be clustered
+ * @param centers The computed cluster centres. Matrix should be preallocated and centers.rows is the
+ *  number of clusters requested.
+ * @param params Clustering parameters (The same as for cvflann::KMeansIndex)
+ * @param d Distance to be used for clustering (eg: cvflann::L2)
+ * @return number of clusters computed (can be different than centers.rows and is the highest number
+ * of the form (branching-1)*K+1 smaller than centers.rows).
+ */
+template <typename Distance>
+int hierarchicalClustering(const Matrix<typename Distance::ElementType>& points, Matrix<typename Distance::CentersType>& centers,
+                           const KMeansIndexParams& params, Distance d = Distance())
+{
+    // Build a k-means index over the points, then read the cluster centres off it.
+    KMeansIndex<Distance> kmeansIndex(points, params, d);
+    kmeansIndex.buildIndex();
+
+    return kmeansIndex.getClusterCenters(centers);
+}
+
+}
+
+//! @endcond
+
+#endif /* OPENCV_FLANN_BASE_HPP_ */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/general.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/general.h
new file mode 100644
index 000000000000..e65cba2f8af4
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/general.h
@@ -0,0 +1,65 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_GENERAL_H_
+#define OPENCV_FLANN_GENERAL_H_
+
+#include "opencv2/core/version.hpp"
+
+#if CV_VERSION_MAJOR <= 4
+
+//! @cond IGNORED
+
+#include "opencv2/core.hpp"
+
+namespace cvflann
+{
+
+class FLANNException : public cv::Exception // exception type thrown by FLANN_THROW in the CV_VERSION_MAJOR <= 4 branch
+{
+public:
+    FLANNException(const char* message) : cv::Exception(0, message, "", __FILE__, __LINE__) { } // __FILE__/__LINE__ expand here, so they name this header, not the throw site
+    // Same construction, with the message supplied as a cv::String.
+    FLANNException(const cv::String& message) : cv::Exception(0, message, "", __FILE__, __LINE__) { }
+};
+
+}
+// FLANN_THROW for CV_VERSION_MAJOR <= 4: TYPE is not used; a FLANNException carrying STR is thrown.
+#define FLANN_THROW(TYPE, STR) throw FLANNException(STR)
+
+#else
+// FLANN_THROW for later major versions: both TYPE and STR are forwarded to CV_Error.
+#define FLANN_THROW(TYPE, STR) CV_Error(TYPE, STR)
+
+#endif
+
+//! @endcond
+
+#endif  /* OPENCV_FLANN_GENERAL_H_ */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/ground_truth.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/ground_truth.h
new file mode 100644
index 000000000000..17f2a8e848c3
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/ground_truth.h
@@ -0,0 +1,98 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_GROUND_TRUTH_H_
+#define OPENCV_FLANN_GROUND_TRUTH_H_
+
+//! @cond IGNORED
+
+#include "dist.h"
+#include "matrix.h"
+
+
+namespace cvflann
+{
+// Exhaustive search: stores in matches[0..nn) the dataset row indices ranked skip .. skip+nn-1 by ascending distance to query.
+template <typename Distance>
+void find_nearest(const Matrix<typename Distance::ElementType>& dataset, typename Distance::ElementType* query, int* matches, int nn,
+                  int skip = 0, Distance distance = Distance())
+{
+    typedef typename Distance::ResultType DistanceType;
+    int n = nn + skip; // length of the sorted candidate list kept while scanning the dataset
+    if (nn <= 0 || skip < 0 || dataset.rows == 0) return; // nothing can be ranked; keeps every [0] and [i+skip] access below in range
+    std::vector<int> match(n); // value-initialised: slots never filled (dataset.rows < n) stay 0
+    std::vector<DistanceType> dists(n);
+
+    dists[0] = distance(dataset[0], query, dataset.cols);
+    match[0] = 0;
+    int dcnt = 1; // number of valid entries in match/dists
+
+    for (size_t i=1; i<dataset.rows; ++i) {
+        DistanceType tmp = distance(dataset[i], query, dataset.cols);
+
+        if (dcnt<n) {
+            match[dcnt] = (int)i;
+            dists[dcnt++] = tmp;
+        }
+        else if (tmp < dists[dcnt-1]) { // list is full: replace the current worst entry only if row i is closer
+            dists[dcnt-1] = tmp;
+            match[dcnt-1] = (int)i;
+        }
+
+        int j = dcnt-1;
+        // bubble up
+        while (j>=1 && dists[j]<dists[j-1]) {
+            std::swap(dists[j],dists[j-1]);
+            std::swap(match[j],match[j-1]);
+            j--;
+        }
+    }
+
+    for (int i=0; i<nn; ++i) { // drop the first `skip` ranks and copy the next nn
+        matches[i] = match[i+skip];
+    }
+}
+
+// Fills row r of `matches` with the find_nearest() result for row r of `testset`, searched in `dataset`.
+template <typename Distance>
+void compute_ground_truth(const Matrix<typename Distance::ElementType>& dataset, const Matrix<typename Distance::ElementType>& testset, Matrix<int>& matches,
+                          int skip=0, Distance d = Distance())
+{
+    const int neighbours = (int)matches.cols; // each query asks for as many matches as the output matrix has columns
+    for (size_t row=0; row<testset.rows; ++row)
+        find_nearest<Distance>(dataset, testset[row], matches[row], neighbours, skip, d);
+}
+
+
+}
+
+//! @endcond
+
+#endif //OPENCV_FLANN_GROUND_TRUTH_H_
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/hdf5.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/hdf5.h
new file mode 100644
index 000000000000..75543840d60a
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/hdf5.h
@@ -0,0 +1,235 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+
+#ifndef OPENCV_FLANN_HDF5_H_
+#define OPENCV_FLANN_HDF5_H_
+
+//! @cond IGNORED
+
+#include <hdf5.h>
+
+#include "matrix.h"
+
+
+namespace cvflann
+{
+
+namespace // unnamed namespace: the helpers below get internal linkage in every file that includes this header
+{
+// Returns the HDF5 datatype id for element type T; this generic version serves any T without a specialization and always throws.
+template<typename T>
+hid_t get_hdf5_type()
+{
+    throw FLANNException("Unsupported type for IO operations");
+}
+// Supported element types: each explicit specialization returns the corresponding H5T_NATIVE_* identifier.
+template<>
+hid_t get_hdf5_type<char>() { return H5T_NATIVE_CHAR; }
+template<>
+hid_t get_hdf5_type<unsigned char>() { return H5T_NATIVE_UCHAR; }
+template<>
+hid_t get_hdf5_type<short int>() { return H5T_NATIVE_SHORT; }
+template<>
+hid_t get_hdf5_type<unsigned short int>() { return H5T_NATIVE_USHORT; }
+template<>
+hid_t get_hdf5_type<int>() { return H5T_NATIVE_INT; }
+template<>
+hid_t get_hdf5_type<unsigned int>() { return H5T_NATIVE_UINT; }
+template<>
+hid_t get_hdf5_type<long>() { return H5T_NATIVE_LONG; }
+template<>
+hid_t get_hdf5_type<unsigned long>() { return H5T_NATIVE_ULONG; }
+template<>
+hid_t get_hdf5_type<float>() { return H5T_NATIVE_FLOAT; }
+template<>
+hid_t get_hdf5_type<double>() { return H5T_NATIVE_DOUBLE; }
+}
+
+
+#define CHECK_ERROR(x,y) if ((x)<0) throw FLANNException((y));
+// Writes `dataset` (dataset.rows x dataset.cols elements of type T) to the dataset `name` inside the HDF5 file `filename`.
+template<typename T>
+void save_to_file(const cvflann::Matrix<T>& dataset, const String& filename, const String& name)
+{
+    // H5Eset_auto is called with NULL arguments - presumably to switch off HDF5's automatic error printing (TODO confirm).
+#if H5Eset_auto_vers == 2
+    H5Eset_auto( H5E_DEFAULT, NULL, NULL );
+#else
+    H5Eset_auto( NULL, NULL );
+#endif
+    // Open the file read-write; if that fails, try to create it instead.
+    herr_t status;
+    hid_t file_id;
+    file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR, H5P_DEFAULT);
+    if (file_id < 0) {
+        file_id = H5Fcreate(filename.c_str(), H5F_ACC_EXCL, H5P_DEFAULT, H5P_DEFAULT);
+    }
+    CHECK_ERROR(file_id,"Error creating hdf5 file.");
+
+    hsize_t     dimsf[2];              // dataset dimensions
+    dimsf[0] = dataset.rows;
+    dimsf[1] = dataset.cols;
+
+    hid_t space_id = H5Screate_simple(2, dimsf, NULL); // the same 2-D extent is used for the file side ...
+    hid_t memspace_id = H5Screate_simple(2, dimsf, NULL); // ... and for the in-memory side of the write
+
+    hid_t dataset_id;
+#if H5Dcreate_vers == 2
+    dataset_id = H5Dcreate2(file_id, name.c_str(), get_hdf5_type<T>(), space_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+#else
+    dataset_id = H5Dcreate(file_id, name.c_str(), get_hdf5_type<T>(), space_id, H5P_DEFAULT);
+#endif
+    // If creation failed, fall back to opening a dataset of that name.
+    if (dataset_id<0) {
+#if H5Dopen_vers == 2
+        dataset_id = H5Dopen2(file_id, name.c_str(), H5P_DEFAULT);
+#else
+        dataset_id = H5Dopen(file_id, name.c_str());
+#endif
+    }
+    CHECK_ERROR(dataset_id,"Error creating or opening dataset in file.");
+    // NOTE(review): when a CHECK_ERROR in this function throws, the H5*close calls below are skipped, so handles opened so far are not released.
+    status = H5Dwrite(dataset_id, get_hdf5_type<T>(), memspace_id, space_id, H5P_DEFAULT, dataset.data );
+    CHECK_ERROR(status, "Error writing to dataset");
+
+    H5Sclose(memspace_id);
+    H5Sclose(space_id);
+    H5Dclose(dataset_id);
+    H5Fclose(file_id);
+
+}
+
+// Reads the dataset `name` from the HDF5 file `filename` into a newly allocated buffer and assigns the resulting Matrix to `dataset`.
+template<typename T>
+void load_from_file(cvflann::Matrix<T>& dataset, const String& filename, const String& name)
+{
+    herr_t status;
+    hid_t file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR, H5P_DEFAULT); // opened read-write although this function only reads
+    CHECK_ERROR(file_id,"Error opening hdf5 file.");
+
+    hid_t dataset_id;
+#if H5Dopen_vers == 2
+    dataset_id = H5Dopen2(file_id, name.c_str(), H5P_DEFAULT);
+#else
+    dataset_id = H5Dopen(file_id, name.c_str());
+#endif
+    CHECK_ERROR(dataset_id,"Error opening dataset in file.");
+
+    hid_t space_id = H5Dget_space(dataset_id);
+    // dims_out has room for two dimensions only - assumes the stored dataset is 2-D (TODO confirm against the writer).
+    hsize_t dims_out[2];
+    H5Sget_simple_extent_dims(space_id, dims_out, NULL);
+    // The buffer comes from new[] and is handed to the Matrix; nothing in this function frees it.
+    dataset = cvflann::Matrix<T>(new T[dims_out[0]*dims_out[1]], dims_out[0], dims_out[1]);
+
+    status = H5Dread(dataset_id, get_hdf5_type<T>(), H5S_ALL, H5S_ALL, H5P_DEFAULT, dataset[0]);
+    CHECK_ERROR(status, "Error reading dataset");
+    // NOTE(review): a throwing CHECK_ERROR above skips these close calls (and leaves the new[] buffer in `dataset`).
+    H5Sclose(space_id);
+    H5Dclose(dataset_id);
+    H5Fclose(file_id);
+}
+
+
+#ifdef HAVE_MPI
+
+namespace mpi
+{
+/**
+ * Loads the hyperslab (block of consecutive rows) assigned to this MPI rank from a hdf5 file.
+ * @param dataset Matrix that receives the loaded rows (rows, cols and data are overwritten)
+ * @param filename HDF5 file name
+ * @param name Name of dataset inside file
+ */
+template<typename T>
+void load_from_file(cvflann::Matrix<T>& dataset, const String& filename, const String& name)
+{
+    MPI_Comm comm  = MPI_COMM_WORLD;
+    MPI_Info info  = MPI_INFO_NULL;
+
+    int mpi_size, mpi_rank;
+    MPI_Comm_size(comm, &mpi_size);
+    MPI_Comm_rank(comm, &mpi_rank);
+
+    herr_t status;
+
+    hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); // file-access property list carrying the MPI communicator
+    H5Pset_fapl_mpio(plist_id, comm, info);
+    hid_t file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR, plist_id);
+    CHECK_ERROR(file_id,"Error opening hdf5 file."); // NOTE(review): if this throws, plist_id is never closed
+    H5Pclose(plist_id);
+    hid_t dataset_id;
+#if H5Dopen_vers == 2
+    dataset_id = H5Dopen2(file_id, name.c_str(), H5P_DEFAULT);
+#else
+    dataset_id = H5Dopen(file_id, name.c_str());
+#endif
+    CHECK_ERROR(dataset_id,"Error opening dataset in file.");
+
+    hid_t space_id = H5Dget_space(dataset_id);
+    hsize_t dims[2]; // room for two dimensions only - assumes a 2-D dataset (TODO confirm)
+    H5Sget_simple_extent_dims(space_id, dims, NULL);
+
+    hsize_t count[2];
+    hsize_t offset[2];
+    // item_cnt = ceil(dims[0] / mpi_size) rows per rank; the last rank takes whatever is left.
+    hsize_t item_cnt = dims[0]/mpi_size+(dims[0]%mpi_size==0 ? 0 : 1);
+    hsize_t cnt = (mpi_rank<mpi_size-1 ? item_cnt : dims[0]-item_cnt*(mpi_size-1));
+    // NOTE(review): item_cnt*(mpi_size-1) can exceed dims[0] (e.g. 5 rows on 4 ranks -> 2*3 = 6), so the unsigned subtraction above may wrap - verify.
+    count[0] = cnt;
+    count[1] = dims[1];
+    offset[0] = mpi_rank*item_cnt;
+    offset[1] = 0;
+
+    hid_t memspace_id = H5Screate_simple(2,count,NULL);
+
+    H5Sselect_hyperslab(space_id, H5S_SELECT_SET, offset, NULL, count, NULL);
+
+    dataset.rows = count[0];
+    dataset.cols = count[1];
+    dataset.data = new T[dataset.rows*dataset.cols]; // raw new[]; not released anywhere in this function
+    // Transfer property list that requests H5FD_MPIO_COLLECTIVE mode for the read.
+    plist_id = H5Pcreate(H5P_DATASET_XFER);
+    H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE);
+    status = H5Dread(dataset_id, get_hdf5_type<T>(), memspace_id, space_id, plist_id, dataset.data);
+    CHECK_ERROR(status, "Error reading dataset");
+
+    H5Pclose(plist_id);
+    H5Sclose(space_id);
+    H5Sclose(memspace_id);
+    H5Dclose(dataset_id);
+    H5Fclose(file_id);
+}
+}
+#endif // HAVE_MPI
+} // namespace cvflann
+
+//! @endcond
+
+#endif /* OPENCV_FLANN_HDF5_H_ */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/heap.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/heap.h
new file mode 100644
index 000000000000..8cace2044973
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/heap.h
@@ -0,0 +1,244 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_HEAP_H_
+#define OPENCV_FLANN_HEAP_H_
+
+//! @cond IGNORED
+
+#include <algorithm>
+#include <vector>
+
+#include <unordered_map>
+
+namespace cvflann
+{
+
+// TODO: Define x > y operator and use std::greater<T> instead
+template <typename T>
+struct greater // "x is greater than y" predicate; Heap passes it to the std heap algorithms
+{
+    bool operator()(const T& x, const T& y) const
+    {
+        return y < x; // written with operator< so that is the only comparison T has to provide
+    }
+};
+
+/**
+ * Priority Queue Implementation
+ *
+ * The priority queue is a min-heap kept in a std::vector and maintained with the
+ * std heap algorithms and the greater<T> comparator, so the smallest element is
+ * always at the front. Type T only needs to support operator<.
+ */
+template <typename T>
+class Heap
+{
+    /**
+     * Storage array for the heap.
+     * Type T must be comparable.
+     */
+    std::vector<T> heap;
+public:
+    /**
+     * \brief Constructs a heap with a pre-allocated capacity
+     *
+     * \param capacity heap maximum capacity (passed to reserve())
+     */
+    Heap(const int capacity)
+    {
+        reserve(capacity);
+    }
+
+    /**
+     * \brief Move-constructs a heap from an external vector
+     *
+     * \param vec external vector; its elements are taken over and rearranged into heap order
+     */
+    Heap(std::vector<T>&& vec)
+        : heap(std::move(vec))
+    {
+        std::make_heap(heap.begin(), heap.end(), greater<T>());
+    }
+
+    /**
+     * \brief Returns the number of elements currently stored.
+     * \returns heap size
+     */
+    int size() const
+    {
+        return (int)heap.size();
+    }
+
+    /**
+     * \brief Returns the capacity of the underlying vector.
+     * \returns heap capacity
+     */
+    int capacity() const
+    {
+        return (int)heap.capacity();
+    }
+
+    /**
+     * \brief Tests if the heap is empty
+     *
+     * \returns true if the heap is empty, false otherwise
+     */
+    bool empty() const
+    {
+        return heap.empty();
+    }
+
+    /**
+     * \brief Clears the heap.
+     */
+    void clear()
+    {
+        heap.clear();
+    }
+
+    /**
+     * \brief Sets the heap maximum capacity.
+     *
+     * \param capacity heap maximum capacity (forwarded to std::vector::reserve)
+     */
+    void reserve(const int capacity)
+    {
+        heap.reserve(capacity);
+    }
+
+    /**
+     * \brief Inserts a new element in the heap.
+     *
+     * The element is appended and sifted into place with std::push_heap. When size() equals
+     * capacity() (the capacity of the underlying vector) the element is silently dropped.
+     *
+     * \param value the new element to be inserted in the heap
+     */
+    void insert(T value)
+    {
+        /* If heap is full, then return without adding this element. */
+        if (size() == capacity()) {
+            return;
+        }
+
+        heap.push_back(value);
+        std::push_heap(heap.begin(), heap.end(), greater<T>());
+    }
+
+    /**
+     * \brief Removes the node of minimum value from the heap (top of the heap) and returns it.
+     *
+     * \param[out] value parameter used to return the min element; left untouched if the heap is empty
+     * \returns false if heap empty
+     */
+    bool popMin(T& value)
+    {
+        if (empty()) {
+            return false;
+        }
+
+        value = heap[0];
+        std::pop_heap(heap.begin(), heap.end(), greater<T>());
+        heap.pop_back();
+
+        return true;  /* value now holds the former minimum */
+    }
+
+    /**
+     * \brief Returns a shared heap for the given memory pool ID.
+     *
+     * It constructs the heap if it does not already exist; an existing heap is cleared before reuse.
+     *
+     * \param poolId a user-chosen hashable ID for identifying the heap.
+     *     For thread-safe operations, using current thread ID is a good choice.
+     * \param capacity heap maximum capacity
+     * \param iterThreshold remove heaps that were not reused for more than specified iterations count
+     *        if iterThreshold value is less than 2, it will be internally adjusted to twice the number of CPU threads
+     * \returns pointer to the heap
+     */
+    template <typename HashableT>
+    static cv::Ptr<Heap<T>> getPooledInstance(
+        const HashableT& poolId, const int capacity, int iterThreshold = 0)
+    {
+        static cv::Mutex mutex;
+        const cv::AutoLock lock(mutex); // the pool below is shared, so the whole call runs under this lock
+
+        struct HeapMapValueType {
+            cv::Ptr<Heap<T>> heapPtr;
+            int iterCounter; // calls since this heap was last handed out
+        };
+        typedef std::unordered_map<HashableT, HeapMapValueType> HeapMapType;
+
+        static HeapMapType heapsPool;
+        typename HeapMapType::iterator heapIt = heapsPool.find(poolId);
+
+        if (heapIt == heapsPool.end())
+        {
+            // Construct the heap as it does not already exist
+            HeapMapValueType heapAndTimePair = {cv::makePtr<Heap<T>>(capacity), 0};
+            const std::pair<typename HeapMapType::iterator, bool>& emplaceResult = heapsPool.emplace(poolId, std::move(heapAndTimePair));
+            CV_CheckEQ(static_cast<int>(emplaceResult.second), 1, "Failed to insert the heap into its memory pool");
+            heapIt = emplaceResult.first;
+        }
+        else
+        {
+            CV_CheckEQ(heapIt->second.heapPtr.use_count(), 1, "Cannot modify a heap that is currently accessed by another caller");
+            heapIt->second.heapPtr->clear();
+            heapIt->second.heapPtr->reserve(capacity);
+            heapIt->second.iterCounter = 0;
+        }
+
+        if (iterThreshold <= 1) {
+            iterThreshold = 2 * cv::getNumThreads();
+        }
+
+        // Remove heaps that were not reused for more than given iterThreshold
+        typename HeapMapType::iterator cleanupIt = heapsPool.begin();
+        while (cleanupIt != heapsPool.end())
+        {
+            if (cleanupIt->second.iterCounter++ > iterThreshold)
+            {
+                CV_Assert(cleanupIt != heapIt); // the requested heap's counter was just set to 0, so it must not be selected here
+                cleanupIt = heapsPool.erase(cleanupIt);
+                continue;
+            }
+            ++cleanupIt;
+        }
+
+        return heapIt->second.heapPtr;
+    }
+};
+
+}
+
+//! @endcond
+
+#endif //OPENCV_FLANN_HEAP_H_
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/hierarchical_clustering_index.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/hierarchical_clustering_index.h
new file mode 100644
index 000000000000..60662e7714b3
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/hierarchical_clustering_index.h
@@ -0,0 +1,846 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2011  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2011  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_
+#define OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_
+
+//! @cond IGNORED
+
+#include <algorithm>
+#include <map>
+#include <limits>
+#include <cmath>
+
+#include "general.h"
+#include "nn_index.h"
+#include "dist.h"
+#include "matrix.h"
+#include "result_set.h"
+#include "heap.h"
+#include "allocator.h"
+#include "random.h"
+#include "saving.h"
+
+
+namespace cvflann
+{
+// Index parameters tagged with algorithm FLANN_INDEX_HIERARCHICAL; the constructor stores each argument under a string key.
+struct HierarchicalClusteringIndexParams : public IndexParams
+{
+    HierarchicalClusteringIndexParams(int branching = 32,
+                                      flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM,
+                                      int trees = 4, int leaf_size = 100)
+    {
+        (*this)["algorithm"] = FLANN_INDEX_HIERARCHICAL; // fixed value, not taken from an argument
+        // The branching factor used in the hierarchical clustering
+        (*this)["branching"] = branching;
+        // Algorithm used for picking the initial cluster centers
+        (*this)["centers_init"] = centers_init;
+        // number of parallel trees to build
+        (*this)["trees"] = trees;
+        // maximum leaf size
+        (*this)["leaf_size"] = leaf_size;
+    }
+};
+
+
+/**
+ * Hierarchical index
+ *
+ * Contains a tree constructed through a hierarchical clustering
+ * and other information for indexing a set of points for nearest-neighbour matching.
+ */
+template <typename Distance>
+class HierarchicalClusteringIndex : public NNIndex<Distance>
+{
+public:
+    typedef typename Distance::ElementType ElementType;
+    typedef typename Distance::ResultType DistanceType;
+
+private:
+
+
+    typedef void (HierarchicalClusteringIndex::* centersAlgFunction)(int, int*, int, int*, int&);
+
+    /**
+     * The function used for choosing the cluster centers.
+     */
+    centersAlgFunction chooseCenters;
+
+
+
+    /**
+     * Chooses the initial centers in the k-means clustering in a random manner.
+     *
+     * Params:
+     *     k = number of centers requested
+     *     dsindices = candidate points, given as indices into the dataset
+     *     indices_length = number of entries in dsindices
+     *     centers = output: dataset indices of the chosen centers
+     *     centers_length = output: number of centers actually chosen (can be less than k)
+     */
+    void chooseCentersRandom(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
+    {
+        UniqueRandom r(indices_length);
+
+        int index;
+        for (index=0; index<k; ++index) {
+            bool duplicate = true;
+            int rnd;
+            while (duplicate) { // keep drawing until the candidate differs from every center chosen so far
+                duplicate = false;
+                rnd = r.next();
+                if (rnd<0) { // presumably next() goes negative once all values were handed out - TODO confirm in UniqueRandom
+                    centers_length = index;
+                    return;
+                }
+
+                centers[index] = dsindices[rnd];
+
+                for (int j=0; j<index; ++j) {
+                    DistanceType sq = distance(dataset[centers[index]], dataset[centers[j]], dataset.cols);
+                    if (sq<1e-16) { // (almost) zero distance to center j: treat the candidate as a duplicate
+                        duplicate = true;
+                    }
+                }
+            }
+        }
+
+        centers_length = index;
+    }
+
+
+    /**
+     * Chooses the initial centers in the k-means using Gonzales' algorithm
+     * so that the centers are spaced apart from each other.
+     *
+     * Params:
+     *     k = number of centers requested
+     *     dsindices, indices_length = candidate points (indices into the dataset) and their count
+     *     centers = output: dataset indices of the chosen centers
+     *     centers_length = output: number of centers chosen (less than k if the search stops early)
+     */
+    void chooseCentersGonzales(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
+    {
+        int n = indices_length;
+
+        int rnd = rand_int(n);
+        CV_DbgAssert(rnd >=0 && rnd < n);
+
+        centers[0] = dsindices[rnd]; // the first center is a random candidate
+
+        int index;
+        for (index=1; index<k; ++index) {
+
+            int best_index = -1;
+            DistanceType best_val = 0;
+            for (int j=0; j<n; ++j) {
+                DistanceType dist = distance(dataset[centers[0]],dataset[dsindices[j]],dataset.cols);
+                for (int i=1; i<index; ++i) { // dist becomes the distance from candidate j to its nearest chosen center
+                    DistanceType tmp_dist = distance(dataset[centers[i]],dataset[dsindices[j]],dataset.cols);
+                    if (tmp_dist<dist) {
+                        dist = tmp_dist;
+                    }
+                }
+                if (dist>best_val) { // strict comparison: candidates at distance 0 from the centers are never picked
+                    best_val = dist;
+                    best_index = j;
+                }
+            }
+            if (best_index!=-1) {
+                centers[index] = dsindices[best_index];
+            }
+            else { // no candidate lies at a positive distance from the chosen centers: stop with fewer than k
+                break;
+            }
+        }
+        centers_length = index;
+    }
+
+
+    /**
+     * Chooses the initial centers in the k-means using the algorithm
+     * proposed in the KMeans++ paper:
+     * Arthur, David; Vassilvitskii, Sergei - k-means++: The Advantages of Careful Seeding
+     *
+     * Implementation of this function was converted from the one provided in Arthur's code.
+     *
+     * Params:
+     *     k = number of centers requested
+     *     dsindices, indices_length = candidate points (indices into the dataset) and their count
+     *     centers = output: dataset indices of the chosen centers
+     *     centers_length = output: number of centers written (final value of the center loop counter)
+     */
+    void chooseCentersKMeanspp(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
+    {
+        int n = indices_length;
+
+        double currentPot = 0; // running sum of closestDistSq over all candidates
+        DistanceType* closestDistSq = new DistanceType[n]; // scratch array, released by the delete[] at the end of the function
+
+        // Choose one random center and set the closestDistSq values
+        int index = rand_int(n);
+        CV_DbgAssert(index >=0 && index < n);
+        centers[0] = dsindices[index];
+
+        // Computing distance^2 will have the advantage of even higher probability further to pick new centers
+        // far from previous centers (and this complies to "k-means++: the advantages of careful seeding" article)
+        for (int i = 0; i < n; i++) {
+            closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
+            closestDistSq[i] = ensureSquareDistance<Distance>( closestDistSq[i] );
+            currentPot += closestDistSq[i];
+        }
+
+        // With a single local try, the "best of several trials" logic below always keeps the one sampled candidate.
+        const int numLocalTries = 1;
+
+        // Choose each center
+        int centerCount;
+        for (centerCount = 1; centerCount < k; centerCount++) {
+
+            // Repeat several trials
+            double bestNewPot = -1; // negative means "no trial recorded yet"
+            int bestNewIndex = 0;
+            for (int localTrial = 0; localTrial < numLocalTries; localTrial++) {
+
+                // Choose our center - have to be slightly careful to return a valid answer even accounting
+                // for possible rounding errors
+                double randVal = rand_double(currentPot);
+                for (index = 0; index < n-1; index++) { // stops at n-1 so index stays a valid position in any case
+                    if (randVal <= closestDistSq[index]) break;
+                    else randVal -= closestDistSq[index];
+                }
+
+                // Compute the new potential
+                double newPot = 0;
+                for (int i = 0; i < n; i++) {
+                    DistanceType dist = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
+                    newPot += std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] );
+                }
+
+                // Store the best result
+                if ((bestNewPot < 0)||(newPot < bestNewPot)) {
+                    bestNewPot = newPot;
+                    bestNewIndex = index;
+                }
+            }
+
+            // Add the appropriate center
+            centers[centerCount] = dsindices[bestNewIndex];
+            currentPot = bestNewPot;
+            for (int i = 0; i < n; i++) { // refresh each candidate's distance to its nearest chosen center
+                DistanceType dist = distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols);
+                closestDistSq[i] = std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] );
+            }
+        }
+
+        centers_length = centerCount;
+
+        delete[] closestDistSq;
+    }
+
+
+    /**
+     * Chooses the initial centers in a way inspired by Gonzales (by Pierre-Emmanuel Viel):
+     * the first center is drawn at random. For each further center the points are scanned in order;
+     * a point is tested only if it is further from the chosen centers than kSpeedUpFactor times the
+     * current candidate, and it replaces the candidate if the total potential does not get worse.
+     *
+     * Used with KMeansIndex that computes centers coordinates by averaging positions of clusters points,
+     * this doesn't make a real difference with previous methods. But used with HierarchicalClusteringIndex
+     * class that pick centers among existing points instead of computing the barycenters, there is a real
+     * improvement.
+     *
+     * Params:
+     *     k = number of centers
+     *     dsindices, indices_length = dataset indices of the points to choose from, and their count
+     *     centers = output array (at least k entries) receiving the dataset indices of the centers
+     * Returns: centers_length = number of centers written (the final value of centerCount)
+     */
+    void GroupWiseCenterChooser(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
+    {
+        const float kSpeedUpFactor = 1.3f;
+
+        int n = indices_length;
+
+        DistanceType* closestDistSq = new DistanceType[n];
+
+        // Choose one random center and set the closestDistSq values
+        int index = rand_int(n);
+        CV_DbgAssert(index >=0 && index < n);
+        centers[0] = dsindices[index];
+
+        for (int i = 0; i < n; i++) {
+            closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
+        }
+
+
+        // Choose each center
+        int centerCount;
+        for (centerCount = 1; centerCount < k; centerCount++) {
+
+            // Scan all points for the candidate that yields the lowest potential
+            double bestNewPot = -1;
+            int bestNewIndex = 0;
+            DistanceType furthest = 0;
+            for (index = 0; index < n; index++) {
+
+                // We will test only the potential of the points further than current candidate
+                if( closestDistSq[index] > kSpeedUpFactor * (float)furthest ) {
+
+                    // Compute the new potential: sum over all points of the distance to their closest center
+                    double newPot = 0;
+                    for (int i = 0; i < n; i++) {
+                        newPot += std::min( distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols)
+                                            , closestDistSq[i] );
+                    }
+
+                    // Store the best result (bestNewPot < 0 means "none yet"; on a tie the later point wins)
+                    if ((bestNewPot < 0)||(newPot <= bestNewPot)) {
+                        bestNewPot = newPot;
+                        bestNewIndex = index;
+                        furthest = closestDistSq[index];
+                    }
+                }
+            }
+
+            // Add the appropriate center and refresh every point's distance to its closest center
+            centers[centerCount] = dsindices[bestNewIndex];
+            for (int i = 0; i < n; i++) {
+                closestDistSq[i] = std::min( distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols)
+                                             , closestDistSq[i] );
+            }
+        }
+
+        centers_length = centerCount;
+
+        delete[] closestDistSq;
+    }
+
+
+public:
+
+
+    /**
+     * Index constructor: reads the parameters and selects the center chooser (no tree is built).
+     * Params:
+     *          inputData = dataset with the input features
+     *          index_params = parameters passed to the hierarchical k-means algorithm
+     *          d = distance functor, stored in the distance member
+     */
+    HierarchicalClusteringIndex(const Matrix<ElementType>& inputData, const IndexParams& index_params = HierarchicalClusteringIndexParams(),
+                                Distance d = Distance())
+        : dataset(inputData), params(index_params), root(NULL), indices(NULL), distance(d)
+    {
+        memoryCounter = 0;
+
+        size_ = dataset.rows;
+        veclen_ = dataset.cols;
+
+        branching_ = get_param(params,"branching",32); // third argument is presumably the fallback for a missing key — confirm in get_param
+        centers_init_ = get_param(params,"centers_init", FLANN_CENTERS_RANDOM);
+        trees_ = get_param(params,"trees",4);
+        leaf_size_ = get_param(params,"leaf_size",100);
+        // Bind the member function that computeClustering() later calls through chooseCenters
+        if (centers_init_==FLANN_CENTERS_RANDOM) {
+            chooseCenters = &HierarchicalClusteringIndex::chooseCentersRandom;
+        }
+        else if (centers_init_==FLANN_CENTERS_GONZALES) {
+            chooseCenters = &HierarchicalClusteringIndex::chooseCentersGonzales;
+        }
+        else if (centers_init_==FLANN_CENTERS_KMEANSPP) {
+            chooseCenters = &HierarchicalClusteringIndex::chooseCentersKMeanspp;
+        }
+        else if (centers_init_==FLANN_CENTERS_GROUPWISE) {
+            chooseCenters = &HierarchicalClusteringIndex::GroupWiseCenterChooser;
+        }
+        else {
+            FLANN_THROW(cv::Error::StsError, "Unknown algorithm for choosing initial centers.");
+        }
+        // One root pointer and one index array per tree; both stay NULL until buildIndex() or loadIndex()
+        root = new NodePtr[trees_];
+        indices = new int*[trees_];
+
+        for (int i=0; i<trees_; ++i) {
+            root[i] = NULL;
+            indices[i] = NULL;
+        }
+    }
+
+    HierarchicalClusteringIndex(const HierarchicalClusteringIndex&); // NOTE(review): no definition visible in this chunk — presumably declared only to block copying; confirm
+    HierarchicalClusteringIndex& operator=(const HierarchicalClusteringIndex&); // NOTE(review): same as above — confirm
+
+    /**
+     * Index destructor.
+     *
+     * Releases the root-pointer array, every per-tree index array and the array that holds them.
+     */
+    virtual ~HierarchicalClusteringIndex()
+    {
+        if (root!=NULL) {
+            delete[] root; // only the pointer array; the nodes themselves are allocated from pool
+        }
+
+        if (indices!=NULL) {
+            free_indices(); // inner arrays first, then the outer array
+            delete[] indices;
+        }
+    }
+
+    /**
+     *  Returns the size of the index, i.e. the number of rows of the dataset given to the constructor.
+     */
+    size_t size() const CV_OVERRIDE
+    {
+        return size_;
+    }
+
+    /**
+     * Returns the length of an index feature, i.e. the number of columns of the dataset.
+     */
+    size_t veclen() const CV_OVERRIDE
+    {
+        return veclen_;
+    }
+
+
+    /**
+     * Computes the index memory usage
+     * Returns: memory used by the index (pool.usedMemory + pool.wastedMemory + memoryCounter)
+     */
+    int usedMemory() const CV_OVERRIDE
+    {
+        return pool.usedMemory+pool.wastedMemory+memoryCounter;
+    }
+
+    /**
+     * Builds the index: every tree gets its own array holding the row indices 0..size_-1, which is then clustered recursively.
+     */
+    void buildIndex() CV_OVERRIDE
+    {
+        if (branching_<2) {
+            FLANN_THROW(cv::Error::StsError, "Branching factor must be at least 2");
+        }
+
+        free_indices(); // drop index arrays left over from an earlier build or load
+
+        for (int i=0; i<trees_; ++i) {
+            indices[i] = new int[size_];
+            for (size_t j=0; j<size_; ++j) {
+                indices[i][j] = (int)j;
+            }
+            root[i] = pool.allocate<Node>();
+            computeClustering(root[i], indices[i], (int)size_, branching_,0); // level 0 = root
+        }
+    }
+
+    /** Returns the algorithm tag of this index, FLANN_INDEX_HIERARCHICAL. */
+    flann_algorithm_t getType() const CV_OVERRIDE
+    {
+        return FLANN_INDEX_HIERARCHICAL;
+    }
+
+    /** Writes the index to stream: the four parameters, memoryCounter, then per tree its index array followed by its nodes. */
+    void saveIndex(FILE* stream) CV_OVERRIDE
+    {
+        save_value(stream, branching_);
+        save_value(stream, trees_);
+        save_value(stream, centers_init_);
+        save_value(stream, leaf_size_);
+        save_value(stream, memoryCounter);
+        for (int i=0; i<trees_; ++i) {
+            save_value(stream, *indices[i], size_); // size_ entries starting at indices[i][0]
+            save_tree(stream, root[i], i);
+        }
+        // size_ and veclen_ are not written here; loadIndex() keeps the values set by the constructor
+    }
+
+    /** Replaces the current trees by the ones read from stream (same layout as saveIndex) and mirrors the loaded values into params. */
+    void loadIndex(FILE* stream) CV_OVERRIDE
+    {
+        // Release the old trees first. The pointers are nulled immediately so that an
+        // exception from the reads below cannot lead to a double delete in the destructor.
+        delete[] root;
+        root = NULL;
+
+        free_indices(); // still uses the old trees_; does nothing when indices is NULL
+        delete[] indices;
+        indices = NULL;
+
+        load_value(stream, branching_);
+        load_value(stream, trees_);
+        load_value(stream, centers_init_);
+        load_value(stream, leaf_size_);
+        load_value(stream, memoryCounter);
+
+        indices = new int*[trees_](); // value-initialised: entries not loaded yet stay NULL for free_indices()
+        root = new NodePtr[trees_]();
+        for (int i=0; i<trees_; ++i) {
+            indices[i] = new int[size_];
+            load_value(stream, *indices[i], size_);
+            load_tree(stream, root[i], i);
+        }
+
+        params["algorithm"] = getType();
+        params["branching"] = branching_;
+        params["trees"] = trees_;
+        params["centers_init"] = centers_init_;
+        params["leaf_size"] = leaf_size_;
+    }
+
+
+    /**
+     * Find set of nearest neighbors to vec. Their indices are stored inside
+     * the result object.
+     *
+     * Params:
+     *     result = the result object in which the indices of the nearest-neighbors are stored
+     *     vec = the vector for which to search the nearest neighbors
+     *     searchParams = parameters that influence the search algorithm (checks, explore_all_trees)
+     */
+    void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
+    {
+        // Search budget and tree-exploration mode are read from searchParams
+        const int maxChecks = get_param(searchParams,"checks",32);
+        const bool explore_all_trees = get_param(searchParams,"explore_all_trees",false);
+
+        // Priority queue storing intermediate branches in the best-bin-first search
+        const cv::Ptr<Heap<BranchSt>>& heap = Heap<BranchSt>::getPooledInstance(cv::utils::getThreadID(), (int)size_);
+
+        std::vector<bool> checked(size_,false); // one flag per dataset point, so findNN() tests each point at most once
+        int checks = 0;
+        for (int i=0; i<trees_; ++i) {
+            findNN(root[i], result, vec, checks, maxChecks, heap, checked, explore_all_trees);
+            if (!explore_all_trees && (checks >= maxChecks) && result.full())
+                break;
+        }
+        // Keep expanding queued branches until the heap is empty, or the budget is spent and the result set is full
+        BranchSt branch;
+        while (heap->popMin(branch) && (checks<maxChecks || !result.full())) {
+            NodePtr node = branch.node;
+            findNN(node, result, vec, checks, maxChecks, heap, checked, false);
+        }
+
+        CV_Assert(result.full()); // the search is expected to have filled the result set
+    }
+
+    IndexParams getParameters() const CV_OVERRIDE
+    {
+        return params; // returned by value, so the caller receives a copy
+    }
+
+
+private:
+
+    /**
+     * Structure representing a node in the hierarchical k-means tree.
+     */
+    struct Node
+    {
+        /**
+         * The cluster center: a row index into dataset (assigned for child nodes in computeClustering)
+         */
+        int pivot;
+        /**
+         * The cluster size (number of points in the cluster)
+         */
+        int size;
+        /**
+         * Child nodes (only for non-terminal nodes); NULL marks a terminal node
+         */
+        Node** childs;
+        /**
+         * Node points (only for terminal nodes): points into the index array of the owning tree
+         */
+        int* indices;
+        /**
+         * Level: depth of the node, 0 for a root
+         */
+        int level;
+    };
+    typedef Node* NodePtr;
+
+
+
+    /**
+     * Alias definition for a nicer syntax.
+     */
+    typedef BranchStruct<NodePtr, DistanceType> BranchSt;
+
+
+    /** Writes node and, depth first, its subtree: a leaf is followed by its offset into indices[num], an inner node by its branching_ children. */
+    void save_tree(FILE* stream, NodePtr node, int num)
+    {
+        save_value(stream, *node); // the whole Node object, pointer members included
+        if (node->childs==NULL) {
+            int indices_offset = (int)(node->indices - indices[num]); // position of the leaf's slice inside tree num's index array
+            save_value(stream, indices_offset);
+        }
+        else {
+            for(int i=0; i<branching_; ++i) {
+                save_tree(stream, node->childs[i], num);
+            }
+        }
+    }
+
+    /** Reads back a subtree written by save_tree(): allocates node from pool and rebuilds its leaf slice or its branching_ children. */
+    void load_tree(FILE* stream, NodePtr& node, int num)
+    {
+        node = pool.allocate<Node>();
+        load_value(stream, *node);
+        if (node->childs==NULL) { // the loaded childs value only tells leaf from inner node
+            int indices_offset;
+            load_value(stream, indices_offset);
+            node->indices = indices[num] + indices_offset; // re-anchor the slice in this tree's index array
+        }
+        else {
+            node->childs = pool.allocate<NodePtr>(branching_); // replaces the pointer value read from the stream
+            for(int i=0; i<branching_; ++i) {
+                load_tree(stream, node->childs[i], num);
+            }
+        }
+    }
+
+
+    /**
+     * Release the inner elements of indices[] and reset them to NULL; the outer array itself is kept.
+     */
+    void free_indices()
+    {
+        if (indices!=NULL) {
+            for(int i=0; i<trees_; ++i) {
+                if (indices[i]!=NULL) {
+                    delete[] indices[i];
+                    indices[i] = NULL; // makes a repeated call harmless
+                }
+            }
+        }
+    }
+
+    /** Labels each point of dsindices with the position (in centers) of its nearest center; cost receives the sum of those nearest distances. */
+    void computeLabels(int* dsindices, int indices_length,  int* centers, int centers_length, int* labels, DistanceType& cost)
+    {
+        cost = 0;
+        for (int i=0; i<indices_length; ++i) {
+            ElementType* point = dataset[dsindices[i]];
+            DistanceType dist = distance(point, dataset[centers[0]], veclen_);
+            labels[i] = 0;
+            for (int j=1; j<centers_length; ++j) {
+                DistanceType new_dist = distance(point, dataset[centers[j]], veclen_);
+                if (dist>new_dist) { // strict comparison: on a tie the lower center position is kept
+                    labels[i] = j;
+                    dist = new_dist;
+                }
+            }
+            cost += dist;
+        }
+    }
+
+    /**
+     * Recursively clusters the points of a node. The dsindices slice is reordered in place so
+     * that every child owns a contiguous sub-range of it.
+     *
+     * Params:
+     *     node = the node to cluster; dsindices, indices_length = the points belonging to it
+     *     branching = the branching factor to use in the clustering; level = depth of the node
+     * TODO: for 1-sized clusters don't store a cluster center (it's the same as the single cluster point)
+     * NOTE(review): nothing here stops one child from receiving all indices_length points; with
+     * indices_length >= leaf_size_ that call would recurse on unchanged input — confirm the choosers rule it out.
+     */
+    void computeClustering(NodePtr node, int* dsindices, int indices_length, int branching, int level)
+    {
+        node->size = indices_length;
+        node->level = level;
+
+        if (indices_length < leaf_size_) { // leaf node
+            node->indices = dsindices;
+            std::sort(node->indices,node->indices+indices_length);
+            node->childs = NULL;
+            return;
+        }
+
+        std::vector<int> centers(branching);
+        std::vector<int> labels(indices_length);
+
+        int centers_length;
+        (this->*chooseCenters)(branching, dsindices, indices_length, &centers[0], centers_length);
+        // Fewer centers than requested: keep this node as a leaf instead of splitting it
+        if (centers_length<branching) {
+            node->indices = dsindices;
+            std::sort(node->indices,node->indices+indices_length);
+            node->childs = NULL;
+            return;
+        }
+
+
+        // assign points to clusters
+        DistanceType cost;
+        computeLabels(dsindices, indices_length, &centers[0], centers_length, &labels[0], cost);
+
+        node->childs = pool.allocate<NodePtr>(branching);
+        int start = 0;
+        int end = start;
+        for (int i=0; i<branching; ++i) {
+            for (int j=0; j<indices_length; ++j) {
+                if (labels[j]==i) { // move the point (and its label) to the end of the already grouped prefix
+                    std::swap(dsindices[j],dsindices[end]);
+                    std::swap(labels[j],labels[end]);
+                    end++;
+                }
+            }
+
+            node->childs[i] = pool.allocate<Node>();
+            node->childs[i]->pivot = centers[i];
+            node->childs[i]->indices = NULL;
+            computeClustering(node->childs[i],dsindices+start, end-start, branching, level+1); // child i owns [start, end)
+            start=end;
+        }
+    }
+
+
+
+    /**
+     * Performs one descent in the hierarchical k-means tree. The branches not
+     * visited are stored in a priority queue.
+     *
+     * Params:
+     *      node = node to explore; result = container for the k-nearest neighbors found
+     *      vec = query points
+     *      checks = how many points in the dataset have been checked so far
+     *      maxChecks = maximum dataset points to check (not enforced at leaves while explore_all_trees is true)
+     *      heap = queue receiving the branches not taken; checked = per-point flags of already tested points
+     */
+
+
+    void findNN(NodePtr node, ResultSet<DistanceType>& result, const ElementType* vec, int& checks, int maxChecks,
+                const cv::Ptr<Heap<BranchSt>>& heap, std::vector<bool>& checked, bool explore_all_trees = false)
+    {
+        if (node->childs==NULL) {
+            if (!explore_all_trees && (checks>=maxChecks) && result.full()) {
+                return;
+            }
+            for (int i=0; i<node->size; ++i) {
+                int index = node->indices[i];
+                if (!checked[index]) {
+                    DistanceType dist = distance(dataset[index], vec, veclen_);
+                    result.addPoint(dist, index);
+                    checked[index] = true;
+                    ++checks;
+                }
+            }
+        }
+        else {
+            std::vector<DistanceType> domain_distances(branching_);
+            int best_index = 0;
+            domain_distances[best_index] = distance(vec, dataset[node->childs[best_index]->pivot], veclen_);
+            for (int i=1; i<branching_; ++i) {
+                domain_distances[i] = distance(vec, dataset[node->childs[i]->pivot], veclen_);
+                if (domain_distances[i]<domain_distances[best_index]) {
+                    best_index = i;
+                }
+            }
+            for (int i=0; i<branching_; ++i) {
+                if (i!=best_index) {
+                    heap->insert(BranchSt(node->childs[i],domain_distances[i]));
+                }
+            }
+            // domain_distances frees itself at scope exit, also when distance() or insert() throws
+            findNN(node->childs[best_index],result,vec, checks, maxChecks, heap, checked, explore_all_trees);
+        }
+    }
+
+private:
+
+
+    /**
+     * The dataset used by this index
+     */
+    const Matrix<ElementType> dataset;
+
+    /**
+     * Parameters used by this index
+     */
+    IndexParams params;
+
+
+    /**
+     * Number of features in the dataset.
+     */
+    size_t size_;
+
+    /**
+     * Length of each feature.
+     */
+    size_t veclen_;
+
+    /**
+     * Array holding the root node of each tree (trees_ entries).
+     */
+    NodePtr* root;
+
+    /**
+     *  Array of indices to vectors in the dataset.
+     */
+    int** indices;
+
+
+    /**
+     * The distance
+     */
+    Distance distance;
+
+    /**
+     * Pooled memory allocator.
+     *
+     * Using a pooled memory allocator is more efficient
+     * than allocating memory directly when there is a large
+     * number of small memory allocations.
+     */
+    PooledAllocator pool;
+
+    /**
+     * Memory occupied by the index.
+     */
+    int memoryCounter;
+
+    /** index parameters */
+    int branching_;
+    int trees_;
+    flann_centers_init_t centers_init_;
+    int leaf_size_;
+
+
+};
+
+}
+
+//! @endcond
+
+#endif /* OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_ */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/index_testing.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/index_testing.h
new file mode 100644
index 000000000000..4c0014332628
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/index_testing.h
@@ -0,0 +1,319 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_INDEX_TESTING_H_
+#define OPENCV_FLANN_INDEX_TESTING_H_
+
+//! @cond IGNORED
+
+#include <cstring>
+#include <cmath>
+
+#include "matrix.h"
+#include "nn_index.h"
+#include "result_set.h"
+#include "logger.h"
+#include "timer.h"
+
+
+namespace cvflann
+{
+// Counts how many of the first n entries of neighbors also occur among the first n entries of groundTruth.
+inline int countCorrectMatches(int* neighbors, int* groundTruth, int n)
+{
+    int count = 0;
+    for (int i=0; i<n; ++i) {
+        for (int k=0; k<n; ++k) {
+            if (neighbors[i]==groundTruth[k]) {
+                count++;
+                break; // count each neighbor at most once
+            }
+        }
+    }
+    return count;
+}
+
+// Sums, over n neighbors, the ratio between the distance target->neighbors[i] and the distance target->groundTruth[i].
+template <typename Distance>
+typename Distance::ResultType computeDistanceRaport(const Matrix<typename Distance::ElementType>& inputData, typename Distance::ElementType* target,
+                                                    int* neighbors, int* groundTruth, int veclen, int n, const Distance& distance)
+{
+    typedef typename Distance::ResultType DistanceType;
+
+    DistanceType ret = 0;
+    for (int i=0; i<n; ++i) {
+        DistanceType den = distance(inputData[groundTruth[i]], target, veclen);
+        DistanceType num = distance(inputData[neighbors[i]], target, veclen);
+
+        if ((den==0)&&(num==0)) { // both distances are zero: counted as a ratio of 1
+            ret += 1;
+        }
+        else {
+            ret += num/den; // NOTE(review): also reached with den==0 when num!=0, i.e. a division by zero
+        }
+    }
+
+    return ret;
+}
+// Queries every row of testData (whole batch repeated while t.value < 0.2); returns the fraction of correct matches, sets time to t.value/repeats and dist to the mean distance ratio.
+template <typename Distance>
+float search_with_ground_truth(NNIndex<Distance>& index, const Matrix<typename Distance::ElementType>& inputData,
+                               const Matrix<typename Distance::ElementType>& testData, const Matrix<int>& matches, int nn, int checks,
+                               float& time, typename Distance::ResultType& dist, const Distance& distance, int skipMatches)
+{
+    typedef typename Distance::ResultType DistanceType;
+
+    if (matches.cols<size_t(nn)) {
+        Logger::info("matches.cols=%d, nn=%d\n",(int)matches.cols,nn); // %d needs an int; cols is compared as size_t above
+
+        FLANN_THROW(cv::Error::StsError, "Ground truth is not computed for as many neighbors as requested");
+    }
+
+    KNNResultSet<DistanceType> resultSet(nn+skipMatches);
+    SearchParams searchParams(checks);
+
+    std::vector<int> indices(nn+skipMatches);
+    std::vector<DistanceType> dists(nn+skipMatches);
+    int* neighbors = &indices[skipMatches]; // the first skipMatches results are ignored when scoring
+
+    int correct = 0;
+    DistanceType distR = 0;
+    StartStopTimer t;
+    int repeats = 0;
+    while (t.value<0.2) {
+        repeats++;
+        t.start();
+        correct = 0;
+        distR = 0;
+        for (size_t i = 0; i < testData.rows; i++) {
+            resultSet.init(&indices[0], &dists[0]);
+            index.findNeighbors(resultSet, testData[i], searchParams);
+
+            correct += countCorrectMatches(neighbors,matches[i], nn);
+            distR += computeDistanceRaport<Distance>(inputData, testData[i], neighbors, matches[i], (int)testData.cols, nn, distance);
+        }
+        t.stop();
+    }
+    time = float(t.value/repeats);
+
+    float precicion = (float)correct/(nn*testData.rows);
+
+    dist = distR/(testData.rows*nn);
+
+    Logger::info("%8d %10.4g %10.5g %10.5g %10.5g\n",
+                 checks, precicion, time, 1000.0 * time / testData.rows, (double)dist); // %g expects a double; DistanceType may be integral
+
+    return precicion;
+}
+
+// Runs search_with_ground_truth() once with the given number of checks; stores the measured precision and returns the measured time.
+template <typename Distance>
+float test_index_checks(NNIndex<Distance>& index, const Matrix<typename Distance::ElementType>& inputData,
+                        const Matrix<typename Distance::ElementType>& testData, const Matrix<int>& matches,
+                        int checks, float& precision, const Distance& distance, int nn = 1, int skipMatches = 0)
+{
+    typedef typename Distance::ResultType DistanceType;
+
+    Logger::info("  Nodes  Precision(%)   Time(s)   Time/vec(ms)  Mean dist\n");
+    Logger::info("---------------------------------------------------------\n");
+
+    float time = 0;
+    DistanceType dist = 0; // receives the mean distance ratio; not used further here
+    precision = search_with_ground_truth(index, inputData, testData, matches, nn, checks, time, dist, distance, skipMatches);
+
+    return time;
+}
+// Doubles, then bisects, the number of checks until the measured precision is within SEARCH_EPS of precision (or the interval cannot shrink); stores it in checks and returns the last measured time.
+template <typename Distance>
+float test_index_precision(NNIndex<Distance>& index, const Matrix<typename Distance::ElementType>& inputData,
+                           const Matrix<typename Distance::ElementType>& testData, const Matrix<int>& matches,
+                           float precision, int& checks, const Distance& distance, int nn = 1, int skipMatches = 0)
+{
+    typedef typename Distance::ResultType DistanceType;
+    const float SEARCH_EPS = 0.001f;
+
+    Logger::info("  Nodes  Precision(%)   Time(s)   Time/vec(ms)  Mean dist\n");
+    Logger::info("---------------------------------------------------------\n");
+
+    int c2 = 1;
+    float p2;
+    int c1 = 1;
+    //float p1;
+    float time;
+    DistanceType dist;
+
+    p2 = search_with_ground_truth(index, inputData, testData, matches, nn, c2, time, dist, distance, skipMatches);
+
+    if (p2>precision) { // a single check already exceeds the target
+        Logger::info("Got as close as I can\n");
+        checks = c2;
+        return time;
+    }
+
+    while (p2<precision) { // double the checks until the target is reached; [c1, c2] then brackets it
+        c1 = c2;
+        //p1 = p2;
+        c2 *=2;
+        p2 = search_with_ground_truth(index, inputData, testData, matches, nn, c2, time, dist, distance, skipMatches);
+    }
+
+    int cx;
+    float realPrecision;
+    if (fabs(p2-precision)>SEARCH_EPS) {
+        Logger::info("Start linear estimation\n");
+        // after we got to values in the vicinity of the desired precision
+        // bisect the [c1, c2] interval to get a better estimation
+
+        cx = (c1+c2)/2;
+        realPrecision = search_with_ground_truth(index, inputData, testData, matches, nn, cx, time, dist, distance, skipMatches);
+        while (fabs(realPrecision-precision)>SEARCH_EPS) {
+
+            if (realPrecision<precision) {
+                c1 = cx;
+            }
+            else {
+                c2 = cx;
+            }
+            cx = (c1+c2)/2;
+            if (cx==c1) { // interval can no longer be split
+                Logger::info("Got as close as I can\n");
+                break;
+            }
+            realPrecision = search_with_ground_truth(index, inputData, testData, matches, nn, cx, time, dist, distance, skipMatches);
+        }
+
+        c2 = cx;
+        p2 = realPrecision;
+
+    }
+    else {
+        Logger::info("No need for linear estimation\n");
+        cx = c2;
+        realPrecision = p2;
+    }
+
+    checks = cx;
+    return time;
+}
+
+// For each requested precision (the array is sorted in place first) doubles, then bisects, the number of checks until the measured precision is within SEARCH_EPS; results are only logged. A positive maxTime aborts the doubling phase.
+template <typename Distance>
+void test_index_precisions(NNIndex<Distance>& index, const Matrix<typename Distance::ElementType>& inputData,
+                           const Matrix<typename Distance::ElementType>& testData, const Matrix<int>& matches,
+                           float* precisions, int precisions_length, const Distance& distance, int nn = 1, int skipMatches = 0, float maxTime = 0)
+{
+    typedef typename Distance::ResultType DistanceType;
+
+    const float SEARCH_EPS = 0.001f;
+
+    // make sure precisions array is sorted
+    std::sort(precisions, precisions+precisions_length);
+
+    int pindex = 0;
+    float precision = 0; // assigned from precisions[i] before its first use; avoids reading an empty array here
+
+    Logger::info("  Nodes  Precision(%)   Time(s)   Time/vec(ms)  Mean dist\n");
+    Logger::info("---------------------------------------------------------\n");
+
+    int c2 = 1;
+    float p2;
+
+    int c1 = 1;
+
+    float time;
+    DistanceType dist;
+
+    p2 = search_with_ground_truth(index, inputData, testData, matches, nn, c2, time, dist, distance, skipMatches);
+
+    // if precision for 1 run down the tree is already
+    // better then some of the requested precisions, then
+    // skip those (the bound is tested first so the array is never read past its end)
+    while (pindex<precisions_length && precisions[pindex]<p2) {
+        pindex++;
+    }
+
+    if (pindex==precisions_length) {
+        Logger::info("Got as close as I can\n");
+        return;
+    }
+
+    for (int i=pindex; i<precisions_length; ++i) {
+
+        precision = precisions[i];
+        while (p2<precision) { // double the checks until the target is reached; [c1, c2] then brackets it
+            c1 = c2;
+            c2 *=2;
+            p2 = search_with_ground_truth(index, inputData, testData, matches, nn, c2, time, dist, distance, skipMatches);
+            if ((maxTime> 0)&&(time > maxTime)&&(p2<precision)) return;
+        }
+
+        int cx;
+        float realPrecision;
+        if (fabs(p2-precision)>SEARCH_EPS) {
+            Logger::info("Start linear estimation\n");
+            // after we got to values in the vicinity of the desired precision
+            // bisect the [c1, c2] interval to get a better estimation
+
+            cx = (c1+c2)/2;
+            realPrecision = search_with_ground_truth(index, inputData, testData, matches, nn, cx, time, dist, distance, skipMatches);
+            while (fabs(realPrecision-precision)>SEARCH_EPS) {
+
+                if (realPrecision<precision) {
+                    c1 = cx;
+                }
+                else {
+                    c2 = cx;
+                }
+                cx = (c1+c2)/2;
+                if (cx==c1) { // interval can no longer be split
+                    Logger::info("Got as close as I can\n");
+                    break;
+                }
+                realPrecision = search_with_ground_truth(index, inputData, testData, matches, nn, cx, time, dist, distance, skipMatches);
+            }
+
+            c2 = cx;
+            p2 = realPrecision;
+
+        }
+        else {
+            Logger::info("No need for linear estimation\n");
+            cx = c2;
+            realPrecision = p2;
+        }
+
+    }
+}
+
+}
+
+//! @endcond
+
+#endif //OPENCV_FLANN_INDEX_TESTING_H_
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/kdtree_index.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/kdtree_index.h
new file mode 100644
index 000000000000..8245f7db796e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/kdtree_index.h
@@ -0,0 +1,636 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_KDTREE_INDEX_H_
+#define OPENCV_FLANN_KDTREE_INDEX_H_
+
+//! @cond IGNORED
+
+#include <algorithm>
+#include <map>
+#include <cstring>
+
+#include "nn_index.h"
+#include "dynamic_bitset.h"
+#include "matrix.h"
+#include "result_set.h"
+#include "heap.h"
+#include "allocator.h"
+#include "random.h"
+#include "saving.h"
+
+
+namespace cvflann
+{
+// Parameter set selecting the randomized kd-tree index and its number of trees.
+struct KDTreeIndexParams : public IndexParams
+{
+    KDTreeIndexParams(int trees = 4) {
+        IndexParams& self = *this;
+        self["algorithm"] = FLANN_INDEX_KDTREE;
+        self["trees"] = trees;
+    }
+};
+
+
+/**
+ * Randomized kd-tree index
+ *
+ * Contains the k-d trees and other information for indexing a set of points
+ * for nearest-neighbor matching.
+ */
+template <typename Distance>
+class KDTreeIndex : public NNIndex<Distance>
+{
+public:
+    typedef typename Distance::ElementType ElementType;
+    typedef typename Distance::ResultType DistanceType;
+
+
+    /**
+     * Creates an (unbuilt) kd-tree forest over inputData; buildIndex()
+     * constructs the trees.
+     *
+     * Params:
+     *          inputData = dataset with the input features
+     *          params = index parameters; "trees" (default 4) sets the forest size
+     *          d = distance functor stored for the searches
+     */
+    KDTreeIndex(const Matrix<ElementType>& inputData, const IndexParams& params = KDTreeIndexParams(),
+                Distance d = Distance() ) :
+        dataset_(inputData), index_params_(params), distance_(d)
+    {
+        veclen_ = dataset_.cols;
+        size_ = dataset_.rows;
+
+        trees_ = get_param(index_params_,"trees",4);
+        tree_roots_ = new NodePtr[trees_];
+
+        // Identity permutation of the dataset rows; buildIndex() shuffles it per tree.
+        vind_.resize(size_);
+        for (size_t row = 0; row != size_; ++row) vind_[row] = static_cast<int>(row);
+
+        var_ = new DistanceType[veclen_];
+        mean_ = new DistanceType[veclen_];
+    }
+
+    // Copy constructor and copy assignment are only declared here; this class body gives them no definition.
+    KDTreeIndex(const KDTreeIndex&);
+    KDTreeIndex& operator=(const KDTreeIndex&);
+
+    /**
+     * Destructor. Releases the arrays this object allocated with new[]:
+     * the tree-root table and the mean/variance scratch buffers.
+     */
+    ~KDTreeIndex()
+    {
+        // delete[] on a null pointer is a no-op, so no explicit guard is required.
+        delete[] tree_roots_;
+        delete[] mean_;
+        delete[] var_;
+    }
+
+    /**
+     * Builds the index: one tree per slot of tree_roots_.
+     */
+    void buildIndex() CV_OVERRIDE
+    {
+        for (int tree = 0; tree < trees_; ++tree) {
+            /* Reshuffle the index permutation so every tree samples the vectors differently. */
+#ifdef OPENCV_FLANN_USE_STD_RAND
+            std::random_shuffle(vind_.begin(), vind_.end());
+#else
+            cv::randShuffle(vind_);
+#endif
+
+            int* const indices = &vind_[0];
+            tree_roots_[tree] = divideTree(indices, int(size_) );
+        }
+    }
+
+    /** Identifies this index as FLANN_INDEX_KDTREE. */
+    flann_algorithm_t getType() const CV_OVERRIDE
+    {
+        return FLANN_INDEX_KDTREE;
+    }
+
+    /** Writes the tree count to stream, followed by each tree via save_tree(). */
+    void saveIndex(FILE* stream) CV_OVERRIDE
+    {
+        save_value(stream, trees_);
+        for (int i=0; i<trees_; ++i) {
+            save_tree(stream, tree_roots_[i]);
+        }
+    }
+
+    // Restores what saveIndex() wrote: the tree count, then each tree in order.
+    // Afterwards index_params_ is refreshed so getParameters() describes the loaded index.
+    void loadIndex(FILE* stream) CV_OVERRIDE
+    {
+        load_value(stream, trees_);
+        if (tree_roots_!=NULL) {
+            delete[] tree_roots_;
+        }
+        tree_roots_ = new NodePtr[trees_];
+        for (int i=0; i<trees_; ++i) {
+            load_tree(stream,tree_roots_[i]);
+        }
+
+        index_params_["algorithm"] = getType();
+        index_params_["trees"] = trees_;   // the tree count (an int, as the constructor reads it), not the root-pointer array
+    }
+
+    /**
+     *  Returns the number of data points in the index (the dataset's row count).
+     */
+    size_t size() const CV_OVERRIDE
+    {
+        return size_;
+    }
+
+    /**
+     * Returns the length of an index feature (the dataset's column count).
+     */
+    size_t veclen() const CV_OVERRIDE
+    {
+        return veclen_;
+    }
+
+    /**
+     * Computes the index memory usage
+     * Returns: pool usage (used + wasted) plus one int per dataset row, truncated to int
+     */
+    int usedMemory() const CV_OVERRIDE
+    {
+        return int(pool_.usedMemory+pool_.wastedMemory+dataset_.rows*sizeof(int));  // pool memory and vind array memory
+    }
+
+    /**
+     * Find set of nearest neighbors to vec. Their indices are stored inside
+     * the result object.
+     *
+     * Params:
+     *     result = the result object in which the indices of the nearest-neighbors are stored
+     *     vec = the vector for which to search the nearest neighbors
+     *     searchParams = search options; "checks", "eps" and "explore_all_trees" are read
+     */
+    void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
+    {
+        const int checkBudget = get_param(searchParams,"checks", 32);
+        const float epsFactor = 1+get_param(searchParams,"eps",0.0f);
+        const bool exploreAll = get_param(searchParams,"explore_all_trees",false);
+
+        // An unlimited check budget selects the exact search.
+        if (checkBudget==FLANN_CHECKS_UNLIMITED) {
+            getExactNeighbors(result, vec, epsFactor);
+            return;
+        }
+        getNeighbors(result, vec, checkBudget, epsFactor, exploreAll);
+    }
+    /** Returns a copy of the parameter map stored in this index. */
+    IndexParams getParameters() const CV_OVERRIDE
+    {
+        return index_params_;
+    }
+
+private:
+
+
+    /*--------------------- Internal Data Structures --------------------------*/
+    struct Node
+    {
+        /**
+         * Inner node: dimension used for subdivision. Leaf node: index of the stored vector.
+         */
+        int divfeat;
+        /**
+         * Inner node: threshold on dimension divfeat; queries below it descend into child1 first.
+         */
+        DistanceType divval;
+        /**
+         * The child nodes; both are NULL for a leaf.
+         */
+        Node* child1, * child2;
+    };
+    typedef Node* NodePtr;
+    typedef BranchStruct<NodePtr, DistanceType> BranchSt;
+    typedef BranchSt* Branch;
+
+    // Writes the node *tree to stream with save_value(), then recurses into each
+    // non-NULL child (child1 before child2), so a node always precedes its children.
+    void save_tree(FILE* stream, NodePtr tree)
+    {
+        save_value(stream, *tree);
+        if (tree->child1!=NULL) {
+            save_tree(stream, tree->child1);
+        }
+        if (tree->child2!=NULL) {
+            save_tree(stream, tree->child2);
+        }
+    }
+    // Reads one node from stream into storage taken from pool_, then rebuilds every child
+    // whose loaded pointer is non-NULL; the recursive call overwrites that pointer in place.
+    void load_tree(FILE* stream, NodePtr& tree)
+    {
+        tree = pool_.allocate<Node>();
+        load_value(stream, *tree);
+        if (tree->child1!=NULL) {
+            load_tree(stream, tree->child1);
+        }
+        if (tree->child2!=NULL) {
+            load_tree(stream, tree->child2);
+        }
+    }
+
+
+    /**
+     * Recursively builds the subtree covering the count indices that start at ind.
+     * A single index becomes a leaf; a larger range is split by meanSplit() and
+     * each part is built by a recursive call.
+     *
+     * Params: ind = first index of the range (the range is permuted in place)
+     *         count = number of indices in the range
+     * Returns: the node allocated from pool_ for this range
+     */
+    NodePtr divideTree(int* ind, int count)
+    {
+        NodePtr node = pool_.allocate<Node>();
+
+        if (count == 1) {
+            /* Leaf: no children, divfeat carries the index of the vector. */
+            node->child2 = NULL;
+            node->child1 = NULL;
+            node->divfeat = *ind;
+            return node;
+        }
+
+        int splitPos;
+        int splitDim;
+        DistanceType splitVal;
+        meanSplit(ind, count, splitPos, splitDim, splitVal);
+
+        node->divfeat = splitDim;
+        node->divval = splitVal;
+        node->child1 = divideTree(ind, splitPos);
+        node->child2 = divideTree(ind+splitPos, count-splitPos);
+        return node;
+    }
+
+
+    /**
+     * Choose which feature to use in order to subdivide this set of vectors.
+     * Make a random choice among those with the highest variance, and use
+     * the mean of that feature as the threshold value (cutval). On return,
+     * index is the size of the first part of the repartitioned range ind[0..count-1].
+     */
+    void meanSplit(int* ind, int count, int& index, int& cutfeat, DistanceType& cutval)
+    {
+        memset(mean_,0,veclen_*sizeof(DistanceType));
+        memset(var_,0,veclen_*sizeof(DistanceType));
+
+        /* Compute mean values.  At most the first SAMPLE_MEAN+1 entries of ind
+            are sampled to get a good estimate.
+         */
+        int cnt = std::min((int)SAMPLE_MEAN+1, count);
+        for (int j = 0; j < cnt; ++j) {
+            ElementType* v = dataset_[ind[j]];
+            for (size_t k=0; k<veclen_; ++k) {
+                mean_[k] += v[k];
+            }
+        }
+        for (size_t k=0; k<veclen_; ++k) {
+            mean_[k] /= cnt;
+        }
+
+        /* Compute variances over the same sample (no need to divide by count). */
+        for (int j = 0; j < cnt; ++j) {
+            ElementType* v = dataset_[ind[j]];
+            for (size_t k=0; k<veclen_; ++k) {
+                DistanceType dist = v[k] - mean_[k];
+                var_[k] += dist * dist;
+            }
+        }
+        /* Select one of the highest variance indices at random. */
+        cutfeat = selectDivision(var_);
+        cutval = mean_[cutfeat];
+
+        int lim1, lim2;
+        planeSplit(ind, count, cutfeat, cutval, lim1, lim2);
+        // Pick the split position within [lim1, lim2] that is closest to the middle of the range.
+        if (lim1>count/2) index = lim1;
+        else if (lim2<count/2) index = lim2;
+        else index = count/2;
+
+        /* If either list is empty, it means that all remaining features
+         * are identical. Split in the middle to maintain a balanced tree.
+         */
+        if ((lim1==count)||(lim2==0)) index = count/2;
+    }
+
+
+    /**
+     * Select the top RAND_DIM largest values from v (v has veclen_ entries) and
+     * return the index of one of these selected at random via rand_int().
+     */
+    int selectDivision(DistanceType* v)
+    {
+        int num = 0;
+        size_t topind[RAND_DIM];
+
+        /* Create a list of the indices of the top RAND_DIM values, kept sorted by decreasing v. */
+        for (size_t i = 0; i < veclen_; ++i) {
+            if ((num < RAND_DIM)||(v[i] > v[topind[num-1]])) {
+                /* Put this element at end of topind. */
+                if (num < RAND_DIM) {
+                    topind[num++] = i;            /* Add to list. */
+                }
+                else {
+                    topind[num-1] = i;         /* Replace last (smallest) element. */
+                }
+                /* Bubble end value down to right location by repeated swapping. */
+                int j = num - 1;
+                while (j > 0  &&  v[topind[j]] > v[topind[j-1]]) {
+                    std::swap(topind[j], topind[j-1]);
+                    --j;
+                }
+            }
+        }
+        /* Pick one of the num collected entries: rand_int(num) supplies the position in topind. */
+        int rnd = rand_int(num);
+        return (int)topind[rnd];
+    }
+
+
+    /**
+     *  Subdivide the list of points by a plane perpendicular to the axis corresponding
+     *  to the 'cutfeat' dimension at 'cutval' position. The index range is permuted in place.
+     *
+     *  On return:
+     *  dataset[ind[0..lim1-1]][cutfeat]<cutval
+     *  dataset[ind[lim1..lim2-1]][cutfeat]==cutval
+     *  dataset[ind[lim2..count-1]][cutfeat]>cutval
+     */
+    void planeSplit(int* ind, int count, int cutfeat, DistanceType cutval, int& lim1, int& lim2)
+    {
+        /* First pass: move indices with value < cutval to the front; the second loop below moves == cutval next. */
+        int left = 0;
+        int right = count-1;
+        for (;; ) {
+            while (left<=right && dataset_[ind[left]][cutfeat]<cutval) ++left;
+            while (left<=right && dataset_[ind[right]][cutfeat]>=cutval) --right;
+            if (left>right) break;
+            std::swap(ind[left], ind[right]); ++left; --right;
+        }
+        lim1 = left;
+        right = count-1;
+        for (;; ) {
+            while (left<=right && dataset_[ind[left]][cutfeat]<=cutval) ++left;
+            while (left<=right && dataset_[ind[right]][cutfeat]>cutval) --right;
+            if (left>right) break;
+            std::swap(ind[left], ind[right]); ++left; --right;
+        }
+        lim2 = left;
+    }
+
+    /**
+     * Performs an exact nearest neighbor search on the first tree. The result set
+     * is asserted to be full afterwards.
+     */
+    void getExactNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, float epsError)
+    {
+        // Only tree_roots_[0] is searched; a message is printed to stderr when more trees exist.
+
+        if (trees_ > 1) {
+            fprintf(stderr,"It doesn't make any sense to use more than one tree for exact search");
+        }
+        if (trees_>0) {
+            searchLevelExact(result, vec, tree_roots_[0], 0.0, epsError);
+        }
+        CV_Assert(result.full());
+    }
+
+    /**
+     * Performs the approximate nearest-neighbor search. Every tree is descended
+     * once; afterwards branches are taken from the heap with popMin() until the
+     * heap is empty or maxCheck points were checked and the result set is full.
+     */
+    void getNeighbors(ResultSet<DistanceType>& result, const ElementType* vec,
+                      int maxCheck, float epsError, bool explore_all_trees = false)
+    {
+        DynamicBitset visited(size_);
+        int numChecked = 0;
+
+        // Heap of the branches that were not taken, shared by all descents of this query
+        const cv::Ptr<Heap<BranchSt>>& heap = Heap<BranchSt>::getPooledInstance(cv::utils::getThreadID(), (int)size_);
+
+        /* First pass: one descent per tree. */
+        for (int t = 0; t < trees_; ++t) {
+            searchLevel(result, vec, tree_roots_[t], 0, numChecked, maxCheck,
+                        epsError, heap, visited, explore_all_trees);
+            if (explore_all_trees) continue;
+            if ((numChecked >= maxCheck) && result.full()) break;
+        }
+
+        /* Second pass: continue from the queued branches. */
+        BranchSt next;
+        while (heap->popMin(next)) {
+            if (numChecked >= maxCheck && result.full()) break;
+            searchLevel(result, vec, next.node, next.mindist, numChecked, maxCheck,
+                        epsError, heap, visited, false);
+        }
+
+        CV_Assert(result.full());
+    }
+
+
+    /**
+     *  Search starting from a given node of the tree.  Based on any mismatches at
+     *  higher levels, all exemplars below this level must have a distance of
+     *  at least "mindist". checkCount is incremented for every leaf that is evaluated.
+     */
+    void searchLevel(ResultSet<DistanceType>& result_set, const ElementType* vec, NodePtr node, DistanceType mindist, int& checkCount, int maxCheck,
+                     float epsError, const cv::Ptr<Heap<BranchSt>>& heap, DynamicBitset& checked, bool explore_all_trees = false)
+    {
+        if (result_set.worstDist()<mindist) {
+            // The lower bound of this branch already exceeds the current worst result: ignore it.
+            return;
+        }
+
+        /* If this is a leaf node, then do check and return. */
+        if ((node->child1 == NULL)&&(node->child2 == NULL)) {
+            /*  Do not check same node more than once when searching multiple trees:
+                the "checked" bitset records which dataset rows were already evaluated.
+                Leaves are also skipped once the check budget is spent and the result
+                set is full, unless explore_all_trees is set. */
+            int index = node->divfeat;
+            if ( checked.test(index) ||
+                 (!explore_all_trees && (checkCount>=maxCheck) && result_set.full()) ) {
+                return;
+            }
+            checked.set(index);
+            checkCount++;
+
+            DistanceType dist = distance_(dataset_[index], vec, veclen_);
+            result_set.addPoint(dist,index);
+
+            return;
+        }
+
+        /* Which child branch should be taken first? */
+        ElementType val = vec[node->divfeat];
+        DistanceType diff = val - node->divval;
+        NodePtr bestChild = (diff < 0) ? node->child1 : node->child2;
+        NodePtr otherChild = (diff < 0) ? node->child2 : node->child1;
+
+        /* Create a branch record for the branch not taken.  Add distance
+            of this feature boundary (we don't attempt to correct for any
+            use of this feature in a parent node, which is unlikely to
+            happen and would have only a small effect).  The branch is queued
+            only if its scaled lower bound is below the current worst distance
+            or the result set is not yet full.
+         */
+
+        DistanceType new_distsq = mindist + distance_.accum_dist(val, node->divval, node->divfeat);
+        // (An earlier criterion based on 2 * checkCount < maxCheck is no longer used.)
+        if ((new_distsq*epsError < result_set.worstDist())||  !result_set.full()) {
+            heap->insert( BranchSt(otherChild, new_distsq) );
+        }
+
+        /* Descend into the nearer child. NOTE(review): explore_all_trees is not forwarded, so this call uses its default (false) - confirm that is intended. */
+        searchLevel(result_set, vec, bestChild, mindist, checkCount, maxCheck, epsError, heap, checked);
+    }
+
+    /**
+     * Performs an exact search in the tree starting from a node; mindist is the lower bound accumulated so far.
+     */
+    void searchLevelExact(ResultSet<DistanceType>& result_set, const ElementType* vec, const NodePtr node, DistanceType mindist, const float epsError)
+    {
+        /* If this is a leaf node, then evaluate its vector and return. */
+        if ((node->child1 == NULL)&&(node->child2 == NULL)) {
+            int index = node->divfeat;
+            DistanceType dist = distance_(dataset_[index], vec, veclen_);
+            result_set.addPoint(dist,index);
+            return;
+        }
+
+        /* Which child branch should be taken first? */
+        ElementType val = vec[node->divfeat];
+        DistanceType diff = val - node->divval;
+        NodePtr bestChild = (diff < 0) ? node->child1 : node->child2;
+        NodePtr otherChild = (diff < 0) ? node->child2 : node->child1;
+
+        /* Compute the lower bound for the branch not taken.  Add distance
+            of this feature boundary (we don't attempt to correct for any
+            use of this feature in a parent node, which is unlikely to
+            happen and would have only a small effect).  No heap is used here:
+            the far child is searched right after the near one, unless its
+            scaled lower bound exceeds the current worst distance.
+         */
+
+        DistanceType new_distsq = mindist + distance_.accum_dist(val, node->divval, node->divfeat);
+
+        /* Call recursively to search next level down. */
+        searchLevelExact(result_set, vec, bestChild, mindist, epsError);
+
+        if (new_distsq*epsError<=result_set.worstDist()) {
+            searchLevelExact(result_set, vec, otherChild, new_distsq, epsError);
+        }
+    }
+
+
+private:
+
+    enum
+    {
+        /**
+         * To improve efficiency, only a sample of the vectors (at most
+         * SAMPLE_MEAN+1, see meanSplit) is used to compute the mean and variance
+         * at each level when building a tree. A value of 100 seems to perform as well as using all values.
+         */
+        SAMPLE_MEAN = 100,
+        /**
+         * Top random dimensions to consider
+         *
+         * When creating random trees, the dimension on which to subdivide is
+         * selected at random from among the top RAND_DIM dimensions with the
+         * highest variance.  A value of 5 works well.
+         */
+        RAND_DIM=5
+    };
+
+
+    /**
+     * Number of randomized trees that are used
+     */
+    int trees_;
+
+    /**
+     *  Array of indices to vectors in the dataset (permuted while the trees are built).
+     */
+    std::vector<int> vind_;
+
+    /**
+     * The dataset used by this index
+     */
+    const Matrix<ElementType> dataset_;
+    // Parameters of the index; returned by getParameters().
+    IndexParams index_params_;
+    // Number of dataset rows (size_) and length of each feature vector (veclen_).
+    size_t size_;
+    size_t veclen_;
+
+    // Scratch buffers of veclen_ entries each, filled by meanSplit().
+    DistanceType* mean_;
+    DistanceType* var_;
+
+
+    /**
+     * Array of k-d trees used to find neighbours.
+     */
+    NodePtr* tree_roots_;
+
+    /**
+     * Pooled memory allocator.
+     *
+     * Using a pooled memory allocator is more efficient
+     * than allocating memory directly when there is a large
+     * number of small memory allocations.
+     */
+    PooledAllocator pool_;
+    // Distance functor used to compare dataset vectors with queries.
+    Distance distance_;
+
+
+};   // class KDTreeForest
+
+}
+
+//! @endcond
+
+#endif //OPENCV_FLANN_KDTREE_INDEX_H_
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/kdtree_single_index.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/kdtree_single_index.h
new file mode 100644
index 000000000000..ed95c3db7d55
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/kdtree_single_index.h
@@ -0,0 +1,645 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_KDTREE_SINGLE_INDEX_H_
+#define OPENCV_FLANN_KDTREE_SINGLE_INDEX_H_
+
+//! @cond IGNORED
+
+#include <algorithm>
+#include <map>
+#include <cstring>
+
+#include "nn_index.h"
+#include "matrix.h"
+#include "result_set.h"
+#include "heap.h"
+#include "allocator.h"
+#include "random.h"
+#include "saving.h"
+
+namespace cvflann
+{
+// Parameter set selecting the single kd-tree index and its build options.
+struct KDTreeSingleIndexParams : public IndexParams
+{
+    KDTreeSingleIndexParams(int leaf_max_size = 10, bool reorder = true, int dim = -1) {
+        IndexParams& self = *this;
+        self["algorithm"] = FLANN_INDEX_KDTREE_SINGLE;
+        self["leaf_max_size"] = leaf_max_size;
+        self["reorder"] = reorder;
+        self["dim"] = dim;
+    }
+};
+
+
+/**
+ * Single kd-tree index
+ *
+ * Contains one k-d tree and other information for indexing a set of points
+ * for nearest-neighbor matching.
+ */
+template <typename Distance>
+class KDTreeSingleIndex : public NNIndex<Distance>
+{
+public:
+    typedef typename Distance::ElementType ElementType;
+    typedef typename Distance::ResultType DistanceType;
+
+
+    /**
+     * Creates an (unbuilt) single kd-tree index over inputData.
+     *
+     * Params:
+     *          inputData = dataset with the input features
+     *          params = index parameters; "dim", "leaf_max_size" and "reorder" are read
+     *          d = distance functor stored for the searches
+     */
+    KDTreeSingleIndex(const Matrix<ElementType>& inputData, const IndexParams& params = KDTreeSingleIndexParams(),
+                      Distance d = Distance() ) :
+        dataset_(inputData), index_params_(params), distance_(d)
+    {
+        root_node_ = 0;
+        size_ = dataset_.rows;
+
+        // A positive "dim" parameter overrides the dataset's column count.
+        const int dimOverride = get_param(params,"dim",-1);
+        dim_ = dataset_.cols;
+        if (dimOverride>0) dim_ = dimOverride;
+        leaf_max_size_ = get_param(params,"leaf_max_size",10);
+        reorder_ = get_param(params,"reorder",true);
+        // Identity permutation of the dataset rows.
+        vind_.resize(size_);
+        for (size_t row = 0; row < size_; ++row) vind_[row] = (int)row;
+    }
+    // Copy constructor and copy assignment are only declared here, without a definition at this point.
+    KDTreeSingleIndex(const KDTreeSingleIndex&);
+    KDTreeSingleIndex& operator=(const KDTreeSingleIndex&);
+
+    /**
+     * Destructor. When reorder_ is set, frees the data_ buffer with delete[].
+     */
+    ~KDTreeSingleIndex()
+    {
+        if (reorder_) delete[] data_.data;
+    }
+
+    /**
+     * Builds the index: computes the root bounding box, constructs the tree and,
+     * when reordering is enabled, copies the points into data_ in vind_ order.
+     */
+    void buildIndex() CV_OVERRIDE
+    {
+        computeBoundingBox(root_bbox_);
+        root_node_ = divideTree(0, (int)size_, root_bbox_ );
+
+        if (!reorder_) {
+            data_ = dataset_;
+            return;
+        }
+
+        // Row r of the freshly allocated data_ receives the dataset row referenced by vind_[r].
+        delete[] data_.data;
+        data_ = cvflann::Matrix<ElementType>(new ElementType[size_*dim_], size_, dim_);
+        for (size_t row=0; row<size_; ++row) {
+            for (size_t col=0; col<dim_; ++col) data_[row][col] = dataset_[vind_[row]][col];
+        }
+    }
+    /** Identifies this index as FLANN_INDEX_KDTREE_SINGLE. */
+    flann_algorithm_t getType() const CV_OVERRIDE
+    {
+        return FLANN_INDEX_KDTREE_SINGLE;
+    }
+
+    /** Serializes the index to stream; loadIndex() reads the same fields back in the same order. */
+    void saveIndex(FILE* stream) CV_OVERRIDE
+    {
+        save_value(stream, size_);
+        save_value(stream, dim_);
+        save_value(stream, root_bbox_);
+        save_value(stream, reorder_);
+        save_value(stream, leaf_max_size_);
+        save_value(stream, vind_);
+        if (reorder_) {
+            save_value(stream, data_);
+        }
+        save_tree(stream, root_node_);
+    }
+
+    /** Restores the index from stream; data_ is loaded when reorder_ is set, otherwise it is assigned from dataset_. */
+    void loadIndex(FILE* stream) CV_OVERRIDE
+    {
+        load_value(stream, size_);
+        load_value(stream, dim_);
+        load_value(stream, root_bbox_);
+        load_value(stream, reorder_);
+        load_value(stream, leaf_max_size_);
+        load_value(stream, vind_);
+        if (reorder_) {
+            load_value(stream, data_);
+        }
+        else {
+            data_ = dataset_;
+        }
+        load_tree(stream, root_node_);
+
+        // Refresh the stored parameters from the loaded values; the "dim" entry is not updated here.
+        index_params_["algorithm"] = getType();
+        index_params_["leaf_max_size"] = leaf_max_size_;
+        index_params_["reorder"] = reorder_;
+    }
+
+    /**
+     *  Returns the number of data points in the index (size_).
+     */
+    size_t size() const CV_OVERRIDE
+    {
+        return size_;
+    }
+
+    /**
+     * Returns the length of an index feature (dim_, which the "dim" parameter may override).
+     */
+    size_t veclen() const CV_OVERRIDE
+    {
+        return dim_;
+    }
+
+    /**
+     * Computes the index memory usage
+     * Returns: pool usage (used + wasted) plus one int per dataset row, truncated to int
+     */
+    int usedMemory() const CV_OVERRIDE
+    {
+        return (int)(pool_.usedMemory+pool_.wastedMemory+dataset_.rows*sizeof(int));  // pool memory and vind array memory
+    }
+
+
+    /**
+     * \brief Perform k-nearest neighbor search, one query row at a time
+     * \param[in] queries The query points; the column count must equal veclen()
+     * \param[out] indices The indices of the nearest neighbors found (one row per query)
+     * \param[out] dists Distances to the nearest neighbors found (one row per query)
+     * \param[in] knn Number of nearest neighbors to return
+     * \param[in] params Search parameters, forwarded to findNeighbors()
+     */
+    void knnSearch(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, int knn, const SearchParams& params) CV_OVERRIDE
+    {
+        CV_Assert(queries.cols == veclen());
+        CV_Assert(indices.rows >= queries.rows);
+        CV_Assert(dists.rows >= queries.rows);
+        CV_Assert(int(indices.cols) >= knn);
+        CV_Assert(int(dists.cols) >= knn);
+
+        KNNSimpleResultSet<DistanceType> neighbors(knn);
+        for (size_t q = 0; q < queries.rows; ++q) {
+            neighbors.init(indices[q], dists[q]);
+            findNeighbors(neighbors, queries[q], params);
+        }
+    }
+    /** Returns a copy of the parameter map stored in this index. */
+    IndexParams getParameters() const CV_OVERRIDE
+    {
+        return index_params_;
+    }
+
+    /**
+     * Find set of nearest neighbors to vec. Their indices are stored inside
+     * the result object.
+     *
+     * Params:
+     *     result = the result object in which the indices of the nearest-neighbors are stored
+     *     vec = the vector for which to search the nearest neighbors
+     *     searchParams = search options; only "eps" is read here (default 0.0f)
+     */
+    void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
+    {
+        float epsFactor = 1+get_param(searchParams,"eps",0.0f);
+
+        std::vector<DistanceType> perDimDist(dim_,0);
+        DistanceType initialDist = computeInitialDistances(vec, perDimDist);
+        searchLevel(result, vec, root_node_, initialDist, perDimDist, epsFactor);
+    }
+
+private:
+
+
+    /*--------------------- Internal Data Structures --------------------------*/
+    struct Node
+    {
+        /**
+         * Leaf node: range [left, right) of positions in vind_ covered by the leaf
+         */
+        int left, right;
+        /**
+         * Inner node: dimension used for subdivision.
+         */
+        int divfeat;
+        /**
+         * Inner node: upper bound of child1's box and lower bound of child2's box along divfeat.
+         */
+        DistanceType divlow, divhigh;
+        /**
+         * The child nodes; both are NULL for a leaf.
+         */
+        Node* child1, * child2;
+    };
+    typedef Node* NodePtr;
+
+    // Range of values [low, high] along one dimension.
+    struct Interval
+    {
+        DistanceType low, high;
+    };
+    // Bounding box: one Interval per dimension.
+    typedef std::vector<Interval> BoundingBox;
+
+    typedef BranchStruct<NodePtr, DistanceType> BranchSt;
+    typedef BranchSt* Branch;
+
+
+    // Writes the node *tree to stream with save_value(), then recurses into each
+    // non-NULL child (child1 before child2), so a node always precedes its children.
+    void save_tree(FILE* stream, NodePtr tree)
+    {
+        save_value(stream, *tree);
+        if (tree->child1!=NULL) {
+            save_tree(stream, tree->child1);
+        }
+        if (tree->child2!=NULL) {
+            save_tree(stream, tree->child2);
+        }
+    }
+    // Reads one node from stream into storage taken from pool_, then rebuilds every child
+    // whose loaded pointer is non-NULL; the recursive call overwrites that pointer in place.
+    void load_tree(FILE* stream, NodePtr& tree)
+    {
+        tree = pool_.allocate<Node>();
+        load_value(stream, *tree);
+        if (tree->child1!=NULL) {
+            load_tree(stream, tree->child1);
+        }
+        if (tree->child2!=NULL) {
+            load_tree(stream, tree->child2);
+        }
+    }
+    // Computes, for each of the dim_ dimensions, the minimum and maximum over all dataset rows.
+    // Row 0 seeds the intervals, so the dataset is expected to contain at least one row.
+    void computeBoundingBox(BoundingBox& bbox)
+    {
+        bbox.resize(dim_);
+        for (size_t d=0; d<dim_; ++d) {
+            bbox[d].low = bbox[d].high = (DistanceType)dataset_[0][d];
+        }
+        for (size_t row=1; row<dataset_.rows; ++row) {
+            for (size_t d=0; d<dim_; ++d) {
+                const ElementType value = dataset_[row][d];
+                if (value<bbox[d].low) bbox[d].low = (DistanceType)value;
+                if (value>bbox[d].high) bbox[d].high = (DistanceType)value;
+            }
+        }
+    }
+
+
+    /**
+     * Create a tree node that subdivides the list of vecs from vind[left]
+     * to vind[right-1].  The routine is called recursively on each sublist.
+     * On return, bbox holds the bounding box of the points below the node.
+     *
+     * Params: left = index of the first vector
+     *         right = one past the index of the last vector
+     *         bbox = in: box passed to middleSplit_; out: box of the points below the node
+     */
+    NodePtr divideTree(int left, int right, BoundingBox& bbox)
+    {
+        NodePtr node = pool_.allocate<Node>(); // node storage comes from pool_
+
+        /* If too few exemplars remain, then make this a leaf node. */
+        if ( (right-left) <= leaf_max_size_) {
+            node->child1 = node->child2 = NULL;    /* Mark as leaf node. */
+            node->left = left;
+            node->right = right;
+
+            // compute bounding-box of leaf points, seeded with the first point of the range
+            for (size_t i=0; i<dim_; ++i) {
+                bbox[i].low = (DistanceType)dataset_[vind_[left]][i];
+                bbox[i].high = (DistanceType)dataset_[vind_[left]][i];
+            }
+            for (int k=left+1; k<right; ++k) {
+                for (size_t i=0; i<dim_; ++i) {
+                    if (bbox[i].low>dataset_[vind_[k]][i]) bbox[i].low=(DistanceType)dataset_[vind_[k]][i];
+                    if (bbox[i].high<dataset_[vind_[k]][i]) bbox[i].high=(DistanceType)dataset_[vind_[k]][i];
+                }
+            }
+        }
+        else {
+            int idx;
+            int cutfeat;
+            DistanceType cutval;
+            middleSplit_(&vind_[0]+left, right-left, idx, cutfeat, cutval, bbox);
+
+            node->divfeat = cutfeat;
+            // Each child starts from a copy of bbox clipped at cutval and updates it during its own build.
+            BoundingBox left_bbox(bbox);
+            left_bbox[cutfeat].high = cutval;
+            node->child1 = divideTree(left, left+idx, left_bbox);
+
+            BoundingBox right_bbox(bbox);
+            right_bbox[cutfeat].low = cutval;
+            node->child2 = divideTree(left+idx, right, right_bbox);
+            // Read the bounds back after the recursive calls, which may have changed the child boxes.
+            node->divlow = left_bbox[cutfeat].high;
+            node->divhigh = right_bbox[cutfeat].low;
+            // The box of this node is the union of the two child boxes.
+            for (size_t i=0; i<dim_; ++i) {
+                bbox[i].low = std::min(left_bbox[i].low, right_bbox[i].low);
+                bbox[i].high = std::max(left_bbox[i].high, right_bbox[i].high);
+            }
+        }
+
+        return node;
+    }
+
+    // Finds the smallest and largest value of dimension 'dim' among the 'count' points listed in 'ind'.
+    void computeMinMax(int* ind, int count, int dim, ElementType& min_elem, ElementType& max_elem)
+    {
+        min_elem = max_elem = dataset_[ind[0]][dim];
+        for (int n=1; n<count; ++n) {
+            const ElementType cur = dataset_[ind[n]][dim];
+            if (cur<min_elem) min_elem = cur;
+            if (cur>max_elem) max_elem = cur;
+        }
+    }
+
+    // Picks a split dimension with a large point spread, cuts it at the midpoint of the
+    // observed values and partitions ind[]; 'index' receives the split position within ind[].
+    void middleSplit(int* ind, int count, int& index, int& cutfeat, DistanceType& cutval, const BoundingBox& bbox)
+    {
+        // find the largest span from the approximate bounding box
+        ElementType max_span = bbox[0].high-bbox[0].low;
+        cutfeat = 0;
+        cutval = (bbox[0].high+bbox[0].low)/2;
+        for (size_t i=1; i<dim_; ++i) {
+            ElementType span = bbox[i].high-bbox[i].low;
+            if (span>max_span) {
+                max_span = span;
+                cutfeat = (int)i;
+                cutval = (bbox[i].high+bbox[i].low)/2;
+            }
+        }
+        // compute exact span on the found dimension
+        ElementType min_elem, max_elem;
+        computeMinMax(ind, count, cutfeat, min_elem, max_elem);
+        cutval = (min_elem+max_elem)/2;
+        max_span = max_elem - min_elem;
+        // check if a dimension of a largest span exists
+        size_t k = (size_t)cutfeat;
+        for (size_t i=0; i<dim_; ++i) {
+            if (i==k) continue;
+            ElementType span = bbox[i].high-bbox[i].low;
+            if (span>max_span) {
+                computeMinMax(ind, count, (int)i, min_elem, max_elem);
+                span = max_elem - min_elem;
+                if (span>max_span) {
+                    max_span = span;
+                    cutfeat = (int)i;
+                    cutval = (min_elem+max_elem)/2;
+                }
+            }
+        }
+        int lim1, lim2;
+        planeSplit(ind, count, cutfeat, cutval, lim1, lim2);
+        // index = count/2 clamped to [lim1, lim2], keeping the split as balanced as possible
+        if (lim1>count/2) index = lim1;
+        else if (lim2<count/2) index = lim2;
+        else index = count/2;
+    }
+
+
+    void middleSplit_(int* ind, int count, int& index, int& cutfeat, DistanceType& cutval, const BoundingBox& bbox)
+    {   // Picks the split dimension and cut value, then partitions ind[] via planeSplit.
+        const float EPS=0.00001f;   // relative tolerance for "as wide as the widest dimension"
+        DistanceType max_span = bbox[0].high-bbox[0].low;   // widest extent of bbox
+        for (size_t i=1; i<dim_; ++i) {
+            DistanceType span = bbox[i].high-bbox[i].low;
+            if (span>max_span) {
+                max_span = span;
+            }
+        }
+        DistanceType max_spread = -1;   // -1 so that the first candidate always wins
+        cutfeat = 0;
+        for (size_t i=0; i<dim_; ++i) {
+            DistanceType span = bbox[i].high-bbox[i].low;
+            if (span>(DistanceType)((1-EPS)*max_span)) {   // only near-widest dimensions are candidates
+                ElementType min_elem, max_elem;
+                computeMinMax(ind, count, (int)i, min_elem, max_elem);
+                DistanceType spread = (DistanceType)(max_elem-min_elem);   // actual spread of the points
+                if (spread>max_spread) {
+                    cutfeat = (int)i;
+                    max_spread = spread;
+                }
+            }
+        }
+        // split in the middle
+        DistanceType split_val = (bbox[cutfeat].low+bbox[cutfeat].high)/2;
+        ElementType min_elem, max_elem;
+        computeMinMax(ind, count, cutfeat, min_elem, max_elem);
+        // clamp the cut to the range actually covered by the points
+        if (split_val<min_elem) cutval = (DistanceType)min_elem;
+        else if (split_val>max_elem) cutval = (DistanceType)max_elem;
+        else cutval = split_val;
+        // partition ind[] around the cut
+        int lim1, lim2;
+        planeSplit(ind, count, cutfeat, cutval, lim1, lim2);
+        // index = count/2 clamped to [lim1, lim2]
+        if (lim1>count/2) index = lim1;
+        else if (lim2<count/2) index = lim2;
+        else index = count/2;
+    }
+
+
+    /**
+     *  Subdivide the list of points by a plane perpendicular to the axis corresponding
+     *  to the 'cutfeat' dimension at 'cutval' position.
+     *
+     *  On return:
+     *  dataset[ind[0..lim1-1]][cutfeat]<cutval
+     *  dataset[ind[lim1..lim2-1]][cutfeat]==cutval
+     *  dataset[ind[lim2..count-1]][cutfeat]>cutval
+     */
+    void planeSplit(int* ind, int count, int cutfeat, DistanceType cutval, int& lim1, int& lim2)
+    {
+        /* Move vector indices for left subtree to front of list. */
+        int left = 0;
+        int right = count-1;
+        for (;; ) {
+            while (left<=right && dataset_[ind[left]][cutfeat]<cutval) ++left;      // skip values already on the low side
+            while (left<=right && dataset_[ind[right]][cutfeat]>=cutval) --right;   // skip values already on the high side
+            if (left>right) break;
+            std::swap(ind[left], ind[right]); ++left; --right;
+        }
+        /* ind[0..left-1] now holds the values < cutval. Second pass over the
+         * rest: move the values == cutval in front of the values > cutval.
+         */
+        lim1 = left;
+        right = count-1;
+        for (;; ) {
+            while (left<=right && dataset_[ind[left]][cutfeat]<=cutval) ++left;
+            while (left<=right && dataset_[ind[right]][cutfeat]>cutval) --right;
+            if (left>right) break;
+            std::swap(ind[left], ind[right]); ++left; --right;
+        }
+        lim2 = left;
+    }
+
+    // Sums the per-dimension distances from 'vec' to the root box; 'dists' is only written where 'vec' lies outside.
+    DistanceType computeInitialDistances(const ElementType* vec, std::vector<DistanceType>& dists)
+    {
+        DistanceType total = 0.0;
+        for (size_t d = 0; d < dim_; ++d) {
+            const Interval& range = root_bbox_[d];
+            if (vec[d] < range.low) {
+                dists[d] = distance_.accum_dist(vec[d], range.low, (int)d);
+                total += dists[d];
+            }
+            if (vec[d] > range.high) {
+                dists[d] = distance_.accum_dist(vec[d], range.high, (int)d);
+                total += dists[d];
+            }
+        }
+        return total;
+    }
+
+    /**
+     * Performs an exact search in the tree starting from a node.
+     */
+    void searchLevel(ResultSet<DistanceType>& result_set, const ElementType* vec, const NodePtr node, DistanceType mindistsq,
+                     std::vector<DistanceType>& dists, const float epsError)
+    {
+        /* If this is a leaf node, then do check and return. */
+        if ((node->child1 == NULL)&&(node->child2 == NULL)) {
+            DistanceType worst_dist = result_set.worstDist();   // sampled once for the whole leaf
+            if (reorder_) {   // data_ is indexed by tree position (presumably rows stored in vind_ order - confirm)
+                for (int i=node->left; i<node->right; ++i) {
+                    DistanceType dist = distance_(vec, data_[i], dim_, worst_dist);
+                    if (dist<worst_dist) {
+                        result_set.addPoint(dist,vind_[i]);   // report the original dataset index
+                    }
+                }
+            } else {          // data_ is indexed by original dataset index
+                for (int i=node->left; i<node->right; ++i) {
+                    DistanceType dist = distance_(vec, data_[vind_[i]], dim_, worst_dist);
+                    if (dist<worst_dist) {
+                        result_set.addPoint(dist,vind_[i]);
+                    }
+                }
+            }
+            return;
+        }
+
+        /* Which child branch should be taken first? */
+        int idx = node->divfeat;
+        ElementType val = vec[idx];
+        DistanceType diff1 = val - node->divlow;
+        DistanceType diff2 = val - node->divhigh;
+        // diff1+diff2 < 0 means val lies below the midpoint of [divlow, divhigh]
+        NodePtr bestChild;
+        NodePtr otherChild;
+        DistanceType cut_dist;
+        if ((diff1+diff2)<0) {
+            bestChild = node->child1;
+            otherChild = node->child2;
+            cut_dist = distance_.accum_dist(val, node->divhigh, idx);   // distance to the far child's boundary
+        }
+        else {
+            bestChild = node->child2;
+            otherChild = node->child1;
+            cut_dist = distance_.accum_dist( val, node->divlow, idx);
+        }
+
+        /* Call recursively to search next level down. */
+        searchLevel(result_set, vec, bestChild, mindistsq, dists, epsError);
+        // bound for the other child: swap this dimension's old contribution for cut_dist
+        DistanceType dst = dists[idx];
+        mindistsq = mindistsq + cut_dist - dst;
+        dists[idx] = cut_dist;
+        if (mindistsq*epsError<=result_set.worstDist()) {   // otherwise the other child is skipped
+            searchLevel(result_set, vec, otherChild, mindistsq, dists, epsError);
+        }
+        dists[idx] = dst;   // restore the caller's value
+    }
+
+private:
+
+    /**
+     * The dataset used by this index
+     */
+    const Matrix<ElementType> dataset_;
+
+    IndexParams index_params_;
+
+    int leaf_max_size_;
+    bool reorder_;
+
+
+    /**
+     *  Array of indices to vectors in the dataset.
+     */
+    std::vector<int> vind_;
+
+    Matrix<ElementType> data_;
+
+    size_t size_;
+    size_t dim_;
+
+    /**
+     * Root node of the k-d tree used to find neighbours.
+     */
+    NodePtr root_node_;
+
+    BoundingBox root_bbox_;
+
+    /**
+     * Pooled memory allocator.
+     *
+     * Using a pooled memory allocator is more efficient
+     * than allocating memory directly when there is a large
+     * number of small memory allocations.
+     */
+    PooledAllocator pool_;
+
+    Distance distance_;
+};   // class KDTree
+
+}
+
+//! @endcond
+
+#endif //OPENCV_FLANN_KDTREE_SINGLE_INDEX_H_
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/kmeans_index.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/kmeans_index.h
new file mode 100644
index 000000000000..fd7fe2bd39f4
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/kmeans_index.h
@@ -0,0 +1,1819 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_KMEANS_INDEX_H_
+#define OPENCV_FLANN_KMEANS_INDEX_H_
+
+//! @cond IGNORED
+
+#include <algorithm>
+#include <map>
+#include <limits>
+#include <cmath>
+
+#include "general.h"
+#include "nn_index.h"
+#include "dist.h"
+#include "matrix.h"
+#include "result_set.h"
+#include "heap.h"
+#include "allocator.h"
+#include "random.h"
+#include "saving.h"
+#include "logger.h"
+
+#define BITS_PER_CHAR 8
+#define BITS_PER_BASE 2 // for DNA/RNA sequences
+#define BASE_PER_CHAR (BITS_PER_CHAR/BITS_PER_BASE)
+#define HISTOS_PER_BASE (1<<BITS_PER_BASE)
+
+
+namespace cvflann
+{
+
+struct KMeansIndexParams : public IndexParams
+{
+    /** Stores every k-means setting under its string key; "algorithm" is always FLANN_INDEX_KMEANS. */
+    KMeansIndexParams(int branching = 32, int iterations = 11,
+                      flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM,
+                      float cb_index = 0.2, int trees = 1 )
+    {
+        IndexParams& self = *this;
+        self["algorithm"] = FLANN_INDEX_KMEANS;
+        self["branching"] = branching;          // branching factor
+        // max iterations to perform in one kmeans clustering (kmeans tree)
+        self["iterations"] = iterations;
+        // algorithm used for picking the initial cluster centers for kmeans tree
+        self["centers_init"] = centers_init;
+        // cluster boundary index. Used when searching the kmeans tree
+        self["cb_index"] = cb_index;
+        self["trees"] = trees;                  // number of kmeans trees to search in
+    }
+};
+
+
+/**
+ * Hierarchical kmeans index
+ *
+ * Contains a tree constructed through a hierarchical kmeans clustering
+ * and other information for indexing a set of points for nearest-neighbour matching.
+ */
+template <typename Distance>
+class KMeansIndex : public NNIndex<Distance>
+{
+public:
+    typedef typename Distance::ElementType ElementType;
+    typedef typename Distance::ResultType DistanceType;
+    typedef typename Distance::CentersType CentersType;
+
+    typedef typename Distance::is_kdtree_distance is_kdtree_distance;
+    typedef typename Distance::is_vector_space_distance is_vector_space_distance;
+
+
+
+    typedef void (KMeansIndex::* centersAlgFunction)(int, int*, int, int*, int&);
+
+    /**
+     * The function used for choosing the cluster centers.
+     */
+    centersAlgFunction chooseCenters;
+
+
+
+    /**
+     * Chooses the initial centers in the k-means clustering in a random manner.
+     *
+     * Params:
+     *     k = number of centers
+     *     indices = indices in the dataset
+     *     indices_length = length of indices vector
+     *     centers = output: the chosen centers, as dataset indices
+     *     centers_length = output: number of centers found (may be less than k)
+     */
+    // Draws random points as centers, rejecting any that is closer than 1e-16 to an earlier one.
+    void chooseCentersRandom(int k, int* indices, int indices_length, int* centers, int& centers_length)
+    {
+        UniqueRandom r(indices_length);
+
+        int chosen = 0;
+        while (chosen<k) {
+            bool duplicate;
+            do {
+                const int rnd = r.next();
+                if (rnd<0) {
+                    // negative value from r.next(): stop early with the centers found so far
+                    centers_length = chosen;
+                    return;
+                }
+                centers[chosen] = indices[rnd];
+
+                // compare the candidate against every center accepted before it
+                duplicate = false;
+                for (int j=0; j<chosen; ++j) {
+                    DistanceType sq = distance_(dataset_[centers[chosen]], dataset_[centers[j]], dataset_.cols);
+                    if (sq<1e-16) duplicate = true;
+                }
+            } while (duplicate);
+            ++chosen;
+        }
+
+        centers_length = chosen;
+    }
+
+
+    /**
+     * Chooses the initial centers in the k-means using Gonzales' algorithm
+     * so that the centers are spaced apart from each other.
+     *
+     * Params:
+     *     k = number of centers
+     *     indices = indices in the dataset (indices_length entries)
+     *     centers = output: the chosen centers, as dataset indices
+     *     centers_length = output: number of centers found (may be less than k)
+     */
+    void chooseCentersGonzales(int k, int* indices, int indices_length, int* centers, int& centers_length)
+    {
+        int n = indices_length;
+
+        // the first center is a random point
+        int rnd = rand_int(n);
+        CV_DbgAssert(rnd >=0 && rnd < n);
+        centers[0] = indices[rnd];
+
+        int found = 1;
+        while (found<k) {
+            // look for the point whose nearest already-chosen center is farthest away
+            int farthest = -1;
+            DistanceType farthest_dist = 0;
+            for (int j=0; j<n; ++j) {
+                DistanceType nearest = distance_(dataset_[centers[0]],dataset_[indices[j]],dataset_.cols);
+                for (int c=1; c<found; ++c) {
+                    DistanceType candidate = distance_(dataset_[centers[c]],dataset_[indices[j]],dataset_.cols);
+                    if (candidate<nearest) nearest = candidate;
+                }
+                if (nearest>farthest_dist) {
+                    farthest_dist = nearest;
+                    farthest = j;
+                }
+            }
+
+            // no point lies at a positive distance from the chosen centers: stop early
+            if (farthest==-1) {
+                break;
+            }
+
+            centers[found] = indices[farthest];
+            ++found;
+        }
+        centers_length = found;
+    }
+
+
+    /**
+     * Chooses the initial centers in the k-means using the algorithm
+     * proposed in the KMeans++ paper:
+     * Arthur, David; Vassilvitskii, Sergei - k-means++: The Advantages of Careful Seeding
+     *
+     * Implementation of this function was converted from the one provided in Arthur's code.
+     *
+     * Params:
+     *     k = number of centers
+     *     indices = indices in the dataset (indices_length entries)
+     *     centers = output: the chosen centers, as dataset indices
+     *     centers_length = output: number of centers chosen
+     */
+    // NOTE: closestDistSq lives in a std::vector so that it is released on every exit path.
+    void chooseCentersKMeanspp(int k, int* indices, int indices_length, int* centers, int& centers_length)
+    {
+        int n = indices_length;
+
+        double currentPot = 0;
+        // squared distance from each point to its closest center chosen so far
+        std::vector<DistanceType> closestDistSq(n);
+
+        // Choose one random center and set the closestDistSq values
+        int index = rand_int(n);
+        CV_DbgAssert(index >=0 && index < n);
+        centers[0] = indices[index];
+
+        for (int i = 0; i < n; i++) {
+            closestDistSq[i] = distance_(dataset_[indices[i]], dataset_[indices[index]], dataset_.cols);
+            closestDistSq[i] = ensureSquareDistance<Distance>( closestDistSq[i] );
+            currentPot += closestDistSq[i];
+        }
+
+
+        const int numLocalTries = 1;
+
+        // Choose each center
+        int centerCount;
+        for (centerCount = 1; centerCount < k; centerCount++) {
+
+            // Repeat several trials
+            double bestNewPot = -1;
+            int bestNewIndex = -1;
+            for (int localTrial = 0; localTrial < numLocalTries; localTrial++) {
+
+                // Choose our center - have to be slightly careful to return a valid answer even accounting
+                // for possible rounding errors
+                double randVal = rand_double(currentPot);
+                for (index = 0; index < n-1; index++) {
+                    if (randVal <= closestDistSq[index]) break;
+                    else randVal -= closestDistSq[index];
+                }
+
+                // Compute the new potential
+                double newPot = 0;
+                for (int i = 0; i < n; i++) {
+                    DistanceType dist = distance_(dataset_[indices[i]], dataset_[indices[index]], dataset_.cols);
+                    newPot += std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] );
+                }
+
+                // Store the best result
+                if ((bestNewPot < 0)||(newPot < bestNewPot)) {
+                    bestNewPot = newPot;
+                    bestNewIndex = index;
+                }
+            }
+
+            // Add the appropriate center
+            centers[centerCount] = indices[bestNewIndex];
+            currentPot = bestNewPot;
+            for (int i = 0; i < n; i++) {
+                DistanceType dist = distance_(dataset_[indices[i]], dataset_[indices[bestNewIndex]], dataset_.cols);
+                closestDistSq[i] = std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] );
+            }
+        }
+
+        centers_length = centerCount;
+    }
+
+
+
+public:
+
+    flann_algorithm_t getType() const CV_OVERRIDE
+    {
+        return FLANN_INDEX_KMEANS; // constant tag; also stored under "algorithm" by loadIndex()
+    }
+
+    /** Parallel loop body that assigns each point of a range to its nearest cluster center. */
+    template<class CentersContainerType>
+    class KMeansDistanceComputer : public cv::ParallelLoopBody
+    {
+    public:
+        KMeansDistanceComputer(Distance _distance, const Matrix<ElementType>& _dataset,
+            const int _branching, const int* _indices, const CentersContainerType& _dcenters,
+            const size_t _veclen, std::vector<int> &_new_centroids,
+            std::vector<DistanceType> &_sq_dists)
+            : distance(_distance), dataset(_dataset)
+            , branching(_branching), indices(_indices)
+            , dcenters(_dcenters), veclen(_veclen)
+            , new_centroids(_new_centroids), sq_dists(_sq_dists)
+        {
+        }
+
+        /**
+         * For every i in [range.start, range.end): finds the center closest to point
+         * indices[i] and stores its number in new_centroids[i] and its distance in sq_dists[i].
+         * Ties keep the center with the lowest number.
+         */
+        void operator()(const cv::Range& range) const CV_OVERRIDE
+        {
+            for (int i = range.start; i < range.end; ++i)
+            {
+                int nearest = 0;
+                DistanceType nearest_sq = distance(dataset[indices[i]], dcenters[0], veclen);
+                for (int c = 1; c < branching; ++c) {
+                    const DistanceType candidate_sq = distance(dataset[indices[i]], dcenters[c], veclen);
+                    if (nearest_sq > candidate_sq) {
+                        nearest = c;
+                        nearest_sq = candidate_sq;
+                    }
+                }
+                new_centroids[i] = nearest;
+                sq_dists[i] = nearest_sq;
+            }
+        }
+
+    private:
+        Distance distance;                       // copy of the distance functor
+        const Matrix<ElementType>& dataset;
+        const int branching;                     // number of centers in dcenters
+        const int* indices;                      // maps loop position to dataset row
+        const CentersContainerType& dcenters;
+        const size_t veclen;
+        std::vector<int> &new_centroids;         // output, indexed by loop position
+        std::vector<DistanceType> &sq_dists;     // output, indexed by loop position
+        // assignment is private and does nothing: the reference/const members cannot be re-seated
+        KMeansDistanceComputer& operator=( const KMeansDistanceComputer & ) { return *this; }
+    };
+
+    /**
+     * Index constructor
+     *
+     * Params:
+     *          inputData = dataset with the input features
+     *          params = parameters passed to the hierarchical k-means algorithm
+     */
+    KMeansIndex(const Matrix<ElementType>& inputData, const IndexParams& params = KMeansIndexParams(),
+                Distance d = Distance())
+        : dataset_(inputData), index_params_(params), root_(NULL), indices_(NULL), distance_(d)
+    {
+        memoryCounter_ = 0;
+        size_ = dataset_.rows;
+        veclen_ = dataset_.cols;
+        branching_ = get_param(params,"branching",32);
+        trees_ = get_param(params,"trees",1);
+        iterations_ = get_param(params,"iterations",11);
+        // a negative iteration count means "no limit"
+        if (iterations_<0) iterations_ = (std::numeric_limits<int>::max)();
+
+        // bind the member function that picks the initial cluster centers
+        centers_init_  = get_param(params,"centers_init",FLANN_CENTERS_RANDOM);
+        switch (centers_init_) {
+        case FLANN_CENTERS_RANDOM:
+            chooseCenters = &KMeansIndex::chooseCentersRandom;
+            break;
+        case FLANN_CENTERS_GONZALES:
+            chooseCenters = &KMeansIndex::chooseCentersGonzales;
+            break;
+        case FLANN_CENTERS_KMEANSPP:
+            chooseCenters = &KMeansIndex::chooseCentersKMeanspp;
+            break;
+        default:
+            FLANN_THROW(cv::Error::StsBadArg, "Unknown algorithm for choosing initial centers.");
+        }
+        // note: cb_index_ is fixed here; any "cb_index" entry in params is not read
+        cb_index_ = 0.4f;
+
+        // one (still empty) root and index array slot per tree
+        root_ = new KMeansNodePtr[trees_];
+        indices_ = new int*[trees_];
+        for (int t=0; t<trees_; ++t) {
+            root_[t] = NULL;
+            indices_[t] = NULL;
+        }
+    }
+
+
+    KMeansIndex(const KMeansIndex&);
+    KMeansIndex& operator=(const KMeansIndex&);
+
+
+    /**
+     * Index destructor.
+     *
+     * Release the memory used by the index.
+     */
+    virtual ~KMeansIndex()
+    {
+        // Both helpers return immediately when their array pointer is NULL, and
+        // delete[] of a NULL pointer has no effect, so no explicit checks are needed.
+
+        free_centers();     // pivot arrays of every tree node
+        delete[] root_;
+
+        free_indices();     // per-tree index arrays
+        delete[] indices_;
+    }
+
+    /**
+     *  Returns size of index.
+     */
+    size_t size() const CV_OVERRIDE
+    {
+        return size_; // set by the constructor to the number of rows of the dataset
+    }
+
+    /**
+     * Returns the length of an index feature.
+     */
+    size_t veclen() const CV_OVERRIDE
+    {
+        return veclen_; // set by the constructor to the number of columns of the dataset
+    }
+
+
+    void set_cb_index( float index)
+    {
+        cb_index_ = index; // replaces the value set by the constructor (0.4f) or by loadIndex()
+    }
+
+    /**
+     * Computes the index memory usage
+     * Returns: memory used by the index
+     */
+    int usedMemory() const CV_OVERRIDE
+    {
+        return pool_.usedMemory+pool_.wastedMemory+memoryCounter_; // pool counters plus memoryCounter_
+    }
+
+    /**
+     * Builds the index
+     */
+    void buildIndex() CV_OVERRIDE
+    {
+        if (branching_<2) {
+            FLANN_THROW(cv::Error::StsError, "Branching factor must be at least 2");
+        }
+
+        free_indices();
+
+        for (int t=0; t<trees_; ++t) {
+            // start every tree from the identity permutation of the dataset rows
+            int* order = new int[size_];
+            indices_[t] = order;
+            for (size_t row=0; row<size_; ++row) order[row] = int(row);
+
+            root_[t] = pool_.allocate<KMeansNode>();
+            std::memset(root_[t], 0, sizeof(KMeansNode));
+
+            Distance* dummy = NULL;
+            computeNodeStatistics(root_[t], indices_[t], (unsigned int)size_, dummy);
+            computeClustering(root_[t], indices_[t], (int)size_, branching_,0);
+        }
+    }
+
+
+    void saveIndex(FILE* stream) CV_OVERRIDE
+    {   // Field order below must stay in sync with loadIndex(); centers_init_ is not written.
+        save_value(stream, branching_);
+        save_value(stream, iterations_);
+        save_value(stream, memoryCounter_);
+        save_value(stream, cb_index_);
+        save_value(stream, trees_);
+        for (int i=0; i<trees_; ++i) {
+            save_value(stream, *indices_[i], (int)size_);   // the tree's index array (size_ ints)
+            save_tree(stream, root_[i], i);                 // then the tree's nodes
+        }
+    }
+
+
+    // Restores an index written by saveIndex(). The root array is re-created because
+    // the stored tree count may differ from the one this index was constructed with.
+    void loadIndex(FILE* stream) CV_OVERRIDE
+    {
+        // release the old trees while trees_ still holds their count
+        free_centers();
+        free_indices();
+        delete[] root_;     root_ = NULL;
+        delete[] indices_;  indices_ = NULL;
+
+        load_value(stream, branching_);
+        load_value(stream, iterations_);
+        load_value(stream, memoryCounter_);
+        load_value(stream, cb_index_);
+        load_value(stream, trees_);
+
+        // "()" value-initializes: every slot starts as a null pointer
+        root_ = new KMeansNodePtr[trees_]();
+        indices_ = new int*[trees_]();
+        for (int i=0; i<trees_; ++i) {
+            indices_[i] = new int[size_];
+            load_value(stream, *indices_[i], size_);
+            load_tree(stream, root_[i], i);
+        }
+        index_params_["algorithm"] = getType();
+        index_params_["branching"] = branching_;
+        index_params_["trees"] = trees_;
+        index_params_["iterations"] = iterations_;
+        index_params_["centers_init"] = centers_init_;
+        index_params_["cb_index"] = cb_index_;
+    }
+
+
+    /**
+     * Find set of nearest neighbors to vec. Their indices are stored inside
+     * the result object.
+     *
+     * Params:
+     *     result = the result object in which the indices of the nearest-neighbors are stored
+     *     vec = the vector for which to search the nearest neighbors
+     *     searchParams = parameters that influence the search algorithm (checks, cb_index)
+     */
+    void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
+    {
+        const int maxChecks = get_param(searchParams,"checks",32);
+
+        // unlimited checks: exact search (only the first tree is consulted)
+        if (maxChecks==FLANN_CHECKS_UNLIMITED) {
+            findExactNN(root_[0], result, vec);
+            return;
+        }
+
+        // Priority queue storing intermediate branches in the best-bin-first search
+        const cv::Ptr<Heap<BranchSt>>& heap = Heap<BranchSt>::getPooledInstance(cv::utils::getThreadID(), (int)size_);
+
+        // search tree after tree until the check budget is spent and the result set is full
+        int checks = 0;
+        for (int t=0; t<trees_; ++t) {
+            findNN(root_[t], result, vec, checks, maxChecks, heap);
+            if ((checks >= maxChecks) && result.full())
+                break;
+        }
+        // then continue from the branches left in the heap, smallest first
+        BranchSt branch;
+        while (heap->popMin(branch) && (checks<maxChecks || !result.full())) {
+            findNN(branch.node, result, vec, checks, maxChecks, heap);
+        }
+        CV_Assert(result.full());
+    }
+
+    /**
+     * Clustering function that takes a cut in the hierarchical k-means
+     * tree and returns the cluster centers of that clustering.
+     * Params:
+     *     centers = output matrix; its row count is the number of clusters requested
+     * Returns: number of cluster centers actually written to 'centers'
+     */
+    int getClusterCenters(Matrix<CentersType>& centers)
+    {
+        int numClusters = centers.rows;
+        if (numClusters<1) {
+            FLANN_THROW(cv::Error::StsBadArg, "Number of clusters must be at least 1");
+        }
+
+        DistanceType variance;
+        // vector instead of new[]/delete[]: nothing leaks if a call below throws
+        std::vector<KMeansNodePtr> clusters(numClusters);
+
+        int clusterCount = getMinVarianceClusters(root_[0], clusters.data(), numClusters, variance);
+
+        Logger::info("Clusters requested: %d, returning %d\n",numClusters, clusterCount);
+        // copy the pivot of every selected node into the matching output row
+        for (int i=0; i<clusterCount; ++i) {
+            CentersType* center = clusters[i]->pivot;
+            for (size_t j=0; j<veclen_; ++j) {
+                centers[i][j] = center[j];
+            }
+        }
+
+        return clusterCount;
+    }
+
+    IndexParams getParameters() const CV_OVERRIDE
+    {
+        return index_params_; // returned by value, i.e. the caller gets a copy
+    }
+
+
+private:
+    /**
+     * Structure representing a node in the hierarchical k-means tree.
+     */
+    struct KMeansNode
+    {
+        /**
+         * The cluster center (array released with delete[] in free_centers).
+         */
+        CentersType* pivot;
+        /**
+         * The cluster radius.
+         */
+        DistanceType radius;
+        /**
+         * The cluster mean radius.
+         */
+        DistanceType mean_radius;
+        /**
+         * The cluster variance.
+         */
+        DistanceType variance;
+        /**
+         * The cluster size (number of points in the cluster)
+         */
+        int size;
+        /**
+         * Child nodes (only for non-terminal nodes); NULL marks a terminal node.
+         */
+        KMeansNode** childs;
+        /**
+         * Node points (only for terminal nodes); points into the tree's index array.
+         */
+        int* indices;
+        /**
+         * Level
+         */
+        int level;
+    };
+    typedef KMeansNode* KMeansNodePtr;
+
+    /**
+     * Alias definition for a nicer syntax.
+     */
+    typedef BranchStruct<KMeansNodePtr, DistanceType> BranchSt;
+
+
+
+
+    // Serializes 'node' and everything below it for tree number 'num':
+    // the node struct, its pivot, then either its children or its index offset.
+    void save_tree(FILE* stream, KMeansNodePtr node, int num)
+    {
+        save_value(stream, *node);
+        save_value(stream, *(node->pivot), (int)veclen_);
+        if (node->childs!=NULL) {
+            for(int c=0; c<branching_; ++c) save_tree(stream, node->childs[c], num);
+            return;
+        }
+        // terminal node: store where its points start, relative to this tree's index array
+        int indices_offset = (int)(node->indices - indices_[num]);
+        save_value(stream, indices_offset);
+    }
+
+
+    // Reads back a subtree written by save_tree into freshly allocated nodes.
+    void load_tree(FILE* stream, KMeansNodePtr& node, int num)
+    {
+        node = pool_.allocate<KMeansNode>();
+        load_value(stream, *node);
+        // the pointers just read are stale: pivot is replaced, childs only tells leaf from inner node
+        node->pivot = new CentersType[veclen_];
+        load_value(stream, *(node->pivot), (int)veclen_);
+
+        if (node->childs==NULL) {
+            int indices_offset;
+            load_value(stream, indices_offset);
+            node->indices = indices_[num] + indices_offset;
+            return;
+        }
+        node->childs = pool_.allocate<KMeansNodePtr>(branching_);
+        for(int c=0; c<branching_; ++c) load_tree(stream, node->childs[c], num);
+    }
+
+
+    /**
+     * Releases the pivot array of `node` and, recursively, of all its descendants.
+     * The nodes themselves are not freed here.
+     */
+    void free_centers(KMeansNodePtr node)
+    {
+        delete[] node->pivot;
+        if (node->childs == NULL) return;   // terminal node: no children to visit
+        for (int child = 0; child < branching_; ++child) {
+            free_centers(node->childs[child]);
+        }
+    }
+
+    // Releases the pivot arrays of all trees_ trees.
+    void free_centers()
+    {
+        if (root_ == NULL) return;   // no tree array, nothing to release
+        for (int tree = 0; tree < trees_; ++tree) {
+            if (root_[tree] != NULL) {
+                free_centers(root_[tree]);
+            }
+        }
+    }
+
+    /**
+     * Deletes each per-tree index array held in indices_ and resets its slot to NULL.
+     * The outer indices_ array itself is not released here.
+     */
+    void free_indices()
+    {
+        if (indices_ == NULL) return;
+        for (int tree = 0; tree < trees_; ++tree) {
+            if (indices_[tree] == NULL) continue;
+            delete[] indices_[tree];
+            // Clearing the slot makes a repeated call skip this entry.
+            indices_[tree] = NULL;
+        }
+    }
+
+    /**
+     * Computes the pivot (rounded mean vector), radius and variance of a node from
+     * its points and stores them in the node. mean_radius is not touched here.
+     *
+     * Params:
+     *     node = the node whose pivot, radius and variance are written
+     *     indices = dataset row ids of the points belonging to the node
+     *     indices_length = number of entries in indices
+     */
+    void computeNodeStatistics(KMeansNodePtr node, int* indices, unsigned int indices_length)
+    {
+        // This array becomes the node's pivot at the end.
+        CentersType* centroid = new CentersType[veclen_];
+        memoryCounter_ += int(veclen_*sizeof(CentersType));
+        memset(centroid,0,veclen_*sizeof(CentersType));
+
+        // First pass: per-dimension sums and the sum of the distances to the zero vector.
+        DistanceType spread = 0;
+        for (unsigned int p=0; p<indices_length; ++p) {
+            ElementType* point = dataset_[indices[p]];
+            for (size_t d=0; d<veclen_; ++d) centroid[d] += point[d];
+            spread += distance_(point, ZeroIterator<ElementType>(), veclen_);
+        }
+        const double point_count = static_cast<double>(indices_length);
+        for (size_t d=0; d<veclen_; ++d) {
+            centroid[d] = cvflann::round<CentersType>( centroid[d] / point_count );
+        }
+        // Mean distance to the zero vector minus the pivot's distance to the zero vector.
+        spread /= static_cast<DistanceType>( static_cast<float>(indices_length) );
+        spread -= distance_(centroid, ZeroIterator<ElementType>(), veclen_);
+
+        // Second pass: the radius is the largest distance from the pivot to a point.
+        DistanceType reach = 0;
+        for (unsigned int p=0; p<indices_length; ++p) {
+            const DistanceType to_pivot = distance_(centroid, dataset_[indices[p]], veclen_);
+            if (to_pivot>reach) reach = to_pivot;
+        }
+
+        node->variance = spread;
+        node->radius = reach;
+        node->pivot = centroid;
+    }
+
+
+    /**
+     * Variant of computeNodeStatistics() for vectors handled as arrays of bits: a bit of
+     * the pivot is set when at least half of the node's points have that bit set.
+     *
+     * Params:
+     *     node = the node whose pivot, radius and variance are written
+     *     indices = dataset row ids of the points belonging to the node
+     *     indices_length = number of entries in indices
+     */
+    void computeBitfieldNodeStatistics(KMeansNodePtr node, int* indices,
+                                       unsigned int indices_length)
+    {
+        // One counter per bit of a vector.
+        const unsigned int accumulator_veclen = static_cast<unsigned int>(
+                                                veclen_*sizeof(CentersType)*BITS_PER_CHAR);
+
+        CentersType* pivot_bits = new CentersType[veclen_];
+        memoryCounter_ += int(veclen_*sizeof(CentersType));
+        unsigned int* ones_per_bit = new unsigned int[accumulator_veclen];
+        memset(ones_per_bit, 0, sizeof(unsigned int)*accumulator_veclen);
+
+        // Count the set bits and sum the points' distances to the zero vector.
+        unsigned long long spread = 0ull;
+        for (unsigned int p=0; p<indices_length; ++p) {
+            spread += static_cast<unsigned long long>( ensureSquareDistance<Distance>(
+                        distance_(dataset_[indices[p]], ZeroIterator<ElementType>(), veclen_)));
+            unsigned char* bytes = (unsigned char*)dataset_[indices[p]];
+            for (size_t k=0, l=0; k<accumulator_veclen; k+=BITS_PER_CHAR, ++l) {
+                for (int bit=0; bit<8; ++bit) {
+                    ones_per_bit[k+bit] += (bytes[l]>>bit) & 0x01;
+                }
+            }
+        }
+        // Pack the pivot: bit value = ones / points, rounded by adding one half.
+        const double point_count = static_cast<double>(indices_length);
+        unsigned char* pivot_bytes = (unsigned char*)pivot_bits;
+        for (size_t k=0, l=0; k<accumulator_veclen; k+=BITS_PER_CHAR, ++l) {
+            int packed = 0;
+            for (int bit=0; bit<8; ++bit) {
+                packed |= ((int)(0.5 + (double)(ones_per_bit[k+bit]) / point_count))<<bit;
+            }
+            pivot_bytes[l] = static_cast<unsigned char>(packed);
+        }
+        spread = static_cast<unsigned long long>(
+                    0.5 + static_cast<double>(spread) / static_cast<double>(indices_length));
+        spread -= static_cast<unsigned long long>(
+                    ensureSquareDistance<Distance>(
+                        distance_(pivot_bits, ZeroIterator<ElementType>(), veclen_)));
+
+        // The radius is the largest distance from the pivot to a point.
+        DistanceType reach = 0;
+        for (unsigned int p=0; p<indices_length; ++p) {
+            const DistanceType to_pivot = distance_(pivot_bits, dataset_[indices[p]], veclen_);
+            if (to_pivot>reach) reach = to_pivot;
+        }
+
+        node->variance = static_cast<DistanceType>(spread);
+        node->radius = reach;
+        node->pivot = pivot_bits;
+        delete[] ones_per_bit;
+    }
+
+
+    /**
+     * Variant of computeNodeStatistics() for DNA vectors, where every byte packs four
+     * bases of two bits each: each base of the pivot is the most frequent base found at
+     * that position among the node's points.
+     *
+     * Params:
+     *     node = the node whose pivot, radius and variance are written
+     *     indices = dataset row ids of the points belonging to the node
+     *     indices_length = number of entries in indices
+     */
+    void computeDnaNodeStatistics(KMeansNodePtr node, int* indices,
+                                       unsigned int indices_length)
+    {
+        // One 4-bin histogram per base position.
+        const unsigned int histos_veclen = static_cast<unsigned int>(
+                    veclen_*sizeof(CentersType)*(HISTOS_PER_BASE*BASE_PER_CHAR));
+        unsigned long long variance = 0ull;
+        unsigned int* histograms = new unsigned int[histos_veclen];
+        memset(histograms, 0, sizeof(unsigned int)*histos_veclen);
+
+        for (unsigned int i=0; i<indices_length; ++i) {
+            variance += static_cast<unsigned long long>( ensureSquareDistance<Distance>(
+                        distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_)));
+            unsigned char* vec = (unsigned char*)dataset_[indices[i]];
+            for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) {
+                histograms[k +     ((vec[l])    & 0x03)]++;
+                histograms[k + 4 + ((vec[l]>>2) & 0x03)]++;
+                histograms[k + 8 + ((vec[l]>>4) & 0x03)]++;
+                histograms[k +12 + ((vec[l]>>6) & 0x03)]++;
+            }
+        }
+
+        CentersType* mean = new CentersType[veclen_];
+        memoryCounter_ += int(veclen_*sizeof(CentersType));
+        unsigned char* char_mean = (unsigned char*)mean;
+        for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) {
+            // The winning base (0..3) of each position is shifted back to the two bits it
+            // was read from. Ties resolve as before: bin 1 over 0, bin 3 over 2, upper pair
+            // over lower. Hex literals such as 0x10 do not denote the bit pattern "10".
+            unsigned int packed = 0;
+            for (unsigned int slot=0; slot<4; ++slot) {
+                const unsigned int* bins = histograms + k + 4*slot;
+                const unsigned int lo = bins[0] > bins[1] ? 0u : 1u;
+                const unsigned int hi = bins[2] > bins[3] ? 2u : 3u;
+                packed |= (bins[lo] > bins[hi] ? lo : hi) << (2*slot);
+            }
+            char_mean[l] = static_cast<unsigned char>(packed);
+        }
+        variance = static_cast<unsigned long long>(
+                    0.5 + static_cast<double>(variance) / static_cast<double>(indices_length));
+        variance -= static_cast<unsigned long long>(
+                    ensureSquareDistance<Distance>(
+                        distance_(mean, ZeroIterator<ElementType>(), veclen_)));
+
+        DistanceType radius = 0;
+        for (unsigned int i=0; i<indices_length; ++i) {
+            DistanceType tmp = distance_(mean, dataset_[indices[i]], veclen_);
+            if (tmp>radius) radius = tmp;
+        }
+
+        node->variance = static_cast<DistanceType>(variance);
+        node->radius = radius;
+        node->pivot = mean;
+        delete[] histograms;
+    }
+
+
+    // Fallback for every distance type without a dedicated overload: plain mean statistics.
+    template<typename DistType>
+    void computeNodeStatistics(KMeansNodePtr node, int* indices, unsigned int indices_length,
+                               const DistType* identifier)
+    {
+        static_cast<void>(identifier);  // tag argument, used for overload selection only
+        computeNodeStatistics(node, indices, indices_length);
+    }
+
+    // HammingLUT distance: statistics are computed by computeBitfieldNodeStatistics().
+    void computeNodeStatistics(KMeansNodePtr node, int* indices, unsigned int indices_length,
+                               const cvflann::HammingLUT* identifier)
+    {
+        static_cast<void>(identifier);  // tag argument, used for overload selection only
+        computeBitfieldNodeStatistics(node, indices, indices_length);
+    }
+
+    // Hamming<unsigned char> distance: statistics are computed by computeBitfieldNodeStatistics().
+    void computeNodeStatistics(KMeansNodePtr node, int* indices, unsigned int indices_length,
+                               const cvflann::Hamming<unsigned char>* identifier)
+    {
+        static_cast<void>(identifier);  // tag argument, used for overload selection only
+        computeBitfieldNodeStatistics(node, indices, indices_length);
+    }
+
+    // Hamming2<unsigned char> distance: statistics are computed by computeBitfieldNodeStatistics().
+    void computeNodeStatistics(KMeansNodePtr node, int* indices, unsigned int indices_length,
+                               const cvflann::Hamming2<unsigned char>* identifier)
+    {
+        static_cast<void>(identifier);  // tag argument, used for overload selection only
+        computeBitfieldNodeStatistics(node, indices, indices_length);
+    }
+
+    // DNAmmingLUT distance: statistics are computed by computeDnaNodeStatistics().
+    void computeNodeStatistics(KMeansNodePtr node, int* indices, unsigned int indices_length,
+                               const cvflann::DNAmmingLUT* identifier)
+    {
+        static_cast<void>(identifier);  // tag argument, used for overload selection only
+        computeDnaNodeStatistics(node, indices, indices_length);
+    }
+
+    // DNAmming2<unsigned char> distance: statistics are computed by computeDnaNodeStatistics().
+    void computeNodeStatistics(KMeansNodePtr node, int* indices, unsigned int indices_length,
+                               const cvflann::DNAmming2<unsigned char>* identifier)
+    {
+        static_cast<void>(identifier);  // tag argument, used for overload selection only
+        computeDnaNodeStatistics(node, indices, indices_length);
+    }
+
+
+    // Iterates k-means on the given points: recomputes the cluster means, reassigns the
+    // points (KMeansDistanceComputer) and refills emptied clusters, until no point changes
+    // cluster or iterations_ passes have run. On return centers[i] is a new[]-allocated copy of mean i.
+    // Params:
+    //     indices, indices_length = dataset row ids of the points to cluster
+    //     branching = number of clusters
+    //     centers = receives one center array per cluster
+    //     radiuses = receives, per cluster, the largest distance reported for a point assigned to it
+    //     belongs_to, count = cluster id of every point / points per cluster (updated in place)
+    void refineClustering(int* indices, int indices_length, int branching, CentersType** centers,
+                          std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
+    {
+        // The means are kept in double precision while iterating.
+        cv::AutoBuffer<double> means_buf(branching*veclen_);
+        Matrix<double> means(means_buf.data(), branching, veclen_);
+
+        bool converged = false;
+        for (int pass=0; !converged && pass<iterations_; ++pass) {
+            converged = true;
+
+            // compute the new cluster centers
+            for (int c=0; c<branching; ++c) {
+                memset(means[c],0,sizeof(double)*veclen_);
+                radiuses[c] = 0;
+            }
+            for (int p=0; p<indices_length; ++p) {
+                ElementType* point = dataset_[indices[p]];
+                double* sum = means[belongs_to[p]];
+                for (size_t d=0; d<veclen_; ++d) sum[d] += point[d];
+            }
+            for (int c=0; c<branching; ++c) {
+                const int members = count[c];
+                double* mean = means[c];
+                for (size_t d=0; d<veclen_; ++d) mean[d] /= members;
+            }
+
+            std::vector<int> new_centroids(indices_length);
+            std::vector<DistanceType> sq_dists(indices_length);
+
+            // reassign points to clusters
+            KMeansDistanceComputer<Matrix<double> > invoker(
+                        distance_, dataset_, branching, indices, means, veclen_, new_centroids, sq_dists);
+            parallel_for_(cv::Range(0, (int)indices_length), invoker);
+
+            // Apply the new assignment and track the largest distance seen per cluster.
+            for (int p=0; p<indices_length; ++p) {
+                const DistanceType sq_dist(sq_dists[p]);
+                const int target(new_centroids[p]);
+                if (sq_dist > radiuses[target]) radiuses[target] = sq_dist;
+                if (target == belongs_to[p]) continue;
+                count[belongs_to[p]]--;
+                count[target]++;
+                belongs_to[p] = target;
+                converged = false;
+            }
+
+            // if one cluster converges to an empty cluster,
+            // move an element into that cluster
+            for (int empty=0; empty<branching; ++empty) {
+                if (count[empty]!=0) continue;
+                // donor: the next cluster (cyclically) that holds more than one point
+                int donor = (empty+1)%branching;
+                while (count[donor]<=1) {
+                    donor = (donor+1)%branching;
+                }
+                // move the donor's point whose distance to the donor's mean equals its radius
+                for (int p=0; p<indices_length; ++p) {
+                    if (belongs_to[p]!=donor) continue;
+                    if ( distance_(dataset_[indices[p]], means[donor], veclen_) == radiuses[donor] ) {
+                        belongs_to[p] = empty;
+                        count[donor]--;
+                        count[empty]++;
+                        break;
+                    }
+                }
+                converged = false;
+            }
+        }
+
+        // Hand the result over: one freshly allocated center array per cluster.
+        for (int c=0; c<branching; ++c) {
+            centers[c] = new CentersType[veclen_];
+            memoryCounter_ += (int)(veclen_*sizeof(CentersType));
+            for (size_t d=0; d<veclen_; ++d) centers[c][d] = (CentersType)means[c][d];
+        }
+    }
+
+
+    // Bit-string variant of refineClustering(): the vectors are handled as arrays of bits and
+    // every bit of a center is set when at least half of the cluster's points have it set.
+    // The center arrays are allocated here, before the first pass, and rewritten on each pass.
+    // Iteration stops when no point changes cluster or after iterations_ passes.
+    // An emptied cluster is refilled with a point taken from another cluster.
+    //
+    // Params:
+    //     indices, indices_length = dataset row ids of the points to cluster
+    //     branching = number of clusters
+    //     centers = receives one new[]-allocated center array per cluster
+    //     radiuses = receives, per cluster, the largest distance reported for a point assigned to it
+    //     belongs_to = cluster id of every point (updated in place)
+    //     count = number of points per cluster (updated in place)
+    void refineBitfieldClustering(int* indices, int indices_length, int branching, CentersType** centers,
+                                  std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
+    {
+        for (int c=0; c<branching; ++c) {
+            centers[c] = new CentersType[veclen_];
+            memoryCounter_ += (int)(veclen_*sizeof(CentersType));
+        }
+
+        // One counter per bit of a vector, one row of counters per cluster.
+        const unsigned int accumulator_veclen = static_cast<unsigned int>(
+                                                veclen_*sizeof(ElementType)*BITS_PER_CHAR);
+        cv::AutoBuffer<unsigned int> ones_buf(branching*accumulator_veclen);
+        Matrix<unsigned int> ones(ones_buf.data(), branching, accumulator_veclen);
+
+        bool converged = false;
+        for (int pass=0; !converged && pass<iterations_; ++pass) {
+            converged = true;
+
+            // Tally, per cluster, how many of its points have each bit set.
+            for (int c=0; c<branching; ++c) {
+                memset(ones[c],0,sizeof(unsigned int)*accumulator_veclen);
+                radiuses[c] = 0;
+            }
+            for (int p=0; p<indices_length; ++p) {
+                unsigned char* bytes = (unsigned char*)dataset_[indices[p]];
+                unsigned int* tally = ones[belongs_to[p]];
+                for (size_t k=0, l=0; k<accumulator_veclen; k+=BITS_PER_CHAR, ++l) {
+                    for (int bit=0; bit<8; ++bit) {
+                        tally[k+bit] += (bytes[l]>>bit) & 0x01;
+                    }
+                }
+            }
+            // Pack the centers: bit value = ones / members, rounded by adding one half.
+            for (int c=0; c<branching; ++c) {
+                const double members = static_cast<double>(count[c]);
+                unsigned int* tally = ones[c];
+                unsigned char* center_bytes = (unsigned char*)centers[c];
+                for (size_t k=0, l=0; k<accumulator_veclen; k+=BITS_PER_CHAR, ++l) {
+                    int packed = 0;
+                    for (int bit=0; bit<8; ++bit) {
+                        packed |= ((int)(0.5 + (double)(tally[k+bit]) / members))<<bit;
+                    }
+                    center_bytes[l] = static_cast<unsigned char>(packed);
+                }
+            }
+
+            std::vector<int> new_centroids(indices_length);
+            std::vector<DistanceType> dists(indices_length);
+
+            // reassign points to clusters
+            KMeansDistanceComputer<ElementType**> invoker(
+                        distance_, dataset_, branching, indices, centers, veclen_, new_centroids, dists);
+            parallel_for_(cv::Range(0, (int)indices_length), invoker);
+
+            // Apply the new assignment and track the largest distance seen per cluster.
+            for (int p=0; p<indices_length; ++p) {
+                const DistanceType dist(dists[p]);
+                const int target(new_centroids[p]);
+                if (dist > radiuses[target]) radiuses[target] = dist;
+                if (target == belongs_to[p]) continue;
+                count[belongs_to[p]]--;
+                count[target]++;
+                belongs_to[p] = target;
+                converged = false;
+            }
+
+            // if one cluster converges to an empty cluster,
+            // move an element into that cluster
+            for (int empty=0; empty<branching; ++empty) {
+                if (count[empty]!=0) continue;
+                // donor: the next cluster (cyclically) that holds more than one point
+                int donor = (empty+1)%branching;
+                while (count[donor]<=1) {
+                    donor = (donor+1)%branching;
+                }
+                // move the donor's point whose distance to the donor's center equals its radius
+                for (int p=0; p<indices_length; ++p) {
+                    if (belongs_to[p]!=donor) continue;
+                    if ( distance_(dataset_[indices[p]], centers[donor], veclen_) == radiuses[donor] ) {
+                        belongs_to[p] = empty;
+                        count[donor]--;
+                        count[empty]++;
+                        break;
+                    }
+                }
+                converged = false;
+            }
+        }
+    }
+
+
+    // DNA variant of refineClustering(): every byte packs four bases of two bits each, and
+    // each base of a center is the most frequent base at that position among the cluster's
+    // points. The center arrays are allocated here and rewritten on each pass; radiuses,
+    // belongs_to and count are updated in place. Iteration stops when no point changes
+    // cluster or after iterations_ passes.
+    void refineDnaClustering(int* indices, int indices_length, int branching, CentersType** centers,
+                                  std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
+    {
+        for (int i=0; i<branching; ++i) {
+            centers[i] = new CentersType[veclen_];
+            memoryCounter_ += (int)(veclen_*sizeof(CentersType));
+        }
+
+        const unsigned int histos_veclen = static_cast<unsigned int>(
+                    veclen_*sizeof(CentersType)*(HISTOS_PER_BASE*BASE_PER_CHAR));
+        cv::AutoBuffer<unsigned int> histos_buf(branching*histos_veclen);
+        Matrix<unsigned int> histos(histos_buf.data(), branching, histos_veclen);
+
+        bool converged = false;
+        int iteration = 0;
+        while (!converged && iteration<iterations_) {
+            converged = true;
+            iteration++;
+
+            // compute the new cluster centers
+            for (int i=0; i<branching; ++i) {
+                memset(histos[i],0,sizeof(unsigned int)*histos_veclen);
+                radiuses[i] = 0;
+            }
+            for (int i=0; i<indices_length; ++i) {
+                unsigned char* vec = (unsigned char*)dataset_[indices[i]];
+                unsigned int* h = histos[belongs_to[i]];
+                for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) {
+                    h[k +     ((vec[l])    & 0x03)]++;
+                    h[k + 4 + ((vec[l]>>2) & 0x03)]++;
+                    h[k + 8 + ((vec[l]>>4) & 0x03)]++;
+                    h[k +12 + ((vec[l]>>6) & 0x03)]++;
+                }
+            }
+            for (int i=0; i<branching; ++i) {
+                unsigned int* h = histos[i];
+                unsigned char* charCenter = (unsigned char*)centers[i];
+                for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) {
+                    // The winning base (0..3) of each position is shifted back to the two bits
+                    // it was read from. Ties resolve as before: bin 1 over 0, bin 3 over 2, upper
+                    // pair over lower. Hex literals such as 0x10 do not denote the bit pattern "10".
+                    unsigned int packed = 0;
+                    for (unsigned int slot=0; slot<4; ++slot) {
+                        const unsigned int* bins = h + k + 4*slot;
+                        const unsigned int lo = bins[0] > bins[1] ? 0u : 1u;
+                        const unsigned int hi = bins[2] > bins[3] ? 2u : 3u;
+                        packed |= (bins[lo] > bins[hi] ? lo : hi) << (2*slot);
+                    }
+                    charCenter[l] = static_cast<unsigned char>(packed);
+                }
+            }
+
+            std::vector<int> new_centroids(indices_length);
+            std::vector<DistanceType> dists(indices_length);
+
+            // reassign points to clusters
+            KMeansDistanceComputer<ElementType**> invoker(
+                        distance_, dataset_, branching, indices, centers, veclen_, new_centroids, dists);
+            parallel_for_(cv::Range(0, (int)indices_length), invoker);
+
+            for (int i=0; i < indices_length; ++i) {
+                DistanceType dist(dists[i]);
+                int new_centroid(new_centroids[i]);
+                if (dist > radiuses[new_centroid]) {
+                    radiuses[new_centroid] = dist;
+                }
+                if (new_centroid != belongs_to[i]) {
+                    count[belongs_to[i]]--;
+                    count[new_centroid]++;
+                    belongs_to[i] = new_centroid;
+                    converged = false;
+                }
+            }
+
+            for (int i=0; i<branching; ++i) {
+                // if one cluster converges to an empty cluster,
+                // move an element into that cluster
+                if (count[i]==0) {
+                    int j = (i+1)%branching;
+                    while (count[j]<=1) {
+                        j = (j+1)%branching;
+                    }
+
+                    for (int k=0; k<indices_length; ++k) {
+                        if (belongs_to[k]==j) {
+                            // for cluster j, we move the furthest element from the center to the empty cluster i
+                            if ( distance_(dataset_[indices[k]], centers[j], veclen_) == radiuses[j] ) {
+                                belongs_to[k] = i;
+                                count[j]--;
+                                count[i]++;
+                                break;
+                            }
+                        }
+                    }
+                    converged = false;
+                }
+            }
+        }
+    }
+
+
+    // Reorders `indices` in place so that each cluster occupies a contiguous range, creates
+    // one child node per cluster and clusters each child recursively via computeClustering().
+    void computeSubClustering(KMeansNodePtr node, int* indices, int indices_length,
+                              int branching, int level, CentersType** centers,
+                              std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
+    {
+        node->childs = pool_.allocate<KMeansNodePtr>(branching);
+        int range_begin = 0;
+        int range_end = range_begin;
+        for (int cluster=0; cluster<branching; ++cluster) {
+            const int cluster_size = count[cluster];
+            DistanceType spread = 0;
+            DistanceType norm_sum = 0;
+            // Swap the members of this cluster into [range_begin, range_end).
+            for (int p=0; p<indices_length; ++p) {
+                if (belongs_to[p]!=cluster) continue;
+                DistanceType to_origin = distance_(dataset_[indices[p]], ZeroIterator<ElementType>(), veclen_);
+                spread += to_origin;
+                norm_sum += static_cast<DistanceType>( sqrt(to_origin) );
+                std::swap(indices[p],indices[range_end]);
+                std::swap(belongs_to[p],belongs_to[range_end]);
+                range_end++;
+            }
+            spread /= cluster_size;
+            norm_sum /= cluster_size;
+            spread -= distance_(centers[cluster], ZeroIterator<ElementType>(), veclen_);
+            KMeansNodePtr child = pool_.allocate<KMeansNode>();
+            std::memset(child, 0, sizeof(KMeansNode));
+            child->radius = radiuses[cluster];
+            child->pivot = centers[cluster];    // the child keeps the center array as its pivot
+            child->variance = spread;
+            child->mean_radius = norm_sum;
+            node->childs[cluster] = child;
+            computeClustering(child,indices+range_begin, range_end-range_begin, branching, level+1);
+            range_begin = range_end;
+        }
+    }
+
+
+    // Counterpart of computeSubClustering() used by the bit-field refineAndSplitClustering()
+    // overloads: same reordering and recursion, with an integer variance accumulator.
+    void computeAnyBitfieldSubClustering(KMeansNodePtr node, int* indices, int indices_length,
+                              int branching, int level, CentersType** centers,
+                              std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
+    {
+        node->childs = pool_.allocate<KMeansNodePtr>(branching);
+        int range_begin = 0;
+        int range_end = range_begin;
+        for (int cluster=0; cluster<branching; ++cluster) {
+            const int cluster_size = count[cluster];
+            unsigned long long spread = 0ull;
+            DistanceType radius_sum = 0;
+            // Swap the members of this cluster into [range_begin, range_end).
+            for (int p=0; p<indices_length; ++p) {
+                if (belongs_to[p]!=cluster) continue;
+                DistanceType to_origin = distance_(dataset_[indices[p]], ZeroIterator<ElementType>(), veclen_);
+                spread += static_cast<unsigned long long>( ensureSquareDistance<Distance>(to_origin) );
+                radius_sum += ensureSimpleDistance<Distance>(to_origin);
+                std::swap(indices[p],indices[range_end]);
+                std::swap(belongs_to[p],belongs_to[range_end]);
+                range_end++;
+            }
+            // Averages: one half is added before the cast to the target type.
+            radius_sum = static_cast<DistanceType>(
+                        0.5f + static_cast<float>(radius_sum) / static_cast<float>(cluster_size));
+            spread = static_cast<unsigned long long>(
+                        0.5 + static_cast<double>(spread) / static_cast<double>(cluster_size));
+            spread -= static_cast<unsigned long long>( ensureSquareDistance<Distance>(
+                            distance_(centers[cluster], ZeroIterator<ElementType>(), veclen_)));
+            KMeansNodePtr child = pool_.allocate<KMeansNode>();
+            std::memset(child, 0, sizeof(KMeansNode));
+            child->radius = radiuses[cluster];
+            child->pivot = centers[cluster];    // the child keeps the center array as its pivot
+            child->variance = static_cast<DistanceType>(spread);
+            child->mean_radius = radius_sum;
+            node->childs[cluster] = child;
+            computeClustering(child,indices+range_begin, range_end-range_begin, branching, level+1);
+            range_begin = range_end;
+        }
+    }
+
+
+    // Generic path (distance types without a dedicated overload): refine the clusters with
+    // refineClustering(), then build and recurse into the children with computeSubClustering().
+    template<typename DistType>
+    void refineAndSplitClustering(
+            KMeansNodePtr node, int* indices, int indices_length, int branching,
+            int level, CentersType** centers, std::vector<DistanceType>& radiuses,
+            int* belongs_to, int* count, const DistType* identifier)
+    {
+        static_cast<void>(identifier);  // tag argument, used for overload selection only
+        refineClustering(indices, indices_length, branching, centers, radiuses, belongs_to, count);
+        computeSubClustering(node, indices, indices_length, branching, level, centers, radiuses, belongs_to, count);
+    }
+
+
+    /**
+     * The methods responsible for doing the recursive hierarchical clustering on
+     * binary vectors.
+     * As some might have heard that KMeans on binary data doesn't make sense,
+     * it's worth a little explanation of why it actually works fairly well. As
+     * with the Hierarchical Clustering algorithm, we seed several centers for the
+     * current node by picking some of its points. Then, in a first pass, each point
+     * of the node is assigned to its closest center. Now let's have a look at
+     * the 5 central dimensions of the 9 following points:
+     *
+     * xxxxxx11100xxxxx (1)
+     * xxxxxx11010xxxxx (2)
+     * xxxxxx11001xxxxx (3)
+     * xxxxxx10110xxxxx (4)
+     * xxxxxx10101xxxxx (5)
+     * xxxxxx10011xxxxx (6)
+     * xxxxxx01110xxxxx (7)
+     * xxxxxx01101xxxxx (8)
+     * xxxxxx01011xxxxx (9)
+     * sum   _____
+     * of 1: 66555
+     *
+     * Even if the barycenter notion doesn't apply, we can set a center
+     * xxxxxx11111xxxxx that will better fit the five dimensions we are focusing
+     * on for these points.
+     *
+     * Note that convergence isn't ensured anymore. In practice, using Gonzales
+     * as seeding algorithm should be fine for getting convergence ("iterations"
+     * value can be set to -1). But with KMeans++ seeding you should definitely
+     * set a maximum number of iterations (but make it higher than the "iterations"
+     * default value of 11).
+     *
+     * Params:
+     *     node = the node to cluster
+     *     indices = indices of the points belonging to the current node
+     *     indices_length = number of points in the current node
+     *     branching = the branching factor to use in the clustering
+     *     level = 0 for the root node, it increases with the subdivision levels
+     *     centers = clusters centers to compute
+     *     radiuses = radiuses of clusters
+     *     belongs_to = LookUp Table returning, for a given index id, the center id it belongs to
+     *     count = array storing the number of indices for a given center id
+     *     identifier = dummy pointer on an instance of Distance (use to branch correctly among templates)
+     */
+    void refineAndSplitClustering(
+            KMeansNodePtr node, int* indices, int indices_length, int branching,
+            int level, CentersType** centers, std::vector<DistanceType>& radiuses,
+            int* belongs_to, int* count, const cvflann::HammingLUT* identifier)
+    {
+        (void)identifier; // tag argument: only its static type matters (it selects this overload)
+        refineBitfieldClustering(
+                    indices, indices_length, branching, centers, radiuses, belongs_to, count);
+        // second step: sub-clustering of the node (presumably builds the children -- confirm in the callee)
+        computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
+                                        level, centers, radiuses, belongs_to, count);
+    }
+
+    /** Overload selected for cvflann::Hamming<unsigned char>: refineBitfieldClustering() then computeAnyBitfieldSubClustering(). */
+    void refineAndSplitClustering(
+            KMeansNodePtr node, int* indices, int indices_length, int branching,
+            int level, CentersType** centers, std::vector<DistanceType>& radiuses,
+            int* belongs_to, int* count, const cvflann::Hamming<unsigned char>* identifier)
+    {
+        (void)identifier; // tag argument: only its static type matters (it selects this overload)
+        refineBitfieldClustering(
+                    indices, indices_length, branching, centers, radiuses, belongs_to, count);
+
+        computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
+                                        level, centers, radiuses, belongs_to, count);
+    }
+
+    /** Overload selected for cvflann::Hamming2<unsigned char>: refineBitfieldClustering() then computeAnyBitfieldSubClustering(). */
+    void refineAndSplitClustering(
+            KMeansNodePtr node, int* indices, int indices_length, int branching,
+            int level, CentersType** centers, std::vector<DistanceType>& radiuses,
+            int* belongs_to, int* count, const cvflann::Hamming2<unsigned char>* identifier)
+    {
+        (void)identifier; // tag argument: only its static type matters (it selects this overload)
+        refineBitfieldClustering(
+                    indices, indices_length, branching, centers, radiuses, belongs_to, count);
+
+        computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
+                                        level, centers, radiuses, belongs_to, count);
+    }
+
+    /** Overload selected for cvflann::DNAmmingLUT: refineDnaClustering() then computeAnyBitfieldSubClustering(). */
+    void refineAndSplitClustering(
+            KMeansNodePtr node, int* indices, int indices_length, int branching,
+            int level, CentersType** centers, std::vector<DistanceType>& radiuses,
+            int* belongs_to, int* count, const cvflann::DNAmmingLUT* identifier)
+    {
+        (void)identifier; // tag argument: only its static type matters (it selects this overload)
+        refineDnaClustering(
+                    indices, indices_length, branching, centers, radiuses, belongs_to, count);
+
+        computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
+                                        level, centers, radiuses, belongs_to, count);
+    }
+
+    /** Overload selected for cvflann::DNAmming2<unsigned char>: refineDnaClustering() then computeAnyBitfieldSubClustering(). */
+    void refineAndSplitClustering(
+            KMeansNodePtr node, int* indices, int indices_length, int branching,
+            int level, CentersType** centers, std::vector<DistanceType>& radiuses,
+            int* belongs_to, int* count, const cvflann::DNAmming2<unsigned char>* identifier)
+    {
+        (void)identifier; // tag argument: only its static type matters (it selects this overload)
+        refineDnaClustering(
+                    indices, indices_length, branching, centers, radiuses, belongs_to, count);
+
+        computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
+                                        level, centers, radiuses, belongs_to, count);
+    }
+
+
+    /**
+     * Recursively clusters the points of one node of the hierarchical k-means tree.
+     *
+     * Params:
+     *     node = the node to cluster
+     *     indices = indices of the points belonging to the current node
+     *     indices_length = number of points in the current node
+     *     branching = the branching factor to use in the clustering
+     *     level = depth of the node in the tree (stored in node->level)
+     * TODO: for 1-sized clusters don't store a cluster center (it's the same as the single cluster point)
+     */
+    void computeClustering(KMeansNodePtr node, int* indices, int indices_length, int branching, int level)
+    {
+        node->size = indices_length;
+        node->level = level;
+
+        if (indices_length < branching) {
+            node->indices = indices;
+            std::sort(node->indices,node->indices+indices_length);
+            node->childs = NULL;
+            return;
+        }
+
+        cv::AutoBuffer<int> centers_idx_buf(branching);
+        int* centers_idx = centers_idx_buf.data();
+        int centers_length;
+        (this->*chooseCenters)(branching, indices, indices_length, centers_idx, centers_length);
+
+        if (centers_length<branching) {
+            node->indices = indices;
+            std::sort(node->indices,node->indices+indices_length);
+            node->childs = NULL;
+            return;
+        }
+
+        // per-cluster radius (largest distance of an assigned point to its seed center) and point count
+        std::vector<DistanceType> radiuses(branching);
+        cv::AutoBuffer<int> count_buf(branching);
+        int* count = count_buf.data();
+        for (int i=0; i<branching; ++i) {
+            radiuses[i] = 0;
+            count[i] = 0;
+        }
+
+        //	assign points to clusters
+        cv::AutoBuffer<int> belongs_to_buf(indices_length);
+        int* belongs_to = belongs_to_buf.data();
+        for (int i=0; i<indices_length; ++i) {
+            DistanceType sq_dist = distance_(dataset_[indices[i]], dataset_[centers_idx[0]], veclen_);
+            belongs_to[i] = 0;
+            for (int j=1; j<branching; ++j) {
+                DistanceType new_sq_dist = distance_(dataset_[indices[i]], dataset_[centers_idx[j]], veclen_);
+                if (sq_dist>new_sq_dist) {
+                    belongs_to[i] = j;
+                    sq_dist = new_sq_dist;
+                }
+            }
+            if (sq_dist>radiuses[belongs_to[i]]) {
+                radiuses[belongs_to[i]] = sq_dist;
+            }
+            count[belongs_to[i]]++;
+        }
+
+        // Scratch array of center pointers handed to the refine/split step. Held in a vector so
+        // that it is released even if that step throws (a raw new[]/delete[] pair would leak).
+        std::vector<CentersType*> centers(branching);
+
+        Distance* dummy = NULL; // never dereferenced: only its type selects the overload
+        refineAndSplitClustering(node, indices, indices_length, branching, level,
+                                 centers.data(), radiuses, belongs_to, count, dummy);
+    }
+
+
+    /**
+     * Performs one descent in the hierarchical k-means tree. The branches not
+     * visited are stored in a priority queue. A leaf is skipped once the check
+     * budget is used up and the result set is already full.
+     *
+     * Params:
+     *      node = node to explore
+     *      result = container for the k-nearest neighbors found
+     *      vec = query point
+     *      checks = how many points in the dataset have been checked so far (updated)
+     *      maxChecks = maximum number of dataset points to check
+     *      heap = priority queue receiving the branches that are not followed
+     */
+    void findNN(KMeansNodePtr node, ResultSet<DistanceType>& result, const ElementType* vec, int& checks, int maxChecks,
+                const cv::Ptr<Heap<BranchSt>>& heap)
+    {
+        // Ignore those clusters that are too far away
+        {
+            DistanceType bsq = distance_(vec, node->pivot, veclen_);
+            DistanceType rsq = node->radius;
+            DistanceType wsq = result.worstDist();
+
+            if (isSquareDistance<Distance>())
+            {
+                DistanceType val = bsq-rsq-wsq;
+                if ((val>0) && (val*val > 4*rsq*wsq)) // same as sqrt(bsq) > sqrt(rsq)+sqrt(wsq), without roots
+                    return;
+            }
+            else
+            {
+                if (bsq-rsq > wsq)
+                    return;
+            }
+        }
+
+        if (node->childs==NULL) {
+            if ((checks>=maxChecks) && result.full()) {
+                return;
+            }
+            checks += node->size;
+            for (int i=0; i<node->size; ++i) {
+                int index = node->indices[i];
+                DistanceType dist = distance_(dataset_[index], vec, veclen_);
+                result.addPoint(dist, index);
+            }
+        }
+        else {
+            // Vector instead of new[]/delete[]: freed even if exploreNodeBranches() throws.
+            std::vector<DistanceType> domain_distances(branching_);
+            int closest_center = exploreNodeBranches(node, vec, domain_distances.data(), heap);
+            findNN(node->childs[closest_center],result,vec, checks, maxChecks, heap);
+        }
+    }
+
+    /**
+     * Helper function that finds the child of a node whose pivot is nearest to a query point.
+     * Params:
+     *     node = the node, q = the query point
+     *     domain_distances = array (branching_ entries) receiving the distance to each child node
+     *     heap = receives every child other than the nearest one
+     * Returns: index of the nearest child
+     */
+    int exploreNodeBranches(KMeansNodePtr node, const ElementType* q, DistanceType* domain_distances, const cv::Ptr<Heap<BranchSt>>& heap)
+    {
+        // Pass 1: distance from q to every child pivot; remember the smallest.
+        int best_index = 0;
+        domain_distances[best_index] = distance_(q, node->childs[best_index]->pivot, veclen_);
+        for (int i=1; i<branching_; ++i) {
+            domain_distances[i] = distance_(q, node->childs[i]->pivot, veclen_);
+            if (domain_distances[i]<domain_distances[best_index]) {
+                best_index = i;
+            }
+        }
+
+        //		float* best_center = node->childs[best_index]->pivot;
+        for (int i=0; i<branching_; ++i) {
+            if (i != best_index) {
+                domain_distances[i] -= cvflann::round<DistanceType>( // queue key is lowered by cb_index_ * child variance
+                                        cb_index_*node->childs[i]->variance );
+
+                //				float dist_to_border = getDistanceToBorder(node.childs[i].pivot,best_center,q);
+                //				if (domain_distances[i]<dist_to_border) {
+                //					domain_distances[i] = dist_to_border;
+                //				}
+                heap->insert(BranchSt(node->childs[i],domain_distances[i]));
+            }
+        }
+
+        return best_index;
+    }
+
+
+    /**
+     * Function that performs exact nearest neighbor search by traversing the entire tree.
+     */
+    void findExactNN(KMeansNodePtr node, ResultSet<DistanceType>& result, const ElementType* vec)
+    {
+        // Ignore those clusters that are too far away
+        {
+            DistanceType bsq = distance_(vec, node->pivot, veclen_);
+            DistanceType rsq = node->radius;
+            DistanceType wsq = result.worstDist();
+
+            if (isSquareDistance<Distance>())
+            {
+                DistanceType val = bsq-rsq-wsq;
+                if ((val>0) && (val*val > 4*rsq*wsq)) // same as sqrt(bsq) > sqrt(rsq)+sqrt(wsq), without roots
+                    return;
+            }
+            else
+            {
+                if (bsq-rsq > wsq)
+                    return;
+            }
+        }
+
+        // Leaf: score every point it holds.
+        if (node->childs==NULL) {
+            for (int i=0; i<node->size; ++i) {
+                int index = node->indices[i];
+                DistanceType dist = distance_(dataset_[index], vec, veclen_);
+                result.addPoint(dist, index);
+            }
+        }
+        else {
+            // Visit the children from the nearest to the farthest pivot. The ordering
+            // buffer is a vector so that it is freed on every exit path, including an
+            // exception thrown by one of the recursive calls below.
+            std::vector<int> sort_indices(branching_);
+            getCenterOrdering(node, vec, sort_indices.data());
+
+            for (int i=0; i<branching_; ++i) {
+                findExactNN(node->childs[sort_indices[i]],result,vec);
+            }
+        }
+    }
+
+
+    /**
+     * Helper function. It computes the order in which to traverse the child nodes of a
+     * particular node: sort_indices receives the branching_ child indices sorted by
+     * increasing distance between the query point q and each child's pivot.
+     */
+    void getCenterOrdering(KMeansNodePtr node, const ElementType* q, int* sort_indices)
+    {
+        std::vector<DistanceType> domain_distances(branching_);
+        for (int i=0; i<branching_; ++i) {
+            DistanceType dist = distance_(q, node->childs[i]->pivot, veclen_);
+
+            // Insertion-sort step. The bound j<i is tested first so that a slot which has not been written yet is never read.
+            int j=0;
+            while (j<i && domain_distances[j]<dist)
+                j++;
+            for (int k=i; k>j; --k) {
+                domain_distances[k] = domain_distances[k-1];
+                sort_indices[k] = sort_indices[k-1];
+            }
+            domain_distances[j] = dist;
+            sort_indices[j] = i;
+        }
+    }
+
+    /**
+     * Computes the squared distance from the query point q, located inside the region
+     * with center c, to the border (bisecting hyperplane) between this region and the
+     * region with center p. The centers c and p must differ, otherwise sum2 is zero.
+     */
+    DistanceType getDistanceToBorder(DistanceType* p, DistanceType* c, DistanceType* q)
+    {
+        DistanceType sum = 0;
+        DistanceType sum2 = 0;
+        // size_t index: veclen_ is a size_t, so this avoids a signed/unsigned comparison
+        for (size_t i=0; i<veclen_; ++i) {
+            DistanceType t = c[i]-p[i];
+            sum += t*(q[i]-(c[i]+p[i])/2);
+            sum2 += t*t;
+        }
+        // sum = (c-p).(q-midpoint), sum2 = |c-p|^2
+        return sum*sum/sum2;
+    }
+
+
+    /**
+     * Helper function that descends in the hierarchical k-means tree by splitting those clusters that minimize
+     * the overall variance of the clustering.
+     * Params:
+     *     root = root node
+     *     clusters = array with clusters centers (return value); clusters_length = its capacity
+     *     varianceValue = variance of the clustering (return value)
+     * Returns: number of entries written to clusters
+     */
+    int getMinVarianceClusters(KMeansNodePtr root, KMeansNodePtr* clusters, int clusters_length, DistanceType& varianceValue)
+    {
+        int clusterCount = 1;
+        clusters[0] = root;
+
+        DistanceType meanVariance = root->variance*root->size; // size-weighted variance of the current cluster set
+
+        while (clusterCount<clusters_length) {
+            DistanceType minVariance = (std::numeric_limits<DistanceType>::max)();
+            int splitIndex = -1;
+
+            for (int i=0; i<clusterCount; ++i) {
+                if (clusters[i]->childs != NULL) {
+                    // weighted variance obtained if cluster i were replaced by its children
+                    DistanceType variance = meanVariance - clusters[i]->variance*clusters[i]->size;
+
+                    for (int j=0; j<branching_; ++j) {
+                        variance += clusters[i]->childs[j]->variance*clusters[i]->childs[j]->size;
+                    }
+                    if (variance<minVariance) {
+                        minVariance = variance;
+                        splitIndex = i;
+                    }
+                }
+            }
+
+            if (splitIndex==-1) break; // only leaves remain: nothing left to split
+            if ( (branching_+clusterCount-1) > clusters_length) break; // the split would not fit in clusters
+
+            meanVariance = minVariance;
+
+            // split node
+            KMeansNodePtr toSplit = clusters[splitIndex];
+            clusters[splitIndex] = toSplit->childs[0];
+            for (int i=1; i<branching_; ++i) {
+                clusters[clusterCount++] = toSplit->childs[i];
+            }
+        }
+
+        varianceValue = meanVariance/root->size;
+        return clusterCount;
+    }
+
+private:
+    /** The branching factor used in the hierarchical k-means clustering */
+    int branching_;
+
+    /** Number of kmeans trees (default is one) */
+    int trees_;
+
+    /** Maximum number of iterations to use when performing k-means clustering */
+    int iterations_;
+
+    /** Algorithm for choosing the cluster centers */
+    flann_centers_init_t centers_init_;
+
+    /**
+     * Cluster border index. This is used in the tree search phase when determining
+     * the closest cluster to explore next. A zero value takes into account only
+     * the cluster centres, a value greater than zero also takes into account the size
+     * of the cluster.
+     */
+    float cb_index_;
+
+    /**
+     * The dataset used by this index
+     */
+    const Matrix<ElementType> dataset_;
+
+    /** Index parameters */
+    IndexParams index_params_;
+
+    /**
+     * Number of features in the dataset.
+     */
+    size_t size_;
+
+    /**
+     * Length of each feature.
+     */
+    size_t veclen_;
+
+    /**
+     * The root node in the tree.
+     */
+    KMeansNodePtr* root_;
+
+    /**
+     *  Array of indices to vectors in the dataset.
+     */
+    int** indices_;
+
+    /**
+     * The distance
+     */
+    Distance distance_;
+
+    /**
+     * Pooled memory allocator.
+     */
+    PooledAllocator pool_;
+
+    /**
+     * Memory occupied by the index.
+     */
+    int memoryCounter_;
+};
+
+}
+
+//! @endcond
+
+#endif //OPENCV_FLANN_KMEANS_INDEX_H_
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/linear_index.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/linear_index.h
new file mode 100644
index 000000000000..6428c0d7efd9
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/linear_index.h
@@ -0,0 +1,135 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_LINEAR_INDEX_H_
+#define OPENCV_FLANN_LINEAR_INDEX_H_
+
+//! @cond IGNORED
+
+#include "nn_index.h"
+
+namespace cvflann
+{
+
+/**
+ * Parameter set whose constructor stores FLANN_INDEX_LINEAR under the "algorithm" key.
+ */
+struct LinearIndexParams : public IndexParams
+{
+    LinearIndexParams() { (*this)["algorithm"] = FLANN_INDEX_LINEAR; }
+};
+
+template <typename Distance>
+class LinearIndex : public NNIndex<Distance>
+{
+public:
+
+    typedef typename Distance::ElementType ElementType;
+    typedef typename Distance::ResultType DistanceType;
+
+    /** Copies the Matrix object, the index parameters and the distance functor into the index. */
+    LinearIndex(const Matrix<ElementType>& inputData, const IndexParams& params = LinearIndexParams(),
+                Distance d = Distance()) :
+        dataset_(inputData), index_params_(params), distance_(d)
+    {
+    }
+    // Non-copyable. Deleted (rather than declared and left undefined) so that a copy fails at compile time, not at link time.
+    LinearIndex(const LinearIndex&) = delete;
+    LinearIndex& operator=(const LinearIndex&) = delete;
+
+    flann_algorithm_t getType() const CV_OVERRIDE
+    {
+        return FLANN_INDEX_LINEAR;
+    }
+
+    /** Number of rows of the dataset. */
+    size_t size() const CV_OVERRIDE
+    {
+        return dataset_.rows;
+    }
+
+    size_t veclen() const CV_OVERRIDE
+    {
+        return dataset_.cols; // number of columns of the dataset
+    }
+
+    /** Always reports 0. */
+    int usedMemory() const CV_OVERRIDE
+    {
+        return 0;
+    }
+
+    void buildIndex() CV_OVERRIDE
+    {
+        /* nothing to do here for linear search */
+    }
+
+    void saveIndex(FILE*) CV_OVERRIDE
+    {
+        /* nothing to do here for linear search */
+    }
+
+    /** Reads nothing from the stream; only refreshes the "algorithm" entry of the parameters. */
+    void loadIndex(FILE*) CV_OVERRIDE
+    {
+        /* nothing to do here for linear search */
+        index_params_["algorithm"] = getType();
+    }
+
+    /** Exhaustive search: computes the distance from vec to every dataset row and offers each one to resultSet. */
+    void findNeighbors(ResultSet<DistanceType>& resultSet, const ElementType* vec, const SearchParams& /*searchParams*/) CV_OVERRIDE
+    {
+        ElementType* data = dataset_.data;
+        for (size_t i = 0; i < dataset_.rows; ++i, data += dataset_.cols) {
+            DistanceType dist = distance_(data, vec, dataset_.cols);
+            resultSet.addPoint(dist, static_cast<int>(i));
+        }
+    }
+
+    IndexParams getParameters() const CV_OVERRIDE
+    {
+        return index_params_;
+    }
+
+private:
+    /** The dataset */
+    const Matrix<ElementType> dataset_;
+    /** Index parameters */
+    IndexParams index_params_;
+    /** Index distance */
+    Distance distance_;
+
+};
+
+}
+
+//! @endcond
+
+#endif // OPENCV_FLANN_LINEAR_INDEX_H_
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/logger.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/logger.h
new file mode 100644
index 000000000000..31f9bbd77fa2
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/logger.h
@@ -0,0 +1,138 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_LOGGER_H
+#define OPENCV_FLANN_LOGGER_H
+
+//! @cond IGNORED
+
+#include <stdio.h>
+#include <stdarg.h>
+
+#include "defines.h"
+
+
+namespace cvflann
+{
+/** Singleton logger writing printf-style messages to stdout or to a file, filtered by log level. */
+class Logger
+{
+    Logger() : stream(stdout), logLevel(FLANN_LOG_WARN) {}
+
+    ~Logger() { closeStream(); }
+
+    // Closes the current stream unless it is stdout, then falls back to stdout.
+    void closeStream()
+    {
+        if ((stream!=NULL)&&(stream!=stdout)) fclose(stream);
+        stream = stdout;
+    }
+
+    static Logger& instance()
+    {
+        static Logger logger;
+        return logger;
+    }
+
+    // Redirects output to the file `name`; NULL or a file that cannot be opened selects stdout.
+    void _setDestination(const char* name)
+    {
+        closeStream(); // release the previously opened log file first, otherwise its handle leaks
+        if (name!=NULL) {
+#ifdef _MSC_VER
+            if (fopen_s(&stream, name, "w") != 0)
+                stream = NULL;
+#else
+            stream = fopen(name,"w");
+#endif
+            if (stream == NULL) {
+                stream = stdout;
+            }
+        }
+    }
+
+    int _log(int level, const char* fmt, va_list arglist)
+    {
+        if (level > logLevel ) return -1; // message filtered out by the current log level
+        return vfprintf(stream, fmt, arglist);
+    }
+
+public:
+    /**
+     * Sets the logging level. Messages whose level is greater than this value are ignored.
+     * @param level Logging level
+     */
+    static void setLevel(int level) { instance().logLevel = level; }
+
+    /**
+     * Sets the logging destination
+     * @param name Filename or NULL for console
+     */
+    static void setDestination(const char* name) { instance()._setDestination(name); }
+
+    /**
+     * Print log message. Returns the vfprintf result, or -1 when the message is filtered out.
+     * @param level Log level
+     * @param fmt Message format
+     */
+    static int log(int level, const char* fmt, ...)
+    {
+        va_list arglist;
+        va_start(arglist, fmt);
+        int ret = instance()._log(level,fmt,arglist);
+        va_end(arglist);
+        return ret;
+    }
+
+#define LOG_METHOD(NAME,LEVEL) \
+    static int NAME(const char* fmt, ...) \
+    { \
+        va_list ap; \
+        va_start(ap, fmt); \
+        int ret = instance()._log(LEVEL, fmt, ap); \
+        va_end(ap); \
+        return ret; \
+    }
+
+    LOG_METHOD(fatal, FLANN_LOG_FATAL)
+    LOG_METHOD(error, FLANN_LOG_ERROR)
+    LOG_METHOD(warn, FLANN_LOG_WARN)
+    LOG_METHOD(info, FLANN_LOG_INFO)
+
+private:
+    FILE* stream;
+    int logLevel;
+};
+
+}
+
+//! @endcond
+
+#endif //OPENCV_FLANN_LOGGER_H
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/lsh_index.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/lsh_index.h
new file mode 100644
index 000000000000..b5e87f6041f4
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/lsh_index.h
@@ -0,0 +1,403 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+/***********************************************************************
+ * Author: Vincent Rabaud
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_LSH_INDEX_H_
+#define OPENCV_FLANN_LSH_INDEX_H_
+
+//! @cond IGNORED
+
+#include <algorithm>
+#include <cstring>
+#include <map>
+#include <vector>
+
+#include "nn_index.h"
+#include "matrix.h"
+#include "result_set.h"
+#include "heap.h"
+#include "lsh_table.h"
+#include "allocator.h"
+#include "random.h"
+#include "saving.h"
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4702) //disable unreachable code
+#endif
+
+namespace cvflann
+{
+/** Parameter set for the LSH index: stores the algorithm id and the three constructor arguments under their string keys. */
+struct LshIndexParams : public IndexParams
+{
+    LshIndexParams(int table_number = 12, int key_size = 20, int multi_probe_level = 2)
+    {
+        (*this)["algorithm"] = FLANN_INDEX_LSH;
+        // The number of hash tables to use
+        (*this)["table_number"] = table_number;
+        // The length of the key in the hash tables
+        (*this)["key_size"] = key_size;
+        // Number of levels to use in multi-probe (0 for standard LSH)
+        (*this)["multi_probe_level"] = multi_probe_level;
+    }
+};
+
+/**
+ * Locality-sensitive hashing  index
+ *
+ * Contains the tables and other information for indexing a set of points
+ * for nearest-neighbor matching.
+ */
+template<typename Distance>
+class LshIndex : public NNIndex<Distance>
+{
+public:
+    typedef typename Distance::ElementType ElementType;
+    typedef typename Distance::ResultType DistanceType;
+
+    /** Constructor
+     * @param input_data dataset with the input features
+     * @param params parameters passed to the LSH algorithm
+     * @param d the distance used
+     */
+    LshIndex(const Matrix<ElementType>& input_data, const IndexParams& params = LshIndexParams(),
+             Distance d = Distance()) :
+        dataset_(input_data), index_params_(params), distance_(d)
+    {
+        // cv::flann::IndexParams sets integer params as 'int', so it is used with get_param
+        // in place of 'unsigned int'
+        table_number_ = get_param(index_params_,"table_number",12);
+        key_size_ = get_param(index_params_,"key_size",20);
+        multi_probe_level_ = get_param(index_params_,"multi_probe_level",2);
+
+        feature_size_ = (unsigned)dataset_.cols;
+        fill_xor_mask(0, key_size_, multi_probe_level_, xor_masks_);
+    }
+
+
+    LshIndex(const LshIndex&);
+    LshIndex& operator=(const LshIndex&);
+
+    /**
+     * Builds the index
+     */
+    void buildIndex() CV_OVERRIDE
+    {
+        tables_.resize(table_number_);
+        for (int i = 0; i < table_number_; ++i) {
+            lsh::LshTable<ElementType>& table = tables_[i];
+            table = lsh::LshTable<ElementType>(feature_size_, key_size_);
+
+            // Add the features to the table
+            table.add(dataset_);
+        }
+    }
+
+    flann_algorithm_t getType() const CV_OVERRIDE
+    {
+        return FLANN_INDEX_LSH;
+    }
+
+
+    void saveIndex(FILE* stream) CV_OVERRIDE
+    {
+        save_value(stream,table_number_);
+        save_value(stream,key_size_);
+        save_value(stream,multi_probe_level_);
+        save_value(stream, dataset_);
+    }
+
+    void loadIndex(FILE* stream) CV_OVERRIDE
+    {
+        load_value(stream, table_number_);
+        load_value(stream, key_size_);
+        load_value(stream, multi_probe_level_);
+        load_value(stream, dataset_);
+        // Building the index is so fast we can afford not storing it
+        buildIndex();
+
+        index_params_["algorithm"] = getType();
+        index_params_["table_number"] = table_number_;
+        index_params_["key_size"] = key_size_;
+        index_params_["multi_probe_level"] = multi_probe_level_;
+    }
+
+    /**
+     *  Returns size of index.
+     */
+    size_t size() const CV_OVERRIDE
+    {
+        return dataset_.rows;
+    }
+
+    /**
+     * Returns the length of an index feature.
+     */
+    size_t veclen() const CV_OVERRIDE
+    {
+        return feature_size_;
+    }
+
+    /**
+     * Computes the index memory usage
+     * Returns: memory used by the index
+     */
+    int usedMemory() const CV_OVERRIDE
+    {
+        return (int)(dataset_.rows * sizeof(int));
+    }
+
+
+    IndexParams getParameters() const CV_OVERRIDE
+    {
+        return index_params_;
+    }
+
+    /**
+     * \brief Perform k-nearest neighbor search, one query row at a time
+     * \param[in] queries The query points for which to find the nearest neighbors (cols must equal veclen())
+     * \param[out] indices The indices of the nearest neighbors found; each row is pre-filled with -1
+     * \param[out] dists Distances to the nearest neighbors found; each row is pre-filled with the maximum DistanceType
+     * \param[in] knn Number of nearest neighbors to return
+     * \param[in] params Search parameters; "sorted" (default true) selects sortAndCopy() over copy()
+     */
+    virtual void knnSearch(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, int knn, const SearchParams& params) CV_OVERRIDE
+    {
+        CV_Assert(queries.cols == veclen());
+        CV_Assert(indices.rows >= queries.rows);
+        CV_Assert(dists.rows >= queries.rows);
+        CV_Assert(int(indices.cols) >= knn);
+        CV_Assert(int(dists.cols) >= knn);
+
+        const bool sorted = get_param(params, "sorted", true); // does not depend on the row: look it up once
+        KNNUniqueResultSet<DistanceType> resultSet(knn);
+        for (size_t i = 0; i < queries.rows; i++) {
+            resultSet.clear();
+            std::fill_n(indices[i], knn, -1);
+            std::fill_n(dists[i], knn, std::numeric_limits<DistanceType>::max());
+            findNeighbors(resultSet, queries[i], params);
+            if (sorted) resultSet.sortAndCopy(indices[i], dists[i], knn);
+            else resultSet.copy(indices[i], dists[i], knn);
+        }
+    }
+
+
+    /**
+     * Find set of nearest neighbors to vec. Their indices are stored inside
+     * the result object.
+     *
+     * Params:
+     *     result = the result object in which the indices of the nearest-neighbors are stored
+     *     vec = the vector for which to search the nearest neighbors
+     *     searchParams = not used by this index (the parameter is left unnamed)
+     */
+    void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& /*searchParams*/) CV_OVERRIDE
+    {
+        getNeighbors(vec, result);
+    }
+
+private:
+    /** A (score, index) pair, and a comparator that orders such pairs by their index (.second) only
+     */
+    typedef std::pair<float, unsigned int> ScoreIndexPair;
+    struct SortScoreIndexPairOnSecond
+    {
+        bool operator()(const ScoreIndexPair& left, const ScoreIndexPair& right) const
+        {
+            return left.second < right.second;
+        }
+    };
+
+    /** Fills the different xor masks to use when getting the neighbors in multi-probe LSH
+     * @param key the mask built so far; it is appended to xor_masks before any recursion
+     * @param lowest_index the lowest index of the bit set in key; only lower bits are added
+     * @param level the multi-probe level we are at, i.e. how many more bits may still be set
+     * @param xor_masks receives all the xor masks
+     */
+    void fill_xor_mask(lsh::BucketKey key, int lowest_index, unsigned int level,
+                       std::vector<lsh::BucketKey>& xor_masks)
+    {
+        xor_masks.push_back(key);
+        if (level == 0) return;
+        for (int index = lowest_index - 1; index >= 0; --index) {
+            // Create a new key; shift in the unsigned key type so that bit 31 never lands in the sign bit of an int
+            lsh::BucketKey new_key = key | (lsh::BucketKey(1) << index);
+            fill_xor_mask(new_key, index, level - 1, xor_masks);
+        }
+    }
+
+    /** Performs the approximate nearest-neighbor search, collecting candidates in a function-local static heap.
+     * @param vec the feature to analyze
+     * @param do_radius unused (the parameter is left unnamed)
+     * @param radius candidates closer than this are kept when do_k is false
+     * @param do_k when true keep at most k_nn candidates with the smallest distances, otherwise filter by radius
+     * @param k_nn the number of nearest neighbors
+     * @param checked_average unused (the parameter is left unnamed)
+     */
+    void getNeighbors(const ElementType* vec, bool /*do_radius*/, float radius, bool do_k, unsigned int k_nn,
+                      float& /*checked_average*/)
+    {
+        static std::vector<ScoreIndexPair> score_index_heap;
+
+        if (do_k) {
+            unsigned int worst_score = std::numeric_limits<unsigned int>::max();
+            typename std::vector<lsh::LshTable<ElementType> >::const_iterator table = tables_.begin();
+            typename std::vector<lsh::LshTable<ElementType> >::const_iterator table_end = tables_.end();
+            for (; table != table_end; ++table) {
+                size_t key = table->getKey(vec);
+                std::vector<lsh::BucketKey>::const_iterator xor_mask = xor_masks_.begin();
+                std::vector<lsh::BucketKey>::const_iterator xor_mask_end = xor_masks_.end();
+                for (; xor_mask != xor_mask_end; ++xor_mask) {
+                    size_t sub_key = key ^ (*xor_mask);
+                    const lsh::Bucket* bucket = table->getBucketFromKey((lsh::BucketKey)sub_key);
+                    if (bucket == 0) continue;
+
+                    // Go over each descriptor index
+                    std::vector<lsh::FeatureIndex>::const_iterator training_index = bucket->begin();
+                    std::vector<lsh::FeatureIndex>::const_iterator last_training_index = bucket->end();
+                    DistanceType hamming_distance;
+
+                    // Process the rest of the candidates
+                    for (; training_index < last_training_index; ++training_index) {
+                        hamming_distance = distance_(vec, dataset_[*training_index], dataset_.cols);
+
+                        if (hamming_distance < worst_score) {
+                            // Insert the new element: store the feature index itself, not the iterator pointing at it
+                            score_index_heap.push_back(ScoreIndexPair(hamming_distance, *training_index));
+                            std::push_heap(score_index_heap.begin(), score_index_heap.end());
+
+                            if (score_index_heap.size() > (unsigned int)k_nn) {
+                                // Remove the highest distance value as we have too many elements
+                                std::pop_heap(score_index_heap.begin(), score_index_heap.end());
+                                score_index_heap.pop_back();
+                                // Keep track of the worst score
+                                worst_score = score_index_heap.front().first;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else {
+            typename std::vector<lsh::LshTable<ElementType> >::const_iterator table = tables_.begin();
+            typename std::vector<lsh::LshTable<ElementType> >::const_iterator table_end = tables_.end();
+            for (; table != table_end; ++table) {
+                size_t key = table->getKey(vec);
+                std::vector<lsh::BucketKey>::const_iterator xor_mask = xor_masks_.begin();
+                std::vector<lsh::BucketKey>::const_iterator xor_mask_end = xor_masks_.end();
+                for (; xor_mask != xor_mask_end; ++xor_mask) {
+                    size_t sub_key = key ^ (*xor_mask);
+                    const lsh::Bucket* bucket = table->getBucketFromKey((lsh::BucketKey)sub_key);
+                    if (bucket == 0) continue;
+
+                    // Go over each descriptor index
+                    std::vector<lsh::FeatureIndex>::const_iterator training_index = bucket->begin();
+                    std::vector<lsh::FeatureIndex>::const_iterator last_training_index = bucket->end();
+                    DistanceType hamming_distance;
+
+                    // Process the rest of the candidates
+                    for (; training_index < last_training_index; ++training_index) {
+                        // Compute the Hamming distance
+                        hamming_distance = distance_(vec, dataset_[*training_index], dataset_.cols);
+                        if (hamming_distance < radius) score_index_heap.push_back(ScoreIndexPair(hamming_distance, *training_index));
+                    }
+                }
+            }
+        }
+    }
+
+    /** Performs the approximate nearest-neighbor search.
+     * This is a slower version than the above as it uses the ResultSet
+     * @param vec the feature to analyze
+     * @param result receives (distance, feature index) for every feature found in a probed bucket
+     */
+    void getNeighbors(const ElementType* vec, ResultSet<DistanceType>& result)
+    {
+        // Iterator shorthands for the three nested containers
+        typedef typename std::vector<lsh::LshTable<ElementType> >::const_iterator TableIter;
+        typedef std::vector<lsh::BucketKey>::const_iterator MaskIter;
+        typedef std::vector<lsh::FeatureIndex>::const_iterator FeatureIter;
+
+        for (TableIter tbl = tables_.begin(), tbl_end = tables_.end(); tbl != tbl_end; ++tbl) {
+            // Key of the query in this table
+            size_t base_key = tbl->getKey(vec);
+
+            for (MaskIter mask = xor_masks_.begin(), mask_end = xor_masks_.end(); mask != mask_end; ++mask) {
+                // Probe the bucket whose key is the query key XOR-ed with this mask
+                size_t probe_key = base_key ^ (*mask);
+                const lsh::Bucket* candidates = tbl->getBucketFromKey((lsh::BucketKey)probe_key);
+                if (!candidates) continue;
+
+                // Compute the distance to every feature stored in that bucket
+                // and hand it over to the result set
+                for (FeatureIter it = candidates->begin(), it_end = candidates->end(); it < it_end; ++it) {
+                    DistanceType dist = distance_(vec, dataset_[*it], (int)dataset_.cols);
+                    result.addPoint(dist, *it);
+                }
+            }
+        }
+    }
+
+    /** The different hash tables */
+    std::vector<lsh::LshTable<ElementType> > tables_;
+
+    /** The data the LSH tables were built from */
+    Matrix<ElementType> dataset_;
+
+    /** The size of the features (as ElementType[]) */
+    unsigned int feature_size_;
+
+    IndexParams index_params_; // returned by getParameters(); refreshed by loadIndex()
+
+    /** table number (written by saveIndex(), read back by loadIndex()) */
+    int table_number_;
+    /** key size (written by saveIndex(), read back by loadIndex()) */
+    int key_size_;
+    /** How far should we look for neighbors in multi-probe LSH */
+    int multi_probe_level_;
+
+    /** The XOR masks to apply to a key to get the neighboring buckets */
+    std::vector<lsh::BucketKey> xor_masks_;
+
+    Distance distance_; // distance functor applied to (query, dataset row, dataset_.cols) in getNeighbors()
+};
+}
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+//! @endcond
+
+#endif //OPENCV_FLANN_LSH_INDEX_H_
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/lsh_table.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/lsh_table.h
new file mode 100644
index 000000000000..3f51457cbb09
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/lsh_table.h
@@ -0,0 +1,522 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+/***********************************************************************
+ * Author: Vincent Rabaud
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_LSH_TABLE_H_
+#define OPENCV_FLANN_LSH_TABLE_H_
+
+//! @cond IGNORED
+
+#include <algorithm>
+#include <iostream>
+#include <iomanip>
+#include <limits.h>
+// TODO as soon as we use C++0x, use the code in USE_UNORDERED_MAP
+#ifdef __GXX_EXPERIMENTAL_CXX0X__
+#  define USE_UNORDERED_MAP 1
+#else
+#  define USE_UNORDERED_MAP 0
+#endif
+#if USE_UNORDERED_MAP
+#include <unordered_map>
+#else
+#include <map>
+#endif
+#include <math.h>
+#include <stddef.h>
+
+#include "dynamic_bitset.h"
+#include "matrix.h"
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4702) //disable unreachable code
+#endif
+
+
+namespace cvflann
+{
+
+namespace lsh
+{
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/** What is stored in an LSH bucket: the index of a feature
+ */
+typedef uint32_t FeatureIndex;
+/** The id from which we can get a bucket back in an LSH table
+ */
+typedef unsigned int BucketKey;
+
+/** A bucket in an LSH table: the list of feature indices that share one key
+ */
+typedef std::vector<FeatureIndex> Bucket;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/** Plain data holder for stats about an LSH table (the members have no default values)
+ */
+struct LshStats
+{
+    std::vector<unsigned int> bucket_sizes_;
+    size_t n_buckets_;
+    size_t bucket_size_mean_;
+    size_t bucket_size_median_;
+    size_t bucket_size_min_;
+    size_t bucket_size_max_;
+    size_t bucket_size_std_dev;
+    /** Each contained vector contains three values: beginning/end of the interval, number of elements in the bin
+     */
+    std::vector<std::vector<unsigned int> > size_histogram_;
+};
+
+/** Overload the << operator for LshStats: prints the bucket count, the mean/median/min/max bucket size and the histogram
+ * @param out the output stream
+ * @param stats the stats to display
+ * @return the same stream, to allow chaining
+ */
+inline std::ostream& operator <<(std::ostream& out, const LshStats& stats)
+{
+    int w = 20;
+    out << "Lsh Table Stats:\n" << std::setw(w) << std::setiosflags(std::ios::right) << "N buckets : "
+    << stats.n_buckets_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "mean size : "
+    << std::setiosflags(std::ios::left) << stats.bucket_size_mean_ << "\n" << std::setw(w)
+    << std::setiosflags(std::ios::right) << "median size : " << stats.bucket_size_median_ << "\n" << std::setw(w)
+    << std::setiosflags(std::ios::right) << "min size : " << std::setiosflags(std::ios::left)
+    << stats.bucket_size_min_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "max size : "
+    << std::setiosflags(std::ios::left) << stats.bucket_size_max_;
+
+    // Display the histogram, one "first-last: count" entry per bin
+    out << std::endl << std::setw(w) << std::setiosflags(std::ios::right) << "histogram : "
+    << std::setiosflags(std::ios::left);
+    for (std::vector<std::vector<unsigned int> >::const_iterator iterator = stats.size_histogram_.begin(), end =
+             stats.size_histogram_.end(); iterator != end; ++iterator) out << (*iterator)[0] << "-" << (*iterator)[1] << ": " << (*iterator)[2] << ",  ";
+
+    return out;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/** Lsh hash table. As its key is a sub-feature, and as usually
+ * the size of it is pretty small, we keep it as a continuous memory array.
+ * The value is an index in the corpus of features (we keep it as an unsigned
+ * int for pure memory reasons, it could be a size_t)
+ */
+template<typename ElementType>
+class LshTable
+{
+public:
+    /** A container of all the feature indices. Optimized for space
+     */
+#if USE_UNORDERED_MAP
+    typedef std::unordered_map<BucketKey, Bucket> BucketsSpace;
+#else
+    typedef std::map<BucketKey, Bucket> BucketsSpace;
+#endif
+
+    /** A container of all the feature indices. Optimized for speed
+     */
+    typedef std::vector<Bucket> BucketsSpeed;
+
+    /** Default constructor
+     */
+    LshTable()
+    {
+        key_size_ = 0;
+        feature_size_ = 0;
+        speed_level_ = kArray;
+    }
+
+    /** Generic constructor: always raises StsUnsupportedFormat, only type specializations build a table
+     * (the unsigned char specialization creates the mask and allocates the memory)
+     * @param feature_size is the size of the feature (considered as a ElementType[])
+     * @param key_size is the number of bits that are turned on in the feature
+     */
+    LshTable(unsigned int feature_size, unsigned int key_size)
+    {
+        feature_size_ = feature_size;
+        CV_UNUSED(key_size);
+        CV_Error(cv::Error::StsUnsupportedFormat, "LSH is not implemented for that type" );
+    }
+
+    /** Add a feature to the table
+     * @param value the value to store for that feature
+     * @param feature the feature itself
+     */
+    void add(unsigned int value, const ElementType* feature)
+    {
+        // Add the value to the corresponding bucket
+        BucketKey key = (lsh::BucketKey)getKey(feature);
+
+        switch (speed_level_) {
+        case kArray:
+            // That means we get the buckets from an array
+            buckets_speed_[key].push_back(value);
+            break;
+        case kBitsetHash:
+            // That means we can check the bitset for the presence of a key
+            key_bitset_.set(key);
+            buckets_space_[key].push_back(value);
+            break;
+        case kHash:
+        {
+            // That means we have to check for the hash table for the presence of a key
+            buckets_space_[key].push_back(value);
+            break;
+        }
+        }
+    }
+
+    /** Add a set of features to the table; the row number of each feature is the value stored for it
+     * @param dataset the values to store
+     */
+    void add(Matrix<ElementType> dataset)
+    {
+#if USE_UNORDERED_MAP
+        buckets_space_.rehash((buckets_space_.size() + dataset.rows) * 1.2);
+#endif
+        // Add the features to the table
+        for (unsigned int i = 0; i < dataset.rows; ++i) add(i, dataset[i]);
+        // Now that the table is full, optimize it for speed/space
+        optimize();
+    }
+
+    /** Get a bucket given the key; the bitset and hash storages return 0 when no bucket exists for that key
+     */
+    inline const Bucket* getBucketFromKey(BucketKey key) const
+    {
+        // Generate other buckets
+        switch (speed_level_) {
+        case kArray:
+            // That means we get the buckets from an array
+            return &buckets_speed_[key];
+            break;
+        case kBitsetHash:
+            // That means we can check the bitset for the presence of a key
+            if (key_bitset_.test(key)) return &buckets_space_.find(key)->second;
+            else return 0;
+            break;
+        case kHash:
+        {
+            // That means we have to check for the hash table for the presence of a key
+            BucketsSpace::const_iterator bucket_it, bucket_end = buckets_space_.end();
+            bucket_it = buckets_space_.find(key);
+            // Stop here if that bucket does not exist
+            if (bucket_it == bucket_end) return 0;
+            else return &bucket_it->second;
+            break;
+        }
+        }
+        return 0;
+    }
+
+    /** Compute the sub-signature of a feature (generic version: always raises StsUnsupportedFormat)
+     */
+    size_t getKey(const ElementType* /*feature*/) const
+    {
+        CV_Error(cv::Error::StsUnsupportedFormat, "LSH is not implemented for that type" );
+        return 0;
+    }
+
+    /** Get statistics about the table
+     */
+    LshStats getStats() const;
+
+private:
+    /** defines the speed of the implementation
+     * kArray uses a vector for storing data
+     * kBitsetHash uses a hash map but checks for the validity of a key with a bitset
+     * kHash uses a hash map only
+     */
+    enum SpeedLevel
+    {
+        kArray, kBitsetHash, kHash
+    };
+
+    /** Validate key_size, then initialize key_size_ and start with the hash-only storage (kHash)
+     */
+    void initialize(size_t key_size)
+    {
+        const size_t key_size_lower_bound = 1;
+        //a value (size_t(1) << key_size) must fit the size_t type so key_size has to be strictly less than size of size_t
+        const size_t key_size_upper_bound = (std::min)(sizeof(BucketKey) * CHAR_BIT + 1, sizeof(size_t) * CHAR_BIT);
+        if (key_size < key_size_lower_bound || key_size >= key_size_upper_bound)
+        {
+            CV_Error(cv::Error::StsBadArg, cv::format("Invalid key_size (=%d). Valid values for your system are %d <= key_size < %d.", (int)key_size, (int)key_size_lower_bound, (int)key_size_upper_bound));
+        }
+
+        speed_level_ = kHash;
+        key_size_ = (unsigned)key_size;
+    }
+
+    /** Optimize the table for speed/space
+     */
+    void optimize()
+    {
+        // If we are already using the fast storage, no need to do anything
+        if (speed_level_ == kArray) return;
+
+        // Use an array if it will be more than half full
+        if (buckets_space_.size() > ((size_t(1) << key_size_) / 2)) {
+            speed_level_ = kArray;
+            // Fill the array version of it
+            buckets_speed_.resize(size_t(1) << key_size_);
+            for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) buckets_speed_[key_bucket->first] = key_bucket->second;
+
+            // Empty the hash table
+            buckets_space_.clear();
+            return;
+        }
+
+        // If the bitset is going to use less than 10% of the RAM of the hash map (at least 1 size_t for the key and two
+        // for the vector) or the key is small. NOTE(review): the code tests key_size_ <= 32, this comment used to say 30 - confirm
+        if (((std::max(buckets_space_.size(), buckets_speed_.size()) * CHAR_BIT * 3 * sizeof(BucketKey)) / 10
+             >= (size_t(1) << key_size_)) || (key_size_ <= 32)) {
+            speed_level_ = kBitsetHash;
+            key_bitset_.resize(size_t(1) << key_size_);
+            key_bitset_.reset();
+            // Try with the BucketsSpace
+            for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) key_bitset_.set(key_bucket->first);
+        }
+        else {
+            speed_level_ = kHash;
+            key_bitset_.clear();
+        }
+    }
+
+    /** The vector of all the buckets if they are held for speed
+     */
+    BucketsSpeed buckets_speed_;
+
+    /** The hash table of all the buckets in case we cannot use the speed version
+     */
+    BucketsSpace buckets_space_;
+
+    /** What is used to store the data */
+    SpeedLevel speed_level_;
+
+    /** If the subkey is small enough, it will keep track of which subkeys are set through that bitset
+     * That is just a speedup so that we don't look in the hash table (which can be much slower than checking a bitset)
+     */
+    DynamicBitset key_bitset_;
+
+    /** The size of the sub-signature in bits
+     */
+    unsigned int key_size_;
+
+    unsigned int feature_size_;
+
+    // Members only used for the unsigned char specialization
+    /** The mask to apply to a feature to get the hash key
+     * Only used in the unsigned char case
+     */
+    std::vector<size_t> mask_;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Specialization for unsigned char
+
+template<>
+inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size)
+{
+    feature_size_ = feature_size;
+    initialize(subsignature_size);
+    // The key samples key_size_ distinct feature bits: a wider key would read indices[] (and write mask_) out of range below
+    if ((size_t)key_size_ > (size_t)feature_size * CHAR_BIT) CV_Error(cv::Error::StsBadArg, "key_size must not exceed the number of bits in a feature");
+    mask_ = std::vector<size_t>((feature_size * sizeof(char) + sizeof(size_t) - 1) / sizeof(size_t), 0); // Allocate the mask
+    // A bit brutal but fast to code: list every bit position of the feature, then shuffle the list
+    std::vector<int> indices(feature_size * CHAR_BIT);
+    for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) indices[i] = (int)i;
+#ifndef OPENCV_FLANN_USE_STD_RAND
+    cv::randShuffle(indices);
+#else
+    std::random_shuffle(indices.begin(), indices.end());
+#endif
+
+    // The first key_size_ shuffled positions are the randomly selected bits of the sub-signature
+    for (unsigned int i = 0; i < key_size_; ++i) {
+        size_t index = indices[i];
+
+        // Set that bit in the mask
+        size_t divisor = CHAR_BIT * sizeof(size_t);
+        size_t idx = index / divisor; //pick the right size_t index
+        mask_[idx] |= size_t(1) << (index % divisor); //use modulo to find the bit offset
+    }
+
+    // Set to 1 if you want to display the mask for debug
+#if 0
+    {
+        size_t bcount = 0;
+        BOOST_FOREACH(size_t mask_block, mask_){
+            out << std::setw(sizeof(size_t) * CHAR_BIT / 4) << std::setfill('0') << std::hex << mask_block
+                << std::endl;
+            bcount += __builtin_popcountll(mask_block);
+        }
+        out << "bit count : " << std::dec << bcount << std::endl;
+        out << "mask size : " << mask_.size() << std::endl;
+        return out;
+    }
+#endif
+}
+
+/** Return the Subsignature of a feature: the feature bits selected by mask_, packed into the low bits of the result
+ * @param feature the feature to analyze (feature_size_ bytes are read)
+ */
+template<>
+inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) const
+{
+    // The feature is consumed in size_t-sized blocks. feature_size_ does not have to be
+    // a multiple of sizeof(size_t): a trailing partial block is copied with memcpy below,
+    // so that no byte located after the feature buffer is read.
+    const size_t* feature_block_ptr = reinterpret_cast<const size_t*> ((const void*)feature);
+
+    // Figure out the subsignature of the feature
+    // Given the feature ABCDEF, and the mask 001011, the output will be
+    // 000CEF
+    size_t subsignature = 0;
+    size_t bit_index = 1;
+
+    for (unsigned i = 0; i < feature_size_; i += sizeof(size_t)) {
+        // get the mask and signature blocks; "feature_size_ - sizeof(size_t)" would wrap around for a feature shorter than one block
+        size_t feature_block;
+        if (i + sizeof(size_t) <= feature_size_)
+        {
+            feature_block = *feature_block_ptr;
+        }
+        else
+        {
+            size_t tmp = 0;
+            memcpy(&tmp, feature_block_ptr, feature_size_ - i); // preserve bytes order
+            feature_block = tmp;
+        }
+        size_t mask_block = mask_[i / sizeof(size_t)];
+        while (mask_block) {
+            // Get the lowest set bit in the mask block
+            size_t lowest_bit = mask_block & ~(mask_block - 1);
+            // Add it to the current subsignature if necessary
+            subsignature += (feature_block & lowest_bit) ? bit_index : 0;
+            // Reset the bit in the mask block
+            mask_block ^= lowest_bit;
+            // increment the bit index for the subsignature
+            bit_index <<= 1;
+        }
+        // Check the next feature block
+        ++feature_block_ptr;
+    }
+    return subsignature;
+}
+
+template<>
+inline LshStats LshTable<unsigned char>::getStats() const
+{
+    LshStats stats;
+    stats.bucket_size_mean_ = stats.bucket_size_std_dev = 0; // the std dev is never computed: report 0, not an indeterminate value
+    if ((buckets_speed_.empty()) && (buckets_space_.empty())) {
+        stats.n_buckets_ = 0;
+        stats.bucket_size_median_ = 0;
+        stats.bucket_size_min_ = 0;
+        stats.bucket_size_max_ = 0;
+        return stats;
+    }
+
+    if (!buckets_speed_.empty()) {
+        for (BucketsSpeed::const_iterator pbucket = buckets_speed_.begin(); pbucket != buckets_speed_.end(); ++pbucket) {
+            stats.bucket_sizes_.push_back((lsh::FeatureIndex)pbucket->size());
+            stats.bucket_size_mean_ += pbucket->size();
+        }
+        stats.bucket_size_mean_ /= buckets_speed_.size();
+        stats.n_buckets_ = buckets_speed_.size();
+    }
+    else {
+        for (BucketsSpace::const_iterator x = buckets_space_.begin(); x != buckets_space_.end(); ++x) {
+            stats.bucket_sizes_.push_back((lsh::FeatureIndex)x->second.size());
+            stats.bucket_size_mean_ += x->second.size();
+        }
+        stats.bucket_size_mean_ /= buckets_space_.size();
+        stats.n_buckets_ = buckets_space_.size();
+    }
+
+    std::sort(stats.bucket_sizes_.begin(), stats.bucket_sizes_.end());
+
+    //  BOOST_FOREACH(int size, stats.bucket_sizes_)
+    //          std::cout << size << " ";
+    //  std::cout << std::endl;
+    stats.bucket_size_median_ = stats.bucket_sizes_[stats.bucket_sizes_.size() / 2];
+    stats.bucket_size_min_ = stats.bucket_sizes_.front();
+    stats.bucket_size_max_ = stats.bucket_sizes_.back();
+
+    // TODO compute the std dev (the mean above is an integer average)
+    /*float mean, stddev;
+       stats.bucket_size_mean_ = mean;
+       stats.bucket_size_std_dev = stddev;*/
+
+    // Include a histogram of the buckets: bins of width 20, only bins that contain at least one bucket are emitted
+    unsigned int bin_start = 0;
+    unsigned int bin_end = 20;
+    bool is_new_bin = true;
+    for (std::vector<unsigned int>::iterator iterator = stats.bucket_sizes_.begin(), end = stats.bucket_sizes_.end(); iterator
+         != end; )
+        if (*iterator < bin_end) {
+            if (is_new_bin) {
+                stats.size_histogram_.push_back(std::vector<unsigned int>(3, 0));
+                stats.size_histogram_.back()[0] = bin_start;
+                stats.size_histogram_.back()[1] = bin_end - 1;
+                is_new_bin = false;
+            }
+            ++stats.size_histogram_.back()[2];
+            ++iterator;
+        }
+        else {
+            bin_start += 20;
+            bin_end += 20;
+            is_new_bin = true;
+        }
+
+    return stats;
+}
+
+// End the two namespaces
+}
+}
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+//! @endcond
+
+#endif /* OPENCV_FLANN_LSH_TABLE_H_ */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/matrix.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/matrix.h
new file mode 100644
index 000000000000..bfbf91ef5cd0
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/matrix.h
@@ -0,0 +1,121 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_DATASET_H_
+#define OPENCV_FLANN_DATASET_H_
+
+//! @cond IGNORED
+
+#include <stdio.h>
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/flann/defines.h"
+
+namespace cvflann
+{
+
+/**
+ * Class that implements a simple rectangular matrix stored in a memory buffer and
+ * provides convenient matrix-like access using the [] operators.
+ */
+template <typename T>
+class Matrix
+{
+public:
+    typedef T type;
+
+    size_t rows;
+    size_t cols;
+    size_t stride;
+    T* data;
+
+    Matrix() : rows(0), cols(0), stride(0), data(NULL)
+    {
+    }
+
+    Matrix(T* data_, size_t rows_, size_t cols_, size_t stride_ = 0) :
+        rows(rows_), cols(cols_),  stride(stride_), data(data_)
+    {
+        if (stride==0) stride = cols; // a stride of 0 means "rows are contiguous"
+    }
+
+    /**
+     * Deprecated no-op: only prints a notice on stderr, the storage data is NOT deallocated.
+     */
+    CV_DEPRECATED void free()
+    {
+        fprintf(stderr, "The cvflann::Matrix<T>::free() method is deprecated "
+                "and it does not do any memory deallocation any more.  You are "
+                "responsible for deallocating the matrix memory (by doing "
+                "'delete[] matrix.data' for example)");
+    }
+
+    /**
+     * Operator that returns a (pointer to a) row of the data; consecutive rows are stride elements apart.
+     */
+    T* operator[](size_t index) const
+    {
+        return data+index*stride;
+    }
+};
+
+/** Matrix descriptor whose buffer is a void*; as<T>() views that buffer as a Matrix<T>. */
+class UntypedMatrix
+{
+public:
+    size_t rows;
+    size_t cols;
+    void* data;
+    flann_datatype_t type; // NOTE(review): not set by the constructor below
+
+    UntypedMatrix(void* data_, long rows_, long cols_) :
+        rows(rows_), cols(cols_), data(data_)
+    {
+    }
+
+    ~UntypedMatrix()
+    {
+    }
+
+    /** Views the buffer as a rows x cols Matrix<T>; the pointer cast is unchecked. */
+    template<typename T>
+    Matrix<T> as()
+    {
+        return Matrix<T>((T*)data, rows, cols);
+    }
+};
+
+
+
+}
+
+//! @endcond
+
+#endif //OPENCV_FLANN_DATASET_H_
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/miniflann.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/flann/miniflann.hpp
new file mode 100644
index 000000000000..b8df92d758ee
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/miniflann.hpp
@@ -0,0 +1,185 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_MINIFLANN_HPP
+#define OPENCV_MINIFLANN_HPP
+
+//! @cond IGNORED
+
+#include "opencv2/core.hpp"
+#include "opencv2/flann/defines.h"
+
+namespace cv
+{
+
+namespace flann
+{
+
+enum FlannIndexType { // value-kind tags, reported per parameter by IndexParams::getAll()
+    FLANN_INDEX_TYPE_8U = CV_8U, // numeric tags reuse the OpenCV CV_* depth constants
+    FLANN_INDEX_TYPE_8S = CV_8S,
+    FLANN_INDEX_TYPE_16U = CV_16U,
+    FLANN_INDEX_TYPE_16S = CV_16S,
+    FLANN_INDEX_TYPE_32S = CV_32S,
+    FLANN_INDEX_TYPE_32F = CV_32F,
+    FLANN_INDEX_TYPE_64F = CV_64F,
+    FLANN_INDEX_TYPE_STRING, // non-numeric tags continue from CV_64F + 1
+    FLANN_INDEX_TYPE_BOOL,
+    FLANN_INDEX_TYPE_ALGORITHM,
+    LAST_VALUE_FLANN_INDEX_TYPE = FLANN_INDEX_TYPE_ALGORITHM // alias of the last enumerator
+};
+
+struct CV_EXPORTS IndexParams // string-keyed parameter set; member functions are only declared in this header
+{
+    IndexParams();
+    ~IndexParams(); // non-virtual - NOTE(review): do not delete derived parameter structs through IndexParams*
+
+    String getString(const String& key, const String& defaultVal=String()) const; // each getter takes a fallback value
+    int getInt(const String& key, int defaultVal=-1) const;
+    double getDouble(const String& key, double defaultVal=-1) const;
+
+    void setString(const String& key, const String& value);
+    void setInt(const String& key, int value);
+    void setDouble(const String& key, double value);
+    void setFloat(const String& key, float value);
+    void setBool(const String& key, bool value);
+    void setAlgorithm(int value); // the only setter without a key argument
+
+    // FIXIT: replace by void write(FileStorage& fs) const + read()
+    void getAll(std::vector<String>& names,
+                std::vector<FlannIndexType>& types,
+                std::vector<String>& strValues,
+                std::vector<double>& numValues) const; // four output vectors - presumably parallel, one entry per parameter
+
+    void* params; // opaque handle; its concrete type is not visible in this header
+
+private:
+    IndexParams(const IndexParams &); // copy disabled
+    IndexParams& operator=(const IndexParams &); // assign disabled
+};
+
+struct CV_EXPORTS KDTreeIndexParams : public IndexParams
+{
+    KDTreeIndexParams(int trees=4); // not explicit: also serves as the default constructor and converts from int
+};
+
+struct CV_EXPORTS LinearIndexParams : public IndexParams
+{
+    LinearIndexParams(); // takes no tunables; only declared here
+};
+
+struct CV_EXPORTS CompositeIndexParams : public IndexParams
+{
+    CompositeIndexParams(int trees = 4, int branching = 32, int iterations = 11, // every argument has a default
+                         cvflann::flann_centers_init_t centers_init = cvflann::FLANN_CENTERS_RANDOM, float cb_index = 0.2f );
+};
+
+struct CV_EXPORTS AutotunedIndexParams : public IndexParams
+{
+    AutotunedIndexParams(float target_precision = 0.8f, float build_weight = 0.01f, // every argument has a default
+                         float memory_weight = 0, float sample_fraction = 0.1f);
+};
+
+struct CV_EXPORTS HierarchicalClusteringIndexParams : public IndexParams
+{
+    HierarchicalClusteringIndexParams(int branching = 32, // every argument has a default
+                      cvflann::flann_centers_init_t centers_init = cvflann::FLANN_CENTERS_RANDOM, int trees = 4, int leaf_size = 100 );
+};
+
+struct CV_EXPORTS KMeansIndexParams : public IndexParams
+{
+    KMeansIndexParams(int branching = 32, int iterations = 11, // every argument has a default
+                      cvflann::flann_centers_init_t centers_init = cvflann::FLANN_CENTERS_RANDOM, float cb_index = 0.2f );
+};
+
+struct CV_EXPORTS LshIndexParams : public IndexParams
+{
+    LshIndexParams(int table_number, int key_size, int multi_probe_level); // no defaults: all three values are required
+};
+
+struct CV_EXPORTS SavedIndexParams : public IndexParams
+{
+    SavedIndexParams(const String& filename); // not explicit: a String converts implicitly to SavedIndexParams
+};
+
+struct CV_EXPORTS SearchParams : public IndexParams
+{
+    SearchParams( int checks, float eps, bool sorted, bool explore_all_trees ); // selected only when all four arguments are given
+    SearchParams( int checks = 32, float eps = 0, bool sorted = true ); // handles zero to three arguments
+};
+
+class CV_EXPORTS_W Index // exported nearest-neighbour index class; member functions are only declared in this header
+{
+public:
+    CV_WRAP Index();
+    CV_WRAP Index(InputArray features, const IndexParams& params, cvflann::flann_distance_t distType=cvflann::FLANN_DIST_L2);
+    virtual ~Index();
+
+    CV_WRAP virtual void build(InputArray features, const IndexParams& params, cvflann::flann_distance_t distType=cvflann::FLANN_DIST_L2);
+    CV_WRAP virtual void knnSearch(InputArray query, OutputArray indices,
+                   OutputArray dists, int knn, const SearchParams& params=SearchParams());
+
+    CV_WRAP virtual int radiusSearch(InputArray query, OutputArray indices,
+                             OutputArray dists, double radius, int maxResults,
+                             const SearchParams& params=SearchParams());
+
+    CV_WRAP virtual void save(const String& filename) const;
+    CV_WRAP virtual bool load(InputArray features, const String& filename);
+    CV_WRAP virtual void release();
+    CV_WRAP cvflann::flann_distance_t getDistance() const;
+    CV_WRAP cvflann::flann_algorithm_t getAlgorithm() const;
+
+protected:
+    bool load_(const String& filename);
+
+    cvflann::flann_distance_t distType;
+    cvflann::flann_algorithm_t algo;
+    int featureType; // NOTE(review): presumably a CV_* depth of the indexed features - confirm in the implementation
+    void* index; // opaque handle; its concrete type is not visible in this header
+    Mat features_clone;  // index may store features pointer internally for searching, so avoid dangling pointers: https://github.com/opencv/opencv/issues/17553
+};
+
+} } // namespace cv::flann
+
+//! @endcond
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/nn_index.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/nn_index.h
new file mode 100644
index 000000000000..23a1de745354
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/nn_index.h
@@ -0,0 +1,180 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_NNINDEX_H
+#define OPENCV_FLANN_NNINDEX_H
+
+#include "matrix.h"
+#include "result_set.h"
+#include "params.h"
+
+//! @cond IGNORED
+
+namespace cvflann
+{
+
+/**
+ * Nearest-neighbour index base class
+ */
+template <typename Distance>
+class NNIndex
+{
+    typedef typename Distance::ElementType ElementType;
+    typedef typename Distance::ResultType DistanceType;
+
+public:
+
+    virtual ~NNIndex() {}
+
+    /**
+     * \brief Builds the index
+     */
+    virtual void buildIndex() = 0;
+
+    /**
+     * \brief Perform k-nearest neighbor search
+     * \param[in] queries The query points for which to find the nearest neighbors
+     * \param[out] indices The indices of the nearest neighbors found
+     * \param[out] dists Distances to the nearest neighbors found
+     * \param[in] knn Number of nearest neighbors to return
+     * \param[in] params Search parameters; the "sorted" entry (default true) selects sortAndCopy() over copy()
+     */
+    virtual void knnSearch(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, int knn, const SearchParams& params)
+    {
+        CV_Assert(queries.cols == veclen()); // each query row must have the index's dimensionality
+        CV_Assert(indices.rows >= queries.rows); // one output row per query row
+        CV_Assert(dists.rows >= queries.rows);
+        CV_Assert(int(indices.cols) >= knn); // each output row must hold at least knn entries
+        CV_Assert(int(dists.cols) >= knn);
+
+#if 0
+        KNNResultSet<DistanceType> resultSet(knn);
+        for (size_t i = 0; i < queries.rows; i++) {
+            resultSet.init(indices[i], dists[i]);
+            findNeighbors(resultSet, queries[i], params);
+        }
+#else
+        KNNUniqueResultSet<DistanceType> resultSet(knn);
+        for (size_t i = 0; i < queries.rows; i++) {
+            resultSet.clear(); // the same result set object is reused for every query row
+            findNeighbors(resultSet, queries[i], params);
+            if (get_param(params,"sorted",true)) resultSet.sortAndCopy(indices[i], dists[i], knn);
+            else resultSet.copy(indices[i], dists[i], knn);
+        }
+#endif
+    }
+
+    /**
+     * \brief Perform radius search
+     * \param[in] query The query point
+     * \param[out] indices The indices of the neighbors found within the given radius
+     * \param[out] dists The distances to the nearest neighbors found
+     * \param[in] radius The radius used for search
+     * \param[in] params Search parameters
+     * \returns Number of neighbors found, or -1 if query does not have exactly one row
+     */
+    virtual int radiusSearch(const Matrix<ElementType>& query, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params)
+    {
+        if (query.rows != 1) {
+            fprintf(stderr, "I can only search one feature at a time for range search\n");
+            return -1;
+        }
+        CV_Assert(query.cols == veclen());
+        CV_Assert(indices.cols == dists.cols);
+
+        int n = 0; // output capacity; stays 0 (nothing is copied out) when indices has no columns
+        int* indices_ptr = NULL;
+        DistanceType* dists_ptr = NULL;
+        if (indices.cols > 0) {
+            n = (int)indices.cols;
+            indices_ptr = indices[0];
+            dists_ptr = dists[0];
+        }
+
+        RadiusUniqueResultSet<DistanceType> resultSet((DistanceType)radius);
+        resultSet.clear();
+        findNeighbors(resultSet, query[0], params);
+        if (n>0) {
+            if (get_param(params,"sorted",true)) resultSet.sortAndCopy(indices_ptr, dists_ptr, n);
+            else resultSet.copy(indices_ptr, dists_ptr, n);
+        }
+
+        return (int)resultSet.size(); // size of the result set itself; it is not clamped to n here
+    }
+
+    /**
+     * \brief Saves the index to a stream
+     * \param stream The stream to save the index to
+     */
+    virtual void saveIndex(FILE* stream) = 0;
+
+    /**
+     * \brief Loads the index from a stream
+     * \param stream The stream from which the index is loaded
+     */
+    virtual void loadIndex(FILE* stream) = 0;
+
+    /**
+     * \returns number of features in this index.
+     */
+    virtual size_t size() const = 0;
+
+    /**
+     * \returns The dimensionality of the features in this index.
+     */
+    virtual size_t veclen() const = 0;
+
+    /**
+     * \returns The amount of memory (in bytes) used by the index.
+     */
+    virtual int usedMemory() const = 0;
+
+    /**
+     * \returns The index type (kdtree, kmeans,...)
+     */
+    virtual flann_algorithm_t getType() const = 0;
+
+    /**
+     * \returns The index parameters
+     */
+    virtual IndexParams getParameters() const = 0;
+
+
+    /**
+     * \brief Method that searches for nearest-neighbours
+     */
+    virtual void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) = 0;
+};
+
+}
+
+//! @endcond
+
+#endif //OPENCV_FLANN_NNINDEX_H
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/object_factory.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/object_factory.h
new file mode 100644
index 000000000000..5cc45ad1b34e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/object_factory.h
@@ -0,0 +1,95 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_OBJECT_FACTORY_H_
+#define OPENCV_FLANN_OBJECT_FACTORY_H_
+
+//! @cond IGNORED
+
+#include <map>
+
+namespace cvflann
+{
+
+class CreatorNotFound // thrown by ObjectFactory::create() for an unregistered id; not derived from std::exception
+{
+};
+
+template<typename BaseClass,
+         typename UniqueIdType,
+         typename ObjectCreator = BaseClass* (*)()>
+class ObjectFactory
+{
+    typedef ObjectFactory<BaseClass,UniqueIdType,ObjectCreator> ThisClass;
+    typedef std::map<UniqueIdType, ObjectCreator> ObjectRegistry;
+
+    // Construction is private: the only instance is the one handed out by instance().
+    ObjectFactory() {}
+
+public:
+    // Registers `creator` under `id`; returns false (and changes nothing) if `id` is already taken.
+    bool subscribe(UniqueIdType id, ObjectCreator creator)
+    {
+        const bool already_known = object_registry.count(id) != 0;
+        if (!already_known) object_registry[id] = creator;
+        return !already_known;
+    }
+
+    // Removes the creator registered under `id`; returns true only if an entry was erased.
+    bool unregister(UniqueIdType id)
+    {
+        const typename ObjectRegistry::size_type erased = object_registry.erase(id);
+        return erased == 1;
+    }
+
+    // Returns the creator registered under `id`; throws CreatorNotFound when there is none.
+    ObjectCreator create(UniqueIdType id)
+    {
+        typename ObjectRegistry::const_iterator entry = object_registry.find(id);
+        if (entry != object_registry.end()) return entry->second;
+
+        throw CreatorNotFound();
+    }
+
+    // Accessor for the single factory object of this template instantiation (function-local static).
+    static ThisClass& instance()
+    {
+        static ThisClass factory_singleton;
+        return factory_singleton;
+    }
+private:
+    ObjectRegistry object_registry;
+};
+
+}
+
+//! @endcond
+
+#endif /* OPENCV_FLANN_OBJECT_FACTORY_H_ */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/params.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/params.h
new file mode 100644
index 000000000000..1a8e127035a0
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/params.h
@@ -0,0 +1,126 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2011  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2011  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+
+#ifndef OPENCV_FLANN_PARAMS_H_
+#define OPENCV_FLANN_PARAMS_H_
+
+//! @cond IGNORED
+
+#include "any.h"
+#include "general.h"
+#include <iostream>
+#include <map>
+
+
+namespace cvflann
+{
+
+typedef std::map<cv::String, any> IndexParams;
+
+struct SearchParams : public IndexParams
+{
+    SearchParams(int checks = 32, float eps = 0, bool sorted = true )
+    {
+        init(checks, eps, sorted, false); // three-argument form always stores explore_all_trees = false
+    }
+
+    SearchParams(int checks, float eps, bool sorted, bool explore_all_trees )
+    {
+        init(checks, eps, sorted, explore_all_trees);
+    }
+
+    void init(int checks = 32, float eps = 0, bool sorted = true, bool explore_all_trees = false )
+    {
+        // how many leaves to visit when searching for neighbours (-1 for unlimited)
+        (*this)["checks"] = checks;
+        // search for eps-approximate neighbours (default: 0)
+        (*this)["eps"] = eps;
+        // require neighbours sorted by distance (default: true); read by NNIndex::knnSearch and radiusSearch
+        (*this)["sorted"] = sorted;
+        // if false, search stops at the tree reaching the number of max checks (original behavior).
+        // When true, a descent is done in each tree; as before, the alternative paths
+        // stored in the heap are not processed further once max checks is reached.
+        (*this)["explore_all_trees"] = explore_all_trees;
+    }
+};
+
+
+template<typename T>
+T get_param(const IndexParams& params, const cv::String& name, const T& default_value)
+{
+    const IndexParams::const_iterator entry = params.find(name);
+    if (entry == params.end()) return default_value; // key absent: hand back the caller's fallback
+    try {
+        return entry->second.cast<T>();
+    } catch (const std::exception& e) {
+        CV_Error_(cv::Error::StsBadArg,
+                  ("FLANN '%s' param type mismatch: %s", name.c_str(), e.what()));
+    }
+    // A std::exception raised by the cast is reported through CV_Error_ with the key name
+    // and the exception text; anything else thrown by the cast propagates unchanged.
+    // NOTE(review): control must not fall through the catch block; this relies on
+    // CV_Error_ not returning (its definition is outside this header - confirm).
+}
+
+template<typename T>
+T get_param(const IndexParams& params, const cv::String& name)
+{
+    const IndexParams::const_iterator entry = params.find(name);
+    const bool missing = (entry == params.end());
+    if (!missing) { // key present: convert the stored value to T
+        try { return entry->second.cast<T>(); }
+        catch (const std::exception& e) {
+            CV_Error_(cv::Error::StsBadArg,
+                      ("FLANN '%s' param type mismatch: %s", name.c_str(), e.what()));
+        }
+    }
+    else { // key absent: this overload has no fallback value, so the lookup is reported as an error
+        FLANN_THROW(cv::Error::StsBadArg, cv::String("Missing parameter '")+name+cv::String("' in the parameters given"));
+    }
+}
+
+inline void print_params(const IndexParams& params, std::ostream& stream)
+{
+    // One "key : value" line per entry, in map (key) order; std::endl flushes after each line.
+    const IndexParams::const_iterator last = params.end();
+    for (IndexParams::const_iterator entry = params.begin(); entry != last; ++entry) {
+        stream << entry->first << " : " << entry->second << std::endl;
+    }
+}
+
+inline void print_params(const IndexParams& params) // convenience overload: prints to std::cout
+{
+    print_params(params, std::cout);
+}
+
+}
+
+//! @endcond
+
+#endif /* OPENCV_FLANN_PARAMS_H_ */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/random.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/random.h
new file mode 100644
index 000000000000..5a12ef3046a6
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/random.h
@@ -0,0 +1,156 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_RANDOM_H
+#define OPENCV_FLANN_RANDOM_H
+
+//! @cond IGNORED
+
+#include <algorithm>
+#include <cstdlib>
+#include <vector>
+
+namespace cvflann
+{
+
+inline int rand()
+{
+    // Random int source for cvflann: cv::theRNG() by default, std::rand() when
+    // OPENCV_FLANN_USE_STD_RAND is defined. The cv::theRNG() branches mask or bound the draw by RAND_MAX.
+#ifdef OPENCV_FLANN_USE_STD_RAND
+    const int value = std::rand();
+#elif INT_MAX == RAND_MAX
+    const int value = cv::theRNG().next() & INT_MAX;
+#else
+    const int value = cv::theRNG().uniform(0, RAND_MAX + 1);
+#endif
+    return value;
+}
+
+/**
+ * Seeds the random number generator
+ *  @param seed Random seed
+ */
+inline void seed_random(unsigned int seed)
+{
+#ifdef OPENCV_FLANN_USE_STD_RAND
+    std::srand(seed); // seeds the C library generator that cvflann::rand() reads in this configuration
+#else
+    cv::theRNG() = cv::RNG(seed); // overwrites the generator returned by cv::theRNG() with a freshly seeded one
+#endif
+}
+
+/**
+ * Generates a random double value by scaling cvflann::rand() into the requested interval.
+ * @param high Upper limit
+ * @param low Lower limit
+ * @return Random double value, low + (high - low) * u with u = rand() / (RAND_MAX + 1.0)
+ */
+inline double rand_double(double high = 1.0, double low = 0)
+{
+    // u lies in [0, 1) as long as rand() stays within [0, RAND_MAX]
+    const double unit = rand() / (RAND_MAX + 1.0);
+    const double span = high-low;
+    return low + (span * unit);
+}
+
+/**
+ * @param high Upper limit
+ * @param low Lower limit
+ * @return Random integer value, low + (int)((high - low) * rand() / (RAND_MAX + 1.0))
+ */
+inline int rand_int(int high = RAND_MAX, int low = 0)
+{
+    const double unit = rand() / (RAND_MAX + 1.0); // in [0, 1) as long as rand() stays within [0, RAND_MAX]
+    return low + (int) ( double(high-low) * unit);
+}
+
+/**
+ * Random number generator that returns a distinct number from
+ * the [0,n) interval each time.
+ */
+class UniqueRandom
+{
+    std::vector<int> vals_;  // shuffled permutation of 0..size_-1
+    int size_;               // n passed to the last init()
+    int counter_;            // position in vals_ of the next value to hand out
+
+public:
+    /**
+     * Constructor.
+     * @param n Size of the interval from which to generate
+     */
+    UniqueRandom(int n)
+    {
+        init(n);
+    }
+
+    /**
+     * Initializes the number generator.
+     * @param n the size of the interval from which to generate random numbers.
+     */
+    void init(int n)
+    {
+        // create and initialize an array of size n
+        vals_.resize(n);
+        size_ = n;
+        for (int i = 0; i < size_; ++i) vals_[i] = i;
+
+        // shuffle the elements in the array (std::random_shuffle no longer exists in C++17, so swap by hand)
+#ifndef OPENCV_FLANN_USE_STD_RAND
+        cv::randShuffle(vals_);
+#else
+        for (int i = size_ - 1; i > 0; --i) std::iter_swap(vals_.begin() + i, vals_.begin() + std::rand() % (i + 1));
+#endif
+
+        counter_ = 0;
+    }
+
+    /**
+     * Return a distinct random integer greater than or equal to 0 and less
+     * than 'n' on each call. After 'n' calls the sequence is exhausted.
+     * Returns: a random integer, or -1 once all 'n' values have been handed out
+     */
+    int next()
+    {
+        if (counter_ == size_) {
+            return -1;
+        }
+        else {
+            return vals_[counter_++];
+        }
+    }
+};
+
+}
+
+//! @endcond
+
+#endif //OPENCV_FLANN_RANDOM_H
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/result_set.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/result_set.h
new file mode 100644
index 000000000000..aa679df71c21
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/result_set.h
@@ -0,0 +1,548 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_RESULTSET_H
+#define OPENCV_FLANN_RESULTSET_H
+
+//! @cond IGNORED
+
+#include <algorithm>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <set>
+#include <vector>
+
+#include "opencv2/core/base.hpp"
+#include "opencv2/core/cvdef.h"
+
+namespace cvflann
+{
+
+/* This record represents a branch point when finding neighbors in
+    the tree.  It contains a record of the minimum distance to the query
+    point, as well as the node at which the search resumes.
+ */
+
+template <typename T, typename DistanceType>
+struct BranchStruct
+{
+    T node;           /* Tree node at which search resumes */
+    DistanceType mindist;     /* Minimum distance to query for all nodes below. */
+
+    BranchStruct() {} // members are default-initialized only (indeterminate for built-in types)
+    BranchStruct(const T& aNode, DistanceType dist) : node(aNode), mindist(dist) {}
+
+    bool operator<(const BranchStruct<T, DistanceType>& rhs) const // orders by mindist only; node is ignored
+    {
+        return mindist<rhs.mindist;
+    }
+};
+
+
+template <typename DistanceType>
+class ResultSet // abstract interface; NNIndex::findNeighbors() receives its result set through it
+{
+public:
+    virtual ~ResultSet() {}
+
+    virtual bool full() const = 0; // the KNN result sets in this file report count == capacity
+
+    virtual void addPoint(DistanceType dist, int index) = 0; // offers one candidate (distance, index) pair
+
+    virtual DistanceType worstDist() const = 0; // the KNN result sets in this file reject candidates at or beyond this distance
+
+};
+
+/**
+ * KNNSimpleResultSet does not ensure that the elements it holds are unique.
+ * It is used in those cases where the nearest neighbour algorithm used does not
+ * attempt to insert the same element multiple times.
+ */
+template <typename DistanceType>
+class KNNSimpleResultSet : public ResultSet<DistanceType>
+{
+    int* indices;                  // caller-owned output buffer of neighbour indices (bound by init())
+    DistanceType* dists;           // caller-owned output buffer of distances, kept in ascending order
+    int capacity;                  // maximum number of neighbours kept
+    int count;                     // number of valid entries currently stored
+    DistanceType worst_distance_;  // mirrors dists[capacity-1]; candidates at or beyond it are rejected
+
+public:
+    KNNSimpleResultSet(int capacity_) : indices(NULL), dists(NULL), capacity(capacity_), count(0), worst_distance_(0)
+    {
+    }
+
+    void init(int* indices_, DistanceType* dists_) // binds the output buffers (at least `capacity` entries each) and resets the set
+    {
+        indices = indices_;
+        dists = dists_;
+        count = 0;
+        worst_distance_ = (std::numeric_limits<DistanceType>::max)();
+        dists[capacity-1] = worst_distance_; // sentinel: keeps worst_distance_ at max until the last slot is filled
+    }
+
+    size_t size() const
+    {
+        return count;
+    }
+
+    bool full() const CV_OVERRIDE
+    {
+        return count == capacity;
+    }
+
+
+    void addPoint(DistanceType dist, int index) CV_OVERRIDE
+    {
+        if (dist >= worst_distance_) return;
+        int i;
+        for (i=count; i>0; --i) { // walk down from the end, shifting larger entries one slot up
+#ifdef FLANN_FIRST_MATCH
+            if ( (dists[i-1]>dist) || ((dist==dists[i-1])&&(indices[i-1]>index)) )
+#else
+            if (dists[i-1]>dist)
+#endif
+            {
+                if (i<capacity) { // when the set is full, the last entry has no slot to move into and is overwritten
+                    dists[i] = dists[i-1];
+                    indices[i] = indices[i-1];
+                }
+            }
+            else break;
+        }
+        if (count < capacity) ++count;
+        dists[i] = dist;
+        indices[i] = index;
+        worst_distance_ = dists[capacity-1];
+    }
+
+    DistanceType worstDist() const CV_OVERRIDE
+    {
+        return worst_distance_;
+    }
+};
+
+/**
+ * K-Nearest neighbour result set. Ensures that the elements inserted are unique
+ */
+template <typename DistanceType>
+class KNNResultSet : public ResultSet<DistanceType>
+{
+    int* indices;                  // caller-owned output buffer of neighbour indices (bound by init())
+    DistanceType* dists;           // caller-owned output buffer of distances, kept in ascending order
+    int capacity;                  // maximum number of neighbours kept
+    int count;                     // number of valid entries currently stored
+    DistanceType worst_distance_;  // mirrors dists[capacity-1]; candidates at or beyond it are rejected
+
+public:
+    KNNResultSet(int capacity_)
+        : indices(NULL), dists(NULL), capacity(capacity_), count(0), worst_distance_(0)
+    {
+    }
+
+    void init(int* indices_, DistanceType* dists_) // binds the output buffers (at least `capacity` entries each) and resets the set
+    {
+        indices = indices_;
+        dists = dists_;
+        count = 0;
+        worst_distance_ = (std::numeric_limits<DistanceType>::max)();
+        dists[capacity-1] = worst_distance_; // sentinel: keeps worst_distance_ at max until the last slot is filled
+    }
+
+    size_t size() const
+    {
+        return count;
+    }
+
+    bool full() const CV_OVERRIDE
+    {
+        return count == capacity;
+    }
+
+
+    void addPoint(DistanceType dist, int index) CV_OVERRIDE
+    {
+        CV_DbgAssert(indices);
+        CV_DbgAssert(dists);
+        if (dist >= worst_distance_) return;
+        int i;
+        for (i = count; i > 0; --i) { // find the insertion slot i: the first position after all entries <= dist
+#ifdef FLANN_FIRST_MATCH
+            if ( (dists[i-1]<=dist) && ((dist!=dists[i-1])||(indices[i-1]<=index)) )
+#else
+            if (dists[i-1]<=dist)
+#endif
+            {
+                // Check for duplicate indices among the stored entries below slot i that share this distance
+                for (int j = i - 1; j >= 0 && dists[j] == dist; --j) {
+                    if (indices[j] == index) {
+                        return;
+                    }
+                }
+                break;
+            }
+        }
+
+        if (count < capacity) ++count;
+        for (int j = count-1; j > i; --j) { // open slot i; when the set was full, the old last entry is dropped
+            dists[j] = dists[j-1];
+            indices[j] = indices[j-1];
+        }
+        dists[i] = dist;
+        indices[i] = index;
+        worst_distance_ = dists[capacity-1];
+    }
+
+    DistanceType worstDist() const CV_OVERRIDE
+    {
+        return worst_distance_;
+    }
+};
+
+
+/**
+ * A result-set class used when performing a radius based search; results are written to arrays passed to the constructor.
+ */
+template <typename DistanceType>
+class RadiusResultSet : public ResultSet<DistanceType>
+{
+    DistanceType radius;
+    int* indices;
+    DistanceType* dists;
+    size_t capacity;
+    size_t count;
+
+public:
+    RadiusResultSet(DistanceType radius_, int* indices_, DistanceType* dists_, int capacity_) : // capacity_ is converted from int to size_t
+        radius(radius_), indices(indices_), dists(dists_), capacity(capacity_)
+    {
+        init();
+    }
+
+    ~RadiusResultSet() // nothing is released here
+    {
+    }
+
+    void init() // resets the match counter; the bound arrays are left untouched
+    {
+        count = 0;
+    }
+
+    size_t size() const // number of points accepted so far; can exceed capacity (see addPoint)
+    {
+        return count;
+    }
+
+    bool full() const // always reports full
+    {
+        return true;
+    }
+
+    void addPoint(DistanceType dist, int index) // accepts points strictly closer than radius
+    {
+        if (dist<radius) {
+            if ((capacity>0)&&(count < capacity)) { // store only while there is room in the arrays
+                dists[count] = dist;
+                indices[count] = index;
+            }
+            count++; // counted even when not stored
+        }
+    }
+
+    DistanceType worstDist() const // the search radius passed to the constructor
+    {
+        return radius;
+    }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/** Base class for result sets that keep their candidates in a std::set of (distance, index) pairs.
+ * The set is ordered by distance and then index, so an identical pair is stored only once
+ */
+template<typename DistanceType>
+class UniqueResultSet : public ResultSet<DistanceType>
+{
+public:
+    struct DistIndex
+    {
+        DistIndex(DistanceType dist, unsigned int index) :
+            dist_(dist), index_(index)
+        {
+        }
+        bool operator<(const DistIndex dist_index) const // orders by distance first; the index breaks ties
+        {
+            return (dist_ < dist_index.dist_) || ((dist_ == dist_index.dist_) && index_ < dist_index.index_);
+        }
+        DistanceType dist_;
+        unsigned int index_;
+    };
+
+    /** Default constructor: not full, worst distance set to the largest DistanceType value */
+    UniqueResultSet() :
+        is_full_(false), worst_distance_(std::numeric_limits<DistanceType>::max())
+    {
+    }
+
+    /** Check the status of the set
+     * @return the is_full_ flag
+     */
+    inline bool full() const CV_OVERRIDE
+    {
+        return is_full_;
+    }
+
+    /** Remove all elements in the set
+     */
+    virtual void clear() = 0;
+
+    /** Copy the set to two C arrays
+     * @param indices pointer to a C array of indices
+     * @param dist pointer to a C array of distances
+     * @param n_neighbors the maximum number of neighbors to copy; a negative value copies the whole set
+     */
+    virtual void copy(int* indices, DistanceType* dist, int n_neighbors = -1) const
+    {
+        if (n_neighbors < 0) {
+            for (typename std::set<DistIndex>::const_iterator dist_index = dist_indices_.begin(), dist_index_end =
+                     dist_indices_.end(); dist_index != dist_index_end; ++dist_index, ++indices, ++dist) {
+                *indices = dist_index->index_;
+                *dist = dist_index->dist_;
+            }
+        }
+        else {
+            int i = 0;
+            for (typename std::set<DistIndex>::const_iterator dist_index = dist_indices_.begin(), dist_index_end =
+                     dist_indices_.end(); (dist_index != dist_index_end) && (i < n_neighbors); ++dist_index, ++indices, ++dist, ++i) {
+                *indices = dist_index->index_;
+                *dist = dist_index->dist_;
+            }
+        }
+    }
+
+    /** Copy the set to two C arrays but sort it according to the distance first
+     * @param indices pointer to a C array of indices
+     * @param dist pointer to a C array of distances
+     * @param n_neighbors the maximum number of neighbors to copy; a negative value copies the whole set
+     */
+    virtual void sortAndCopy(int* indices, DistanceType* dist, int n_neighbors = -1) const
+    {
+        copy(indices, dist, n_neighbors); // the set already iterates in ascending distance order
+    }
+
+    /** The number of neighbors in the set
+     */
+    size_t size() const
+    {
+        return dist_indices_.size();
+    }
+
+    /** The distance of the furthest neighbor
+     * Returns worst_distance_, which the constructor sets to the max possible value
+     */
+    inline DistanceType worstDist() const CV_OVERRIDE
+    {
+        return worst_distance_;
+    }
+protected:
+    /** Flag to say if the set is full */
+    bool is_full_;
+
+    /** The worst distance found so far */
+    DistanceType worst_distance_;
+
+    /** The best candidates so far */
+    std::set<DistIndex> dist_indices_;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/** Class that holds the k NN neighbors
+ * Keeps at most capacity_ (distance, index) pairs in the std::set inherited from UniqueResultSet
+ */
+template<typename DistanceType>
+class KNNUniqueResultSet : public UniqueResultSet<DistanceType>
+{
+public:
+    /** Constructor
+     * @param capacity the number of neighbors to store at max
+     */
+    KNNUniqueResultSet(unsigned int capacity) : capacity_(capacity)
+    {
+        this->is_full_ = false;
+        this->clear();
+    }
+
+    /** Add a possible candidate to the best neighbors
+     * @param dist distance for that neighbor
+     * @param index index of that neighbor
+     */
+    inline void addPoint(DistanceType dist, int index) CV_OVERRIDE
+    {
+        // Don't do anything if we are worse than the worst
+        if (dist >= worst_distance_) return;
+        dist_indices_.insert(DistIndex(dist, index)); // index is converted from int to unsigned int here
+
+        if (is_full_) {
+            if (dist_indices_.size() > capacity_) { // one too many: drop the current farthest entry
+                dist_indices_.erase(*dist_indices_.rbegin());
+                worst_distance_ = dist_indices_.rbegin()->dist_;
+            }
+        }
+        else if (dist_indices_.size() == capacity_) { // just reached capacity: start tracking the worst distance
+            is_full_ = true;
+            worst_distance_ = dist_indices_.rbegin()->dist_;
+        }
+    }
+
+    /** Remove all elements in the set and reset the worst distance to the max possible value
+     */
+    void clear() CV_OVERRIDE
+    {
+        dist_indices_.clear();
+        worst_distance_ = std::numeric_limits<DistanceType>::max();
+        is_full_ = false;
+    }
+
+protected:
+    typedef typename UniqueResultSet<DistanceType>::DistIndex DistIndex;
+    using UniqueResultSet<DistanceType>::is_full_;
+    using UniqueResultSet<DistanceType>::worst_distance_;
+    using UniqueResultSet<DistanceType>::dist_indices_;
+
+    /** The number of neighbors to keep */
+    unsigned int capacity_;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/** Class that holds the radius nearest neighbors
+ * Unlike RadiusResultSet it stores into a std::set, so it is not limited in the number of neighbors
+ */
+template<typename DistanceType>
+class RadiusUniqueResultSet : public UniqueResultSet<DistanceType>
+{
+public:
+    /** Constructor
+     * @param radius the maximum distance of a neighbor
+     */
+    RadiusUniqueResultSet(DistanceType radius) :
+        radius_(radius)
+    {
+        is_full_ = true;
+    }
+
+    /** Add a possible candidate to the best neighbors (accepted when dist <= radius)
+     * @param dist distance for that neighbor
+     * @param index index of that neighbor
+     */
+    void addPoint(DistanceType dist, int index) CV_OVERRIDE
+    {
+        if (dist <= radius_) dist_indices_.insert(DistIndex(dist, index));
+    }
+
+    /** Remove all elements in the set
+     */
+    inline void clear() CV_OVERRIDE
+    {
+        dist_indices_.clear();
+    }
+
+
+    /** Check the status of the set
+     * @return always true
+     */
+    inline bool full() const CV_OVERRIDE
+    {
+        return true;
+    }
+
+    /** The distance of the furthest neighbor
+     * Always returns the radius given to the constructor
+     */
+    inline DistanceType worstDist() const CV_OVERRIDE
+    {
+        return radius_;
+    }
+private:
+    typedef typename UniqueResultSet<DistanceType>::DistIndex DistIndex;
+    using UniqueResultSet<DistanceType>::dist_indices_;
+    using UniqueResultSet<DistanceType>::is_full_;
+
+    /** The furthest distance a neighbor can be */
+    DistanceType radius_;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/** Class that holds the k NN neighbors within a radius distance
+ */
+template<typename DistanceType>
+class KNNRadiusUniqueResultSet : public KNNUniqueResultSet<DistanceType>
+{
+public:
+    /** Constructor
+     * @param capacity the number of neighbors to store at max
+     * @param radius the maximum distance of a neighbor
+     */
+    KNNRadiusUniqueResultSet(unsigned int capacity, DistanceType radius)
+        : KNNUniqueResultSet<DistanceType>(capacity) // the base class has no default constructor
+    {
+        this->capacity_ = capacity;
+        this->radius_ = radius;
+        this->clear(); // std::set has no reserve(), so there is nothing to pre-allocate
+    }
+
+    /** Remove all elements in the set and reset the worst distance to the radius
+     */
+    void clear()
+    {
+        dist_indices_.clear();
+        worst_distance_ = radius_;
+        is_full_ = false;
+    }
+private:
+    using KNNUniqueResultSet<DistanceType>::dist_indices_;
+    using KNNUniqueResultSet<DistanceType>::is_full_;
+    using KNNUniqueResultSet<DistanceType>::worst_distance_;
+
+    /** The maximum number of neighbors to consider (hides the base-class member of the same name) */
+    unsigned int capacity_;
+
+    /** The maximum distance of a neighbor */
+    DistanceType radius_;
+};
+}
+
+//! @endcond
+
+#endif //OPENCV_FLANN_RESULTSET_H
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/sampling.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/sampling.h
new file mode 100644
index 000000000000..4e452b9bba20
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/sampling.h
@@ -0,0 +1,84 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+
+#ifndef OPENCV_FLANN_SAMPLING_H_
+#define OPENCV_FLANN_SAMPLING_H_
+
+//! @cond IGNORED
+
+#include "matrix.h"
+#include "random.h"
+
+namespace cvflann
+{
+
+template<typename T>
+Matrix<T> random_sample(Matrix<T>& srcMatrix, long size, bool remove = false) // draws size rows; with remove, also takes them out of srcMatrix
+{
+    Matrix<T> newSet(new T[size * srcMatrix.cols], size,srcMatrix.cols); // NOTE(review): buffer comes from new[]; who frees it is not visible here
+
+    T* src,* dest;
+    for (long i=0; i<size; ++i) {
+        long r = rand_int((int)(srcMatrix.rows-i)); // pick among the rows not yet consumed when remove is set
+        dest = newSet[i];
+        src = srcMatrix[r];
+        std::copy(src, src+srcMatrix.cols, dest);
+        if (remove) { // overwrite the picked row with the last live row
+            src = srcMatrix[srcMatrix.rows-i-1];
+            dest = srcMatrix[r];
+            if (src != dest) std::copy(src, src+srcMatrix.cols, dest); // std::copy must not be given dest == src
+        }
+    }
+    if (remove) {
+        srcMatrix.rows -= size; // the sampled rows now lie past the new end
+    }
+    return newSet;
+}
+
+template<typename T>
+Matrix<T> random_sample(const Matrix<T>& srcMatrix, size_t size) // copies size rows chosen through UniqueRandom; srcMatrix is not modified
+{
+    UniqueRandom rand((int)srcMatrix.rows);
+    Matrix<T> newSet(new T[size * srcMatrix.cols], size,srcMatrix.cols); // NOTE(review): buffer comes from new[]; who frees it is not visible here
+
+    T* src,* dest;
+    for (size_t i=0; i<size; ++i) {
+        long r = rand.next(); // NOTE(review): presumably a not-yet-returned row index - confirm behaviour when size > rows
+        dest = newSet[i];
+        src = srcMatrix[r];
+        std::copy(src, src+srcMatrix.cols, dest);
+    }
+    return newSet;
+}
+
+} // namespace
+
+//! @endcond
+
+#endif /* OPENCV_FLANN_SAMPLING_H_ */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/saving.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/saving.h
new file mode 100644
index 000000000000..8b3aeb7f0a89
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/saving.h
@@ -0,0 +1,191 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_SAVING_H_
+#define OPENCV_FLANN_SAVING_H_
+
+//! @cond IGNORED
+
+#include <cstring>
+#include <vector>
+
+#include "general.h"
+#include "nn_index.h"
+
+#ifdef FLANN_SIGNATURE_
+#undef FLANN_SIGNATURE_
+#endif
+#define FLANN_SIGNATURE_ "FLANN_INDEX"
+
+namespace cvflann
+{
+
+template <typename T>
+struct Datatype {}; // primary template has no type(): only the element types specialised below are supported
+template<>
+struct Datatype<char> { static flann_datatype_t type() { return FLANN_INT8; } };
+template<>
+struct Datatype<short> { static flann_datatype_t type() { return FLANN_INT16; } };
+template<>
+struct Datatype<int> { static flann_datatype_t type() { return FLANN_INT32; } };
+template<>
+struct Datatype<unsigned char> { static flann_datatype_t type() { return FLANN_UINT8; } };
+template<>
+struct Datatype<unsigned short> { static flann_datatype_t type() { return FLANN_UINT16; } };
+template<>
+struct Datatype<unsigned int> { static flann_datatype_t type() { return FLANN_UINT32; } };
+template<>
+struct Datatype<float> { static flann_datatype_t type() { return FLANN_FLOAT32; } };
+template<>
+struct Datatype<double> { static flann_datatype_t type() { return FLANN_FLOAT64; } };
+
+
+/**
+ * Structure representing the index header; save_header()/load_header() write and read it as one raw block.
+ */
+struct IndexHeader
+{
+    char signature[16]; // filled from FLANN_SIGNATURE_ by save_header(), checked by load_header()
+    char version[16]; // filled from FLANN_VERSION_ by save_header()
+    flann_datatype_t data_type; // element type tag, from Datatype<T>::type()
+    flann_algorithm_t index_type; // from index.getType()
+    size_t rows; // from index.size()
+    size_t cols; // from index.veclen()
+};
+
+/**
+ * Saves index header to stream
+ *
+ * @param stream - Stream to save to
+ * @param index - The index to save
+ */
+template<typename Distance>
+void save_header(FILE* stream, const NNIndex<Distance>& index)
+{
+    IndexHeader header;
+    memset(header.signature, 0, sizeof(header.signature)); // zero-fill so the bytes after the string are defined
+    strcpy(header.signature, FLANN_SIGNATURE_);
+    memset(header.version, 0, sizeof(header.version));
+    strcpy(header.version, FLANN_VERSION_); // NOTE(review): assumes FLANN_VERSION_ fits in 16 bytes including the terminator
+    header.data_type = Datatype<typename Distance::ElementType>::type();
+    header.index_type = index.getType();
+    header.rows = index.size();
+    header.cols = index.veclen();
+
+    std::fwrite(&header, sizeof(header),1,stream); // raw struct image; the return value is not checked
+}
+
+
+/**
+ * Loads an index header from stream and checks its signature against FLANN_SIGNATURE_
+ * @param stream - Stream to load from
+ * @return Index header
+ */
+inline IndexHeader load_header(FILE* stream)
+{
+    IndexHeader header;
+    size_t read_size = fread(&header,sizeof(header),1,stream); // reads the raw struct image written by save_header()
+
+    if (read_size!=(size_t)1) {
+        FLANN_THROW(cv::Error::StsError, "Invalid index file, cannot read");
+    }
+
+    if (strcmp(header.signature,FLANN_SIGNATURE_)!=0) { // the version field is not checked here
+        FLANN_THROW(cv::Error::StsError, "Invalid index file, wrong signature");
+    }
+
+    return header;
+
+}
+
+
+template<typename T>
+void save_value(FILE* stream, const T& value, size_t count = 1) // writes count objects of sizeof(T) bytes starting at &value
+{
+    fwrite(&value, sizeof(value),count, stream); // for count > 1, value must be the first of count contiguous objects
+}
+
+template<typename T>
+void save_value(FILE* stream, const cvflann::Matrix<T>& value) // writes the Matrix object itself, then its elements
+{
+    fwrite(&value, sizeof(value),1, stream); // raw object image, data pointer included
+    fwrite(value.data, sizeof(T),value.rows*value.cols, stream); // rows*cols elements read from value.data
+}
+
+template<typename T>
+void save_value(FILE* stream, const std::vector<T>& value) // writes the element count, then the elements
+{
+    size_t size = value.size();
+    fwrite(&size, sizeof(size_t), 1, stream);
+    if (size > 0) fwrite(value.data(), sizeof(T), size, stream); // value[0] on an empty vector is undefined behaviour
+}
+
+template<typename T>
+void load_value(FILE* stream, T& value, size_t count = 1) // reads count objects of sizeof(T) bytes into memory starting at &value
+{
+    size_t read_cnt = fread(&value, sizeof(value), count, stream); // for count > 1, value must be the first of count contiguous objects
+    if (read_cnt != count) {
+        FLANN_THROW(cv::Error::StsParseError, "Cannot read from file");
+    }
+}
+
+template<typename T>
+void load_value(FILE* stream, cvflann::Matrix<T>& value) // reads a Matrix object image, then rows*cols elements into a new buffer
+{
+    size_t read_cnt = fread(&value, sizeof(value), 1, stream); // raw object image; its data pointer is replaced below
+    if (read_cnt != 1) {
+        FLANN_THROW(cv::Error::StsParseError, "Cannot read from file");
+    }
+    value.data = new T[value.rows*value.cols]; // NOTE(review): not freed here if the read below fails - confirm who owns it
+    read_cnt = fread(value.data, sizeof(T), value.rows*value.cols, stream);
+    if (read_cnt != (size_t)(value.rows*value.cols)) {
+        FLANN_THROW(cv::Error::StsParseError, "Cannot read from file");
+    }
+}
+
+
+template<typename T>
+void load_value(FILE* stream, std::vector<T>& value) // reads an element count, then that many elements into value
+{
+    size_t size;
+    size_t read_cnt = fread(&size, sizeof(size_t), 1, stream);
+    if (read_cnt!=1) {
+        FLANN_THROW(cv::Error::StsError, "Cannot read from file");
+    }
+    value.resize(size); // the count comes straight from the file and is not range-checked
+    read_cnt = (size > 0) ? fread(value.data(), sizeof(T), size, stream) : 0; // value[0] on an empty vector is undefined behaviour
+    if (read_cnt != size) {
+        FLANN_THROW(cv::Error::StsError, "Cannot read from file");
+    }
+}
+
+}
+
+//! @endcond
+
+#endif /* OPENCV_FLANN_SAVING_H_ */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/simplex_downhill.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/simplex_downhill.h
new file mode 100644
index 000000000000..02970148b205
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/simplex_downhill.h
@@ -0,0 +1,190 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_SIMPLEX_DOWNHILL_H_
+#define OPENCV_FLANN_SIMPLEX_DOWNHILL_H_
+
+//! @cond IGNORED
+
+namespace cvflann
+{
+
+/**
+    Stores val in vals[pos] and point (n values) in row pos of points, then moves that entry towards index 0 until vals[0..pos] is ascending.
+ */
+template <typename T>
+void addValue(int pos, float val, float* vals, T* point, T* points, int n)
+{
+    vals[pos] = val;
+    for (int i=0; i<n; ++i) {
+        points[pos*n+i] = point[i]; // point may alias row pos of points; that is then a self-assignment
+    }
+
+    // bubble down
+    int j=pos;
+    while (j>0 && vals[j]<vals[j-1]) { // swap with the predecessor while it holds a larger value
+        swap(vals[j],vals[j-1]); // NOTE(review): unqualified swap - a declaration must already be visible; this header includes nothing
+        for (int i=0; i<n; ++i) {
+            swap(points[j*n+i],points[(j-1)*n+i]); // keep the point rows in step with vals
+        }
+        --j;
+    }
+}
+
+
+/**
+    Simplex downhill optimization function.
+    Preconditions: points is a 2D matrix of size (n+1) x n, stored row by row
+                    func is the cost function taking an array of n params and returning float
+                    vals holds the cost at the n+1 simplex points in ascending order; if NULL it is computed here
+
+    Postcondition: returns the lowest cost found; the first row of points (n values) holds the matching parameters
+ */
+template <typename T, typename F>
+float optimizeSimplexDownhill(T* points, int n, F func, float* vals = NULL )
+{
+    const int MAX_ITERATIONS = 10;
+
+    CV_DbgAssert(n>0);
+
+    T* p_o = new T[n]; // centroid of the n best points
+    T* p_r = new T[n]; // reflected point
+    T* p_e = new T[n]; // expanded or contracted point
+
+    int alpha = 1;
+
+    int iterations = 0;
+
+    bool ownVals = false;
+    if (vals == NULL) {
+        ownVals = true;
+        vals = new float[n+1];
+        for (int i=0; i<n+1; ++i) {
+            float val = func(points+i*n);
+            addValue(i, val, vals, points+i*n, points, n); // sorts the rows by cost as they are evaluated
+        }
+    }
+    int nn = n*n; // offset of the last (highest-cost) row
+
+    while (true) {
+
+        if (iterations++ > MAX_ITERATIONS) break;
+
+        // compute average of simplex points (except the highest point); clear every sum before accumulating
+        for (int i=0; i<n; ++i) p_o[i] = 0;
+        for (int j=0; j<n; ++j) {
+            for (int i=0; i<n; ++i) {
+                p_o[i] += points[j*n+i];
+            }
+        }
+        for (int i=0; i<n; ++i) {
+            p_o[i] /= n;
+        }
+
+        bool converged = true;
+        for (int i=0; i<n; ++i) {
+            if (p_o[i] != points[nn+i]) {
+                converged = false;
+            }
+        }
+        if (converged) break;
+
+        // trying a reflection
+        for (int i=0; i<n; ++i) {
+            p_r[i] = p_o[i] + alpha*(p_o[i]-points[nn+i]);
+        }
+        float val_r = func(p_r);
+
+        if ((val_r>=vals[0])&&(val_r<vals[n])) {
+            // reflection is at least the lowest value and below the highest
+            // add it to the simplex
+            Logger::info("Choosing reflection\n");
+            addValue(n, val_r,vals, p_r, points, n);
+            continue;
+        }
+
+        if (val_r<vals[0]) {
+            // value is smaller than smallest in simplex
+
+            // expand some more to see if it drops further
+            for (int i=0; i<n; ++i) {
+                p_e[i] = 2*p_r[i]-p_o[i];
+            }
+            float val_e = func(p_e);
+
+            if (val_e<val_r) {
+                Logger::info("Choosing reflection and expansion\n");
+                addValue(n, val_e,vals,p_e,points,n);
+            }
+            else {
+                Logger::info("Choosing reflection\n");
+                addValue(n, val_r,vals,p_r,points,n);
+            }
+            continue;
+        }
+        if (val_r>=vals[n]) {
+            for (int i=0; i<n; ++i) {
+                p_e[i] = (p_o[i]+points[nn+i])/2;
+            }
+            float val_e = func(p_e);
+
+            if (val_e<vals[n]) {
+                Logger::info("Choosing contraction\n");
+                addValue(n,val_e,vals,p_e,points,n);
+                continue;
+            }
+        }
+        {
+            Logger::info("Full contraction\n");
+            for (int j=1; j<=n; ++j) {
+                for (int i=0; i<n; ++i) {
+                    points[j*n+i] = (points[j*n+i]+points[i])/2;
+                }
+                float val = func(points+j*n);
+                addValue(j,val,vals,points+j*n,points,n);
+            }
+        }
+    }
+
+    float bestVal = vals[0];
+
+    delete[] p_r;
+    delete[] p_o;
+    delete[] p_e;
+    if (ownVals) delete[] vals;
+
+    return bestVal;
+}
+
+}
+
+//! @endcond
+
+#endif //OPENCV_FLANN_SIMPLEX_DOWNHILL_H_
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/flann/timer.h b/3rdparty/opencv/opencv410/build/include/opencv2/flann/timer.h
new file mode 100644
index 000000000000..7dc50a4777ac
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/flann/timer.h
@@ -0,0 +1,99 @@
+/***********************************************************************
+ * Software License Agreement (BSD License)
+ *
+ * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
+ * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *************************************************************************/
+
+#ifndef OPENCV_FLANN_TIMER_H
+#define OPENCV_FLANN_TIMER_H
+
+//! @cond IGNORED
+
+#include <time.h>
+#include "opencv2/core.hpp"
+#include "opencv2/core/utility.hpp"
+
+namespace cvflann
+{
+
+/**
+ * A start-stop timer class.
+ *
+ * Can be used to time portions of code; value accumulates over start()/stop() pairs until reset().
+ */
+class StartStopTimer
+{
+    int64 startTime;
+
+public:
+    /**
+     * Value of the timer: stop-minus-start tick differences divided by cv::getTickFrequency(), summed.
+     */
+    double value;
+
+
+    /**
+     * Constructor.
+     */
+    StartStopTimer()
+        : startTime(0)
+    {
+        reset();
+    }
+
+    /**
+     * Starts the timer.
+     */
+    void start()
+    {
+        startTime = cv::getTickCount();
+    }
+
+    /**
+     * Stops the timer and updates timer value.
+     */
+    void stop()
+    {
+        int64 stopTime = cv::getTickCount();
+        value += (double)(stopTime - startTime) / cv::getTickFrequency(); // subtract as int64 first so large tick counts keep full precision
+    }
+
+    /**
+     * Resets the timer value to 0.
+     */
+    void reset()
+    {
+        value = 0;
+    }
+
+};
+
+}
+
+//! @endcond
+
+#endif // OPENCV_FLANN_TIMER_H
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi.hpp
new file mode 100644
index 000000000000..2087641023c8
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi.hpp
@@ -0,0 +1,42 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2021 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_HPP
+#define OPENCV_GAPI_HPP
+
+#include <memory>
+
+/** \defgroup gapi_ref G-API framework
+@{
+    @defgroup gapi_main_classes G-API Main Classes
+    @defgroup gapi_data_objects G-API Data Types
+    @{
+      @defgroup gapi_meta_args G-API Metadata Descriptors
+    @}
+    @defgroup gapi_std_backends G-API Standard Backends
+    @defgroup gapi_compile_args G-API Graph Compilation Arguments
+    @defgroup gapi_serialization G-API Serialization functionality
+@}
+ */
+
+#include <opencv2/gapi/gmat.hpp>
+#include <opencv2/gapi/garray.hpp>
+#include <opencv2/gapi/gscalar.hpp>
+#include <opencv2/gapi/gopaque.hpp>
+#include <opencv2/gapi/gframe.hpp>
+#include <opencv2/gapi/gcomputation.hpp>
+#include <opencv2/gapi/gcompiled.hpp>
+#include <opencv2/gapi/gtyped.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+#include <opencv2/gapi/operators.hpp>
+
+// Include these files here to avoid cyclic dependency between
+// Desync & GKernel & GComputation & GStreamingCompiled.
+#include <opencv2/gapi/streaming/desync.hpp>
+#include <opencv2/gapi/streaming/format.hpp>
+
+#endif // OPENCV_GAPI_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/core.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/core.hpp
new file mode 100644
index 000000000000..60bb2c507458
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/core.hpp
@@ -0,0 +1,1911 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2020 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_CORE_HPP
+#define OPENCV_GAPI_CORE_HPP
+
+#include <math.h>
+#include <utility> // std::tuple
+
+#include <opencv2/imgproc.hpp>
+#include <opencv2/gapi/imgproc.hpp>
+
+#include <opencv2/gapi/gmat.hpp>
+#include <opencv2/gapi/gscalar.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+#include <opencv2/gapi/streaming/format.hpp>
+
+/** \defgroup gapi_core G-API Core functionality
+@{
+    @defgroup gapi_math Graph API: Math operations
+    @defgroup gapi_pixelwise Graph API: Pixelwise operations
+    @defgroup gapi_matrixop Graph API: Operations on matrices
+    @defgroup gapi_transform Graph API: Image and channel composition functions
+@}
+ */
+
+namespace cv { namespace gapi {
+/**
+ * @brief This namespace contains G-API Operation Types for OpenCV
+ * Core module functionality.
+ */
+namespace core {
+    using GResize = cv::gapi::imgproc::GResize;
+    using GResizeP = cv::gapi::imgproc::GResizeP;
+
+    using GMat2 = std::tuple<GMat,GMat>;
+    using GMat3 = std::tuple<GMat,GMat,GMat>; // FIXME: how to avoid this?
+    using GMat4 = std::tuple<GMat,GMat,GMat,GMat>;
+    using GMatScalar  = std::tuple<GMat, GScalar>;
+
+    G_TYPED_KERNEL(GAdd, <GMat(GMat, GMat, int)>, "org.opencv.core.math.add") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc b, int ddepth) {
+            if (ddepth == -1)
+            {
+                // OpenCV: When the input arrays in add/subtract/multiply/divide
+                // functions have different depths, the output array depth must be
+                // explicitly specified!
+                // See arithm_op() @ arithm.cpp
+                GAPI_Assert(a.chan == b.chan);
+                GAPI_Assert(a.depth == b.depth);
+                return a;
+            }
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GAddC, <GMat(GMat, GScalar, int)>, "org.opencv.core.math.addC") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc, int ddepth) {
+            GAPI_Assert(a.chan <= 4);
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GSub, <GMat(GMat, GMat, int)>, "org.opencv.core.math.sub") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc b, int ddepth) {
+            if (ddepth == -1)
+            {
+                // Select the larger data depth of a and b, given that both
+                // have the same number of channels (asserted below)
+                // FIXME!!! Clarify if it is valid for sub()
+                GAPI_Assert(a.chan == b.chan);
+                ddepth = std::max(a.depth, b.depth);
+            }
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GSubC, <GMat(GMat, GScalar, int)>, "org.opencv.core.math.subC") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc, int ddepth) {
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GSubRC,<GMat(GScalar, GMat, int)>, "org.opencv.core.math.subRC") {
+        static GMatDesc outMeta(GScalarDesc, GMatDesc b, int ddepth) {
+            return b.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GMul, <GMat(GMat, GMat, double, int)>, "org.opencv.core.math.mul") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc, double, int ddepth) {
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GMulCOld, <GMat(GMat, double, int)>, "org.opencv.core.math.mulCOld") {
+        static GMatDesc outMeta(GMatDesc a, double, int ddepth) {
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GMulC, <GMat(GMat, GScalar, int)>, "org.opencv.core.math.mulC") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc, int ddepth) {
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GMulS, <GMat(GMat, GScalar)>, "org.opencv.core.math.muls") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a;
+        }
+    }; // FIXME: Merge with MulC
+
+    G_TYPED_KERNEL(GDiv, <GMat(GMat, GMat, double, int)>, "org.opencv.core.math.div") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc b, double, int ddepth) {
+            if (ddepth == -1)
+            {
+                GAPI_Assert(a.depth == b.depth);
+                return b;
+            }
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GDivC, <GMat(GMat, GScalar, double, int)>, "org.opencv.core.math.divC") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc, double, int ddepth) {
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GDivRC, <GMat(GScalar, GMat, double, int)>, "org.opencv.core.math.divRC") {
+        static GMatDesc outMeta(GScalarDesc, GMatDesc b, double, int ddepth) {
+            return b.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GMean, <GScalar(GMat)>, "org.opencv.core.math.mean") {
+        static GScalarDesc outMeta(GMatDesc) {
+            return empty_scalar_desc();
+        }
+    };
+
+    G_TYPED_KERNEL_M(GPolarToCart, <GMat2(GMat, GMat, bool)>, "org.opencv.core.math.polarToCart") {
+        static std::tuple<GMatDesc, GMatDesc> outMeta(GMatDesc, GMatDesc a, bool) {
+            return std::make_tuple(a, a);
+        }
+    };
+
+    G_TYPED_KERNEL_M(GCartToPolar, <GMat2(GMat, GMat, bool)>, "org.opencv.core.math.cartToPolar") {
+        static std::tuple<GMatDesc, GMatDesc> outMeta(GMatDesc x, GMatDesc, bool) {
+            return std::make_tuple(x, x);
+        }
+    };
+
+    G_TYPED_KERNEL(GPhase, <GMat(GMat, GMat, bool)>, "org.opencv.core.math.phase") {
+        static GMatDesc outMeta(const GMatDesc &inx, const GMatDesc &, bool) {
+            return inx;
+        }
+    };
+
+    G_TYPED_KERNEL(GMask, <GMat(GMat,GMat)>, "org.opencv.core.pixelwise.mask") {
+        static GMatDesc outMeta(GMatDesc in, GMatDesc) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpGT, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.compare.cmpGT") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpGE, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.compare.cmpGE") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpLE, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.compare.cmpLE") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpLT, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.compare.cmpLT") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpEQ, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.compare.cmpEQ") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpNE, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.compare.cmpNE") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpGTScalar, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.compare.cmpGTScalar") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpGEScalar, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.compare.cmpGEScalar") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpLEScalar, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.compare.cmpLEScalar") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpLTScalar, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.compare.cmpLTScalar") {
+    static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpEQScalar, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.compare.cmpEQScalar") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpNEScalar, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.compare.cmpNEScalar") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GAnd, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.bitwise_and") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GAndS, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.bitwise_andS") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GOr, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.bitwise_or") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GOrS, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.bitwise_orS") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GXor, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.bitwise_xor") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GXorS, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.bitwise_xorS") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GNot, <GMat(GMat)>, "org.opencv.core.pixelwise.bitwise_not") {
+        static GMatDesc outMeta(GMatDesc a) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GSelect, <GMat(GMat, GMat, GMat)>, "org.opencv.core.pixelwise.select") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc, GMatDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GMin, <GMat(GMat, GMat)>, "org.opencv.core.matrixop.min") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GMax, <GMat(GMat, GMat)>, "org.opencv.core.matrixop.max") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GAbsDiff, <GMat(GMat, GMat)>, "org.opencv.core.matrixop.absdiff") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GAbsDiffC, <GMat(GMat,GScalar)>, "org.opencv.core.matrixop.absdiffC") {
+        static GMatDesc outMeta(const GMatDesc& a, const GScalarDesc&) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GSum, <GScalar(GMat)>, "org.opencv.core.matrixop.sum") {
+        static GScalarDesc outMeta(GMatDesc) {
+            return empty_scalar_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GCountNonZero, <GOpaque<int>(GMat)>, "org.opencv.core.matrixop.countNonZero") {
+        static GOpaqueDesc outMeta(GMatDesc in) {
+            GAPI_Assert(in.chan == 1);
+            return empty_gopaque_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GAddW, <GMat(GMat, double, GMat, double, double, int)>, "org.opencv.core.matrixop.addweighted") {
+        static GMatDesc outMeta(GMatDesc a, double, GMatDesc b, double, double, int ddepth) {
+            if (ddepth == -1)
+            {
+                // OpenCV: When the input arrays in add/subtract/multiply/divide
+                // functions have different depths, the output array depth must be
+                // explicitly specified!
+                // See arithm_op() @ arithm.cpp
+                GAPI_Assert(a.chan == b.chan);
+                GAPI_Assert(a.depth == b.depth);
+                return a;
+            }
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GNormL1, <GScalar(GMat)>, "org.opencv.core.matrixop.norml1") {
+        static GScalarDesc outMeta(GMatDesc) {
+            return empty_scalar_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GNormL2, <GScalar(GMat)>, "org.opencv.core.matrixop.norml2") {
+        static GScalarDesc outMeta(GMatDesc) {
+            return empty_scalar_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GNormInf, <GScalar(GMat)>, "org.opencv.core.matrixop.norminf") {
+        static GScalarDesc outMeta(GMatDesc) {
+            return empty_scalar_desc();
+        }
+    };
+
+    G_TYPED_KERNEL_M(GIntegral, <GMat2(GMat, int, int)>, "org.opencv.core.matrixop.integral") {
+        static std::tuple<GMatDesc, GMatDesc> outMeta(GMatDesc in, int sd, int sqd) {
+            return std::make_tuple(in.withSizeDelta(1,1).withDepth(sd),
+                                   in.withSizeDelta(1,1).withDepth(sqd));
+        }
+    };
+
+    G_TYPED_KERNEL(GThreshold, <GMat(GMat, GScalar, GScalar, int)>, "org.opencv.core.matrixop.threshold") {
+        static GMatDesc outMeta(GMatDesc in, GScalarDesc, GScalarDesc, int) {
+            return in;
+        }
+    };
+
+
+    G_TYPED_KERNEL_M(GThresholdOT, <GMatScalar(GMat, GScalar, int)>, "org.opencv.core.matrixop.thresholdOT") {
+        static std::tuple<GMatDesc,GScalarDesc> outMeta(GMatDesc in, GScalarDesc, int) {
+            return std::make_tuple(in, empty_scalar_desc());
+        }
+    };
+
+    G_TYPED_KERNEL(GInRange, <GMat(GMat, GScalar, GScalar)>, "org.opencv.core.matrixop.inrange") {
+        static GMatDesc outMeta(GMatDesc in, GScalarDesc, GScalarDesc) {
+            return in.withType(CV_8U, 1);
+        }
+    };
+
+    G_TYPED_KERNEL_M(GSplit3, <GMat3(GMat)>, "org.opencv.core.transform.split3") {
+        static std::tuple<GMatDesc, GMatDesc, GMatDesc> outMeta(GMatDesc in) {
+            const auto out_depth = in.depth;
+            const auto out_desc  = in.withType(out_depth, 1);
+            return std::make_tuple(out_desc, out_desc, out_desc);
+        }
+    };
+
+    G_TYPED_KERNEL_M(GSplit4, <GMat4(GMat)>,"org.opencv.core.transform.split4") {
+        static std::tuple<GMatDesc, GMatDesc, GMatDesc, GMatDesc> outMeta(GMatDesc in) {
+            const auto out_depth = in.depth;
+            const auto out_desc = in.withType(out_depth, 1);
+            return std::make_tuple(out_desc, out_desc, out_desc, out_desc);
+        }
+    };
+
+    G_TYPED_KERNEL(GMerge3, <GMat(GMat,GMat,GMat)>, "org.opencv.core.transform.merge3") {
+        static GMatDesc outMeta(GMatDesc in, GMatDesc, GMatDesc) {
+            // Preserve depth and add channel component
+            return in.withType(in.depth, 3);
+        }
+    };
+
+    G_TYPED_KERNEL(GMerge4, <GMat(GMat,GMat,GMat,GMat)>, "org.opencv.core.transform.merge4") {
+        static GMatDesc outMeta(GMatDesc in, GMatDesc, GMatDesc, GMatDesc) {
+            // Preserve depth and add channel component
+            return in.withType(in.depth, 4);
+        }
+    };
+
+    G_TYPED_KERNEL(GRemap, <GMat(GMat, Mat, Mat, int, int, Scalar)>, "org.opencv.core.transform.remap") {
+        static GMatDesc outMeta(GMatDesc in, Mat m1, Mat, int, int, Scalar) {
+            return in.withSize(m1.size());
+        }
+    };
+
+    G_TYPED_KERNEL(GFlip, <GMat(GMat, int)>, "org.opencv.core.transform.flip") {
+        static GMatDesc outMeta(GMatDesc in, int) {
+            return in;
+        }
+    };
+
+    // TODO: eliminate the need for this kernel (streaming)
+    G_TYPED_KERNEL(GCrop, <GMat(GMat, Rect)>, "org.opencv.core.transform.crop") {
+        static GMatDesc outMeta(GMatDesc in, Rect rc) {
+            return in.withSize(Size(rc.width, rc.height));
+        }
+    };
+
+    G_TYPED_KERNEL(GConcatHor, <GMat(GMat, GMat)>, "org.opencv.imgproc.transform.concatHor") {
+        static GMatDesc outMeta(GMatDesc l, GMatDesc r) {
+            return l.withSizeDelta(+r.size.width, 0);
+        }
+    };
+
+    G_TYPED_KERNEL(GConcatVert, <GMat(GMat, GMat)>, "org.opencv.imgproc.transform.concatVert") {
+        static GMatDesc outMeta(GMatDesc t, GMatDesc b) {
+            return t.withSizeDelta(0, +b.size.height);
+        }
+    };
+
+    G_TYPED_KERNEL(GLUT, <GMat(GMat, Mat)>, "org.opencv.core.transform.LUT") {
+        static GMatDesc outMeta(GMatDesc in, Mat) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GConvertTo, <GMat(GMat, int, double, double)>, "org.opencv.core.transform.convertTo") {
+        static GMatDesc outMeta(GMatDesc in, int rdepth, double, double) {
+            return rdepth < 0 ? in : in.withDepth(rdepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GSqrt, <GMat(GMat)>, "org.opencv.core.math.sqrt") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GNormalize, <GMat(GMat, double, double, int, int)>, "org.opencv.core.normalize") {
+        static GMatDesc outMeta(GMatDesc in, double, double, int, int ddepth) {
+            // unlike cv::normalize(), this kernel does not take a mask parameter
+            return (ddepth < 0 ? in : in.withDepth(ddepth));
+        }
+    };
+
+    G_TYPED_KERNEL(GWarpPerspective, <GMat(GMat, const Mat&, Size, int, int, const cv::Scalar&)>, "org.opencv.core.warpPerspective") {
+        static GMatDesc outMeta(GMatDesc in, const Mat&, Size dsize, int, int borderMode, const cv::Scalar&) {
+            GAPI_Assert((borderMode == cv::BORDER_CONSTANT || borderMode == cv::BORDER_REPLICATE) &&
+                        "cv::gapi::warpPerspective supports only cv::BORDER_CONSTANT and cv::BORDER_REPLICATE border modes");
+            return in.withType(in.depth, in.chan).withSize(dsize);
+        }
+    };
+
+    G_TYPED_KERNEL(GWarpAffine, <GMat(GMat, const Mat&, Size, int, int, const cv::Scalar&)>, "org.opencv.core.warpAffine") {
+        static GMatDesc outMeta(GMatDesc in, const Mat&, Size dsize, int, int border_mode, const cv::Scalar&) {
+            GAPI_Assert(border_mode != cv::BORDER_TRANSPARENT &&
+                        "cv::BORDER_TRANSPARENT mode is not supported in cv::gapi::warpAffine");
+            return in.withType(in.depth, in.chan).withSize(dsize);
+        }
+    };
+
+    G_TYPED_KERNEL(
+        GKMeansND,
+        <std::tuple<GOpaque<double>,GMat,GMat>(GMat,int,GMat,TermCriteria,int,KmeansFlags)>,
+        "org.opencv.core.kmeansND") {
+
+        static std::tuple<GOpaqueDesc,GMatDesc,GMatDesc>
+        outMeta(const GMatDesc& in, int K, const GMatDesc& bestLabels, const TermCriteria&, int,
+                KmeansFlags flags) {
+            GAPI_Assert(in.depth == CV_32F);
+            std::vector<int> amount_n_dim = detail::checkVector(in);
+            int amount = amount_n_dim[0], dim = amount_n_dim[1];
+            if (amount == -1)   // Mat with height != 1, width != 1, channels != 1 given
+            {                   // which means that kmeans will consider the following:
+                amount = in.size.height;
+                dim    = in.size.width * in.chan;
+            }
+            // kmeans sets these labels' sizes when no bestLabels given:
+            GMatDesc out_labels(CV_32S, 1, Size{1, amount});
+            // kmeans always sets these centers' sizes:
+            GMatDesc centers   (CV_32F, 1, Size{dim, K});
+            if (flags & KMEANS_USE_INITIAL_LABELS)
+            {
+                GAPI_Assert(bestLabels.depth == CV_32S);
+                int labels_amount = detail::checkVector(bestLabels, 1u);
+                GAPI_Assert(labels_amount == amount);
+                out_labels = bestLabels;  // kmeans preserves bestLabels' sizes if given
+            }
+            return std::make_tuple(empty_gopaque_desc(), out_labels, centers);
+        }
+    };
+
+    G_TYPED_KERNEL(
+        GKMeansNDNoInit,
+        <std::tuple<GOpaque<double>,GMat,GMat>(GMat,int,TermCriteria,int,KmeansFlags)>,
+        "org.opencv.core.kmeansNDNoInit") {
+
+        static std::tuple<GOpaqueDesc,GMatDesc,GMatDesc>
+        outMeta(const GMatDesc& in, int K, const TermCriteria&, int, KmeansFlags flags) {
+            GAPI_Assert( !(flags & KMEANS_USE_INITIAL_LABELS) );
+            GAPI_Assert(in.depth == CV_32F);
+            std::vector<int> amount_n_dim = detail::checkVector(in);
+            int amount = amount_n_dim[0], dim = amount_n_dim[1];
+            if (amount == -1) // Mat with height != 1, width != 1, channels != 1 given
+            {                   // which means that kmeans will consider the following:
+                amount = in.size.height;
+                dim    = in.size.width * in.chan;
+            }
+            GMatDesc out_labels(CV_32S, 1, Size{1, amount});
+            GMatDesc centers   (CV_32F, 1, Size{dim, K});
+            return std::make_tuple(empty_gopaque_desc(), out_labels, centers);
+        }
+    };
+
+    G_TYPED_KERNEL(GKMeans2D, <std::tuple<GOpaque<double>,GArray<int>,GArray<Point2f>>
+                               (GArray<Point2f>,int,GArray<int>,TermCriteria,int,KmeansFlags)>,
+                   "org.opencv.core.kmeans2D") {
+        static std::tuple<GOpaqueDesc,GArrayDesc,GArrayDesc>
+        outMeta(const GArrayDesc&,int,const GArrayDesc&,const TermCriteria&,int,KmeansFlags) {
+            return std::make_tuple(empty_gopaque_desc(), empty_array_desc(), empty_array_desc());
+        }
+    };
+
+    G_TYPED_KERNEL(GKMeans3D, <std::tuple<GOpaque<double>,GArray<int>,GArray<Point3f>>
+                               (GArray<Point3f>,int,GArray<int>,TermCriteria,int,KmeansFlags)>,
+                   "org.opencv.core.kmeans3D") {
+        static std::tuple<GOpaqueDesc,GArrayDesc,GArrayDesc>
+        outMeta(const GArrayDesc&,int,const GArrayDesc&,const TermCriteria&,int,KmeansFlags) {
+            return std::make_tuple(empty_gopaque_desc(), empty_array_desc(), empty_array_desc());
+        }
+    };
+
+    G_TYPED_KERNEL(GTranspose, <GMat(GMat)>, "org.opencv.core.transpose") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in.withSize({in.size.height, in.size.width});
+        }
+    };
+} // namespace core
+
+namespace streaming {
+
+// Operations for Streaming (declared in this header for convenience)
+G_TYPED_KERNEL(GSize, <GOpaque<Size>(GMat)>, "org.opencv.streaming.size") {
+    static GOpaqueDesc outMeta(const GMatDesc&) {
+        return empty_gopaque_desc();
+    }
+};
+
+G_TYPED_KERNEL(GSizeR, <GOpaque<Size>(GOpaque<Rect>)>, "org.opencv.streaming.sizeR") {
+    static GOpaqueDesc outMeta(const GOpaqueDesc&) {
+        return empty_gopaque_desc();
+    }
+};
+
+G_TYPED_KERNEL(GSizeMF, <GOpaque<Size>(GFrame)>, "org.opencv.streaming.sizeMF") {
+    static GOpaqueDesc outMeta(const GFrameDesc&) {
+        return empty_gopaque_desc();
+    }
+};
+} // namespace streaming
+
+//! @addtogroup gapi_math
+//! @{
+
+/** @brief Calculates the per-element sum of two matrices.
+
+The function add calculates sum of two matrices of the same size and the same number of channels:
+\f[\texttt{dst}(I) =  \texttt{saturate} ( \texttt{src1}(I) +  \texttt{src2}(I))\f]
+
+The function can be replaced with matrix expressions:
+    \f[\texttt{dst} =  \texttt{src1} + \texttt{src2}\f]
+
+The input matrices and the output matrix can all have the same or different depths. For example, you
+can add a 16-bit unsigned matrix to an 8-bit signed matrix and store the sum as a 32-bit
+floating-point matrix. Depth of the output matrix is determined by the ddepth parameter.
+If src1.depth() == src2.depth(), ddepth can be set to the default -1. In this case, the output matrix will have
+the same depth as the input matrices.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.add"
+@param src1 first input matrix.
+@param src2 second input matrix.
+@param ddepth optional depth of the output matrix.
+@sa sub, addWeighted
+*/
+GAPI_EXPORTS_W GMat add(const GMat& src1, const GMat& src2, int ddepth = -1);
+
+/** @brief Calculates the per-element sum of matrix and given scalar.
+
+The function addC adds a given scalar value to each element of given matrix.
+The function can be replaced with matrix expressions:
+
+    \f[\texttt{dst} =  \texttt{src1} + \texttt{c}\f]
+
+Depth of the output matrix is determined by the ddepth parameter.
+If ddepth is set to default -1, the depth of output matrix will be the same as the depth of input matrix.
+The matrices can be single or multi channel. Output matrix must have the same size and number of channels as the input matrix.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.addC"
+@param src1 first input matrix.
+@param c scalar value to be added.
+@param ddepth optional depth of the output matrix.
+@sa sub, addWeighted
+*/
+GAPI_EXPORTS_W GMat addC(const GMat& src1, const GScalar& c, int ddepth = -1);
+//! @overload
+GAPI_EXPORTS_W GMat addC(const GScalar& c, const GMat& src1, int ddepth = -1);
+
+/** @brief Calculates the per-element difference between two matrices.
+
+The function sub calculates difference between two matrices, when both matrices have the same size and the same number of
+channels:
+    \f[\texttt{dst}(I) =   \texttt{src1}(I) -  \texttt{src2}(I)\f]
+
+The function can be replaced with matrix expressions:
+\f[\texttt{dst} =   \texttt{src1} -  \texttt{src2}\f]
+
+The input matrices and the output matrix can all have the same or different depths. For example, you
+can subtract two 8-bit unsigned matrices and store the result as a 16-bit signed matrix.
+Depth of the output matrix is determined by the ddepth parameter.
+If src1.depth() == src2.depth(), ddepth can be set to the default -1. In this case, the output matrix will have
+the same depth as the input matrices. The matrices can be single or multi channel.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.sub"
+@param src1 first input matrix.
+@param src2 second input matrix.
+@param ddepth optional depth of the output matrix.
+@sa  add, addC
+  */
+GAPI_EXPORTS_W GMat sub(const GMat& src1, const GMat& src2, int ddepth = -1);
+
+/** @brief Calculates the per-element difference between matrix and given scalar.
+
+The function can be replaced with matrix expressions:
+    \f[\texttt{dst} =  \texttt{src} - \texttt{c}\f]
+
+Depth of the output matrix is determined by the ddepth parameter.
+If ddepth is set to default -1, the depth of output matrix will be the same as the depth of input matrix.
+The matrices can be single or multi channel. Output matrix must have the same size as src.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.subC"
+@param src first input matrix.
+@param c scalar value to be subtracted.
+@param ddepth optional depth of the output matrix.
+@sa  add, addC, subRC
+  */
+GAPI_EXPORTS_W GMat subC(const GMat& src, const GScalar& c, int ddepth = -1);
+
+/** @brief Calculates the per-element difference between given scalar and the matrix.
+
+The function can be replaced with matrix expressions:
+    \f[\texttt{dst} =  \texttt{c} - \texttt{src}\f]
+
+Depth of the output matrix is determined by the ddepth parameter.
+If ddepth is set to default -1, the depth of output matrix will be the same as the depth of input matrix.
+The matrices can be single or multi channel. Output matrix must have the same size as src.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.subRC"
+@param c scalar value to subtract from.
+@param src input matrix to be subtracted.
+@param ddepth optional depth of the output matrix.
+@sa  add, addC, subC
+  */
+GAPI_EXPORTS_W GMat subRC(const GScalar& c, const GMat& src, int ddepth = -1);
+
+/** @brief Calculates the per-element scaled product of two matrices.
+
+The function mul calculates the per-element product of two matrices:
+
+\f[\texttt{dst} (I)= \texttt{saturate} ( \texttt{scale} \cdot \texttt{src1} (I)  \cdot \texttt{src2} (I))\f]
+
+If src1.depth() == src2.depth(), ddepth can be set to the default -1. In this case, the output matrix will have
+the same depth as the input matrices. The matrices can be single or multi channel.
+Output matrix must have the same size as input matrices.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.mul"
+@param src1 first input matrix.
+@param src2 second input matrix of the same size and the same depth as src1.
+@param scale optional scale factor.
+@param ddepth optional depth of the output matrix.
+@sa add, sub, div, addWeighted
+*/
+GAPI_EXPORTS_W GMat mul(const GMat& src1, const GMat& src2, double scale = 1.0, int ddepth = -1);
+
+/** @brief Multiplies matrix by scalar.
+
+The function mulC multiplies each element of matrix src by given scalar value:
+
+\f[\texttt{dst} (I)= \texttt{saturate} (  \texttt{src} (I)  \cdot \texttt{multiplier} )\f]
+
+The matrices can be single or multi channel. Output matrix must have the same size as src.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.mulC"
+@param src input matrix.
+@param multiplier factor to be multiplied.
+@param ddepth optional depth of the output matrix. If -1, the depth of output matrix will be the same as input matrix depth.
+@sa add, sub, div, addWeighted
+*/
+GAPI_EXPORTS_W GMat mulC(const GMat& src, double multiplier, int ddepth = -1);
+//! @overload
+GAPI_EXPORTS_W GMat mulC(const GMat& src, const GScalar& multiplier, int ddepth = -1);   // FIXME: merge with mulc
+//! @overload
+GAPI_EXPORTS_W GMat mulC(const GScalar& multiplier, const GMat& src, int ddepth = -1);   // FIXME: merge with mulc
+
+/** @brief Performs per-element division of two matrices.
+
+The function divides one matrix by another:
+\f[\texttt{dst(I) = saturate(src1(I)*scale/src2(I))}\f]
+
+For integer types when src2(I) is zero, dst(I) will also be zero.
+Floating point case returns Inf/NaN (according to IEEE).
+
+Different channels of
+multi-channel matrices are processed independently.
+The matrices can be single or multi channel. Output matrix must have the same size and depth as src.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.div"
+@param src1 first input matrix.
+@param src2 second input matrix of the same size and depth as src1.
+@param scale scalar factor.
+@param ddepth optional depth of the output matrix; you can only pass -1 when src1.depth() == src2.depth().
+@sa  mul, add, sub
+*/
+GAPI_EXPORTS_W GMat div(const GMat& src1, const GMat& src2, double scale, int ddepth = -1);
+
+/** @brief Divides matrix by scalar.
+
+The function divC divides each element of matrix src by given scalar value:
+
+\f[\texttt{dst(I) = saturate(src(I)*scale/divisor)}\f]
+
+When divisor is zero, dst(I) will also be zero. Different channels of
+multi-channel matrices are processed independently.
+The matrices can be single or multi channel. Output matrix must have the same size and depth as src.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.divC"
+@param src input matrix.
+@param divisor number to be divided by.
+@param ddepth optional depth of the output matrix. If -1, the depth of output matrix will be the same as input matrix depth.
+@param scale scale factor.
+@sa add, sub, div, addWeighted
+*/
+GAPI_EXPORTS_W GMat divC(const GMat& src, const GScalar& divisor, double scale, int ddepth = -1);
+
+/** @brief Divides scalar by matrix.
+
+The function divRC divides given scalar by each element of matrix src and keeps the division result in a new matrix of the same size and type as src:
+
+\f[\texttt{dst(I) = saturate(divident*scale/src(I))}\f]
+
+When src(I) is zero, dst(I) will also be zero. Different channels of
+multi-channel matrices are processed independently.
+The matrices can be single or multi channel. Output matrix must have the same size and depth as src.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.divRC"
+@param src input matrix.
+@param divident number to be divided.
+@param ddepth optional depth of the output matrix. If -1, the depth of output matrix will be the same as input matrix depth.
+@param scale scale factor
+@sa add, sub, div, addWeighted
+*/
+GAPI_EXPORTS_W GMat divRC(const GScalar& divident, const GMat& src, double scale, int ddepth = -1);
+
+/** @brief Applies a mask to a matrix.
+
+The function mask sets the output to the value from the given matrix if the corresponding pixel value in the mask matrix is set to true,
+and sets the output value to 0 otherwise.
+
+Supported src matrix data types are @ref CV_8UC1, @ref CV_16SC1, @ref CV_16UC1. Supported mask data type is @ref CV_8UC1.
+
+@note Function textual ID is "org.opencv.core.math.mask"
+@param src input matrix.
+@param mask input mask matrix.
+*/
+GAPI_EXPORTS_W GMat mask(const GMat& src, const GMat& mask);
+
+/** @brief Calculates an average (mean) of matrix elements.
+
+The function mean calculates the mean value M of matrix elements,
+independently for each channel, and returns it.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.mean"
+@param src input matrix.
+@sa  countNonZero, min, max
+*/
+GAPI_EXPORTS_W GScalar mean(const GMat& src);
+
+/** @brief Calculates x and y coordinates of 2D vectors from their magnitude and angle.
+
+The function polarToCart calculates the Cartesian coordinates of each 2D
+vector represented by the corresponding elements of magnitude and angle:
+\f[\begin{array}{l} \texttt{x} (I) =  \texttt{magnitude} (I) \cos ( \texttt{angle} (I)) \\ \texttt{y} (I) =  \texttt{magnitude} (I) \sin ( \texttt{angle} (I)) \\ \end{array}\f]
+
+The relative accuracy of the estimated coordinates is about 1e-6.
+
+First output is a matrix of x-coordinates of 2D vectors.
+Second output is a matrix of y-coordinates of 2D vectors.
+Both outputs must have the same size and depth as the input matrices.
+
+@note Function textual ID is "org.opencv.core.math.polarToCart"
+
+@param magnitude input floating-point @ref CV_32FC1 matrix (1xN) of magnitudes of 2D vectors;
+@param angle input floating-point @ref CV_32FC1 matrix (1xN) of angles of 2D vectors.
+@param angleInDegrees when true, the input angles are measured in
+degrees, otherwise, they are measured in radians.
+@sa cartToPolar, exp, log, pow, sqrt
+*/
+GAPI_EXPORTS_W std::tuple<GMat, GMat> polarToCart(const GMat& magnitude, const GMat& angle,
+                                                  bool angleInDegrees = false);
+
+/** @brief Calculates the magnitude and angle of 2D vectors.
+
+The function cartToPolar calculates either the magnitude, angle, or both
+for every 2D vector (x(I),y(I)):
+\f[\begin{array}{l} \texttt{magnitude} (I)= \sqrt{\texttt{x}(I)^2+\texttt{y}(I)^2} , \\ \texttt{angle} (I)= \texttt{atan2} ( \texttt{y} (I), \texttt{x} (I))[ \cdot180 / \pi ] \end{array}\f]
+
+The angles are calculated with accuracy about 0.3 degrees. For the point
+(0,0), the angle is set to 0.
+
+First output is a matrix of magnitudes of the same size and depth as input x.
+Second output is a matrix of angles that has the same size and depth as
+x; the angles are measured in radians (from 0 to 2\*Pi) or in degrees (0 to 360 degrees).
+
+@note Function textual ID is "org.opencv.core.math.cartToPolar"
+
+@param x matrix of @ref CV_32FC1 x-coordinates.
+@param y array of @ref CV_32FC1 y-coordinates.
+@param angleInDegrees a flag, indicating whether the angles are measured
+in radians (which is by default), or in degrees.
+@sa polarToCart
+*/
+GAPI_EXPORTS_W std::tuple<GMat, GMat> cartToPolar(const GMat& x, const GMat& y,
+                                                  bool angleInDegrees = false);
+
+/** @brief Calculates the rotation angle of 2D vectors.
+
+The function cv::gapi::phase calculates the rotation angle of each 2D vector that
+is formed from the corresponding elements of x and y :
+\f[\texttt{angle} (I) =  \texttt{atan2} ( \texttt{y} (I), \texttt{x} (I))\f]
+
+The angle estimation accuracy is about 0.3 degrees. When x(I)=y(I)=0 ,
+the corresponding angle(I) is set to 0.
+@param x input floating-point array of x-coordinates of 2D vectors.
+@param y input array of y-coordinates of 2D vectors; it must have the
+same size and the same type as x.
+@param angleInDegrees when true, the function calculates the angle in
+degrees, otherwise, they are measured in radians.
+@return array of vector angles; it has the same size and same type as x.
+*/
+GAPI_EXPORTS_W GMat phase(const GMat& x, const GMat &y, bool angleInDegrees = false);
+
+/** @brief Calculates a square root of array elements.
+
+The function cv::gapi::sqrt calculates a square root of each input array element.
+In case of multi-channel arrays, each channel is processed
+independently. The accuracy is approximately the same as of the built-in
+std::sqrt .
+@param src input floating-point array.
+@return output array of the same size and type as src.
+*/
+GAPI_EXPORTS_W GMat sqrt(const GMat &src);
+
+//! @} gapi_math
+//!
+//! @addtogroup gapi_pixelwise
+//! @{
+
+/** @brief Performs the per-element comparison of two matrices checking if elements from first matrix are greater than elements in second.
+
+The function compares elements of two matrices src1 and src2 of the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  > \texttt{src2} (I)\f]
+
+When the comparison result is true, the corresponding element of output
+array is set to 255. The comparison operations can be replaced with the
+equivalent matrix expressions:
+\f[\texttt{dst} =   \texttt{src1} > \texttt{src2}\f]
+
+Output matrix of depth @ref CV_8U must have the same size and the same number of channels as
+    the input matrices/matrix.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpGT"
+@param src1 first input matrix.
+@param src2 second input matrix/scalar of the same depth as first input matrix.
+@sa min, max, threshold, cmpLE, cmpGE, cmpLT
+*/
+GAPI_EXPORTS_W GMat cmpGT(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpGTScalar"
+*/
+GAPI_EXPORTS_W GMat cmpGT(const GMat& src1, const GScalar& src2);
+
+/** @brief Performs the per-element comparison of two matrices checking if elements from first matrix are less than elements in second.
+
+The function compares elements of two matrices src1 and src2 of the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  < \texttt{src2} (I)\f]
+
+When the comparison result is true, the corresponding element of output
+array is set to 255. The comparison operations can be replaced with the
+equivalent matrix expressions:
+    \f[\texttt{dst} =   \texttt{src1} < \texttt{src2}\f]
+
+Output matrix of depth @ref CV_8U must have the same size and the same number of channels as
+    the input matrices/matrix.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpLT"
+@param src1 first input matrix.
+@param src2 second input matrix/scalar of the same depth as first input matrix.
+@sa min, max, threshold, cmpLE, cmpGE, cmpGT
+*/
+GAPI_EXPORTS_W GMat cmpLT(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpLTScalar"
+*/
+GAPI_EXPORTS_W GMat cmpLT(const GMat& src1, const GScalar& src2);
+
+/** @brief Performs the per-element comparison of two matrices checking if elements from first matrix are greater than or equal to elements in second.
+
+The function compares elements of two matrices src1 and src2 of the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  >= \texttt{src2} (I)\f]
+
+When the comparison result is true, the corresponding element of output
+array is set to 255. The comparison operations can be replaced with the
+equivalent matrix expressions:
+    \f[\texttt{dst} =   \texttt{src1} >= \texttt{src2}\f]
+
+Output matrix of depth @ref CV_8U must have the same size and the same number of channels as
+    the input matrices.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpGE"
+@param src1 first input matrix.
+@param src2 second input matrix/scalar of the same depth as first input matrix.
+@sa min, max, threshold, cmpLE, cmpGT, cmpLT
+*/
+GAPI_EXPORTS_W GMat cmpGE(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpGEScalar"
+*/
+GAPI_EXPORTS_W GMat cmpGE(const GMat& src1, const GScalar& src2);
+
+/** @brief Performs the per-element comparison of two matrices checking if elements from first matrix are less than or equal to elements in second.
+
+The function compares elements of two matrices src1 and src2 of the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  <=  \texttt{src2} (I)\f]
+
+When the comparison result is true, the corresponding element of output
+array is set to 255. The comparison operations can be replaced with the
+equivalent matrix expressions:
+    \f[\texttt{dst} =   \texttt{src1} <= \texttt{src2}\f]
+
+Output matrix of depth @ref CV_8U must have the same size and the same number of channels as
+    the input matrices.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpLE"
+@param src1 first input matrix.
+@param src2 second input matrix/scalar of the same depth as first input matrix.
+@sa min, max, threshold, cmpGT, cmpGE, cmpLT
+*/
+GAPI_EXPORTS_W GMat cmpLE(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpLEScalar"
+*/
+GAPI_EXPORTS_W GMat cmpLE(const GMat& src1, const GScalar& src2);
+
+/** @brief Performs the per-element comparison of two matrices checking if elements from first matrix are equal to elements in second.
+
+The function compares elements of two matrices src1 and src2 of the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  ==  \texttt{src2} (I)\f]
+
+When the comparison result is true, the corresponding element of output
+array is set to 255. The comparison operations can be replaced with the
+equivalent matrix expressions:
+    \f[\texttt{dst} =   \texttt{src1} == \texttt{src2}\f]
+
+Output matrix of depth @ref CV_8U must have the same size and the same number of channels as
+    the input matrices.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpEQ"
+@param src1 first input matrix.
+@param src2 second input matrix/scalar of the same depth as first input matrix.
+@sa min, max, threshold, cmpNE
+*/
+GAPI_EXPORTS_W GMat cmpEQ(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpEQScalar"
+*/
+GAPI_EXPORTS_W GMat cmpEQ(const GMat& src1, const GScalar& src2);
+
+/** @brief Performs the per-element comparison of two matrices checking if elements from first matrix are not equal to elements in second.
+
+The function compares elements of two matrices src1 and src2 of the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  !=  \texttt{src2} (I)\f]
+
+When the comparison result is true, the corresponding element of output
+array is set to 255. The comparison operations can be replaced with the
+equivalent matrix expressions:
+    \f[\texttt{dst} =   \texttt{src1} != \texttt{src2}\f]
+
+Output matrix of depth @ref CV_8U must have the same size and the same number of channels as
+    the input matrices.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpNE"
+@param src1 first input matrix.
+@param src2 second input matrix/scalar of the same depth as first input matrix.
+@sa min, max, threshold, cmpEQ
+*/
+GAPI_EXPORTS_W GMat cmpNE(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpNEScalar"
+*/
+GAPI_EXPORTS_W GMat cmpNE(const GMat& src1, const GScalar& src2);
+
+/** @brief computes bitwise conjunction of the two matrices (src1 & src2)
+Calculates the per-element bit-wise logical conjunction of two matrices of the same size.
+
+In case of floating-point matrices, their machine-specific bit
+representations (usually IEEE754-compliant) are used for the operation.
+In case of multi-channel matrices, each channel is processed
+independently. Output matrix must have the same size and depth as the input
+matrices.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.bitwise_and"
+
+@param src1 first input matrix.
+@param src2 second input matrix.
+*/
+GAPI_EXPORTS_W GMat bitwise_and(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.bitwise_andS"
+@param src1 first input matrix.
+@param src2 scalar, which will be per-element conjuncted with elements of src1.
+*/
+GAPI_EXPORTS_W GMat bitwise_and(const GMat& src1, const GScalar& src2);
+
+/** @brief computes bitwise disjunction of the two matrices (src1 | src2)
+Calculates the per-element bit-wise logical disjunction of two matrices of the same size.
+
+In case of floating-point matrices, their machine-specific bit
+representations (usually IEEE754-compliant) are used for the operation.
+In case of multi-channel matrices, each channel is processed
+independently. Output matrix must have the same size and depth as the input
+matrices.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.bitwise_or"
+
+@param src1 first input matrix.
+@param src2 second input matrix.
+*/
+GAPI_EXPORTS_W GMat bitwise_or(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.bitwise_orS"
+@param src1 first input matrix.
+@param src2 scalar, which will be per-element disjuncted with elements of src1.
+*/
+GAPI_EXPORTS_W GMat bitwise_or(const GMat& src1, const GScalar& src2);
+
+
+/** @brief computes bitwise logical "exclusive or" of the two matrices (src1 ^ src2)
+Calculates the per-element bit-wise logical "exclusive or" of two matrices of the same size.
+
+In case of floating-point matrices, their machine-specific bit
+representations (usually IEEE754-compliant) are used for the operation.
+In case of multi-channel matrices, each channel is processed
+independently. Output matrix must have the same size and depth as the input
+matrices.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.bitwise_xor"
+
+@param src1 first input matrix.
+@param src2 second input matrix.
+*/
+GAPI_EXPORTS_W GMat bitwise_xor(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.bitwise_xorS"
+@param src1 first input matrix.
+@param src2 scalar, for which per-element "exclusive or" operation on elements of src1 will be performed.
+*/
+GAPI_EXPORTS_W GMat bitwise_xor(const GMat& src1, const GScalar& src2);
+
+
+/** @brief Inverts every bit of an array.
+
+The function bitwise_not calculates per-element bit-wise inversion of the input
+matrix:
+\f[\texttt{dst} (I) =  \neg \texttt{src} (I)\f]
+
+In case of floating-point matrices, their machine-specific bit
+representations (usually IEEE754-compliant) are used for the operation.
+In case of multi-channel matrices, each channel is processed
+independently. Output matrix must have the same size and depth as the input
+matrix.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.bitwise_not"
+
+@param src input matrix.
+*/
+GAPI_EXPORTS_W GMat bitwise_not(const GMat& src);
+
+/** @brief Select values from either first or second of input matrices by given mask.
+The function sets each output matrix element to the value from the first input matrix if the corresponding value of the mask matrix is 255,
+ or to the value from the second input matrix (if the value of the mask matrix is set to 0).
+
+Input mask matrix must be of @ref CV_8UC1 type, two other input matrices and output matrix should be of the same type. The size should
+be the same for all input and output matrices.
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.select"
+
+@param src1 first input matrix.
+@param src2 second input matrix.
+@param mask mask input matrix.
+*/
+GAPI_EXPORTS_W GMat select(const GMat& src1, const GMat& src2, const GMat& mask);
+
+//! @} gapi_pixelwise
+
+
+//! @addtogroup gapi_matrixop
+//! @{
+/** @brief Calculates per-element minimum of two matrices.
+
+The function min calculates the per-element minimum of two matrices of the same size, number of channels and depth:
+\f[\texttt{dst} (I)= \min ( \texttt{src1} (I), \texttt{src2} (I))\f]
+    where I is a multi-dimensional index of matrix elements. In case of
+    multi-channel matrices, each channel is processed independently.
+Output matrix must be of the same size and depth as src1.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.min"
+@param src1 first input matrix.
+@param src2 second input matrix of the same size and depth as src1.
+@sa max, cmpEQ, cmpLT, cmpLE
+*/
+GAPI_EXPORTS_W GMat min(const GMat& src1, const GMat& src2);
+
+/** @brief Calculates per-element maximum of two matrices.
+
+The function max calculates the per-element maximum of two matrices of the same size, number of channels and depth:
+\f[\texttt{dst} (I)= \max ( \texttt{src1} (I), \texttt{src2} (I))\f]
+    where I is a multi-dimensional index of matrix elements. In case of
+    multi-channel matrices, each channel is processed independently.
+Output matrix must be of the same size and depth as src1.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.max"
+@param src1 first input matrix.
+@param src2 second input matrix of the same size and depth as src1.
+@sa min, compare, cmpEQ, cmpGT, cmpGE
+*/
+GAPI_EXPORTS_W GMat max(const GMat& src1, const GMat& src2);
+
+/** @brief Calculates the per-element absolute difference between two matrices.
+
+The function absDiff calculates absolute difference between two matrices of the same size and depth:
+    \f[\texttt{dst}(I) =  \texttt{saturate} (| \texttt{src1}(I) -  \texttt{src2}(I)|)\f]
+    where I is a multi-dimensional index of matrix elements. In case of
+    multi-channel matrices, each channel is processed independently.
+Output matrix must have the same size and depth as input matrices.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.absdiff"
+@param src1 first input matrix.
+@param src2 second input matrix.
+@sa abs
+*/
+GAPI_EXPORTS_W GMat absDiff(const GMat& src1, const GMat& src2);
+
+/** @brief Calculates the per-element absolute difference between a matrix and a scalar.
+
+The function absDiffC calculates absolute difference between matrix elements and given scalar value:
+    \f[\texttt{dst}(I) =  \texttt{saturate} (| \texttt{src}(I) -  \texttt{matC}(I)|)\f]
+    where matC is constructed from given scalar c and has the same sizes and depth as input matrix src.
+
+Output matrix must be of the same size and depth as src.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.absdiffC"
+@param src input matrix.
+@param c scalar to be subtracted.
+@sa min, max
+*/
+GAPI_EXPORTS_W GMat absDiffC(const GMat& src, const GScalar& c);
+
+/** @brief Calculates sum of all matrix elements.
+
+The function sum calculates sum of all matrix elements, independently for each channel.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.sum"
+@param src input matrix.
+@sa countNonZero, mean, min, max
+*/
+GAPI_EXPORTS_W GScalar sum(const GMat& src);
+
+/** @brief Counts non-zero array elements.
+
+The function returns the number of non-zero elements in src :
+\f[\sum _{I: \; \texttt{src} (I) \ne0 } 1\f]
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.countNonZero"
+@param src input single-channel matrix.
+@sa  mean, min, max
+*/
+GAPI_EXPORTS_W GOpaque<int> countNonZero(const GMat& src);
+
+/** @brief Calculates the weighted sum of two matrices.
+
+The function addWeighted calculates the weighted sum of two matrices as follows:
+\f[\texttt{dst} (I)= \texttt{saturate} ( \texttt{src1} (I)* \texttt{alpha} +  \texttt{src2} (I)* \texttt{beta} +  \texttt{gamma} )\f]
+where I is a multi-dimensional index of array elements. In case of multi-channel matrices, each
+channel is processed independently.
+
+The function can be replaced with a matrix expression:
+    \f[\texttt{dst}(I) =  \texttt{alpha} * \texttt{src1}(I) + \texttt{beta} * \texttt{src2}(I) + \texttt{gamma} \f]
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.addweighted"
+@param src1 first input matrix.
+@param alpha weight of the first matrix elements.
+@param src2 second input matrix of the same size and channel number as src1.
+@param beta weight of the second matrix elements.
+@param gamma scalar added to each sum.
+@param ddepth optional depth of the output matrix.
+@sa  add, sub
+*/
+GAPI_EXPORTS_W GMat addWeighted(const GMat& src1, double alpha, const GMat& src2, double beta, double gamma, int ddepth = -1);
+
+/** @brief Calculates the  absolute L1 norm of a matrix.
+
+This version of normL1 calculates the absolute L1 norm of src.
+
+As example for one array consider the function \f$r(x)= \begin{pmatrix} x \\ 1-x \end{pmatrix}, x \in [-1;1]\f$.
+The \f$ L_{1} \f$ norm for the sample value \f$r(-1) = \begin{pmatrix} -1 \\ 2 \end{pmatrix}\f$
+is calculated as follows
+\f{align*}
+    \| r(-1) \|_{L_1} &= |-1| + |2| = 3 \\
+\f}
+and for \f$r(0.5) = \begin{pmatrix} 0.5 \\ 0.5 \end{pmatrix}\f$ the calculation is
+\f{align*}
+    \| r(0.5) \|_{L_1} &= |0.5| + |0.5| = 1 \\
+\f}
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.norml1"
+@param src input matrix.
+@sa normL2, normInf
+*/
+GAPI_EXPORTS_W GScalar normL1(const GMat& src);
+
+/** @brief Calculates the absolute L2 norm of a matrix.
+
+This version of normL2 calculates the absolute L2 norm of src.
+
+As example for one array consider the function \f$r(x)= \begin{pmatrix} x \\ 1-x \end{pmatrix}, x \in [-1;1]\f$.
+The \f$ L_{2} \f$  norm for the sample value \f$r(-1) = \begin{pmatrix} -1 \\ 2 \end{pmatrix}\f$
+is calculated as follows
+\f{align*}
+    \| r(-1) \|_{L_2} &= \sqrt{(-1)^{2} + (2)^{2}} = \sqrt{5} \\
+\f}
+and for \f$r(0.5) = \begin{pmatrix} 0.5 \\ 0.5 \end{pmatrix}\f$ the calculation is
+\f{align*}
+    \| r(0.5) \|_{L_2} &= \sqrt{(0.5)^{2} + (0.5)^{2}} = \sqrt{0.5} \\
+\f}
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+@note Function textual ID is "org.opencv.core.matrixop.norml2"
+@param src input matrix.
+@sa normL1, normInf
+*/
+GAPI_EXPORTS_W GScalar normL2(const GMat& src);
+
+/** @brief Calculates the absolute infinite norm of a matrix.
+
+This version of normInf calculates the absolute infinite norm of src.
+
+As example for one array consider the function \f$r(x)= \begin{pmatrix} x \\ 1-x \end{pmatrix}, x \in [-1;1]\f$.
+The \f$ L_{\infty} \f$ norm for the sample value \f$r(-1) = \begin{pmatrix} -1 \\ 2 \end{pmatrix}\f$
+is calculated as follows
+\f{align*}
+    \| r(-1) \|_{L_\infty} &= \max(|-1|,|2|) = 2
+\f}
+and for \f$r(0.5) = \begin{pmatrix} 0.5 \\ 0.5 \end{pmatrix}\f$ the calculation is
+\f{align*}
+    \| r(0.5) \|_{L_\infty} &= \max(|0.5|,|0.5|) = 0.5.
+\f}
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.norminf"
+@param src input matrix.
+@sa normL1, normL2
+*/
+GAPI_EXPORTS_W GScalar normInf(const GMat& src);
+
+/** @brief Calculates the integral of an image.
+
+The function calculates one or more integral images for the source image as follows:
+
+\f[\texttt{sum} (X,Y) =  \sum _{x<X,y<Y}  \texttt{image} (x,y)\f]
+
+\f[\texttt{sqsum} (X,Y) =  \sum _{x<X,y<Y}  \texttt{image} (x,y)^2\f]
+
+The function returns the integral image as a \f$(W+1)\times (H+1)\f$, 32-bit integer or floating-point (32f or 64f) array, and the
+ integral image for squared pixel values; it is a \f$(W+1)\times (H+1)\f$, double-precision floating-point (64f) array.
+
+@note Function textual ID is "org.opencv.core.matrixop.integral"
+
+@param src input image.
+@param sdepth desired depth of the integral and the tilted integral images, CV_32S, CV_32F, or
+CV_64F.
+@param sqdepth desired depth of the integral image of squared pixel values, CV_32F or CV_64F.
+ */
+GAPI_EXPORTS_W std::tuple<GMat, GMat> integral(const GMat& src, int sdepth = -1, int sqdepth = -1);
+
+/** @brief Applies a fixed-level threshold to each matrix element.
+
+The function applies fixed-level thresholding to a single- or multiple-channel matrix.
+The function is typically used to get a bi-level (binary) image out of a grayscale image ( cmp functions could be also used for
+this purpose) or for removing a noise, that is, filtering out pixels with too small or too large
+values. There are several types of thresholding supported by the function. They are determined by
+type parameter.
+
+Also, the special values cv::THRESH_OTSU or cv::THRESH_TRIANGLE may be combined with one of the
+above values. In these cases, the function determines the optimal threshold value using the Otsu's
+or Triangle algorithm and uses it instead of the specified thresh . The function returns the
+computed threshold value in addition to thresholded matrix.
+The Otsu's and Triangle methods are implemented only for 8-bit matrices.
+
+Input image should be single channel only in case of cv::THRESH_OTSU or cv::THRESH_TRIANGLE flags.
+Output matrix must be of the same size and depth as src.
+
+@note Function textual ID is "org.opencv.core.matrixop.threshold"
+
+@param src input matrix (@ref CV_8UC1, @ref CV_8UC3, or @ref CV_32FC1).
+@param thresh threshold value.
+@param maxval maximum value to use with the cv::THRESH_BINARY and cv::THRESH_BINARY_INV thresholding
+types.
+@param type thresholding type (see the cv::ThresholdTypes).
+
+@sa min, max, cmpGT, cmpLE, cmpGE, cmpLT
+ */
+GAPI_EXPORTS_W GMat threshold(const GMat& src, const GScalar& thresh, const GScalar& maxval, int type);
+/** @overload
+This overload is applicable only for the cv::THRESH_OTSU and cv::THRESH_TRIANGLE threshold types; the threshold value is computed automatically and returned along with the thresholded matrix.
+@note Function textual ID is "org.opencv.core.matrixop.thresholdOT"
+*/
+GAPI_EXPORTS_W std::tuple<GMat, GScalar> threshold(const GMat& src, const GScalar& maxval, int type);
+
+/** @brief Applies a range-level threshold to each matrix element.
+
+The function applies range-level thresholding to a single- or multiple-channel matrix.
+It sets output pixel value to 0xFF if the corresponding pixel value of input matrix is in specified range, or 0 otherwise.
+
+Input and output matrices must be CV_8UC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.inRange"
+
+@param src input matrix (CV_8UC1).
+@param threshLow lower boundary value.
+@param threshUp upper boundary value.
+
+@sa threshold
+ */
+GAPI_EXPORTS_W GMat inRange(const GMat& src, const GScalar& threshLow, const GScalar& threshUp);
+
+//! @} gapi_matrixop
+
+//! @addtogroup gapi_transform
+//! @{
+/** @brief Creates one 4-channel matrix out of 4 single-channel ones.
+
+The function merges several matrices to make a single multi-channel matrix. That is, each
+element of the output matrix will be a concatenation of the elements of the input matrices, where
+elements of i-th input matrix are treated as mv[i].channels()-element vectors.
+Output matrix must be of @ref CV_8UC4 type.
+
+The function split4 does the reverse operation.
+
+@note
+ - Function textual ID is "org.opencv.core.transform.merge4"
+
+@param src1 first input @ref CV_8UC1 matrix to be merged.
+@param src2 second input @ref CV_8UC1 matrix to be merged.
+@param src3 third input @ref CV_8UC1 matrix to be merged.
+@param src4 fourth input @ref CV_8UC1 matrix to be merged.
+@sa merge3, split4, split3
+*/
+GAPI_EXPORTS_W GMat merge4(const GMat& src1, const GMat& src2, const GMat& src3, const GMat& src4);
+
+/** @brief Creates one 3-channel matrix out of 3 single-channel ones.
+
+The function merges several matrices to make a single multi-channel matrix. That is, each
+element of the output matrix will be a concatenation of the elements of the input matrices, where
+elements of i-th input matrix are treated as mv[i].channels()-element vectors.
+Output matrix must be of @ref CV_8UC3 type.
+
+The function split3 does the reverse operation.
+
+@note
+ - Function textual ID is "org.opencv.core.transform.merge3"
+
+@param src1 first input @ref CV_8UC1 matrix to be merged.
+@param src2 second input @ref CV_8UC1 matrix to be merged.
+@param src3 third input @ref CV_8UC1 matrix to be merged.
+@sa merge4, split4, split3
+*/
+GAPI_EXPORTS_W GMat merge3(const GMat& src1, const GMat& src2, const GMat& src3);
+
+/** @brief Divides a 4-channel matrix into 4 single-channel matrices.
+
+The function splits a 4-channel matrix into 4 single-channel matrices:
+\f[\texttt{mv} [c](I) =  \texttt{src} (I)_c\f]
+
+All output matrices must be of @ref CV_8UC1 type.
+
+The function merge4 does the reverse operation.
+
+@note
+ - Function textual ID is "org.opencv.core.transform.split4"
+
+@param src input @ref CV_8UC4 matrix.
+@sa split3, merge3, merge4
+*/
+GAPI_EXPORTS_W std::tuple<GMat, GMat, GMat,GMat> split4(const GMat& src);
+
+/** @brief Divides a 3-channel matrix into 3 single-channel matrices.
+
+The function splits a 3-channel matrix into 3 single-channel matrices:
+\f[\texttt{mv} [c](I) =  \texttt{src} (I)_c\f]
+
+All output matrices must be of @ref CV_8UC1 type.
+
+The function merge3 does the reverse operation.
+
+@note
+ - Function textual ID is "org.opencv.core.transform.split3"
+
+@param src input @ref CV_8UC3 matrix.
+@sa split4, merge3, merge4
+*/
+GAPI_EXPORTS_W std::tuple<GMat, GMat, GMat> split3(const GMat& src);
+
+/** @brief Applies a generic geometrical transformation to an image.
+
+The function remap transforms the source image using the specified map:
+
+\f[\texttt{dst} (x,y) =  \texttt{src} (map_x(x,y),map_y(x,y))\f]
+
+where values of pixels with non-integer coordinates are computed using one of available
+interpolation methods. \f$map_x\f$ and \f$map_y\f$ can be encoded as separate floating-point maps
+in \f$map_1\f$ and \f$map_2\f$ respectively, or interleaved floating-point maps of \f$(x,y)\f$ in
+\f$map_1\f$, or fixed-point maps created by using convertMaps. The reason you might want to
+convert from floating to fixed-point representations of a map is that they can yield much faster
+(\~2x) remapping operations. In the converted case, \f$map_1\f$ contains pairs (cvFloor(x),
+cvFloor(y)) and \f$map_2\f$ contains indices in a table of interpolation coefficients.
+Output image must be of the same size and depth as input one.
+
+@note
+ - Function textual ID is "org.opencv.core.transform.remap"
+ - Due to current implementation limitations the size of an input and output images should be less than 32767x32767.
+
+@param src Source image.
+@param map1 The first map of either (x,y) points or just x values having the type CV_16SC2,
+CV_32FC1, or CV_32FC2.
+@param map2 The second map of y values having the type CV_16UC1, CV_32FC1, or none (empty map
+if map1 is (x,y) points), respectively.
+@param interpolation Interpolation method (see cv::InterpolationFlags). The methods #INTER_AREA
+and #INTER_LINEAR_EXACT are not supported by this function.
+@param borderMode Pixel extrapolation method (see cv::BorderTypes). When
+borderMode=BORDER_TRANSPARENT, it means that the pixels in the destination image that
+corresponds to the "outliers" in the source image are not modified by the function.
+@param borderValue Value used in case of a constant border. By default, it is 0.
+ */
+GAPI_EXPORTS_W GMat remap(const GMat& src, const Mat& map1, const Mat& map2,
+                          int interpolation, int borderMode = BORDER_CONSTANT,
+                          const Scalar& borderValue = Scalar());
+
+/** @brief Flips a 2D matrix around vertical, horizontal, or both axes.
+
+The function flips the matrix in one of three different ways (row
+and column indices are 0-based):
+\f[\texttt{dst} _{ij} =
+\left\{
+\begin{array}{l l}
+\texttt{src} _{\texttt{src.rows}-i-1,j} & if\;  \texttt{flipCode} = 0 \\
+\texttt{src} _{i, \texttt{src.cols} -j-1} & if\;  \texttt{flipCode} > 0 \\
+\texttt{src} _{ \texttt{src.rows} -i-1, \texttt{src.cols} -j-1} & if\; \texttt{flipCode} < 0 \\
+\end{array}
+\right.\f]
+The example scenarios of using the function are the following:
+*   Vertical flipping of the image (flipCode == 0) to switch between
+    top-left and bottom-left image origin. This is a typical operation
+    in video processing on Microsoft Windows\* OS.
+*   Horizontal flipping of the image with the subsequent horizontal
+    shift and absolute difference calculation to check for a
+    vertical-axis symmetry (flipCode \> 0).
+*   Simultaneous horizontal and vertical flipping of the image with
+    the subsequent shift and absolute difference calculation to check
+    for a central symmetry (flipCode \< 0).
+*   Reversing the order of point arrays (flipCode \> 0 or
+    flipCode == 0).
+Output image must be of the same depth as input one, size should be correct for given flipCode.
+
+@note Function textual ID is "org.opencv.core.transform.flip"
+
+@param src input matrix.
+@param flipCode a flag to specify how to flip the array; 0 means
+flipping around the x-axis and positive value (for example, 1) means
+flipping around y-axis. Negative value (for example, -1) means flipping
+around both axes.
+@sa remap
+*/
+GAPI_EXPORTS_W GMat flip(const GMat& src, int flipCode);
+
+/** @brief Crops a 2D matrix.
+
+The function crops the matrix by given cv::Rect.
+
+Output matrix must be of the same depth as input one, size is specified by given rect size.
+
+@note Function textual ID is "org.opencv.core.transform.crop"
+
+@param src input matrix.
+@param rect a rect to crop a matrix to
+@sa resize
+*/
+GAPI_EXPORTS_W GMat crop(const GMat& src, const Rect& rect);
+
+/** @brief Applies horizontal concatenation to given matrices.
+
+The function horizontally concatenates two GMat matrices (with the same number of rows).
+@code{.cpp}
+    GMat A = { 1, 4,
+               2, 5,
+               3, 6 };
+    GMat B = { 7, 10,
+               8, 11,
+               9, 12 };
+
+    GMat C = gapi::concatHor(A, B);
+    //C:
+    //[1, 4, 7, 10;
+    // 2, 5, 8, 11;
+    // 3, 6, 9, 12]
+@endcode
+Output matrix must have the same number of rows and depth as src1 and src2, and its number of cols must be the sum of the cols of src1 and src2.
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.imgproc.transform.concatHor"
+
+@param src1 first input matrix to be considered for horizontal concatenation.
+@param src2 second input matrix to be considered for horizontal concatenation.
+@sa concatVert
+*/
+GAPI_EXPORTS_W GMat concatHor(const GMat& src1, const GMat& src2);
+
+/** @overload
+The function horizontally concatenates given number of GMat matrices (with the same number of rows).
+Output matrix must have the same number of rows and depth as the input matrices, and its number of columns must be the sum of the columns of the input matrices.
+
+@param v vector of input matrices to be concatenated horizontally.
+*/
+GAPI_EXPORTS_W GMat concatHor(const std::vector<GMat> &v);
+
+/** @brief Applies vertical concatenation to given matrices.
+
+The function vertically concatenates two GMat matrices (with the same number of cols).
+ @code{.cpp}
+    GMat A = { 1, 7,
+               2, 8,
+               3, 9 };
+    GMat B = { 4, 10,
+               5, 11,
+               6, 12 };
+
+    GMat C = gapi::concatVert(A, B);
+    //C:
+    //[1, 7;
+    // 2, 8;
+    // 3, 9;
+    // 4, 10;
+    // 5, 11;
+    // 6, 12]
+ @endcode
+
+Output matrix must have the same number of cols and depth as src1 and src2, and its number of rows must be the sum of the rows of src1 and src2.
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.imgproc.transform.concatVert"
+
+@param src1 first input matrix to be considered for vertical concatenation.
+@param src2 second input matrix to be considered for vertical concatenation.
+@sa concatHor
+*/
+GAPI_EXPORTS_W GMat concatVert(const GMat& src1, const GMat& src2);
+
+/** @overload
+The function vertically concatenates given number of GMat matrices (with the same number of columns).
+Output matrix must have the same number of columns and depth as the input matrices, and its number of rows must be the sum of the rows of the input matrices.
+
+@param v vector of input matrices to be concatenated vertically.
+*/
+GAPI_EXPORTS_W GMat concatVert(const std::vector<GMat> &v);
+
+
+/** @brief Performs a look-up table transform of a matrix.
+
+The function LUT fills the output matrix with values from the look-up table. Indices of the entries
+are taken from the input matrix. That is, the function processes each element of src as follows:
+\f[\texttt{dst} (I)  \leftarrow \texttt{lut(src(I))}\f]
+
+Supported matrix data types are @ref CV_8UC1.
+Output is a matrix of the same size and number of channels as src, and the same depth as lut.
+
+@note Function textual ID is "org.opencv.core.transform.LUT"
+
+@param src input matrix of 8-bit elements.
+@param lut look-up table of 256 elements; in case of multi-channel input array, the table should
+either have a single channel (in this case the same table is used for all channels) or the same
+number of channels as in the input matrix.
+*/
+GAPI_EXPORTS_W GMat LUT(const GMat& src, const Mat& lut);
+
+/** @brief Converts a matrix to another data depth with optional scaling.
+
+The method converts source pixel values to the target data depth. saturate_cast\<\> is applied at
+the end to avoid possible overflows:
+
+\f[m(x,y) = saturate \_ cast<rType>( \alpha (*this)(x,y) +  \beta )\f]
+Output matrix must be of the same size as input one.
+
+@note Function textual ID is "org.opencv.core.transform.convertTo"
+@param src input matrix to be converted from.
+@param rdepth desired output matrix depth (the number of channels is always the same as in the
+input); if rdepth is negative, the output matrix will have the same depth as the input.
+@param alpha optional scale factor.
+@param beta optional delta added to the scaled values.
+ */
+GAPI_EXPORTS_W GMat convertTo(const GMat& src, int rdepth, double alpha=1, double beta=0);
+
+/** @brief Normalizes the norm or value range of an array.
+
+The function scales and shifts the input array elements so that
+\f[\| \texttt{dst} \| _{L_p}= \texttt{alpha}\f]
+(where p=Inf, 1 or 2) when normType=NORM_INF, NORM_L1, or NORM_L2, respectively; or so that
+\f[\min _I  \texttt{dst} (I)= \texttt{alpha} , \, \, \max _I  \texttt{dst} (I)= \texttt{beta}\f]
+when normType=NORM_MINMAX (for dense arrays only).
+
+@note Function textual ID is "org.opencv.core.normalize"
+
+@param src input array.
+@param alpha norm value to normalize to or the lower range boundary in case of the range
+normalization.
+@param beta upper range boundary in case of the range normalization; it is not used for the norm
+normalization.
+@param norm_type normalization type (see cv::NormTypes).
+@param ddepth when negative, the output array has the same type as src; otherwise, it has the same
+number of channels as src and the depth =ddepth.
+@sa norm, Mat::convertTo
+*/
+GAPI_EXPORTS_W GMat normalize(const GMat& src, double alpha, double beta,
+                              int norm_type, int ddepth = -1);
+
+/** @brief Applies a perspective transformation to an image.
+
+The function warpPerspective transforms the source image using the specified matrix:
+
+\f[\texttt{dst} (x,y) =  \texttt{src} \left ( \frac{M_{11} x + M_{12} y + M_{13}}{M_{31} x + M_{32} y + M_{33}} ,
+     \frac{M_{21} x + M_{22} y + M_{23}}{M_{31} x + M_{32} y + M_{33}} \right )\f]
+
+when the flag #WARP_INVERSE_MAP is set. Otherwise, the transformation is first inverted with invert
+and then put in the formula above instead of M. The function cannot operate in-place.
+
+@param src input image.
+@param M \f$3\times 3\f$ transformation matrix.
+@param dsize size of the output image.
+@param flags combination of interpolation methods (#INTER_LINEAR or #INTER_NEAREST) and the
+optional flag #WARP_INVERSE_MAP, that sets M as the inverse transformation (
+\f$\texttt{dst}\rightarrow\texttt{src}\f$ ).
+@param borderMode pixel extrapolation method (#BORDER_CONSTANT or #BORDER_REPLICATE).
+@param borderValue value used in case of a constant border; by default, it equals 0.
+
+@sa  warpAffine, resize, remap, getRectSubPix, perspectiveTransform
+ */
+GAPI_EXPORTS_W GMat warpPerspective(const GMat& src, const Mat& M, const Size& dsize, int flags = cv::INTER_LINEAR,
+                                    int borderMode = cv::BORDER_CONSTANT, const Scalar& borderValue = Scalar());
+
+/** @brief Applies an affine transformation to an image.
+
+The function warpAffine transforms the source image using the specified matrix:
+
+\f[\texttt{dst} (x,y) =  \texttt{src} ( \texttt{M} _{11} x +  \texttt{M} _{12} y +  \texttt{M} _{13}, \texttt{M} _{21} x +  \texttt{M} _{22} y +  \texttt{M} _{23})\f]
+
+when the flag #WARP_INVERSE_MAP is set. Otherwise, the transformation is first inverted
+with #invertAffineTransform and then put in the formula above instead of M. The function cannot
+operate in-place.
+
+@param src input image.
+@param M \f$2\times 3\f$ transformation matrix.
+@param dsize size of the output image.
+@param flags combination of interpolation methods (see #InterpolationFlags) and the optional
+flag #WARP_INVERSE_MAP that means that M is the inverse transformation (
+\f$\texttt{dst}\rightarrow\texttt{src}\f$ ).
+@param borderMode pixel extrapolation method (see #BorderTypes);
+borderMode=#BORDER_TRANSPARENT isn't supported
+@param borderValue value used in case of a constant border; by default, it is 0.
+
+@sa  warpPerspective, resize, remap, getRectSubPix, transform
+ */
+GAPI_EXPORTS_W GMat warpAffine(const GMat& src, const Mat& M, const Size& dsize, int flags = cv::INTER_LINEAR,
+                               int borderMode = cv::BORDER_CONSTANT, const Scalar& borderValue = Scalar());
+//! @} gapi_transform
+
+/** @brief Finds centers of clusters and groups input samples around the clusters.
+
+The function kmeans implements a k-means algorithm that finds the centers of K clusters
+and groups the input samples around the clusters. As an output, \f$\texttt{bestLabels}_i\f$
+contains a 0-based cluster index for the \f$i^{th}\f$ sample.
+
+@note
+ - Function textual ID is "org.opencv.core.kmeansND"
+ - In case of an N-dimensional point set given, input GMat can have the following traits:
+2 dimensions, a single row or column if there are N channels,
+or N columns if there is a single channel. Mat should have @ref CV_32F depth.
+ - However, if a GMat with height != 1, width != 1, channels != 1 is given as data, it is treated
+as A n-dimensional samples, where A = height, n = width * channels.
+ - In case of GMat given as data:
+     - the output labels are returned as 1-channel GMat with sizes
+width = 1, height = A, where A is samples amount, or width = bestLabels.width,
+height = bestLabels.height if bestLabels given;
+     - the cluster centers are returned as 1-channel GMat with sizes
+width = n, height = K, where n is samples' dimensionality and K is clusters' amount.
+ - As one of possible usages, if you want to control the initial labels for each attempt
+by yourself, you can utilize just the core of the function. To do that, set the number
+of attempts to 1, initialize labels each time using a custom algorithm, pass them with the
+( flags = #KMEANS_USE_INITIAL_LABELS ) flag, and then choose the best (most-compact) clustering.
+
+@param data Data for clustering. An array of N-Dimensional points with float coordinates is needed.
+Function can take GArray<Point2f>, GArray<Point3f> for 2D and 3D cases or GMat for any
+dimensionality and channels.
+@param K Number of clusters to split the set by.
+@param bestLabels Optional input integer array that can store the supposed initial cluster indices
+for every sample. Used when ( flags = #KMEANS_USE_INITIAL_LABELS ) flag is set.
+@param criteria The algorithm termination criteria, that is, the maximum number of iterations
+and/or the desired accuracy. The accuracy is specified as criteria.epsilon. As soon as each of
+the cluster centers moves by less than criteria.epsilon on some iteration, the algorithm stops.
+@param attempts Number of times the algorithm is executed using different
+initial labellings. The algorithm returns the labels that yield the best compactness (see the first
+function return value).
+@param flags Flag that can take values of cv::KmeansFlags .
+
+@return
+ - Compactness measure that is computed as
+\f[\sum _i  \| \texttt{samples} _i -  \texttt{centers} _{ \texttt{labels} _i} \| ^2\f]
+after every attempt. The best (minimum) value is chosen and the corresponding labels and the
+compactness value are returned by the function.
+ - Integer array that stores the cluster indices for every sample.
+ - Array of the cluster centers.
+*/
+GAPI_EXPORTS_W std::tuple<GOpaque<double>,GMat,GMat>
+kmeans(const GMat& data, const int K, const GMat& bestLabels,
+       const TermCriteria& criteria, const int attempts, const KmeansFlags flags);
+
+/** @overload
+@note
+ - Function textual ID is "org.opencv.core.kmeansNDNoInit"
+ - #KMEANS_USE_INITIAL_LABELS flag must not be set while using this overload.
+ */
+GAPI_EXPORTS_W std::tuple<GOpaque<double>,GMat,GMat>
+kmeans(const GMat& data, const int K, const TermCriteria& criteria, const int attempts,
+       const KmeansFlags flags);
+
+/** @overload
+@note Function textual ID is "org.opencv.core.kmeans2D"
+ */
+GAPI_EXPORTS_W std::tuple<GOpaque<double>,GArray<int>,GArray<Point2f>>
+kmeans(const GArray<Point2f>& data, const int K, const GArray<int>& bestLabels,
+       const TermCriteria& criteria, const int attempts, const KmeansFlags flags);
+
+/** @overload
+@note Function textual ID is "org.opencv.core.kmeans3D"
+ */
+GAPI_EXPORTS_W std::tuple<GOpaque<double>,GArray<int>,GArray<Point3f>>
+kmeans(const GArray<Point3f>& data, const int K, const GArray<int>& bestLabels,
+       const TermCriteria& criteria, const int attempts, const KmeansFlags flags);
+
+
+/** @brief Transposes a matrix.
+
+The function transposes the matrix:
+\f[\texttt{dst} (i,j) =  \texttt{src} (j,i)\f]
+
+@note
+ - Function textual ID is "org.opencv.core.transpose"
+ - No complex conjugation is done in case of a complex matrix. It should be done separately if needed.
+
+@param src input array.
+*/
+GAPI_EXPORTS_W GMat transpose(const GMat& src);
+
+
+namespace streaming {
+/** @brief Gets dimensions from Mat.
+
+@note Function textual ID is "org.opencv.streaming.size"
+
+@param src Input tensor
+@return Size (tensor dimensions).
+*/
+GAPI_EXPORTS_W GOpaque<Size> size(const GMat& src);
+
+/** @overload
+Gets dimensions from rectangle.
+
+@note Function textual ID is "org.opencv.streaming.sizeR"
+
+@param r Input rectangle.
+@return Size (rectangle dimensions).
+*/
+GAPI_EXPORTS_W GOpaque<Size> size(const GOpaque<Rect>& r);
+
+/** @brief Gets dimensions from MediaFrame.
+
+@note Function textual ID is "org.opencv.streaming.sizeMF"
+
+@param src Input frame
+@return Size (frame dimensions).
+*/
+GAPI_EXPORTS_W GOpaque<Size> size(const GFrame& src);
+} //namespace streaming
+} //namespace gapi
+} //namespace cv
+
+#endif //OPENCV_GAPI_CORE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/core.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/core.hpp
new file mode 100644
index 000000000000..ee86fb72c234
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/core.hpp
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_CPU_CORE_API_HPP
+#define OPENCV_GAPI_CPU_CORE_API_HPP
+
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS
+
+namespace cv {
+namespace gapi {
+namespace core {
+namespace cpu {
+
+GAPI_EXPORTS_W cv::GKernelPackage kernels();
+
+} // namespace cpu
+} // namespace core
+} // namespace gapi
+} // namespace cv
+
+
+#endif // OPENCV_GAPI_CPU_CORE_API_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/gcpukernel.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/gcpukernel.hpp
new file mode 100644
index 000000000000..eb5f7847478c
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/gcpukernel.hpp
@@ -0,0 +1,542 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2022 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCPUKERNEL_HPP
+#define OPENCV_GAPI_GCPUKERNEL_HPP
+
+#if defined _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4702)  // "Unreachable code" on postprocess(...) call inside OCVCallHelper
+#endif
+
+#include <functional>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <opencv2/core/mat.hpp>
+#include <opencv2/gapi/gcommon.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+#include <opencv2/gapi/garg.hpp>
+#include <opencv2/gapi/gmetaarg.hpp>
+#include <opencv2/gapi/util/compiler_hints.hpp> //suppress_unused_warning
+#include <opencv2/gapi/util/util.hpp>
+
+// FIXME: namespace scheme for backends?
+namespace cv {
+
+namespace gimpl
+{
+    // Forward-declare an internal class
+    class GCPUExecutable;
+} // namespace gimpl
+
+namespace gapi
+{
+/**
+ * @brief This namespace contains G-API CPU backend functions,
+ * structures, and symbols.
+ */
+namespace cpu
+{
+    /**
+     * \addtogroup gapi_std_backends
+     * @{
+     *
+     * @brief G-API backends available in this OpenCV version
+     *
+     * G-API backends play a cornerstone role in G-API execution
+     * stack. Every backend is hardware-oriented and thus can run its
+     * kernels efficiently on the target platform.
+     *
+     * Backends are usually "black boxes" for G-API users -- on the API
+     * side, all backends are represented as different objects of the
+     * same class cv::gapi::GBackend.
+     * Users can choose between backends by specifying which kernels to use.
+     *
+     * @sa @ref gapi_hld
+     */
+
+    /**
+     * @brief Get a reference to CPU (OpenCV) backend.
+     *
+     * This is the default backend in G-API at the moment, providing
+     * broader functional coverage but losing some graph model
+     * advantages. Provided mostly for reference and prototyping
+     * purposes.
+     *
+     * @sa gapi_std_backends
+     */
+    GAPI_EXPORTS cv::gapi::GBackend backend();
+    /** @} */
+
+    class GOCVFunctor;
+
+    //! @cond IGNORED
+    template<typename K, typename Callable>
+    GOCVFunctor ocv_kernel(const Callable& c);
+
+    template<typename K, typename Callable>
+    GOCVFunctor ocv_kernel(Callable& c);
+    //! @endcond
+
+} // namespace cpu
+} // namespace gapi
+
+// Represents arguments which are passed to a wrapped CPU function
+// FIXME: put into detail?
+class GAPI_EXPORTS GCPUContext
+{
+public:
+    // Generic accessor API: returns m_args[input] viewed as T (index is range-checked by at())
+    template<typename T>
+    const T& inArg(int input) { return m_args.at(input).get<T>(); }
+
+    // Syntax sugar: typed accessors for inputs/outputs (only declared here)
+    const cv::Mat&   inMat(int input);
+    cv::Mat&         outMatR(int output); // FIXME: Avoid cv::Mat m = ctx.outMatR()
+
+    const cv::Scalar& inVal(int input);
+    cv::Scalar& outValR(int output); // FIXME: Avoid cv::Scalar s = ctx.outValR()
+    cv::MediaFrame& outFrame(int output);
+    template<typename T> std::vector<T>& outVecR(int output) // FIXME: the same issue
+    {
+        return outVecRef(output).wref<T>();
+    }
+    template<typename T> T& outOpaqueR(int output) // FIXME: the same issue
+    {
+        return outOpaqueRef(output).wref<T>();
+    }
+
+    GArg state() // Returns a copy of the m_state argument
+    {
+        return m_state;
+    }
+
+protected:
+    detail::VectorRef& outVecRef(int output);
+    detail::OpaqueRef& outOpaqueRef(int output);
+
+    std::vector<GArg> m_args; // Indexed by inArg()
+    GArg m_state; // Exposed through state()
+
+    //FIXME: avoid conversion of arguments from internal representation to OpenCV one on each call
+    //to OCV kernel. (This can be achieved by two one-time conversions in GCPUExecutable::run:
+    //once on enter for input and output arguments, and once before return for output arguments only.)
+    std::unordered_map<std::size_t, GRunArgP> m_results;
+
+    friend class gimpl::GCPUExecutable;
+};
+
+class GAPI_EXPORTS GCPUKernel // Pairs a kernel's run callback with an optional setup callback
+{
+public:
+    // This function is a kernel's execution entry point (does the processing work)
+    using RunF = std::function<void(GCPUContext &)>;
+    // This function is a stateful kernel's setup routine (configures state)
+    using SetupF = std::function<void(const GMetaArgs &, const GArgs &,
+                                      GArg &, const GCompileArgs &)>;
+
+    GCPUKernel();
+    GCPUKernel(const RunF& runF, const SetupF& setupF = nullptr); // setupF may be omitted
+
+    RunF m_runF = nullptr; // empty std::function by default
+    SetupF m_setupF = nullptr; // empty std::function by default
+
+    bool m_isStateful = false; // NOTE(review): presumably set when a setup routine is given -- confirm in the .cpp
+};
+
+// FIXME: This is an ugly ad-hoc implementation. TODO: refactor
+
+namespace detail
+{
+template<class T> struct get_in; // Maps a G-API input type to the value handed to the OCV kernel
+template<> struct get_in<cv::GMat>
+{
+    static cv::Mat    get(GCPUContext &ctx, int idx) { return ctx.inMat(idx); } // cv::Mat returned by value
+};
+template<> struct get_in<cv::GMatP>
+{
+    static cv::Mat    get(GCPUContext &ctx, int idx) { return get_in<cv::GMat>::get(ctx, idx); } // same as GMat
+};
+template<> struct get_in<cv::GFrame>
+{
+    static cv::MediaFrame get(GCPUContext &ctx, int idx) { return ctx.inArg<cv::MediaFrame>(idx); }
+};
+template<> struct get_in<cv::GScalar>
+{
+    static cv::Scalar get(GCPUContext &ctx, int idx) { return ctx.inVal(idx); } // cv::Scalar returned by value
+};
+template<typename U> struct get_in<cv::GArray<U> >
+{
+    static const std::vector<U>& get(GCPUContext &ctx, int idx) { return ctx.inArg<VectorRef>(idx).rref<U>(); } // by const reference
+};
+template<typename U> struct get_in<cv::GOpaque<U> >
+{
+    static const U& get(GCPUContext &ctx, int idx) { return ctx.inArg<OpaqueRef>(idx).rref<U>(); } // by const reference
+};
+
+//FIXME(dm): GArray<Mat>/GArray<GMat> conversion should be done more gracefully in the system
+template<> struct get_in<cv::GArray<cv::GMat> >: public get_in<cv::GArray<cv::Mat> >
+{
+};
+
+//FIXME(dm): GArray<Scalar>/GArray<GScalar> conversion should be done more gracefully in the system
+template<> struct get_in<cv::GArray<cv::GScalar> >: public get_in<cv::GArray<cv::Scalar> >
+{
+};
+
+// FIXME(dm): GArray<vector<U>>/GArray<GArray<U>> conversion should be done more gracefully in the system
+template<typename U> struct get_in<cv::GArray<cv::GArray<U>> >: public get_in<cv::GArray<std::vector<U>> >
+{
+};
+
+//FIXME(dm): GOpaque<Mat>/GOpaque<GMat> conversion should be done more gracefully in the system
+template<> struct get_in<cv::GOpaque<cv::GMat> >: public get_in<cv::GOpaque<cv::Mat> >
+{
+};
+
+
+//FIXME(dm): GOpaque<Scalar>/GOpaque<GScalar> conversion should be done more gracefully in the system
+template<> struct get_in<cv::GOpaque<cv::GScalar> >: public get_in<cv::GOpaque<cv::Scalar> >
+{ // Delegates to the cv::Scalar accessor (not cv::Mat), consistent with get_in<GArray<GScalar>>
+};
+
+template<class T> struct get_in // Fallback for input types that have no dedicated specialization
+{
+    static T get(GCPUContext &ctx, int idx) { return ctx.inArg<T>(idx); } // T returned by value
+};
+
+struct tracked_cv_mat{
+    // Keeps a cv::Mat copy of the output plus the data pointer it had at construction time.
+    tracked_cv_mat(cv::Mat& mat) : r{mat}, original_data{mat.data} {}
+    cv::Mat r;
+    uchar* original_data;
+
+    operator cv::Mat& (){ return r;}
+    // Reports a std::logic_error via util::throw_error if r.data no longer equals original_data.
+    void validate() const{
+        if (r.data == original_data) return;
+        util::throw_error
+            (std::logic_error
+             ("OpenCV kernel output parameter was reallocated. \n"
+              "Incorrect meta data was provided ?"));
+    }
+};
+
+template<typename... Outputs>
+void postprocess(Outputs&... outs) // Runs validate() on every tracked_cv_mat among outs
+{
+    struct
+    {
+        void operator()(tracked_cv_mat* bm) { bm->validate();  } // tracked Mat outputs are checked
+        void operator()(...)                {                  } // any other output type: no-op
+
+    } validate;
+    //dummy array to unfold parameter pack; the leading 0 keeps it non-empty for an empty pack
+    int dummy[] = { 0, (validate(&outs), 0)... };
+    cv::util::suppress_unused_warning(dummy);
+}
+
+template<class T> struct get_out; // Maps a G-API output type to the object handed to the OCV kernel
+template<> struct get_out<cv::GMat>
+{
+    static tracked_cv_mat get(GCPUContext &ctx, int idx)
+    {
+        auto& r = ctx.outMatR(idx);
+        return {r}; // wrapped so tracked_cv_mat records the current data pointer
+    }
+};
+template<> struct get_out<cv::GMatP>
+{
+    static tracked_cv_mat get(GCPUContext &ctx, int idx)
+    {
+        return get_out<cv::GMat>::get(ctx, idx); // GMatP outputs are fetched exactly like GMat
+    }
+};
+template<> struct get_out<cv::GScalar>
+{
+    static cv::Scalar& get(GCPUContext &ctx, int idx)
+    {
+        return ctx.outValR(idx); // writable reference from the context
+    }
+};
+template<> struct get_out<cv::GFrame>
+{
+    static cv::MediaFrame& get(GCPUContext &ctx, int idx)
+    {
+        return ctx.outFrame(idx); // writable reference from the context
+    }
+};
+template<typename U> struct get_out<cv::GArray<U>>
+{
+    static std::vector<U>& get(GCPUContext &ctx, int idx)
+    {
+        return ctx.outVecR<U>(idx); // writable reference from the context
+    }
+};
+
+//FIXME(dm): GArray<Mat>/GArray<GMat> conversion should be done more gracefully in the system
+template<> struct get_out<cv::GArray<cv::GMat> >: public get_out<cv::GArray<cv::Mat> >
+{
+};
+
+// FIXME(dm): GArray<vector<U>>/GArray<GArray<U>> conversion should be done more gracefully in the system
+template<typename U> struct get_out<cv::GArray<cv::GArray<U>> >: public get_out<cv::GArray<std::vector<U>> >
+{
+};
+
+template<typename U> struct get_out<cv::GOpaque<U>>
+{
+    static U& get(GCPUContext &ctx, int idx)
+    {
+        return ctx.outOpaqueR<U>(idx); // writable reference from the context
+    }
+};
+
+template<typename, typename>
+struct OCVSetupHelper;
+
+template<typename Impl, typename... Ins>
+struct OCVSetupHelper<Impl, std::tuple<Ins...>>
+{
+    // Both 'setup_impl' overloads use a trailing 'auto ... -> decltype(...)' return type
+    // to detect which 'Impl::setup' signature exists (SFINAE).
+    // The 'decltype' operand is a comma expression: its first operand is the attempted
+    // call to the desired 'Impl::setup' (the overload is discarded if that call is ill-formed)
+    // and its second operand is 'void()', which makes the return type 'void'.
+    //
+    // SFINAE for 'Impl::setup' which accepts compile arguments.
+    template<int... IIs>
+    static auto setup_impl(const GMetaArgs &metaArgs, const GArgs &args,
+                           GArg &state, const GCompileArgs &compileArgs,
+                           detail::Seq<IIs...>) ->
+        decltype(Impl::setup(detail::get_in_meta<Ins>(metaArgs, args, IIs)...,
+                             std::declval<typename std::add_lvalue_reference<
+                                              std::shared_ptr<typename Impl::State>
+                                                                            >::type
+                                         >(),
+                            compileArgs)
+                 , void())
+    {
+        // TODO: unique_ptr <-> shared_ptr conversion ?
+        // To check: Conversion is possible only if the state which should be passed to
+        // 'setup' user callback isn't required to have previous value
+        std::shared_ptr<typename Impl::State> stPtr;
+        Impl::setup(detail::get_in_meta<Ins>(metaArgs, args, IIs)..., stPtr, compileArgs);
+        state = GArg(stPtr); // hand the state filled in by Impl::setup back to the caller
+    }
+
+    // SFINAE for 'Impl::setup' which doesn't accept compile arguments.
+    template<int... IIs>
+    static auto setup_impl(const GMetaArgs &metaArgs, const GArgs &args,
+                           GArg &state, const GCompileArgs &/* compileArgs */,
+                           detail::Seq<IIs...>) ->
+        decltype(Impl::setup(detail::get_in_meta<Ins>(metaArgs, args, IIs)...,
+                             std::declval<typename std::add_lvalue_reference<
+                                              std::shared_ptr<typename Impl::State>
+                                                                            >::type
+                                         >()
+                            )
+                 , void())
+    {
+        // The same comment as in the 'setup_impl' overload above.
+        std::shared_ptr<typename Impl::State> stPtr;
+        Impl::setup(detail::get_in_meta<Ins>(metaArgs, args, IIs)..., stPtr);
+        state = GArg(stPtr); // hand the state filled in by Impl::setup back to the caller
+    }
+
+    static void setup(const GMetaArgs &metaArgs, const GArgs &args,
+                      GArg& state, const GCompileArgs &compileArgs)
+    {
+        setup_impl(metaArgs, args, state, compileArgs, // one index per entry of Ins
+                   typename detail::MkSeq<sizeof...(Ins)>::type());
+    }
+};
+
+// OCVCallHelper is a helper class to call stateless OCV kernels and OCV kernel functors.
+template<typename, typename, typename>
+struct OCVCallHelper;
+
+// FIXME: probably can be simplified with std::apply or analogue.
+template<typename Impl, typename... Ins, typename... Outs>
+struct OCVCallHelper<Impl, std::tuple<Ins...>, std::tuple<Outs...>>
+{
+    template<typename... Inputs>
+    struct call_and_postprocess
+    {
+        template<typename... Outputs>
+        static void call(Inputs&&... ins, Outputs&&... outs)
+        {
+            //not using a std::forward on outs is deliberate in order to
+            //cause compilation error, by trying to bind rvalue references to lvalue references
+            Impl::run(std::forward<Inputs>(ins)..., outs...);
+            postprocess(outs...);
+        }
+
+        template<typename... Outputs>
+        static void call(Impl& impl, Inputs&&... ins, Outputs&&... outs)
+        {
+            impl(std::forward<Inputs>(ins)..., outs...);
+        }
+    };
+
+    template<int... IIs, int... OIs>
+    static void call_impl(GCPUContext &ctx, detail::Seq<IIs...>, detail::Seq<OIs...>)
+    {
+        //Make sure that OpenCV kernels do not reallocate memory for output parameters
+        //by comparing its state (data ptr) before and after the call.
+        //This is done by converting each output Mat into tracked_cv_mat object, and binding
+        //them to parameters of ad-hoc function
+        call_and_postprocess<decltype(get_in<Ins>::get(ctx, IIs))...>
+            ::call(get_in<Ins>::get(ctx, IIs)..., get_out<Outs>::get(ctx, OIs)...);
+    }
+
+    template<int... IIs, int... OIs>
+    static void call_impl(cv::GCPUContext &ctx, Impl& impl,
+                          detail::Seq<IIs...>, detail::Seq<OIs...>)
+    {
+        call_and_postprocess<decltype(get_in<Ins>::get(ctx, IIs))...>
+            ::call(impl, get_in<Ins>::get(ctx, IIs)..., get_out<Outs>::get(ctx, OIs)...);
+    }
+
+    static void call(GCPUContext &ctx)
+    {
+        call_impl(ctx,
+                  typename detail::MkSeq<sizeof...(Ins)>::type(),
+                  typename detail::MkSeq<sizeof...(Outs)>::type());
+    }
+
+    // NB: Same as call but calling the object
+    // This is necessary for kernel implementations that have a state
+    // and are represented as an object
+    static void callFunctor(cv::GCPUContext &ctx, Impl& impl)
+    {
+        call_impl(ctx, impl,
+                  typename detail::MkSeq<sizeof...(Ins)>::type(),
+                  typename detail::MkSeq<sizeof...(Outs)>::type());
+    }
+};
+
+// OCVStCallHelper is a helper class to call stateful OCV kernels.
+template<typename, typename, typename>
+struct OCVStCallHelper;
+
+template<typename Impl, typename... Ins, typename... Outs>
+struct OCVStCallHelper<Impl, std::tuple<Ins...>, std::tuple<Outs...>> :
+    OCVCallHelper<Impl, std::tuple<Ins...>, std::tuple<Outs...>>
+{
+    template<typename... Inputs>
+    struct call_and_postprocess
+    {
+        template<typename... Outputs>
+        static void call(typename Impl::State& st, Inputs&&... ins, Outputs&&... outs)
+        {
+            Impl::run(std::forward<Inputs>(ins)..., outs..., st);
+            postprocess(outs...);
+        }
+    };
+
+    template<int... IIs, int... OIs>
+    static void call_impl(GCPUContext &ctx, detail::Seq<IIs...>, detail::Seq<OIs...>)
+    {
+        auto& st = *ctx.state().get<std::shared_ptr<typename Impl::State>>();
+        call_and_postprocess<decltype(get_in<Ins>::get(ctx, IIs))...>
+            ::call(st, get_in<Ins>::get(ctx, IIs)..., get_out<Outs>::get(ctx, OIs)...);
+    }
+
+    static void call(GCPUContext &ctx)
+    {
+        call_impl(ctx,
+                  typename detail::MkSeq<sizeof...(Ins)>::type(),
+                  typename detail::MkSeq<sizeof...(Outs)>::type());
+    }
+};
+
+} // namespace detail
+
+template<class Impl, class K>
+class GCPUKernelImpl: public cv::detail::KernelTag
+{
+    using CallHelper = cv::detail::OCVCallHelper<Impl, typename K::InArgs, typename K::OutArgs>;
+
+public:
+    using API = K;
+
+    static cv::gapi::GBackend backend() { return cv::gapi::cpu::backend(); }
+    static cv::GCPUKernel      kernel() { return GCPUKernel(&CallHelper::call); }
+};
+
+template<class Impl, class K, class S>
+class GCPUStKernelImpl: public cv::detail::KernelTag
+{
+    using StSetupHelper = detail::OCVSetupHelper<Impl, typename K::InArgs>;
+    using StCallHelper  = detail::OCVStCallHelper<Impl, typename K::InArgs, typename K::OutArgs>;
+
+public:
+    using API = K;
+    using State = S;
+
+    static cv::gapi::GBackend backend() { return cv::gapi::cpu::backend(); }
+    static cv::GCPUKernel     kernel()  { return GCPUKernel(&StCallHelper::call,
+                                                            &StSetupHelper::setup); }
+};
+
+#define GAPI_OCV_KERNEL(Name, API) struct Name: public cv::GCPUKernelImpl<Name, API>
+
+// TODO: Reuse Anatoliy's logic for support of types with commas in macro.
+//       Retrieve the common part from Anatoliy's logic to the separate place.
+#define GAPI_OCV_KERNEL_ST(Name, API, State)                   \
+    struct Name: public cv::GCPUStKernelImpl<Name, API, State> \
+
+/// @private
+class gapi::cpu::GOCVFunctor : public gapi::GFunctor
+{
+public:
+    using Impl = std::function<void(GCPUContext &)>;
+    using Meta = cv::GKernel::M;
+
+    GOCVFunctor(const char* id, const Meta &meta, const Impl& impl)
+        : gapi::GFunctor(id), impl_{GCPUKernel(impl), meta}
+    {
+    }
+
+    GKernelImpl    impl()    const override { return impl_;                }
+    gapi::GBackend backend() const override { return gapi::cpu::backend(); }
+
+private:
+    GKernelImpl impl_;
+};
+
+//! @cond IGNORED
+template<typename K, typename Callable>
+gapi::cpu::GOCVFunctor gapi::cpu::ocv_kernel(Callable& c)
+{
+    using P = cv::detail::OCVCallHelper<Callable, typename K::InArgs, typename K::OutArgs>;
+    return GOCVFunctor{ K::id()
+                      , &K::getOutMeta
+                      , std::bind(&P::callFunctor, std::placeholders::_1, std::ref(c))
+                      };
+}
+
+template<typename K, typename Callable>
+gapi::cpu::GOCVFunctor gapi::cpu::ocv_kernel(const Callable& c)
+{
+    using P = cv::detail::OCVCallHelper<Callable, typename K::InArgs, typename K::OutArgs>;
+    return GOCVFunctor{ K::id()
+                      , &K::getOutMeta
+                      , std::bind(&P::callFunctor, std::placeholders::_1, c)
+                      };
+}
+//! @endcond
+
+} // namespace cv
+
+#if defined _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif // OPENCV_GAPI_GCPUKERNEL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/imgproc.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/imgproc.hpp
new file mode 100644
index 000000000000..0b96db08ae18
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/imgproc.hpp
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_CPU_IMGPROC_API_HPP
+#define OPENCV_GAPI_CPU_IMGPROC_API_HPP
+
+#include <opencv2/core/cvdef.h>     // GAPI_EXPORTS
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+
+namespace cv {
+namespace gapi {
+namespace imgproc {
+namespace cpu {
+
+GAPI_EXPORTS GKernelPackage kernels();
+
+} // namespace cpu
+} // namespace imgproc
+} // namespace gapi
+} // namespace cv
+
+
+#endif // OPENCV_GAPI_CPU_IMGPROC_API_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/ot.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/ot.hpp
new file mode 100644
index 000000000000..03dbe904cc9d
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/ot.hpp
@@ -0,0 +1,29 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_CPU_OT_API_HPP
+#define OPENCV_GAPI_CPU_OT_API_HPP
+
+#include <opencv2/core/cvdef.h>     // GAPI_EXPORTS
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+
+namespace cv {
+namespace gapi {
+/**
+ * @brief This namespace contains G-API Operation Types for
+ * VAS Object Tracking module functionality.
+ */
+namespace ot {
+namespace cpu {
+GAPI_EXPORTS_W GKernelPackage kernels();
+} // namespace cpu
+} // namespace ot
+} // namespace gapi
+} // namespace cv
+
+
+#endif // OPENCV_GAPI_CPU_OT_API_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/stereo.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/stereo.hpp
new file mode 100644
index 000000000000..e2a2242bd0b8
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/stereo.hpp
@@ -0,0 +1,48 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+#ifndef OPENCV_GAPI_CPU_STEREO_API_HPP
+#define OPENCV_GAPI_CPU_STEREO_API_HPP
+
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+
+namespace cv {
+namespace gapi {
+namespace calib3d {
+namespace cpu {
+
+GAPI_EXPORTS GKernelPackage kernels();
+
+/** @brief Structure for the Stereo operation initialization parameters.*/
+struct GAPI_EXPORTS StereoInitParam {
+    StereoInitParam(int nD, int bS, double bL, double f):
+        numDisparities(nD), blockSize(bS), baseline(bL), focus(f) {}
+
+    StereoInitParam() = default;
+
+    int numDisparities = 0;
+    int blockSize = 21;
+    double baseline = 63.5;
+    double focus = 3.6;
+};
+
+} // namespace cpu
+} // namespace calib3d
+} // namespace gapi
+
+namespace detail {
+
+    template<> struct CompileArgTag<cv::gapi::calib3d::cpu::StereoInitParam> {
+    static const char* tag() {
+        return "org.opencv.stereoInit";
+    }
+};
+
+} // namespace detail
+} // namespace cv
+
+
+#endif // OPENCV_GAPI_CPU_STEREO_API_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/video.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/video.hpp
new file mode 100644
index 000000000000..d3c1f2e67045
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/cpu/video.hpp
@@ -0,0 +1,25 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2020 Intel Corporation
+
+#ifndef OPENCV_GAPI_CPU_VIDEO_API_HPP
+#define OPENCV_GAPI_CPU_VIDEO_API_HPP
+
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+
+namespace cv {
+namespace gapi {
+namespace video {
+namespace cpu {
+
+GAPI_EXPORTS GKernelPackage kernels();
+
+} // namespace cpu
+} // namespace video
+} // namespace gapi
+} // namespace cv
+
+
+#endif // OPENCV_GAPI_CPU_VIDEO_API_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/fluid/core.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/fluid/core.hpp
new file mode 100644
index 000000000000..a4329d6f50f8
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/fluid/core.hpp
@@ -0,0 +1,20 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_FLUID_CORE_HPP
+#define OPENCV_GAPI_FLUID_CORE_HPP
+
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS
+
+namespace cv { namespace gapi { namespace core { namespace fluid {
+
+GAPI_EXPORTS_W cv::GKernelPackage kernels();
+
+}}}}
+
+#endif // OPENCV_GAPI_FLUID_CORE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/fluid/gfluidbuffer.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/fluid/gfluidbuffer.hpp
new file mode 100644
index 000000000000..551f0a398feb
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/fluid/gfluidbuffer.hpp
@@ -0,0 +1,154 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_FLUID_BUFFER_HPP
+#define OPENCV_GAPI_FLUID_BUFFER_HPP
+
+#include <list>
+#include <numeric> // accumulate
+#include <ostream> // ostream
+#include <cstdint> // uint8_t
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/gmat.hpp>
+
+#include <opencv2/gapi/util/optional.hpp>
+
+namespace cv {
+namespace gapi {
+namespace fluid {
+
+struct Border
+{
+    // This constructor is required to support existing kernels which are part of G-API
+    Border(int _type, cv::Scalar _val) : type(_type), value(_val) {}
+
+    int type;
+    cv::Scalar value;
+};
+
+using BorderOpt = util::optional<Border>;
+
+bool operator == (const Border& b1, const Border& b2);
+
+class GAPI_EXPORTS Buffer;
+
+class GAPI_EXPORTS View
+{
+public:
+    struct Cache
+    {
+        std::vector<const uint8_t*> m_linePtrs;
+        GMatDesc m_desc;
+        int m_border_size = 0;
+
+        inline const uint8_t* linePtr(int index) const
+        {
+            // "out_of_window" check:
+            // user must not request the lines which are outside of specified kernel window
+            GAPI_DbgAssert(index >= -m_border_size
+                        && index <  -m_border_size + static_cast<int>(m_linePtrs.size()));
+            return m_linePtrs[index + m_border_size];
+        }
+    };
+
+    const inline uint8_t* InLineB(int index) const // -(w-1)/2...0...+(w-1)/2 for Filters
+    {
+        return m_cache->linePtr(index);
+    }
+
+    template<typename T> const inline T* InLine(int i) const
+    {
+        const uint8_t* ptr = this->InLineB(i);
+        return reinterpret_cast<const T*>(ptr);
+    }
+
+    inline operator bool() const { return m_priv != nullptr; }
+    bool ready() const;
+    inline int length() const { return m_cache->m_desc.size.width; }
+    int y() const;
+
+    inline const GMatDesc& meta() const { return m_cache->m_desc; }
+
+    class GAPI_EXPORTS Priv;      // internal use only
+    Priv& priv();               // internal use only
+    const Priv& priv() const;   // internal use only
+
+    View();
+    View(std::unique_ptr<Priv>&& p);
+    View(View&& v);
+    View& operator=(View&& v);
+    ~View();
+
+private:
+    std::unique_ptr<Priv> m_priv;
+    const Cache* m_cache = nullptr;
+};
+
+class GAPI_EXPORTS Buffer
+{
+public:
+    struct Cache
+    {
+        std::vector<uint8_t*> m_linePtrs;
+        GMatDesc m_desc;
+    };
+
+    // Default constructor (executable creation stage,
+    // all following initialization performed in Priv::init())
+    Buffer();
+    // Scratch constructor (user kernels)
+    Buffer(const cv::GMatDesc &desc);
+
+    // Constructor for intermediate buffers (for tests)
+    Buffer(const cv::GMatDesc &desc,
+           int max_line_consumption, int border_size,
+           int skew,
+           int wlpi,
+           BorderOpt border);
+    // Constructor for in/out buffers (for tests)
+    Buffer(const cv::Mat &data, bool is_input);
+    ~Buffer();
+    Buffer& operator=(Buffer&&);
+
+    inline uint8_t* OutLineB(int index = 0)
+    {
+        return m_cache->m_linePtrs[index];
+    }
+
+    template<typename T> inline T* OutLine(int index = 0)
+    {
+        uint8_t* ptr = this->OutLineB(index);
+        return reinterpret_cast<T*>(ptr);
+    }
+
+    int y() const;
+
+    int linesReady() const;
+    void debug(std::ostream &os) const;
+    inline int length() const { return m_cache->m_desc.size.width; }
+    int lpi() const;  // LPI for WRITER
+
+    inline const GMatDesc& meta() const { return m_cache->m_desc; }
+
+    View mkView(int borderSize, bool ownStorage);
+    void addView(const View* v);
+
+    class GAPI_EXPORTS Priv;      // internal use only
+    Priv& priv();               // internal use only
+    const Priv& priv() const;   // internal use only
+
+private:
+    std::unique_ptr<Priv> m_priv;
+    const Cache* m_cache;
+};
+
+} // namespace cv::gapi::fluid
+} // namespace cv::gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_FLUID_BUFFER_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/fluid/gfluidkernel.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/fluid/gfluidkernel.hpp
new file mode 100644
index 000000000000..c3ae9dfdd6e0
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/fluid/gfluidkernel.hpp
@@ -0,0 +1,442 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2019 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_FLUID_KERNEL_HPP
+#define OPENCV_GAPI_FLUID_KERNEL_HPP
+
+#include <vector>
+#include <functional>
+#include <map>
+#include <unordered_map>
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/gcommon.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+#include <opencv2/gapi/garg.hpp>
+
+#include <opencv2/gapi/fluid/gfluidbuffer.hpp>
+
+// FIXME: namespace scheme for backends?
+namespace cv {
+
+namespace gapi
+{
+/**
+ * @brief This namespace contains G-API Fluid backend functions, structures, and symbols.
+ */
+namespace fluid
+{
+    /**
+     * \addtogroup gapi_std_backends G-API Standard Backends
+     * @{
+     */
+    /**
+     * @brief Get a reference to Fluid backend.
+     *
+     * @sa gapi_std_backends
+     */
+    GAPI_EXPORTS cv::gapi::GBackend backend();
+    /** @} */
+} // namespace fluid
+} // namespace gapi
+
+
+class GAPI_EXPORTS GFluidKernel
+{
+public:
+    enum class Kind
+    {
+        Filter,
+        Resize,
+        YUV420toRGB //Color conversion of 4:2:0 chroma sub-sampling formats (NV12, I420 ..etc) to RGB
+    };
+
+    // This function is a generic "doWork" callback
+    using F = std::function<void(const cv::GArgs&, const std::vector<gapi::fluid::Buffer*> &)>;
+
+    // This function is a generic "initScratch" callback
+    using IS = std::function<void(const cv::GMetaArgs &, const cv::GArgs&, gapi::fluid::Buffer &)>;
+
+    // This function is a generic "resetScratch" callback
+    using RS = std::function<void(gapi::fluid::Buffer &)>;
+
+    // This function describes kernel metadata inference rule.
+    using M = std::function<GMetaArgs(const GMetaArgs &, const GArgs &)>;
+
+    // This function is a generic "getBorder" callback (extracts border-related data from kernel's input parameters)
+    using B = std::function<gapi::fluid::BorderOpt(const GMetaArgs&, const GArgs&)>;
+
+    // This function is a generic "getWindow" callback (extracts window-related data from kernel's input parameters)
+    using GW = std::function<int(const GMetaArgs&, const GArgs&)>;
+
+    // FIXME: move implementations out of header file
+    GFluidKernel() {}
+    GFluidKernel(Kind k, int l, bool scratch, const F& f, const IS &is, const RS &rs, const B& b, const GW& win)
+        : m_kind(k)
+        , m_lpi(l)
+        , m_scratch(scratch)
+        , m_f(f)
+        , m_is(is)
+        , m_rs(rs)
+        , m_b(b)
+        , m_gw(win) {}
+
+    Kind m_kind;
+    const int  m_lpi     = -1;
+    const bool m_scratch = false;
+
+    const F    m_f;
+    const IS   m_is;
+    const RS   m_rs;
+    const B    m_b;
+    const GW   m_gw;
+};
+
+// FIXME!!!
+// This is the temporary and experimental API
+// which should be replaced by runtime roi-based scheduling
+/** \addtogroup gapi_compile_args
+ * @{
+ */
+/**
+ * @brief This structure allows controlling the output image region
+ * which Fluid backend will produce in the graph.
+ *
+ * This feature is useful for external tiling and parallelism, but
+ * will be deprecated in the future releases.
+ */
+struct GFluidOutputRois
+{
+    std::vector<cv::Rect> rois;
+};
+
+/**
+ * @brief This structure forces Fluid backend to generate multiple
+ * parallel output regions in the graph. These regions execute in parallel.
+ *
+ * This feature may be deprecated in the future releases.
+ */
+struct GFluidParallelOutputRois
+{
+    std::vector<GFluidOutputRois> parallel_rois;
+};
+
+/**
+ * @brief This structure allows customizing the way Fluid executes
+ * parallel regions.
+ *
+ * For example, user can utilize his own threading runtime via this parameter.
+ * The `parallel_for` member functor is called by the Fluid runtime with the
+ * following arguments:
+ *
+ * @param size Size of the parallel range to process
+ * @param f A function which should be called for every integer index
+ *   in this range by the specified parallel_for implementation.
+ *
+ * This feature may be deprecated in the future releases.
+ */
+struct GFluidParallelFor
+{
+    //this function accepts:
+    // - size of the "parallel" range as the first argument
+    // - and a function to be called on the range items, designated by item index
+    std::function<void(std::size_t size, std::function<void(std::size_t index)>)> parallel_for;
+};
+/** @} gapi_compile_args */
+
+namespace detail
+{
+template<> struct CompileArgTag<GFluidOutputRois>
+{
+    static const char* tag() { return "gapi.fluid.outputRois"; }
+};
+
+template<> struct CompileArgTag<GFluidParallelFor>
+{
+    static const char* tag() { return "gapi.fluid.parallelFor"; }
+};
+
+template<> struct CompileArgTag<GFluidParallelOutputRois>
+{
+    static const char* tag() { return "gapi.fluid.parallelOutputRois"; }
+};
+
+} // namespace detail
+
+namespace detail
+{
+template<class T> struct fluid_get_in;
+template<> struct fluid_get_in<cv::GMat>
+{
+    static const cv::gapi::fluid::View& get(const cv::GArgs &in_args, int idx)
+    {
+        return *in_args[idx].unsafe_get<cv::gapi::fluid::View*>();
+    }
+};
+
+template<> struct fluid_get_in<cv::GScalar>
+{
+    // FIXME: change to return by reference when moved to own::Scalar
+    static cv::Scalar get(const cv::GArgs &in_args, int idx)
+    {
+        return in_args[idx].unsafe_get<cv::Scalar>();
+    }
+};
+
+template<typename U> struct fluid_get_in<cv::GArray<U>>
+{
+    static const std::vector<U>& get(const cv::GArgs &in_args, int idx)
+    {
+        return in_args.at(idx).unsafe_get<cv::detail::VectorRef>().rref<U>();
+    }
+};
+
+template<typename U> struct fluid_get_in<cv::GOpaque<U>>
+{
+    static const U& get(const cv::GArgs &in_args, int idx)
+    {
+        return in_args.at(idx).unsafe_get<cv::detail::OpaqueRef>().rref<U>();
+    }
+};
+
+template<class T> struct fluid_get_in
+{
+    static const T& get(const cv::GArgs &in_args, int idx)
+    {
+        return in_args[idx].unsafe_get<T>();
+    }
+};
+
+template<bool, typename Impl, typename... Ins>
+struct scratch_helper;
+
+template<typename Impl, typename... Ins>
+struct scratch_helper<true, Impl, Ins...>
+{
+    // Init
+    template<int... IIs>
+    static void help_init_impl(const cv::GMetaArgs &metas,
+                               const cv::GArgs     &in_args,
+                               gapi::fluid::Buffer &scratch_buf,
+                               detail::Seq<IIs...>)
+    {
+        Impl::initScratch(get_in_meta<Ins>(metas, in_args, IIs)..., scratch_buf);
+    }
+
+    static void help_init(const cv::GMetaArgs &metas,
+                          const cv::GArgs     &in_args,
+                          gapi::fluid::Buffer &b)
+    {
+        help_init_impl(metas, in_args, b, typename detail::MkSeq<sizeof...(Ins)>::type());
+    }
+
+    // Reset
+    static void help_reset(gapi::fluid::Buffer &b)
+    {
+        Impl::resetScratch(b);
+    }
+};
+
+template<typename Impl, typename... Ins>
+struct scratch_helper<false, Impl, Ins...>
+{
+    static void help_init(const cv::GMetaArgs &,
+                          const cv::GArgs     &,
+                          gapi::fluid::Buffer &)
+    {
+        GAPI_Error("InternalError");
+    }
+    static void help_reset(gapi::fluid::Buffer &)
+    {
+        GAPI_Error("InternalError");
+    }
+};
+
+template<typename T> struct is_gmat_type
+{
+    static const constexpr bool value = std::is_same<cv::GMat, T>::value;
+};
+
+template<bool CallCustomGetBorder, typename Impl, typename... Ins>
+struct get_border_helper;
+
+template<typename Impl, typename... Ins>
+struct get_border_helper<true, Impl, Ins...>
+{
+    template<int... IIs>
+    static gapi::fluid::BorderOpt get_border_impl(const GMetaArgs &metas,
+                                                  const cv::GArgs &in_args,
+                                                  cv::detail::Seq<IIs...>)
+    {
+        return util::make_optional(Impl::getBorder(cv::detail::get_in_meta<Ins>(metas, in_args, IIs)...));
+    }
+
+    static gapi::fluid::BorderOpt help(const GMetaArgs &metas,
+                                       const cv::GArgs &in_args)
+    {
+        return get_border_impl(metas, in_args, typename detail::MkSeq<sizeof...(Ins)>::type());
+    }
+};
+
+template<typename Impl, typename... Ins>
+struct get_border_helper<false, Impl, Ins...>
+{
+    static gapi::fluid::BorderOpt help(const cv::GMetaArgs &,
+                                       const cv::GArgs     &)
+    {
+        return {};
+    }
+};
+
+template<bool CallCustomGetWindow, typename, typename... Ins>
+struct get_window_helper;
+
+template<typename Impl, typename... Ins>
+struct get_window_helper<true, Impl, Ins...>
+{
+    template<int... IIs>
+    static int get_window_impl(const GMetaArgs &metas,
+                               const cv::GArgs &in_args,
+                               cv::detail::Seq<IIs...>)
+    {
+        return Impl::getWindow(cv::detail::get_in_meta<Ins>(metas, in_args, IIs)...);
+    }
+
+    static int help(const GMetaArgs &metas, const cv::GArgs &in_args)
+    {
+        return get_window_impl(metas, in_args, typename detail::MkSeq<sizeof...(Ins)>::type());
+    }
+};
+
+template<typename Impl, typename... Ins>
+struct get_window_helper<false, Impl, Ins...>
+{
+    static int help(const cv::GMetaArgs &,
+                    const cv::GArgs     &)
+    {
+        return Impl::Window;
+    }
+};
+
+template<typename C, typename T>
+struct has_Window
+{
+private:
+    template<class U>
+    static constexpr auto Check(U*) -> typename std::is_same<decltype(U::Window), T>::type;
+
+    template<typename>
+    static constexpr std::false_type Check(...);
+
+    typedef decltype(Check<C>(0)) Result;
+
+public:
+    static constexpr bool value = Result::value;
+};
+
+template<bool hasWindow, typename Impl>
+struct callCustomGetBorder;
+
+template<typename Impl>
+struct callCustomGetBorder<true, Impl>
+{
+    static constexpr bool value = (Impl::Window != 1);
+};
+
+template<typename Impl>
+struct callCustomGetBorder<false, Impl>
+{
+    static constexpr bool value = true;
+};
+
+template<typename, typename, typename, bool UseScratch>
+struct FluidCallHelper;
+
+template<typename Impl, typename... Ins, typename... Outs, bool UseScratch>
+struct FluidCallHelper<Impl, std::tuple<Ins...>, std::tuple<Outs...>, UseScratch>
+{
+    static_assert(all_satisfy<is_gmat_type, Outs...>::value, "return type must be GMat");
+    static_assert(contains<GMat, Ins...>::value, "input must contain at least one GMat");
+
+    // Execution dispatcher ////////////////////////////////////////////////////
+    template<int... IIs, int... OIs>
+    static void call_impl(const cv::GArgs &in_args,
+                          const std::vector<gapi::fluid::Buffer*> &out_bufs,
+                          detail::Seq<IIs...>,
+                          detail::Seq<OIs...>)
+    {
+        Impl::run(fluid_get_in<Ins>::get(in_args, IIs)..., *out_bufs[OIs]...);
+    }
+
+    static void call(const cv::GArgs &in_args,
+                     const std::vector<gapi::fluid::Buffer*> &out_bufs)
+    {
+        constexpr int numOuts = (sizeof...(Outs)) + (UseScratch ? 1 : 0);
+        call_impl(in_args, out_bufs,
+                  typename detail::MkSeq<sizeof...(Ins)>::type(),
+                  typename detail::MkSeq<numOuts>::type());
+    }
+
+    // Scratch buffer initialization dispatcher ////////////////////////////////
+    static void init_scratch(const GMetaArgs &metas,
+                             const cv::GArgs &in_args,
+                             gapi::fluid::Buffer &b)
+    {
+        scratch_helper<UseScratch, Impl, Ins...>::help_init(metas, in_args, b);
+    }
+
+    // Scratch buffer reset dispatcher /////////////////////////////////////////
+    static void reset_scratch(gapi::fluid::Buffer &scratch_buf)
+    {
+        scratch_helper<UseScratch, Impl, Ins...>::help_reset(scratch_buf);
+    }
+
+    static gapi::fluid::BorderOpt getBorder(const GMetaArgs &metas, const cv::GArgs &in_args)
+    {
+        constexpr bool hasWindow = has_Window<Impl, const int>::value;
+
+        // User must provide "getBorder" callback if Window != 1 (or if Impl declares no Window)
+        // TODO: move to constexpr if when we enable C++17
+        return get_border_helper<callCustomGetBorder<hasWindow, Impl>::value, Impl, Ins...>::help(metas, in_args);
+    }
+
+    static int getWindow(const GMetaArgs &metas, const cv::GArgs &in_args)
+    {
+        constexpr bool callCustomGetWindow = !(has_Window<Impl, const int>::value);
+        return get_window_helper<callCustomGetWindow, Impl, Ins...>::help(metas, in_args);
+    }
+};
+} // namespace detail
+
+
+template<class Impl, class K, bool UseScratch>
+class GFluidKernelImpl : public cv::detail::KernelTag
+{
+    static const int LPI = 1;
+    static const auto Kind = GFluidKernel::Kind::Filter;
+    using P = detail::FluidCallHelper<Impl, typename K::InArgs, typename K::OutArgs, UseScratch>;
+
+public:
+    using API = K;
+
+    static GFluidKernel kernel()
+    {
+        // FIXME: call() and getOutMeta() needs to be renamed so it is clear these
+        // functions are internal wrappers, not user API
+        return GFluidKernel(Impl::Kind, Impl::LPI,
+                            UseScratch,
+                            &P::call, &P::init_scratch, &P::reset_scratch, &P::getBorder, &P::getWindow);
+    }
+
+    static cv::gapi::GBackend backend() { return cv::gapi::fluid::backend(); }
+};
+
+#define GAPI_FLUID_KERNEL(Name, API, Scratch) struct Name: public cv::GFluidKernelImpl<Name, API, Scratch>
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_FLUID_KERNEL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/fluid/imgproc.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/fluid/imgproc.hpp
new file mode 100644
index 000000000000..a4e8ac0f9922
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/fluid/imgproc.hpp
@@ -0,0 +1,20 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_FLUID_IMGPROC_HPP
+#define OPENCV_GAPI_FLUID_IMGPROC_HPP
+
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS
+
+namespace cv { namespace gapi { namespace imgproc { namespace fluid {
+
+GAPI_EXPORTS_W GKernelPackage kernels();
+
+}}}}
+
+#endif // OPENCV_GAPI_FLUID_IMGPROC_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/garg.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/garg.hpp
new file mode 100644
index 000000000000..2a8315f9d83e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/garg.hpp
@@ -0,0 +1,311 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2021 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GARG_HPP
+#define OPENCV_GAPI_GARG_HPP
+
+#include <vector>
+#include <unordered_map>
+#include <type_traits>
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/own/mat.hpp>
+#include <opencv2/gapi/media.hpp>
+
+#include <opencv2/gapi/util/util.hpp>
+#include <opencv2/gapi/util/any.hpp>
+#include <opencv2/gapi/util/variant.hpp>
+
+#include <opencv2/gapi/gmat.hpp>
+#include <opencv2/gapi/gscalar.hpp>
+#include <opencv2/gapi/garray.hpp>
+#include <opencv2/gapi/gopaque.hpp>
+#include <opencv2/gapi/gframe.hpp>
+#include <opencv2/gapi/gtype_traits.hpp>
+#include <opencv2/gapi/gmetaarg.hpp>
+#include <opencv2/gapi/streaming/source.hpp>
+#include <opencv2/gapi/rmat.hpp>
+
+namespace cv {
+
+class GArg;
+
+namespace detail {
+    template<typename T>
+    using is_garg = std::is_same<GArg, typename std::decay<T>::type>;
+}
+
+// Parameter holder class for a node
+// Depending on platform capabilities, can either support arbitrary types
+// (as `boost::any`) or a limited number of types (as `boost::variant`).
+// FIXME: put into "details" as a user shouldn't use it in their code
+class GAPI_EXPORTS GArg
+{
+public:
+    GArg() {}
+
+    template<typename T, typename std::enable_if<!detail::is_garg<T>::value, int>::type = 0>
+    explicit GArg(const T &t)
+        : kind(detail::GTypeTraits<T>::kind)
+        , opaque_kind(detail::GOpaqueTraits<T>::kind)
+        , value(detail::wrap_gapi_helper<T>::wrap(t))
+    {
+    }
+
+    template<typename T, typename std::enable_if<!detail::is_garg<T>::value, int>::type = 0>
+    explicit GArg(T &&t)
+        : kind(detail::GTypeTraits<typename std::decay<T>::type>::kind)
+        , opaque_kind(detail::GOpaqueTraits<typename std::decay<T>::type>::kind)
+        , value(detail::wrap_gapi_helper<T>::wrap(t))
+    {
+    }
+
+    template<typename T> inline T& get()
+    {
+        return util::any_cast<typename std::remove_reference<T>::type>(value);
+    }
+
+    template<typename T> inline const T& get() const
+    {
+        return util::any_cast<typename std::remove_reference<T>::type>(value);
+    }
+
+    template<typename T> inline T& unsafe_get()
+    {
+        return util::unsafe_any_cast<typename std::remove_reference<T>::type>(value);
+    }
+
+    template<typename T> inline const T& unsafe_get() const
+    {
+        return util::unsafe_any_cast<typename std::remove_reference<T>::type>(value);
+    }
+
+    detail::ArgKind kind = detail::ArgKind::OPAQUE_VAL;
+    detail::OpaqueKind opaque_kind = detail::OpaqueKind::CV_UNKNOWN;
+
+protected:
+    util::any value;
+};
+
+using GArgs = std::vector<GArg>;
+
+// FIXME: Express as M<GProtoArg...>::type
+// FIXME: Move to a separate file!
+using GRunArgBase  = util::variant<
+#if !defined(GAPI_STANDALONE)
+    cv::UMat,
+#endif // !defined(GAPI_STANDALONE)
+    cv::RMat,
+    cv::gapi::wip::IStreamSource::Ptr,
+    cv::Mat,
+    cv::Scalar,
+    cv::detail::VectorRef,
+    cv::detail::OpaqueRef,
+    cv::MediaFrame
+    >;
+
+namespace detail {
+template<typename,typename>
+struct in_variant;
+
+template<typename T, typename... Types>
+struct in_variant<T, util::variant<Types...> >
+    : std::integral_constant<bool, cv::detail::contains<T, Types...>::value > {
+};
+} // namespace detail
+
+struct GAPI_EXPORTS GRunArg: public GRunArgBase
+{
+    // Metadata information here
+    using Meta = std::unordered_map<std::string, util::any>;
+    Meta meta;
+
+    // Mimic the old GRunArg semantics here, from the times when
+    // GRunArg was an alias to variant<>
+    GRunArg();
+    GRunArg(const cv::GRunArg &arg);
+    GRunArg(cv::GRunArg &&arg);
+
+    GRunArg& operator= (const GRunArg &arg);
+    GRunArg& operator= (GRunArg &&arg);
+
+    template <typename T>
+    GRunArg(const T &t,
+            const Meta &m = Meta{},
+            typename std::enable_if< detail::in_variant<T, GRunArgBase>::value, int>::type = 0)
+        : GRunArgBase(t)
+        , meta(m)
+    {
+    }
+    template <typename T>
+    GRunArg(T &&t,
+            const Meta &m = Meta{},
+            typename std::enable_if< detail::in_variant<T, GRunArgBase>::value, int>::type = 0)
+        : GRunArgBase(std::move(t))
+        , meta(m)
+    {
+    }
+    template <typename T> auto operator= (const T &t)
+        -> typename std::enable_if< detail::in_variant<T, GRunArgBase>::value, cv::GRunArg>::type&
+    {
+        GRunArgBase::operator=(t);
+        return *this;
+    }
+    template <typename T> auto operator= (T&& t)
+        -> typename std::enable_if< detail::in_variant<T, GRunArgBase>::value, cv::GRunArg>::type&
+    {
+        GRunArgBase::operator=(std::move(t));
+        return *this;
+    }
+};
+using GRunArgs = std::vector<GRunArg>;
+
+// TODO: Think about the addition operator
+/**
+ * @brief This operator allows to complement the input vector at runtime.
+ *
+ * It's an ordinary overload of addition assignment operator.
+ *
+ * Example of usage:
+ * @snippet samples/cpp/tutorial_code/gapi/doc_snippets/dynamic_graph_snippets.cpp GRunArgs usage
+ *
+ */
+inline GRunArgs& operator += (GRunArgs &lhs, const GRunArgs &rhs)
+{
+    lhs.reserve(lhs.size() + rhs.size());
+    lhs.insert(lhs.end(), rhs.begin(), rhs.end());
+    return lhs;
+}
+
+namespace gapi
+{
+namespace wip
+{
+/**
+ * @brief This aggregate type represents all types which G-API can
+ * handle (via variant).
+ *
+ * It only exists to overcome C++ language limitations (where a
+ * `using`-defined class can't be forward-declared).
+ */
+struct GAPI_EXPORTS Data: public GRunArg
+{
+    using GRunArg::GRunArg; // inherit all GRunArg constructors
+    template <typename T>   // copy-assign from a const lvalue
+    Data& operator= (const T& t) { GRunArg::operator=(t); return *this; }
+    template <typename T>   // forwarding reference: forward so that non-const lvalues are copied, not moved-from
+    Data& operator= (T&& t) { GRunArg::operator=(std::forward<T>(t)); return *this; }
+};
+} // namespace wip
+} // namespace gapi
+
+using GRunArgP = util::variant<
+#if !defined(GAPI_STANDALONE)
+    cv::UMat*,
+#endif // !defined(GAPI_STANDALONE)
+    cv::Mat*,
+    cv::RMat*,
+    cv::Scalar*,
+    cv::MediaFrame*,
+    cv::detail::VectorRef,
+    cv::detail::OpaqueRef
+    >;
+using GRunArgsP = std::vector<GRunArgP>;
+
+// TODO: Think about the addition operator
+/**
+ * @brief This operator allows to complement the output vector at runtime.
+ *
+ * It's an ordinary overload of addition assignment operator.
+ *
+ * Example of usage:
+ * @snippet samples/cpp/tutorial_code/gapi/doc_snippets/dynamic_graph_snippets.cpp GRunArgsP usage
+ *
+ */
+inline GRunArgsP& operator += (GRunArgsP &lhs, const GRunArgsP &rhs)
+{
+    lhs.reserve(lhs.size() + rhs.size());
+    lhs.insert(lhs.end(), rhs.begin(), rhs.end());
+    return lhs;
+}
+
+namespace gapi
+{
+/**
+ * \addtogroup gapi_serialization
+ * @{
+ *
+ * @brief G-API functions and classes for serialization and deserialization.
+ */
+
+/** @brief Wraps deserialized output GRunArgs to GRunArgsP which can be used by GCompiled.
+ *
+ * Since it's impossible to get modifiable output arguments from deserialization
+ * it needs to be wrapped by this function.
+ *
+ * Example of usage:
+ * @snippet samples/cpp/tutorial_code/gapi/doc_snippets/api_ref_snippets.cpp bind after deserialization
+ *
+ * @param out_args deserialized GRunArgs.
+ * @return the same GRunArgs wrapped in GRunArgsP.
+ * @see deserialize
+ */
+GAPI_EXPORTS cv::GRunArgsP bind(cv::GRunArgs &out_args);
+
+/** @brief Wraps output GRunArgP available during graph execution to GRunArg which can be serialized.
+ *
+ * GRunArgP is pointer-to-value, so to be serialized it needs to be bound to a real value,
+ * which this function does.
+ *
+ * Example of usage:
+ * @snippet samples/cpp/tutorial_code/gapi/doc_snippets/api_ref_snippets.cpp bind before serialization
+ *
+ * @param out output GRunArgP available during graph execution.
+ * @return the same GRunArgP wrapped in serializable GRunArg.
+ * @see serialize
+ */
+GAPI_EXPORTS cv::GRunArg   bind(cv::GRunArgP &out);     // FIXME: think more about it
+/** @} */
+}
+
+template<typename... Ts> inline GRunArgs gin(const Ts&... args)
+{   // Builds one GRunArg per argument, in order, from wrap_host_helper<Ts>::wrap_in(args)
+    return GRunArgs{ GRunArg(detail::wrap_host_helper<Ts>::wrap_in(args))... };
+}
+
+template<typename... Ts> inline GRunArgsP gout(Ts&... args)
+{   // Builds one GRunArgP per argument, in order, from wrap_host_helper<Ts>::wrap_out(args)
+    return GRunArgsP{ GRunArgP(detail::wrap_host_helper<Ts>::wrap_out(args))... };
+}
+
+struct GTypeInfo;
+using GTypesInfo = std::vector<GTypeInfo>;
+
+// FIXME: Needed for python bridge, must be moved to more appropriate header
+namespace detail {
+struct ExtractArgsCallback
+{
+    cv::GRunArgs operator()(const cv::GTypesInfo& info) const { return c(info); }
+    using CallBackT = std::function<cv::GRunArgs(const cv::GTypesInfo& info)>;
+    CallBackT c;
+};
+
+struct ExtractMetaCallback
+{
+    cv::GMetaArgs operator()(const cv::GTypesInfo& info) const { return c(info); }
+    using CallBackT = std::function<cv::GMetaArgs(const cv::GTypesInfo& info)>;
+    CallBackT c;
+};
+
+void constructGraphOutputs(const cv::GTypesInfo &out_info,
+                           cv::GRunArgs         &args,
+                           cv::GRunArgsP        &outs);
+} // namespace detail
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GARG_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/garray.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/garray.hpp
new file mode 100644
index 000000000000..a2951993f2b2
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/garray.hpp
@@ -0,0 +1,440 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2020 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GARRAY_HPP
+#define OPENCV_GAPI_GARRAY_HPP
+
+#include <functional>
+#include <ostream>
+#include <vector>
+#include <memory>
+
+#include <opencv2/gapi/own/exports.hpp>
+#include <opencv2/gapi/opencv_includes.hpp>
+
+#include <opencv2/gapi/util/variant.hpp>
+#include <opencv2/gapi/util/throw.hpp>
+#include <opencv2/gapi/own/assert.hpp>
+
+#include <opencv2/gapi/gmat.hpp>    // flatten_g only!
+#include <opencv2/gapi/gscalar.hpp> // flatten_g only!
+
+namespace cv
+{
+// Forward declaration; GNode and GOrigin are internal
+// (user-inaccessible) classes.
+class GNode;
+struct GOrigin;
+template<typename T> class GArray;
+
+/**
+ * \addtogroup gapi_meta_args
+ * @{
+ */
+struct GAPI_EXPORTS_W_SIMPLE GArrayDesc
+{
+    // FIXME: Body
+    // FIXME: Also implement proper operator== then
+    bool operator== (const GArrayDesc&) const { return true; }
+};
+template<typename U> GArrayDesc descr_of(const std::vector<U> &) { return {};}
+GAPI_EXPORTS_W inline GArrayDesc empty_array_desc() {return {}; }
+/** @} */
+
+std::ostream& operator<<(std::ostream& os, const cv::GArrayDesc &desc);
+
+namespace detail
+{
+    // ConstructVec is a callback which stores information about T and is used by
+    // G-API runtime to construct arrays in host memory (T remains opaque for G-API).
+    // ConstructVec is carried into G-API internals by GArrayU.
+    // Currently it is suitable for Host (CPU) plugins only, real offload may require
+    // more information for manual memory allocation on-device.
+    class VectorRef;
+    using ConstructVec = std::function<void(VectorRef&)>;
+
+    // This is the base struct for GArrayU type holder
+    struct TypeHintBase{virtual ~TypeHintBase() = default;};
+
+    // This class holds type of initial GArray to be checked from GArrayU
+    template <typename T>
+    struct TypeHint final : public TypeHintBase{};
+
+    // This class strips type information from GArray<T> and makes it usable
+    // in the G-API graph compiler (expression unrolling, graph generation, etc).
+    // Part of GProtoArg.
+    class GAPI_EXPORTS GArrayU
+    {
+    public:
+        GArrayU(const GNode &n, std::size_t out); // Operation result constructor
+
+        template <typename T>
+        bool holds() const;                       // Check if was created from GArray<T>
+
+        GOrigin& priv();                          // Internal use only
+        const GOrigin& priv() const;              // Internal use only
+
+    protected:
+        GArrayU();                                // Default constructor
+        GArrayU(const detail::VectorRef& vref);   // Constant value constructor
+        template<class> friend class cv::GArray;  //  (available to GArray<T> only)
+
+        void setConstructFcn(ConstructVec &&cv);  // Store T-aware constructor
+
+        template <typename T>
+        void specifyType();                       // Store type of initial GArray<T>
+
+        template <typename T>
+        void storeKind();
+
+        void setKind(cv::detail::OpaqueKind);
+
+        std::shared_ptr<GOrigin> m_priv;
+        std::shared_ptr<TypeHintBase> m_hint;
+    };
+
+    template <typename T>
+    bool GArrayU::holds() const{
+        GAPI_Assert(m_hint != nullptr);
+        using U = typename std::decay<T>::type;
+        return dynamic_cast<TypeHint<U>*>(m_hint.get()) != nullptr;
+    }
+
+    template <typename T>
+    void GArrayU::specifyType(){
+        m_hint.reset(new TypeHint<typename std::decay<T>::type>);
+    }
+
+    template <typename T>
+    void GArrayU::storeKind(){
+        setKind(cv::detail::GOpaqueTraits<T>::kind);
+    }
+
+    // This class represents a typed STL vector reference.
+    // Depending on origins, this reference may be either "just a" reference to
+    // an object created externally, OR actually own the underlying object
+    // (be value holder).
+    class BasicVectorRef
+    {
+    public:
+        // These fields are set by the derived class(es)
+        std::size_t    m_elemSize = 0ul;
+        cv::GArrayDesc m_desc;
+        virtual ~BasicVectorRef() {}
+
+        virtual void mov(BasicVectorRef &ref) = 0;
+        virtual const void* ptr() const = 0;
+        virtual std::size_t size() const = 0;
+    };
+
+    template<typename T> class VectorRefT final: public BasicVectorRef
+    {
+        using empty_t  = util::monostate;
+        using ro_ext_t = const std::vector<T> *;
+        using rw_ext_t =       std::vector<T> *;
+        using rw_own_t =       std::vector<T>  ;
+        util::variant<empty_t, ro_ext_t, rw_ext_t, rw_own_t> m_ref;
+
+        inline bool isEmpty() const { return util::holds_alternative<empty_t>(m_ref);  }
+        inline bool isROExt() const { return util::holds_alternative<ro_ext_t>(m_ref); }
+        inline bool isRWExt() const { return util::holds_alternative<rw_ext_t>(m_ref); }
+        inline bool isRWOwn() const { return util::holds_alternative<rw_own_t>(m_ref); }
+
+        void init(const std::vector<T>* vec = nullptr)
+        {
+            m_elemSize = sizeof(T);
+            if (vec) m_desc = cv::descr_of(*vec);
+        }
+
+    public:
+        VectorRefT() { init(); }
+        virtual ~VectorRefT() {}
+
+        explicit VectorRefT(const std::vector<T>& vec) : m_ref(&vec)      { init(&vec); }
+        explicit VectorRefT(std::vector<T>& vec)  : m_ref(&vec)           { init(&vec); }
+        explicit VectorRefT(std::vector<T>&& vec) : m_ref(std::move(vec)) { init(&vec); } // NOTE: vec is already moved-from when init() hands it to cv::descr_of()
+
+        // Reset a VectorRefT. Called only for objects instantiated
+        // internally in G-API (e.g. temporary GArray<T>'s within a
+        // computation).  Reset here means both initialization
+        // (creating an object) and reset (discarding its existing
+        // content before the next execution).  Must never be called
+        // for external VectorRefTs.
+        void reset()
+        {
+            if (isEmpty())
+            {
+                std::vector<T> empty_vector;
+                m_desc = cv::descr_of(empty_vector);
+                m_ref  = std::move(empty_vector);
+                GAPI_Assert(isRWOwn());
+            }
+            else if (isRWOwn())
+            {
+                util::get<rw_own_t>(m_ref).clear();
+            }
+            else GAPI_Error("InternalError"); // shouldn't be called in *EXT modes
+        }
+
+        // Obtain a WRITE reference to underlying object
+        // Used by CPU kernel API wrappers when a kernel execution frame
+        // is created
+        std::vector<T>& wref()
+        {
+            GAPI_Assert(isRWExt() || isRWOwn());
+            if (isRWExt()) return *util::get<rw_ext_t>(m_ref);
+            if (isRWOwn()) return  util::get<rw_own_t>(m_ref);
+            util::throw_error(std::logic_error("Impossible happened"));
+        }
+
+        // Obtain a READ reference to underlying object
+        // Used by CPU kernel API wrappers when a kernel execution frame
+        // is created
+        const std::vector<T>& rref() const
+        {
+            // ANY vector can be accessed for reading, even if it declared for
+            // output. Example -- a GComputation from [in] to [out1,out2]
+            // where [out2] is a result of operation applied to [out1]:
+            //
+            //            GComputation boundary
+            //            . . . . . . .
+            //            .           .
+            //     [in] ----> foo() ----> [out1]
+            //            .           .    :
+            //            .           . . .:. . .
+            //            .                V    .
+            //            .              bar() ---> [out2]
+            //            . . . . . . . . . . . .
+            //
+            if (isROExt()) return *util::get<ro_ext_t>(m_ref);
+            if (isRWExt()) return *util::get<rw_ext_t>(m_ref);
+            if (isRWOwn()) return  util::get<rw_own_t>(m_ref);
+            util::throw_error(std::logic_error("Impossible happened"));
+        }
+
+        virtual void mov(BasicVectorRef &v) override {
+            VectorRefT<T> *tv = dynamic_cast<VectorRefT<T>*>(&v);
+            GAPI_Assert(tv != nullptr);
+            wref() = std::move(tv->wref());
+        }
+
+        virtual const void* ptr() const override { return &rref(); }
+        virtual std::size_t size() const override { return rref().size(); }
+    };
+
+    // This class strips type information from VectorRefT<> and makes it usable
+    // in the G-API executables (carrying run-time data/information to kernels).
+    // Part of GRunArg.
+    // Its methods are typed proxies to VectorRefT<T>.
+    // VectorRef maintains "reference" semantics so two copies of VectorRef refer
+    // to the same underlying object.
+    // FIXME: Put a good explanation on why cv::OutputArray doesn't fit this role
+    class VectorRef
+    {
+        std::shared_ptr<BasicVectorRef> m_ref;
+        cv::detail::OpaqueKind m_kind = cv::detail::OpaqueKind::CV_UNKNOWN;
+
+        template<typename T> inline void check() const
+        {
+            GAPI_DbgAssert(dynamic_cast<VectorRefT<T>*>(m_ref.get()) != nullptr);
+            GAPI_Assert(sizeof(T) == m_ref->m_elemSize);
+        }
+
+    public:
+        VectorRef() = default;
+        template<typename T> explicit VectorRef(const std::vector<T>& vec)
+            : m_ref(new VectorRefT<T>(vec))
+            , m_kind(GOpaqueTraits<T>::kind)
+        {}
+        template<typename T> explicit VectorRef(std::vector<T>& vec)
+            : m_ref(new VectorRefT<T>(vec))
+            , m_kind(GOpaqueTraits<T>::kind)
+        {}
+        template<typename T> explicit VectorRef(std::vector<T>&& vec)
+            : m_ref(new VectorRefT<T>(std::move(vec)))
+            , m_kind(GOpaqueTraits<T>::kind)
+        {}
+
+        cv::detail::OpaqueKind getKind() const
+        {
+            return m_kind;
+        }
+
+        template<typename T> void reset()
+        {
+            if (!m_ref) m_ref.reset(new VectorRefT<T>());
+            check<T>();
+            storeKind<T>();
+            static_cast<VectorRefT<T>&>(*m_ref).reset();
+        }
+
+        template <typename T>
+        void storeKind()
+        {
+            m_kind = cv::detail::GOpaqueTraits<T>::kind;
+        }
+
+        template<typename T> std::vector<T>& wref()
+        {
+            check<T>();
+            return static_cast<VectorRefT<T>&>(*m_ref).wref();
+        }
+
+        template<typename T> const std::vector<T>& rref() const
+        {
+            check<T>();
+            return static_cast<VectorRefT<T>&>(*m_ref).rref();
+        }
+
+        // Check if was created for/from std::vector<T>
+        template <typename T> bool holds() const
+        {
+            if (!m_ref) return false;
+            using U = typename std::decay<T>::type;
+            return dynamic_cast<VectorRefT<U>*>(m_ref.get()) != nullptr;
+        }
+
+        void mov(VectorRef &v)
+        {
+            m_ref->mov(*v.m_ref);
+        }
+
+        cv::GArrayDesc descr_of() const
+        {
+            return m_ref->m_desc;
+        }
+
+        std::size_t size() const
+        {
+            return m_ref->size();
+        }
+
+        // May be used to uniquely identify this object internally
+        const void *ptr() const { return m_ref->ptr(); }
+    };
+
+    // Helper (FIXME: work-around?)
+    // stripping G types to their host types
+    // like cv::GArray<GMat> would still map to std::vector<cv::Mat>
+    // but not to std::vector<cv::GMat>
+#if defined(GAPI_STANDALONE)
+#  define FLATTEN_NS cv::gapi::own
+#else
+#  define FLATTEN_NS cv
+#endif
+    template<class T> struct flatten_g;
+    template<> struct flatten_g<cv::GMat>         { using type = FLATTEN_NS::Mat; };
+    template<> struct flatten_g<cv::GScalar>      { using type = FLATTEN_NS::Scalar; };
+    template<class T> struct flatten_g<GArray<T>> { using type = std::vector<T>; };
+    template<class T> struct flatten_g            { using type = T; };
+#undef FLATTEN_NS
+    // FIXME: the above mainly duplicates "ProtoToParam" thing from gtyped.hpp
+    // but I decided not to include gtyped here - probably worth moving that stuff
+    // to some common place? (DM)
+} // namespace detail
+
+/** \addtogroup gapi_data_objects
+ * @{
+ */
+/**
+ * @brief `cv::GArray<T>` template class represents a list of objects
+ * of class `T` in the graph.
+ *
+ * `cv::GArray<T>` describes a functional relationship between
+ * operations consuming and producing arrays of objects of class
+ * `T`. The primary purpose of `cv::GArray<T>` is to represent a
+ * dynamic list of objects -- where the size of the list is not known
+ * at the graph construction or compile time. Examples include: corner
+ * and feature detectors (`cv::GArray<cv::Point>`), object detection
+ * and tracking  results (`cv::GArray<cv::Rect>`). Programmers can use
+ * their own types with `cv::GArray<T>` in the custom operations.
+ *
+ * Similar to `cv::GScalar`, `cv::GArray<T>` may be value-initialized
+ * -- in this case a graph-constant value is associated with the object.
+ *
+ * `GArray<T>` is a virtual counterpart of `std::vector<T>`, which is
+ * usually used to represent the `GArray<T>` data in G-API during the
+ * execution.
+ *
+ * @sa `cv::GOpaque<T>`
+ */
+template<typename T> class GArray
+{
+public:
+    // Host type (or Flat type) - the type this GArray is actually
+    // specified to.
+    /// @private
+    using HT = typename detail::flatten_g<typename std::decay<T>::type>::type;
+
+    /**
+     * @brief Constructs a value-initialized `cv::GArray<T>`
+     *
+     * `cv::GArray<T>` objects  may have their values
+     * be associated at graph construction time. It is useful when
+     * some operation has a `cv::GArray<T>` input which doesn't change during
+     * the program execution, and is set only once. In this case,
+     * there is no need to declare such `cv::GArray<T>` as a graph input.
+     *
+     * @note The value of `cv::GArray<T>` may be overwritten by assigning some
+     * other `cv::GArray<T>` to the object using `operator=` -- on the
+     * assignment, the old association or value is discarded.
+     *
+     * @param v a std::vector<T> to associate with this
+     * `cv::GArray<T>` object. Vector data is copied into the
+     * `cv::GArray<T>` (no reference to the passed data is held).
+     */
+    explicit GArray(const std::vector<HT>& v) // Constant value constructor
+        : m_ref(detail::GArrayU(detail::VectorRef(v))) { putDetails(); }
+
+    /**
+     * @overload
+     * @brief Constructs a value-initialized `cv::GArray<T>`
+     *
+     * @param v a std::vector<T> to associate with this
+     * `cv::GArray<T>` object. Vector data is moved into the `cv::GArray<T>`.
+     */
+    explicit GArray(std::vector<HT>&& v)      // Move-constructor
+        : m_ref(detail::GArrayU(detail::VectorRef(std::move(v)))) { putDetails(); }
+
+    /**
+     * @brief Constructs an empty `cv::GArray<T>`
+     *
+     * Normally, empty G-API data objects denote a starting point of
+     * the graph. When an empty `cv::GArray<T>` is assigned to a result
+     * of some operation, it obtains a functional link to this
+     * operation (and is not empty anymore).
+     */
+    GArray() { putDetails(); }                // Empty constructor
+
+    /// @private
+    explicit GArray(detail::GArrayU &&ref)    // GArrayU-based constructor; takes over ref instead of copying it
+        : m_ref(std::move(ref)) { putDetails(); } //   (used by GCall, not for users)
+
+    /// @private
+    detail::GArrayU strip() const {
+        return m_ref;
+    }
+    /// @private
+    static void VCtor(detail::VectorRef& vref) {
+        vref.reset<HT>();
+    }
+
+private:
+    void putDetails() {
+        m_ref.setConstructFcn(&VCtor);
+        m_ref.specifyType<HT>();  // FIXME: to unify those 2 to avoid excessive dynamic_cast
+        m_ref.storeKind<HT>();    //
+    }
+
+    detail::GArrayU m_ref;
+};
+
+/** @} */
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GARRAY_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gasync_context.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gasync_context.hpp
new file mode 100644
index 000000000000..f49b59822d9d
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gasync_context.hpp
@@ -0,0 +1,63 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019 Intel Corporation
+
+#ifndef OPENCV_GAPI_GASYNC_CONTEXT_HPP
+#define OPENCV_GAPI_GASYNC_CONTEXT_HPP
+
+#if !defined(GAPI_STANDALONE)
+#  include <opencv2/core/cvdef.h>
+#else   // Without OpenCV
+#  include <opencv2/gapi/own/cvdefs.hpp>
+#endif // !defined(GAPI_STANDALONE)
+
+#include <opencv2/gapi/own/exports.hpp>
+
+namespace cv {
+namespace gapi{
+
+/**
+ * @brief This namespace contains experimental G-API functionality,
+ * functions or structures in this namespace are subject to change or
+ * removal in the future releases. This namespace also contains
+ * functions whose API is not stabilized yet.
+ */
+namespace wip {
+
+/**
+ * @brief A class to group async requests to cancel them in a single shot.
+ *
+ * GAsyncContext is passed as an argument to async() and async_apply() functions
+ */
+
+class GAPI_EXPORTS GAsyncContext{
+    std::atomic<bool> cancelation_requested = {false}; // starts as "not canceled"; presumably driven by cancel()/isCanceled(), whose definitions are not in this header
+public:
+    /**
+     * @brief Start cancellation process for an associated request.
+     *
+     * User still has to wait for each individual request (either via callback or the corresponding std::future object) to make sure it was actually canceled.
+     *
+     * @return true if it was the first request to cancel the context
+     */
+    bool cancel();
+
+    /**
+    * @brief Returns true if cancellation was requested for this context.
+    *
+    * @return true if cancellation was requested for this context
+    */
+    bool isCanceled() const;
+};
+
+class GAPI_EXPORTS GAsyncCanceled : public std::exception {
+public:
+    virtual const char* what() const noexcept CV_OVERRIDE;
+};
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif //OPENCV_GAPI_GASYNC_CONTEXT_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcall.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcall.hpp
new file mode 100644
index 000000000000..8d1b8d60100c
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcall.hpp
@@ -0,0 +1,78 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCALL_HPP
+#define OPENCV_GAPI_GCALL_HPP
+
+#include <opencv2/gapi/garg.hpp>      // GArg
+#include <opencv2/gapi/gmat.hpp>      // GMat
+#include <opencv2/gapi/gscalar.hpp>   // GScalar
+#include <opencv2/gapi/gframe.hpp>    // GFrame
+#include <opencv2/gapi/garray.hpp>    // GArray<T>
+#include <opencv2/gapi/gopaque.hpp>   // GOpaque<T>
+
+namespace cv {
+
+struct GKernel;
+
+// The whole idea of this class is to represent an operation
+// which is applied to arguments. This is part of public API,
+// since it is what users should use to define kernel interfaces.
+
+class GAPI_EXPORTS GCall final
+{
+public:
+    class Priv;
+
+    explicit GCall(const GKernel &k);
+    ~GCall();
+
+    template<typename... Ts>
+    GCall& pass(Ts&&... args)
+    {   // Wraps every argument into a cv::GArg, hands the list to setArgs() and returns *this for chaining
+        setArgs({cv::GArg(std::move(args))...}); // NOTE(review): std::move on forwarding references casts lvalue arguments to rvalues as well -- confirm this is intended
+        return *this;
+    }
+
+    // A generic yield method - obtain a link to operator's particular GMat output
+    GMat    yield      (int output = 0);
+    GMatP   yieldP     (int output = 0);
+    GScalar yieldScalar(int output = 0);
+    GFrame  yieldFrame (int output = 0);
+
+    template<class T> GArray<T> yieldArray(int output = 0)
+    {
+        return GArray<T>(yieldArray(output));
+    }
+
+    template<class T> GOpaque<T> yieldOpaque(int output = 0)
+    {
+        return GOpaque<T>(yieldOpaque(output));
+    }
+
+    // Internal use only
+    Priv& priv();
+    const Priv& priv() const;
+
+    // GKernel and params can be modified, it's needed for infer<Generic>,
+    // because information about output shapes doesn't exist in compile time
+    GKernel& kernel();
+    cv::util::any& params();
+
+    void setArgs(std::vector<GArg> &&args);
+
+protected:
+    std::shared_ptr<Priv> m_priv;
+
+    // Public versions return a typed array or opaque, those are implementation details
+    detail::GArrayU yieldArray(int output = 0);
+    detail::GOpaqueU yieldOpaque(int output = 0);
+};
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GCALL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcommon.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcommon.hpp
new file mode 100644
index 000000000000..c61110e4d5b5
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcommon.hpp
@@ -0,0 +1,309 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2020 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCOMMON_HPP
+#define OPENCV_GAPI_GCOMMON_HPP
+
+#include <functional>   // std::hash
+#include <vector>       // std::vector
+#include <type_traits>  // decay
+
+#include <opencv2/gapi/opencv_includes.hpp>
+
+#include <opencv2/gapi/util/any.hpp>
+#include <opencv2/gapi/util/optional.hpp>
+#include <opencv2/gapi/own/exports.hpp>
+#include <opencv2/gapi/own/assert.hpp>
+#include <opencv2/gapi/render/render_types.hpp>
+#include <opencv2/gapi/s11n/base.hpp>
+
+namespace cv {
+
+class GMat; // FIXME: forward declaration for GOpaqueTraits
+
+namespace detail
+{
+    // This is a trait-like structure to mark backend-specific compile arguments
+    // with tags
+    template<typename T> struct CompileArgTag;
+
+    // These structures are tags which separate kernels and transformations
+    struct KernelTag
+    {};
+    struct TransformTag
+    {};
+
+    // This enum is utilized mostly by GArray and GOpaque to store and recognize their internal data
+    // types (aka Host type). Also it is widely used during serialization routine.
+    enum class OpaqueKind: int
+    {
+        CV_UNKNOWN,    // Unknown, generic, opaque-to-GAPI data type unsupported in graph serialization
+        CV_BOOL,       // bool user G-API data
+        CV_INT,        // int user G-API data
+        CV_INT64,      // int64_t user G-API data
+        CV_DOUBLE,     // double user G-API data
+        CV_FLOAT,      // float user G-API data
+        CV_UINT64,     // uint64_t user G-API data
+        CV_STRING,     // std::string user G-API data
+        CV_POINT,      // cv::Point user G-API data
+        CV_POINT2F,    // cv::Point2f user G-API data
+        CV_POINT3F,    // cv::Point3f user G-API data
+        CV_SIZE,       // cv::Size user G-API data
+        CV_RECT,       // cv::Rect user G-API data
+        CV_SCALAR,     // cv::Scalar user G-API data
+        CV_MAT,        // cv::Mat user G-API data
+        CV_DRAW_PRIM,  // cv::gapi::wip::draw::Prim user G-API data
+    };
+
+    // Type traits helper which simplifies the extraction of kind from type; types without a specialization below map to CV_UNKNOWN
+    template<typename T> struct GOpaqueTraits;
+    template<typename T> struct GOpaqueTraits    { static constexpr const OpaqueKind kind = OpaqueKind::CV_UNKNOWN; };
+    template<> struct GOpaqueTraits<int>         { static constexpr const OpaqueKind kind = OpaqueKind::CV_INT; };
+    template<> struct GOpaqueTraits<int64_t>     { static constexpr const OpaqueKind kind = OpaqueKind::CV_INT64; };
+    template<> struct GOpaqueTraits<double>      { static constexpr const OpaqueKind kind = OpaqueKind::CV_DOUBLE; };
+    template<> struct GOpaqueTraits<float>       { static constexpr const OpaqueKind kind = OpaqueKind::CV_FLOAT; };
+    template<> struct GOpaqueTraits<uint64_t>    { static constexpr const OpaqueKind kind = OpaqueKind::CV_UINT64; };
+    template<> struct GOpaqueTraits<bool>        { static constexpr const OpaqueKind kind = OpaqueKind::CV_BOOL; };
+    template<> struct GOpaqueTraits<std::string> { static constexpr const OpaqueKind kind = OpaqueKind::CV_STRING; };
+    template<> struct GOpaqueTraits<cv::Size>    { static constexpr const OpaqueKind kind = OpaqueKind::CV_SIZE; };
+    template<> struct GOpaqueTraits<cv::Scalar>  { static constexpr const OpaqueKind kind = OpaqueKind::CV_SCALAR; };
+    template<> struct GOpaqueTraits<cv::Point>   { static constexpr const OpaqueKind kind = OpaqueKind::CV_POINT; };
+    template<> struct GOpaqueTraits<cv::Point2f> { static constexpr const OpaqueKind kind = OpaqueKind::CV_POINT2F; };
+    template<> struct GOpaqueTraits<cv::Point3f> { static constexpr const OpaqueKind kind = OpaqueKind::CV_POINT3F; };
+    template<> struct GOpaqueTraits<cv::Mat>     { static constexpr const OpaqueKind kind = OpaqueKind::CV_MAT; };
+    template<> struct GOpaqueTraits<cv::Rect>    { static constexpr const OpaqueKind kind = OpaqueKind::CV_RECT; };
+    template<> struct GOpaqueTraits<cv::GMat>    { static constexpr const OpaqueKind kind = OpaqueKind::CV_MAT; };
+    template<> struct GOpaqueTraits<cv::gapi::wip::draw::Prim>
+                                                 { static constexpr const OpaqueKind kind = OpaqueKind::CV_DRAW_PRIM; };
+    using GOpaqueTraitsArrayTypes = std::tuple<int, double, float, uint64_t, bool, std::string, cv::Size, cv::Scalar, cv::Point, cv::Point2f,
+                                               cv::Point3f, cv::Mat, cv::Rect, cv::gapi::wip::draw::Prim>;
+    // GOpaque does not support cv::Mat and cv::Scalar since there are dedicated GMat and GScalar types
+    using GOpaqueTraitsOpaqueTypes = std::tuple<int, double, float, uint64_t, bool, std::string, cv::Size, cv::Point, cv::Point2f, cv::Point3f,
+                                                cv::Rect, cv::gapi::wip::draw::Prim>;
+} // namespace detail
+
+// This definition is here because it is reused by both public(?) and internal
+// modules. Keeping it here wouldn't expose public details (e.g., API-level)
+// to components which are internal and operate on lower-level entities
+// (e.g., compiler, backends).
+// FIXME: merge with ArgKind?
+// FIXME: replace with variant[format desc]?
+enum class GShape: int
+{
+    GMAT,
+    GSCALAR,
+    GARRAY,
+    GOPAQUE,
+    GFRAME,
+};
+
+namespace gapi {
+namespace s11n {
+namespace detail {
+template<typename T> struct wrap_serialize;
+} // namespace detail
+} // namespace s11n
+} // namespace gapi
+
+
+struct GCompileArg;
+
+namespace detail {
+    template<typename T>
+    using is_compile_arg = std::is_same<GCompileArg, typename std::decay<T>::type>; // true_type iff the decayed T is GCompileArg itself
+} // namespace detail
+
+// CompileArg is an unified interface over backend-specific compilation
+// information
+// FIXME: Move to a separate file?
+/** \addtogroup gapi_compile_args
+ * @{
+ *
+ * @brief Compilation arguments: data structures controlling the
+ * compilation process
+ *
+ * G-API comes with a number of graph compilation options which can be
+ * passed to cv::GComputation::apply() or
+ * cv::GComputation::compile(). Known compilation options are listed
+ * in this page, while extra backends may introduce their own
+ * compilation options (G-API transparently accepts _everything_ which
+ * can be passed to cv::compile_args(), it depends on underlying
+ * backends if an option would be interpreted or not).
+ *
+ * For example, if an example computation is executed like this:
+ *
+ * @snippet samples/cpp/tutorial_code/gapi/doc_snippets/api_ref_snippets.cpp graph_decl_apply
+ *
+ * Extra parameter specifying which kernels to compile with can be
+ * passed like this:
+ *
+ * @snippet samples/cpp/tutorial_code/gapi/doc_snippets/api_ref_snippets.cpp apply_with_param
+ */
+
+/**
+ * @brief Represents an arbitrary compilation argument.
+ *
+ * Any value can be wrapped into cv::GCompileArg, but only known ones
+ * (to G-API or its backends) can be interpreted correctly.
+ *
+ * Normally objects of this class shouldn't be created manually, use
+ * cv::compile_args() function which automatically wraps everything
+ * passed in (a variadic template parameter pack) into a vector of
+ * cv::GCompileArg objects.
+ */
+struct GCompileArg
+{
+public:
+    // NB: Required for Python bindings
+    GCompileArg() = default;
+
+    std::string tag;
+
+    // FIXME: use decay in GArg/other trait-based wrapper before leg is shot!
+    template<typename T, typename std::enable_if<!detail::is_compile_arg<T>::value, int>::type = 0>
+    explicit GCompileArg(T &&t)
+        : tag(detail::CompileArgTag<typename std::decay<T>::type>::tag())
+        , serializeF(cv::gapi::s11n::detail::has_S11N_spec<T>::value ?
+                     &cv::gapi::s11n::detail::wrap_serialize<T>::serialize :
+                     nullptr) // serializer stays empty when has_S11N_spec<T>::value is false
+        , arg(t)
+    {
+    }
+
+    template<typename T> T& get()
+    {
+        return util::any_cast<T>(arg);
+    }
+
+    template<typename T> const T& get() const
+    {
+        return util::any_cast<T>(arg);
+    }
+
+    void serialize(cv::gapi::s11n::IOStream& os) const
+    {
+        if (serializeF) // nothing is written when no serializer was installed (e.g. default-constructed object)
+        {
+            serializeF(os, *this);
+        }
+    }
+
+private:
+    std::function<void(cv::gapi::s11n::IOStream&, const GCompileArg&)> serializeF;
+    util::any arg;
+};
+
+using GCompileArgs = std::vector<GCompileArg>;
+
+inline cv::GCompileArgs& operator += (      cv::GCompileArgs &lhs,
+                                      const cv::GCompileArgs &rhs)
+{
+    lhs.reserve(lhs.size() + rhs.size()); // make room up front for the elements appended below
+    lhs.insert(lhs.end(), rhs.begin(), rhs.end()); // append copies of all rhs arguments
+    return lhs;
+}
+
+/**
+ * @brief Wraps a list of arguments (a parameter pack) into a vector of
+ *        compilation arguments (cv::GCompileArg).
+ */
+template<typename... Ts> GCompileArgs compile_args(Ts&&... args)
+{
+    return GCompileArgs{ GCompileArg(args)... }; // NB: args are passed on as lvalues here, not forwarded
+}
+
+namespace gapi
+{
+/**
+ * @brief Retrieves particular compilation argument by its type from
+ *        cv::GCompileArgs (an empty optional is returned if no tag matches)
+ */
+template<typename T>
+inline cv::util::optional<T> getCompileArg(const cv::GCompileArgs &args)
+{
+    for (auto &compile_arg : args)
+    {
+        if (compile_arg.tag == cv::detail::CompileArgTag<T>::tag()) // arguments are matched by their tag string
+        {
+            return cv::util::optional<T>(compile_arg.get<T>()); // first match wins
+        }
+    }
+    return cv::util::optional<T>();
+}
+
+namespace s11n {
+namespace detail {
+template<typename T> struct wrap_serialize
+{
+    static void serialize(IOStream& os, const GCompileArg& arg)
+    {
+        using DT = typename std::decay<T>::type; // T comes from GCompileArg(T&&) and may be a reference type
+        S11N<DT>::serialize(os, arg.get<DT>()); // delegate to the S11N specialization for the stored value
+    }
+};
+} // namespace detail
+} // namespace s11n
+} // namespace gapi
+
+/** @} gapi_compile_args */
+
+/**
+ * @brief Ask G-API to dump compiled graph in Graphviz format under
+ * the given file name.
+ *
+ * Specifies a graph dump path (path to .dot file to be generated).
+ * G-API will dump a .dot file under specified path during a
+ * compilation process if this flag is passed.
+ */
+struct graph_dump_path
+{
+    std::string m_dump_path; // path to the .dot file to be generated
+};
+
+/**
+ * @brief Ask G-API to use threaded executor when cv::GComputation
+ * is compiled via cv::GComputation::compile method.
+ *
+ * Specifies a number of threads that should be used by executor.
+ */
+struct GAPI_EXPORTS use_threaded_executor
+{
+    use_threaded_executor(); // defined out of line; the default thread count is not visible in this header
+    explicit use_threaded_executor(const uint32_t nthreads);
+
+    uint32_t num_threads; // number of threads that should be used by the executor
+};
+
+namespace detail
+{
+    template<> struct CompileArgTag<cv::graph_dump_path>
+    {
+        static const char* tag() { return "gapi.graph_dump_path"; } // stored in GCompileArg::tag on construction
+    };
+
+    template<> struct CompileArgTag<cv::use_threaded_executor>
+    {
+        static const char* tag() { return "gapi.threaded_executor"; } // stored in GCompileArg::tag on construction
+    };
+} // namespace detail
+
+} // namespace cv
+
+// std::hash specialization for GShape
+namespace std
+{
+template<> struct hash<cv::GShape>
+{
+    size_t operator() (cv::GShape sh) const
+    {
+        return std::hash<int>()(static_cast<int>(sh)); // hash the underlying int value of the enum
+    }
+};
+} // namespace std
+
+
+#endif // OPENCV_GAPI_GCOMMON_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcompiled.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcompiled.hpp
new file mode 100644
index 000000000000..ac36783d6215
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcompiled.hpp
@@ -0,0 +1,232 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2020 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCOMPILED_HPP
+#define OPENCV_GAPI_GCOMPILED_HPP
+
+#include <vector>
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/own/assert.hpp>
+#include <opencv2/gapi/garg.hpp>
+
+namespace cv {
+
+// This class represents a compiled computation.
+// In theory (and ideally), it can be used w/o the rest of APIs.
+// In theory (and ideally), it can be serialized/deserialized.
+// It can enable scenarios like deployment to an autonomous device, FuSa, etc.
+//
+// Currently GCompiled assumes all GMats you used to pass data to G-API
+// are valid and not destroyed while you use a GCompiled object.
+//
+// FIXME: In future, there should be a way to name I/O objects and specify it
+// to GCompiled externally (for example, when it is loaded on the target system).
+
+/**
+ * \addtogroup gapi_main_classes
+ * @{
+ */
+/**
+ * @brief Represents a compiled computation (graph). Can only be used
+ * with image / data formats & resolutions it was compiled for, with
+ * some exceptions.
+ *
+ * This class represents a product of graph compilation (calling
+ * cv::GComputation::compile()). Objects of this class actually do
+ * data processing, and graph execution is encapsulated into objects
+ * of this class. Execution model itself depends on kernels and
+ * backends which were used during the compilation, see @ref
+ * gapi_compile_args for details.
+ *
+ * In a general case, GCompiled objects can be applied to data only in
+ * the formats/resolutions they were compiled for (see @ref
+ * gapi_meta_args). However, if the underlying backends allow, a
+ * compiled object can be _reshaped_ to handle data (images) of
+ * different resolution, though formats and types must remain the same.
+ *
+ * GCompiled is very similar to `std::function<>` in its semantics --
+ * running it looks like a function call in the user code.
+ *
+ * At the moment, GCompiled objects are not reentrant -- generally,
+ * the objects are stateful since graph execution itself is a stateful
+ * process and this state is now maintained in GCompiled's own memory
+ * (not on the process stack).
+ *
+ * At the same time, two different GCompiled objects produced from the
+ * single cv::GComputation are completely independent and can be used
+ * concurrently.
+ *
+ * @sa GStreamingCompiled
+ */
+class GAPI_EXPORTS GCompiled
+{
+public:
+    /// @private
+    class GAPI_EXPORTS Priv;
+
+    /**
+     * @brief Constructs an empty object
+     */
+    GCompiled();
+
+    /**
+     * @brief Run the compiled computation, a generic version.
+     *
+     * @param ins vector of inputs to process.
+     * @param outs vector of outputs to produce.
+     *
+     * Input/output vectors must have the same number of elements as
+     * defined in the cv::GComputation protocol (at the moment of its
+     * construction). Shapes of elements also must conform to protocol
+     * (e.g. cv::Mat needs to be passed where cv::GMat has been
+     * declared as input, and so on). Run-time exception is generated
+     * otherwise.
+     *
+     * Objects in output vector may remain empty (like cv::Mat) --
+     * G-API will automatically initialize output objects to proper formats.
+     *
+     * @note Don't construct GRunArgs/GRunArgsP objects manually, use
+     * cv::gin()/cv::gout() wrappers instead.
+     */
+    void operator() (GRunArgs &&ins, GRunArgsP &&outs);          // Generic arg-to-arg
+#if !defined(GAPI_STANDALONE)
+
+    /**
+     * @brief Execute an unary computation
+     *
+     * @overload
+     * @param in input cv::Mat for unary computation
+     * @param out output cv::Mat for unary computation
+     * process.
+     */
+    void operator() (cv::Mat in, cv::Mat &out);                  // Unary overload
+
+    /**
+     * @brief Execute an unary computation
+     *
+     * @overload
+     * @param in input cv::Mat for unary computation
+     * @param out output cv::Scalar for unary computation
+     * process.
+     */
+    void operator() (cv::Mat in, cv::Scalar &out);               // Unary overload (scalar)
+
+    /**
+     * @brief Execute a binary computation
+     *
+     * @overload
+     * @param in1 first input cv::Mat for binary computation
+     * @param in2 second input cv::Mat for binary computation
+     * @param out output cv::Mat for binary computation
+     * process.
+     */
+    void operator() (cv::Mat in1, cv::Mat in2, cv::Mat &out);    // Binary overload
+
+    /**
+     * @brief Execute a binary computation
+     *
+     * @overload
+     * @param in1 first input cv::Mat for binary computation
+     * @param in2 second input cv::Mat for binary computation
+     * @param out output cv::Scalar for binary computation
+     * process.
+     */
+    void operator() (cv::Mat in1, cv::Mat in2, cv::Scalar &out); // Binary overload (scalar)
+
+    /**
+     * @brief Execute a computation with arbitrary number of
+     * inputs/outputs.
+     *
+     * @overload
+     * @param ins vector of input cv::Mat objects to process by the
+     * computation.
+     * @param outs vector of output cv::Mat objects to produce by the
+     * computation.
+     *
+     * Numbers of elements in ins/outs vectors must match numbers of
+     * inputs/outputs which were used to define the source GComputation.
+     */
+    void operator() (const std::vector<cv::Mat> &ins,            // Compatibility overload
+                     const std::vector<cv::Mat> &outs);
+#endif  // !defined(GAPI_STANDALONE)
+    /// @private
+    Priv& priv();
+
+    /**
+     * @brief Check if compiled object is valid (non-empty)
+     *
+     * @return true if the object is runnable (valid), false otherwise
+     */
+    explicit operator bool () const;
+
+    /**
+     * @brief Vector of metadata this graph was compiled for.
+     *
+     * @return If the object has not been _reshaped_, the return value
+     * is the same vector which was passed to cv::GComputation::compile()
+     * to produce this compiled object. Otherwise, it is the latest
+     * metadata vector passed to reshape() (if that call was
+     * successful).
+     */
+    const GMetaArgs& metas() const; // Meta passed to compile()
+
+    /**
+     * @brief Vector of metadata descriptions of graph outputs
+     *
+     * @return vector with formats/resolutions of graph's output
+     * objects, auto-inferred from input metadata vector by
+     * operations which form this computation.
+     *
+     * @note GCompiled objects produced from the same
+     * cv::GComputation graph with different input metas may return
+     * different values in this vector.
+     */
+    const GMetaArgs& outMetas() const;
+
+    /**
+     * @brief Check if the underlying backends support reshape or not.
+     *
+     * @return true if supported, false otherwise.
+     */
+    bool canReshape() const;
+
+    /**
+     * @brief Reshape a compiled graph to support new image
+     * resolutions.
+     *
+     * Throws an exception if an error occurs.
+     *
+     * @param inMetas new metadata to reshape on. Vector size and
+     * metadata shapes must match the computation's protocol.
+     * @param args compilation arguments to use.
+     */
+    // FIXME: Why it requires compile args?
+    void reshape(const GMetaArgs& inMetas, const GCompileArgs& args);
+
+    /**
+     * @brief Prepare inner kernels states for a new video-stream.
+     *
+     * GCompiled objects may be used to process video streams frame by frame.
+     * In this case, a GCompiled is called on every image frame individually.
+     * Starting OpenCV 4.4, some kernels in the graph may have their internal
+     * states (see GAPI_OCV_KERNEL_ST for the OpenCV backend).
+     * In this case, if user starts processing another video stream with
+     * this GCompiled, this method needs to be called to let kernels re-initialize
+     * their internal states to a new video stream.
+     */
+    void prepareForNewStream();
+
+protected:
+    /// @private
+    std::shared_ptr<Priv> m_priv;
+};
+/** @} */
+
+}
+
+#endif // OPENCV_GAPI_GCOMPILED_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcompiled_async.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcompiled_async.hpp
new file mode 100644
index 000000000000..a0c2917d6a85
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcompiled_async.hpp
@@ -0,0 +1,73 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCOMPILED_ASYNC_HPP
+#define OPENCV_GAPI_GCOMPILED_ASYNC_HPP
+
+#include <future>           //for std::future
+#include <exception>        //for std::exception_ptr
+#include <functional>       //for std::function
+#include <opencv2/gapi/garg.hpp>
+#include <opencv2/gapi/own/exports.hpp>
+
+namespace cv {
+    //fwd declaration
+    class GCompiled;
+
+namespace gapi{
+namespace wip {
+    class GAsyncContext;
+    /**
+    These functions asynchronously (i.e. probably on a separate thread of execution) call GCompiled::operator() member function of their first argument with copies of the rest of the arguments (except callback) passed in.
+    The difference between the functions is the way to get the completion notification (via a callback or by waiting on a std::future object).
+    If an exception occurs during execution of apply, it is transferred to the callback (via function parameter) or passed to the future (and will be thrown on a call to std::future::get).
+
+    N.B. :
+    Input arguments are copied on call to async function (actually on call to cv::gin) and thus do not have to outlive the actual completion of asynchronous activity.
+    While output arguments are "captured" by reference(pointer) and therefore _must_ outlive the asynchronous activity
+    (i.e. live at least until callback is called or future is unblocked)
+
+    @param gcmpld       Compiled computation (graph) to start asynchronously
+    @param callback     Callback to be called when execution of gcmpld is done
+    @param ins          Input parameters for gcmpld
+    @param outs         Output parameters for gcmpld
+    */
+    GAPI_EXPORTS void                async(GCompiled& gcmpld, std::function<void(std::exception_ptr)>&& callback, GRunArgs &&ins, GRunArgsP &&outs);
+
+    /** @overload
+    @param gcmpld       Compiled computation (graph) to run asynchronously
+    @param callback     Callback to be called when execution of gcmpld is done
+    @param ins          Input parameters for gcmpld
+    @param outs         Output parameters for gcmpld
+    @param ctx          Context this request belongs to
+    @see   async GAsyncContext
+    */
+    GAPI_EXPORTS void                async(GCompiled& gcmpld, std::function<void(std::exception_ptr)>&& callback, GRunArgs &&ins, GRunArgsP &&outs, GAsyncContext& ctx);
+
+    /** @overload
+    @param gcmpld       Compiled computation (graph) to run asynchronously
+    @param ins          Input parameters for gcmpld
+    @param outs         Output parameters for gcmpld
+    @return             std::future<void> object to wait for completion of async operation
+    @see async
+    */
+    GAPI_EXPORTS std::future<void>   async(GCompiled& gcmpld, GRunArgs &&ins, GRunArgsP &&outs);
+
+    /** @overload
+    @param gcmpld       Compiled computation (graph) to run asynchronously
+    @param ins          Input parameters for gcmpld
+    @param outs         Output parameters for gcmpld
+    @param ctx          Context this request belongs to
+    @return             std::future<void> object to wait for completion of async operation
+    @see   async GAsyncContext
+    */
+    GAPI_EXPORTS std::future<void>   async(GCompiled& gcmpld, GRunArgs &&ins, GRunArgsP &&outs, GAsyncContext& ctx);
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_GCOMPILED_ASYNC_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcompoundkernel.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcompoundkernel.hpp
new file mode 100644
index 000000000000..df0ce340457a
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcompoundkernel.hpp
@@ -0,0 +1,139 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2019 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCOMPOUNDKERNEL_HPP
+#define OPENCV_GAPI_GCOMPOUNDKERNEL_HPP
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/gcommon.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+#include <opencv2/gapi/garg.hpp>
+
+namespace cv {
+namespace gapi
+{
+namespace compound
+{
+    // FIXME: the user does not need to know about this function.
+    // It is needed so that the user may define compound kernels (as CPU kernels)
+    GAPI_EXPORTS cv::gapi::GBackend backend();
+} // namespace compound
+} // namespace gapi
+
+namespace detail
+{
+
+struct GCompoundContext
+{
+    explicit GCompoundContext(const GArgs& in_args);
+    template<typename T>
+    const T& inArg(int input) { return m_args.at(input).get<T>(); } // reads the input stored at the given index as T
+
+    GArgs m_args;    // input arguments; slots are overwritten by some get_compound_in<>::get() specializations
+    GArgs m_results; // assigned by GCompoundCallHelper::expand_impl() from the kernel's expand() result
+};
+
+class GAPI_EXPORTS GCompoundKernel
+{
+// Compound kernel must use all of its inputs
+public:
+    using F = std::function<void(GCompoundContext& ctx)>;
+
+    explicit GCompoundKernel(const F& f);
+    void apply(GCompoundContext& ctx);
+
+protected:
+    F m_f;
+};
+
+template<typename T> struct get_compound_in
+{
+    static T get(GCompoundContext &ctx, int idx) { return ctx.inArg<T>(idx); } // generic case: return a copy of the stored input
+};
+
+template<typename U> struct get_compound_in<cv::GArray<U>>
+{
+    static cv::GArray<U> get(GCompoundContext &ctx, int idx)
+    {
+        auto array = cv::GArray<U>(); // a fresh GArray is created instead of reading the stored input
+        ctx.m_args[idx] = GArg(array); // the context slot is overwritten with the new object
+        return array;
+    }
+};
+
+template<typename U> struct get_compound_in<cv::GOpaque<U>>
+{
+    static cv::GOpaque<U> get(GCompoundContext &ctx, int idx)
+    {
+        auto opaq = cv::GOpaque<U>(); // a fresh GOpaque is created instead of reading the stored input
+        ctx.m_args[idx] = GArg(opaq); // the context slot is overwritten with the new object
+        return opaq;
+    }
+};
+
+template<> struct get_compound_in<cv::GMatP>
+{
+    static cv::GMatP get(GCompoundContext &ctx, int idx)
+    {
+        auto mat = cv::GMatP(); // a fresh GMatP is created instead of reading the stored input
+        ctx.m_args[idx] = GArg(mat); // the context slot is overwritten with the new object
+        return mat;
+    }
+};
+
+template<typename, typename, typename>
+struct GCompoundCallHelper;
+
+template<typename Impl, typename... Ins, typename... Outs>
+struct GCompoundCallHelper<Impl, std::tuple<Ins...>, std::tuple<Outs...> >
+{
+    template<int... IIs, int... OIs>
+    static void expand_impl(GCompoundContext &ctx, detail::Seq<IIs...>, detail::Seq<OIs...>)
+    {
+        auto result = Impl::expand(get_compound_in<Ins>::get(ctx, IIs)...); // fetch every input by its index and call the kernel's expand()
+        auto tuple_return = tuple_wrap_helper<decltype(result)>::get(std::move(result)); // presumably normalizes the result to a tuple - confirm in tuple_wrap_helper
+        ctx.m_results = { cv::GArg(std::get<OIs>(tuple_return))... }; // store every tuple element as a GArg result
+    }
+
+    static void expand(GCompoundContext &ctx)
+    {
+        expand_impl(ctx, // index sequences are built from the sizes of the Ins/Outs packs
+                    typename detail::MkSeq<sizeof...(Ins)>::type(),
+                    typename detail::MkSeq<sizeof...(Outs)>::type());
+    }
+};
+
+template<class Impl, class K>
+class GCompoundKernelImpl: public cv::detail::GCompoundCallHelper<Impl, typename K::InArgs, typename K::OutArgs>,
+                           public cv::detail::KernelTag
+{
+    using P = cv::detail::GCompoundCallHelper<Impl, typename K::InArgs, typename K::OutArgs>; // shorthand for the call-helper base
+
+public:
+    using API = K; // the kernel interface this implementation is declared for
+
+    static cv::gapi::GBackend backend() { return cv::gapi::compound::backend(); }
+    static GCompoundKernel    kernel()  { return GCompoundKernel(&P::expand);   } // wraps the helper's static expand()
+};
+
+} // namespace detail
+
+
+/**
+ * Declares a new compound kernel as a struct derived from
+ * cv::detail::GCompoundKernelImpl<Name, API>. See the
+ * [documentation chapter](@ref gapi_kernel_compound) for more details.
+ *
+ * @param Name type name for new kernel
+ * @param API the interface this kernel implements
+ */
+#define GAPI_COMPOUND_KERNEL(Name, API) \
+    struct Name: public cv::detail::GCompoundKernelImpl<Name, API>
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GCOMPOUNDKERNEL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcomputation.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcomputation.hpp
new file mode 100644
index 000000000000..196eb37c6b4f
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcomputation.hpp
@@ -0,0 +1,581 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCOMPUTATION_HPP
+#define OPENCV_GAPI_GCOMPUTATION_HPP
+
+#include <functional>
+
+#include <opencv2/gapi/util/util.hpp>
+#include <opencv2/gapi/gcommon.hpp>
+#include <opencv2/gapi/gproto.hpp>
+#include <opencv2/gapi/garg.hpp>
+#include <opencv2/gapi/gcompiled.hpp>
+#include <opencv2/gapi/gstreaming.hpp>
+
+namespace cv {
+
+namespace detail
+{
+    // FIXME: move to algorithm, cover with separate tests
+    // FIXME: replace with O(1) version (both memory and compilation time)
+    template<typename...>
+    struct last_type; // primary template: declared only, so an empty pack has no ::type
+
+    template<typename T>
+    struct last_type<T> { using type = T;}; // base case: a single type is the last type
+
+    template<typename T, typename... Ts>
+    struct last_type<T, Ts...> { using type = typename last_type<Ts...>::type; }; // recursive case: drop the head, look in the tail
+
+    template<typename... Ts>
+    using last_type_t = typename last_type<Ts...>::type; // shorthand for last_type<Ts...>::type
+}
+
+// Forward-declare the serialization objects
+namespace gapi {
+namespace s11n {
+    struct IIStream;
+    struct IOStream;
+} // namespace s11n
+} // namespace gapi
+
+/**
+ * \addtogroup gapi_main_classes
+ * @{
+ *
+ * @brief G-API classes for constructed and compiled graphs.
+ */
+
+/**
+ * @brief GComputation class represents a captured computation
+ * graph. GComputation objects form boundaries for expression code
+ * user writes with G-API, allowing to compile and execute it.
+ *
+ * G-API computations are defined with input/output data
+ * objects. G-API will track automatically which operations connect
+ * specified outputs to the inputs, forming up a call graph to be
+ * executed. The below example expresses calculation of Sobel operator
+ * for edge detection (\f$G = \sqrt{G_x^2 + G_y^2}\f$):
+ *
+ * @snippet samples/cpp/tutorial_code/gapi/doc_snippets/api_ref_snippets.cpp graph_def
+ *
+ * Full pipeline can be now captured with this object declaration:
+ *
+ * @snippet samples/cpp/tutorial_code/gapi/doc_snippets/api_ref_snippets.cpp graph_cap_full
+ *
+ * Input/output data objects on which a call graph should be
+ * reconstructed are passed using special wrappers cv::GIn and
+ * cv::GOut. G-API will track automatically which operations form a
+ * path from inputs to outputs and build the execution graph appropriately.
+ *
+ * Note that cv::GComputation doesn't take ownership of the data objects
+ * it is defined on. Moreover, multiple GComputation objects may be
+ * defined on the same expressions, e.g. a smaller pipeline which
+ * expects that image gradients are already pre-calculated may be
+ * defined like this:
+ *
+ * @snippet samples/cpp/tutorial_code/gapi/doc_snippets/api_ref_snippets.cpp graph_cap_sub
+ *
+ * The resulting graph would expect two inputs and produce one
+ * output. In this case, it doesn't matter if gx/gy data objects are
+ * results of cv::gapi::Sobel operators -- G-API will stop unrolling
+ * expressions and building the underlying graph once it reaches these
+ * data objects.
+ *
+ * The way how GComputation is defined is important as its definition
+ * specifies graph _protocol_ -- the way how the graph should be
+ * used. Protocol is defined by number of inputs, number of outputs,
+ * and shapes of inputs and outputs.
+ *
+ * In the above example, sobelEdge expects one Mat on input and
+ * produces one Mat; while sobelEdgeSub expects two Mats on input and
+ * produces one Mat. GComputation's protocol defines how other
+ * computation methods should be used -- cv::GComputation::compile() and
+ * cv::GComputation::apply(). For example, if a graph is defined on
+ * two GMat inputs, two cv::Mat objects have to be passed to apply()
+ * for execution. GComputation checks protocol correctness in runtime
+ * so passing a different number of objects in apply() or passing
+ * cv::Scalar instead of cv::Mat there would compile well as a C++
+ * source but raise an exception in run-time. G-API also comes with a
+ * typed wrapper cv::GComputationT<> which introduces this type-checking in
+ * compile-time.
+ *
+ * cv::GComputation itself is a thin object which just captures what
+ * the graph is. The compiled graph (which actually processes data) is
+ * represented by class GCompiled. Use compile() method to generate a
+ * compiled graph with given compile options. cv::GComputation can
+ * also be used to process data with implicit graph compilation
+ * on-the-fly, see apply() for details.
+ *
+ * GComputation is a reference-counted object -- once defined, all its
+ * copies will refer to the same instance.
+ *
+ * @sa GCompiled
+ */
+class GAPI_EXPORTS_W GComputation
+{
+public:
+    class Priv;
+    typedef std::function<GComputation()> Generator;
+
+    // Various constructors enable different ways to define a computation: /////
+    // 1. Generic constructors
+    /**
+     * @brief Define a computation using a generator function.
+     *
+     * Graph can be defined in-place directly at the moment of its
+     * construction with a lambda:
+     *
+     * @snippet samples/cpp/tutorial_code/gapi/doc_snippets/api_ref_snippets.cpp graph_gen
+     *
+     * This may be useful since all temporary objects (cv::GMats) and
+     * namespaces can be localized to scope of lambda, without
+     * contaminating the parent scope with probably unnecessary objects
+     * and information.
+     *
+     * @param gen generator function which returns a cv::GComputation,
+     * see Generator.
+     */
+    GComputation(const Generator& gen);                // Generator
+                                                       // overload
+
+    /**
+     * @brief Generic GComputation constructor.
+     *
+     * Constructs a new graph with a given protocol, specified as a
+     * flow of operations connecting input/output objects. Throws if
+     * the passed boundaries are invalid, e.g. if there's no
+     * functional dependency (path) between given outputs and inputs.
+     *
+     * @param ins Input data vector.
+     * @param outs Output data vector.
+     *
+     * @note Don't construct GProtoInputArgs/GProtoOutputArgs objects
+     * directly, use cv::GIn()/cv::GOut() wrapper functions instead.
+     *
+     * @sa @ref gapi_data_objects
+     */
+    GAPI_WRAP GComputation(GProtoInputArgs &&ins,
+                           GProtoOutputArgs &&outs);             // Arg-to-arg overload
+
+    // 2. Syntax sugar and compatibility overloads
+    /**
+     * @brief Defines a unary (one input -- one output) computation
+     *
+     * @overload
+     * @param in input GMat of the defined unary computation
+     * @param out output GMat of the defined unary computation
+     */
+    GAPI_WRAP GComputation(GMat in, GMat out);  // Unary overload
+
+    /**
+     * @brief Defines a unary (one input -- one output) computation
+     *
+     * @overload
+     * @param in input GMat of the defined unary computation
+     * @param out output GScalar of the defined unary computation
+     */
+    GAPI_WRAP GComputation(GMat in, GScalar out);      // Unary overload (scalar)
+
+    /**
+     * @brief Defines a binary (two inputs -- one output) computation
+     *
+     * @overload
+     * @param in1 first input GMat of the defined binary computation
+     * @param in2 second input GMat of the defined binary computation
+     * @param out output GMat of the defined binary computation
+     */
+    GAPI_WRAP GComputation(GMat in1, GMat in2, GMat out);        // Binary overload
+
+    /**
+     * @brief Defines a binary (two inputs -- one output) computation
+     *
+     * @overload
+     * @param in1 first input GMat of the defined binary computation
+     * @param in2 second input GMat of the defined binary computation
+     * @param out output GScalar of the defined binary computation
+     */
+    GComputation(GMat in1, GMat in2, GScalar out);     // Binary
+                                                       // overload
+                                                       // (scalar)
+
+    /**
+     * @brief Defines a computation with arbitrary input/output number.
+     *
+     * @overload
+     * @param ins vector of inputs GMats for this computation
+     * @param outs vector of outputs GMats for this computation
+     *
+     * Use this overload for cases when number of computation
+     * inputs/outputs is not known in compile-time -- e.g. when graph
+     * is programmatically generated to build an image pyramid with
+     * the given number of levels, etc.
+     */
+    GComputation(const std::vector<GMat> &ins,         // Compatibility overload
+                 const std::vector<GMat> &outs);
+
+    // Various versions of apply(): ////////////////////////////////////////////
+    // 1. Generic apply()
+    /**
+     * @brief Compile graph on-the-fly and immediately execute it on
+     * the inputs data vectors.
+     *
+     * Number of input/output data objects must match GComputation's
+     * protocol, also types of host data objects (cv::Mat, cv::Scalar)
+     * must match the shapes of data objects from protocol (cv::GMat,
+     * cv::GScalar). If there's a mismatch, a run-time exception will
+     * be generated.
+     *
+     * Internally, a cv::GCompiled object is created for the given
+     * input format configuration, which then is executed on the input
+     * data immediately. cv::GComputation caches compiled objects
+     * produced within apply() -- if this method would be called next
+     * time with the same input parameters (image formats, image
+     * resolution, etc), the underlying compiled graph will be reused
+     * without recompilation. If new metadata doesn't match the cached
+     * one, the underlying compiled graph is regenerated.
+     *
+     * @note compile() always triggers a compilation process and
+     * produces a new GCompiled object regardless if a similar one has
+     * been cached via apply() or not.
+     *
+     * @param ins vector of input data to process. Don't create
+     * GRunArgs object manually, use cv::gin() wrapper instead.
+     * @param outs vector of output data to fill results in. cv::Mat
+     * objects may be empty in this vector, G-API will automatically
+     * initialize it with the required format & dimensions. Don't
+     * create GRunArgsP object manually, use cv::gout() wrapper instead.
+     * @param args a list of compilation arguments to pass to the
+     * underlying compilation process. Don't create GCompileArgs
+     * object manually, use cv::compile_args() wrapper instead.
+     *
+     * @sa @ref gapi_data_objects, @ref gapi_compile_args
+     */
+    void apply(GRunArgs &&ins, GRunArgsP &&outs, GCompileArgs &&args = {});       // Arg-to-arg overload
+
+    /// @private -- Exclude this function from OpenCV documentation
+    GAPI_WRAP GRunArgs apply(const cv::detail::ExtractArgsCallback  &callback,
+                                   GCompileArgs                    &&args = {});
+
+    /// @private -- Exclude this function from OpenCV documentation
+    void apply(const std::vector<cv::Mat>& ins,                                   // Compatibility overload
+               const std::vector<cv::Mat>& outs,
+               GCompileArgs &&args = {});
+
+    // 2. Syntax sugar and compatibility overloads
+#if !defined(GAPI_STANDALONE)
+    /**
+     * @brief Execute a unary computation (with compilation on the fly)
+     *
+     * @overload
+     * @param in input cv::Mat for unary computation
+     * @param out output cv::Mat for unary computation
+     * @param args compilation arguments for underlying compilation
+     * process.
+     */
+    void apply(cv::Mat in, cv::Mat &out, GCompileArgs &&args = {}); // Unary overload
+
+    /**
+     * @brief Execute a unary computation (with compilation on the fly)
+     *
+     * @overload
+     * @param in input cv::Mat for unary computation
+     * @param out output cv::Scalar for unary computation
+     * @param args compilation arguments for underlying compilation
+     * process.
+     */
+    void apply(cv::Mat in, cv::Scalar &out, GCompileArgs &&args = {}); // Unary overload (scalar)
+
+    /**
+     * @brief Execute a binary computation (with compilation on the fly)
+     *
+     * @overload
+     * @param in1 first input cv::Mat for binary computation
+     * @param in2 second input cv::Mat for binary computation
+     * @param out output cv::Mat for binary computation
+     * @param args compilation arguments for underlying compilation
+     * process.
+     */
+    void apply(cv::Mat in1, cv::Mat in2, cv::Mat &out, GCompileArgs &&args = {}); // Binary overload
+
+    /**
+     * @brief Execute a binary computation (with compilation on the fly)
+     *
+     * @overload
+     * @param in1 first input cv::Mat for binary computation
+     * @param in2 second input cv::Mat for binary computation
+     * @param out output cv::Scalar for binary computation
+     * @param args compilation arguments for underlying compilation
+     * process.
+     */
+    void apply(cv::Mat in1, cv::Mat in2, cv::Scalar &out, GCompileArgs &&args = {}); // Binary overload (scalar)
+
+    /**
+     * @brief Execute a computation with arbitrary number of
+     * inputs/outputs (with compilation on-the-fly).
+     *
+     * @overload
+     * @param ins vector of input cv::Mat objects to process by the
+     * computation.
+     * @param outs vector of output cv::Mat objects to produce by the
+     * computation.
+     * @param args compilation arguments for underlying compilation
+     * process.
+     *
+     * Numbers of elements in ins/outs vectors must match numbers of
+     * inputs/outputs which were used to define this GComputation.
+     */
+    void apply(const std::vector<cv::Mat>& ins,         // Compatibility overload
+                     std::vector<cv::Mat>& outs,
+               GCompileArgs &&args = {});
+#endif // !defined(GAPI_STANDALONE)
+    // Various versions of compile(): //////////////////////////////////////////
+    // 1. Generic compile() - requires metas to be passed as vector
+    /**
+     * @brief Compile the computation for specific input format(s).
+     *
+     * This method triggers compilation process and produces a new
+     * GCompiled object which then can process data of the given
+     * format. Passing data with different format to the compiled
+     * computation will generate a run-time exception.
+     *
+     * @param in_metas vector of input metadata configuration. Grab
+     * metadata from real data objects (like cv::Mat or cv::Scalar)
+     * using cv::descr_of(), or create it on your own.
+     * @param args compilation arguments for this compilation
+     * process. Compilation arguments directly affect what kind of
+     * executable object would be produced, e.g. which kernels (and
+     * thus, devices) would be used to execute computation.
+     *
+     * @return GCompiled, an executable computation compiled
+     * specifically for the given input parameters.
+     *
+     * @sa @ref gapi_compile_args
+     */
+    GCompiled compile(GMetaArgs &&in_metas, GCompileArgs &&args = {});
+
+    // 2. Syntax sugar - variadic list of metas, no extra compile args
+    // FIXME: SFINAE looks ugly in the generated documentation
+    /**
+     * @overload
+     *
+     * Takes a variadic parameter pack with metadata
+     * descriptors for which a compiled object needs to be produced.
+     *
+     * @return GCompiled, an executable computation compiled
+     * specifically for the given input parameters.
+     */
+    template<typename... Ts>
+    auto compile(const Ts&... metas) ->
+        typename std::enable_if<detail::are_meta_descrs<Ts...>::value, GCompiled>::type // overload participates only when are_meta_descrs<Ts...> holds
+    {
+        return compile(GMetaArgs{GMetaArg(metas)...}, GCompileArgs()); // wrap each descriptor in a GMetaArg; no extra compile arguments
+    }
+
+    // 3. Syntax sugar - variadic list of metas, extra compile args
+    // (it seems optional parameters don't work well when a variadic template
+    // parameter pack comes first)
+    //
+    // Ideally it should look like:
+    //
+    //     template<typename... Ts>
+    //     GCompiled compile(const Ts&... metas, GCompileArgs &&args)
+    //
+    // But not all compilers can handle this (and seems they shouldn't be able to).
+    // FIXME: SFINAE looks ugly in the generated documentation
+    /**
+     * @overload
+     *
+     * Takes a variadic parameter pack with metadata
+     * descriptors for which a compiled object needs to be produced,
+     * followed by GCompileArgs object representing compilation
+     * arguments for this process.
+     *
+     * @return GCompiled, an executable computation compiled
+     * specifically for the given input parameters.
+     */
+    template<typename... Ts>
+    auto compile(const Ts&... meta_and_compile_args) ->
+        typename std::enable_if<detail::are_meta_descrs_but_last<Ts...>::value
+                                && std::is_same<GCompileArgs, detail::last_type_t<Ts...> >::value, // last argument must be exactly GCompileArgs
+                                GCompiled>::type
+    {
+        //FIXME: wrapping meta_and_compile_args into a tuple to unwrap them inside a helper function is overkill
+        return compile(std::make_tuple(meta_and_compile_args...),
+                       typename detail::MkSeq<sizeof...(Ts)-1>::type()); // sequence sized by the argument count minus the trailing GCompileArgs
+    }
+
+
+    // FIXME: Document properly in the Doxygen format
+    // Video-oriented pipeline compilation:
+    // 1. A generic version
+    /**
+     * @brief Compile the computation for streaming mode.
+     *
+     * This method triggers compilation process and produces a new
+     * GStreamingCompiled object which then can process video stream
+     * data of the given format. Passing a stream in a different
+     * format to the compiled computation will generate a run-time
+     * exception.
+     *
+     * @param in_metas vector of input metadata configuration. Grab
+     * metadata from real data objects (like cv::Mat or cv::Scalar)
+     * using cv::descr_of(), or create it on your own.
+     *
+     * @param args compilation arguments for this compilation
+     * process. Compilation arguments directly affect what kind of
+     * executable object would be produced, e.g. which kernels (and
+     * thus, devices) would be used to execute computation.
+     *
+     * @return GStreamingCompiled, a streaming-oriented executable
+     * computation compiled specifically for the given input
+     * parameters.
+     *
+     * @sa @ref gapi_compile_args
+     */
+    GAPI_WRAP GStreamingCompiled compileStreaming(GMetaArgs &&in_metas, GCompileArgs &&args = {});
+
+    /**
+     * @brief Compile the computation for streaming mode.
+     *
+     * This method triggers compilation process and produces a new
+     * GStreamingCompiled object which then can process video stream
+     * data in any format. Underlying mechanisms will be adjusted to
+     * every new input video stream automatically, but please note that
+     * _not all_ existing backends support this (see reshape()).
+     *
+     * @param args compilation arguments for this compilation
+     * process. Compilation arguments directly affect what kind of
+     * executable object would be produced, e.g. which kernels (and
+     * thus, devices) would be used to execute computation.
+     *
+     * @return GStreamingCompiled, a streaming-oriented executable
+     * computation compiled for any input image format.
+     *
+     * @sa @ref gapi_compile_args
+     */
+    GAPI_WRAP GStreamingCompiled compileStreaming(GCompileArgs &&args = {});
+
+    /// @private -- Exclude this function from OpenCV documentation
+    GAPI_WRAP GStreamingCompiled compileStreaming(const cv::detail::ExtractMetaCallback &callback,
+                                                        GCompileArgs                   &&args = {});
+
+    // 2. Direct metadata version
+    /**
+     * @overload
+     *
+     * Takes a variadic parameter pack with metadata
+     * descriptors for which a compiled object needs to be produced.
+     *
+     * @return GStreamingCompiled, a streaming-oriented executable
+     * computation compiled specifically for the given input
+     * parameters.
+     */
+    template<typename... Ts>
+    auto compileStreaming(const Ts&... metas) ->
+        typename std::enable_if<detail::are_meta_descrs<Ts...>::value, GStreamingCompiled>::type // overload participates only when are_meta_descrs<Ts...> holds
+    {
+        return compileStreaming(GMetaArgs{GMetaArg(metas)...}, GCompileArgs()); // wrap each descriptor in a GMetaArg; no extra compile arguments
+    }
+
+    // 3. Direct metadata + compile arguments version
+    /**
+     * @overload
+     *
+     * Takes a variadic parameter pack with metadata
+     * descriptors for which a compiled object needs to be produced,
+     * followed by GCompileArgs object representing compilation
+     * arguments for this process.
+     *
+     * @return GStreamingCompiled, a streaming-oriented executable
+     * computation compiled specifically for the given input
+     * parameters.
+     */
+    template<typename... Ts>
+    auto compileStreaming(const Ts&... meta_and_compile_args) ->
+        typename std::enable_if<detail::are_meta_descrs_but_last<Ts...>::value
+                                && std::is_same<GCompileArgs, detail::last_type_t<Ts...> >::value, // last argument must be exactly GCompileArgs
+                                GStreamingCompiled>::type
+    {
+        //FIXME: wrapping meta_and_compile_args into a tuple to unwrap them inside a helper function is overkill
+        return compileStreaming(std::make_tuple(meta_and_compile_args...),
+                                typename detail::MkSeq<sizeof...(Ts)-1>::type()); // sequence sized by the argument count minus the trailing GCompileArgs
+    }
+
+    // Internal use only
+    /// @private
+    Priv& priv();
+    /// @private
+    const Priv& priv() const;
+    /// @private
+    explicit GComputation(cv::gapi::s11n::IIStream &);
+    /// @private
+    void serialize(cv::gapi::s11n::IOStream &) const;
+
+protected:
+
+    // 4. Helper methods for (3)
+    /// @private
+    template<typename... Ts, int... IIs>
+    GCompiled compile(const std::tuple<Ts...> &meta_and_compile_args, detail::Seq<IIs...>)
+    {
+        GMetaArgs meta_args = {GMetaArg(std::get<IIs>(meta_and_compile_args))...}; // tuple elements at positions IIs are the metas
+        GCompileArgs comp_args = std::get<sizeof...(Ts)-1>(meta_and_compile_args); // last tuple element holds the compile arguments (copied out of the const tuple)
+        return compile(std::move(meta_args), std::move(comp_args)); // forwards to the generic compile(GMetaArgs&&, GCompileArgs&&)
+    }
+    template<typename... Ts, int... IIs>
+    GStreamingCompiled compileStreaming(const std::tuple<Ts...> &meta_and_compile_args, detail::Seq<IIs...>)
+    {
+        GMetaArgs meta_args = {GMetaArg(std::get<IIs>(meta_and_compile_args))...}; // tuple elements at positions IIs are the metas
+        GCompileArgs comp_args = std::get<sizeof...(Ts)-1>(meta_and_compile_args); // last tuple element holds the compile arguments (copied out of the const tuple)
+        return compileStreaming(std::move(meta_args), std::move(comp_args)); // forwards to the generic compileStreaming(GMetaArgs&&, GCompileArgs&&)
+    }
+    void recompile(GMetaArgs&& in_metas, GCompileArgs &&args);
+    /// @private
+    std::shared_ptr<Priv> m_priv;
+};
+/** @} */
+
+namespace gapi
+{
+    // FIXME: all these standalone functions need to be added to some
+    // common documentation section
+    /**
+     * @brief Define a tagged island (subgraph) within a computation.
+     *
+     * Declare an Island tagged with `name` and defined from `ins` to `outs`
+     * (exclusively, as ins/outs are data objects, and regioning is done on
+     * operations level).
+     * Throws if any operation between `ins` and `outs` is already assigned
+     * to another island.
+     *
+     * Islands allow to partition graph into subgraphs, fine-tuning
+     * the way it is scheduled by the underlying executor.
+     *
+     * @param name name of the Island to create
+     * @param ins vector of input data objects where the subgraph
+     * begins
+     * @param outs vector of output data objects where the subgraph
+     * ends.
+     *
+     * The way how an island is defined is similar to how
+     * cv::GComputation is defined on input/output data objects.
+     * Same rules apply here as well -- if there's no functional
+     * dependency between inputs and outputs or not enough
+     * input data objects were specified to properly calculate all
+     * outputs, an exception is thrown.
+     *
+     * Use cv::GIn() / cv::GOut() to specify input/output vectors.
+     */
+    void GAPI_EXPORTS island(const std::string &name,
+                             GProtoInputArgs  &&ins,
+                             GProtoOutputArgs &&outs);
+} // namespace gapi
+
+} // namespace cv
+#endif // OPENCV_GAPI_GCOMPUTATION_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcomputation_async.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcomputation_async.hpp
new file mode 100644
index 000000000000..8af603efead4
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gcomputation_async.hpp
@@ -0,0 +1,69 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019 Intel Corporation
+
+#ifndef OPENCV_GAPI_GCOMPUTATION_ASYNC_HPP
+#define OPENCV_GAPI_GCOMPUTATION_ASYNC_HPP
+
+
+#include <future>                           //for std::future
+#include <exception>                        //for std::exception_ptr
+#include <functional>                       //for std::function
+#include <opencv2/gapi/garg.hpp>            //for GRunArgs, GRunArgsP
+#include <opencv2/gapi/gcommon.hpp>         //for GCompileArgs
+#include <opencv2/gapi/own/exports.hpp>
+
+
+namespace cv {
+    //fwd declaration
+    class GComputation;
+namespace gapi {
+namespace wip  {
+    class GAsyncContext;
+    /** In contrast to async() functions, these do call GComputation::apply() member function of the GComputation passed in.
+
+    @param gcomp        Computation (graph) to run asynchronously
+    @param callback     Callback to be called when execution of gcomp is done
+    @param ins          Input parameters for gcomp
+    @param outs         Output parameters for gcomp
+    @param args         Compile arguments to pass to GComputation::apply()
+    @see                async
+    */
+    GAPI_EXPORTS void                async_apply(GComputation& gcomp, std::function<void(std::exception_ptr)>&& callback, GRunArgs &&ins, GRunArgsP &&outs, GCompileArgs &&args = {});
+    /** @overload
+    @param gcomp        Computation (graph) to run asynchronously
+    @param callback     Callback to be called when execution of gcomp is done
+    @param ins          Input parameters for gcomp
+    @param outs         Output parameters for gcomp
+    @param args         Compile arguments to pass to GComputation::apply()
+    @param ctx          Context this request belongs to
+    @see                async_apply async GAsyncContext
+    */
+    GAPI_EXPORTS void                async_apply(GComputation& gcomp, std::function<void(std::exception_ptr)>&& callback, GRunArgs &&ins, GRunArgsP &&outs, GCompileArgs &&args, GAsyncContext& ctx);
+    /** @overload
+    @param gcomp        Computation (graph) to run asynchronously
+    @param ins          Input parameters for gcomp
+    @param outs         Output parameters for gcomp
+    @param args         Compile arguments to pass to GComputation::apply()
+    @return             std::future<void> object to wait for completion of async operation
+    @see                async_apply async
+    */
+    GAPI_EXPORTS std::future<void>   async_apply(GComputation& gcomp, GRunArgs &&ins, GRunArgsP &&outs, GCompileArgs &&args = {});
+    /** @overload
+    @param gcomp        Computation (graph) to run asynchronously
+    @param ins          Input parameters for gcomp
+    @param outs         Output parameters for gcomp
+    @param args         Compile arguments to pass to GComputation::apply()
+    @param ctx          Context this request belongs to
+    @return             std::future<void> object to wait for completion of async operation
+    @see                async_apply async GAsyncContext
+    */
+    GAPI_EXPORTS std::future<void>   async_apply(GComputation& gcomp, GRunArgs &&ins, GRunArgsP &&outs, GCompileArgs &&args,  GAsyncContext& ctx);
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+
+#endif //OPENCV_GAPI_GCOMPUTATION_ASYNC_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gframe.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gframe.hpp
new file mode 100644
index 000000000000..54fb30789e33
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gframe.hpp
@@ -0,0 +1,113 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2020 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GFRAME_HPP
+#define OPENCV_GAPI_GFRAME_HPP
+
+#include <ostream>
+#include <memory>                 // std::shared_ptr
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/gcommon.hpp> // GShape
+
+#include <opencv2/gapi/gmat.hpp>
+#include <opencv2/gapi/own/assert.hpp>
+
+// TODO GAPI_EXPORTS or so
+namespace cv
+{
+// Forward declarations; GNode and GOrigin are internal
+// (user-inaccessible) classes.
+class GNode;
+struct GOrigin;
+
+/** \addtogroup gapi_data_objects
+ * @{
+ */
+/**
+ * @brief GFrame class represents an image or media frame in the graph.
+ *
+ * GFrame doesn't store any data itself, instead it describes a
+ * functional relationship between operations consuming and producing
+ * GFrame objects.
+ *
+ * GFrame is introduced to handle various media formats (e.g., NV12 or
+ * I420) under the same type. Various image formats may differ in the
+ * number of planes (e.g. two for NV12, three for I420) and the pixel
+ * layout inside. GFrame type allows to handle these media formats in
+ * the graph uniformly -- the graph structure will not change if the
+ * media format changes, e.g. a different camera or decoder is used
+ * with the same graph. G-API provides a number of operations which
+ * operate directly on GFrame, like `infer<>()` or
+ * renderFrame(); these operations are expected to handle different
+ * media formats inside. There is also a number of accessor
+ * operations like BGR(), Y(), UV() -- these operations provide
+ * access to frame's data in the familiar cv::GMat form, which can be
+ * used with the majority of the existing G-API operations. These
+ * accessor functions may perform color space conversion on the fly if
+ * the image format of the GFrame they are applied to differs from the
+ * operation's semantic (e.g. the BGR() accessor is called on an NV12
+ * image frame).
+ *
+ * GFrame is a virtual counterpart of cv::MediaFrame.
+ *
+ * @sa cv::MediaFrame, cv::GFrameDesc, BGR(), Y(), UV(), infer<>().
+ */
+class GAPI_EXPORTS_W_SIMPLE GFrame
+{
+public:
+    /**
+     * @brief Constructs an empty GFrame
+     *
+     * Normally, empty G-API data objects denote a starting point of
+     * the graph. When the result of some operation is assigned to an
+     * empty GFrame, it obtains a functional link to this operation
+     * (and is not empty anymore).
+     */
+    GAPI_WRAP GFrame();                      // Empty constructor
+
+    /// @private
+    GFrame(const GNode &n, std::size_t out); // Operation result constructor
+    /// @private
+    GOrigin& priv();                         // Internal use only
+    /// @private
+    const GOrigin& priv()  const;            // Internal use only
+
+private:
+    std::shared_ptr<GOrigin> m_priv;         // Shared handle to the internal (user-inaccessible) GOrigin
+};
+/** @} */
+
+enum class MediaFormat: int // format tag stored in GFrameDesc::fmt
+{
+    BGR = 0, // value 0: also what a value-initialized GFrameDesc{} carries
+    NV12,    // two-plane format (see the GFrame class description)
+    GRAY,    // presumably single-plane grayscale -- confirm against the implementation
+};
+
+/**
+ * \addtogroup gapi_meta_args
+ * @{
+ */
+struct GAPI_EXPORTS GFrameDesc // metadata of a frame: its format and its size
+{
+    MediaFormat fmt; // no default member initializer: use GFrameDesc{} or empty_gframe_desc() for a defined value
+    cv::Size size;   // frame dimensions -- presumably in pixels, confirm
+
+    bool operator== (const GFrameDesc &) const; // declared here, defined out of line
+};
+static inline GFrameDesc empty_gframe_desc() { return GFrameDesc{}; } // empty-brace init: fmt == MediaFormat::BGR (0), size is a default cv::Size
+/** @} */
+
+class MediaFrame;
+GAPI_EXPORTS GFrameDesc descr_of(const MediaFrame &frame);
+
+GAPI_EXPORTS std::ostream& operator<<(std::ostream& os, const cv::GFrameDesc &desc);
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GFRAME_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gkernel.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gkernel.hpp
new file mode 100644
index 000000000000..6ec6bf573d0a
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gkernel.hpp
@@ -0,0 +1,757 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2021 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GKERNEL_HPP
+#define OPENCV_GAPI_GKERNEL_HPP
+
+#include <functional>
+#include <iostream>
+#include <string>  // string
+#include <type_traits> // false_type, true_type
+#include <unordered_map> // map (for GKernelPackage)
+#include <utility> // pair, make_pair
+
+#include <opencv2/gapi/gcommon.hpp> // CompileArgTag
+#include <opencv2/gapi/util/util.hpp> // Seq
+#include <opencv2/gapi/gcall.hpp>
+#include <opencv2/gapi/garg.hpp>      // GArg
+#include <opencv2/gapi/gmetaarg.hpp>  // GMetaArg
+#include <opencv2/gapi/gtype_traits.hpp> // GTypeTraits
+#include <opencv2/gapi/util/compiler_hints.hpp> //suppress_unused_warning
+#include <opencv2/gapi/gtransform.hpp>
+
+namespace cv {
+
+struct GTypeInfo // Groups the shape, opaque kind and host constructor recorded for one type
+{
+    GShape                 shape;
+    cv::detail::OpaqueKind kind;
+    detail::HostCtor       ctor;
+};
+
+using GShapes    = std::vector<GShape>;
+using GKinds     = std::vector<cv::detail::OpaqueKind>;
+using GCtors     = std::vector<detail::HostCtor>;
+using GTypesInfo = std::vector<GTypeInfo>;
+
+// GKernel describes the kernel API to the system
+// FIXME: add attributes of a kernel (e.g. number and types
+// of inputs, etc)
+struct GAPI_EXPORTS GKernel
+{ // NB: member order matters -- GKernelType(M)::on() brace-initializes GKernel positionally
+    using M = std::function<GMetaArgs(const GMetaArgs &, const GArgs &)>; // (input metas, input args) -> output metas
+
+    std::string name;       // kernel ID, defined by its API (signature)
+    std::string tag;        // some (implementation-specific) tag
+    M           outMeta;    // generic adaptor to API::outMeta(...)
+    GShapes     outShapes;  // types (shapes) of the kernel's outputs
+    GKinds      inKinds;    // kinds of the kernel's inputs (see the TODO below the struct)
+    GCtors      outCtors;   // captured constructors for template output types
+    GKinds      outKinds;   // kinds of the kernel's outputs (see the TODO below the struct)
+};
+// TODO: It's questionable whether inKinds should really be here. Instead,
+// this information could come from meta.
+
+// GKernelImpl describes a particular kernel implementation to the system
+struct GAPI_EXPORTS GKernelImpl
+{
+    util::any         opaque;    // backend-specific opaque info
+    GKernel::M        outMeta;   // for deserialized graphs, the outMeta is taken from here
+};
+
+template<typename, typename> class GKernelTypeM;
+
+namespace detail
+{
+    ////////////////////////////////////////////////////////////////////////////
+    // yield() is used at graph construction time as a generic method to obtain
+    // the lazy "return value" of a G-API operation
+    //
+    template<typename T> struct Yield; // declared only: each supported output type specializes it below
+    template<> struct Yield<cv::GMat>
+    {
+        static inline cv::GMat yield(cv::GCall &call, int i) { return call.yield(i); }
+    };
+    template<> struct Yield<cv::GMatP>
+    {
+        static inline cv::GMatP yield(cv::GCall &call, int i) { return call.yieldP(i); }
+    };
+    template<> struct Yield<cv::GScalar>
+    {
+        static inline cv::GScalar yield(cv::GCall &call, int i) { return call.yieldScalar(i); }
+    };
+    template<typename U> struct Yield<cv::GArray<U> >
+    {
+        static inline cv::GArray<U> yield(cv::GCall &call, int i) { return call.yieldArray<U>(i); }
+    };
+    template<typename U> struct Yield<cv::GOpaque<U> >
+    {
+        static inline cv::GOpaque<U> yield(cv::GCall &call, int i) { return call.yieldOpaque<U>(i); }
+    };
+    template<> struct Yield<GFrame> // unqualified GFrame: this code sits inside namespace cv
+    {
+        static inline cv::GFrame yield(cv::GCall &call, int i) { return call.yieldFrame(i); }
+    };
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Helper classes which bring outMeta() marshalling to kernel
+    // implementations
+    //
+    // 1. MetaType establishes a G#Type -> G#Meta mapping between G-API dynamic
+    //    types and their metadata descriptor types.
+    //    This mapping is used to transform types when calling the outMeta() callback.
+    template<typename T> struct MetaType;
+    template<> struct MetaType<cv::GMat>    { using type = GMatDesc; };
+    template<> struct MetaType<cv::GMatP>   { using type = GMatDesc; };
+    template<> struct MetaType<cv::GFrame>  { using type = GFrameDesc; };
+    template<> struct MetaType<cv::GScalar> { using type = GScalarDesc; };
+    template<typename U> struct MetaType<cv::GArray<U> >  { using type = GArrayDesc; };
+    template<typename U> struct MetaType<cv::GOpaque<U> > { using type = GOpaqueDesc; };
+    template<typename T> struct MetaType    { using type = T; }; // primary template: opaque args passed as-is
+    // FIXME: Move it to type traits?
+
+    // 2. Hacky test based on MetaType: true when MetaType maps T to itself, i.e. T is not a G-* type
+    template<typename T> using is_nongapi_type = std::is_same<T, typename MetaType<T>::type>;
+
+    // 3. Two ways to transform input arguments to their meta - for G-* and non-G-* types:
+    template<typename T>
+    typename std::enable_if<!is_nongapi_type<T>::value, typename MetaType<T>::type>
+    ::type get_in_meta(const GMetaArgs &in_meta, const GArgs &, int idx)
+    {
+        return util::get<typename MetaType<T>::type>(in_meta.at(idx)); // G-* type: take its descriptor from in_meta
+    }
+
+    template<typename T>
+    typename std::enable_if<is_nongapi_type<T>::value, T>
+    ::type get_in_meta(const GMetaArgs &, const GArgs &in_args, int idx)
+    {
+        return in_args.at(idx).template get<T>(); // non-G-* type: hand over the argument value itself
+    }
+
+    // 4. The MetaHelper itself: an entity which generates the outMeta() call
+    //    based on the kernel signature, with arguments properly substituted.
+    // 4.1 - case for multiple return values
+    // FIXME: probably can be simplified with std::apply or analogue.
+    template<typename, typename, typename>
+    struct MetaHelper;
+
+    template<typename K, typename... Ins, typename... Outs>
+    struct MetaHelper<K, std::tuple<Ins...>, std::tuple<Outs...> >
+    {
+        template<int... IIs, int... OIs>
+        static GMetaArgs getOutMeta_impl(const GMetaArgs &in_meta,
+                                         const GArgs &in_args,
+                                         detail::Seq<IIs...>,
+                                         detail::Seq<OIs...>)
+        {
+            // FIXME: decay?
+            using R   = std::tuple<typename MetaType<Outs>::type...>; // one descriptor type per output
+            const R r = K::outMeta( get_in_meta<Ins>(in_meta, in_args, IIs)... ); // Ins and IIs expand in lockstep
+            return GMetaArgs{ GMetaArg(std::get<OIs>(r))... }; // wrap every tuple element into a GMetaArg
+        }
+        // FIXME: help users identify what outMeta must look like (via default impl w/static_assert?)
+
+        static GMetaArgs getOutMeta(const GMetaArgs &in_meta,
+                                    const GArgs &in_args)
+        {
+            return getOutMeta_impl(in_meta,
+                                   in_args,
+                                   typename detail::MkSeq<sizeof...(Ins)>::type(),   // index pack for the inputs
+                                   typename detail::MkSeq<sizeof...(Outs)>::type()); // index pack for the outputs
+        }
+    };
+
+    // 4.2 - case for a single return value
+    // FIXME: How to avoid duplication here?
+    template<typename K, typename... Ins, typename Out>
+    struct MetaHelper<K, std::tuple<Ins...>, Out >
+    {
+        template<int... IIs>
+        static GMetaArgs getOutMeta_impl(const GMetaArgs &in_meta,
+                                         const GArgs &in_args,
+                                         detail::Seq<IIs...>)
+        {
+            // FIXME: decay?
+            using R = typename MetaType<Out>::type; // descriptor type of the only output
+            const R r = K::outMeta( get_in_meta<Ins>(in_meta, in_args, IIs)... ); // Ins and IIs expand in lockstep
+            return GMetaArgs{ GMetaArg(r) }; // single-element result
+        }
+        // FIXME: help users identify what outMeta must look like (via default impl w/static_assert?)
+
+        static GMetaArgs getOutMeta(const GMetaArgs &in_meta,
+                                    const GArgs &in_args)
+        {
+            return getOutMeta_impl(in_meta,
+                                   in_args,
+                                   typename detail::MkSeq<sizeof...(Ins)>::type()); // index pack for the inputs
+        }
+    };
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Helper class to introduce tags to calls. By default there's no tag
+    struct NoTag {
+        static constexpr const char *tag() { return ""; } // the empty string stands for "no tag"
+    };
+
+} // namespace detail
+
+// GKernelType and GKernelTypeM are base classes which implement typed ::on()
+// method based on kernel signature. GKernelTypeM stands for multiple-return-value kernels
+//
+// G_TYPED_KERNEL and G_TYPED_KERNEL_M macros inherit user classes from GKernelType and
+// GKernelTypeM respectively.
+
+template<typename K, typename... R, typename... Args>
+class GKernelTypeM<K, std::function<std::tuple<R...>(Args...)> >
+    : public detail::MetaHelper<K, std::tuple<Args...>, std::tuple<R...>>
+    , public detail::NoTag
+{
+    template<int... IIs>
+    static std::tuple<R...> yield(cv::GCall &call, detail::Seq<IIs...>) // private helper: one Yield per output index
+    {
+        return std::make_tuple(detail::Yield<R>::yield(call, IIs)...); // R and IIs expand in lockstep
+    }
+
+public:
+    using InArgs  = std::tuple<Args...>;
+    using OutArgs = std::tuple<R...>;
+
+    // TODO: Args&&... here?
+    static std::tuple<R...> on(Args... args)
+    {
+        cv::GCall call(GKernel{ K::id()                                    // GKernel::name
+                              , K::tag()                                   // GKernel::tag
+                              , &K::getOutMeta                             // GKernel::outMeta
+                              , {detail::GTypeTraits<R>::shape...}         // GKernel::outShapes
+                              , {detail::GTypeTraits<Args>::op_kind...}    // GKernel::inKinds
+                              , {detail::GObtainCtor<R>::get()...}         // GKernel::outCtors
+                              , {detail::GTypeTraits<R>::op_kind...}});    // GKernel::outKinds
+        call.pass(args...); // TODO: std::forward() here?
+        return yield(call, typename detail::MkSeq<sizeof...(R)>::type()); // one index per declared output
+    }
+};
+
+template<typename, typename> class GKernelType; // declared only; the std::function<R(Args...)> form is defined below
+
+template<typename K, typename R, typename... Args>
+class GKernelType<K, std::function<R(Args...)> >
+    : public detail::MetaHelper<K, std::tuple<Args...>, R>
+    , public detail::NoTag
+{
+public:
+    using InArgs  = std::tuple<Args...>;
+    using OutArgs = std::tuple<R>;
+
+    static R on(Args... args)
+    {
+        cv::GCall call(GKernel{ K::id()                                    // GKernel::name
+                              , K::tag()                                   // GKernel::tag
+                              , &K::getOutMeta                             // GKernel::outMeta
+                              , {detail::GTypeTraits<R>::shape}            // GKernel::outShapes (single output)
+                              , {detail::GTypeTraits<Args>::op_kind...}    // GKernel::inKinds
+                              , {detail::GObtainCtor<R>::get()}            // GKernel::outCtors (single output)
+                              , {detail::GTypeTraits<R>::op_kind}});       // GKernel::outKinds (single output)
+        call.pass(args...);
+        return detail::Yield<R>::yield(call, 0); // the only output has index 0
+    }
+};
+
+namespace detail {
+// This tiny class eliminates the semantic difference between
+// GKernelType and GKernelTypeM: it picks the right base from the signature.
+template<typename, typename> class KernelTypeMedium;
+
+template<typename K, typename... R, typename... Args>
+class KernelTypeMedium<K, std::function<std::tuple<R...>(Args...)>> : // tuple-returning signature -> GKernelTypeM
+    public cv::GKernelTypeM<K, std::function<std::tuple<R...>(Args...)>> {};
+
+template<typename K, typename R, typename... Args>
+class KernelTypeMedium<K, std::function<R(Args...)>> : // any other return type -> GKernelType
+    public cv::GKernelType<K, std::function<R(Args...)>> {};
+} // namespace detail
+
+} // namespace cv
+
+
+// FIXME: I don't know a better way so far. Feel free to suggest one
+// The problem is that every typed kernel should have ::id() but body
+// of the class is defined by user (with outMeta, other stuff)
+
+//! @cond IGNORED
+#define G_ID_HELPER_CLASS(Class)  Class##IdHelper
+
+#define G_ID_HELPER_BODY(Class, Id)                                         \
+    struct G_ID_HELPER_CLASS(Class)                                         \
+    {                                                                       \
+        static constexpr const char * id() {return Id;}                     \
+    };                                                                      \
+//! @endcond
+
+#define GET_G_TYPED_KERNEL(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, NAME, ...) NAME
+#define COMBINE_SIGNATURE(...) __VA_ARGS__
+// Ensure correct __VA_ARGS__ expansion on Windows
+#define __WRAP_VAARGS(x) x
+
+/**
+ * Helper for G_TYPED_KERNEL: declares a new G-API Operation. See [Kernel API](@ref gapi_kernel_api)
+ * for more details.
+ *
+ * @param Class type name for this operation.
+ * @param API a `std::function<>`-like signature for the operation;
+ *        the return type is a single value or a tuple of multiple values.
+ * @param Id string identifier for the operation. Must be unique.
+ */
+#define G_TYPED_KERNEL_HELPER(Class, API, Id)                                               \
+    G_ID_HELPER_BODY(Class, Id)                                                             \
+    struct Class final: public cv::detail::KernelTypeMedium<Class, std::function API >,     \
+                        public G_ID_HELPER_CLASS(Class)
+// {body} is to be defined by the user right after the macro invocation
+
+#define G_TYPED_KERNEL_HELPER_2(Class, _1, _2, Id) \
+G_TYPED_KERNEL_HELPER(Class, COMBINE_SIGNATURE(_1, _2), Id)
+
+#define G_TYPED_KERNEL_HELPER_3(Class, _1, _2, _3, Id) \
+G_TYPED_KERNEL_HELPER(Class, COMBINE_SIGNATURE(_1, _2, _3), Id)
+
+#define G_TYPED_KERNEL_HELPER_4(Class, _1, _2, _3, _4, Id) \
+G_TYPED_KERNEL_HELPER(Class, COMBINE_SIGNATURE(_1, _2, _3, _4), Id)
+
+#define G_TYPED_KERNEL_HELPER_5(Class, _1, _2, _3, _4, _5, Id) \
+G_TYPED_KERNEL_HELPER(Class, COMBINE_SIGNATURE(_1, _2, _3, _4, _5), Id)
+
+#define G_TYPED_KERNEL_HELPER_6(Class, _1, _2, _3, _4, _5, _6, Id) \
+G_TYPED_KERNEL_HELPER(Class, COMBINE_SIGNATURE(_1, _2, _3, _4, _5, _6), Id)
+
+#define G_TYPED_KERNEL_HELPER_7(Class, _1, _2, _3, _4, _5, _6, _7, Id) \
+G_TYPED_KERNEL_HELPER(Class, COMBINE_SIGNATURE(_1, _2, _3, _4, _5, _6, _7), Id)
+
+#define G_TYPED_KERNEL_HELPER_8(Class, _1, _2, _3, _4, _5, _6, _7, _8, Id) \
+G_TYPED_KERNEL_HELPER(Class, COMBINE_SIGNATURE(_1, _2, _3, _4, _5, _6, _7, _8), Id)
+
+#define G_TYPED_KERNEL_HELPER_9(Class, _1, _2, _3, _4, _5, _6, _7, _8, _9, Id) \
+G_TYPED_KERNEL_HELPER(Class, COMBINE_SIGNATURE(_1, _2, _3, _4, _5, _6, _7, _8, _9), Id)
+
+#define G_TYPED_KERNEL_HELPER_10(Class, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, Id) \
+G_TYPED_KERNEL_HELPER(Class, COMBINE_SIGNATURE(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10), Id)
+
+/**
+ * Declares a new G-API Operation. See [Kernel API](@ref gapi_kernel_api)
+ * for more details.
+ *
+ * @param Class type name for this operation.
+ */
+#define G_TYPED_KERNEL(Class, ...) __WRAP_VAARGS(GET_G_TYPED_KERNEL(__VA_ARGS__, \
+                                                 G_TYPED_KERNEL_HELPER_10, \
+                                                 G_TYPED_KERNEL_HELPER_9, \
+                                                 G_TYPED_KERNEL_HELPER_8, \
+                                                 G_TYPED_KERNEL_HELPER_7, \
+                                                 G_TYPED_KERNEL_HELPER_6, \
+                                                 G_TYPED_KERNEL_HELPER_5, \
+                                                 G_TYPED_KERNEL_HELPER_4, \
+                                                 G_TYPED_KERNEL_HELPER_3, \
+                                                 G_TYPED_KERNEL_HELPER_2, \
+                                                 G_TYPED_KERNEL_HELPER)(Class, __VA_ARGS__)) \
+
+/**
+ * Declares a new G-API Operation. See [Kernel API](@ref gapi_kernel_api) for more details.
+ *
+ * @deprecated This macro is deprecated in favor of `G_TYPED_KERNEL` that is used for declaring any
+ * G-API Operation.
+ *
+ * @param Class type name for this operation.
+ */
+#define G_TYPED_KERNEL_M G_TYPED_KERNEL
+
+#define G_API_OP   G_TYPED_KERNEL
+#define G_API_OP_M G_API_OP
+
+namespace cv
+{
+namespace gapi
+{
+    // Prework: model "Device" API before it gets to G-API headers.
+    // FIXME: Don't mix with internal Backends class!
+    /// @private
+    class GAPI_EXPORTS GBackend
+    {
+    public:
+        class Priv; // declared only; no definition in this header
+
+        // TODO: make it template (call `new` within??)
+        GBackend();
+        explicit GBackend(std::shared_ptr<Priv> &&p);
+
+        Priv& priv();
+        const Priv& priv() const;
+        std::size_t hash() const; // forwarded to by the std::hash<cv::gapi::GBackend> specialization in this header
+
+        bool operator== (const GBackend &rhs) const; // operator!= in this header is defined through it
+
+    private:
+        std::shared_ptr<Priv> m_priv;
+    };
+
+    inline bool operator != (const GBackend &lhs, const GBackend &rhs)
+    {
+        return !(lhs == rhs); // plain negation of GBackend::operator==
+    }
+} // namespace gapi
+} // namespace cv
+
+namespace std
+{
+    template<> struct hash<cv::gapi::GBackend> // std::hash support for GBackend
+    {
+        std::size_t operator() (const cv::gapi::GBackend &b) const
+        {
+            return b.hash(); // delegate to GBackend::hash()
+        }
+    };
+} // namespace std
+
+namespace cv {
+    class GAPI_EXPORTS_W_SIMPLE GKernelPackage;
+
+namespace gapi {
+    GAPI_EXPORTS_W cv::GKernelPackage combine(const cv::GKernelPackage  &lhs,
+                                              const cv::GKernelPackage  &rhs);
+
+    /// @private
+    class GFunctor
+    {
+    public:
+        virtual cv::GKernelImpl impl()       const = 0;
+        virtual cv::gapi::GBackend backend() const = 0;
+        const char* id()                     const { return m_id; } // returns the pointer given to the constructor
+
+        virtual ~GFunctor() = default;
+    protected:
+        GFunctor(const char* id) : m_id(id) { } // protected: only derived functors can construct
+    private:
+        const char* m_id; // only the pointer is stored; the string itself is not copied
+    };
+} // namespace gapi
+
+    /** \addtogroup gapi_compile_args
+     * @{
+     */
+
+    // FIXME: Hide implementation
+    /**
+     * @brief A container class for heterogeneous kernel
+     * implementation collections and graph transformations.
+     *
+     * GKernelPackage is a special container class which stores kernel
+     * _implementations_ and graph _transformations_. Objects of this class
+     * are created and passed to cv::GComputation::compile() to specify
+     * which kernels to use and which transformations to apply in the
+     * compiled graph. GKernelPackage may contain kernels of
+     * different backends, e.g. be heterogeneous.
+     *
+     * The easiest way to create a kernel package is to use function
+     * cv::gapi::kernels(). This template function takes kernel
+     * implementations in form of type list (variadic template) and
+     * generates a kernel package atop of that.
+     *
+     * Kernel packages can be also generated programmatically, starting
+     * with an empty package (created with the default constructor)
+     * and then by populating it with kernels via call to
+     * GKernelPackage::include(). Note this method is also a template
+     * one since G-API kernel and transformation implementations are _types_,
+     * not objects.
+     *
+     * Finally, two kernel packages can be combined into a new one
+     * with function cv::gapi::combine().
+     */
+    class GAPI_EXPORTS_W_SIMPLE GKernelPackage
+    {
+
+        /// @private
+        using M = std::unordered_map<std::string, std::pair<cv::gapi::GBackend, cv::GKernelImpl>>;
+
+        /// @private
+        M m_id_kernels; // kernel id -> (backend, implementation)
+
+        /// @private
+        std::vector<GTransform> m_transformations; // filled by the transformation flavour of includeHelper()
+
+    protected:
+        /// @private
+        // Remove ALL implementations of the given API (identified by ID)
+        void removeAPI(const std::string &id);
+
+        /// @private
+        // include() helper overload for kernels (selected via enable_if on KernelTag)
+        template <typename KImpl>
+        typename std::enable_if<(std::is_base_of<cv::detail::KernelTag, KImpl>::value), void>::type
+        includeHelper()
+        {
+            auto backend     = KImpl::backend();
+            auto kernel_id   = KImpl::API::id();
+            auto kernel_impl = GKernelImpl{KImpl::kernel(), &KImpl::API::getOutMeta};
+            removeAPI(kernel_id); // drop whatever is already registered for this API id
+
+            m_id_kernels[kernel_id] = std::make_pair(backend, kernel_impl);
+        }
+
+        /// @private
+        // include() helper overload for transformations (selected via enable_if on TransformTag)
+        template <typename TImpl>
+        typename std::enable_if<(std::is_base_of<cv::detail::TransformTag, TImpl>::value), void>::type
+        includeHelper()
+        {
+            m_transformations.emplace_back(TImpl::transformation());
+        }
+
+    public:
+        void include(const cv::gapi::GFunctor& functor); // functor flavour; used by kernels(FF&...)
+
+        /**
+         * @brief Returns total number of kernels
+         * in the package (across all backends included)
+         *
+         * @return a number of kernels in the package
+         */
+        GAPI_WRAP std::size_t size() const;
+
+        /**
+         * @brief Returns vector of transformations included in the package
+         *
+         * @return vector of transformations included in the package
+         */
+        const std::vector<GTransform>& get_transformations() const;
+
+        /**
+         * @brief Returns vector of kernel ids included in the package
+         *
+         * @return vector of kernel ids included in the package
+         */
+        std::vector<std::string> get_kernel_ids() const;
+
+        /**
+         * @brief Test if a particular kernel _implementation_ KImpl is
+         * included in this kernel package.
+         *
+         * @sa includesAPI()
+         *
+         * @note cannot be applied to transformations
+         *
+         * @return true if there is such kernel, false otherwise.
+         */
+        template<typename KImpl>
+        bool includes() const
+        {
+            static_assert(std::is_base_of<cv::detail::KernelTag, KImpl>::value,
+                          "includes() can be applied to kernels only");
+
+            auto kernel_it = m_id_kernels.find(KImpl::API::id());
+            return kernel_it != m_id_kernels.end() &&
+                   kernel_it->second.first == KImpl::backend(); // second.first is the backend half of the stored pair
+        }
+
+        /**
+         * @brief Remove all kernels associated with the given backend
+         * from the package.
+         *
+         * Does nothing if there are no kernels of this backend in the package.
+         *
+         * @param backend backend which kernels to remove
+         */
+        void remove(const cv::gapi::GBackend& backend);
+
+        /**
+         * @brief Remove all kernels implementing the given API from
+         * the package.
+         *
+         * Does nothing if there are no kernels implementing the given interface.
+         */
+        template<typename KAPI>
+        void remove()
+        {
+            removeAPI(KAPI::id());
+        }
+
+        // FIXME: Rename to includes() and distinguish the API/impl case
+        //     statically?
+        /**
+         * Check if package contains ANY implementation of a kernel API
+         * by API type.
+         */
+        template<typename KAPI>
+        bool includesAPI() const
+        {
+            return includesAPI(KAPI::id());
+        }
+
+        /// @private
+        bool includesAPI(const std::string &id) const;
+
+        // FIXME: The below comment is wrong, and who needs this function?
+        /**
+         * @brief Find the backend of a kernel (by its API)
+         *
+         * Returns the backend half of lookup(KAPI::id()).
+         * Throws if nothing found.
+         *
+         * @return Backend which hosts matching kernel implementation.
+         *
+         */
+        template<typename KAPI>
+        cv::gapi::GBackend lookup() const
+        {
+            return lookup(KAPI::id()).first;
+        }
+
+        /// @private
+        std::pair<cv::gapi::GBackend, cv::GKernelImpl>
+        lookup(const std::string &id) const;
+
+        // FIXME: No overwrites allowed?
+        /**
+         * @brief Put a new kernel implementation or a new transformation
+         * KImpl into the package.
+         */
+        template<typename KImpl>
+        void include()
+        {
+            includeHelper<KImpl>();
+        }
+
+        /**
+         * @brief Adds a new kernel based on its backend and id into the kernel package
+         *
+         * @param backend backend associated with the kernel
+         * @param kernel_id a name/id of the kernel
+         */
+        void include(const cv::gapi::GBackend& backend, const std::string& kernel_id);
+
+        /**
+         * @brief Lists all backends which are included into package
+         *
+         * @return vector of backends
+         */
+        std::vector<cv::gapi::GBackend> backends() const;
+
+        // TODO: Doxygen bug -- it wants me to place this comment
+        // here, not below.
+        /**
+         * @brief Create a new package based on `lhs` and `rhs`.
+         *
+         * @param lhs "Left-hand-side" package in the process
+         * @param rhs "Right-hand-side" package in the process
+         * @return a new kernel package.
+         */
+        friend GAPI_EXPORTS GKernelPackage cv::gapi::combine(const GKernelPackage  &lhs,
+                                                             const GKernelPackage  &rhs);
+    };
+    /** @} */
+
+namespace gapi {
+    using GKernelPackage = cv::GKernelPackage; // Keep backward compatibility
+
+    /** \addtogroup gapi_compile_args
+     * @{
+     */
+
+    /**
+     * @brief Create a kernel package object containing kernels
+     * and transformations specified in variadic template argument.
+     *
+     * In G-API, kernel implementations and transformations are _types_.
+     * Every backend has its own kernel API (like GAPI_OCV_KERNEL() and
+     * GAPI_FLUID_KERNEL()) but all of those APIs define a new type for
+     * each kernel implementation.
+     *
+     * Use this function to pass kernel implementations (defined in
+     * either way) and transformations to the system. Example:
+     *
+     * @snippet samples/cpp/tutorial_code/gapi/doc_snippets/api_ref_snippets.cpp kernels_snippet
+     *
+     * Note that kernels() itself is a function returning object, not
+     * a type, so having `()` at the end is important -- it must be a
+     * function call.
+     */
+    template<typename... KK> GKernelPackage kernels()
+    {
+        // FIXME: currently there is no check that transformations' signatures are unique
+        // and that there won't be any intersection at the graph compilation stage
+        static_assert(cv::detail::all_unique<typename KK::API...>::value, "Kernels API must be unique");
+
+        GKernelPackage pkg;
+
+        // For those who wonder - below is a trick to call a number of
+        // methods based on a parameter pack (zeroes just help hiding these
+        // calls in a sequence which helps to expand this parameter pack).
+        // Just note that `f(),a` always equals `a` (with f() called!)
+        // and parentheses are used to hide the function call in the expanded sequence.
+        // The leading 0 handles the case when KK is an empty list (kernels<>()).
+        int unused[] = { 0, (pkg.include<KK>(), 0)... };
+        cv::util::suppress_unused_warning(unused);
+        return pkg;
+    }
+
+    template<typename... FF>
+    GKernelPackage kernels(FF&... functors) // functor flavour: kernels are passed as objects, not as types
+    {
+        GKernelPackage pkg;
+        int unused[] = { 0, (pkg.include(functors), 0)... }; // comma-operator trick: one include() per functor; leading 0 covers the empty pack
+        cv::util::suppress_unused_warning(unused);
+        return pkg;
+    }
+
+    /** @} */
+
+    /**
+     * @brief Combines multiple G-API kernel packages into one
+     *
+     * @overload
+     *
+     * This function successively combines the passed kernel packages using a right fold.
+     * Calling `combine(a, b, c)` is equal to `combine(a, combine(b, c))`.
+     *
+     * @return The resulting kernel package
+     */
+    template<typename... Ps>
+    cv::GKernelPackage combine(const cv::GKernelPackage &a, const cv::GKernelPackage &b, Ps&&... rest)
+    {
+        return combine(a, combine(b, rest...)); // with an empty `rest` the inner call is the two-argument combine()
+    }
+    // NB(DM): Variadic-arg version in Python may require the same
+    // approach as used in GComputation::compile/apply.
+
+    /** \addtogroup gapi_compile_args
+     * @{
+     */
+    /**
+     * @brief cv::gapi::use_only() is a special combinator which hints G-API to use only
+     * kernels specified in cv::GComputation::compile() (and not to extend kernels available by
+     * default with that package).
+     */
+    struct GAPI_EXPORTS use_only
+    {
+        GKernelPackage pkg; // the package this combinator wraps
+    };
+    /** @} */
+
+} // namespace gapi
+
+namespace detail
+{
+    template<> struct CompileArgTag<cv::GKernelPackage> // tag string for a kernel package
+    {
+        static const char* tag() { return "gapi.kernel_package"; }
+    };
+
+    template<> struct CompileArgTag<cv::gapi::use_only> // tag string for the use_only combinator
+    {
+        static const char* tag() { return "gapi.use_only"; }
+    };
+} // namespace detail
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GKERNEL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gmat.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gmat.hpp
new file mode 100644
index 000000000000..6d6f74ff7f4d
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gmat.hpp
@@ -0,0 +1,292 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2020 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GMAT_HPP
+#define OPENCV_GAPI_GMAT_HPP
+
+#include <ostream>
+#include <memory>                 // std::shared_ptr
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/gcommon.hpp> // GShape
+
+#include <opencv2/gapi/own/assert.hpp>
+
+// TODO GAPI_EXPORTS or so
+namespace cv
+{
+// Forward declaration; GNode and GOrigin are internal
+// (user-inaccessible) classes.
+class GNode;
+struct GOrigin;
+
+/** \addtogroup gapi_data_objects
+ * @{
+ *
+ * @brief G-API data objects used to build G-API expressions.
+ *
+ * These objects do not own any particular data (except compile-time
+ * associated values like with cv::GScalar or `cv::GArray<T>`) and are
+ * used only to construct graphs.
+ *
+ * Every graph in G-API starts and ends with data objects.
+ *
+ * Once constructed and compiled, G-API operates with regular host-side
+ * data instead. Refer to the below table to find the mapping between
+ * G-API and regular data types when passing input and output data
+ * structures to G-API:
+ *
+ *    G-API data type    | I/O data type
+ *    ------------------ | -------------
+ *    cv::GMat           | cv::Mat, cv::UMat, cv::RMat
+ *    cv::GScalar        | cv::Scalar
+ *    `cv::GArray<T>`    | std::vector<T>
+ *    `cv::GOpaque<T>`   | T
+ *    cv::GFrame         | cv::MediaFrame
+ */
+
+/**
+ * @brief GMat class represents image or tensor data in the
+ * graph.
+ *
+ * GMat doesn't store any data itself, instead it describes a
+ * functional relationship between operations consuming and producing
+ * GMat objects.
+ *
+ * GMat is a virtual counterpart of Mat and UMat, but it
+ * doesn't mean G-API uses Mat or UMat objects internally to represent
+ * GMat objects -- the internal data representation may be
+ * backend-specific or optimized out altogether.
+ *
+ * @sa Mat, GMatDesc
+ */
+class GAPI_EXPORTS_W_SIMPLE GMat
+{
+public:
+    /**
+     * @brief Constructs an empty GMat
+     *
+     * Normally, empty G-API data objects denote a starting point of
+     * the graph. When an empty GMat is assigned to a result of some
+     * operation, it obtains a functional link to this operation (and
+     * is not empty anymore).
+     */
+    GAPI_WRAP GMat();                       // Empty constructor
+
+    /**
+     * @brief Constructs a value-initialized GMat
+     *
+     * GMat may be associated with a buffer at graph construction time.
+     * It is useful when some operation has a Mat input which doesn't
+     * change during the program execution, and is set only once.
+     * In this case, there's no need to declare such GMat as graph input.
+     *
+     * @param m a cv::Mat buffer to associate with this GMat object.
+     */
+    GAPI_WRAP explicit GMat(cv::Mat m);     // Value-initialization constructor
+
+    /// @private
+    GMat(const GNode &n, std::size_t out);  // Operation result constructor (node + output index)
+    /// @private
+    GOrigin& priv();                        // Internal use only: mutable access to the origin
+    /// @private
+    const GOrigin& priv()  const;           // Internal use only: read-only access to the origin
+
+private:
+    std::shared_ptr<GOrigin> m_priv;        // shared handle to the internal GOrigin; copies of a GMat share it
+};
+
+class GAPI_EXPORTS GMatP : public GMat // distinct type derived from GMat; adds no members of its own
+{
+public:
+    using GMat::GMat; // inherit every GMat constructor
+};
+
+class RMat;
+
+/** @} */
+
+/**
+ * \addtogroup gapi_meta_args
+ * @{
+ */
+struct GAPI_EXPORTS_W_SIMPLE GMatDesc
+{
+    // FIXME: Default initializers in C++14
+    GAPI_PROP int depth;
+    GAPI_PROP int chan;
+    GAPI_PROP cv::Size size; // NB.: no multi-dimensional cases covered yet
+    GAPI_PROP bool planar;
+    GAPI_PROP std::vector<int> dims; // FIXME: Maybe it's really questionable to have it here
+
+    GAPI_WRAP GMatDesc(int d, int c, cv::Size s, bool p = false)
+        : depth(d), chan(c), size(s), planar(p) {} // 2D form: dims stays empty
+
+    GAPI_WRAP GMatDesc(int d, const std::vector<int> &dd)
+        : depth(d), chan(-1), size{-1,-1}, planar(false), dims(dd) {} // N-D form: chan and size are set to -1
+
+    GAPI_WRAP GMatDesc(int d, std::vector<int> &&dd)
+        : depth(d), chan(-1), size{-1,-1}, planar(false), dims(std::move(dd)) {} // N-D form, taking ownership of dd
+
+    GAPI_WRAP GMatDesc() : GMatDesc(-1, -1, {-1,-1}) {} // default: every numeric field is -1, not planar, no dims
+
+    inline bool operator== (const GMatDesc &rhs) const // field-by-field equality over all five members
+    {
+        return    depth  == rhs.depth
+               && chan   == rhs.chan
+               && size   == rhs.size
+               && planar == rhs.planar
+               && dims   == rhs.dims;
+    }
+
+    inline bool operator!= (const GMatDesc &rhs) const
+    {
+        return !(*this == rhs);
+    }
+
+    bool isND() const { return !dims.empty(); } // a descriptor is N-dimensional exactly when dims is non-empty
+
+    // Checks if the passed mat can be described by this descriptor
+    // (it handles the case when
+    // 1-channel mat can be reinterpreted as is (1-channel mat)
+    // and as a 3-channel planar mat with height divided by 3)
+    bool canDescribe(const cv::Mat& mat) const;
+
+    bool canDescribe(const cv::RMat& mat) const;
+
+    // Meta combinator: return a new GMatDesc which differs in size by delta
+    // (all other fields are taken unchanged from this GMatDesc)
+    // FIXME: a better name?
+    GAPI_WRAP GMatDesc withSizeDelta(cv::Size delta) const
+    {
+        GMatDesc desc(*this);
+        desc.size += delta;
+        return desc;
+    }
+    // Meta combinator: return a new GMatDesc which differs in size by delta
+    // (all other fields are taken unchanged from this GMatDesc)
+    //
+    // This is an overload.
+    GAPI_WRAP GMatDesc withSizeDelta(int dx, int dy) const
+    {
+        return withSizeDelta(cv::Size{dx,dy});
+    }
+
+    GAPI_WRAP GMatDesc withSize(cv::Size sz) const // Meta combinator: copy of this descriptor with size replaced by sz
+    {
+        GMatDesc desc(*this);
+        desc.size = sz;
+        return desc;
+    }
+
+    // Meta combinator: return a new GMatDesc with specified data depth.
+    // (all other fields are taken unchanged from this GMatDesc)
+    GAPI_WRAP GMatDesc withDepth(int ddepth) const
+    {
+        GAPI_Assert(CV_MAT_CN(ddepth) == 1 || ddepth == -1);
+        GMatDesc desc(*this);
+        if (ddepth != -1) desc.depth = ddepth; // ddepth == -1 keeps the current depth
+        return desc;
+    }
+
+    // Meta combinator: return a new GMatDesc with specified data depth
+    // and number of channels.
+    // (all other fields are taken unchanged from this GMatDesc)
+    GAPI_WRAP GMatDesc withType(int ddepth, int dchan) const
+    {
+        GAPI_Assert(CV_MAT_CN(ddepth) == 1 || ddepth == -1);
+        GMatDesc desc = withDepth(ddepth);
+        desc.chan = dchan; // unlike depth, the channel count is always overwritten
+        return desc;
+    }
+
+    // Meta combinator: return a new GMatDesc with planar flag set
+    // (no size changes are performed, only channel interpretation is changed
+    // (interleaved -> planar))
+    GAPI_WRAP GMatDesc asPlanar() const
+    {
+        GAPI_Assert(planar == false);
+        GMatDesc desc(*this);
+        desc.planar = true;
+        return desc;
+    }
+
+    // Meta combinator: return a new GMatDesc
+    // reinterpreting 1-channel input as planar image
+    // (size height is divided by plane number)
+    GAPI_WRAP GMatDesc asPlanar(int planes) const
+    {
+        GAPI_Assert(planar == false);
+        GAPI_Assert(chan == 1);
+        GAPI_Assert(planes > 1);
+        GAPI_Assert(size.height % planes == 0); // height must split evenly into planes
+        GMatDesc desc(*this);
+        desc.size.height /=  planes;
+        desc.chan = planes;
+        return desc.asPlanar();
+    }
+
+    // Meta combinator: return a new GMatDesc with planar flag set to false
+    // (no size changes are performed, only channel interpretation is changed
+    // (planar -> interleaved))
+    GAPI_WRAP GMatDesc asInterleaved() const
+    {
+        GAPI_Assert(planar == true);
+        GMatDesc desc(*this);
+        desc.planar = false;
+        return desc;
+    }
+};
+
+static inline GMatDesc empty_gmat_desc() { return GMatDesc(-1, -1, cv::Size{-1, -1}); } // depth, chan and size all -1; not planar
+
+namespace gapi { namespace detail {
+/** Checks GMatDesc fields if the passed matrix is a set of n-dimensional points.
+@param in GMatDesc to check.
+@param n expected dimensionality.
+@return the amount of points. In case input matrix can't be described as vector of points
+of expected dimensionality, returns -1.
+ */
+int checkVector(const GMatDesc& in, const size_t n);
+
+/** @overload
+
+Checks GMatDesc fields if the passed matrix can be described as a set of points of any
+dimensionality.
+
+@return array of two elements in form of std::vector<int>: the amount of points
+and their calculated dimensionality. In case input matrix can't be described as vector of points,
+returns {-1, -1}.
+ */
+std::vector<int> checkVector(const GMatDesc& in);
+}} // namespace gapi::detail
+
+#if !defined(GAPI_STANDALONE)
+GAPI_EXPORTS GMatDesc descr_of(const cv::UMat &mat);
+#endif // !defined(GAPI_STANDALONE)
+
+//Fwd declarations
+namespace gapi { namespace own {
+    class Mat;
+    GAPI_EXPORTS GMatDesc descr_of(const Mat &mat);
+}}//gapi::own
+
+GAPI_EXPORTS GMatDesc descr_of(const RMat &mat);
+
+#if !defined(GAPI_STANDALONE)
+GAPI_EXPORTS GMatDesc descr_of(const cv::Mat &mat);
+#else
+using gapi::own::descr_of;
+#endif
+
+/** @} */
+
+GAPI_EXPORTS std::ostream& operator<<(std::ostream& os, const cv::GMatDesc &desc);
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GMAT_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gmetaarg.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gmetaarg.hpp
new file mode 100644
index 000000000000..f21182c19f46
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gmetaarg.hpp
@@ -0,0 +1,80 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GMETAARG_HPP
+#define OPENCV_GAPI_GMETAARG_HPP
+
+#include <vector>
+#include <type_traits>
+
+#include <opencv2/gapi/util/util.hpp>
+#include <opencv2/gapi/util/variant.hpp>
+
+#include <opencv2/gapi/gmat.hpp>
+#include <opencv2/gapi/gscalar.hpp>
+#include <opencv2/gapi/garray.hpp>
+#include <opencv2/gapi/gopaque.hpp>
+#include <opencv2/gapi/gframe.hpp>
+
+namespace cv
+{
+// FIXME: Rename to GMeta?
+// FIXME: user shouldn't deal with it - put to detail?
+// GMetaArg is a union type over descriptions of G-types which can serve as
+// GComputation's in/output slots.
+//
+// GMetaArg objects are passed as arguments to GComputation::compile()
+// to specify which data a compiled computation should be specialized on.
+// For manual compile(), user must supply this metadata, in case of apply()
+// this metadata is taken from arguments computation should operate on.
+//
+// The first type (monostate) is equal to "uninitialized"/"unresolved" meta.
+using GMetaArg = util::variant
+    < util::monostate
+    , GMatDesc
+    , GScalarDesc
+    , GArrayDesc
+    , GOpaqueDesc
+    , GFrameDesc
+    >;
+GAPI_EXPORTS std::ostream& operator<<(std::ostream& os, const GMetaArg &);
+
+using GMetaArgs = std::vector<GMetaArg>;
+
+namespace detail
+{
+    // These traits are used by GComputation::compile()
+
+    // FIXME: is_constructible<T> doesn't work as variant doesn't do any SFINAE
+    // in its current template constructor
+
+    template<typename T> struct is_meta_descr    : std::false_type {}; // primary template: not a meta descriptor unless specialized below
+    template<> struct is_meta_descr<GMatDesc>    : std::true_type {};
+    template<> struct is_meta_descr<GScalarDesc> : std::true_type {};
+    template<> struct is_meta_descr<GArrayDesc>  : std::true_type {};
+    template<> struct is_meta_descr<GOpaqueDesc> : std::true_type {}; // NOTE(review): GFrameDesc is a GMetaArg alternative but has no specialization here -- confirm intended
+
+    template<typename... Ts>
+    using are_meta_descrs = all_satisfy<is_meta_descr, Ts...>; // is_meta_descr applied to every type in the pack
+
+    template<typename... Ts>
+    using are_meta_descrs_but_last = all_satisfy<is_meta_descr, typename all_but_last<Ts...>::type>; // same check, with the last type of the pack excluded
+
+} // namespace detail
+
+// Note: descr_of(std::vector<..>) returns a GArrayDesc, while
+//       descrs_of(std::vector<..>) returns an array of Meta args!
+class UMat;
+GAPI_EXPORTS cv::GMetaArgs descrs_of(const std::vector<cv::Mat> &vec);
+GAPI_EXPORTS cv::GMetaArgs descrs_of(const std::vector<cv::UMat> &vec);
+namespace gapi { namespace own {
+    GAPI_EXPORTS cv::GMetaArgs descrs_of(const std::vector<Mat> &vec);
+}} // namespace gapi::own
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GMETAARG_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gopaque.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gopaque.hpp
new file mode 100644
index 000000000000..a3f98a9867e8
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gopaque.hpp
@@ -0,0 +1,369 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019-2020 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GOPAQUE_HPP
+#define OPENCV_GAPI_GOPAQUE_HPP
+
+#include <functional>
+#include <ostream>
+#include <memory>
+
+#include <opencv2/gapi/own/exports.hpp>
+#include <opencv2/gapi/opencv_includes.hpp>
+
+#include <opencv2/gapi/util/any.hpp>
+#include <opencv2/gapi/util/variant.hpp>
+#include <opencv2/gapi/util/throw.hpp>
+#include <opencv2/gapi/util/type_traits.hpp>
+#include <opencv2/gapi/own/assert.hpp>
+
+#include <opencv2/gapi/gcommon.hpp>  // OpaqueKind
+#include <opencv2/gapi/garray.hpp>  // TypeHintBase
+
+namespace cv
+{
+// Forward declaration; GNode and GOrigin are internal
+// (user-inaccessible) classes.
+class GNode;
+struct GOrigin;
+template<typename T> class GOpaque;
+
+/**
+ * \addtogroup gapi_meta_args
+ * @{
+ */
+struct GAPI_EXPORTS_W_SIMPLE GOpaqueDesc
+{
+    // FIXME: Body (the descriptor currently carries no data)
+    // FIXME: Also implement proper operator== then
+    bool operator== (const GOpaqueDesc&) const { return true; } // with no fields, any two descriptors compare equal
+};
+template<typename U> GOpaqueDesc descr_of(const U &) { return {};} // generic fallback: ignores its argument and returns an empty descriptor
+GAPI_EXPORTS_W inline GOpaqueDesc empty_gopaque_desc() {return {}; } // value-initialized (empty) descriptor
+/** @} */
+
+std::ostream& operator<<(std::ostream& os, const cv::GOpaqueDesc &desc);
+
+namespace detail
+{
+    // ConstructOpaque is a callback which stores information about T and is used by
+    // G-API runtime to construct an object in host memory (T remains opaque for G-API).
+    // ConstructOpaque is carried into G-API internals by GOpaqueU.
+    // Currently it is suitable for Host (CPU) plugins only, real offload may require
+    // more information for manual memory allocation on-device.
+    class OpaqueRef;
+    using ConstructOpaque = std::function<void(OpaqueRef&)>;
+
+    // FIXME: garray.hpp already contains hint classes (for actual T type verification),
+    // need to think where it can be moved (currently opaque uses it from garray)
+
+    // This class strips type information from GOpaque<T> and makes it usable
+    // in the G-API graph compiler (expression unrolling, graph generation, etc).
+    // Part of GProtoArg.
+    class GAPI_EXPORTS GOpaqueU
+    {
+    public:
+        GOpaqueU(const GNode &n, std::size_t out); // Operation result constructor
+
+        template <typename T>
+        bool holds() const;                       // Check if was created from GOpaque<T>
+
+        GOrigin& priv();                          // Internal use only
+        const GOrigin& priv() const;              // Internal use only
+
+    protected:
+        GOpaqueU();                                // Default constructor
+        template<class> friend class cv::GOpaque;  // (available for GOpaque<T> only)
+
+        void setConstructFcn(ConstructOpaque &&cv);  // Store T-aware constructor
+
+        template <typename T>
+        void specifyType();                       // Store type of initial GOpaque<T>
+
+        template <typename T>
+        void storeKind();                         // Store the OpaqueKind that GOpaqueTraits<T> reports
+
+        void setKind(cv::detail::OpaqueKind);     // Non-template setter that storeKind<T>() forwards to
+
+        std::shared_ptr<GOrigin> m_priv;          // shared handle to the internal origin
+        std::shared_ptr<TypeHintBase> m_hint;     // type hint set by specifyType<T>(), queried by holds<T>()
+    };
+
+    template <typename T>
+    bool GOpaqueU::holds() const{
+        GAPI_Assert(m_hint != nullptr);
+        const auto* typed_hint = dynamic_cast<TypeHint<util::decay_t<T>>*>(m_hint.get());
+        return typed_hint != nullptr; // true only when the stored hint is TypeHint of the decayed T
+    }
+
+    template <typename T>
+    void GOpaqueU::specifyType(){
+        m_hint.reset(new TypeHint<util::decay_t<T>>); // replaces any previous hint with one for the decayed T
+    }
+
+    template <typename T>
+    void GOpaqueU::storeKind(){
+        // FIXME: Add assert here on cv::Mat and cv::Scalar?
+        setKind(cv::detail::GOpaqueTraits<T>::kind); // kind comes from the GOpaqueTraits<T> compile-time mapping
+    }
+
+    // This class represents a typed object reference.
+    // Depending on origins, this reference may be either "just a" reference to
+    // an object created externally, OR actually own the underlying object
+    // (be value holder).
+    class BasicOpaqueRef
+    {
+    public:
+        cv::GOpaqueDesc m_desc;                            // descriptor of the referenced object
+        virtual ~BasicOpaqueRef() {}
+
+        virtual void mov(BasicOpaqueRef &ref) = 0;         // take the content of ref (typed in OpaqueRefT<T>)
+        virtual const void* ptr() const = 0;               // untyped address of the underlying object
+        virtual void set(const cv::util::any &a) = 0;      // assign the object from a type-erased value
+    };
+
+    template<typename T> class OpaqueRefT final: public BasicOpaqueRef
+    {
+        using empty_t  = util::monostate;
+        using ro_ext_t = const T *;
+        using rw_ext_t =       T *;
+        using rw_own_t =       T  ;
+        util::variant<empty_t, ro_ext_t, rw_ext_t, rw_own_t> m_ref; // empty / external read-only / external writable / owned value
+
+        inline bool isEmpty() const { return util::holds_alternative<empty_t>(m_ref);  }
+        inline bool isROExt() const { return util::holds_alternative<ro_ext_t>(m_ref); }
+        inline bool isRWExt() const { return util::holds_alternative<rw_ext_t>(m_ref); }
+        inline bool isRWOwn() const { return util::holds_alternative<rw_own_t>(m_ref); }
+
+        void init(const T* obj = nullptr) // fills m_desc from *obj; a null obj leaves m_desc untouched
+        {
+            if (obj) m_desc = cv::descr_of(*obj);
+        }
+
+    public:
+        OpaqueRefT() { init(); }
+        virtual ~OpaqueRefT() {}
+
+        explicit OpaqueRefT(const T&  obj) : m_ref(&obj)           { init(&obj); }
+        explicit OpaqueRefT(      T&  obj) : m_ref(&obj)           { init(&obj); }
+        explicit OpaqueRefT(      T&& obj) : m_ref(std::move(obj)) { init(&util::get<rw_own_t>(m_ref)); } // describe the stored value, not the moved-from obj
+
+        // Reset an OpaqueRefT. Called only for objects instantiated
+        // internally in G-API (e.g. temporary GOpaque<T>'s within a
+        // computation).  Reset here means both initialization
+        // (creating an object) and reset (discarding its existing
+        // content before the next execution). Must never be called
+        // for external OpaqueRefTs.
+        void reset()
+        {
+            if (isEmpty())
+            {
+                T empty_obj{};
+                m_desc = cv::descr_of(empty_obj);
+                m_ref  = std::move(empty_obj);
+                GAPI_Assert(isRWOwn());
+            }
+            else if (isRWOwn())
+            {
+                util::get<rw_own_t>(m_ref) = {}; // keep ownership, overwrite with a value-initialized T
+            }
+            else GAPI_Error("InternalError"); // shouldn't be called in *EXT modes
+        }
+
+        // Obtain a WRITE reference to underlying object
+        // Used by CPU kernel API wrappers when a kernel execution frame
+        // is created
+        T& wref()
+        {
+            GAPI_Assert(isRWExt() || isRWOwn());
+            if (isRWExt()) return *util::get<rw_ext_t>(m_ref);
+            if (isRWOwn()) return  util::get<rw_own_t>(m_ref);
+            util::throw_error(std::logic_error("Impossible happened"));
+        }
+
+        // Obtain a READ reference to underlying object
+        // Used by CPU kernel API wrappers when a kernel execution frame
+        // is created
+        const T& rref() const
+        {
+            // ANY object can be accessed for reading, even if it declared for
+            // output. Example -- a GComputation from [in] to [out1,out2]
+            // where [out2] is a result of operation applied to [out1]:
+            //
+            //            GComputation boundary
+            //            . . . . . . .
+            //            .           .
+            //     [in] ----> foo() ----> [out1]
+            //            .           .    :
+            //            .           . . .:. . .
+            //            .                V    .
+            //            .              bar() ---> [out2]
+            //            . . . . . . . . . . . .
+            //
+            if (isROExt()) return *util::get<ro_ext_t>(m_ref);
+            if (isRWExt()) return *util::get<rw_ext_t>(m_ref);
+            if (isRWOwn()) return  util::get<rw_own_t>(m_ref);
+            util::throw_error(std::logic_error("Impossible happened"));
+        }
+
+        virtual void mov(BasicOpaqueRef &v) override {
+            OpaqueRefT<T> *tv = dynamic_cast<OpaqueRefT<T>*>(&v);
+            GAPI_Assert(tv != nullptr); // v must hold the same T
+            wref() = std::move(tv->wref());
+        }
+
+        virtual const void* ptr() const override { return &rref(); }
+
+        virtual void set(const cv::util::any &a) override {
+            wref() = util::any_cast<T>(a);
+        }
+    };
+
+    // This class strips type information from OpaqueRefT<> and makes it usable
+    // in the G-API executables (carrying run-time data/information to kernels).
+    // Part of GRunArg.
+    // Its methods are typed proxies to OpaqueRefT<T>.
+    // OpaqueRef maintains "reference" semantics so two copies of OpaqueRef refer
+    // to the same underlying object.
+    class OpaqueRef
+    {
+        std::shared_ptr<BasicOpaqueRef> m_ref; // shared, so copies of OpaqueRef refer to the same object
+        cv::detail::OpaqueKind m_kind = cv::detail::OpaqueKind::CV_UNKNOWN;
+
+        template<typename T> inline void check() const // debug-only check that m_ref really is an OpaqueRefT<T>
+        {
+            GAPI_DbgAssert(dynamic_cast<OpaqueRefT<T>*>(m_ref.get()) != nullptr);
+        }
+
+    public:
+        OpaqueRef() = default;
+
+        template<
+            typename T,
+            typename = util::are_different_t<OpaqueRef, T>
+        >
+        // FIXME: probably won't work with const object
+        explicit OpaqueRef(T&& obj) :
+            m_ref(new OpaqueRefT<util::decay_t<T>>(std::forward<T>(obj))),
+            m_kind(GOpaqueTraits<util::decay_t<T>>::kind) {}
+
+        cv::detail::OpaqueKind getKind() const
+        {
+            return m_kind;
+        }
+
+        template<typename T> void reset()
+        {
+            if (!m_ref) m_ref.reset(new OpaqueRefT<T>()); // lazily create the typed holder on first use
+            check<T>();
+            storeKind<T>();
+            static_cast<OpaqueRefT<T>&>(*m_ref).reset();
+        }
+
+        template <typename T>
+        void storeKind()
+        {
+            m_kind = cv::detail::GOpaqueTraits<T>::kind;
+        }
+
+        template<typename T> T& wref()
+        {
+            check<T>();
+            return static_cast<OpaqueRefT<T>&>(*m_ref).wref();
+        }
+
+        template<typename T> const T& rref() const
+        {
+            check<T>();
+            return static_cast<OpaqueRefT<T>&>(*m_ref).rref();
+        }
+
+        void mov(OpaqueRef &v)
+        {
+            m_ref->mov(*v.m_ref); // no null check: both references must already hold an object
+        }
+
+        cv::GOpaqueDesc descr_of() const
+        {
+            return m_ref->m_desc; // no null check on m_ref
+        }
+
+        // May be used to uniquely identify this object internally
+        const void *ptr() const { return m_ref->ptr(); }
+
+        // Introduced for in-graph meta handling
+        OpaqueRef& operator= (const cv::util::any &a)
+        {
+            m_ref->set(a); // assigns into the referenced object; m_ref itself is not replaced
+            return *this;
+        }
+    };
+} // namespace detail
+
+/** \addtogroup gapi_data_objects
+ * @{
+ */
+/**
+ * @brief `cv::GOpaque<T>` template class represents an object of
+ * class `T` in the graph.
+ *
+ * `cv::GOpaque<T>` describes a functional relationship between operations
+ * consuming and producing object of class `T`. `cv::GOpaque<T>` is
+ * designed to extend G-API with user-defined data types, which are
+ * often required with user-defined operations. G-API can't apply any
+ * optimizations to user-defined types since these types are opaque to
+ * the framework. However, there is a number of G-API operations
+ * declared with `cv::GOpaque<T>` as a return type,
+ * e.g. cv::gapi::streaming::timestamp() or cv::gapi::streaming::size().
+ *
+ * @sa `cv::GArray<T>`
+ */
+template<typename T> class GOpaque
+{
+public:
+    // Host type (or Flat type) - the type this GOpaque is actually
+    // specified to.
+    /// @private
+    using HT = typename detail::flatten_g<util::decay_t<T>>::type;
+
+    /**
+     * @brief Constructs an empty `cv::GOpaque<T>`
+     *
+     * Normally, empty G-API data objects denote a starting point of
+     * the graph. When an empty `cv::GOpaque<T>` is assigned to a result
+     * of some operation, it obtains a functional link to this
+     * operation (and is not empty anymore).
+     */
+    GOpaque() { putDetails(); }              // Empty constructor
+
+    /// @private
+    explicit GOpaque(detail::GOpaqueU &&ref) // GOpaqueU-based constructor (used by GCall, not for users)
+        : m_ref(std::move(ref)) { putDetails(); } // move: ref is an rvalue, no need to copy its shared_ptrs
+
+    /// @private
+    detail::GOpaqueU strip() const {         // returns a copy of the type-erased handle
+        return m_ref;
+    }
+    /// @private
+    static void Ctor(detail::OpaqueRef& ref) { // callback registered via setConstructFcn()
+        ref.reset<HT>();
+    }
+private:
+    void putDetails() {                      // records the T-specific callback, type hint and kind in m_ref
+        m_ref.setConstructFcn(&Ctor);
+        m_ref.specifyType<HT>();
+        m_ref.storeKind<HT>();
+    }
+
+    detail::GOpaqueU m_ref;
+};
+
+/** @} */
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GOPAQUE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gproto.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gproto.hpp
new file mode 100644
index 000000000000..a2b5d83bc1e8
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gproto.hpp
@@ -0,0 +1,159 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GPROTO_HPP
+#define OPENCV_GAPI_GPROTO_HPP
+
+#include <type_traits>
+#include <vector>
+#include <ostream>
+
+#include <opencv2/gapi/util/variant.hpp>
+
+#include <opencv2/gapi/gmat.hpp>
+#include <opencv2/gapi/gscalar.hpp>
+#include <opencv2/gapi/garray.hpp>
+#include <opencv2/gapi/gopaque.hpp>
+#include <opencv2/gapi/garg.hpp>
+#include <opencv2/gapi/gmetaarg.hpp>
+
+namespace cv {
+
+// FIXME: user shouldn't deal with it - put to detail?
+// GProtoArg is a union type over G-types which can serve as
+// GComputation's in/output slots. In other words, GProtoArg
+// wraps any type which can serve as G-API exchange type.
+//
+// In Runtime, GProtoArgs are substituted with appropriate GRunArgs.
+//
+// GProtoArg objects are constructed in-place when user describes
+// (captures) computations, user doesn't interact with these types
+// directly.
+using GProtoArg = util::variant
+    < GMat
+    , GMatP
+    , GFrame
+    , GScalar
+    , detail::GArrayU  // instead of GArray<T>
+    , detail::GOpaqueU // instead of GOpaque<T>
+    >;
+
+using GProtoArgs = std::vector<GProtoArg>;
+
+namespace detail
+{
+template<typename... Ts> inline GProtoArgs packArgs(Ts... args) // arguments are taken by value
+{
+    return GProtoArgs{ GProtoArg(wrap_gapi_helper<Ts>::wrap(args))... }; // one GProtoArg per argument, in argument order
+}
+
+}
+
+template<class Tag> // Tag is not used inside the struct; it only makes In/Out lists distinct types
+struct GIOProtoArgs
+{
+public:
+    // NB: Used by python wrapper
+    GIOProtoArgs() = default;
+    explicit GIOProtoArgs(const GProtoArgs& args) : m_args(args) {}            // copies the list
+    explicit GIOProtoArgs(GProtoArgs &&args)      : m_args(std::move(args)) {} // takes ownership of the list
+
+    GProtoArgs m_args; // the wrapped vector of proto arguments
+
+    // TODO: Think about the addition operator
+    /**
+     * @brief This operator allows to complement the proto vectors at runtime.
+     *
+     * It's an ordinary overload of addition assignment operator.
+     *
+     * Example of usage:
+     * @snippet samples/cpp/tutorial_code/gapi/doc_snippets/dynamic_graph_snippets.cpp  GIOProtoArgs usage
+     *
+     */
+    template<typename Tg>
+    friend GIOProtoArgs<Tg>& operator += (GIOProtoArgs<Tg> &lhs, const GIOProtoArgs<Tg> &rhs);
+};
+
+template<typename Tg>
+cv::GIOProtoArgs<Tg>& operator += (cv::GIOProtoArgs<Tg> &lhs, const cv::GIOProtoArgs<Tg> &rhs)
+{
+    lhs.m_args.reserve(lhs.m_args.size() + rhs.m_args.size()); // single allocation for the combined size
+    lhs.m_args.insert(lhs.m_args.end(), rhs.m_args.begin(), rhs.m_args.end()); // append copies of rhs after the existing lhs entries
+    return lhs;
+}
+
+struct In_Tag{};
+struct Out_Tag{};
+
+using GProtoInputArgs  = GIOProtoArgs<In_Tag>;
+using GProtoOutputArgs = GIOProtoArgs<Out_Tag>;
+
+// Perfect forwarding
+template<typename... Ts> inline GProtoInputArgs GIn(Ts&&... ts) // packs the arguments into an input-tagged proto list
+{
+    return GProtoInputArgs(detail::packArgs(std::forward<Ts>(ts)...));
+}
+
+template<typename... Ts> inline GProtoOutputArgs GOut(Ts&&... ts) // packs the arguments into an output-tagged proto list
+{
+    return GProtoOutputArgs(detail::packArgs(std::forward<Ts>(ts)...));
+}
+
+namespace detail
+{
+    // Extract elements from tuple
+    // FIXME: Someday utilize a generic tuple_to_vec<> routine
+    template<typename... Ts, int... Indexes>
+    static GProtoOutputArgs getGOut_impl(const std::tuple<Ts...>& ts, detail::Seq<Indexes...>) // Seq only carries the index pack
+    {
+        return GProtoOutputArgs{ detail::packArgs(std::get<Indexes>(ts)...)}; // expands to packArgs(get<0>(ts), get<1>(ts), ...)
+    }
+}
+
+template<typename... Ts> inline GProtoOutputArgs GOut(const std::tuple<Ts...>& ts)
+{
+    // TODO: think of std::forward(ts)
+    return detail::getGOut_impl(ts, typename detail::MkSeq<sizeof...(Ts)>::type()); // index sequence sized to the tuple's element count
+}
+
+// Takes rvalue as input arg
+template<typename... Ts> inline GProtoOutputArgs GOut(std::tuple<Ts...>&& ts)
+{
+    // TODO: think of std::forward(ts)
+    return detail::getGOut_impl(ts, typename detail::MkSeq<sizeof...(Ts)>::type()); // ts is a named rvalue reference, so it is passed on as an lvalue
+}
+
+// Extract run-time arguments from node origin
+// Can be used to extract constant values associated with G-objects
+// (like GScalar) at graph construction time
+GRunArg value_of(const GOrigin &origin);
+
+// Transform run-time computation arguments into a collection of metadata
+// extracted from that arguments
+GMetaArg  GAPI_EXPORTS descr_of(const GRunArg  &arg );
+GMetaArgs GAPI_EXPORTS descr_of(const GRunArgs &args);
+
+// Transform run-time operation result argument into metadata extracted from that argument
+// Used to compare the metadata, which generated at compile time with the metadata result operation in run time
+GMetaArg GAPI_EXPORTS descr_of(const GRunArgP& argp);
+
+// Checks if run-time computation argument can be described by metadata
+bool GAPI_EXPORTS can_describe(const GMetaArg&  meta,  const GRunArg&  arg);
+bool GAPI_EXPORTS can_describe(const GMetaArgs& metas, const GRunArgs& args);
+
+// Checks if run-time computation result argument can be described by metadata.
+// Used to check if the metadata generated at compile time
+// coincides with output arguments passed to computation in cpu and ocl backends
+bool GAPI_EXPORTS can_describe(const GMetaArg&  meta,  const GRunArgP& argp);
+
+// Validates input arguments
+void GAPI_EXPORTS validate_input_arg(const GRunArg& arg);
+void GAPI_EXPORTS validate_input_args(const GRunArgs& args);
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GPROTO_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gpu/core.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gpu/core.hpp
new file mode 100644
index 000000000000..a7ee59577ce4
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gpu/core.hpp
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GPU_CORE_API_HPP
+#define OPENCV_GAPI_GPU_CORE_API_HPP
+/** @file
+* @deprecated Use <opencv2/gapi/ocl/core.hpp> instead.
+*/
+
+#include <opencv2/gapi/ocl/core.hpp>
+
+namespace cv {
+namespace gapi {
+namespace core {
+namespace gpu {
+    using namespace ocl; // deprecated "gpu" name: makes the names of the ocl namespace reachable through gpu
+} // namespace gpu
+} // namespace core
+} // namespace gapi
+} // namespace cv
+
+
+#endif // OPENCV_GAPI_GPU_CORE_API_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gpu/ggpukernel.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gpu/ggpukernel.hpp
new file mode 100644
index 000000000000..b52c21de6bf7
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gpu/ggpukernel.hpp
@@ -0,0 +1,18 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GGPUKERNEL_HPP
+#define OPENCV_GAPI_GGPUKERNEL_HPP
+/** @file
+* @deprecated Use <opencv2/gapi/ocl/goclkernel.hpp> instead.
+*/
+
+#include <opencv2/gapi/ocl/goclkernel.hpp>
+#define GAPI_GPU_KERNEL GAPI_OCL_KERNEL
+
+
+#endif // OPENCV_GAPI_GGPUKERNEL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gpu/imgproc.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gpu/imgproc.hpp
new file mode 100644
index 000000000000..b0df7ae3315e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gpu/imgproc.hpp
@@ -0,0 +1,28 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GPU_IMGPROC_API_HPP
+#define OPENCV_GAPI_GPU_IMGPROC_API_HPP
+/** @file
+* @deprecated Use <opencv2/gapi/ocl/imgproc.hpp> instead.
+*/
+
+#include <opencv2/gapi/ocl/imgproc.hpp>
+
+
+namespace cv {
+namespace gapi {
+namespace imgproc {
+namespace gpu {
+    using namespace ocl;
+} // namespace gpu
+} // namespace imgproc
+} // namespace gapi
+} // namespace cv
+
+
+#endif // OPENCV_GAPI_GPU_IMGPROC_API_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gscalar.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gscalar.hpp
new file mode 100644
index 000000000000..de0dfe1383c7
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gscalar.hpp
@@ -0,0 +1,140 @@
+// This file is part of OpenCV project.
+
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GSCALAR_HPP
+#define OPENCV_GAPI_GSCALAR_HPP
+
+#include <ostream>
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/gcommon.hpp> // GShape
+#include <opencv2/gapi/util/optional.hpp>
+
+namespace cv
+{
+// Forward declarations; GNode and GOrigin are internal
+// (user-inaccessible) classes.
+class GNode;
+struct GOrigin;
+
+/** \addtogroup gapi_data_objects
+ * @{
+ */
+/**
+ * @brief GScalar class represents cv::Scalar data in the graph.
+ *
+ * GScalar may be associated with a cv::Scalar value, which becomes
+ * its constant value bound in graph compile time. cv::GScalar describes a
+ * functional relationship between operations consuming and producing
+ * GScalar objects.
+ *
+ * GScalar is a virtual counterpart of cv::Scalar, which is usually used
+ * to represent the GScalar data in G-API during the execution.
+ *
+ * @sa Scalar
+ */
+class GAPI_EXPORTS_W_SIMPLE GScalar
+{
+public:
+    /**
+     * @brief Constructs an empty GScalar
+     *
+     * Normally, empty G-API data objects denote a starting point of
+     * the graph. When an empty GScalar is assigned to a result of some
+     * operation, it obtains a functional link to this operation (and
+     * is not empty anymore).
+     */
+    GAPI_WRAP GScalar();
+
+    /**
+     * @brief Constructs a value-initialized GScalar
+     *
+     * GScalars may have their values be associated at graph
+     * construction time. It is useful when some operation has a
+     * GScalar input which doesn't change during the program
+     * execution, and is set only once. In this case, there is no need
+     * to declare such GScalar as a graph input.
+     *
+     * @note The value of GScalar may be overwritten by assigning some
+     * other GScalar to the object using `operator=` -- on the
+     * assignment, the old GScalar value is discarded.
+     *
+     * @param s a cv::Scalar value to associate with this GScalar object.
+     */
+    GAPI_WRAP
+    explicit GScalar(const cv::Scalar& s);
+
+    /**
+     * @overload
+     * @brief Constructs a value-initialized GScalar
+     *
+     * @param s a cv::Scalar value to associate with this GScalar object.
+     */
+    explicit GScalar(cv::Scalar&& s);       // Constant value move-constructor from cv::Scalar
+
+    /**
+     * @overload
+     * @brief Constructs a value-initialized GScalar
+     *
+     * @param v0 A `double` value to associate with this GScalar. Note
+     *  that only the first component of a four-component cv::Scalar is
+     *  set to this value, while the others remain zero.
+     *
+     * This constructor overload is not marked `explicit` and can be
+     * used in G-API expression code like this:
+     *
+     * @snippet samples/cpp/tutorial_code/gapi/doc_snippets/api_ref_snippets.cpp gscalar_implicit
+     *
+     * Here operator+(GMat,GScalar) is used to wrap cv::gapi::addC()
+     * and a value-initialized GScalar is created on the fly.
+     *
+     * @overload
+     */
+    GScalar(double v0);                                // Constant value constructor from double
+
+    /// @private
+    GScalar(const GNode &n, std::size_t out);          // Operation result constructor
+    /// @private
+    GOrigin& priv();                                   // Internal use only
+    /// @private
+    const GOrigin& priv()  const;                      // Internal use only
+
+private:
+    std::shared_ptr<GOrigin> m_priv;
+};
+
+/** @} */
+
+/**
+ * \addtogroup gapi_meta_args
+ * @{
+ */
+struct GAPI_EXPORTS_W_SIMPLE GScalarDesc
+{
+    // NB.: right now it is empty
+
+    inline bool operator== (const GScalarDesc &) const
+    {
+        return true; // NB: implement this method if GScalar meta appears
+    }
+
+    inline bool operator!= (const GScalarDesc &rhs) const
+    {
+        return !(*this == rhs);
+    }
+};
+
+GAPI_EXPORTS_W inline GScalarDesc empty_scalar_desc() { return GScalarDesc(); }
+
+GAPI_EXPORTS GScalarDesc descr_of(const cv::Scalar &scalar);
+
+std::ostream& operator<<(std::ostream& os, const cv::GScalarDesc &desc);
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GSCALAR_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gstreaming.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gstreaming.hpp
new file mode 100644
index 000000000000..d413195b8178
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gstreaming.hpp
@@ -0,0 +1,430 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2021 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GSTREAMING_COMPILED_HPP
+#define OPENCV_GAPI_GSTREAMING_COMPILED_HPP
+
+#include <memory>
+#include <vector>
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/own/assert.hpp>
+#include <opencv2/gapi/util/optional.hpp>
+#include <opencv2/gapi/garg.hpp>
+#include <opencv2/gapi/streaming/source.hpp>
+
+namespace cv {
+
+template<class T> using optional = cv::util::optional<T>;
+
+namespace detail {
+template<typename T> struct wref_spec { // Primary template: T maps to itself
+    using type = T;
+};
+template<typename T> struct wref_spec<std::vector<T> > { // std::vector<T> maps to its element type T
+    using type = T;
+};
+
+template<typename RefHolder>
+struct OptRef { // Type-erased handle to a cv::optional<T> held by reference; RefHolder is the source type passed to mov()
+    struct OptHolder { // Abstract interface that hides the concrete T
+        virtual void mov(RefHolder &h) = 0;
+        virtual void reset() = 0;
+        virtual ~OptHolder() = default;
+        using Ptr = std::shared_ptr<OptHolder>;
+    };
+    template<class T> struct Holder final: OptHolder {
+        std::reference_wrapper<cv::optional<T> > m_opt_ref; // Refers to the optional given at construction (not a copy)
+
+        explicit Holder(cv::optional<T>& opt) : m_opt_ref(std::ref(opt)) {
+        }
+        virtual void mov(RefHolder &h) override { // Moves the data obtained from h into the referenced optional
+            using U = typename wref_spec<T>::type; // T itself, or the element type when T is a std::vector
+            m_opt_ref.get() = cv::util::make_optional(std::move(h.template wref<U>()));
+        }
+        virtual void reset() override { // Calls reset() on the referenced optional
+            m_opt_ref.get().reset();
+        }
+    };
+    template<class T>
+    explicit OptRef(cv::optional<T>& t) : m_opt{new Holder<T>(t)} {}
+    void mov(RefHolder &h) { m_opt->mov(h); } // Forwards to the stored holder
+    void reset()           { m_opt->reset();} // Forwards to the stored holder
+private:
+    typename OptHolder::Ptr m_opt;
+};
+using OptionalVectorRef = OptRef<cv::detail::VectorRef>;
+using OptionalOpaqueRef = OptRef<cv::detail::OpaqueRef>;
+} // namespace detail
+
+// TODO: Keep it in sync with GRunArgP (derive the type automatically?)
+using GOptRunArgP = util::variant<
+    optional<cv::Mat>*,
+    optional<cv::RMat>*,
+    optional<cv::MediaFrame>*,
+    optional<cv::Scalar>*,
+    cv::detail::OptionalVectorRef,
+    cv::detail::OptionalOpaqueRef
+>;
+using GOptRunArgsP = std::vector<GOptRunArgP>;
+
+using GOptRunArg = util::variant<
+    optional<cv::Mat>,
+    optional<cv::RMat>,
+    optional<cv::MediaFrame>,
+    optional<cv::Scalar>,
+    optional<cv::detail::VectorRef>,
+    optional<cv::detail::OpaqueRef>
+>;
+using GOptRunArgs = std::vector<GOptRunArg>;
+
+namespace detail {
+
+template<typename T> inline GOptRunArgP wrap_opt_arg(optional<T>& arg) {
+    // By default, T goes to an OpaqueRef. All other types are specialized
+    return GOptRunArgP{OptionalOpaqueRef(arg)};
+}
+
+template<typename T> inline GOptRunArgP wrap_opt_arg(optional<std::vector<T> >& arg) {
+    return GOptRunArgP{OptionalVectorRef(arg)};
+}
+
+template<> inline GOptRunArgP wrap_opt_arg(optional<cv::Mat> &m) {
+    return GOptRunArgP{&m};
+}
+
+template<> inline GOptRunArgP wrap_opt_arg(optional<cv::RMat> &m) {
+    return GOptRunArgP{&m};
+}
+
+template<> inline GOptRunArgP wrap_opt_arg(optional<cv::MediaFrame> &f) {
+    return GOptRunArgP{&f};
+}
+
+template<> inline GOptRunArgP wrap_opt_arg(optional<cv::Scalar> &s) {
+    return GOptRunArgP{&s};
+}
+
+} // namespace detail
+
+// Now cv::gout() may produce an empty vector (see "dynamic graphs"), so
+// there may be a conflict between these two. State here that Opt version
+// _must_ have at least one input for this overload
+template<typename T, typename... Ts>
+inline GOptRunArgsP gout(optional<T>&arg, optional<Ts>&... args)
+{
+    return GOptRunArgsP{ detail::wrap_opt_arg(arg), detail::wrap_opt_arg(args)... };
+}
+
+/**
+ * \addtogroup gapi_main_classes
+ * @{
+ */
+/**
+ * @brief Represents a computation (graph) compiled for streaming.
+ *
+ * This class represents a product of graph compilation (calling
+ * cv::GComputation::compileStreaming()). Objects of this class
+ * actually do stream processing, and the whole pipeline execution
+ * complexity is encapsulated in objects of this class. Execution
+ * model has two levels: at the very top, the execution of a
+ * heterogeneous graph is aggressively pipelined; at the very bottom
+ * the execution of every internal block is determined by its
+ * associated backend. Backends are selected based on kernel packages
+ * passed via compilation arguments ( see @ref gapi_compile_args,
+ * GNetworkPackage, GKernelPackage for details).
+ *
+ * GStreamingCompiled objects have a "player" semantics -- there are
+ * methods like start() and stop(). GStreamingCompiled has a full
+ * control over a videostream and so is stateful. You need to specify the
+ * input stream data using setSource() and then call start() to
+ * actually start processing. After that, use pull() or try_pull() to
+ * obtain next processed data frame from the graph in a blocking or
+ * non-blocking way, respectively.
+ *
+ * Currently a single GStreamingCompiled can process only one video
+ * stream at a time. Produce multiple GStreamingCompiled objects to run the
+ * same graph on multiple video streams.
+ *
+ * @sa GCompiled
+ */
+class GAPI_EXPORTS_W_SIMPLE GStreamingCompiled
+{
+public:
+    class GAPI_EXPORTS Priv;
+    GAPI_WRAP GStreamingCompiled();
+
+    // FIXME: More overloads?
+    /**
+     * @brief Specify the input data to GStreamingCompiled for
+     * processing, a generic version.
+     *
+     * Use gin() to create an input parameter vector.
+     *
+     * Input vectors must have the same number of elements as defined
+     * in the cv::GComputation protocol (at the moment of its
+     * construction). Shapes of elements also must conform to protocol
+     * (e.g. cv::Mat needs to be passed where cv::GMat has been
+     * declared as input, and so on). Run-time exception is generated
+     * on type mismatch.
+     *
+     * In contrast with regular GCompiled, user can also pass an
+     * object of type GVideoCapture for a GMat parameter of the parent
+     * GComputation.  The compiled pipeline will start fetching data
+     * from that GVideoCapture and feeding it into the
+     * pipeline. Pipeline stops when a GVideoCapture marks end of the
+     * stream (or when stop() is called).
+     *
+     * Passing a regular Mat for a GMat parameter makes it "infinite"
+     * source -- pipeline may run forever feeding with this Mat until
+     * stopped explicitly.
+     *
+     * Currently only a single GVideoCapture is supported as input. If
+     * the parent GComputation is declared with multiple input GMat's,
+     * one of those can be specified as GVideoCapture but all others
+     * must be regular Mat objects.
+     *
+     * Throws if pipeline is already running. Use stop() and then
+     * setSource() to run the graph on a new video stream.
+     *
+     * @note This method is not thread-safe (with respect to the user
+     * side) at the moment. Protect the access if
+     * start()/stop()/setSource() may be called on the same object in
+     * multiple threads in your application.
+     *
+     * @param ins vector of inputs to process.
+     * @sa gin
+     */
+    void setSource(GRunArgs &&ins);
+
+    /// @private -- Exclude this function from OpenCV documentation
+    GAPI_WRAP void setSource(const cv::detail::ExtractArgsCallback& callback);
+
+    /**
+     * @brief Specify an input video stream for a single-input
+     * computation pipeline.
+     *
+     * Throws if pipeline is already running. Use stop() and then
+     * setSource() to run the graph on a new video stream.
+     *
+     * @overload
+     * @param s a shared pointer to IStreamSource representing the
+     * input video stream.
+     */
+    void setSource(const gapi::wip::IStreamSource::Ptr& s);
+
+    /**
+     * @brief Constructs and specifies an input video stream for a
+     * single-input computation pipeline with the given parameters.
+     *
+     * Throws if pipeline is already running. Use stop() and then
+     * setSource() to run the graph on a new video stream.
+     *
+     * @overload
+     * @param args arguments used to construct and initialize a stream
+     * source.
+     */
+    template<typename T, typename... Args>
+    void setSource(Args&&... args) {
+        setSource(cv::gapi::wip::make_src<T>(std::forward<Args>(args)...));
+    }
+
+    /**
+     * @brief Start the pipeline execution.
+     *
+     * Use pull()/try_pull() to obtain data. Throws an exception if
+     * a video source was not specified.
+     *
+     * setSource() must be called first, even if the pipeline has been
+     * working already and then stopped (explicitly via stop() or due
+     * to stream completion).
+     *
+     * @note This method is not thread-safe (with respect to the user
+     * side) at the moment. Protect the access if
+     * start()/stop()/setSource() may be called on the same object in
+     * multiple threads in your application.
+     */
+    GAPI_WRAP void start();
+
+    /**
+     * @brief Get the next processed frame from the pipeline.
+     *
+     * Use gout() to create an output parameter vector.
+     *
+     * Output vectors must have the same number of elements as defined
+     * in the cv::GComputation protocol (at the moment of its
+     * construction). Shapes of elements also must conform to protocol
+     * (e.g. cv::Mat needs to be passed where cv::GMat has been
+     * declared as output, and so on). Run-time exception is generated
+     * on type mismatch.
+     *
+     * This method writes new data into objects passed via output
+     * vector.  If there is no data ready yet, this method blocks. Use
+     * try_pull() if you need a non-blocking version.
+     *
+     * @param outs vector of output parameters to obtain.
+     * @return true if next result has been obtained,
+     *    false marks end of the stream.
+     */
+    bool pull(cv::GRunArgsP &&outs);
+
+    // NB: Used from python
+    /// @private -- Exclude this function from OpenCV documentation
+    GAPI_WRAP std::tuple<bool, cv::util::variant<cv::GRunArgs, cv::GOptRunArgs>> pull();
+
+    /**
+     * @brief Get some next available data from the pipeline.
+     *
+     * This method takes a vector of cv::optional objects. An object is
+     * assigned to some value if this value is available (ready) at
+     * the time of the call, and resets the object to empty() if it is
+     * not.
+     *
+     * This is a blocking method which guarantees that some data has
+     * been written to the output vector on return.
+     *
+     * Using this method only makes sense if the graph has
+     * desynchronized parts (see cv::gapi::desync). If there are no
+     * desynchronized parts in the graph, the behavior of this
+     * method is identical to the regular pull() (all data objects are
+     * produced synchronously in the output vector).
+     *
+     * Use gout() to create an output parameter vector.
+     *
+     * Output vectors must have the same number of elements as defined
+     * in the cv::GComputation protocol (at the moment of its
+     * construction). Shapes of elements also must conform to protocol
+     * (e.g. cv::optional<cv::Mat> needs to be passed where cv::GMat
+     * has been declared as output, and so on). Run-time exception is
+     * generated on type mismatch.
+     *
+     * This method writes new data into objects passed via output
+     * vector.  If there is no data ready yet, this method blocks. Use
+     * try_pull() if you need a non-blocking version.
+     *
+     * @param outs vector of output parameters to obtain.
+     * @return true if next result has been obtained,
+     *    false marks end of the stream.
+     *
+     * @sa cv::gapi::desync
+     */
+    bool pull(cv::GOptRunArgsP &&outs);
+
+    /**
+     * @brief Try to get the next processed frame from the pipeline.
+     *
+     * Use gout() to create an output parameter vector.
+     *
+     * This method writes new data into objects passed via output
+     * vector.  If there is no data ready yet, the output vector
+     * remains unchanged and false is returned.
+     *
+     * @return true if data has been obtained, and false if it was
+     *    not. Note: false here doesn't mark the end of the stream.
+     */
+    bool try_pull(cv::GRunArgsP &&outs);
+
+    /**
+     * @brief Stop (abort) processing the pipeline.
+     *
+     * Note - it is not pause but a complete stop. Calling start()
+     * will cause G-API to start processing the stream from the very beginning.
+     *
+     * Throws if the pipeline is not running.
+     */
+    GAPI_WRAP void stop();
+
+    /**
+     * @brief Test if the pipeline is running.
+     *
+     * @note This method is not thread-safe (with respect to the user
+     * side) at the moment. Protect the access if
+     * start()/stop()/setSource() may be called on the same object in
+     * multiple threads in your application.
+     *
+     * @return true if the current stream is not over yet.
+     */
+    GAPI_WRAP bool running() const;
+
+    /// @private
+    Priv& priv();
+
+    /**
+     * @brief Check if compiled object is valid (non-empty)
+     *
+     * @return true if the object is runnable (valid), false otherwise
+     */
+    explicit operator bool () const;
+
+    /**
+     * @brief Vector of metadata this graph was compiled for.
+     *
+     * @return Unless _reshape_ is not supported, return value is the
+     * same vector which was passed to cv::GComputation::compile() to
+     * produce this compiled object. Otherwise, it is the latest
+     * metadata vector passed to reshape() (if that call was
+     * successful).
+     */
+    const GMetaArgs& metas() const; // Meta passed to compile()
+
+    /**
+     * @brief Vector of metadata descriptions of graph outputs
+     *
+     * @return vector with formats/resolutions of graph's output
+     * objects, auto-inferred from input metadata vector by
+     * operations which form this computation.
+     *
+     * @note GCompiled objects produced from the same
+     * cv::GComputation graph with different input metas may return
+     * different values in this vector.
+     */
+    const GMetaArgs& outMetas() const;
+
+protected:
+    /// @private
+    std::shared_ptr<Priv> m_priv;
+};
+
+namespace gapi {
+
+/**
+ * @brief This namespace contains G-API functions, structures, and
+ * symbols related to the Streaming execution mode.
+ *
+ * Some of the operations defined in this namespace (e.g. size(),
+ * BGR(), etc.) can be used in the traditional execution mode too.
+ */
+namespace streaming {
+/**
+ * @brief Specify queue capacity for streaming execution.
+ *
+ * In the streaming mode the pipeline steps are connected with queues
+ * and this compile argument controls every queue's size.
+ */
+struct GAPI_EXPORTS_W_SIMPLE queue_capacity
+{
+    GAPI_WRAP
+    explicit queue_capacity(size_t cap = 1) : capacity(cap) { }
+    GAPI_PROP_RW
+    size_t capacity;
+};
+} // namespace streaming
+} // namespace gapi
+
+namespace detail
+{
+template<> struct CompileArgTag<cv::gapi::streaming::queue_capacity>
+{
+    static const char* tag() { return "gapi.queue_capacity"; }
+};
+}
+
+/** @} gapi_main_classes */
+
+}
+
+#endif // OPENCV_GAPI_GSTREAMING_COMPILED_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gtransform.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gtransform.hpp
new file mode 100644
index 000000000000..ce88c894d7f3
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gtransform.hpp
@@ -0,0 +1,103 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019 Intel Corporation
+
+#ifndef OPENCV_GAPI_GTRANSFORM_HPP
+#define OPENCV_GAPI_GTRANSFORM_HPP
+
+#include <functional>
+#include <type_traits>
+#include <utility>
+
+#include <opencv2/gapi/gcommon.hpp>
+#include <opencv2/gapi/util/util.hpp>
+#include <opencv2/gapi/garg.hpp>
+#include <opencv2/gapi/gtype_traits.hpp>
+#include <opencv2/gapi/util/compiler_hints.hpp>
+#include <opencv2/gapi/gcomputation.hpp>
+
+namespace cv
+{
+
+struct GAPI_EXPORTS GTransform
+{
+    // FIXME: consider another simplified
+    // class instead of GComputation
+    using F = std::function<GComputation()>; // Nullary factory producing a GComputation
+
+    std::string description; // Text description of the transformation
+    F pattern;               // NOTE(review): presumably builds the graph to be matched -- confirm against the compiler
+    F substitute;            // NOTE(review): presumably builds the replacement graph -- confirm against the compiler
+
+    GTransform(const std::string& d, const F &p, const F &s) : description(d), pattern(p), substitute(s) {}
+};
+
+namespace detail
+{
+
+template <typename, typename, typename>
+struct TransHelper;
+
+template <typename K, typename... Ins, typename Out>
+struct TransHelper<K, std::tuple<Ins...>, Out>
+{
+    template <typename Callable, int... IIs, int... OIs>
+    static GComputation invoke(Callable f, Seq<IIs...>, Seq<OIs...>)
+    { // IIs index the inputs tuple, OIs index the wrapped result of f
+        const std::tuple<Ins...> ins; // Default-constructed input objects
+        const auto r = tuple_wrap_helper<Out>::get(f(std::get<IIs>(ins)...)); // Call f on the inputs, pass its result through tuple_wrap_helper
+        return GComputation(cv::GIn(std::get<IIs>(ins)...),
+                            cv::GOut(std::get<OIs>(r)...));
+    }
+
+    static GComputation get_pattern()
+    { // Builds a GComputation by invoking K::pattern
+        return invoke(K::pattern, typename MkSeq<sizeof...(Ins)>::type(),
+                      typename MkSeq<std::tuple_size<typename tuple_wrap_helper<Out>::type>::value>::type());
+    }
+    static GComputation get_substitute()
+    { // Builds a GComputation by invoking K::substitute
+        return invoke(K::substitute, typename MkSeq<sizeof...(Ins)>::type(),
+                      typename MkSeq<std::tuple_size<typename tuple_wrap_helper<Out>::type>::value>::type());
+    }
+};
+} // namespace detail
+
+template <typename, typename>
+class GTransformImpl;
+
+template <typename K, typename R, typename... Args>
+class GTransformImpl<K, std::function<R(Args...)>> : public cv::detail::TransHelper<K, std::tuple<Args...>, R>,
+                                                     public cv::detail::TransformTag
+{
+public:
+    // FIXME: currently there is no check that transformations' signatures are unique
+    // and won't be any intersection in graph compilation stage
+    using API = K;
+
+    static GTransform transformation()
+    {
+        return GTransform(K::descr(), &K::get_pattern, &K::get_substitute);
+    }
+};
+} // namespace cv
+
+#define G_DESCR_HELPER_CLASS(Class) Class##DescrHelper
+
+#define G_DESCR_HELPER_BODY(Class, Descr)                       \
+    namespace detail                                            \
+    {                                                           \
+    struct G_DESCR_HELPER_CLASS(Class)                          \
+    {                                                           \
+        static constexpr const char *descr() { return Descr; }  \
+    };                                                          \
+    }
+
+#define GAPI_TRANSFORM(Class, API, Descr)                                     \
+    G_DESCR_HELPER_BODY(Class, Descr)                                         \
+    struct Class final : public cv::GTransformImpl<Class, std::function API>, \
+                         public detail::G_DESCR_HELPER_CLASS(Class)
+
+#endif // OPENCV_GAPI_GTRANSFORM_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gtype_traits.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gtype_traits.hpp
new file mode 100644
index 000000000000..c42d64a7617c
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gtype_traits.hpp
@@ -0,0 +1,242 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2020 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GTYPE_TRAITS_HPP
+#define OPENCV_GAPI_GTYPE_TRAITS_HPP
+
+#include <vector>
+#include <type_traits>
+
+#include <opencv2/gapi/gmat.hpp>
+#include <opencv2/gapi/gscalar.hpp>
+#include <opencv2/gapi/garray.hpp>
+#include <opencv2/gapi/gopaque.hpp>
+#include <opencv2/gapi/gframe.hpp>
+#include <opencv2/gapi/streaming/source.hpp>
+#include <opencv2/gapi/media.hpp>
+#include <opencv2/gapi/gcommon.hpp>
+#include <opencv2/gapi/util/util.hpp>
+#include <opencv2/gapi/own/convert.hpp>
+
+namespace cv
+{
+namespace detail
+{
+    template<typename, typename = void>
+    struct contains_shape_field : std::false_type {}; // Fallback: chosen when the specialization below does not apply
+
+    template<typename TaggedTypeCandidate>
+    struct contains_shape_field<TaggedTypeCandidate,
+                                void_t<decltype(TaggedTypeCandidate::shape)>> :
+        std::is_same<typename std::decay<decltype(TaggedTypeCandidate::shape)>::type, GShape>
+    {}; // Chosen when ::shape is a valid expression; true only if its decayed type is GShape
+
+    template<typename Type>
+    struct has_gshape : contains_shape_field<Type> {}; // Same result as contains_shape_field<Type>
+
+    // FIXME: These traits and enum and possible numerous switch(kind)
+    // block may be replaced with a special Handler<T> object or with
+    // a double dispatch
+    enum class ArgKind: int
+    {
+        OPAQUE_VAL,   // Unknown, generic, opaque-to-GAPI data type - STATIC
+                      // Note: OPAQUE is sometimes defined in Win sys headers
+#if !defined(OPAQUE) && !defined(CV_DOXYGEN)
+        OPAQUE = OPAQUE_VAL,  // deprecated value used for compatibility, use OPAQUE_VAL instead
+#endif
+        GOBJREF,      // <internal> reference to object
+        GMAT,         // a cv::GMat
+        GMATP,        // a cv::GMatP
+        GFRAME,       // a cv::GFrame
+        GSCALAR,      // a cv::GScalar
+        GARRAY,       // a cv::GArrayU  (note - exactly GArrayU,  not GArray<T>!)
+        GOPAQUE,      // a cv::GOpaqueU (note - exactly GOpaqueU, not GOpaque<T>!)
+    };
+
+    // Describe G-API types (G-types) with traits.  Mostly used by
+    // cv::GArg to store meta information about types passed into
+    // operation arguments. Please note that cv::GComputation is
+    // defined on GProtoArgs, not GArgs!
+    template<typename T> struct GTypeTraits;
+    template<typename T> struct GTypeTraits
+    {
+        static constexpr const ArgKind kind = ArgKind::OPAQUE_VAL;
+        static constexpr const OpaqueKind op_kind = OpaqueKind::CV_UNKNOWN;
+    };
+    template<>           struct GTypeTraits<cv::GMat>
+    {
+        static constexpr const ArgKind kind = ArgKind::GMAT;
+        static constexpr const GShape shape = GShape::GMAT;
+        static constexpr const OpaqueKind op_kind = OpaqueKind::CV_UNKNOWN;
+    };
+    template<>           struct GTypeTraits<cv::GMatP>
+    {
+        static constexpr const ArgKind kind = ArgKind::GMATP;
+        static constexpr const GShape shape = GShape::GMAT;
+        static constexpr const OpaqueKind op_kind = OpaqueKind::CV_UNKNOWN;
+    };
+    template<>           struct GTypeTraits<cv::GFrame>
+    {
+        static constexpr const ArgKind kind = ArgKind::GFRAME;
+        static constexpr const GShape shape = GShape::GFRAME;
+        static constexpr const OpaqueKind op_kind = OpaqueKind::CV_UNKNOWN;
+    };
+    template<>           struct GTypeTraits<cv::GScalar>
+    {
+        static constexpr const ArgKind kind = ArgKind::GSCALAR;
+        static constexpr const GShape shape = GShape::GSCALAR;
+        static constexpr const OpaqueKind op_kind = OpaqueKind::CV_UNKNOWN;
+    };
+    template<class T> struct GTypeTraits<cv::GArray<T> >
+    {
+        static constexpr const ArgKind kind = ArgKind::GARRAY;
+        static constexpr const GShape shape = GShape::GARRAY;
+        static constexpr const OpaqueKind op_kind = GOpaqueTraits<T>::kind;
+        using host_type  = std::vector<T>;
+        using strip_type = cv::detail::VectorRef;
+        static cv::detail::GArrayU   wrap_value(const cv::GArray<T>  &t) { return t.strip();}
+        static cv::detail::VectorRef wrap_in   (const std::vector<T> &t) { return detail::VectorRef(t); }
+        static cv::detail::VectorRef wrap_out  (      std::vector<T> &t) { return detail::VectorRef(t); }
+    };
+    template<class T> struct GTypeTraits<cv::GOpaque<T> >
+    {
+        static constexpr const ArgKind kind = ArgKind::GOPAQUE;
+        static constexpr const GShape shape = GShape::GOPAQUE;
+        static constexpr const OpaqueKind op_kind = GOpaqueTraits<T>::kind;
+        using host_type  = T;
+        using strip_type = cv::detail::OpaqueRef;
+        static cv::detail::GOpaqueU  wrap_value(const cv::GOpaque<T>  &t) { return t.strip();}
+        static cv::detail::OpaqueRef wrap_in   (const T &t) { return detail::OpaqueRef(t); }
+        static cv::detail::OpaqueRef wrap_out  (      T &t) { return detail::OpaqueRef(t); }
+    };
+
+    // Tests if Trait for type T requires extra marshalling ("custom wrap") or not.
+    // If Traits<T> has wrap_value() defined, it does.
+    template<class T> struct has_custom_wrap
+    {
+        template<class,class> class check;
+        template<typename C> static std::true_type  test(check<C, decltype(&GTypeTraits<C>::wrap_value)> *);
+        template<typename C> static std::false_type test(...);
+        using type = decltype(test<T>(nullptr));
+        static const constexpr bool value = std::is_same<std::true_type, decltype(test<T>(nullptr))>::value;
+    };
+
+    // Resolve a Host type back to its associated G-Type.
+    // FIXME: Probably it can be avoided
+    // FIXME: GMatP is not present here.
+    // (Actually these traits are used only
+    // if associated G-type has custom wrap functions
+    // and GMat behavior is correct for GMatP)
+    template<typename T> struct GTypeOf;
+#if !defined(GAPI_STANDALONE)
+    template<>           struct GTypeOf<cv::UMat>              { using type = cv::GMat;      };
+#endif // !defined(GAPI_STANDALONE)
+    template<>           struct GTypeOf<cv::Mat>               { using type = cv::GMat;      };
+    template<>           struct GTypeOf<cv::RMat>              { using type = cv::GMat;      };
+    template<>           struct GTypeOf<cv::Scalar>            { using type = cv::GScalar;   };
+    template<typename U> struct GTypeOf<std::vector<U> >       { using type = cv::GArray<U>; };
+    template<typename U> struct GTypeOf                        { using type = cv::GOpaque<U>;};
+    template<>           struct GTypeOf<cv::MediaFrame>        { using type = cv::GFrame;    };
+
+    // FIXME: This is not quite correct since IStreamSource may
+    // produce not only Mat but also MediaFrame, Scalar and vector
+    // data. TODO: Extend the type dispatching on these types too.
+    template<>           struct GTypeOf<cv::gapi::wip::IStreamSource::Ptr> { using type = cv::GMat;};
+    template<class T> using g_type_of_t = typename GTypeOf<T>::type;
+
+    // Marshalling helper for G-types and their Host types. Helps G-API
+    // to store G types in internal generic containers for further
+    // processing. Implements the following callbacks:
+    //
+    // * wrap() - converts user-facing G-type into an internal one
+    //   for internal storage.
+    //   Used when G-API operation is instantiated (G<Kernel>::on(),
+    //   etc) during expressing a pipeline. Mostly returns input
+    //   value "as is" except the case when G-type is a template. For
+    //   template G-classes, calls custom wrap() from Traits.
+    //   The value returned by wrap() is then wrapped into GArg() and
+    //   stored in G-API metadata.
+    //
+    //   Example:
+    //   - cv::GMat arguments are passed as-is.
+    //   - integers, pointers, STL containers, user types are passed as-is.
+    //   - cv::GArray<T> is converted to cv::GArrayU.
+    //
+    // * wrap_in() / wrap_out() - convert Host type associated with
+    //   G-type to internal representation type.
+    //
+    //   - For "simple" (non-template) G-types, returns value as-is.
+    //     Example: cv::GMat has host type cv::Mat, when user passes a
+    //              cv::Mat, system stores it internally as cv::Mat.
+    //
+    //   - For "complex" (template) G-types, utilizes custom
+    //     wrap_in()/wrap_out() as described in Traits.
+    //     Example: cv::GArray<T> has host type std::vector<T>, when
+    //              user passes a std::vector<T>, system stores it
+    //              internally as VectorRef (with <T> stripped away).
+    template<typename T, class Custom = void> struct WrapValue
+    {
+        static auto wrap(const T& t) ->
+            typename std::remove_reference<T>::type
+        {
+            return static_cast<typename std::remove_reference<T>::type>(t);
+        }
+
+        template<typename U> static U  wrap_in (const U &u) { return  u;  }
+        template<typename U> static U* wrap_out(U &u)       { return &u;  }
+    };
+    template<typename T> struct WrapValue<T, typename std::enable_if<has_custom_wrap<T>::value>::type>
+    {
+        static auto wrap(const T& t) -> decltype(GTypeTraits<T>::wrap_value(t))
+        {
+            return GTypeTraits<T>::wrap_value(t);
+        }
+        template<typename U> static auto wrap_in (const U &u) -> typename GTypeTraits<T>::strip_type
+        {
+            static_assert(!(cv::detail::has_gshape<GTypeTraits<U>>::value
+                            || cv::detail::contains<typename std::decay<U>::type, GAPI_OWN_TYPES_LIST>::value),
+                          "gin/gout must not be used with G* classes or cv::gapi::own::*");
+            return GTypeTraits<T>::wrap_in(u);
+        }
+        template<typename U> static auto wrap_out(U &u) -> typename GTypeTraits<T>::strip_type
+        {
+            static_assert(!(cv::detail::has_gshape<GTypeTraits<U>>::value
+                            || cv::detail::contains<typename std::decay<U>::type, GAPI_OWN_TYPES_LIST>::value),
+                          "gin/gout must not be used with G* classes or cv::gapi::own::*");
+            return GTypeTraits<T>::wrap_out(u);
+        }
+    };
+
+    template<typename T> using wrap_gapi_helper = WrapValue<typename std::decay<T>::type>;
+    template<typename T> using wrap_host_helper = WrapValue<typename std::decay<g_type_of_t<T> >::type>;
+
+// Union type for various user-defined type constructors (GArray<T>,
+// GOpaque<T>, etc)
+//
+// TODO: Replace construct-only API with a more generic one (probably
+//    with bits of introspection)
+//
+// Not required for non-user-defined types (GMat, GScalar, etc)
+using HostCtor = util::variant
+    < util::monostate
+    , detail::ConstructVec
+    , detail::ConstructOpaque
+    >;
+
+template<typename T> struct GObtainCtor {
+    static HostCtor get() { return HostCtor{}; }
+};
+template<typename T> struct GObtainCtor<GArray<T> > {
+    static HostCtor get() { return HostCtor{ConstructVec{&GArray<T>::VCtor}}; }
+};
+template<typename T> struct GObtainCtor<GOpaque<T> > {
+    static HostCtor get() { return HostCtor{ConstructOpaque{&GOpaque<T>::Ctor}}; }
+};
+} // namespace detail
+} // namespace cv
+
+#endif // OPENCV_GAPI_GTYPE_TRAITS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gtyped.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gtyped.hpp
new file mode 100644
index 000000000000..2acc2f7ffbf0
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/gtyped.hpp
@@ -0,0 +1,246 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GTYPED_HPP
+#define OPENCV_GAPI_GTYPED_HPP
+#if !defined(GAPI_STANDALONE)
+
+#include <vector>
+
+#include <opencv2/gapi/gcomputation.hpp>
+#include <opencv2/gapi/gcompiled.hpp>
+#include <opencv2/gapi/gproto.hpp>
+#include <opencv2/gapi/gcommon.hpp>
+
+namespace cv {
+
+namespace detail
+{
+    // FIXME: How to prevent coolhackers from extending it by their own types?
+    // FIXME: ...Should we care?
+    template<typename T> struct ProtoToParam;
+    template<> struct ProtoToParam<cv::GMat>    { using type = cv::Mat; };
+    template<> struct ProtoToParam<cv::GScalar> { using type = cv::Scalar; };
+    template<typename U> struct ProtoToParam<cv::GArray<U> >  { using type = std::vector<U>; };
+    template<> struct ProtoToParam<cv::GArray<cv::GMat>>      { using type = std::vector<cv::Mat>; };
+    template<typename U> struct ProtoToParam<cv::GOpaque<U> > { using type = U; };
+    template<typename T> using ProtoToParamT = typename ProtoToParam<T>::type;
+
+    template<typename T> struct ProtoToMeta;
+    template<> struct ProtoToMeta<cv::GMat>     { using type = cv::GMatDesc; };
+    template<> struct ProtoToMeta<cv::GScalar>  { using type = cv::GScalarDesc; };
+    template<typename U> struct ProtoToMeta<cv::GArray<U> >  { using type = cv::GArrayDesc; };
+    template<typename U> struct ProtoToMeta<cv::GOpaque<U> > { using type = cv::GOpaqueDesc; };
+    template<typename T> using ProtoToMetaT = typename ProtoToMeta<T>::type;
+
+    //workaround for MSVC 19.0 bug
+    template <typename T>
+    auto make_default()->decltype(T{}) {return {};}
+} // detail
+
+/**
+ * @brief This class is a typed wrapper over a regular GComputation.
+ *
+ * `std::function<>`-like template parameter specifies the graph
+ *  signature so the object's constructor, methods like
+ *  `apply()` and the derived `GCompiledT::operator()` also become
+ *  typed.
+ *
+ *  There is no need to use cv::gin() or cv::gout() modifiers with
+ *  objects of this class.  Instead, all input arguments are followed
+ *  by all output arguments in the order from the template argument
+ *  signature.
+ *
+ *  Refer to the following example. Regular (untyped) code is written this way:
+ *
+ *  @snippet samples/cpp/tutorial_code/gapi/doc_snippets/api_ref_snippets.cpp Untyped_Example
+ *
+ *  Here:
+ *
+ *  - cv::GComputation object is created with a lambda constructor
+ *    where it is defined as a two-input, one-output graph.
+ *
+ *  - Its method `apply()` in fact takes arbitrary number of arguments
+ *    (as vectors) so user can pass wrong number of inputs/outputs
+ *    here. C++ compiler wouldn't notice that since the cv::GComputation
+ *    API is polymorphic, and only a run-time error will be generated.
+ *
+ *  Now the same code written with typed API:
+ *
+ *  @snippet samples/cpp/tutorial_code/gapi/doc_snippets/api_ref_snippets.cpp Typed_Example
+ *
+ *  The key difference is:
+ *
+ *  - Now the constructor lambda *must take* parameters and *must
+ *    return* values as defined in the `GComputationT<>` signature.
+ *  - Its method `apply()` does not require any extra specifiers to
+ *    separate input arguments from the output ones
+ *  - A `GCompiledT` (compilation product) takes input/output
+ *    arguments with no extra specifiers as well.
+ */
+template<typename> class GComputationT;
+
+// Single return value implementation
+template<typename R, typename... Args> class GComputationT<R(Args...)>
+{
+public:
+    typedef std::function<R(Args...)> Gen;
+
+    class GCompiledT
+    {
+    private:
+        friend class GComputationT<R(Args...)>;
+
+        cv::GCompiled m_comp;
+
+        explicit GCompiledT(const cv::GCompiled &comp) : m_comp(comp) {}
+
+    public:
+        GCompiledT() {}
+
+        void operator()(detail::ProtoToParamT<Args>... inArgs,
+                        detail::ProtoToParamT<R> &outArg)
+        {
+            m_comp(cv::gin(inArgs...), cv::gout(outArg));
+        }
+
+        explicit operator bool() const
+        {
+            return static_cast<bool>(m_comp);
+        }
+    };
+
+private:
+    typedef std::pair<R, GProtoInputArgs > Captured;
+
+    Captured capture(const Gen& g, Args... args)
+    {
+        return Captured(g(args...), cv::GIn(args...));
+    }
+
+    Captured m_capture;
+    cv::GComputation m_comp;
+
+public:
+    GComputationT(const Gen &generator)
+        : m_capture(capture(generator, detail::make_default<Args>()...))
+        , m_comp(cv::GProtoInputArgs(std::move(m_capture.second)),
+                 cv::GOut(m_capture.first))
+    {
+    }
+
+    void apply(detail::ProtoToParamT<Args>... inArgs,
+               detail::ProtoToParamT<R> &outArg,
+               GCompileArgs &&args)
+    {
+        m_comp.apply(cv::gin(inArgs...), cv::gout(outArg), std::move(args));
+    }
+
+    void apply(detail::ProtoToParamT<Args>... inArgs,
+               detail::ProtoToParamT<R> &outArg)
+    {
+        apply(inArgs..., outArg, GCompileArgs());
+    }
+
+
+    GCompiledT compile(detail::ProtoToMetaT<Args>... inDescs)
+    {
+        GMetaArgs inMetas = { GMetaArg(inDescs)... };
+        return GCompiledT(m_comp.compile(std::move(inMetas), GCompileArgs()));
+    }
+
+    GCompiledT compile(detail::ProtoToMetaT<Args>... inDescs, GCompileArgs &&args)
+    {
+        GMetaArgs inMetas = { GMetaArg(inDescs)... };
+        return GCompiledT(m_comp.compile(std::move(inMetas), std::move(args)));
+    }
+};
+
+// Multiple (fixed) return value implementation. FIXME: How to avoid copy-paste?
+template<typename... R, typename... Args> class GComputationT<std::tuple<R...>(Args...)>
+{
+public:
+    typedef std::function<std::tuple<R...>(Args...)> Gen;
+
+    class GCompiledT
+    {
+    private:
+        friend class GComputationT<std::tuple<R...>(Args...)>;
+
+        cv::GCompiled m_comp;
+        explicit GCompiledT(const cv::GCompiled &comp) : m_comp(comp) {}
+
+    public:
+        GCompiledT() {}
+
+        void operator()(detail::ProtoToParamT<Args>... inArgs,
+                        detail::ProtoToParamT<R>&... outArgs)
+        {
+            m_comp(cv::gin(inArgs...), cv::gout(outArgs...));
+        }
+
+        explicit operator bool() const
+        {
+            return static_cast<bool>(m_comp);
+        }
+    };
+
+private:
+    typedef std::pair<GProtoArgs, GProtoArgs> Captured;
+
+    template<int... IIs> // IIs: indices used to unpack the generator's result tuple rr
+    Captured capture(GProtoArgs &&args, const std::tuple<R...> &rr, detail::Seq<IIs...>)
+    {
+        return Captured(cv::GOut(std::get<IIs>(rr)...).m_args, std::move(args)); // args is a named rvalue ref: move it instead of copying
+    }
+
+    Captured capture(const Gen& g, Args... args)
+    {
+        return capture(cv::GIn(args...).m_args, g(args...), typename detail::MkSeq<sizeof...(R)>::type());
+    }
+
+    Captured m_capture;
+    cv::GComputation m_comp;
+
+public:
+    GComputationT(const Gen &generator)
+        : m_capture(capture(generator, detail::make_default<Args>()...))
+        , m_comp(cv::GProtoInputArgs(std::move(m_capture.second)),
+                 cv::GProtoOutputArgs(std::move(m_capture.first)))
+    {
+    }
+
+    void apply(detail::ProtoToParamT<Args>... inArgs,
+               detail::ProtoToParamT<R>&... outArgs,
+               GCompileArgs &&args)
+    {
+        m_comp.apply(cv::gin(inArgs...), cv::gout(outArgs...), std::move(args));
+    }
+
+    void apply(detail::ProtoToParamT<Args>... inArgs,
+               detail::ProtoToParamT<R>&... outArgs)
+    {
+        apply(inArgs..., outArgs..., GCompileArgs());
+    }
+
+
+    GCompiledT compile(detail::ProtoToMetaT<Args>... inDescs)
+    {
+        GMetaArgs inMetas = { GMetaArg(inDescs)... };
+        return GCompiledT(m_comp.compile(std::move(inMetas), GCompileArgs()));
+    }
+
+    GCompiledT compile(detail::ProtoToMetaT<Args>... inDescs, GCompileArgs &&args)
+    {
+        GMetaArgs inMetas = { GMetaArg(inDescs)... };
+        return GCompiledT(m_comp.compile(std::move(inMetas), std::move(args)));
+    }
+};
+
+} // namespace cv
+#endif // !defined(GAPI_STANDALONE)
+#endif // OPENCV_GAPI_GTYPED_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/imgproc.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/imgproc.hpp
new file mode 100644
index 000000000000..96aaa5a447f5
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/imgproc.hpp
@@ -0,0 +1,1769 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2020 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_IMGPROC_HPP
+#define OPENCV_GAPI_IMGPROC_HPP
+
+#include <opencv2/imgproc.hpp>
+
+#include <utility> // std::tuple
+
+#include <opencv2/gapi/gkernel.hpp>
+#include <opencv2/gapi/gmat.hpp>
+#include <opencv2/gapi/gscalar.hpp>
+
+
+/** \defgroup gapi_imgproc G-API Image processing functionality
+@{
+    @defgroup gapi_filters Graph API: Image filters
+    @defgroup gapi_colorconvert Graph API: Converting image from one color space to another
+    @defgroup gapi_feature Graph API: Image Feature Detection
+    @defgroup gapi_shape Graph API: Image Structural Analysis and Shape Descriptors
+    @defgroup gapi_transform Graph API: Image and channel composition functions
+@}
+ */
+
+namespace {
+void validateFindingContoursMeta(const int depth, const int chan, const int mode)
+{ // Asserts that the input's channel count and depth are acceptable for the given retrieval mode
+    GAPI_Assert(chan == 1); // a single-channel input is required for every mode
+    switch (mode)
+    {
+    case cv::RETR_CCOMP: // depth must be CV_8U or CV_32S
+        GAPI_Assert(depth == CV_8U || depth == CV_32S);
+        break;
+    case cv::RETR_FLOODFILL: // depth must be CV_32S
+        GAPI_Assert(depth == CV_32S);
+        break;
+    default: // every other mode: depth must be CV_8U
+        GAPI_Assert(depth == CV_8U);
+        break;
+    }
+}
+} // anonymous namespace
+
+namespace cv { namespace gapi {
+
+/**
+ * @brief This namespace contains G-API Operation Types for OpenCV
+ * ImgProc module functionality.
+ */
+namespace imgproc {
+    using GMat2 = std::tuple<GMat,GMat>;
+    using GMat3 = std::tuple<GMat,GMat,GMat>; // FIXME: how to avoid this?
+    using GFindContoursOutput = std::tuple<GArray<GArray<Point>>,GArray<Vec4i>>;
+
+    G_TYPED_KERNEL(GFilter2D, <GMat(GMat,int,Mat,Point,Scalar,int,Scalar)>, "org.opencv.imgproc.filters.filter2D") {
+        static GMatDesc outMeta(GMatDesc in, int ddepth, Mat, Point, Scalar, int, Scalar) {
+            return in.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GSepFilter, <GMat(GMat,int,Mat,Mat,Point,Scalar,int,Scalar)>, "org.opencv.imgproc.filters.sepfilter") {
+        static GMatDesc outMeta(GMatDesc in, int ddepth, Mat, Mat, Point, Scalar, int, Scalar) {
+            return in.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GBoxFilter, <GMat(GMat,int,Size,Point,bool,int,Scalar)>, "org.opencv.imgproc.filters.boxfilter") {
+        static GMatDesc outMeta(GMatDesc in, int ddepth, Size, Point, bool, int, Scalar) {
+            return in.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GBlur, <GMat(GMat,Size,Point,int,Scalar)>, "org.opencv.imgproc.filters.blur") {
+        static GMatDesc outMeta(GMatDesc in, Size, Point, int, Scalar) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GGaussBlur, <GMat(GMat,Size,double,double,int,Scalar)>, "org.opencv.imgproc.filters.gaussianBlur") {
+        static GMatDesc outMeta(GMatDesc in, Size, double, double, int, Scalar) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GMedianBlur, <GMat(GMat,int)>, "org.opencv.imgproc.filters.medianBlur") {
+        static GMatDesc outMeta(GMatDesc in, int) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GErode, <GMat(GMat,Mat,Point,int,int,Scalar)>, "org.opencv.imgproc.filters.erode") {
+        static GMatDesc outMeta(GMatDesc in, Mat, Point, int, int, Scalar) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GDilate, <GMat(GMat,Mat,Point,int,int,Scalar)>, "org.opencv.imgproc.filters.dilate") {
+        static GMatDesc outMeta(GMatDesc in, Mat, Point, int, int, Scalar) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GMorphologyEx, <GMat(GMat,MorphTypes,Mat,Point,int,BorderTypes,Scalar)>,
+                   "org.opencv.imgproc.filters.morphologyEx") {
+        static GMatDesc outMeta(const GMatDesc &in, MorphTypes, Mat, Point, int,
+                                BorderTypes, Scalar) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GSobel, <GMat(GMat,int,int,int,int,double,double,int,Scalar)>, "org.opencv.imgproc.filters.sobel") {
+        static GMatDesc outMeta(GMatDesc in, int ddepth, int, int, int, double, double, int, Scalar) {
+            return in.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL_M(GSobelXY, <GMat2(GMat,int,int,int,double,double,int,Scalar)>, "org.opencv.imgproc.filters.sobelxy") {
+        static std::tuple<GMatDesc, GMatDesc> outMeta(GMatDesc in, int ddepth, int, int, double, double, int, Scalar) {
+            return std::make_tuple(in.withDepth(ddepth), in.withDepth(ddepth));
+        }
+    };
+
+    G_TYPED_KERNEL(GLaplacian, <GMat(GMat,int, int, double, double, int)>,
+                   "org.opencv.imgproc.filters.laplacian") {
+        static GMatDesc outMeta(GMatDesc in, int ddepth, int, double, double, int) {
+            return in.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GBilateralFilter, <GMat(GMat,int, double, double, int)>,
+                   "org.opencv.imgproc.filters.bilateralfilter") {
+        static GMatDesc outMeta(GMatDesc in, int, double, double, int) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GEqHist, <GMat(GMat)>, "org.opencv.imgproc.equalizeHist") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in.withType(CV_8U, 1);
+        }
+    };
+
+    G_TYPED_KERNEL(GCanny, <GMat(GMat,double,double,int,bool)>, "org.opencv.imgproc.feature.canny") {
+        static GMatDesc outMeta(GMatDesc in, double, double, int, bool) {
+            return in.withType(CV_8U, 1);
+        }
+    };
+
+    G_TYPED_KERNEL(GGoodFeatures,
+                   <cv::GArray<cv::Point2f>(GMat,int,double,double,Mat,int,bool,double)>,
+                   "org.opencv.imgproc.feature.goodFeaturesToTrack") {
+        static GArrayDesc outMeta(GMatDesc, int, double, double, const Mat&, int, bool, double) {
+            return empty_array_desc();
+        }
+    };
+
+    using RetrMode = RetrievalModes;
+    using ContMethod = ContourApproximationModes;
+    G_TYPED_KERNEL(GFindContours, <GArray<GArray<Point>>(GMat,RetrMode,ContMethod,GOpaque<Point>)>,
+                   "org.opencv.imgproc.shape.findContours")
+    {
+        static GArrayDesc outMeta(GMatDesc in, RetrMode mode, ContMethod, GOpaqueDesc)
+        {
+            validateFindingContoursMeta(in.depth, in.chan, mode);
+            return empty_array_desc();
+        }
+    };
+
+    // FIXME oc: make default value offset = Point()
+    G_TYPED_KERNEL(GFindContoursNoOffset, <GArray<GArray<Point>>(GMat,RetrMode,ContMethod)>,
+                   "org.opencv.imgproc.shape.findContoursNoOffset")
+    {
+        static GArrayDesc outMeta(GMatDesc in, RetrMode mode, ContMethod)
+        {
+            validateFindingContoursMeta(in.depth, in.chan, mode);
+            return empty_array_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GFindContoursH,<GFindContoursOutput(GMat,RetrMode,ContMethod,GOpaque<Point>)>,
+                   "org.opencv.imgproc.shape.findContoursH")
+    {
+        static std::tuple<GArrayDesc,GArrayDesc>
+        outMeta(GMatDesc in, RetrMode mode, ContMethod, GOpaqueDesc)
+        {
+            validateFindingContoursMeta(in.depth, in.chan, mode);
+            return std::make_tuple(empty_array_desc(), empty_array_desc());
+        }
+    };
+
+    // FIXME oc: make default value offset = Point()
+    G_TYPED_KERNEL(GFindContoursHNoOffset,<GFindContoursOutput(GMat,RetrMode,ContMethod)>,
+                   "org.opencv.imgproc.shape.findContoursHNoOffset")
+    {
+        static std::tuple<GArrayDesc,GArrayDesc>
+        outMeta(GMatDesc in, RetrMode mode, ContMethod)
+        {
+            validateFindingContoursMeta(in.depth, in.chan, mode);
+            return std::make_tuple(empty_array_desc(), empty_array_desc());
+        }
+    };
+
+    G_TYPED_KERNEL(GBoundingRectMat, <GOpaque<Rect>(GMat)>,
+                   "org.opencv.imgproc.shape.boundingRectMat") {
+        static GOpaqueDesc outMeta(GMatDesc in) {
+            if (in.depth == CV_8U)
+            {
+                GAPI_Assert(in.chan == 1);
+            }
+            else
+            {
+                GAPI_Assert (in.depth == CV_32S || in.depth == CV_32F);
+                int amount = detail::checkVector(in, 2u);
+                GAPI_Assert(amount != -1 &&
+                            "Input Mat can't be described as vector of 2-dimentional points");
+            }
+            return empty_gopaque_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GBoundingRectVector32S, <GOpaque<Rect>(GArray<Point2i>)>,
+                   "org.opencv.imgproc.shape.boundingRectVector32S") {
+        static GOpaqueDesc outMeta(GArrayDesc) {
+            return empty_gopaque_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GBoundingRectVector32F, <GOpaque<Rect>(GArray<Point2f>)>,
+                   "org.opencv.imgproc.shape.boundingRectVector32F") {
+        static GOpaqueDesc outMeta(GArrayDesc) {
+            return empty_gopaque_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GFitLine2DMat, <GOpaque<Vec4f>(GMat,DistanceTypes,double,double,double)>,
+                   "org.opencv.imgproc.shape.fitLine2DMat") {
+        static GOpaqueDesc outMeta(GMatDesc in,DistanceTypes,double,double,double) {
+            int amount = detail::checkVector(in, 2u);
+            GAPI_Assert(amount != -1 &&
+                        "Input Mat can't be described as vector of 2-dimentional points");
+            return empty_gopaque_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GFitLine2DVector32S,
+                   <GOpaque<Vec4f>(GArray<Point2i>,DistanceTypes,double,double,double)>,
+                   "org.opencv.imgproc.shape.fitLine2DVector32S") {
+        static GOpaqueDesc outMeta(GArrayDesc,DistanceTypes,double,double,double) {
+            return empty_gopaque_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GFitLine2DVector32F,
+                   <GOpaque<Vec4f>(GArray<Point2f>,DistanceTypes,double,double,double)>,
+                   "org.opencv.imgproc.shape.fitLine2DVector32F") {
+        static GOpaqueDesc outMeta(GArrayDesc,DistanceTypes,double,double,double) {
+            return empty_gopaque_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GFitLine2DVector64F,
+                   <GOpaque<Vec4f>(GArray<Point2d>,DistanceTypes,double,double,double)>,
+                   "org.opencv.imgproc.shape.fitLine2DVector64F") {
+        static GOpaqueDesc outMeta(GArrayDesc,DistanceTypes,double,double,double) {
+            return empty_gopaque_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GFitLine3DMat, <GOpaque<Vec6f>(GMat,DistanceTypes,double,double,double)>,
+                   "org.opencv.imgproc.shape.fitLine3DMat") {
+        static GOpaqueDesc outMeta(GMatDesc in,DistanceTypes,double,double,double) { // parameter types mirror the kernel signature
+            int amount = detail::checkVector(in, 3u); // asserted below to be != -1
+            GAPI_Assert(amount != -1 &&
+                        "Input Mat can't be described as vector of 3-dimentional points");
+            return empty_gopaque_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GFitLine3DVector32S,
+                   <GOpaque<Vec6f>(GArray<Point3i>,DistanceTypes,double,double,double)>,
+                   "org.opencv.imgproc.shape.fitLine3DVector32S") {
+        static GOpaqueDesc outMeta(GArrayDesc,DistanceTypes,double,double,double) {
+            return empty_gopaque_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GFitLine3DVector32F,
+                   <GOpaque<Vec6f>(GArray<Point3f>,DistanceTypes,double,double,double)>,
+                   "org.opencv.imgproc.shape.fitLine3DVector32F") {
+        static GOpaqueDesc outMeta(GArrayDesc,DistanceTypes,double,double,double) {
+            return empty_gopaque_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GFitLine3DVector64F,
+                   <GOpaque<Vec6f>(GArray<Point3d>,DistanceTypes,double,double,double)>,
+                   "org.opencv.imgproc.shape.fitLine3DVector64F") {
+        static GOpaqueDesc outMeta(GArrayDesc,DistanceTypes,double,double,double) {
+            return empty_gopaque_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GBGR2RGB, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.bgr2rgb") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in; // type still remains CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GRGB2YUV, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.rgb2yuv") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in; // type still remains CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GYUV2RGB, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.yuv2rgb") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in; // type still remains CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GBGR2I420, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.bgr2i420") {
+        static GMatDesc outMeta(GMatDesc in) {
+            GAPI_Assert(in.depth == CV_8U);
+            GAPI_Assert(in.chan == 3);
+            GAPI_Assert(in.size.height % 2 == 0);
+            return in.withType(in.depth, 1).withSize(Size(in.size.width, in.size.height * 3 / 2));
+        }
+    };
+
+    G_TYPED_KERNEL(GRGB2I420, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.rgb2i420") {
+        static GMatDesc outMeta(GMatDesc in) {
+            GAPI_Assert(in.depth == CV_8U);
+            GAPI_Assert(in.chan == 3);
+            GAPI_Assert(in.size.height % 2 == 0);
+            return in.withType(in.depth, 1).withSize(Size(in.size.width, in.size.height * 3 / 2));
+        }
+    };
+
+    G_TYPED_KERNEL(GI4202BGR, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.i4202bgr") {
+        static GMatDesc outMeta(GMatDesc in) {
+            GAPI_Assert(in.depth == CV_8U);
+            GAPI_Assert(in.chan == 1);
+            GAPI_Assert(in.size.height % 3 == 0);
+            return in.withType(in.depth, 3).withSize(Size(in.size.width, in.size.height * 2 / 3));
+        }
+    };
+
+    G_TYPED_KERNEL(GI4202RGB, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.i4202rgb") {
+        static GMatDesc outMeta(GMatDesc in) {
+            GAPI_Assert(in.depth == CV_8U);
+            GAPI_Assert(in.chan == 1);
+            GAPI_Assert(in.size.height % 3 == 0);
+            return in.withType(in.depth, 3).withSize(Size(in.size.width, in.size.height * 2 / 3));
+        }
+    };
+
+    G_TYPED_KERNEL(GNV12toRGB, <GMat(GMat, GMat)>, "org.opencv.imgproc.colorconvert.nv12torgb") {
+        static GMatDesc outMeta(GMatDesc in_y, GMatDesc in_uv) {
+            GAPI_Assert(in_y.chan == 1);
+            GAPI_Assert(in_uv.chan == 2);
+            GAPI_Assert(in_y.depth == CV_8U);
+            GAPI_Assert(in_uv.depth == CV_8U);
+            // UV size should be aligned with Y
+            GAPI_Assert(in_y.size.width == 2 * in_uv.size.width);
+            GAPI_Assert(in_y.size.height == 2 * in_uv.size.height);
+            return in_y.withType(CV_8U, 3); // type will be CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GNV12toBGR, <GMat(GMat, GMat)>, "org.opencv.imgproc.colorconvert.nv12tobgr") {
+        static GMatDesc outMeta(GMatDesc in_y, GMatDesc in_uv) {
+            GAPI_Assert(in_y.chan == 1);
+            GAPI_Assert(in_uv.chan == 2);
+            GAPI_Assert(in_y.depth == CV_8U);
+            GAPI_Assert(in_uv.depth == CV_8U);
+            // UV size should be aligned with Y
+            GAPI_Assert(in_y.size.width == 2 * in_uv.size.width);
+            GAPI_Assert(in_y.size.height == 2 * in_uv.size.height);
+            return in_y.withType(CV_8U, 3); // type will be CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GRGB2Lab, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.rgb2lab") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in; // type still remains CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GBGR2LUV, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.bgr2luv") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in; // type still remains CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GLUV2BGR, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.luv2bgr") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in; // type still remains CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GYUV2BGR, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.yuv2bgr") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in; // type still remains CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GBGR2YUV, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.bgr2yuv") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in; // type still remains CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GRGB2Gray, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.rgb2gray") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in.withType(CV_8U, 1);
+        }
+    };
+
+    G_TYPED_KERNEL(GRGB2GrayCustom, <GMat(GMat,float,float,float)>, "org.opencv.imgproc.colorconvert.rgb2graycustom") {
+        static GMatDesc outMeta(GMatDesc in, float, float, float) {
+            return in.withType(CV_8U, 1);
+        }
+    };
+
+    G_TYPED_KERNEL(GBGR2Gray, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.bgr2gray") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in.withType(CV_8U, 1);
+        }
+    };
+
+    G_TYPED_KERNEL(GBayerGR2RGB, <cv::GMat(cv::GMat)>, "org.opencv.imgproc.colorconvert.bayergr2rgb") {
+        static cv::GMatDesc outMeta(cv::GMatDesc in) {
+            return in.withType(CV_8U, 3);
+        }
+    };
+
+    G_TYPED_KERNEL(GRGB2HSV, <cv::GMat(cv::GMat)>, "org.opencv.imgproc.colorconvert.rgb2hsv") { // typed kernel: one GMat input, one GMat output
+        static cv::GMatDesc outMeta(cv::GMatDesc in) { // computes the output description from the input description
+            return in; // output description equals the input one; nothing is validated here
+        }
+    };
+
+    G_TYPED_KERNEL(GRGB2YUV422, <cv::GMat(cv::GMat)>, "org.opencv.imgproc.colorconvert.rgb2yuv422") { // typed kernel: one GMat input, one GMat output
+        static cv::GMatDesc outMeta(cv::GMatDesc in) { // validates the input description, then derives the output one
+            GAPI_Assert(in.depth == CV_8U); // only CV_8U depth is accepted
+            GAPI_Assert(in.chan == 3);      // only 3-channel input is accepted
+            return in.withType(in.depth, 2); // same depth as the input, channel count changed to 2
+        }
+    };
+
+    G_TYPED_KERNEL(GNV12toRGBp, <GMatP(GMat,GMat)>, "org.opencv.imgproc.colorconvert.nv12torgbp") { // two GMat inputs, one GMatP output
+        static GMatDesc outMeta(GMatDesc inY, GMatDesc inUV) { // validates both input descriptions, then derives the output one
+            GAPI_Assert(inY.depth == CV_8U);  // both inputs must have CV_8U depth
+            GAPI_Assert(inUV.depth == CV_8U);
+            GAPI_Assert(inY.chan == 1);        // first input: exactly 1 channel, not planar
+            GAPI_Assert(inY.planar == false);
+            GAPI_Assert(inUV.chan == 2);       // second input: exactly 2 channels, not planar
+            GAPI_Assert(inUV.planar == false);
+            GAPI_Assert(inY.size.width  == 2 * inUV.size.width);  // inY must be exactly twice inUV in both dimensions
+            GAPI_Assert(inY.size.height == 2 * inUV.size.height); // (for integer sizes this also forces even inY dimensions)
+            return inY.withType(CV_8U, 3).asPlanar(); // built from inY: re-typed to (CV_8U, 3 channels), then asPlanar()
+        }
+    };
+
+    G_TYPED_KERNEL(GNV12toGray, <GMat(GMat,GMat)>, "org.opencv.imgproc.colorconvert.nv12togray") { // two GMat inputs, one GMat output
+        static GMatDesc outMeta(GMatDesc inY, GMatDesc inUV) { // validates both input descriptions, then derives the output one
+            GAPI_Assert(inY.depth   == CV_8U); // both inputs must have CV_8U depth
+            GAPI_Assert(inUV.depth  == CV_8U);
+            GAPI_Assert(inY.chan    == 1);     // first input: exactly 1 channel, not planar
+            GAPI_Assert(inY.planar  == false);
+            GAPI_Assert(inUV.chan   == 2);     // second input: exactly 2 channels, not planar
+            GAPI_Assert(inUV.planar == false);
+
+            GAPI_Assert(inY.size.width  == 2 * inUV.size.width);  // inY must be exactly twice inUV in both dimensions
+            GAPI_Assert(inY.size.height == 2 * inUV.size.height); // (for integer sizes this also forces even inY dimensions)
+            return inY.withType(CV_8U, 1); // built from inY, re-typed to (CV_8U, 1 channel); inUV is only validated
+        }
+    };
+
+    G_TYPED_KERNEL(GNV12toBGRp, <GMatP(GMat,GMat)>, "org.opencv.imgproc.colorconvert.nv12tobgrp") { // two GMat inputs, one GMatP output
+        static GMatDesc outMeta(GMatDesc inY, GMatDesc inUV) { // validates both input descriptions, then derives the output one
+            GAPI_Assert(inY.depth == CV_8U);  // both inputs must have CV_8U depth
+            GAPI_Assert(inUV.depth == CV_8U);
+            GAPI_Assert(inY.chan == 1);        // first input: exactly 1 channel, not planar
+            GAPI_Assert(inY.planar == false);
+            GAPI_Assert(inUV.chan == 2);       // second input: exactly 2 channels, not planar
+            GAPI_Assert(inUV.planar == false);
+            GAPI_Assert(inY.size.width  == 2 * inUV.size.width);  // inY must be exactly twice inUV in both dimensions
+            GAPI_Assert(inY.size.height == 2 * inUV.size.height); // (for integer sizes this also forces even inY dimensions)
+            return inY.withType(CV_8U, 3).asPlanar(); // built from inY: re-typed to (CV_8U, 3 channels), then asPlanar()
+        }
+    };
+
+    G_TYPED_KERNEL(GResize, <GMat(GMat,Size,double,double,int)>, "org.opencv.imgproc.transform.resize") { // GMat + Size + two doubles + int in, GMat out
+        static GMatDesc outMeta(GMatDesc in, Size sz, double fx, double fy, int /*interp*/) { // the int argument is unnamed: unused for metadata
+            if (sz.width != 0 && sz.height != 0) // explicit size is used only when BOTH dimensions are non-zero; fx/fy are then ignored
+            {
+                return in.withSize(sz); // NOTE(review): negative sz components are not rejected on this path
+            }
+            else // at least one dimension of sz is zero: derive the output size from the scale factors instead
+            {
+                int outSz_w = saturate_cast<int>(in.size.width  * fx); // input width scaled by fx, converted to int via saturate_cast
+                int outSz_h = saturate_cast<int>(in.size.height * fy); // input height scaled by fy, converted the same way
+                GAPI_Assert(outSz_w > 0 && outSz_h > 0); // rejects scale factors that yield a zero or negative dimension
+                return in.withSize(Size(outSz_w, outSz_h));
+            }
+        }
+    };
+
+    G_TYPED_KERNEL(GResizeP, <GMatP(GMatP,Size,int)>, "org.opencv.imgproc.transform.resizeP") { // GMatP + Size + int in, GMatP out
+        static GMatDesc outMeta(GMatDesc in, Size sz, int interp) { // validates the input description and interp, then applies the new size
+            GAPI_Assert(in.depth == CV_8U); // only CV_8U depth is accepted
+            GAPI_Assert(in.chan == 3);      // only 3-channel input is accepted
+            GAPI_Assert(in.planar);         // input description must be marked planar
+            GAPI_Assert(interp == cv::INTER_LINEAR); // cv::INTER_LINEAR is the only accepted interpolation value
+            return in.withSize(sz); // sz is applied as given; it is not checked for zero or negative values here
+        }
+    };
+
+} //namespace imgproc
+
+//! @addtogroup gapi_filters
+//! @{
+/** @brief Applies a separable linear filter to a matrix(image).
+
+The function applies a separable linear filter to the matrix. That is, first, every row of src is
+filtered with the 1D kernel kernelX. Then, every column of the result is filtered with the 1D
+kernel kernelY. The final result is returned.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+Output image must have the same type, size, and number of channels as the input image.
+@note
+ - In case of floating-point computation, rounding to nearest even is performed
+if hardware supports it (if not - to nearest value).
+ - Function textual ID is "org.opencv.imgproc.filters.sepfilter"
+@param src Source image.
+@param ddepth desired depth of the destination image (the following combinations of src.depth() and ddepth are supported:
+
+        src.depth() = CV_8U, ddepth = -1/CV_16S/CV_32F/CV_64F
+        src.depth() = CV_16U/CV_16S, ddepth = -1/CV_32F/CV_64F
+        src.depth() = CV_32F, ddepth = -1/CV_32F/CV_64F
+        src.depth() = CV_64F, ddepth = -1/CV_64F
+
+when ddepth=-1, the output image will have the same depth as the source)
+@param kernelX Coefficients for filtering each row.
+@param kernelY Coefficients for filtering each column.
+@param anchor Anchor position within the kernel. The default value \f$(-1,-1)\f$ means that the anchor
+is at the kernel center.
+@param delta Value added to the filtered results before storing them.
+@param borderType Pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of constant border type
+@sa  boxFilter, gaussianBlur, medianBlur
+ */
+GAPI_EXPORTS_W GMat sepFilter(const GMat& src, int ddepth, const Mat& kernelX, const Mat& kernelY, const Point& anchor /*FIXME: = Point(-1,-1)*/,
+                              const Scalar& delta /*FIXME = GScalar(0)*/, int borderType = BORDER_DEFAULT,
+                              const Scalar& borderValue = Scalar(0));
+
+/** @brief Convolves an image with the kernel.
+
+The function applies an arbitrary linear filter to an image. When
+the aperture is partially outside the image, the function interpolates outlier pixel values
+according to the specified border mode.
+
+The function does actually compute correlation, not the convolution:
+
+\f[\texttt{dst} (x,y) =  \sum _{ \substack{0\leq x' < \texttt{kernel.cols}\\{0\leq y' < \texttt{kernel.rows}}}}  \texttt{kernel} (x',y')* \texttt{src} (x+x'- \texttt{anchor.x} ,y+y'- \texttt{anchor.y} )\f]
+
+That is, the kernel is not mirrored around the anchor point. If you need a real convolution, flip
+the kernel using flip and set the new anchor to `(kernel.cols - anchor.x - 1, kernel.rows -
+anchor.y - 1)`.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+Output image must have the same size and number of channels as the input image.
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.filter2D"
+
+@param src input image.
+@param ddepth desired depth of the destination image
+@param kernel convolution kernel (or rather a correlation kernel), a single-channel floating point
+matrix; if you want to apply different kernels to different channels, split the image into
+separate color planes using split and process them individually.
+@param anchor anchor of the kernel that indicates the relative position of a filtered point within
+the kernel; the anchor should lie within the kernel; default value (-1,-1) means that the anchor
+is at the kernel center.
+@param delta optional value added to the filtered pixels before storing them in dst.
+@param borderType pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of constant border type
+@sa  sepFilter
+ */
+GAPI_EXPORTS_W GMat filter2D(const GMat& src, int ddepth, const Mat& kernel, const Point& anchor = Point(-1,-1), const Scalar& delta = Scalar(0),
+                             int borderType = BORDER_DEFAULT, const Scalar& borderValue = Scalar(0));
+
+
+/** @brief Blurs an image using the box filter.
+
+The function smooths an image using the kernel:
+
+\f[\texttt{K} =  \alpha \begin{bmatrix} 1 & 1 & 1 &  \cdots & 1 & 1  \\ 1 & 1 & 1 &  \cdots & 1 & 1  \\ \hdotsfor{6} \\ 1 & 1 & 1 &  \cdots & 1 & 1 \end{bmatrix}\f]
+
+where
+
+\f[\alpha = \begin{cases} \frac{1}{\texttt{ksize.width*ksize.height}} & \texttt{when } \texttt{normalize=true}  \\1 & \texttt{otherwise} \end{cases}\f]
+
+Unnormalized box filter is useful for computing various integral characteristics over each pixel
+neighborhood, such as covariance matrices of image derivatives (used in dense optical flow
+algorithms, and so on). If you need to compute pixel sums over variable-size windows, use cv::integral.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+Output image must have the same type, size, and number of channels as the input image.
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.boxfilter"
+
+@param src Source image.
+@param dtype the output image depth (-1 to set the input image data type).
+@param ksize blurring kernel size.
+@param anchor Anchor position within the kernel. The default value \f$(-1,-1)\f$ means that the anchor
+is at the kernel center.
+@param normalize flag, specifying whether the kernel is normalized by its area or not.
+@param borderType Pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of constant border type
+@sa  sepFilter, gaussianBlur, medianBlur, integral
+ */
+GAPI_EXPORTS_W GMat boxFilter(const GMat& src, int dtype, const Size& ksize, const Point& anchor = Point(-1,-1),
+                              bool normalize = true, int borderType = BORDER_DEFAULT,
+                              const Scalar& borderValue = Scalar(0));
+
+/** @brief Blurs an image using the normalized box filter.
+
+The function smooths an image using the kernel:
+
+\f[\texttt{K} =  \frac{1}{\texttt{ksize.width*ksize.height}} \begin{bmatrix} 1 & 1 & 1 &  \cdots & 1 & 1  \\ 1 & 1 & 1 &  \cdots & 1 & 1  \\ \hdotsfor{6} \\ 1 & 1 & 1 &  \cdots & 1 & 1  \\ \end{bmatrix}\f]
+
+The call `blur(src, ksize, anchor, borderType)` is equivalent to `boxFilter(src, src.type(), ksize, anchor,
+true, borderType)`.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+Output image must have the same type, size, and number of channels as the input image.
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.blur"
+
+@param src Source image.
+@param ksize blurring kernel size.
+@param anchor anchor point; default value Point(-1,-1) means that the anchor is at the kernel
+center.
+@param borderType border mode used to extrapolate pixels outside of the image, see cv::BorderTypes
+@param borderValue border value in case of constant border type
+@sa  boxFilter, bilateralFilter, GaussianBlur, medianBlur
+ */
+GAPI_EXPORTS_W GMat blur(const GMat& src, const Size& ksize, const Point& anchor = Point(-1,-1),
+                         int borderType = BORDER_DEFAULT, const Scalar& borderValue = Scalar(0));
+
+
+//GAPI_EXPORTS_W void blur( InputArray src, OutputArray dst,
+ //                       Size ksize, Point anchor = Point(-1,-1),
+ //                       int borderType = BORDER_DEFAULT );
+
+
+/** @brief Blurs an image using a Gaussian filter.
+
+The function convolves the source image with the specified Gaussian kernel.
+Output image must have the same type and number of channels as the input image.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+Output image must have the same type, size, and number of channels as the input image.
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.gaussianBlur"
+
+@param src input image;
+@param ksize Gaussian kernel size. ksize.width and ksize.height can differ but they both must be
+positive and odd. Alternatively, they can be zeros, in which case they are computed from sigma.
+@param sigmaX Gaussian kernel standard deviation in X direction.
+@param sigmaY Gaussian kernel standard deviation in Y direction; if sigmaY is zero, it is set to be
+equal to sigmaX, if both sigmas are zeros, they are computed from ksize.width and ksize.height,
+respectively (see cv::getGaussianKernel for details); to fully control the result regardless of
+possible future modifications of all this semantics, it is recommended to specify all of ksize,
+sigmaX, and sigmaY.
+@param borderType pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of constant border type
+@sa  sepFilter, boxFilter, medianBlur
+ */
+GAPI_EXPORTS_W GMat gaussianBlur(const GMat& src, const Size& ksize, double sigmaX, double sigmaY = 0,
+                                 int borderType = BORDER_DEFAULT, const Scalar& borderValue = Scalar(0));
+
+/** @brief Blurs an image using the median filter.
+
+The function smoothes an image using the median filter with the \f$\texttt{ksize} \times
+\texttt{ksize}\f$ aperture. Each channel of a multi-channel image is processed independently.
+Output image must have the same type, size, and number of channels as the input image.
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+The median filter uses cv::BORDER_REPLICATE internally to cope with border pixels, see cv::BorderTypes
+ - Function textual ID is "org.opencv.imgproc.filters.medianBlur"
+
+@param src input matrix (image)
+@param ksize aperture linear size; it must be odd and greater than 1, for example: 3, 5, 7 ...
+@sa  boxFilter, gaussianBlur
+ */
+GAPI_EXPORTS_W GMat medianBlur(const GMat& src, int ksize);
+
+/** @brief Erodes an image by using a specific structuring element.
+
+The function erodes the source image using the specified structuring element that determines the
+shape of a pixel neighborhood over which the minimum is taken:
+
+\f[\texttt{dst} (x,y) =  \min _{(x',y'):  \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f]
+
+Erosion can be applied several (iterations) times. In case of multi-channel images, each channel is processed independently.
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, and @ref CV_32FC1.
+Output image must have the same type, size, and number of channels as the input image.
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.erode"
+
+@param src input image
+@param kernel structuring element used for erosion; if `element=Mat()`, a `3 x 3` rectangular
+structuring element is used. Kernel can be created using getStructuringElement.
+@param anchor position of the anchor within the element; default value (-1, -1) means that the
+anchor is at the element center.
+@param iterations number of times erosion is applied.
+@param borderType pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of a constant border
+@sa  dilate, morphologyEx
+ */
+GAPI_EXPORTS_W GMat erode(const GMat& src, const Mat& kernel, const Point& anchor = Point(-1,-1), int iterations = 1,
+                          int borderType = BORDER_CONSTANT,
+                          const  Scalar& borderValue = morphologyDefaultBorderValue());
+
+/** @brief Erodes an image by using 3 by 3 rectangular structuring element.
+
+The function erodes the source image using the rectangular structuring element with rectangle center as an anchor.
+Erosion can be applied several (iterations) times. In case of multi-channel images, each channel is processed independently.
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, and @ref CV_32FC1.
+Output image must have the same type, size, and number of channels as the input image.
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.erode"
+
+@param src input image
+@param iterations number of times erosion is applied.
+@param borderType pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of a constant border
+@sa  erode, dilate3x3
+ */
+GAPI_EXPORTS_W GMat erode3x3(const GMat& src, int iterations = 1,
+                             int borderType = BORDER_CONSTANT,
+                             const  Scalar& borderValue = morphologyDefaultBorderValue());
+
+/** @brief Dilates an image by using a specific structuring element.
+
+The function dilates the source image using the specified structuring element that determines the
+shape of a pixel neighborhood over which the maximum is taken:
+\f[\texttt{dst} (x,y) =  \max _{(x',y'):  \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f]
+
+Dilation can be applied several (iterations) times. In case of multi-channel images, each channel is processed independently.
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, and @ref CV_32FC1.
+Output image must have the same type, size, and number of channels as the input image.
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.dilate"
+
+@param src input image.
+@param kernel structuring element used for dilation; if `element=Mat()`, a `3 x 3` rectangular
+structuring element is used. Kernel can be created using getStructuringElement
+@param anchor position of the anchor within the element; default value (-1, -1) means that the
+anchor is at the element center.
+@param iterations number of times dilation is applied.
+@param borderType pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of a constant border
+@sa  erode, morphologyEx, getStructuringElement
+ */
+GAPI_EXPORTS_W GMat dilate(const GMat& src, const Mat& kernel, const Point& anchor = Point(-1,-1), int iterations = 1,
+                           int borderType = BORDER_CONSTANT,
+                           const  Scalar& borderValue = morphologyDefaultBorderValue());
+
+/** @brief Dilates an image by using 3 by 3 rectangular structuring element.
+
+The function dilates the source image using the specified structuring element that determines the
+shape of a pixel neighborhood over which the maximum is taken:
+\f[\texttt{dst} (x,y) =  \max _{(x',y'):  \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f]
+
+Dilation can be applied several (iterations) times. In case of multi-channel images, each channel is processed independently.
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, and @ref CV_32FC1.
+Output image must have the same type, size, and number of channels as the input image.
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.dilate"
+
+@param src input image.
+@param iterations number of times dilation is applied.
+@param borderType pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of a constant border
+@sa  dilate, erode3x3
+ */
+
+GAPI_EXPORTS_W GMat dilate3x3(const GMat& src, int iterations = 1,
+                              int borderType = BORDER_CONSTANT,
+                              const  Scalar& borderValue = morphologyDefaultBorderValue());
+
+/** @brief Performs advanced morphological transformations.
+
+The function can perform advanced morphological transformations using an erosion and dilation as
+basic operations.
+
+Any of the operations can be done in-place. In case of multi-channel images, each channel is
+processed independently.
+
+@note
+ - Function textual ID is "org.opencv.imgproc.filters.morphologyEx"
+ - The number of iterations is the number of times erosion or dilatation operation will be
+applied. For instance, an opening operation (#MORPH_OPEN) with two iterations is equivalent to
+apply successively: erode -> erode -> dilate -> dilate
+(and not erode -> dilate -> erode -> dilate).
+
+@param src Input image.
+@param op Type of a morphological operation, see #MorphTypes
+@param kernel Structuring element. It can be created using #getStructuringElement.
+@param anchor Anchor position within the element. Both negative values mean that the anchor is at
+the kernel center.
+@param iterations Number of times erosion and dilation are applied.
+@param borderType Pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@param borderValue Border value in case of a constant border. The default value has a special
+meaning.
+@sa  dilate, erode, getStructuringElement
+ */
+GAPI_EXPORTS_W GMat morphologyEx(const GMat &src, const MorphTypes op, const Mat &kernel,
+                                 const Point       &anchor      = Point(-1,-1),
+                                 const int          iterations  = 1,
+                                 const BorderTypes  borderType  = BORDER_CONSTANT,
+                                 const Scalar      &borderValue = morphologyDefaultBorderValue());
+
+/** @brief Calculates the first, second, third, or mixed image derivatives using an extended Sobel operator.
+
+In all cases except one, the \f$\texttt{ksize} \times \texttt{ksize}\f$ separable kernel is used to
+calculate the derivative. When \f$\texttt{ksize = 1}\f$, the \f$3 \times 1\f$ or \f$1 \times 3\f$
+kernel is used (that is, no Gaussian smoothing is done). `ksize = 1` can only be used for the first
+or the second x- or y- derivatives.
+
+There is also the special value `ksize = FILTER_SCHARR (-1)` that corresponds to the \f$3\times3\f$ Scharr
+filter that may give more accurate results than the \f$3\times3\f$ Sobel. The Scharr aperture is
+
+\f[\vecthreethree{-3}{0}{3}{-10}{0}{10}{-3}{0}{3}\f]
+
+for the x-derivative, or transposed for the y-derivative.
+
+The function calculates an image derivative by convolving the image with the appropriate kernel:
+
+\f[\texttt{dst} =  \frac{\partial^{xorder+yorder} \texttt{src}}{\partial x^{xorder} \partial y^{yorder}}\f]
+
+The Sobel operators combine Gaussian smoothing and differentiation, so the result is more or less
+resistant to the noise. Most often, the function is called with ( xorder = 1, yorder = 0, ksize = 3)
+or ( xorder = 0, yorder = 1, ksize = 3) to calculate the first x- or y- image derivative. The first
+case corresponds to a kernel of:
+
+\f[\vecthreethree{-1}{0}{1}{-2}{0}{2}{-1}{0}{1}\f]
+
+The second case corresponds to a kernel of:
+
+\f[\vecthreethree{-1}{-2}{-1}{0}{0}{0}{1}{2}{1}\f]
+
+@note
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.sobel"
+
+@param src input image.
+@param ddepth output image depth, see @ref filter_depths "combinations"; in the case of
+    8-bit input images it will result in truncated derivatives.
+@param dx order of the derivative x.
+@param dy order of the derivative y.
+@param ksize size of the extended Sobel kernel; it must be odd.
+@param scale optional scale factor for the computed derivative values; by default, no scaling is
+applied (see cv::getDerivKernels for details).
+@param delta optional delta value that is added to the results prior to storing them in dst.
+@param borderType pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of constant border type
+@sa filter2D, gaussianBlur, cartToPolar
+ */
+GAPI_EXPORTS_W GMat Sobel(const GMat& src, int ddepth, int dx, int dy, int ksize = 3,
+                          double scale = 1, double delta = 0,
+                          int borderType = BORDER_DEFAULT,
+                          const Scalar& borderValue = Scalar(0));
+
+/** @brief Calculates the first, second, third, or mixed image derivatives using an extended Sobel operator.
+
+In all cases except one, the \f$\texttt{ksize} \times \texttt{ksize}\f$ separable kernel is used to
+calculate the derivative. When \f$\texttt{ksize = 1}\f$, the \f$3 \times 1\f$ or \f$1 \times 3\f$
+kernel is used (that is, no Gaussian smoothing is done). `ksize = 1` can only be used for the first
+or the second x- or y- derivatives.
+
+There is also the special value `ksize = FILTER_SCHARR (-1)` that corresponds to the \f$3\times3\f$ Scharr
+filter that may give more accurate results than the \f$3\times3\f$ Sobel. The Scharr aperture is
+
+\f[\vecthreethree{-3}{0}{3}{-10}{0}{10}{-3}{0}{3}\f]
+
+for the x-derivative, or transposed for the y-derivative.
+
+The function calculates an image derivative by convolving the image with the appropriate kernel:
+
+\f[\texttt{dst} =  \frac{\partial^{xorder+yorder} \texttt{src}}{\partial x^{xorder} \partial y^{yorder}}\f]
+
+The Sobel operators combine Gaussian smoothing and differentiation, so the result is more or less
+resistant to the noise. Most often, the function is called with ( xorder = 1, yorder = 0, ksize = 3)
+or ( xorder = 0, yorder = 1, ksize = 3) to calculate the first x- or y- image derivative. The first
+case corresponds to a kernel of:
+
+\f[\vecthreethree{-1}{0}{1}{-2}{0}{2}{-1}{0}{1}\f]
+
+The second case corresponds to a kernel of:
+
+\f[\vecthreethree{-1}{-2}{-1}{0}{0}{0}{1}{2}{1}\f]
+
+@note
+ - First returned matrix corresponds to the dx derivative while the second one corresponds to dy.
+ - Rounding to nearest even is performed if hardware supports it, if not - to nearest.
+ - Function textual ID is "org.opencv.imgproc.filters.sobelxy"
+
+@param src input image.
+@param ddepth output image depth, see @ref filter_depths "combinations"; in the case of
+    8-bit input images it will result in truncated derivatives.
+@param order order of the derivatives.
+@param ksize size of the extended Sobel kernel; it must be odd.
+@param scale optional scale factor for the computed derivative values; by default, no scaling is
+applied (see cv::getDerivKernels for details).
+@param delta optional delta value that is added to the results prior to storing them in dst.
+@param borderType pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of constant border type
+@sa filter2D, gaussianBlur, cartToPolar
+ */
+GAPI_EXPORTS_W std::tuple<GMat, GMat> SobelXY(const GMat& src, int ddepth, int order, int ksize = 3,
+                                              double scale = 1, double delta = 0,
+                                              int borderType = BORDER_DEFAULT,
+                                              const Scalar& borderValue = Scalar(0));
+
+/** @brief Calculates the Laplacian of an image.
+
+The function calculates the Laplacian of the source image by adding up the second x and y
+derivatives calculated using the Sobel operator:
+
+\f[\texttt{dst} =  \Delta \texttt{src} =  \frac{\partial^2 \texttt{src}}{\partial x^2} +  \frac{\partial^2 \texttt{src}}{\partial y^2}\f]
+
+This is done when `ksize > 1`. When `ksize == 1`, the Laplacian is computed by filtering the image
+with the following \f$3 \times 3\f$ aperture:
+
+\f[\vecthreethree {0}{1}{0}{1}{-4}{1}{0}{1}{0}\f]
+
+@note Function textual ID is "org.opencv.imgproc.filters.laplacian"
+
+@param src Source image.
+@param ddepth Desired depth of the destination image.
+@param ksize Aperture size used to compute the second-derivative filters. See #getDerivKernels for
+details. The size must be positive and odd.
+@param scale Optional scale factor for the computed Laplacian values. By default, no scaling is
+applied. See #getDerivKernels for details.
+@param delta Optional delta value that is added to the results prior to storing them in dst .
+@param borderType Pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@return Destination image of the same size and the same number of channels as src.
+@sa  Sobel, Scharr
+ */
+GAPI_EXPORTS_W GMat Laplacian(const GMat& src, int ddepth, int ksize = 1,
+                              double scale = 1, double delta = 0, int borderType = BORDER_DEFAULT);
+
+/** @brief Applies the bilateral filter to an image.
+
+The function applies bilateral filtering to the input image, as described in
+http://www.dai.ed.ac.uk/CVonline/LOCAL_COPIES/MANDUCHI1/Bilateral_Filtering.html
+bilateralFilter can reduce unwanted noise very well while keeping edges fairly sharp. However, it is
+very slow compared to most filters.
+
+_Sigma values_: For simplicity, you can set the 2 sigma values to be the same. If they are small (\<
+10), the filter will not have much effect, whereas if they are large (\> 150), they will have a very
+strong effect, making the image look "cartoonish".
+
+_Filter size_: Large filters (d \> 5) are very slow, so it is recommended to use d=5 for real-time
+applications, and perhaps d=9 for offline applications that need heavy noise filtering.
+
+This filter does not work inplace.
+
+@note Function textual ID is "org.opencv.imgproc.filters.bilateralfilter"
+
+@param src Source 8-bit or floating-point, 1-channel or 3-channel image.
+@param d Diameter of each pixel neighborhood that is used during filtering. If it is non-positive,
+it is computed from sigmaSpace.
+@param sigmaColor Filter sigma in the color space. A larger value of the parameter means that
+farther colors within the pixel neighborhood (see sigmaSpace) will be mixed together, resulting
+in larger areas of semi-equal color.
+@param sigmaSpace Filter sigma in the coordinate space. A larger value of the parameter means that
+farther pixels will influence each other as long as their colors are close enough (see sigmaColor
+). When d\>0, it specifies the neighborhood size regardless of sigmaSpace. Otherwise, d is
+proportional to sigmaSpace.
+@param borderType border mode used to extrapolate pixels outside of the image, see #BorderTypes
+@return Destination image of the same size and type as src.
+ */
+GAPI_EXPORTS_W GMat bilateralFilter(const GMat& src, int d, double sigmaColor, double sigmaSpace,
+                                    int borderType = BORDER_DEFAULT);
+
+//! @} gapi_filters
+
+//! @addtogroup gapi_feature
+//! @{
+/** @brief Finds edges in an image using the Canny algorithm.
+
+The function finds edges in the input image and marks them in the output map edges using the
+Canny algorithm. The smallest value between threshold1 and threshold2 is used for edge linking. The
+largest value is used to find initial segments of strong edges. See
+<http://en.wikipedia.org/wiki/Canny_edge_detector>
+
+@note Function textual ID is "org.opencv.imgproc.feature.canny"
+
+@param image 8-bit input image.
+@param threshold1 first threshold for the hysteresis procedure.
+@param threshold2 second threshold for the hysteresis procedure.
+@param apertureSize aperture size for the Sobel operator.
+@param L2gradient a flag, indicating whether a more accurate \f$L_2\f$ norm
+\f$=\sqrt{(dI/dx)^2 + (dI/dy)^2}\f$ should be used to calculate the image gradient magnitude (
+L2gradient=true ), or whether the default \f$L_1\f$ norm \f$=|dI/dx|+|dI/dy|\f$ is enough (
+L2gradient=false ).
+ */
+GAPI_EXPORTS_W GMat Canny(const GMat& image, double threshold1, double threshold2,
+                          int apertureSize = 3, bool L2gradient = false);
+
+/** @brief Determines strong corners on an image.
+
+The function finds the most prominent corners in the image or in the specified image region, as
+described in @cite Shi94
+
+-   Function calculates the corner quality measure at every source image pixel using the
+    #cornerMinEigenVal or #cornerHarris .
+-   Function performs a non-maximum suppression (the local maximums in *3 x 3* neighborhood are
+    retained).
+-   The corners with the minimal eigenvalue less than
+    \f$\texttt{qualityLevel} \cdot \max_{x,y} qualityMeasureMap(x,y)\f$ are rejected.
+-   The remaining corners are sorted by the quality measure in the descending order.
+-   Function throws away each corner for which there is a stronger corner at a distance less than
+    minDistance.
+
+The function can be used to initialize a point-based tracker of an object.
+
+@note
+ - If the function is called with different values A and B of the parameter qualityLevel , and
+A \> B, the vector of returned corners with qualityLevel=A will be the prefix of the output vector
+with qualityLevel=B .
+ - Function textual ID is "org.opencv.imgproc.feature.goodFeaturesToTrack"
+
+@param image Input 8-bit or floating-point 32-bit, single-channel image.
+@param maxCorners Maximum number of corners to return. If more corners are found than maxCorners,
+the strongest of them are returned. `maxCorners <= 0` implies that no limit on the maximum is set
+and all detected corners are returned.
+@param qualityLevel Parameter characterizing the minimal accepted quality of image corners. The
+parameter value is multiplied by the best corner quality measure, which is the minimal eigenvalue
+(see #cornerMinEigenVal ) or the Harris function response (see #cornerHarris ). The corners with the
+quality measure less than the product are rejected. For example, if the best corner has the
+quality measure = 1500, and the qualityLevel=0.01 , then all the corners with the quality measure
+less than 15 are rejected.
+@param minDistance Minimum possible Euclidean distance between the returned corners.
+@param mask Optional region of interest. If the mask is not empty (it needs to have the type
+CV_8UC1 and the same size as image ), it specifies the region in which the corners are detected.
+@param blockSize Size of an average block for computing a derivative covariation matrix over each
+pixel neighborhood. See cornerEigenValsAndVecs .
+@param useHarrisDetector Parameter indicating whether to use a Harris detector (see #cornerHarris)
+or #cornerMinEigenVal.
+@param k Free parameter of the Harris detector.
+
+@return vector of detected corners.
+ */
+GAPI_EXPORTS_W GArray<Point2f> goodFeaturesToTrack(const GMat   &image,
+                                                         int    maxCorners,
+                                                         double qualityLevel,
+                                                         double minDistance,
+                                                   const Mat    &mask = Mat(),
+                                                         int    blockSize = 3,
+                                                         bool   useHarrisDetector = false,
+                                                         double k = 0.04);
+
+//! @} gapi_feature
+
+/** @brief Equalizes the histogram of a grayscale image.
+
+The function equalizes the histogram of the input image using the following algorithm:
+
+- Calculate the histogram \f$H\f$ for src .
+- Normalize the histogram so that the sum of histogram bins is 255.
+- Compute the integral of the histogram:
+\f[H'_i =  \sum _{0  \le j < i} H(j)\f]
+- Transform the image using \f$H'\f$ as a look-up table: \f$\texttt{dst}(x,y) = H'(\texttt{src}(x,y))\f$
+
+The algorithm normalizes the brightness and increases the contrast of the image.
+@note
+ - The returned image is of the same size and type as input.
+ - Function textual ID is "org.opencv.imgproc.equalizeHist"
+
+@param src Source 8-bit single channel image.
+ */
+GAPI_EXPORTS_W GMat equalizeHist(const GMat& src);
+
+//! @addtogroup gapi_shape
+//! @{
+/** @brief Finds contours in a binary image.
+
+The function retrieves contours from the binary image using the algorithm @cite Suzuki85 .
+The contours are a useful tool for shape analysis and object detection and recognition.
+See squares.cpp in the OpenCV sample directory.
+
+@note Function textual ID is "org.opencv.imgproc.shape.findContours"
+
+@param src Input gray-scale image @ref CV_8UC1. Non-zero pixels are treated as 1's. Zero
+pixels remain 0's, so the image is treated as binary . You can use #compare, #inRange, #threshold ,
+#adaptiveThreshold, #Canny, and others to create a binary image out of a grayscale or color one.
+If mode equals to #RETR_CCOMP, the input can also be a 32-bit integer
+image of labels ( @ref CV_32SC1 ). If #RETR_FLOODFILL then @ref CV_32SC1 is supported only.
+@param mode Contour retrieval mode, see #RetrievalModes
+@param method Contour approximation method, see #ContourApproximationModes
+@param offset Optional offset by which every contour point is shifted. This is useful if the
+contours are extracted from the image ROI and then they should be analyzed in the whole image
+context.
+
+@return GArray of detected contours. Each contour is stored as a GArray of points.
+ */
+GAPI_EXPORTS GArray<GArray<Point>>
+findContours(const GMat &src, const RetrievalModes mode, const ContourApproximationModes method,
+             const GOpaque<Point> &offset);
+
+// FIXME oc: make default value offset = Point()
+/** @overload
+@note Function textual ID is "org.opencv.imgproc.shape.findContoursNoOffset"
+ */
+GAPI_EXPORTS GArray<GArray<Point>>
+findContours(const GMat &src, const RetrievalModes mode, const ContourApproximationModes method);
+
+/** @brief Finds contours and their hierarchy in a binary image.
+
+The function retrieves contours from the binary image using the algorithm @cite Suzuki85
+and calculates their hierarchy.
+The contours are a useful tool for shape analysis and object detection and recognition.
+See squares.cpp in the OpenCV sample directory.
+
+@note Function textual ID is "org.opencv.imgproc.shape.findContoursH"
+
+@param src Input gray-scale image @ref CV_8UC1. Non-zero pixels are treated as 1's. Zero
+pixels remain 0's, so the image is treated as binary . You can use #compare, #inRange, #threshold ,
+#adaptiveThreshold, #Canny, and others to create a binary image out of a grayscale or color one.
+If mode equals to #RETR_CCOMP, the input can also be a 32-bit integer
+image of labels ( @ref CV_32SC1 ). If mode equals #RETR_FLOODFILL, only @ref CV_32SC1 is supported.
+@param mode Contour retrieval mode, see #RetrievalModes
+@param method Contour approximation method, see #ContourApproximationModes
+@param offset Optional offset by which every contour point is shifted. This is useful if the
+contours are extracted from the image ROI and then they should be analyzed in the whole image
+context.
+
+@return
+ - GArray of detected contours. Each contour is stored as a GArray of points.
+ - Optional output GArray of cv::Vec4i, containing information about the image topology.
+It has as many elements as the number of contours. For each i-th contour contours[i], the elements
+hierarchy[i][0] , hierarchy[i][1] , hierarchy[i][2] , and hierarchy[i][3] are set to 0-based
+indices in contours of the next and previous contours at the same hierarchical level, the first
+child contour and the parent contour, respectively. If for the contour i there are no next,
+previous, parent, or nested contours, the corresponding elements of hierarchy[i] will be negative.
+ */
+GAPI_EXPORTS std::tuple<GArray<GArray<Point>>,GArray<Vec4i>>
+findContoursH(const GMat &src, const RetrievalModes mode, const ContourApproximationModes method,
+              const GOpaque<Point> &offset);
+
+// FIXME oc: make default value offset = Point()
+/** @overload
+@note Function textual ID is "org.opencv.imgproc.shape.findContoursHNoOffset"
+ */
+GAPI_EXPORTS std::tuple<GArray<GArray<Point>>,GArray<Vec4i>>
+findContoursH(const GMat &src, const RetrievalModes mode, const ContourApproximationModes method);
+
+/** @brief Calculates the up-right bounding rectangle of a point set or non-zero pixels
+of gray-scale image.
+
+The function calculates and returns the minimal up-right bounding rectangle for the specified
+point set or non-zero pixels of gray-scale image.
+
+@note
+ - Function textual ID is "org.opencv.imgproc.shape.boundingRectMat"
+ - In case of a 2D points' set given, Mat should be 2-dimensional, have a single row or column
+if there are 2 channels, or have 2 columns if there is a single channel. Mat should have either
+@ref CV_32S or @ref CV_32F depth
+
+@param src Input gray-scale image @ref CV_8UC1; or input set of @ref CV_32S or @ref CV_32F
+2D points stored in Mat.
+ */
+GAPI_EXPORTS_W GOpaque<Rect> boundingRect(const GMat& src);
+
+/** @overload
+
+Calculates the up-right bounding rectangle of a point set.
+
+@note Function textual ID is "org.opencv.imgproc.shape.boundingRectVector32S"
+
+@param src Input 2D point set, stored in std::vector<cv::Point2i>.
+ */
+GAPI_EXPORTS_W GOpaque<Rect> boundingRect(const GArray<Point2i>& src);
+
+/** @overload
+
+Calculates the up-right bounding rectangle of a point set.
+
+@note Function textual ID is "org.opencv.imgproc.shape.boundingRectVector32F"
+
+@param src Input 2D point set, stored in std::vector<cv::Point2f>.
+ */
+GAPI_EXPORTS_W GOpaque<Rect> boundingRect(const GArray<Point2f>& src);
+
+/** @brief Fits a line to a 2D point set.
+
+The function fits a line to a 2D point set by minimizing \f$\sum_i \rho(r_i)\f$ where
+\f$r_i\f$ is the distance between the \f$i^{th}\f$ point and the line, and \f$\rho(r)\f$ is a distance
+function, one of the following:
+-  DIST_L2
+\f[\rho (r) = r^2/2  \quad \text{(the simplest and the fastest least-squares method)}\f]
+- DIST_L1
+\f[\rho (r) = r\f]
+- DIST_L12
+\f[\rho (r) = 2  \cdot ( \sqrt{1 + \frac{r^2}{2}} - 1)\f]
+- DIST_FAIR
+\f[\rho \left (r \right ) = C^2  \cdot \left (  \frac{r}{C} -  \log{\left(1 + \frac{r}{C}\right)} \right )  \quad \text{where} \quad C=1.3998\f]
+- DIST_WELSCH
+\f[\rho \left (r \right ) =  \frac{C^2}{2} \cdot \left ( 1 -  \exp{\left(-\left(\frac{r}{C}\right)^2\right)} \right )  \quad \text{where} \quad C=2.9846\f]
+- DIST_HUBER
+\f[\rho (r) =  \fork{r^2/2}{if \(r < C\)}{C \cdot (r-C/2)}{otherwise} \quad \text{where} \quad C=1.345\f]
+
+The algorithm is based on the M-estimator ( <http://en.wikipedia.org/wiki/M-estimator> ) technique
+that iteratively fits the line using the weighted least-squares algorithm. After each iteration the
+weights \f$w_i\f$ are adjusted to be inversely proportional to \f$\rho(r_i)\f$ .
+
+@note
+ - Function textual ID is "org.opencv.imgproc.shape.fitLine2DMat"
+ - In case of an N-dimensional points' set given, Mat should be 2-dimensional, have a single row
+or column if there are N channels, or have N columns if there is a single channel.
+
+@param src Input set of 2D points stored in one of possible containers: Mat,
+std::vector<cv::Point2i>, std::vector<cv::Point2f>, std::vector<cv::Point2d>.
+@param distType Distance used by the M-estimator, see #DistanceTypes. @ref DIST_USER
+and @ref DIST_C are not supported.
+@param param Numerical parameter ( C ) for some types of distances. If it is 0, an optimal value
+is chosen.
+@param reps Sufficient accuracy for the radius (distance between the coordinate origin and the
+line). 1.0 would be a good default value for reps. If it is 0, a default value is chosen.
+@param aeps Sufficient accuracy for the angle. 0.01 would be a good default value for aeps.
+If it is 0, a default value is chosen.
+
+@return Output line parameters: a vector of 4 elements (like Vec4f) - (vx, vy, x0, y0),
+where (vx, vy) is a normalized vector collinear to the line and (x0, y0) is a point on the line.
+ */
+GAPI_EXPORTS GOpaque<Vec4f> fitLine2D(const GMat& src, const DistanceTypes distType,
+                                      const double param = 0., const double reps = 0.,
+                                      const double aeps = 0.);
+
+/** @overload
+
+@note Function textual ID is "org.opencv.imgproc.shape.fitLine2DVector32S"
+
+ */
+GAPI_EXPORTS GOpaque<Vec4f> fitLine2D(const GArray<Point2i>& src, const DistanceTypes distType,
+                                      const double param = 0., const double reps = 0.,
+                                      const double aeps = 0.);
+
+/** @overload
+
+@note Function textual ID is "org.opencv.imgproc.shape.fitLine2DVector32F"
+
+ */
+GAPI_EXPORTS GOpaque<Vec4f> fitLine2D(const GArray<Point2f>& src, const DistanceTypes distType,
+                                      const double param = 0., const double reps = 0.,
+                                      const double aeps = 0.);
+
+/** @overload
+
+@note Function textual ID is "org.opencv.imgproc.shape.fitLine2DVector64F"
+
+ */
+GAPI_EXPORTS GOpaque<Vec4f> fitLine2D(const GArray<Point2d>& src, const DistanceTypes distType,
+                                      const double param = 0., const double reps = 0.,
+                                      const double aeps = 0.);
+
+/** @brief Fits a line to a 3D point set.
+
+The function fits a line to a 3D point set by minimizing \f$\sum_i \rho(r_i)\f$ where
+\f$r_i\f$ is the distance between the \f$i^{th}\f$ point and the line, and \f$\rho(r)\f$ is a distance
+function, one of the following:
+-  DIST_L2
+\f[\rho (r) = r^2/2  \quad \text{(the simplest and the fastest least-squares method)}\f]
+- DIST_L1
+\f[\rho (r) = r\f]
+- DIST_L12
+\f[\rho (r) = 2  \cdot ( \sqrt{1 + \frac{r^2}{2}} - 1)\f]
+- DIST_FAIR
+\f[\rho \left (r \right ) = C^2  \cdot \left (  \frac{r}{C} -  \log{\left(1 + \frac{r}{C}\right)} \right )  \quad \text{where} \quad C=1.3998\f]
+- DIST_WELSCH
+\f[\rho \left (r \right ) =  \frac{C^2}{2} \cdot \left ( 1 -  \exp{\left(-\left(\frac{r}{C}\right)^2\right)} \right )  \quad \text{where} \quad C=2.9846\f]
+- DIST_HUBER
+\f[\rho (r) =  \fork{r^2/2}{if \(r < C\)}{C \cdot (r-C/2)}{otherwise} \quad \text{where} \quad C=1.345\f]
+
+The algorithm is based on the M-estimator ( <http://en.wikipedia.org/wiki/M-estimator> ) technique
+that iteratively fits the line using the weighted least-squares algorithm. After each iteration the
+weights \f$w_i\f$ are adjusted to be inversely proportional to \f$\rho(r_i)\f$ .
+
+@note
+ - Function textual ID is "org.opencv.imgproc.shape.fitLine3DMat"
+ - In case of an N-dimensional points' set given, Mat should be 2-dimensional, have a single row
+or column if there are N channels, or have N columns if there is a single channel.
+
+@param src Input set of 3D points stored in one of possible containers: Mat,
+std::vector<cv::Point3i>, std::vector<cv::Point3f>, std::vector<cv::Point3d>.
+@param distType Distance used by the M-estimator, see #DistanceTypes. @ref DIST_USER
+and @ref DIST_C are not supported.
+@param param Numerical parameter ( C ) for some types of distances. If it is 0, an optimal value
+is chosen.
+@param reps Sufficient accuracy for the radius (distance between the coordinate origin and the
+line). 1.0 would be a good default value for reps. If it is 0, a default value is chosen.
+@param aeps Sufficient accuracy for the angle. 0.01 would be a good default value for aeps.
+If it is 0, a default value is chosen.
+
+@return Output line parameters: a vector of 6 elements (like Vec6f) - (vx, vy, vz, x0, y0, z0),
+where (vx, vy, vz) is a normalized vector collinear to the line and (x0, y0, z0) is a point on
+the line.
+ */
+GAPI_EXPORTS GOpaque<Vec6f> fitLine3D(const GMat& src, const DistanceTypes distType,
+                                      const double param = 0., const double reps = 0.,
+                                      const double aeps = 0.);
+
+/** @overload
+
+@note Function textual ID is "org.opencv.imgproc.shape.fitLine3DVector32S"
+
+ */
+GAPI_EXPORTS GOpaque<Vec6f> fitLine3D(const GArray<Point3i>& src, const DistanceTypes distType,
+                                      const double param = 0., const double reps = 0.,
+                                      const double aeps = 0.);
+
+/** @overload
+
+@note Function textual ID is "org.opencv.imgproc.shape.fitLine3DVector32F"
+
+ */
+GAPI_EXPORTS GOpaque<Vec6f> fitLine3D(const GArray<Point3f>& src, const DistanceTypes distType,
+                                      const double param = 0., const double reps = 0.,
+                                      const double aeps = 0.);
+
+/** @overload
+
+@note Function textual ID is "org.opencv.imgproc.shape.fitLine3DVector64F"
+
+ */
+GAPI_EXPORTS GOpaque<Vec6f> fitLine3D(const GArray<Point3d>& src, const DistanceTypes distType,
+                                      const double param = 0., const double reps = 0.,
+                                      const double aeps = 0.);
+
+//! @} gapi_shape
+
+//! @addtogroup gapi_colorconvert
+//! @{
+/** @brief Converts an image from BGR color space to RGB color space.
+
+The function converts an input image from BGR color space to RGB.
+The conventional ranges for B, G, and R channel values are 0 to 255.
+
+Output image is 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.bgr2rgb"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa RGB2BGR
+*/
+GAPI_EXPORTS_W GMat BGR2RGB(const GMat& src);
+
+/** @brief Converts an image from RGB color space to gray-scaled.
+
+The conventional ranges for R, G, and B channel values are 0 to 255.
+Resulting gray color value computed as
+\f[\texttt{dst} (I)= \texttt{0.299} * \texttt{src}(I).R + \texttt{0.587} * \texttt{src}(I).G  + \texttt{0.114} * \texttt{src}(I).B \f]
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.rgb2gray"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa RGB2YUV
+ */
+GAPI_EXPORTS_W GMat RGB2Gray(const GMat& src);
+
+/** @overload
+Resulting gray color value computed as
+\f[\texttt{dst} (I)= \texttt{rY} * \texttt{src}(I).R + \texttt{gY} * \texttt{src}(I).G  + \texttt{bY} * \texttt{src}(I).B \f]
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.rgb2graycustom"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@param rY float multiplier for R channel.
+@param gY float multiplier for G channel.
+@param bY float multiplier for B channel.
+@sa RGB2YUV
+ */
+GAPI_EXPORTS_W GMat RGB2Gray(const GMat& src, float rY, float gY, float bY);
+
+/** @brief Converts an image from BGR color space to gray-scaled.
+
+The conventional ranges for B, G, and R channel values are 0 to 255.
+Resulting gray color value computed as
+\f[\texttt{dst} (I)= \texttt{0.114} * \texttt{src}(I).B + \texttt{0.587} * \texttt{src}(I).G  + \texttt{0.299} * \texttt{src}(I).R \f]
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.bgr2gray"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa BGR2LUV
+ */
+GAPI_EXPORTS_W GMat BGR2Gray(const GMat& src);
+
+/** @brief Converts an image from RGB color space to YUV color space.
+
+The function converts an input image from RGB color space to YUV.
+The conventional ranges for R, G, and B channel values are 0 to 255.
+
+In case of linear transformations, the range does not matter. But in case of a non-linear
+transformation, an input RGB image should be normalized to the proper value range to get the correct
+results, like here, at RGB \f$\rightarrow\f$ Y\*u\*v\* transformation.
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.rgb2yuv"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa YUV2RGB, RGB2Lab
+*/
+GAPI_EXPORTS_W GMat RGB2YUV(const GMat& src);
+
+/** @brief Converts an image from BGR color space to I420 color space.
+
+The function converts an input image from BGR color space to I420.
+The conventional ranges for R, G, and B channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 1-channel image. @ref CV_8UC1.
+Width of I420 output image must be the same as width of input image.
+Height of I420 output image must be equal 3/2 from height of input image.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.bgr2i420"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa I4202BGR
+*/
+GAPI_EXPORTS_W GMat BGR2I420(const GMat& src);
+
+/** @brief Converts an image from RGB color space to I420 color space.
+
+The function converts an input image from RGB color space to I420.
+The conventional ranges for R, G, and B channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 1-channel image. @ref CV_8UC1.
+Width of I420 output image must be the same as width of input image.
+Height of I420 output image must be equal 3/2 from height of input image.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.rgb2i420"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa I4202RGB
+*/
+GAPI_EXPORTS_W GMat RGB2I420(const GMat& src);
+
+/** @brief Converts an image from I420 color space to BGR color space.
+
+The function converts an input image from I420 color space to BGR.
+The conventional ranges for B, G, and R channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image. @ref CV_8UC3.
+Width of BGR output image must be the same as width of input image.
+Height of BGR output image must be equal 2/3 from height of input image.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.i4202bgr"
+
+@param src input image: 8-bit unsigned 1-channel image @ref CV_8UC1.
+@sa BGR2I420
+*/
+GAPI_EXPORTS_W GMat I4202BGR(const GMat& src);
+
+/** @brief Converts an image from I420 color space to RGB color space.
+
+The function converts an input image from I420 color space to RGB.
+The conventional ranges for R, G, and B channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image. @ref CV_8UC3.
+Width of RGB output image must be the same as width of input image.
+Height of RGB output image must be equal 2/3 from height of input image.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.i4202rgb"
+
+@param src input image: 8-bit unsigned 1-channel image @ref CV_8UC1.
+@sa RGB2I420
+*/
+GAPI_EXPORTS_W GMat I4202RGB(const GMat& src);
+
+/** @brief Converts an image from BGR color space to LUV color space.
+
+The function converts an input image from BGR color space to LUV.
+The conventional ranges for B, G, and R channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.bgr2luv"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa RGB2Lab, RGB2LUV
+*/
+GAPI_EXPORTS_W GMat BGR2LUV(const GMat& src);
+
+/** @brief Converts an image from LUV color space to BGR color space.
+
+The function converts an input image from LUV color space to BGR.
+The conventional ranges for B, G, and R channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.luv2bgr"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa BGR2LUV
+*/
+GAPI_EXPORTS_W GMat LUV2BGR(const GMat& src);
+
+/** @brief Converts an image from YUV color space to BGR color space.
+
+The function converts an input image from YUV color space to BGR.
+The conventional ranges for B, G, and R channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.yuv2bgr"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa BGR2YUV
+*/
+GAPI_EXPORTS_W GMat YUV2BGR(const GMat& src);
+
+/** @brief Converts an image from BGR color space to YUV color space.
+
+The function converts an input image from BGR color space to YUV.
+The conventional ranges for B, G, and R channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.bgr2yuv"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa YUV2BGR
+*/
+GAPI_EXPORTS_W GMat BGR2YUV(const GMat& src);
+
+/** @brief Converts an image from RGB color space to Lab color space.
+
+The function converts an input image from RGB color space to Lab.
+The conventional ranges for R, G, and B channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.rgb2lab"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa RGB2YUV, RGB2LUV
+*/
+GAPI_EXPORTS_W GMat RGB2Lab(const GMat& src);
+
+/** @brief Converts an image from YUV color space to RGB.
+The function converts an input image from YUV color space to RGB.
+The conventional ranges for Y, U, and V channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.yuv2rgb"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@sa RGB2Lab, RGB2YUV
+*/
+GAPI_EXPORTS_W GMat YUV2RGB(const GMat& src);
+
+/** @brief Converts an image from NV12 (YUV420p) color space to RGB.
+The function converts an input image from NV12 color space to RGB.
+The conventional ranges for Y, U, and V channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.nv12torgb"
+
+@param src_y input image: 8-bit unsigned 1-channel image @ref CV_8UC1.
+@param src_uv input image: 8-bit unsigned 2-channel image @ref CV_8UC2.
+
+@sa YUV2RGB, NV12toBGR
+*/
+GAPI_EXPORTS_W GMat NV12toRGB(const GMat& src_y, const GMat& src_uv);
+
+/** @brief Converts an image from NV12 (YUV420p) color space to gray-scaled.
+The function converts an input image from NV12 color space to gray-scaled.
+The conventional ranges for Y, U, and V channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 1-channel image @ref CV_8UC1.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.nv12togray"
+
+@param src_y input image: 8-bit unsigned 1-channel image @ref CV_8UC1.
+@param src_uv input image: 8-bit unsigned 2-channel image @ref CV_8UC2.
+
+@sa YUV2RGB, NV12toBGR
+*/
+GAPI_EXPORTS_W GMat NV12toGray(const GMat& src_y, const GMat& src_uv);
+
+/** @brief Converts an image from NV12 (YUV420p) color space to BGR.
+The function converts an input image from NV12 color space to BGR.
+The conventional ranges for Y, U, and V channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.nv12tobgr"
+
+@param src_y input image: 8-bit unsigned 1-channel image @ref CV_8UC1.
+@param src_uv input image: 8-bit unsigned 2-channel image @ref CV_8UC2.
+
+@sa YUV2BGR, NV12toRGB
+*/
+GAPI_EXPORTS_W GMat NV12toBGR(const GMat& src_y, const GMat& src_uv);
+
+/** @brief Converts an image from BayerGR color space to RGB.
+The function converts an input image from BayerGR color space to RGB.
+The conventional ranges for G, R, and B channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.bayergr2rgb"
+
+@param src_gr input image: 8-bit unsigned 1-channel image @ref CV_8UC1.
+
+@sa YUV2BGR, NV12toRGB
+*/
+GAPI_EXPORTS_W GMat BayerGR2RGB(const GMat& src_gr);
+
+/** @brief Converts an image from RGB color space to HSV.
+The function converts an input image from RGB color space to HSV.
+The conventional ranges for R, G, and B channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.rgb2hsv"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@sa YUV2BGR, NV12toRGB
+*/
+GAPI_EXPORTS_W GMat RGB2HSV(const GMat& src);
+
+/** @brief Converts an image from RGB color space to YUV422.
+The function converts an input image from RGB color space to YUV422.
+The conventional ranges for R, G, and B channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 2-channel image @ref CV_8UC2.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.rgb2yuv422"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@sa YUV2BGR, NV12toRGB
+*/
+GAPI_EXPORTS_W GMat RGB2YUV422(const GMat& src);
+
+/** @brief Converts an image from NV12 (YUV420p) color space to RGB.
+The function converts an input image from NV12 color space to RGB.
+The conventional ranges for Y, U, and V channel values are 0 to 255.
+
+Output image must be 8-bit unsigned planar 3-channel image @ref CV_8UC1.
+Planar image memory layout is three planes laying in the memory contiguously,
+so the image height should be plane_height*plane_number,
+image type is @ref CV_8UC1.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.nv12torgbp"
+
+@param src_y input image: 8-bit unsigned 1-channel image @ref CV_8UC1.
+@param src_uv input image: 8-bit unsigned 2-channel image @ref CV_8UC2.
+
+@sa YUV2RGB, NV12toBGRp, NV12toRGB
+*/
+GAPI_EXPORTS GMatP NV12toRGBp(const GMat &src_y, const GMat &src_uv);
+
+/** @brief Converts an image from NV12 (YUV420p) color space to BGR.
+The function converts an input image from NV12 color space to BGR.
+The conventional ranges for Y, U, and V channel values are 0 to 255.
+
+Output image must be 8-bit unsigned planar 3-channel image @ref CV_8UC1.
+Planar image memory layout is three planes laying in the memory contiguously,
+so the image height should be plane_height*plane_number,
+image type is @ref CV_8UC1.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.nv12tobgrp"
+
+@param src_y input image: 8-bit unsigned 1-channel image @ref CV_8UC1.
+@param src_uv input image: 8-bit unsigned 2-channel image @ref CV_8UC2.
+
+@sa YUV2RGB, NV12toRGBp, NV12toBGR
+*/
+GAPI_EXPORTS GMatP NV12toBGRp(const GMat &src_y, const GMat &src_uv);
+
+//! @} gapi_colorconvert
+//! @addtogroup gapi_transform
+//! @{
+/** @brief Resizes an image.
+
+The function resizes the image src down to or up to the specified size.
+
+Output image size will have the size dsize (when dsize is non-zero) or the size computed from
+src.size(), fx, and fy; the depth of output is the same as of src.
+
+If you want to resize src so that it fits the pre-created dst,
+you may call the function as follows:
+@code
+    // explicitly specify dsize=dst.size(); fx and fy will be computed from that.
+    resize(src, dst, dst.size(), 0, 0, interpolation);
+@endcode
+If you want to decimate the image by factor of 2 in each direction, you can call the function this
+way:
+@code
+    // specify fx and fy and let the function compute the destination image size.
+    resize(src, dst, Size(), 0.5, 0.5, interpolation);
+@endcode
+To shrink an image, it will generally look best with cv::INTER_AREA interpolation, whereas to
+enlarge an image, it will generally look best with cv::INTER_CUBIC (slow) or cv::INTER_LINEAR
+(faster but still looks OK).
+
+@note Function textual ID is "org.opencv.imgproc.transform.resize"
+
+@param src input image.
+@param dsize output image size; if it equals zero, it is computed as:
+ \f[\texttt{dsize = Size(round(fx*src.cols), round(fy*src.rows))}\f]
+ Either dsize or both fx and fy must be non-zero.
+@param fx scale factor along the horizontal axis; when it equals 0, it is computed as
+\f[\texttt{(double)dsize.width/src.cols}\f]
+@param fy scale factor along the vertical axis; when it equals 0, it is computed as
+\f[\texttt{(double)dsize.height/src.rows}\f]
+@param interpolation interpolation method, see cv::InterpolationFlags
+
+@sa  warpAffine, warpPerspective, remap, resizeP
+ */
+GAPI_EXPORTS_W GMat resize(const GMat& src, const Size& dsize, double fx = 0, double fy = 0, int interpolation = INTER_LINEAR);
+
+/** @brief Resizes a planar image.
+
+The function resizes the image src down to or up to the specified size.
+Planar image memory layout is three planes laying in the memory contiguously,
+so the image height should be plane_height*plane_number, image type is @ref CV_8UC1.
+
+Output image size will have the size dsize, the depth of output is the same as of src.
+
+@note Function textual ID is "org.opencv.imgproc.transform.resizeP"
+
+@param src input image, must be of @ref CV_8UC1 type;
+@param dsize output image size;
+@param interpolation interpolation method, only cv::INTER_LINEAR is supported at the moment
+
+@sa  warpAffine, warpPerspective, remap, resize
+ */
+GAPI_EXPORTS GMatP resizeP(const GMatP& src, const Size& dsize, int interpolation = cv::INTER_LINEAR);
+
+//! @} gapi_transform
+} //namespace gapi
+} //namespace cv
+
+#endif // OPENCV_GAPI_IMGPROC_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer.hpp
new file mode 100644
index 000000000000..abbd32ba2063
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer.hpp
@@ -0,0 +1,717 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019-2021 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_INFER_HPP
+#define OPENCV_GAPI_INFER_HPP
+
+// FIXME: Inference API is currently only available in full mode
+#if !defined(GAPI_STANDALONE)
+
+#include <functional>
+#include <string>  // string
+#include <utility> // tuple
+#include <type_traits> // is_same, false_type
+
+#include <opencv2/gapi/util/util.hpp> // all_satisfy
+#include <opencv2/gapi/util/any.hpp>  // any<>
+#include <opencv2/gapi/gkernel.hpp>   // GKernelType[M], GBackend
+#include <opencv2/gapi/garg.hpp>      // GArg
+#include <opencv2/gapi/gcommon.hpp>   // CompileArgTag
+#include <opencv2/gapi/gmetaarg.hpp>  // GMetaArg
+
+namespace cv {
+
+template<typename, typename> class GNetworkType;
+
+namespace detail {
+
+// Infer ///////////////////////////////////////////////////////////////////////
+template<typename T>
+struct accepted_infer_types {
+    static constexpr const auto value =
+            std::is_same<typename std::decay<T>::type, cv::GMat>::value
+         || std::is_same<typename std::decay<T>::type, cv::GFrame>::value;
+};
+
+template<typename... Ts>
+using valid_infer_types = all_satisfy<accepted_infer_types, Ts...>;
+
+// Infer2 //////////////////////////////////////////////////////////////////////
+
+template<typename, typename>
+struct valid_infer2_types;
+
+// Terminal case 1 (50/50 success)
+template<typename T>
+struct valid_infer2_types< std::tuple<cv::GMat>, std::tuple<T> > {
+    // By default, Nets are limited to GMat argument types only
+    // for infer2, every GMat argument may translate to either
+    // GArray<GMat> or GArray<Rect>. GArray<> part is stripped
+    // already at this point.
+    static constexpr const auto value =
+            std::is_same<typename std::decay<T>::type, cv::GMat>::value
+         || std::is_same<typename std::decay<T>::type, cv::Rect>::value;
+};
+
+// Terminal case 2 (100% failure)
+template<typename... Ts>
+struct valid_infer2_types< std::tuple<>, std::tuple<Ts...> >
+    : public std::false_type {
+};
+
+// Terminal case 3 (100% failure)
+template<typename... Ns>
+struct valid_infer2_types< std::tuple<Ns...>, std::tuple<> >
+    : public std::false_type {
+};
+
+// Recursion -- generic
+template<typename... Ns, typename T, typename...Ts>
+struct valid_infer2_types< std::tuple<cv::GMat,Ns...>, std::tuple<T,Ts...> > {
+    static constexpr const auto value =
+           valid_infer2_types< std::tuple<cv::GMat>, std::tuple<T> >::value
+        && valid_infer2_types< std::tuple<Ns...>, std::tuple<Ts...> >::value;
+};
+
+// Struct stores network input/output names.
+// Used by infer<Generic>
+struct InOutInfo
+{
+    std::vector<std::string> in_names;
+    std::vector<std::string> out_names;
+};
+
+template <typename OutT>
+class GInferOutputsTyped
+{
+public:
+    GInferOutputsTyped() = default;
+    GInferOutputsTyped(std::shared_ptr<cv::GCall> call)
+        : m_priv(std::make_shared<Priv>(std::move(call)))
+    {
+    }
+
+    OutT at(const std::string& name)
+    {
+        auto it = m_priv->blobs.find(name);
+        if (it == m_priv->blobs.end()) {
+            // FIXME: Avoid modifying GKernel
+            auto shape = cv::detail::GTypeTraits<OutT>::shape;
+            auto kind  = cv::detail::GTypeTraits<OutT>::op_kind;
+            m_priv->call->kernel().outShapes.push_back(shape);
+            m_priv->call->kernel().outCtors.emplace_back(cv::detail::GObtainCtor<OutT>::get());
+            m_priv->call->kernel().outKinds.emplace_back(kind);
+            auto out_idx = static_cast<int>(m_priv->blobs.size());
+            it = m_priv->blobs.emplace(name,
+                    cv::detail::Yield<OutT>::yield(*(m_priv->call), out_idx)).first;
+            m_priv->info->out_names.push_back(name);
+        }
+        return it->second;
+    }
+private:
+    struct Priv
+    {
+        Priv(std::shared_ptr<cv::GCall> c)
+            : call(std::move(c)), info(cv::util::any_cast<InOutInfo>(&call->params()))
+        {
+        }
+
+        std::shared_ptr<cv::GCall> call;
+        InOutInfo* info = nullptr;
+        std::unordered_map<std::string, OutT> blobs;
+    };
+
+    std::shared_ptr<Priv> m_priv;
+};
+
+template <typename... Ts>
+class GInferInputsTyped
+{
+public:
+    GInferInputsTyped()
+        : m_priv(std::make_shared<Priv>())
+    {
+    }
+
+    template <typename U>
+    GInferInputsTyped<Ts...>& setInput(const std::string& name, U in)
+    {
+        m_priv->blobs.emplace(std::piecewise_construct,
+                              std::forward_as_tuple(name),
+                              std::forward_as_tuple(in));
+        return *this;
+    }
+
+    using StorageT = cv::util::variant<Ts...>;
+    StorageT& operator[](const std::string& name) {
+        return m_priv->blobs[name];
+    }
+
+    using Map = std::unordered_map<std::string, StorageT>;
+    const Map& getBlobs() const {
+        return m_priv->blobs;
+    }
+
+private:
+    struct Priv
+    {
+        std::unordered_map<std::string, StorageT> blobs;
+    };
+
+    std::shared_ptr<Priv> m_priv;
+};
+
+template<typename InferT>
+std::shared_ptr<cv::GCall> makeCall(const std::string         &tag,
+                                    std::vector<cv::GArg>    &&args,
+                                    std::vector<std::string> &&names,
+                                    cv::GKinds               &&kinds) {
+    auto call = std::make_shared<cv::GCall>(GKernel{
+                InferT::id(),
+                tag,
+                InferT::getOutMeta,
+                {}, // outShape will be filled later
+                std::move(kinds),
+                {}, // outCtors will be filled later
+                {}, // outKinds will be filled later
+            });
+
+    call->setArgs(std::move(args));
+    call->params() = cv::detail::InOutInfo{std::move(names), {}};
+
+    return call;
+}
+
+} // namespace detail
+
+// TODO: maybe tuple_wrap_helper from util.hpp may help with this.
+// Multiple-return-value network definition (specialized base class)
+template<typename K, typename... R, typename... Args>
+class GNetworkType<K, std::function<std::tuple<R...>(Args...)> >
+{
+public:
+    using InArgs  = std::tuple<Args...>;
+    using OutArgs = std::tuple<R...>;
+
+    using Result  = OutArgs;
+    using API     = std::function<Result(Args...)>;
+
+    using ResultL = std::tuple< cv::GArray<R>... >;
+};
+
+// Single-return-value network definition (specialized base class)
+template<typename K, typename R, typename... Args>
+class GNetworkType<K, std::function<R(Args...)> >
+{
+public:
+    using InArgs  = std::tuple<Args...>;
+    using OutArgs = std::tuple<R>;
+
+    using Result  = R;
+    using API     = std::function<R(Args...)>;
+
+    using ResultL = cv::GArray<R>;
+};
+
+// InferAPI: Accepts either GMat or GFrame for every individual network input
+template<class Net, class... Ts>
+struct InferAPI {
+    using type = typename std::enable_if
+        <    detail::valid_infer_types<Ts...>::value
+          && std::tuple_size<typename Net::InArgs>::value == sizeof...(Ts)
+        , std::function<typename Net::Result(Ts...)>
+        >::type;
+};
+
+// InferAPIRoi: Accepts a rectangle and either GMat or GFrame
+template<class Net, class T>
+struct InferAPIRoi {
+    using type = typename std::enable_if
+        <    detail::valid_infer_types<T>::value
+          && std::tuple_size<typename Net::InArgs>::value == 1u
+          , std::function<typename Net::Result(cv::GOpaque<cv::Rect>, T)>
+        >::type;
+};
+
+// InferAPIList: Accepts a list of rectangles and list of GMat/GFrames;
+// crops every input.
+template<class Net, class... Ts>
+struct InferAPIList {
+    using type = typename std::enable_if
+        <    detail::valid_infer_types<Ts...>::value
+          && std::tuple_size<typename Net::InArgs>::value == sizeof...(Ts)
+        , std::function<typename Net::ResultL(cv::GArray<cv::Rect>, Ts...)>
+        >::type;
+};
+
+// InferAPIList2 is also a template to allow different calling options
+// (GArray<cv::Rect> vs GArray<cv::GMat> per input)
+template<class Net, typename T, class... Ts>
+struct InferAPIList2 {
+    using type = typename std::enable_if
+        < detail::valid_infer_types<T>::value &&
+          cv::detail::valid_infer2_types< typename Net::InArgs
+                                        , std::tuple<Ts...> >::value,
+          std::function<typename Net::ResultL(T, cv::GArray<Ts>...)>
+        >::type;
+};
+
+// Base "Infer" kernel. Note - for whatever network, kernel ID
+// is always the same. Different inference calls are distinguished by
+// network _tag_ (an extra field in GCall)
+//
+// getOutMeta is a stub callback collected by G-API kernel subsystem
+// automatically. This is a rare case when this callback is defined by
+// a particular backend, not by a network itself.
+struct GInferBase {
+    static constexpr const char * id() {
+        return "org.opencv.dnn.infer";            // Universal stub
+    }
+    static GMetaArgs getOutMeta(const GMetaArgs &, const GArgs &) {
+        return GMetaArgs{};                       // One more universal stub
+    }
+};
+
+// Base "InferROI" kernel.
+// All notes from "Infer" kernel apply here as well.
+struct GInferROIBase {
+    static constexpr const char * id() {
+        return "org.opencv.dnn.infer-roi";        // Universal stub
+    }
+    static GMetaArgs getOutMeta(const GMetaArgs &, const GArgs &) {
+        return GMetaArgs{};                       // One more universal stub
+    }
+};
+
+// Base "Infer list" kernel.
+// All notes from "Infer" kernel apply here as well.
+struct GInferListBase {
+    static constexpr const char * id() {
+        return "org.opencv.dnn.infer-roi-list-1"; // Universal stub
+    }
+    static GMetaArgs getOutMeta(const GMetaArgs &, const GArgs &) {
+        return GMetaArgs{};                       // One more universal stub
+    }
+};
+
+// Base "Infer list 2" kernel.
+// All notes from "Infer" kernel apply here as well.
+struct GInferList2Base {
+    static constexpr const char * id() {
+        return "org.opencv.dnn.infer-roi-list-2"; // Universal stub
+    }
+    static GMetaArgs getOutMeta(const GMetaArgs &, const GArgs &) {
+        return GMetaArgs{};                       // One more universal stub
+    }
+};
+
+// A generic inference kernel. API (::on()) is fully defined by the Net
+// template parameter.
+// Acts as a regular kernel in graph (via KernelTypeMedium).
+template<typename Net, typename... Args>
+struct GInfer final
+    : public GInferBase
+    , public detail::KernelTypeMedium< GInfer<Net, Args...>
+                                     , typename InferAPI<Net, Args...>::type > {
+    using GInferBase::getOutMeta; // FIXME: name lookup conflict workaround?
+
+    static constexpr const char* tag() { return Net::tag(); }
+};
+
+// A specific roi-inference kernel. API (::on()) is fixed here and
+// verified against Net.
+template<typename Net, typename T>
+struct GInferROI final
+    : public GInferROIBase
+    , public detail::KernelTypeMedium< GInferROI<Net, T>
+                                     , typename InferAPIRoi<Net, T>::type > {
+    using GInferROIBase::getOutMeta; // FIXME: name lookup conflict workaround?
+
+    static constexpr const char* tag() { return Net::tag(); }
+};
+
+
+// A generic roi-list inference kernel. API (::on()) is derived from
+// the Net template parameter (see more in infer<> overload).
+template<typename Net, typename... Args>
+struct GInferList final
+    : public GInferListBase
+    , public detail::KernelTypeMedium< GInferList<Net, Args...>
+                                     , typename InferAPIList<Net, Args...>::type > {
+    using GInferListBase::getOutMeta; // FIXME: name lookup conflict workaround?
+
+    static constexpr const char* tag() { return Net::tag(); }
+};
+
+// An even more generic roi-list inference kernel. API (::on()) is
+// derived from the Net template parameter (see more in infer<>
+// overload).
+// Takes an extra variadic template list to reflect how this network
+// was called (with Rects or GMats as array parameters)
+template<typename Net, typename T, typename... Args>
+struct GInferList2 final
+    : public GInferList2Base
+    , public detail::KernelTypeMedium< GInferList2<Net, T, Args...>
+                                     , typename InferAPIList2<Net, T, Args...>::type > {
+    using GInferList2Base::getOutMeta; // FIXME: name lookup conflict workaround?
+
+    static constexpr const char* tag() { return Net::tag(); }
+};
+
+/**
+ * @brief G-API object used to collect network inputs
+ */
+using GInferInputs = cv::detail::GInferInputsTyped<cv::GMat, cv::GFrame>;
+
+/**
+ * @brief G-API object used to collect the list of network inputs
+ */
+using GInferListInputs = cv::detail::GInferInputsTyped<cv::GArray<cv::GMat>, cv::GArray<cv::Rect>>;
+
+/**
+ * @brief G-API object used to collect network outputs
+ */
+using GInferOutputs = cv::detail::GInferOutputsTyped<cv::GMat>;
+
+/**
+ * @brief G-API object used to collect the list of network outputs
+ */
+using GInferListOutputs = cv::detail::GInferOutputsTyped<cv::GArray<cv::GMat>>;
+
+namespace detail {
+void inline unpackBlobs(const cv::GInferInputs::Map& blobs,
+                        std::vector<cv::GArg>& args,
+                        std::vector<std::string>& names,
+                        cv::GKinds& kinds)
+{
+    for (auto&& p : blobs) {
+        names.emplace_back(p.first);
+        switch (p.second.index()) {
+            case cv::GInferInputs::StorageT::index_of<cv::GMat>():
+                args.emplace_back(cv::util::get<cv::GMat>(p.second));
+                kinds.emplace_back(cv::detail::OpaqueKind::CV_MAT);
+                break;
+            case cv::GInferInputs::StorageT::index_of<cv::GFrame>():
+                args.emplace_back(cv::util::get<cv::GFrame>(p.second));
+                kinds.emplace_back(cv::detail::OpaqueKind::CV_UNKNOWN);
+                break;
+            default:
+                GAPI_Error("InternalError");
+        }
+    }
+}
+
+template <typename InferType>
+struct InferROITraits;
+
+template <>
+struct InferROITraits<GInferROIBase>
+{
+    using outType = cv::GInferOutputs;
+    using inType  = cv::GOpaque<cv::Rect>;
+};
+
+template <>
+struct InferROITraits<GInferListBase>
+{
+    using outType = cv::GInferListOutputs;
+    using inType  = cv::GArray<cv::Rect>;
+};
+
+template<typename InferType>
+typename InferROITraits<InferType>::outType
+inferGenericROI(const std::string& tag,
+         const typename InferROITraits<InferType>::inType& in,
+         const cv::GInferInputs& inputs)
+{
+    std::vector<cv::GArg> args;
+    std::vector<std::string> names;
+    cv::GKinds kinds;
+
+    args.emplace_back(in);
+    kinds.emplace_back(cv::detail::OpaqueKind::CV_RECT);
+
+    unpackBlobs(inputs.getBlobs(), args, names, kinds);
+
+    auto call = cv::detail::makeCall<InferType>(tag,
+                                                std::move(args),
+                                                std::move(names),
+                                                std::move(kinds));
+
+    return {std::move(call)};
+}
+
+} // namespace detail
+} // namespace cv
+
+// FIXME: Probably the <API> signature makes a function/tuple/function round-trip
+#define G_API_NET(Class, API, Tag)                                      \
+    struct Class final: public cv::GNetworkType<Class, std::function API> { \
+        static constexpr const char * tag() { return Tag; }             \
+    }
+
+namespace cv {
+namespace gapi {
+
+/** @brief Calculates response for the specified network (template
+ *     parameter) for the specified region in the source image.
+ *     Currently expects a single-input network only.
+ *
+ * @tparam Net A network type defined with the G_API_NET() macro.
+ * @param in input image where to take ROI from.
+ * @param roi an object describing the region of interest
+ *   in the source image. May be calculated in the same graph dynamically.
+ * @return an object of return type as defined in G_API_NET().
+ *   If a network has multiple return values (defined with a tuple), a tuple of
+ *   objects of appropriate type is returned.
+ * @sa  G_API_NET()
+ */
+template<typename Net, typename T>
+typename Net::Result infer(cv::GOpaque<cv::Rect> roi, T in) {
+    return GInferROI<Net, T>::on(roi, in);
+}
+
+/** @brief Calculates responses for the specified network (template
+ *     parameter) for every region in the source image.
+ *
+ * @tparam Net A network type defined with the G_API_NET() macro.
+ * @param roi a list of rectangles describing regions of interest
+ *   in the source image. Usually an output of object detector or tracker.
+ * @param args network's input parameters as specified in G_API_NET() macro.
+ *   NOTE: verified to work reliably with 1-input topologies only.
+ * @return a list of objects of return type as defined in G_API_NET().
+ *   If a network has multiple return values (defined with a tuple), a tuple of
+ *   GArray<> objects is returned with the appropriate types inside.
+ * @sa  G_API_NET()
+ */
+template<typename Net, typename... Args>
+typename Net::ResultL infer(cv::GArray<cv::Rect> roi, Args&&... args) {
+    return GInferList<Net, Args...>::on(roi, std::forward<Args>(args)...);
+}
+
+/** @brief Calculates responses for the specified network (template
+ *     parameter) for every region in the source image, extended version.
+ *
+ * @tparam Net A network type defined with the G_API_NET() macro.
+ * @param image A source image containing regions of interest
+ * @param args GArray<> objects of cv::Rect or cv::GMat, one per every
+ * network input:
+ * - If a cv::GArray<cv::Rect> is passed, the appropriate
+ *   regions are taken from `image` and preprocessed to this particular
+ *   network input;
+ * - If a cv::GArray<cv::GMat> is passed, the underlying data is treated
+ *   as a tensor (no automatic preprocessing happens).
+ * @return a list of objects of return type as defined in G_API_NET().
+ *   If a network has multiple return values (defined with a tuple), a tuple of
+ *   GArray<> objects is returned with the appropriate types inside.
+ * @sa  G_API_NET()
+ */
+
+template<typename Net, typename T, typename... Args>
+typename Net::ResultL infer2(T image, cv::GArray<Args>... args) {
+    // FIXME: Declared as "2" because in the current form it steals
+    // overloads from the regular infer
+    return GInferList2<Net, T, Args...>::on(image, args...);
+}
+
+/**
+ * @brief Calculates response for the specified network (template
+ *     parameter) given the input data.
+ *
+ * @tparam Net A network type defined with the G_API_NET() macro.
+ * @param args network's input parameters as specified in G_API_NET() macro.
+ * @return an object of return type as defined in G_API_NET().
+ *   If a network has multiple return values (defined with a tuple), a tuple of
+ *   objects of appropriate type is returned.
+ * @sa  G_API_NET()
+ */
+template<typename Net, typename... Args>
+typename Net::Result infer(Args&&... args) {
+    return GInfer<Net, Args...>::on(std::forward<Args>(args)...);
+}
+
+/**
+ * @brief Generic network type: input and output layers are configured dynamically at runtime
+ *
+ * Unlike the network types defined with G_API_NET macro, this one
+ * doesn't fix number of network inputs and outputs at the compilation stage
+ * thus providing user with an opportunity to program them in runtime.
+ */
+struct Generic { };
+
+/**
+ * @brief Calculates response for generic network
+ *
+ * @param tag a network tag
+ * @param inputs network's inputs
+ * @return a GInferOutputs
+ */
+template<typename T = Generic> cv::GInferOutputs
+infer(const std::string& tag, const cv::GInferInputs& inputs)
+{
+    std::vector<cv::GArg> args;
+    std::vector<std::string> names;
+    cv::GKinds kinds;
+
+    cv::detail::unpackBlobs(inputs.getBlobs(), args, names, kinds);
+
+    auto call = cv::detail::makeCall<GInferBase>(tag,
+                                                 std::move(args),
+                                                 std::move(names),
+                                                 std::move(kinds));
+
+    return cv::GInferOutputs{std::move(call)};
+}
+
+/** @brief Calculates response for the generic network
+ *     for the specified region in the source image.
+ *     Currently expects a single-input network only.
+ *
+ * @param tag a network tag
+ * @param roi an object describing the region of interest
+ *   in the source image. May be calculated in the same graph dynamically.
+ * @param inputs network's inputs
+ * @return a cv::GInferOutputs
+ */
+template<typename T = Generic> cv::GInferOutputs
+infer(const std::string& tag, const cv::GOpaque<cv::Rect>& roi, const cv::GInferInputs& inputs)
+{
+    return cv::detail::inferGenericROI<GInferROIBase>(tag, roi, inputs);
+}
+
+/** @brief Calculates responses for the specified network
+ *     for every region in the source image.
+ *
+ * @param tag a network tag
+ * @param rois a list of rectangles describing regions of interest
+ *   in the source image. Usually an output of object detector or tracker.
+ * @param inputs network's inputs
+ * @return a cv::GInferListOutputs
+ */
+template<typename T = Generic> cv::GInferListOutputs
+infer(const std::string& tag, const cv::GArray<cv::Rect>& rois, const cv::GInferInputs& inputs)
+{
+    return cv::detail::inferGenericROI<GInferListBase>(tag, rois, inputs);
+}
+
+/** @brief Calculates responses for the specified network
+ *     for every region in the source image, extended version.
+ *
+ * @param tag a network tag
+ * @param in a source image containing regions of interest.
+ * @param inputs network's inputs
+ * @return a cv::GInferListOutputs
+ */
+template<typename T = Generic, typename Input>
+typename std::enable_if<cv::detail::accepted_infer_types<Input>::value, cv::GInferListOutputs>::type
+infer2(const std::string& tag,
+       const Input& in,
+       const cv::GInferListInputs& inputs)
+{
+    std::vector<cv::GArg> args;
+    std::vector<std::string> names;
+    cv::GKinds kinds;
+
+    args.emplace_back(in);
+    auto k = cv::detail::GOpaqueTraits<Input>::kind;
+    kinds.emplace_back(k);
+
+    for (auto&& p : inputs.getBlobs()) {
+        names.emplace_back(p.first);
+        switch (p.second.index()) {
+            case cv::GInferListInputs::StorageT::index_of<cv::GArray<cv::GMat>>():
+                args.emplace_back(cv::util::get<cv::GArray<cv::GMat>>(p.second));
+                kinds.emplace_back(cv::detail::OpaqueKind::CV_MAT);
+                break;
+            case cv::GInferListInputs::StorageT::index_of<cv::GArray<cv::Rect>>():
+                args.emplace_back(cv::util::get<cv::GArray<cv::Rect>>(p.second));
+                kinds.emplace_back(cv::detail::OpaqueKind::CV_RECT);
+                break;
+            default:
+                GAPI_Error("InternalError");
+        }
+    }
+
+    auto call = cv::detail::makeCall<GInferList2Base>(tag,
+                                                      std::move(args),
+                                                      std::move(names),
+                                                      std::move(kinds));
+
+    return cv::GInferListOutputs{std::move(call)};
+}
+
+} // namespace gapi
+} // namespace cv
+
+#endif // GAPI_STANDALONE
+
+namespace cv {
+namespace gapi {
+
+// Note: the below code _is_ part of STANDALONE build,
+// just to make our compiler code compilable.
+
+// A type-erased form of network parameters.
+// Similar to how a type-erased GKernel is represented and used.
+/// @private
+struct GAPI_EXPORTS_W_SIMPLE GNetParam {
+    std::string tag;     // FIXME: const?
+    GBackend backend;    // Specifies the execution model
+    util::any params;    // Backend-interpreted parameter structure
+};
+
+/** \addtogroup gapi_compile_args
+ * @{
+ */
+/**
+ * @brief A container class for network configurations. Similar to
+ * GKernelPackage. Use cv::gapi::networks() to construct this object.
+ *
+ * @sa cv::gapi::networks
+ */
+struct GAPI_EXPORTS_W_SIMPLE GNetPackage {
+    GAPI_WRAP GNetPackage() = default;
+    GAPI_WRAP explicit GNetPackage(std::vector<GNetParam> nets);
+    explicit GNetPackage(std::initializer_list<GNetParam> ii);
+    std::vector<GBackend> backends() const;
+    std::vector<GNetParam> networks;
+};
+/** @} gapi_compile_args */
+} // namespace gapi
+
+namespace detail {
+template<typename T>
+gapi::GNetParam strip(T&& t) {
+    return gapi::GNetParam { t.tag()
+                           , t.backend()
+                           , t.params()
+                           };
+}
+
+template<> struct CompileArgTag<cv::gapi::GNetPackage> {
+    static const char* tag() { return "gapi.net_package"; }
+};
+
+} // namespace cv::detail
+
+namespace gapi {
+template<typename... Args>
+cv::gapi::GNetPackage networks(Args&&... args) {
+    return cv::gapi::GNetPackage({ cv::detail::strip(args)... });
+}
+
+inline cv::gapi::GNetPackage& operator += (      cv::gapi::GNetPackage& lhs,
+                                           const cv::gapi::GNetPackage& rhs) {
+    lhs.networks.reserve(lhs.networks.size() + rhs.networks.size());
+    lhs.networks.insert(lhs.networks.end(), rhs.networks.begin(), rhs.networks.end());
+    return lhs;
+}
+
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_INFER_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/bindings_ie.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/bindings_ie.hpp
new file mode 100644
index 000000000000..94272dea55b0
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/bindings_ie.hpp
@@ -0,0 +1,70 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2020 Intel Corporation
+
+#ifndef OPENCV_GAPI_INFER_BINDINGS_IE_HPP
+#define OPENCV_GAPI_INFER_BINDINGS_IE_HPP
+
+#include <opencv2/gapi/util/any.hpp>
+#include "opencv2/gapi/own/exports.hpp" // GAPI_EXPORTS
+#include <opencv2/gapi/gkernel.hpp>     // GKernelPackage
+#include <opencv2/gapi/infer/ie.hpp>    // Params
+
+#include <string>
+
+namespace cv {
+namespace gapi {
+namespace ie {
+
+// NB: Used by python wrapper
+// This class can be marked as SIMPLE, because it's implemented as pimpl
+class GAPI_EXPORTS_W_SIMPLE PyParams {
+public:
+    GAPI_WRAP
+    PyParams() = default;
+
+    GAPI_WRAP
+    PyParams(const std::string &tag,
+             const std::string &model,
+             const std::string &weights,
+             const std::string &device);
+
+    GAPI_WRAP
+    PyParams(const std::string &tag,
+             const std::string &model,
+             const std::string &device);
+
+    GAPI_WRAP
+    PyParams& constInput(const std::string &layer_name,
+                         const cv::Mat &data,
+                         TraitAs hint = TraitAs::TENSOR);
+
+    GAPI_WRAP
+    PyParams& cfgNumRequests(size_t nireq);
+
+    GAPI_WRAP
+    PyParams& cfgBatchSize(const size_t size);
+
+    GBackend      backend() const;
+    std::string   tag()     const;
+    cv::util::any params()  const;
+
+private:
+    std::shared_ptr<Params<cv::gapi::Generic>> m_priv;
+};
+
+GAPI_EXPORTS_W PyParams params(const std::string &tag,
+                               const std::string &model,
+                               const std::string &weights,
+                               const std::string &device);
+
+GAPI_EXPORTS_W PyParams params(const std::string &tag,
+                               const std::string &model,
+                               const std::string &device);
+} // namespace ie
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_INFER_BINDINGS_IE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/bindings_onnx.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/bindings_onnx.hpp
new file mode 100644
index 000000000000..fb2376ece881
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/bindings_onnx.hpp
@@ -0,0 +1,68 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level
+// directory of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_GAPI_INFER_BINDINGS_ONNX_HPP
+#define OPENCV_GAPI_INFER_BINDINGS_ONNX_HPP
+
+#include <opencv2/gapi/gkernel.hpp>     // GKernelPackage
+#include <opencv2/gapi/infer/onnx.hpp>  // Params
+#include "opencv2/gapi/own/exports.hpp"  // GAPI_EXPORTS
+#include <opencv2/gapi/util/any.hpp>
+
+#include <string>
+
+namespace cv {
+namespace gapi {
+namespace onnx {
+
+// NB: Used by python wrapper
+// This class can be marked as SIMPLE, because it's implemented as pimpl
+class GAPI_EXPORTS_W_SIMPLE PyParams {
+public:
+    GAPI_WRAP
+    PyParams() = default;
+
+    GAPI_WRAP
+    PyParams(const std::string& tag, const std::string& model_path);
+
+    GAPI_WRAP
+    PyParams& cfgMeanStd(const std::string &layer_name,
+                         const cv::Scalar &m,
+                         const cv::Scalar &s);
+    GAPI_WRAP
+    PyParams& cfgNormalize(const std::string &layer_name, bool flag);
+
+    GAPI_WRAP
+    PyParams& cfgAddExecutionProvider(ep::OpenVINO ep);
+
+    GAPI_WRAP
+    PyParams& cfgAddExecutionProvider(ep::DirectML ep);
+
+    GAPI_WRAP
+    PyParams& cfgAddExecutionProvider(ep::CoreML ep);
+
+    GAPI_WRAP
+    PyParams& cfgAddExecutionProvider(ep::CUDA ep);
+
+    GAPI_WRAP
+    PyParams& cfgAddExecutionProvider(ep::TensorRT ep);
+
+    GAPI_WRAP
+    PyParams& cfgDisableMemPattern();
+
+    GBackend backend() const;
+    std::string tag() const;
+    cv::util::any params() const;
+
+private:
+    std::shared_ptr<Params<cv::gapi::Generic>> m_priv;
+};
+
+GAPI_EXPORTS_W PyParams params(const std::string& tag, const std::string& model_path);
+
+}  // namespace onnx
+}  // namespace gapi
+}  // namespace cv
+
+#endif  // OPENCV_GAPI_INFER_BINDINGS_ONNX_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/bindings_ov.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/bindings_ov.hpp
new file mode 100644
index 000000000000..08f5c83a3ff4
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/bindings_ov.hpp
@@ -0,0 +1,128 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+#ifndef OPENCV_GAPI_INFER_BINDINGS_OV_HPP
+#define OPENCV_GAPI_INFER_BINDINGS_OV_HPP
+
+#include <opencv2/gapi/util/any.hpp>
+#include "opencv2/gapi/own/exports.hpp" // GAPI_EXPORTS
+#include <opencv2/gapi/gkernel.hpp>     // GKernelPackage
+#include <opencv2/gapi/infer/ov.hpp>    // Params
+
+#include <string>
+
+namespace cv {
+namespace gapi {
+namespace ov {
+
+// NB: Used by python wrapper
+// This class can be marked as SIMPLE, because it's implemented as pimpl
+class GAPI_EXPORTS_W_SIMPLE PyParams {
+public:
+    GAPI_WRAP
+    PyParams() = default;
+
+    GAPI_WRAP
+    PyParams(const std::string &tag,
+             const std::string &model_path,
+             const std::string &bin_path,
+             const std::string &device);
+
+    GAPI_WRAP
+    PyParams(const std::string &tag,
+             const std::string &blob_path,
+             const std::string &device);
+
+    GAPI_WRAP
+    PyParams& cfgPluginConfig(
+            const std::map<std::string, std::string> &config);
+
+    GAPI_WRAP
+    PyParams& cfgInputTensorLayout(std::string tensor_layout);
+
+    GAPI_WRAP
+    PyParams& cfgInputTensorLayout(
+            std::map<std::string, std::string> layout_map);
+
+    GAPI_WRAP
+    PyParams& cfgInputModelLayout(std::string tensor_layout);
+
+    GAPI_WRAP
+    PyParams& cfgInputModelLayout(
+            std::map<std::string, std::string> layout_map);
+
+    GAPI_WRAP
+    PyParams& cfgOutputTensorLayout(std::string tensor_layout);
+
+    GAPI_WRAP
+    PyParams& cfgOutputTensorLayout(
+            std::map<std::string, std::string> layout_map);
+
+    GAPI_WRAP
+    PyParams& cfgOutputModelLayout(std::string tensor_layout);
+
+    GAPI_WRAP
+    PyParams& cfgOutputModelLayout(
+            std::map<std::string, std::string> layout_map);
+
+    GAPI_WRAP
+    PyParams& cfgOutputTensorPrecision(int precision);
+
+    GAPI_WRAP
+    PyParams& cfgOutputTensorPrecision(
+            std::map<std::string, int> precision_map);
+
+    GAPI_WRAP
+    PyParams& cfgReshape(std::vector<size_t> new_shape);
+
+    GAPI_WRAP
+    PyParams& cfgReshape(
+            std::map<std::string, std::vector<size_t>> new_shape_map);
+
+    GAPI_WRAP
+    PyParams& cfgNumRequests(const size_t nireq);
+
+    GAPI_WRAP
+    PyParams& cfgMean(std::vector<float> mean_values);
+
+    GAPI_WRAP
+    PyParams& cfgMean(
+            std::map<std::string, std::vector<float>> mean_map);
+
+    GAPI_WRAP
+    PyParams& cfgScale(std::vector<float> scale_values);
+
+    GAPI_WRAP
+    PyParams& cfgScale(
+            std::map<std::string, std::vector<float>> scale_map);
+
+    GAPI_WRAP
+    PyParams& cfgResize(int interpolation);
+
+    GAPI_WRAP
+    PyParams& cfgResize(std::map<std::string, int> interpolation);
+
+    GBackend      backend() const;
+    std::string   tag()     const;
+    cv::util::any params()  const;
+
+private:
+    std::shared_ptr<Params<cv::gapi::Generic>> m_priv;
+};
+
+GAPI_EXPORTS_W PyParams params(const std::string &tag,
+                               const std::string &model_path,
+                               const std::string &weights,
+                               const std::string &device);
+
+GAPI_EXPORTS_W PyParams params(const std::string &tag,
+                               const std::string &bin_path,
+                               const std::string &device);
+} // namespace ov
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_INFER_BINDINGS_OV_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/ie.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/ie.hpp
new file mode 100644
index 000000000000..9f9518d0b8e0
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/ie.hpp
@@ -0,0 +1,711 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019-2023 Intel Corporation
+
+#ifndef OPENCV_GAPI_INFER_IE_HPP
+#define OPENCV_GAPI_INFER_IE_HPP
+
+#include <unordered_map>
+#include <unordered_set>
+#include <string>
+#include <array>
+#include <tuple> // tuple, tuple_size
+#include <map>
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/util/any.hpp>
+
+#include <opencv2/core/cvdef.h>     // GAPI_EXPORTS
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+#include <opencv2/gapi/infer.hpp>   // Generic
+#include <opencv2/gapi/streaming/onevpl/accel_types.hpp> // Preproc Dev & Ctx
+
+namespace cv {
+namespace gapi {
+// FIXME: introduce a new sub-namespace for NN?
+
+/**
+ * @brief This namespace contains G-API OpenVINO backend functions,
+ * structures, and symbols.
+ */
+namespace ie {
+
+GAPI_EXPORTS cv::gapi::GBackend backend();
+
+/**
+ * Specifies how G-API and IE should treat input data
+ *
+ * In OpenCV, the same cv::Mat is used to represent both
+ * image and tensor data. Sometimes those are hardly distinguishable,
+ * so this extra parameter is used to give G-API a hint.
+ *
+ * This hint controls how G-API reinterprets the data when converting
+ * it to IE Blob format (and which layout/etc is assigned to this data).
+ */
+enum class TraitAs: int
+{
+    TENSOR, //!< G-API treats an associated cv::Mat as a raw tensor and passes dimensions as-is
+    IMAGE   //!< G-API treats an associated cv::Mat as an image so creates an "image" blob (NCHW/NHWC, etc)
+};
+
+using IEConfig = std::map<std::string, std::string>;
+
+enum InferMode {Sync, Async};
+
+namespace detail {
+
+template <typename T>
+using AttrMap = std::map<std::string, T>;
+// NB: This type is used to hold in/out layers
+// attributes such as precision, layout, shape etc.
+//
+// User can provide attributes either:
+// 1. cv::util::monostate - No value specified explicitly.
+// 2. Attr - value specified explicitly that should be broadcasted to all layers.
+// 3. AttrMap[str->T] - map specifies value for particular layer.
+template <typename Attr>
+using LayerVariantAttr = cv::util::variant< cv::util::monostate
+                                          , AttrMap<Attr>
+                                          , Attr>;
+
+struct ParamDesc { // NB: ie::Params<> brace-initializes this aggregate positionally - keep the member order in sync with its constructors
+    std::string model_path;   // topology IR (.xml) for Kind::Load, pre-compiled blob for Kind::Import
+    std::string weights_path; // weights (.bin); left empty by the Kind::Import constructors
+    std::string device_id;    // target device to use
+
+    std::vector<std::string> input_names;
+    std::vector<std::string> output_names;
+
+    using ConstInput = std::pair<cv::Mat, TraitAs>;
+    std::unordered_map<std::string, ConstInput> const_inputs; // layer name -> (data, hint), see Params::constInput()
+
+    std::size_t num_in;  // tuple_size of Net::InArgs; 0 for generic params
+    std::size_t num_out; // tuple_size of Net::OutArgs; 0 for generic params
+
+    enum class Kind {Load, Import};
+    Kind kind;
+    bool is_generic; // true only when built by Params<cv::gapi::Generic>
+    IEConfig config;
+
+    std::map<std::string, std::vector<std::size_t>> reshape_table;
+    std::unordered_set<std::string> layer_names_to_reshape;
+
+    // NB: Number of asynchronous infer requests (Params constructors set it to 1)
+    size_t nireq;
+
+    // NB: An optional config to setup RemoteContext for IE
+    cv::util::any context_config;
+
+    // NB: batch_size can't be equal to 1 by default, because some models
+    // have 2D (Layout::NC) input and if the first dimension is not equal to 1
+    // net.setBatchSize(1) will overwrite it.
+    cv::optional<size_t> batch_size;
+
+    cv::optional<cv::gapi::wip::onevpl::Device> vpl_preproc_device;
+    cv::optional<cv::gapi::wip::onevpl::Context> vpl_preproc_ctx;
+
+    InferMode mode; // Params constructors set InferMode::Async
+
+    using PrecisionT = int;
+    using PrecisionMapT = std::unordered_map<std::string, PrecisionT>;
+    // NB: This parameter can contain:
+    // 1. cv::util::monostate - Don't specify precision, but use default from IR/Blob.
+    // 2. PrecisionT (CV_8U, CV_32F, ...) - Specifies precision for all output layers.
+    // 3. PrecisionMapT ({{"layer0", CV_32F}, {"layer1", CV_16F}}) - Specifies precision for certain output layers.
+    // cv::util::monostate is the default value that means precision wasn't specified.
+    using PrecisionVariantT = cv::util::variant<cv::util::monostate,
+                                                PrecisionT,
+                                                PrecisionMapT>;
+
+    PrecisionVariantT output_precision;
+    LayerVariantAttr<std::string> input_layout;
+    LayerVariantAttr<std::string> output_layout;
+    LayerVariantAttr<int>         interpolation;
+};
+} // namespace detail
+
+// FIXME: this is probably a shared (reusable) thing
+template<typename Net>
+struct PortCfg {
+    using In = std::array
+        < std::string
+        , std::tuple_size<typename Net::InArgs>::value >;
+    using Out = std::array
+        < std::string
+        , std::tuple_size<typename Net::OutArgs>::value >;
+};
+
+/**
+ * @brief This structure provides functions
+ * that fill inference parameters for "OpenVINO Toolkit" model.
+ */
+template<typename Net> class Params {
+public:
+    /** @brief Class constructor.
+
+    Constructs Params based on model information and specifies default values for other
+    inference description parameters. Model is loaded and compiled using "OpenVINO Toolkit".
+
+    @param model Path to topology IR (.xml file).
+    @param weights Path to weights (.bin file).
+    @param device target device to use.
+    */
+    Params(const std::string &model,
+           const std::string &weights,
+           const std::string &device)
+        : desc{ model, weights, device, {}, {}, {}
+              , std::tuple_size<typename Net::InArgs>::value  // num_in
+              , std::tuple_size<typename Net::OutArgs>::value // num_out
+              , detail::ParamDesc::Kind::Load
+              , false
+              , {}
+              , {}
+              , {}
+              , 1u
+              , {}
+              , {}
+              , {}
+              , {}
+              , InferMode::Async
+              , {}
+              , {}
+              , {}
+              , {} } {
+    }
+
+    /** @overload
+    Use this constructor to work with pre-compiled network.
+    Model is imported from a pre-compiled blob.
+
+    @param model Path to model.
+    @param device target device to use.
+    */
+    Params(const std::string &model,
+           const std::string &device)
+        : desc{ model, {}, device, {}, {}, {}
+              , std::tuple_size<typename Net::InArgs>::value  // num_in
+              , std::tuple_size<typename Net::OutArgs>::value // num_out
+              , detail::ParamDesc::Kind::Import
+              , false
+              , {}
+              , {}
+              , {}
+              , 1u
+              , {}
+              , {}
+              , {}
+              , {}
+              , InferMode::Async
+              , {}
+              , {}
+              , {}
+              , {} } {
+    }
+
+    /** @brief Specifies sequence of network input layers names for inference.
+
+    The function is used to associate cv::gapi::infer<> inputs with the model inputs.
+    Number of names has to match the number of network inputs as defined in G_API_NET().
+    In case a network has only single input layer, there is no need to specify name manually.
+
+    @param layer_names std::array<std::string, N> where N is the number of inputs
+    as defined in the @ref G_API_NET. Contains names of input layers.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgInputLayers(const typename PortCfg<Net>::In &layer_names) {
+        desc.input_names.clear();
+        desc.input_names.reserve(layer_names.size());
+        std::copy(layer_names.begin(), layer_names.end(),
+                  std::back_inserter(desc.input_names));
+        return *this;
+    }
+
+    /** @brief Specifies sequence of network output layers names for inference.
+
+    The function is used to associate cv::gapi::infer<> outputs with the model outputs.
+    Number of names has to match the number of network outputs as defined in G_API_NET().
+    In case a network has only single output layer, there is no need to specify name manually.
+
+    @param layer_names std::array<std::string, N> where N is the number of outputs
+    as defined in the @ref G_API_NET. Contains names of output layers.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgOutputLayers(const typename PortCfg<Net>::Out &layer_names) {
+        desc.output_names.clear();
+        desc.output_names.reserve(layer_names.size());
+        std::copy(layer_names.begin(), layer_names.end(),
+                  std::back_inserter(desc.output_names));
+        return *this;
+    }
+
+    /** @brief Specifies a constant input.
+
+    The function is used to set a constant input. This input has to be
+    a preprocessed tensor if its type is TENSOR. Need to provide name of the
+    network layer which will receive provided data.
+
+    @param layer_name Name of network layer.
+    @param data cv::Mat that contains data which will be associated with network layer.
+    @param hint Input type @sa cv::gapi::ie::TraitAs.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& constInput(const std::string &layer_name,
+                            const cv::Mat &data,
+                            TraitAs hint = TraitAs::TENSOR) {
+        desc.const_inputs[layer_name] = {data, hint};
+        return *this;
+    }
+
+    /** @brief Specifies OpenVINO plugin configuration.
+
+    The function is used to set configuration for OpenVINO plugin. Some parameters
+    can be different for each plugin. Please follow https://docs.openvinotoolkit.org/latest/index.html
+    to check information about specific plugin.
+
+    @param cfg Map of pairs: (config parameter name, config parameter value).
+    @return reference to this parameter structure.
+    */
+    Params& pluginConfig(const IEConfig& cfg) {
+        desc.config = cfg;
+        return *this;
+    }
+
+    /** @overload
+    Function with a rvalue parameter.
+
+    @param cfg rvalue map of pairs: (config parameter name, config parameter value).
+    @return reference to this parameter structure.
+    */
+    Params& pluginConfig(IEConfig&& cfg) {
+        desc.config = std::move(cfg);
+        return *this;
+    }
+
+    /** @brief Specifies configuration for RemoteContext in InferenceEngine.
+
+    When RemoteContext is configured the backend imports the networks using the context.
+    It also expects cv::MediaFrames to be actually remote, to operate with blobs via the context.
+
+    @param ctx_cfg cv::util::any value which holds InferenceEngine::ParamMap.
+    @return reference to this parameter structure.
+    */
+    Params& cfgContextParams(const cv::util::any& ctx_cfg) {
+        desc.context_config = ctx_cfg;
+        return *this;
+    }
+
+    /** @overload
+    Function with an rvalue parameter.
+
+    @param ctx_cfg cv::util::any value which holds InferenceEngine::ParamMap.
+    @return reference to this parameter structure.
+    */
+    Params& cfgContextParams(cv::util::any&& ctx_cfg) {
+        desc.context_config = std::move(ctx_cfg);
+        return *this;
+    }
+
+    /** @brief Specifies number of asynchronous inference requests.
+
+    @param nireq Number of inference asynchronous requests.
+    @return reference to this parameter structure.
+    */
+    Params& cfgNumRequests(size_t nireq) {
+        GAPI_Assert(nireq > 0 && "Number of infer requests must be greater than zero!");
+        desc.nireq = nireq;
+        return *this;
+    }
+
+    /** @brief Specifies new input shapes for the network inputs.
+
+    The function is used to specify new input shapes for the network inputs.
+    Follow https://docs.openvinotoolkit.org/latest/classInferenceEngine_1_1networkNetwork.html
+    for additional information.
+
+    @param reshape_table Map of pairs: name of corresponding data and its dimension.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgInputReshape(const std::map<std::string, std::vector<std::size_t>>& reshape_table) {
+        desc.reshape_table = reshape_table;
+        return *this;
+    }
+
+    /** @overload */
+    Params<Net>& cfgInputReshape(std::map<std::string, std::vector<std::size_t>>&& reshape_table) {
+        desc.reshape_table = std::move(reshape_table);
+        return *this;
+    }
+
+    /** @overload
+
+    @param layer_name Name of layer.
+    @param layer_dims New dimensions for this layer.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgInputReshape(const std::string& layer_name, const std::vector<size_t>& layer_dims) {
+        desc.reshape_table.emplace(layer_name, layer_dims);
+        return *this;
+    }
+
+    /** @overload Rvalue version: layer_name and layer_dims are moved (not copied) into the reshape table. */
+    Params<Net>& cfgInputReshape(std::string&& layer_name, std::vector<size_t>&& layer_dims) {
+        desc.reshape_table.emplace(std::move(layer_name), std::move(layer_dims)); // named rvalue refs are lvalues: std::move is required; an existing entry is kept (std::map::emplace)
+        return *this;
+    }
+
+    /** @overload
+
+    @param layer_names set of names of network layers that will be used for network reshape.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgInputReshape(const std::unordered_set<std::string>& layer_names) {
+        desc.layer_names_to_reshape = layer_names;
+        return *this;
+    }
+
+    /** @overload
+
+    @param layer_names rvalue set of names of the selected layers that will be reshaped
+    automatically to the input image size.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgInputReshape(std::unordered_set<std::string>&& layer_names) {
+        desc.layer_names_to_reshape = std::move(layer_names);
+        return *this;
+    }
+
+    /** @brief Specifies the inference batch size.
+
+    The function is used to specify inference batch size.
+    Follow https://docs.openvinotoolkit.org/latest/classInferenceEngine_1_1CNNNetwork.html#a8e9d19270a48aab50cb5b1c43eecb8e9 for additional information
+
+    @param size batch size which will be used.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgBatchSize(const size_t size) {
+        desc.batch_size = cv::util::make_optional(size);
+        return *this;
+    }
+
+    Params<Net>& cfgPreprocessingParams(const cv::gapi::wip::onevpl::Device &device,
+                                        const cv::gapi::wip::onevpl::Context &ctx) {
+        desc.vpl_preproc_device = cv::util::make_optional(device);
+        desc.vpl_preproc_ctx = cv::util::make_optional(ctx);
+        return *this;
+    }
+
+    /** @brief Specifies which api will be used to run inference.
+
+    The function is used to specify mode for OpenVINO inference.
+    OpenVINO has two options to run inference:
+    1. Asynchronous (using StartAsync: https://docs.openvino.ai/latest/classInferenceEngine_1_1InferRequest.html#doxid-class-inference-engine-1-1-infer-request-1a405293e8423d82a5b45f642a3bef0d24)
+    2. Synchronous (using Infer: https://docs.openvino.ai/latest/classInferenceEngine_1_1InferRequest.html#doxid-class-inference-engine-1-1-infer-request-1a3391ce30894abde730523e9ca9371ce8)
+    By default asynchronous mode is used.
+
+    @param mode Inference mode which will be used.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgInferMode(InferMode mode) {
+        desc.mode = mode;
+        return *this;
+    }
+
+    /** @brief Specifies the output precision for model.
+
+    The function is used to set an output precision for model.
+
+    @param precision Precision in OpenCV format (CV_8U, CV_32F, ...)
+    will be applied to all output layers.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgOutputPrecision(detail::ParamDesc::PrecisionT precision) {
+        desc.output_precision = precision;
+        return *this;
+    }
+
+    /** @overload
+
+    @param precision_map Map of pairs: name of corresponding output layer
+    and its precision in OpenCV format (CV_8U, CV_32F, ...). Taken by value and moved into the description.
+    @return reference to this parameter structure.
+    */
+    Params<Net>&
+    cfgOutputPrecision(detail::ParamDesc::PrecisionMapT precision_map) {
+        desc.output_precision = std::move(precision_map); // sink parameter: move instead of a second copy, like cfgInputLayout()
+        return *this;
+    }
+
+    /** @brief Specifies the input layout for model.
+
+    The function is used to set an input layout for model.
+
+    @param layout Layout in string representation ("NCHW", "NHWC", etc)
+    will be applied to all input layers.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgInputLayout(std::string layout) {
+        desc.input_layout = std::move(layout);
+        return *this;
+    }
+
+    /** @overload
+
+    @param layout_map Map of pairs: name of corresponding input layer
+    and its layout in string representation ("NCHW", "NHWC", etc)
+    @return reference to this parameter structure.
+    */
+    Params<Net>&
+    cfgInputLayout(detail::AttrMap<std::string> layout_map) {
+        desc.input_layout = std::move(layout_map);
+        return *this;
+    }
+
+    /** @brief Specifies the output layout for model.
+
+    The function is used to set an output layout for model.
+
+    @param layout Layout in string representation ("NCHW", "NHWC", etc)
+    will be applied to all output layers.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgOutputLayout(std::string layout) {
+        desc.output_layout = std::move(layout);
+        return *this;
+    }
+
+    /** @overload
+
+    @param layout_map Map of pairs: name of corresponding output layer
+    and its layout in string representation ("NCHW", "NHWC", etc)
+    @return reference to this parameter structure.
+    */
+    Params<Net>&
+    cfgOutputLayout(detail::AttrMap<std::string> layout_map) {
+        desc.output_layout = std::move(layout_map);
+        return *this;
+    }
+
+    /** @brief Specifies resize interpolation algorithm.
+     *
+    The function is used to configure resize preprocessing for input layer.
+
+    @param interpolation Resize interpolation algorithm.
+    Supported algorithms: #INTER_LINEAR, #INTER_AREA.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgResize(int interpolation) {
+        desc.interpolation = interpolation;
+        return *this;
+    }
+
+    /** @overload
+
+    @param interpolation Map of pairs: name of corresponding input layer
+    and its resize algorithm.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgResize(detail::AttrMap<int> interpolation) {
+        desc.interpolation = std::move(interpolation);
+        return *this;
+    }
+
+    // BEGIN(G-API's network parametrization API)
+    GBackend      backend()    const { return cv::gapi::ie::backend();  }
+    std::string   tag()        const { return Net::tag(); }
+    cv::util::any params()     const { return { desc }; }
+    // END(G-API's network parametrization API)
+
+protected:
+    detail::ParamDesc desc;
+};
+
+/*
+* @brief This structure provides functions for generic network type that
+* fill inference parameters.
+* @see struct Generic
+*/
+template<>
+class Params<cv::gapi::Generic> {
+public:
+    /** @brief Class constructor.
+
+    Constructs Params based on model information and sets default values for other
+    inference description parameters. Model is loaded and compiled using OpenVINO Toolkit.
+
+    @param tag string tag of the network for which these parameters are intended.
+    @param model path to topology IR (.xml file).
+    @param weights path to weights (.bin file).
+    @param device target device to use.
+    */
+    Params(const std::string &tag,
+           const std::string &model,
+           const std::string &weights,
+           const std::string &device)
+        : desc{ model, weights, device, {}, {}, {}, 0u, 0u,
+                detail::ParamDesc::Kind::Load, true, {}, {}, {}, 1u,
+                {}, {}, {}, {}, InferMode::Async, {}, {}, {}, {} },
+          m_tag(tag) {
+    }
+
+    /** @overload
+
+    This constructor for pre-compiled networks. Model is imported from pre-compiled
+    blob.
+
+    @param tag string tag of the network for which these parameters are intended.
+    @param model path to model.
+    @param device target device to use.
+    */
+    Params(const std::string &tag,
+           const std::string &model,
+           const std::string &device)
+        : desc{ model, {}, device, {}, {}, {}, 0u, 0u,
+                detail::ParamDesc::Kind::Import, true, {}, {}, {}, 1u,
+                {}, {}, {}, {}, InferMode::Async, {}, {}, {}, {} },
+          m_tag(tag) {
+    }
+
+    /** @see ie::Params::pluginConfig. */
+    Params& pluginConfig(const IEConfig& cfg) {
+        desc.config = cfg;
+        return *this;
+    }
+
+    /** @overload */
+    Params& pluginConfig(IEConfig&& cfg) {
+        desc.config = std::move(cfg);
+        return *this;
+    }
+
+    /** @see ie::Params::constInput. */
+    Params& constInput(const std::string &layer_name,
+                       const cv::Mat &data,
+                       TraitAs hint = TraitAs::TENSOR) {
+        desc.const_inputs[layer_name] = {data, hint};
+        return *this;
+    }
+
+    /** @see ie::Params::cfgNumRequests. */
+    Params& cfgNumRequests(size_t nireq) {
+        GAPI_Assert(nireq > 0 && "Number of infer requests must be greater than zero!");
+        desc.nireq = nireq;
+        return *this;
+    }
+
+    /** @see ie::Params::cfgInputReshape */
+    Params& cfgInputReshape(const std::map<std::string, std::vector<std::size_t>>&reshape_table) {
+        desc.reshape_table = reshape_table;
+        return *this;
+    }
+
+    /** @overload */
+    Params& cfgInputReshape(std::map<std::string, std::vector<std::size_t>> && reshape_table) {
+        desc.reshape_table = std::move(reshape_table);
+        return *this;
+    }
+
+    /** @overload Rvalue version: layer_name and layer_dims are moved (not copied) into the reshape table. */
+    Params& cfgInputReshape(std::string && layer_name, std::vector<size_t> && layer_dims) {
+        desc.reshape_table.emplace(std::move(layer_name), std::move(layer_dims)); // named rvalue refs are lvalues: std::move is required; an existing entry is kept (std::map::emplace)
+        return *this;
+    }
+
+    /** @overload */
+    Params& cfgInputReshape(const std::string & layer_name, const std::vector<size_t>&layer_dims) {
+        desc.reshape_table.emplace(layer_name, layer_dims);
+        return *this;
+    }
+
+    /** @overload */
+    Params& cfgInputReshape(std::unordered_set<std::string> && layer_names) {
+        desc.layer_names_to_reshape = std::move(layer_names);
+        return *this;
+    }
+
+    /** @overload */
+    Params& cfgInputReshape(const std::unordered_set<std::string>&layer_names) {
+        desc.layer_names_to_reshape = layer_names;
+        return *this;
+    }
+
+    /** @see ie::Params::cfgBatchSize */
+    Params& cfgBatchSize(const size_t size) {
+        desc.batch_size = cv::util::make_optional(size);
+        return *this;
+    }
+
+    /** @see ie::Params::cfgInferMode */
+    Params& cfgInferMode(InferMode mode) {
+        desc.mode = mode;
+        return *this;
+    }
+
+    /** @see ie::Params::cfgOutputPrecision */
+    Params& cfgOutputPrecision(detail::ParamDesc::PrecisionT precision) {
+        desc.output_precision = precision;
+        return *this;
+    }
+
+    /** @overload Takes a per-layer precision map by value and moves it into the description. */
+    Params&
+    cfgOutputPrecision(detail::ParamDesc::PrecisionMapT precision_map) {
+        desc.output_precision = std::move(precision_map); // sink parameter: move instead of a second copy, like cfgInputLayout()
+        return *this;
+    }
+
+    /** @see ie::Params::cfgInputLayout */
+    Params& cfgInputLayout(std::string layout) {
+        desc.input_layout = std::move(layout);
+        return *this;
+    }
+
+    /** @overload */
+    Params&
+    cfgInputLayout(detail::AttrMap<std::string> layout_map) {
+        desc.input_layout = std::move(layout_map);
+        return *this;
+    }
+
+    /** @see ie::Params::cfgOutputLayout */
+    Params& cfgOutputLayout(std::string layout) {
+        desc.output_layout = std::move(layout);
+        return *this;
+    }
+
+    /** @overload */
+    Params&
+    cfgOutputLayout(detail::AttrMap<std::string> layout_map) {
+        desc.output_layout = std::move(layout_map);
+        return *this;
+    }
+
+    /** @see ie::Params::cfgResize */
+    Params& cfgResize(int interpolation) {
+        desc.interpolation = interpolation;
+        return *this;
+    }
+
+    /** @overload */
+    Params& cfgResize(detail::AttrMap<int> interpolation) {
+        desc.interpolation = std::move(interpolation);
+        return *this;
+    }
+
+    // BEGIN(G-API's network parametrization API)
+    GBackend      backend()    const { return cv::gapi::ie::backend();  }
+    std::string   tag()        const { return m_tag; }
+    cv::util::any params()     const { return { desc }; }
+    // END(G-API's network parametrization API)
+
+protected:
+    detail::ParamDesc desc;
+    std::string m_tag;
+};
+
+} // namespace ie
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_INFER_IE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/onnx.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/onnx.hpp
new file mode 100644
index 000000000000..f985b41d71bf
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/onnx.hpp
@@ -0,0 +1,722 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2020-2021 Intel Corporation
+
+#ifndef OPENCV_GAPI_INFER_ONNX_HPP
+#define OPENCV_GAPI_INFER_ONNX_HPP
+
+#include <unordered_map>
+#include <string>
+#include <array>
+#include <tuple> // tuple, tuple_size
+#include <map>
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/util/any.hpp>
+
+#include <opencv2/core/cvdef.h>     // GAPI_EXPORTS
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+#include <opencv2/gapi/infer.hpp>   // Generic
+
+namespace cv {
+namespace gapi {
+
+/**
+ * @brief This namespace contains G-API ONNX Runtime backend functions, structures, and symbols.
+ */
+namespace onnx {
+
+/**
+ * @brief This namespace contains Execution Providers structures for G-API ONNX Runtime backend.
+ */
+namespace ep {
+
+/**
+ * @brief This structure provides functions
+ * that fill inference options for ONNX CoreML Execution Provider.
+ * Please follow https://onnxruntime.ai/docs/execution-providers/CoreML-ExecutionProvider.html#coreml-execution-provider
+ */
+struct GAPI_EXPORTS_W_SIMPLE CoreML {
+    /** @brief Default constructor.
+
+    Creates CoreML parameters with every flag switched off.
+
+    */
+    GAPI_WRAP
+    CoreML() = default;
+
+    /** @brief Restricts the CoreML Execution Provider to the CPU.
+
+    Sets the flag that limits CoreML to run on CPU only.
+    Please follow: https://onnxruntime.ai/docs/execution-providers/CoreML-ExecutionProvider.html#coreml_flag_use_cpu_only
+
+    @return reference to this parameter structure.
+    */
+    GAPI_WRAP
+    CoreML& cfgUseCPUOnly() {
+        this->use_cpu_only = true;
+        return *this;
+    }
+
+    /** @brief Lets the CoreML EP run on a subgraph in the body of a control flow ONNX operator (i.e. a Loop, Scan or If operator).
+
+    Sets the flag that enables the CoreML EP on
+    a subgraph of a control flow ONNX operation.
+    Please follow: https://onnxruntime.ai/docs/execution-providers/CoreML-ExecutionProvider.html#coreml_flag_enable_on_subgraph
+
+    @return reference to this parameter structure.
+    */
+    GAPI_WRAP
+    CoreML& cfgEnableOnSubgraph() {
+        this->enable_on_subgraph = true;
+        return *this;
+    }
+
+    /** @brief Lets the CoreML EP run only on Apple Neural Engine.
+
+    Sets the flag that enables the CoreML EP only on Apple Neural Engine.
+    Please follow: https://onnxruntime.ai/docs/execution-providers/CoreML-ExecutionProvider.html#coreml_flag_only_enable_device_with_ane
+
+    @return reference to this parameter structure.
+    */
+    GAPI_WRAP
+    CoreML& cfgEnableOnlyNeuralEngine() {
+        this->enable_only_ane = true;
+        return *this;
+    }
+
+    bool use_cpu_only = false;       // set by cfgUseCPUOnly()
+    bool enable_on_subgraph = false; // set by cfgEnableOnSubgraph()
+    bool enable_only_ane = false;    // set by cfgEnableOnlyNeuralEngine()
+};
+
+/**
+ * @brief This structure provides functions
+ * that fill inference options for CUDA Execution Provider.
+ * Please follow https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#cuda-execution-provider
+ */
+struct GAPI_EXPORTS_W_SIMPLE CUDA {
+    // NB: Used from python.
+    /// @private -- Exclude this constructor from OpenCV documentation
+    GAPI_WRAP
+    CUDA() = default;
+
+    /** @brief Class constructor.
+
+    Constructs CUDA parameters based on the target device id.
+
+    @param dev_id Target device id to use.
+    */
+    GAPI_WRAP
+    explicit CUDA(const int dev_id)
+        : device_id(dev_id) {
+    }
+
+    int device_id = 0; //!< Target device id; initialized to 0 so a default-constructed object never holds an indeterminate value.
+};
+
+/**
+ * @brief This structure provides functions
+ * that fill inference options for TensorRT Execution Provider.
+ * Please follow https://onnxruntime.ai/docs/execution-providers/TensorRT-ExecutionProvider.html#tensorrt-execution-provider
+ */
+struct GAPI_EXPORTS_W_SIMPLE TensorRT {
+    // NB: Used from python.
+    /// @private -- Exclude this constructor from OpenCV documentation
+    GAPI_WRAP
+    TensorRT() = default;
+
+    /** @brief Class constructor.
+
+    Constructs TensorRT parameters based on the target device id.
+
+    @param dev_id Target device id to use.
+    */
+    GAPI_WRAP
+    explicit TensorRT(const int dev_id)
+        : device_id(dev_id) {
+    }
+
+    int device_id = 0; //!< Target device id; initialized to 0 so a default-constructed object never holds an indeterminate value.
+};
+
+/**
+ * @brief This structure provides functions
+ * that fill inference options for ONNX OpenVINO Execution Provider.
+ * Please follow https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#summary-of-options
+ */
+struct GAPI_EXPORTS_W_SIMPLE OpenVINO {
+    // NB: Used from python.
+    /// @private -- Exclude this constructor from OpenCV documentation
+    GAPI_WRAP
+    OpenVINO() = default;
+
+    /** @brief Class constructor.
+
+    Constructs OpenVINO parameters based on device type information.
+
+    @param dev_type Target device type to use. ("CPU", "GPU", "GPU.0" etc)
+    */
+    GAPI_WRAP
+    explicit OpenVINO(const std::string &dev_type)
+        : device_type(dev_type) {
+    }
+
+    /** @brief Class constructor.
+
+    Constructs OpenVINO parameters based on map of options passed.
+
+    @param params A map of parameter names and their corresponding string values.
+    */
+    GAPI_WRAP
+    explicit OpenVINO(const std::map<std::string, std::string>& params)
+        : params_map(params) {
+    }
+
+    /** @brief Specifies OpenVINO Execution Provider cache dir.
+
+    This function is used to explicitly specify the path to save and load
+    the blobs enabling model caching feature.
+
+    @param dir Path to the directory that will be used as cache.
+    @return reference to this parameter structure.
+    */
+    GAPI_WRAP
+    OpenVINO& cfgCacheDir(const std::string &dir) {
+        if (!params_map.empty()) { // options given as a map are final
+            cv::util::throw_error(std::logic_error("ep::OpenVINO cannot be changed if "
+                                                   "created from the parameters map."));
+        }
+        cache_dir = dir;
+        return *this;
+    }
+
+    /** @brief Specifies OpenVINO Execution Provider number of threads.
+
+    This function is used to override the accelerator default value
+    of number of threads with this value at runtime.
+
+    @param nthreads Number of threads.
+    @return reference to this parameter structure.
+    */
+    GAPI_WRAP
+    OpenVINO& cfgNumThreads(size_t nthreads) {
+        if (!params_map.empty()) { // options given as a map are final
+            cv::util::throw_error(std::logic_error("ep::OpenVINO cannot be changed if "
+                                                   "created from the parameters map."));
+        }
+        num_of_threads = nthreads;
+        return *this;
+    }
+
+    /** @brief Enables OpenVINO Execution Provider opencl throttling.
+
+    This function is used to enable OpenCL queue throttling for GPU devices
+    (reduces CPU utilization when using GPU).
+
+    @return reference to this parameter structure.
+    */
+    GAPI_WRAP
+    OpenVINO& cfgEnableOpenCLThrottling() {
+        if (!params_map.empty()) { // options given as a map are final
+            cv::util::throw_error(std::logic_error("ep::OpenVINO cannot be changed if "
+                                                   "created from the parameters map."));
+        }
+        enable_opencl_throttling = true;
+        return *this;
+    }
+
+    /** @brief Enables OpenVINO Execution Provider dynamic shapes.
+
+    This function is used to enable work with dynamic shaped models
+    whose shape will be set dynamically based on the infer input
+    image/data shape at run time in CPU.
+
+    A std::logic_error is reported through cv::util::throw_error if this
+    object was created from the parameters map.
+    @return reference to this parameter structure.
+    */
+    GAPI_WRAP
+    OpenVINO& cfgEnableDynamicShapes() {
+        if (!params_map.empty()) { // options given as a map are final
+            cv::util::throw_error(std::logic_error("ep::OpenVINO cannot be changed if "
+                                                   "created from the parameters map."));
+        }
+        enable_dynamic_shapes = true;
+        return *this;
+    }
+
+    std::string device_type;                       // set by the device-type constructor
+    std::string cache_dir;                         // set by cfgCacheDir()
+    size_t num_of_threads = 0;                     // set by cfgNumThreads()
+    bool enable_opencl_throttling = false;         // set by cfgEnableOpenCLThrottling()
+    bool enable_dynamic_shapes = false;            // set by cfgEnableDynamicShapes()
+    std::map<std::string, std::string> params_map; // set by the map constructor; non-empty map blocks the cfg* setters
+};
+
+/**
+ * @brief This structure provides functions
+ * that fill inference options for ONNX DirectML Execution Provider.
+ * Please follow https://onnxruntime.ai/docs/execution-providers/DirectML-ExecutionProvider.html#directml-execution-provider
+ */
+class GAPI_EXPORTS_W_SIMPLE DirectML {
+public:
+    // NB: Used from python.
+    /// @private -- Exclude this constructor from OpenCV documentation
+    GAPI_WRAP
+    DirectML() = default;
+
+    /** @brief Class constructor: builds DirectML parameters from a device id.
+
+    @param device_id Target device id to use. (0, 1, etc)
+    */
+    GAPI_WRAP
+    explicit DirectML(const int device_id)
+        : ddesc(device_id) {
+    }
+
+    /** @brief Class constructor: builds DirectML parameters from an adapter name.
+
+    @param adapter_name Target adapter_name to use.
+    */
+    GAPI_WRAP
+    explicit DirectML(const std::string &adapter_name)
+        : ddesc(adapter_name) {
+    }
+
+    using DeviceDesc = cv::util::variant<int, std::string>; // device id or adapter name
+    DeviceDesc ddesc;                                       // filled by the constructors above
+};
+
+using EP = cv::util::variant<cv::util::monostate, // no provider selected
+                             OpenVINO,
+                             DirectML,
+                             CoreML,
+                             CUDA,
+                             TensorRT>;
+
+} // namespace ep
+
+GAPI_EXPORTS cv::gapi::GBackend backend(); // declaration only; returned by onnx::Params::backend()
+
+enum class TraitAs: int {
+    TENSOR, //!< G-API treats an associated cv::Mat as a raw tensor
+            //!< and passes dimensions as-is
+    IMAGE   //!< G-API treats an associated cv::Mat as an image so
+            //!< creates an "image" blob (NCHW/NHWC, etc)
+};
+
+using PostProc = std::function<void(const std::unordered_map<std::string, cv::Mat> &,   // 1st: onnx result, keyed by layer name
+                                          std::unordered_map<std::string, cv::Mat> &)>; // 2nd: graph output, keyed by layer name
+
+namespace detail {
+/**
+* @brief This structure contains description of inference parameters
+* which is specific to ONNX models.
+*/
+struct ParamDesc { // NB: Params<cv::gapi::Generic> brace-initializes this struct positionally, so keep the member order.
+    std::string model_path; //!< Path to model.
+
+    // NB: num_* may differ from topology's real input/output port numbers
+    // (e.g. topology's partial execution)
+    std::size_t num_in;  //!< How many inputs are defined in the operation
+    std::size_t num_out; //!< How many outputs are defined in the operation
+
+    // NB: Here order follows the `Net` API
+    std::vector<std::string> input_names; //!< Names of input network layers.
+    std::vector<std::string> output_names; //!< Names of output network layers.
+
+    using ConstInput = std::pair<cv::Mat, TraitAs>;
+    std::unordered_map<std::string, ConstInput> const_inputs; //!< Map with pair of name of network layer and ConstInput which will be associated with this.
+
+    std::vector<cv::Scalar> mean; //!< Mean values for preprocessing.
+    std::vector<cv::Scalar> stdev; //!< Standard deviation values for preprocessing.
+
+    std::vector<cv::GMatDesc> out_metas; //!< Out meta information about your output (type, dimension).
+    PostProc custom_post_proc; //!< Post processing function.
+
+    std::vector<bool> normalize; //!< Vector of bool values that enable or disable normalization of input data.
+
+    std::vector<std::string> names_to_remap; //!< Names of output layers that will be processed in PostProc function.
+
+    bool is_generic; //!< false for Params<Net>, true for Params<cv::gapi::Generic>.
+
+    // TODO: Needs to modify the rest of ParamDesc accordingly to support
+    // both generic and non-generic options without duplication
+    // (as it was done for the OV IE backend)
+    // These values are pushed into the respective vector<> fields above
+    // when the generic infer parameters are unpacked (see GONNXBackendImpl::unpackKernel)
+    std::unordered_map<std::string, std::pair<cv::Scalar, cv::Scalar> > generic_mstd; //!< Layer name -> (mean, stdev) pair.
+    std::unordered_map<std::string, bool> generic_norm; //!< Layer name -> normalize flag.
+
+    std::vector<cv::gapi::onnx::ep::EP> execution_providers; //!< Filled by cfgAddExecutionProvider().
+    bool disable_mem_pattern; //!< Set to true by cfgDisableMemPattern().
+};
+} // namespace detail
+
+template<typename Net>
+struct PortCfg {
+    // Fixed-size per-port containers. Each size is the arity of
+    // Net::InArgs / Net::OutArgs, obtained through std::tuple_size.
+
+    //! std::string per element of Net::InArgs.
+    using In        = std::array<std::string, std::tuple_size<typename Net::InArgs>::value>;
+    //! std::string per element of Net::OutArgs.
+    using Out       = std::array<std::string, std::tuple_size<typename Net::OutArgs>::value>;
+    //! cv::Scalar per element of Net::InArgs.
+    using NormCoefs = std::array<cv::Scalar,  std::tuple_size<typename Net::InArgs>::value>;
+    //! bool per element of Net::InArgs.
+    using Normalize = std::array<bool,        std::tuple_size<typename Net::InArgs>::value>;
+
+};
+
+/**
+ * Contains description of inference parameters and kit of functions that
+ * fill this parameters.
+ */
+template<typename Net> class Params {
+public:
+    /** @brief Class constructor.
+
+    Builds Params for the given model and fills the remaining fields of the
+    inference description with their default values.
+
+    @param model Path to model (.onnx file).
+    */
+    Params(const std::string &model) {
+        desc.is_generic = false;
+        desc.disable_mem_pattern = false;
+        desc.num_in  = std::tuple_size<typename Net::InArgs>::value;
+        desc.num_out = std::tuple_size<typename Net::OutArgs>::value;
+        desc.model_path = model;
+    }
+
+    /** @brief Specifies sequence of network input layers names for inference.
+
+    Associates the data of the graph inputs with the input layers of the
+    network topology. The number of names has to match the number of network
+    inputs. For a network with a single input layer the call is optional,
+    because that layer is associated with the input automatically; calling
+    the function anyway is allowed.
+
+    @param layer_names std::array<std::string, N> where N is the number of inputs
+    as defined in the @ref G_API_NET. Contains names of input layers.
+    @return the reference on modified object.
+    */
+    Params& cfgInputLayers(const typename PortCfg<Net>::In &layer_names) {
+        desc.input_names.assign(layer_names.cbegin(), layer_names.cend());
+        return *this;
+    }
+
+    /** @brief Specifies sequence of output layers names for inference.
+
+    Associates the data of the graph outputs with the output layers of the
+    network topology. For a network with a single output layer the call is
+    optional, because that layer is associated with the output automatically;
+    calling the function anyway is allowed. The number of names has to match
+    the number of network outputs, unless you define your own outputs, in
+    which case @ref cfgPostProc has to be used as well.
+
+    @param layer_names std::array<std::string, N> where N is the number of outputs
+    as defined in the @ref G_API_NET. Contains names of output layers.
+    @return the reference on modified object.
+    */
+    Params& cfgOutputLayers(const typename PortCfg<Net>::Out &layer_names) {
+        desc.output_names.assign(layer_names.cbegin(), layer_names.cend());
+        return *this;
+    }
+
+    /** @brief Sets a constant input.
+
+    Binds a constant input to a network layer. Preprocessing is disabled for
+    such inputs, so the data has to be an already prepared tensor. The name of
+    the network layer that receives the data has to be provided.
+
+    @param layer_name Name of network layer.
+    @param data cv::Mat that contains data which will be associated with network layer.
+    @param hint Type of input (TENSOR).
+    @return the reference on modified object.
+    */
+    Params& constInput(const std::string &layer_name,
+                       const cv::Mat &data,
+                       TraitAs hint = TraitAs::TENSOR) {
+        desc.const_inputs[layer_name] = detail::ParamDesc::ConstInput{data, hint};
+        return *this;
+    }
+
+    /** @brief Specifies mean value and standard deviation for preprocessing.
+
+    Stores the mean values and the standard deviations that are used for
+    preprocessing of input data.
+
+    @param m std::array<cv::Scalar, N> where N is the number of inputs
+    as defined in the @ref G_API_NET. Contains mean values.
+    @param s std::array<cv::Scalar, N> where N is the number of inputs
+    as defined in the @ref G_API_NET. Contains standard deviation values.
+    @return the reference on modified object.
+    */
+    Params& cfgMeanStd(const typename PortCfg<Net>::NormCoefs &m,
+                       const typename PortCfg<Net>::NormCoefs &s) {
+        desc.mean.assign(m.cbegin(), m.cend());
+        desc.stdev.assign(s.cbegin(), s.cend());
+        return *this;
+    }
+
+    /** @brief Configures graph output and provides the post processing function from user.
+
+    Intended for networks with dynamic outputs. The dimensions of an inference
+    result are not known in advance, so they have to be given explicitly to
+    construct the graph output; they may differ from the actual inference result.
+    The @ref PostProc function therefore reads the inference result and fills
+    the output that was constructed from the dimensions in out_metas.
+
+    @param out_metas Out meta information about your output (type, dimension).
+    @param remap_function Post processing function, which has two parameters. First is onnx
+    result, second is graph output. Both parameters are std::unordered_map that contain
+    pairs of layer's name and cv::Mat.
+    @return the reference on modified object.
+    */
+    Params& cfgPostProc(const std::vector<cv::GMatDesc> &out_metas,
+                        const PostProc &remap_function) {
+        desc.out_metas        = out_metas;
+        desc.custom_post_proc = remap_function;
+        return *this;
+    }
+
+    /** @overload
+    Variant taking rvalue arguments.
+
+    @param out_metas rvalue out meta information about your output (type, dimension).
+    @param remap_function rvalue post processing function, which has two parameters. First is onnx
+    result, second is graph output. Both parameters are std::unordered_map that contain
+    pairs of layer's name and cv::Mat.
+    @return the reference on modified object.
+    */
+    Params& cfgPostProc(std::vector<cv::GMatDesc> &&out_metas,
+                        PostProc &&remap_function) {
+        desc.out_metas        = std::move(out_metas);
+        desc.custom_post_proc = std::move(remap_function);
+        return *this;
+    }
+
+    /** @overload
+    Takes an additional parameter, names_to_remap, that lists the output
+    layers which are used for inference and are passed to the post
+    processing function.
+
+    @param out_metas Out meta information.
+    @param remap_function Post processing function.
+    @param names_to_remap Names of output layers. network's inference will
+    be done on these layers. Inference's result will be processed in post processing
+    function using these names.
+    @return the reference on modified object.
+    */
+    Params& cfgPostProc(const std::vector<cv::GMatDesc> &out_metas,
+                        const PostProc &remap_function,
+                        const std::vector<std::string> &names_to_remap) {
+        desc.out_metas        = out_metas;
+        desc.custom_post_proc = remap_function;
+        desc.names_to_remap   = names_to_remap;
+        return *this;
+    }
+
+    /** @overload
+    Variant taking rvalue arguments together with names_to_remap.
+
+    @param out_metas rvalue out meta information.
+    @param remap_function rvalue post processing function.
+    @param names_to_remap rvalue names of output layers. network's inference will
+    be done on these layers. Inference's result will be processed in post processing
+    function using these names.
+    @return the reference on modified object.
+    */
+    Params& cfgPostProc(std::vector<cv::GMatDesc> &&out_metas,
+                        PostProc &&remap_function,
+                        std::vector<std::string> &&names_to_remap) {
+        desc.out_metas        = std::move(out_metas);
+        desc.custom_post_proc = std::move(remap_function);
+        desc.names_to_remap   = std::move(names_to_remap);
+        return *this;
+    }
+
+    /** @brief Specifies normalize parameter for preprocessing.
+
+    Stores the normalize flags that are used for preprocessing of input data.
+
+    @param normalizations std::array<bool, N> where N is the number of inputs
+    as defined in the @ref G_API_NET. Contains bool values that enable or disable
+    normalization of input data.
+    @return the reference on modified object.
+    */
+    Params& cfgNormalize(const typename PortCfg<Net>::Normalize &normalizations) {
+        desc.normalize.assign(normalizations.cbegin(), normalizations.cend());
+        return *this;
+    }
+
+    /** @brief Adds execution provider for runtime.
+
+    Appends ONNX Runtime OpenVINO Execution Provider options to the list of providers.
+
+    @param ep OpenVINO Execution Provider options.
+    @see cv::gapi::onnx::ep::OpenVINO.
+
+    @return the reference on modified object.
+    */
+    Params& cfgAddExecutionProvider(ep::OpenVINO&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+        return *this;
+    }
+
+    /** @brief Adds execution provider for runtime.
+
+    Appends ONNX Runtime DirectML Execution Provider options to the list of providers.
+
+    @param ep DirectML Execution Provider options.
+    @see cv::gapi::onnx::ep::DirectML.
+
+    @return the reference on modified object.
+    */
+    Params& cfgAddExecutionProvider(ep::DirectML&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+        return *this;
+    }
+
+    /** @brief Adds execution provider for runtime.
+
+    Appends ONNX Runtime CoreML Execution Provider options to the list of providers.
+
+    @param ep CoreML Execution Provider options.
+    @see cv::gapi::onnx::ep::CoreML.
+
+    @return the reference on modified object.
+    */
+    Params& cfgAddExecutionProvider(ep::CoreML&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+        return *this;
+    }
+
+    /** @brief Adds execution provider for runtime.
+
+    Appends ONNX Runtime CUDA Execution Provider options to the list of providers.
+
+    @param ep CUDA Execution Provider options.
+    @see cv::gapi::onnx::ep::CUDA.
+
+    @return the reference on modified object.
+    */
+    Params& cfgAddExecutionProvider(ep::CUDA&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+        return *this;
+    }
+
+    /** @brief Adds execution provider for runtime.
+
+    Appends ONNX Runtime TensorRT Execution Provider options to the list of providers.
+
+    @param ep TensorRT Execution Provider options.
+    @see cv::gapi::onnx::ep::TensorRT.
+
+    @return the reference on modified object.
+    */
+    Params& cfgAddExecutionProvider(ep::TensorRT&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+        return *this;
+    }
+
+    /** @brief Turns the memory pattern optimization off.
+
+    @return the reference on modified object.
+    */
+    Params& cfgDisableMemPattern() {
+        desc.disable_mem_pattern = true;
+        return *this;
+    }
+
+    // BEGIN(G-API's network parametrization API)
+    GBackend      backend() const { return cv::gapi::onnx::backend(); }
+    std::string   tag()     const { return Net::tag(); }
+    cv::util::any params()  const { return { desc }; }
+    // END(G-API's network parametrization API)
+
+protected:
+    detail::ParamDesc desc; // inference description filled by the cfg* methods above
+};
+
+/*
+* @brief This structure provides functions for generic network type that
+* fill inference parameters.
+* @see struct Generic
+*/
+template<>
+class Params<cv::gapi::Generic> {
+public:
+    /** @brief Class constructor.
+
+    Builds Params from a tag and a model path; the remaining fields of the
+    inference description get their default values.
+
+    @param tag string tag of the network for which these parameters are intended.
+    @param model_path path to model file (.onnx file).
+    */
+    Params(const std::string& tag, const std::string& model_path)
+        : desc{model_path, 0u, 0u, {}, {}, {}, {}, {}, {}, {}, {}, {}, true, {}, {}, {}, false }, m_tag(tag) {}
+
+    /** @see onnx::Params::cfgMeanStd. */
+    void cfgMeanStdDev(const std::string &layer,
+                       const cv::Scalar &m,
+                       const cv::Scalar &s) {
+        desc.generic_mstd[layer] = std::pair<cv::Scalar, cv::Scalar>(m, s);
+    }
+
+    /** @see onnx::Params::cfgNormalize. */
+    void cfgNormalize(const std::string &layer, bool flag) {
+        desc.generic_norm[layer] = flag; // stored per layer name
+    }
+
+    /** @see onnx::Params::cfgAddExecutionProvider. */
+    void cfgAddExecutionProvider(ep::OpenVINO&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep)); // appended as an OpenVINO entry
+    }
+
+    /** @see onnx::Params::cfgAddExecutionProvider. */
+    void cfgAddExecutionProvider(ep::DirectML&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep)); // appended as a DirectML entry
+    }
+
+    /** @see onnx::Params::cfgAddExecutionProvider. */
+    void cfgAddExecutionProvider(ep::CoreML&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep)); // appended as a CoreML entry
+    }
+
+    /** @see onnx::Params::cfgAddExecutionProvider. */
+    void cfgAddExecutionProvider(ep::CUDA&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep)); // appended as a CUDA entry
+    }
+
+    /** @see onnx::Params::cfgAddExecutionProvider. */
+    void cfgAddExecutionProvider(ep::TensorRT&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep)); // appended as a TensorRT entry
+    }
+
+    /** @see onnx::Params::cfgDisableMemPattern. */
+    void cfgDisableMemPattern() {
+        desc.disable_mem_pattern = true;
+    }
+
+    // BEGIN(G-API's network parametrization API)
+    GBackend      backend() const { return cv::gapi::onnx::backend(); }
+    std::string   tag()     const { return m_tag; }
+    cv::util::any params()  const { return { desc }; }
+    // END(G-API's network parametrization API)
+protected:
+    detail::ParamDesc desc; // inference description; is_generic is set to true by the constructor
+    std::string m_tag;      // network tag passed to the constructor
+};
+
+} // namespace onnx
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_INFER_ONNX_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/ov.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/ov.hpp
new file mode 100644
index 000000000000..782792489bac
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/ov.hpp
@@ -0,0 +1,709 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+#ifndef OPENCV_GAPI_INFER_OV_HPP
+#define OPENCV_GAPI_INFER_OV_HPP
+
+#include <string>
+
+#include <opencv2/gapi/util/any.hpp>
+#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS
+#include <opencv2/gapi/gkernel.hpp>     // GKernelType[M], GBackend
+#include <opencv2/gapi/infer.hpp>       // Generic
+
+#include <map>
+
+namespace cv {
+namespace gapi {
+
+/**
+ * @brief This namespace contains G-API OpenVINO 2.0 backend functions,
+ * structures, and symbols.
+ */
+namespace ov {
+
+GAPI_EXPORTS cv::gapi::GBackend backend(); // declaration only; no definition in this header
+
+namespace detail {
+
+template <typename Value>
+using AttrMap = std::map<std::string, Value>; // std::string key -> attribute value
+// NB: Holder for in/out layer attributes (precision, layout,
+// shape etc.).
+//
+// The held alternative tells how the user supplied the attribute:
+// 1. cv::util::monostate - nothing was specified explicitly.
+// 2. Attr - one explicit value, to be broadcasted to all layers.
+// 3. AttrMap[str->T] - explicit values for particular layers.
+template <typename Attr>
+using LayerVariantAttr = cv::util::variant<cv::util::monostate,
+                                           AttrMap<Attr>,
+                                           Attr>;
+
+struct ParamDesc {
+    struct Model {
+        // Model given by a model file path plus a data (bin) file path.
+        Model(const std::string &model_path_,
+              const std::string &bin_path_)
+            : model_path(model_path_), bin_path(bin_path_) {
+        }
+
+        std::string model_path; // copy of model_path_
+        std::string bin_path;   // copy of bin_path_
+
+        LayerVariantAttr<std::string> input_tensor_layout;
+        LayerVariantAttr<std::string> input_model_layout;
+        LayerVariantAttr<std::string> output_tensor_layout;
+        LayerVariantAttr<std::string> output_model_layout;
+        LayerVariantAttr<int>         output_tensor_precision;
+
+        LayerVariantAttr<std::vector<size_t>> new_shapes;
+
+        LayerVariantAttr<std::vector<float>> mean_values;
+        LayerVariantAttr<std::vector<float>> scale_values;
+
+        LayerVariantAttr<int> interpolation;
+    };
+
+    struct CompiledModel {
+        std::string blob_path; // path to the compiled model (*.blob)
+    };
+
+    using Kind = cv::util::variant<Model, CompiledModel>;
+    // Takes ownership of kind_ by moving it; the other arguments are copied.
+    ParamDesc(Kind              &&kind_,
+              const std::string &device_,
+              const bool        is_generic_,
+              const size_t      num_in_,
+              const size_t      num_out_)
+        : kind(std::move(kind_)), device(device_),
+          is_generic(is_generic_),
+          num_in(num_in_), num_out(num_out_) {
+    }
+
+    Kind kind; // either a Model or a CompiledModel
+
+    std::string device; // target device passed to the constructor
+    bool is_generic;
+
+    std::size_t num_in;
+    std::size_t num_out;
+
+    std::vector<std::string> input_names;  // set by Params::cfgInputLayers()
+    std::vector<std::string> output_names; // set by Params::cfgOutputLayers()
+
+    using PluginConfigT = std::map<std::string, std::string>;
+    PluginConfigT config;
+
+    size_t nireq = 1; // defaults to 1
+};
+
+// NB: Common guard for setters of Model-only attributes (avoids duplication).
+static detail::ParamDesc::Model&
+getModelToSetAttrOrThrow(detail::ParamDesc::Kind &kind, const std::string &attr_name) {
+    const bool is_compiled =
+        cv::util::holds_alternative<detail::ParamDesc::CompiledModel>(kind);
+    if (is_compiled) {
+        cv::util::throw_error(std::logic_error(
+            "Specifying " + attr_name + " isn't possible for compiled model."));
+    }
+    GAPI_Assert(cv::util::holds_alternative<detail::ParamDesc::Model>(kind));
+    return cv::util::get<detail::ParamDesc::Model>(kind);
+}
+
+} // namespace detail
+
+/**
+ * @brief This structure provides functions
+ * that fill inference parameters for "OpenVINO Toolkit" model.
+ */
+template<typename Net> struct Params {
+public:
+    /** @brief Class constructor.
+
+    Constructs Params based on model information and specifies default values for other
+    inference description parameters. Model is loaded and compiled using "OpenVINO Toolkit".
+
+    @param model_path Path to a model.
+    @param bin_path Path to a data file.
+    For IR format (*.bin):
+    If path is empty, will try to read a bin file with the same name as xml.
+    If the bin file with the same name is not found, will load IR without weights.
+    For PDPD (*.pdmodel) and ONNX (*.onnx) formats bin_path isn't used.
+    @param device target device to use.
+    */
+    Params(const std::string &model_path,
+           const std::string &bin_path,
+           const std::string &device)
+        : m_desc( detail::ParamDesc::Kind{detail::ParamDesc::Model{model_path, bin_path}}
+                 , device
+                 , false /* is generic */
+                 , std::tuple_size<typename Net::InArgs>::value
+                 , std::tuple_size<typename Net::OutArgs>::value) {
+    }
+
+    /** @overload
+    Use this constructor to work with pre-compiled network.
+    Model is imported from a pre-compiled blob.
+
+    @param blob_path path to the compiled model (*.blob).
+    @param device target device to use.
+    */
+    Params(const std::string &blob_path,
+           const std::string &device)
+        : m_desc( detail::ParamDesc::Kind{detail::ParamDesc::CompiledModel{blob_path}}
+                 , device
+                 , false /* is generic */
+                 , std::tuple_size<typename Net::InArgs>::value
+                 , std::tuple_size<typename Net::OutArgs>::value) {
+    }
+
+    /** @brief Specifies sequence of network input layer names for inference.
+
+    The function is used to associate cv::gapi::infer<> inputs with the model inputs.
+    Number of names has to match the number of network inputs as defined in G_API_NET().
+    In case a network has only a single input layer, there is no need to specify name manually.
+
+    @param layer_names std::vector<std::string> holding N names, where N is the number of inputs
+    as defined in the @ref G_API_NET. Contains names of input layers.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgInputLayers(const std::vector<std::string> &layer_names) {
+        m_desc.input_names = layer_names;
+        return *this;
+    }
+
+    /** @brief Specifies sequence of network output layer names for inference.
+
+    The function is used to associate cv::gapi::infer<> outputs with the model outputs.
+    Number of names has to match the number of network outputs as defined in G_API_NET().
+    In case a network has only a single output layer, there is no need to specify name manually.
+
+    @param layer_names std::vector<std::string> holding N names, where N is the number of outputs
+    as defined in the @ref G_API_NET. Contains names of output layers.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgOutputLayers(const std::vector<std::string> &layer_names) {
+        m_desc.output_names = layer_names;
+        return *this;
+    }
+
+    /** @brief Specifies OpenVINO plugin configuration.
+
+    The function is used to set configuration for OpenVINO plugin. Some parameters
+    can be different for each plugin. Please follow https://docs.openvinotoolkit.org/latest/index.html
+    to check information about specific plugin.
+
+    @param config Map of pairs: (config parameter name, config parameter value).
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgPluginConfig(const detail::ParamDesc::PluginConfigT &config) {
+        m_desc.config = config;
+        return *this;
+    }
+
+    /** @brief Specifies tensor layout for an input layer.
+
+    The function is used to set tensor layout for an input layer.
+
+    @param layout Tensor layout ("NCHW", "NHWC", etc)
+    will be applied to all input layers.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgInputTensorLayout(std::string layout) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "input tensor layout")
+            .input_tensor_layout = std::move(layout);
+        return *this;
+    }
+
+    /** @overload
+    @param layout_map Map of pairs: name of corresponding input layer
+    and its tensor layout represented in std::string ("NCHW", "NHWC", etc)
+    @return reference to this parameter structure.
+    */
+    Params<Net>&
+    cfgInputTensorLayout(detail::AttrMap<std::string> layout_map) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "input tensor layout")
+            .input_tensor_layout = std::move(layout_map);
+        return *this;
+    }
+
+    /** @brief Specifies model layout for an input layer.
+
+    The function is used to set model layout for an input layer.
+
+    @param layout Model layout ("NCHW", "NHWC", etc)
+    will be applied to all input layers.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgInputModelLayout(std::string layout) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "input model layout")
+            .input_model_layout = std::move(layout);
+        return *this;
+    }
+
+    /** @overload
+    @param layout_map Map of pairs: name of corresponding input layer
+    and its model layout ("NCHW", "NHWC", etc)
+    @return reference to this parameter structure.
+    */
+    Params<Net>&
+    cfgInputModelLayout(detail::AttrMap<std::string> layout_map) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "input model layout")
+            .input_model_layout = std::move(layout_map);
+        return *this;
+    }
+
+    /** @brief Specifies tensor layout for an output layer.
+
+    The function is used to set tensor layout for an output layer.
+
+    @param layout Tensor layout ("NCHW", "NHWC", etc)
+    will be applied to all output layers.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgOutputTensorLayout(std::string layout) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "output tensor layout")
+            .output_tensor_layout = std::move(layout);
+        return *this;
+    }
+
+    /** @overload
+    @param layout_map Map of pairs: name of corresponding output layer
+    and its tensor layout represented in std::string ("NCHW", "NHWC", etc)
+    @return reference to this parameter structure.
+    */
+    Params<Net>&
+    cfgOutputTensorLayout(detail::AttrMap<std::string> layout_map) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "output tensor layout")
+            .output_tensor_layout = std::move(layout_map);
+        return *this;
+    }
+
+    /** @brief Specifies model layout for an output layer.
+
+    The function is used to set model layout for an output layer.
+
+    @param layout Model layout ("NCHW", "NHWC", etc)
+    will be applied to all output layers.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgOutputModelLayout(std::string layout) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "output model layout")
+            .output_model_layout = std::move(layout);
+        return *this;
+    }
+
+    /** @overload
+    @param layout_map Map of pairs: name of corresponding output layer
+    and its model layout ("NCHW", "NHWC", etc)
+    @return reference to this parameter structure.
+    */
+    Params<Net>&
+    cfgOutputModelLayout(detail::AttrMap<std::string> layout_map) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "output model layout")
+            .output_model_layout = std::move(layout_map);
+        return *this;
+    }
+
+    /** @brief Specifies tensor precision for an output layer.
+
+    The function is used to set tensor precision for an output layer.
+
+    @param precision Precision in OpenCV format (CV_8U, CV_32F, ...)
+    will be applied to all output layers.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgOutputTensorPrecision(int precision) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "output tensor precision")
+            .output_tensor_precision = precision;
+        return *this;
+    }
+
+    /** @overload
+
+    @param precision_map Map of pairs: name of corresponding output layer
+    and its precision in OpenCV format (CV_8U, CV_32F, ...)
+    @return reference to this parameter structure.
+    */
+    Params<Net>&
+    cfgOutputTensorPrecision(detail::AttrMap<int> precision_map) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "output tensor precision")
+            .output_tensor_precision = std::move(precision_map);
+        return *this;
+    }
+
+    /** @brief Specifies the new shape for input layers.
+
+    The function is used to set new shape for input layers.
+
+    @param new_shape New shape will be applied to all input layers.
+    @return reference to this parameter structure.
+    */
+    Params<Net>&
+    cfgReshape(std::vector<size_t> new_shape) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "reshape")
+            .new_shapes = std::move(new_shape);
+        return *this;
+    }
+
+    /** @overload
+
+    @param new_shape_map Map of pairs: name of corresponding input layer
+    and its new shape.
+    @return reference to this parameter structure.
+    */
+    Params<Net>&
+    cfgReshape(detail::AttrMap<std::vector<size_t>> new_shape_map) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "reshape")
+            .new_shapes = std::move(new_shape_map);
+        return *this;
+    }
+
+    /** @brief Specifies number of asynchronous inference requests.
+
+    @param nireq Number of asynchronous inference requests; zero is rejected with a std::logic_error.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgNumRequests(const size_t nireq) {
+        if (nireq == 0) {
+            cv::util::throw_error(
+                    std::logic_error("Number of inference requests"
+                                     " must be greater than zero."));
+        }
+        m_desc.nireq = nireq;
+        return *this;
+    }
+
+    /** @brief Specifies mean values for preprocessing.
+     *
+    The function is used to set mean values for input layer preprocessing.
+
+    @param mean_values Float vector contains mean values
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgMean(std::vector<float> mean_values) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "mean values")
+            .mean_values = std::move(mean_values);
+        return *this;
+    }
+
+    /** @overload
+
+    @param mean_map Map of pairs: name of corresponding input layer
+    and its mean values.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgMean(detail::AttrMap<std::vector<float>> mean_map) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "mean values")
+            .mean_values = std::move(mean_map);
+        return *this;
+    }
+
+    /** @brief Specifies scale values for preprocessing.
+     *
+    The function is used to set scale values for input layer preprocessing.
+
+    @param scale_values Float vector contains scale values
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgScale(std::vector<float> scale_values) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "scale values")
+            .scale_values = std::move(scale_values);
+        return *this;
+    }
+
+    /** @overload
+
+    @param scale_map Map of pairs: name of corresponding input layer
+    and its scale values.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgScale(detail::AttrMap<std::vector<float>> scale_map) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "scale values")
+            .scale_values = std::move(scale_map);
+        return *this;
+    }
+
+    /** @brief Specifies resize interpolation algorithm.
+     *
+    The function is used to configure resize preprocessing for input layer.
+
+    @param interpolation Resize interpolation algorithm.
+    Supported algorithms: #INTER_NEAREST, #INTER_LINEAR, #INTER_CUBIC.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgResize(int interpolation) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "resize preprocessing")
+            .interpolation = std::move(interpolation);
+        return *this;
+    }
+
+    /** @overload
+
+    @param interpolation Map of pairs: name of corresponding input layer
+    and its resize algorithm.
+    @return reference to this parameter structure.
+    */
+    Params<Net>& cfgResize(detail::AttrMap<int> interpolation) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "resize preprocessing")
+            .interpolation = std::move(interpolation);
+        return *this;
+    }
+
+    // BEGIN(G-API's network parametrization API)
+    GBackend      backend() const { return cv::gapi::ov::backend(); }
+    std::string   tag()     const { return Net::tag(); }
+    cv::util::any params()  const { return { m_desc }; }
+    // END(G-API's network parametrization API)
+
+protected:
+    detail::ParamDesc m_desc;
+};
+
+/*
+* @brief This structure provides functions for generic network type that
+* fill inference parameters.
+* @see struct Generic
+*/
+template<>
+class Params<cv::gapi::Generic> {
+public:
+    /** @brief Class constructor.
+
+    Constructs Params based on model information and specifies default values for other
+    inference description parameters. Model is loaded and compiled using "OpenVINO Toolkit".
+
+    @param tag string tag of the network for which these parameters are intended.
+    @param model_path Path to a model.
+    @param bin_path Path to a data file.
+    For IR format (*.bin):
+    If path is empty, will try to read a bin file with the same name as xml.
+    If the bin file with the same name is not found, will load IR without weights.
+    For PDPD (*.pdmodel) and ONNX (*.onnx) formats bin_path isn't used.
+    @param device target device to use.
+    */
+    Params(const std::string &tag,
+           const std::string &model_path,
+           const std::string &bin_path,
+           const std::string &device)
+        : m_tag(tag),
+          m_desc( detail::ParamDesc::Kind{detail::ParamDesc::Model{model_path, bin_path}}
+                , device
+                , true /* is generic */
+                , 0u
+                , 0u) {
+    }
+
+    /** @overload
+
+    This constructor is for pre-compiled networks. Model is imported from a pre-compiled
+    blob.
+
+    @param tag string tag of the network for which these parameters are intended.
+    @param blob_path path to the compiled model (*.blob).
+    @param device target device to use.
+    */
+    Params(const std::string &tag,
+           const std::string &blob_path,
+           const std::string &device)
+        : m_tag(tag),
+          m_desc( detail::ParamDesc::Kind{detail::ParamDesc::CompiledModel{blob_path}}
+                , device
+                , true /* is generic */
+                , 0u
+                , 0u) {
+    }
+
+    /** @see ov::Params::cfgPluginConfig. */
+    Params& cfgPluginConfig(const detail::ParamDesc::PluginConfigT &config) {
+        m_desc.config = config;
+        return *this;
+    }
+
+    /** @see ov::Params::cfgInputTensorLayout. */
+    Params& cfgInputTensorLayout(std::string layout) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "input tensor layout")
+            .input_tensor_layout = std::move(layout);
+        return *this;
+    }
+
+    /** @overload */
+    Params&
+    cfgInputTensorLayout(detail::AttrMap<std::string> layout_map) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "input tensor layout")
+            .input_tensor_layout = std::move(layout_map);
+        return *this;
+    }
+
+    /** @see ov::Params::cfgInputModelLayout. */
+    Params& cfgInputModelLayout(std::string layout) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "input model layout")
+            .input_model_layout = std::move(layout);
+        return *this;
+    }
+
+    /** @overload */
+    Params&
+    cfgInputModelLayout(detail::AttrMap<std::string> layout_map) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "input model layout")
+            .input_model_layout = std::move(layout_map);
+        return *this;
+    }
+
+    /** @see ov::Params::cfgOutputTensorLayout. */
+    Params& cfgOutputTensorLayout(std::string layout) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "output tensor layout")
+            .output_tensor_layout = std::move(layout);
+        return *this;
+    }
+
+    /** @overload */
+    Params&
+    cfgOutputTensorLayout(detail::AttrMap<std::string> layout_map) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "output tensor layout")
+            .output_tensor_layout = std::move(layout_map);
+        return *this;
+    }
+
+    /** @see ov::Params::cfgOutputModelLayout. */
+    Params& cfgOutputModelLayout(std::string layout) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "output model layout")
+            .output_model_layout = std::move(layout);
+        return *this;
+    }
+
+    /** @overload */
+    Params&
+    cfgOutputModelLayout(detail::AttrMap<std::string> layout_map) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "output model layout")
+            .output_model_layout = std::move(layout_map);
+        return *this;
+    }
+
+    /** @see ov::Params::cfgOutputTensorPrecision. */
+    Params& cfgOutputTensorPrecision(int precision) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "output tensor precision")
+            .output_tensor_precision = precision;
+        return *this;
+    }
+
+    /** @overload */
+    Params&
+    cfgOutputTensorPrecision(detail::AttrMap<int> precision_map) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "output tensor precision")
+            .output_tensor_precision = std::move(precision_map);
+        return *this;
+    }
+
+    /** @see ov::Params::cfgReshape. */
+    Params& cfgReshape(std::vector<size_t> new_shape) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "reshape")
+            .new_shapes = std::move(new_shape);
+        return *this;
+    }
+
+    /** @overload */
+    Params&
+    cfgReshape(detail::AttrMap<std::vector<size_t>> new_shape_map) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "reshape")
+            .new_shapes = std::move(new_shape_map);
+        return *this;
+    }
+
+    /** @see ov::Params::cfgNumRequests. */
+    Params& cfgNumRequests(const size_t nireq) {
+        if (nireq == 0) {
+            cv::util::throw_error(
+                    std::logic_error("Number of inference requests"
+                                     " must be greater than zero."));
+        }
+        m_desc.nireq = nireq;
+        return *this;
+    }
+
+    /** @see ov::Params::cfgMean. */
+    Params& cfgMean(std::vector<float> mean_values) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "mean values")
+            .mean_values = std::move(mean_values);
+        return *this;
+    }
+
+    /** @overload */
+    Params& cfgMean(detail::AttrMap<std::vector<float>> mean_map) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "mean values")
+            .mean_values = std::move(mean_map);
+        return *this;
+    }
+
+    /** @see ov::Params::cfgScale. */
+    Params& cfgScale(std::vector<float> scale_values) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "scale values")
+            .scale_values = std::move(scale_values);
+        return *this;
+    }
+
+    /** @overload */
+    Params& cfgScale(detail::AttrMap<std::vector<float>> scale_map) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "scale values")
+            .scale_values = std::move(scale_map);
+        return *this;
+    }
+
+    /** @see ov::Params::cfgResize. */
+    Params& cfgResize(int interpolation) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "resize preprocessing")
+            .interpolation = std::move(interpolation);
+        return *this;
+    }
+
+    /** @overload */
+    Params& cfgResize(detail::AttrMap<int> interpolation) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "resize preprocessing")
+            .interpolation = std::move(interpolation);
+        return *this;
+    }
+
+    // BEGIN(G-API's network parametrization API)
+    GBackend      backend() const { return cv::gapi::ov::backend(); }
+    std::string   tag()     const { return m_tag; }
+    cv::util::any params()  const { return { m_desc }; }
+    // END(G-API's network parametrization API)
+
+protected:
+    std::string m_tag;
+    detail::ParamDesc m_desc;
+};
+
+} // namespace ov
+
+namespace wip { namespace ov {
+/**
+ * @brief Ask G-API OpenVINO backend to run only inference of model provided.
+ *
+ * G-API OpenVINO backend will perform only the inference of the model provided
+ * without populating input and copying back output data.
+ * This mode is used to evaluate the pure inference performance of the model without
+ * taking into account the i/o data transfer.
+ */
+struct benchmark_mode { };
+
+} // namespace ov
+} // namespace wip
+
+} // namespace gapi
+
+namespace detail
+{
+    template<> struct CompileArgTag<cv::gapi::wip::ov::benchmark_mode>
+    {
+        static const char* tag() { return "gapi.wip.ov.benchmark_mode"; }
+    };
+}
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_INFER_OV_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/parsers.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/parsers.hpp
new file mode 100644
index 000000000000..e39d6fd4c6a2
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/infer/parsers.hpp
@@ -0,0 +1,138 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2020 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_PARSERS_HPP
+#define OPENCV_GAPI_PARSERS_HPP
+
+#include <utility> // std::tuple
+
+#include <opencv2/gapi/gmat.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+
+namespace cv { namespace gapi {
+namespace nn {
+namespace parsers {
+    using GRects      = GArray<Rect>;
+    using GDetections = std::tuple<GArray<Rect>, GArray<int>>;
+
+    G_TYPED_KERNEL(GParseSSDBL, <GDetections(GMat, GOpaque<Size>, float, int)>,
+                   "org.opencv.nn.parsers.parseSSD_BL") {
+        static std::tuple<GArrayDesc,GArrayDesc> outMeta(const GMatDesc&, const GOpaqueDesc&, float, int) {
+            return std::make_tuple(empty_array_desc(), empty_array_desc());
+        }
+    };
+
+    G_TYPED_KERNEL(GParseSSD, <GRects(GMat, GOpaque<Size>, float, bool, bool)>,
+                   "org.opencv.nn.parsers.parseSSD") {
+        static GArrayDesc outMeta(const GMatDesc&, const GOpaqueDesc&, float, bool, bool) {
+            return empty_array_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GParseYolo, <GDetections(GMat, GOpaque<Size>, float, float, std::vector<float>)>,
+                   "org.opencv.nn.parsers.parseYolo") {
+        static std::tuple<GArrayDesc, GArrayDesc> outMeta(const GMatDesc&, const GOpaqueDesc&,
+                                                          float, float, const std::vector<float>&) {
+            return std::make_tuple(empty_array_desc(), empty_array_desc());
+        }
+        static const std::vector<float>& defaultAnchors() {
+            static std::vector<float> anchors {
+                0.57273f, 0.677385f, 1.87446f, 2.06253f, 3.33843f, 5.47434f, 7.88282f, 3.52778f, 9.77052f, 9.16828f
+            };
+            return anchors;
+        }
+    };
+} // namespace parsers
+} // namespace nn
+
+/** @brief Parses output of SSD network.
+
+Extracts detection information (box, confidence, label) from SSD output and
+filters it by given confidence and label.
+
+@note Function textual ID is "org.opencv.nn.parsers.parseSSD_BL"
+
+@param in Input CV_32F tensor with {1,1,N,7} dimensions.
+@param inSz Size to project detected boxes to (size of the input image).
+@param confidenceThreshold If confidence of the
+detection is smaller than confidence threshold, detection is rejected.
+@param filterLabel If provided (!= -1), only detections with
+given label will get to the output.
+@return a tuple with a vector of detected boxes and a vector of appropriate labels.
+*/
+GAPI_EXPORTS_W std::tuple<GArray<Rect>, GArray<int>> parseSSD(const GMat& in,
+                                                              const GOpaque<Size>& inSz,
+                                                              const float confidenceThreshold = 0.5f,
+                                                              const int   filterLabel = -1);
+
+/** @brief Parses output of SSD network.
+
+Extracts detection information (box, confidence) from SSD output and
+filters it by given confidence and by going out of bounds.
+
+@note Function textual ID is "org.opencv.nn.parsers.parseSSD"
+
+@param in Input CV_32F tensor with {1,1,N,7} dimensions.
+@param inSz Size to project detected boxes to (size of the input image).
+@param confidenceThreshold If confidence of the
+detection is smaller than confidence threshold, detection is rejected.
+@param alignmentToSquare If provided true, bounding boxes are extended to squares.
+The center of the rectangle remains unchanged, the side of the square is
+the larger side of the rectangle.
+@param filterOutOfBounds If provided true, out-of-frame boxes are filtered.
+@return a vector of detected bounding boxes.
+*/
+GAPI_EXPORTS_W GArray<Rect> parseSSD(const GMat& in,
+                                     const GOpaque<Size>& inSz,
+                                     const float confidenceThreshold,
+                                     const bool alignmentToSquare,
+                                     const bool filterOutOfBounds);
+
+/** @brief Parses output of Yolo network.
+
+Extracts detection information (box, confidence, label) from Yolo output,
+filters it by given confidence and performs non-maximum suppression for overlapping boxes.
+
+@note Function textual ID is "org.opencv.nn.parsers.parseYolo"
+
+@param in Input CV_32F tensor with {1,13,13,N} dimensions, N should satisfy:
+\f[\texttt{N} = (\texttt{num_classes} + \texttt{5}) * \texttt{5},\f]
+where num_classes - a number of classes Yolo network was trained with.
+@param inSz Size to project detected boxes to (size of the input image).
+@param confidenceThreshold If confidence of the
+detection is smaller than confidence threshold, detection is rejected.
+@param nmsThreshold Non-maximum suppression threshold which controls minimum
+relative box intersection area required for rejecting the box with a smaller confidence.
+If 1.f, nms is not performed and no boxes are rejected.
+@param anchors Anchors Yolo network was trained with.
+@note The default anchor values are specified for YOLO v2 Tiny as described in Intel Open Model Zoo
+<a href="https://github.com/openvinotoolkit/open_model_zoo/blob/master/models/public/yolo-v2-tiny-tf/yolo-v2-tiny-tf.md">documentation</a>.
+@return a tuple with a vector of detected boxes and a vector of appropriate labels.
+*/
+GAPI_EXPORTS_W std::tuple<GArray<Rect>, GArray<int>> parseYolo(const GMat& in,
+                                                               const GOpaque<Size>& inSz,
+                                                               const float confidenceThreshold = 0.5f,
+                                                               const float nmsThreshold = 0.5f,
+                                                               const std::vector<float>& anchors
+                                                                   = nn::parsers::GParseYolo::defaultAnchors());
+
+} // namespace gapi
+} // namespace cv
+
+// Reimport parseSSD & parseYolo under their initial namespace
+namespace cv {
+namespace gapi {
+namespace streaming {
+
+using cv::gapi::parseSSD;
+using cv::gapi::parseYolo;
+
+} // namespace streaming
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_PARSERS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/media.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/media.hpp
new file mode 100644
index 000000000000..1470f00d042e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/media.hpp
@@ -0,0 +1,258 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2020 Intel Corporation
+
+#ifndef OPENCV_GAPI_MEDIA_HPP
+#define OPENCV_GAPI_MEDIA_HPP
+
+#include <memory>     // unique_ptr<>, shared_ptr<>
+#include <array>      // array<>
+#include <functional> // function<>
+#include <utility>    // forward<>()
+
+#include <opencv2/gapi/gframe.hpp>
+#include <opencv2/gapi/util/any.hpp>
+
+// Forward declaration
+namespace cv {
+namespace gapi {
+namespace s11n {
+struct IOStream;
+struct IIStream;
+} // namespace s11n
+} // namespace gapi
+} // namespace cv
+
+namespace cv {
+
+/** \addtogroup gapi_data_structures
+ * @{
+ *
+ * @brief Extra G-API data structures used to pass input/output data
+ * to the graph for processing.
+ */
+
+/**
+ * @brief cv::MediaFrame class represents an image/media frame
+ * obtained from an external source.
+ *
+ * cv::MediaFrame represents image data as specified in
+ * cv::MediaFormat. cv::MediaFrame is designed to be a thin wrapper over some
+ * external memory or buffer; the class itself provides a uniform
+ * interface over such types of memory. cv::MediaFrame wraps data from
+ * a camera driver or from a media codec and provides an abstraction
+ * layer over this memory to G-API. MediaFrame defines a compact interface
+ * to access and manage the underlying data; the implementation is
+ * fully defined by the associated Adapter (which is usually
+ * user-defined).
+ *
+ * @sa cv::RMat
+ */
+class GAPI_EXPORTS MediaFrame {
+public:
+    /// This enum defines different types of cv::MediaFrame provided
+    /// access to the underlying data. Note that different flags can't
+    /// be combined in this version.
+    enum class Access {
+        R, ///< Access data for reading
+        W, ///< Access data for writing
+    };
+    class IAdapter;
+    class View;
+    using AdapterPtr = std::unique_ptr<IAdapter>;
+
+    /**
+     * @brief Constructs an empty MediaFrame
+     *
+     * The constructed object has no data associated with it.
+     */
+    MediaFrame();
+
+    /**
+     * @brief Constructs a MediaFrame with the given
+     * Adapter. MediaFrame takes ownership over the passed adapter.
+     *
+     * @param p a unique pointer to an instance of an IAdapter-derived class.
+     */
+    explicit MediaFrame(AdapterPtr &&p);
+
+    /**
+     * @overload
+     * @brief Constructs a MediaFrame with the given parameters for
+     * the Adapter. The adapter of type `T` is constructed on the fly.
+     *
+     * @param args list of arguments to construct an adapter of type
+     * `T`.
+     */
+    template<class T, class... Args> static cv::MediaFrame Create(Args&&... args);
+
+    /**
+     * @brief Obtain access to the underlying data with the given
+     * mode.
+     *
+     * Depending on the associated Adapter and the data wrapped, this
+     * method may be cheap (e.g., the underlying memory is local) or
+     * costly (if the underlying memory is external or device
+     * memory).
+     *
+     * @param mode an access mode flag
+     * @return a MediaFrame::View object. The views should be handled
+     * carefully, refer to the MediaFrame::View documentation for details.
+     */
+    View access(Access mode) const;
+
+    /**
+     * @brief Returns a media frame descriptor -- the information
+     * about the media format, dimensions, etc.
+     * @return a cv::GFrameDesc
+     */
+    cv::GFrameDesc desc() const;
+
+    // FIXME: design a better solution
+    // Should be used only if the actual adapter provides implementation
+    /// @private -- exclude from the OpenCV documentation for now.
+    cv::util::any blobParams() const;
+
+    /**
+     * @brief Casts and returns the associated MediaFrame adapter to
+     * the particular adapter type `T`, returns nullptr if the type is
+     * different.
+     *
+     * This method may be useful if the adapter type is known by the
+     * caller, and some lower level access to the memory is required.
+     * Depending on the memory type, it may be more efficient than
+     * access().
+     *
+     * @return a pointer to the adapter object, nullptr if the adapter
+     * type is different.
+     */
+    template<typename T> T* get() const {
+        static_assert(std::is_base_of<IAdapter, T>::value,
+                      "T is not derived from cv::MediaFrame::IAdapter!");
+        auto* adapter = getAdapter();
+        GAPI_Assert(adapter != nullptr);
+        return dynamic_cast<T*>(adapter);
+    }
+
+    /**
+     * @brief Serialize MediaFrame's data to a byte array.
+     *
+     * @note The actual logic is implemented by frame's adapter class.
+     * Does nothing by default.
+     *
+     * @param os Bytestream to store serialized MediaFrame data in.
+     */
+    void serialize(cv::gapi::s11n::IOStream& os) const;
+
+private:
+    struct Priv;
+    std::shared_ptr<Priv> m;
+    IAdapter* getAdapter() const;
+};
+
+template<class T, class... Args>
+inline cv::MediaFrame cv::MediaFrame::Create(Args&&... args) {
+    // Construct the adapter from the forwarded arguments and pass its ownership to the frame.
+    return cv::MediaFrame(std::unique_ptr<T>(new T(std::forward<Args>(args)...)));
+}
+
+/**
+ * @brief Provides access to the MediaFrame's underlying data.
+ *
+ * This object contains the necessary information to access the pixel
+ * data of the associated MediaFrame: arrays of pointers and strides
+ * (distance between every plane row, in bytes) for every image
+ * plane, as defined in cv::MediaFormat.
+ * There may be up to four image planes in MediaFrame.
+ *
+ * Depending on the MediaFrame::Access flag passed in
+ * MediaFrame::access(), a MediaFrame::View may be read- or
+ * write-only.
+ *
+ * Depending on the MediaFrame::IAdapter implementation associated
+ * with the parent MediaFrame, writing to memory with
+ * MediaFrame::Access::R flag may have no effect or lead to
+ * undefined behavior. Same applies to reading the memory with
+ * MediaFrame::Access::W flag -- again, depending on the IAdapter
+ * implementation, the host-side buffer the view provides access to
+ * may have no current data stored in (so in-place editing of the
+ * buffer contents may not be possible).
+ *
+ * MediaFrame::View objects must be handled carefully, as an external
+ * resource associated with MediaFrame may be locked for the time the
+ * MediaFrame::View object exists. Obtaining MediaFrame::View should
+ * be seen as "map" and destroying it as "unmap" in the "map/unmap"
+ * idiom (applicable to OpenCL, device memory, remote
+ * memory).
+ *
+ * When a MediaFrame buffer is accessed for writing, and the memory
+ * under MediaFrame::View::Ptrs is altered, the data synchronization
+ * of a host-side and device/remote buffer is not guaranteed until the
+ * MediaFrame::View is destroyed. In other words, the real data on the
+ * device or in a remote target may be updated at the MediaFrame::View
+ * destruction only -- but it depends on the associated
+ * MediaFrame::IAdapter implementation.
+ */
+class GAPI_EXPORTS MediaFrame::View final {
+public:
+    static constexpr const size_t MAX_PLANES = 4; // upper bound on the number of image planes
+    using Ptrs     = std::array<void*, MAX_PLANES>;       // one data pointer per plane
+    using Strides  = std::array<std::size_t, MAX_PLANES>; // in bytes
+    using Callback = std::function<void()>;               // parameterless hook, stored in m_cb
+
+    /// @private
+    View(Ptrs&& ptrs, Strides&& strs, Callback &&cb = [](){}); // cb defaults to an empty lambda
+
+    /// @private
+    View(const View&) = delete; // a View cannot be copy-constructed...
+
+    /// @private
+    View(View&&) = default; // ...but it can be move-constructed
+
+    /// @private
+    View& operator = (const View&) = delete; // copy assignment is disabled as well
+
+    ~View(); // defined out of line; NOTE(review): presumably runs m_cb ("unmap") -- confirm
+
+    Ptrs    ptr; ///< Array of image plane pointers
+    Strides stride; ///< Array of image plane strides, in bytes.
+
+private:
+    Callback m_cb; // NOTE(review): presumably the cb handed to the constructor -- confirm
+};
+
+/**
+ * @brief An interface class for MediaFrame data adapters.
+ *
+ * Implement this interface to wrap media data in the MediaFrame. It
+ * makes sense to implement this class if there is a custom
+ * cv::gapi::wip::IStreamSource defined -- in this case, a stream
+ * source can produce MediaFrame objects with this adapter and the
+ * media data may be passed to graph without any copy. For example, a
+ * GStreamer-based stream source can implement an adapter over
+ * `GstBuffer` and G-API will transparently use it in the graph.
+ */
+class GAPI_EXPORTS MediaFrame::IAdapter {
+public:
+    virtual ~IAdapter() = 0; // pure virtual, so IAdapter itself cannot be instantiated
+    virtual cv::GFrameDesc meta() const = 0; // descriptor of the wrapped media data
+    virtual MediaFrame::View access(MediaFrame::Access) = 0; // build a View for the given access mode
+    // FIXME: design a better solution
+    // Not pure virtual, so overriding is optional; the default implementation does nothing
+    virtual cv::util::any blobParams() const;
+    virtual void serialize(cv::gapi::s11n::IOStream&) { // default only calls GAPI_Error
+        GAPI_Error("Generic serialize method of MediaFrame::IAdapter does nothing by default. "
+                             "Please, implement it in derived class to properly serialize the object.");
+    }
+    virtual void deserialize(cv::gapi::s11n::IIStream&) { // default only calls GAPI_Error
+        GAPI_Error("Generic deserialize method of MediaFrame::IAdapter does nothing by default. "
+                             "Please, implement it in derived class to properly deserialize the object.");
+    }
+};
+/** @} */
+
+} //namespace cv
+
+#endif // OPENCV_GAPI_MEDIA_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/oak/infer.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/oak/infer.hpp
new file mode 100644
index 000000000000..4a1b9f6db6a2
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/oak/infer.hpp
@@ -0,0 +1,66 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2022 Intel Corporation
+
+#ifndef OPENCV_GAPI_OAK_INFER_HPP
+#define OPENCV_GAPI_OAK_INFER_HPP
+
+#include <unordered_map>
+#include <string>
+#include <array>
+#include <tuple>
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/util/any.hpp>
+
+#include <opencv2/core/cvdef.h>     // GAPI_EXPORTS
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+
+namespace cv {
+namespace gapi {
+namespace oak {
+
+namespace detail {
+/**
+* @brief This structure contains the description of inference parameters
+* which are specific to OAK models.
+*/
+struct ParamDesc {
+    std::string blob_file; ///< Path to the model (.blob file)
+};
+} // namespace detail
+
+// Declared in <opencv2/gapi/oak/oak.hpp>; repeated here so that the qualified call in
+// Params::backend() resolves even when this header is included on its own.
+GAPI_EXPORTS cv::gapi::GBackend backend();
+
+/**
+ * Contains description of inference parameters and kit of functions that
+ * fill these parameters.
+ */
+template<typename Net> class Params {
+public:
+    /** @brief Class constructor: stores the model path in the parameter description.
+     * @param model Path to model (.blob file)
+     */
+    explicit Params(const std::string &model) {
+        desc.blob_file = model;
+    }
+
+    // BEGIN(G-API's network parametrization API)
+    GBackend      backend() const { return cv::gapi::oak::backend(); }
+    std::string   tag()     const { return Net::tag(); }
+    cv::util::any params()  const { return { desc }; }
+    // END(G-API's network parametrization API)
+
+protected:
+    detail::ParamDesc desc;
+};
+
+} // namespace oak
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_OAK_INFER_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/oak/oak.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/oak/oak.hpp
new file mode 100644
index 000000000000..8b56b8a3658b
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/oak/oak.hpp
@@ -0,0 +1,158 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+#ifndef OPENCV_GAPI_OAK_HPP
+#define OPENCV_GAPI_OAK_HPP
+
+#include <opencv2/gapi/garg.hpp>       // IStreamSource
+#include <opencv2/gapi/gkernel.hpp>    // GKernelPackage
+#include <opencv2/gapi/gstreaming.hpp> // GOptRunArgsP
+
+namespace cv {
+namespace gapi {
+namespace oak {
+
+// FIXME: copypasted from dai library
+struct EncoderConfig {
+    /**
+     * Rate control mode specifies if constant or variable bitrate should be used (H264 / H265)
+     */
+    enum class RateControlMode: int { CBR, VBR };
+
+    /**
+     * Encoding profile, H264, H265 or MJPEG
+     */
+    enum class Profile: int { H264_BASELINE, H264_HIGH, H264_MAIN, H265_MAIN, MJPEG };
+    /**
+     * Specifies preferred bitrate (kb) of compressed output bitstream
+     */
+    std::int32_t bitrate = 8000;
+    /**
+     * A keyframe is inserted every keyframeFrequency frames
+     */
+    std::int32_t keyframeFrequency = 30;
+    /**
+     * Specifies maximum bitrate (kb) of compressed output bitstream
+     */
+    std::int32_t maxBitrate = 8000;
+    /**
+     * Specifies number of B frames to be inserted
+     */
+    std::int32_t numBFrames = 0;
+    /**
+     * This option specifies how many frames are available in this node's pool (can help if
+     * the receiver node is slow at consuming)
+     */
+    std::uint32_t numFramesPool = 4;
+    /**
+     * Selected encoding profile (see Profile); defaults to H265_MAIN
+     */
+    Profile profile = Profile::H265_MAIN;
+    /**
+     * Value between 0-100% (approximates quality)
+     */
+    std::int32_t quality = 80;
+    /**
+     * Lossless mode ([M]JPEG only)
+     */
+    bool lossless = false;
+    /**
+     * Selected rate control mode (see RateControlMode); defaults to CBR
+     */
+    RateControlMode rateCtrlMode = RateControlMode::CBR;
+    /**
+     * Input and compressed output frame width
+     */
+    std::int32_t width = 1920;
+    /**
+     * Input and compressed output frame height
+     */
+    std::int32_t height = 1080;
+    /**
+     * Frame rate
+     */
+    float frameRate = 30.0f;
+};
+
+G_API_OP(GEncFrame, <GArray<uint8_t>(GFrame, EncoderConfig)>, "org.opencv.oak.enc_frame") {
+    static GArrayDesc outMeta(const GFrameDesc&, const EncoderConfig&) {
+        return cv::empty_array_desc(); // output metadata does not depend on either input
+    }
+};
+
+G_API_OP(GSobelXY, <GFrame(GFrame, const cv::Mat&, const cv::Mat&)>, "org.opencv.oak.sobelxy") {
+    static GFrameDesc outMeta(const GFrameDesc& in, const cv::Mat&, const cv::Mat&) {
+        return in; // output frame keeps the input frame's descriptor; both matrices are ignored here
+    }
+};
+
+G_API_OP(GCopy, <GFrame(GFrame)>, "org.opencv.oak.copy") {
+    static GFrameDesc outMeta(const GFrameDesc& in) {
+        return in; // output frame keeps the input frame's descriptor
+    }
+};
+
+// FIXME: add documentation on operations below
+
+GAPI_EXPORTS GArray<uint8_t> encode(const GFrame& in, const EncoderConfig&);
+
+GAPI_EXPORTS GFrame sobelXY(const GFrame& in,
+                            const cv::Mat& hk,
+                            const cv::Mat& vk);
+
+GAPI_EXPORTS GFrame copy(const GFrame& in);
+
+// OAK backend & kernels ////////////////////////////////////////////////////////
+GAPI_EXPORTS cv::gapi::GBackend backend();
+GAPI_EXPORTS cv::gapi::GKernelPackage kernels();
+
+// Camera object ///////////////////////////////////////////////////////////////
+
+struct GAPI_EXPORTS ColorCameraParams {
+    /**
+     * Whether the frames one gets from the camera are interleaved (off by default)
+     */
+    bool interleaved = false;
+
+    // FIXME: extend
+    enum class BoardSocket: int { RGB, BGR };
+
+    BoardSocket board_socket = BoardSocket::RGB; // socket used unless overridden
+
+    // FIXME: extend
+    enum class Resolution: int { THE_1080_P };
+
+    Resolution resolution = Resolution::THE_1080_P; // the only resolution listed so far
+};
+
+class GAPI_EXPORTS ColorCamera: public cv::gapi::wip::IStreamSource {
+    cv::MediaFrame m_dummy;     // NOTE(review): purpose not visible in this header -- confirm
+    ColorCameraParams m_params; // NOTE(review): presumably set by the params constructor -- confirm
+
+    virtual bool pull(cv::gapi::wip::Data &data) override; // private override of IStreamSource
+    virtual GMetaArg descr_of() const override;            // private override of IStreamSource
+
+public:
+    ColorCamera();
+    explicit ColorCamera(const ColorCameraParams& params); // explicit: no implicit conversion from params
+};
+
+} // namespace oak
+} // namespace gapi
+
+namespace detail { // CompileArgTag specializations giving the OAK parameter structs a string tag
+template<> struct CompileArgTag<gapi::oak::ColorCameraParams> {
+    static const char* tag() { return "gapi.oak.colorCameraParams"; }
+};
+
+template<> struct CompileArgTag<gapi::oak::EncoderConfig> {
+    static const char* tag() { return "gapi.oak.encoderConfig"; }
+};
+} // namespace detail
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_OAK_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/ocl/core.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/ocl/core.hpp
new file mode 100644
index 000000000000..b79aace0ca7b
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/ocl/core.hpp
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OCL_CORE_API_HPP
+#define OPENCV_GAPI_OCL_CORE_API_HPP
+
+#include <opencv2/core/cvdef.h>     // GAPI_EXPORTS
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+
+namespace cv {
+namespace gapi {
+namespace core {
+namespace ocl {
+
+GAPI_EXPORTS_W cv::GKernelPackage kernels();
+
+} // namespace ocl
+} // namespace core
+} // namespace gapi
+} // namespace cv
+
+
+#endif // OPENCV_GAPI_OCL_CORE_API_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/ocl/goclkernel.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/ocl/goclkernel.hpp
new file mode 100644
index 000000000000..b09082282f55
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/ocl/goclkernel.hpp
@@ -0,0 +1,260 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2020 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GOCLKERNEL_HPP
+#define OPENCV_GAPI_GOCLKERNEL_HPP
+
+#include <vector>
+#include <functional>
+#include <map>
+#include <unordered_map>
+
+#include <opencv2/core/mat.hpp>
+#include <opencv2/gapi/gcommon.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+#include <opencv2/gapi/garg.hpp>
+
+// FIXME: namespace scheme for backends?
+namespace cv {
+
+namespace gimpl
+{
+    // Forward-declare an internal class
+    class GOCLExecutable;
+} // namespace gimpl
+
+namespace gapi
+{
+/**
+ * @brief This namespace contains G-API OpenCL backend functions, structures, and symbols.
+ */
+namespace ocl
+{
+    /**
+     * \addtogroup gapi_std_backends G-API Standard Backends
+     * @{
+     */
+    /**
+     * @brief Get a reference to OCL backend.
+     *
+     * At the moment, the OCL backend is built atop of OpenCV
+     * "Transparent API" (T-API), see cv::UMat for details.
+     *
+     * @sa gapi_std_backends
+     */
+    GAPI_EXPORTS cv::gapi::GBackend backend();
+    /** @} */
+} // namespace ocl
+} // namespace gapi
+
+
+// Represents arguments which are passed to a wrapped OCL function
+// FIXME: put into detail?
+class GAPI_EXPORTS GOCLContext
+{
+public:
+    // Generic accessor API: typed read of an input argument; at() rejects an out-of-range index
+    template<typename T>
+    const T& inArg(int input) { return m_args.at(input).get<T>(); }
+
+    // Syntax sugar
+    const cv::UMat&  inMat(int input);
+    cv::UMat&  outMatR(int output); // FIXME: Avoid cv::UMat m = ctx.outMatR()
+
+    const cv::Scalar& inVal(int input);
+    cv::Scalar& outValR(int output); // FIXME: Avoid cv::Scalar s = ctx.outValR()
+    template<typename T> std::vector<T>& outVecR(int output) // FIXME: the same issue
+    {
+        return outVecRef(output).wref<T>(); // writable typed view of the output vector
+    }
+    template<typename T> T& outOpaqueR(int output) // FIXME: the same issue
+    {
+        return outOpaqueRef(output).wref<T>(); // writable typed view of the output opaque value
+    }
+
+protected:
+    detail::VectorRef& outVecRef(int output);
+    detail::OpaqueRef& outOpaqueRef(int output);
+
+    std::vector<GArg> m_args; // input arguments, read by inArg()
+    std::unordered_map<std::size_t, GRunArgP> m_results; // NOTE(review): presumably outputs keyed by index -- confirm
+
+
+    friend class gimpl::GOCLExecutable; // the executable gets access to the protected members above
+};
+
+class GAPI_EXPORTS GOCLKernel
+{
+public:
+    // This function is the kernel's execution entry point (does the processing work)
+    using F = std::function<void(GOCLContext &)>;
+
+    GOCLKernel();
+    explicit GOCLKernel(const F& f); // explicit: a callable is not silently converted to a kernel
+
+    void apply(GOCLContext &ctx); // defined out of line; NOTE(review): presumably invokes m_f(ctx) -- confirm
+
+protected:
+    F m_f; // NOTE(review): presumably the callable given to the constructor -- confirm
+};
+
+// FIXME: This is an ugly ad-hoc implementation. TODO: refactor
+
+namespace detail
+{
+template<class T> struct ocl_get_in; // maps a G-API input type to the value fetched from GOCLContext
+template<> struct ocl_get_in<cv::GMat> // GMat input: returned as cv::UMat (by value)
+{
+    static cv::UMat    get(GOCLContext &ctx, int idx) { return ctx.inMat(idx); }
+};
+template<> struct ocl_get_in<cv::GScalar> // GScalar input: returned as cv::Scalar (by value)
+{
+    static cv::Scalar get(GOCLContext &ctx, int idx) { return ctx.inVal(idx); }
+};
+template<typename U> struct ocl_get_in<cv::GArray<U> > // GArray<U> input: read-only std::vector<U>
+{
+    static const std::vector<U>& get(GOCLContext &ctx, int idx) { return ctx.inArg<VectorRef>(idx).rref<U>(); }
+};
+template<> struct ocl_get_in<cv::GFrame> // GFrame input: returned as cv::MediaFrame (by value)
+{
+    static cv::MediaFrame get(GOCLContext &ctx, int idx) { return ctx.inArg<cv::MediaFrame>(idx); }
+};
+template<typename U> struct ocl_get_in<cv::GOpaque<U> > // GOpaque<U> input: read-only U
+{
+    static const U& get(GOCLContext &ctx, int idx) { return ctx.inArg<OpaqueRef>(idx).rref<U>(); }
+};
+template<class T> struct ocl_get_in // fallback: any other type is fetched as-is via inArg<T>
+{
+    static T get(GOCLContext &ctx, int idx) { return ctx.inArg<T>(idx); }
+};
+
+struct tracked_cv_umat{ // wraps a reference to an output cv::UMat; the reallocation check is currently disabled
+    //TODO Think if T - API could reallocate UMat to a proper size - how do we handle this ?
+    //tracked_cv_umat(cv::UMat& m) : r{(m)}, original_data{m.getMat(ACCESS_RW).data} {}
+    tracked_cv_umat(cv::UMat& m) : r(m), original_data{ nullptr } {}
+    cv::UMat &r; // FIXME: It was a value (not a reference) before.
+                 // Actually OCL backend should allocate its internal data!
+    uchar* original_data; // always nullptr for now (see the constructor above)
+
+    operator cv::UMat& (){ return r;} // lets the wrapper be passed where a cv::UMat& is expected
+    void validate() const{ // no-op at present: the check below is commented out
+        //if (r.getMat(ACCESS_RW).data != original_data)
+        //{
+        //    util::throw_error
+        //        (std::logic_error
+        //         ("OpenCV kernel output parameter was reallocated. \n"
+        //          "Incorrect meta data was provided ?"));
+        //}
+
+    }
+};
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4702)  // unreachable code
+#endif
+template<typename... Outputs>
+void postprocess_ocl(Outputs&... outs) // calls validate() on every tracked_cv_umat output, ignores the rest
+{
+    struct
+    {
+        void operator()(tracked_cv_umat* bm) { bm->validate(); } // chosen for tracked UMat outputs
+        void operator()(...) {                  }                // catch-all for any other output type
+
+    } validate;
+    //dummy array to unfold parameter pack; the leading 0 keeps it non-empty for zero outputs
+    int dummy[] = { 0, (validate(&outs), 0)... };
+    cv::util::suppress_unused_warning(dummy);
+}
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+template<class T> struct ocl_get_out; // maps a G-API output type to the object a kernel writes into
+template<> struct ocl_get_out<cv::GMat> // GMat output: UMat reference wrapped in tracked_cv_umat
+{
+    static tracked_cv_umat get(GOCLContext &ctx, int idx)
+    {
+        auto& r = ctx.outMatR(idx);
+        return{ r };
+    }
+};
+template<> struct ocl_get_out<cv::GScalar> // GScalar output: writable cv::Scalar
+{
+    static cv::Scalar& get(GOCLContext &ctx, int idx)
+    {
+        return ctx.outValR(idx);
+    }
+};
+template<typename U> struct ocl_get_out<cv::GArray<U> > // GArray<U> output: writable std::vector<U>
+{
+    static std::vector<U>& get(GOCLContext &ctx, int idx) { return ctx.outVecR<U>(idx);  }
+};
+template<typename U> struct ocl_get_out<cv::GOpaque<U> > // GOpaque<U> output: writable U
+{
+    static U& get(GOCLContext &ctx, int idx) { return ctx.outOpaqueR<U>(idx);  }
+};
+
+template<typename, typename, typename>
+struct OCLCallHelper; // only the <Impl, tuple<Ins...>, tuple<Outs...>> form below is defined
+
+// FIXME: probably can be simplified with std::apply or analogue.
+template<typename Impl, typename... Ins, typename... Outs>
+struct OCLCallHelper<Impl, std::tuple<Ins...>, std::tuple<Outs...> >
+{
+    template<typename... Inputs>
+    struct call_and_postprocess
+    {
+        template<typename... Outputs>
+        static void call(Inputs&&... ins, Outputs&&... outs)
+        {
+            //not using a std::forward on outs is deliberate in order to
+            //cause compilation error, by trying to bind rvalue references to lvalue references
+            Impl::run(std::forward<Inputs>(ins)..., outs...);
+
+            postprocess_ocl(outs...); // runs after the kernel body has returned
+        }
+    };
+
+    template<int... IIs, int... OIs>
+    static void call_impl(GOCLContext &ctx, detail::Seq<IIs...>, detail::Seq<OIs...>)
+    {
+        //TODO: Make sure that OpenCV kernels do not reallocate memory for output parameters
+        //by comparing its state (data ptr) before and after the call.
+        //Convert own::Scalar to cv::Scalar before call kernel and run kernel
+        //convert cv::Scalar to own::Scalar after call kernel and write back results
+        call_and_postprocess<decltype(ocl_get_in<Ins>::get(ctx, IIs))...>::call(ocl_get_in<Ins>::get(ctx, IIs)..., ocl_get_out<Outs>::get(ctx, OIs)...);
+    }
+
+    static void call(GOCLContext &ctx) // entry point: one index sequence per input and per output
+    {
+        call_impl(ctx,
+            typename detail::MkSeq<sizeof...(Ins)>::type(),
+            typename detail::MkSeq<sizeof...(Outs)>::type());
+    }
+};
+
+} // namespace detail
+
+template<class Impl, class K>
+class GOCLKernelImpl: public cv::detail::OCLCallHelper<Impl, typename K::InArgs, typename K::OutArgs>,
+                      public cv::detail::KernelTag
+{
+    using P = detail::OCLCallHelper<Impl, typename K::InArgs, typename K::OutArgs>; // shorthand for the call helper base
+
+public:
+    using API = K; // the kernel interface this implementation belongs to
+
+    static cv::gapi::GBackend backend()  { return cv::gapi::ocl::backend(); } // always the OCL backend
+    static cv::GOCLKernel     kernel()   { return GOCLKernel(&P::call);     } // wraps the helper's static call()
+};
+
+#define GAPI_OCL_KERNEL(Name, API) struct Name: public cv::GOCLKernelImpl<Name, API>
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GOCLKERNEL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/ocl/imgproc.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/ocl/imgproc.hpp
new file mode 100644
index 000000000000..1bb5911b1869
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/ocl/imgproc.hpp
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OCL_IMGPROC_API_HPP
+#define OPENCV_GAPI_OCL_IMGPROC_API_HPP
+
+#include <opencv2/core/cvdef.h>     // GAPI_EXPORTS
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+
+namespace cv {
+namespace gapi {
+namespace imgproc {
+namespace ocl {
+
+    GAPI_EXPORTS GKernelPackage kernels();
+
+} // namespace ocl
+} // namespace imgproc
+} // namespace gapi
+} // namespace cv
+
+
+#endif // OPENCV_GAPI_OCL_IMGPROC_API_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/opencv_includes.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/opencv_includes.hpp
new file mode 100644
index 000000000000..7c2c42d8a2b3
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/opencv_includes.hpp
@@ -0,0 +1,42 @@
+// This file is part of OpenCV project.
+
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OPENCV_INCLUDES_HPP
+#define OPENCV_GAPI_OPENCV_INCLUDES_HPP
+
+#if !defined(GAPI_STANDALONE)
+#  include <opencv2/core/mat.hpp>
+#  include <opencv2/core/cvdef.h>
+#  include <opencv2/core/types.hpp>
+#  include <opencv2/core/base.hpp>
+#define GAPI_OWN_TYPES_LIST     cv::gapi::own::Rect,                           \
+                                cv::gapi::own::Size,                           \
+                                cv::gapi::own::Point,                          \
+                                cv::gapi::own::Point2f,                        \
+                                cv::gapi::own::Scalar,                         \
+                                cv::gapi::own::Mat
+#else   // Without OpenCV
+#  include <opencv2/gapi/own/cvdefs.hpp>
+#  include <opencv2/gapi/own/types.hpp>  // cv::gapi::own::Rect/Size/Point
+#  include <opencv2/gapi/own/scalar.hpp> // cv::gapi::own::Scalar
+#  include <opencv2/gapi/own/mat.hpp>
+// replacement of cv's structures:
+namespace cv {
+    using Rect    = gapi::own::Rect;
+    using Size    = gapi::own::Size;
+    using Point   = gapi::own::Point;
+    using Point2f = gapi::own::Point2f;
+    using Point3f = gapi::own::Point3f;
+    using Scalar  = gapi::own::Scalar;
+    using Mat     = gapi::own::Mat;
+}  // namespace cv
+#define GAPI_OWN_TYPES_LIST     cv::gapi::own::VoidType
+
+#endif // !defined(GAPI_STANDALONE)
+
+#endif // OPENCV_GAPI_OPENCV_INCLUDES_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/operators.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/operators.hpp
new file mode 100644
index 000000000000..6794b44b6e46
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/operators.hpp
@@ -0,0 +1,70 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OPERATORS_HPP
+#define OPENCV_GAPI_OPERATORS_HPP
+
+#include <opencv2/gapi/gmat.hpp>
+#include <opencv2/gapi/gscalar.hpp>
+
+namespace cv
+{
+GAPI_EXPORTS cv::GMat operator+(const cv::GMat&    lhs, const cv::GMat&    rhs);
+
+GAPI_EXPORTS cv::GMat operator+(const cv::GMat&    lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator+(const cv::GScalar& lhs, const cv::GMat&    rhs);
+
+GAPI_EXPORTS cv::GMat operator-(const cv::GMat&    lhs, const cv::GMat&    rhs);
+
+GAPI_EXPORTS cv::GMat operator-(const cv::GMat&    lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator-(const cv::GScalar& lhs, const cv::GMat&    rhs);
+
+GAPI_EXPORTS cv::GMat operator*(const cv::GMat&    lhs, float              rhs);
+GAPI_EXPORTS cv::GMat operator*(float              lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator*(const cv::GMat&    lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator*(const cv::GScalar& lhs, const cv::GMat&    rhs);
+
+GAPI_EXPORTS cv::GMat operator/(const cv::GMat&    lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator/(const cv::GScalar& lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator/(const cv::GMat&    lhs, const cv::GMat&    rhs);
+
+GAPI_EXPORTS cv::GMat operator&(const cv::GMat&    lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator|(const cv::GMat&    lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator^(const cv::GMat&    lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator~(const cv::GMat&    lhs);
+
+GAPI_EXPORTS cv::GMat operator&(const cv::GScalar& lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator|(const cv::GScalar& lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator^(const cv::GScalar& lhs, const cv::GMat&    rhs);
+
+GAPI_EXPORTS cv::GMat operator&(const cv::GMat& lhs, const cv::GScalar&    rhs);
+GAPI_EXPORTS cv::GMat operator|(const cv::GMat& lhs, const cv::GScalar&    rhs);
+GAPI_EXPORTS cv::GMat operator^(const cv::GMat& lhs, const cv::GScalar&    rhs);
+
+GAPI_EXPORTS cv::GMat operator>(const cv::GMat&    lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator>=(const cv::GMat&   lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator<(const cv::GMat&    lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator<=(const cv::GMat&   lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator==(const cv::GMat&   lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator!=(const cv::GMat&   lhs, const cv::GMat&    rhs);
+
+GAPI_EXPORTS cv::GMat operator>(const cv::GMat&    lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator>=(const cv::GMat&   lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator<(const cv::GMat&    lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator<=(const cv::GMat&   lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator==(const cv::GMat&   lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator!=(const cv::GMat&   lhs, const cv::GScalar& rhs);
+
+GAPI_EXPORTS cv::GMat operator>(const cv::GScalar&    lhs, const cv::GMat& rhs);
+GAPI_EXPORTS cv::GMat operator>=(const cv::GScalar&   lhs, const cv::GMat& rhs);
+GAPI_EXPORTS cv::GMat operator<(const cv::GScalar&    lhs, const cv::GMat& rhs);
+GAPI_EXPORTS cv::GMat operator<=(const cv::GScalar&   lhs, const cv::GMat& rhs);
+GAPI_EXPORTS cv::GMat operator==(const cv::GScalar&   lhs, const cv::GMat& rhs);
+GAPI_EXPORTS cv::GMat operator!=(const cv::GScalar&   lhs, const cv::GMat& rhs);
+} // cv
+
+#endif // OPENCV_GAPI_OPERATORS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/ot.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/ot.hpp
new file mode 100644
index 000000000000..b73d7e6ee003
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/ot.hpp
@@ -0,0 +1,194 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+#ifndef OPENCV_GAPI_OT_HPP
+#define OPENCV_GAPI_OT_HPP
+
+#include <opencv2/gapi.hpp>
+#include <opencv2/gapi/s11n.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+
+namespace cv {
+namespace gapi {
+/**
+ * @brief This namespace contains G-API Operation Types for
+ * VAS Object Tracking module functionality.
+ */
+namespace ot {
+
+/**
+ * @enum TrackingStatus
+ *
+ * Tracking status twin for vas::ot::TrackingStatus
+ */
+enum TrackingStatus
+{
+    NEW = 0,     /**< The object is newly added. */
+    TRACKED,     /**< The object is being tracked. */
+    LOST         /**< The object gets lost now. The object can be tracked again
+                      by specifying detected object manually. */
+};
+
+struct GAPI_EXPORTS_W_SIMPLE ObjectTrackerParams
+{
+    /**
+     * Maximum number of trackable objects in a frame.
+     * Valid range: 1 <= max_num_objects. Or it can be -1 if there is no limitation
+     * of maximum number in X86. KMB/TBH has limitation up to 1024.
+     * Default value is -1 which means there is no limitation in X86. KMB/TBH is -1 means 200.
+     */
+    GAPI_PROP_RW int32_t max_num_objects = -1;
+
+    /**
+     * Input color format. Supports 0(BGR), 1(NV12), 2(BGRX) and 4(I420)
+     */
+    GAPI_PROP_RW int32_t input_image_format = 0;
+
+    /**
+     * Specifies whether tracker to use detection class for keeping id of an object.
+     * If it is true, new detection will be associated from previous tracking only when
+     * those two have same class.
+     * class id of an object is fixed across video frames.
+     * If it is false, new detection can be associated across different-class objects.
+     * In this case, the class id of an object may change across video frames depending on the tracker input.
+     * It is recommended to turn this option off when it is likely that detector confuses the class of object.
+     * For example, when detector confuses bicycle and motorbike. Turning this option off will increase
+     * the tracking reliability as tracker will ignore the class label of detector.
+     * @n
+     * Default value is true.
+     */
+    GAPI_PROP_RW bool tracking_per_class = true;
+
+    bool operator==(const ObjectTrackerParams& other) const
+    {
+        return max_num_objects == other.max_num_objects
+            && input_image_format == other.input_image_format
+            && tracking_per_class == other.tracking_per_class;
+    }
+};
+
+using GTrackedInfo = std::tuple<cv::GArray<cv::Rect>, cv::GArray<int32_t>, cv::GArray<uint64_t>, cv::GArray<int>>;
+
+G_API_OP(GTrackFromMat, <GTrackedInfo(cv::GMat, cv::GArray<cv::Rect>, cv::GArray<int32_t>, float)>, "com.intel.track_from_mat")
+{
+    static std::tuple<cv::GArrayDesc, cv::GArrayDesc,
+                      cv::GArrayDesc, cv::GArrayDesc> outMeta(cv::GMatDesc, cv::GArrayDesc, cv::GArrayDesc, float)
+    {
+        return std::make_tuple(cv::empty_array_desc(), cv::empty_array_desc(),
+                               cv::empty_array_desc(), cv::empty_array_desc());
+    }
+};
+
+G_API_OP(GTrackFromFrame, <GTrackedInfo(cv::GFrame, cv::GArray<cv::Rect>, cv::GArray<int32_t>, float)>, "com.intel.track_from_frame")
+{
+    static std::tuple<cv::GArrayDesc, cv::GArrayDesc,
+                      cv::GArrayDesc, cv::GArrayDesc> outMeta(cv::GFrameDesc, cv::GArrayDesc, cv::GArrayDesc, float)
+    {
+       return std::make_tuple(cv::empty_array_desc(), cv::empty_array_desc(),
+                              cv::empty_array_desc(), cv::empty_array_desc());
+    }
+};
+
+/**
+ * @brief   Tracks objects with video frames.
+ *          If a detected object is overlapped enough with one of tracked object, the tracked object's
+ *          information is updated with the input detected object.
+ *          On the other hand, if a detected object is overlapped with none of tracked objects,
+ *          the detected object is newly added and ObjectTracker starts to track the object.
+ *          In zero term tracking type, ObjectTracker clears tracked objects in case that empty
+ *          list of detected objects is passed in.
+ *
+ * @param mat                       Input frame.
+ * @param detected_rects            Detected objects rectangles in the input frame.
+ * @param detected_class_labels     Detected objects class labels in the input frame.
+ * @param delta                     Frame_delta_t Delta time between two consecutive tracking in seconds.
+ *                                  The valid range is [0.005 ~ 0.5].
+ * @return                          Tracking results of target objects.
+ *                                  cv::GArray<cv::Rect>  Array of rectangles for tracked objects.
+ *                                  cv::GArray<int32_t>   Array of detected objects labels.
+ *                                  cv::GArray<uint64_t>  Array of tracking IDs for objects.
+ *                                                        Numbering sequence starts from 1.
+ *                                                        The value 0 means the tracking ID of this object has
+ *                                                        not been assigned.
+ *                                  cv::GArray<int>       Array of tracking statuses for objects.
+ */
+GAPI_EXPORTS_W std::tuple<cv::GArray<cv::Rect>,
+                          cv::GArray<int>,
+                          cv::GArray<uint64_t>,
+                          cv::GArray<int>>
+    track(const cv::GMat& mat,
+          const cv::GArray<cv::Rect>& detected_rects,
+          const cv::GArray<int>& detected_class_labels,
+          float delta);
+
+
+/**
+   @overload
+ * @brief   Tracks objects with video frames. Overload of track(...) for frame as GFrame.
+ *
+ * @param frame                     Input frame.
+ * @param detected_rects            Detected objects rectangles in the input frame.
+ * @param detected_class_labels     Detected objects class labels in the input frame.
+ * @param delta                     Frame_delta_t Delta time between two consecutive tracking in seconds.
+ *                                  The valid range is [0.005 ~ 0.5].
+ * @return                          Tracking results of target objects,
+ *                                  returned as a tuple of four arrays:
+ *                                  cv::GArray<cv::Rect>          Array of rectangles for tracked objects.
+ *                                  cv::GArray<int32_t>           Array of detected objects labels.
+ *                                  cv::GArray<uint64_t>          Array of tracking IDs for objects.
+ *                                                                Numbering sequence starts from 1.
+ *                                                                The value 0 means the tracking ID of this object has
+ *                                                                not been assigned.
+ *                                  cv::GArray<int>    Array of tracking statuses for objects.
+ */
+GAPI_EXPORTS_W std::tuple<cv::GArray<cv::Rect>,
+                         cv::GArray<int>,
+                         cv::GArray<uint64_t>,
+                         cv::GArray<int>>
+    track(const cv::GFrame& frame,
+          const cv::GArray<cv::Rect>& detected_rects,
+          const cv::GArray<int>& detected_class_labels,
+          float delta);
+} // namespace ot
+} // namespace gapi
+} // namespace cv
+
+// FIXME: move to a separate file?
+namespace cv
+{
+namespace detail
+{
+template<> struct CompileArgTag<cv::gapi::ot::ObjectTrackerParams>
+{
+    static const char* tag()
+    {
+        return "cv.gapi.ot.object_tracker_params";
+    }
+};
+} // namespace detail
+
+namespace gapi
+{
+namespace s11n
+{
+namespace detail
+{
+template<> struct S11N<cv::gapi::ot::ObjectTrackerParams> { // fields are written and read back in the same order
+    static void serialize(IOStream &os, const cv::gapi::ot::ObjectTrackerParams &params) {
+        os << params.max_num_objects << params.input_image_format << params.tracking_per_class;
+    }
+    static cv::gapi::ot::ObjectTrackerParams deserialize(IIStream &is) {
+        cv::gapi::ot::ObjectTrackerParams params;
+        is >> params.max_num_objects >> params.input_image_format >> params.tracking_per_class;
+        return params;
+    }
+};
+} // namespace detail
+} // namespace s11n
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_OT_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/assert.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/assert.hpp
new file mode 100644
index 000000000000..ab2fb896f127
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/assert.hpp
@@ -0,0 +1,60 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2020 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OWN_ASSERT_HPP
+#define OPENCV_GAPI_OWN_ASSERT_HPP
+
+#include <opencv2/gapi/util/compiler_hints.hpp>
+
+#define GAPI_DbgAssertNoOp(expr) {                  \
+    constexpr bool _assert_tmp = false && (expr);   \
+    cv::util::suppress_unused_warning(_assert_tmp); \
+}
+
+#if !defined(GAPI_STANDALONE)
+#include <opencv2/core/base.hpp>
+#define GAPI_Assert CV_Assert
+
+#if defined _DEBUG || defined CV_STATIC_ANALYSIS
+#  define GAPI_DbgAssert CV_DbgAssert
+#else
+#  define GAPI_DbgAssert(expr) GAPI_DbgAssertNoOp(expr)
+#endif
+
+#define GAPI_Error(msg) CV_Error(cv::Error::StsError, msg)
+
+#else
+#include <stdexcept>
+#include <sstream>
+#include <opencv2/gapi/util/throw.hpp>
+
+namespace detail
+{
+    [[noreturn]] inline void assert_abort(const char* str, int line, const char* file, const char* func)
+    {
+        std::stringstream ss;
+        ss << file << ":" << line << ": Assertion " << str << " in function " << func << " failed\n";
+        cv::util::throw_error(std::logic_error(ss.str()));
+    }
+}
+
+#define GAPI_Assert(expr) \
+{ if (!(expr)) ::detail::assert_abort(#expr, __LINE__, __FILE__, __func__); }
+
+#ifdef NDEBUG
+#  define GAPI_DbgAssert(expr) GAPI_DbgAssertNoOp(expr)
+#else
+#  define GAPI_DbgAssert(expr) GAPI_Assert(expr)
+#endif
+
+#define GAPI_Error(msg) { \
+    ::detail::assert_abort(msg, __LINE__, __FILE__, __func__); \
+}
+
+#endif // GAPI_STANDALONE
+
+#endif // OPENCV_GAPI_OWN_ASSERT_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/convert.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/convert.hpp
new file mode 100644
index 000000000000..f587e2478702
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/convert.hpp
@@ -0,0 +1,55 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OWN_CONVERT_HPP
+#define OPENCV_GAPI_OWN_CONVERT_HPP
+
+#if !defined(GAPI_STANDALONE)
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/own/mat.hpp>
+
+namespace cv
+{
+    template<typename T>
+    std::vector<T> to_own(const cv::MatSize &sz) {
+        std::vector<T> result(sz.dims());
+        for (int i = 0; i < sz.dims(); i++) {
+            // Note: cv::MatSize is not iterable
+            result[i] = static_cast<T>(sz[i]);
+        }
+        return result;
+    }
+
+    cv::gapi::own::Mat to_own(Mat&&) = delete;
+
+    inline cv::gapi::own::Mat to_own(Mat const& m) {
+        return (m.dims == 2)
+            ?  cv::gapi::own::Mat{m.rows, m.cols, m.type(), m.data, m.step}
+            :  cv::gapi::own::Mat{to_own<int>(m.size), m.type(), m.data};
+    }
+
+namespace gapi
+{
+namespace own
+{
+
+    inline cv::Mat to_ocv(Mat const& m) {
+        return m.dims.empty()
+            ? cv::Mat{m.rows, m.cols, m.type(), m.data, m.step}
+            : cv::Mat{m.dims, m.type(), m.data};
+    }
+
+    cv::Mat to_ocv(Mat&&) = delete;
+
+} // namespace own
+} // namespace gapi
+} // namespace cv
+
+#endif // !defined(GAPI_STANDALONE)
+
+#endif // OPENCV_GAPI_OWN_CONVERT_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/cvdefs.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/cvdefs.hpp
new file mode 100644
index 000000000000..d3bef98e9805
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/cvdefs.hpp
@@ -0,0 +1,166 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_CV_DEFS_HPP
+#define OPENCV_GAPI_CV_DEFS_HPP
+
+#if defined(GAPI_STANDALONE)
+// Simulate OpenCV definitions taken from various
+// OpenCV interface headers if G-API is built in a
+// standalone mode.
+
+// interface.h:
+
+typedef unsigned char uchar;
+typedef signed   char schar;   // explicitly signed: the signedness of plain char is implementation-defined
+
+typedef unsigned short ushort;
+
+#define CV_USRTYPE1 (void)"CV_USRTYPE1 support has been dropped in OpenCV 4.0"
+
+#define CV_CN_MAX     512
+#define CV_CN_SHIFT   3
+#define CV_DEPTH_MAX  (1 << CV_CN_SHIFT)
+
+#define CV_8U   0
+#define CV_8S   1
+#define CV_16U  2
+#define CV_16S  3
+#define CV_32S  4
+#define CV_32F  5
+#define CV_64F  6
+#define CV_16F  7
+
+#define CV_MAT_DEPTH_MASK       (CV_DEPTH_MAX - 1)
+#define CV_MAT_DEPTH(flags)     ((flags) & CV_MAT_DEPTH_MASK)
+
+#define CV_MAKETYPE(depth,cn) (CV_MAT_DEPTH(depth) + (((cn)-1) << CV_CN_SHIFT))
+#define CV_MAKE_TYPE CV_MAKETYPE
+
+#define CV_8UC1 CV_MAKETYPE(CV_8U,1)
+#define CV_8UC2 CV_MAKETYPE(CV_8U,2)
+#define CV_8UC3 CV_MAKETYPE(CV_8U,3)
+#define CV_8UC4 CV_MAKETYPE(CV_8U,4)
+#define CV_8UC(n) CV_MAKETYPE(CV_8U,(n))
+
+#define CV_8SC1 CV_MAKETYPE(CV_8S,1)
+#define CV_8SC2 CV_MAKETYPE(CV_8S,2)
+#define CV_8SC3 CV_MAKETYPE(CV_8S,3)
+#define CV_8SC4 CV_MAKETYPE(CV_8S,4)
+#define CV_8SC(n) CV_MAKETYPE(CV_8S,(n))
+
+#define CV_16UC1 CV_MAKETYPE(CV_16U,1)
+#define CV_16UC2 CV_MAKETYPE(CV_16U,2)
+#define CV_16UC3 CV_MAKETYPE(CV_16U,3)
+#define CV_16UC4 CV_MAKETYPE(CV_16U,4)
+#define CV_16UC(n) CV_MAKETYPE(CV_16U,(n))
+
+#define CV_16SC1 CV_MAKETYPE(CV_16S,1)
+#define CV_16SC2 CV_MAKETYPE(CV_16S,2)
+#define CV_16SC3 CV_MAKETYPE(CV_16S,3)
+#define CV_16SC4 CV_MAKETYPE(CV_16S,4)
+#define CV_16SC(n) CV_MAKETYPE(CV_16S,(n))
+
+#define CV_32SC1 CV_MAKETYPE(CV_32S,1)
+#define CV_32SC2 CV_MAKETYPE(CV_32S,2)
+#define CV_32SC3 CV_MAKETYPE(CV_32S,3)
+#define CV_32SC4 CV_MAKETYPE(CV_32S,4)
+#define CV_32SC(n) CV_MAKETYPE(CV_32S,(n))
+
+#define CV_16FC1 CV_MAKETYPE(CV_16F,1)
+#define CV_16FC2 CV_MAKETYPE(CV_16F,2)
+#define CV_16FC3 CV_MAKETYPE(CV_16F,3)
+#define CV_16FC4 CV_MAKETYPE(CV_16F,4)
+#define CV_16FC(n) CV_MAKETYPE(CV_16F,(n))
+
+#define CV_32FC1 CV_MAKETYPE(CV_32F,1)
+#define CV_32FC2 CV_MAKETYPE(CV_32F,2)
+#define CV_32FC3 CV_MAKETYPE(CV_32F,3)
+#define CV_32FC4 CV_MAKETYPE(CV_32F,4)
+#define CV_32FC(n) CV_MAKETYPE(CV_32F,(n))
+
+#define CV_64FC1 CV_MAKETYPE(CV_64F,1)
+#define CV_64FC2 CV_MAKETYPE(CV_64F,2)
+#define CV_64FC3 CV_MAKETYPE(CV_64F,3)
+#define CV_64FC4 CV_MAKETYPE(CV_64F,4)
+#define CV_64FC(n) CV_MAKETYPE(CV_64F,(n))
+
+// cvdef.h:
+
+#ifndef CV_ALWAYS_INLINE
+#  if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+#    define CV_ALWAYS_INLINE inline __attribute__((always_inline))
+#  elif defined(_MSC_VER)
+#    define CV_ALWAYS_INLINE __forceinline
+#  else
+#    define CV_ALWAYS_INLINE inline
+#  endif
+#endif
+
+#define CV_MAT_CN_MASK          ((CV_CN_MAX - 1) << CV_CN_SHIFT)
+#define CV_MAT_CN(flags)        ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)
+#define CV_MAT_TYPE_MASK        (CV_DEPTH_MAX*CV_CN_MAX - 1)
+#define CV_MAT_TYPE(flags)      ((flags) & CV_MAT_TYPE_MASK)
+#define CV_MAT_CONT_FLAG_SHIFT  14
+#define CV_MAT_CONT_FLAG        (1 << CV_MAT_CONT_FLAG_SHIFT)
+#define CV_IS_MAT_CONT(flags)   ((flags) & CV_MAT_CONT_FLAG)
+#define CV_IS_CONT_MAT          CV_IS_MAT_CONT
+#define CV_SUBMAT_FLAG_SHIFT    15
+#define CV_SUBMAT_FLAG          (1 << CV_SUBMAT_FLAG_SHIFT)
+#define CV_IS_SUBMAT(flags)     ((flags) & CV_SUBMAT_FLAG)  // tests the sub-matrix bit defined on the line above
+
+/** Size of each channel item,
+    0x8442211 = 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
+#define CV_ELEM_SIZE1(type) \
+   ((((sizeof(size_t)<<28)|0x8442211) >> CV_MAT_DEPTH(type)*4) & 15)
+
+#define CV_MAT_TYPE(flags)      ((flags) & CV_MAT_TYPE_MASK)
+
+/** 0x3a50 = 11 10 10 01 01 00 00 ~ array of log2(sizeof(arr_type_elem)) */
+#define CV_ELEM_SIZE(type) \
+    (CV_MAT_CN(type) << ((((sizeof(size_t)/4+1)*16384|0x3a50) >> CV_MAT_DEPTH(type)*2) & 3))
+
+#ifndef CV_OVERRIDE
+#  define CV_OVERRIDE override
+#endif
+
+// base.h:
+namespace cv
+{
+enum BorderTypes {
+    BORDER_CONSTANT    = 0, //!< `iiiiii|abcdefgh|iiiiiii`  with some specified `i`
+    BORDER_REPLICATE   = 1, //!< `aaaaaa|abcdefgh|hhhhhhh`
+    BORDER_REFLECT     = 2, //!< `fedcba|abcdefgh|hgfedcb`
+    BORDER_WRAP        = 3, //!< `cdefgh|abcdefgh|abcdefg`
+    BORDER_REFLECT_101 = 4, //!< `gfedcb|abcdefgh|gfedcba`
+    BORDER_TRANSPARENT = 5, //!< `uvwxyz|abcdefgh|ijklmno`
+
+    BORDER_REFLECT101  = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
+    BORDER_DEFAULT     = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
+    BORDER_ISOLATED    = 16 //!< do not look outside of ROI
+};
+// imgproc.hpp:
+enum InterpolationFlags{
+    INTER_NEAREST        = 0,
+    INTER_LINEAR         = 1,
+    INTER_CUBIC          = 2,
+    INTER_AREA           = 3,
+    INTER_LANCZOS4       = 4,
+    INTER_LINEAR_EXACT   = 5,
+    INTER_MAX            = 7,
+};
+} // namespace cv
+
+static inline int cvFloor( double value )
+{
+    const int truncated = static_cast<int>(value); // conversion to int truncates toward zero
+    return truncated - (truncated > value);        // step down by one when truncation rounded up (negative non-integers)
+}
+
+#endif //  defined(GAPI_STANDALONE)
+
+#endif //  OPENCV_GAPI_CV_DEFS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/exports.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/exports.hpp
new file mode 100644
index 000000000000..c36f4003d0fb
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/exports.hpp
@@ -0,0 +1,42 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OWN_TYPES_HPP
+#define OPENCV_GAPI_OWN_TYPES_HPP
+
+#   if defined(__OPENCV_BUILD)
+#       include <opencv2/core/base.hpp>
+#       define GAPI_EXPORTS CV_EXPORTS
+        /* special informative macros for wrapper generators */
+#       define GAPI_PROP CV_PROP
+#       define GAPI_PROP_RW CV_PROP_RW
+#       define GAPI_WRAP CV_WRAP
+#       define GAPI_EXPORTS_W_SIMPLE CV_EXPORTS_W_SIMPLE
+#       define GAPI_EXPORTS_W CV_EXPORTS_W
+#   else
+#       define GAPI_PROP
+#       define GAPI_PROP_RW
+#       define GAPI_WRAP
+#       define GAPI_EXPORTS
+#       define GAPI_EXPORTS_W_SIMPLE
+#       define GAPI_EXPORTS_W
+
+#if 0  // Note: the following version currently is not needed for non-OpenCV build
+#       if defined _WIN32
+#           define GAPI_EXPORTS __declspec(dllexport)
+#       elif defined __GNUC__ && __GNUC__ >= 4
+#           define GAPI_EXPORTS __attribute__ ((visibility ("default")))
+#       endif
+
+#       ifndef GAPI_EXPORTS
+#           define GAPI_EXPORTS
+#       endif
+#endif
+
+#   endif
+
+#endif // OPENCV_GAPI_OWN_TYPES_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/mat.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/mat.hpp
new file mode 100644
index 000000000000..ce9c0bf36238
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/mat.hpp
@@ -0,0 +1,354 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OWN_MAT_HPP
+#define OPENCV_GAPI_OWN_MAT_HPP
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/own/types.hpp>
+#include <opencv2/gapi/own/scalar.hpp>
+#include <opencv2/gapi/own/saturate.hpp>
+#include <opencv2/gapi/own/assert.hpp>
+
+#include <memory>                   //std::shared_ptr
+#include <cstring>                  //std::memcpy
+#include <numeric>                  //std::accumulate
+#include <vector>
+#include <opencv2/gapi/util/throw.hpp>
+
+namespace cv { namespace gapi { namespace own {
+    namespace detail {
+        template <typename T, unsigned char channels>
+        void assign_row(void* ptr, int cols, Scalar const& s)
+        {
+            auto p = static_cast<T*>(ptr);
+            for (int c = 0; c < cols; c++)
+            {
+                for (int ch = 0; ch < channels; ch++)
+                {
+                    p[c * channels + ch] = saturate<T>(s[ch], roundd);
+                }
+            }
+        }
+
+        inline size_t default_step(int type, int cols)
+        {
+            return CV_ELEM_SIZE(type) * cols;
+        }
+        //Matrix header, i.e. fields that are unique to each Mat object.
+        //Devoted class is needed to implement custom behavior on move (erasing state of moved from object)
+        struct MatHeader{
+            enum { AUTO_STEP = 0};
+            enum { TYPE_MASK = 0x00000FFF  };
+
+            MatHeader() = default;
+
+            MatHeader(int _rows, int _cols, int type, void* _data, size_t _step)
+            : flags((type & TYPE_MASK)), rows(_rows), cols(_cols), data((uchar*)_data), step(_step == AUTO_STEP ? detail::default_step(type, _cols) : _step)
+            {}
+
+            MatHeader(const std::vector<int> &_dims, int type, void* _data)
+            : flags((type & TYPE_MASK)), data((uchar*)_data), step(0), dims(_dims)
+            {}
+
+            MatHeader(const MatHeader& ) = default;
+            MatHeader(MatHeader&& src) : MatHeader(src) // reuse copy constructor here
+            {
+                MatHeader empty; //give it a name to call copy(not move) assignment below
+                src = empty;
+            }
+            MatHeader& operator=(const MatHeader& ) = default;
+            MatHeader& operator=(MatHeader&& src)
+            {
+                *this = src; //calling a copy assignment here, not move one
+                MatHeader empty; //give it a name to call copy(not move) assignment below
+                src = empty;
+                return *this;
+            }
+            /*! includes several bit-fields:
+                 - depth
+                 - number of channels
+             */
+            int flags = 0;
+
+            //! the number of rows and columns or (-1, -1) when the matrix has more than 2 dimensions
+            int rows = 0, cols = 0;
+            //! pointer to the data
+            uchar* data = nullptr;
+            size_t step = 0;
+            //! dimensions (ND-case)
+            std::vector<int> dims;
+        };
+    } // namespace detail
+    //concise version of cv::Mat suitable for GAPI needs (used when no dependence on OpenCV is required)
+    class Mat : public detail::MatHeader{
+    public:
+
+        Mat() = default;
+
+        /** @overload
+        @param _rows Number of rows in a 2D array.
+        @param _cols Number of columns in a 2D array.
+        @param _type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+        CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+        @param _data Pointer to the user data. Matrix constructors that take data and step parameters do not
+        allocate matrix data. Instead, they just initialize the matrix header that points to the specified
+        data, which means that no data is copied. This operation is very efficient and can be used to
+        process external data using OpenCV functions. The external data is not automatically deallocated, so
+        you should take care of it.
+        @param _step Number of bytes each matrix row occupies. The value should include the padding bytes at
+        the end of each row, if any. If the parameter is missing (set to AUTO_STEP ), no padding is assumed
+        and the actual step is calculated as cols*elemSize(). See Mat::elemSize.
+        */
+        Mat(int _rows, int _cols, int _type, void* _data, size_t _step = AUTO_STEP)
+        : MatHeader (_rows, _cols, _type, _data, _step)
+        {}
+
+        Mat(const std::vector<int> &_dims, int _type, void* _data)
+        : MatHeader (_dims, _type, _data)
+        {}
+
+        Mat(std::vector<int> &&_dims, int _type, void* _data)
+        : MatHeader (std::move(_dims), _type, _data)
+        {}
+
+        Mat(Mat const& src, const Rect& roi )
+        : Mat(src)
+        {
+           rows = roi.height;
+           cols = roi.width;
+           data = ptr(roi.y, roi.x);
+        }
+
+        Mat(Mat const& ) = default;
+        Mat(Mat&& ) = default;
+
+        Mat& operator=(Mat const& ) = default;
+        Mat& operator=(Mat&& ) = default;
+
+        /** @brief Sets all or some of the array elements to the specified value.
+        @param s Assigned scalar converted to the actual array type.
+        */
+        Mat& operator = (const Scalar& s)
+        {
+            constexpr unsigned max_channels = 4; //Scalar can't fit more than 4
+            using func_p_t = void (*)(void*, int, Scalar const&);
+            using detail::assign_row;
+            #define TABLE_ENTRY(type)  {assign_row<type, 1>, assign_row<type, 2>, assign_row<type, 3>, assign_row<type, 4>}
+            static constexpr func_p_t func_tbl[][max_channels] = {
+                    TABLE_ENTRY(uchar),
+                    TABLE_ENTRY(schar),
+                    TABLE_ENTRY(ushort),
+                    TABLE_ENTRY(short),
+                    TABLE_ENTRY(int),
+                    TABLE_ENTRY(float),
+                    TABLE_ENTRY(double)
+            };
+            #undef TABLE_ENTRY
+
+            static_assert(CV_8U == 0 && CV_8S == 1  && CV_16U == 2 && CV_16S == 3
+                       && CV_32S == 4 && CV_32F == 5 && CV_64F == 6,
+                       "OCV type ids used as indexes to array, thus exact numbers are important!"
+            );
+
+            const auto depth = static_cast<unsigned int>(this->depth());
+            GAPI_Assert(depth < sizeof(func_tbl)/sizeof(func_tbl[0]));
+
+            if (dims.empty())
+            {
+                const auto channels = static_cast<unsigned int>(this->channels());
+                GAPI_Assert(channels <= max_channels);
+
+                auto* f = func_tbl[depth][channels - 1];
+                for (int r = 0; r < rows; ++r)
+                {
+                    (*f)(static_cast<void *>(ptr(r)), cols, s );
+                }
+            }
+            else
+            {
+                auto* f = func_tbl[depth][0];
+                // FIXME: better to refactor assign_row to use std::size_t by default
+                (*f)(static_cast<void *>(data), static_cast<int>(total()), s);
+            }
+            return *this;
+        }
+
+        /** @brief Returns the matrix element size in bytes.
+
+        The method returns the matrix element size in bytes. For example, if the matrix type is CV_16SC3 ,
+        the method returns 3\*sizeof(short) or 6.
+         */
+        size_t elemSize() const
+        {
+            return CV_ELEM_SIZE(type());
+        }
+        /** @brief Returns the type of a matrix element.
+
+        The method returns a matrix element type. This is an identifier compatible with the CvMat type
+        system, like CV_16SC3 or 16-bit signed 3-channel array, and so on.
+         */
+        int type() const            {return CV_MAT_TYPE(flags);}
+
+        /** @brief Returns the depth of a matrix element.
+
+        The method returns the identifier of the matrix element depth (the type of each individual channel).
+        For example, for a 16-bit signed element array, the method returns CV_16S . A complete list of
+        matrix types contains the following values:
+        -   CV_8U - 8-bit unsigned integers ( 0..255 )
+        -   CV_8S - 8-bit signed integers ( -128..127 )
+        -   CV_16U - 16-bit unsigned integers ( 0..65535 )
+        -   CV_16S - 16-bit signed integers ( -32768..32767 )
+        -   CV_32S - 32-bit signed integers ( -2147483648..2147483647 )
+        -   CV_32F - 32-bit floating-point numbers ( -FLT_MAX..FLT_MAX, INF, NAN )
+        -   CV_64F - 64-bit floating-point numbers ( -DBL_MAX..DBL_MAX, INF, NAN )
+         */
+        int depth() const           {return CV_MAT_DEPTH(flags);}
+
+        /** @brief Returns the number of matrix channels.
+
+        The method returns the number of matrix channels.
+        If matrix is N-dimensional, -1 is returned.
+         */
+        int channels() const        {return dims.empty() ? CV_MAT_CN(flags) : -1;}
+
+        /**
+        @param _rows New number of rows.
+        @param _cols New number of columns.
+        @param _type New matrix type.
+         */
+        void create(int _rows, int _cols, int _type)
+        {
+            create(Size{_cols, _rows}, _type);
+        }
+        /** @overload
+        @param _size Alternative new matrix size specification: Size(cols, rows)
+        @param _type New matrix type.
+        */
+        void create(Size _size, int _type)
+        {
+            GAPI_Assert(_size.height >= 0 && _size.width >= 0);
+            if (_size != Size{cols, rows} || CV_MAT_TYPE(_type) != type()) // a type change needs a fresh buffer too
+            {
+                Mat tmp{_size.height, _size.width, _type, nullptr};
+                tmp.memory.reset(new uchar[ tmp.step * tmp.rows], [](uchar * p){delete[] p;});
+                tmp.data = tmp.memory.get();
+
+                *this = std::move(tmp);
+            }
+        }
+
+        void create(const std::vector<int> &_dims, int _type)
+        {
+            // FIXME: make a proper reallocation-on-demands
+            // WARNING: no tensor views, so no strides
+            Mat tmp{_dims, _type, nullptr};
+            // FIXME: this accumulate duplicates a lot
+            const auto sz = std::accumulate(_dims.begin(), _dims.end(), 1, std::multiplies<int>());
+            tmp.memory.reset(new uchar[CV_ELEM_SIZE(_type)*sz], [](uchar * p){delete[] p;});
+            tmp.data = tmp.memory.get();
+            *this = std::move(tmp);
+        }
+
+        /** @brief Creates a full copy of the matrix and the underlying data.
+
+        The method creates a full copy of the matrix. The original step[] is not taken into account.
+        So, the copy has a continuous buffer occupying total() * elemSize() bytes.
+         */
+        Mat clone() const
+        {
+            Mat m;
+            copyTo(m);
+            return m;
+        }
+
+        /** @brief Copies the matrix to another one.
+
+        The method copies the matrix data to another matrix. Before copying the data, the method invokes :
+        @code
+            m.create(this->size(), this->type());
+        @endcode
+        so that the destination matrix is reallocated if needed. While m.copyTo(m); works flawlessly, the
+        function does not handle the case of a partial overlap between the source and the destination
+        matrices.
+         */
+        void copyTo(Mat& dst) const
+        {
+            if (dims.empty())
+            {
+                dst.create(rows, cols, type());
+                for (int r = 0; r < rows; ++r)
+                {
+                    std::copy_n(ptr(r), detail::default_step(type(),cols), dst.ptr(r)); // payload bytes only, row padding is skipped
+                }
+            }
+            else
+            {
+                dst.create(dims, depth());
+                std::copy_n(data, total()*elemSize(), dst.data); // target must be dst; copying onto our own buffer left dst unfilled
+            }
+        }
+
+        /** @brief Returns true if the array has no elements.
+
+        The method returns true if Mat::total() is 0 or if Mat::data is NULL. Because of pop_back() and
+        resize() methods `M.total() == 0` does not imply that `M.data == NULL`.
+         */
+        bool empty() const
+        {
+            return data == 0 || total() == 0;
+        }
+
+        /** @brief Returns the total number of array elements.
+
+        The method returns the number of array elements (a number of pixels if the array represents an
+        image).
+         */
+        size_t total() const
+        {
+            return dims.empty()
+                 ? (static_cast<std::size_t>(rows) * cols)
+                 : std::accumulate(dims.begin(), dims.end(), static_cast<std::size_t>(1), std::multiplies<size_t>());
+        }
+
+        /** @overload
+        @param roi Extracted submatrix specified as a rectangle.
+        */
+        Mat operator()( const Rect& roi ) const
+        {
+            return Mat{*this, roi};
+        }
+
+
+        /** @brief Returns a pointer to the specified matrix row.
+
+        The methods return `uchar*` or typed pointer to the specified matrix row. See the sample in
+        Mat::isContinuous to know how to use these methods.
+        @param row Index along the dimension 0
+        @param col Index along the dimension 1
+        */
+        uchar* ptr(int row, int col = 0)
+        {
+            return const_cast<uchar*>(const_cast<const Mat*>(this)->ptr(row,col));
+        }
+        /** @overload */
+        const uchar* ptr(int row, int col = 0) const
+        {
+            return data + step * row + CV_ELEM_SIZE(type()) * col;
+        }
+
+
+    private:
+        //actual memory allocated for storage, or nullptr if object is non owning view to over memory
+        std::shared_ptr<uchar> memory;
+    };
+
+} //namespace own
+} //namespace gapi
+} //namespace cv
+
+#endif /* OPENCV_GAPI_OWN_MAT_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/saturate.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/saturate.hpp
new file mode 100644
index 000000000000..74eaecf57e69
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/saturate.hpp
@@ -0,0 +1,83 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OWN_SATURATE_HPP
+#define OPENCV_GAPI_OWN_SATURATE_HPP
+
+#include <math.h>
+
+#include <limits>
+
+#include <opencv2/gapi/own/assert.hpp>
+#include <opencv2/gapi/util/type_traits.hpp>
+
+namespace cv { namespace gapi { namespace own {
+//-----------------------------
+//
+// Numeric cast with saturation
+//
+//-----------------------------
+
+template<typename DST, typename SRC,
+         typename = cv::util::enable_if_t<!std::is_same<DST, SRC>::value &&
+                                           std::is_integral<DST>::value  &&
+                                           std::is_integral<SRC>::value>   > // enabled only for two distinct integral types
+static CV_ALWAYS_INLINE DST saturate(SRC x)
+{
+    if (sizeof(DST) > sizeof(SRC)) // DST is strictly wider than SRC: plain conversion, no clamping
+        return static_cast<DST>(x);
+
+    // compiler must recognize this saturation,
+    // so compile saturate<s16>(a + b) with adds
+    // instruction (e.g.: _mm_adds_epi16 if x86)
+    return x < std::numeric_limits<DST>::min()?
+               std::numeric_limits<DST>::min():
+           x > std::numeric_limits<DST>::max()?
+               std::numeric_limits<DST>::max():
+           static_cast<DST>(x);
+}
+template<typename T>
+static CV_ALWAYS_INLINE T saturate(T x)
+{
+    return x;
+}
+
+template<typename DST, typename SRC, typename R,
+         cv::util::enable_if_t<std::is_floating_point<DST>::value, bool> = true >
+static CV_ALWAYS_INLINE DST saturate(SRC x, R)
+{
+    return static_cast<DST>(x);
+}
+template<typename DST, typename SRC, typename R,
+         cv::util::enable_if_t<std::is_integral<DST>::value &&
+                               std::is_integral<SRC>::value   , bool> = true >
+static CV_ALWAYS_INLINE DST saturate(SRC x, R)
+{
+    return saturate<DST>(x);
+}
+// Note, that OpenCV rounds differently:
+// - like std::round() for add, subtract
+// - like std::rint() for multiply, divide
+template<typename DST, typename SRC, typename R,
+         cv::util::enable_if_t<std::is_integral<DST>::value &&
+                               std::is_floating_point<SRC>::value, bool> = true >
+static CV_ALWAYS_INLINE DST saturate(SRC x, R round)
+{
+    const int rounded = static_cast<int>(round(x)); // apply the caller-supplied rounding callable, then convert to int
+    return saturate<DST>(rounded);                  // hand the int to the integer saturate overloads
+}
+
+// explicit suffix 'd' for double type
+inline double  ceild(double x) { return ceil(x); }
+inline double floord(double x) { return floor(x); }
+inline double roundd(double x) { return round(x); }
+inline double  rintd(double x) { return rint(x); }
+
+} //namespace own
+} //namespace gapi
+} //namespace cv
+#endif /* OPENCV_GAPI_OWN_SATURATE_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/scalar.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/scalar.hpp
new file mode 100644
index 000000000000..3b107befccaa
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/scalar.hpp
@@ -0,0 +1,47 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GAPI_OWN_SCALAR_HPP
+#define OPENCV_GAPI_GAPI_OWN_SCALAR_HPP
+
+#include <opencv2/gapi/own/exports.hpp>
+
+namespace cv
+{
+namespace gapi
+{
+namespace own
+{
+
+class GAPI_EXPORTS Scalar
+{
+public:
+    Scalar() = default;
+    explicit Scalar(double v0) { val[0] = v0; }
+    Scalar(double v0, double v1, double v2 = 0, double v3 = 0)
+        : val{v0, v1, v2, v3}
+    {
+    }
+
+    const double& operator[](int i) const { return val[i]; }
+          double& operator[](int i)       { return val[i]; }
+
+    static Scalar all(double v0) { return Scalar(v0, v0, v0, v0); }
+
+    double val[4] = {0};
+};
+
+inline bool operator==(const Scalar& lhs, const Scalar& rhs)
+{   // Compare all four components explicitly: this header includes neither <algorithm> nor <iterator>, so it must not rely on std::equal/std::begin/std::end.
+    return lhs.val[0] == rhs.val[0] && lhs.val[1] == rhs.val[1] && lhs.val[2] == rhs.val[2] && lhs.val[3] == rhs.val[3];
+}
+
+} // namespace own
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_GAPI_OWN_SCALAR_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/types.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/types.hpp
new file mode 100644
index 000000000000..211b5c85ff12
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/own/types.hpp
@@ -0,0 +1,162 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_TYPES_HPP
+#define OPENCV_GAPI_TYPES_HPP
+
+#include <algorithm>              // std::max, std::min
+#include <ostream>
+
+namespace cv
+{
+namespace gapi
+{
+
+/**
+ * @brief This namespace contains G-API own data structures used in
+ * its standalone mode build.
+ */
+namespace own
+{
+
+class Point
+{
+public:
+    Point() = default;
+    Point(int _x, int _y) : x(_x),  y(_y) {}
+
+    int x = 0;
+    int y = 0;
+};
+
+class Point2f
+{
+public:
+    Point2f() = default;
+    Point2f(float _x, float _y) : x(_x),  y(_y) {}
+
+    float x = 0.f;
+    float y = 0.f;
+};
+
+class Point3f
+{
+public:
+    Point3f() = default;
+    Point3f(float _x, float _y, float _z) : x(_x),  y(_y), z(_z) {}
+
+    float x = 0.f;
+    float y = 0.f;
+    float z = 0.f;
+};
+
+class Rect // integer rectangle described by its top-left corner plus width and height
+{
+public:
+    Rect() = default;
+    Rect(int _x, int _y, int _width, int _height) : x(_x), y(_y),   width(_width),  height(_height) {}
+#if !defined(GAPI_STANDALONE)
+    Rect(const cv::Rect& other) : x(other.x), y(other.y), width(other.width), height(other.height) {}
+    inline Rect& operator=(const cv::Rect& other) // copies all four fields from an OpenCV rectangle
+    {
+        x = other.x;
+        y = other.y; // must read other.y (previously other.x), consistent with the converting constructor above
+        width  = other.width;
+        height = other.height;
+        return *this;
+    }
+#endif // !defined(GAPI_STANDALONE)
+
+    int x      = 0; //!< x coordinate of the top-left corner
+    int y      = 0; //!< y coordinate of the top-left corner
+    int width  = 0; //!< width of the rectangle
+    int height = 0; //!< height of the rectangle
+};
+
+inline bool operator==(const Rect& lhs, const Rect& rhs)
+{
+    return lhs.x == rhs.x && lhs.y == rhs.y && lhs.width == rhs.width && lhs.height == rhs.height;
+}
+
+inline bool operator!=(const Rect& lhs, const Rect& rhs)
+{
+    return !(lhs == rhs);
+}
+
+inline Rect& operator&=(Rect& lhs, const Rect& rhs)
+{
+    // Intersect in place; when the overlap has no positive area, lhs becomes a default Rect.
+    const int left   = std::max(lhs.x, rhs.x);
+    const int top    = std::max(lhs.y, rhs.y);
+    const int right  = std::min(lhs.x + lhs.width,  rhs.x + rhs.width);
+    const int bottom = std::min(lhs.y + lhs.height, rhs.y + rhs.height);
+    lhs = Rect(left, top, right - left, bottom - top);
+    if( lhs.width <= 0 || lhs.height <= 0 )
+        lhs = Rect();
+    return lhs;
+}
+
+inline Rect operator&(const Rect& lhs, const Rect& rhs)
+{
+    Rect result = lhs;
+    return result &= rhs;
+}
+
+inline std::ostream& operator<<(std::ostream& o, const Rect& rect)
+{
+    return o << "[" << rect.width << " x " << rect.height << " from (" << rect.x << ", " << rect.y << ")]";
+}
+
+class Size
+{
+public:
+    Size() = default;
+    Size(int _width, int _height) : width(_width),  height(_height) {}
+#if !defined(GAPI_STANDALONE)
+    Size(const cv::Size& other) : width(other.width), height(other.height) {}
+    inline Size& operator=(const cv::Size& rhs)
+    {
+        width  = rhs.width;
+        height = rhs.height;
+        return *this;
+    }
+#endif // !defined(GAPI_STANDALONE)
+
+    int width  = 0;
+    int height = 0;
+};
+
+inline Size& operator+=(Size& lhs, const Size& rhs)
+{
+    lhs.width  += rhs.width;
+    lhs.height += rhs.height;
+    return lhs;
+}
+
+inline bool operator==(const Size& lhs, const Size& rhs)
+{
+    return lhs.width == rhs.width && lhs.height == rhs.height;
+}
+
+inline bool operator!=(const Size& lhs, const Size& rhs)
+{
+    return !(lhs == rhs);
+}
+
+
+inline std::ostream& operator<<(std::ostream& o, const Size& s)
+{
+    o << "[" << s.width << " x " << s.height << "]";
+    return o;
+}
+
+struct VoidType {};
+} // namespace own
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_TYPES_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/plaidml/core.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/plaidml/core.hpp
new file mode 100644
index 000000000000..20e8812b3abc
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/plaidml/core.hpp
@@ -0,0 +1,20 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_PLAIDML_CORE_HPP
+#define OPENCV_GAPI_PLAIDML_CORE_HPP
+
+#include <opencv2/gapi/gkernel.hpp>     // GKernelPackage
+#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS
+
+namespace cv { namespace gapi { namespace core { namespace plaidml {
+
+GAPI_EXPORTS cv::GKernelPackage kernels();
+
+}}}}
+
+#endif // OPENCV_GAPI_PLAIDML_CORE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/plaidml/gplaidmlkernel.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/plaidml/gplaidmlkernel.hpp
new file mode 100644
index 000000000000..e22ecc7211f2
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/plaidml/gplaidmlkernel.hpp
@@ -0,0 +1,140 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019 Intel Corporation
+//
+
+
+#ifndef OPENCV_GAPI_GPLAIDMLKERNEL_HPP
+#define OPENCV_GAPI_GPLAIDMLKERNEL_HPP
+
+#include <opencv2/gapi/gkernel.hpp>
+#include <opencv2/gapi/garg.hpp>
+
+namespace plaidml
+{
+namespace edsl
+{
+    class Tensor;
+} // namespace edsl
+} // namespace plaidml
+
+namespace cv
+{
+namespace gapi
+{
+namespace plaidml
+{
+
+GAPI_EXPORTS cv::gapi::GBackend backend();
+
+} // namespace plaidml
+} // namespace gapi
+
+struct GPlaidMLContext // carries the input arguments and output slots handed to a PlaidML kernel body
+{
+    // Generic accessor API
+    template<typename T>
+    const T& inArg(int input) { return m_args.at(input).get<T>(); } // bounds-checked lookup in m_args, then typed access
+
+    // Syntax sugar
+    const plaidml::edsl::Tensor& inTensor(int input)
+    {
+        return inArg<plaidml::edsl::Tensor>(input); // same as inArg, fixed to the Tensor type
+    }
+
+    plaidml::edsl::Tensor& outTensor(int output)
+    {
+        return *(m_results.at(output).get<plaidml::edsl::Tensor*>()); // the result slot holds a Tensor pointer; dereference it
+    }
+
+    std::vector<GArg> m_args; // input arguments, addressed by position
+    std::unordered_map<std::size_t, GArg> m_results; // output slots, keyed by output index
+};
+
+class GAPI_EXPORTS GPlaidMLKernel // wraps the callable that implements a PlaidML kernel
+{
+public:
+    using F = std::function<void(GPlaidMLContext &)>; // kernel body: receives the execution context
+
+    GPlaidMLKernel() = default;
+    explicit GPlaidMLKernel(const F& f) : m_f(f) {}
+
+    void apply(GPlaidMLContext &ctx) const
+    {
+        GAPI_Assert(m_f); // a default-constructed kernel has an empty m_f and must not be invoked
+        m_f(ctx);
+    }
+
+protected:
+    F m_f; // stored kernel body; empty when default-constructed
+};
+
+
+namespace detail
+{
+
+template<class T> struct plaidml_get_in;
+template<> struct plaidml_get_in<cv::GMat>
+{
+    static const plaidml::edsl::Tensor& get(GPlaidMLContext& ctx, int idx)
+    {
+        return ctx.inTensor(idx);
+    }
+};
+
+template<class T> struct plaidml_get_in
+{
+    static T get(GPlaidMLContext &ctx, int idx) { return ctx.inArg<T>(idx); }
+};
+
+template<class T> struct plaidml_get_out;
+template<> struct plaidml_get_out<cv::GMat>
+{
+    static plaidml::edsl::Tensor& get(GPlaidMLContext& ctx, int idx)
+    {
+        return ctx.outTensor(idx);
+    }
+};
+
+template<typename, typename, typename>
+struct PlaidMLCallHelper;
+
+template<typename Impl, typename... Ins, typename... Outs>
+struct PlaidMLCallHelper<Impl, std::tuple<Ins...>, std::tuple<Outs...> >
+{
+    template<int... IIs, int... OIs>
+    static void call_impl(GPlaidMLContext &ctx, detail::Seq<IIs...>, detail::Seq<OIs...>)
+    {
+        Impl::run(plaidml_get_in<Ins>::get(ctx, IIs)..., plaidml_get_out<Outs>::get(ctx, OIs)...);
+    }
+
+    static void call(GPlaidMLContext& ctx)
+    {
+        call_impl(ctx,
+                  typename detail::MkSeq<sizeof...(Ins)>::type(),
+                  typename detail::MkSeq<sizeof...(Outs)>::type());
+    }
+};
+
+} // namespace detail
+
+template<class Impl, class K>
+class GPlaidMLKernelImpl: public cv::detail::PlaidMLCallHelper<Impl, typename K::InArgs, typename K::OutArgs>,
+                          public cv::detail::KernelTag
+{
+    using P = detail::PlaidMLCallHelper<Impl, typename K::InArgs, typename K::OutArgs>;
+
+public:
+    using API = K;
+
+    static cv::gapi::GBackend backend()  { return cv::gapi::plaidml::backend(); }
+    static cv::GPlaidMLKernel kernel()   { return GPlaidMLKernel(&P::call);     }
+};
+
+#define GAPI_PLAIDML_KERNEL(Name, API) struct Name: public cv::GPlaidMLKernelImpl<Name, API>
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GPLAIDMLKERNEL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/plaidml/plaidml.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/plaidml/plaidml.hpp
new file mode 100644
index 000000000000..3207a8cb2e44
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/plaidml/plaidml.hpp
@@ -0,0 +1,53 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_PLAIDML_PLAIDML_HPP
+#define OPENCV_GAPI_PLAIDML_PLAIDML_HPP
+
+#include <string>
+#include <opencv2/gapi/gcommon.hpp> // CompileArgTag
+
+namespace cv
+{
+namespace gapi
+{
+
+/**
+ * @brief This namespace contains G-API PlaidML backend functions,
+ * structures, and symbols.
+ */
+namespace plaidml
+{
+
+/** \addtogroup gapi_compile_args
+ * @{
+ */
+/**
+ * @brief This structure represents the basic parameters for the experimental
+ * PlaidML backend.
+ */
+struct config
+{
+    std::string dev_id; //!< Device ID. Refer to PlaidML documentation for details.
+    std::string trg_id; //!< Target ID. Refer to PlaidML documentation for details.
+};
+/** @} gapi_compile_args */
+
+} // namespace plaidml
+} // namespace gapi
+
+namespace detail
+{
+    template<> struct CompileArgTag<cv::gapi::plaidml::config>
+    {
+        static const char* tag() { return "gapi.plaidml.config"; }
+    };
+} // namespace detail
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_PLAIDML_PLAIDML_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/python/python.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/python/python.hpp
new file mode 100644
index 000000000000..1857a938d5bb
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/python/python.hpp
@@ -0,0 +1,71 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_PYTHON_API_HPP
+#define OPENCV_GAPI_PYTHON_API_HPP
+
+#include <opencv2/gapi/gkernel.hpp>     // GKernelPackage
+#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS
+
+namespace cv {
+namespace gapi {
+
+/**
+ * @brief This namespace contains G-API Python backend functions,
+ * structures, and symbols.
+ *
+ * This functionality is required to enable G-API custom operations
+ * and kernels when using G-API from Python, no need to use it in the
+ * C++ form.
+ */
+namespace python {
+
+GAPI_EXPORTS cv::gapi::GBackend backend();
+
+struct GPythonContext
+{
+    const cv::GArgs      &ins;
+    const cv::GMetaArgs  &in_metas;
+    const cv::GTypesInfo &out_info;
+
+    cv::optional<cv::GArg> m_state;
+};
+
+using Impl = std::function<cv::GRunArgs(const GPythonContext&)>;
+using Setup = std::function<cv::GArg(const GMetaArgs&, const GArgs&)>;
+
+class GAPI_EXPORTS GPythonKernel
+{
+public:
+    GPythonKernel() = default;
+    GPythonKernel(Impl run, Setup setup);
+
+    Impl  run;
+    Setup setup       = nullptr;
+    bool  is_stateful = false;
+};
+
+class GAPI_EXPORTS GPythonFunctor : public cv::gapi::GFunctor
+{
+public:
+    using Meta = cv::GKernel::M;
+
+    GPythonFunctor(const char* id, const Meta& meta, const Impl& impl,
+                   const Setup& setup = nullptr);
+
+    GKernelImpl    impl()    const override;
+    gapi::GBackend backend() const override;
+
+private:
+    GKernelImpl impl_;
+};
+
+} // namespace python
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_PYTHON_API_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/render.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/render.hpp
new file mode 100644
index 000000000000..52e55b0d800f
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/render.hpp
@@ -0,0 +1,14 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019 Intel Corporation
+
+#ifndef OPENCV_GAPI_RENDER_ROOT_HPP
+#define OPENCV_GAPI_RENDER_ROOT_HPP
+
+// This file is just a shortcut to render/render.hpp
+
+#include <opencv2/gapi/render/render.hpp>
+
+#endif // OPENCV_GAPI_RENDER_ROOT_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/render/render.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/render/render.hpp
new file mode 100644
index 000000000000..8d93a6efc028
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/render/render.hpp
@@ -0,0 +1,196 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2020 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_RENDER_HPP
+#define OPENCV_GAPI_RENDER_HPP
+
+#include <opencv2/gapi/render/render_types.hpp>
+
+#include <opencv2/gapi.hpp>
+
+/** \defgroup gapi_draw G-API Drawing and composition functionality
+ *  @{
+ *
+ *  @brief Functions for in-graph drawing.
+ *
+ *  @note This is a Work in Progress functionality and APIs may
+ *  change in the future releases.
+ *
+ *  G-API can do some in-graph drawing with generic operations and a
+ *  set of [rendering primitives](@ref gapi_draw_prims).
+ *  In contrast with traditional OpenCV, in G-API the user needs to form a
+ *  *rendering list* of primitives to draw. This list can be built
+ *  manually or generated within a graph. This list is passed to
+ *  [special operations or functions](@ref gapi_draw_api) where all
+ *  primitives are interpreted and applied to the image.
+ *
+ *  For example, in a complex pipeline a list of detected objects
+ *  can be translated in-graph to a list of cv::gapi::wip::draw::Rect
+ *  primitives to highlight those with bounding boxes, or a list of
+ *  detected faces can be translated in-graph to a list of
+ *  cv::gapi::wip::draw::Mosaic primitives to hide sensitive content
+ *  or protect privacy.
+ *
+ *  Like any other operations, rendering in G-API can be reimplemented
+ *  by different backends. Currently only an OpenCV-based backend is
+ *  available.
+ *
+ *  In addition to the graph-level operations, there are also regular
+ *  (immediate) OpenCV-like functions available -- see
+ *  cv::gapi::wip::draw::render(). These functions are just wrappers
+ *  over regular G-API and build the rendering graphs on the fly, so
+ *  take compilation arguments as parameters.
+ *
+ *  Currently this API is more machine-oriented than human-oriented.
+ *  The main purpose is to translate a set of domain-specific objects
+ *  to a list of primitives to draw. For example, in order to generate
+ *  a picture like this:
+ *
+ *  ![](modules/gapi/doc/pics/render_example.png)
+ *
+ *  Rendering list needs to be generated as follows:
+ *
+ *  @include modules/gapi/samples/draw_example.cpp
+ *
+ *  @defgroup gapi_draw_prims Drawing primitives
+ *  @defgroup gapi_draw_api Drawing operations and functions
+ *  @}
+ */
+
+namespace cv
+{
+namespace gapi
+{
+namespace wip
+{
+namespace draw
+{
+
+using GMat2     = std::tuple<cv::GMat,cv::GMat>;
+using GMatDesc2 = std::tuple<cv::GMatDesc,cv::GMatDesc>;
+
+//! @addtogroup gapi_draw_api
+//! @{
+/** @brief The function renders the passed drawing primitives on the input image
+
+@param bgr input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@param prims vector of drawing primitives
+@param args graph compile time parameters
+*/
+void GAPI_EXPORTS_W render(cv::Mat& bgr,
+                           const Prims& prims,
+                           cv::GCompileArgs&& args = {});
+
+/** @brief The function renders the passed drawing primitives on two NV12 planes
+
+@param y_plane input image: 8-bit unsigned 1-channel image @ref CV_8UC1.
+@param uv_plane input image: 8-bit unsigned 2-channel image @ref CV_8UC2.
+@param prims vector of drawing primitives
+@param args graph compile time parameters
+*/
+void GAPI_EXPORTS_W render(cv::Mat& y_plane,
+                           cv::Mat& uv_plane,
+                           const Prims& prims,
+                           cv::GCompileArgs&& args = {});
+
+/** @brief The function renders the passed drawing primitives on the input media frame
+
+@param frame input Media Frame :  @ref cv::MediaFrame.
+@param prims vector of drawing primitives
+@param args graph compile time parameters
+*/
+void GAPI_EXPORTS render(cv::MediaFrame& frame,
+                         const Prims& prims,
+                         cv::GCompileArgs&& args = {});
+
+
+G_TYPED_KERNEL_M(GRenderNV12, <GMat2(cv::GMat,cv::GMat,cv::GArray<wip::draw::Prim>)>, "org.opencv.render.nv12")
+{
+     static GMatDesc2 outMeta(GMatDesc y_plane, GMatDesc uv_plane, GArrayDesc)
+     {
+         return std::make_tuple(y_plane, uv_plane);
+     }
+};
+
+G_TYPED_KERNEL(GRenderBGR, <cv::GMat(cv::GMat,cv::GArray<wip::draw::Prim>)>, "org.opencv.render.bgr")
+{
+     static GMatDesc outMeta(GMatDesc bgr, GArrayDesc)
+     {
+         return bgr;
+     }
+};
+
+G_TYPED_KERNEL(GRenderFrame, <cv::GFrame(cv::GFrame, cv::GArray<wip::draw::Prim>)>, "org.opencv.render.frame")
+{
+    static GFrameDesc outMeta(GFrameDesc desc, GArrayDesc)
+    {
+        return desc;
+    }
+};
+
+/** @brief Renders on 3 channels input
+
+Output image must be 8-bit unsigned planar 3-channel image
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3
+@param prims draw primitives
+*/
+GAPI_EXPORTS_W GMat render3ch(const GMat& src, const GArray<Prim>& prims);
+
+/** @brief Renders on two planes
+
+Output y image must be 8-bit unsigned planar 1-channel image @ref CV_8UC1
+uv image must be 8-bit unsigned planar 2-channel image @ref CV_8UC2
+
+@param y  input image: 8-bit unsigned 1-channel image @ref CV_8UC1
+@param uv input image: 8-bit unsigned 2-channel image @ref CV_8UC2
+@param prims draw primitives
+*/
+GAPI_EXPORTS_W GMat2 renderNV12(const GMat& y,
+                                const GMat& uv,
+                                const GArray<Prim>& prims);
+
+/** @brief Renders Media Frame
+
+Output media frame: cv::MediaFrame
+
+@param m_frame input image: cv::MediaFrame @ref cv::MediaFrame
+@param prims draw primitives
+*/
+GAPI_EXPORTS GFrame renderFrame(const GFrame& m_frame,
+                                const GArray<Prim>& prims);
+
+//! @} gapi_draw_api
+
+} // namespace draw
+} // namespace wip
+
+/**
+ * @brief This namespace contains G-API CPU rendering backend functions,
+ * structures, and symbols. See @ref gapi_draw for details.
+ */
+namespace render
+{
+namespace ocv
+{
+    GAPI_EXPORTS_W cv::GKernelPackage kernels();
+
+} // namespace ocv
+} // namespace render
+} // namespace gapi
+
+namespace detail
+{
+    template<> struct CompileArgTag<cv::gapi::wip::draw::freetype_font>
+    {
+        static const char* tag() { return "gapi.freetype_font"; }
+    };
+} // namespace detail
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_RENDER_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/render/render_types.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/render/render_types.hpp
new file mode 100644
index 000000000000..6d70e3a877dd
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/render/render_types.hpp
@@ -0,0 +1,359 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2020 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_RENDER_TYPES_HPP
+#define OPENCV_GAPI_RENDER_TYPES_HPP
+
+#include <string>
+#include <vector>
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/util/variant.hpp>
+#include <opencv2/gapi/own/exports.hpp>
+
+namespace cv
+{
+namespace gapi
+{
+namespace wip
+{
+namespace draw
+{
+
+/**
+ * @brief This structure specifies which FreeType font to use by FText primitives.
+ */
+struct freetype_font
+{
+    /*@{*/
+    std::string path; //!< The path to the font file (.ttf)
+    /*@{*/
+};
+
+//! @addtogroup gapi_draw_prims
+//! @{
+/**
+ * @brief This structure represents a text string to draw.
+ *
+ * Parameters match cv::putText().
+ */
+struct GAPI_EXPORTS_W_SIMPLE Text
+{
+    /**
+     * @brief Text constructor
+     *
+     * @param text_               The text string to be drawn
+     * @param org_                The bottom-left corner of the text string in the image
+     * @param ff_                 The font type, see #HersheyFonts
+     * @param fs_                 The font scale factor that is multiplied by the font-specific base size
+     * @param color_              The text color
+     * @param thick_              The thickness of the lines used to draw a text
+     * @param lt_                 The line type. See #LineTypes
+     * @param bottom_left_origin_ When true, the image data origin is at the bottom-left corner. Otherwise, it is at the top-left corner
+     */
+    GAPI_WRAP
+    Text(const std::string& text_,
+         const cv::Point& org_,
+         int ff_,
+         double fs_,
+         const cv::Scalar& color_,
+         int thick_ = 1,
+         int lt_ = 8,
+         bool bottom_left_origin_ = false) :
+        text(text_), org(org_), ff(ff_), fs(fs_),
+        color(color_), thick(thick_), lt(lt_), bottom_left_origin(bottom_left_origin_)
+    {
+    }
+
+    GAPI_WRAP
+    Text() = default;
+
+    /*@{*/
+    GAPI_PROP_RW std::string text;               //!< The text string to be drawn
+    GAPI_PROP_RW cv::Point   org;                //!< The bottom-left corner of the text string in the image
+    GAPI_PROP_RW int         ff;                 //!< The font type, see #HersheyFonts
+    GAPI_PROP_RW double      fs;                 //!< The font scale factor that is multiplied by the font-specific base size
+    GAPI_PROP_RW cv::Scalar  color;              //!< The text color
+    GAPI_PROP_RW int         thick;              //!< The thickness of the lines used to draw a text
+    GAPI_PROP_RW int         lt;                 //!< The line type. See #LineTypes
+    GAPI_PROP_RW bool        bottom_left_origin; //!< When true, the image data origin is at the bottom-left corner. Otherwise, it is at the top-left corner
+    /*@{*/
+};
+
+/**
+ * @brief This structure represents a text string to draw using
+ * FreeType renderer.
+ *
+ * If OpenCV is built without FreeType support, this primitive will
+ * fail at the execution stage.
+ */
+struct FText
+{
+    /**
+     * @brief FText constructor
+     *
+     * @param text_ The text string to be drawn
+     * @param org_  The bottom-left corner of the text string in the image
+     * @param fh_   The height of text
+     * @param color_ The text color
+     */
+    FText(const std::wstring& text_,
+          const cv::Point& org_,
+          int fh_,
+          const cv::Scalar& color_) :
+        text(text_), org(org_), fh(fh_), color(color_)
+    {
+    }
+
+    FText() = default;
+
+    /*@{*/
+    std::wstring text;              //!< The text string to be drawn
+    cv::Point    org;               //!< The bottom-left corner of the text string in the image
+    int          fh;                //!< The height of text
+    cv::Scalar   color;             //!< The text color
+    /*@{*/
+};
+
+/**
+ * @brief This structure represents a rectangle to draw.
+ *
+ * Parameters match cv::rectangle().
+ */
+struct GAPI_EXPORTS_W_SIMPLE Rect
+{
+    /**
+     * @brief Rect constructor
+     *
+     * @param rect_   Coordinates of the rectangle
+     * @param color_  The rectangle color or brightness (grayscale image)
+     * @param thick_  The thickness of lines that make up the rectangle. Negative values, like #FILLED, mean that the function has to draw a filled rectangle
+     * @param lt_     The type of the line. See #LineTypes
+     * @param shift_  The number of fractional bits in the point coordinates
+     */
+    Rect(const cv::Rect& rect_,
+         const cv::Scalar& color_,
+         int thick_ = 1,
+         int lt_ = 8,
+         int shift_ = 0) :
+        rect(rect_), color(color_), thick(thick_), lt(lt_), shift(shift_)
+    {
+    }
+
+    GAPI_WRAP
+    Rect() = default;
+
+    /*@{*/
+    GAPI_PROP_RW cv::Rect   rect;  //!< Coordinates of the rectangle
+    GAPI_PROP_RW cv::Scalar color; //!< The rectangle color or brightness (grayscale image)
+    GAPI_PROP_RW int        thick; //!< The thickness of lines that make up the rectangle. Negative values, like #FILLED, mean that the function has to draw a filled rectangle
+    GAPI_PROP_RW int        lt;    //!< The type of the line. See #LineTypes
+    GAPI_PROP_RW int        shift; //!< The number of fractional bits in the point coordinates
+    /*@{*/
+};
+
+/**
+ * @brief This structure represents a circle to draw.
+ *
+ * Parameters match cv::circle().
+ */
+struct GAPI_EXPORTS_W_SIMPLE Circle
+{
+    /**
+     * @brief Circle constructor
+     *
+     * @param  center_ The center of the circle
+     * @param  radius_ The radius of the circle
+     * @param  color_  The color of the  circle
+     * @param  thick_  The thickness of the circle outline, if positive. Negative values, like #FILLED, mean that a filled circle is to be drawn
+     * @param  lt_     The Type of the circle boundary. See #LineTypes
+     * @param  shift_  The Number of fractional bits in the coordinates of the center and in the radius value
+     */
+    GAPI_WRAP
+    Circle(const cv::Point& center_,
+           int radius_,
+           const cv::Scalar& color_,
+           int thick_ = 1,
+           int lt_ = 8,
+           int shift_ = 0) :
+        center(center_), radius(radius_), color(color_), thick(thick_), lt(lt_), shift(shift_)
+    {
+    }
+
+    GAPI_WRAP
+    Circle() = default;
+
+    /*@{*/
+    GAPI_PROP_RW cv::Point  center; //!< The center of the circle
+    GAPI_PROP_RW int        radius; //!< The radius of the circle
+    GAPI_PROP_RW cv::Scalar color;  //!< The color of the  circle
+    GAPI_PROP_RW int        thick;  //!< The thickness of the circle outline, if positive. Negative values, like #FILLED, mean that a filled circle is to be drawn
+    GAPI_PROP_RW int        lt;     //!< The Type of the circle boundary. See #LineTypes
+    GAPI_PROP_RW int        shift;  //!< The Number of fractional bits in the coordinates of the center and in the radius value
+    /*@{*/
+};
+
+/**
+ * @brief This structure represents a line to draw.
+ *
+ * Parameters match cv::line().
+ */
+struct GAPI_EXPORTS_W_SIMPLE Line
+{
+    /**
+     * @brief Line constructor
+     *
+     * @param  pt1_    The first point of the line segment
+     * @param  pt2_    The second point of the line segment
+     * @param  color_  The line color
+     * @param  thick_  The thickness of line
+     * @param  lt_     The Type of the line. See #LineTypes
+     * @param  shift_  The number of fractional bits in the point coordinates
+    */
+    GAPI_WRAP
+    Line(const cv::Point& pt1_,
+         const cv::Point& pt2_,
+         const cv::Scalar& color_,
+         int thick_ = 1,
+         int lt_ = 8,
+         int shift_ = 0) :
+        pt1(pt1_), pt2(pt2_), color(color_), thick(thick_), lt(lt_), shift(shift_)
+    {
+    }
+
+    GAPI_WRAP
+    Line() = default; // NOTE(review): the int fields have no default initializers, so `Line l;` leaves them indeterminate
+
+    /*@{*/
+    GAPI_PROP_RW cv::Point  pt1;    //!< The first point of the line segment
+    GAPI_PROP_RW cv::Point  pt2;    //!< The second point of the line segment
+    GAPI_PROP_RW cv::Scalar color;  //!< The line color
+    GAPI_PROP_RW int        thick;  //!< The thickness of line
+    GAPI_PROP_RW int        lt;     //!< The Type of the line. See #LineTypes
+    GAPI_PROP_RW int        shift;  //!< The number of fractional bits in the point coordinates
+    /*@}*/
+};
+
+/**
+ * @brief This structure represents a mosaicing operation.
+ *
+ * Mosaicing is a very basic method to obfuscate regions in the image.
+ */
+struct GAPI_EXPORTS_W_SIMPLE Mosaic
+{
+    /**
+     * @brief Mosaic constructor
+     *
+     * @param mos_    Coordinates of the mosaic
+     * @param cellSz_ Cell size (same for X, Y)
+     * @param decim_  Decimation (0 stands for no decimation)
+    */
+    Mosaic(const cv::Rect& mos_,
+           int cellSz_,
+           int decim_) :
+        mos(mos_), cellSz(cellSz_), decim(decim_)
+    {
+    }
+
+    GAPI_WRAP
+    Mosaic() : cellSz(0), decim(0) {}
+
+    /*@{*/
+    GAPI_PROP_RW cv::Rect mos;    //!< Coordinates of the mosaic
+    GAPI_PROP_RW int      cellSz; //!< Cell size (same for X, Y)
+    GAPI_PROP_RW int      decim;  //!< Decimation (0 stands for no decimation)
+    /*@}*/
+};
+
+/**
+ * @brief This structure represents an image to draw.
+ *
+ * Image is blended on a frame using the specified mask.
+ */
+struct GAPI_EXPORTS_W_SIMPLE Image
+{
+    /**
+     * @brief Image constructor
+     *
+     * @param  org_   The bottom-left corner of the image
+     * @param  img_   Image to draw
+     * @param  alpha_ Alpha channel for image to draw (same size and number of channels)
+    */
+    GAPI_WRAP
+    Image(const cv::Point& org_,
+          const cv::Mat& img_,
+          const cv::Mat& alpha_) :
+        org(org_), img(img_), alpha(alpha_)
+    {
+    }
+
+    GAPI_WRAP
+    Image() = default;
+
+    /*@{*/
+    GAPI_PROP_RW cv::Point org;   //!< The bottom-left corner of the image
+    GAPI_PROP_RW cv::Mat   img;   //!< Image to draw
+    GAPI_PROP_RW cv::Mat   alpha; //!< Alpha channel for image to draw (same size and number of channels)
+    /*@}*/
+};
+
+/**
+ * @brief This structure represents a polygon to draw.
+ */
+struct GAPI_EXPORTS_W_SIMPLE Poly
+{
+    /**
+     * @brief Poly constructor
+     *
+     * @param points_ Points to connect
+     * @param color_  The line color
+     * @param thick_  The thickness of line
+     * @param lt_     The Type of the line. See #LineTypes
+     * @param shift_  The number of fractional bits in the point coordinate
+    */
+    GAPI_WRAP
+    Poly(const std::vector<cv::Point>& points_,
+         const cv::Scalar& color_,
+         int thick_ = 1,
+         int lt_ = 8,
+         int shift_ = 0) :
+        points(points_), color(color_), thick(thick_), lt(lt_), shift(shift_)
+    {
+    }
+
+    GAPI_WRAP
+    Poly() = default; // NOTE(review): the int fields have no default initializers, so `Poly p;` leaves them indeterminate
+
+    /*@{*/
+    GAPI_PROP_RW std::vector<cv::Point> points;  //!< Points to connect
+    GAPI_PROP_RW cv::Scalar             color;   //!< The line color
+    GAPI_PROP_RW int                    thick;   //!< The thickness of line
+    GAPI_PROP_RW int                    lt;      //!< The Type of the line. See #LineTypes
+    GAPI_PROP_RW int                    shift;   //!< The number of fractional bits in the point coordinate
+    /*@}*/
+};
+
+using Prim  = util::variant
+    < Text
+    , FText
+    , Rect
+    , Circle
+    , Line
+    , Mosaic
+    , Image
+    , Poly
+    >;
+
+using Prims = std::vector<Prim>;
+//! @} gapi_draw_prims
+
+} // namespace draw
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_RENDER_TYPES_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/rmat.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/rmat.hpp
new file mode 100644
index 000000000000..46989191b34d
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/rmat.hpp
@@ -0,0 +1,160 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2020 Intel Corporation
+
+#ifndef OPENCV_GAPI_RMAT_HPP
+#define OPENCV_GAPI_RMAT_HPP
+
+#include <opencv2/gapi/gmat.hpp>
+#include <opencv2/gapi/own/exports.hpp>
+
+// Forward declaration
+namespace cv {
+namespace gapi {
+namespace s11n {
+struct IOStream;
+struct IIStream;
+} // namespace s11n
+} // namespace gapi
+} // namespace cv
+
+namespace cv {
+
+// "Remote Mat", a general class which provides an abstraction layer over the data
+// storage and placement (host, remote device etc) and allows to access this data.
+//
+// The device specific implementation is hidden in the RMat::IAdapter class
+//
+// The basic flow is the following:
+// * Backend which is aware of the remote device:
+//   - Implements own AdapterT class which is derived from RMat::IAdapter
+//   - Wraps device memory into RMat via make_rmat utility function:
+//         cv::RMat rmat = cv::make_rmat<AdapterT>(args);
+//
+// * End user:
+//   - Writes the code which works with RMats without any knowledge of the remote device:
+//     void func(const cv::RMat& in_rmat, cv::RMat& out_rmat) {
+//         // Fetch input data from the device, get mapped memory for output
+//         cv::RMat::View  in_view =  in_rmat.access(Access::R);
+//         cv::RMat::View out_view = out_rmat.access(Access::W);
+//         performCalculations(in_view, out_view);
+//         // data from out_view is transferred to the device when out_view is destroyed
+//     }
+/** \addtogroup gapi_data_structures
+ * @{
+ */
+class GAPI_EXPORTS RMat
+{
+public:
+    // A lightweight wrapper on image data:
+    // - Doesn't own the memory;
+    // - Doesn't implement copy semantics (it's assumed that a view is created each time
+    // wrapped data is being accessed);
+    // - Has an optional callback which is called when the view is destroyed.
+    class GAPI_EXPORTS View
+    {
+    public:
+        using DestroyCallback = std::function<void()>;
+        using stepsT = std::vector<size_t>;
+
+        View() = default;
+        View(const GMatDesc& desc, uchar* data, const stepsT& steps = {}, DestroyCallback&& cb = nullptr);
+        View(const GMatDesc& desc, uchar* data, size_t step, DestroyCallback&& cb = nullptr);
+
+        View(const View&) = delete;
+        View& operator=(const View&) = delete;
+        View(View&&) = default; // NOTE(review): the moved-from view still runs ~View(); assumes a moved-from std::function tests empty - confirm
+        View& operator=(View&& v);
+        ~View() { if (m_cb) m_cb(); } // fires the destroy callback, if one was set
+
+        cv::Size size() const { return m_desc.size; }
+        const std::vector<int>& dims() const { return m_desc.dims; }
+        int cols() const { return m_desc.size.width; }
+        int rows() const { return m_desc.size.height; }
+        int type() const;
+        int depth() const { return m_desc.depth; }
+        int chan() const { return m_desc.chan; }
+        size_t elemSize() const { return CV_ELEM_SIZE(type()); }
+        // Pointer to row y (and optionally column x); offsets are computed in bytes from the step values.
+        template<typename T = uchar> T* ptr(int y = 0) {
+            return reinterpret_cast<T*>(m_data + step()*y);
+        }
+        template<typename T = uchar> const T* ptr(int y = 0) const {
+            return reinterpret_cast<const T*>(m_data + step()*y);
+        }
+        template<typename T = uchar> T* ptr(int y, int x) {
+            return reinterpret_cast<T*>(m_data + step()*y + step(1)*x);
+        }
+        template<typename T = uchar> const T* ptr(int y, int x) const {
+            return reinterpret_cast<const T*>(m_data + step()*y + step(1)*x);
+        }
+        size_t step(size_t i = 0) const { GAPI_DbgAssert(i<m_steps.size()); return m_steps[i]; }
+        const stepsT& steps() const { return m_steps; }
+
+    private:
+        GMatDesc m_desc;
+        uchar* m_data = nullptr;
+        stepsT m_steps = {0u};
+        DestroyCallback m_cb = nullptr;
+    };
+
+    enum class Access { R, W };
+    class IAdapter
+    // Adapter class is going to be deleted and renamed as IAdapter
+    {
+    public:
+        virtual ~IAdapter() = default;
+        virtual GMatDesc desc() const = 0;
+        // Implementation is responsible for setting the appropriate callback to
+        // the view when accessed for writing, to ensure that the data from the view
+        // is transferred to the device when the view is destroyed
+        virtual View access(Access) = 0;
+        virtual void serialize(cv::gapi::s11n::IOStream&) {
+            GAPI_Error("Generic serialize method of RMat::IAdapter does nothing by default. "
+                                 "Please, implement it in derived class to properly serialize the object.");
+        }
+        virtual void deserialize(cv::gapi::s11n::IIStream&) {
+            GAPI_Error("Generic deserialize method of RMat::IAdapter does nothing by default. "
+                                 "Please, implement it in derived class to properly deserialize the object.");
+        }
+    };
+    using Adapter = IAdapter; // Keep backward compatibility
+    using AdapterP = std::shared_ptr<IAdapter>;
+
+    RMat() = default;
+    RMat(AdapterP&& a) : m_adapter(std::move(a)) {}
+    GMatDesc desc() const { GAPI_Assert(m_adapter != nullptr); return m_adapter->desc(); }
+
+    // Note: When accessed for write there is no guarantee that returned view
+    // will contain actual snapshot of the mapped device memory
+    // (no guarantee that fetch from a device is performed). The only
+    // guarantee is that when the view is destroyed, its data will be
+    // transferred to the device
+    View access(Access a) const { GAPI_Assert(m_adapter != nullptr); return m_adapter->access(a); }
+
+    // Cast underlying RMat adapter to the particular adapter type,
+    // return nullptr if underlying type is different
+    template<typename T> T* get() const
+    {
+        static_assert(std::is_base_of<IAdapter, T>::value, "T is not derived from IAdapter!");
+        GAPI_Assert(m_adapter != nullptr);
+        return dynamic_cast<T*>(m_adapter.get());
+    }
+
+    void serialize(cv::gapi::s11n::IOStream& os) const {
+        GAPI_Assert(m_adapter != nullptr); m_adapter->serialize(os); // same empty-RMat guard as get()
+    }
+
+private:
+    AdapterP m_adapter = nullptr;
+};
+
+template<typename T, typename... Ts>
+RMat make_rmat(Ts&&... args) { return { std::make_shared<T>(std::forward<Ts>(args)...) }; }
+/** @} */
+
+} //namespace cv
+
+#endif /* OPENCV_GAPI_RMAT_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/s11n.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/s11n.hpp
new file mode 100644
index 000000000000..a94f55c249af
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/s11n.hpp
@@ -0,0 +1,513 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2020-2021 Intel Corporation
+
+#ifndef OPENCV_GAPI_S11N_HPP
+#define OPENCV_GAPI_S11N_HPP
+
+#include <vector>
+#include <map>
+#include <unordered_map>
+#include <opencv2/gapi/s11n/base.hpp>
+#include <opencv2/gapi/gcomputation.hpp>
+#include <opencv2/gapi/rmat.hpp>
+#include <opencv2/gapi/media.hpp>
+#include <opencv2/gapi/util/util.hpp>
+
+// FIXME: caused by deserialize_runarg
+#if defined _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4702)
+#endif
+
+namespace cv {
+namespace gapi {
+
+/**
+* \addtogroup gapi_serialization
+* @{
+*/
+
+namespace detail {
+    GAPI_EXPORTS cv::GComputation getGraph(const std::vector<char> &bytes);
+
+    GAPI_EXPORTS cv::GMetaArgs getMetaArgs(const std::vector<char> &bytes);
+
+    GAPI_EXPORTS cv::GRunArgs getRunArgs(const std::vector<char> &bytes);
+
+    GAPI_EXPORTS std::vector<std::string> getVectorOfStrings(const std::vector<char> &bytes);
+
+    template<typename... Types>
+    cv::GCompileArgs getCompileArgs(const std::vector<char> &bytes);
+
+    template<typename... AdapterType>
+    cv::GRunArgs getRunArgsWithAdapters(const std::vector<char> &bytes);
+} // namespace detail
+
+/** @brief Serialize a graph represented by GComputation into an array of bytes.
+ *
+ * Check different overloads for more examples.
+ * @param c GComputation to serialize.
+ * @return serialized vector of bytes.
+ */
+GAPI_EXPORTS std::vector<char> serialize(const cv::GComputation &c);
+
+/** @overload
+ * @param ca GCompileArgs to serialize.
+ */
+GAPI_EXPORTS std::vector<char> serialize(const cv::GCompileArgs& ca);
+
+/** @overload
+ * @param ma GMetaArgs to serialize.
+ */
+GAPI_EXPORTS std::vector<char> serialize(const cv::GMetaArgs& ma);
+
+/** @overload
+ * @param ra GRunArgs to serialize.
+ */
+GAPI_EXPORTS std::vector<char> serialize(const cv::GRunArgs& ra);
+
+/** @overload
+ * @param vs std::vector<std::string> to serialize.
+ */
+GAPI_EXPORTS std::vector<char> serialize(const std::vector<std::string>& vs);
+
+/**
+ * @private
+ */
+template<typename T> static inline
+T deserialize(const std::vector<char> &bytes);
+
+/** @brief Deserialize GComputation from a byte array.
+ *
+ * Check different overloads for more examples.
+ * @param bytes serialized vector of bytes.
+ * @return deserialized GComputation object.
+ */
+template<> inline
+cv::GComputation deserialize(const std::vector<char> &bytes) {
+    return detail::getGraph(bytes);
+}
+
+/** @brief Deserialize GMetaArgs from a byte array.
+ *
+ * Check different overloads for more examples.
+ * @param bytes serialized vector of bytes.
+ * @return deserialized GMetaArgs object.
+ */
+template<> inline
+cv::GMetaArgs deserialize(const std::vector<char> &bytes) {
+    return detail::getMetaArgs(bytes);
+}
+
+/** @brief Deserialize GRunArgs from a byte array.
+ *
+ * Check different overloads for more examples.
+ * @param bytes serialized vector of bytes.
+ * @return deserialized GRunArgs object.
+ */
+template<> inline
+cv::GRunArgs deserialize(const std::vector<char> &bytes) {
+    return detail::getRunArgs(bytes);
+}
+
+/** @brief Deserialize std::vector<std::string> from a byte array.
+ *
+ * Check different overloads for more examples.
+ * @param bytes serialized vector of bytes.
+ * @return deserialized std::vector<std::string> object.
+ */
+template<> inline
+std::vector<std::string> deserialize(const std::vector<char> &bytes) {
+    return detail::getVectorOfStrings(bytes);
+}
+
+/**
+ * @brief Deserialize GCompileArgs which types were specified in the template from a byte array.
+ *
+ * @note cv::gapi::s11n::detail::S11N template specialization must be provided to make a custom type
+ * in GCompileArgs deserializable.
+ *
+ * @param bytes vector of bytes to deserialize GCompileArgs object from.
+ * @return GCompileArgs object.
+ * @see GCompileArgs cv::gapi::s11n::detail::S11N
+ */
+template<typename T, typename... Types> inline
+typename std::enable_if<std::is_same<T, GCompileArgs>::value, GCompileArgs>::
+type deserialize(const std::vector<char> &bytes) {
+    return detail::getCompileArgs<Types...>(bytes);
+}
+
+/**
+ * @brief Deserialize GRunArgs including RMat and MediaFrame objects if any from a byte array.
+ *
+ * Adapter types are specified in the template.
+ * @note To be used properly specified adapter types must overload their deserialize() method.
+ * @param bytes vector of bytes to deserialize GRunArgs object from.
+ * @return GRunArgs including RMat and MediaFrame objects if any.
+ * @see RMat MediaFrame
+ */
+template<typename T, typename AtLeastOneAdapterT, typename... AdapterTypes> inline
+typename std::enable_if<std::is_same<T, GRunArgs>::value, GRunArgs>::
+type deserialize(const std::vector<char> &bytes) {
+    return detail::getRunArgsWithAdapters<AtLeastOneAdapterT, AdapterTypes...>(bytes);
+}
+} // namespace gapi
+} // namespace cv
+
+namespace cv {
+namespace gapi {
+namespace s11n {
+
+/** @brief This structure is an interface for serialization routines.
+ *
+ * Its main purpose is to provide multiple overloads for operator<<()
+ * with basic C++ in addition to OpenCV/G-API types.
+ *
+ * This structure can be inherited and further extended with additional types.
+ *
+ * For example, it is utilized in cv::gapi::s11n::detail::S11N as input parameter
+ * in serialize() method.
+ */
+struct GAPI_EXPORTS IOStream {
+    virtual ~IOStream() = default;
+    // Define the native support for basic C++ types at the API level:
+    virtual IOStream& operator<< (bool) = 0;
+    virtual IOStream& operator<< (char) = 0;
+    virtual IOStream& operator<< (unsigned char) = 0;
+    virtual IOStream& operator<< (short) = 0;
+    virtual IOStream& operator<< (unsigned short) = 0;
+    virtual IOStream& operator<< (int) = 0;
+    virtual IOStream& operator<< (uint32_t) = 0;
+    virtual IOStream& operator<< (uint64_t) = 0;
+    virtual IOStream& operator<< (float) = 0;
+    virtual IOStream& operator<< (double) = 0;
+    virtual IOStream& operator<< (const std::string&) = 0;
+};
+
+/** @brief This structure is an interface for deserialization routines.
+ *
+ * Its main purpose is to provide multiple overloads for operator>>()
+ * with basic C++ in addition to OpenCV/G-API types.
+ *
+ * This structure can be inherited and further extended with additional types.
+ *
+ * For example, it is utilized in cv::gapi::s11n::detail::S11N as input parameter
+ * in deserialize() method.
+ */
+struct GAPI_EXPORTS IIStream {
+    virtual ~IIStream() = default;
+    virtual IIStream& operator>> (bool &) = 0;
+    virtual IIStream& operator>> (std::vector<bool>::reference) = 0;
+    virtual IIStream& operator>> (char &) = 0;
+    virtual IIStream& operator>> (unsigned char &) = 0;
+    virtual IIStream& operator>> (short &) = 0;
+    virtual IIStream& operator>> (unsigned short &) = 0;
+    virtual IIStream& operator>> (int &) = 0;
+    virtual IIStream& operator>> (float &) = 0;
+    virtual IIStream& operator>> (double &) = 0;
+    virtual IIStream& operator >> (uint32_t &) = 0;
+    virtual IIStream& operator >> (uint64_t &) = 0;
+    virtual IIStream& operator>> (std::string &) = 0;
+};
+
+namespace detail {
+GAPI_EXPORTS std::unique_ptr<IIStream> getInStream(const std::vector<char> &bytes);
+} // namespace detail
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+// S11N operators
+// Note: operators for basic types are defined in IIStream/IOStream
+
+// OpenCV types ////////////////////////////////////////////////////////////////
+
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::Point &pt);
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::Point &pt);
+
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::Point2f &pt);
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::Point2f &pt);
+
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::Point3f &pt);
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::Point3f &pt);
+
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::Size &sz);
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::Size &sz);
+
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::Rect &rc);
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::Rect &rc);
+
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::Scalar &s);
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::Scalar &s);
+
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::Mat &m);
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::Mat &m);
+
+// FIXME: for GRunArgs serialization
+#if !defined(GAPI_STANDALONE)
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::UMat & um);
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::UMat & um);
+#endif // !defined(GAPI_STANDALONE)
+
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::RMat &r);
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::RMat &r);
+
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::gapi::wip::IStreamSource::Ptr &issptr);
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::gapi::wip::IStreamSource::Ptr &issptr);
+
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::detail::VectorRef &vr);
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::detail::VectorRef &vr);
+
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::detail::OpaqueRef &opr);
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::detail::OpaqueRef &opr);
+
+/// @private -- Exclude this function from OpenCV documentation
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::MediaFrame &mf);
+/// @private -- Exclude this function from OpenCV documentation
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::MediaFrame &mf);
+
+// Generic STL types ////////////////////////////////////////////////////////////////
+template<typename K, typename V>
+IOStream& operator<< (IOStream& os, const std::map<K, V> &m) {
+    const uint32_t sz = static_cast<uint32_t>(m.size());
+    os << sz;
+    for (const auto& it : m) os << it.first << it.second;
+    return os;
+}
+template<typename K, typename V>
+IIStream& operator>> (IIStream& is, std::map<K, V> &m) {
+    m.clear();
+    uint32_t sz = 0u;
+    is >> sz;
+    for (std::size_t i = 0; i < sz; ++i) {
+        K k{};
+        V v{};
+        is >> k >> v;
+        m[k] = v;
+    }
+    return is;
+}
+
+template<typename K, typename V>
+IOStream& operator<< (IOStream& os, const std::unordered_map<K, V> &m) {
+    const uint32_t sz = static_cast<uint32_t>(m.size());
+    os << sz;
+    for (auto &&it : m) os << it.first << it.second;
+    return os;
+}
+template<typename K, typename V>
+IIStream& operator>> (IIStream& is, std::unordered_map<K, V> &m) {
+    m.clear();
+    uint32_t sz = 0u;
+    is >> sz;
+    for (std::size_t i = 0; i < sz; ++i) {
+        K k{};
+        V v{};
+        is >> k >> v;
+        m[k] = v;
+    }
+    return is;
+}
+
+template<typename T>
+IOStream& operator<< (IOStream& os, const std::vector<T> &ts) {
+    const uint32_t sz = static_cast<uint32_t>(ts.size());
+    os << sz;
+    for (auto &&v : ts) os << v;
+    return os;
+}
+template<typename T>
+IIStream& operator>> (IIStream& is, std::vector<T> &ts) {
+    uint32_t sz = 0u;
+    is >> sz;
+    if (sz == 0u) {
+        ts.clear();
+    }
+    else {
+        ts.resize(sz);
+        for (std::size_t i = 0; i < sz; ++i) is >> ts[i];
+    }
+    return is;
+}
+
+// Generic: variant serialization
+namespace detail {
+template<typename V> // put_v: end of the type list reached without matching the requested index
+IOStream& put_v(IOStream&, const V&, std::size_t) {
+    GAPI_Error("variant<<: requested index is invalid");
+}
+
+template<typename V, typename X, typename... Xs> // put_v: write alternative X when the countdown x hits 0, else try the next type
+IOStream& put_v(IOStream& os, const V& v, std::size_t x) {
+    return (x == 0u)
+        ? os << cv::util::get<X>(v)
+        : put_v<V, Xs...>(os, v, x-1);
+}
+
+template<typename V> // get_v: end of the type list reached without matching the requested index
+IIStream& get_v(IIStream&, V&, std::size_t, std::size_t) {
+    GAPI_Error("variant>>: requested index is invalid");
+}
+
+template<typename V, typename X, typename... Xs> // get_v: i counts up through the type list until it equals the target index gi
+IIStream& get_v(IIStream& is, V& v, std::size_t i, std::size_t gi) {
+    if (i == gi) {
+        X x{};
+        is >> x;
+        v = V{std::move(x)};
+        return is;
+    } else return get_v<V, Xs...>(is, v, i+1, gi);
+}
+} // namespace detail
+
+//! @overload
+template<typename... Ts>
+IOStream& operator<< (IOStream& os, const cv::util::variant<Ts...> &v) {
+    os << static_cast<uint32_t>(v.index());
+    return detail::put_v<cv::util::variant<Ts...>, Ts...>(os, v, v.index());
+}
+//! @overload
+template<typename... Ts>
+IIStream& operator>> (IIStream& is, cv::util::variant<Ts...> &v) {
+    uint32_t idx = 0u; // operator<< writes the alternative index as uint32_t, so read it back through the same overload
+    is >> idx;
+    GAPI_Assert(idx < sizeof...(Ts)); // reject an index that names no alternative
+    return detail::get_v<cv::util::variant<Ts...>, Ts...>(is, v, 0u, idx);
+}
+
+// FIXME: consider a better solution
+/// @private -- Exclude this function from OpenCV documentation
+template<typename... Ts>
+void getRunArgByIdx (IIStream& is, cv::util::variant<Ts...> &v, uint32_t idx) {
+    detail::get_v<cv::util::variant<Ts...>, Ts...>(is, v, 0u, idx); // fills v with alternative #idx; get_v hands back `is`, so assigning the result to `is` was a self-assignment
+}
+} // namespace s11n
+
+namespace detail
+{
+template<typename T> struct try_deserialize_comparg;
+
+template<> struct try_deserialize_comparg<std::tuple<>> {
+static cv::util::optional<GCompileArg> exec(const std::string&, cv::gapi::s11n::IIStream&) {
+        return { };
+    }
+};
+
+template<typename T, typename... Types>
+struct try_deserialize_comparg<std::tuple<T, Types...>> {
+static cv::util::optional<GCompileArg> exec(const std::string& tag, cv::gapi::s11n::IIStream& is) {
+    if (tag == cv::detail::CompileArgTag<T>::tag()) {
+        static_assert(cv::gapi::s11n::detail::has_S11N_spec<T>::value,
+            "cv::gapi::deserialize<GCompileArgs, Types...> expects Types to have S11N "
+            "specializations with deserialization callbacks!");
+        return cv::util::optional<GCompileArg>(
+            GCompileArg { cv::gapi::s11n::detail::S11N<T>::deserialize(is) });
+    }
+    return try_deserialize_comparg<std::tuple<Types...>>::exec(tag, is);
+}
+};
+
+template<typename ...T>
+struct deserialize_arg_with_adapter;
+
+template<typename RA, typename TA>
+struct deserialize_arg_with_adapter<RA, TA> {
+static GRunArg exec(cv::gapi::s11n::IIStream& is) {
+    std::unique_ptr<TA> ptr(new TA);
+    ptr->deserialize(is);
+    return GRunArg { RA(std::move(ptr)) };
+}
+};
+
+template<typename RA>
+struct deserialize_arg_with_adapter<RA, void> {
+static GRunArg exec(cv::gapi::s11n::IIStream&) {
+    GAPI_Error("No suitable adapter class found during RMat/MediaFrame deserialization. "
+                         "Please, make sure you've passed them in cv::gapi::deserialize() template");
+    return GRunArg{};
+}
+};
+
+template<typename... Types>
+struct deserialize_runarg {
+static GRunArg exec(cv::gapi::s11n::IIStream& is, uint32_t idx) {
+    if (idx == GRunArg::index_of<RMat>()) {
+        // Type or void (if not found)
+        using TA = typename cv::util::find_adapter_impl<RMat::IAdapter, Types...>::type;
+        return deserialize_arg_with_adapter<RMat, TA>::exec(is);
+    } else if (idx == GRunArg::index_of<MediaFrame>()) {
+        // Type or void (if not found)
+        using TA = typename cv::util::find_adapter_impl<MediaFrame::IAdapter, Types...>::type;
+        return deserialize_arg_with_adapter<MediaFrame, TA>::exec(is);
+    } else { // not an adapter holding type runarg - use default deserialization
+        GRunArg arg;
+        getRunArgByIdx(is, arg, idx);
+        return arg;
+    }
+}
+};
+
+template<typename... Types>
+inline cv::util::optional<GCompileArg> tryDeserializeCompArg(const std::string& tag,
+                                                             const std::vector<char>& sArg) {
+    std::unique_ptr<cv::gapi::s11n::IIStream> pArgIs = cv::gapi::s11n::detail::getInStream(sArg);
+    return try_deserialize_comparg<std::tuple<Types...>>::exec(tag, *pArgIs);
+}
+
+template<typename... Types>
+cv::GCompileArgs getCompileArgs(const std::vector<char> &sArgs) {
+    cv::GCompileArgs args;
+
+    std::unique_ptr<cv::gapi::s11n::IIStream> pIs = cv::gapi::s11n::detail::getInStream(sArgs);
+    cv::gapi::s11n::IIStream& is = *pIs;
+
+    uint32_t sz = 0;
+    is >> sz;
+    for (uint32_t i = 0; i < sz; ++i) {
+        std::string tag;
+        is >> tag;
+
+        std::vector<char> sArg;
+        is >> sArg;
+
+        cv::util::optional<GCompileArg> dArg =
+            cv::gapi::detail::tryDeserializeCompArg<Types...>(tag, sArg);
+
+        if (dArg.has_value())
+        {
+            args.push_back(dArg.value());
+        }
+    }
+
+    return args;
+}
+
+template<typename... AdapterTypes>
+cv::GRunArgs getRunArgsWithAdapters(const std::vector<char> &bytes) {
+    std::unique_ptr<cv::gapi::s11n::IIStream> pIs = cv::gapi::s11n::detail::getInStream(bytes);
+    cv::gapi::s11n::IIStream& is = *pIs;
+    cv::GRunArgs args;
+
+    uint32_t sz = 0;
+    is >> sz;
+    for (uint32_t i = 0; i < sz; ++i) {
+        uint32_t idx = 0;
+        is >> idx;
+        args.push_back(cv::gapi::detail::deserialize_runarg<AdapterTypes...>::exec(is, idx));
+    }
+
+    return args;
+}
+} // namespace detail
+/** @} */
+
+} // namespace gapi
+} // namespace cv
+
+#if defined _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif // OPENCV_GAPI_S11N_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/s11n/base.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/s11n/base.hpp
new file mode 100644
index 000000000000..760e8515f6ac
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/s11n/base.hpp
@@ -0,0 +1,80 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2020-2021 Intel Corporation
+
+#ifndef OPENCV_GAPI_S11N_BASE_HPP
+#define OPENCV_GAPI_S11N_BASE_HPP
+
+#include <opencv2/gapi/own/assert.hpp>
+#include <opencv2/gapi/own/exports.hpp>
+
+namespace cv {
+namespace gapi {
+
+/**
+ * @brief This namespace contains G-API serialization and
+ * deserialization functions and data structures.
+ */
+namespace s11n {
+struct IOStream;
+struct IIStream;
+
+namespace detail {
+
+//! @addtogroup gapi_serialization
+//! @{
+
+struct NotImplemented {
+};
+
+/** @brief This structure allows to implement serialization routines for custom types.
+ *
+ * The default S11N for custom types is not implemented.
+ *
+ * @note When providing an overloaded implementation for S11N with your type
+ * don't inherit it from NotImplemented structure.
+ *
+ * @note There are lots of overloaded >> and << operators for basic and OpenCV/G-API types
+ * which can be utilized when serializing a custom type.
+ *
+ * Example of usage:
+ * @snippet samples/cpp/tutorial_code/gapi/doc_snippets/api_ref_snippets.cpp S11N usage
+ *
+ */
+template<typename T>
+struct S11N: public NotImplemented {
+    /**
+     * @brief This function allows user to serialize their custom type.
+     *
+     * @note The default overload throws an exception if called. User need to
+     * properly overload the function to use it.
+     */
+    static void serialize(IOStream &, const T &) {
+        GAPI_Error("No serialization routine is provided!");
+    }
+    /**
+     * @brief This function allows user to deserialize their custom type.
+     *
+     * @note The default overload throws an exception if called. User need to
+     * properly overload the function to use it.
+     */
+    static T deserialize(IIStream &) {
+        GAPI_Error("No deserialization routine is provided!");
+    }
+};
+
+/// @private -- Exclude this struct from OpenCV documentation
+template<typename T> struct has_S11N_spec {
+    static constexpr bool value = !std::is_base_of<NotImplemented,
+                                        S11N<typename std::decay<T>::type>>::value;
+};
+//! @} gapi_serialization
+
+} // namespace detail
+} // namespace s11n
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_S11N_BASE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/stereo.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/stereo.hpp
new file mode 100644
index 000000000000..9b00267082ca
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/stereo.hpp
@@ -0,0 +1,85 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+#ifndef OPENCV_GAPI_STEREO_HPP
+#define OPENCV_GAPI_STEREO_HPP
+
+#include <opencv2/gapi/gmat.hpp>
+#include <opencv2/gapi/gscalar.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+
+namespace cv {
+namespace gapi {
+
+/**
+ * This enum specifies the format of the result that you get from @ref cv::gapi::stereo.
+ */
+enum class StereoOutputFormat {
+    DEPTH_FLOAT16, ///< Floating point 16 bit value, CV_16FC1.
+                   ///< This identifier is deprecated, use DEPTH_16F instead.
+    DEPTH_FLOAT32, ///< Floating point 32 bit value, CV_32FC1
+                   ///< This identifier is deprecated, use DEPTH_32F instead.
+    DISPARITY_FIXED16_11_5, ///< 16 bit signed: first bit for sign,
+                            ///< 10 bits for integer part,
+                            ///< 5 bits for fractional part.
+                            ///< This identifier is deprecated,
+                            ///< use DISPARITY_16Q_10_5 instead.
+    DISPARITY_FIXED16_12_4, ///< 16 bit signed: first bit for sign,
+                            ///< 11 bits for integer part,
+                            ///< 4 bits for fractional part.
+                            ///< This identifier is deprecated,
+                            ///< use DISPARITY_16Q_11_4 instead.
+    DEPTH_16F = DEPTH_FLOAT16, ///< Same as DEPTH_FLOAT16
+    DEPTH_32F = DEPTH_FLOAT32, ///< Same as DEPTH_FLOAT32
+    DISPARITY_16Q_10_5 = DISPARITY_FIXED16_11_5, ///< Same as DISPARITY_FIXED16_11_5
+    DISPARITY_16Q_11_4 = DISPARITY_FIXED16_12_4 ///< Same as DISPARITY_FIXED16_12_4
+};
+
+
+/**
+ * @brief This namespace contains G-API Operation Types for Stereo and
+ * related functionality.
+ */
+namespace calib3d {
+
+G_TYPED_KERNEL(GStereo, <GMat(GMat, GMat, const StereoOutputFormat)>, "org.opencv.stereo") {
+    static GMatDesc outMeta(const GMatDesc &left, const GMatDesc &right, const StereoOutputFormat of) {
+        GAPI_Assert(left.chan == 1);
+        GAPI_Assert(left.depth == CV_8U);
+
+        GAPI_Assert(right.chan == 1);
+        GAPI_Assert(right.depth == CV_8U);
+
+        switch(of) {
+            case StereoOutputFormat::DEPTH_FLOAT16:
+                return left.withDepth(CV_16FC1);
+            case StereoOutputFormat::DEPTH_FLOAT32:
+                return left.withDepth(CV_32FC1);
+            case StereoOutputFormat::DISPARITY_FIXED16_11_5:
+            case StereoOutputFormat::DISPARITY_FIXED16_12_4:
+                return left.withDepth(CV_16SC1);
+            default:
+                GAPI_Error("Unknown output format!");
+        }
+    }
+};
+
+} // namespace calib3d
+
+/** @brief Computes disparity/depth map for the specified stereo-pair.
+The function computes disparity or depth map depending on passed StereoOutputFormat argument.
+
+@param left 8-bit single-channel left image of @ref CV_8UC1 type.
+@param right 8-bit single-channel right image of @ref CV_8UC1 type.
+@param of enum specifying the output kind (depth or disparity) and the corresponding type
+*/
+GAPI_EXPORTS GMat stereo(const GMat& left,
+                         const GMat& right,
+                         const StereoOutputFormat of = StereoOutputFormat::DEPTH_FLOAT32);
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_STEREO_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/cap.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/cap.hpp
new file mode 100644
index 000000000000..9c2185c1ab46
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/cap.hpp
@@ -0,0 +1,149 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019 Intel Corporation
+
+#ifndef OPENCV_GAPI_STREAMING_CAP_HPP
+#define OPENCV_GAPI_STREAMING_CAP_HPP
+
+/**
+ * YOUR ATTENTION PLEASE!
+ *
+ * This is a header-only implementation of cv::VideoCapture-based
+ * Stream source.  It is not built by default with G-API as G-API
+ * doesn't depend on videoio module.
+ *
+ * If you want to use it in your application, please make sure
+ * videoio is available in your OpenCV package and is linked to your
+ * application.
+ *
+ * Note for developers: please don't put videoio dependency in G-API
+ * because of this file.
+ */
+#include <chrono>
+#include <map>
+
+#include <opencv2/videoio.hpp>
+#include <opencv2/gapi/garg.hpp>
+#include <opencv2/gapi/streaming/meta.hpp>
+
+namespace cv {
+namespace gapi {
+namespace wip {
+
+/**
+ * @brief OpenCV's VideoCapture-based streaming source.
+ *
+ * This class implements IStreamSource interface.
+ * Its constructor takes the same parameters as cv::VideoCapture does.
+ *
+ * Please make sure that videoio OpenCV module is available before using
+ * this in your application (G-API doesn't depend on it directly).
+ *
+ * @note stream sources are passed to G-API via shared pointers, so
+ *  please use gapi::make_src<> to create objects and ptr() to pass a
+ *  GCaptureSource to cv::gin().
+ */
+class GCaptureSource: public IStreamSource
+{
+public:
+    explicit GCaptureSource(int id, const std::map<int, double> &properties = {})
+        : cap(id) { prep(properties); }
+
+    explicit GCaptureSource(const std::string &path,
+                            const std::map<int, double> &properties = {})
+        : cap(path) { prep(properties); }
+
+    void set(int propid, double value) {
+        cap.set(propid, value);
+    }
+
+    // TODO: Add more constructor overloads to make it
+    // fully compatible with VideoCapture's interface.
+
+protected:
+    cv::VideoCapture cap;
+    cv::Mat first;
+    bool first_pulled = false;
+    int64_t counter = 0;
+
+    void prep(const std::map<int, double> &properties)
+    {
+        for (const auto &it : properties) {
+            cap.set(it.first, it.second);
+        }
+
+        // Prepare first frame to report its meta to engine
+        // when needed
+        GAPI_Assert(first.empty());
+        cv::Mat tmp;
+        if (!cap.read(tmp))
+        {
+            GAPI_Error("Couldn't grab the very first frame");
+        }
+        // NOTE: Some decode/media VideoCapture backends continue
+        // owning the video buffer under cv::Mat so in order to
+        // process it safely in a highly concurrent pipeline, clone()
+        // is the only right way.
+        first = tmp.clone();
+    }
+
+    virtual bool pull(cv::gapi::wip::Data &data) override
+    {
+        if (!first_pulled)
+        {
+            GAPI_Assert(!first.empty());
+            first_pulled = true;
+            data = first; // no need to clone here since it was cloned already
+        }
+        else
+        {
+            if (!cap.isOpened()) return false;
+
+            cv::Mat frame;
+            if (!cap.read(frame))
+            {
+                // end-of-stream happened
+                return false;
+            }
+            // Same reason to clone as in prep()
+            data = frame.clone();
+        }
+        // Tag data with seq_id/ts
+        const auto now = std::chrono::system_clock::now();
+        const auto dur = std::chrono::duration_cast<std::chrono::microseconds>
+            (now.time_since_epoch());
+        data.meta[cv::gapi::streaming::meta_tag::timestamp] = int64_t{dur.count()};
+        data.meta[cv::gapi::streaming::meta_tag::seq_id]    = int64_t{counter++};
+        return true;
+    }
+
+    virtual GMetaArg descr_of() const override
+    {
+        GAPI_Assert(!first.empty());
+        return cv::GMetaArg{cv::descr_of(first)};
+    }
+};
+
+// NB: Overload for using from python
+GAPI_EXPORTS_W cv::Ptr<IStreamSource>
+inline make_capture_src(const std::string& path,
+                        const std::map<int, double>& properties = {})
+{
+    return make_src<GCaptureSource>(path, properties);
+}
+
+// NB: Overload for using from python
+GAPI_EXPORTS_W cv::Ptr<IStreamSource>
+inline make_capture_src(const int id,
+                        const std::map<int, double>& properties = {})
+{
+    return make_src<GCaptureSource>(id, properties);
+}
+
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_STREAMING_CAP_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/desync.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/desync.hpp
new file mode 100644
index 000000000000..0e04f5beb93b
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/desync.hpp
@@ -0,0 +1,86 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2020-2021 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GSTREAMING_DESYNC_HPP
+#define OPENCV_GAPI_GSTREAMING_DESYNC_HPP
+
+#include <tuple>
+
+#include <opencv2/gapi/util/util.hpp>
+#include <opencv2/gapi/gtype_traits.hpp>
+#include <opencv2/gapi/garg.hpp>
+#include <opencv2/gapi/gcall.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+
+namespace cv {
+namespace gapi {
+namespace streaming {
+
+namespace detail {
+struct GDesync {
+    static const char *id() {
+        return "org.opencv.streaming.desync";
+    }
+
+    // A universal yield for desync.
+    // Yields output objects according to the input Types...
+    // Reuses gkernel machinery.
+    // FIXME: This function can be generic and declared in gkernel.hpp
+    //        (it is there already, but a part of GKernelType[M])
+    template<typename... R, int... IIs>
+    static std::tuple<R...> yield(cv::GCall &call, cv::detail::Seq<IIs...>) {
+        return std::make_tuple(cv::detail::Yield<R>::yield(call, IIs)...);
+    }
+};
+
+template<typename G>
+G desync(const G &g) {
+    cv::GKernel k{
+          GDesync::id()                                     // kernel id
+        , ""                                                // kernel tag
+        , [](const GMetaArgs &a, const GArgs &) {return a;} // outMeta callback
+        , {cv::detail::GTypeTraits<G>::shape}               // output Shape
+        , {cv::detail::GTypeTraits<G>::op_kind}             // input data kinds
+        , {cv::detail::GObtainCtor<G>::get()}               // output template ctors
+        , {cv::detail::GTypeTraits<G>::op_kind}             // output data kinds
+    };
+    cv::GCall call(std::move(k));
+    call.pass(g);
+    return std::get<0>(GDesync::yield<G>(call, cv::detail::MkSeq<1>::type()));
+}
+} // namespace detail
+
+/**
+ * @brief Starts a desynchronized branch in the graph.
+ *
+ * This operation takes a single G-API data object and returns a
+ * graph-level "duplicate" of this object.
+ *
+ * Operations which use this data object can be desynchronized
+ * from the rest of the graph.
+ *
+ * This operation has no effect when a GComputation is compiled with
+ * regular cv::GComputation::compile(), since cv::GCompiled objects
+ * always produce their full output vectors.
+ *
+ * This operation only makes sense when a GComputation is compiled in
+ * streaming mode with cv::GComputation::compileStreaming(). If this
+ * operation is used and there are desynchronized outputs, the user
+ * should use a special version of cv::GStreamingCompiled::pull()
+ * which produces an array of cv::util::optional<> objects.
+ *
+ * @note This feature is highly experimental now and is currently
+ * limited to a single GMat/GFrame argument only.
+ */
+GAPI_EXPORTS GMat desync(const GMat &g);
+GAPI_EXPORTS GFrame desync(const GFrame &f);
+
+} // namespace streaming
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_GSTREAMING_DESYNC_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/format.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/format.hpp
new file mode 100644
index 000000000000..739a3852a64d
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/format.hpp
@@ -0,0 +1,94 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2020 Intel Corporation
+
+#ifndef OPENCV_GAPI_GSTREAMING_FORMAT_HPP
+#define OPENCV_GAPI_GSTREAMING_FORMAT_HPP
+
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+
+namespace cv {
+namespace gapi {
+namespace streaming {
+
+GAPI_EXPORTS cv::GKernelPackage kernels();
+
+G_API_OP(GBGR, <GMat(GFrame)>, "org.opencv.streaming.BGR")
+{
+    static GMatDesc outMeta(const GFrameDesc& in) { return GMatDesc{CV_8U, 3, in.size}; }
+};
+
+G_API_OP(GY, <GMat(GFrame)>, "org.opencv.streaming.Y") {
+    static GMatDesc outMeta(const GFrameDesc& frameDesc) {
+        return GMatDesc { CV_8U, 1, frameDesc.size , false };
+    }
+};
+
+G_API_OP(GUV, <GMat(GFrame)>, "org.opencv.streaming.UV") {
+    static GMatDesc outMeta(const GFrameDesc& frameDesc) {
+        return GMatDesc { CV_8U, 2, cv::Size(frameDesc.size.width / 2, frameDesc.size.height / 2),
+                          false };
+    }
+};
+
+/** @brief Gets bgr plane from input frame
+
+@note Function textual ID is "org.opencv.streaming.BGR"
+
+@param in Input frame
+@return Image in BGR format
+*/
+GAPI_EXPORTS cv::GMat BGR(const cv::GFrame& in);
+
+/** @brief Extracts Y plane from media frame.
+
+Output image is 8-bit 1-channel image of @ref CV_8UC1.
+
+@note Function textual ID is "org.opencv.streaming.Y"
+
+@param frame input media frame.
+*/
+GAPI_EXPORTS GMat Y(const cv::GFrame& frame);
+
+/** @brief Extracts UV plane from media frame.
+
+Output image is 8-bit 2-channel image of @ref CV_8UC2.
+
+@note Function textual ID is "org.opencv.streaming.UV"
+
+@param frame input media frame.
+*/
+GAPI_EXPORTS GMat UV(const cv::GFrame& frame);
+} // namespace streaming
+
+//! @addtogroup gapi_transform
+//! @{
+/** @brief Makes a copy of the input image. Note that this copy may not be real
+(no actual data copied). Use this function to maintain graph contracts,
+e.g. when graph's input needs to be passed directly to output, like in Streaming mode.
+
+@note Function textual ID is "org.opencv.streaming.copy"
+
+@param in Input image
+@return Copy of the input
+*/
+GAPI_EXPORTS_W GMat copy(const GMat& in);
+
+/** @brief Makes a copy of the input frame. Note that this copy may not be real
+(no actual data copied). Use this function to maintain graph contracts,
+e.g. when graph's input needs to be passed directly to output, like in Streaming mode.
+
+@note Function textual ID is "org.opencv.streaming.copy"
+
+@param in Input frame
+@return Copy of the input
+*/
+GAPI_EXPORTS GFrame copy(const GFrame& in);
+//! @} gapi_transform
+
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_GSTREAMING_FORMAT_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/gstreamer/gstreamerpipeline.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/gstreamer/gstreamerpipeline.hpp
new file mode 100644
index 000000000000..c566656cb61a
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/gstreamer/gstreamerpipeline.hpp
@@ -0,0 +1,59 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+#ifndef OPENCV_GAPI_STREAMING_GSTREAMER_GSTREAMERPIPELINE_HPP
+#define OPENCV_GAPI_STREAMING_GSTREAMER_GSTREAMERPIPELINE_HPP
+
+#include <opencv2/gapi/streaming/gstreamer/gstreamersource.hpp>
+#include <opencv2/gapi/own/exports.hpp>
+
+#include <string>
+#include <unordered_map>
+#include <memory>
+
+namespace cv {
+namespace gapi {
+namespace wip {
+namespace gst {
+
+class GAPI_EXPORTS_W GStreamerPipeline
+{
+public:
+    class Priv;
+
+    GAPI_WRAP explicit GStreamerPipeline(const std::string& pipeline);
+    IStreamSource::Ptr getStreamingSource(const std::string& appsinkName,
+                                          const GStreamerSource::OutputType outputType =
+                                              GStreamerSource::OutputType::MAT);
+    virtual ~GStreamerPipeline();
+
+protected:
+    explicit GStreamerPipeline(std::unique_ptr<Priv> priv);
+
+    std::unique_ptr<Priv> m_priv;
+};
+
+} // namespace gst
+
+using GStreamerPipeline = gst::GStreamerPipeline;
+
+// NB: Function for using from python
+// FIXME: a separate function is created due to absence of wrappers for `shared_ptr<> `
+// Ideally would be to wrap the `GStreamerPipeline::getStreamingSource()` method as is
+GAPI_EXPORTS_W cv::Ptr<IStreamSource>
+inline get_streaming_source(cv::Ptr<GStreamerPipeline>& pipeline,
+                            const std::string& appsinkName,
+                            const GStreamerSource::OutputType outputType
+                                = GStreamerSource::OutputType::MAT)
+{
+    return pipeline->getStreamingSource(appsinkName, outputType);
+}
+
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_STREAMING_GSTREAMER_GSTREAMERPIPELINE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/gstreamer/gstreamersource.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/gstreamer/gstreamersource.hpp
new file mode 100644
index 000000000000..8b8a5ae3121f
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/gstreamer/gstreamersource.hpp
@@ -0,0 +1,97 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+#ifndef OPENCV_GAPI_STREAMING_GSTREAMER_GSTREAMERSOURCE_HPP
+#define OPENCV_GAPI_STREAMING_GSTREAMER_GSTREAMERSOURCE_HPP
+
+#include <opencv2/gapi/streaming/source.hpp>
+#include <opencv2/gapi/garg.hpp>
+
+#include <memory>
+
+namespace cv {
+namespace gapi {
+namespace wip {
+namespace gst {
+
+/**
+ * @brief OpenCV's GStreamer streaming source.
+ *        Streams cv::Mat-s/cv::MediaFrame from passed GStreamer pipeline.
+ *
+ * This class implements IStreamSource interface.
+ *
+ * To create GStreamerSource instance you need to pass 'pipeline' and, optionally, 'outputType'
+ * arguments into constructor.
+ * 'pipeline' should represent GStreamer pipeline in form of textual description.
+ * Almost any custom pipeline is supported which can be successfully run via gst-launch.
+ * The only two limitations are:
+ *      - there should be __one__ appsink element in the pipeline to pass data to OpenCV app.
+ *        Pipeline can actually contain many sink elements, but it must have one and only one
+ *        appsink among them.
+ *
+ *      - data passed to appsink should be video-frame in NV12 or GRAY8 format.
+ *
+ * 'outputType' is used to select type of output data to produce: 'cv::MediaFrame' or 'cv::Mat'.
+ * To produce 'cv::MediaFrame'-s you need to pass 'GStreamerSource::OutputType::FRAME' and,
+ * correspondingly, 'GStreamerSource::OutputType::MAT' to produce 'cv::Mat'-s.
+ * Please note, that in the last case, output 'cv::Mat' will be of BGR format, internal conversion
+ * from NV12 / GRAY8 GStreamer data will happen.
+ * Default value for 'outputType' is 'GStreamerSource::OutputType::MAT'.
+ *
+ * @note Stream sources are passed to G-API via shared pointers, so please use gapi::make_src<>
+ *       to create objects and ptr() to pass a GStreamerSource to cv::gin().
+ *
+ * @note You need to build OpenCV with GStreamer support to use this class.
+ */
+
+class GStreamerPipelineFacade;
+
+class GAPI_EXPORTS GStreamerSource : public IStreamSource
+{
+public:
+    class Priv;
+
+    // Indicates what type of data should be produced by GStreamerSource: cv::MediaFrame or cv::Mat
+    enum class OutputType {
+        FRAME,
+        MAT
+    };
+
+    GStreamerSource(const std::string& pipeline,
+                    const GStreamerSource::OutputType outputType =
+                        GStreamerSource::OutputType::MAT);
+    GStreamerSource(std::shared_ptr<GStreamerPipelineFacade> pipeline,
+                    const std::string& appsinkName,
+                    const GStreamerSource::OutputType outputType =
+                        GStreamerSource::OutputType::MAT);
+
+    bool pull(cv::gapi::wip::Data& data) override;
+    GMetaArg descr_of() const override;
+    ~GStreamerSource() override;
+
+protected:
+    explicit GStreamerSource(std::unique_ptr<Priv> priv);
+
+    std::unique_ptr<Priv> m_priv;
+};
+
+} // namespace gst
+
+using GStreamerSource = gst::GStreamerSource;
+
+// NB: Overload for using from python
+GAPI_EXPORTS_W cv::Ptr<IStreamSource>
+inline make_gst_src(const std::string& pipeline,
+                    const GStreamerSource::OutputType outputType =
+                    GStreamerSource::OutputType::MAT)
+{
+    return make_src<GStreamerSource>(pipeline, outputType);
+}
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_STREAMING_GSTREAMER_GSTREAMERSOURCE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/meta.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/meta.hpp
new file mode 100644
index 000000000000..cdd3d371cb45
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/meta.hpp
@@ -0,0 +1,80 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2020 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GSTREAMING_META_HPP
+#define OPENCV_GAPI_GSTREAMING_META_HPP
+
+#include <opencv2/gapi/gopaque.hpp>
+#include <opencv2/gapi/gcall.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+#include <opencv2/gapi/gtype_traits.hpp>
+
+namespace cv {
+namespace gapi {
+namespace streaming {
+
+// FIXME: the name is debatable
+namespace meta_tag {
+static constexpr const char * timestamp = "org.opencv.gapi.meta.timestamp";
+static constexpr const char * seq_id    = "org.opencv.gapi.meta.seq_id";
+} // namespace meta_tag
+
+namespace detail {
+struct GMeta {
+    static const char *id() {
+        return "org.opencv.streaming.meta";
+    }
+    // A universal yield for meta(), same as in GDesync
+    template<typename... R, int... IIs>
+    static std::tuple<R...> yield(cv::GCall &call, cv::detail::Seq<IIs...>) {
+        return std::make_tuple(cv::detail::Yield<R>::yield(call, IIs)...);
+    }
+    // Also a universal outMeta stub here
+    static GMetaArgs getOutMeta(const GMetaArgs &args, const GArgs &) {
+        return args;
+    }
+};
+} // namespace detail
+
+template<typename T, typename G>
+cv::GOpaque<T> meta(G g, const std::string &tag) {
+    using O = cv::GOpaque<T>;
+    cv::GKernel k{
+          detail::GMeta::id()                    // kernel id
+        , tag                                    // kernel tag. Use meta tag here
+        , &detail::GMeta::getOutMeta             // outMeta callback
+        , {cv::detail::GTypeTraits<O>::shape}    // output Shape
+        , {cv::detail::GTypeTraits<G>::op_kind}  // input data kinds
+        , {cv::detail::GObtainCtor<O>::get()}    // output template ctors
+        , {cv::detail::GTypeTraits<O>::op_kind}  // output data kind
+    };
+    cv::GCall call(std::move(k));
+    call.pass(g);
+    return std::get<0>(detail::GMeta::yield<O>(call, cv::detail::MkSeq<1>::type()));
+}
+
+template<typename G>
+cv::GOpaque<int64_t> timestamp(G g) {
+    return meta<int64_t>(g, meta_tag::timestamp);
+}
+
+template<typename G>
+cv::GOpaque<int64_t> seq_id(G g) {
+    return meta<int64_t>(g, meta_tag::seq_id);
+}
+
+template<typename G>
+cv::GOpaque<int64_t> seqNo(G g) {
+    // Old name, compatibility only
+    return seq_id(g);
+}
+
+} // namespace streaming
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_GSTREAMING_META_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/accel_types.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/accel_types.hpp
new file mode 100644
index 000000000000..b670aebd1d44
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/accel_types.hpp
@@ -0,0 +1,76 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2022 Intel Corporation
+
+#ifndef GAPI_STREAMING_ONEVPL_ACCEL_TYPES_HPP
+#define GAPI_STREAMING_ONEVPL_ACCEL_TYPES_HPP
+
+#include <limits>
+#include <string>
+
+#include "opencv2/gapi/own/exports.hpp" // GAPI_EXPORTS
+
+namespace cv {
+namespace gapi {
+namespace wip {
+namespace onevpl {
+
+enum class AccelType: uint8_t {
+    HOST,
+    DX11,
+    VAAPI,
+
+    LAST_VALUE = std::numeric_limits<uint8_t>::max()
+};
+
+GAPI_EXPORTS const char* to_cstring(AccelType type);
+
+struct IDeviceSelector;
+struct GAPI_EXPORTS Device {
+    friend struct IDeviceSelector;
+    using Ptr = void*;
+
+    ~Device();
+    const std::string& get_name() const;
+    Ptr get_ptr() const;
+    AccelType get_type() const;
+private:
+    Device(Ptr device_ptr, const std::string& device_name,
+           AccelType device_type);
+
+    std::string name;
+    Ptr ptr;
+    AccelType type;
+};
+
+struct GAPI_EXPORTS Context {
+    friend struct IDeviceSelector;
+    using Ptr = void*;
+
+    ~Context();
+    Ptr get_ptr() const;
+    AccelType get_type() const;
+private:
+    Context(Ptr ctx_ptr, AccelType ctx_type);
+    Ptr ptr;
+    AccelType type;
+};
+
+GAPI_EXPORTS Device create_host_device();
+GAPI_EXPORTS Context create_host_context();
+
+GAPI_EXPORTS Device create_dx11_device(Device::Ptr device_ptr,
+                                       const std::string& device_name);
+GAPI_EXPORTS Context create_dx11_context(Context::Ptr ctx_ptr);
+
+GAPI_EXPORTS Device create_vaapi_device(Device::Ptr device_ptr,
+                                        const std::string& device_name);
+GAPI_EXPORTS Context create_vaapi_context(Context::Ptr ctx_ptr);
+} // namespace onevpl
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // GAPI_STREAMING_ONEVPL_ACCEL_TYPES_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/cfg_params.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/cfg_params.hpp
new file mode 100644
index 000000000000..0db9a86e58d0
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/cfg_params.hpp
@@ -0,0 +1,209 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+#ifndef OPENCV_GAPI_STREAMING_ONEVPL_CFG_PARAMS_HPP
+#define OPENCV_GAPI_STREAMING_ONEVPL_CFG_PARAMS_HPP
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include <opencv2/gapi/streaming/source.hpp>
+#include <opencv2/gapi/util/variant.hpp>
+
+namespace cv {
+namespace gapi {
+namespace wip {
+namespace onevpl {
+
+/**
+ * @brief Public class used to create onevpl::GSource instances.
+ *
+ * Class members available through methods @ref CfgParam::get_name() and @ref CfgParam::get_value() are used by
+ * onevpl::GSource inner logic to create or find oneVPL particular implementation
+ * (software/hardware, specific API version and etc.).
+ *
+ * @note Because oneVPL may provide several implementations which satisfy multiple (or a single) @ref CfgParam
+ * criteria, it is possible to configure `preferred` parameters. CfgParams of this kind are created
+ * using the `is_major = false` argument of the @ref CfgParam::create method and are not used for creating particular oneVPL implementations.
+ * Instead they fill out a "score table" to select the preferable implementation from the available list. The implementation that satisfies
+ * most of these optional params is chosen.
+ * If no optional CfgParam params are present, then the first available oneVPL implementation is applied.
+ * Please refer to https://spec.oneapi.io/versions/latest/elements/oneVPL/source/API_ref/VPL_disp_api_func.html?highlight=mfxcreateconfig#mfxsetconfigfilterproperty
+ * for using OneVPL configuration. In this schema `mfxU8 *name` represents @ref CfgParam::get_name() and
+ * `mfxVariant value` is @ref CfgParam::get_value()
+ */
+struct GAPI_EXPORTS CfgParam {
+    using name_t = std::string;
+    using value_t = cv::util::variant<uint8_t, int8_t,
+                                      uint16_t, int16_t,
+                                      uint32_t, int32_t,
+                                      uint64_t, int64_t,
+                                      float_t,
+                                      double_t,
+                                      void*,
+                                      std::string>;
+    /**
+     * @brief frames_pool_size_name
+     *
+     * Special configuration parameter name for onevpl::GSource:
+     *
+     * @note frames_pool_size_name allows allocating a surface pool of appropriate size to keep
+     * decoded frames ready in accelerator memory before
+     * they are consumed by the onevpl::GSource::pull operation. If you see
+     * a lot of WARNINGs about a lack of free surfaces then it's time to increase
+     * frames_pool_size_name, but be aware of the accelerator's free memory volume.
+     * If not set then the MFX implementation uses
+     * mfxFrameAllocRequest::NumFrameSuggested behavior
+     *
+     */
+    static constexpr const char *frames_pool_size_name() { return "frames_pool_size"; }
+    static CfgParam create_frames_pool_size(size_t value);
+
+    /**
+     * @brief acceleration_mode_name
+     *
+     * Special configuration parameter names for onevpl::GSource:
+     *
+     * @note acceleration_mode_name allows activating hardware acceleration &
+     * device memory management.
+     * Supported values:
+     * - MFX_ACCEL_MODE_VIA_D3D11   Will activate DX11 acceleration and will produce
+     * MediaFrames with data allocated in DX11 device memory
+     *
+     * If not set then the MFX implementation will use the default acceleration behavior:
+     * all decoding operations use default GPU resources but MediaFrames are produced
+     * with data allocated in host RAM
+     *
+     */
+    static constexpr const char *acceleration_mode_name() { return "mfxImplDescription.AccelerationMode"; }
+    static CfgParam create_acceleration_mode(uint32_t value);
+    static CfgParam create_acceleration_mode(const char* value);
+
+    /**
+     * @brief decoder_id_name
+     *
+     * Special configuration parameter names for onevpl::GSource:
+     *
+     * @note decoder_id_name allows specifying the VPL decoder type, which MUST be present
+     * in case of RAW video input data and MUST NOT be present as a CfgParam if the video
+     * stream is encapsulated into a container (*.mp4, *.mkv and so on). In the latter case
+     * onevpl::GSource will determine it automatically
+     * Supported values:
+     * - MFX_CODEC_AVC
+     * - MFX_CODEC_HEVC
+     * - MFX_CODEC_MPEG2
+     * - MFX_CODEC_VC1
+     * - MFX_CODEC_CAPTURE
+     * - MFX_CODEC_VP9
+     * - MFX_CODEC_AV1
+     *
+     */
+    static constexpr const char *decoder_id_name() { return "mfxImplDescription.mfxDecoderDescription.decoder.CodecID"; }
+    static CfgParam create_decoder_id(uint32_t value);
+    static CfgParam create_decoder_id(const char* value);
+
+    static constexpr const char *implementation_name() { return "mfxImplDescription.Impl"; } // each *_name() below returns one parameter's name string; the create_*() declared next to it returns a CfgParam
+    static CfgParam create_implementation(uint32_t value);
+    static CfgParam create_implementation(const char* value);
+
+    // Names for the "vpp_frames_pool_size" and "vpp.In.*" parameters, each followed by its create_*() declaration.
+    static constexpr const char *vpp_frames_pool_size_name() { return "vpp_frames_pool_size"; }
+    static CfgParam create_vpp_frames_pool_size(size_t value);
+
+    static constexpr const char *vpp_in_width_name() { return "vpp.In.Width"; }
+    static CfgParam create_vpp_in_width(uint16_t value);
+
+    static constexpr const char *vpp_in_height_name() { return "vpp.In.Height"; }
+    static CfgParam create_vpp_in_height(uint16_t value);
+
+    static constexpr const char *vpp_in_crop_x_name() { return "vpp.In.CropX"; }
+    static CfgParam create_vpp_in_crop_x(uint16_t value);
+
+    static constexpr const char *vpp_in_crop_y_name() { return "vpp.In.CropY"; }
+    static CfgParam create_vpp_in_crop_y(uint16_t value);
+
+    static constexpr const char *vpp_in_crop_w_name() { return "vpp.In.CropW"; }
+    static CfgParam create_vpp_in_crop_w(uint16_t value);
+
+    static constexpr const char *vpp_in_crop_h_name() { return "vpp.In.CropH"; }
+    static CfgParam create_vpp_in_crop_h(uint16_t value);
+
+    // Names for the "vpp.Out.*" parameters, each followed by its create_*() declaration.
+    static constexpr const char *vpp_out_fourcc_name() { return "vpp.Out.FourCC"; }
+    static CfgParam create_vpp_out_fourcc(uint32_t value);
+
+    static constexpr const char *vpp_out_chroma_format_name() { return "vpp.Out.ChromaFormat"; }
+    static CfgParam create_vpp_out_chroma_format(uint16_t value);
+
+    static constexpr const char *vpp_out_width_name() { return "vpp.Out.Width"; }
+    static CfgParam create_vpp_out_width(uint16_t value);
+
+    static constexpr const char *vpp_out_height_name() { return "vpp.Out.Height"; }
+    static CfgParam create_vpp_out_height(uint16_t value);
+
+    static constexpr const char *vpp_out_crop_x_name() { return "vpp.Out.CropX"; }
+    static CfgParam create_vpp_out_crop_x(uint16_t value);
+
+    static constexpr const char *vpp_out_crop_y_name() { return "vpp.Out.CropY"; }
+    static CfgParam create_vpp_out_crop_y(uint16_t value);
+
+    static constexpr const char *vpp_out_crop_w_name() { return "vpp.Out.CropW"; }
+    static CfgParam create_vpp_out_crop_w(uint16_t value);
+
+    static constexpr const char *vpp_out_crop_h_name() { return "vpp.Out.CropH"; }
+    static CfgParam create_vpp_out_crop_h(uint16_t value);
+
+    static constexpr const char *vpp_out_pic_struct_name() { return "vpp.Out.PicStruct"; }
+    static CfgParam create_vpp_out_pic_struct(uint16_t value);
+
+    static constexpr const char *vpp_out_framerate_n_name() { return "vpp.Out.FrameRateExtN"; }
+    static CfgParam create_vpp_out_framerate_n(uint32_t value);
+
+    static constexpr const char *vpp_out_framerate_d_name() { return "vpp.Out.FrameRateExtD"; }
+    static CfgParam create_vpp_out_framerate_d(uint32_t value);
+
+    /**
+     * Build a generic onevpl::GSource configuration parameter.
+     *
+     *@param name           parameter name.
+     *@param value          parameter value; it is forwarded into a CfgParam::value_t.
+     *@param is_major       TRUE if the parameter MUST be provided by the OneVPL inner implementation, FALSE if it is optional (used to resolve between multiple available implementations).
+     *
+     */
+    template<typename ValueType>
+    static CfgParam create(const std::string& name, ValueType&& value, bool is_major = true) {
+        // Wrap the forwarded value in value_t and hand it straight to the private constructor.
+        return CfgParam(name, CfgParam::value_t(std::forward<ValueType>(value)), is_major);
+    }
+
+    struct Priv; // only declared here; the object's state is reached through m_priv below
+
+    const name_t& get_name() const;
+    const value_t& get_value() const;
+    bool is_major() const;
+    std::string to_string() const;
+
+    bool operator==(const CfgParam& rhs) const;
+    bool operator< (const CfgParam& rhs) const;
+    bool operator!=(const CfgParam& rhs) const;
+
+    CfgParam& operator=(const CfgParam& src);
+    CfgParam& operator=(CfgParam&& src);
+    CfgParam(const CfgParam& src);
+    CfgParam(CfgParam&& src);
+    ~CfgParam();
+private:
+    CfgParam(const std::string& param_name, value_t&& param_value, bool is_major_param); // private: public construction goes through the static factory helpers (e.g. create())
+    std::shared_ptr<Priv> m_priv;
+};
+
+} //namespace onevpl
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_STREAMING_ONEVPL_CFG_PARAMS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/data_provider_interface.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/data_provider_interface.hpp
new file mode 100644
index 000000000000..ec683a7527ff
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/data_provider_interface.hpp
@@ -0,0 +1,105 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+#ifndef GAPI_STREAMING_ONEVPL_ONEVPL_DATA_PROVIDER_INTERFACE_HPP
+#define GAPI_STREAMING_ONEVPL_ONEVPL_DATA_PROVIDER_INTERFACE_HPP
+#include <exception>
+#include <memory>
+#include <string>
+
+#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS
+namespace cv {
+namespace gapi {
+namespace wip {
+namespace onevpl {
+
+struct GAPI_EXPORTS DataProviderException : public std::exception { // base of the data-provider exception types declared in this header
+    DataProviderException(const std::string& descr);
+    DataProviderException(std::string&& descr);
+
+    virtual ~DataProviderException() = default;
+    virtual const char* what() const noexcept override;
+private:
+    std::string reason; // owned description text; presumably what what() exposes (defined out of line - confirm)
+};
+
+struct GAPI_EXPORTS DataProviderSystemErrorException final : public DataProviderException { // final; built from an integer error code plus an optional description
+    DataProviderSystemErrorException(int error_code, const std::string& description = std::string());
+    ~DataProviderSystemErrorException() = default;
+};
+
+struct GAPI_EXPORTS DataProviderUnsupportedException final : public DataProviderException { // final; built from a description string
+    DataProviderUnsupportedException(const std::string& description);
+    ~DataProviderUnsupportedException() = default;
+};
+
+struct GAPI_EXPORTS DataProviderImplementationException : public DataProviderException { // not final, unlike the two exception types above, so it can be derived from
+    DataProviderImplementationException(const std::string& description);
+    ~DataProviderImplementationException() = default;
+};
+/**
+ * @brief Public interface that allows customizing how the video stream data used by
+ * onevpl::GSource is extracted, instead of reading the stream from a file (the default).
+ *
+ * An implementation's constructor MUST leave the object consistent and fully operable.
+ * If an error happens, the implementation MUST throw a `DataProviderException`-kind exception
+ *
+ * @note An implementation MUST manage the stream and any other resources it constructs by itself to avoid any kind of leak.
+ * For a simple implementation example please see `StreamDataProvider` in `tests/streaming/gapi_streaming_tests.cpp`
+ */
+struct GAPI_EXPORTS IDataProvider {
+    using Ptr = std::shared_ptr<IDataProvider>;
+    using mfx_codec_id_type = uint32_t;
+
+    /**
+     * NB: a forward declaration of mfxBitstream is supposed to be here,
+     * but with the current oneVPL implementation it is impossible to forward
+     * declare the untagged struct mfxBitstream.
+     *
+     * IDataProvider makes sense only when HAVE_VPL is ON. To keep the IDataProvider
+     * interface API/ABI compliant between the core library and the user application layer,
+     * a wrapper mfx_bitstream which inherits mfxBitstream is introduced in the private
+     * G-API code section, and only that wrapper is forward declared here
+     */
+    struct mfx_bitstream;
+
+    virtual ~IDataProvider() = default;
+
+    /**
+     * The function is used by onevpl::GSource to extract the codec id from the data
+     *
+     */
+    virtual mfx_codec_id_type get_mfx_codec_id() const = 0;
+
+    /**
+     * The function is used by onevpl::GSource to extract a binary data stream from the @ref IDataProvider
+     * implementation.
+     *
+     * It MUST throw a `DataProviderException`-kind exception in failure cases.
+     * It MUST return false on EOF, which is considered a non-failure case.
+     *
+     * @param in_out_bitsream the input-output reference to the MFX bitstream buffer, which MUST be empty at the first request
+     * to allow the implementation to allocate it by itself and return it back. Subsequent invocations of `fetch_bitstream_data`
+     * MUST pass the previously used in_out_bitsream to avoid skipping the remaining frames which haven't been consumed
+     * @return true for fetched data, false on EOF and throws exception on error
+     */
+    virtual bool fetch_bitstream_data(std::shared_ptr<mfx_bitstream> &in_out_bitsream) = 0;
+
+    /**
+     * The function is used by onevpl::GSource to check whether more binary data is available.
+     *
+     * It MUST return true at EOF and MUST NOT throw exceptions.
+     *
+     * @return boolean value which signals end of stream
+     */
+    virtual bool empty() const = 0;
+};
+} // namespace onevpl
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // GAPI_STREAMING_ONEVPL_ONEVPL_DATA_PROVIDER_INTERFACE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/default.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/default.hpp
new file mode 100644
index 000000000000..8b547e1aba9b
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/default.hpp
@@ -0,0 +1,29 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2022 Intel Corporation
+
+#ifndef OPENCV_GAPI_STREAMING_ONEVPL_UTILS_HPP
+#define OPENCV_GAPI_STREAMING_ONEVPL_UTILS_HPP
+
+#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS
+#include <opencv2/gapi/streaming/onevpl/cfg_params.hpp>
+#include <opencv2/gapi/streaming/onevpl/device_selector_interface.hpp>
+
+namespace cv {
+namespace gapi {
+namespace wip {
+namespace onevpl {
+
+/**
+ * @brief Returns the default device selector, chosen based on the given @p cfg_params.
+ */
+GAPI_EXPORTS std::shared_ptr<IDeviceSelector> getDefaultDeviceSelector(const std::vector<CfgParam>& cfg_params);
+
+} // namespace onevpl
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_STREAMING_ONEVPL_UTILS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/device_selector_interface.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/device_selector_interface.hpp
new file mode 100644
index 000000000000..2e2d879fba6b
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/device_selector_interface.hpp
@@ -0,0 +1,61 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+#ifndef GAPI_STREAMING_ONEVPL_DEVICE_SELECTOR_INTERFACE_HPP
+#define GAPI_STREAMING_ONEVPL_DEVICE_SELECTOR_INTERFACE_HPP
+
+#include <limits>
+#include <map>
+#include <string>
+#include <vector>
+
+#include <opencv2/gapi/streaming/onevpl/accel_types.hpp>
+
+namespace cv {
+namespace gapi {
+namespace wip {
+namespace onevpl {
+struct GAPI_EXPORTS IDeviceSelector { // abstract interface: implementations supply select_devices() and select_context()
+    using Ptr = std::shared_ptr<IDeviceSelector>;
+
+    struct GAPI_EXPORTS Score { // wrapper around a single int16_t priority value
+        friend struct IDeviceSelector;
+        using Type = int16_t;
+        static constexpr Type MaxActivePriority = std::numeric_limits<Type>::max();
+        static constexpr Type MinActivePriority = 0; // "active" priorities span [0, max]
+        static constexpr Type MaxPassivePriority = MinActivePriority - 1; // i.e. -1: "passive" priorities span [min, -1]
+        static constexpr Type MinPassivePriority = std::numeric_limits<Type>::min();
+
+        Score(Type val); // not explicit, so a Type value converts to Score implicitly
+        ~Score();
+
+        operator Type () const;
+        Type get() const;
+        friend bool operator< (Score lhs, Score rhs) { // orders scores by their numeric value
+            return lhs.get() < rhs.get();
+        }
+    private:
+        Type value;
+    };
+
+    using DeviceScoreTable = std::map<Score, Device>; // keyed and ordered by Score (operator< above); std::map keeps one Device per distinct score
+    using DeviceContexts = std::vector<Context>;
+
+    virtual ~IDeviceSelector();
+    virtual DeviceScoreTable select_devices() const = 0;
+    virtual DeviceContexts select_context() = 0;
+protected:
+    template<typename Entity, typename ...Args>
+    static Entity create(Args &&...args) { // helper for derived selectors: forwards args to Entity's constructor
+        return Entity(std::forward<Args>(args)...);
+    }
+};
+} // namespace onevpl
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // GAPI_STREAMING_ONEVPL_DEVICE_SELECTOR_INTERFACE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/source.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/source.hpp
new file mode 100644
index 000000000000..04dc2e246d3e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/onevpl/source.hpp
@@ -0,0 +1,94 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+#ifndef OPENCV_GAPI_STREAMING_ONEVPL_ONEVPL_SOURCE_HPP
+#define OPENCV_GAPI_STREAMING_ONEVPL_ONEVPL_SOURCE_HPP
+
+#include <opencv2/gapi/garg.hpp>
+#include <opencv2/gapi/streaming/meta.hpp>
+#include <opencv2/gapi/streaming/source.hpp>
+#include <opencv2/gapi/streaming/onevpl/cfg_params.hpp>
+#include <opencv2/gapi/streaming/onevpl/data_provider_interface.hpp>
+#include <opencv2/gapi/streaming/onevpl/device_selector_interface.hpp>
+
+namespace cv {
+namespace gapi {
+namespace wip {
+namespace onevpl {
+using CfgParams = std::vector<CfgParam>;
+
+/**
+ * @brief G-API streaming source based on OneVPL implementation.
+ *
+ * This class implements IStreamSource interface.
+ * Its constructor takes a source file path (in the usual way) or an @ref onevpl::IDataProvider
+ * interface implementation (for non-file-based sources). It also allows passing through
+ * oneVPL configuration parameters by using several @ref onevpl::CfgParam.
+ *
+ * @note stream sources are passed to G-API via shared pointers, so
+ *  please use gapi::wip::make_onevpl_src<> to create objects and ptr() to pass a
+ *  GSource to cv::gin().
+ */
+class GAPI_EXPORTS GSource : public IStreamSource
+{
+public:
+    struct Priv;
+
+    GSource(const std::string& filePath,
+            const CfgParams& cfg_params = CfgParams{});
+
+    GSource(const std::string& filePath,
+            const CfgParams& cfg_params,
+            const std::string& device_id,
+            void* accel_device_ptr,
+            void* accel_ctx_ptr);
+
+    GSource(const std::string& filePath,
+            const CfgParams& cfg_params,
+            const Device &device, const Context &ctx);
+
+    GSource(const std::string& filePath,
+            const CfgParams& cfg_params,
+            std::shared_ptr<IDeviceSelector> selector);
+
+
+    GSource(std::shared_ptr<IDataProvider> source,
+            const CfgParams& cfg_params = CfgParams{});
+
+    GSource(std::shared_ptr<IDataProvider> source,
+            const CfgParams& cfg_params,
+            const std::string& device_id,
+            void* accel_device_ptr,
+            void* accel_ctx_ptr);
+
+    GSource(std::shared_ptr<IDataProvider> source,
+            const CfgParams& cfg_params,
+            std::shared_ptr<IDeviceSelector> selector);
+
+    ~GSource() override;
+
+    bool pull(cv::gapi::wip::Data& data) override;
+    GMetaArg descr_of() const override;
+
+private:
+    explicit GSource(std::unique_ptr<Priv>&& impl);
+    std::unique_ptr<Priv> m_priv;
+};
+} // namespace onevpl
+
+using GVPLSource = onevpl::GSource;
+
+template<class... Args>
+GAPI_EXPORTS_W cv::Ptr<IStreamSource> inline make_onevpl_src(Args&&... args)
+{
+    return make_src<onevpl::GSource>(std::forward<Args>(args)...); // forwards every argument to make_src<onevpl::GSource>, i.e. to a GSource constructor
+}
+
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_STREAMING_ONEVPL_ONEVPL_SOURCE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/queue_source.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/queue_source.hpp
new file mode 100644
index 000000000000..bd385ed16e73
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/queue_source.hpp
@@ -0,0 +1,67 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+#ifndef OPENCV_GAPI_STREAMING_QUEUE_SOURCE_HPP
+#define OPENCV_GAPI_STREAMING_QUEUE_SOURCE_HPP
+
+#include <memory>                      // shared_ptr
+#include <type_traits>                 // is_base_of
+
+#include <opencv2/gapi/garg.hpp>       // GRunArgs
+#include <opencv2/gapi/gmetaarg.hpp>   // GMetaArg + all descr_of
+#include <opencv2/gapi/streaming/source.hpp> // IStreamSource
+
+namespace cv {
+namespace gapi {
+namespace wip {
+struct Data; // fwd-declare to avoid circular? header dependencies
+
+class GAPI_EXPORTS QueueSourceBase: public cv::gapi::wip::IStreamSource { // non-template base of QueueSource<T>; its state is reached through m_priv
+    class Priv;
+    std::shared_ptr<Priv> m_priv;
+    // FIXME: Need to understand how it works with IStreamSource's shared_from_this
+    // Can we avoid having too many shared_ptrs here?
+
+public:
+    explicit QueueSourceBase(const cv::GMetaArg &m);
+    void push(Data &&data);
+    virtual bool pull(Data &data) override;
+    virtual void halt() override;
+    virtual GMetaArg descr_of() const override;
+    virtual ~QueueSourceBase() = default;
+};
+
+/**
+ * @brief Queued streaming pipeline source.
+ * Typed front-end over QueueSourceBase: push(T) wraps the value into Data and forwards it to the base class.
+ */
+template<class T>
+class QueueSource final: public QueueSourceBase
+{
+public:
+    using Meta = decltype(cv::descr_of(T{})); // meta type is whatever cv::descr_of yields for a value-initialized T
+    explicit QueueSource(Meta m) : QueueSourceBase(GMetaArg{m}) {
+    }
+    void push(T t) {
+        QueueSourceBase::push(Data{t});
+    }
+};
+
+class GAPI_EXPORTS QueueInput { // holds a list of shared QueueSourceBase objects; built from GMetaArgs and convertible to cv::GRunArgs
+    std::vector<std::shared_ptr<QueueSourceBase> > m_sources;
+
+public:
+    explicit QueueInput(const cv::GMetaArgs &args);
+
+    void push(cv::GRunArgs &&ins);
+    operator cv::GRunArgs();
+};
+
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_STREAMING_QUEUE_SOURCE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/source.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/source.hpp
new file mode 100644
index 000000000000..267469ad1b30
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/source.hpp
@@ -0,0 +1,67 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019 Intel Corporation
+
+#ifndef OPENCV_GAPI_STREAMING_SOURCE_HPP
+#define OPENCV_GAPI_STREAMING_SOURCE_HPP
+
+#include <memory>                      // shared_ptr
+#include <type_traits>                 // is_base_of
+
+#include <opencv2/gapi/gmetaarg.hpp>   // GMetaArg
+
+
+namespace cv {
+namespace gapi {
+namespace wip {
+struct Data; // forward-declaration of Data to avoid circular dependencies
+
+/**
+ * @brief Abstract streaming pipeline source.
+ *
+ * Implement this interface if you want to customize the way data is
+ * streamed into GStreamingCompiled.
+ *
+ * Objects implementing this interface can be passed to
+ * GStreamingCompiled using setSource() with cv::gin(). Regular
+ * compiled graphs (GCompiled) don't support input objects of this
+ * type.
+ *
+ * Default cv::VideoCapture-based implementation is available, see
+ * cv::gapi::wip::GCaptureSource.
+ *
+ * @note stream sources are passed to G-API via shared pointers, so
+ *  please use ptr() when passing an IStreamSource implementation to
+ *  cv::gin().
+ */
+class IStreamSource: public std::enable_shared_from_this<IStreamSource>
+{
+public:
+    using Ptr = std::shared_ptr<IStreamSource>;
+    Ptr ptr() { return shared_from_this(); } // only valid once the object is already owned by a std::shared_ptr (as make_src arranges)
+    virtual bool pull(Data &data) = 0;
+    virtual GMetaArg descr_of() const = 0;
+    virtual void halt() {
+        // Do nothing by default to maintain compatibility with the existing sources...
+        // In fact needs to be decorated atop of the child classes to maintain the behavior
+        // FIXME: Make it mandatory in OpenCV 5.0
+    }
+    virtual ~IStreamSource() = default;
+};
+
+template<class T, class... Args>
+IStreamSource::Ptr inline make_src(Args&&... args)
+{
+    // Reject non-source types at compile time; the new object is shared_ptr-owned before ptr() is called on it.
+    static_assert(std::is_base_of<IStreamSource, T>::value,
+                  "T must implement the cv::gapi::IStreamSource interface!");
+    return std::make_shared<T>(std::forward<Args>(args)...)->ptr();
+}
+
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_STREAMING_SOURCE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/sync.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/sync.hpp
new file mode 100644
index 000000000000..5801e6f00a35
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/streaming/sync.hpp
@@ -0,0 +1,30 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+#ifndef OPENCV_GAPI_STREAMING_SYNC_HPP
+#define OPENCV_GAPI_STREAMING_SYNC_HPP
+
+namespace cv {
+namespace gapi {
+namespace streaming {
+
+enum class sync_policy { // identified by the "gapi.streaming.sync_policy" tag via the CompileArgTag specialization in this header
+    dont_sync,
+    drop
+};
+
+} // namespace streaming
+} // namespace gapi
+
+namespace detail {
+    template<> struct CompileArgTag<gapi::streaming::sync_policy> { // this header has no #include of its own: the primary CompileArgTag template must already be declared by the includer
+        static const char* tag() { return "gapi.streaming.sync_policy"; }
+    };
+
+} // namespace detail
+} // namespace cv
+
+#endif // OPENCV_GAPI_STREAMING_SYNC_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/any.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/any.hpp
new file mode 100644
index 000000000000..94451c771717
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/any.hpp
@@ -0,0 +1,190 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_UTIL_ANY_HPP
+#define OPENCV_GAPI_UTIL_ANY_HPP
+
+#include <memory>
+#include <type_traits>
+#include <typeinfo>
+#include <utility>
+
+#include <opencv2/gapi/util/throw.hpp>
+
+#if defined(_MSC_VER)
+   // disable MSVC warning on "multiple copy constructors specified"
+#  pragma warning(disable: 4521)
+#endif
+
+namespace cv
+{
+
+namespace internal
+{
+    template <class T, class Source>
+    T down_cast(Source operand) // dynamic_cast when RTTI is available; otherwise an unchecked static_cast plus a build-time warning
+    {
+#if defined(__GXX_RTTI) || defined(_CPPRTTI)
+       return dynamic_cast<T>(operand);
+#else
+#ifdef __GNUC__
+#warning used static cast instead of dynamic because RTTI is disabled
+#else
+#pragma message("WARNING: used static cast instead of dynamic because RTTI is disabled")
+#endif
+       return static_cast<T>(operand);
+#endif
+    }
+}
+
+namespace util
+{
+   class bad_any_cast : public std::bad_cast // passed to throw_error() by the reference-returning any_cast overloads when the cast fails
+   {
+   public:
+       virtual const char* what() const noexcept override
+       {
+           return "Bad any cast";
+       }
+   };
+
+   //modeled against C++17 std::any
+   // Holds at most one value of an arbitrary copy-constructible type behind a heap-allocated, type-erased holder.
+   class any
+   {
+   private:
+      struct holder;
+      using holder_ptr = std::unique_ptr<holder>;
+      struct holder
+      {
+         virtual holder_ptr clone() = 0; // copies the held value into a new holder; used by any's copy constructor
+         virtual ~holder() = default;
+      };
+
+      template <typename value_t>
+      struct holder_impl : holder
+      {
+         value_t v;
+         template<typename arg_t>
+         holder_impl(arg_t&& a) : v(std::forward<arg_t>(a)) {}
+         holder_ptr clone() override { return holder_ptr(new holder_impl (v));}
+      };
+
+      holder_ptr hldr; // null when the any is empty
+   public:
+      template<class value_t>
+      any(value_t&& arg) :  hldr(new holder_impl<typename std::decay<value_t>::type>( std::forward<value_t>(arg))) {}
+
+      any(any const& src) : hldr( src.hldr ? src.hldr->clone() : nullptr) {}
+      //simple hack in order not to write enable_if<not any> for the template constructor: a non-const any lvalue is routed to the const copy constructor
+      any(any & src) : any (const_cast<any const&>(src)) {}
+
+      any()       = default;
+      any(any&& ) = default;
+
+      any& operator=(any&&) = default;
+
+      any& operator=(any const& src)
+      {
+         any copy(src); // copy first, then swap the holders: *this is untouched if the copy throws
+         swap(*this, copy);
+         return *this;
+      }
+
+      template<class value_t>
+      friend value_t* any_cast(any* operand);
+
+      template<class value_t>
+      friend const value_t* any_cast(const any* operand);
+
+      template<class value_t>
+      friend value_t& unsafe_any_cast(any& operand);
+
+      template<class value_t>
+      friend const value_t& unsafe_any_cast(const any& operand);
+
+      friend void swap(any & lhs, any& rhs)
+      {
+         swap(lhs.hldr, rhs.hldr); // exchanges only the holder pointers
+      }
+
+   };
+
+   template<class value_t>
+   value_t* any_cast(any* operand)
+   {
+      // Pointer form: returns the stored value or nullptr. Like std::any_cast, a null operand yields nullptr (it used to be dereferenced).
+      if (operand == nullptr) { return nullptr; }
+      auto casted = internal::down_cast<any::holder_impl<typename std::decay<value_t>::type> *>(operand->hldr.get());
+      // casted is null for an empty any and, when RTTI is enabled, for a stored type other than decay<value_t>.
+      return casted ? & (casted->v) : nullptr;
+   }
+
+   template<class value_t>
+   const value_t* any_cast(const any* operand)
+   {
+      // Const pointer form: returns the stored value or nullptr. Like std::any_cast, a null operand yields nullptr (it used to be dereferenced).
+      if (operand == nullptr) { return nullptr; }
+      auto casted = internal::down_cast<any::holder_impl<typename std::decay<value_t>::type> *>(operand->hldr.get());
+      // casted is null for an empty any and, when RTTI is enabled, for a stored type other than decay<value_t>.
+      return casted ? & (casted->v) : nullptr;
+   }
+
+   template<class value_t>
+   value_t& any_cast(any& operand)
+   {
+      // Reference form: reuse the pointer overload and report a failed lookup through throw_error.
+      auto stored = any_cast<value_t>(&operand);
+      if (!stored)
+      {
+         throw_error(bad_any_cast());
+      }
+      return *stored;
+   }
+
+
+   template<class value_t>
+   const value_t& any_cast(const any& operand)
+   {
+      // Const reference form: reuse the pointer overload and report a failed lookup through throw_error.
+      auto stored = any_cast<value_t>(&operand);
+      if (!stored)
+      {
+         throw_error(bad_any_cast());
+      }
+      return *stored;
+   }
+
+   template<class value_t>
+   inline value_t& unsafe_any_cast(any& operand) // checked via any_cast only when DEBUG is defined; otherwise the caller must already know the stored type
+   {
+#ifdef DEBUG
+      return any_cast<value_t>(operand);
+#else
+      return static_cast<any::holder_impl<typename std::decay<value_t>::type> *>(operand.hldr.get())->v;
+#endif
+   }
+
+   template<class value_t>
+   inline const value_t& unsafe_any_cast(const any& operand) // const counterpart: checked via any_cast only when DEBUG is defined, unchecked static_cast otherwise
+   {
+#ifdef DEBUG
+      return any_cast<value_t>(operand);
+#else
+      return static_cast<any::holder_impl<typename std::decay<value_t>::type> *>(operand.hldr.get())->v;
+#endif
+   }
+
+} // namespace util
+} // namespace cv
+
+#if defined(_MSC_VER)
+   // Enable "multiple copy constructors specified" back
+#  pragma warning(default: 4521)
+#endif
+
+#endif // OPENCV_GAPI_UTIL_ANY_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/compiler_hints.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/compiler_hints.hpp
new file mode 100644
index 000000000000..a41a97145dc7
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/compiler_hints.hpp
@@ -0,0 +1,19 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+#ifndef OPENCV_GAPI_UTIL_COMPILER_HINTS_HPP
+#define OPENCV_GAPI_UTIL_COMPILER_HINTS_HPP
+
+namespace cv
+{
+namespace util
+{
+    //! Utility template function to prevent "unused" warnings by various compilers: it accepts any value by const reference and does nothing with it.
+    template<typename T> void suppress_unused_warning( const T& ) {}
+} // namespace util
+} // namespace cv
+
+#endif /* OPENCV_GAPI_UTIL_COMPILER_HINTS_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/copy_through_move.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/copy_through_move.hpp
new file mode 100644
index 000000000000..1a1121eb2187
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/copy_through_move.hpp
@@ -0,0 +1,34 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2020 Intel Corporation
+
+#ifndef OPENCV_GAPI_UTIL_COPY_THROUGH_MOVE_HPP
+#define OPENCV_GAPI_UTIL_COPY_THROUGH_MOVE_HPP
+
+#include <opencv2/gapi/util/type_traits.hpp> //decay_t
+
+namespace cv
+{
+namespace util
+{
+    //This is a tool to move-initialize the captures of a lambda in C++11 (which has no init-captures)
+    template<typename T>
+    struct copy_through_move_t{
+       T value;
+       const T& get() const {return value;}
+       T&       get()       {return value;}
+       copy_through_move_t(T&& g) : value(std::move(g)) {}
+       copy_through_move_t(copy_through_move_t&&) = default;
+       copy_through_move_t(copy_through_move_t const& lhs) : copy_through_move_t(std::move(const_cast<copy_through_move_t&>(lhs))) {} // NB: this "copy" casts away const and moves out of lhs, leaving the source moved-from
+    };
+
+    template<typename T>
+    copy_through_move_t<util::decay_t<T>> copy_through_move(T&& t){ // wraps t into copy_through_move_t of the decayed type; the wrapper's constructor takes an rvalue, so pass rvalues
+        return std::forward<T>(t);
+    }
+} // namespace util
+} // namespace cv
+
+#endif /* OPENCV_GAPI_UTIL_COPY_THROUGH_MOVE_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/optional.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/optional.hpp
new file mode 100644
index 000000000000..dca03cadad86
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/optional.hpp
@@ -0,0 +1,178 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_UTIL_OPTIONAL_HPP
+#define OPENCV_GAPI_UTIL_OPTIONAL_HPP
+
+#include <opencv2/gapi/util/variant.hpp>
+
+// A poor man's `optional` implementation, incompletely modeled against C++17 spec.
+namespace cv
+{
+namespace util
+{
+    // Error type passed to throw_error() by optional<T>::value() when the
+    // optional is empty.
+    class bad_optional_access : public std::exception
+    {
+    public:
+        // Fixed, human-readable description of the failure.
+        const char* what() const noexcept override { return "Bad optional access"; }
+    };
+
+    // TODO: nullopt_t
+
+    // Interface ///////////////////////////////////////////////////////////////
+    template<typename T> class optional
+    {
+    public:
+        // Constructors
+        // NB.: there were issues with Clang 3.8 when =default() was used
+        // instead {}
+        optional() {}                         // empty: m_holder default-constructs its first alternative (`nothing`)
+        optional(const optional&) = default;
+        explicit optional(T&&) noexcept;      // holds the given value, moved in
+        explicit optional(const T&) noexcept; // holds a copy of the given value
+        optional(optional&&) noexcept;        // takes over the source's content and resets the source
+        // TODO: optional(nullopt_t) noexcept;
+        // TODO: optional(const optional<U> &)
+        // TODO: optional(optional<U> &&)
+        // TODO: optional(Args&&...)
+        // TODO: optional(initializer_list<U>)
+        // TODO: optional(U&& value);
+
+        // Assignment
+        optional& operator=(const optional&) = default;
+        optional& operator=(optional&&);      // moves the source's content in, then resets the source
+
+        // Observers (operator-> and operator* go through value(), so they share its emptiness check)
+        T* operator-> ();
+        const T* operator-> () const;
+        T& operator* ();
+        const T& operator* () const;
+        // TODO: && versions
+
+        operator bool() const noexcept;       // same as has_value()
+        bool has_value() const noexcept;      // true when m_holder stores a T
+
+        T& value();                           // calls throw_error(bad_optional_access()) when empty
+        const T& value() const;               // const overload of the above
+        // TODO: && versions
+
+        template<class U>
+        T value_or(U &&default_value) const;  // copy of the held value, or default_value converted to T when empty
+
+        void swap(optional &other) noexcept;  // swaps the two holders
+        void reset() noexcept;                // drops the held value, if any
+        // TODO: emplace
+
+        // TODO: operator==, !=, <, <=, >, >=
+
+    private:
+        struct nothing {};                    // alternative that marks the empty state
+        util::variant<nothing, T> m_holder;   // stores either `nothing` or the T value
+    };
+
+    // Factory wrapping `value` into an optional of its decayed type; defined at the end of this header.
+    template<class T> optional<typename std::decay<T>::type> make_optional(T&& value);
+
+    // TODO: Args... and initializer_list versions
+
+    // Implementation //////////////////////////////////////////////////////////
+    // Engaged optional that takes over `src` by moving it into the holder.
+    template<class T>
+    optional<T>::optional(T&& src) noexcept : m_holder(std::move(src))
+    {}
+
+    // Engaged optional holding a copy of `src`.
+    template<class T>
+    optional<T>::optional(const T& src) noexcept : m_holder(src)
+    {}
+
+    // Move constructor: takes over the donor's holder, then resets the donor
+    // so it no longer reports a value.
+    template<class T>
+    optional<T>::optional(optional&& donor) noexcept : m_holder(std::move(donor.m_holder))
+    { donor.reset(); }
+
+    template<class T> optional<T>& optional<T>::operator=(optional&& donor)
+    {
+        m_holder = std::move(donor.m_holder); // take over the donor's content...
+        donor.reset();                        // ...and leave the donor empty
+        return *this;
+    }
+
+    template<class T> T* optional<T>::operator-> ()
+    {
+        return &this->operator*(); // address of the value obtained via operator*
+    }
+
+    template<class T> const T* optional<T>::operator-> () const
+    {
+        return &this->operator*(); // const overload: same, on the const operator*
+    }
+
+    template<class T> T& optional<T>::operator* ()
+    {
+        return value(); // checked access: value() reports an error on an empty optional
+    }
+
+    template<class T> const T& optional<T>::operator* () const
+    {
+        return value(); // const overload of the checked access
+    }
+
+    // Conversion to bool: true exactly when a value is held.
+    template<class T>
+    optional<T>::operator bool() const noexcept
+    { return has_value(); }
+
+    // True when the holder currently stores a T rather than the `nothing` tag.
+    template<class T>
+    bool optional<T>::has_value() const noexcept
+    { return util::holds_alternative<T>(m_holder); }
+
+    template<class T> T& optional<T>::value()
+    {
+        // Empty optional: report bad_optional_access (throw_error never returns).
+        if (!has_value()) { throw_error(bad_optional_access()); }
+        return util::get<T>(m_holder);
+    }
+
+    template<class T> const T& optional<T>::value() const
+    {
+        // Const overload: same emptiness check as the mutable version.
+        if (!has_value()) { throw_error(bad_optional_access()); }
+        return util::get<T>(m_holder);
+    }
+
+    // Returns a copy of the held value, or `default_value` converted to T when empty.
+    // The default is perfectly forwarded, so an rvalue default is moved rather than copied.
+    template<class T> template<class U>
+    T optional<T>::value_or(U &&default_value) const
+    { return has_value() ? value() : static_cast<T>(std::forward<U>(default_value)); }
+
+    // Exchanges the contents of the two optionals by swapping their holders.
+    template<class T>
+    void optional<T>::swap(optional<T>& other) noexcept
+    { m_holder.swap(other.m_holder); }
+
+    // Drops the held value, if any; an already-empty optional is left untouched.
+    template<class T> void optional<T>::reset() noexcept
+    {
+        if (has_value()) { m_holder = nothing{}; }
+    }
+
+    template<class T> optional<typename std::decay<T>::type> make_optional(T&& value)
+    {
+        using held_t = typename std::decay<T>::type; // the optional stores the decayed type
+        return optional<held_t>(std::forward<T>(value));
+    }
+} // namespace util
+} // namespace cv
+
+#endif // OPENCV_GAPI_UTIL_OPTIONAL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/throw.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/throw.hpp
new file mode 100644
index 000000000000..689bf583cfee
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/throw.hpp
@@ -0,0 +1,36 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_UTIL_THROW_HPP
+#define OPENCV_GAPI_UTIL_THROW_HPP
+
+#include <utility>  // std::forward
+
+#if !defined(__EXCEPTIONS)
+#include <stdlib.h>
+#include <stdio.h>
+#endif
+
+namespace cv
+{
+namespace util
+{
+// Throws `err` when exception support is compiled in; otherwise prints err.what() to stderr and aborts.
+template <class ExceptionType> [[noreturn]] void throw_error(ExceptionType &&err)
+{
+#if !defined(__EXCEPTIONS) && !defined(_CPPUNWIND)
+    fprintf(stderr, "An exception thrown! %s\n" , err.what());
+    fflush(stderr);
+    abort();
+#else
+    throw std::forward<ExceptionType>(err);
+#endif
+}
+} // namespace util
+} // namespace cv
+
+#endif // OPENCV_GAPI_UTIL_THROW_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/type_traits.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/type_traits.hpp
new file mode 100644
index 000000000000..637f18460bcd
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/type_traits.hpp
@@ -0,0 +1,31 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2020 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_UTIL_TYPE_TRAITS_HPP
+#define OPENCV_GAPI_UTIL_TYPE_TRAITS_HPP
+
+#include <type_traits>
+
+namespace cv
+{
+namespace util
+{
+    // C++11 back-ports of C++14 <type_traits> alias templates:
+    template< bool B, class T = void >
+    using enable_if_t = typename std::enable_if<B,T>::type;
+
+    template<typename T>
+    using decay_t = typename std::decay<T>::type;
+
+    // Not part of C++14, but commonly used: names V only when decay_t<T> and decay_t<U> differ (otherwise substitution fails)
+    template<class T, class U, class V = void>
+    using are_different_t = enable_if_t< !std::is_same<decay_t<T>, decay_t<U>>::value, V>;
+
+} // namespace util
+} // namespace cv
+
+#endif // OPENCV_GAPI_UTIL_TYPE_TRAITS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/util.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/util.hpp
new file mode 100644
index 000000000000..3be46d7ec2e8
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/util.hpp
@@ -0,0 +1,190 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018-2019 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_UTIL_HPP
+#define OPENCV_GAPI_UTIL_HPP
+
+#include <tuple>
+
+// \cond HIDDEN_SYMBOLS
+// This header file contains some generic utility functions which are
+// used in other G-API Public API headers.
+//
+// PLEASE don't put any stuff here if it is NOT used in public API headers!
+
+namespace cv
+{
+namespace detail
+{
+    // Compile-time integer sequence built recursively; used to enumerate the
+    // elements of template parameter packs. MkSeq<N>::type is Seq<0, 1, ..., N-1>.
+    template<int... I> struct Seq      { typedef Seq<I..., sizeof...(I)> next; };
+    template<int Sz>   struct MkSeq    { typedef typename MkSeq<Sz-1>::type::next type; };
+    template<>         struct MkSeq<0> { typedef Seq<> type; };
+
+    // all_satisfy<F, Ts...>::value is true when F<T>::value holds for every T in Ts...
+    // The recursion runs over std::tuple specializations; the primary template
+    // at the end adapts a plain type list to that tuple form.
+    template<template<class> class Pred, typename, typename...> struct all_satisfy;
+
+    // Recursive step: test the head, then the remaining tuple elements.
+    template<template<class> class Pred, typename Head, typename... Tail>
+    struct all_satisfy<Pred, std::tuple<Head, Tail...>>
+    {
+        static constexpr const bool value =
+            Pred<Head>::value && all_satisfy<Pred, std::tuple<Tail...>>::value;
+    };
+    // Terminal case: a single-element tuple.
+    template<template<class> class Pred, typename Only>
+    struct all_satisfy<Pred, std::tuple<Only>> { static constexpr const bool value = Pred<Only>::value; };
+
+    template<template<class> class Pred, typename First, typename... Rest>
+    struct all_satisfy : public all_satisfy<Pred, std::tuple<First, Rest...>> {};
+
+    // permute_tuple<C, Seq<Is...>>::type is the tuple made of C's elements picked
+    // at positions Is... (the sequence may be shorter than the tuple C).
+    template<class, class> struct permute_tuple;
+
+    template<class Tuple, int... Idx>
+    struct permute_tuple<Tuple, Seq<Idx...>>
+    {
+        typedef std::tuple<typename std::tuple_element<Idx, Tuple>::type...> type;
+    };
+
+    // all_but_last<T...>::type is std::tuple of T... with the final type dropped
+    // (sizeof...(T) - 1 elements). The work happens in the std::tuple
+    // specialization; the primary template accepts a plain type list.
+    template<typename T, typename... Ts> struct all_but_last;
+
+    template<typename Head, typename... Tail>
+    struct all_but_last<std::tuple<Head, Tail...>>
+    {
+        typedef std::tuple<Head, Tail...>                            C;    // the full list
+        typedef typename MkSeq<std::tuple_size<C>::value - 1>::type  S;    // positions of all but the last element
+        typedef typename permute_tuple<C, S>::type                   type;
+    };
+
+    template<typename First, typename... Rest>
+    struct all_but_last : public all_but_last<std::tuple<First, Rest...>> {};
+
+    template<typename... Ts>
+    using all_but_last_t = typename all_but_last<Ts...>::type;
+
+    // Compile-time maximum of a non-empty list of sizes (NB.: there's no constexpr std::max in C++11).
+    template<std::size_t S0, std::size_t... SS> struct max_of_t
+    {
+        static constexpr const std::size_t rest  = max_of_t<SS...>::value;
+        static constexpr const std::size_t value = (S0 < rest) ? rest : S0;
+    };
+    template<std::size_t S> struct max_of_t<S>
+    {
+        static constexpr const std::size_t value = S;
+    };
+
+    // contains<T, Us...>: true when T is the same type as one of Us... (Us may also come packed in one std::tuple).
+    template <typename...> struct contains : std::false_type {};
+
+    template <typename Needle, typename Head, typename... Tail>
+    struct contains<Needle, Head, Tail...>
+        : std::integral_constant<bool, std::is_same<Needle, Head>::value || contains<Needle, Tail...>::value> {};
+    template<typename Needle, typename... Packed>
+    struct contains<Needle, std::tuple<Packed...>> : std::integral_constant<bool, contains<Needle, Packed...>::value> {};
+
+    // all_unique<Ts...>: true when no type occurs more than once in Ts...
+    template <typename...> struct all_unique : std::true_type {};
+
+    template <typename Head, typename... Tail>
+    struct all_unique<Head, Tail...>
+        : std::integral_constant<bool, !contains<Head, Tail...>::value && all_unique<Tail...>::value> {};
+
+    // get() wraps a single object into a one-element tuple; an object that already is a std::tuple passes through.
+    template<typename> struct tuple_wrap_helper;
+
+    template<typename T> struct tuple_wrap_helper
+    {
+        typedef std::tuple<T> type;
+        static type get(T&& obj) { return std::make_tuple(std::move(obj)); }
+    };
+
+    template<typename... Objs>
+    struct tuple_wrap_helper<std::tuple<Objs...>>
+    {
+        typedef std::tuple<Objs...> type;
+        static type get(type&& objs) { return std::move(objs); }
+    };
+
+    // Maps any list of types to void (the usual detection-idiom helper).
+    template<typename... Ts> struct make_void { using type = void; };
+
+    template<typename... Ts>
+    using void_t = typename make_void<Ts...>::type;
+
+} // namespace detail
+
+namespace util
+{
+// Callable that merges the call operators of several functors (typically
+// lambdas) into a single overload set by inheriting from all of them.
+template<typename ...L> struct overload_lamba_set;
+
+// Terminal case: one functor.
+template<typename L1>
+struct overload_lamba_set<L1> : public L1
+{
+    using L1::operator();
+
+    overload_lamba_set(const L1& lambda) : L1(lambda) {}
+    overload_lamba_set(L1&& lambda) : L1(std::move(lambda)) {}
+};
+
+// Recursive case: the first functor plus the set built from the remaining ones.
+template<typename L1, typename ...L>
+struct overload_lamba_set<L1, L...> : public L1, public overload_lamba_set<L...>
+{
+    using base_type = overload_lamba_set<L...>;
+    using L1::operator();
+    using base_type::operator();
+
+    overload_lamba_set(const L1 &lambda1, L&& ...lambdas)
+        : L1(lambda1), base_type(std::forward<L>(lambdas)...) {}
+    overload_lamba_set(L1 &&lambda1, L&& ...lambdas)
+        : L1(std::move(lambda1)), base_type(std::forward<L>(lambdas)...) {}
+};
+
+template<typename... L> overload_lamba_set<L...> overload_lambdas(L&& ...lambdas)
+{
+    using set_t = overload_lamba_set<L...>; // one base class per supplied functor
+    return set_t(std::forward<L>(lambdas)...);
+}
+
+// Scans the candidate types for the first T with std::is_base_of<AdapterT, T>:
+// `type` names it (void if there is none), `found` tells whether any candidate matched.
+template<typename ...T> struct find_adapter_impl;
+
+// Last candidate.
+template<typename AdapterT, typename T>
+struct find_adapter_impl<AdapterT, T>
+{
+    static constexpr bool found = std::is_base_of<AdapterT, T>::value;
+    using type = typename std::conditional<found, T, void>::type;
+};
+
+// Current candidate followed by further ones.
+template<typename AdapterT, typename T, typename... Types>
+struct find_adapter_impl<AdapterT, T, Types...>
+{
+    using type = typename std::conditional<std::is_base_of<AdapterT, T>::value, T,
+                                           typename find_adapter_impl<AdapterT, Types...>::type>::type;
+    static constexpr bool found =
+        std::is_base_of<AdapterT, T>::value || find_adapter_impl<AdapterT, Types...>::found;
+};
+} // namespace util
+} // namespace cv
+
+// \endcond
+
+#endif //  OPENCV_GAPI_UTIL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/variant.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/variant.hpp
new file mode 100644
index 000000000000..48b55646c53d
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/util/variant.hpp
@@ -0,0 +1,667 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_UTIL_VARIANT_HPP
+#define OPENCV_GAPI_UTIL_VARIANT_HPP
+
+#include <array>
+#include <type_traits>
+
+#include <opencv2/gapi/util/compiler_hints.hpp>
+#include <opencv2/gapi/util/throw.hpp>
+#include <opencv2/gapi/util/util.hpp> // max_of_t
+#include <opencv2/gapi/util/type_traits.hpp>
+
+// A poor man's `variant` implementation, incompletely modeled against C++17 spec.
+namespace cv
+{
+namespace util
+{
+    namespace detail
+    {
+        // Searches First, Remaining... for Target; `value` is I when First matches, else the search continues at I + 1.
+        template<std::size_t I, typename Target, typename First, typename... Remaining>
+        struct type_list_index_helper
+        {
+            static constexpr const bool is_same = std::is_same<Target, First>::value;
+            static constexpr const std::size_t value = std::conditional<is_same, std::integral_constant<std::size_t, I>,
+                type_list_index_helper<I + 1, Target, Remaining...>>::type::value;
+        };
+        template<std::size_t I, typename Target, typename First>
+        struct type_list_index_helper<I, Target, First>
+        {
+            static_assert(std::is_same<Target, First>::value, "Type not found");
+            static constexpr const std::size_t value = I;
+        };
+    }
+
+    // Zero-based position of Target within Types... (a static_assert fires if it is absent).
+    template<typename Target, typename... Types> struct type_list_index
+    {
+        static constexpr const std::size_t value = detail::type_list_index_helper<0, Target, Types...>::value;
+    };
+
+    // Type found at position Index of the list Types...
+    template<std::size_t Index, class... Types > struct type_list_element
+    {
+        typedef typename std::tuple_element<Index, std::tuple<Types...>>::type type;
+    };
+
+    // Error type passed to throw_error() by util::get<>() when the variant
+    // does not currently hold the requested alternative.
+    class bad_variant_access : public std::exception
+    {
+    public:
+        // Fixed, human-readable description of the failure.
+        const char* what() const noexcept override { return "Bad variant access"; }
+    };
+
+    // Interface ///////////////////////////////////////////////////////////////
+    // Empty tag type. It carries no state, so any two monostate objects
+    // compare equal.
+    struct monostate {};
+    inline bool operator==(const util::monostate&, const util::monostate&)
+    { return true; }
+
+    template<typename... Ts> // FIXME: no references, arrays, and void
+    class variant // tagged union: raw storage for one of Ts... plus the index of the active alternative
+    {
+        // FIXME: Replace with std::aligned_union after gcc4.8 support is dropped
+        static constexpr const std::size_t S = cv::detail::max_of_t<sizeof(Ts)...>::value;  // largest alternative size
+        static constexpr const std::size_t A = cv::detail::max_of_t<alignof(Ts)...>::value; // strictest alternative alignment
+        using Memory = typename std::aligned_storage<S, A>::type[1]; // one-element array of raw storage
+
+        template<typename T> struct cctr_h { // copy-constructs a T in `memory` from the T at `from`
+            static void help(Memory memory, const Memory from) {
+                new (memory) T(*reinterpret_cast<const T*>(from));
+            }
+        };
+
+        template<typename T> struct mctr_h { // move-constructs a T in `memory` from the T at `pval`
+            static void help(Memory memory, void *pval) {
+                new (memory) T(std::move(*reinterpret_cast<T*>(pval)));
+            }
+        };
+
+        //FIXME: unify with cctr_h and mctr_h
+        template<typename T> struct cnvrt_ctor_h { // constructs a decay_t<T> in `memory`, forwarding the object at `from` as T
+            static void help(Memory memory, void* from) {
+                using util::decay_t;
+                new (memory) decay_t<T>(std::forward<T>(*reinterpret_cast<decay_t<T>*>(from)));
+            }
+        };
+
+        template<typename T> struct copy_h { // copy-assigns the T at `from` to the T at `to`
+            static void help(Memory to, const Memory from) {
+                *reinterpret_cast<T*>(to) = *reinterpret_cast<const T*>(from);
+            }
+        };
+
+        template<typename T> struct move_h { // move-assigns the T at `from` to the T at `to`
+            static void help(Memory to, Memory from) {
+                *reinterpret_cast<T*>(to) = std::move(*reinterpret_cast<T*>(from));
+            }
+        };
+
+        //FIXME: unify with copy_h and move_h
+        template<typename T> struct cnvrt_assign_h { // assigns to the decay_t<T> at `to`, forwarding the object at `from` as T
+            static void help(Memory to, void* from) {
+                using util::decay_t;
+                *reinterpret_cast<decay_t<T>*>(to) = std::forward<T>(*reinterpret_cast<decay_t<T>*>(from));
+            }
+        };
+
+        template<typename T> struct swap_h { // std::swap of the two stored T objects
+            static void help(Memory to, Memory from) {
+                std::swap(*reinterpret_cast<T*>(to), *reinterpret_cast<T*>(from));
+            }
+        };
+
+        template<typename T> struct dtor_h { // runs ~T() on the stored object
+            static void help(Memory memory) {
+                (void) memory; // MSVC warning
+                reinterpret_cast<T*>(memory)->~T();
+            }
+        };
+
+        template<typename T> struct equal_h { // compares the two stored T objects with operator==
+            static bool help(const Memory lhs, const Memory rhs) {
+                const T& t_lhs = *reinterpret_cast<const T*>(lhs);
+                const T& t_rhs = *reinterpret_cast<const T*>(rhs);
+                return t_lhs == t_rhs;
+            }
+        };
+
+        typedef void (*CCtr) (Memory, const Memory);  // Copy c-tor (variant)
+        typedef void (*MCtr) (Memory, void*);         // Generic move c-tor
+        typedef void (*Copy) (Memory, const Memory);  // Copy assignment
+        typedef void (*Move) (Memory, Memory);        // Move assignment
+
+        typedef void (*Swap) (Memory, Memory);        // Swap
+        typedef void (*Dtor) (Memory);                // Destructor
+
+        using  cnvrt_assgn_t   = void (*) (Memory, void*);  // Converting assignment (via std::forward)
+        using  cnvrt_ctor_t    = void (*) (Memory, void*);  // Converting constructor (via std::forward)
+
+        typedef bool (*Equal)(const Memory, const Memory); // Equality test (external)
+        // Dispatch tables: one helper pointer per alternative, in the order of Ts... (indexed by the alternative index)
+        static constexpr std::array<CCtr, sizeof...(Ts)> cctrs(){ return {{(&cctr_h<Ts>::help)...}};}
+        static constexpr std::array<MCtr, sizeof...(Ts)> mctrs(){ return {{(&mctr_h<Ts>::help)...}};}
+        static constexpr std::array<Copy, sizeof...(Ts)> cpyrs(){ return {{(&copy_h<Ts>::help)...}};}
+        static constexpr std::array<Move, sizeof...(Ts)> mvers(){ return {{(&move_h<Ts>::help)...}};}
+        static constexpr std::array<Swap, sizeof...(Ts)> swprs(){ return {{(&swap_h<Ts>::help)...}};}
+        static constexpr std::array<Dtor, sizeof...(Ts)> dtors(){ return {{(&dtor_h<Ts>::help)...}};}
+        // conditional_ref_t<cond, T>: T& (any reference stripped first) when cond is true, the plain non-reference T otherwise
+        template<bool cond, typename T>
+        struct conditional_ref : std::conditional<cond, typename std::remove_reference<T>::type&, typename std::remove_reference<T>::type > {};
+
+        template<bool cond, typename T>
+        using conditional_ref_t = typename conditional_ref<cond, T>::type;
+
+        // Converting-helper tables; is_lvalue_arg selects the T& (lvalue) or T (rvalue) flavour of each helper
+        template<bool is_lvalue_arg>
+        static constexpr std::array<cnvrt_assgn_t, sizeof...(Ts)> cnvrt_assgnrs(){
+            return {{(&cnvrt_assign_h<conditional_ref_t<is_lvalue_arg,Ts>>::help)...}};
+        }
+
+        template<bool is_lvalue_arg>
+        static constexpr std::array<cnvrt_ctor_t, sizeof...(Ts)> cnvrt_ctors(){
+            return {{(&cnvrt_ctor_h<conditional_ref_t<is_lvalue_arg,Ts>>::help)...}};
+        }
+
+        std::size_t m_index = 0; // index of the active alternative within Ts...
+
+    protected:
+        template<typename T, typename... Us> friend T& get(variant<Us...> &v);
+        template<typename T, typename... Us> friend const T& get(const variant<Us...> &v);
+        template<typename T, typename... Us> friend T* get_if(variant<Us...> *v) noexcept;
+        template<typename T, typename... Us> friend const T* get_if(const variant<Us...> *v) noexcept;
+
+        template<typename... Us> friend bool operator==(const variant<Us...> &lhs,
+                                                        const variant<Us...> &rhs);
+        Memory memory; // storage of the active alternative
+
+    public:
+        // Constructors
+        variant() noexcept; // constructs the first alternative
+        variant(const variant& other);
+        variant(variant&& other) noexcept;
+        // are_different_t is a SFINAE trick to avoid variant(T &&t) with T=variant
+        // for some reason, this version is called instead of variant(variant&& o) when
+        // variant is used in STL containers (examples: vector assignment).
+        template<
+            typename T,
+            typename = util::are_different_t<variant, T>
+        >
+        explicit variant(T&& t);
+        // template<class T, class... Args> explicit variant(Args&&... args);
+        // FIXME: other constructors
+
+        // Destructor
+        ~variant(); // destroys the active alternative
+
+        // Assignment
+        variant& operator=(const variant& rhs);
+        variant& operator=(variant &&rhs) noexcept;
+
+        // SFINAE trick to avoid operator=(T&&) with T=variant<>, see comment above
+        template<
+            typename T,
+            typename = util::are_different_t<variant, T>
+        >
+        variant& operator=(T&& t) noexcept;
+
+        // Observers
+        std::size_t index() const noexcept; // returns m_index
+        // FIXME: valueless_by_exception()
+
+        // Modifiers
+        // FIXME: emplace()
+        void swap(variant &rhs) noexcept;
+
+        // Non-C++17x!
+        template<typename T> static constexpr std::size_t index_of(); // position of T within Ts...
+    };
+
+    // FIXME: visit
+    template<typename T, typename... Types>
+    T* get_if(util::variant<Types...>* v) noexcept; // nullptr if v is null or *v does not hold T
+
+    template<typename T, typename... Types>
+    const T* get_if(const util::variant<Types...>* v) noexcept; // const overload of the above
+
+    template<typename T, typename... Types>
+    T& get(util::variant<Types...> &v); // reports bad_variant_access unless v holds T
+
+    template<typename T, typename... Types>
+    const T& get(const util::variant<Types...> &v); // const overload of the above
+
+    template<std::size_t Index, typename... Types>
+    typename util::type_list_element<Index, Types...>::type& get(util::variant<Types...> &v); // access by position
+
+    template<std::size_t Index, typename... Types>
+    const typename util::type_list_element<Index, Types...>::type& get(const util::variant<Types...> &v); // access by position (const)
+
+    template<typename T, typename... Types>
+    bool holds_alternative(const util::variant<Types...> &v) noexcept; // true if v currently holds T
+
+
+    // Visitor
+    namespace detail
+    {
+        struct visitor_interface {}; // empty base shared by the visitor templates declared after this namespace
+
+        // Class `visitor_return_type_deduction_helper`
+        // provides a common way to deduce `return_type` in the `visit` function
+        // for both lambdas and Visitor classes, keeping `visit` as the single
+        // invocation point. This helper class is required to unify the return type
+        // deduction mechanism: for a lambda the type can be taken from
+        // `decltype(visitor(get<0>(var)))`, but a Visitor class has no such
+        // operator() in the base case, because it provides
+        // `operator() (std::size_t index, ...)` instead. So the helper exposes an
+        // `operator()` that is used for Visitor classes only, and only to deduce the return type in visit().
+        template<typename R>
+        struct visitor_return_type_deduction_helper
+        {
+            using return_type = R;
+
+            // to be used in Lambda return type deduction context only
+            template<typename T>
+            return_type operator() (T&&);
+        };
+    }
+
+    // Visitor base that drops the alternative index and hands the value, plus any
+    // extra arguments, to Impl::visit(); `this` is cast to Impl* for that call.
+    template<typename R, typename Impl>
+    struct static_visitor : public detail::visitor_interface,
+                            public detail::visitor_return_type_deduction_helper<R>
+    {
+        // the helper base is responsible for return type deduction
+        using return_type = typename detail::visitor_return_type_deduction_helper<R>::return_type;
+        using detail::visitor_return_type_deduction_helper<R>::operator();
+        friend Impl;
+
+        template<typename VariantValue, typename ...Args>
+        return_type operator() (std::size_t index, VariantValue&& value, Args&& ...args)
+        {
+            suppress_unused_warning(index);
+            Impl* const self = static_cast<Impl*>(this);
+            return self->visit(std::forward<VariantValue>(value), std::forward<Args>(args)...);
+        }
+    };
+
+    // Visitor base that forwards the current alternative index (as a runtime
+    // argument), the value and any extra arguments to Impl::visit().
+    template<typename R, typename Impl>
+    struct static_indexed_visitor : public detail::visitor_interface,
+                                    public detail::visitor_return_type_deduction_helper<R>
+    {
+        // the helper base is responsible for return type deduction
+        using return_type = typename detail::visitor_return_type_deduction_helper<R>::return_type;
+        using detail::visitor_return_type_deduction_helper<R>::operator();
+        friend Impl;
+
+        template<typename VariantValue, typename ...Args>
+        return_type operator() (std::size_t Index, VariantValue&& value, Args&& ...args)
+        {
+            Impl* const self = static_cast<Impl*>(this);
+            return self->visit(Index, std::forward<VariantValue>(value),
+                               std::forward<Args>(args)...);
+        }
+    };
+
+    // Number of alternatives of a variant type, as a std::integral_constant.
+    template <class T> struct variant_size;
+
+    template <class... Alternatives>
+    struct variant_size<util::variant<Alternatives...>>
+        : std::integral_constant<std::size_t, sizeof...(Alternatives)> {};
+    // FIXME: T&&, const TT&& versions.
+
+    // Implementation //////////////////////////////////////////////////////////
+    // Default constructor: value-initializes the first alternative in place (m_index stays 0).
+    template<typename... Ts> variant<Ts...>::variant() noexcept
+    {
+        using first_t = typename util::type_list_element<0, Ts...>::type;
+        new (memory) first_t();
+    }
+
+    // Copy constructor: copy-constructs src's active alternative into this storage.
+    template<typename... Ts>
+    variant<Ts...>::variant(const variant &src) : m_index(src.m_index)
+    {
+        cctrs()[m_index](memory, src.memory);
+    }
+
+    // Move constructor: move-constructs src's active alternative into this storage.
+    template<typename... Ts>
+    variant<Ts...>::variant(variant &&src) noexcept : m_index(src.m_index)
+    {
+        mctrs()[m_index](memory, src.memory);
+    }
+
+    // Converting constructor: stores `t` as alternative decay_t<T>, forwarded as an lvalue or rvalue according to T.
+    template<typename... Ts> template<class T, typename>
+    variant<Ts...>::variant(T&& t) : m_index(util::type_list_index<util::decay_t<T>, Ts...>::value)
+    {
+        using held_t = util::decay_t<T>;
+        const constexpr bool from_lvalue = std::is_lvalue_reference<T>::value;
+        cnvrt_ctors<from_lvalue>()[m_index](memory, const_cast<held_t*>(&t));
+    }
+
+    // Destructor: runs the destructor of the currently active alternative.
+    template<typename... Ts> variant<Ts...>::~variant()
+    {
+        dtors()[m_index](memory);
+    }
+
+    // Copy assignment. Same alternative on both sides: copy-assign the value;
+    // otherwise destroy the current value and copy-construct rhs' one in its place.
+    template<typename... Ts>
+    variant<Ts...>& variant<Ts...>::operator=(const variant<Ts...> &rhs)
+    {
+        if (m_index == rhs.m_index)
+        {
+            cpyrs()[rhs.m_index](memory, rhs.memory);
+            return *this;
+        }
+        dtors()[m_index](memory);
+        cctrs()[rhs.m_index](memory, rhs.memory);
+        m_index = rhs.m_index;
+        return *this;
+    }
+
+    // Move assignment. Same alternative on both sides: move-assign the value;
+    // otherwise destroy the current value and move-construct rhs' one in its place.
+    template<typename... Ts>
+    variant<Ts...>& variant<Ts...>::operator=(variant<Ts...> &&rhs) noexcept
+    {
+        if (m_index == rhs.m_index)
+        {
+            mvers()[rhs.m_index](memory, rhs.memory);
+            return *this;
+        }
+        dtors()[m_index](memory);
+        mctrs()[rhs.m_index](memory, rhs.memory);
+        m_index = rhs.m_index;
+        return *this;
+    }
+
+    // Converting assignment from a value of one of the alternative types (exact match
+    // after decay, no implicit conversions). A const argument is accepted and copied from.
+    template<typename... Ts>
+    template<typename T, typename>
+    variant<Ts...>& variant<Ts...>::operator=(T&& t) noexcept
+    {
+        using decayed_t = util::decay_t<T>;
+        // FIXME: No version with implicit type conversion available!
+        const constexpr std::size_t t_index = util::type_list_index<decayed_t, Ts...>::value;
+        // Take the copy path for lvalues and for anything const-qualified.
+        const constexpr bool is_lvalue_arg = std::is_lvalue_reference<T>::value
+            || std::is_const<typename std::remove_reference<T>::type>::value;
+        // The helpers take void*, so drop constness (as the converting constructor does).
+        void* const src = const_cast<decayed_t*>(&t);
+        if (t_index != m_index)
+        {
+            (dtors()[m_index])(memory);
+            (cnvrt_ctors<is_lvalue_arg>()[t_index])(memory, src);
+            m_index = t_index;
+        }
+        else
+            (cnvrt_assgnrs<is_lvalue_arg>()[m_index])(memory, src);
+        return *this;
+    }
+
+    // Index (within Ts...) of the alternative currently held.
+    template<typename... Ts> std::size_t util::variant<Ts...>::index() const noexcept
+    {
+        return m_index;
+    }
+
+    // Swap: same alternative on both sides -> swap the stored values in place;
+    // otherwise rotate the two variants through a temporary using moves.
+    template<typename... Ts>
+    void variant<Ts...>::swap(variant<Ts...> &rhs) noexcept
+    {
+        if (m_index != rhs.index())
+        {
+            variant<Ts...> stash(std::move(*this));
+            *this = std::move(rhs);
+            rhs   = std::move(stash);
+            return;
+        }
+        swprs()[m_index](memory, rhs.memory);
+    }
+
+    // Compile-time position of alternative T within Ts...
+    template<typename... Ts> template<typename T>
+    constexpr std::size_t variant<Ts...>::index_of()
+    {
+        return util::type_list_index<T, Ts...>::value; // FIXME: tests!
+    }
+
+    // Pointer to the held T, or nullptr when `v` is null or holds another alternative.
+    template<typename T, typename... Types>
+    T* get_if(util::variant<Types...>* v) noexcept
+    {
+        const constexpr std::size_t wanted = util::type_list_index<T, Types...>::value;
+        if (!v || v->index() != wanted)
+            return nullptr;
+        // C-style cast is a workaround for ICC 2019
+        // (original code: return reinterpret_cast<T&>(v.memory);)
+        return (T*)(&v->memory);
+    }
+
+    // Const overload: pointer to the held T, or nullptr under the same conditions.
+    template<typename T, typename... Types>
+    const T* get_if(const util::variant<Types...>* v) noexcept
+    {
+        const constexpr std::size_t wanted = util::type_list_index<T, Types...>::value;
+        if (!v || v->index() != wanted)
+            return nullptr;
+        // C-style cast is a workaround for ICC 2019
+        // (original code: return reinterpret_cast<const T&>(v.memory);)
+        return (const T*)(&v->memory);
+    }
+
+    template<typename T, typename... Types>
+    T& get(util::variant<Types...> &v)
+    {
+        if (auto* p = get_if<T>(&v))
+            return *p;
+        else
+            throw_error(bad_variant_access());
+    }
+
+    template<typename T, typename... Types>
+    const T& get(const util::variant<Types...> &v)
+    {
+        if (auto* p = get_if<T>(&v))
+            return *p;
+        else
+            throw_error(bad_variant_access());
+    }
+
+    template<std::size_t Index, typename... Types>
+    typename util::type_list_element<Index, Types...>::type& get(util::variant<Types...> &v)
+    {
+        using ReturnType = typename util::type_list_element<Index, Types...>::type;
+        return const_cast<ReturnType&>(get<Index, Types...>(static_cast<const util::variant<Types...> &>(v)));
+    }
+
+    template<std::size_t Index, typename... Types>
+    const typename util::type_list_element<Index, Types...>::type& get(const util::variant<Types...> &v)
+    {
+        static_assert(Index < sizeof...(Types),
+                      "`Index` it out of bound of `util::variant` type list");
+        using ReturnType = typename util::type_list_element<Index, Types...>::type;
+        return get<ReturnType>(v);
+    }
+
+    template<typename T, typename... Types>
+    bool holds_alternative(const util::variant<Types...> &v) noexcept
+    {
+        return v.index() == util::variant<Types...>::template index_of<T>();
+    }
+
+#if defined(__GNUC__) && (__GNUC__ == 11 || __GNUC__ == 12)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
+    template<typename... Us> bool operator==(const variant<Us...> &lhs,
+                                             const variant<Us...> &rhs)
+    {
+        using V = variant<Us...>;
+
+        // Instantiate table only here since it requires operator== for <Us...>
+        // <Us...> should have operator== only if this one is used, not in general
+        static const std::array<typename V::Equal, sizeof...(Us)> eqs = {
+            {(&V::template equal_h<Us>::help)...}
+        };
+        if (lhs.index() != rhs.index())
+            return false;
+        return (eqs[lhs.index()])(lhs.memory, rhs.memory);
+    }
+
+#if defined(__GNUC__) && (__GNUC__ == 11 || __GNUC__ == 12)
+#pragma GCC diagnostic pop
+#endif
+
+    template<typename... Us> bool operator!=(const variant<Us...> &lhs,
+                                             const variant<Us...> &rhs)
+    {
+        return !(lhs == rhs);
+    }
+
+namespace detail
+{
+    // terminate recursion implementation for `non-void` ReturnType
+    template<typename ReturnType, std::size_t CurIndex, std::size_t ElemCount,
+             typename Visitor, typename Variant, typename... VisitorArgs>
+    ReturnType apply_visitor_impl(Visitor&&, Variant&,
+                                  std::true_type, std::false_type,
+                                  VisitorArgs&& ...)
+    {
+        return {};
+    }
+
+    // terminate recursion implementation for `void` ReturnType
+    template<typename ReturnType, std::size_t CurIndex, std::size_t ElemCount,
+             typename Visitor, typename Variant, typename... VisitorArgs>
+    void apply_visitor_impl(Visitor&&, Variant&,
+                            std::true_type, std::true_type,
+                            VisitorArgs&& ...)
+    {
+    }
+
+    // Intermediate recursion processor for Lambda Visitors
+    template<typename ReturnType, std::size_t CurIndex, std::size_t ElemCount,
+             typename Visitor, typename Variant, bool no_return_value, typename... VisitorArgs>
+    typename std::enable_if<!std::is_base_of<visitor_interface, typename std::decay<Visitor>::type>::value, ReturnType>::type
+         apply_visitor_impl(Visitor&& visitor, Variant&& v, std::false_type not_processed,
+                                               std::integral_constant<bool, no_return_value> should_no_return,
+                                               VisitorArgs&& ...args)
+    {
+        static_assert(std::is_same<ReturnType, decltype(visitor(get<CurIndex>(v)))>::value,
+                      "Different `ReturnType`s detected! All `Visitor::visit` or `overload_lamba_set`"
+                      " must return the same type");
+        suppress_unused_warning(not_processed);
+        if (v.index() == CurIndex)
+        {
+            return visitor.operator()(get<CurIndex>(v), std::forward<VisitorArgs>(args)... );
+        }
+
+        using is_variant_processed_t = std::integral_constant<bool, CurIndex + 1 >= ElemCount>;
+        return apply_visitor_impl<ReturnType, CurIndex +1, ElemCount>(
+                                  std::forward<Visitor>(visitor),
+                                  std::forward<Variant>(v),
+                                  is_variant_processed_t{},
+                                  should_no_return,
+                                  std::forward<VisitorArgs>(args)...);
+    }
+
+    //Visual Studio 2014 compilation fix: cast visitor to base class before invoke operator()
+    template<std::size_t CurIndex, typename ReturnType, typename Visitor, class Value, typename... VisitorArgs>
+    typename std::enable_if<std::is_base_of<static_visitor<ReturnType, typename std::decay<Visitor>::type>,
+                                            typename std::decay<Visitor>::type>::value, ReturnType>::type
+    invoke_class_visitor(Visitor& visitor, Value&& v,  VisitorArgs&&...args)
+    {
+        return static_cast<static_visitor<ReturnType, typename std::decay<Visitor>::type>&>(visitor).operator() (CurIndex, std::forward<Value>(v), std::forward<VisitorArgs>(args)... );
+    }
+
+    //Visual Studio 2014 compilation fix: cast visitor to base class before invoke operator()
+    template<std::size_t CurIndex, typename ReturnType, typename Visitor, class Value, typename... VisitorArgs>
+    typename std::enable_if<std::is_base_of<static_indexed_visitor<ReturnType, typename std::decay<Visitor>::type>,
+                                            typename std::decay<Visitor>::type>::value, ReturnType>::type
+    invoke_class_visitor(Visitor& visitor, Value&& v,  VisitorArgs&&...args)
+    {
+        return static_cast<static_indexed_visitor<ReturnType, typename std::decay<Visitor>::type>&>(visitor).operator() (CurIndex, std::forward<Value>(v), std::forward<VisitorArgs>(args)... );
+    }
+
+    // Intermediate recursion processor for special case `visitor_interface` derived Visitors
+    template<typename ReturnType, std::size_t CurIndex, std::size_t ElemCount,
+             typename Visitor, typename Variant, bool no_return_value, typename... VisitorArgs>
+    typename std::enable_if<std::is_base_of<visitor_interface, typename std::decay<Visitor>::type>::value, ReturnType>::type
+         apply_visitor_impl(Visitor&& visitor, Variant&& v, std::false_type not_processed,
+                                               std::integral_constant<bool, no_return_value> should_no_return,
+                                               VisitorArgs&& ...args)
+    {
+        static_assert(std::is_same<ReturnType, decltype(visitor(get<CurIndex>(v)))>::value,
+                      "Different `ReturnType`s detected! All `Visitor::visit` or `overload_lamba_set`"
+                      " must return the same type");
+        suppress_unused_warning(not_processed);
+        if (v.index() == CurIndex)
+        {
+            return invoke_class_visitor<CurIndex, ReturnType>(visitor, get<CurIndex>(v), std::forward<VisitorArgs>(args)... );
+        }
+
+        using is_variant_processed_t = std::integral_constant<bool, CurIndex + 1 >= ElemCount>;
+        return apply_visitor_impl<ReturnType, CurIndex +1, ElemCount>(
+                                  std::forward<Visitor>(visitor),
+                                  std::forward<Variant>(v),
+                                  is_variant_processed_t{},
+                                  should_no_return,
+                                  std::forward<VisitorArgs>(args)...);
+    }
+} // namespace detail
+
+    template<typename Visitor, typename Variant, typename... VisitorArg>
+    auto visit(Visitor &visitor, const Variant& var, VisitorArg &&...args) -> decltype(visitor(get<0>(var)))
+    {
+        constexpr std::size_t varsize = util::variant_size<Variant>::value;
+        static_assert(varsize != 0, "utils::variant must contains one type at least ");
+        using is_variant_processed_t = std::false_type;
+
+        using ReturnType = decltype(visitor(get<0>(var)));
+        using return_t = std::is_same<ReturnType, void>;
+        return detail::apply_visitor_impl<ReturnType, 0, varsize, Visitor>(
+                                    std::forward<Visitor>(visitor),
+                                    var, is_variant_processed_t{},
+                                    return_t{},
+                                    std::forward<VisitorArg>(args)...);
+    }
+
+    template<typename Visitor, typename Variant>
+    auto visit(Visitor&& visitor, const Variant& var) -> decltype(visitor(get<0>(var)))
+    {
+        constexpr std::size_t varsize = util::variant_size<Variant>::value;
+        static_assert(varsize != 0, "utils::variant must contains one type at least ");
+        using is_variant_processed_t = std::false_type;
+
+        using ReturnType = decltype(visitor(get<0>(var)));
+        using return_t = std::is_same<ReturnType, void>;
+        return detail::apply_visitor_impl<ReturnType, 0, varsize, Visitor>(
+                                    std::forward<Visitor>(visitor),
+                                    var, is_variant_processed_t{},
+                                    return_t{});
+    }
+} // namespace util
+} // namespace cv
+
+#endif // OPENCV_GAPI_UTIL_VARIANT_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/gapi/video.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/video.hpp
new file mode 100644
index 000000000000..4dcc1d418241
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/gapi/video.hpp
@@ -0,0 +1,364 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2020 Intel Corporation
+
+#ifndef OPENCV_GAPI_VIDEO_HPP
+#define OPENCV_GAPI_VIDEO_HPP
+
+#include <utility> // std::tuple
+
+#include <opencv2/gapi/gkernel.hpp>
+
+
+/** \defgroup gapi_video G-API Video processing functionality
+ */
+
+namespace cv { namespace gapi {
+
+/** @brief Structure for the Kalman filter's initialization parameters.*/
+
+struct GAPI_EXPORTS KalmanParams
+{
+    // initial state
+
+    //! corrected state (x(k)): x(k)=x'(k)+K(k)*(z(k)-H*x'(k))
+    Mat state;
+    //! posteriori error estimate covariance matrix (P(k)): P(k)=(I-K(k)*H)*P'(k)
+    Mat errorCov;
+
+    // dynamic system description
+
+    //! state transition matrix (A)
+    Mat transitionMatrix;
+    //! measurement matrix (H)
+    Mat measurementMatrix;
+    //! process noise covariance matrix (Q)
+    Mat processNoiseCov;
+    //! measurement noise covariance matrix (R)
+    Mat measurementNoiseCov;
+    //! control matrix (B) (Optional: not used if there's no control)
+    Mat controlMatrix;
+};
+
+/**
+ * @brief This namespace contains G-API Operations and functions for
+ * video-oriented algorithms, like optical flow and background subtraction.
+ */
+namespace  video
+{
+using GBuildPyrOutput  = std::tuple<GArray<GMat>, GScalar>;
+
+using GOptFlowLKOutput = std::tuple<cv::GArray<cv::Point2f>,
+                                    cv::GArray<uchar>,
+                                    cv::GArray<float>>;
+
+G_TYPED_KERNEL(GBuildOptFlowPyramid, <GBuildPyrOutput(GMat,Size,GScalar,bool,int,int,bool)>,
+               "org.opencv.video.buildOpticalFlowPyramid")
+{
+    static std::tuple<GArrayDesc,GScalarDesc>
+            outMeta(GMatDesc,const Size&,GScalarDesc,bool,int,int,bool)
+    {
+        return std::make_tuple(empty_array_desc(), empty_scalar_desc());
+    }
+};
+
+G_TYPED_KERNEL(GCalcOptFlowLK,
+               <GOptFlowLKOutput(GMat,GMat,cv::GArray<cv::Point2f>,cv::GArray<cv::Point2f>,Size,
+                                 GScalar,TermCriteria,int,double)>,
+               "org.opencv.video.calcOpticalFlowPyrLK")
+{
+    static std::tuple<GArrayDesc,GArrayDesc,GArrayDesc> outMeta(GMatDesc,GMatDesc,GArrayDesc,
+                                                                GArrayDesc,const Size&,GScalarDesc,
+                                                                const TermCriteria&,int,double)
+    {
+        return std::make_tuple(empty_array_desc(), empty_array_desc(), empty_array_desc());
+    }
+
+};
+
+G_TYPED_KERNEL(GCalcOptFlowLKForPyr,
+               <GOptFlowLKOutput(cv::GArray<cv::GMat>,cv::GArray<cv::GMat>,
+                                 cv::GArray<cv::Point2f>,cv::GArray<cv::Point2f>,Size,GScalar,
+                                 TermCriteria,int,double)>,
+               "org.opencv.video.calcOpticalFlowPyrLKForPyr")
+{
+    static std::tuple<GArrayDesc,GArrayDesc,GArrayDesc> outMeta(GArrayDesc,GArrayDesc,
+                                                                GArrayDesc,GArrayDesc,
+                                                                const Size&,GScalarDesc,
+                                                                const TermCriteria&,int,double)
+    {
+        return std::make_tuple(empty_array_desc(), empty_array_desc(), empty_array_desc());
+    }
+};
+
+enum BackgroundSubtractorType
+{
+    TYPE_BS_MOG2,
+    TYPE_BS_KNN
+};
+
+/** @brief Structure for the Background Subtractor operation's initialization parameters.*/
+
+struct BackgroundSubtractorParams
+{
+    //! Type of the Background Subtractor operation.
+    BackgroundSubtractorType operation = TYPE_BS_MOG2;
+
+    //! Length of the history.
+    int history = 500;
+
+    //! For MOG2: Threshold on the squared Mahalanobis distance between the pixel
+    //! and the model to decide whether a pixel is well described by
+    //! the background model.
+    //! For KNN: Threshold on the squared distance between the pixel and the sample
+    //! to decide whether a pixel is close to that sample.
+    double threshold = 16;
+
+    //! If true, the algorithm will detect shadows and mark them.
+    bool detectShadows = true;
+
+    //! The value between 0 and 1 that indicates how fast
+    //! the background model is learnt.
+    //! Negative parameter value makes the algorithm use some automatically
+    //! chosen learning rate.
+    double learningRate = -1;
+
+    //! default constructor
+    BackgroundSubtractorParams() {}
+
+    /** Full constructor
+    @param op MOG2/KNN Background Subtractor type.
+    @param histLength Length of the history.
+    @param thrshld For MOG2: Threshold on the squared Mahalanobis distance between
+    the pixel and the model to decide whether a pixel is well described by the background model.
+    For KNN: Threshold on the squared distance between the pixel and the sample to decide
+    whether a pixel is close to that sample.
+    @param detect If true, the algorithm will detect shadows and mark them. It decreases the
+    speed a bit, so if you do not need this feature, set the parameter to false.
+    @param lRate The value between 0 and 1 that indicates how fast the background model is learnt.
+    Negative parameter value makes the algorithm use some automatically chosen learning rate.
+    */
+    BackgroundSubtractorParams(BackgroundSubtractorType op, int histLength,
+                               double thrshld, bool detect, double lRate) : operation(op),
+                                                                            history(histLength),
+                                                                            threshold(thrshld),
+                                                                            detectShadows(detect),
+                                                                            learningRate(lRate){}
+};
+
+G_TYPED_KERNEL(GBackgroundSubtractor, <GMat(GMat, BackgroundSubtractorParams)>,
+               "org.opencv.video.BackgroundSubtractor")
+{
+    static GMatDesc outMeta(const GMatDesc& in, const BackgroundSubtractorParams& bsParams)
+    {
+        GAPI_Assert(bsParams.history >= 0);
+        GAPI_Assert(bsParams.learningRate <= 1);
+        return in.withType(CV_8U, 1);
+    }
+};
+
+void checkParams(const cv::gapi::KalmanParams& kfParams,
+                 const cv::GMatDesc& measurement, const cv::GMatDesc& control = {});
+
+G_TYPED_KERNEL(GKalmanFilter, <GMat(GMat, GOpaque<bool>, GMat, KalmanParams)>,
+               "org.opencv.video.KalmanFilter")
+{
+    static GMatDesc outMeta(const GMatDesc& measurement, const GOpaqueDesc&,
+                            const GMatDesc& control, const KalmanParams& kfParams)
+    {
+        checkParams(kfParams, measurement, control);
+        return measurement.withSize(Size(1, kfParams.transitionMatrix.rows));
+    }
+};
+
+G_TYPED_KERNEL(GKalmanFilterNoControl, <GMat(GMat, GOpaque<bool>, KalmanParams)>, "org.opencv.video.KalmanFilterNoControl")
+{
+    static GMatDesc outMeta(const GMatDesc& measurement, const GOpaqueDesc&, const KalmanParams& kfParams)
+    {
+        checkParams(kfParams, measurement);
+        return measurement.withSize(Size(1, kfParams.transitionMatrix.rows));
+    }
+};
+} //namespace video
+
+//! @addtogroup gapi_video
+//! @{
+/** @brief Constructs the image pyramid which can be passed to calcOpticalFlowPyrLK.
+
+@note Function textual ID is "org.opencv.video.buildOpticalFlowPyramid"
+
+@param img                8-bit input image.
+@param winSize            window size of optical flow algorithm. Must be not less than winSize
+                          argument of calcOpticalFlowPyrLK. It is needed to calculate required
+                          padding for pyramid levels.
+@param maxLevel           0-based maximal pyramid level number.
+@param withDerivatives    set to precompute gradients for every pyramid level. If the pyramid is
+                          constructed without the gradients then calcOpticalFlowPyrLK will calculate
+                          them internally.
+@param pyrBorder          the border mode for pyramid layers.
+@param derivBorder        the border mode for gradients.
+@param tryReuseInputImage put ROI of input image into the pyramid if possible. You can pass false
+                          to force data copying.
+
+@return
+ - output pyramid.
+ - number of levels in constructed pyramid. Can be less than maxLevel.
+ */
+GAPI_EXPORTS std::tuple<GArray<GMat>, GScalar>
+buildOpticalFlowPyramid(const GMat     &img,
+                        const Size     &winSize,
+                        const GScalar  &maxLevel,
+                              bool      withDerivatives    = true,
+                              int       pyrBorder          = BORDER_REFLECT_101,
+                              int       derivBorder        = BORDER_CONSTANT,
+                              bool      tryReuseInputImage = true);
+
+/** @brief Calculates an optical flow for a sparse feature set using the iterative Lucas-Kanade
+method with pyramids.
+
+See @cite Bouguet00 .
+
+@note Function textual ID is "org.opencv.video.calcOpticalFlowPyrLK"
+
+@param prevImg first 8-bit input image (GMat) or pyramid (GArray<GMat>) constructed by
+buildOpticalFlowPyramid.
+@param nextImg second input image (GMat) or pyramid (GArray<GMat>) of the same size and the same
+type as prevImg.
+@param prevPts GArray of 2D points for which the flow needs to be found; point coordinates must be
+single-precision floating-point numbers.
+@param predPts GArray of initial 2D points for the flow search; makes sense only when
+OPTFLOW_USE_INITIAL_FLOW flag is passed; in that case the vector must have the same size as in
+the input.
+@param winSize size of the search window at each pyramid level.
+@param maxLevel 0-based maximal pyramid level number; if set to 0, pyramids are not used (single
+level), if set to 1, two levels are used, and so on; if pyramids are passed to input then
+algorithm will use as many levels as pyramids have but no more than maxLevel.
+@param criteria parameter, specifying the termination criteria of the iterative search algorithm
+(after the specified maximum number of iterations criteria.maxCount or when the search window
+moves by less than criteria.epsilon).
+@param flags operation flags:
+ -   **OPTFLOW_USE_INITIAL_FLOW** uses initial estimations, stored in predPts; if the flag is
+     not set, then prevPts is used as the initial estimate.
+ -   **OPTFLOW_LK_GET_MIN_EIGENVALS** use minimum eigen values as an error measure (see
+     minEigThresh description); if the flag is not set, then L1 distance between patches
+     around the original and a moved point, divided by number of pixels in a window, is used as an
+     error measure.
+@param minEigThresh the algorithm calculates the minimum eigen value of a 2x2 normal matrix of
+optical flow equations (this matrix is called a spatial gradient matrix in @cite Bouguet00), divided
+by number of pixels in a window; if this value is less than minEigThresh, then a corresponding
+feature is filtered out and its flow is not processed, which allows removing bad points and gives a
+performance boost.
+
+@return
+ - GArray of 2D points (with single-precision floating-point coordinates)
+containing the calculated new positions of input features in the second image.
+ - status GArray (of unsigned chars); each element of the vector is set to 1 if
+the flow for the corresponding features has been found, otherwise, it is set to 0.
+ - GArray of errors (floats); each element of the vector is set to an error for the
+corresponding feature, type of the error measure can be set in flags parameter; if the flow wasn't
+found then the error is not defined (use the status parameter to find such cases).
+ */
+GAPI_EXPORTS std::tuple<GArray<Point2f>, GArray<uchar>, GArray<float>>
+calcOpticalFlowPyrLK(const GMat            &prevImg,
+                     const GMat            &nextImg,
+                     const GArray<Point2f> &prevPts,
+                     const GArray<Point2f> &predPts,
+                     const Size            &winSize      = Size(21, 21),
+                     const GScalar         &maxLevel     = 3,
+                     const TermCriteria    &criteria     = TermCriteria(TermCriteria::COUNT |
+                                                                        TermCriteria::EPS,
+                                                                        30, 0.01),
+                           int              flags        = 0,
+                           double           minEigThresh = 1e-4);
+
+/**
+@overload
+@note Function textual ID is "org.opencv.video.calcOpticalFlowPyrLKForPyr"
+*/
+GAPI_EXPORTS std::tuple<GArray<Point2f>, GArray<uchar>, GArray<float>>
+calcOpticalFlowPyrLK(const GArray<GMat>    &prevPyr,
+                     const GArray<GMat>    &nextPyr,
+                     const GArray<Point2f> &prevPts,
+                     const GArray<Point2f> &predPts,
+                     const Size            &winSize      = Size(21, 21),
+                     const GScalar         &maxLevel     = 3,
+                     const TermCriteria    &criteria     = TermCriteria(TermCriteria::COUNT |
+                                                                        TermCriteria::EPS,
+                                                                        30, 0.01),
+                           int              flags        = 0,
+                           double           minEigThresh = 1e-4);
+
+/** @brief Gaussian Mixture-based or K-nearest neighbours-based Background/Foreground Segmentation Algorithm.
+The operation generates a foreground mask.
+
+@return Output image is foreground mask, i.e. 8-bit unsigned 1-channel (binary) matrix @ref CV_8UC1.
+
+@note Function textual ID is "org.opencv.video.BackgroundSubtractor"
+
+@param src input image: Floating point frame is used without scaling and should be in range [0,255].
+@param bsParams Set of initialization parameters for Background Subtractor kernel.
+*/
+GAPI_EXPORTS GMat BackgroundSubtractor(const GMat& src, const cv::gapi::video::BackgroundSubtractorParams& bsParams);
+
+/** @brief Standard Kalman filter algorithm <http://en.wikipedia.org/wiki/Kalman_filter>.
+
+@note Function textual ID is "org.opencv.video.KalmanFilter"
+
+@param measurement input matrix: 32-bit or 64-bit float 1-channel matrix containing measurements.
+@param haveMeasurement dynamic input flag that indicates whether we get measurements
+at a particular iteration.
+@param control input matrix: 32-bit or 64-bit float 1-channel matrix containing control data
+for changing the dynamic system.
+@param kfParams Set of initialization parameters for Kalman filter kernel.
+
+@return Output matrix is predicted or corrected state. They can be 32-bit or 64-bit float
+1-channel matrix @ref CV_32FC1 or @ref CV_64FC1.
+
+@details If measurement matrix is given (haveMeasurement == true), corrected state will
+be returned which corresponds to the pipeline
+cv::KalmanFilter::predict(control) -> cv::KalmanFilter::correct(measurement).
+Otherwise, predicted state will be returned which corresponds to the call of
+cv::KalmanFilter::predict(control).
+@sa cv::KalmanFilter
+*/
+GAPI_EXPORTS GMat KalmanFilter(const GMat& measurement, const GOpaque<bool>& haveMeasurement,
+                               const GMat& control, const cv::gapi::KalmanParams& kfParams);
+
+/** @overload
+The case of Standard Kalman filter algorithm when there is no control in a dynamic system.
+In this case the controlMatrix is empty and control vector is absent.
+
+@note Function textual ID is "org.opencv.video.KalmanFilterNoControl"
+
+@param measurement input matrix: 32-bit or 64-bit float 1-channel matrix containing measurements.
+@param haveMeasurement dynamic input flag that indicates whether we get measurements
+at a particular iteration.
+@param kfParams Set of initialization parameters for Kalman filter kernel.
+
+@return Output matrix is predicted or corrected state. They can be 32-bit or 64-bit float
+1-channel matrix @ref CV_32FC1 or @ref CV_64FC1.
+
+@sa cv::KalmanFilter
+ */
+GAPI_EXPORTS GMat KalmanFilter(const GMat& measurement, const GOpaque<bool>& haveMeasurement,
+                               const cv::gapi::KalmanParams& kfParams);
+
+//! @} gapi_video
+} //namespace gapi
+} //namespace cv
+
+
+namespace cv { namespace detail {
+template<> struct CompileArgTag<cv::gapi::video::BackgroundSubtractorParams>
+{
+    static const char* tag()
+    {
+        return "org.opencv.video.background_substractor_params";
+    }
+};
+}  // namespace detail
+}  // namespace cv
+
+#endif // OPENCV_GAPI_VIDEO_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/highgui.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/highgui.hpp
new file mode 100644
index 000000000000..35b64bceae90
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/highgui.hpp
@@ -0,0 +1,826 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HIGHGUI_HPP
+#define OPENCV_HIGHGUI_HPP
+
+#include "opencv2/core.hpp"
+#ifdef HAVE_OPENCV_IMGCODECS
+#include "opencv2/imgcodecs.hpp"
+#endif
+#ifdef HAVE_OPENCV_VIDEOIO
+#include "opencv2/videoio.hpp"
+#endif
+
+/**
+@defgroup highgui High-level GUI
+
+While OpenCV was designed for use in full-scale applications and can be used within functionally
+rich UI frameworks (such as Qt\*, WinForms\*, or Cocoa\*) or without any UI at all, sometimes
+it is required to try functionality quickly and visualize the results. This is what the HighGUI
+module has been designed for.
+
+It provides easy interface to:
+
+-   Create and manipulate windows that can display images and "remember" their content (no need to
+    handle repaint events from OS).
+-   Add trackbars to the windows, handle simple mouse events as well as keyboard commands.
+
+@{
+    @defgroup highgui_window_flags Flags related to creating and manipulating HighGUI windows and mouse events
+    @defgroup highgui_opengl OpenGL support
+    @defgroup highgui_qt Qt New Functions
+
+    ![image](pics/qtgui.png)
+
+    This figure explains new functionality implemented with Qt\* GUI. The new GUI provides a statusbar,
+    a toolbar, and a control panel. The control panel can have trackbars and buttonbars attached to it.
+    If you cannot see the control panel, press Ctrl+P or right-click any Qt window and select **Display
+    properties window**.
+
+    -   To attach a trackbar, the window name parameter must be NULL.
+
+    -   To attach a buttonbar, a button must be created. If the last bar attached to the control panel
+        is a buttonbar, the new button is added to the right of the last button. If the last bar
+        attached to the control panel is a trackbar, or the control panel is empty, a new buttonbar is
+        created. Then, a new button is attached to it.
+
+    See below the example used to generate the figure:
+
+    @include highgui_qt.cpp
+
+    @defgroup highgui_winrt WinRT support
+
+    This figure explains new functionality implemented with WinRT GUI. The new GUI provides an Image control,
+    and a slider panel. Slider panel holds trackbars attached to it.
+
+    Sliders are attached below the image control. Every new slider is added below the previous one.
+
+    See below the example used to generate the figure:
+    @code
+    void sample_app::MainPage::ShowWindow()
+    {
+        static cv::String windowName("sample");
+        cv::winrt_initContainer(this->cvContainer);
+        cv::namedWindow(windowName); // not required
+
+        cv::Mat image = cv::imread("Assets/sample.jpg");
+        cv::Mat converted = cv::Mat(image.rows, image.cols, CV_8UC4);
+        cv::cvtColor(image, converted, COLOR_BGR2BGRA);
+        cv::imshow(windowName, converted); // this will create window if it hasn't been created before
+
+        int state = 42;
+        cv::TrackbarCallback callback = [](int pos, void* userdata)
+        {
+            if (pos == 0) {
+                cv::destroyWindow(windowName);
+            }
+        };
+        cv::TrackbarCallback callbackTwin = [](int pos, void* userdata)
+        {
+            if (pos >= 70) {
+                cv::destroyAllWindows();
+            }
+        };
+        cv::createTrackbar("Sample trackbar", windowName, &state, 100, callback);
+        cv::createTrackbar("Twin brother", windowName, &state, 100, callbackTwin);
+    }
+    @endcode
+@}
+*/
+
+///////////////////////// graphical user interface //////////////////////////
+namespace cv
+{
+
+//! @addtogroup highgui
+//! @{
+
+//! @addtogroup highgui_window_flags
+//! @{
+
+//! Flags for cv::namedWindow
+enum WindowFlags {
+       WINDOW_NORMAL     = 0x00000000, //!< the user can resize the window (no constraint) / also used to switch a fullscreen window to a normal size.
+       WINDOW_AUTOSIZE   = 0x00000001, //!< the user cannot resize the window, the size is constrained by the image displayed.
+       WINDOW_OPENGL     = 0x00001000, //!< window with opengl support.
+
+       WINDOW_FULLSCREEN = 1,          //!< change the window to fullscreen (same numeric value as WINDOW_AUTOSIZE).
+       WINDOW_FREERATIO  = 0x00000100, //!< the image expands as much as it can (no ratio constraint).
+       WINDOW_KEEPRATIO  = 0x00000000, //!< the ratio of the image is respected.
+       WINDOW_GUI_EXPANDED=0x00000000, //!< status bar and tool bar
+       WINDOW_GUI_NORMAL = 0x00000010, //!< old-fashioned way (without status bar and tool bar)
+    };
+
+//! Flags for cv::setWindowProperty / cv::getWindowProperty
+enum WindowPropertyFlags {
+       WND_PROP_FULLSCREEN   = 0, //!< fullscreen property    (can be WINDOW_NORMAL or WINDOW_FULLSCREEN).
+       WND_PROP_AUTOSIZE     = 1, //!< autosize property      (can be WINDOW_NORMAL or WINDOW_AUTOSIZE).
+       WND_PROP_ASPECT_RATIO = 2, //!< window's aspect ratio  (can be set to WINDOW_FREERATIO or WINDOW_KEEPRATIO).
+       WND_PROP_OPENGL       = 3, //!< opengl support.
+       WND_PROP_VISIBLE      = 4, //!< checks whether the window exists and is visible
+       WND_PROP_TOPMOST      = 5, //!< property to toggle normal window being topmost or not
+       WND_PROP_VSYNC        = 6  //!< enable or disable VSYNC (in OpenGL mode)
+     };
+
+//! Mouse Events, see cv::MouseCallback
+enum MouseEventTypes {
+       EVENT_MOUSEMOVE      = 0, //!< indicates that the mouse pointer has moved over the window.
+       EVENT_LBUTTONDOWN    = 1, //!< indicates that the left mouse button is pressed.
+       EVENT_RBUTTONDOWN    = 2, //!< indicates that the right mouse button is pressed.
+       EVENT_MBUTTONDOWN    = 3, //!< indicates that the middle mouse button is pressed.
+       EVENT_LBUTTONUP      = 4, //!< indicates that left mouse button is released.
+       EVENT_RBUTTONUP      = 5, //!< indicates that right mouse button is released.
+       EVENT_MBUTTONUP      = 6, //!< indicates that middle mouse button is released.
+       EVENT_LBUTTONDBLCLK  = 7, //!< indicates that left mouse button is double clicked.
+       EVENT_RBUTTONDBLCLK  = 8, //!< indicates that right mouse button is double clicked.
+       EVENT_MBUTTONDBLCLK  = 9, //!< indicates that middle mouse button is double clicked.
+       EVENT_MOUSEWHEEL     = 10,//!< mouse wheel event: positive and negative delta values (see cv::getMouseWheelDelta) mean forward and backward scrolling, respectively.
+       EVENT_MOUSEHWHEEL    = 11 //!< horizontal mouse wheel event: positive and negative delta values (see cv::getMouseWheelDelta) mean right and left scrolling, respectively.
+     };
+
+//! Mouse Event Flags (each constant is a distinct bit), see cv::MouseCallback
+enum MouseEventFlags {
+       EVENT_FLAG_LBUTTON   = 1, //!< indicates that the left mouse button is down.
+       EVENT_FLAG_RBUTTON   = 2, //!< indicates that the right mouse button is down.
+       EVENT_FLAG_MBUTTON   = 4, //!< indicates that the middle mouse button is down.
+       EVENT_FLAG_CTRLKEY   = 8, //!< indicates that CTRL Key is pressed.
+       EVENT_FLAG_SHIFTKEY  = 16,//!< indicates that SHIFT Key is pressed.
+       EVENT_FLAG_ALTKEY    = 32 //!< indicates that ALT Key is pressed.
+     };
+
+//! @} highgui_window_flags
+
+//! @addtogroup highgui_qt
+//! @{
+
+//! Qt font weight, see the weight parameter of cv::fontQt
+enum QtFontWeights {
+        QT_FONT_LIGHT           = 25, //!< Weight of 25
+        QT_FONT_NORMAL          = 50, //!< Weight of 50
+        QT_FONT_DEMIBOLD        = 63, //!< Weight of 63
+        QT_FONT_BOLD            = 75, //!< Weight of 75
+        QT_FONT_BLACK           = 87  //!< Weight of 87
+     };
+
+//! Qt font style, see the style parameter of cv::fontQt
+enum QtFontStyles {
+        QT_STYLE_NORMAL         = 0, //!< Normal font.
+        QT_STYLE_ITALIC         = 1, //!< Italic font.
+        QT_STYLE_OBLIQUE        = 2  //!< Oblique font.
+     };
+
+//! Qt "button" type (NOTE(review): presumably the button type accepted by cv::createButton - confirm)
+enum QtButtonTypes {
+       QT_PUSH_BUTTON   = 0,    //!< Push button.
+       QT_CHECKBOX      = 1,    //!< Checkbox button.
+       QT_RADIOBOX      = 2,    //!< Radiobox button.
+       QT_NEW_BUTTONBAR = 1024  //!< Button should create a new buttonbar
+     };
+
+//! @} highgui_qt
+
+/** @brief Callback function for mouse events. see cv::setMouseCallback
+@param event one of the cv::MouseEventTypes constants.
+@param x The x-coordinate of the mouse event.
+@param y The y-coordinate of the mouse event.
+@param flags mouse event flags, see cv::MouseEventFlags; for wheel events pass this value to cv::getMouseWheelDelta.
+@param userdata The optional parameter.
+ */
+typedef void (*MouseCallback)(int event, int x, int y, int flags, void* userdata);
+
+/** @brief Callback function for Trackbar see cv::createTrackbar
+@param pos current position of the specified trackbar.
+@param userdata The optional parameter.
+ */
+typedef void (*TrackbarCallback)(int pos, void* userdata);
+
+/** @brief Callback function defined to be called every frame. See cv::setOpenGlDrawCallback
+@param userdata The optional parameter.
+ */
+typedef void (*OpenGlDrawCallback)(void* userdata);
+
+/** @brief Callback function for a button created by cv::createButton
+@param state current state of the button. It could be -1 for a push button, 0 or 1 for a check/radio box button.
+@param userdata The optional parameter.
+ */
+typedef void (*ButtonCallback)(int state, void* userdata);
+
+/** @brief Creates a window.
+
+The function namedWindow creates a window that can be used as a placeholder for images and
+trackbars. Created windows are referred to by their names.
+
+If a window with the same name already exists, the function does nothing.
+
+You can call cv::destroyWindow or cv::destroyAllWindows to close the window and de-allocate any associated
+memory usage. For a simple program, you do not really have to call these functions because all the
+resources and windows of the application are closed automatically by the operating system upon exit.
+
+@note Qt backend supports additional flags:
+ -   **WINDOW_NORMAL or WINDOW_AUTOSIZE:** WINDOW_NORMAL enables you to resize the
+     window, whereas WINDOW_AUTOSIZE adjusts automatically the window size to fit the
+     displayed image (see imshow ), and you cannot change the window size manually.
+ -   **WINDOW_FREERATIO or WINDOW_KEEPRATIO:** WINDOW_FREERATIO adjusts the image
+     with no respect to its ratio, whereas WINDOW_KEEPRATIO keeps the image ratio.
+ -   **WINDOW_GUI_NORMAL or WINDOW_GUI_EXPANDED:** WINDOW_GUI_NORMAL is the old way to draw the window
+     without statusbar and toolbar, whereas WINDOW_GUI_EXPANDED is a new enhanced GUI.
+By default, flags == WINDOW_AUTOSIZE | WINDOW_KEEPRATIO | WINDOW_GUI_EXPANDED
+
+@param winname Name of the window in the window caption that may be used as a window identifier.
+@param flags Flags of the window. The supported flags are: (cv::WindowFlags)
+ */
+CV_EXPORTS_W void namedWindow(const String& winname, int flags = WINDOW_AUTOSIZE);
+
+/** @brief Destroys the specified window.
+
+The function destroyWindow destroys the window with the given name.
+
+@param winname Name of the window to be destroyed.
+ */
+CV_EXPORTS_W void destroyWindow(const String& winname);
+
+/** @brief Destroys all of the HighGUI windows.
+
+The function destroyAllWindows destroys all of the opened HighGUI windows.
+ */
+CV_EXPORTS_W void destroyAllWindows();
+
+
+/** @brief HighGUI backend used.
+
+The function returns HighGUI backend name used: could be COCOA, GTK2/3, QT, WAYLAND or WIN32.
+Returns empty string if there is no available UI backend.
+ */
+CV_EXPORTS_W const std::string currentUIFramework();
+
+
+CV_EXPORTS_W int startWindowThread();
+
+/** @brief Similar to #waitKey, but returns full key code.
+
+@note Key code is implementation specific and depends on used backend: QT/GTK/Win32/etc
+
+*/
+CV_EXPORTS_W int waitKeyEx(int delay = 0);
+
+/** @brief Waits for a pressed key.
+
+The function waitKey waits for a key event infinitely (when \f$\texttt{delay}\leq 0\f$ ) or for delay
+milliseconds, when it is positive. Since the OS has a minimum time between switching threads, the
+function will not wait exactly delay ms, it will wait at least delay ms, depending on what else is
+running on your computer at that time. It returns the code of the pressed key or -1 if no key was
+pressed before the specified time had elapsed. To check for a key press but not wait for it, use
+#pollKey.
+
+@note The functions #waitKey and #pollKey are the only methods in HighGUI that can fetch and handle
+GUI events, so one of them needs to be called periodically for normal event processing unless
+HighGUI is used within an environment that takes care of event processing.
+
+@note The function only works if there is at least one HighGUI window created and the window is
+active. If there are several HighGUI windows, any of them can be active.
+
+@param delay Delay in milliseconds. 0 is the special value that means "forever".
+ */
+CV_EXPORTS_W int waitKey(int delay = 0);
+
+/** @brief Polls for a pressed key.
+
+The function pollKey polls for a key event without waiting. It returns the code of the pressed key
+or -1 if no key was pressed since the last invocation. To wait until a key was pressed, use #waitKey.
+
+@note The functions #waitKey and #pollKey are the only methods in HighGUI that can fetch and handle
+GUI events, so one of them needs to be called periodically for normal event processing unless
+HighGUI is used within an environment that takes care of event processing.
+
+@note The function only works if there is at least one HighGUI window created and the window is
+active. If there are several HighGUI windows, any of them can be active.
+ */
+CV_EXPORTS_W int pollKey();
+
+/** @brief Displays an image in the specified window.
+
+The function imshow displays an image in the specified window. If the window was created with the
+cv::WINDOW_AUTOSIZE flag, the image is shown with its original size, however it is still limited by the screen resolution.
+Otherwise, the image is scaled to fit the window. The function may scale the image, depending on its depth:
+
+-   If the image is 8-bit unsigned, it is displayed as is.
+-   If the image is 16-bit unsigned, the pixels are divided by 256. That is, the
+    value range [0,255\*256] is mapped to [0,255].
+-   If the image is 32-bit or 64-bit floating-point, the pixel values are multiplied by 255. That is, the
+    value range [0,1] is mapped to [0,255].
+-   32-bit integer images are not processed anymore due to ambiguity of required transform.
+    Convert to 8-bit unsigned matrix using a custom preprocessing specific to image's context.
+
+If the window was created with OpenGL support, cv::imshow also supports ogl::Buffer, ogl::Texture2D and
+cuda::GpuMat as input.
+
+If the window was not created before this function is called, a window is assumed to be created with cv::WINDOW_AUTOSIZE.
+
+If you need to show an image that is bigger than the screen resolution, you will need to call namedWindow("", WINDOW_NORMAL) before the imshow.
+
+@note This function should be followed by a call to cv::waitKey or cv::pollKey to perform GUI
+housekeeping tasks that are necessary to actually show the given image and make the window respond
+to mouse and keyboard events. Otherwise, it won't display the image and the window might lock up.
+For example, **waitKey(0)** will display the window infinitely until any keypress (it is suitable
+for image display). **waitKey(25)** will display a frame and wait approximately 25 ms for a key
+press (suitable for displaying a video frame-by-frame). To remove the window, use cv::destroyWindow.
+
+@note [__Windows Backend Only__] Pressing Ctrl+C will copy the image to the clipboard. Pressing Ctrl+S will show a dialog to save the image.
+@note [__Wayland Backend Only__] Supported formats are extended.
+-   If the image is 8-bit signed, the pixels are biased by 128. That is, the
+    value range [-128,127] is mapped to [0,255].
+-   If the image is 16-bit signed, the pixels are divided by 256 and biased by 128. That is, the
+    value range [-32768,32767] is mapped to [0,255].
+
+@param winname Name of the window.
+@param mat Image to be shown.
+ */
+CV_EXPORTS_W void imshow(const String& winname, InputArray mat);
+
+/** @brief Resizes the window to the specified size
+
+@note The specified window size is for the image area. Toolbars are not counted.
+Only windows created without cv::WINDOW_AUTOSIZE flag can be resized.
+
+@param winname Window name.
+@param width The new window width.
+@param height The new window height.
+ */
+CV_EXPORTS_W void resizeWindow(const String& winname, int width, int height);
+
+/** @overload
+@param winname Window name.
+@param size The new window size.
+*/
+CV_EXPORTS_W void resizeWindow(const String& winname, const cv::Size& size);
+
+/** @brief Moves the window to the specified position
+
+@param winname Name of the window.
+@param x The new x-coordinate of the window.
+@param y The new y-coordinate of the window.
+
+@note [__Wayland Backend Only__] This function is not supported due to a Wayland protocol limitation.
+ */
+CV_EXPORTS_W void moveWindow(const String& winname, int x, int y);
+
+/** @brief Changes parameters of a window dynamically.
+
+The function setWindowProperty enables changing properties of a window.
+
+@param winname Name of the window.
+@param prop_id Window property to edit. The supported operation flags are: (cv::WindowPropertyFlags)
+@param prop_value New value of the window property. The supported flags are: (cv::WindowFlags)
+
+@note [__Wayland Backend Only__] This function is not supported.
+ */
+CV_EXPORTS_W void setWindowProperty(const String& winname, int prop_id, double prop_value);
+
+/** @brief Updates window title
+@param winname Name of the window.
+@param title New title.
+*/
+CV_EXPORTS_W void setWindowTitle(const String& winname, const String& title);
+
+/** @brief Provides parameters of a window.
+
+The function getWindowProperty returns properties of a window.
+
+@param winname Name of the window.
+@param prop_id Window property to retrieve. The following operation flags are available: (cv::WindowPropertyFlags)
+
+@sa setWindowProperty
+
+@note [__Wayland Backend Only__] This function is not supported.
+ */
+CV_EXPORTS_W double getWindowProperty(const String& winname, int prop_id);
+
+/** @brief Provides rectangle of image in the window.
+
+The function getWindowImageRect returns the client screen coordinates, width and height of the image rendering area.
+
+@param winname Name of the window.
+
+@sa resizeWindow moveWindow
+
+@note [__Wayland Backend Only__] This function is not supported due to a Wayland protocol limitation.
+ */
+CV_EXPORTS_W Rect getWindowImageRect(const String& winname);
+
+/** @example samples/cpp/create_mask.cpp
+This program demonstrates using mouse events and how to make and use a mask image (black and white) .
+*/
+/** @brief Sets mouse handler for the specified window
+
+@param winname Name of the window.
+@param onMouse Callback function for mouse events. See OpenCV samples on how to specify and use the callback.
+@param userdata The optional parameter passed to the callback.
+ */
+CV_EXPORTS void setMouseCallback(const String& winname, MouseCallback onMouse, void* userdata = 0);
+
+/** @brief Gets the mouse-wheel motion delta, when handling mouse-wheel events cv::EVENT_MOUSEWHEEL and
+cv::EVENT_MOUSEHWHEEL.
+
+For regular mice with a scroll-wheel, delta will be a multiple of 120. The value 120 corresponds to
+a one notch rotation of the wheel or the threshold for action to be taken and one such action should
+occur for each delta. Some high-precision mice with higher-resolution freely-rotating wheels may
+generate smaller values.
+
+For cv::EVENT_MOUSEWHEEL positive and negative values mean forward and backward scrolling,
+respectively. For cv::EVENT_MOUSEHWHEEL, where available, positive and negative values mean right and
+left scrolling, respectively.
+
+@note Mouse-wheel events are currently supported only on Windows and Cocoa.
+
+@param flags The mouse callback flags parameter.
+ */
+CV_EXPORTS int getMouseWheelDelta(int flags);
+
+/** @brief Allows users to select a ROI on the given image.
+
+The function creates a window and allows users to select a ROI using the mouse.
+Controls: use `space` or `enter` to finish selection, use key `c` to cancel selection (function will return the zero cv::Rect).
+
+@param windowName name of the window where selection process will be shown.
+@param img image to select a ROI.
+@param showCrosshair if true crosshair of selection rectangle will be shown.
+@param fromCenter if true center of selection will match initial mouse position. Otherwise a corner of
+selection rectangle will correspond to the initial mouse position.
+@param printNotice if true a notice to select ROI or cancel selection will be printed in console.
+@return selected ROI or empty rect if selection canceled.
+
+@note The function sets its own mouse callback for the specified window using cv::setMouseCallback(windowName, ...).
+After the function finishes, an empty callback will be set for the used window.
+ */
+CV_EXPORTS_W Rect selectROI(const String& windowName, InputArray img, bool showCrosshair = true, bool fromCenter = false, bool printNotice = true);
+
+/** @overload
+ */
+CV_EXPORTS_W Rect selectROI(InputArray img, bool showCrosshair = true, bool fromCenter = false, bool printNotice = true);
+
+/** @brief Allows users to select multiple ROIs on the given image.
+
+The function creates a window and allows users to select multiple ROIs using the mouse.
+Controls: use `space` or `enter` to finish current selection and start a new one,
+use `esc` to terminate multiple ROI selection process.
+
+@param windowName name of the window where selection process will be shown.
+@param img image to select a ROI.
+@param boundingBoxes selected ROIs.
+@param showCrosshair if true crosshair of selection rectangle will be shown.
+@param fromCenter if true center of selection will match initial mouse position. Otherwise a corner of
+selection rectangle will correspond to the initial mouse position.
+@param printNotice if true a notice to select ROI or cancel selection will be printed in console.
+
+@note The function sets its own mouse callback for the specified window using cv::setMouseCallback(windowName, ...).
+After the function finishes, an empty callback will be set for the used window.
+ */
+CV_EXPORTS_W void selectROIs(const String& windowName, InputArray img,
+                             CV_OUT std::vector<Rect>& boundingBoxes, bool showCrosshair = true, bool fromCenter = false, bool printNotice = true);
+
+/** @brief Creates a trackbar and attaches it to the specified window.
+
+The function createTrackbar creates a trackbar (a slider or range control) with the specified name
+and range, assigns a variable value to be a position synchronized with the trackbar and specifies
+the callback function onChange to be called on the trackbar position change. The created trackbar is
+displayed in the specified window winname.
+
+@note [__Qt Backend Only__] winname can be empty if the trackbar should be attached to the
+control panel.
+
+Clicking the label of each trackbar enables editing the trackbar values manually.
+
+@param trackbarname Name of the created trackbar.
+@param winname Name of the window that will be used as a parent of the created trackbar.
+@param value Optional pointer to an integer variable whose value reflects the position of the
+slider. Upon creation, the slider position is defined by this variable.
+@param count Maximal position of the slider. The minimal position is always 0.
+@param onChange Pointer to the function to be called every time the slider changes position. This
+function should be prototyped as void Foo(int,void\*); , where the first parameter is the trackbar
+position and the second parameter is the user data (see the next parameter). If the callback is
+the NULL pointer, no callbacks are called, but only value is updated.
+@param userdata User data that is passed as is to the callback. It can be used to handle trackbar
+events without using global variables.
+ */
+CV_EXPORTS int createTrackbar(const String& trackbarname, const String& winname,
+                              int* value, int count,
+                              TrackbarCallback onChange = 0,
+                              void* userdata = 0);
+
+/** @brief Returns the trackbar position.
+
+The function returns the current position of the specified trackbar.
+
+@note [__Qt Backend Only__] winname can be empty if the trackbar is attached to the control
+panel.
+
+@param trackbarname Name of the trackbar.
+@param winname Name of the window that is the parent of the trackbar.
+ */
+CV_EXPORTS_W int getTrackbarPos(const String& trackbarname, const String& winname);
+
+/** @brief Sets the trackbar position.
+
+The function sets the position of the specified trackbar in the specified window.
+
+@note [__Qt Backend Only__] winname can be empty if the trackbar is attached to the control
+panel.
+
+@param trackbarname Name of the trackbar.
+@param winname Name of the window that is the parent of trackbar.
+@param pos New position.
+ */
+CV_EXPORTS_W void setTrackbarPos(const String& trackbarname, const String& winname, int pos);
+
+/** @brief Sets the trackbar maximum position.
+
+The function sets the maximum position of the specified trackbar in the specified window.
+
+@note [__Qt Backend Only__] winname can be empty if the trackbar is attached to the control
+panel.
+
+@param trackbarname Name of the trackbar.
+@param winname Name of the window that is the parent of trackbar.
+@param maxval New maximum position.
+ */
+CV_EXPORTS_W void setTrackbarMax(const String& trackbarname, const String& winname, int maxval);
+
+/** @brief Sets the trackbar minimum position.
+
+The function sets the minimum position of the specified trackbar in the specified window.
+
+@note [__Qt Backend Only__] winname can be empty if the trackbar is attached to the control
+panel.
+
+@param trackbarname Name of the trackbar.
+@param winname Name of the window that is the parent of trackbar.
+@param minval New minimum position.
+ */
+CV_EXPORTS_W void setTrackbarMin(const String& trackbarname, const String& winname, int minval);
+
+//! @addtogroup highgui_opengl OpenGL support
+//! @{
+
+/** @brief Displays OpenGL 2D texture in the specified window.
+
+@param winname Name of the window.
+@param tex OpenGL 2D texture data.
+ */
+CV_EXPORTS void imshow(const String& winname, const ogl::Texture2D& tex);
+
+/** @brief Sets a callback function to be called to draw on top of displayed image.
+
+The function setOpenGlDrawCallback can be used to draw 3D data on the window. See the example of
+callback function below:
+@code
+    void on_opengl(void* param)
+    {
+        glLoadIdentity();
+
+        glTranslated(0.0, 0.0, -1.0);
+
+        glRotatef( 55, 1, 0, 0 );
+        glRotatef( 45, 0, 1, 0 );
+        glRotatef( 0, 0, 0, 1 );
+
+        static const int coords[6][4][3] = {
+            { { +1, -1, -1 }, { -1, -1, -1 }, { -1, +1, -1 }, { +1, +1, -1 } },
+            { { +1, +1, -1 }, { -1, +1, -1 }, { -1, +1, +1 }, { +1, +1, +1 } },
+            { { +1, -1, +1 }, { +1, -1, -1 }, { +1, +1, -1 }, { +1, +1, +1 } },
+            { { -1, -1, -1 }, { -1, -1, +1 }, { -1, +1, +1 }, { -1, +1, -1 } },
+            { { +1, -1, +1 }, { -1, -1, +1 }, { -1, -1, -1 }, { +1, -1, -1 } },
+            { { -1, -1, +1 }, { +1, -1, +1 }, { +1, +1, +1 }, { -1, +1, +1 } }
+        };
+
+        for (int i = 0; i < 6; ++i) {
+                    glColor3ub( i*20, 100+i*10, i*42 );
+                    glBegin(GL_QUADS);
+                    for (int j = 0; j < 4; ++j) {
+                            glVertex3d(0.2 * coords[i][j][0], 0.2 * coords[i][j][1], 0.2 * coords[i][j][2]);
+                    }
+                    glEnd();
+        }
+    }
+@endcode
+
+@param winname Name of the window.
+@param onOpenGlDraw Pointer to the function to be called every frame. This function should be
+prototyped as void Foo(void\*) .
+@param userdata Pointer passed to the callback function.(__Optional__)
+ */
+CV_EXPORTS void setOpenGlDrawCallback(const String& winname, OpenGlDrawCallback onOpenGlDraw, void* userdata = 0);
+
+/** @brief Sets the specified window as current OpenGL context.
+
+@param winname Name of the window.
+ */
+CV_EXPORTS void setOpenGlContext(const String& winname);
+
+/** @brief Force window to redraw its context and call draw callback ( See cv::setOpenGlDrawCallback ).
+
+@param winname Name of the window.
+ */
+CV_EXPORTS void updateWindow(const String& winname);
+
+//! @} highgui_opengl
+
+//! @addtogroup highgui_qt
+//! @{
+
+/** @brief QtFont available only for Qt. See cv::fontQt
+ */
+struct QtFont
+{
+    const char* nameFont;  //!< Name of the font
+    Scalar      color;     //!< Color of the font. Scalar(blue_component, green_component, red_component[, alpha_component])
+    int         font_face; //!< See cv::QtFontStyles
+    const int*  ascii;     //!< font data and metrics
+    const int*  greek;     //!< NOTE(review): undocumented upstream; presumably font data like `ascii` - confirm
+    const int*  cyrillic;  //!< NOTE(review): undocumented upstream; presumably font data like `ascii` - confirm
+    float       hscale, vscale; //!< NOTE(review): undocumented upstream; presumably horizontal / vertical scale - confirm
+    float       shear;     //!< slope coefficient: 0 - normal, >0 - italic
+    int         thickness; //!< See cv::QtFontWeights
+    float       dx;        //!< horizontal interval between letters
+    int         line_type; //!< PointSize (NOTE(review): documented as the point size despite the field name - confirm)
+};
+
+/** @brief Creates the font to draw a text on an image.
+
+The function fontQt creates a cv::QtFont object. This cv::QtFont is not compatible with putText .
+
+A basic usage of this function is the following:
+@code
+    QtFont font = fontQt("Times");
+    addText( img1, "Hello World !", Point(50,50), font);
+@endcode
+
+@param nameFont Name of the font. The name should match the name of a system font (such as
+*Times*). If the font is not found, a default one is used.
+@param pointSize Size of the font. If not specified, or if zero or negative, the point size of the
+font is set to a system-dependent default value. Generally, this is 12 points.
+@param color Color of the font in BGRA where A = 255 is fully transparent. Use the macro CV_RGB
+for simplicity.
+@param weight Font weight. Available operation flags are: cv::QtFontWeights. You can also specify a positive integer for better control.
+@param style Font style. Available operation flags are : cv::QtFontStyles
+@param spacing Spacing between characters. It can be negative or positive.
+ */
+CV_EXPORTS QtFont fontQt(const String& nameFont, int pointSize = -1,
+                         Scalar color = Scalar::all(0), int weight = QT_FONT_NORMAL,
+                         int style = QT_STYLE_NORMAL, int spacing = 0);
+
+/** @brief Draws a text on the image.
+
+The function addText draws *text* on the image *img* using a specific font *font* (see example cv::fontQt
+)
+
+@param img 8-bit 3-channel image where the text should be drawn.
+@param text Text to write on an image.
+@param org Point(x,y) where the text should start on an image.
+@param font Font to use to draw a text.
+ */
+CV_EXPORTS void addText( const Mat& img, const String& text, Point org, const QtFont& font);
+
+/** @brief Draws a text on the image.
+
+@param img 8-bit 3-channel image where the text should be drawn.
+@param text Text to write on an image.
+@param org Point(x,y) where the text should start on an image.
+@param nameFont Name of the font. The name should match the name of a system font (such as
+*Times*). If the font is not found, a default one is used.
+@param pointSize Size of the font. If not specified, equal zero or negative, the point size of the
+font is set to a system-dependent default value. Generally, this is 12 points.
+@param color Color of the font in BGRA where A = 255 is fully transparent.
+@param weight Font weight. Available operation flags are: cv::QtFontWeights. You can also specify a positive integer for better control.
+@param style Font style. Available operation flags are : cv::QtFontStyles
+@param spacing Spacing between characters. It can be negative or positive.
+ */
+CV_EXPORTS_W void addText(const Mat& img, const String& text, Point org, const String& nameFont, int pointSize = -1, Scalar color = Scalar::all(0),
+        int weight = QT_FONT_NORMAL, int style = QT_STYLE_NORMAL, int spacing = 0);
+
+/** @brief Displays a text on a window image as an overlay for a specified duration.
+
+The function displayOverlay displays useful information/tips on top of the window for a certain
+amount of time *delayms*. The function does not modify the image, displayed in the window, that is,
+after the specified delay the original content of the window is restored.
+
+@param winname Name of the window.
+@param text Overlay text to write on a window image.
+@param delayms The period (in milliseconds), during which the overlay text is displayed. If this
+function is called before the previous overlay text timed out, the timer is restarted and the text
+is updated. If this value is zero, the text never disappears.
+ */
+CV_EXPORTS_W void displayOverlay(const String& winname, const String& text, int delayms = 0);
+
+/** @brief Displays a text on the window statusbar during the specified period of time.
+
+The function displayStatusBar displays useful information/tips on top of the window for a certain
+amount of time *delayms* . This information is displayed on the window statusbar (the window must be
+created with the CV_GUI_EXPANDED flags).
+
+@param winname Name of the window.
+@param text Text to write on the window statusbar.
+@param delayms Duration (in milliseconds) to display the text. If this function is called before
+the previous text timed out, the timer is restarted and the text is updated. If this value is
+zero, the text never disappears.
+ */
+CV_EXPORTS_W void displayStatusBar(const String& winname, const String& text, int delayms = 0);
+
+/** @brief Saves parameters of the specified window.
+
+The function saveWindowParameters saves size, location, flags, trackbars value, zoom and panning
+location of the window windowName.
+
+@param windowName Name of the window.
+ */
+CV_EXPORTS void saveWindowParameters(const String& windowName);
+
+/** @brief Loads parameters of the specified window.
+
+The function loadWindowParameters loads size, location, flags, trackbars value, zoom and panning
+location of the window windowName.
+
+@param windowName Name of the window.
+ */
+CV_EXPORTS void loadWindowParameters(const String& windowName);
+
+CV_EXPORTS  int startLoop(int (*pt2Func)(int argc, char *argv[]), int argc, char* argv[]);
+
+CV_EXPORTS  void stopLoop();
+
+/** @brief Attaches a button to the control panel.
+
+The function createButton attaches a button to the control panel. Each button is added to a
+buttonbar to the right of the last button. A new buttonbar is created if nothing was attached to the
+control panel before, or if the last element attached to the control panel was a trackbar or if the
+QT_NEW_BUTTONBAR flag is added to the type.
+
+See below various examples of the cv::createButton function call:
+@code
+    createButton("",callbackButton);//create a push button "button 0", that will call callbackButton.
+    createButton("button2",callbackButton,NULL,QT_CHECKBOX,0);
+    createButton("button3",callbackButton,&value);
+    createButton("button5",callbackButton1,NULL,QT_RADIOBOX);
+    createButton("button6",callbackButton2,NULL,QT_PUSH_BUTTON,1);
+    createButton("button6",callbackButton2,NULL,QT_PUSH_BUTTON|QT_NEW_BUTTONBAR);// create a push button in a new row
+@endcode
+
+@param  bar_name Name of the button.
+@param on_change Pointer to the function to be called every time the button changes its state.
+This function should be prototyped as void Foo(int state, void\* userdata); . *state* is the current state
+of the button. It could be -1 for a push button, 0 or 1 for a check/radio box button.
+@param userdata Pointer passed to the callback function.
+@param type Optional type of the button. Available types are: (cv::QtButtonTypes)
+@param initial_button_state Default state of the button. Use for checkbox and radiobox. Its
+value could be 0 or 1. (__Optional__)
+*/
+CV_EXPORTS int createButton( const String& bar_name, ButtonCallback on_change,
+                             void* userdata = 0, int type = QT_PUSH_BUTTON,
+                             bool initial_button_state = false);
+
+//! @} highgui_qt
+
+//! @} highgui
+
+} // cv
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/highgui/highgui.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/highgui/highgui.hpp
new file mode 100644
index 000000000000..160c9cf4af24
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/highgui/highgui.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/highgui.hpp"
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/highgui/highgui_c.h b/3rdparty/opencv/opencv410/build/include/opencv2/highgui/highgui_c.h
new file mode 100644
index 000000000000..e508e1497521
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/highgui/highgui_c.h
@@ -0,0 +1,251 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HIGHGUI_H
+#define OPENCV_HIGHGUI_H
+
+#include "opencv2/core/core_c.h"
+#include "opencv2/imgproc/imgproc_c.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/** @addtogroup highgui_c
+  @{
+  */
+
+/****************************************************************************************\
+*                                  Basic GUI functions                                   *
+\****************************************************************************************/
+//YV
+//-----------New for Qt
+/* For font */
+enum {  CV_FONT_LIGHT           = 25,//QFont::Light,
+        CV_FONT_NORMAL          = 50,//QFont::Normal,
+        CV_FONT_DEMIBOLD        = 63,//QFont::DemiBold,
+        CV_FONT_BOLD            = 75,//QFont::Bold,
+        CV_FONT_BLACK           = 87 //QFont::Black
+};
+
+enum {  CV_STYLE_NORMAL         = 0,//QFont::StyleNormal,
+        CV_STYLE_ITALIC         = 1,//QFont::StyleItalic,
+        CV_STYLE_OBLIQUE        = 2 //QFont::StyleOblique
+};
+/* ---------*/
+
+//for color cvScalar(blue_component, green_component, red_component[, alpha_component])
+//and alpha= 0 <-> 0xFF (not transparent <-> transparent)
+CVAPI(CvFont) cvFontQt(const char* nameFont, int pointSize CV_DEFAULT(-1), CvScalar color CV_DEFAULT(cvScalarAll(0)), int weight CV_DEFAULT(CV_FONT_NORMAL),  int style CV_DEFAULT(CV_STYLE_NORMAL), int spacing CV_DEFAULT(0));
+
+CVAPI(void) cvAddText(const CvArr* img, const char* text, CvPoint org, CvFont *arg2);
+
+CVAPI(void) cvDisplayOverlay(const char* name, const char* text, int delayms CV_DEFAULT(0));
+CVAPI(void) cvDisplayStatusBar(const char* name, const char* text, int delayms CV_DEFAULT(0));
+
+CVAPI(void) cvSaveWindowParameters(const char* name);
+CVAPI(void) cvLoadWindowParameters(const char* name);
+CVAPI(int) cvStartLoop(int (*pt2Func)(int argc, char *argv[]), int argc, char* argv[]);
+CVAPI(void) cvStopLoop( void );
+
+typedef void (CV_CDECL *CvButtonCallback)(int state, void* userdata);
+enum {CV_PUSH_BUTTON = 0, CV_CHECKBOX = 1, CV_RADIOBOX = 2};
+CVAPI(int) cvCreateButton( const char* button_name CV_DEFAULT(NULL),CvButtonCallback on_change CV_DEFAULT(NULL), void* userdata CV_DEFAULT(NULL) , int button_type CV_DEFAULT(CV_PUSH_BUTTON), int initial_button_state CV_DEFAULT(0));
+//----------------------
+
+
+/* this function is used to set some external parameters in case of X Window */
+CVAPI(int) cvInitSystem( int argc, char** argv );
+
+CVAPI(int) cvStartWindowThread( void );
+
+// ---------  YV ---------
+enum
+{
+    //These flags are used by cvSet/GetWindowProperty
+    CV_WND_PROP_FULLSCREEN = 0, //to change/get window's fullscreen property
+    CV_WND_PROP_AUTOSIZE   = 1, //to change/get window's autosize property
+    CV_WND_PROP_ASPECTRATIO= 2, //to change/get window's aspectratio property
+    CV_WND_PROP_OPENGL     = 3, //to change/get window's opengl support
+    CV_WND_PROP_VISIBLE    = 4,
+
+    //These flags are used by cvNamedWindow and cvSet/GetWindowProperty
+    CV_WINDOW_NORMAL       = 0x00000000, //the user can resize the window (no constraint)  / also used to switch a fullscreen window to a normal size
+    CV_WINDOW_AUTOSIZE     = 0x00000001, //the user cannot resize the window, the size is constrained by the image displayed
+    CV_WINDOW_OPENGL       = 0x00001000, //window with opengl support
+
+    //Those flags are only for Qt
+    CV_GUI_EXPANDED         = 0x00000000, //status bar and tool bar
+    CV_GUI_NORMAL           = 0x00000010, //old-fashioned way
+
+    //These 3 flags are used by cvNamedWindow and cvSet/GetWindowProperty
+    CV_WINDOW_FULLSCREEN   = 1,//change the window to fullscreen
+    CV_WINDOW_FREERATIO    = 0x00000100,//the image expands as much as it can (no ratio constraint)
+    CV_WINDOW_KEEPRATIO    = 0x00000000//the image aspect ratio is respected.
+};
+
+/* create window */
+CVAPI(int) cvNamedWindow( const char* name, int flags CV_DEFAULT(CV_WINDOW_AUTOSIZE) );
+
+/* Set and Get Property of the window */
+CVAPI(void) cvSetWindowProperty(const char* name, int prop_id, double prop_value);
+CVAPI(double) cvGetWindowProperty(const char* name, int prop_id);
+
+/* display image within window (highgui windows remember their content) */
+CVAPI(void) cvShowImage( const char* name, const CvArr* image );
+
+/* resize/move window */
+CVAPI(void) cvResizeWindow( const char* name, int width, int height );
+CVAPI(void) cvMoveWindow( const char* name, int x, int y );
+
+
+/* destroy window and all the trackers associated with it */
+CVAPI(void) cvDestroyWindow( const char* name );
+
+CVAPI(void) cvDestroyAllWindows(void);
+
+/* get native window handle (HWND in case of Win32 and Widget in case of X Window) */
+CVAPI(void*) cvGetWindowHandle( const char* name );
+
+/* get name of highgui window given its native handle */
+CVAPI(const char*) cvGetWindowName( void* window_handle );
+
+
+typedef void (CV_CDECL *CvTrackbarCallback)(int pos);
+
+/* create trackbar and display it on top of given window, set callback */
+CVAPI(int) cvCreateTrackbar( const char* trackbar_name, const char* window_name,
+                             int* value, int count, CvTrackbarCallback on_change CV_DEFAULT(NULL));
+
+typedef void (CV_CDECL *CvTrackbarCallback2)(int pos, void* userdata);
+
+CVAPI(int) cvCreateTrackbar2( const char* trackbar_name, const char* window_name,
+                              int* value, int count, CvTrackbarCallback2 on_change,
+                              void* userdata CV_DEFAULT(0));
+
+/* retrieve or set trackbar position */
+CVAPI(int) cvGetTrackbarPos( const char* trackbar_name, const char* window_name );
+CVAPI(void) cvSetTrackbarPos( const char* trackbar_name, const char* window_name, int pos );
+CVAPI(void) cvSetTrackbarMax(const char* trackbar_name, const char* window_name, int maxval);
+CVAPI(void) cvSetTrackbarMin(const char* trackbar_name, const char* window_name, int minval);
+
+enum
+{
+    CV_EVENT_MOUSEMOVE      =0,
+    CV_EVENT_LBUTTONDOWN    =1,
+    CV_EVENT_RBUTTONDOWN    =2,
+    CV_EVENT_MBUTTONDOWN    =3,
+    CV_EVENT_LBUTTONUP      =4,
+    CV_EVENT_RBUTTONUP      =5,
+    CV_EVENT_MBUTTONUP      =6,
+    CV_EVENT_LBUTTONDBLCLK  =7,
+    CV_EVENT_RBUTTONDBLCLK  =8,
+    CV_EVENT_MBUTTONDBLCLK  =9,
+    CV_EVENT_MOUSEWHEEL     =10,
+    CV_EVENT_MOUSEHWHEEL    =11
+};
+
+enum
+{
+    CV_EVENT_FLAG_LBUTTON   =1,
+    CV_EVENT_FLAG_RBUTTON   =2,
+    CV_EVENT_FLAG_MBUTTON   =4,
+    CV_EVENT_FLAG_CTRLKEY   =8,
+    CV_EVENT_FLAG_SHIFTKEY  =16,
+    CV_EVENT_FLAG_ALTKEY    =32
+};
+
+
+#define CV_GET_WHEEL_DELTA(flags) ((short)(((flags) >> 16) & 0xffff)) // signed wheel delta held in the upper 16 bits of flags; argument parenthesized so expressions such as (a | b) expand correctly
+
+typedef void (CV_CDECL *CvMouseCallback )(int event, int x, int y, int flags, void* param);
+
+/* assign callback for mouse events */
+CVAPI(void) cvSetMouseCallback( const char* window_name, CvMouseCallback on_mouse,
+                                void* param CV_DEFAULT(NULL));
+
+/* wait for key event infinitely (delay<=0) or for "delay" milliseconds */
+CVAPI(int) cvWaitKey(int delay CV_DEFAULT(0));
+
+// OpenGL support
+
+typedef void (CV_CDECL *CvOpenGlDrawCallback)(void* userdata);
+CVAPI(void) cvSetOpenGlDrawCallback(const char* window_name, CvOpenGlDrawCallback callback, void* userdata CV_DEFAULT(NULL));
+
+CVAPI(void) cvSetOpenGlContext(const char* window_name);
+CVAPI(void) cvUpdateWindow(const char* window_name);
+
+
+/****************************************************************************************\
+
+*                              Obsolete functions/synonyms                               *
+\****************************************************************************************/
+
+#define cvAddSearchPath(path)
+#define cvvInitSystem cvInitSystem
+#define cvvNamedWindow cvNamedWindow
+#define cvvShowImage cvShowImage
+#define cvvResizeWindow cvResizeWindow
+#define cvvDestroyWindow cvDestroyWindow
+#define cvvCreateTrackbar cvCreateTrackbar
+#define cvvAddSearchPath cvAddSearchPath
+#define cvvWaitKey(name) cvWaitKey(0)
+#define cvvWaitKeyEx(name,delay) cvWaitKey(delay)
+#define HG_AUTOSIZE CV_WINDOW_AUTOSIZE
+#define set_preprocess_func cvSetPreprocessFuncWin32
+#define set_postprocess_func cvSetPostprocessFuncWin32
+
+#if defined _WIN32
+
+CVAPI(void) cvSetPreprocessFuncWin32_(const void* callback);
+CVAPI(void) cvSetPostprocessFuncWin32_(const void* callback);
+#define cvSetPreprocessFuncWin32(callback) cvSetPreprocessFuncWin32_((const void*)(callback))
+#define cvSetPostprocessFuncWin32(callback) cvSetPostprocessFuncWin32_((const void*)(callback))
+
+#endif
+
+/** @} highgui_c */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs.hpp
new file mode 100644
index 000000000000..eba25ce1cfb8
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs.hpp
@@ -0,0 +1,475 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_IMGCODECS_HPP
+#define OPENCV_IMGCODECS_HPP
+
+#include "opencv2/core.hpp"
+
+/**
+  @defgroup imgcodecs Image file reading and writing
+  @{
+    @defgroup imgcodecs_flags Flags used for image file reading and writing
+    @defgroup imgcodecs_ios iOS glue
+    @defgroup imgcodecs_macosx MacOS(OSX) glue
+  @}
+*/
+
+//////////////////////////////// image codec ////////////////////////////////
+namespace cv
+{
+
+//! @addtogroup imgcodecs
+//! @{
+
+//! @addtogroup imgcodecs_flags
+//! @{
+
+//! Imread flags
+enum ImreadModes {
+       IMREAD_UNCHANGED            = -1, //!< If set, return the loaded image as is (with alpha channel, otherwise it gets cropped). Ignore EXIF orientation.
+       IMREAD_GRAYSCALE            = 0,  //!< If set, always convert image to the single channel grayscale image (codec internal conversion).
+       IMREAD_COLOR                = 1,  //!< If set, always convert image to the 3 channel BGR color image.
+       IMREAD_ANYDEPTH             = 2,  //!< If set, return 16-bit/32-bit image when the input has the corresponding depth, otherwise convert it to 8-bit.
+       IMREAD_ANYCOLOR             = 4,  //!< If set, the image is read in any possible color format.
+       IMREAD_LOAD_GDAL            = 8,  //!< If set, use the gdal driver for loading the image.
+       IMREAD_REDUCED_GRAYSCALE_2  = 16, //!< If set, always convert image to the single channel grayscale image and the image size reduced 1/2.
+       IMREAD_REDUCED_COLOR_2      = 17, //!< If set, always convert image to the 3 channel BGR color image and the image size reduced 1/2.
+       IMREAD_REDUCED_GRAYSCALE_4  = 32, //!< If set, always convert image to the single channel grayscale image and the image size reduced 1/4.
+       IMREAD_REDUCED_COLOR_4      = 33, //!< If set, always convert image to the 3 channel BGR color image and the image size reduced 1/4.
+       IMREAD_REDUCED_GRAYSCALE_8  = 64, //!< If set, always convert image to the single channel grayscale image and the image size reduced 1/8.
+       IMREAD_REDUCED_COLOR_8      = 65, //!< If set, always convert image to the 3 channel BGR color image and the image size reduced 1/8.
+       IMREAD_IGNORE_ORIENTATION   = 128 //!< If set, do not rotate the image according to EXIF's orientation flag.
+     };
+
+//! Imwrite flags
+enum ImwriteFlags {
+       IMWRITE_JPEG_QUALITY        = 1,  //!< For JPEG, it can be a quality from 0 to 100 (the higher is the better). Default value is 95.
+       IMWRITE_JPEG_PROGRESSIVE    = 2,  //!< Enable JPEG features, 0 or 1, default is False.
+       IMWRITE_JPEG_OPTIMIZE       = 3,  //!< Enable JPEG features, 0 or 1, default is False.
+       IMWRITE_JPEG_RST_INTERVAL   = 4,  //!< JPEG restart interval, 0 - 65535, default is 0 - no restart.
+       IMWRITE_JPEG_LUMA_QUALITY   = 5,  //!< Separate luma quality level, 0 - 100, default is -1 - don't use. If JPEG_LIB_VERSION < 70, Not supported.
+       IMWRITE_JPEG_CHROMA_QUALITY = 6,  //!< Separate chroma quality level, 0 - 100, default is -1 - don't use. If JPEG_LIB_VERSION < 70, Not supported.
+       IMWRITE_JPEG_SAMPLING_FACTOR = 7, //!< For JPEG, set sampling factor. See cv::ImwriteJPEGSamplingFactorParams.
+       IMWRITE_PNG_COMPRESSION     = 16, //!< For PNG, it can be the compression level from 0 to 9. A higher value means a smaller size and longer compression time. If specified, strategy is changed to IMWRITE_PNG_STRATEGY_DEFAULT (Z_DEFAULT_STRATEGY). Default value is 1 (best speed setting).
+       IMWRITE_PNG_STRATEGY        = 17, //!< One of cv::ImwritePNGFlags, default is IMWRITE_PNG_STRATEGY_RLE.
+       IMWRITE_PNG_BILEVEL         = 18, //!< Binary level PNG, 0 or 1, default is 0.
+       IMWRITE_PXM_BINARY          = 32, //!< For PPM, PGM, or PBM, it can be a binary format flag, 0 or 1. Default value is 1.
+       IMWRITE_EXR_TYPE            = (3 << 4) + 0 /* 48 */, //!< override EXR storage type (FLOAT (FP32) is default)
+       IMWRITE_EXR_COMPRESSION     = (3 << 4) + 1 /* 49 */, //!< override EXR compression type (ZIP_COMPRESSION = 3 is default)
+       IMWRITE_EXR_DWA_COMPRESSION_LEVEL = (3 << 4) + 2 /* 50 */, //!< override EXR DWA compression level (45 is default)
+       IMWRITE_WEBP_QUALITY        = 64, //!< For WEBP, it can be a quality from 1 to 100 (the higher is the better). By default (without any parameter) and for quality above 100 the lossless compression is used.
+       IMWRITE_HDR_COMPRESSION     = (5 << 4) + 0 /* 80 */, //!< specify HDR compression
+       IMWRITE_PAM_TUPLETYPE       = 128,//!< For PAM, sets the TUPLETYPE field to the corresponding string value that is defined for the format
+       IMWRITE_TIFF_RESUNIT        = 256,//!< For TIFF, use to specify which DPI resolution unit to set; see libtiff documentation for valid values
+       IMWRITE_TIFF_XDPI           = 257,//!< For TIFF, use to specify the X direction DPI
+       IMWRITE_TIFF_YDPI           = 258,//!< For TIFF, use to specify the Y direction DPI
+       IMWRITE_TIFF_COMPRESSION    = 259,//!< For TIFF, use to specify the image compression scheme. See cv::ImwriteTiffCompressionFlags. Note, for images whose depth is CV_32F, only libtiff's SGILOG compression scheme is used. For other supported depths, the compression scheme can be specified by this flag; LZW compression is the default.
+       IMWRITE_TIFF_ROWSPERSTRIP   = 278,//!< For TIFF, use to specify the number of rows per strip.
+       IMWRITE_TIFF_PREDICTOR      = 317,//!< For TIFF, use to specify predictor. See cv::ImwriteTiffPredictorFlags.
+       IMWRITE_JPEG2000_COMPRESSION_X1000 = 272,//!< For JPEG2000, use to specify the target compression rate (multiplied by 1000). The value can be from 0 to 1000. Default is 1000.
+       IMWRITE_AVIF_QUALITY        = 512,//!< For AVIF, it can be a quality between 0 and 100 (the higher the better). Default is 95.
+       IMWRITE_AVIF_DEPTH          = 513,//!< For AVIF, it can be 8, 10 or 12. If >8, it is stored/read as CV_32F. Default is 8.
+       IMWRITE_AVIF_SPEED          = 514 //!< For AVIF, it is between 0 (slowest) and 10 (fastest). Default is 9.
+     };
+
+enum ImwriteJPEGSamplingFactorParams {
+       IMWRITE_JPEG_SAMPLING_FACTOR_411 = 0x411111, //!< 4x1,1x1,1x1
+       IMWRITE_JPEG_SAMPLING_FACTOR_420 = 0x221111, //!< 2x2,1x1,1x1(Default)
+       IMWRITE_JPEG_SAMPLING_FACTOR_422 = 0x211111, //!< 2x1,1x1,1x1
+       IMWRITE_JPEG_SAMPLING_FACTOR_440 = 0x121111, //!< 1x2,1x1,1x1
+       IMWRITE_JPEG_SAMPLING_FACTOR_444 = 0x111111  //!< 1x1,1x1,1x1(No subsampling)
+     };
+
+enum ImwriteTiffCompressionFlags {
+        IMWRITE_TIFF_COMPRESSION_NONE = 1,            //!< dump mode
+        IMWRITE_TIFF_COMPRESSION_CCITTRLE = 2,        //!< CCITT modified Huffman RLE
+        IMWRITE_TIFF_COMPRESSION_CCITTFAX3 = 3,       //!< CCITT Group 3 fax encoding
+        IMWRITE_TIFF_COMPRESSION_CCITT_T4 = 3,        //!< CCITT T.4 (TIFF 6 name)
+        IMWRITE_TIFF_COMPRESSION_CCITTFAX4 = 4,       //!< CCITT Group 4 fax encoding
+        IMWRITE_TIFF_COMPRESSION_CCITT_T6 = 4,        //!< CCITT T.6 (TIFF 6 name)
+        IMWRITE_TIFF_COMPRESSION_LZW = 5,             //!< Lempel-Ziv  & Welch
+        IMWRITE_TIFF_COMPRESSION_OJPEG = 6,           //!< !6.0 JPEG
+        IMWRITE_TIFF_COMPRESSION_JPEG = 7,            //!< %JPEG DCT compression
+        IMWRITE_TIFF_COMPRESSION_T85 = 9,             //!< !TIFF/FX T.85 JBIG compression
+        IMWRITE_TIFF_COMPRESSION_T43 = 10,            //!< !TIFF/FX T.43 colour by layered JBIG compression
+        IMWRITE_TIFF_COMPRESSION_NEXT = 32766,        //!< NeXT 2-bit RLE
+        IMWRITE_TIFF_COMPRESSION_CCITTRLEW = 32771,   //!< #1 w/ word alignment
+        IMWRITE_TIFF_COMPRESSION_PACKBITS = 32773,    //!< Macintosh RLE
+        IMWRITE_TIFF_COMPRESSION_THUNDERSCAN = 32809, //!< ThunderScan RLE
+        IMWRITE_TIFF_COMPRESSION_IT8CTPAD = 32895,    //!< IT8 CT w/padding
+        IMWRITE_TIFF_COMPRESSION_IT8LW = 32896,       //!< IT8 Linework RLE
+        IMWRITE_TIFF_COMPRESSION_IT8MP = 32897,       //!< IT8 Monochrome picture
+        IMWRITE_TIFF_COMPRESSION_IT8BL = 32898,       //!< IT8 Binary line art
+        IMWRITE_TIFF_COMPRESSION_PIXARFILM = 32908,   //!< Pixar companded 10bit LZW
+        IMWRITE_TIFF_COMPRESSION_PIXARLOG = 32909,    //!< Pixar companded 11bit ZIP
+        IMWRITE_TIFF_COMPRESSION_DEFLATE = 32946,     //!< Deflate compression, legacy tag
+        IMWRITE_TIFF_COMPRESSION_ADOBE_DEFLATE = 8,   //!< Deflate compression, as recognized by Adobe
+        IMWRITE_TIFF_COMPRESSION_DCS = 32947,         //!< Kodak DCS encoding
+        IMWRITE_TIFF_COMPRESSION_JBIG = 34661,        //!< ISO JBIG
+        IMWRITE_TIFF_COMPRESSION_SGILOG = 34676,      //!< SGI Log Luminance RLE
+        IMWRITE_TIFF_COMPRESSION_SGILOG24 = 34677,    //!< SGI Log 24-bit packed
+        IMWRITE_TIFF_COMPRESSION_JP2000 = 34712,      //!< Leadtools JPEG2000
+        IMWRITE_TIFF_COMPRESSION_LERC = 34887,        //!< ESRI Lerc codec: https://github.com/Esri/lerc
+        IMWRITE_TIFF_COMPRESSION_LZMA = 34925,        //!< LZMA2
+        IMWRITE_TIFF_COMPRESSION_ZSTD = 50000,        //!< ZSTD: WARNING not registered in Adobe-maintained registry
+        IMWRITE_TIFF_COMPRESSION_WEBP = 50001,        //!< WEBP: WARNING not registered in Adobe-maintained registry
+        IMWRITE_TIFF_COMPRESSION_JXL = 50002          //!< JPEGXL: WARNING not registered in Adobe-maintained registry
+};
+
+//! Imwrite TIFF specific predictor values (see the per-enumerator notes below).
+enum ImwriteTiffPredictorFlags {
+        IMWRITE_TIFF_PREDICTOR_NONE = 1,              //!< no prediction scheme used
+        IMWRITE_TIFF_PREDICTOR_HORIZONTAL = 2,        //!< horizontal differencing
+        IMWRITE_TIFF_PREDICTOR_FLOATINGPOINT = 3      //!< floating point predictor
+};
+
+enum ImwriteEXRTypeFlags { // EXR storage type values; only HALF (1) and FLOAT (2) are defined, 0 is commented out
+       /*IMWRITE_EXR_TYPE_UNIT = 0, //!< not supported */
+       IMWRITE_EXR_TYPE_HALF   = 1, //!< store as HALF (FP16)
+       IMWRITE_EXR_TYPE_FLOAT  = 2  //!< store as FP32 (default)
+     };
+
+enum ImwriteEXRCompressionFlags { // EXR compression scheme values, numbered consecutively from 0 to 9
+       IMWRITE_EXR_COMPRESSION_NO    = 0, //!< no compression
+       IMWRITE_EXR_COMPRESSION_RLE   = 1, //!< run length encoding
+       IMWRITE_EXR_COMPRESSION_ZIPS  = 2, //!< zlib compression, one scan line at a time
+       IMWRITE_EXR_COMPRESSION_ZIP   = 3, //!< zlib compression, in blocks of 16 scan lines
+       IMWRITE_EXR_COMPRESSION_PIZ   = 4, //!< piz-based wavelet compression
+       IMWRITE_EXR_COMPRESSION_PXR24 = 5, //!< lossy 24-bit float compression
+       IMWRITE_EXR_COMPRESSION_B44   = 6, //!< lossy 4-by-4 pixel block compression, fixed compression rate
+       IMWRITE_EXR_COMPRESSION_B44A  = 7, //!< lossy 4-by-4 pixel block compression, flat fields are compressed more
+       IMWRITE_EXR_COMPRESSION_DWAA  = 8, //!< lossy DCT based compression, in blocks of 32 scanlines. More efficient for partial buffer access. Supported since OpenEXR 2.2.0.
+       IMWRITE_EXR_COMPRESSION_DWAB  = 9, //!< lossy DCT based compression, in blocks of 256 scanlines. More efficient space-wise and faster to decode full frames than IMWRITE_EXR_COMPRESSION_DWAA. Supported since OpenEXR 2.2.0.
+     };
+
+//! Imwrite PNG specific flags used to tune the compression algorithm.
+/** These flags modify the way PNG images are compressed and are passed to the underlying zlib processing stage.
+
+-   The effect of IMWRITE_PNG_STRATEGY_FILTERED is to force more Huffman coding and less string matching; it is somewhat intermediate between IMWRITE_PNG_STRATEGY_DEFAULT and IMWRITE_PNG_STRATEGY_HUFFMAN_ONLY.
+-   IMWRITE_PNG_STRATEGY_RLE is designed to be almost as fast as IMWRITE_PNG_STRATEGY_HUFFMAN_ONLY, but give better compression for PNG image data.
+-   The strategy parameter only affects the compression ratio but not the correctness of the compressed output even if it is not set appropriately.
+-   IMWRITE_PNG_STRATEGY_FIXED prevents the use of dynamic Huffman codes, allowing for a simpler decoder for special applications.
+*/
+enum ImwritePNGFlags {
+       IMWRITE_PNG_STRATEGY_DEFAULT      = 0, //!< Use this value for normal data.
+       IMWRITE_PNG_STRATEGY_FILTERED     = 1, //!< Use this value for data produced by a filter (or predictor). Filtered data consists mostly of small values with a somewhat random distribution. In this case, the compression algorithm is tuned to compress them better.
+       IMWRITE_PNG_STRATEGY_HUFFMAN_ONLY = 2, //!< Use this value to force Huffman encoding only (no string match).
+       IMWRITE_PNG_STRATEGY_RLE          = 3, //!< Use this value to limit match distances to one (run-length encoding).
+       IMWRITE_PNG_STRATEGY_FIXED        = 4  //!< Using this value prevents the use of dynamic Huffman codes, allowing for a simpler decoder for special applications.
+     };
+
+//! Imwrite PAM specific tupletype flags used to define the 'TUPLETYPE' field of a PAM file (consecutive values 0 to 5).
+enum ImwritePAMFlags {
+       IMWRITE_PAM_FORMAT_NULL            = 0,
+       IMWRITE_PAM_FORMAT_BLACKANDWHITE   = 1,
+       IMWRITE_PAM_FORMAT_GRAYSCALE       = 2,
+       IMWRITE_PAM_FORMAT_GRAYSCALE_ALPHA = 3,
+       IMWRITE_PAM_FORMAT_RGB             = 4,
+       IMWRITE_PAM_FORMAT_RGB_ALPHA       = 5
+     };
+
+//! Imwrite HDR specific values for the IMWRITE_HDR_COMPRESSION parameter key.
+enum ImwriteHDRCompressionFlags {
+    IMWRITE_HDR_COMPRESSION_NONE = 0, //!< no compression
+    IMWRITE_HDR_COMPRESSION_RLE = 1   //!< run length encoding (RLE)
+};
+
+//! @} imgcodecs_flags
+
+/** @brief Loads an image from a file.
+
+@anchor imread
+
+The function imread loads an image from the specified file and returns it. If the image cannot be
+read (because of missing file, improper permissions, unsupported or invalid format), the function
+returns an empty matrix ( Mat::data==NULL ).
+
+Currently, the following file formats are supported:
+
+-   Windows bitmaps - \*.bmp, \*.dib (always supported)
+-   JPEG files - \*.jpeg, \*.jpg, \*.jpe (see the *Note* section)
+-   JPEG 2000 files - \*.jp2 (see the *Note* section)
+-   Portable Network Graphics - \*.png (see the *Note* section)
+-   WebP - \*.webp (see the *Note* section)
+-   AVIF - \*.avif (see the *Note* section)
+-   Portable image format - \*.pbm, \*.pgm, \*.ppm \*.pxm, \*.pnm (always supported)
+-   PFM files - \*.pfm (see the *Note* section)
+-   Sun rasters - \*.sr, \*.ras (always supported)
+-   TIFF files - \*.tiff, \*.tif (see the *Note* section)
+-   OpenEXR Image files - \*.exr (see the *Note* section)
+-   Radiance HDR - \*.hdr, \*.pic (always supported)
+-   Raster and Vector geospatial data supported by GDAL (see the *Note* section)
+
+@note
+-   The function determines the type of an image by the content, not by the file extension.
+-   In the case of color images, the decoded images will have the channels stored in **B G R** order.
+-   When using IMREAD_GRAYSCALE, the codec's internal grayscale conversion will be used, if available.
+    Results may differ from the output of cvtColor()
+-   On Microsoft Windows\* OS and MacOSX\*, the codecs shipped with an OpenCV image (libjpeg,
+    libpng, libtiff, and libjasper) are used by default. So, OpenCV can always read JPEGs, PNGs,
+    and TIFFs. On MacOSX, there is also an option to use native MacOSX image readers. But beware
+    that currently these native image loaders give images with different pixel values because of
+    the color management embedded into MacOSX.
+-   On Linux\*, BSD flavors and other Unix-like open-source operating systems, OpenCV looks for
+    codecs supplied with an OS image. Install the relevant packages (do not forget the development
+    files, for example, "libjpeg-dev", in Debian\* and Ubuntu\*) to get the codec support or turn
+    on the OPENCV_BUILD_3RDPARTY_LIBS flag in CMake.
+-   In the case you set *WITH_GDAL* flag to true in CMake and @ref IMREAD_LOAD_GDAL to load the image,
+    then the [GDAL](http://www.gdal.org) driver will be used in order to decode the image, supporting
+    the following formats: [Raster](http://www.gdal.org/formats_list.html),
+    [Vector](http://www.gdal.org/ogr_formats.html).
+-   If EXIF information is embedded in the image file, the EXIF orientation will be taken into account
+    and thus the image will be rotated accordingly except if the flags @ref IMREAD_IGNORE_ORIENTATION
+    or @ref IMREAD_UNCHANGED are passed.
+-   Use the IMREAD_UNCHANGED flag to keep the floating point values from PFM image.
+-   By default number of pixels must be less than 2^30. Limit can be set using system
+    variable OPENCV_IO_MAX_IMAGE_PIXELS
+
+@param filename Name of file to be loaded.
+@param flags Flag that can take values of cv::ImreadModes
+*/
+CV_EXPORTS_W Mat imread( const String& filename, int flags = IMREAD_COLOR );
+
+/** @brief Loads an image from a file.
+
+This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts and the return value.
+@param filename Name of file to be loaded.
+@param dst object in which the image will be loaded.
+@param flags Flag that can take values of cv::ImreadModes
+@note
+The image passed through the dst parameter can be pre-allocated. The memory is reused if the shape and the type match those of the loaded image.
+ */
+CV_EXPORTS_W void imread( const String& filename, OutputArray dst, int flags = IMREAD_COLOR );
+
+/** @brief Loads a multi-page image from a file.
+
+The function imreadmulti loads a multi-page image from the specified file into a vector of Mat objects.
+@param filename Name of file to be loaded.
+@param mats A vector of Mat objects holding each page.
+@param flags Flag that can take values of cv::ImreadModes, default with cv::IMREAD_ANYCOLOR.
+@sa cv::imread
+*/
+CV_EXPORTS_W bool imreadmulti(const String& filename, CV_OUT std::vector<Mat>& mats, int flags = IMREAD_ANYCOLOR);
+
+/** @brief Loads a range of pages of a multi-page image from a file.
+
+The function imreadmulti loads a specified range from a multi-page image from the specified file into a vector of Mat objects.
+@param filename Name of file to be loaded.
+@param mats A vector of Mat objects holding each page.
+@param start Start index of the image to load
+@param count Count number of images to load
+@param flags Flag that can take values of cv::ImreadModes, default with cv::IMREAD_ANYCOLOR.
+@sa cv::imread
+*/
+CV_EXPORTS_W bool imreadmulti(const String& filename, CV_OUT std::vector<Mat>& mats, int start, int count, int flags = IMREAD_ANYCOLOR);
+
+/** @brief Returns the number of images inside the given file
+
+The function imcount will return the number of pages in a multi-page image, or 1 for single-page images
+@param filename Name of file to be loaded.
+@param flags Flag that can take values of cv::ImreadModes, default with cv::IMREAD_ANYCOLOR.
+*/
+CV_EXPORTS_W size_t imcount(const String& filename, int flags = IMREAD_ANYCOLOR);
+
+/** @brief Saves an image to a specified file.
+
+The function imwrite saves the image to the specified file. The image format is chosen based on the
+filename extension (see cv::imread for the list of extensions). In general, only 8-bit unsigned (CV_8U)
+single-channel or 3-channel (with 'BGR' channel order) images
+can be saved using this function, with these exceptions:
+
+- With OpenEXR encoder, only 32-bit float (CV_32F) images can be saved.
+  - 8-bit unsigned (CV_8U) images are not supported.
+- With Radiance HDR encoder, non 64-bit float (CV_64F) images can be saved.
+  - All images will be converted to 32-bit float (CV_32F).
+- With JPEG 2000 encoder, 8-bit unsigned (CV_8U) and 16-bit unsigned (CV_16U) images can be saved.
+- With PAM encoder, 8-bit unsigned (CV_8U) and 16-bit unsigned (CV_16U) images can be saved.
+- With PNG encoder, 8-bit unsigned (CV_8U) and 16-bit unsigned (CV_16U) images can be saved.
+  - PNG images with an alpha channel can be saved using this function. To do this, create
+    8-bit (or 16-bit) 4-channel image BGRA, where the alpha channel goes last. Fully transparent pixels
+    should have alpha set to 0, fully opaque pixels should have alpha set to 255/65535 (see the code sample below).
+- With PGM/PPM encoder, 8-bit unsigned (CV_8U) and 16-bit unsigned (CV_16U) images can be saved.
+- With TIFF encoder, 8-bit unsigned (CV_8U), 16-bit unsigned (CV_16U),
+                     32-bit float (CV_32F) and 64-bit float (CV_64F) images can be saved.
+  - Multiple images (vector of Mat) can be saved in TIFF format (see the code sample below).
+  - 32-bit float 3-channel (CV_32FC3) TIFF images will be saved
+    using the LogLuv high dynamic range encoding (4 bytes per pixel)
+
+If the image format is not supported, the image will be converted to 8-bit unsigned (CV_8U) and saved that way.
+
+If the format, depth or channel order is different, use
+Mat::convertTo and cv::cvtColor to convert it before saving. Or, use the universal FileStorage I/O
+functions to save the image to XML or YAML format.
+
+The sample below shows how to create a BGRA image, how to set custom compression parameters and save it to a PNG file.
+It also demonstrates how to save multiple images in a TIFF file:
+@include snippets/imgcodecs_imwrite.cpp
+@param filename Name of the file.
+@param img (Mat or vector of Mat) Image or Images to be saved.
+@param params Format-specific parameters encoded as pairs (paramId_1, paramValue_1, paramId_2, paramValue_2, ... .) see cv::ImwriteFlags
+*/
+CV_EXPORTS_W bool imwrite( const String& filename, InputArray img,
+              const std::vector<int>& params = std::vector<int>());
+
+//! @brief Multi-image overload for bindings: forwards filename, img and params unchanged to imwrite() and returns its result.
+CV_WRAP static inline
+bool imwritemulti(const String& filename, InputArrayOfArrays img,
+                  const std::vector<int>& params = std::vector<int>())
+{
+    return imwrite(filename, img, params);
+}
+
+/** @brief Reads an image from a buffer in memory.
+
+The function imdecode reads an image from the specified buffer in the memory. If the buffer is too short or
+contains invalid data, the function returns an empty matrix ( Mat::data==NULL ).
+
+See cv::imread for the list of supported formats and flags description.
+
+@note In the case of color images, the decoded images will have the channels stored in **B G R** order.
+@param buf Input array or vector of bytes.
+@param flags The same flags as in cv::imread, see cv::ImreadModes.
+*/
+CV_EXPORTS_W Mat imdecode( InputArray buf, int flags );
+
+/** @overload
+@param buf Input array or vector of bytes.
+@param flags The same flags as in cv::imread, see cv::ImreadModes.
+@param dst The optional output placeholder for the decoded matrix. It can save the image
+reallocations when the function is called repeatedly for images of the same size. In case of decoder
+failure the function returns empty cv::Mat object, but does not release user-provided dst buffer.
+*/
+CV_EXPORTS Mat imdecode( InputArray buf, int flags, Mat* dst);
+
+/** @brief Reads a multi-page image from a buffer in memory.
+
+The function imdecodemulti reads a multi-page image from the specified buffer in the memory. If the buffer is too short or
+contains invalid data, the function returns false.
+
+See cv::imreadmulti for the list of supported formats and flags description.
+
+@note In the case of color images, the decoded images will have the channels stored in **B G R** order.
+@param buf Input array or vector of bytes.
+@param flags The same flags as in cv::imread, see cv::ImreadModes.
+@param mats A vector of Mat objects holding each page, if more than one.
+@param range A continuous selection of pages.
+*/
+CV_EXPORTS_W bool imdecodemulti(InputArray buf, int flags, CV_OUT std::vector<Mat>& mats, const cv::Range& range = Range::all());
+
+/** @brief Encodes an image into a memory buffer.
+
+The function imencode compresses the image and stores it in the memory buffer that is resized to fit the
+result. See cv::imwrite for the list of supported formats and flags description.
+
+@param ext File extension that defines the output format. Must include a leading period.
+@param img Image to be written.
+@param buf Output buffer resized to fit the compressed image.
+@param params Format-specific parameters. See cv::imwrite and cv::ImwriteFlags.
+*/
+CV_EXPORTS_W bool imencode( const String& ext, InputArray img,
+                            CV_OUT std::vector<uchar>& buf,
+                            const std::vector<int>& params = std::vector<int>());
+
+/** @brief Returns true if the specified image can be decoded by OpenCV
+
+@param filename File name of the image
+*/
+CV_EXPORTS_W bool haveImageReader( const String& filename );
+
+/** @brief Returns true if an image with the specified filename can be encoded by OpenCV
+
+ @param filename File name of the image
+ */
+CV_EXPORTS_W bool haveImageWriter( const String& filename );
+
+/** @brief Reads multi-page images on demand.
+
+The ImageCollection class provides an iterator API to read multi-page images on demand. Create an iterator
+to the collection of images and iterate over the collection. Decode the necessary page with operator*.
+
+The performance of page decoding is O(1) if the collection is incremented sequentially. If the user wants to access a random page,
+then the time complexity is O(n) because the collection has to be reinitialized every time in order to go to the correct page.
+However, the intermediate pages are not decoded during the process, so typically it's quite fast.
+This is required because multi-page codecs do not support going backwards.
+After a page has been decoded, it is stored inside the collection cache. Hence, getting the Mat object of an already decoded page is O(1).
+If you need memory, you can use the .releaseCache() method to release a cached index.
+The space complexity is O(n) if all pages are decoded into memory. The user is able to decode and release images on demand.
+*/
+class CV_EXPORTS ImageCollection {
+public:
+    struct CV_EXPORTS iterator {
+        iterator(ImageCollection* col);
+        iterator(ImageCollection* col, int end);
+        Mat& operator*();
+        Mat* operator->();
+        iterator& operator++();
+        iterator operator++(int);
+        friend bool operator== (const iterator& a, const iterator& b) { return a.m_curr == b.m_curr; }
+        friend bool operator!= (const iterator& a, const iterator& b) { return a.m_curr != b.m_curr; }
+        // Note: == and != compare m_curr only; m_pCollection does not take part in the comparison.
+    private:
+        ImageCollection* m_pCollection;
+        int m_curr;
+    };
+
+    ImageCollection();
+    ImageCollection(const String& filename, int flags);
+    void init(const String& img, int flags);
+    size_t size() const;
+    const Mat& at(int index);
+    const Mat& operator[](int index);
+    void releaseCache(int index);
+    iterator begin();
+    iterator end();
+
+    class Impl; // only forward-declared in this header
+    Ptr<Impl> getImpl();
+protected:
+    Ptr<Impl> pImpl; // handle to the forward-declared Impl object
+};
+
+//! @} imgcodecs
+
+} // cv
+
+#endif //OPENCV_IMGCODECS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/imgcodecs.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/imgcodecs.hpp
new file mode 100644
index 000000000000..a3cd23264546
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/imgcodecs.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/imgcodecs.hpp"
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/imgcodecs_c.h b/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/imgcodecs_c.h
new file mode 100644
index 000000000000..c78b3f72faa6
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/imgcodecs_c.h
@@ -0,0 +1 @@
+#error "This header with legacy C API declarations has been removed from OpenCV. Legacy constants are available from legacy/constants_c.h file."
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/ios.h b/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/ios.h
new file mode 100644
index 000000000000..5f1721817047
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/ios.h
@@ -0,0 +1,59 @@
+
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#import <UIKit/UIKit.h>
+#import <Accelerate/Accelerate.h>
+#import <AVFoundation/AVFoundation.h>
+#import <ImageIO/ImageIO.h>
+#include "opencv2/core.hpp"
+
+//! @addtogroup imgcodecs_ios
+//! @{
+
+CV_EXPORTS CGImageRef MatToCGImage(const cv::Mat& image) CF_RETURNS_RETAINED;
+CV_EXPORTS void CGImageToMat(const CGImageRef image, cv::Mat& m, bool alphaExist = false);
+CV_EXPORTS UIImage* MatToUIImage(const cv::Mat& image);
+CV_EXPORTS void UIImageToMat(const UIImage* image,
+                             cv::Mat& m, bool alphaExist = false);
+
+//! @}
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/legacy/constants_c.h b/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/legacy/constants_c.h
new file mode 100644
index 000000000000..de7be4f74d78
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/legacy/constants_c.h
@@ -0,0 +1,54 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_IMGCODECS_LEGACY_CONSTANTS_H
+#define OPENCV_IMGCODECS_LEGACY_CONSTANTS_H
+
+/* Duplicate of the "ImreadModes" enumeration, kept as an anonymous enum for better compatibility with OpenCV 3.x. */
+enum
+{
+/* 8bit, color or not */
+    CV_LOAD_IMAGE_UNCHANGED  =-1,
+/* 8bit, gray */
+    CV_LOAD_IMAGE_GRAYSCALE  =0,
+/* ?, color */
+    CV_LOAD_IMAGE_COLOR      =1,
+/* any depth, ? */
+    CV_LOAD_IMAGE_ANYDEPTH   =2,
+/* ?, any color */
+    CV_LOAD_IMAGE_ANYCOLOR   =4,
+/* ?, no rotate */
+    CV_LOAD_IMAGE_IGNORE_ORIENTATION  =128
+};
+
+/* Duplicate of the "ImwriteFlags" enumeration for better compatibility with OpenCV 3.x. Parameter keys and their value constants share this one anonymous enum, so numeric values repeat (e.g. CV_IMWRITE_JPEG_QUALITY and CV_IMWRITE_PNG_STRATEGY_FILTERED are both 1). */
+enum
+{
+    CV_IMWRITE_JPEG_QUALITY =1,
+    CV_IMWRITE_JPEG_PROGRESSIVE =2,
+    CV_IMWRITE_JPEG_OPTIMIZE =3,
+    CV_IMWRITE_JPEG_RST_INTERVAL =4,
+    CV_IMWRITE_JPEG_LUMA_QUALITY =5,
+    CV_IMWRITE_JPEG_CHROMA_QUALITY =6,
+    CV_IMWRITE_PNG_COMPRESSION =16,
+    CV_IMWRITE_PNG_STRATEGY =17,
+    CV_IMWRITE_PNG_BILEVEL =18,
+    CV_IMWRITE_PNG_STRATEGY_DEFAULT =0,
+    CV_IMWRITE_PNG_STRATEGY_FILTERED =1,
+    CV_IMWRITE_PNG_STRATEGY_HUFFMAN_ONLY =2,
+    CV_IMWRITE_PNG_STRATEGY_RLE =3,
+    CV_IMWRITE_PNG_STRATEGY_FIXED =4,
+    CV_IMWRITE_PXM_BINARY =32,
+    CV_IMWRITE_EXR_TYPE = 48,
+    CV_IMWRITE_WEBP_QUALITY =64,
+    CV_IMWRITE_PAM_TUPLETYPE = 128,
+    CV_IMWRITE_PAM_FORMAT_NULL = 0,
+    CV_IMWRITE_PAM_FORMAT_BLACKANDWHITE = 1,
+    CV_IMWRITE_PAM_FORMAT_GRAYSCALE = 2,
+    CV_IMWRITE_PAM_FORMAT_GRAYSCALE_ALPHA = 3,
+    CV_IMWRITE_PAM_FORMAT_RGB = 4,
+    CV_IMWRITE_PAM_FORMAT_RGB_ALPHA = 5,
+};
+
+#endif // OPENCV_IMGCODECS_LEGACY_CONSTANTS_H
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/macosx.h b/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/macosx.h
new file mode 100644
index 000000000000..cfb07707001d
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/imgcodecs/macosx.h
@@ -0,0 +1,20 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#if !defined(__APPLE__) || !defined(__MACH__)
+#error This header should be used in macOS ObjC/Swift projects.
+#endif
+
+#import <AppKit/AppKit.h>
+#include "opencv2/core.hpp"
+
+//! @addtogroup imgcodecs_macosx
+//! @{
+
+CV_EXPORTS CGImageRef MatToCGImage(const cv::Mat& image) CF_RETURNS_RETAINED;
+CV_EXPORTS void CGImageToMat(const CGImageRef image, cv::Mat& m, bool alphaExist = false);
+CV_EXPORTS NSImage* MatToNSImage(const cv::Mat& image);
+CV_EXPORTS void NSImageToMat(const NSImage* image, cv::Mat& m, bool alphaExist = false);
+
+//! @}
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/imgproc.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc.hpp
new file mode 100644
index 000000000000..471a857f63fc
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc.hpp
@@ -0,0 +1,5070 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_IMGPROC_HPP
+#define OPENCV_IMGPROC_HPP
+
+#include "opencv2/core.hpp"
+
+/**
+@defgroup imgproc Image Processing
+
+This module includes image-processing functions.
+
+@{
+    @defgroup imgproc_filter Image Filtering
+
+    Functions and classes described in this section are used to perform various linear or non-linear
+    filtering operations on 2D images (represented as Mat's). It means that for each pixel location
+    \f$(x,y)\f$ in the source image (normally, rectangular), its neighborhood is considered and used to
+    compute the response. In case of a linear filter, it is a weighted sum of pixel values. In case of
+    morphological operations, it is the minimum or maximum values, and so on. The computed response is
+    stored in the destination image at the same location \f$(x,y)\f$. It means that the output image
+    will be of the same size as the input image. Normally, the functions support multi-channel arrays,
+    in which case every channel is processed independently. Therefore, the output image will also have
+    the same number of channels as the input one.
+
+    Another common feature of the functions and classes described in this section is that, unlike
+    simple arithmetic functions, they need to extrapolate values of some non-existing pixels. For
+    example, if you want to smooth an image using a Gaussian \f$3 \times 3\f$ filter, then, when
+    processing the left-most pixels in each row, you need pixels to the left of them, that is, outside
+    of the image. You can let these pixels be the same as the left-most image pixels ("replicated
+    border" extrapolation method), or assume that all the non-existing pixels are zeros ("constant
+    border" extrapolation method), and so on. OpenCV enables you to specify the extrapolation method.
+    For details, see #BorderTypes
+
+    @anchor filter_depths
+    ### Depth combinations
+    Input depth (src.depth()) | Output depth (ddepth)
+    --------------------------|----------------------
+    CV_8U                     | -1/CV_16S/CV_32F/CV_64F
+    CV_16U/CV_16S             | -1/CV_32F/CV_64F
+    CV_32F                    | -1/CV_32F
+    CV_64F                    | -1/CV_64F
+
+    @note when ddepth=-1, the output image will have the same depth as the source.
+
+    @note if you need double floating-point accuracy and using single floating-point input data
+    (CV_32F input and CV_64F output depth combination), you can use @ref Mat.convertTo to convert
+    the input data to the desired precision.
+
+    @defgroup imgproc_transform Geometric Image Transformations
+
+    The functions in this section perform various geometrical transformations of 2D images. They do not
+    change the image content but deform the pixel grid and map this deformed grid to the destination
+    image. In fact, to avoid sampling artifacts, the mapping is done in the reverse order, from
+    destination to the source. That is, for each pixel \f$(x, y)\f$ of the destination image, the
+    functions compute coordinates of the corresponding "donor" pixel in the source image and copy the
+    pixel value:
+
+    \f[\texttt{dst} (x,y)= \texttt{src} (f_x(x,y), f_y(x,y))\f]
+
+    In case when you specify the forward mapping \f$\left<g_x, g_y\right>: \texttt{src} \rightarrow
+    \texttt{dst}\f$, the OpenCV functions first compute the corresponding inverse mapping
+    \f$\left<f_x, f_y\right>: \texttt{dst} \rightarrow \texttt{src}\f$ and then use the above formula.
+
+    The actual implementations of the geometrical transformations, from the most generic remap to
+    the simplest and fastest resize, need to solve two main problems with the above formula:
+
+    - Extrapolation of non-existing pixels. Similarly to the filtering functions described in the
+    previous section, for some \f$(x,y)\f$, either one of \f$f_x(x,y)\f$, or \f$f_y(x,y)\f$, or both
+    of them may fall outside of the image. In this case, an extrapolation method needs to be used.
+    OpenCV provides the same selection of extrapolation methods as in the filtering functions. In
+    addition, it provides the method #BORDER_TRANSPARENT. This means that the corresponding pixels in
+    the destination image will not be modified at all.
+
+    - Interpolation of pixel values. Usually \f$f_x(x,y)\f$ and \f$f_y(x,y)\f$ are floating-point
+    numbers. This means that \f$\left<f_x, f_y\right>\f$ can be either an affine or perspective
+    transformation, or radial lens distortion correction, and so on. So, a pixel value at fractional
+    coordinates needs to be retrieved. In the simplest case, the coordinates can be just rounded to the
+    nearest integer coordinates and the corresponding pixel can be used. This is called a
+    nearest-neighbor interpolation. However, a better result can be achieved by using more
+    sophisticated [interpolation methods](http://en.wikipedia.org/wiki/Multivariate_interpolation) ,
+    where a polynomial function is fit into some neighborhood of the computed pixel \f$(f_x(x,y),
+    f_y(x,y))\f$, and then the value of the polynomial at \f$(f_x(x,y), f_y(x,y))\f$ is taken as the
+    interpolated pixel value. In OpenCV, you can choose between several interpolation methods. See
+    #resize for details.
+
+    @note The geometrical transformations do not work with `CV_8S` or `CV_32S` images.
+
+    @defgroup imgproc_misc Miscellaneous Image Transformations
+    @defgroup imgproc_draw Drawing Functions
+
+    Drawing functions work with matrices/images of arbitrary depth. The boundaries of the shapes can be
+    rendered with antialiasing (implemented only for 8-bit images for now). All the functions include
+    the parameter color that uses an RGB value (that may be constructed with the Scalar constructor )
+    for color images and brightness for grayscale images. For color images, the channel ordering is
+    normally *Blue, Green, Red*. This is what imshow, imread, and imwrite expect. So, if you form a
+    color using the Scalar constructor, it should look like:
+
+    \f[\texttt{Scalar} (blue \_ component, green \_ component, red \_ component[, alpha \_ component])\f]
+
+    If you are using your own image rendering and I/O functions, you can use any channel ordering. The
+    drawing functions process each channel independently and do not depend on the channel order or even
+    on the used color space. The whole image can be converted from BGR to RGB or to a different color
+    space using cvtColor .
+
+    If a drawn figure is partially or completely outside the image, the drawing functions clip it. Also,
+    many drawing functions can handle pixel coordinates specified with sub-pixel accuracy. This means
+    that the coordinates can be passed as fixed-point numbers encoded as integers. The number of
+    fractional bits is specified by the shift parameter and the real point coordinates are calculated as
+    \f$\texttt{Point}(x,y)\rightarrow\texttt{Point2f}(x*2^{-shift},y*2^{-shift})\f$ . This feature is
+    especially effective when rendering antialiased shapes.
+
+    @note The functions do not support alpha-transparency when the target image is 4-channel. In this
+    case, the color[3] is simply copied to the repainted pixels. Thus, if you want to paint
+    semi-transparent shapes, you can paint them in a separate buffer and then blend it with the main
+    image.
+
+    @defgroup imgproc_color_conversions Color Space Conversions
+    @defgroup imgproc_colormap ColorMaps in OpenCV
+
+    The human perception isn't built for observing fine changes in grayscale images. Human eyes are more
+    sensitive to observing changes between colors, so you often need to recolor your grayscale images to
+    get a clue about them. OpenCV now comes with various colormaps to enhance the visualization in your
+    computer vision application.
+
+    In OpenCV you only need applyColorMap to apply a colormap on a given image. The following sample
+    code reads the path to an image from command line, applies a Jet colormap on it and shows the
+    result:
+
+    @include snippets/imgproc_applyColorMap.cpp
+
+    @see #ColormapTypes
+
+    @defgroup imgproc_subdiv2d Planar Subdivision
+
+    The Subdiv2D class described in this section is used to perform various planar subdivisions on
+    a set of 2D points (represented as a vector of Point2f). OpenCV subdivides a plane into triangles
+    using Delaunay's algorithm, which corresponds to the dual graph of the Voronoi diagram.
+    In the figure below, the Delaunay triangulation is marked with black lines and the Voronoi
+    diagram with red lines.
+
+    ![Delaunay triangulation (black) and Voronoi (red)](pics/delaunay_voronoi.png)
+
+    The subdivisions can be used for the 3D piece-wise transformation of a plane, morphing, fast
+    location of points on the plane, building special graphs (such as NNG,RNG), and so forth.
+
+    @defgroup imgproc_hist Histograms
+    @defgroup imgproc_shape Structural Analysis and Shape Descriptors
+    @defgroup imgproc_motion Motion Analysis and Object Tracking
+    @defgroup imgproc_feature Feature Detection
+    @defgroup imgproc_object Object Detection
+    @defgroup imgproc_segmentation Image Segmentation
+    @defgroup imgproc_hal Hardware Acceleration Layer
+    @{
+        @defgroup imgproc_hal_functions Functions
+        @defgroup imgproc_hal_interface Interface
+    @}
+  @}
+*/
+
+namespace cv
+{
+
+/** @addtogroup imgproc
+@{
+*/
+
+//! @addtogroup imgproc_filter
+//! @{
+
+enum SpecialFilter {
+    FILTER_SCHARR = -1 //!< NOTE(review): presumably a special kernel-size value selecting the Scharr kernel - confirm against the Sobel/getDerivKernels documentation
+};
+
+//! type of morphological operation
+enum MorphTypes{
+    MORPH_ERODE    = 0, //!< see #erode
+    MORPH_DILATE   = 1, //!< see #dilate
+    MORPH_OPEN     = 2, //!< an opening operation
+                        //!< \f[\texttt{dst} = \mathrm{open} ( \texttt{src} , \texttt{element} )= \mathrm{dilate} ( \mathrm{erode} ( \texttt{src} , \texttt{element} ))\f]
+    MORPH_CLOSE    = 3, //!< a closing operation
+                        //!< \f[\texttt{dst} = \mathrm{close} ( \texttt{src} , \texttt{element} )= \mathrm{erode} ( \mathrm{dilate} ( \texttt{src} , \texttt{element} ))\f]
+    MORPH_GRADIENT = 4, //!< a morphological gradient
+                        //!< \f[\texttt{dst} = \mathrm{morph\_grad} ( \texttt{src} , \texttt{element} )= \mathrm{dilate} ( \texttt{src} , \texttt{element} )- \mathrm{erode} ( \texttt{src} , \texttt{element} )\f]
+    MORPH_TOPHAT   = 5, //!< "top hat"
+                        //!< \f[\texttt{dst} = \mathrm{tophat} ( \texttt{src} , \texttt{element} )= \texttt{src} - \mathrm{open} ( \texttt{src} , \texttt{element} )\f]
+    MORPH_BLACKHAT = 6, //!< "black hat"
+                        //!< \f[\texttt{dst} = \mathrm{blackhat} ( \texttt{src} , \texttt{element} )= \mathrm{close} ( \texttt{src} , \texttt{element} )- \texttt{src}\f]
+    MORPH_HITMISS  = 7  //!< "hit or miss".
+                        //!< Only supported for CV_8UC1 binary images. A tutorial can be found in the documentation.
+};
+
+//! shape of the structuring element
+enum MorphShapes {
+    MORPH_RECT    = 0, //!< a rectangular structuring element:  \f[E_{ij}=1\f]
+    MORPH_CROSS   = 1, //!< a cross-shaped structuring element:
+                       //!< \f[E_{ij} = \begin{cases} 1 & \texttt{if } {i=\texttt{anchor.y } {or } {j=\texttt{anchor.x}}} \\0 & \texttt{otherwise} \end{cases}\f]
+    MORPH_ELLIPSE = 2 //!< an elliptic structuring element, that is, a filled ellipse inscribed
+                      //!< into the rectangle Rect(0, 0, esize.width, esize.height)
+};
+
+//! @} imgproc_filter
+
+//! @addtogroup imgproc_transform
+//! @{
+
+//! interpolation algorithm (values 0..6) combined with optional warp flags (8, 16, 32)
+enum InterpolationFlags{
+    /** nearest neighbor interpolation */
+    INTER_NEAREST        = 0,
+    /** bilinear interpolation */
+    INTER_LINEAR         = 1,
+    /** bicubic interpolation */
+    INTER_CUBIC          = 2,
+    /** resampling using pixel area relation. It may be a preferred method for image decimation, as
+    it gives moire-free results. But when the image is zoomed, it is similar to the INTER_NEAREST
+    method. */
+    INTER_AREA           = 3,
+    /** Lanczos interpolation over 8x8 neighborhood */
+    INTER_LANCZOS4       = 4,
+    /** Bit exact bilinear interpolation */
+    INTER_LINEAR_EXACT = 5,
+    /** Bit exact nearest neighbor interpolation. This will produce the same results as
+    the nearest neighbor method in PIL, scikit-image or Matlab. */
+    INTER_NEAREST_EXACT  = 6,
+    /** mask for interpolation codes */
+    INTER_MAX            = 7,
+    /** flag, fills all of the destination image pixels. If some of them correspond to outliers in the
+    source image, they are set to zero */
+    WARP_FILL_OUTLIERS   = 8,
+    /** flag, inverse transformation
+
+    For example, #linearPolar or #logPolar transforms:
+    - flag is __not__ set: \f$dst( \rho , \phi ) = src(x,y)\f$
+    - flag is set: \f$dst(x,y) = src( \rho , \phi )\f$
+    */
+    WARP_INVERSE_MAP     = 16,
+    WARP_RELATIVE_MAP    = 32 //!< NOTE(review): undocumented here; presumably marks the map as relative to each pixel's position - confirm against the remap documentation
+};
+
+/** \brief Specifies the polar mapping mode (linear or semilog)
+@sa warpPolar
+*/
+enum WarpPolarMode
+{
+    WARP_POLAR_LINEAR = 0, ///< Remaps an image to/from polar space.
+    WARP_POLAR_LOG = 256   ///< Remaps an image to/from semilog-polar space.
+};
+
+enum InterpolationMasks { // constants derived from INTER_BITS; NOTE(review): presumably sizes of interpolation lookup tables - confirm
+       INTER_BITS      = 5, //!< base bit count from which the values below are derived
+       INTER_BITS2     = INTER_BITS * 2, //!< twice INTER_BITS, i.e. 10
+       INTER_TAB_SIZE  = 1 << INTER_BITS, //!< 2^INTER_BITS, i.e. 32
+       INTER_TAB_SIZE2 = INTER_TAB_SIZE * INTER_TAB_SIZE //!< INTER_TAB_SIZE squared, i.e. 1024
+     };
+
+//! @} imgproc_transform
+
+//! @addtogroup imgproc_misc
+//! @{
+
+//! Distance types for Distance Transform and M-estimators
+//! @see distanceTransform, fitLine
+enum DistanceTypes {
+    DIST_USER    = -1,  //!< User defined distance
+    DIST_L1      = 1,   //!< distance = |x1-x2| + |y1-y2|
+    DIST_L2      = 2,   //!< the simple euclidean distance
+    DIST_C       = 3,   //!< distance = max(|x1-x2|,|y1-y2|)
+    DIST_L12     = 4,   //!< L1-L2 metric: distance = 2(sqrt(1+x*x/2) - 1)
+    DIST_FAIR    = 5,   //!< distance = c^2(|x|/c-log(1+|x|/c)), c = 1.3998
+    DIST_WELSCH  = 6,   //!< distance = c^2/2(1-exp(-(x/c)^2)), c = 2.9846
+    DIST_HUBER   = 7    //!< distance = |x|<c ? x^2/2 : c(|x|-c/2), c=1.345
+};
+
+//! Mask size for distance transform
+enum DistanceTransformMasks {
+    DIST_MASK_3       = 3, //!< mask=3
+    DIST_MASK_5       = 5, //!< mask=5
+    DIST_MASK_PRECISE = 0  //!< mask=0; NOTE(review): presumably requests the precise (non-mask-based) computation - confirm against the distanceTransform documentation
+};
+
+//! type of the threshold operation
+//! ![threshold types](pics/threshold.png)
+enum ThresholdTypes {
+    THRESH_BINARY     = 0, //!< \f[\texttt{dst} (x,y) =  \fork{\texttt{maxval}}{if \(\texttt{src}(x,y) > \texttt{thresh}\)}{0}{otherwise}\f]
+    THRESH_BINARY_INV = 1, //!< \f[\texttt{dst} (x,y) =  \fork{0}{if \(\texttt{src}(x,y) > \texttt{thresh}\)}{\texttt{maxval}}{otherwise}\f]
+    THRESH_TRUNC      = 2, //!< \f[\texttt{dst} (x,y) =  \fork{\texttt{thresh}}{if \(\texttt{src}(x,y) > \texttt{thresh}\)}{\texttt{src}(x,y)}{otherwise}\f]
+    THRESH_TOZERO     = 3, //!< \f[\texttt{dst} (x,y) =  \fork{\texttt{src}(x,y)}{if \(\texttt{src}(x,y) > \texttt{thresh}\)}{0}{otherwise}\f]
+    THRESH_TOZERO_INV = 4, //!< \f[\texttt{dst} (x,y) =  \fork{0}{if \(\texttt{src}(x,y) > \texttt{thresh}\)}{\texttt{src}(x,y)}{otherwise}\f]
+    THRESH_MASK       = 7, //!< low three bits, which cover the basic type values 0..4 above; NOTE(review): presumably separates the type from the flags below - confirm
+    THRESH_OTSU       = 8, //!< flag, use Otsu algorithm to choose the optimal threshold value
+    THRESH_TRIANGLE   = 16 //!< flag, use Triangle algorithm to choose the optimal threshold value
+};
+
+//! adaptive threshold algorithm
+//! @see adaptiveThreshold
+enum AdaptiveThresholdTypes {
+    /** the threshold value \f$T(x,y)\f$ is the mean of the \f$\texttt{blockSize} \times
+    \texttt{blockSize}\f$ neighborhood of \f$(x, y)\f$ minus C */
+    ADAPTIVE_THRESH_MEAN_C     = 0,
+    /** the threshold value \f$T(x, y)\f$ is a weighted sum (cross-correlation with a Gaussian
+    window) of the \f$\texttt{blockSize} \times \texttt{blockSize}\f$ neighborhood of \f$(x, y)\f$
+    minus C. The default sigma (standard deviation) is used for the specified blockSize. See
+    #getGaussianKernel */
+    ADAPTIVE_THRESH_GAUSSIAN_C = 1
+};
+
+//! class of the pixel in GrabCut algorithm
+enum GrabCutClasses {
+    GC_BGD    = 0,  //!< an obvious background pixel
+    GC_FGD    = 1,  //!< an obvious foreground (object) pixel
+    GC_PR_BGD = 2,  //!< a possible background pixel
+    GC_PR_FGD = 3   //!< a possible foreground pixel
+};
+
+//! GrabCut algorithm flags
+enum GrabCutModes {
+    /** The function initializes the state and the mask using the provided rectangle. After that it
+    runs iterCount iterations of the algorithm. */
+    GC_INIT_WITH_RECT  = 0,
+    /** The function initializes the state using the provided mask. Note that GC_INIT_WITH_RECT
+    and GC_INIT_WITH_MASK can be combined. Then, all the pixels outside of the ROI are
+    automatically initialized with GC_BGD. */
+    GC_INIT_WITH_MASK  = 1,
+    /** The value means that the algorithm should just resume. */
+    GC_EVAL            = 2,
+    /** The value means that the algorithm should just run a single iteration of the grabCut algorithm with the fixed model */
+    GC_EVAL_FREEZE_MODEL = 3
+};
+
+//! distanceTransform labeling modes: how labels are assigned to pixels
+enum DistanceTransformLabelTypes {
+    /** each connected component of zeros in src (as well as all the non-zero pixels closest to the
+    connected component) will be assigned the same label */
+    DIST_LABEL_CCOMP = 0,
+    /** each zero pixel (and all the non-zero pixels closest to it) gets its own label. */
+    DIST_LABEL_PIXEL = 1
+};
+
+//! floodfill algorithm flags (bits 16 and 17 of the flags value)
+enum FloodFillFlags {
+    /** If set, the difference between the current pixel and seed pixel is considered. Otherwise,
+    the difference between neighbor pixels is considered (that is, the range is floating). */
+    FLOODFILL_FIXED_RANGE = 1 << 16,
+    /** If set, the function does not change the image ( newVal is ignored), and only fills the
+    mask with the value specified in bits 8-16 of flags as described above. This option only makes
+    sense in function variants that have the mask parameter. */
+    FLOODFILL_MASK_ONLY   = 1 << 17
+};
+
+//! @} imgproc_misc
+
+//! @addtogroup imgproc_shape
+//! @{
+
+//! connected components statistics: bounding box (left, top, width, height) and area
+enum ConnectedComponentsTypes {
+    CC_STAT_LEFT   = 0, //!< The leftmost (x) coordinate which is the inclusive start of the bounding
+                        //!< box in the horizontal direction.
+    CC_STAT_TOP    = 1, //!< The topmost (y) coordinate which is the inclusive start of the bounding
+                        //!< box in the vertical direction.
+    CC_STAT_WIDTH  = 2, //!< The horizontal size of the bounding box
+    CC_STAT_HEIGHT = 3, //!< The vertical size of the bounding box
+    CC_STAT_AREA   = 4, //!< The total area (in pixels) of the connected component
+#ifndef CV_DOXYGEN
+    CC_STAT_MAX    = 5 //!< Max enumeration value. Used internally only for memory allocation; hidden when CV_DOXYGEN is defined
+#endif
+};
+
+//! connected components algorithm; values 3..5 are algorithm-named counterparts of the author-named values 0..2
+enum ConnectedComponentsAlgorithmsTypes {
+    CCL_DEFAULT   = -1, //!< Spaghetti @cite Bolelli2019 algorithm for 8-way connectivity, Spaghetti4C @cite Bolelli2021 algorithm for 4-way connectivity.
+    CCL_WU        = 0,  //!< SAUF @cite Wu2009 algorithm for 8-way connectivity, SAUF algorithm for 4-way connectivity. The parallel implementation described in @cite Bolelli2017 is available for SAUF.
+    CCL_GRANA     = 1,  //!< BBDT @cite Grana2010 algorithm for 8-way connectivity, SAUF algorithm for 4-way connectivity. The parallel implementation described in @cite Bolelli2017 is available for both BBDT and SAUF.
+    CCL_BOLELLI   = 2,  //!< Spaghetti @cite Bolelli2019 algorithm for 8-way connectivity, Spaghetti4C @cite Bolelli2021 algorithm for 4-way connectivity. The parallel implementation described in @cite Bolelli2017 is available for both Spaghetti and Spaghetti4C.
+    CCL_SAUF      = 3,  //!< Same as CCL_WU. It is preferable to use the flag with the name of the algorithm (CCL_SAUF) rather than the one with the name of the first author (CCL_WU).
+    CCL_BBDT      = 4,  //!< Same as CCL_GRANA. It is preferable to use the flag with the name of the algorithm (CCL_BBDT) rather than the one with the name of the first author (CCL_GRANA).
+    CCL_SPAGHETTI = 5,  //!< Same as CCL_BOLELLI. It is preferable to use the flag with the name of the algorithm (CCL_SPAGHETTI) rather than the one with the name of the first author (CCL_BOLELLI).
+};
+
+//! mode of the contour retrieval algorithm
+enum RetrievalModes {
+    /** retrieves only the extreme outer contours. It sets `hierarchy[i][2]=hierarchy[i][3]=-1` for
+    all the contours. */
+    RETR_EXTERNAL  = 0,
+    /** retrieves all of the contours without establishing any hierarchical relationships. */
+    RETR_LIST      = 1,
+    /** retrieves all of the contours and organizes them into a two-level hierarchy. At the top
+    level, there are external boundaries of the components. At the second level, there are
+    boundaries of the holes. If there is another contour inside a hole of a connected component, it
+    is still put at the top level. */
+    RETR_CCOMP     = 2,
+    /** retrieves all of the contours and reconstructs a full hierarchy of nested contours. */
+    RETR_TREE      = 3,
+    RETR_FLOODFILL = 4 //!< NOTE(review): undocumented in this header - confirm behavior against the findContours documentation
+};
+
+//! the contour approximation algorithm
+enum ContourApproximationModes {
+    /** stores absolutely all the contour points. That is, any two subsequent points (x1,y1) and
+    (x2,y2) of the contour will be either horizontal, vertical or diagonal neighbors, that is,
+    max(abs(x1-x2),abs(y2-y1))==1. */
+    CHAIN_APPROX_NONE      = 1,
+    /** compresses horizontal, vertical, and diagonal segments and leaves only their end points.
+    For example, an upright rectangular contour is encoded with 4 points. */
+    CHAIN_APPROX_SIMPLE    = 2,
+    /** applies one of the flavors of the Teh-Chin chain approximation algorithm @cite TehChin89 */
+    CHAIN_APPROX_TC89_L1   = 3,
+    /** applies one of the flavors of the Teh-Chin chain approximation algorithm @cite TehChin89 */
+    CHAIN_APPROX_TC89_KCOS = 4
+};
+
+/** @brief Shape matching methods
+
+\f$A\f$ denotes object1, \f$B\f$ denotes object2,
+
+\f$\begin{array}{l} m^A_i =  \mathrm{sign} (h^A_i)  \cdot \log{h^A_i} \\ m^B_i =  \mathrm{sign} (h^B_i)  \cdot \log{h^B_i} \end{array}\f$
+
+and \f$h^A_i, h^B_i\f$ are the Hu moments of \f$A\f$ and \f$B\f$, respectively.
+*/
+enum ShapeMatchModes {
+    CONTOURS_MATCH_I1  =1, //!< \f[I_1(A,B) =  \sum _{i=1...7}  \left |  \frac{1}{m^A_i} -  \frac{1}{m^B_i} \right |\f]
+    CONTOURS_MATCH_I2  =2, //!< \f[I_2(A,B) =  \sum _{i=1...7}  \left | m^A_i - m^B_i  \right |\f]
+    CONTOURS_MATCH_I3  =3  //!< \f[I_3(A,B) =  \max _{i=1...7}  \frac{ \left| m^A_i - m^B_i \right| }{ \left| m^A_i \right| }\f]
+};
+
+//! @} imgproc_shape
+
+//! @addtogroup imgproc_feature
+//! @{
+
+//! Variants of a Hough transform
+enum HoughModes {
+
+    /** classical or standard Hough transform. Every line is represented by two floating-point
+    numbers \f$(\rho, \theta)\f$ , where \f$\rho\f$ is the distance between the (0,0) point and the line,
+    and \f$\theta\f$ is the angle between the x-axis and the normal to the line. Thus, the matrix must
+    be (the created sequence will be) of CV_32FC2 type */
+    HOUGH_STANDARD      = 0,
+    /** probabilistic Hough transform (more efficient if the picture contains a few long
+    linear segments). It returns line segments rather than the whole line. Each segment is
+    represented by starting and ending points, and the matrix must be (the created sequence will
+    be) of the CV_32SC4 type. */
+    HOUGH_PROBABILISTIC = 1,
+    /** multi-scale variant of the classical Hough transform. The lines are encoded the same way as
+    HOUGH_STANDARD. */
+    HOUGH_MULTI_SCALE   = 2,
+    HOUGH_GRADIENT      = 3, //!< basically *21HT*, described in @cite Yuen90
+    HOUGH_GRADIENT_ALT  = 4, //!< variation of HOUGH_GRADIENT to get better accuracy
+};
+
+//! Refinement variants of the Line Segment %Detector (LSD)
+enum LineSegmentDetectorModes {
+    LSD_REFINE_NONE = 0, //!< No refinement applied
+    LSD_REFINE_STD  = 1, //!< Standard refinement is applied. E.g. breaking arches into smaller straighter line approximations.
+    LSD_REFINE_ADV  = 2  //!< Advanced refinement. Number of false alarms is calculated, lines are
+                         //!< refined through increase of precision, decrement in size, etc.
+};
+
+//! @} imgproc_feature
+
+/** Histogram comparison methods
+  @ingroup imgproc_hist
+*/
+enum HistCompMethods {
+    /** Correlation
+    \f[d(H_1,H_2) =  \frac{\sum_I (H_1(I) - \bar{H_1}) (H_2(I) - \bar{H_2})}{\sqrt{\sum_I(H_1(I) - \bar{H_1})^2 \sum_I(H_2(I) - \bar{H_2})^2}}\f]
+    where
+    \f[\bar{H_k} =  \frac{1}{N} \sum _J H_k(J)\f]
+    and \f$N\f$ is the total number of histogram bins. */
+    HISTCMP_CORREL        = 0,
+    /** Chi-Square
+    \f[d(H_1,H_2) =  \sum _I  \frac{\left(H_1(I)-H_2(I)\right)^2}{H_1(I)}\f] */
+    HISTCMP_CHISQR        = 1,
+    /** Intersection
+    \f[d(H_1,H_2) =  \sum _I  \min (H_1(I), H_2(I))\f] */
+    HISTCMP_INTERSECT     = 2,
+    /** Bhattacharyya distance
+    (In fact, OpenCV computes the Hellinger distance, which is related to the Bhattacharyya coefficient.)
+    \f[d(H_1,H_2) =  \sqrt{1 - \frac{1}{\sqrt{\bar{H_1} \bar{H_2} N^2}} \sum_I \sqrt{H_1(I) \cdot H_2(I)}}\f] */
+    HISTCMP_BHATTACHARYYA = 3,
+    HISTCMP_HELLINGER     = HISTCMP_BHATTACHARYYA, //!< Synonym for HISTCMP_BHATTACHARYYA
+    /** Alternative Chi-Square
+    \f[d(H_1,H_2) =  2 * \sum _I  \frac{\left(H_1(I)-H_2(I)\right)^2}{H_1(I)+H_2(I)}\f]
+    This alternative formula is regularly used for texture comparison. See e.g. @cite Puzicha1997 */
+    HISTCMP_CHISQR_ALT    = 4,
+    /** Kullback-Leibler divergence
+    \f[d(H_1,H_2) = \sum _I H_1(I) \log \left(\frac{H_1(I)}{H_2(I)}\right)\f] */
+    HISTCMP_KL_DIV        = 5
+};
+
+/** the color conversion codes
+@see @ref imgproc_color_conversions
+@ingroup imgproc_color_conversions
+ */
+enum ColorConversionCodes {
+    COLOR_BGR2BGRA     = 0, //!< add alpha channel to RGB or BGR image
+    COLOR_RGB2RGBA     = COLOR_BGR2BGRA,
+
+    COLOR_BGRA2BGR     = 1, //!< remove alpha channel from RGB or BGR image
+    COLOR_RGBA2RGB     = COLOR_BGRA2BGR,
+
+    COLOR_BGR2RGBA     = 2, //!< convert between RGB and BGR color spaces (with or without alpha channel)
+    COLOR_RGB2BGRA     = COLOR_BGR2RGBA,
+
+    COLOR_RGBA2BGR     = 3,
+    COLOR_BGRA2RGB     = COLOR_RGBA2BGR,
+
+    COLOR_BGR2RGB      = 4,
+    COLOR_RGB2BGR      = COLOR_BGR2RGB,
+
+    COLOR_BGRA2RGBA    = 5,
+    COLOR_RGBA2BGRA    = COLOR_BGRA2RGBA,
+
+    COLOR_BGR2GRAY     = 6, //!< convert between RGB/BGR and grayscale, @ref color_convert_rgb_gray "color conversions"
+    COLOR_RGB2GRAY     = 7,
+    COLOR_GRAY2BGR     = 8,
+    COLOR_GRAY2RGB     = COLOR_GRAY2BGR,
+    COLOR_GRAY2BGRA    = 9,
+    COLOR_GRAY2RGBA    = COLOR_GRAY2BGRA,
+    COLOR_BGRA2GRAY    = 10,
+    COLOR_RGBA2GRAY    = 11,
+
+    COLOR_BGR2BGR565   = 12, //!< convert between RGB/BGR and BGR565 (16-bit images)
+    COLOR_RGB2BGR565   = 13,
+    COLOR_BGR5652BGR   = 14,
+    COLOR_BGR5652RGB   = 15,
+    COLOR_BGRA2BGR565  = 16,
+    COLOR_RGBA2BGR565  = 17,
+    COLOR_BGR5652BGRA  = 18,
+    COLOR_BGR5652RGBA  = 19,
+
+    COLOR_GRAY2BGR565  = 20, //!< convert between grayscale to BGR565 (16-bit images)
+    COLOR_BGR5652GRAY  = 21,
+
+    COLOR_BGR2BGR555   = 22,  //!< convert between RGB/BGR and BGR555 (16-bit images)
+    COLOR_RGB2BGR555   = 23,
+    COLOR_BGR5552BGR   = 24,
+    COLOR_BGR5552RGB   = 25,
+    COLOR_BGRA2BGR555  = 26,
+    COLOR_RGBA2BGR555  = 27,
+    COLOR_BGR5552BGRA  = 28,
+    COLOR_BGR5552RGBA  = 29,
+
+    COLOR_GRAY2BGR555  = 30, //!< convert between grayscale and BGR555 (16-bit images)
+    COLOR_BGR5552GRAY  = 31,
+
+    COLOR_BGR2XYZ      = 32, //!< convert RGB/BGR to CIE XYZ, @ref color_convert_rgb_xyz "color conversions"
+    COLOR_RGB2XYZ      = 33,
+    COLOR_XYZ2BGR      = 34,
+    COLOR_XYZ2RGB      = 35,
+
+    COLOR_BGR2YCrCb    = 36, //!< convert RGB/BGR to luma-chroma (aka YCC), @ref color_convert_rgb_ycrcb "color conversions"
+    COLOR_RGB2YCrCb    = 37,
+    COLOR_YCrCb2BGR    = 38,
+    COLOR_YCrCb2RGB    = 39,
+
+    COLOR_BGR2HSV      = 40, //!< convert RGB/BGR to HSV (hue saturation value) with H range 0..180 if 8 bit image, @ref color_convert_rgb_hsv "color conversions"
+    COLOR_RGB2HSV      = 41,
+
+    COLOR_BGR2Lab      = 44, //!< convert RGB/BGR to CIE Lab, @ref color_convert_rgb_lab "color conversions"
+    COLOR_RGB2Lab      = 45,
+
+    COLOR_BGR2Luv      = 50, //!< convert RGB/BGR to CIE Luv, @ref color_convert_rgb_luv "color conversions"
+    COLOR_RGB2Luv      = 51,
+    COLOR_BGR2HLS      = 52, //!< convert RGB/BGR to HLS (hue lightness saturation) with H range 0..180 if 8 bit image, @ref color_convert_rgb_hls "color conversions"
+    COLOR_RGB2HLS      = 53,
+
+    COLOR_HSV2BGR      = 54, //!< backward conversions HSV to RGB/BGR with H range 0..180 if 8 bit image
+    COLOR_HSV2RGB      = 55,
+
+    COLOR_Lab2BGR      = 56,
+    COLOR_Lab2RGB      = 57,
+    COLOR_Luv2BGR      = 58,
+    COLOR_Luv2RGB      = 59,
+    COLOR_HLS2BGR      = 60, //!< backward conversions HLS to RGB/BGR with H range 0..180 if 8 bit image
+    COLOR_HLS2RGB      = 61,
+
+    COLOR_BGR2HSV_FULL = 66, //!< convert RGB/BGR to HSV (hue saturation value) with H range 0..255 if 8 bit image, @ref color_convert_rgb_hsv "color conversions"
+    COLOR_RGB2HSV_FULL = 67,
+    COLOR_BGR2HLS_FULL = 68, //!< convert RGB/BGR to HLS (hue lightness saturation) with H range 0..255 if 8 bit image, @ref color_convert_rgb_hls "color conversions"
+    COLOR_RGB2HLS_FULL = 69,
+
+    COLOR_HSV2BGR_FULL = 70, //!< backward conversions HSV to RGB/BGR with H range 0..255 if 8 bit image
+    COLOR_HSV2RGB_FULL = 71,
+    COLOR_HLS2BGR_FULL = 72, //!< backward conversions HLS to RGB/BGR with H range 0..255 if 8 bit image
+    COLOR_HLS2RGB_FULL = 73,
+
+    COLOR_LBGR2Lab     = 74,
+    COLOR_LRGB2Lab     = 75,
+    COLOR_LBGR2Luv     = 76,
+    COLOR_LRGB2Luv     = 77,
+
+    COLOR_Lab2LBGR     = 78,
+    COLOR_Lab2LRGB     = 79,
+    COLOR_Luv2LBGR     = 80,
+    COLOR_Luv2LRGB     = 81,
+
+    COLOR_BGR2YUV      = 82, //!< convert between RGB/BGR and YUV
+    COLOR_RGB2YUV      = 83,
+    COLOR_YUV2BGR      = 84,
+    COLOR_YUV2RGB      = 85,
+
+    COLOR_YUV2RGB_NV12  = 90, //!< convert between 4:2:0-subsampled YUV NV12 and RGB, two planes (in one or separate arrays): Y and U/V interleaved, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGR_NV12  = 91, //!< convert between 4:2:0-subsampled YUV NV12 and BGR, two planes (in one or separate arrays): Y and U/V interleaved, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGB_NV21  = 92, //!< convert between 4:2:0-subsampled YUV NV21 and RGB, two planes (in one or separate arrays): Y and V/U interleaved, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGR_NV21  = 93, //!< convert between 4:2:0-subsampled YUV NV21 and BGR, two planes (in one or separate arrays): Y and V/U interleaved, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV420sp2RGB  = COLOR_YUV2RGB_NV21, //!< synonym to NV21
+    COLOR_YUV420sp2BGR  = COLOR_YUV2BGR_NV21, //!< synonym to NV21
+
+    COLOR_YUV2RGBA_NV12 = 94, //!< convert between 4:2:0-subsampled YUV NV12 and RGBA, two planes (in one or separate arrays): Y and U/V interleaved, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGRA_NV12 = 95, //!< convert between 4:2:0-subsampled YUV NV12 and BGRA, two planes (in one or separate arrays): Y and U/V interleaved, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGBA_NV21 = 96, //!< convert between 4:2:0-subsampled YUV NV21 and RGBA, two planes (in one or separate arrays): Y and V/U interleaved, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGRA_NV21 = 97, //!< convert between 4:2:0-subsampled YUV NV21 and BGRA, two planes (in one or separate arrays): Y and V/U interleaved, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV420sp2RGBA = COLOR_YUV2RGBA_NV21, //!< synonym to NV21
+    COLOR_YUV420sp2BGRA = COLOR_YUV2BGRA_NV21, //!< synonym to NV21
+
+    COLOR_YUV2RGB_YV12  =  98, //!< convert between 4:2:0-subsampled YUV YV12 and RGB, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGR_YV12  =  99, //!< convert between 4:2:0-subsampled YUV YV12 and BGR, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGB_IYUV  = 100, //!< convert between 4:2:0-subsampled YUV IYUV and RGB, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGR_IYUV  = 101, //!< convert between 4:2:0-subsampled YUV IYUV and BGR, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGB_I420  = COLOR_YUV2RGB_IYUV, //!< synonym to IYUV
+    COLOR_YUV2BGR_I420  = COLOR_YUV2BGR_IYUV, //!< synonym to IYUV
+    COLOR_YUV420p2RGB   = COLOR_YUV2RGB_YV12, //!< synonym to YV12
+    COLOR_YUV420p2BGR   = COLOR_YUV2BGR_YV12, //!< synonym to YV12
+
+    COLOR_YUV2RGBA_YV12 = 102, //!< convert between 4:2:0-subsampled YUV YV12 and RGBA, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGRA_YV12 = 103, //!< convert between 4:2:0-subsampled YUV YV12 and BGRA, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGBA_IYUV = 104, //!< convert between 4:2:0-subsampled YUV IYUV and RGBA, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGRA_IYUV = 105, //!< convert between 4:2:0-subsampled YUV IYUV and BGRA, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGBA_I420 = COLOR_YUV2RGBA_IYUV, //!< synonym to IYUV
+    COLOR_YUV2BGRA_I420 = COLOR_YUV2BGRA_IYUV, //!< synonym to IYUV
+    COLOR_YUV420p2RGBA  = COLOR_YUV2RGBA_YV12, //!< synonym to YV12
+    COLOR_YUV420p2BGRA  = COLOR_YUV2BGRA_YV12, //!< synonym to YV12
+
+    COLOR_YUV2GRAY_420  = 106, //!< extract Y channel from YUV 4:2:0 image
+    COLOR_YUV2GRAY_NV21 = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420
+    COLOR_YUV2GRAY_NV12 = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420
+    COLOR_YUV2GRAY_YV12 = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420
+    COLOR_YUV2GRAY_IYUV = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420
+    COLOR_YUV2GRAY_I420 = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420
+    COLOR_YUV420sp2GRAY = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420
+    COLOR_YUV420p2GRAY  = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420
+
+    COLOR_YUV2RGB_UYVY = 107, //!< convert between YUV UYVY and RGB, YUV is 4:2:2-subsampled and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGR_UYVY = 108, //!< convert between YUV UYVY and BGR, YUV is 4:2:2-subsampled and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x
+    //COLOR_YUV2RGB_VYUY = 109, //!< convert between YUV VYUY and RGB, YUV is 4:2:2-subsampled and interleaved as V/Y1/U/Y2, see @ref color_convert_rgb_yuv_42x
+    //COLOR_YUV2BGR_VYUY = 110, //!< convert between YUV VYUY and BGR, YUV is 4:2:2-subsampled and interleaved as V/Y1/U/Y2, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGB_Y422 = COLOR_YUV2RGB_UYVY, //!< synonym to UYVY
+    COLOR_YUV2BGR_Y422 = COLOR_YUV2BGR_UYVY, //!< synonym to UYVY
+    COLOR_YUV2RGB_UYNV = COLOR_YUV2RGB_UYVY, //!< synonym to UYVY
+    COLOR_YUV2BGR_UYNV = COLOR_YUV2BGR_UYVY, //!< synonym to UYVY
+
+    COLOR_YUV2RGBA_UYVY = 111, //!< convert between YUV UYVY and RGBA, YUV is 4:2:2-subsampled and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGRA_UYVY = 112, //!< convert between YUV UYVY and BGRA, YUV is 4:2:2-subsampled and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x
+    //COLOR_YUV2RGBA_VYUY = 113, //!< convert between YUV VYUY and RGBA, YUV is 4:2:2-subsampled and interleaved as V/Y1/U/Y2, see @ref color_convert_rgb_yuv_42x
+    //COLOR_YUV2BGRA_VYUY = 114, //!< convert between YUV VYUY and BGRA, YUV is 4:2:2-subsampled and interleaved as V/Y1/U/Y2, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGBA_Y422 = COLOR_YUV2RGBA_UYVY, //!< synonym to UYVY
+    COLOR_YUV2BGRA_Y422 = COLOR_YUV2BGRA_UYVY, //!< synonym to UYVY
+    COLOR_YUV2RGBA_UYNV = COLOR_YUV2RGBA_UYVY, //!< synonym to UYVY
+    COLOR_YUV2BGRA_UYNV = COLOR_YUV2BGRA_UYVY, //!< synonym to UYVY
+
+    COLOR_YUV2RGB_YUY2 = 115, //!< convert between YUV YUY2 and RGB, YUV is 4:2:2-subsampled and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGR_YUY2 = 116, //!< convert between YUV YUY2 and BGR, YUV is 4:2:2-subsampled and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGB_YVYU = 117, //!< convert between YUV YVYU and RGB, YUV is 4:2:2-subsampled and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGR_YVYU = 118, //!< convert between YUV YVYU and BGR, YUV is 4:2:2-subsampled and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGB_YUYV = COLOR_YUV2RGB_YUY2, //!< synonym to YUY2
+    COLOR_YUV2BGR_YUYV = COLOR_YUV2BGR_YUY2, //!< synonym to YUY2
+    COLOR_YUV2RGB_YUNV = COLOR_YUV2RGB_YUY2, //!< synonym to YUY2
+    COLOR_YUV2BGR_YUNV = COLOR_YUV2BGR_YUY2, //!< synonym to YUY2
+
+    COLOR_YUV2RGBA_YUY2 = 119, //!< convert between YUV YUY2 and RGBA, YUV is 4:2:2-subsampled and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGRA_YUY2 = 120, //!< convert between YUV YUY2 and BGRA, YUV is 4:2:2-subsampled and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGBA_YVYU = 121, //!< convert between YUV YVYU and RGBA, YUV is 4:2:2-subsampled and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGRA_YVYU = 122, //!< convert between YUV YVYU and BGRA, YUV is 4:2:2-subsampled and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGBA_YUYV = COLOR_YUV2RGBA_YUY2, //!< synonym to YUY2
+    COLOR_YUV2BGRA_YUYV = COLOR_YUV2BGRA_YUY2, //!< synonym to YUY2
+    COLOR_YUV2RGBA_YUNV = COLOR_YUV2RGBA_YUY2, //!< synonym to YUY2
+    COLOR_YUV2BGRA_YUNV = COLOR_YUV2BGRA_YUY2, //!< synonym to YUY2
+
+    COLOR_YUV2GRAY_UYVY = 123, //!< extract Y channel from YUV 4:2:2 image
+    COLOR_YUV2GRAY_YUY2 = 124, //!< extract Y channel from YUV 4:2:2 image
+    //CV_YUV2GRAY_VYUY  = CV_YUV2GRAY_UYVY, //!< synonym to COLOR_YUV2GRAY_UYVY
+    COLOR_YUV2GRAY_Y422 = COLOR_YUV2GRAY_UYVY, //!< synonym to COLOR_YUV2GRAY_UYVY
+    COLOR_YUV2GRAY_UYNV = COLOR_YUV2GRAY_UYVY, //!< synonym to COLOR_YUV2GRAY_UYVY
+    COLOR_YUV2GRAY_YVYU = COLOR_YUV2GRAY_YUY2, //!< synonym to COLOR_YUV2GRAY_YUY2
+    COLOR_YUV2GRAY_YUYV = COLOR_YUV2GRAY_YUY2, //!< synonym to COLOR_YUV2GRAY_YUY2
+    COLOR_YUV2GRAY_YUNV = COLOR_YUV2GRAY_YUY2, //!< synonym to COLOR_YUV2GRAY_YUY2
+
+    //! alpha premultiplication
+    COLOR_RGBA2mRGBA    = 125,
+    COLOR_mRGBA2RGBA    = 126,
+
+    COLOR_RGB2YUV_I420  = 127, //!< convert between RGB and 4:2:0-subsampled YUV I420, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGR2YUV_I420  = 128, //!< convert between BGR and 4:2:0-subsampled YUV I420, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGB2YUV_IYUV  = COLOR_RGB2YUV_I420, //!< synonym to I420
+    COLOR_BGR2YUV_IYUV  = COLOR_BGR2YUV_I420, //!< synonym to I420
+
+    COLOR_RGBA2YUV_I420 = 129, //!< convert between RGBA and 4:2:0-subsampled YUV I420, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGRA2YUV_I420 = 130, //!< convert between BGRA and 4:2:0-subsampled YUV I420, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGBA2YUV_IYUV = COLOR_RGBA2YUV_I420, //!< synonym to I420
+    COLOR_BGRA2YUV_IYUV = COLOR_BGRA2YUV_I420, //!< synonym to I420
+    COLOR_RGB2YUV_YV12  = 131, //!< convert between RGB and 4:2:0-subsampled YUV YV12, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGR2YUV_YV12  = 132, //!< convert between BGR and 4:2:0-subsampled YUV YV12, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGBA2YUV_YV12 = 133, //!< convert between RGBA and 4:2:0-subsampled YUV YV12, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGRA2YUV_YV12 = 134, //!< convert between BGRA and 4:2:0-subsampled YUV YV12, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x
+
+    //! Demosaicing, see @ref color_convert_bayer "color conversions" for additional information
+    COLOR_BayerBG2BGR = 46, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2BGR = 47, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2BGR = 48, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2BGR = 49, //!< equivalent to GBRG Bayer pattern
+
+    COLOR_BayerRGGB2BGR = COLOR_BayerBG2BGR,
+    COLOR_BayerGRBG2BGR = COLOR_BayerGB2BGR,
+    COLOR_BayerBGGR2BGR = COLOR_BayerRG2BGR,
+    COLOR_BayerGBRG2BGR = COLOR_BayerGR2BGR,
+
+    COLOR_BayerRGGB2RGB = COLOR_BayerBGGR2BGR,
+    COLOR_BayerGRBG2RGB = COLOR_BayerGBRG2BGR,
+    COLOR_BayerBGGR2RGB = COLOR_BayerRGGB2BGR,
+    COLOR_BayerGBRG2RGB = COLOR_BayerGRBG2BGR,
+
+    COLOR_BayerBG2RGB = COLOR_BayerRG2BGR, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2RGB = COLOR_BayerGR2BGR, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2RGB = COLOR_BayerBG2BGR, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2RGB = COLOR_BayerGB2BGR, //!< equivalent to GBRG Bayer pattern
+
+    COLOR_BayerBG2GRAY = 86, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2GRAY = 87, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2GRAY = 88, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2GRAY = 89, //!< equivalent to GBRG Bayer pattern
+
+    COLOR_BayerRGGB2GRAY = COLOR_BayerBG2GRAY,
+    COLOR_BayerGRBG2GRAY = COLOR_BayerGB2GRAY,
+    COLOR_BayerBGGR2GRAY = COLOR_BayerRG2GRAY,
+    COLOR_BayerGBRG2GRAY = COLOR_BayerGR2GRAY,
+
+    //! Demosaicing using Variable Number of Gradients
+    COLOR_BayerBG2BGR_VNG = 62, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2BGR_VNG = 63, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2BGR_VNG = 64, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2BGR_VNG = 65, //!< equivalent to GBRG Bayer pattern
+
+    COLOR_BayerRGGB2BGR_VNG = COLOR_BayerBG2BGR_VNG,
+    COLOR_BayerGRBG2BGR_VNG = COLOR_BayerGB2BGR_VNG,
+    COLOR_BayerBGGR2BGR_VNG = COLOR_BayerRG2BGR_VNG,
+    COLOR_BayerGBRG2BGR_VNG = COLOR_BayerGR2BGR_VNG,
+
+    COLOR_BayerRGGB2RGB_VNG = COLOR_BayerBGGR2BGR_VNG,
+    COLOR_BayerGRBG2RGB_VNG = COLOR_BayerGBRG2BGR_VNG,
+    COLOR_BayerBGGR2RGB_VNG = COLOR_BayerRGGB2BGR_VNG,
+    COLOR_BayerGBRG2RGB_VNG = COLOR_BayerGRBG2BGR_VNG,
+
+    COLOR_BayerBG2RGB_VNG = COLOR_BayerRG2BGR_VNG, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2RGB_VNG = COLOR_BayerGR2BGR_VNG, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2RGB_VNG = COLOR_BayerBG2BGR_VNG, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2RGB_VNG = COLOR_BayerGB2BGR_VNG, //!< equivalent to GBRG Bayer pattern
+
+    //! Edge-Aware Demosaicing
+    COLOR_BayerBG2BGR_EA  = 135, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2BGR_EA  = 136, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2BGR_EA  = 137, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2BGR_EA  = 138, //!< equivalent to GBRG Bayer pattern
+
+    COLOR_BayerRGGB2BGR_EA  = COLOR_BayerBG2BGR_EA,
+    COLOR_BayerGRBG2BGR_EA  = COLOR_BayerGB2BGR_EA,
+    COLOR_BayerBGGR2BGR_EA  = COLOR_BayerRG2BGR_EA,
+    COLOR_BayerGBRG2BGR_EA  = COLOR_BayerGR2BGR_EA,
+
+    COLOR_BayerRGGB2RGB_EA  = COLOR_BayerBGGR2BGR_EA,
+    COLOR_BayerGRBG2RGB_EA  = COLOR_BayerGBRG2BGR_EA,
+    COLOR_BayerBGGR2RGB_EA  = COLOR_BayerRGGB2BGR_EA,
+    COLOR_BayerGBRG2RGB_EA  = COLOR_BayerGRBG2BGR_EA,
+
+    COLOR_BayerBG2RGB_EA  = COLOR_BayerRG2BGR_EA, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2RGB_EA  = COLOR_BayerGR2BGR_EA, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2RGB_EA  = COLOR_BayerBG2BGR_EA, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2RGB_EA  = COLOR_BayerGB2BGR_EA, //!< equivalent to GBRG Bayer pattern
+
+    //! Demosaicing with alpha channel
+    COLOR_BayerBG2BGRA = 139, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2BGRA = 140, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2BGRA = 141, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2BGRA = 142, //!< equivalent to GBRG Bayer pattern
+
+    COLOR_BayerRGGB2BGRA = COLOR_BayerBG2BGRA,
+    COLOR_BayerGRBG2BGRA = COLOR_BayerGB2BGRA,
+    COLOR_BayerBGGR2BGRA = COLOR_BayerRG2BGRA,
+    COLOR_BayerGBRG2BGRA = COLOR_BayerGR2BGRA,
+
+    COLOR_BayerRGGB2RGBA = COLOR_BayerBGGR2BGRA,
+    COLOR_BayerGRBG2RGBA = COLOR_BayerGBRG2BGRA,
+    COLOR_BayerBGGR2RGBA = COLOR_BayerRGGB2BGRA,
+    COLOR_BayerGBRG2RGBA = COLOR_BayerGRBG2BGRA,
+
+    COLOR_BayerBG2RGBA = COLOR_BayerRG2BGRA, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2RGBA = COLOR_BayerGR2BGRA, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2RGBA = COLOR_BayerBG2BGRA, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2RGBA = COLOR_BayerGB2BGRA, //!< equivalent to GBRG Bayer pattern
+
+    COLOR_RGB2YUV_UYVY = 143, //!< convert between RGB and YUV UYVY, YUV is 4:2:2 and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGR2YUV_UYVY = 144, //!< convert between BGR and YUV UYVY, YUV is 4:2:2 and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGB2YUV_Y422 = COLOR_RGB2YUV_UYVY, //!< synonym to UYVY
+    COLOR_BGR2YUV_Y422 = COLOR_BGR2YUV_UYVY, //!< synonym to UYVY
+    COLOR_RGB2YUV_UYNV = COLOR_RGB2YUV_UYVY, //!< synonym to UYVY
+    COLOR_BGR2YUV_UYNV = COLOR_BGR2YUV_UYVY, //!< synonym to UYVY
+
+    COLOR_RGBA2YUV_UYVY = 145, //!< convert between RGBA and YUV UYVY, YUV is 4:2:2 and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGRA2YUV_UYVY = 146, //!< convert between BGRA and YUV UYVY, YUV is 4:2:2 and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGBA2YUV_Y422 = COLOR_RGBA2YUV_UYVY, //!< synonym to UYVY
+    COLOR_BGRA2YUV_Y422 = COLOR_BGRA2YUV_UYVY, //!< synonym to UYVY
+    COLOR_RGBA2YUV_UYNV = COLOR_RGBA2YUV_UYVY, //!< synonym to UYVY
+    COLOR_BGRA2YUV_UYNV = COLOR_BGRA2YUV_UYVY, //!< synonym to UYVY
+
+    COLOR_RGB2YUV_YUY2 = 147, //!< convert between RGB and YUV YUY2, YUV is 4:2:2 and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGR2YUV_YUY2 = 148, //!< convert between BGR and YUV YUY2, YUV is 4:2:2 and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGB2YUV_YVYU = 149, //!< convert between RGB and YUV YVYU, YUV is 4:2:2 and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGR2YUV_YVYU = 150, //!< convert between BGR and YUV YVYU, YUV is 4:2:2 and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGB2YUV_YUYV = COLOR_RGB2YUV_YUY2, //!< synonym to YUY2
+    COLOR_BGR2YUV_YUYV = COLOR_BGR2YUV_YUY2, //!< synonym to YUY2
+    COLOR_RGB2YUV_YUNV = COLOR_RGB2YUV_YUY2, //!< synonym to YUY2
+    COLOR_BGR2YUV_YUNV = COLOR_BGR2YUV_YUY2, //!< synonym to YUY2
+
+    COLOR_RGBA2YUV_YUY2 = 151, //!< convert between RGBA and YUV YUY2, YUV is 4:2:2 and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGRA2YUV_YUY2 = 152, //!< convert between BGRA and YUV YUY2, YUV is 4:2:2 and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGBA2YUV_YVYU = 153, //!< convert between RGBA and YUV YVYU, YUV is 4:2:2 and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGRA2YUV_YVYU = 154, //!< convert between BGRA and YUV YVYU, YUV is 4:2:2 and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGBA2YUV_YUYV = COLOR_RGBA2YUV_YUY2, //!< synonym to YUY2
+    COLOR_BGRA2YUV_YUYV = COLOR_BGRA2YUV_YUY2, //!< synonym to YUY2
+    COLOR_RGBA2YUV_YUNV = COLOR_RGBA2YUV_YUY2, //!< synonym to YUY2
+    COLOR_BGRA2YUV_YUNV = COLOR_BGRA2YUV_YUY2, //!< synonym to YUY2
+
+    COLOR_COLORCVT_MAX  = 155
+};
+
+//! @addtogroup imgproc_shape
+//! @{
+
+//! Types of intersection between rectangles.
+enum RectanglesIntersectTypes {
+    INTERSECT_NONE = 0, //!< No intersection
+    INTERSECT_PARTIAL  = 1, //!< There is a partial intersection
+    INTERSECT_FULL  = 2 //!< One of the rectangles is fully enclosed in the other
+};
+
+/** Types of line: pixel connectivity / antialiasing selectors, plus the FILLED value.
+@ingroup imgproc_draw
+*/
+enum LineTypes {
+    FILLED  = -1, // NOTE(review): the only negative value here; presumably requests a filled shape rather than an outline - confirm against the drawing functions
+    LINE_4  = 4, //!< 4-connected line
+    LINE_8  = 8, //!< 8-connected line
+    LINE_AA = 16 //!< antialiased line
+};
+
+/** Font identifiers. Only a subset of the Hershey fonts <https://en.wikipedia.org/wiki/Hershey_fonts> is supported.
+@ingroup imgproc_draw
+*/
+enum HersheyFonts {
+    FONT_HERSHEY_SIMPLEX        = 0, //!< normal size sans-serif font
+    FONT_HERSHEY_PLAIN          = 1, //!< small size sans-serif font
+    FONT_HERSHEY_DUPLEX         = 2, //!< normal size sans-serif font (more complex than FONT_HERSHEY_SIMPLEX)
+    FONT_HERSHEY_COMPLEX        = 3, //!< normal size serif font
+    FONT_HERSHEY_TRIPLEX        = 4, //!< normal size serif font (more complex than FONT_HERSHEY_COMPLEX)
+    FONT_HERSHEY_COMPLEX_SMALL  = 5, //!< smaller version of FONT_HERSHEY_COMPLEX
+    FONT_HERSHEY_SCRIPT_SIMPLEX = 6, //!< hand-writing style font
+    FONT_HERSHEY_SCRIPT_COMPLEX = 7, //!< more complex variant of FONT_HERSHEY_SCRIPT_SIMPLEX
+    FONT_ITALIC                 = 16 //!< flag for italic font (value 0x10, outside the 0..7 range of the font ids above)
+};
+
+/** Set of marker shapes that can be used with the cv::drawMarker function
+@ingroup imgproc_draw
+*/
+enum MarkerTypes
+{
+    MARKER_CROSS = 0,           //!< A crosshair marker shape
+    MARKER_TILTED_CROSS = 1,    //!< A 45 degree tilted crosshair marker shape
+    MARKER_STAR = 2,            //!< A star marker shape, combination of cross and tilted cross
+    MARKER_DIAMOND = 3,         //!< A diamond marker shape
+    MARKER_SQUARE = 4,          //!< A square marker shape
+    MARKER_TRIANGLE_UP = 5,     //!< An upwards pointing triangle marker shape
+    MARKER_TRIANGLE_DOWN = 6    //!< A downwards pointing triangle marker shape
+};
+
+/** @brief Finds an arbitrary template in a grayscale image using the Generalized Hough Transform.
+Abstract interface: every member function declared here is pure virtual. */
+class CV_EXPORTS_W GeneralizedHough : public Algorithm
+{
+public:
+    //! Set the template to search for (from a template image, or from edges plus dx and dy).
+    CV_WRAP virtual void setTemplate(InputArray templ, Point templCenter = Point(-1, -1)) = 0;
+    CV_WRAP virtual void setTemplate(InputArray edges, InputArray dx, InputArray dy, Point templCenter = Point(-1, -1)) = 0;
+
+    //! Find the template on an image (given as an image, or as edges plus dx and dy); votes is an optional output.
+    CV_WRAP virtual void detect(InputArray image, OutputArray positions, OutputArray votes = noArray()) = 0;
+    CV_WRAP virtual void detect(InputArray edges, InputArray dx, InputArray dy, OutputArray positions, OutputArray votes = noArray()) = 0;
+
+    //! Canny low threshold (setter / getter).
+    CV_WRAP virtual void setCannyLowThresh(int cannyLowThresh) = 0;
+    CV_WRAP virtual int getCannyLowThresh() const = 0;
+
+    //! Canny high threshold (setter / getter).
+    CV_WRAP virtual void setCannyHighThresh(int cannyHighThresh) = 0;
+    CV_WRAP virtual int getCannyHighThresh() const = 0;
+
+    //! Minimum distance between the centers of the detected objects (setter / getter).
+    CV_WRAP virtual void setMinDist(double minDist) = 0;
+    CV_WRAP virtual double getMinDist() const = 0;
+
+    //! Inverse ratio of the accumulator resolution to the image resolution (setter / getter).
+    CV_WRAP virtual void setDp(double dp) = 0;
+    CV_WRAP virtual double getDp() const = 0;
+
+    //! Maximal size of inner buffers (setter / getter).
+    CV_WRAP virtual void setMaxBufferSize(int maxBufferSize) = 0;
+    CV_WRAP virtual int getMaxBufferSize() const = 0;
+};
+
+/** @brief Finds an arbitrary template in a grayscale image using the Generalized Hough Transform.
+
+Detects position only, without scale and rotation @cite Ballard1981 .
+*/
+class CV_EXPORTS_W GeneralizedHoughBallard : public GeneralizedHough
+{
+public:
+    //! R-Table levels (setter / getter).
+    CV_WRAP virtual void setLevels(int levels) = 0;
+    CV_WRAP virtual int getLevels() const = 0;
+
+    //! The accumulator threshold for the template centers at the detection stage. The smaller it is, the more false positions may be detected.
+    CV_WRAP virtual void setVotesThreshold(int votesThreshold) = 0;
+    CV_WRAP virtual int getVotesThreshold() const = 0;
+};
+
+/** @brief Finds an arbitrary template in a grayscale image using the Generalized Hough Transform.
+
+Detects position, scale and rotation @cite Guil1999 .
+*/
+class CV_EXPORTS_W GeneralizedHoughGuil : public GeneralizedHough
+{
+public:
+    //! Angle difference in degrees between two points in feature.
+    CV_WRAP virtual void setXi(double xi) = 0;
+    CV_WRAP virtual double getXi() const = 0;
+
+    //! Feature table levels.
+    CV_WRAP virtual void setLevels(int levels) = 0;
+    CV_WRAP virtual int getLevels() const = 0;
+
+    //! Maximal difference between angles that are treated as equal.
+    CV_WRAP virtual void setAngleEpsilon(double angleEpsilon) = 0;
+    CV_WRAP virtual double getAngleEpsilon() const = 0;
+
+    //! Minimal rotation angle to detect in degrees.
+    CV_WRAP virtual void setMinAngle(double minAngle) = 0;
+    CV_WRAP virtual double getMinAngle() const = 0;
+
+    //! Maximal rotation angle to detect in degrees.
+    CV_WRAP virtual void setMaxAngle(double maxAngle) = 0;
+    CV_WRAP virtual double getMaxAngle() const = 0;
+
+    //! Angle step in degrees.
+    CV_WRAP virtual void setAngleStep(double angleStep) = 0;
+    CV_WRAP virtual double getAngleStep() const = 0;
+
+    //! Angle votes threshold.
+    CV_WRAP virtual void setAngleThresh(int angleThresh) = 0;
+    CV_WRAP virtual int getAngleThresh() const = 0;
+
+    //! Minimal scale to detect.
+    CV_WRAP virtual void setMinScale(double minScale) = 0;
+    CV_WRAP virtual double getMinScale() const = 0;
+
+    //! Maximal scale to detect.
+    CV_WRAP virtual void setMaxScale(double maxScale) = 0;
+    CV_WRAP virtual double getMaxScale() const = 0;
+
+    //! Scale step.
+    CV_WRAP virtual void setScaleStep(double scaleStep) = 0;
+    CV_WRAP virtual double getScaleStep() const = 0;
+
+    //! Scale votes threshold.
+    CV_WRAP virtual void setScaleThresh(int scaleThresh) = 0;
+    CV_WRAP virtual int getScaleThresh() const = 0;
+
+    //! Position votes threshold.
+    CV_WRAP virtual void setPosThresh(int posThresh) = 0;
+    CV_WRAP virtual int getPosThresh() const = 0;
+};
+
+//! @} imgproc_shape
+
+//! @addtogroup imgproc_hist
+//! @{
+
+/** @brief Base class for Contrast Limited Adaptive Histogram Equalization (all members are pure virtual).
+*/
+class CV_EXPORTS_W CLAHE : public Algorithm
+{
+public:
+    /** @brief Equalizes the histogram of a grayscale image using Contrast Limited Adaptive Histogram Equalization.
+
+    @param src Source image of type CV_8UC1 or CV_16UC1.
+    @param dst Destination image.
+     */
+    CV_WRAP virtual void apply(InputArray src, OutputArray dst) = 0;
+
+    /** @brief Sets threshold for contrast limiting.
+
+    @param clipLimit threshold value.
+    */
+    CV_WRAP virtual void setClipLimit(double clipLimit) = 0;
+
+    //! Returns threshold value for contrast limiting.
+    CV_WRAP virtual double getClipLimit() const = 0;
+
+    /** @brief Sets size of grid for histogram equalization. Input image will be divided into
+    equally sized rectangular tiles.
+
+    @param tileGridSize defines the number of tiles in row and column.
+    */
+    CV_WRAP virtual void setTilesGridSize(Size tileGridSize) = 0;
+
+    //! @brief Returns the Size that defines the number of tiles in row and column.
+    CV_WRAP virtual Size getTilesGridSize() const = 0;
+
+    CV_WRAP virtual void collectGarbage() = 0; // NOTE(review): undocumented here; presumably releases internal buffers - confirm against the implementation
+};
+
+//! @} imgproc_hist
+
+//! @addtogroup imgproc_subdiv2d
+//! @{
+
+class CV_EXPORTS_W Subdiv2D
+{
+public:
+    /** Subdiv2D point location cases */
+    enum { PTLOC_ERROR        = -2, //!< Point location error
+           PTLOC_OUTSIDE_RECT = -1, //!< Point outside the subdivision bounding rect
+           PTLOC_INSIDE       = 0, //!< Point inside some facet
+           PTLOC_VERTEX       = 1, //!< Point coincides with one of the subdivision vertices
+           PTLOC_ON_EDGE      = 2  //!< Point on some edge
+         };
+
+    /** Subdiv2D edge type navigation (see: getEdge()) */
+    enum { NEXT_AROUND_ORG   = 0x00,
+           NEXT_AROUND_DST   = 0x22,
+           PREV_AROUND_ORG   = 0x11,
+           PREV_AROUND_DST   = 0x33,
+           NEXT_AROUND_LEFT  = 0x13,
+           NEXT_AROUND_RIGHT = 0x31,
+           PREV_AROUND_LEFT  = 0x20,
+           PREV_AROUND_RIGHT = 0x02
+         };
+
+    /** creates an empty Subdiv2D object.
+    To create a new empty Delaunay subdivision you need to use the #initDelaunay function.
+     */
+    CV_WRAP Subdiv2D();
+
+    /** @overload
+
+    @param rect Rectangle that includes all of the 2D points that are to be added to the subdivision.
+
+    The function creates an empty Delaunay subdivision where 2D points can be added using the function
+    insert() . All of the points to be added must be within the specified rectangle, otherwise a runtime
+    error is raised.
+     */
+    CV_WRAP Subdiv2D(Rect rect);
+
+    /** @brief Creates a new empty Delaunay subdivision
+
+    @param rect Rectangle that includes all of the 2D points that are to be added to the subdivision.
+
+     */
+    CV_WRAP void initDelaunay(Rect rect);
+
+    /** @brief Insert a single point into a Delaunay triangulation.
+
+    @param pt Point to insert.
+
+    The function inserts a single point into a subdivision and modifies the subdivision topology
+    appropriately. If a point with the same coordinates exists already, no new point is added.
+    @returns the ID of the point.
+
+    @note If the point is outside of the triangulation specified rect a runtime error is raised.
+     */
+    CV_WRAP int insert(Point2f pt);
+
+    /** @brief Insert multiple points into a Delaunay triangulation.
+
+    @param ptvec Points to insert.
+
+    The function inserts a vector of points into a subdivision and modifies the subdivision topology
+    appropriately.
+     */
+    CV_WRAP void insert(const std::vector<Point2f>& ptvec);
+
+    /** @brief Returns the location of a point within a Delaunay triangulation.
+
+    @param pt Point to locate.
+    @param edge Output edge that the point belongs to or is located to the right of it.
+    @param vertex Optional output vertex the input point coincides with.
+
+    The function locates the input point within the subdivision and gives one of the triangle edges
+    or vertices.
+
+    @returns an integer which specifies one of the following five cases for point location:
+    -  The point falls into some facet. The function returns #PTLOC_INSIDE and edge will contain one of
+       edges of the facet.
+    -  The point falls onto the edge. The function returns #PTLOC_ON_EDGE and edge will contain this edge.
+    -  The point coincides with one of the subdivision vertices. The function returns #PTLOC_VERTEX and
+       vertex will contain a pointer to the vertex.
+    -  The point is outside the subdivision reference rectangle. The function returns #PTLOC_OUTSIDE_RECT
+       and no pointers are filled.
+    -  One of input arguments is invalid. A runtime error is raised or, if silent or "parent" error
+       processing mode is selected, #PTLOC_ERROR is returned.
+     */
+    CV_WRAP int locate(Point2f pt, CV_OUT int& edge, CV_OUT int& vertex);
+
+    /** @brief Finds the subdivision vertex closest to the given point.
+
+    @param pt Input point.
+    @param nearestPt Output subdivision vertex point.
+
+    The function is another function that locates the input point within the subdivision. It finds the
+    subdivision vertex that is the closest to the input point. It is not necessarily one of vertices
+    of the facet containing the input point, though the facet (located using locate() ) is used as a
+    starting point.
+
+    @returns vertex ID.
+     */
+    CV_WRAP int findNearest(Point2f pt, CV_OUT Point2f* nearestPt = 0);
+
+    /** @brief Returns a list of all edges.
+
+    @param edgeList Output vector.
+
+    The function gives each edge as a 4 numbers vector, where each two are one of the edge
+    vertices. i.e. org_x = v[0], org_y = v[1], dst_x = v[2], dst_y = v[3].
+     */
+    CV_WRAP void getEdgeList(CV_OUT std::vector<Vec4f>& edgeList) const;
+
+    /** @brief Returns a list of the leading edge ID connected to each triangle.
+
+    @param leadingEdgeList Output vector.
+
+    The function gives one edge ID for each triangle.
+     */
+    CV_WRAP void getLeadingEdgeList(CV_OUT std::vector<int>& leadingEdgeList) const;
+
+    /** @brief Returns a list of all triangles.
+
+    @param triangleList Output vector.
+
+    The function gives each triangle as a 6 numbers vector, where each two are one of the triangle
+    vertices. i.e. p1_x = v[0], p1_y = v[1], p2_x = v[2], p2_y = v[3], p3_x = v[4], p3_y = v[5].
+     */
+    CV_WRAP void getTriangleList(CV_OUT std::vector<Vec6f>& triangleList) const;
+
+    /** @brief Returns a list of all Voronoi facets.
+
+    @param idx Vector of vertices IDs to consider. For all vertices you can pass empty vector.
+    @param facetList Output vector of the Voronoi facets.
+    @param facetCenters Output vector of the Voronoi facets center points.
+
+     */
+    CV_WRAP void getVoronoiFacetList(const std::vector<int>& idx, CV_OUT std::vector<std::vector<Point2f> >& facetList,
+                                     CV_OUT std::vector<Point2f>& facetCenters);
+
+    /** @brief Returns vertex location from vertex ID.
+
+    @param vertex vertex ID.
+    @param firstEdge Optional. The first edge ID which is connected to the vertex.
+    @returns vertex (x,y)
+
+     */
+    CV_WRAP Point2f getVertex(int vertex, CV_OUT int* firstEdge = 0) const;
+
+    /** @brief Returns one of the edges related to the given edge.
+
+    @param edge Subdivision edge ID.
+    @param nextEdgeType Parameter specifying which of the related edges to return.
+    The following values are possible:
+    -   NEXT_AROUND_ORG next around the edge origin ( eOnext on the picture below if e is the input edge)
+    -   NEXT_AROUND_DST next around the edge destination ( eDnext )
+    -   PREV_AROUND_ORG previous around the edge origin (reversed eRnext )
+    -   PREV_AROUND_DST previous around the edge destination (reversed eLnext )
+    -   NEXT_AROUND_LEFT next around the left facet ( eLnext )
+    -   NEXT_AROUND_RIGHT next around the right facet ( eRnext )
+    -   PREV_AROUND_LEFT previous around the left facet (reversed eOnext )
+    -   PREV_AROUND_RIGHT previous around the right facet (reversed eDnext )
+
+    ![sample output](pics/quadedge.png)
+
+    @returns edge ID related to the input edge.
+     */
+    CV_WRAP int getEdge( int edge, int nextEdgeType ) const;
+
+    /** @brief Returns next edge around the edge origin.
+
+    @param edge Subdivision edge ID.
+
+    @returns an integer which is the next edge ID around the edge origin ( eOnext on the
+    picture above if e is the input edge).
+     */
+    CV_WRAP int nextEdge(int edge) const;
+
+    /** @brief Returns another edge of the same quad-edge.
+
+    @param edge Subdivision edge ID.
+    @param rotate Parameter specifying which of the edges of the same quad-edge as the input
+    one to return. The following values are possible:
+    -   0 - the input edge ( e on the picture above if e is the input edge)
+    -   1 - the rotated edge ( eRot )
+    -   2 - the reversed edge (reversed e (in green))
+    -   3 - the reversed rotated edge (reversed eRot (in green))
+
+    @returns one of the edges ID of the same quad-edge as the input edge.
+     */
+    CV_WRAP int rotateEdge(int edge, int rotate) const;
+    CV_WRAP int symEdge(int edge) const;
+
+    /** @brief Returns the edge origin.
+
+    @param edge Subdivision edge ID.
+    @param orgpt Output vertex location.
+
+    @returns vertex ID.
+     */
+    CV_WRAP int edgeOrg(int edge, CV_OUT Point2f* orgpt = 0) const;
+
+    /** @brief Returns the edge destination.
+
+    @param edge Subdivision edge ID.
+    @param dstpt Output vertex location.
+
+    @returns vertex ID.
+     */
+    CV_WRAP int edgeDst(int edge, CV_OUT Point2f* dstpt = 0) const;
+
+protected:
+    int newEdge();
+    void deleteEdge(int edge);
+    int newPoint(Point2f pt, bool isvirtual, int firstEdge = 0);
+    void deletePoint(int vtx);
+    void setEdgePoints( int edge, int orgPt, int dstPt );
+    void splice( int edgeA, int edgeB );
+    int connectEdges( int edgeA, int edgeB );
+    void swapEdges( int edge );
+    int isRightOf(Point2f pt, int edge) const;
+    void calcVoronoi();
+    void clearVoronoi();
+    void checkSubdiv() const;
+
+    struct CV_EXPORTS Vertex
+    {
+        Vertex();
+        Vertex(Point2f pt, bool isvirtual, int firstEdge=0);
+        bool isvirtual() const;
+        bool isfree() const;
+
+        int firstEdge;
+        int type;
+        Point2f pt;
+    };
+
+    struct CV_EXPORTS QuadEdge
+    {
+        QuadEdge();
+        QuadEdge(int edgeidx);
+        bool isfree() const;
+
+        int next[4];
+        int pt[4];
+    };
+
+    //! All of the vertices
+    std::vector<Vertex> vtx;
+    //! All of the edges
+    std::vector<QuadEdge> qedges;
+    int freeQEdge;
+    int freePoint;
+    bool validGeometry;
+
+    int recentEdge;
+    //! Top left corner of the bounding rect
+    Point2f topLeft;
+    //! Bottom right corner of the bounding rect
+    Point2f bottomRight;
+};
+
+//! @} imgproc_subdiv2d
+
+//! @addtogroup imgproc_feature
+//! @{
+
+/** @example samples/cpp/lsd_lines.cpp
+An example using the LineSegmentDetector
+\image html building_lsd.png "Sample output image" width=434 height=300
+*/
+
+/** @brief Line segment detector class
+
+following the algorithm described at @cite Rafael12 .
+
+@note The implementation was removed from OpenCV versions 3.4.6 to 3.4.15 and 4.1.0 to 4.5.3 due to a license conflict in the original code.
+It was restored after the [Computation of a NFA](https://github.com/rafael-grompone-von-gioi/binomial_nfa) code was published under the MIT license.
+*/
+class CV_EXPORTS_W LineSegmentDetector : public Algorithm
+{
+public:
+
+    /** @brief Finds lines in the input image.
+
+    This is the output of the default parameters of the algorithm on the above shown image.
+
+    ![image](pics/building_lsd.png)
+
+    @param image A grayscale (CV_8UC1) input image. If only a roi needs to be selected, use:
+    `lsd_ptr-\>detect(image(roi), lines, ...); lines += Scalar(roi.x, roi.y, roi.x, roi.y);`
+    @param lines A vector of Vec4f elements specifying the beginning and ending point of a line. Where
+    Vec4f is (x1, y1, x2, y2), point 1 is the start, point 2 - end. Returned lines are strictly
+    oriented depending on the gradient.
+    @param width Vector of widths of the regions, where the lines are found. E.g. Width of line.
+    @param prec Vector of precisions with which the lines are found.
+    @param nfa Vector containing number of false alarms in the line region, with precision of 10%. The
+    bigger the value, logarithmically better the detection.
+    - -1 corresponds to 10 mean false alarms
+    - 0 corresponds to 1 mean false alarm
+    - 1 corresponds to 0.1 mean false alarms
+    This vector will be calculated only when the object's type is #LSD_REFINE_ADV.
+    */
+    CV_WRAP virtual void detect(InputArray image, OutputArray lines,
+                        OutputArray width = noArray(), OutputArray prec = noArray(),
+                        OutputArray nfa = noArray()) = 0;
+
+    /** @brief Draws the line segments on a given image.
+    @param image The image, where the lines will be drawn. Should be bigger or equal to the image,
+    where the lines were found.
+    @param lines A vector of the lines that need to be drawn.
+     */
+    CV_WRAP virtual void drawSegments(InputOutputArray image, InputArray lines) = 0;
+
+    /** @brief Draws two groups of lines in blue and red, counting the non overlapping (mismatching) pixels.
+
+    @param size The size of the image, where lines1 and lines2 were found.
+    @param lines1 The first group of lines that needs to be drawn. It is visualized in blue color.
+    @param lines2 The second group of lines. They are visualized in red color.
+    @param image Optional image, where the lines will be drawn. The image should be color (3-channel)
+    in order for lines1 and lines2 to be drawn in the above mentioned colors.
+     */
+    CV_WRAP virtual int compareSegments(const Size& size, InputArray lines1, InputArray lines2, InputOutputArray image = noArray()) = 0;
+
+    virtual ~LineSegmentDetector() { }
+};
+
+/** @brief Creates a smart pointer to a LineSegmentDetector object and initializes it.
+
+The LineSegmentDetector algorithm is defined using the standard values. Only advanced users may want
+to edit those, so as to tailor it for their own application.
+
+@param refine The way found lines will be refined, see #LineSegmentDetectorModes
+@param scale The scale of the image that will be used to find the lines. Range (0..1].
+@param sigma_scale Sigma for Gaussian filter. It is computed as sigma = sigma_scale/scale.
+@param quant Bound to the quantization error on the gradient norm.
+@param ang_th Gradient angle tolerance in degrees.
+@param log_eps Detection threshold: -log10(NFA) \> log_eps. Used only when advanced refinement is chosen.
+@param density_th Minimal density of aligned region points in the enclosing rectangle.
+@param n_bins Number of bins in pseudo-ordering of gradient modulus.
+ */
+CV_EXPORTS_W Ptr<LineSegmentDetector> createLineSegmentDetector(
+    int refine = LSD_REFINE_STD, double scale = 0.8,
+    double sigma_scale = 0.6, double quant = 2.0, double ang_th = 22.5,
+    double log_eps = 0, double density_th = 0.7, int n_bins = 1024);
+
+//! @} imgproc_feature
+
+//! @addtogroup imgproc_filter
+//! @{
+
+/** @brief Returns Gaussian filter coefficients.
+
+The function computes and returns the \f$\texttt{ksize} \times 1\f$ matrix of Gaussian filter
+coefficients:
+
+\f[G_i= \alpha *e^{-(i-( \texttt{ksize} -1)/2)^2/(2* \texttt{sigma}^2)},\f]
+
+where \f$i=0..\texttt{ksize}-1\f$ and \f$\alpha\f$ is the scale factor chosen so that \f$\sum_i G_i=1\f$.
+
+Two of such generated kernels can be passed to sepFilter2D. Those functions automatically recognize
+smoothing kernels (a symmetrical kernel with sum of weights equal to 1) and handle them accordingly.
+You may also use the higher-level GaussianBlur.
+@param ksize Aperture size. It should be odd ( \f$\texttt{ksize} \mod 2 = 1\f$ ) and positive.
+@param sigma Gaussian standard deviation. If it is non-positive, it is computed from ksize as
+`sigma = 0.3*((ksize-1)*0.5 - 1) + 0.8`.
+@param ktype Type of filter coefficients. It can be CV_32F or CV_64F .
+@sa  sepFilter2D, getDerivKernels, getStructuringElement, GaussianBlur
+ */
+CV_EXPORTS_W Mat getGaussianKernel( int ksize, double sigma, int ktype = CV_64F );
+
+/** @brief Returns filter coefficients for computing spatial image derivatives.
+
+The function computes and returns the filter coefficients for spatial image derivatives. When
+`ksize=FILTER_SCHARR`, the Scharr \f$3 \times 3\f$ kernels are generated (see #Scharr). Otherwise, Sobel
+kernels are generated (see #Sobel). The filters are normally passed to #sepFilter2D.
+
+@param kx Output matrix of row filter coefficients. It has the type ktype .
+@param ky Output matrix of column filter coefficients. It has the type ktype .
+@param dx Derivative order in respect of x.
+@param dy Derivative order in respect of y.
+@param ksize Aperture size. It can be FILTER_SCHARR, 1, 3, 5, or 7.
+@param normalize Flag indicating whether to normalize (scale down) the filter coefficients or not.
+Theoretically, the coefficients should have the denominator \f$=2^{ksize*2-dx-dy-2}\f$. If you are
+going to filter floating-point images, you are likely to use the normalized kernels. But if you
+compute derivatives of an 8-bit image, store the results in a 16-bit image, and wish to preserve
+all the fractional bits, you may want to set normalize=false .
+@param ktype Type of filter coefficients. It can be CV_32F or CV_64F .
+ */
+CV_EXPORTS_W void getDerivKernels( OutputArray kx, OutputArray ky,
+                                   int dx, int dy, int ksize,
+                                   bool normalize = false, int ktype = CV_32F );
+
+/** @brief Returns Gabor filter coefficients.
+
+For more details about gabor filter equations and parameters, see: [Gabor
+Filter](http://en.wikipedia.org/wiki/Gabor_filter).
+
+@param ksize Size of the filter returned.
+@param sigma Standard deviation of the gaussian envelope.
+@param theta Orientation of the normal to the parallel stripes of a Gabor function.
+@param lambd Wavelength of the sinusoidal factor.
+@param gamma Spatial aspect ratio.
+@param psi Phase offset.
+@param ktype Type of filter coefficients. It can be CV_32F or CV_64F .
+ */
+CV_EXPORTS_W Mat getGaborKernel( Size ksize, double sigma, double theta, double lambd,
+                                 double gamma, double psi = CV_PI*0.5, int ktype = CV_64F );
+
+//! Returns the "magic" border value for erosion and dilation. It is automatically transformed to Scalar::all(-DBL_MAX) for dilation.
+static inline Scalar morphologyDefaultBorderValue() { return Scalar::all(DBL_MAX); }
+
+/** @brief Returns a structuring element of the specified size and shape for morphological operations.
+
+The function constructs and returns the structuring element that can be further passed to #erode,
+#dilate or #morphologyEx. But you can also construct an arbitrary binary mask yourself and use it as
+the structuring element.
+
+@param shape Element shape that could be one of #MorphShapes
+@param ksize Size of the structuring element.
+@param anchor Anchor position within the element. The default value \f$(-1, -1)\f$ means that the
+anchor is at the center. Note that only the shape of a cross-shaped element depends on the anchor
+position. In other cases the anchor just regulates how much the result of the morphological
+operation is shifted.
+ */
+CV_EXPORTS_W Mat getStructuringElement(int shape, Size ksize, Point anchor = Point(-1,-1));
+
+/** @example samples/cpp/tutorial_code/ImgProc/Smoothing/Smoothing.cpp
+Sample code for simple filters
+![Sample screenshot](Smoothing_Tutorial_Result_Median_Filter.jpg)
+Check @ref tutorial_gausian_median_blur_bilateral_filter "the corresponding tutorial" for more details
+ */
+
+/** @brief Blurs an image using the median filter.
+
+The function smoothes an image using the median filter with the \f$\texttt{ksize} \times
+\texttt{ksize}\f$ aperture. Each channel of a multi-channel image is processed independently.
+In-place operation is supported.
+
+@note The median filter uses #BORDER_REPLICATE internally to cope with border pixels, see #BorderTypes
+
+@param src input 1-, 3-, or 4-channel image; when ksize is 3 or 5, the image depth should be
+CV_8U, CV_16U, or CV_32F, for larger aperture sizes, it can only be CV_8U.
+@param dst destination array of the same size and type as src.
+@param ksize aperture linear size; it must be odd and greater than 1, for example: 3, 5, 7 ...
+@sa  bilateralFilter, blur, boxFilter, GaussianBlur
+ */
+CV_EXPORTS_W void medianBlur( InputArray src, OutputArray dst, int ksize );
+
+/** @brief Blurs an image using a Gaussian filter.
+
+The function convolves the source image with the specified Gaussian kernel. In-place filtering is
+supported.
+
+@param src input image; the image can have any number of channels, which are processed
+independently, but the depth should be CV_8U, CV_16U, CV_16S, CV_32F or CV_64F.
+@param dst output image of the same size and type as src.
+@param ksize Gaussian kernel size. ksize.width and ksize.height can differ but they both must be
+positive and odd. Alternatively, they can be zeros, in which case they are computed from sigma.
+@param sigmaX Gaussian kernel standard deviation in X direction.
+@param sigmaY Gaussian kernel standard deviation in Y direction; if sigmaY is zero, it is set to be
+equal to sigmaX, if both sigmas are zeros, they are computed from ksize.width and ksize.height,
+respectively (see #getGaussianKernel for details); to fully control the result regardless of
+possible future modifications of all this semantics, it is recommended to specify all of ksize,
+sigmaX, and sigmaY.
+@param borderType pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+
+@sa  sepFilter2D, filter2D, blur, boxFilter, bilateralFilter, medianBlur
+ */
+CV_EXPORTS_W void GaussianBlur( InputArray src, OutputArray dst, Size ksize,
+                                double sigmaX, double sigmaY = 0,
+                                int borderType = BORDER_DEFAULT );
+
+/** @brief Applies the bilateral filter to an image.
+
+The function applies bilateral filtering to the input image, as described in
+http://www.dai.ed.ac.uk/CVonline/LOCAL_COPIES/MANDUCHI1/Bilateral_Filtering.html
+bilateralFilter can reduce unwanted noise very well while keeping edges fairly sharp. However, it is
+very slow compared to most filters.
+
+_Sigma values_: For simplicity, you can set the 2 sigma values to be the same. If they are small (\<
+10), the filter will not have much effect, whereas if they are large (\> 150), they will have a very
+strong effect, making the image look "cartoonish".
+
+_Filter size_: Large filters (d \> 5) are very slow, so it is recommended to use d=5 for real-time
+applications, and perhaps d=9 for offline applications that need heavy noise filtering.
+
+This filter does not work inplace.
+@param src Source 8-bit or floating-point, 1-channel or 3-channel image.
+@param dst Destination image of the same size and type as src .
+@param d Diameter of each pixel neighborhood that is used during filtering. If it is non-positive,
+it is computed from sigmaSpace.
+@param sigmaColor Filter sigma in the color space. A larger value of the parameter means that
+farther colors within the pixel neighborhood (see sigmaSpace) will be mixed together, resulting
+in larger areas of semi-equal color.
+@param sigmaSpace Filter sigma in the coordinate space. A larger value of the parameter means that
+farther pixels will influence each other as long as their colors are close enough (see sigmaColor
+). When d\>0, it specifies the neighborhood size regardless of sigmaSpace. Otherwise, d is
+proportional to sigmaSpace.
+@param borderType border mode used to extrapolate pixels outside of the image, see #BorderTypes
+ */
+CV_EXPORTS_W void bilateralFilter( InputArray src, OutputArray dst, int d,
+                                   double sigmaColor, double sigmaSpace,
+                                   int borderType = BORDER_DEFAULT );
+
+/** @brief Blurs an image using the box filter.
+
+The function smooths an image using the kernel:
+
+\f[\texttt{K} =  \alpha \begin{bmatrix} 1 & 1 & 1 &  \cdots & 1 & 1  \\ 1 & 1 & 1 &  \cdots & 1 & 1  \\ \hdotsfor{6} \\ 1 & 1 & 1 &  \cdots & 1 & 1 \end{bmatrix}\f]
+
+where
+
+\f[\alpha = \begin{cases} \frac{1}{\texttt{ksize.width*ksize.height}} & \texttt{when } \texttt{normalize=true}  \\1 & \texttt{otherwise}\end{cases}\f]
+
+Unnormalized box filter is useful for computing various integral characteristics over each pixel
+neighborhood, such as covariance matrices of image derivatives (used in dense optical flow
+algorithms, and so on). If you need to compute pixel sums over variable-size windows, use #integral.
+
+@param src input image.
+@param dst output image of the same size and type as src.
+@param ddepth the output image depth (-1 to use src.depth()).
+@param ksize blurring kernel size.
+@param anchor anchor point; default value Point(-1,-1) means that the anchor is at the kernel
+center.
+@param normalize flag, specifying whether the kernel is normalized by its area or not.
+@param borderType border mode used to extrapolate pixels outside of the image, see #BorderTypes. #BORDER_WRAP is not supported.
+@sa  blur, bilateralFilter, GaussianBlur, medianBlur, integral
+ */
+CV_EXPORTS_W void boxFilter( InputArray src, OutputArray dst, int ddepth,
+                             Size ksize, Point anchor = Point(-1,-1),
+                             bool normalize = true,
+                             int borderType = BORDER_DEFAULT );
+
+/** @brief Calculates the normalized sum of squares of the pixel values overlapping the filter.
+
+For every pixel \f$ (x, y) \f$ in the source image, the function calculates the sum of squares of those neighboring
+pixel values which overlap the filter placed over the pixel \f$ (x, y) \f$.
+
+The unnormalized square box filter can be useful in computing local image statistics such as the local
+variance and standard deviation around the neighborhood of a pixel.
+
+@param src input image
+@param dst output image of the same size and type as src
+@param ddepth the output image depth (-1 to use src.depth())
+@param ksize kernel size
+@param anchor kernel anchor point. The default value of Point(-1, -1) denotes that the anchor is at the kernel
+center.
+@param normalize flag, specifying whether the kernel is to be normalized by its area or not.
+@param borderType border mode used to extrapolate pixels outside of the image, see #BorderTypes. #BORDER_WRAP is not supported.
+@sa boxFilter
+*/
+CV_EXPORTS_W void sqrBoxFilter( InputArray src, OutputArray dst, int ddepth,
+                                Size ksize, Point anchor = Point(-1, -1),
+                                bool normalize = true,
+                                int borderType = BORDER_DEFAULT );
+
+/** @brief Blurs an image using the normalized box filter.
+
+The function smooths an image using the kernel:
+
+\f[\texttt{K} =  \frac{1}{\texttt{ksize.width*ksize.height}} \begin{bmatrix} 1 & 1 & 1 &  \cdots & 1 & 1  \\ 1 & 1 & 1 &  \cdots & 1 & 1  \\ \hdotsfor{6} \\ 1 & 1 & 1 &  \cdots & 1 & 1  \\ \end{bmatrix}\f]
+
+The call `blur(src, dst, ksize, anchor, borderType)` is equivalent to `boxFilter(src, dst, -1, ksize,
+anchor, true, borderType)`.
+
+@param src input image; it can have any number of channels, which are processed independently, but
+the depth should be CV_8U, CV_16U, CV_16S, CV_32F or CV_64F.
+@param dst output image of the same size and type as src.
+@param ksize blurring kernel size.
+@param anchor anchor point; default value Point(-1,-1) means that the anchor is at the kernel
+center.
+@param borderType border mode used to extrapolate pixels outside of the image, see #BorderTypes. #BORDER_WRAP is not supported.
+@sa  boxFilter, bilateralFilter, GaussianBlur, medianBlur
+ */
+CV_EXPORTS_W void blur( InputArray src, OutputArray dst,
+                        Size ksize, Point anchor = Point(-1,-1),
+                        int borderType = BORDER_DEFAULT );
+
+/** @brief Blurs an image using the stackBlur.
+
+The function applies stackBlur to an image.
+stackBlur can generate similar results as Gaussian blur, and the time consumption does not increase with the increase of kernel size.
+It creates a kind of moving stack of colors whilst scanning through the image. Thereby it just has to add one new block of color to the right side
+of the stack and remove the leftmost color. The remaining colors on the topmost layer of the stack are either added on or reduced by one,
+depending on if they are on the right or on the left side of the stack. The only supported borderType is BORDER_REPLICATE.
+The original algorithm was proposed by Mario Klingemann and can be found at http://underdestruction.com/2004/02/25/stackblur-2004.
+
+@param src input image. The number of channels can be arbitrary, but the depth should be one of
+CV_8U, CV_16U, CV_16S or CV_32F.
+@param dst output image of the same size and type as src.
+@param ksize stack-blurring kernel size. The ksize.width and ksize.height can differ but they both must be
+positive and odd.
+*/
+CV_EXPORTS_W void stackBlur(InputArray src, OutputArray dst, Size ksize);
+
+/** @brief Convolves an image with the kernel.
+
+The function applies an arbitrary linear filter to an image. In-place operation is supported. When
+the aperture is partially outside the image, the function interpolates outlier pixel values
+according to the specified border mode.
+
+The function does actually compute correlation, not the convolution:
+
+\f[\texttt{dst} (x,y) =  \sum _{ \substack{0\leq x' < \texttt{kernel.cols}\\{0\leq y' < \texttt{kernel.rows}}}}  \texttt{kernel} (x',y')* \texttt{src} (x+x'- \texttt{anchor.x} ,y+y'- \texttt{anchor.y} )\f]
+
+That is, the kernel is not mirrored around the anchor point. If you need a real convolution, flip
+the kernel using #flip and set the new anchor to `(kernel.cols - anchor.x - 1, kernel.rows -
+anchor.y - 1)`.
+
+The function uses the DFT-based algorithm in case of sufficiently large kernels (~`11 x 11` or
+larger) and the direct algorithm for small kernels.
+
+@param src input image.
+@param dst output image of the same size and the same number of channels as src.
+@param ddepth desired depth of the destination image, see @ref filter_depths "combinations"
+@param kernel convolution kernel (or rather a correlation kernel), a single-channel floating point
+matrix; if you want to apply different kernels to different channels, split the image into
+separate color planes using split and process them individually.
+@param anchor anchor of the kernel that indicates the relative position of a filtered point within
+the kernel; the anchor should lie within the kernel; default value (-1,-1) means that the anchor
+is at the kernel center.
+@param delta optional value added to the filtered pixels before storing them in dst.
+@param borderType pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@sa  sepFilter2D, dft, matchTemplate
+ */
+CV_EXPORTS_W void filter2D( InputArray src, OutputArray dst, int ddepth,
+                            InputArray kernel, Point anchor = Point(-1,-1),
+                            double delta = 0, int borderType = BORDER_DEFAULT );
+
+/** @brief Applies a separable linear filter to an image.
+
+The function applies a separable linear filter to the image. That is, first, every row of src is
+filtered with the 1D kernel kernelX. Then, every column of the result is filtered with the 1D
+kernel kernelY. The final result shifted by delta is stored in dst .
+
+@param src Source image.
+@param dst Destination image of the same size and the same number of channels as src .
+@param ddepth Destination image depth, see @ref filter_depths "combinations"
+@param kernelX Coefficients for filtering each row.
+@param kernelY Coefficients for filtering each column.
+@param anchor Anchor position within the kernel. The default value \f$(-1,-1)\f$ means that the anchor
+is at the kernel center.
+@param delta Value added to the filtered results before storing them.
+@param borderType Pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@sa  filter2D, Sobel, GaussianBlur, boxFilter, blur
+ */
+CV_EXPORTS_W void sepFilter2D( InputArray src, OutputArray dst, int ddepth,
+                               InputArray kernelX, InputArray kernelY,
+                               Point anchor = Point(-1,-1),
+                               double delta = 0, int borderType = BORDER_DEFAULT );
+
+/** @example samples/cpp/tutorial_code/ImgTrans/Sobel_Demo.cpp
+Sample code using Sobel and/or Scharr OpenCV functions to make a simple Edge Detector
+![Sample screenshot](Sobel_Derivatives_Tutorial_Result.jpg)
+Check @ref tutorial_sobel_derivatives "the corresponding tutorial" for more details
+*/
+
+/** @brief Calculates the first, second, third, or mixed image derivatives using an extended Sobel operator.
+
+In all cases except one, the \f$\texttt{ksize} \times \texttt{ksize}\f$ separable kernel is used to
+calculate the derivative. When \f$\texttt{ksize = 1}\f$, the \f$3 \times 1\f$ or \f$1 \times 3\f$
+kernel is used (that is, no Gaussian smoothing is done). `ksize = 1` can only be used for the first
+or the second x- or y- derivatives.
+
+There is also the special value `ksize = #FILTER_SCHARR (-1)` that corresponds to the \f$3\times3\f$ Scharr
+filter that may give more accurate results than the \f$3\times3\f$ Sobel. The Scharr aperture is
+
+\f[\vecthreethree{-3}{0}{3}{-10}{0}{10}{-3}{0}{3}\f]
+
+for the x-derivative, or transposed for the y-derivative.
+
+The function calculates an image derivative by convolving the image with the appropriate kernel:
+
+\f[\texttt{dst} =  \frac{\partial^{xorder+yorder} \texttt{src}}{\partial x^{xorder} \partial y^{yorder}}\f]
+
+The Sobel operators combine Gaussian smoothing and differentiation, so the result is more or less
+resistant to the noise. Most often, the function is called with ( xorder = 1, yorder = 0, ksize = 3)
+or ( xorder = 0, yorder = 1, ksize = 3) to calculate the first x- or y- image derivative. The first
+case corresponds to a kernel of:
+
+\f[\vecthreethree{-1}{0}{1}{-2}{0}{2}{-1}{0}{1}\f]
+
+The second case corresponds to a kernel of:
+
+\f[\vecthreethree{-1}{-2}{-1}{0}{0}{0}{1}{2}{1}\f]
+
+@param src input image.
+@param dst output image of the same size and the same number of channels as src .
+@param ddepth output image depth, see @ref filter_depths "combinations"; in the case of
+    8-bit input images it will result in truncated derivatives.
+@param dx order of the derivative x.
+@param dy order of the derivative y.
+@param ksize size of the extended Sobel kernel; it must be 1, 3, 5, or 7.
+@param scale optional scale factor for the computed derivative values; by default, no scaling is
+applied (see #getDerivKernels for details).
+@param delta optional delta value that is added to the results prior to storing them in dst.
+@param borderType pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@sa  Scharr, Laplacian, sepFilter2D, filter2D, GaussianBlur, cartToPolar
+ */
+CV_EXPORTS_W void Sobel( InputArray src, OutputArray dst, int ddepth,
+                         int dx, int dy, int ksize = 3,
+                         double scale = 1, double delta = 0,
+                         int borderType = BORDER_DEFAULT );
+
+/** @brief Calculates the first order image derivative in both x and y using a Sobel operator
+
+Equivalent to calling:
+
+@code
+Sobel( src, dx, CV_16SC1, 1, 0, 3 );
+Sobel( src, dy, CV_16SC1, 0, 1, 3 );
+@endcode
+
+@param src input image.
+@param dx output image with first-order derivative in x.
+@param dy output image with first-order derivative in y.
+@param ksize size of Sobel kernel. It must be 3.
+@param borderType pixel extrapolation method, see #BorderTypes.
+                  Only #BORDER_DEFAULT=#BORDER_REFLECT_101 and #BORDER_REPLICATE are supported.
+
+@sa Sobel
+ */
+
+CV_EXPORTS_W void spatialGradient( InputArray src, OutputArray dx,
+                                   OutputArray dy, int ksize = 3,
+                                   int borderType = BORDER_DEFAULT );
+
+/** @brief Calculates the first x- or y- image derivative using Scharr operator.
+
+The function computes the first x- or y- spatial image derivative using the Scharr operator. The
+call
+
+\f[\texttt{Scharr(src, dst, ddepth, dx, dy, scale, delta, borderType)}\f]
+
+is equivalent to
+
+\f[\texttt{Sobel(src, dst, ddepth, dx, dy, FILTER_SCHARR, scale, delta, borderType)} .\f]
+
+@param src input image.
+@param dst output image of the same size and the same number of channels as src.
+@param ddepth output image depth, see @ref filter_depths "combinations"
+@param dx order of the derivative x.
+@param dy order of the derivative y.
+@param scale optional scale factor for the computed derivative values; by default, no scaling is
+applied (see #getDerivKernels for details).
+@param delta optional delta value that is added to the results prior to storing them in dst.
+@param borderType pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@sa  cartToPolar
+ */
+CV_EXPORTS_W void Scharr( InputArray src, OutputArray dst, int ddepth,
+                          int dx, int dy, double scale = 1, double delta = 0,
+                          int borderType = BORDER_DEFAULT );
+
+/** @example samples/cpp/laplace.cpp
+An example using Laplace transformations for edge detection
+*/
+
+/** @brief Calculates the Laplacian of an image.
+
+The function calculates the Laplacian of the source image by adding up the second x and y
+derivatives calculated using the Sobel operator:
+
+\f[\texttt{dst} =  \Delta \texttt{src} =  \frac{\partial^2 \texttt{src}}{\partial x^2} +  \frac{\partial^2 \texttt{src}}{\partial y^2}\f]
+
+This is done when `ksize > 1`. When `ksize == 1`, the Laplacian is computed by filtering the image
+with the following \f$3 \times 3\f$ aperture:
+
+\f[\vecthreethree {0}{1}{0}{1}{-4}{1}{0}{1}{0}\f]
+
+@param src Source image.
+@param dst Destination image of the same size and the same number of channels as src .
+@param ddepth Desired depth of the destination image, see @ref filter_depths "combinations".
+@param ksize Aperture size used to compute the second-derivative filters. See #getDerivKernels for
+details. The size must be positive and odd.
+@param scale Optional scale factor for the computed Laplacian values. By default, no scaling is
+applied. See #getDerivKernels for details.
+@param delta Optional delta value that is added to the results prior to storing them in dst .
+@param borderType Pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@sa  Sobel, Scharr
+ */
+CV_EXPORTS_W void Laplacian( InputArray src, OutputArray dst, int ddepth,
+                             int ksize = 1, double scale = 1, double delta = 0,
+                             int borderType = BORDER_DEFAULT );
+
+//! @} imgproc_filter
+
+//! @addtogroup imgproc_feature
+//! @{
+
+/** @example samples/cpp/edge.cpp
+This program demonstrates usage of the Canny edge detector
+
+Check @ref tutorial_canny_detector "the corresponding tutorial" for more details
+*/
+
+/** @brief Finds edges in an image using the Canny algorithm @cite Canny86 .
+
+The function finds edges in the input image and marks them in the output map edges using the
+Canny algorithm. The smallest value between threshold1 and threshold2 is used for edge linking. The
+largest value is used to find initial segments of strong edges. See
+<http://en.wikipedia.org/wiki/Canny_edge_detector>
+
+@param image 8-bit input image.
+@param edges output edge map; single-channel 8-bit image, which has the same size as image .
+@param threshold1 first threshold for the hysteresis procedure.
+@param threshold2 second threshold for the hysteresis procedure.
+@param apertureSize aperture size for the Sobel operator.
+@param L2gradient a flag, indicating whether a more accurate \f$L_2\f$ norm
+\f$=\sqrt{(dI/dx)^2 + (dI/dy)^2}\f$ should be used to calculate the image gradient magnitude (
+L2gradient=true ), or whether the default \f$L_1\f$ norm \f$=|dI/dx|+|dI/dy|\f$ is enough (
+L2gradient=false ).
+ */
+CV_EXPORTS_W void Canny( InputArray image, OutputArray edges,
+                         double threshold1, double threshold2,
+                         int apertureSize = 3, bool L2gradient = false );
+
+/** \overload
+
+Finds edges in an image using the Canny algorithm with custom image gradient.
+
+@param dx 16-bit x derivative of input image (CV_16SC1 or CV_16SC3).
+@param dy 16-bit y derivative of input image (same type as dx).
+@param edges output edge map; single-channel 8-bit image, which has the same size as dx .
+@param threshold1 first threshold for the hysteresis procedure.
+@param threshold2 second threshold for the hysteresis procedure.
+@param L2gradient a flag, indicating whether a more accurate \f$L_2\f$ norm
+\f$=\sqrt{(dI/dx)^2 + (dI/dy)^2}\f$ should be used to calculate the image gradient magnitude (
+L2gradient=true ), or whether the default \f$L_1\f$ norm \f$=|dI/dx|+|dI/dy|\f$ is enough (
+L2gradient=false ).
+ */
+CV_EXPORTS_W void Canny( InputArray dx, InputArray dy,
+                         OutputArray edges,
+                         double threshold1, double threshold2,
+                         bool L2gradient = false );
+
+/** @brief Calculates the minimal eigenvalue of gradient matrices for corner detection.
+
+The function is similar to cornerEigenValsAndVecs but it calculates and stores only the minimal
+eigenvalue of the covariance matrix of derivatives, that is, \f$\min(\lambda_1, \lambda_2)\f$ in terms
+of the formulae in the cornerEigenValsAndVecs description.
+
+@param src Input single-channel 8-bit or floating-point image.
+@param dst Image to store the minimal eigenvalues. It has the type CV_32FC1 and the same size as
+src .
+@param blockSize Neighborhood size (see the details on #cornerEigenValsAndVecs ).
+@param ksize Aperture parameter for the Sobel operator.
+@param borderType Pixel extrapolation method. See #BorderTypes. #BORDER_WRAP is not supported.
+ */
+CV_EXPORTS_W void cornerMinEigenVal( InputArray src, OutputArray dst,
+                                     int blockSize, int ksize = 3,
+                                     int borderType = BORDER_DEFAULT );
+
+/** @brief Harris corner detector.
+
+The function runs the Harris corner detector on the image. Similarly to cornerMinEigenVal and
+cornerEigenValsAndVecs , for each pixel \f$(x, y)\f$ it calculates a \f$2\times2\f$ gradient covariance
+matrix \f$M^{(x,y)}\f$ over a \f$\texttt{blockSize} \times \texttt{blockSize}\f$ neighborhood. Then, it
+computes the following characteristic:
+
+\f[\texttt{dst} (x,y) =  \mathrm{det} M^{(x,y)} - k  \cdot \left ( \mathrm{tr} M^{(x,y)} \right )^2\f]
+
+Corners in the image can be found as the local maxima of this response map.
+
+@param src Input single-channel 8-bit or floating-point image.
+@param dst Image to store the Harris detector responses. It has the type CV_32FC1 and the same
+size as src .
+@param blockSize Neighborhood size (see the details on #cornerEigenValsAndVecs ).
+@param ksize Aperture parameter for the Sobel operator.
+@param k Harris detector free parameter. See the formula above.
+@param borderType Pixel extrapolation method. See #BorderTypes. #BORDER_WRAP is not supported.
+ */
+CV_EXPORTS_W void cornerHarris( InputArray src, OutputArray dst, int blockSize,
+                                int ksize, double k,
+                                int borderType = BORDER_DEFAULT );
+
+/** @brief Calculates eigenvalues and eigenvectors of image blocks for corner detection.
+
+For every pixel \f$p\f$ , the function cornerEigenValsAndVecs considers a blockSize \f$\times\f$ blockSize
+neighborhood \f$S(p)\f$ . It calculates the covariation matrix of derivatives over the neighborhood as:
+
+\f[M =  \begin{bmatrix} \sum _{S(p)}(dI/dx)^2 &  \sum _{S(p)}dI/dx dI/dy  \\ \sum _{S(p)}dI/dx dI/dy &  \sum _{S(p)}(dI/dy)^2 \end{bmatrix}\f]
+
+where the derivatives are computed using the Sobel operator.
+
+After that, it finds eigenvectors and eigenvalues of \f$M\f$ and stores them in the destination image as
+\f$(\lambda_1, \lambda_2, x_1, y_1, x_2, y_2)\f$ where
+
+-   \f$\lambda_1, \lambda_2\f$ are the non-sorted eigenvalues of \f$M\f$
+-   \f$x_1, y_1\f$ are the eigenvectors corresponding to \f$\lambda_1\f$
+-   \f$x_2, y_2\f$ are the eigenvectors corresponding to \f$\lambda_2\f$
+
+The output of the function can be used for robust edge or corner detection.
+
+@param src Input single-channel 8-bit or floating-point image.
+@param dst Image to store the results. It has the same size as src and the type CV_32FC(6) .
+@param blockSize Neighborhood size (see the details above).
+@param ksize Aperture parameter for the Sobel operator.
+@param borderType Pixel extrapolation method. See #BorderTypes. #BORDER_WRAP is not supported.
+
+@sa  cornerMinEigenVal, cornerHarris, preCornerDetect
+ */
+CV_EXPORTS_W void cornerEigenValsAndVecs( InputArray src, OutputArray dst,
+                                          int blockSize, int ksize,
+                                          int borderType = BORDER_DEFAULT );
+
+/** @brief Calculates a feature map for corner detection.
+
+The function calculates the complex spatial derivative-based function of the source image
+
+\f[\texttt{dst} = (D_x  \texttt{src} )^2  \cdot D_{yy}  \texttt{src} + (D_y  \texttt{src} )^2  \cdot D_{xx}  \texttt{src} - 2 D_x  \texttt{src} \cdot D_y  \texttt{src} \cdot D_{xy}  \texttt{src}\f]
+
+where \f$D_x\f$,\f$D_y\f$ are the first image derivatives, \f$D_{xx}\f$,\f$D_{yy}\f$ are the second image
+derivatives, and \f$D_{xy}\f$ is the mixed derivative.
+
+The corners can be found as local maximums of the functions, as shown below:
+@code
+    Mat corners, dilated_corners;
+    preCornerDetect(image, corners, 3);
+    // dilation with 3x3 rectangular structuring element
+    dilate(corners, dilated_corners, Mat(), 1);
+    Mat corner_mask = corners == dilated_corners;
+@endcode
+
+@param src Source single-channel 8-bit or floating-point image.
+@param dst Output image that has the type CV_32F and the same size as src .
+@param ksize %Aperture size of the Sobel operator.
+@param borderType Pixel extrapolation method. See #BorderTypes. #BORDER_WRAP is not supported.
+ */
+CV_EXPORTS_W void preCornerDetect( InputArray src, OutputArray dst, int ksize,
+                                   int borderType = BORDER_DEFAULT );
+
+/** @brief Refines the corner locations.
+
+The function iterates to find the sub-pixel accurate location of corners or radial saddle
+points as described in @cite forstner1987fast, and as shown on the figure below.
+
+![image](pics/cornersubpix.png)
+
+Sub-pixel accurate corner locator is based on the observation that every vector from the center \f$q\f$
+to a point \f$p\f$ located within a neighborhood of \f$q\f$ is orthogonal to the image gradient at \f$p\f$
+subject to image and measurement noise. Consider the expression:
+
+\f[\epsilon _i = {DI_{p_i}}^T  \cdot (q - p_i)\f]
+
+where \f${DI_{p_i}}\f$ is an image gradient at one of the points \f$p_i\f$ in a neighborhood of \f$q\f$ . The
+value of \f$q\f$ is to be found so that \f$\epsilon_i\f$ is minimized. A system of equations may be set up
+with \f$\epsilon_i\f$ set to zero:
+
+\f[\sum _i(DI_{p_i}  \cdot {DI_{p_i}}^T) \cdot q -  \sum _i(DI_{p_i}  \cdot {DI_{p_i}}^T  \cdot p_i) = 0\f]
+
+where the gradients are summed within a neighborhood ("search window") of \f$q\f$ . Calling the first
+gradient term \f$G\f$ and the second gradient term \f$b\f$ gives:
+
+\f[q = G^{-1}  \cdot b\f]
+
+The algorithm sets the center of the neighborhood window at this new center \f$q\f$ and then iterates
+until the center stays within a set threshold.
+
+@param image Input single-channel, 8-bit or float image.
+@param corners Initial coordinates of the input corners and refined coordinates provided for
+output.
+@param winSize Half of the side length of the search window. For example, if winSize=Size(5,5) ,
+then a \f$(5*2+1) \times (5*2+1) = 11 \times 11\f$ search window is used.
+@param zeroZone Half of the size of the dead region in the middle of the search zone over which
+the summation in the formula above is not done. It is sometimes used to avoid possible
+singularities of the autocorrelation matrix. The value of (-1,-1) indicates that there is no such
+size.
+@param criteria Criteria for termination of the iterative process of corner refinement. That is,
+the process of corner position refinement stops either after criteria.maxCount iterations or when
+the corner position moves by less than criteria.epsilon on some iteration.
+ */
+CV_EXPORTS_W void cornerSubPix( InputArray image, InputOutputArray corners,
+                                Size winSize, Size zeroZone,
+                                TermCriteria criteria );
+
+/** @brief Determines strong corners on an image.
+
+The function finds the most prominent corners in the image or in the specified image region, as
+described in @cite Shi94
+
+-   Function calculates the corner quality measure at every source image pixel using the
+    #cornerMinEigenVal or #cornerHarris .
+-   Function performs a non-maximum suppression (the local maximums in *3 x 3* neighborhood are
+    retained).
+-   The corners with the minimal eigenvalue less than
+    \f$\texttt{qualityLevel} \cdot \max_{x,y} qualityMeasureMap(x,y)\f$ are rejected.
+-   The remaining corners are sorted by the quality measure in the descending order.
+-   Function throws away each corner for which there is a stronger corner at a distance less than
+    minDistance.
+
+The function can be used to initialize a point-based tracker of an object.
+
+@note If the function is called with different values A and B of the parameter qualityLevel , and
+A \> B, the vector of returned corners with qualityLevel=A will be the prefix of the output vector
+with qualityLevel=B .
+
+@param image Input 8-bit or floating-point 32-bit, single-channel image.
+@param corners Output vector of detected corners.
+@param maxCorners Maximum number of corners to return. If more corners are found than maxCorners,
+the strongest of them are returned. `maxCorners <= 0` implies that no limit on the maximum is set
+and all detected corners are returned.
+@param qualityLevel Parameter characterizing the minimal accepted quality of image corners. The
+parameter value is multiplied by the best corner quality measure, which is the minimal eigenvalue
+(see #cornerMinEigenVal ) or the Harris function response (see #cornerHarris ). The corners with the
+quality measure less than the product are rejected. For example, if the best corner has the
+quality measure = 1500, and the qualityLevel=0.01 , then all the corners with the quality measure
+less than 15 are rejected.
+@param minDistance Minimum possible Euclidean distance between the returned corners.
+@param mask Optional region of interest. If the mask is not empty (it needs to have the type
+CV_8UC1 and the same size as image ), it specifies the region in which the corners are detected.
+@param blockSize Size of an average block for computing a derivative covariation matrix over each
+pixel neighborhood. See cornerEigenValsAndVecs .
+@param useHarrisDetector Parameter indicating whether to use a Harris detector (see #cornerHarris)
+or #cornerMinEigenVal.
+@param k Free parameter of the Harris detector.
+
+@sa  cornerMinEigenVal, cornerHarris, calcOpticalFlowPyrLK, estimateRigidTransform
+ */
+
+CV_EXPORTS_W void goodFeaturesToTrack( InputArray image, OutputArray corners,
+                                     int maxCorners, double qualityLevel, double minDistance,
+                                     InputArray mask = noArray(), int blockSize = 3,
+                                     bool useHarrisDetector = false, double k = 0.04 );
+
+CV_EXPORTS_W void goodFeaturesToTrack( InputArray image, OutputArray corners,
+                                     int maxCorners, double qualityLevel, double minDistance,
+                                     InputArray mask, int blockSize,
+                                     int gradientSize, bool useHarrisDetector = false,
+                                     double k = 0.04 );
+
+/** @brief Same as above, but returns also quality measure of the detected corners.
+
+@param image Input 8-bit or floating-point 32-bit, single-channel image.
+@param corners Output vector of detected corners.
+@param maxCorners Maximum number of corners to return. If more corners are found than maxCorners,
+the strongest of them are returned. `maxCorners <= 0` implies that no limit on the maximum is set
+and all detected corners are returned.
+@param qualityLevel Parameter characterizing the minimal accepted quality of image corners. The
+parameter value is multiplied by the best corner quality measure, which is the minimal eigenvalue
+(see #cornerMinEigenVal ) or the Harris function response (see #cornerHarris ). The corners with the
+quality measure less than the product are rejected. For example, if the best corner has the
+quality measure = 1500, and the qualityLevel=0.01 , then all the corners with the quality measure
+less than 15 are rejected.
+@param minDistance Minimum possible Euclidean distance between the returned corners.
+@param mask Region of interest. If the mask is not empty (it needs to have the type
+CV_8UC1 and the same size as image ), it specifies the region in which the corners are detected.
+@param cornersQuality Output vector of quality measure of the detected corners.
+@param blockSize Size of an average block for computing a derivative covariation matrix over each
+pixel neighborhood. See cornerEigenValsAndVecs .
+@param gradientSize Aperture parameter for the Sobel operator used for derivatives computation.
+See cornerEigenValsAndVecs .
+@param useHarrisDetector Parameter indicating whether to use a Harris detector (see #cornerHarris)
+or #cornerMinEigenVal.
+@param k Free parameter of the Harris detector.
+ */
+CV_EXPORTS CV_WRAP_AS(goodFeaturesToTrackWithQuality) void goodFeaturesToTrack(
+        InputArray image, OutputArray corners,
+        int maxCorners, double qualityLevel, double minDistance,
+        InputArray mask, OutputArray cornersQuality, int blockSize = 3,
+        int gradientSize = 3, bool useHarrisDetector = false, double k = 0.04);
+
+/** @example samples/cpp/tutorial_code/ImgTrans/houghlines.cpp
+An example using the Hough line detector
+![Sample input image](Hough_Lines_Tutorial_Original_Image.jpg) ![Output image](Hough_Lines_Tutorial_Result.jpg)
+*/
+
+/** @brief Finds lines in a binary image using the standard Hough transform.
+
+The function implements the standard or standard multi-scale Hough transform algorithm for line
+detection. See <http://homepages.inf.ed.ac.uk/rbf/HIPR2/hough.htm> for a good explanation of Hough
+transform.
+
+@param image 8-bit, single-channel binary source image. The image may be modified by the function.
+@param lines Output vector of lines. Each line is represented by a 2 or 3 element vector
+\f$(\rho, \theta)\f$ or \f$(\rho, \theta, \textrm{votes})\f$, where \f$\rho\f$ is the distance from
+the coordinate origin \f$(0,0)\f$ (top-left corner of the image), \f$\theta\f$ is the line rotation
+angle in radians ( \f$0 \sim \textrm{vertical line}, \pi/2 \sim \textrm{horizontal line}\f$ ), and
+\f$\textrm{votes}\f$ is the value of accumulator.
+@param rho Distance resolution of the accumulator in pixels.
+@param theta Angle resolution of the accumulator in radians.
+@param threshold %Accumulator threshold parameter. Only those lines are returned that get enough
+votes ( \f$>\texttt{threshold}\f$ ).
+@param srn For the multi-scale Hough transform, it is a divisor for the distance resolution rho.
+The coarse accumulator distance resolution is rho and the accurate accumulator resolution is
+rho/srn. If both srn=0 and stn=0, the classical Hough transform is used. Otherwise, both these
+parameters should be positive.
+@param stn For the multi-scale Hough transform, it is a divisor for the angle resolution theta.
+@param min_theta For standard and multi-scale Hough transform, minimum angle to check for lines.
+Must fall between 0 and max_theta.
+@param max_theta For standard and multi-scale Hough transform, an upper bound for the angle.
+Must fall between min_theta and CV_PI. The actual maximum angle in the accumulator may be slightly
+less than max_theta, depending on the parameters min_theta and theta.
+ */
+CV_EXPORTS_W void HoughLines( InputArray image, OutputArray lines,
+                              double rho, double theta, int threshold,
+                              double srn = 0, double stn = 0,
+                              double min_theta = 0, double max_theta = CV_PI );
+
+/** @brief Finds line segments in a binary image using the probabilistic Hough transform.
+
+The function implements the probabilistic Hough transform algorithm for line detection, described
+in @cite Matas00
+
+See the line detection example below:
+@include snippets/imgproc_HoughLinesP.cpp
+This is a sample picture the function parameters have been tuned for:
+
+![image](pics/building.jpg)
+
+And this is the output of the above program in case of the probabilistic Hough transform:
+
+![image](pics/houghp.png)
+
+@param image 8-bit, single-channel binary source image. The image may be modified by the function.
+@param lines Output vector of lines. Each line is represented by a 4-element vector
+\f$(x_1, y_1, x_2, y_2)\f$ , where \f$(x_1,y_1)\f$ and \f$(x_2, y_2)\f$ are the ending points of each detected
+line segment.
+@param rho Distance resolution of the accumulator in pixels.
+@param theta Angle resolution of the accumulator in radians.
+@param threshold %Accumulator threshold parameter. Only those lines are returned that get enough
+votes ( \f$>\texttt{threshold}\f$ ).
+@param minLineLength Minimum line length. Line segments shorter than that are rejected.
+@param maxLineGap Maximum allowed gap between points on the same line to link them.
+
+@sa LineSegmentDetector
+ */
+CV_EXPORTS_W void HoughLinesP( InputArray image, OutputArray lines,
+                               double rho, double theta, int threshold,
+                               double minLineLength = 0, double maxLineGap = 0 );
+
+/** @brief Finds lines in a set of points using the standard Hough transform.
+
+The function finds lines in a set of points using a modification of the Hough transform.
+@include snippets/imgproc_HoughLinesPointSet.cpp
+@param point Input vector of points. Each vector must be encoded as a Point vector \f$(x,y)\f$. Type must be CV_32FC2 or CV_32SC2.
+@param lines Output vector of found lines. Each vector is encoded as a vector<Vec3d> \f$(votes, rho, theta)\f$.
+The larger the value of 'votes', the higher the reliability of the Hough line.
+@param lines_max Max count of Hough lines.
+@param threshold %Accumulator threshold parameter. Only those lines are returned that get enough
+votes ( \f$>\texttt{threshold}\f$ ).
+@param min_rho Minimum value for \f$\rho\f$ for the accumulator (Note: \f$\rho\f$ can be negative. The absolute value \f$|\rho|\f$ is the distance of a line to the origin.).
+@param max_rho Maximum value for \f$\rho\f$ for the accumulator.
+@param rho_step Distance resolution of the accumulator.
+@param min_theta Minimum angle value of the accumulator in radians.
+@param max_theta Upper bound for the angle value of the accumulator in radians. The actual maximum
+angle may be slightly less than max_theta, depending on the parameters min_theta and theta_step.
+@param theta_step Angle resolution of the accumulator in radians.
+ */
+CV_EXPORTS_W void HoughLinesPointSet( InputArray point, OutputArray lines, int lines_max, int threshold,
+                                      double min_rho, double max_rho, double rho_step,
+                                      double min_theta, double max_theta, double theta_step );
+
+/** @example samples/cpp/tutorial_code/ImgTrans/houghcircles.cpp
+An example using the Hough circle detector
+*/
+
+/** @brief Finds circles in a grayscale image using the Hough transform.
+
+The function finds circles in a grayscale image using a modification of the Hough transform.
+
+Example:
+@include snippets/imgproc_HoughLinesCircles.cpp
+
+@note Usually the function detects the centers of circles well. However, it may fail to find correct
+radii. You can assist the function by specifying the radius range ( minRadius and maxRadius ) if
+you know it. Or, in the case of #HOUGH_GRADIENT method you may set maxRadius to a negative number
+to return centers only without radius search, and find the correct radius using an additional procedure.
+
+It also helps to smooth image a bit unless it's already soft. For example,
+GaussianBlur() with 7x7 kernel and 1.5x1.5 sigma or similar blurring may help.
+
+@param image 8-bit, single-channel, grayscale input image.
+@param circles Output vector of found circles. Each vector is encoded as  3 or 4 element
+floating-point vector \f$(x, y, radius)\f$ or \f$(x, y, radius, votes)\f$ .
+@param method Detection method, see #HoughModes. The available methods are #HOUGH_GRADIENT and #HOUGH_GRADIENT_ALT.
+@param dp Inverse ratio of the accumulator resolution to the image resolution. For example, if
+dp=1 , the accumulator has the same resolution as the input image. If dp=2 , the accumulator has
+half as big width and height. For #HOUGH_GRADIENT_ALT the recommended value is dp=1.5,
+unless some very small circles need to be detected.
+@param minDist Minimum distance between the centers of the detected circles. If the parameter is
+too small, multiple neighbor circles may be falsely detected in addition to a true one. If it is
+too large, some circles may be missed.
+@param param1 First method-specific parameter. In case of #HOUGH_GRADIENT and #HOUGH_GRADIENT_ALT,
+it is the higher threshold of the two passed to the Canny edge detector (the lower one is half as large).
+Note that #HOUGH_GRADIENT_ALT uses #Scharr algorithm to compute image derivatives, so the threshold value
+should normally be higher, such as 300 for normally exposed and contrasty images.
+@param param2 Second method-specific parameter. In case of #HOUGH_GRADIENT, it is the
+accumulator threshold for the circle centers at the detection stage. The smaller it is, the more
+false circles may be detected. Circles, corresponding to the larger accumulator values, will be
+returned first. In the case of #HOUGH_GRADIENT_ALT algorithm, this is the circle "perfectness" measure.
+The closer it is to 1, the better-shaped circles the algorithm selects. In most cases 0.9 should be fine.
+If you want to get better detection of small circles, you may decrease it to 0.85, 0.8 or even less.
+But then also try to limit the search range [minRadius, maxRadius] to avoid many false circles.
+@param minRadius Minimum circle radius.
+@param maxRadius Maximum circle radius. If <= 0, uses the maximum image dimension. If < 0, #HOUGH_GRADIENT returns
+centers without finding the radius. #HOUGH_GRADIENT_ALT always computes circle radii.
+
+@sa fitEllipse, minEnclosingCircle
+ */
+CV_EXPORTS_W void HoughCircles( InputArray image, OutputArray circles,
+                               int method, double dp, double minDist,
+                               double param1 = 100, double param2 = 100,
+                               int minRadius = 0, int maxRadius = 0 );
+
+//! @} imgproc_feature
+
+//! @addtogroup imgproc_filter
+//! @{
+
+/** @example samples/cpp/tutorial_code/ImgProc/Morphology_2.cpp
+Advanced morphology Transformations sample code
+![Sample screenshot](Morphology_2_Tutorial_Result.jpg)
+Check @ref tutorial_opening_closing_hats "the corresponding tutorial" for more details
+*/
+
+/** @brief Erodes an image by using a specific structuring element.
+
+The function erodes the source image using the specified structuring element that determines the
+shape of a pixel neighborhood over which the minimum is taken:
+
+\f[\texttt{dst} (x,y) =  \min _{(x',y'):  \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f]
+
+The function supports the in-place mode. Erosion can be applied several ( iterations ) times. In
+case of multi-channel images, each channel is processed independently.
+
+@param src input image; the number of channels can be arbitrary, but the depth should be one of
+CV_8U, CV_16U, CV_16S, CV_32F or CV_64F.
+@param dst output image of the same size and type as src.
+@param kernel structuring element used for erosion; if `element=Mat()`, a `3 x 3` rectangular
+structuring element is used. Kernel can be created using #getStructuringElement.
+@param anchor position of the anchor within the element; default value (-1, -1) means that the
+anchor is at the element center.
+@param iterations number of times erosion is applied.
+@param borderType pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@param borderValue border value in case of a constant border
+@sa  dilate, morphologyEx, getStructuringElement
+ */
+CV_EXPORTS_W void erode( InputArray src, OutputArray dst, InputArray kernel,
+                         Point anchor = Point(-1,-1), int iterations = 1,
+                         int borderType = BORDER_CONSTANT,
+                         const Scalar& borderValue = morphologyDefaultBorderValue() );
+
+/** @example samples/cpp/tutorial_code/ImgProc/Morphology_1.cpp
+Erosion and Dilation sample code
+![Sample Screenshot-Erosion](Morphology_1_Tutorial_Erosion_Result.jpg)![Sample Screenshot-Dilation](Morphology_1_Tutorial_Dilation_Result.jpg)
+Check @ref tutorial_erosion_dilatation "the corresponding tutorial" for more details
+*/
+
+/** @brief Dilates an image by using a specific structuring element.
+
+The function dilates the source image using the specified structuring element that determines the
+shape of a pixel neighborhood over which the maximum is taken:
+\f[\texttt{dst} (x,y) =  \max _{(x',y'):  \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f]
+
+The function supports the in-place mode. Dilation can be applied several ( iterations ) times. In
+case of multi-channel images, each channel is processed independently.
+
+@param src input image; the number of channels can be arbitrary, but the depth should be one of
+CV_8U, CV_16U, CV_16S, CV_32F or CV_64F.
+@param dst output image of the same size and type as src.
+@param kernel structuring element used for dilation; if element=Mat(), a 3 x 3 rectangular
+structuring element is used. Kernel can be created using #getStructuringElement
+@param anchor position of the anchor within the element; default value (-1, -1) means that the
+anchor is at the element center.
+@param iterations number of times dilation is applied.
+@param borderType pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@param borderValue border value in case of a constant border
+@sa  erode, morphologyEx, getStructuringElement
+ */
+CV_EXPORTS_W void dilate( InputArray src, OutputArray dst, InputArray kernel,
+                          Point anchor = Point(-1,-1), int iterations = 1,
+                          int borderType = BORDER_CONSTANT,
+                          const Scalar& borderValue = morphologyDefaultBorderValue() );
+
+/** @brief Performs advanced morphological transformations.
+
+The function cv::morphologyEx can perform advanced morphological transformations using an erosion and dilation as
+basic operations.
+
+Any of the operations can be done in-place. In case of multi-channel images, each channel is
+processed independently.
+
+@param src Source image. The number of channels can be arbitrary. The depth should be one of
+CV_8U, CV_16U, CV_16S, CV_32F or CV_64F.
+@param dst Destination image of the same size and type as source image.
+@param op Type of a morphological operation, see #MorphTypes
+@param kernel Structuring element. It can be created using #getStructuringElement.
+@param anchor Anchor position within the kernel. Negative values mean that the anchor is at the
+kernel center.
+@param iterations Number of times erosion and dilation are applied.
+@param borderType Pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@param borderValue Border value in case of a constant border. The default value has a special
+meaning.
+@sa  dilate, erode, getStructuringElement
+@note The number of iterations is the number of times erosion or dilation operation will be applied.
+For instance, an opening operation (#MORPH_OPEN) with two iterations is equivalent to applying
+successively: erode -> erode -> dilate -> dilate (and not erode -> dilate -> erode -> dilate).
+ */
+CV_EXPORTS_W void morphologyEx( InputArray src, OutputArray dst,
+                                int op, InputArray kernel,
+                                Point anchor = Point(-1,-1), int iterations = 1,
+                                int borderType = BORDER_CONSTANT,
+                                const Scalar& borderValue = morphologyDefaultBorderValue() );
+
+//! @} imgproc_filter
+
+//! @addtogroup imgproc_transform
+//! @{
+
+/** @brief Resizes an image.
+
+The function resize resizes the image src down to or up to the specified size. Note that the
+initial dst type or size are not taken into account. Instead, the size and type are derived from
+the `src`,`dsize`,`fx`, and `fy`. If you want to resize src so that it fits the pre-created dst,
+you may call the function as follows:
+@code
+    // explicitly specify dsize=dst.size(); fx and fy will be computed from that.
+    resize(src, dst, dst.size(), 0, 0, interpolation);
+@endcode
+If you want to decimate the image by factor of 2 in each direction, you can call the function this
+way:
+@code
+    // specify fx and fy and let the function compute the destination image size.
+    resize(src, dst, Size(), 0.5, 0.5, interpolation);
+@endcode
+To shrink an image, it will generally look best with #INTER_AREA interpolation, whereas to
+enlarge an image, it will generally look best with #INTER_CUBIC (slow) or #INTER_LINEAR
+(faster but still looks OK).
+
+@param src input image.
+@param dst output image; it has the size dsize (when it is non-zero) or the size computed from
+src.size(), fx, and fy; the type of dst is the same as of src.
+@param dsize output image size; if it equals zero (`None` in Python), it is computed as:
+ \f[\texttt{dsize = Size(round(fx*src.cols), round(fy*src.rows))}\f]
+ Either dsize or both fx and fy must be non-zero.
+@param fx scale factor along the horizontal axis; when it equals 0, it is computed as
+\f[\texttt{(double)dsize.width/src.cols}\f]
+@param fy scale factor along the vertical axis; when it equals 0, it is computed as
+\f[\texttt{(double)dsize.height/src.rows}\f]
+@param interpolation interpolation method, see #InterpolationFlags
+
+@sa  warpAffine, warpPerspective, remap
+ */
+CV_EXPORTS_W void resize( InputArray src, OutputArray dst,
+                          Size dsize, double fx = 0, double fy = 0,
+                          int interpolation = INTER_LINEAR );
+
+/** @brief Applies an affine transformation to an image.
+
+The function warpAffine transforms the source image using the specified matrix:
+
+\f[\texttt{dst} (x,y) =  \texttt{src} ( \texttt{M} _{11} x +  \texttt{M} _{12} y +  \texttt{M} _{13}, \texttt{M} _{21} x +  \texttt{M} _{22} y +  \texttt{M} _{23})\f]
+
+when the flag #WARP_INVERSE_MAP is set. Otherwise, the transformation is first inverted
+with #invertAffineTransform and then put in the formula above instead of M. The function cannot
+operate in-place.
+
+@param src input image.
+@param dst output image that has the size dsize and the same type as src .
+@param M \f$2\times 3\f$ transformation matrix.
+@param dsize size of the output image.
+@param flags combination of interpolation methods (see #InterpolationFlags) and the optional
+flag #WARP_INVERSE_MAP that means that M is the inverse transformation (
+\f$\texttt{dst}\rightarrow\texttt{src}\f$ ).
+@param borderMode pixel extrapolation method (see #BorderTypes); when
+borderMode=#BORDER_TRANSPARENT, it means that the pixels in the destination image corresponding to
+the "outliers" in the source image are not modified by the function.
+@param borderValue value used in case of a constant border; by default, it is 0.
+
+@sa  warpPerspective, resize, remap, getRectSubPix, transform
+ */
+CV_EXPORTS_W void warpAffine( InputArray src, OutputArray dst,
+                              InputArray M, Size dsize,
+                              int flags = INTER_LINEAR,
+                              int borderMode = BORDER_CONSTANT,
+                              const Scalar& borderValue = Scalar());
+
+/** @example samples/cpp/warpPerspective_demo.cpp
+An example program shows using cv::getPerspectiveTransform and cv::warpPerspective for image warping
+*/
+
+/** @brief Applies a perspective transformation to an image.
+
+The function warpPerspective transforms the source image using the specified matrix:
+
+\f[\texttt{dst} (x,y) =  \texttt{src} \left ( \frac{M_{11} x + M_{12} y + M_{13}}{M_{31} x + M_{32} y + M_{33}} ,
+     \frac{M_{21} x + M_{22} y + M_{23}}{M_{31} x + M_{32} y + M_{33}} \right )\f]
+
+when the flag #WARP_INVERSE_MAP is set. Otherwise, the transformation is first inverted with invert
+and then put in the formula above instead of M. The function cannot operate in-place.
+
+@param src input image.
+@param dst output image that has the size dsize and the same type as src .
+@param M \f$3\times 3\f$ transformation matrix.
+@param dsize size of the output image.
+@param flags combination of interpolation methods (#INTER_LINEAR or #INTER_NEAREST) and the
+optional flag #WARP_INVERSE_MAP, that sets M as the inverse transformation (
+\f$\texttt{dst}\rightarrow\texttt{src}\f$ ).
+@param borderMode pixel extrapolation method (#BORDER_CONSTANT or #BORDER_REPLICATE).
+@param borderValue value used in case of a constant border; by default, it equals 0.
+
+@sa  warpAffine, resize, remap, getRectSubPix, perspectiveTransform
+ */
+CV_EXPORTS_W void warpPerspective( InputArray src, OutputArray dst,
+                                   InputArray M, Size dsize,
+                                   int flags = INTER_LINEAR,
+                                   int borderMode = BORDER_CONSTANT,
+                                   const Scalar& borderValue = Scalar());
+
+/** @brief Applies a generic geometrical transformation to an image.
+
+The function remap transforms the source image using the specified map:
+
+\f[\texttt{dst} (x,y) =  \texttt{src} (map_x(x,y),map_y(x,y))\f]
+\f[\texttt{dst} (x,y) =  \texttt{src} (x+map_x(x,y),y+map_y(x,y))\f] with WARP_RELATIVE_MAP
+
+where values of pixels with non-integer coordinates are computed using one of available
+interpolation methods. \f$map_x\f$ and \f$map_y\f$ can be encoded as separate floating-point maps
+in \f$map_1\f$ and \f$map_2\f$ respectively, or interleaved floating-point maps of \f$(x,y)\f$ in
+\f$map_1\f$, or fixed-point maps created by using #convertMaps. The reason you might want to
+convert from floating to fixed-point representations of a map is that they can yield much faster
+(\~2x) remapping operations. In the converted case, \f$map_1\f$ contains pairs (cvFloor(x),
+cvFloor(y)) and \f$map_2\f$ contains indices in a table of interpolation coefficients.
+
+This function cannot operate in-place.
+
+@param src Source image.
+@param dst Destination image. It has the same size as map1 and the same type as src .
+@param map1 The first map of either (x,y) points or just x values having the type CV_16SC2 ,
+CV_32FC1, or CV_32FC2. See #convertMaps for details on converting a floating point
+representation to fixed-point for speed.
+@param map2 The second map of y values having the type CV_16UC1, CV_32FC1, or none (empty map
+if map1 is (x,y) points), respectively.
+@param interpolation Interpolation method (see #InterpolationFlags). The methods #INTER_AREA,
+#INTER_LINEAR_EXACT and #INTER_NEAREST_EXACT are not supported by this function.
+The extra flag WARP_RELATIVE_MAP can be ORed with the interpolation method
+(e.g. INTER_LINEAR | WARP_RELATIVE_MAP).
+@param borderMode Pixel extrapolation method (see #BorderTypes). When
+borderMode=#BORDER_TRANSPARENT, it means that the pixels in the destination image that
+correspond to the "outliers" in the source image are not modified by the function.
+@param borderValue Value used in case of a constant border. By default, it is 0.
+@note
+Due to current implementation limitations the size of an input and output images should be less than 32767x32767.
+ */
+CV_EXPORTS_W void remap( InputArray src, OutputArray dst,
+                         InputArray map1, InputArray map2,
+                         int interpolation, int borderMode = BORDER_CONSTANT,
+                         const Scalar& borderValue = Scalar());
+
+/** @brief Converts image transformation maps from one representation to another.
+
+The function converts a pair of maps for remap from one representation to another. The following
+options ( (map1.type(), map2.type()) \f$\rightarrow\f$ (dstmap1.type(), dstmap2.type()) ) are
+supported:
+
+- \f$\texttt{(CV_32FC1, CV_32FC1)} \rightarrow \texttt{(CV_16SC2, CV_16UC1)}\f$. This is the
+most frequently used conversion operation, in which the original floating-point maps (see #remap)
+are converted to a more compact and much faster fixed-point representation. The first output array
+contains the rounded coordinates and the second array (created only when nninterpolation=false )
+contains indices in the interpolation tables.
+
+- \f$\texttt{(CV_32FC2)} \rightarrow \texttt{(CV_16SC2, CV_16UC1)}\f$. The same as above but
+the original maps are stored in one 2-channel matrix.
+
+- Reverse conversion. Obviously, the reconstructed floating-point maps will not be exactly the same
+as the originals.
+
+@param map1 The first input map of type CV_16SC2, CV_32FC1, or CV_32FC2 .
+@param map2 The second input map of type CV_16UC1, CV_32FC1, or none (empty matrix),
+respectively.
+@param dstmap1 The first output map that has the type dstmap1type and the same size as map1.
+@param dstmap2 The second output map.
+@param dstmap1type Type of the first output map that should be CV_16SC2, CV_32FC1, or
+CV_32FC2 .
+@param nninterpolation Flag indicating whether the fixed-point maps are used for the
+nearest-neighbor or for a more complex interpolation.
+
+@sa  remap, undistort, initUndistortRectifyMap
+ */
+CV_EXPORTS_W void convertMaps( InputArray map1, InputArray map2,
+                               OutputArray dstmap1, OutputArray dstmap2,
+                               int dstmap1type, bool nninterpolation = false );
+
+/** @brief Calculates an affine matrix of 2D rotation.
+
+The function calculates the following matrix:
+
+\f[\begin{bmatrix} \alpha &  \beta & (1- \alpha )  \cdot \texttt{center.x} -  \beta \cdot \texttt{center.y} \\ - \beta &  \alpha &  \beta \cdot \texttt{center.x} + (1- \alpha )  \cdot \texttt{center.y} \end{bmatrix}\f]
+
+where
+
+\f[\begin{array}{l} \alpha =  \texttt{scale} \cdot \cos \texttt{angle} , \\ \beta =  \texttt{scale} \cdot \sin \texttt{angle} \end{array}\f]
+
+The transformation maps the rotation center to itself. If this is not the target, adjust the shift.
+
+@param center Center of the rotation in the source image.
+@param angle Rotation angle in degrees. Positive values mean counter-clockwise rotation (the
+coordinate origin is assumed to be the top-left corner).
+@param scale Isotropic scale factor.
+
+@sa  getAffineTransform, warpAffine, transform
+ */
+CV_EXPORTS_W Mat getRotationMatrix2D(Point2f center, double angle, double scale);
+
+/** @sa getRotationMatrix2D */
+CV_EXPORTS Matx23d getRotationMatrix2D_(Point2f center, double angle, double scale);
+
+inline Mat getRotationMatrix2D(Point2f center, double angle, double scale)
+{
+    const Matx23d rotation = getRotationMatrix2D_(center, angle, scale); // fixed-size 2x3 result
+    return Mat(rotation, true); // wrap as Mat; NOTE(review): 'true' is presumably copyData -- confirm
+}
+
+/** @brief Calculates an affine transform from three pairs of the corresponding points.
+
+The function calculates the \f$2 \times 3\f$ matrix of an affine transform so that:
+
+\f[\begin{bmatrix} x'_i \\ y'_i \end{bmatrix} = \texttt{map_matrix} \cdot \begin{bmatrix} x_i \\ y_i \\ 1 \end{bmatrix}\f]
+
+where
+
+\f[dst(i)=(x'_i,y'_i), src(i)=(x_i, y_i), i=0,1,2\f]
+
+@param src Coordinates of triangle vertices in the source image.
+@param dst Coordinates of the corresponding triangle vertices in the destination image.
+
+@sa  warpAffine, transform
+ */
+CV_EXPORTS Mat getAffineTransform( const Point2f src[], const Point2f dst[] );
+
+/** @brief Inverts an affine transformation.
+
+The function computes an inverse affine transformation represented by \f$2 \times 3\f$ matrix M:
+
+\f[\begin{bmatrix} a_{11} & a_{12} & b_1  \\ a_{21} & a_{22} & b_2 \end{bmatrix}\f]
+
+The result is also a \f$2 \times 3\f$ matrix of the same type as M.
+
+@param M Original affine transformation.
+@param iM Output reverse affine transformation.
+ */
+CV_EXPORTS_W void invertAffineTransform( InputArray M, OutputArray iM );
+
+/** @brief Calculates a perspective transform from four pairs of the corresponding points.
+
+The function calculates the \f$3 \times 3\f$ matrix of a perspective transform so that:
+
+\f[\begin{bmatrix} t_i x'_i \\ t_i y'_i \\ t_i \end{bmatrix} = \texttt{map_matrix} \cdot \begin{bmatrix} x_i \\ y_i \\ 1 \end{bmatrix}\f]
+
+where
+
+\f[dst(i)=(x'_i,y'_i), src(i)=(x_i, y_i), i=0,1,2,3\f]
+
+@param src Coordinates of quadrangle vertices in the source image.
+@param dst Coordinates of the corresponding quadrangle vertices in the destination image.
+@param solveMethod method passed to cv::solve (#DecompTypes)
+
+@sa  findHomography, warpPerspective, perspectiveTransform
+ */
+CV_EXPORTS_W Mat getPerspectiveTransform(InputArray src, InputArray dst, int solveMethod = DECOMP_LU);
+
+/** @overload */
+CV_EXPORTS Mat getPerspectiveTransform(const Point2f src[], const Point2f dst[], int solveMethod = DECOMP_LU);
+
+/** @overload */
+CV_EXPORTS_W Mat getAffineTransform( InputArray src, InputArray dst );
+
+/** @brief Retrieves a pixel rectangle from an image with sub-pixel accuracy.
+
+The function getRectSubPix extracts pixels from src:
+
+\f[patch(x, y) = src(x +  \texttt{center.x} - ( \texttt{dst.cols} -1)*0.5, y +  \texttt{center.y} - ( \texttt{dst.rows} -1)*0.5)\f]
+
+where the values of the pixels at non-integer coordinates are retrieved using bilinear
+interpolation. Every channel of multi-channel images is processed independently. Also
+the image should be a single channel or three channel image. While the center of the
+rectangle must be inside the image, parts of the rectangle may be outside.
+
+@param image Source image.
+@param patchSize Size of the extracted patch.
+@param center Floating point coordinates of the center of the extracted rectangle within the
+source image. The center must be inside the image.
+@param patch Extracted patch that has the size patchSize and the same number of channels as src .
+@param patchType Depth of the extracted pixels. By default, they have the same depth as src .
+
+@sa  warpAffine, warpPerspective
+ */
+CV_EXPORTS_W void getRectSubPix( InputArray image, Size patchSize,
+                                 Point2f center, OutputArray patch, int patchType = -1 );
+
+/** @example samples/cpp/polar_transforms.cpp
+An example using the cv::linearPolar and cv::logPolar operations
+*/
+
+/** @brief Remaps an image to semilog-polar coordinates space.
+
+@deprecated This function produces same result as cv::warpPolar(src, dst, src.size(), center, maxRadius, flags+WARP_POLAR_LOG);
+
+@internal
+Transform the source image using the following transformation (See @ref polar_remaps_reference_image "Polar remaps reference image d)"):
+\f[\begin{array}{l}
+  dst( \rho , \phi ) = src(x,y) \\
+  dst.size() \leftarrow src.size()
+\end{array}\f]
+
+where
+\f[\begin{array}{l}
+  I = (dx,dy) = (x - center.x,y - center.y) \\
+  \rho = M \cdot log_e(\texttt{magnitude} (I)) ,\\
+  \phi = Kangle \cdot \texttt{angle} (I) \\
+\end{array}\f]
+
+and
+\f[\begin{array}{l}
+  M = src.cols / log_e(maxRadius) \\
+  Kangle = src.rows / (2\pi) \\
+\end{array}\f]
+
+The function emulates the human "foveal" vision and can be used for fast scale and
+rotation-invariant template matching, for object tracking and so forth.
+@param src Source image
+@param dst Destination image. It will have same size and type as src.
+@param center The transformation center; where the output precision is maximal
+@param M Magnitude scale parameter. It determines the radius of the bounding circle to transform too.
+@param flags A combination of interpolation methods, see #InterpolationFlags
+
+@note
+-   The function can not operate in-place.
+-   To calculate magnitude and angle in degrees #cartToPolar is used internally thus angles are measured from 0 to 360 with accuracy about 0.3 degrees.
+
+@sa cv::linearPolar
+@endinternal
+*/
+CV_EXPORTS_W void logPolar( InputArray src, OutputArray dst,
+                            Point2f center, double M, int flags );
+
+/** @brief Remaps an image to polar coordinates space.
+
+@deprecated This function produces same result as cv::warpPolar(src, dst, src.size(), center, maxRadius, flags)
+
+@internal
+Transform the source image using the following transformation (See @ref polar_remaps_reference_image "Polar remaps reference image c)"):
+\f[\begin{array}{l}
+  dst( \rho , \phi ) = src(x,y) \\
+  dst.size() \leftarrow src.size()
+\end{array}\f]
+
+where
+\f[\begin{array}{l}
+  I = (dx,dy) = (x - center.x,y - center.y) \\
+  \rho = Kmag \cdot \texttt{magnitude} (I) ,\\
+  \phi = Kangle \cdot \texttt{angle} (I)
+\end{array}\f]
+
+and
+\f[\begin{array}{l}
+  Kmag = src.cols / maxRadius \\
+  Kangle = src.rows / (2\pi)
+\end{array}\f]
+
+
+@param src Source image
+@param dst Destination image. It will have same size and type as src.
+@param center The transformation center;
+@param maxRadius The radius of the bounding circle to transform. It determines the inverse magnitude scale parameter too.
+@param flags A combination of interpolation methods, see #InterpolationFlags
+
+@note
+-   The function can not operate in-place.
+-   To calculate magnitude and angle in degrees #cartToPolar is used internally thus angles are measured from 0 to 360 with accuracy about 0.3 degrees.
+
+@sa cv::logPolar
+@endinternal
+*/
+CV_EXPORTS_W void linearPolar( InputArray src, OutputArray dst,
+                               Point2f center, double maxRadius, int flags );
+
+
+/** \brief Remaps an image to polar or semilog-polar coordinates space
+
+@anchor polar_remaps_reference_image
+![Polar remaps reference](pics/polar_remap_doc.png)
+
+Transform the source image using the following transformation:
+\f[
+dst(\rho , \phi ) = src(x,y)
+\f]
+
+where
+\f[
+\begin{array}{l}
+\vec{I} = (x - center.x, \;y - center.y) \\
+\phi = Kangle \cdot \texttt{angle} (\vec{I}) \\
+\rho = \left\{\begin{matrix}
+Klin \cdot \texttt{magnitude} (\vec{I}) & default \\
+Klog \cdot log_e(\texttt{magnitude} (\vec{I})) & if \; semilog \\
+\end{matrix}\right.
+\end{array}
+\f]
+
+and
+\f[
+\begin{array}{l}
+Kangle = dsize.height / (2\pi) \\
+Klin = dsize.width / maxRadius \\
+Klog = dsize.width / log_e(maxRadius) \\
+\end{array}
+\f]
+
+
+\par Linear vs semilog mapping
+
+Polar mapping can be linear or semi-log. Add one of #WarpPolarMode to `flags` to specify the polar mapping mode.
+
+Linear is the default mode.
+
+The semilog mapping emulates the human "foveal" vision that permits very high acuity on the line of sight (central vision)
+in contrast to peripheral vision where acuity is minor.
+
+\par Option on `dsize`:
+
+- if both values in `dsize <=0 ` (default),
+the destination image will have (almost) same area of source bounding circle:
+\f[\begin{array}{l}
+dsize.area  \leftarrow (maxRadius^2 \cdot \pi) \\
+dsize.width = \texttt{cvRound}(maxRadius) \\
+dsize.height = \texttt{cvRound}(maxRadius \cdot \pi) \\
+\end{array}\f]
+
+
+- if only `dsize.height <= 0`,
+the destination image area will be proportional to the bounding circle area but scaled by `Klin * Klin`:
+\f[\begin{array}{l}
+dsize.height = \texttt{cvRound}(dsize.width \cdot \pi) \\
+\end{array}
+\f]
+
+- if both values in `dsize > 0 `,
+the destination image will have the given size therefore the area of the bounding circle will be scaled to `dsize`.
+
+
+\par Reverse mapping
+
+You can get reverse mapping by adding #WARP_INVERSE_MAP to `flags`
+\snippet polar_transforms.cpp InverseMap
+
+In addition, to calculate the original coordinate from a polar mapped coordinate \f$(\rho, \phi) \rightarrow (x, y)\f$:
+\snippet polar_transforms.cpp InverseCoordinate
+
+@param src Source image.
+@param dst Destination image. It will have same type as src.
+@param dsize The destination image size (see description for valid options).
+@param center The transformation center.
+@param maxRadius The radius of the bounding circle to transform. It determines the inverse magnitude scale parameter too.
+@param flags A combination of interpolation methods, #InterpolationFlags + #WarpPolarMode.
+            - Add #WARP_POLAR_LINEAR to select linear polar mapping (default)
+            - Add #WARP_POLAR_LOG to select semilog polar mapping
+            - Add #WARP_INVERSE_MAP for reverse mapping.
+@note
+-  The function can not operate in-place.
+-  To calculate magnitude and angle in degrees #cartToPolar is used internally thus angles are measured from 0 to 360 with accuracy about 0.3 degrees.
+-  This function uses #remap. Due to current implementation limitations the size of an input and output images should be less than 32767x32767.
+
+@sa cv::remap
+*/
+CV_EXPORTS_W void warpPolar(InputArray src, OutputArray dst, Size dsize,
+                            Point2f center, double maxRadius, int flags);
+
+
+//! @} imgproc_transform
+
+//! @addtogroup imgproc_misc
+//! @{
+
+/** @brief Calculates the integral of an image.
+
+The function calculates one or more integral images for the source image as follows:
+
+\f[\texttt{sum} (X,Y) =  \sum _{x<X,y<Y}  \texttt{image} (x,y)\f]
+
+\f[\texttt{sqsum} (X,Y) =  \sum _{x<X,y<Y}  \texttt{image} (x,y)^2\f]
+
+\f[\texttt{tilted} (X,Y) =  \sum _{y<Y,abs(x-X+1) \leq Y-y-1}  \texttt{image} (x,y)\f]
+
+Using these integral images, you can calculate sum, mean, and standard deviation over a specific
+up-right or rotated rectangular region of the image in a constant time, for example:
+
+\f[\sum _{x_1 \leq x < x_2,  \, y_1  \leq y < y_2}  \texttt{image} (x,y) =  \texttt{sum} (x_2,y_2)- \texttt{sum} (x_1,y_2)- \texttt{sum} (x_2,y_1)+ \texttt{sum} (x_1,y_1)\f]
+
+This makes it possible to do fast blurring or fast block correlation with a variable window size, for
+example. In case of multi-channel images, sums for each channel are accumulated independently.
+
+As a practical example, the next figure shows the calculation of the integral of a straight
+rectangle Rect(4,4,3,2) and of a tilted rectangle Rect(5,1,2,3) . The selected pixels in the
+original image are shown, as well as the relative pixels in the integral images sum and tilted .
+
+![integral calculation example](pics/integral.png)
+
+@param src input image as \f$W \times H\f$, 8-bit or floating-point (32f or 64f).
+@param sum integral image as \f$(W+1)\times (H+1)\f$ , 32-bit integer or floating-point (32f or 64f).
+@param sqsum integral image for squared pixel values; it is \f$(W+1)\times (H+1)\f$, double-precision
+floating-point (64f) array.
+@param tilted integral for the image rotated by 45 degrees; it is \f$(W+1)\times (H+1)\f$ array with
+the same data type as sum.
+@param sdepth desired depth of the integral and the tilted integral images, CV_32S, CV_32F, or
+CV_64F.
+@param sqdepth desired depth of the integral image of squared pixel values, CV_32F or CV_64F.
+ */
+CV_EXPORTS_AS(integral3) void integral( InputArray src, OutputArray sum,
+                                        OutputArray sqsum, OutputArray tilted,
+                                        int sdepth = -1, int sqdepth = -1 );
+
+/** @overload */
+CV_EXPORTS_W void integral( InputArray src, OutputArray sum, int sdepth = -1 );
+
+/** @overload */
+CV_EXPORTS_AS(integral2) void integral( InputArray src, OutputArray sum,
+                                        OutputArray sqsum, int sdepth = -1, int sqdepth = -1 );
+
+//! @} imgproc_misc
+
+//! @addtogroup imgproc_motion
+//! @{
+
+/** @brief Adds an image to the accumulator image.
+
+The function adds src or some of its elements to dst :
+
+\f[\texttt{dst} (x,y)  \leftarrow \texttt{dst} (x,y) +  \texttt{src} (x,y)  \quad \text{if} \quad \texttt{mask} (x,y)  \ne 0\f]
+
+The function supports multi-channel images. Each channel is processed independently.
+
+The function cv::accumulate can be used, for example, to collect statistics of a scene background
+viewed by a still camera and for the further foreground-background segmentation.
+
+@param src Input image of type CV_8UC(n), CV_16UC(n), CV_32FC(n) or CV_64FC(n), where n is a positive integer.
+@param dst %Accumulator image with the same number of channels as input image, and a depth of CV_32F or CV_64F.
+@param mask Optional operation mask.
+
+@sa  accumulateSquare, accumulateProduct, accumulateWeighted
+ */
+CV_EXPORTS_W void accumulate( InputArray src, InputOutputArray dst,
+                              InputArray mask = noArray() );
+
+/** @brief Adds the square of a source image to the accumulator image.
+
+The function adds the input image src or its selected region, raised to a power of 2, to the
+accumulator dst :
+
+\f[\texttt{dst} (x,y)  \leftarrow \texttt{dst} (x,y) +  \texttt{src} (x,y)^2  \quad \text{if} \quad \texttt{mask} (x,y)  \ne 0\f]
+
+The function supports multi-channel images. Each channel is processed independently.
+
+@param src Input image as 1- or 3-channel, 8-bit or 32-bit floating point.
+@param dst %Accumulator image with the same number of channels as input image, 32-bit or 64-bit
+floating-point.
+@param mask Optional operation mask.
+
+@sa  accumulate, accumulateProduct, accumulateWeighted
+ */
+CV_EXPORTS_W void accumulateSquare( InputArray src, InputOutputArray dst,
+                                    InputArray mask = noArray() );
+
+/** @brief Adds the per-element product of two input images to the accumulator image.
+
+The function adds the product of two images or their selected regions to the accumulator dst :
+
+\f[\texttt{dst} (x,y)  \leftarrow \texttt{dst} (x,y) +  \texttt{src1} (x,y)  \cdot \texttt{src2} (x,y)  \quad \text{if} \quad \texttt{mask} (x,y)  \ne 0\f]
+
+The function supports multi-channel images. Each channel is processed independently.
+
+@param src1 First input image, 1- or 3-channel, 8-bit or 32-bit floating point.
+@param src2 Second input image of the same type and the same size as src1 .
+@param dst %Accumulator image with the same number of channels as input images, 32-bit or 64-bit
+floating-point.
+@param mask Optional operation mask.
+
+@sa  accumulate, accumulateSquare, accumulateWeighted
+ */
+CV_EXPORTS_W void accumulateProduct( InputArray src1, InputArray src2,
+                                     InputOutputArray dst, InputArray mask=noArray() );
+
+/** @brief Updates a running average.
+
+The function calculates the weighted sum of the input image src and the accumulator dst so that dst
+becomes a running average of a frame sequence:
+
+\f[\texttt{dst} (x,y)  \leftarrow (1- \texttt{alpha} )  \cdot \texttt{dst} (x,y) +  \texttt{alpha} \cdot \texttt{src} (x,y)  \quad \text{if} \quad \texttt{mask} (x,y)  \ne 0\f]
+
+That is, alpha regulates the update speed (how fast the accumulator "forgets" about earlier images).
+The function supports multi-channel images. Each channel is processed independently.
+
+@param src Input image as 1- or 3-channel, 8-bit or 32-bit floating point.
+@param dst %Accumulator image with the same number of channels as input image, 32-bit or 64-bit
+floating-point.
+@param alpha Weight of the input image.
+@param mask Optional operation mask.
+
+@sa  accumulate, accumulateSquare, accumulateProduct
+ */
+CV_EXPORTS_W void accumulateWeighted( InputArray src, InputOutputArray dst,
+                                      double alpha, InputArray mask = noArray() );
+
+/** @brief The function is used to detect translational shifts that occur between two images.
+
+The operation takes advantage of the Fourier shift theorem for detecting the translational shift in
+the frequency domain. It can be used for fast image registration as well as motion estimation. For
+more information please see <http://en.wikipedia.org/wiki/Phase_correlation>
+
+Calculates the cross-power spectrum of two supplied source arrays. The arrays are padded if needed
+with getOptimalDFTSize.
+
+The function performs the following steps:
+- First it applies a Hanning window (see <http://en.wikipedia.org/wiki/Hann_function>) to each
+image to remove possible edge effects. This window is cached until the array size changes to speed
+up processing time.
+- Next it computes the forward DFTs of each source array:
+\f[\mathbf{G}_a = \mathcal{F}\{src_1\}, \; \mathbf{G}_b = \mathcal{F}\{src_2\}\f]
+where \f$\mathcal{F}\f$ is the forward DFT.
+- It then computes the cross-power spectrum of each frequency domain array:
+\f[R = \frac{ \mathbf{G}_a \mathbf{G}_b^*}{|\mathbf{G}_a \mathbf{G}_b^*|}\f]
+- Next the cross-correlation is converted back into the time domain via the inverse DFT:
+\f[r = \mathcal{F}^{-1}\{R\}\f]
+- Finally, it computes the peak location and computes a 5x5 weighted centroid around the peak to
+achieve sub-pixel accuracy.
+\f[(\Delta x, \Delta y) = \texttt{weightedCentroid} \{\arg \max_{(x, y)}\{r\}\}\f]
+- If non-zero, the response parameter is computed as the sum of the elements of r within the 5x5
+centroid around the peak location. It is normalized to a maximum of 1 (meaning there is a single
+peak) and will be smaller when there are multiple peaks.
+
+@param src1 Source floating point array (CV_32FC1 or CV_64FC1)
+@param src2 Source floating point array (CV_32FC1 or CV_64FC1)
+@param window Floating point array with windowing coefficients to reduce edge effects (optional).
+@param response Signal power within the 5x5 centroid around the peak, between 0 and 1 (optional).
+@returns detected phase shift (sub-pixel) between the two arrays.
+
+@sa dft, getOptimalDFTSize, idft, mulSpectrums, createHanningWindow
+ */
+CV_EXPORTS_W Point2d phaseCorrelate(InputArray src1, InputArray src2,
+                                    InputArray window = noArray(), CV_OUT double* response = 0);
+
+/** @brief This function computes Hanning window coefficients in two dimensions.
+
+See (http://en.wikipedia.org/wiki/Hann_function) and (http://en.wikipedia.org/wiki/Window_function)
+for more information.
+
+An example is shown below:
+@code
+    // create hanning window of size 100x100 and type CV_32F
+    Mat hann;
+    createHanningWindow(hann, Size(100, 100), CV_32F);
+@endcode
+@param dst Destination array to place Hann coefficients in
+@param winSize The window size specifications (both width and height must be > 1)
+@param type Created array type
+ */
+CV_EXPORTS_W void createHanningWindow(OutputArray dst, Size winSize, int type);
+
+/** @brief Performs the per-element division of the first Fourier spectrum by the second Fourier spectrum.
+
+The function cv::divSpectrums performs the per-element division of the first array by the second array.
+The arrays are CCS-packed or complex matrices that are results of a real or complex Fourier transform.
+
+@param a first input array.
+@param b second input array of the same size and type as `a`.
+@param c output array of the same size and type as `a`.
+@param flags operation flags; currently, the only supported flag is cv::DFT_ROWS, which indicates that
+each row of `a` and `b` is an independent 1D Fourier spectrum. If you do not want to use this flag, then simply pass `0` as the value.
+@param conjB optional flag that conjugates the second input array before the division (true)
+or not (false).
+*/
+CV_EXPORTS_W void divSpectrums(InputArray a, InputArray b, OutputArray c,
+                               int flags, bool conjB = false);
+
+//! @} imgproc_motion
+
+//! @addtogroup imgproc_misc
+//! @{
+
+/** @brief Applies a fixed-level threshold to each array element.
+
+The function applies fixed-level thresholding to a multiple-channel array. The function is typically
+used to get a bi-level (binary) image out of a grayscale image ( #compare could also be used for
+this purpose) or for removing noise, that is, filtering out pixels with too small or too large
+values. There are several types of thresholding supported by the function. They are determined by
+type parameter.
+
+Also, the special values #THRESH_OTSU or #THRESH_TRIANGLE may be combined with one of the
+above values. In these cases, the function determines the optimal threshold value using the Otsu's
+or Triangle algorithm and uses it instead of the specified thresh.
+
+@note Currently, the Otsu's and Triangle methods are implemented only for 8-bit single-channel images.
+
+@param src input array (multiple-channel, 8-bit or 32-bit floating point).
+@param dst output array of the same size  and type and the same number of channels as src.
+@param thresh threshold value.
+@param maxval maximum value to use with the #THRESH_BINARY and #THRESH_BINARY_INV thresholding
+types.
+@param type thresholding type (see #ThresholdTypes).
+@return the computed threshold value if Otsu's or Triangle method is used.
+
+@sa  adaptiveThreshold, findContours, compare, min, max
+ */
+CV_EXPORTS_W double threshold( InputArray src, OutputArray dst,
+                               double thresh, double maxval, int type );
+
+
+/** @brief Applies an adaptive threshold to an array.
+
+The function transforms a grayscale image to a binary image according to the formulae:
+-   **THRESH_BINARY**
+    \f[dst(x,y) =  \fork{\texttt{maxValue}}{if \(src(x,y) > T(x,y)\)}{0}{otherwise}\f]
+-   **THRESH_BINARY_INV**
+    \f[dst(x,y) =  \fork{0}{if \(src(x,y) > T(x,y)\)}{\texttt{maxValue}}{otherwise}\f]
+where \f$T(x,y)\f$ is a threshold calculated individually for each pixel (see adaptiveMethod parameter).
+
+The function can process the image in-place.
+
+@param src Source 8-bit single-channel image.
+@param dst Destination image of the same size and the same type as src.
+@param maxValue Non-zero value assigned to the pixels for which the condition is satisfied
+@param adaptiveMethod Adaptive thresholding algorithm to use, see #AdaptiveThresholdTypes.
+The #BORDER_REPLICATE | #BORDER_ISOLATED is used to process boundaries.
+@param thresholdType Thresholding type that must be either #THRESH_BINARY or #THRESH_BINARY_INV,
+see #ThresholdTypes.
+@param blockSize Size of a pixel neighborhood that is used to calculate a threshold value for the
+pixel: 3, 5, 7, and so on.
+@param C Constant subtracted from the mean or weighted mean (see the details below). Normally, it
+is positive but may be zero or negative as well.
+
+@sa  threshold, blur, GaussianBlur
+ */
+CV_EXPORTS_W void adaptiveThreshold( InputArray src, OutputArray dst,
+                                     double maxValue, int adaptiveMethod,
+                                     int thresholdType, int blockSize, double C );
+
+//! @} imgproc_misc
+
+//! @addtogroup imgproc_filter
+//! @{
+
+/** @example samples/cpp/tutorial_code/ImgProc/Pyramids/Pyramids.cpp
+An example using pyrDown and pyrUp functions
+*/
+
+/** @brief Blurs an image and downsamples it.
+
+By default, size of the output image is computed as `Size((src.cols+1)/2, (src.rows+1)/2)`, but in
+any case, the following conditions should be satisfied:
+
+\f[\begin{array}{l} | \texttt{dstsize.width} *2-src.cols| \leq 2 \\ | \texttt{dstsize.height} *2-src.rows| \leq 2 \end{array}\f]
+
+The function performs the downsampling step of the Gaussian pyramid construction. First, it
+convolves the source image with the kernel:
+
+\f[\frac{1}{256} \begin{bmatrix} 1 & 4 & 6 & 4 & 1  \\ 4 & 16 & 24 & 16 & 4  \\ 6 & 24 & 36 & 24 & 6  \\ 4 & 16 & 24 & 16 & 4  \\ 1 & 4 & 6 & 4 & 1 \end{bmatrix}\f]
+
+Then, it downsamples the image by rejecting even rows and columns.
+
+@param src input image.
+@param dst output image; it has the specified size and the same type as src.
+@param dstsize size of the output image.
+@param borderType Pixel extrapolation method, see #BorderTypes (#BORDER_CONSTANT isn't supported)
+ */
+CV_EXPORTS_W void pyrDown( InputArray src, OutputArray dst,
+                           const Size& dstsize = Size(), int borderType = BORDER_DEFAULT );
+
+/** @brief Upsamples an image and then blurs it.
+
+By default, size of the output image is computed as `Size(src.cols\*2, src.rows\*2)`, but in any
+case, the following conditions should be satisfied:
+
+\f[\begin{array}{l} | \texttt{dstsize.width} -src.cols*2| \leq  ( \texttt{dstsize.width}   \mod  2)  \\ | \texttt{dstsize.height} -src.rows*2| \leq  ( \texttt{dstsize.height}   \mod  2) \end{array}\f]
+
+The function performs the upsampling step of the Gaussian pyramid construction, though it can
+actually be used to construct the Laplacian pyramid. First, it upsamples the source image by
+injecting even zero rows and columns and then convolves the result with the same kernel as in
+pyrDown multiplied by 4.
+
+@param src input image.
+@param dst output image. It has the specified size and the same type as src .
+@param dstsize size of the output image.
+@param borderType Pixel extrapolation method, see #BorderTypes (only #BORDER_DEFAULT is supported)
+ */
+CV_EXPORTS_W void pyrUp( InputArray src, OutputArray dst,
+                         const Size& dstsize = Size(), int borderType = BORDER_DEFAULT );
+
+/** @brief Constructs the Gaussian pyramid for an image.
+
+The function constructs a vector of images and builds the Gaussian pyramid by recursively applying
+pyrDown to the previously built pyramid layers, starting from `dst[0]==src`.
+
+@param src Source image. Check pyrDown for the list of supported types.
+@param dst Destination vector of maxlevel+1 images of the same type as src. dst[0] will be the
+same as src. dst[1] is the next pyramid layer, a smoothed and down-sized src, and so on.
+@param maxlevel 0-based index of the last (the smallest) pyramid layer. It must be non-negative.
+@param borderType Pixel extrapolation method, see #BorderTypes (#BORDER_CONSTANT isn't supported)
+ */
+CV_EXPORTS void buildPyramid( InputArray src, OutputArrayOfArrays dst,
+                              int maxlevel, int borderType = BORDER_DEFAULT );
+
+//! @} imgproc_filter
+
+//! @addtogroup imgproc_hist
+//! @{
+
+/** @example samples/cpp/demhist.cpp
+An example for creating histograms of an image
+*/
+
+/** @brief Calculates a histogram of a set of arrays.
+
+The function cv::calcHist calculates the histogram of one or more arrays. The elements of a tuple used
+to increment a histogram bin are taken from the corresponding input arrays at the same location. The
+sample below shows how to compute a 2D Hue-Saturation histogram for a color image:
+@include snippets/imgproc_calcHist.cpp
+
+@param images Source arrays. They all should have the same depth, CV_8U, CV_16U or CV_32F , and the same
+size. Each of them can have an arbitrary number of channels.
+@param nimages Number of source images.
+@param channels List of the dims channels used to compute the histogram. The first array channels
+are numbered from 0 to images[0].channels()-1 , the second array channels are counted from
+images[0].channels() to images[0].channels() + images[1].channels()-1, and so on.
+@param mask Optional mask. If the matrix is not empty, it must be an 8-bit array of the same size
+as images[i] . The non-zero mask elements mark the array elements counted in the histogram.
+@param hist Output histogram, which is a dense or sparse dims -dimensional array.
+@param dims Histogram dimensionality that must be positive and not greater than CV_MAX_DIMS
+(equal to 32 in the current OpenCV version).
+@param histSize Array of histogram sizes in each dimension.
+@param ranges Array of the dims arrays of the histogram bin boundaries in each dimension. When the
+histogram is uniform ( uniform =true), then for each dimension i it is enough to specify the lower
+(inclusive) boundary \f$L_0\f$ of the 0-th histogram bin and the upper (exclusive) boundary
+\f$U_{\texttt{histSize}[i]-1}\f$ for the last histogram bin histSize[i]-1 . That is, in case of a
+uniform histogram each of ranges[i] is an array of 2 elements. When the histogram is not uniform (
+uniform=false ), then each of ranges[i] contains histSize[i]+1 elements:
+\f$L_0, U_0=L_1, U_1=L_2, ..., U_{\texttt{histSize[i]}-2}=L_{\texttt{histSize[i]}-1}, U_{\texttt{histSize[i]}-1}\f$
+. The array elements, that are not between \f$L_0\f$ and \f$U_{\texttt{histSize[i]}-1}\f$ , are not
+counted in the histogram.
+@param uniform Flag indicating whether the histogram is uniform or not (see above).
+@param accumulate Accumulation flag. If it is set, the histogram is not cleared in the beginning
+when it is allocated. This feature enables you to compute a single histogram from several sets of
+arrays, or to update the histogram in time.
+*/
+CV_EXPORTS void calcHist( const Mat* images, int nimages,
+                          const int* channels, InputArray mask,
+                          OutputArray hist, int dims, const int* histSize,
+                          const float** ranges, bool uniform = true, bool accumulate = false );
+
+/** @overload
+
+this variant uses %SparseMat for output
+*/
+CV_EXPORTS void calcHist( const Mat* images, int nimages,
+                          const int* channels, InputArray mask,
+                          SparseMat& hist, int dims,
+                          const int* histSize, const float** ranges,
+                          bool uniform = true, bool accumulate = false );
+
+/** @overload
+
+this variant supports only uniform histograms.
+
+ranges argument is either empty vector or a flattened vector of histSize.size()*2 elements
+(histSize.size() element pairs). The first and second elements of each pair specify the lower and
+upper boundaries.
+*/
+CV_EXPORTS_W void calcHist( InputArrayOfArrays images,
+                            const std::vector<int>& channels,
+                            InputArray mask, OutputArray hist,
+                            const std::vector<int>& histSize,
+                            const std::vector<float>& ranges,
+                            bool accumulate = false );
+
+/** @brief Calculates the back projection of a histogram.
+
+The function cv::calcBackProject calculates the back projection of the histogram. That is, similarly to
+#calcHist , at each location (x, y) the function collects the values from the selected channels
+in the input images and finds the corresponding histogram bin. But instead of incrementing it, the
+function reads the bin value, scales it by scale , and stores in backProject(x,y) . In terms of
+statistics, the function computes the probability of each element value with respect to the empirical
+probability distribution represented by the histogram. See how, for example, you can find and track
+a bright-colored object in a scene:
+
+- Before tracking, show the object to the camera so that it covers almost the whole frame.
+Calculate a hue histogram. The histogram may have strong maximums, corresponding to the dominant
+colors in the object.
+
+- When tracking, calculate a back projection of a hue plane of each input video frame using that
+pre-computed histogram. Threshold the back projection to suppress weak colors. It may also make
+sense to suppress pixels with insufficient color saturation and too dark or too bright pixels.
+
+- Find connected components in the resulting picture and choose, for example, the largest
+component.
+
+This is an approximate algorithm of the CamShift color object tracker.
+
+@param images Source arrays. They all should have the same depth, CV_8U, CV_16U or CV_32F , and the same
+size. Each of them can have an arbitrary number of channels.
+@param nimages Number of source images.
+@param channels The list of channels used to compute the back projection. The number of channels
+must match the histogram dimensionality. The first array channels are numbered from 0 to
+images[0].channels()-1 , the second array channels are counted from images[0].channels() to
+images[0].channels() + images[1].channels()-1, and so on.
+@param hist Input histogram that can be dense or sparse.
+@param backProject Destination back projection array that is a single-channel array of the same
+size and depth as images[0] .
+@param ranges Array of arrays of the histogram bin boundaries in each dimension. See #calcHist .
+@param scale Optional scale factor for the output back projection.
+@param uniform Flag indicating whether the histogram is uniform or not (see #calcHist).
+
+@sa calcHist, compareHist
+ */
+CV_EXPORTS void calcBackProject( const Mat* images, int nimages,
+                                 const int* channels, InputArray hist,
+                                 OutputArray backProject, const float** ranges,
+                                 double scale = 1, bool uniform = true );
+
+/** @overload */
+CV_EXPORTS void calcBackProject( const Mat* images, int nimages,
+                                 const int* channels, const SparseMat& hist,
+                                 OutputArray backProject, const float** ranges,
+                                 double scale = 1, bool uniform = true );
+
+/** @overload */
+CV_EXPORTS_W void calcBackProject( InputArrayOfArrays images, const std::vector<int>& channels,
+                                   InputArray hist, OutputArray dst,
+                                   const std::vector<float>& ranges,
+                                   double scale );
+
+/** @brief Compares two histograms.
+
+The function cv::compareHist compares two dense or two sparse histograms using the specified method.
+
+The function returns \f$d(H_1, H_2)\f$ .
+
+While the function works well with 1-, 2-, 3-dimensional dense histograms, it may not be suitable
+for high-dimensional sparse histograms. In such histograms, because of aliasing and sampling
+problems, the coordinates of non-zero histogram bins can slightly shift. To compare such histograms
+or more general sparse configurations of weighted points, consider using the #EMD function.
+
+@param H1 First compared histogram.
+@param H2 Second compared histogram of the same size as H1 .
+@param method Comparison method, see #HistCompMethods
+ */
+CV_EXPORTS_W double compareHist( InputArray H1, InputArray H2, int method );
+
+/** @overload */
+CV_EXPORTS double compareHist( const SparseMat& H1, const SparseMat& H2, int method );
+
+/** @brief Equalizes the histogram of a grayscale image.
+
+The function equalizes the histogram of the input image using the following algorithm:
+
+- Calculate the histogram \f$H\f$ for src .
+- Normalize the histogram so that the sum of histogram bins is 255.
+- Compute the integral of the histogram:
+\f[H'_i =  \sum _{0  \le j < i} H(j)\f]
+- Transform the image using \f$H'\f$ as a look-up table: \f$\texttt{dst}(x,y) = H'(\texttt{src}(x,y))\f$
+
+The algorithm normalizes the brightness and increases the contrast of the image.
+
+@param src Source 8-bit single channel image.
+@param dst Destination image of the same size and type as src .
+ */
+CV_EXPORTS_W void equalizeHist( InputArray src, OutputArray dst );
+
+/** @brief Creates a smart pointer to a cv::CLAHE class and initializes it.
+
+@param clipLimit Threshold for contrast limiting.
+@param tileGridSize Size of grid for histogram equalization. Input image will be divided into
+equally sized rectangular tiles. tileGridSize defines the number of tiles in row and column.
+ */
+CV_EXPORTS_W Ptr<CLAHE> createCLAHE(double clipLimit = 40.0, Size tileGridSize = Size(8, 8));
+
+/** @brief Computes the "minimal work" distance between two weighted point configurations.
+
+The function computes the earth mover distance and/or a lower boundary of the distance between the
+two weighted point configurations. One of the applications described in @cite RubnerSept98,
+@cite Rubner2000 is multi-dimensional histogram comparison for image retrieval. EMD is a transportation
+problem that is solved using some modification of a simplex algorithm, thus the complexity is
+exponential in the worst case, though, on average it is much faster. In the case of a real metric
+the lower boundary can be calculated even faster (using linear-time algorithm) and it can be used
+to determine roughly whether the two signatures are far enough so that they cannot relate to the
+same object.
+
+@param signature1 First signature, a \f$\texttt{size1}\times (\texttt{dims}+1)\f$ floating-point matrix.
+Each row stores the point weight followed by the point coordinates. The matrix is allowed to have
+a single column (weights only) if the user-defined cost matrix is used. The weights must be
+non-negative and have at least one non-zero value.
+@param signature2 Second signature of the same format as signature1 , though the number of rows
+may be different. The total weights may be different. In this case an extra "dummy" point is added
+to either signature1 or signature2. The weights must be non-negative and have at least one non-zero
+value.
+@param distType Used metric. See #DistanceTypes.
+@param cost User-defined \f$\texttt{size1}\times \texttt{size2}\f$ cost matrix. Also, if a cost matrix
+is used, lower boundary lowerBound cannot be calculated because it needs a metric function.
+@param lowerBound Optional input/output parameter: lower boundary of a distance between the two
+signatures that is a distance between mass centers. The lower boundary may not be calculated if
+the user-defined cost matrix is used, the total weights of point configurations are not equal, or
+if the signatures consist of weights only (the signature matrices have a single column). You
+**must** initialize \*lowerBound . If the calculated distance between mass centers is greater or
+equal to \*lowerBound (it means that the signatures are far enough), the function does not
+calculate EMD. In any case \*lowerBound is set to the calculated distance between mass centers on
+return. Thus, if you want to calculate both distance between mass centers and EMD, \*lowerBound
+should be set to 0.
+@param flow Resultant \f$\texttt{size1} \times \texttt{size2}\f$ flow matrix: \f$\texttt{flow}_{i,j}\f$ is
+a flow from \f$i\f$ -th point of signature1 to \f$j\f$ -th point of signature2 .
+ */
+CV_EXPORTS float EMD( InputArray signature1, InputArray signature2,
+                      int distType, InputArray cost=noArray(),
+                      float* lowerBound = 0, OutputArray flow = noArray() );
+
+CV_EXPORTS_AS(EMD) float wrapperEMD( InputArray signature1, InputArray signature2,
+                      int distType, InputArray cost=noArray(),
+                      CV_IN_OUT Ptr<float> lowerBound = Ptr<float>(), OutputArray flow = noArray() );
+
+//! @} imgproc_hist
+
+//! @addtogroup imgproc_segmentation
+//! @{
+
+/** @example samples/cpp/watershed.cpp
+An example using the watershed algorithm
+*/
+
+/** @brief Performs a marker-based image segmentation using the watershed algorithm.
+
+The function implements one of the variants of watershed, non-parametric marker-based segmentation
+algorithm, described in @cite Meyer92 .
+
+Before passing the image to the function, you have to roughly outline the desired regions in the
+image markers with positive (\>0) indices. So, every region is represented as one or more connected
+components with the pixel values 1, 2, 3, and so on. Such markers can be retrieved from a binary
+mask using #findContours and #drawContours (see the watershed.cpp demo). The markers are "seeds" of
+the future image regions. All the other pixels in markers , whose relation to the outlined regions
+is not known and should be defined by the algorithm, should be set to 0's. In the function output,
+each pixel in markers is set to a value of the "seed" components or to -1 at boundaries between the
+regions.
+
+@note Any two neighbor connected components are not necessarily separated by a watershed boundary
+(-1's pixels); for example, they can touch each other in the initial marker image passed to the
+function.
+
+@param image Input 8-bit 3-channel image.
+@param markers Input/output 32-bit single-channel image (map) of markers. It should have the same
+size as image .
+
+@sa findContours
+ */
+CV_EXPORTS_W void watershed( InputArray image, InputOutputArray markers );
+
+//! @} imgproc_segmentation
+
+//! @addtogroup imgproc_filter
+//! @{
+
+/** @brief Performs initial step of meanshift segmentation of an image.
+
+The function implements the filtering stage of meanshift segmentation, that is, the output of the
+function is the filtered "posterized" image with color gradients and fine-grain texture flattened.
+At every pixel (X,Y) of the input image (or down-sized input image, see below) the function executes
+meanshift iterations, that is, the pixel (X,Y) neighborhood in the joint space-color hyperspace is
+considered:
+
+\f[(x,y): X- \texttt{sp} \le x  \le X+ \texttt{sp} , Y- \texttt{sp} \le y  \le Y+ \texttt{sp} , ||(R,G,B)-(r,g,b)||   \le \texttt{sr}\f]
+
+where (R,G,B) and (r,g,b) are the vectors of color components at (X,Y) and (x,y), respectively
+(though, the algorithm does not depend on the color space used, so any 3-component color space can
+be used instead). Over the neighborhood the average spatial value (X',Y') and average color vector
+(R',G',B') are found and they act as the neighborhood center on the next iteration:
+
+\f[(X,Y)~(X',Y'), (R,G,B)~(R',G',B').\f]
+
+After the iterations are over, the color components of the initial pixel (that is, the pixel from where
+the iterations started) are set to the final value (average color at the last iteration):
+
+\f[I(X,Y) <- (R*,G*,B*)\f]
+
+When maxLevel \> 0, the gaussian pyramid of maxLevel+1 levels is built, and the above procedure is
+run on the smallest layer first. After that, the results are propagated to the larger layer and the
+iterations are run again only on those pixels where the layer colors differ by more than sr from the
+lower-resolution layer of the pyramid. That makes boundaries of color regions sharper. Note that the
+results will be actually different from the ones obtained by running the meanshift procedure on the
+whole original image (i.e. when maxLevel==0).
+
+@param src The source 8-bit, 3-channel image.
+@param dst The destination image of the same format and the same size as the source.
+@param sp The spatial window radius.
+@param sr The color window radius.
+@param maxLevel Maximum level of the pyramid for the segmentation.
+@param termcrit Termination criteria: when to stop meanshift iterations.
+ */
+CV_EXPORTS_W void pyrMeanShiftFiltering( InputArray src, OutputArray dst,
+                                         double sp, double sr, int maxLevel = 1,
+                                         TermCriteria termcrit=TermCriteria(TermCriteria::MAX_ITER+TermCriteria::EPS,5,1) );
+
+//! @}
+
+//! @addtogroup imgproc_segmentation
+//! @{
+
+/** @example samples/cpp/grabcut.cpp
+An example using the GrabCut algorithm
+![Sample Screenshot](grabcut_output1.jpg)
+*/
+
+/** @brief Runs the GrabCut algorithm.
+
+The function implements the [GrabCut image segmentation algorithm](http://en.wikipedia.org/wiki/GrabCut).
+
+@param img Input 8-bit 3-channel image.
+@param mask Input/output 8-bit single-channel mask. The mask is initialized by the function when
+mode is set to #GC_INIT_WITH_RECT. Its elements may have one of the #GrabCutClasses.
+@param rect ROI containing a segmented object. The pixels outside of the ROI are marked as
+"obvious background". The parameter is only used when mode==#GC_INIT_WITH_RECT .
+@param bgdModel Temporary array for the background model. Do not modify it while you are
+processing the same image.
+@param fgdModel Temporary array for the foreground model. Do not modify it while you are
+processing the same image.
+@param iterCount Number of iterations the algorithm should make before returning the result. Note
+that the result can be refined with further calls with mode==#GC_INIT_WITH_MASK or
+mode==#GC_EVAL .
+@param mode Operation mode that could be one of the #GrabCutModes
+ */
+CV_EXPORTS_W void grabCut( InputArray img, InputOutputArray mask, Rect rect,
+                           InputOutputArray bgdModel, InputOutputArray fgdModel,
+                           int iterCount, int mode = GC_EVAL );
+
+//! @} imgproc_segmentation
+
+//! @addtogroup imgproc_misc
+//! @{
+
+/** @example samples/cpp/distrans.cpp
+An example on using the distance transform
+*/
+
+/** @brief Calculates the distance to the closest zero pixel for each pixel of the source image.
+
+The function cv::distanceTransform calculates the approximate or precise distance from every binary
+image pixel to the nearest zero pixel. For zero image pixels, the distance will obviously be zero.
+
+When maskSize == #DIST_MASK_PRECISE and distanceType == #DIST_L2 , the function runs the
+algorithm described in @cite Felzenszwalb04 . This algorithm is parallelized with the TBB library.
+
+In other cases, the algorithm @cite Borgefors86 is used. This means that for a pixel the function
+finds the shortest path to the nearest zero pixel consisting of basic shifts: horizontal, vertical,
+diagonal, or knight's move (the last is available for a \f$5\times 5\f$ mask). The overall
+distance is calculated as a sum of these basic distances. Since the distance function should be
+symmetric, all of the horizontal and vertical shifts must have the same cost (denoted as `a`), all
+the diagonal shifts must have the same cost (denoted as `b`), and all knight's moves must have the
+same cost (denoted as `c`). For the #DIST_C and #DIST_L1 types, the distance is calculated
+precisely, whereas for #DIST_L2 (Euclidean distance) the distance can be calculated only with a
+relative error (a \f$5\times 5\f$ mask gives more accurate results). For `a`,`b`, and `c`, OpenCV
+uses the values suggested in the original paper:
+- DIST_L1: `a = 1, b = 2`
+- DIST_L2:
+    - `3 x 3`: `a=0.955, b=1.3693`
+    - `5 x 5`: `a=1, b=1.4, c=2.1969`
+- DIST_C: `a = 1, b = 1`
+
+Typically, for a fast, coarse distance estimation #DIST_L2, a \f$3\times 3\f$ mask is used. For a
+more accurate distance estimation #DIST_L2, a \f$5\times 5\f$ mask or the precise algorithm is used.
+Note that both the precise and the approximate algorithms are linear in the number of pixels.
+
+This variant of the function does not only compute the minimum distance for each pixel \f$(x, y)\f$
+but also identifies the nearest connected component consisting of zero pixels
+(labelType==#DIST_LABEL_CCOMP) or the nearest zero pixel (labelType==#DIST_LABEL_PIXEL). Index of the
+component/pixel is stored in `labels(x, y)`. When labelType==#DIST_LABEL_CCOMP, the function
+automatically finds connected components of zero pixels in the input image and marks them with
+distinct labels. When labelType==#DIST_LABEL_PIXEL, the function scans through the input image and
+marks all the zero pixels with distinct labels.
+
+In this mode, the complexity is still linear. That is, the function provides a very fast way to
+compute the Voronoi diagram for a binary image. Currently, the second variant can use only the
+approximate distance transform algorithm, i.e. maskSize=#DIST_MASK_PRECISE is not supported
+yet.
+
+@param src 8-bit, single-channel (binary) source image.
+@param dst Output image with calculated distances. It is an 8-bit or 32-bit floating-point,
+single-channel image of the same size as src.
+@param labels Output 2D array of labels (the discrete Voronoi diagram). It has the type
+CV_32SC1 and the same size as src.
+@param distanceType Type of distance, see #DistanceTypes
+@param maskSize Size of the distance transform mask, see #DistanceTransformMasks.
+#DIST_MASK_PRECISE is not supported by this variant. In case of the #DIST_L1 or #DIST_C distance type,
+the parameter is forced to 3 because a \f$3\times 3\f$ mask gives the same result as \f$5\times
+5\f$ or any larger aperture.
+@param labelType Type of the label array to build, see #DistanceTransformLabelTypes.
+ */
+CV_EXPORTS_AS(distanceTransformWithLabels) void distanceTransform( InputArray src, OutputArray dst,
+                                     OutputArray labels, int distanceType, int maskSize,
+                                     int labelType = DIST_LABEL_CCOMP );
+
+/** @overload
+@param src 8-bit, single-channel (binary) source image.
+@param dst Output image with calculated distances. It is an 8-bit or 32-bit floating-point,
+single-channel image of the same size as src .
+@param distanceType Type of distance, see #DistanceTypes
+@param maskSize Size of the distance transform mask, see #DistanceTransformMasks. In case of the
+#DIST_L1 or #DIST_C distance type, the parameter is forced to 3 because a \f$3\times 3\f$ mask gives
+the same result as \f$5\times 5\f$ or any larger aperture.
+@param dstType Type of output image. It can be CV_8U or CV_32F. Type CV_8U can be used only for
+the first variant of the function and distanceType == #DIST_L1.
+*/
+CV_EXPORTS_W void distanceTransform( InputArray src, OutputArray dst,
+                                     int distanceType, int maskSize, int dstType=CV_32F);
+
+/** @brief Fills a connected component with the given color.
+
+The function cv::floodFill fills a connected component starting from the seed point with the specified
+color. The connectivity is determined by the color/brightness closeness of the neighbor pixels. The
+pixel at \f$(x,y)\f$ is considered to belong to the repainted domain if:
+
+- in case of a grayscale image and floating range
+\f[\texttt{src} (x',y')- \texttt{loDiff} \leq \texttt{src} (x,y)  \leq \texttt{src} (x',y')+ \texttt{upDiff}\f]
+
+
+- in case of a grayscale image and fixed range
+\f[\texttt{src} ( \texttt{seedPoint} .x, \texttt{seedPoint} .y)- \texttt{loDiff} \leq \texttt{src} (x,y)  \leq \texttt{src} ( \texttt{seedPoint} .x, \texttt{seedPoint} .y)+ \texttt{upDiff}\f]
+
+
+- in case of a color image and floating range
+\f[\texttt{src} (x',y')_r- \texttt{loDiff} _r \leq \texttt{src} (x,y)_r \leq \texttt{src} (x',y')_r+ \texttt{upDiff} _r,\f]
+\f[\texttt{src} (x',y')_g- \texttt{loDiff} _g \leq \texttt{src} (x,y)_g \leq \texttt{src} (x',y')_g+ \texttt{upDiff} _g\f]
+and
+\f[\texttt{src} (x',y')_b- \texttt{loDiff} _b \leq \texttt{src} (x,y)_b \leq \texttt{src} (x',y')_b+ \texttt{upDiff} _b\f]
+
+
+- in case of a color image and fixed range
+\f[\texttt{src} ( \texttt{seedPoint} .x, \texttt{seedPoint} .y)_r- \texttt{loDiff} _r \leq \texttt{src} (x,y)_r \leq \texttt{src} ( \texttt{seedPoint} .x, \texttt{seedPoint} .y)_r+ \texttt{upDiff} _r,\f]
+\f[\texttt{src} ( \texttt{seedPoint} .x, \texttt{seedPoint} .y)_g- \texttt{loDiff} _g \leq \texttt{src} (x,y)_g \leq \texttt{src} ( \texttt{seedPoint} .x, \texttt{seedPoint} .y)_g+ \texttt{upDiff} _g\f]
+and
+\f[\texttt{src} ( \texttt{seedPoint} .x, \texttt{seedPoint} .y)_b- \texttt{loDiff} _b \leq \texttt{src} (x,y)_b \leq \texttt{src} ( \texttt{seedPoint} .x, \texttt{seedPoint} .y)_b+ \texttt{upDiff} _b\f]
+
+
+where \f$src(x',y')\f$ is the value of one of pixel neighbors that is already known to belong to the
+component. That is, to be added to the connected component, a color/brightness of the pixel should
+be close enough to:
+- Color/brightness of one of its neighbors that already belong to the connected component in case
+of a floating range.
+- Color/brightness of the seed point in case of a fixed range.
+
+Use these functions to either mark a connected component with the specified color in-place, or build
+a mask and then extract the contour, or copy the region to another image, and so on.
+
+@param image Input/output 1- or 3-channel, 8-bit, or floating-point image. It is modified by the
+function unless the #FLOODFILL_MASK_ONLY flag is set in the second variant of the function. See
+the details below.
+@param mask Operation mask that should be a single-channel 8-bit image, 2 pixels wider and 2 pixels
+taller than image. If an empty Mat is passed it will be created automatically. Since this is both an
+input and output parameter, you must take responsibility of initializing it.
+Flood-filling cannot go across non-zero pixels in the input mask. For example,
+an edge detector output can be used as a mask to stop filling at edges. On output, pixels in the
+mask corresponding to filled pixels in the image are set to 1 or to the specified value in flags
+as described below. Additionally, the function fills the border of the mask with ones to simplify
+internal processing. It is therefore possible to use the same mask in multiple calls to the function
+to make sure the filled areas do not overlap.
+@param seedPoint Starting point.
+@param newVal New value of the repainted domain pixels.
+@param loDiff Maximal lower brightness/color difference between the currently observed pixel and
+one of its neighbors belonging to the component, or a seed pixel being added to the component.
+@param upDiff Maximal upper brightness/color difference between the currently observed pixel and
+one of its neighbors belonging to the component, or a seed pixel being added to the component.
+@param rect Optional output parameter set by the function to the minimum bounding rectangle of the
+repainted domain.
+@param flags Operation flags. The first 8 bits contain a connectivity value. The default value of
+4 means that only the four nearest neighbor pixels (those that share an edge) are considered. A
+connectivity value of 8 means that the eight nearest neighbor pixels (those that share a corner)
+will be considered. The next 8 bits (8-15) contain a value between 1 and 255 with which to fill
+the mask (the default value is 1). For example, 4 | ( 255 \<\< 8 ) will consider 4 nearest
+neighbours and fill the mask with a value of 255. The following additional options occupy higher
+bits and therefore may be further combined with the connectivity and mask fill values using
+bit-wise or (|), see #FloodFillFlags.
+
+@note Since the mask is larger than the filled image, a pixel \f$(x, y)\f$ in image corresponds to the
+pixel \f$(x+1, y+1)\f$ in the mask .
+
+@sa findContours
+ */
+CV_EXPORTS_W int floodFill( InputOutputArray image, InputOutputArray mask,
+                            Point seedPoint, Scalar newVal, CV_OUT Rect* rect=0,
+                            Scalar loDiff = Scalar(), Scalar upDiff = Scalar(),
+                            int flags = 4 );
+
+/** @example samples/cpp/ffilldemo.cpp
+An example using the FloodFill technique
+*/
+
+/** @overload
+
+variant without `mask` parameter
+*/
+CV_EXPORTS int floodFill( InputOutputArray image,
+                          Point seedPoint, Scalar newVal, CV_OUT Rect* rect = 0,
+                          Scalar loDiff = Scalar(), Scalar upDiff = Scalar(),
+                          int flags = 4 );
+
+//! Performs linear blending of two images:
+//! \f[ \texttt{dst}(i,j) = \texttt{weights1}(i,j)*\texttt{src1}(i,j) + \texttt{weights2}(i,j)*\texttt{src2}(i,j) \f]
+//! @param src1 It has a type of CV_8UC(n) or CV_32FC(n), where n is a positive integer.
+//! @param src2 It has the same type and size as src1.
+//! @param weights1 It has a type of CV_32FC1 and the same size with src1.
+//! @param weights2 It has a type of CV_32FC1 and the same size with src1.
+//! @param dst It is created if it does not have the same size and type with src1.
+CV_EXPORTS_W void blendLinear(InputArray src1, InputArray src2, InputArray weights1, InputArray weights2, OutputArray dst);
+
+//! @} imgproc_misc
+
+//! @addtogroup imgproc_color_conversions
+//! @{
+
+/** @brief Converts an image from one color space to another.
+
+The function converts an input image from one color space to another. In case of a transformation
+to-from RGB color space, the order of the channels should be specified explicitly (RGB or BGR). Note
+that the default color format in OpenCV is often referred to as RGB but it is actually BGR (the
+bytes are reversed). So the first byte in a standard (24-bit) color image will be an 8-bit Blue
+component, the second byte will be Green, and the third byte will be Red. The fourth, fifth, and
+sixth bytes would then be the second pixel (Blue, then Green, then Red), and so on.
+
+The conventional ranges for R, G, and B channel values are:
+-   0 to 255 for CV_8U images
+-   0 to 65535 for CV_16U images
+-   0 to 1 for CV_32F images
+
+In case of linear transformations, the range does not matter. But in case of a non-linear
+transformation, an input RGB image should be normalized to the proper value range to get the correct
+results, for example, for RGB \f$\rightarrow\f$ L\*u\*v\* transformation. For example, if you have a
+32-bit floating-point image directly converted from an 8-bit image without any scaling, then it will
+have the 0..255 value range instead of 0..1 assumed by the function. So, before calling #cvtColor ,
+you need first to scale the image down:
+@code
+    img *= 1./255;
+    cvtColor(img, img, COLOR_BGR2Luv);
+@endcode
+If you use #cvtColor with 8-bit images, the conversion will have some information lost. For many
+applications, this will not be noticeable but it is recommended to use 32-bit images in applications
+that need the full range of colors or that convert an image before an operation and then convert
+back.
+
+If conversion adds the alpha channel, its value will be set to the maximum of the corresponding channel
+range: 255 for CV_8U, 65535 for CV_16U, 1 for CV_32F.
+
+@param src input image: 8-bit unsigned, 16-bit unsigned ( CV_16UC... ), or single-precision
+floating-point.
+@param dst output image of the same size and depth as src.
+@param code color space conversion code (see #ColorConversionCodes).
+@param dstCn number of channels in the destination image; if the parameter is 0, the number of the
+channels is derived automatically from src and code.
+
+@see @ref imgproc_color_conversions
+ */
+CV_EXPORTS_W void cvtColor( InputArray src, OutputArray dst, int code, int dstCn = 0 );
+
+/** @brief Converts an image from one color space to another where the source image is
+stored in two planes.
+
+This function only supports YUV420 to RGB conversion as of now.
+
+@param src1 8-bit image (#CV_8U) of the Y plane.
+@param src2 image containing interleaved U/V plane.
+@param dst output image.
+@param code Specifies the type of conversion. It can take any of the following values:
+- #COLOR_YUV2BGR_NV12
+- #COLOR_YUV2RGB_NV12
+- #COLOR_YUV2BGRA_NV12
+- #COLOR_YUV2RGBA_NV12
+- #COLOR_YUV2BGR_NV21
+- #COLOR_YUV2RGB_NV21
+- #COLOR_YUV2BGRA_NV21
+- #COLOR_YUV2RGBA_NV21
+*/
+CV_EXPORTS_W void cvtColorTwoPlane( InputArray src1, InputArray src2, OutputArray dst, int code );
+
+/** @brief main function for all demosaicing processes
+
+@param src input image: 8-bit unsigned or 16-bit unsigned.
+@param dst output image of the same size and depth as src.
+@param code Color space conversion code (see the description below).
+@param dstCn number of channels in the destination image; if the parameter is 0, the number of the
+channels is derived automatically from src and code.
+
+The function can do the following transformations:
+
+-   Demosaicing using bilinear interpolation
+
+    #COLOR_BayerBG2BGR , #COLOR_BayerGB2BGR , #COLOR_BayerRG2BGR , #COLOR_BayerGR2BGR
+
+    #COLOR_BayerBG2GRAY , #COLOR_BayerGB2GRAY , #COLOR_BayerRG2GRAY , #COLOR_BayerGR2GRAY
+
+-   Demosaicing using Variable Number of Gradients.
+
+    #COLOR_BayerBG2BGR_VNG , #COLOR_BayerGB2BGR_VNG , #COLOR_BayerRG2BGR_VNG , #COLOR_BayerGR2BGR_VNG
+
+-   Edge-Aware Demosaicing.
+
+    #COLOR_BayerBG2BGR_EA , #COLOR_BayerGB2BGR_EA , #COLOR_BayerRG2BGR_EA , #COLOR_BayerGR2BGR_EA
+
+-   Demosaicing with alpha channel
+
+    #COLOR_BayerBG2BGRA , #COLOR_BayerGB2BGRA , #COLOR_BayerRG2BGRA , #COLOR_BayerGR2BGRA
+
+@sa cvtColor
+*/
+CV_EXPORTS_W void demosaicing(InputArray src, OutputArray dst, int code, int dstCn = 0);
+
+//! @} imgproc_color_conversions
+
+//! @addtogroup imgproc_shape
+//! @{
+
+/** @brief Calculates all of the moments up to the third order of a polygon or rasterized shape.
+
+The function computes moments, up to the 3rd order, of a vector shape or a rasterized shape. The
+results are returned in the structure cv::Moments.
+
+@param array Single-channel raster image (CV_8U, CV_16U, CV_16S, CV_32F, CV_64F) or an array (
+\f$1 \times N\f$ or \f$N \times 1\f$ ) of 2D points (Point or Point2f).
+@param binaryImage If it is true, all non-zero image pixels are treated as 1's. The parameter is
+used for images only.
+@returns The computed moments, as a cv::Moments structure.
+
+@note Only applicable to contour moments calculations from Python bindings: Note that the numpy
+type for the input array should be either np.int32 or np.float32.
+
+@sa  contourArea, arcLength
+ */
+CV_EXPORTS_W Moments moments( InputArray array, bool binaryImage = false );
+
+/** @brief Calculates seven Hu invariants.
+
+The function calculates seven Hu invariants (introduced in @cite Hu62; see also
+<http://en.wikipedia.org/wiki/Image_moment>) defined as:
+
+\f[\begin{array}{l} hu[0]= \eta _{20}+ \eta _{02} \\ hu[1]=( \eta _{20}- \eta _{02})^{2}+4 \eta _{11}^{2} \\ hu[2]=( \eta _{30}-3 \eta _{12})^{2}+ (3 \eta _{21}- \eta _{03})^{2} \\ hu[3]=( \eta _{30}+ \eta _{12})^{2}+ ( \eta _{21}+ \eta _{03})^{2} \\ hu[4]=( \eta _{30}-3 \eta _{12})( \eta _{30}+ \eta _{12})[( \eta _{30}+ \eta _{12})^{2}-3( \eta _{21}+ \eta _{03})^{2}]+(3 \eta _{21}- \eta _{03})( \eta _{21}+ \eta _{03})[3( \eta _{30}+ \eta _{12})^{2}-( \eta _{21}+ \eta _{03})^{2}] \\ hu[5]=( \eta _{20}- \eta _{02})[( \eta _{30}+ \eta _{12})^{2}- ( \eta _{21}+ \eta _{03})^{2}]+4 \eta _{11}( \eta _{30}+ \eta _{12})( \eta _{21}+ \eta _{03}) \\ hu[6]=(3 \eta _{21}- \eta _{03})( \eta _{30}+ \eta _{12})[( \eta _{30}+ \eta _{12})^{2}-3( \eta _{21}+ \eta _{03})^{2}]-( \eta _{30}-3 \eta _{12})( \eta _{21}+ \eta _{03})[3( \eta _{30}+ \eta _{12})^{2}-( \eta _{21}+ \eta _{03})^{2}] \\ \end{array}\f]
+
+where \f$\eta_{ji}\f$ stands for \f$\texttt{Moments::nu}_{ji}\f$ .
+
+These values are proved to be invariants to the image scale, rotation, and reflection except the
+seventh one, whose sign is changed by reflection. This invariance is proved with the assumption of
+infinite image resolution. In case of raster images, the computed Hu invariants for the original and
+transformed images are a bit different.
+
+@param moments Input moments computed with moments .
+@param hu Output Hu invariants.
+
+@sa matchShapes
+ */
+CV_EXPORTS void HuMoments( const Moments& moments, double hu[7] );
+
+/** @overload */
+CV_EXPORTS_W void HuMoments( const Moments& m, OutputArray hu );
+
+//! @} imgproc_shape
+
+//! @addtogroup imgproc_object
+//! @{
+
+//! Comparison methods for template matching (the `method` argument of #matchTemplate)
+enum TemplateMatchModes {
+    TM_SQDIFF        = 0, /*!< \f[R(x,y)= \sum _{x',y'} (T(x',y')-I(x+x',y+y'))^2\f]
+                               with mask:
+                               \f[R(x,y)= \sum _{x',y'} \left( (T(x',y')-I(x+x',y+y')) \cdot
+                                  M(x',y') \right)^2\f] */
+    TM_SQDIFF_NORMED = 1, /*!< \f[R(x,y)= \frac{\sum_{x',y'} (T(x',y')-I(x+x',y+y'))^2}{\sqrt{\sum_{
+                                  x',y'}T(x',y')^2 \cdot \sum_{x',y'} I(x+x',y+y')^2}}\f]
+                               with mask:
+                               \f[R(x,y)= \frac{\sum _{x',y'} \left( (T(x',y')-I(x+x',y+y')) \cdot
+                                  M(x',y') \right)^2}{\sqrt{\sum_{x',y'} \left( T(x',y') \cdot
+                                  M(x',y') \right)^2 \cdot \sum_{x',y'} \left( I(x+x',y+y') \cdot
+                                  M(x',y') \right)^2}}\f] */
+    TM_CCORR         = 2, /*!< \f[R(x,y)= \sum _{x',y'} (T(x',y') \cdot I(x+x',y+y'))\f]
+                               with mask:
+                               \f[R(x,y)= \sum _{x',y'} (T(x',y') \cdot I(x+x',y+y') \cdot M(x',y')
+                                  ^2)\f] */
+    TM_CCORR_NORMED  = 3, /*!< \f[R(x,y)= \frac{\sum_{x',y'} (T(x',y') \cdot I(x+x',y+y'))}{\sqrt{
+                                  \sum_{x',y'}T(x',y')^2 \cdot \sum_{x',y'} I(x+x',y+y')^2}}\f]
+                               with mask:
+                               \f[R(x,y)= \frac{\sum_{x',y'} (T(x',y') \cdot I(x+x',y+y') \cdot
+                                  M(x',y')^2)}{\sqrt{\sum_{x',y'} \left( T(x',y') \cdot M(x',y')
+                                  \right)^2 \cdot \sum_{x',y'} \left( I(x+x',y+y') \cdot M(x',y')
+                                  \right)^2}}\f] */
+    TM_CCOEFF        = 4, /*!< \f[R(x,y)= \sum _{x',y'} (T'(x',y') \cdot I'(x+x',y+y'))\f]
+                               where
+                               \f[\begin{array}{l} T'(x',y')=T(x',y') - 1/(w \cdot h) \cdot \sum _{
+                                  x'',y''} T(x'',y'') \\ I'(x+x',y+y')=I(x+x',y+y') - 1/(w \cdot h)
+                                  \cdot \sum _{x'',y''} I(x+x'',y+y'') \end{array}\f]
+                               with mask:
+                               \f[\begin{array}{l} T'(x',y')=M(x',y') \cdot \left( T(x',y') -
+                                  \frac{1}{\sum _{x'',y''} M(x'',y'')} \cdot \sum _{x'',y''}
+                                  (T(x'',y'') \cdot M(x'',y'')) \right) \\ I'(x+x',y+y')=M(x',y')
+                                  \cdot \left( I(x+x',y+y') - \frac{1}{\sum _{x'',y''} M(x'',y'')}
+                                  \cdot \sum _{x'',y''} (I(x+x'',y+y'') \cdot M(x'',y'')) \right)
+                                  \end{array} \f] */
+    TM_CCOEFF_NORMED = 5  /*!< \f[R(x,y)= \frac{ \sum_{x',y'} (T'(x',y') \cdot I'(x+x',y+y')) }{
+                                  \sqrt{\sum_{x',y'}T'(x',y')^2 \cdot \sum_{x',y'} I'(x+x',y+y')^2}
+                                  }\f] */
+};
+
+/** @example samples/cpp/tutorial_code/Histograms_Matching/MatchTemplate_Demo.cpp
+An example using Template Matching algorithm
+*/
+
+/** @brief Compares a template against overlapped image regions.
+
+The function slides through image , compares the overlapped patches of size \f$w \times h\f$ against
+templ using the specified method and stores the comparison results in result . #TemplateMatchModes
+describes the formulae for the available comparison methods ( \f$I\f$ denotes image, \f$T\f$
+template, \f$R\f$ result, \f$M\f$ the optional mask ). The summation is done over template and/or
+the image patch: \f$x' = 0...w-1, y' = 0...h-1\f$
+
+After the function finishes the comparison, the best matches can be found as global minimums (when
+#TM_SQDIFF was used) or maximums (when #TM_CCORR or #TM_CCOEFF was used) using the
+#minMaxLoc function. In case of a color image, template summation in the numerator and each sum in
+the denominator is done over all of the channels and separate mean values are used for each channel.
+That is, the function can take a color template and a color image. The result will still be a
+single-channel image, which is easier to analyze.
+
+@param image Image where the search is running. It must be 8-bit or 32-bit floating-point.
+@param templ Searched template. It must be not greater than the source image and have the same
+data type.
+@param result Map of comparison results. It must be single-channel 32-bit floating-point. If image
+is \f$W \times H\f$ and templ is \f$w \times h\f$ , then result is \f$(W-w+1) \times (H-h+1)\f$ .
+@param method Parameter specifying the comparison method, see #TemplateMatchModes
+@param mask Optional mask. It must have the same size as templ. It must either have the same number
+            of channels as template or only one channel, which is then used for all template and
+            image channels. If the data type is #CV_8U, the mask is interpreted as a binary mask,
+            meaning only elements where mask is nonzero are used and are kept unchanged independent
+            of the actual mask value (weight equals 1). For data type #CV_32F, the mask values are
+            used as weights. The exact formulas are documented in #TemplateMatchModes.
+ */
+CV_EXPORTS_W void matchTemplate( InputArray image, InputArray templ,
+                                 OutputArray result, int method, InputArray mask = noArray() );
+
+//! @}
+
+//! @addtogroup imgproc_shape
+//! @{
+
+/** @example samples/cpp/connected_components.cpp
+This program demonstrates connected components and use of the trackbar
+*/
+
+/** @brief computes the connected components labeled image of boolean image
+
+image with 4 or 8 way connectivity - returns N, the total number of labels [0, N-1] where 0
+represents the background label. ltype specifies the output label image type, an important
+consideration based on the total number of labels or alternatively the total number of pixels in
+the source image. ccltype specifies the connected components labeling algorithm to use, currently
+Bolelli (Spaghetti) @cite Bolelli2019, Grana (BBDT) @cite Grana2010 and Wu's (SAUF) @cite Wu2009 algorithms
+are supported, see the #ConnectedComponentsAlgorithmsTypes for details. Note that SAUF algorithm forces
+a row major ordering of labels while Spaghetti and BBDT do not.
+This function uses parallel version of the algorithms if at least one allowed
+parallel framework is enabled and if the rows of the image are at least twice the number returned by #getNumberOfCPUs.
+
+@param image the 8-bit single-channel image to be labeled
+@param labels destination labeled image
+@param connectivity 8 or 4 for 8-way or 4-way connectivity respectively
+@param ltype output image label type. Currently CV_32S and CV_16U are supported.
+@param ccltype connected components algorithm type (see the #ConnectedComponentsAlgorithmsTypes).
+*/
+CV_EXPORTS_AS(connectedComponentsWithAlgorithm) int connectedComponents(InputArray image, OutputArray labels,
+                                                                        int connectivity, int ltype, int ccltype);
+
+
+/** @overload
+
+@param image the 8-bit single-channel image to be labeled
+@param labels destination labeled image
+@param connectivity 8 or 4 for 8-way or 4-way connectivity respectively
+@param ltype output image label type. Currently CV_32S and CV_16U are supported.
+*/
+CV_EXPORTS_W int connectedComponents(InputArray image, OutputArray labels,
+                                     int connectivity = 8, int ltype = CV_32S);
+
+
+/** @brief computes the connected components labeled image of boolean image and also produces a statistics output for each label
+
+image with 4 or 8 way connectivity - returns N, the total number of labels [0, N-1] where 0
+represents the background label. ltype specifies the output label image type, an important
+consideration based on the total number of labels or alternatively the total number of pixels in
+the source image. ccltype specifies the connected components labeling algorithm to use, currently
+Bolelli (Spaghetti) @cite Bolelli2019, Grana (BBDT) @cite Grana2010 and Wu's (SAUF) @cite Wu2009 algorithms
+are supported, see the #ConnectedComponentsAlgorithmsTypes for details. Note that SAUF algorithm forces
+a row major ordering of labels while Spaghetti and BBDT do not.
+This function uses parallel version of the algorithms (statistics included) if at least one allowed
+parallel framework is enabled and if the rows of the image are at least twice the number returned by #getNumberOfCPUs.
+
+@param image the 8-bit single-channel image to be labeled
+@param labels destination labeled image
+@param stats statistics output for each label, including the background label.
+Statistics are accessed via stats(label, COLUMN) where COLUMN is one of
+#ConnectedComponentsTypes, selecting the statistic. The data type is CV_32S.
+@param centroids centroid output for each label, including the background label. Centroids are
+accessed via centroids(label, 0) for x and centroids(label, 1) for y. The data type is CV_64F.
+@param connectivity 8 or 4 for 8-way or 4-way connectivity respectively
+@param ltype output image label type. Currently CV_32S and CV_16U are supported.
+@param ccltype connected components algorithm type (see #ConnectedComponentsAlgorithmsTypes).
+*/
+CV_EXPORTS_AS(connectedComponentsWithStatsWithAlgorithm) int connectedComponentsWithStats(InputArray image, OutputArray labels,
+                                                                                          OutputArray stats, OutputArray centroids,
+                                                                                          int connectivity, int ltype, int ccltype);
+
+/** @overload
+@param image the 8-bit single-channel image to be labeled
+@param labels destination labeled image
+@param stats statistics output for each label, including the background label.
+Statistics are accessed via stats(label, COLUMN) where COLUMN is one of
+#ConnectedComponentsTypes, selecting the statistic. The data type is CV_32S.
+@param centroids centroid output for each label, including the background label. Centroids are
+accessed via centroids(label, 0) for x and centroids(label, 1) for y. The data type is CV_64F.
+@param connectivity 8 or 4 for 8-way or 4-way connectivity respectively
+@param ltype output image label type. Currently CV_32S and CV_16U are supported.
+*/
+CV_EXPORTS_W int connectedComponentsWithStats(InputArray image, OutputArray labels,
+                                              OutputArray stats, OutputArray centroids,
+                                              int connectivity = 8, int ltype = CV_32S);
+
+
+/** @brief Finds contours in a binary image.
+
+The function retrieves contours from the binary image using the algorithm @cite Suzuki85 . The contours
+are a useful tool for shape analysis and object detection and recognition. See squares.cpp in the
+OpenCV sample directory.
+@note Since opencv 3.2 source image is not modified by this function.
+
+@param image Source, an 8-bit single-channel image. Non-zero pixels are treated as 1's. Zero
+pixels remain 0's, so the image is treated as binary . You can use #compare, #inRange, #threshold ,
+#adaptiveThreshold, #Canny, and others to create a binary image out of a grayscale or color one.
+If mode equals to #RETR_CCOMP or #RETR_FLOODFILL, the input can also be a 32-bit integer image of labels (CV_32SC1).
+@param contours Detected contours. Each contour is stored as a vector of points (e.g.
+std::vector<std::vector<cv::Point> >).
+@param hierarchy Optional output vector (e.g. std::vector<cv::Vec4i>), containing information about the image topology. It has
+as many elements as the number of contours. For each i-th contour contours[i], the elements
+hierarchy[i][0] , hierarchy[i][1] , hierarchy[i][2] , and hierarchy[i][3] are set to 0-based indices
+in contours of the next and previous contours at the same hierarchical level, the first child
+contour and the parent contour, respectively. If for the contour i there are no next, previous,
+parent, or nested contours, the corresponding elements of hierarchy[i] will be negative.
+@note In Python, hierarchy is nested inside a top level array. Use hierarchy[0][i] to access hierarchical elements of i-th contour.
+@param mode Contour retrieval mode, see #RetrievalModes
+@param method Contour approximation method, see #ContourApproximationModes
+@param offset Optional offset by which every contour point is shifted. This is useful if the
+contours are extracted from the image ROI and then they should be analyzed in the whole image
+context.
+ */
+CV_EXPORTS_W void findContours( InputArray image, OutputArrayOfArrays contours,
+                              OutputArray hierarchy, int mode,
+                              int method, Point offset = Point());
+
+/** @overload */
+CV_EXPORTS void findContours( InputArray image, OutputArrayOfArrays contours,
+                              int mode, int method, Point offset = Point());
+
+//! @brief Find contours using link runs algorithm
+//!
+//! This function implements an algorithm different from cv::findContours:
+//! - doesn't allocate temporary image internally, thus it has reduced memory consumption
+//! - supports CV_8UC1 images only
+//! - outputs 2-level hierarchy only (RETR_CCOMP mode)
+//! - doesn't support approximation change other than CHAIN_APPROX_SIMPLE
+//! In all other aspects this function is compatible with cv::findContours.
+CV_EXPORTS_W void findContoursLinkRuns(InputArray image, OutputArrayOfArrays contours, OutputArray hierarchy);
+
+//! @overload
+CV_EXPORTS_W void findContoursLinkRuns(InputArray image, OutputArrayOfArrays contours);
+
+/** @brief Approximates a polygonal curve(s) with the specified precision.
+
+The function cv::approxPolyDP approximates a curve or a polygon with another curve/polygon with less
+vertices so that the distance between them is less or equal to the specified precision. It uses the
+Douglas-Peucker algorithm <http://en.wikipedia.org/wiki/Ramer-Douglas-Peucker_algorithm>
+
+@param curve Input vector of a 2D point stored in std::vector or Mat
+@param approxCurve Result of the approximation. The type should match the type of the input curve.
+@param epsilon Parameter specifying the approximation accuracy. This is the maximum distance
+between the original curve and its approximation.
+@param closed If true, the approximated curve is closed (its first and last vertices are
+connected). Otherwise, it is not closed.
+ */
+CV_EXPORTS_W void approxPolyDP( InputArray curve,
+                                OutputArray approxCurve,
+                                double epsilon, bool closed );
+
+/** @brief Calculates a contour perimeter or a curve length.
+
+The function computes a curve length or a closed contour perimeter.
+
+@param curve Input vector of 2D points, stored in std::vector or Mat.
+@param closed Flag indicating whether the curve is closed or not.
+ */
+CV_EXPORTS_W double arcLength( InputArray curve, bool closed );
+
+/** @brief Calculates the up-right bounding rectangle of a point set or non-zero pixels of gray-scale image.
+
+The function calculates and returns the minimal up-right bounding rectangle for the specified point set or
+non-zero pixels of gray-scale image.
+
+@param array Input gray-scale image or 2D point set, stored in std::vector or Mat.
+ */
+CV_EXPORTS_W Rect boundingRect( InputArray array );
+
+/** @brief Calculates a contour area.
+
+The function computes a contour area. Similarly to moments , the area is computed using the Green
+formula. Thus, the returned area and the number of non-zero pixels, if you draw the contour using
+#drawContours or #fillPoly , can be different. Also, the function will most certainly give wrong
+results for contours with self-intersections.
+
+Example:
+@code
+    vector<Point> contour;
+    contour.push_back(Point2f(0, 0));
+    contour.push_back(Point2f(10, 0));
+    contour.push_back(Point2f(10, 10));
+    contour.push_back(Point2f(5, 4));
+
+    double area0 = contourArea(contour);
+    vector<Point> approx;
+    approxPolyDP(contour, approx, 5, true);
+    double area1 = contourArea(approx);
+
+    cout << "area0 =" << area0 << endl <<
+            "area1 =" << area1 << endl <<
+            "approx poly vertices" << approx.size() << endl;
+@endcode
+@param contour Input vector of 2D points (contour vertices), stored in std::vector or Mat.
+@param oriented Oriented area flag. If it is true, the function returns a signed area value,
+depending on the contour orientation (clockwise or counter-clockwise). Using this feature you can
+determine orientation of a contour by taking the sign of an area. By default, the parameter is
+false, which means that the absolute value is returned.
+ */
+CV_EXPORTS_W double contourArea( InputArray contour, bool oriented = false );
+
+/** @brief Finds a rotated rectangle of the minimum area enclosing the input 2D point set.
+
+The function calculates and returns the minimum-area bounding rectangle (possibly rotated) for a
+specified point set. Developer should keep in mind that the returned RotatedRect can contain negative
+indices when data is close to the containing Mat element boundary.
+
+@param points Input vector of 2D points, stored in std::vector\<\> or Mat
+ */
+CV_EXPORTS_W RotatedRect minAreaRect( InputArray points );
+
+/** @brief Finds the four vertices of a rotated rect. Useful to draw the rotated rectangle.
+
+The function finds the four vertices of a rotated rectangle. This function is useful to draw the
+rectangle. In C++, instead of using this function, you can directly use RotatedRect::points method. Please
+visit the @ref tutorial_bounding_rotated_ellipses "tutorial on Creating Bounding rotated boxes and ellipses for contours" for more information.
+
+@param box The input rotated rectangle. It may be the output of @ref minAreaRect.
+@param points The output array of four vertices of rectangles.
+ */
+CV_EXPORTS_W void boxPoints(RotatedRect box, OutputArray points);
+
+/** @brief Finds a circle of the minimum area enclosing a 2D point set.
+
+The function finds the minimal enclosing circle of a 2D point set using an iterative algorithm.
+
+@param points Input vector of 2D points, stored in std::vector\<\> or Mat
+@param center Output center of the circle.
+@param radius Output radius of the circle.
+ */
+CV_EXPORTS_W void minEnclosingCircle( InputArray points,
+                                      CV_OUT Point2f& center, CV_OUT float& radius );
+
+/** @example samples/cpp/minarea.cpp
+*/
+
+/** @brief Finds a triangle of minimum area enclosing a 2D point set and returns its area.
+
+The function finds a triangle of minimum area enclosing the given set of 2D points and returns its
+area. The output for a given 2D point set is shown in the image below. 2D points are depicted in
+*red* and the enclosing triangle in *yellow*.
+
+![Sample output of the minimum enclosing triangle function](pics/minenclosingtriangle.png)
+
+The implementation of the algorithm is based on O'Rourke's @cite ORourke86 and Klee and Laskowski's
+@cite KleeLaskowski85 papers. O'Rourke provides a \f$\Theta(n)\f$ algorithm for finding the minimal
+enclosing triangle of a 2D convex polygon with n vertices. Since the #minEnclosingTriangle function
+takes a 2D point set as input an additional preprocessing step of computing the convex hull of the
+2D point set is required. The complexity of the #convexHull function is \f$O(n \log(n))\f$ which is higher
+than \f$\Theta(n)\f$. Thus the overall complexity of the function is \f$O(n \log(n))\f$.
+
+@param points Input vector of 2D points with depth CV_32S or CV_32F, stored in std::vector\<\> or Mat
+@param triangle Output vector of three 2D points defining the vertices of the triangle. The depth
+of the OutputArray must be CV_32F.
+ */
+CV_EXPORTS_W double minEnclosingTriangle( InputArray points, CV_OUT OutputArray triangle );
+
+/** @brief Compares two shapes.
+
+The function compares two shapes. All three implemented methods use the Hu invariants (see #HuMoments)
+
+@param contour1 First contour or grayscale image.
+@param contour2 Second contour or grayscale image.
+@param method Comparison method, see #ShapeMatchModes
+@param parameter Method-specific parameter (not supported now).
+ */
+CV_EXPORTS_W double matchShapes( InputArray contour1, InputArray contour2,
+                                 int method, double parameter );
+
+/** @example samples/cpp/convexhull.cpp
+An example using the convexHull functionality
+*/
+
+/** @brief Finds the convex hull of a point set.
+
+The function cv::convexHull finds the convex hull of a 2D point set using Sklansky's algorithm @cite Sklansky82
+that has *O(N logN)* complexity in the current implementation.
+
+@param points Input 2D point set, stored in std::vector or Mat.
+@param hull Output convex hull. It is either an integer vector of indices or vector of points. In
+the first case, the hull elements are 0-based indices of the convex hull points in the original
+array (since the set of convex hull points is a subset of the original point set). In the second
+case, hull elements are the convex hull points themselves.
+@param clockwise Orientation flag. If it is true, the output convex hull is oriented clockwise.
+Otherwise, it is oriented counter-clockwise. The assumed coordinate system has its X axis pointing
+to the right, and its Y axis pointing upwards.
+@param returnPoints Operation flag. In case of a matrix, when the flag is true, the function
+returns convex hull points. Otherwise, it returns indices of the convex hull points. When the
+output array is std::vector, the flag is ignored, and the output depends on the type of the
+vector: std::vector\<int\> implies returnPoints=false, std::vector\<Point\> implies
+returnPoints=true.
+
+@note `points` and `hull` should be different arrays, inplace processing isn't supported.
+
+Check @ref tutorial_hull "the corresponding tutorial" for more details.
+
+useful links:
+
+https://www.learnopencv.com/convex-hull-using-opencv-in-python-and-c/
+ */
+CV_EXPORTS_W void convexHull( InputArray points, OutputArray hull,
+                              bool clockwise = false, bool returnPoints = true );
+
+/** @brief Finds the convexity defects of a contour.
+
+The figure below displays convexity defects of a hand contour:
+
+![image](pics/defects.png)
+
+@param contour Input contour.
+@param convexhull Convex hull obtained using convexHull that should contain indices of the contour
+points that make the hull.
+@param convexityDefects The output vector of convexity defects. In C++ and the new Python/Java
+interface each convexity defect is represented as 4-element integer vector (a.k.a. #Vec4i):
+(start_index, end_index, farthest_pt_index, fixpt_depth), where indices are 0-based indices
+in the original contour of the convexity defect beginning, end and the farthest point, and
+fixpt_depth is fixed-point approximation (with 8 fractional bits) of the distance between the
+farthest contour point and the hull. That is, the floating-point value of the depth is
+fixpt_depth/256.0.
+ */
+CV_EXPORTS_W void convexityDefects( InputArray contour, InputArray convexhull, OutputArray convexityDefects );
+
+/** @brief Tests a contour convexity.
+
+The function tests whether the input contour is convex or not. The contour must be simple, that is,
+without self-intersections. Otherwise, the function output is undefined.
+
+@param contour Input vector of 2D points, stored in std::vector\<\> or Mat
+ */
+CV_EXPORTS_W bool isContourConvex( InputArray contour );
+
+/** @example samples/cpp/intersectExample.cpp
+Examples of how intersectConvexConvex works
+*/
+
+/** @brief Finds intersection of two convex polygons
+
+@param p1 First polygon
+@param p2 Second polygon
+@param p12 Output polygon describing the intersecting area
+@param handleNested When true, an intersection is found if one of the polygons is fully enclosed in the other.
+When false, no intersection is found. If the polygons share a side or the vertex of one polygon lies on an edge
+of the other, they are not considered nested and an intersection will be found regardless of the value of handleNested.
+
+@returns Absolute value of area of intersecting polygon
+
+@note intersectConvexConvex doesn't confirm that both polygons are convex and will return invalid results if they aren't.
+ */
+CV_EXPORTS_W float intersectConvexConvex( InputArray p1, InputArray p2,
+                                          OutputArray p12, bool handleNested = true );
+
+/** @example samples/cpp/fitellipse.cpp
+An example using the fitEllipse technique
+*/
+
+/** @brief Fits an ellipse around a set of 2D points.
+
+The function calculates the ellipse that fits (in a least-squares sense) a set of 2D points best of
+all. It returns the rotated rectangle in which the ellipse is inscribed. The first algorithm described by @cite Fitzgibbon95
+is used. Developer should keep in mind that it is possible that the returned
+ellipse/rotatedRect data contains negative indices, due to the data points being close to the
+border of the containing Mat element.
+
+@param points Input 2D point set, stored in std::vector\<\> or Mat
+ */
+CV_EXPORTS_W RotatedRect fitEllipse( InputArray points );
+
+/** @brief Fits an ellipse around a set of 2D points.
+
+ The function calculates the ellipse that fits a set of 2D points.
+ It returns the rotated rectangle in which the ellipse is inscribed.
+ The Approximate Mean Square (AMS) proposed by @cite Taubin1991 is used.
+
+ For an ellipse, this basis set is \f$ \chi= \left(x^2, x y, y^2, x, y, 1\right) \f$,
+ which is a set of six free coefficients \f$ A^T=\left\{A_{\text{xx}},A_{\text{xy}},A_{\text{yy}},A_x,A_y,A_0\right\} \f$.
+ However, to specify an ellipse, all that is needed is five numbers; the major and minor axes lengths \f$ (a,b) \f$,
+ the position \f$ (x_0,y_0) \f$, and the orientation \f$ \theta \f$. This is because the basis set includes lines,
+ quadratics, parabolic and hyperbolic functions as well as elliptical functions as possible fits.
+ If the fit is found to be a parabolic or hyperbolic function then the standard #fitEllipse method is used.
+ The AMS method restricts the fit to parabolic, hyperbolic and elliptical curves
+ by imposing the condition that \f$ A^T ( D_x^T D_x  +   D_y^T D_y) A = 1 \f$ where
+ the matrices \f$ D_x \f$ and \f$ D_y \f$ are the partial derivatives of the design matrix \f$ D \f$ with
+ respect to x and y. The matrices are formed row by row applying the following to
+ each of the points in the set:
+ \f{align*}{
+ D(i,:)&=\left\{x_i^2, x_i y_i, y_i^2, x_i, y_i, 1\right\} &
+ D_x(i,:)&=\left\{2 x_i,y_i,0,1,0,0\right\} &
+ D_y(i,:)&=\left\{0,x_i,2 y_i,0,1,0\right\}
+ \f}
+ The AMS method minimizes the cost function
+ \f{equation*}{
+ \epsilon ^2=\frac{ A^T D^T D A }{ A^T (D_x^T D_x +  D_y^T D_y) A }
+ \f}
+
+ The minimum cost is found by solving the generalized eigenvalue problem.
+
+ \f{equation*}{
+ D^T D A = \lambda  \left( D_x^T D_x +  D_y^T D_y\right) A
+ \f}
+
+ @param points Input 2D point set, stored in std::vector\<\> or Mat
+ */
+CV_EXPORTS_W RotatedRect fitEllipseAMS( InputArray points );
+
+
+/** @brief Fits an ellipse around a set of 2D points.
+
+ The function calculates the ellipse that fits a set of 2D points.
+ It returns the rotated rectangle in which the ellipse is inscribed.
+ The Direct least square (Direct) method by @cite Fitzgibbon1999 is used.
+
+ For an ellipse, this basis set is \f$ \chi= \left(x^2, x y, y^2, x, y, 1\right) \f$,
+ which is a set of six free coefficients \f$ A^T=\left\{A_{\text{xx}},A_{\text{xy}},A_{\text{yy}},A_x,A_y,A_0\right\} \f$.
+ However, to specify an ellipse, all that is needed is five numbers; the major and minor axes lengths \f$ (a,b) \f$,
+ the position \f$ (x_0,y_0) \f$, and the orientation \f$ \theta \f$. This is because the basis set includes lines,
+ quadratics, parabolic and hyperbolic functions as well as elliptical functions as possible fits.
+ The Direct method confines the fit to ellipses by ensuring that \f$ 4 A_{xx} A_{yy}- A_{xy}^2 > 0 \f$.
+ The condition imposed is that \f$ 4 A_{xx} A_{yy}- A_{xy}^2=1 \f$ which satisfies the inequality
+ and as the coefficients can be arbitrarily scaled is not overly restrictive.
+
+ \f{equation*}{
+ \epsilon ^2= A^T D^T D A \quad \text{with} \quad A^T C A =1 \quad \text{and} \quad C=\left(\begin{matrix}
+ 0 & 0  & 2  & 0  & 0  &  0  \\
+ 0 & -1  & 0  & 0  & 0  &  0 \\
+ 2 & 0  & 0  & 0  & 0  &  0 \\
+ 0 & 0  & 0  & 0  & 0  &  0 \\
+ 0 & 0  & 0  & 0  & 0  &  0 \\
+ 0 & 0  & 0  & 0  & 0  &  0
+ \end{matrix} \right)
+ \f}
+
+ The minimum cost is found by solving the generalized eigenvalue problem.
+
+ \f{equation*}{
+ D^T D A = \lambda  \left( C\right) A
+ \f}
+
+ The system produces only one positive eigenvalue \f$ \lambda\f$ which is chosen as the solution
+ with its eigenvector \f$\mathbf{u}\f$. These are used to find the coefficients
+
+ \f{equation*}{
+ A = \sqrt{\frac{1}{\mathbf{u}^T C \mathbf{u}}}  \mathbf{u}
+ \f}
+ The scaling factor guarantees that  \f$A^T C A =1\f$.
+
+ @param points Input 2D point set, stored in std::vector\<\> or Mat
+ */
+CV_EXPORTS_W RotatedRect fitEllipseDirect( InputArray points );
+
+/** @brief Fits a line to a 2D or 3D point set.
+
+The function fitLine fits a line to a 2D or 3D point set by minimizing \f$\sum_i \rho(r_i)\f$ where
+\f$r_i\f$ is the distance between the \f$i^{th}\f$ point and the line, and \f$\rho(r)\f$ is a distance function, one
+of the following:
+-  DIST_L2
+\f[\rho (r) = r^2/2  \quad \text{(the simplest and the fastest least-squares method)}\f]
+- DIST_L1
+\f[\rho (r) = r\f]
+- DIST_L12
+\f[\rho (r) = 2  \cdot ( \sqrt{1 + \frac{r^2}{2}} - 1)\f]
+- DIST_FAIR
+\f[\rho \left (r \right ) = C^2  \cdot \left (  \frac{r}{C} -  \log{\left(1 + \frac{r}{C}\right)} \right )  \quad \text{where} \quad C=1.3998\f]
+- DIST_WELSCH
+\f[\rho \left (r \right ) =  \frac{C^2}{2} \cdot \left ( 1 -  \exp{\left(-\left(\frac{r}{C}\right)^2\right)} \right )  \quad \text{where} \quad C=2.9846\f]
+- DIST_HUBER
+\f[\rho (r) =  \fork{r^2/2}{if \(r < C\)}{C \cdot (r-C/2)}{otherwise} \quad \text{where} \quad C=1.345\f]
+
+The algorithm is based on the M-estimator ( <http://en.wikipedia.org/wiki/M-estimator> ) technique
+that iteratively fits the line using the weighted least-squares algorithm. After each iteration the
+weights \f$w_i\f$ are adjusted to be inversely proportional to \f$\rho(r_i)\f$ .
+
+@param points Input vector of 2D or 3D points, stored in std::vector\<\> or Mat.
+@param line Output line parameters. In case of 2D fitting, it should be a vector of 4 elements
+(like Vec4f) - (vx, vy, x0, y0), where (vx, vy) is a normalized vector collinear to the line and
+(x0, y0) is a point on the line. In case of 3D fitting, it should be a vector of 6 elements (like
+Vec6f) - (vx, vy, vz, x0, y0, z0), where (vx, vy, vz) is a normalized vector collinear to the line
+and (x0, y0, z0) is a point on the line.
+@param distType Distance used by the M-estimator, see #DistanceTypes
+@param param Numerical parameter ( C ) for some types of distances. If it is 0, an optimal value
+is chosen.
+@param reps Sufficient accuracy for the radius (distance between the coordinate origin and the line).
+@param aeps Sufficient accuracy for the angle. 0.01 would be a good default value for reps and aeps.
+ */
+CV_EXPORTS_W void fitLine( InputArray points, OutputArray line, int distType,
+                           double param, double reps, double aeps );
+
+/** @brief Performs a point-in-contour test.
+
+The function determines whether the point is inside a contour, outside, or lies on an edge (or
+coincides with a vertex). It returns positive (inside), negative (outside), or zero (on an edge)
+value, correspondingly. When measureDist=false , the return value is +1, -1, and 0, respectively.
+Otherwise, the return value is a signed distance between the point and the nearest contour edge.
+
+See below a sample output of the function where each image pixel is tested against the contour:
+
+![sample output](pics/pointpolygon.png)
+
+@param contour Input contour.
+@param pt Point tested against the contour.
+@param measureDist If true, the function estimates the signed distance from the point to the
+nearest contour edge. Otherwise, the function only checks if the point is inside a contour or not.
+ */
+CV_EXPORTS_W double pointPolygonTest( InputArray contour, Point2f pt, bool measureDist );
+
+/** @brief Finds out if there is any intersection between two rotated rectangles.
+
+If there is then the vertices of the intersecting region are returned as well.
+
+Below are some examples of intersection configurations. The hatched pattern indicates the
+intersecting region and the red vertices are returned by the function.
+
+![intersection examples](pics/intersection.png)
+
+@param rect1 First rectangle
+@param rect2 Second rectangle
+@param intersectingRegion The output array of the vertices of the intersecting region. It returns
+at most 8 vertices. Stored as std::vector\<cv::Point2f\> or cv::Mat as Mx1 of type CV_32FC2.
+@returns One of #RectanglesIntersectTypes
+ */
+CV_EXPORTS_W int rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& rect2, OutputArray intersectingRegion  );
+
+/** @brief Creates a smart pointer to a cv::GeneralizedHoughBallard class and initializes it.
+*/
+CV_EXPORTS_W Ptr<GeneralizedHoughBallard> createGeneralizedHoughBallard();
+
+/** @brief Creates a smart pointer to a cv::GeneralizedHoughGuil class and initializes it.
+*/
+CV_EXPORTS_W Ptr<GeneralizedHoughGuil> createGeneralizedHoughGuil();
+
+//! @} imgproc_shape
+
+//! @addtogroup imgproc_colormap
+//! @{
+
+//! Predefined colormaps (GNU Octave/MATLAB equivalents) accepted by the `colormap` argument of applyColorMap()
+enum ColormapTypes
+{
+    COLORMAP_AUTUMN = 0, //!< ![autumn](pics/colormaps/colorscale_autumn.jpg)
+    COLORMAP_BONE = 1, //!< ![bone](pics/colormaps/colorscale_bone.jpg)
+    COLORMAP_JET = 2, //!< ![jet](pics/colormaps/colorscale_jet.jpg)
+    COLORMAP_WINTER = 3, //!< ![winter](pics/colormaps/colorscale_winter.jpg)
+    COLORMAP_RAINBOW = 4, //!< ![rainbow](pics/colormaps/colorscale_rainbow.jpg)
+    COLORMAP_OCEAN = 5, //!< ![ocean](pics/colormaps/colorscale_ocean.jpg)
+    COLORMAP_SUMMER = 6, //!< ![summer](pics/colormaps/colorscale_summer.jpg)
+    COLORMAP_SPRING = 7, //!< ![spring](pics/colormaps/colorscale_spring.jpg)
+    COLORMAP_COOL = 8, //!< ![cool](pics/colormaps/colorscale_cool.jpg)
+    COLORMAP_HSV = 9, //!< ![HSV](pics/colormaps/colorscale_hsv.jpg)
+    COLORMAP_PINK = 10, //!< ![pink](pics/colormaps/colorscale_pink.jpg)
+    COLORMAP_HOT = 11, //!< ![hot](pics/colormaps/colorscale_hot.jpg)
+    COLORMAP_PARULA = 12, //!< ![parula](pics/colormaps/colorscale_parula.jpg)
+    COLORMAP_MAGMA = 13, //!< ![magma](pics/colormaps/colorscale_magma.jpg)
+    COLORMAP_INFERNO = 14, //!< ![inferno](pics/colormaps/colorscale_inferno.jpg)
+    COLORMAP_PLASMA = 15, //!< ![plasma](pics/colormaps/colorscale_plasma.jpg)
+    COLORMAP_VIRIDIS = 16, //!< ![viridis](pics/colormaps/colorscale_viridis.jpg)
+    COLORMAP_CIVIDIS = 17, //!< ![cividis](pics/colormaps/colorscale_cividis.jpg)
+    COLORMAP_TWILIGHT = 18, //!< ![twilight](pics/colormaps/colorscale_twilight.jpg)
+    COLORMAP_TWILIGHT_SHIFTED = 19, //!< ![twilight shifted](pics/colormaps/colorscale_twilight_shifted.jpg)
+    COLORMAP_TURBO = 20, //!< ![turbo](pics/colormaps/colorscale_turbo.jpg)
+    COLORMAP_DEEPGREEN = 21  //!< ![deepgreen](pics/colormaps/colorscale_deepgreen.jpg)
+};
+
+/** @example samples/cpp/falsecolor.cpp
+An example using applyColorMap function
+*/
+
+/** @brief Applies a GNU Octave/MATLAB equivalent colormap on a given image.
+
+@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. If CV_8UC3, then the CV_8UC1 image is generated internally using cv::COLOR_BGR2GRAY.
+@param dst The result is the colormapped source image. Note: Mat::create is called on dst.
+@param colormap The colormap to apply, see #ColormapTypes
+*/
+CV_EXPORTS_W void applyColorMap(InputArray src, OutputArray dst, int colormap);
+
+/** @brief Applies a user colormap on a given image.
+
+@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. If CV_8UC3, then the CV_8UC1 image is generated internally using cv::COLOR_BGR2GRAY.
+@param dst The result is the colormapped source image of the same number of channels as userColor. Note: Mat::create is called on dst.
+@param userColor The colormap to apply of type CV_8UC1 or CV_8UC3 and size 256
+*/
+CV_EXPORTS_W void applyColorMap(InputArray src, OutputArray dst, InputArray userColor);
+
+//! @} imgproc_colormap
+
+//! @addtogroup imgproc_draw
+//! @{
+
+
+/** Builds a cv::Scalar from (r, g, b) arguments, stored as (b, g, r, 0) because OpenCV color channel order is BGR[A] */
+#define CV_RGB(r, g, b)  cv::Scalar((b), (g), (r), 0)
+
+/** @brief Draws a line segment connecting two points.
+
+The function line draws the line segment between pt1 and pt2 points in the image. The line is
+clipped by the image boundaries. For non-antialiased lines with integer coordinates, the 8-connected
+or 4-connected Bresenham algorithm is used. Thick lines are drawn with rounded endings. Antialiased
+lines are drawn using Gaussian filtering.
+
+@param img Image.
+@param pt1 First point of the line segment.
+@param pt2 Second point of the line segment.
+@param color Line color.
+@param thickness Line thickness.
+@param lineType Type of the line. See #LineTypes.
+@param shift Number of fractional bits in the point coordinates.
+ */
+CV_EXPORTS_W void line(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
+                     int thickness = 1, int lineType = LINE_8, int shift = 0);
+
+/** @brief Draws an arrow segment pointing from the first point to the second one.
+
+The function cv::arrowedLine draws an arrow between pt1 and pt2 points in the image. See also #line.
+
+@param img Image.
+@param pt1 The point the arrow starts from.
+@param pt2 The point the arrow points to.
+@param color Line color.
+@param thickness Line thickness.
+@param line_type Type of the line. See #LineTypes
+@param shift Number of fractional bits in the point coordinates.
+@param tipLength The length of the arrow tip in relation to the arrow length
+ */
+CV_EXPORTS_W void arrowedLine(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
+                     int thickness=1, int line_type=8, int shift=0, double tipLength=0.1);
+
+/** @brief Draws a simple, thick, or filled up-right rectangle.
+
+The function cv::rectangle draws a rectangle outline or a filled rectangle whose two opposite corners
+are pt1 and pt2.
+
+@param img Image.
+@param pt1 Vertex of the rectangle.
+@param pt2 Vertex of the rectangle opposite to pt1 .
+@param color Rectangle color or brightness (grayscale image).
+@param thickness Thickness of lines that make up the rectangle. Negative values, like #FILLED,
+mean that the function has to draw a filled rectangle.
+@param lineType Type of the line. See #LineTypes
+@param shift Number of fractional bits in the point coordinates.
+ */
+CV_EXPORTS_W void rectangle(InputOutputArray img, Point pt1, Point pt2,
+                          const Scalar& color, int thickness = 1,
+                          int lineType = LINE_8, int shift = 0);
+
+/** @overload
+
+use `rec` parameter as alternative specification of the drawn rectangle: `rec.tl()` and
+`rec.br()-Point(1,1)` are opposite corners
+*/
+CV_EXPORTS_W void rectangle(InputOutputArray img, Rect rec,
+                          const Scalar& color, int thickness = 1,
+                          int lineType = LINE_8, int shift = 0);
+
+/** @example samples/cpp/tutorial_code/ImgProc/basic_drawing/Drawing_2.cpp
+An example using drawing functions
+*/
+
+/** @brief Draws a circle.
+
+The function cv::circle draws a simple or filled circle with a given center and radius.
+@param img Image where the circle is drawn.
+@param center Center of the circle.
+@param radius Radius of the circle.
+@param color Circle color.
+@param thickness Thickness of the circle outline, if positive. Negative values, like #FILLED,
+mean that a filled circle is to be drawn.
+@param lineType Type of the circle boundary. See #LineTypes
+@param shift Number of fractional bits in the coordinates of the center and in the radius value.
+ */
+CV_EXPORTS_W void circle(InputOutputArray img, Point center, int radius,
+                       const Scalar& color, int thickness = 1,
+                       int lineType = LINE_8, int shift = 0);
+
+/** @brief Draws a simple or thick elliptic arc or fills an ellipse sector.
+
+The function cv::ellipse with more parameters draws an ellipse outline, a filled ellipse, an elliptic
+arc, or a filled ellipse sector. The drawing code uses general parametric form.
+A piecewise-linear curve is used to approximate the elliptic arc
+boundary. If you need more control of the ellipse rendering, you can retrieve the curve using
+#ellipse2Poly and then render it with #polylines or fill it with #fillPoly. If you use the first
+variant of the function and want to draw the whole ellipse, not an arc, pass `startAngle=0` and
+`endAngle=360`. If `startAngle` is greater than `endAngle`, they are swapped. The figure below explains
+the meaning of the parameters to draw the blue arc.
+
+![Parameters of Elliptic Arc](pics/ellipse.svg)
+
+@param img Image.
+@param center Center of the ellipse.
+@param axes Half of the size of the ellipse main axes.
+@param angle Ellipse rotation angle in degrees.
+@param startAngle Starting angle of the elliptic arc in degrees.
+@param endAngle Ending angle of the elliptic arc in degrees.
+@param color Ellipse color.
+@param thickness Thickness of the ellipse arc outline, if positive. Otherwise, this indicates that
+a filled ellipse sector is to be drawn.
+@param lineType Type of the ellipse boundary. See #LineTypes
+@param shift Number of fractional bits in the coordinates of the center and values of axes.
+ */
+CV_EXPORTS_W void ellipse(InputOutputArray img, Point center, Size axes,
+                        double angle, double startAngle, double endAngle,
+                        const Scalar& color, int thickness = 1,
+                        int lineType = LINE_8, int shift = 0);
+
+/** @overload
+@param img Image.
+@param box Alternative ellipse representation via RotatedRect. This means that the function draws
+an ellipse inscribed in the rotated rectangle.
+@param color Ellipse color.
+@param thickness Thickness of the ellipse arc outline, if positive. Otherwise, this indicates that
+a filled ellipse sector is to be drawn.
+@param lineType Type of the ellipse boundary. See #LineTypes
+*/
+CV_EXPORTS_W void ellipse(InputOutputArray img, const RotatedRect& box, const Scalar& color,
+                        int thickness = 1, int lineType = LINE_8);
+
+/* ----------------------------------------------------------------------------------------- */
+/* ADDING A SET OF PREDEFINED MARKERS WHICH COULD BE USED TO HIGHLIGHT POSITIONS IN AN IMAGE */
+/* ----------------------------------------------------------------------------------------- */
+
+/** @brief Draws a marker on a predefined position in an image.
+
+The function cv::drawMarker draws a marker on a given position in the image. For the moment several
+marker types are supported, see #MarkerTypes for more information.
+
+@param img Image.
+@param position The point where the crosshair is positioned.
+@param color Line color.
+@param markerType The specific type of marker you want to use, see #MarkerTypes
+@param thickness Line thickness.
+@param line_type Type of the line, See #LineTypes
+@param markerSize The length of the marker axis [default = 20 pixels]
+ */
+CV_EXPORTS_W void drawMarker(InputOutputArray img, Point position, const Scalar& color,
+                             int markerType = MARKER_CROSS, int markerSize=20, int thickness=1,
+                             int line_type=8);
+
+/* ----------------------------------------------------------------------------------------- */
+/* END OF MARKER SECTION */
+/* ----------------------------------------------------------------------------------------- */
+
+/** @brief Fills a convex polygon.
+
+The function cv::fillConvexPoly draws a filled convex polygon. This function is much faster than the
+function #fillPoly . It can fill not only convex polygons but any monotonic polygon without
+self-intersections, that is, a polygon whose contour intersects every horizontal line (scan line)
+twice at the most (though, its top-most and/or the bottom edge could be horizontal).
+
+@param img Image.
+@param points Polygon vertices.
+@param color Polygon color.
+@param lineType Type of the polygon boundaries. See #LineTypes
+@param shift Number of fractional bits in the vertex coordinates.
+ */
+CV_EXPORTS_W void fillConvexPoly(InputOutputArray img, InputArray points,
+                                 const Scalar& color, int lineType = LINE_8,
+                                 int shift = 0);
+
+/** @overload */
+CV_EXPORTS void fillConvexPoly(InputOutputArray img, const Point* pts, int npts,
+                               const Scalar& color, int lineType = LINE_8,
+                               int shift = 0);
+
+/** @example samples/cpp/tutorial_code/ImgProc/basic_drawing/Drawing_1.cpp
+An example using drawing functions
+Check @ref tutorial_random_generator_and_text "the corresponding tutorial" for more details
+*/
+
+/** @brief Fills the area bounded by one or more polygons.
+
+The function cv::fillPoly fills an area bounded by several polygonal contours. The function can fill
+complex areas, for example, areas with holes, contours with self-intersections (some of their
+parts), and so forth.
+
+@param img Image.
+@param pts Array of polygons where each polygon is represented as an array of points.
+@param color Polygon color.
+@param lineType Type of the polygon boundaries. See #LineTypes
+@param shift Number of fractional bits in the vertex coordinates.
+@param offset Optional offset of all points of the contours.
+ */
+CV_EXPORTS_W void fillPoly(InputOutputArray img, InputArrayOfArrays pts,
+                           const Scalar& color, int lineType = LINE_8, int shift = 0,
+                           Point offset = Point() );
+
+/** @overload */
+CV_EXPORTS void fillPoly(InputOutputArray img, const Point** pts,
+                         const int* npts, int ncontours,
+                         const Scalar& color, int lineType = LINE_8, int shift = 0,
+                         Point offset = Point() );
+
+/** @brief Draws several polygonal curves.
+
+@param img Image.
+@param pts Array of polygonal curves.
+@param isClosed Flag indicating whether the drawn polylines are closed or not. If they are closed,
+the function draws a line from the last vertex of each curve to its first vertex.
+@param color Polyline color.
+@param thickness Thickness of the polyline edges.
+@param lineType Type of the line segments. See #LineTypes
+@param shift Number of fractional bits in the vertex coordinates.
+
+The function cv::polylines draws one or more polygonal curves.
+ */
+CV_EXPORTS_W void polylines(InputOutputArray img, InputArrayOfArrays pts,
+                            bool isClosed, const Scalar& color,
+                            int thickness = 1, int lineType = LINE_8, int shift = 0 );
+
+/** @overload */
+CV_EXPORTS void polylines(InputOutputArray img, const Point* const* pts, const int* npts,
+                          int ncontours, bool isClosed, const Scalar& color,
+                          int thickness = 1, int lineType = LINE_8, int shift = 0 );
+
+/** @example samples/cpp/contours2.cpp
+An example program illustrating the use of cv::findContours and cv::drawContours
+\image html WindowsQtContoursOutput.png "Screenshot of the program"
+*/
+
+/** @example samples/cpp/segment_objects.cpp
+An example using drawContours to clean up a background segmentation result
+*/
+
+/** @brief Draws contours outlines or filled contours.
+
+The function draws contour outlines in the image if \f$\texttt{thickness} \ge 0\f$ or fills the area
+bounded by the contours if \f$\texttt{thickness}<0\f$ . The example below shows how to retrieve
+connected components from the binary image and label them:
+@include snippets/imgproc_drawContours.cpp
+
+@param image Destination image.
+@param contours All the input contours. Each contour is stored as a point vector.
+@param contourIdx Parameter indicating a contour to draw. If it is negative, all the contours are drawn.
+@param color Color of the contours.
+@param thickness Thickness of lines the contours are drawn with. If it is negative (for example,
+thickness=#FILLED ), the contour interiors are drawn.
+@param lineType Line connectivity. See #LineTypes
+@param hierarchy Optional information about hierarchy. It is only needed if you want to draw only
+some of the contours (see maxLevel ).
+@param maxLevel Maximal level for drawn contours. If it is 0, only the specified contour is drawn.
+If it is 1, the function draws the contour(s) and all the nested contours. If it is 2, the function
+draws the contours, all the nested contours, all the nested-to-nested contours, and so on. This
+parameter is only taken into account when there is hierarchy available.
+@param offset Optional contour shift parameter. Shift all the drawn contours by the specified
+\f$\texttt{offset}=(dx,dy)\f$ .
+@note When thickness=#FILLED, the function is designed to handle connected components with holes correctly
+even when no hierarchy data is provided. This is done by analyzing all the outlines together
+using even-odd rule. This may give incorrect results if you have a joint collection of separately retrieved
+contours. In order to solve this problem, you need to call #drawContours separately for each sub-group
+of contours, or iterate over the collection using contourIdx parameter.
+ */
+CV_EXPORTS_W void drawContours( InputOutputArray image, InputArrayOfArrays contours,
+                              int contourIdx, const Scalar& color,
+                              int thickness = 1, int lineType = LINE_8,
+                              InputArray hierarchy = noArray(),
+                              int maxLevel = INT_MAX, Point offset = Point() );
+
+/** @brief Clips the line against the image rectangle.
+
+The function cv::clipLine calculates a part of the line segment that is entirely within the specified
+rectangle. It returns false if the line segment is completely outside the rectangle. Otherwise,
+it returns true .
+@param imgSize Image size. The image rectangle is Rect(0, 0, imgSize.width, imgSize.height) .
+@param pt1 First line point.
+@param pt2 Second line point.
+ */
+CV_EXPORTS bool clipLine(Size imgSize, CV_IN_OUT Point& pt1, CV_IN_OUT Point& pt2);
+
+/** @overload
+@param imgSize Image size. The image rectangle is Rect(0, 0, imgSize.width, imgSize.height) .
+@param pt1 First line point.
+@param pt2 Second line point.
+*/
+CV_EXPORTS bool clipLine(Size2l imgSize, CV_IN_OUT Point2l& pt1, CV_IN_OUT Point2l& pt2);
+
+/** @overload
+@param imgRect Image rectangle.
+@param pt1 First line point.
+@param pt2 Second line point.
+*/
+CV_EXPORTS_W bool clipLine(Rect imgRect, CV_OUT CV_IN_OUT Point& pt1, CV_OUT CV_IN_OUT Point& pt2);
+
+/** @brief Approximates an elliptic arc with a polyline.
+
+The function ellipse2Poly computes the vertices of a polyline that approximates the specified
+elliptic arc. It is used by #ellipse. If `arcStart` is greater than `arcEnd`, they are swapped.
+
+@param center Center of the arc.
+@param axes Half of the size of the ellipse main axes. See #ellipse for details.
+@param angle Rotation angle of the ellipse in degrees. See #ellipse for details.
+@param arcStart Starting angle of the elliptic arc in degrees.
+@param arcEnd Ending angle of the elliptic arc in degrees.
+@param delta Angle between the subsequent polyline vertices. It defines the approximation
+accuracy.
+@param pts Output vector of polyline vertices.
+ */
+CV_EXPORTS_W void ellipse2Poly( Point center, Size axes, int angle,
+                                int arcStart, int arcEnd, int delta,
+                                CV_OUT std::vector<Point>& pts );
+
+/** @overload
+@param center Center of the arc.
+@param axes Half of the size of the ellipse main axes. See #ellipse for details.
+@param angle Rotation angle of the ellipse in degrees. See #ellipse for details.
+@param arcStart Starting angle of the elliptic arc in degrees.
+@param arcEnd Ending angle of the elliptic arc in degrees.
+@param delta Angle between the subsequent polyline vertices. It defines the approximation accuracy.
+@param pts Output vector of polyline vertices.
+*/
+CV_EXPORTS void ellipse2Poly(Point2d center, Size2d axes, int angle,
+                             int arcStart, int arcEnd, int delta,
+                             CV_OUT std::vector<Point2d>& pts);
+
+/** @brief Draws a text string.
+
+The function cv::putText renders the specified text string in the image. Symbols that cannot be rendered
+using the specified font are replaced by question marks. See #getTextSize for a text rendering code
+example.
+
+@param img Image.
+@param text Text string to be drawn.
+@param org Bottom-left corner of the text string in the image.
+@param fontFace Font type, see #HersheyFonts.
+@param fontScale Font scale factor that is multiplied by the font-specific base size.
+@param color Text color.
+@param thickness Thickness of the lines used to draw a text.
+@param lineType Line type. See #LineTypes
+@param bottomLeftOrigin When true, the image data origin is at the bottom-left corner. Otherwise,
+it is at the top-left corner.
+ */
+CV_EXPORTS_W void putText( InputOutputArray img, const String& text, Point org,
+                         int fontFace, double fontScale, Scalar color,
+                         int thickness = 1, int lineType = LINE_8,
+                         bool bottomLeftOrigin = false );
+
+/** @brief Calculates the width and height of a text string.
+
+The function cv::getTextSize calculates and returns the size of a box that contains the specified text.
+That is, the following code renders some text, the tight box surrounding it, and the baseline:
+@code
+    String text = "Funny text inside the box";
+    int fontFace = FONT_HERSHEY_SCRIPT_SIMPLEX;
+    double fontScale = 2;
+    int thickness = 3;
+
+    Mat img(600, 800, CV_8UC3, Scalar::all(0));
+
+    int baseline=0;
+    Size textSize = getTextSize(text, fontFace,
+                                fontScale, thickness, &baseline);
+    baseline += thickness;
+
+    // center the text
+    Point textOrg((img.cols - textSize.width)/2,
+                  (img.rows + textSize.height)/2);
+
+    // draw the box
+    rectangle(img, textOrg + Point(0, baseline),
+              textOrg + Point(textSize.width, -textSize.height),
+              Scalar(0,0,255));
+    // ... and the baseline first
+    line(img, textOrg + Point(0, thickness),
+         textOrg + Point(textSize.width, thickness),
+         Scalar(0, 0, 255));
+
+    // then put the text itself
+    putText(img, text, textOrg, fontFace, fontScale,
+            Scalar::all(255), thickness, 8);
+@endcode
+
+@param text Input text string.
+@param fontFace Font to use, see #HersheyFonts.
+@param fontScale Font scale factor that is multiplied by the font-specific base size.
+@param thickness Thickness of lines used to render the text. See #putText for details.
+@param[out] baseLine y-coordinate of the baseline relative to the bottom-most text
+point.
+@return The size of a box that contains the specified text.
+
+@see putText
+ */
+CV_EXPORTS_W Size getTextSize(const String& text, int fontFace,
+                            double fontScale, int thickness,
+                            CV_OUT int* baseLine);
+
+
+/** @brief Calculates the font-specific size to use to achieve a given height in pixels.
+
+@param fontFace Font to use, see cv::HersheyFonts.
+@param pixelHeight Pixel height to compute the fontScale for
+@param thickness Thickness of lines used to render the text. See cv::putText for details.
+@return The fontScale to use for cv::putText
+
+@see cv::putText
+*/
+CV_EXPORTS_W double getFontScaleFromHeight(const int fontFace,
+                                           const int pixelHeight,
+                                           const int thickness = 1);
+
+/** @brief Class for iterating over all pixels on a raster line segment.
+
+The class LineIterator is used to get each pixel of a raster line connecting
+two specified points.
+It can be treated as a versatile implementation of the Bresenham algorithm
+where you can stop at each pixel and do some extra processing, for
+example, grab pixel values along the line or draw a line with an effect
+(for example, with XOR operation).
+
+The number of pixels along the line is stored in LineIterator::count.
+The method LineIterator::pos returns the current position in the image:
+
+@code{.cpp}
+// grabs pixels along the line (pt1, pt2)
+// from 8-bit 3-channel image to the buffer
+LineIterator it(img, pt1, pt2, 8);
+LineIterator it2 = it;
+vector<Vec3b> buf(it.count);
+
+for(int i = 0; i < it.count; i++, ++it)
+    buf[i] = *(const Vec3b*)*it;
+
+// alternative way of iterating through the line
+for(int i = 0; i < it2.count; i++, ++it2)
+{
+    Vec3b val = img.at<Vec3b>(it2.pos());
+    CV_Assert(buf[i] == val);
+}
+@endcode
+*/
+class CV_EXPORTS LineIterator
+{
+public:
+    /** @brief Initializes iterator object for the given line and image.
+
+    The returned iterator can be used to traverse all pixels on a line that
+    connects the given two points.
+    The line will be clipped on the image boundaries.
+
+    @param img Underlying image.
+    @param pt1 First endpoint of the line.
+    @param pt2 The other endpoint of the line.
+    @param connectivity Pixel connectivity of the iterator. Valid values are 4 (iterator can move
+    up, down, left and right) and 8 (iterator can also move diagonally).
+    @param leftToRight If true, the line is traversed from the leftmost endpoint to the rightmost
+    endpoint. Otherwise, the line is traversed from \p pt1 to \p pt2.
+    */
+    LineIterator( const Mat& img, Point pt1, Point pt2,
+                  int connectivity = 8, bool leftToRight = false )
+    {
+        init(&img, Rect(0, 0, img.cols, img.rows), pt1, pt2, connectivity, leftToRight); // bounding rect is the whole image
+        ptmode = false; // image mode: operator* yields ptr and pos() is derived from ptr
+    }
+    LineIterator( Point pt1, Point pt2, // image-less overload: init() receives a null image
+                  int connectivity = 8, bool leftToRight = false )
+    {
+        init(0, Rect(std::min(pt1.x, pt2.x), // bounding rect is the inclusive box spanned by pt1 and pt2
+                     std::min(pt1.y, pt2.y),
+                     std::max(pt1.x, pt2.x) - std::min(pt1.x, pt2.x) + 1,
+                     std::max(pt1.y, pt2.y) - std::min(pt1.y, pt2.y) + 1),
+             pt1, pt2, connectivity, leftToRight);
+        ptmode = true; // point mode: operator* returns 0 and pos() returns p
+    }
+    LineIterator( Size boundingAreaSize, Point pt1, Point pt2, // image-less overload: bounding rect is (0, 0, width, height)
+                  int connectivity = 8, bool leftToRight = false )
+    {
+        init(0, Rect(0, 0, boundingAreaSize.width, boundingAreaSize.height),
+             pt1, pt2, connectivity, leftToRight);
+        ptmode = true; // point mode, as above
+    }
+    LineIterator( Rect boundingAreaRect, Point pt1, Point pt2, // image-less overload: caller supplies the bounding rect
+                  int connectivity = 8, bool leftToRight = false )
+    {
+        init(0, boundingAreaRect, pt1, pt2, connectivity, leftToRight);
+        ptmode = true; // point mode, as above
+    }
+    void init(const Mat* img, Rect boundingAreaRect, Point pt1, Point pt2, int connectivity, bool leftToRight); // shared setup called by every constructor above; img may be null
+
+    /** @brief Returns pointer to the current pixel.
+    */
+    uchar* operator *();
+
+    /** @brief Moves iterator to the next pixel on the line.
+
+    This is the prefix version (++it).
+    */
+    LineIterator& operator ++();
+
+    /** @brief Moves iterator to the next pixel on the line.
+
+    This is the postfix version (it++).
+    */
+    LineIterator operator ++(int);
+
+    /** @brief Returns coordinates of the current pixel.
+    */
+    Point pos() const;
+
+    uchar* ptr;                 // current pixel; advanced by operator++ only when ptmode is false
+    const uchar* ptr0;          // reference pointer: pos() works from the byte offset ptr - ptr0
+    int step, elemSize;         // pos() divides that offset by step to get y and the remainder by elemSize to get x
+    int err, count;             // err: running term whose sign selects the next step; count: number of pixels along the line
+    int minusDelta, plusDelta;  // each step does err += minusDelta, plus plusDelta when err was negative
+    int minusStep, plusStep;    // same scheme, added to ptr (image mode) or to p.y (point mode)
+    int minusShift, plusShift;  // same scheme, added to p.x (point mode only)
+    Point p;                    // current position; operator++ updates it only in point mode
+    bool ptmode;                // true when the iterator was constructed without an image
+};
+
+//! @cond IGNORED
+
+// === LineIterator implementation ===
+
+inline
+uchar* LineIterator::operator *()
+{
+    return !ptmode ? ptr : 0;  // null when the iterator was built without an image
+}
+
+inline
+LineIterator& LineIterator::operator ++()
+{
+    // negMask is all-ones while the error term is negative, zero otherwise
+    const int negMask = err < 0 ? -1 : 0;
+    const int stepInc = minusStep + (plusStep & negMask);
+    err += minusDelta + (plusDelta & negMask);
+    if(ptmode)
+    {
+        p.x += minusShift + (plusShift & negMask);
+        p.y += stepInc;
+    }
+    else
+        ptr += stepInc;
+    return *this;
+}
+
+inline
+LineIterator LineIterator::operator ++(int)
+{
+    LineIterator previous(*this);  // state before the step, handed back to the caller
+    operator ++();
+    return previous;
+}
+
+inline
+Point LineIterator::pos() const
+{
+    if(ptmode)
+        return p;
+
+    // image mode: recover (x, y) from the byte distance between ptr and ptr0
+    const size_t byteOffset = (size_t)(ptr - ptr0);
+    const int row = (int)(byteOffset/step);
+    const int col = (int)((byteOffset - (size_t)row*step)/elemSize);
+    return Point(col, row);
+}
+
+//! @endcond
+
+//! @} imgproc_draw
+
+//! @} imgproc
+
+} // cv
+
+
+#include "./imgproc/segmentation.hpp"
+
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/bindings.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/bindings.hpp
new file mode 100644
index 000000000000..c69527a77945
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/bindings.hpp
@@ -0,0 +1,34 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_IMGPROC_BINDINGS_HPP
+#define OPENCV_IMGPROC_BINDINGS_HPP
+
+// This file contains special overloads for OpenCV bindings
+// No need to use these functions in C++ code.
+
+namespace cv {
+
+/** @brief Finds lines in a binary image using the standard Hough transform and also returns the accumulator.
+ *
+ * @note This function is for bindings use only. Use the original HoughLines function in C++ code.
+ *
+ * @sa HoughLines
+ */
+CV_WRAP static inline
+void HoughLinesWithAccumulator(
+        InputArray image, OutputArray lines,
+        double rho, double theta, int threshold,
+        double srn = 0, double stn = 0,
+        double min_theta = 0, double max_theta = CV_PI
+)
+{
+    std::vector<Vec3f> lines_acc; // three floats per line; presumably the third is the accumulator value - confirm against HoughLines
+    HoughLines(image, lines_acc, rho, theta, threshold, srn, stn, min_theta, max_theta); // every argument is forwarded unchanged
+    Mat(lines_acc).copyTo(lines); // hand the collected entries to the caller's output array
+}
+
+}  // namespace
+
+#endif  // OPENCV_IMGPROC_BINDINGS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/detail/gcgraph.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/detail/gcgraph.hpp
new file mode 100644
index 000000000000..f17c6e7afb9f
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/detail/gcgraph.hpp
@@ -0,0 +1,395 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_IMGPROC_DETAIL_GCGRAPH_HPP
+#define OPENCV_IMGPROC_DETAIL_GCGRAPH_HPP
+
+//! @cond IGNORED
+
+namespace cv { namespace detail {
+template <class TWeight> class GCGraph // graph with per-vertex terminal weights; maxFlow() computes the flow, inSourceSegment() reads the resulting side
+{
+public:
+    GCGraph(); // empty graph, flow = 0
+    GCGraph( unsigned int vtxCount, unsigned int edgeCount ); // forwards to create()
+    ~GCGraph();
+    void create( unsigned int vtxCount, unsigned int edgeCount ); // reserves storage and resets flow; does not clear existing vertices/edges
+    int addVtx(); // appends a zero-filled vertex and returns its index
+    void addEdges( int i, int j, TWeight w, TWeight revw ); // adds edge i->j with weight w and its reverse j->i with weight revw
+    void addTermWeights( int i, TWeight sourceW, TWeight sinkW ); // folds source/sink weights into vertex i and into flow
+    TWeight maxFlow(); // runs the search/augment/restore loop and returns flow
+    bool inSourceSegment( int i ); // true when vertex i carries t == 0
+private:
+    class Vtx
+    {
+    public:
+        Vtx *next; // initialized and used in maxFlow() only
+        int parent; // edge index leading to the tree parent; 0 = in no tree, negative = maxFlow()'s TERMINAL/ORPHAN markers
+        int first; // index of the first edge in this vertex's edge list, 0 = no edges
+        int ts; // timestamp compared against maxFlow()'s curr_ts
+        int dist; // distance to the terminal as tracked by maxFlow(); 1 for vertices attached directly
+        TWeight weight; // signed terminal balance: sourceW - sinkW as left by addTermWeights()
+        uchar t; // tree flag: maxFlow() sets it to (weight < 0), inSourceSegment() tests it against 0
+    };
+    class Edge
+    {
+    public:
+        int dst; // index of the vertex this edge points to
+        int next; // index of the next edge in the owning vertex's list, 0 ends the list
+        TWeight weight; // remaining weight; maxFlow() decreases it and increases the paired edge
+    };
+
+    std::vector<Vtx> vtcs; // all vertices, indexed by the value addVtx() returned
+    std::vector<Edge> edges; // edges stored in adjacent pairs; slots 0 and 1 are placeholders so index 0 can mean "none"
+    TWeight flow; // flow accumulated by addTermWeights() and maxFlow()
+};
+
+template <class TWeight>
+GCGraph<TWeight>::GCGraph() // default constructor: empty graph
+{
+    flow = 0; // vtcs and edges start empty through their own constructors
+}
+template <class TWeight>
+GCGraph<TWeight>::GCGraph( unsigned int vtxCount, unsigned int edgeCount ) // constructs and pre-sizes in one step
+{
+    create( vtxCount, edgeCount ); // create() also sets flow, so no member is left unset
+}
+template <class TWeight>
+GCGraph<TWeight>::~GCGraph() // nothing to release by hand: the vectors own the storage
+{
+}
+template <class TWeight>
+void GCGraph<TWeight>::create( unsigned int vtxCount, unsigned int edgeCount ) // reserves capacity and resets flow
+{
+    vtcs.reserve( vtxCount ); // capacity only; vertices are still added one by one via addVtx()
+    edges.reserve( edgeCount + 2 ); // +2 for the two placeholder slots addEdges() inserts before the first real edge
+    flow = 0; // NOTE(review): existing vertices and edges are not cleared here
+}
+
+template <class TWeight>
+int GCGraph<TWeight>::addVtx()
+{
+    Vtx blank;
+    memset( &blank, 0, sizeof(blank) );  // every field of the new vertex starts zeroed
+    vtcs.push_back( blank );
+    return (int)vtcs.size() - 1;
+}
+
+template <class TWeight>
+void GCGraph<TWeight>::addEdges( int i, int j, TWeight w, TWeight revw )
+{
+    CV_Assert( i>=0 && i<(int)vtcs.size() );
+    CV_Assert( j>=0 && j<(int)vtcs.size() );
+    CV_Assert( w>=0 && revw>=0 );
+    CV_Assert( i != j );
+    if( edges.empty() )  // slots 0 and 1 are placeholders so that index 0 can end an edge list
+        edges.resize( 2 );
+
+    Edge forward;   // i -> j, linked in at the head of i's edge list
+    forward.dst = j;
+    forward.next = vtcs[i].first;
+    forward.weight = w;
+    vtcs[i].first = (int)edges.size();
+    edges.push_back( forward );
+
+    Edge backward;  // j -> i, stored in the slot right after its twin
+    backward.dst = i;
+    backward.next = vtcs[j].first;
+    backward.weight = revw;
+    vtcs[j].first = (int)edges.size();
+    edges.push_back( backward );
+}
+
+template <class TWeight>
+void GCGraph<TWeight>::addTermWeights( int i, TWeight sourceW, TWeight sinkW )
+{
+    CV_Assert( i>=0 && i<(int)vtcs.size() );
+
+    const TWeight carried = vtcs[i].weight;  // signed balance left by earlier calls
+    if( carried > 0 )
+        sourceW += carried;
+    else
+        sinkW -= carried;
+    vtcs[i].weight = sourceW - sinkW;
+    flow += (sourceW < sinkW) ? sourceW : sinkW;
+}
+
+template <class TWeight>
+TWeight GCGraph<TWeight>::maxFlow() // repeats search -> augment -> restore until no connecting edge is found, then returns flow
+{
+    CV_Assert(!vtcs.empty());
+    CV_Assert(!edges.empty());
+    const int TERMINAL = -1, ORPHAN = -2; // negative parent markers: attached directly to a terminal / lost its parent link
+    Vtx stub, *nilNode = &stub, *first = nilNode, *last = nilNode; // stub is the sentinel that ends the active list
+    int curr_ts = 0;
+    stub.next = nilNode;
+    Vtx *vtxPtr = &vtcs[0];
+    Edge *edgePtr = &edges[0];
+
+    std::vector<Vtx*> orphans; // work stack refilled by each augmentation
+
+    // initialize the active queue and the graph vertices
+    for( int i = 0; i < (int)vtcs.size(); i++ )
+    {
+        Vtx* v = vtxPtr + i;
+        v->ts = 0;
+        if( v->weight != 0 )
+        {
+            last = last->next = v; // append to the active list
+            v->dist = 1;
+            v->parent = TERMINAL;
+            v->t = v->weight < 0; // 0 for positive weight, 1 for negative weight
+        }
+        else
+            v->parent = 0; // parent == 0 means the vertex belongs to no tree yet
+    }
+    first = first->next;
+    last->next = nilNode;
+    nilNode->next = 0;
+
+    // run the search-path -> augment-graph -> restore-trees loop
+    for(;;)
+    {
+        Vtx* v, *u;
+        int e0 = -1, ei = 0, ej = 0; // e0 > 0 once an edge joining the two trees has been found
+        TWeight minWeight, weight;
+        uchar vt;
+
+        // grow S & T search trees, find an edge connecting them
+        while( first != nilNode )
+        {
+            v = first;
+            if( v->parent )
+            {
+                vt = v->t;
+                for( ei = v->first; ei != 0; ei = edgePtr[ei].next ) // index 0 ends the edge list
+                {
+                    if( edgePtr[ei^vt].weight == 0 ) // edges are stored in adjacent pairs, so ^1 selects the reverse edge
+                        continue;
+                    u = vtxPtr+edgePtr[ei].dst;
+                    if( !u->parent )
+                    {
+                        u->t = vt;
+                        u->parent = ei ^ 1; // parent holds the edge leading from u back to v
+                        u->ts = v->ts;
+                        u->dist = v->dist + 1;
+                        if( !u->next ) // next is nulled on removal, so null means "not in the active list"
+                        {
+                            u->next = nilNode;
+                            last = last->next = u;
+                        }
+                        continue;
+                    }
+
+                    if( u->t != vt ) // u belongs to the other tree: connecting edge found
+                    {
+                        e0 = ei ^ vt;
+                        break;
+                    }
+
+                    if( u->dist > v->dist+1 && u->ts <= v->ts )
+                    {
+                        // reassign the parent
+                        u->parent = ei ^ 1;
+                        u->ts = v->ts;
+                        u->dist = v->dist + 1;
+                    }
+                }
+                if( e0 > 0 )
+                    break;
+            }
+            // exclude the vertex from the active list
+            first = first->next;
+            v->next = 0;
+        }
+
+        if( e0 <= 0 ) // active list exhausted without a connecting edge: done
+            break;
+
+        // find the minimum edge weight along the path
+        minWeight = edgePtr[e0].weight;
+        CV_Assert( minWeight > 0 );
+        // k = 1: source tree, k = 0: destination tree
+        for( int k = 1; k >= 0; k-- )
+        {
+            for( v = vtxPtr+edgePtr[e0^k].dst;; v = vtxPtr+edgePtr[ei].dst ) // follow parent edges up the tree
+            {
+                if( (ei = v->parent) < 0 ) // negative parent marker ends the walk
+                    break;
+                weight = edgePtr[ei^k].weight;
+                minWeight = MIN(minWeight, weight);
+                CV_Assert( minWeight > 0 );
+            }
+            weight = fabs(v->weight); // the terminal weight of the vertex where the walk stopped also bounds the path
+            minWeight = MIN(minWeight, weight);
+            CV_Assert( minWeight > 0 );
+        }
+
+        // modify weights of the edges along the path and collect orphans
+        edgePtr[e0].weight -= minWeight;
+        edgePtr[e0^1].weight += minWeight;
+        flow += minWeight;
+
+        // k = 1: source tree, k = 0: destination tree
+        for( int k = 1; k >= 0; k-- )
+        {
+            for( v = vtxPtr+edgePtr[e0^k].dst;; v = vtxPtr+edgePtr[ei].dst )
+            {
+                if( (ei = v->parent) < 0 )
+                    break;
+                edgePtr[ei^(k^1)].weight += minWeight;
+                if( (edgePtr[ei^k].weight -= minWeight) == 0 ) // link used up: v loses its parent
+                {
+                    orphans.push_back(v);
+                    v->parent = ORPHAN;
+                }
+            }
+
+            v->weight = v->weight + minWeight*(1-k*2); // (1-k*2) is -1 for k = 1 and +1 for k = 0
+            if( v->weight == 0 )
+            {
+               orphans.push_back(v);
+               v->parent = ORPHAN;
+            }
+        }
+
+        // restore the search trees by finding new parents for the orphans
+        curr_ts++;
+        while( !orphans.empty() )
+        {
+            Vtx* v2 = orphans.back();
+            orphans.pop_back();
+
+            int d, minDist = INT_MAX;
+            e0 = 0; // reused here as "best parent edge so far", 0 = none
+            vt = v2->t;
+
+            for( ei = v2->first; ei != 0; ei = edgePtr[ei].next )
+            {
+                if( edgePtr[ei^(vt^1)].weight == 0 )
+                    continue;
+                u = vtxPtr+edgePtr[ei].dst;
+                if( u->t != vt || u->parent == 0 ) // candidate must be in the same tree
+                    continue;
+                // compute the distance to the tree root
+                for( d = 0;; )
+                {
+                    if( u->ts == curr_ts ) // dist already refreshed during this pass: reuse it
+                    {
+                        d += u->dist;
+                        break;
+                    }
+                    ej = u->parent;
+                    d++;
+                    if( ej < 0 )
+                    {
+                        if( ej == ORPHAN )
+                            d = INT_MAX-1; // makes the ++d < INT_MAX test below fail, rejecting this candidate
+                        else
+                        {
+                            u->ts = curr_ts;
+                            u->dist = 1;
+                        }
+                        break;
+                    }
+                    u = vtxPtr+edgePtr[ej].dst;
+                }
+
+                // update the distance
+                if( ++d < INT_MAX )
+                {
+                    if( d < minDist )
+                    {
+                        minDist = d;
+                        e0 = ei;
+                    }
+                    for( u = vtxPtr+edgePtr[ei].dst; u->ts != curr_ts; u = vtxPtr+edgePtr[u->parent].dst ) // stamp the walked vertices with fresh ts/dist
+                    {
+                        u->ts = curr_ts;
+                        u->dist = --d;
+                    }
+                }
+            }
+
+            if( (v2->parent = e0) > 0 ) // a new parent was found: v2 stays in its tree
+            {
+                v2->ts = curr_ts;
+                v2->dist = minDist;
+                continue;
+            }
+
+            /* no parent is found */
+            v2->ts = 0;
+            for( ei = v2->first; ei != 0; ei = edgePtr[ei].next )
+            {
+                u = vtxPtr+edgePtr[ei].dst;
+                ej = u->parent;
+                if( u->t != vt || !ej )
+                    continue;
+                if( edgePtr[ei^(vt^1)].weight && !u->next ) // put the neighbour back on the active list if it is not there
+                {
+                    u->next = nilNode;
+                    last = last->next = u;
+                }
+                if( ej > 0 && vtxPtr+edgePtr[ej].dst == v2 ) // u's parent was v2, so u becomes an orphan too
+                {
+                    orphans.push_back(u);
+                    u->parent = ORPHAN;
+                }
+            }
+        }
+    }
+    return flow;
+}
+
+template <class TWeight>
+bool GCGraph<TWeight>::inSourceSegment( int i )
+{
+    CV_Assert( i>=0 && i<(int)vtcs.size() );
+    return !vtcs[i].t;  // t is 0 for the source side
+}
+
+}} // namespace detail, cv
+
+
+//! @endcond
+
+#endif  // OPENCV_IMGPROC_DETAIL_GCGRAPH_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/detail/legacy.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/detail/legacy.hpp
new file mode 100644
index 000000000000..029d9c90e83a
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/detail/legacy.hpp
@@ -0,0 +1,38 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_IMGPROC_DETAIL_LEGACY_HPP
+#define OPENCV_IMGPROC_DETAIL_LEGACY_HPP
+
+#include "opencv2/imgproc.hpp"
+
+namespace cv {
+
+#ifdef __OPENCV_BUILD
+
+CV_EXPORTS void findContours_legacy(InputArray _image,
+                                    OutputArrayOfArrays _contours,
+                                    OutputArray _hierarchy,
+                                    int mode,
+                                    int method,
+                                    Point offset = Point());
+CV_EXPORTS void findContours_legacy(InputArray image,
+                                    OutputArrayOfArrays contours,
+                                    int mode,
+                                    int method,
+                                    Point offset = Point());
+
+CV_EXPORTS float EMD_legacy( InputArray _signature1, InputArray _signature2,
+               int distType, InputArray _cost,
+               float* lowerBound, OutputArray _flow );
+
+CV_EXPORTS float wrapperEMD_legacy(InputArray _signature1, InputArray _signature2,
+               int distType, InputArray _cost,
+               Ptr<float> lowerBound, OutputArray _flow);
+
+#endif
+
+}  // namespace cv
+
+#endif  // OPENCV_IMGPROC_DETAIL_LEGACY_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/hal/hal.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/hal/hal.hpp
new file mode 100644
index 000000000000..48851ece073c
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/hal/hal.hpp
@@ -0,0 +1,251 @@
+#ifndef CV_IMGPROC_HAL_HPP
+#define CV_IMGPROC_HAL_HPP
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/cvstd.hpp"
+#include "opencv2/core/hal/interface.h"
+
+namespace cv { namespace hal {
+
+//! @addtogroup imgproc_hal_functions
+//! @{
+
+//---------------------------
+//! @cond IGNORED
+
+struct CV_EXPORTS Filter2D // abstract interface: apply() is pure virtual
+{
+    CV_DEPRECATED static Ptr<hal::Filter2D> create(uchar * , size_t , int , // deprecated factory; this header does not name its parameters
+                                     int , int ,
+                                     int , int ,
+                                     int , int ,
+                                     int , double ,
+                                     int , int ,
+                                     bool , bool );
+    virtual void apply(uchar * , size_t , // NOTE(review): presumably source/destination buffers and geometry - confirm against the free filter2D() declaration
+                       uchar * , size_t ,
+                       int , int ,
+                       int , int ,
+                       int , int ) = 0;
+    virtual ~Filter2D() {} // virtual so an implementation can be destroyed through this interface
+};
+
+struct CV_EXPORTS SepFilter2D // abstract interface: apply() is pure virtual
+{
+    CV_DEPRECATED static Ptr<hal::SepFilter2D> create(int , int , int , // deprecated factory; this header does not name its parameters
+                                        uchar * , int ,
+                                        uchar * , int ,
+                                        int , int ,
+                                        double , int );
+    virtual void apply(uchar * , size_t , // NOTE(review): presumably source/destination buffers and geometry - confirm against the free sepFilter2D() declaration
+                       uchar * , size_t ,
+                       int , int ,
+                       int , int ,
+                       int , int ) = 0;
+    virtual ~SepFilter2D() {} // virtual so an implementation can be destroyed through this interface
+};
+
+
+struct CV_EXPORTS Morph // abstract interface: apply() is pure virtual
+{
+    CV_DEPRECATED static Ptr<hal::Morph> create(int , int , int , int , int , // deprecated factory; this header does not name its parameters
+                                    int , uchar * , size_t ,
+                                    int , int ,
+                                    int , int ,
+                                    int , const double *,
+                                    int , bool , bool );
+    virtual void apply(uchar * , size_t , uchar * , size_t , int , int , // NOTE(review): presumably buffers, size and two ROI rectangles - confirm against the free morph() declaration
+                       int , int , int , int ,
+                       int , int , int , int ) = 0;
+    virtual ~Morph() {} // virtual so an implementation can be destroyed through this interface
+};
+
+//! @endcond
+//---------------------------
+
+CV_EXPORTS void filter2D(int stype, int dtype, int kernel_type,
+                         uchar * src_data, size_t src_step,
+                         uchar * dst_data, size_t dst_step,
+                         int width, int height,
+                         int full_width, int full_height,
+                         int offset_x, int offset_y,
+                         uchar * kernel_data, size_t kernel_step,
+                         int kernel_width, int kernel_height,
+                         int anchor_x, int anchor_y,
+                         double delta, int borderType,
+                         bool isSubmatrix);
+
+CV_EXPORTS void sepFilter2D(int stype, int dtype, int ktype,
+                            uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int full_width, int full_height,
+                            int offset_x, int offset_y,
+                            uchar * kernelx_data, int kernelx_len,
+                            uchar * kernely_data, int kernely_len,
+                            int anchor_x, int anchor_y,
+                            double delta, int borderType);
+
+CV_EXPORTS void morph(int op, int src_type, int dst_type,
+                      uchar * src_data, size_t src_step,
+                      uchar * dst_data, size_t dst_step,
+                      int width, int height,
+                      int roi_width, int roi_height, int roi_x, int roi_y,
+                      int roi_width2, int roi_height2, int roi_x2, int roi_y2,
+                      int kernel_type, uchar * kernel_data, size_t kernel_step,
+                      int kernel_width, int kernel_height, int anchor_x, int anchor_y,
+                      int borderType, const double borderValue[4],
+                      int iterations, bool isSubmatrix);
+
+
+CV_EXPORTS void resize(int src_type,
+                       const uchar * src_data, size_t src_step, int src_width, int src_height,
+                       uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
+                       double inv_scale_x, double inv_scale_y, int interpolation);
+
+CV_EXPORTS void warpAffine(int src_type,
+                           const uchar * src_data, size_t src_step, int src_width, int src_height,
+                           uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
+                           const double M[6], int interpolation, int borderType, const double borderValue[4]);
+
+CV_EXPORTS void warpPerspective(int src_type,
+                               const uchar * src_data, size_t src_step, int src_width, int src_height,
+                               uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
+                               const double M[9], int interpolation, int borderType, const double borderValue[4]);
+
+CV_EXPORTS void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int scn, int dcn, bool swapBlue);
+
+CV_EXPORTS void cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step,
+                               uchar * dst_data, size_t dst_step,
+                               int width, int height,
+                               int scn, bool swapBlue, int greenBits);
+
+CV_EXPORTS void cvtBGR5x5toBGR(const uchar * src_data, size_t src_step,
+                               uchar * dst_data, size_t dst_step,
+                               int width, int height,
+                               int dcn, bool swapBlue, int greenBits);
+
+CV_EXPORTS void cvtBGRtoGray(const uchar * src_data, size_t src_step,
+                             uchar * dst_data, size_t dst_step,
+                             int width, int height,
+                             int depth, int scn, bool swapBlue);
+
+CV_EXPORTS void cvtGraytoBGR(const uchar * src_data, size_t src_step,
+                             uchar * dst_data, size_t dst_step,
+                             int width, int height,
+                             int depth, int dcn);
+
+CV_EXPORTS void cvtBGR5x5toGray(const uchar * src_data, size_t src_step,
+                                uchar * dst_data, size_t dst_step,
+                                int width, int height,
+                                int greenBits);
+
+CV_EXPORTS void cvtGraytoBGR5x5(const uchar * src_data, size_t src_step,
+                                uchar * dst_data, size_t dst_step,
+                                int width, int height,
+                                int greenBits);
+CV_EXPORTS void cvtBGRtoYUV(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int scn, bool swapBlue, bool isCbCr);
+
+CV_EXPORTS void cvtYUVtoBGR(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int dcn, bool swapBlue, bool isCbCr);
+
+CV_EXPORTS void cvtBGRtoXYZ(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int scn, bool swapBlue);
+
+CV_EXPORTS void cvtXYZtoBGR(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int dcn, bool swapBlue);
+
+CV_EXPORTS void cvtBGRtoHSV(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV);
+
+CV_EXPORTS void cvtHSVtoBGR(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV);
+
+CV_EXPORTS void cvtBGRtoLab(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int scn, bool swapBlue, bool isLab, bool srgb);
+
+CV_EXPORTS void cvtLabtoBGR(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int dcn, bool swapBlue, bool isLab, bool srgb);
+
+CV_EXPORTS void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step,
+                                    uchar * dst_data, size_t dst_step,
+                                    int dst_width, int dst_height,
+                                    int dcn, bool swapBlue, int uIdx);
+
+//! Separate Y and UV planes
+CV_EXPORTS void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src_step,
+                                    uchar * dst_data, size_t dst_step,
+                                    int dst_width, int dst_height,
+                                    int dcn, bool swapBlue, int uIdx);
+
+CV_EXPORTS void cvtTwoPlaneYUVtoBGR(const uchar * y_data, size_t y_step, const uchar * uv_data, size_t uv_step,
+                                    uchar * dst_data, size_t dst_step,
+                                    int dst_width, int dst_height,
+                                    int dcn, bool swapBlue, int uIdx);
+
+CV_EXPORTS void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
+                                      uchar * dst_data, size_t dst_step,
+                                      int dst_width, int dst_height,
+                                      int dcn, bool swapBlue, int uIdx);
+
+CV_EXPORTS void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step,
+                                      uchar * dst_data, size_t dst_step,
+                                      int width, int height,
+                                      int scn, bool swapBlue, int uIdx);
+
+//! Separate Y and UV planes
+CV_EXPORTS void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
+                                    uchar * y_data, uchar * uv_data, size_t dst_step,
+                                    int width, int height,
+                                    int scn, bool swapBlue, int uIdx);
+
+CV_EXPORTS void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
+                                    uchar * dst_data, size_t dst_step,
+                                    int width, int height,
+                                    int dcn, bool swapBlue, int uIdx, int ycn);
+
+CV_EXPORTS void cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step,
+                                    uchar * dst_data, size_t dst_step,
+                                    int width, int height,
+                                    int scn, bool swapBlue, int uIdx, int ycn);
+
+CV_EXPORTS void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step,
+                                        uchar * dst_data, size_t dst_step,
+                                        int width, int height);
+
+CV_EXPORTS void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step,
+                                        uchar * dst_data, size_t dst_step,
+                                        int width, int height);
+
+CV_EXPORTS void integral(int depth, int sdepth, int sqdepth,
+                         const uchar* src, size_t srcstep,
+                         uchar* sum, size_t sumstep,
+                         uchar* sqsum, size_t sqsumstep,
+                         uchar* tilted, size_t tstep,
+                         int width, int height, int cn);
+
+//! @}
+
+}}
+
+#endif // CV_IMGPROC_HAL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/hal/interface.h b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/hal/interface.h
new file mode 100644
index 000000000000..f8dbcfe791d9
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/hal/interface.h
@@ -0,0 +1,46 @@
+#ifndef OPENCV_IMGPROC_HAL_INTERFACE_H
+#define OPENCV_IMGPROC_HAL_INTERFACE_H
+
+//! @addtogroup imgproc_hal_interface
+//! @{
+
+//! @name Interpolation modes
+//! @sa cv::InterpolationFlags
+//! @{
+#define CV_HAL_INTER_NEAREST 0
+#define CV_HAL_INTER_LINEAR 1
+#define CV_HAL_INTER_CUBIC 2
+#define CV_HAL_INTER_AREA 3
+#define CV_HAL_INTER_LANCZOS4 4
+//! @}
+
+//! @name Morphology operations
+//! @sa cv::MorphTypes
+//! @{
+#define CV_HAL_MORPH_ERODE 0
+#define CV_HAL_MORPH_DILATE 1
+//! @}
+
+//! @name Threshold types
+//! @sa cv::ThresholdTypes
+//! @{
+#define CV_HAL_THRESH_BINARY      0
+#define CV_HAL_THRESH_BINARY_INV  1
+#define CV_HAL_THRESH_TRUNC       2
+#define CV_HAL_THRESH_TOZERO      3
+#define CV_HAL_THRESH_TOZERO_INV  4
+#define CV_HAL_THRESH_MASK        7
+#define CV_HAL_THRESH_OTSU        8
+#define CV_HAL_THRESH_TRIANGLE    16
+//! @}
+
+//! @name Adaptive threshold algorithm
+//! @sa cv::AdaptiveThresholdTypes
+//! @{
+#define CV_HAL_ADAPTIVE_THRESH_MEAN_C     0
+#define CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C 1
+//! @}
+
+//! @}
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/imgproc.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/imgproc.hpp
new file mode 100644
index 000000000000..4175bd0bc01e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/imgproc.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/imgproc.hpp"
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/imgproc_c.h b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/imgproc_c.h
new file mode 100644
index 000000000000..e97b802e6919
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/imgproc_c.h
@@ -0,0 +1,1185 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_IMGPROC_IMGPROC_C_H
+#define OPENCV_IMGPROC_IMGPROC_C_H
+
+#include "opencv2/imgproc/types_c.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup imgproc_c
+@{
+*/
+
+/*********************** Background statistics accumulation *****************************/
+
+/** @brief Adds image to accumulator
+@see cv::accumulate
+*/
+CVAPI(void)  cvAcc( const CvArr* image, CvArr* sum,
+                   const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @brief Adds squared image to accumulator
+@see cv::accumulateSquare
+*/
+CVAPI(void)  cvSquareAcc( const CvArr* image, CvArr* sqsum,
+                         const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @brief Adds a product of two images to accumulator
+@see cv::accumulateProduct
+*/
+CVAPI(void)  cvMultiplyAcc( const CvArr* image1, const CvArr* image2, CvArr* acc,
+                           const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @brief Adds image to accumulator with weights: acc = acc*(1-alpha) + image*alpha
+@see cv::accumulateWeighted
+*/
+CVAPI(void)  cvRunningAvg( const CvArr* image, CvArr* acc, double alpha,
+                          const CvArr* mask CV_DEFAULT(NULL) );
+
+/****************************************************************************************\
+*                                    Image Processing                                    *
+\****************************************************************************************/
+
+/** Copies source 2D array inside of the larger destination array and
+   makes a border of the specified type (IPL_BORDER_*) around the copied area. */
+CVAPI(void) cvCopyMakeBorder( const CvArr* src, CvArr* dst, CvPoint offset,
+                              int bordertype, CvScalar value CV_DEFAULT(cvScalarAll(0)));
+
+/** @brief Smooths the image in one of several ways.
+
+@param src The source image
+@param dst The destination image
+@param smoothtype Type of the smoothing, see SmoothMethod_c
+@param size1 The first parameter of the smoothing operation, the aperture width. Must be a
+positive odd number (1, 3, 5, ...)
+@param size2 The second parameter of the smoothing operation, the aperture height. Ignored by
+CV_MEDIAN and CV_BILATERAL methods. In the case of simple scaled/non-scaled and Gaussian blur if
+size2 is zero, it is set to size1. Otherwise it must be a positive odd number.
+@param sigma1 In the case of Gaussian smoothing, this parameter may specify the Gaussian \f$\sigma\f$
+(standard deviation). If it is zero, it is calculated from the kernel size:
+\f[\sigma  = 0.3 (n/2 - 1) + 0.8  \quad   \text{where}   \quad  n= \begin{array}{l l} \mbox{\texttt{size1} for horizontal kernel} \\ \mbox{\texttt{size2} for vertical kernel} \end{array}\f]
+Using standard sigma for small kernels ( \f$3\times 3\f$ to \f$7\times 7\f$ ) gives better speed. If
+sigma1 is not zero, while size1 and size2 are zeros, the kernel size is calculated from the
+sigma (to provide accurate enough operation).
+@param sigma2 additional parameter for bilateral filtering
+
+@see cv::GaussianBlur, cv::blur, cv::medianBlur, cv::bilateralFilter.
+ */
+CVAPI(void) cvSmooth( const CvArr* src, CvArr* dst,
+                      int smoothtype CV_DEFAULT(CV_GAUSSIAN),
+                      int size1 CV_DEFAULT(3),
+                      int size2 CV_DEFAULT(0),
+                      double sigma1 CV_DEFAULT(0),
+                      double sigma2 CV_DEFAULT(0));
+
+/** @brief Convolves an image with the kernel.
+
+@param src input image.
+@param dst output image of the same size and the same number of channels as src.
+@param kernel convolution kernel (or rather a correlation kernel), a single-channel floating point
+matrix; if you want to apply different kernels to different channels, split the image into
+separate color planes using split and process them individually.
+@param anchor anchor of the kernel that indicates the relative position of a filtered point within
+the kernel; the anchor should lie within the kernel; default value (-1,-1) means that the anchor
+is at the kernel center.
+
+@see cv::filter2D
+ */
+CVAPI(void) cvFilter2D( const CvArr* src, CvArr* dst, const CvMat* kernel,
+                        CvPoint anchor CV_DEFAULT(cvPoint(-1,-1)));
+
+/** @brief Finds integral image: SUM(X,Y) = sum(x<X,y<Y)I(x,y)
+@see cv::integral
+*/
+CVAPI(void) cvIntegral( const CvArr* image, CvArr* sum,
+                       CvArr* sqsum CV_DEFAULT(NULL),
+                       CvArr* tilted_sum CV_DEFAULT(NULL));
+
+/** @brief Smooths the input image with a Gaussian kernel and then down-samples it.
+
+   dst_width = floor(src_width/2)[+1],
+   dst_height = floor(src_height/2)[+1]
+   @see cv::pyrDown
+*/
+CVAPI(void)  cvPyrDown( const CvArr* src, CvArr* dst,
+                        int filter CV_DEFAULT(CV_GAUSSIAN_5x5) );
+
+/** @brief Up-samples the image and smooths the result with a Gaussian kernel.
+
+   dst_width = src_width*2,
+   dst_height = src_height*2
+   @see cv::pyrUp
+*/
+CVAPI(void)  cvPyrUp( const CvArr* src, CvArr* dst,
+                      int filter CV_DEFAULT(CV_GAUSSIAN_5x5) );
+
+/** @brief Builds pyramid for an image
+@see cv::buildPyramid
+*/
+CVAPI(CvMat**) cvCreatePyramid( const CvArr* img, int extra_layers, double rate,
+                                const CvSize* layer_sizes CV_DEFAULT(0),
+                                CvArr* bufarr CV_DEFAULT(0),
+                                int calc CV_DEFAULT(1),
+                                int filter CV_DEFAULT(CV_GAUSSIAN_5x5) );
+
+/** @brief Releases pyramid */
+CVAPI(void)  cvReleasePyramid( CvMat*** pyramid, int extra_layers );
+
+
+/** @brief Filters image using meanshift algorithm
+@see cv::pyrMeanShiftFiltering
+*/
+CVAPI(void) cvPyrMeanShiftFiltering( const CvArr* src, CvArr* dst,
+    double sp, double sr, int max_level CV_DEFAULT(1),
+    CvTermCriteria termcrit CV_DEFAULT(cvTermCriteria(CV_TERMCRIT_ITER+CV_TERMCRIT_EPS,5,1)));
+
+/** @brief Segments image using seed "markers"
+@see cv::watershed
+*/
+CVAPI(void) cvWatershed( const CvArr* image, CvArr* markers );
+
+/** @brief Calculates an image derivative using generalized Sobel
+
+   (aperture_size = 1,3,5,7) or Scharr (aperture_size = -1) operator.
+   Scharr can be used only for the first dx or dy derivative
+@see cv::Sobel
+*/
+CVAPI(void) cvSobel( const CvArr* src, CvArr* dst,
+                    int xorder, int yorder,
+                    int aperture_size CV_DEFAULT(3));
+
+/** @brief Calculates the image Laplacian: (d2/dx2 + d2/dy2)I
+@see cv::Laplacian
+*/
+CVAPI(void) cvLaplace( const CvArr* src, CvArr* dst,
+                      int aperture_size CV_DEFAULT(3) );
+
+/** @brief Converts input array pixels from one color space to another
+@see cv::cvtColor
+*/
+CVAPI(void)  cvCvtColor( const CvArr* src, CvArr* dst, int code );
+
+
+/** @brief Resizes image (input array is resized to fit the destination array)
+@see cv::resize
+*/
+CVAPI(void)  cvResize( const CvArr* src, CvArr* dst,
+                       int interpolation CV_DEFAULT( CV_INTER_LINEAR ));
+
+#ifdef _MSC_VER
+#pragma warning( push )
+#pragma warning( disable: 5054 )
+#endif
+/** @brief Warps image with affine transform
+@note ::cvGetQuadrangleSubPix is similar to ::cvWarpAffine, but the outliers are extrapolated using
+replication border mode.
+@see cv::warpAffine
+*/
+CVAPI(void)  cvWarpAffine( const CvArr* src, CvArr* dst, const CvMat* map_matrix,
+                           int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS),
+                           CvScalar fillval CV_DEFAULT(cvScalarAll(0)) );
+
+/** @brief Computes affine transform matrix for mapping src[i] to dst[i] (i=0,1,2)
+@see cv::getAffineTransform
+*/
+CVAPI(CvMat*) cvGetAffineTransform( const CvPoint2D32f * src,
+                                    const CvPoint2D32f * dst,
+                                    CvMat * map_matrix );
+
+/** @brief Computes the affine matrix of a 2D rotation
+@see cv::getRotationMatrix2D
+*/
+CVAPI(CvMat*)  cv2DRotationMatrix( CvPoint2D32f center, double angle,
+                                   double scale, CvMat* map_matrix );
+
+/** @brief Warps image with perspective (projective) transform
+@see cv::warpPerspective
+*/
+CVAPI(void)  cvWarpPerspective( const CvArr* src, CvArr* dst, const CvMat* map_matrix,
+                                int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS),
+                                CvScalar fillval CV_DEFAULT(cvScalarAll(0)) );
+
+/** @brief Computes perspective transform matrix for mapping src[i] to dst[i] (i=0,1,2,3)
+@see cv::getPerspectiveTransform
+*/
+CVAPI(CvMat*) cvGetPerspectiveTransform( const CvPoint2D32f* src,
+                                         const CvPoint2D32f* dst,
+                                         CvMat* map_matrix );
+
+/** @brief Performs generic geometric transformation using the specified coordinate maps
+@see cv::remap
+*/
+CVAPI(void)  cvRemap( const CvArr* src, CvArr* dst,
+                      const CvArr* mapx, const CvArr* mapy,
+                      int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS),
+                      CvScalar fillval CV_DEFAULT(cvScalarAll(0)) );
+
+/** @brief Converts mapx & mapy from floating-point to integer formats for cvRemap
+@see cv::convertMaps
+*/
+CVAPI(void)  cvConvertMaps( const CvArr* mapx, const CvArr* mapy,
+                            CvArr* mapxy, CvArr* mapalpha );
+
+/** @brief Performs forward or inverse log-polar image transform
+@see cv::warpPolar
+*/
+CVAPI(void)  cvLogPolar( const CvArr* src, CvArr* dst,
+                         CvPoint2D32f center, double M,
+                         int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS));
+
+/** @brief Performs forward or inverse linear-polar image transform
+@see cv::warpPolar
+*/
+CVAPI(void)  cvLinearPolar( const CvArr* src, CvArr* dst,
+                         CvPoint2D32f center, double maxRadius,
+                         int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS));
+
+#ifdef _MSC_VER
+#pragma warning( pop )
+#endif
+
+/** @brief Returns a structuring element of the specified size and shape for morphological operations.
+
+@note the created structuring element IplConvKernel\* element must be released in the end using
+`cvReleaseStructuringElement(&element)`.
+
+@param cols Width of the structuring element
+@param rows Height of the structuring element
+@param anchor_x x-coordinate of the anchor
+@param anchor_y y-coordinate of the anchor
+@param shape element shape that could be one of the cv::MorphShapes_c
+@param values integer array of cols*rows elements that specifies the custom shape of the
+structuring element, when shape=CV_SHAPE_CUSTOM.
+
+@see cv::getStructuringElement
+ */
+ CVAPI(IplConvKernel*)  cvCreateStructuringElementEx(
+            int cols, int  rows, int  anchor_x, int  anchor_y,
+            int shape, int* values CV_DEFAULT(NULL) );
+
+/** @brief releases structuring element
+@see cvCreateStructuringElementEx
+*/
+CVAPI(void)  cvReleaseStructuringElement( IplConvKernel** element );
+
+/** @brief erodes input image (applies minimum filter) one or more times.
+   If element pointer is NULL, 3x3 rectangular element is used
+@see cv::erode
+*/
+CVAPI(void)  cvErode( const CvArr* src, CvArr* dst,
+                      IplConvKernel* element CV_DEFAULT(NULL),
+                      int iterations CV_DEFAULT(1) );
+
+/** @brief dilates input image (applies maximum filter) one or more times.
+
+   If element pointer is NULL, 3x3 rectangular element is used
+@see cv::dilate
+*/
+CVAPI(void)  cvDilate( const CvArr* src, CvArr* dst,
+                       IplConvKernel* element CV_DEFAULT(NULL),
+                       int iterations CV_DEFAULT(1) );
+
+/** @brief Performs complex morphological transformation
+@see cv::morphologyEx
+*/
+CVAPI(void)  cvMorphologyEx( const CvArr* src, CvArr* dst,
+                             CvArr* temp, IplConvKernel* element,
+                             int operation, int iterations CV_DEFAULT(1) );
+
+/** @brief Calculates all spatial and central moments up to the 3rd order
+@see cv::moments
+*/
+CVAPI(void) cvMoments( const CvArr* arr, CvMoments* moments, int binary CV_DEFAULT(0));
+
+/** @brief Retrieve spatial moments */
+CVAPI(double)  cvGetSpatialMoment( CvMoments* moments, int x_order, int y_order );
+/** @brief Retrieve central moments */
+CVAPI(double)  cvGetCentralMoment( CvMoments* moments, int x_order, int y_order );
+/** @brief Retrieve normalized central moments */
+CVAPI(double)  cvGetNormalizedCentralMoment( CvMoments* moments,
+                                             int x_order, int y_order );
+
+/** @brief Calculates 7 Hu's invariants from precalculated spatial and central moments
+@see cv::HuMoments
+*/
+CVAPI(void) cvGetHuMoments( CvMoments*  moments, CvHuMoments*  hu_moments );
+
+/*********************************** data sampling **************************************/
+
+/** @brief Fetches pixels that belong to the specified line segment and stores them to the buffer.
+
+   Returns the number of retrieved points.
+@see cv::LineIterator
+*/
+CVAPI(int)  cvSampleLine( const CvArr* image, CvPoint pt1, CvPoint pt2, void* buffer,
+                          int connectivity CV_DEFAULT(8));
+
+/** @brief Retrieves the rectangular image region with specified center from the input array.
+
+ dst(x,y) <- src(x + center.x - dst_width/2, y + center.y - dst_height/2).
+ Values of pixels with fractional coordinates are retrieved using bilinear interpolation
+@see cv::getRectSubPix
+*/
+CVAPI(void)  cvGetRectSubPix( const CvArr* src, CvArr* dst, CvPoint2D32f center );
+
+
+/** @brief Retrieves quadrangle from the input array.
+
+    matrixarr = ( a11  a12 | b1 )   dst(x,y) <- src(A[x y]' + b)
+                ( a21  a22 | b2 )   (bilinear interpolation is used to retrieve pixels
+                                     with fractional coordinates)
+@see cvWarpAffine
+*/
+CVAPI(void)  cvGetQuadrangleSubPix( const CvArr* src, CvArr* dst,
+                                    const CvMat* map_matrix );
+
+/** @brief Measures similarity between template and overlapped windows in the source image
+   and fills the resultant image with the measurements
+@see cv::matchTemplate
+*/
+CVAPI(void)  cvMatchTemplate( const CvArr* image, const CvArr* templ,
+                              CvArr* result, int method );
+
+/** @brief Computes earth mover distance between
+   two weighted point sets (called signatures)
+@see cv::EMD
+*/
+CVAPI(float)  cvCalcEMD2( const CvArr* signature1,
+                          const CvArr* signature2,
+                          int distance_type,
+                          CvDistanceFunction distance_func CV_DEFAULT(NULL),
+                          const CvArr* cost_matrix CV_DEFAULT(NULL),
+                          CvArr* flow CV_DEFAULT(NULL),
+                          float* lower_bound CV_DEFAULT(NULL),
+                          void* userdata CV_DEFAULT(NULL));
+
+/****************************************************************************************\
+*                              Contours retrieving                                       *
+\****************************************************************************************/
+
+/** @brief Retrieves outer and optionally inner boundaries of white (non-zero) connected
+   components in the black (zero) background
+@see cv::findContours, cvStartFindContours, cvFindNextContour, cvSubstituteContour, cvEndFindContours
+*/
+CVAPI(int)  cvFindContours( CvArr* image, CvMemStorage* storage, CvSeq** first_contour,
+                            int header_size CV_DEFAULT(sizeof(CvContour)),
+                            int mode CV_DEFAULT(CV_RETR_LIST),
+                            int method CV_DEFAULT(CV_CHAIN_APPROX_SIMPLE),
+                            CvPoint offset CV_DEFAULT(cvPoint(0,0)));
+
+/** @brief Initializes contour retrieving process.
+
+   Calls cvStartFindContours.
+   Calls cvFindNextContour until null pointer is returned
+   or some other condition becomes true.
+   Calls cvEndFindContours at the end.
+@see cvFindContours
+*/
+CVAPI(CvContourScanner)  cvStartFindContours( CvArr* image, CvMemStorage* storage,
+                            int header_size CV_DEFAULT(sizeof(CvContour)),
+                            int mode CV_DEFAULT(CV_RETR_LIST),
+                            int method CV_DEFAULT(CV_CHAIN_APPROX_SIMPLE),
+                            CvPoint offset CV_DEFAULT(cvPoint(0,0)));
+
+/** @brief Retrieves next contour
+@see cvFindContours
+*/
+CVAPI(CvSeq*)  cvFindNextContour( CvContourScanner scanner );
+
+
+/** @brief Substitutes the last retrieved contour with the new one
+
+   (if the substitutor is null, the last retrieved contour is removed from the tree)
+@see cvFindContours
+*/
+CVAPI(void)   cvSubstituteContour( CvContourScanner scanner, CvSeq* new_contour );
+
+
+/** @brief Releases contour scanner and returns pointer to the first outer contour
+@see cvFindContours
+*/
+CVAPI(CvSeq*)  cvEndFindContours( CvContourScanner* scanner );
+
+/** @brief Approximates Freeman chain(s) with a polygonal curve.
+
+This is a standalone contour approximation routine, not represented in the new interface. When
+cvFindContours retrieves contours as Freeman chains, it calls the function to get approximated
+contours, represented as polygons.
+
+@param src_seq Pointer to the approximated Freeman chain that can refer to other chains.
+@param storage Storage location for the resulting polylines.
+@param method Approximation method (see the description of the function cvFindContours).
+@param parameter Method parameter (not used now).
+@param minimal_perimeter Approximates only those contours whose perimeters are not less than
+minimal_perimeter . Other chains are removed from the resulting structure.
+@param recursive Recursion flag. If it is non-zero, the function approximates all chains that can
+be obtained from chain by using the h_next or v_next links. Otherwise, the single input chain is
+approximated.
+@see cvStartReadChainPoints, cvReadChainPoint
+ */
+CVAPI(CvSeq*) cvApproxChains( CvSeq* src_seq, CvMemStorage* storage,
+                            int method CV_DEFAULT(CV_CHAIN_APPROX_SIMPLE),
+                            double parameter CV_DEFAULT(0),
+                            int  minimal_perimeter CV_DEFAULT(0),
+                            int  recursive CV_DEFAULT(0));
+
+/** @brief Initializes Freeman chain reader.
+
+   The reader is used to iteratively get coordinates of all the chain points.
+   If the Freeman codes should be read as is, a simple sequence reader should be used
+@see cvApproxChains
+*/
+CVAPI(void) cvStartReadChainPoints( CvChain* chain, CvChainPtReader* reader );
+
+/** @brief Retrieves the next chain point
+@see cvApproxChains
+*/
+CVAPI(CvPoint) cvReadChainPoint( CvChainPtReader* reader );
+
+
+/****************************************************************************************\
+*                            Contour Processing and Shape Analysis                       *
+\****************************************************************************************/
+
+/** @brief Approximates a single polygonal curve (contour) or
+   a tree of polygonal curves (contours)
+@see cv::approxPolyDP
+*/
+CVAPI(CvSeq*)  cvApproxPoly( const void* src_seq,
+                             int header_size, CvMemStorage* storage,
+                             int method, double eps,
+                             int recursive CV_DEFAULT(0));
+
+/** @brief Calculates perimeter of a contour or length of a part of contour
+@see cv::arcLength
+*/
+CVAPI(double)  cvArcLength( const void* curve,
+                            CvSlice slice CV_DEFAULT(CV_WHOLE_SEQ),
+                            int is_closed CV_DEFAULT(-1));
+
+/** @brief Calculates the perimeter of a closed contour; same as cvArcLength with slice=CV_WHOLE_SEQ and is_closed=1
+*/
+CV_INLINE double cvContourPerimeter( const void* contour )
+{
+    return cvArcLength( contour, CV_WHOLE_SEQ, 1 );
+}
+
+
+/** @brief Calculates contour bounding rectangle (update=1) or
+   just retrieves pre-calculated rectangle (update=0)
+@see cv::boundingRect
+*/
+CVAPI(CvRect)  cvBoundingRect( CvArr* points, int update CV_DEFAULT(0) );
+
+/** @brief Calculates area of a contour or contour segment
+@see cv::contourArea
+*/
+CVAPI(double)  cvContourArea( const CvArr* contour,
+                              CvSlice slice CV_DEFAULT(CV_WHOLE_SEQ),
+                              int oriented CV_DEFAULT(0));
+
+/** @brief Finds minimum area rotated rectangle bounding a set of points
+@see cv::minAreaRect
+*/
+CVAPI(CvBox2D)  cvMinAreaRect2( const CvArr* points,
+                                CvMemStorage* storage CV_DEFAULT(NULL));
+
+/** @brief Finds minimum enclosing circle for a set of points
+@see cv::minEnclosingCircle
+*/
+CVAPI(int)  cvMinEnclosingCircle( const CvArr* points,
+                                  CvPoint2D32f* center, float* radius );
+
+/** @brief Compares two contours by matching their moments
+@see cv::matchShapes
+*/
+CVAPI(double)  cvMatchShapes( const void* object1, const void* object2,
+                              int method, double parameter CV_DEFAULT(0));
+
+/** @brief Calculates exact convex hull of 2d point set
+@see cv::convexHull
+*/
+CVAPI(CvSeq*) cvConvexHull2( const CvArr* input,
+                             void* hull_storage CV_DEFAULT(NULL),
+                             int orientation CV_DEFAULT(CV_CLOCKWISE),
+                             int return_points CV_DEFAULT(0));
+
+/** @brief Checks whether the contour is convex or not (returns 1 if convex, 0 if not)
+@see cv::isContourConvex
+*/
+CVAPI(int)  cvCheckContourConvexity( const CvArr* contour );
+
+
+/** @brief Finds convexity defects for the contour
+@see cv::convexityDefects
+*/
+CVAPI(CvSeq*)  cvConvexityDefects( const CvArr* contour, const CvArr* convexhull,
+                                   CvMemStorage* storage CV_DEFAULT(NULL));
+
+/** @brief Fits ellipse into a set of 2d points
+@see cv::fitEllipse
+*/
+CVAPI(CvBox2D) cvFitEllipse2( const CvArr* points );
+
+/** @brief Finds minimum rectangle containing two given rectangles */
+CVAPI(CvRect)  cvMaxRect( const CvRect* rect1, const CvRect* rect2 );
+
+/** @brief Finds coordinates of the box vertices */
+CVAPI(void) cvBoxPoints( CvBox2D box, CvPoint2D32f pt[4] );
+
+/** @brief Initializes sequence header for a matrix (column or row vector) of points
+
+   a wrapper for cvMakeSeqHeaderForArray (it does not initialize bounding rectangle!!!) */
+CVAPI(CvSeq*) cvPointSeqFromMat( int seq_kind, const CvArr* mat,
+                                 CvContour* contour_header,
+                                 CvSeqBlock* block );
+
+/** @brief Checks whether the point is inside polygon, outside, on an edge (at a vertex).
+
+   Returns positive, negative or zero value, correspondingly.
+   Optionally, measures a signed distance between
+   the point and the nearest polygon edge (measure_dist=1)
+@see cv::pointPolygonTest
+*/
+CVAPI(double) cvPointPolygonTest( const CvArr* contour,
+                                  CvPoint2D32f pt, int measure_dist );
+
+/****************************************************************************************\
+*                                  Histogram functions                                   *
+\****************************************************************************************/
+
+/** @brief Creates a histogram.
+
+The function creates a histogram of the specified size and returns a pointer to the created
+histogram. If the array ranges is 0, the histogram bin ranges must be specified later via the
+function cvSetHistBinRanges. Though cvCalcHist and cvCalcBackProject may process 8-bit images
+without setting bin ranges, they assume they are equally spaced in 0 to 255 bins.
+
+@param dims Number of histogram dimensions.
+@param sizes Array of the histogram dimension sizes.
+@param type Histogram representation format. CV_HIST_ARRAY means that the histogram data is
+represented as a multi-dimensional dense array CvMatND. CV_HIST_SPARSE means that histogram data
+is represented as a multi-dimensional sparse array CvSparseMat.
+@param ranges Array of ranges for the histogram bins. Its meaning depends on the uniform parameter
+value. The ranges are used when the histogram is calculated or backprojected to determine which
+histogram bin corresponds to which value/tuple of values from the input image(s).
+@param uniform Uniformity flag. If not zero, the histogram has evenly spaced bins and for every
+\f$0<=i<dims\f$ ranges[i] is an array of two numbers: lower and upper boundaries for the i-th
+histogram dimension. The whole range [lower,upper] is then split into sizes[i] equal parts to
+determine the i-th input tuple value ranges for every histogram bin. And if uniform=0, then the
+i-th element of the ranges array contains sizes[i]+1 elements: \f$\texttt{lower}_0,
+\texttt{upper}_0, \texttt{lower}_1, \texttt{upper}_1 = \texttt{lower}_2,
+...
+\texttt{upper}_{sizes[i]-1}\f$ where \f$\texttt{lower}_j\f$ and \f$\texttt{upper}_j\f$ are lower
+and upper boundaries of the i-th input tuple value for the j-th bin, respectively. In either
+case, the input values that are beyond the specified range for a histogram bin are not counted
+by cvCalcHist and filled with 0 by cvCalcBackProject.
+ */
+CVAPI(CvHistogram*)  cvCreateHist( int dims, int* sizes, int type,
+                                   float** ranges CV_DEFAULT(NULL),
+                                   int uniform CV_DEFAULT(1));
+
+/** @brief Sets the bounds of the histogram bins.
+
+This is a standalone function for setting bin ranges in the histogram. For a more detailed
+description of the parameters ranges and uniform, see the cvCreateHist function that can initialize
+the ranges as well. Ranges for the histogram bins must be set before the histogram is calculated or
+the backproject of the histogram is calculated.
+
+@param hist Histogram.
+@param ranges Array of bin ranges arrays. See cvCreateHist for details.
+@param uniform Uniformity flag. See cvCreateHist for details.
+ */
+CVAPI(void)  cvSetHistBinRanges( CvHistogram* hist, float** ranges,
+                                int uniform CV_DEFAULT(1));
+
+/** @brief Makes a histogram out of an array.
+
+The function initializes the histogram, whose header and bins are allocated by the user.
+cvReleaseHist does not need to be called afterwards. Only dense histograms can be initialized this
+way. The function returns hist.
+
+@param dims Number of the histogram dimensions.
+@param sizes Array of the histogram dimension sizes.
+@param hist Histogram header initialized by the function.
+@param data Array used to store histogram bins.
+@param ranges Histogram bin ranges. See cvCreateHist for details.
+@param uniform Uniformity flag. See cvCreateHist for details.
+ */
+CVAPI(CvHistogram*)  cvMakeHistHeaderForArray(
+                            int  dims, int* sizes, CvHistogram* hist,
+                            float* data, float** ranges CV_DEFAULT(NULL),
+                            int uniform CV_DEFAULT(1));
+
+/** @brief Releases the histogram.
+
+The function releases the histogram (header and the data). The pointer to the histogram is cleared
+by the function. If \*hist pointer is already NULL, the function does nothing.
+
+@param hist Double pointer to the released histogram.
+ */
+CVAPI(void)  cvReleaseHist( CvHistogram** hist );
+
+/** @brief Clears the histogram.
+
+The function sets all of the histogram bins to 0 in case of a dense histogram and removes all
+histogram bins in case of a sparse array.
+
+@param hist Histogram.
+ */
+CVAPI(void)  cvClearHist( CvHistogram* hist );
+
+/** @brief Finds the minimum and maximum histogram bins.
+
+The function finds the minimum and maximum histogram bins and their positions. All of output
+arguments are optional. Among several extremas with the same value the ones with the minimum index
+(in the lexicographical order) are returned. In case of several maximums or minimums, the earliest
+in the lexicographical order (extrema locations) is returned.
+
+@param hist Histogram.
+@param min_value Pointer to the minimum value of the histogram.
+@param max_value Pointer to the maximum value of the histogram.
+@param min_idx Pointer to the array of coordinates for the minimum.
+@param max_idx Pointer to the array of coordinates for the maximum.
+ */
+CVAPI(void)  cvGetMinMaxHistValue( const CvHistogram* hist,
+                                   float* min_value, float* max_value,
+                                   int* min_idx CV_DEFAULT(NULL),
+                                   int* max_idx CV_DEFAULT(NULL));
+
+
+/** @brief Normalizes the histogram.
+
+The function normalizes the histogram bins by scaling them so that the sum of the bins becomes equal
+to factor.
+
+@param hist Pointer to the histogram.
+@param factor Normalization factor.
+ */
+CVAPI(void)  cvNormalizeHist( CvHistogram* hist, double factor );
+
+
+/** @brief Thresholds the histogram.
+
+The function clears histogram bins that are below the specified threshold.
+
+@param hist Pointer to the histogram.
+@param threshold Threshold level.
+ */
+CVAPI(void)  cvThreshHist( CvHistogram* hist, double threshold );
+
+
+/** Compares two histograms */
+CVAPI(double)  cvCompareHist( const CvHistogram* hist1,
+                              const CvHistogram* hist2,
+                              int method);
+
+/** @brief Copies a histogram.
+
+The function makes a copy of the histogram. If the second histogram pointer \*dst is NULL, a new
+histogram of the same size as src is created. Otherwise, both histograms must have equal types and
+sizes. Then the function copies the bin values of the source histogram to the destination histogram
+and sets the same bin value ranges as in src.
+
+@param src Source histogram.
+@param dst Pointer to the destination histogram.
+ */
+CVAPI(void)  cvCopyHist( const CvHistogram* src, CvHistogram** dst );
+
+
+/** @brief Calculates Bayesian probabilistic histograms
+   (each of src and dst is an array of _number_ histograms) */
+CVAPI(void)  cvCalcBayesianProb( CvHistogram** src, int number,
+                                CvHistogram** dst);
+
+/** @brief Calculates array histogram
+@see cv::calcHist
+*/
+CVAPI(void)  cvCalcArrHist( CvArr** arr, CvHistogram* hist,
+                            int accumulate CV_DEFAULT(0),
+                            const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @brief IplImage convenience wrapper: casts image to CvArr** and forwards all arguments to cvCalcArrHist. */
+CV_INLINE  void  cvCalcHist( IplImage** image, CvHistogram* hist,
+                             int accumulate CV_DEFAULT(0),
+                             const CvArr* mask CV_DEFAULT(NULL) )
+{
+    cvCalcArrHist( (CvArr**)image, hist, accumulate, mask );
+}
+
+/** @brief Calculates back project
+@see cvCalcBackProject, cv::calcBackProject
+*/
+CVAPI(void)  cvCalcArrBackProject( CvArr** image, CvArr* dst,
+                                   const CvHistogram* hist );
+
+#define  cvCalcBackProject(image, dst, hist) cvCalcArrBackProject((CvArr**)(image), dst, hist) /* image is parenthesized so the CvArr** cast covers the whole argument expression */
+
+
+/** @brief Locates a template within an image by using a histogram comparison.
+
+The function calculates the back projection by comparing histograms of the source image patches with
+the given histogram. The function is similar to matchTemplate, but instead of comparing the raster
+patch with all its possible positions within the search window, the function CalcBackProjectPatch
+compares histograms. See the algorithm diagram below:
+
+![image](pics/backprojectpatch.png)
+
+@param image Source images (though, you may pass CvMat\*\* as well).
+@param dst Destination image.
+@param range
+@param hist Histogram.
+@param method Comparison method passed to cvCompareHist (see the function description).
+@param factor Normalization factor for histograms that affects the normalization scale of the
+destination image. Pass 1 if not sure.
+
+@see cvCalcBackProjectPatch
+ */
+CVAPI(void)  cvCalcArrBackProjectPatch( CvArr** image, CvArr* dst, CvSize range,
+                                        CvHistogram* hist, int method,
+                                        double factor );
+
+#define  cvCalcBackProjectPatch( image, dst, range, hist, method, factor ) \
+     cvCalcArrBackProjectPatch( (CvArr**)(image), dst, range, hist, method, factor ) /* image is parenthesized so the CvArr** cast covers the whole argument expression */
+
+
+/** @brief Divides one histogram by another.
+
+The function calculates the object probability density from two histograms as:
+
+\f[\texttt{disthist} (I)= \forkthree{0}{if \(\texttt{hist1}(I)=0\)}{\texttt{scale}}{if \(\texttt{hist1}(I) \ne 0\) and \(\texttt{hist2}(I) > \texttt{hist1}(I)\)}{\frac{\texttt{hist2}(I) \cdot \texttt{scale}}{\texttt{hist1}(I)}}{if \(\texttt{hist1}(I) \ne 0\) and \(\texttt{hist2}(I) \le \texttt{hist1}(I)\)}\f]
+
+@param hist1 First histogram (the divisor).
+@param hist2 Second histogram.
+@param dst_hist Destination histogram.
+@param scale Scale factor for the destination histogram.
+ */
+CVAPI(void)  cvCalcProbDensity( const CvHistogram* hist1, const CvHistogram* hist2,
+                                CvHistogram* dst_hist, double scale CV_DEFAULT(255) );
+
+/** @brief equalizes histogram of 8-bit single-channel image
+@see cv::equalizeHist
+*/
+CVAPI(void)  cvEqualizeHist( const CvArr* src, CvArr* dst );
+
+
+/** @brief Applies distance transform to binary image
+@see cv::distanceTransform
+*/
+CVAPI(void)  cvDistTransform( const CvArr* src, CvArr* dst,
+                              int distance_type CV_DEFAULT(CV_DIST_L2),
+                              int mask_size CV_DEFAULT(3),
+                              const float* mask CV_DEFAULT(NULL),
+                              CvArr* labels CV_DEFAULT(NULL),
+                              int labelType CV_DEFAULT(CV_DIST_LABEL_CCOMP));
+
+
+/** @brief Applies fixed-level threshold to grayscale image.
+
+   This is a basic operation applied before retrieving contours
+@see cv::threshold
+*/
+CVAPI(double)  cvThreshold( const CvArr*  src, CvArr*  dst,
+                            double  threshold, double  max_value,
+                            int threshold_type );
+
+/** @brief Applies adaptive threshold to grayscale image.
+
+   The two parameters for methods CV_ADAPTIVE_THRESH_MEAN_C and
+   CV_ADAPTIVE_THRESH_GAUSSIAN_C are:
+   neighborhood size (3, 5, 7 etc.),
+   and a constant subtracted from mean (...,-3,-2,-1,0,1,2,3,...)
+@see cv::adaptiveThreshold
+*/
+CVAPI(void)  cvAdaptiveThreshold( const CvArr* src, CvArr* dst, double max_value,
+                                  int adaptive_method CV_DEFAULT(CV_ADAPTIVE_THRESH_MEAN_C),
+                                  int threshold_type CV_DEFAULT(CV_THRESH_BINARY),
+                                  int block_size CV_DEFAULT(3),
+                                  double param1 CV_DEFAULT(5));
+
+/** @brief Fills the connected component until the color difference gets large enough
+@see cv::floodFill
+*/
+CVAPI(void)  cvFloodFill( CvArr* image, CvPoint seed_point,
+                          CvScalar new_val, CvScalar lo_diff CV_DEFAULT(cvScalarAll(0)),
+                          CvScalar up_diff CV_DEFAULT(cvScalarAll(0)),
+                          CvConnectedComp* comp CV_DEFAULT(NULL),
+                          int flags CV_DEFAULT(4),
+                          CvArr* mask CV_DEFAULT(NULL));
+
+/****************************************************************************************\
+*                                  Feature detection                                     *
+\****************************************************************************************/
+
+/** @brief Runs canny edge detector
+@see cv::Canny
+*/
+CVAPI(void)  cvCanny( const CvArr* image, CvArr* edges, double threshold1,
+                      double threshold2, int  aperture_size CV_DEFAULT(3) );
+
+/** @brief Calculates constraint image for corner detection
+
+   Dx^2 * Dyy + Dxx * Dy^2 - 2 * Dx * Dy * Dxy.
+   Applying threshold to the result gives coordinates of corners
+@see cv::preCornerDetect
+*/
+CVAPI(void) cvPreCornerDetect( const CvArr* image, CvArr* corners,
+                               int aperture_size CV_DEFAULT(3) );
+
+/** @brief Calculates eigen values and vectors of 2x2
+   gradient covariation matrix at every image pixel
+@see cv::cornerEigenValsAndVecs
+*/
+CVAPI(void)  cvCornerEigenValsAndVecs( const CvArr* image, CvArr* eigenvv,
+                                       int block_size, int aperture_size CV_DEFAULT(3) );
+
+/** @brief Calculates minimal eigenvalue for 2x2 gradient covariation matrix at
+   every image pixel
+@see cv::cornerMinEigenVal
+*/
+CVAPI(void)  cvCornerMinEigenVal( const CvArr* image, CvArr* eigenval,
+                                  int block_size, int aperture_size CV_DEFAULT(3) );
+
+/** @brief Harris corner detector:
+
+   Calculates det(M) - k*(trace(M)^2), where M is 2x2 gradient covariation matrix for each pixel
+@see cv::cornerHarris
+*/
+CVAPI(void)  cvCornerHarris( const CvArr* image, CvArr* harris_response,
+                             int block_size, int aperture_size CV_DEFAULT(3),
+                             double k CV_DEFAULT(0.04) );
+
+/** @brief Adjust corner position using some sort of gradient search
+@see cv::cornerSubPix
+*/
+CVAPI(void)  cvFindCornerSubPix( const CvArr* image, CvPoint2D32f* corners,
+                                 int count, CvSize win, CvSize zero_zone,
+                                 CvTermCriteria  criteria );
+
+/** @brief Finds a sparse set of points within the selected region
+   that seem to be easy to track
+@see cv::goodFeaturesToTrack
+*/
+CVAPI(void)  cvGoodFeaturesToTrack( const CvArr* image, CvArr* eig_image,
+                                    CvArr* temp_image, CvPoint2D32f* corners,
+                                    int* corner_count, double  quality_level,
+                                    double  min_distance,
+                                    const CvArr* mask CV_DEFAULT(NULL),
+                                    int block_size CV_DEFAULT(3),
+                                    int use_harris CV_DEFAULT(0),
+                                    double k CV_DEFAULT(0.04) );
+
+/** @brief Finds lines on binary image using one of several methods.
+
+   line_storage is either memory storage or 1 x _max number of lines_ CvMat, its
+   number of columns is changed by the function.
+   method is one of CV_HOUGH_*;
+   rho, theta and threshold are used for each of those methods;
+   param1 ~ line length, param2 ~ line gap - for probabilistic,
+   param1 ~ srn, param2 ~ stn - for multi-scale
+@see cv::HoughLines
+*/
+CVAPI(CvSeq*)  cvHoughLines2( CvArr* image, void* line_storage, int method,
+                              double rho, double theta, int threshold,
+                              double param1 CV_DEFAULT(0), double param2 CV_DEFAULT(0),
+                              double min_theta CV_DEFAULT(0), double max_theta CV_DEFAULT(CV_PI));
+
+/** @brief Finds circles in the image
+@see cv::HoughCircles
+*/
+CVAPI(CvSeq*) cvHoughCircles( CvArr* image, void* circle_storage,
+                              int method, double dp, double min_dist,
+                              double param1 CV_DEFAULT(100),
+                              double param2 CV_DEFAULT(100),
+                              int min_radius CV_DEFAULT(0),
+                              int max_radius CV_DEFAULT(0));
+
+/** @brief Fits a line into set of 2d or 3d points in a robust way (M-estimator technique)
+@see cv::fitLine
+*/
+CVAPI(void)  cvFitLine( const CvArr* points, int dist_type, double param,
+                        double reps, double aeps, float* line );
+
+/****************************************************************************************\
+*                                     Drawing                                            *
+\****************************************************************************************/
+
+/****************************************************************************************\
+*       Drawing functions work with images/matrices of arbitrary type.                   *
+*       For color images the channel order is BGR[A]                                     *
+*       Antialiasing is supported only for 8-bit image now.                              *
+*       All the functions include parameter color that means rgb value (that may be      *
+*       constructed with CV_RGB macro) for color images and brightness                   *
+*       for grayscale images.                                                            *
+*       If a drawn figure is partially or completely outside of the image, it is clipped.*
+\****************************************************************************************/
+
+#define CV_FILLED -1
+
+#define CV_AA 16
+
+/** @brief Draws 4-connected, 8-connected or antialiased line segment connecting two points
+@see cv::line
+*/
+CVAPI(void)  cvLine( CvArr* img, CvPoint pt1, CvPoint pt2,
+                     CvScalar color, int thickness CV_DEFAULT(1),
+                     int line_type CV_DEFAULT(8), int shift CV_DEFAULT(0) );
+
+/** @brief Draws a rectangle given two opposite corners of the rectangle (pt1 & pt2)
+
+   if thickness<0 (e.g. thickness == CV_FILLED), the filled box is drawn
+@see cv::rectangle
+*/
+CVAPI(void)  cvRectangle( CvArr* img, CvPoint pt1, CvPoint pt2,
+                          CvScalar color, int thickness CV_DEFAULT(1),
+                          int line_type CV_DEFAULT(8),
+                          int shift CV_DEFAULT(0));
+
+/** @brief Draws a rectangle specified by a CvRect structure
+@see cv::rectangle
+*/
+CVAPI(void)  cvRectangleR( CvArr* img, CvRect r,
+                           CvScalar color, int thickness CV_DEFAULT(1),
+                           int line_type CV_DEFAULT(8),
+                           int shift CV_DEFAULT(0));
+
+
+/** @brief Draws a circle with specified center and radius.
+
+   Thickness works in the same way as with cvRectangle
+@see cv::circle
+*/
+CVAPI(void)  cvCircle( CvArr* img, CvPoint center, int radius,
+                       CvScalar color, int thickness CV_DEFAULT(1),
+                       int line_type CV_DEFAULT(8), int shift CV_DEFAULT(0));
+
+/** @brief Draws ellipse outline, filled ellipse, elliptic arc or filled elliptic sector
+
+   depending on _thickness_, _start_angle_ and _end_angle_ parameters. The resultant figure
+   is rotated by _angle_. All the angles are in degrees
+@see cv::ellipse
+*/
+CVAPI(void)  cvEllipse( CvArr* img, CvPoint center, CvSize axes,
+                        double angle, double start_angle, double end_angle,
+                        CvScalar color, int thickness CV_DEFAULT(1),
+                        int line_type CV_DEFAULT(8), int shift CV_DEFAULT(0));
+
+/** @brief Draws the full (0..360 degrees) ellipse whose center, rotation and axes (half of box.size) are taken from box. */
+CV_INLINE  void  cvEllipseBox( CvArr* img, CvBox2D box, CvScalar color,
+                               int thickness CV_DEFAULT(1),
+                               int line_type CV_DEFAULT(8), int shift CV_DEFAULT(0) )
+{
+    const int half_width = cvRound(box.size.width*0.5);
+    const int half_height = cvRound(box.size.height*0.5);
+    const CvPoint center = cvPointFrom32f( box.center );
+
+    cvEllipse( img, center, cvSize( half_width, half_height ), box.angle,
+               0, 360, color, thickness, line_type, shift );
+}
+
+/** @brief Fills convex or monotonous polygon.
+@see cv::fillConvexPoly
+*/
+CVAPI(void)  cvFillConvexPoly( CvArr* img, const CvPoint* pts, int npts, CvScalar color,
+                               int line_type CV_DEFAULT(8), int shift CV_DEFAULT(0));
+
+/** @brief Fills an area bounded by one or more arbitrary polygons
+@see cv::fillPoly
+*/
+CVAPI(void)  cvFillPoly( CvArr* img, CvPoint** pts, const int* npts,
+                         int contours, CvScalar color,
+                         int line_type CV_DEFAULT(8), int shift CV_DEFAULT(0) );
+
+/** @brief Draws one or more polygonal curves
+@see cv::polylines
+*/
+CVAPI(void)  cvPolyLine( CvArr* img, CvPoint** pts, const int* npts, int contours,
+                         int is_closed, CvScalar color, int thickness CV_DEFAULT(1),
+                         int line_type CV_DEFAULT(8), int shift CV_DEFAULT(0) );
+
+#define cvDrawRect cvRectangle
+#define cvDrawLine cvLine
+#define cvDrawCircle cvCircle
+#define cvDrawEllipse cvEllipse
+#define cvDrawPolyLine cvPolyLine
+
+/** @brief Clips the line segment connecting *pt1 and *pt2
+   by the rectangular window
+
+   (0<=x<img_size.width, 0<=y<img_size.height).
+@see cv::clipLine
+*/
+CVAPI(int) cvClipLine( CvSize img_size, CvPoint* pt1, CvPoint* pt2 );
+
+/** @brief Initializes line iterator.
+
+Initially, line_iterator->ptr will point to pt1 (or pt2, see left_to_right description) location in
+the image. Returns the number of pixels on the line between the ending points.
+@see cv::LineIterator
+*/
+CVAPI(int)  cvInitLineIterator( const CvArr* image, CvPoint pt1, CvPoint pt2,
+                                CvLineIterator* line_iterator,
+                                int connectivity CV_DEFAULT(8),
+                                int left_to_right CV_DEFAULT(0));
+
+#define CV_NEXT_LINE_POINT( line_iterator ) /* updates (line_iterator).err and (line_iterator).ptr in place */ \
+{   /* mask is -1 (all bits set) when err < 0, else 0; it selects whether the plus_* terms are added */ \
+    int _line_iterator_mask = (line_iterator).err < 0 ? -1 : 0; \
+    (line_iterator).err += (line_iterator).minus_delta +        \
+        ((line_iterator).plus_delta & _line_iterator_mask);     \
+    (line_iterator).ptr += (line_iterator).minus_step +         \
+        ((line_iterator).plus_step & _line_iterator_mask);      \
+}   /* NOTE: line_iterator is expanded several times; pass an expression without side effects */
+
+
+#define CV_FONT_HERSHEY_SIMPLEX         0
+#define CV_FONT_HERSHEY_PLAIN           1
+#define CV_FONT_HERSHEY_DUPLEX          2
+#define CV_FONT_HERSHEY_COMPLEX         3
+#define CV_FONT_HERSHEY_TRIPLEX         4
+#define CV_FONT_HERSHEY_COMPLEX_SMALL   5
+#define CV_FONT_HERSHEY_SCRIPT_SIMPLEX  6
+#define CV_FONT_HERSHEY_SCRIPT_COMPLEX  7
+
+#define CV_FONT_ITALIC                 16
+
+#define CV_FONT_VECTOR0    CV_FONT_HERSHEY_SIMPLEX
+
+
+/** Font structure (initialized by cvInitFont). Notes prefixed "Qt:" presumably describe how a Qt-based backend reuses the field - TODO confirm. */
+typedef struct CvFont
+{
+  const char* nameFont;   //!< Qt: nameFont
+  CvScalar color;       //!< Qt: ColorFont -> cvScalar(blue_component, green_component, red_component[, alpha_component])
+    int         font_face;    //!< =CV_FONT_* ; Qt: bool italic
+    const int*  ascii;      //!< font data and metrics
+    const int*  greek;
+    const int*  cyrillic;
+    float       hscale, vscale; //!< horizontal and vertical scale (see cvInitFont)
+    float       shear;      //!< slope coefficient: 0 - normal, >0 - italic
+    int         thickness;    //!< letters thickness; Qt: weight
+    float       dx;       //!< horizontal interval between letters
+    int         line_type;    //!< Qt: PointSize
+}
+CvFont;
+
+/** @brief Initializes font structure (OpenCV 1.x API).
+
+The function initializes the font structure that can be passed to text rendering functions.
+
+@param font Pointer to the font structure initialized by the function
+@param font_face Font name identifier. See cv::HersheyFonts and corresponding old CV_* identifiers.
+@param hscale Horizontal scale. If equal to 1.0f , the characters have the original width
+depending on the font type. If equal to 0.5f , the characters are of half the original width.
+@param vscale Vertical scale. If equal to 1.0f , the characters have the original height depending
+on the font type. If equal to 0.5f , the characters are of half the original height.
+@param shear Approximate tangent of the character slope relative to the vertical line. A zero
+value means a non-italic font, 1.0f means about a 45 degree slope, etc.
+@param thickness Thickness of the text strokes
+@param line_type Type of the strokes, see line description
+
+@sa cvPutText
+ */
+CVAPI(void)  cvInitFont( CvFont* font, int font_face,
+                         double hscale, double vscale,
+                         double shear CV_DEFAULT(0),
+                         int thickness CV_DEFAULT(1),
+                         int line_type CV_DEFAULT(8));
+
+CV_INLINE CvFont cvFont( double scale, int thickness CV_DEFAULT(1) ) /* shortcut around cvInitFont */
+{
+    CvFont font; /* handed to cvInitFont() below for initialization */
+    cvInitFont( &font, CV_FONT_HERSHEY_PLAIN, scale, scale, 0, thickness, CV_AA ); /* hscale == vscale == scale, shear 0, line type CV_AA */
+    return font; /* returned by value */
+}
+
+/** @brief Renders text stroke with specified font and color at specified location.
+   CvFont should be initialized with cvInitFont
+@see cvInitFont, cvGetTextSize, cvFont, cv::putText
+*/
+CVAPI(void)  cvPutText( CvArr* img, const char* text, CvPoint org,
+                        const CvFont* font, CvScalar color );
+
+/** @brief Calculates bounding box of text stroke (useful for alignment)
+@see cv::getTextSize
+*/
+CVAPI(void)  cvGetTextSize( const char* text_string, const CvFont* font,
+                            CvSize* text_size, int* baseline );
+
+/** @brief Unpacks color value
+
+if arrtype is CV_8UC?, _color_ is treated as packed color value, otherwise the first channels
+(depending on arrtype) of destination scalar are set to the same value = _color_
+*/
+CVAPI(CvScalar)  cvColorToScalar( double packed_color, int arrtype );
+
+/** @brief Returns the polygon points which make up the given ellipse.
+
+The ellipse is defined by the box of size 'axes' rotated 'angle' around the 'center'. A partial
+sweep of the ellipse arc can be done by specifying arc_start and arc_end to be something other than
+0 and 360, respectively. The input array 'pts' must be large enough to hold the result. The total
+number of points stored into 'pts' is returned by this function.
+@see cv::ellipse2Poly
+*/
+CVAPI(int) cvEllipse2Poly( CvPoint center, CvSize axes,
+                 int angle, int arc_start, int arc_end, CvPoint * pts, int delta );
+
+/** @brief Draws contour outlines or filled interiors on the image
+@see cv::drawContours
+*/
+CVAPI(void)  cvDrawContours( CvArr *img, CvSeq* contour,
+                             CvScalar external_color, CvScalar hole_color,
+                             int max_level, int thickness CV_DEFAULT(1),
+                             int line_type CV_DEFAULT(8),
+                             CvPoint offset CV_DEFAULT(cvPoint(0,0)));
+
+/** @} */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/segmentation.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/segmentation.hpp
new file mode 100644
index 000000000000..c40d5011ee3a
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/segmentation.hpp
@@ -0,0 +1,141 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_IMGPROC_SEGMENTATION_HPP
+#define OPENCV_IMGPROC_SEGMENTATION_HPP
+
+#include "opencv2/imgproc.hpp"
+
+namespace cv {
+
+namespace segmentation {
+
+//! @addtogroup imgproc_segmentation
+//! @{
+
+
+/** @brief Intelligent Scissors image segmentation
+ *
+ * This class is used to find the path (contour) between two points
+ * which can be used for image segmentation.
+ *
+ * Usage example:
+ * @snippet snippets/imgproc_segmentation.cpp usage_example_intelligent_scissors
+ *
+ * Reference: <a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.138.3811&rep=rep1&type=pdf">"Intelligent Scissors for Image Composition"</a>
+ * algorithm designed by Eric N. Mortensen and William A. Barrett, Brigham Young University
+ * @cite Mortensen95intelligentscissors
+ */
+class CV_EXPORTS_W_SIMPLE IntelligentScissorsMB
+{
+public:
+    CV_WRAP
+    IntelligentScissorsMB();
+
+    /** @brief Specify weights of feature functions
+     *
+     * Consider keeping weights normalized (sum of weights equals to 1.0)
+     * Discrete dynamic programming (DP) goal is minimization of costs between pixels.
+     *
+     * @param weight_non_edge Specify cost of non-edge pixels (default: 0.43f)
+     * @param weight_gradient_direction Specify cost of gradient direction function (default: 0.43f)
+     * @param weight_gradient_magnitude Specify cost of gradient magnitude function (default: 0.14f)
+     */
+    CV_WRAP
+    IntelligentScissorsMB& setWeights(float weight_non_edge, float weight_gradient_direction, float weight_gradient_magnitude);
+
+    /** @brief Specify gradient magnitude max value threshold
+     *
+     * Zero limit value is used to disable gradient magnitude thresholding (default behavior, as described in original article).
+     * Otherwise pixels with `gradient magnitude >= threshold` have zero cost.
+     *
+     * @note Thresholding should be used for images with irregular regions (to avoid getting stuck on parameters from high-contrast areas, like embedded logos).
+     *
+     * @param gradient_magnitude_threshold_max Specify gradient magnitude max value threshold (default: 0, disabled)
+     */
+    CV_WRAP
+    IntelligentScissorsMB& setGradientMagnitudeMaxLimit(float gradient_magnitude_threshold_max = 0.0f);
+
+    /** @brief Switch to "Laplacian Zero-Crossing" edge feature extractor and specify its parameters
+     *
+     * This feature extractor is used by default according to the article.
+     *
+     * Implementation has additional filtering for regions with low-amplitude noise.
+     * This filtering is enabled through parameter of minimal gradient amplitude (use some small value 4, 8, 16).
+     *
+     * @note Current implementation of this feature extractor is based on processing of grayscale images (color image is converted to grayscale image first).
+     *
+     * @note Canny edge detector is a bit slower, but provides better results (especially on color images): use setEdgeFeatureCannyParameters().
+     *
+     * @param gradient_magnitude_min_value Minimal gradient magnitude value for edge pixels (default: 0, check is disabled)
+     */
+    CV_WRAP
+    IntelligentScissorsMB& setEdgeFeatureZeroCrossingParameters(float gradient_magnitude_min_value = 0.0f);
+
+    /** @brief Switch edge feature extractor to use Canny edge detector
+     *
+     * @note "Laplacian Zero-Crossing" feature extractor is used by default (following the original article)
+     *
+     * @sa Canny
+     */
+    CV_WRAP
+    IntelligentScissorsMB& setEdgeFeatureCannyParameters(
+            double threshold1, double threshold2,
+            int apertureSize = 3, bool L2gradient = false
+    );
+
+    /** @brief Specify input image and extract image features
+     *
+     * @param image input image. Type is #CV_8UC1 / #CV_8UC3
+     */
+    CV_WRAP
+    IntelligentScissorsMB& applyImage(InputArray image);
+
+    /** @brief Specify custom features of input image
+     *
+     * Customized advanced variant of applyImage() call.
+     *
+     * @param non_edge Specify cost of non-edge pixels. Type is CV_8UC1. Expected values are `{0, 1}`.
+     * @param gradient_direction Specify gradient direction feature. Type is CV_32FC2. Values are expected to be normalized: `x^2 + y^2 == 1`
+     * @param gradient_magnitude Specify cost of gradient magnitude function: Type is CV_32FC1. Values should be in range `[0, 1]`.
+     * @param image **Optional parameter**. Must be specified if subset of features is specified (non-specified features are calculated internally)
+     */
+    CV_WRAP
+    IntelligentScissorsMB& applyImageFeatures(
+            InputArray non_edge, InputArray gradient_direction, InputArray gradient_magnitude,
+            InputArray image = noArray()
+    );
+
+    /** @brief Prepares a map of optimal paths for the given source point on the image
+     *
+     * @note applyImage() / applyImageFeatures() must be called before this call
+     *
+     * @param sourcePt The source point used to find the paths
+     */
+    CV_WRAP void buildMap(const Point& sourcePt);
+
+    /** @brief Extracts optimal contour for the given target point on the image
+     *
+     * @note buildMap() must be called before this call
+     *
+     * @param targetPt The target point
+     * @param[out] contour The list of pixels which contains optimal path between the source and the target points of the image. Type is CV_32SC2 (compatible with `std::vector<Point>`)
+     * @param backward Flag to indicate reverse order of retrieved pixels (use "true" value to fetch points from the target to the source point)
+     */
+    CV_WRAP void getContour(const Point& targetPt, OutputArray contour, bool backward = false) const;
+
+#ifndef CV_DOXYGEN
+    struct Impl;  // only forward-declared here; the definition lives outside this header
+    inline Impl* getImpl() const { return impl.get(); }  // raw, non-owning pointer; ownership stays with impl
+protected:
+    std::shared_ptr<Impl> impl;  // opaque implementation object
+#endif
+};
+
+//! @}
+
+}  // namespace segmentation
+}  // namespace cv
+
+#endif // OPENCV_IMGPROC_SEGMENTATION_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/types_c.h b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/types_c.h
new file mode 100644
index 000000000000..255ed0c37f65
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/imgproc/types_c.h
@@ -0,0 +1,660 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_IMGPROC_TYPES_C_H
+#define OPENCV_IMGPROC_TYPES_C_H
+
+#include "opencv2/core/core_c.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup imgproc_c
+  @{
+*/
+
+/** Connected component structure */
+typedef struct CvConnectedComp
+{
+    double area;    /**<area of the connected component  */
+    CvScalar value; /**<average color of the connected component */
+    CvRect rect;    /**<ROI of the component  */
+    CvSeq* contour; /**<optional component boundary
+                      (the contour might have child contours corresponding to the holes)*/
+}
+CvConnectedComp;
+
+/** Image smooth methods */
+enum SmoothMethod_c
+{
+    /** linear convolution with \f$\texttt{size1}\times\texttt{size2}\f$ box kernel (all 1's). If
+    you want to smooth different pixels with different-size box kernels, you can use the integral
+    image that is computed using integral */
+    CV_BLUR_NO_SCALE =0,
+    /** linear convolution with \f$\texttt{size1}\times\texttt{size2}\f$ box kernel (all
+    1's) with subsequent scaling by \f$1/(\texttt{size1}\cdot\texttt{size2})\f$ */
+    CV_BLUR  =1,
+    /** linear convolution with a \f$\texttt{size1}\times\texttt{size2}\f$ Gaussian kernel */
+    CV_GAUSSIAN  =2,
+    /** median filter with a \f$\texttt{size1}\times\texttt{size1}\f$ square aperture */
+    CV_MEDIAN =3,
+    /** bilateral filter with a \f$\texttt{size1}\times\texttt{size1}\f$ square aperture, color
+    sigma= sigma1 and spatial sigma= sigma2. If size1=0, the aperture square side is set to
+    cvRound(sigma2\*1.5)\*2+1. See cv::bilateralFilter */
+    CV_BILATERAL =4
+};
+
+/** Filters used in pyramid decomposition */
+enum
+{
+    CV_GAUSSIAN_5x5 = 7
+};
+
+/** Special filters */
+enum
+{
+    CV_SCHARR =-1,
+    CV_MAX_SOBEL_KSIZE =7
+};
+
+/** Constants for color conversion */
+enum
+{
+    CV_BGR2BGRA    =0,
+    CV_RGB2RGBA    =CV_BGR2BGRA,
+
+    CV_BGRA2BGR    =1,
+    CV_RGBA2RGB    =CV_BGRA2BGR,
+
+    CV_BGR2RGBA    =2,
+    CV_RGB2BGRA    =CV_BGR2RGBA,
+
+    CV_RGBA2BGR    =3,
+    CV_BGRA2RGB    =CV_RGBA2BGR,
+
+    CV_BGR2RGB     =4,
+    CV_RGB2BGR     =CV_BGR2RGB,
+
+    CV_BGRA2RGBA   =5,
+    CV_RGBA2BGRA   =CV_BGRA2RGBA,
+
+    CV_BGR2GRAY    =6,
+    CV_RGB2GRAY    =7,
+    CV_GRAY2BGR    =8,
+    CV_GRAY2RGB    =CV_GRAY2BGR,
+    CV_GRAY2BGRA   =9,
+    CV_GRAY2RGBA   =CV_GRAY2BGRA,
+    CV_BGRA2GRAY   =10,
+    CV_RGBA2GRAY   =11,
+
+    CV_BGR2BGR565  =12,
+    CV_RGB2BGR565  =13,
+    CV_BGR5652BGR  =14,
+    CV_BGR5652RGB  =15,
+    CV_BGRA2BGR565 =16,
+    CV_RGBA2BGR565 =17,
+    CV_BGR5652BGRA =18,
+    CV_BGR5652RGBA =19,
+
+    CV_GRAY2BGR565 =20,
+    CV_BGR5652GRAY =21,
+
+    CV_BGR2BGR555  =22,
+    CV_RGB2BGR555  =23,
+    CV_BGR5552BGR  =24,
+    CV_BGR5552RGB  =25,
+    CV_BGRA2BGR555 =26,
+    CV_RGBA2BGR555 =27,
+    CV_BGR5552BGRA =28,
+    CV_BGR5552RGBA =29,
+
+    CV_GRAY2BGR555 =30,
+    CV_BGR5552GRAY =31,
+
+    CV_BGR2XYZ     =32,
+    CV_RGB2XYZ     =33,
+    CV_XYZ2BGR     =34,
+    CV_XYZ2RGB     =35,
+
+    CV_BGR2YCrCb   =36,
+    CV_RGB2YCrCb   =37,
+    CV_YCrCb2BGR   =38,
+    CV_YCrCb2RGB   =39,
+
+    CV_BGR2HSV     =40,
+    CV_RGB2HSV     =41,
+
+    CV_BGR2Lab     =44,
+    CV_RGB2Lab     =45,
+
+    CV_BayerBG2BGR =46,
+    CV_BayerGB2BGR =47,
+    CV_BayerRG2BGR =48,
+    CV_BayerGR2BGR =49,
+
+    CV_BayerBG2RGB =CV_BayerRG2BGR,
+    CV_BayerGB2RGB =CV_BayerGR2BGR,
+    CV_BayerRG2RGB =CV_BayerBG2BGR,
+    CV_BayerGR2RGB =CV_BayerGB2BGR,
+
+    CV_BGR2Luv     =50,
+    CV_RGB2Luv     =51,
+    CV_BGR2HLS     =52,
+    CV_RGB2HLS     =53,
+
+    CV_HSV2BGR     =54,
+    CV_HSV2RGB     =55,
+
+    CV_Lab2BGR     =56,
+    CV_Lab2RGB     =57,
+    CV_Luv2BGR     =58,
+    CV_Luv2RGB     =59,
+    CV_HLS2BGR     =60,
+    CV_HLS2RGB     =61,
+
+    CV_BayerBG2BGR_VNG =62,
+    CV_BayerGB2BGR_VNG =63,
+    CV_BayerRG2BGR_VNG =64,
+    CV_BayerGR2BGR_VNG =65,
+
+    CV_BayerBG2RGB_VNG =CV_BayerRG2BGR_VNG,
+    CV_BayerGB2RGB_VNG =CV_BayerGR2BGR_VNG,
+    CV_BayerRG2RGB_VNG =CV_BayerBG2BGR_VNG,
+    CV_BayerGR2RGB_VNG =CV_BayerGB2BGR_VNG,
+
+    CV_BGR2HSV_FULL = 66,
+    CV_RGB2HSV_FULL = 67,
+    CV_BGR2HLS_FULL = 68,
+    CV_RGB2HLS_FULL = 69,
+
+    CV_HSV2BGR_FULL = 70,
+    CV_HSV2RGB_FULL = 71,
+    CV_HLS2BGR_FULL = 72,
+    CV_HLS2RGB_FULL = 73,
+
+    CV_LBGR2Lab     = 74,
+    CV_LRGB2Lab     = 75,
+    CV_LBGR2Luv     = 76,
+    CV_LRGB2Luv     = 77,
+
+    CV_Lab2LBGR     = 78,
+    CV_Lab2LRGB     = 79,
+    CV_Luv2LBGR     = 80,
+    CV_Luv2LRGB     = 81,
+
+    CV_BGR2YUV      = 82,
+    CV_RGB2YUV      = 83,
+    CV_YUV2BGR      = 84,
+    CV_YUV2RGB      = 85,
+
+    CV_BayerBG2GRAY = 86,
+    CV_BayerGB2GRAY = 87,
+    CV_BayerRG2GRAY = 88,
+    CV_BayerGR2GRAY = 89,
+
+    //YUV 4:2:0 formats family
+    CV_YUV2RGB_NV12 = 90,
+    CV_YUV2BGR_NV12 = 91,
+    CV_YUV2RGB_NV21 = 92,
+    CV_YUV2BGR_NV21 = 93,
+    CV_YUV420sp2RGB = CV_YUV2RGB_NV21,
+    CV_YUV420sp2BGR = CV_YUV2BGR_NV21,
+
+    CV_YUV2RGBA_NV12 = 94,
+    CV_YUV2BGRA_NV12 = 95,
+    CV_YUV2RGBA_NV21 = 96,
+    CV_YUV2BGRA_NV21 = 97,
+    CV_YUV420sp2RGBA = CV_YUV2RGBA_NV21,
+    CV_YUV420sp2BGRA = CV_YUV2BGRA_NV21,
+
+    CV_YUV2RGB_YV12 = 98,
+    CV_YUV2BGR_YV12 = 99,
+    CV_YUV2RGB_IYUV = 100,
+    CV_YUV2BGR_IYUV = 101,
+    CV_YUV2RGB_I420 = CV_YUV2RGB_IYUV,
+    CV_YUV2BGR_I420 = CV_YUV2BGR_IYUV,
+    CV_YUV420p2RGB = CV_YUV2RGB_YV12,
+    CV_YUV420p2BGR = CV_YUV2BGR_YV12,
+
+    CV_YUV2RGBA_YV12 = 102,
+    CV_YUV2BGRA_YV12 = 103,
+    CV_YUV2RGBA_IYUV = 104,
+    CV_YUV2BGRA_IYUV = 105,
+    CV_YUV2RGBA_I420 = CV_YUV2RGBA_IYUV,
+    CV_YUV2BGRA_I420 = CV_YUV2BGRA_IYUV,
+    CV_YUV420p2RGBA = CV_YUV2RGBA_YV12,
+    CV_YUV420p2BGRA = CV_YUV2BGRA_YV12,
+
+    CV_YUV2GRAY_420 = 106,
+    CV_YUV2GRAY_NV21 = CV_YUV2GRAY_420,
+    CV_YUV2GRAY_NV12 = CV_YUV2GRAY_420,
+    CV_YUV2GRAY_YV12 = CV_YUV2GRAY_420,
+    CV_YUV2GRAY_IYUV = CV_YUV2GRAY_420,
+    CV_YUV2GRAY_I420 = CV_YUV2GRAY_420,
+    CV_YUV420sp2GRAY = CV_YUV2GRAY_420,
+    CV_YUV420p2GRAY = CV_YUV2GRAY_420,
+
+    //YUV 4:2:2 formats family
+    CV_YUV2RGB_UYVY = 107,
+    CV_YUV2BGR_UYVY = 108,
+    //CV_YUV2RGB_VYUY = 109,
+    //CV_YUV2BGR_VYUY = 110,
+    CV_YUV2RGB_Y422 = CV_YUV2RGB_UYVY,
+    CV_YUV2BGR_Y422 = CV_YUV2BGR_UYVY,
+    CV_YUV2RGB_UYNV = CV_YUV2RGB_UYVY,
+    CV_YUV2BGR_UYNV = CV_YUV2BGR_UYVY,
+
+    CV_YUV2RGBA_UYVY = 111,
+    CV_YUV2BGRA_UYVY = 112,
+    //CV_YUV2RGBA_VYUY = 113,
+    //CV_YUV2BGRA_VYUY = 114,
+    CV_YUV2RGBA_Y422 = CV_YUV2RGBA_UYVY,
+    CV_YUV2BGRA_Y422 = CV_YUV2BGRA_UYVY,
+    CV_YUV2RGBA_UYNV = CV_YUV2RGBA_UYVY,
+    CV_YUV2BGRA_UYNV = CV_YUV2BGRA_UYVY,
+
+    CV_YUV2RGB_YUY2 = 115,
+    CV_YUV2BGR_YUY2 = 116,
+    CV_YUV2RGB_YVYU = 117,
+    CV_YUV2BGR_YVYU = 118,
+    CV_YUV2RGB_YUYV = CV_YUV2RGB_YUY2,
+    CV_YUV2BGR_YUYV = CV_YUV2BGR_YUY2,
+    CV_YUV2RGB_YUNV = CV_YUV2RGB_YUY2,
+    CV_YUV2BGR_YUNV = CV_YUV2BGR_YUY2,
+
+    CV_YUV2RGBA_YUY2 = 119,
+    CV_YUV2BGRA_YUY2 = 120,
+    CV_YUV2RGBA_YVYU = 121,
+    CV_YUV2BGRA_YVYU = 122,
+    CV_YUV2RGBA_YUYV = CV_YUV2RGBA_YUY2,
+    CV_YUV2BGRA_YUYV = CV_YUV2BGRA_YUY2,
+    CV_YUV2RGBA_YUNV = CV_YUV2RGBA_YUY2,
+    CV_YUV2BGRA_YUNV = CV_YUV2BGRA_YUY2,
+
+    CV_YUV2GRAY_UYVY = 123,
+    CV_YUV2GRAY_YUY2 = 124,
+    //CV_YUV2GRAY_VYUY = CV_YUV2GRAY_UYVY,
+    CV_YUV2GRAY_Y422 = CV_YUV2GRAY_UYVY,
+    CV_YUV2GRAY_UYNV = CV_YUV2GRAY_UYVY,
+    CV_YUV2GRAY_YVYU = CV_YUV2GRAY_YUY2,
+    CV_YUV2GRAY_YUYV = CV_YUV2GRAY_YUY2,
+    CV_YUV2GRAY_YUNV = CV_YUV2GRAY_YUY2,
+
+    // alpha premultiplication
+    CV_RGBA2mRGBA = 125,
+    CV_mRGBA2RGBA = 126,
+
+    CV_RGB2YUV_I420 = 127,
+    CV_BGR2YUV_I420 = 128,
+    CV_RGB2YUV_IYUV = CV_RGB2YUV_I420,
+    CV_BGR2YUV_IYUV = CV_BGR2YUV_I420,
+
+    CV_RGBA2YUV_I420 = 129,
+    CV_BGRA2YUV_I420 = 130,
+    CV_RGBA2YUV_IYUV = CV_RGBA2YUV_I420,
+    CV_BGRA2YUV_IYUV = CV_BGRA2YUV_I420,
+    CV_RGB2YUV_YV12  = 131,
+    CV_BGR2YUV_YV12  = 132,
+    CV_RGBA2YUV_YV12 = 133,
+    CV_BGRA2YUV_YV12 = 134,
+
+    // Edge-Aware Demosaicing
+    CV_BayerBG2BGR_EA = 135,
+    CV_BayerGB2BGR_EA = 136,
+    CV_BayerRG2BGR_EA = 137,
+    CV_BayerGR2BGR_EA = 138,
+
+    CV_BayerBG2RGB_EA = CV_BayerRG2BGR_EA,
+    CV_BayerGB2RGB_EA = CV_BayerGR2BGR_EA,
+    CV_BayerRG2RGB_EA = CV_BayerBG2BGR_EA,
+    CV_BayerGR2RGB_EA = CV_BayerGB2BGR_EA,
+
+    CV_BayerBG2BGRA =139,
+    CV_BayerGB2BGRA =140,
+    CV_BayerRG2BGRA =141,
+    CV_BayerGR2BGRA =142,
+
+    CV_BayerBG2RGBA =CV_BayerRG2BGRA,
+    CV_BayerGB2RGBA =CV_BayerGR2BGRA,
+    CV_BayerRG2RGBA =CV_BayerBG2BGRA,
+    CV_BayerGR2RGBA =CV_BayerGB2BGRA,
+
+    CV_COLORCVT_MAX  = 143
+};
+
+
+/** Sub-pixel interpolation methods */
+enum
+{
+    CV_INTER_NN        =0,
+    CV_INTER_LINEAR    =1,
+    CV_INTER_CUBIC     =2,
+    CV_INTER_AREA      =3,
+    CV_INTER_LANCZOS4  =4
+};
+
+/** ... and other image warping flags */
+enum
+{
+    CV_WARP_FILL_OUTLIERS = 8,
+    CV_WARP_INVERSE_MAP   = 16,
+    CV_WARP_RELATIVE_MAP  = 32
+};
+
+/** Shapes of a structuring element for morphological operations
+@see cv::MorphShapes, cv::getStructuringElement
+*/
+enum MorphShapes_c
+{
+    CV_SHAPE_RECT      =0,
+    CV_SHAPE_CROSS     =1,
+    CV_SHAPE_ELLIPSE   =2,
+    CV_SHAPE_CUSTOM    =100 //!< custom structuring element
+};
+
+/** Morphological operations */
+enum
+{
+    CV_MOP_ERODE        =0,
+    CV_MOP_DILATE       =1,
+    CV_MOP_OPEN         =2,
+    CV_MOP_CLOSE        =3,
+    CV_MOP_GRADIENT     =4,
+    CV_MOP_TOPHAT       =5,
+    CV_MOP_BLACKHAT     =6
+};
+
+/** Spatial and central moments */
+typedef struct CvMoments
+{
+    double  m00, m10, m01, m20, m11, m02, m30, m21, m12, m03; /**< spatial moments */
+    double  mu20, mu11, mu02, mu30, mu21, mu12, mu03; /**< central moments */
+    double  inv_sqrt_m00; /**< m00 != 0 ? 1/sqrt(m00) : 0 */
+
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvMoments(){}
+    CvMoments(const cv::Moments& m)
+    {
+        m00 = m.m00; m10 = m.m10; m01 = m.m01;
+        m20 = m.m20; m11 = m.m11; m02 = m.m02;
+        m30 = m.m30; m21 = m.m21; m12 = m.m12; m03 = m.m03;
+        mu20 = m.mu20; mu11 = m.mu11; mu02 = m.mu02;
+        mu30 = m.mu30; mu21 = m.mu21; mu12 = m.mu12; mu03 = m.mu03;
+        double am00 = std::abs(m.m00);
+        inv_sqrt_m00 = am00 > DBL_EPSILON ? 1./std::sqrt(am00) : 0;
+    }
+    operator cv::Moments() const
+    {
+        return cv::Moments(m00, m10, m01, m20, m11, m02, m30, m21, m12, m03);
+    }
+#endif
+}
+CvMoments;
+
+#ifdef __cplusplus
+} // extern "C"
+
+CV_INLINE CvMoments cvMoments()
+{
+#if !defined(CV__ENABLE_C_API_CTORS)
+    CvMoments self = CV_STRUCT_INITIALIZER; return self;
+#else
+    return CvMoments();
+#endif
+}
+
+CV_INLINE CvMoments cvMoments(const cv::Moments& m)
+{
+#if !defined(CV__ENABLE_C_API_CTORS)
+    double am00 = std::abs(m.m00);
+    CvMoments self = {
+        m.m00, m.m10, m.m01, m.m20, m.m11, m.m02, m.m30, m.m21, m.m12, m.m03,
+        m.mu20, m.mu11, m.mu02, m.mu30, m.mu21, m.mu12, m.mu03,
+        am00 > DBL_EPSILON ? 1./std::sqrt(am00) : 0
+    };
+    return self;
+#else
+    return CvMoments(m);
+#endif
+}
+
+extern "C" {
+#endif // __cplusplus
+
+/** Hu invariants */
+typedef struct CvHuMoments
+{
+    double hu1, hu2, hu3, hu4, hu5, hu6, hu7; /**< Hu invariants */
+}
+CvHuMoments;
+
+/** Template matching methods */
+enum
+{
+    CV_TM_SQDIFF        =0,
+    CV_TM_SQDIFF_NORMED =1,
+    CV_TM_CCORR         =2,
+    CV_TM_CCORR_NORMED  =3,
+    CV_TM_CCOEFF        =4,
+    CV_TM_CCOEFF_NORMED =5
+};
+
+typedef float (CV_CDECL * CvDistanceFunction)( const float* a, const float* b, void* user_param );
+
+/** Contour retrieval modes */
+enum
+{
+    CV_RETR_EXTERNAL=0,
+    CV_RETR_LIST=1,
+    CV_RETR_CCOMP=2,
+    CV_RETR_TREE=3,
+    CV_RETR_FLOODFILL=4
+};
+
+/** Contour approximation methods */
+enum
+{
+    CV_CHAIN_CODE=0,
+    CV_CHAIN_APPROX_NONE=1,
+    CV_CHAIN_APPROX_SIMPLE=2,
+    CV_CHAIN_APPROX_TC89_L1=3,
+    CV_CHAIN_APPROX_TC89_KCOS=4,
+    CV_LINK_RUNS=5
+};
+
+/*
+Internal structure that is used for sequential retrieving contours from the image.
+It supports both hierarchical and plane variants of Suzuki algorithm.
+*/
+typedef struct _CvContourScanner* CvContourScanner;
+
+/** Freeman chain reader state */
+typedef struct CvChainPtReader
+{
+    CV_SEQ_READER_FIELDS()
+    char      code;
+    CvPoint   pt;
+    schar     deltas[8][2];
+}
+CvChainPtReader;
+
+/** initializes 8-element array for fast access to 3x3 neighborhood of a pixel */
+#define  CV_INIT_3X3_DELTAS( deltas, step, nch )            \
+    ((deltas)[0] =  (nch),  (deltas)[1] = -(step) + (nch),  \
+     (deltas)[2] = -(step), (deltas)[3] = -(step) - (nch),  \
+     (deltas)[4] = -(nch),  (deltas)[5] =  (step) - (nch),  \
+     (deltas)[6] =  (step), (deltas)[7] =  (step) + (nch))
+
+
+/** Contour approximation algorithms */
+enum
+{
+    CV_POLY_APPROX_DP = 0
+};
+
+/** Shape matching methods */
+enum
+{
+    CV_CONTOURS_MATCH_I1  =1, //!< \f[I_1(A,B) =  \sum _{i=1...7}  \left |  \frac{1}{m^A_i} -  \frac{1}{m^B_i} \right |\f]
+    CV_CONTOURS_MATCH_I2  =2, //!< \f[I_2(A,B) =  \sum _{i=1...7}  \left | m^A_i - m^B_i  \right |\f]
+    CV_CONTOURS_MATCH_I3  =3  //!< \f[I_3(A,B) =  \max _{i=1...7}  \frac{ \left| m^A_i - m^B_i \right| }{ \left| m^A_i \right| }\f]
+};
+
+/** Shape orientation */
+enum
+{
+    CV_CLOCKWISE         =1,
+    CV_COUNTER_CLOCKWISE =2
+};
+
+
+/** Convexity defect */
+typedef struct CvConvexityDefect
+{
+    CvPoint* start; /**< point of the contour where the defect begins */
+    CvPoint* end; /**< point of the contour where the defect ends */
+    CvPoint* depth_point; /**< the point within the defect that is farthest from the convex hull */
+    float depth; /**< distance between the farthest point and the convex hull */
+} CvConvexityDefect;
+
+
+/** Histogram comparison methods */
+enum
+{
+    CV_COMP_CORREL        =0,
+    CV_COMP_CHISQR        =1,
+    CV_COMP_INTERSECT     =2,
+    CV_COMP_BHATTACHARYYA =3,
+    CV_COMP_HELLINGER     =CV_COMP_BHATTACHARYYA,
+    CV_COMP_CHISQR_ALT    =4,
+    CV_COMP_KL_DIV        =5
+};
+
+/** Mask size for distance transform */
+enum
+{
+    CV_DIST_MASK_3   =3,
+    CV_DIST_MASK_5   =5,
+    CV_DIST_MASK_PRECISE =0
+};
+
+/** Content of output label array: connected components or pixels */
+enum
+{
+  CV_DIST_LABEL_CCOMP = 0,
+  CV_DIST_LABEL_PIXEL = 1
+};
+
+/** Distance types for Distance Transform and M-estimators */
+enum
+{
+    CV_DIST_USER    =-1,  /**< User defined distance */
+    CV_DIST_L1      =1,   /**< distance = |x1-x2| + |y1-y2| */
+    CV_DIST_L2      =2,   /**< the simple Euclidean distance */
+    CV_DIST_C       =3,   /**< distance = max(|x1-x2|,|y1-y2|) */
+    CV_DIST_L12     =4,   /**< L1-L2 metric: distance = 2(sqrt(1+x*x/2) - 1) */
+    CV_DIST_FAIR    =5,   /**< distance = c^2(|x|/c-log(1+|x|/c)), c = 1.3998 */
+    CV_DIST_WELSCH  =6,   /**< distance = c^2/2(1-exp(-(x/c)^2)), c = 2.9846 */
+    CV_DIST_HUBER   =7    /**< distance = |x|<c ? x^2/2 : c(|x|-c/2), c=1.345 */
+};
+
+
+/** Threshold types */
+enum
+{
+    CV_THRESH_BINARY      =0,  /**< value = value > threshold ? max_value : 0       */
+    CV_THRESH_BINARY_INV  =1,  /**< value = value > threshold ? 0 : max_value       */
+    CV_THRESH_TRUNC       =2,  /**< value = value > threshold ? threshold : value   */
+    CV_THRESH_TOZERO      =3,  /**< value = value > threshold ? value : 0           */
+    CV_THRESH_TOZERO_INV  =4,  /**< value = value > threshold ? 0 : value           */
+    CV_THRESH_MASK        =7,
+    CV_THRESH_OTSU        =8, /**< use Otsu algorithm to choose the optimal threshold value;
+                                 combine the flag with one of the above CV_THRESH_* values */
+    CV_THRESH_TRIANGLE    =16  /**< use Triangle algorithm to choose the optimal threshold value;
+                                 combine the flag with one of the above CV_THRESH_* values, but not
+                                 with CV_THRESH_OTSU */
+};
+
+/** Adaptive threshold methods */
+enum
+{
+    CV_ADAPTIVE_THRESH_MEAN_C  =0,
+    CV_ADAPTIVE_THRESH_GAUSSIAN_C  =1
+};
+
+/** FloodFill flags */
+enum
+{
+    CV_FLOODFILL_FIXED_RANGE =(1 << 16),
+    CV_FLOODFILL_MASK_ONLY   =(1 << 17)
+};
+
+
+/** Canny edge detector flags */
+enum
+{
+    CV_CANNY_L2_GRADIENT  =(1 << 31)
+};
+
+/** Variants of a Hough transform */
+enum
+{
+    CV_HOUGH_STANDARD =0,
+    CV_HOUGH_PROBABILISTIC =1,
+    CV_HOUGH_MULTI_SCALE =2,
+    CV_HOUGH_GRADIENT =3
+};
+
+
+/* Fast search data structures  */
+struct CvFeatureTree;
+struct CvLSH;
+struct CvLSHOperations;
+
+/** @} */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/ml.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/ml.hpp
new file mode 100644
index 000000000000..d537ab7759b6
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/ml.hpp
@@ -0,0 +1,1956 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2014, Itseez Inc, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_ML_HPP
+#define OPENCV_ML_HPP
+
+#ifdef __cplusplus
+#  include "opencv2/core.hpp"
+#endif
+
+#ifdef __cplusplus
+
+#include <float.h>
+#include <map>
+#include <iostream>
+
+/**
+  @defgroup ml Machine Learning
+
+  The Machine Learning Library (MLL) is a set of classes and functions for statistical
+  classification, regression, and clustering of data.
+
+  Most of the classification and regression algorithms are implemented as C++ classes. As the
+  algorithms have different sets of features (like an ability to handle missing measurements or
+  categorical input variables), there is a little common ground between the classes. This common
+  ground is defined by the class cv::ml::StatModel that all the other ML classes are derived from.
+
+  See detailed overview here: @ref ml_intro.
+ */
+
+namespace cv
+{
+
+namespace ml
+{
+
+//! @addtogroup ml
+//! @{
+
+/** @brief Variable types */
+enum VariableTypes
+{
+    VAR_NUMERICAL    =0, //!< same as VAR_ORDERED
+    VAR_ORDERED      =0, //!< ordered variables
+    VAR_CATEGORICAL  =1  //!< categorical variables
+};
+
+/** @brief %Error types */
+enum ErrorTypes
+{
+    TEST_ERROR = 0,
+    TRAIN_ERROR = 1
+};
+
+/** @brief Sample types */
+enum SampleTypes
+{
+    ROW_SAMPLE = 0, //!< each training sample is a row of samples
+    COL_SAMPLE = 1  //!< each training sample occupies a column of samples
+};
+
+/** @brief The structure represents the logarithmic grid range of statmodel parameters.
+
+It is used for optimizing statmodel accuracy by varying model parameters, the accuracy estimate
+being computed by cross-validation.
+ */
+class CV_EXPORTS_W ParamGrid
+{
+public:
+    /** @brief Default constructor */
+    ParamGrid();
+    /** @brief Constructor with parameters */
+    ParamGrid(double _minVal, double _maxVal, double _logStep);
+
+    CV_PROP_RW double minVal; //!< Minimum value of the statmodel parameter. Default value is 0.
+    CV_PROP_RW double maxVal; //!< Maximum value of the statmodel parameter. Default value is 0.
+    /** @brief Logarithmic step for iterating the statmodel parameter.
+
+    The grid determines the following iteration sequence of the statmodel parameter values:
+    \f[(minVal, minVal*logStep, minVal*{logStep}^2, \dots,  minVal*{logStep}^n),\f]
+    where \f$n\f$ is the maximal index satisfying
+    \f[\texttt{minVal} * \texttt{logStep} ^n <  \texttt{maxVal}\f]
+    The grid is logarithmic, so logStep must always be greater than 1. Default value is 1.
+    */
+    CV_PROP_RW double logStep;
+
+    /** @brief Creates a ParamGrid Ptr that can be given to the %SVM::trainAuto method
+
+    @param minVal minimum value of the parameter grid
+    @param maxVal maximum value of the parameter grid
+    @param logstep Logarithmic step for iterating the statmodel parameter
+    */
+    CV_WRAP static Ptr<ParamGrid> create(double minVal=0., double maxVal=0., double logstep=1.);
+};
+
+/** @brief Class encapsulating training data.
+
+Please note that the class only specifies the interface of training data, but not implementation.
+All the statistical model classes in the _ml_ module accept Ptr\<TrainData\> as a parameter. In other
+words, you can create your own class derived from TrainData and pass smart pointer to the instance
+of this class into StatModel::train.
+
+@sa @ref ml_intro_data
+ */
+class CV_EXPORTS_W TrainData
+{
+public:
+    static inline float missingValue() { return FLT_MAX; }
+    virtual ~TrainData();
+
+    CV_WRAP virtual int getLayout() const = 0;
+    CV_WRAP virtual int getNTrainSamples() const = 0;
+    CV_WRAP virtual int getNTestSamples() const = 0;
+    CV_WRAP virtual int getNSamples() const = 0;
+    CV_WRAP virtual int getNVars() const = 0;
+    CV_WRAP virtual int getNAllVars() const = 0;
+
+    CV_WRAP virtual void getSample(InputArray varIdx, int sidx, float* buf) const = 0;
+    CV_WRAP virtual Mat getSamples() const = 0;
+    CV_WRAP virtual Mat getMissing() const = 0;
+
+    /** @brief Returns matrix of train samples
+
+    @param layout The requested layout. If it's different from the initial one, the matrix is
+        transposed. See ml::SampleTypes.
+    @param compressSamples if true, the function returns only the training samples (specified by
+        sampleIdx)
+    @param compressVars if true, the function returns the shorter training samples, containing only
+        the active variables.
+
+    In current implementation the function tries to avoid physical data copying and returns the
+    matrix stored inside TrainData (unless the transposition or compression is needed).
+     */
+    CV_WRAP virtual Mat getTrainSamples(int layout=ROW_SAMPLE,
+                                bool compressSamples=true,
+                                bool compressVars=true) const = 0;
+
+    /** @brief Returns the vector of responses
+
+    The function returns ordered or the original categorical responses. Usually it's used in
+    regression algorithms.
+     */
+    CV_WRAP virtual Mat getTrainResponses() const = 0;
+
+    /** @brief Returns the vector of normalized categorical responses
+
+    The function returns vector of responses. Each response is integer from `0` to `<number of
+    classes>-1`. The actual label value can be retrieved then from the class label vector, see
+    TrainData::getClassLabels.
+     */
+    CV_WRAP virtual Mat getTrainNormCatResponses() const = 0;
+    CV_WRAP virtual Mat getTestResponses() const = 0;
+    CV_WRAP virtual Mat getTestNormCatResponses() const = 0;
+    CV_WRAP virtual Mat getResponses() const = 0;
+    CV_WRAP virtual Mat getNormCatResponses() const = 0;
+    CV_WRAP virtual Mat getSampleWeights() const = 0;
+    CV_WRAP virtual Mat getTrainSampleWeights() const = 0;
+    CV_WRAP virtual Mat getTestSampleWeights() const = 0;
+    CV_WRAP virtual Mat getVarIdx() const = 0;
+    CV_WRAP virtual Mat getVarType() const = 0;
+    CV_WRAP virtual Mat getVarSymbolFlags() const = 0;
+    CV_WRAP virtual int getResponseType() const = 0;
+    CV_WRAP virtual Mat getTrainSampleIdx() const = 0;
+    CV_WRAP virtual Mat getTestSampleIdx() const = 0;
+    CV_WRAP virtual void getValues(int vi, InputArray sidx, float* values) const = 0;
+    virtual void getNormCatValues(int vi, InputArray sidx, int* values) const = 0;
+    CV_WRAP virtual Mat getDefaultSubstValues() const = 0;
+
+    CV_WRAP virtual int getCatCount(int vi) const = 0;
+
+    /** @brief Returns the vector of class labels
+
+    The function returns a vector of the unique labels that occur in the responses.
+     */
+    CV_WRAP virtual Mat getClassLabels() const = 0;
+
+    CV_WRAP virtual Mat getCatOfs() const = 0;
+    CV_WRAP virtual Mat getCatMap() const = 0;
+
+    /** @brief Splits the training data into the training and test parts
+    @sa TrainData::setTrainTestSplitRatio
+     */
+    CV_WRAP virtual void setTrainTestSplit(int count, bool shuffle=true) = 0;
+
+    /** @brief Splits the training data into the training and test parts
+
+    The function selects a subset of specified relative size and then returns it as the training
+    set. If the function is not called, all the data is used for training. Please, note that for
+    each of TrainData::getTrain\* there is corresponding TrainData::getTest\*, so that the test
+    subset can be retrieved and processed as well.
+    @sa TrainData::setTrainTestSplit
+     */
+    CV_WRAP virtual void setTrainTestSplitRatio(double ratio, bool shuffle=true) = 0;
+    CV_WRAP virtual void shuffleTrainTest() = 0;
+
+    /** @brief Returns matrix of test samples */
+    CV_WRAP virtual Mat getTestSamples() const = 0;
+
+    /** @brief Returns vector of symbolic names captured in loadFromCSV() */
+    CV_WRAP virtual void getNames(std::vector<String>& names) const = 0;
+
+    /** @brief Extract from 1D vector elements specified by passed indexes.
+    @param vec input vector (supported types: CV_32S, CV_32F, CV_64F)
+    @param idx 1D index vector
+     */
+    static CV_WRAP Mat getSubVector(const Mat& vec, const Mat& idx);
+
+    /** @brief Extract from matrix rows/cols specified by passed indexes.
+    @param matrix input matrix (supported types: CV_32S, CV_32F, CV_64F)
+    @param idx 1D index vector
+    @param layout specifies to extract rows (cv::ml::ROW_SAMPLE) or to extract columns (cv::ml::COL_SAMPLE)
+     */
+    static CV_WRAP Mat getSubMatrix(const Mat& matrix, const Mat& idx, int layout);
+
+    /** @brief Reads the dataset from a .csv file and returns the ready-to-use training data.
+
+    @param filename The input file name
+    @param headerLineCount The number of lines in the beginning to skip; besides the header, the
+        function also skips empty lines and lines starting with `#`
+    @param responseStartIdx Index of the first output variable. If -1, the function considers the
+        last variable as the response
+    @param responseEndIdx Index of the last output variable + 1. If -1, then there is single
+        response variable at responseStartIdx.
+    @param varTypeSpec The optional text string that specifies the variables' types. It has the
+        format `ord[n1-n2,n3,n4-n5,...]cat[n6,n7-n8,...]`. That is, variables from `n1 to n2`
+        (inclusive range), `n3`, `n4 to n5` ... are considered ordered and `n6`, `n7 to n8` ... are
+        considered as categorical. The range `[n1..n2] + [n3] + [n4..n5] + ... + [n6] + [n7..n8]`
+        should cover all the variables. If varTypeSpec is not specified, then algorithm uses the
+        following rules:
+        - all input variables are considered ordered by default. If some column contains non-
+          numerical values, e.g. 'apple', 'pear', 'apple', 'apple', 'mango', the corresponding
+          variable is considered categorical.
+        - if there are several output variables, they are all considered as ordered. Error is
+          reported when non-numerical values are used.
+        - if there is a single output variable, then if its values are non-numerical or are all
+          integers, then it's considered categorical. Otherwise, it's considered ordered.
+    @param delimiter The character used to separate values in each line.
+    @param missch The character used to specify missing measurements. It should not be a digit.
+        Although it's a non-numerical value, it surely does not affect the decision of whether the
+        variable is ordered or categorical.
+    @note If the dataset only contains input variables and no responses, use responseStartIdx = -2
+        and responseEndIdx = 0. The output variables vector will just contain zeros.
+     */
+    static Ptr<TrainData> loadFromCSV(const String& filename,
+                                      int headerLineCount,
+                                      int responseStartIdx=-1,
+                                      int responseEndIdx=-1,
+                                      const String& varTypeSpec=String(),
+                                      char delimiter=',',
+                                      char missch='?');
+
+    /** @brief Creates training data from in-memory arrays.
+
+    @param samples matrix of samples. It should have CV_32F type.
+    @param layout see ml::SampleTypes.
+    @param responses matrix of responses. If the responses are scalar, they should be stored as a
+        single row or as a single column. The matrix should have type CV_32F or CV_32S (in the
+        former case the responses are considered ordered by default; in the latter case they are
+        considered categorical).
+    @param varIdx vector specifying which variables to use for training. It can be an integer vector
+        (CV_32S) containing 0-based variable indices or a byte vector (CV_8U) containing a mask of
+        active variables.
+    @param sampleIdx vector specifying which samples to use for training. It can be an integer
+        vector (CV_32S) containing 0-based sample indices or a byte vector (CV_8U) containing a mask
+        of training samples.
+    @param sampleWeights optional vector with weights for each sample. It should have CV_32F type.
+    @param varType optional vector of type CV_8U and size `<number_of_variables_in_samples> +
+        <number_of_variables_in_responses>`, containing the type of each input and output variable.
+        See ml::VariableTypes.
+     */
+    CV_WRAP static Ptr<TrainData> create(InputArray samples, int layout, InputArray responses,
+                                 InputArray varIdx=noArray(), InputArray sampleIdx=noArray(),
+                                 InputArray sampleWeights=noArray(), InputArray varType=noArray());
+};
+
+/** @brief Base class for statistical models in OpenCV ML.
+ */
+class CV_EXPORTS_W StatModel : public Algorithm
+{
+public:
+    /** Predict options */
+    enum Flags {
+        UPDATE_MODEL = 1, //!< NOTE: has the same numeric value as RAW_OUTPUT
+        RAW_OUTPUT=1, //!< makes the method return the raw results (the sum), not the class label
+        COMPRESSED_INPUT=2,
+        PREPROCESSED_INPUT=4
+    };
+
+    /** @brief Returns the number of variables in training samples */
+    CV_WRAP virtual int getVarCount() const = 0;
+
+    CV_WRAP virtual bool empty() const CV_OVERRIDE;
+
+    /** @brief Returns true if the model is trained */
+    CV_WRAP virtual bool isTrained() const = 0;
+    /** @brief Returns true if the model is a classifier */
+    CV_WRAP virtual bool isClassifier() const = 0;
+
+    /** @brief Trains the statistical model
+
+    @param trainData training data that can be loaded from file using TrainData::loadFromCSV or
+        created with TrainData::create.
+    @param flags optional flags, depending on the model. Some of the models can be updated with the
+        new training samples, not completely overwritten (such as NormalBayesClassifier or ANN_MLP).
+     */
+    CV_WRAP virtual bool train( const Ptr<TrainData>& trainData, int flags=0 );
+
+    /** @brief Trains the statistical model
+
+    @param samples training samples
+    @param layout See ml::SampleTypes.
+    @param responses vector of responses associated with the training samples.
+    */
+    CV_WRAP virtual bool train( InputArray samples, int layout, InputArray responses );
+
+    /** @brief Computes error on the training or test dataset
+
+    @param data the training data
+    @param test if true, the error is computed over the test subset of the data, otherwise it's
+        computed over the training subset of the data. Please note that if you loaded a completely
+        different dataset to evaluate an already trained classifier, you will probably not want to
+        set the test subset at all with TrainData::setTrainTestSplitRatio and specify test=false, so
+        that the error is computed for the whole new set. Yes, this sounds a bit confusing.
+    @param resp the optional output responses.
+
+    The method uses StatModel::predict to compute the error. For regression models the error is
+    computed as RMS, for classifiers - as a percent of misclassified samples (0%-100%).
+     */
+    CV_WRAP virtual float calcError( const Ptr<TrainData>& data, bool test, OutputArray resp ) const;
+
+    /** @brief Predicts response(s) for the provided sample(s)
+
+    @param samples The input samples, floating-point matrix
+    @param results The optional output matrix of results.
+    @param flags The optional flags, model-dependent. See cv::ml::StatModel::Flags.
+     */
+    CV_WRAP virtual float predict( InputArray samples, OutputArray results=noArray(), int flags=0 ) const = 0;
+
+    /** @brief Create and train model with default parameters
+
+    The class must implement a static `create()` method with no parameters or with all default parameter values
+    */
+    template<typename _Tp> static Ptr<_Tp> train(const Ptr<TrainData>& data, int flags=0)
+    {
+        Ptr<_Tp> model = _Tp::create();
+        return !model.empty() && model->train(data, flags) ? model : Ptr<_Tp>(); // empty Ptr if creation or training failed
+    }
+};
+
+/****************************************************************************************\
+*                                 Normal Bayes Classifier                                *
+\****************************************************************************************/
+
+/** @brief Bayes classifier for normally distributed data.
+
+@sa @ref ml_intro_bayes
+ */
+class CV_EXPORTS_W NormalBayesClassifier : public StatModel
+{
+public:
+    /** @brief Predicts the response for sample(s).
+
+    The method estimates the most probable classes for input vectors. Input vectors (one or more)
+    are stored as rows of the matrix inputs. In case of multiple input vectors, there should be one
+    output vector outputs. The predicted class for a single input vector is returned by the method.
+    The vector outputProbs contains the output probabilities corresponding to each element of
+    result.
+     */
+    CV_WRAP virtual float predictProb( InputArray inputs, OutputArray outputs,
+                               OutputArray outputProbs, int flags=0 ) const = 0;
+
+    /** Creates an empty model.
+    Use StatModel::train to train the model after creation. */
+    CV_WRAP static Ptr<NormalBayesClassifier> create();
+
+    /** @brief Loads and creates a serialized NormalBayesClassifier from a file
+     *
+     * Use NormalBayesClassifier::save to serialize and store a NormalBayesClassifier to disk.
+     * Load the NormalBayesClassifier from this file again, by calling this function with the path to the file.
+     * Optionally specify the node for the file containing the classifier.
+     *
+     * @param filepath path to serialized NormalBayesClassifier
+     * @param nodeName name of node containing the classifier
+     */
+    CV_WRAP static Ptr<NormalBayesClassifier> load(const String& filepath , const String& nodeName = String());
+};
+
+/****************************************************************************************\
+*                          K-Nearest Neighbour Classifier                                *
+\****************************************************************************************/
+
+/** @brief The class implements K-Nearest Neighbors model
+
+@sa @ref ml_intro_knn
+ */
+class CV_EXPORTS_W KNearest : public StatModel
+{
+public:
+
+    /** Default number of neighbors to use in predict method. */
+    /** @see setDefaultK */
+    CV_WRAP virtual int getDefaultK() const = 0;
+    /** @copybrief getDefaultK @see getDefaultK */
+    CV_WRAP virtual void setDefaultK(int val) = 0;
+
+    /** Whether classification or regression model should be trained. */
+    /** @see setIsClassifier */
+    CV_WRAP virtual bool getIsClassifier() const = 0;
+    /** @copybrief getIsClassifier @see getIsClassifier */
+    CV_WRAP virtual void setIsClassifier(bool val) = 0;
+
+    /** Parameter for KDTree implementation. */
+    /** @see setEmax */
+    CV_WRAP virtual int getEmax() const = 0;
+    /** @copybrief getEmax @see getEmax */
+    CV_WRAP virtual void setEmax(int val) = 0;
+
+    /** %Algorithm type, one of KNearest::Types. */
+    /** @see setAlgorithmType */
+    CV_WRAP virtual int getAlgorithmType() const = 0;
+    /** @copybrief getAlgorithmType @see getAlgorithmType */
+    CV_WRAP virtual void setAlgorithmType(int val) = 0;
+
+    /** @brief Finds the neighbors and predicts responses for input vectors.
+
+    @param samples Input samples stored by rows. It is a single-precision floating-point matrix of
+        `<number_of_samples> * k` size (NOTE(review): presumably `<number_of_samples> * <number_of_features>` - confirm).
+    @param k Number of used nearest neighbors. Should be greater than 1.
+    @param results Vector with results of prediction (regression or classification) for each input
+        sample. It is a single-precision floating-point vector with `<number_of_samples>` elements.
+    @param neighborResponses Optional output values for corresponding neighbors. It is a single-
+        precision floating-point matrix of `<number_of_samples> * k` size.
+    @param dist Optional output distances from the input vectors to the corresponding neighbors. It
+        is a single-precision floating-point matrix of `<number_of_samples> * k` size.
+
+    For each input vector (a row of the matrix samples), the method finds the k nearest neighbors.
+    In case of regression, the predicted result is a mean value of the particular vector's neighbor
+    responses. In case of classification, the class is determined by voting.
+
+    For each input vector, the neighbors are sorted by their distances to the vector.
+
+    In case of C++ interface you can use output pointers to empty matrices and the function will
+    allocate memory itself.
+
+    If only a single input vector is passed, all output matrices are optional and the predicted
+    value is returned by the method.
+
+    The function is parallelized with the TBB library.
+     */
+    CV_WRAP virtual float findNearest( InputArray samples, int k,
+                               OutputArray results,
+                               OutputArray neighborResponses=noArray(),
+                               OutputArray dist=noArray() ) const = 0;
+
+    /** @brief Implementations of KNearest algorithm
+       */
+    enum Types
+    {
+        BRUTE_FORCE=1,
+        KDTREE=2
+    };
+
+    /** @brief Creates the empty model
+
+    The static method creates empty %KNearest classifier. It should be then trained using StatModel::train method.
+     */
+    CV_WRAP static Ptr<KNearest> create();
+    /** @brief Loads and creates a serialized KNearest from a file
+     *
+     * Use KNearest::save to serialize and store a KNearest to disk.
+     * Load the KNearest from this file again, by calling this function with the path to the file.
+     *
+     * @param filepath path to serialized KNearest
+     */
+    CV_WRAP static Ptr<KNearest> load(const String& filepath);
+};
+
+/****************************************************************************************\
+*                                   Support Vector Machines                              *
+\****************************************************************************************/
+
+/** @brief Support Vector Machines.
+
+@sa @ref ml_intro_svm
+ */
+class CV_EXPORTS_W SVM : public StatModel
+{
+public:
+
+    class CV_EXPORTS Kernel : public Algorithm
+    {
+    public:
+        virtual int getType() const = 0;
+        virtual void calc( int vcount, int n, const float* vecs, const float* another, float* results ) = 0;
+    };
+
+    /** Type of a %SVM formulation.
+    See SVM::Types. Default value is SVM::C_SVC. */
+    /** @see setType */
+    CV_WRAP virtual int getType() const = 0;
+    /** @copybrief getType @see getType */
+    CV_WRAP virtual void setType(int val) = 0;
+
+    /** Parameter \f$\gamma\f$ of a kernel function.
+    For SVM::POLY, SVM::RBF, SVM::SIGMOID or SVM::CHI2. Default value is 1. */
+    /** @see setGamma */
+    CV_WRAP virtual double getGamma() const = 0;
+    /** @copybrief getGamma @see getGamma */
+    CV_WRAP virtual void setGamma(double val) = 0;
+
+    /** Parameter _coef0_ of a kernel function.
+    For SVM::POLY or SVM::SIGMOID. Default value is 0.*/
+    /** @see setCoef0 */
+    CV_WRAP virtual double getCoef0() const = 0;
+    /** @copybrief getCoef0 @see getCoef0 */
+    CV_WRAP virtual void setCoef0(double val) = 0;
+
+    /** Parameter _degree_ of a kernel function.
+    For SVM::POLY. Default value is 0. */
+    /** @see setDegree */
+    CV_WRAP virtual double getDegree() const = 0;
+    /** @copybrief getDegree @see getDegree */
+    CV_WRAP virtual void setDegree(double val) = 0;
+
+    /** Parameter _C_ of a %SVM optimization problem.
+    For SVM::C_SVC, SVM::EPS_SVR or SVM::NU_SVR. Default value is 0. */
+    /** @see setC */
+    CV_WRAP virtual double getC() const = 0;
+    /** @copybrief getC @see getC */
+    CV_WRAP virtual void setC(double val) = 0;
+
+    /** Parameter \f$\nu\f$ of a %SVM optimization problem.
+    For SVM::NU_SVC, SVM::ONE_CLASS or SVM::NU_SVR. Default value is 0. */
+    /** @see setNu */
+    CV_WRAP virtual double getNu() const = 0;
+    /** @copybrief getNu @see getNu */
+    CV_WRAP virtual void setNu(double val) = 0;
+
+    /** Parameter \f$\epsilon\f$ of a %SVM optimization problem.
+    For SVM::EPS_SVR. Default value is 0. */
+    /** @see setP */
+    CV_WRAP virtual double getP() const = 0;
+    /** @copybrief getP @see getP */
+    CV_WRAP virtual void setP(double val) = 0;
+
+    /** Optional weights in the SVM::C_SVC problem, assigned to particular classes.
+    They are multiplied by _C_ so the parameter _C_ of class _i_ becomes `classWeights(i) * C`. Thus
+    these weights affect the misclassification penalty for different classes. The larger the weight,
+    the larger the penalty on misclassification of data from the corresponding class. Default value is
+    empty Mat. */
+    /** @see setClassWeights */
+    CV_WRAP virtual cv::Mat getClassWeights() const = 0;
+    /** @copybrief getClassWeights @see getClassWeights */
+    CV_WRAP virtual void setClassWeights(const cv::Mat &val) = 0;
+
+    /** Termination criteria of the iterative %SVM training procedure which solves a partial
+    case of constrained quadratic optimization problem.
+    You can specify tolerance and/or the maximum number of iterations. Default value is
+    `TermCriteria( TermCriteria::MAX_ITER + TermCriteria::EPS, 1000, FLT_EPSILON )`. */
+    /** @see setTermCriteria */
+    CV_WRAP virtual cv::TermCriteria getTermCriteria() const = 0;
+    /** @copybrief getTermCriteria @see getTermCriteria */
+    CV_WRAP virtual void setTermCriteria(const cv::TermCriteria &val) = 0;
+
+    /** Type of a %SVM kernel.
+    See SVM::KernelTypes. Default value is SVM::RBF. */
+    CV_WRAP virtual int getKernelType() const = 0;
+
+    /** Initialize with one of predefined kernels.
+    See SVM::KernelTypes. */
+    CV_WRAP virtual void setKernel(int kernelType) = 0;
+
+    /** Initialize with custom kernel.
+    See SVM::Kernel class for implementation details */
+    virtual void setCustomKernel(const Ptr<Kernel> &_kernel) = 0;
+
+    //! %SVM type
+    enum Types {
+        /** C-Support Vector Classification. n-class classification (n \f$\geq\f$ 2), allows
+        imperfect separation of classes with penalty multiplier C for outliers. */
+        C_SVC=100,
+        /** \f$\nu\f$-Support Vector Classification. n-class classification with possible
+        imperfect separation. Parameter \f$\nu\f$ (in the range 0..1, the larger the value, the smoother
+        the decision boundary) is used instead of C. */
+        NU_SVC=101,
+        /** Distribution Estimation (One-class %SVM). All the training data are from
+        the same class, %SVM builds a boundary that separates the class from the rest of the feature
+        space. */
+        ONE_CLASS=102,
+        /** \f$\epsilon\f$-Support Vector Regression. The distance between feature vectors
+        from the training set and the fitting hyper-plane must be less than p. For outliers the
+        penalty multiplier C is used. */
+        EPS_SVR=103,
+        /** \f$\nu\f$-Support Vector Regression. \f$\nu\f$ is used instead of p.
+        See @cite LibSVM for details. */
+        NU_SVR=104
+    };
+
+    /** @brief %SVM kernel type
+
+    A comparison of different kernels on the following 2D test case with four classes. Four
+    SVM::C_SVC SVMs have been trained (one against rest) with auto_train. Evaluation on three
+    different kernels (SVM::CHI2, SVM::INTER, SVM::RBF). The color depicts the class with max score.
+    Bright means max-score \> 0, dark means max-score \< 0.
+    ![image](pics/SVM_Comparison.png)
+    */
+    enum KernelTypes {
+        /** Returned by SVM::getKernelType in case when custom kernel has been set */
+        CUSTOM=-1,
+        /** Linear kernel. No mapping is done, linear discrimination (or regression) is
+        done in the original feature space. It is the fastest option. \f$K(x_i, x_j) = x_i^T x_j\f$. */
+        LINEAR=0,
+        /** Polynomial kernel:
+        \f$K(x_i, x_j) = (\gamma x_i^T x_j + coef0)^{degree}, \gamma > 0\f$. */
+        POLY=1,
+        /** Radial basis function (RBF), a good choice in most cases.
+        \f$K(x_i, x_j) = e^{-\gamma ||x_i - x_j||^2}, \gamma > 0\f$. */
+        RBF=2,
+        /** Sigmoid kernel: \f$K(x_i, x_j) = \tanh(\gamma x_i^T x_j + coef0)\f$. */
+        SIGMOID=3,
+        /** Exponential Chi2 kernel, similar to the RBF kernel:
+        \f$K(x_i, x_j) = e^{-\gamma \chi^2(x_i,x_j)}, \chi^2(x_i,x_j) = (x_i-x_j)^2/(x_i+x_j), \gamma > 0\f$. */
+        CHI2=4,
+        /** Histogram intersection kernel. A fast kernel. \f$K(x_i, x_j) = min(x_i,x_j)\f$. */
+        INTER=5
+    };
+
+    //! %SVM params type
+    enum ParamTypes {
+        C=0,
+        GAMMA=1,
+        P=2,
+        NU=3,
+        COEF=4,
+        DEGREE=5
+    };
+
+    /** @brief Trains an %SVM with optimal parameters.
+
+    @param data the training data that can be constructed using TrainData::create or
+        TrainData::loadFromCSV.
+    @param kFold Cross-validation parameter. The training set is divided into kFold subsets. One
+        subset is used to test the model, the others form the train set. So, the %SVM algorithm is
+        executed kFold times.
+    @param Cgrid grid for C
+    @param gammaGrid grid for gamma
+    @param pGrid grid for p
+    @param nuGrid grid for nu
+    @param coeffGrid grid for coeff
+    @param degreeGrid grid for degree
+    @param balanced If true and the problem is 2-class classification then the method creates more
+        balanced cross-validation subsets, that is, proportions between classes in subsets are close
+        to such proportion in the whole train dataset.
+
+    The method trains the %SVM model automatically by choosing the optimal parameters C, gamma, p,
+    nu, coef0, degree. Parameters are considered optimal when the cross-validation
+    estimate of the test set error is minimal.
+
+    If there is no need to optimize a parameter, the corresponding grid step should be set to any
+    value less than or equal to 1. For example, to avoid optimization in gamma, set `gammaGrid.step
+    = 0`, `gammaGrid.minVal`, `gammaGrid.maxVal` as arbitrary numbers. In this case, the value
+    `Gamma` is taken for gamma.
+
+    And, finally, if the optimization in a parameter is required but the corresponding grid is
+    unknown, you may call the function SVM::getDefaultGrid. To generate a grid, for example, for
+    gamma, call `SVM::getDefaultGrid(SVM::GAMMA)`.
+
+    This function works for the classification (SVM::C_SVC or SVM::NU_SVC) as well as for the
+    regression (SVM::EPS_SVR or SVM::NU_SVR). If it is SVM::ONE_CLASS, no optimization is made and
+    the usual %SVM with parameters specified in params is executed.
+     */
+    virtual bool trainAuto( const Ptr<TrainData>& data, int kFold = 10,
+                    ParamGrid Cgrid = getDefaultGrid(C),
+                    ParamGrid gammaGrid  = getDefaultGrid(GAMMA),
+                    ParamGrid pGrid      = getDefaultGrid(P),
+                    ParamGrid nuGrid     = getDefaultGrid(NU),
+                    ParamGrid coeffGrid  = getDefaultGrid(COEF),
+                    ParamGrid degreeGrid = getDefaultGrid(DEGREE),
+                    bool balanced=false) = 0;
+
+    /** @brief Trains an %SVM with optimal parameters
+
+    @param samples training samples
+    @param layout See ml::SampleTypes.
+    @param responses vector of responses associated with the training samples.
+    @param kFold Cross-validation parameter. The training set is divided into kFold subsets. One
+        subset is used to test the model, the others form the train set. So, the %SVM algorithm is executed kFold times.
+    @param Cgrid grid for C
+    @param gammaGrid grid for gamma
+    @param pGrid grid for p
+    @param nuGrid grid for nu
+    @param coeffGrid grid for coeff
+    @param degreeGrid grid for degree
+    @param balanced If true and the problem is 2-class classification then the method creates more
+        balanced cross-validation subsets, that is, proportions between classes in subsets are close
+        to such proportion in the whole train dataset.
+
+    The method trains the %SVM model automatically by choosing the optimal parameters C, gamma, p,
+    nu, coef0, degree. Parameters are considered optimal when the cross-validation
+    estimate of the test set error is minimal.
+
+    This function only makes use of SVM::getDefaultGrid for parameter optimization and thus only
+    offers rudimentary parameter options.
+
+    This function works for the classification (SVM::C_SVC or SVM::NU_SVC) as well as for the
+    regression (SVM::EPS_SVR or SVM::NU_SVR). If it is SVM::ONE_CLASS, no optimization is made and
+    the usual %SVM with parameters specified in params is executed.
+    */
+    CV_WRAP virtual bool trainAuto(InputArray samples,
+            int layout,
+            InputArray responses,
+            int kFold = 10,
+            Ptr<ParamGrid> Cgrid = SVM::getDefaultGridPtr(SVM::C),
+            Ptr<ParamGrid> gammaGrid  = SVM::getDefaultGridPtr(SVM::GAMMA),
+            Ptr<ParamGrid> pGrid      = SVM::getDefaultGridPtr(SVM::P),
+            Ptr<ParamGrid> nuGrid     = SVM::getDefaultGridPtr(SVM::NU),
+            Ptr<ParamGrid> coeffGrid  = SVM::getDefaultGridPtr(SVM::COEF),
+            Ptr<ParamGrid> degreeGrid = SVM::getDefaultGridPtr(SVM::DEGREE),
+            bool balanced=false) = 0;
+
+    /** @brief Retrieves all the support vectors
+
+    The method returns all the support vectors as a floating-point matrix, where support vectors are
+    stored as matrix rows.
+     */
+    CV_WRAP virtual Mat getSupportVectors() const = 0;
+
+    /** @brief Retrieves all the uncompressed support vectors of a linear %SVM
+
+    The method returns all the uncompressed support vectors of a linear %SVM that the compressed
+    support vector, used for prediction, was derived from. They are returned in a floating-point
+    matrix, where the support vectors are stored as matrix rows.
+     */
+    CV_WRAP virtual Mat getUncompressedSupportVectors() const = 0;
+
+    /** @brief Retrieves the decision function
+
+    @param i the index of the decision function. If the problem solved is regression, 1-class or
+        2-class classification, then there will be just one decision function and the index should
+        always be 0. Otherwise, in the case of N-class classification, there will be \f$N(N-1)/2\f$
+        decision functions.
+    @param alpha the optional output vector for weights, corresponding to different support vectors.
+        In the case of linear %SVM all the alpha's will be 1's.
+    @param svidx the optional output vector of indices of support vectors within the matrix of
+        support vectors (which can be retrieved by SVM::getSupportVectors). In the case of linear
+        %SVM each decision function consists of a single "compressed" support vector.
+
+    The method returns rho parameter of the decision function, a scalar subtracted from the weighted
+    sum of kernel responses.
+     */
+    CV_WRAP virtual double getDecisionFunction(int i, OutputArray alpha, OutputArray svidx) const = 0;
+
+    /** @brief Generates a grid for %SVM parameters.
+
+    @param param_id %SVM parameters IDs that must be one of the SVM::ParamTypes. The grid is
+    generated for the parameter with this ID.
+
+    The function generates a grid for the specified parameter of the %SVM algorithm. The grid may be
+    passed to the function SVM::trainAuto.
+     */
+    static ParamGrid getDefaultGrid( int param_id );
+
+    /** @brief Generates a grid for %SVM parameters.
+
+    @param param_id %SVM parameters IDs that must be one of the SVM::ParamTypes. The grid is
+    generated for the parameter with this ID.
+
+    The function generates a grid pointer for the specified parameter of the %SVM algorithm.
+    The grid may be passed to the function SVM::trainAuto.
+     */
+    CV_WRAP static Ptr<ParamGrid> getDefaultGridPtr( int param_id );
+
+    /** Creates empty model.
+    Use StatModel::train to train the model. Since %SVM has several parameters, you may want to
+    find the best parameters for your problem, it can be done with SVM::trainAuto. */
+    CV_WRAP static Ptr<SVM> create();
+
+    /** @brief Loads and creates a serialized SVM from a file
+     *
+     * Use SVM::save to serialize and store an SVM to disk.
+     * Load the SVM from this file again, by calling this function with the path to the file.
+     *
+     * @param filepath path to serialized SVM
+     */
+    CV_WRAP static Ptr<SVM> load(const String& filepath);
+};
+
+/****************************************************************************************\
+*                              Expectation - Maximization                                *
+\****************************************************************************************/
+
+/** @brief The class implements the Expectation Maximization algorithm.
+
+@sa @ref ml_intro_em
+ */
+class CV_EXPORTS_W EM : public StatModel
+{
+public:
+    //! Type of covariation matrices
+    enum Types {
+        /** A scaled identity matrix \f$\mu_k * I\f$. There is the only
+        parameter \f$\mu_k\f$ to be estimated for each matrix. The option may be used in special cases,
+        when the constraint is relevant, or as a first step in the optimization (for example in case
+        when the data is preprocessed with PCA). The results of such preliminary estimation may be
+        passed again to the optimization procedure, this time with
+        covMatType=EM::COV_MAT_DIAGONAL. */
+        COV_MAT_SPHERICAL=0,
+        /** A diagonal matrix with positive diagonal elements. The number of
+        free parameters is d for each matrix. This is most commonly used option yielding good
+        estimation results. */
+        COV_MAT_DIAGONAL=1,
+        /** A symmetric positively defined matrix. The number of free
+        parameters in each matrix is about \f$d^2/2\f$. It is not recommended to use this option, unless
+        there is pretty accurate initial estimation of the parameters and/or a huge number of
+        training samples. */
+        COV_MAT_GENERIC=2,
+        COV_MAT_DEFAULT=COV_MAT_DIAGONAL
+    };
+
+    //! Default parameters
+    enum {DEFAULT_NCLUSTERS=5, DEFAULT_MAX_ITERS=100};
+
+    //! The initial step
+    enum {START_E_STEP=1, START_M_STEP=2, START_AUTO_STEP=0};
+
+    /** The number of mixture components in the Gaussian mixture model.
+    Default value of the parameter is EM::DEFAULT_NCLUSTERS=5. Some of %EM implementation could
+    determine the optimal number of mixtures within a specified value range, but that is not the
+    case in ML yet. */
+    /** @see setClustersNumber */
+    CV_WRAP virtual int getClustersNumber() const = 0;
+    /** @copybrief getClustersNumber @see getClustersNumber */
+    CV_WRAP virtual void setClustersNumber(int val) = 0;
+
+    /** Constraint on covariance matrices which defines type of matrices.
+    See EM::Types. */
+    /** @see setCovarianceMatrixType */
+    CV_WRAP virtual int getCovarianceMatrixType() const = 0;
+    /** @copybrief getCovarianceMatrixType @see getCovarianceMatrixType */
+    CV_WRAP virtual void setCovarianceMatrixType(int val) = 0;
+
+    /** The termination criteria of the %EM algorithm.
+    The %EM algorithm can be terminated by the number of iterations termCrit.maxCount (number of
+    M-steps) or when relative change of likelihood logarithm is less than termCrit.epsilon. Default
+    maximum number of iterations is EM::DEFAULT_MAX_ITERS=100. */
+    /** @see setTermCriteria */
+    CV_WRAP virtual TermCriteria getTermCriteria() const = 0;
+    /** @copybrief getTermCriteria @see getTermCriteria */
+    CV_WRAP virtual void setTermCriteria(const TermCriteria &val) = 0;
+
+    /** @brief Returns weights of the mixtures
+
+    Returns vector with the number of elements equal to the number of mixtures.
+     */
+    CV_WRAP virtual Mat getWeights() const = 0;
+    /** @brief Returns the cluster centers (means of the Gaussian mixture)
+
+    Returns matrix with the number of rows equal to the number of mixtures and number of columns
+    equal to the space dimensionality.
+     */
+    CV_WRAP virtual Mat getMeans() const = 0;
+    /** @brief Returns covariation matrices
+
+    Returns vector of covariation matrices. Number of matrices is the number of gaussian mixtures,
+    each matrix is a square floating-point matrix NxN, where N is the space dimensionality.
+     */
+    CV_WRAP virtual void getCovs(CV_OUT std::vector<Mat>& covs) const = 0;
+
+    /** @brief Returns posterior probabilities for the provided samples
+
+    @param samples The input samples, floating-point matrix
+    @param results The optional output \f$ nSamples \times nClusters\f$ matrix of results. It contains
+    posterior probabilities for each sample from the input
+    @param flags This parameter will be ignored
+     */
+    CV_WRAP virtual float predict( InputArray samples, OutputArray results=noArray(), int flags=0 ) const CV_OVERRIDE = 0;
+
+    /** @brief Returns a likelihood logarithm value and an index of the most probable mixture component
+    for the given sample.
+
+    @param sample A sample for classification. It should be a one-channel matrix of
+        \f$1 \times dims\f$ or \f$dims \times 1\f$ size.
+    @param probs Optional output matrix that contains posterior probabilities of each component
+        given the sample. It has \f$1 \times nclusters\f$ size and CV_64FC1 type.
+
+    The method returns a two-element double vector. Zero element is a likelihood logarithm value for
+    the sample. First element is an index of the most probable mixture component for the given
+    sample.
+     */
+    CV_WRAP virtual Vec2d predict2(InputArray sample, OutputArray probs) const = 0;
+
+    /** @brief Estimate the Gaussian mixture parameters from a samples set.
+
+    This variation starts with Expectation step. Initial values of the model parameters will be
+    estimated by the k-means algorithm.
+
+    Unlike many of the ML models, %EM is an unsupervised learning algorithm and it does not take
+    responses (class labels or function values) as input. Instead, it computes the *Maximum
+    Likelihood Estimate* of the Gaussian mixture parameters from an input sample set, stores all the
+    parameters inside the structure: \f$p_{i,k}\f$ in probs, \f$a_k\f$ in means , \f$S_k\f$ in
+    covs[k], \f$\pi_k\f$ in weights , and optionally computes the output "class label" for each
+    sample: \f$\texttt{labels}_i=\texttt{arg max}_k(p_{i,k}), i=1..N\f$ (indices of the most
+    probable mixture component for each sample).
+
+    The trained model can be used further for prediction, just like any other classifier. The
+    trained model is similar to the NormalBayesClassifier.
+
+    @param samples Samples from which the Gaussian mixture model will be estimated. It should be a
+        one-channel matrix, each row of which is a sample. If the matrix does not have CV_64F type
+        it will be converted to the inner matrix of such type for the further computing.
+    @param logLikelihoods The optional output matrix that contains a likelihood logarithm value for
+        each sample. It has \f$nsamples \times 1\f$ size and CV_64FC1 type.
+    @param labels The optional output "class label" for each sample:
+        \f$\texttt{labels}_i=\texttt{arg max}_k(p_{i,k}), i=1..N\f$ (indices of the most probable
+        mixture component for each sample). It has \f$nsamples \times 1\f$ size and CV_32SC1 type.
+    @param probs The optional output matrix that contains posterior probabilities of each Gaussian
+        mixture component given each sample. It has \f$nsamples \times nclusters\f$ size and
+        CV_64FC1 type.
+     */
+    CV_WRAP virtual bool trainEM(InputArray samples,
+                         OutputArray logLikelihoods=noArray(),
+                         OutputArray labels=noArray(),
+                         OutputArray probs=noArray()) = 0;
+
+    /** @brief Estimate the Gaussian mixture parameters from a samples set.
+
+    This variation starts with Expectation step. You need to provide initial means \f$a_k\f$ of
+    mixture components. Optionally you can pass initial weights \f$\pi_k\f$ and covariance matrices
+    \f$S_k\f$ of mixture components.
+
+    @param samples Samples from which the Gaussian mixture model will be estimated. It should be a
+        one-channel matrix, each row of which is a sample. If the matrix does not have CV_64F type
+        it will be converted to the inner matrix of such type for the further computing.
+    @param means0 Initial means \f$a_k\f$ of mixture components. It is a one-channel matrix of
+        \f$nclusters \times dims\f$ size. If the matrix does not have CV_64F type it will be
+        converted to the inner matrix of such type for the further computing.
+    @param covs0 The vector of initial covariance matrices \f$S_k\f$ of mixture components. Each of
+        covariance matrices is a one-channel matrix of \f$dims \times dims\f$ size. If the matrices
+        do not have CV_64F type they will be converted to the inner matrices of such type for the
+        further computing.
+    @param weights0 Initial weights \f$\pi_k\f$ of mixture components. It should be a one-channel
+        floating-point matrix with \f$1 \times nclusters\f$ or \f$nclusters \times 1\f$ size.
+    @param logLikelihoods The optional output matrix that contains a likelihood logarithm value for
+        each sample. It has \f$nsamples \times 1\f$ size and CV_64FC1 type.
+    @param labels The optional output "class label" for each sample:
+        \f$\texttt{labels}_i=\texttt{arg max}_k(p_{i,k}), i=1..N\f$ (indices of the most probable
+        mixture component for each sample). It has \f$nsamples \times 1\f$ size and CV_32SC1 type.
+    @param probs The optional output matrix that contains posterior probabilities of each Gaussian
+        mixture component given each sample. It has \f$nsamples \times nclusters\f$ size and
+        CV_64FC1 type.
+    */
+    CV_WRAP virtual bool trainE(InputArray samples, InputArray means0,
+                        InputArray covs0=noArray(),
+                        InputArray weights0=noArray(),
+                        OutputArray logLikelihoods=noArray(),
+                        OutputArray labels=noArray(),
+                        OutputArray probs=noArray()) = 0;
+
+    /** @brief Estimate the Gaussian mixture parameters from a samples set.
+
+    This variation starts with Maximization step. You need to provide initial probabilities
+    \f$p_{i,k}\f$ to use this option.
+
+    @param samples Samples from which the Gaussian mixture model will be estimated. It should be a
+        one-channel matrix, each row of which is a sample. If the matrix does not have CV_64F type
+        it will be converted to the inner matrix of such type for the further computing.
+    @param probs0 Initial probabilities \f$p_{i,k}\f$ from which the Maximization step starts.
+    @param logLikelihoods The optional output matrix that contains a likelihood logarithm value for
+        each sample. It has \f$nsamples \times 1\f$ size and CV_64FC1 type.
+    @param labels The optional output "class label" for each sample:
+        \f$\texttt{labels}_i=\texttt{arg max}_k(p_{i,k}), i=1..N\f$ (indices of the most probable
+        mixture component for each sample). It has \f$nsamples \times 1\f$ size and CV_32SC1 type.
+    @param probs The optional output matrix that contains posterior probabilities of each Gaussian
+        mixture component given each sample. It has \f$nsamples \times nclusters\f$ size and
+        CV_64FC1 type.
+    */
+    CV_WRAP virtual bool trainM(InputArray samples, InputArray probs0,
+                        OutputArray logLikelihoods=noArray(),
+                        OutputArray labels=noArray(),
+                        OutputArray probs=noArray()) = 0;
+
+    /** Creates empty %EM model.
+    The model should then be trained using the StatModel::train(traindata, flags) method. Alternatively, you
+    can use one of the EM::train\* methods or load it from file using Algorithm::load\<EM\>(filename).
+     */
+    CV_WRAP static Ptr<EM> create();
+
+    /** @brief Loads and creates a serialized EM from a file
+     *
+     * Use EM::save to serialize and store an EM to disk.
+     * Load the EM from this file again, by calling this function with the path to the file.
+     * Optionally specify the node for the file containing the classifier
+     *
+     * @param filepath path to serialized EM
+     * @param nodeName name of node containing the classifier
+     */
+    CV_WRAP static Ptr<EM> load(const String& filepath , const String& nodeName = String());
+};
+
+/****************************************************************************************\
+*                                      Decision Tree                                     *
+\****************************************************************************************/
+
+/** @brief The class represents a single decision tree or a collection of decision trees.
+
+The current public interface of the class allows the user to train only a single decision tree; however,
+the class is capable of storing multiple decision trees and using them for prediction (by summing
+responses or using a voting scheme), and the classes derived from DTrees (such as RTrees and Boost)
+use this capability to implement decision tree ensembles.
+
+@sa @ref ml_intro_trees
+*/
+class CV_EXPORTS_W DTrees : public StatModel
+{
+public:
+    /** Predict options */
+    enum Flags { PREDICT_AUTO=0, PREDICT_SUM=(1<<8), PREDICT_MAX_VOTE=(2<<8), PREDICT_MASK=(3<<8) };
+
+    /** Cluster possible values of a categorical variable into K\<=maxCategories clusters to
+    find a suboptimal split.
+    If a discrete variable, on which the training procedure tries to make a split, takes more than
+    maxCategories values, the precise best subset estimation may take a very long time because the
+    algorithm is exponential. Instead, many decision tree engines (including our implementation)
+    try to find a sub-optimal split in this case by clustering all the samples into maxCategories
+    clusters, that is, some categories are merged together. The clustering is applied only in n \>
+    2-class classification problems for categorical variables with N \> maxCategories possible
+    values. In case of regression and 2-class classification the optimal split can be found
+    efficiently without employing clustering, thus the parameter is not used in these cases.
+    Default value is 10.*/
+    /** @see setMaxCategories */
+    CV_WRAP virtual int getMaxCategories() const = 0;
+    /** @copybrief getMaxCategories @see getMaxCategories */
+    CV_WRAP virtual void setMaxCategories(int val) = 0;
+
+    /** The maximum possible depth of the tree.
+    That is, the training algorithm attempts to split a node while its depth is less than maxDepth.
+    The root node has zero depth. The actual depth may be smaller if the other termination criteria
+    are met (see the outline of the training procedure @ref ml_intro_trees "here"), and/or if the
+    tree is pruned. Default value is INT_MAX.*/
+    /** @see setMaxDepth */
+    CV_WRAP virtual int getMaxDepth() const = 0;
+    /** @copybrief getMaxDepth @see getMaxDepth */
+    CV_WRAP virtual void setMaxDepth(int val) = 0;
+
+    /** If the number of samples in a node is less than this parameter then the node will not be split.
+
+    Default value is 10.*/
+    /** @see setMinSampleCount */
+    CV_WRAP virtual int getMinSampleCount() const = 0;
+    /** @copybrief getMinSampleCount @see getMinSampleCount */
+    CV_WRAP virtual void setMinSampleCount(int val) = 0;
+
+    /** If CVFolds \> 1 then the algorithm prunes the built decision tree using a K-fold
+    cross-validation procedure where K is equal to CVFolds.
+    Default value is 10.*/
+    /** @see setCVFolds */
+    CV_WRAP virtual int getCVFolds() const = 0;
+    /** @copybrief getCVFolds @see getCVFolds */
+    CV_WRAP virtual void setCVFolds(int val) = 0;
+
+    /** If true then surrogate splits will be built.
+    These splits allow working with missing data and computing variable importance correctly.
+    Default value is false.
+    @note currently it's not implemented.*/
+    /** @see setUseSurrogates */
+    CV_WRAP virtual bool getUseSurrogates() const = 0;
+    /** @copybrief getUseSurrogates @see getUseSurrogates */
+    CV_WRAP virtual void setUseSurrogates(bool val) = 0;
+
+    /** If true then pruning will be harsher.
+    This will make a tree more compact and more resistant to the training data noise but a bit less
+    accurate. Default value is true.*/
+    /** @see setUse1SERule */
+    CV_WRAP virtual bool getUse1SERule() const = 0;
+    /** @copybrief getUse1SERule @see getUse1SERule */
+    CV_WRAP virtual void setUse1SERule(bool val) = 0;
+
+    /** If true then pruned branches are physically removed from the tree.
+    Otherwise they are retained and it is possible to get results from the original unpruned (or
+    pruned less aggressively) tree. Default value is true.*/
+    /** @see setTruncatePrunedTree */
+    CV_WRAP virtual bool getTruncatePrunedTree() const = 0;
+    /** @copybrief getTruncatePrunedTree @see getTruncatePrunedTree */
+    CV_WRAP virtual void setTruncatePrunedTree(bool val) = 0;
+
+    /** Termination criteria for regression trees.
+    If all absolute differences between an estimated value in a node and values of train samples
+    in this node are less than this parameter then the node will not be split further. Default
+    value is 0.01f*/
+    /** @see setRegressionAccuracy */
+    CV_WRAP virtual float getRegressionAccuracy() const = 0;
+    /** @copybrief getRegressionAccuracy @see getRegressionAccuracy */
+    CV_WRAP virtual void setRegressionAccuracy(float val) = 0;
+
+    /** @brief The array of a priori class probabilities, sorted by the class label value.
+
+    The parameter can be used to tune the decision tree preferences toward a certain class. For
+    example, if you want to detect some rare anomaly occurrence, the training base will likely
+    contain much more normal cases than anomalies, so a very good classification performance
+    will be achieved just by considering every case as normal. To avoid this, the priors can be
+    specified, where the anomaly probability is artificially increased (up to 0.5 or even
+    greater), so the weight of the misclassified anomalies becomes much bigger, and the tree is
+    adjusted properly.
+
+    You can also think about this parameter as weights of prediction categories which determine
+    relative weights that you give to misclassification. That is, if the weight of the first
+    category is 1 and the weight of the second category is 10, then each mistake in predicting
+    the second category is equivalent to making 10 mistakes in predicting the first category.
+    Default value is empty Mat.*/
+    /** @see setPriors */
+    CV_WRAP virtual cv::Mat getPriors() const = 0;
+    /** @copybrief getPriors @see getPriors */
+    CV_WRAP virtual void setPriors(const cv::Mat &val) = 0;
+
+    /** @brief The class represents a decision tree node.
+     */
+    class CV_EXPORTS Node
+    {
+    public:
+        Node();
+        double value; //!< Value at the node: a class label in case of classification or estimated
+                      //!< function value in case of regression.
+        int classIdx; //!< Class index normalized to 0..class_count-1 range and assigned to the
+                      //!< node. It is used internally in classification trees and tree ensembles.
+        int parent; //!< Index of the parent node
+        int left; //!< Index of the left child node
+        int right; //!< Index of the right child node
+        int defaultDir; //!< Default direction where to go (-1: left or +1: right). It helps in the
+                        //!< case of missing values.
+        int split; //!< Index of the first split
+    };
+
+    /** @brief The class represents a split in a decision tree.
+     */
+    class CV_EXPORTS Split
+    {
+    public:
+        Split();
+        int varIdx; //!< Index of variable on which the split is created.
+        bool inversed; //!< If true, then the inverse split rule is used (i.e. left and right
+                       //!< branches are exchanged in the rule expressions below).
+        float quality; //!< The split quality, a positive number. It is used to choose the best split.
+        int next; //!< Index of the next split in the list of splits for the node
+        float c; /**< The threshold value in case of split on an ordered variable.
+                      The rule is:
+                      @code{.none}
+                      if var_value < c
+                        then next_node <- left
+                        else next_node <- right
+                      @endcode */
+        int subsetOfs; /**< Offset of the bitset used by the split on a categorical variable.
+                            The rule is:
+                            @code{.none}
+                            if bitset[var_value] == 1
+                                then next_node <- left
+                                else next_node <- right
+                            @endcode */
+    };
+
+    /** @brief Returns indices of root nodes
+    */
+    virtual const std::vector<int>& getRoots() const = 0;
+    /** @brief Returns all the nodes
+
+    All the node indices are indices in the returned vector.
+     */
+    virtual const std::vector<Node>& getNodes() const = 0;
+    /** @brief Returns all the splits
+
+    All the split indices are indices in the returned vector.
+     */
+    virtual const std::vector<Split>& getSplits() const = 0;
+    /** @brief Returns all the bitsets for categorical splits
+
+    Split::subsetOfs is an offset in the returned vector.
+     */
+    virtual const std::vector<int>& getSubsets() const = 0;
+
+    /** @brief Creates the empty model
+
+    The static method creates an empty decision tree (it takes no parameters). It should then be
+    trained using the train method (see StatModel::train). Alternatively, you can load the model from
+    file using Algorithm::load\<DTrees\>(filename).
+     */
+    CV_WRAP static Ptr<DTrees> create();
+
+    /** @brief Loads and creates a serialized DTrees from a file
+     *
+     * Use DTrees::save to serialize and store a DTrees model to disk.
+     * Load the DTrees model from this file again, by calling this function with the path to the file.
+     * Optionally specify the node for the file containing the classifier
+     *
+     * @param filepath path to serialized DTrees model
+     * @param nodeName name of node containing the classifier
+     */
+    CV_WRAP static Ptr<DTrees> load(const String& filepath , const String& nodeName = String());
+};
+
+/****************************************************************************************\
+*                                   Random Trees Classifier                              *
+\****************************************************************************************/
+
+/** @brief The class implements the random forest predictor.
+
+@sa @ref ml_intro_rtrees
+ */
+class CV_EXPORTS_W RTrees : public DTrees
+{
+public:
+
+    /** If true then variable importance will be calculated and then it can be retrieved by RTrees::getVarImportance.
+    Default value is false.*/
+    /** @see setCalculateVarImportance */
+    CV_WRAP virtual bool getCalculateVarImportance() const = 0;
+    /** @copybrief getCalculateVarImportance @see getCalculateVarImportance */
+    CV_WRAP virtual void setCalculateVarImportance(bool val) = 0;
+
+    /** The size of the randomly selected subset of features at each tree node that are used
+    to find the best split(s).
+    If you set it to 0 then the size will be set to the square root of the total number of
+    features. Default value is 0.*/
+    /** @see setActiveVarCount */
+    CV_WRAP virtual int getActiveVarCount() const = 0;
+    /** @copybrief getActiveVarCount @see getActiveVarCount */
+    CV_WRAP virtual void setActiveVarCount(int val) = 0;
+
+    /** The termination criteria that specify when the training algorithm stops.
+    Either when the specified number of trees is trained and added to the ensemble or when
+    sufficient accuracy (measured as OOB error) is achieved. Typically the more trees you have the
+    better the accuracy. However, the improvement in accuracy generally diminishes and asymptotes
+    past a certain number of trees. Also keep in mind that the number of trees increases the
+    prediction time linearly. Default value is TermCriteria(TermCriteria::MAX_ITER +
+    TermCriteria::EPS, 50, 0.1)*/
+    /** @see setTermCriteria */
+    CV_WRAP virtual TermCriteria getTermCriteria() const = 0;
+    /** @copybrief getTermCriteria @see getTermCriteria */
+    CV_WRAP virtual void setTermCriteria(const TermCriteria &val) = 0;
+
+    /** Returns the variable importance array.
+    The method returns the variable importance vector, computed at the training stage when
+    CalculateVarImportance is set to true. If this flag was set to false, the empty matrix is
+    returned.
+     */
+    CV_WRAP virtual Mat getVarImportance() const = 0;
+
+    /** Returns the result of each individual tree in the forest.
+    In case the model is a regression problem, the method will return each of the trees'
+    results for each of the sample cases. If the model is a classifier, it will return
+    a Mat with samples + 1 rows, where the first row gives the class number and the
+    following rows return the votes each class had for each sample.
+        @param samples Array containing the samples for which votes will be calculated.
+        @param results Array where the result of the calculation will be written.
+        @param flags Flags for defining the type of RTrees.
+    */
+    CV_WRAP virtual void getVotes(InputArray samples, OutputArray results, int flags) const = 0;
+
+    /** Returns the OOB error value, computed at the training stage when calcOOBError is set to true.
+     * If this flag was set to false, 0 is returned. The OOB error is also scaled by sample weighting.
+     */
+#if CV_VERSION_MAJOR == 4
+    CV_WRAP virtual double getOOBError() const { return 0; }
+#else
+    /*CV_WRAP*/ virtual double getOOBError() const = 0;
+#endif
+
+    /** Creates the empty model.
+    Use StatModel::train to train the model, StatModel::train to create and train the model,
+    Algorithm::load to load the pre-trained model.
+     */
+    CV_WRAP static Ptr<RTrees> create();
+
+    /** @brief Loads and creates a serialized RTrees from a file
+     *
+     * Use RTrees::save to serialize and store an RTrees model to disk.
+     * Load the RTrees model from this file again, by calling this function with the path to the file.
+     * Optionally specify the node for the file containing the classifier
+     *
+     * @param filepath path to serialized RTrees model
+     * @param nodeName name of node containing the classifier
+     */
+    CV_WRAP static Ptr<RTrees> load(const String& filepath , const String& nodeName = String());
+};
+
+/****************************************************************************************\
+*                                   Boosted tree classifier                              *
+\****************************************************************************************/
+
+/** @brief Boosted tree classifier derived from DTrees
+
+@sa @ref ml_intro_boost
+ */
+class CV_EXPORTS_W Boost : public DTrees
+{
+public:
+    /** Type of the boosting algorithm.
+    See Boost::Types. Default value is Boost::REAL. */
+    /** @see setBoostType */
+    CV_WRAP virtual int getBoostType() const = 0;
+    /** @copybrief getBoostType @see getBoostType */
+    CV_WRAP virtual void setBoostType(int val) = 0;
+
+    /** The number of weak classifiers.
+    Default value is 100. */
+    /** @see setWeakCount */
+    CV_WRAP virtual int getWeakCount() const = 0;
+    /** @copybrief getWeakCount @see getWeakCount */
+    CV_WRAP virtual void setWeakCount(int val) = 0;
+
+    /** A threshold between 0 and 1 used to save computational time.
+    Samples with summary weight \f$\leq 1 - \texttt{weight\_trim\_rate}\f$ do not participate in the *next*
+    iteration of training. Set this parameter to 0 to turn off this functionality. Default value is 0.95.*/
+    /** @see setWeightTrimRate */
+    CV_WRAP virtual double getWeightTrimRate() const = 0;
+    /** @copybrief getWeightTrimRate @see getWeightTrimRate */
+    CV_WRAP virtual void setWeightTrimRate(double val) = 0;
+
+    /** Boosting type.
+    Gentle AdaBoost and Real AdaBoost are often the preferable choices. */
+    enum Types {
+        DISCRETE=0, //!< Discrete AdaBoost.
+        REAL=1, //!< Real AdaBoost. It is a technique that utilizes confidence-rated predictions
+                //!< and works well with categorical data.
+        LOGIT=2, //!< LogitBoost. It can produce good regression fits.
+        GENTLE=3 //!< Gentle AdaBoost. It puts less weight on outlier data points and for that
+                 //!< reason is often good with regression data.
+    };
+
+    /** Creates the empty model.
+    Use StatModel::train to train the model, Algorithm::load\<Boost\>(filename) to load the pre-trained model. */
+    CV_WRAP static Ptr<Boost> create();
+
+    /** @brief Loads and creates a serialized Boost from a file
+     *
+     * Use Boost::save to serialize and store a Boost model to disk.
+     * Load the Boost model from this file again, by calling this function with the path to the file.
+     * Optionally specify the node for the file containing the classifier
+     *
+     * @param filepath path to serialized Boost model
+     * @param nodeName name of node containing the classifier
+     */
+    CV_WRAP static Ptr<Boost> load(const String& filepath , const String& nodeName = String());
+};
+
+/****************************************************************************************\
+*                                   Gradient Boosted Trees                               *
+\****************************************************************************************/
+
+/*class CV_EXPORTS_W GBTrees : public DTrees
+{
+public:
+    struct CV_EXPORTS_W_MAP Params : public DTrees::Params
+    {
+        CV_PROP_RW int weakCount;
+        CV_PROP_RW int lossFunctionType;
+        CV_PROP_RW float subsamplePortion;
+        CV_PROP_RW float shrinkage;
+
+        Params();
+        Params( int lossFunctionType, int weakCount, float shrinkage,
+                float subsamplePortion, int maxDepth, bool useSurrogates );
+    };
+
+    enum {SQUARED_LOSS=0, ABSOLUTE_LOSS, HUBER_LOSS=3, DEVIANCE_LOSS};
+
+    virtual void setK(int k) = 0;
+
+    virtual float predictSerial( InputArray samples,
+                                 OutputArray weakResponses, int flags) const = 0;
+
+    static Ptr<GBTrees> create(const Params& p);
+};*/
+
+/****************************************************************************************\
+*                              Artificial Neural Networks (ANN)                          *
+\****************************************************************************************/
+
+/////////////////////////////////// Multi-Layer Perceptrons //////////////////////////////
+
+/** @brief Artificial Neural Networks - Multi-Layer Perceptrons.
+
+Unlike many other models in ML that are constructed and trained at once, in the MLP model these
+steps are separated. First, a network with the specified topology is created using the non-default
+constructor or the method ANN_MLP::create. All the weights are set to zeros. Then, the network is
+trained using a set of input and output vectors. The training procedure can be repeated more than
+once, that is, the weights can be adjusted based on the new training data.
+
+Additional flags for StatModel::train are available: ANN_MLP::TrainFlags.
+
+@sa @ref ml_intro_ann
+ */
+class CV_EXPORTS_W ANN_MLP : public StatModel
+{
+public:
+    /** Available training methods */
+    enum TrainingMethods {
+        BACKPROP=0, //!< The back-propagation algorithm.
+        RPROP = 1, //!< The RPROP algorithm. See @cite RPROP93 for details.
+        ANNEAL = 2 //!< The simulated annealing algorithm. See @cite Kirkpatrick83 for details.
+    };
+
+    /** Sets training method and common parameters.
+    @param method Default value is ANN_MLP::RPROP. See ANN_MLP::TrainingMethods.
+    @param param1 passed to setRpropDW0 for ANN_MLP::RPROP and to setBackpropWeightScale for ANN_MLP::BACKPROP and to initialT for ANN_MLP::ANNEAL.
+    @param param2 passed to setRpropDWMin for ANN_MLP::RPROP and to setBackpropMomentumScale for ANN_MLP::BACKPROP and to finalT for ANN_MLP::ANNEAL.
+    */
+    CV_WRAP virtual void setTrainMethod(int method, double param1 = 0, double param2 = 0) = 0;
+
+    /** Returns current training method */
+    CV_WRAP virtual int getTrainMethod() const = 0;
+
+    /** Initialize the activation function for each neuron.
+    Currently the default and the only fully supported activation function is ANN_MLP::SIGMOID_SYM.
+    @param type The type of activation function. See ANN_MLP::ActivationFunctions.
+    @param param1 The first parameter of the activation function, \f$\alpha\f$. Default value is 0.
+    @param param2 The second parameter of the activation function, \f$\beta\f$. Default value is 0.
+    */
+    CV_WRAP virtual void setActivationFunction(int type, double param1 = 0, double param2 = 0) = 0;
+
+    /**  Integer vector specifying the number of neurons in each layer including the input and output layers.
+    The very first element specifies the number of elements in the input layer.
+    The last element - number of elements in the output layer. Default value is empty Mat.
+    @sa getLayerSizes */
+    CV_WRAP virtual void setLayerSizes(InputArray _layer_sizes) = 0;
+
+    /**  Integer vector specifying the number of neurons in each layer including the input and output layers.
+    The very first element specifies the number of elements in the input layer.
+    The last element - number of elements in the output layer.
+    @sa setLayerSizes */
+    CV_WRAP virtual cv::Mat getLayerSizes() const = 0;
+
+    /** Termination criteria of the training algorithm.
+    You can specify the maximum number of iterations (maxCount) and/or how much the error could
+    change between the iterations to make the algorithm continue (epsilon). Default value is
+    TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 1000, 0.01).*/
+    /** @see setTermCriteria */
+    CV_WRAP virtual TermCriteria getTermCriteria() const = 0;
+    /** @copybrief getTermCriteria @see getTermCriteria */
+    CV_WRAP virtual void setTermCriteria(TermCriteria val) = 0;
+
+    /** BPROP: Strength of the weight gradient term.
+    The recommended value is about 0.1. Default value is 0.1.*/
+    /** @see setBackpropWeightScale */
+    CV_WRAP virtual double getBackpropWeightScale() const = 0;
+    /** @copybrief getBackpropWeightScale @see getBackpropWeightScale */
+    CV_WRAP virtual void setBackpropWeightScale(double val) = 0;
+
+    /** BPROP: Strength of the momentum term (the difference between weights on the 2 previous iterations).
+    This parameter provides some inertia to smooth the random fluctuations of the weights. It can
+    vary from 0 (the feature is disabled) to 1 and beyond. The value 0.1 or so is good enough.
+    Default value is 0.1.*/
+    /** @see setBackpropMomentumScale */
+    CV_WRAP virtual double getBackpropMomentumScale() const = 0;
+    /** @copybrief getBackpropMomentumScale @see getBackpropMomentumScale */
+    CV_WRAP virtual void setBackpropMomentumScale(double val) = 0;
+
+    /** RPROP: Initial value \f$\Delta_0\f$ of update-values \f$\Delta_{ij}\f$.
+    Default value is 0.1.*/
+    /** @see setRpropDW0 */
+    CV_WRAP virtual double getRpropDW0() const = 0;
+    /** @copybrief getRpropDW0 @see getRpropDW0 */
+    CV_WRAP virtual void setRpropDW0(double val) = 0;
+
+    /** RPROP: Increase factor \f$\eta^+\f$.
+    It must be \>1. Default value is 1.2.*/
+    /** @see setRpropDWPlus */
+    CV_WRAP virtual double getRpropDWPlus() const = 0;
+    /** @copybrief getRpropDWPlus @see getRpropDWPlus */
+    CV_WRAP virtual void setRpropDWPlus(double val) = 0;
+
+    /** RPROP: Decrease factor \f$\eta^-\f$.
+    It must be \<1. Default value is 0.5.*/
+    /** @see setRpropDWMinus */
+    CV_WRAP virtual double getRpropDWMinus() const = 0;
+    /** @copybrief getRpropDWMinus @see getRpropDWMinus */
+    CV_WRAP virtual void setRpropDWMinus(double val) = 0;
+
+    /** RPROP: Update-values lower limit \f$\Delta_{min}\f$.
+    It must be positive. Default value is FLT_EPSILON.*/
+    /** @see setRpropDWMin */
+    CV_WRAP virtual double getRpropDWMin() const = 0;
+    /** @copybrief getRpropDWMin @see getRpropDWMin */
+    CV_WRAP virtual void setRpropDWMin(double val) = 0;
+
+    /** RPROP: Update-values upper limit \f$\Delta_{max}\f$.
+    It must be \>1. Default value is 50.*/
+    /** @see setRpropDWMax */
+    CV_WRAP virtual double getRpropDWMax() const = 0;
+    /** @copybrief getRpropDWMax @see getRpropDWMax */
+    CV_WRAP virtual void setRpropDWMax(double val) = 0;
+
+    /** ANNEAL: Update initial temperature.
+    It must be \>=0. Default value is 10.*/
+    /** @see setAnnealInitialT */
+    CV_WRAP virtual double getAnnealInitialT() const = 0;
+    /** @copybrief getAnnealInitialT @see getAnnealInitialT */
+    CV_WRAP virtual void setAnnealInitialT(double val) = 0;
+
+    /** ANNEAL: Update final temperature.
+    It must be \>=0 and less than initialT. Default value is 0.1.*/
+    /** @see setAnnealFinalT */
+    CV_WRAP virtual double getAnnealFinalT() const = 0;
+    /** @copybrief getAnnealFinalT @see getAnnealFinalT */
+    CV_WRAP virtual void setAnnealFinalT(double val) = 0;
+
+    /** ANNEAL: Update cooling ratio.
+    It must be \>0 and less than 1. Default value is 0.95.*/
+    /** @see setAnnealCoolingRatio */
+    CV_WRAP virtual double getAnnealCoolingRatio() const = 0;
+    /** @copybrief getAnnealCoolingRatio @see getAnnealCoolingRatio */
+    CV_WRAP virtual void setAnnealCoolingRatio(double val) = 0;
+
+    /** ANNEAL: Update iteration per step.
+    It must be \>0 . Default value is 10.*/
+    /** @see setAnnealItePerStep */
+    CV_WRAP virtual int getAnnealItePerStep() const = 0;
+    /** @copybrief getAnnealItePerStep @see getAnnealItePerStep */
+    CV_WRAP virtual void setAnnealItePerStep(int val) = 0;
+
+    /** @brief Set/initialize anneal RNG */
+    virtual void setAnnealEnergyRNG(const RNG& rng) = 0;
+
+    /** Possible activation functions. */
+    enum ActivationFunctions {
+        /** Identity function: \f$f(x)=x\f$ */
+        IDENTITY = 0,
+        /** Symmetrical sigmoid: \f$f(x)=\beta*(1-e^{-\alpha x})/(1+e^{-\alpha x})\f$
+        @note
+        If you are using the default sigmoid activation function with the default parameter values
+        fparam1=0 and fparam2=0 then the function used is y = 1.7159\*tanh(2/3 \* x), so the output
+        will range from [-1.7159, 1.7159], instead of [0,1].*/
+        SIGMOID_SYM = 1,
+        /** Gaussian function: \f$f(x)=\beta e^{-\alpha x*x}\f$ */
+        GAUSSIAN = 2,
+        /** ReLU function: \f$f(x)=max(0,x)\f$ */
+        RELU = 3,
+        /** Leaky ReLU function: \f$f(x)=x\f$ for \f$x>0\f$ and \f$f(x)=\alpha x\f$ for \f$x\le 0\f$ */
+        LEAKYRELU= 4
+    };
+
+    /** Train options */
+    enum TrainFlags {
+        /** Update the network weights, rather than compute them from scratch. In the latter case
+        the weights are initialized using the Nguyen-Widrow algorithm. */
+        UPDATE_WEIGHTS = 1,
+        /** Do not normalize the input vectors. If this flag is not set, the training algorithm
+        normalizes each input feature independently, shifting its mean value to 0 and making the
+        standard deviation equal to 1. If the network is assumed to be updated frequently, the new
+        training data could be much different from the original one. In this case, you should take care
+        of proper normalization. */
+        NO_INPUT_SCALE = 2,
+        /** Do not normalize the output vectors. If the flag is not set, the training algorithm
+        normalizes each output feature independently, by transforming it to a certain range
+        depending on the used activation function. */
+        NO_OUTPUT_SCALE = 4
+    };
+
+    CV_WRAP virtual Mat getWeights(int layerIdx) const = 0;
+
+    /** @brief Creates empty model
+
+    Use StatModel::train to train the model, Algorithm::load\<ANN_MLP\>(filename) to load the pre-trained model.
+    Note that the train method has optional flags: ANN_MLP::TrainFlags.
+     */
+    CV_WRAP static Ptr<ANN_MLP> create();
+
+    /** @brief Loads and creates a serialized ANN from a file
+     *
+     * Use ANN_MLP::save to serialize and store an ANN to disk.
+     * Load the ANN from this file again, by calling this function with the path to the file.
+     *
+     * @param filepath path to serialized ANN
+     */
+    CV_WRAP static Ptr<ANN_MLP> load(const String& filepath);
+
+};
+
+#ifndef DISABLE_OPENCV_3_COMPATIBILITY
+typedef ANN_MLP ANN_MLP_ANNEAL; // alias of ANN_MLP; compiled out when DISABLE_OPENCV_3_COMPATIBILITY is defined
+#endif // DISABLE_OPENCV_3_COMPATIBILITY
+
+/****************************************************************************************\
+*                           Logistic Regression                                          *
+\****************************************************************************************/
+
+/** @brief Implements Logistic Regression classifier.
+
+@sa @ref ml_intro_lr
+ */
+class CV_EXPORTS_W LogisticRegression : public StatModel
+{
+public:
+
+    /** Learning rate. */
+    /** @see setLearningRate */
+    CV_WRAP virtual double getLearningRate() const = 0;
+    /** @copybrief getLearningRate @see getLearningRate */
+    CV_WRAP virtual void setLearningRate(double val) = 0;
+
+    /** Number of iterations. */
+    /** @see setIterations */
+    CV_WRAP virtual int getIterations() const = 0;
+    /** @copybrief getIterations @see getIterations */
+    CV_WRAP virtual void setIterations(int val) = 0;
+
+    /** Kind of regularization to be applied. See LogisticRegression::RegKinds. */
+    /** @see setRegularization */
+    CV_WRAP virtual int getRegularization() const = 0;
+    /** @copybrief getRegularization @see getRegularization */
+    CV_WRAP virtual void setRegularization(int val) = 0;
+
+    /** Kind of training method used. See LogisticRegression::Methods. */
+    /** @see setTrainMethod */
+    CV_WRAP virtual int getTrainMethod() const = 0;
+    /** @copybrief getTrainMethod @see getTrainMethod */
+    CV_WRAP virtual void setTrainMethod(int val) = 0;
+
+    /** Specifies the number of training samples taken in each step of Mini-Batch Gradient
+    Descent. Will only be used if using LogisticRegression::MINI_BATCH training algorithm. It
+    has to take values less than the total number of training samples. */
+    /** @see setMiniBatchSize */
+    CV_WRAP virtual int getMiniBatchSize() const = 0;
+    /** @copybrief getMiniBatchSize @see getMiniBatchSize */
+    CV_WRAP virtual void setMiniBatchSize(int val) = 0;
+
+    /** Termination criteria of the algorithm. */
+    /** @see setTermCriteria */
+    CV_WRAP virtual TermCriteria getTermCriteria() const = 0;
+    /** @copybrief getTermCriteria @see getTermCriteria */
+    CV_WRAP virtual void setTermCriteria(TermCriteria val) = 0;
+
+    //! Regularization kinds
+    enum RegKinds {
+        REG_DISABLE = -1, //!< Regularization disabled
+        REG_L1 = 0, //!< %L1 norm
+        REG_L2 = 1 //!< %L2 norm
+    };
+
+    //! Training methods
+    enum Methods {
+        BATCH = 0,
+        MINI_BATCH = 1 //!< Set MiniBatchSize to a positive integer when using this method.
+    };
+
+    /** @brief Predicts responses for input samples and returns a float type.
+
+    @param samples The input data for the prediction algorithm. Matrix [m x n], where each row
+        contains variables (features) of one object being classified. Should have data type CV_32F.
+    @param results Predicted labels as a column matrix of type CV_32S.
+    @param flags Not used.
+     */
+    CV_WRAP virtual float predict( InputArray samples, OutputArray results=noArray(), int flags=0 ) const CV_OVERRIDE = 0;
+
+    /** @brief This function returns the trained parameters arranged across rows.
+
+    For a two class classification problem, it returns a row matrix. It returns learnt parameters of
+    the Logistic Regression as a matrix of type CV_32F.
+     */
+    CV_WRAP virtual Mat get_learnt_thetas() const = 0;
+
+    /** @brief Creates empty model.
+
+    Creates an empty Logistic Regression model; create() takes no arguments, so configure it via the set*() methods.
+     */
+    CV_WRAP static Ptr<LogisticRegression> create();
+
+    /** @brief Loads and creates a serialized LogisticRegression from a file
+     *
+     * Use LogisticRegression::save to serialize and store a LogisticRegression to disk.
+     * Load the LogisticRegression from this file again, by calling this function with the path to the file.
+     * Optionally specify the node for the file containing the classifier
+     *
+     * @param filepath path to serialized LogisticRegression
+     * @param nodeName name of node containing the classifier
+     */
+    CV_WRAP static Ptr<LogisticRegression> load(const String& filepath , const String& nodeName = String());
+};
+
+
+/****************************************************************************************\
+*                        Stochastic Gradient Descent SVM Classifier                      *
+\****************************************************************************************/
+
+/*!
+@brief Stochastic Gradient Descent SVM classifier
+
+SVMSGD provides a fast and easy-to-use implementation of the SVM classifier using the Stochastic Gradient Descent approach,
+as presented in @cite bottou2010large.
+
+The classifier has the following parameters:
+- model type,
+- margin type,
+- margin regularization (\f$\lambda\f$),
+- initial step size (\f$\gamma_0\f$),
+- step decreasing power (\f$c\f$),
+- and termination criteria.
+
+The model type may have one of the following values: \ref SGD and \ref ASGD.
+
+- \ref SGD is the classic version of SVMSGD classifier: every next step is calculated by the formula
+  \f[w_{t+1} = w_t - \gamma(t) \frac{dQ_i}{dw} |_{w = w_t}\f]
+  where
+  - \f$w_t\f$ is the weights vector for decision function at step \f$t\f$,
+  - \f$\gamma(t)\f$ is the step size of model parameters at the iteration \f$t\f$, it is decreased on each step by the formula
+    \f$\gamma(t) = \gamma_0  (1 + \lambda  \gamma_0 t) ^ {-c}\f$
+  - \f$Q_i\f$ is the target functional from SVM task for sample with number \f$i\f$, this sample is chosen stochastically on each step of the algorithm.
+
+- \ref ASGD is Average Stochastic Gradient Descent SVM Classifier. ASGD classifier averages weights vector on each step of algorithm by the formula
+\f$\widehat{w}_{t+1} = \frac{t}{1+t}\widehat{w}_{t} + \frac{1}{1+t}w_{t+1}\f$
+
+The recommended model type is ASGD (following @cite bottou2010large).
+
+The margin type may have one of the following values: \ref SOFT_MARGIN or \ref HARD_MARGIN.
+
+- You should use \ref HARD_MARGIN type, if you have linearly separable sets.
+- You should use \ref SOFT_MARGIN type, if you have non-linearly separable sets or sets with outliers.
+- In the general case (if you know nothing about linear separability of your sets), use SOFT_MARGIN.
+
+The other parameters may be described as follows:
+- Margin regularization parameter is responsible for weights decreasing at each step and for the strength of restrictions on outliers
+  (the less the parameter, the less probability that an outlier will be ignored).
+  Recommended value for SGD model is 0.0001, for ASGD model is 0.00001.
+
+- Initial step size parameter is the initial value for the step size \f$\gamma(t)\f$.
+  You will have to find the best initial step for your problem.
+
+- Step decreasing power is the power parameter for \f$\gamma(t)\f$ decreasing by the formula, mentioned above.
+  Recommended value for SGD model is 1, for ASGD model is 0.75.
+
+- Termination criteria can be TermCriteria::COUNT, TermCriteria::EPS or TermCriteria::COUNT + TermCriteria::EPS.
+  You will have to find the best termination criteria for your problem.
+
+Note that the parameters margin regularization, initial step size, and step decreasing power should be positive.
+
+To use SVMSGD algorithm do as follows:
+
+- first, create the SVMSGD object. The algorithm will set optimal parameters by default, but you can set your own parameters via functions setSvmsgdType(),
+  setMarginType(), setMarginRegularization(), setInitialStepSize(), and setStepDecreasingPower().
+
+- then the SVM model can be trained using the train features and the corresponding labels by the method train().
+
+- after that, the label of a new feature vector can be predicted using the method predict().
+
+@code
+// Create empty object
+cv::Ptr<SVMSGD> svmsgd = SVMSGD::create();
+
+// Train the Stochastic Gradient Descent SVM
+svmsgd->train(trainData);
+
+// Predict labels for the new samples
+svmsgd->predict(samples, responses);
+@endcode
+
+*/
+
+class CV_EXPORTS_W SVMSGD : public cv::ml::StatModel
+{
+public:
+
+    /** SVMSGD type.
+    ASGD is often the preferable choice. */
+    enum SvmsgdType
+    {
+        SGD, //!< Stochastic Gradient Descent
+        ASGD //!< Average Stochastic Gradient Descent
+    };
+
+    /** Margin type.*/
+    enum MarginType
+    {
+        SOFT_MARGIN, //!< General case, suits the case of non-linearly separable sets, allows outliers.
+        HARD_MARGIN  //!< More accurate for the case of linearly separable sets.
+    };
+
+    /**
+     * @return the weights of the trained model (decision function f(x) = weights * x + shift).
+    */
+    CV_WRAP virtual Mat getWeights() = 0;
+
+    /**
+     * @return the shift of the trained model (decision function f(x) = weights * x + shift).
+    */
+    CV_WRAP virtual float getShift() = 0;
+
+    /** @brief Creates empty model.
+     * Use StatModel::train to train the model. Since %SVMSGD has several parameters, you may want to
+     * find the best parameters for your problem or use setOptimalParameters() to set some default parameters.
+    */
+    CV_WRAP static Ptr<SVMSGD> create();
+
+    /** @brief Loads and creates a serialized SVMSGD from a file
+     *
+     * Use SVMSGD::save to serialize and store an SVMSGD to disk.
+     * Load the SVMSGD from this file again, by calling this function with the path to the file.
+     * Optionally specify the node for the file containing the classifier
+     *
+     * @param filepath path to serialized SVMSGD
+     * @param nodeName name of node containing the classifier
+     */
+    CV_WRAP static Ptr<SVMSGD> load(const String& filepath , const String& nodeName = String());
+
+    /** @brief Sets optimal parameter values for the chosen SVM SGD model.
+     * @param svmsgdType is the type of SVMSGD classifier.
+     * @param marginType is the type of margin constraint.
+    */
+    CV_WRAP virtual void setOptimalParameters(int svmsgdType = SVMSGD::ASGD, int marginType = SVMSGD::SOFT_MARGIN) = 0;
+
+    /** @brief %Algorithm type, one of SVMSGD::SvmsgdType. */
+    /** @see setSvmsgdType */
+    CV_WRAP virtual int getSvmsgdType() const = 0;
+    /** @copybrief getSvmsgdType @see getSvmsgdType */
+    CV_WRAP virtual void setSvmsgdType(int svmsgdType) = 0;
+
+    /** @brief %Margin type, one of SVMSGD::MarginType. */
+    /** @see setMarginType */
+    CV_WRAP virtual int getMarginType() const = 0;
+    /** @copybrief getMarginType @see getMarginType */
+    CV_WRAP virtual void setMarginType(int marginType) = 0;
+
+    /** @brief Parameter marginRegularization of a %SVMSGD optimization problem. */
+    /** @see setMarginRegularization */
+    CV_WRAP virtual float getMarginRegularization() const = 0;
+    /** @copybrief getMarginRegularization @see getMarginRegularization */
+    CV_WRAP virtual void setMarginRegularization(float marginRegularization) = 0;
+
+    /** @brief Parameter initialStepSize of a %SVMSGD optimization problem. */
+    /** @see setInitialStepSize */
+    CV_WRAP virtual float getInitialStepSize() const = 0;
+    /** @copybrief getInitialStepSize @see getInitialStepSize */
+    CV_WRAP virtual void setInitialStepSize(float InitialStepSize) = 0;
+
+    /** @brief Parameter stepDecreasingPower of a %SVMSGD optimization problem. */
+    /** @see setStepDecreasingPower */
+    CV_WRAP virtual float getStepDecreasingPower() const = 0;
+    /** @copybrief getStepDecreasingPower @see getStepDecreasingPower */
+    CV_WRAP virtual void setStepDecreasingPower(float stepDecreasingPower) = 0;
+
+    /** @brief Termination criteria of the training algorithm.
+    You can specify the maximum number of iterations (maxCount) and/or how much the error could
+    change between the iterations to make the algorithm continue (epsilon).*/
+    /** @see setTermCriteria */
+    CV_WRAP virtual TermCriteria getTermCriteria() const = 0;
+    /** @copybrief getTermCriteria @see getTermCriteria */
+    CV_WRAP virtual void setTermCriteria(const cv::TermCriteria &val) = 0;
+};
+
+
+/****************************************************************************************\
+*                           Auxiliary functions declarations                              *
+\****************************************************************************************/
+
+/** @brief Generates _sample_ from multivariate normal distribution
+
+@param mean an average row vector
+@param cov symmetric covariance matrix
+@param nsamples returned samples count
+@param samples returned samples array
+*/
+CV_EXPORTS void randMVNormal( InputArray mean, InputArray cov, int nsamples, OutputArray samples);
+
+/** @brief Creates test set */
+CV_EXPORTS void createConcentricSpheresTestSet( int nsamples, int nfeatures, int nclasses,
+                                                OutputArray samples, OutputArray responses);
+
+
+/****************************************************************************************\
+*                                   Simulated annealing solver                             *
+\****************************************************************************************/
+
+#ifdef CV_DOXYGEN
+/** @brief This class declares an example interface for the system state used in the simulated annealing optimization algorithm.
+
+@note This class is not defined in C++ code and can't be used directly - you need your own implementation with the same methods.
+*/
+struct SimulatedAnnealingSolverSystem
+{
+    /** Gives the energy value for a state of the system.*/
+    double energy() const;
+    /** Function which changes the state of the system (random perturbation).*/
+    void changeState();
+    /** Function to revert to the previous state. Can be called once only after changeState(). */
+    void reverseState();
+};
+#endif // CV_DOXYGEN
+
+/** @brief The function implements simulated annealing for optimization.
+
+See @cite Kirkpatrick83 for details.
+@return the number of accepted state changes (see the definition in opencv2/ml/ml.inl.hpp)
+@param solverSystem optimization system (see SimulatedAnnealingSolverSystem)
+@param initialTemperature initial temperature (asserted to be greater than finalTemperature)
+@param finalTemperature final temperature (asserted to be \>0)
+@param coolingRatio temperature step multiplier (asserted to be \<1)
+@param iterationsPerStep number of iterations per temperature changing step (asserted to be \>0)
+@param lastTemperature optional output: the temperature reached when the cooling loop stops
+@param rngEnergy specify custom random numbers generator (cv::theRNG() by default)
+*/
+template<class SimulatedAnnealingSolverSystem>
+int simulatedAnnealingSolver(SimulatedAnnealingSolverSystem& solverSystem,
+     double initialTemperature, double finalTemperature, double coolingRatio,
+     size_t iterationsPerStep,
+     CV_OUT double* lastTemperature = NULL,
+     cv::RNG& rngEnergy = cv::theRNG()
+);
+
+//! @} ml
+
+}
+}
+
+#include <opencv2/ml/ml.inl.hpp>
+
+#endif // __cplusplus
+#endif // OPENCV_ML_HPP
+
+/* End of file. */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/ml/ml.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/ml/ml.hpp
new file mode 100644
index 000000000000..f6f9cd8f8932
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/ml/ml.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/ml.hpp"
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/ml/ml.inl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/ml/ml.inl.hpp
new file mode 100644
index 000000000000..dc9c78393a5a
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/ml/ml.inl.hpp
@@ -0,0 +1,60 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_ML_INL_HPP
+#define OPENCV_ML_INL_HPP
+
+namespace cv { namespace ml {
+
+// declared in ml.hpp
+template<class SimulatedAnnealingSolverSystem>
+int simulatedAnnealingSolver(SimulatedAnnealingSolverSystem& solverSystem,
+     double initialTemperature, double finalTemperature, double coolingRatio,
+     size_t iterationsPerStep,
+     CV_OUT double* lastTemperature,
+     cv::RNG& rngEnergy
+)
+{
+    // Preconditions, checked in this order.
+    CV_Assert(finalTemperature > 0);
+    CV_Assert(initialTemperature > finalTemperature);
+    CV_Assert(iterationsPerStep > 0);
+    CV_Assert(coolingRatio < 1.0f);
+    double temperature = initialTemperature;
+    double currentEnergy = solverSystem.energy();
+    int acceptedMoves = 0;  // number of state changes that were kept; this is the return value
+
+    // One pass per temperature level; the temperature is multiplied by coolingRatio after each pass.
+    for (; temperature > finalTemperature; temperature *= coolingRatio)
+    {
+        for (size_t step = 0; step < iterationsPerStep; ++step)
+        {
+            solverSystem.changeState();
+            const double candidateEnergy = solverSystem.energy();
+
+            // A strictly lower energy is always kept. Otherwise the move is kept only when a draw
+            // from rngEnergy.uniform(0.0, 1.0) is below exp(-(candidateEnergy - currentEnergy) / temperature).
+            bool accepted = candidateEnergy < currentEnergy;
+            if (!accepted)
+            {
+                const double draw = rngEnergy.uniform(0.0, 1.0);
+                accepted = draw < std::exp(-(candidateEnergy - currentEnergy) / temperature);
+            }
+            if (!accepted)
+            {
+                solverSystem.reverseState();  // rejected: undo the perturbation
+                continue;
+            }
+            currentEnergy = candidateEnergy;
+            ++acceptedMoves;
+        }
+    }
+    if (lastTemperature)
+        *lastTemperature = temperature;
+    return acceptedMoves;
+}
+
+}} //namespace
+
+#endif // OPENCV_ML_INL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/objdetect.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect.hpp
new file mode 100644
index 000000000000..7f1189060806
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect.hpp
@@ -0,0 +1,873 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_OBJDETECT_HPP
+#define OPENCV_OBJDETECT_HPP
+
+#include "opencv2/core.hpp"
+#include "opencv2/objdetect/aruco_detector.hpp"
+#include "opencv2/objdetect/graphical_code_detector.hpp"
+
+/**
+@defgroup objdetect Object Detection
+
+@{
+    @defgroup objdetect_cascade_classifier Cascade Classifier for Object Detection
+
+    The object detector described below has been initially proposed by Paul Viola @cite Viola01 and
+    improved by Rainer Lienhart @cite Lienhart02 .
+
+    First, a classifier (namely a *cascade of boosted classifiers working with haar-like features*) is
+    trained with a few hundred sample views of a particular object (i.e., a face or a car), called
+    positive examples, that are scaled to the same size (say, 20x20), and negative examples - arbitrary
+    images of the same size.
+
+    After a classifier is trained, it can be applied to a region of interest (of the same size as used
+    during the training) in an input image. The classifier outputs a "1" if the region is likely to show
+    the object (i.e., face/car), and "0" otherwise. To search for the object in the whole image one can
+    move the search window across the image and check every location using the classifier. The
+    classifier is designed so that it can be easily "resized" in order to be able to find the objects of
+    interest at different sizes, which is more efficient than resizing the image itself. So, to find an
+    object of an unknown size in the image the scan procedure should be done several times at different
+    scales.
+
+    The word "cascade" in the classifier name means that the resultant classifier consists of several
+    simpler classifiers (*stages*) that are applied subsequently to a region of interest until at some
+    stage the candidate is rejected or all the stages are passed. The word "boosted" means that the
+    classifiers at every stage of the cascade are complex themselves and they are built out of basic
+    classifiers using one of four different boosting techniques (weighted voting). Currently Discrete
+    Adaboost, Real Adaboost, Gentle Adaboost and Logitboost are supported. The basic classifiers are
+    decision-tree classifiers with at least 2 leaves. Haar-like features are the input to the basic
+    classifiers, and are calculated as described below. The current algorithm uses the following
+    Haar-like features:
+
+    ![image](pics/haarfeatures.png)
+
+    The feature used in a particular classifier is specified by its shape (1a, 2b etc.), position within
+    the region of interest and the scale (this scale is not the same as the scale used at the detection
+    stage, though these two scales are multiplied). For example, in the case of the third line feature
+    (2c) the response is calculated as the difference between the sum of image pixels under the
+    rectangle covering the whole feature (including the two white stripes and the black stripe in the
+    middle) and the sum of the image pixels under the black stripe multiplied by 3 in order to
+    compensate for the differences in the size of areas. The sums of pixel values over a rectangular
+    regions are calculated rapidly using integral images (see below and the integral description).
+
+    Check @ref tutorial_cascade_classifier "the corresponding tutorial" for more details.
+
+    The following reference is for the detection part only. There is a separate application called
+    opencv_traincascade that can train a cascade of boosted classifiers from a set of samples.
+
+    @note In the new C++ interface it is also possible to use LBP (local binary pattern) features in
+    addition to Haar-like features. Reference (@cite Viola01): Paul Viola and Michael J. Jones. Rapid Object Detection
+    using a Boosted Cascade of Simple Features. IEEE CVPR, 2001. The paper is available online at
+    <https://github.com/SvHey/thesis/blob/master/Literature/ObjectDetection/violaJones_CVPR2001.pdf>
+
+    @defgroup objdetect_hog HOG (Histogram of Oriented Gradients) descriptor and object detector
+    @defgroup objdetect_barcode Barcode detection and decoding
+    @defgroup objdetect_qrcode QRCode detection and encoding
+    @defgroup objdetect_dnn_face DNN-based face detection and recognition
+
+    Check @ref tutorial_dnn_face "the corresponding tutorial" for more details.
+
+    @defgroup objdetect_common Common functions and classes
+    @defgroup objdetect_aruco ArUco markers and boards detection for robust camera pose estimation
+    @{
+        ArUco Marker Detection
+        Square fiducial markers (also known as Augmented Reality Markers) are useful for easy,
+        fast and robust camera pose estimation.
+
+        The main functionality of ArucoDetector class is detection of markers in an image. If the markers are grouped
+        as a board, then you can try to recover the missing markers with ArucoDetector::refineDetectedMarkers().
+        ArUco markers can also be used for advanced chessboard corner finding. To do this, group the markers in the
+        CharucoBoard and find the corners of the chessboard with the CharucoDetector::detectBoard().
+
+        The implementation is based on the ArUco Library by R. Muñoz-Salinas and S. Garrido-Jurado @cite Aruco2014.
+
+        Markers can also be detected based on the AprilTag 2 @cite wang2016iros fiducial detection method.
+
+        @sa @cite Aruco2014
+        This code has been originally developed by Sergio Garrido-Jurado as a project
+        for Google Summer of Code 2015 (GSoC 15).
+    @}
+
+@}
+ */
+
+typedef struct CvHaarClassifierCascade CvHaarClassifierCascade;
+
+namespace cv
+{
+
+//! @addtogroup objdetect_common
+//! @{
+
+///////////////////////////// Object Detection ////////////////////////////
+
+/** @brief Predicate used for grouping object candidates detected by Cascade Classifier, HOG etc.
+
+Two rectangles compare as similar when x, y, x+width and y+height each differ by at most eps * (min width + min height) / 2; an instance of the class is to be passed to cv::partition.
+ */
+class CV_EXPORTS SimilarRects
+{
+public:
+    SimilarRects(double _eps) : eps(_eps) {}
+    inline bool operator()(const Rect& r1, const Rect& r2) const
+    {
+        const double tolerance = eps * ((std::min)(r1.width, r2.width) + (std::min)(r1.height, r2.height)) * 0.5;
+        if (!(std::abs(r1.x - r2.x) <= tolerance)) return false;                        // x
+        if (!(std::abs(r1.y - r2.y) <= tolerance)) return false;                        // y
+        if (!(std::abs(r1.x + r1.width - r2.x - r2.width) <= tolerance)) return false;  // x + width
+        return std::abs(r1.y + r1.height - r2.y - r2.height) <= tolerance;              // y + height
+    }
+    double eps;
+};
+
+/** @brief Groups the object candidate rectangles.
+
+@param rectList Input/output vector of rectangles. Output vector includes retained and grouped
+rectangles. (The Python list is not modified in place.)
+@param groupThreshold Minimum possible number of rectangles minus 1. The threshold is used in a
+group of rectangles to retain it.
+@param eps Relative difference between sides of the rectangles to merge them into a group.
+
+The function is a wrapper for the generic function partition . It clusters all the input rectangles
+using the rectangle equivalence criteria that combines rectangles with similar sizes and similar
+locations. The similarity is defined by eps. When eps=0 , no clustering is done at all. If
+\f$\texttt{eps}\rightarrow +\infty\f$ , all the rectangles are put in one cluster. Then, the small
+clusters containing less than or equal to groupThreshold rectangles are rejected. In each other
+cluster, the average rectangle is computed and put into the output rectangle list.
+ */
+CV_EXPORTS   void groupRectangles(std::vector<Rect>& rectList, int groupThreshold, double eps = 0.2);
+/** @overload */
+CV_EXPORTS_W void groupRectangles(CV_IN_OUT std::vector<Rect>& rectList, CV_OUT std::vector<int>& weights,
+                                  int groupThreshold, double eps = 0.2);
+/** @overload */
+CV_EXPORTS   void groupRectangles(std::vector<Rect>& rectList, int groupThreshold,
+                                  double eps, std::vector<int>* weights, std::vector<double>* levelWeights );
+/** @overload */
+CV_EXPORTS   void groupRectangles(std::vector<Rect>& rectList, std::vector<int>& rejectLevels,
+                                  std::vector<double>& levelWeights, int groupThreshold, double eps = 0.2);
+/** @overload */
+CV_EXPORTS   void groupRectangles_meanshift(std::vector<Rect>& rectList, std::vector<double>& foundWeights,
+                                            std::vector<double>& foundScales,
+                                            double detectThreshold = 0.0, Size winDetSize = Size(64, 128));
+//! @}
+
+//! @addtogroup objdetect_cascade_classifier
+//! @{
+
+template<> struct DefaultDeleter<CvHaarClassifierCascade>{ CV_EXPORTS void operator ()(CvHaarClassifierCascade* obj) const; };
+
+enum { CASCADE_DO_CANNY_PRUNING    = 1, // each enumerator below is a distinct power of two
+       CASCADE_SCALE_IMAGE         = 2,
+       CASCADE_FIND_BIGGEST_OBJECT = 4,
+       CASCADE_DO_ROUGH_SEARCH     = 8
+     }; // NOTE(review): presumably OR-ed together into the 'flags' argument of detectMultiScale - confirm
+
+class CV_EXPORTS_W BaseCascadeClassifier : public Algorithm
+{   // Abstract interface: apart from the destructor, every member function declared directly in this class is pure virtual.
+public:
+    virtual ~BaseCascadeClassifier();
+    virtual bool empty() const CV_OVERRIDE = 0;
+    virtual bool load( const String& filename ) = 0;
+    virtual void detectMultiScale( InputArray image, // overload 1: outputs the rectangles only
+                           CV_OUT std::vector<Rect>& objects,
+                           double scaleFactor,
+                           int minNeighbors, int flags,
+                           Size minSize, Size maxSize ) = 0;
+
+    virtual void detectMultiScale( InputArray image, // overload 2: additionally outputs numDetections
+                           CV_OUT std::vector<Rect>& objects,
+                           CV_OUT std::vector<int>& numDetections,
+                           double scaleFactor,
+                           int minNeighbors, int flags,
+                           Size minSize, Size maxSize ) = 0;
+
+    virtual void detectMultiScale( InputArray image, // overload 3: additionally outputs rejectLevels and levelWeights
+                                   CV_OUT std::vector<Rect>& objects,
+                                   CV_OUT std::vector<int>& rejectLevels,
+                                   CV_OUT std::vector<double>& levelWeights,
+                                   double scaleFactor,
+                                   int minNeighbors, int flags,
+                                   Size minSize, Size maxSize,
+                                   bool outputRejectLevels ) = 0;
+
+    virtual bool isOldFormatCascade() const = 0;
+    virtual Size getOriginalWindowSize() const = 0;
+    virtual int getFeatureType() const = 0;
+    virtual void* getOldCascade() = 0;
+
+    class CV_EXPORTS MaskGenerator // nested interface: generateMask is pure virtual, initializeMask has an empty default body
+    {
+    public:
+        virtual ~MaskGenerator() {}
+        virtual Mat generateMask(const Mat& src)=0;
+        virtual void initializeMask(const Mat& /*src*/) { } // default implementation does nothing
+    };
+    virtual void setMaskGenerator(const Ptr<MaskGenerator>& maskGenerator) = 0;
+    virtual Ptr<MaskGenerator> getMaskGenerator() = 0;
+};
+
+/** @example samples/cpp/facedetect.cpp
+This program demonstrates usage of the Cascade classifier class
+\image html Cascade_Classifier_Tutorial_Result_Haar.jpg "Sample screenshot" width=321 height=254
+*/
+/** @brief Cascade classifier class for object detection.
+ */
+class CV_EXPORTS_W CascadeClassifier
+{
+public:
+    CV_WRAP CascadeClassifier();
+    /** @brief Loads a classifier from a file.
+
+    @param filename Name of the file from which the classifier is loaded.
+     */
+    CV_WRAP CascadeClassifier(const String& filename);
+    ~CascadeClassifier();
+    /** @brief Checks whether the classifier has been loaded.
+    */
+    CV_WRAP bool empty() const;
+    /** @brief Loads a classifier from a file.
+
+    @param filename Name of the file from which the classifier is loaded. The file may contain an old
+    HAAR classifier trained by the haartraining application or a new cascade classifier trained by the
+    traincascade application.
+     */
+    CV_WRAP bool load( const String& filename );
+    /** @brief Reads a classifier from a FileStorage node.
+
+    @note The file may contain a new cascade classifier (trained by the traincascade application) only.
+     */
+    CV_WRAP bool read( const FileNode& node );
+
+    /** @brief Detects objects of different sizes in the input image. The detected objects are returned as a list
+    of rectangles.
+
+    @param image Matrix of the type CV_8U containing an image where objects are detected.
+    @param objects Vector of rectangles where each rectangle contains the detected object, the
+    rectangles may be partially outside the original image.
+    @param scaleFactor Parameter specifying how much the image size is reduced at each image scale.
+    @param minNeighbors Parameter specifying how many neighbors each candidate rectangle should have
+    to retain it.
+    @param flags Parameter with the same meaning for an old cascade as in the function
+    cvHaarDetectObjects. It is not used for a new cascade.
+    @param minSize Minimum possible object size. Objects smaller than that are ignored.
+    @param maxSize Maximum possible object size. Objects larger than that are ignored. If `maxSize == minSize` model is evaluated on single scale.
+    */
+    CV_WRAP void detectMultiScale( InputArray image,
+                          CV_OUT std::vector<Rect>& objects,
+                          double scaleFactor = 1.1,
+                          int minNeighbors = 3, int flags = 0,
+                          Size minSize = Size(),
+                          Size maxSize = Size() );
+
+    /** @overload
+    @param image Matrix of the type CV_8U containing an image where objects are detected.
+    @param objects Vector of rectangles where each rectangle contains the detected object, the
+    rectangles may be partially outside the original image.
+    @param numDetections Vector of detection numbers for the corresponding objects. An object's number
+    of detections is the number of neighboring positively classified rectangles that were joined
+    together to form the object.
+    @param scaleFactor Parameter specifying how much the image size is reduced at each image scale.
+    @param minNeighbors Parameter specifying how many neighbors each candidate rectangle should have
+    to retain it.
+    @param flags Parameter with the same meaning for an old cascade as in the function
+    cvHaarDetectObjects. It is not used for a new cascade.
+    @param minSize Minimum possible object size. Objects smaller than that are ignored.
+    @param maxSize Maximum possible object size. Objects larger than that are ignored. If `maxSize == minSize` model is evaluated on single scale.
+    */
+    CV_WRAP_AS(detectMultiScale2) void detectMultiScale( InputArray image,
+                          CV_OUT std::vector<Rect>& objects,
+                          CV_OUT std::vector<int>& numDetections,
+                          double scaleFactor=1.1,
+                          int minNeighbors=3, int flags=0,
+                          Size minSize=Size(),
+                          Size maxSize=Size() );
+
+    /** @overload
+    This function allows you to retrieve the final stage decision certainty of classification.
+    For this, one needs to set `outputRejectLevels` to true and provide the `rejectLevels` and `levelWeights` parameters.
+    For each resulting detection, `levelWeights` will then contain the certainty of classification at the final stage.
+    This value can then be used to separate strong from weaker classifications.
+
+    A code sample on how to use it efficiently can be found below:
+    @code
+    Mat img;
+    vector<double> weights;
+    vector<int> levels;
+    vector<Rect> detections;
+    CascadeClassifier model("/path/to/your/model.xml");
+    model.detectMultiScale(img, detections, levels, weights, 1.1, 3, 0, Size(), Size(), true);
+    cerr << "Detection " << detections[0] << " with weight " << weights[0] << endl;
+    @endcode
+    */
+    CV_WRAP_AS(detectMultiScale3) void detectMultiScale( InputArray image,
+                                  CV_OUT std::vector<Rect>& objects,
+                                  CV_OUT std::vector<int>& rejectLevels,
+                                  CV_OUT std::vector<double>& levelWeights,
+                                  double scaleFactor = 1.1,
+                                  int minNeighbors = 3, int flags = 0,
+                                  Size minSize = Size(),
+                                  Size maxSize = Size(),
+                                  bool outputRejectLevels = false );
+
+    CV_WRAP bool isOldFormatCascade() const;
+    CV_WRAP Size getOriginalWindowSize() const;
+    CV_WRAP int getFeatureType() const;
+    void* getOldCascade();
+
+    CV_WRAP static bool convert(const String& oldcascade, const String& newcascade);
+
+    void setMaskGenerator(const Ptr<BaseCascadeClassifier::MaskGenerator>& maskGenerator);
+    Ptr<BaseCascadeClassifier::MaskGenerator> getMaskGenerator();
+
+    Ptr<BaseCascadeClassifier> cc; // NOTE(review): presumably the implementation object the methods above forward to - confirm
+};
+
+CV_EXPORTS Ptr<BaseCascadeClassifier::MaskGenerator> createFaceDetectionMaskGenerator();
+//! @}
+
+//! @addtogroup objdetect_hog
+//! @{
+//////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
+
+//! Describes one detection region of interest (ROI); a vector of these is taken by HOGDescriptor::detectMultiScaleROI.
+struct DetectionROI
+{
+   //! scale (size) of the bounding box; note that it has no default initializer
+   double scale;
+   //! set of requested locations to be evaluated
+   std::vector<cv::Point> locations;
+   //! vector that will contain confidence values for each location
+   std::vector<double> confidences;
+};
+
+/**@brief Implementation of HOG (Histogram of Oriented Gradients) descriptor and object detector.
+
+Implements the HOG descriptor algorithm introduced by Navneet Dalal and Bill Triggs @cite Dalal2005 .
+
+useful links:
+
+https://hal.inria.fr/inria-00548512/document/
+
+https://en.wikipedia.org/wiki/Histogram_of_oriented_gradients
+
+https://software.intel.com/en-us/ipp-dev-reference-histogram-of-oriented-gradients-hog-descriptor
+
+http://www.learnopencv.com/histogram-of-oriented-gradients
+
+http://www.learnopencv.com/handwritten-digits-classification-an-opencv-c-python-tutorial
+
+ */
+struct CV_EXPORTS_W HOGDescriptor
+{
+public:
+    enum HistogramNormType { L2Hys = 0 //!< Default histogramNormType
+         };
+    enum { DEFAULT_NLEVELS = 64 //!< Default nlevels value.
+         };
+    enum DescriptorStorageFormat { DESCR_FORMAT_COL_BY_COL, DESCR_FORMAT_ROW_BY_ROW };
+
+    /**@brief Creates the HOG descriptor and detector with default parameters.
+
+    equal to HOGDescriptor(Size(64,128), Size(16,16), Size(8,8), Size(8,8), 9 ), except that gammaCorrection is initialized to true here
+    */
+    CV_WRAP HOGDescriptor() : winSize(64,128), blockSize(16,16), blockStride(8,8),
+        cellSize(8,8), nbins(9), derivAperture(1), winSigma(-1),
+        histogramNormType(HOGDescriptor::L2Hys), L2HysThreshold(0.2), gammaCorrection(true),
+        free_coef(-1.f), nlevels(HOGDescriptor::DEFAULT_NLEVELS), signedGradient(false)
+    {}
+
+    /** @overload
+    @param _winSize sets winSize with given value.
+    @param _blockSize sets blockSize with given value.
+    @param _blockStride sets blockStride with given value.
+    @param _cellSize sets cellSize with given value.
+    @param _nbins sets nbins with given value.
+    @param _derivAperture sets derivAperture with given value.
+    @param _winSigma sets winSigma with given value.
+    @param _histogramNormType sets histogramNormType with given value.
+    @param _L2HysThreshold sets L2HysThreshold with given value.
+    @param _gammaCorrection sets gammaCorrection with given value.
+    @param _nlevels sets nlevels with given value.
+    @param _signedGradient sets signedGradient with given value.
+    */
+    CV_WRAP HOGDescriptor(Size _winSize, Size _blockSize, Size _blockStride,
+                  Size _cellSize, int _nbins, int _derivAperture=1, double _winSigma=-1,
+                  HOGDescriptor::HistogramNormType _histogramNormType=HOGDescriptor::L2Hys,
+                  double _L2HysThreshold=0.2, bool _gammaCorrection=false,
+                  int _nlevels=HOGDescriptor::DEFAULT_NLEVELS, bool _signedGradient=false)
+    : winSize(_winSize), blockSize(_blockSize), blockStride(_blockStride), cellSize(_cellSize),
+    nbins(_nbins), derivAperture(_derivAperture), winSigma(_winSigma),
+    histogramNormType(_histogramNormType), L2HysThreshold(_L2HysThreshold),
+    gammaCorrection(_gammaCorrection), free_coef(-1.f), nlevels(_nlevels), signedGradient(_signedGradient)
+    {}
+
+    /** @overload
+
+    Creates the HOG descriptor and detector and loads HOGDescriptor parameters and coefficients for the linear SVM classifier from a file.
+    @param filename The file name containing HOGDescriptor properties and coefficients for the linear SVM classifier.
+    */
+    CV_WRAP HOGDescriptor(const String& filename)
+    {
+        load(filename); // note: the bool result of load() is ignored, and this constructor has no member initializer list
+    }
+
+    /** @overload
+    @param d the HOGDescriptor that is copied (via copyTo) to create the new one.
+    */
+    HOGDescriptor(const HOGDescriptor& d)
+    {
+        d.copyTo(*this);
+    }
+
+    /**@brief Default destructor.
+    */
+    virtual ~HOGDescriptor() {}
+
+    /**@brief Returns the number of coefficients required for the classification.
+    */
+    CV_WRAP size_t getDescriptorSize() const;
+
+    /** @brief Checks if the detector size is equal to the descriptor size.
+    */
+    CV_WRAP bool checkDetectorSize() const;
+
+    /** @brief Returns winSigma value
+    */
+    CV_WRAP double getWinSigma() const;
+
+    /**@example samples/cpp/peopledetect.cpp
+    */
+    /**@brief Sets coefficients for the linear SVM classifier.
+    @param svmdetector coefficients for the linear SVM classifier.
+    */
+    CV_WRAP virtual void setSVMDetector(InputArray svmdetector);
+
+    /** @brief Reads HOGDescriptor parameters and coefficients for the linear SVM classifier from a file node.
+    @param fn File node
+    */
+    virtual bool read(FileNode& fn);
+
+    /** @brief Stores HOGDescriptor parameters and coefficients for the linear SVM classifier in a file storage.
+    @param fs File storage
+    @param objname Object name
+    */
+    virtual void write(FileStorage& fs, const String& objname) const;
+
+    /** @brief loads HOGDescriptor parameters and coefficients for the linear SVM classifier from a file
+    @param filename Name of the file to read.
+    @param objname The optional name of the node to read (if empty, the first top-level node will be used).
+    */
+    CV_WRAP virtual bool load(const String& filename, const String& objname = String());
+
+    /** @brief saves HOGDescriptor parameters and coefficients for the linear SVM classifier to a file
+    @param filename File name
+    @param objname Object name
+    */
+    CV_WRAP virtual void save(const String& filename, const String& objname = String()) const;
+
+    /** @brief clones the HOGDescriptor
+    @param c cloned HOGDescriptor
+    */
+    virtual void copyTo(HOGDescriptor& c) const;
+
+    /**@example samples/cpp/train_HOG.cpp
+    */
+    /** @brief Computes HOG descriptors of given image.
+    @param img Matrix of the type CV_8U containing an image where HOG features will be calculated.
+    @param descriptors Matrix of the type CV_32F
+    @param winStride Window stride. It must be a multiple of block stride.
+    @param padding Padding
+    @param locations Vector of Point
+    */
+    CV_WRAP virtual void compute(InputArray img,
+                         CV_OUT std::vector<float>& descriptors,
+                         Size winStride = Size(), Size padding = Size(),
+                         const std::vector<Point>& locations = std::vector<Point>()) const;
+
+    /** @brief Performs object detection without a multi-scale window.
+    @param img Matrix of the type CV_8U or CV_8UC3 containing an image where objects are detected.
+    @param foundLocations Vector of points where each point is the top-left corner of a detected object boundary.
+    @param weights Vector that will contain confidence values for each detected object.
+    @param hitThreshold Threshold for the distance between features and SVM classifying plane.
+    Usually it is 0 and should be specified in the detector coefficients (as the last free coefficient).
+    But if the free coefficient is omitted (which is allowed), you can specify it manually here.
+    @param winStride Window stride. It must be a multiple of block stride.
+    @param padding Padding
+    @param searchLocations Vector of Point includes set of requested locations to be evaluated.
+    */
+    CV_WRAP virtual void detect(InputArray img, CV_OUT std::vector<Point>& foundLocations,
+                        CV_OUT std::vector<double>& weights,
+                        double hitThreshold = 0, Size winStride = Size(),
+                        Size padding = Size(),
+                        const std::vector<Point>& searchLocations = std::vector<Point>()) const;
+
+    /** @brief Performs object detection without a multi-scale window.
+    @param img Matrix of the type CV_8U or CV_8UC3 containing an image where objects are detected.
+    @param foundLocations Vector of points where each point is the top-left corner of a detected object boundary.
+    @param hitThreshold Threshold for the distance between features and SVM classifying plane.
+    Usually it is 0 and should be specified in the detector coefficients (as the last free coefficient).
+    But if the free coefficient is omitted (which is allowed), you can specify it manually here.
+    @param winStride Window stride. It must be a multiple of block stride.
+    @param padding Padding
+    @param searchLocations Vector of Point includes locations to search.
+    */
+    virtual void detect(InputArray img, CV_OUT std::vector<Point>& foundLocations,
+                        double hitThreshold = 0, Size winStride = Size(),
+                        Size padding = Size(),
+                        const std::vector<Point>& searchLocations=std::vector<Point>()) const;
+
+    /** @brief Detects objects of different sizes in the input image. The detected objects are returned as a list
+    of rectangles.
+    @param img Matrix of the type CV_8U or CV_8UC3 containing an image where objects are detected.
+    @param foundLocations Vector of rectangles where each rectangle contains the detected object.
+    @param foundWeights Vector that will contain confidence values for each detected object.
+    @param hitThreshold Threshold for the distance between features and SVM classifying plane.
+    Usually it is 0 and should be specified in the detector coefficients (as the last free coefficient).
+    But if the free coefficient is omitted (which is allowed), you can specify it manually here.
+    @param winStride Window stride. It must be a multiple of block stride.
+    @param padding Padding
+    @param scale Coefficient of the detection window increase.
+    @param groupThreshold Coefficient to regulate the similarity threshold. When detected, some objects can be covered
+    by many rectangles. 0 means not to perform grouping.
+    @param useMeanshiftGrouping indicates grouping algorithm
+    */
+    CV_WRAP virtual void detectMultiScale(InputArray img, CV_OUT std::vector<Rect>& foundLocations,
+                                  CV_OUT std::vector<double>& foundWeights, double hitThreshold = 0,
+                                  Size winStride = Size(), Size padding = Size(), double scale = 1.05,
+                                  double groupThreshold = 2.0, bool useMeanshiftGrouping = false) const;
+
+    /** @brief Detects objects of different sizes in the input image. The detected objects are returned as a list
+    of rectangles.
+    @param img Matrix of the type CV_8U or CV_8UC3 containing an image where objects are detected.
+    @param foundLocations Vector of rectangles where each rectangle contains the detected object.
+    @param hitThreshold Threshold for the distance between features and SVM classifying plane.
+    Usually it is 0 and should be specified in the detector coefficients (as the last free coefficient).
+    But if the free coefficient is omitted (which is allowed), you can specify it manually here.
+    @param winStride Window stride. It must be a multiple of block stride.
+    @param padding Padding
+    @param scale Coefficient of the detection window increase.
+    @param groupThreshold Coefficient to regulate the similarity threshold. When detected, some objects can be covered
+    by many rectangles. 0 means not to perform grouping.
+    @param useMeanshiftGrouping indicates grouping algorithm
+    */
+    virtual void detectMultiScale(InputArray img, CV_OUT std::vector<Rect>& foundLocations,
+                                  double hitThreshold = 0, Size winStride = Size(),
+                                  Size padding = Size(), double scale = 1.05,
+                                  double groupThreshold = 2.0, bool useMeanshiftGrouping = false) const;
+
+    /** @brief  Computes gradients and quantized gradient orientations.
+    @param img Matrix contains the image to be computed
+    @param grad Matrix of type CV_32FC2 contains computed gradients
+    @param angleOfs Matrix of type CV_8UC2 contains quantized gradient orientations
+    @param paddingTL Padding from top-left
+    @param paddingBR Padding from bottom-right
+    */
+    CV_WRAP virtual void computeGradient(InputArray img, InputOutputArray grad, InputOutputArray angleOfs,
+                                 Size paddingTL = Size(), Size paddingBR = Size()) const;
+
+    /** @brief Returns coefficients of the classifier trained for people detection (for 64x128 windows).
+    */
+    CV_WRAP static std::vector<float> getDefaultPeopleDetector();
+
+    /**@example samples/tapi/hog.cpp
+    */
+    /** @brief Returns coefficients of the classifier trained for people detection (for 48x96 windows).
+    */
+    CV_WRAP static std::vector<float> getDaimlerPeopleDetector();
+
+    //! Detection window size. Align to block size and block stride. Default value is Size(64,128).
+    CV_PROP Size winSize;
+
+    //! Block size in pixels. Align to cell size. Default value is Size(16,16).
+    CV_PROP Size blockSize;
+
+    //! Block stride. It must be a multiple of cell size. Default value is Size(8,8).
+    CV_PROP Size blockStride;
+
+    //! Cell size. Default value is Size(8,8).
+    CV_PROP Size cellSize;
+
+    //! Number of bins used in the calculation of histogram of gradients. Default value is 9.
+    CV_PROP int nbins;
+
+    //! not documented
+    CV_PROP int derivAperture;
+
+    //! Gaussian smoothing window parameter.
+    CV_PROP double winSigma;
+
+    //! histogramNormType
+    CV_PROP HOGDescriptor::HistogramNormType histogramNormType;
+
+    //! L2-Hys normalization method shrinkage.
+    CV_PROP double L2HysThreshold;
+
+    //! Flag to specify whether the gamma correction preprocessing is required or not.
+    CV_PROP bool gammaCorrection;
+
+    //! coefficients for the linear SVM classifier.
+    CV_PROP std::vector<float> svmDetector;
+
+    //! coefficients for the linear SVM classifier used when OpenCL is enabled
+    UMat oclSvmDetector;
+
+    //! not documented
+    float free_coef;
+
+    //! Maximum number of detection window increases. Default value is 64
+    CV_PROP int nlevels;
+
+    //! Indicates whether signed gradients are used.
+    CV_PROP bool signedGradient;
+
+    /** @brief evaluate specified ROI and return confidence value for each location
+    @param img Matrix of the type CV_8U or CV_8UC3 containing an image where objects are detected.
+    @param locations Vector of Point
+    @param foundLocations Vector of Point where each Point is detected object's top-left point.
+    @param confidences confidences
+    @param hitThreshold Threshold for the distance between features and SVM classifying plane. Usually
+    it is 0 and should be specified in the detector coefficients (as the last free coefficient). But if
+    the free coefficient is omitted (which is allowed), you can specify it manually here
+    @param winStride winStride
+    @param padding padding
+    */
+    virtual void detectROI(InputArray img, const std::vector<cv::Point> &locations,
+                                   CV_OUT std::vector<cv::Point>& foundLocations, CV_OUT std::vector<double>& confidences,
+                                   double hitThreshold = 0, cv::Size winStride = Size(),
+                                   cv::Size padding = Size()) const;
+
+    /** @brief evaluate specified ROI and return confidence value for each location in multiple scales
+    @param img Matrix of the type CV_8U or CV_8UC3 containing an image where objects are detected.
+    @param foundLocations Vector of rectangles where each rectangle contains the detected object.
+    @param locations Vector of DetectionROI
+    @param hitThreshold Threshold for the distance between features and SVM classifying plane. Usually it is 0 and should be specified
+    in the detector coefficients (as the last free coefficient). But if the free coefficient is omitted (which is allowed), you can specify it manually here.
+    @param groupThreshold Minimum possible number of rectangles minus 1. The threshold is used in a group of rectangles to retain it.
+    */
+    virtual void detectMultiScaleROI(InputArray img,
+                                     CV_OUT std::vector<cv::Rect>& foundLocations,
+                                     std::vector<DetectionROI>& locations,
+                                     double hitThreshold = 0,
+                                     int groupThreshold = 0) const;
+
+    /** @brief Groups the object candidate rectangles.
+    @param rectList  Input/output vector of rectangles. Output vector includes retained and grouped rectangles. (The Python list is not modified in place.)
+    @param weights Input/output vector of weights of rectangles. Output vector includes weights of retained and grouped rectangles. (The Python list is not modified in place.)
+    @param groupThreshold Minimum possible number of rectangles minus 1. The threshold is used in a group of rectangles to retain it.
+    @param eps Relative difference between sides of the rectangles to merge them into a group.
+    */
+    void groupRectangles(std::vector<cv::Rect>& rectList, std::vector<double>& weights, int groupThreshold, double eps) const;
+};
+//! @}
+
+//! @addtogroup objdetect_qrcode
+//! @{
+
+class CV_EXPORTS_W QRCodeEncoder {
+protected:
+    QRCodeEncoder();  // use ::create()
+public:
+    virtual ~QRCodeEncoder();
+
+    enum EncodeMode {
+        MODE_AUTO              = -1,
+        MODE_NUMERIC           = 1, // 0b0001
+        MODE_ALPHANUMERIC      = 2, // 0b0010
+        MODE_BYTE              = 4, // 0b0100
+        MODE_ECI               = 7, // 0b0111
+        MODE_KANJI             = 8, // 0b1000
+        MODE_STRUCTURED_APPEND = 3  // 0b0011
+    };
+
+    enum CorrectionLevel {
+        CORRECT_LEVEL_L = 0,
+        CORRECT_LEVEL_M = 1,
+        CORRECT_LEVEL_Q = 2,
+        CORRECT_LEVEL_H = 3
+    };
+
+    enum ECIEncodings {
+        ECI_UTF8 = 26
+    };
+
+    /** @brief QR code encoder parameters. */
+    struct CV_EXPORTS_W_SIMPLE Params
+    {
+        CV_WRAP Params();
+
+        //! The optional version of QR code (by default - maximum possible depending on the length of the string).
+        CV_PROP_RW int version;
+
+        //! The optional level of error correction (by default - the lowest).
+        CV_PROP_RW CorrectionLevel correction_level;
+
+        //! The optional encoding mode - Numeric, Alphanumeric, Byte, Kanji, ECI or Structured Append.
+        CV_PROP_RW EncodeMode mode;
+
+        //! The optional number of QR codes to generate in Structured Append mode.
+        CV_PROP_RW int structure_number;
+    };
+
+    /** @brief Factory function: creates an encoder (the constructor is protected, so this is the way to obtain one).
+    @param parameters QR code encoder parameters QRCodeEncoder::Params
+    */
+    static CV_WRAP
+    Ptr<QRCodeEncoder> create(const QRCodeEncoder::Params& parameters = QRCodeEncoder::Params());
+
+    /** @brief Generates QR code from input string.
+     @param encoded_info Input string to encode.
+     @param qrcode Generated QR code.
+    */
+    CV_WRAP virtual void encode(const String& encoded_info, OutputArray qrcode) = 0;
+
+    /** @brief Generates QR code from input string in Structured Append mode. The encoded message is split over a number of QR codes.
+     @param encoded_info Input string to encode.
+     @param qrcodes Vector of generated QR codes.
+    */
+    CV_WRAP virtual void encodeStructuredAppend(const String& encoded_info, OutputArrayOfArrays qrcodes) = 0;
+
+};
+class CV_EXPORTS_W_SIMPLE QRCodeDetector : public GraphicalCodeDetector
+{
+public:
+    CV_WRAP QRCodeDetector();
+
+    /** @brief sets the epsilon used during the horizontal scan of QR code stop marker detection.
+     @param epsX Epsilon neighborhood, which allows you to determine the horizontal pattern
+     of the scheme 1:1:3:1:1 according to QR code standard.
+    */
+    CV_WRAP QRCodeDetector& setEpsX(double epsX);
+    /** @brief sets the epsilon used during the vertical scan of QR code stop marker detection.
+     @param epsY Epsilon neighborhood, which allows you to determine the vertical pattern
+     of the scheme 1:1:3:1:1 according to QR code standard.
+     */
+    CV_WRAP QRCodeDetector& setEpsY(double epsY);
+
+    /** @brief use markers to improve the position of the corners of the QR code
+     *
+     * Alignment markers are used by default.
+     */
+    CV_WRAP QRCodeDetector& setUseAlignmentMarkers(bool useAlignmentMarkers);
+
+    /** @brief Decodes QR code on a curved surface in image once it's found by the detect() method.
+
+     Returns UTF8-encoded output string or empty string if the code cannot be decoded.
+     @param img grayscale or color (BGR) image containing QR code.
+     @param points Quadrangle vertices found by detect() method (or some other algorithm).
+     @param straight_qrcode The optional output image containing rectified and binarized QR code
+     */
+    CV_WRAP cv::String decodeCurved(InputArray img, InputArray points, OutputArray straight_qrcode = noArray());
+
+    /** @brief Both detects and decodes QR code on a curved surface
+
+     @param img grayscale or color (BGR) image containing QR code.
+     @param points optional output array of vertices of the found QR code quadrangle. Will be empty if not found.
+     @param straight_qrcode The optional output image containing rectified and binarized QR code
+     */
+    CV_WRAP std::string detectAndDecodeCurved(InputArray img, OutputArray points=noArray(),
+                                              OutputArray straight_qrcode = noArray());
+};
+
+class CV_EXPORTS_W_SIMPLE QRCodeDetectorAruco : public GraphicalCodeDetector {
+public:
+    CV_WRAP QRCodeDetectorAruco();
+
+    struct CV_EXPORTS_W_SIMPLE Params {
+        CV_WRAP Params();
+
+        /** @brief The minimum allowed pixel size of a QR module in the smallest image in the image pyramid, default 4.f */
+        CV_PROP_RW float minModuleSizeInPyramid;
+
+        /** @brief The maximum allowed relative rotation for finder patterns in the same QR code, default pi/12 */
+        CV_PROP_RW float maxRotation;
+
+        /** @brief The maximum allowed relative mismatch in module sizes for finder patterns in the same QR code, default 1.75f */
+        CV_PROP_RW float maxModuleSizeMismatch;
+
+        /** @brief The maximum allowed module relative mismatch for timing pattern module, default 2.f
+         *
+         * If the relative mismatch of a timing pattern module exceeds this value, penalty points will be added.
+         * If a lot of penalty points are added, QR code will be rejected. */
+        CV_PROP_RW float maxTimingPatternMismatch;
+
+        /** @brief The maximum allowed percentage of penalty points out of total pins in timing pattern, default 0.4f */
+        CV_PROP_RW float maxPenalties;
+
+        /** @brief The maximum allowed relative color mismatch in the timing pattern, default 0.2f*/
+        CV_PROP_RW float maxColorsMismatch;
+
+        /** @brief The algorithm finds QR codes with almost minimum timing pattern score and minimum size, default 0.9f
+         *
+         * The QR code with the minimum "timing pattern score" and minimum "size" is selected as the best QR code.
+         * If for the current QR code "timing pattern score" * scaleTimingPatternScore < "previous timing pattern score" and "size" < "previous size", then
+         * the current QR code is set as the best QR code. */
+        CV_PROP_RW float scaleTimingPatternScore;
+    };
+
+    /** @brief QR code detector constructor for Aruco-based algorithm. See cv::QRCodeDetectorAruco::Params */
+    CV_WRAP explicit QRCodeDetectorAruco(const QRCodeDetectorAruco::Params& params);
+
+    /** @brief Detector parameters getter. See cv::QRCodeDetectorAruco::Params */
+    CV_WRAP const QRCodeDetectorAruco::Params& getDetectorParameters() const;
+
+    /** @brief Detector parameters setter. See cv::QRCodeDetectorAruco::Params */
+    CV_WRAP QRCodeDetectorAruco& setDetectorParameters(const QRCodeDetectorAruco::Params& params);
+
+    /** @brief Aruco detector parameters are used to search for the finder patterns. */
+    CV_WRAP const aruco::DetectorParameters& getArucoParameters() const;
+
+    /** @brief Aruco detector parameters are used to search for the finder patterns. */
+    CV_WRAP void setArucoParameters(const aruco::DetectorParameters& params);
+};
+
+//! @}
+}
+
+#include "opencv2/objdetect/detection_based_tracker.hpp"
+#include "opencv2/objdetect/face.hpp"
+#include "opencv2/objdetect/charuco_detector.hpp"
+#include "opencv2/objdetect/barcode.hpp"
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/aruco_board.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/aruco_board.hpp
new file mode 100644
index 000000000000..e8300c82bf00
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/aruco_board.hpp
@@ -0,0 +1,199 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+#ifndef OPENCV_OBJDETECT_ARUCO_BOARD_HPP
+#define OPENCV_OBJDETECT_ARUCO_BOARD_HPP
+
+#include <opencv2/core.hpp>
+
+namespace cv {
+namespace aruco {
+//! @addtogroup objdetect_aruco
+//! @{
+
+class Dictionary;
+
+/** @brief Board of ArUco markers
+ *
+ * A board is a set of markers in the 3D space with a common coordinate system.
+ * The common form of a board of markers is a planar (2D) board, however any 3D layout can be used.
+ * A Board object is composed of:
+ * - The object points of the marker corners, i.e. their coordinates with respect to the board system.
+ * - The dictionary which indicates the type of markers of the board
+ * - The identifiers of all the markers in the board.
+ */
+class CV_EXPORTS_W_SIMPLE Board {
+public:
+    /** @brief Common Board constructor
+     *
+     * @param objPoints array of object points of all the marker corners in the board
+     * @param dictionary the dictionary of markers employed for this board
+     * @param ids vector of the identifiers of the markers in the board
+     */
+    CV_WRAP Board(InputArrayOfArrays objPoints, const Dictionary& dictionary, InputArray ids);
+
+    /** @brief return the Dictionary of markers employed for this board
+     */
+    CV_WRAP const Dictionary& getDictionary() const;
+
+    /** @brief return array of object points of all the marker corners in the board.
+     *
+     * Each marker includes its 4 corners in this order:
+     * -   objPoints[i][0] - left-top point of i-th marker
+     * -   objPoints[i][1] - right-top point of i-th marker
+     * -   objPoints[i][2] - right-bottom point of i-th marker
+     * -   objPoints[i][3] - left-bottom point of i-th marker
+     *
+     * Markers are placed in a certain order - row by row, left to right in every row. For M markers, the size is Mx4.
+     */
+    CV_WRAP const std::vector<std::vector<Point3f> >& getObjPoints() const;
+
+    /** @brief vector of the identifiers of the markers in the board (should be the same size as objPoints)
+     * @return vector of the identifiers of the markers
+     */
+    CV_WRAP const std::vector<int>& getIds() const;
+
+    /** @brief get coordinate of the bottom right corner of the board, is set when calling the function create()
+     */
+    CV_WRAP const Point3f& getRightBottomCorner() const;
+
+    /** @brief Given a board configuration and a set of detected markers, returns the corresponding
+     * image points and object points, can be used in solvePnP()
+     *
+     * @param detectedCorners List of detected marker corners of the board.
+     * For cv::Board and cv::GridBoard the method expects std::vector<std::vector<Point2f>> or std::vector<Mat> with Aruco marker corners.
+     * For cv::CharucoBoard the method expects std::vector<Point2f> or Mat with ChAruco corners (chess board corners matched with Aruco markers).
+     *
+     * @param detectedIds List of identifiers for each marker or charuco corner.
+     * For any Board class the method expects std::vector<int> or Mat.
+     *
+     * @param objPoints Vector of marker points in the board coordinate space.
+     * For any Board class the method expects std::vector<cv::Point3f> objectPoints or cv::Mat
+     *
+     * @param imgPoints Vector of marker points in the image coordinate space.
+     * For any Board class the method expects std::vector<cv::Point2f> imagePoints or cv::Mat
+     *
+     * @sa solvePnP
+     */
+    CV_WRAP void matchImagePoints(InputArrayOfArrays detectedCorners, InputArray detectedIds,
+                                  OutputArray objPoints, OutputArray imgPoints) const;
+
+     /** @brief Draw a planar board
+     *
+     * @param outSize size of the output image in pixels.
+     * @param img output image with the board. The size of this image will be outSize
+     * and the board will be on the center, keeping the board proportions.
+     * @param marginSize minimum margins (in pixels) of the board in the output image (default 0)
+     * @param borderBits width of the marker borders (default 1).
+     *
+     * This function returns the image of the board, ready to be printed.
+     */
+    CV_WRAP void generateImage(Size outSize, OutputArray img, int marginSize = 0, int borderBits = 1) const;
+
+    CV_DEPRECATED_EXTERNAL  // avoid using in C++ code, will be moved to "protected" (need to fix bindings first)
+    Board();
+
+    struct Impl;
+protected:
+    Board(const Ptr<Impl>& impl);
+    Ptr<Impl> impl;
+};
+
+/** @brief Planar board with grid arrangement of markers
+ *
+ * The more common type of board. All markers are placed in the same plane in a grid arrangement.
+ * The board image can be drawn using the generateImage() method.
+ */
+class CV_EXPORTS_W_SIMPLE GridBoard : public Board {
+public:
+    /**
+     * @brief GridBoard constructor
+     *
+     * @param size number of markers in x and y directions
+     * @param markerLength marker side length (normally in meters)
+     * @param markerSeparation separation between two markers (same unit as markerLength)
+     * @param dictionary dictionary of markers indicating the type of markers
+     * @param ids set of marker ids in dictionary to use on board (optional, defaults to noArray()).
+     */
+    CV_WRAP GridBoard(const Size& size, float markerLength, float markerSeparation,
+                      const Dictionary &dictionary, InputArray ids = noArray());
+
+    CV_WRAP Size getGridSize() const;
+    CV_WRAP float getMarkerLength() const;
+    CV_WRAP float getMarkerSeparation() const;
+
+    CV_DEPRECATED_EXTERNAL  // avoid using in C++ code, will be moved to "protected" (need to fix bindings first)
+    GridBoard();
+};
+
+/**
+ * @brief ChArUco board is a planar chessboard where the markers are placed inside the white squares of a chessboard.
+ *
+ * The benefit of ChArUco boards is that they provide both ArUco marker versatility and chessboard corner precision,
+ * which is important for calibration and pose estimation. The board image can be drawn using generateImage() method.
+ */
+class CV_EXPORTS_W_SIMPLE CharucoBoard : public Board {
+public:
+    /** @brief CharucoBoard constructor
+     *
+     * @param size number of chessboard squares in x and y directions
+     * @param squareLength chessboard square side length (normally in meters)
+     * @param markerLength marker side length (same unit as squareLength)
+     * @param dictionary dictionary of markers indicating the type of markers
+     * @param ids array of ids of the used markers
+     * The first markers in the dictionary are used to fill the white chessboard squares.
+     */
+    CV_WRAP CharucoBoard(const Size& size, float squareLength, float markerLength,
+                         const Dictionary &dictionary, InputArray ids = noArray());
+
+    /** @brief set legacy chessboard pattern.
+     *
+     * Legacy setting creates chessboard patterns starting with a white box in the upper left corner
+     * if there is an even row count of chessboard boxes, otherwise it starts with a black box.
+     * This setting ensures compatibility to patterns created with OpenCV versions prior to OpenCV 4.6.0.
+     * See https://github.com/opencv/opencv/issues/23152.
+     *
+     * Default value: false.
+     */
+    CV_WRAP void setLegacyPattern(bool legacyPattern);
+    CV_WRAP bool getLegacyPattern() const;
+
+    CV_WRAP Size getChessboardSize() const;
+    CV_WRAP float getSquareLength() const;
+    CV_WRAP float getMarkerLength() const;
+
+    /** @brief get CharucoBoard::chessboardCorners
+     */
+    CV_WRAP std::vector<Point3f> getChessboardCorners() const;
+
+    /** @brief get CharucoBoard::nearestMarkerIdx, for each charuco corner, nearest marker index in ids array
+     */
+    CV_PROP std::vector<std::vector<int> > getNearestMarkerIdx() const;
+
+    /** @brief get CharucoBoard::nearestMarkerCorners, for each charuco corner, nearest marker corner id of each marker
+     */
+    CV_PROP std::vector<std::vector<int> > getNearestMarkerCorners() const;
+
+    /** @brief check whether the ChArUco markers are collinear
+     *
+     * @param charucoIds list of identifiers for each corner in charucoCorners per frame.
+     * @return bool value, 1 (true) if detected corners form a line, 0 (false) if they do not.
+     * solvePnP, calibration functions will fail if the corners are collinear (true).
+     *
+     * The number of ids in charucoIds should be <= the number of chessboard corners in the board.
+     * This function checks whether the charuco corners are on a straight line (returns true, if so), or not (false).
+     * Axis-parallel, as well as diagonal and other straight lines are detected.  Degenerate cases:
+     * for number of charucoIds <= 2, the function returns true.
+     */
+    CV_WRAP bool checkCharucoCornersCollinear(InputArray charucoIds) const;
+
+    CV_DEPRECATED_EXTERNAL  // avoid using in C++ code, will be moved to "protected" (need to fix bindings first)
+    CharucoBoard();
+};
+
+//! @}
+
+}
+}
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/aruco_detector.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/aruco_detector.hpp
new file mode 100644
index 000000000000..9d30d55d176e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/aruco_detector.hpp
@@ -0,0 +1,400 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+#ifndef OPENCV_OBJDETECT_ARUCO_DETECTOR_HPP
+#define OPENCV_OBJDETECT_ARUCO_DETECTOR_HPP
+
+#include <opencv2/objdetect/aruco_dictionary.hpp>
+#include <opencv2/objdetect/aruco_board.hpp>
+
+namespace cv {
+namespace aruco {
+
+//! @addtogroup objdetect_aruco
+//! @{
+
+enum CornerRefineMethod{ // values for DetectorParameters::cornerRefinementMethod (stored there as int)
+    CORNER_REFINE_NONE,     ///< Tag and corners detection based on the ArUco approach
+    CORNER_REFINE_SUBPIX,   ///< ArUco approach and refine the corners locations using corner subpixel accuracy
+    CORNER_REFINE_CONTOUR,  ///< ArUco approach and refine the corners locations using the contour-points line fitting
+    CORNER_REFINE_APRILTAG, ///< Tag and corners detection based on the AprilTag 2 approach @cite wang2016iros
+};
+
+/** @brief struct DetectorParameters is used by ArucoDetector
+ */
+struct CV_EXPORTS_W_SIMPLE DetectorParameters {
+    CV_WRAP DetectorParameters() {
+        adaptiveThreshWinSizeMin = 3;
+        adaptiveThreshWinSizeMax = 23;
+        adaptiveThreshWinSizeStep = 10;
+        adaptiveThreshConstant = 7;
+        minMarkerPerimeterRate = 0.03;
+        maxMarkerPerimeterRate = 4.;
+        polygonalApproxAccuracyRate = 0.03;
+        minCornerDistanceRate = 0.05;
+        minDistanceToBorder = 3;
+        minMarkerDistanceRate = 0.125;
+        cornerRefinementMethod = (int)CORNER_REFINE_NONE;
+        cornerRefinementWinSize = 5;
+        relativeCornerRefinmentWinSize = 0.3f;
+        cornerRefinementMaxIterations = 30;
+        cornerRefinementMinAccuracy = 0.1;
+        markerBorderBits = 1;
+        perspectiveRemovePixelPerCell = 4;
+        perspectiveRemoveIgnoredMarginPerCell = 0.13;
+        maxErroneousBitsInBorderRate = 0.35;
+        minOtsuStdDev = 5.0;
+        errorCorrectionRate = 0.6;
+        aprilTagQuadDecimate = 0.0;
+        aprilTagQuadSigma = 0.0;
+        aprilTagMinClusterPixels = 5;
+        aprilTagMaxNmaxima = 10;
+        aprilTagCriticalRad = (float)(10* CV_PI /180);
+        aprilTagMaxLineFitMse = 10.0;
+        aprilTagMinWhiteBlackDiff = 5;
+        aprilTagDeglitch = 0;
+        detectInvertedMarker = false;
+        useAruco3Detection = false;
+        minSideLengthCanonicalImg = 32;
+        minMarkerLengthRatioOriginalImg = 0.0;
+    }
+
+    /** @brief Read a new set of DetectorParameters from FileNode (use FileStorage.root()).
+     */
+    CV_WRAP bool readDetectorParameters(const FileNode& fn);
+
+    /** @brief Write a set of DetectorParameters to FileStorage
+     */
+    CV_WRAP bool writeDetectorParameters(FileStorage& fs, const String& name = String());
+
+    /// minimum window size for adaptive thresholding before finding contours (default 3).
+    CV_PROP_RW int adaptiveThreshWinSizeMin;
+
+    /// maximum window size for adaptive thresholding before finding contours (default 23).
+    CV_PROP_RW int adaptiveThreshWinSizeMax;
+
+    /// increments from adaptiveThreshWinSizeMin to adaptiveThreshWinSizeMax during the thresholding (default 10).
+    CV_PROP_RW int adaptiveThreshWinSizeStep;
+
+    /// constant for adaptive thresholding before finding contours (default 7)
+    CV_PROP_RW double adaptiveThreshConstant;
+
+    /** @brief determine minimum perimeter for marker contour to be detected.
+     *
+     * This is defined as a rate with respect to the maximum dimension of the input image (default 0.03).
+     */
+    CV_PROP_RW double minMarkerPerimeterRate;
+
+    /** @brief determine maximum perimeter for marker contour to be detected.
+     *
+     * This is defined as a rate with respect to the maximum dimension of the input image (default 4.0).
+     */
+    CV_PROP_RW double maxMarkerPerimeterRate;
+
+    /// minimum accuracy during the polygonal approximation process to determine which contours are squares. (default 0.03)
+    CV_PROP_RW double polygonalApproxAccuracyRate;
+
+    /// minimum distance between corners for detected markers relative to its perimeter (default 0.05)
+    CV_PROP_RW double minCornerDistanceRate;
+
+    /// minimum distance of any corner to the image border for detected markers (in pixels) (default 3)
+    CV_PROP_RW int minDistanceToBorder;
+
+    /** @brief minimum average distance between the corners of the two markers to be grouped (default 0.125).
+     *
+     * The rate is relative to the smaller perimeter of the two markers.
+     * Two markers are grouped if average distance between the corners of the two markers is less than
+     * min(MarkerPerimeter1, MarkerPerimeter2)*minMarkerDistanceRate.
+     *
+     * default value is 0.125 because 0.125*MarkerPerimeter = (MarkerPerimeter / 4) * 0.5 = half the side of the marker.
+     *
+     * @note default value was changed from 0.05 after 4.8.1 release, because the filtering algorithm has been changed.
+     * Now a few candidates from the same group can be added to the list of candidates if they are far from each other.
+     * @sa minGroupDistance.
+     */
+    CV_PROP_RW double minMarkerDistanceRate;
+
+    /** @brief minimum average distance between the corners of the two markers in group to add them to the list of candidates
+     *
+     * The average distance between the corners of the two markers is calculated relative to its module size (default 0.21).
+     */
+    CV_PROP_RW float minGroupDistance = 0.21f;
+
+    /** @brief default value CORNER_REFINE_NONE */
+    CV_PROP_RW int cornerRefinementMethod;
+
+    /** @brief maximum window size for the corner refinement process (in pixels) (default 5).
+     *
+     * The window size may decrease if the ArUco marker is too small, check relativeCornerRefinmentWinSize.
+     * The final window size is calculated as:
+     * min(cornerRefinementWinSize, averageArucoModuleSize*relativeCornerRefinmentWinSize),
+     * where averageArucoModuleSize is average module size of ArUco marker in pixels.
+     * (ArUco marker is composed of black and white modules)
+     */
+    CV_PROP_RW int cornerRefinementWinSize;
+
+    /** @brief Dynamic window size for corner refinement relative to Aruco module size (default 0.3).
+     *
+     * The final window size is calculated as:
+     * min(cornerRefinementWinSize, averageArucoModuleSize*relativeCornerRefinmentWinSize),
+     * where averageArucoModuleSize is average module size of ArUco marker in pixels.
+     * (ArUco marker is composed of black and white modules)
+     * In the case of markers located far from each other, it may be useful to increase the value of the parameter to 0.4-0.5.
+     * In the case of markers located close to each other, it may be useful to decrease the parameter value to 0.1-0.2.
+     */
+    CV_PROP_RW float relativeCornerRefinmentWinSize;
+
+    /// maximum number of iterations for stop criteria of the corner refinement process (default 30).
+    CV_PROP_RW int cornerRefinementMaxIterations;
+
+    /// minimum error for the stop criteria of the corner refinement process (default: 0.1)
+    CV_PROP_RW double cornerRefinementMinAccuracy;
+
+    /// number of bits of the marker border, i.e. marker border width (default 1).
+    CV_PROP_RW int markerBorderBits;
+
+    /// number of bits (per dimension) for each cell of the marker when removing the perspective (default 4).
+    CV_PROP_RW int perspectiveRemovePixelPerCell;
+
+    /** @brief width of the margin of pixels on each cell not considered for the determination of the cell bit.
+     *
+     * Represents the rate with respect to the total size of the cell, i.e. perspectiveRemovePixelPerCell (default 0.13)
+     */
+    CV_PROP_RW double perspectiveRemoveIgnoredMarginPerCell;
+
+    /** @brief  maximum number of accepted erroneous bits in the border (i.e. number of allowed white bits in the border).
+     *
+     * Represented as a rate with respect to the total number of bits per marker (default 0.35).
+     */
+    CV_PROP_RW double maxErroneousBitsInBorderRate;
+
+    /** @brief minimum standard deviation in pixels values during the decoding step to apply Otsu
+     * thresholding (otherwise, all the bits are set to 0 or 1 depending on mean higher than 128 or not) (default 5.0)
+     */
+    CV_PROP_RW double minOtsuStdDev;
+
+    /// error correction rate with respect to the maximum error correction capability for each dictionary (default 0.6).
+    CV_PROP_RW double errorCorrectionRate;
+
+    /** @brief April :: User-configurable parameters.
+     *
+     * Detection of quads can be done on a lower-resolution image, improving speed at a cost of
+     * pose accuracy and a slight decrease in detection rate. Decoding the binary payload is still done at full resolution (default 0.0).
+     */
+    CV_PROP_RW float aprilTagQuadDecimate;
+
+    /// what Gaussian blur should be applied to the segmented image (used for quad detection?) (default 0.0)
+    CV_PROP_RW float aprilTagQuadSigma;
+
+    // April :: Internal variables
+    /// reject quads containing too few pixels (default 5).
+    CV_PROP_RW int aprilTagMinClusterPixels;
+
+    /// how many corner candidates to consider when segmenting a group of pixels into a quad (default 10).
+    CV_PROP_RW int aprilTagMaxNmaxima;
+
+    /** @brief reject quads where pairs of edges have angles that are close to straight or close to 180 degrees.
+     *
+     * Zero means that no quads are rejected. (In radians) (default 10*PI/180)
+     */
+    CV_PROP_RW float aprilTagCriticalRad;
+
+    /// when fitting lines to the contours, what is the maximum mean squared error (default 10.0)
+    CV_PROP_RW float aprilTagMaxLineFitMse;
+
+    /** @brief add an extra check that the white model must be (overall) brighter than the black model.
+     *
+     * When we build our model of black & white pixels, we add an extra check that the white model must be (overall)
+     * brighter than the black model. How much brighter? (in pixel values, [0,255]), (default 5)
+     */
+    CV_PROP_RW int aprilTagMinWhiteBlackDiff;
+
+    /// should the thresholded image be deglitched? Only useful for very noisy images (default 0).
+    CV_PROP_RW int aprilTagDeglitch;
+
+    /** @brief to check if there is a white marker.
+     *
+     * In order to generate a "white" marker just invert a normal marker by using a tilde, ~markerImage. (default false)
+     */
+    CV_PROP_RW bool detectInvertedMarker;
+
+    /** @brief enable the new and faster Aruco detection strategy (default false).
+     *
+     * Proposed in the paper:
+     * Romero-Ramirez et al: Speeded up detection of squared fiducial markers (2018)
+     * https://www.researchgate.net/publication/325787310_Speeded_Up_Detection_of_Squared_Fiducial_Markers
+     */
+    CV_PROP_RW bool useAruco3Detection;
+
+    /// minimum side length of a marker in the canonical image. Latter is the binarized image in which contours are searched (default 32).
+    CV_PROP_RW int minSideLengthCanonicalImg;
+
+    /// range [0,1], eq (2) from paper. The parameter tau_i has a direct influence on the processing speed (default 0.0).
+    CV_PROP_RW float minMarkerLengthRatioOriginalImg;
+};
+
+/** @brief struct RefineParameters is used by ArucoDetector
+ */
+struct CV_EXPORTS_W_SIMPLE RefineParameters {
+    CV_WRAP RefineParameters(float minRepDistance = 10.f, float errorCorrectionRate = 3.f, bool checkAllOrders = true);
+
+
+    /** @brief Read a new set of RefineParameters from FileNode (use FileStorage.root()).
+     */
+    CV_WRAP bool readRefineParameters(const FileNode& fn);
+
+    /** @brief Write a set of RefineParameters to FileStorage
+     */
+    CV_WRAP bool writeRefineParameters(FileStorage& fs, const String& name = String());
+
+    /** @brief minRepDistance minimum distance between the corners of the rejected candidate and the reprojected marker
+    in order to consider it as a correspondence.
+     */
+    CV_PROP_RW float minRepDistance;
+
+    /** @brief errorCorrectionRate rate of allowed erroneous bits with respect to the error correction capability of the used dictionary.
+     *
+     * -1 ignores the error correction step.
+     */
+    CV_PROP_RW float errorCorrectionRate;
+
+    /** @brief checkAllOrders consider the four possible corner orders in the rejectedCorners array.
+     *
+     * If it is set to false, only the provided corner order is considered (default true).
+     */
+    CV_PROP_RW bool checkAllOrders;
+};
+
+/** @brief The main functionality of ArucoDetector class is detection of markers in an image with detectMarkers() method.
+ *
+ * After detecting some markers in the image, you can try to find undetected markers from this dictionary with
+ * refineDetectedMarkers() method.
+ *
+ * @see DetectorParameters, RefineParameters
+ */
+class CV_EXPORTS_W ArucoDetector : public Algorithm
+{
+public:
+    /** @brief Basic ArucoDetector constructor
+     *
+     * @param dictionary indicates the type of markers that will be searched
+     * @param detectorParams marker detection parameters
+     * @param refineParams marker refine detection parameters
+     */
+    CV_WRAP ArucoDetector(const Dictionary &dictionary = getPredefinedDictionary(cv::aruco::DICT_4X4_50),
+                          const DetectorParameters &detectorParams = DetectorParameters(),
+                          const RefineParameters& refineParams = RefineParameters());
+
+    /** @brief Basic marker detection
+     *
+     * @param image input image
+     * @param corners vector of detected marker corners. For each marker, its four corners
+     * are provided, (e.g. std::vector<std::vector<cv::Point2f> > ). For N detected markers,
+     * the dimensions of this array are Nx4. The order of the corners is clockwise.
+     * @param ids vector of identifiers of the detected markers. The identifier is of type int
+     * (e.g. std::vector<int>). For N detected markers, the size of ids is also N.
+     * The identifiers have the same order as the markers in the imgPoints array.
+     * @param rejectedImgPoints contains the imgPoints of those squares whose inner code does not have a
+     * correct codification. Useful for debugging purposes.
+     *
+     * Performs marker detection in the input image. Only markers included in the specific dictionary
+     * are searched. For each detected marker, it returns the 2D position of its corner in the image
+     * and its corresponding identifier.
+     * Note that this function does not perform pose estimation.
+     * @note The function does not correct lens distortion or take it into account. It's recommended to undistort
+     * input image with corresponding camera model, if camera parameters are known
+     * @sa undistort, estimatePoseSingleMarkers,  estimatePoseBoard
+     */
+    CV_WRAP void detectMarkers(InputArray image, OutputArrayOfArrays corners, OutputArray ids,
+                               OutputArrayOfArrays rejectedImgPoints = noArray()) const;
+
+    /** @brief Refine not detected markers based on the already detected and the board layout
+     *
+     * @param image input image
+     * @param board layout of markers in the board.
+     * @param detectedCorners vector of already detected marker corners.
+     * @param detectedIds vector of already detected marker identifiers.
+     * @param rejectedCorners vector of rejected candidates during the marker detection process.
+     * @param cameraMatrix optional input 3x3 floating-point camera matrix
+     * \f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$
+     * @param distCoeffs optional vector of distortion coefficients
+     * \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6],[s_1, s_2, s_3, s_4]])\f$ of 4, 5, 8 or 12 elements
+     * @param recoveredIdxs Optional array to return the indexes of the recovered candidates in the
+     * original rejectedCorners array.
+     *
+     * This function tries to find markers that were not detected in the basic detectMarkers function.
+     * First, based on the current detected marker and the board layout, the function interpolates
+     * the position of the missing markers. Then it tries to find correspondence between the reprojected
+     * markers and the rejected candidates based on the minRepDistance and errorCorrectionRate parameters.
+     * If camera parameters and distortion coefficients are provided, missing markers are reprojected
+     * using projectPoints function. If not, missing marker projections are interpolated using global
+     * homography, and all the marker corners in the board must have the same Z coordinate.
+     */
+    CV_WRAP void refineDetectedMarkers(InputArray image, const Board &board,
+                                       InputOutputArrayOfArrays detectedCorners,
+                                       InputOutputArray detectedIds, InputOutputArrayOfArrays rejectedCorners,
+                                       InputArray cameraMatrix = noArray(), InputArray distCoeffs = noArray(),
+                                       OutputArray recoveredIdxs = noArray()) const;
+
+    CV_WRAP const Dictionary& getDictionary() const;
+    CV_WRAP void setDictionary(const Dictionary& dictionary);
+
+    CV_WRAP const DetectorParameters& getDetectorParameters() const;
+    CV_WRAP void setDetectorParameters(const DetectorParameters& detectorParameters);
+
+    CV_WRAP const RefineParameters& getRefineParameters() const;
+    CV_WRAP void setRefineParameters(const RefineParameters& refineParameters);
+
+    /** @brief Stores algorithm parameters in a file storage
+    */
+    virtual void write(FileStorage& fs) const override;
+
+    /** @brief simplified API for language bindings; forwards to Algorithm::write(fs, name)
+    */
+    CV_WRAP inline void write(FileStorage& fs, const String& name) { Algorithm::write(fs, name); }
+
+    /** @brief Reads algorithm parameters from a file storage
+    */
+    CV_WRAP virtual void read(const FileNode& fn) override;
+protected:
+    struct ArucoDetectorImpl;
+    Ptr<ArucoDetectorImpl> arucoDetectorImpl;
+};
+
+/** @brief Draw detected markers in image
+ *
+ * @param image input/output image. It must have 1 or 3 channels. The number of channels is not altered.
+ * @param corners positions of marker corners on input image.
+ * (e.g. std::vector<std::vector<cv::Point2f> > ). For N detected markers, the dimensions of
+ * this array should be Nx4. The order of the corners should be clockwise.
+ * @param ids vector of identifiers for markers in corners.
+ * Optional, if not provided, ids are not painted.
+ * @param borderColor color of marker borders. Rest of colors (text color and first corner color)
+ * are calculated based on this one to improve visualization.
+ *
+ * Given an array of detected marker corners and its corresponding ids, this function draws
+ * the markers in the image. The marker borders are painted, and so are the marker identifiers if provided.
+ * Useful for debugging purposes.
+ */
+CV_EXPORTS_W void drawDetectedMarkers(InputOutputArray image, InputArrayOfArrays corners,
+                                      InputArray ids = noArray(), Scalar borderColor = Scalar(0, 255, 0));
+
+/** @brief Generate a canonical marker image
+ *
+ * @param dictionary dictionary of markers indicating the type of markers
+ * @param id identifier of the marker that will be returned. It has to be a valid id in the specified dictionary.
+ * @param sidePixels size of the image in pixels
+ * @param img output image with the marker
+ * @param borderBits width of the marker border (default 1).
+ *
+ * This function returns a marker image in its canonical form (i.e. ready to be printed)
+ */
+CV_EXPORTS_W void generateImageMarker(const Dictionary &dictionary, int id, int sidePixels, OutputArray img,
+                                      int borderBits = 1);
+
+//! @}
+
+}
+}
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/aruco_dictionary.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/aruco_dictionary.hpp
new file mode 100644
index 000000000000..bc7b934b2a60
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/aruco_dictionary.hpp
@@ -0,0 +1,155 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+#ifndef OPENCV_OBJDETECT_DICTIONARY_HPP
+#define OPENCV_OBJDETECT_DICTIONARY_HPP
+
+#include <opencv2/core.hpp>
+
+namespace cv {
+namespace aruco {
+
+//! @addtogroup objdetect_aruco
+//! @{
+
+
+/** @brief Dictionary is a set of unique ArUco markers of the same size
+ *
+ * `bytesList` is stored as a 2-dimensional Mat with 4 channels (type CV_8UC4) and contains the marker codewords, where:
+ * - bytesList.rows is the dictionary size
+ * - each marker is encoded using `nbytes = ceil(markerSize*markerSize/8.)` bytes
+ * - each row contains all 4 rotations of the marker, so its length is `4*nbytes`
+ * - the byte order in the bytesList[i] row:
+ * `//bytes without rotation/bytes with rotation 1/bytes with rotation 2/bytes with rotation 3//`
+ * So `bytesList.ptr(i)[k*nbytes + j]` is the j-th byte of the i-th marker, in its k-th rotation.
+ * @note Python bindings generate a matrix of shape `dictionary_size x nbytes x 4` for bytesList,
+ * but it should be indexed like the C++ version. Python example for the j-th byte of the i-th marker, in its k-th rotation:
+ * `aruco_dict.bytesList[id].ravel()[k*nbytes + j]`
+ */
+class CV_EXPORTS_W_SIMPLE Dictionary {
+
+    public:
+    CV_PROP_RW Mat bytesList;         ///< marker code information. See class description for more details
+    CV_PROP_RW int markerSize;        ///< number of bits per dimension
+    CV_PROP_RW int maxCorrectionBits; ///< maximum number of bits that can be corrected
+
+    CV_WRAP Dictionary();
+
+    /** @brief Basic ArUco dictionary constructor
+     *
+     * @param bytesList bits for all ArUco markers in the dictionary; see the memory layout in the class description
+     * @param _markerSize ArUco marker size in units
+     * @param maxcorr maximum number of bits that can be corrected
+     */
+    CV_WRAP Dictionary(const Mat &bytesList, int _markerSize, int maxcorr = 0);
+
+    /** @brief Read a new dictionary from FileNode.
+     *
+     * Dictionary example in YAML format:\n
+     * nmarkers: 35\n
+     * markersize: 6\n
+     * maxCorrectionBits: 5\n
+     * marker_0: "101011111011111001001001101100000000"\n
+     * ...\n
+     * marker_34: "011111010000111011111110110101100101"
+     */
+    CV_WRAP bool readDictionary(const cv::FileNode& fn);
+
+    /** @brief Write a dictionary to FileStorage, format is the same as in readDictionary().
+     */
+    CV_WRAP void writeDictionary(FileStorage& fs, const String& name = String());
+
+    /** @brief Given a matrix of bits, returns whether the marker is identified or not.
+     *
+     * Outputs the marker id in the dictionary (if any) and its rotation through idx and rotation.
+     */
+    CV_WRAP bool identify(const Mat &onlyBits, CV_OUT int &idx, CV_OUT int &rotation, double maxCorrectionRate) const;
+
+    /** @brief Returns Hamming distance of the input bits to the specific id.
+     *
+     * If the `allRotations` flag is set, the four possible marker rotations are considered.
+     */
+    CV_WRAP int getDistanceToId(InputArray bits, int id, bool allRotations = true) const;
+
+
+    /** @brief Generate a canonical marker image
+     */
+    CV_WRAP void generateImageMarker(int id, int sidePixels, OutputArray _img, int borderBits = 1) const;
+
+
+    /** @brief Transform matrix of bits to list of bytes with 4 marker rotations
+      */
+    CV_WRAP static Mat getByteListFromBits(const Mat &bits);
+
+
+    /** @brief Transform list of bytes to matrix of bits
+      */
+    CV_WRAP static Mat getBitsFromByteList(const Mat &byteList, int markerSize);
+};
+
+
+
+
+/** @brief Predefined markers dictionaries/sets
+ *
+ * Each dictionary indicates the number of bits and the number of markers contained
+ * - DICT_ARUCO_ORIGINAL: standard ArUco Library Markers. 1024 markers, 5x5 bits, 0 minimum
+ *                        distance
+ */
+enum PredefinedDictionaryType {
+    DICT_4X4_50 = 0,        ///< 4x4 bits, minimum hamming distance between any two codes = 4, 50 codes
+    DICT_4X4_100,           ///< 4x4 bits, minimum hamming distance between any two codes = 3, 100 codes
+    DICT_4X4_250,           ///< 4x4 bits, minimum hamming distance between any two codes = 3, 250 codes
+    DICT_4X4_1000,          ///< 4x4 bits, minimum hamming distance between any two codes = 2, 1000 codes
+    DICT_5X5_50,            ///< 5x5 bits, minimum hamming distance between any two codes = 8, 50 codes
+    DICT_5X5_100,           ///< 5x5 bits, minimum hamming distance between any two codes = 7, 100 codes
+    DICT_5X5_250,           ///< 5x5 bits, minimum hamming distance between any two codes = 6, 250 codes
+    DICT_5X5_1000,          ///< 5x5 bits, minimum hamming distance between any two codes = 5, 1000 codes
+    DICT_6X6_50,            ///< 6x6 bits, minimum hamming distance between any two codes = 13, 50 codes
+    DICT_6X6_100,           ///< 6x6 bits, minimum hamming distance between any two codes = 12, 100 codes
+    DICT_6X6_250,           ///< 6x6 bits, minimum hamming distance between any two codes = 11, 250 codes
+    DICT_6X6_1000,          ///< 6x6 bits, minimum hamming distance between any two codes = 9, 1000 codes
+    DICT_7X7_50,            ///< 7x7 bits, minimum hamming distance between any two codes = 19, 50 codes
+    DICT_7X7_100,           ///< 7x7 bits, minimum hamming distance between any two codes = 18, 100 codes
+    DICT_7X7_250,           ///< 7x7 bits, minimum hamming distance between any two codes = 17, 250 codes
+    DICT_7X7_1000,          ///< 7x7 bits, minimum hamming distance between any two codes = 14, 1000 codes
+    DICT_ARUCO_ORIGINAL,    ///< 5x5 bits, 1024 codes (standard ArUco library markers; minimum distance documented as 0 in the enum description)
+    DICT_APRILTAG_16h5,     ///< 4x4 bits, minimum hamming distance between any two codes = 5, 30 codes
+    DICT_APRILTAG_25h9,     ///< 5x5 bits, minimum hamming distance between any two codes = 9, 35 codes
+    DICT_APRILTAG_36h10,    ///< 6x6 bits, minimum hamming distance between any two codes = 10, 2320 codes
+    DICT_APRILTAG_36h11,     ///< 6x6 bits, minimum hamming distance between any two codes = 11, 587 codes
+    DICT_ARUCO_MIP_36h12     ///< 6x6 bits, minimum hamming distance between any two codes = 12, 250 codes
+};
+
+
+/** @brief Returns one of the predefined dictionaries defined in PredefinedDictionaryType
+  */
+CV_EXPORTS Dictionary getPredefinedDictionary(PredefinedDictionaryType name);
+
+
+/** @brief Returns one of the predefined dictionaries referenced by DICT_* (int overload exposed to the bindings).
+  */
+CV_EXPORTS_W Dictionary getPredefinedDictionary(int dict);
+
+/** @brief Extend base dictionary by new nMarkers
+  *
+  * @param nMarkers number of markers in the dictionary
+  * @param markerSize number of bits per dimension of each marker
+  * @param baseDictionary Include the markers in this dictionary at the beginning (optional)
+  * @param randomSeed a user supplied seed for theRNG()
+  *
+  * This function creates a new dictionary composed of nMarkers markers, each marker composed
+  * of markerSize x markerSize bits. If baseDictionary is provided, its markers are directly
+  * included and the rest are generated based on them. If the size of baseDictionary is larger
+  * than nMarkers, only the first nMarkers in baseDictionary are taken and no new marker is added.
+  */
+CV_EXPORTS_W Dictionary extendDictionary(int nMarkers, int markerSize, const Dictionary &baseDictionary = Dictionary(),
+                                         int randomSeed=0);
+
+
+
+//! @}
+}
+}
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/barcode.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/barcode.hpp
new file mode 100644
index 000000000000..c20b67c0b29e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/barcode.hpp
@@ -0,0 +1,111 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+// Copyright (c) 2020-2021 darkliang wangberlinT Certseeds
+
+#ifndef OPENCV_OBJDETECT_BARCODE_HPP
+#define OPENCV_OBJDETECT_BARCODE_HPP
+
+#include <opencv2/core.hpp>
+#include <opencv2/objdetect/graphical_code_detector.hpp>
+
+namespace cv {
+namespace barcode {
+
+//! @addtogroup objdetect_barcode
+//! @{
+
+class CV_EXPORTS_W_SIMPLE BarcodeDetector : public cv::GraphicalCodeDetector
+{
+public:
+    /** @brief Initialize the BarcodeDetector.
+    */
+    CV_WRAP BarcodeDetector();
+    /** @brief Initialize the BarcodeDetector.
+     *
+     * Parameters allow loading an _optional_ Super Resolution DNN model for better quality.
+     * @param prototxt_path prototxt file path for the super resolution model
+     * @param model_path model file path for the super resolution model
+     */
+    CV_WRAP BarcodeDetector(CV_WRAP_FILE_PATH const std::string &prototxt_path, CV_WRAP_FILE_PATH const std::string &model_path);
+    ~BarcodeDetector();
+
+    /** @brief Decodes barcode in image once it's found by the detect() method.
+     *
+     * @param img grayscale or color (BGR) image containing bar code.
+     * @param points vector of rotated rectangle vertices found by detect() method (or some other algorithm).
+     * For N detected barcodes, the dimensions of this array should be [N][4].
+     * Order of four points in vector<Point2f> is bottomLeft, topLeft, topRight, bottomRight.
+     * @param decoded_info UTF8-encoded output vector of string or empty vector of string if the codes cannot be decoded.
+     * @param decoded_type vector of strings, specifies the type of these barcodes
+     * @return true if at least one valid barcode has been found
+     */
+    CV_WRAP bool decodeWithType(InputArray img,
+                             InputArray points,
+                             CV_OUT std::vector<std::string> &decoded_info,
+                             CV_OUT std::vector<std::string> &decoded_type) const;
+
+    /** @brief Both detects and decodes barcode
+
+     * @param img grayscale or color (BGR) image containing barcode.
+     * @param decoded_info UTF8-encoded output vector of string(s) or empty vector of string if the codes cannot be decoded.
+     * @param decoded_type vector of strings, specifies the type of these barcodes
+     * @param points optional output vector of vertices of the found barcode rectangle. Will be empty if not found.
+     * @return true if at least one valid barcode has been found
+     */
+    CV_WRAP bool detectAndDecodeWithType(InputArray img,
+                                      CV_OUT std::vector<std::string> &decoded_info,
+                                      CV_OUT std::vector<std::string> &decoded_type,
+                                      OutputArray points = noArray()) const;
+
+    /** @brief Get detector downsampling threshold.
+     *
+     * @return detector downsampling threshold
+     */
+    CV_WRAP double getDownsamplingThreshold() const;
+
+    /** @brief Set detector downsampling threshold.
+     *
+     * By default, the detect method resizes the input image to this limit if the smallest image size is greater than the threshold.
+     * Increasing this value can improve detection accuracy and the number of results at the expense of performance.
+     * Correlates with detector scales. Setting this to a large value will disable downsampling.
+     * @param thresh downsampling limit to apply (default 512)
+     * @see setDetectorScales
+     */
+    CV_WRAP BarcodeDetector& setDownsamplingThreshold(double thresh);
+
+    /** @brief Returns detector box filter sizes.
+     *
+     * @param sizes output parameter for returning the sizes.
+     */
+    CV_WRAP void getDetectorScales(CV_OUT std::vector<float>& sizes) const;
+
+    /** @brief Set detector box filter sizes.
+     *
+     * Adjusts the value and the number of box filters used in the detect step.
+     * The filter sizes directly correlate with the expected line widths for a barcode. Corresponds to expected barcode distance.
+     * If the downsampling limit is increased, filter sizes need to be adjusted in an inversely proportional way.
+     * @param sizes box filter sizes, relative to minimum dimension of the image (default [0.01, 0.03, 0.06, 0.08])
+     */
+    CV_WRAP BarcodeDetector& setDetectorScales(const std::vector<float>& sizes);
+
+    /** @brief Get detector gradient magnitude threshold.
+     *
+     * @return detector gradient magnitude threshold.
+     */
+    CV_WRAP double getGradientThreshold() const;
+
+    /** @brief Set detector gradient magnitude threshold.
+     *
+     * Sets the coherence threshold for detected bounding boxes.
+     * Increasing this value will generate a closer fitted bounding box width and can reduce false-positives.
+     * Values between 16 and 1024 generally work, while too high of a value will remove valid detections.
+     * @param thresh gradient magnitude threshold (default 64).
+     */
+    CV_WRAP BarcodeDetector& setGradientThreshold(double thresh);
+};
+//! @}
+
+}} // cv::barcode::
+
+#endif // OPENCV_OBJDETECT_BARCODE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/charuco_detector.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/charuco_detector.hpp
new file mode 100644
index 000000000000..e10cb3f02542
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/charuco_detector.hpp
@@ -0,0 +1,157 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+#ifndef OPENCV_OBJDETECT_CHARUCO_DETECTOR_HPP
+#define OPENCV_OBJDETECT_CHARUCO_DETECTOR_HPP
+
+#include "opencv2/objdetect/aruco_detector.hpp"
+
+namespace cv {
+namespace aruco {
+
+//! @addtogroup objdetect_aruco
+//! @{
+
+struct CV_EXPORTS_W_SIMPLE CharucoParameters {
+    CV_WRAP CharucoParameters() {
+        minMarkers = 2;
+        tryRefineMarkers = false;
+    }
+    /// cameraMatrix optional 3x3 floating-point camera matrix
+    CV_PROP_RW Mat cameraMatrix;
+
+    /// distCoeffs optional vector of distortion coefficients
+    CV_PROP_RW Mat distCoeffs;
+
+    /// minMarkers number of adjacent markers that must be detected to return a charuco corner, default = 2
+    CV_PROP_RW int minMarkers;
+
+    /// tryRefineMarkers try to refine the markers using the board, default = false
+    CV_PROP_RW bool tryRefineMarkers;
+};
+
+class CV_EXPORTS_W CharucoDetector : public Algorithm {
+public:
+    /** @brief Basic CharucoDetector constructor
+     *
+     * @param board ChAruco board
+     * @param charucoParams charuco detection parameters
+     * @param detectorParams marker detection parameters
+     * @param refineParams marker refine detection parameters
+     */
+    CV_WRAP CharucoDetector(const CharucoBoard& board,
+                            const CharucoParameters& charucoParams = CharucoParameters(),
+                            const DetectorParameters &detectorParams = DetectorParameters(),
+                            const RefineParameters& refineParams = RefineParameters());
+
+    CV_WRAP const CharucoBoard& getBoard() const;
+    CV_WRAP void setBoard(const CharucoBoard& board);
+
+    CV_WRAP const CharucoParameters& getCharucoParameters() const;
+    CV_WRAP void setCharucoParameters(CharucoParameters& charucoParameters); // NOTE: non-const reference, unlike the other setters
+
+    CV_WRAP const DetectorParameters& getDetectorParameters() const;
+    CV_WRAP void setDetectorParameters(const DetectorParameters& detectorParameters);
+
+    CV_WRAP const RefineParameters& getRefineParameters() const;
+    CV_WRAP void setRefineParameters(const RefineParameters& refineParameters);
+
+    /**
+     * @brief detect aruco markers and interpolate position of ChArUco board corners
+     * @param image input image necessary for corner refinement. Markers already detected by the caller
+     * can be passed in the markerCorners and markerIds parameters.
+     * @param charucoCorners interpolated chessboard corners.
+     * @param charucoIds interpolated chessboard corners identifiers.
+     * @param markerCorners vector of already detected markers corners. For each marker, its four
+     * corners are provided, (e.g std::vector<std::vector<cv::Point2f> > ). For N detected markers, the
+     * dimensions of this array should be Nx4. The order of the corners should be clockwise.
+     * If markerCorners and markerIds are empty, the function detects aruco markers and ids.
+     * @param markerIds list of identifiers for each marker in markerCorners.
+     *  If markerCorners and markerIds are empty, the function detects aruco markers and ids.
+     *
+     * This function receives the detected markers and returns the 2D position of the chessboard corners
+     * from a ChArUco board using the detected Aruco markers.
+     *
+     * If markerCorners and markerIds are empty, detectMarkers() will run and detect aruco markers and ids.
+     *
+     * If camera parameters are provided, the process is based on an approximate pose estimation, else it is based on local homography.
+     * Only visible corners are returned. For each corner, its corresponding identifier is also returned in charucoIds.
+     * @sa findChessboardCorners
+     * @note After OpenCV 4.6.0, there was an incompatible change in the ChArUco pattern generation algorithm for even row counts.
+     * Use cv::aruco::CharucoBoard::setLegacyPattern() to ensure compatibility with patterns created using OpenCV versions prior to 4.6.0.
+     * For more information, see the issue: https://github.com/opencv/opencv/issues/23152
+     */
+    CV_WRAP void detectBoard(InputArray image, OutputArray charucoCorners, OutputArray charucoIds,
+                             InputOutputArrayOfArrays markerCorners = noArray(),
+                             InputOutputArray markerIds = noArray()) const;
+
+    /**
+     * @brief Detect ChArUco Diamond markers
+     *
+     * @param image input image necessary for corner subpixel.
+     * @param diamondCorners output list of detected diamond corners (4 corners per diamond). The order
+     * is the same as in marker corners: top left, top right, bottom right and bottom left. Similar
+     * format to the corners returned by detectMarkers (e.g std::vector<std::vector<cv::Point2f> > ).
+     * @param diamondIds ids of the diamonds in diamondCorners. The id of each diamond is in fact of
+     * type Vec4i, so each diamond has 4 ids, which are the ids of the aruco markers composing the
+     * diamond.
+     * @param markerCorners list of detected marker corners from detectMarkers function.
+     * If markerCorners and markerIds are empty, the function detects aruco markers and ids.
+     * @param markerIds list of marker ids in markerCorners.
+     * If markerCorners and markerIds are empty, the function detects aruco markers and ids.
+     *
+     * This function detects Diamond markers from the previous detected ArUco markers. The diamonds
+     * are returned in the diamondCorners and diamondIds parameters. If camera calibration parameters
+     * are provided, the diamond search is based on reprojection. If not, diamond search is based on
+     * homography. Homography is faster than reprojection, but less accurate.
+     */
+    CV_WRAP void detectDiamonds(InputArray image, OutputArrayOfArrays diamondCorners, OutputArray diamondIds,
+                                InputOutputArrayOfArrays markerCorners = noArray(),
+                                InputOutputArray markerIds = noArray()) const;
+protected:
+    struct CharucoDetectorImpl;
+    Ptr<CharucoDetectorImpl> charucoDetectorImpl;
+};
+
+/**
+ * @brief Draws a set of Charuco corners
+ * @param image input/output image. It must have 1 or 3 channels. The number of channels is not
+ * altered.
+ * @param charucoCorners vector of detected charuco corners
+ * @param charucoIds list of identifiers for each corner in charucoCorners
+ * @param cornerColor color of the square surrounding each corner
+ *
+ * This function draws a set of detected Charuco corners. If the identifiers vector is provided, it
+ * also draws the id of each corner.
+ */
+CV_EXPORTS_W void drawDetectedCornersCharuco(InputOutputArray image, InputArray charucoCorners,
+                                             InputArray charucoIds = noArray(), Scalar cornerColor = Scalar(255, 0, 0));
+
+/**
+ * @brief Draw a set of detected ChArUco Diamond markers
+ *
+ * @param image input/output image. It must have 1 or 3 channels. The number of channels is not
+ * altered.
+ * @param diamondCorners positions of diamond corners in the same format returned by
+ * CharucoDetector::detectDiamonds(). (e.g std::vector<std::vector<cv::Point2f> > ). For N detected markers,
+ * the dimensions of this array should be Nx4. The order of the corners should be clockwise.
+ * @param diamondIds vector of identifiers for diamonds in diamondCorners, in the same format
+ * returned by CharucoDetector::detectDiamonds() (e.g. std::vector<Vec4i>).
+ * Optional, if not provided, ids are not painted.
+ * @param borderColor color of marker borders. Rest of colors (text color and first corner color)
+ * are calculated based on this one.
+ *
+ * Given an array of detected diamonds, this function draws them in the image. The marker borders
+ * are painted, as are the marker identifiers if provided.
+ * Useful for debugging purposes.
+ */
+CV_EXPORTS_W void drawDetectedDiamonds(InputOutputArray image, InputArrayOfArrays diamondCorners,
+                                       InputArray diamondIds = noArray(),
+                                       Scalar borderColor = Scalar(0, 0, 255));
+
+//! @}
+
+}
+}
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/detection_based_tracker.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/detection_based_tracker.hpp
new file mode 100644
index 000000000000..8050278b4232
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/detection_based_tracker.hpp
@@ -0,0 +1,222 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_OBJDETECT_DBT_HPP
+#define OPENCV_OBJDETECT_DBT_HPP
+
+#include <opencv2/core.hpp>
+
+#include <vector>
+
+namespace cv
+{
+
+//! @addtogroup objdetect_cascade_classifier
+//! @{
+
+class CV_EXPORTS DetectionBasedTracker
+{
+    public:
+        struct CV_EXPORTS Parameters
+        {
+            int maxTrackLifetime;
+            int minDetectionPeriod; //minimal time between runs of the big object detector (on the whole frame) in ms (1000 means 1 sec), default=0
+
+            Parameters();
+        };
+
+        class IDetector // abstract detector interface: detect() is pure virtual, the rest are parameter accessors
+        {
+            public:
+                IDetector():
+                    minObjSize(96, 96), // default minimum object size is 96x96
+                    maxObjSize(INT_MAX, INT_MAX), // INT_MAX is not provided by a direct <climits> include in this header
+                    minNeighbours(2),
+                    scaleFactor(1.1f)
+                {}
+
+                virtual void detect(const cv::Mat& image, std::vector<cv::Rect>& objects) = 0;
+
+                void setMinObjectSize(const cv::Size& min)
+                {
+                    minObjSize = min;
+                }
+                void setMaxObjectSize(const cv::Size& max)
+                {
+                    maxObjSize = max;
+                }
+                cv::Size getMinObjectSize() const
+                {
+                    return minObjSize;
+                }
+                cv::Size getMaxObjectSize() const
+                {
+                    return maxObjSize;
+                }
+                float getScaleFactor() // NOTE: not const-qualified, unlike the size getters above
+                {
+                    return scaleFactor;
+                }
+                void setScaleFactor(float value)
+                {
+                    scaleFactor = value;
+                }
+                int getMinNeighbours() // NOTE: not const-qualified, unlike the size getters above
+                {
+                    return minNeighbours;
+                }
+                void setMinNeighbours(int value)
+                {
+                    minNeighbours = value;
+                }
+                virtual ~IDetector() {}
+
+            protected:
+                cv::Size minObjSize;
+                cv::Size maxObjSize;
+                int minNeighbours;
+                float scaleFactor;
+        };
+
+        DetectionBasedTracker(cv::Ptr<IDetector> mainDetector, cv::Ptr<IDetector> trackingDetector, const Parameters& params);
+        virtual ~DetectionBasedTracker();
+
+        virtual bool run();
+        virtual void stop();
+        virtual void resetTracking();
+
+        virtual void process(const cv::Mat& imageGray);
+
+        bool setParameters(const Parameters& params);
+        const Parameters& getParameters() const;
+
+
+        typedef std::pair<cv::Rect, int> Object; // (rectangle, int) pair; presumably the int is the object id - confirm against the implementation
+        virtual void getObjects(std::vector<cv::Rect>& result) const;
+        virtual void getObjects(std::vector<Object>& result) const;
+
+        enum ObjectStatus
+        {
+            DETECTED_NOT_SHOWN_YET,
+            DETECTED,
+            DETECTED_TEMPORARY_LOST,
+            WRONG_OBJECT
+        };
+        struct ExtObject // bundles an object's id, location and ObjectStatus
+        {
+            int id;
+            cv::Rect location;
+            ObjectStatus status;
+            ExtObject(int _id, cv::Rect _location, ObjectStatus _status)
+                :id(_id), location(_location), status(_status)
+            {
+            }
+        };
+        virtual void getObjects(std::vector<ExtObject>& result) const;
+
+
+        virtual int addObject(const cv::Rect& location); //returns id of the new object
+
+    protected:
+        class SeparateDetectionWork;
+        cv::Ptr<SeparateDetectionWork> separateDetectionWork;
+        friend void* workcycleObjectDetectorFunction(void* p);
+
+        struct InnerParameters
+        {
+            int numLastPositionsToTrack;
+            int numStepsToWaitBeforeFirstShow;
+            int numStepsToTrackWithoutDetectingIfObjectHasNotBeenShown;
+            int numStepsToShowWithoutDetecting;
+
+            float coeffTrackingWindowSize;
+            float coeffObjectSizeToTrack;
+            float coeffObjectSpeedUsingInPrediction;
+
+            InnerParameters();
+        };
+        Parameters parameters;
+        InnerParameters innerParameters;
+
+        struct TrackedObject
+        {
+            typedef std::vector<cv::Rect> PositionsVector;
+
+            PositionsVector lastPositions;
+
+            int numDetectedFrames;
+            int numFramesNotDetected;
+            int id;
+
+            TrackedObject(const cv::Rect& rect):numDetectedFrames(1), numFramesNotDetected(0)
+            {
+                lastPositions.push_back(rect); // the initial rectangle becomes the first tracked position
+                id=getNextId();
+            }
+
+            static int getNextId() // hands out consecutive ids starting at 0
+            {
+                static int _id=0; // plain int shared by all TrackedObject instances; incremented without synchronization
+                return _id++;
+            }
+        };
+
+        int numTrackedSteps;
+        std::vector<TrackedObject> trackedObjects;
+
+        std::vector<float> weightsPositionsSmoothing;
+        std::vector<float> weightsSizesSmoothing;
+
+        cv::Ptr<IDetector> cascadeForTracking;
+
+        void updateTrackedObjects(const std::vector<cv::Rect>& detectedObjects);
+        cv::Rect calcTrackedObjectPositionToShow(int i) const;
+        cv::Rect calcTrackedObjectPositionToShow(int i, ObjectStatus& status) const;
+        void detectInRegion(const cv::Mat& img, const cv::Rect& r, std::vector<cv::Rect>& detectedObjectsInRegions);
+};
+
+//! @}
+
+} //end of cv namespace
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/face.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/face.hpp
new file mode 100644
index 000000000000..bfa04cbd16eb
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/face.hpp
@@ -0,0 +1,163 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_OBJDETECT_FACE_HPP
+#define OPENCV_OBJDETECT_FACE_HPP
+
+#include <opencv2/core.hpp>
+
+namespace cv
+{
+
+//! @addtogroup objdetect_dnn_face
+//! @{
+
+/** @brief DNN-based face detector
+
+model download link: https://github.com/opencv/opencv_zoo/tree/master/models/face_detection_yunet
+ */
+class CV_EXPORTS_W FaceDetectorYN
+{
+public:
+    virtual ~FaceDetectorYN() {}
+
+    /** @brief Set the size for the network input, which overwrites the input size specified when creating the model. Call this method when the size of the input image does not match the input size given when creating the model
+     *
+     * @param input_size the size of the input image
+     */
+    CV_WRAP virtual void setInputSize(const Size& input_size) = 0;
+
+    CV_WRAP virtual Size getInputSize() = 0;
+
+    /** @brief Set the score threshold to filter out bounding boxes of score less than the given value
+     *
+     * @param score_threshold threshold for filtering out bounding boxes
+     */
+    CV_WRAP virtual void setScoreThreshold(float score_threshold) = 0;
+
+    CV_WRAP virtual float getScoreThreshold() = 0;
+
+    /** @brief Set the Non-maximum-suppression threshold to suppress bounding boxes that have IoU greater than the given value
+     *
+     * @param nms_threshold threshold for NMS operation
+     */
+    CV_WRAP virtual void setNMSThreshold(float nms_threshold) = 0;
+
+    CV_WRAP virtual float getNMSThreshold() = 0;
+
+    /** @brief Set the number of bounding boxes preserved before NMS
+     *
+     * @param top_k the number of bounding boxes to preserve from top rank based on score
+     */
+    CV_WRAP virtual void setTopK(int top_k) = 0;
+
+    CV_WRAP virtual int getTopK() = 0;
+
+    /** @brief Detects faces in the input image. Following is an example output.
+
+     * ![image](pics/lena-face-detection.jpg)
+
+     *  @param image an image to detect
+     *  @param faces detection results stored in a 2D cv::Mat of shape [num_faces, 15]
+     *  - 0-1: x, y of bbox top left corner
+     *  - 2-3: width, height of bbox
+     *  - 4-5: x, y of right eye (blue point in the example image)
+     *  - 6-7: x, y of left eye (red point in the example image)
+     *  - 8-9: x, y of nose tip (green point in the example image)
+     *  - 10-11: x, y of right corner of mouth (pink point in the example image)
+     *  - 12-13: x, y of left corner of mouth (yellow point in the example image)
+     *  - 14: face score
+     */
+    CV_WRAP virtual int detect(InputArray image, OutputArray faces) = 0;
+
+    /** @brief Creates an instance of face detector class with given parameters
+     *
+     *  @param model the path to the requested model
+     *  @param config the path to the config file for compatibility, which is not required for ONNX models
+     *  @param input_size the size of the input image
+     *  @param score_threshold the threshold to filter out bounding boxes of score smaller than the given value
+     *  @param nms_threshold the threshold to suppress bounding boxes of IoU bigger than the given value
+     *  @param top_k keep top K bboxes before NMS
+     *  @param backend_id the id of backend
+     *  @param target_id the id of target device
+     */
+    CV_WRAP static Ptr<FaceDetectorYN> create(CV_WRAP_FILE_PATH const String& model,
+                                              CV_WRAP_FILE_PATH const String& config,
+                                              const Size& input_size,
+                                              float score_threshold = 0.9f,
+                                              float nms_threshold = 0.3f,
+                                              int top_k = 5000,
+                                              int backend_id = 0,
+                                              int target_id = 0);
+
+    /** @overload
+     *
+     *  @param framework Name of origin framework
+     *  @param bufferModel A buffer with the content of a binary file with weights
+     *  @param bufferConfig A buffer with the content of a text file that contains the network configuration
+     *  @param input_size the size of the input image
+     *  @param score_threshold the threshold to filter out bounding boxes of score smaller than the given value
+     *  @param nms_threshold the threshold to suppress bounding boxes of IoU bigger than the given value
+     *  @param top_k keep top K bboxes before NMS
+     *  @param backend_id the id of backend
+     *  @param target_id the id of target device
+     */
+    CV_WRAP static Ptr<FaceDetectorYN> create(const String& framework,
+                                              const std::vector<uchar>& bufferModel,
+                                              const std::vector<uchar>& bufferConfig,
+                                              const Size& input_size,
+                                              float score_threshold = 0.9f,
+                                              float nms_threshold = 0.3f,
+                                              int top_k = 5000,
+                                              int backend_id = 0,
+                                              int target_id = 0);
+
+};
+
+/** @brief DNN-based face recognizer
+
+model download link: https://github.com/opencv/opencv_zoo/tree/master/models/face_recognition_sface
+ */
+class CV_EXPORTS_W FaceRecognizerSF
+{
+public:
+    virtual ~FaceRecognizerSF() {}
+
+    /** @brief Definition of distance used for calculating the distance between two face features
+     */
+    enum DisType { FR_COSINE=0, FR_NORM_L2=1 };
+
+    /** @brief Aligning image to put face on the standard position
+     *  @param src_img input image
+     *  @param face_box the detection result used to indicate the face in the input image
+     *  @param aligned_img output aligned image
+     */
+    CV_WRAP virtual void alignCrop(InputArray src_img, InputArray face_box, OutputArray aligned_img) const = 0;
+
+    /** @brief Extracting face feature from aligned image
+     *  @param aligned_img input aligned image
+     *  @param face_feature output face feature
+     */
+    CV_WRAP virtual void feature(InputArray aligned_img, OutputArray face_feature) = 0;
+
+    /** @brief Calculating the distance between two face features
+     *  @param face_feature1 the first input feature
+     *  @param face_feature2 the second input feature of the same size and the same type as face_feature1
+     *  @param dis_type defining the similarity with optional values "FR_COSINE" or "FR_NORM_L2"
+     */
+    CV_WRAP virtual double match(InputArray face_feature1, InputArray face_feature2, int dis_type = FaceRecognizerSF::FR_COSINE) const = 0;
+
+    /** @brief Creates an instance of this class with given parameters
+     *  @param model the path of the onnx model used for face recognition
+     *  @param config the path to the config file for compatibility, which is not required for ONNX models
+     *  @param backend_id the id of backend
+     *  @param target_id the id of target device
+     */
+    CV_WRAP static Ptr<FaceRecognizerSF> create(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config, int backend_id = 0, int target_id = 0);
+};
+
+//! @}
+} // namespace cv
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/graphical_code_detector.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/graphical_code_detector.hpp
new file mode 100644
index 000000000000..ed697c50c055
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/graphical_code_detector.hpp
@@ -0,0 +1,85 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+#ifndef OPENCV_OBJDETECT_GRAPHICAL_CODE_DETECTOR_HPP
+#define OPENCV_OBJDETECT_GRAPHICAL_CODE_DETECTOR_HPP
+
+#include <opencv2/core.hpp>
+
+namespace cv {
+
+//! @addtogroup objdetect_common
+//! @{
+
+class CV_EXPORTS_W_SIMPLE GraphicalCodeDetector {
+public:
+    CV_DEPRECATED_EXTERNAL  // avoid using in C++ code, will be moved to "protected" (need to fix bindings first)
+    GraphicalCodeDetector();
+
+    GraphicalCodeDetector(const GraphicalCodeDetector&) = default;
+    GraphicalCodeDetector(GraphicalCodeDetector&&) = default;
+    GraphicalCodeDetector& operator=(const GraphicalCodeDetector&) = default;
+    GraphicalCodeDetector& operator=(GraphicalCodeDetector&&) = default;
+
+    /** @brief Detects graphical code in image and returns the quadrangle containing the code.
+     @param img grayscale or color (BGR) image containing (or not) graphical code.
+     @param points Output vector of vertices of the minimum-area quadrangle containing the code.
+     */
+    CV_WRAP bool detect(InputArray img, OutputArray points) const;
+
+    /** @brief Decodes graphical code in image once it's found by the detect() method.
+
+     Returns UTF8-encoded output string or empty string if the code cannot be decoded.
+     @param img grayscale or color (BGR) image containing graphical code.
+     @param points Quadrangle vertices found by detect() method (or some other algorithm).
+     @param straight_code The optional output image containing binarized code, will be empty if not found.
+     */
+    CV_WRAP std::string decode(InputArray img, InputArray points, OutputArray straight_code = noArray()) const;
+
+    /** @brief Both detects and decodes graphical code
+
+     @param img grayscale or color (BGR) image containing graphical code.
+     @param points optional output array of vertices of the found graphical code quadrangle, will be empty if not found.
+     @param straight_code The optional output image containing binarized code
+     */
+    CV_WRAP std::string detectAndDecode(InputArray img, OutputArray points = noArray(),
+                                        OutputArray straight_code = noArray()) const;
+
+
+    /** @brief Detects graphical codes in image and returns the vector of the quadrangles containing the codes.
+     @param img grayscale or color (BGR) image containing (or not) graphical codes.
+     @param points Output vector of vector of vertices of the minimum-area quadrangle containing the codes.
+     */
+    CV_WRAP bool detectMulti(InputArray img, OutputArray points) const;
+
+    /** @brief Decodes graphical codes in image once they are found by the detectMulti() method.
+     @param img grayscale or color (BGR) image containing graphical codes.
+     @param decoded_info UTF8-encoded output vector of string or empty vector of string if the codes cannot be decoded.
+     @param points vector of Quadrangle vertices found by detectMulti() method (or some other algorithm).
+     @param straight_code The optional output vector of images containing binarized codes
+     */
+    CV_WRAP bool decodeMulti(InputArray img, InputArray points, CV_OUT std::vector<std::string>& decoded_info,
+                             OutputArrayOfArrays straight_code = noArray()) const;
+
+    /** @brief Both detects and decodes graphical codes
+    @param img grayscale or color (BGR) image containing graphical codes.
+    @param decoded_info UTF8-encoded output vector of string or empty vector of string if the codes cannot be decoded.
+    @param points optional output vector of vertices of the found graphical code quadrangles. Will be empty if not found.
+    @param straight_code The optional vector of images containing binarized codes
+
+    - If there are QR codes encoded with Structured Append mode in the image and all of them are detected and decoded correctly,
+    the method writes the full message to the position corresponding to the 0-th code in the sequence. The rest of the QR codes
+    from the same sequence have an empty string.
+    */
+    CV_WRAP bool detectAndDecodeMulti(InputArray img, CV_OUT std::vector<std::string>& decoded_info, OutputArray points = noArray(),
+                                      OutputArrayOfArrays straight_code = noArray()) const;
+    struct Impl;
+protected:
+    Ptr<Impl> p;
+};
+
+//! @}
+
+}
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/objdetect.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/objdetect.hpp
new file mode 100644
index 000000000000..3ee284f4275e
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/objdetect/objdetect.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/objdetect.hpp"
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/opencv.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/opencv.hpp
new file mode 100644
index 000000000000..d17b94a4eac0
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/opencv.hpp
@@ -0,0 +1,95 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_ALL_HPP
+#define OPENCV_ALL_HPP
+
+// File that defines which modules were included during the build of OpenCV
+// These are purely the defines of the correct HAVE_OPENCV_modulename values
+#include "opencv2/opencv_modules.hpp"
+
+// Then the list of defines is checked to include the correct headers
+// Core library is always included --> without it no OpenCV functionality is available
+#include "opencv2/core.hpp"
+
+// Then the optional modules are checked
+#ifdef HAVE_OPENCV_CALIB3D
+#include "opencv2/calib3d.hpp"
+#endif
+#ifdef HAVE_OPENCV_FEATURES2D
+#include "opencv2/features2d.hpp"
+#endif
+#ifdef HAVE_OPENCV_DNN
+#include "opencv2/dnn.hpp"
+#endif
+#ifdef HAVE_OPENCV_FLANN
+#include "opencv2/flann.hpp"
+#endif
+#ifdef HAVE_OPENCV_HIGHGUI
+#include "opencv2/highgui.hpp"
+#endif
+#ifdef HAVE_OPENCV_IMGCODECS
+#include "opencv2/imgcodecs.hpp"
+#endif
+#ifdef HAVE_OPENCV_IMGPROC
+#include "opencv2/imgproc.hpp"
+#endif
+#ifdef HAVE_OPENCV_ML
+#include "opencv2/ml.hpp"
+#endif
+#ifdef HAVE_OPENCV_OBJDETECT
+#include "opencv2/objdetect.hpp"
+#endif
+#ifdef HAVE_OPENCV_PHOTO
+#include "opencv2/photo.hpp"
+#endif
+#ifdef HAVE_OPENCV_STITCHING
+#include "opencv2/stitching.hpp"
+#endif
+#ifdef HAVE_OPENCV_VIDEO
+#include "opencv2/video.hpp"
+#endif
+#ifdef HAVE_OPENCV_VIDEOIO
+#include "opencv2/videoio.hpp"
+#endif
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/opencv_modules.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/opencv_modules.hpp
new file mode 100644
index 000000000000..c9e24d845feb
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/opencv_modules.hpp
@@ -0,0 +1,30 @@
+/*
+ *      ** File generated automatically, do not modify **
+ *
+ * This file defines the list of modules available in current build configuration
+ *
+ *
+*/
+
+// This definition means that OpenCV is built with enabled non-free code.
+// For example, patented algorithms for non-profit/non-commercial use only.
+/* #undef OPENCV_ENABLE_NONFREE */
+
+#define HAVE_OPENCV_CALIB3D
+#define HAVE_OPENCV_CORE
+#define HAVE_OPENCV_DNN
+#define HAVE_OPENCV_FEATURES2D
+#define HAVE_OPENCV_FLANN
+#define HAVE_OPENCV_GAPI
+#define HAVE_OPENCV_HIGHGUI
+#define HAVE_OPENCV_IMGCODECS
+#define HAVE_OPENCV_IMGPROC
+#define HAVE_OPENCV_ML
+#define HAVE_OPENCV_OBJDETECT
+#define HAVE_OPENCV_PHOTO
+#define HAVE_OPENCV_STITCHING
+#define HAVE_OPENCV_VIDEO
+#define HAVE_OPENCV_VIDEOIO
+#define HAVE_OPENCV_WORLD
+
+
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/photo.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/photo.hpp
new file mode 100644
index 000000000000..392232851a45
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/photo.hpp
@@ -0,0 +1,857 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_PHOTO_HPP
+#define OPENCV_PHOTO_HPP
+
+#include "opencv2/core.hpp"
+#include "opencv2/imgproc.hpp"
+
+/**
+@defgroup photo Computational Photography
+
+This module includes photo processing algorithms
+@{
+    @defgroup photo_inpaint Inpainting
+    @defgroup photo_denoise Denoising
+    @defgroup photo_hdr HDR imaging
+
+    This section describes high dynamic range imaging algorithms namely tonemapping, exposure alignment,
+    camera calibration with multiple exposures and exposure fusion.
+
+    @defgroup photo_decolor Contrast Preserving Decolorization
+
+    Useful links:
+
+    http://www.cse.cuhk.edu.hk/leojia/projects/color2gray/index.html
+
+    @defgroup photo_clone Seamless Cloning
+
+    Useful links:
+
+    https://www.learnopencv.com/seamless-cloning-using-opencv-python-cpp
+
+    @defgroup photo_render Non-Photorealistic Rendering
+
+    Useful links:
+
+    http://www.inf.ufrgs.br/~eslgastal/DomainTransform
+
+    https://www.learnopencv.com/non-photorealistic-rendering-using-opencv-python-c/
+
+@}
+  */
+
+namespace cv
+{
+
+//! @addtogroup photo
+//! @{
+
+//! @addtogroup photo_inpaint
+//! @{
+//! the inpainting algorithm
+enum
+{
+    INPAINT_NS    = 0, //!< Use Navier-Stokes based method
+    INPAINT_TELEA = 1 //!< Use the algorithm proposed by Alexandru Telea @cite Telea04
+};
+
+/** @brief Restores the selected region in an image using the region neighborhood.
+
+@param src Input 8-bit, 16-bit unsigned or 32-bit float 1-channel or 8-bit 3-channel image.
+@param inpaintMask Inpainting mask, 8-bit 1-channel image. Non-zero pixels indicate the area that
+needs to be inpainted.
+@param dst Output image with the same size and type as src .
+@param inpaintRadius Radius of a circular neighborhood of each point inpainted that is considered
+by the algorithm.
+@param flags Inpainting method that could be cv::INPAINT_NS or cv::INPAINT_TELEA
+
+The function reconstructs the selected image area from the pixel near the area boundary. The
+function may be used to remove dust and scratches from a scanned photo, or to remove undesirable
+objects from still images or video. See <http://en.wikipedia.org/wiki/Inpainting> for more details.
+
+@note
+   -   An example using the inpainting technique can be found at
+        opencv_source_code/samples/cpp/inpaint.cpp
+   -   (Python) An example using the inpainting technique can be found at
+        opencv_source_code/samples/python/inpaint.py
+ */
+CV_EXPORTS_W void inpaint( InputArray src, InputArray inpaintMask,
+        OutputArray dst, double inpaintRadius, int flags );
+
+//! @} photo_inpaint
+
+//! @addtogroup photo_denoise
+//! @{
+
+/** @brief Perform image denoising using Non-local Means Denoising algorithm
+<http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/> with several computational
+optimizations. Noise is expected to be Gaussian white noise
+
+@param src Input 8-bit 1-channel, 2-channel, 3-channel or 4-channel image.
+@param dst Output image with the same size and type as src .
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: greater searchWindowSize - greater
+denoising time. Recommended value 21 pixels
+@param h Parameter regulating filter strength. Big h value perfectly removes noise but also
+removes image details, smaller h value preserves details but also preserves some noise
+
+This function is expected to be applied to grayscale images. For colored images look at
+fastNlMeansDenoisingColored. An advanced use of this function is manual denoising of a colored
+image in different colorspaces. Such an approach is used in fastNlMeansDenoisingColored by converting
+the image to CIELAB colorspace and then separately denoising the L and AB components with different h
+parameters.
+ */
+CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst, float h = 3,
+        int templateWindowSize = 7, int searchWindowSize = 21);
+
+/** @brief Perform image denoising using Non-local Means Denoising algorithm
+<http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/> with several computational
+optimizations. Noise is expected to be Gaussian white noise
+
+@param src Input 8-bit or 16-bit (only with NORM_L1) 1-channel,
+2-channel, 3-channel or 4-channel image.
+@param dst Output image with the same size and type as src .
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: greater searchWindowSize - greater
+denoising time. Recommended value 21 pixels
+@param h Array of parameters regulating filter strength, either one
+parameter applied to all channels or one per channel in dst. Big h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
+@param normType Type of norm used for weight calculation. Can be either NORM_L2 or NORM_L1
+
+This function is expected to be applied to grayscale images. For colored images look at
+fastNlMeansDenoisingColored. An advanced use of this function is manual denoising of a colored
+image in different colorspaces. Such an approach is used in fastNlMeansDenoisingColored by converting
+the image to CIELAB colorspace and then separately denoising the L and AB components with different h
+parameters.
+ */
+CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst,
+                                        const std::vector<float>& h,
+                                        int templateWindowSize = 7, int searchWindowSize = 21,
+                                        int normType = NORM_L2);
+
+/** @brief Modification of fastNlMeansDenoising function for colored images
+
+@param src Input 8-bit 3-channel image.
+@param dst Output image with the same size and type as src .
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: greater searchWindowSize - greater
+denoising time. Recommended value 21 pixels
+@param h Parameter regulating filter strength for luminance component. Bigger h value perfectly
+removes noise but also removes image details, smaller h value preserves details but also preserves
+some noise
+@param hColor The same as h but for color components. For most images value equals 10
+will be enough to remove colored noise and do not distort colors
+
+The function converts image to CIELAB colorspace and then separately denoise L and AB components
+with given h parameters using fastNlMeansDenoising function.
+ */
+CV_EXPORTS_W void fastNlMeansDenoisingColored( InputArray src, OutputArray dst,
+        float h = 3, float hColor = 3,
+        int templateWindowSize = 7, int searchWindowSize = 21);
+
+/** @brief Modification of fastNlMeansDenoising function for images sequence where consecutive images have been
+captured in small period of time. For example video. This version of the function is for grayscale
+images or for manual manipulation with colorspaces. See @cite Buades2005DenoisingIS for more details
+(open access [here](https://static.aminer.org/pdf/PDF/000/317/196/spatio_temporal_wiener_filtering_of_image_sequences_using_a_parametric.pdf)).
+
+@param srcImgs Input 8-bit 1-channel, 2-channel, 3-channel or
+4-channel images sequence. All images should have the same type and
+size.
+@param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
+@param temporalWindowSize Number of surrounding images to use for target image denoising. Should
+be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
+imgToDenoiseIndex + temporalWindowSize / 2 from srcImgs will be used to denoise
+srcImgs[imgToDenoiseIndex] image.
+@param dst Output image with the same size and type as srcImgs images.
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: greater searchWindowSize - greater
+denoising time. Recommended value 21 pixels
+@param h Parameter regulating filter strength. Bigger h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
+ */
+CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputArray dst,
+        int imgToDenoiseIndex, int temporalWindowSize,
+        float h = 3, int templateWindowSize = 7, int searchWindowSize = 21);
+
+/** @brief Modification of fastNlMeansDenoising function for images sequence where consecutive images have been
+captured in small period of time. For example video. This version of the function is for grayscale
+images or for manual manipulation with colorspaces. See @cite Buades2005DenoisingIS for more details
+(open access [here](https://static.aminer.org/pdf/PDF/000/317/196/spatio_temporal_wiener_filtering_of_image_sequences_using_a_parametric.pdf)).
+
+@param srcImgs Input 8-bit or 16-bit (only with NORM_L1) 1-channel,
+2-channel, 3-channel or 4-channel images sequence. All images should
+have the same type and size.
+@param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
+@param temporalWindowSize Number of surrounding images to use for target image denoising. Should
+be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
+imgToDenoiseIndex + temporalWindowSize / 2 from srcImgs will be used to denoise
+srcImgs[imgToDenoiseIndex] image.
+@param dst Output image with the same size and type as srcImgs images.
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: greater searchWindowSize - greater
+denoising time. Recommended value 21 pixels
+@param h Array of parameters regulating filter strength, either one
+parameter applied to all channels or one per channel in dst. Big h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
+@param normType Type of norm used for weight calculation. Can be either NORM_L2 or NORM_L1
+ */
+CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputArray dst,
+                                             int imgToDenoiseIndex, int temporalWindowSize,
+                                             const std::vector<float>& h,
+                                             int templateWindowSize = 7, int searchWindowSize = 21,
+                                             int normType = NORM_L2);
+
+/** @brief Modification of fastNlMeansDenoisingMulti function for colored images sequences
+
+@param srcImgs Input 8-bit 3-channel images sequence. All images should have the same type and
+size.
+@param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
+@param temporalWindowSize Number of surrounding images to use for target image denoising. Should
+be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
+imgToDenoiseIndex + temporalWindowSize / 2 from srcImgs will be used to denoise
+srcImgs[imgToDenoiseIndex] image.
+@param dst Output image with the same size and type as srcImgs images.
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: greater searchWindowSize - greater
+denoising time. Recommended value 21 pixels
+@param h Parameter regulating filter strength for luminance component. Bigger h value perfectly
+removes noise but also removes image details, smaller h value preserves details but also preserves
+some noise.
+@param hColor The same as h but for color components.
+
+The function converts images to CIELAB colorspace and then separately denoise L and AB components
+with given h parameters using fastNlMeansDenoisingMulti function.
+ */
+CV_EXPORTS_W void fastNlMeansDenoisingColoredMulti( InputArrayOfArrays srcImgs, OutputArray dst,
+        int imgToDenoiseIndex, int temporalWindowSize,
+        float h = 3, float hColor = 3,
+        int templateWindowSize = 7, int searchWindowSize = 21);
+
+/** @brief Primal-dual algorithm is an algorithm for solving special types of variational problems (that is,
+finding a function to minimize some functional). As the image denoising, in particular, may be seen
+as the variational problem, primal-dual algorithm then can be used to perform denoising and this is
+exactly what is implemented.
+
+It should be noted that this implementation was taken from the July 2013 blog entry
+@cite MA13 , which also contained (slightly more general) ready-to-use source code in Python.
+Subsequently, that code was rewritten in C++ using OpenCV by Vadim Pisarevsky at the end
+of July 2013 and finally it was slightly adapted by later authors.
+
+Although the thorough discussion and justification of the algorithm involved may be found in
+@cite ChambolleEtAl, it might make sense to skim over it here, following @cite MA13 . To begin
+with, we consider the 1-byte gray-level images as the functions from the rectangular domain of
+pixels (it may be seen as set
+\f$\left\{(x,y)\in\mathbb{N}\times\mathbb{N}\mid 1\leq x\leq n,\;1\leq y\leq m\right\}\f$ for some
+\f$m,\;n\in\mathbb{N}\f$) into \f$\{0,1,\dots,255\}\f$. We shall denote the noised images as \f$f_i\f$ and with
+this view, given some image \f$x\f$ of the same size, we may measure how bad it is by the formula
+
+\f[\left\|\left\|\nabla x\right\|\right\| + \lambda\sum_i\left\|\left\|x-f_i\right\|\right\|\f]
+
+\f$\|\|\cdot\|\|\f$ here denotes \f$L_2\f$-norm and as you see, the first addend states that we want our
+image to be smooth (ideally, having zero gradient, thus being constant) and the second states that
+we want our result to be close to the observations we've got. If we treat \f$x\f$ as a function, this is
+exactly the functional that we seek to minimize, and this is where the Primal-Dual algorithm comes into play.
+
+@param observations This array should contain one or more noised versions of the image that is to
+be restored.
+@param result Here the denoised image will be stored. There is no need to do pre-allocation of
+storage space, as it will be automatically allocated, if necessary.
+@param lambda Corresponds to \f$\lambda\f$ in the formulas above. As it is enlarged, results that stay
+close to the observations (detailed, but maybe more noised) are treated more favorably than smooth
+(blurred) ones. Roughly speaking, as it becomes smaller, the result will be more blurred but more
+severe outliers will be removed.
+@param niters Number of iterations that the algorithm will run. Of course, the more iterations the
+better, but it is hard to quantitatively refine this statement, so just use the default and
+increase it if the results are poor.
+ */
+CV_EXPORTS_W void denoise_TVL1(const std::vector<Mat>& observations,Mat& result, double lambda=1.0, int niters=30);
+
+//! @} photo_denoise
+
+//! @addtogroup photo_hdr
+//! @{
+
+enum { LDR_SIZE = 256 };
+
+/** @brief Base class for tonemapping algorithms - tools that are used to map HDR image to 8-bit range.
+ */
+class CV_EXPORTS_W Tonemap : public Algorithm
+{
+public:
+    /** @brief Tonemaps image
+
+    @param src source image - CV_32FC3 Mat (float 32 bits 3 channels)
+    @param dst destination image - CV_32FC3 Mat with values in [0, 1] range
+     */
+    CV_WRAP virtual void process(InputArray src, OutputArray dst) = 0;
+
+    CV_WRAP virtual float getGamma() const = 0; //!< See the gamma parameter of createTonemap.
+    CV_WRAP virtual void setGamma(float gamma) = 0; //!< See the gamma parameter of createTonemap.
+};
+
+/** @brief Creates simple linear mapper with gamma correction
+
+@param gamma positive value for gamma correction. Gamma value of 1.0 implies no correction, gamma
+equal to 2.2f is suitable for most displays.
+Generally gamma \> 1 brightens the image and gamma \< 1 darkens it.
+ */
+CV_EXPORTS_W Ptr<Tonemap> createTonemap(float gamma = 1.0f);
+
+/** @brief Adaptive logarithmic mapping is a fast global tonemapping algorithm that scales the image in
+logarithmic domain.
+
+Since it's a global operator the same function is applied to all the pixels, it is controlled by the
+bias parameter.
+
+Optional saturation enhancement is possible as described in @cite FL02 .
+
+For more information see @cite DM03 .
+ */
+class CV_EXPORTS_W TonemapDrago : public Tonemap
+{
+public:
+
+    CV_WRAP virtual float getSaturation() const = 0; //!< See the saturation parameter of createTonemapDrago.
+    CV_WRAP virtual void setSaturation(float saturation) = 0; //!< See the saturation parameter of createTonemapDrago.
+
+    CV_WRAP virtual float getBias() const = 0; //!< See the bias parameter of createTonemapDrago.
+    CV_WRAP virtual void setBias(float bias) = 0; //!< See the bias parameter of createTonemapDrago.
+};
+
+/** @brief Creates TonemapDrago object
+
+@param gamma gamma value for gamma correction. See createTonemap
+@param saturation positive saturation enhancement value. 1.0 preserves saturation, values greater
+than 1 increase saturation and values less than 1 decrease it.
+@param bias value for bias function in [0, 1] range. Values from 0.7 to 0.9 usually give best
+results, default value is 0.85.
+ */
+CV_EXPORTS_W Ptr<TonemapDrago> createTonemapDrago(float gamma = 1.0f, float saturation = 1.0f, float bias = 0.85f);
+
+
+/** @brief This is a global tonemapping operator that models human visual system.
+
+Mapping function is controlled by adaptation parameter, that is computed using light adaptation and
+color adaptation.
+
+For more information see @cite RD05 .
+ */
+class CV_EXPORTS_W TonemapReinhard : public Tonemap
+{
+public:
+    CV_WRAP virtual float getIntensity() const = 0; //!< See the intensity parameter of createTonemapReinhard.
+    CV_WRAP virtual void setIntensity(float intensity) = 0; //!< See the intensity parameter of createTonemapReinhard.
+
+    CV_WRAP virtual float getLightAdaptation() const = 0; //!< See the light_adapt parameter of createTonemapReinhard.
+    CV_WRAP virtual void setLightAdaptation(float light_adapt) = 0; //!< See the light_adapt parameter of createTonemapReinhard.
+
+    CV_WRAP virtual float getColorAdaptation() const = 0; //!< See the color_adapt parameter of createTonemapReinhard.
+    CV_WRAP virtual void setColorAdaptation(float color_adapt) = 0; //!< See the color_adapt parameter of createTonemapReinhard.
+};
+
+/** @brief Creates TonemapReinhard object
+
+@param gamma gamma value for gamma correction. See createTonemap
+@param intensity result intensity in [-8, 8] range. Greater intensity produces brighter results.
+@param light_adapt light adaptation in [0, 1] range. If 1 adaptation is based only on pixel
+value, if 0 it's global, otherwise it's a weighted mean of these two cases.
+@param color_adapt chromatic adaptation in [0, 1] range. If 1 channels are treated independently,
+if 0 adaptation level is the same for each channel.
+ */
+CV_EXPORTS_W Ptr<TonemapReinhard>
+createTonemapReinhard(float gamma = 1.0f, float intensity = 0.0f, float light_adapt = 1.0f, float color_adapt = 0.0f);
+
+/** @brief This algorithm transforms image to contrast using gradients on all levels of gaussian pyramid,
+transforms contrast values to HVS response and scales the response. After this the image is
+reconstructed from new contrast values.
+
+For more information see @cite MM06 .
+ */
+class CV_EXPORTS_W TonemapMantiuk : public Tonemap
+{
+public:
+    CV_WRAP virtual float getScale() const = 0; //!< See the scale parameter of createTonemapMantiuk.
+    CV_WRAP virtual void setScale(float scale) = 0; //!< See the scale parameter of createTonemapMantiuk.
+
+    CV_WRAP virtual float getSaturation() const = 0; //!< See the saturation parameter of createTonemapMantiuk.
+    CV_WRAP virtual void setSaturation(float saturation) = 0; //!< See the saturation parameter of createTonemapMantiuk.
+};
+
+/** @brief Creates TonemapMantiuk object
+
+@param gamma gamma value for gamma correction. See createTonemap
+@param scale contrast scale factor. HVS response is multiplied by this parameter, thus compressing
+dynamic range. Values from 0.6 to 0.9 produce best results.
+@param saturation saturation enhancement value. See createTonemapDrago
+ */
+CV_EXPORTS_W Ptr<TonemapMantiuk>
+createTonemapMantiuk(float gamma = 1.0f, float scale = 0.7f, float saturation = 1.0f);
+
+/** @brief The base class for algorithms that align images of the same scene with different exposures
+ */
+class CV_EXPORTS_W AlignExposures : public Algorithm
+{
+public:
+    /** @brief Aligns images
+
+    @param src vector of input images
+    @param dst vector of aligned images
+    @param times vector of exposure time values for each image
+    @param response 256x1 matrix with inverse camera response function for each pixel value, it should
+    have the same number of channels as images.
+     */
+    CV_WRAP virtual void process(InputArrayOfArrays src, std::vector<Mat>& dst,
+                                 InputArray times, InputArray response) = 0;
+};
+
+/** @brief This algorithm converts images to median threshold bitmaps (1 for pixels brighter than median
+luminance and 0 otherwise) and then aligns the resulting bitmaps using bit operations.
+
+It is invariant to exposure, so exposure values and camera response are not necessary.
+
+In this implementation new image regions are filled with zeros.
+
+For more information see @cite GW03 .
+ */
+class CV_EXPORTS_W AlignMTB : public AlignExposures
+{
+public:
+    CV_WRAP virtual void process(InputArrayOfArrays src, std::vector<Mat>& dst,
+                                 InputArray times, InputArray response) CV_OVERRIDE = 0;
+
+    /** @brief Short version of process that doesn't take extra arguments.
+
+    @param src vector of input images
+    @param dst vector of aligned images
+     */
+    CV_WRAP virtual void process(InputArrayOfArrays src, std::vector<Mat>& dst) = 0;
+
+    /** @brief Calculates shift between two images, i.e. how to shift the second image so that it corresponds to the
+    first.
+
+    @param img0 first image
+    @param img1 second image
+     */
+    CV_WRAP virtual Point calculateShift(InputArray img0, InputArray img1) = 0;
+    /** @brief Helper function that shifts a Mat, filling new regions with zeros.
+
+    @param src input image
+    @param dst result image
+    @param shift shift value
+     */
+    CV_WRAP virtual void shiftMat(InputArray src, OutputArray dst, const Point shift) = 0;
+    /** @brief Computes median threshold and exclude bitmaps of given image.
+
+    @param img input image
+    @param tb median threshold bitmap
+    @param eb exclude bitmap
+     */
+    CV_WRAP virtual void computeBitmaps(InputArray img, OutputArray tb, OutputArray eb) = 0;
+
+    CV_WRAP virtual int getMaxBits() const = 0; //!< See the max_bits parameter of createAlignMTB.
+    CV_WRAP virtual void setMaxBits(int max_bits) = 0; //!< See the max_bits parameter of createAlignMTB.
+
+    CV_WRAP virtual int getExcludeRange() const = 0; //!< See the exclude_range parameter of createAlignMTB.
+    CV_WRAP virtual void setExcludeRange(int exclude_range) = 0; //!< See the exclude_range parameter of createAlignMTB.
+
+    CV_WRAP virtual bool getCut() const = 0; //!< See the cut parameter of createAlignMTB.
+    CV_WRAP virtual void setCut(bool value) = 0; //!< See the cut parameter of createAlignMTB.
+};
+
+/** @brief Creates AlignMTB object
+
+@param max_bits logarithm to the base 2 of maximal shift in each dimension. Values of 5 and 6 are
+usually good enough (31 and 63 pixels shift respectively).
+@param exclude_range range for exclusion bitmap that is constructed to suppress noise around the
+median value.
+@param cut if true cuts images, otherwise fills the new regions with zeros.
+ */
+CV_EXPORTS_W Ptr<AlignMTB> createAlignMTB(int max_bits = 6, int exclude_range = 4, bool cut = true);
+
+/** @brief The base class for camera response calibration algorithms.
+ */
+class CV_EXPORTS_W CalibrateCRF : public Algorithm
+{
+public:
+    /** @brief Recovers inverse camera response.
+
+    @param src vector of input images
+    @param dst 256x1 matrix with inverse camera response function
+    @param times vector of exposure time values for each image
+     */
+    CV_WRAP virtual void process(InputArrayOfArrays src, OutputArray dst, InputArray times) = 0;
+};
+
+/** @brief Inverse camera response function is extracted for each brightness value by minimizing an objective
+function as linear system. Objective function is constructed using pixel values on the same position
+in all images, extra term is added to make the result smoother.
+
+For more information see @cite DM97 .
+ */
+class CV_EXPORTS_W CalibrateDebevec : public CalibrateCRF
+{
+public:
+    CV_WRAP virtual float getLambda() const = 0; //!< See the lambda parameter of createCalibrateDebevec.
+    CV_WRAP virtual void setLambda(float lambda) = 0; //!< See the lambda parameter of createCalibrateDebevec.
+
+    CV_WRAP virtual int getSamples() const = 0; //!< See the samples parameter of createCalibrateDebevec.
+    CV_WRAP virtual void setSamples(int samples) = 0; //!< See the samples parameter of createCalibrateDebevec.
+
+    CV_WRAP virtual bool getRandom() const = 0; //!< See the random parameter of createCalibrateDebevec.
+    CV_WRAP virtual void setRandom(bool random) = 0; //!< See the random parameter of createCalibrateDebevec.
+};
+
+/** @brief Creates CalibrateDebevec object
+
+@param samples number of pixel locations to use
+@param lambda smoothness term weight. Greater values produce smoother results, but can alter the
+response.
+@param random if true sample pixel locations are chosen at random, otherwise they form a
+rectangular grid.
+ */
+CV_EXPORTS_W Ptr<CalibrateDebevec> createCalibrateDebevec(int samples = 70, float lambda = 10.0f, bool random = false);
+
+/** @brief Inverse camera response function is extracted for each brightness value by minimizing an objective
+function as linear system. This algorithm uses all image pixels.
+
+For more information see @cite RB99 .
+ */
+class CV_EXPORTS_W CalibrateRobertson : public CalibrateCRF
+{
+public:
+    CV_WRAP virtual int getMaxIter() const = 0; //!< See the max_iter parameter of createCalibrateRobertson.
+    CV_WRAP virtual void setMaxIter(int max_iter) = 0; //!< See the max_iter parameter of createCalibrateRobertson.
+
+    CV_WRAP virtual float getThreshold() const = 0; //!< See the threshold parameter of createCalibrateRobertson.
+    CV_WRAP virtual void setThreshold(float threshold) = 0; //!< See the threshold parameter of createCalibrateRobertson.
+
+    CV_WRAP virtual Mat getRadiance() const = 0;
+};
+
+/** @brief Creates CalibrateRobertson object
+
+@param max_iter maximal number of Gauss-Seidel solver iterations.
+@param threshold target difference between results of two successive steps of the minimization.
+ */
+CV_EXPORTS_W Ptr<CalibrateRobertson> createCalibrateRobertson(int max_iter = 30, float threshold = 0.01f);
+
+/** @brief The base class for algorithms that can merge an exposure sequence into a single image.
+ */
+class CV_EXPORTS_W MergeExposures : public Algorithm
+{
+public:
+    /** @brief Merges images.
+
+    @param src vector of input images
+    @param dst result image
+    @param times vector of exposure time values for each image
+    @param response 256x1 matrix with inverse camera response function for each pixel value, it should
+    have the same number of channels as images.
+     */
+    CV_WRAP virtual void process(InputArrayOfArrays src, OutputArray dst,
+                                 InputArray times, InputArray response) = 0;
+};
+
+/** @brief The resulting HDR image is calculated as weighted average of the exposures considering exposure
+values and camera response.
+
+For more information see @cite DM97 .
+ */
+class CV_EXPORTS_W MergeDebevec : public MergeExposures
+{
+public:
+    CV_WRAP virtual void process(InputArrayOfArrays src, OutputArray dst,
+                                 InputArray times, InputArray response) CV_OVERRIDE = 0; //!< Overrides MergeExposures::process.
+    CV_WRAP virtual void process(InputArrayOfArrays src, OutputArray dst, InputArray times) = 0; //!< Overload without the response argument.
+};
+
+/** @brief Creates MergeDebevec object
+ */
+CV_EXPORTS_W Ptr<MergeDebevec> createMergeDebevec();
+
+/** @brief Pixels are weighted using contrast, saturation and well-exposedness measures, then images are
+combined using laplacian pyramids.
+
+The resulting image weight is constructed as weighted average of contrast, saturation and
+well-exposedness measures.
+
+The resulting image doesn't require tonemapping and can be converted to 8-bit image by multiplying
+by 255, but it's recommended to apply gamma correction and/or linear tonemapping.
+
+For more information see @cite MK07 .
+ */
+class CV_EXPORTS_W MergeMertens : public MergeExposures
+{
+public:
+    CV_WRAP virtual void process(InputArrayOfArrays src, OutputArray dst,
+                                 InputArray times, InputArray response) CV_OVERRIDE = 0;
+    /** @brief Short version of process that doesn't take extra arguments.
+
+    @param src vector of input images
+    @param dst result image
+     */
+    CV_WRAP virtual void process(InputArrayOfArrays src, OutputArray dst) = 0;
+
+    CV_WRAP virtual float getContrastWeight() const = 0; //!< See the contrast_weight parameter of createMergeMertens.
+    CV_WRAP virtual void setContrastWeight(float contrast_weiht) = 0; //!< See the contrast_weight parameter of createMergeMertens.
+
+    CV_WRAP virtual float getSaturationWeight() const = 0; //!< See the saturation_weight parameter of createMergeMertens.
+    CV_WRAP virtual void setSaturationWeight(float saturation_weight) = 0; //!< See the saturation_weight parameter of createMergeMertens.
+
+    CV_WRAP virtual float getExposureWeight() const = 0; //!< See the exposure_weight parameter of createMergeMertens.
+    CV_WRAP virtual void setExposureWeight(float exposure_weight) = 0; //!< See the exposure_weight parameter of createMergeMertens.
+};
+
+/** @brief Creates MergeMertens object
+
+@param contrast_weight contrast measure weight. See MergeMertens.
+@param saturation_weight saturation measure weight
+@param exposure_weight well-exposedness measure weight
+ */
+CV_EXPORTS_W Ptr<MergeMertens>
+createMergeMertens(float contrast_weight = 1.0f, float saturation_weight = 1.0f, float exposure_weight = 0.0f);
+
+/** @brief The resulting HDR image is calculated as weighted average of the exposures considering exposure
+values and camera response.
+
+For more information see @cite RB99 .
+ */
+class CV_EXPORTS_W MergeRobertson : public MergeExposures
+{
+public:
+    CV_WRAP virtual void process(InputArrayOfArrays src, OutputArray dst,
+                                 InputArray times, InputArray response) CV_OVERRIDE = 0; //!< Overrides MergeExposures::process.
+    CV_WRAP virtual void process(InputArrayOfArrays src, OutputArray dst, InputArray times) = 0; //!< Overload without the response argument.
+};
+
+/** @brief Creates MergeRobertson object
+ */
+CV_EXPORTS_W Ptr<MergeRobertson> createMergeRobertson();
+
+//! @} photo_hdr
+
+//! @addtogroup photo_decolor
+//! @{
+
+/** @brief Transforms a color image to a grayscale image. It is a basic tool in digital printing, stylized
+black-and-white photograph rendering, and in many single channel image processing applications
+@cite CL12 .
+
+@param src Input 8-bit 3-channel image.
+@param grayscale Output 8-bit 1-channel image.
+@param color_boost Output 8-bit 3-channel image.
+
+This function is to be applied on color images.
+ */
+CV_EXPORTS_W void decolor( InputArray src, OutputArray grayscale, OutputArray color_boost);
+
+//! @} photo_decolor
+
+//! @addtogroup photo_clone
+//! @{
+
+
+//! seamlessClone algorithm flags
+enum
+{
+    /** The power of the method is fully expressed when inserting objects with complex outlines into a new background*/
+    NORMAL_CLONE = 1,
+    /** The classic method, color-based selection and alpha masking might be time consuming and often leaves an undesirable
+    halo. Seamless cloning, even averaged with the original image, is not effective. Mixed seamless cloning based on a loose selection proves effective.*/
+    MIXED_CLONE  = 2,
+    /** Monochrome transfer allows the user to easily replace certain features of one object by alternative features.*/
+    MONOCHROME_TRANSFER = 3};
+
+
+/** @example samples/cpp/tutorial_code/photo/seamless_cloning/cloning_demo.cpp
+An example using seamlessClone function
+*/
+/** @brief Image editing tasks concern either global changes (color/intensity corrections, filters,
+deformations) or local changes concerned to a selection. Here we are interested in achieving local
+changes, ones that are restricted to a region manually selected (ROI), in a seamless and effortless
+manner. The extent of the changes ranges from slight distortions to complete replacement by novel
+content @cite PM03 .
+
+@param src Input 8-bit 3-channel image.
+@param dst Input 8-bit 3-channel image.
+@param mask Input 8-bit 1 or 3-channel image.
+@param p Point in dst image where object is placed.
+@param blend Output image with the same size and type as dst.
+@param flags Cloning method that could be cv::NORMAL_CLONE, cv::MIXED_CLONE or cv::MONOCHROME_TRANSFER
+ */
+CV_EXPORTS_W void seamlessClone( InputArray src, InputArray dst, InputArray mask, Point p,
+        OutputArray blend, int flags);
+
+/** @brief Given an original color image, two differently colored versions of this image can be mixed
+seamlessly.
+
+@param src Input 8-bit 3-channel image.
+@param mask Input 8-bit 1 or 3-channel image.
+@param dst Output image with the same size and type as src .
+@param red_mul R-channel multiply factor.
+@param green_mul G-channel multiply factor.
+@param blue_mul B-channel multiply factor.
+
+Multiplication factor is between 0.5 and 2.5.
+ */
+CV_EXPORTS_W void colorChange(InputArray src, InputArray mask, OutputArray dst, float red_mul = 1.0f,
+        float green_mul = 1.0f, float blue_mul = 1.0f);
+
+/** @brief Applying an appropriate non-linear transformation to the gradient field inside the selection and
+then integrating back with a Poisson solver, modifies locally the apparent illumination of an image.
+
+@param src Input 8-bit 3-channel image.
+@param mask Input 8-bit 1 or 3-channel image.
+@param dst Output image with the same size and type as src.
+@param alpha Value ranges between 0-2.
+@param beta Value ranges between 0-2.
+
+This is useful to highlight under-exposed foreground objects or to reduce specular reflections.
+ */
+CV_EXPORTS_W void illuminationChange(InputArray src, InputArray mask, OutputArray dst,
+        float alpha = 0.2f, float beta = 0.4f);
+
+/** @brief By retaining only the gradients at edge locations, before integrating with the Poisson solver, one
+washes out the texture of the selected region, giving its contents a flat aspect. Here Canny Edge %Detector is used.
+
+@param src Input 8-bit 3-channel image.
+@param mask Input 8-bit 1 or 3-channel image.
+@param dst Output image with the same size and type as src.
+@param low_threshold %Range from 0 to 100.
+@param high_threshold Value \> 100.
+@param kernel_size The size of the Sobel kernel to be used.
+
+@note
+The algorithm assumes that the color of the source image is close to that of the destination. This
+assumption means that when the colors don't match, the source image color gets tinted toward the
+color of the destination image.
+ */
+CV_EXPORTS_W void textureFlattening(InputArray src, InputArray mask, OutputArray dst,
+        float low_threshold = 30, float high_threshold = 45,
+        int kernel_size = 3);
+
+//! @} photo_clone
+
+//! @addtogroup photo_render
+//! @{
+
+//! Edge preserving filters
+enum
+{
+    RECURS_FILTER = 1, //!< Recursive Filtering
+    NORMCONV_FILTER = 2 //!< Normalized Convolution Filtering
+};
+
+/** @brief Filtering is the fundamental operation in image and video processing. Edge-preserving smoothing
+filters are used in many different applications @cite EM11 .
+
+@param src Input 8-bit 3-channel image.
+@param dst Output 8-bit 3-channel image.
+@param flags Edge preserving filters: cv::RECURS_FILTER or cv::NORMCONV_FILTER
+@param sigma_s %Range between 0 to 200.
+@param sigma_r %Range between 0 to 1.
+ */
+CV_EXPORTS_W void edgePreservingFilter(InputArray src, OutputArray dst, int flags = 1,
+        float sigma_s = 60, float sigma_r = 0.4f);
+
+/** @brief This filter enhances the details of a particular image.
+
+@param src Input 8-bit 3-channel image.
+@param dst Output image with the same size and type as src.
+@param sigma_s %Range between 0 to 200.
+@param sigma_r %Range between 0 to 1.
+ */
+CV_EXPORTS_W void detailEnhance(InputArray src, OutputArray dst, float sigma_s = 10,
+        float sigma_r = 0.15f);
+
+/** @example samples/cpp/tutorial_code/photo/non_photorealistic_rendering/npr_demo.cpp
+An example using non-photorealistic line drawing functions
+*/
+/** @brief Pencil-like non-photorealistic line drawing
+
+@param src Input 8-bit 3-channel image.
+@param dst1 Output 8-bit 1-channel image.
+@param dst2 Output image with the same size and type as src.
+@param sigma_s %Range between 0 to 200.
+@param sigma_r %Range between 0 to 1.
+@param shade_factor %Range between 0 to 0.1.
+ */
+CV_EXPORTS_W void pencilSketch(InputArray src, OutputArray dst1, OutputArray dst2,
+        float sigma_s = 60, float sigma_r = 0.07f, float shade_factor = 0.02f);
+
+/** @brief Stylization aims to produce digital imagery with a wide variety of effects not focused on
+photorealism. Edge-aware filters are ideal for stylization, as they can abstract regions of low
+contrast while preserving, or enhancing, high-contrast features.
+
+@param src Input 8-bit 3-channel image.
+@param dst Output image with the same size and type as src.
+@param sigma_s %Range between 0 to 200.
+@param sigma_r %Range between 0 to 1.
+ */
+CV_EXPORTS_W void stylization(InputArray src, OutputArray dst, float sigma_s = 60,
+        float sigma_r = 0.45f);
+
+//! @} photo_render
+
+//! @} photo
+
+} // cv
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/photo/cuda.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/photo/cuda.hpp
new file mode 100644
index 000000000000..709ad2d26f9d
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/photo/cuda.hpp
@@ -0,0 +1,157 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_PHOTO_CUDA_HPP
+#define OPENCV_PHOTO_CUDA_HPP
+
+#include "opencv2/core/cuda.hpp"
+
+namespace cv { namespace cuda {
+
+//! @addtogroup photo_denoise
+//! @{
+
+/** @brief Performs pure non local means denoising without any simplification, and thus it is not fast.
+
+@param src Source image. Supports only CV_8UC1, CV_8UC2 and CV_8UC3.
+@param dst Destination image.
+@param h Filter sigma regulating filter strength for color.
+@param search_window Size of search window.
+@param block_size Size of block used for computing weights.
+@param borderMode Border type. See borderInterpolate for details. BORDER_REFLECT101 ,
+BORDER_REPLICATE , BORDER_CONSTANT , BORDER_REFLECT and BORDER_WRAP are supported for now.
+@param stream Stream for the asynchronous version.
+
+@sa
+   fastNlMeansDenoising
+ */
+CV_EXPORTS void nonLocalMeans(InputArray src, OutputArray dst,
+                            float h,
+                            int search_window = 21,
+                            int block_size = 7,
+                            int borderMode = BORDER_DEFAULT,
+                            Stream& stream = Stream::Null());
+CV_WRAP inline void nonLocalMeans(const GpuMat& src, CV_OUT GpuMat& dst,
+                            float h,
+                            int search_window = 21,
+                            int block_size = 7,
+                            int borderMode = BORDER_DEFAULT,
+                            Stream& stream = Stream::Null())
+{ // GpuMat overload: wraps src/dst and forwards every argument to the InputArray/OutputArray overload.
+    nonLocalMeans(InputArray(src), OutputArray(dst), h, search_window, block_size, borderMode, stream);
+}
+
+/** @brief Perform image denoising using Non-local Means Denoising algorithm
+<http://www.ipol.im/pub/algo/bcm_non_local_means_denoising> with several computational
+optimizations. Noise is expected to be Gaussian white noise.
+
+@param src Input 8-bit 1-channel, 2-channel or 3-channel image.
+@param dst Output image with the same size and type as src .
+@param h Parameter regulating filter strength. Big h value perfectly removes noise but also
+removes image details, smaller h value preserves details but also preserves some noise
+@param search_window Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affect performance linearly: greater search_window - greater
+denoising time. Recommended value 21 pixels
+@param block_size Size in pixels of the template patch that is used to compute weights. Should be
+odd. Recommended value 7 pixels
+@param stream Stream for the asynchronous invocations.
+
+This function is expected to be applied to grayscale images. For colored images look at
+fastNlMeansDenoisingColored.
+
+@sa
+   fastNlMeansDenoising
+ */
+CV_EXPORTS void fastNlMeansDenoising(InputArray src, OutputArray dst,
+                                    float h,
+                                    int search_window = 21,
+                                    int block_size = 7,
+                                    Stream& stream = Stream::Null());
+CV_WRAP inline void fastNlMeansDenoising(const GpuMat& src, CV_OUT GpuMat& dst,
+                                    float h,
+                                    int search_window = 21,
+                                    int block_size = 7,
+                                    Stream& stream = Stream::Null())
+{ // GpuMat overload: wraps src/dst and forwards every argument to the InputArray/OutputArray overload.
+    fastNlMeansDenoising(InputArray(src), OutputArray(dst), h, search_window, block_size, stream);
+}
+
+/** @brief Modification of fastNlMeansDenoising function for colored images
+
+@param src Input 8-bit 3-channel image.
+@param dst Output image with the same size and type as src .
+@param h_luminance Parameter regulating filter strength. Big h value perfectly removes noise but
+also removes image details, smaller h value preserves details but also preserves some noise
+@param photo_render The same as h_luminance but for color components. For most images a value of 10
+will be enough to remove colored noise without distorting colors
+@param search_window Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affect performance linearly: greater search_window - greater
+denoising time. Recommended value 21 pixels
+@param block_size Size in pixels of the template patch that is used to compute weights. Should be
+odd. Recommended value 7 pixels
+@param stream Stream for the asynchronous invocations.
+
+The function converts image to CIELAB colorspace and then separately denoises L and AB components
+with given h parameters using the fastNlMeansDenoising function.
+
+@sa
+   fastNlMeansDenoisingColored
+ */
+CV_EXPORTS void fastNlMeansDenoisingColored(InputArray src, OutputArray dst,
+                                            float h_luminance, float photo_render,
+                                            int search_window = 21,
+                                            int block_size = 7,
+                                            Stream& stream = Stream::Null());
+CV_WRAP inline void fastNlMeansDenoisingColored(const GpuMat& src, CV_OUT GpuMat& dst,
+                                            float h_luminance, float photo_render,
+                                            int search_window = 21,
+                                            int block_size = 7,
+                                            Stream& stream = Stream::Null())
+{ // GpuMat overload: wraps src/dst and forwards every argument to the InputArray/OutputArray overload.
+    fastNlMeansDenoisingColored(InputArray(src), OutputArray(dst), h_luminance, photo_render, search_window, block_size, stream);
+}
+
+//! @} photo_denoise
+
+}} // namespace cv { namespace cuda {
+
+#endif /* OPENCV_PHOTO_CUDA_HPP */
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/photo/legacy/constants_c.h b/3rdparty/opencv/opencv410/build/include/opencv2/photo/legacy/constants_c.h
new file mode 100644
index 000000000000..ec1d4403fde3
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/photo/legacy/constants_c.h
@@ -0,0 +1,14 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_PHOTO_LEGACY_CONSTANTS_H
+#define OPENCV_PHOTO_LEGACY_CONSTANTS_H
+
+enum InpaintingModes // legacy C-style constants; the numeric values are spelled out explicitly below
+{
+    CV_INPAINT_NS      =0, // NOTE(review): presumably mirrors cv::INPAINT_NS - confirm
+    CV_INPAINT_TELEA   =1  // NOTE(review): presumably mirrors cv::INPAINT_TELEA - confirm
+};
+
+#endif // OPENCV_PHOTO_LEGACY_CONSTANTS_H
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/photo/photo.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/photo/photo.hpp
new file mode 100644
index 000000000000..8af5e9f0fae2
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/photo/photo.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/photo.hpp"
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/stitching.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/stitching.hpp
new file mode 100644
index 000000000000..8a8122312419
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/stitching.hpp
@@ -0,0 +1,365 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_STITCHING_STITCHER_HPP
+#define OPENCV_STITCHING_STITCHER_HPP
+
+#include "opencv2/core.hpp"
+#include "opencv2/features2d.hpp"
+#include "opencv2/stitching/warpers.hpp"
+#include "opencv2/stitching/detail/matchers.hpp"
+#include "opencv2/stitching/detail/motion_estimators.hpp"
+#include "opencv2/stitching/detail/exposure_compensate.hpp"
+#include "opencv2/stitching/detail/seam_finders.hpp"
+#include "opencv2/stitching/detail/blenders.hpp"
+#include "opencv2/stitching/detail/camera.hpp"
+
+
+#if defined(Status)
+#  warning Detected X11 'Status' macro definition, it can cause build conflicts. Please, include this header before any X11 headers.
+#endif
+
+
+/**
+@defgroup stitching Images stitching
+
+This figure illustrates the stitching module pipeline implemented in the Stitcher class. Using that
+class it's possible to configure/remove some steps, i.e. adjust the stitching pipeline according to
+the particular needs. All building blocks from the pipeline are available in the detail namespace,
+one can combine and use them separately.
+
+The implemented stitching pipeline is very similar to the one proposed in @cite BL07 .
+
+![stitching pipeline](StitchingPipeline.jpg)
+
+Camera models
+-------------
+
+There are currently 2 camera models implemented in the stitching pipeline.
+
+- _Homography model_ expecting perspective transformations between images
+  implemented in @ref cv::detail::BestOf2NearestMatcher cv::detail::HomographyBasedEstimator
+  cv::detail::BundleAdjusterReproj cv::detail::BundleAdjusterRay
+- _Affine model_ expecting affine transformation with 6 DOF or 4 DOF implemented in
+  @ref cv::detail::AffineBestOf2NearestMatcher cv::detail::AffineBasedEstimator
+  cv::detail::BundleAdjusterAffine cv::detail::BundleAdjusterAffinePartial cv::AffineWarper
+
+The homography model is useful for creating photo panoramas captured by a camera,
+while the affine-based model can be used to stitch scans and objects captured by
+specialized devices. Use @ref cv::Stitcher::create to get a preconfigured pipeline for one
+of those models.
+
+@note
+Certain detailed settings of @ref cv::Stitcher might not make sense. In particular,
+you should not mix classes implementing the affine model with classes implementing
+the homography model, as they work with different transformations.
+
+@{
+    @defgroup stitching_match Features Finding and Images Matching
+    @defgroup stitching_rotation Rotation Estimation
+    @defgroup stitching_autocalib Autocalibration
+    @defgroup stitching_warp Images Warping
+    @defgroup stitching_seam Seam Estimation
+    @defgroup stitching_exposure Exposure Compensation
+    @defgroup stitching_blend Image Blenders
+@}
+  */
+
+namespace cv {
+
+//! @addtogroup stitching
+//! @{
+
+/** @example samples/cpp/stitching.cpp
+A basic example on image stitching
+*/
+
+/** @example samples/python/stitching.py
+A basic example on image stitching in Python.
+*/
+
+/** @example samples/cpp/stitching_detailed.cpp
+A detailed example on image stitching
+*/
+
+/** @brief High level image stitcher.
+
+It's possible to use this class without being aware of the entire stitching pipeline. However, to
+achieve higher stitching stability and better quality of the final images, at least some
+familiarity with the theory is recommended.
+
+@note
+-   A basic example on image stitching can be found at
+    opencv_source_code/samples/cpp/stitching.cpp
+-   A basic example on image stitching in Python can be found at
+    opencv_source_code/samples/python/stitching.py
+-   A detailed example on image stitching can be found at
+    opencv_source_code/samples/cpp/stitching_detailed.cpp
+ */
+class CV_EXPORTS_W Stitcher
+{
+public:
+    /**
+     * When setting a resolution for stitching, this value is a placeholder
+     * for preserving the original resolution.
+     */
+#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/)
+    static constexpr double ORIG_RESOL = -1.0;
+#else
+    // support MSVS 2013
+    static const double ORIG_RESOL; // Initialized in stitcher.cpp
+#endif
+
+    enum Status
+    {
+        OK = 0,
+        ERR_NEED_MORE_IMGS = 1,
+        ERR_HOMOGRAPHY_EST_FAIL = 2,
+        ERR_CAMERA_PARAMS_ADJUST_FAIL = 3
+    };
+
+    enum Mode
+    {
+        /** Mode for creating photo panoramas. Expects images under perspective
+        transformation and projects resulting pano to sphere.
+
+        @sa detail::BestOf2NearestMatcher SphericalWarper
+        */
+        PANORAMA = 0,
+        /** Mode for composing scans. Expects images under affine transformation and does
+        not compensate exposure by default.
+
+        @sa detail::AffineBestOf2NearestMatcher AffineWarper
+        */
+        SCANS = 1,
+
+    };
+
+    /** @brief Creates a Stitcher configured in one of the stitching modes.
+
+    @param mode Scenario for stitcher operation. This is usually determined by source of images
+    to stitch and their transformation. Default parameters will be chosen for operation in given
+    scenario.
+    @return Stitcher class instance.
+     */
+    CV_WRAP static Ptr<Stitcher> create(Mode mode = Stitcher::PANORAMA);
+
+    CV_WRAP double registrationResol() const { return registr_resol_; }
+    CV_WRAP void setRegistrationResol(double resol_mpx) { registr_resol_ = resol_mpx; }
+
+    CV_WRAP double seamEstimationResol() const { return seam_est_resol_; }
+    CV_WRAP void setSeamEstimationResol(double resol_mpx) { seam_est_resol_ = resol_mpx; }
+
+    CV_WRAP double compositingResol() const { return compose_resol_; }
+    CV_WRAP void setCompositingResol(double resol_mpx) { compose_resol_ = resol_mpx; }
+
+    CV_WRAP double panoConfidenceThresh() const { return conf_thresh_; }
+    CV_WRAP void setPanoConfidenceThresh(double conf_thresh) { conf_thresh_ = conf_thresh; }
+
+    CV_WRAP bool waveCorrection() const { return do_wave_correct_; }
+    CV_WRAP void setWaveCorrection(bool flag) { do_wave_correct_ = flag; }
+
+    CV_WRAP InterpolationFlags interpolationFlags() const { return interp_flags_; }
+    CV_WRAP void setInterpolationFlags(InterpolationFlags interp_flags) { interp_flags_ = interp_flags; }
+
+    detail::WaveCorrectKind waveCorrectKind() const { return wave_correct_kind_; }
+    void setWaveCorrectKind(detail::WaveCorrectKind kind) { wave_correct_kind_ = kind; }
+
+    Ptr<Feature2D> featuresFinder() { return features_finder_; }
+    Ptr<Feature2D> featuresFinder() const { return features_finder_; }
+    void setFeaturesFinder(Ptr<Feature2D> features_finder)
+        { features_finder_ = features_finder; }
+
+    Ptr<detail::FeaturesMatcher> featuresMatcher() { return features_matcher_; }
+    Ptr<detail::FeaturesMatcher> featuresMatcher() const { return features_matcher_; }
+    void setFeaturesMatcher(Ptr<detail::FeaturesMatcher> features_matcher)
+        { features_matcher_ = features_matcher; }
+
+    const cv::UMat& matchingMask() const { return matching_mask_; }
+    void setMatchingMask(const cv::UMat &mask)
+    {
+        CV_Assert(mask.type() == CV_8U && mask.cols == mask.rows); // mask must be a square CV_8U matrix
+        matching_mask_ = mask.clone(); // keep the result of clone() rather than the caller's object
+    }
+
+    Ptr<detail::BundleAdjusterBase> bundleAdjuster() { return bundle_adjuster_; }
+    const Ptr<detail::BundleAdjusterBase> bundleAdjuster() const { return bundle_adjuster_; }
+    void setBundleAdjuster(Ptr<detail::BundleAdjusterBase> bundle_adjuster)
+        { bundle_adjuster_ = bundle_adjuster; }
+
+    Ptr<detail::Estimator> estimator() { return estimator_; }
+    const Ptr<detail::Estimator> estimator() const { return estimator_; }
+    void setEstimator(Ptr<detail::Estimator> estimator)
+        { estimator_ = estimator; }
+
+    Ptr<WarperCreator> warper() { return warper_; }
+    const Ptr<WarperCreator> warper() const { return warper_; }
+    void setWarper(Ptr<WarperCreator> creator) { warper_ = creator; }
+
+    Ptr<detail::ExposureCompensator> exposureCompensator() { return exposure_comp_; }
+    const Ptr<detail::ExposureCompensator> exposureCompensator() const { return exposure_comp_; }
+    void setExposureCompensator(Ptr<detail::ExposureCompensator> exposure_comp)
+        { exposure_comp_ = exposure_comp; }
+
+    Ptr<detail::SeamFinder> seamFinder() { return seam_finder_; }
+    const Ptr<detail::SeamFinder> seamFinder() const { return seam_finder_; }
+    void setSeamFinder(Ptr<detail::SeamFinder> seam_finder) { seam_finder_ = seam_finder; }
+
+    Ptr<detail::Blender> blender() { return blender_; }
+    const Ptr<detail::Blender> blender() const { return blender_; }
+    void setBlender(Ptr<detail::Blender> b) { blender_ = b; }
+
+    /** @brief These functions try to match the given images and to estimate rotations of each camera.
+
+    @note Use the functions only if you're aware of the stitching pipeline, otherwise use
+    Stitcher::stitch.
+
+    @param images Input images.
+    @param masks Masks for each input image specifying where to look for keypoints (optional).
+    @return Status code.
+     */
+    CV_WRAP Status estimateTransform(InputArrayOfArrays images, InputArrayOfArrays masks = noArray());
+
+    /** @brief This function restores the camera rotation and camera intrinsics of each camera,
+     *  which can be obtained with a @ref Stitcher::cameras call
+
+    @param images Input images.
+    @param cameras Estimated rotation of cameras for each of the input images.
+    @param component Indices (0-based) of images constituting the final panorama (optional).
+    @return Status code.
+     */
+    Status setTransform(InputArrayOfArrays images,
+                        const std::vector<detail::CameraParams> &cameras,
+                        const std::vector<int> &component);
+    /** @overload */
+    Status setTransform(InputArrayOfArrays images, const std::vector<detail::CameraParams> &cameras);
+
+    /** @overload */
+    CV_WRAP Status composePanorama(OutputArray pano);
+    /** @brief These functions try to compose the given images (or images stored internally from the other function
+    calls) into the final pano under the assumption that the image transformations were estimated
+    before.
+
+    @note Use the functions only if you're aware of the stitching pipeline, otherwise use
+    Stitcher::stitch.
+
+    @param images Input images.
+    @param pano Final pano.
+    @return Status code.
+     */
+    CV_WRAP Status composePanorama(InputArrayOfArrays images, OutputArray pano);
+
+    /** @overload */
+    CV_WRAP Status stitch(InputArrayOfArrays images, OutputArray pano);
+    /** @brief These functions try to stitch the given images.
+
+    @param images Input images.
+    @param masks Masks for each input image specifying where to look for keypoints (optional).
+    @param pano Final pano.
+    @return Status code.
+     */
+    CV_WRAP Status stitch(InputArrayOfArrays images, InputArrayOfArrays masks, OutputArray pano);
+
+    std::vector<int> component() const { return indices_; } // returned by value (copy of indices_)
+    std::vector<detail::CameraParams> cameras() const { return cameras_; } // returned by value (copy of cameras_)
+    CV_WRAP double workScale() const { return work_scale_; }
+
+    /** @brief Return the mask of the panorama.
+
+    The mask is an 8U UMat with the values: 0xFF (white) for pixels filled by the input images,
+    0 (black) for unused pixels. It can be used as the mask for inpaint.
+
+    @return The mask.
+     */
+    UMat resultMask() const { return result_mask_; }
+
+private:
+    Status matchImages();
+    Status estimateCameraParams();
+
+    double registr_resol_;
+    double seam_est_resol_;
+    double compose_resol_;
+    double conf_thresh_;
+    InterpolationFlags interp_flags_;
+    Ptr<Feature2D> features_finder_;
+    Ptr<detail::FeaturesMatcher> features_matcher_;
+    cv::UMat matching_mask_;
+    Ptr<detail::BundleAdjusterBase> bundle_adjuster_;
+    Ptr<detail::Estimator> estimator_;
+    bool do_wave_correct_;
+    detail::WaveCorrectKind wave_correct_kind_;
+    Ptr<WarperCreator> warper_;
+    Ptr<detail::ExposureCompensator> exposure_comp_;
+    Ptr<detail::SeamFinder> seam_finder_;
+    Ptr<detail::Blender> blender_;
+
+    std::vector<cv::UMat> imgs_;
+    std::vector<cv::UMat> masks_;
+    std::vector<cv::Size> full_img_sizes_;
+    std::vector<detail::ImageFeatures> features_;
+    std::vector<detail::MatchesInfo> pairwise_matches_;
+    std::vector<cv::UMat> seam_est_imgs_;
+    std::vector<int> indices_;
+    std::vector<detail::CameraParams> cameras_;
+    UMat result_mask_;
+    double work_scale_;
+    double seam_scale_;
+    double seam_work_aspect_;
+    double warped_image_scale_;
+};
+
+/**
+ * @deprecated use Stitcher::create
+ */
+CV_DEPRECATED Ptr<Stitcher> createStitcher(bool try_use_gpu = false);
+
+/**
+ * @deprecated use Stitcher::create
+ */
+CV_DEPRECATED Ptr<Stitcher> createStitcherScans(bool try_use_gpu = false);
+
+//! @} stitching
+
+} // namespace cv
+
+#endif // OPENCV_STITCHING_STITCHER_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/autocalib.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/autocalib.hpp
new file mode 100644
index 000000000000..8eb6212c6501
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/autocalib.hpp
@@ -0,0 +1,86 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_STITCHING_AUTOCALIB_HPP
+#define OPENCV_STITCHING_AUTOCALIB_HPP
+
+#include "opencv2/core.hpp"
+#include "matchers.hpp"
+
+namespace cv {
+namespace detail {
+
+//! @addtogroup stitching_autocalib
+//! @{
+
+/** @brief Tries to estimate focal lengths from the given homography under the assumption that the camera
+undergoes rotations around its centre only.
+
+@param H Homography.
+@param f0 Estimated focal length along X axis.
+@param f1 Estimated focal length along Y axis.
+@param f0_ok True, if f0 was estimated successfully, false otherwise.
+@param f1_ok True, if f1 was estimated successfully, false otherwise.
+
+See "Construction of Panoramic Image Mosaics with Global and Local Alignment"
+by Heung-Yeung Shum and Richard Szeliski.
+ */
+void CV_EXPORTS_W focalsFromHomography(const Mat &H, double &f0, double &f1, bool &f0_ok, bool &f1_ok);
+
+/** @brief Estimates focal lengths for each given camera.
+
+@param features Features of images.
+@param pairwise_matches Matches between all image pairs.
+@param focals Estimated focal lengths for each camera.
+ */
+void CV_EXPORTS estimateFocal(const std::vector<ImageFeatures> &features,
+                              const std::vector<MatchesInfo> &pairwise_matches,
+                              std::vector<double> &focals);
+
+bool CV_EXPORTS_W calibrateRotatingCamera(const std::vector<Mat> &Hs,CV_OUT Mat &K); // NOTE(review): undocumented here; Hs presumably homographies, K the output - confirm
+
+//! @} stitching_autocalib
+
+} // namespace detail
+} // namespace cv
+
+#endif // OPENCV_STITCHING_AUTOCALIB_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/blenders.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/blenders.hpp
new file mode 100644
index 000000000000..ec35aa7cbbd5
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/blenders.hpp
@@ -0,0 +1,184 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_STITCHING_BLENDERS_HPP
+#define OPENCV_STITCHING_BLENDERS_HPP
+
+#if defined(NO)
+#  warning Detected Apple 'NO' macro definition, it can cause build conflicts. Please, include this header before any Apple headers.
+#endif
+
+#include "opencv2/core.hpp"
+#include "opencv2/core/cuda.hpp"
+
+namespace cv {
+namespace detail {
+
+//! @addtogroup stitching_blend
+//! @{
+
+/** @brief Base class for all blenders.
+
+Simple blender which puts one image over another
+*/
+class CV_EXPORTS_W Blender
+{
+public:
+    virtual ~Blender() {}
+
+    enum { NO, FEATHER, MULTI_BAND }; // NO clashes with Apple's NO macro; see the #warning at the top of this header
+    CV_WRAP static Ptr<Blender> createDefault(int type, bool try_gpu = false);
+
+    /** @brief Prepares the blender for blending.
+
+    @param corners Source images top-left corners
+    @param sizes Source image sizes
+     */
+    CV_WRAP virtual void prepare(const std::vector<Point> &corners, const std::vector<Size> &sizes);
+    /** @overload */
+    CV_WRAP virtual void prepare(Rect dst_roi);
+    /** @brief Processes the image.
+
+    @param img Source image
+    @param mask Source image mask
+    @param tl Source image top-left corner
+     */
+    CV_WRAP virtual void feed(InputArray img, InputArray mask, Point tl);
+    /** @brief Blends and returns the final pano.
+
+    @param dst Final pano
+    @param dst_mask Final pano mask
+     */
+    CV_WRAP virtual void blend(CV_IN_OUT InputOutputArray dst,CV_IN_OUT  InputOutputArray dst_mask);
+
+protected:
+    UMat dst_, dst_mask_;
+    Rect dst_roi_;
+};
+
+/** @brief Simple blender which mixes images at their borders.
+ */
+class CV_EXPORTS_W FeatherBlender : public Blender
+{
+public:
+    CV_WRAP FeatherBlender(float sharpness = 0.02f);
+
+    CV_WRAP float sharpness() const { return sharpness_; }
+    CV_WRAP void setSharpness(float val) { sharpness_ = val; }
+
+    CV_WRAP void prepare(Rect dst_roi) CV_OVERRIDE;
+    CV_WRAP void feed(InputArray img, InputArray mask, Point tl) CV_OVERRIDE;
+    CV_WRAP void blend(InputOutputArray dst, InputOutputArray dst_mask) CV_OVERRIDE;
+
+    //! Creates weight maps for a fixed set of source images from their masks and top-left corners.
+    //! The final image can be obtained by simple weighting of the source images.
+    CV_WRAP Rect createWeightMaps(const std::vector<UMat> &masks, const std::vector<Point> &corners,
+        CV_IN_OUT std::vector<UMat> &weight_maps);
+
+private:
+    float sharpness_; // written by setSharpness(), read by sharpness()
+    UMat weight_map_;
+    UMat dst_weight_map_;
+};
+
+inline FeatherBlender::FeatherBlender(float _sharpness) { setSharpness(_sharpness); } // routes the initial value through the public setter
+
+/** @brief Blender which uses multi-band blending algorithm (see @cite BA83).
+ */
+class CV_EXPORTS_W MultiBandBlender : public Blender
+{
+public:
+    CV_WRAP MultiBandBlender(int try_gpu = false, int num_bands = 5, int weight_type = CV_32F); // NOTE(review): try_gpu is an int with a bool default
+
+    CV_WRAP int numBands() const { return actual_num_bands_; }
+    CV_WRAP void setNumBands(int val) { actual_num_bands_ = val; }
+
+    CV_WRAP void prepare(Rect dst_roi) CV_OVERRIDE;
+    CV_WRAP void feed(InputArray img, InputArray mask, Point tl) CV_OVERRIDE;
+    CV_WRAP void blend(CV_IN_OUT InputOutputArray dst, CV_IN_OUT InputOutputArray dst_mask) CV_OVERRIDE;
+
+private:
+    int actual_num_bands_, num_bands_; // numBands()/setNumBands() access actual_num_bands_ only
+    std::vector<UMat> dst_pyr_laplace_;
+    std::vector<UMat> dst_band_weights_;
+    Rect dst_roi_final_;
+    bool can_use_gpu_;
+    int weight_type_; // CV_32F or CV_16S
+#if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING)
+    std::vector<cuda::GpuMat> gpu_dst_pyr_laplace_;
+    std::vector<cuda::GpuMat> gpu_dst_band_weights_;
+    std::vector<Point> gpu_tl_points_;
+    std::vector<cuda::GpuMat> gpu_imgs_with_border_;
+    std::vector<std::vector<cuda::GpuMat> > gpu_weight_pyr_gauss_vec_;
+    std::vector<std::vector<cuda::GpuMat> > gpu_src_pyr_laplace_vec_;
+    std::vector<std::vector<cuda::GpuMat> > gpu_ups_;
+    cuda::GpuMat gpu_dst_mask_;
+    cuda::GpuMat gpu_mask_;
+    cuda::GpuMat gpu_img_;
+    cuda::GpuMat gpu_weight_map_;
+    cuda::GpuMat gpu_add_mask_;
+    int gpu_feed_idx_;
+    bool gpu_initialized_;
+#endif // HAVE_OPENCV_CUDAARITHM && HAVE_OPENCV_CUDAWARPING
+};
+
+
+//////////////////////////////////////////////////////////////////////////////
+// Auxiliary functions
+
+void CV_EXPORTS_W normalizeUsingWeightMap(InputArray weight, CV_IN_OUT InputOutputArray src);
+
+void CV_EXPORTS_W createWeightMap(InputArray mask, float sharpness, CV_IN_OUT InputOutputArray weight);
+
+void CV_EXPORTS_W createLaplacePyr(InputArray img, int num_levels, CV_IN_OUT std::vector<UMat>& pyr);
+void CV_EXPORTS_W createLaplacePyrGpu(InputArray img, int num_levels, CV_IN_OUT std::vector<UMat>& pyr);
+
+// Restores source image
+void CV_EXPORTS_W restoreImageFromLaplacePyr(CV_IN_OUT std::vector<UMat>& pyr);
+void CV_EXPORTS_W restoreImageFromLaplacePyrGpu(CV_IN_OUT std::vector<UMat>& pyr);
+
+//! @}
+
+} // namespace detail
+} // namespace cv
+
+#endif // OPENCV_STITCHING_BLENDERS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/camera.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/camera.hpp
new file mode 100644
index 000000000000..14ecf60f30cc
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/camera.hpp
@@ -0,0 +1,78 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_STITCHING_CAMERA_HPP
+#define OPENCV_STITCHING_CAMERA_HPP
+
+#include "opencv2/core.hpp"
+
+namespace cv {
+namespace detail {
+
+//! @addtogroup stitching
+//! @{
+
+/** @brief Describes camera parameters.
+
+@note Translation is assumed to be zero during the whole stitching pipeline.
+ */
+struct CV_EXPORTS_W_SIMPLE CameraParams
+{
+    CameraParams(); // defined out of line; initial field values are not visible in this header
+    CameraParams(const CameraParams& other);
+    CameraParams& operator =(const CameraParams& other);
+    CV_WRAP Mat K() const; // NOTE(review): presumably the intrinsic matrix assembled from focal/aspect/ppx/ppy - defined out of line, confirm there
+
+    CV_PROP_RW double focal; // Focal length
+    CV_PROP_RW double aspect; // Aspect ratio
+    CV_PROP_RW double ppx; // Principal point X
+    CV_PROP_RW double ppy; // Principal point Y
+    CV_PROP_RW Mat R; // Rotation (stored as a Mat)
+    CV_PROP_RW Mat t; // Translation (stored as a Mat)
+};
+
+//! @}
+
+} // namespace detail
+} // namespace cv
+
+#endif // #ifndef OPENCV_STITCHING_CAMERA_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/exposure_compensate.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/exposure_compensate.hpp
new file mode 100644
index 000000000000..dea76c957bd7
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/exposure_compensate.hpp
@@ -0,0 +1,245 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_STITCHING_EXPOSURE_COMPENSATE_HPP
+#define OPENCV_STITCHING_EXPOSURE_COMPENSATE_HPP
+
+#if defined(NO)
+#  warning Detected Apple 'NO' macro definition, it can cause build conflicts. Please, include this header before any Apple headers.
+#endif
+
+#include "opencv2/core.hpp"
+
+namespace cv {
+namespace detail {
+
+//! @addtogroup stitching_exposure
+//! @{
+
+/** @brief Base class for all exposure compensators.
+ */
+class CV_EXPORTS_W ExposureCompensator
+{
+public:
+    ExposureCompensator(): updateGain(true) {} // updateGain starts out as true
+    virtual ~ExposureCompensator() {}
+
+    enum { NO, GAIN, GAIN_BLOCKS, CHANNELS, CHANNELS_BLOCKS }; // NOTE(review): presumably the `type` values accepted by createDefault() - confirm
+    CV_WRAP static Ptr<ExposureCompensator> createDefault(int type);
+
+    /**
+    @param corners Source image top-left corners
+    @param images Source images
+    @param masks Image masks to update (in the pair-based overload, the second value of each pair
+    specifies the value which should be used to detect where image is)
+        */
+    CV_WRAP void feed(const std::vector<Point> &corners, const std::vector<UMat> &images,
+        const std::vector<UMat> &masks);
+    /** @overload */
+    virtual void feed(const std::vector<Point> &corners, const std::vector<UMat> &images,
+        const std::vector<std::pair<UMat, uchar> > &masks) = 0;
+    /** @brief Compensate exposure in the specified image.
+
+    @param index Image index
+    @param corner Image top-left corner
+    @param image Image to process (in/out)
+    @param mask Image mask
+        */
+    CV_WRAP virtual void apply(int index, Point corner, InputOutputArray image, InputArray mask) = 0;
+    CV_WRAP virtual void getMatGains(CV_OUT std::vector<Mat>& ) {CV_Error(Error::StsInternal, "");} // base default: raises Error::StsInternal
+    CV_WRAP virtual void setMatGains(std::vector<Mat>& ) { CV_Error(Error::StsInternal, ""); } // base default: raises Error::StsInternal
+    CV_WRAP void setUpdateGain(bool b) { updateGain = b; }
+    CV_WRAP bool getUpdateGain() { return updateGain; }
+protected :
+    bool updateGain; // accessed via setUpdateGain()/getUpdateGain(); how it is consumed is not visible in this header
+};
+
+/** @brief Stub exposure compensator which does nothing.
+ */
+class CV_EXPORTS_W NoExposureCompensator : public ExposureCompensator
+{
+public:
+    void feed(const std::vector<Point> &/*corners*/, const std::vector<UMat> &/*images*/,
+              const std::vector<std::pair<UMat,uchar> > &/*masks*/) CV_OVERRIDE { } // intentionally empty: all arguments are ignored
+    CV_WRAP void apply(int /*index*/, Point /*corner*/, InputOutputArray /*image*/, InputArray /*mask*/) CV_OVERRIDE { } // intentionally empty: image is left untouched
+    CV_WRAP void getMatGains(CV_OUT std::vector<Mat>& umv) CV_OVERRIDE { umv.clear(); return; } // no gains exist: the output vector is emptied
+    CV_WRAP void setMatGains(std::vector<Mat>& umv) CV_OVERRIDE { umv.clear(); return; } // note: clears the caller's vector, nothing is stored
+};
+
+/** @brief Exposure compensator which tries to remove exposure related artifacts by adjusting image
+intensities, see @cite BL07 and @cite WJ10 for details.
+ */
+class CV_EXPORTS_W GainCompensator : public ExposureCompensator
+{
+public:
+    // This constructor only exists to keep the source-level compatibility detector happy; it delegates to GainCompensator(1)
+    CV_WRAP GainCompensator()
+            : GainCompensator(1) {}
+    CV_WRAP GainCompensator(int nr_feeds)
+            : nr_feeds_(nr_feeds), similarity_threshold_(1) {} // similarity threshold always starts at 1
+    void feed(const std::vector<Point> &corners, const std::vector<UMat> &images,
+              const std::vector<std::pair<UMat,uchar> > &masks) CV_OVERRIDE;
+    void singleFeed(const std::vector<Point> &corners, const std::vector<UMat> &images,
+                    const std::vector<std::pair<UMat,uchar> > &masks); // NOTE(review): presumably one pass of feed() - confirm in the .cpp
+    CV_WRAP void apply(int index, Point corner, InputOutputArray image, InputArray mask) CV_OVERRIDE;
+    CV_WRAP void getMatGains(CV_OUT std::vector<Mat>& umv) CV_OVERRIDE ;
+    CV_WRAP void setMatGains(std::vector<Mat>& umv) CV_OVERRIDE ;
+    CV_WRAP void setNrFeeds(int nr_feeds) { nr_feeds_ = nr_feeds; }
+    CV_WRAP int getNrFeeds() { return nr_feeds_; }
+    CV_WRAP void setSimilarityThreshold(double similarity_threshold) { similarity_threshold_ = similarity_threshold; }
+    CV_WRAP double getSimilarityThreshold() const { return similarity_threshold_; }
+    void prepareSimilarityMask(const std::vector<Point> &corners, const std::vector<UMat> &images);
+    std::vector<double> gains() const; // returns the gains by value (defined out of line)
+
+private:
+    UMat buildSimilarityMask(InputArray src_array1, InputArray src_array2);
+
+    Mat_<double> gains_;
+    int nr_feeds_;                     // set by the constructor and setNrFeeds()
+    double similarity_threshold_;      // set to 1 by the constructor, changed via setSimilarityThreshold()
+    std::vector<UMat> similarities_;   // NOTE(review): presumably filled by prepareSimilarityMask() - confirm in the .cpp
+};
+
+/** @brief Exposure compensator which tries to remove exposure related artifacts by adjusting image
+intensities on each channel independently.
+ */
+class CV_EXPORTS_W ChannelsCompensator : public ExposureCompensator
+{
+public:
+    CV_WRAP ChannelsCompensator(int nr_feeds=1)
+        : nr_feeds_(nr_feeds), similarity_threshold_(1) {} // similarity threshold always starts at 1
+    void feed(const std::vector<Point> &corners, const std::vector<UMat> &images,
+              const std::vector<std::pair<UMat,uchar> > &masks) CV_OVERRIDE;
+    CV_WRAP void apply(int index, Point corner, InputOutputArray image, InputArray mask) CV_OVERRIDE;
+    CV_WRAP void getMatGains(CV_OUT std::vector<Mat>& umv) CV_OVERRIDE;
+    CV_WRAP void setMatGains(std::vector<Mat>& umv) CV_OVERRIDE;
+    CV_WRAP void setNrFeeds(int nr_feeds) { nr_feeds_ = nr_feeds; }
+    CV_WRAP int getNrFeeds() { return nr_feeds_; }
+    CV_WRAP void setSimilarityThreshold(double similarity_threshold) { similarity_threshold_ = similarity_threshold; }
+    CV_WRAP double getSimilarityThreshold() const { return similarity_threshold_; }
+    std::vector<Scalar> gains() const { return gains_; } // returns a copy of the stored gains
+
+private:
+    std::vector<Scalar> gains_;     // Scalar gains; NOTE(review): presumably one entry per image - confirm in the .cpp
+    int nr_feeds_;                  // set by the constructor and setNrFeeds()
+    double similarity_threshold_;   // set to 1 by the constructor, changed via setSimilarityThreshold()
+};
+
+/** @brief Exposure compensator which tries to remove exposure related artifacts by adjusting image blocks.
+ */
+class CV_EXPORTS_W BlocksCompensator : public ExposureCompensator
+{
+public:
+    BlocksCompensator(int bl_width=32, int bl_height=32, int nr_feeds=1) // defaults: 32x32 blocks, one feed
+            : bl_width_(bl_width), bl_height_(bl_height), nr_feeds_(nr_feeds), nr_gain_filtering_iterations_(2),
+              similarity_threshold_(1) {} // gain filtering iterations start at 2, similarity threshold at 1
+    CV_WRAP void apply(int index, Point corner, InputOutputArray image, InputArray mask) CV_OVERRIDE;
+    CV_WRAP void getMatGains(CV_OUT std::vector<Mat>& umv) CV_OVERRIDE;
+    CV_WRAP void setMatGains(std::vector<Mat>& umv) CV_OVERRIDE;
+    CV_WRAP void setNrFeeds(int nr_feeds) { nr_feeds_ = nr_feeds; }
+    CV_WRAP int getNrFeeds() { return nr_feeds_; }
+    CV_WRAP void setSimilarityThreshold(double similarity_threshold) { similarity_threshold_ = similarity_threshold; }
+    CV_WRAP double getSimilarityThreshold() const { return similarity_threshold_; }
+    CV_WRAP void setBlockSize(int width, int height) { bl_width_ = width; bl_height_ = height; }
+    CV_WRAP void setBlockSize(Size size) { setBlockSize(size.width, size.height); } // forwards to the (width, height) overload
+    CV_WRAP Size getBlockSize() const { return Size(bl_width_, bl_height_); }
+    CV_WRAP void setNrGainsFilteringIterations(int nr_iterations) { nr_gain_filtering_iterations_ = nr_iterations; }
+    CV_WRAP int getNrGainsFilteringIterations() const { return nr_gain_filtering_iterations_; }
+
+protected:
+    template<class Compensator> // member template: it cannot override the pure virtual feed(), so this class stays abstract
+    void feed(const std::vector<Point> &corners, const std::vector<UMat> &images,
+              const std::vector<std::pair<UMat,uchar> > &masks);
+
+private:
+    UMat getGainMap(const GainCompensator& compensator, int bl_idx, Size bl_per_img);
+    UMat getGainMap(const ChannelsCompensator& compensator, int bl_idx, Size bl_per_img);
+
+    int bl_width_, bl_height_;          // block size, see setBlockSize()/getBlockSize()
+    std::vector<UMat> gain_maps_;
+    int nr_feeds_;                      // see setNrFeeds()/getNrFeeds()
+    int nr_gain_filtering_iterations_;  // see setNrGainsFilteringIterations()
+    double similarity_threshold_;       // see setSimilarityThreshold()
+};
+
+/** @brief Exposure compensator which tries to remove exposure related artifacts by adjusting image block
+intensities, see @cite UES01 for details.
+ */
+class CV_EXPORTS_W BlocksGainCompensator : public BlocksCompensator
+{
+public:
+    // This constructor only exists to keep the source-level compatibility detector happy; it delegates with nr_feeds = 1
+    CV_WRAP BlocksGainCompensator(int bl_width = 32, int bl_height = 32)
+            : BlocksGainCompensator(bl_width, bl_height, 1) {}
+    CV_WRAP BlocksGainCompensator(int bl_width, int bl_height, int nr_feeds)
+            : BlocksCompensator(bl_width, bl_height, nr_feeds) {}
+
+    void feed(const std::vector<Point> &corners, const std::vector<UMat> &images,
+              const std::vector<std::pair<UMat,uchar> > &masks) CV_OVERRIDE;
+
+    // This function only exists to keep the source-level compatibility detector happy; it forwards to the base class
+    CV_WRAP void apply(int index, Point corner, InputOutputArray image, InputArray mask) CV_OVERRIDE {
+        BlocksCompensator::apply(index, corner, image, mask); }
+    // This function only exists to keep the source-level compatibility detector happy; it forwards to the base class
+    CV_WRAP void getMatGains(CV_OUT std::vector<Mat>& umv) CV_OVERRIDE { BlocksCompensator::getMatGains(umv); }
+    // This function only exists to keep the source-level compatibility detector happy; it forwards to the base class
+    CV_WRAP void setMatGains(std::vector<Mat>& umv) CV_OVERRIDE { BlocksCompensator::setMatGains(umv); }
+};
+
+/** @brief Exposure compensator which tries to remove exposure related artifacts by adjusting image block
+on each channel.
+ */
+class CV_EXPORTS_W BlocksChannelsCompensator : public BlocksCompensator
+{
+public:
+    CV_WRAP BlocksChannelsCompensator(int bl_width=32, int bl_height=32, int nr_feeds=1) // defaults: 32x32 blocks, one feed
+            : BlocksCompensator(bl_width, bl_height, nr_feeds) {} // all state lives in the base class
+
+    void feed(const std::vector<Point> &corners, const std::vector<UMat> &images,
+              const std::vector<std::pair<UMat,uchar> > &masks) CV_OVERRIDE; // overrides ExposureCompensator's pure virtual feed()
+};
+//! @}
+
+} // namespace detail
+} // namespace cv
+
+#endif // OPENCV_STITCHING_EXPOSURE_COMPENSATE_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/matchers.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/matchers.hpp
new file mode 100644
index 000000000000..e25668308ed8
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/matchers.hpp
@@ -0,0 +1,267 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_STITCHING_MATCHERS_HPP
+#define OPENCV_STITCHING_MATCHERS_HPP
+
+#include "opencv2/core.hpp"
+#include "opencv2/features2d.hpp"
+
+#include "opencv2/opencv_modules.hpp"
+
+namespace cv {
+namespace detail {
+
+//! @addtogroup stitching_match
+//! @{
+
+/** @brief Structure containing image keypoints and descriptors. */
+struct CV_EXPORTS_W_SIMPLE ImageFeatures
+{
+    CV_PROP_RW int img_idx;                      // index of the image these features belong to
+    CV_PROP_RW Size img_size;                    // size of that image
+    CV_PROP_RW std::vector<KeyPoint> keypoints;  // image keypoints
+    CV_PROP_RW UMat descriptors;                 // keypoint descriptors; NOTE(review): presumably one row per keypoint - confirm
+    CV_WRAP std::vector<KeyPoint> getKeypoints() { return keypoints; } // returns a copy of `keypoints`
+};
+/** @brief Computes ImageFeatures for a set of images.
+
+@param featuresFinder Feature2D instance supplied by the caller
+@param images Input images
+@param features Output vector of ImageFeatures (CV_OUT)
+@param masks Optional masks; defaults to noArray()
+*/
+CV_EXPORTS_W void computeImageFeatures(
+    const Ptr<Feature2D> &featuresFinder,
+    InputArrayOfArrays  images,
+    CV_OUT std::vector<ImageFeatures> &features,
+    InputArrayOfArrays masks = noArray());
+
+/** @brief Computes ImageFeatures for a single image (exported to bindings as computeImageFeatures2).
+
+@param featuresFinder Feature2D instance supplied by the caller
+@param image Input image
+@param features Output ImageFeatures (CV_OUT)
+@param mask Optional mask; defaults to noArray()
+*/
+CV_EXPORTS_AS(computeImageFeatures2) void computeImageFeatures(
+    const Ptr<Feature2D> &featuresFinder,
+    InputArray image,
+    CV_OUT ImageFeatures &features,
+    InputArray mask = noArray());
+
+/** @brief Structure containing information about matches between two images.
+
+It's assumed that there is a transformation between those images. Transformation may be
+homography or affine transformation based on selected matcher.
+
+@sa detail::FeaturesMatcher
+*/
+struct CV_EXPORTS_W_SIMPLE MatchesInfo
+{
+    MatchesInfo(); // defined out of line; initial field values are not visible in this header
+    MatchesInfo(const MatchesInfo &other);
+    MatchesInfo& operator =(const MatchesInfo &other);
+
+    CV_PROP_RW int src_img_idx;       //!< Source image index (optional)
+    CV_PROP_RW int dst_img_idx;       //!< Destination image index (optional)
+    CV_PROP_RW std::vector<DMatch> matches;        //!< Matches between the two images
+    CV_PROP_RW std::vector<uchar> inliers_mask;    //!< Geometrically consistent matches mask
+    CV_PROP_RW int num_inliers;                    //!< Number of geometrically consistent matches
+    CV_PROP_RW Mat H;                              //!< Estimated transformation
+    CV_PROP_RW double confidence;                  //!< Confidence two images are from the same panorama
+    CV_WRAP std::vector<DMatch> getMatches() { return matches; } // returns a copy of `matches`
+    CV_WRAP std::vector<uchar> getInliers() { return inliers_mask; } // returns a copy of `inliers_mask`
+};
+
+/** @brief Feature matchers base class. */
+class CV_EXPORTS_W FeaturesMatcher
+{
+public:
+    CV_WRAP virtual ~FeaturesMatcher() {}
+
+    /** @overload
+    @param features1 First image features
+    @param features2 Second image features
+    @param matches_info Found matches
+    */
+    CV_WRAP_AS(apply) void operator ()(const ImageFeatures &features1, const ImageFeatures &features2,
+                     CV_OUT MatchesInfo& matches_info) { match(features1, features2, matches_info); } // forwards to the pairwise match()
+
+    /** @brief Performs images matching.
+
+    @param features Features of the source images
+    @param pairwise_matches Found pairwise matches
+    @param mask Mask indicating which image pairs must be matched
+
+    The function is parallelized with the TBB library.
+
+    @sa detail::MatchesInfo
+    */
+    CV_WRAP_AS(apply2) void operator ()(const std::vector<ImageFeatures> &features, CV_OUT std::vector<MatchesInfo> &pairwise_matches,
+                                        const cv::UMat &mask = cv::UMat()) { match(features, pairwise_matches, mask); } // forwards to the multi-image match()
+
+    /** @return True, if it's possible to use the same matcher instance in parallel, false otherwise
+    */
+   CV_WRAP bool isThreadSafe() const { return is_thread_safe_; }
+
+    /** @brief Frees unused memory allocated before if there is any. The base implementation does nothing.
+    */
+   CV_WRAP virtual void collectGarbage() {}
+
+protected:
+    FeaturesMatcher(bool is_thread_safe = false) : is_thread_safe_(is_thread_safe) {} // protected: only subclasses can construct
+
+    /** @brief This method must implement matching logic in order to make the wrappers
+    detail::FeaturesMatcher::operator() work.
+
+    @param features1 first image features
+    @param features2 second image features
+    @param matches_info found matches
+     */
+    virtual void match(const ImageFeatures &features1, const ImageFeatures &features2,
+                       MatchesInfo& matches_info) = 0;
+
+    /** @brief This method implements logic to match features between arbitrary number of features.
+    By default this checks every pair of inputs in the input, but the behaviour can be changed by subclasses.
+
+    @param features vector of image features
+    @param pairwise_matches found matches
+    @param mask (optional) mask indicating which image pairs should be matched
+     */
+    virtual void match(const std::vector<ImageFeatures> &features, std::vector<MatchesInfo> &pairwise_matches,
+                       const cv::UMat &mask = cv::UMat());
+
+    bool is_thread_safe_; // set once by the constructor, reported by isThreadSafe()
+};
+
+/** @brief Features matcher which finds two best matches for each feature and leaves the best one only if the
+ratio between descriptor distances is greater than the threshold match_conf
+
+@sa detail::FeaturesMatcher
+ */
+class CV_EXPORTS_W BestOf2NearestMatcher : public FeaturesMatcher
+{
+public:
+    /** @brief Constructs a "best of 2 nearest" matcher.
+
+    @param try_use_gpu Should try to use GPU or not
+    @param match_conf Match distances ratio threshold
+    @param num_matches_thresh1 Minimum number of matches required for the 2D projective transform
+    estimation used in the inliers classification step
+    @param num_matches_thresh2 Minimum number of matches required for the 2D projective transform
+    re-estimation on inliers
+    @param matches_confindece_thresh Matching confidence threshold to take the match into account.
+    The threshold was determined experimentally and set to 3 by default.
+     */
+    CV_WRAP BestOf2NearestMatcher(bool try_use_gpu = false, float match_conf = 0.3f, int num_matches_thresh1 = 6,
+                          int num_matches_thresh2 = 6, double matches_confindece_thresh = 3.);
+
+    CV_WRAP void collectGarbage() CV_OVERRIDE;
+    CV_WRAP static Ptr<BestOf2NearestMatcher> create(bool try_use_gpu = false, float match_conf = 0.3f, int num_matches_thresh1 = 6,
+        int num_matches_thresh2 = 6, double matches_confindece_thresh = 3.); // factory taking the same parameters and defaults as the constructor
+
+protected:
+
+    void match(const ImageFeatures &features1, const ImageFeatures &features2, MatchesInfo &matches_info) CV_OVERRIDE;
+    int num_matches_thresh1_;           // see the constructor's num_matches_thresh1
+    int num_matches_thresh2_;           // see the constructor's num_matches_thresh2
+    double matches_confindece_thresh_;  // see the constructor's matches_confindece_thresh (identifier spelling kept as is)
+    Ptr<FeaturesMatcher> impl_;         // NOTE(review): presumably the matcher match() delegates to - confirm in the .cpp
+};
+
+class CV_EXPORTS_W BestOf2NearestRangeMatcher : public BestOf2NearestMatcher
+{
+public:
+    CV_WRAP BestOf2NearestRangeMatcher(int range_width = 5, bool try_use_gpu = false, float match_conf = 0.3f,
+                            int num_matches_thresh1 = 6, int num_matches_thresh2 = 6); // defined out of line
+
+protected:
+    // Indicates that we do not want to hide the base class match method with a different signature
+    using BestOf2NearestMatcher::match;
+    void match(const std::vector<ImageFeatures> &features, std::vector<MatchesInfo> &pairwise_matches,
+               const cv::UMat &mask = cv::UMat()) CV_OVERRIDE; // overrides the multi-image match() of FeaturesMatcher
+
+    int range_width_; // NOTE(review): presumably restricts which image pairs get matched, based on index distance - confirm in the .cpp
+};
+
+/** @brief Features matcher similar to cv::detail::BestOf2NearestMatcher which
+finds two best matches for each feature and leaves the best one only if the
+ratio between descriptor distances is greater than the threshold match_conf.
+
+Unlike cv::detail::BestOf2NearestMatcher this matcher uses affine
+transformation (affine transformation estimate will be placed in matches_info).
+
+@sa cv::detail::FeaturesMatcher cv::detail::BestOf2NearestMatcher
+ */
+class CV_EXPORTS_W AffineBestOf2NearestMatcher : public BestOf2NearestMatcher
+{
+public:
+    /** @brief Constructs a "best of 2 nearest" matcher that expects affine transformation
+    between images
+
+    @param full_affine whether to use full affine transformation with 6 degrees of freedom or reduced
+    transformation with 4 degrees of freedom using only rotation, translation and uniform scaling
+    @param try_use_gpu Should try to use GPU or not
+    @param match_conf Match distances ratio threshold
+    @param num_matches_thresh1 Minimum number of matches required for the 2D affine transform
+    estimation used in the inliers classification step
+
+    @sa cv::estimateAffine2D cv::estimateAffinePartial2D
+     */
+    CV_WRAP AffineBestOf2NearestMatcher(bool full_affine = false, bool try_use_gpu = false,
+                                float match_conf = 0.3f, int num_matches_thresh1 = 6) :
+        BestOf2NearestMatcher(try_use_gpu, match_conf, num_matches_thresh1, num_matches_thresh1), // num_matches_thresh1 is passed for both base thresholds
+        full_affine_(full_affine) {}
+
+protected:
+    void match(const ImageFeatures &features1, const ImageFeatures &features2, MatchesInfo &matches_info) CV_OVERRIDE;
+
+    bool full_affine_; // stores the constructor's full_affine flag
+};
+
+//! @} stitching_match
+
+} // namespace detail
+} // namespace cv
+
+#endif // OPENCV_STITCHING_MATCHERS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/motion_estimators.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/motion_estimators.hpp
new file mode 100644
index 000000000000..c03aa520906b
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/motion_estimators.hpp
@@ -0,0 +1,373 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_STITCHING_MOTION_ESTIMATORS_HPP
+#define OPENCV_STITCHING_MOTION_ESTIMATORS_HPP
+
+#include "opencv2/core.hpp"
+#include "matchers.hpp"
+#include "util.hpp"
+#include "camera.hpp"
+
+namespace cv {
+namespace detail {
+
+//! @addtogroup stitching_rotation
+//! @{
+
+/** @brief Rotation estimator base class.
+
+It takes features of all images, pairwise matches between all images and estimates rotations of all
+cameras.
+
+@note The coordinate system origin is implementation-dependent, but you can always normalize the
+rotations with respect to the first camera, for instance.
+ */
+class CV_EXPORTS_W Estimator
+{
+public:
+    virtual ~Estimator() {}
+
+    /** @brief Estimates camera parameters by forwarding to the protected virtual estimate().
+
+    @param features Features of images
+    @param pairwise_matches Pairwise matches of images
+    @param cameras Estimated camera parameters
+    @return True in case of success, false otherwise
+     */
+    CV_WRAP_AS(apply) bool operator ()(const std::vector<ImageFeatures> &features,
+        const std::vector<MatchesInfo> &pairwise_matches,
+        CV_OUT CV_IN_OUT std::vector<CameraParams> &cameras)
+    {
+        return estimate(features, pairwise_matches, cameras); // all work is done by the subclass override
+    }
+
+protected:
+    /** @brief This method must implement camera parameters estimation logic in order to make the wrapper
+    detail::Estimator::operator() work.
+
+    @param features Features of images
+    @param pairwise_matches Pairwise matches of images
+    @param cameras Estimated camera parameters
+    @return True in case of success, false otherwise
+     */
+    virtual bool estimate(const std::vector<ImageFeatures> &features,
+                          const std::vector<MatchesInfo> &pairwise_matches,
+                          CV_OUT std::vector<CameraParams> &cameras) = 0;
+};
+
+/** @brief Homography based rotation estimator; overrides Estimator::estimate() (declared here, defined outside this header).
+ */
+class CV_EXPORTS_W HomographyBasedEstimator : public Estimator
+{
+public:
+    CV_WRAP HomographyBasedEstimator(bool is_focals_estimated = false) // the flag defaults to false
+        : is_focals_estimated_(is_focals_estimated) {}
+
+private:
+    virtual bool estimate(const std::vector<ImageFeatures> &features,
+                          const std::vector<MatchesInfo> &pairwise_matches,
+                          std::vector<CameraParams> &cameras) CV_OVERRIDE;
+
+    bool is_focals_estimated_; // NOTE(review): presumably tells estimate() that focal lengths are already known - confirm in the implementation
+};
+
+/** @brief Affine transformation based estimator.
+
+This estimator uses pairwise transformations estimated by the matcher to estimate
+the final transformation for each camera.
+
+@sa cv::detail::HomographyBasedEstimator
+ */
+class CV_EXPORTS_W AffineBasedEstimator : public Estimator
+{
+public:
+    CV_WRAP AffineBasedEstimator(){} // no configuration: the class has no data members of its own
+private:
+    virtual bool estimate(const std::vector<ImageFeatures> &features,
+                          const std::vector<MatchesInfo> &pairwise_matches,
+                          std::vector<CameraParams> &cameras) CV_OVERRIDE; // declared here, defined outside this header
+};
+
+/** @brief Base class for all camera parameters refinement methods.
+ */
+class CV_EXPORTS_W BundleAdjusterBase : public Estimator
+{
+public:
+    CV_WRAP Mat refinementMask() const { return refinement_mask_.clone(); } // hands out a clone of the stored mask
+    CV_WRAP void setRefinementMask(const Mat &mask)
+    {
+        CV_Assert(mask.type() == CV_8U && mask.size() == Size(3, 3)); // only a 3x3 CV_8U mask is accepted
+        refinement_mask_ = mask.clone(); // keep a clone of the argument
+    }
+
+    CV_WRAP double confThresh() const { return conf_thresh_; }
+    CV_WRAP void setConfThresh(double conf_thresh) { conf_thresh_ = conf_thresh; }
+
+    CV_WRAP TermCriteria termCriteria() { return term_criteria_; } // NOTE(review): not const-qualified, unlike the getters above
+    CV_WRAP void setTermCriteria(const TermCriteria& term_criteria) { term_criteria_ = term_criteria; }
+
+protected:
+    /** @brief Construct a bundle adjuster base instance.
+
+    @param num_params_per_cam Number of parameters per camera
+    @param num_errs_per_measurement Number of error terms (components) per match
+     */
+    BundleAdjusterBase(int num_params_per_cam, int num_errs_per_measurement)
+        : num_images_(0), total_num_matches_(0),
+          num_params_per_cam_(num_params_per_cam),
+          num_errs_per_measurement_(num_errs_per_measurement),
+          features_(0), pairwise_matches_(0), conf_thresh_(0)
+    {
+        setRefinementMask(Mat::ones(3, 3, CV_8U)); // default mask: every entry non-zero, i.e. refine everything
+        setConfThresh(1.); // replaces the 0 written by the initializer list
+        setTermCriteria(TermCriteria(TermCriteria::EPS + TermCriteria::COUNT, 1000, DBL_EPSILON)); // default criteria: EPS + COUNT with 1000 and DBL_EPSILON
+    }
+
+    // Runs bundle adjustment (overrides Estimator::estimate; defined outside this header)
+    virtual bool estimate(const std::vector<ImageFeatures> &features,
+                          const std::vector<MatchesInfo> &pairwise_matches,
+                          std::vector<CameraParams> &cameras) CV_OVERRIDE;
+
+    /** @brief Sets initial camera parameter to refine.
+
+    @param cameras Camera parameters
+     */
+    virtual void setUpInitialCameraParams(const std::vector<CameraParams> &cameras) = 0;
+    /** @brief Gets the refined camera parameters.
+
+    @param cameras Refined camera parameters
+     */
+    virtual void obtainRefinedCameraParams(std::vector<CameraParams> &cameras) const = 0;
+    /** @brief Calculates error vector.
+
+    @param err Error column-vector of length total_num_matches \* num_errs_per_measurement
+     */
+    virtual void calcError(Mat &err) = 0;
+    /** @brief Calculates the cost function jacobian.
+
+    @param jac Jacobian matrix of dimensions
+    (total_num_matches \* num_errs_per_measurement) x (num_images \* num_params_per_cam)
+     */
+    virtual void calcJacobian(Mat &jac) = 0;
+
+    // 3x3 8U mask, where 0 means don't refine respective parameter, != 0 means refine
+    Mat refinement_mask_;
+
+    int num_images_; // initialised to 0 by the constructor
+    int total_num_matches_; // initialised to 0 by the constructor
+
+    int num_params_per_cam_; // fixed by the subclass through the constructor
+    int num_errs_per_measurement_; // fixed by the subclass through the constructor
+
+    const ImageFeatures *features_; // null until set; NOTE(review): presumably points at the estimate() input - confirm
+    const MatchesInfo *pairwise_matches_; // null until set; NOTE(review): presumably points at the estimate() input - confirm
+
+    // Threshold to filter out poorly matched image pairs
+    double conf_thresh_;
+
+    // Levenberg-Marquardt algorithm termination criteria
+    TermCriteria term_criteria_;
+
+    // Camera parameters matrix (CV_64F)
+    Mat cam_params_;
+
+    // Connected images pairs
+    std::vector<std::pair<int,int> > edges_;
+};
+
+
+/** @brief Stub bundle adjuster that does nothing: estimate() returns true and every other hook is empty.
+ */
+class CV_EXPORTS_W NoBundleAdjuster : public BundleAdjusterBase
+{
+public:
+    CV_WRAP NoBundleAdjuster() : BundleAdjusterBase(0, 0) {} // zero parameters per camera, zero error terms per match
+
+private:
+    bool estimate(const std::vector<ImageFeatures> &, const std::vector<MatchesInfo> &,
+                  std::vector<CameraParams> &) CV_OVERRIDE
+    {
+        return true; // always reports success; the unnamed arguments are never read or written
+    }
+    void setUpInitialCameraParams(const std::vector<CameraParams> &) CV_OVERRIDE {}
+    void obtainRefinedCameraParams(std::vector<CameraParams> &) const CV_OVERRIDE {}
+    void calcError(Mat &) CV_OVERRIDE {}
+    void calcJacobian(Mat &) CV_OVERRIDE {}
+};
+
+
+/** @brief Implementation of the camera parameters refinement algorithm which minimizes sum of the reprojection
+error squares
+
+It can estimate focal length, aspect ratio, principal point.
+You can affect only them via the refinement mask.
+ */
+class CV_EXPORTS_W BundleAdjusterReproj : public BundleAdjusterBase
+{
+public:
+    CV_WRAP BundleAdjusterReproj() : BundleAdjusterBase(7, 2) {} // 7 parameters per camera, 2 error terms per match
+
+private:
+    void setUpInitialCameraParams(const std::vector<CameraParams> &cameras) CV_OVERRIDE;
+    void obtainRefinedCameraParams(std::vector<CameraParams> &cameras) const CV_OVERRIDE;
+    void calcError(Mat &err) CV_OVERRIDE;
+    void calcJacobian(Mat &jac) CV_OVERRIDE;
+
+    Mat err1_, err2_; // NOTE(review): presumably scratch buffers for the overrides above - confirm in the implementation
+};
+
+
+/** @brief Implementation of the camera parameters refinement algorithm which minimizes sum of the distances
+between the rays passing through the camera center and a feature.
+
+It can estimate focal length. It ignores the refinement mask for now.
+ */
+class CV_EXPORTS_W BundleAdjusterRay : public BundleAdjusterBase
+{
+public:
+    CV_WRAP BundleAdjusterRay() : BundleAdjusterBase(4, 3) {} // 4 parameters per camera, 3 error terms per match
+
+private:
+    void setUpInitialCameraParams(const std::vector<CameraParams> &cameras) CV_OVERRIDE;
+    void obtainRefinedCameraParams(std::vector<CameraParams> &cameras) const CV_OVERRIDE;
+    void calcError(Mat &err) CV_OVERRIDE;
+    void calcJacobian(Mat &jac) CV_OVERRIDE;
+
+    Mat err1_, err2_; // NOTE(review): presumably scratch buffers for the overrides above - confirm in the implementation
+};
+
+
+/** @brief Bundle adjuster that expects affine transformation
+represented in homogeneous coordinates in R for each camera param. Implements
+camera parameters refinement algorithm which minimizes sum of the reprojection
+error squares
+
+It estimates all transformation parameters. Refinement mask is ignored.
+
+@sa AffineBasedEstimator AffineBestOf2NearestMatcher BundleAdjusterAffinePartial
+ */
+class CV_EXPORTS_W BundleAdjusterAffine : public BundleAdjusterBase
+{
+public:
+    CV_WRAP BundleAdjusterAffine() : BundleAdjusterBase(6, 2) {} // 6 parameters per camera, 2 error terms per match
+
+private:
+    void setUpInitialCameraParams(const std::vector<CameraParams> &cameras) CV_OVERRIDE;
+    void obtainRefinedCameraParams(std::vector<CameraParams> &cameras) const CV_OVERRIDE;
+    void calcError(Mat &err) CV_OVERRIDE;
+    void calcJacobian(Mat &jac) CV_OVERRIDE;
+
+    Mat err1_, err2_; // NOTE(review): presumably scratch buffers for the overrides above - confirm in the implementation
+};
+
+
+/** @brief Bundle adjuster that expects affine transformation with 4 DOF
+represented in homogeneous coordinates in R for each camera param. Implements
+camera parameters refinement algorithm which minimizes sum of the reprojection
+error squares
+
+It estimates all transformation parameters. Refinement mask is ignored.
+
+@sa AffineBasedEstimator AffineBestOf2NearestMatcher BundleAdjusterAffine
+ */
+class CV_EXPORTS_W BundleAdjusterAffinePartial : public BundleAdjusterBase
+{
+public:
+    CV_WRAP BundleAdjusterAffinePartial() : BundleAdjusterBase(4, 2) {} // 4 parameters per camera, 2 error terms per match
+
+private:
+    void setUpInitialCameraParams(const std::vector<CameraParams> &cameras) CV_OVERRIDE;
+    void obtainRefinedCameraParams(std::vector<CameraParams> &cameras) const CV_OVERRIDE;
+    void calcError(Mat &err) CV_OVERRIDE;
+    void calcJacobian(Mat &jac) CV_OVERRIDE;
+
+    Mat err1_, err2_; // NOTE(review): presumably scratch buffers for the overrides above - confirm in the implementation
+};
+
+
+enum WaveCorrectKind // correction kind accepted by waveCorrect() and returned by autoDetectWaveCorrectKind()
+{
+    WAVE_CORRECT_HORIZ, // value 0; NOTE(review): presumably "make the panorama more horizontal" - confirm
+    WAVE_CORRECT_VERT, // value 1; NOTE(review): presumably "make the panorama more vertical" - confirm
+    WAVE_CORRECT_AUTO // value 2; NOTE(review): presumably resolved via autoDetectWaveCorrectKind() - confirm
+};
+
+/** @brief Tries to detect the wave correction kind depending
+on whether a panorama spans horizontally or vertically
+
+@param rmats Camera rotation matrices.
+@return The correction kind to use for this panorama
+ */
+CV_EXPORTS
+WaveCorrectKind autoDetectWaveCorrectKind(const std::vector<Mat> &rmats);
+
+/** @brief Tries to make panorama more horizontal (or vertical).
+
+@param rmats Camera rotation matrices (passed by non-const reference and marked CV_IN_OUT).
+@param kind Correction kind, see detail::WaveCorrectKind.
+ */
+void CV_EXPORTS_W waveCorrect(CV_IN_OUT std::vector<Mat> &rmats, WaveCorrectKind kind);
+
+
+//////////////////////////////////////////////////////////////////////////////
+// Auxiliary functions (declarations only; the definitions are not in this header)
+
+// Returns matches graph representation in DOT language
+String CV_EXPORTS_W matchesGraphAsString(std::vector<String> &paths, std::vector<MatchesInfo> &pairwise_matches,
+                                         float conf_threshold);
+
+CV_EXPORTS_W std::vector<int>  leaveBiggestComponent( // NOTE(review): presumably returns the indices of the images that were kept - confirm
+        std::vector<ImageFeatures> &features, // non-const reference: may be modified by the call
+        std::vector<MatchesInfo> &pairwise_matches, // non-const reference: may be modified by the call
+        float conf_threshold);
+
+void CV_EXPORTS findMaxSpanningTree(
+        int num_images, const std::vector<MatchesInfo> &pairwise_matches,
+        Graph &span_tree, std::vector<int> &centers); // span_tree and centers are the output arguments (non-const references)
+
+//! @} stitching_rotation
+
+} // namespace detail
+} // namespace cv
+
+#endif // OPENCV_STITCHING_MOTION_ESTIMATORS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/seam_finders.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/seam_finders.hpp
new file mode 100644
index 000000000000..9ccfd1442471
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/seam_finders.hpp
@@ -0,0 +1,291 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_STITCHING_SEAM_FINDERS_HPP
+#define OPENCV_STITCHING_SEAM_FINDERS_HPP
+
+#include <set>
+#include "opencv2/core.hpp"
+#include "opencv2/opencv_modules.hpp"
+
+namespace cv {
+namespace detail {
+
+//! @addtogroup stitching_seam
+//! @{
+
+/** @brief Base class for a seam estimator.
+ */
+class CV_EXPORTS_W SeamFinder
+{
+public:
+    CV_WRAP  virtual ~SeamFinder() {}
+    enum { NO, VORONOI_SEAM, DP_SEAM }; // NOTE(review): presumably the type codes accepted by createDefault() - confirm
+    /** @brief Estimates seams.
+
+    @param src Source images
+    @param corners Source image top-left corners
+    @param masks Source image masks to update
+     */
+    CV_WRAP virtual void find(const std::vector<UMat> &src, const std::vector<Point> &corners,
+                      CV_IN_OUT std::vector<UMat> &masks) = 0;
+    CV_WRAP static Ptr<SeamFinder> createDefault(int type); // factory; declared here, defined outside this header
+};
+
+/** @brief Stub seam estimator which does nothing.
+ */
+class CV_EXPORTS_W NoSeamFinder : public SeamFinder
+{
+public:
+    CV_WRAP void find(const std::vector<UMat>&, const std::vector<Point>&, CV_IN_OUT std::vector<UMat>&) CV_OVERRIDE {} // empty body: the masks are left exactly as passed in
+};
+
+/** @brief Base class for all pairwise seam estimators.
+ */
+class CV_EXPORTS_W PairwiseSeamFinder : public SeamFinder
+{
+public:
+    CV_WRAP virtual void find(const std::vector<UMat> &src, const std::vector<Point> &corners,
+                      CV_IN_OUT std::vector<UMat> &masks) CV_OVERRIDE;
+
+protected:
+    void run(); // NOTE(review): presumably drives findInPair() over image pairs - confirm in the implementation
+    /** @brief Resolves masks intersection of two specified images in the given ROI.
+
+    @param first First image index
+    @param second Second image index
+    @param roi Region of interest
+     */
+    virtual void findInPair(size_t first, size_t second, Rect roi) = 0;
+
+    std::vector<UMat> images_; // NOTE(review): presumably filled from the find() arguments - confirm
+    std::vector<Size> sizes_;
+    std::vector<Point> corners_;
+    std::vector<UMat> masks_;
+};
+
+/** @brief Voronoi diagram-based seam estimator.
+ */
+class CV_EXPORTS_W VoronoiSeamFinder : public PairwiseSeamFinder
+{
+public:
+    CV_WRAP virtual void find(const std::vector<UMat> &src, const std::vector<Point> &corners,
+                      CV_IN_OUT std::vector<UMat> &masks) CV_OVERRIDE;
+    virtual void find(const std::vector<Size> &size, const std::vector<Point> &corners, // separate overload taking image sizes instead of images
+                      std::vector<UMat> &masks); // not an override and not CV_WRAP-exported
+private:
+    void findInPair(size_t first, size_t second, Rect roi) CV_OVERRIDE;
+};
+
+
+class CV_EXPORTS_W DpSeamFinder : public SeamFinder // seam finder configured by a CostFunction; see find()
+{
+public:
+    enum CostFunction { COLOR, COLOR_GRAD };
+
+    DpSeamFinder(CostFunction costFunc = COLOR);
+    CV_WRAP DpSeamFinder(String costFunc );
+
+    CostFunction costFunction() const { return costFunc_; }
+    void setCostFunction(CostFunction val) { costFunc_ = val; }
+    CV_WRAP void setCostFunction(String val);
+
+    virtual void find(const std::vector<UMat> &src, const std::vector<Point> &corners,
+                      std::vector<UMat> &masks) CV_OVERRIDE;
+
+private:
+    enum ComponentState // bit flags: INTERS combines with FIRST or SECOND
+    {
+        FIRST = 1, SECOND = 2, INTERS = 4,
+        INTERS_FIRST = INTERS | FIRST,
+        INTERS_SECOND = INTERS | SECOND
+    };
+
+    class ImagePairLess // strict ordering of image index pairs by squared distance between the two image centres
+    {
+    public:
+        ImagePairLess(const std::vector<Mat> &images, const std::vector<Point> &corners)
+            : src_(images.data()), corners_(corners.data()) {} // data() is well-defined for empty vectors, unlike &v[0]
+
+        bool operator() (const std::pair<size_t, size_t> &l, const std::pair<size_t, size_t> &r) const
+        {
+            Point c1 = corners_[l.first] + Point(src_[l.first].cols / 2, src_[l.first].rows / 2);
+            Point c2 = corners_[l.second] + Point(src_[l.second].cols / 2, src_[l.second].rows / 2);
+            int d1 = (c1 - c2).dot(c1 - c2); // squared centre distance of the left pair
+
+            c1 = corners_[r.first] + Point(src_[r.first].cols / 2, src_[r.first].rows / 2);
+            c2 = corners_[r.second] + Point(src_[r.second].cols / 2, src_[r.second].rows / 2);
+            int d2 = (c1 - c2).dot(c1 - c2); // squared centre distance of the right pair
+
+            return d1 < d2;
+        }
+
+    private:
+        const Mat *src_; // borrowed: the vectors passed to the constructor must outlive this object
+        const Point *corners_; // borrowed, same lifetime requirement
+    };
+
+    class ClosePoints // predicate: true when two points are closer than minDist
+    {
+    public:
+        ClosePoints(int minDist) : minDist_(minDist) {}
+
+        bool operator() (const Point &p1, const Point &p2) const
+        {
+            int dist2 = (p1.x-p2.x) * (p1.x-p2.x) + (p1.y-p2.y) * (p1.y-p2.y);
+            return dist2 < minDist_ * minDist_; // compare squared values to avoid a square root
+        }
+
+    private:
+        int minDist_;
+    };
+
+    void process(
+            const Mat &image1, const Mat &image2, Point tl1, Point tl2,  Mat &mask1, Mat &mask2);
+
+    void findComponents();
+
+    void findEdges();
+
+    void resolveConflicts(
+            const Mat &image1, const Mat &image2, Point tl1, Point tl2, Mat &mask1, Mat &mask2);
+
+    void computeGradients(const Mat &image1, const Mat &image2);
+
+    bool hasOnlyOneNeighbor(int comp);
+
+    bool closeToContour(int y, int x, const Mat_<uchar> &contourMask);
+
+    bool getSeamTips(int comp1, int comp2, Point &p1, Point &p2);
+
+    void computeCosts(
+            const Mat &image1, const Mat &image2, Point tl1, Point tl2,
+            int comp, Mat_<float> &costV, Mat_<float> &costH);
+
+    bool estimateSeam(
+            const Mat &image1, const Mat &image2, Point tl1, Point tl2, int comp,
+            Point p1, Point p2, std::vector<Point> &seam, bool &isHorizontal);
+
+    void updateLabelsUsingSeam(
+            int comp1, int comp2, const std::vector<Point> &seam, bool isHorizontalSeam);
+
+    CostFunction costFunc_; // read and written by costFunction() / setCostFunction()
+
+    // processing images pair data
+    Point unionTl_, unionBr_;
+    Size unionSize_;
+    Mat_<uchar> mask1_, mask2_;
+    Mat_<uchar> contour1mask_, contour2mask_;
+    Mat_<float> gradx1_, grady1_;
+    Mat_<float> gradx2_, grady2_;
+
+    // components data
+    int ncomps_;
+    Mat_<int> labels_;
+    std::vector<ComponentState> states_;
+    std::vector<Point> tls_, brs_;
+    std::vector<std::vector<Point> > contours_;
+    std::set<std::pair<int, int> > edges_;
+};
+
+/** @brief Base class for all minimum graph-cut-based seam estimators; it only provides the CostType constants.
+ */
+class CV_EXPORTS GraphCutSeamFinderBase
+{
+public:
+    enum CostType { COST_COLOR, COST_COLOR_GRAD }; // used as the cost_type constructor argument of the derived finders
+};
+
+/** @brief Minimum graph cut-based seam estimator. See details in @cite V03 .
+ */
+class CV_EXPORTS_W GraphCutSeamFinder : public GraphCutSeamFinderBase, public SeamFinder
+{
+public:
+    GraphCutSeamFinder(int cost_type = COST_COLOR_GRAD, float terminal_cost = 10000.f,
+                       float bad_region_penalty = 1000.f);
+    CV_WRAP GraphCutSeamFinder(String cost_type,float terminal_cost = 10000.f, // string-typed variant, the one exported via CV_WRAP
+        float bad_region_penalty = 1000.f);
+
+    ~GraphCutSeamFinder();
+
+    CV_WRAP void find(const std::vector<UMat> &src, const std::vector<Point> &corners,
+                      CV_IN_OUT std::vector<UMat> &masks) CV_OVERRIDE;
+
+private:
+    // To avoid GCGraph dependency
+    class Impl; // forward declaration only; defined outside this header
+    Ptr<PairwiseSeamFinder> impl_; // NOTE(review): presumably holds the Impl instance that find() delegates to - confirm
+};
+
+
+#ifdef HAVE_OPENCV_CUDALEGACY // the class below is only declared when this macro is defined
+class CV_EXPORTS GraphCutSeamFinderGpu : public GraphCutSeamFinderBase, public PairwiseSeamFinder
+{
+public:
+    GraphCutSeamFinderGpu(int cost_type = COST_COLOR_GRAD, float terminal_cost = 10000.f,
+                          float bad_region_penalty = 1000.f)
+                          : cost_type_(cost_type), terminal_cost_(terminal_cost),
+                            bad_region_penalty_(bad_region_penalty) {} // only stores the three settings
+
+    void find(const std::vector<cv::UMat> &src, const std::vector<cv::Point> &corners,
+              std::vector<cv::UMat> &masks) CV_OVERRIDE;
+    void findInPair(size_t first, size_t second, Rect roi) CV_OVERRIDE;
+
+private:
+    void setGraphWeightsColor(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &mask1, const cv::Mat &mask2,
+                              cv::Mat &terminals, cv::Mat &leftT, cv::Mat &rightT, cv::Mat &top, cv::Mat &bottom);
+    void setGraphWeightsColorGrad(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &dx1, const cv::Mat &dx2,
+                                  const cv::Mat &dy1, const cv::Mat &dy2, const cv::Mat &mask1, const cv::Mat &mask2,
+                                  cv::Mat &terminals, cv::Mat &leftT, cv::Mat &rightT, cv::Mat &top, cv::Mat &bottom);
+    std::vector<Mat> dx_, dy_; // NOTE(review): presumably per-image gradients for the COST_COLOR_GRAD mode - confirm
+    int cost_type_; // constructor argument; defaults to COST_COLOR_GRAD
+    float terminal_cost_; // constructor argument; defaults to 10000.f
+    float bad_region_penalty_; // constructor argument; defaults to 1000.f
+};
+#endif // HAVE_OPENCV_CUDALEGACY
+
+//! @}
+
+} // namespace detail
+} // namespace cv
+
+#endif // OPENCV_STITCHING_SEAM_FINDERS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/timelapsers.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/timelapsers.hpp
new file mode 100644
index 000000000000..f6f3da8a8d50
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/timelapsers.hpp
@@ -0,0 +1,91 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#ifndef OPENCV_STITCHING_TIMELAPSERS_HPP
+#define OPENCV_STITCHING_TIMELAPSERS_HPP
+
+#include "opencv2/core.hpp"
+
+namespace cv {
+namespace detail {
+
+//! @addtogroup stitching
+//! @{
+
+//  Base Timelapser class, takes a sequence of images, applies appropriate shift, stores result in dst_.
+
+class CV_EXPORTS_W Timelapser
+{
+public:
+
+    enum {AS_IS, CROP}; // NOTE(review): presumably the type codes accepted by createDefault() - confirm
+
+    virtual ~Timelapser() {}
+
+    CV_WRAP static Ptr<Timelapser> createDefault(int type); // factory; declared here, defined outside this header
+
+    CV_WRAP virtual void initialize(const std::vector<Point> &corners, const std::vector<Size> &sizes);
+    CV_WRAP virtual void process(InputArray img, InputArray mask, Point tl);
+    CV_WRAP virtual const UMat& getDst() {return dst_;} // returns a reference to the member, not a copy
+
+protected:
+
+    virtual bool test_point(Point pt);
+
+    UMat dst_; // result image exposed through getDst()
+    Rect dst_roi_;
+};
+
+
+class CV_EXPORTS_W TimelapserCrop : public Timelapser // Timelapser variant that only replaces initialize()
+{
+public:
+    virtual void initialize(const std::vector<Point> &corners, const std::vector<Size> &sizes) CV_OVERRIDE; // declared here, defined outside this header
+};
+
+//! @}
+
+} // namespace detail
+} // namespace cv
+
+#endif // OPENCV_STITCHING_TIMELAPSERS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/util.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/util.hpp
new file mode 100644
index 000000000000..bf7a39098c19
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/util.hpp
@@ -0,0 +1,121 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_STITCHING_UTIL_HPP
+#define OPENCV_STITCHING_UTIL_HPP
+
+#include <list>
+#include "opencv2/core.hpp"
+
+namespace cv {
+namespace detail {
+
+//! @addtogroup stitching
+//! @{
+
+class CV_EXPORTS DisjointSets // disjoint-set structure over integer element ids
+{
+public:
+    DisjointSets(int elem_count = 0) { createOneElemSets(elem_count); } // construction delegates to createOneElemSets()
+
+    void createOneElemSets(int elem_count);
+    int findSetByElem(int elem);
+    int mergeSets(int set1, int set2);
+
+    std::vector<int> parent; // public state; NOTE(review): presumably the parent link per element - confirm
+    std::vector<int> size; // public state; NOTE(review): presumably the element count per set - confirm
+
+private:
+    std::vector<int> rank_;
+};
+
+
+struct CV_EXPORTS GraphEdge // weighted directed edge (from -> to)
+{
+    GraphEdge(int from, int to, float weight);
+    bool operator <(const GraphEdge& other) const { return weight < other.weight; } // ordering looks at weight only
+    bool operator >(const GraphEdge& other) const { return weight > other.weight; } // ordering looks at weight only
+
+    int from, to;
+    float weight;
+};
+
+inline GraphEdge::GraphEdge(int _from, int _to, float _weight) : from(_from), to(_to), weight(_weight) {} // plain member-wise initialisation
+
+
+class CV_EXPORTS Graph // graph stored as one std::list<GraphEdge> per vertex
+{
+public:
+    Graph(int num_vertices = 0) { create(num_vertices); }
+    void create(int num_vertices) { edges_.assign(num_vertices, std::list<GraphEdge>()); } // resets to num_vertices empty edge lists
+    int numVertices() const { return static_cast<int>(edges_.size()); }
+    void addEdge(int from, int to, float weight);
+    template <typename B> B forEach(B body) const; // declared only; NOTE(review): presumably defined in util_inl.hpp - confirm
+    template <typename B> B walkBreadthFirst(int from, B body) const; // declared only; NOTE(review): presumably defined in util_inl.hpp - confirm
+
+private:
+    std::vector< std::list<GraphEdge> > edges_; // one edge list per vertex; its size is the vertex count
+};
+
+
+//////////////////////////////////////////////////////////////////////////////
+// Auxiliary functions (declarations only; the definitions are not in this header)
+
+CV_EXPORTS_W bool overlapRoi(Point tl1, Point tl2, Size sz1, Size sz2, Rect &roi); // roi is the output argument (non-const reference)
+CV_EXPORTS_W Rect resultRoi(const std::vector<Point> &corners, const std::vector<UMat> &images);
+CV_EXPORTS_W Rect resultRoi(const std::vector<Point> &corners, const std::vector<Size> &sizes); // overload taking sizes instead of images
+CV_EXPORTS_W Rect resultRoiIntersection(const std::vector<Point> &corners, const std::vector<Size> &sizes);
+CV_EXPORTS_W Point resultTl(const std::vector<Point> &corners);
+
+// Returns random 'count' element subset of the {0,1,...,size-1} set
+CV_EXPORTS_W void selectRandomSubset(int count, int size, std::vector<int> &subset);
+
+CV_EXPORTS_W int& stitchingLogLevel(); // returns a mutable reference, so callers can assign through it
+
+//! @}
+
+} // namespace detail
+} // namespace cv
+
+#include "util_inl.hpp"
+
+#endif // OPENCV_STITCHING_UTIL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/util_inl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/util_inl.hpp
new file mode 100644
index 000000000000..dafab8b81180
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/util_inl.hpp
@@ -0,0 +1,131 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_STITCHING_UTIL_INL_HPP
+#define OPENCV_STITCHING_UTIL_INL_HPP
+
+#include <queue>
+#include "opencv2/core.hpp"
+#include "util.hpp" // Make your IDE see declarations
+
+//! @cond IGNORED
+
+namespace cv {
+namespace detail {
+
+template <typename B>
+B Graph::forEach(B body) const
+{
+    for (int v = 0; v < numVertices(); ++v)  // vertices are visited in index order
+    {
+        const std::list<GraphEdge>& adjacent = edges_[v];
+        for (std::list<GraphEdge>::const_iterator it = adjacent.begin(); it != adjacent.end(); ++it)
+            body(*it);  // edges are handed over in their stored list order
+    }
+    return body;  // the functor is returned by value, including any state it accumulated
+}
+
+
+template <typename B>
+B Graph::walkBreadthFirst(int from, B body) const
+{
+    // Breadth-first walk starting at `from`; body(edge) is called for each edge that reaches a new vertex.
+    CV_Assert(from >= 0 && from < numVertices());  // was[from] below would be out of bounds otherwise
+    std::vector<bool> was(numVertices(), false);   // was[v]: vertex v has already been discovered
+    std::queue<int> vertices;                      // discovered vertices still waiting to be expanded
+    was[from] = true;
+    vertices.push(from);
+
+    while (!vertices.empty())
+    {
+        int vertex = vertices.front();
+        vertices.pop();
+
+        std::list<GraphEdge>::const_iterator edge = edges_[vertex].begin();
+        for (; edge != edges_[vertex].end(); ++edge)
+        {
+            if (was[edge->to])
+                continue;  // target already discovered through an earlier edge
+            body(*edge);
+            was[edge->to] = true;
+            vertices.push(edge->to);
+        }
+    }
+
+    return body;  // the functor is returned by value, including any state it accumulated
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+// Some auxiliary math functions
+
+static inline
+float normL2(const Point3f& a)
+{
+    return a.dot(a);  // x*x + y*y + z*z: the SQUARED Euclidean length, no square root is taken
+}
+
+
+static inline float normL2(const Point3f& a, const Point3f& b)
+{
+    const Point3f delta = a - b;
+    return normL2(delta);  // squared Euclidean distance: the one-argument overload takes no square root
+}
+
+
+static inline double normL2sq(const Mat &r)
+{
+    const double selfDot = r.dot(r);  // dot product of r with itself, i.e. its squared L2 norm
+    return selfDot;
+}
+
+
+static inline int sqr(int x) { return x * x; }  // x squared, one overload per arithmetic type; no overflow check
+static inline float sqr(float x) { return x * x; }  // float overload: the product is computed and returned as float
+static inline double sqr(double x) { return x * x; }  // double overload
+
+} // namespace detail
+} // namespace cv
+
+//! @endcond
+
+#endif // OPENCV_STITCHING_UTIL_INL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/warpers.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/warpers.hpp
new file mode 100644
index 000000000000..d0d7869d4509
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/warpers.hpp
@@ -0,0 +1,706 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_STITCHING_WARPERS_HPP
+#define OPENCV_STITCHING_WARPERS_HPP
+
+#include "opencv2/core.hpp"
+#include "opencv2/core/cuda.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/opencv_modules.hpp"
+
+namespace cv {
+namespace detail {
+
+//! @addtogroup stitching_warp
+//! @{
+
+/** @brief Rotation-only model image warper interface.
+ */
+class CV_EXPORTS RotationWarper
+{
+public:
+    virtual ~RotationWarper() {}
+
+    /** @brief Projects the image point.
+
+    @param pt Source point
+    @param K Camera intrinsic parameters
+    @param R Camera rotation matrix
+    @return Projected point
+     */
+    virtual Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R) = 0;
+
+    /** @brief Projects the image point backward.
+
+    @param pt Projected point
+    @param K Camera intrinsic parameters
+    @param R Camera rotation matrix
+    @return Backward-projected point
+    */
+#if CV_VERSION_MAJOR == 4
+    virtual Point2f warpPointBackward(const Point2f& pt, InputArray K, InputArray R)
+    {
+        CV_UNUSED(pt); CV_UNUSED(K); CV_UNUSED(R);
+        CV_Error(Error::StsNotImplemented, "");  // 4.x default: a subclass that does not override this raises an error
+    }
+#else
+    virtual Point2f warpPointBackward(const Point2f& pt, InputArray K, InputArray R) = 0;  // other major versions: must be overridden
+#endif
+
+    /** @brief Builds the projection maps according to the given camera data.
+
+    @param src_size Source image size
+    @param K Camera intrinsic parameters
+    @param R Camera rotation matrix
+    @param xmap Projection map for the x axis
+    @param ymap Projection map for the y axis
+    @return Projected image minimum bounding box
+     */
+    virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) = 0;
+
+    /** @brief Projects the image.
+
+    @param src Source image
+    @param K Camera intrinsic parameters
+    @param R Camera rotation matrix
+    @param interp_mode Interpolation mode
+    @param border_mode Border extrapolation mode
+    @param dst Projected image
+    @return Projected image top-left corner
+     */
+    virtual Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                       CV_OUT OutputArray dst) = 0;
+
+    /** @brief Projects the image backward.
+
+    @param src Projected image
+    @param K Camera intrinsic parameters
+    @param R Camera rotation matrix
+    @param interp_mode Interpolation mode
+    @param border_mode Border extrapolation mode
+    @param dst_size Backward-projected image size
+    @param dst Backward-projected image
+     */
+    virtual void warpBackward(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                              Size dst_size, CV_OUT OutputArray dst) = 0;
+
+    /** @brief Computes the bounding box of the projected image.
+    @param src_size Source image bounding box
+    @param K Camera intrinsic parameters
+    @param R Camera rotation matrix
+    @return Projected image minimum bounding box
+     */
+    virtual Rect warpRoi(Size src_size, InputArray K, InputArray R) = 0;
+
+    virtual float getScale() const { return 1.f; }  // base default: a fixed scale of 1
+    virtual void setScale(float) {}  // base default: the value is ignored
+};
+
+/** @brief Base class for warping logic implementation.
+ */
+struct CV_EXPORTS_W_SIMPLE ProjectorBase
+{
+    void setCameraParams(InputArray K = Mat::eye(3, 3, CV_32F),  // default: 3x3 CV_32F identity
+                         InputArray R = Mat::eye(3, 3, CV_32F),  // default: 3x3 CV_32F identity
+                         InputArray T = Mat::zeros(3, 1, CV_32F));  // default: 3x1 CV_32F zero vector
+
+    float scale;  // assigned by the warper constructors from their `scale` argument
+    float k[9];  // NOTE(review): presumably the 3x3 matrix K flattened row by row - confirm in setCameraParams
+    float rinv[9];  // NOTE(review): presumably the inverse of R - confirm
+    float r_kinv[9];  // NOTE(review): presumably R * K^-1 - confirm
+    float k_rinv[9];  // NOTE(review): presumably K * R^-1 - confirm
+    float t[3];  // NOTE(review): presumably the 3x1 input T - confirm
+};
+
+/** @brief Base class for rotation-based warper using a detail::ProjectorBase derived class.
+ */
+template <class P>
+class CV_EXPORTS_TEMPLATE RotationWarperBase : public RotationWarper
+{
+public:
+    Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R) CV_OVERRIDE;
+
+    Point2f warpPointBackward(const Point2f &pt, InputArray K, InputArray R) CV_OVERRIDE;
+
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) CV_OVERRIDE;
+
+    Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               OutputArray dst) CV_OVERRIDE;
+
+    void warpBackward(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                      Size dst_size, OutputArray dst) CV_OVERRIDE;
+
+    Rect warpRoi(Size src_size, InputArray K, InputArray R) CV_OVERRIDE;
+
+    float getScale() const  CV_OVERRIDE{ return projector_.scale; }  // the scale is kept in the projector, not in the warper
+    void setScale(float val) CV_OVERRIDE { projector_.scale = val; }
+
+protected:
+
+    // Detects ROI of the destination image. It's correct for any projection.
+    virtual void detectResultRoi(Size src_size, Point &dst_tl, Point &dst_br);
+
+    // Detects ROI of the destination image by walking over image border.
+    // Correctness for any projection isn't guaranteed.
+    void detectResultRoiByBorder(Size src_size, Point &dst_tl, Point &dst_br);
+
+    P projector_;  // projector instance of the template type P; derived warpers configure it in their constructors
+};
+
+/** @brief Projector used by PlaneWarper (through RotationWarperBase<PlaneProjector>). */
+struct CV_EXPORTS PlaneProjector : ProjectorBase
+{
+    void mapForward(float x, float y, float &u, float &v);  // (x, y) by value; (u, v) by non-const reference
+    void mapBackward(float u, float v, float &x, float &y);  // (u, v) by value; (x, y) by non-const reference
+};
+
+/** @brief Warper that maps an image onto the z = 1 plane. Besides the RotationWarper overrides it
+    adds overloads taking an extra input T (NOTE(review): presumably a translation - confirm). */
+class CV_EXPORTS PlaneWarper : public RotationWarperBase<PlaneProjector>
+{
+public:
+    /** @brief Construct an instance of the plane warper class.
+
+    @param scale Projected image scale multiplier
+     */
+    PlaneWarper(float scale = 1.f) { projector_.scale = scale; }
+
+    Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R) CV_OVERRIDE;
+    Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R, InputArray T);
+
+    Point2f warpPointBackward(const Point2f& pt, InputArray K, InputArray R) CV_OVERRIDE;
+    Point2f warpPointBackward(const Point2f& pt, InputArray K, InputArray R, InputArray T);
+
+    virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, CV_OUT OutputArray xmap, CV_OUT OutputArray ymap);
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, CV_OUT OutputArray xmap, CV_OUT OutputArray ymap) CV_OVERRIDE;
+
+    Point warp(InputArray src, InputArray K, InputArray R,
+               int interp_mode, int border_mode, CV_OUT OutputArray dst) CV_OVERRIDE;
+    virtual Point warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
+        CV_OUT OutputArray dst);
+
+    Rect warpRoi(Size src_size, InputArray K, InputArray R) CV_OVERRIDE;
+    Rect warpRoi(Size src_size, InputArray K, InputArray R, InputArray T);
+
+protected:
+    void detectResultRoi(Size src_size, Point &dst_tl, Point &dst_br) CV_OVERRIDE;
+};
+
+
+/** @brief Affine warper that uses rotations and translations
+
+ Uses affine transformation in homogeneous coordinates to represent both rotation and
+ translation in camera rotation matrix.
+ */
+class CV_EXPORTS AffineWarper : public PlaneWarper
+{
+public:
+    /** @brief Construct an instance of the affine warper class.
+
+    @param scale Projected image scale multiplier
+     */
+    AffineWarper(float scale = 1.f) : PlaneWarper(scale) {}
+
+    /** @brief Projects the image point.
+
+    @param pt Source point
+    @param K Camera intrinsic parameters
+    @param H Camera extrinsic parameters
+    @return Projected point
+     */
+    Point2f warpPoint(const Point2f &pt, InputArray K, InputArray H) CV_OVERRIDE;
+
+    /** @brief Projects the image point backward.
+
+    @param pt Projected point
+    @param K Camera intrinsic parameters
+    @param H Camera extrinsic parameters
+    @return Backward-projected point
+    */
+    Point2f warpPointBackward(const Point2f &pt, InputArray K, InputArray H) CV_OVERRIDE;
+
+    /** @brief Builds the projection maps according to the given camera data.
+
+    @param src_size Source image size
+    @param K Camera intrinsic parameters
+    @param H Camera extrinsic parameters
+    @param xmap Projection map for the x axis
+    @param ymap Projection map for the y axis
+    @return Projected image minimum bounding box
+     */
+    Rect buildMaps(Size src_size, InputArray K, InputArray H, OutputArray xmap, OutputArray ymap) CV_OVERRIDE;
+
+    /** @brief Projects the image.
+
+    @param src Source image
+    @param K Camera intrinsic parameters
+    @param H Camera extrinsic parameters
+    @param interp_mode Interpolation mode
+    @param border_mode Border extrapolation mode
+    @param dst Projected image
+    @return Projected image top-left corner
+     */
+    Point warp(InputArray src, InputArray K, InputArray H,
+               int interp_mode, int border_mode, OutputArray dst) CV_OVERRIDE;
+
+    /** @brief Computes the bounding box of the projected image.
+    @param src_size Source image bounding box
+    @param K Camera intrinsic parameters
+    @param H Camera extrinsic parameters
+    @return Projected image minimum bounding box
+     */
+    Rect warpRoi(Size src_size, InputArray K, InputArray H) CV_OVERRIDE;
+
+protected:
+    /** @brief Extracts rotation and translation matrices from matrix H representing
+        affine transformation in homogeneous coordinates
+     */
+    void getRTfromHomogeneous(InputArray H, Mat &R, Mat &T);
+};
+
+/** @brief Projector used by SphericalWarper; both mapping methods are marked CV_WRAP. */
+struct CV_EXPORTS_W_SIMPLE SphericalProjector : ProjectorBase
+{
+    CV_WRAP void mapForward(float x, float y, float &u, float &v);  // (x, y) by value; (u, v) by non-const reference
+    CV_WRAP void mapBackward(float u, float v, float &x, float &y);  // (u, v) by value; (x, y) by non-const reference
+};
+
+
+/** @brief Warper that maps an image onto the unit sphere located at the origin.
+
+ Projects image onto unit sphere with origin at (0, 0, 0) and radius scale, measured in pixels.
+ A 360 panorama would therefore have a resulting width of 2 * scale * PI pixels.
+ Poles are located at (0, -1, 0) and (0, 1, 0) points.
+*/
+class CV_EXPORTS SphericalWarper : public RotationWarperBase<SphericalProjector>
+{
+public:
+    /** @brief Construct an instance of the spherical warper class.
+
+    @param scale Radius of the projected sphere, in pixels. An image spanning the
+                 whole sphere will have a width of 2 * scale * PI pixels.
+     */
+    SphericalWarper(float scale) { projector_.scale = scale; }
+
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) CV_OVERRIDE;  // replaces the RotationWarperBase version
+    Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst) CV_OVERRIDE;  // replaces the RotationWarperBase version
+protected:
+    void detectResultRoi(Size src_size, Point &dst_tl, Point &dst_br) CV_OVERRIDE;  // sphere-specific ROI detection, defined out of line
+};
+
+/** @brief Projector used by CylindricalWarper (through RotationWarperBase<CylindricalProjector>). */
+struct CV_EXPORTS CylindricalProjector : ProjectorBase
+{
+    void mapForward(float x, float y, float &u, float &v);  // (x, y) by value; (u, v) by non-const reference
+    void mapBackward(float u, float v, float &x, float &y);  // (u, v) by value; (x, y) by non-const reference
+};
+
+
+/** @brief Warper that maps an image onto the x\*x + z\*z = 1 cylinder.
+ */
+class CV_EXPORTS CylindricalWarper : public RotationWarperBase<CylindricalProjector>
+{
+public:
+    /** @brief Construct an instance of the cylindrical warper class.
+
+    @param scale Projected image scale multiplier
+     */
+    CylindricalWarper(float scale) { projector_.scale = scale; }
+
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) CV_OVERRIDE;  // replaces the RotationWarperBase version
+    Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst) CV_OVERRIDE;  // replaces the RotationWarperBase version
+protected:
+    void detectResultRoi(Size src_size, Point &dst_tl, Point &dst_br) CV_OVERRIDE
+    {
+        RotationWarperBase<CylindricalProjector>::detectResultRoiByBorder(src_size, dst_tl, dst_br);  // border walk replaces the generic detection
+    }
+};
+
+/** @brief Projector used by FisheyeWarper (through RotationWarperBase<FisheyeProjector>). */
+struct CV_EXPORTS FisheyeProjector : ProjectorBase
+{
+    void mapForward(float x, float y, float &u, float &v);  // (x, y) by value; (u, v) by non-const reference
+    void mapBackward(float u, float v, float &x, float &y);  // (u, v) by value; (x, y) by non-const reference
+};
+
+/** @brief Warper built on FisheyeProjector; it adds nothing to RotationWarperBase except the constructor. */
+class CV_EXPORTS FisheyeWarper : public RotationWarperBase<FisheyeProjector>
+{
+public:
+    FisheyeWarper(float scale) { projector_.scale = scale; }  // only stores the scale in the projector
+};
+
+/** @brief Projector used by StereographicWarper (through RotationWarperBase<StereographicProjector>). */
+struct CV_EXPORTS StereographicProjector : ProjectorBase
+{
+    void mapForward(float x, float y, float &u, float &v);  // (x, y) by value; (u, v) by non-const reference
+    void mapBackward(float u, float v, float &x, float &y);  // (u, v) by value; (x, y) by non-const reference
+};
+
+/** @brief Warper built on StereographicProjector; it adds nothing to RotationWarperBase except the constructor. */
+class CV_EXPORTS StereographicWarper : public RotationWarperBase<StereographicProjector>
+{
+public:
+    StereographicWarper(float scale) { projector_.scale = scale; }  // only stores the scale in the projector
+};
+
+/** @brief Projector used by CompressedRectilinearWarper; adds two parameters a and b to ProjectorBase. */
+struct CV_EXPORTS CompressedRectilinearProjector : ProjectorBase
+{
+    float a, b;  // assigned by the CompressedRectilinearWarper constructor from its A and B arguments
+
+    void mapForward(float x, float y, float &u, float &v);  // (x, y) by value; (u, v) by non-const reference
+    void mapBackward(float u, float v, float &x, float &y);  // (u, v) by value; (x, y) by non-const reference
+};
+
+/** @brief Warper built on CompressedRectilinearProjector; A and B default to 1 when omitted. */
+class CV_EXPORTS CompressedRectilinearWarper : public RotationWarperBase<CompressedRectilinearProjector>
+{
+public:
+    CompressedRectilinearWarper(float scale, float A = 1, float B = 1)
+    {
+        projector_.a = A;  // the three arguments are copied into the projector unchanged
+        projector_.b = B;
+        projector_.scale = scale;
+    }
+};
+
+/** @brief Projector used by CompressedRectilinearPortraitWarper; adds two parameters a and b to ProjectorBase. */
+struct CV_EXPORTS CompressedRectilinearPortraitProjector : ProjectorBase
+{
+    float a, b;  // assigned by the CompressedRectilinearPortraitWarper constructor from its A and B arguments
+
+    void mapForward(float x, float y, float &u, float &v);  // (x, y) by value; (u, v) by non-const reference
+    void mapBackward(float u, float v, float &x, float &y);  // (u, v) by value; (x, y) by non-const reference
+};
+
+/** @brief Warper built on CompressedRectilinearPortraitProjector; A and B default to 1 when omitted. */
+class CV_EXPORTS CompressedRectilinearPortraitWarper : public RotationWarperBase<CompressedRectilinearPortraitProjector>
+{
+public:
+   CompressedRectilinearPortraitWarper(float scale, float A = 1, float B = 1)
+   {
+       projector_.a = A;  // the three arguments are copied into the projector unchanged
+       projector_.b = B;
+       projector_.scale = scale;
+   }
+};
+
+/** @brief Projector used by PaniniWarper; adds two parameters a and b to ProjectorBase. */
+struct CV_EXPORTS PaniniProjector : ProjectorBase
+{
+    float a, b;  // assigned by the PaniniWarper constructor from its A and B arguments
+
+    void mapForward(float x, float y, float &u, float &v);  // (x, y) by value; (u, v) by non-const reference
+    void mapBackward(float u, float v, float &x, float &y);  // (u, v) by value; (x, y) by non-const reference
+};
+
+/** @brief Warper built on PaniniProjector; A and B default to 1 when omitted. */
+class CV_EXPORTS PaniniWarper : public RotationWarperBase<PaniniProjector>
+{
+public:
+   PaniniWarper(float scale, float A = 1, float B = 1)
+   {
+       projector_.a = A;  // the three arguments are copied into the projector unchanged
+       projector_.b = B;
+       projector_.scale = scale;
+   }
+};
+
+/** @brief Projector used by PaniniPortraitWarper; adds two parameters a and b to ProjectorBase. */
+struct CV_EXPORTS PaniniPortraitProjector : ProjectorBase
+{
+    float a, b;  // assigned by the PaniniPortraitWarper constructor from its A and B arguments
+
+    void mapForward(float x, float y, float &u, float &v);  // (x, y) by value; (u, v) by non-const reference
+    void mapBackward(float u, float v, float &x, float &y);  // (u, v) by value; (x, y) by non-const reference
+};
+
+/** @brief Warper built on PaniniPortraitProjector; A and B default to 1 when omitted. */
+class CV_EXPORTS PaniniPortraitWarper : public RotationWarperBase<PaniniPortraitProjector>
+{
+public:
+   PaniniPortraitWarper(float scale, float A = 1, float B = 1)
+   {
+       projector_.a = A;  // the three arguments are copied into the projector unchanged
+       projector_.b = B;
+       projector_.scale = scale;
+   }
+
+};
+
+/** @brief Projector used by MercatorWarper (through RotationWarperBase<MercatorProjector>). */
+struct CV_EXPORTS MercatorProjector : ProjectorBase
+{
+    void mapForward(float x, float y, float &u, float &v);  // (x, y) by value; (u, v) by non-const reference
+    void mapBackward(float u, float v, float &x, float &y);  // (u, v) by value; (x, y) by non-const reference
+};
+
+/** @brief Warper built on MercatorProjector; it adds nothing to RotationWarperBase except the constructor. */
+class CV_EXPORTS MercatorWarper : public RotationWarperBase<MercatorProjector>
+{
+public:
+    MercatorWarper(float scale) { projector_.scale = scale; }  // only stores the scale in the projector
+};
+
+/** @brief Projector used by TransverseMercatorWarper (through RotationWarperBase<TransverseMercatorProjector>). */
+struct CV_EXPORTS TransverseMercatorProjector : ProjectorBase
+{
+    void mapForward(float x, float y, float &u, float &v);  // (x, y) by value; (u, v) by non-const reference
+    void mapBackward(float u, float v, float &x, float &y);  // (u, v) by value; (x, y) by non-const reference
+};
+
+/** @brief Warper built on TransverseMercatorProjector; it adds nothing to RotationWarperBase except the constructor. */
+class CV_EXPORTS TransverseMercatorWarper : public RotationWarperBase<TransverseMercatorProjector>
+{
+public:
+    TransverseMercatorWarper(float scale) { projector_.scale = scale; }  // only stores the scale in the projector
+};
+
+/** @brief PlaneWarper variant whose OutputArray overrides stage data in cuda::GpuMat members and call the GpuMat overloads. */
+class CV_EXPORTS PlaneWarperGpu : public PlaneWarper
+{
+public:
+    PlaneWarperGpu(float scale = 1.f) : PlaneWarper(scale) {}
+
+// Disables MSVC warning C4702 (unreachable code) for the inline overrides below; originally noted as seen with Ninja.
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(push)
+#pragma warning(disable: 4702)
+#endif
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) CV_OVERRIDE
+    {
+        Rect result = buildMaps(src_size, K, R, d_xmap_, d_ymap_);  // GpuMat arguments select the cuda::GpuMat& overload, not this function
+        d_xmap_.download(xmap);
+        d_ymap_.download(ymap);
+        return result;
+    }
+
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray xmap, OutputArray ymap) CV_OVERRIDE
+    {
+        Rect result = buildMaps(src_size, K, R, T, d_xmap_, d_ymap_);  // GpuMat arguments select the cuda::GpuMat& overload, not this function
+        d_xmap_.download(xmap);
+        d_ymap_.download(ymap);
+        return result;
+    }
+
+    Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               OutputArray dst) CV_OVERRIDE
+    {
+        d_src_.upload(src);
+        Point result = warp(d_src_, K, R, interp_mode, border_mode, d_dst_);  // GpuMat arguments select the cuda::GpuMat overload
+        d_dst_.download(dst);
+        return result;
+    }
+
+    Point warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
+               OutputArray dst) CV_OVERRIDE
+    {
+        d_src_.upload(src);
+        Point result = warp(d_src_, K, R, T, interp_mode, border_mode, d_dst_);  // GpuMat arguments select the cuda::GpuMat overload
+        d_dst_.download(dst);
+        return result;
+    }
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(pop)
+#endif
+
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap);
+
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, cuda::GpuMat & xmap, cuda::GpuMat & ymap);
+
+    Point warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               cuda::GpuMat & dst);
+
+    Point warp(const cuda::GpuMat & src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
+               cuda::GpuMat & dst);
+
+private:
+    cuda::GpuMat d_xmap_, d_ymap_, d_src_, d_dst_;  // staging buffers reused by the OutputArray overrides above
+};
+
+/** @brief SphericalWarper variant whose OutputArray overrides stage data in cuda::GpuMat members and call the GpuMat overloads. */
+class CV_EXPORTS SphericalWarperGpu : public SphericalWarper
+{
+public:
+    SphericalWarperGpu(float scale) : SphericalWarper(scale) {}
+
+// Disables MSVC warning C4702 (unreachable code) for the inline overrides below; originally noted as seen with Ninja.
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(push)
+#pragma warning(disable: 4702)
+#endif
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) CV_OVERRIDE
+    {
+        Rect result = buildMaps(src_size, K, R, d_xmap_, d_ymap_);  // GpuMat arguments select the cuda::GpuMat& overload, not this function
+        d_xmap_.download(xmap);
+        d_ymap_.download(ymap);
+        return result;
+    }
+
+    Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               OutputArray dst) CV_OVERRIDE
+    {
+        d_src_.upload(src);
+        Point result = warp(d_src_, K, R, interp_mode, border_mode, d_dst_);  // GpuMat arguments select the cuda::GpuMat overload
+        d_dst_.download(dst);
+        return result;
+    }
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(pop)
+#endif
+
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap);
+
+    Point warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               cuda::GpuMat & dst);
+
+private:
+    cuda::GpuMat d_xmap_, d_ymap_, d_src_, d_dst_;  // staging buffers reused by the OutputArray overrides above
+};
+
+/** @brief CylindricalWarper variant whose OutputArray overrides stage data in cuda::GpuMat members and call the GpuMat overloads. */
+class CV_EXPORTS CylindricalWarperGpu : public CylindricalWarper
+{
+public:
+    CylindricalWarperGpu(float scale) : CylindricalWarper(scale) {}
+
+// Disables MSVC warning C4702 (unreachable code) for the inline overrides below; originally noted as seen with Ninja.
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(push)
+#pragma warning(disable: 4702)
+#endif
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) CV_OVERRIDE
+    {
+        Rect result = buildMaps(src_size, K, R, d_xmap_, d_ymap_);  // GpuMat arguments select the cuda::GpuMat& overload, not this function
+        d_xmap_.download(xmap);
+        d_ymap_.download(ymap);
+        return result;
+    }
+
+    Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               OutputArray dst) CV_OVERRIDE
+    {
+        d_src_.upload(src);
+        Point result = warp(d_src_, K, R, interp_mode, border_mode, d_dst_);  // GpuMat arguments select the cuda::GpuMat overload
+        d_dst_.download(dst);
+        return result;
+    }
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(pop)
+#endif
+
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap);
+
+    Point warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               cuda::GpuMat & dst);
+
+private:
+    cuda::GpuMat d_xmap_, d_ymap_, d_src_, d_dst_;  // staging buffers reused by the OutputArray overrides above
+};
+
+/** @brief Projector used by SphericalPortraitWarper (through RotationWarperBase<SphericalPortraitProjector>). */
+struct CV_EXPORTS SphericalPortraitProjector : ProjectorBase
+{
+    void mapForward(float x, float y, float &u, float &v);  // (x, y) by value; (u, v) by non-const reference
+    void mapBackward(float u, float v, float &x, float &y);  // (u, v) by value; (x, y) by non-const reference
+};
+
+
+// Projects the image onto the unit sphere with origin at (0, 0, 0).
+// Unlike SphericalWarper, the poles are at (1, 0, 0) and (-1, 0, 0), not at (0, -1, 0) and (0, 1, 0).
+class CV_EXPORTS SphericalPortraitWarper : public RotationWarperBase<SphericalPortraitProjector>
+{
+public:
+    SphericalPortraitWarper(float scale) { projector_.scale = scale; }  // only stores the scale in the projector
+
+protected:
+    void detectResultRoi(Size src_size, Point &dst_tl, Point &dst_br) CV_OVERRIDE;  // replaces the generic ROI detection; defined out of line
+};
+
+struct CV_EXPORTS CylindricalPortraitProjector : ProjectorBase  // projector used by CylindricalPortraitWarper
+{
+    void mapForward(float x, float y, float &u, float &v);  // (x, y) by value; (u, v) by non-const reference
+    void mapBackward(float u, float v, float &x, float &y);  // (u, v) by value; (x, y) by non-const reference
+};
+
+/** @brief Warper built on CylindricalPortraitProjector; ROI detection is done by walking the image border. */
+class CV_EXPORTS CylindricalPortraitWarper : public RotationWarperBase<CylindricalPortraitProjector>
+{
+public:
+    CylindricalPortraitWarper(float scale) { projector_.scale = scale; }  // only stores the scale in the projector
+
+protected:
+    void detectResultRoi(Size src_size, Point &dst_tl, Point &dst_br) CV_OVERRIDE
+    {
+        RotationWarperBase<CylindricalPortraitProjector>::detectResultRoiByBorder(src_size, dst_tl, dst_br);  // border walk replaces the generic detection
+    }
+};
+
+struct CV_EXPORTS PlanePortraitProjector : ProjectorBase  // projector used by PlanePortraitWarper
+{
+    void mapForward(float x, float y, float &u, float &v);  // (x, y) by value; (u, v) by non-const reference
+    void mapBackward(float u, float v, float &x, float &y);  // (u, v) by value; (x, y) by non-const reference
+};
+
+/** @brief Warper built on PlanePortraitProjector; ROI detection is done by walking the image border. */
+class CV_EXPORTS PlanePortraitWarper : public RotationWarperBase<PlanePortraitProjector>
+{
+public:
+    PlanePortraitWarper(float scale) { projector_.scale = scale; }  // only stores the scale in the projector
+
+protected:
+    void detectResultRoi(Size src_size, Point &dst_tl, Point &dst_br) CV_OVERRIDE
+    {
+        RotationWarperBase<PlanePortraitProjector>::detectResultRoiByBorder(src_size, dst_tl, dst_br);  // border walk replaces the generic detection
+    }
+};
+
+//! @} stitching_warp
+
+} // namespace detail
+} // namespace cv
+
+#include "warpers_inl.hpp"
+
+#endif // OPENCV_STITCHING_WARPERS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/warpers_inl.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/warpers_inl.hpp
new file mode 100644
index 000000000000..72b5c086725b
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/detail/warpers_inl.hpp
@@ -0,0 +1,782 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_STITCHING_WARPERS_INL_HPP
+#define OPENCV_STITCHING_WARPERS_INL_HPP
+
+#include "opencv2/core.hpp"
+#include "warpers.hpp" // Make your IDE see declarations
+#include <limits>
+
+//! @cond IGNORED
+
+namespace cv {
+namespace detail {
+
+template <class P>
+Point2f RotationWarperBase<P>::warpPoint(const Point2f &pt, InputArray K, InputArray R)
+{   // Forward-maps a single point: loads (K, R) into the projector, then calls mapForward.
+    projector_.setCameraParams(K, R);
+    Point2f uv; // filled through mapForward's reference outputs
+    projector_.mapForward(pt.x, pt.y, uv.x, uv.y);
+    return uv;
+}
+
+template <class P>
+Point2f RotationWarperBase<P>::warpPointBackward(const Point2f& pt, InputArray K, InputArray R)
+{   // Backward-maps a single point: loads (K, R) into the projector, then calls mapBackward.
+    projector_.setCameraParams(K, R);
+    Point2f xy; // filled through mapBackward's reference outputs
+    projector_.mapBackward(pt.x, pt.y, xy.x, xy.y);
+    return xy;
+}
+
+template <class P>
+Rect RotationWarperBase<P>::buildMaps(Size src_size, InputArray K, InputArray R, OutputArray _xmap, OutputArray _ymap)
+{   // Fills _xmap/_ymap with the source coordinates (via mapBackward) of every pixel in the result ROI.
+    projector_.setCameraParams(K, R);
+
+    Point dst_tl, dst_br; // treated as inclusive corners below: "+ 1" sizes, "<=" loop bounds
+    detectResultRoi(src_size, dst_tl, dst_br);
+
+    _xmap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
+    _ymap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
+
+    Mat xmap = _xmap.getMat(), ymap = _ymap.getMat();
+
+    float x, y;
+    for (int v = dst_tl.y; v <= dst_br.y; ++v)
+    {
+        for (int u = dst_tl.x; u <= dst_br.x; ++u)
+        {
+            projector_.mapBackward(static_cast<float>(u), static_cast<float>(v), x, y);
+            xmap.at<float>(v - dst_tl.y, u - dst_tl.x) = x; // maps are indexed relative to dst_tl
+            ymap.at<float>(v - dst_tl.y, u - dst_tl.x) = y;
+        }
+    }
+    // NOTE(review): built from the same corners without "+ 1" (warpRoi() adds 1); warp() adds 1 to width/height itself.
+    return Rect(dst_tl, dst_br);
+}
+
+
+template <class P>
+Point RotationWarperBase<P>::warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                                  OutputArray dst)
+{   // Builds the remap tables for (K, R), remaps src into dst, and returns the ROI's top-left corner.
+    UMat xmap, ymap;
+    Rect dst_roi = buildMaps(src.size(), K, R, xmap, ymap);
+    // "+ 1": buildMaps() sizes its maps with "+ 1" per dimension but returns Rect(dst_tl, dst_br).
+    dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
+    remap(src, dst, xmap, ymap, interp_mode, border_mode);
+
+    return dst_roi.tl();
+}
+
+
+template <class P>
+void RotationWarperBase<P>::warpBackward(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                                         Size dst_size, OutputArray dst)
+{   // Remaps an already-projected image src back into a dst_size image using mapForward lookups.
+    projector_.setCameraParams(K, R);
+
+    Point src_tl, src_br;
+    detectResultRoi(dst_size, src_tl, src_br); // ROI reported for forward-projecting a dst_size image
+
+    Size size = src.size();
+    CV_Assert(src_br.x - src_tl.x + 1 == size.width && src_br.y - src_tl.y + 1 == size.height); // src must match that ROI exactly
+
+    Mat xmap(dst_size, CV_32F);
+    Mat ymap(dst_size, CV_32F);
+
+    float u, v;
+    for (int y = 0; y < dst_size.height; ++y)
+    {
+        for (int x = 0; x < dst_size.width; ++x)
+        {
+            projector_.mapForward(static_cast<float>(x), static_cast<float>(y), u, v);
+            xmap.at<float>(y, x) = u - src_tl.x; // shifted by src_tl so the lookup is relative to src's origin
+            ymap.at<float>(y, x) = v - src_tl.y;
+        }
+    }
+
+    dst.create(dst_size, src.type());
+    remap(src, dst, xmap, ymap, interp_mode, border_mode);
+}
+
+
+template <class P>
+Rect RotationWarperBase<P>::warpRoi(Size src_size, InputArray K, InputArray R)
+{   // Returns the result ROI of a src_size image for camera (K, R) without building any maps.
+    projector_.setCameraParams(K, R);
+
+    Point dst_tl, dst_br;
+    detectResultRoi(src_size, dst_tl, dst_br);
+    // "+ 1" on both axes: presumably turns the inclusive bottom-right corner into Rect's exclusive one - TODO confirm.
+    return Rect(dst_tl, Point(dst_br.x + 1, dst_br.y + 1));
+}
+
+
+template <class P>
+void RotationWarperBase<P>::detectResultRoi(Size src_size, Point &dst_tl, Point &dst_br)
+{   // Forward-maps every source pixel and writes the min/max of the mapped coordinates to dst_tl/dst_br.
+    float tl_uf = (std::numeric_limits<float>::max)(); // parenthesized so a function-like max macro cannot expand
+    float tl_vf = (std::numeric_limits<float>::max)();
+    float br_uf = -(std::numeric_limits<float>::max)();
+    float br_vf = -(std::numeric_limits<float>::max)();
+    // Exhaustive scan: one mapForward call per pixel (src_size.width * src_size.height calls).
+    float u, v;
+    for (int y = 0; y < src_size.height; ++y)
+    {
+        for (int x = 0; x < src_size.width; ++x)
+        {
+            projector_.mapForward(static_cast<float>(x), static_cast<float>(y), u, v);
+            tl_uf = (std::min)(tl_uf, u); tl_vf = (std::min)(tl_vf, v);
+            br_uf = (std::max)(br_uf, u); br_vf = (std::max)(br_vf, v);
+        }
+    }
+    // static_cast<int> truncates toward zero; no floor/ceil is applied.
+    dst_tl.x = static_cast<int>(tl_uf);
+    dst_tl.y = static_cast<int>(tl_vf);
+    dst_br.x = static_cast<int>(br_uf);
+    dst_br.y = static_cast<int>(br_vf);
+}
+
+
+template <class P>
+void RotationWarperBase<P>::detectResultRoiByBorder(Size src_size, Point &dst_tl, Point &dst_br)
+{   // Like detectResultRoi(), but forward-maps only the four border rows/columns of the source image.
+    float tl_uf = (std::numeric_limits<float>::max)();
+    float tl_vf = (std::numeric_limits<float>::max)();
+    float br_uf = -(std::numeric_limits<float>::max)();
+    float br_vf = -(std::numeric_limits<float>::max)();
+    // Top and bottom rows. The counter is an int, as in the y loop: a float counter stops advancing at 2^24.
+    float u, v;
+    for (int x = 0; x < src_size.width; ++x)
+    {
+        projector_.mapForward(static_cast<float>(x), 0, u, v);
+        tl_uf = (std::min)(tl_uf, u); tl_vf = (std::min)(tl_vf, v);
+        br_uf = (std::max)(br_uf, u); br_vf = (std::max)(br_vf, v);
+
+        projector_.mapForward(static_cast<float>(x), static_cast<float>(src_size.height - 1), u, v);
+        tl_uf = (std::min)(tl_uf, u); tl_vf = (std::min)(tl_vf, v);
+        br_uf = (std::max)(br_uf, u); br_vf = (std::max)(br_vf, v);
+    }
+    for (int y = 0; y < src_size.height; ++y) // left and right columns
+    {
+        projector_.mapForward(0, static_cast<float>(y), u, v);
+        tl_uf = (std::min)(tl_uf, u); tl_vf = (std::min)(tl_vf, v);
+        br_uf = (std::max)(br_uf, u); br_vf = (std::max)(br_vf, v);
+
+        projector_.mapForward(static_cast<float>(src_size.width - 1), static_cast<float>(y), u, v);
+        tl_uf = (std::min)(tl_uf, u); tl_vf = (std::min)(tl_vf, v);
+        br_uf = (std::max)(br_uf, u); br_vf = (std::max)(br_vf, v);
+    }
+    // static_cast<int> truncates toward zero; no floor/ceil is applied.
+    dst_tl.x = static_cast<int>(tl_uf);
+    dst_tl.y = static_cast<int>(tl_vf);
+    dst_br.x = static_cast<int>(br_uf);
+    dst_br.y = static_cast<int>(br_vf);
+}
+
+
+inline
+void PlaneProjector::mapForward(float x, float y, float &u, float &v)
+{   // Maps source pixel (x, y) to plane coordinates (u, v).
+    float x_ = r_kinv[0] * x + r_kinv[1] * y + r_kinv[2]; // (x, y, 1) times r_kinv, read as a row-major 3x3
+    float y_ = r_kinv[3] * x + r_kinv[4] * y + r_kinv[5];
+    float z_ = r_kinv[6] * x + r_kinv[7] * y + r_kinv[8];
+    // Divide by z_ (no zero guard here), scale by (1 - t[2]) and offset by t[0], t[1].
+    x_ = t[0] + x_ / z_ * (1 - t[2]);
+    y_ = t[1] + y_ / z_ * (1 - t[2]);
+
+    u = scale * x_;
+    v = scale * y_;
+}
+
+
+inline
+void PlaneProjector::mapBackward(float u, float v, float &x, float &y)
+{   // Maps plane coordinates (u, v) back to a source pixel (x, y).
+    u = u / scale - t[0]; // undo the scale and the t[0], t[1] offset applied by mapForward
+    v = v / scale - t[1];
+
+    float z;
+    x = k_rinv[0] * u + k_rinv[1] * v + k_rinv[2] * (1 - t[2]); // (u, v, 1 - t[2]) times k_rinv, read as a row-major 3x3
+    y = k_rinv[3] * u + k_rinv[4] * v + k_rinv[5] * (1 - t[2]);
+    z = k_rinv[6] * u + k_rinv[7] * v + k_rinv[8] * (1 - t[2]);
+    // Unconditional divide: unlike the spherical/cylindrical mapBackward, there is no z > 0 check.
+    x /= z;
+    y /= z;
+}
+
+
+inline
+void SphericalProjector::mapForward(float x, float y, float &u, float &v)
+{   // Maps source pixel (x, y) to two scaled angles (u, v).
+    float x_ = r_kinv[0] * x + r_kinv[1] * y + r_kinv[2]; // (x, y, 1) times r_kinv, read as a row-major 3x3
+    float y_ = r_kinv[3] * x + r_kinv[4] * y + r_kinv[5];
+    float z_ = r_kinv[6] * x + r_kinv[7] * y + r_kinv[8];
+    // u: angle of the ray in the x-z plane; v: pi minus the angle between the ray and the y axis.
+    u = scale * atan2f(x_, z_);
+    float w = y_ / sqrtf(x_ * x_ + y_ * y_ + z_ * z_); // y component of the normalized ray
+    v = scale * (static_cast<float>(CV_PI) - acosf(w == w ? w : 0)); // w != w only for NaN (e.g. 0/0); use 0 then
+}
+
+
+inline
+void SphericalProjector::mapBackward(float u, float v, float &x, float &y)
+{   // Maps scaled angles (u, v) back to a source pixel (x, y); writes (-1, -1) when z <= 0.
+    u /= scale;
+    v /= scale;
+    // Rebuild the unit direction from the two angles (inverse of the atan2f/acosf step in mapForward).
+    float sinv = sinf(static_cast<float>(CV_PI) - v);
+    float x_ = sinv * sinf(u);
+    float y_ = cosf(static_cast<float>(CV_PI) - v);
+    float z_ = sinv * cosf(u);
+
+    float z;
+    x = k_rinv[0] * x_ + k_rinv[1] * y_ + k_rinv[2] * z_; // direction times k_rinv, read as a row-major 3x3
+    y = k_rinv[3] * x_ + k_rinv[4] * y_ + k_rinv[5] * z_;
+    z = k_rinv[6] * x_ + k_rinv[7] * y_ + k_rinv[8] * z_;
+    // Perspective divide only for positive z; otherwise (-1, -1), presumably an "outside the image" marker - TODO confirm.
+    if (z > 0) { x /= z; y /= z; }
+    else x = y = -1;
+}
+
+
+inline
+void CylindricalProjector::mapForward(float x, float y, float &u, float &v)
+{   // Maps source pixel (x, y) to cylinder coordinates: u is a scaled angle, v a scaled height.
+    float x_ = r_kinv[0] * x + r_kinv[1] * y + r_kinv[2]; // (x, y, 1) times r_kinv, read as a row-major 3x3
+    float y_ = r_kinv[3] * x + r_kinv[4] * y + r_kinv[5];
+    float z_ = r_kinv[6] * x + r_kinv[7] * y + r_kinv[8];
+    // v divides y_ by the ray's length in the x-z plane (no guard for x_ == z_ == 0).
+    u = scale * atan2f(x_, z_);
+    v = scale * y_ / sqrtf(x_ * x_ + z_ * z_);
+}
+
+
+inline
+void CylindricalProjector::mapBackward(float u, float v, float &x, float &y)
+{   // Maps cylinder coordinates (u, v) back to a source pixel (x, y); writes (-1, -1) when z <= 0.
+    u /= scale;
+    v /= scale;
+    // Point on the unit cylinder: angle u around the y axis, height v.
+    float x_ = sinf(u);
+    float y_ = v;
+    float z_ = cosf(u);
+
+    float z;
+    x = k_rinv[0] * x_ + k_rinv[1] * y_ + k_rinv[2] * z_; // point times k_rinv, read as a row-major 3x3
+    y = k_rinv[3] * x_ + k_rinv[4] * y_ + k_rinv[5] * z_;
+    z = k_rinv[6] * x_ + k_rinv[7] * y_ + k_rinv[8] * z_;
+    // Perspective divide only for positive z; otherwise (-1, -1), presumably an "outside the image" marker - TODO confirm.
+    if (z > 0) { x /= z; y /= z; }
+    else x = y = -1;
+}
+
+inline
+void FisheyeProjector::mapForward(float x, float y, float &u, float &v)
+{   // Maps source pixel (x, y) to (u, v), using the two spherical angles as polar angle and radius.
+    float x_ = r_kinv[0] * x + r_kinv[1] * y + r_kinv[2]; // (x, y, 1) times r_kinv, read as a row-major 3x3
+    float y_ = r_kinv[3] * x + r_kinv[4] * y + r_kinv[5];
+    float z_ = r_kinv[6] * x + r_kinv[7] * y + r_kinv[8];
+    // NOTE(review): same angles as SphericalProjector::mapForward, but without its NaN guard on the acosf argument.
+    float u_ = atan2f(x_, z_);
+    float v_ = (float)CV_PI - acosf(y_ / sqrtf(x_ * x_ + y_ * y_ + z_ * z_));
+    // Polar to Cartesian: v_ is the radius, u_ the polar angle.
+    u = scale * v_ * cosf(u_);
+    v = scale * v_ * sinf(u_);
+}
+
+inline
+void FisheyeProjector::mapBackward(float u, float v, float &x, float &y)
+{
+    u /= scale;
+    v /= scale;
+
+    float u_ = atan2f(v, u);
+    float v_ = sqrtf(u*u + v*v);
+
+    float sinv = sinf((float)CV_PI - v_);
+    float x_ = sinv * sinf(u_);
+    float y_ = cosf((float)CV_PI - v_);
+    float z_ = sinv * cosf(u_);
+
+    float z;
+    x = k_rinv[0] * x_ + k_rinv[1] * y_ + k_rinv[2] * z_;
+    y = k_rinv[3] * x_ + k_rinv[4] * y_ + k_rinv[5] * z_;
+    z = k_rinv[6] * x_ + k_rinv[7] * y_ + k_rinv[8] * z_;
+
+    if (z > 0) { x /= z; y /= z; }
+    else x = y = -1;
+}
+
+inline
+void StereographicProjector::mapForward(float x, float y, float &u, float &v)
+{
+    float x_ = r_kinv[0] * x + r_kinv[1] * y + r_kinv[2];
+    float y_ = r_kinv[3] * x + r_kinv[4] * y + r_kinv[5];
+    float z_ = r_kinv[6] * x + r_kinv[7] * y + r_kinv[8];
+
+    float u_ = atan2f(x_, z_);
+    float v_ = (float)CV_PI - acosf(y_ / sqrtf(x_ * x_ + y_ * y_ + z_ * z_));
+
+    float r = sinf(v_) / (1 - cosf(v_));
+
+    u = scale * r * std::cos(u_);
+    v = scale * r * std::sin(u_);
+}
+
+inline
+void StereographicProjector::mapBackward(float u, float v, float &x, float &y)
+{
+    u /= scale;
+    v /= scale;
+
+    float u_ = atan2f(v, u);
+    float r = sqrtf(u*u + v*v);
+    float v_ = 2 * atanf(1.f / r);
+
+    float sinv = sinf((float)CV_PI - v_);
+    float x_ = sinv * sinf(u_);
+    float y_ = cosf((float)CV_PI - v_);
+    float z_ = sinv * cosf(u_);
+
+    float z;
+    x = k_rinv[0] * x_ + k_rinv[1] * y_ + k_rinv[2] * z_;
+    y = k_rinv[3] * x_ + k_rinv[4] * y_ + k_rinv[5] * z_;
+    z = k_rinv[6] * x_ + k_rinv[7] * y_ + k_rinv[8] * z_;
+
+    if (z > 0) { x /= z; y /= z; }
+    else x = y = -1;
+}
+
+inline
+void CompressedRectilinearProjector::mapForward(float x, float y, float &u, float &v)
+{
+    float x_ = r_kinv[0] * x + r_kinv[1] * y + r_kinv[2];
+    float y_ = r_kinv[3] * x + r_kinv[4] * y + r_kinv[5];
+    float z_ = r_kinv[6] * x + r_kinv[7] * y + r_kinv[8];
+
+    float u_ = atan2f(x_, z_);
+    float v_ = asinf(y_ / sqrtf(x_ * x_ + y_ * y_ + z_ * z_));
+
+    u = scale * a * tanf(u_ / a);
+    v = scale * b * tanf(v_) / cosf(u_);
+}
+
+inline
+void CompressedRectilinearProjector::mapBackward(float u, float v, float &x, float &y)
+{
+    u /= scale;
+    v /= scale;
+
+    float aatg = a * atanf(u / a);
+    float u_ = aatg;
+    float v_ = atanf(v * cosf(aatg) / b);
+
+    float cosv = cosf(v_);
+    float x_ = cosv * sinf(u_);
+    float y_ = sinf(v_);
+    float z_ = cosv * cosf(u_);
+
+    float z;
+    x = k_rinv[0] * x_ + k_rinv[1] * y_ + k_rinv[2] * z_;
+    y = k_rinv[3] * x_ + k_rinv[4] * y_ + k_rinv[5] * z_;
+    z = k_rinv[6] * x_ + k_rinv[7] * y_ + k_rinv[8] * z_;
+
+    if (z > 0) { x /= z; y /= z; }
+    else x = y = -1;
+}
+
+inline
+void CompressedRectilinearPortraitProjector::mapForward(float x, float y, float &u, float &v)
+{
+    float y_ = r_kinv[0] * x + r_kinv[1] * y + r_kinv[2];
+    float x_ = r_kinv[3] * x + r_kinv[4] * y + r_kinv[5];
+    float z_ = r_kinv[6] * x + r_kinv[7] * y + r_kinv[8];
+
+    float u_ = atan2f(x_, z_);
+    float v_ = asinf(y_ / sqrtf(x_ * x_ + y_ * y_ + z_ * z_));
+
+    u = - scale * a * tanf(u_ / a);
+    v = scale * b * tanf(v_) / cosf(u_);
+}
+
+inline
+void CompressedRectilinearPortraitProjector::mapBackward(float u, float v, float &x, float &y)
+{
+    u /= - scale;
+    v /= scale;
+
+    float aatg = a * atanf(u / a);
+    float u_ = aatg;
+    float v_ = atanf(v * cosf( aatg ) / b);
+
+    float cosv = cosf(v_);
+    float y_ = cosv * sinf(u_);
+    float x_ = sinf(v_);
+    float z_ = cosv * cosf(u_);
+
+    float z;
+    x = k_rinv[0] * x_ + k_rinv[1] * y_ + k_rinv[2] * z_;
+    y = k_rinv[3] * x_ + k_rinv[4] * y_ + k_rinv[5] * z_;
+    z = k_rinv[6] * x_ + k_rinv[7] * y_ + k_rinv[8] * z_;
+
+    if (z > 0) { x /= z; y /= z; }
+    else x = y = -1;
+}
+
+inline
+void PaniniProjector::mapForward(float x, float y, float &u, float &v)
+{   // Maps source pixel (x, y) to (u, v) using the projector's a and b parameters.
+    float x_ = r_kinv[0] * x + r_kinv[1] * y + r_kinv[2]; // (x, y, 1) times r_kinv, read as a row-major 3x3
+    float y_ = r_kinv[3] * x + r_kinv[4] * y + r_kinv[5];
+    float z_ = r_kinv[6] * x + r_kinv[7] * y + r_kinv[8];
+    // u_: angle of the ray in the x-z plane; v_: asin of the normalized y component.
+    float u_ = atan2f(x_, z_);
+    float v_ = asinf(y_ / sqrtf(x_ * x_ + y_ * y_ + z_ * z_));
+
+    float tg = a * tanf(u_ / a);
+    u = scale * tg;
+    // The general formula divides by sin(u_); near zero the branch below drops the tg / sinu factor instead.
+    float sinu = sinf(u_);
+    if ( fabs(sinu) < 1E-7 )
+        v = scale * b * tanf(v_);
+    else
+        v = scale * b * tg * tanf(v_) / sinu;
+}
+
+inline
+void PaniniProjector::mapBackward(float u, float v, float &x, float &y)
+{   // Maps (u, v) back to a source pixel (x, y); writes (-1, -1) when z <= 0.
+    u /= scale;
+    v /= scale;
+    // Inverts mapForward's u = scale * a * tan(u_ / a).
+    float lamda = a * atanf(u / a);
+    float u_ = lamda;
+    // Mirrors mapForward's two branches; the near-zero case avoids evaluating 0 / 0.
+    float v_;
+    if ( fabs(lamda) > 1E-7)
+        v_ = atanf(v * sinf(lamda) / (b * a * tanf(lamda / a)));
+    else
+        v_ = atanf(v / b);
+    // Unit direction rebuilt from the two angles.
+    float cosv = cosf(v_);
+    float x_ = cosv * sinf(u_);
+    float y_ = sinf(v_);
+    float z_ = cosv * cosf(u_);
+
+    float z;
+    x = k_rinv[0] * x_ + k_rinv[1] * y_ + k_rinv[2] * z_; // direction times k_rinv, read as a row-major 3x3
+    y = k_rinv[3] * x_ + k_rinv[4] * y_ + k_rinv[5] * z_;
+    z = k_rinv[6] * x_ + k_rinv[7] * y_ + k_rinv[8] * z_;
+    // Perspective divide only for positive z; otherwise (-1, -1), presumably an "outside the image" marker - TODO confirm.
+    if (z > 0) { x /= z; y /= z; }
+    else x = y = -1;
+}
+
+inline
+void PaniniPortraitProjector::mapForward(float x, float y, float &u, float &v)
+{
+    float y_ = r_kinv[0] * x + r_kinv[1] * y + r_kinv[2];
+    float x_ = r_kinv[3] * x + r_kinv[4] * y + r_kinv[5];
+    float z_ = r_kinv[6] * x + r_kinv[7] * y + r_kinv[8];
+
+    float u_ = atan2f(x_, z_);
+    float v_ = asinf(y_ / sqrtf(x_ * x_ + y_ * y_ + z_ * z_));
+
+    float tg = a * tanf(u_ / a);
+    u = - scale * tg;
+
+    float sinu = sinf( u_ );
+    if ( fabs(sinu) < 1E-7 )
+        v = scale * b * tanf(v_);
+    else
+        v = scale * b * tg * tanf(v_) / sinu;
+}
+
+inline
+void PaniniPortraitProjector::mapBackward(float u, float v, float &x, float &y)
+{
+    u /= - scale;
+    v /= scale;
+
+    float lamda = a * atanf(u / a);
+    float u_ = lamda;
+
+    float v_;
+    if ( fabs(lamda) > 1E-7)
+        v_ = atanf(v * sinf(lamda) / (b * a * tanf(lamda/a)));
+    else
+        v_ = atanf(v / b);
+
+    float cosv = cosf(v_);
+    float y_ = cosv * sinf(u_);
+    float x_ = sinf(v_);
+    float z_ = cosv * cosf(u_);
+
+    float z;
+    x = k_rinv[0] * x_ + k_rinv[1] * y_ + k_rinv[2] * z_;
+    y = k_rinv[3] * x_ + k_rinv[4] * y_ + k_rinv[5] * z_;
+    z = k_rinv[6] * x_ + k_rinv[7] * y_ + k_rinv[8] * z_;
+
+    if (z > 0) { x /= z; y /= z; }
+    else x = y = -1;
+}
+
+inline
+void MercatorProjector::mapForward(float x, float y, float &u, float &v)
+{
+    float x_ = r_kinv[0] * x + r_kinv[1] * y + r_kinv[2];
+    float y_ = r_kinv[3] * x + r_kinv[4] * y + r_kinv[5];
+    float z_ = r_kinv[6] * x + r_kinv[7] * y + r_kinv[8];
+
+    float u_ = atan2f(x_, z_);
+    float v_ = asinf(y_ / sqrtf(x_ * x_ + y_ * y_ + z_ * z_));
+
+    u = scale * u_;
+    v = scale * logf( tanf( (float)(CV_PI/4) + v_/2 ) );
+}
+
+inline
+void MercatorProjector::mapBackward(float u, float v, float &x, float &y)
+{
+    u /= scale;
+    v /= scale;
+
+    float v_ = atanf( sinhf(v) );
+    float u_ = u;
+
+    float cosv = cosf(v_);
+    float x_ = cosv * sinf(u_);
+    float y_ = sinf(v_);
+    float z_ = cosv * cosf(u_);
+
+    float z;
+    x = k_rinv[0] * x_ + k_rinv[1] * y_ + k_rinv[2] * z_;
+    y = k_rinv[3] * x_ + k_rinv[4] * y_ + k_rinv[5] * z_;
+    z = k_rinv[6] * x_ + k_rinv[7] * y_ + k_rinv[8] * z_;
+
+    if (z > 0) { x /= z; y /= z; }
+    else x = y = -1;
+}
+
+inline
+void TransverseMercatorProjector::mapForward(float x, float y, float &u, float &v)
+{
+    float x_ = r_kinv[0] * x + r_kinv[1] * y + r_kinv[2];
+    float y_ = r_kinv[3] * x + r_kinv[4] * y + r_kinv[5];
+    float z_ = r_kinv[6] * x + r_kinv[7] * y + r_kinv[8];
+
+    float u_ = atan2f(x_, z_);
+    float v_ = asinf(y_ / sqrtf(x_ * x_ + y_ * y_ + z_ * z_));
+
+    float B = cosf(v_) * sinf(u_);
+
+    u = scale / 2 * logf( (1+B) / (1-B) );
+    v = scale * atan2f(tanf(v_), cosf(u_));
+}
+
+inline
+void TransverseMercatorProjector::mapBackward(float u, float v, float &x, float &y)
+{
+    u /= scale;
+    v /= scale;
+
+    float v_ = asinf( sinf(v) / coshf(u) );
+    float u_ = atan2f( sinhf(u), std::cos(v) );
+
+    float cosv = cosf(v_);
+    float x_ = cosv * sinf(u_);
+    float y_ = sinf(v_);
+    float z_ = cosv * cosf(u_);
+
+    float z;
+    x = k_rinv[0] * x_ + k_rinv[1] * y_ + k_rinv[2] * z_;
+    y = k_rinv[3] * x_ + k_rinv[4] * y_ + k_rinv[5] * z_;
+    z = k_rinv[6] * x_ + k_rinv[7] * y_ + k_rinv[8] * z_;
+
+    if (z > 0) { x /= z; y /= z; }
+    else x = y = -1;
+}
+
+inline
+void SphericalPortraitProjector::mapForward(float x, float y, float &u0, float &v0)
+{
+    float x0_ = r_kinv[0] * x + r_kinv[1] * y + r_kinv[2];
+    float y0_ = r_kinv[3] * x + r_kinv[4] * y + r_kinv[5];
+    float z_ = r_kinv[6] * x + r_kinv[7] * y + r_kinv[8];
+
+    float x_ = y0_;
+    float y_ = x0_;
+    float u, v;
+
+    u = scale * atan2f(x_, z_);
+    v = scale * (static_cast<float>(CV_PI) - acosf(y_ / sqrtf(x_ * x_ + y_ * y_ + z_ * z_)));
+
+    u0 = -u;//v;
+    v0 = v;//u;
+}
+
+
+inline
+void SphericalPortraitProjector::mapBackward(float u0, float v0, float &x, float &y)
+{
+    float u, v;
+    u = -u0;//v0;
+    v = v0;//u0;
+
+    u /= scale;
+    v /= scale;
+
+    float sinv = sinf(static_cast<float>(CV_PI) - v);
+    float x0_ = sinv * sinf(u);
+    float y0_ = cosf(static_cast<float>(CV_PI) - v);
+    float z_ = sinv * cosf(u);
+
+    float x_ = y0_;
+    float y_ = x0_;
+
+    float z;
+    x = k_rinv[0] * x_ + k_rinv[1] * y_ + k_rinv[2] * z_;
+    y = k_rinv[3] * x_ + k_rinv[4] * y_ + k_rinv[5] * z_;
+    z = k_rinv[6] * x_ + k_rinv[7] * y_ + k_rinv[8] * z_;
+
+    if (z > 0) { x /= z; y /= z; }
+    else x = y = -1;
+}
+
+inline
+void CylindricalPortraitProjector::mapForward(float x, float y, float &u0, float &v0)
+{
+    float x0_ = r_kinv[0] * x + r_kinv[1] * y + r_kinv[2];
+    float y0_ = r_kinv[3] * x + r_kinv[4] * y + r_kinv[5];
+    float z_  = r_kinv[6] * x + r_kinv[7] * y + r_kinv[8];
+
+    float x_ = y0_;
+    float y_ = x0_;
+    float u, v;
+
+    u = scale * atan2f(x_, z_);
+    v = scale * y_ / sqrtf(x_ * x_ + z_ * z_);
+
+    u0 = -u;//v;
+    v0 = v;//u;
+}
+
+
+inline
+void CylindricalPortraitProjector::mapBackward(float u0, float v0, float &x, float &y)
+{
+    float u, v;
+    u = -u0;//v0;
+    v = v0;//u0;
+
+    u /= scale;
+    v /= scale;
+
+    float x0_ = sinf(u);
+    float y0_ = v;
+    float z_  = cosf(u);
+
+    float x_ = y0_;
+    float y_ = x0_;
+
+    float z;
+    x = k_rinv[0] * x_ + k_rinv[1] * y_ + k_rinv[2] * z_;
+    y = k_rinv[3] * x_ + k_rinv[4] * y_ + k_rinv[5] * z_;
+    z = k_rinv[6] * x_ + k_rinv[7] * y_ + k_rinv[8] * z_;
+
+    if (z > 0) { x /= z; y /= z; }
+    else x = y = -1;
+}
+
+inline
+void PlanePortraitProjector::mapForward(float x, float y, float &u0, float &v0)
+{
+    float x0_ = r_kinv[0] * x + r_kinv[1] * y + r_kinv[2];
+    float y0_ = r_kinv[3] * x + r_kinv[4] * y + r_kinv[5];
+    float z_  = r_kinv[6] * x + r_kinv[7] * y + r_kinv[8];
+
+    float x_ = y0_;
+    float y_ = x0_;
+
+    x_ = t[0] + x_ / z_ * (1 - t[2]);
+    y_ = t[1] + y_ / z_ * (1 - t[2]);
+
+    float u,v;
+    u = scale * x_;
+    v = scale * y_;
+
+    u0 = -u;
+    v0 = v;
+}
+
+
+inline
+void PlanePortraitProjector::mapBackward(float u0, float v0, float &x, float &y)
+{
+    float u, v;
+    u = -u0;
+    v = v0;
+
+    u = u / scale - t[0];
+    v = v / scale - t[1];
+
+    float z;
+    x = k_rinv[0] * v + k_rinv[1] * u + k_rinv[2] * (1 - t[2]);
+    y = k_rinv[3] * v + k_rinv[4] * u + k_rinv[5] * (1 - t[2]);
+    z = k_rinv[6] * v + k_rinv[7] * u + k_rinv[8] * (1 - t[2]);
+
+    x /= z;
+    y /= z;
+}
+
+
+} // namespace detail
+} // namespace cv
+
+//! @endcond
+
+#endif // OPENCV_STITCHING_WARPERS_INL_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/stitching/warpers.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/warpers.hpp
new file mode 100644
index 000000000000..0a5bf63de216
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/stitching/warpers.hpp
@@ -0,0 +1,277 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_STITCHING_WARPER_CREATORS_HPP
+#define OPENCV_STITCHING_WARPER_CREATORS_HPP
+
+#include "opencv2/stitching/detail/warpers.hpp"
+#include <string>
+
+namespace cv {
+    class CV_EXPORTS_W PyRotationWarper
+    {
+        Ptr<detail::RotationWarper> rw;
+
+    public:
+        CV_WRAP PyRotationWarper(String type, float scale);
+        CV_WRAP PyRotationWarper() {}
+        ~PyRotationWarper() {}
+
+        /** @brief Projects the image point.
+
+        @param pt Source point
+        @param K Camera intrinsic parameters
+        @param R Camera rotation matrix
+        @return Projected point
+        */
+        CV_WRAP Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R);
+
+        /** @brief Projects the image point backward.
+
+        @param pt Projected point
+        @param K Camera intrinsic parameters
+        @param R Camera rotation matrix
+        @return Backward-projected point
+        */
+#if CV_VERSION_MAJOR == 4
+        CV_WRAP Point2f warpPointBackward(const Point2f& pt, InputArray K, InputArray R)
+        {
+            CV_UNUSED(pt); CV_UNUSED(K); CV_UNUSED(R);
+            CV_Error(Error::StsNotImplemented, "");
+        }
+#else
+        CV_WRAP Point2f warpPointBackward(const Point2f &pt, InputArray K, InputArray R);
+#endif
+        /** @brief Builds the projection maps according to the given camera data.
+
+        @param src_size Source image size
+        @param K Camera intrinsic parameters
+        @param R Camera rotation matrix
+        @param xmap Projection map for the x axis
+        @param ymap Projection map for the y axis
+        @return Projected image minimum bounding box
+        */
+        CV_WRAP Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap);
+
+        /** @brief Projects the image.
+
+        @param src Source image
+        @param K Camera intrinsic parameters
+        @param R Camera rotation matrix
+        @param interp_mode Interpolation mode
+        @param border_mode Border extrapolation mode
+        @param dst Projected image
+        @return Projected image top-left corner
+        */
+        CV_WRAP Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+            CV_OUT OutputArray dst);
+
+        /** @brief Projects the image backward.
+
+        @param src Projected image
+        @param K Camera intrinsic parameters
+        @param R Camera rotation matrix
+        @param interp_mode Interpolation mode
+        @param border_mode Border extrapolation mode
+        @param dst_size Backward-projected image size
+        @param dst Backward-projected image
+        */
+        CV_WRAP void warpBackward(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+            Size dst_size, CV_OUT OutputArray dst);
+
+        /** @brief Returns the bounding box of the projected image.
+        @param src_size Source image size
+        @param K Camera intrinsic parameters
+        @param R Camera rotation matrix
+        @return Projected image minimum bounding box
+        */
+        CV_WRAP Rect warpRoi(Size src_size, InputArray K, InputArray R);
+
+        CV_WRAP float getScale() const { return 1.f; }
+        CV_WRAP void setScale(float) {}
+    };
+
+//! @addtogroup stitching_warp
+//! @{
+
+/** @brief Image warper factories base class.
+ */
+
+class CV_EXPORTS_W WarperCreator
+{
+public:
+    CV_WRAP virtual ~WarperCreator() {}
+    virtual Ptr<detail::RotationWarper> create(float scale) const = 0;
+};
+
+
+/** @brief Plane warper factory class.
+  @sa detail::PlaneWarper
+ */
+class CV_EXPORTS  PlaneWarper : public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const CV_OVERRIDE { return makePtr<detail::PlaneWarper>(scale); }
+};
+
+/** @brief Affine warper factory class.
+  @sa detail::AffineWarper
+ */
+class CV_EXPORTS  AffineWarper : public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const CV_OVERRIDE { return makePtr<detail::AffineWarper>(scale); }
+};
+
+/** @brief Cylindrical warper factory class.
+@sa detail::CylindricalWarper
+*/
+class CV_EXPORTS CylindricalWarper: public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const CV_OVERRIDE { return makePtr<detail::CylindricalWarper>(scale); }
+};
+
+/** @brief Spherical warper factory class */
+class CV_EXPORTS SphericalWarper: public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const CV_OVERRIDE { return makePtr<detail::SphericalWarper>(scale); }
+};
+
+class CV_EXPORTS FisheyeWarper : public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const CV_OVERRIDE { return makePtr<detail::FisheyeWarper>(scale); }
+};
+
+class CV_EXPORTS StereographicWarper: public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const CV_OVERRIDE { return makePtr<detail::StereographicWarper>(scale); }
+};
+
+class CV_EXPORTS CompressedRectilinearWarper: public WarperCreator
+{
+    float a, b;
+public:
+    CompressedRectilinearWarper(float A = 1, float B = 1)
+    {
+        a = A; b = B;
+    }
+    Ptr<detail::RotationWarper> create(float scale) const CV_OVERRIDE { return makePtr<detail::CompressedRectilinearWarper>(scale, a, b); }
+};
+
+class CV_EXPORTS CompressedRectilinearPortraitWarper: public WarperCreator
+{
+    float a, b;
+public:
+    CompressedRectilinearPortraitWarper(float A = 1, float B = 1)
+    {
+        a = A; b = B;
+    }
+    Ptr<detail::RotationWarper> create(float scale) const CV_OVERRIDE { return makePtr<detail::CompressedRectilinearPortraitWarper>(scale, a, b); }
+};
+
+class CV_EXPORTS PaniniWarper: public WarperCreator
+{
+    float a, b;
+public:
+    PaniniWarper(float A = 1, float B = 1)
+    {
+        a = A; b = B;
+    }
+    Ptr<detail::RotationWarper> create(float scale) const CV_OVERRIDE { return makePtr<detail::PaniniWarper>(scale, a, b); }
+};
+
+class CV_EXPORTS PaniniPortraitWarper: public WarperCreator
+{
+    float a, b;
+public:
+    PaniniPortraitWarper(float A = 1, float B = 1)
+    {
+        a = A; b = B;
+    }
+    Ptr<detail::RotationWarper> create(float scale) const CV_OVERRIDE { return makePtr<detail::PaniniPortraitWarper>(scale, a, b); }
+};
+
+class CV_EXPORTS MercatorWarper: public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const CV_OVERRIDE { return makePtr<detail::MercatorWarper>(scale); }
+};
+
+class CV_EXPORTS TransverseMercatorWarper: public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const CV_OVERRIDE { return makePtr<detail::TransverseMercatorWarper>(scale); }
+};
+
+
+
+#ifdef HAVE_OPENCV_CUDAWARPING
+class PlaneWarperGpu: public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const CV_OVERRIDE { return makePtr<detail::PlaneWarperGpu>(scale); }
+};
+
+
+class CylindricalWarperGpu: public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const CV_OVERRIDE { return makePtr<detail::CylindricalWarperGpu>(scale); }
+};
+
+
+class SphericalWarperGpu: public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const CV_OVERRIDE { return makePtr<detail::SphericalWarperGpu>(scale); }
+};
+#endif
+
+//! @} stitching_warp
+
+} // namespace cv
+
+#endif // OPENCV_STITCHING_WARPER_CREATORS_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/video.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/video.hpp
new file mode 100644
index 000000000000..b1c19196d499
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/video.hpp
@@ -0,0 +1,58 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_VIDEO_HPP
+#define OPENCV_VIDEO_HPP
+
+/**
+  @defgroup video Video Analysis
+  @{
+    @defgroup video_motion Motion Analysis
+    @defgroup video_track Object Tracking
+  @}
+*/
+
+#include "opencv2/video/tracking.hpp"
+#include "opencv2/video/background_segm.hpp"
+
+#endif //OPENCV_VIDEO_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/video/background_segm.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/video/background_segm.hpp
new file mode 100644
index 000000000000..e1dfa15a9a55
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/video/background_segm.hpp
@@ -0,0 +1,317 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_BACKGROUND_SEGM_HPP
+#define OPENCV_BACKGROUND_SEGM_HPP
+
+#include "opencv2/core.hpp"
+
+namespace cv
+{
+
+//! @addtogroup video_motion
+//! @{
+
+/** @brief Base class for background/foreground segmentation.
+
+The class is only used to define the common interface for the whole family of background/foreground
+segmentation algorithms.
+ */
+class CV_EXPORTS_W BackgroundSubtractor : public Algorithm
+{
+public:
+    /** @brief Computes a foreground mask.
+
+    @param image Next video frame.
+    @param fgmask The output foreground mask as an 8-bit binary image.
+    @param learningRate The value between 0 and 1 that indicates how fast the background model is
+    learnt. A negative parameter value makes the algorithm use an automatically chosen learning
+    rate. 0 means that the background model is not updated at all, 1 means that the background model
+    is completely reinitialized from the last frame.
+     */
+    CV_WRAP virtual void apply(InputArray image, OutputArray fgmask, double learningRate=-1) = 0;
+
+    /** @brief Computes a background image.
+
+    @param backgroundImage The output background image.
+
+    @note Sometimes the background image can be very blurry, as it contains the average background
+    statistics.
+     */
+    CV_WRAP virtual void getBackgroundImage(OutputArray backgroundImage) const = 0;
+};
+
+
+/** @brief Gaussian Mixture-based Background/Foreground Segmentation Algorithm.
+
+The class implements the Gaussian mixture model background subtraction described in @cite Zivkovic2004
+and @cite Zivkovic2006 .
+ */
+class CV_EXPORTS_W BackgroundSubtractorMOG2 : public BackgroundSubtractor
+{
+public:
+    /** @brief Returns the number of last frames that affect the background model
+    */
+    CV_WRAP virtual int getHistory() const = 0;
+    /** @brief Sets the number of last frames that affect the background model
+    */
+    CV_WRAP virtual void setHistory(int history) = 0;
+
+    /** @brief Returns the number of Gaussian components in the background model
+    */
+    CV_WRAP virtual int getNMixtures() const = 0;
+    /** @brief Sets the number of Gaussian components in the background model.
+
+    The model needs to be reinitialized to reserve memory.
+    */
+    CV_WRAP virtual void setNMixtures(int nmixtures) = 0;//needs reinitialization!
+
+    /** @brief Returns the "background ratio" parameter of the algorithm
+
+    If a foreground pixel keeps semi-constant value for about backgroundRatio\*history frames, it's
+    considered background and added to the model as a center of a new component. It corresponds to TB
+    parameter in the paper.
+     */
+    CV_WRAP virtual double getBackgroundRatio() const = 0;
+    /** @brief Sets the "background ratio" parameter of the algorithm
+    */
+    CV_WRAP virtual void setBackgroundRatio(double ratio) = 0;
+
+    /** @brief Returns the variance threshold for the pixel-model match
+
+    The main threshold on the squared Mahalanobis distance to decide if the sample is well described by
+    the background model or not. Related to Cthr from the paper.
+     */
+    CV_WRAP virtual double getVarThreshold() const = 0;
+    /** @brief Sets the variance threshold for the pixel-model match
+    */
+    CV_WRAP virtual void setVarThreshold(double varThreshold) = 0;
+
+    /** @brief Returns the variance threshold for the pixel-model match used for new mixture component generation
+
+    Threshold for the squared Mahalanobis distance that helps decide when a sample is close to the
+    existing components (corresponds to Tg in the paper). If a pixel is not close to any component, it
+    is considered foreground or added as a new component. 3 sigma =\> Tg=3\*3=9 is default. A smaller Tg
+    value generates more components. A higher Tg value may result in a small number of components but
+    they can grow too large.
+     */
+    CV_WRAP virtual double getVarThresholdGen() const = 0;
+    /** @brief Sets the variance threshold for the pixel-model match used for new mixture component generation
+    */
+    CV_WRAP virtual void setVarThresholdGen(double varThresholdGen) = 0;
+
+    /** @brief Returns the initial variance of each Gaussian component
+    */
+    CV_WRAP virtual double getVarInit() const = 0;
+    /** @brief Sets the initial variance of each Gaussian component
+    */
+    CV_WRAP virtual void setVarInit(double varInit) = 0;
+
+    CV_WRAP virtual double getVarMin() const = 0;
+    CV_WRAP virtual void setVarMin(double varMin) = 0;
+
+    CV_WRAP virtual double getVarMax() const = 0;
+    CV_WRAP virtual void setVarMax(double varMax) = 0;
+
+    /** @brief Returns the complexity reduction threshold
+
+    This parameter defines the number of samples needed to accept that the component exists. CT=0.05
+    is a default value for all the samples. By setting CT=0 you get an algorithm very similar to the
+    standard Stauffer&Grimson algorithm.
+     */
+    CV_WRAP virtual double getComplexityReductionThreshold() const = 0;
+    /** @brief Sets the complexity reduction threshold
+    */
+    CV_WRAP virtual void setComplexityReductionThreshold(double ct) = 0;
+
+    /** @brief Returns the shadow detection flag
+
+    If true, the algorithm detects shadows and marks them. See createBackgroundSubtractorMOG2 for
+    details.
+     */
+    CV_WRAP virtual bool getDetectShadows() const = 0;
+    /** @brief Enables or disables shadow detection
+    */
+    CV_WRAP virtual void setDetectShadows(bool detectShadows) = 0;
+
+    /** @brief Returns the shadow value
+
+    Shadow value is the value used to mark shadows in the foreground mask. Default value is 127. Value 0
+    in the mask always means background, 255 means foreground.
+     */
+    CV_WRAP virtual int getShadowValue() const = 0;
+    /** @brief Sets the shadow value
+    */
+    CV_WRAP virtual void setShadowValue(int value) = 0;
+
+    /** @brief Returns the shadow threshold
+
+    A shadow is detected if pixel is a darker version of the background. The shadow threshold (Tau in
+    the paper) is a threshold defining how much darker the shadow can be. Tau= 0.5 means that if a pixel
+    is more than twice as dark, then it is not considered a shadow. See Prati, Mikic, Trivedi and Cucchiara,
+    *Detecting Moving Shadows...*, IEEE PAMI,2003.
+     */
+    CV_WRAP virtual double getShadowThreshold() const = 0;
+    /** @brief Sets the shadow threshold
+    */
+    CV_WRAP virtual void setShadowThreshold(double threshold) = 0;
+
+    /** @brief Computes a foreground mask.
+
+    @param image Next video frame. Floating point frame will be used without scaling and should be in range \f$[0,255]\f$.
+    @param fgmask The output foreground mask as an 8-bit binary image.
+    @param learningRate The value between 0 and 1 that indicates how fast the background model is
+    learnt. A negative parameter value makes the algorithm use an automatically chosen learning
+    rate. 0 means that the background model is not updated at all, 1 means that the background model
+    is completely reinitialized from the last frame.
+     */
+    CV_WRAP virtual void apply(InputArray image, OutputArray fgmask, double learningRate=-1) CV_OVERRIDE = 0;
+};
+
+/** @brief Creates MOG2 Background Subtractor
+
+@param history Length of the history (number of last frames that affect the background model).
+@param varThreshold Threshold on the squared Mahalanobis distance between the pixel and the model
+to decide whether a pixel is well described by the background model. This parameter does not
+affect the background update.
+@param detectShadows If true, the algorithm will detect shadows and mark them. It decreases the
+speed a bit, so if you do not need this feature, set the parameter to false.
+ */
+CV_EXPORTS_W Ptr<BackgroundSubtractorMOG2>
+    createBackgroundSubtractorMOG2(int history=500, double varThreshold=16,
+                                   bool detectShadows=true);
+
+/** @brief K-nearest neighbours - based Background/Foreground Segmentation Algorithm.
+
+The class implements the K-nearest neighbours background subtraction described in @cite Zivkovic2006 .
+Very efficient if number of foreground pixels is low.
+ */
+class CV_EXPORTS_W BackgroundSubtractorKNN : public BackgroundSubtractor
+{
+public:
+    /** @brief Returns the number of last frames that affect the background model
+    */
+    CV_WRAP virtual int getHistory() const = 0;
+    /** @brief Sets the number of last frames that affect the background model
+    */
+    CV_WRAP virtual void setHistory(int history) = 0;
+
+    /** @brief Returns the number of data samples in the background model
+    */
+    CV_WRAP virtual int getNSamples() const = 0;
+    /** @brief Sets the number of data samples in the background model.
+
+    The model needs to be reinitialized to reserve memory.
+    */
+    CV_WRAP virtual void setNSamples(int _nN) = 0;//needs reinitialization!
+
+    /** @brief Returns the threshold on the squared distance between the pixel and the sample
+
+    The threshold on the squared distance between the pixel and the sample to decide whether a pixel is
+    close to a data sample.
+     */
+    CV_WRAP virtual double getDist2Threshold() const = 0;
+    /** @brief Sets the threshold on the squared distance
+    */
+    CV_WRAP virtual void setDist2Threshold(double _dist2Threshold) = 0;
+
+    /** @brief Returns the number of neighbours, the k in the kNN.
+
+    K is the number of samples that need to be within dist2Threshold in order to decide that the
+    pixel is matching the kNN background model.
+     */
+    CV_WRAP virtual int getkNNSamples() const = 0;
+    /** @brief Sets the k in the kNN. How many nearest neighbours need to match.
+    */
+    CV_WRAP virtual void setkNNSamples(int _nkNN) = 0;
+
+    /** @brief Returns the shadow detection flag
+
+    If true, the algorithm detects shadows and marks them. See createBackgroundSubtractorKNN for
+    details.
+     */
+    CV_WRAP virtual bool getDetectShadows() const = 0;
+    /** @brief Enables or disables shadow detection
+    */
+    CV_WRAP virtual void setDetectShadows(bool detectShadows) = 0;
+
+    /** @brief Returns the shadow value
+
+    Shadow value is the value used to mark shadows in the foreground mask. Default value is 127. Value 0
+    in the mask always means background, 255 means foreground.
+     */
+    CV_WRAP virtual int getShadowValue() const = 0;
+    /** @brief Sets the shadow value
+    */
+    CV_WRAP virtual void setShadowValue(int value) = 0;
+
+    /** @brief Returns the shadow threshold
+
+    A shadow is detected if pixel is a darker version of the background. The shadow threshold (Tau in
+    the paper) is a threshold defining how much darker the shadow can be. Tau= 0.5 means that if a pixel
+    is more than twice as dark, then it is not considered a shadow. See Prati, Mikic, Trivedi and Cucchiara,
+    *Detecting Moving Shadows...*, IEEE PAMI,2003.
+     */
+    CV_WRAP virtual double getShadowThreshold() const = 0;
+    /** @brief Sets the shadow threshold
+     */
+    CV_WRAP virtual void setShadowThreshold(double threshold) = 0;
+};
+
+/** @brief Creates KNN Background Subtractor
+
+@param history Length of the history (number of last frames that affect the background model).
+@param dist2Threshold Threshold on the squared distance between the pixel and the sample to decide
+whether a pixel is close to that sample. This parameter does not affect the background update.
+@param detectShadows If true, the algorithm will detect shadows and mark them. It decreases the
+speed a bit, so if you do not need this feature, set the parameter to false.
+ */
+CV_EXPORTS_W Ptr<BackgroundSubtractorKNN>
+    createBackgroundSubtractorKNN(int history=500, double dist2Threshold=400.0,
+                                   bool detectShadows=true);
+
+//! @} video_motion
+
+} // cv
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/video/detail/tracking.detail.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/video/detail/tracking.detail.hpp
new file mode 100644
index 000000000000..3c7823b7dccc
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/video/detail/tracking.detail.hpp
@@ -0,0 +1,406 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_VIDEO_DETAIL_TRACKING_HPP
+#define OPENCV_VIDEO_DETAIL_TRACKING_HPP
+
+/*
+ * Partially based on:
+ * ====================================================================================================================
+ *  - [AAM] S. Salti, A. Cavallaro, L. Di Stefano, Adaptive Appearance Modeling for Video Tracking: Survey and Evaluation
+ *  - [AMVOT] X. Li, W. Hu, C. Shen, Z. Zhang, A. Dick, A. van den Hengel, A Survey of Appearance Models in Visual Object Tracking
+ *
+ * This Tracking API has been designed with PlantUML. If you modify this API please change UML files under modules/tracking/doc/uml
+ *
+ */
+
+#include "opencv2/core.hpp"
+
+namespace cv {
+namespace detail {
+inline namespace tracking {
+
+/** @addtogroup tracking_detail
+@{
+*/
+
+/************************************ TrackerFeature Base Classes ************************************/
+
+/** @brief Abstract base class for TrackerFeature that represents the feature.
+*/
+class CV_EXPORTS TrackerFeature
+{
+public:
+    virtual ~TrackerFeature();
+
+    /** @brief Compute the features for the given collection of images
+    @param images The input images
+    @param response The output response
+    */
+    void compute(const std::vector<Mat>& images, Mat& response);
+
+protected:
+    virtual bool computeImpl(const std::vector<Mat>& images, Mat& response) = 0;
+};
+
+/** @brief Class that manages the extraction and selection of features
+
+@cite AAM Feature Extraction and Feature Set Refinement (Feature Processing and Feature Selection).
+See table I and section III C @cite AMVOT Appearance modelling -\> Visual representation (Table II,
+section 3.1 - 3.2)
+
+TrackerFeatureSet is an aggregation of TrackerFeature
+
+@sa
+   TrackerFeature
+
+*/
+class CV_EXPORTS TrackerFeatureSet
+{
+public:
+    TrackerFeatureSet();
+
+    ~TrackerFeatureSet();
+
+    /** @brief Extract features from the images collection
+    @param images The input images
+    */
+    void extraction(const std::vector<Mat>& images);
+
+    /** @brief Add TrackerFeature in the collection. Return true if TrackerFeature is added, false otherwise
+    @param feature The TrackerFeature class
+    */
+    bool addTrackerFeature(const Ptr<TrackerFeature>& feature);
+
+    /** @brief Get the TrackerFeature collection
+    */
+    const std::vector<Ptr<TrackerFeature>>& getTrackerFeatures() const;
+
+    /** @brief Get the responses
+    @note Be sure to call extraction before getResponses
+    */
+    const std::vector<Mat>& getResponses() const;
+
+private:
+    void clearResponses();
+    bool blockAddTrackerFeature;
+
+    std::vector<Ptr<TrackerFeature>> features;  // list of features
+    std::vector<Mat> responses;  // list of responses after compute
+};
+
+/************************************ TrackerSampler Base Classes ************************************/
+
+/** @brief Abstract base class for TrackerSamplerAlgorithm that represents the algorithm for the specific
+sampler.
+*/
+class CV_EXPORTS TrackerSamplerAlgorithm
+{
+public:
+    virtual ~TrackerSamplerAlgorithm();
+
+    /** @brief Computes the regions starting from a position in an image.
+
+    @return true if samples are computed, false otherwise
+
+    @param image The current frame
+    @param boundingBox The bounding box from which regions can be calculated
+
+    @param sample The computed samples @cite AAM Fig. 1 variable Sk
+    */
+    virtual bool sampling(const Mat& image, const Rect& boundingBox, std::vector<Mat>& sample) = 0;
+};
+
+/**
+ * \brief Class that manages the sampler in order to select regions used to update the model of the tracker
+ * [AAM] Sampling and Labeling. See table I and section III B
+ */
+
+/** @brief Class that manages the sampler in order to select regions used to update the model of the tracker
+
+@cite AAM Sampling and Labeling. See table I and section III B
+
+TrackerSampler is an aggregation of TrackerSamplerAlgorithm
+@sa
+   TrackerSamplerAlgorithm
+ */
+class CV_EXPORTS TrackerSampler
+{
+public:
+    TrackerSampler();
+
+    ~TrackerSampler();
+
+    /** @brief Computes the regions starting from a position in an image
+    @param image The current frame
+    @param boundingBox The bounding box from which regions can be calculated
+    */
+    void sampling(const Mat& image, Rect boundingBox);
+
+    /** @brief Return the collection of the TrackerSamplerAlgorithm
+    */
+    const std::vector<Ptr<TrackerSamplerAlgorithm>>& getSamplers() const;
+
+    /** @brief Return the samples from all TrackerSamplerAlgorithm, @cite AAM Fig. 1 variable Sk
+    */
+    const std::vector<Mat>& getSamples() const;
+
+    /** @brief Add TrackerSamplerAlgorithm in the collection. Return true if sampler is added, false otherwise
+    @param sampler The TrackerSamplerAlgorithm
+    */
+    bool addTrackerSamplerAlgorithm(const Ptr<TrackerSamplerAlgorithm>& sampler);
+
+private:
+    std::vector<Ptr<TrackerSamplerAlgorithm>> samplers;
+    std::vector<Mat> samples;
+    bool blockAddTrackerSampler;
+
+    void clearSamples();
+};
+
+/************************************ TrackerModel Base Classes ************************************/
+
+/** @brief Abstract base class for TrackerTargetState that represents a possible state of the target.
+
+See @cite AAM \f$\hat{x}^{i}_{k}\f$ all the state candidates.
+
+Inherit from this class to define your own target state; in your implementation you can add scale
+variation, width, height, orientation, etc.
+*/
+class CV_EXPORTS TrackerTargetState
+{
+public:
+    virtual ~TrackerTargetState() {}
+    /** @brief Get the position
+    * @return The position
+    */
+    Point2f getTargetPosition() const;
+
+    /** @brief Set the position
+    * @param position The position
+    */
+    void setTargetPosition(const Point2f& position);
+    /** @brief Get the width of the target
+    * @return The width of the target
+    */
+    int getTargetWidth() const;
+
+    /** @brief Set the width of the target
+    * @param width The width of the target
+    */
+    void setTargetWidth(int width);
+    /** @brief Get the height of the target
+    * @return The height of the target
+    */
+    int getTargetHeight() const;
+
+    /** @brief Set the height of the target
+    * @param height The height of the target
+    */
+    void setTargetHeight(int height);
+
+protected:
+    Point2f targetPosition;
+    int targetWidth;
+    int targetHeight;
+};
+
+/** @brief Represents the model of the target at frame \f$k\f$ (all states and scores)
+
+See @cite AAM The set of pairs \f$\langle \hat{x}^{i}_{k}, C^{i}_{k} \rangle\f$
+@sa TrackerTargetState
+*/
+typedef std::vector<std::pair<Ptr<TrackerTargetState>, float>> ConfidenceMap;
+
+/** @brief Represents the estimated states for all frames
+
+@cite AAM \f$x_{k}\f$ is the trajectory of the target up to time \f$k\f$
+
+@sa TrackerTargetState
+*/
+typedef std::vector<Ptr<TrackerTargetState>> Trajectory;
+
+/** @brief Abstract base class for TrackerStateEstimator that estimates the most likely target state.
+
+See @cite AAM State estimator
+
+See @cite AMVOT Statistical modeling (Fig. 3), Table III (generative) - IV (discriminative) - V (hybrid)
+*/
+class CV_EXPORTS TrackerStateEstimator
+{
+public:
+    virtual ~TrackerStateEstimator();
+
+    /** @brief Estimate the most likely target state, return the estimated state
+    @param confidenceMaps The overall appearance model as a list of ConfidenceMap
+    */
+    Ptr<TrackerTargetState> estimate(const std::vector<ConfidenceMap>& confidenceMaps);
+
+    /** @brief Update the ConfidenceMap with the scores
+    @param confidenceMaps The overall appearance model as a list of ConfidenceMap
+    */
+    void update(std::vector<ConfidenceMap>& confidenceMaps);
+
+    /** @brief Create TrackerStateEstimator by tracker state estimator type
+    @param trackeStateEstimatorType The TrackerStateEstimator name
+
+    The modes available now:
+
+    -   "BOOSTING" -- Boosting-based discriminative appearance models. See @cite AMVOT section 4.4
+
+    The modes available soon:
+
+    -   "SVM" -- SVM-based discriminative appearance models. See @cite AMVOT section 4.5
+    */
+    static Ptr<TrackerStateEstimator> create(const String& trackeStateEstimatorType);
+
+    /** @brief Get the name of the specific TrackerStateEstimator
+    */
+    String getClassName() const;
+
+protected:
+    virtual Ptr<TrackerTargetState> estimateImpl(const std::vector<ConfidenceMap>& confidenceMaps) = 0;
+    virtual void updateImpl(std::vector<ConfidenceMap>& confidenceMaps) = 0;
+    String className;
+};
+
+/** @brief Abstract class that represents the model of the target.
+
+It must be instantiated by a specialized tracker
+
+See @cite AAM Ak
+
+Inherit from this class to implement your own TrackerModel
+*/
+class CV_EXPORTS TrackerModel
+{
+public:
+    TrackerModel();
+
+    virtual ~TrackerModel();
+
+    /** @brief Set the TrackerStateEstimator; returns true if the tracker state estimator is added, false otherwise
+    @param trackerStateEstimator The TrackerStateEstimator
+    @note You can add only one TrackerStateEstimator
+    */
+    bool setTrackerStateEstimator(Ptr<TrackerStateEstimator> trackerStateEstimator);
+
+    /** @brief Estimate the most likely target location
+
+    @cite AAM ME, Model Estimation table I
+    @param responses Features extracted from TrackerFeatureSet
+    */
+    void modelEstimation(const std::vector<Mat>& responses);
+
+    /** @brief Update the model
+
+    @cite AAM MU, Model Update table I
+    */
+    void modelUpdate();
+
+    /** @brief Run the TrackerStateEstimator; returns true if it is possible to estimate a new state, false otherwise
+    */
+    bool runStateEstimator();
+
+    /** @brief Set the current TrackerTargetState in the Trajectory
+    @param lastTargetState The current TrackerTargetState
+    */
+    void setLastTargetState(const Ptr<TrackerTargetState>& lastTargetState);
+
+    /** @brief Get the last TrackerTargetState from Trajectory
+    */
+    Ptr<TrackerTargetState> getLastTargetState() const;
+
+    /** @brief Get the list of the ConfidenceMap
+    */
+    const std::vector<ConfidenceMap>& getConfidenceMaps() const;
+
+    /** @brief Get the last ConfidenceMap for the current frame
+    */
+    const ConfidenceMap& getLastConfidenceMap() const;
+
+    /** @brief Get the TrackerStateEstimator
+    */
+    Ptr<TrackerStateEstimator> getTrackerStateEstimator() const;
+
+private:
+    void clearCurrentConfidenceMap();
+
+protected:
+    std::vector<ConfidenceMap> confidenceMaps;
+    Ptr<TrackerStateEstimator> stateEstimator;
+    ConfidenceMap currentConfidenceMap;
+    Trajectory trajectory;
+    int maxCMLength;
+
+    virtual void modelEstimationImpl(const std::vector<Mat>& responses) = 0;
+    virtual void modelUpdateImpl() = 0;
+};
+
+/************************************ Specific TrackerStateEstimator Classes ************************************/
+
+// None
+
+/************************************ Specific TrackerSamplerAlgorithm Classes ************************************/
+
+/** @brief TrackerSampler based on CSC (current state centered), used by MIL algorithm TrackerMIL
+ */
+class CV_EXPORTS TrackerSamplerCSC : public TrackerSamplerAlgorithm
+{
+public:
+    ~TrackerSamplerCSC();
+
+    enum MODE
+    {
+        MODE_INIT_POS = 1,  //!< mode for init positive samples
+        MODE_INIT_NEG = 2,  //!< mode for init negative samples
+        MODE_TRACK_POS = 3,  //!< mode for update positive samples
+        MODE_TRACK_NEG = 4,  //!< mode for update negative samples
+        MODE_DETECT = 5  //!< mode for detection samples
+    };
+
+    struct CV_EXPORTS Params
+    {
+        Params();
+        float initInRad;  //!< radius for gathering positive instances during init
+        float trackInPosRad;  //!< radius for gathering positive instances during tracking
+        float searchWinSize;  //!< size of search window
+        int initMaxNegNum;  //!< # negative samples to use during init
+        int trackMaxPosNum;  //!< # positive samples to use during training
+        int trackMaxNegNum;  //!< # negative samples to use during training
+    };
+
+    /** @brief Constructor
+    @param parameters TrackerSamplerCSC parameters (see TrackerSamplerCSC::Params)
+    */
+    TrackerSamplerCSC(const TrackerSamplerCSC::Params& parameters = TrackerSamplerCSC::Params());
+
+    /** @brief Set the sampling mode of TrackerSamplerCSC
+    @param samplingMode The sampling mode (one of the MODE enumerators)
+
+    The modes are:
+
+    -   "MODE_INIT_POS = 1" -- for the positive sampling in initialization step
+    -   "MODE_INIT_NEG = 2" -- for the negative sampling in initialization step
+    -   "MODE_TRACK_POS = 3" -- for the positive sampling in update step
+    -   "MODE_TRACK_NEG = 4" -- for the negative sampling in update step
+    -   "MODE_DETECT = 5" -- for the sampling in detection step
+    */
+    void setMode(int samplingMode);
+
+    bool sampling(const Mat& image, const Rect& boundingBox, std::vector<Mat>& sample) CV_OVERRIDE;
+
+private:
+    Params params;
+    int mode;
+    RNG rng;
+
+    std::vector<Mat> sampleImage(const Mat& img, int x, int y, int w, int h, float inrad, float outrad = 0, int maxnum = 1000000);
+};
+
+//! @}
+
+}}}  // namespace cv::detail::tracking
+
+#endif  // OPENCV_VIDEO_DETAIL_TRACKING_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/video/legacy/constants_c.h b/3rdparty/opencv/opencv410/build/include/opencv2/video/legacy/constants_c.h
new file mode 100644
index 000000000000..1a98f52961e2
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/video/legacy/constants_c.h
@@ -0,0 +1,16 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_VIDEO_LEGACY_CONSTANTS_H
+#define OPENCV_VIDEO_LEGACY_CONSTANTS_H
+
+enum // CV_LKFLOW_* flag constants from the legacy C header; each value is a distinct single bit
+{
+    CV_LKFLOW_PYR_A_READY = 1,
+    CV_LKFLOW_PYR_B_READY = 2,
+    CV_LKFLOW_INITIAL_GUESSES = 4,
+    CV_LKFLOW_GET_MIN_EIGENVALS = 8
+};
+
+#endif // OPENCV_VIDEO_LEGACY_CONSTANTS_H
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/video/tracking.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/video/tracking.hpp
new file mode 100644
index 000000000000..df34a9f97cb6
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/video/tracking.hpp
@@ -0,0 +1,943 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_TRACKING_HPP
+#define OPENCV_TRACKING_HPP
+
+#include "opencv2/core.hpp"
+#include "opencv2/imgproc.hpp"
+
+namespace cv
+{
+
+//! @addtogroup video_track
+//! @{
+
+enum { OPTFLOW_USE_INITIAL_FLOW     = 4,   //!< use the caller-supplied nextPts / flow as the initial estimate
+       OPTFLOW_LK_GET_MIN_EIGENVALS = 8,   //!< calcOpticalFlowPyrLK: use minimum eigen values as the error measure
+       OPTFLOW_FARNEBACK_GAUSSIAN   = 256  //!< calcOpticalFlowFarneback: Gaussian winsize x winsize filter instead of a box filter
+     };
+
+/** @brief Finds an object center, size, and orientation.
+
+@param probImage Back projection of the object histogram. See calcBackProject.
+@param window Initial search window.
+@param criteria Stop criteria for the underlying meanShift.
+returns
+(in old interfaces) Number of iterations CAMSHIFT took to converge
+The function implements the CAMSHIFT object tracking algorithm @cite Bradski98 . First, it finds an
+object center using meanShift and then adjusts the window size and finds the optimal rotation. The
+function returns the rotated rectangle structure that includes the object position, size, and
+orientation. The next position of the search window can be obtained with RotatedRect::boundingRect()
+
+See the OpenCV sample camshiftdemo.c that tracks colored objects.
+
+@note
+-   (Python) A sample explaining the camshift tracking algorithm can be found at
+    opencv_source_code/samples/python/camshift.py
+ */
+CV_EXPORTS_W RotatedRect CamShift( InputArray probImage, CV_IN_OUT Rect& window,
+                                   TermCriteria criteria );
+/** @example samples/cpp/camshiftdemo.cpp
+An example using the mean-shift tracking algorithm
+*/
+
+/** @brief Finds an object on a back projection image.
+
+@param probImage Back projection of the object histogram. See calcBackProject for details.
+@param window Initial search window.
+@param criteria Stop criteria for the iterative search algorithm.
+returns
+:   Number of iterations meanShift took to converge.
+The function implements the iterative object search algorithm. It takes the input back projection of
+an object and the initial position. The mass center in window of the back projection image is
+computed and the search window center shifts to the mass center. The procedure is repeated until the
+specified number of iterations criteria.maxCount is done or until the window center shifts by less
+than criteria.epsilon. The algorithm is used inside CamShift and, unlike CamShift , the search
+window size or orientation do not change during the search. You can simply pass the output of
+calcBackProject to this function. But better results can be obtained if you pre-filter the back
+projection and remove the noise. For example, you can do this by retrieving connected components
+with findContours , throwing away contours with small area ( contourArea ), and rendering the
+remaining contours with drawContours.
+
+ */
+CV_EXPORTS_W int meanShift( InputArray probImage, CV_IN_OUT Rect& window, TermCriteria criteria );
+
+/** @brief Constructs the image pyramid which can be passed to calcOpticalFlowPyrLK.
+
+@param img 8-bit input image.
+@param pyramid output pyramid.
+@param winSize window size of optical flow algorithm. Must not be less than winSize argument of
+calcOpticalFlowPyrLK. It is needed to calculate required padding for pyramid levels.
+@param maxLevel 0-based maximal pyramid level number.
+@param withDerivatives set to precompute gradients for every pyramid level. If pyramid is
+constructed without the gradients then calcOpticalFlowPyrLK will calculate them internally.
+@param pyrBorder the border mode for pyramid layers.
+@param derivBorder the border mode for gradients.
+@param tryReuseInputImage put ROI of input image into the pyramid if possible. You can pass false
+to force data copying.
+@return number of levels in constructed pyramid. Can be less than maxLevel.
+ */
+CV_EXPORTS_W int buildOpticalFlowPyramid( InputArray img, OutputArrayOfArrays pyramid,
+                                          Size winSize, int maxLevel, bool withDerivatives = true,
+                                          int pyrBorder = BORDER_REFLECT_101,
+                                          int derivBorder = BORDER_CONSTANT,
+                                          bool tryReuseInputImage = true );
+
+/** @example samples/cpp/lkdemo.cpp
+An example using the Lucas-Kanade optical flow algorithm
+*/
+
+/** @brief Calculates an optical flow for a sparse feature set using the iterative Lucas-Kanade method with
+pyramids.
+
+@param prevImg first 8-bit input image or pyramid constructed by buildOpticalFlowPyramid.
+@param nextImg second input image or pyramid of the same size and the same type as prevImg.
+@param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be
+single-precision floating-point numbers.
+@param nextPts output vector of 2D points (with single-precision floating-point coordinates)
+containing the calculated new positions of input features in the second image; when
+OPTFLOW_USE_INITIAL_FLOW flag is passed, the vector must have the same size as in the input.
+@param status output status vector (of unsigned chars); each element of the vector is set to 1 if
+the flow for the corresponding features has been found, otherwise, it is set to 0.
+@param err output vector of errors; each element of the vector is set to an error for the
+corresponding feature, type of the error measure can be set in flags parameter; if the flow wasn't
+found then the error is not defined (use the status parameter to find such cases).
+@param winSize size of the search window at each pyramid level.
+@param maxLevel 0-based maximal pyramid level number; if set to 0, pyramids are not used (single
+level), if set to 1, two levels are used, and so on; if pyramids are passed to input then
+algorithm will use as many levels as pyramids have but no more than maxLevel.
+@param criteria parameter, specifying the termination criteria of the iterative search algorithm
+(after the specified maximum number of iterations criteria.maxCount or when the search window
+moves by less than criteria.epsilon).
+@param flags operation flags:
+ -   **OPTFLOW_USE_INITIAL_FLOW** uses initial estimations, stored in nextPts; if the flag is
+     not set, then prevPts is copied to nextPts and is considered the initial estimate.
+ -   **OPTFLOW_LK_GET_MIN_EIGENVALS** use minimum eigen values as an error measure (see
+     minEigThreshold description); if the flag is not set, then L1 distance between patches
+     around the original and a moved point, divided by number of pixels in a window, is used as an
+     error measure.
+@param minEigThreshold the algorithm calculates the minimum eigen value of a 2x2 normal matrix of
+optical flow equations (this matrix is called a spatial gradient matrix in @cite Bouguet00), divided
+by number of pixels in a window; if this value is less than minEigThreshold, then a corresponding
+feature is filtered out and its flow is not processed, so it allows to remove bad points and get a
+performance boost.
+
+The function implements a sparse iterative version of the Lucas-Kanade optical flow in pyramids. See
+@cite Bouguet00 . The function is parallelized with the TBB library.
+
+@note Some examples:
+
+-   An example using the Lucas-Kanade optical flow algorithm can be found at
+    opencv_source_code/samples/cpp/lkdemo.cpp
+-   (Python) An example using the Lucas-Kanade optical flow algorithm can be found at
+    opencv_source_code/samples/python/lk_track.py
+-   (Python) An example using the Lucas-Kanade tracker for homography matching can be found at
+    opencv_source_code/samples/python/lk_homography.py
+ */
+CV_EXPORTS_W void calcOpticalFlowPyrLK( InputArray prevImg, InputArray nextImg,
+                                        InputArray prevPts, InputOutputArray nextPts,
+                                        OutputArray status, OutputArray err,
+                                        Size winSize = Size(21,21), int maxLevel = 3,
+                                        TermCriteria criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 0.01),
+                                        int flags = 0, double minEigThreshold = 1e-4 );
+
+/** @brief Computes a dense optical flow using the Gunnar Farneback's algorithm.
+
+@param prev first 8-bit single-channel input image.
+@param next second input image of the same size and the same type as prev.
+@param flow computed flow image that has the same size as prev and type CV_32FC2.
+@param pyr_scale parameter, specifying the image scale (\<1) to build pyramids for each image;
+pyr_scale=0.5 means a classical pyramid, where each next layer is twice smaller than the previous
+one.
+@param levels number of pyramid layers including the initial image; levels=1 means that no extra
+layers are created and only the original images are used.
+@param winsize averaging window size; larger values increase the algorithm robustness to image
+noise and give more chances for fast motion detection, but yield more blurred motion field.
+@param iterations number of iterations the algorithm does at each pyramid level.
+@param poly_n size of the pixel neighborhood used to find polynomial expansion in each pixel;
+larger values mean that the image will be approximated with smoother surfaces, yielding more
+robust algorithm and more blurred motion field, typically poly_n =5 or 7.
+@param poly_sigma standard deviation of the Gaussian that is used to smooth derivatives used as a
+basis for the polynomial expansion; for poly_n=5, you can set poly_sigma=1.1, for poly_n=7, a
+good value would be poly_sigma=1.5.
+@param flags operation flags that can be a combination of the following:
+ -   **OPTFLOW_USE_INITIAL_FLOW** uses the input flow as an initial flow approximation.
+ -   **OPTFLOW_FARNEBACK_GAUSSIAN** uses the Gaussian \f$\texttt{winsize}\times\texttt{winsize}\f$
+     filter instead of a box filter of the same size for optical flow estimation; usually, this
+     option gives a more accurate flow than with a box filter, at the cost of lower speed;
+     normally, winsize for a Gaussian window should be set to a larger value to achieve the same
+     level of robustness.
+
+The function finds an optical flow for each prev pixel using the @cite Farneback2003 algorithm so that
+
+\f[\texttt{prev} (y,x)  \sim \texttt{next} ( y + \texttt{flow} (y,x)[1],  x + \texttt{flow} (y,x)[0])\f]
+
+@note Some examples:
+
+-   An example using the optical flow algorithm described by Gunnar Farneback can be found at
+    opencv_source_code/samples/cpp/fback.cpp
+-   (Python) An example using the optical flow algorithm described by Gunnar Farneback can be
+    found at opencv_source_code/samples/python/opt_flow.py
+ */
+CV_EXPORTS_W void calcOpticalFlowFarneback( InputArray prev, InputArray next, InputOutputArray flow,
+                                            double pyr_scale, int levels, int winsize,
+                                            int iterations, int poly_n, double poly_sigma,
+                                            int flags );
+
+/** @brief Computes an optimal affine transformation between two 2D point sets.
+
+@param src First input 2D point set stored in std::vector or Mat, or an image stored in Mat.
+@param dst Second input 2D point set of the same size and the same type as src, or another image.
+@param fullAffine If true, the function finds an optimal affine transformation with no additional
+restrictions (6 degrees of freedom). Otherwise, the class of transformations to choose from is
+limited to combinations of translation, rotation, and uniform scaling (4 degrees of freedom).
+
+The function finds an optimal affine transform *[A|b]* (a 2 x 3 floating-point matrix) that
+approximates best the affine transformation between:
+
+*   Two point sets
+*   Two raster images. In this case, the function first finds some features in the src image and
+    finds the corresponding features in dst image. After that, the problem is reduced to the first
+    case.
+In case of point sets, the problem is formulated as follows: you need to find a 2x2 matrix *A* and
+2x1 vector *b* so that:
+
+\f[[A^*|b^*] = arg  \min _{[A|b]}  \sum _i  \| \texttt{dst}[i] - A { \texttt{src}[i]}^T - b  \| ^2\f]
+where src[i] and dst[i] are the i-th points in src and dst, respectively
+\f$[A|b]\f$ can be either arbitrary (when fullAffine=true ) or have a form of
+\f[\begin{bmatrix} a_{11} & a_{12} & b_1  \\ -a_{12} & a_{11} & b_2  \end{bmatrix}\f]
+when fullAffine=false.
+
+@deprecated Use cv::estimateAffine2D, cv::estimateAffinePartial2D instead. If you are using this function
+with images, extract points using cv::calcOpticalFlowPyrLK and then use the estimation functions.
+
+@sa
+estimateAffine2D, estimateAffinePartial2D, getAffineTransform, getPerspectiveTransform, findHomography
+ */
+CV_DEPRECATED CV_EXPORTS Mat estimateRigidTransform( InputArray src, InputArray dst, bool fullAffine );
+
+enum
+{
+    MOTION_TRANSLATION = 0, //!< translational motion model (2x3 warpMatrix, two parameters estimated)
+    MOTION_EUCLIDEAN   = 1, //!< Euclidean (rigid) transformation (2x3 warpMatrix, three parameters estimated)
+    MOTION_AFFINE      = 2, //!< affine motion model (2x3 warpMatrix, six parameters estimated)
+    MOTION_HOMOGRAPHY  = 3  //!< homography (3x3 warpMatrix, eight parameters estimated)
+};
+
+/** @brief Computes the Enhanced Correlation Coefficient value between two images @cite EP08 .
+
+@param templateImage single-channel template image; CV_8U or CV_32F array.
+@param inputImage single-channel input image to be warped to provide an image similar to
+ templateImage, same type as templateImage.
+@param inputMask An optional mask to indicate valid values of inputImage.
+
+@sa
+findTransformECC
+ */
+
+CV_EXPORTS_W double computeECC(InputArray templateImage, InputArray inputImage, InputArray inputMask = noArray());
+
+/** @example samples/cpp/image_alignment.cpp
+An example using the image alignment ECC algorithm
+*/
+
+/** @brief Finds the geometric transform (warp) between two images in terms of the ECC criterion @cite EP08 .
+
+@param templateImage single-channel template image; CV_8U or CV_32F array.
+@param inputImage single-channel input image which should be warped with the final warpMatrix in
+order to provide an image similar to templateImage, same type as templateImage.
+@param warpMatrix floating-point \f$2\times 3\f$ or \f$3\times 3\f$ mapping matrix (warp).
+@param motionType parameter, specifying the type of motion:
+ -   **MOTION_TRANSLATION** sets a translational motion model; warpMatrix is \f$2\times 3\f$ with
+     the first \f$2\times 2\f$ part being the unity matrix and the rest two parameters being
+     estimated.
+ -   **MOTION_EUCLIDEAN** sets a Euclidean (rigid) transformation as motion model; three
+     parameters are estimated; warpMatrix is \f$2\times 3\f$.
+ -   **MOTION_AFFINE** sets an affine motion model (DEFAULT); six parameters are estimated;
+     warpMatrix is \f$2\times 3\f$.
+ -   **MOTION_HOMOGRAPHY** sets a homography as a motion model; eight parameters are
+     estimated; warpMatrix is \f$3\times 3\f$.
+@param criteria parameter, specifying the termination criteria of the ECC algorithm;
+criteria.epsilon defines the threshold of the increment in the correlation coefficient between two
+iterations (a negative criteria.epsilon makes criteria.maxCount the only termination criterion).
+Default values are shown in the overload declared below.
+@param inputMask An optional mask to indicate valid values of inputImage.
+@param gaussFiltSize An optional value indicating size of gaussian blur filter; (DEFAULT: 5)
+
+The function estimates the optimum transformation (warpMatrix) with respect to ECC criterion
+(@cite EP08), that is
+
+\f[\texttt{warpMatrix} = \arg\max_{W} \texttt{ECC}(\texttt{templateImage}(x,y),\texttt{inputImage}(x',y'))\f]
+
+where
+
+\f[\begin{bmatrix} x' \\ y' \end{bmatrix} = W \cdot \begin{bmatrix} x \\ y \\ 1 \end{bmatrix}\f]
+
+(the equation holds with homogeneous coordinates for homography). It returns the final enhanced
+correlation coefficient, that is the correlation coefficient between the template image and the
+final warped input image. When a \f$3\times 3\f$ matrix is given with motionType =0, 1 or 2, the third
+row is ignored.
+
+Unlike findHomography and estimateRigidTransform, the function findTransformECC implements an
+area-based alignment that builds on intensity similarities. In essence, the function updates the
+initial transformation that roughly aligns the images. If this information is missing, the identity
+warp (unity matrix) is used as an initialization. Note that if images undergo strong
+displacements/rotations, an initial transformation that roughly aligns the images is necessary
+(e.g., a simple euclidean/similarity transform that allows for the images showing the same image
+content approximately). Use inverse warping in the second image to take an image close to the first
+one, i.e. use the flag WARP_INVERSE_MAP with warpAffine or warpPerspective. See also the OpenCV
+sample image_alignment.cpp that demonstrates the use of the function. Note that the function throws
+an exception if the algorithm does not converge.
+
+@sa
+computeECC, estimateAffine2D, estimateAffinePartial2D, findHomography
+ */
+CV_EXPORTS_W double findTransformECC( InputArray templateImage, InputArray inputImage,
+                                      InputOutputArray warpMatrix, int motionType,
+                                      TermCriteria criteria,
+                                      InputArray inputMask, int gaussFiltSize);
+
+/** @overload */
+CV_EXPORTS_W
+double findTransformECC(InputArray templateImage, InputArray inputImage,
+    InputOutputArray warpMatrix, int motionType = MOTION_AFFINE,
+    TermCriteria criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 50, 0.001),
+    InputArray inputMask = noArray());
+
+/** @example samples/cpp/kalman.cpp
+An example using the standard Kalman filter
+*/
+
+/** @brief Kalman filter class.
+
+The class implements a standard Kalman filter <http://en.wikipedia.org/wiki/Kalman_filter>,
+@cite Welch95 . However, you can modify transitionMatrix, controlMatrix, and measurementMatrix to get
+an extended Kalman filter functionality.
+@note In C API when CvKalman\* kalmanFilter structure is not needed anymore, it should be released
+with cvReleaseKalman(&kalmanFilter)
+ */
+class CV_EXPORTS_W KalmanFilter
+{
+public:
+    CV_WRAP KalmanFilter();
+    /** @overload
+    @param dynamParams Dimensionality of the state.
+    @param measureParams Dimensionality of the measurement.
+    @param controlParams Dimensionality of the control vector.
+    @param type Type of the created matrices that should be CV_32F or CV_64F.
+    */
+    CV_WRAP KalmanFilter( int dynamParams, int measureParams, int controlParams = 0, int type = CV_32F );
+
+    /** @brief Re-initializes Kalman filter. The previous content is destroyed.
+
+    @param dynamParams Dimensionality of the state.
+    @param measureParams Dimensionality of the measurement.
+    @param controlParams Dimensionality of the control vector.
+    @param type Type of the created matrices that should be CV_32F or CV_64F.
+     */
+    void init( int dynamParams, int measureParams, int controlParams = 0, int type = CV_32F );
+
+    /** @brief Computes a predicted state.
+
+    @param control The optional input control
+     */
+    CV_WRAP const Mat& predict( const Mat& control = Mat() );
+
+    /** @brief Updates the predicted state from the measurement.
+
+    @param measurement The measured system parameters
+     */
+    CV_WRAP const Mat& correct( const Mat& measurement );
+
+    CV_PROP_RW Mat statePre;           //!< predicted state (x'(k)): x(k)=A*x(k-1)+B*u(k)
+    CV_PROP_RW Mat statePost;          //!< corrected state (x(k)): x(k)=x'(k)+K(k)*(z(k)-H*x'(k))
+    CV_PROP_RW Mat transitionMatrix;   //!< state transition matrix (A)
+    CV_PROP_RW Mat controlMatrix;      //!< control matrix (B) (not used if there is no control)
+    CV_PROP_RW Mat measurementMatrix;  //!< measurement matrix (H)
+    CV_PROP_RW Mat processNoiseCov;    //!< process noise covariance matrix (Q)
+    CV_PROP_RW Mat measurementNoiseCov;//!< measurement noise covariance matrix (R)
+    CV_PROP_RW Mat errorCovPre;        //!< priori error estimate covariance matrix (P'(k)): P'(k)=A*P(k-1)*At + Q
+    CV_PROP_RW Mat gain;               //!< Kalman gain matrix (K(k)): K(k)=P'(k)*Ht*inv(H*P'(k)*Ht+R)
+    CV_PROP_RW Mat errorCovPost;       //!< posteriori error estimate covariance matrix (P(k)): P(k)=(I-K(k)*H)*P'(k)
+
+    // temporary matrices (public members, but not exported as CV_PROP_RW properties)
+    Mat temp1;
+    Mat temp2;
+    Mat temp3;
+    Mat temp4;
+    Mat temp5;
+};
+
+
+/** @brief Read a .flo file
+
+ @param path Path to the file to be loaded
+
+ The function readOpticalFlow loads a flow field from a file and returns it as a single matrix.
+ Resulting Mat has a type CV_32FC2 - floating-point, 2-channel. First channel corresponds to the
+ flow in the horizontal direction (u), second - vertical (v).
+ */
+CV_EXPORTS_W Mat readOpticalFlow( const String& path );
+/** @brief Write a .flo to disk
+
+ @param path Path to the file to be written
+ @param flow Flow field to be stored
+
+ The function stores a flow field in a file, returns true on success, false otherwise.
+ The flow field must be a 2-channel, floating-point matrix (CV_32FC2). First channel corresponds
+ to the flow in the horizontal direction (u), second - vertical (v).
+ */
+CV_EXPORTS_W bool writeOpticalFlow( const String& path, InputArray flow );
+
+/**
+   Base class for dense optical flow algorithms
+*/
+class CV_EXPORTS_W DenseOpticalFlow : public Algorithm
+{
+public:
+    /** @brief Calculates an optical flow.
+
+    @param I0 first 8-bit single-channel input image.
+    @param I1 second input image of the same size and the same type as I0.
+    @param flow computed flow image that has the same size as I0 and type CV_32FC2.
+     */
+    CV_WRAP virtual void calc( InputArray I0, InputArray I1, InputOutputArray flow ) = 0;
+    /** @brief Releases all inner buffers.
+    */
+    CV_WRAP virtual void collectGarbage() = 0;
+};
+
+/** @brief Base interface for sparse optical flow algorithms.
+ */
+class CV_EXPORTS_W SparseOpticalFlow : public Algorithm
+{
+public:
+    /** @brief Calculates a sparse optical flow.
+
+    @param prevImg First input image.
+    @param nextImg Second input image of the same size and the same type as prevImg.
+    @param prevPts Vector of 2D points for which the flow needs to be found.
+    @param nextPts Output vector of 2D points containing the calculated new positions of input features in the second image.
+    @param status Output status vector. Each element of the vector is set to 1 if the
+                  flow for the corresponding features has been found. Otherwise, it is set to 0.
+    @param err Optional output vector that contains error response for each point (inverse confidence).
+     */
+    CV_WRAP virtual void calc(InputArray prevImg, InputArray nextImg,
+                      InputArray prevPts, InputOutputArray nextPts,
+                      OutputArray status,
+                      OutputArray err = cv::noArray()) = 0;
+};
+
+
+/** @brief Class computing a dense optical flow using the Gunnar Farneback's algorithm.
+ */
+class CV_EXPORTS_W FarnebackOpticalFlow : public DenseOpticalFlow
+{
+public:
+    CV_WRAP virtual int getNumLevels() const = 0;
+    CV_WRAP virtual void setNumLevels(int numLevels) = 0;
+
+    CV_WRAP virtual double getPyrScale() const = 0;
+    CV_WRAP virtual void setPyrScale(double pyrScale) = 0;
+
+    CV_WRAP virtual bool getFastPyramids() const = 0;
+    CV_WRAP virtual void setFastPyramids(bool fastPyramids) = 0;
+
+    CV_WRAP virtual int getWinSize() const = 0;
+    CV_WRAP virtual void setWinSize(int winSize) = 0;
+
+    CV_WRAP virtual int getNumIters() const = 0;
+    CV_WRAP virtual void setNumIters(int numIters) = 0;
+
+    CV_WRAP virtual int getPolyN() const = 0;
+    CV_WRAP virtual void setPolyN(int polyN) = 0;
+
+    CV_WRAP virtual double getPolySigma() const = 0;
+    CV_WRAP virtual void setPolySigma(double polySigma) = 0;
+
+    CV_WRAP virtual int getFlags() const = 0;
+    CV_WRAP virtual void setFlags(int flags) = 0;
+
+    CV_WRAP static Ptr<FarnebackOpticalFlow> create(
+            int numLevels = 5,
+            double pyrScale = 0.5,
+            bool fastPyramids = false,
+            int winSize = 13,
+            int numIters = 10,
+            int polyN = 5,
+            double polySigma = 1.1,
+            int flags = 0);
+};
+
+/** @brief Variational optical flow refinement
+
+This class implements variational refinement of the input flow field, i.e.
+it uses input flow to initialize the minimization of the following functional:
+\f$E(U) = \int_{\Omega} \delta \Psi(E_I) + \gamma \Psi(E_G) + \alpha \Psi(E_S) \f$,
+where \f$E_I,E_G,E_S\f$ are color constancy, gradient constancy and smoothness terms
+respectively. \f$\Psi(s^2)=\sqrt{s^2+\epsilon^2}\f$ is a robust penalizer to limit the
+influence of outliers. A complete formulation and a description of the minimization
+procedure can be found in @cite Brox2004
+*/
+class CV_EXPORTS_W VariationalRefinement : public DenseOpticalFlow
+{
+public:
+    /** @brief @ref calc function overload to handle separate horizontal (u) and vertical (v) flow components
+    (to avoid extra splits/merges) */
+    CV_WRAP virtual void calcUV(InputArray I0, InputArray I1, InputOutputArray flow_u, InputOutputArray flow_v) = 0;
+
+    /** @brief Number of outer (fixed-point) iterations in the minimization procedure.
+    @see setFixedPointIterations */
+    CV_WRAP virtual int getFixedPointIterations() const = 0;
+    /** @copybrief getFixedPointIterations @see getFixedPointIterations */
+    CV_WRAP virtual void setFixedPointIterations(int val) = 0;
+
+    /** @brief Number of inner successive over-relaxation (SOR) iterations
+        in the minimization procedure to solve the respective linear system.
+    @see setSorIterations */
+    CV_WRAP virtual int getSorIterations() const = 0;
+    /** @copybrief getSorIterations @see getSorIterations */
+    CV_WRAP virtual void setSorIterations(int val) = 0;
+
+    /** @brief Relaxation factor in SOR
+    @see setOmega */
+    CV_WRAP virtual float getOmega() const = 0;
+    /** @copybrief getOmega @see getOmega */
+    CV_WRAP virtual void setOmega(float val) = 0;
+
+    /** @brief Weight of the smoothness term
+    @see setAlpha */
+    CV_WRAP virtual float getAlpha() const = 0;
+    /** @copybrief getAlpha @see getAlpha */
+    CV_WRAP virtual void setAlpha(float val) = 0;
+
+    /** @brief Weight of the color constancy term
+    @see setDelta */
+    CV_WRAP virtual float getDelta() const = 0;
+    /** @copybrief getDelta @see getDelta */
+    CV_WRAP virtual void setDelta(float val) = 0;
+
+    /** @brief Weight of the gradient constancy term
+    @see setGamma */
+    CV_WRAP virtual float getGamma() const = 0;
+    /** @copybrief getGamma @see getGamma */
+    CV_WRAP virtual void setGamma(float val) = 0;
+
+    /** @brief Norm value shift for robust penalizer
+    @see setEpsilon */
+    CV_WRAP virtual float getEpsilon() const = 0;
+    /** @copybrief getEpsilon @see getEpsilon */
+    CV_WRAP virtual void setEpsilon(float val) = 0;
+
+    /** @brief Creates an instance of VariationalRefinement
+    */
+    CV_WRAP static Ptr<VariationalRefinement> create();
+};
+
+/** @brief DIS optical flow algorithm.
+
+This class implements the Dense Inverse Search (DIS) optical flow algorithm. More
+details about the algorithm can be found at @cite Kroeger2016 . Includes three presets with preselected
+parameters to provide reasonable trade-off between speed and quality. However, even the slowest preset is
+still relatively fast, use DeepFlow if you need better quality and don't care about speed.
+
+This implementation includes several additional features compared to the algorithm described in the paper,
+including spatial propagation of flow vectors (@ref getUseSpatialPropagation), as well as an option to
+utilize an initial flow approximation passed to @ref calc (which is, essentially, temporal propagation,
+if the previous frame's flow field is passed).
+*/
+class CV_EXPORTS_W DISOpticalFlow : public DenseOpticalFlow
+{
+public:
+    enum
+    {
+        PRESET_ULTRAFAST = 0,
+        PRESET_FAST = 1,
+        PRESET_MEDIUM = 2
+    };
+
+    /** @brief Finest level of the Gaussian pyramid on which the flow is computed (zero level
+        corresponds to the original image resolution). The final flow is obtained by bilinear upscaling.
+        @see setFinestScale */
+    CV_WRAP virtual int getFinestScale() const = 0;
+    /** @copybrief getFinestScale @see getFinestScale */
+    CV_WRAP virtual void setFinestScale(int val) = 0;
+
+    /** @brief Size of an image patch for matching (in pixels). Normally, default 8x8 patches work well
+        enough in most cases.
+        @see setPatchSize */
+    CV_WRAP virtual int getPatchSize() const = 0;
+    /** @copybrief getPatchSize @see getPatchSize */
+    CV_WRAP virtual void setPatchSize(int val) = 0;
+
+    /** @brief Stride between neighbor patches. Must be less than patch size. Lower values correspond
+        to higher flow quality.
+        @see setPatchStride */
+    CV_WRAP virtual int getPatchStride() const = 0;
+    /** @copybrief getPatchStride @see getPatchStride */
+    CV_WRAP virtual void setPatchStride(int val) = 0;
+
+    /** @brief Maximum number of gradient descent iterations in the patch inverse search stage. Higher values
+        may improve quality in some cases.
+        @see setGradientDescentIterations */
+    CV_WRAP virtual int getGradientDescentIterations() const = 0;
+    /** @copybrief getGradientDescentIterations @see getGradientDescentIterations */
+    CV_WRAP virtual void setGradientDescentIterations(int val) = 0;
+
+    /** @brief Number of fixed point iterations of variational refinement per scale. Set to zero to
+        disable variational refinement completely. Higher values will typically result in more smooth and
+        high-quality flow.
+    @see setVariationalRefinementIterations */
+    CV_WRAP virtual int getVariationalRefinementIterations() const = 0;
+    /** @copybrief getVariationalRefinementIterations @see getVariationalRefinementIterations */
+    CV_WRAP virtual void setVariationalRefinementIterations(int val) = 0;
+
+    /** @brief Weight of the smoothness term
+    @see setVariationalRefinementAlpha */
+    CV_WRAP virtual float getVariationalRefinementAlpha() const = 0;
+    /** @copybrief getVariationalRefinementAlpha @see getVariationalRefinementAlpha */
+    CV_WRAP virtual void setVariationalRefinementAlpha(float val) = 0;
+
+    /** @brief Weight of the color constancy term
+    @see setVariationalRefinementDelta */
+    CV_WRAP virtual float getVariationalRefinementDelta() const = 0;
+    /** @copybrief getVariationalRefinementDelta @see getVariationalRefinementDelta */
+    CV_WRAP virtual void setVariationalRefinementDelta(float val) = 0;
+
+    /** @brief Weight of the gradient constancy term
+    @see setVariationalRefinementGamma */
+    CV_WRAP virtual float getVariationalRefinementGamma() const = 0;
+    /** @copybrief getVariationalRefinementGamma @see getVariationalRefinementGamma */
+    CV_WRAP virtual void setVariationalRefinementGamma(float val) = 0;
+
+    /** @brief Norm value shift for robust penalizer
+    @see setVariationalRefinementEpsilon */
+    CV_WRAP virtual float getVariationalRefinementEpsilon() const = 0;
+    /** @copybrief getVariationalRefinementEpsilon @see getVariationalRefinementEpsilon */
+    CV_WRAP virtual void setVariationalRefinementEpsilon(float val) = 0;
+
+
+    /** @brief Whether to use mean-normalization of patches when computing patch distance. It is turned on
+        by default as it typically provides a noticeable quality boost because of increased robustness to
+        illumination variations. Turn it off if you are certain that your sequence doesn't contain any changes
+        in illumination.
+    @see setUseMeanNormalization */
+    CV_WRAP virtual bool getUseMeanNormalization() const = 0;
+    /** @copybrief getUseMeanNormalization @see getUseMeanNormalization */
+    CV_WRAP virtual void setUseMeanNormalization(bool val) = 0;
+
+    /** @brief Whether to use spatial propagation of good optical flow vectors. This option is turned on by
+        default, as it tends to work better on average and can sometimes help recover from major errors
+        introduced by the coarse-to-fine scheme employed by the DIS optical flow algorithm. Turning this
+        option off can make the output flow field a bit smoother, however.
+    @see setUseSpatialPropagation */
+    CV_WRAP virtual bool getUseSpatialPropagation() const = 0;
+    /** @copybrief getUseSpatialPropagation @see getUseSpatialPropagation */
+    CV_WRAP virtual void setUseSpatialPropagation(bool val) = 0;
+
+    /** @brief Creates an instance of DISOpticalFlow
+
+    @param preset one of PRESET_ULTRAFAST, PRESET_FAST and PRESET_MEDIUM
+    */
+    CV_WRAP static Ptr<DISOpticalFlow> create(int preset = DISOpticalFlow::PRESET_FAST);
+};
+
+/** @brief Class used for calculating a sparse optical flow.
+
+The class can calculate an optical flow for a sparse feature set using the
+iterative Lucas-Kanade method with pyramids.
+Instances are obtained from the static create() factory; all accessors are pure virtual.
+@sa calcOpticalFlowPyrLK
+
+*/
+class CV_EXPORTS_W SparsePyrLKOpticalFlow : public SparseOpticalFlow
+{
+public:
+    CV_WRAP virtual Size getWinSize() const = 0;
+    CV_WRAP virtual void setWinSize(Size winSize) = 0;
+
+    CV_WRAP virtual int getMaxLevel() const = 0;
+    CV_WRAP virtual void setMaxLevel(int maxLevel) = 0;
+
+    CV_WRAP virtual TermCriteria getTermCriteria() const = 0;
+    CV_WRAP virtual void setTermCriteria(TermCriteria& crit) = 0; // NOTE(review): non-const reference, so a temporary TermCriteria cannot be passed directly
+
+    CV_WRAP virtual int getFlags() const = 0;
+    CV_WRAP virtual void setFlags(int flags) = 0;
+
+    CV_WRAP virtual double getMinEigThreshold() const = 0;
+    CV_WRAP virtual void setMinEigThreshold(double minEigThreshold) = 0;
+
+    CV_WRAP static Ptr<SparsePyrLKOpticalFlow> create(
+            Size winSize = Size(21, 21),
+            int maxLevel = 3, TermCriteria crit =
+            TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 0.01),
+            int flags = 0,
+            double minEigThreshold = 1e-4);
+};
+
+
+
+
+/** @brief Base abstract class for the long-term tracker
+ */
+class CV_EXPORTS_W Tracker
+{
+protected:
+    Tracker();  // protected: only derived classes can construct a Tracker
+public:
+    virtual ~Tracker();
+
+    /** @brief Initialize the tracker with a known bounding box that surrounds the target
+    @param image The initial frame
+    @param boundingBox The initial bounding box
+    */
+    CV_WRAP virtual
+    void init(InputArray image, const Rect& boundingBox) = 0;
+
+    /** @brief Update the tracker, find the new most likely bounding box for the target
+    @param image The current frame
+    @param boundingBox The bounding box that represents the new target location if true was returned; not
+    modified otherwise
+
+    @return True means that the target was located and false means that the tracker cannot locate the target in
+    the current frame. Note that the latter *does not* imply that the tracker has failed; maybe the target is indeed
+    missing from the frame (say, out of sight)
+    */
+    CV_WRAP virtual
+    bool update(InputArray image, CV_OUT Rect& boundingBox) = 0;
+};
+
+
+
+/** @brief The MIL algorithm trains a classifier in an online manner to separate the object from the
+background.
+
+Multiple Instance Learning avoids the drift problem for robust tracking. The implementation is
+based on @cite MIL .
+
+Original code can be found here <http://vision.ucsd.edu/~bbabenko/project_miltrack.shtml>
+ */
+class CV_EXPORTS_W TrackerMIL : public Tracker
+{
+protected:
+    TrackerMIL();  // use ::create()
+public:
+    virtual ~TrackerMIL() CV_OVERRIDE;
+
+    struct CV_EXPORTS_W_SIMPLE Params
+    {
+        CV_WRAP Params();
+        //parameters for sampler
+        CV_PROP_RW float samplerInitInRadius;  //!< radius for gathering positive instances during init
+        CV_PROP_RW int samplerInitMaxNegNum;  //!< number of negative samples to use during init
+        CV_PROP_RW float samplerSearchWinSize;  //!< size of search window
+        CV_PROP_RW float samplerTrackInRadius;  //!< radius for gathering positive instances during tracking
+        CV_PROP_RW int samplerTrackMaxPosNum;  //!< number of positive samples to use during tracking
+        CV_PROP_RW int samplerTrackMaxNegNum;  //!< number of negative samples to use during tracking
+        CV_PROP_RW int featureSetNumFeatures;  //!< number of features
+    };
+
+    /** @brief Create MIL tracker instance
+     *  @param parameters MIL parameters TrackerMIL::Params (default-constructed when omitted)
+     */
+    static CV_WRAP
+    Ptr<TrackerMIL> create(const TrackerMIL::Params &parameters = TrackerMIL::Params());
+
+    //void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
+    //bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
+};
+
+
+
+/** @brief the GOTURN (Generic Object Tracking Using Regression Networks) tracker
+ *
+ *  GOTURN (@cite GOTURN) is a kind of tracker based on Convolutional Neural Networks (CNN). While taking all advantages of CNN trackers,
+ *  GOTURN is much faster due to offline training without online fine-tuning nature.
+ *  GOTURN tracker addresses the problem of single target tracking: given a bounding box label of an object in the first frame of the video,
+ *  we track that object through the rest of the video. NOTE: Current method of GOTURN does not handle occlusions; however, it is fairly
+ *  robust to viewpoint changes, lighting changes, and deformations.
+ *  Inputs of GOTURN are two RGB patches representing Target and Search patches resized to 227x227.
+ *  Outputs of GOTURN are predicted bounding box coordinates, relative to Search patch coordinate system, in format X1,Y1,X2,Y2.
+ *  Original paper is here: <http://davheld.github.io/GOTURN/GOTURN.pdf>
+ *  As well as the original authors' implementation: <https://github.com/davheld/GOTURN#train-the-tracker>
+ *  Implementation of the training algorithm is placed separately here due to 3rd-party dependencies:
+ *  <https://github.com/Auron-X/GOTURN_Training_Toolkit>
+ *  GOTURN architecture goturn.prototxt and trained model goturn.caffemodel are accessible on opencv_extra GitHub repository.
+ */
+class CV_EXPORTS_W TrackerGOTURN : public Tracker
+{
+protected:
+    TrackerGOTURN();  // use ::create()
+public:
+    virtual ~TrackerGOTURN() CV_OVERRIDE;
+
+    struct CV_EXPORTS_W_SIMPLE Params
+    {
+        CV_WRAP Params();
+        CV_PROP_RW std::string modelTxt;  // presumably the path to the architecture file (goturn.prototxt) - TODO confirm
+        CV_PROP_RW std::string modelBin;  // presumably the path to the trained model (goturn.caffemodel) - TODO confirm
+    };
+
+    /** @brief Create GOTURN tracker instance
+    @param parameters GOTURN parameters TrackerGOTURN::Params
+    */
+    static CV_WRAP
+    Ptr<TrackerGOTURN> create(const TrackerGOTURN::Params& parameters = TrackerGOTURN::Params());
+
+    //void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
+    //bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
+};
+
+class CV_EXPORTS_W TrackerDaSiamRPN : public Tracker  // abstract DaSiamRPN tracker interface; instances come from create()
+{
+protected:
+    TrackerDaSiamRPN();  // use ::create()
+public:
+    virtual ~TrackerDaSiamRPN() CV_OVERRIDE;
+
+    struct CV_EXPORTS_W_SIMPLE Params
+    {
+        CV_WRAP Params();
+        CV_PROP_RW std::string model;
+        CV_PROP_RW std::string kernel_cls1;
+        CV_PROP_RW std::string kernel_r1;
+        CV_PROP_RW int backend;
+        CV_PROP_RW int target;
+    };
+
+    /** @brief Create DaSiamRPN tracker instance
+    @param parameters DaSiamRPN parameters TrackerDaSiamRPN::Params
+    */
+    static CV_WRAP
+    Ptr<TrackerDaSiamRPN> create(const TrackerDaSiamRPN::Params& parameters = TrackerDaSiamRPN::Params());
+
+    /** @brief Return tracking score
+    */
+    CV_WRAP virtual float getTrackingScore() = 0;
+
+    //void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
+    //bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
+};
+
+/** @brief the Nano tracker is a super lightweight dnn-based general object tracking.
+ *
+ *  Nano tracker is much faster and extremely lightweight due to special model structure, the whole model size is about 1.9 MB.
+ *  Nano tracker needs two models: one for feature extraction (backbone) and another for localization (neckhead).
+ *  Model download link: https://github.com/HonglinChu/SiamTrackers/tree/master/NanoTrack/models/nanotrackv2
+ *  Original repo is here: https://github.com/HonglinChu/NanoTrack
+ *  Author: HongLinChu, 1628464345@qq.com
+ */
+class CV_EXPORTS_W TrackerNano : public Tracker
+{
+protected:
+    TrackerNano();  // use ::create()
+public:
+    virtual ~TrackerNano() CV_OVERRIDE;
+
+    struct CV_EXPORTS_W_SIMPLE Params
+    {
+        CV_WRAP Params();
+        CV_PROP_RW std::string backbone;
+        CV_PROP_RW std::string neckhead;
+        CV_PROP_RW int backend;
+        CV_PROP_RW int target;
+    };
+
+    /** @brief Create NanoTrack tracker instance
+    @param parameters NanoTrack parameters TrackerNano::Params
+    */
+    static CV_WRAP
+    Ptr<TrackerNano> create(const TrackerNano::Params& parameters = TrackerNano::Params());
+
+    /** @brief Return tracking score
+    */
+    CV_WRAP virtual float getTrackingScore() = 0;
+
+    //void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
+    //bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
+};
+
+/** @brief the VIT tracker is a super lightweight dnn-based general object tracking.
+ *
+ *  VIT tracker is much faster and extremely lightweight due to special model structure, the model file is about 767KB.
+ *  Model download link: https://github.com/opencv/opencv_zoo/tree/main/models/object_tracking_vittrack
+ *  Author: PengyuLiu, 1872918507@qq.com
+ */
+class CV_EXPORTS_W TrackerVit : public Tracker
+{
+protected:
+    TrackerVit();  // use ::create()
+public:
+    virtual ~TrackerVit() CV_OVERRIDE;
+
+    struct CV_EXPORTS_W_SIMPLE Params
+    {
+        CV_WRAP Params();
+        CV_PROP_RW std::string net;
+        CV_PROP_RW int backend;
+        CV_PROP_RW int target;
+        CV_PROP_RW Scalar meanvalue;
+        CV_PROP_RW Scalar stdvalue;
+    };
+
+    /** @brief Create VIT tracker instance
+    @param parameters vit tracker parameters TrackerVit::Params
+    */
+    static CV_WRAP
+    Ptr<TrackerVit> create(const TrackerVit::Params& parameters = TrackerVit::Params());
+
+    /** @brief Return tracking score
+    */
+    CV_WRAP virtual float getTrackingScore() = 0;
+
+    // void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
+    // bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
+};
+
+//! @} video_track
+
+} // cv
+
+#endif
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/video/video.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/video/video.hpp
new file mode 100644
index 000000000000..8267b85d5958
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/video/video.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/video.hpp"
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/videoio.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/videoio.hpp
new file mode 100644
index 000000000000..fb47036bbf56
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/videoio.hpp
@@ -0,0 +1,1182 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_VIDEOIO_HPP
+#define OPENCV_VIDEOIO_HPP
+
+#include "opencv2/core.hpp"
+
+/**
+  @defgroup videoio Video I/O
+
+  @brief Read and write video or image sequences with OpenCV
+
+  ### See also:
+  - @ref videoio_overview
+  - Tutorials: @ref tutorial_table_of_content_app
+  @{
+    @defgroup videoio_flags_base Flags for video I/O
+    @defgroup videoio_flags_others Additional flags for video I/O API backends
+    @defgroup videoio_hwaccel Hardware-accelerated video decoding and encoding
+    @defgroup videoio_c C API for video I/O
+    @defgroup videoio_ios iOS glue for video I/O
+    @defgroup videoio_winrt WinRT glue for video I/O
+    @defgroup videoio_registry Query I/O API backends registry
+  @}
+*/
+
+////////////////////////////////// video io /////////////////////////////////
+
+typedef struct CvCapture CvCapture;          // forward declaration at global scope (outside namespace cv); no struct body is given here
+typedef struct CvVideoWriter CvVideoWriter;  // forward declaration at global scope (outside namespace cv); no struct body is given here
+
+namespace cv
+{
+
+//! @addtogroup videoio
+//! @{
+
+//! @addtogroup videoio_flags_base
+//! @{
+
+
+/** @brief cv::VideoCapture API backends identifier.
+
+Select preferred API for a capture object.
+To be used in the VideoCapture::VideoCapture() constructor or VideoCapture::open().
+
+@note
+-   Backends are available only if they have been built with your OpenCV binaries.
+See @ref videoio_overview for more information.
+-   Microsoft Media Foundation backend tries to use hardware accelerated transformations
+if possible. Environment flag "OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS" set to 0
+disables it and may improve initialization time. More details:
+https://learn.microsoft.com/en-us/windows/win32/medfound/mf-readwrite-enable-hardware-transforms
+*/
+enum VideoCaptureAPIs {
+       CAP_ANY          = 0,            //!< Auto detect == 0
+       CAP_VFW          = 200,          //!< Video For Windows (obsolete, removed); shares the value 200 with CAP_V4L
+       CAP_V4L          = 200,          //!< V4L/V4L2 capturing support
+       CAP_V4L2         = CAP_V4L,      //!< Same as CAP_V4L
+       CAP_FIREWIRE     = 300,          //!< IEEE 1394 drivers
+       CAP_FIREWARE     = CAP_FIREWIRE, //!< Same value as CAP_FIREWIRE
+       CAP_IEEE1394     = CAP_FIREWIRE, //!< Same value as CAP_FIREWIRE
+       CAP_DC1394       = CAP_FIREWIRE, //!< Same value as CAP_FIREWIRE
+       CAP_CMU1394      = CAP_FIREWIRE, //!< Same value as CAP_FIREWIRE
+       CAP_QT           = 500,          //!< QuickTime (obsolete, removed)
+       CAP_UNICAP       = 600,          //!< Unicap drivers (obsolete, removed)
+       CAP_DSHOW        = 700,          //!< DirectShow (via videoInput)
+       CAP_PVAPI        = 800,          //!< PvAPI, Prosilica GigE SDK
+       CAP_OPENNI       = 900,          //!< OpenNI (for Kinect)
+       CAP_OPENNI_ASUS  = 910,          //!< OpenNI (for Asus Xtion)
+       CAP_ANDROID      = 1000,         //!< MediaNDK (API Level 21+) and NDK Camera (API level 24+) for Android
+       CAP_XIAPI        = 1100,         //!< XIMEA Camera API
+       CAP_AVFOUNDATION = 1200,         //!< AVFoundation framework for iOS (OS X Lion will have the same API)
+       CAP_GIGANETIX    = 1300,         //!< Smartek Giganetix GigEVisionSDK
+       CAP_MSMF         = 1400,         //!< Microsoft Media Foundation (via videoInput). See platform specific notes above.
+       CAP_WINRT        = 1410,         //!< Microsoft Windows Runtime using Media Foundation
+       CAP_INTELPERC    = 1500,         //!< RealSense (former Intel Perceptual Computing SDK)
+       CAP_REALSENSE    = 1500,         //!< Synonym for CAP_INTELPERC
+       CAP_OPENNI2      = 1600,         //!< OpenNI2 (for Kinect)
+       CAP_OPENNI2_ASUS = 1610,         //!< OpenNI2 (for Asus Xtion and Occipital Structure sensors)
+       CAP_OPENNI2_ASTRA= 1620,         //!< OpenNI2 (for Orbbec Astra)
+       CAP_GPHOTO2      = 1700,         //!< gPhoto2 connection
+       CAP_GSTREAMER    = 1800,         //!< GStreamer
+       CAP_FFMPEG       = 1900,         //!< Open and record video file or stream using the FFMPEG library
+       CAP_IMAGES       = 2000,         //!< OpenCV Image Sequence (e.g. img_%02d.jpg)
+       CAP_ARAVIS       = 2100,         //!< Aravis SDK
+       CAP_OPENCV_MJPEG = 2200,         //!< Built-in OpenCV MotionJPEG codec
+       CAP_INTEL_MFX    = 2300,         //!< Intel MediaSDK
+       CAP_XINE         = 2400,         //!< XINE engine (Linux)
+       CAP_UEYE         = 2500,         //!< uEye Camera API
+       CAP_OBSENSOR     = 2600,         //!< For Orbbec 3D-Sensor device/module (Astra+, Femto, Astra2, Gemini2, Gemini2L, Gemini2XL, Femto Mega). Attention: Astra2 cameras currently only support Windows and Linux kernel versions no higher than 4.15; higher versions of the Linux kernel may have exceptions.
+     };
+
+
+/** @brief cv::VideoCapture generic properties identifier.
+
+ Reading / writing properties involves many layers. Some unexpected results might happen along this chain.
+ Effective behaviour depends on device hardware, driver and API Backend.
+ @sa videoio_flags_others, VideoCapture::get(), VideoCapture::set()
+*/
+enum VideoCaptureProperties {
+       CAP_PROP_POS_MSEC       =0, //!< Current position of the video file in milliseconds.
+       CAP_PROP_POS_FRAMES     =1, //!< 0-based index of the frame to be decoded/captured next. When the index i is set in RAW mode (CAP_PROP_FORMAT == -1) this will seek to the key frame k, where k <= i.
+       CAP_PROP_POS_AVI_RATIO  =2, //!< Relative position of the video file: 0=start of the film, 1=end of the film.
+       CAP_PROP_FRAME_WIDTH    =3, //!< Width of the frames in the video stream.
+       CAP_PROP_FRAME_HEIGHT   =4, //!< Height of the frames in the video stream.
+       CAP_PROP_FPS            =5, //!< Frame rate.
+       CAP_PROP_FOURCC         =6, //!< 4-character code of codec. see VideoWriter::fourcc .
+       CAP_PROP_FRAME_COUNT    =7, //!< Number of frames in the video file.
+       CAP_PROP_FORMAT         =8, //!< Format of the %Mat objects (see Mat::type()) returned by VideoCapture::retrieve().
+                                   //!< Set value -1 to fetch undecoded RAW video streams (as Mat 8UC1).
+       CAP_PROP_MODE           =9, //!< Backend-specific value indicating the current capture mode.
+       CAP_PROP_BRIGHTNESS    =10, //!< Brightness of the image (only for those cameras that support).
+       CAP_PROP_CONTRAST      =11, //!< Contrast of the image (only for cameras).
+       CAP_PROP_SATURATION    =12, //!< Saturation of the image (only for cameras).
+       CAP_PROP_HUE           =13, //!< Hue of the image (only for cameras).
+       CAP_PROP_GAIN          =14, //!< Gain of the image (only for those cameras that support).
+       CAP_PROP_EXPOSURE      =15, //!< Exposure (only for those cameras that support).
+       CAP_PROP_CONVERT_RGB   =16, //!< Boolean flags indicating whether images should be converted to RGB. <br/>
+                                   //!< *GStreamer note*: The flag is ignored in case if custom pipeline is used. It's user responsibility to interpret pipeline output.
+       CAP_PROP_WHITE_BALANCE_BLUE_U =17, //!< Currently unsupported.
+       CAP_PROP_RECTIFICATION =18, //!< Rectification flag for stereo cameras (note: only supported by DC1394 v 2.x backend currently).
+       CAP_PROP_MONOCHROME    =19,
+       CAP_PROP_SHARPNESS     =20,
+       CAP_PROP_AUTO_EXPOSURE =21, //!< DC1394: exposure control done by camera, user can adjust reference level using this feature.
+       CAP_PROP_GAMMA         =22,
+       CAP_PROP_TEMPERATURE   =23,
+       CAP_PROP_TRIGGER       =24,
+       CAP_PROP_TRIGGER_DELAY =25,
+       CAP_PROP_WHITE_BALANCE_RED_V =26,
+       CAP_PROP_ZOOM          =27,
+       CAP_PROP_FOCUS         =28,
+       CAP_PROP_GUID          =29,
+       CAP_PROP_ISO_SPEED     =30,
+       CAP_PROP_BACKLIGHT     =32, // note: value 31 is not assigned to any enumerator in this enum
+       CAP_PROP_PAN           =33,
+       CAP_PROP_TILT          =34,
+       CAP_PROP_ROLL          =35,
+       CAP_PROP_IRIS          =36,
+       CAP_PROP_SETTINGS      =37, //!< Pop up video/camera filter dialog (note: only supported by DSHOW backend currently. The property value is ignored)
+       CAP_PROP_BUFFERSIZE    =38,
+       CAP_PROP_AUTOFOCUS     =39,
+       CAP_PROP_SAR_NUM       =40, //!< Sample aspect ratio: num/den (num)
+       CAP_PROP_SAR_DEN       =41, //!< Sample aspect ratio: num/den (den)
+       CAP_PROP_BACKEND       =42, //!< Current backend (enum VideoCaptureAPIs). Read-only property
+       CAP_PROP_CHANNEL       =43, //!< Video input or Channel Number (only for those cameras that support)
+       CAP_PROP_AUTO_WB       =44, //!< enable/ disable auto white-balance
+       CAP_PROP_WB_TEMPERATURE=45, //!< white-balance color temperature
+       CAP_PROP_CODEC_PIXEL_FORMAT =46,    //!< (read-only) codec's pixel format. 4-character code - see VideoWriter::fourcc . Subset of [AV_PIX_FMT_*](https://github.com/FFmpeg/FFmpeg/blob/master/libavcodec/raw.c) or -1 if unknown
+       CAP_PROP_BITRATE       =47, //!< (read-only) Video bitrate in kbits/s
+       CAP_PROP_ORIENTATION_META=48, //!< (read-only) Frame rotation defined by stream meta (applicable for FFmpeg and AVFoundation back-ends only)
+       CAP_PROP_ORIENTATION_AUTO=49, //!< if true - rotates output frames of CvCapture considering video file's metadata  (applicable for FFmpeg and AVFoundation back-ends only) (https://github.com/opencv/opencv/issues/15499)
+       CAP_PROP_HW_ACCELERATION=50, //!< (**open-only**) Hardware acceleration type (see #VideoAccelerationType). Setting supported only via `params` parameter in cv::VideoCapture constructor / .open() method. Default value is backend-specific.
+       CAP_PROP_HW_DEVICE      =51, //!< (**open-only**) Hardware device index (select GPU if multiple available). Device enumeration is acceleration type specific.
+       CAP_PROP_HW_ACCELERATION_USE_OPENCL=52, //!< (**open-only**) If non-zero, create new OpenCL context and bind it to current thread. The OpenCL context created with Video Acceleration context attached it (if not attached yet) for optimized GPU data copy between HW accelerated decoder and cv::UMat.
+       CAP_PROP_OPEN_TIMEOUT_MSEC=53, //!< (**open-only**) timeout in milliseconds for opening a video capture (applicable for FFmpeg and GStreamer back-ends only)
+       CAP_PROP_READ_TIMEOUT_MSEC=54, //!< (**open-only**) timeout in milliseconds for reading from a video capture (applicable for FFmpeg and GStreamer back-ends only)
+       CAP_PROP_STREAM_OPEN_TIME_USEC =55, //!< (read-only) time in microseconds since Jan 1 1970 when stream was opened. Applicable for FFmpeg backend only. Useful for RTSP and other live streams
+       CAP_PROP_VIDEO_TOTAL_CHANNELS = 56, //!< (read-only) Number of video channels
+       CAP_PROP_VIDEO_STREAM = 57, //!< (**open-only**) Specify video stream, 0-based index. Use -1 to disable video stream from file or IP cameras. Default value is 0.
+       CAP_PROP_AUDIO_STREAM = 58, //!< (**open-only**) Specify stream in multi-language media files, -1 - disable audio processing or microphone. Default value is -1.
+       CAP_PROP_AUDIO_POS = 59, //!< (read-only) Audio position is measured in samples. Accurate audio sample timestamp of previous grabbed fragment. See CAP_PROP_AUDIO_SAMPLES_PER_SECOND and CAP_PROP_AUDIO_SHIFT_NSEC.
+       CAP_PROP_AUDIO_SHIFT_NSEC = 60, //!< (read-only) Contains the time difference between the start of the audio stream and the video stream in nanoseconds. Positive value means that audio is started after the first video frame. Negative value means that audio is started before the first video frame.
+       CAP_PROP_AUDIO_DATA_DEPTH = 61, //!< (open, read) Alternative definition to bits-per-sample, but with clear handling of 32F / 32S
+       CAP_PROP_AUDIO_SAMPLES_PER_SECOND = 62, //!< (open, read) determined from file/codec input. If not specified, then selected audio sample rate is 44100
+       CAP_PROP_AUDIO_BASE_INDEX = 63, //!< (read-only) Index of the first audio channel for .retrieve() calls. That audio channel number continues enumeration after video channels.
+       CAP_PROP_AUDIO_TOTAL_CHANNELS = 64, //!< (read-only) Number of audio channels in the selected audio stream (mono, stereo, etc)
+       CAP_PROP_AUDIO_TOTAL_STREAMS = 65, //!< (read-only) Number of audio streams.
+       CAP_PROP_AUDIO_SYNCHRONIZE = 66, //!< (open, read) Enables audio synchronization.
+       CAP_PROP_LRF_HAS_KEY_FRAME = 67, //!< FFmpeg back-end only - Indicates whether the Last Raw Frame (LRF), output from VideoCapture::read() when VideoCapture is initialized with VideoCapture::open(CAP_FFMPEG, {CAP_PROP_FORMAT, -1}) or VideoCapture::set(CAP_PROP_FORMAT,-1) is called before the first call to VideoCapture::read(), contains encoded data for a key frame.
+       CAP_PROP_CODEC_EXTRADATA_INDEX = 68, //!< Positive index indicates that returning extra data is supported by the video back end.  This can be retrieved as cap.retrieve(data, <returned index>).  E.g. When reading from a h264 encoded RTSP stream, the FFmpeg backend could return the SPS and/or PPS if available (if sent in reply to a DESCRIBE request), from calls to cap.retrieve(data, <returned index>).
+       CAP_PROP_FRAME_TYPE = 69, //!< (read-only) FFmpeg back-end only - Frame type ascii code (73 = 'I', 80 = 'P', 66 = 'B' or 63 = '?' if unknown) of the most recently read frame.
+       CAP_PROP_N_THREADS = 70, //!< (**open-only**) Set the maximum number of threads to use. Use 0 to use as many threads as CPU cores (applicable for FFmpeg back-end only).
+#ifndef CV_DOXYGEN
+       CV__CAP_PROP_LATEST // no initializer, so its value is one past CAP_PROP_N_THREADS (71); compiled out when CV_DOXYGEN is defined
+#endif
+     };
+
+/** @brief cv::VideoWriter generic properties identifier.
+ @sa VideoWriter::get(), VideoWriter::set()
+*/
+enum VideoWriterProperties {
+  VIDEOWRITER_PROP_QUALITY = 1,    //!< Current quality (0..100%) of the encoded videostream. Can be adjusted dynamically in some codecs.
+  VIDEOWRITER_PROP_FRAMEBYTES = 2, //!< (Read-only): Size of the just-encoded video frame. Note that the encoding order may be different from representation order.
+  VIDEOWRITER_PROP_NSTRIPES = 3,   //!< Number of stripes for parallel encoding. -1 for auto detection.
+  VIDEOWRITER_PROP_IS_COLOR = 4,   //!< If it is not zero, the encoder will expect and encode color frames, otherwise it
+                                   //!< will work with grayscale frames.
+  VIDEOWRITER_PROP_DEPTH = 5,      //!< Defaults to \ref CV_8U.
+  VIDEOWRITER_PROP_HW_ACCELERATION = 6, //!< (**open-only**) Hardware acceleration type (see #VideoAccelerationType). Setting supported only via `params` parameter in VideoWriter constructor / .open() method. Default value is backend-specific.
+  VIDEOWRITER_PROP_HW_DEVICE       = 7, //!< (**open-only**) Hardware device index (select GPU if multiple available). Device enumeration is acceleration type specific.
+  VIDEOWRITER_PROP_HW_ACCELERATION_USE_OPENCL= 8, //!< (**open-only**) If non-zero, create new OpenCL context and bind it to current thread. The OpenCL context is created with the Video Acceleration context attached to it (if not attached yet) for optimized GPU data copy between cv::UMat and HW accelerated encoder.
+  VIDEOWRITER_PROP_RAW_VIDEO = 9, //!< (**open-only**) Set to non-zero to enable encapsulation of an encoded raw video stream. Each raw encoded video frame should be passed to VideoWriter::write() as single row or column of a \ref CV_8UC1 Mat. \note If the key frame interval is not 1 then it must be manually specified by the user. This can either be performed during initialization passing \ref VIDEOWRITER_PROP_KEY_INTERVAL as one of the extra encoder params to \ref VideoWriter::VideoWriter(const String &, int, double, const Size &, const std::vector< int > &params) or afterwards by setting the \ref VIDEOWRITER_PROP_KEY_FLAG with \ref VideoWriter::set() before writing each frame. FFMpeg backend only.
+  VIDEOWRITER_PROP_KEY_INTERVAL = 10, //!< (**open-only**) Set the key frame interval using raw video encapsulation (\ref VIDEOWRITER_PROP_RAW_VIDEO != 0). Defaults to 1 when not set. FFMpeg backend only.
+  VIDEOWRITER_PROP_KEY_FLAG = 11, //!< Set to non-zero to signal that the following frames are key frames or zero if not, when encapsulating raw video (\ref VIDEOWRITER_PROP_RAW_VIDEO != 0). FFMpeg backend only.
+#ifndef CV_DOXYGEN
+  CV__VIDEOWRITER_PROP_LATEST // Sentinel: implicitly one greater than the last explicit value above (i.e. 12); excluded from the Doxygen build.
+#endif
+};
+
+//! @} videoio_flags_base
+
+//! @addtogroup videoio_flags_others
+//! @{
+
+/** @name Hardware acceleration support
+    @{
+*/
+
+/** @brief Video Acceleration type
+ *
+ * Used as value in #CAP_PROP_HW_ACCELERATION and #VIDEOWRITER_PROP_HW_ACCELERATION
+ *
+ * @note In case of the FFmpeg backend, it is translated to enum AVHWDeviceType (https://github.com/FFmpeg/FFmpeg/blob/master/libavutil/hwcontext.h)
+ */
+enum VideoAccelerationType
+{
+    VIDEO_ACCELERATION_NONE     =  0,  //!< Do not require any specific H/W acceleration, prefer software processing.
+                                       //!< Reading of this value means that special H/W accelerated handling is not added or not detected by OpenCV.
+
+    VIDEO_ACCELERATION_ANY      =  1,  //!< Prefer to use H/W acceleration. If none is supported, fall back to software processing.
+                                       //!< @note H/W acceleration may require special configuration of the environment in use.
+                                       //!< @note Results in the encoding scenario may differ between software and hardware accelerated encoders.
+
+    VIDEO_ACCELERATION_D3D11    =  2,  //!< DirectX 11
+    VIDEO_ACCELERATION_VAAPI    =  3,  //!< VAAPI
+    VIDEO_ACCELERATION_MFX      =  4,  //!< libmfx (Intel MediaSDK/oneVPL)
+};
+
+//! @} Hardware acceleration support
+
+/** @name IEEE 1394 drivers
+    @{
+*/
+
+/** @brief Modes of the IEEE 1394 controlling registers
+(can be: auto, manual, auto single push, absolute; the latter is allowed with any other mode).
+Every feature can have only one mode turned on at a time.
+*/
+enum { CAP_PROP_DC1394_OFF                = -4, //!< turn the feature off (not controlled manually nor automatically).
+       CAP_PROP_DC1394_MODE_MANUAL        = -3, //!< set automatically when a value of the feature is set by the user.
+       CAP_PROP_DC1394_MODE_AUTO          = -2,
+       CAP_PROP_DC1394_MODE_ONE_PUSH_AUTO = -1,
+       CAP_PROP_DC1394_MAX                = 31
+     };
+
+//! @} IEEE 1394 drivers
+
+/** @name OpenNI (for Kinect)
+    @{
+*/
+
+//! OpenNI map generators (one distinct bit flag per generator)
+enum { CAP_OPENNI_DEPTH_GENERATOR = 1 << 31, // NOTE(review): shifts into the sign bit when int is 32 bits wide - confirm the negative value is intended
+       CAP_OPENNI_IMAGE_GENERATOR = 1 << 30,
+       CAP_OPENNI_IR_GENERATOR    = 1 << 29,
+       CAP_OPENNI_GENERATORS_MASK = CAP_OPENNI_DEPTH_GENERATOR + CAP_OPENNI_IMAGE_GENERATOR + CAP_OPENNI_IR_GENERATOR //!< Sum of the three generator flags above.
+     };
+
+//! Properties of cameras available through the OpenNI backend
+enum { CAP_PROP_OPENNI_OUTPUT_MODE       = 100,
+       CAP_PROP_OPENNI_FRAME_MAX_DEPTH   = 101, //!< In mm
+       CAP_PROP_OPENNI_BASELINE          = 102, //!< In mm
+       CAP_PROP_OPENNI_FOCAL_LENGTH      = 103, //!< In pixels
+       CAP_PROP_OPENNI_REGISTRATION      = 104, //!< Flag that synchronizes the remapping depth map to image map
+                                                //!< by changing depth generator's view point (if the flag is "on") or
+                                                //!< sets this view point to its normal one (if the flag is "off").
+       CAP_PROP_OPENNI_REGISTRATION_ON   = CAP_PROP_OPENNI_REGISTRATION, //!< Alias of CAP_PROP_OPENNI_REGISTRATION (same value).
+       CAP_PROP_OPENNI_APPROX_FRAME_SYNC = 105,
+       CAP_PROP_OPENNI_MAX_BUFFER_SIZE   = 106,
+       CAP_PROP_OPENNI_CIRCLE_BUFFER     = 107,
+       CAP_PROP_OPENNI_MAX_TIME_DURATION = 108,
+       CAP_PROP_OPENNI_GENERATOR_PRESENT = 109,
+       CAP_PROP_OPENNI2_SYNC             = 110,
+       CAP_PROP_OPENNI2_MIRROR           = 111
+     };
+
+#ifdef _MSC_VER
+#pragma warning( push )
+#pragma warning( disable: 5054 ) // C5054: operator + between enumerations of different types is deprecated; the sums below add enumerators of two distinct unnamed enums.
+#endif
+//! OpenNI shortcuts: each value is a generator flag plus a CAP_PROP_OPENNI_* property id
+enum { CAP_OPENNI_IMAGE_GENERATOR_PRESENT         = CAP_OPENNI_IMAGE_GENERATOR + CAP_PROP_OPENNI_GENERATOR_PRESENT,
+       CAP_OPENNI_IMAGE_GENERATOR_OUTPUT_MODE     = CAP_OPENNI_IMAGE_GENERATOR + CAP_PROP_OPENNI_OUTPUT_MODE,
+       CAP_OPENNI_DEPTH_GENERATOR_PRESENT         = CAP_OPENNI_DEPTH_GENERATOR + CAP_PROP_OPENNI_GENERATOR_PRESENT,
+       CAP_OPENNI_DEPTH_GENERATOR_BASELINE        = CAP_OPENNI_DEPTH_GENERATOR + CAP_PROP_OPENNI_BASELINE,
+       CAP_OPENNI_DEPTH_GENERATOR_FOCAL_LENGTH    = CAP_OPENNI_DEPTH_GENERATOR + CAP_PROP_OPENNI_FOCAL_LENGTH,
+       CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION    = CAP_OPENNI_DEPTH_GENERATOR + CAP_PROP_OPENNI_REGISTRATION,
+       CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION_ON = CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION, //!< Alias of CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION (same value).
+       CAP_OPENNI_IR_GENERATOR_PRESENT            = CAP_OPENNI_IR_GENERATOR + CAP_PROP_OPENNI_GENERATOR_PRESENT,
+     };
+#ifdef _MSC_VER
+#pragma warning( pop )
+#endif
+
+//! OpenNI data given from the depth generator (0..4), the RGB image generator (5..6) and the IR image generator (7)
+enum { CAP_OPENNI_DEPTH_MAP         = 0, //!< Depth values in mm (CV_16UC1)
+       CAP_OPENNI_POINT_CLOUD_MAP   = 1, //!< XYZ in meters (CV_32FC3)
+       CAP_OPENNI_DISPARITY_MAP     = 2, //!< Disparity in pixels (CV_8UC1)
+       CAP_OPENNI_DISPARITY_MAP_32F = 3, //!< Disparity in pixels (CV_32FC1)
+       CAP_OPENNI_VALID_DEPTH_MASK  = 4, //!< CV_8UC1
+
+       CAP_OPENNI_BGR_IMAGE         = 5, //!< Data given from RGB image generator
+       CAP_OPENNI_GRAY_IMAGE        = 6, //!< Data given from RGB image generator
+
+       CAP_OPENNI_IR_IMAGE          = 7  //!< Data given from IR image generator
+     };
+
+//! Supported output modes of the OpenNI image generator (NOTE(review): presumably the values for CAP_PROP_OPENNI_OUTPUT_MODE - confirm)
+enum { CAP_OPENNI_VGA_30HZ  = 0,
+       CAP_OPENNI_SXGA_15HZ = 1,
+       CAP_OPENNI_SXGA_30HZ = 2,
+       CAP_OPENNI_QVGA_30HZ = 3,
+       CAP_OPENNI_QVGA_60HZ = 4
+     };
+
+//! @} OpenNI
+
+/** @name GStreamer
+    @{
+*/
+
+enum { CAP_PROP_GSTREAMER_QUEUE_LENGTH = 200 //!< GStreamer queue length property. Default is 1
+     };
+
+//! @} GStreamer
+
+/** @name PvAPI, Prosilica GigE SDK
+    @{
+*/
+
+//! PVAPI camera properties
+enum { CAP_PROP_PVAPI_MULTICASTIP           = 300, //!< IP address used to enable multicast master mode. 0 disables multicast.
+       CAP_PROP_PVAPI_FRAMESTARTTRIGGERMODE = 301, //!< FrameStartTriggerMode: Determines how a frame is initiated.
+       CAP_PROP_PVAPI_DECIMATIONHORIZONTAL  = 302, //!< Horizontal sub-sampling of the image.
+       CAP_PROP_PVAPI_DECIMATIONVERTICAL    = 303, //!< Vertical sub-sampling of the image.
+       CAP_PROP_PVAPI_BINNINGX              = 304, //!< Horizontal binning factor.
+       CAP_PROP_PVAPI_BINNINGY              = 305, //!< Vertical binning factor.
+       CAP_PROP_PVAPI_PIXELFORMAT           = 306  //!< Pixel format.
+     };
+
+//! PVAPI: FrameStartTriggerMode values (NOTE(review): presumably used with CAP_PROP_PVAPI_FRAMESTARTTRIGGERMODE - confirm)
+enum { CAP_PVAPI_FSTRIGMODE_FREERUN     = 0,    //!< Freerun
+       CAP_PVAPI_FSTRIGMODE_SYNCIN1     = 1,    //!< SyncIn1
+       CAP_PVAPI_FSTRIGMODE_SYNCIN2     = 2,    //!< SyncIn2
+       CAP_PVAPI_FSTRIGMODE_FIXEDRATE   = 3,    //!< FixedRate
+       CAP_PVAPI_FSTRIGMODE_SOFTWARE    = 4     //!< Software
+     };
+
+//! PVAPI: DecimationHorizontal, DecimationVertical values (NOTE(review): presumably used with CAP_PROP_PVAPI_DECIMATIONHORIZONTAL / CAP_PROP_PVAPI_DECIMATIONVERTICAL - confirm)
+enum { CAP_PVAPI_DECIMATION_OFF       = 1,    //!< Off
+       CAP_PVAPI_DECIMATION_2OUTOF4   = 2,    //!< 2 out of 4 decimation
+       CAP_PVAPI_DECIMATION_2OUTOF8   = 4,    //!< 2 out of 8 decimation
+       CAP_PVAPI_DECIMATION_2OUTOF16  = 8     //!< 2 out of 16 decimation
+     };
+
+//! PVAPI: PixelFormat values (NOTE(review): presumably used with CAP_PROP_PVAPI_PIXELFORMAT - confirm)
+enum { CAP_PVAPI_PIXELFORMAT_MONO8    = 1,    //!< Mono8
+       CAP_PVAPI_PIXELFORMAT_MONO16   = 2,    //!< Mono16
+       CAP_PVAPI_PIXELFORMAT_BAYER8   = 3,    //!< Bayer8
+       CAP_PVAPI_PIXELFORMAT_BAYER16  = 4,    //!< Bayer16
+       CAP_PVAPI_PIXELFORMAT_RGB24    = 5,    //!< Rgb24
+       CAP_PVAPI_PIXELFORMAT_BGR24    = 6,    //!< Bgr24
+       CAP_PVAPI_PIXELFORMAT_RGBA32   = 7,    //!< Rgba32
+       CAP_PVAPI_PIXELFORMAT_BGRA32   = 8,    //!< Bgra32
+     };
+
+//! @} PvAPI
+
+/** @name XIMEA Camera API
+    @{
+*/
+
+//! Properties of cameras available through XIMEA SDK backend
+enum { CAP_PROP_XI_DOWNSAMPLING                                 = 400, //!< Change image resolution by binning or skipping.
+       CAP_PROP_XI_DATA_FORMAT                                  = 401, //!< Output data format.
+       CAP_PROP_XI_OFFSET_X                                     = 402, //!< Horizontal offset from the origin to the area of interest (in pixels).
+       CAP_PROP_XI_OFFSET_Y                                     = 403, //!< Vertical offset from the origin to the area of interest (in pixels).
+       CAP_PROP_XI_TRG_SOURCE                                   = 404, //!< Defines source of trigger.
+       CAP_PROP_XI_TRG_SOFTWARE                                 = 405, //!< Generates an internal trigger. PRM_TRG_SOURCE must be set to TRG_SOFTWARE.
+       CAP_PROP_XI_GPI_SELECTOR                                 = 406, //!< Selects general purpose input.
+       CAP_PROP_XI_GPI_MODE                                     = 407, //!< Set general purpose input mode.
+       CAP_PROP_XI_GPI_LEVEL                                    = 408, //!< Get general purpose level.
+       CAP_PROP_XI_GPO_SELECTOR                                 = 409, //!< Selects general purpose output.
+       CAP_PROP_XI_GPO_MODE                                     = 410, //!< Set general purpose output mode.
+       CAP_PROP_XI_LED_SELECTOR                                 = 411, //!< Selects camera signalling LED.
+       CAP_PROP_XI_LED_MODE                                     = 412, //!< Define camera signalling LED functionality.
+       CAP_PROP_XI_MANUAL_WB                                    = 413, //!< Calculates White Balance (must be called during acquisition).
+       CAP_PROP_XI_AUTO_WB                                      = 414, //!< Automatic white balance.
+       CAP_PROP_XI_AEAG                                         = 415, //!< Automatic exposure/gain.
+       CAP_PROP_XI_EXP_PRIORITY                                 = 416, //!< Exposure priority (0.5 - exposure 50%, gain 50%).
+       CAP_PROP_XI_AE_MAX_LIMIT                                 = 417, //!< Maximum limit of exposure in AEAG procedure.
+       CAP_PROP_XI_AG_MAX_LIMIT                                 = 418, //!< Maximum limit of gain in AEAG procedure.
+       CAP_PROP_XI_AEAG_LEVEL                                   = 419, //!< Average intensity of output signal AEAG should achieve (in %).
+       CAP_PROP_XI_TIMEOUT                                      = 420, //!< Image capture timeout in milliseconds.
+       CAP_PROP_XI_EXPOSURE                                     = 421, //!< Exposure time in microseconds.
+       CAP_PROP_XI_EXPOSURE_BURST_COUNT                         = 422, //!< Sets the number of times of exposure in one frame.
+       CAP_PROP_XI_GAIN_SELECTOR                                = 423, //!< Gain selector for parameter Gain allows to select different type of gains.
+       CAP_PROP_XI_GAIN                                         = 424, //!< Gain in dB.
+       CAP_PROP_XI_DOWNSAMPLING_TYPE                            = 426, //!< Change image downsampling type.
+       CAP_PROP_XI_BINNING_SELECTOR                             = 427, //!< Binning engine selector.
+       CAP_PROP_XI_BINNING_VERTICAL                             = 428, //!< Vertical Binning - number of vertical photo-sensitive cells to combine together.
+       CAP_PROP_XI_BINNING_HORIZONTAL                           = 429, //!< Horizontal Binning - number of horizontal photo-sensitive cells to combine together.
+       CAP_PROP_XI_BINNING_PATTERN                              = 430, //!< Binning pattern type.
+       CAP_PROP_XI_DECIMATION_SELECTOR                          = 431, //!< Decimation engine selector.
+       CAP_PROP_XI_DECIMATION_VERTICAL                          = 432, //!< Vertical Decimation - vertical sub-sampling of the image - reduces the vertical resolution of the image by the specified vertical decimation factor.
+       CAP_PROP_XI_DECIMATION_HORIZONTAL                        = 433, //!< Horizontal Decimation - horizontal sub-sampling of the image - reduces the horizontal resolution of the image by the specified horizontal decimation factor.
+       CAP_PROP_XI_DECIMATION_PATTERN                           = 434, //!< Decimation pattern type.
+       CAP_PROP_XI_TEST_PATTERN_GENERATOR_SELECTOR              = 587, //!< Selects which test pattern generator is controlled by the TestPattern feature.
+       CAP_PROP_XI_TEST_PATTERN                                 = 588, //!< Selects which test pattern type is generated by the selected generator.
+       CAP_PROP_XI_IMAGE_DATA_FORMAT                            = 435, //!< Output data format.
+       CAP_PROP_XI_SHUTTER_TYPE                                 = 436, //!< Change sensor shutter type (CMOS sensor).
+       CAP_PROP_XI_SENSOR_TAPS                                  = 437, //!< Number of taps.
+       CAP_PROP_XI_AEAG_ROI_OFFSET_X                            = 439, //!< Automatic exposure/gain ROI offset X.
+       CAP_PROP_XI_AEAG_ROI_OFFSET_Y                            = 440, //!< Automatic exposure/gain ROI offset Y.
+       CAP_PROP_XI_AEAG_ROI_WIDTH                               = 441, //!< Automatic exposure/gain ROI Width.
+       CAP_PROP_XI_AEAG_ROI_HEIGHT                              = 442, //!< Automatic exposure/gain ROI Height.
+       CAP_PROP_XI_BPC                                          = 445, //!< Correction of bad pixels.
+       CAP_PROP_XI_WB_KR                                        = 448, //!< White balance red coefficient.
+       CAP_PROP_XI_WB_KG                                        = 449, //!< White balance green coefficient.
+       CAP_PROP_XI_WB_KB                                        = 450, //!< White balance blue coefficient.
+       CAP_PROP_XI_WIDTH                                        = 451, //!< Width of the Image provided by the device (in pixels).
+       CAP_PROP_XI_HEIGHT                                       = 452, //!< Height of the Image provided by the device (in pixels).
+       CAP_PROP_XI_REGION_SELECTOR                              = 589, //!< Selects Region in Multiple ROI which parameters are set by width, height, ... ,region mode.
+       CAP_PROP_XI_REGION_MODE                                  = 595, //!< Activates/deactivates Region selected by Region Selector.
+       CAP_PROP_XI_LIMIT_BANDWIDTH                              = 459, //!< Set/get bandwidth (datarate) (in Megabits).
+       CAP_PROP_XI_SENSOR_DATA_BIT_DEPTH                        = 460, //!< Sensor output data bit depth.
+       CAP_PROP_XI_OUTPUT_DATA_BIT_DEPTH                        = 461, //!< Device output data bit depth.
+       CAP_PROP_XI_IMAGE_DATA_BIT_DEPTH                         = 462, //!< Bit depth of data returned by function xiGetImage.
+       CAP_PROP_XI_OUTPUT_DATA_PACKING                          = 463, //!< Device output data packing (or grouping) enabled. Packing could be enabled if output_data_bit_depth > 8 and packing capability is available.
+       CAP_PROP_XI_OUTPUT_DATA_PACKING_TYPE                     = 464, //!< Data packing type. Some cameras support only a specific packing type.
+       CAP_PROP_XI_IS_COOLED                                    = 465, //!< Returns 1 for cameras that support cooling.
+       CAP_PROP_XI_COOLING                                      = 466, //!< Start camera cooling.
+       CAP_PROP_XI_TARGET_TEMP                                  = 467, //!< Set sensor target temperature for cooling.
+       CAP_PROP_XI_CHIP_TEMP                                    = 468, //!< Camera sensor temperature.
+       CAP_PROP_XI_HOUS_TEMP                                    = 469, //!< Camera housing temperature.
+       CAP_PROP_XI_HOUS_BACK_SIDE_TEMP                          = 590, //!< Camera housing back side temperature.
+       CAP_PROP_XI_SENSOR_BOARD_TEMP                            = 596, //!< Camera sensor board temperature.
+       CAP_PROP_XI_CMS                                          = 470, //!< Mode of color management system.
+       CAP_PROP_XI_APPLY_CMS                                    = 471, //!< Enable applying of CMS profiles to xiGetImage (see XI_PRM_INPUT_CMS_PROFILE, XI_PRM_OUTPUT_CMS_PROFILE).
+       CAP_PROP_XI_IMAGE_IS_COLOR                               = 474, //!< Returns 1 for color cameras.
+       CAP_PROP_XI_COLOR_FILTER_ARRAY                           = 475, //!< Returns color filter array type of RAW data.
+       CAP_PROP_XI_GAMMAY                                       = 476, //!< Luminosity gamma.
+       CAP_PROP_XI_GAMMAC                                       = 477, //!< Chromaticity gamma.
+       CAP_PROP_XI_SHARPNESS                                    = 478, //!< Sharpness Strength.
+       CAP_PROP_XI_CC_MATRIX_00                                 = 479, //!< Color Correction Matrix element [0][0].
+       CAP_PROP_XI_CC_MATRIX_01                                 = 480, //!< Color Correction Matrix element [0][1].
+       CAP_PROP_XI_CC_MATRIX_02                                 = 481, //!< Color Correction Matrix element [0][2].
+       CAP_PROP_XI_CC_MATRIX_03                                 = 482, //!< Color Correction Matrix element [0][3].
+       CAP_PROP_XI_CC_MATRIX_10                                 = 483, //!< Color Correction Matrix element [1][0].
+       CAP_PROP_XI_CC_MATRIX_11                                 = 484, //!< Color Correction Matrix element [1][1].
+       CAP_PROP_XI_CC_MATRIX_12                                 = 485, //!< Color Correction Matrix element [1][2].
+       CAP_PROP_XI_CC_MATRIX_13                                 = 486, //!< Color Correction Matrix element [1][3].
+       CAP_PROP_XI_CC_MATRIX_20                                 = 487, //!< Color Correction Matrix element [2][0].
+       CAP_PROP_XI_CC_MATRIX_21                                 = 488, //!< Color Correction Matrix element [2][1].
+       CAP_PROP_XI_CC_MATRIX_22                                 = 489, //!< Color Correction Matrix element [2][2].
+       CAP_PROP_XI_CC_MATRIX_23                                 = 490, //!< Color Correction Matrix element [2][3].
+       CAP_PROP_XI_CC_MATRIX_30                                 = 491, //!< Color Correction Matrix element [3][0].
+       CAP_PROP_XI_CC_MATRIX_31                                 = 492, //!< Color Correction Matrix element [3][1].
+       CAP_PROP_XI_CC_MATRIX_32                                 = 493, //!< Color Correction Matrix element [3][2].
+       CAP_PROP_XI_CC_MATRIX_33                                 = 494, //!< Color Correction Matrix element [3][3].
+       CAP_PROP_XI_DEFAULT_CC_MATRIX                            = 495, //!< Set default Color Correction Matrix.
+       CAP_PROP_XI_TRG_SELECTOR                                 = 498, //!< Selects the type of trigger.
+       CAP_PROP_XI_ACQ_FRAME_BURST_COUNT                        = 499, //!< Sets number of frames acquired by burst. This burst is used only if trigger is set to FrameBurstStart.
+       CAP_PROP_XI_DEBOUNCE_EN                                  = 507, //!< Enable/Disable debounce to selected GPI.
+       CAP_PROP_XI_DEBOUNCE_T0                                  = 508, //!< Debounce time (x * 10us).
+       CAP_PROP_XI_DEBOUNCE_T1                                  = 509, //!< Debounce time (x * 10us).
+       CAP_PROP_XI_DEBOUNCE_POL                                 = 510, //!< Debounce polarity (pol = 1 t0 - falling edge, t1 - rising edge).
+       CAP_PROP_XI_LENS_MODE                                    = 511, //!< Status of lens control interface. This shall be set to XI_ON before any Lens operations.
+       CAP_PROP_XI_LENS_APERTURE_VALUE                          = 512, //!< Current lens aperture value in stops. Examples: 2.8, 4, 5.6, 8, 11.
+       CAP_PROP_XI_LENS_FOCUS_MOVEMENT_VALUE                    = 513, //!< Lens current focus movement value to be used by XI_PRM_LENS_FOCUS_MOVE in motor steps.
+       CAP_PROP_XI_LENS_FOCUS_MOVE                              = 514, //!< Moves lens focus motor by steps set in XI_PRM_LENS_FOCUS_MOVEMENT_VALUE.
+       CAP_PROP_XI_LENS_FOCUS_DISTANCE                          = 515, //!< Lens focus distance in cm.
+       CAP_PROP_XI_LENS_FOCAL_LENGTH                            = 516, //!< Lens focal distance in mm.
+       CAP_PROP_XI_LENS_FEATURE_SELECTOR                        = 517, //!< Selects the current feature which is accessible by XI_PRM_LENS_FEATURE.
+       CAP_PROP_XI_LENS_FEATURE                                 = 518, //!< Allows access to lens feature value currently selected by XI_PRM_LENS_FEATURE_SELECTOR.
+       CAP_PROP_XI_DEVICE_MODEL_ID                              = 521, //!< Returns device model id.
+       CAP_PROP_XI_DEVICE_SN                                    = 522, //!< Returns device serial number.
+       CAP_PROP_XI_IMAGE_DATA_FORMAT_RGB32_ALPHA                = 529, //!< The alpha channel of RGB32 output image format.
+       CAP_PROP_XI_IMAGE_PAYLOAD_SIZE                           = 530, //!< Buffer size in bytes sufficient for output image returned by xiGetImage.
+       CAP_PROP_XI_TRANSPORT_PIXEL_FORMAT                       = 531, //!< Current format of pixels on transport layer.
+       CAP_PROP_XI_SENSOR_CLOCK_FREQ_HZ                         = 532, //!< Sensor clock frequency in Hz.
+       CAP_PROP_XI_SENSOR_CLOCK_FREQ_INDEX                      = 533, //!< Sensor clock frequency index. Sensors with selected frequencies can have their frequency set only by this index.
+       CAP_PROP_XI_SENSOR_OUTPUT_CHANNEL_COUNT                  = 534, //!< Number of output channels from sensor used for data transfer.
+       CAP_PROP_XI_FRAMERATE                                    = 535, //!< Define framerate in Hz.
+       CAP_PROP_XI_COUNTER_SELECTOR                             = 536, //!< Select counter.
+       CAP_PROP_XI_COUNTER_VALUE                                = 537, //!< Counter status.
+       CAP_PROP_XI_ACQ_TIMING_MODE                              = 538, //!< Type of sensor frames timing.
+       CAP_PROP_XI_AVAILABLE_BANDWIDTH                          = 539, //!< Calculates and returns available interface bandwidth (in Megabits).
+       CAP_PROP_XI_BUFFER_POLICY                                = 540, //!< Data move policy.
+       CAP_PROP_XI_LUT_EN                                       = 541, //!< Activates LUT.
+       CAP_PROP_XI_LUT_INDEX                                    = 542, //!< Control the index (offset) of the coefficient to access in the LUT.
+       CAP_PROP_XI_LUT_VALUE                                    = 543, //!< Value at entry LUTIndex of the LUT.
+       CAP_PROP_XI_TRG_DELAY                                    = 544, //!< Specifies the delay in microseconds (us) to apply after the trigger reception before activating it.
+       CAP_PROP_XI_TS_RST_MODE                                  = 545, //!< Defines how time stamp reset engine will be armed.
+       CAP_PROP_XI_TS_RST_SOURCE                                = 546, //!< Defines which source will be used for timestamp reset. Writing this parameter will trigger settings of engine (arming).
+       CAP_PROP_XI_IS_DEVICE_EXIST                              = 547, //!< Returns 1 if camera connected and works properly.
+       CAP_PROP_XI_ACQ_BUFFER_SIZE                              = 548, //!< Acquisition buffer size in buffer_size_unit. Default bytes.
+       CAP_PROP_XI_ACQ_BUFFER_SIZE_UNIT                         = 549, //!< Acquisition buffer size unit in bytes. Default 1. E.g. Value 1024 means that buffer_size is in KiBytes.
+       CAP_PROP_XI_ACQ_TRANSPORT_BUFFER_SIZE                    = 550, //!< Acquisition transport buffer size in bytes.
+       CAP_PROP_XI_BUFFERS_QUEUE_SIZE                           = 551, //!< Queue of field/frame buffers.
+       CAP_PROP_XI_ACQ_TRANSPORT_BUFFER_COMMIT                  = 552, //!< Number of buffers to commit to low level.
+       CAP_PROP_XI_RECENT_FRAME                                 = 553, //!< GetImage returns most recent frame.
+       CAP_PROP_XI_DEVICE_RESET                                 = 554, //!< Resets the camera to default state.
+       CAP_PROP_XI_COLUMN_FPN_CORRECTION                        = 555, //!< Correction of column FPN.
+       CAP_PROP_XI_ROW_FPN_CORRECTION                           = 591, //!< Correction of row FPN.
+       CAP_PROP_XI_SENSOR_MODE                                  = 558, //!< Current sensor mode. Allows to select sensor mode by one integer. Setting of this parameter affects: image dimensions and downsampling.
+       CAP_PROP_XI_HDR                                          = 559, //!< Enable High Dynamic Range feature.
+       CAP_PROP_XI_HDR_KNEEPOINT_COUNT                          = 560, //!< The number of kneepoints in the PWLR.
+       CAP_PROP_XI_HDR_T1                                       = 561, //!< Position of first kneepoint (in % of XI_PRM_EXPOSURE).
+       CAP_PROP_XI_HDR_T2                                       = 562, //!< Position of second kneepoint (in % of XI_PRM_EXPOSURE).
+       CAP_PROP_XI_KNEEPOINT1                                   = 563, //!< Value of first kneepoint (% of sensor saturation).
+       CAP_PROP_XI_KNEEPOINT2                                   = 564, //!< Value of second kneepoint (% of sensor saturation).
+       CAP_PROP_XI_IMAGE_BLACK_LEVEL                            = 565, //!< Last image black level counts. Can be used for Offline processing to recall it.
+       CAP_PROP_XI_HW_REVISION                                  = 571, //!< Returns hardware revision number.
+       CAP_PROP_XI_DEBUG_LEVEL                                  = 572, //!< Set debug level.
+       CAP_PROP_XI_AUTO_BANDWIDTH_CALCULATION                   = 573, //!< Automatic bandwidth calculation.
+       CAP_PROP_XI_FFS_FILE_ID                                  = 594, //!< File number.
+       CAP_PROP_XI_FFS_FILE_SIZE                                = 580, //!< Size of file.
+       CAP_PROP_XI_FREE_FFS_SIZE                                = 581, //!< Size of free camera FFS.
+       CAP_PROP_XI_USED_FFS_SIZE                                = 582, //!< Size of used camera FFS.
+       CAP_PROP_XI_FFS_ACCESS_KEY                               = 583, //!< Setting of key enables file operations on some cameras.
+       CAP_PROP_XI_SENSOR_FEATURE_SELECTOR                      = 585, //!< Selects the current feature which is accessible by XI_PRM_SENSOR_FEATURE_VALUE.
+       CAP_PROP_XI_SENSOR_FEATURE_VALUE                         = 586, //!< Allows access to sensor feature value currently selected by XI_PRM_SENSOR_FEATURE_SELECTOR.
+     };
+
+//! @} XIMEA
+
+
+/** @name ARAVIS Camera API
+    @{
+*/
+
+//! Properties of cameras available through the ARAVIS backend
+enum { CAP_PROP_ARAVIS_AUTOTRIGGER                              = 600 //!< Automatically trigger frame capture if the camera is configured with software trigger.
+};
+
+//! @} ARAVIS
+
+/** @name AVFoundation framework for iOS
+    @{
+*/
+
+//! Properties of cameras available through the AVFOUNDATION backend (per the enumerator names: device focus, exposure, flash, white balance and torch)
+enum { CAP_PROP_IOS_DEVICE_FOCUS        = 9001,
+       CAP_PROP_IOS_DEVICE_EXPOSURE     = 9002,
+       CAP_PROP_IOS_DEVICE_FLASH        = 9003,
+       CAP_PROP_IOS_DEVICE_WHITEBALANCE = 9004,
+       CAP_PROP_IOS_DEVICE_TORCH        = 9005
+     };
+
+//! @} AVFoundation framework for iOS
+
+
+/** @name Smartek Giganetix GigEVisionSDK
+    @{
+*/
+
+//! Properties of cameras available through the Smartek Giganetix Ethernet Vision backend
+/* --- Vladimir Litvinenko (litvinenko.vladimir@gmail.com) --- */
+enum { CAP_PROP_GIGA_FRAME_OFFSET_X   = 10001,
+       CAP_PROP_GIGA_FRAME_OFFSET_Y   = 10002,
+       CAP_PROP_GIGA_FRAME_WIDTH_MAX  = 10003,
+       CAP_PROP_GIGA_FRAME_HEIGH_MAX  = 10004, // "HEIGH" (sic): the spelling is part of the public identifier; do not rename.
+       CAP_PROP_GIGA_FRAME_SENS_WIDTH = 10005,
+       CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006  // "HEIGH" (sic): the spelling is part of the public identifier; do not rename.
+     };
+
+//! @} Smartek
+
+/** @name Intel Perceptual Computing SDK
+    @{
+*/
+enum { CAP_PROP_INTELPERC_PROFILE_COUNT               = 11001,
+       CAP_PROP_INTELPERC_PROFILE_IDX                 = 11002,
+       CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE  = 11003,
+       CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE      = 11004,
+       CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD  = 11005,
+       CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ     = 11006,
+       CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT     = 11007
+     }; // Intel Perceptual Computing SDK capture property ids (11001..11007); individual meanings are not documented here.
+
+//! Intel Perceptual Streams (one distinct bit flag per generator)
+enum { CAP_INTELPERC_DEPTH_GENERATOR = 1 << 29,
+       CAP_INTELPERC_IMAGE_GENERATOR = 1 << 28,
+       CAP_INTELPERC_IR_GENERATOR    = 1 << 27,
+       CAP_INTELPERC_GENERATORS_MASK = CAP_INTELPERC_DEPTH_GENERATOR + CAP_INTELPERC_IMAGE_GENERATOR + CAP_INTELPERC_IR_GENERATOR //!< Sum of the three generator flags above.
+     };
+
+enum { CAP_INTELPERC_DEPTH_MAP              = 0, //!< Each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth.
+       CAP_INTELPERC_UVDEPTH_MAP            = 1, //!< Each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates.
+       CAP_INTELPERC_IR_MAP                 = 2, //!< Each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam.
+       CAP_INTELPERC_IMAGE                  = 3
+     }; // Intel Perceptual data identifiers (0..3); CAP_INTELPERC_IMAGE is not documented here.
+
+//! @} Intel Perceptual
+
+/** @name gPhoto2 connection
+    @{
+*/
+
+/** @brief gPhoto2 properties
+
+If `propertyId` is less than 0, the operation works on the widget whose camera setting ID is the __additive inverse__ of `propertyId`.
+Get IDs by using CAP_PROP_GPHOTO2_WIDGET_ENUMERATE.
+@see CvCaptureCAM_GPHOTO2 for more info
+*/
+enum { CAP_PROP_GPHOTO2_PREVIEW           = 17001, //!< Capture only preview from liveview mode.
+       CAP_PROP_GPHOTO2_WIDGET_ENUMERATE  = 17002, //!< Read-only, returns (const char *).
+       CAP_PROP_GPHOTO2_RELOAD_CONFIG     = 17003, //!< Trigger, only by set. Reload camera settings.
+       CAP_PROP_GPHOTO2_RELOAD_ON_CHANGE  = 17004, //!< Reload all settings on set.
+       CAP_PROP_GPHOTO2_COLLECT_MSGS      = 17005, //!< Collect messages with details.
+       CAP_PROP_GPHOTO2_FLUSH_MSGS        = 17006, //!< Read-only, returns (const char *).
+       CAP_PROP_SPEED                     = 17007, //!< Exposure speed. Can be read-only, depends on camera program.
+       CAP_PROP_APERTURE                  = 17008, //!< Aperture. Can be read-only, depends on camera program.
+       CAP_PROP_EXPOSUREPROGRAM           = 17009, //!< Camera exposure program.
+       CAP_PROP_VIEWFINDER                = 17010  //!< Enter liveview mode.
+     };
+
+//! @} gPhoto2
+
+
+/** @name Images backend
+    @{
+*/
+
+/** @brief Images backend properties
+
+*/
+enum { CAP_PROP_IMAGES_BASE = 18000,
+       CAP_PROP_IMAGES_LAST = 19000 // exclusive upper bound: CAP_PROP_IMAGES_LAST itself is not part of the range
+     };
+
+//! @} Images
+
+/** @name OBSENSOR (for Orbbec 3D-Sensor device/module )
+    @{
+*/
+//! OBSENSOR data given from image generator
+enum VideoCaptureOBSensorDataType{
+    CAP_OBSENSOR_DEPTH_MAP = 0, //!< Depth values in mm (CV_16UC1)
+    CAP_OBSENSOR_BGR_IMAGE = 1, //!< Data given from BGR stream generator
+    CAP_OBSENSOR_IR_IMAGE = 2   //!< Data given from IR stream generator (CV_16UC1)
+};
+
+//! OBSENSOR stream generators: one single-bit flag per generator, plus a mask covering all of them.
+enum VideoCaptureOBSensorGenerators{
+    CAP_OBSENSOR_DEPTH_GENERATOR = 1 << 29, //!< Depth generator flag (bit 29).
+    CAP_OBSENSOR_IMAGE_GENERATOR = 1 << 28, //!< Image generator flag (bit 28).
+    CAP_OBSENSOR_IR_GENERATOR    = 1 << 27, //!< IR generator flag (bit 27).
+    CAP_OBSENSOR_GENERATORS_MASK = CAP_OBSENSOR_DEPTH_GENERATOR + CAP_OBSENSOR_IMAGE_GENERATOR + CAP_OBSENSOR_IR_GENERATOR //!< All three flags combined; the bits are distinct, so the sum equals their bitwise OR.
+};
+
+//! OBSENSOR properties
+enum VideoCaptureOBSensorProperties{
+    // INTRINSIC - NOTE(review): presumably the camera intrinsics (focal lengths FX/FY, principal point CX/CY); confirm against the backend.
+    CAP_PROP_OBSENSOR_INTRINSIC_FX=26001,
+    CAP_PROP_OBSENSOR_INTRINSIC_FY=26002,
+    CAP_PROP_OBSENSOR_INTRINSIC_CX=26003,
+    CAP_PROP_OBSENSOR_INTRINSIC_CY=26004,
+};
+
+//! @} OBSENSOR
+
+//! @} videoio_flags_others
+
+
+class IVideoCapture;
+//! @cond IGNORED
+namespace internal { class VideoCapturePrivateAccessor; }
+//! @endcond IGNORED
+
+/** @brief Class for video capturing from video files, image sequences or cameras.
+
+The class provides C++ API for capturing video from cameras or for reading video files and image sequences.
+
+Here is how the class can be used:
+@include samples/cpp/videocapture_basic.cpp
+
+@note In @ref videoio_c "C API" the black-box structure `CvCapture` is used instead of %VideoCapture.
+@note
+-   (C++) A basic sample on using the %VideoCapture interface can be found at
+    `OPENCV_SOURCE_CODE/samples/cpp/videocapture_starter.cpp`
+-   (Python) A basic sample on using the %VideoCapture interface can be found at
+    `OPENCV_SOURCE_CODE/samples/python/video.py`
+-   (Python) A multi threaded video processing sample can be found at
+    `OPENCV_SOURCE_CODE/samples/python/video_threaded.py`
+-   (Python) %VideoCapture sample showcasing some features of the Video4Linux2 backend
+    `OPENCV_SOURCE_CODE/samples/python/video_v4l2.py`
+ */
+class CV_EXPORTS_W VideoCapture
+{
+public:
+    /** @brief Default constructor
+    @note In @ref videoio_c "C API", when you finished working with video, release CvCapture structure with
+    cvReleaseCapture(), or use Ptr\<CvCapture\> that calls cvReleaseCapture() automatically in the
+    destructor.
+     */
+    CV_WRAP VideoCapture();
+
+    /** @overload
+    @brief  Opens a video file or a capturing device or an IP video stream for video capturing with API Preference
+
+    @param filename it can be:
+    - name of video file (eg. `video.avi`)
+    - or image sequence (eg. `img_%02d.jpg`, which will read samples like `img_00.jpg, img_01.jpg, img_02.jpg, ...`)
+    - or URL of video stream (eg. `protocol://host:port/script_name?script_params|auth`)
+    - or GStreamer pipeline string in gst-launch tool format in case if GStreamer is used as backend
+      Note that each video stream or IP camera feed has its own URL scheme. Please refer to the
+      documentation of source stream to know the right URL.
+    @param apiPreference preferred Capture API backends to use. Can be used to enforce a specific reader
+    implementation if multiple are available: e.g. cv::CAP_FFMPEG or cv::CAP_IMAGES or cv::CAP_DSHOW.
+
+    @sa cv::VideoCaptureAPIs
+    */
+    CV_WRAP explicit VideoCapture(const String& filename, int apiPreference = CAP_ANY);
+
+    /** @overload
+    @brief Opens a video file or a capturing device or an IP video stream for video capturing with API Preference and parameters
+
+    The `params` parameter allows to specify extra parameters encoded as pairs `(paramId_1, paramValue_1, paramId_2, paramValue_2, ...)`.
+    See cv::VideoCaptureProperties
+    */
+    CV_WRAP explicit VideoCapture(const String& filename, int apiPreference, const std::vector<int>& params);
+
+    /** @overload
+    @brief  Opens a camera for video capturing
+
+    @param index id of the video capturing device to open. To open default camera using default backend just pass 0.
+    (for backward compatibility, passing camera_id + domain_offset (CAP_*) is valid when apiPreference is CAP_ANY)
+    @param apiPreference preferred Capture API backends to use. Can be used to enforce a specific reader
+    implementation if multiple are available: e.g. cv::CAP_DSHOW or cv::CAP_MSMF or cv::CAP_V4L.
+
+    @sa cv::VideoCaptureAPIs
+    */
+    CV_WRAP explicit VideoCapture(int index, int apiPreference = CAP_ANY);
+
+    /** @overload
+    @brief Opens a camera for video capturing with API Preference and parameters
+
+    The `params` parameter allows to specify extra parameters encoded as pairs `(paramId_1, paramValue_1, paramId_2, paramValue_2, ...)`.
+    See cv::VideoCaptureProperties
+    */
+    CV_WRAP explicit VideoCapture(int index, int apiPreference, const std::vector<int>& params);
+
+    /** @brief Default destructor
+
+    The method first calls VideoCapture::release to close the already opened file or camera.
+    */
+    virtual ~VideoCapture();
+
+    /** @brief  Opens a video file or a capturing device or an IP video stream for video capturing.
+
+    @overload
+
+    Parameters are same as the constructor VideoCapture(const String& filename, int apiPreference = CAP_ANY)
+    @return `true` if the file has been successfully opened
+
+    The method first calls VideoCapture::release to close the already opened file or camera.
+     */
+    CV_WRAP virtual bool open(const String& filename, int apiPreference = CAP_ANY);
+
+    /** @brief  Opens a video file or a capturing device or an IP video stream for video capturing with API Preference and parameters
+
+    @overload
+
+    The `params` parameter allows to specify extra parameters encoded as pairs `(paramId_1, paramValue_1, paramId_2, paramValue_2, ...)`.
+    See cv::VideoCaptureProperties
+
+    @return `true` if the file has been successfully opened
+
+    The method first calls VideoCapture::release to close the already opened file or camera.
+     */
+    CV_WRAP virtual bool open(const String& filename, int apiPreference, const std::vector<int>& params);
+
+    /** @brief  Opens a camera for video capturing
+
+    @overload
+
+    Parameters are same as the constructor VideoCapture(int index, int apiPreference = CAP_ANY)
+    @return `true` if the camera has been successfully opened.
+
+    The method first calls VideoCapture::release to close the already opened file or camera.
+    */
+    CV_WRAP virtual bool open(int index, int apiPreference = CAP_ANY);
+
+    /** @brief  Opens a camera for video capturing with API Preference and parameters
+
+    @overload
+
+    The `params` parameter allows to specify extra parameters encoded as pairs `(paramId_1, paramValue_1, paramId_2, paramValue_2, ...)`.
+    See cv::VideoCaptureProperties
+
+    @return `true` if the camera has been successfully opened.
+
+    The method first calls VideoCapture::release to close the already opened file or camera.
+    */
+    CV_WRAP virtual bool open(int index, int apiPreference, const std::vector<int>& params);
+
+    /** @brief Returns true if video capturing has been initialized already.
+
+    If the previous call to VideoCapture constructor or VideoCapture::open() succeeded, the method returns
+    true.
+     */
+    CV_WRAP virtual bool isOpened() const;
+
+    /** @brief Closes video file or capturing device.
+
+    The method is automatically called by subsequent VideoCapture::open and by VideoCapture
+    destructor.
+
+    The C function also deallocates memory and clears \*capture pointer.
+     */
+    CV_WRAP virtual void release();
+
+    /** @brief Grabs the next frame from video file or capturing device.
+
+    @return `true` (non-zero) in the case of success.
+
+    The method/function grabs the next frame from video file or camera and returns true (non-zero) in
+    the case of success.
+
+    The primary use of the function is in multi-camera environments, especially when the cameras do not
+    have hardware synchronization. That is, you call VideoCapture::grab() for each camera and after that
+    call the slower method VideoCapture::retrieve() to decode and get frame from each camera. This way
+    the overhead on demosaicing or motion jpeg decompression etc. is eliminated and the retrieved frames
+    from different cameras will be closer in time.
+
+    Also, when a connected camera is multi-head (for example, a stereo camera or a Kinect device), the
+    correct way of retrieving data from it is to call VideoCapture::grab() first and then call
+    VideoCapture::retrieve() one or more times with different values of the channel parameter.
+
+    @ref tutorial_kinect_openni
+     */
+    CV_WRAP virtual bool grab();
+
+    /** @brief Decodes and returns the grabbed video frame.
+
+    @param [out] image the video frame is returned here. If no frame has been grabbed the image will be empty.
+    @param flag it could be a frame index or a driver specific flag
+    @return `false` if no frame has been grabbed
+
+    The method decodes and returns the just grabbed frame. If no frame has been grabbed
+    (camera has been disconnected, or there are no more frames in video file), the method returns false
+    and the function returns an empty image (with %cv::Mat, test it with Mat::empty()).
+
+    @sa read()
+
+    @note In @ref videoio_c "C API", functions cvRetrieveFrame() and cv.RetrieveFrame() return image stored inside the video
+    capturing structure. It is not allowed to modify or release the image! You can copy the frame using
+    cvCloneImage and then do whatever you want with the copy.
+     */
+    CV_WRAP virtual bool retrieve(OutputArray image, int flag = 0);
+
+    /** @brief Stream operator to read the next video frame.
+    @sa read()
+    */
+    virtual VideoCapture& operator >> (CV_OUT Mat& image);
+
+    /** @overload
+    @sa read()
+    */
+    virtual VideoCapture& operator >> (CV_OUT UMat& image);
+
+    /** @brief Grabs, decodes and returns the next video frame.
+
+    @param [out] image the video frame is returned here. If no frame has been grabbed the image will be empty.
+    @return `false` if no frame has been grabbed
+
+    The method/function combines VideoCapture::grab() and VideoCapture::retrieve() in one call. This is the
+    most convenient method for reading video files or capturing data from cameras: it decodes and returns
+    the just grabbed frame. If no frame has been grabbed (camera has been disconnected, or there are no more
+    frames in video file), the method returns false and the function returns an empty image (with %cv::Mat, test it with Mat::empty()).
+
+    @note In @ref videoio_c "C API", functions cvRetrieveFrame() and cv.RetrieveFrame() return image stored inside the video
+    capturing structure. It is not allowed to modify or release the image! You can copy the frame using
+    cvCloneImage and then do whatever you want with the copy.
+     */
+    CV_WRAP virtual bool read(OutputArray image);
+
+    /** @brief Sets a property in the VideoCapture.
+
+    @param propId Property identifier from cv::VideoCaptureProperties (eg. cv::CAP_PROP_POS_MSEC, cv::CAP_PROP_POS_FRAMES, ...)
+    or one from @ref videoio_flags_others
+    @param value Value of the property.
+    @return `true` if the property is supported by backend used by the VideoCapture instance.
+    @note Even if it returns `true` this doesn't ensure that the property
+    value has been accepted by the capture device. See note in VideoCapture::get()
+     */
+    CV_WRAP virtual bool set(int propId, double value);
+
+    /** @brief Returns the specified VideoCapture property
+
+    @param propId Property identifier from cv::VideoCaptureProperties (eg. cv::CAP_PROP_POS_MSEC, cv::CAP_PROP_POS_FRAMES, ...)
+    or one from @ref videoio_flags_others
+    @return Value for the specified property. Value 0 is returned when querying a property that is
+    not supported by the backend used by the VideoCapture instance.
+
+    @note Reading / writing properties involves many layers. Some unexpected results might happen
+    along this chain.
+    @code{.txt}
+    VideoCapture -> API Backend -> Operating System -> Device Driver -> Device Hardware
+    @endcode
+    The returned value might be different from what is really used by the device or it could be encoded
+    using device dependent rules (eg. steps or percentage). Effective behaviour depends on the device
+    driver and API Backend.
+
+    */
+    CV_WRAP virtual double get(int propId) const;
+
+    /** @brief Returns used backend API name
+
+     @note Stream should be opened.
+     */
+    CV_WRAP String getBackendName() const;
+
+    /** @brief Switches exceptions mode
+     *
+     * When enabled, methods raise exceptions if not successful instead of returning an error code.
+     */
+    CV_WRAP void setExceptionMode(bool enable) { throwOnFail = enable; }
+
+    /// query if exception mode is active
+    CV_WRAP bool getExceptionMode() const { return throwOnFail; }
+
+
+    /** @brief Wait for ready frames from VideoCapture.
+
+    @param streams input video streams
+    @param readyIndex stream indexes with grabbed frames (ready to use .retrieve() to fetch actual frame)
+    @param timeoutNs number of nanoseconds (0 - infinite)
+    @return `true` if readyIndex is not empty
+
+    @throws Exception %Exception on stream errors (check .isOpened() to filter out malformed streams) or VideoCapture type is not supported
+
+    The primary use of the function is in multi-camera environments.
+    The method fills the ready state vector, grabs video frame, if camera is ready.
+
+    After this call use VideoCapture::retrieve() to decode and fetch frame data.
+    */
+    CV_WRAP static
+    bool waitAny(
+            const std::vector<VideoCapture>& streams,
+            CV_OUT std::vector<int>& readyIndex,
+            int64 timeoutNs = 0);
+
+protected:
+    Ptr<CvCapture> cap;
+    Ptr<IVideoCapture> icap;
+    bool throwOnFail; // exception-mode flag: written by setExceptionMode(), read by getExceptionMode()
+
+    friend class internal::VideoCapturePrivateAccessor;
+};
+
+class IVideoWriter;
+
+/** @example samples/cpp/tutorial_code/videoio/video-write/video-write.cpp
+Check @ref tutorial_video_write "the corresponding tutorial" for more details
+*/
+
+/** @example samples/cpp/videowriter_basic.cpp
+An example using VideoCapture and VideoWriter class
+*/
+
+/** @brief Video writer class.
+
+The class provides C++ API for writing video files or image sequences.
+*/
+class CV_EXPORTS_W VideoWriter
+{
+public:
+    /** @brief Default constructors
+
+    The constructors/functions initialize video writers.
+    -   On Linux FFMPEG is used to write videos;
+    -   On Windows FFMPEG or MSMF or DSHOW is used;
+    -   On MacOSX AVFoundation is used.
+     */
+    CV_WRAP VideoWriter();
+
+    /** @overload
+    @param filename Name of the output video file.
+    @param fourcc 4-character code of codec used to compress the frames. For example,
+    VideoWriter::fourcc('P','I','M','1') is a MPEG-1 codec, VideoWriter::fourcc('M','J','P','G')
+    is a motion-jpeg codec etc. List of codes can be obtained at
+    [MSDN](https://docs.microsoft.com/en-us/windows/win32/medfound/video-fourccs) page
+    or, for a more complete list, on this [page](https://fourcc.org/codecs.php)
+    of the fourcc site. FFMPEG backend with MP4 container natively uses
+    other values as fourcc code: see [ObjectType](http://mp4ra.org/#/codecs),
+    so you may receive a warning message from OpenCV about fourcc code conversion.
+    @param fps Framerate of the created video stream.
+    @param frameSize Size of the video frames.
+    @param isColor If it is not zero, the encoder will expect and encode color frames, otherwise it
+    will work with grayscale frames.
+
+    @b Tips:
+    - With some backends `fourcc=-1` pops up the codec selection dialog from the system.
+    - To save image sequence use a proper filename (eg. `img_%02d.jpg`) and `fourcc=0`
+      OR `fps=0`. Use uncompressed image format (eg. `img_%02d.BMP`) to save raw frames.
+    - Most codecs are lossy. If you want a lossless video file you need to use a lossless codec
+      (eg. FFMPEG FFV1, Huffman HFYU, Lagarith LAGS, etc...)
+    - If FFMPEG is enabled, using `codec=0; fps=0;` you can create an uncompressed (raw) video file.
+    - If FFMPEG is used, we allow frames of odd width or height, but in this case we truncate
+      the rightmost column/the bottom row. Probably, this should be handled more elegantly,
+      but some internal functions inside FFMPEG swscale require even width/height.
+    */
+    CV_WRAP VideoWriter(const String& filename, int fourcc, double fps,
+                Size frameSize, bool isColor = true);
+
+    /** @overload
+    The `apiPreference` parameter allows to specify API backends to use. Can be used to enforce a specific writer implementation
+    if multiple are available: e.g. cv::CAP_FFMPEG or cv::CAP_GSTREAMER.
+     */
+    CV_WRAP VideoWriter(const String& filename, int apiPreference, int fourcc, double fps,
+                Size frameSize, bool isColor = true);
+
+    /** @overload
+     * The `params` parameter allows to specify extra encoder parameters encoded as pairs (paramId_1, paramValue_1, paramId_2, paramValue_2, ...)
+     * see cv::VideoWriterProperties
+     */
+    CV_WRAP VideoWriter(const String& filename, int fourcc, double fps, const Size& frameSize,
+                        const std::vector<int>& params);
+
+    /** @overload
+     */
+    CV_WRAP VideoWriter(const String& filename, int apiPreference, int fourcc, double fps,
+                        const Size& frameSize, const std::vector<int>& params);
+
+    /** @brief Default destructor
+
+    The method first calls VideoWriter::release to close the already opened file.
+    */
+    virtual ~VideoWriter();
+
+    /** @brief Initializes or reinitializes video writer.
+
+    The method opens video writer. Parameters are the same as in the constructor
+    VideoWriter::VideoWriter.
+    @return `true` if video writer has been successfully initialized
+
+    The method first calls VideoWriter::release to close the already opened file.
+     */
+    CV_WRAP virtual bool open(const String& filename, int fourcc, double fps,
+                      Size frameSize, bool isColor = true);
+
+    /** @overload
+     */
+    CV_WRAP bool open(const String& filename, int apiPreference, int fourcc, double fps,
+                      Size frameSize, bool isColor = true);
+
+    /** @overload
+     */
+    CV_WRAP bool open(const String& filename, int fourcc, double fps, const Size& frameSize,
+                      const std::vector<int>& params);
+
+    /** @overload
+     */
+    CV_WRAP bool open(const String& filename, int apiPreference, int fourcc, double fps,
+                      const Size& frameSize, const std::vector<int>& params);
+
+    /** @brief Returns true if video writer has been successfully initialized.
+    */
+    CV_WRAP virtual bool isOpened() const;
+
+    /** @brief Closes the video writer.
+
+    The method is automatically called by subsequent VideoWriter::open and by the VideoWriter
+    destructor.
+     */
+    CV_WRAP virtual void release();
+
+    /** @brief Stream operator to write the next video frame.
+    @sa write
+    */
+    virtual VideoWriter& operator << (const Mat& image);
+
+    /** @overload
+    @sa write
+    */
+    virtual VideoWriter& operator << (const UMat& image);
+
+    /** @brief Writes the next video frame
+
+    @param image The written frame. In general, color images are expected in BGR format.
+
+    The function/method writes the specified image to video file. It must have the same size as has
+    been specified when opening the video writer.
+     */
+    CV_WRAP virtual void write(InputArray image);
+
+    /** @brief Sets a property in the VideoWriter.
+
+     @param propId Property identifier from cv::VideoWriterProperties (eg. cv::VIDEOWRITER_PROP_QUALITY)
+     or one of @ref videoio_flags_others
+
+     @param value Value of the property.
+     @return  `true` if the property is supported by the backend used by the VideoWriter instance.
+     */
+    CV_WRAP virtual bool set(int propId, double value);
+
+    /** @brief Returns the specified VideoWriter property
+
+     @param propId Property identifier from cv::VideoWriterProperties (eg. cv::VIDEOWRITER_PROP_QUALITY)
+     or one of @ref videoio_flags_others
+
+     @return Value for the specified property. Value 0 is returned when querying a property that is
+     not supported by the backend used by the VideoWriter instance.
+     */
+    CV_WRAP virtual double get(int propId) const;
+
+    /** @brief Concatenates 4 chars to a fourcc code
+
+    @return a fourcc code
+
+    This static method constructs the fourcc code of the codec to be used in the constructor
+    VideoWriter::VideoWriter or VideoWriter::open.
+     */
+    CV_WRAP static int fourcc(char c1, char c2, char c3, char c4);
+
+    /** @brief Returns used backend API name
+
+     @note Stream should be opened.
+     */
+    CV_WRAP String getBackendName() const;
+
+protected:
+    Ptr<CvVideoWriter> writer;
+    Ptr<IVideoWriter> iwriter;
+
+    static Ptr<IVideoWriter> create(const String& filename, int fourcc, double fps,
+                                    Size frameSize, bool isColor = true);
+};
+
+//! @cond IGNORED
+template<> struct DefaultDeleter<CvCapture>{ CV_EXPORTS void operator ()(CvCapture* obj) const; }; // CvCapture specialization; operator() is only declared here (exported, defined elsewhere)
+template<> struct DefaultDeleter<CvVideoWriter>{ CV_EXPORTS void operator ()(CvVideoWriter* obj) const; }; // CvVideoWriter specialization; operator() is only declared here (exported, defined elsewhere)
+//! @endcond IGNORED
+
+//! @} videoio
+
+} // cv
+
+#endif //OPENCV_VIDEOIO_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/videoio/cap_ios.h b/3rdparty/opencv/opencv410/build/include/opencv2/videoio/cap_ios.h
new file mode 100644
index 000000000000..207ad46ceefe
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/videoio/cap_ios.h
@@ -0,0 +1,150 @@
+/*  For iOS video I/O
+ *  by Eduard Feicho on 29/07/12
+ *  Copyright 2012. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#import <UIKit/UIKit.h>
+#import <Accelerate/Accelerate.h>
+#import <AVFoundation/AVFoundation.h>
+#import <ImageIO/ImageIO.h>
+#include "opencv2/core.hpp"
+
+//! @addtogroup videoio_ios
+//! @{
+
+/////////////////////////////////////// CvAbstractCamera /////////////////////////////////////
+
+@class CvAbstractCamera;
+
+CV_EXPORTS @interface CvAbstractCamera : NSObject // Base camera interface: declares capture-session properties plus start/stop, camera-switching and focus/exposure/balance lock methods.
+{
+    UIDeviceOrientation currentDeviceOrientation;
+
+    BOOL cameraAvailable;
+}
+
+@property (nonatomic, strong) AVCaptureSession* captureSession;
+@property (nonatomic, strong) AVCaptureConnection* videoCaptureConnection;
+
+@property (nonatomic, readonly) BOOL running;
+@property (nonatomic, readonly) BOOL captureSessionLoaded;
+
+@property (nonatomic, assign) int defaultFPS;
+@property (nonatomic, readonly) AVCaptureVideoPreviewLayer *captureVideoPreviewLayer;
+@property (nonatomic, assign) AVCaptureDevicePosition defaultAVCaptureDevicePosition;
+@property (nonatomic, assign) AVCaptureVideoOrientation defaultAVCaptureVideoOrientation;
+@property (nonatomic, assign) BOOL useAVCaptureVideoPreviewLayer;
+@property (nonatomic, strong) NSString *const defaultAVCaptureSessionPreset;
+
+@property (nonatomic, assign) int imageWidth;
+@property (nonatomic, assign) int imageHeight;
+
+@property (nonatomic, strong) UIView* parentView;
+
+- (void)start;
+- (void)stop;
+- (void)switchCameras;
+
+- (id)initWithParentView:(UIView*)parent;
+
+- (void)createCaptureOutput;
+- (void)createVideoPreviewLayer;
+- (void)updateOrientation;
+
+- (void)lockFocus;
+- (void)unlockFocus;
+- (void)lockExposure;
+- (void)unlockExposure;
+- (void)lockBalance;
+- (void)unlockBalance;
+
+@end
+
+///////////////////////////////// CvVideoCamera ///////////////////////////////////////////
+
+@class CvVideoCamera;
+
+CV_EXPORTS @protocol CvVideoCameraDelegate <NSObject>
+
+#ifdef __cplusplus
+// Delegate method for processing image frames; the frame is passed as a non-const cv::Mat reference. Declared only when compiled as C++ (Objective-C++).
+- (void)processImage:(cv::Mat&)image;
+#endif
+
+@end
+
+CV_EXPORTS @interface CvVideoCamera : CvAbstractCamera<AVCaptureVideoDataOutputSampleBufferDelegate> // Video camera: adds a CvVideoCameraDelegate, grayscale/record/rotate switches and AVAssetWriter recording properties.
+{
+    AVCaptureVideoDataOutput *videoDataOutput;
+
+    dispatch_queue_t videoDataOutputQueue;
+    CALayer *customPreviewLayer;
+
+    CMTime lastSampleTime;
+
+}
+
+@property (nonatomic, weak) id<CvVideoCameraDelegate> delegate;
+@property (nonatomic, assign) BOOL grayscaleMode;
+
+@property (nonatomic, assign) BOOL recordVideo;
+@property (nonatomic, assign) BOOL rotateVideo;
+@property (nonatomic, strong) AVAssetWriterInput* recordAssetWriterInput;
+@property (nonatomic, strong) AVAssetWriterInputPixelBufferAdaptor* recordPixelBufferAdaptor;
+@property (nonatomic, strong) AVAssetWriter* recordAssetWriter;
+
+- (void)adjustLayoutToInterfaceOrientation:(UIInterfaceOrientation)interfaceOrientation;
+- (void)layoutPreviewLayer;
+- (void)saveVideo;
+- (NSURL *)videoFileURL;
+- (NSString *)videoFileString;
+
+
+@end
+
+///////////////////////////////// CvPhotoCamera ///////////////////////////////////////////
+
+@class CvPhotoCamera;
+
+CV_EXPORTS @protocol CvPhotoCameraDelegate <NSObject>
+// Callbacks taking the originating CvPhotoCamera; judging by the selector names, one delivers a captured UIImage and the other signals cancellation - confirm against the implementation.
+- (void)photoCamera:(CvPhotoCamera*)photoCamera capturedImage:(UIImage *)image;
+- (void)photoCameraCancel:(CvPhotoCamera*)photoCamera;
+
+@end
+
+CV_EXPORTS @interface CvPhotoCamera : CvAbstractCamera // Photo camera: adds a CvPhotoCameraDelegate and a takePicture method on top of CvAbstractCamera.
+{
+    AVCaptureStillImageOutput *stillImageOutput;
+}
+
+@property (nonatomic, weak) id<CvPhotoCameraDelegate> delegate;
+
+- (void)takePicture;
+
+@end
+
+//! @} videoio_ios
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/videoio/legacy/constants_c.h b/3rdparty/opencv/opencv410/build/include/opencv2/videoio/legacy/constants_c.h
new file mode 100644
index 000000000000..f9831e358ae4
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/videoio/legacy/constants_c.h
@@ -0,0 +1,434 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_VIDEOIO_LEGACY_CONSTANTS_H
+#define OPENCV_VIDEOIO_LEGACY_CONSTANTS_H
+
+enum // legacy C capture backend/driver identifiers; several names below share one value
+{
+    CV_CAP_ANY      =0,     // autodetect
+
+    CV_CAP_MIL      =100,   // MIL proprietary drivers
+
+    CV_CAP_VFW      =200,   // platform native
+    CV_CAP_V4L      =200,   // same value as CV_CAP_VFW
+    CV_CAP_V4L2     =200,   // same value as CV_CAP_VFW
+
+    CV_CAP_FIREWARE =300,   // IEEE 1394 drivers (presumably a misspelling kept next to CV_CAP_FIREWIRE; same value)
+    CV_CAP_FIREWIRE =300,
+    CV_CAP_IEEE1394 =300,
+    CV_CAP_DC1394   =300,
+    CV_CAP_CMU1394  =300,
+
+    CV_CAP_STEREO   =400,   // TYZX proprietary drivers
+    CV_CAP_TYZX     =400,
+    CV_TYZX_LEFT    =400,
+    CV_TYZX_RIGHT   =401,
+    CV_TYZX_COLOR   =402,
+    CV_TYZX_Z       =403,
+
+    CV_CAP_QT       =500,   // QuickTime
+
+    CV_CAP_UNICAP   =600,   // Unicap drivers
+
+    CV_CAP_DSHOW    =700,   // DirectShow (via videoInput)
+    CV_CAP_MSMF     =1400,  // Microsoft Media Foundation (via videoInput)
+
+    CV_CAP_PVAPI    =800,   // PvAPI, Prosilica GigE SDK
+
+    CV_CAP_OPENNI   =900,   // OpenNI (for Kinect)
+    CV_CAP_OPENNI_ASUS =910,   // OpenNI (for Asus Xtion)
+
+    CV_CAP_ANDROID  =1000,  // Android - not used
+    CV_CAP_ANDROID_BACK =CV_CAP_ANDROID+99, // Android back camera - not used (evaluates to 1099)
+    CV_CAP_ANDROID_FRONT =CV_CAP_ANDROID+98, // Android front camera - not used (evaluates to 1098)
+
+    CV_CAP_XIAPI    =1100,   // XIMEA Camera API
+
+    CV_CAP_AVFOUNDATION = 1200,  // AVFoundation framework for iOS (OS X Lion will have the same API)
+
+    CV_CAP_GIGANETIX = 1300,  // Smartek Giganetix GigEVisionSDK
+
+    CV_CAP_INTELPERC = 1500, // Intel Perceptual Computing
+
+    CV_CAP_OPENNI2 = 1600,   // OpenNI2 (for Kinect)
+    CV_CAP_GPHOTO2 = 1700,
+    CV_CAP_GSTREAMER = 1800, // GStreamer
+    CV_CAP_FFMPEG = 1900,    // FFMPEG
+    CV_CAP_IMAGES = 2000,    // OpenCV Image Sequence (e.g. img_%02d.jpg)
+
+    CV_CAP_ARAVIS = 2100     // Aravis GigE SDK
+};
+
+enum
+{
+    // modes of the controlling registers (can be: auto, manual, auto single push, absolute; the latter is allowed with any other mode)
+    // every feature can have only one mode turned on at a time
+    CV_CAP_PROP_DC1394_OFF         = -4,  //turn the feature off (not controlled manually nor automatically)
+    CV_CAP_PROP_DC1394_MODE_MANUAL = -3, //set automatically when a value of the feature is set by the user
+    CV_CAP_PROP_DC1394_MODE_AUTO = -2,
+    CV_CAP_PROP_DC1394_MODE_ONE_PUSH_AUTO = -1,
+    CV_CAP_PROP_POS_MSEC       =0,
+    CV_CAP_PROP_POS_FRAMES     =1,
+    CV_CAP_PROP_POS_AVI_RATIO  =2,
+    CV_CAP_PROP_FRAME_WIDTH    =3,
+    CV_CAP_PROP_FRAME_HEIGHT   =4,
+    CV_CAP_PROP_FPS            =5,
+    CV_CAP_PROP_FOURCC         =6,
+    CV_CAP_PROP_FRAME_COUNT    =7,
+    CV_CAP_PROP_FORMAT         =8,
+    CV_CAP_PROP_MODE           =9,
+    CV_CAP_PROP_BRIGHTNESS    =10,
+    CV_CAP_PROP_CONTRAST      =11,
+    CV_CAP_PROP_SATURATION    =12,
+    CV_CAP_PROP_HUE           =13,
+    CV_CAP_PROP_GAIN          =14,
+    CV_CAP_PROP_EXPOSURE      =15,
+    CV_CAP_PROP_CONVERT_RGB   =16,
+    CV_CAP_PROP_WHITE_BALANCE_BLUE_U =17,
+    CV_CAP_PROP_RECTIFICATION =18,
+    CV_CAP_PROP_MONOCHROME    =19,
+    CV_CAP_PROP_SHARPNESS     =20,
+    CV_CAP_PROP_AUTO_EXPOSURE =21, // exposure control done by camera,
+                                   // user can adjust reference level
+                                   // using this feature
+    CV_CAP_PROP_GAMMA         =22,
+    CV_CAP_PROP_TEMPERATURE   =23,
+    CV_CAP_PROP_TRIGGER       =24,
+    CV_CAP_PROP_TRIGGER_DELAY =25,
+    CV_CAP_PROP_WHITE_BALANCE_RED_V =26,
+    CV_CAP_PROP_ZOOM          =27,
+    CV_CAP_PROP_FOCUS         =28,
+    CV_CAP_PROP_GUID          =29,
+    CV_CAP_PROP_ISO_SPEED     =30,
+    CV_CAP_PROP_MAX_DC1394    =31,
+    CV_CAP_PROP_BACKLIGHT     =32,
+    CV_CAP_PROP_PAN           =33,
+    CV_CAP_PROP_TILT          =34,
+    CV_CAP_PROP_ROLL          =35,
+    CV_CAP_PROP_IRIS          =36,
+    CV_CAP_PROP_SETTINGS      =37,
+    CV_CAP_PROP_BUFFERSIZE    =38,
+    CV_CAP_PROP_AUTOFOCUS     =39,
+    CV_CAP_PROP_SAR_NUM       =40,
+    CV_CAP_PROP_SAR_DEN       =41,
+
+    CV_CAP_PROP_AUTOGRAB      =1024, // property for videoio class CvCapture_Android only
+    CV_CAP_PROP_SUPPORTED_PREVIEW_SIZES_STRING=1025, // readonly, tricky property, returns const char* indeed
+    CV_CAP_PROP_PREVIEW_FORMAT=1026, // readonly, tricky property, returns const char* indeed
+
+    // OpenNI map generators
+    CV_CAP_OPENNI_DEPTH_GENERATOR = 1 << 31, // NOTE(review): 1 << 31 targets the sign bit of a 32-bit int; presumably evaluates to a negative value there - confirm this is intended
+    CV_CAP_OPENNI_IMAGE_GENERATOR = 1 << 30,
+    CV_CAP_OPENNI_IR_GENERATOR    = 1 << 29,
+    CV_CAP_OPENNI_GENERATORS_MASK = CV_CAP_OPENNI_DEPTH_GENERATOR + CV_CAP_OPENNI_IMAGE_GENERATOR + CV_CAP_OPENNI_IR_GENERATOR,
+
+    // Properties of cameras available through OpenNI interfaces
+    CV_CAP_PROP_OPENNI_OUTPUT_MODE     = 100,
+    CV_CAP_PROP_OPENNI_FRAME_MAX_DEPTH = 101, // in mm
+    CV_CAP_PROP_OPENNI_BASELINE        = 102, // in mm
+    CV_CAP_PROP_OPENNI_FOCAL_LENGTH    = 103, // in pixels
+    CV_CAP_PROP_OPENNI_REGISTRATION    = 104, // flag
+    CV_CAP_PROP_OPENNI_REGISTRATION_ON = CV_CAP_PROP_OPENNI_REGISTRATION, // flag that synchronizes the remapping depth map to image map
+                                                                          // by changing depth generator's view point (if the flag is "on") or
+                                                                          // sets this view point to its normal one (if the flag is "off").
+    CV_CAP_PROP_OPENNI_APPROX_FRAME_SYNC = 105,
+    CV_CAP_PROP_OPENNI_MAX_BUFFER_SIZE   = 106,
+    CV_CAP_PROP_OPENNI_CIRCLE_BUFFER     = 107,
+    CV_CAP_PROP_OPENNI_MAX_TIME_DURATION = 108,
+
+    CV_CAP_PROP_OPENNI_GENERATOR_PRESENT = 109,
+    CV_CAP_PROP_OPENNI2_SYNC = 110,
+    CV_CAP_PROP_OPENNI2_MIRROR = 111,
+
+    CV_CAP_OPENNI_IMAGE_GENERATOR_PRESENT         = CV_CAP_OPENNI_IMAGE_GENERATOR + CV_CAP_PROP_OPENNI_GENERATOR_PRESENT,
+    CV_CAP_OPENNI_IMAGE_GENERATOR_OUTPUT_MODE     = CV_CAP_OPENNI_IMAGE_GENERATOR + CV_CAP_PROP_OPENNI_OUTPUT_MODE,
+    CV_CAP_OPENNI_DEPTH_GENERATOR_PRESENT         = CV_CAP_OPENNI_DEPTH_GENERATOR + CV_CAP_PROP_OPENNI_GENERATOR_PRESENT,
+    CV_CAP_OPENNI_DEPTH_GENERATOR_BASELINE        = CV_CAP_OPENNI_DEPTH_GENERATOR + CV_CAP_PROP_OPENNI_BASELINE,
+    CV_CAP_OPENNI_DEPTH_GENERATOR_FOCAL_LENGTH    = CV_CAP_OPENNI_DEPTH_GENERATOR + CV_CAP_PROP_OPENNI_FOCAL_LENGTH,
+    CV_CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION    = CV_CAP_OPENNI_DEPTH_GENERATOR + CV_CAP_PROP_OPENNI_REGISTRATION,
+    CV_CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION_ON = CV_CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION,
+    CV_CAP_OPENNI_IR_GENERATOR_PRESENT            = CV_CAP_OPENNI_IR_GENERATOR + CV_CAP_PROP_OPENNI_GENERATOR_PRESENT,
+
+    // Properties of cameras available through GStreamer interface
+    CV_CAP_GSTREAMER_QUEUE_LENGTH           = 200, // default is 1
+
+    // PVAPI
+    CV_CAP_PROP_PVAPI_MULTICASTIP           = 300, // IP for enabling multicast master mode; 0 disables multicast
+    CV_CAP_PROP_PVAPI_FRAMESTARTTRIGGERMODE = 301, // FrameStartTriggerMode: Determines how a frame is initiated
+    CV_CAP_PROP_PVAPI_DECIMATIONHORIZONTAL  = 302, // Horizontal sub-sampling of the image
+    CV_CAP_PROP_PVAPI_DECIMATIONVERTICAL    = 303, // Vertical sub-sampling of the image
+    CV_CAP_PROP_PVAPI_BINNINGX              = 304, // Horizontal binning factor
+    CV_CAP_PROP_PVAPI_BINNINGY              = 305, // Vertical binning factor
+    CV_CAP_PROP_PVAPI_PIXELFORMAT           = 306, // Pixel format
+
+    // Properties of cameras available through XIMEA SDK interface
+    CV_CAP_PROP_XI_DOWNSAMPLING                                 = 400, // Change image resolution by binning or skipping.
+    CV_CAP_PROP_XI_DATA_FORMAT                                  = 401, // Output data format.
+    CV_CAP_PROP_XI_OFFSET_X                                     = 402, // Horizontal offset from the origin to the area of interest (in pixels).
+    CV_CAP_PROP_XI_OFFSET_Y                                     = 403, // Vertical offset from the origin to the area of interest (in pixels).
+    CV_CAP_PROP_XI_TRG_SOURCE                                   = 404, // Defines source of trigger.
+    CV_CAP_PROP_XI_TRG_SOFTWARE                                 = 405, // Generates an internal trigger. PRM_TRG_SOURCE must be set to TRG_SOFTWARE.
+    CV_CAP_PROP_XI_GPI_SELECTOR                                 = 406, // Selects general purpose input
+    CV_CAP_PROP_XI_GPI_MODE                                     = 407, // Set general purpose input mode
+    CV_CAP_PROP_XI_GPI_LEVEL                                    = 408, // Get general purpose level
+    CV_CAP_PROP_XI_GPO_SELECTOR                                 = 409, // Selects general purpose output
+    CV_CAP_PROP_XI_GPO_MODE                                     = 410, // Set general purpose output mode
+    CV_CAP_PROP_XI_LED_SELECTOR                                 = 411, // Selects camera signalling LED
+    CV_CAP_PROP_XI_LED_MODE                                     = 412, // Define camera signalling LED functionality
+    CV_CAP_PROP_XI_MANUAL_WB                                    = 413, // Calculates White Balance(must be called during acquisition)
+    CV_CAP_PROP_XI_AUTO_WB                                      = 414, // Automatic white balance
+    CV_CAP_PROP_XI_AEAG                                         = 415, // Automatic exposure/gain
+    CV_CAP_PROP_XI_EXP_PRIORITY                                 = 416, // Exposure priority (0.5 - exposure 50%, gain 50%).
+    CV_CAP_PROP_XI_AE_MAX_LIMIT                                 = 417, // Maximum limit of exposure in AEAG procedure
+    CV_CAP_PROP_XI_AG_MAX_LIMIT                                 = 418,  // Maximum limit of gain in AEAG procedure
+    CV_CAP_PROP_XI_AEAG_LEVEL                                   = 419, // Average intensity of output signal AEAG should achieve(in %)
+    CV_CAP_PROP_XI_TIMEOUT                                      = 420, // Image capture timeout in milliseconds
+    CV_CAP_PROP_XI_EXPOSURE                                     = 421, // Exposure time in microseconds
+    CV_CAP_PROP_XI_EXPOSURE_BURST_COUNT                         = 422, // Sets the number of times of exposure in one frame.
+    CV_CAP_PROP_XI_GAIN_SELECTOR                                = 423, // Gain selector for parameter Gain allows to select different type of gains.
+    CV_CAP_PROP_XI_GAIN                                         = 424, // Gain in dB
+    CV_CAP_PROP_XI_DOWNSAMPLING_TYPE                            = 426, // Change image downsampling type.
+    CV_CAP_PROP_XI_BINNING_SELECTOR                             = 427, // Binning engine selector.
+    CV_CAP_PROP_XI_BINNING_VERTICAL                             = 428, // Vertical Binning - number of vertical photo-sensitive cells to combine together.
+    CV_CAP_PROP_XI_BINNING_HORIZONTAL                           = 429, // Horizontal Binning - number of horizontal photo-sensitive cells to combine together.
+    CV_CAP_PROP_XI_BINNING_PATTERN                              = 430, // Binning pattern type.
+    CV_CAP_PROP_XI_DECIMATION_SELECTOR                          = 431, // Decimation engine selector.
+    CV_CAP_PROP_XI_DECIMATION_VERTICAL                          = 432, // Vertical Decimation - vertical sub-sampling of the image - reduces the vertical resolution of the image by the specified vertical decimation factor.
+    CV_CAP_PROP_XI_DECIMATION_HORIZONTAL                        = 433, // Horizontal Decimation - horizontal sub-sampling of the image - reduces the horizontal resolution of the image by the specified vertical decimation factor. NOTE(review): presumably the horizontal decimation factor is meant - confirm.
+    CV_CAP_PROP_XI_DECIMATION_PATTERN                           = 434, // Decimation pattern type.
+    CV_CAP_PROP_XI_TEST_PATTERN_GENERATOR_SELECTOR              = 587, // Selects which test pattern generator is controlled by the TestPattern feature.
+    CV_CAP_PROP_XI_TEST_PATTERN                                 = 588, // Selects which test pattern type is generated by the selected generator.
+    CV_CAP_PROP_XI_IMAGE_DATA_FORMAT                            = 435, // Output data format.
+    CV_CAP_PROP_XI_SHUTTER_TYPE                                 = 436, // Change sensor shutter type(CMOS sensor).
+    CV_CAP_PROP_XI_SENSOR_TAPS                                  = 437, // Number of taps
+    CV_CAP_PROP_XI_AEAG_ROI_OFFSET_X                            = 439, // Automatic exposure/gain ROI offset X
+    CV_CAP_PROP_XI_AEAG_ROI_OFFSET_Y                            = 440, // Automatic exposure/gain ROI offset Y
+    CV_CAP_PROP_XI_AEAG_ROI_WIDTH                               = 441, // Automatic exposure/gain ROI Width
+    CV_CAP_PROP_XI_AEAG_ROI_HEIGHT                              = 442, // Automatic exposure/gain ROI Height
+    CV_CAP_PROP_XI_BPC                                          = 445, // Correction of bad pixels
+    CV_CAP_PROP_XI_WB_KR                                        = 448, // White balance red coefficient
+    CV_CAP_PROP_XI_WB_KG                                        = 449, // White balance green coefficient
+    CV_CAP_PROP_XI_WB_KB                                        = 450, // White balance blue coefficient
+    CV_CAP_PROP_XI_WIDTH                                        = 451, // Width of the Image provided by the device (in pixels).
+    CV_CAP_PROP_XI_HEIGHT                                       = 452, // Height of the Image provided by the device (in pixels).
+    CV_CAP_PROP_XI_REGION_SELECTOR                              = 589, // Selects Region in Multiple ROI which parameters are set by width, height, ... ,region mode
+    CV_CAP_PROP_XI_REGION_MODE                                  = 595, // Activates/deactivates Region selected by Region Selector
+    CV_CAP_PROP_XI_LIMIT_BANDWIDTH                              = 459, // Set/get bandwidth(datarate)(in Megabits)
+    CV_CAP_PROP_XI_SENSOR_DATA_BIT_DEPTH                        = 460, // Sensor output data bit depth.
+    CV_CAP_PROP_XI_OUTPUT_DATA_BIT_DEPTH                        = 461, // Device output data bit depth.
+    CV_CAP_PROP_XI_IMAGE_DATA_BIT_DEPTH                         = 462, // bitdepth of data returned by function xiGetImage
+    CV_CAP_PROP_XI_OUTPUT_DATA_PACKING                          = 463, // Device output data packing (or grouping) enabled. Packing could be enabled if output_data_bit_depth > 8 and packing capability is available.
+    CV_CAP_PROP_XI_OUTPUT_DATA_PACKING_TYPE                     = 464, // Data packing type. Some cameras supports only specific packing type.
+    CV_CAP_PROP_XI_IS_COOLED                                    = 465, // Returns 1 for cameras that support cooling.
+    CV_CAP_PROP_XI_COOLING                                      = 466, // Start camera cooling.
+    CV_CAP_PROP_XI_TARGET_TEMP                                  = 467, // Set sensor target temperature for cooling.
+    CV_CAP_PROP_XI_CHIP_TEMP                                    = 468, // Camera sensor temperature
+    CV_CAP_PROP_XI_HOUS_TEMP                                    = 469, // Camera housing temperature
+    CV_CAP_PROP_XI_HOUS_BACK_SIDE_TEMP                          = 590, // Camera housing back side temperature
+    CV_CAP_PROP_XI_SENSOR_BOARD_TEMP                            = 596, // Camera sensor board temperature
+    CV_CAP_PROP_XI_CMS                                          = 470, // Mode of color management system.
+    CV_CAP_PROP_XI_APPLY_CMS                                    = 471, // Enable applying of CMS profiles to xiGetImage (see XI_PRM_INPUT_CMS_PROFILE, XI_PRM_OUTPUT_CMS_PROFILE).
+    CV_CAP_PROP_XI_IMAGE_IS_COLOR                               = 474, // Returns 1 for color cameras.
+    CV_CAP_PROP_XI_COLOR_FILTER_ARRAY                           = 475, // Returns color filter array type of RAW data.
+    CV_CAP_PROP_XI_GAMMAY                                       = 476, // Luminosity gamma
+    CV_CAP_PROP_XI_GAMMAC                                       = 477, // Chromaticity gamma
+    CV_CAP_PROP_XI_SHARPNESS                                    = 478, // Sharpness Strength
+    CV_CAP_PROP_XI_CC_MATRIX_00                                 = 479, // Color Correction Matrix element [0][0]
+    CV_CAP_PROP_XI_CC_MATRIX_01                                 = 480, // Color Correction Matrix element [0][1]
+    CV_CAP_PROP_XI_CC_MATRIX_02                                 = 481, // Color Correction Matrix element [0][2]
+    CV_CAP_PROP_XI_CC_MATRIX_03                                 = 482, // Color Correction Matrix element [0][3]
+    CV_CAP_PROP_XI_CC_MATRIX_10                                 = 483, // Color Correction Matrix element [1][0]
+    CV_CAP_PROP_XI_CC_MATRIX_11                                 = 484, // Color Correction Matrix element [1][1]
+    CV_CAP_PROP_XI_CC_MATRIX_12                                 = 485, // Color Correction Matrix element [1][2]
+    CV_CAP_PROP_XI_CC_MATRIX_13                                 = 486, // Color Correction Matrix element [1][3]
+    CV_CAP_PROP_XI_CC_MATRIX_20                                 = 487, // Color Correction Matrix element [2][0]
+    CV_CAP_PROP_XI_CC_MATRIX_21                                 = 488, // Color Correction Matrix element [2][1]
+    CV_CAP_PROP_XI_CC_MATRIX_22                                 = 489, // Color Correction Matrix element [2][2]
+    CV_CAP_PROP_XI_CC_MATRIX_23                                 = 490, // Color Correction Matrix element [2][3]
+    CV_CAP_PROP_XI_CC_MATRIX_30                                 = 491, // Color Correction Matrix element [3][0]
+    CV_CAP_PROP_XI_CC_MATRIX_31                                 = 492, // Color Correction Matrix element [3][1]
+    CV_CAP_PROP_XI_CC_MATRIX_32                                 = 493, // Color Correction Matrix element [3][2]
+    CV_CAP_PROP_XI_CC_MATRIX_33                                 = 494, // Color Correction Matrix element [3][3]
+    CV_CAP_PROP_XI_DEFAULT_CC_MATRIX                            = 495, // Set default Color Correction Matrix
+    CV_CAP_PROP_XI_TRG_SELECTOR                                 = 498, // Selects the type of trigger.
+    CV_CAP_PROP_XI_ACQ_FRAME_BURST_COUNT                        = 499, // Sets number of frames acquired by burst. This burst is used only if trigger is set to FrameBurstStart
+    CV_CAP_PROP_XI_DEBOUNCE_EN                                  = 507, // Enable/Disable debounce to selected GPI
+    CV_CAP_PROP_XI_DEBOUNCE_T0                                  = 508, // Debounce time (x * 10us)
+    CV_CAP_PROP_XI_DEBOUNCE_T1                                  = 509, // Debounce time (x * 10us)
+    CV_CAP_PROP_XI_DEBOUNCE_POL                                 = 510, // Debounce polarity (pol = 1 t0 - falling edge, t1 - rising edge)
+    CV_CAP_PROP_XI_LENS_MODE                                    = 511, // Status of lens control interface. This shall be set to XI_ON before any Lens operations.
+    CV_CAP_PROP_XI_LENS_APERTURE_VALUE                          = 512, // Current lens aperture value in stops. Examples: 2.8, 4, 5.6, 8, 11
+    CV_CAP_PROP_XI_LENS_FOCUS_MOVEMENT_VALUE                    = 513, // Lens current focus movement value to be used by XI_PRM_LENS_FOCUS_MOVE in motor steps.
+    CV_CAP_PROP_XI_LENS_FOCUS_MOVE                              = 514, // Moves lens focus motor by steps set in XI_PRM_LENS_FOCUS_MOVEMENT_VALUE.
+    CV_CAP_PROP_XI_LENS_FOCUS_DISTANCE                          = 515, // Lens focus distance in cm.
+    CV_CAP_PROP_XI_LENS_FOCAL_LENGTH                            = 516, // Lens focal distance in mm.
+    CV_CAP_PROP_XI_LENS_FEATURE_SELECTOR                        = 517, // Selects the current feature which is accessible by XI_PRM_LENS_FEATURE.
+    CV_CAP_PROP_XI_LENS_FEATURE                                 = 518, // Allows access to lens feature value currently selected by XI_PRM_LENS_FEATURE_SELECTOR.
+    CV_CAP_PROP_XI_DEVICE_MODEL_ID                              = 521, // Return device model id
+    CV_CAP_PROP_XI_DEVICE_SN                                    = 522, // Return device serial number
+    CV_CAP_PROP_XI_IMAGE_DATA_FORMAT_RGB32_ALPHA                = 529, // The alpha channel of RGB32 output image format.
+    CV_CAP_PROP_XI_IMAGE_PAYLOAD_SIZE                           = 530, // Buffer size in bytes sufficient for output image returned by xiGetImage
+    CV_CAP_PROP_XI_TRANSPORT_PIXEL_FORMAT                       = 531, // Current format of pixels on transport layer.
+    CV_CAP_PROP_XI_SENSOR_CLOCK_FREQ_HZ                         = 532, // Sensor clock frequency in Hz.
+    CV_CAP_PROP_XI_SENSOR_CLOCK_FREQ_INDEX                      = 533, // Sensor clock frequency index. Sensor with selected frequencies have possibility to set the frequency only by this index.
+    CV_CAP_PROP_XI_SENSOR_OUTPUT_CHANNEL_COUNT                  = 534, // Number of output channels from sensor used for data transfer.
+    CV_CAP_PROP_XI_FRAMERATE                                    = 535, // Define framerate in Hz
+    CV_CAP_PROP_XI_COUNTER_SELECTOR                             = 536, // Select counter
+    CV_CAP_PROP_XI_COUNTER_VALUE                                = 537, // Counter status
+    CV_CAP_PROP_XI_ACQ_TIMING_MODE                              = 538, // Type of sensor frames timing.
+    CV_CAP_PROP_XI_AVAILABLE_BANDWIDTH                          = 539, // Calculate and return available interface bandwidth(in Megabits)
+    CV_CAP_PROP_XI_BUFFER_POLICY                                = 540, // Data move policy
+    CV_CAP_PROP_XI_LUT_EN                                       = 541, // Activates LUT.
+    CV_CAP_PROP_XI_LUT_INDEX                                    = 542, // Control the index (offset) of the coefficient to access in the LUT.
+    CV_CAP_PROP_XI_LUT_VALUE                                    = 543, // Value at entry LUTIndex of the LUT
+    CV_CAP_PROP_XI_TRG_DELAY                                    = 544, // Specifies the delay in microseconds (us) to apply after the trigger reception before activating it.
+    CV_CAP_PROP_XI_TS_RST_MODE                                  = 545, // Defines how time stamp reset engine will be armed
+    CV_CAP_PROP_XI_TS_RST_SOURCE                                = 546, // Defines which source will be used for timestamp reset. Writing this parameter will trigger settings of engine (arming)
+    CV_CAP_PROP_XI_IS_DEVICE_EXIST                              = 547, // Returns 1 if camera connected and works properly.
+    CV_CAP_PROP_XI_ACQ_BUFFER_SIZE                              = 548, // Acquisition buffer size in buffer_size_unit. Default bytes.
+    CV_CAP_PROP_XI_ACQ_BUFFER_SIZE_UNIT                         = 549, // Acquisition buffer size unit in bytes. Default 1. E.g. Value 1024 means that buffer_size is in KiBytes
+    CV_CAP_PROP_XI_ACQ_TRANSPORT_BUFFER_SIZE                    = 550, // Acquisition transport buffer size in bytes
+    CV_CAP_PROP_XI_BUFFERS_QUEUE_SIZE                           = 551, // Queue of field/frame buffers
+    CV_CAP_PROP_XI_ACQ_TRANSPORT_BUFFER_COMMIT                  = 552, // Number of buffers to commit to low level
+    CV_CAP_PROP_XI_RECENT_FRAME                                 = 553, // GetImage returns most recent frame
+    CV_CAP_PROP_XI_DEVICE_RESET                                 = 554, // Resets the camera to default state.
+    CV_CAP_PROP_XI_COLUMN_FPN_CORRECTION                        = 555, // Correction of column FPN
+    CV_CAP_PROP_XI_ROW_FPN_CORRECTION                           = 591, // Correction of row FPN
+    CV_CAP_PROP_XI_SENSOR_MODE                                  = 558, // Current sensor mode. Allows to select sensor mode by one integer. Setting of this parameter affects: image dimensions and downsampling.
+    CV_CAP_PROP_XI_HDR                                          = 559, // Enable High Dynamic Range feature.
+    CV_CAP_PROP_XI_HDR_KNEEPOINT_COUNT                          = 560, // The number of kneepoints in the PWLR.
+    CV_CAP_PROP_XI_HDR_T1                                       = 561, // position of first kneepoint(in % of XI_PRM_EXPOSURE)
+    CV_CAP_PROP_XI_HDR_T2                                       = 562, // position of second kneepoint (in % of XI_PRM_EXPOSURE)
+    CV_CAP_PROP_XI_KNEEPOINT1                                   = 563, // value of first kneepoint (% of sensor saturation)
+    CV_CAP_PROP_XI_KNEEPOINT2                                   = 564, // value of second kneepoint (% of sensor saturation)
+    CV_CAP_PROP_XI_IMAGE_BLACK_LEVEL                            = 565, // Last image black level counts. Can be used for Offline processing to recall it.
+    CV_CAP_PROP_XI_HW_REVISION                                  = 571, // Returns hardware revision number.
+    CV_CAP_PROP_XI_DEBUG_LEVEL                                  = 572, // Set debug level
+    CV_CAP_PROP_XI_AUTO_BANDWIDTH_CALCULATION                   = 573, // Automatic bandwidth calculation.
+    CV_CAP_PROP_XI_FFS_FILE_ID                                  = 594, // File number.
+    CV_CAP_PROP_XI_FFS_FILE_SIZE                                = 580, // Size of file.
+    CV_CAP_PROP_XI_FREE_FFS_SIZE                                = 581, // Size of free camera FFS.
+    CV_CAP_PROP_XI_USED_FFS_SIZE                                = 582, // Size of used camera FFS.
+    CV_CAP_PROP_XI_FFS_ACCESS_KEY                               = 583, // Setting of key enables file operations on some cameras.
+    CV_CAP_PROP_XI_SENSOR_FEATURE_SELECTOR                      = 585, // Selects the current feature which is accessible by XI_PRM_SENSOR_FEATURE_VALUE.
+    CV_CAP_PROP_XI_SENSOR_FEATURE_VALUE                         = 586, // Allows access to sensor feature value currently selected by XI_PRM_SENSOR_FEATURE_SELECTOR.
+
+
+    // Properties for Android cameras
+    CV_CAP_PROP_ANDROID_FLASH_MODE = 8001,
+    CV_CAP_PROP_ANDROID_FOCUS_MODE = 8002,
+    CV_CAP_PROP_ANDROID_WHITE_BALANCE = 8003,
+    CV_CAP_PROP_ANDROID_ANTIBANDING = 8004,
+    CV_CAP_PROP_ANDROID_FOCAL_LENGTH = 8005,
+    CV_CAP_PROP_ANDROID_FOCUS_DISTANCE_NEAR = 8006,
+    CV_CAP_PROP_ANDROID_FOCUS_DISTANCE_OPTIMAL = 8007,
+    CV_CAP_PROP_ANDROID_FOCUS_DISTANCE_FAR = 8008,
+    CV_CAP_PROP_ANDROID_EXPOSE_LOCK = 8009,
+    CV_CAP_PROP_ANDROID_WHITEBALANCE_LOCK = 8010,
+
+    // Properties of cameras available through AVFOUNDATION interface
+    CV_CAP_PROP_IOS_DEVICE_FOCUS = 9001,
+    CV_CAP_PROP_IOS_DEVICE_EXPOSURE = 9002,
+    CV_CAP_PROP_IOS_DEVICE_FLASH = 9003,
+    CV_CAP_PROP_IOS_DEVICE_WHITEBALANCE = 9004,
+    CV_CAP_PROP_IOS_DEVICE_TORCH = 9005,
+
+    // Properties of cameras available through Smartek Giganetix Ethernet Vision interface
+    /* --- Vladimir Litvinenko (litvinenko.vladimir@gmail.com) --- */
+    CV_CAP_PROP_GIGA_FRAME_OFFSET_X = 10001,
+    CV_CAP_PROP_GIGA_FRAME_OFFSET_Y = 10002,
+    CV_CAP_PROP_GIGA_FRAME_WIDTH_MAX = 10003,
+    CV_CAP_PROP_GIGA_FRAME_HEIGH_MAX = 10004,
+    CV_CAP_PROP_GIGA_FRAME_SENS_WIDTH = 10005,
+    CV_CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006,
+
+    CV_CAP_PROP_INTELPERC_PROFILE_COUNT               = 11001,
+    CV_CAP_PROP_INTELPERC_PROFILE_IDX                 = 11002,
+    CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE  = 11003,
+    CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE      = 11004,
+    CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD  = 11005,
+    CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ     = 11006,
+    CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT     = 11007,
+
+    // Intel PerC streams
+    CV_CAP_INTELPERC_DEPTH_GENERATOR = 1 << 29, // same bit as CV_CAP_OPENNI_IR_GENERATOR above
+    CV_CAP_INTELPERC_IMAGE_GENERATOR = 1 << 28,
+    CV_CAP_INTELPERC_GENERATORS_MASK = CV_CAP_INTELPERC_DEPTH_GENERATOR + CV_CAP_INTELPERC_IMAGE_GENERATOR
+};
+
+enum // OpenNI data kinds, grouped below by the generator that provides them
+{
+    // Data given from depth generator.
+    CV_CAP_OPENNI_DEPTH_MAP                 = 0, // Depth values in mm (CV_16UC1)
+    CV_CAP_OPENNI_POINT_CLOUD_MAP           = 1, // XYZ in meters (CV_32FC3)
+    CV_CAP_OPENNI_DISPARITY_MAP             = 2, // Disparity in pixels (CV_8UC1)
+    CV_CAP_OPENNI_DISPARITY_MAP_32F         = 3, // Disparity in pixels (CV_32FC1)
+    CV_CAP_OPENNI_VALID_DEPTH_MASK          = 4, // CV_8UC1
+
+    // Data given from RGB image generator.
+    CV_CAP_OPENNI_BGR_IMAGE                 = 5,
+    CV_CAP_OPENNI_GRAY_IMAGE                = 6,
+
+    // Data given from IR image generator.
+    CV_CAP_OPENNI_IR_IMAGE                  = 7
+};
+
+// Supported output modes of the OpenNI image generator. NOTE(review): presumably the values accepted by CV_CAP_PROP_OPENNI_OUTPUT_MODE - confirm.
+enum
+{
+    CV_CAP_OPENNI_VGA_30HZ     = 0,
+    CV_CAP_OPENNI_SXGA_15HZ    = 1,
+    CV_CAP_OPENNI_SXGA_30HZ    = 2,
+    CV_CAP_OPENNI_QVGA_30HZ    = 3,
+    CV_CAP_OPENNI_QVGA_60HZ    = 4
+};
+
+enum // Intel PerC (CV_CAP_INTELPERC_*) data kinds; the per-pixel layout is noted on each entry where documented
+{
+    CV_CAP_INTELPERC_DEPTH_MAP              = 0, // Each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth.
+    CV_CAP_INTELPERC_UVDEPTH_MAP            = 1, // Each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates.
+    CV_CAP_INTELPERC_IR_MAP                 = 2, // Each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam.
+    CV_CAP_INTELPERC_IMAGE                  = 3
+};
+
+// gPhoto2 properties. If propertyId is less than 0, the operation works on the widget whose camera setting ID is the additive inverse (negation) of propertyId.
+// Get IDs by using CAP_PROP_GPHOTO2_WIDGET_ENUMERATE.
+// @see CvCaptureCAM_GPHOTO2 for more info
+enum
+{
+    CV_CAP_PROP_GPHOTO2_PREVIEW           = 17001, // Capture only preview from liveview mode.
+    CV_CAP_PROP_GPHOTO2_WIDGET_ENUMERATE  = 17002, // Readonly, returns (const char *).
+    CV_CAP_PROP_GPHOTO2_RELOAD_CONFIG     = 17003, // Trigger, only by set. Reload camera settings.
+    CV_CAP_PROP_GPHOTO2_RELOAD_ON_CHANGE  = 17004, // Reload all settings on set.
+    CV_CAP_PROP_GPHOTO2_COLLECT_MSGS      = 17005, // Collect messages with details.
+    CV_CAP_PROP_GPHOTO2_FLUSH_MSGS        = 17006, // Readonly, returns (const char *).
+    CV_CAP_PROP_SPEED                     = 17007, // Exposure speed. Can be readonly, depends on camera program.
+    CV_CAP_PROP_APERTURE                  = 17008, // Aperture. Can be readonly, depends on camera program.
+    CV_CAP_PROP_EXPOSUREPROGRAM           = 17009, // Camera exposure program.
+    CV_CAP_PROP_VIEWFINDER                = 17010  // Enter liveview mode.
+};
+
+//! Macro to construct the fourcc code of the codec: packs the low 8 bits of each argument into one value, c1 in the lowest byte and c4 in the highest. Same as CV_FOURCC()
+#define CV_FOURCC_MACRO(c1, c2, c3, c4) (((c1) & 255) + (((c2) & 255) << 8) + (((c3) & 255) << 16) + (((c4) & 255) << 24))
+
+/** @brief Function form of CV_FOURCC_MACRO: constructs the fourcc code of the codec
+
+Simply call it with the four characters of the fourcc code, e.g. `CV_FOURCC('I', 'Y', 'U', 'V')`
+
+List of codes can be obtained at [Video Codecs by FOURCC](https://fourcc.org/codecs.php) page.
+FFMPEG backend with MP4 container natively uses other values as fourcc code:
+see [ObjectType](http://mp4ra.org/#/codecs).
+*/
+CV_INLINE int CV_FOURCC(char c1, char c2, char c3, char c4)
+{
+    return CV_FOURCC_MACRO(c1, c2, c3, c4);
+}
+
+//! (Windows only) Open Codec Selection Dialog
+#define CV_FOURCC_PROMPT -1
+//! (Linux only) Use default codec for specified filename; expands to a CV_FOURCC() call for the 'IYUV' code
+#define CV_FOURCC_DEFAULT CV_FOURCC('I', 'Y', 'U', 'V')
+
+#endif // OPENCV_VIDEOIO_LEGACY_CONSTANTS_H
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/videoio/registry.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/videoio/registry.hpp
new file mode 100644
index 000000000000..cf72247b3fa0
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/videoio/registry.hpp
@@ -0,0 +1,72 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_VIDEOIO_REGISTRY_HPP
+#define OPENCV_VIDEOIO_REGISTRY_HPP
+
+#include <opencv2/videoio.hpp>
+
+namespace cv { namespace videoio_registry {
+/** @addtogroup videoio_registry
+This section describes the API for querying/configuring the available Video I/O backends.
+
+Runtime configuration options:
+- enable debug mode: `OPENCV_VIDEOIO_DEBUG=1`
+- change backend priority: `OPENCV_VIDEOIO_PRIORITY_<backend>=9999`
+- disable backend: `OPENCV_VIDEOIO_PRIORITY_<backend>=0`
+- specify list of backends with high priority (>100000): `OPENCV_VIDEOIO_PRIORITY_LIST=FFMPEG,GSTREAMER`
+
+@{
+ */
+
+
+/** @brief Returns the backend API name or "UnknownVideoAPI(xxx)"
+@param api backend ID (#VideoCaptureAPIs)
+*/
+CV_EXPORTS_W cv::String getBackendName(VideoCaptureAPIs api);
+
+/** @brief Returns the list of all available backends */
+CV_EXPORTS_W std::vector<VideoCaptureAPIs> getBackends();
+
+/** @brief Returns the list of available backends that work via `cv::VideoCapture(int index)` */
+CV_EXPORTS_W std::vector<VideoCaptureAPIs> getCameraBackends();
+
+/** @brief Returns the list of available backends that work via `cv::VideoCapture(filename)` */
+CV_EXPORTS_W std::vector<VideoCaptureAPIs> getStreamBackends();
+
+/** @brief Returns the list of available backends that work via `cv::VideoWriter()` */
+CV_EXPORTS_W std::vector<VideoCaptureAPIs> getWriterBackends();
+
+/** @brief Returns true if the backend is available */
+CV_EXPORTS_W bool hasBackend(VideoCaptureAPIs api);
+
+/** @brief Returns true if the backend is built in (false if the backend is used as a plugin) */
+CV_EXPORTS_W bool isBackendBuiltIn(VideoCaptureAPIs api);
+
+/** @brief Returns the description and the ABI/API versions of the videoio plugin's camera interface */
+CV_EXPORTS_W std::string getCameraBackendPluginVersion(
+    VideoCaptureAPIs api,
+    CV_OUT int& version_ABI,
+    CV_OUT int& version_API
+);
+
+/** @brief Returns the description and the ABI/API versions of the videoio plugin's stream capture interface */
+CV_EXPORTS_W std::string getStreamBackendPluginVersion(
+    VideoCaptureAPIs api,
+    CV_OUT int& version_ABI,
+    CV_OUT int& version_API
+);
+
+/** @brief Returns the description and the ABI/API versions of the videoio plugin's writer interface */
+CV_EXPORTS_W std::string getWriterBackendPluginVersion(
+    VideoCaptureAPIs api,
+    CV_OUT int& version_ABI,
+    CV_OUT int& version_API
+);
+
+
+//! @}
+}} // namespace cv::videoio_registry
+
+#endif // OPENCV_VIDEOIO_REGISTRY_HPP
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/videoio/videoio.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/videoio/videoio.hpp
new file mode 100644
index 000000000000..ec84cf7a6874
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/videoio/videoio.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/videoio.hpp"
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/videoio/videoio_c.h b/3rdparty/opencv/opencv410/build/include/opencv2/videoio/videoio_c.h
new file mode 100644
index 000000000000..cf1a6d0411fe
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/videoio/videoio_c.h
@@ -0,0 +1,153 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_VIDEOIO_H
+#define OPENCV_VIDEOIO_H
+
+#include "opencv2/core/core_c.h"
+
+#include "opencv2/videoio/legacy/constants_c.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+  @addtogroup videoio_c
+  @{
+*/
+
+/****************************************************************************************\
+*                         Working with Video Files and Cameras                           *
+\****************************************************************************************/
+
+/** @brief "black box" capture structure
+
+In C++ use cv::VideoCapture
+*/
+typedef struct CvCapture CvCapture;
+
+/** @brief start capturing frames from video file
+*/
+CVAPI(CvCapture*) cvCreateFileCapture( const char* filename );
+
+/** @brief start capturing frames from video file. allows specifying a preferred API to use
+*/
+CVAPI(CvCapture*) cvCreateFileCaptureWithPreference( const char* filename , int apiPreference);
+
+/** @brief start capturing frames from camera: index = camera_index + domain_offset (CV_CAP_*)
+*/
+CVAPI(CvCapture*) cvCreateCameraCapture( int index );
+
+/** @brief grab a frame, return 1 on success, 0 on failure.
+
+  This function is intended to be fast.
+*/
+CVAPI(int) cvGrabFrame( CvCapture* capture );
+
+/** @brief get the frame grabbed with cvGrabFrame(..)
+
+  This function may apply some frame processing like
+  frame decompression, flipping etc.
+  @warning !!!DO NOT RELEASE or MODIFY the retrieved frame!!!
+*/
+CVAPI(IplImage*) cvRetrieveFrame( CvCapture* capture, int streamIdx CV_DEFAULT(0) );
+
+/** @brief Just a combination of cvGrabFrame and cvRetrieveFrame
+
+  @warning !!!DO NOT RELEASE or MODIFY the retrieved frame!!!
+*/
+CVAPI(IplImage*) cvQueryFrame( CvCapture* capture );
+
+/** @brief stop capturing/reading and free resources
+*/
+CVAPI(void) cvReleaseCapture( CvCapture** capture );
+
+/** @brief retrieve capture properties
+*/
+CVAPI(double) cvGetCaptureProperty( CvCapture* capture, int property_id );
+/** @brief set capture properties
+*/
+CVAPI(int)    cvSetCaptureProperty( CvCapture* capture, int property_id, double value );
+
+/** @brief Return the type of the capturer (e.g. ::CV_CAP_VFW, ::CV_CAP_UNICAP)
+
+The type is unknown if the capture was created with ::CV_CAP_ANY
+*/
+CVAPI(int)    cvGetCaptureDomain( CvCapture* capture);
+
+/** @brief "black box" video file writer structure
+
+In C++ use cv::VideoWriter
+*/
+typedef struct CvVideoWriter CvVideoWriter;
+
+/** @brief initialize video file writer
+*/
+CVAPI(CvVideoWriter*) cvCreateVideoWriter( const char* filename, int fourcc,
+                                           double fps, CvSize frame_size,
+                                           int is_color CV_DEFAULT(1));
+
+/** @brief write frame to video file
+*/
+CVAPI(int) cvWriteFrame( CvVideoWriter* writer, const IplImage* image );
+
+/** @brief close video file writer
+*/
+CVAPI(void) cvReleaseVideoWriter( CvVideoWriter** writer );
+
+// ***************************************************************************************
+//! @name Obsolete functions/synonyms
+//! @{
+#define cvCaptureFromCAM cvCreateCameraCapture //!< @deprecated use cvCreateCameraCapture() instead
+#define cvCaptureFromFile cvCreateFileCapture  //!< @deprecated use cvCreateFileCapture() instead
+#define cvCaptureFromAVI cvCaptureFromFile     //!< @deprecated use cvCreateFileCapture() instead
+#define cvCreateAVIWriter cvCreateVideoWriter  //!< @deprecated use cvCreateVideoWriter() instead
+#define cvWriteToAVI cvWriteFrame              //!< @deprecated use cvWriteFrame() instead
+//!  @} Obsolete...
+
+//! @} videoio_c
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //OPENCV_VIDEOIO_H
diff --git a/3rdparty/opencv/opencv410/build/include/opencv2/world.hpp b/3rdparty/opencv/opencv410/build/include/opencv2/world.hpp
new file mode 100644
index 000000000000..4902c2f2a645
--- /dev/null
+++ b/3rdparty/opencv/opencv410/build/include/opencv2/world.hpp
@@ -0,0 +1,58 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_WORLD_HPP
+#define OPENCV_WORLD_HPP
+
+#include "opencv2/core.hpp"
+
+#ifdef __cplusplus
+namespace cv
+{
+
+CV_EXPORTS_W bool initAll();
+
+}
+
+#endif
+
+#endif
diff --git a/rpcs3/CMakeLists.txt b/rpcs3/CMakeLists.txt
index a8675d945e6a..fd7b16d3c1e1 100644
--- a/rpcs3/CMakeLists.txt
+++ b/rpcs3/CMakeLists.txt
@@ -77,6 +77,9 @@ target_sources(rpcs3
     Input/mm_joystick_handler.cpp
     Input/pad_thread.cpp
     Input/product_info.cpp
+    Input/ps_move_config.cpp
+    Input/ps_move_handler.cpp
+    Input/ps_move_tracker.cpp
     Input/raw_mouse_config.cpp
     Input/raw_mouse_handler.cpp
     Input/sdl_pad_handler.cpp
diff --git a/rpcs3/Emu/Cell/Modules/cellGem.cpp b/rpcs3/Emu/Cell/Modules/cellGem.cpp
index ed38dd850558..57f47a8f4530 100644
--- a/rpcs3/Emu/Cell/Modules/cellGem.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellGem.cpp
@@ -25,8 +25,6 @@
 
 LOG_CHANNEL(cellGem);
 
-extern u32 get_buffer_size_by_format(s32 format, s32 width, s32 height);
-
 template <>
 void fmt_class_string<gem_btn>::format(std::string& out, u64 arg)
 {
@@ -674,7 +672,7 @@ class gem_tracker
 
 		if (m_camera_info.buffer.addr() != addr && m_camera_info.pbuf[0].addr() != addr && m_camera_info.pbuf[1].addr() != addr)
 		{
-			cellGem.error("gem_tracker: unexcepted image address: addr=0x%x, expected one of: 0x%x, 0x%x, 0x%x", addr, m_camera_info.buffer.addr(), m_camera_info.pbuf[0].addr(), m_camera_info.pbuf[1].addr());
+			cellGem.error("gem_tracker: unexpected image address: addr=0x%x, expected one of: 0x%x, 0x%x, 0x%x", addr, m_camera_info.buffer.addr(), m_camera_info.pbuf[0].addr(), m_camera_info.pbuf[1].addr());
 			return false;
 		}
 
@@ -683,15 +681,15 @@ class gem_tracker
 		const auto& [width, height] = get_video_resolution(m_camera_info);
 		const u32 expected_size = get_buffer_size_by_format(m_camera_info.format, width, height);
 
-		if (!m_camera_info.bytesize || m_camera_info.bytesize != expected_size)
+		if (!m_camera_info.bytesize || static_cast<u32>(m_camera_info.bytesize) != expected_size)
 		{
-			cellGem.error("gem_tracker: unexcepted image size: size=%d, expected=%d", m_camera_info.bytesize, expected_size);
+			cellGem.error("gem_tracker: unexpected image size: size=%d, expected=%d", m_camera_info.bytesize, expected_size);
 			return false;
 		}
 
 		if (!m_camera_info.bytesize)
 		{
-			cellGem.error("gem_tracker: unexcepted image size: %d", m_camera_info.bytesize);
+			cellGem.error("gem_tracker: unexpected image size: %d", m_camera_info.bytesize);
 			return false;
 		}
 
diff --git a/rpcs3/Input/ps_move_tracker.cpp b/rpcs3/Input/ps_move_tracker.cpp
index d6e3ded71557..2e0d22ea0572 100644
--- a/rpcs3/Input/ps_move_tracker.cpp
+++ b/rpcs3/Input/ps_move_tracker.cpp
@@ -18,9 +18,6 @@ namespace gem
 	                                 u8* video_data_out, u32 video_data_out_size);
 }
 
-template class ps_move_tracker<false>;
-template class ps_move_tracker<true>;
-
 template <bool DiagnosticsEnabled>
 ps_move_tracker<DiagnosticsEnabled>::ps_move_tracker()
 {
@@ -240,8 +237,6 @@ void ps_move_tracker<DiagnosticsEnabled>::process_hues()
 	const u32 width = m_width;
 	const u32 height = m_height;
 
-	static const double sqrt3 = sqrt(3);
-
 	if constexpr (DiagnosticsEnabled)
 	{
 		std::fill(m_hues.begin(), m_hues.end(), 0);
@@ -562,3 +557,6 @@ std::tuple<s16, float, float> ps_move_tracker<DiagnosticsEnabled>::rgb_to_hsv(fl
 
 	return { hue, saturation, cmax };
 }
+
+template class ps_move_tracker<false>;
+template class ps_move_tracker<true>;
diff --git a/rpcs3/rpcs3qt/CMakeLists.txt b/rpcs3/rpcs3qt/CMakeLists.txt
index ea23b2b3a275..7e6b17cf50e4 100644
--- a/rpcs3/rpcs3qt/CMakeLists.txt
+++ b/rpcs3/rpcs3qt/CMakeLists.txt
@@ -65,6 +65,7 @@ add_library(rpcs3_ui STATIC
     pkg_install_dialog.cpp
     progress_dialog.cpp
     progress_indicator.cpp
+    ps_move_tracker_dialog.cpp
     qt_camera_handler.cpp
     qt_camera_video_sink.cpp
     qt_music_handler.cpp
@@ -116,6 +117,7 @@ add_library(rpcs3_ui STATIC
     pad_settings_dialog.ui
     patch_creator_dialog.ui
     patch_manager_dialog.ui
+    ps_move_tracker_dialog.ui
     settings_dialog.ui
     shortcut_dialog.ui
     welcome_dialog.ui
diff --git a/rpcs3/rpcs3qt/pad_settings_dialog.cpp b/rpcs3/rpcs3qt/pad_settings_dialog.cpp
index 288230bcc7f6..5b24dafcae10 100644
--- a/rpcs3/rpcs3qt/pad_settings_dialog.cpp
+++ b/rpcs3/rpcs3qt/pad_settings_dialog.cpp
@@ -1994,6 +1994,7 @@ QString pad_settings_dialog::GetLocalizedPadName(pad_handler handler, const QStr
 		case pad_handler::ds4: return tr("DS4 Pad #%0").arg(index);
 		case pad_handler::dualsense: return tr("DualSense Pad #%0").arg(index);
 		case pad_handler::skateboard: return tr("Skateboard #%0").arg(index);
+		case pad_handler::move: return tr("PS Move #%0").arg(index);
 #ifdef _WIN32
 		case pad_handler::xinput: return tr("XInput Pad #%0").arg(index);
 		case pad_handler::mm: return tr("Joystick #%0").arg(index);