From 2088b8f89b95ab13a2c695b0a7c1a29842b1bd66 Mon Sep 17 00:00:00 2001 From: Attila Csok Date: Thu, 16 Jan 2025 08:15:22 +0200 Subject: [PATCH] [intel-npu] Adding NPU_COMPILER_DYNAMIC_QUANTIZATION property (#28316) ### Details: - Adding NPU_COMPILER_DYNAMIC_QUANTIZATION public RW boolean property ### Tickets: - *EISW-148716* --- .../npu-device.rst | 1 + .../pyopenvino/core/properties/properties.cpp | 1 + .../tests/test_runtime/test_properties.py | 5 ++ .../openvino/runtime/intel_npu/properties.hpp | 8 +++ src/plugins/intel_npu/README.md | 1 + .../al/include/intel_npu/config/compiler.hpp | 22 ++++++++ .../src/al/include/intel_npu/icompiler.hpp | 5 ++ .../intel_npu/src/al/src/config/compiler.cpp | 1 + .../src/driver_compiler_adapter.cpp | 29 ++++++---- .../intel_npu/src/plugin/include/metrics.hpp | 1 + .../intel_npu/src/plugin/include/plugin.hpp | 8 ++- .../src/plugin/src/compiled_model.cpp | 6 +++ .../intel_npu/src/plugin/src/plugin.cpp | 54 +++++++++++++++++-- .../overload/compiled_model/property.cpp | 1 - 14 files changed, 127 insertions(+), 16 deletions(-) diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst index bbc6cbbc84d5d1..a3bdbfc7c2b7d1 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst @@ -143,6 +143,7 @@ offer a limited set of supported OpenVINO features. 
ov::enable_profiling ov::workload_type ov::intel_npu::compilation_mode_params + ov::intel_npu::compiler_dynamic_quantization ov::intel_npu::turbo ov::intel_npu::tiles ov::intel_npu::max_tiles diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp index 6f417b52716efd..8b1e64b0dd4d18 100644 --- a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp +++ b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp @@ -336,4 +336,5 @@ void regmodule_properties(py::module m) { wrap_property_RW(m_intel_npu, ov::intel_npu::max_tiles, "max_tiles"); wrap_property_RW(m_intel_npu, ov::intel_npu::bypass_umd_caching, "bypass_umd_caching"); wrap_property_RW(m_intel_npu, ov::intel_npu::defer_weights_load, "defer_weights_load"); + wrap_property_RW(m_intel_npu, ov::intel_npu::compiler_dynamic_quantization, "compiler_dynamic_quantization"); } diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py index fc726a0915a97d..447e668d3ad82d 100644 --- a/src/bindings/python/tests/test_runtime/test_properties.py +++ b/src/bindings/python/tests/test_runtime/test_properties.py @@ -452,6 +452,11 @@ def test_properties_ro(ov_property_ro, expected_value): "NPU_DEFER_WEIGHTS_LOAD", ((True, True),), ), + ( + intel_npu.compiler_dynamic_quantization, + "NPU_COMPILER_DYNAMIC_QUANTIZATION", + ((True, True),), + ), ], ) def test_properties_rw(ov_property_rw, expected_value, test_values): diff --git a/src/inference/include/openvino/runtime/intel_npu/properties.hpp b/src/inference/include/openvino/runtime/intel_npu/properties.hpp index c5bf2331ad7dff..723a8b26f555d4 100644 --- a/src/inference/include/openvino/runtime/intel_npu/properties.hpp +++ b/src/inference/include/openvino/runtime/intel_npu/properties.hpp @@ -69,6 +69,14 @@ static constexpr ov::Property compiler_ver */ static constexpr ov::Property 
compilation_mode_params{"NPU_COMPILATION_MODE_PARAMS"}; +/** + * @brief [Only for NPU compiler] + * Type: boolean + * Set or verify state of dynamic quantization in the NPU compiler + * @ingroup ov_runtime_npu_prop_cpp_api + */ +static constexpr ov::Property compiler_dynamic_quantization{"NPU_COMPILER_DYNAMIC_QUANTIZATION"}; + /** * @brief [Only for NPU plugin] * Type: std::bool diff --git a/src/plugins/intel_npu/README.md b/src/plugins/intel_npu/README.md index b5da3bff6e1b47..e0601800abcb7d 100644 --- a/src/plugins/intel_npu/README.md +++ b/src/plugins/intel_npu/README.md @@ -173,6 +173,7 @@ The following properties are supported: | `ov::intel_npu::driver_version`/
`NPU_DRIVER_VERSION` | RO | NPU driver version. | `N/A` | `N/A` | | `ov::intel_npu::compiler_version`/
`NPU_COMPILER_VERSION` | RO | NPU compiler version. MSB 16 bits are Major version, LSB 16 bits are Minor version | `N/A` | `N/A` | | `ov::intel_npu::compilation_mode_params`/
`NPU_COMPILATION_MODE_PARAMS` | RW | Set various parameters supported by the NPU compiler. (See bellow) | ``| `N/A` | +| `ov::intel_npu::compiler_dynamic_quantization`/
`NPU_COMPILER_DYNAMIC_QUANTIZATION` | RW | Enable/Disable dynamic quantization by NPU compiler | `YES` / `NO` | `N/A` | | `ov::intel_npu::turbo`/
`NPU_TURBO` | RW | Set Turbo mode on/off | `YES`/ `NO`| `NO` | | `ov::intel_npu::tiles`/
`NPU_TILES` | RW | Sets the number of npu tiles to compile the model for | `[0-]` | `-1` | | `ov::intel_npu::max_tiles`/
`NPU_MAX_TILES` | RW | Maximum number of tiles supported by the device we compile for. Can be set for offline compilation. If not set, it will be populated by driver.| `[0-]` | `[1-6] depends on npu platform` | diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/compiler.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/compiler.hpp index 3e905a09757223..ba2767fa56065e 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/compiler.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/compiler.hpp @@ -357,4 +357,26 @@ struct COMPILATION_NUM_THREADS final : OptionBase { + static std::string_view key() { + return ov::intel_npu::compiler_dynamic_quantization.name(); + } + + static bool defaultValue() { + return false; + } + + static OptionMode mode() { + return OptionMode::CompileTime; + } + + static bool isPublic() { + return true; + } +}; + } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/icompiler.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/icompiler.hpp index 5ed60641b8fa1e..5751eec3c544ae 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/icompiler.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/icompiler.hpp @@ -12,6 +12,11 @@ namespace intel_npu { +#ifndef ICOMPILER_MAKE_VERSION +/// @brief Generates npu compiler (generic 'oneAPI') API version number +# define ICOMPILER_MAKE_VERSION(_major, _minor) ((_major << 16) | (_minor & 0x0000ffff)) +#endif // ICOMPILER_MAKE_VERSION + /** * @struct NetworkDescription * @brief The object returned by the compiler diff --git a/src/plugins/intel_npu/src/al/src/config/compiler.cpp b/src/plugins/intel_npu/src/al/src/config/compiler.cpp index 71d12147c816ce..7ccb0dff85905c 100644 --- a/src/plugins/intel_npu/src/al/src/config/compiler.cpp +++ b/src/plugins/intel_npu/src/al/src/config/compiler.cpp @@ -24,6 +24,7 @@ void intel_npu::registerCompilerOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + 
desc.add(); } // diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp index 95ac5b1c10b0db..d7c4def10c8c93 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp @@ -426,10 +426,10 @@ std::string DriverCompilerAdapter::serializeConfig(const Config& config, optLevelStr << keyOfOptL << KEY_VALUE_SEPARATOR << "\\d+"; std::ostringstream perfHintStr; perfHintStr << keyOfPerfHO << KEY_VALUE_SEPARATOR << "\\S+"; - logger.warning("%s property is not suppored by this compiler version. Removing from parameters", + logger.warning("%s property is not supported by this compiler version. Removing from parameters", keyOfOptL.c_str()); valueOfParams = std::regex_replace(valueOfParams, std::regex(optLevelStr.str()), ""); - logger.warning("%s property is not suppored by this compiler version. Removing from parameters", + logger.warning("%s property is not supported by this compiler version. Removing from parameters", keyOfPerfHO.c_str()); valueOfParams = std::regex_replace(valueOfParams, std::regex(perfHintStr.str()), ""); @@ -487,7 +487,7 @@ std::string DriverCompilerAdapter::serializeConfig(const Config& config, pinningstr << ov::hint::enable_cpu_pinning.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER << "\\S+" << VALUE_DELIMITER; logger.warning( - "ENABLE_CPU_PINNING property is not suppored by this compiler version. Removing from parameters"); + "ENABLE_CPU_PINNING property is not supported by this compiler version. 
Removing from parameters"); content = std::regex_replace(content, std::regex(pinningstr.str()), ""); } @@ -499,9 +499,9 @@ std::string DriverCompilerAdapter::serializeConfig(const Config& config, std::ostringstream maxtilestr; maxtilestr << ov::intel_npu::max_tiles.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER << "\\d+" << VALUE_DELIMITER; - logger.warning("NPU_STEPPING property is not suppored by this compiler version. Removing from parameters"); + logger.warning("NPU_STEPPING property is not supported by this compiler version. Removing from parameters"); content = std::regex_replace(content, std::regex(stepstr.str()), ""); - logger.warning("NPU_MAX_TILES property is not suppored by this compiler version. Removing from parameters"); + logger.warning("NPU_MAX_TILES property is not supported by this compiler version. Removing from parameters"); content = std::regex_replace(content, std::regex(maxtilestr.str()), ""); } @@ -511,13 +511,13 @@ std::string DriverCompilerAdapter::serializeConfig(const Config& config, precstr << ov::hint::inference_precision.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER << "\\S+" << VALUE_DELIMITER; logger.warning( - "INFERENCE_PRECISION_HINT property is not suppored by this compiler version. Removing from parameters"); + "INFERENCE_PRECISION_HINT property is not supported by this compiler version. Removing from parameters"); content = std::regex_replace(content, std::regex(precstr.str()), ""); } /// Replacing NPU_TILES (for all versions) with NPU_DPU_GROUPS for backwards compatibility if (std::regex_search(content, std::regex(ov::intel_npu::tiles.name()))) { - logger.warning("NPU_TILES property is not suppored by this compiler version. Swaping it to " + logger.warning("NPU_TILES property is not supported by this compiler version. 
Swaping it to " "NPU_DPU_GROUPS (obsolete)"); content = std::regex_replace(content, std::regex(ov::intel_npu::tiles.name()), "NPU_DPU_GROUPS"); } @@ -528,7 +528,7 @@ std::string DriverCompilerAdapter::serializeConfig(const Config& config, batchstr << ov::intel_npu::batch_mode.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER << "\\S+" << VALUE_DELIMITER; - logger.warning("NPU_BATCH_MODE property is not suppored by this compiler version. Removing from parameters"); + logger.warning("NPU_BATCH_MODE property is not supported by this compiler version. Removing from parameters"); content = std::regex_replace(content, std::regex(batchstr.str()), ""); } @@ -538,10 +538,21 @@ std::string DriverCompilerAdapter::serializeConfig(const Config& config, batchstr << ov::hint::execution_mode.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER << "\\S+" << VALUE_DELIMITER; logger.warning( - "EXECUTION_MODE_HINT property is not suppored by this compiler version. Removing from parameters"); + "EXECUTION_MODE_HINT property is not supported by this compiler version. Removing from parameters"); content = std::regex_replace(content, std::regex(batchstr.str()), ""); } + // COMPILER_DYNAMIC_QUANTIZATION is not supported in versions < 7.1 - need to remove it + if ((compilerVersion.major < 7) || (compilerVersion.major == 7 && compilerVersion.minor < 1)) { + std::ostringstream dqstr; + dqstr << ov::intel_npu::compiler_dynamic_quantization.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER << "\\S+" + << VALUE_DELIMITER; + logger.warning( + "COMPILER_DYNAMIC_QUANTIZATION property is not supported by this compiler version. 
Removing from " + "parameters"); + content = std::regex_replace(content, std::regex(dqstr.str()), ""); + } + // NPU_DEFER_WEIGHTS_LOAD is needed at runtime only { std::ostringstream batchstr; diff --git a/src/plugins/intel_npu/src/plugin/include/metrics.hpp b/src/plugins/intel_npu/src/plugin/include/metrics.hpp index f3652d6d7add65..91f78a9cd773f6 100644 --- a/src/plugins/intel_npu/src/plugin/include/metrics.hpp +++ b/src/plugins/intel_npu/src/plugin/include/metrics.hpp @@ -58,6 +58,7 @@ class Metrics final { }; const std::vector _cachingProperties = {ov::device::architecture.name(), ov::intel_npu::compilation_mode_params.name(), + ov::intel_npu::compiler_dynamic_quantization.name(), ov::intel_npu::tiles.name(), ov::intel_npu::dpu_groups.name(), ov::intel_npu::dma_engines.name(), diff --git a/src/plugins/intel_npu/src/plugin/include/plugin.hpp b/src/plugins/intel_npu/src/plugin/include/plugin.hpp index ec78ab223d3f35..b13be5000513ec 100644 --- a/src/plugins/intel_npu/src/plugin/include/plugin.hpp +++ b/src/plugins/intel_npu/src/plugin/include/plugin.hpp @@ -61,10 +61,14 @@ class Plugin : public ov::IPlugin { std::unique_ptr _metrics; // properties map: {name -> [supported, mutable, eval function]} - std::map>> _properties; - std::vector _supportedProperties; + mutable std::map>> + _properties; + mutable std::vector _supportedProperties; static std::atomic _compiledModelLoadCounter; + + void reset_compiler_dependent_properties() const; + void reset_supported_properties() const; }; } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp index 516518f6999cd3..c680e0a59ad0a8 100644 --- a/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp @@ -264,6 +264,12 @@ void CompiledModel::initialize_properties() { [](const Config& config) { return config.get(); }}}, + 
{ov::intel_npu::compiler_dynamic_quantization.name(), + {true, + ov::PropertyMutability::RO, + [](const Config& config) { + return config.get(); + }}}, {ov::intel_npu::turbo.name(), {isPropertySupported(ov::intel_npu::turbo.name()), ov::PropertyMutability::RO, diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp index 301a71887054f2..a0af187d42944d 100644 --- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp +++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp @@ -7,11 +7,8 @@ #include #include "compiled_model.hpp" -#include "npuw/compiled_model.hpp" -#include "npuw/llm_compiled_model.hpp" -#include "npuw/serialization.hpp" -#include "driver_compiler_adapter.hpp" #include "compiler_adapter_factory.hpp" +#include "driver_compiler_adapter.hpp" #include "intel_npu/common/device_helpers.hpp" #include "intel_npu/common/icompiler_adapter.hpp" #include "intel_npu/common/igraph.hpp" @@ -23,6 +20,8 @@ #include "intel_npu/utils/zero/zero_init.hpp" #include "metadata.hpp" #include "npuw/compiled_model.hpp" +#include "npuw/llm_compiled_model.hpp" +#include "npuw/serialization.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/parameter.hpp" #include "openvino/runtime/intel_npu/properties.hpp" @@ -450,6 +449,12 @@ Plugin::Plugin() [](const Config& config) { return config.get(); }}}, + {ov::intel_npu::compiler_dynamic_quantization.name(), + {false, + ov::PropertyMutability::RW, + [](const Config& config) { + return config.get(); + }}}, {ov::intel_npu::turbo.name(), {_backends->isCommandQueueExtSupported(), ov::PropertyMutability::RW, @@ -566,7 +571,12 @@ Plugin::Plugin() {ov::intel_npu::batch_mode.name(), {false, ov::PropertyMutability::RW, [](const Config& config) { return config.getString(); }}}}; +} +void Plugin::reset_supported_properties() const { + /// reset first + _supportedProperties.clear(); /// Mutable member + /// populate for (auto& property : _properties) { if (std::get<0>(property.second)) { 
_supportedProperties.emplace_back(ov::PropertyName(property.first, std::get<1>(property.second))); @@ -574,9 +584,27 @@ Plugin::Plugin() } } +void Plugin::reset_compiler_dependent_properties() const { + // get active compiler version + CompilerAdapterFactory compilerAdapterFactory; + auto dummyCompiler = compilerAdapterFactory.getCompiler(_backends->getIEngineBackend(), _globalConfig); + uint32_t active_compiler_version = dummyCompiler->get_version(); + + // NPU_COMPILER_DYNAMIC_QUANTIZATION + // unpublish if compiler version requirement is not met + if (_properties.find(ov::intel_npu::compiler_dynamic_quantization.name()) != _properties.end()) { + if (active_compiler_version >= ICOMPILER_MAKE_VERSION(7, 1)) { + std::get<0>(_properties[ov::intel_npu::compiler_dynamic_quantization.name()]) = true; /// mark supported + } else { + std::get<0>(_properties[ov::intel_npu::compiler_dynamic_quantization.name()]) = false; // mark unsupported + } + } +} + void Plugin::set_property(const ov::AnyMap& properties) { const std::map config = any_copy(properties); update_log_level(config); + bool compiler_type_change = false; for (const auto& configEntry : config) { if (_properties.find(configEntry.first) == _properties.end()) { OPENVINO_THROW("Unsupported configuration key: ", configEntry.first); @@ -584,6 +612,10 @@ void Plugin::set_property(const ov::AnyMap& properties) { if (std::get<1>(_properties[configEntry.first]) == ov::PropertyMutability::RO) { OPENVINO_THROW("READ-ONLY configuration key: ", configEntry.first); } + if (configEntry.first == ov::intel_npu::compiler_type.name()) { + // we just assume its a change, not compare against old value + compiler_type_change = true; + } } } @@ -595,12 +627,26 @@ void Plugin::set_property(const ov::AnyMap& properties) { for (const auto& entry : config) { _config[entry.first] = entry.second; } + + if (compiler_type_change) { + // if compiler type was changed > need to reset properties to match the new compiler + // since properties 
have changed > need to reset supported_properties as well + reset_compiler_dependent_properties(); + reset_supported_properties(); + } } ov::Any Plugin::get_property(const std::string& name, const ov::AnyMap& arguments) const { const std::map& amends = any_copy(arguments); const Config amendedConfig = merge_configs(_globalConfig, amends); + /// Special case for supportedProperties + /// populate it at first get + if (name == ov::supported_properties.name() && _supportedProperties.size() < 1) { + reset_compiler_dependent_properties(); + reset_supported_properties(); + } + auto&& configIterator = _properties.find(name); if (configIterator != _properties.cend()) { return std::get<2>(configIterator->second)(amendedConfig); diff --git a/src/plugins/intel_npu/tests/functional/internal/overload/compiled_model/property.cpp b/src/plugins/intel_npu/tests/functional/internal/overload/compiled_model/property.cpp index 6ecc1886778b9d..6565c9bbb7a9c1 100644 --- a/src/plugins/intel_npu/tests/functional/internal/overload/compiled_model/property.cpp +++ b/src/plugins/intel_npu/tests/functional/internal/overload/compiled_model/property.cpp @@ -123,7 +123,6 @@ std::vector> plugin_public_mutable_properties = std::vector> plugin_internal_mutable_properties = { {ov::intel_npu::compilation_mode_params.name(), ov::Any("use-user-precision=false propagate-quant-dequant=0")}, {ov::intel_npu::dma_engines.name(), ov::Any(1)}, - {ov::intel_npu::compiler_type.name(), ov::Any(ov::intel_npu::CompilerType::MLIR)}, {ov::intel_npu::platform.name(), ov::Any(ov::intel_npu::Platform::AUTO_DETECT)}, {ov::intel_npu::compilation_mode.name(), ov::Any("DefaultHW")}, {ov::intel_npu::max_tiles.name(), ov::Any(8)},