diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 247a940c7..f803de40a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -61,7 +61,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install '.[dev,train]' + python -m pip install '.[train, onnx, openvino, dev]' - name: Run unit tests run: | diff --git a/docs/_static/css/custom.css b/docs/_static/css/custom.css index 0bab3e76f..d5a916a4d 100644 --- a/docs/_static/css/custom.css +++ b/docs/_static/css/custom.css @@ -41,6 +41,7 @@ dl.class > dt { border-color: rgb(55 65 81); background-color: #e3e3e3; color: #404040; /* Override the colors imposed by */ + max-width: 18rem; } .components > .box:nth-child(1) > .header { diff --git a/docs/conf.py b/docs/conf.py index c7d536ee1..d8cd6d304 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -43,6 +43,7 @@ "sphinx.ext.intersphinx", "sphinx.ext.linkcode", "sphinx_inline_tabs", + "sphinxcontrib.mermaid", ] # Add any paths that contain templates here, relative to this directory. @@ -68,6 +69,7 @@ "datasets": ("https://huggingface.co/docs/datasets/main/en/", None), "transformers": ("https://huggingface.co/docs/transformers/main/en/", None), "huggingface_hub": ("https://huggingface.co/docs/huggingface_hub/main/en/", None), + "optimum": ("https://huggingface.co/docs/optimum/main/en/", None), "torch": ("https://pytorch.org/docs/stable/", None), } diff --git a/docs/img/backends_benchmark_cpu.png b/docs/img/backends_benchmark_cpu.png new file mode 100644 index 000000000..72ee321da Binary files /dev/null and b/docs/img/backends_benchmark_cpu.png differ diff --git a/docs/img/backends_benchmark_gpu.png b/docs/img/backends_benchmark_gpu.png new file mode 100644 index 000000000..11e0a00ee Binary files /dev/null and b/docs/img/backends_benchmark_gpu.png differ diff --git a/docs/installation.md b/docs/installation.md index 84ce4c0e4..707ad31d8 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,10 +1,14 @@ # Installation -We recommend **Python 3.8+**, **[PyTorch 1.11.0+](https://pytorch.org/get-started/locally/)**, and **[transformers v4.34.0+](https://github.com/huggingface/transformers)**. There are three options to install Sentence Transformers: +We recommend **Python 3.8+**, **[PyTorch 1.11.0+](https://pytorch.org/get-started/locally/)**, and **[transformers v4.41.0+](https://github.com/huggingface/transformers)**. There are 5 extra options to install Sentence Transformers: * **Default:** This allows for loading, saving, and inference (i.e., getting embeddings) of models. -* **Default and Training**: All of the above plus training. +* **ONNX:** This allows for loading, saving, inference, optimizing, and quantizing of models using the ONNX backend. +* **OpenVINO:** This allows for loading, saving, and inference of models using the OpenVINO backend. +* **Default and Training**: Like **Default**, plus training. * **Development**: All of the above plus some dependencies for developing Sentence Transformers, see [Editable Install](#editable-install). +Note that you can mix and match the various extras, e.g. ``pip install -U "sentence-transformers[train, onnx-gpu]"`` + ## Install with pip ```eval_rst @@ -15,6 +19,24 @@ We recommend **Python 3.8+**, **[PyTorch 1.11.0+](https://pytorch.org/get-starte pip install -U sentence-transformers +.. tab:: ONNX + + For GPU and CPU: + :: + + pip install -U "sentence-transformers[onnx-gpu]" + + For CPU only: + :: + + pip install -U "sentence-transformers[onnx]" + +.. 
tab:: OpenVINO + + :: + + pip install -U "sentence-transformers[openvino]" + .. tab:: Default and Training :: @@ -47,6 +69,24 @@ We recommend **Python 3.8+**, **[PyTorch 1.11.0+](https://pytorch.org/get-starte conda install -c conda-forge sentence-transformers +.. tab:: ONNX + + For GPU and CPU: + :: + + pip install -U "sentence-transformers[onnx-gpu]" + + For CPU only: + :: + + pip install -U "sentence-transformers[onnx]" + +.. tab:: OpenVINO + + :: + + pip install -U "sentence-transformers[openvino]" + .. tab:: Default and Training :: @@ -81,6 +121,24 @@ You can install ``sentence-transformers`` directly from source to take advantage pip install git+https://github.com/UKPLab/sentence-transformers.git +.. tab:: ONNX + + For GPU and CPU: + :: + + pip install -U "sentence-transformers[onnx-gpu] @ git+https://github.com/UKPLab/sentence-transformers.git" + + For CPU only: + :: + + pip install -U "sentence-transformers[onnx] @ git+https://github.com/UKPLab/sentence-transformers.git" + +.. tab:: OpenVINO + + :: + + pip install -U "sentence-transformers[openvino] @ git+https://github.com/UKPLab/sentence-transformers.git" + .. tab:: Default and Training :: diff --git a/docs/package_reference/util.md b/docs/package_reference/util.md index a684df522..3e81f6de2 100644 --- a/docs/package_reference/util.md +++ b/docs/package_reference/util.md @@ -7,6 +7,12 @@ :members: paraphrase_mining, semantic_search, community_detection, http_get, truncate_embeddings, normalize_embeddings, is_training_available, mine_hard_negatives ``` +## Model Optimization +```eval_rst +.. automodule:: sentence_transformers.backend + :members: export_optimized_onnx_model, export_dynamic_quantized_onnx_model +``` + ## Similarity Metrics ```eval_rst diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 14f22eed9..3d4c61b82 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -23,6 +23,7 @@ Once you have `installed `_ Sentence Transformers, you can ea - :meth:`SentenceTransformer.similarity_pairwise ` - `SentenceTransformer > Usage <./sentence_transformer/usage/usage.html>`_ + - `SentenceTransformer > Usage > Speeding up Inference <./sentence_transformer/usage/efficiency.html>`_ - `SentenceTransformer > Pretrained Models <./sentence_transformer/pretrained_models.html>`_ - `SentenceTransformer > Training Overview <./sentence_transformer/training_overview.html>`_ - `SentenceTransformer > Dataset Overview <./sentence_transformer/dataset_overview.html>`_ @@ -55,10 +56,14 @@ Once you have `installed `_ Sentence Transformers, you can ea # [0.6660, 1.0000, 0.1411], # [0.1046, 0.1411, 1.0000]]) -With ``SentenceTransformer("all-MiniLM-L6-v2")`` we pick which `Sentence Transformer model `_ we load. In this example, we load `all-MiniLM-L6-v2 `_, which is a MiniLM model finetuned on a large dataset of over 1 billion training pairs. Using `SentenceTransformer.similarity() <./package_reference/sentence_transformer/SentenceTransformer.html#sentence_transformers.SentenceTransformer.similarity>`_, we compute the similarity between all pairs of sentences. As expected, the similarity between the first two sentences (0.6660) is higher than the similarity between the first and the third sentence (0.1046) or the second and the third sentence (0.1411). +With ``SentenceTransformer("all-MiniLM-L6-v2")`` we pick which `Sentence Transformer model `_ we load. In this example, we load `all-MiniLM-L6-v2 `_, which is a MiniLM model finetuned on a large dataset of over 1 billion training pairs. 
Using :meth:`SentenceTransformer.similarity() `, we compute the similarity between all pairs of sentences. As expected, the similarity between the first two sentences (0.6660) is higher than the similarity between the first and the third sentence (0.1046) or the second and the third sentence (0.1411). Finetuning Sentence Transformer models is easy and requires only a few lines of code. For more information, see the `Training Overview <./sentence_transformer/training_overview.html>`_ section. +.. tip:: + + Read `Sentence Transformer > Usage > Speeding up Inference `_ for tips on how to speed up inference of models by up to 2x-3x. + Cross Encoder ------------- diff --git a/docs/requirements.txt b/docs/requirements.txt index 312d2e2eb..e952976da 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -6,4 +6,5 @@ sphinx_markdown_tables==0.0.17 recommonmark==0.7.1 sphinx-copybutton==0.5.2 sphinx_inline_tabs==2023.4.21 +sphinxcontrib-mermaid==0.8.1 -e .. \ No newline at end of file diff --git a/docs/sentence_transformer/pretrained_models.md b/docs/sentence_transformer/pretrained_models.md index b7afc6cb0..316b361ea 100644 --- a/docs/sentence_transformer/pretrained_models.md +++ b/docs/sentence_transformer/pretrained_models.md @@ -31,6 +31,10 @@ similarities = model.similarity(embeddings, embeddings) - **Model sizes**: it is recommended to filter away the large models that might not be feasible without excessive hardware. - **Experimentation is key**: models that perform well on the leaderboard do not necessarily do well on your tasks, it is **crucial** to experiment with various promising models. + +.. tip:: + + Read `Sentence Transformer > Usage > Speeding up Inference <./usage/efficiency.html>`_ for tips on how to speed up inference of models by up to 2x-3x. ``` ## Original Models diff --git a/docs/sentence_transformer/usage/efficiency.rst b/docs/sentence_transformer/usage/efficiency.rst new file mode 100644 index 000000000..c30770078 --- /dev/null +++ b/docs/sentence_transformer/usage/efficiency.rst @@ -0,0 +1,443 @@ + +Speeding up Inference +===================== + +Sentence Transformers supports 3 backends for computing embeddings, each with its own optimizations for speeding up inference: + + +.. raw:: html + +
    <div class="components">
        <a href="#pytorch" class="box">
            <div class="header">PyTorch</div>
            The default backend for Sentence Transformers.
        </a>
        <a href="#onnx" class="box">
            <div class="header">ONNX</div>
            Flexible and efficient model accelerator.
        </a>
        <a href="#openvino" class="box">
            <div class="header">OpenVINO</div>
            Optimization of models, primarily for Intel Hardware.
        </a>
        <a href="#benchmarks" class="box">
            <div class="header">Benchmarks</div>
            Benchmarks for the different backends.
        </a>
    </div>
+ +PyTorch +------- + +The PyTorch backend is the default backend for Sentence Transformers. If you don't specify a device, it will use the strongest available option across "cuda", "mps", and "cpu". Its default usage looks like this: + +.. code-block:: python + + from sentence_transformers import SentenceTransformer + + model = SentenceTransformer("all-MiniLM-L6-v2") + + sentences = ["This is an example sentence", "Each sentence is converted"] + embeddings = model.encode(sentences) + +If you're using a GPU, then you can use the following options to speed up your inference: + +.. tab:: float16 (fp16) + + Float32 (fp32, full precision) is the default floating-point format in ``torch``, whereas float16 (fp16, half precision) is a reduced-precision floating-point format that can speed up inference on GPUs at a minimal loss of model accuracy. To use it, you can specify the ``torch_dtype`` during initialization or call :meth:`model.half() ` on the initialized model: + + .. code-block:: python + + from sentence_transformers import SentenceTransformer + + model = SentenceTransformer("all-MiniLM-L6-v2", model_kwargs={"torch_dtype": "float16"}) + # or: model.half() + + sentences = ["This is an example sentence", "Each sentence is converted"] + embeddings = model.encode(sentences) + +.. tab:: bfloat16 (bf16) + + Bfloat16 (bf16) is similar to fp16, but preserves more of the original accuracy of fp32. To use it, you can specify the ``torch_dtype`` during initialization or call :meth:`model.bfloat16() ` on the initialized model: + + .. code-block:: python + + from sentence_transformers import SentenceTransformer + + model = SentenceTransformer("all-MiniLM-L6-v2", model_kwargs={"torch_dtype": "bfloat16"}) + # or: model.bfloat16() + + sentences = ["This is an example sentence", "Each sentence is converted"] + embeddings = model.encode(sentences) + +ONNX +---- + +ONNX can be used to speed up inference by converting the model to ONNX format and using ONNX Runtime to run the model. To use the ONNX backend, you must install Sentence Transformers with the ``onnx`` or ``onnx-gpu`` extra for CPU or GPU acceleration, respectively: + +.. code-block:: bash + + pip install sentence-transformers[onnx-gpu] + # or + pip install sentence-transformers[onnx] + +To convert a model to ONNX format, you can use the following code: + +.. code-block:: python + + from sentence_transformers import SentenceTransformer + + model = SentenceTransformer("all-MiniLM-L6-v2", backend="onnx") + + sentences = ["This is an example sentence", "Each sentence is converted"] + embeddings = model.encode(sentences) + +If the model path or repository already contains a model in ONNX format, Sentence Transformers will automatically use it. Otherwise, it will convert the model to ONNX the format. + +All keyword arguments passed via ``model_kwargs`` will be passed on to :meth:`ORTModel.from_pretrained `. Some notable arguments include: + +* ``provider``: ONNX Runtime provider to use for loading the model, e.g. ``"CPUExecutionProvider"`` . See https://onnxruntime.ai/docs/execution-providers/ for possible providers. If not specified, the strongest provider (E.g. ``"CUDAExecutionProvider"``) will be used. +* ``file_name``: The name of the ONNX file to load. If not specified, will default to ``"model.onnx"`` or otherwise ``"onnx/model.onnx"``. This argument is useful for specifying optimized or quantized models. +* ``export``: A boolean flag specifying whether the model will be exported. 
If not provided, ``export`` will be set to ``True`` if the model repository or directory does not already contain an ONNX model. + +.. tip:: + + It's heavily recommended to save the exported model to prevent having to re-export it every time you run your code. You can do this by calling :meth:`model.save_pretrained() ` if your model was local: + + .. code-block:: python + + model = SentenceTransformer("path/to/my/model", backend="onnx") + model.save_pretrained("path/to/my/model") + + or with :meth:`model.push_to_hub() ` if your model was from the Hugging Face Hub: + + .. code-block:: python + + model = SentenceTransformer("intfloat/multilingual-e5-small", backend="onnx") + model.push_to_hub("intfloat/multilingual-e5-small", create_pr=True) + +Optimizing ONNX Models +^^^^^^^^^^^^^^^^^^^^^^ + +ONNX models can be optimized using Optimum, allowing for speedups on CPUs and GPUs alike. To do this, you can use the :func:`~sentence_transformers.backend.export_optimized_onnx_model` function, which saves the optimized in a directory or model repository that you specify. It expects: + +- ``model``: a Sentence Transformer model loaded with the ONNX backend. +- ``optimization_config``: ``"O1"``, ``"O2"``, ``"O3"``, or ``"O4"`` representing optimization levels from :class:`~optimum.onnxruntime.AutoOptimizationConfig`, or an :class:`~optimum.onnxruntime.OptimizationConfig` instance. +- ``model_name_or_path``: a path to save the optimized model file, or the repository name if you want to push it to the Hugging Face Hub. +- ``push_to_hub``: (Optional) a boolean to push the optimized model to the Hugging Face Hub. +- ``create_pr``: (Optional) a boolean to create a pull request when pushing to the Hugging Face Hub. Useful when you don't have write access to the repository. +- ``file_suffix``: (Optional) a string to append to the model name when saving it. If not specified, the optimization level name string will be used, or just ``"optimized"`` if the optimization config was not just a string optimization level. + +See this example for exporting a model with :doc:`optimization level 3 ` (basic and extended general optimizations, transformers-specific fusions, fast Gelu approximation): + +.. tab:: Hugging Face Hub Model + + Only optimize once:: + + from sentence_transformers import SentenceTransformer, export_optimized_onnx_model + + model = SentenceTransformer("all-MiniLM-L6-v2", backend="onnx") + export_optimized_onnx_model(model, "O3", "all-MiniLM-L6-v2", push_to_hub=True, create_pr=True) + + Before the pull request gets merged:: + + from sentence_transformers import SentenceTransformer + + pull_request_nr = 2 # TODO: Update this to the number of your pull request + model = SentenceTransformer( + "all-MiniLM-L6-v2", + backend="onnx", + model_kwargs={"file_name": "onnx/model_O3.onnx"}, + revision=f"refs/pr/{pull_request_nr}" + ) + + Once the pull request gets merged:: + + from sentence_transformers import SentenceTransformer + + model = SentenceTransformer( + "all-MiniLM-L6-v2", + backend="onnx", + model_kwargs={"file_name": "onnx/model_O3.onnx"}, + ) + +.. 
tab:: Local Model + + Only optimize once:: + + from sentence_transformers import SentenceTransformer, export_optimized_onnx_model + + model = SentenceTransformer("path/to/my/mpnet-legal-finetuned", backend="onnx") + export_optimized_onnx_model(model, "O3", "path/to/my/mpnet-legal-finetuned") + + After optimizing:: + + from sentence_transformers import SentenceTransformer + + model = SentenceTransformer( + "path/to/my/mpnet-legal-finetuned", + backend="onnx", + model_kwargs={"file_name": "onnx/model_O3.onnx"}, + ) + +Quantizing ONNX Models +^^^^^^^^^^^^^^^^^^^^^^ + +ONNX models can be quantized to int8 precision using Optimum, allowing for faster inference on CPUs. To do this, you can use the :func:`~sentence_transformers.backend.export_dynamic_quantized_onnx_model` function, which saves the quantized in a directory or model repository that you specify. Dynamic quantization, unlike static quantization, does not require a calibration dataset. It expects: + +- ``model``: a Sentence Transformer model loaded with the ONNX backend. +- ``quantization_config``: ``"arm64"``, ``"avx2"``, ``"avx512"``, or ``"avx512_vnni"`` representing quantization configurations from :class:`~optimum.onnxruntime.AutoQuantizationConfig`, or an :class:`~optimum.onnxruntime.QuantizationConfig` instance. +- ``model_name_or_path``: a path to save the quantized model file, or the repository name if you want to push it to the Hugging Face Hub. +- ``push_to_hub``: (Optional) a boolean to push the quantized model to the Hugging Face Hub. +- ``create_pr``: (Optional) a boolean to create a pull request when pushing to the Hugging Face Hub. Useful when you don't have write access to the repository. +- ``file_suffix``: (Optional) a string to append to the model name when saving it. If not specified, ``"qint8_quantized"`` will be used. + +On my CPU, each of the default quantization configurations (``"arm64"``, ``"avx2"``, ``"avx512"``, ``"avx512_vnni"``) resulted in roughly equivalent speedups. + +See this example for quantizing a model to ``int8`` with :doc:`avx512_vnni `: + +.. tab:: Hugging Face Hub Model + + Only quantize once:: + + from sentence_transformers import SentenceTransformer, export_dynamic_quantized_onnx_model + + model = SentenceTransformer("all-MiniLM-L6-v2", backend="onnx") + export_dynamic_quantized_onnx_model(model, "avx512_vnni", "all-MiniLM-L6-v2", push_to_hub=True, create_pr=True) + + Before the pull request gets merged:: + + from sentence_transformers import SentenceTransformer + + pull_request_nr = 2 # TODO: Update this to the number of your pull request + model = SentenceTransformer( + "all-MiniLM-L6-v2", + backend="onnx", + model_kwargs={"file_name": "onnx/model_qint8_avx512_vnni.onnx"}, + revision=f"refs/pr/{pull_request_nr}" + ) + + Once the pull request gets merged:: + + from sentence_transformers import SentenceTransformer + + model = SentenceTransformer( + "all-MiniLM-L6-v2", + backend="onnx", + model_kwargs={"file_name": "onnx/model_qint8_avx512_vnni.onnx"}, + ) + +.. 
tab:: Local Model + + Only quantize once:: + + from sentence_transformers import SentenceTransformer, export_dynamic_quantized_onnx_model + + model = SentenceTransformer("path/to/my/mpnet-legal-finetuned", backend="onnx") + export_dynamic_quantized_onnx_model(model, "O3", "path/to/my/mpnet-legal-finetuned") + + After quantizing:: + + from sentence_transformers import SentenceTransformer + + model = SentenceTransformer( + "path/to/my/mpnet-legal-finetuned", + backend="onnx", + model_kwargs={"file_name": "onnx/model_qint8_avx512_vnni.onnx"}, + ) + +OpenVINO +-------- + +OpenVINO allows for accelerated inference on CPUs by exporting the model to the OpenVINO format. To use the OpenVINO backend, you must install Sentence Transformers with the ``openvino`` extra: + +.. code-block:: bash + + pip install sentence-transformers[openvino] + +To convert a model to OpenVINO format, you can use the following code: + +.. code-block:: python + + from sentence_transformers import SentenceTransformer + + model = SentenceTransformer("all-MiniLM-L6-v2", backend="openvino") + + sentences = ["This is an example sentence", "Each sentence is converted"] + embeddings = model.encode(sentences) + +.. raw:: html + + All keyword arguments passed via model_kwargs will be passed on to OVBaseModel.from_pretrained(). Some notable arguments include: + +* ``file_name``: The name of the ONNX file to load. If not specified, will default to ``"openvino_model.xml"`` or otherwise ``"openvino/openvino_model.xml"``. This argument is useful for specifying optimized or quantized models. +* ``export``: A boolean flag specifying whether the model will be exported. If not provided, ``export`` will be set to ``True`` if the model repository or directory does not already contain an OpenVINO model. + +.. tip:: + + It's heavily recommended to save the exported model to prevent having to re-export it every time you run your code. You can do this by calling :meth:`model.save_pretrained() ` if your model was local: + + .. code-block:: python + + model = SentenceTransformer("path/to/my/model", backend="openvino") + model.save_pretrained("path/to/my/model") + + or with :meth:`model.push_to_hub() ` if your model was from the Hugging Face Hub: + + .. code-block:: python + + model = SentenceTransformer("intfloat/multilingual-e5-small", backend="openvino") + model.push_to_hub("intfloat/multilingual-e5-small", create_pr=True) + +Benchmarks +---------- + +The following images show the benchmark results for the different backends on GPUs and CPUs. The results are averaged across 4 models of various sizes, 3 datasets, and numerous batch sizes. + +.. raw:: html + +
    <details>
    <summary>Expand the benchmark details</summary>

    <p><b>Speedup ratio</b> / <b>Performance ratio</b>: The same models and hardware were used. We compare the performance against the performance of PyTorch with fp32, i.e. the default backend and precision.</p>
    <ul>
        <li><b>Evaluation</b>:
            <ul>
                <li>Semantic Textual Similarity: Spearman rank correlation based on cosine similarity on the sentence-transformers/stsb test set, computed via the EmbeddingSimilarityEvaluator.</li>
                <li>Information Retrieval: NDCG@10 based on cosine similarity on the entire NanoBEIR collection of datasets, computed via the InformationRetrievalEvaluator.</li>
            </ul>
        </li>
        <li><b>Backends</b>:
            <ul>
                <li><code>torch-fp32</code>: PyTorch with float32 precision (default).</li>
                <li><code>torch-fp16</code>: PyTorch with float16 precision, via <code>model_kwargs={"torch_dtype": "float16"}</code>.</li>
                <li><code>torch-bf16</code>: PyTorch with bfloat16 precision, via <code>model_kwargs={"torch_dtype": "bfloat16"}</code>.</li>
                <li><code>onnx</code>: ONNX with float32 precision, via <code>backend="onnx"</code>.</li>
                <li><code>onnx-O1</code>: ONNX with float32 precision and O1 optimization, via <code>export_optimized_onnx_model(..., "O1", ...)</code> and <code>backend="onnx"</code>.</li>
                <li><code>onnx-O2</code>: ONNX with float32 precision and O2 optimization, via <code>export_optimized_onnx_model(..., "O2", ...)</code> and <code>backend="onnx"</code>.</li>
                <li><code>onnx-O3</code>: ONNX with float32 precision and O3 optimization, via <code>export_optimized_onnx_model(..., "O3", ...)</code> and <code>backend="onnx"</code>.</li>
                <li><code>onnx-O4</code>: ONNX with float16 precision and O4 optimization, via <code>export_optimized_onnx_model(..., "O4", ...)</code> and <code>backend="onnx"</code>.</li>
                <li><code>onnx-qint8</code>: ONNX quantized to int8 with "avx512_vnni", via <code>export_dynamic_quantized_onnx_model(..., "avx512_vnni", ...)</code> and <code>backend="onnx"</code>. The different quantization configurations resulted in roughly equivalent speedups.</li>
                <li><code>openvino</code>: OpenVINO, via <code>backend="openvino"</code>.</li>
                <li><code>openvino-igpu</code>: OpenVINO, via <code>backend="openvino"</code> and <code>model_kwargs={"device": "GPU"}</code> to use the iGPU from my CPU.</li>
            </ul>
        </li>
    </ul>

    <p>Note that the aggressive averaging across models, datasets, and batch sizes prevents some more intricate patterns from being visible. For example, for GPUs, if we only consider the stsb dataset with the shortest texts, ONNX becomes better: 1.46x for ONNX, and ONNX-O4 reaches 1.83x, whereas fp16 and bf16 reach 1.54x and 1.53x respectively. So, for shorter texts we recommend ONNX on GPU.</p>

    <p>For CPU, ONNX is also stronger for the stsb dataset with the shortest texts: 1.39x for ONNX, outperforming 1.29x for OpenVINO. ONNX with int8 quantization is even stronger with a 3.08x speedup. For longer texts, ONNX and OpenVINO can even perform slightly worse than PyTorch, so we recommend testing the different backends with your specific model and data to find the best one for your use case.</p>

    </details>
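If you want to check how a given backend affects accuracy on your own setup, a minimal sketch of the Semantic Textual Similarity evaluation described above might look like the following. This assumes the ``datasets`` library is installed and uses the ``sentence-transformers/stsb`` test split with the ``all-MiniLM-L6-v2`` model as placeholders; substitute your own model, backend, and data:

.. code-block:: python

    from datasets import load_dataset

    from sentence_transformers import SentenceTransformer
    from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

    # Load the model with the backend you want to test, e.g. "torch", "onnx", or "openvino"
    model = SentenceTransformer("all-MiniLM-L6-v2", backend="onnx")

    # The STS benchmark test split contains sentence pairs with gold similarity scores
    stsb = load_dataset("sentence-transformers/stsb", split="test")
    evaluator = EmbeddingSimilarityEvaluator(
        sentences1=stsb["sentence1"],
        sentences2=stsb["sentence2"],
        scores=stsb["score"],
        name="sts-test",
    )

    # Prints the evaluation metrics, including the Spearman correlation based on cosine similarity
    print(evaluator(model))

Comparing these scores (together with your own timing measurements) across backends gives a quick sense of the speed-accuracy trade-off on your data before committing to one of the exported models.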
+ + +.. image:: ../../img/backends_benchmark_gpu.png + :alt: Benchmark for GPUs + :width: 45% + +.. image:: ../../img/backends_benchmark_cpu.png + :alt: Benchmark for CPUs + :width: 45% + +Recommendations +^^^^^^^^^^^^^^^ + +Based on the benchmarks, this flowchart should help you decide which backend to use for your model: + +.. mermaid:: + + %%{init: { + "theme": "neutral", + "flowchart": { + "curve": "bumpY" + } + }}%% + graph TD + A(What is your hardware?) -->|GPU| B(Is your text usually smaller than 500 characters?) + A -->|CPU| C(Is a 0.4% accuracy loss acceptable?) + B -->|yes| D[onnx-O4] + B -->|no| F[float16] + C -->|yes| G[onnx-int8] + C -->|no| H(Do you have an Intel CPU?) + H -->|yes| I[openvino] + H -->|no| J[onnx] + click D "#optimizing-onnx-models" + click F "#pytorch" + click G "#quantizing-onnx-models" + click I "#openvino" + click J "#onnx" + +.. note:: + + Your milage may vary, and you should always test the different backends with your specific model and data to find the best one for your use case. \ No newline at end of file diff --git a/docs/sentence_transformer/usage/usage.rst b/docs/sentence_transformer/usage/usage.rst index c7cddc0e6..a0c0f7e17 100644 --- a/docs/sentence_transformer/usage/usage.rst +++ b/docs/sentence_transformer/usage/usage.rst @@ -56,5 +56,6 @@ Once you have `installed <../../installation.html>`_ Sentence Transformers, you ../../../examples/applications/parallel-sentence-mining/README ../../../examples/applications/image-search/README ../../../examples/applications/embedding-quantization/README - custom_models.md + efficiency + custom_models diff --git a/index.rst b/index.rst index 980701531..3c5b1cc1f 100644 --- a/index.rst +++ b/index.rst @@ -1,6 +1,6 @@ -.. note:: +.. tip:: - Sentence Transformers v3.0 just released, introducing a new training API for Sentence Transformer models. Read `SentenceTransformer > Training Overview `_ to learn more about the training API, and check out `v3.0 Release Notes `_ for details on the other changes. + Sentence Transformers v3.2 just released, introducing the ONNX and OpenVINO backends for Sentence Transformer models. Read `SentenceTransformer > Usage > Speeding up Inference `_ to learn more about the new backends and what they can mean for your inference speed. SentenceTransformers Documentation ================================== @@ -63,6 +63,7 @@ Consider reading one of the following sections to answer the related questions: * How to **use** Sentence Transformer models? `Sentence Transformers > Usage `_ * What Sentence Transformer **models** can I use? `Sentence Transformers > Pretrained Models `_ +* How do I make Sentence Transformer models **faster**? `Sentence Transformers > Usage > Speeding up Inference `_ * How do I **train/finetune** a Sentence Transformer model? `Sentence Transformers > Training Overview `_ * How to **use** Cross Encoder models? `Cross Encoder > Usage `_ * What Cross Encoder **models** can I use? 
`Cross Encoder > Pretrained Models `_ diff --git a/pyproject.toml b/pyproject.toml index 092f334c2..8771d81a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,12 +33,12 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", ] dependencies = [ - "transformers>=4.38.0,<5.0.0", + "transformers>=4.41.0,<5.0.0", "tqdm", "torch>=1.11.0", "scikit-learn", "scipy", - "huggingface-hub>=0.19.3", + "huggingface-hub>=0.20.0", "Pillow", ] @@ -49,6 +49,9 @@ Repository = "https://github.com/UKPLab/sentence-transformers/" [project.optional-dependencies] train = ["datasets", "accelerate>=0.20.3"] +onnx = ["optimum[onnxruntime]>=1.23.0"] +onnx-gpu = ["optimum[onnxruntime-gpu]>=1.23.0"] +openvino = ["optimum-intel[openvino]>=1.20.0"] dev = ["datasets", "accelerate>=0.20.3", "pre-commit", "pytest", "pytest-cov"] [build-system] diff --git a/sentence_transformers/SentenceTransformer.py b/sentence_transformers/SentenceTransformer.py index 26fb32126..1a8cb2efb 100644 --- a/sentence_transformers/SentenceTransformer.py +++ b/sentence_transformers/SentenceTransformer.py @@ -104,6 +104,13 @@ class SentenceTransformer(nn.Sequential, FitMixin): or `"flash_attention_2"` (using `Dao-AILab/flash-attention `_). By default, if available, SDPA will be used for torch>=2.1.1. The default is otherwise the manual `"eager"` implementation. + - ``provider``: If backend is "onnx", this is the provider to use for inference, for example "CPUExecutionProvider", + "CUDAExecutionProvider", etc. See https://onnxruntime.ai/docs/execution-providers/ for all ONNX execution providers. + - ``file_name``: If backend is "onnx" or "openvino", this is the file name to load, useful for loading optimized + or quantized ONNX or OpenVINO models. + - ``export``: If backend is "onnx" or "openvino", then this is a boolean flag specifying whether this model should + be exported to the backend. If not specified, the model will be exported only if the model repository or directory + does not already contain an exported model. See the `PreTrainedModel.from_pretrained `_ @@ -119,6 +126,9 @@ class SentenceTransformer(nn.Sequential, FitMixin): model_card_data (:class:`~sentence_transformers.model_card.SentenceTransformerModelCardData`, optional): A model card data object that contains information about the model. This is used to generate a model card when saving the model. If not set, a default model card data object is created. + backend (str): The backend to use for inference. Can be one of "torch" (default), "onnx", or "openvino". + See https://sbert.net/docs/sentence_transformer/usage/efficiency.html for benchmarking information + on the different backends. 
Example: :: @@ -165,6 +175,7 @@ def __init__( tokenizer_kwargs: dict[str, Any] | None = None, config_kwargs: dict[str, Any] | None = None, model_card_data: SentenceTransformerModelCardData | None = None, + backend: Literal["torch", "onnx", "openvino"] = "torch", ) -> None: # Note: self._load_sbert_model can also update `self.prompts` and `self.default_prompt_name` self.prompts = prompts or {} @@ -177,6 +188,7 @@ def __init__( self._model_card_vars = {} self._model_card_text = None self._model_config = {} + self.backend = backend if use_auth_token is not None: warnings.warn( "The `use_auth_token` argument is deprecated and will be removed in v4 of SentenceTransformers.", @@ -368,6 +380,14 @@ def __init__( # Pass the model to the model card data for later use in generating a model card upon saving this model self.model_card_data.register_model(self) + def get_backend(self) -> Literal["torch", "onnx", "openvino"]: + """Return the backend used for inference, which can be one of "torch", "onnx", or "openvino". + + Returns: + str: The backend used for inference. + """ + return self.backend + @overload def encode( self, @@ -1313,12 +1333,13 @@ def push_to_hub( token: str | None = None, private: bool | None = None, safe_serialization: bool = True, - commit_message: str = "Add new SentenceTransformer model.", + commit_message: str | None = None, local_model_path: str | None = None, exist_ok: bool = False, replace_model_card: bool = False, train_datasets: list[str] | None = None, revision: str | None = None, + create_pr: bool = False, ) -> str: """ Uploads all elements of this Sentence Transformer to a new HuggingFace Hub repository. @@ -1334,6 +1355,7 @@ def push_to_hub( replace_model_card (bool, optional): If true, replace an existing model card in the hub with the automatically created model card train_datasets (List[str], optional): Datasets used to train the model. If set, the datasets will be added to the model card in the Hub. revision (str, optional): Branch to push the uploaded files to + create_pr (bool, optional): If True, create a pull request instead of pushing directly to the main branch Returns: str: The url of the commit of your model in the repository on the Hugging Face Hub. @@ -1343,20 +1365,67 @@ def push_to_hub( repo_id=repo_id, private=private, repo_type=None, - exist_ok=exist_ok, + exist_ok=exist_ok or create_pr, ) repo_id = repo_url.repo_id # Update the repo_id in case the old repo_id didn't contain a user or organization self.model_card_data.set_model_id(repo_id) if revision is not None: api.create_branch(repo_id=repo_id, branch=revision, exist_ok=True) + + if commit_message is None: + backend = self.get_backend() + if backend == "torch": + commit_message = "Add new SentenceTransformer model" + else: + commit_message = f"Add new SentenceTransformer model with an {backend} backend" + + commit_description = "" + if create_pr: + commit_description = f"""\ +Hello! 
+ +*This pull request has been automatically generated from the [`push_to_hub`](https://sbert.net/docs/package_reference/sentence_transformer/SentenceTransformer.html#sentence_transformers.SentenceTransformer.push_to_hub) method from the Sentence Transformers library.* + +## Full Model Architecture: +``` +{self} +``` + +## Tip: +Consider testing this pull request before merging by loading the model from this PR with the `revision` argument: +```python +from sentence_transformers import SentenceTransformer + +# TODO: Fill in the PR number +pr_number = 2 +model = SentenceTransformer( + "{repo_id}", + revision=f"refs/pr/{{pr_number}}", + backend="{self.get_backend()}", +) + +# Verify that everything works as expected +embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."]) +print(embeddings.shape) + +similarities = model.similarity(embeddings, embeddings) +print(similarities) +``` +""" + if local_model_path: folder_url = api.upload_folder( - repo_id=repo_id, folder_path=local_model_path, commit_message=commit_message, revision=revision + repo_id=repo_id, + folder_path=local_model_path, + commit_message=commit_message, + commit_description=commit_description, + revision=revision, + create_pr=create_pr, ) else: with tempfile.TemporaryDirectory() as tmp_dir: create_model_card = replace_model_card or not os.path.exists(os.path.join(tmp_dir, "README.md")) - self.save( + self.save_pretrained( tmp_dir, model_name=repo_url.repo_id, create_model_card=create_model_card, @@ -1364,18 +1433,17 @@ def push_to_hub( safe_serialization=safe_serialization, ) folder_url = api.upload_folder( - repo_id=repo_id, folder_path=tmp_dir, commit_message=commit_message, revision=revision + repo_id=repo_id, + folder_path=tmp_dir, + commit_message=commit_message, + commit_description=commit_description, + revision=revision, + create_pr=create_pr, ) - refs = api.list_repo_refs(repo_id=repo_id) - for branch in refs.branches: - if revision is None and branch.name == "main": - return f"https://huggingface.co/{repo_id}/commit/{branch.target_commit}" - elif branch.name == revision: - return f"https://huggingface.co/{repo_id}/commit/{branch.target_commit}" - - # This isn't expected to ever be reached. 
- return folder_url + if create_pr: + return folder_url.pr_url + return folder_url.commit_url def _text_length(self, text: list[int] | list[list[int]]) -> int: """ @@ -1457,13 +1525,19 @@ def _load_auto_model( model_args=model_kwargs, tokenizer_args=tokenizer_kwargs, config_args=config_kwargs, + backend=self.backend, ) pooling_model = Pooling(transformer_model.get_word_embedding_dimension(), "mean") self.model_card_data.set_base_model(model_name_or_path, revision=revision) return [transformer_model, pooling_model] def _load_module_class_from_ref( - self, class_ref: str, model_name_or_path: str, trust_remote_code: bool, model_kwargs: dict[str, Any] | None + self, + class_ref: str, + model_name_or_path: str, + trust_remote_code: bool, + revision: str | None, + model_kwargs: dict[str, Any] | None, ) -> nn.Module: # If the class is from sentence_transformers, we can directly import it, # otherwise, we try to import it dynamically, and if that fails, we fall back to the default import @@ -1476,6 +1550,7 @@ def _load_module_class_from_ref( return get_class_from_dynamic_module( class_ref, model_name_or_path, + revision=revision, code_revision=code_revision, ) except OSError: @@ -1578,7 +1653,7 @@ def _load_sbert_model( for module_config in modules_config: class_ref = module_config["type"] module_class = self._load_module_class_from_ref( - class_ref, model_name_or_path, trust_remote_code, model_kwargs + class_ref, model_name_or_path, trust_remote_code, revision, model_kwargs ) # For Transformer, don't load the full directory, rely on `transformers` instead @@ -1643,10 +1718,10 @@ def _load_sbert_model( # Try to initialize the module with a lot of kwargs, but only if the module supports them # Otherwise we fall back to the load method - try: - module = module_class(model_name_or_path, cache_dir=cache_folder, **kwargs) - except TypeError: - module = module_class.load(model_name_or_path) + # try: + module = module_class(model_name_or_path, cache_dir=cache_folder, backend=self.backend, **kwargs) + # except TypeError: + # module = module_class.load(model_name_or_path) else: # Normalize does not require any files to be loaded if module_class == Normalize: @@ -1684,6 +1759,9 @@ def device(self) -> device: Get torch.device from module, assuming that the whole module has one device. In case there are no PyTorch parameters, fall back to CPU. 
""" + if isinstance(self[0], Transformer): + return self[0].auto_model.device + try: return next(self.parameters()).device except StopIteration: diff --git a/sentence_transformers/__init__.py b/sentence_transformers/__init__.py index 1d6c5f0f5..2c382bdb9 100644 --- a/sentence_transformers/__init__.py +++ b/sentence_transformers/__init__.py @@ -6,6 +6,7 @@ import importlib import os +from sentence_transformers.backend import export_dynamic_quantized_onnx_model, export_optimized_onnx_model from sentence_transformers.cross_encoder.CrossEncoder import CrossEncoder from sentence_transformers.datasets import ParallelSentencesDataset, SentencesDataset from sentence_transformers.LoggingHandler import LoggingHandler @@ -34,4 +35,6 @@ "SentenceTransformerTrainingArguments", "SentenceTransformerModelCardData", "quantize_embeddings", + "export_optimized_onnx_model", + "export_dynamic_quantized_onnx_model", ] diff --git a/sentence_transformers/backend.py b/sentence_transformers/backend.py new file mode 100644 index 000000000..eef76352e --- /dev/null +++ b/sentence_transformers/backend.py @@ -0,0 +1,261 @@ +from __future__ import annotations + +import logging +import os +import shutil +import tempfile +from pathlib import Path +from typing import TYPE_CHECKING, Callable, Literal + +import huggingface_hub + +logger = logging.getLogger(__name__) + +if TYPE_CHECKING: + from sentence_transformers.SentenceTransformer import SentenceTransformer + + try: + from optimum.onnxruntime.configuration import OptimizationConfig, QuantizationConfig + except ImportError: + pass + + +def export_optimized_onnx_model( + model: SentenceTransformer, + optimization_config: OptimizationConfig | Literal["O1", "O2", "O3", "O4"], + model_name_or_path: str, + push_to_hub: bool = False, + create_pr: bool = False, + file_suffix: str | None = None, +) -> None: + """ + Export an optimized ONNX model from a SentenceTransformer model. + + The O1-O4 optimization levels are defined by Optimum and are documented here: + https://huggingface.co/docs/optimum/main/en/onnxruntime/usage_guides/optimization + + The optimization levels are: + + - O1: basic general optimizations. + - O2: basic and extended general optimizations, transformers-specific fusions. + - O3: same as O2 with GELU approximation. + - O4: same as O3 with mixed precision (fp16, GPU-only) + + See https://sbert.net/docs/sentence_transformer/usage/efficiency.html for more information & benchmarks. + + Args: + model (SentenceTransformer): The SentenceTransformer model to be optimized. Must be loaded with `backend="onnx"`. + optimization_config (OptimizationConfig | Literal["O1", "O2", "O3", "O4"]): The optimization configuration or level. + model_name_or_path (str): The path or Hugging Face Hub repository name where the optimized model will be saved. + push_to_hub (bool, optional): Whether to push the optimized model to the Hugging Face Hub. Defaults to False. + create_pr (bool, optional): Whether to create a pull request when pushing to the Hugging Face Hub. Defaults to False. + file_suffix (str | None, optional): The suffix to add to the optimized model file name. Defaults to None. + + Raises: + ImportError: If the required packages `optimum` and `onnxruntime` are not installed. + ValueError: If the provided model is not a valid SentenceTransformer model loaded with `backend="onnx"`. + ValueError: If the provided optimization_config is not valid. 
+ + Returns: + None + """ + from sentence_transformers import SentenceTransformer + from sentence_transformers.models.Transformer import Transformer + + try: + from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTOptimizer + from optimum.onnxruntime.configuration import AutoOptimizationConfig + except ImportError: + raise ImportError( + "Please install Optimum and ONNX Runtime to use this function. " + "You can install them with pip: `pip install optimum[onnxruntime]` " + "or `pip install optimum[onnxruntime-gpu]`" + ) + + if ( + not isinstance(model, SentenceTransformer) + or not len(model) + or not isinstance(model[0], Transformer) + or not isinstance(model[0].auto_model, ORTModelForFeatureExtraction) + ): + raise ValueError('The model must be a SentenceTransformer model loaded with `backend="onnx"`.') + + ort_model: ORTModelForFeatureExtraction = model[0].auto_model + optimizer = ORTOptimizer.from_pretrained(ort_model) + + if isinstance(optimization_config, str): + if optimization_config not in AutoOptimizationConfig._LEVELS: + raise ValueError( + "optimization_config must be an OptimizationConfig instance or one of 'O1', 'O2', 'O3', 'O4'." + ) + + file_suffix = file_suffix or optimization_config + optimization_config = getattr(AutoOptimizationConfig, optimization_config)() + + if file_suffix is None: + file_suffix = "optimized" + + save_or_push_to_hub_onnx_model( + export_function=lambda save_dir: optimizer.optimize(optimization_config, save_dir, file_suffix=file_suffix), + export_function_name="export_optimized_onnx_model", + config=optimization_config, + model_name_or_path=model_name_or_path, + push_to_hub=push_to_hub, + create_pr=create_pr, + file_suffix=file_suffix, + ) + + +def export_dynamic_quantized_onnx_model( + model: SentenceTransformer, + quantization_config: QuantizationConfig | Literal["arm64", "avx2", "avx512", "avx512_vnni"], + model_name_or_path: str, + push_to_hub: bool = False, + create_pr: bool = False, + file_suffix: str | None = None, +) -> None: + """ + Export a quantized ONNX model from a SentenceTransformer model. + + This function applies dynamic quantization, i.e. without a calibration dataset. + Each of the default quantization configurations quantize the model to int8, allowing + for faster inference on CPUs, but are likely slower on GPUs. + + See https://sbert.net/docs/sentence_transformer/usage/efficiency.html for more information & benchmarks. + + Args: + model (SentenceTransformer): The SentenceTransformer model to be quantized. Must be loaded with `backend="onnx"`. + quantization_config (QuantizationConfig): The quantization configuration. + model_name_or_path (str): The path or Hugging Face Hub repository name where the quantized model will be saved. + push_to_hub (bool, optional): Whether to push the quantized model to the Hugging Face Hub. Defaults to False. + create_pr (bool, optional): Whether to create a pull request when pushing to the Hugging Face Hub. Defaults to False. + file_suffix (str | None, optional): The suffix to add to the quantized model file name. Defaults to None. + + Raises: + ImportError: If the required packages `optimum` and `onnxruntime` are not installed. + ValueError: If the provided model is not a valid SentenceTransformer model loaded with `backend="onnx"`. + ValueError: If the provided quantization_config is not valid. 
+ + Returns: + None + """ + from sentence_transformers import SentenceTransformer + from sentence_transformers.models.Transformer import Transformer + + try: + from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTQuantizer + from optimum.onnxruntime.configuration import AutoQuantizationConfig + except ImportError: + raise ImportError( + "Please install Optimum and ONNX Runtime to use this function. " + "You can install them with pip: `pip install optimum[onnxruntime]` " + "or `pip install optimum[onnxruntime-gpu]`" + ) + + if ( + not isinstance(model, SentenceTransformer) + or not len(model) + or not isinstance(model[0], Transformer) + or not isinstance(model[0].auto_model, ORTModelForFeatureExtraction) + ): + raise ValueError('The model must be a SentenceTransformer model loaded with `backend="onnx"`.') + + ort_model: ORTModelForFeatureExtraction = model[0].auto_model + quantizer = ORTQuantizer.from_pretrained(ort_model) + + if isinstance(quantization_config, str): + if quantization_config not in ["arm64", "avx2", "avx512", "avx512_vnni"]: + raise ValueError( + "quantization_config must be an QuantizationConfig instance or one of 'arm64', 'avx2', 'avx512', or 'avx512_vnni'." + ) + + quantization_config_name = quantization_config[:] + quantization_config = getattr(AutoQuantizationConfig, quantization_config)(is_static=False) + file_suffix = file_suffix or f"{quantization_config.weights_dtype.name.lower()}_{quantization_config_name}" + + if file_suffix is None: + file_suffix = f"{quantization_config.weights_dtype.name.lower()}_quantized" + + save_or_push_to_hub_onnx_model( + export_function=lambda save_dir: quantizer.quantize(quantization_config, save_dir, file_suffix=file_suffix), + export_function_name="export_dynamic_quantized_onnx_model", + config=quantization_config, + model_name_or_path=model_name_or_path, + push_to_hub=push_to_hub, + create_pr=create_pr, + file_suffix=file_suffix, + ) + + +def save_or_push_to_hub_onnx_model( + export_function: Callable, + export_function_name: str, + config, + model_name_or_path: str, + push_to_hub: bool = False, + create_pr: bool = False, + file_suffix: str | None = None, +): + if push_to_hub: + with tempfile.TemporaryDirectory() as save_dir: + export_function(save_dir) + file_name = f"model_{file_suffix}.onnx" + source = (Path(save_dir) / file_name).as_posix() + destination = (Path("onnx") / file_name).as_posix() + + commit_description = "" + if create_pr: + opt_config_string = repr(config).replace("(", "(\n\t").replace(", ", ",\n\t").replace(")", "\n)") + commit_description = f"""\ +Hello! 
+ +*This pull request has been automatically generated from the [`{export_function_name}`](https://sbert.net/docs/package_reference/util.html#sentence_transformers.backend.{export_function_name}) function from the Sentence Transformers library.* + +## Config +```python +{opt_config_string} +``` + +## Tip: +Consider testing this pull request before merging by loading the model from this PR with the `revision` argument: +```python +from sentence_transformers import SentenceTransformer + +# TODO: Fill in the PR number +pr_number = 2 +model = SentenceTransformer( + "{model_name_or_path}", + revision=f"refs/pr/{{pr_number}}", + backend="onnx", + model_kwargs={{"file_name": "{destination}"}}, +) + +# Verify that everything works as expected +embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."]) +print(embeddings.shape) + +similarities = model.similarity(embeddings, embeddings) +print(similarities) +``` +""" + + huggingface_hub.upload_file( + path_or_fileobj=source, + path_in_repo=destination, + repo_id=model_name_or_path, + repo_type="model", + commit_message=f"Add exported ONNX model {file_name!r}", + commit_description=commit_description, + create_pr=create_pr, + ) + + else: + with tempfile.TemporaryDirectory() as save_dir: + export_function(save_dir) + + file_name = f"model_{file_suffix}.onnx" + source = os.path.join(save_dir, file_name) + destination = os.path.join(model_name_or_path, "onnx", file_name) + # Create destination if it does not exist + os.makedirs(os.path.dirname(destination), exist_ok=True) + shutil.copy(source, destination) diff --git a/sentence_transformers/models/Transformer.py b/sentence_transformers/models/Transformer.py index 2d3786e15..7592278bf 100644 --- a/sentence_transformers/models/Transformer.py +++ b/sentence_transformers/models/Transformer.py @@ -1,13 +1,27 @@ from __future__ import annotations import json +import logging import os -from typing import Any +from fnmatch import fnmatch +from pathlib import Path +from typing import Any, Callable +import huggingface_hub import torch from torch import nn from transformers import AutoConfig, AutoModel, AutoTokenizer, MT5Config, T5Config +logger = logging.getLogger(__name__) + + +def _save_pretrained_wrapper(_save_pretrained_fn: Callable, subfolder: str) -> Callable[..., None]: + def wrapper(save_directory: str | Path, **kwargs) -> None: + os.makedirs(Path(save_directory) / subfolder, exist_ok=True) + return _save_pretrained_fn(Path(save_directory) / subfolder, **kwargs) + + return wrapper + class Transformer(nn.Module): """Hugging Face AutoModel to generate token embeddings. @@ -29,6 +43,8 @@ class Transformer(nn.Module): model is cased or not) tokenizer_name_or_path: Name or path of the tokenizer. When None, then model_name_or_path is used + backend: Backend used for model inference. Can be `torch`, `onnx`, + or `openvino`. Default is `torch`. 
""" save_in_root: bool = True @@ -43,10 +59,12 @@ def __init__( cache_dir: str | None = None, do_lower_case: bool = False, tokenizer_name_or_path: str = None, + backend: str = "torch", ) -> None: super().__init__() self.config_keys = ["max_seq_length", "do_lower_case"] self.do_lower_case = do_lower_case + self.backend = backend if model_args is None: model_args = {} if tokenizer_args is None: @@ -55,7 +73,7 @@ def __init__( config_args = {} config = AutoConfig.from_pretrained(model_name_or_path, **config_args, cache_dir=cache_dir) - self._load_model(model_name_or_path, config, cache_dir, **model_args) + self._load_model(model_name_or_path, config, cache_dir, backend, **model_args) if max_seq_length is not None and "model_max_length" not in tokenizer_args: tokenizer_args["model_max_length"] = max_seq_length @@ -79,16 +97,228 @@ def __init__( if tokenizer_name_or_path is not None: self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__ - def _load_model(self, model_name_or_path, config, cache_dir, **model_args) -> None: + def _load_model(self, model_name_or_path, config, cache_dir, backend, **model_args) -> None: """Loads the transformer model""" - if isinstance(config, T5Config): - self._load_t5_model(model_name_or_path, config, cache_dir, **model_args) - elif isinstance(config, MT5Config): - self._load_mt5_model(model_name_or_path, config, cache_dir, **model_args) + if backend == "torch": + if isinstance(config, T5Config): + self._load_t5_model(model_name_or_path, config, cache_dir, **model_args) + elif isinstance(config, MT5Config): + self._load_mt5_model(model_name_or_path, config, cache_dir, **model_args) + else: + self.auto_model = AutoModel.from_pretrained( + model_name_or_path, config=config, cache_dir=cache_dir, **model_args + ) + elif backend == "onnx": + self._load_onnx_model(model_name_or_path, config, cache_dir, **model_args) + elif backend == "openvino": + self._load_openvino_model(model_name_or_path, config, cache_dir, **model_args) + else: + raise ValueError(f"Unsupported backend '{backend}'. `backend` should be `torch`, `onnx`, or `openvino`.") + + def _load_openvino_model(self, model_name_or_path, config, cache_dir, **model_args) -> None: + if isinstance(config, T5Config) or isinstance(config, MT5Config): + raise ValueError("T5 models are not yet supported by the OpenVINO backend.") + + try: + from optimum.intel import OVModelForFeatureExtraction + from optimum.intel.openvino import OV_XML_FILE_NAME + except ModuleNotFoundError: + raise Exception( + "Using the OpenVINO backend requires installing Optimum and OpenVINO. " + "You can install them with pip: `pip install optimum[openvino]`." 
+ ) + + load_path = Path(model_name_or_path) + is_local = load_path.exists() + backend_name = "OpenVINO" + target_file_glob = "openvino*.xml" + + # Determine whether the model should be exported or whether we can load it directly + export, model_args = self._backend_should_export( + load_path, is_local, model_args, OV_XML_FILE_NAME, target_file_glob, backend_name + ) + + # If we're exporting, then there's no need for a file_name to load the model from + if export: + model_args.pop("file_name", None) + + # ov_config can be either a dictionary, or point to a json file with an OpenVINO config + if "ov_config" in model_args: + ov_config = model_args["ov_config"] + if not isinstance(ov_config, dict): + if not Path(ov_config).exists(): + raise ValueError( + "ov_config should be a dictionary or a path to a .json file containing an OpenVINO config" + ) + with open(ov_config, encoding="utf-8") as f: + model_args["ov_config"] = json.load(f) + else: + model_args["ov_config"] = {} + + # Either load an exported model, or export the model to ONNX + self.auto_model: OVModelForFeatureExtraction = OVModelForFeatureExtraction.from_pretrained( + model_name_or_path, + config=config, + cache_dir=cache_dir, + export=export, + **model_args, + ) + # Wrap the save_pretrained method to save the model in the correct subfolder + self.auto_model._save_pretrained = _save_pretrained_wrapper(self.auto_model._save_pretrained, self.backend) + + # Warn the user to save the model if they haven't already + if export: + self._backend_warn_to_save(model_name_or_path, is_local, backend_name) + + def _load_onnx_model(self, model_name_or_path, config, cache_dir, **model_args) -> None: + try: + import onnxruntime as ort + from optimum.onnxruntime import ONNX_WEIGHTS_NAME, ORTModelForFeatureExtraction + except ModuleNotFoundError: + raise Exception( + "Using the ONNX backend requires installing Optimum and ONNX Runtime. " + "You can install them with pip: `pip install optimum[onnxruntime]` " + "or `pip install optimum[onnxruntime-gpu]`" + ) + + # Default to the highest priority available provider if not specified + # E.g. 
Tensorrt > CUDA > CPU + model_args["provider"] = model_args.pop("provider", ort.get_available_providers()[0]) + + load_path = Path(model_name_or_path) + is_local = load_path.exists() + backend_name = "ONNX" + target_file_glob = "*.onnx" + + # Determine whether the model should be exported or whether we can load it directly + export, model_args = self._backend_should_export( + load_path, is_local, model_args, ONNX_WEIGHTS_NAME, target_file_glob, backend_name + ) + + # If we're exporting, then there's no need for a file_name to load the model from + if export: + model_args.pop("file_name", None) + + # Either load an exported model, or export the model to ONNX + self.auto_model: ORTModelForFeatureExtraction = ORTModelForFeatureExtraction.from_pretrained( + model_name_or_path, + config=config, + cache_dir=cache_dir, + export=export, + **model_args, + ) + # Wrap the save_pretrained method to save the model in the correct subfolder + self.auto_model._save_pretrained = _save_pretrained_wrapper(self.auto_model._save_pretrained, self.backend) + + # Warn the user to save the model if they haven't already + if export: + self._backend_warn_to_save(model_name_or_path, is_local, backend_name) + + def _backend_should_export( + self, + load_path: Path, + is_local: bool, + model_args: dict[str, Any], + target_file_name: str, + target_file_glob: str, + backend_name: str, + ) -> tuple[bool, dict[str, Any]]: + """ + Determines whether the model should be exported to the backend, or if it can be loaded directly. + Also update the `file_name` and `subfolder` model_args if necessary. + + These are the cases: + + 1. If export is set in model_args, just return export + 2. If `/` exists; set export to False + 3. If `/` exists; set export to False and set subfolder to the backend (e.g. "onnx") + 4. If `` contains a folder, add those folders to the subfolder and set the file_name to the last part + + We will warn if: + + 1. The expected file does not exist in the model directory given the optional file_name and subfolder. + If there are valid files for this backend, but they're don't align with file_name, then we give a useful warning. + 2. Multiple files are found in the model directory that match the target file name and the user did not + specify the desired file name via `model_kwargs={"file_name": ""}` + + Args: + load_path: The model repository or directory, as a Path instance + is_local: Whether the model is local or remote, i.e. whether load_path is a local directory + model_args: The model_args dictionary. Notable keys are "export", "file_name", and "subfolder" + target_file_name: The expected file name in the model directory, e.g. "model.onnx" or "openvino_model.xml" + target_file_glob: The glob pattern to match the target file name, e.g. "*.onnx" or "openvino*.xml" + backend_name: The human-readable name of the backend for use in warnings, e.g. "ONNX" or "OpenVINO" + + Returns: + Tuple[bool, dict[str, Any]]: A tuple of the export boolean and the updated model_args dictionary. 
+        """
+
+        export = model_args.pop("export", None)
+        if export is not None:
+            return export, model_args
+
+        file_name = model_args.get("file_name", target_file_name)
+        subfolder = model_args.get("subfolder", None)
+        primary_full_path = Path(subfolder, file_name).as_posix() if subfolder else file_name
+        secondary_full_path = (
+            Path(subfolder, self.backend, file_name).as_posix()
+            if subfolder
+            else Path(self.backend, file_name).as_posix()
+        )
+        glob_pattern = f"{subfolder}/**/{target_file_glob}" if subfolder else f"**/{target_file_glob}"
+
+        # Get the list of files in the model directory that match the target file name
+        if is_local:
+            model_file_names = [path.relative_to(load_path).as_posix() for path in load_path.glob(glob_pattern)]
         else:
-            self.auto_model = AutoModel.from_pretrained(
-                model_name_or_path, config=config, cache_dir=cache_dir, **model_args
+            all_files = huggingface_hub.list_repo_files(
+                load_path.as_posix(),
+                repo_type="model",
+                revision=model_args.get("revision", None),
+                token=model_args.get("token", None),
+            )
+            model_file_names = [fname for fname in all_files if fnmatch(fname, glob_pattern)]
+
+        # First check if the expected file exists in the root of the model directory
+        # If it doesn't, check if it exists in the backend subfolder.
+        # If it does, set the subfolder to include the backend
+        export = primary_full_path not in model_file_names
+        if export and "subfolder" not in model_args:
+            export = secondary_full_path not in model_file_names
+            if not export:
+                if len(model_file_names) > 1 and "file_name" not in model_args:
+                    logger.warning(
+                        f"Multiple {backend_name} files found in {load_path.as_posix()!r}: {model_file_names}, defaulting to {secondary_full_path!r}. "
+                        f'Please specify the desired file name via `model_kwargs={{"file_name": "<file_name>"}}`.'
+                    )
+                model_args["subfolder"] = self.backend
+                model_args["file_name"] = file_name
+
+        # If the file_name contains subfolders, set it as the subfolder instead
+        file_name_parts = Path(file_name).parts
+        if len(file_name_parts) > 1:
+            model_args["file_name"] = file_name_parts[-1]
+            model_args["subfolder"] = Path(model_args.get("subfolder", ""), *file_name_parts[:-1]).as_posix()
+
+        if export:
+            logger.warning(
+                f"No {file_name!r} found in {load_path.as_posix()!r}. Exporting the model to {backend_name}."
             )
+            if model_file_names:
+                logger.warning(
+                    f"If you intended to load one of the {model_file_names} {backend_name} files, "
+                    f'please specify the desired file name via `model_kwargs={{"file_name": "{model_file_names[0]}"}}`.'
+                )
+
+        return export, model_args
+
+    def _backend_warn_to_save(self, model_name_or_path: str, is_local: bool, backend_name: str) -> None:
+        to_log = f"Saving the exported {backend_name} model is heavily recommended to avoid having to export it again."
+        if is_local:
+            to_log += f" Do so with `model.save_pretrained({model_name_or_path!r})`."
+        else:
+            to_log += f" Do so with `model.push_to_hub({model_name_or_path!r}, create_pr=True)`."
+        logger.warning(to_log)
 
     def _load_t5_model(self, model_name_or_path, config, cache_dir, **model_args) -> None:
         """Loads the encoder model from T5"""
diff --git a/sentence_transformers/util.py b/sentence_transformers/util.py
index 13f67d86e..bb4238aae 100644
--- a/sentence_transformers/util.py
+++ b/sentence_transformers/util.py
@@ -1303,8 +1303,8 @@ def is_sentence_transformer_model(
         load_file_path(
             model_name_or_path,
             "modules.json",
-            token,
-            cache_folder,
+            token=token,
+            cache_folder=cache_folder,
             revision=revision,
             local_files_only=local_files_only,
         )
@@ -1314,8 +1314,8 @@
 def load_file_path(
     model_name_or_path: str,
     filename: str,
-    token: bool | str | None,
-    cache_folder: str | None,
+    token: bool | str | None = None,
+    cache_folder: str | None = None,
     revision: str | None = None,
     local_files_only: bool = False,
 ) -> str | None:
@@ -1356,8 +1356,8 @@
 def load_dir_path(
     model_name_or_path: str,
     directory: str,
-    token: bool | str | None,
-    cache_folder: str | None,
+    token: bool | str | None = None,
+    cache_folder: str | None = None,
     revision: str | None = None,
     local_files_only: bool = False,
 ) -> str | None:
diff --git a/tests/conftest.py b/tests/conftest.py
index 2ec25e60d..c505dd49b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -23,6 +23,16 @@ def stsb_bert_tiny_model_reused() -> SentenceTransformer:
     return SentenceTransformer("sentence-transformers-testing/stsb-bert-tiny-safetensors")
 
 
+@pytest.fixture()
+def stsb_bert_tiny_model_onnx() -> SentenceTransformer:
+    return SentenceTransformer("sentence-transformers-testing/stsb-bert-tiny-onnx")
+
+
+@pytest.fixture()
+def stsb_bert_tiny_model_openvino() -> SentenceTransformer:
+    return SentenceTransformer("sentence-transformers-testing/stsb-bert-tiny-openvino")
+
+
 @pytest.fixture()
 def paraphrase_distilroberta_base_v1_model() -> SentenceTransformer:
     return SentenceTransformer("paraphrase-distilroberta-base-v1")
diff --git a/tests/test_backends.py b/tests/test_backends.py
new file mode 100644
index 000000000..8b4a13966
--- /dev/null
+++ b/tests/test_backends.py
@@ -0,0 +1,158 @@
+from __future__ import annotations
+
+import gc
+import json
+import os
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+try:
+    from optimum.intel import OVModelForFeatureExtraction
+    from optimum.onnxruntime import ORTModelForFeatureExtraction
+except ImportError:
+    pytest.skip("OpenVINO and ONNX backends are not available", allow_module_level=True)
+
+from sentence_transformers import SentenceTransformer
+
+
+## Testing exporting:
+@pytest.mark.parametrize(
+    ["backend", "expected_auto_model_class"],
+    [
+        ("onnx", ORTModelForFeatureExtraction),
+        ("openvino", OVModelForFeatureExtraction),
+    ],
+)
+@pytest.mark.parametrize(
+    "model_kwargs", [{}, {"file_name": "wrong_file_name"}]
+)  # <- Using a file_name is fine when exporting
+def test_backend_export(backend, expected_auto_model_class, model_kwargs) -> None:
+    model = SentenceTransformer(
+        "sentence-transformers-testing/stsb-bert-tiny-safetensors", backend=backend, model_kwargs=model_kwargs
+    )
+    assert model.get_backend() == backend
+    assert isinstance(model[0].auto_model, expected_auto_model_class)
+    embedding = model.encode("Hello, World!")
+    assert embedding.shape == (model.get_sentence_embedding_dimension(),)
+
+
+def test_backend_no_export_crash():
+    # ONNX crashes when it can't export & the model repo/path doesn't contain an exported model
+    with pytest.raises(OSError):
+        SentenceTransformer(
+            "sentence-transformers-testing/stsb-bert-tiny-safetensors", backend="onnx", model_kwargs={"export": False}
+        )
+
+    # OpenVINO will forcibly override the export=False if the model repo/path doesn't contain an exported model
+    # But only starting from v1.19.0
+    model = SentenceTransformer(
+        "sentence-transformers-testing/stsb-bert-tiny-safetensors", backend="openvino", model_kwargs={"export": False}
+    )
+    assert isinstance(model[0].auto_model, OVModelForFeatureExtraction)
+
+
+## Testing loading exported models:
+@pytest.mark.parametrize(
+    ["backend", "model_id"],
+    [
+        ("onnx", "sentence-transformers-testing/stsb-bert-tiny-onnx"),
+        ("openvino", "sentence-transformers-testing/stsb-bert-tiny-openvino"),
+    ],
+)
+@pytest.mark.parametrize(
+    ["model_kwargs", "exception"],
+    [
+        [{}, False],
+        [{"file_name": "wrong_file_name", "export": True}, False],  # Using a file_name is fine when exporting
+        [{"file_name": "wrong_file_name", "export": False}, True],  # ... but fails when not exporting
+    ],
+)
+def test_backend_load(backend, model_id, model_kwargs, exception) -> None:
+    if exception:
+        with pytest.raises((OSError, RuntimeError)):
+            SentenceTransformer(model_id, backend=backend, model_kwargs=model_kwargs)
+    else:
+        model = SentenceTransformer(model_id, backend=backend, model_kwargs=model_kwargs)
+        assert model.get_backend() == backend
+        embedding = model.encode("Hello, World!")
+        assert embedding.shape == (model.get_sentence_embedding_dimension(),)
+
+
+def test_onnx_provider_crash() -> None:
+    with pytest.raises(ValueError):
+        SentenceTransformer(
+            "sentence-transformers-testing/stsb-bert-tiny-onnx",
+            backend="onnx",
+            model_kwargs={"provider": "incorrect_provider"},
+        )
+
+
+def test_openvino_provider() -> None:
+    model = SentenceTransformer(
+        "sentence-transformers-testing/stsb-bert-tiny-openvino",
+        backend="openvino",
+        model_kwargs={"ov_config": {"INFERENCE_PRECISION_HINT": "precision_1"}},
+    )
+    assert model[0].auto_model.ov_config == {"INFERENCE_PRECISION_HINT": "precision_1", "PERFORMANCE_HINT": "LATENCY"}
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        ov_config_path = os.path.join(temp_dir, "ov_config.json")
+        with open(ov_config_path, "w") as ov_config_file:
+            json.dump({"INFERENCE_PRECISION_HINT": "precision_2"}, ov_config_file)
+
+        model = SentenceTransformer(
+            "sentence-transformers-testing/stsb-bert-tiny-openvino",
+            backend="openvino",
+            model_kwargs={"ov_config": ov_config_path},
+        )
+        assert model[0].auto_model.ov_config == {
+            "INFERENCE_PRECISION_HINT": "precision_2",
+            "PERFORMANCE_HINT": "LATENCY",
+        }
+
+
+def test_incorrect_backend() -> None:
+    with pytest.raises(ValueError):
+        SentenceTransformer("sentence-transformers-testing/stsb-bert-tiny-safetensors", backend="incorrect_backend")
+
+
+def test_openvino_backend() -> None:
+    model_id = "sentence-transformers-testing/stsb-bert-tiny-safetensors"
+    # Test that OpenVINO output is close to PyTorch output
+    pytorch_model = SentenceTransformer(model_id)
+    openvino_model = SentenceTransformer(
+        model_id,
+        backend="openvino",
+        model_kwargs={"ov_config": {"INFERENCE_PRECISION_HINT": "f32"}},
+    )
+    pytorch_result = pytorch_model.encode(["Hello there!"])
+    openvino_result = openvino_model.encode(["Hello there!"])
+    assert np.allclose(openvino_result, pytorch_result, atol=0.000001), "OpenVINO and PyTorch outputs are not close"
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        # Test that loading with ov_config file works as expected
+        config_file = str(Path(tmpdirname) / "ov_config.json")
+        with open(Path(config_file), "w") as f:
+            f.write('{"NUM_STREAMS" : "2"}')
+        openvino_model_with_config = SentenceTransformer(
+            model_id,
+            backend="openvino",
+            model_kwargs={"ov_config": config_file},
+        )
+        # The transformers model is an Optimum model with an OpenVINO inference request property
+        assert openvino_model_with_config[0].auto_model.request.get_property("NUM_STREAMS") == 2
+
+        # Test that saving and loading local OpenVINO models works as expected
+        openvino_model_with_config.save_pretrained(tmpdirname)
+        local_openvino_model = SentenceTransformer(
+            tmpdirname, backend="openvino", model_kwargs={"ov_config": {"INFERENCE_PRECISION_HINT": "f32"}}
+        )
+        local_openvino_result = local_openvino_model.encode(["Hello there!"])
+        assert np.allclose(
+            local_openvino_result, openvino_result
+        ), "OpenVINO saved model output differs from in-memory converted model"
+        del local_openvino_model
+        gc.collect()
diff --git a/tests/test_multi_process.py b/tests/test_multi_process.py
index 5966e779b..5de94a7ab 100644
--- a/tests/test_multi_process.py
+++ b/tests/test_multi_process.py
@@ -10,6 +10,10 @@
 from sentence_transformers import SentenceTransformer
 
 
+@pytest.mark.skip(
+    "This test fails if optimum.intel.openvino is imported, because openvinotoolkit/nncf "
+    "patches torch._C._nn.gelu in a way that breaks pickling."
+)
 @pytest.mark.parametrize("normalize_embeddings", (False, True))
 @pytest.mark.parametrize("prompt_name", (None, "retrieval"))
 def test_encode_multi_process(
diff --git a/tests/test_sentence_transformer.py b/tests/test_sentence_transformer.py
index 3d96bead2..9d8e9c347 100644
--- a/tests/test_sentence_transformer.py
+++ b/tests/test_sentence_transformer.py
@@ -15,7 +15,7 @@
 import numpy as np
 import pytest
 import torch
-from huggingface_hub import GitRefInfo, GitRefs, HfApi, RepoUrl
+from huggingface_hub import CommitInfo, HfApi, RepoUrl
 from torch import nn
 
 from sentence_transformers import SentenceTransformer, util
@@ -108,30 +108,26 @@ def mock_create_repo(self, repo_id, **kwargs):
     def mock_upload_folder(self, **kwargs):
         nonlocal mock_upload_folder_kwargs
         mock_upload_folder_kwargs = kwargs
-
-    def mock_list_repo_refs(self, repo_id=None, **kwargs):
-        try:
-            git_ref_info = GitRefInfo(name="main", ref="refs/heads/main", target_commit="123456")
-            git_ref_info2 = GitRefInfo(name="revision_test", ref="refs/heads/revision_test", target_commit="678901")
-        except TypeError:
-            git_ref_info = GitRefInfo(dict(name="main", ref="refs/heads/main", targetCommit="123456"))
-            git_ref_info2 = GitRefInfo(
-                dict(name="revision_test", ref="refs/heads/revision_test", target_commit="678901")
+        if kwargs.get("revision") is None:
+            return CommitInfo(
+                commit_url=f"https://huggingface.co/{kwargs.get('repo_id')}/commit/123456",
+                commit_message="commit_message",
+                commit_description="commit_description",
+                oid="oid",
+            )
+        else:
+            return CommitInfo(
+                commit_url=f"https://huggingface.co/{kwargs.get('repo_id')}/commit/678901",
+                commit_message="commit_message",
+                commit_description="commit_description",
+                oid="oid",
             )
-        # workaround for https://github.com/huggingface/huggingface_hub/issues/1956
-        git_ref_kwargs = {"branches": [git_ref_info, git_ref_info2], "converts": [], "tags": [], "pull_requests": None}
-        try:
-            return GitRefs(**git_ref_kwargs)
-        except TypeError:
-            git_ref_kwargs.pop("pull_requests")
-            return GitRefs(**git_ref_kwargs)
 
     def mock_create_branch(self, repo_id, branch, revision=None, **kwargs):
         return None
 
     monkeypatch.setattr(HfApi, "create_repo", mock_create_repo)
     monkeypatch.setattr(HfApi, "upload_folder", mock_upload_folder)
-    monkeypatch.setattr(HfApi, "list_repo_refs", mock_list_repo_refs)
     monkeypatch.setattr(HfApi, "create_branch", mock_create_branch)
 
     model = SentenceTransformer("sentence-transformers-testing/stsb-bert-tiny-safetensors")