diff --git a/.github/labeler.yml b/.github/labeler.yml index f46dac561..f5bb7b448 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,6 +1,8 @@ # Integrations integration:instructor-embedders: -- components/instructor-embedders/**/* +- components/embedders/instructor-embedders/**/* +integration:unstructured-fileconverter: +- components/converters/unstructured_fileconverter/**/* integration:chroma: - document_stores/chroma/**/* integration:elasticsearch: diff --git a/.github/workflows/components_unstructured_fileconverter.yml b/.github/workflows/components_unstructured_fileconverter.yml new file mode 100644 index 000000000..f60573f79 --- /dev/null +++ b/.github/workflows/components_unstructured_fileconverter.yml @@ -0,0 +1,59 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / Components / unstructured-fileconverter + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "components/converters/unstructured_fileconverter/**" + - ".github/workflows/components_unstructured_fileconverter.yml" + +concurrency: + group: components_unstructured_fileconverter-${{ github.head_ref }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.8", "3.9", "3.10", "3.11"] + services: + unstructured-api: + image: "quay.io/unstructured-io/unstructured-api:latest" + ports: + - 8000:8000 + options: >- + --health-cmd "curl --fail http://localhost:8000/healthcheck || exit 1" + --health-interval 10s + --health-timeout 1s + --health-retries 10 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ 
matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install --upgrade hatch + + - name: Lint + working-directory: components/converters/unstructured_fileconverter + if: matrix.python-version == '3.9' + run: hatch run lint:all + + - name: Run tests + working-directory: components/converters/unstructured_fileconverter + run: hatch run cov diff --git a/README.md b/README.md index cdbb0e1ab..475da2c7d 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ onwards. The code in this repo is maintained by [deepset](https://www.deepset.ai | [chroma-haystack](document_stores/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / Document Stores / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_chroma.yml) | | [elasticsearch-haystack](document_stores/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / Document Stores / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_elasticsearch.yml) | | [instructor-embedders-haystack](components/embedders/instructor-embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / 
instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_instructor_embedders.yml) | +| [unstructured-fileconverter-haystack](components/converters/unstructured_fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured-fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_unstructured_fileconverter.yml) ## Contributing diff --git a/components/converters/unstructured_fileconverter/LICENSE b/components/converters/unstructured_fileconverter/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/components/converters/unstructured_fileconverter/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/components/converters/unstructured_fileconverter/README.md b/components/converters/unstructured_fileconverter/README.md new file mode 100644 index 000000000..18a826bca --- /dev/null +++ b/components/converters/unstructured_fileconverter/README.md @@ -0,0 +1,86 @@ +# Unstructured FileConverter for Haystack + + + +Component for the Haystack (2.x) LLM framework to easily convert files and directories into Documents using the Unstructured API. + +**[Unstructured](https://unstructured-io.github.io/unstructured/index.html)** provides a series of tools to do **ETL for LLMs**. This component calls the Unstructured API that simply extracts text and other information from a vast range of file formats. +**[Supported file types](https://unstructured-io.github.io/unstructured/api.html#supported-file-types)**. + +**[Haystack](https://github.com/deepset-ai/haystack)** is an **orchestration framework** to build customizable, production-ready **LLM applications**. +Once your files are converted into Documents, you can start building RAG, question answering, semantic search applications and more. 
+ +- [Installation](#installation) +- [Usage](#usage) +- [Configuration](#configuration) + +## Installation + +```bash +pip install unstructured-fileconverter-haystack +``` + +### Hosted API +If you plan to use the hosted version of the Unstructured API, you just need the **(free) Unstructured API key**. You can get it by signing up [here](https://unstructured.io/api-key). + +### Local API (Docker) +If you want to run your own local instance of the Unstructured API, you need Docker and you can find instructions [here](https://unstructured-io.github.io/unstructured/api.html#using-docker-images). + +In short, this should work: +```bash +docker run -p 8000:8000 -d --rm --name unstructured-api quay.io/unstructured-io/unstructured-api:latest --port 8000 --host 0.0.0.0 +``` + +## Usage + +### In isolation +```python +import os +from unstructured_fileconverter_haystack import UnstructuredFileConverter + +os.environ["UNSTRUCTURED_API_KEY"] = "YOUR-API-KEY" + +converter = UnstructuredFileConverter() + +documents = converter.run(paths = ["a/file/path.pdf", "a/directory/path"])["documents"] + +``` + +### In a Haystack Pipeline +```python +import os +from haystack.preview import Pipeline +from haystack.preview.components.writers import DocumentWriter +from haystack.preview.document_stores import MemoryDocumentStore +from unstructured_fileconverter_haystack import UnstructuredFileConverter + +os.environ["UNSTRUCTURED_API_KEY"] = "YOUR-API-KEY" + +document_store = MemoryDocumentStore() + +indexing = Pipeline() +indexing.add_component("converter", UnstructuredFileConverter()) +indexing.add_component("writer", DocumentWriter(document_store)) +indexing.connect("converter", "writer") + +indexing.run({"converter": {"paths": ["a/file/path.pdf", "a/directory/path"]}}) +``` + +## Configuration + +### Initialization parameters +- `api_url`: URL of the Unstructured API. Defaults to the hosted version. If you run the API locally, you should specify this parameter. 
+- `api_key`: API key for the Unstructured API (https://unstructured.io/#get-api-key). + If you run the API locally, it is not needed. + If you use the hosted version, it defaults to the environment variable UNSTRUCTURED_API_KEY. +- `document_creation_mode`: How to create Haystack Documents from the elements returned by Unstructured. + - `"one-doc-per-file"`: One Haystack Document per file. All elements are concatenated into one text field. + - `"one-doc-per-page"`: One Haystack Document per page. All elements on a page are concatenated into one text field. + - `"one-doc-per-element"`: One Haystack Document per element. Each element is converted to a Haystack Document. +- `separator`: Separator between elements when concatenating them into one text field. +- `unstructured_kwargs`: Additional keyword arguments that are passed to the Unstructured API. They can be helpful to improve or speed up the conversion. See https://unstructured-io.github.io/unstructured/api.html#parameters. + +### `run` method +The method `run` just expects a list of paths (files or directories) in the `paths` parameter. + +If `paths` contains a directory, all files in the first level of the directory are converted. Subdirectories are ignored. 
diff --git a/components/converters/unstructured_fileconverter/pyproject.toml b/components/converters/unstructured_fileconverter/pyproject.toml new file mode 100644 index 000000000..faaba8d71 --- /dev/null +++ b/components/converters/unstructured_fileconverter/pyproject.toml @@ -0,0 +1,178 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "unstructured-fileconverter-haystack" +dynamic = ["version"] +description = 'Haystack 2.x component to convert files into Documents using the Unstructured API' +readme = "README.md" +requires-python = ">=3.8" +license = "Apache-2.0" +keywords = [] +authors = [ + { name = "deepset GmbH", email = "info@deepset.ai" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [ + # we distribute the preview version of Haystack 2.0 under the package "haystack-ai" + "haystack-ai", + "unstructured", +] + +[project.urls] +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/components/converters/unstructured_fileconverter#readme" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" +Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/components/converters/unstructured_fileconverter" + +[tool.hatch.version] +path = "src/unstructured_fileconverter_haystack/__about__.py" + +[tool.hatch.envs.default] +dependencies = [ + "coverage[toml]>=6.5", + "pytest", + "pytest-xdist", +] +[tool.hatch.envs.default.scripts] +test = "pytest {args:tests}" +test-cov = "coverage run -m pytest {args:tests}" +cov-report = [ + "- coverage combine", + "coverage report", +] +cov = [ + 
"test-cov", + "cov-report", +] + +[[tool.hatch.envs.all.matrix]] +python = ["3.8", "3.9", "3.10", "3.11"] + +[tool.hatch.envs.lint] +detached = true +dependencies = [ + "black>=23.1.0", + "mypy>=1.0.0", + "ruff>=0.0.243", +] +[tool.hatch.envs.lint.scripts] +typing = "mypy --install-types --non-interactive {args:src/unstructured_fileconverter_haystack tests}" +style = [ + "ruff {args:.}", + "black --check --diff {args:.}", +] +fmt = [ + "black {args:.}", + "ruff --fix {args:.}", + "style", +] +all = [ + "style", + "typing", +] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.black] +target-version = ["py37"] +line-length = 120 +skip-string-normalization = true + +[tool.ruff] +target-version = "py37" +line-length = 120 +select = [ + "A", + "ARG", + "B", + "C", + "DTZ", + "E", + "EM", + "F", + "FBT", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow boolean positional values in function calls, like `dict.get(... 
True)`
+  "FBT003",
+  # Ignore checks for possible passwords
+  "S105", "S106", "S107",
+  # Ignore complexity
+  "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
+]
+unfixable = [
+  # Don't touch unused imports
+  "F401",
+]
+
+[tool.ruff.isort]
+known-first-party = ["unstructured_fileconverter_haystack"]
+
+[tool.ruff.flake8-tidy-imports]
+ban-relative-imports = "all"
+
+[tool.ruff.per-file-ignores]
+# Tests can use magic values, assertions, and relative imports
+"tests/**/*" = ["PLR2004", "S101", "TID252"]
+
+[tool.coverage.run]
+source_pkgs = ["unstructured_fileconverter_haystack", "tests"]
+branch = true
+parallel = true
+omit = [
+  "src/unstructured_fileconverter_haystack/__about__.py",
+]
+
+[tool.coverage.paths]
+unstructured_fileconverter_haystack = ["src/unstructured_fileconverter_haystack", "*/unstructured-fileconverter-haystack/src/unstructured_fileconverter_haystack"]
+tests = ["tests", "*/unstructured-fileconverter-haystack/tests"]
+
+[tool.coverage.report]
+exclude_lines = [
+  "no cov",
+  "if __name__ == .__main__.:",
+  "if TYPE_CHECKING:",
+]
+
+[tool.pytest.ini_options]
+minversion = "6.0"
+markers = [
+  "unit: unit tests",
+  "integration: integration tests"
+]
+
+[[tool.mypy.overrides]]
+module = [
+  "haystack.*",
+  "pytest.*"
+]
+ignore_missing_imports = true
diff --git a/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__about__.py b/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__about__.py
new file mode 100644
index 000000000..7200d918c
--- /dev/null
+++ b/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__about__.py
@@ -0,0 +1,4 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+__version__ = "0.0.4"
diff --git a/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__init__.py 
b/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__init__.py
new file mode 100644
index 000000000..bcce95bea
--- /dev/null
+++ b/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+from unstructured_fileconverter_haystack.fileconverter import UnstructuredFileConverter
+
+__all__ = ["UnstructuredFileConverter"]
diff --git a/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py b/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py
new file mode 100644
index 000000000..c8201d8da
--- /dev/null
+++ b/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py
@@ -0,0 +1,196 @@
+import logging
+import os
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from haystack.preview import Document, component, default_to_dict
+from tqdm import tqdm
+from unstructured.documents.elements import Element  # type: ignore[import]
+from unstructured.partition.api import partition_via_api  # type: ignore[import]
+
+logger = logging.getLogger(__name__)
+
+UNSTRUCTURED_HOSTED_API_URL = "https://api.unstructured.io/general/v0/general"
+
+
+@component
+class UnstructuredFileConverter:
+    """
+    Convert files to Haystack Documents using the Unstructured API (hosted or running locally).
+    """
+
+    def __init__(
+        self,
+        api_url: str = UNSTRUCTURED_HOSTED_API_URL,
+        api_key: Optional[str] = None,
+        document_creation_mode: Literal[
+            "one-doc-per-file", "one-doc-per-page", "one-doc-per-element"
+        ] = "one-doc-per-file",
+        separator: str = "\n\n",
+        unstructured_kwargs: Optional[Dict[str, Any]] = None,
+        progress_bar: bool = True,  # noqa: FBT001, FBT002
+    ):
+        """
+        :param api_url: URL of the Unstructured API. Defaults to the hosted version.
+            If you run the API locally, specify the URL of your local API
+            (e.g. http://localhost:8000/general/v0/general).
+            See https://unstructured-io.github.io/unstructured/api.html#using-the-api-locally for more information.
+        :param api_key: API key for the Unstructured API (https://unstructured.io/#get-api-key).
+            If you run the API locally, it is not needed.
+            If you use the hosted version, it defaults to the environment variable UNSTRUCTURED_API_KEY.
+        :param document_creation_mode: How to create Haystack Documents from the elements returned by Unstructured.
+            - "one-doc-per-file": One Haystack Document per file. All elements are concatenated into one text field.
+            - "one-doc-per-page": One Haystack Document per page.
+                All elements on a page are concatenated into one text field.
+            - "one-doc-per-element": One Haystack Document per element.
+                Each element is converted to a Haystack Document.
+        :param separator: Separator between elements when concatenating them into one text field.
+        :param unstructured_kwargs: Additional keyword arguments that are passed to the Unstructured API.
+            See https://unstructured-io.github.io/unstructured/api.html.
+        :param progress_bar: Show a progress bar for the conversion. Defaults to True.
+        :raises ValueError: If the hosted API is used and no API key is passed or set in the
+            environment variable UNSTRUCTURED_API_KEY.
+        """
+
+        self.api_url = api_url
+        self.document_creation_mode = document_creation_mode
+        self.unstructured_kwargs = unstructured_kwargs or {}
+        self.separator = separator
+        self.progress_bar = progress_bar
+
+        is_hosted_api = api_url == UNSTRUCTURED_HOSTED_API_URL
+        if api_key is None and is_hosted_api:
+            try:
+                api_key = os.environ["UNSTRUCTURED_API_KEY"]
+            except KeyError as e:
+                msg = (
+                    "To use the hosted version of Unstructured, you need to set the environment variable "
+                    "UNSTRUCTURED_API_KEY (recommended) or explicitly pass the parameter api_key."
+                )
+                raise ValueError(msg) from e
+        self.api_key = api_key
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+
+        The `api_key` is deliberately left out of the serialized data.
+        """
+
+        # do not serialize api_key
+        return default_to_dict(
+            self,
+            api_url=self.api_url,
+            document_creation_mode=self.document_creation_mode,
+            separator=self.separator,
+            unstructured_kwargs=self.unstructured_kwargs,
+            progress_bar=self.progress_bar,
+        )
+
+    @component.output_types(documents=List[Document])
+    def run(self, paths: Union[List[str], List[os.PathLike]]):
+        """
+        Convert files to Haystack Documents using the Unstructured API (hosted or running locally).
+
+        :param paths: List of paths to convert. Paths can be files or directories.
+            If a path is a directory, all files in it whose name matches `*.*` are converted.
+            Subdirectories are ignored.
+        :returns: A dictionary with the key `documents`, holding the list of created Documents.
+            Files are converted in sorted path order.
+        """
+
+        unique_paths = {Path(path) for path in paths}
+        filepaths = {path for path in unique_paths if path.is_file()}
+        filepaths_in_directories = {
+            filepath for path in unique_paths if path.is_dir() for filepath in path.glob("*.*") if filepath.is_file()
+        }
+
+        # sets have no stable iteration order: sort so that the output order is reproducible
+        all_filepaths = sorted(filepaths.union(filepaths_in_directories))
+
+        # currently, the files are converted sequentially to gently handle API failures
+        documents = []
+
+        for filepath in tqdm(
+            all_filepaths, desc="Converting files to Haystack Documents", disable=not self.progress_bar
+        ):
+            elements = self._partition_file_into_elements(filepath=filepath)
+            docs_for_file = self._create_documents(
+                filepath=filepath,
+                elements=elements,
+                document_creation_mode=self.document_creation_mode,
+                separator=self.separator,
+            )
+            documents.extend(docs_for_file)
+
+        return {"documents": documents}
+
+    def _create_documents(
+        self,
+        filepath: Path,
+        elements: List[Element],
+        document_creation_mode: Literal["one-doc-per-file", "one-doc-per-page", "one-doc-per-element"],
+        separator: str,
+    ) -> List[Document]:
+        """
+        Create Haystack Documents from the elements returned by Unstructured.
+
+        :param filepath: Path of the converted file. It is stored in the `name` field of the Document meta.
+        :param elements: Elements returned by Unstructured for the file.
+        :param document_creation_mode: One of "one-doc-per-file", "one-doc-per-page", "one-doc-per-element".
+        :param separator: String placed between the texts of elements that end up in the same Document.
+        :returns: The list of created Documents. An unknown mode yields an empty list.
+        """
+        docs = []
+
+        if document_creation_mode == "one-doc-per-file":
+            text = separator.join([str(el) for el in elements])
+            docs = [Document(content=text, meta={"name": str(filepath)})]
+
+        elif document_creation_mode == "one-doc-per-page":
+            texts_per_page: Dict[int, List[str]] = defaultdict(list)
+            meta_per_page: Dict[int, Dict[str, Any]] = defaultdict(dict)
+            for el in elements:
+                metadata = {"name": str(filepath)}
+                if hasattr(el, "metadata"):
+                    metadata.update(el.metadata.to_dict())
+                # elements without a page number are assigned to page 1
+                page_number = int(metadata.get("page_number", 1))
+
+                texts_per_page[page_number].append(str(el))
+                meta_per_page[page_number].update(metadata)
+
+            # join (instead of appending the separator after each element) to avoid a trailing separator
+            docs = [
+                Document(content=separator.join(texts), meta=meta_per_page[page])
+                for page, texts in texts_per_page.items()
+            ]
+
+        elif document_creation_mode == "one-doc-per-element":
+            for el in elements:
+                metadata = {"name": str(filepath)}
+                if hasattr(el, "metadata"):
+                    metadata.update(el.metadata.to_dict())
+                if hasattr(el, "category"):
+                    metadata["category"] = el.category
+                doc = Document(content=str(el), meta=metadata)
+                docs.append(doc)
+
+        return docs
+
+    def _partition_file_into_elements(self, filepath: Path) -> List[Element]:
+        """
+        Partition a file into elements using the Unstructured API.
+
+        Failures are logged as warnings and result in an empty list, so that a single
+        problematic file does not interrupt the conversion of the remaining files.
+        """
+        elements = []
+        try:
+            elements = partition_via_api(
+                filename=str(filepath), api_url=self.api_url, api_key=self.api_key, **self.unstructured_kwargs
+            )
+        except Exception as e:
+            logger.warning("Unstructured could not process file %s. Error: %s", filepath, e)
+        return elements
diff --git a/components/converters/unstructured_fileconverter/tests/samples/sample_pdf.pdf b/components/converters/unstructured_fileconverter/tests/samples/sample_pdf.pdf
new file mode 100644
index 000000000..6384246e8
Binary files /dev/null and b/components/converters/unstructured_fileconverter/tests/samples/sample_pdf.pdf differ
diff --git a/components/converters/unstructured_fileconverter/tests/test_fileconverter.py b/components/converters/unstructured_fileconverter/tests/test_fileconverter.py
new file mode 100644
index 000000000..07c7be1f4
--- /dev/null
+++ b/components/converters/unstructured_fileconverter/tests/test_fileconverter.py
@@ -0,0 +1,97 @@
+from pathlib import Path
+
+import pytest
+
+from unstructured_fileconverter_haystack import UnstructuredFileConverter
+
+
+@pytest.fixture
+def samples_path():
+    return Path(__file__).parent / "samples"
+
+
+class TestUnstructuredFileConverter:
+    def test_init_default(self):
+        converter = UnstructuredFileConverter(api_key="test-api-key")
+        assert converter.api_url == "https://api.unstructured.io/general/v0/general"
+        assert converter.api_key == "test-api-key"
+        assert converter.document_creation_mode == "one-doc-per-file"
+        assert converter.separator == "\n\n"
+        assert converter.unstructured_kwargs == {}
+        assert converter.progress_bar
+
+    def test_init_with_parameters(self):
+        converter = UnstructuredFileConverter(
+            api_url="http://custom-url:8000/general",
+            document_creation_mode="one-doc-per-element",
+            separator="|",
+            unstructured_kwargs={"foo": "bar"},
+            progress_bar=False,
+        )
+        assert converter.api_url == "http://custom-url:8000/general"
+        assert converter.api_key is None
+        assert converter.document_creation_mode == "one-doc-per-element"
+        assert converter.separator == "|"
+        assert converter.unstructured_kwargs == {"foo": "bar"}
+        assert not converter.progress_bar
+
+    def test_to_dict(self):
+        converter = UnstructuredFileConverter(api_key="test-api-key")
+        converter_dict
= converter.to_dict() + + assert converter_dict == { + "type": "unstructured_fileconverter_haystack.fileconverter.UnstructuredFileConverter", + "init_parameters": { + "api_url": "https://api.unstructured.io/general/v0/general", + "document_creation_mode": "one-doc-per-file", + "separator": "\n\n", + "unstructured_kwargs": {}, + "progress_bar": True, + }, + } + + @pytest.mark.integration + def test_run_one_doc_per_file(self, samples_path): + pdf_path = samples_path / "sample_pdf.pdf" + + local_converter = UnstructuredFileConverter( + api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-file" + ) + + documents = local_converter.run([pdf_path])["documents"] + + assert len(documents) == 1 + assert documents[0].meta == {"name": str(pdf_path)} + + @pytest.mark.integration + def test_run_one_doc_per_page(self, samples_path): + pdf_path = samples_path / "sample_pdf.pdf" + + local_converter = UnstructuredFileConverter( + api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-page" + ) + + documents = local_converter.run([pdf_path])["documents"] + + assert len(documents) == 4 + for i, doc in enumerate(documents, start=1): + assert doc.meta["name"] == str(pdf_path) + assert doc.meta["page_number"] == i + + @pytest.mark.integration + def test_run_one_doc_per_element(self, samples_path): + pdf_path = samples_path / "sample_pdf.pdf" + + local_converter = UnstructuredFileConverter( + api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element" + ) + + documents = local_converter.run([pdf_path])["documents"] + + assert len(documents) > 4 + for doc in documents: + assert doc.meta["name"] == str(pdf_path) + assert "page_number" in doc.meta + + # elements have a category attribute that is saved in the document meta + assert "category" in doc.meta