From e6b41d081deebdea4b2775e798a8e68d8f9ab8c5 Mon Sep 17 00:00:00 2001 From: Martin Triska Date: Wed, 18 Dec 2024 18:47:08 +0100 Subject: [PATCH] community: DocumentLoaderAsParser wrapper (#27749) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description This pull request introduces the `DocumentLoaderAsParser` class, which acts as an adapter to transform document loaders into parsers within the LangChain framework. The class enables document loaders that accept a `file_path` parameter to be utilized as blob parsers. This is particularly useful for integrating various document loading capabilities seamlessly into the LangChain ecosystem. When merged together with PR https://github.com/langchain-ai/langchain/pull/27716, it opens up options for `SharePointLoader` / `OneDriveLoader` to process any filetype that has a document loader. ### Features - **Flexible Parsing**: The `DocumentLoaderAsParser` class can adapt any document loader that meets the criteria of accepting a `file_path` argument, allowing for lazy parsing of documents. - **Compatibility**: The class has been designed to work with various document loaders, making it versatile for different use cases. ### Usage Example To use the `DocumentLoaderAsParser`, you would initialize it with a suitable document loader class and any required parameters. Here's an example of how to do this with the `UnstructuredExcelLoader`: ```python from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.parsers.documentloader_adapter import DocumentLoaderAsParser from langchain_community.document_loaders.excel import UnstructuredExcelLoader # Initialize the parser adapter with UnstructuredExcelLoader xlsx_parser = DocumentLoaderAsParser(UnstructuredExcelLoader, mode="paged") # Use parser, for ex. 
import inspect
from typing import Any, Dict, Iterator, Type

from langchain.document_loaders.base import BaseBlobParser, BaseLoader
from langchain_core._api import beta
from langchain_core.documents import Document
from langchain_core.documents.base import Blob


@beta()
class DocumentLoaderAsParser(BaseBlobParser):
    """A wrapper class that adapts a document loader to function as a parser.

    This class is a work-around that adapts a document loader to function as a
    parser. It is recommended to use a proper parser, if available.

    Requires the document loader to accept a `file_path` parameter.
    """

    # Loader class that is instantiated once for every parsed blob.
    DocumentLoaderClass: Type[BaseLoader]
    # Extra keyword arguments forwarded unchanged to the loader's constructor.
    document_loader_kwargs: Dict[str, Any]

    def __init__(self, document_loader_class: Type[BaseLoader], **kwargs: Any) -> None:
        """Initialize the adapter with a document loader class and its arguments.

        Args:
            document_loader_class: The document loader class to adapt as a
                parser. Its ``__init__`` must declare a ``file_path`` parameter.
            **kwargs: Additional arguments passed to the document loader's
                constructor each time a blob is parsed.

        Raises:
            TypeError: If the document loader's ``__init__`` does not declare a
                ``file_path`` parameter; only such loaders can be adapted.

        Example:
            ```
            from langchain_community.document_loaders.excel import (
                UnstructuredExcelLoader,
            )

            # Initialize parser adapter with a document loader
            excel_parser = DocumentLoaderAsParser(
                UnstructuredExcelLoader, mode="elements"
            )
            ```
        """
        super().__init__()

        # Fail fast at construction time: the loader is only instantiated later,
        # in `lazy_parse`, with `file_path` passed by keyword.
        init_signature = inspect.signature(document_loader_class.__init__)
        if "file_path" not in init_signature.parameters:
            raise TypeError(
                f"{document_loader_class.__name__} does not accept `file_path`. "
                "Only document loaders with `file_path` parameter "
                "can be morphed into a parser."
            )

        self.DocumentLoaderClass = document_loader_class
        self.document_loader_kwargs = kwargs

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Use the underlying document loader to lazily parse the blob.

        A fresh loader is created for the blob with ``file_path=blob.path`` plus
        the keyword arguments given at construction time.

        Args:
            blob: The blob to parse. ``blob.path`` is handed to the loader as-is
                (NOTE(review): a blob without a path is not checked for here,
                so it is up to the loader how that is handled).

        Yields:
            The documents produced by the loader's ``lazy_load``, each with the
            blob's metadata merged into its own metadata. On key collisions the
            blob's value replaces the loader's value.
        """
        doc_loader = self.DocumentLoaderClass(
            file_path=blob.path, **self.document_loader_kwargs
        )  # type: ignore
        for document in doc_loader.lazy_load():
            document.metadata.update(blob.metadata)
            yield document