Skip to content

Commit

Permalink
unstructured: fix metadata order mixed up (#336)
Browse files Browse the repository at this point in the history
* Optional meta field for UnstructuredFileConverter with proper tests

* black lint

* Adding multiple files and meta list test case

* Black formatting test

* Fixing metadata page number bug. Deep copy of dict

* Folder of files test

* Update integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py

Co-authored-by: Stefano Fiorucci <[email protected]>

* Update integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py

Co-authored-by: Stefano Fiorucci <[email protected]>

* Update integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py

Co-authored-by: Stefano Fiorucci <[email protected]>

* Renaming "name" meta to "file_path" and deepcopy fix

* Fix Ruff Complaining

* Removing unique file logic using set that does not preserve file orders. Raise error if glob and metadata list because unsafe

* Better test to make sure metadata order are preserved.

* Make a failing test if metadata list and directory

* filepaths as lists

* Update integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py

Co-authored-by: Stefano Fiorucci <[email protected]>

* update meta docstrings

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
  • Loading branch information
lambda-science and anakin87 authored Feb 5, 2024
1 parent 40845c2 commit 96f6ade
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -105,19 +105,23 @@ def run(
This value can be either a list of dictionaries or a single dictionary.
If it's a single dictionary, its content is added to the metadata of all produced Documents.
If it's a list, the length of the list must match the number of paths, because the two lists will be zipped.
Please note that if the paths contain directories, the length of the meta list must match
the actual number of files contained.
Please note that if the paths contain directories, meta can only be a single dictionary
(same metadata for all files).
Defaults to `None`.
"""

unique_paths = {Path(path) for path in paths}
filepaths = {path for path in unique_paths if path.is_file()}
filepaths_in_directories = {
filepath for path in unique_paths if path.is_dir() for filepath in path.glob("*.*") if filepath.is_file()
}

all_filepaths = filepaths.union(filepaths_in_directories)

paths_obj = [Path(path) for path in paths]
filepaths = [path for path in paths_obj if path.is_file()]
filepaths_in_directories = [
filepath for path in paths_obj if path.is_dir() for filepath in path.glob("*.*") if filepath.is_file()
]
if filepaths_in_directories and isinstance(meta, list):
error = """"If providing directories in the `paths` parameter,
`meta` can only be a dictionary (metadata applied to every file),
and not a list. To specify different metadata for each file,
provide an explicit list of direct paths instead."""
raise ValueError(error)

all_filepaths = filepaths + filepaths_in_directories
# currently, the files are converted sequentially to gently handle API failures
documents = []
meta_list = normalize_metadata(meta, sources_count=len(all_filepaths))
Expand Down
19 changes: 17 additions & 2 deletions integrations/unstructured/tests/test_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,10 @@ def test_run_one_doc_per_element_with_meta(self, samples_path):
@pytest.mark.integration
def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path):
pdf_path = [samples_path / "sample_pdf.pdf", samples_path / "sample_pdf2.pdf"]
meta = [{"custom_meta": "foobar", "common_meta": "common"}, {"other_meta": "barfoo", "common_meta": "common"}]
meta = [
{"custom_meta": "sample_pdf.pdf", "common_meta": "common"},
{"custom_meta": "sample_pdf2.pdf", "common_meta": "common"},
]
local_converter = UnstructuredFileConverter(
api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element"
)
Expand All @@ -163,6 +166,7 @@ def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path):

assert len(documents) > 4
for doc in documents:
assert doc.meta["custom_meta"] == doc.meta["filename"]
assert "file_path" in doc.meta
assert "page_number" in doc.meta
# elements have a category attribute that is saved in the document meta
Expand All @@ -171,9 +175,20 @@ def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path):
assert doc.meta["common_meta"] == "common"

@pytest.mark.integration
def test_run_one_doc_per_element_with_meta_list_folder(self, samples_path):
def test_run_one_doc_per_element_with_meta_list_folder_fail(self, samples_path):
pdf_path = [samples_path]
meta = [{"custom_meta": "foobar", "common_meta": "common"}, {"other_meta": "barfoo", "common_meta": "common"}]
local_converter = UnstructuredFileConverter(
api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element"
)
with pytest.raises(ValueError):
local_converter.run(paths=pdf_path, meta=meta)["documents"]

@pytest.mark.integration
def test_run_one_doc_per_element_with_meta_list_folder(self, samples_path):
pdf_path = [samples_path]
meta = {"common_meta": "common"}

local_converter = UnstructuredFileConverter(
api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element"
)
Expand Down

0 comments on commit 96f6ade

Please sign in to comment.