Skip to content

Commit

Permalink
Merge branch 'main' into xlsx-converter
Browse files (browse the repository at this point in the history)
  • Loading branch information
sjrl authored Dec 20, 2024
2 parents 669550d + f4d9c2b commit 4502e51
Show file tree
Hide file tree
Showing 62 changed files with 4,446 additions and 388 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ repos:
rev: v2.3.0
hooks:
- id: codespell
exclude: "haystack/data/abbreviations"
args: ["--toml", "pyproject.toml"]
additional_dependencies:
- tomli
Expand Down
2 changes: 1 addition & 1 deletion docs/pydoc/config/data_classess_api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ loaders:
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
search_path: [../../../haystack/dataclasses]
modules:
["answer", "byte_stream", "chat_message", "document", "streaming_chunk", "sparse_embedding"]
["answer", "byte_stream", "chat_message", "document", "streaming_chunk", "sparse_embedding", "tool"]
ignore_when_discovered: ["__init__"]
processors:
- type: filter
Expand Down
2 changes: 1 addition & 1 deletion e2e/pipelines/test_dense_doc_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def test_dense_doc_search_pipeline(tmp_path, samples_path):
indexing_pipeline.add_component(instance=DocumentJoiner(), name="joiner")
indexing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
indexing_pipeline.add_component(
instance=DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30), name="splitter"
instance=DocumentSplitter(split_by="period", split_length=250, split_overlap=30), name="splitter"
)
indexing_pipeline.add_component(
instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), name="embedder"
Expand Down
6 changes: 1 addition & 5 deletions e2e/pipelines/test_preprocessing_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
#
# SPDX-License-Identifier: Apache-2.0

import json

from haystack import Pipeline
from haystack.components.classifiers import DocumentLanguageClassifier
from haystack.components.converters import TextFileToDocument
Expand All @@ -25,9 +23,7 @@ def test_preprocessing_pipeline(tmp_path):
instance=MetadataRouter(rules={"en": {"field": "language", "operator": "==", "value": "en"}}), name="router"
)
preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
preprocessing_pipeline.add_component(
instance=DocumentSplitter(split_by="sentence", split_length=1), name="splitter"
)
preprocessing_pipeline.add_component(instance=DocumentSplitter(split_by="period", split_length=1), name="splitter")
preprocessing_pipeline.add_component(
instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), name="embedder"
)
Expand Down
6 changes: 3 additions & 3 deletions haystack/components/builders/chat_prompt_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from jinja2.sandbox import SandboxedEnvironment

from haystack import component, default_from_dict, default_to_dict, logging
from haystack.dataclasses.chat_message import ChatMessage, ChatRole
from haystack.dataclasses.chat_message import ChatMessage, ChatRole, TextContent

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -197,10 +197,10 @@ def run(
if message.text is None:
raise ValueError(f"The provided ChatMessage has no text. ChatMessage: {message}")
compiled_template = self._env.from_string(message.text)
rendered_content = compiled_template.render(template_variables_combined)
rendered_text = compiled_template.render(template_variables_combined)
# deep copy the message to avoid modifying the original message
rendered_message: ChatMessage = deepcopy(message)
rendered_message.content = rendered_content
rendered_message._content = [TextContent(text=rendered_text)]
processed_messages.append(rendered_message)
else:
processed_messages.append(message)
Expand Down
7 changes: 0 additions & 7 deletions haystack/components/converters/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import copy
import hashlib
import os
import warnings
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Union
Expand Down Expand Up @@ -143,12 +142,6 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
azure_output.append(result.to_dict())

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
Expand Down
8 changes: 0 additions & 8 deletions haystack/components/converters/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -94,13 +93,6 @@ def run(

merged_metadata = {**bytestream.meta, **metadata}

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and "file_path" in bytestream.meta:
file_path = bytestream.meta.get("file_path")
if file_path: # Ensure the value is not None for pylint
Expand Down
8 changes: 0 additions & 8 deletions haystack/components/converters/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import csv
import io
import os
import warnings
from dataclasses import dataclass
from enum import Enum
from io import StringIO
Expand Down Expand Up @@ -189,13 +188,6 @@ def run(
)
continue

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

docx_metadata = self._get_docx_metadata(document=docx_document)
merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}

Expand Down
7 changes: 0 additions & 7 deletions haystack/components/converters/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -123,12 +122,6 @@ def run(

merged_metadata = {**bytestream.meta, **metadata}

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)
if not self.store_full_path and "file_path" in bytestream.meta:
file_path = bytestream.meta.get("file_path")
if file_path: # Ensure the value is not None for pylint
Expand Down
7 changes: 0 additions & 7 deletions haystack/components/converters/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import json
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union

Expand Down Expand Up @@ -280,12 +279,6 @@ def run(

data = self._get_content_and_meta(bytestream)

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)
for text, extra_meta in data:
merged_metadata = {**bytestream.meta, **metadata, **extra_meta}

Expand Down
8 changes: 0 additions & 8 deletions haystack/components/converters/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -112,13 +111,6 @@ def run(

merged_metadata = {**bytestream.meta, **metadata}

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)

Expand Down
7 changes: 0 additions & 7 deletions haystack/components/converters/pdfminer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -172,12 +171,6 @@ def run(
)

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
Expand Down
7 changes: 0 additions & 7 deletions haystack/components/converters/pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -104,12 +103,6 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
Expand Down
8 changes: 1 addition & 7 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import warnings
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
Expand Down Expand Up @@ -220,12 +219,7 @@ def run(
)

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
document.meta = merged_metadata
Expand Down
7 changes: 0 additions & 7 deletions haystack/components/converters/tika.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import warnings
from html.parser import HTMLParser
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
Expand Down Expand Up @@ -139,12 +138,6 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
Expand Down
7 changes: 0 additions & 7 deletions haystack/components/converters/txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -93,12 +92,6 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from haystack.utils.hf import HFEmbeddingAPIType, HFModelType, check_valid_model
from haystack.utils.url_validation import is_valid_http_url

with LazyImport(message="Run 'pip install \"huggingface_hub>=0.23.0\"'") as huggingface_hub_import:
with LazyImport(message="Run 'pip install \"huggingface_hub>=0.27.0\"'") as huggingface_hub_import:
from huggingface_hub import InferenceClient

logger = logging.getLogger(__name__)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from haystack.utils.hf import HFEmbeddingAPIType, HFModelType, check_valid_model
from haystack.utils.url_validation import is_valid_http_url

with LazyImport(message="Run 'pip install \"huggingface_hub>=0.23.0\"'") as huggingface_hub_import:
with LazyImport(message="Run 'pip install \"huggingface_hub>=0.27.0\"'") as huggingface_hub_import:
from huggingface_hub import InferenceClient

logger = logging.getLogger(__name__)
Expand Down
Loading

0 comments on commit 4502e51

Please sign in to comment.