Skip to content

Commit

Permalink
Merge branch 'main' into voyageai-reranker-defaults
Browse files Browse the repository at this point in the history
  • Loading branch information
Adversarian authored Dec 18, 2024
2 parents c0447b0 + 6c0f018 commit 57d801f
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 42 deletions.
31 changes: 22 additions & 9 deletions llama-index-core/llama_index/core/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -934,25 +934,38 @@ def __init__(self, **data: Any) -> None:
If 'extra_info' was passed, store it in 'metadata'.
"""
if "doc_id" in data:
value = data.pop("doc_id")
if "id_" in data:
msg = "Cannot pass both 'doc_id' and 'id_' to create a Document, use 'id_'"
raise ValueError(msg)
data["id_"] = data.pop("doc_id")
msg = "'doc_id' is deprecated and 'id_' will be used instead"
logging.warning(msg)
else:
data["id_"] = value

if "extra_info" in data:
value = data.pop("extra_info")
if "metadata" in data:
msg = "Cannot pass both 'extra_info' and 'metadata' to create a Document, use 'metadata'"
raise ValueError(msg)
data["metadata"] = data.pop("extra_info")
msg = "'extra_info' is deprecated and 'metadata' will be used instead"
logging.warning(msg)
else:
data["metadata"] = value

if "text" in data:
text = data.pop("text")
if "text_resource" in data:
msg = "Cannot pass both 'text' and 'text_resource' to create a Document, use 'text_resource'"
raise ValueError(msg)
data["text_resource"] = MediaResource(text=data.pop("text"))
msg = "'text' is deprecated and 'text_resource' will be used instead"
logging.warning(msg)
else:
data["text_resource"] = MediaResource(text=text)

super().__init__(**data)

@model_serializer(mode="wrap")
def custom_model_dump(self, handler: Any) -> Dict[str, Any]:
"""For full backward compatibility with the text field, we customize the model serializer."""
# Start from the parent class's serialized dict, forwarding the wrap handler unchanged.
data = super().custom_model_dump(handler)
# Add the legacy "text" key, read from the `text` property, on top of the parent's output.
data["text"] = self.text
return data

@property
def text(self) -> str:
"""Provided for backward compatibility, it returns the content of text_resource."""
Expand Down
97 changes: 70 additions & 27 deletions llama-index-core/tests/schema/test_schema.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import base64
import logging
from io import BytesIO
from pathlib import Path
from unittest import mock
Expand Down Expand Up @@ -100,33 +101,46 @@ def test_build_text_node_text_resource() -> None:
assert node.text == "test data"


def test_document_init() -> None:
# Legacy 'doc_id' on its own is expected to populate both doc_id and id_.
doc = Document(doc_id="test")
assert doc.doc_id == "test"
assert doc.id_ == "test"
# Passing 'doc_id' together with 'id_' is expected to be rejected with ValueError.
with pytest.raises(
ValueError,
match="Cannot pass both 'doc_id' and 'id_' to create a Document, use 'id_'",
):
doc = Document(id_="test", doc_id="test")

# Legacy 'extra_info' on its own is expected to end up in metadata.
doc = Document(extra_info={"key": "value"})
assert doc.metadata == {"key": "value"}
# Passing 'extra_info' together with 'metadata' is expected to be rejected with ValueError.
with pytest.raises(
ValueError,
match="Cannot pass both 'extra_info' and 'metadata' to create a Document, use 'metadata'",
):
doc = Document(extra_info={}, metadata={})

# Legacy 'text' on its own is expected to be readable via both text and text_resource.text.
doc = Document(text="test")
assert doc.text == "test"
assert doc.text_resource
assert doc.text_resource.text == "test"
# Passing 'text' together with 'text_resource' is expected to be rejected with ValueError.
with pytest.raises(
ValueError,
match="Cannot pass both 'text' and 'text_resource' to create a Document, use 'text_resource'",
):
doc = Document(text="test", text_resource="test")
def test_document_init(caplog) -> None:
# Each legacy keyword (doc_id, extra_info, text) is checked alone, then together
# with its replacement. Log capture is set to WARNING so the deprecation
# messages asserted below are recorded in caplog.text.
with caplog.at_level(logging.WARNING):
# Legacy init
doc = Document(doc_id="test")
assert doc.doc_id == "test"
assert doc.id_ == "test"
# Legacy init mixed with new
doc = Document(id_="test", doc_id="legacy_test")
# Mixing is expected to log a deprecation warning, and the 'id_' value wins.
assert "'doc_id' is deprecated and 'id_' will be used instead" in caplog.text
assert doc.id_ == "test"
# Drop captured records so the next warning assertion cannot match an earlier message.
caplog.clear()

# Legacy init
doc = Document(extra_info={"key": "value"})
assert doc.metadata == {"key": "value"}
assert doc.extra_info == {"key": "value"}
# Legacy init mixed with new
doc = Document(extra_info={"old_key": "old_value"}, metadata={"key": "value"})
assert (
"'extra_info' is deprecated and 'metadata' will be used instead"
in caplog.text
)
# The 'metadata' value wins; extra_info is expected to read back the same dict.
assert doc.metadata == {"key": "value"}
assert doc.extra_info == {"key": "value"}
caplog.clear()

# Legacy init
doc = Document(text="test")
assert doc.text == "test"
assert doc.text_resource
assert doc.text_resource.text == "test"
# Legacy init mixed with new
doc = Document(text="legacy_test", text_resource=MediaResource(text="test"))
assert (
"'text' is deprecated and 'text_resource' will be used instead"
in caplog.text
)
# The 'text_resource' value wins over the legacy 'text' argument.
assert doc.text == "test"
assert doc.text_resource
assert doc.text_resource.text == "test"


def test_document_properties():
Expand All @@ -145,6 +159,35 @@ def test_document_str():
assert str(doc) == "Doc ID: test_id\nText: Lo..."


def test_document_legacy_roundtrip():
# Build a Document through the legacy 'text' keyword.
origin = Document(id_="test_id", text="this is a test")
# The dump is expected to carry the text twice: as the top-level "text" key
# and inside "text_resource".
assert origin.model_dump() == {
"id_": "test_id",
"embedding": None,
"metadata": {},
"excluded_embed_metadata_keys": [],
"excluded_llm_metadata_keys": [],
"relationships": {},
"metadata_template": "{key}: {value}",
"metadata_separator": "\n",
"text": "this is a test",
"text_resource": {
"embeddings": None,
"text": "this is a test",
"mimetype": None,
"path": None,
"url": None,
},
"image_resource": None,
"audio_resource": None,
"video_resource": None,
"text_template": "{metadata_str}\n\n{content}",
"class_name": "Document",
}
# Feeding the dump back into the constructor passes both 'text' and
# 'text_resource'; construction must succeed and the text must survive.
dest = Document(**origin.model_dump())
assert dest.text == "this is a test"


def test_image_document_empty():
doc = ImageDocument(id_="test")
assert doc.id_ == "test"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
"video_resource": None,
"text_template": "{metadata_str}\n\n{content}",
"class_name": "Document",
"text": '{"schema_name": "DoclingDocument", "version": "1.0.0", "name": "sample", "origin": {"mimetype": "text/html", "binary_hash": 42, "filename": "sample.html"}, "furniture": {"self_ref": "#/furniture", "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "children": [{"$ref": "#/texts/0"}, {"$ref": "#/texts/1"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"$ref": "#/body"}, "children": [], "label": "paragraph", "prov": [], "orig": "Some text", "text": "Some text"}, {"self_ref": "#/texts/1", "parent": {"$ref": "#/body"}, "children": [], "label": "paragraph", "prov": [], "orig": "Another paragraph", "text": "Another paragraph"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {}}',
}
]
}
Expand Down Expand Up @@ -106,6 +107,7 @@
"video_resource": None,
"text_template": "{metadata_str}\n\n{content}",
"class_name": "Document",
"text": "Some text\n\nAnother paragraph",
}
]
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import logging
import os
from typing import Any, Dict, List, Optional, cast
from datetime import date

import pymongo
from llama_index.core.bridge.pydantic import PrivateAttr
Expand Down Expand Up @@ -261,7 +260,6 @@ def add(
self._embedding_key: node.get_embedding(),
self._text_key: node.get_content(metadata_mode=MetadataMode.NONE) or "",
self._metadata_key: metadata,
"timeStamp": date.today(),
}
data_to_insert.append(entry)
ids.append(node.node_id)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ exclude = ["**/BUILD"]
license = "MIT"
name = "llama-index-vector-stores-azurecosmosmongo"
readme = "README.md"
version = "0.3.0"
version = "0.4.0"

[tool.poetry.dependencies]
python = ">=3.9,<4.0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
"""
import logging
from typing import Any, Optional, Dict, cast, List
from datetime import date

from azure.identity import ClientSecretCredential
from azure.cosmos import CosmosClient
Expand Down Expand Up @@ -289,7 +288,6 @@ def add(
self._embedding_key: node.get_embedding(),
self._text_key: node.get_content(metadata_mode=MetadataMode.NONE) or "",
self._metadata_key: metadata,
"timeStamp": date.today(),
}
data_to_insert.append(entry)
ids.append(node.node_id)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ exclude = ["**/BUILD"]
license = "MIT"
name = "llama-index-vector-stores-azurecosmosnosql"
readme = "README.md"
version = "1.2.0"
version = "1.3.0"

[tool.poetry.dependencies]
python = ">=3.9,<4.0"
Expand Down

0 comments on commit 57d801f

Please sign in to comment.