
Commit

fix: delete chunks from namespace by metadata id
glorenzo972 committed Jun 21, 2024
1 parent 7157f78 commit e1d9d7b
Showing 8 changed files with 132 additions and 57 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,11 @@
*Andrea Sponziello*
### **Copyright**: *Tiledesk SRL*

## [2024-06-21]
### 0.2.3
- fix: delete chunks from namespace by metadata id
- added: /api/desc/namespace/{ns} for namespace description

## [2024-06-15]
### 0.2.2
- fix: indexing of txt documents
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "tilellm"
version = "0.2.2"
version = "0.2.3"
description = "tiledesk for RAG"
authors = ["Gianluca Lorenzo <[email protected]>"]
repository = "https://github.com/Tiledesk/tiledesk-llm"
29 changes: 19 additions & 10 deletions tilellm/__main__.py
@@ -31,23 +31,16 @@
delete_id_from_namespace,
get_ids_namespace,
get_listitems_namespace,
get_desc_namespace,
get_list_namespace,
get_sources_namespace)

import logging

# parser = argparse.ArgumentParser(description="Tiledesk: llms integration")
# parser.add_argument("--host", default="0.0.0.0", help="Hostname for FastAPI")
# parser.add_argument("--port", type=int, default=8000, help="Port for FastAPI")
# parser.add_argument("--redis_url", default="redis://localhost:6379/0", help="Redis url")
# parser.add_argument("--environment", default="dev", help="Environment dev|prod")
# parser.add_argument("--log_path", default="log_conf.yaml", help="Log configuration file path. Default log_conf.yaml")

# args = parser.parse_args()

ENVIRONMENTS = {
'serverless': '.environ',
'prod': '.environ.prod',
'pod': '.environ.prod',
}

expiration_in_seconds = 48 * 60 * 60
@@ -56,9 +49,10 @@


environment = os.environ.get("PINECONE_TYPE", "serverless")
# environment = "prod"
# environment = "serverless"
load_dotenv(ENVIRONMENTS.get(environment) or '.environ')


# print(os.environ.get("PINECONE_API_KEY"))
# os.environ.__setitem__("ENVIRON", environment)

@@ -370,6 +364,21 @@ async def list_namespace_main():
logger.error(ex)
raise HTTPException(status_code=400, detail=repr(ex))

@app.get("/api/desc/namespace/{namespace}")
async def desc_namespace_main(namespace: str):
"""
Get the description of the given namespace
:param namespace: namespace_id
:return: description of the namespace
"""
try:
logger.info(f"retrieve description for namespace {namespace}")
result = await get_desc_namespace(namespace)

return JSONResponse(content=result.model_dump(exclude_none=True))
except Exception as ex:
logger.error(ex)
raise HTTPException(status_code=400, detail=repr(ex))
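
For reference, a minimal way to exercise the new endpoint once the service is running. The base URL, namespace value, and the response values sketched in the comment are illustrative assumptions, not part of this commit:

import requests  # assumed available; any HTTP client works

BASE_URL = "http://localhost:8000"  # assumed local FastAPI host/port
namespace = "my-namespace"          # hypothetical namespace id

resp = requests.get(f"{BASE_URL}/api/desc/namespace/{namespace}", timeout=30)
resp.raise_for_status()
print(resp.json())
# Expected shape, per PineconeDescNamespaceResult.model_dump(exclude_none=True):
# {"namespace_desc": {"namespace": "my-namespace", "vector_count": 42},
#  "ids": [{"metadata_id": "doc-1", "source": "https://example.com/a", "chunks_count": 7}]}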

@app.get("/api/listitems/namespace/{namespace}")
async def list_namespace_items_main(namespace: str):
8 changes: 8 additions & 0 deletions tilellm/controller/openai_controller.py
@@ -599,6 +599,14 @@ async def get_listitems_namespace(namespace: str, repo=None):
except Exception as ex:
raise ex

@inject_repo
async def get_desc_namespace(namespace: str, repo=None):
try:
return await repo.get_pc_desc_namespace(namespace=namespace)
except Exception as ex:
raise ex



@inject_repo
async def get_sources_namespace(source: str, namespace: str, repo=None):
14 changes: 14 additions & 0 deletions tilellm/models/item_model.py
@@ -13,6 +13,9 @@ class ItemSingle(BaseModel):
embedding: str = Field(default_factory=lambda: "text-embedding-ada-002")
namespace: str | None = None
webhook: str = Field(default_factory=lambda: "")
chunk_size: int = Field(default_factory=lambda: 256)
chunk_overlap: int = Field(default_factory=lambda: 10)



class MetadataItem(BaseModel):
@@ -136,5 +139,16 @@ class PineconeItemNamespaceResult(BaseModel):
vector_count: int


class PineconeIdSummaryResult(BaseModel):
metadata_id: str
source: str
chunks_count: int


class PineconeNamespaceResult(BaseModel):
namespaces: Optional[List[PineconeItemNamespaceResult]]


class PineconeDescNamespaceResult(BaseModel):
namespace_desc: PineconeItemNamespaceResult
ids: Optional[List[PineconeIdSummaryResult]]
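
The new chunk_size and chunk_overlap fields on ItemSingle (defaults 256 and 10) are forwarded to chunk_data by both Pinecone repositories later in this commit. That helper is not shown here; the following is a rough sketch of how such a helper is commonly written with LangChain's RecursiveCharacterTextSplitter, an assumption rather than tilellm's actual implementation:

from typing import List
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

def chunk_data(data: List[Document], chunk_size: int = 256, chunk_overlap: int = 10) -> List[Document]:
    # Split each document into overlapping character-based chunks, preserving metadata.
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(data)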
83 changes: 74 additions & 9 deletions tilellm/store/pinecone_repository_base.py
@@ -2,20 +2,16 @@
from tilellm.models.item_model import (MetadataItem,
PineconeQueryResult,
PineconeItems,
PineconeIndexingResult
PineconeIndexingResult,
PineconeItemNamespaceResult,
PineconeIdSummaryResult,
PineconeDescNamespaceResult
)
from tilellm.tools.document_tool_simple import (get_content_by_url,
get_content_by_url_with_bs,
load_document,
load_from_wikipedia
)

from tilellm.shared import const
from langchain_core.vectorstores import VectorStore
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
import uuid
import os
from typing import List, Dict

import logging

@@ -220,6 +216,75 @@ async def get_pc_all_obj_namespace(namespace: str):

raise ex

@staticmethod
async def get_pc_desc_namespace(namespace: str):
"""
Query Pinecone for a description of the given namespace: its vector count plus a per-document (metadata id) chunk summary
:param namespace: namespace_id
:return: PineconeDescNamespaceResult
"""
import pinecone

try:
pc = pinecone.Pinecone(
api_key=const.PINECONE_API_KEY
)

host = pc.describe_index(const.PINECONE_INDEX).host
index = pc.Index(name=const.PINECONE_INDEX, host=host)

# vector_store = Pinecone.from_existing_index(const.PINECONE_INDEX, )
describe = index.describe_index_stats()
print(describe)
logger.debug(describe)
namespaces = describe.get("namespaces", {})
total_vectors = 1
description = PineconeItemNamespaceResult(namespace=namespace, vector_count=0)
if namespaces:
if namespace in namespaces.keys():
total_vectors = namespaces.get(namespace).get('vector_count')
description = PineconeItemNamespaceResult(namespace=namespace, vector_count=total_vectors)

logger.debug(f"pinecone total vector in {namespace}: {total_vectors}")
print(description)
batch_size = min([total_vectors, 10000])

pc_res = index.query(
vector=[0] * 1536, # [0,0,0,0......0]
top_k=batch_size,
# filter={"id": {"$eq": id}},
namespace=namespace,
include_values=False,
include_metadata=True
)
matches = pc_res.get('matches')
# from pprint import pprint
# pprint(matches)
# ids = [obj.get('id') for obj in matches]
# print(type(matches[0].get('id')))
ids_count: Dict[str, PineconeIdSummaryResult] = {}

for obj in matches:
metadata_id = obj.get('metadata').get('id')
if metadata_id in ids_count:
ids_count[metadata_id].chunks_count += 1
else:
ids_count[metadata_id] = PineconeIdSummaryResult(metadata_id=metadata_id,
source=obj.get('metadata').get('source'),
chunks_count=1)

res = PineconeDescNamespaceResult(namespace_desc=description, ids=list(ids_count.values()))
print(res)
logger.debug(res)
return res

except Exception as ex:

logger.error(ex)

raise ex
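
The loop above builds one PineconeIdSummaryResult per metadata id by counting matches. For illustration, the same aggregation expressed standalone with collections.Counter over made-up match dicts (not the repository's real data):

from collections import Counter

matches = [  # shaped like index.query(...)["matches"] entries; values are made up
    {"id": "v1", "metadata": {"id": "doc-1", "source": "https://example.com/a"}},
    {"id": "v2", "metadata": {"id": "doc-1", "source": "https://example.com/a"}},
    {"id": "v3", "metadata": {"id": "doc-2", "source": "https://example.com/b"}},
]
chunks_per_doc = Counter(m["metadata"]["id"] for m in matches)
source_per_doc = {m["metadata"]["id"]: m["metadata"]["source"] for m in matches}
summaries = [
    {"metadata_id": doc_id, "source": source_per_doc[doc_id], "chunks_count": count}
    for doc_id, count in chunks_per_doc.items()
]
print(summaries)  # doc-1 -> 2 chunks, doc-2 -> 1 chunk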

@staticmethod
async def get_pc_sources_namespace(source: str, namespace: str):
"""
42 changes: 7 additions & 35 deletions tilellm/store/pinecone_repository_pod.py
@@ -37,6 +37,8 @@ async def add_pc_item(self, item):
embedding = item.embedding
namespace = item.namespace
scrape_type = item.scrape_type
chunk_size = item.chunk_size
chunk_overlap = item.chunk_overlap
try:
await self.delete_pc_ids_namespace(metadata_id=metadata_id, namespace=namespace)
except Exception as ex:
@@ -77,7 +79,7 @@ async def add_pc_item(self, item):
elif value is None:
document.metadata[key] = ""

chunks.extend(self.chunk_data(data=[document]))
chunks.extend(self.chunk_data(data=[document], chunk_size=chunk_size, chunk_overlap=chunk_overlap))

# from pprint import pprint
# pprint(documents)
@@ -113,7 +115,7 @@ async def add_pc_item(self, item):
metadata = MetadataItem(id=metadata_id, source=source, type=type_source, embedding=embedding)
document = Document(page_content=content, metadata=metadata.dict())

chunks.extend(self.chunk_data(data=[document]))
chunks.extend(self.chunk_data(data=[document], chunk_size=chunk_size, chunk_overlap=chunk_overlap))
total_tokens, cost = self.calc_embedding_cost(chunks, embedding)
a = vector_store.from_documents(chunks,
embedding=oai_embeddings,
@@ -138,7 +140,6 @@ async def delete_pc_ids_namespace(self, metadata_id: str, namespace: str):
:param namespace:
:return:
"""
# FIXME: problem with namespaces containing more than 10,000 vectors

import pinecone
try:
@@ -148,49 +149,20 @@ async def delete_pc_ids_namespace(self, metadata_id: str, namespace: str):

host = pc.describe_index(const.PINECONE_INDEX).host
index = pc.Index(name=const.PINECONE_INDEX, host=host)
# vector_store = Pinecone.from_existing_index(const.PINECONE_INDEX, )
describe = index.describe_index_stats()
logger.debug(describe)
namespaces = describe.get("namespaces", {})
total_vectors = 1
# batch_size = 100

offset = 0

if namespaces:
if namespace in namespaces.keys():
total_vectors = namespaces.get(namespace).get('vector_count')

logger.debug(total_vectors)

batch_size = min([total_vectors, 10000])
while offset < total_vectors:

pc_res = index.query(
vector=[0] * 1536, # [0,0,0,0......0]
top_k=batch_size,
filter={"id": {"$eq": metadata_id}},
namespace=namespace,
include_values=False,
include_metadata=False
)
matches = pc_res.get('matches')

ids = [obj.get('id') for obj in matches]

if offset == 0 and not ids:
raise IndexError(f"Empty list for {metadata_id} and namespace {namespace}")
elif offset > 0 and not ids:
break

index.delete(
ids=ids,
namespace=namespace)

if batch_size < 10000:
break
else:
offset += len(ids)
index.delete(
filter={"id": {"$eq": metadata_id}},
namespace=namespace)

except Exception as ex:
# logger.error(ex)
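
The block removed above paged through index.query results to collect vector ids and deleted them in batches of at most 10,000, which is where the FIXME about large namespaces came from. The replacement hands the whole operation to a single metadata-filter delete, which pod-based Pinecone indexes support (serverless indexes generally do not, which is presumably why only this repository class changes). A minimal standalone sketch of that call with placeholder credentials and names:

import pinecone

pc = pinecone.Pinecone(api_key="YOUR_API_KEY")  # placeholder key
host = pc.describe_index("my-index").host        # placeholder index name
index = pc.Index(name="my-index", host=host)
# Remove every vector in the namespace whose metadata field "id" matches the document id.
index.delete(filter={"id": {"$eq": "doc-1"}}, namespace="my-namespace")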
6 changes: 4 additions & 2 deletions tilellm/store/pinecone_repository_serverless.py
@@ -39,6 +39,8 @@ async def add_pc_item(self, item):
embedding = item.embedding
namespace = item.namespace
scrape_type = item.scrape_type
chunk_size = item.chunk_size
chunk_overlap = item.chunk_overlap
try:
await self.delete_pc_ids_namespace(metadata_id=metadata_id, namespace=namespace)
except Exception as ex:
@@ -80,7 +82,7 @@ async def add_pc_item(self, item):
elif value is None:
document.metadata[key] = ""

chunks.extend(self.chunk_data(data=[document]))
chunks.extend(self.chunk_data(data=[document], chunk_size=chunk_size, chunk_overlap=chunk_overlap))

# from pprint import pprint
# pprint(documents)
@@ -124,7 +126,7 @@ async def add_pc_item(self, item):
metadata = MetadataItem(id=metadata_id, source=source, type=type_source, embedding=embedding)
document = Document(page_content=content, metadata=metadata.dict())

chunks.extend(self.chunk_data(data=[document]))
chunks.extend(self.chunk_data(data=[document], chunk_size=chunk_size, chunk_overlap=chunk_overlap))
total_tokens, cost = self.calc_embedding_cost(chunks, embedding)
a = vector_store.from_documents(chunks,
embedding=oai_embeddings,
