Skip to content

Commit

Permalink
fixed: delete ids from namespace
Browse files Browse the repository at this point in the history
  • Loading branch information
glorenzo972 committed May 6, 2024
1 parent a9be29e commit 9040424
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 22 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
### **Copyright**: *Tiledesk SRL*


## [2024-05-06]

### 0.1.13
- fixed: delete ids from namespace; query top_k is now capped at 10k

## [2024-05-03]

### 0.1.12
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "tilellm"
version = "0.1.12"
version = "0.1.13"
description = "tiledesk for RAG"
authors = ["Gianluca Lorenzo <[email protected]>"]
repository = "https://github.com/Tiledesk/tiledesk-llm"
Expand Down
56 changes: 35 additions & 21 deletions tilellm/store/pinecone_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,9 @@ async def delete_pc_ids_namespace(metadata_id: str, namespace: str):
:param namespace:
:return:
"""
import pinecone



# FIXME problema con namespace di cardinalità superiore a 10000

import pinecone
try:
pc = pinecone.Pinecone(
api_key=const.PINECONE_API_KEY
Expand All @@ -158,29 +156,44 @@ async def delete_pc_ids_namespace(metadata_id: str, namespace: str):
logger.debug(describe)
namespaces = describe.get("namespaces", {})
total_vectors = 1
# batch_size = 100

offset = 0

if namespaces:
if namespace in namespaces.keys():
total_vectors = namespaces.get(namespace).get('vector_count')

logger.debug(total_vectors)
pc_res = index.query(
vector=[0] * 1536, # [0,0,0,0......0]
top_k=total_vectors,
filter={"id": {"$eq": metadata_id}},
namespace=namespace,
include_values=False,
include_metadata=False
)
matches = pc_res.get('matches')

ids = [obj.get('id') for obj in matches]
if not ids:
raise IndexError(f"Empty list for {metadata_id} and namespace {namespace}")
batch_size = min([total_vectors, 10000])
while offset < total_vectors:

pc_res = index.query(
vector=[0] * 1536, # [0,0,0,0......0]
top_k=batch_size,
filter={"id": {"$eq": metadata_id}},
namespace=namespace,
include_values=False,
include_metadata=False
)
matches = pc_res.get('matches')

ids = [obj.get('id') for obj in matches]

index.delete(
ids=ids,
namespace=namespace)
if offset == 0 and not ids:
raise IndexError(f"Empty list for {metadata_id} and namespace {namespace}")
elif offset > 0 and not ids:
break

index.delete(
ids=ids,
namespace=namespace)

if batch_size < 10000:
break
else:
offset += len(ids)

except Exception as ex:

Expand Down Expand Up @@ -218,16 +231,17 @@ async def get_pc_ids_namespace(metadata_id: str, namespace: str):
total_vectors = namespaces.get(namespace).get('vector_count')

logger.debug(f"pinecone total vector in {namespace}: {total_vectors}")

batch_size = min([total_vectors, 10000])
pc_res = index.query(
vector=[0] * 1536, # [0,0,0,0......0]
top_k=total_vectors,
top_k=batch_size,
filter={"id": {"$eq": metadata_id}},
namespace=namespace,
include_values=False,
include_metadata=True
)
matches = pc_res.get('matches')

# from pprint import pprint
# pprint(matches)
# ids = [obj.get('id') for obj in matches]
Expand Down

0 comments on commit 9040424

Please sign in to comment.