Commit fd8c012
added scrape_type
glorenzo972 committed Jun 6, 2024
1 parent 3851115 commit fd8c012
Showing 6 changed files with 149 additions and 118 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,11 @@
 *Andrea Sponziello*
 ### **Copyrigth**: *Tiledesk SRL*
 
+## [2024-06-06]
+
+### 0.1.19
+- minor fix: return 400 if url is not correct
+
 ## [2024-05-20]
 
 ### 0.1.18
41 changes: 0 additions & 41 deletions log_conf.json

This file was deleted.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "tilellm"
-version = "0.1.18"
+version = "0.1.19"
 description = "tiledesk for RAG"
 authors = ["Gianluca Lorenzo <[email protected]>"]
 repository = "https://github.com/Tiledesk/tiledesk-llm"
2 changes: 1 addition & 1 deletion tilellm/controller/openai_controller.py
@@ -316,7 +316,7 @@ async def delete_id_from_namespace(metadata_id:str, namespace:str):
     :param namespace:
     :return:
     """
-    from tilellm.store.pinecone_repository import delete_pc_ids_namespace
+    from tilellm.store.pinecone_repository import delete_pc_ids_namespace  # , delete_pc_ids_namespace1
     try:
         return await delete_pc_ids_namespace(metadata_id=metadata_id, namespace=namespace)
     except Exception as ex:
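A minimal sketch of how this controller entry point is invoked; the document id and namespace below are invented for illustration, and the try/except simply surfaces whatever the repository layer raises:

# Hypothetical usage; "doc-123" and "tenant-a" are invented values.
import asyncio

from tilellm.controller.openai_controller import delete_id_from_namespace

async def main() -> None:
    try:
        await delete_id_from_namespace(metadata_id="doc-123", namespace="tenant-a")
    except Exception as ex:
        # Exceptions from the repository layer propagate to the caller.
        print(f"delete failed: {ex}")

asyncio.run(main())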
184 changes: 124 additions & 60 deletions tilellm/store/pinecone_repository.py
@@ -47,68 +47,73 @@ async def add_pc_item(item):
     total_tokens = 0
     cost = 0
 
-    if type_source == 'url':
-        documents = get_content_by_url(source, scrape_type)
-        for document in documents:
-            document.metadata["id"] = metadata_id
-            document.metadata["source"] = source
-            document.metadata["type"] = type_source
-            document.metadata["embedding"] = embedding
-
-            for key, value in document.metadata.items():
-                if isinstance(value, list) and all(item is None for item in value):
-                    document.metadata[key] = [""]
-                elif value is None:
-                    document.metadata[key] = ""
-
-            chunks.extend(chunk_data(data=[document]))
-
-        # from pprint import pprint
-        # pprint(documents)
-        logger.debug(documents)
-
-        a = vector_store.from_documents(chunks,
-                                        embedding=oai_embeddings,
-                                        index_name=const.PINECONE_INDEX,
-                                        namespace=namespace,
-                                        text_key=const.PINECONE_TEXT_KEY)
-
-        total_tokens, cost = calc_embedding_cost(chunks, embedding)
-        logger.info(f"chunks: {len(chunks)}, total_tokens: {total_tokens}, cost: {cost: .6f}")
-
-        # from pprint import pprint
-        # pprint(documents)
-    elif type_source == 'urlbs':
-        doc_array = get_content_by_url_with_bs(source)
-        chunks = list()
-        for doc in doc_array:
-            metadata = MetadataItem(id=metadata_id, source=source, type=type_source, embedding=embedding)
-            document = Document(page_content=doc, metadata=metadata.dict())
-            chunks.append(document)
-        # chunks.extend(chunk_data(data=documents))
-        total_tokens, cost = calc_embedding_cost(chunks, embedding)
-        a = vector_store.from_documents(chunks,
-                                        embedding=oai_embeddings,
-                                        index_name=const.PINECONE_INDEX,
-                                        namespace=namespace,
-                                        text_key=const.PINECONE_TEXT_KEY)
-
-    else:
-        metadata = MetadataItem(id=metadata_id, source=source, type=type_source, embedding=embedding)
-        document = Document(page_content=content, metadata=metadata.dict())
-
-        chunks.extend(chunk_data(data=[document]))
-        total_tokens, cost = calc_embedding_cost(chunks, embedding)
-        a = vector_store.from_documents(chunks,
-                                        embedding=oai_embeddings,
-                                        index_name=const.PINECONE_INDEX,
-                                        namespace=namespace,
-                                        text_key=const.PINECONE_TEXT_KEY)
-
-    pinecone_result = PineconeIndexingResult(id=metadata_id, chunks=len(chunks), total_tokens=total_tokens,
-                                             cost=f"{cost:.6f}")
+    try:
+        if type_source == 'url':
+            documents = get_content_by_url(source, scrape_type)
+            for document in documents:
+                document.metadata["id"] = metadata_id
+                document.metadata["source"] = source
+                document.metadata["type"] = type_source
+                document.metadata["embedding"] = embedding
+
+                for key, value in document.metadata.items():
+                    if isinstance(value, list) and all(item is None for item in value):
+                        document.metadata[key] = [""]
+                    elif value is None:
+                        document.metadata[key] = ""
+
+                chunks.extend(chunk_data(data=[document]))
+
+            # from pprint import pprint
+            # pprint(documents)
+            logger.debug(documents)
+
+            a = vector_store.from_documents(chunks,
+                                            embedding=oai_embeddings,
+                                            index_name=const.PINECONE_INDEX,
+                                            namespace=namespace,
+                                            text_key=const.PINECONE_TEXT_KEY)
+
+            total_tokens, cost = calc_embedding_cost(chunks, embedding)
+            logger.info(f"chunks: {len(chunks)}, total_tokens: {total_tokens}, cost: {cost: .6f}")
+
+            # from pprint import pprint
+            # pprint(documents)
+        elif type_source == 'urlbs':
+            doc_array = get_content_by_url_with_bs(source)
+            chunks = list()
+            for doc in doc_array:
+                metadata = MetadataItem(id=metadata_id, source=source, type=type_source, embedding=embedding)
+                document = Document(page_content=doc, metadata=metadata.dict())
+                chunks.append(document)
+            # chunks.extend(chunk_data(data=documents))
+            total_tokens, cost = calc_embedding_cost(chunks, embedding)
+            a = vector_store.from_documents(chunks,
+                                            embedding=oai_embeddings,
+                                            index_name=const.PINECONE_INDEX,
+                                            namespace=namespace,
+                                            text_key=const.PINECONE_TEXT_KEY)
+
+        else:
+            metadata = MetadataItem(id=metadata_id, source=source, type=type_source, embedding=embedding)
+            document = Document(page_content=content, metadata=metadata.dict())
+
+            chunks.extend(chunk_data(data=[document]))
+            total_tokens, cost = calc_embedding_cost(chunks, embedding)
+            a = vector_store.from_documents(chunks,
+                                            embedding=oai_embeddings,
+                                            index_name=const.PINECONE_INDEX,
+                                            namespace=namespace,
+                                            text_key=const.PINECONE_TEXT_KEY)
+
+        pinecone_result = PineconeIndexingResult(id=metadata_id, chunks=len(chunks), total_tokens=total_tokens,
+                                                 cost=f"{cost:.6f}")
+    except Exception as ex:
+        pinecone_result = PineconeIndexingResult(id=metadata_id, chunks=len(chunks), total_tokens=total_tokens,
+                                                 status=400,
+                                                 cost=f"{cost:.6f}")
     # {"id": f"{id}", "chunks": f"{len(chunks)}", "total_tokens": f"{total_tokens}", "cost": f"{cost:.6f}"}
     return pinecone_result
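With this commit, add_pc_item reports failures through a status field on the result instead of letting the exception escape, so a caller has to branch on it. A minimal sketch, assuming a FastAPI-style route; the path, payload handling, and the .dict() call on the result are assumptions for illustration, since the exact request model is not shown in this diff:

# Hypothetical wiring, not the repository's actual route.
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

from tilellm.store.pinecone_repository import add_pc_item

app = FastAPI()

@app.post("/api/item")  # invented path
async def index_item(request: Request):
    item = await request.json()  # the real item model is not shown in this diff
    result = await add_pc_item(item)
    if getattr(result, "status", None) == 400:
        # add_pc_item caught a scraping/indexing failure and flagged it.
        return JSONResponse(status_code=400, content=result.dict())
    return JSONResponse(status_code=200, content=result.dict())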

@@ -149,7 +154,7 @@ async def delete_pc_ids_namespace(metadata_id: str, namespace: str):
     pc = pinecone.Pinecone(
         api_key=const.PINECONE_API_KEY
     )
-    # index_host = "https://tilellm-s9kvboq.svc.apw5-4e34-81fa.pinecone.io"#os.environ.get("PINECONE_INDEX_HOST")
+
     host = pc.describe_index(const.PINECONE_INDEX).host
     index = pc.Index(name=const.PINECONE_INDEX, host=host)
     # vector_store = Pinecone.from_existing_index(const.PINECONE_INDEX, )
@@ -201,6 +206,65 @@ async def delete_pc_ids_namespace(metadata_id: str, namespace: str):
         raise ex
 
 
+async def delete_pc_ids_namespace_pod(metadata_id: str, namespace: str):
+    """
+    Delete from pinecone items
+    :param metadata_id:
+    :param namespace:
+    :return:
+    """
+    # FIXME: problem with namespaces whose cardinality exceeds 10000
+
+    import pinecone
+    try:
+        pc = pinecone.Pinecone(
+            api_key=const.PINECONE_API_KEY
+        )
+        # index_host = "https://tilellm-s9kvboq.svc.apw5-4e34-81fa.pinecone.io"#os.environ.get("PINECONE_INDEX_HOST")
+        host = pc.describe_index(const.PINECONE_INDEX).host
+        index = pc.Index(name=const.PINECONE_INDEX, host=host)
+        # vector_store = Pinecone.from_existing_index(const.PINECONE_INDEX, )
+        describe = index.describe_index_stats()
+        logger.debug(describe)
+        namespaces = describe.get("namespaces", {})
+        total_vectors = 1
+        # batch_size = 100
+
+        import numpy as np
+        import time
+        # Next, we'll create a random vector to use as a query.
+        # query_vector = np.random.uniform(-1, 1, size=1536).tolist()
+        query_vector = [0] * 1536
+        # Now, cycle through the index, and add a slight sleep time in between batches to make sure we don't overwhelm the index.
+        deletes = []
+        deleted = 0
+        batch_size = 100
+        results = index.query(vector=query_vector,
+                              filter={"id": {"$eq": metadata_id}},
+                              top_k=batch_size,
+                              namespace=namespace,
+                              include_values=False,
+                              include_metadata=False)
+        while len(results['matches']) > 0:
+            ids = [i['id'] for i in results['matches']]
+            index.delete(ids=ids, namespace=namespace)
+            deleted += len(ids)
+            time.sleep(1.50)
+            results = index.query(vector=query_vector,
+                                  filter={"id": {"$eq": metadata_id}},
+                                  top_k=batch_size,
+                                  namespace=namespace,
+                                  include_values=False,
+                                  include_metadata=False)
+
+        logger.info(f"{deleted}")
+        return deleted
+
+    except Exception as ex:
+        # logger.error(ex)
+        raise ex
+
+
 async def get_pc_ids_namespace(metadata_id: str, namespace: str):
     """
     Get from Pinecone all items from namespace given document id
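The new pod variant drains matching vectors in top_k-sized query batches, deleting by id and sleeping between rounds until no matches remain. A hypothetical invocation (the document id and namespace are invented values):

# Hypothetical usage sketch for the batched delete added above.
import asyncio

from tilellm.store.pinecone_repository import delete_pc_ids_namespace_pod

async def purge() -> None:
    deleted = await delete_pc_ids_namespace_pod(metadata_id="doc-123",
                                                namespace="tenant-a")
    print(f"deleted {deleted} vectors")  # the function returns the delete count

asyncio.run(purge())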
33 changes: 18 additions & 15 deletions tilellm/tools/document_tool_simple.py
@@ -8,21 +8,24 @@
 
 # "https://help.tiledesk.com/mychatbots/articles/il-pnrr-per-la-ricerca-e-linnovazione/"
 def get_content_by_url(url: str, scrape_type: int):
-    urls = [url]
-    if scrape_type == 0:
-        loader = UnstructuredURLLoader(
-            urls=urls, mode="elements", strategy="fast",
-        )
-    else:
-        loader = UnstructuredURLLoader(
-            urls=urls, mode="single"
-        )
-    docs = loader.load()
-
-    # from pprint import pprint
-    # pprint(docs)
-
-    return docs
+    try:
+        urls = [url]
+        if scrape_type == 0:
+            loader = UnstructuredURLLoader(
+                urls=urls, mode="elements", strategy="fast", continue_on_failure=False
+            )
+        else:
+            loader = UnstructuredURLLoader(
+                urls=urls, mode="single", continue_on_failure=False
+            )
+        docs = loader.load()
+
+        # from pprint import pprint
+        # pprint(docs)
+
+        return docs
+    except Exception as ex:
+        raise ex
 
 def get_content_by_url_with_bs(url:str):
     html = requests.get(url)
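For reference, a small hypothetical driver for the two scrape_type branches above; the URL is a placeholder:

# Hypothetical usage; the URL is a placeholder.
from tilellm.tools.document_tool_simple import get_content_by_url

# scrape_type == 0: mode="elements" with the "fast" strategy (per-element documents).
docs_elements = get_content_by_url("https://example.com/article", scrape_type=0)

# Any other value: mode="single" (one document for the whole page).
docs_single = get_content_by_url("https://example.com/article", scrape_type=1)

# With continue_on_failure=False, an unreachable URL now raises instead of
# being skipped, which is what lets callers map the failure to a 400.
print(len(docs_elements), len(docs_single))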
