Refactor hash method to eliminate redundant file loading
d4rkc0de committed Sep 22, 2024
1 parent 03db010 commit 74968ff
Showing 2 changed files with 17 additions and 6 deletions.
21 changes: 16 additions & 5 deletions backend/app/run.py
@@ -5,6 +5,7 @@
 import os
 import logging
 from pathlib import Path
+import hashlib
 
 from .database import SQLiteDB
 from .settings import CustomFormatter
@@ -23,11 +24,12 @@
 
 async def summarize_document(doc: Document):
     logger.info(f"Processing file {doc.metadata['file_path']}")
-    if db.is_file_exist(doc.metadata['file_path'], doc.hash):
+    doc_hash = get_file_hash(doc.metadata['file_path'])
+    if db.is_file_exist(doc.metadata['file_path'], doc_hash):
         summary = db.get_file_summary(doc.metadata['file_path'])
     else:
         summary = await model.summarize_document_api(doc.text)
-        db.insert_file_summary(doc.metadata['file_path'], doc.hash, summary)
+        db.insert_file_summary(doc.metadata['file_path'], doc_hash, summary)
     return {
         "file_path": doc.metadata['file_path'],
         "summary": summary
@@ -36,11 +38,12 @@ async def summarize_document(doc: Document):
 
 async def summarize_image_document(doc: ImageDocument):
     logger.info(f"Processing image {doc.image_path}")
-    if db.is_file_exist(doc.image_path, doc.hash):
+    image_hash = get_file_hash(doc.image_path)
+    if db.is_file_exist(doc.image_path, image_hash):
         summary = db.get_file_summary(doc.image_path)
     else:
         summary = await model.summarize_image_api(image_path=doc.image_path)
-        db.insert_file_summary(doc.image_path, doc.hash, summary)
+        db.insert_file_summary(doc.image_path, image_hash, summary)
     return {
         "file_path": doc.image_path,
         "summary": summary
@@ -129,11 +132,19 @@ def update_file(root_path, item):
         os.makedirs(dst_dir)
     if os.path.isfile(src_file):
         shutil.move(src_file, dst_file)
-        new_hash = SimpleDirectoryReader(input_files=[dst_file]).load_data()[0].hash
+        new_hash = get_file_hash(dst_file)
         db.update_file(src_file, dst_file, new_hash)
 
 
 async def search_files(root_path: str, recursive: bool, required_exts: list, search_query: str):
     summaries = await get_dir_summaries(root_path, recursive, required_exts)
     files = await model.search_files_api(summaries, search_query)
     return files
+
+
+def get_file_hash(file_path):
+    hash_func = hashlib.new('sha256')
+    with open(file_path, 'rb') as f:
+        while chunk := f.read(8192):
+            hash_func.update(chunk)
+    return hash_func.hexdigest()
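
For reference: the new get_file_hash helper fingerprints a file by streaming it in 8 KiB chunks, so the code no longer builds a full SimpleDirectoryReader document (loading the whole file) just to read its .hash. Below is a minimal standalone sketch of the same pattern; the function name and sample path are illustrative, not part of the commit, and it assumes Python 3.8+ for the := operator:

import hashlib

def sha256_of_file(path: str, chunk_size: int = 8192) -> str:
    # Read fixed-size chunks so memory use stays constant regardless
    # of file size -- the same streaming pattern as get_file_hash.
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        while chunk := f.read(chunk_size):
            h.update(chunk)
    return h.hexdigest()

if __name__ == '__main__':
    # Hypothetical usage; any local file path works here.
    print(sha256_of_file('backend/requirements.txt'))

On Python 3.11+, hashlib.file_digest(f, 'sha256') performs the same chunked read internally and could replace the manual loop.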
2 changes: 1 addition & 1 deletion backend/requirements.txt
@@ -4,7 +4,7 @@ openai
 pydantic-settings
 llama-index
 # needed by llama index
-git+https://github.com/openai/whisper.git
+git+https://github.com/openai/whisper.git # heavy library, remove if you don't want to treat media files
 pydub
 docx2txt
 nbconvert
