Skip to content

Commit

Permalink
Merge pull request #129 from knmlprz/chunking
Browse files Browse the repository at this point in the history
Documents with chunks
  • Loading branch information
TheJimmyNowak authored May 22, 2024
2 parents af9a03e + bf9e69f commit c83ac31
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 8 deletions.
36 changes: 36 additions & 0 deletions api/chunks/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from documents.models import Document
from pydantic import BaseModel
from itertools import batched


class ChunkData(BaseModel):
    """Plain data describing one chunk of a document's text.

    Instances are produced by ``split_document_into_chunks``; the field
    meanings below follow from how that function fills them in.
    """

    # The characters that make up this chunk.
    text: str
    # Zero-based position of the chunk within its document.
    chunk_idx: int
    # Offset in the document text of the chunk's first character.
    start_char: int
    # Offset in the document text of the chunk's last character (inclusive).
    end_char: int
    # ``id`` of the document the chunk was cut from.
    document_idx: int


def split_document_into_chunks(
    document: Document, chunk_size: int = 100
) -> list[ChunkData]:
    """Cut ``document.text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        document: Object providing the ``text`` to split and the ``id`` that
            is recorded on every chunk.
        chunk_size: Maximum number of characters per chunk. The last chunk
            may be shorter.

    Returns:
        One ``ChunkData`` per piece, in document order. ``start_char`` and
        ``end_char`` are offsets into ``document.text``; ``end_char`` is
        inclusive. An empty text yields an empty list.
    """
    pieces = []
    offset = 0

    for index, letters in enumerate(batched(document.text, chunk_size)):
        length = len(letters)
        piece = ChunkData(
            text="".join(letters),
            chunk_idx=index,
            start_char=offset,
            # Inclusive end: offset of the last character in this piece.
            end_char=offset + length - 1,
            document_idx=document.id,
        )
        pieces.append(piece)
        offset += length

    return pieces
1 change: 1 addition & 0 deletions api/chunks/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
delete_chunk_controller,
)


from ninja import Router
from ninja.pagination import LimitOffsetPagination, paginate

Expand Down
33 changes: 25 additions & 8 deletions api/documents/controllers.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,25 @@
from http import HTTPStatus

from django.db import transaction
from chunks.utils import split_document_into_chunks
from chunks.models import Chunk
from documents.models import Document
from documents.schemas import DocumentIn, DocumentOut


def create_document_controller(payload: DocumentIn) -> tuple[HTTPStatus, DocumentOut]:
    """Create a document together with the chunks of its text.

    The document and its chunks are written inside one atomic transaction,
    so a failure while creating the chunks also rolls back the document.

    Args:
        payload: Input data whose fields are passed to the ``Document``
            constructor.

    Returns:
        A ``(HTTPStatus.CREATED, document)`` pair with the saved document.

    Raises:
        django.core.exceptions.ValidationError: If ``full_clean`` rejects
            the document.
    """
    with transaction.atomic():
        document = Document(**payload.dict())
        document.full_clean()
        document.save()

        chunks = split_document_into_chunks(document, 100)
        # ChunkData is a pydantic model, not a mapping, so it must be turned
        # into a dict before it can be unpacked as keyword arguments.
        # NOTE(review): assumes Chunk accepts the ChunkData field names as
        # constructor kwargs - confirm against chunks.models.
        chunk_instances = [Chunk(**chunk_data.dict()) for chunk_data in chunks]
        Chunk.objects.bulk_create(chunk_instances)

    return HTTPStatus.CREATED, document


Expand All @@ -21,11 +33,16 @@ def retrieve_document_controller(id: int) -> DocumentOut:


def update_document_controller(payload: DocumentIn, id: int) -> DocumentOut:
    """Update a document and regenerate the chunks of its text.

    The field update, the removal of the old chunks and the creation of the
    new ones all happen inside one atomic transaction.

    Args:
        payload: Input data; every field is copied onto the document.
        id: Primary key of the document to update.

    Returns:
        The updated document.

    Raises:
        Document.DoesNotExist: If no document has the given ``id``.
        django.core.exceptions.ValidationError: If ``full_clean`` rejects
            the updated document.
    """
    with transaction.atomic():
        # QuerySet.update() returns the number of affected rows, not a model
        # instance, so load the instance and assign the fields on it. This
        # also lets full_clean() validate the new values before saving.
        document = Document.objects.get(id=id)
        for attr, value in payload.dict().items():
            setattr(document, attr, value)
        document.full_clean()
        document.save()

        # Remove the old chunks and generate new ones from the updated text.
        # NOTE(review): assumes Chunk.document_idx refers to the owning
        # document (it is filled from document.id when chunks are built) -
        # confirm against chunks.models.
        Chunk.objects.filter(document_idx=document.id).delete()
        chunks = split_document_into_chunks(document, 100)
        # ChunkData is a pydantic model, not a mapping, so convert it to a
        # dict before unpacking it as keyword arguments.
        chunk_instances = [Chunk(**chunk_data.dict()) for chunk_data in chunks]
        Chunk.objects.bulk_create(chunk_instances)

    return document


Expand Down

0 comments on commit c83ac31

Please sign in to comment.