feat: support async #12

Merged: 2 commits, Jul 3, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
__pycache__
*.pyc
.venv
.venv*
.DS_Store
venv/
/.vscode
1,322 changes: 903 additions & 419 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "semantic-chunkers"
version = "0.0.6"
version = "0.0.7"
description = "Super advanced chunking methods for AI"
authors = ["Aurelio AI <[email protected]>"]
readme = "README.md"
@@ -16,7 +16,7 @@ regex = "^2023.12.25"
tiktoken = ">=0.7.0,<1.0.0"
matplotlib = { version = "^3.8.3", optional = true}
requests-mock = "^1.12.1"
semantic-router = ">=0.0.20,<0.1.0"
semantic-router = ">=0.0.48,<0.1.0"

[tool.poetry.extras]
stats = ["matplotlib"]
@@ -28,6 +28,7 @@ pytest = "^7.4.3"
pytest-mock = "^3.12.0"
pytest-cov = "^4.1.0"
pytest-xdist = "^3.5.0"
pytest-asyncio = "^0.23.7"
mypy = "^1.7.1"
types-pyyaml = "^6.0.12.12"
types-requests = "^2.31.0"
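
Note: pytest-asyncio joins the dev dependencies so the new async code paths can be covered by async test functions, and the semantic-router floor moves to 0.0.48, presumably so the encoders expose the awaitable `acall` that the chunkers now await. A minimal, self-contained sketch of the mechanism pytest-asyncio enables; `embed_in_batches` and the test name are invented for illustration, and in this repo the awaited call would instead be a chunker's new `acall(...)`:

```python
# Self-contained illustration of why pytest-asyncio is now a dev dependency:
# it lets test functions be declared `async` and `await` coroutines directly.
import asyncio

import pytest


async def embed_in_batches(texts, batch_size=2):
    # Toy stand-in for an async encoder call awaited once per batch.
    embeddings = []
    for i in range(0, len(texts), batch_size):
        await asyncio.sleep(0)  # yield control, as a real network call would
        embeddings.extend([[float(len(t))] for t in texts[i : i + batch_size]])
    return embeddings


@pytest.mark.asyncio
async def test_embed_in_batches_preserves_order_and_length():
    result = await embed_in_batches(["a", "bb", "ccc"])
    assert result == [[1.0], [2.0], [3.0]]
```
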
2 changes: 1 addition & 1 deletion semantic_chunkers/chunkers/base.py
@@ -2,8 +2,8 @@

from colorama import Fore, Style
from pydantic.v1 import BaseModel, Extra

from semantic_router.encoders.base import BaseEncoder

from semantic_chunkers.schema import Chunk
from semantic_chunkers.splitters.base import BaseSplitter

52 changes: 49 additions & 3 deletions semantic_chunkers/chunkers/consecutive.py
@@ -1,11 +1,11 @@
from typing import Any, List
from tqdm.auto import tqdm

import numpy as np

from semantic_router.encoders.base import BaseEncoder
from semantic_chunkers.schema import Chunk
from tqdm.auto import tqdm

from semantic_chunkers.chunkers.base import BaseChunker
from semantic_chunkers.schema import Chunk
from semantic_chunkers.splitters.base import BaseSplitter
from semantic_chunkers.splitters.sentence import RegexSplitter

@@ -58,6 +58,40 @@ def _chunk(self, splits: List[Any], batch_size: int = 64) -> List[Chunk]:
        self.chunks = chunks
        return chunks

    async def _async_chunk(
        self, splits: List[Any], batch_size: int = 64
    ) -> List[Chunk]:
        """Merge splits into chunks using semantic similarity.

        :param splits: splits to be merged into chunks.

        :return: list of chunks.
        """
        split_embeds = []
        num_splits = len(splits)
        for i in tqdm(range(0, num_splits, batch_size)):
            split_embeds.extend(await self.encoder.acall(splits[i : i + batch_size]))
        norm_embeds = split_embeds / np.linalg.norm(split_embeds, axis=1, keepdims=True)
        sim_matrix = np.matmul(norm_embeds, norm_embeds.T)
        chunks = []
        curr_split_start_idx = 0

        for idx in tqdm(range(1, norm_embeds.shape[0])):
            curr_sim_score = sim_matrix[idx - 1][idx]
            if idx < len(sim_matrix) and curr_sim_score < self.score_threshold:
                chunks.append(
                    Chunk(
                        splits=splits[curr_split_start_idx:idx],
                        is_triggered=True,
                        triggered_score=curr_sim_score,
                    )
                )
                curr_split_start_idx = idx
        # append final chunk
        chunks.append(Chunk(splits=splits[curr_split_start_idx:]))
        self.chunks = chunks
        return chunks

    def __call__(self, docs: List[Any]) -> List[List[Chunk]]:
        """Split documents into smaller chunks based on semantic similarity.

@@ -76,3 +110,15 @@ def __call__(self, docs: List[Any]) -> List[List[Chunk]]:
            doc_chunks = self._chunk(splits)
            all_chunks.append(doc_chunks)
        return all_chunks

    async def acall(self, docs: List[Any]) -> List[List[Chunk]]:
        all_chunks = []
        for doc in docs:
            # split the document into sentences (if needed)
            if isinstance(doc, str):
                splits = self._split(doc)
            else:
                splits = doc
            doc_chunks = await self._async_chunk(splits)
            all_chunks.append(doc_chunks)
        return all_chunks
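
For reference, `_async_chunk` above mirrors the synchronous `_chunk`: embeddings are gathered batch by batch via the encoder's awaitable `acall`, L2-normalized so the matrix product yields cosine similarities, and a new chunk starts wherever the similarity between neighbouring splits drops below `score_threshold`. A standalone sketch of that decision rule with a stub async encoder; `fake_acall`, `consecutive_chunks`, and the toy vectors are invented for illustration and are not the library's API:

```python
# Standalone sketch of the consecutive-similarity rule above, using a stub
# async "encoder" in place of semantic-router's real encoders.
import asyncio

import numpy as np


async def fake_acall(texts):
    vectors = {"a": [1.0, 0.0], "b": [0.9, 0.1], "c": [0.0, 1.0]}
    return [vectors[t] for t in texts]


async def consecutive_chunks(splits, score_threshold=0.8, batch_size=2):
    embeds = []
    for i in range(0, len(splits), batch_size):
        embeds.extend(await fake_acall(splits[i : i + batch_size]))
    norm = np.asarray(embeds) / np.linalg.norm(embeds, axis=1, keepdims=True)
    sim = norm @ norm.T
    chunks, start = [], 0
    for idx in range(1, norm.shape[0]):
        if sim[idx - 1][idx] < score_threshold:
            chunks.append(splits[start:idx])  # similarity dropped: close the chunk
            start = idx
    chunks.append(splits[start:])  # final chunk
    return chunks


print(asyncio.run(consecutive_chunks(["a", "b", "c"])))
# -> [['a', 'b'], ['c']]  ("a" and "b" are near-parallel; "c" is orthogonal)
```

The only difference from the synchronous path is where the embeddings come from; the thresholding itself is identical.
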
81 changes: 78 additions & 3 deletions semantic_chunkers/chunkers/cumulative.py
@@ -1,11 +1,11 @@
from typing import Any, List
from tqdm.auto import tqdm

import numpy as np

from semantic_router.encoders import BaseEncoder
from semantic_chunkers.schema import Chunk
from tqdm.auto import tqdm

from semantic_chunkers.chunkers.base import BaseChunker
from semantic_chunkers.schema import Chunk
from semantic_chunkers.splitters.base import BaseSplitter
from semantic_chunkers.splitters.sentence import RegexSplitter

@@ -76,6 +76,62 @@ def _chunk(self, splits: List[Any], batch_size: int = 64) -> List[Chunk]:

        return chunks

    async def _async_chunk(
        self, splits: List[Any], batch_size: int = 64
    ) -> List[Chunk]:
        """Merge splits into chunks using semantic similarity.

        :param splits: splits to be merged into chunks.

        :return: list of chunks.
        """
        chunks = []
        curr_chunk_start_idx = 0
        num_splits = len(splits)

        for idx in tqdm(range(num_splits)):
            if idx + 1 < num_splits:  # Ensure there is a next document to compare with.
                if idx == 0:
                    # On the first iteration, compare the
                    # first document directly to the second.
                    curr_chunk_docs = splits[idx]
                else:
                    # For subsequent iterations, compare cumulative
                    # documents up to the current one with the next.
                    curr_chunk_docs = "\n".join(splits[curr_chunk_start_idx : idx + 1])
                next_doc = splits[idx + 1]

                # Embedding and similarity calculation remains the same.
                curr_chunk_docs_embed_result = await self.encoder.acall(
                    [curr_chunk_docs]
                )
                next_doc_embed_result = await self.encoder.acall([next_doc])
                curr_chunk_docs_embed = curr_chunk_docs_embed_result[0]
                next_doc_embed = next_doc_embed_result[0]

                curr_sim_score = np.dot(curr_chunk_docs_embed, next_doc_embed) / (
                    np.linalg.norm(curr_chunk_docs_embed)
                    * np.linalg.norm(next_doc_embed)
                )
                # Decision to chunk based on similarity score.
                if curr_sim_score < self.score_threshold:
                    chunks.append(
                        Chunk(
                            splits=list(splits[curr_chunk_start_idx : idx + 1]),
                            is_triggered=True,
                            triggered_score=curr_sim_score,
                        )
                    )
                    curr_chunk_start_idx = (
                        idx + 1
                    )  # Update the start index for the next segment.

        # Add the last segment after the loop.
        if curr_chunk_start_idx < num_splits:
            chunks.append(Chunk(splits=list(splits[curr_chunk_start_idx:])))

        return chunks

    def __call__(self, docs: List[str]) -> List[List[Chunk]]:
        """Split documents into smaller chunks based on semantic similarity.

@@ -94,3 +150,22 @@ def __call__(self, docs: List[str]) -> List[List[Chunk]]:
            doc_chunks = self._chunk(splits)
            all_chunks.append(doc_chunks)
        return all_chunks

    async def acall(self, docs: List[str]) -> List[List[Chunk]]:
        """Split documents into smaller chunks based on semantic similarity.

        :param docs: list of text documents to be chunked; to chunk a single
            document, pass it as a list with a single element.

        :return: list of lists containing the chunks for each document.
        """
        all_chunks = []
        for doc in docs:
            # split the document into sentences (if needed)
            if isinstance(doc, str):
                splits = self._split(doc)
            else:
                splits = doc
            doc_chunks = await self._async_chunk(splits)
            all_chunks.append(doc_chunks)
        return all_chunks
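
The cumulative variant embeds the running concatenation of the current chunk and compares it to the next split with plain cosine similarity, dot(u, v) / (||u|| * ||v||); a score below `score_threshold` closes the chunk. A tiny worked example of that check, using toy vectors invented for illustration:

```python
# Toy illustration of the cosine-similarity check used in _async_chunk above.
import numpy as np

curr_chunk_embed = np.array([1.0, 2.0, 0.0])  # embedding of the accumulated chunk
next_split_embed = np.array([2.0, 4.0, 0.1])  # embedding of the next split

score = np.dot(curr_chunk_embed, next_split_embed) / (
    np.linalg.norm(curr_chunk_embed) * np.linalg.norm(next_split_embed)
)
print(f"{score:.4f}")  # ~0.9998 -> above a typical threshold, so keep accumulating
```

Unlike the consecutive chunker, which embeds every split once up front, this cumulative approach re-embeds the growing chunk on each step, so it awaits the encoder once or twice per split.
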