
Commit

chore: lint
simjak committed Jul 3, 2024
1 parent b6e8b0d commit d3d4b16
Showing 7 changed files with 29 additions and 27 deletions.
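The changes below reorder import blocks and rewrap one over-long signature per file — the typical output of an import sorter plus a code formatter. As a minimal sketch of reproducing such a pass programmatically, assuming the tools are isort and black (an inference from the shape of the diff; the commit message names no tools):

```python
# Hypothetical reproduction of this lint pass. The isort/black choice is an
# assumption inferred from the style of the changes, not stated in the commit.
import black
import isort

with open("semantic_chunkers/chunkers/consecutive.py") as f:
    source = f.read()

sorted_source = isort.code(source)  # regroup and alphabetize the imports
formatted = black.format_str(sorted_source, mode=black.Mode())  # rewrap long lines
print(formatted)
```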
2 changes: 1 addition & 1 deletion semantic_chunkers/chunkers/base.py
@@ -2,8 +2,8 @@
 
 from colorama import Fore, Style
 from pydantic.v1 import BaseModel, Extra
-
 from semantic_router.encoders.base import BaseEncoder
+
 from semantic_chunkers.schema import Chunk
 from semantic_chunkers.splitters.base import BaseSplitter
13 changes: 7 additions & 6 deletions semantic_chunkers/chunkers/consecutive.py
@@ -1,11 +1,11 @@
 from typing import Any, List
-from tqdm.auto import tqdm
 
 import numpy as np
 
 from semantic_router.encoders.base import BaseEncoder
-from semantic_chunkers.schema import Chunk
+from tqdm.auto import tqdm
 
 from semantic_chunkers.chunkers.base import BaseChunker
+from semantic_chunkers.schema import Chunk
 from semantic_chunkers.splitters.base import BaseSplitter
 from semantic_chunkers.splitters.sentence import RegexSplitter
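Reading only the + side of the hunk above, the post-lint import block of consecutive.py becomes:

```python
from typing import Any, List

import numpy as np

from semantic_router.encoders.base import BaseEncoder
from tqdm.auto import tqdm

from semantic_chunkers.chunkers.base import BaseChunker
from semantic_chunkers.schema import Chunk
from semantic_chunkers.splitters.base import BaseSplitter
from semantic_chunkers.splitters.sentence import RegexSplitter
```

Standard-library imports come first, third-party packages next, and the local semantic_chunkers modules last.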

@@ -58,7 +58,9 @@ def _chunk(self, splits: List[Any], batch_size: int = 64) -> List[Chunk]:
         self.chunks = chunks
         return chunks
 
-    async def _async_chunk(self, splits: List[Any], batch_size: int = 64) -> List[Chunk]:
+    async def _async_chunk(
+        self, splits: List[Any], batch_size: int = 64
+    ) -> List[Chunk]:
         """Merge splits into chunks using semantic similarity.
         :param splits: splits to be merged into chunks.
@@ -90,7 +92,6 @@ async def _async_chunk(self, splits: List[Any], batch_size: int = 64) -> List[Chunk]:
         self.chunks = chunks
         return chunks
 
-
     def __call__(self, docs: List[Any]) -> List[List[Chunk]]:
         """Split documents into smaller chunks based on semantic similarity.
@@ -120,4 +121,4 @@ async def acall(self, docs: List[Any]) -> List[List[Chunk]]:
                 splits = doc
             doc_chunks = await self._async_chunk(splits)
             all_chunks.append(doc_chunks)
-        return all_chunks
\ No newline at end of file
+        return all_chunks
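The acall method shown above takes a list of documents and returns one list of Chunk objects per document. A minimal usage sketch (OpenAIEncoder and the bare constructor are assumptions; any semantic_router BaseEncoder subclass should slot in the same way):

```python
# Hypothetical usage sketch; encoder choice and constructor defaults are assumed.
import asyncio

from semantic_router.encoders import OpenAIEncoder

from semantic_chunkers import ConsecutiveChunker


async def main() -> None:
    encoder = OpenAIEncoder()  # expects an OpenAI API key in the environment
    chunker = ConsecutiveChunker(encoder=encoder)
    chunks = await chunker.acall(docs=["First document text.", "Second one."])
    print(len(chunks))  # one List[Chunk] per input document


asyncio.run(main())
```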
12 changes: 7 additions & 5 deletions semantic_chunkers/chunkers/cumulative.py
@@ -1,11 +1,11 @@
 from typing import Any, List
-from tqdm.auto import tqdm
 
 import numpy as np
 
 from semantic_router.encoders import BaseEncoder
-from semantic_chunkers.schema import Chunk
+from tqdm.auto import tqdm
 
 from semantic_chunkers.chunkers.base import BaseChunker
+from semantic_chunkers.schema import Chunk
 from semantic_chunkers.splitters.base import BaseSplitter
 from semantic_chunkers.splitters.sentence import RegexSplitter

@@ -76,7 +76,9 @@ def _chunk(self, splits: List[Any], batch_size: int = 64) -> List[Chunk]:
 
         return chunks
 
-    async def _async_chunk(self, splits: List[Any], batch_size: int = 64) -> List[Chunk]:
+    async def _async_chunk(
+        self, splits: List[Any], batch_size: int = 64
+    ) -> List[Chunk]:
         """Merge splits into chunks using semantic similarity.
         :param splits: splits to be merged into chunks.
@@ -166,4 +168,4 @@ async def acall(self, docs: List[str]) -> List[List[Chunk]]:
                 splits = doc
             doc_chunks = await self._async_chunk(splits)
             all_chunks.append(doc_chunks)
-        return all_chunks
\ No newline at end of file
+        return all_chunks
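Same trailing-newline fix as in consecutive.py. For contrast with the async sketch above, the synchronous entry point can be called directly (its exact signature is not shown in this diff, so this is an assumption modeled on consecutive.py's __call__):

```python
# Hypothetical synchronous usage; constructor and call signature are assumed.
from semantic_router.encoders import OpenAIEncoder

from semantic_chunkers import CumulativeChunker

encoder = OpenAIEncoder()
chunker = CumulativeChunker(encoder=encoder)
chunks = chunker(docs=["A longer document that will be split and re-merged."])
```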
11 changes: 4 additions & 7 deletions semantic_chunkers/chunkers/statistical.py
@@ -2,16 +2,15 @@
 from typing import Any, List
 
 import numpy as np
 
 from semantic_router.encoders.base import BaseEncoder
-from semantic_chunkers.schema import Chunk
+from tqdm.auto import tqdm
 
 from semantic_chunkers.chunkers.base import BaseChunker
+from semantic_chunkers.schema import Chunk
 from semantic_chunkers.splitters.base import BaseSplitter
 from semantic_chunkers.splitters.sentence import RegexSplitter
-from semantic_chunkers.utils.text import tiktoken_length
 from semantic_chunkers.utils.logger import logger
-
-from tqdm.auto import tqdm
+from semantic_chunkers.utils.text import tiktoken_length
 
 
 @dataclass
@@ -236,7 +235,6 @@ def __call__(self, docs: List[str], batch_size: int = 64) -> List[List[Chunk]]:
                 raise ValueError("The document must be a string.")
         return all_chunks
 
-
     async def acall(self, docs: List[str], batch_size: int = 64) -> List[List[Chunk]]:
         """Split documents into smaller chunks based on semantic similarity.
@@ -265,7 +263,6 @@ async def acall(self, docs: List[str], batch_size: int = 64) -> List[List[Chunk]]:
                 raise ValueError("The document must be a string.")
         return all_chunks
 
-
     def _encode_documents(self, docs: List[str]) -> np.ndarray:
         """
         Encodes a list of documents into embeddings. If the number of documents
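The truncated docstring above indicates that _encode_documents batches its input before encoding. A generic sketch of that batching pattern (an illustration only, not the library's actual implementation):

```python
from typing import Callable, List

import numpy as np


def encode_in_batches(
    encode: Callable[[List[str]], List[List[float]]],
    docs: List[str],
    batch_size: int = 64,
) -> np.ndarray:
    """Encode docs in fixed-size batches and stack the embeddings."""
    embeddings: List[List[float]] = []
    for i in range(0, len(docs), batch_size):
        embeddings.extend(encode(docs[i : i + batch_size]))
    return np.array(embeddings)
```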
1 change: 0 additions & 1 deletion semantic_chunkers/splitters/__init__.py
@@ -1,7 +1,6 @@
 from semantic_chunkers.splitters.base import BaseSplitter
 from semantic_chunkers.splitters.sentence import RegexSplitter
 
-
 __all__ = [
     "BaseSplitter",
     "RegexSplitter",
3 changes: 2 additions & 1 deletion semantic_chunkers/splitters/sentence.py
@@ -1,6 +1,7 @@
-import regex
 from typing import List
 
+import regex
+
 from semantic_chunkers.splitters.base import BaseSplitter
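The splitter itself is unchanged; only the import order moved. A minimal sketch of how RegexSplitter is used (its call signature is assumed from BaseSplitter, not shown in this diff):

```python
# Hypothetical usage; the __call__ signature is an assumption.
from semantic_chunkers.splitters.sentence import RegexSplitter

splitter = RegexSplitter()
sentences = splitter("Chunkers merge splits. Splitters produce them.")
print(sentences)  # expected: one string per sentence
```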
14 changes: 8 additions & 6 deletions tests/unit/test_splitters.py
@@ -2,15 +2,16 @@
 
 import numpy as np
 import pytest
-
 from semantic_router.encoders.base import BaseEncoder
 from semantic_router.encoders.cohere import CohereEncoder
-from semantic_chunkers import BaseChunker
-from semantic_chunkers import BaseSplitter
-from semantic_chunkers import ConsecutiveChunker
-from semantic_chunkers import CumulativeChunker
-from semantic_chunkers import StatisticalChunker
 
+from semantic_chunkers import (
+    BaseChunker,
+    BaseSplitter,
+    ConsecutiveChunker,
+    CumulativeChunker,
+    StatisticalChunker,
+)
 
 
 def test_consecutive_sim_splitter():
@@ -112,6 +113,7 @@ def test_cumulative_sim_splitter():
     # The expected outcome needs to match the logic defined in your mock_encoder's side_effect
     assert len(splits) == 5, f"{len(splits)}"
 
+
 @pytest.mark.asyncio
 async def test_async_cumulative_sim_splitter():
     # Mock the BaseEncoder
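The fragments above mention mocking BaseEncoder and steering the expected chunk count through the mock's side_effect. A sketch of that pattern (names and embedding size are assumptions based on the visible comments; the real fixtures may differ):

```python
# Hypothetical mock setup for the tests above.
from unittest.mock import Mock

from semantic_router.encoders.base import BaseEncoder

mock_encoder = Mock(spec=BaseEncoder)
# Return one deterministic 3-dim embedding per input document.
mock_encoder.side_effect = lambda docs: [[0.1, 0.2, 0.3] for _ in docs]

embeddings = mock_encoder(["doc one", "doc two"])
assert len(embeddings) == 2
```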
