From 525890beeac4e1f7c144516f82d8733e66c5987b Mon Sep 17 00:00:00 2001 From: Umar Butler Date: Sun, 2 Jun 2024 21:11:59 +1000 Subject: [PATCH] Added an optional progress bar. --- CHANGELOG.md | 5 +++++ README.md | 6 +++--- pyproject.toml | 3 ++- src/semchunk/semchunk.py | 13 ++++++++++--- tests/test_semchunk.py | 5 ++++- 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea38019..5a7a8da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ ## Changelog 🔄 All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.0.0] - 2024-06-02 +### Added +- Added a `progress` argument to the chunker returned by `chunkerify()` that, when set to `True` and multiple texts are passed, displays a progress bar. + ## [0.3.2] - 2024-06-01 ### Fixed - Fixed a bug where a `DivisionByZeroError` would be raised where a token counter returned zero tokens when called from `merge_splits()`, courtesy of [@jcobol](https://github.com/jcobol) ([#5](https://github.com/umarbutler/semchunk/pull/5)) ([7fd64eb](https://github.com/umarbutler/semchunk/pull/5/commits/7fd64eb8cf51f45702c59f43795be9a00c7d0d17)), fixing [#4](https://github.com/umarbutler/semchunk/issues/4). @@ -56,6 +60,7 @@ All notable changes to `semchunk` will be documented here. This project adheres ### Added - Added the `chunk()` function, which splits text into semantically meaningful chunks of a specified size as determined by a provided token counter. 
+[1.0.0]: https://github.com/umarbutler/semchunk/compare/v0.3.2...v1.0.0 [0.3.2]: https://github.com/umarbutler/semchunk/compare/v0.3.1...v0.3.2 [0.3.1]: https://github.com/umarbutler/semchunk/compare/v0.3.0...v0.3.1 [0.3.0]: https://github.com/umarbutler/semchunk/compare/v0.2.4...v0.3.0 diff --git a/README.md b/README.md index 5fbb8d2..a4bfe77 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # semchunk - + `semchunk` is a fast and lightweight pure Python library for splitting text into semantically meaningful chunks. @@ -35,7 +35,7 @@ chunker = semchunk.chunkerify('umarbutler/emubert', chunk_size) or \ # The resulting `chunker` can take and chunk a single text or a list of texts, returning a list of # chunks or a list of lists of chunks, respectively. assert chunker(text) == ['The quick', 'brown', 'fox', 'jumps', 'over the', 'lazy', 'dog.'] -assert chunker([text]) == [['The quick', 'brown', 'fox', 'jumps', 'over the', 'lazy', 'dog.']] +assert chunker([text], progress = True) == [['The quick', 'brown', 'fox', 'jumps', 'over the', 'lazy', 'dog.']] ``` ### Chunkerify @@ -59,7 +59,7 @@ def chunkerify( `memoize` flags whether to memoize the token counter. It defaults to `True`. -This function returns a callable that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts. +This function returns a callable that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts. 
The callable can also be passed a `progress` argument which, if set to `True` and multiple texts are passed, will display a progress bar. ### Chunk ```python diff --git a/pyproject.toml b/pyproject.toml index 8cb1389..77784c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "semchunk" -version = "0.3.2" +version = "1.0.0" authors = [ {name="Umar Butler", email="umar@umar.au"}, ] @@ -45,6 +45,7 @@ classifiers = [ "Typing :: Typed" ] dependencies = [ + "tqdm", ] [project.urls] diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py index 8873280..7c024df 100644 --- a/src/semchunk/semchunk.py +++ b/src/semchunk/semchunk.py @@ -9,6 +9,8 @@ from itertools import accumulate from contextlib import suppress +from tqdm import tqdm + if TYPE_CHECKING: import tiktoken import tokenizers @@ -242,18 +244,23 @@ def faster_token_counter(text: str) -> int: token_counter = _memoized_token_counters.setdefault(token_counter, cache(token_counter)) # Construct and return the chunker. - def chunker(text_or_texts: str | Sequence[str]) -> list[str] | list[list[str]]: + def chunker(text_or_texts: str | Sequence[str], progress: bool = False) -> list[str] | list[list[str]]: """Split text or texts into semantically meaningful chunks of a specified size as determined by the provided tokenizer or token counter. Args: text_or_texts (str | Sequence[str]): The text or texts to be chunked. 
Returns: - list[str] | list[list[str]]: If a single text has been provided, a list of chunks up to `chunk_size`-tokens-long, with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.""" + list[str] | list[list[str]]: If a single text has been provided, a list of chunks up to `chunk_size`-tokens-long, with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts. + progress (bool, optional): Whether to display a progress bar when chunking multiple texts. Defaults to `False`.""" if isinstance(text_or_texts, str): return chunk(text_or_texts, chunk_size, token_counter, memoize = False) - return [chunk(text, chunk_size, token_counter, memoize = False) for text in text_or_texts] + if progress: + return [chunk(text, chunk_size, token_counter, memoize = False) for text in tqdm(text_or_texts)] + + else: + return [chunk(text, chunk_size, token_counter, memoize = False) for text in text_or_texts] return chunker diff --git a/tests/test_semchunk.py b/tests/test_semchunk.py index 3d5f4e7..d8170b2 100644 --- a/tests/test_semchunk.py +++ b/tests/test_semchunk.py @@ -82,4 +82,7 @@ def tiktoken_token_counter(text: str) -> int: except ValueError: worked = True - assert worked \ No newline at end of file + assert worked + + # Try enabling a progress bar. + chunker(['ThisIs\tATest.', 'ThisIs\tATest.'], progress = True) \ No newline at end of file