diff --git a/README.md b/README.md index 2260b43..66e05ed 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ -# Chunker: Flexible Text Chunking for Elixir +# TextChunker: Flexible Text Chunking for Elixir ## About -Chunker is an Elixir library for segmenting large text documents, optimizing them for efficient embedding and storage within vector databases for use in resource augmented generation (RAG) applications. +TextChunker is an Elixir library for segmenting large text documents, optimizing them for efficient embedding and storage within vector databases for use in retrieval-augmented generation (RAG) applications. It prioritizes context preservation and adaptability, and is therefore ideal for analytical, NLP, and other applications where understanding the relationship between text segments is crucial. @@ -23,12 +23,12 @@ Fill the gap in the Elixir ecosystem for a good semantic text chunker, and give ## Installation -Add Chunker to your mix.exs: +Add TextChunker to your mix.exs: ```elixir def deps do [ - {:chunker, "~> 0.1.1"} + {:text_chunker, "~> 0.1.1"} ] end ``` @@ -41,12 +41,6 @@ mix deps.get ## Usage -Begin by aliasing Chunker: - -```elixir -alias Chunker.TextChunker -``` - Chunk your text using the `split` function: ```elixir @@ -96,8 +90,6 @@ You can use Recursive Chunk to split text up into any chunk size you wish, with ## Examples ```elixir -alias Chunker.TextChunker - text = "This is a sample text. It will be split into properly-sized chunks using the Chunker library." opts = [chunk_size: 50, chunk_overlap: 5, format: :plaintext, strategy: &Chunker.Strategies.RecursiveChunk.split/2] @@ -134,4 +126,4 @@ Special thanks to the creators of langchain for their initial approach to recurs ## License -Chunker is released under the MIT License. See the [LICENSE](LICENSE) file for details. +TextChunker is released under the MIT License. See the [LICENSE](LICENSE) file for details. 
diff --git a/lib/chunker/text_chunker.ex b/lib/text_chunker.ex similarity index 89% rename from lib/chunker/text_chunker.ex rename to lib/text_chunker.ex index 746e869..c1fc6e1 100644 --- a/lib/chunker/text_chunker.ex +++ b/lib/text_chunker.ex @@ -1,4 +1,4 @@ -defmodule Chunker.TextChunker do +defmodule TextChunker do @moduledoc """ Provides a high-level interface for text chunking, employing a configurable splitting strategy (defaults to recursive splitting). Manages options and coordinates the process, tracking chunk metadata. @@ -8,7 +8,7 @@ defmodule Chunker.TextChunker do * **Size and Overlap Control:** Provides options for `:chunk_size` and `:chunk_overlap`. * **Metadata Tracking:** Generates `Chunk` structs containing byte range information. """ - alias Chunker.Strategies.RecursiveChunk + alias TextChunker.Strategies.RecursiveChunk @default_opts [ chunk_size: 2000, @@ -31,11 +31,11 @@ defmodule Chunker.TextChunker do ```elixir iex> long_text = "This is a very long text that needs to be split into smaller pieces for easier handling." - iex> Chunker.TextChunker.split(long_text) + iex> TextChunker.split(long_text) # => [%Chunk{}, %Chunk{}, ...] ``` - iex> Chunker.TextChunker.split(long_text, chunk_size: 10, chunk_overlap: 3) + iex> TextChunker.split(long_text, chunk_size: 10, chunk_overlap: 3) # => Generates many smaller chunks with significant overlap """ diff --git a/lib/chunker/chunk.ex b/lib/text_chunker/chunk.ex similarity index 94% rename from lib/chunker/chunk.ex rename to lib/text_chunker/chunk.ex index e06563d..0763afd 100644 --- a/lib/chunker/chunk.ex +++ b/lib/text_chunker/chunk.ex @@ -1,4 +1,4 @@ -defmodule Chunker.Chunk do +defmodule TextChunker.Chunk do @moduledoc """ Defines the `Chunk` struct, representing a contiguous block of text extracted during the splitting process. It stores the text content along with its corresponding byte range within the original input text. 
""" diff --git a/lib/chunker/strategies/behaviour.ex b/lib/text_chunker/strategies/behaviour.ex similarity index 72% rename from lib/chunker/strategies/behaviour.ex rename to lib/text_chunker/strategies/behaviour.ex index dfd53af..0d00e19 100644 --- a/lib/chunker/strategies/behaviour.ex +++ b/lib/text_chunker/strategies/behaviour.ex @@ -1,8 +1,8 @@ -defmodule Chunker.ChunkerBehaviour do +defmodule TextChunker.ChunkerBehaviour do @moduledoc """ Defines the contract that must be implemented for all text splitting strategies. """ - alias Chunker.Chunk + alias TextChunker.Chunk @callback split(text :: binary(), opts :: [keyword()]) :: [Chunk.t()] end diff --git a/lib/chunker/strategies/recursive_chunk/recursive_chunk.ex b/lib/text_chunker/strategies/recursive_chunk/recursive_chunk.ex similarity index 96% rename from lib/chunker/strategies/recursive_chunk/recursive_chunk.ex rename to lib/text_chunker/strategies/recursive_chunk/recursive_chunk.ex index b96c0bb..07f0883 100644 --- a/lib/chunker/strategies/recursive_chunk/recursive_chunk.ex +++ b/lib/text_chunker/strategies/recursive_chunk/recursive_chunk.ex @@ -1,4 +1,4 @@ -defmodule Chunker.Strategies.RecursiveChunk do +defmodule TextChunker.Strategies.RecursiveChunk do @moduledoc """ Handles recursive text splitting, aiming to adhere to configured size and overlap requirements. Employs a flexible separator-based approach to break down text into manageable chunks, while generating metadata for each produced chunk. @@ -24,10 +24,10 @@ defmodule Chunker.Strategies.RecursiveChunk do 4. **Metadata Generation:** Tracks byte ranges for each chunk for potential reassembly of the original text. 
""" - @behaviour Chunker.ChunkerBehaviour + @behaviour TextChunker.ChunkerBehaviour - alias Chunker.Chunk - alias Chunker.Strategies.RecursiveChunk.Separators + alias TextChunker.Chunk + alias TextChunker.Strategies.RecursiveChunk.Separators require Logger @@ -49,14 +49,14 @@ defmodule Chunker.Strategies.RecursiveChunk do ```elixir iex> long_text = "This is a very long text that needs to be split into smaller pieces for easier handling." - iex> Chunker.Strategies.RecursiveChunk.split(long_text, chunk_size: 15, chunk_overlap: 5) + iex> TextChunker.Strategies.RecursiveChunk.split(long_text, chunk_size: 15, chunk_overlap: 5) [ - %Chunker.Chunk{ + %TextChunker.Chunk{ start_byte: 0, end_byte: 47, text: "This is a very long text that needs to be split" }, - %Chunker.Chunk{ + %TextChunker.Chunk{ start_byte: 38, end_byte: 88, text: " be split into smaller pieces for easier handling." diff --git a/lib/chunker/strategies/recursive_chunk/separators.ex b/lib/text_chunker/strategies/recursive_chunk/separators.ex similarity index 97% rename from lib/chunker/strategies/recursive_chunk/separators.ex rename to lib/text_chunker/strategies/recursive_chunk/separators.ex index b506c04..794620e 100644 --- a/lib/chunker/strategies/recursive_chunk/separators.ex +++ b/lib/text_chunker/strategies/recursive_chunk/separators.ex @@ -1,4 +1,4 @@ -defmodule Chunker.Strategies.RecursiveChunk.Separators do +defmodule TextChunker.Strategies.RecursiveChunk.Separators do @moduledoc """ Handles separator configuration for the RecursiveChunk text chunking strategy. 
diff --git a/test/recursive_chunk_test.exs b/test/recursive_chunk_test.exs index bb465e4..44067c8 100644 --- a/test/recursive_chunk_test.exs +++ b/test/recursive_chunk_test.exs @@ -1,8 +1,7 @@ defmodule TextChunkerTest do use ExUnit.Case - alias Chunker.TestHelpers - alias Chunker.TextChunker + alias TextChunker.TestHelpers @moduletag timeout: :infinity diff --git a/test/support/test_helpers.ex b/test/support/test_helpers.ex index 657f9d4..38a9adc 100644 --- a/test/support/test_helpers.ex +++ b/test/support/test_helpers.ex @@ -1,9 +1,9 @@ -defmodule Chunker.TestHelpers do +defmodule TextChunker.TestHelpers do @moduledoc false @doc """ Extracts the text content from a single `Chunk` struct. """ - def chunk_text(%Chunker.Chunk{} = chunk), do: chunk.text + def chunk_text(%TextChunker.Chunk{} = chunk), do: chunk.text @doc """ Extracts the text content from a list of `Chunk` structs.