Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python splitting // adds python separators #9

Merged
merged 4 commits into from
Mar 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions lib/text_chunker/strategies/recursive_chunk/separators.ex
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,17 @@ defmodule TextChunker.Strategies.RecursiveChunk.Separators do
]
end

# Separator strings for the :python format. The list starts with the
# definition boundaries (a `class` at the start of a line, a `def` at the
# start of a line, a tab-indented `def`) and ends with the generic whitespace
# boundaries (blank line, newline, single space).
# NOTE(review): presumably the recursive chunker tries these in list order —
# confirm against the strategy module before reordering.
def get_separators(:python) do
definition_boundaries = ["\nclass ", "\ndef ", "\n\tdef "]
whitespace_boundaries = ["\n\n", "\n", " "]

definition_boundaries ++ whitespace_boundaries
end

def get_separators(:vue) do
[
"<script",
Expand Down
127 changes: 125 additions & 2 deletions test/recursive_chunk_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ defmodule TextChunkerTest do

@moduletag timeout: :infinity

describe "plaintext chunker" do
describe "chunker with plaintext separators" do
test "splits multiple sentences correctly" do
opts = [
chunk_size: 50,
Expand Down Expand Up @@ -203,7 +203,7 @@ defmodule TextChunkerTest do
end
end

describe "markdown chunker" do
describe "chunker with markdown separators" do
test "splits a simple markdown file" do
opts = [
chunk_size: 100,
Expand Down Expand Up @@ -232,4 +232,127 @@ defmodule TextChunkerTest do
assert result == expected_result
end
end

describe "chunker with python separators" do
  # Both tests split the same Python fixture with `format: :python` and
  # `chunk_size: 100`; they differ only in `chunk_overlap`.

  test "splits a simple python file sensibly with no overlap" do
    # With `chunk_overlap: 0` every expected chunk starts where the previous
    # one ended; no text is repeated between neighbours.
    expected_chunks = [
      "class PetShop:\n \"\"\"Represents a pet shop with inventory and sales functionality.\"\"\"",
      "\n\n def __init__(self, name):\n self.name = name\n self.inventory = {}",
      "\n\n def add_pet(self, pet_type, quantity):",
      "\n \"\"\"Adds a specified quantity of a pet type to the inventory.\"\"\"",
      "\n if pet_type in self.inventory:\n self.inventory[pet_type] += quantity",
      "\n else:\n self.inventory[pet_type] = quantity",
      "\n\n def sell_pet(self, pet_type, quantity):",
      "\n \"\"\"Sells a specified quantity of a pet type.\"\"\"",
      "\n if pet_type in self.inventory and self.inventory[pet_type] >= quantity:",
      "\n self.inventory[pet_type] -= quantity\n return True\n else:",
      "\n return False",
      "\n\n def get_pet_count(self, pet_type):",
      "\n \"\"\"Returns the current count of a specific pet type.\"\"\"",
      "\n return self.inventory.get(pet_type, 0)\n"
    ]

    {:ok, source} = File.read("test/support/fixtures/document_fixtures/test_code.py")

    chunks =
      TestHelpers.extract_text_from_chunks(
        TextChunker.split(source, chunk_size: 100, chunk_overlap: 0, format: :python)
      )

    assert chunks == expected_chunks
  end

  test "splits a simple python file sensibly with overlap" do
    # With `chunk_overlap: 50` some expected chunks begin with text that also
    # closes the preceding chunk (e.g. the `+= quantity` line and the
    # `sell_pet` signature appear twice).
    expected_chunks = [
      "class PetShop:\n \"\"\"Represents a pet shop with inventory and sales functionality.\"\"\"",
      "\n\n def __init__(self, name):\n self.name = name\n self.inventory = {}",
      "\n\n def add_pet(self, pet_type, quantity):",
      "\n \"\"\"Adds a specified quantity of a pet type to the inventory.\"\"\"",
      "\n if pet_type in self.inventory:\n self.inventory[pet_type] += quantity",
      "\n self.inventory[pet_type] += quantity\n else:",
      "\n else:\n self.inventory[pet_type] = quantity",
      "\n\n def sell_pet(self, pet_type, quantity):",
      "\n def sell_pet(self, pet_type, quantity):\n \"\"\"Sells a specified quantity of a pet type.\"\"\"",
      "\n if pet_type in self.inventory and self.inventory[pet_type] >= quantity:",
      "\n self.inventory[pet_type] -= quantity\n return True\n else:",
      "\n return True\n else:\n return False",
      "\n\n def get_pet_count(self, pet_type):",
      "\n \"\"\"Returns the current count of a specific pet type.\"\"\"",
      "\n return self.inventory.get(pet_type, 0)\n"
    ]

    {:ok, source} = File.read("test/support/fixtures/document_fixtures/test_code.py")

    chunks =
      TestHelpers.extract_text_from_chunks(
        TextChunker.split(source, chunk_size: 100, chunk_overlap: 50, format: :python)
      )

    assert chunks == expected_chunks
  end
end

describe "chunker with javascript separators" do
  # Both tests split the same JavaScript fixture with `format: :javascript`
  # and `chunk_size: 100`; they differ only in `chunk_overlap`.

  test "splits a simple javascript file sensibly with no overlap" do
    # With `chunk_overlap: 0` the expected chunks do not repeat any text from
    # their predecessor.
    expected_chunks = [
      "class PetShop {\n constructor(name) {\n this.name = name;\n this.inventory = {};\n }",
      "\n\n addPet(petType, quantity) {\n ",
      " if (this.inventory[petType]) {\n this.inventory[petType] += quantity;\n } else {",
      "\n this.inventory[petType] = quantity;\n }\n }",
      "\n\n sellPet(petType, quantity) {\n ",
      " if (this.inventory[petType] && this.inventory[petType] >= quantity) {",
      "\n this.inventory[petType] -= quantity;\n return true;\n } else {",
      "\n return false;\n }\n }",
      "\n\n getPetCount(petType) {\n return this.inventory[petType] || 0; \n }\n}\n"
    ]

    {:ok, source} = File.read("test/support/fixtures/document_fixtures/test_code.js")

    chunks =
      TestHelpers.extract_text_from_chunks(
        TextChunker.split(source, chunk_size: 100, chunk_overlap: 0, format: :javascript)
      )

    assert chunks == expected_chunks
  end

  test "splits a simple javascript file sensibly with overlap" do
    # With `chunk_overlap: 50` the chunks that follow an `} else {` line start
    # by repeating it (and, in the second case, the `return true;` before it).
    expected_chunks = [
      "class PetShop {\n constructor(name) {\n this.name = name;\n this.inventory = {};\n }",
      "\n\n addPet(petType, quantity) {\n ",
      " if (this.inventory[petType]) {\n this.inventory[petType] += quantity;\n } else {",
      "\n } else {\n this.inventory[petType] = quantity;\n }\n }",
      "\n\n sellPet(petType, quantity) {\n ",
      " if (this.inventory[petType] && this.inventory[petType] >= quantity) {",
      "\n this.inventory[petType] -= quantity;\n return true;\n } else {",
      "\n return true;\n } else {\n return false;\n }\n }",
      "\n\n getPetCount(petType) {\n return this.inventory[petType] || 0; \n }\n}\n"
    ]

    {:ok, source} = File.read("test/support/fixtures/document_fixtures/test_code.js")

    chunks =
      TestHelpers.extract_text_from_chunks(
        TextChunker.split(source, chunk_size: 100, chunk_overlap: 50, format: :javascript)
      )

    assert chunks == expected_chunks
  end
end
end
27 changes: 27 additions & 0 deletions test/support/fixtures/document_fixtures/test_code.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
class PetShop {
constructor(name) {
this.name = name;
this.inventory = {};
}

addPet(petType, quantity) {
if (this.inventory[petType]) {
this.inventory[petType] += quantity;
} else {
this.inventory[petType] = quantity;
}
}

sellPet(petType, quantity) {
if (this.inventory[petType] && this.inventory[petType] >= quantity) {
this.inventory[petType] -= quantity;
return true;
} else {
return false;
}
}

getPetCount(petType) {
return this.inventory[petType] || 0;
}
}
25 changes: 25 additions & 0 deletions test/support/fixtures/document_fixtures/test_code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
class PetShop:
"""Represents a pet shop with inventory and sales functionality."""

def __init__(self, name):
self.name = name
self.inventory = {}

def add_pet(self, pet_type, quantity):
"""Adds a specified quantity of a pet type to the inventory."""

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it's kind of nice for our purposes that Python docstrings are inside the methods 🤔

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, it would be a pain otherwise

if pet_type in self.inventory:
self.inventory[pet_type] += quantity
else:
self.inventory[pet_type] = quantity

def sell_pet(self, pet_type, quantity):
"""Sells a specified quantity of a pet type."""
if pet_type in self.inventory and self.inventory[pet_type] >= quantity:
self.inventory[pet_type] -= quantity
return True
else:
return False

def get_pet_count(self, pet_type):
"""Returns the current count of a specific pet type."""
return self.inventory.get(pet_type, 0)
Loading