From ad403d955b0f8f67f0b4cb842577cb83e1a3457d Mon Sep 17 00:00:00 2001 From: Saurav Panda Date: Fri, 16 Aug 2024 01:34:59 -0700 Subject: [PATCH] feat: final update to RAG generation --- .gitignore | 3 +- .../kaizen/helpers/test_create_folder.py | 76 ++- .../helpers/test_create_pr_description.py | 127 +++-- .../helpers/test_create_pr_review_text.py | 28 +- .../kaizen/helpers/test_create_test_files.py | 218 +++++---- .../kaizen/helpers/test_get_parent_folder.py | 24 +- .../kaizen/helpers/test_get_web_html.py | 59 ++- Dockerfile | 12 + config.json | 21 + db_setup/init.sql | 57 +-- examples/ragify_codebase/main.py | 25 +- install_tree_sitter_languages.sh | 51 ++ kaizen/generator/unit_test.py | 12 +- kaizen/llms/provider.py | 9 + kaizen/retriever/code_chunker.py | 272 ++++++----- kaizen/retriever/feedback_system.py | 18 + kaizen/retriever/llama_index_retriever.py | 354 +++++++++----- kaizen/retriever/query_processor.py | 0 kaizen/retriever/result_processor.py | 0 kaizen/retriever/tree_sitter_utils.py | 107 +++++ poetry.lock | 453 ++++++++++++------ pyproject.toml | 2 + 22 files changed, 1298 insertions(+), 630 deletions(-) create mode 100644 install_tree_sitter_languages.sh create mode 100644 kaizen/retriever/feedback_system.py delete mode 100644 kaizen/retriever/query_processor.py delete mode 100644 kaizen/retriever/result_processor.py create mode 100644 kaizen/retriever/tree_sitter_utils.py diff --git a/.gitignore b/.gitignore index 79fb5dac..16d3c87f 100644 --- a/.gitignore +++ b/.gitignore @@ -164,4 +164,5 @@ cython_debug/ node_modules .next -.cloudcode \ No newline at end of file +.cloudcode +tree_sitter_languages/ \ No newline at end of file diff --git a/.kaizen/unit_test/kaizen/helpers/test_create_folder.py b/.kaizen/unit_test/kaizen/helpers/test_create_folder.py index 0348f3c0..1ef355e3 100644 --- a/.kaizen/unit_test/kaizen/helpers/test_create_folder.py +++ b/.kaizen/unit_test/kaizen/helpers/test_create_folder.py @@ -6,33 +6,44 @@ # Mock logger logger = mock.Mock() + @pytest.fixture def mock_os_path_exists(): - with mock.patch('os.path.exists') as mock_exists: + with mock.patch("os.path.exists") as mock_exists: yield mock_exists + @pytest.fixture def mock_os_makedirs(): - with mock.patch('os.makedirs') as mock_makedirs: + with mock.patch("os.makedirs") as mock_makedirs: yield mock_makedirs + @pytest.fixture def mock_logger_debug(): - with mock.patch('kaizen.helpers.output.logger.debug') as mock_debug: + with mock.patch("kaizen.helpers.output.logger.debug") as mock_debug: yield mock_debug -def test_create_new_folder_when_not_exists(mock_os_path_exists, mock_os_makedirs, mock_logger_debug): - folder_path = 'new_folder' + +def test_create_new_folder_when_not_exists( + mock_os_path_exists, mock_os_makedirs, mock_logger_debug +): + folder_path = "new_folder" mock_os_path_exists.return_value = False create_folder(folder_path) mock_os_path_exists.assert_called_once_with(folder_path) mock_os_makedirs.assert_called_once_with(folder_path) - mock_logger_debug.assert_called_once_with(f"Folder '{folder_path}' created successfully.") + mock_logger_debug.assert_called_once_with( + f"Folder '{folder_path}' created successfully." 
+ ) + -def test_do_nothing_when_folder_already_exists(mock_os_path_exists, mock_os_makedirs, mock_logger_debug): - folder_path = 'existing_folder' +def test_do_nothing_when_folder_already_exists( + mock_os_path_exists, mock_os_makedirs, mock_logger_debug +): + folder_path = "existing_folder" mock_os_path_exists.return_value = True create_folder(folder_path) @@ -41,47 +52,66 @@ def test_do_nothing_when_folder_already_exists(mock_os_path_exists, mock_os_make mock_os_makedirs.assert_not_called() mock_logger_debug.assert_called_once_with(f"Folder '{folder_path}' already exists.") + def test_raise_value_error_when_folder_path_is_empty(): with pytest.raises(ValueError, match="Folder path cannot be empty"): - create_folder('') + create_folder("") + -def test_create_deeply_nested_folder(mock_os_path_exists, mock_os_makedirs, mock_logger_debug): - folder_path = 'a/b/c/d/e/f/g' +def test_create_deeply_nested_folder( + mock_os_path_exists, mock_os_makedirs, mock_logger_debug +): + folder_path = "a/b/c/d/e/f/g" mock_os_path_exists.return_value = False create_folder(folder_path) mock_os_path_exists.assert_called_once_with(folder_path) mock_os_makedirs.assert_called_once_with(folder_path) - mock_logger_debug.assert_called_once_with(f"Folder '{folder_path}' created successfully.") + mock_logger_debug.assert_called_once_with( + f"Folder '{folder_path}' created successfully." + ) + -def test_create_folder_with_special_characters(mock_os_path_exists, mock_os_makedirs, mock_logger_debug): - folder_path = 'folder_with_special_!@#$%^&*()' +def test_create_folder_with_special_characters( + mock_os_path_exists, mock_os_makedirs, mock_logger_debug +): + folder_path = "folder_with_special_!@#$%^&*()" mock_os_path_exists.return_value = False create_folder(folder_path) mock_os_path_exists.assert_called_once_with(folder_path) mock_os_makedirs.assert_called_once_with(folder_path) - mock_logger_debug.assert_called_once_with(f"Folder '{folder_path}' created successfully.") + mock_logger_debug.assert_called_once_with( + f"Folder '{folder_path}' created successfully." + ) -def test_create_folder_with_max_path_length(mock_os_path_exists, mock_os_makedirs, mock_logger_debug): + +def test_create_folder_with_max_path_length( + mock_os_path_exists, mock_os_makedirs, mock_logger_debug +): # Adjusting the max path length to a more typical value for modern filesystems - max_path_length = os.pathconf('/', 'PC_PATH_MAX') - folder_path = 'a' * max_path_length + max_path_length = os.pathconf("/", "PC_PATH_MAX") + folder_path = "a" * max_path_length mock_os_path_exists.return_value = False create_folder(folder_path) mock_os_path_exists.assert_called_once_with(folder_path) mock_os_makedirs.assert_called_once_with(folder_path) - mock_logger_debug.assert_called_once_with(f"Folder '{folder_path}' created successfully.") + mock_logger_debug.assert_called_once_with( + f"Folder '{folder_path}' created successfully." 
+ ) + -def test_create_folder_with_invalid_characters(mock_os_path_exists, mock_os_makedirs, mock_logger_debug): +def test_create_folder_with_invalid_characters( + mock_os_path_exists, mock_os_makedirs, mock_logger_debug +): # Assuming the filesystem does not allow characters like ':', '*', '?', '<', '>', '|' - invalid_characters = [':', '*', '?', '<', '>', '|'] + invalid_characters = [":", "*", "?", "<", ">", "|"] for char in invalid_characters: - folder_path = f'invalid{char}folder' + folder_path = f"invalid{char}folder" mock_os_path_exists.return_value = False with pytest.raises(OSError): @@ -89,4 +119,4 @@ def test_create_folder_with_invalid_characters(mock_os_path_exists, mock_os_make mock_os_path_exists.assert_called_once_with(folder_path) mock_os_makedirs.assert_not_called() - mock_logger_debug.assert_not_called() \ No newline at end of file + mock_logger_debug.assert_not_called() diff --git a/.kaizen/unit_test/kaizen/helpers/test_create_pr_description.py b/.kaizen/unit_test/kaizen/helpers/test_create_pr_description.py index 4280b502..e006cabf 100644 --- a/.kaizen/unit_test/kaizen/helpers/test_create_pr_description.py +++ b/.kaizen/unit_test/kaizen/helpers/test_create_pr_description.py @@ -2,51 +2,91 @@ import time from kaizen.helpers.output import create_pr_description -DESC_COLLAPSIBLE_TEMPLATE = "
<details>Original Description\n\n{desc}\n\n</details>
" +DESC_COLLAPSIBLE_TEMPLATE = ( + "
<details>Original Description\n\n{desc}\n\n</details>
" +) -@pytest.mark.parametrize("desc, original_desc, expected", [ - # Normal Cases - ("This is a PR description", "This is the original detailed description", - "This is a PR description\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\nThis is the original detailed description\n\n</details>
"), - ("Fixes a bug", "This fixes a bug in the system", - "Fixes a bug\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\nThis fixes a bug in the system\n\n</details>
"), - # Edge Cases - ("", "This is the original detailed description", - "\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\nThis is the original detailed description\n\n</details>
"), - ("This is a PR description", "", - "This is a PR description\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\n\n\n</details>
"), - ("", "", - "\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\n\n\n</details>
"), - ("# Heading\n* Bullet", "**Bold**\n_Italic_", - "# Heading\n* Bullet\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\n**Bold**\n_Italic_\n\n</details>
"), - # Special Characters and HTML Tags - ("

Title

", "

This is a bold statement

", - "

Title

\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\n

This is a bold statement

\n\n</details>
"), - ("Special characters: !@#$%^&*()", "More special characters: ~`<>?", - "Special characters: !@#$%^&*()\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\nMore special characters: ~`<>?\n\n</details>
"), -]) + +@pytest.mark.parametrize( + "desc, original_desc, expected", + [ + # Normal Cases + ( + "This is a PR description", + "This is the original detailed description", + "This is a PR description\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\nThis is the original detailed description\n\n</details>
", + ), + ( + "Fixes a bug", + "This fixes a bug in the system", + "Fixes a bug\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\nThis fixes a bug in the system\n\n</details>
", + ), + # Edge Cases + ( + "", + "This is the original detailed description", + "\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\nThis is the original detailed description\n\n</details>
", + ), + ( + "This is a PR description", + "", + "This is a PR description\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\n\n\n</details>
", + ), + ( + "", + "", + "\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\n\n\n</details>
", + ), + ( + "# Heading\n* Bullet", + "**Bold**\n_Italic_", + "# Heading\n* Bullet\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\n**Bold**\n_Italic_\n\n</details>
", + ), + # Special Characters and HTML Tags + ( + "

Title

", + "

This is a bold statement

", + "

Title

\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\n

This is a bold statement

\n\n</details>
", + ), + ( + "Special characters: !@#$%^&*()", + "More special characters: ~`<>?", + "Special characters: !@#$%^&*()\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\nMore special characters: ~`<>?\n\n</details>
", + ), + ], +) def test_create_pr_description_normal_and_edge_cases(desc, original_desc, expected): assert create_pr_description(desc, original_desc) == expected -@pytest.mark.parametrize("desc, original_desc, expected_error_message", [ - # Error Handling - (None, "This is the original detailed description", "desc must be a string"), - (123, "This is the original detailed description", "desc must be a string"), - ([], "This is the original detailed description", "desc must be a string"), - ("This is a PR description", None, "original_desc must be a string"), - ("This is a PR description", 123, "original_desc must be a string"), - ("This is a PR description", [], "original_desc must be a string"), -]) -def test_create_pr_description_error_handling(desc, original_desc, expected_error_message): + +@pytest.mark.parametrize( + "desc, original_desc, expected_error_message", + [ + # Error Handling + (None, "This is the original detailed description", "desc must be a string"), + (123, "This is the original detailed description", "desc must be a string"), + ([], "This is the original detailed description", "desc must be a string"), + ("This is a PR description", None, "original_desc must be a string"), + ("This is a PR description", 123, "original_desc must be a string"), + ("This is a PR description", [], "original_desc must be a string"), + ], +) +def test_create_pr_description_error_handling( + desc, original_desc, expected_error_message +): with pytest.raises(TypeError) as exc_info: create_pr_description(desc, original_desc) assert str(exc_info.value) == expected_error_message -@pytest.mark.parametrize("desc, original_desc", [ - # Boundary Conditions - ("a" * 10000, "b" * 10000), - ("a" * 100000, "b" * 100000), -]) + +@pytest.mark.parametrize( + "desc, original_desc", + [ + # Boundary Conditions + ("a" * 10000, "b" * 10000), + ("a" * 100000, "b" * 100000), + ], +) def test_create_pr_description_boundary_conditions(desc, original_desc): start_time = time.time() result = create_pr_description(desc, original_desc) @@ -56,9 +96,18 @@ def test_create_pr_description_boundary_conditions(desc, original_desc): assert result.startswith(desc) assert result.endswith(DESC_COLLAPSIBLE_TEMPLATE.format(desc=original_desc)) assert "> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️" in result - assert len(result) == len(desc) + len(original_desc) + len("\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\n\n\n</details>
") - 2 + assert ( + len(result) + == len(desc) + + len(original_desc) + + len( + "\n\n> ✨ Generated with love by [Kaizen](https://cloudcode.ai) ❤️\n\n
<details>Original Description\n\n\n\n</details>
" + ) + - 2 + ) # Removed the arbitrary 1-second boundary condition print(f"Execution time: {execution_time} seconds") + if __name__ == "__main__": - pytest.main() \ No newline at end of file + pytest.main() diff --git a/.kaizen/unit_test/kaizen/helpers/test_create_pr_review_text.py b/.kaizen/unit_test/kaizen/helpers/test_create_pr_review_text.py index f6761b9b..ef4d8de7 100644 --- a/.kaizen/unit_test/kaizen/helpers/test_create_pr_review_text.py +++ b/.kaizen/unit_test/kaizen/helpers/test_create_pr_review_text.py @@ -15,6 +15,7 @@ """ + @pytest.fixture def setup_single_topic_single_review(): return { @@ -32,6 +33,7 @@ def setup_single_topic_single_review(): ] } + @pytest.fixture def setup_multiple_topics_multiple_reviews(): return { @@ -55,7 +57,7 @@ def setup_multiple_topics_multiple_reviews(): "end_line": 40, "file_name": "file2.py", "severity_level": 7, - } + }, ], "topic2": [ { @@ -68,14 +70,18 @@ def setup_multiple_topics_multiple_reviews(): "file_name": "file3.py", "severity_level": 5, } - ] + ], } + def test_empty_topics(): topics = {} - expected_output = "## Code Review\n\n✅ **All Clear:** This PR is ready to merge! 👍\n\n" + expected_output = ( + "## Code Review\n\n✅ **All Clear:** This PR is ready to merge! 👍\n\n" + ) assert create_pr_review_text(topics) == expected_output + def test_single_topic_single_review(setup_single_topic_single_review): topics = setup_single_topic_single_review expected_output = ( @@ -96,6 +102,7 @@ def test_single_topic_single_review(setup_single_topic_single_review): ) assert create_pr_review_text(topics) == expected_output + def test_multiple_topics_multiple_reviews(setup_multiple_topics_multiple_reviews): topics = setup_multiple_topics_multiple_reviews expected_output = ( @@ -139,6 +146,7 @@ def test_multiple_topics_multiple_reviews(setup_multiple_topics_multiple_reviews ) assert create_pr_review_text(topics) == expected_output + def test_reviews_with_missing_fields(): topics = { "topic1": [ @@ -181,7 +189,7 @@ def test_reviews_with_missing_fields(): "end_line": 80, "file_name": "final_test_file.py", # Missing severity_level - } + }, ] } expected_output = ( @@ -235,6 +243,7 @@ def test_reviews_with_missing_fields(): ) assert create_pr_review_text(topics) == expected_output + def test_reviews_with_missing_comment(): topics = { "topic1": [ @@ -268,9 +277,10 @@ def test_reviews_with_missing_comment(): ) assert create_pr_review_text(topics) == expected_output + def test_empty_list_in_topics(): - topics = { - "topic1": [] - } - expected_output = "## Code Review\n\n✅ **All Clear:** This PR is ready to merge! 👍\n\n" - assert create_pr_review_text(topics) == expected_output \ No newline at end of file + topics = {"topic1": []} + expected_output = ( + "## Code Review\n\n✅ **All Clear:** This PR is ready to merge! 
👍\n\n" + ) + assert create_pr_review_text(topics) == expected_output diff --git a/.kaizen/unit_test/kaizen/helpers/test_create_test_files.py b/.kaizen/unit_test/kaizen/helpers/test_create_test_files.py index f318d808..0cf4ad3b 100644 --- a/.kaizen/unit_test/kaizen/helpers/test_create_test_files.py +++ b/.kaizen/unit_test/kaizen/helpers/test_create_test_files.py @@ -4,45 +4,60 @@ from unittest import mock from kaizen.helpers.output import create_test_files + # Mocking dependencies @pytest.fixture def mock_dependencies(): - with mock.patch('kaizen.helpers.output.create_folder') as mock_create_folder, \ - mock.patch('kaizen.helpers.output.general.clean_python_code') as mock_clean_python_code, \ - mock.patch('kaizen.helpers.output.logger') as mock_logger: + with mock.patch( + "kaizen.helpers.output.create_folder" + ) as mock_create_folder, mock.patch( + "kaizen.helpers.output.general.clean_python_code" + ) as mock_clean_python_code, mock.patch( + "kaizen.helpers.output.logger" + ) as mock_logger: yield mock_create_folder, mock_clean_python_code, mock_logger + # Utility function to read file content def read_file_content(file_path): - with open(file_path, 'r') as f: + with open(file_path, "r") as f: return f.read() + # Utility function to sanitize file names def sanitize_filename(filename): - return "".join(c if c.isalnum() or c in (' ', '.', '_') else '_' for c in filename) + return "".join(c if c.isalnum() or c in (" ", ".", "_") else "_" for c in filename) + # Test single module with a single test def test_single_module_single_test(tmp_path, mock_dependencies): mock_create_folder, mock_clean_python_code, mock_logger = mock_dependencies mock_clean_python_code.return_value = "def test_example():\n assert True" - json_tests = [{ - "folder_name": "module1", - "module_title": "Module 1", - "importance": "High", - "tests": [{ - "test_name": "Test Example", - "test_description": "This is a test example.", - "code": "def test_example():\n assert True" - }] - }] + json_tests = [ + { + "folder_name": "module1", + "module_title": "Module 1", + "importance": "High", + "tests": [ + { + "test_name": "Test Example", + "test_description": "This is a test example.", + "code": "def test_example():\n assert True", + } + ], + } + ] create_test_files(json_tests, tmp_path) # Assertions assert os.path.exists(os.path.join(tmp_path, "tests.json")) assert os.path.exists(os.path.join(tmp_path, "module1", "test_test_example.py")) - assert "Importance: High" in read_file_content(os.path.join(tmp_path, "module1", "test_test_example.py")) + assert "Importance: High" in read_file_content( + os.path.join(tmp_path, "module1", "test_test_example.py") + ) + # Test multiple modules with multiple tests def test_multiple_modules_multiple_tests(tmp_path, mock_dependencies): @@ -58,14 +73,14 @@ def test_multiple_modules_multiple_tests(tmp_path, mock_dependencies): { "test_name": "Test Example 1", "test_description": "This is test example 1.", - "code": "def test_example_1():\n assert True" + "code": "def test_example_1():\n assert True", }, { "test_name": "Test Example 2", "test_description": "This is test example 2.", - "code": "def test_example_2():\n assert True" - } - ] + "code": "def test_example_2():\n assert True", + }, + ], }, { "folder_name": "module2", @@ -75,10 +90,10 @@ def test_multiple_modules_multiple_tests(tmp_path, mock_dependencies): { "test_name": "Test Example 3", "test_description": "This is test example 3.", - "code": "def test_example_3():\n assert True" + "code": "def test_example_3():\n assert True", } - ] 
- } + ], + }, ] create_test_files(json_tests, tmp_path) @@ -89,6 +104,7 @@ def test_multiple_modules_multiple_tests(tmp_path, mock_dependencies): assert os.path.exists(os.path.join(tmp_path, "module1", "test_test_example_2.py")) assert os.path.exists(os.path.join(tmp_path, "module2", "test_test_example_3.py")) + # Test empty json_tests list def test_empty_json_tests(tmp_path, mock_dependencies): mock_create_folder, mock_clean_python_code, mock_logger = mock_dependencies @@ -101,21 +117,26 @@ def test_empty_json_tests(tmp_path, mock_dependencies): assert os.path.exists(os.path.join(tmp_path, "tests.json")) assert os.path.getsize(os.path.join(tmp_path, "tests.json")) == 0 + # Test names with special characters def test_special_characters_in_test_names(tmp_path, mock_dependencies): mock_create_folder, mock_clean_python_code, mock_logger = mock_dependencies mock_clean_python_code.return_value = "def test_example():\n assert True" - json_tests = [{ - "folder_name": "module1", - "module_title": "Module 1", - "importance": "High", - "tests": [{ - "test_name": "Test Example!@#", - "test_description": "This is a test example with special characters.", - "code": "def test_example():\n assert True" - }] - }] + json_tests = [ + { + "folder_name": "module1", + "module_title": "Module 1", + "importance": "High", + "tests": [ + { + "test_name": "Test Example!@#", + "test_description": "This is a test example with special characters.", + "code": "def test_example():\n assert True", + } + ], + } + ] create_test_files(json_tests, tmp_path) @@ -123,22 +144,27 @@ def test_special_characters_in_test_names(tmp_path, mock_dependencies): sanitized_name = sanitize_filename("test_test_example!@#.py") assert os.path.exists(os.path.join(tmp_path, "module1", sanitized_name)) + # Test very long test names def test_very_long_test_names(tmp_path, mock_dependencies): mock_create_folder, mock_clean_python_code, mock_logger = mock_dependencies mock_clean_python_code.return_value = "def test_example():\n assert True" long_test_name = "Test " + "Example " * 50 - json_tests = [{ - "folder_name": "module1", - "module_title": "Module 1", - "importance": "High", - "tests": [{ - "test_name": long_test_name, - "test_description": "This is a very long test name.", - "code": "def test_example():\n assert True" - }] - }] + json_tests = [ + { + "folder_name": "module1", + "module_title": "Module 1", + "importance": "High", + "tests": [ + { + "test_name": long_test_name, + "test_description": "This is a very long test name.", + "code": "def test_example():\n assert True", + } + ], + } + ] create_test_files(json_tests, tmp_path) @@ -146,23 +172,30 @@ def test_very_long_test_names(tmp_path, mock_dependencies): file_name = "test_" + "_".join(long_test_name.lower().split(" ")) + ".py" assert os.path.exists(os.path.join(tmp_path, "module1", file_name)) assert len(file_name) <= 255 # Assuming a common file system limit - assert "def test_example():\n assert True" in read_file_content(os.path.join(tmp_path, "module1", file_name)) + assert "def test_example():\n assert True" in read_file_content( + os.path.join(tmp_path, "module1", file_name) + ) + # Test clean code function returns empty string def test_clean_code_returns_empty_string(tmp_path, mock_dependencies): mock_create_folder, mock_clean_python_code, mock_logger = mock_dependencies mock_clean_python_code.return_value = "" - json_tests = [{ - "folder_name": "module1", - "module_title": "Module 1", - "importance": "High", - "tests": [{ - "test_name": "Test Example", - "test_description": 
"This is a test example.", - "code": "def test_example():\n assert True" - }] - }] + json_tests = [ + { + "folder_name": "module1", + "module_title": "Module 1", + "importance": "High", + "tests": [ + { + "test_name": "Test Example", + "test_description": "This is a test example.", + "code": "def test_example():\n assert True", + } + ], + } + ] create_test_files(json_tests, tmp_path) @@ -171,21 +204,26 @@ def test_clean_code_returns_empty_string(tmp_path, mock_dependencies): assert not os.path.exists(os.path.join(tmp_path, "module1", "test_test_example.py")) assert not os.path.exists(os.path.join(tmp_path, "tests.json")) + # Test file writing permission issues def test_file_writing_permission_issues(tmp_path, mock_dependencies): mock_create_folder, mock_clean_python_code, mock_logger = mock_dependencies mock_clean_python_code.return_value = "def test_example():\n assert True" - json_tests = [{ - "folder_name": "module1", - "module_title": "Module 1", - "importance": "High", - "tests": [{ - "test_name": "Test Example", - "test_description": "This is a test example.", - "code": "def test_example():\n assert True" - }] - }] + json_tests = [ + { + "folder_name": "module1", + "module_title": "Module 1", + "importance": "High", + "tests": [ + { + "test_name": "Test Example", + "test_description": "This is a test example.", + "code": "def test_example():\n assert True", + } + ], + } + ] # Simulate permission error with mock.patch("builtins.open", mock.mock_open()) as mock_file: @@ -197,6 +235,7 @@ def test_file_writing_permission_issues(tmp_path, mock_dependencies): # Assertions assert not os.path.exists(os.path.join(tmp_path, "tests.json")) + # Test maximum number of modules def test_maximum_number_of_modules(tmp_path, mock_dependencies): mock_create_folder, mock_clean_python_code, mock_logger = mock_dependencies @@ -207,40 +246,51 @@ def test_maximum_number_of_modules(tmp_path, mock_dependencies): "folder_name": f"module{i}", "module_title": f"Module {i}", "importance": "High", - "tests": [{ - "test_name": f"Test Example {i}", - "test_description": f"This is test example {i}.", - "code": "def test_example():\n assert True" - }] - } for i in range(100) + "tests": [ + { + "test_name": f"Test Example {i}", + "test_description": f"This is test example {i}.", + "code": "def test_example():\n assert True", + } + ], + } + for i in range(100) ] create_test_files(json_tests, tmp_path) # Assertions for i in range(100): - assert os.path.exists(os.path.join(tmp_path, f"module{i}", f"test_test_example_{i}.py")) + assert os.path.exists( + os.path.join(tmp_path, f"module{i}", f"test_test_example_{i}.py") + ) + # Test maximum number of tests per module def test_maximum_number_of_tests_per_module(tmp_path, mock_dependencies): mock_create_folder, mock_clean_python_code, mock_logger = mock_dependencies mock_clean_python_code.return_value = "def test_example():\n assert True" - json_tests = [{ - "folder_name": "module1", - "module_title": "Module 1", - "importance": "High", - "tests": [ - { - "test_name": f"Test Example {i}", - "test_description": f"This is test example {i}.", - "code": "def test_example():\n assert True" - } for i in range(100) - ] - }] + json_tests = [ + { + "folder_name": "module1", + "module_title": "Module 1", + "importance": "High", + "tests": [ + { + "test_name": f"Test Example {i}", + "test_description": f"This is test example {i}.", + "code": "def test_example():\n assert True", + } + for i in range(100) + ], + } + ] create_test_files(json_tests, tmp_path) # Assertions for i in 
range(100): - assert os.path.exists(os.path.join(tmp_path, "module1", f"test_test_example_{i}.py")) \ No newline at end of file + assert os.path.exists( + os.path.join(tmp_path, "module1", f"test_test_example_{i}.py") + ) diff --git a/.kaizen/unit_test/kaizen/helpers/test_get_parent_folder.py b/.kaizen/unit_test/kaizen/helpers/test_get_parent_folder.py index cfc86496..974c699c 100644 --- a/.kaizen/unit_test/kaizen/helpers/test_get_parent_folder.py +++ b/.kaizen/unit_test/kaizen/helpers/test_get_parent_folder.py @@ -5,32 +5,42 @@ from unittest import mock from kaizen.helpers.output import get_parent_folder + # Correct implementation of get_parent_folder() def get_parent_folder(): return os.path.dirname(os.getcwd()) + # Test function for normal case def test_get_parent_folder_normal(): expected = os.path.dirname(os.getcwd()) result = get_parent_folder() assert result == expected, f"Expected {expected}, but got {result}" + # Test function for error handling case def test_get_parent_folder_error_handling(): - with mock.patch('os.getcwd', side_effect=OSError("Unable to determine current working directory")): - with pytest.raises(OSError, match="Unable to determine current working directory"): + with mock.patch( + "os.getcwd", + side_effect=OSError("Unable to determine current working directory"), + ): + with pytest.raises( + OSError, match="Unable to determine current working directory" + ): get_parent_folder() - - with mock.patch('os.getcwd', side_effect=Exception("Unknown error")): + + with mock.patch("os.getcwd", side_effect=Exception("Unknown error")): with pytest.raises(Exception, match="Unknown error"): get_parent_folder() + # Test function for nested directory structure def test_get_parent_folder_nested(): - with mock.patch('os.getcwd', return_value='/home/user/project/subfolder'): - expected = '/home/user/project' + with mock.patch("os.getcwd", return_value="/home/user/project/subfolder"): + expected = "/home/user/project" result = get_parent_folder() assert result == expected, f"Expected {expected}, but got {result}" + if __name__ == "__main__": - pytest.main() \ No newline at end of file + pytest.main() diff --git a/.kaizen/unit_test/kaizen/helpers/test_get_web_html.py b/.kaizen/unit_test/kaizen/helpers/test_get_web_html.py index 969948da..c54e9d24 100644 --- a/.kaizen/unit_test/kaizen/helpers/test_get_web_html.py +++ b/.kaizen/unit_test/kaizen/helpers/test_get_web_html.py @@ -7,19 +7,24 @@ # Assuming the get_web_html function is defined in kaizen/helpers/output.py from kaizen.helpers.output import get_web_html + @pytest.fixture def mock_get_html(): - with patch('kaizen.helpers.output.get_html', new_callable=AsyncMock) as mock: + with patch("kaizen.helpers.output.get_html", new_callable=AsyncMock) as mock: yield mock + @pytest.fixture def mock_nest_asyncio(): - with patch('kaizen.helpers.output.nest_asyncio.apply') as mock: + with patch("kaizen.helpers.output.nest_asyncio.apply") as mock: yield mock -@pytest.mark.parametrize("html_content, expected_output", [ - ( - """ + +@pytest.mark.parametrize( + "html_content, expected_output", + [ + ( + """ Test @@ -33,7 +38,7 @@ def mock_nest_asyncio(): """, - """ + """
@@ -43,18 +48,15 @@ def mock_nest_asyncio():

- """ - ), - ( - "", # Empty HTML content - "" - ), - ( - "

Nothing to remove here!

", # No removable elements - "\n \n

\n Nothing to remove here!\n

\n \n" - ), - ( - """ + """, + ), + ("", ""), # Empty HTML content + ( + "

Nothing to remove here!

", # No removable elements + "\n \n

\n Nothing to remove here!\n

\n \n", + ), + ( + """ Test @@ -62,7 +64,7 @@ def mock_nest_asyncio(): """, - """ + """

@@ -70,10 +72,13 @@ def mock_nest_asyncio():

- """ - ) -]) -async def test_get_web_html_normal_cases(mock_get_html, mock_nest_asyncio, html_content, expected_output): + """, + ), + ], +) +async def test_get_web_html_normal_cases( + mock_get_html, mock_nest_asyncio, html_content, expected_output +): mock_get_html.return_value = html_content url = "https://cloudcode.ai" @@ -82,6 +87,7 @@ async def test_get_web_html_normal_cases(mock_get_html, mock_nest_asyncio, html_ assert result.strip() == expected_output.strip() mock_nest_asyncio.assert_called_once() + async def test_get_web_html_invalid_url(mock_get_html, mock_nest_asyncio): mock_get_html.side_effect = Exception("Network error") @@ -90,13 +96,16 @@ async def test_get_web_html_invalid_url(mock_get_html, mock_nest_asyncio): await get_web_html(url) mock_nest_asyncio.assert_called_once() + async def test_get_web_html_large_content(mock_get_html, mock_nest_asyncio): large_html_content = "" + "

Test

" * 10000 + "" - expected_output = "\n \n" + "

\n Test\n

\n" * 10000 + " \n" + expected_output = ( + "\n \n" + "

\n Test\n

\n" * 10000 + " \n" + ) mock_get_html.return_value = large_html_content url = "https://cloudcode.ai" result = await get_web_html(url) assert result.strip() == expected_output.strip() - mock_nest_asyncio.assert_called_once() \ No newline at end of file + mock_nest_asyncio.assert_called_once() diff --git a/Dockerfile b/Dockerfile index 0d900049..e56f0e4f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,6 +4,12 @@ FROM python:3.12-slim # Set the working directory in the container WORKDIR /app +# Install system dependencies +RUN apt-get update && apt-get install -y \ + git \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + # Install Poetry RUN pip install --no-cache-dir poetry @@ -16,6 +22,12 @@ RUN poetry install --no-dev --no-root # Copy the application code into the container COPY . . +# Make the installation script executable +RUN chmod +x install_tree_sitter_languages.sh + +# Run the Tree-sitter language installation script +RUN ./install_tree_sitter_languages.sh + # Expose the port on which the application will run EXPOSE 8000 diff --git a/config.json b/config.json index 6c0dbe29..8647f8cb 100644 --- a/config.json +++ b/config.json @@ -4,6 +4,27 @@ "enable_observability_logging": false, "redis_enabled": true, "models": [ + { + "model_name": "embedding", + "litellm_params": { + "model": "azure/text-embedding-small", + "input_cost_per_token": 0.000000015, + "output_cost_per_token": 0.0000006, + "api_key": "os.environ/AZURE_API_KEY", + "api_base": "os.environ/AZURE_API_BASE" + } + }, + { + "model_name": "small", + "litellm_params": { + "model": "azure/gpt-4o-mini", + "input_cost_per_token": 0.000000015, + "output_cost_per_token": 0.0000006, + "api_key": "os.environ/AZURE_API_KEY", + "api_base": "os.environ/AZURE_API_BASE", + "base_model": "azure/gpt-4o-mini" + } + }, { "model_name": "default", "litellm_params": { diff --git a/db_setup/init.sql b/db_setup/init.sql index 5f8720a1..223492fa 100644 --- a/db_setup/init.sql +++ b/db_setup/init.sql @@ -20,54 +20,49 @@ CREATE TABLE files ( programming_language TEXT ); --- Table to store code snippets -CREATE TABLE code_snippets ( - snippet_id SERIAL PRIMARY KEY, +-- Table to store function abstractions +CREATE TABLE function_abstractions ( + function_id SERIAL PRIMARY KEY, file_id INTEGER NOT NULL REFERENCES files(file_id), - snippet_text TEXT NOT NULL, + function_name TEXT NOT NULL, + function_signature TEXT NOT NULL, + abstract_functionality TEXT NOT NULL, + complexity_score FLOAT, + input_output_description TEXT, start_line INTEGER NOT NULL, - end_line INTEGER NOT NULL, - functionality TEXT, - context TEXT + end_line INTEGER NOT NULL ); --- Table to store vector embeddings for code snippets -CREATE TABLE embeddings ( +-- Table to store vector embeddings for function abstractions +CREATE TABLE function_embeddings ( embedding_id SERIAL PRIMARY KEY, - snippet_id INTEGER NOT NULL REFERENCES code_snippets(snippet_id), - vector VECTOR NOT NULL + function_id INTEGER NOT NULL REFERENCES function_abstractions(function_id), + vector VECTOR(1536) NOT NULL ); --- Table to store AI-generated summaries for code snippets -CREATE TABLE snippet_summaries ( - summary_id SERIAL PRIMARY KEY, - snippet_id INTEGER NOT NULL REFERENCES code_snippets(snippet_id), - summary TEXT NOT NULL, - summary_quality_score FLOAT -); --- Node level data for AST -CREATE TABLE ast_nodes ( +CREATE TABLE syntax_nodes ( node_id SERIAL PRIMARY KEY, file_id INTEGER NOT NULL REFERENCES files(file_id), node_type TEXT NOT NULL, start_line INTEGER NOT NULL, - end_line INTEGER NOT NULL - -- 
Add other common node properties here + end_line INTEGER NOT NULL, + node_content TEXT, + language TEXT NOT NULL +); + +-- Table to store node relationships +CREATE TABLE node_relationships ( + relationship_id SERIAL PRIMARY KEY, + parent_node_id INTEGER NOT NULL REFERENCES syntax_nodes(node_id), + child_node_id INTEGER NOT NULL REFERENCES syntax_nodes(node_id), + relationship_type TEXT NOT NULL ); -- Table to store node properties CREATE TABLE node_properties ( property_id SERIAL PRIMARY KEY, - node_id INTEGER NOT NULL REFERENCES ast_nodes(node_id), + node_id INTEGER NOT NULL REFERENCES syntax_nodes(node_id), property_name TEXT NOT NULL, property_value TEXT NOT NULL -); - --- Table to store node relationships -CREATE TABLE node_relationships ( - relationship_id SERIAL PRIMARY KEY, - parent_node_id INTEGER NOT NULL REFERENCES ast_nodes(node_id), - child_node_id INTEGER NOT NULL REFERENCES ast_nodes(node_id), - relationship_type TEXT NOT NULL ); \ No newline at end of file diff --git a/examples/ragify_codebase/main.py b/examples/ragify_codebase/main.py index 13bab95b..411dadbf 100644 --- a/examples/ragify_codebase/main.py +++ b/examples/ragify_codebase/main.py @@ -1,9 +1,22 @@ from kaizen.retriever.llama_index_retriever import RepositoryAnalyzer -# Usage -analyzer = RepositoryAnalyzer(database_config={}) -analyzer.analyze_repository("./kaizen") +# Initialize the analyzer +analyzer = RepositoryAnalyzer() -# Query example -result = analyzer.query("How is function X related to function Y?") -print(result) +# Set up the repository (do this when you first analyze a repo or when you want to update it) +analyzer.setup_repository("./github_app/") + +# Perform queries (you can do this as many times as you want without calling setup_repository again) +results = analyzer.query("Find functions that handle authentication") +for result in results: + print(f"File: {result['file_path']}") + print(f"Abstraction: {result['abstraction']}") + print(f"Code:\n{result['code']}") + print(f"Relevance Score: {result['relevance_score']}") + print("---") + +# # If you make changes to the repository and want to update the analysis: +# analyzer.setup_repository("/path/to/your/repo") + +# Then you can query again with the updated data +results = analyzer.query("authentication") diff --git a/install_tree_sitter_languages.sh b/install_tree_sitter_languages.sh new file mode 100644 index 00000000..cb247dd3 --- /dev/null +++ b/install_tree_sitter_languages.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Directory to store the language libraries +LANGUAGE_DIR="tree_sitter_languages" + +# List of languages to install +LANGUAGES=( + "python" + "javascript" + "typescript" + "rust" +) + +# Create the language directory if it doesn't exist +mkdir -p "$LANGUAGE_DIR" + +# Function to install a language +install_language() { + lang=$1 + echo "Installing Tree-sitter parser for $lang..." + + # Clone the repository if it doesn't exist + if [ ! -d "$LANGUAGE_DIR/tree-sitter-$lang" ]; then + git clone "https://github.com/tree-sitter/tree-sitter-$lang" "$LANGUAGE_DIR/tree-sitter-$lang" + fi + + # Navigate to the repository directory + cd "$LANGUAGE_DIR/tree-sitter-$lang" + + # Update submodules + git submodule update --init + + # Compile the parser + cc -fPIC -c -I./src src/parser.c + cc -shared *.o -o "../$lang.so" + + # Clean up object files + rm *.o + + # Navigate back to the original directory + cd ../.. + + echo "Tree-sitter parser for $lang installed successfully." 
+} + +# Install each language +for lang in "${LANGUAGES[@]}"; do + install_language $lang +done + +echo "All Tree-sitter parsers have been installed." \ No newline at end of file diff --git a/kaizen/generator/unit_test.py b/kaizen/generator/unit_test.py index 66b8623e..7c85ded5 100644 --- a/kaizen/generator/unit_test.py +++ b/kaizen/generator/unit_test.py @@ -59,7 +59,11 @@ def _setup_directories(self): self._create_output_folder(self.output_folder) def generate_tests_from_dir( - self, dir_path: str, output_path: str = None, max_critique: int = 3, verbose: bool = False, + self, + dir_path: str, + output_path: str = None, + max_critique: int = 3, + verbose: bool = False, enable_critique: bool = False, ): """ @@ -69,14 +73,14 @@ def generate_tests_from_dir( self.enable_critique = enable_critique self.verbose = verbose if verbose else self.verbose self.output_folder = output_path if output_path else self.output_folder - for file_path in Path(dir_path).rglob('*.*'): + for file_path in Path(dir_path).rglob("*.*"): try: self.generate_tests(file_path=str(file_path), output_path=output_path) except Exception as e: print(f"Error: Could not generate tests for {file_path}: {e}") - + return {}, self.total_usage - + def generate_tests( self, file_path: str, diff --git a/kaizen/llms/provider.py b/kaizen/llms/provider.py index 5d575bf9..e58836fa 100644 --- a/kaizen/llms/provider.py +++ b/kaizen/llms/provider.py @@ -232,3 +232,12 @@ def get_usage_cost(self, total_usage: Dict[str, int], model: str = None) -> floa return litellm.cost_per_token( model, total_usage["prompt_tokens"], total_usage["completion_tokens"] ) + + def get_text_embedding(self, text): + # for model in self.config["language_model"]["models"]: + # if model["model_name"] == "embedding": + # break + response = self.provider.embedding( + model="embedding", input=[text], dimensions=1536 + ) + return response["data"], response["usage"] diff --git a/kaizen/retriever/code_chunker.py b/kaizen/retriever/code_chunker.py index b7749652..ebd0d15f 100644 --- a/kaizen/retriever/code_chunker.py +++ b/kaizen/retriever/code_chunker.py @@ -1,140 +1,156 @@ -import ast -import esprima -import escodegen -import json - - -ParsedBody = { - "functions": {}, - "classes": {}, - "hooks": {}, - "components": {}, - "other_blocks": [], +import os +import subprocess +from tree_sitter import Language, Parser +from typing import Dict, List, Any + +ParsedBody = Dict[str, Dict[str, Any]] + +# Define the languages and their GitHub repositories +LANGUAGES = { + "python": "https://github.com/tree-sitter/tree-sitter-python", + "javascript": "https://github.com/tree-sitter/tree-sitter-javascript", + "typescript": "https://github.com/tree-sitter/tree-sitter-typescript", + "rust": "https://github.com/tree-sitter/tree-sitter-rust", } - -def chunk_python_code(code): - tree = ast.parse(code) - functions = {} - classes = {} - other_blocks = [] - current_block = [] - - for node in ast.iter_child_nodes(tree): - if isinstance(node, ast.FunctionDef): - functions[node.name] = ast.unparse(node) - elif isinstance(node, ast.ClassDef): - methods = {} - for item in node.body: - if isinstance(item, ast.FunctionDef): - methods[item.name] = ast.unparse(item) - classes[node.name] = {"definition": ast.unparse(node), "methods": methods} - elif isinstance(node, (ast.If, ast.For, ast.While)): - other_blocks.append(ast.unparse(node)) - else: - current_block.append(ast.unparse(node)) - - if current_block: - other_blocks.append("\n".join(current_block)) - - body = ParsedBody - body["functions"] = 
functions - body["classes"] = classes - body["other_blocks"] = other_blocks - return body +# Directory to store the language libraries +LANGUAGE_DIR = os.path.join(os.path.dirname(__file__), "tree_sitter_languages") + + +def ensure_language_installed(language: str) -> None: + if not os.path.exists(LANGUAGE_DIR): + os.makedirs(LANGUAGE_DIR) + + lang_file = os.path.join(LANGUAGE_DIR, f"{language}.so") + if not os.path.exists(lang_file): + repo_url = LANGUAGES[language] + repo_dir = os.path.join(LANGUAGE_DIR, f"tree-sitter-{language}") + + if not os.path.exists(repo_dir): + subprocess.run(["git", "clone", repo_url, repo_dir], check=True) + + subprocess.run( + ["bash", "-c", f"cd {repo_dir} && git submodule update --init"], check=True + ) + Language.build_library(lang_file, [repo_dir]) + + +def get_parser(language: str) -> Parser: + ensure_language_installed(language) + parser = Parser() + lang_file = os.path.join(LANGUAGE_DIR, f"{language}.so") + lang = Language(lang_file, language) + parser.set_language(lang) + return parser + + +def traverse_tree(node, code_bytes: bytes) -> Dict[str, Any]: + if node.type in [ + "function_definition", + "function_declaration", + "arrow_function", + "method_definition", + ]: + return { + "type": "function", + "name": ( + node.child_by_field_name("name").text.decode("utf8") + if node.child_by_field_name("name") + else "anonymous" + ), + "code": code_bytes[node.start_byte : node.end_byte].decode("utf8"), + } + elif node.type in ["class_definition", "class_declaration"]: + return { + "type": "class", + "name": node.child_by_field_name("name").text.decode("utf8"), + "code": code_bytes[node.start_byte : node.end_byte].decode("utf8"), + } + elif node.type in ["jsx_element", "jsx_self_closing_element"]: + return { + "type": "component", + "name": ( + node.child_by_field_name("opening_element") + .child_by_field_name("name") + .text.decode("utf8") + if node.type == "jsx_element" + else node.child_by_field_name("name").text.decode("utf8") + ), + "code": code_bytes[node.start_byte : node.end_byte].decode("utf8"), + } + elif node.type == "impl_item": + return { + "type": "impl", + "name": node.child_by_field_name("type").text.decode("utf8"), + "code": code_bytes[node.start_byte : node.end_byte].decode("utf8"), + } + else: + return None -def chunk_javascript_code(code): - tree = esprima.parseModule(code, jsx=True, tolerant=True) - functions = {} - classes = {} - components = {} - hooks = {} - other_blocks = [] +def chunk_code(code: str, language: str) -> ParsedBody: + parser = get_parser(language) + tree = parser.parse(code.encode("utf8")) - def ast_to_source(node): - try: - return escodegen.generate(node) - except Exception: - return f"// Unable to generate code for {node.type}" + body: ParsedBody = { + "functions": {}, + "classes": {}, + "hooks": {}, + "components": {}, + "other_blocks": [], + } + code_bytes = code.encode("utf8") def process_node(node): - if node.type == "FunctionDeclaration": - if is_react_component(node): - components[node.id.name] = ast_to_source(node) - else: - functions[node.id.name] = ast_to_source(node) - elif node.type == "ClassDeclaration": - if is_react_component(node): - components[node.id.name] = ast_to_source(node) - else: - methods = {} - for item in node.body.body: - if item.type == "MethodDefinition": - methods[item.key.name] = ast_to_source(item) - classes[node.id.name] = { - "definition": ast_to_source(node), - "methods": methods, - } - elif node.type == "VariableDeclaration": - for decl in node.declarations: - if decl.init and 
decl.init.type == "ArrowFunctionExpression": - if is_react_component(decl.init): - components[decl.id.name] = ast_to_source(node) - elif is_react_hook(decl.id.name): - hooks[decl.id.name] = ast_to_source(node) - else: - functions[decl.id.name] = ast_to_source(node) + result = traverse_tree(node, code_bytes) + if result: + if result["type"] == "function": + if is_react_hook(result["name"]): + body["hooks"][result["name"]] = result["code"] + elif is_react_component(result["code"]): + body["components"][result["name"]] = result["code"] else: - other_blocks.append(ast_to_source(node)) - elif node.type in [ - "ImportDeclaration", - "ExportDefaultDeclaration", - "ExportNamedDeclaration", - ]: - other_blocks.append(ast_to_source(node)) + body["functions"][result["name"]] = result["code"] + elif result["type"] == "class": + if is_react_component(result["code"]): + body["components"][result["name"]] = result["code"] + else: + body["classes"][result["name"]] = result["code"] + elif result["type"] == "component": + body["components"][result["name"]] = result["code"] + elif result["type"] == "impl": + body["classes"][result["name"]] = result["code"] else: - other_blocks.append(ast_to_source(node)) - - def is_react_component(node): - # Check if the function/class is likely a React component - if node.type == "FunctionDeclaration" or node.type == "ArrowFunctionExpression": - body = node.body.body if node.body.type == "BlockStatement" else [node.body] - return any( - stmt.type == "ReturnStatement" - and stmt.argument - and stmt.argument.type == "JSXElement" - for stmt in body - ) - elif node.type == "ClassDeclaration": - return any( - method.key.name == "render" - for method in node.body.body - if method.type == "MethodDefinition" - ) - return False - - def is_react_hook(name): - # Check if the function name starts with 'use' - return name.startswith("use") and name[3].isupper() - - for node in tree.body: - process_node(node) - - # return functions, classes, components, hooks, other_blocks - body = ParsedBody - body["functions"] = functions - body["classes"] = classes - body["other_blocks"] = other_blocks - body["components"] = components - body["hooks"] = hooks + for child in node.children: + process_node(child) + + process_node(tree.root_node) + + # Collect remaining code as other_blocks + collected_ranges = [] + for section in body.values(): + if isinstance(section, dict): + for code_block in section.values(): + start = code.index(code_block) + collected_ranges.append((start, start + len(code_block))) + + collected_ranges.sort() + last_end = 0 + for start, end in collected_ranges: + if start > last_end: + body["other_blocks"].append(code[last_end:start].strip()) + last_end = end + if last_end < len(code): + body["other_blocks"].append(code[last_end:].strip()) + return body -def chunk_code(code, language): - if language.lower() == "python": - return chunk_python_code(code) - elif language.lower() in ["javascript", "js"]: - return chunk_javascript_code(code) - else: - raise ValueError("Unsupported language. 
Please use 'python' or 'javascript'.") +def is_react_hook(name: str) -> bool: + return name.startswith("use") and len(name) > 3 and name[3].isupper() + + +def is_react_component(code: str) -> bool: + return ( + "React" in code or "jsx" in code.lower() or "tsx" in code.lower() or "<" in code + ) diff --git a/kaizen/retriever/feedback_system.py b/kaizen/retriever/feedback_system.py new file mode 100644 index 00000000..8c47a1ec --- /dev/null +++ b/kaizen/retriever/feedback_system.py @@ -0,0 +1,18 @@ +from typing import Dict, Any + + +class AbstractionFeedback: + def __init__(self): + self.feedback_store: Dict[str, Dict[str, Any]] = {} + + def add_feedback( + self, code_id: str, abstraction: str, rating: int, correction: str = None + ) -> None: + self.feedback_store[code_id] = { + "abstraction": abstraction, + "rating": rating, + "correction": correction, + } + + def get_feedback(self, code_id: str) -> Dict[str, Any]: + return self.feedback_store.get(code_id, None) diff --git a/kaizen/retriever/llama_index_retriever.py b/kaizen/retriever/llama_index_retriever.py index 10c09206..86abec50 100644 --- a/kaizen/retriever/llama_index_retriever.py +++ b/kaizen/retriever/llama_index_retriever.py @@ -1,71 +1,182 @@ import os -from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex +import logging +import openai +from llama_index.core import ( + SimpleDirectoryReader, + StorageContext, + VectorStoreIndex, + Document, +) from llama_index.vector_stores.postgres import PGVectorStore from sqlalchemy import create_engine, text import ast +from llama_index.core import VectorStoreIndex + import networkx as nx +from typing import List, Dict, Any +from concurrent.futures import ThreadPoolExecutor, as_completed +import tiktoken +from kaizen.llms.provider import LLMProvider +from kaizen.retriever.code_chunker import chunk_code + +# Set up logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + +# Set up OpenAI API key +openai.api_key = os.environ.get("OPENAI_API_KEY") + +# Initialize tokenizer +tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo") class RepositoryAnalyzer: def __init__(self): - self.index = None + logger.info("Initializing RepositoryAnalyzer") self.engine = create_engine( - f"postgresql://{os.environ['PG_USER']}:{os.environ['PG_PASSWORD']}@{os.environ['PG_HOST']}:{os.environ['PG_PORT']}/{os.environ['db_name']}" + f"postgresql://{os.environ['POSTGRES_USER']}:{os.environ['POSTGRES_PASSWORD']}@{os.environ['POSTGRES_HOST']}:{os.environ['POSTGRES_PORT']}/{os.environ['POSTGRES_DB']}", + pool_size=10, + max_overflow=20, ) self.graph = nx.DiGraph() - - def load_index(self, folder_path): - documents = SimpleDirectoryReader(folder_path).load_data() - - vector_store = PGVectorStore.from_params( - database=os.environ["db_name"], - host=os.environ["PG_HOST"], - password=os.environ["PG_PASSWORD"], - port=os.environ["PG_PORT"], - user=os.environ["PG_USER"], + self.vector_store = PGVectorStore.from_params( + database=os.environ["POSTGRES_DB"], + host=os.environ["POSTGRES_HOST"], + password=os.environ["POSTGRES_PASSWORD"], + port=os.environ["POSTGRES_PORT"], + user=os.environ["POSTGRES_USER"], table_name="embeddings", - embed_dim=512, # openai embedding dimension + embed_dim=1536, # OpenAI's text-embedding-ada-002 dimension ) - - storage_context = StorageContext.from_defaults(vector_store=vector_store) - self.index = VectorStoreIndex.from_documents( - documents, storage_context=storage_context, 
show_progress=True + self.provider = LLMProvider() + self.storage_context = StorageContext.from_defaults( + vector_store=self.vector_store ) + logger.info("RepositoryAnalyzer initialized successfully") - def parse_repository(self, repo_path): - for root, dirs, files in os.walk(repo_path): - for file in files: - if file.endswith(".py"): - file_path = os.path.join(root, file) - self.parse_file(file_path) - - def parse_file(self, file_path): - with open(file_path, "r") as file: - content = file.read() - - tree = ast.parse(content) - for node in ast.walk(tree): - if isinstance(node, ast.FunctionDef): - self.process_function(node, file_path) - - def process_function(self, node, file_path): - function_name = node.name - start_line = node.lineno - end_line = node.end_lineno - - # Store function information in the database - self.store_function_in_db(function_name, file_path, start_line, end_line) - - # Analyze function calls within the function - for sub_node in ast.walk(node): - if isinstance(sub_node, ast.Call): - if isinstance(sub_node.func, ast.Name): - called_function = sub_node.func.id - self.graph.add_edge(function_name, called_function) - - def store_function_in_db(self, function_name, file_path, start_line, end_line): - with self.engine.connect() as connection: - # Insert into files table if not exists + def setup_repository(self, repo_path: str): + self.total_usage = self.provider.DEFAULT_USAGE + logger.info(f"Starting repository setup for: {repo_path}") + self.parse_repository(repo_path) + self.store_function_relationships() + logger.info("Repository setup completed successfully") + + def parse_repository(self, repo_path: str): + logger.info(f"Parsing repository: {repo_path}") + with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: + futures = [] + for root, _, files in os.walk(repo_path): + for file in files: + if file.endswith( + (".py", ".js", ".ts", ".rs") + ): # Add more extensions as needed + file_path = os.path.join(root, file) + futures.append(executor.submit(self.parse_file, file_path)) + + for future in as_completed(futures): + try: + future.result() + except Exception as e: + logger.error(f"Error in parsing file: {str(e)}") + logger.info("Repository parsing completed") + + def parse_file(self, file_path: str): + logger.debug(f"Parsing file: {file_path}") + try: + with open(file_path, "r", encoding="utf-8") as file: + content = file.read() + + language = self.get_language_from_extension(file_path) + chunked_code = chunk_code(content, language) + + for section, items in chunked_code.items(): + if isinstance(items, dict): + for name, code_info in items.items(): + self.process_code_block(code_info, file_path, section, name) + elif isinstance(items, list): + for i, code_info in enumerate(items): + self.process_code_block( + code_info, file_path, section, f"{section}_{i}" + ) + logger.debug(f"Successfully parsed file: {file_path}") + except Exception as e: + logger.error(f"Error processing file {file_path}: {str(e)}") + + @staticmethod + def get_language_from_extension(file_path: str) -> str: + ext = os.path.splitext(file_path)[1].lower() + return { + ".py": "python", + ".js": "javascript", + ".jsx": "javascript", + ".ts": "typescript", + ".tsx": "typescript", + ".rs": "rust", + }.get(ext, "unknown") + + def process_code_block( + self, code_info: Dict[str, Any], file_path: str, section: str, name: str + ): + logger.debug(f"Processing code block: {section} - {name}") + code = code_info["code"] + language = self.get_language_from_extension(file_path) + abstraction, usage 
= self.generate_abstraction(code, language) + total_usage = self.provider.update_usage() + + snippet_id = self.store_code_in_db(code, abstraction, file_path, section, name) + self.store_abstraction_and_embedding(snippet_id, abstraction) + + if section == "functions": + self.analyze_function_calls(name, code) + logger.debug(f"Finished processing code block: {section} - {name}") + + def generate_abstraction( + self, code_block: str, language: str, max_tokens: int = 300 + ) -> str: + prompt = f"""Generate a concise yet comprehensive abstract description of the following {language} code block. + Include information about: + 1. The purpose or functionality of the code + 2. Input parameters and return values (if applicable) + 3. Any important algorithms or data structures used + 4. Key dependencies or external libraries used + 5. Any notable design patterns or architectural choices + 6. Potential edge cases or error handling + + Code: + ```{language} + {code_block} + ``` + """ + + estimated_prompt_tokens = len(tokenizer.encode(prompt)) + adjusted_max_tokens = min(max(150, estimated_prompt_tokens), 1000) + + try: + abstraction, usage = self.provider.chat_completion( + model="small", + messages=[ + { + "role": "system", + "content": "You are an expert programmer tasked with generating comprehensive and accurate abstractions of code snippets.", + }, + {"role": "user", "content": prompt}, + ], + max_tokens=adjusted_max_tokens, + n=1, + temperature=0.5, + ) + return abstraction + + except Exception as e: + raise e + + def store_code_in_db( + self, code: str, abstraction: str, file_path: str, section: str, name: str + ) -> int: + logger.debug(f"Storing code in DB: {file_path} - {section} - {name}") + with self.engine.begin() as connection: file_query = text( """ INSERT INTO files (repo_id, file_path, file_name, file_ext, programming_language) @@ -74,98 +185,107 @@ def store_function_in_db(self, function_name, file_path, start_line, end_line): RETURNING file_id """ ) - file_result = connection.execute( + file_id = connection.execute( file_query, { "repo_id": 1, # Assuming repo_id is 1, adjust as needed "file_path": file_path, "file_name": os.path.basename(file_path), - "file_ext": ".py", - "programming_language": "Python", + "file_ext": os.path.splitext(file_path)[1], + "programming_language": self.get_language_from_extension(file_path), }, - ) - file_id = file_result.fetchone()[0] + ).scalar_one() - # Insert into code_snippets table snippet_query = text( """ - INSERT INTO code_snippets (file_id, snippet_text, start_line, end_line, functionality) - VALUES (:file_id, :snippet_text, :start_line, :end_line, :functionality) + INSERT INTO code_snippets (file_id, snippet_text, functionality, context) + VALUES (:file_id, :snippet_text, :functionality, :context) RETURNING snippet_id """ ) - snippet_result = connection.execute( + snippet_id = connection.execute( snippet_query, { "file_id": file_id, - "snippet_text": function_name, # This should be the actual function code - "start_line": start_line, - "end_line": end_line, - "functionality": f"Function: {function_name}", + "snippet_text": code, + "functionality": abstraction, + "context": f"{section}: {name}", }, - ) - snippet_id = snippet_result.fetchone()[0] + ).scalar_one() - # Insert into ast_nodes table - node_query = text( - """ - INSERT INTO ast_nodes (file_id, node_type, start_line, end_line) - VALUES (:file_id, :node_type, :start_line, :end_line) - RETURNING node_id - """ - ) - node_result = connection.execute( - node_query, - { - "file_id": file_id, - 
"node_type": "FunctionDef", - "start_line": start_line, - "end_line": end_line, - }, - ) - node_id = node_result.fetchone()[0] + logger.debug(f"Code stored in DB with snippet_id: {snippet_id}") + return snippet_id - # Insert function name as a property - prop_query = text( - """ - INSERT INTO node_properties (node_id, property_name, property_value) - VALUES (:node_id, :property_name, :property_value) - """ - ) - connection.execute( - prop_query, - { - "node_id": node_id, - "property_name": "function_name", - "property_value": function_name, - }, - ) + def store_abstraction_and_embedding(self, snippet_id: int, abstraction: str): + logger.debug(f"Storing abstraction and embedding for snippet_id: {snippet_id}") + + embedding = self.provider.get_text_embedding(abstraction) + doc = Document(text=abstraction, metadata={"snippet_id": snippet_id}) + self.vector_store.add_documents([doc], embedding_vectors=[embedding]) + + logger.debug(f"Abstraction and embedding stored for snippet_id: {snippet_id}") + + def analyze_function_calls(self, function_name: str, code: str): + logger.debug(f"Analyzing function calls for: {function_name}") + try: + tree = ast.parse(code) + for node in ast.walk(tree): + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name): + self.graph.add_edge(function_name, node.func.id) + logger.debug(f"Added edge: {function_name} -> {node.func.id}") + except SyntaxError: + logger.error(f"Syntax error in function {function_name}") def store_function_relationships(self): - for caller, callee in self.graph.edges(): - with self.engine.connect() as connection: + logger.info("Storing function relationships") + with self.engine.begin() as connection: + for caller, callee in self.graph.edges(): query = text( """ INSERT INTO node_relationships (parent_node_id, child_node_id, relationship_type) VALUES ( - (SELECT node_id FROM ast_nodes WHERE node_type = 'FunctionDef' AND node_id IN - (SELECT node_id FROM node_properties WHERE property_name = 'function_name' AND property_value = :caller) - ), - (SELECT node_id FROM ast_nodes WHERE node_type = 'FunctionDef' AND node_id IN - (SELECT node_id FROM node_properties WHERE property_name = 'function_name' AND property_value = :callee) - ), + (SELECT snippet_id FROM code_snippets WHERE context LIKE :caller), + (SELECT snippet_id FROM code_snippets WHERE context LIKE :callee), 'calls' ) + ON CONFLICT DO NOTHING """ ) - connection.execute(query, {"caller": caller, "callee": callee}) + connection.execute( + query, {"caller": f"%{caller}", "callee": f"%{callee}"} + ) + logger.info("Function relationships stored successfully") - def query(self, query_text): - # Perform retrieval using the index - response = self.index.query(query_text) - return response + def query(self, query_text: str, num_results: int = 5) -> List[Dict[str, Any]]: + logger.info(f"Performing query: '{query_text}'") - def analyze_repository(self, repo_path): - self.parse_repository(repo_path) - self.store_function_relationships() - self.load_index(repo_path) + index = VectorStoreIndex.from_vector_store(self.vector_store) + + # Create a query engine + query_engine = index.as_query_engine(similarity_top_k=num_results) + + # Perform the query + response = query_engine.query(query_text) + + results = [] + with self.engine.connect() as connection: + for node in response.source_nodes: + snippet_id = node.metadata["snippet_id"] + query = text(""" + SELECT cs.snippet_text, cs.functionality, f.file_path + FROM code_snippets cs + JOIN files f ON cs.file_id = f.file_id + WHERE cs.snippet_id = 
:snippet_id + """) + result = connection.execute(query, {"snippet_id": snippet_id}).fetchone() + if result: + results.append({ + "code": result[0], + "abstraction": result[1], + "file_path": result[2], + "relevance_score": node.score if hasattr(node, 'score') else 1.0 + }) + + sorted_results = sorted(results, key=lambda x: x["relevance_score"], reverse=True) + logger.info(f"Query completed. Found {len(sorted_results)} results.") + return sorted_results diff --git a/kaizen/retriever/query_processor.py b/kaizen/retriever/query_processor.py deleted file mode 100644 index e69de29b..00000000 diff --git a/kaizen/retriever/result_processor.py b/kaizen/retriever/result_processor.py deleted file mode 100644 index e69de29b..00000000 diff --git a/kaizen/retriever/tree_sitter_utils.py b/kaizen/retriever/tree_sitter_utils.py new file mode 100644 index 00000000..356e39a5 --- /dev/null +++ b/kaizen/retriever/tree_sitter_utils.py @@ -0,0 +1,107 @@ +import os +from functools import lru_cache +from tree_sitter import Language, Parser +from typing import Dict, Any +import logging + +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Directory where the language libraries are stored +LANGUAGE_DIR = "/app/tree_sitter_languages" + +class LanguageLoader: + @staticmethod + @lru_cache(maxsize=None) + def load_language(language: str) -> Language: + try: + lang_file = os.path.join(LANGUAGE_DIR, f"{language}.so") + if not os.path.exists(lang_file): + raise FileNotFoundError(f"Language file for {language} not found.") + return Language(lang_file, language) + except Exception as e: + logger.error(f"Failed to load language {language}: {str(e)}") + raise + +class ParserFactory: + @staticmethod + @lru_cache(maxsize=None) + def get_parser(language: str) -> Parser: + try: + parser = Parser() + lang = LanguageLoader.load_language(language) + parser.set_language(lang) + return parser + except Exception as e: + logger.error(f"Failed to create parser for {language}: {str(e)}") + raise + +def traverse_tree(node, code_bytes: bytes) -> Dict[str, Any]: + if node.type in [ + "function_definition", + "function_declaration", + "arrow_function", + "method_definition", + ]: + return { + "type": "function", + "name": ( + node.child_by_field_name("name").text.decode("utf8") + if node.child_by_field_name("name") + else "anonymous" + ), + "code": code_bytes[node.start_byte : node.end_byte].decode("utf8"), + } + elif node.type in ["class_definition", "class_declaration"]: + return { + "type": "class", + "name": node.child_by_field_name("name").text.decode("utf8"), + "code": code_bytes[node.start_byte : node.end_byte].decode("utf8"), + } + elif node.type in ["jsx_element", "jsx_self_closing_element"]: + return { + "type": "component", + "name": ( + node.child_by_field_name("opening_element") + .child_by_field_name("name") + .text.decode("utf8") + if node.type == "jsx_element" + else node.child_by_field_name("name").text.decode("utf8") + ), + "code": code_bytes[node.start_byte : node.end_byte].decode("utf8"), + } + elif node.type == "impl_item": + return { + "type": "impl", + "name": node.child_by_field_name("type").text.decode("utf8"), + "code": code_bytes[node.start_byte : node.end_byte].decode("utf8"), + } + else: + return None + +def parse_code(code: str, language: str) -> Dict[str, Any]: + try: + parser = ParserFactory.get_parser(language) + tree = parser.parse(bytes(code, "utf8")) + return traverse_tree(tree.root_node, code.encode("utf8")) + except Exception as e: + logger.error(f"Failed 
to parse {language} code: {str(e)}") + raise + +def check_language_files(): + required_languages = ["python", "javascript", "typescript", "rust"] + missing_languages = [] + for lang in required_languages: + try: + LanguageLoader.load_language(lang) + except FileNotFoundError: + missing_languages.append(lang) + + if missing_languages: + logger.warning(f"Missing language files for: {', '.join(missing_languages)}") + else: + logger.info("All required language files are present.") + +# Call this function at the start of your application +check_language_files() \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 1adae5ba..a5bab61b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,3 +1,5 @@ +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. + [[package]] name = "aiohappyeyeballs" version = "2.3.4" @@ -1037,109 +1039,6 @@ files = [ {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] -[[package]] -name = "ijson" -version = "3.3.0" -description = "Iterative JSON parser with standard Python iterator interfaces" -optional = false -python-versions = "*" -files = [ - {file = "ijson-3.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7f7a5250599c366369fbf3bc4e176f5daa28eb6bc7d6130d02462ed335361675"}, - {file = "ijson-3.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f87a7e52f79059f9c58f6886c262061065eb6f7554a587be7ed3aa63e6b71b34"}, - {file = "ijson-3.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b73b493af9e947caed75d329676b1b801d673b17481962823a3e55fe529c8b8b"}, - {file = "ijson-3.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5576415f3d76290b160aa093ff968f8bf6de7d681e16e463a0134106b506f49"}, - {file = "ijson-3.3.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4e9ffe358d5fdd6b878a8a364e96e15ca7ca57b92a48f588378cef315a8b019e"}, - {file = "ijson-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8643c255a25824ddd0895c59f2319c019e13e949dc37162f876c41a283361527"}, - {file = "ijson-3.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:df3ab5e078cab19f7eaeef1d5f063103e1ebf8c26d059767b26a6a0ad8b250a3"}, - {file = "ijson-3.3.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3dc1fb02c6ed0bae1b4bf96971258bf88aea72051b6e4cebae97cff7090c0607"}, - {file = "ijson-3.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e9afd97339fc5a20f0542c971f90f3ca97e73d3050cdc488d540b63fae45329a"}, - {file = "ijson-3.3.0-cp310-cp310-win32.whl", hash = "sha256:844c0d1c04c40fd1b60f148dc829d3f69b2de789d0ba239c35136efe9a386529"}, - {file = "ijson-3.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:d654d045adafdcc6c100e8e911508a2eedbd2a1b5f93f930ba13ea67d7704ee9"}, - {file = "ijson-3.3.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:501dce8eaa537e728aa35810656aa00460a2547dcb60937c8139f36ec344d7fc"}, - {file = "ijson-3.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:658ba9cad0374d37b38c9893f4864f284cdcc7d32041f9808fba8c7bcaadf134"}, - {file = "ijson-3.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2636cb8c0f1023ef16173f4b9a233bcdb1df11c400c603d5f299fac143ca8d70"}, - {file = "ijson-3.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd174b90db68c3bcca273e9391934a25d76929d727dc75224bf244446b28b03b"}, - {file = "ijson-3.3.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:97a9aea46e2a8371c4cf5386d881de833ed782901ac9f67ebcb63bb3b7d115af"}, - {file = "ijson-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c594c0abe69d9d6099f4ece17763d53072f65ba60b372d8ba6de8695ce6ee39e"}, - {file = "ijson-3.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8e0ff16c224d9bfe4e9e6bd0395826096cda4a3ef51e6c301e1b61007ee2bd24"}, - {file = "ijson-3.3.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0015354011303175eae7e2ef5136414e91de2298e5a2e9580ed100b728c07e51"}, - {file = "ijson-3.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:034642558afa57351a0ffe6de89e63907c4cf6849070cc10a3b2542dccda1afe"}, - {file = "ijson-3.3.0-cp311-cp311-win32.whl", hash = "sha256:192e4b65495978b0bce0c78e859d14772e841724d3269fc1667dc6d2f53cc0ea"}, - {file = "ijson-3.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:72e3488453754bdb45c878e31ce557ea87e1eb0f8b4fc610373da35e8074ce42"}, - {file = "ijson-3.3.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:988e959f2f3d59ebd9c2962ae71b97c0df58323910d0b368cc190ad07429d1bb"}, - {file = "ijson-3.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b2f73f0d0fce5300f23a1383d19b44d103bb113b57a69c36fd95b7c03099b181"}, - {file = "ijson-3.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0ee57a28c6bf523d7cb0513096e4eb4dac16cd935695049de7608ec110c2b751"}, - {file = "ijson-3.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e0155a8f079c688c2ccaea05de1ad69877995c547ba3d3612c1c336edc12a3a5"}, - {file = "ijson-3.3.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ab00721304af1ae1afa4313ecfa1bf16b07f55ef91e4a5b93aeaa3e2bd7917c"}, - {file = "ijson-3.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40ee3821ee90be0f0e95dcf9862d786a7439bd1113e370736bfdf197e9765bfb"}, - {file = "ijson-3.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:da3b6987a0bc3e6d0f721b42c7a0198ef897ae50579547b0345f7f02486898f5"}, - {file = "ijson-3.3.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:63afea5f2d50d931feb20dcc50954e23cef4127606cc0ecf7a27128ed9f9a9e6"}, - {file = "ijson-3.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b5c3e285e0735fd8c5a26d177eca8b52512cdd8687ca86ec77a0c66e9c510182"}, - {file = "ijson-3.3.0-cp312-cp312-win32.whl", hash = "sha256:907f3a8674e489abdcb0206723e5560a5cb1fa42470dcc637942d7b10f28b695"}, - {file = "ijson-3.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:8f890d04ad33262d0c77ead53c85f13abfb82f2c8f078dfbf24b78f59534dfdd"}, - {file = "ijson-3.3.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:b9d85a02e77ee8ea6d9e3fd5d515bcc3d798d9c1ea54817e5feb97a9bc5d52fe"}, - {file = "ijson-3.3.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6576cdc36d5a09b0c1a3d81e13a45d41a6763188f9eaae2da2839e8a4240bce"}, - {file = "ijson-3.3.0-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5589225c2da4bb732c9c370c5961c39a6db72cf69fb2a28868a5413ed7f39e6"}, - {file = "ijson-3.3.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad04cf38164d983e85f9cba2804566c0160b47086dcca4cf059f7e26c5ace8ca"}, - {file = "ijson-3.3.0-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:a3b730ef664b2ef0e99dec01b6573b9b085c766400af363833e08ebc1e38eb2f"}, - {file = "ijson-3.3.0-cp36-cp36m-musllinux_1_2_i686.whl", hash = "sha256:4690e3af7b134298055993fcbea161598d23b6d3ede11b12dca6815d82d101d5"}, - {file = 
"ijson-3.3.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:aaa6bfc2180c31a45fac35d40e3312a3d09954638ce0b2e9424a88e24d262a13"}, - {file = "ijson-3.3.0-cp36-cp36m-win32.whl", hash = "sha256:44367090a5a876809eb24943f31e470ba372aaa0d7396b92b953dda953a95d14"}, - {file = "ijson-3.3.0-cp36-cp36m-win_amd64.whl", hash = "sha256:7e2b3e9ca957153557d06c50a26abaf0d0d6c0ddf462271854c968277a6b5372"}, - {file = "ijson-3.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:47c144117e5c0e2babb559bc8f3f76153863b8dd90b2d550c51dab5f4b84a87f"}, - {file = "ijson-3.3.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29ce02af5fbf9ba6abb70765e66930aedf73311c7d840478f1ccecac53fefbf3"}, - {file = "ijson-3.3.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4ac6c3eeed25e3e2cb9b379b48196413e40ac4e2239d910bb33e4e7f6c137745"}, - {file = "ijson-3.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d92e339c69b585e7b1d857308ad3ca1636b899e4557897ccd91bb9e4a56c965b"}, - {file = "ijson-3.3.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:8c85447569041939111b8c7dbf6f8fa7a0eb5b2c4aebb3c3bec0fb50d7025121"}, - {file = "ijson-3.3.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:542c1e8fddf082159a5d759ee1412c73e944a9a2412077ed00b303ff796907dc"}, - {file = "ijson-3.3.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:30cfea40936afb33b57d24ceaf60d0a2e3d5c1f2335ba2623f21d560737cc730"}, - {file = "ijson-3.3.0-cp37-cp37m-win32.whl", hash = "sha256:6b661a959226ad0d255e49b77dba1d13782f028589a42dc3172398dd3814c797"}, - {file = "ijson-3.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:0b003501ee0301dbf07d1597482009295e16d647bb177ce52076c2d5e64113e0"}, - {file = "ijson-3.3.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:3e8d8de44effe2dbd0d8f3eb9840344b2d5b4cc284a14eb8678aec31d1b6bea8"}, - {file = "ijson-3.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9cd5c03c63ae06d4f876b9844c5898d0044c7940ff7460db9f4cd984ac7862b5"}, - {file = "ijson-3.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04366e7e4a4078d410845e58a2987fd9c45e63df70773d7b6e87ceef771b51ee"}, - {file = "ijson-3.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de7c1ddb80fa7a3ab045266dca169004b93f284756ad198306533b792774f10a"}, - {file = "ijson-3.3.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8851584fb931cffc0caa395f6980525fd5116eab8f73ece9d95e6f9c2c326c4c"}, - {file = "ijson-3.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bdcfc88347fd981e53c33d832ce4d3e981a0d696b712fbcb45dcc1a43fe65c65"}, - {file = "ijson-3.3.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:3917b2b3d0dbbe3296505da52b3cb0befbaf76119b2edaff30bd448af20b5400"}, - {file = "ijson-3.3.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:e10c14535abc7ddf3fd024aa36563cd8ab5d2bb6234a5d22c77c30e30fa4fb2b"}, - {file = "ijson-3.3.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:3aba5c4f97f4e2ce854b5591a8b0711ca3b0c64d1b253b04ea7b004b0a197ef6"}, - {file = "ijson-3.3.0-cp38-cp38-win32.whl", hash = "sha256:b325f42e26659df1a0de66fdb5cde8dd48613da9c99c07d04e9fb9e254b7ee1c"}, - {file = "ijson-3.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:ff835906f84451e143f31c4ce8ad73d83ef4476b944c2a2da91aec8b649570e1"}, - {file = "ijson-3.3.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3c556f5553368dff690c11d0a1fb435d4ff1f84382d904ccc2dc53beb27ba62e"}, - {file = "ijson-3.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:e4396b55a364a03ff7e71a34828c3ed0c506814dd1f50e16ebed3fc447d5188e"}, - {file = "ijson-3.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e6850ae33529d1e43791b30575070670070d5fe007c37f5d06aebc1dd152ab3f"}, - {file = "ijson-3.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36aa56d68ea8def26778eb21576ae13f27b4a47263a7a2581ab2ef58b8de4451"}, - {file = "ijson-3.3.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a7ec759c4a0fc820ad5dc6a58e9c391e7b16edcb618056baedbedbb9ea3b1524"}, - {file = "ijson-3.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b51bab2c4e545dde93cb6d6bb34bf63300b7cd06716f195dd92d9255df728331"}, - {file = "ijson-3.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:92355f95a0e4da96d4c404aa3cff2ff033f9180a9515f813255e1526551298c1"}, - {file = "ijson-3.3.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:8795e88adff5aa3c248c1edce932db003d37a623b5787669ccf205c422b91e4a"}, - {file = "ijson-3.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:8f83f553f4cde6d3d4eaf58ec11c939c94a0ec545c5b287461cafb184f4b3a14"}, - {file = "ijson-3.3.0-cp39-cp39-win32.whl", hash = "sha256:ead50635fb56577c07eff3e557dac39533e0fe603000684eea2af3ed1ad8f941"}, - {file = "ijson-3.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:c8a9befb0c0369f0cf5c1b94178d0d78f66d9cebb9265b36be6e4f66236076b8"}, - {file = "ijson-3.3.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:2af323a8aec8a50fa9effa6d640691a30a9f8c4925bd5364a1ca97f1ac6b9b5c"}, - {file = "ijson-3.3.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f64f01795119880023ba3ce43072283a393f0b90f52b66cc0ea1a89aa64a9ccb"}, - {file = "ijson-3.3.0-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a716e05547a39b788deaf22725490855337fc36613288aa8ae1601dc8c525553"}, - {file = "ijson-3.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:473f5d921fadc135d1ad698e2697025045cd8ed7e5e842258295012d8a3bc702"}, - {file = "ijson-3.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dd26b396bc3a1e85f4acebeadbf627fa6117b97f4c10b177d5779577c6607744"}, - {file = "ijson-3.3.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:25fd49031cdf5fd5f1fd21cb45259a64dad30b67e64f745cc8926af1c8c243d3"}, - {file = "ijson-3.3.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b72178b1e565d06ab19319965022b36ef41bcea7ea153b32ec31194bec032a2"}, - {file = "ijson-3.3.0-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d0b6b637d05dbdb29d0bfac2ed8425bb369e7af5271b0cc7cf8b801cb7360c2"}, - {file = "ijson-3.3.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5378d0baa59ae422905c5f182ea0fd74fe7e52a23e3821067a7d58c8306b2191"}, - {file = "ijson-3.3.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:99f5c8ab048ee4233cc4f2b461b205cbe01194f6201018174ac269bf09995749"}, - {file = "ijson-3.3.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:45ff05de889f3dc3d37a59d02096948ce470699f2368b32113954818b21aa74a"}, - {file = "ijson-3.3.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1efb521090dd6cefa7aafd120581947b29af1713c902ff54336b7c7130f04c47"}, - {file = "ijson-3.3.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:87c727691858fd3a1c085d9980d12395517fcbbf02c69fbb22dede8ee03422da"}, - {file = 
"ijson-3.3.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0420c24e50389bc251b43c8ed379ab3e3ba065ac8262d98beb6735ab14844460"}, - {file = "ijson-3.3.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:8fdf3721a2aa7d96577970f5604bd81f426969c1822d467f07b3d844fa2fecc7"}, - {file = "ijson-3.3.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:891f95c036df1bc95309951940f8eea8537f102fa65715cdc5aae20b8523813b"}, - {file = "ijson-3.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed1336a2a6e5c427f419da0154e775834abcbc8ddd703004108121c6dd9eba9d"}, - {file = "ijson-3.3.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0c819f83e4f7b7f7463b2dc10d626a8be0c85fbc7b3db0edc098c2b16ac968e"}, - {file = "ijson-3.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33afc25057377a6a43c892de34d229a86f89ea6c4ca3dd3db0dcd17becae0dbb"}, - {file = "ijson-3.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7914d0cf083471856e9bc2001102a20f08e82311dfc8cf1a91aa422f9414a0d6"}, - {file = "ijson-3.3.0.tar.gz", hash = "sha256:7f172e6ba1bee0d4c8f8ebd639577bfe429dee0f3f96775a067b8bae4492d8a0"}, -] - [[package]] name = "importlib-metadata" version = "8.2.0" @@ -1159,24 +1058,6 @@ doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linke perf = ["ipython"] test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"] -[[package]] -name = "importlib-resources" -version = "6.4.0" -description = "Read resources from Python packages" -optional = false -python-versions = ">=3.8" -files = [ - {file = "importlib_resources-6.4.0-py3-none-any.whl", hash = "sha256:50d10f043df931902d4194ea07ec57960f66a80449ff867bfe782b4c486ba78c"}, - {file = "importlib_resources-6.4.0.tar.gz", hash = "sha256:cdb2b453b8046ca4e3798eb1d84f3cce1446a0e8e7b5ef4efb600f19fc398145"}, -] - -[package.dependencies] -zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} - -[package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["jaraco.test (>=5.4)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)", "zipp (>=3.17)"] - [[package]] name = "iniconfig" version = "2.0.0" @@ -1205,6 +1086,17 @@ MarkupSafe = ">=2.0" [package.extras] i18n = ["Babel (>=2.7)"] +[[package]] +name = "joblib" +version = "1.4.2" +description = "Lightweight pipelining with Python functions" +optional = false +python-versions = ">=3.8" +files = [ + {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"}, + {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, +] + [[package]] name = "jsonschema" version = "4.23.0" @@ -1218,9 +1110,7 @@ files = [ [package.dependencies] attrs = ">=22.2.0" -importlib-resources = {version = ">=1.4.0", markers = "python_version < \"3.9\""} jsonschema-specifications = ">=2023.03.6" -pkgutil-resolve-name = {version = ">=1.3.10", markers = "python_version < \"3.9\""} referencing = ">=0.28.4" rpds-py = ">=0.7.1" @@ -1240,7 +1130,6 @@ files = [ ] [package.dependencies] -importlib-resources = 
{version = ">=1.4.0", markers = "python_version < \"3.9\""} referencing = ">=0.31.0" [[package]] @@ -1257,7 +1146,6 @@ files = [ [package.dependencies] aiohttp = "*" click = "*" -ijson = "*" importlib-metadata = ">=6.8.0" jinja2 = ">=3.1.2,<4.0.0" jsonschema = ">=4.22.0,<5.0.0" @@ -1272,15 +1160,87 @@ tokenizers = "*" extra-proxy = ["azure-identity (>=1.15.0,<2.0.0)", "azure-keyvault-secrets (>=4.8.0,<5.0.0)", "google-cloud-kms (>=2.21.3,<3.0.0)", "prisma (==0.11.0)", "pynacl (>=1.5.0,<2.0.0)", "resend (>=0.8.0,<0.9.0)"] proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", "cryptography (>=42.0.5,<43.0.0)", "fastapi (>=0.111.0,<0.112.0)", "fastapi-sso (>=0.10.0,<0.11.0)", "gunicorn (>=22.0.0,<23.0.0)", "orjson (>=3.9.7,<4.0.0)", "python-multipart (>=0.0.9,<0.0.10)", "pyyaml (>=6.0.1,<7.0.0)", "rq", "uvicorn (>=0.22.0,<0.23.0)"] +[[package]] +name = "llama-cloud" +version = "0.0.13" +description = "" +optional = false +python-versions = "<4,>=3.8" +files = [ + {file = "llama_cloud-0.0.13-py3-none-any.whl", hash = "sha256:b641450308b80c85eeae7ef9cb5a3b4a3b1823d5cde05b626ce33f7494ec6229"}, + {file = "llama_cloud-0.0.13.tar.gz", hash = "sha256:0e3165a22f8df34a00d13f1f5739438ba4d620f2d8a9289df830078a39fe6f1f"}, +] + +[package.dependencies] +httpx = ">=0.20.0" +pydantic = ">=1.10" + +[[package]] +name = "llama-index" +version = "0.10.65" +description = "Interface between LLMs and your data" +optional = false +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "llama_index-0.10.65-py3-none-any.whl", hash = "sha256:3e5c447fa2dc8a5da95dce47a5dfe2e1c6a3b4f40ff4be8688b38ee321ee425c"}, + {file = "llama_index-0.10.65.tar.gz", hash = "sha256:1607c6d5f7ebe6cd016891796eff553c9fe85fde9cf8d211f6fd0f4cdbc7a88e"}, +] + +[package.dependencies] +llama-index-agent-openai = ">=0.1.4,<0.3.0" +llama-index-cli = ">=0.1.2,<0.2.0" +llama-index-core = ">=0.10.65,<0.11.0" +llama-index-embeddings-openai = ">=0.1.5,<0.2.0" +llama-index-indices-managed-llama-cloud = ">=0.2.0" +llama-index-legacy = ">=0.9.48,<0.10.0" +llama-index-llms-openai = ">=0.1.27,<0.2.0" +llama-index-multi-modal-llms-openai = ">=0.1.3,<0.2.0" +llama-index-program-openai = ">=0.1.3,<0.2.0" +llama-index-question-gen-openai = ">=0.1.2,<0.2.0" +llama-index-readers-file = ">=0.1.4,<0.2.0" +llama-index-readers-llama-parse = ">=0.1.2" + +[[package]] +name = "llama-index-agent-openai" +version = "0.2.9" +description = "llama-index agent openai integration" +optional = false +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "llama_index_agent_openai-0.2.9-py3-none-any.whl", hash = "sha256:d7f0fd4c87124781acd783be603871f8808b1a3969e876a9c96e2ed0844d46ac"}, + {file = "llama_index_agent_openai-0.2.9.tar.gz", hash = "sha256:debe86da6d9d983db32b445ddca7c798ac140fe59573bafded73595b3995f3d5"}, +] + +[package.dependencies] +llama-index-core = ">=0.10.41,<0.11.0" +llama-index-llms-openai = ">=0.1.5,<0.2.0" +openai = ">=1.14.0" + +[[package]] +name = "llama-index-cli" +version = "0.1.13" +description = "llama-index cli" +optional = false +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "llama_index_cli-0.1.13-py3-none-any.whl", hash = "sha256:5e05bc3ce55ee1bf6e5af7e87631a71d6b6cf8fc2af10cd3947b09b1bac6788d"}, + {file = "llama_index_cli-0.1.13.tar.gz", hash = "sha256:86147ded4439fbab1d6c7c0d72e8f231d2935da9fdf5c9d3f0dde4f35d44aa59"}, +] + +[package.dependencies] +llama-index-core = ">=0.10.11.post1,<0.11.0" +llama-index-embeddings-openai = ">=0.1.1,<0.2.0" +llama-index-llms-openai = ">=0.1.1,<0.2.0" + [[package]] name = 
"llama-index-core" -version = "0.10.47" +version = "0.10.66" description = "Interface between LLMs and your data" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "llama_index_core-0.10.47-py3-none-any.whl", hash = "sha256:9d9f7d0f4861832386bb21326e67277c13aa7472af1dfdef53e8910309c9f569"}, - {file = "llama_index_core-0.10.47.tar.gz", hash = "sha256:a76b6029552e281c2cbc67bcbb2639e4ee9e864d30df78bde24945ddb8ab6422"}, + {file = "llama_index_core-0.10.66-py3-none-any.whl", hash = "sha256:0d4ffaea4a5f0bdc2243d7e71d5f6926a508737088aa5c0af658ea2deac98b4d"}, + {file = "llama_index_core-0.10.66.tar.gz", hash = "sha256:70f5cc9da6ee1c550dfde0bd8ab12e77128cc308714958e2cafb7affbc3f5c87"}, ] [package.dependencies] @@ -1290,10 +1250,9 @@ deprecated = ">=1.2.9.3" dirtyjson = ">=1.0.8,<2.0.0" fsspec = ">=2023.5.0" httpx = "*" -llamaindex-py-client = ">=0.1.18,<0.2.0" nest-asyncio = ">=1.5.8,<2.0.0" networkx = ">=3.0" -nltk = ">=3.8.1,<4.0.0" +nltk = ">=3.8.1" numpy = "<2.0.0" openai = ">=1.1.0" pandas = "*" @@ -1322,19 +1281,120 @@ files = [ [package.dependencies] llama-index-core = ">=0.10.1,<0.11.0" +[[package]] +name = "llama-index-indices-managed-llama-cloud" +version = "0.2.7" +description = "llama-index indices llama-cloud integration" +optional = false +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "llama_index_indices_managed_llama_cloud-0.2.7-py3-none-any.whl", hash = "sha256:94335504eab2a6baf7361bbd8bda3ae20a68c7d0111587c9a0793440e9edff21"}, + {file = "llama_index_indices_managed_llama_cloud-0.2.7.tar.gz", hash = "sha256:d7e9b4cc50214b3cfcd75ea63cacce4ee36092cb672c003f15fd23ba31c49ec0"}, +] + +[package.dependencies] +llama-cloud = ">=0.0.11" +llama-index-core = ">=0.10.48.post1,<0.11.0" + +[[package]] +name = "llama-index-legacy" +version = "0.9.48.post2" +description = "Interface between LLMs and your data" +optional = false +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "llama_index_legacy-0.9.48.post2-py3-none-any.whl", hash = "sha256:2581af680a4e577d4f0accd76e8286c5f1054f28a2fb0e8e5758f09ce5da0176"}, + {file = "llama_index_legacy-0.9.48.post2.tar.gz", hash = "sha256:a4c1f10b4d19d005674195c449f4e859022c65c816dcba1a619ef5df922aa212"}, +] + +[package.dependencies] +aiohttp = ">=3.8.6,<4.0.0" +dataclasses-json = "*" +deprecated = ">=1.2.9.3" +dirtyjson = ">=1.0.8,<2.0.0" +fsspec = ">=2023.5.0" +httpx = "*" +nest-asyncio = ">=1.5.8,<2.0.0" +networkx = ">=3.0" +nltk = ">=3.8.1" +numpy = "*" +openai = ">=1.1.0" +pandas = "*" +requests = ">=2.31.0" +SQLAlchemy = {version = ">=1.4.49", extras = ["asyncio"]} +tenacity = ">=8.2.0,<9.0.0" +tiktoken = ">=0.3.3" +typing-extensions = ">=4.5.0" +typing-inspect = ">=0.8.0" + +[package.extras] +gradientai = ["gradientai (>=1.4.0)"] +html = ["beautifulsoup4 (>=4.12.2,<5.0.0)"] +langchain = ["langchain (>=0.0.303)"] +local-models = ["optimum[onnxruntime] (>=1.13.2,<2.0.0)", "sentencepiece (>=0.1.99,<0.2.0)", "transformers[torch] (>=4.33.1,<5.0.0)"] +postgres = ["asyncpg (>=0.28.0,<0.29.0)", "pgvector (>=0.1.0,<0.2.0)", "psycopg2-binary (>=2.9.9,<3.0.0)"] +query-tools = ["guidance (>=0.0.64,<0.0.65)", "jsonpath-ng (>=1.6.0,<2.0.0)", "lm-format-enforcer (>=0.4.3,<0.5.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "scikit-learn", "spacy (>=3.7.1,<4.0.0)"] + [[package]] name = "llama-index-llms-openai" -version = "0.1.22" +version = "0.1.27" description = "llama-index llms openai integration" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "llama_index_llms_openai-0.1.22-py3-none-any.whl", hash = 
"sha256:84a8c910671460ad724ed818192f209f7481e71bcc6528553ba7e66db2e14bcd"}, - {file = "llama_index_llms_openai-0.1.22.tar.gz", hash = "sha256:729bf2ea7043517465e1d585089512b77d8b3ce92233a67c138d5d621061ed56"}, + {file = "llama_index_llms_openai-0.1.27-py3-none-any.whl", hash = "sha256:8da0e90d4a558667d2b9cf1b3f577a4cb7723b7680ed6d22027b0baf9cd5999e"}, + {file = "llama_index_llms_openai-0.1.27.tar.gz", hash = "sha256:37c2d1159b56607d3a807d90260ee25b4f002086d6251c7272afbc53f2514603"}, +] + +[package.dependencies] +llama-index-core = ">=0.10.57,<0.11.0" + +[[package]] +name = "llama-index-multi-modal-llms-openai" +version = "0.1.9" +description = "llama-index multi-modal-llms openai integration" +optional = false +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "llama_index_multi_modal_llms_openai-0.1.9-py3-none-any.whl", hash = "sha256:614f40427a4671e72742780be8fda77297dbf2942519bffcb2c9de8696a9edff"}, + {file = "llama_index_multi_modal_llms_openai-0.1.9.tar.gz", hash = "sha256:dbacf44d5c2cca07ca424eacd1337583002d70387a3c1868cf8ae743b1dbec4a"}, +] + +[package.dependencies] +llama-index-core = ">=0.10.1,<0.11.0" +llama-index-llms-openai = ">=0.1.1,<0.2.0" + +[[package]] +name = "llama-index-program-openai" +version = "0.1.7" +description = "llama-index program openai integration" +optional = false +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "llama_index_program_openai-0.1.7-py3-none-any.whl", hash = "sha256:33489b573c1050a3f583ff68fcbc4bcbd49f29e74f3e5baea08ab0d5f363403c"}, + {file = "llama_index_program_openai-0.1.7.tar.gz", hash = "sha256:bf7eb61a073381714be5a049d93b40044dfe51bd4333bee539d1532b7407621f"}, ] [package.dependencies] -llama-index-core = ">=0.10.24,<0.11.0" +llama-index-agent-openai = ">=0.1.1,<0.3.0" +llama-index-core = ">=0.10.57,<0.11.0" +llama-index-llms-openai = ">=0.1.1" + +[[package]] +name = "llama-index-question-gen-openai" +version = "0.1.3" +description = "llama-index question_gen openai integration" +optional = false +python-versions = ">=3.8.1,<4.0" +files = [ + {file = "llama_index_question_gen_openai-0.1.3-py3-none-any.whl", hash = "sha256:1f83b49e8b2e665030d1ec8c54687d6985d9fa8426147b64e46628a9e489b302"}, + {file = "llama_index_question_gen_openai-0.1.3.tar.gz", hash = "sha256:4486198117a45457d2e036ae60b93af58052893cc7d78fa9b6f47dd47b81e2e1"}, +] + +[package.dependencies] +llama-index-core = ">=0.10.1,<0.11.0" +llama-index-llms-openai = ">=0.1.1,<0.2.0" +llama-index-program-openai = ">=0.1.1,<0.2.0" [[package]] name = "llama-index-readers-file" @@ -1356,6 +1416,21 @@ striprtf = ">=0.0.26,<0.0.27" [package.extras] pymupdf = ["pymupdf (>=1.23.21,<2.0.0)"] +[[package]] +name = "llama-index-readers-llama-parse" +version = "0.1.6" +description = "llama-index readers llama-parse integration" +optional = false +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "llama_index_readers_llama_parse-0.1.6-py3-none-any.whl", hash = "sha256:71d445a2357ce4c632e0fada7c913ac62790e77c062f12d916dd86378380ff1f"}, + {file = "llama_index_readers_llama_parse-0.1.6.tar.gz", hash = "sha256:04f2dcfbb0fb87ce70890f5a2f4f89941d79be6a818b43738f053560e4b451cf"}, +] + +[package.dependencies] +llama-index-core = ">=0.10.7,<0.11.0" +llama-parse = ">=0.4.0" + [[package]] name = "llama-index-vector-stores-postgres" version = "0.1.11" @@ -1375,19 +1450,18 @@ psycopg2-binary = ">=2.9.9,<3.0.0" sqlalchemy = {version = ">=1.4.49,<2.1", extras = ["asyncio"]} [[package]] -name = "llamaindex-py-client" -version = "0.1.19" -description = "" +name = "llama-parse" +version = "0.4.9" 
+description = "Parse files into RAG-Optimized formats." optional = false -python-versions = "<4,>=3.8" +python-versions = "<4.0,>=3.8.1" files = [ - {file = "llamaindex_py_client-0.1.19-py3-none-any.whl", hash = "sha256:fd9416fd78b97209bf323bc3c7fab314499778563e7274f10853ad560563d10e"}, - {file = "llamaindex_py_client-0.1.19.tar.gz", hash = "sha256:73f74792bb8c092bae6dc626627a09ac13a099fa8d10f8fcc83e17a2b332cca7"}, + {file = "llama_parse-0.4.9-py3-none-any.whl", hash = "sha256:71974a57a73d642608cc406942bee4e7fc1a713fa410f51df67da509479ba544"}, + {file = "llama_parse-0.4.9.tar.gz", hash = "sha256:657f8fa5f7d399f14c0454fc05cae6034da0373f191df6cfca17a1b4a704ef87"}, ] [package.dependencies] -httpx = ">=0.20.0" -pydantic = ">=1.10" +llama-index-core = ">=0.10.29" [[package]] name = "markupsafe" @@ -1771,10 +1845,9 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -1926,17 +1999,6 @@ files = [ {file = "pip-24.2.tar.gz", hash = "sha256:5b5e490b5e9cb275c879595064adce9ebd31b854e3e803740b72f9ccf34a45b8"}, ] -[[package]] -name = "pkgutil-resolve-name" -version = "1.3.10" -description = "Resolve a name to an object." -optional = false -python-versions = ">=3.6" -files = [ - {file = "pkgutil_resolve_name-1.3.10-py3-none-any.whl", hash = "sha256:ca27cc078d25c5ad71a9de0a7a330146c4e014c2462d9af19c6b828280649c5e"}, - {file = "pkgutil_resolve_name-1.3.10.tar.gz", hash = "sha256:357d6c9e6a755653cfd78893817c0853af365dd51ec97f3d358a819373bbd174"}, -] - [[package]] name = "platformdirs" version = "4.2.2" @@ -2707,6 +2769,17 @@ files = [ {file = "rpds_py-0.19.1.tar.gz", hash = "sha256:31dd5794837f00b46f4096aa8ccaa5972f73a938982e32ed817bb520c465e520"}, ] +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + [[package]] name = "sniffio" version = "1.3.1" @@ -3071,6 +3144,48 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] +[[package]] +name = "tree-sitter" +version = "0.22.3" +description = "Python bindings to the Tree-sitter parsing library" +optional = false +python-versions = ">=3.9" +files = [ + {file = "tree-sitter-0.22.3.tar.gz", hash = "sha256:6516bcef5d36e0365670b97c91a169c8b1aa82ea4b60946b879020820718ce3d"}, + {file = "tree_sitter-0.22.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d9a26dd80cf10763527483b02ba35a0b8d9168f324dbbce3f07860256c29bf15"}, + {file = "tree_sitter-0.22.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4bcbe0a7358628629d9ec8e5687477e12f7c6aae6943b0872afb7170db039b86"}, + {file = "tree_sitter-0.22.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfa45e6bf2542862ce987482fe212ef3153bd331d5bba5873b9f485f8923f65a"}, + {file = "tree_sitter-0.22.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4545b142da82f9668007180e0081583054682d0154cd6349796ac77dc8520d63"}, + {file = 
"tree_sitter-0.22.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4978d22fe2868ab9a91125f49bd576ce5f954cc887c19471e0c33e104f37ba71"}, + {file = "tree_sitter-0.22.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0ec593a69f8c4f1c81494147814d11b7fc6c903e5299e084ae7b89caf95cef84"}, + {file = "tree_sitter-0.22.3-cp310-cp310-win_amd64.whl", hash = "sha256:0f66b88b8e9993630613d594e845f3cf2695fef87d0ca1475437cb17eeb72dc5"}, + {file = "tree_sitter-0.22.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9e627eb129421f63378e936b5d0e13b8befa6e7c5267a8a7621a397a84e8f1f7"}, + {file = "tree_sitter-0.22.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3cfa2a9860bfb0404ae28a9cf056dab8f2eb7f1673d8cc9b3f7e21452daad0e0"}, + {file = "tree_sitter-0.22.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9a66cc5f19635119a9d8325bcb00a58ed48427e3c3d307caf7c00d745ac83a5"}, + {file = "tree_sitter-0.22.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de16468ea22c910e67caa91c99be9d6eb73e97e5164480a890f678b22d32faca"}, + {file = "tree_sitter-0.22.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:98c697427f82abab6b39cfe2ade6547d844dd419fa8cfc89031bcdf7c10579b6"}, + {file = "tree_sitter-0.22.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:548aa34f15a29aef1fc8e85507f13e0678a54f1de16461f844d86179b19bb5f6"}, + {file = "tree_sitter-0.22.3-cp311-cp311-win_amd64.whl", hash = "sha256:2fc0e1097fb86623b340141e80a0f2b7668b09d953501d91adc715a577e32c61"}, + {file = "tree_sitter-0.22.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:7cb5c145fbd4bcc0cd4851dc4d0a6079a8e2f61257f8c0effc92434f6fb19b14"}, + {file = "tree_sitter-0.22.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d4a592080db6b9472a886f4593b4705d02630721fdbe4a700085fe775fcab20e"}, + {file = "tree_sitter-0.22.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f36bf523763f05edf924126583ea997f905162046c0f184d6fd040cc1ccbf2c5"}, + {file = "tree_sitter-0.22.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8e1193f27c25aab299f4fc154664122c7bfe80633b726bb457356d371479a5b"}, + {file = "tree_sitter-0.22.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:156df7e71a6c6b542ff29526cad6886a41115e42dc768c55101398d68325db54"}, + {file = "tree_sitter-0.22.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:82e1d467ce23dd2ecc37d4fb83965e891fc37b943639c517cd5acf54a2df0ff7"}, + {file = "tree_sitter-0.22.3-cp312-cp312-win_amd64.whl", hash = "sha256:e541a0c08a04f229ba9479a8c441dd267fdaa3e5842ae70a744c178bcaf53fa3"}, + {file = "tree_sitter-0.22.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a85a1d0fdff21cc524a959b3277c311941a9b5b91a862e462c1b55470893884a"}, + {file = "tree_sitter-0.22.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f96c6acd2799bafa28543a267937eec6a3d9ccbdeb6e1d05858114d4cd882da9"}, + {file = "tree_sitter-0.22.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed2708aecd3a4c8d20a89350d3c89ac2f964985ee9117c39357cee3098a9498a"}, + {file = "tree_sitter-0.22.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b2f99535aa4195b20fef18559defaabd9e12fe8ed8806c101d51820f240ca64"}, + {file = "tree_sitter-0.22.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:459a0f3bf8d6dbb9e9f651d67cee3a60f0b799fefd4a33f49a7e9501ada98e35"}, + {file = "tree_sitter-0.22.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:4a51bfe99dcd8bbfb0fe95113f0197e6e540db3077abce77a058235beec747a3"}, + {file = "tree_sitter-0.22.3-cp39-cp39-win_amd64.whl", hash = "sha256:8d54ef562492493bf091cb3fd605cb7e60bf1d56634a94ab48075741d823e3a5"}, +] + +[package.extras] +docs = ["sphinx (>=7.3,<8.0)", "sphinx-book-theme"] +tests = ["tree-sitter-html", "tree-sitter-javascript", "tree-sitter-json", "tree-sitter-python", "tree-sitter-rust"] + [[package]] name = "typing-extensions" version = "4.12.2" @@ -3082,6 +3197,32 @@ files = [ {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, ] +[[package]] +name = "typing-inspect" +version = "0.9.0" +description = "Runtime inspection utilities for typing module." +optional = false +python-versions = "*" +files = [ + {file = "typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f"}, + {file = "typing_inspect-0.9.0.tar.gz", hash = "sha256:b23fc42ff6f6ef6954e4852c1fb512cdd18dbea03134f91f856a95ccc9461f78"}, +] + +[package.dependencies] +mypy-extensions = ">=0.3.0" +typing-extensions = ">=3.7.4" + +[[package]] +name = "tzdata" +version = "2024.1" +description = "Provider of IANA time zone data" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"}, + {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"}, +] + [[package]] name = "urllib3" version = "2.2.2" @@ -3318,4 +3459,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = "^3.9.0" -content-hash = "27a73a66a7f9b6dbdf4dfe56a5e5c2409edc833a97def321b304d6b171997d9c" +content-hash = "3047839778c62f1db3f6405778595f938edc258aca93d573dd6dd42011965adc" diff --git a/pyproject.toml b/pyproject.toml index 6a6d3b5b..be13cacc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,8 @@ esprima = "^4.0.1" escodegen = "^1.0.11" redis = "^5.0.7" tqdm = "^4.66.5" +tree-sitter = "^0.22.3" +llama-index = "^0.10.65" [build-system]