run-llama · jasonnathan · Nov 3, 2024 · Nov 3, 2024 · Nov 4, 2024 · Nov 10, 2024
diff --git a/llama-index-integrations/readers/llama-index-readers-chatgpt-conversations/.gitignore b/llama-index-integrations/readers/llama-index-readers-chatgpt-conversations/.gitignore
@@ -0,0 +1,153 @@
+llama_index/_static
+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+bin/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+etc/
+include/
+lib/
+lib64/
+parts/
+sdist/
+share/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+.ruff_cache
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+notebooks/
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pyvenv.cfg
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Jetbrains
+.idea
+modules/
+*.swp
+
+# VsCode
+.vscode
+
+# pipenv
+Pipfile
+Pipfile.lock
+
+# pyright
+pyrightconfig.json
diff --git a/llama-index-integrations/readers/llama-index-readers-chatgpt-conversations/BUILD b/llama-index-integrations/readers/llama-index-readers-chatgpt-conversations/BUILD
@@ -0,0 +1,3 @@
+poetry_requirements(
+    name="poetry",
+)
diff --git a/llama-index-integrations/readers/llama-index-readers-chatgpt-conversations/Makefile b/llama-index-integrations/readers/llama-index-readers-chatgpt-conversations/Makefile
@@ -0,0 +1,17 @@
+GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
+
+help:	## Show all Makefile targets.
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'
+
+format:	## Run code autoformatters (black).
+	pre-commit install
+	git ls-files | xargs pre-commit run black --files
+
+lint:	## Run linters: pre-commit (black, ruff, codespell) and mypy
+	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files
+
+test:	## Run tests via pytest.
+	pytest tests
+
+watch-docs:	## Build and watch documentation.
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/...-index-integrations/readers/llama-index-readers-chatgpt-conversations/README.md b/...-index-integrations/readers/llama-index-readers-chatgpt-conversations/README.md
@@ -0,0 +1,146 @@
+# LlamaIndex Readers Integration: ChatGPT Conversations
+
+## ChatGPT Conversations Reader and Message Node Parser
+
+[![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
+
+A custom reader and node parser for processing exported ChatGPT conversation JSON files using [LlamaIndex](https://github.com/run-llama/llama_index). This package allows you to load and parse your ChatGPT conversation data, enabling advanced querying and analysis using LlamaIndex's indexing and querying capabilities.
+
+## **Table of Contents**
+
+- [Installation](#installation)
+- [Features](#features)
+- [Usage](#usage)
+  - [Loading Conversations](#1-loading-conversations)
+  - [Parsing Messages](#2-parsing-messages)
+  - [Building an Index](#3-building-an-index)
+- [Examples](#examples)
+- [Tests](#tests)
+- [Contributing](#contributing)
+- [License](#license)
+
+## **Installation**
+
+```bash
+pip install llama-index-readers-chatgpt-conversations
+```
+
+## **Features**
+
+- **ChatGPT Conversation Reader**: Load exported ChatGPT conversation JSON files into LlamaIndex `Document` objects.
+- **Message Parser**: Parse conversation documents into structured nodes with speaker identities and metadata.
+- **Markdown Support**: Process Markdown content within messages, including headers, code blocks, and inline code.
+- **Node Relationships**: Maintain relationships between nodes for contextual understanding.
+
+## **Usage**
+
+### **1. Loading Conversations**
+
+Use the `ChatGPTConversationsReader` to load your exported ChatGPT conversation JSON file.
+
+```python
+from llama_index.readers.chatgpt_conversations import (
+    ChatGPTConversationsReader,
+)
+
+# Initialize the reader with the path to your conversations.json file
+reader = ChatGPTConversationsReader(input_file="path/to/conversations.json")
+
+# Load documents
+documents = reader.load_data()
+print(f"Loaded {len(documents)} documents.")
+```
+
+### **2. Parsing Messages**
+
+Use the `ChatGPTMessageNodeParser` to parse the loaded documents into nodes.
+
+```python
+from llama_index.readers.chatgpt_conversations import (
+    ChatGPTMessageNodeParser,
+)
+
+# Initialize the message parser
+parser = ChatGPTMessageNodeParser()
+
+# Parse documents into nodes
+nodes = parser(documents)
+print(f"Parsed {len(nodes)} nodes.")
+```
+
+### **3. Building an Index**
+
+Leverage LlamaIndex to build an index over the parsed nodes for querying.
+
+```python
+from llama_index.core import VectorStoreIndex, StorageContext
+
+# Create a storage context (e.g., using a default in-memory store)
+storage_context = StorageContext.from_defaults()
+
+# Build the index
+index = VectorStoreIndex(nodes, storage_context=storage_context)
+
+# (Optional) Persist the index for later use
+storage_context.persist(persist_dir="./storage")
+```
+
+## **Examples**
+
+Here's a full example combining all steps:
+
+```python
+from llama_index.readers.chatgpt_conversations import (
+    ChatGPTConversationsReader,
+    ChatGPTMessageNodeParser,
+)
+from llama_index.core import VectorStoreIndex, StorageContext
+
+# Step 1: Load conversations
+reader = ChatGPTConversationsReader(input_file="path/to/conversations.json")
+documents = reader.load_data()
+
+# Step 2: Parse messages into nodes
+parser = ChatGPTMessageNodeParser()
+nodes = parser(documents)
+
+# Step 3: Build an index over the nodes
+storage_context = StorageContext.from_defaults()
+index = VectorStoreIndex(nodes, storage_context=storage_context)
+
+# Step 4: Query the index
+query_engine = index.as_query_engine()
+response = query_engine.query("What did I ask about data export?")
+print(response)
+```
+
+## **Tests**
+
+To run the tests, navigate to the project root directory and execute:
+
+```bash
+pytest tests
+```
+
+## **License**
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+---
+
+## **Additional Notes**
+
+- **Dependencies**:
+
+  - `llama-index-core`
+  - `markdown-it-py`
+
+- **Compatibility**: The package is compatible with Python 3.8 and above.
+
+- **Publishing**: Update the `pyproject.toml` with the appropriate package name, version, and author information before publishing to PyPI.
+
+---
+
+## **Happy Coding!**
+
+Feel free to reach out if you have any questions or need further assistance.
diff --git a/llama-index-integrations/readers/llama-index-readers-chatgpt-conversations/examples/BUILD b/llama-index-integrations/readers/llama-index-readers-chatgpt-conversations/examples/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-integrations/readers/llama-index-readers-chatgpt-conversations/examples/main.py b/llama-index-integrations/readers/llama-index-readers-chatgpt-conversations/examples/main.py
@@ -0,0 +1,27 @@
+from llama_index.readers.chatgpt_conversations import (
+    ChatGPTConversationsReader,
+    ChatGPTMessageNodeParser,
+)
+
+# Placeholder location of the conversations.json file to read; edit it before running
+conversations_file = "path/to/your/conversations.json"  # Update this path
+
+# Build a reader for that file, handing the path over through `input_file`
+reader = ChatGPTConversationsReader(input_file=conversations_file)
+
+# Read the file via load_data() and report how many documents came back
+documents = reader.load_data()
+print(f"Number of documents loaded: {len(documents)}")
+
+# Build the message node parser with no arguments, i.e. its default configuration
+parser = ChatGPTMessageNodeParser()
+
+# The parser object is invoked directly on the documents; report the node count
+nodes = parser(documents)
+print(f"Number of nodes parsed: {len(nodes)}")
+
+# Preview at most the first five nodes (1-based numbering): text, then metadata
+for i, node in enumerate(nodes[:5]):
+    print(f"\nNode {i + 1}:")
+    print(f"Text: {node.text}")
+    print(f"Metadata: {node.metadata}")
diff --git a/...llama-index-readers-chatgpt-conversations/llama_index/readers/chatgpt_conversations/BUILD b/...llama-index-readers-chatgpt-conversations/llama_index/readers/chatgpt_conversations/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/...index-readers-chatgpt-conversations/llama_index/readers/chatgpt_conversations/__init__.py b/...index-readers-chatgpt-conversations/llama_index/readers/chatgpt_conversations/__init__.py
@@ -0,0 +1,6 @@
+from llama_index.readers.chatgpt_conversations.base import ChatGPTConversationsReader
+from llama_index.readers.chatgpt_conversations.message_node_parser import (
+    ChatGPTMessageNodeParser,
+)
+
+__all__ = ["ChatGPTConversationsReader", "ChatGPTMessageNodeParser"]