Notebooks #137

Open
wants to merge 13 commits into base: main
1 change: 1 addition & 0 deletions .gitignore
@@ -1,5 +1,6 @@
.env
*.bin
.envrc
.vscode
models/**
db/
33 changes: 33 additions & 0 deletions loaders/README.md
@@ -0,0 +1,33 @@
# Loaders Module

This module contains data loaders for the ChatKNML chatbot.

## Installation

To install the loaders module, run the following command:

```bash
poetry install
```

## Testing

To run the tests, use the following command:

```bash
poetry run pytest
```

## Formatting and linting

Formatting:

```bash
poetry run black .
```

Linting:

```bash
poetry run flake8
```
Empty file added loaders/__init__.py
140 changes: 140 additions & 0 deletions loaders/clean_web_loader.py
@@ -0,0 +1,140 @@
from newspaper import Article
from functools import reduce
from typing import List, Optional
Member:
nitpick: both are unnecessary - list[type] and type | None work just fine
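For illustration, a minimal sketch of the built-in syntax the comment refers to (PEP 585 generics and PEP 604 unions, Python 3.9+/3.10+); the function name here is hypothetical:

```python
# No typing.List / typing.Optional needed on modern Python:
# built-in generics (list[str]) and the | union cover both cases.
def load_urls(url_list: list[str] | str, depth: int | None = None) -> list[dict]:
    ...
```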

from langchain_core.documents import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain.text_splitter import TextSplitter, SpacyTextSplitter


class CleanWebLoader(BaseLoader):
"""
A class for loading web content, extracting and cleaning text using the 'newspaper' library,
and converting it into a specific data structure.

Attributes:
url_list (list[str] | str): Either a string or a list of strings representing URLs.
depth (int): Maximum depth for recursive extraction (default is 1).

Methods:
newspaper_extractor(html): Extracts and cleans text content from HTML using the 'newspaper' library.
ds_converter(docs): Converts a list of documents into a specific data structure.
junk_remover(docs): Filters out suspected junk documents and returns the remaining clean ones.
load(): Loads web content from specified URLs, extracts text, and converts it into a specific data structure.
load_and_split(text_splitter, chunk, chunk_overlap): Loads web content, splits it into chunks, and converts it into a specific data structure.

"""

article = Article("")
Member:
nitpick: are you aware this instance will be shared and may cause weird bugs with overwrites between objects? (this object is created once and will be shared between CleanWebLoader instances)
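A minimal sketch of the per-instance alternative the comment points at, assuming the same newspaper.Article API used below:

```python
from newspaper import Article
from langchain.document_loaders.base import BaseLoader


class CleanWebLoader(BaseLoader):
    def __init__(self, url_list: list[str] | str, depth: int = 1):
        super().__init__()
        self.url_list = url_list
        self.depth = depth
        # Each instance owns its own Article, so parses cannot leak
        # between CleanWebLoader objects.
        self.article = Article("")
```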


def __init__(self, url_list: list[str] | str, depth: int = 1):
"""
Initializes the CleanWebLoader instance.

:param url_list: Either a string or a list of strings representing URLs.
:param depth: Maximum depth for recursive extraction (default is 1).
"""
super().__init__()
self.url_list = url_list
self.depth = depth

@staticmethod
def newspaper_extractor(html: str):
"""
Extracts and cleans text content from HTML using the 'newspaper' library.

:param html: HTML content to be processed.
:return: Cleaned and concatenated text extracted from the HTML.
"""
CleanWebLoader.article.set_html(html)
CleanWebLoader.article.parse()
return " ".join(CleanWebLoader.article.text.split())

@staticmethod
def ds_converter(docs: list[Document]):
"""
Converts a list of documents into a specific data structure.

:param docs: List of documents to be converted.
:return: List of dictionaries, each representing a document with 'text' key.
"""
return [{"text": doc.page_content} for doc in docs]

@staticmethod
def junk_remover(docs: list[Document]):
"""
Filters out suspected junk documents and returns the remaining clean ones.

:param docs: A list of Document objects to be filtered.
:return: The documents that remain after dropping suspected junk, i.e. documents shorter than
300 characters, documents without a "title" metadata entry, and exact-text duplicates.
"""
junk_docs = [doc for doc in docs if len(doc.page_content) < 300]
seen_texts = set()
clear_docs = []
for doc in docs:
if "title" not in doc.metadata.keys():
junk_docs.append(doc)
elif doc.page_content not in seen_texts and doc not in junk_docs:
clear_docs.append(doc)
seen_texts.add(doc.page_content)
else:
pass
return clear_docs
Comment on lines +41 to +84
Member:
issue: those should not be static, add self argument and use self.article.* instead of CleanWebLoader.article.*
ds_converter and junk_remover should be separate from this class as they've got nothing to do with it.
also - TYPING, please check with pyright set to strict
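Building on the per-instance article sketched above, the extractor could then become a regular method (the call site in load() would pass extractor=self.newspaper_extractor):

```python
# Inside CleanWebLoader:
def newspaper_extractor(self, html: str) -> str:
    """Extract and clean text from HTML with this instance's Article."""
    self.article.set_html(html)
    self.article.parse()
    return " ".join(self.article.text.split())
```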


def load(self) -> List[dict]:
"""
Loads web content from specified URLs, extracts text using the 'newspaper' library,
and converts it into a specific data structure using the ds_converter and junk_remover methods.

:return: List of dictionaries, each representing a document with 'text' key.
"""
docs = []
if isinstance(self.url_list, str):
self.url_list = [self.url_list]
for address in self.url_list:
try:
loader = RecursiveUrlLoader(
url=address,
max_depth=self.depth,
extractor=CleanWebLoader.newspaper_extractor,
)
docs.extend(loader.load())
except Exception as e:
print(f"Exception: {e}")
break
docs = reduce(
lambda data, method: method(data),
[CleanWebLoader.junk_remover, CleanWebLoader.ds_converter],
docs,
)
return docs

def load_and_split(
self,
text_splitter: Optional[TextSplitter] = None,
chunk: int = 400,
chunk_overlap: int = 80,
) -> List[dict]:
"""
Loads web content from specified URLs, extracts text using the 'newspaper' library,
splits it into chunks using the provided or default TextSplitter,
and converts it into a specific data structure using the ds_converter and junk_remover methods.

:param text_splitter: Optional TextSplitter instance to use for splitting documents.
:param chunk: Chunk size for text splitting (default is 400).
:param chunk_overlap: Overlap size between chunks (default is 80).
:return: List of dictionaries, each representing a document with 'text' key.
"""
_text_splitter: text_splitter or TextSplitter = SpacyTextSplitter(
Member:
issue: that's some weird-ass typing, wtf

Member Author:
My bad, I missed it

pipeline="pl_core_news_sm",
chunk_size=chunk,
chunk_overlap=chunk_overlap,
)
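A sketch of how that assignment could be written with a conventional annotation, keeping the type on the left and the fallback in the value (same SpacyTextSplitter arguments as above):

```python
_text_splitter: TextSplitter = text_splitter or SpacyTextSplitter(
    pipeline="pl_core_news_sm",
    chunk_size=chunk,
    chunk_overlap=chunk_overlap,
)
```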
docs = reduce(
lambda data, method: method(data),
[CleanWebLoader.junk_remover, CleanWebLoader.ds_converter],
self.load(),
)
return _text_splitter.split_documents(docs)
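For context, a hypothetical usage sketch of the loader defined above, assuming the package layout in this PR (loaders/clean_web_loader.py):

```python
from loaders.clean_web_loader import CleanWebLoader

# Crawl a site one level deep and collect cleaned {"text": ...} records.
loader = CleanWebLoader("https://example.com", depth=1)
docs = loader.load()
print(f"Loaded {len(docs)} documents")
```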
117 changes: 117 additions & 0 deletions loaders/local_data_loader.py
@@ -0,0 +1,117 @@
import os
import logging
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders import PyPDFLoader, JSONLoader
from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.csv_loader import CSVLoader
from typing import Union, List


class LocalDataLoader(BaseLoader):

loaders = {
".pdf": PyPDFLoader,
".json": JSONLoader,
".txt": TextLoader,
".csv": CSVLoader,
}
Comment on lines +12 to +17
Member:
nitpick: move this to init


def __init__(self, path: Union[List[str], str]):
Member:
suggestion:

Suggested change
def __init__(self, path: Union[List[str], str]):
def __init__(self, path: list[str] | str):

"""
Initialize the LocalDataLoader instance.

:param path: A list of paths or a single path pointing to the location of data files.
"""
super().__init__()
self.path_list = path
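Taken together, the two comments above might turn the constructor into something like the following sketch (the single-path normalisation is an extra assumption, mirroring CleanWebLoader.load); _load_file would then look the registry up via self.loaders:

```python
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders import PyPDFLoader, JSONLoader
from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.csv_loader import CSVLoader


class LocalDataLoader(BaseLoader):
    def __init__(self, path: list[str] | str):
        super().__init__()
        # Normalise a single path into a list (assumption, mirrors CleanWebLoader).
        self.path_list = [path] if isinstance(path, str) else path
        # Per-instance registry mapping file extension to loader class.
        self.loaders = {
            ".pdf": PyPDFLoader,
            ".json": JSONLoader,
            ".txt": TextLoader,
            ".csv": CSVLoader,
        }
```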

@staticmethod
def ds_converter(docs):
"""
Converts a list of documents into a specific data structure.

:param docs: List of documents to be converted.
:return: List of dictionaries, each representing a document with a 'text' key.
"""
return [{"text": doc.page_content} for doc in docs]

@staticmethod
def junk_remover(docs):
"""
Filters out suspected junk documents and returns the remaining clean ones.

:param docs: A list of Document objects to be filtered.
:return: The documents that remain after dropping suspected junk, i.e. documents shorter than
300 characters and exact-text duplicates.
"""
junk_docs = [doc for doc in docs if len(doc.page_content) < 300]
seen_texts = set()
clear_docs = []
for doc in docs:
if doc.page_content not in seen_texts and doc not in junk_docs:
clear_docs.append(doc)
seen_texts.add(doc.page_content)
return clear_docs
Comment on lines +28 to +55
Member:
suggestion: yep, those should be shared, if you want them to be inside of this class - make them a mixin
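If the shared route is taken, a minimal mixin along these lines (the name is hypothetical) could be inherited by both CleanWebLoader and LocalDataLoader instead of duplicating the two helpers:

```python
from langchain_core.documents import Document


class DocumentCleaningMixin:
    """Shared post-processing: drop junk documents, then convert to dicts."""

    @staticmethod
    def junk_remover(docs: list[Document]) -> list[Document]:
        # Treat very short documents as junk and drop exact-text duplicates.
        junk_docs = [doc for doc in docs if len(doc.page_content) < 300]
        seen_texts: set[str] = set()
        clear_docs: list[Document] = []
        for doc in docs:
            if doc.page_content not in seen_texts and doc not in junk_docs:
                clear_docs.append(doc)
                seen_texts.add(doc.page_content)
        return clear_docs

    @staticmethod
    def ds_converter(docs: list[Document]) -> list[dict]:
        # Keep only the text content, as downstream code expects.
        return [{"text": doc.page_content} for doc in docs]
```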


def load(self) -> List[dict]:
"""
Load data from the specified paths using registered data loaders.

This method iterates through each path in the 'path_list', explores the directories, and processes each file
using the appropriate data loader based on the file extension. The loaded data is then appended to a list,
which is further processed to remove junk and convert the data structure. The final list of dictionaries is returned.

:return: A list of dictionaries representing the loaded and processed data.
"""
docs = []
for path in self.path_list:
docs.extend(self._process_directory(path))

docs = self._process_loaded_data(docs)
return docs

def _process_directory(self, path: str) -> List[dict]:
"""
Process all files in the given directory path using the appropriate loaders.

:param path: The path to the directory containing the files.
:return: A list of dictionaries with loaded data.
"""
loaded_docs = []
for root, _, files in os.walk(path):
for file in files:
file_path = os.path.join(root, file)
doc = self._load_file(file_path)
if doc:
loaded_docs.append(doc)
return loaded_docs

def _load_file(self, file_path: str) -> dict:
"""
Load a single file using the appropriate loader based on its extension.

:param file_path: The full path to the file.
:return: A dictionary with loaded data or None if an error occurred.
"""
file_extension = os.path.splitext(file_path)[1].lower()
loader = LocalDataLoader.loaders.get(file_extension)

if loader:
try:
return loader(file_path).load()[0]
except Exception as e:
logging.error(f"Error loading file {file_path}: {e}")
else:
logging.warning(f"No loader found for file type: {file_extension}")
return None

def _process_loaded_data(self, docs: List[dict]) -> List[dict]:
"""
Process loaded data by removing junk and converting the data structure.

:param docs: A list of dictionaries with raw loaded data.
:return: A list of dictionaries with cleaned and processed data.
"""
docs = LocalDataLoader.junk_remover(docs)
return LocalDataLoader.ds_converter(docs)
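And a hypothetical usage sketch of this loader, assuming the same package layout (loaders/local_data_loader.py) and a local ./data directory:

```python
from loaders.local_data_loader import LocalDataLoader

# Load every supported file (.pdf, .json, .txt, .csv) under ./data
# into cleaned {"text": ...} records.
loader = LocalDataLoader(["./data"])
docs = loader.load()
print(f"Loaded {len(docs)} documents")
```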