# Notebooks #137

base: main

Changes from all commits:
8777375
75aba49
dacee35
aea8b85
e919f6f
102ef8c
ef69ea5
03bf352
25e9d96
0c1cd95
384e7b3
1ab17eb
93e232c
**`.gitignore`** (`@@ -1,5 +1,6 @@`)

```
.env
*.bin
.envrc
.vscode
models/**
db/
```
**New file: Loaders Module README** (`@@ -0,0 +1,33 @@`)

# Loaders Module

This module contains data loaders for the ChatKNML chatbot.

## Installation

To install the loaders module, run the following command:

```bash
poetry install
```

## Testing

To run the tests, run the following command:

```bash
poetry run pytest
```

## Formatting and linting

Formatting:

```bash
poetry run black .
```

Linting:

```bash
poetry run flake8
```
**New file: web content loader (`CleanWebLoader`)** (`@@ -0,0 +1,140 @@`)

```python
from newspaper import Article
from functools import reduce
from typing import List, Optional
from langchain_core.documents import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain.text_splitter import TextSplitter, SpacyTextSplitter


class CleanWebLoader(BaseLoader):
    """
    A class for loading web content, extracting and cleaning text using the 'newspaper' library,
    and converting it into a specific data structure.

    Attributes:
        url_list (list[str] | str): Either a string or a list of strings representing URLs.
        depth (int): Maximum depth for recursive extraction (default is 1).

    Methods:
        newspaper_extractor(html): Extracts and cleans text content from HTML using the 'newspaper' library.
        ds_converter(docs): Converts a list of documents into a specific data structure.
        junk_remover(docs): Filters out suspected junk documents based on specific criteria.
        load(): Loads web content from specified URLs, extracts text, and converts it into a specific data structure.
        load_and_split(text_splitter, chunk, chunk_overlap): Loads web content, splits it into chunks, and converts it into a specific data structure.
    """

    article = Article("")
```
> **Review comment** (on `article = Article("")`): nitpick: are you aware this instance will be shared and may cause weird bugs with overwrites between objects? (this object is created once and will be shared between instances)
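A minimal sketch of one way to address this, assuming per-instance state is acceptable; the key change is moving the `Article` into `__init__` and making the extractor a bound method (imports as in the module above):

```python
class CleanWebLoader(BaseLoader):
    def __init__(self, url_list: list[str] | str, depth: int = 1):
        super().__init__()
        self.url_list = url_list
        self.depth = depth
        # Each loader owns its own Article, so interleaved parses can no
        # longer overwrite each other across instances.
        self.article = Article("")

    def newspaper_extractor(self, html: str) -> str:
        # Same cleaning logic, now against per-instance state.
        self.article.set_html(html)
        self.article.parse()
        return " ".join(self.article.text.split())
```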
The new file continues:

```python
    def __init__(self, url_list: list[str] | str, depth: int = 1):
        """
        Initializes the CleanWebLoader instance.

        :param url_list: Either a string or a list of strings representing URLs.
        :param depth: Maximum depth for recursive extraction (default is 1).
        """
        super().__init__()
        self.url_list = url_list
        self.depth = depth

    @staticmethod
    def newspaper_extractor(html: str):
        """
        Extracts and cleans text content from HTML using the 'newspaper' library.

        :param html: HTML content to be processed.
        :return: Cleaned and concatenated text extracted from the HTML.
        """
        CleanWebLoader.article.set_html(html)
        CleanWebLoader.article.parse()
        return " ".join(CleanWebLoader.article.text.split())

    @staticmethod
    def ds_converter(docs: list[Document]):
        """
        Converts a list of documents into a specific data structure.

        :param docs: List of documents to be converted.
        :return: List of dictionaries, each representing a document with a 'text' key.
        """
        return [{"text": doc.page_content} for doc in docs]

    @staticmethod
    def junk_remover(docs: list[Document]):
        """
        Filters out suspected junk documents based on specific criteria.

        :param docs: A list of documents to be filtered.
        :return: The input documents with suspected junk removed: documents shorter
                 than 300 characters, documents with no 'title' in their metadata,
                 and duplicates of documents already seen.
        """
        junk_docs = [doc for doc in docs if len(doc.page_content) < 300]
        seen_texts = set()
        clear_docs = []
        for doc in docs:
            if "title" not in doc.metadata.keys():
                junk_docs.append(doc)
            elif doc.page_content not in seen_texts and doc not in junk_docs:
                clear_docs.append(doc)
                seen_texts.add(doc.page_content)
        return clear_docs
```
> **Review comment** (on lines +41 to +84): issue: those should not be static, add …
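A sketch of the requested change, assuming plain instance methods are what the reviewer has in mind (same logic, just no `@staticmethod` and self-based access):

```python
    def ds_converter(self, docs: list[Document]) -> list[dict]:
        # Called as self.ds_converter(docs) instead of via the class.
        return [{"text": doc.page_content} for doc in docs]

    def junk_remover(self, docs: list[Document]) -> list[Document]:
        # Identical filtering, now an instance method.
        junk_docs = [doc for doc in docs if len(doc.page_content) < 300]
        seen_texts: set[str] = set()
        clear_docs = []
        for doc in docs:
            if "title" not in doc.metadata:
                junk_docs.append(doc)
            elif doc.page_content not in seen_texts and doc not in junk_docs:
                clear_docs.append(doc)
                seen_texts.add(doc.page_content)
        return clear_docs
```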
The new file continues:

```python
    def load(self) -> List[dict]:
        """
        Loads web content from specified URLs, extracts text using the 'newspaper' library,
        and converts it into a specific data structure using the ds_converter and junk_remover methods.

        :return: List of dictionaries, each representing a document with a 'text' key.
        """
        docs = []
        if isinstance(self.url_list, str):
            self.url_list = [self.url_list]
        for address in self.url_list:
            try:
                loader = RecursiveUrlLoader(
                    url=address,
                    max_depth=self.depth,
                    extractor=CleanWebLoader.newspaper_extractor,
                )
                docs.extend(loader.load())
            except Exception as e:
                print(f"Exception: {e}")
                break
        docs = reduce(
            lambda data, method: method(data),
            [CleanWebLoader.junk_remover, CleanWebLoader.ds_converter],
            docs,
        )
        return docs
```
```python
    def load_and_split(
        self,
        text_splitter: Optional[TextSplitter] = None,
        chunk: int = 400,
        chunk_overlap: int = 80,
    ) -> List[dict]:
        """
        Loads web content from specified URLs, extracts text using the 'newspaper' library,
        splits it into chunks using the provided or default TextSplitter,
        and converts it into a specific data structure using the ds_converter and junk_remover methods.

        :param text_splitter: Optional TextSplitter instance to use for splitting documents.
        :param chunk: Chunk size for text splitting (default is 400).
        :param chunk_overlap: Overlap size between chunks (default is 80).
        :return: List of dictionaries, each representing a document with a 'text' key.
        """
        _text_splitter: text_splitter or TextSplitter = SpacyTextSplitter(
            pipeline="pl_core_news_sm",
            chunk_size=chunk,
            chunk_overlap=chunk_overlap,
        )
```

> **Review comment** (on the `_text_splitter` line): issue: that's some weird-ass typing, wtf
>
> **Author reply**: My bad, I missed it
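The annotation slot is being used where a fallback expression belongs. A hedged sketch of the presumably intended assignment:

```python
        # Use the caller-provided splitter if given; otherwise fall back to a
        # Polish spaCy pipeline with the requested chunk sizes.
        _text_splitter = text_splitter or SpacyTextSplitter(
            pipeline="pl_core_news_sm",
            chunk_size=chunk,
            chunk_overlap=chunk_overlap,
        )
```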
The method then finishes:

```python
        docs = reduce(
            lambda data, method: method(data),
            [CleanWebLoader.junk_remover, CleanWebLoader.ds_converter],
            self.load(),
        )
        return _text_splitter.split_documents(docs)
```
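Assuming the class is importable from this module (the import path below is hypothetical), basic usage might look like:

```python
from loaders.web import CleanWebLoader  # hypothetical module path

loader = CleanWebLoader("https://example.com", depth=1)
docs = loader.load()  # e.g. [{"text": "..."}, ...]
```

Note that `load()` already applies `junk_remover` and `ds_converter`, so `load_and_split()` runs both a second time over dictionaries rather than `Document` objects; callers may want to verify that `split_documents` receives the type it expects.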
**New file: local file loader (`LocalDataLoader`)** (`@@ -0,0 +1,117 @@`)

```python
import os
import logging
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders import PyPDFLoader, JSONLoader
from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.csv_loader import CSVLoader
from typing import Union, List


class LocalDataLoader(BaseLoader):

    loaders = {
        ".pdf": PyPDFLoader,
        ".json": JSONLoader,
        ".txt": TextLoader,
        ".csv": CSVLoader,
    }
```
> **Review comment** (on lines +12 to +17): nitpick: move this to init
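A sketch of the suggested move, assuming nothing else reads the class-level attribute (the string-normalization line is an extra safeguard, not part of the original):

```python
    def __init__(self, path: list[str] | str):
        super().__init__()
        # Normalize a single path string into a list so iteration below
        # walks paths, not characters.
        self.path_list = [path] if isinstance(path, str) else path
        # Per-instance registry: callers can adjust it without affecting
        # other instances.
        self.loaders = {
            ".pdf": PyPDFLoader,
            ".json": JSONLoader,
            ".txt": TextLoader,
            ".csv": CSVLoader,
        }
```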
The new file continues:

```python
    def __init__(self, path: Union[List[str], str]):
        """
        Initialize the LocalDataLoader instance.

        :param path: A list of paths or a single path pointing to the location of data files.
        """
        super().__init__()
        self.path_list = path
```

> **Review comment** (on the `__init__` signature): suggestion: …
```python
    @staticmethod
    def ds_converter(docs):
        """
        Converts a list of documents into a specific data structure.

        :param docs: List of documents to be converted.
        :return: List of dictionaries, each representing a document with a 'text' key.
        """
        return [{"text": doc.page_content} for doc in docs]

    @staticmethod
    def junk_remover(docs):
        """
        Filters out suspected junk documents based on specific criteria.

        :param docs: A list of documents to be filtered.
        :return: The input documents with suspected junk removed: documents shorter
                 than 300 characters and duplicates of documents already seen.
        """
        junk_docs = [doc for doc in docs if len(doc.page_content) < 300]
        seen_texts = set()  # a set, so seen_texts.add() below works
        clear_docs = []
        for doc in docs:
            if doc.page_content not in seen_texts and doc not in junk_docs:
                clear_docs.append(doc)
                seen_texts.add(doc.page_content)
        return clear_docs
```
> **Review comment** (on lines +28 to +55): suggestion: yep, those should be shared; if you want them to be inside of this class, make them a mixin
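A sketch of the mixin route, under the assumption that both loaders keep identical cleaning behavior (`DocumentCleaningMixin` is a hypothetical name):

```python
class DocumentCleaningMixin:
    """Cleaning helpers shared by CleanWebLoader and LocalDataLoader."""

    def ds_converter(self, docs):
        return [{"text": doc.page_content} for doc in docs]

    def junk_remover(self, docs):
        junk_docs = [doc for doc in docs if len(doc.page_content) < 300]
        seen_texts = set()
        clear_docs = []
        for doc in docs:
            if doc.page_content not in seen_texts and doc not in junk_docs:
                clear_docs.append(doc)
                seen_texts.add(doc.page_content)
        return clear_docs


class LocalDataLoader(DocumentCleaningMixin, BaseLoader):
    ...
```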
The new file continues:

```python
    def load(self) -> List[dict]:
        """
        Load data from the specified paths using registered data loaders.

        This method iterates through each path in 'path_list', explores the directories, and processes each file
        using the appropriate data loader based on the file extension. The loaded data is then appended to a list,
        which is further processed to remove junk and convert the data structure. The final list of dictionaries is returned.

        :return: A list of dictionaries representing the loaded and processed data.
        """
        docs = []
        for path in self.path_list:
            docs.extend(self._process_directory(path))

        docs = self._process_loaded_data(docs)
        return docs

    def _process_directory(self, path: str) -> List[dict]:
        """
        Process all files in the given directory path using the appropriate loaders.

        :param path: The path to the directory containing the files.
        :return: A list of dictionaries with loaded data.
        """
        loaded_docs = []
        for root, _, files in os.walk(path):
            for file in files:
                file_path = os.path.join(root, file)
                doc = self._load_file(file_path)
                if doc:
                    loaded_docs.append(doc)
        return loaded_docs

    def _load_file(self, file_path: str) -> dict:
        """
        Load a single file using the appropriate loader based on its extension.

        :param file_path: The full path to the file.
        :return: A dictionary with loaded data, or None if no loader matched or an error occurred.
        """
        file_extension = os.path.splitext(file_path)[1].lower()
        loader = LocalDataLoader.loaders.get(file_extension)

        if loader:
            try:
                return loader(file_path).load()[0]
            except Exception as e:
                logging.error(f"Error loading file {file_path}: {e}")
        else:
            logging.warning(f"No loader found for file type: {file_extension}")
        return None

    def _process_loaded_data(self, docs: List[dict]) -> List[dict]:
        """
        Process loaded data by removing junk and converting the data structure.

        :param docs: A list of dictionaries with raw loaded data.
        :return: A list of dictionaries with cleaned and processed data.
        """
        docs = LocalDataLoader.junk_remover(docs)
        return LocalDataLoader.ds_converter(docs)
```
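Assuming a local data directory exists (the path below is illustrative), usage might look like:

```python
loader = LocalDataLoader(["data/"])
docs = loader.load()  # e.g. [{"text": "..."}, ...]
```

Note that `__init__` stores a bare string unchanged, so passing a single path as a string would make `load()` iterate over its characters; wrapping it in a list, as above, sidesteps that.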
> **Review comment**: nitpick: both are unnecessary; `list[type]` and `type | None` work just fine
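This appears to refer to the `typing` imports (`Union`, `List`, `Optional`); on recent Python the builtin generics and union syntax cover both cases. A sketch, assuming the project targets Python 3.10+ (both helper names are hypothetical):

```python
from langchain.text_splitter import TextSplitter


# Union[List[str], str] becomes list[str] | str:
def normalize_paths(path: list[str] | str) -> list[str]:
    return [path] if isinstance(path, str) else path


# Optional[TextSplitter] becomes TextSplitter | None:
def pick_splitter(splitter: TextSplitter | None = None):
    ...
```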