Notebooks #137

Open
wants to merge 13 commits into base: main
1 change: 1 addition & 0 deletions .gitignore
@@ -1,5 +1,6 @@
.env
*.bin
.envrc
.vscode
models/**
db/
33 changes: 33 additions & 0 deletions loaders/README.md
@@ -0,0 +1,33 @@
# Loaders Module

This module contains data loaders for the ChatKNML chatbot.

## Installation

To install the loaders module, run the following command:

```bash
poetry install
```

## Testing

To run the tests, use the following command:

```bash
poetry run pytest
```

## Formatting and linting

Formatting:

```bash
poetry run black .
```

Linting:

```bash
poetry run flake8
```
Empty file added loaders/__init__.py
140 changes: 140 additions & 0 deletions loaders/clean_web_loader.py
@@ -0,0 +1,140 @@
from newspaper import Article
from functools import reduce
from typing import List, Optional
Member:
nitpick: both are unnecessary - list[type] and type | None work just fine
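For illustration, a minimal sketch of the built-in syntax the comment refers to (PEP 585 generics and PEP 604 unions, Python 3.9+/3.10+); the function name here is hypothetical:

```python
# No typing.List / typing.Optional needed on modern Python:
# built-in generics (list[str]) and the | union cover both cases.
def load_urls(url_list: list[str] | str, depth: int | None = None) -> list[dict]:
    ...
```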

from langchain_core.documents import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain.text_splitter import TextSplitter, SpacyTextSplitter


class CleanWebLoader(BaseLoader):
"""
A class for loading web content, extracting and cleaning text using the 'newspaper' library,
and converting it into a specific data structure.

Attributes:
url_list (list[str] | str): Either a string or a list of strings representing URLs.
depth (int): Maximum depth for recursive extraction (default is 1).

Methods:
newspaper_extractor(html): Extracts and cleans text content from HTML using the 'newspaper' library.
ds_converter(docs): Converts a list of documents into a specific data structure.
junk_remover(docs): Filters out suspected junk documents and returns the remaining clean ones.
load(): Loads web content from specified URLs, extracts text, and converts it into a specific data structure.
load_and_split(text_splitter, chunk, chunk_overlap): Loads web content, splits it into chunks, and converts it into a specific data structure.

"""

article = Article("")
Member:
nitpick: are you aware this instance will be shared and may cause weird bugs with overwrites between objects? (this object is created once and will be shared between CleanWebLoader instances)
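A minimal sketch of the per-instance alternative the comment points at, assuming the same newspaper.Article API used below:

```python
from newspaper import Article
from langchain.document_loaders.base import BaseLoader


class CleanWebLoader(BaseLoader):
    def __init__(self, url_list: list[str] | str, depth: int = 1):
        super().__init__()
        self.url_list = url_list
        self.depth = depth
        # Each instance owns its own Article, so parses cannot leak
        # between CleanWebLoader objects.
        self.article = Article("")
```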


def __init__(self, url_list: list[str] | str, depth: int = 1):
"""
Initializes the CleanWebLoader instance.

:param url_list: Either a string or a list of strings representing URLs.
:param depth: Maximum depth for recursive extraction (default is 1).
"""
super().__init__()
self.url_list = url_list
self.depth = depth

@staticmethod
def newspaper_extractor(html: str):
"""
Extracts and cleans text content from HTML using the 'newspaper' library.

:param html: HTML content to be processed.
:return: Cleaned and concatenated text extracted from the HTML.
"""
CleanWebLoader.article.set_html(html)
CleanWebLoader.article.parse()
return " ".join(CleanWebLoader.article.text.split())

@staticmethod
def ds_converter(docs: list[Document]):
"""
Converts a list of documents into a specific data structure.

:param docs: List of documents to be converted.
:return: List of dictionaries, each representing a document with 'text' key.
"""
return [{"text": doc.page_content} for doc in docs]

@staticmethod
def junk_remover(docs: list[Document]):
"""
Filters out suspected junk documents and returns the remaining clean ones.

:param docs: A list of Document objects to be filtered.
:return: The documents that remain after dropping suspected junk, i.e. documents shorter than
300 characters, documents without a "title" metadata entry, and exact-text duplicates.
"""
junk_docs = [doc for doc in docs if len(doc.page_content) < 300]
seen_texts = set()
clear_docs = []
for doc in docs:
if "title" not in doc.metadata.keys():
junk_docs.append(doc)
elif doc.page_content not in seen_texts and doc not in junk_docs:
clear_docs.append(doc)
seen_texts.add(doc.page_content)
else:
pass
return clear_docs
Comment on lines +41 to +84
Member:
issue: those should not be static, add self argument and use self.article.* instead of CleanWebLoader.article.*
ds_converter and junk_remover should be separate from this class as they've got nothing to do with it.
also - TYPING, please check with pyright set to strict
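Building on the per-instance article sketched above, the extractor could then become a regular method (the call site in load() would pass extractor=self.newspaper_extractor):

```python
# Inside CleanWebLoader:
def newspaper_extractor(self, html: str) -> str:
    """Extract and clean text from HTML with this instance's Article."""
    self.article.set_html(html)
    self.article.parse()
    return " ".join(self.article.text.split())
```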


def load(self) -> List[dict]:
"""
Loads web content from specified URLs, extracts text using the 'newspaper' library,
and converts it into a specific data structure using the ds_converter and junk_remover methods.

:return: List of dictionaries, each representing a document with 'text' key.
"""
docs = []
if isinstance(self.url_list, str):
self.url_list = [self.url_list]
for address in self.url_list:
try:
loader = RecursiveUrlLoader(
url=address,
max_depth=self.depth,
extractor=CleanWebLoader.newspaper_extractor,
)
docs.extend(loader.load())
except Exception as e:
print(f"Exception: {e}")
break
docs = reduce(
lambda data, method: method(data),
[CleanWebLoader.junk_remover, CleanWebLoader.ds_converter],
docs,
)
return docs

def load_and_split(
self,
text_splitter: Optional[TextSplitter] = None,
chunk: int = 400,
chunk_overlap: int = 80,
) -> List[dict]:
"""
Loads web content from specified URLs, extracts text using the 'newspaper' library,
splits it into chunks using the provided or default TextSplitter,
and converts it into a specific data structure using the ds_converter and junk_remover methods.

:param text_splitter: Optional TextSplitter instance to use for splitting documents.
:param chunk: Chunk size for text splitting (default is 400).
:param chunk_overlap: Overlap size between chunks (default is 80).
:return: List of dictionaries, each representing a document with 'text' key.
"""
_text_splitter: text_splitter or TextSplitter = SpacyTextSplitter(
Member:
issue: that's some weird-ass typing, wtf

Member Author:
My bad, I missed it

pipeline="pl_core_news_sm",
chunk_size=chunk,
chunk_overlap=chunk_overlap,
)
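A sketch of how that assignment could be written with a conventional annotation, keeping the type on the left and the fallback in the value (same SpacyTextSplitter arguments as above):

```python
_text_splitter: TextSplitter = text_splitter or SpacyTextSplitter(
    pipeline="pl_core_news_sm",
    chunk_size=chunk,
    chunk_overlap=chunk_overlap,
)
```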
docs = reduce(
lambda data, method: method(data),
[CleanWebLoader.junk_remover, CleanWebLoader.ds_converter],
self.load(),
)
return _text_splitter.split_documents(docs)
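For context, a hypothetical usage sketch of the loader defined above, assuming the package layout in this PR (loaders/clean_web_loader.py):

```python
from loaders.clean_web_loader import CleanWebLoader

# Crawl a site one level deep and collect cleaned {"text": ...} records.
loader = CleanWebLoader("https://example.com", depth=1)
docs = loader.load()
print(f"Loaded {len(docs)} documents")
```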
117 changes: 117 additions & 0 deletions loaders/local_data_loader.py
@@ -0,0 +1,117 @@
import os
import logging
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders import PyPDFLoader, JSONLoader
from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.csv_loader import CSVLoader
from typing import Union, List


class LocalDataLoader(BaseLoader):

loaders = {
".pdf": PyPDFLoader,
".json": JSONLoader,
".txt": TextLoader,
".csv": CSVLoader,
}
Comment on lines +12 to +17
Member:
nitpick: move this to init


def __init__(self, path: Union[List[str], str]):
Member:
suggestion:

Suggested change
def __init__(self, path: Union[List[str], str]):
def __init__(self, path: list[str] | str):

"""
Initialize the LocalDataLoader instance.

:param path: A list of paths or a single path pointing to the location of data files.
"""
super().__init__()
self.path_list = path
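Taken together, the two comments above might turn the constructor into something like the following sketch (the single-path normalisation is an extra assumption, mirroring CleanWebLoader.load); _load_file would then look the registry up via self.loaders:

```python
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders import PyPDFLoader, JSONLoader
from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.csv_loader import CSVLoader


class LocalDataLoader(BaseLoader):
    def __init__(self, path: list[str] | str):
        super().__init__()
        # Normalise a single path into a list (assumption, mirrors CleanWebLoader).
        self.path_list = [path] if isinstance(path, str) else path
        # Per-instance registry mapping file extension to loader class.
        self.loaders = {
            ".pdf": PyPDFLoader,
            ".json": JSONLoader,
            ".txt": TextLoader,
            ".csv": CSVLoader,
        }
```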

@staticmethod
def ds_converter(docs):
"""
Converts a list of documents into a specific data structure.

:param docs: List of documents to be converted.
:return: List of dictionaries, each representing a document with a 'text' key.
"""
return [{"text": doc.page_content} for doc in docs]

@staticmethod
def junk_remover(docs):
"""
Filters out suspected junk documents and returns the remaining clean ones.

:param docs: A list of Document objects to be filtered.
:return: The documents that remain after dropping suspected junk, i.e. documents shorter than
300 characters and exact-text duplicates.
"""
junk_docs = [doc for doc in docs if len(doc.page_content) < 300]
seen_texts = set()
clear_docs = []
for doc in docs:
if doc.page_content not in seen_texts and doc not in junk_docs:
clear_docs.append(doc)
seen_texts.add(doc.page_content)
return clear_docs
Comment on lines +28 to +55
Member:
suggestion: yep, those should be shared, if you want them to be inside of this class - make them a mixin
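If the shared route is taken, a minimal mixin along these lines (the name is hypothetical) could be inherited by both CleanWebLoader and LocalDataLoader instead of duplicating the two helpers:

```python
from langchain_core.documents import Document


class DocumentCleaningMixin:
    """Shared post-processing: drop junk documents, then convert to dicts."""

    @staticmethod
    def junk_remover(docs: list[Document]) -> list[Document]:
        # Treat very short documents as junk and drop exact-text duplicates.
        junk_docs = [doc for doc in docs if len(doc.page_content) < 300]
        seen_texts: set[str] = set()
        clear_docs: list[Document] = []
        for doc in docs:
            if doc.page_content not in seen_texts and doc not in junk_docs:
                clear_docs.append(doc)
                seen_texts.add(doc.page_content)
        return clear_docs

    @staticmethod
    def ds_converter(docs: list[Document]) -> list[dict]:
        # Keep only the text content, as downstream code expects.
        return [{"text": doc.page_content} for doc in docs]
```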


def load(self) -> List[dict]:
"""
Load data from the specified paths using registered data loaders.

This method iterates through each path in the 'path_list', explores the directories, and processes each file
using the appropriate data loader based on the file extension. The loaded data is then appended to a list,
which is further processed to remove junk and convert the data structure. The final list of dictionaries is returned.

:return: A list of dictionaries representing the loaded and processed data.
"""
docs = []
for path in self.path_list:
docs.extend(self._process_directory(path))

docs = self._process_loaded_data(docs)
return docs

def _process_directory(self, path: str) -> List[dict]:
"""
Process all files in the given directory path using the appropriate loaders.

:param path: The path to the directory containing the files.
:return: A list of dictionaries with loaded data.
"""
loaded_docs = []
for root, _, files in os.walk(path):
for file in files:
file_path = os.path.join(root, file)
doc = self._load_file(file_path)
if doc:
loaded_docs.append(doc)
return loaded_docs

def _load_file(self, file_path: str) -> dict:
"""
Load a single file using the appropriate loader based on its extension.

:param file_path: The full path to the file.
:return: A dictionary with loaded data or None if an error occurred.
"""
file_extension = os.path.splitext(file_path)[1].lower()
loader = LocalDataLoader.loaders.get(file_extension)

if loader:
try:
return loader(file_path).load()[0]
except Exception as e:
logging.error(f"Error loading file {file_path}: {e}")
else:
logging.warning(f"No loader found for file type: {file_extension}")
return None

def _process_loaded_data(self, docs: List[dict]) -> List[dict]:
"""
Process loaded data by removing junk and converting the data structure.

:param docs: A list of dictionaries with raw loaded data.
:return: A list of dictionaries with cleaned and processed data.
"""
docs = LocalDataLoader.junk_remover(docs)
return LocalDataLoader.ds_converter(docs)
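And a hypothetical usage sketch of this loader, assuming the same package layout (loaders/local_data_loader.py) and a local ./data directory:

```python
from loaders.local_data_loader import LocalDataLoader

# Load every supported file (.pdf, .json, .txt, .csv) under ./data
# into cleaned {"text": ...} records.
loader = LocalDataLoader(["./data"])
docs = loader.load()
print(f"Loaded {len(docs)} documents")
```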