Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: modular parser and formatter v0 #175

Merged
merged 14 commits into from
Jan 10, 2025
Merged
Empty file.
26 changes: 0 additions & 26 deletions libs/megaparse/src/megaparse/checker/format_checker.py

This file was deleted.

211 changes: 0 additions & 211 deletions libs/megaparse/src/megaparse/checker/markdown_processor.py

This file was deleted.

41 changes: 31 additions & 10 deletions libs/megaparse/src/megaparse/examples/parse_file.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,39 @@
from megaparse import MegaParse
import asyncio
from pathlib import Path
from typing import List

from langchain_openai import ChatOpenAI
from llama_index.core.schema import Document as LlamaDocument
from llama_parse import LlamaParse
from llama_parse.utils import Language, ResultType
from megaparse.formatter.structured_formatter.custom_structured_formatter import (
CustomStructuredFormatter,
)
from megaparse.megaparse import MegaParse
from megaparse.parser.doctr_parser import DoctrParser
from megaparse.parser.unstructured_parser import UnstructuredParser
import pypdfium2 as pdfium
from megaparse_sdk.schema.extensions import FileExtension
from pydantic import BaseModel, Field


# Target schema for the structured output produced in this example.
# NOTE: documented with comments rather than a class docstring on purpose —
# pydantic would copy a docstring into the generated schema description.
class MyCustomFormat(BaseModel):
    # Title extracted from the parsed document.
    title: str = Field(description="The title of the document.")
    # Problem statement extracted from the parsed document.
    problem: str = Field(description="The problem statement.")
    # Solution statement extracted from the parsed document.
    solution: str = Field(description="The solution statement.")


async def main():
    """Parse the sample PDF and print the formatted result.

    Builds a ``MegaParse`` instance that uses ``DoctrParser`` as its OCR
    parser and a single ``CustomStructuredFormatter`` targeting
    ``MyCustomFormat``, then awaits ``aload`` on the sample file and prints
    whatever it returns.
    """
    # Parse a file
    parser = DoctrParser()
    # ``model=`` selects the OpenAI model; ``name=`` would only label the
    # runnable and silently leave the library's default model in use.
    model = ChatOpenAI(model="gpt-4o")
    formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat)

    megaparse = MegaParse(ocr_parser=parser, formatters=[formatter_1])

    # Path is relative to the current working directory.
    file_path = Path("./tests/pdf/sample_pdf.pdf")
    result = await megaparse.aload(file_path=file_path)
    print(result)


if __name__ == "__main__":
    # main() is a coroutine function: calling it bare only creates a
    # coroutine object that is never awaited, so drive it with asyncio.run().
    asyncio.run(main())
33 changes: 33 additions & 0 deletions libs/megaparse/src/megaparse/formatter/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from abc import ABC
from pathlib import Path
from typing import List, Union

from langchain_core.language_models.chat_models import BaseChatModel
from megaparse.models.document import Document


class BaseFormatter(ABC):
    """
    Base class for formatters that post-process a parsed document.

    Subclasses are expected to override ``format`` and/or ``aformat``; the
    implementations here only raise ``NotImplementedError``.

    Attributes
    ----------
    model : BaseChatModel | None
        Optional chat model made available to subclasses. The base class
        stores it but does not use it.

    Methods
    -------
    format(document, file_path=None) -> Document | str
        Synchronous formatting entry point.
    aformat(document, file_path=None) -> Document | str
        Asynchronous formatting entry point.
    """

    def __init__(self, model: BaseChatModel | None = None):
        """Store the optional chat model on ``self.model``."""
        self.model = model

    def format(
        self, document: Document, file_path: Path | str | None = None
    ) -> Union[Document, str]:
        """Format ``document`` synchronously.

        Parameters
        ----------
        document : Document
            The parsed document to format.
        file_path : Path | str | None
            Optional path of the source file the document was parsed from.

        Raises
        ------
        NotImplementedError
            Always, in the base class.
        """
        raise NotImplementedError("Subclasses should implement this method")

    async def aformat(
        self, document: Document, file_path: Path | str | None = None
    ) -> Union[Document, str]:
        """Format ``document`` asynchronously.

        Takes the same parameters as ``format``.

        Raises
        ------
        NotImplementedError
            Always, in the base class.
        """
        raise NotImplementedError("Subclasses should implement this method")
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from pathlib import Path
from langchain_core.language_models.chat_models import BaseChatModel
from megaparse.formatter.base import BaseFormatter
from megaparse.models.document import Document
from pydantic import BaseModel


class StructuredFormatter(BaseFormatter):
    """Formatter stub parameterised by a pydantic output model.

    Keeps the chat model (via ``BaseFormatter``) and the target
    ``output_model`` class; both formatting entry points are left
    unimplemented here and raise ``NotImplementedError``.
    """

    def __init__(self, model: BaseChatModel, output_model: type[BaseModel]):
        """Record the chat model and the pydantic class describing the output."""
        super().__init__(model=model)
        self.output_model = output_model

    # FIXME: Return a structured output of type BaseModel ?
    def format(
        self,
        document: Document,
        file_path: Path | str | None = None,
    ) -> str:
        """Synchronous entry point; not implemented in this class."""
        raise NotImplementedError

    # FIXME: Return a structured output of type BaseModel ?
    async def aformat(
        self,
        document: Document,
        file_path: Path | str | None = None,
    ) -> str:
        """Asynchronous entry point; not implemented in this class."""
        raise NotImplementedError
Loading
Loading