Skip to content

Commit

Permalink
chore: files added
Browse files Browse the repository at this point in the history
  • Loading branch information
nuffin committed Oct 18, 2024
1 parent 3e27bfc commit f896635
Show file tree
Hide file tree
Showing 11 changed files with 155 additions and 5 deletions.
36 changes: 36 additions & 0 deletions llmpa/fileparser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,39 @@
"""
Document file types support
"""

from . import docx
from . import pdf
from . import pptx
from . import xlsx
from . import csv
from . import text
from . import image
from . import video

from .mimetype import detect

# Registry of supported MIME types and the parser class that handles each one.
parsers = {
    # Document formats.
    "application/pdf": pdf.PdfFileParser,
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": docx.DocxFileParser,
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": pptx.PptxFileParser,
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": xlsx.XlsxFileParser,
    # Plain-text formats.
    "text/csv": csv.CsvFileParser,
    "text/plain": text.TextFileParser,
    # All image types share a single parser class.
    **dict.fromkeys(
        ("image/gif", "image/jpeg", "image/png", "image/svg+xml"),
        image.ImageFileParser,
    ),
    # All video types share a single parser class.
    **dict.fromkeys(
        ("video/mp4", "video/quicktime", "video/x-msvideo"),
        video.VideoFileParser,
    ),
}


def tokenize(filepath: str):
    """Tokenize the file at *filepath* with the parser registered for its MIME type.

    The MIME type comes from ``detect(filepath)`` and is looked up in the
    module-level ``parsers`` registry.

    Args:
        filepath: Path of the file to tokenize.

    Returns:
        Whatever the selected parser's ``tokenize()`` returns, or ``None``
        when the detected MIME type has no registered parser.
    """
    mimetype = detect(filepath)
    parser_cls = parsers.get(mimetype)
    if parser_cls is None:
        # Unsupported types are reported on stdout rather than raised.
        print(f"Unsupported file type: {mimetype}")
        return None
    # Hand the parser's result back to the caller instead of discarding it.
    return parser_cls(filepath).tokenize()
15 changes: 15 additions & 0 deletions llmpa/fileparser/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from . import tokenize


def main():
    """Command-line entry point: tokenize the file named by the first argument.

    When no file path is given, prints a usage message to standard error and
    exits with status 1.
    """
    import sys

    if len(sys.argv) < 2:
        # This module is run with ``python -m``; there is no ``document.py``.
        print("Usage: python -m llmpa.fileparser <filepath>", file=sys.stderr)
        sys.exit(1)
    tokenize(sys.argv[1])


if __name__ == "__main__":
    main()
10 changes: 10 additions & 0 deletions llmpa/fileparser/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
class BaseFileParser:
    """Common base class for the per-format file parsers.

    Subclasses are expected to override :meth:`parse` and :meth:`tokenize`;
    the versions defined here only raise ``NotImplementedError``.
    """

    def __init__(self, file_path):
        """Remember the path of the file this parser works on.

        Args:
            file_path: Location of the file to be parsed.
        """
        self.file_path = file_path
        # Starts out unset; nothing in this class assigns it afterwards.
        self.file_type = None

    def parse(self):
        """Parse the file; must be provided by a subclass.

        Raises:
            NotImplementedError: Always. The message names the concrete class
                so a missing override is easy to locate.
        """
        raise NotImplementedError(
            f"{type(self).__name__}.parse() is not implemented"
        )

    def tokenize(self):
        """Tokenize the file; must be provided by a subclass.

        Raises:
            NotImplementedError: Always. The message names the concrete class
                so a missing override is easy to locate.
        """
        raise NotImplementedError(
            f"{type(self).__name__}.tokenize() is not implemented"
        )
12 changes: 12 additions & 0 deletions llmpa/fileparser/csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from .base import BaseFileParser


class CsvFileParser(BaseFileParser):
    """Parser for CSV files; each operation currently only prints a message."""

    # Construction is inherited unchanged: CsvFileParser(file_path).

    def parse(self):
        """Print a message announcing the CSV parse step."""
        print("Parsing csv file")

    def tokenize(self):
        """Print a message announcing the CSV tokenize step."""
        print("Tokenizing csv file")
12 changes: 12 additions & 0 deletions llmpa/fileparser/docx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from .base import BaseFileParser


class DocxFileParser(BaseFileParser):
    """Parser for DOCX files; parsing is not implemented, tokenizing only prints."""

    # Construction is inherited unchanged: DocxFileParser(file_path).

    def parse(self):
        """Not available for DOCX files.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def tokenize(self):
        """Print a message announcing the DOCX tokenize step."""
        print("Tokenizing docx file")
12 changes: 12 additions & 0 deletions llmpa/fileparser/image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from .base import BaseFileParser


class ImageFileParser(BaseFileParser):
    """Parser for image files; parsing is not implemented, tokenizing only prints."""

    # Construction is inherited unchanged: ImageFileParser(file_path).

    def parse(self):
        """Not available for image files.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def tokenize(self):
        """Print a message announcing the image tokenize step."""
        print("Tokenizing image file")
12 changes: 12 additions & 0 deletions llmpa/fileparser/pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from .base import BaseFileParser


class PdfFileParser(BaseFileParser):
    """Parser for PDF files; each operation currently only prints a message."""

    # Construction is inherited unchanged: PdfFileParser(file_path).

    def parse(self):
        """Print a message announcing the PDF parse step."""
        print("Parsing pdf file")

    def tokenize(self):
        """Print a message announcing the PDF tokenize step."""
        print("Tokenizing pdf file")
12 changes: 12 additions & 0 deletions llmpa/fileparser/pptx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from .base import BaseFileParser


class PptxFileParser(BaseFileParser):
    """Parser for PPTX files; parsing is not implemented, tokenizing only prints."""

    # Construction is inherited unchanged: PptxFileParser(file_path).

    def parse(self):
        """Not available for PPTX files.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def tokenize(self):
        """Print a message announcing the PPTX tokenize step."""
        print("Tokenizing pptx file")
15 changes: 10 additions & 5 deletions llmpa/fileparser/text.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import common
from .base import BaseFileParser


def tokenize(filepath: str, encoding: str = "utf-8"):
    """Read a text file and split its contents into whitespace-separated tokens.

    Args:
        filepath: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list of the whitespace-delimited words in the file; empty when the
        file is empty or holds only whitespace.
    """
    with open(filepath, "r", encoding=encoding) as f:
        return f.read().split()
class TextFileParser(BaseFileParser):
    """Parser for plain-text files; each operation currently only prints a message."""

    # Construction is inherited unchanged: TextFileParser(file_path).

    def parse(self):
        """Print a message announcing the text parse step."""
        print("Parsing text file")

    def tokenize(self):
        """Print a message announcing the text tokenize step."""
        print("Tokenizing text file")
12 changes: 12 additions & 0 deletions llmpa/fileparser/video.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from .base import BaseFileParser


class VideoFileParser(BaseFileParser):
    """Parser for video files; each operation currently only prints a message."""

    # Construction is inherited unchanged: VideoFileParser(file_path).

    def parse(self):
        """Print a message announcing the video parse step."""
        print("Parsing video file")

    def tokenize(self):
        """Print a message announcing the video tokenize step."""
        print("Tokenizing video file")
12 changes: 12 additions & 0 deletions llmpa/fileparser/xlsx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from .base import BaseFileParser


class XlsxFileParser(BaseFileParser):
    """Parser for XLSX files; parsing is not implemented, tokenizing only prints."""

    # Construction is inherited unchanged: XlsxFileParser(file_path).

    def parse(self):
        """Not available for XLSX files.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def tokenize(self):
        """Print a message announcing the XLSX tokenize step."""
        print("Tokenizing xlsx file")

0 comments on commit f896635

Please sign in to comment.