Skip to content

Commit

Permalink
feat: big html pagination
Browse files Browse the repository at this point in the history
Signed-off-by: 117503445 <[email protected]>
  • Loading branch information
117503445 committed Aug 23, 2023
1 parent bed0c00 commit b725b8c
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 29 deletions.
13 changes: 13 additions & 0 deletions src/flow_pdf/flow_pdf/dev.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from worker.html_gen import HTMLGenWorker, DocInputParams, PageInputParams
from pathlib import Path

def main():
    """Run the HTML generation worker once against a fixed output directory.

    Development entry point: builds a ``DocInputParams`` by hand and calls
    ``HTMLGenWorker.run`` directly, without going through the normal
    file-discovery path.
    """
    worker = HTMLGenWorker()

    # NOTE(review): argument meanings are inferred from position only --
    # presumably (input file, output directory, page count); confirm against
    # the DocInputParams definition.
    output_dir = Path('./data/flow_pdf_output/The_C++_Programming_Language_4th_Edition_Bjarne_Stroustrup')
    doc_params = DocInputParams(Path(), output_dir, -1)

    # The second argument is passed as None here; the type checker is
    # silenced because the signature presumably expects a real value.
    worker.run(doc_params, None)  # type: ignore




if __name__ == '__main__':
main()
5 changes: 4 additions & 1 deletion src/flow_pdf/flow_pdf/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ def get_files_from_cfg() -> list[tuple[Path, Path]]:
dir_input = dir_data / "input"

tags_include = set(cfg["files"]["tags"]["include"])
tags_exclude = set(cfg["files"]["tags"]["exclude"])
if 'exclude' in cfg["files"]["tags"]:
tags_exclude = set(cfg["files"]["tags"]["exclude"])
else:
tags_exclude = set()

files: list[tuple[Path, Path]] = []

Expand Down
70 changes: 42 additions & 28 deletions src/flow_pdf/flow_pdf/worker/html_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,34 +45,48 @@ def run(

html = file.read_text(Path(__file__).parent / "template.html")

soup = BeautifulSoup(html, "html.parser")

# add version to head
for k, v in doc["meta"].items():
soup.html.head.append(soup.new_tag("meta", attrs={"name": k, "content": v})) # type: ignore

for element in doc["elements"]:
if element["type"] == "paragraph":
t = soup.new_tag("p")

for c in element["children"]:
if c["type"] == "text":
t.append(c["text"])
elif c["type"] == "shot":
t.append(
soup.new_tag(
"img", src=c["path"], attrs={"class": "inline-img"}
def mk_html(elements, dest: Path):
soup = BeautifulSoup(html, "html.parser")

# add version to head
for k, v in doc["meta"].items():
soup.html.head.append(soup.new_tag("meta", attrs={"name": k, "content": v})) # type: ignore

for element in elements:
if element["type"] == "paragraph":
t = soup.new_tag("p")

for c in element["children"]:
if c["type"] == "text":
t.append(c["text"])
elif c["type"] == "shot":
t.append(
soup.new_tag(
"img", src=c["path"], attrs={"class": "inline-img"}
)
)
)
else:
self.logger.warning(f"unknown child type {c['type']}")
soup.html.body.append(t) # type: ignore
elif element["type"] == "shot":
t = soup.new_tag("img", src=element["path"], attrs={"class": "shot"})
soup.html.body.append(t) # type: ignore

else:
self.logger.warning(f"unknown element type {element['type']}")
file.write_text(doc_in.dir_output / "output" / "index.html", soup.prettify())
else:
self.logger.warning(f"unknown child type {c['type']}")
soup.html.body.append(t) # type: ignore
elif element["type"] == "shot":
t = soup.new_tag(
"img", src=element["path"], attrs={"class": "shot"}
)
soup.html.body.append(t) # type: ignore
else:
self.logger.warning(f"unknown element type {element['type']}")
file.write_text(dest, soup.prettify())

# Documents with fewer elements than this are rendered as one index.html;
# larger ones are split into several part_<i>.html files.
BIG_ELEMENT_SIZE = 5000

if len(doc["elements"]) < BIG_ELEMENT_SIZE:
    mk_html(doc["elements"], doc_in.dir_output / "output" / "index.html")
else:
    # Number of elements written to each part file.
    PER_HTML_ELEMENTS = 500
    # Step through the list by page size rather than computing a page count
    # with "+ 1": when the element count is an exact multiple of
    # PER_HTML_ELEMENTS the old form produced a trailing part file that
    # contained no elements.
    for i, start in enumerate(range(0, len(doc["elements"]), PER_HTML_ELEMENTS)):
        mk_html(
            doc["elements"][start : start + PER_HTML_ELEMENTS],
            doc_in.dir_output / "output" / f"part_{i}.html",
        )

return DocOutParams(), []

0 comments on commit b725b8c

Please sign in to comment.