feat: split big HTML output into paginated part files

Signed-off-by: 117503445 <[email protected]>
117503445 · Aug 23, 2023 · b725b8c · b725b8c
1 parent bed0c00
commit b725b8c
Show file tree

Hide file tree

Showing 3 changed files with 59 additions and 29 deletions.
diff --git a/src/flow_pdf/flow_pdf/dev.py b/src/flow_pdf/flow_pdf/dev.py
@@ -0,0 +1,13 @@
+from worker.html_gen import HTMLGenWorker, DocInputParams, PageInputParams
+from pathlib import Path
+
+def main():  # dev entry point: run HTMLGenWorker once on a hard-coded sample directory
+    worker = HTMLGenWorker()
+    params = DocInputParams(Path(), Path('./data/flow_pdf_output/The_C++_Programming_Language_4th_Edition_Bjarne_Stroustrup'), -1)
+    worker.run(params, None)  # type: ignore
+
+
+
+
+if __name__ == '__main__':  # run only when executed as a script, not on import
+    main()
diff --git a/src/flow_pdf/flow_pdf/main.py b/src/flow_pdf/flow_pdf/main.py
@@ -33,7 +33,10 @@ def get_files_from_cfg() -> list[tuple[Path, Path]]:
     dir_input = dir_data / "input"
 
     tags_include = set(cfg["files"]["tags"]["include"])
-    tags_exclude = set(cfg["files"]["tags"]["exclude"])
+    # "exclude" is optional in the config; a missing key means nothing is excluded.
+    tags_cfg = cfg["files"]["tags"]
+    has_exclude = "exclude" in tags_cfg
+    tags_exclude = set(tags_cfg["exclude"]) if has_exclude else set()
 
     files: list[tuple[Path, Path]] = []
 

diff --git a/src/flow_pdf/flow_pdf/worker/html_gen.py b/src/flow_pdf/flow_pdf/worker/html_gen.py
@@ -45,34 +45,48 @@ def run(
 
         html = file.read_text(Path(__file__).parent / "template.html")
 
-        soup = BeautifulSoup(html, "html.parser")
-
-        # add version to head
-        for k, v in doc["meta"].items():
-            soup.html.head.append(soup.new_tag("meta", attrs={"name": k, "content": v}))  # type: ignore
-
-        for element in doc["elements"]:
-            if element["type"] == "paragraph":
-                t = soup.new_tag("p")
-
-                for c in element["children"]:
-                    if c["type"] == "text":
-                        t.append(c["text"])
-                    elif c["type"] == "shot":
-                        t.append(
-                            soup.new_tag(
-                                "img", src=c["path"], attrs={"class": "inline-img"}
+        def mk_html(elements, dest: Path):  # build one HTML page from `elements` and write it to `dest`
+            soup = BeautifulSoup(html, "html.parser")  # parse the template text afresh for this page
+
+            # copy every doc["meta"] entry into <head> as a <meta name=... content=...> tag
+            for k, v in doc["meta"].items():
+                soup.html.head.append(soup.new_tag("meta", attrs={"name": k, "content": v}))  # type: ignore
+
+            for element in elements:
+                if element["type"] == "paragraph":  # paragraph -> <p> filled from its children
+                    t = soup.new_tag("p")
+
+                    for c in element["children"]:
+                        if c["type"] == "text":
+                            t.append(c["text"])
+                        elif c["type"] == "shot":  # image child placed inside the paragraph
+                            t.append(
+                                soup.new_tag(
+                                    "img", src=c["path"], attrs={"class": "inline-img"}
+                                )
                             )
-                        )
-                    else:
-                        self.logger.warning(f"unknown child type {c['type']}")
-                soup.html.body.append(t)  # type: ignore
-            elif element["type"] == "shot":
-                t = soup.new_tag("img", src=element["path"], attrs={"class": "shot"})
-                soup.html.body.append(t)  # type: ignore
-
-            else:
-                self.logger.warning(f"unknown element type {element['type']}")
-        file.write_text(doc_in.dir_output / "output" / "index.html", soup.prettify())
+                        else:  # unknown child types are skipped, only logged
+                            self.logger.warning(f"unknown child type {c['type']}")
+                    soup.html.body.append(t)  # type: ignore
+                elif element["type"] == "shot":  # top-level image appended directly to <body>
+                    t = soup.new_tag(
+                        "img", src=element["path"], attrs={"class": "shot"}
+                    )
+                    soup.html.body.append(t)  # type: ignore
+                else:  # unknown element types are skipped, only logged
+                    self.logger.warning(f"unknown element type {element['type']}")
+            file.write_text(dest, soup.prettify())  # write the prettified page to `dest`
+
+        BIG_ELEMENT_SIZE = 5000  # at or above this many elements, split the output
+        PER_HTML_ELEMENTS = 500  # elements written to each part_<i>.html page
+        all_elements, dir_html = doc["elements"], doc_in.dir_output / "output"
+
+        if len(all_elements) < BIG_ELEMENT_SIZE:
+            mk_html(all_elements, dir_html / "index.html")
+        else:
+            # Step over start offsets so an exact multiple does not yield an empty last page.
+            starts = range(0, len(all_elements), PER_HTML_ELEMENTS)
+            for i, start in enumerate(starts):
+                mk_html(all_elements[start : start + PER_HTML_ELEMENTS], dir_html / f"part_{i}.html")
 
         return DocOutParams(), []