Merge branch 'main' into dev

opendatalab · Sep 11, 2024 · d88d78a · d88d78a
2 parents c616239 + 7ffee83
commit d88d78a
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 3 deletions.
diff --git a/README-zh_CN.md b/README-zh_CN.md
@@ -9,7 +9,7 @@
 
 [English](./README.md) | 简体中文
 
-[[Models (🤗Hugging Face)]](https://huggingface.co/wanderkid/PDF-Extract-Kit) | [[Models(<img src="./assets/images/modelscope_logo.png" width="20px">ModelScope)]](https://www.modelscope.cn/models/wanderkid/PDF-Extract-Kit) 
+[[Models (🤗Hugging Face)]](https://huggingface.co/wanderkid/PDF-Extract-Kit) | [[Models(<img src="./assets/images/modelscope_logo.png" width="20px">ModelScope)]](https://www.modelscope.cn/models/OpenDataLab/PDF-Extract-Kit) 
 
 
 🔥🔥🔥 [MinerU：基于PDF-Extract-Kit的高效文档内容提取工具](https://github.com/opendatalab/MinerU)

diff --git a/README.md b/README.md
@@ -7,7 +7,7 @@
 
 English | [简体中文](./README-zh_CN.md)
 
-[[Models (🤗Hugging Face)]](https://huggingface.co/wanderkid/PDF-Extract-Kit) | [[Models(<img src="./assets/images/modelscope_logo.png" width="20px">ModelScope)]](https://www.modelscope.cn/models/wanderkid/PDF-Extract-Kit) 
+[[Models (🤗Hugging Face)]](https://huggingface.co/opendatalab/PDF-Extract-Kit) | [[Models(<img src="./assets/images/modelscope_logo.png" width="20px">ModelScope)]](https://www.modelscope.cn/models/OpenDataLab/PDF-Extract-Kit) 
 
 🔥🔥🔥 [MinerU: Efficient Document Content Extraction Tool Based on PDF-Extract-Kit](https://github.com/opendatalab/MinerU)
 
@@ -257,6 +257,7 @@ Parameter explanations:
 - `--output`: Path where the results are saved, default is "output".
 - `--vis`: Whether to visualize the results; if yes, detection results including bounding boxes and categories will be visualized.
 - `--render`: Whether to render the recognized results, including LaTeX code for formulas and plain text, which will be rendered and placed in the detection boxes. Note: This process is very time-consuming, and also requires prior installation of `xelatex` and `imagemagick`.
+- `--batch-size`: Batch size for the dataloader used in formula recognition. Larger batch sizes are recommended, but smaller sizes require less GPU memory. Default is 128.
 
 > This project is dedicated to using models for high-quality content extraction from diverse documents. It does not involve reassembling the extracted content into new documents, such as converting PDFs to Markdown. For those needs, please refer to our other GitHub project: [MinerU](https://github.com/opendatalab/MinerU)
 

diff --git a/pdf_extract.py b/pdf_extract.py
@@ -9,6 +9,7 @@
 import shutil
 import torch
 import numpy as np
+import gc
 
 from paddleocr import draw_ocr
 from PIL import Image, ImageDraw, ImageFont
@@ -77,6 +78,7 @@ def __getitem__(self, idx):
     parser = argparse.ArgumentParser()
     parser.add_argument('--pdf', type=str)
     parser.add_argument('--output', type=str, default="output")
+    parser.add_argument('--batch-size', type=int, default=128)
     parser.add_argument('--vis', action='store_true')
     parser.add_argument('--render', action='store_true')
     args = parser.parse_args()
@@ -147,11 +149,17 @@ def __getitem__(self, idx):
                 width = img_W
             )
             doc_layout_result.append(layout_res)
+
+            del mfd_res
+            torch.cuda.empty_cache()
+            gc.collect()
 
         # Formula recognition: collect all formula images in the whole PDF file, then run batched inference on them.
         a = time.time()  
         dataset = MathDataset(mf_image_list, transform=mfr_transform)
-        dataloader = DataLoader(dataset, batch_size=64, num_workers=0)
+
+        dataloader = DataLoader(dataset, batch_size=args.batch_size, num_workers=0)
+
         mfr_res = []
         for imgs in dataloader:
             imgs = imgs.to(device)