diff --git a/src/bisheng_unstructured/api/pipeline.py b/src/bisheng_unstructured/api/pipeline.py index 82f2c21..68b4e1a 100644 --- a/src/bisheng_unstructured/api/pipeline.py +++ b/src/bisheng_unstructured/api/pipeline.py @@ -24,7 +24,7 @@ from bisheng_unstructured.staging.base import convert_to_isd -def partition_pdf(filename, model_params, **kwargs): +def partition_pdf(filename, model_params, scale=1, **kwargs): if kwargs.get("mode") == "local": # pypdf 进行解析 import pypdf @@ -43,7 +43,7 @@ def partition_pdf(filename, model_params, **kwargs): # if rt_type in {"ocr_sdk", "idp"}: # doc = IDP_PDFDocument(file=filename, model_params=model_params, **kwargs) # else: - doc = PDFDocument(file=filename, model_params=model_params, **kwargs) + doc = PDFDocument(file=filename, model_params=model_params, scale=scale, **kwargs) _ = doc.pages return doc.elements @@ -156,7 +156,7 @@ def predict(self, inp: UnstructuredInput) -> UnstructuredOutput: return UnstructuredOutput(status_code=400, status_message="本地模型不支持图片") if part_func == partition_pdf or part_func == partition_image: - part_inp.update({"model_params": self.pdf_model_params}) + part_inp.update({"model_params": self.pdf_model_params, "scale": inp.scale}) try: elements = part_func(**part_inp) mode = inp.mode diff --git a/src/bisheng_unstructured/api/types.py b/src/bisheng_unstructured/api/types.py index ef81a16..0e67964 100644 --- a/src/bisheng_unstructured/api/types.py +++ b/src/bisheng_unstructured/api/types.py @@ -11,6 +11,7 @@ class UnstructuredInput(BaseModel): mode: str = "text" # text, partition, vis, topdf file_path: Optional[str] = None file_type: Optional[str] = None + scale: Optional[float] = 1 is_scan: Optional[bool] = None diff --git a/src/bisheng_unstructured/config/config.yaml b/src/bisheng_unstructured/config/config.yaml index 4749c31..75e0067 100644 --- a/src/bisheng_unstructured/config/config.yaml +++ b/src/bisheng_unstructured/config/config.yaml @@ -11,13 +11,13 @@ logger_conf: # 日志级别 level: INFO # 
和原生不一样,后端会将配置使用eval()执行转为函数用来过滤特定日志级别。推荐lambda - filter: "lambda record: record['level'].name == 'INFO'" + # filter: "lambda record: record['level'].name == 'INFO'" # 日志格式化函数,extra内支持trace_id format: "[{time:YYYY-MM-DD HH:mm:ss.SSSSSS}]|{level}|BISHENG|{extra[trace_id]}|{process.id}|{thread.id}|{message}" # 每天的几点进行切割 rotation: "00:00" retention: "3 Days" - - sink: "/app/logs/err-v0-BISHENG-{HOSTNAME}.log" + - sink: "/app/logs/err-v0-BISHENG-UNS-{HOSTNAME}.log" level: ERROR filter: "lambda record: record['level'].name == 'ERROR'" format: "[{time:YYYY-MM-DD HH:mm:ss.SSSSSS}]|{level}|BISHENG|{extra[trace_id]}||{process.id}|{thread.id}|||#EX_ERR:POS={name},line {line},ERR=500,EMSG={message}" @@ -26,14 +26,14 @@ logger_conf: # pdf解析需要用到的模型配置, 配置了rt_server环境变量的话会替换为对应的地址 pdf_model_params: - layout_ep: "http://192.168.106.12:9001/v2.1/models/elem_layout_v1/infer" - cell_model_ep: "http://192.168.106.12:9001/v2.1/models/elem_table_cell_detect_v1/infer" - rowcol_model_ep: "http://192.168.106.12:9001/v2.1/models/elem_table_rowcol_detect_v1/infer" - table_model_ep: "http://192.168.106.12:9001/v2.1/models/elem_table_detect_v1/infer" - ocr_model_ep: "http://192.168.106.12:9001/v2.1/models/elem_ocr_collection_v3/infer" + layout_ep: "http://10.60.38.67:3011/v2.1/models/elem_layout_v1/infer" + cell_model_ep: "http://10.60.38.67:3011/v2.1/models/elem_table_cell_detect_v1/infer" + rowcol_model_ep: "http://10.60.38.67:3011/v2.1/models/elem_table_rowcol_detect_v1/infer" + table_model_ep: "http://10.60.38.67:3011/v2.1/models/elem_table_detect_v1/infer" + ocr_model_ep: "http://10.60.38.67:3011/v2.1/models/elem_ocr_collection_v3/infer" # 是否全部走ocr识别, false的话则由代码逻辑判断是否需要走ocr识别 -is_all_ocr: false +is_all_ocr: true # ocr识别需要的配置项 ocr_conf: params: diff --git a/src/bisheng_unstructured/documents/pdf_parser/pdf.py b/src/bisheng_unstructured/documents/pdf_parser/pdf.py index be2d557..2f9c766 100644 --- a/src/bisheng_unstructured/documents/pdf_parser/pdf.py +++ 
b/src/bisheng_unstructured/documents/pdf_parser/pdf.py @@ -271,6 +271,7 @@ def __init__( support_formula: bool = False, enable_isolated_formula: bool = False, n_parallel: int = 10, + scale: float = 1, **kwargs, ) -> None: """Initialize with a file path.""" @@ -301,6 +302,7 @@ def __init__( self.support_formula = support_formula self.enable_isolated_formula = enable_isolated_formula self.n_parallel = n_parallel + self.scale = scale self.is_scan = is_scan self.mode = kwargs.get("mode", "local") super().__init__() @@ -321,7 +323,7 @@ def _get_image_blobs(self, fitz_doc, pdf_reader, n=None, start=0): except Exception: # some pdf input cannot get render image from fitz page = pdf_reader.get_page(pg) - pil_image = page.render().to_pil() + pil_image = page.render(scale=self.scale).to_pil() img_byte_arr = io.BytesIO() pil_image.save(img_byte_arr, format="PNG") bytes_img = img_byte_arr.getvalue() @@ -1102,7 +1104,14 @@ def _get_elem(blocks, is_first=True): b0, b0_label, r0, r0_w, r0_h = _get_elem(groups[i], False) return groups - + def _divide_by_scale(self,input_list,scale): + result = [] + for item in input_list: + if isinstance(item, list): + result.append(self._divide_by_scale(item,scale)) + else: + result.append(item / scale) + return result def _save_to_pages(self, groups, page_inds, lang): TITLE_ID = 3 TEXT_ID = 4 @@ -1118,7 +1127,7 @@ def _save_to_pages(self, groups, page_inds, lang): text = b.block_text element = None - extra_data = {"bboxes": [bbox], "pages": b.pages} + extra_data = {"bboxes": [self._divide_by_scale(bbox,self.scale)], "pages": b.pages} if label == TABLE_ID: # html = b[-1] @@ -1158,7 +1167,7 @@ def _save_to_pages(self, groups, page_inds, lang): # lines = b[-1] lines = b.ts line_cnt = len(lines) - extra_data.update({"bboxes": line_bboxes}) + extra_data.update({"bboxes": self._divide_by_scale(line_bboxes,self.scale)}) if True or lang == "zh": # for join test only line_chars_cnt = [len(line) for line in lines] indexes = [] @@ -1225,7 +1234,7 @@ def 
_task(textpage_info, bytes_img, img, is_scan, lang, rot_matirx, page_index: page_imgs = [] for idx in range(start, start + n): page = pdf_doc.get_page(idx) - pil_image = page.render().to_pil() + pil_image = page.render(scale=self.scale).to_pil() page_imgs.append(pil_image) img_byte_arr = io.BytesIO() pil_image.save(img_byte_arr, format="PNG") @@ -1261,7 +1270,7 @@ def _task(textpage_info, bytes_img, img, is_scan, lang, rot_matirx, page_index: type_texts = "".join(type_texts) zh_n = len(re.findall(ZH_CHAR, type_texts)) total_n = len(type_texts) - is_scan = total_n < 200 + is_scan = total_n < 200 or self.scale!=1 if not is_scan: lang = "zh" if zh_n > 200 or zh_n / total_n > 0.5 else "eng" else: @@ -1284,11 +1293,11 @@ def _task(textpage_info, bytes_img, img, is_scan, lang, rot_matirx, page_index: # 重新按页数顺序排序下输出的结果 all_blocks = [[] for _ in range(len(futures))] for future in as_completed(futures): - blocks, idx = future.result() + blocks, i = future.result() if not blocks: continue - logger.info("load_layout_result_end idx={} time={}", idx, timer.get()) - all_blocks[idx] = [idx, blocks] + logger.info("load_layout_result_end idx={} time={}", i, timer.get()) + all_blocks[i] = [i, blocks] # 重新排序下输出的结果 for one in all_blocks: diff --git a/src/bisheng_unstructured/models/ocr_agent.py b/src/bisheng_unstructured/models/ocr_agent.py index 5acd3f0..d6b278a 100644 --- a/src/bisheng_unstructured/models/ocr_agent.py +++ b/src/bisheng_unstructured/models/ocr_agent.py @@ -29,21 +29,21 @@ }, "scene_mapping": { "print": { - "det": "general_text_det_v2.0", - "recog": "general_text_reg_nb_v1.0_faster", + "det": "general_text_det_mrcnn_v2.0", + "recog": "transformer-blank-v0.2-faster", }, "hand": { - "det": "general_text_det_v2.0", - "recog": "general_text_reg_nb_v1.0_faster", + "det": "general_text_det_mrcnn_v2.0", + "recog": "transformer-hand-v1.16-faster", }, "print_recog": { - "recog": "general_text_reg_nb_v1.0_faster", + "recog": "transformer-blank-v0.2-faster", }, "hand_recog": { - 
"recog": "general_text_reg_nb_v1.0_faster", + "recog": "transformer-hand-v1.16-faster", }, "det": { - "det": "general_text_det_v2.0", + "det": "general_text_det_mrcnn_v2.0", }, }, } diff --git "a/tests/BSJMZIT094-2024B \347\247\221\347\240\224\351\241\271\347\233\256\347\256\241\347\220\206\345\212\236\346\263\225\357\274\2102024\347\211\210\357\274\211V1(1).pdf" "b/tests/BSJMZIT094-2024B \347\247\221\347\240\224\351\241\271\347\233\256\347\256\241\347\220\206\345\212\236\346\263\225\357\274\2102024\347\211\210\357\274\211V1(1).pdf" new file mode 100644 index 0000000..b38ba98 Binary files /dev/null and "b/tests/BSJMZIT094-2024B \347\247\221\347\240\224\351\241\271\347\233\256\347\256\241\347\220\206\345\212\236\346\263\225\357\274\2102024\347\211\210\357\274\211V1(1).pdf" differ diff --git a/tests/test_pdf_ll.py b/tests/test_pdf_ll.py new file mode 100644 index 0000000..7a45871 --- /dev/null +++ b/tests/test_pdf_ll.py @@ -0,0 +1,40 @@ +import base64 +from bisheng_unstructured.models import ( + FormulaAgent, + LayoutAgent, + OCRAgent, + RTLayoutAgent, + RTOCRAgent, + RTTableAgent, + RTTableDetAgent, + TableAgent, + TableDetAgent, +) + +url = f"http://10.60.38.67:3011/v2.1/models/" +layout_ep = url + "elem_layout_v1/infer" +cell_model_ep = url + "elem_table_cell_detect_v1/infer" +rowcol_model_ep = url + "elem_table_rowcol_detect_v1/infer" +table_model_ep = url + "elem_table_detect_v1/infer" + +model_params = { + "layout_ep": layout_ep, + "cell_model_ep": cell_model_ep, + "rowcol_model_ep": rowcol_model_ep, + "table_model_ep": table_model_ep, +} +class PDFDocumentTest(): + def __init__(): + self.layout_agent = RTLayoutAgent(**model_params) + self.table_agent = RTTableAgent(**model_params) + self.ocr_agent = RTOCRAgent(**model_params) + self.table_det_agent = RTTableDetAgent(**model_params) + def _task(textpage_info, bytes_img, img, is_scan, lang, rot_matirx, page_index: int): + b64_data = base64.b64encode(bytes_img).decode() + layout_inp = {"b64_image": b64_data} + 
layout = self.layout_agent.predict(layout_inp) + blocks = self._allocate_semantic( + textpage_info, layout, b64_data, img, is_scan, lang, rot_matrix + ) + return blocks, page_index + diff --git a/tests/test_pdf_parser.py b/tests/test_pdf_parser.py index 48de466..96b15c8 100644 --- a/tests/test_pdf_parser.py +++ b/tests/test_pdf_parser.py @@ -5,7 +5,7 @@ from bisheng_unstructured.documents.pdf_parser.pdf import PDFDocument RT_EP = os.environ.get("RT_EP", "192.168.106.12:9001") -TEST_RT_URL = f"http://{RT_EP}/v2.1/models/" +TEST_RT_URL = f"http://10.60.38.67:3011/v2.1/models/" def test_pdf_doc(): @@ -22,8 +22,8 @@ def test_pdf_doc(): "table_model_ep": table_model_ep, } - filename = "examples/docs/layout-parser-paper-fast.pdf" - pdf_doc = PDFDocument(file=filename, model_params=model_params, n=2) + filename = "tests/BSJMZIT094-2024B 科研项目管理办法(2024版)V1(1).pdf" + pdf_doc = PDFDocument(file=filename, model_params=model_params, start=3, n=1, scale=1) pages = pdf_doc.pages elements = pdf_doc.elements @@ -156,9 +156,9 @@ def test_pdf_doc7(): "ocr_model_ep": f"{TEST_RT_URL}elem_ocr_collection_v3/infer", } - filename = "examples/docs/maoxuan_scan.pdf" + filename = "examples/docs/模糊+水印-租赁物清单-20231213142249768106738.pdf" pdf_doc = PDFDocument( - file=filename, model_params=model_params, enhance_table=False, start=0, n=100 + file=filename, model_params=model_params, enhance_table=False, start=0, n=100, scale=2 ) pages = pdf_doc.pages elements = pdf_doc.elements @@ -281,4 +281,5 @@ def test_pdf_doc10(): # test_pdf_doc9() # test_regress() -test_pdf_doc10() +# test_pdf_doc10() +test_pdf_doc() \ No newline at end of file