forked from PaddlePaddle/PaddleOCR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
recovery_to_doc.py
148 lines (134 loc) · 5.26 KB
/
recovery_to_doc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from copy import deepcopy
from docx import Document
from docx import shared
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.section import WD_SECTION
from docx.oxml.ns import qn
from docx.enum.table import WD_TABLE_ALIGNMENT
from ppstructure.recovery.table_process import HtmlToDocx
from ppocr.utils.logging import get_logger
logger = get_logger()
def convert_info_docx(img, res, save_folder, img_name):
doc = Document()
doc.styles['Normal'].font.name = 'Times New Roman'
doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
doc.styles['Normal'].font.size = shared.Pt(6.5)
flag = 1
for i, region in enumerate(res):
if len(region['res']) == 0:
continue
img_idx = region['img_idx']
if flag == 2 and region['layout'] == 'single':
section = doc.add_section(WD_SECTION.CONTINUOUS)
section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '1')
flag = 1
elif flag == 1 and region['layout'] == 'double':
section = doc.add_section(WD_SECTION.CONTINUOUS)
section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '2')
flag = 2
if region['type'].lower() == 'figure':
excel_save_folder = os.path.join(save_folder, img_name)
img_path = os.path.join(excel_save_folder,
'{}_{}.jpg'.format(region['bbox'], img_idx))
paragraph_pic = doc.add_paragraph()
paragraph_pic.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = paragraph_pic.add_run("")
if flag == 1:
run.add_picture(img_path, width=shared.Inches(5))
elif flag == 2:
run.add_picture(img_path, width=shared.Inches(2))
elif region['type'].lower() == 'title':
doc.add_heading(region['res'][0]['text'])
elif region['type'].lower() == 'table':
parser = HtmlToDocx()
parser.table_style = 'TableGrid'
parser.handle_table(region['res']['html'], doc)
else:
paragraph = doc.add_paragraph()
paragraph_format = paragraph.paragraph_format
for i, line in enumerate(region['res']):
if i == 0:
paragraph_format.first_line_indent = shared.Inches(0.25)
text_run = paragraph.add_run(line['text'] + ' ')
text_run.font.size = shared.Pt(10)
# save to docx
docx_path = os.path.join(save_folder, '{}_ocr.docx'.format(img_name))
doc.save(docx_path)
logger.info('docx save to {}'.format(docx_path))
def sorted_layout_boxes(res, w):
"""
Sort text boxes in order from top to bottom, left to right
args:
res(list):ppstructure results
return:
sorted results(list)
"""
num_boxes = len(res)
if num_boxes == 1:
res[0]['layout'] = 'single'
return res
sorted_boxes = sorted(res, key=lambda x: (x['bbox'][1], x['bbox'][0]))
_boxes = list(sorted_boxes)
new_res = []
res_left = []
res_right = []
i = 0
while True:
if i >= num_boxes:
break
if i == num_boxes - 1:
if _boxes[i]['bbox'][1] > _boxes[i - 1]['bbox'][3] and _boxes[i][
'bbox'][0] < w / 2 and _boxes[i]['bbox'][2] > w / 2:
new_res += res_left
new_res += res_right
_boxes[i]['layout'] = 'single'
new_res.append(_boxes[i])
else:
if _boxes[i]['bbox'][2] > w / 2:
_boxes[i]['layout'] = 'double'
res_right.append(_boxes[i])
new_res += res_left
new_res += res_right
elif _boxes[i]['bbox'][0] < w / 2:
_boxes[i]['layout'] = 'double'
res_left.append(_boxes[i])
new_res += res_left
new_res += res_right
res_left = []
res_right = []
break
elif _boxes[i]['bbox'][0] < w / 4 and _boxes[i]['bbox'][2] < 3 * w / 4:
_boxes[i]['layout'] = 'double'
res_left.append(_boxes[i])
i += 1
elif _boxes[i]['bbox'][0] > w / 4 and _boxes[i]['bbox'][2] > w / 2:
_boxes[i]['layout'] = 'double'
res_right.append(_boxes[i])
i += 1
else:
new_res += res_left
new_res += res_right
_boxes[i]['layout'] = 'single'
new_res.append(_boxes[i])
res_left = []
res_right = []
i += 1
if res_left:
new_res += res_left
if res_right:
new_res += res_right
return new_res