-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfinal_extract_gt.py
112 lines (88 loc) · 3.75 KB
/
final_extract_gt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# This script extracts the final annotations for public release.
# It runs after the LLM-ref correction, chop-off, reject_llm and Sonnet spell-correction steps.
# NOTE: this is also where we clean the last remaining noise (which we forgot to clean in the previous step):
# drop single-letter content located at indices 0, 1 (top bboxes) and -2, -1 (bottom bboxes).
# THEN (actually at the same time), we use the logic below to extract the final text:
################################################################
# skip 'invalid' entries and the noise described above (i.e. don't take them)
# status "perfect" (or "intact") -> take the original 'content'
# status "corrected" (still after reject_lm) -> take 'content_c'
# any other status ([skip, not_found, not_relevant, reject_lm_x]) -> check 'status_p':
#     if it is 'intact' (or missing) -> take the original 'content'
#     if it is 'modified' -> take 'content_s'
################################################################
# Also add the size (width and height) of the original image as the first output element.
import argparse
import os
import glob
import json
import multiprocessing as mp
from pathlib import Path
from PIL import Image
def load_json(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is read as UTF-8.

    Args:
        file_path: path of the JSON file to read.

    Returns:
        The parsed JSON value, or ``None`` (after printing an error line)
        when the file content cannot be decoded as UTF-8 or is not valid
        JSON.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist);
            this is deliberately not swallowed.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return json.load(file)
    # UnicodeDecodeError is raised while reading bytes that are not valid
    # UTF-8; treat it like malformed JSON instead of crashing the caller.
    except (json.JSONDecodeError, UnicodeDecodeError):
        print(f"Error: '{file_path}' is not a valid JSON")
        return None
# Statuses for which the final text is chosen by looking at 'status_p'
# (any status starting with "reject" is handled the same way).
_FALLBACK_STATUSES = frozenset({"skip", "not_found", "not_relevant"})


def _select_text(para, idx, total, source):
    """Return the final text of one paragraph, or None if it must be dropped.

    Args:
        para: paragraph dict; reads 'status', 'content', optionally
            'status_p', and 'content_c' / 'content_s' when the status
            requires them.
        idx: position of the paragraph in its list.
        total: length of the paragraph list.
        source: name of the input file, used only in diagnostics.
    """
    status = para["status"]
    status_p = para.get("status_p")
    o_text = para["content"].strip()

    if status == "invalid":
        return None

    # Leftover noise: a single alphabetic character in one of the first two
    # or last two entries whose status is one of the fallback statuses.
    if (
        len(o_text) == 1
        and o_text.isalpha()
        and status in _FALLBACK_STATUSES
        and (idx < 2 or idx > total - 3)
    ):
        return None

    if status in ("perfect", "intact"):
        return o_text
    if status == "corrected":
        return para["content_c"].strip()
    if status.startswith("reject") or status in _FALLBACK_STATUSES:
        if status_p is None or status_p == "intact":
            return o_text
        if status_p == "modified":
            return para["content_s"].strip()
        # Unknown 'status_p': drop the entry rather than emit a stale or
        # undefined text for this polygon.
        print(
            f"ERR1: unexpected status_p {status_p!r} (status {status!r}) "
            f"at index {idx} in {source}; entry skipped"
        )
        return None

    # If there is a new status, the selection rules must be checked again!
    print(f"ERR2: unexpected status {status!r} at index {idx} in {source}; entry skipped")
    return None


def process_json(args, js_path):
    """Write the final annotation file for one input JSON file.

    The image ``<stem of js_path>.jpg`` is opened from ``args.im_dir`` to
    obtain its size, the paragraph list is loaded from ``js_path``, the final
    text of every kept paragraph is selected, and the result is saved under
    the same basename in ``args.o`` (created if needed).

    The output is a JSON list: first ``{"w": width, "h": height}``, then one
    ``{"text": ..., "polygon": ...}`` dict per kept paragraph.

    Args:
        args: object with ``im_dir`` (image directory) and ``o`` (output
            directory) attributes.
        js_path: path of the input JSON file.

    Returns:
        None. Nothing is written when the input JSON cannot be loaded.
    """
    js_basename = os.path.basename(js_path)
    im_path = os.path.join(args.im_dir, Path(js_basename).stem + ".jpg")
    with Image.open(im_path) as img:
        width, height = img.size

    para_list = load_json(js_path)
    if para_list is None:
        # load_json has already reported the problem; without a paragraph
        # list there is nothing to extract for this file.
        print(f"Skipped file {js_path}: could not load annotations")
        return

    total = len(para_list)
    out_list = [{"w": width, "h": height}]  # add dict elements after the dim
    for idx, para in enumerate(para_list):
        text = _select_text(para, idx, total, js_basename)
        if text is None:
            continue
        out_list.append({"text": text, "polygon": para["polygon"]})

    # exist_ok makes a separate existence check unnecessary and keeps this
    # safe when several workers create the directory at the same time.
    os.makedirs(args.o, exist_ok=True)
    save_target = os.path.join(args.o, js_basename)
    with open(save_target, "w", encoding="utf-8") as file:
        json.dump(out_list, file, indent=2, ensure_ascii=False)
    print(f"Saved file {save_target} successfully")
# Intended to be run once per book, not on all books at the same time.
def main():
    """Parse the command line and convert every JSON file in parallel.

    Positional command-line arguments:
        im_dir: directory of the original images (used to get their size).
        json_dir: directory holding the input ``*.json`` files.
        o: directory where the output JSON files are saved.

    Every ``*.json`` file directly inside ``json_dir`` is passed to
    ``process_json`` through a ``multiprocessing`` pool.
    """
    parser = argparse.ArgumentParser(description="final extraction")
    parser.add_argument("im_dir", help="dir to original images to get size")
    parser.add_argument("json_dir", help="dir to load input json files")
    parser.add_argument("o", help="where to out/save json files")
    args = parser.parse_args()

    json_paths = glob.glob(os.path.join(args.json_dir, "*.json"))

    # Keep the original head-room of four CPUs, but never request fewer than
    # one worker: mp.Pool raises ValueError when processes < 1, which would
    # happen on machines with four or fewer CPUs.
    n_workers = max(1, mp.cpu_count() - 4)

    # Use multiprocessing to process JSON files in parallel
    with mp.Pool(processes=n_workers) as pool:
        pool.starmap(
            process_json,
            [(args, js_path) for js_path in json_paths],
        )
# Script entry point: run main() only when this file is executed directly,
# not when it is imported. The guard also keeps multiprocessing workers that
# re-import this module (spawn start method) from re-running main().
if __name__ == "__main__":
    main()