-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
83 lines (68 loc) · 2.38 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import re
import unicodedata
import editdistance
# Literal (old, new) substitutions applied in order by normalize_text().
# Order matters: the final rule rewrites every ASCII double quote, including
# those produced by the earlier rules, as two single quotes.
replacements = [
    ("\u2018", "'"),  # Left single quotation mark
    ("\u2019", "'"),  # Right single quotation mark
    ("\u201C", '"'),  # Left double quotation mark
    ("\u201D", '"'),  # Right double quotation mark
    ("\u2013", "-"),  # En dash
    ("\u2014", "-"),  # Em dash
    ("\u2015", "-"),  # Horizontal bar
    ("\u2026", "..."),  # Ellipsis
    ("\u201A", ","),  # Single low-9 quotation mark
    ("\u201E", '"'),  # Double low-9 quotation mark
    ("\u2032", "'"),  # Prime, used as an apostrophe in some texts
    ("\u2033", '"'),  # Double prime, used as a quote in some texts
    ("\"", "''"),  # normalize double quotes to two single quotes
]


def normalize_text(text):
    """Return *text* in a canonical form suitable for string comparison.

    The text is stripped of surrounding whitespace, form-feed characters
    are removed, the result is NFKD-normalized and stripped again, and
    finally every pair in ``replacements`` is applied in order.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # "\x0c" is the form feed; the previous pattern "\0c" was NUL followed
    # by the letter "c", which never matched a form feed.
    text = text.strip().replace("\x0c", "")
    text = unicodedata.normalize("NFKD", text).strip()
    # The table holds literal strings, so plain str.replace is used rather
    # than a regex substitution.
    for old, new in replacements:
        text = text.replace(old, new)
    return text
def normalized_errors(text, gt):
    """Compare *text* against *gt* after running both through normalize_text.

    Args:
        text: The candidate string.
        gt: The reference (ground-truth) string.

    Returns:
        A ``(errors, total)`` tuple: the value reported by
        ``editdistance.eval`` for the two normalized strings, and the
        length of the normalized reference.
    """
    hypothesis = normalize_text(text)
    reference = normalize_text(gt)
    distance = editdistance.eval(hypothesis, reference)
    return distance, len(reference)
def text_only(text):
    """Reduce *text* to ASCII letters and digits separated by single spaces.

    The text is stripped of surrounding whitespace, form-feed characters
    are removed, the result is NFKD-normalized, and every run of
    characters other than ``a-z``, ``A-Z`` and ``0-9`` is replaced by one
    space. Leading or trailing runs therefore leave a single space at
    that end of the result.

    Args:
        text: The string to reduce.

    Returns:
        The reduced string.
    """
    # "\x0c" is the form feed; the previous pattern "\0c" was NUL followed
    # by the letter "c", so it deleted a real letter and missed form feeds.
    text = text.strip().replace("\x0c", "")
    text = unicodedata.normalize("NFKD", text)
    return re.sub(r"[^a-zA-Z0-9]+", " ", text)
def text_errs(text, gt):
    """Compare *text* against *gt* on alphanumeric content only, ignoring case.

    Both strings are passed through text_only and upper-cased before
    being handed to ``editdistance.eval``.

    Args:
        text: The candidate string.
        gt: The reference (ground-truth) string.

    Returns:
        A ``(errors, total)`` tuple: the value reported by
        ``editdistance.eval``, and the length of ``text_only(gt)``.
    """
    hypothesis_plain = text_only(text)
    reference_plain = text_only(gt)
    distance = editdistance.eval(
        hypothesis_plain.upper(),
        reference_plain.upper(),
    )
    return distance, len(reference_plain)
class Something:
    """Small mutable holder around a single ``value`` attribute."""

    def __init__(self, value):
        """Store *value* on the instance."""
        self.value = value

    def __str__(self):
        """Return ``str()`` of the held value."""
        return f"{self.value!s}"

    def add(self, other):
        """Apply ``+=`` to this instance's value using ``other.value``.

        Returns ``None``.
        """
        self.value += other.value
class OCRErrors:
    """Running totals of errors over many (text, ground truth) pairs.

    Attributes set by ``__init__``:
        errors, total: sums of the values returned by normalized_errors.
        terrors, ttotal: sums of the values returned by text_errs.

    Attributes set by ``add``:
        text, gt: the most recent pair passed to ``add``.
    """

    def __init__(self):
        """Start with all counters at zero."""
        self.total, self.errors = 0, 0
        self.ttotal, self.terrors = 0, 0

    def add(self, text, gt):
        """Score one pair and fold it into the running totals.

        Args:
            text: The candidate string.
            gt: The reference (ground-truth) string.

        Returns:
            ``(errs, total, terrs, ttotal)`` for this pair alone: the
            results of normalized_errors followed by those of text_errs.
        """
        # Remember the latest pair on the instance.
        self.text, self.gt = text, gt
        errs, total = normalized_errors(text, gt)
        self.errors += errs
        self.total += total
        terrs, ttotal = text_errs(text, gt)
        self.terrors += terrs
        self.ttotal += ttotal
        return errs, total, terrs, ttotal

    @staticmethod
    def _rate(errors, total):
        """Return ``errors / total``, or NaN when *total* is zero."""
        return errors / total if total else float("nan")

    def __str__(self):
        """Return a two-line summary of both error rates.

        A rate whose denominator is zero (nothing added yet, or only
        empty references) is shown as ``nan`` instead of raising
        ``ZeroDivisionError``.
        """
        normalized_rate = self._rate(self.errors, self.total)
        text_rate = self._rate(self.terrors, self.ttotal)
        return (
            f"Normalized: {self.errors}/{self.total} ({normalized_rate:.4f})\n"
            f"Text only: {self.terrors}/{self.ttotal} ({text_rate:.4f})"
        )