Skip to content

Commit

Permalink
fix: sub_array_range
Browse files (browse the repository at this point in the history)
Signed-off-by: 117503445 <[email protected]>
  • Loading branch information
117503445 committed Jun 27, 2023
1 parent a9de325 commit 2345c73
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 32 deletions.
2 changes: 1 addition & 1 deletion config.yaml.example
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
path:
- input: ./data
+ input: ./data/in
output: /tmp/flow-pdf/out

compare:
Expand Down
33 changes: 16 additions & 17 deletions flow_pdf/worker/dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def run_page( # type: ignore[override]
# for block in page_in.raw_dict["blocks"]:
# rects = []
# if block["type"] == 0:
# for line in block["lines"]:
# for line in block.lines:
# rects.append(line.bbox)
# add_annot(page, rects, "", "red")
# add_annot(page, rects, "l", "red")
Expand All @@ -89,40 +89,39 @@ def run_page( # type: ignore[override]
# for block in page_in.raw_dict["blocks"]:
# rects = []
# if block["type"] == 0:
# for line in block["lines"]:
# for span in line["spans"]:
# for line in block.lines:
# for span in line.spans:
# rects.append(span.bbox)
# add_annot(page, rects, "", "purple")

# block common span
# for block in page_in.raw_dict["blocks"]:
# rects = []
# if block["type"] == 0:
# for line in block["lines"]:
# for span in line["spans"]:
# for line in block.lines:
# for span in line.spans:
# if is_common_span(span, doc_in.most_common_font, doc_in.common_size_range):
# rects.append(span.bbox)
# add_annot(page, rects, "", "purple")

# block not common span
# rects = []
# for c in page_in.big_blocks:
# for block in c:
# if block["type"] == 0:
# for line in block["lines"]:
# for span in line["spans"]:
# if not is_common_span(span, doc_in.most_common_font, doc_in.common_size_range):
# rects.append(span.bbox)
# add_annot(page, rects, "", "purple")
rects = []
for blocks in page_in.big_blocks:
for block in blocks:
for line in block.lines:
for span in line.spans:
if not is_common_span(span, doc_in.most_common_font, doc_in.common_size_range):
rects.append(span.bbox)
add_annot(page, rects, "", "purple")

# new line
# rects = []
# for b in page_in.big_blocks:
# for i in range(1, len(b["lines"])):
# line = b["lines"][i]
# for i in range(1, len(b.lines)):
# line = b.lines[i]
# delta = line.bbox[0] - b.bbox[0]
# if delta > 1:
# last_line = b["lines"][i - 1]
# last_line = b.lines[i - 1]
# if last_line.bbox[0] - b.bbox[0] < 1:
# rects.append(line.bbox)
# add_annot(page, rects, "new-line", "pink")
Expand Down
28 changes: 14 additions & 14 deletions flow_pdf/worker/font_counter.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,21 +118,21 @@ def sub_array_range(arr: list[float], sub_arr_range: int) -> Range:

arr.sort()

start = end = max_start = max_end = 0
max_count = count = 1

for i in range(1, len(arr)):
if arr[i] - arr[start] < sub_arr_range:
end = i
count += 1
max_start = 0
max_end = 0
max_len = 1

start = 0
end = 0
while end < len(arr):
if arr[end] - arr[start] > sub_arr_range:
start += 1
else:
start = end = i
count = 1

if count > max_count:
max_count = count
max_start = start
max_end = end
if end - start + 1 > max_len:
max_len = end - start + 1
max_start = start
max_end = end
end += 1

return Range(arr[max_start], arr[max_end])

Expand Down

0 comments on commit 2345c73

Please sign in to comment.