diff --git a/docs/todo.md b/docs/todo.md index 7888c90..1c7dd86 100644 --- a/docs/todo.md +++ b/docs/todo.md @@ -36,13 +36,17 @@ - [x] Shot 去除白边 - [x] font size 范围优化 - [x] 出错后前端显示 -- [ ] 大规模数据集 +- [x] 大规模数据集 +- [ ] 基于位置进行 Block 合并 +- [ ] Shot 绝对大小 +- [ ] 前端上传文件后清除 input - [ ] fc 挂载 OSS - [ ] fc-be 内网 endpoint / 挂载 OSS - [ ] fc 删除旧版本数据 - [ ] Table of Contents - [ ] Ligature 连字识别 (Hotstuff) - [ ] shot 可复制文字 alt-data +- [ ] 斜体识别 - [ ] list 识别支持 - [ ] figure 识别支持 - [ ] Docker CLI @@ -71,3 +75,79 @@ Lenzen and Sheikholeslami - 2022 - A Recursive Early-Stopping Phase King Protoco Wang et al. - 2019 - A survey on consensus mechanisms and mining strate - [] 8, 10, 13 shot 延展不足 + +Abraham et al. - 2022 - Efficient and Adaptively Secure Asynchronous Binar + + - [] JSONGen 有问题 + +Aublin et al. - 2013 - Rbft Redundant byzantine fault tolerance + + - [] no-common-span 提取 + +Bankhamer et al. - 2022 - Population Protocols for Exact Plurality Consensus + + - [] 3 big-block 误报,漏报 + +Beaver 等。 - 2010 - Finding a needle in haystack Facebook's photo sto + + - [] 5 6 shot 空白延展过度 + +Chang 等。 - 2008 - Bigtable A distributed storage system for structu + + - [] min() arg is an empty sequence + +Data_Replication_Using_Read-One-Write-All_Monitori + + - [] min() arg is an empty sequence + +Dean and Ghemawat - 2008 - MapReduce simplified data processing on large clu + + - [] 大量 big-block 漏报 + +Gilad 等。 - 2017 - Algorand Scaling byzantine agreements for cryptoc + + - [] 大量 big-block 漏报 + +Guo 等 - 2020 - Dumbo Faster asynchronous bft protocols + + - [] Invalid bandwriter header dimensions/setup + +Kapritsos et al. - 2012 - All about eve Execute-verify replication for mult + + - [] min() arg is an empty sequence + +Kotla 和 Dahlin - 2004 - High throughput Byzantine fault tolerance + + - [] Invalid bandwriter header dimensions/setup + +Li et al. - 2020 - A decentralized blockchain with high throughput an + + - [] 大量 big-block 漏报 + +Li et al. - 2020 - GHAST Breaking confirmation delay barrier in naka + + - [] Invalid bandwriter header dimensions/setup + +Liu 等。 - 2018 - Scalable byzantine consensus via hardware-assisted + + - [] Invalid bandwriter header dimensions/setup + +Miller 等 - 2016 - The honey badger of BFT protocols + + - [] 大量 big-block 漏报 + +practical byzantine fault tolerance + + - [] big-block 排列不规则 + +Sankar 等。 - 2017 - Survey of consensus protocols on blockchain applic + + - [] min() arg is an empty sequence + +Scales 等。 - 2010 - The design of a practical system for fault-toleran + + - [] min() arg is an empty sequence + +Zhu et al. - 2022 - Postharvest quality monitoring and cold chain mana + + - [] min() arg is an empty sequence diff --git a/flow_pdf/main.py b/flow_pdf/main.py index d5a8950..2cf3549 100644 --- a/flow_pdf/main.py +++ b/flow_pdf/main.py @@ -13,6 +13,7 @@ cfg = yaml.load(Path("./config.yaml").read_text(), Loader=yaml.FullLoader) + def get_files_from_cfg(): dir_input = Path(cfg["path"]["input"]) dir_output = Path(cfg["path"]["output"]) @@ -43,13 +44,13 @@ def create_task(file_input: Path, dir_output: Path): try: e.execute() except Exception as e: - logger.error(f'{file_input.name} failed') + logger.error(f"{file_input.name} failed") file.write_text(dir_output / "error.txt", str(e)) logger.info(f"end {file_input.name}, time = {time.perf_counter() - t:.2f}s") if __name__ == "__main__": - with concurrent.futures.ProcessPoolExecutor() as executor: + with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor: futures = [ executor.submit(create_task, file_input, dir_output) for file_input, dir_output in get_files_from_cfg() @@ -57,8 +58,8 @@ def create_task(file_input: Path, dir_output: Path): for future in futures: future.result() - if cfg['compare']['enabled']: - dir_target = Path(cfg['compare']['target']) + if cfg["compare"]["enabled"]: + dir_target = Path(cfg["compare"]["target"]) dir_output_list = [] for _, d in get_files_from_cfg(): @@ -67,16 +68,16 @@ def create_task(file_input: Path, dir_output: Path): for dir_output in dir_output_list: dir_t = dir_target / dir_output.stem - file_t = dir_t / "big_blocks_id" / 'big_blocks_id.json' + file_t = dir_t / "big_blocks_id" / "big_blocks_id.json" if not file_t.exists(): logger.warning(f"target file not found: {file_t}") continue - cur = file.read_json(dir_output / 'big_blocks_id.json') + cur = file.read_json(dir_output / "big_blocks_id.json") expect = file.read_json(file_t) if cur != expect: - logger.debug(f'{dir_output.stem} changed') + logger.debug(f"{dir_output.stem} changed") for i in range(len(cur)): if cur[i] != expect[i]: add_list = [] @@ -87,4 +88,4 @@ def create_task(file_input: Path, dir_output: Path): for j in range(len(expect[i])): if expect[i][j] not in cur[i]: del_list.append(expect[i][j]) - logger.debug(f'page {i}, add: {add_list}, del: {del_list}') + logger.debug(f"page {i}, add: {add_list}, del: {del_list}") diff --git a/flow_pdf/worker/common.py b/flow_pdf/worker/common.py index f76707c..be91e86 100644 --- a/flow_pdf/worker/common.py +++ b/flow_pdf/worker/common.py @@ -354,7 +354,8 @@ def add_annot(page, rects, annot: str, color): if annot: a = f"{annot}-{i}" page.add_freetext_annot( - (rect[0], rect[1], rect[0] + len(a) * 6, rect[1] + 10), + # (rect[0], rect[1], rect[0] + len(a) * 6, rect[1] + 10), + (rect[2] - len(a) * 6, rect[1], rect[2] , rect[1] + 10), a, fill_color=fitz.utils.getColor("white"), border_color=fitz.utils.getColor("black"),